分類のお勉強
sklearnを用いた2クラス分類
#!/usr/bin/env python
# coding: utf-8
"""Two-class classification of the iris data set with ordinary least squares.

The first 100 iris samples (classes 0 and 1) are relabelled to -1 / +1, a
linear model without bias term is fitted by least squares, and the accuracy
on a 20% hold-out split is printed.
"""
import numpy as np


def to_signed_labels(classes, positive=1):
    """Map class ids to +1 where they equal *positive* and to -1 otherwise.

    Args:
        classes: sequence of class ids.
        positive: the class id that becomes +1.

    Returns:
        A NumPy integer array of +1 / -1 with the same length as *classes*.
    """
    return np.where(np.asarray(classes) == positive, 1, -1)


def fit_least_squares(train_x, train_y):
    """Return the least-squares weight vector w minimising ||train_x.w - train_y||.

    The pseudo-inverse equals inv(X^T X) X^T whenever X has full column
    rank, but it does not raise on a singular (rank-deficient) X^T X.

    Args:
        train_x: 2-D array-like of shape (n_samples, n_features).
        train_y: 1-D array-like of n_samples target values.

    Returns:
        1-D NumPy array of n_features weights.
    """
    design = np.asarray(train_x, dtype=float)
    targets = np.asarray(train_y, dtype=float)
    return np.linalg.pinv(design).dot(targets)


def predict_labels(w, xs):
    """Classify each row of *xs* as +1 when w.x > 0, otherwise -1.

    Args:
        w: 1-D weight vector of n_features entries.
        xs: 2-D array-like of shape (n_samples, n_features).

    Returns:
        NumPy integer array of n_samples predictions (+1 / -1).
    """
    scores = np.asarray(xs, dtype=float).dot(w)
    return np.where(scores > 0, 1, -1)


def main():
    """Train on 80% of the two-class iris subset and print the test accuracy."""
    # scikit-learn is imported here so the numeric helpers above stay
    # importable with NumPy alone.
    from sklearn import datasets
    from sklearn import metrics
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:  # older scikit-learn releases
        from sklearn.cross_validation import train_test_split

    # Prepare the data set.
    iris = datasets.load_iris()
    # Training data: numpy.ndarray of shape (100, 4).
    data = iris.data[0:100]
    # Label data (100 entries): class 1 -> +1, everything else -> -1.
    target = to_signed_labels(iris.target[0:100])

    # Hold out 20% of all samples for validation.
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.2)

    # Learn with least squares.
    w = fit_least_squares(train_x, train_y)

    # Predict with the learned weights.
    pred_y = predict_labels(w, test_x)

    # Accuracy on the test data.
    print(metrics.accuracy_score(test_y, pred_y))


if __name__ == '__main__':
    main()
参考:
Home · levelfour/machine-learning-2014 Wiki · GitHub
https://kaigi.org/jsai/webprogram/2012/pdf/340.pdf
とにかくCNNに学習させる大量の画像データが欲しいでごわすという人のためのスクリプト - shi3zの長文日記
画像データの2クラス分類
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""Two-class image classification: PCA features fed to a linear SVM.

Every file under ``images/`` is resized, flattened to a pixel vector and
labelled from its file name.  A 2-component PCA scatter plot is saved to
``pca_feature.png``; a LinearSVC trained on 5 PCA components is saved to
``model.pkl`` and evaluated on the held-out samples.
"""
import os

import numpy as np

# Size handed to PIL's Image.resize, i.e. (width, height).
# An earlier setting was (300, 167).
STANDARD_SIZE = (480, 640)


def img_to_matrix(filename, verbose=False):
    """Load an image file, resize it to STANDARD_SIZE and return its pixels.

    Args:
        filename: path of the image file to open.
        verbose: when true, print the original and the target size.

    Returns:
        NumPy array of shape (height, width, 3), e.g. (167, 300, 3) for a
        STANDARD_SIZE of (300, 167).
    """
    # Pillow is imported lazily so flatten_image stays usable without it.
    from PIL import Image

    img = Image.open(filename)
    if verbose:
        print('changing size from %s to %s' % (str(img.size), str(STANDARD_SIZE)))
    # Force three channels: grayscale, palette or RGBA inputs would
    # otherwise yield arrays of differing shape that cannot be stacked.
    img = img.convert('RGB').resize(STANDARD_SIZE)
    return np.asarray(img)


def flatten_image(img):
    """Return the pixels of *img* as one flat 1-D array.

    Works for arrays of any rank, so 2-D grayscale input is accepted as
    well as 3-D colour input.
    """
    return np.asarray(img).reshape(-1)


def _make_pca(n_components):
    """Build a PCA using the randomized SVD solver on old and new scikit-learn."""
    from sklearn import decomposition

    try:
        return decomposition.PCA(n_components=n_components,
                                 svd_solver='randomized')
    except TypeError:
        # Releases whose PCA has no svd_solver argument still ship the
        # dedicated RandomizedPCA class.
        return decomposition.RandomizedPCA(n_components=n_components)


def main():
    """Plot PCA features, then train, save and evaluate the classifier."""
    import pandas as pd
    import pylab as pl
    from sklearn.svm import LinearSVC
    try:
        import joblib
    except ImportError:  # older setups only have the copy bundled in scikit-learn
        from sklearn.externals import joblib

    img_dir = 'images/'
    images = [img_dir + f for f in os.listdir(img_dir)]
    labels = ['takuya' if 'takuya' in os.path.basename(f) else 'iron_man'
              for f in images]

    data = np.array([flatten_image(img_to_matrix(image)) for image in images])

    # Draw one uniform random number per sample; those <= 0.7 go to training.
    is_train = np.random.uniform(0, 1, len(data)) <= 0.7
    # y is the label: 1 where the label is 'takuya', 0 otherwise.
    y = np.where(np.array(labels) == 'takuya', 1, 0)
    train_x, train_y = data[is_train], y[is_train]

    # Plot in 2 dimensions: project the whole data set onto its first two
    # principal components.
    pca = _make_pca(2)
    X = pca.fit_transform(data)
    df = pd.DataFrame({"x": X[:, 0], "y": X[:, 1],
                       "label": np.where(y == 1, 'takuya', 'iron_man')})
    colors = ['red', 'yellow']
    for label, color in zip(df['label'].unique(), colors):
        mask = df['label'] == label
        pl.scatter(df[mask]['x'], df[mask]['y'], c=color, label=label)
    pl.legend()
    pl.savefig('pca_feature.png')

    # Train the classifier on 5 principal components of the training split.
    pca = _make_pca(5)
    train_x = pca.fit_transform(train_x)
    # C is the penalty term.
    svm = LinearSVC(C=1.0)
    svm.fit(train_x, train_y)
    joblib.dump(svm, 'model.pkl')

    # Evaluate the model on the samples not used for training, projected
    # with the PCA fitted on the training split.
    test_x, test_y = data[~is_train], y[~is_train]
    test_x = pca.transform(test_x)
    print(pd.crosstab(test_y, svm.predict(test_x),
                      rownames=['Actual'], colnames=['Predicted']))


if __name__ == '__main__':
    main()
参考:
pythonを使って簡単な画像分類を実現する - stMind
Numpyのwhereで配列インデックスを取得(python) | コード7区
Python: scikit-learn で主成分分析 (PCA) してみる - CUBE SUGAR CONTAINER
scikit.learn手法徹底比較! SVM編 - Risky Dune