Classification with HDF5 data
1.導入庫
1 import os 2 import h5py 3 import shutil 4 import sklearn 5 import tempfile 6 import numpy as np 7 import pandas as pd 8 import sklearn.datasets 9 import sklearn.linear_model 10 import matplotlib.pyplot as plt 11 %matplotlib inline
2.產生數據
sklearn.datasets.make_classification產生測試數據。
10000組數據,特征向量維數為4。
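sklearn.cross_validation.train_test_split用於隨機拆分數據集(它只做一次劃分,本身並不是完整的交叉驗證),就是把data拆分為train set和test set。注:新版scikit-learn中sklearn.cross_validation模塊已被移除,對應函數為sklearn.model_selection.train_test_split。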
這里拆分為7500:2500
# Generate a synthetic classification dataset: 10000 samples with 4 features,
# 2 of them informative and none redundant. random_state=0 makes the
# generated data reproducible.
X, y = sklearn.datasets.make_classification(
    n_samples=10000, n_features=4, n_redundant=0, n_informative=2,
    n_clusters_per_class=2, hypercube=False, random_state=0
)

# `sklearn.cross_validation` no longer exists in current scikit-learn; the
# same helper is provided by `sklearn.model_selection`. Import the submodule
# explicitly rather than relying on another import loading it as a side effect.
import sklearn.model_selection

# Split into train and test. No test_size is given, so the library default
# applies (the accompanying text reports a 7500:2500 split).
# NOTE: the split itself is unseeded, so the rows picked differ between runs.
X, Xt, y, yt = sklearn.model_selection.train_test_split(X, y)
3.數據可視化
# Visualize a sample of the data.
# np.random.permutation(n) returns the integers 0..n-1 in random order, so
# taking the first 1000 entries selects 1000 distinct random row indices
# of the training set X.
ind = np.random.permutation(X.shape[0])[:1000]
df = pd.DataFrame(X[ind])
# Plot every feature against every other one, coloured by class label.
# diagonal='kde' draws a kernel density estimate on the diagonal
# ('hist' would draw a histogram instead).
# The top-level `pd.scatter_matrix` alias was deprecated and later removed
# from pandas; the function is available as `pd.plotting.scatter_matrix`.
_ = pd.plotting.scatter_matrix(
    df, figsize=(9, 9), diagonal='kde', marker='o', s=40, alpha=.4, c=y[ind]
)
pd.scatter_matrix函數說明

# Signature and docstring of pandas' scatter_matrix, quoted here for
# reference only; the function body is not reproduced.
def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False,
                   diagonal='hist', marker='.', density_kwds=None,
                   hist_kwds=None, range_padding=0.05, **kwds):
    """
    Draw a matrix of scatter plots.

    Parameters
    ----------
    frame : DataFrame
    alpha : float, optional
        amount of transparency applied
    figsize : (float,float), optional
        a tuple (width, height) in inches
    ax : Matplotlib axis object, optional
    grid : bool, optional
        setting this to True will show the grid
    diagonal : {'hist', 'kde'}
        pick between 'kde' and 'hist' for
        either Kernel Density Estimation or Histogram
        plot in the diagonal
    marker : str, optional
        Matplotlib marker type, default '.'
    hist_kwds : other plotting keyword arguments
        To be passed to hist function
    density_kwds : other plotting keyword arguments
        To be passed to kernel density estimate plot
    range_padding : float, optional
        relative extension of axis range in x and y
        with respect to (x_max - x_min) or (y_max - y_min),
        default 0.05
    kwds : other plotting keyword arguments
        To be passed to scatter function

    Examples
    --------
    >>> df = DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D'])
    >>> scatter_matrix(df, alpha=0.2)
    """
4.SGD learning及正確率
documents:scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
# Train and test the scikit-learn SGD logistic regression.

# accuracy_score lives in sklearn.metrics; import the submodule explicitly
# instead of relying on another import loading it as a side effect.
import sklearn.metrics

# Logistic-regression loss with L2 regularisation. Current scikit-learn
# spells three of the original options differently:
#   loss='log'          -> loss='log_loss'
#   n_iter=1000         -> max_iter=1000 with tol=None, so that all 1000
#                          epochs are run instead of stopping early
#   class_weight='auto' -> class_weight='balanced'
clf = sklearn.linear_model.SGDClassifier(
    loss='log_loss', max_iter=1000, tol=None, penalty='l2', alpha=1e-3,
    class_weight='balanced')

# Fit the linear model with stochastic gradient descent on the training split.
clf.fit(X, y)
# Predict class labels for the held-out test samples Xt.
yt_pred = clf.predict(Xt)
# Fraction of test samples whose predicted label matches the true label.
print('Accuracy: {:.3f}'.format(sklearn.metrics.accuracy_score(yt, yt_pred)))
5.寫HDF5數據。很直觀的文件讀寫操作。需要注意路徑。我沒有改路徑,而是把生成的數據手動復制到了caffe_root/examples/hdf5_classification中
# Store the train and test splits as HDF5 files under
# ./hdf5_classification/data (resolved against the current working
# directory), together with the text files that list them.
# This file is assumed to be caffe_root/examples/hdf5_classification.ipynb
dirname = os.path.abspath('./hdf5_classification/data')
train_filename, test_filename = (
    os.path.join(dirname, name) for name in ('train.h5', 'test.h5')
)
if not os.path.exists(dirname):
    os.makedirs(dirname)

# Training split: stored uncompressed, labels converted to float32.
with h5py.File(train_filename, 'w') as train_h5:
    train_h5['data'] = X
    train_h5['label'] = y.astype(np.float32)

# The HDF5DataLayer source is a text file listing HDF5 filenames, one per
# line. To demonstrate that it is a list, the same file is named twice.
with open(os.path.join(dirname, 'train.txt'), 'w') as train_list:
    train_list.write((train_filename + '\n') * 2)

# Test split: same layout, additionally gzip-compressed (level 1) to show
# that HDF5 datasets can be compressed further.
comp_kwargs = dict(compression='gzip', compression_opts=1)
with h5py.File(test_filename, 'w') as test_h5:
    test_h5.create_dataset('data', data=Xt, **comp_kwargs)
    test_h5.create_dataset(
        'label', data=yt.astype(np.float32), **comp_kwargs
    )

with open(os.path.join(dirname, 'test.txt'), 'w') as test_list:
    test_list.write(test_filename + '\n')
6.更改路徑到caffe_root,用solver.prototxt設置參數,train_val.prototxt配置模型。
模型分析看這里www.cnblogs.com/nwpuxuezha/p/4297298.html
1 # Run caffe. Scroll down in the output to see the final 2 # test accuracy, which should be about the same as above. 3 !cd .. && ./build/tools/caffe train -solver examples/hdf5_classification/solver.prototxt
7.使用非線性模型進行優化,用solver2.prototxt設置參數,train_val2.prototxt配置模型。(此部分待補充)
# Run caffe training with the second solver configuration (solver2.prototxt).
# `cd ..` switches to the parent directory first (assumed to be the Caffe
# root) so that the relative paths to the binary and solver file resolve.
!cd .. && ./build/tools/caffe train -solver examples/hdf5_classification/solver2.prototxt
總結:第4、6、7步中,我的計算結果和例程中的結果有一些差距;其中第7步的正確率最高,但也只能做到0.73左右。原因待思考。