# 邏輯回歸
## 邏輯回歸處理二元分類
%matplotlib inline import matplotlib.pyplot as plt #顯示中文 from matplotlib.font_manager import FontProperties font=FontProperties(fname=r"c:\windows\fonts\msyh.ttc", size=10)
import numpy as np plt.figure() plt.axis([-6,6,0,1]) plt.grid(True) X=np.arange(-6,6,0.1) y=1/(1+np.e**(-X)) plt.plot(X,y,'b-')
## 垃圾郵件分類
import pandas as pd df=pd.read_csv('SMSSpamCollection',delimiter='\t',header=None) df.head()
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model.logistic import LogisticRegression from sklearn.cross_validation import train_test_split #用pandas加載數據.csv文件,然后用train_test_split分成訓練集(75%)和測試集(25%): X_train_raw, X_test_raw, y_train, y_test = train_test_split(df[1],df[0]) #我們建一個TfidfVectorizer實例來計算TF-IDF權重: vectorizer=TfidfVectorizer() X_train=vectorizer.fit_transform(X_train_raw) X_test=vectorizer.transform(X_test_raw) #LogisticRegression同樣實現了fit()和predict()方法 classifier=LogisticRegression() classifier.fit(X_train,y_train) predictions=classifier.predict(X_test) for i ,prediction in enumerate(predictions[-5:]): print '預測類型:%s.信息:%s' %(prediction,X_test_raw.iloc[i])
輸出結果:
預測類型:ham.信息:Waiting in e car 4 my mum lor. U leh? Reach home already?
預測類型:ham.信息:Dear got train and seat mine lower seat
預測類型:spam.信息:I just really need shit before tomorrow and I know you won't be awake before like 6
預測類型:ham.信息:What should i eat fo lunch senor
預測類型:ham.信息:645
## 二元分類效果評估方法
#混淆矩陣 from sklearn.metrics import confusion_matrix import matplotlib.pyplot as plt y_test = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] y_pred = [0, 1, 0, 0, 0, 0, 0, 1, 1, 1] confusion_matrix=confusion_matrix(y_test,y_pred) print confusion_matrix
plt.matshow(confusion_matrix) plt.title(u'混淆矩陣') plt.colorbar() plt.ylabel(u'實際類型') plt.xlabel(u'預測類型') plt.show()
## 准確率
import pandas as pd import numpy as np from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model.logistic import LogisticRegression from sklearn.cross_validation import train_test_split,cross_val_score df=pd.read_csv('SMSSpamCollection',delimiter='\t',names=["label","message"]) X_train_raw, X_test_raw, y_train, y_test = train_test_split(df['message'], df['label']) vectorizer=TfidfVectorizer() X_train=vectorizer.fit_transform(X_train_raw) X_test=vectorizer.transform(X_test_raw) classifier=LogisticRegression() classifier.fit(X_train,y_train) scores=cross_val_score(classifier,X_train,y_train,cv=5) print '准確率',np.mean(scores),scores
輸出結果:
## 精確率和召回率
scikit-learn結合真實類型數據,提供了一個函數來計算一組預測值的精確率和召回率。
%matplotlib inline import numpy as np import pandas as pd from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model.logistic import LogisticRegression from sklearn.cross_validation import train_test_split, cross_val_score df['label']=pd.factorize(df['label'])[0] X_train_raw, X_test_raw, y_train, y_test = train_test_split(df['message'],df['label']) vectorizer = TfidfVectorizer() X_train = vectorizer.fit_transform(X_train_raw) X_test = vectorizer.transform(X_test_raw) classifier = LogisticRegression() classifier.fit(X_train, y_train) precisions = cross_val_score(classifier, X_train, y_train, cv=5, scoring= 'precision') print u'精確率:', np.mean(precisions), precisions recalls = cross_val_score(classifier, X_train, y_train, cv=5, scoring='recall') print u'召回率:', np.mean(recalls), recalls plt.scatter(recalls, precisions)
輸出結果:
精確率: 0.990243902439 [ 1. 0.95121951 1. 1. 1. ]
召回率: 0.691498103666 [ 0.65486726 0.69026549 0.69911504 0.71681416 0.69642857]
## 計算綜合評價指標
fls=cross_val_score(classifier,X_train,y_train,cv=5,scoring='f1') print '綜合指標評價',np.mean(fls),fls
輸出結果:
## 網格搜索
import pandas as pd from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model.logistic import LogisticRegression from sklearn.grid_search import GridSearchCV from sklearn.pipeline import Pipeline from sklearn.cross_validation import train_test_split from sklearn.metrics import precision_score, recall_score, accuracy_score pipeline = Pipeline([ ('vect', TfidfVectorizer(stop_words='english')), ('clf', LogisticRegression()) ]) parameters = { 'vect__max_df': (0.25, 0.5, 0.75), 'vect__stop_words': ('english', None), 'vect__max_features': (2500, 5000, 10000, None), 'vect__ngram_range': ((1, 1), (1, 2)), 'vect__use_idf': (True, False), 'vect__norm': ('l1', 'l2'), 'clf__penalty': ('l1', 'l2'), 'clf__C': (0.01, 0.1, 1, 10), } grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy', cv=3) df=pd.read_csv('SMSSpamCollection',delimiter='\t',names=["label","message"]) df['label']=pd.factorize(df['label'])[0] X_train, X_test, y_train, y_test = train_test_split(df['message'],df['label']) grid_search.fit(X_train, y_train) print('最佳效果:%0.3f' % grid_search.best_score_)
輸出結果;
最佳效果:0.986
print '最優參數組合' best_parameters=grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()): print '\t%s:%r' %(param_name,best_parameters[param_name]) predictions=grid_search.predict(X_test) print '准確率:',accuracy_score(y_test,predictions) print '精確率:',precision_score(y_test,predictions) print '召回率:',recall_score(y_test,predictions)
輸出結果:
clf__C:10
clf__penalty:'l2'
vect__max_df:0.25
vect__max_features:2500
vect__ngram_range:(1, 2)
vect__norm:'l2'
vect__stop_words:None
vect__use_idf:True
准確率: 0.979899497487
精確率: 0.974683544304
召回率: 0.865168539326
# logistics 多分類
import pandas as pd df=pd.read_csv("logistic_data/train.tsv",header=0,delimiter='\t') print df.count() print df.head() df.Phrase.head(10) df.Sentiment.describe() df.Sentiment.value_counts() df.Sentiment.value_counts()/df.Sentiment.count()
import pandas as pd from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model.logistic import LogisticRegression from sklearn.cross_validation import train_test_split from sklearn.metrics import classification_report,accuracy_score,confusion_matrix from sklearn.pipeline import Pipeline from sklearn.grid_search import GridSearchCV pipeline=Pipeline([ ('vect',TfidfVectorizer(stop_words='english')), ('clf',LogisticRegression())]) parameters={ 'vect__max_df':(0.25,0.5), 'vect__ngram_range':((1,1),(1,2)), 'vect__use_idf':(True,False), 'clf__C':(0.1,1,10), } df=pd.read_csv("logistic_data/train.tsv",header=0,delimiter='\t') X,y=df.Phrase,df.Sentiment.as_matrix() X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.5) grid_search=GridSearchCV(pipeline,parameters,n_jobs=-1,verbose=1,scoring="accuracy") grid_search.fit(X_train,y_train) print u'最佳效果:%0.3f'%grid_search.best_score_ print u'最優參數組合:' best_parameters=grid_search.best_estimator_.get_params() for param_name in sorted(parameters.keys()): print '\t%s:%r'%(param_name,best_parameters[param_name])
數據結果:
Fitting 3 folds for each of 24 candidates, totalling 72 fits
## 多類分類效果評估
predictions=grid_search.predict(X_test) print u'准確率',accuracy_score(y_test,predictions) print u'混淆矩陣',confusion_matrix(y_test,predictions) print u'分類報告',classification_report(y_test,predictions)
數據結果:
准確率 0.636614122773
混淆矩陣 [[ 1133 1712 595 67 1]
[ 919 6136 6006 553 35]
[ 213 3212 32637 3634 138]
[ 22 420 6548 8155 1274]
[ 4 45 546 2411 1614]]
分類報告 precision recall f1-score support
0 0.49 0.32 0.39 3508
1 0.53 0.45 0.49 13649
2 0.70 0.82 0.76 39834
3 0.55 0.50 0.52 16419
4 0.53 0.35 0.42 4620
avg / total 0.62 0.64 0.62 78030