Python_sklearn機器學習庫學習筆記(三)logistic regression(邏輯回歸)


# 邏輯回歸

## 邏輯回歸處理二元分類

%matplotlib inline
import matplotlib.pyplot as plt
#顯示中文
from matplotlib.font_manager import FontProperties
font=FontProperties(fname=r"c:\windows\fonts\msyh.ttc", size=10)
import numpy as np
plt.figure()
plt.axis([-6,6,0,1])
plt.grid(True)
X=np.arange(-6,6,0.1)
y=1/(1+np.e**(-X))
plt.plot(X,y,'b-')

## 垃圾郵件分類

import pandas as pd

# Read the tab-separated SMS corpus. The file has no header row, so the
# columns keep their integer positions (0, 1) as names.
df = pd.read_csv('SMSSpamCollection', sep='\t', header=None)
df.head()


from sklearn.feature_extraction.text import TfidfVectorizer
# Use the public import path: the private ``sklearn.linear_model.logistic``
# module is no longer importable on current scikit-learn releases.
from sklearn.linear_model import LogisticRegression
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn that only ships the legacy module
    from sklearn.cross_validation import train_test_split

# Split the data loaded with pandas into a training set (75%) and a test set
# (25%); column 1 is passed as the features and column 0 as the target.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(df[1], df[0])

# Build a TfidfVectorizer to compute TF-IDF weights. It is fitted on the
# training messages only and then reused to transform the test messages.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

# LogisticRegression also implements the fit() and predict() methods.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

# Show the last five predictions next to the messages they belong to.
# Both sequences are sliced with the same [-5:] window so that each prediction
# is paired with its own message (indexing the messages with 0..4 would pair
# the last predictions with the first test messages).
for prediction, message in zip(predictions[-5:], X_test_raw.iloc[-5:]):
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('預測類型:%s.信息:%s' % (prediction, message))

輸出結果:

預測類型:ham.信息:Waiting in e car 4 my mum lor. U leh? Reach home already?
預測類型:ham.信息:Dear got train and seat mine lower seat
預測類型:spam.信息:I just really need shit before tomorrow and I know you won't be awake before like 6
預測類型:ham.信息:What should i eat fo lunch senor
預測類型:ham.信息:645

## 二元分類效果評估方法

# Confusion matrix
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# Toy ground-truth labels and predictions used only for this illustration.
y_test = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
y_pred = [0, 1, 0, 0, 0, 0, 0, 1, 1, 1]

# Store the result under its own name: rebinding ``confusion_matrix`` would
# shadow the imported function and make a second run of this cell fail with
# "'numpy.ndarray' object is not callable".
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Render the matrix as an image; y axis is the actual class, x axis the
# predicted class.
plt.matshow(cm)
plt.title(u'混淆矩陣')
plt.colorbar()
plt.ylabel(u'實際類型')
plt.xlabel(u'預測類型')
plt.show()

## 准確率

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# Public import path; ``sklearn.linear_model.logistic`` is a private module
# that current scikit-learn releases no longer expose.
from sklearn.linear_model import LogisticRegression
try:
    from sklearn.model_selection import train_test_split, cross_val_score
except ImportError:  # older scikit-learn that only ships the legacy module
    from sklearn.cross_validation import train_test_split, cross_val_score

# Reload the corpus, this time naming the two columns explicitly.
df = pd.read_csv('SMSSpamCollection', delimiter='\t', names=["label", "message"])
X_train_raw, X_test_raw, y_train, y_test = train_test_split(df['message'], df['label'])

# TF-IDF features: fitted on the training messages, reused for the test set.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# 5-fold cross-validation on the training set; the default scoring for a
# classifier is accuracy.
scores = cross_val_score(classifier, X_train, y_train, cv=5)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('准確率 %s %s' % (np.mean(scores), scores))

輸出結果:

准確率 0.954292731612 [ 0.96057348  0.96052632  0.94617225  0.95808383  0.94610778]

 ## 精確率和召回率

scikit-learn結合真實類型數據,提供了一個函數來計算一組預測值的精確率和召回率。

%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.cross_validation import train_test_split, cross_val_score

df['label']=pd.factorize(df['label'])[0]
X_train_raw, X_test_raw, y_train, y_test = train_test_split(df['message'],df['label'])
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
precisions = cross_val_score(classifier, X_train, y_train, cv=5, scoring=
'precision')
print u'精確率:', np.mean(precisions), precisions
recalls = cross_val_score(classifier, X_train, y_train, cv=5, scoring='recall')
print u'召回率:', np.mean(recalls), recalls
plt.scatter(recalls, precisions)

輸出結果:

精確率: 0.990243902439 [ 1.          0.95121951  1.          1.          1.        ]
召回率: 0.691498103666 [ 0.65486726  0.69026549  0.69911504  0.71681416  0.69642857]

 

## 計算綜合評價指標

# Imported here so the cell does not rely on an earlier cell for these names.
import numpy as np
try:
    from sklearn.model_selection import cross_val_score
except ImportError:  # older scikit-learn that only ships the legacy module
    from sklearn.cross_validation import cross_val_score

# Per-fold F1 score from 5-fold cross-validation on the training set.
fls = cross_val_score(classifier, X_train, y_train, cv=5, scoring='f1')
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('綜合指標評價 %s %s' % (np.mean(fls), fls))

輸出結果:

綜合指標評價 0.791683999687 [ 0.76243094  0.79781421  0.8         0.77094972  0.82722513]

## ROC AUC
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
# Public import path; ``sklearn.linear_model.logistic`` is a private module
# that current scikit-learn releases no longer expose.
from sklearn.linear_model import LogisticRegression
try:
    from sklearn.model_selection import train_test_split, cross_val_score
except ImportError:  # older scikit-learn that only ships the legacy module
    from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import roc_curve, auc


# Replace the labels with integer codes, numbered in order of first appearance.
df['label'] = pd.factorize(df['label'])[0]
X_train_raw, X_test_raw, y_train, y_test = train_test_split(df['message'], df['label'])

# TF-IDF features: fitted on the training messages, reused for the test set.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Probability of each class; column 1 is used as the score for the ROC curve.
predictions = classifier.predict_proba(X_test)
false_positive_rate, recall, thresholds = roc_curve(y_test, predictions[:, 1])
roc_auc = auc(false_positive_rate, recall)

# ROC curve in blue, with the diagonal drawn as a dashed red reference line.
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, recall, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.ylabel('Recall')
plt.xlabel('Fall-out')
plt.show()

 

## 網格搜索

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# Public import path; ``sklearn.linear_model.logistic`` is a private module
# that current scikit-learn releases no longer expose.
from sklearn.linear_model import LogisticRegression
try:
    from sklearn.model_selection import GridSearchCV, train_test_split
except ImportError:  # older scikit-learn that only ships the legacy modules
    from sklearn.grid_search import GridSearchCV
    from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, accuracy_score

# Vectorizer and classifier chained so the grid search can tune both at once.
# The solver is pinned to 'liblinear' because the grid below tries both the
# 'l1' and 'l2' penalties; the default solver of current scikit-learn
# releases does not accept 'l1'.
pipeline = Pipeline([
    ('vect', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(solver='liblinear')),
])

# Candidate values, addressed as <pipeline step>__<parameter name>.
parameters = {
    'vect__max_df': (0.25, 0.5, 0.75),
    'vect__stop_words': ('english', None),
    'vect__max_features': (2500, 5000, 10000, None),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__use_idf': (True, False),
    'vect__norm': ('l1', 'l2'),
    'clf__penalty': ('l1', 'l2'),
    'clf__C': (0.01, 0.1, 1, 10),
}

# Exhaustive search over the grid, scored by accuracy with 3-fold
# cross-validation, using all available CPU cores (n_jobs=-1).
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy', cv=3)
df = pd.read_csv('SMSSpamCollection', delimiter='\t', names=["label", "message"])
# Replace the labels with integer codes, numbered in order of first appearance.
df['label'] = pd.factorize(df['label'])[0]

# The raw messages are passed straight in; the pipeline vectorizes them itself.
X_train, X_test, y_train, y_test = train_test_split(df['message'], df['label'])
grid_search.fit(X_train, y_train)
print('最佳效果:%0.3f' % grid_search.best_score_)

輸出結果:

最佳效果:0.986

# Single-argument print(...) is used throughout so the cell runs unchanged on
# both Python 2 and Python 3.
print('最優參數組合')
best_parameters = grid_search.best_estimator_.get_params()
# Report only the parameters that were part of the search grid.
for param_name in sorted(parameters):
    print('\t%s:%r' % (param_name, best_parameters[param_name]))

# Evaluate the refitted best estimator on the held-out test set.
predictions = grid_search.predict(X_test)
print('准確率: %s' % accuracy_score(y_test, predictions))
print('精確率: %s' % precision_score(y_test, predictions))
print('召回率: %s' % recall_score(y_test, predictions))

輸出結果:

clf__C:10
clf__penalty:'l2'
vect__max_df:0.25
vect__max_features:2500
vect__ngram_range:(1, 2)
vect__norm:'l2'
vect__stop_words:None
vect__use_idf:True
准確率: 0.979899497487
精確率: 0.974683544304
召回率: 0.865168539326

# logistic 多分類

import pandas as pd

# Load the tab-separated training data; the first row holds the column names.
df = pd.read_csv("logistic_data/train.tsv", header=0, delimiter='\t')

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(df.count())
print(df.head())

# Quick look at the phrases and at the distribution of the Sentiment column
# (absolute counts, then counts as a fraction of all rows).
df.Phrase.head(10)
df.Sentiment.describe()
df.Sentiment.value_counts()
df.Sentiment.value_counts() / df.Sentiment.count()
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# Public import path; ``sklearn.linear_model.logistic`` is a private module
# that current scikit-learn releases no longer expose.
from sklearn.linear_model import LogisticRegression
try:
    from sklearn.model_selection import GridSearchCV, train_test_split
except ImportError:  # older scikit-learn that only ships the legacy modules
    from sklearn.grid_search import GridSearchCV
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline

# Vectorizer and classifier chained so the grid search can tune both at once.
pipeline = Pipeline([
    ('vect', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression()),
])

# Candidate values, addressed as <pipeline step>__<parameter name>.
parameters = {
    'vect__max_df': (0.25, 0.5),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__use_idf': (True, False),
    'clf__C': (0.1, 1, 10),
}

df = pd.read_csv("logistic_data/train.tsv", header=0, delimiter='\t')
# ``.values`` yields the underlying NumPy array; ``Series.as_matrix()`` has
# been removed from pandas.
X, y = df.Phrase, df.Sentiment.values
# Half of the rows are used for training, the other half for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)

# cv=3 is spelled out so the search keeps using 3 folds regardless of the
# library's default number of folds.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring="accuracy", cv=3)
grid_search.fit(X_train, y_train)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(u'最佳效果:%0.3f' % grid_search.best_score_)
print(u'最優參數組合:')
best_parameters = grid_search.best_estimator_.get_params()
# Report only the parameters that were part of the search grid.
for param_name in sorted(parameters):
    print('\t%s:%r' % (param_name, best_parameters[param_name]))

輸出結果:

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[Parallel(n_jobs=-1)]: Done 46 tasks | elapsed: 2.0min
[Parallel(n_jobs=-1)]: Done 72 out of 72 | elapsed: 4.5min finished
最佳效果:0.619
最優參數組合:
clf__C:10
vect__max_df:0.25
vect__ngram_range:(1, 2)
vect__use_idf:False

 ## 多類分類效果評估

# Evaluate the refitted best estimator on the held-out test set.
predictions = grid_search.predict(X_test)
# Single-argument print(...) behaves the same on Python 2 and Python 3; the
# values are wrapped in one-element tuples so arrays are formatted as a whole.
print(u'准確率 %s' % (accuracy_score(y_test, predictions),))
print(u'混淆矩陣 %s' % (confusion_matrix(y_test, predictions),))
print(u'分類報告 %s' % (classification_report(y_test, predictions),))

輸出結果:

准確率 0.636614122773
混淆矩陣 [[ 1133  1712   595    67     1]
[  919  6136  6006   553    35]
[  213  3212 32637  3634   138]
[   22   420  6548  8155  1274]
[    4    45   546  2411  1614]]
分類報告              precision    recall  f1-score   support

          0       0.49      0.32      0.39      3508
          1       0.53      0.45      0.49     13649
          2       0.70      0.82      0.76     39834
          3       0.55      0.50      0.52     16419
          4       0.53      0.35      0.42      4620

avg / total       0.62      0.64      0.62     78030

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM