KDD Cup 2015: a binary classification task, predicting which students will drop out of a course. I wrote this quite a while ago; putting it in order here as a quick write-up for future reference.
Step 1: preprocessing. Use numpy and pandas to encode every feature as a number. Simple and elegant.
```python
#!/usr/bin/env python
# coding=utf-8
import pickle
import pandas as pd
import numpy as np

source_dict = {'server': 0, 'browser': 1}
event_dict = {"problem": 5, "video": 3, "access": 1, "wiki": 4,
              "discussion": 6, "navigate": 2, "page_close": 0}

def gen_time_dict():
    # map every calendar day of the competition period to an integer index
    rng = pd.date_range('2013-10-27', '2014-08-01')
    time_dict = pd.Series(np.arange(len(rng)), index=rng)
    with open('data/time_dict.csv', 'w') as fw:
        pickle.dump(time_dict, fw)
    return time_dict

def gen_courseid_dict():
    # encode each course_id as an integer
    df = pd.read_csv('data/date.csv', usecols=[0])
    course_map = pd.factorize(df.course_id)[1]
    course_dict = dict(zip(course_map, range(len(course_map))))
    with open('data/course_idTrain2.csv', 'w') as fw:
        pickle.dump(course_dict, fw)
    print "course_dict done"
    return course_dict

def gen_object_dict():
    # encode object ids; objects that only appear in the test log
    # get new codes appended after the training vocabulary
    df = pd.read_csv('data/log_train.csv', usecols=[4])
    obj_map = pd.factorize(df.object)[1]
    obj_dict = dict(zip(obj_map, range(len(obj_map))))
    df2 = pd.read_csv('data/test/log_test.csv', usecols=[4])
    obj_map2 = pd.factorize(df2.object)[1]
    diff = [w for w in obj_map2 if w not in obj_dict]
    obj_dict2 = dict(zip(diff, np.arange(len(obj_map), len(obj_map) + len(diff))))
    obj_dict.update(obj_dict2)
    with open('data/object_pkl.csv', 'w') as fw:
        pickle.dump(obj_dict, fw)
    print "obj_dict done.."
    return obj_dict

def time_map(x):
    # timestamps look like 2014-06-01T09:00:00; keep only the date part
    x = x[:10]
    return time_dict[x]

def obj_map(x):
    return obj_dict[x]

def course_map(x):
    return course_dict[x]

time_dict = gen_time_dict()
course_dict = gen_courseid_dict()
obj_dict = gen_object_dict()

def log_trainData():
    print "read log_train.csv"
    # converters are keyed by column position: 1 = time, 4 = object
    df1 = pd.read_csv('data/log_train.csv', converters={1: time_map, 4: obj_map})
    print df1.head()
    df1.source = df1.source.map(lambda x: source_dict[x])
    df1.event = df1.event.map(lambda x: event_dict[x])
    print df1.head()
    print df1.tail()
    df1.to_csv('data/log_trainData.csv', index=False)

def course_Data():
    # join each enrollment with its course's start/end dates
    df2 = pd.read_csv('data/enrollment_train.csv', usecols=[0, 2],
                      converters={2: course_map})
    df3 = pd.read_csv('data/date.csv',
                      converters={0: course_map, 1: time_map, 2: time_map})
    df4 = pd.merge(df2, df3, on='course_id', how='outer')
    df4 = df4.sort_index(by='enrollment_id')
    print df4.tail(10)
    df4.to_csv("data/course_Trainpkl.csv", index=False)
    df1 = pd.read_csv('data/test/enrollment_test.csv', usecols=[0, 2],
                      converters={2: course_map})
    df4 = pd.merge(df1, df3)
    df4 = df4.sort_index(by='enrollment_id')
    print df4.tail(10)
    df4.to_csv("data/test/course_Testpkl.csv", index=False)

def log_testData():
    print "read log_test.csv"
    df1 = pd.read_csv('data/test/log_test.csv', converters={1: time_map, 4: obj_map})
    print df1.tail(10)
    df1.source = df1.source.map(lambda x: source_dict[x])
    df1.event = df1.event.map(lambda x: event_dict[x])
    print df1.tail(10)
    df1.to_csv('data/test/log_testData.csv', index=False)

log_trainData()
log_testData()
course_Data()
```
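The script above targets Python 2 and 2015-era pandas. As a minimal sketch of the same encoding idea under current pandas (a day-index lookup plus integer codes for the categorical columns), the snippet below uses a tiny made-up log frame; the column names mirror log_train.csv, but the data is toy and the snippet is illustrative, not the competition code:

```python
import pandas as pd
import numpy as np

# toy rows standing in for log_train.csv
log = pd.DataFrame({
    "enrollment_id": [1, 1, 2],
    "time": ["2013-10-27T09:00:00", "2013-10-28T10:00:00", "2013-11-01T12:00:00"],
    "source": ["server", "browser", "server"],
    "event": ["access", "video", "problem"],
})

# map each calendar day of the course period to an integer index
rng = pd.date_range("2013-10-27", "2014-08-01")
time_dict = pd.Series(np.arange(len(rng)), index=rng.strftime("%Y-%m-%d"))
log["time"] = log["time"].str[:10].map(time_dict)

# encode categoricals as integers; factorize assigns codes in order of appearance
log["source"], _ = pd.factorize(log["source"])
log["event"], _ = pd.factorize(log["event"])
print(log)
```

The point is that each categorical-to-integer mapping is built once and reused, so the train and test logs share a consistent encoding; that is why gen_object_dict above extends the training vocabulary with test-only objects instead of factorizing the two files independently.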
Step 2: model and predict with several machine learning methods.
```python
#!/usr/bin/env python
# coding=utf-8
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
import pickle

debug = True
if debug:
    N = 5000
else:
    N = 20000

class DropOutPredict(object):

    course_dict = {}

    def __init__(self):
        print "welcome kdd2015 contest, jkmiao@526588996"
        fr = open("data/coursePkl.pkl")
        self.course_dict = pickle.load(fr)
        fr.close()

    def datestr2num(self, s):
        return pd.to_datetime(s)

    def norm_res(self, x):
        # clip near-certain predictions to hard 0/1
        if x < 0.0001:
            x = 0
        elif x > 0.98:
            x = 1.0
        return x

    def norm_course(self, c):
        return self.course_dict[c]

    def loadTrainData(self):
        df1 = pd.read_csv('./data/log_train.csv', usecols=[0, 2, 3, 4])
        df1.source = pd.factorize(df1.source)[0]
        df1.event = pd.factorize(df1.event)[0]
        df1.object = pd.factorize(df1.object)[0]
        gp = df1.groupby("enrollment_id")
        gp2 = df1.groupby(["enrollment_id", "source"])
        df2 = pd.read_csv('data/enrollment_train.csv', usecols=[2])
        df2.course_id = pd.factorize(df2.course_id)[0]
        df3 = pd.read_csv('data/truth_train.csv', usecols=[1], names=["drop"])
        # one row per enrollment, one column per event type (counts)
        data = df1.pivot_table("source", rows="enrollment_id", cols="event",
                               aggfunc="count", fill_value=0)
        data["browser"] = gp2.event.count().unstack()[0]
        data["server"] = gp2.event.count().unstack()[1]
        # use .values so positional order is kept rather than index alignment
        data["course_id"] = df2.course_id.values
        data["cnt"] = gp.event.count()
        data["std"] = gp.object.std()
        data["var"] = gp.event.var()
        data["mean"] = gp.event.mean()
        data = data.fillna(0)
        print data.head()
        X = data.values
        y = np.ravel(df3["drop"])
        return X, y

    def loadTestData(self):
        df1 = pd.read_csv('data/test/log_test.csv', usecols=[0, 2, 3, 4])
        df1.source = pd.factorize(df1.source)[0]
        df1.event = pd.factorize(df1.event)[0]
        df1.object = pd.factorize(df1.object)[0]
        gp = df1.groupby("enrollment_id")
        gp2 = df1.groupby(["enrollment_id", "source"])
        df2 = pd.read_csv("data/test/enrollment_test.csv", usecols=[2])
        df2.course_id = pd.factorize(df2.course_id)[0]
        data = df1.pivot_table("source", rows="enrollment_id", cols="event",
                               aggfunc="count", fill_value=0)
        data["browser"] = gp2.event.count().unstack()[0]
        data["server"] = gp2.event.count().unstack()[1]
        data["course_id"] = df2.course_id.values
        data["cnt"] = gp.event.count()
        data["std"] = gp.object.std()
        data["var"] = gp.event.var()
        data["mean"] = gp.event.mean()
        data = data.fillna(0)
        print "test data head():...\n", data.head()
        test = data.values
        return test

    def gbdt_clf(self, x_train, x_test, y_train, y_test, test):
        clf = GradientBoostingClassifier(n_estimators=450, learning_rate=0.1,
                                         random_state=20)
        clf.fit(x_train, y_train)
        y_pred = clf.predict_proba(x_test)[:, 1]
        scores = roc_auc_score(y_test, y_pred)
        print "gbdt_clf scores ...\n", scores
        pred = clf.predict_proba(test)[:, 1]
        print pred[:5]
        # save under the fixed name that em_result() reads
        self.saveResult(pred, "data/test/gbdt_res.csv")

    def svc_clf(self, x_train, x_test, y_train, y_test, test):
        tuned_parameters = [{'kernel': ['poly'], 'C': [10, 500, 1200]},
                            {'kernel': ['linear'], 'C': [200, 500, 800]}]
        clf = GridSearchCV(SVC(probability=True), tuned_parameters, cv=5,
                           scoring="roc_auc")
        # clf = svm.SVC(C=2.0, kernel="rbf", probability=True, random_state=42)
        clf.fit(x_train, y_train)
        print "Best parameters set found : "
        print clf.best_params_
        y_pred = clf.predict_proba(x_test)[:, 1]
        scores = roc_auc_score(y_test, y_pred)
        print "svm clf scores...", scores
        pred = clf.predict_proba(test)[:, 1]
        # save under the fixed name that em_result() reads
        self.saveResult(pred, "data/test/svc_res.csv")
        return pred[:5]

    def saveResult(self, pred, fileName):
        enrollment_test = pd.read_csv('./data/test/enrollment_test.csv', usecols=[0])
        enrollment_test['drop'] = pred
        res = enrollment_test[['enrollment_id', 'drop']]
        print "***" * 30
        print res.head()
        res.to_csv(fileName, index=False, header=False)

    def drop_predict(self):
        print "loading train data..."
        X, y = self.loadTrainData()
        x_train, x_test, y_train, y_test = train_test_split(
            X, y, test_size=0.23, random_state=20)
        print "loading test data..."
        test = self.loadTestData()
        print "modeling gbdt_clf..."
        self.gbdt_clf(x_train, x_test, y_train, y_test, test)
        x_train, x_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=28)
        print "modeling svm_clf ..."
        self.svc_clf(x_train, x_test, y_train, y_test, test)

    def em_result(self):
        print "ensemble results..."
        df_gbdt = pd.read_csv("data/test/gbdt_res.csv", header=None,
                              names=["id", "drop1"])
        df_svm = pd.read_csv("data/test/svc_res.csv", header=None,
                             usecols=[1], names=["drop2"])
        # df_knn = pd.read_csv("data/test/knn_res.csv", header=None, usecols=[1], names=["drop3"])
        # df_ex1 = pd.read_csv("data/test/gbdt_clf0.861831055542.csv", header=None, usecols=[1], names=["drop4"])
        # df_ex2 = pd.read_csv("data/test/gbdt_clf0.863249041131.csv", header=None, usecols=[1], names=["drop5"])
        # final result: a weighted average of the individual predictions
        # df = pd.concat([df_gbdt, df_svm, df_knn, df_ex1, df_ex2], axis=1)
        # df["drop"] = df["drop1"]*0.4 + df["drop2"]*0.2 + df["drop3"]*0.2 + df["drop4"]*0.1 + df["drop5"]*0.1
        df = pd.concat([df_gbdt, df_svm], axis=1)
        df["drop"] = df.drop1 * 0.7 + df.drop2 * 0.3
        df["drop"] = map(lambda x: self.norm_res(x), df["drop"])
        print df.head()
        # df.drop(["drop1","drop2","drop3","drop4","drop5"], axis=1, inplace=True)
        df.drop(["drop1", "drop2"], axis=1, inplace=True)
        print df.head()
        df.to_csv("data/test/em_res.csv", header=False, index=False)

if __name__ == '__main__':
    drop = DropOutPredict()
    drop.drop_predict()
    drop.em_result()
    print "done."
    # validation AUC reaches about 0.84
```
Step 3: further feature engineering. With richer extracted features, the AUC approaches 0.89.
```python
#!/usr/bin/env python
# coding=utf-8
import numpy as np
import pandas as pd
import cPickle as pickle
from sklearn import svm
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import scale
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score

def norm(x):
    # clip near-certain probabilities to hard 0/1
    if x < 0.000001:
        x = 0
    elif x > 0.96:
        x = 1
    return x

def last_time(x):
    # activity span in days: last active day minus first active day
    return x.max() - x.min()

def loadTrainData():
    df1 = pd.read_csv('data/log_trainData.csv')
    print df1.head()
    print df1.tail()
    df2 = pd.read_csv('data/truth_train.csv', header=None, usecols=[1], names=["drop"])
    df3 = pd.read_csv('data/course_Trainpkl.csv', usecols=[1, 2, 3])
    gp = df1.groupby("enrollment_id")
    # event counts per enrollment
    data = df1.pivot_table("source", rows='enrollment_id', cols="event",
                           aggfunc='count', fill_value=0)
    # summary statistics of each log column per enrollment
    eventdf = gp.event.describe().unstack()
    timedf = gp.time.describe().unstack()
    timedf = timedf.drop('count', axis=1)
    sourcedf = gp.source.describe().unstack()
    sourcedf = sourcedf.drop(['count', 'min', 'max'], axis=1)
    objectdf = gp.object.describe().unstack()
    objectdf = objectdf.drop(['count'], axis=1)
    # concatenate the feature tables
    data = pd.concat([data, eventdf], axis=1)
    data = pd.concat([data, timedf], axis=1)
    data = pd.concat([data, sourcedf], axis=1)
    data = pd.concat([data, objectdf], axis=1)
    # course features: activity span, course id, course start/end day
    data['dtime'] = gp.time.apply(last_time)
    data["course_id"] = df3["course_id"].values
    data["from"] = df3["from"].values
    data["to"] = df3["to"].values
    # min-max scaling gave no real improvement:
    # X = MinMaxScaler().fit_transform(X)
    print "origin data: "
    print data.tail()
    data = data.fillna(0)
    data.to_csv('data/trainData.csv', index=False)
    X = data.values
    # standardize: remove the mean and scale to unit variance
    X = scale(X)
    # fw = open("data/train/trainData.pkl", 'w')
    # pickle.dump(X, fw)
    y = np.ravel(df2['drop'])
    print "y: ", y[:5]
    return X, y

def loadTestData():
    df1 = pd.read_csv('data/test/log_testData.csv')
    print df1.head()
    df3 = pd.read_csv('data/test/course_Testpkl.csv', usecols=[1, 2, 3])
    gp = df1.groupby("enrollment_id")
    data = df1.pivot_table("source", rows='enrollment_id', cols="event",
                           aggfunc='count', fill_value=0)
    eventdf = gp.event.describe().unstack()
    timedf = gp.time.describe().unstack()
    timedf = timedf.drop('count', axis=1)
    sourcedf = gp.source.describe().unstack()
    sourcedf = sourcedf.drop(['count', 'min', 'max'], axis=1)
    objectdf = gp.object.describe().unstack()
    objectdf = objectdf.drop(['count'], axis=1)
    data = pd.concat([data, eventdf], axis=1)
    data = pd.concat([data, timedf], axis=1)
    data = pd.concat([data, sourcedf], axis=1)
    data = pd.concat([data, objectdf], axis=1)
    data['dtime'] = gp.time.apply(last_time)
    data["course_id"] = df3["course_id"].values
    data["from"] = df3["from"].values
    data["to"] = df3["to"].values
    # data["cnt"] = gp.size()
    # data["eventstd"] = gp.event.std()
    # data['eventmean'] = gp.event.mean()
    # data['eventmdeian'] = gp.event.median()
    # data['equantile0.25'] = gp.event.quantile(0.25)
    # data['equantile0.75'] = gp.event.quantile(0.75)
    # data['equantilemad'] = gp.event.mad()
    print "test data: "
    print data.tail(10)
    data = data.fillna(0)
    # save to file so later runs can read the features directly
    data.to_csv('data/test/testData.csv', index=False)
    # alternatively, pickle it:
    # fw = open("data/test/testData.pkl", 'w')
    # pickle.dump(data, fw)
    test = data.values
    # test = MinMaxScaler().fit_transform(test)
    test = scale(test)
    return test

def svc_clf(x_train, x_test, y_train, y_test, test):
    clf = svm.SVC(kernel='linear', probability=True, random_state=42)
    clf.fit(x_train, y_train)
    y_pred = clf.predict_proba(x_test)[:, 1]
    # roc_auc_score expects the true labels first, otherwise it errors
    scores = roc_auc_score(y_test, y_pred)
    print "svm scores:...", scores
    pred = clf.predict_proba(test)[:, 1]
    saveResult(pred, 'data/test/svc_res.csv')

def lr_clf(x_train, x_test, y_train, y_test, test):
    clf = linear_model.LogisticRegression()
    clf.fit(x_train, y_train)
    y_pred = clf.predict_proba(x_test)[:, 1]
    scores = roc_auc_score(y_test, y_pred)
    print "lr_clf scores: ", scores
    y_pred = map(norm, y_pred)
    score2 = roc_auc_score(y_test, y_pred)
    print "after normalization, score ... ", score2
    pred = clf.predict_proba(test)[:, 1]
    saveResult(pred, 'data/test/lr_res.csv')

def rf_clf(x_train, x_test, y_train, y_test, test):
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(x_train, y_train)
    y_pred = clf.predict_proba(x_test)[:, 1]
    scores = roc_auc_score(y_test, y_pred)
    pred = clf.predict_proba(test)[:, 1]
    print "rf_scores: ", scores
    saveResult(pred, './data/test/rf_res.csv')

def gbdt_clf(x_train, x_test, y_train, y_test, test):
    clf = GradientBoostingClassifier(n_estimators=500)
    clf.fit(x_train, y_train)
    y_pred = clf.predict_proba(x_test)[:, 1]
    scores = roc_auc_score(y_test, y_pred)
    pred = clf.predict_proba(test)[:, 1]
    print "gbdt_clf scores: ", scores
    saveResult(pred, 'data/test/gbdt_clf' + str(scores) + '.csv')

def saveResult(pred, fileName):
    # load the enrollment ids
    df = pd.read_csv('data/test/enrollment_test.csv', usecols=[0])
    # attach the predictions
    df['drop'] = pred
    print df.head()
    # write the submission file
    df.to_csv(fileName, index=False, header=False)

# ensemble the best few results
def em_res():
    df = pd.read_csv("data/test/gbdt_res.csv", header=None, names=["id", "drop"])
    df1 = pd.read_csv("data/test/gbdt_clf0.875919444048.csv", header=None,
                      usecols=[1], names=["drop1"])
    df2 = pd.read_csv("data/test/final_res.csv", header=None, usecols=[1],
                      names=["drop2"])
    df["drop"] = df["drop"] * 0.5 + df1["drop1"] * 0.2 + df2["drop2"] * 0.3
    df.to_csv("data/test/final_res.csv", index=None, header=None)

# read the cached features directly to speed up later runs
def loadPickleTrainData():
    df1 = pd.read_csv('data/trainData.csv')
    print df1.head()
    X = df1.values
    # X = scale(X)
    fr2 = open("data/train/trainLabel.txt")
    y = pickle.load(fr2)
    fr2.close()
    return X, y

def loadPickleTestData():
    df1 = pd.read_csv('data/test/testData.csv')
    test = df1.values
    # test = scale(test)
    return test

def dropPredict():
    # blend the results saved by previous runs
    em_res()
    print "loading train data..."
    X, y = loadPickleTrainData()
    # X, y = loadTrainData()
    print "loading test data... "
    test = loadPickleTestData()
    # test = loadTestData()
    print "\nmodeling lr..."
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.31, random_state=148)
    lr_clf(x_train, x_test, y_train, y_test, test)
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.28, random_state=151)
    print "\nmodeling rf..."
    # rf_clf(x_train, x_test, y_train, y_test, test)
    print "\nmodeling gbdt..."
    gbdt_clf(x_train, x_test, y_train, y_test, test)
    print "\nmodeling svm..."
    svc_clf(x_train, x_test, y_train, y_test, test)

if __name__ == "__main__":
    print "start>>>"
    dropPredict()
    print "done"
```
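As with step 2, the pandas calls above are from the 2015 API: pivot_table's rows=/cols= arguments are now index=/columns=, and describe() on a groupby already returns one row per group, so the unstack() is unnecessary. Here is a rough, untested sketch of the step-3 feature table under modern pandas, where log stands for the encoded log frame produced in step 1:

```python
import pandas as pd

def build_features(log):
    gp = log.groupby("enrollment_id")
    # per-enrollment count of each event type
    data = log.pivot_table(values="source", index="enrollment_id",
                           columns="event", aggfunc="count", fill_value=0)
    # summary statistics (mean, std, quartiles, ...) per enrollment,
    # dropping the redundant columns as in the script above
    stats = pd.concat(
        [gp["event"].describe(),
         gp["time"].describe().drop(columns="count"),
         gp["source"].describe().drop(columns=["count", "min", "max"])],
        axis=1)
    data = pd.concat([data, stats], axis=1)
    # activity span in days: last active day minus first active day
    data["dtime"] = gp["time"].max() - gp["time"].min()
    return data.fillna(0)
```

Duplicate column labels (mean, std, and friends appear once per source column) are harmless when taking .values for the model, but renaming them would make the saved CSV easier to inspect.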