"""Turn the raw user-feature CSV into a dense integer matrix.

Goal (1): build a mapping so that a multi-valued feature is represented by a
single number, e.g. for the value set {a, b}: a -> 1, b -> 2, "a b" -> 3.

Design notes carried over from the original author:
  * Step one is to extract all feature-value ids.
  * The ids of a multi-valued feature are combined into one value, which is
    what achieves goal (1).
  * Drawback of collapsing a multi-valued feature and then one-hot encoding
    it: if one user has ids 1,2,3 and another has ids 1,2 they are clearly
    similar, but after one-hot encoding they are unrelated. A better
    approach probably exists, e.g. word2vec.
"""
import numpy
import pandas
from sklearn.preprocessing import OneHotEncoder

onehot = OneHotEncoder()

data = pandas.read_csv("userFeature_part_bat.csv")

# All feature columns to encode. "uid" is the unique user identifier and is
# deliberately not treated as a feature.
userfeature = ["age", "carrier", "consumptionAbility", "ct", "education", "gender", "house", "interest1", "interest2", "interest3", "interest4", "interest5", "kw1", "kw2", "kw3", "marriageStatus", "os", "topic1", "topic2", "topic3", "LBS", "appIdAction", "appIdInstall"]


def _encode_feature_value(value):
    """Collapse one raw CSV cell into a single int.

    * integer cell                   -> the integer itself
    * float cell that is NaN         -> 0 (missing-value placeholder)
    * float cell that is not NaN     -> the value truncated to int
    * string of whitespace-separated ids -> the sum of those ids
    """
    if isinstance(value, (int, numpy.integer)):
        return int(value)
    # The original tested `numpy.float`, an alias removed in NumPy 1.24, so
    # reaching that test raised AttributeError. Builtin `float` covers both
    # Python floats and numpy.float64; numpy.floating covers the other widths.
    if isinstance(value, (float, numpy.floating)):
        if numpy.isnan(value):
            # Missing values are filled with 0. This is not a sound
            # imputation and is still to be improved.
            return 0
        # An integer column containing any NaN is read as float64, so a
        # non-NaN float here is a real id and must not be zeroed.
        return int(value)
    # NOTE(review): summing the ids is not collision-free ("1 4" and "2 3"
    # both give 5), so goal (1) is only approximated by this encoding.
    # split() with no argument tolerates repeated or surrounding whitespace.
    return sum(int(token) for token in value.split())


# Pull each column's values out once instead of doing a column lookup plus a
# label lookup for every single cell.
feature_columns = [data[feature].values for feature in userfeature]

userdata = []
for raw_row in zip(*feature_columns):
    feature_li = [_encode_feature_value(value) for value in raw_row]
    print(feature_li)
    userdata.append(feature_li)
userdata = numpy.array(userdata)
print("--------------------------------------------------------------------")
print(userdata)