數據分析與建模,本次嘗試使用C++進行處理,數據在excel中,遂考慮使用Python進行excel轉txt操作,代碼如下:
1 # -*- coding: UTF-8 -*- 2 import sys 3 import json 4 5 import pandas as pd 6 import numpy as np 7 8 filename=r'C:\Users\lenovo\Desktop\that.txt' 9 raw_score = r'C:\Users\lenovo\Desktop\data1.xlsx' 10 #讀取excel保存成txt格式 11 excel_file = pd.read_excel(raw_score) 12 excel_file.to_csv(filename, sep=' ', index=False)
Python聚類分析代碼:
1 import numpy as np 2 import matplotlib.pyplot as plt 3 4 # 加載數據 5 def loadDataSet(fileName): 6 data = np.loadtxt(fileName,delimiter=' ') 7 return data 8 9 # 歐氏距離計算 10 def distEclud(x,y): 11 return np.sqrt(np.sum((x-y)**2)) # 計算歐氏距離 12 13 # 為給定數據集構建一個包含K個隨機質心的集合 14 def randCent(dataSet,k): 15 m,n = dataSet.shape 16 centroids = np.zeros((k,n)) 17 for i in range(k): 18 index = int(np.random.uniform(0,m)) # 19 centroids[i,:] = dataSet[index,:] 20 return centroids 21 22 # k均值聚類 23 def KMeans(dataSet,k): 24 25 m = np.shape(dataSet)[0] #行的數目 26 # 第一列存樣本屬於哪一簇 27 # 第二列存樣本的到簇的中心點的誤差 28 clusterAssment = np.mat(np.zeros((m,2))) 29 clusterChange = True 30 31 # 第1步 初始化centroids 32 centroids = randCent(dataSet,k) 33 while clusterChange: 34 clusterChange = False 35 36 # 遍歷所有的樣本(行數) 37 for i in range(m): 38 minDist = 100000.0 39 minIndex = -1 40 41 # 遍歷所有的質心 42 #第2步 找出最近的質心 43 for j in range(k): 44 # 計算該樣本到質心的歐式距離 45 distance = distEclud(centroids[j,:],dataSet[i,:]) 46 if distance < minDist: 47 minDist = distance 48 minIndex = j 49 # 第 3 步:更新每一行樣本所屬的簇 50 if clusterAssment[i,0] != minIndex: 51 clusterChange = True 52 clusterAssment[i,:] = minIndex,minDist**2 53 #第 4 步:更新質心 54 for j in range(k): 55 pointsInCluster = dataSet[np.nonzero(clusterAssment[:,0].A == j)[0]] # 獲取簇類所有的點 56 centroids[j,:] = np.mean(pointsInCluster,axis=0) # 對矩陣的行求均值 57 58 print("Congratulations,cluster complete!") 59 return centroids,clusterAssment 60 61 def showCluster(dataSet,k,centroids,clusterAssment): 62 m,n = dataSet.shape 63 if n != 2: 64 print("數據不是二維的") 65 return 1 66 67 mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr'] 68 if k > len(mark): 69 print("k值太大了") 70 return 1 71 72 # 繪制所有的樣本 73 for i in range(m): 74 markIndex = int(clusterAssment[i,0]) 75 plt.plot(dataSet[i,0],dataSet[i,1],mark[markIndex]) 76 plt.annotate(i+1,(dataSet[i,0],dataSet[i,1])) 77 78 mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb'] 79 # 繪制質心 80 for i in range(k): 81 print('質心:',str(centroids[i,0])+' '+str(centroids[i,1])) 82 plt.plot(centroids[i,0],centroids[i,1],mark[i]) 83 plt.title('Clustering map of specialists') 84 plt.show() 85 86 87 dataSet = loadDataSet(r'C:\Users\lenovo\Desktop\normal.txt') 88 k = 3 89 centroids,clusterAssment = KMeans(dataSet,k) 90 91 showCluster(dataSet,k,centroids,clusterAssment)
聚類結果:
話說,c++建模也還可,就是需要自己編寫相關指標的算法,但是也挺有意
代碼如下:
1 /* 2 *language:c++ 3 *version:11 4 *encoding:GBK 5 *made by Luo Wenshui 6 *last modified:2020/5/10 7 *data file:input.txt 8 */ 9 #include<iostream> 10 #include<map> 11 #include<math.h> 12 #include<string.h> 13 #include<algorithm> 14 #include<cstdio> 15 using namespace std; 16 inline int read(){ 17 int ans=0,w=1; 18 char ch=getchar(); 19 while(!isdigit(ch)){if(ch=='-')w=-1;ch=getchar();} 20 while(isdigit(ch))ans=(ans<<3)+(ans<<1)+ch-'0',ch=getchar(); 21 return ans*w; 22 } 23 int n,m,t; 24 const int maxn=1200; 25 const int maxm=50; 26 struct node{//每一件參賽作品 27 double final_score; 28 double raw_score[4]; 29 int score_givener[4]; 30 double std_score[4]; 31 int rank;//該作品在所有參賽作品中的排序 32 }a[maxn]; 33 struct specialist{ 34 double raw_score[100]; 35 double std_score[100]; 36 int num_of_work[100];//每次評閱的作品編號 37 int count;//表示專家批閱的項目數量; 38 double std;//該專家打分的標准差 39 double mean;//該專家打分的均值 40 int rank[maxn];//該專家對相應編號作品的打分在該專家所有分數中的排名 41 double delta0i,delta1i; 42 double P0i,P1i;//定義為P0i=1-delta0i,P1i=1-delta1i 43 double P0T,P1T;//專家多次比賽獲得的平均可行度水平,本論文中不涉及 44 int kide;//專家在聚類之后所屬的類別 45 }E[maxm]; 46 struct work{//對所有作品確定排名的排序類 47 int index; 48 double score; 49 }w[maxn]; 50 double R[maxm][maxm];//相似度矩陣 51 double total_mean;//所有專家打分的均值 52 double total_std;//所有專家打分的標准差 53 int kide[100]={3,2,3,3,3,2,3,3,3,1,3,2,2,1,3,3,3,2,3,2,1,3,2,3,2,1,2,3,2,1,3,2,3,3,3,3,2,1,3,3,3,2,2,3,1};//專家類別表,由聚類分析得出 54 double get_std(specialist& s)//獲取每個專家打分的標准差 55 { 56 double mean=0.0; 57 for(int i = 1; i <= s.count; i ++ )//計算均值 58 { 59 mean+=s.raw_score[i]; 60 } 61 mean/=s.count; 62 double ans = 0.0; 63 for(int i = 1; i <= s.count; i ++ ) 64 { 65 ans+=(s.raw_score[i]-mean)*(s.raw_score[i]-mean); 66 } 67 return sqrt(ans/s.count); 68 } 69 double get_mean(specialist& s)//獲取s打分的均值 70 { 71 double ans = 0.0; 72 for(int i = 1; i <= s.count; i ++ ) 73 { 74 ans += s.raw_score[i]; 75 } 76 return ans/s.count; 77 } 78 int get_rank(double score,specialist& s)//獲取某一得分在專家所有打分中的排名 79 { 80 int rank = 0; 81 for(int i = 0; i <= s.count; i ++ ) 82 { 83 if(s.raw_score[i] < score)rank++; 84 } 85 return rank; 86 } 87 double get_delta0i(specialist& Ei)//獲取專家E[i]的delta0i 88 { 89 double ans = 0.0; 90 for(int j = 1; j <= Ei.count; j ++ ) 91 { 92 double final_score = a[Ei.num_of_work[j]].final_score;//專家Ei在評閱的作品j的總評得分 93 ans += fabs(Ei.raw_score[j]-final_score)/final_score; 94 } 95 return ans/(double)Ei.count; 96 } 97 double get_delta1i(specialist& Ei)//獲取專家E[i]的delta1i 98 { 99 double ans = 0.0; 100 int count = Ei.count; 101 for(int j = 1; j <= Ei.count; j ++ ) 102 { 103 double rank = a[Ei.num_of_work[j]].rank;//Ei評閱的第j件作品的總評排序 104 ans += fabs((double)Ei.rank[j]-rank)/(double)(count-1); 105 } 106 return ans/count; 107 } 108 void get_R()//計算定義的相似度矩陣 109 { 110 double M = -1e10; 111 for(int i = 1; i <= 45; i ++ ) 112 { 113 for(int j = 1; j <= 45; j ++ ) 114 { 115 double P0i = E[i].P0i; 116 double P0j = E[j].P0i; 117 double P1i = E[i].P1i; 118 double P1j = E[j].P1i; 119 double tmp = P0i*P0j+P1i*P1j; 120 M = max(M,tmp); 121 if(i == j)R[i][j]=1; 122 else 123 { 124 R[i][j]=tmp; 125 } 126 } 127 } 128 for(int i = 1; i <= 45; i ++ ) 129 { 130 for( int j = 1; j <= 45; j ++ ) 131 { 132 if(i==j)continue; 133 R[i][j] = R[i][j]/M;//映射到(0,1)區間 134 } 135 } 136 } 137 double get_total_mean()//獲得總體均值 138 { 139 double ans = 0.0; 140 for(int i = 1; i <= 1046; i ++ ) 141 { 142 ans += a[i].final_score; 143 } 144 return ans/1046; 145 } 146 double get_total_std()//獲得總體方差 147 { 148 double ans = 0.0; 149 int count = 0; 150 for(int i = 1; i <= 45;i ++ ) 151 { 152 count += E[i].count; 153 for(int j = 1; j <= E[i].count; j ++ ) 154 { 155 int num = E[i].num_of_work[j]; 156 ans += pow(a[num].final_score-E[i].raw_score[j],2); 157 } 158 } 159 return sqrt(ans/count); 160 } 161 void display();//展示專家信息 162 void display_R();//展示相關系數矩陣 163 void display_P();//打印可信度表 164 struct test{ 165 int index; 166 }; 167 168 bool cmp(work& a1,work& a2){return a1.score<a2.score;}//按照得分進行升序排序 169 int main() 170 { 171 freopen("input.txt","r",stdin); 172 // freopen("output.txt","w",stdout); 173 std::ios::sync_with_stdio(false); 174 int number,number_of_work; 175 double tmp; 176 int x; 177 double tmp1,tmp2; 178 for(int i = 1;i <= 1046;i ++)//獲取1046條數據 179 { 180 cin>>tmp; 181 number = (int)tmp; 182 cin>>a[number].final_score; 183 w[i].index = i; 184 w[i].score = a[number].final_score; 185 //獲取三個專家的信息 186 for(int j = 1; j <= 3; j ++ ) 187 { 188 cin>>tmp; 189 number = (int)tmp; 190 cin>>tmp1>>tmp2; 191 192 int count = ++E[number].count; 193 E[number].raw_score[count]=tmp1; 194 E[number].std_score[count]=tmp2; 195 E[number].num_of_work[count]=i;//評閱的作品編號 196 197 a[i].raw_score[j] = tmp1; 198 a[i].score_givener[j] = number; 199 a[i].std_score[j] = tmp2; 200 } 201 } 202 203 for(int i = 1; i <= 45; i ++ ) 204 { 205 E[i].std = get_std(E[i]); 206 E[i].mean = get_mean(E[i]); 207 for(int j = 1; j <= E[i].count; j ++ )//計算在E[i]所打的分中作品Aj的排名 208 { 209 E[i].rank[j] = get_rank(E[i].raw_score[j],E[i]); 210 } 211 } 212 sort(w+1,w+1046+1,cmp);//對作品final_score進行升序排序 213 for(int i = 1; i <=1046; i ++ ) 214 { 215 a[w[i].index].rank=i;//得到每件作品的排名 216 } 217 218 for(int i = 1; i <= 45; i ++ )//獲取每位專家的分值偏差和排序偏差與可信度 219 { 220 E[i].delta0i = get_delta0i(E[i]); 221 E[i].delta1i = get_delta1i(E[i]); 222 E[i].P0i = 1 - E[i].delta0i; 223 E[i].P1i = 1 - E[i].delta1i; 224 } 225 // cout<<get_total_mean()<<" "<<get_total_std()<<endl; 226 // display();//通過display 函數可以展示每一個專家的詳細信息 227 // get_R();//計算相關矩陣 228 // display_R(); 229 // 230 // display_P(); 231 } 232 void display() 233 { 234 cout<<"四元組(a,b,c,d) => 該專家第a次批閱了作品b,給了c分,該作品的標准分是d"<<endl; 235 for(int j = 1; j <= 45; j ++ ) 236 { 237 cout<<"第"<<j<<"個專家:"<<endl; 238 cout<<"總體:"<<E[j].count<<endl; 239 for(int i = 1; i <= E[j].count; i ++ ) 240 { 241 cout<<"("<<i<<","<<E[j].num_of_work[i]<<","<<E[j].raw_score[j]<<","<<E[j].std_score[i]<<")"<<endl; 242 cout<<"該作品在該專家所以給分中的排名:"<<E[1].rank[i]<<endl; 243 } 244 cout<<"標准差:"<<E[j].std<<endl; 245 cout<<"delta0i = "<<E[j].delta0i<<endl; 246 cout<<"delta1i = "<<E[j].delta1i<<endl; 247 248 cout<<"P0i = "<<E[j].P0i<<endl; 249 cout<<"P1i = "<<E[j].P1i<<endl; 250 cout<<"#########################################################"<<endl; 251 } 252 } 253 void display_R() 254 { 255 cout<<" "; 256 for(int i = 1;i <= 45; i ++ ) 257 { 258 cout<<"("<<i<<")"<<" "; 259 } 260 cout<<endl; 261 for(int i = 1; i <= 45; i ++ ) 262 { 263 cout<<"("<<i<<")"<<" "; 264 for(int j = 1; j <= 45; j ++ ) 265 { 266 cout<<R[i][j]<<" "; 267 } 268 cout<<endl; 269 } 270 } 271 void display_P() 272 { 273 cout<<"可信度矩陣"<<endl; 274 for(int i = 1; i <= 45; i ++ ) 275 { 276 cout<<E[i].P0i<<" "<<E[i].P1i<<endl; 277 } 278 }