import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Load the transactions; the 'Class' column is the label (1 = fraud, 0 = normal).
data = pd.read_csv('creditcard.csv')

# Count the rows per class and plot them as a bar chart, ordered by class label.
# Series.value_counts is used because the top-level pd.value_counts helper is
# deprecated in current pandas.
count_classes = data['Class'].value_counts(sort=True).sort_index()
count_classes.plot(kind='bar')
plt.title('Fraud class histogram')
plt.xlabel('Class')
plt.ylabel('Frequency')
from sklearn.preprocessing import StandardScaler
# Standardize 'Amount' so its scale is comparable with the other columns.
# StandardScaler needs 2-D input: select the column as a one-column DataFrame
# (Series.reshape no longer exists in pandas) and flatten the (n, 1) result
# back to 1-D before storing it as a column.
data['normAmount'] = StandardScaler().fit_transform(data[['Amount']]).ravel()

# The raw 'Time' and 'Amount' columns are not used from here on.
data = data.drop(['Time', 'Amount'], axis=1)

# Preview of the first rows; only has a visible effect in an interactive session.
data.head()
# Split the frame into features (every column except 'Class') and the label
# (the 'Class' column, kept as a one-column DataFrame).
# .loc with a boolean column mask replaces the .ix indexer, which was removed
# from pandas.
x = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']
# Class == 1 -> fraudulent transaction, Class == 0 -> normal transaction.
# Build a balanced subset by keeping every fraud row and an equally sized
# random sample of normal rows (random undersampling).

# Number of fraud rows and their index labels.
number_records_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)

# Index labels of all normal rows.
normal_indices = data[data.Class == 0].index

# Draw, without replacement, as many normal rows as there are fraud rows.
# np.random.choice already returns an ndarray, so no extra conversion is needed.
# NOTE: the draw is unseeded, so the sampled subset differs between runs.
random_normal_indices = np.random.choice(
    normal_indices, number_records_fraud, replace=False
)

# Combine both label sets and select those rows. The arrays hold index
# *labels*, so .loc is the matching indexer (.iloc would only be right when
# labels happen to equal positions).
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])
under_sample_data = data.loc[under_sample_indices, :]

# Features / label for the undersampled subset (.ix was removed from pandas).
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
Y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Report the class balance (as fractions of the subset) and the subset size.
print('Percentage of normal transactions:',
      len(under_sample_data[under_sample_data.Class == 0]) / len(under_sample_data))
print('Percentage of fraud transactions:',
      len(under_sample_data[under_sample_data.Class == 1]) / len(under_sample_data))
print('Total number of transactions in resampled data:', len(under_sample_data))
# Hand the data to scikit-learn: split the original (not undersampled) data
# into a training set and a test set.
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module was removed from scikit-learn.
from sklearn.model_selection import train_test_split

# The full feature frame is bound to lowercase `x` earlier in this script
# (there is no `X`); 30% of the rows are held out, with a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)
print('Number transactions train dataset:', len(X_train))
print('Number transactions test dataset:', len(X_test))