Relief 過濾式特征選擇


給定訓練集 {(x_1,y_1),(x_2,y_2),…,(x_m,y_m)}，對每個示例 x_i，Relief 先在 x_i 的同類樣本中尋找其最近鄰 x_{i,nh}（稱為「猜中近鄰」near-hit），再從 x_i 的異類樣本中尋找其最近鄰 x_{i,nm}（稱為「猜錯近鄰」near-miss）。

     代碼如下:

#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 28 20:16:09 2018

@author: jzc
"""
import numpy as np
import csv
from random import randrange
from sklearn import preprocessing
# Number of instances Relief draws at random per run
# (the sampling count "m" in the Relief algorithm).
m=8
def Compute_Distance_Discrete(diff_distance):
    """Return the Euclidean (L2) distance implied by a difference vector.

    `diff_distance` is the element-wise difference between two samples'
    feature vectors; the result is sqrt(sum of squared components) and is
    used when searching for near-hit / near-miss neighbours.
    """
    squared = np.power(diff_distance, 2)
    return np.sqrt(np.sum(squared))
def loadcsv(filename):
    """Load a CSV dataset into (features, labels) numpy arrays.

    Expects a header row followed by numeric rows of the form
    [id, feat_1, ..., feat_k, label]: the first column (an id) is
    dropped, the last column becomes the label vector.

    Parameters
    ----------
    filename : str
        Path to the CSV file.

    Returns
    -------
    features : (n_rows, k) float ndarray
    labels   : (n_rows,) float ndarray
    """
    # BUG FIX: the original left the file handle open; a context manager
    # guarantees it is closed even if parsing raises.
    with open(filename, 'r') as handle:
        data = list(csv.reader(handle))
    # Skip the header (row 0) and convert every field to float.
    rows = np.array([[float(x) for x in row] for row in data[1:]])
    features = rows[:, 1:-1]
    labels = rows[:, -1]
    return features, labels
def Relief(features,labels):
    #初始化
    (n_samples,n_features)=np.shape(features)
    distance = np.zeros((n_samples,n_samples))
    weights = np.zeros(n_features) 
    nearHit= list()
    nearMiss= list()
    distance_sort=list()
    """尋找每個樣本的距離"""
    for i in range(0,n_samples):
        for j in range(0,n_samples):
            diff_distance = features[i]-features[j]
            if i==j:
                distance[i,j]=99999
            else:
                distance[i,j] = Compute_Distance_Discrete(diff_distance)
            
          
    for i in range(0,m):
        one_sample = randrange(0,n_samples,1) #隨機選擇一個樣本
        one_feature = features[one_sample]
        for index in range(n_samples):
            distance_sort.append([distance[one_sample,index],index,labels[index]])
        #從小到大排序
        distance_sort.sort(key = lambda x:x[0])
        """尋找樣本的猜錯近鄰和猜中近鄰"""
        for index in range(n_samples):
            if nearHit ==[] and distance_sort[index][2]==labels[one_sample]:
                nearHit = features[distance_sort[index][1]]
            elif nearMiss==[] and distance_sort[index][2]!=labels[one_sample]:
                nearMiss = features[distance_sort[index][1]]
            elif nearHit!=[] and nearMiss!=[]:
                break;
            else:
                continue;
        sum_nh = list()
        sum_nm =list()
        # 若屬性j離散,Xaj==Xbj 則diff的值為0;否則為1
        for k in range(len(one_feature[:-2])):
            if one_feature[k] != nearHit[k]:
                sum_nh.append(1)
            else:
                sum_nh.append(0)
            if one_feature[k] != nearMiss[k]:
                sum_nm.append(1)
            else:
                sum_nm.append(0)
        #print sum_nh,sum_nm
        #print one_feature[-2:]-nearHit[-2:]
        """若為屬性j為連續, diff(Xaj-Xbj)=|Xaj-Xbj| 並且Xaj,Xbj要歸一化到[0,1]區間"""
        weights[-2:] = weights[-2:]-np.power(one_feature[-2:]-nearHit[-2:],2)
        +np.power(one_feature[-2:]-nearMiss[-2:],2)
        weights[:-2] = weights[:-2]-np.power(sum_nh,2)+np.power(sum_nm,2)
        #print weights/n_samples
    return weights/n_samples
# --- Script entry point -------------------------------------------------
# Path to the watermelon-3.0 dataset (hard-coded; adjust for your machine).
filename = '/Users/jzc/DeepLearning(7.8-)/data/watermelon3_0.csv'
features, labels = loadcsv(filename)
# Relief assumes the continuous columns are normalised to [0, 1]; the
# normalisation call was left commented out in the original:
# features[-2:] = preprocessing.normalize(features[-2:], norm='l2')

# Run Relief several times — sampling is random, so the weights vary.
for _ in range(1, 10):
    result = Relief(features, labels)
    # BUG FIX: `print result` is Python-2-only syntax; the parenthesised
    # single-argument form behaves identically on Python 2 and 3.
    print(result)
    
        
        

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM