最近爬了下自如網在深圳地域的租房信息,發現房價是一個很大的問題。
正好也剛看了機器學習實戰這本書,感覺可以試一下寫個圖像識別來針對下這個問題=0=
(其實當時試了好多網上的方法,不知道為啥一張很明顯的數字圖片,就是讀不出來,所以就自己模仿着寫了個)
自如的價格圖片下載下來後類似這種:一張由0-9十個數字組成、300*30大小的png格式圖片

下面的代碼依賴兩個數據庫文件,這裡貼不上來,所以直接跑一定會報錯。文件已經直接扔到github上了。
github:https://github.com/CzaOrz/smallStorage/tree/master/scrapy_shenzhen/ziru
兩個文件:cza_keys.txt、cza_values.txt
import os
from PIL import Image
import numpy as np
import operator
"""img2gsi"""
def img2gsi(img, threshold):
    """Convert a PIL image to grayscale and binarise it.

    Args:
        img: PIL ``Image`` object to process.
        threshold: gray-level cut-off. Pixels strictly brighter than this
            value become 1, all other pixels become 0.

    Returns:
        A new mode-"L" image whose pixel values are 0 or 1.
    """
    gray = img.convert("L")  # grayscale copy
    # point() builds a lookup table from the function (one call per gray
    # level) and applies it to the whole image, replacing the former
    # pixel-by-pixel Python double loop with the same 0/1 result.
    return gray.point(lambda level: 1 if level > threshold else 0)
"""自如圖片由包含0-9的300*30大小的png格式圖片組成,切割下"""
def splitImage(img, rownum, colnum):
    """Cut an image into a ``rownum`` x ``colnum`` grid of equal tiles.

    Args:
        img: PIL ``Image`` to cut.
        rownum: number of tile rows; must be between 1 and the image height.
        colnum: number of tile columns; must be between 1 and the image width.

    Returns:
        A list of float32 numpy arrays, one per tile, ordered left to right
        and then top to bottom. When the grid parameters are invalid a
        message is printed and an empty list is returned, so callers can
        iterate over the result unconditionally.
    """
    w, h = img.size
    # Counts below 1 are rejected as well: a count of 0 would otherwise
    # reach the integer divisions below and raise ZeroDivisionError.
    if not (1 <= rownum <= h and 1 <= colnum <= w):
        print('不合法的行列切割參數!')
        return []

    print('Original image info: %sx%s, %s, %s' % (w, h, img.format, img.mode))
    print('開始處理圖片切割, 請稍候...')
    rowheight = h // rownum
    colwidth = w // colnum
    tiles = []
    for r in range(rownum):
        top = r * rowheight
        for c in range(colnum):
            left = c * colwidth
            # PIL crop box is (left, upper, right, lower).
            box = (left, top, left + colwidth, top + rowheight)
            tiles.append(np.array(img.crop(box), 'f'))
    print('圖片切割完畢,共生成 %s 張小圖片。' % len(tiles))
    return tiles
"""機器學習實戰上寫的函數,直接手動copy"""
def classify0(inX, dataSet, labels, k):
    """Classify one sample with a k-nearest-neighbours majority vote.

    Args:
        inX: feature vector to classify, either 1-D of length ``d`` or of
            shape ``(1, d)``.
        dataSet: training matrix of shape ``(n, d)``, one sample per row.
        labels: sequence of ``n`` labels; ``labels[i]`` belongs to row ``i``.
        k: number of nearest neighbours that vote. Values larger than ``n``
            are clamped to ``n``.

    Returns:
        The label with the most votes among the ``k`` nearest rows
        (Euclidean distance). On a tie, the label that was met first while
        walking the neighbours from nearest to farthest wins.

    Raises:
        IndexError: if no neighbour can vote (empty training set or k < 1).
    """
    # Asking for more neighbours than there are training rows used to index
    # past the end of the sorted distances; vote with every row instead.
    k = min(k, dataSet.shape[0])

    # Broadcasting subtracts the sample from every training row at once.
    diffMat = np.asarray(inX) - dataSet
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    nearest = distances.argsort()  # row indices, closest first

    votes = {}
    for i in range(k):
        label = labels[nearest[i]]
        votes[label] = votes.get(label, 0) + 1

    # sorted() is stable, so equal counts keep their first-seen order.
    ranked = sorted(votes.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]
def img2vector(data, size=900):
    """Flatten rows of pixel values into a ``(1, size)`` float row vector.

    Args:
        data: iterable of rows, each an iterable of numeric pixel values
            (for example a 30x30 array produced by ``splitImage``).
        size: length of the returned vector. Defaults to 900, i.e. one
            30x30 tile.

    Returns:
        A float64 numpy array of shape ``(1, size)`` holding the values in
        row-major order. If ``data`` has fewer than ``size`` values the
        remaining entries stay 0.

    Raises:
        IndexError: if ``data`` holds more than ``size`` values.
    """
    flat = [gsi for row in data for gsi in row]
    if len(flat) > size:
        raise IndexError(
            'img2vector got %d values but the vector only holds %d'
            % (len(flat), size)
        )
    returnVect = np.zeros((1, size))
    # One bulk assignment instead of writing element by element.
    returnVect[0, :len(flat)] = flat
    return returnVect
def handwritingClassTest(testData):
    """Classify every digit tile in ``testData`` against the training set.

    The training set is read from two files in the current working
    directory: ``cza_values.txt`` (loaded with ``np.loadtxt``) and
    ``cza_keys.txt`` (first line: one digit character per training sample).

    Args:
        testData: iterable of 2-D pixel arrays, one per digit tile.

    Returns:
        A list with the predicted label for each tile, in input order.
    """
    base = os.getcwd()
    # Load the training database (not bundled with the post; see the repo).
    trainingMat = np.loadtxt(os.path.join(base, 'cza_values.txt'))
    with open(os.path.join(base, 'cza_keys.txt'), 'r') as f_r:
        label_line = f_r.readline()
    # readline() keeps the trailing newline, and int('\n') raises
    # ValueError, so strip surrounding whitespace before parsing the digits.
    hwLabels = [int(ch) for ch in label_line.strip()]
    print('read db done')

    results = []
    for sample in testData:
        vectorUnderTest = img2vector(sample)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        results.append(classifierResult)
        print('trainingMat_resylt_is ', classifierResult)
    return results
def img2num(picture):
    """Recognise the ten digits shown in a price image.

    Args:
        picture: path (or file object) of the image, as accepted by
            ``Image.open``.

    Returns:
        A list with the predicted digit for each of the 10 tiles, left to
        right.
    """
    # The context manager releases the file handle even if the conversion
    # fails; img2gsi returns a separate image object, so it is still usable
    # after the source file has been closed.
    with Image.open(picture) as source:
        binary = img2gsi(source, 140)
    testData = splitImage(binary, 1, 10)  # one row, ten digit tiles
    print('start handel')
    result = handwritingClassTest(testData)
    return result
if __name__ == '__main__':
    # Demo run: recognise the digits in the two sample images, in order.
    for sample_png in ('123.png', '456.png'):
        img2num(sample_png)
最後結果類似這種:只針對自如的這種圖片,識別成功率可以達到100%;其他類型的圖片就不談了,bug無解。
菜鳥一個,剛學不久,不會的還是太多了QAQ

