Python的requests模塊是個神器,這裡用requests模塊實現模擬登陸:
# coding: utf-8
"""Log in to the USTC teaching MIS site and print the course grade table.

Flow: fetch the login page, download the captcha image, OCR it with
pytesser, post the credentials, save the grades page to a local file, then
parse that file and print the result as a table.

This is a Python 2 script (it uses ``raw_input`` and ``reload(sys)``).
"""
import getpass
import io
import sys
import requests
from bs4 import BeautifulSoup
import re
from pylsy import pylsytable
# Captcha recognition
import os
# NOTE(review): presumably pytesser has to run from its install directory to
# locate tesseract -- confirm. Side effect: every relative path used below
# (c.png, the grades file) resolves under this directory.
os.chdir(r"C:\Python27\Lib\site-packages")
from pytesser import *  # OCR library; supplies Image and image_to_string

login_url = 'http://mis.teach.ustc.edu.cn/userinit.do'
a_url = 'http://mis.teach.ustc.edu.cn/login.do'
pre_url = 'http://mis.teach.ustc.edu.cn/'
grades_url = 'http://mis.teach.ustc.edu.cn/querycjxx.do'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36',
    'Referer': 'http://mis.teach.ustc.edu.cn/userinit.do',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Connection': 'keep-alive',
}

# Form payloads for the POST requests.
pre_data = {'userbz': 's'}
login_data = {
    'userbz': 's',
    'hidjym': '',
}
# NOTE(review): 'chaxun' looks like an already URL-encoded value; requests
# percent-encodes dict form values again when sending -- confirm the server
# accepts it in this form.
grades_data = {
    'xuenian': '',
    'chaxun': '+%B2%E9++%D1%AF+',
    'px': '1',
    'zd': '0',
}


def judging(name, divide=125):
    """Binarize the captcha image at path *name* and return its OCR text.

    divide is the grey-level threshold: levels below it map to 0, all others
    to 1. The default was chosen by experiment and may need tuning.
    """
    # Lookup table for Image.point(): one output value per grey level 0-255.
    table = [0 if level < divide else 1 for level in range(256)]
    image = Image.open(name)
    grey = image.convert('L')
    # Binarize using the threshold table.
    image_text = grey.point(table, '1')
    return image_to_string(image_text)  # recognition rate still needs work


def getGrades(filename):
    """Prompt for credentials, log in and save the grades page to *filename*.

    The captcha image is written to ``c.png`` in the current directory and
    recognised with judging(). The grades page is saved as UTF-8 text.

    Raises RuntimeError when the login page has no captcha image.
    """
    userid = raw_input("name:")
    # getpass keeps the password from being echoed to the terminal.
    password = getpass.getpass("password:")

    s = requests.Session()
    login_r = s.post(login_url, headers=headers, data=pre_data)
    soup = BeautifulSoup(login_r.text, "html.parser")
    captcha = soup.find('img', id='random')
    if captcha is None:
        raise RuntimeError("captcha image (img id='random') not found on the login page")
    img = s.get(pre_url + captcha['src'])
    with open('c.png', 'wb') as f:
        f.write(img.content)
    code = judging('c.png')

    # Build a per-call payload instead of mutating the shared login_data dict.
    payload = dict(login_data, userCode=userid, passWord=password, check=code)
    s.post(a_url, headers=headers, data=payload)
    grades = s.post(grades_url, headers=headers, data=grades_data)

    # Python 2 hack kept from the original. NOTE(review): later implicit
    # unicode -> str conversions (e.g. printing the table in the main block)
    # may depend on UTF-8 being the default encoding -- confirm before removing.
    reload(sys)
    sys.setdefaultencoding('utf8')

    with io.open(filename, 'w', encoding='utf-8') as f:
        f.write(grades.text)


def sousa(filename):
    """Parse the saved grades page and return three parallel lists.

    Returns (courseName, courseGrades, courseGPA), taken from cells 2, 4 and
    6 of every ``<tr class="bg">`` row after the first one. All three lists
    are empty when the page contains no such rows.
    """
    with io.open(filename, encoding='utf-8') as f:
        text = f.read()
    soup = BeautifulSoup(text, "html.parser")
    trs = soup.find_all('tr', class_='bg')
    courseName = []
    courseGrades = []
    courseGPA = []
    # The first matching row is skipped (presumably the header row). Slicing
    # instead of `del trs[0]` avoids an IndexError when no rows matched.
    for course in trs[1:]:
        tds = course.find_all('td', class_='bg')
        courseName.append(tds[2].string)
        courseGrades.append(tds[4].string)
        courseGPA.append(tds[6].string)
    return (courseName, courseGrades, courseGPA)


def writeGrades(filename, source='test.txt'):
    """Write one "name grade gpa" line per course to *filename*.

    source is the saved grades page handed to sousa(); its default matches
    the file the main block saves. The output is written as UTF-8.
    """
    courseName, courseGrades, courseGPA = sousa(source)
    with io.open(filename, 'w', encoding='utf-8') as f:
        for row in zip(courseName, courseGrades, courseGPA):
            f.write(u'%s %s %s \n' % row)


if __name__ == '__main__':
    getGrades('test.txt')
    courseName, courseGrades, courseGPA = sousa('test.txt')
    attributes = ['courseName', 'courseGrades', 'coursePoints']
    table = pylsytable(attributes)
    table.add_data('courseName', courseName)
    table.add_data('courseGrades', courseGrades)
    table.add_data('coursePoints', courseGPA)
    print(table)
利用requests.Session()並構造post指令,具體情況具體分析。
圖像處理用到了PIL、pytesser庫,pytesser調用的tesseract是谷歌的一個用於文字識別(OCR)的開源框架,可用於數字、字母、漢字識別(識別率需要優化)。
相關主要代碼:
image = Image.open(name) image_text = image2.point(table,'1') return image_to_string(image_text)