最近直播答題app很熱門,由於之前看過跳一跳的python腳本(非常棒),於是也想寫一個答題的腳本。
https://github.com/huanmsf/cai
思路:
1、截圖
2、文字識別,提取問題和選項(分割後識別準確性會提高)
3、爬取網頁數據,根據規則匹配選項
4、根據選項自動點擊屏幕該位置(應該循環點擊,防止剛好切換到西瓜妹)
5、重復前面步驟
存在的問題:
1、答題時間有限,如果爬取的鏈接多了,還沒解析完時間就到了。爬取的少就缺少分析數據,結果不靠譜。
2、問題和選項需要提取關鍵字匹配
3、可能要試試其他搜索引擎(百度垃圾信息嚴重影響正確率)
目錄:
├── baidu.py
├── cai.png
├── main.py
├── need
│   └── chi_sim.traineddata
├── README
└── screenshot.py
main.py:
from screenshot import pull_screenshot
import time, urllib.request, baidu, os
try:
import Image
except ImportError:
from PIL import Image, ImageDraw
import pytesseract
# Distance from the top of the screen to the question area, as a fraction of
# the screen height; varies with resolution (default tuned for 1920*1080).
top_off_c = 0.15
# Height of the question area, in pixels.
que_h = 300
# Height of a single answer option, in pixels.
ans_h = 170
# Margin skipped on the left (and, for the question, on the right), in pixels.
l_r_off = 40
# Substrings removed from the recognised question text.
que_filter = [".", " "]
# Substrings removed from the recognised answer text.
ans_filter = ["《", "》", " "]
# Questions that have already been processed.
que_list = []
# Placeholder tap coordinates (x1, y1, x2, y2) for options A, B and C.
point_A = point_B = point_C = (0, 0, 0, 0)
# 輔助找到文字區域
def draw():
    """Show cai.png with red guide lines marking the OCR crop regions.

    Debugging aid for locating the text areas. The lines are derived from
    the module-level layout settings (top_off_c, que_h, ans_h, l_r_off), the
    same values main() uses for cropping, so the overlay always reflects the
    regions that are actually recognised.
    """
    img = Image.open('cai.png')
    w, h = img.size
    canvas = ImageDraw.Draw(img)
    question_top = h * top_off_c
    answers_top = question_top + que_h

    # Top and bottom edge of the question area (full width minus margins).
    for y in (question_top, answers_top):
        canvas.line((l_r_off, y, w - l_r_off, y), fill="red")

    # Bottom edge of each of the three answer rows (left 70% of the width).
    for row in (1, 2, 3):
        y = answers_top + ans_h * row
        canvas.line((l_r_off, y, w * 0.7, y), fill="red")

    img.show()
def click(point):
    """Tap the screen through adb.

    point is a 4-item sequence (x1, y1, x2, y2). The tap is sent as an
    `adb shell input swipe` from (x1, y1) to (x2, y2) with a duration
    argument of 1.
    """
    fields = {
        "x1": point[0],
        "y1": point[1],
        "x2": point[2],
        "y2": point[3],
        "duration": 1,
    }
    os.system('adb shell input swipe {x1} {y1} {x2} {y2} {duration}'.format(**fields))
def _clean_text(text, filters):
    """Strip text and delete every substring listed in filters from it."""
    for unwanted in filters:
        text = text.strip().replace(unwanted, "")
    return text


def main():
    """Run the answer loop forever: screenshot, OCR, search, tap.

    Each iteration pulls a screenshot into cai.png, crops the question and
    the three answer rows using the module-level layout settings, recognises
    the text with tesseract, asks baidu.search() for the best option and taps
    it. Questions already present in que_list are skipped, and so is an
    iteration in which no question text was recognised.
    """
    while True:
        print(">>>>>>")
        pull_screenshot()
        img = Image.open('cai.png').convert('L')
        w, h = img.size
        question_top = h * top_off_c
        answers_top = question_top + que_h

        # Copy the chi_sim traineddata file from need/ into
        # /usr/share/tesseract-ocr/4.00/ before running.
        img_q = img.crop((l_r_off, question_top, w - l_r_off, answers_top))
        question = _clean_text(
            pytesseract.image_to_string(img_q, lang='chi_sim'), que_filter)

        ans = []
        points = []
        for row in range(3):
            row_top = answers_top + ans_h * row
            img_a = img.crop((l_r_off, row_top, w * 0.7, row_top + ans_h))
            text = _clean_text(
                pytesseract.image_to_string(img_a, lang='chi_sim'), ans_filter)
            # baidu.search() tests `answer in text`; an empty answer would
            # match everything, so replace it with an unlikely placeholder.
            ans.append(text if text.strip() else "&*&")
            # Tap target: a 20px diagonal stroke ending at the row's vertical
            # centre, one third of the way across the screen.
            centre_y = answers_top + ans_h / 2 * (2 * row + 1)
            points.append((w / 3 - 20, centre_y - 20, w / 3, centre_y))

        print(question)
        print(ans)
        # Nothing recognised (no question on screen) or already answered:
        # do not search and do not tap.
        if not question or question in que_list:
            continue

        index = baidu.search(question, ans)
        # 0 -> A, 1 -> B, anything else -> C.
        click(points[index] if index in (0, 1) else points[2])
        print("index" + str(index))
        que_list.append(question)
# Start the answer loop only when this file is executed as a script.
if __name__ == "__main__":
    main()
baidu.py:
# -*- coding:utf-8 -*-
import http.client
import re
import time
import urllib
import urllib.parse
import urllib.request

import lxml.etree as etree
# 答案積分規則
"""
某個答案首次出現在一篇文章中+10,再次+3
"""
def _parse_html(page):
    """Return an lxml tree for page, or None when it cannot be parsed."""
    try:
        return etree.HTML(page)
    except (etree.LxmlError, ValueError):
        return None


def search(question, ans):
    """Pick the answer that Baidu's top results mention most.

    The question is searched on Baidu and up to the first five result links
    are fetched. Within each fetched page, the first text node containing an
    answer scores +10 for that answer and every further one scores +3.

    Args:
        question: the question text used as the search query.
        ans: list of candidate answer strings.

    Returns:
        The index in ans of the highest-scoring answer, or 0 when no answer
        was found anywhere or the result page could not be parsed.
    """
    cont = {}
    q_url = "http://www.baidu.com/s?word=" + urllib.parse.quote(question)
    top_tree = _parse_html(getdata(q_url))
    if top_tree is None:
        return 0
    url_list = top_tree.xpath('//h3[@class]/a[@data-click]/@href')[0:5]

    # An empty string is contained in every text node; never score it.
    candidates = [a for a in ans if a]

    for url_item in url_list:
        if not url_item.startswith('http'):
            continue
        print(url_item)
        sub_tree = _parse_html(getdata(url_item))
        if sub_tree is None:
            # One unreachable or unparsable page must not discard the scores
            # collected from the other pages.
            continue
        seen_in_page = set()
        for con in sub_tree.xpath('//div/text()|//span/text()|//p/text()'):
            if not con.strip():
                continue
            for a in candidates:
                if a not in con:
                    continue
                cont[a] = cont.get(a, 0) + (3 if a in seen_in_page else 10)
                seen_in_page.add(a)
                print(con)
    print(cont)
    if not cont:
        return 0
    # On a tie the answer that scored first wins.
    best = max(cont, key=cont.get)
    return ans.index(best)
def getdata(url, timeout=3):
    """Fetch url and return its body decoded as UTF-8.

    Args:
        url: address to fetch.
        timeout: seconds to wait on the connection before giving up, so a
            slow site cannot stall the whole lookup.

    Returns:
        The decoded page text (undecodable bytes are dropped). Returns " "
        when the URL is malformed or cannot be opened, and "" when the
        response cannot be read.
    """
    try:
        req = urllib.request.Request(url)
        response = urllib.request.urlopen(req, timeout=timeout)
    except (OSError, ValueError, http.client.HTTPException):
        # OSError covers URLError and socket timeouts; ValueError covers
        # malformed or non-encodable URLs.
        return " "
    with response:
        try:
            return response.read().decode("utf-8", 'ignore')
        except (OSError, http.client.HTTPException):
            return ""
screenshot.py:
# -*- coding: utf-8 -*-
"""
手機屏幕截圖的代碼(參考跳一跳外掛源碼)
"""
import subprocess
import os
import sys
from PIL import Image
# How the screenshot is transferred from the device:
#   3 - read `adb shell screencap -p` output as-is
#   2 - same, replacing b'\r\n' with b'\n'
#   1 - same, replacing b'\r\r\n' with b'\n'
#   0 - save the screenshot on the device, then `adb pull` it
SCREENSHOT_WAY = 3


def pull_screenshot():
    """Capture the phone screen through adb and store it as cai.png.

    The transfer method is selected by the module-level SCREENSHOT_WAY.
    For methods 1-3 the captured bytes are written to cai.png in the current
    directory; for method 0 adb pulls /sdcard/cai.png into it. Any other
    value does nothing.
    """
    if 1 <= SCREENSHOT_WAY <= 3:
        # run() waits for adb to exit and closes the pipe, so repeated calls
        # do not accumulate child processes or open file descriptors.
        result = subprocess.run(
            'adb shell screencap -p',
            shell=True, stdout=subprocess.PIPE)
        binary_screenshot = result.stdout
        if SCREENSHOT_WAY == 2:
            binary_screenshot = binary_screenshot.replace(b'\r\n', b'\n')
        elif SCREENSHOT_WAY == 1:
            binary_screenshot = binary_screenshot.replace(b'\r\r\n', b'\n')
        with open('cai.png', 'wb') as f:
            f.write(binary_screenshot)
    elif SCREENSHOT_WAY == 0:
        os.system('adb shell screencap -p /sdcard/cai.png')
        os.system('adb pull /sdcard/cai.png .')
文字識別依賴安裝:
sudo pip3 install pytesseract
sudo apt-get install tesseract-ocr
初級版本效果:

題外話:
最近在瀏覽FB站看到
文中提到可以提前10秒得到題目(不知是否屬實),由於訪問權限不能看,如有知道怎麼搞的請留言交流下,謝謝
