需求:想要實現這樣的功能:用戶輸入喜歡的電影名字,程序即可在電影天堂https://www.ygdy8.com爬取電影所對應的下載鏈接,並將下載鏈接打印出來
遇到的問題:獲取到的磁力鏈接中包含中文,打印出來後出現亂碼
解決辦法:手動指定編碼方式:
# requests reports ISO-8859-1 when the HTTP headers carry no charset for a
# text response, so in that case prefer the charset declared inside the HTML
# itself and only then fall back to the statistically detected encoding.
if res.encoding == 'ISO-8859-1':
    encodings = requests.utils.get_encodings_from_content(res.text)
    encoding = encodings[0] if encodings else res.apparent_encoding
else:
    encoding = res.encoding
# Both res.encoding and res.apparent_encoding can be None; bytes.decode()
# rejects None, so fall back to UTF-8 instead of raising TypeError.
encode_content = res.content.decode(encoding or 'utf-8', 'replace').encode('utf-8', 'replace')
# Goal: given a movie title, search Dytt (https://www.ygdy8.com) for it and
# collect the download link of every matching movie.
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote
# pathname2url is no longer used below (it is a file-system path helper, not a
# query-string encoder); the import is kept so the module's names are unchanged.
from urllib.request import pathname2url

# Browser-like request headers, used to get past the site's anti-crawler checks.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36 OPR/65.0.3467.78 (Edition Baidu)'}

# Seconds to wait for the server before giving up on a request; without a
# timeout requests.get() can block forever.
REQUEST_TIMEOUT = 10


def _decode_response(res):
    """Return the body of *res* as text, correcting a wrong charset guess.

    requests reports ISO-8859-1 when the HTTP headers carry no charset for a
    text response. In that case the charset declared inside the HTML is
    preferred, then the encoding detected from the raw bytes. Undecodable
    bytes are replaced rather than raising.
    """
    if res.encoding == 'ISO-8859-1':
        encodings = requests.utils.get_encodings_from_content(res.text)
        encoding = encodings[0] if encodings else res.apparent_encoding
    else:
        encoding = res.encoding
    try:
        # The detected encoding can be None; bytes.decode() rejects None.
        return res.content.decode(encoding or 'utf-8', 'replace')
    except LookupError:
        # The page declared a charset name Python does not know.
        return res.content.decode('utf-8', 'replace')


def getMovieDownloadLink(filmlink):
    """Fetch one movie detail page and append its download link to the output file.

    The link text is taken from the first <a> inside the first <table> under
    the element with id "Zoom" and appended, one per line, to
    './17-電影天堂磁力.txt'.

    Args:
        filmlink: Absolute URL of the movie detail page.

    Returns:
        None. Failures are reported on stdout and the page is skipped.
    """
    try:
        res = requests.get(filmlink, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        print('電影鏈接:{}請求失敗!'.format(filmlink))
        return
    if res.status_code != 200:
        print('電影鏈接:{}請求失敗!'.format(filmlink))
        return

    # Hand BeautifulSoup already-decoded text: given bytes it would sniff the
    # encoding again and could trust the page's own (now wrong) <meta charset>.
    soup = BeautifulSoup(_decode_response(res), 'html.parser')
    zoom = soup.select_one('#Zoom')
    table = zoom.find('table') if zoom is not None else None
    anchor = table.find('a') if table is not None else None
    if anchor is None:
        print('電影鏈接:{}未找到下載鏈接!'.format(filmlink))
        return

    # Explicit UTF-8 so Chinese link text is written correctly regardless of
    # the platform's default encoding.
    with open('./17-電影天堂磁力.txt', 'a', newline='', encoding='utf-8') as file:
        file.write(anchor.text + '\n')


def main(movie='沉睡魔咒'):
    """Search the site for *movie* and save the download link of every hit.

    Args:
        movie: Movie title to search for. Must be encodable as GBK, because
            the keyword is sent GBK-percent-encoded.

    Returns:
        None. Progress and failures are reported on stdout.
    """
    dyurl = 'https://www.ygdy8.com'
    # quote() percent-encodes the GBK bytes of the title; pathname2url() is
    # meant for local file paths and fails on bytes on some platforms.
    url = 'http://s.ygdy8.com/plus/s0.php?typeid=1&keyword={0}'.format(quote(movie, encoding='gbk'))
    try:
        res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        print('請求失敗!')
        return
    if res.status_code != 200:
        print('請求失敗!')
        return

    soup = BeautifulSoup(_decode_response(res), 'html.parser')
    container = soup.find('div', class_='co_content8')
    listing = container.find('ul') if container is not None else None
    tables = listing.find_all('table') if listing is not None else []
    if not tables:
        print('沒有找到相關的資源,可到站點上搜索 {0}'.format(dyurl))
        return

    for table in tables:
        anchor = table.find('a')
        if anchor is None or not anchor.get('href'):
            # Result table without a usable link; nothing to follow.
            continue
        getMovieDownloadLink(dyurl + anchor['href'])


if __name__ == '__main__':
    main()
結果:
參考:
https://blog.csdn.net/guoxinian/article/details/82978067
http://blog.csdn.net/a491057947/article/details/47292923
http://docs.python-requests.org/en/latest/user/quickstart/#response-content