(1):分析網頁
分析ajax的請求網址和需要的參數。通過不斷向下拉動滾動條,發現請求的參數中offset一直在變化,所以每次通過改變offset來發起新的ajax請求。
(2)上代碼
a、通過ajax請求獲取頁面數據
# Fetch one page of the search-result index via the site's ajax endpoint.
def get_page_index(offset, keyword):
    """Return the raw response body for one index page, or None on failure.

    The query parameters below were captured by inspecting the page's
    ajax requests; `offset` advances by 20 per page of results.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab',
    }
    # urlencode turns the dict into a query string appended to the endpoint.
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        response = requests.get(url, headers=headers)
    except RequestException:
        print('請求索引頁錯誤')
        return None
    # Only a 200 response carries a usable JSON body.
    return response.text if response.status_code == 200 else None
b、分析ajax請求的返回結果,獲取圖片集的url
# Parse the ajax index response and yield each image-set's article URL.
def parse_page_index(html):
    """Yield article URLs from the JSON body returned by the index ajax call.

    Robustness fixes over the original:
    - `html` may be None/empty (get_page_index returns None on failure and the
      caller passes it in unchecked); json.loads(None) would raise TypeError.
    - Malformed JSON no longer propagates a ValueError.
    - Entries without an 'article_url' (or non-dict entries) are skipped
      instead of yielding None, which would crash the detail fetcher.
    """
    if not html:
        return
    try:
        data = json.loads(html)  # parse the returned JSON payload
    except ValueError:  # includes json.JSONDecodeError
        return
    if data and 'data' in data:
        for item in data['data']:
            url = item.get('article_url') if isinstance(item, dict) else None
            if url:
                yield url
c、得到圖集url後獲取圖集的內容
# Download the HTML of a single article detail page.
def get_page_detail(url):
    """Return the page body for *url*, or None on a non-200 status or a
    network error (requests.RequestException).
    """
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Fixed typo in the original message: 頁 was doubled (詳情頁頁錯誤).
        print('詳情頁錯誤', url)
        return None
d、其他看完整代碼
完整代碼:
# -*- coding: utf-8 -*-
# @Author : FELIX
# @Date   : 2018/4/4 12:49
"""Crawl Toutiao '街拍' (street-snap) image galleries.

Pages through the search ajax endpoint, extracts each article's image
URLs, downloads the images into ./imgs (MD5-named, deduplicated) and
stores the metadata in MongoDB.

Fixes over the original:
- parse_page_index no longer crashes when the index request failed
  (html is None) or the payload is malformed.
- save_img creates the ./imgs directory instead of assuming it exists.
- Deprecated/removed pymongo Collection.insert() replaced by insert_one().
- The process pool is closed and joined after map().
- Doubled-character typo in the detail-page error message corrected.
"""
import json
import os
import re
from hashlib import md5
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'
GROUP_START = 1
GROUP_END = 20
KEYWORD = '街拍'

client = pymongo.MongoClient(MONGO_URL)  # connect to MongoDB
db = client[MONGO_DB]  # the database is created lazily on first write

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
}


def get_page_index(offset, keyword):
    """Fetch one index page of search results, or None on failure.

    The query parameters were captured from the page's ajax requests;
    offset advances by 20 per page.
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab',
    }
    # Encode the dict into a query string for the endpoint.
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('請求索引頁錯誤')
        return None


def parse_page_index(html):
    """Yield each article URL from the index ajax JSON.

    Tolerates None/empty html (a failed request), malformed JSON, and
    entries without an 'article_url' key.
    """
    if not html:  # get_page_index returns None on failure
        return
    try:
        data = json.loads(html)
    except ValueError:  # includes json.JSONDecodeError
        return
    if data and 'data' in data:
        for item in data['data']:
            url = item.get('article_url') if isinstance(item, dict) else None
            if url:
                yield url


def get_page_detail(url):
    """Return the detail page body for *url*, or None on error/non-200."""
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Fixed typo: original message doubled the character 頁.
        print('詳情頁錯誤', url)
        return None


def parse_page_detail(html, url):
    """Extract title and image URLs from a detail page, download the images
    and return a metadata dict, or None when nothing could be parsed.

    The gallery data lives in an inline 'articleInfo' JS object, so it is
    pulled out with a regex rather than BeautifulSoup.
    """
    images_pattern = re.compile(
        'articleInfo:.*?title: \'(.*?)\'.*?content.*?\'(.*?)\'', re.S)
    result = re.search(images_pattern, html)
    if result:
        title = result.group(1)
        url_pattern = re.compile('"(http:.*?)"')
        img_url = re.findall(url_pattern, str(result.group(2)))
        if img_url:
            for img in img_url:
                download_img(img)  # fetch and save each image
            data = {
                'title': title,
                'url': url,
                'images': img_url,
            }
            return data


def save_to_mongo(result):
    """Insert one result document into MongoDB; return True on success."""
    if result:
        # insert() was removed in PyMongo 4; insert_one() is the supported
        # API and its InsertOneResult is truthy on success.
        if db[MONGO_TABLE].insert_one(result):
            print('存儲成功', result)
            return True
    return False


def download_img(url):
    """Download one image and hand the bytes to save_img."""
    print('正在下載', url)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            save_img(response.content)
        else:
            return None
    except RequestException:
        print('下載圖片錯誤', url)
        return None


def save_img(content):
    """Write image bytes under ./imgs, MD5-named so duplicates collapse."""
    img_dir = os.path.join(os.getcwd(), 'imgs')
    # Bug fix: the original assumed ./imgs already existed and crashed
    # with FileNotFoundError on a fresh checkout.
    os.makedirs(img_dir, exist_ok=True)
    file_path = '{}/{}.{}'.format(img_dir, md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def main(offset):
    """Process one index page: list articles, parse each, store results."""
    index_html = get_page_index(offset, KEYWORD)
    for url in parse_page_index(index_html):
        # Distinct name so the index page's html isn't shadowed mid-loop.
        detail_html = get_page_detail(url)
        if detail_html:
            result = parse_page_detail(detail_html, url)
            save_to_mongo(result)


if __name__ == '__main__':
    groups = [i * 20 for i in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, groups)
    pool.close()  # bug fix: release the worker processes
    pool.join()