試了一下爬取今日頭條的組圖。
首先是進入輸入關鍵詞後的索引頁,使用Chrome的開發者工具可以看到這是一個GET請求,且帶有一些查詢參數,於是在這一步應該構造這個GET請求,請求成功則會返回一個JSON數據。
def get_page_index(offset, keyword, timeout=10):
    """Request one search index page from Toutiao and return its body.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search keyword sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` for any other status code or when the request fails with
        a ``RequestException`` (including a timeout).
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        # Without a timeout, requests waits indefinitely on a stalled
        # server. A timeout is raised as a RequestException subclass, so
        # the handler below covers it.
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print("請求索引頁失敗")
        return None
對於上一步返回的數據進行解析,取出需要的article_url字段
def parse_page_index(html):
    """Yield the ``article_url`` of every entry in an index-page response.

    Args:
        html: JSON text of the index page, or ``None`` when the request
            failed.

    Yields:
        Each non-empty ``article_url`` found under the top-level ``data``
        list. Nothing is yielded when ``html`` is missing, is not valid
        JSON, or does not have the expected structure.
    """
    try:
        data = json.loads(html)
    except (TypeError, ValueError):
        # TypeError: html is None (failed request).
        # ValueError: body is not valid JSON (JSONDecodeError subclasses it).
        return
    if not isinstance(data, dict):
        return
    # ``data`` may be absent or null; treat both as "no results".
    for item in data.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        # Entries without an article_url cannot be fetched, so skip them
        # rather than yielding None.
        if article_url:
            yield article_url
對每個組圖的url進行請求:
def get_detail_page(url, timeout=10):
    """Request a gallery detail page and return its body.

    Args:
        url: Address of the detail page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` for any other status code or when the request fails with
        a ``RequestException`` (including a timeout).
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled
        # server. A timeout is raised as a RequestException subclass, so
        # the handler below covers it.
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print("請求詳情頁面出錯")
        return None
進行解析:
def parse_detail_page(html, url):
    """Extract the title and image URLs from a gallery detail page.

    The page is searched for a ``BASE_DATA.galleryInfo = ...;`` assignment.
    Inside it, the ``title:`` field and the ``gallery: JSON.parse("...")``
    payload are pulled out with regular expressions. Every image URL found
    is passed to ``download_image``.

    Args:
        html: Text of the detail page, or ``None`` when the request failed.
        url: Address of the detail page; copied into the result.

    Returns:
        A dict with keys ``title``, ``url`` and ``images`` when the gallery
        payload contains ``sub_images``. ``None`` when the page is missing,
        holds no gallery data, or the gallery JSON cannot be decoded.
        ``title`` is the raw text captured after ``title:`` (``None`` if
        that field is not found).
    """
    if not html:
        return None

    gallery_info = re.search(r'BASE_DATA.galleryInfo = (.*?);', html, re.S)
    if gallery_info is None:
        # Not a gallery page (or the page layout changed).
        return None
    data = gallery_info.group(1)

    title_match = re.search(r'title:(.*?),', data, re.S)
    title = title_match.group(1) if title_match else None

    gallery_match = re.search(r'gallery: JSON.parse\("(.*?)"\)', data)
    if gallery_match is None:
        return None

    # The payload is JSON embedded in a JavaScript string literal; dropping
    # the backslashes removes the literal's escaping so it parses as JSON.
    json_str = re.sub(r'\\{1,2}', '', gallery_match.group(1))
    try:
        data_image = json.loads(json_str)
    except ValueError:
        return None
    if not isinstance(data_image, dict) or 'sub_images' not in data_image:
        return None

    sub_images = data_image.get('sub_images') or []
    images = [item.get('url') for item in sub_images]
    for image in images:
        download_image(image)
    return {"title": title, "url": url, "images": images}
將解析出的組圖信息保存至MongoDB,並下載其中的每張圖片:
def save_to_mongo(result):
    """Store one parsed gallery record in the ``MONGO_TABLE`` collection.

    Args:
        result: The record to insert, or ``None``/empty when parsing
            produced nothing.

    Returns:
        ``True`` after the record has been inserted, ``False`` when there
        was nothing to insert. Errors raised by the driver propagate.
    """
    if not result:
        # Nothing was parsed for this page; inserting None would raise.
        return False
    # Collection.insert() was deprecated in PyMongo 3.0 and removed in 4.0;
    # insert_one() is its replacement for single documents.
    db[MONGO_TABLE].insert_one(result)
    print("存儲到MongoDB成功")
    return True


def download_image(url, timeout=10):
    """Download one image and hand its bytes to ``save_image``.

    Args:
        url: Address of the image.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Always ``None``. A non-200 response is ignored, and a
        ``RequestException`` (including a timeout) is reported on stdout.
    """
    try:
        # Without a timeout, one stalled image would block the whole crawl.
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except RequestException:
        print("下載圖片出錯")
        return None
將圖片保存至本地:
def save_image(content):
    """Write image bytes to the current working directory.

    The file is named after the MD5 hex digest of ``content`` with a
    ``.jpg`` extension, so identical bytes always map to the same path.
    An existing file at that path is left untouched.

    Args:
        content: Raw bytes of the image.
    """
    digest = md5(content).hexdigest()
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), digest, 'jpg')
    if os.path.exists(file_path):
        # Same content was already saved earlier.
        return
    with open(file_path, 'wb') as image_file:
        image_file.write(content)
一部分保存下來的圖片:
!!!!