""" 使用Requests庫完成Post表單操作 """ #_*_codingn:utf8 _*_ import requests from bs4 import BeautifulSoup ''' 設置請求頭,讓程序發出的請求更像來源於瀏覽器 ''' headers = { "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"} if __name__ == "__main__": params ={"username": "anything","password": "password"} session =requests.session() post_obj = session.post("http://pythonscraping.com/pages/cookies/welcome.php", params) s = session.get("http://pythonscraping.com/pages/cookies/profile.php") print(post_obj.text.encode("utf-8")) print(s.text.encode("utf-8")) #session.cookies.get_dict() #獲取cooking print(session.cookies.get_dict())
# -*- coding: utf-8 -*-
'''
Maoyan top-100 movie board: analyze the target site and page structure, then get to work:
1. Fetch a single page
2. Extract the fields with a regular expression
3. Save the results as JSON lines
4. Loop over all pages (optionally with multiple processes)
'''
# .* is greedy: it first matches as much as it can, then backtracks as the
#   rest of the pattern requires.
# .*? is non-greedy: it matches as little as possible and moves on, so it
#   has minimal-match behavior and needs no backtracking.
# re.S makes . match newline characters as well.
# ----------------------------------
import json
import re
import time
from multiprocessing import Pool

import requests
from requests.exceptions import RequestException

headers = {
    # Important: without browser-like headers, Maoyan may refuse the request.
    'Accept-Language': 'en-US,en;q=0.8',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
    'Connection': 'keep-alive',
    'Referer': 'http://maoyan.com/board/6'
}


def get_one_page(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None  # non-200 response
    except RequestException:
        return None


def parse_one_page(html):
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {  # makes this function a generator
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],  # drop the "主演:" prefix
            'time': item[4].strip()[5:],   # drop the "上映时间:" prefix
            'score': item[5] + item[6]     # integer and fraction parts matched separately
        }


def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:
        # json.dumps escapes non-ASCII by default; ensure_ascii=False keeps the Chinese text.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)  # returns the page HTML, or None on failure
    if html is None:
        return
    for item in parse_one_page(html):
        write_to_file(item)


if __name__ == '__main__':
    for i in range(10):
        main(offset=i * 10)
        time.sleep(1)
    # Alternative: a process pool
    # pool = Pool()
    # pool.map(main, [i * 10 for i in range(10)])
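A quick illustration of the greedy/non-greedy distinction and of re.S described in the comments above (the HTML snippets here are made up for the demo):

import re

html = '<dd><i>1</i></dd><dd><i>2</i></dd>'

# Greedy: .* runs to the last </dd>, so one match swallows both entries.
print(re.findall('<dd>(.*)</dd>', html))    # ['<i>1</i></dd><dd><i>2</i>']

# Non-greedy: .*? stops at the first </dd>, giving one match per entry.
print(re.findall('<dd>(.*?)</dd>', html))   # ['<i>1</i>', '<i>2</i>']

# Without re.S, . does not match "\n"; with re.S the pattern spans lines.
multiline = '<dd>\n<i>3</i>\n</dd>'
print(re.findall('<dd>(.*?)</dd>', multiline))        # []
print(re.findall('<dd>(.*?)</dd>', multiline, re.S))  # ['\n<i>3</i>\n']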
# coding=utf-8
'''
Toutiao "街拍" (street photography) search:
1. Fetch the index (search) pages
2. Fetch each detail page
3. Download the images and save the data
4. Loop, optionally with multiple threads
'''
import requests
from requests.exceptions import RequestException
from json import loads
from bs4 import BeautifulSoup

user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
headers = {"User-Agent": user_agent}


def get_onepage_index(i, keywords):
    data = {
        "offset": i,
        "format": "json",
        "keyword": keywords,
        "autoload": "true",
        "count": "20",
        "cur_tab": "1",
        "from": "search_tab"
    }
    url = 'https://www.toutiao.com/search_content/?'
    try:
        response = requests.get(url, params=data)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('something is wrong!')
        return None


def parse_onepage_index(html):
    # json.loads turns a JSON string into a dict.
    data = loads(html)
    if data and 'data' in data.keys():
        for item in data.get('data'):
            # dict.get returns the value for the key, or None if it is missing.
            yield item.get('article_url')


def get_page_detail(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('wrong url:', url)
        return None


def parsepage(html):
    soup = BeautifulSoup(html, 'lxml')
    title = soup.title.string
    print(title)


def main():
    for i in range(1, 2):
        i = str(i * 20)
        html = get_onepage_index(i, '街拍')
        if html is None:
            continue
        for url in parse_onepage_index(html):
            print(url)
            detailhtml = get_page_detail(url)  # returns the detail page HTML
            if detailhtml is not None:
                parsepage(detailhtml)  # parse with BeautifulSoup
    # get_page_detail('http://toutiao.com/group/6596305324645286404/')


if __name__ == '__main__':
    main()
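Step 3 of the plan (downloading images) is not implemented above. Here is a minimal sketch of that step, assuming an image URL has already been extracted from a detail page; the helper name and MD5-based file naming are illustrative choices, not from the original script:

import os
from hashlib import md5

import requests


def download_image(url, folder='images'):
    # Fetch one image and save it, naming the file by the MD5 of its bytes
    # so the same image is written only once. (Illustrative helper.)
    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        return None
    os.makedirs(folder, exist_ok=True)
    file_path = os.path.join(folder, md5(response.content).hexdigest() + '.jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(response.content)
    return file_path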
If you have any questions, please leave a comment.
If you found this helpful, please give it a like. Thanks!