import requests
from fake_useragent import UserAgent
from lxml import etree
import pymysql


conn = pymysql.connect(host='47.107.190.1', port=3306, user='zhangsan',
                       password='Changeme_123', database='qiubai', charset='utf8')
# Create a cursor
cursor = conn.cursor()

url = 'https://www.qiushibaike.com/text/'

ua = UserAgent()

headers = {
    'User-Agent': ua.random
}

response = requests.get(url=url, headers=headers)
# print(response.text)

# Parse the page
html = etree.HTML(response.text)
# First grab the divs that hold the jokes
divs = html.xpath('//div[contains(@id, "qiushi_tag_")]')
# print(len(divs))
# Loop over each div and pull out the fields we need
for div in divs:
    # Once positioned on an element, extract relative to it -- the leading ./ is required
    img = div.xpath('.//div[contains(@class, "author")]/a[1]/img/@src')[0]
    author = div.xpath('string(.//div[contains(@class, "author")]/a[2])').strip()
    # 'contentHerf' is the literal class name in the site's HTML
    detail_url = 'https://www.qiushibaike.com' + div.xpath('.//a[@class="contentHerf"]/@href')[0]
    # print(detail_url)
    # Visit the detail page and scrape the full joke text
    detail_response = requests.get(url=detail_url, headers=headers)
    detail_html = etree.HTML(detail_response.text)
    content = detail_html.xpath('string(//div[@class="content"])').strip()
    # print(content)

    # Save the scraped fields to the database.
    try:
        # Use bare %s placeholders: pymysql quotes and escapes the values itself,
        # so wrapping them in "" would store extra quote characters
        sql = 'insert into duanzi(img, author, detail_url, content) values(%s, %s, %s, %s)'
        cursor.execute(sql, (img, author, detail_url, content))
        # Commit the transaction
        conn.commit()
    except Exception as e:
        print(e)
        conn.rollback()

cursor.close()
conn.close()
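
# The INSERT above assumes a `duanzi` table already exists in the `qiubai`
# database. A minimal sketch of a matching schema follows; the column names
# come from the script, but the types and lengths are assumptions:
#
#   CREATE TABLE duanzi (
#       id INT PRIMARY KEY AUTO_INCREMENT,
#       img VARCHAR(255),
#       author VARCHAR(64),
#       detail_url VARCHAR(255),
#       content TEXT
#   );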