Note: every cookie in the code below must be replaced with your own cookie.
1. Exporting the data to a text file
1.1 Test code
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import time

headers = {
    'cookie': '',  # replace with your own cookie
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}

def get_page(url):
    try:
        html = requests.get(url, headers=headers, timeout=5)
        if html.status_code == 200:
            print('Request succeeded')
            return html.text
        else:  # this else branch is not strictly necessary
            return None
    except RequestException:
        print('Request failed')

def parse_page(html):
    html = BeautifulSoup(html, "html.parser")
    titles = html.find_all("h2", {'class': 'HotItem-title'})[:10]
    links = html.find_all('div', {"class": "HotItem-content"})[:10]
    hot_values = html.find_all('div', {"class": "HotItem-content"})[:10]
    texts = html.find_all('div', {"class": "HotItem-content"})[:10]
    return titles, links, hot_values, texts

def store_data(titles, links, hot_values, texts):
    # encoding='utf-8' keeps the Chinese titles intact regardless of the OS locale
    with open('熱榜測試.txt', 'a', encoding='utf-8') as f:
        f.write('+' * 80 + '\n')
        f.write(time.asctime().center(80) + '\n')
        f.write('+' * 80 + '\n' * 2)
        index = 1
        for title, link, hot_value, text in zip(titles, links, hot_values, texts):
            print(title.get_text(), '\n', link.a.attrs['href'], '\n',
                  hot_value.div.get_text().replace('\u200b', ''))
            f.write(str(index) + ': ' + title.get_text() + '\n')
            f.write('   ' + link.a.attrs['href'] + '\n')
            f.write('   ' + hot_value.div.get_text().replace('\u200b', ' ') + '\n')  # '\u200b' is an invisible zero-width space
            if text.p is None:
                f.write('   ' + 'None: the page shows no excerpt for this entry' + '\n')
            else:
                f.write('   ' + text.p.get_text() + '\n')
            f.write('\n')
            index += 1

if __name__ == '__main__':
    input_url = 'https://www.zhihu.com/hot'
    a, b, c, d = parse_page(get_page(input_url))
    store_data(a, b, c, d)
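Incidentally, links, hot_values, and texts are three identical queries against the same HotItem-content divs, so a single find_all would do. A minimal deduplicated sketch (same selectors; parse_page_compact is a hypothetical name, purely a readability variant):

from bs4 import BeautifulSoup

def parse_page_compact(html):
    # Query each selector once; the content divs double as link, hot-value, and excerpt sources.
    soup = BeautifulSoup(html, "html.parser")
    titles = soup.find_all("h2", {'class': 'HotItem-title'})[:10]
    contents = soup.find_all('div', {"class": "HotItem-content"})[:10]
    return titles, contents, contents, contents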
1.2 Screenshot of the result
2. Exporting the data to Excel
2.1 Sample test code
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import openpyxl
from requests.exceptions import RequestException
import re
import datetime

def get_page(url):
    try:
        html = requests.get(url)
        if html.status_code == 200:
            return html.text
        else:
            return None
    except RequestException:
        print('Request failed')

def parse_page(html):
    html = BeautifulSoup(html, 'html.parser')
    topic_items = html.find_all('div', {"class": "topic-name"})
    topic_values = html.find_all('span', {"class": "topic-num"})
    topic_statements = html.find_all('div', {"class": "topic-info"})
    topic_imgs = html.find_all('li', {"class": "topic-top-item"})
    return topic_items, topic_values, topic_statements, topic_imgs

def store_data(topic_items, topic_values, topic_statements, topic_imgs):
    regex = re.compile(r'\d+(\.\d+)?')  # pulls the number out of strings like '12.3萬'
    wb = openpyxl.load_workbook('貼吧熱榜Excel.xlsx')  # the workbook must already exist
    sheet = wb['Sheet1']
    sheet.freeze_panes = 'A2'  # keep the header row visible while scrolling
    for item, value, statement, img in zip(topic_items, topic_values, topic_statements, topic_imgs):
        print(item.a.get_text(), '\n', item.a['href'], '\n',
              float(regex.search(value.get_text()).group()), '萬')
        sheet.append([item.a.get_text(),
                      item.a['href'],
                      float(regex.search(value.get_text()).group()),
                      statement.p.get_text(),
                      img.img['src'],
                      datetime.datetime.now()])
    wb.save('貼吧熱榜Excel.xlsx')

def main(url):
    html = get_page(url)
    topic_items, topic_values, topic_statements, topic_imgs = parse_page(html)
    store_data(topic_items, topic_values, topic_statements, topic_imgs)

if __name__ == '__main__':
    input_url = 'http://tieba.baidu.com/hottopic/browse/topicList?res_type=1&red_tag=i0626384809'
    main(input_url)
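Note that load_workbook raises FileNotFoundError if 貼吧熱榜Excel.xlsx does not exist yet, so the workbook needs to be created once beforehand. A one-off setup sketch; the header names here are my own illustrative choice, not prescribed by the original:

import openpyxl

# One-time setup: create the workbook that store_data() expects.
wb = openpyxl.Workbook()
sheet = wb.active
sheet.title = 'Sheet1'
# Header row (illustrative column names).
sheet.append(['topic', 'link', 'discussions (10k)', 'description', 'image url', 'scraped at'])
wb.save('貼吧熱榜Excel.xlsx')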
2.2 Screenshot of the result
3. Writing the data to MySQL
3.1 Creating the database
import pymysql

db = pymysql.connect(host='localhost', user='root', password='', port=3306)  # replace with your own password
cursor = db.cursor()
sql = "CREATE DATABASE IF NOT EXISTS test_db"
cursor.execute(sql)
db.close()
3.1.1 Screenshot of the result
Open Navicat and you will see the database that was just created.
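If Navicat is not at hand, the same check can be done from Python. A small sketch reusing the connection settings above:

import pymysql

db = pymysql.connect(host='localhost', user='root', password='', port=3306)  # replace with your own password
cursor = db.cursor()
cursor.execute("SHOW DATABASES")
# test_db should appear among the printed names.
for (name,) in cursor.fetchall():
    print(name)
db.close()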
3.2 Creating the table
The MySQL statement below creates a table that will automatically fill in each record's creation time and update time as records are added later. An id column is the usual choice of primary key, but here I experiment with using title as the primary key.
import pymysql

db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='test_db')  # replace with your own password
cursor = db.cursor()
sql = """
CREATE TABLE IF NOT EXISTS hot_lists (
    title VARCHAR(255) NOT NULL,
    link VARCHAR(255) NOT NULL,
    value_thousand FLOAT(6,1) NOT NULL,
    content VARCHAR(10000) NOT NULL,
    create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    update_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    PRIMARY KEY (title)
)
"""
cursor.execute(sql)
db.close()
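To watch the automatic timestamps in action, here is a quick sketch that inserts and then touches a made-up row (the demo values are hypothetical):

import pymysql
import time

db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='test_db')  # replace with your own password
cursor = db.cursor()
# Insert a dummy row; MySQL fills in create_time and update_time by itself.
cursor.execute('INSERT INTO hot_lists (title, link, value_thousand, content) VALUES (%s, %s, %s, %s)',
               ('demo title', 'https://example.com', 1.0, 'demo'))
db.commit()
time.sleep(2)
# Modify the row; only update_time advances, create_time stays put.
cursor.execute('UPDATE hot_lists SET content = %s WHERE title = %s', ('edited', 'demo title'))
db.commit()
cursor.execute('SELECT create_time, update_time FROM hot_lists WHERE title = %s', ('demo title',))
print(cursor.fetchone())  # update_time should be about two seconds later than create_time
db.close()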
3.2.1 Screenshot of the result
3.3 Full test code
import requests
from bs4 import BeautifulSoup
import pymysql
from requests.exceptions import RequestException

def get_page(url):
    headers = {
        'cookie': '',  # replace with your own cookie
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    try:
        html = requests.get(url, headers=headers, timeout=5)
        return html.text
    except RequestException:
        print('Request failed')

def parse_page(html):
    html = BeautifulSoup(html, "html.parser")
    titles = html.find_all("h2", {'class': 'HotItem-title'})[:10]
    links = html.find_all('div', {"class": "HotItem-content"})[:10]
    hot_values = html.find_all('div', {"class": "HotItem-content"})[:10]
    texts = html.find_all('div', {"class": "HotItem-content"})[:10]
    return titles, links, hot_values, texts

def store_data(titles, links, hot_values, texts):
    con = pymysql.connect(host='localhost', user='root', password='120888', port=3306, db='test_db')  # replace with your own password
    cur = con.cursor()
    sql = 'INSERT INTO hot_lists (title, link, value_thousand, content) VALUES (%s, %s, %s, %s)'
    for title, link, hot_value, text in zip(titles, links, hot_values, texts):
        # The hot value reads like '1234 萬熱度'; '萬' is 10,000, so *10 converts it to thousands.
        value_thousand = float(hot_value.div.get_text().replace('\u200b', ' ').split()[0]) * 10
        content = 'None' if text.p is None else text.p.get_text()
        try:
            cur.execute(sql, (title.get_text(), link.a.attrs['href'], value_thousand, content))
            con.commit()
        except Exception:
            print('Failed')
            con.rollback()
    con.close()

def main():
    url = 'https://www.zhihu.com/hot'
    html = get_page(url)
    titles, links, hot_values, texts = parse_page(html)
    store_data(titles, links, hot_values, texts)

if __name__ == '__main__':
    main()
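Because title is the primary key, running the script again while the hot list is unchanged makes every INSERT hit a duplicate-key error and print 'Failed'. If you would rather refresh existing rows (and let update_time advance), one option is MySQL's INSERT ... ON DUPLICATE KEY UPDATE. A sketch of a drop-in replacement for the sql string in store_data():

# Alternative statement: update the existing row when the title is already present.
sql = ('INSERT INTO hot_lists (title, link, value_thousand, content) '
       'VALUES (%s, %s, %s, %s) '
       'ON DUPLICATE KEY UPDATE link = VALUES(link), '
       'value_thousand = VALUES(value_thousand), content = VALUES(content)')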
3.3.1 Screenshot of the result
4. Writing the data to MongoDB
MongoDB is simpler to work with than MySQL here: there is no need to create the database or table in advance, because MongoDB creates the database and collection lazily on the first insert.
4.1 Test code
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from pymongo import MongoClient

client = MongoClient('localhost')
db = client['test_db']

def get_page(url):
    headers = {
        'cookie': '',  # replace with your own cookie
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    try:
        html = requests.get(url, headers=headers, timeout=5)
        return html.text
    except RequestException:
        print('Request failed')

def parse_page(html):
    html = BeautifulSoup(html, "html.parser")
    titles = html.find_all("h2", {'class': 'HotItem-title'})[:10]
    links = html.find_all('div', {"class": "HotItem-content"})[:10]
    hot_values = html.find_all('div', {"class": "HotItem-content"})[:10]
    texts = html.find_all('div', {"class": "HotItem-content"})[:10]
    return titles, links, hot_values, texts

def store_data(titles, links, hot_values, texts):
    for title, link, hot_value, text in zip(titles, links, hot_values, texts):
        # Same unit conversion as the MySQL version: '萬' (10,000) -> thousands.
        value_thousand = float(hot_value.div.get_text().replace('\u200b', ' ').split()[0]) * 10
        content = 'None' if text.p is None else text.p.get_text()
        try:
            # insert_one replaces the insert() method that was removed in PyMongo 4
            db['hot_lists'].insert_one({"title": title.get_text(),
                                        "link": link.a.attrs['href'],
                                        "value_thousand": value_thousand,
                                        "content": content})
        except Exception:
            print('Failed')

def main():
    url = 'https://www.zhihu.com/hot'
    html = get_page(url)
    titles, links, hot_values, texts = parse_page(html)
    store_data(titles, links, hot_values, texts)

if __name__ == '__main__':
    main()
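To confirm the documents arrived, a short sketch querying the collection back:

from pymongo import MongoClient

client = MongoClient('localhost')
db = client['test_db']
# The hot_lists collection only exists once the first document has been inserted.
for doc in db['hot_lists'].find().limit(10):
    print(doc)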
4.2 Screenshot of the result