Web Scraping: Storing Data in Files and Databases (Zhihu Hot List and Tieba Hot Topics)


  Note: every cookie in the code below must be replaced with your own cookie.
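  If you prefer not to paste the cookie directly into the source file, one option (not from the original post) is to read it from an environment variable. The variable name ZHIHU_COOKIE below is only an assumption; use whatever name you like.

import os

# Minimal sketch: build the headers dict from an environment variable instead
# of hardcoding the cookie in the script. ZHIHU_COOKIE is an arbitrary name.
headers = {
    'cookie': os.environ.get('ZHIHU_COOKIE', ''),
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}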

1. Exporting data to a text file

1.1 Test code

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import time

headers = {
    'cookie': '',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}           # replace with your own cookie


def get_page(url):
    try:
        html = requests.get(url, headers=headers, timeout=5)
        if html.status_code == 200:
            print('Request succeeded')
            return html.text
        else:   # this else branch is optional
            return None
    except RequestException:
        print('Request failed')


def parse_page(html):
    html = BeautifulSoup(html, "html.parser")
    # The title sits in an <h2 class="HotItem-title">; the link, hot value and
    # excerpt are all read later from the same <div class="HotItem-content">.
    titles = html.find_all("h2", {'class': 'HotItem-title'})[:10]
    links = html.find_all('div', {"class": "HotItem-content"})[:10]
    hot_values = html.find_all('div', {"class": "HotItem-content"})[:10]
    texts = html.find_all('div', {"class": "HotItem-content"})[:10]
    return titles, links, hot_values, texts  # , title_links


def store_data(titles, links, hot_values, texts):
    with open('熱榜測試.txt', 'a', encoding='utf-8') as f:  # utf-8 avoids platform-default encoding errors
        f.write('+'*80 + '\n')
        f.write(time.asctime().center(80) + '\n')
        f.write('+'*80 + '\n'*2)

        index = 1
        for title, link, hot_value, text in zip(titles, links, hot_values, texts):
            print(title.get_text(), '\n', link.a.attrs['href'], '\n', hot_value.div.get_text().replace('\u200b', ''))
            f.write(str(index) + ': ' + title.get_text() + '\n')
            f.write('   ' + link.a.attrs['href'] + '\n')
            f.write('   ' + hot_value.div.get_text().replace('\u200b', ' ') + '\n')  # '\u200b' is a zero-width (invisible) character
            if text.p is None:
                f.write('   ' + 'None: the page shows no article excerpt' + '\n')
            else:
                f.write('   ' + text.p.get_text() + '\n')
            f.write('\n')
            index += 1


if __name__ == '__main__':
    input_url = 'https://www.zhihu.com/hot'
    a, b, c, d = parse_page(get_page(input_url))
    store_data(a, b, c, d)

1.2 Result screenshot

2. Exporting data to Excel

2.1 Test code

# -*- coding: utf-8 -*-


import requests
from bs4 import BeautifulSoup
import openpyxl
from requests.exceptions import RequestException
import re
import datetime


def get_page(url):
    try:
        html = requests.get(url)
        if html.status_code == 200:
            # print(html.text)
            return html.text
        else:
            return None
    except RequestException:
        print('Request failed')


def parse_page(html):
    html = BeautifulSoup(html, 'html.parser')
    topic_items = html.find_all('div', {"class": "topic-name"})
    topic_values = html.find_all('span', {"class": "topic-num"})
    topic_statements = html.find_all('div', {"class": "topic-info"})
    topic_imgs = html.find_all('li', {"class": "topic-top-item"})
    return topic_items, topic_values, topic_statements, topic_imgs


def store_data(topic_items, topic_values, topic_statements, topic_imgs):
    regex = re.compile(r'\d+(\.\d+)?')
    wb = openpyxl.load_workbook('貼吧熱榜Excel.xlsx')   # the workbook must already exist; see the setup sketch below
    sheet = wb['Sheet1']
    sheet.freeze_panes = 'A2'
    for item, value, statement, img in zip(topic_items, topic_values, topic_statements, topic_imgs):
        print(item.a.get_text(), '\n', item.a['href'], '\n', float(regex.search(value.get_text()).group()), '')
        sheet.append([item.a.get_text(), item.a['href'], float(regex.search(value.get_text()).group()), statement.p.get_text(), img.img['src'], datetime.datetime.now()])
    wb.save('貼吧熱榜Excel.xlsx')


def main(url):
    html = get_page(url)
    topic_items, topic_values, topic_statements, topic_imgs = parse_page(html)
    store_data(topic_items, topic_values, topic_statements, topic_imgs)


if __name__ == '__main__':
    input_url = 'http://tieba.baidu.com/hottopic/browse/topicList?res_type=1&red_tag=i0626384809'
    main(input_url)
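Note that openpyxl.load_workbook() raises FileNotFoundError if 貼吧熱榜Excel.xlsx does not exist yet, so the workbook has to be created once before running the scraper. A minimal one-off setup sketch (not part of the original post; the header names are my own guesses, adjust them as you like):

import openpyxl

# One-off setup sketch: create the workbook that store_data() later opens.
wb = openpyxl.Workbook()
sheet = wb.active
sheet.title = 'Sheet1'   # store_data() looks up wb['Sheet1']
sheet.append(['topic', 'link', 'hot_value', 'summary', 'image_url', 'crawled_at'])
wb.save('貼吧熱榜Excel.xlsx')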

 

2.2 Result screenshot

 

 

3. Writing data to MySQL

3.1 Creating the database

import pymysql
db = pymysql.connect(host='localhost', user='root', password='', port=3306)  # replace with your own password
cursor = db.cursor()
sql = "CREATE DATABASE IF NOT EXISTS test_db"

cursor.execute(sql)
db.close()

3.1.1 Result screenshot

  Open Navicat and you can see the database that was just created.
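If you do not have Navicat installed, you can also check from Python. A minimal verification sketch (not from the original post):

import pymysql

# List all databases and confirm that test_db now exists.
db = pymysql.connect(host='localhost', user='root', password='', port=3306)  # replace with your own password
cursor = db.cursor()
cursor.execute("SHOW DATABASES")
print([row[0] for row in cursor.fetchall()])   # should include 'test_db'
db.close()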

 

 

3.2 Creating the table

  The SQL statement below makes MySQL fill in each record's creation time and update time automatically when a row is inserted or modified. An auto-increment id is the usual choice of primary key, but here I use title as the primary key instead.

import pymysql

db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='test_db')  # replace with your own password
cursor = db.cursor()
sql = """
CREATE TABLE IF NOT EXISTS hot_lists (
    title VARCHAR(255) NOT NULL,
    link VARCHAR(255) NOT NULL,
    value_thousand FLOAT(6,1) NOT NULL,
    content VARCHAR(10000) NOT NULL,
    create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    update_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    PRIMARY KEY (title)
)
"""
cursor.execute(sql)
db.close()
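Because title is the primary key, inserting the same title twice raises a duplicate-key error; the scraper in section 3.3 simply rolls that row back. If you would rather update the existing row, one option (not used in the original post) is MySQL's INSERT ... ON DUPLICATE KEY UPDATE. A self-contained sketch, assuming the hot_lists table above already exists:

import pymysql

# Upsert sketch: update the row when the title already exists.
# update_time is refreshed automatically thanks to ON UPDATE CURRENT_TIMESTAMP.
db = pymysql.connect(host='localhost', user='root', password='', port=3306, db='test_db')  # replace with your own password
cursor = db.cursor()
sql = """
INSERT INTO hot_lists (title, link, value_thousand, content)
VALUES (%s, %s, %s, %s)
ON DUPLICATE KEY UPDATE link = VALUES(link),
                        value_thousand = VALUES(value_thousand),
                        content = VALUES(content)
"""
cursor.execute(sql, ('example title', 'https://example.com', 123.4, 'example content'))
db.commit()
db.close()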

3.2.1 Result screenshot

 

 

3.3 Full scraper test

import requests
from bs4 import BeautifulSoup
import pymysql
from requests.exceptions import RequestException


def get_page(url):
    headers = {
        'cookie': '',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }   # replace with your own cookie
    try:
        html = requests.get(url, headers=headers, timeout=5)
        # print(html.text)
        # print(BeautifulSoup(html.text, "html.parser"))
        return html.text
    except RequestException:
        print('Request failed')


def parse_page(html):
    html = BeautifulSoup(html, "html.parser")
    titles = html.find_all("h2", {'class': 'HotItem-title'})[:10]
    links = html.find_all('div', {"class": "HotItem-content"})[:10]
    hot_values = html.find_all('div', {"class": "HotItem-content"})[:10]
    texts = html.find_all('div', {"class": "HotItem-content"})[:10]
    return titles, links, hot_values, texts  # , title_links


def store_data(titles, links, hot_values, texts):
    con = pymysql.connect(host='localhost', user='root', password='120888', port=3306, db='test_db')  # replace with your own password
    cur = con.cursor()
    sql = 'INSERT INTO hot_lists (title, link, value_thousand, content) VALUES (%s, %s, %s, %s)'
    for title, link, hot_value, text in zip(titles, links, hot_values, texts):
        try:
            if text.p is None:
                cur.execute(sql, (title.get_text(), link.a.attrs['href'], float(hot_value.div.get_text().replace('\u200b', ' ').split()[0])*10, 'None'))
                con.commit()
            else:
                cur.execute(sql, (title.get_text(), link.a.attrs['href'], float(hot_value.div.get_text().replace('\u200b', ' ').split()[0])*10, text.p.get_text()))
                con.commit()
        except Exception:   # e.g. duplicate title (title is the primary key)
            print('Failed')
            con.rollback()


def main():
    url = 'https://www.zhihu.com/hot'
    html = get_page(url)
    titles, links, hot_values, texts = parse_page(html)
    store_data(titles, links, hot_values, texts)


if __name__ == '__main__':
    main()
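To check the inserted rows without Navicat, a small query sketch (not part of the original post):

import pymysql

# Verification sketch: print the rows the scraper just inserted.
con = pymysql.connect(host='localhost', user='root', password='', port=3306, db='test_db')  # replace with your own password
cur = con.cursor()
cur.execute('SELECT title, value_thousand, create_time FROM hot_lists ORDER BY create_time DESC LIMIT 10')
for row in cur.fetchall():
    print(row)
con.close()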

3.3.1 Result screenshot

 

 

4. Writing data to MongoDB

  MongoDB is simpler to work with than MySQL: there is no need to create the database or collection in advance; they are created automatically on the first insert.

4.1 Test code

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from pymongo import MongoClient

client = MongoClient('localhost')
db = client['test_db']


def get_page(url):
    headers = {
        "cookie": '',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }   # replace with your own cookie
    try:
        html = requests.get(url, headers=headers, timeout=5)
        # print(html.text)
        # print(BeautifulSoup(html.text, "html.parser"))
        return html.text
    except RequestException:
        print('Request failed')


def parse_page(html):
    html = BeautifulSoup(html, "html.parser")
    titles = html.find_all("h2", {'class': 'HotItem-title'})[:10]
    links = html.find_all('div', {"class": "HotItem-content"})[:10]
    hot_values = html.find_all('div', {"class": "HotItem-content"})[:10]
    texts = html.find_all('div', {"class": "HotItem-content"})[:10]
    return titles, links, hot_values, texts  # , title_links


def store_data(titles, links, hot_values, texts):
    for title, link, hot_value, text in zip(titles, links, hot_values, texts):
        try:
            if text.p is None:
                db['hot_lists'].insert_one({"title": title.get_text(), "link": link.a.attrs['href'], "value_thousand": float(hot_value.div.get_text().replace('\u200b', ' ').split()[0]) * 10, "content": 'None'})
            else:
                db['hot_lists'].insert_one({"title": title.get_text(), "link": link.a.attrs['href'], "value_thousand": float(hot_value.div.get_text().replace('\u200b', ' ').split()[0]) * 10, "content": text.p.get_text()})
        except Exception:   # e.g. parse error on an unexpected item
            print('Failed')


def main():
    url = 'https://www.zhihu.com/hot'
    html = get_page(url)
    titles, links, hot_values, texts = parse_page(html)
    store_data(titles, links, hot_values, texts)


if __name__ == '__main__':
    main()
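As with MySQL, you can verify the inserts directly from Python instead of a GUI client. A minimal sketch (not from the original post):

from pymongo import MongoClient

# Verification sketch: print the documents stored in the hot_lists collection.
client = MongoClient('localhost')
for doc in client['test_db']['hot_lists'].find().limit(10):
    print(doc['title'], doc['value_thousand'])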

 

4.2 Result screenshot

 

