Scraping Douban book rankings with Python


I've recently been wanting to practice web scraping on Douban data, so this time the target is Douban book information.

Requirement: scrape the books listed under Douban's 小說 (fiction) tag and store the data in a CSV file or a database.

Approach: scrape the data from the web pages, save it to CSV, then read the CSV back and write the rows into the database. (Don't ask why I don't write to the database directly instead of taking a detour through CSV. o(╯□╰)o This project grew step by step: the CSV part was written first, and only later, when I wanted to compute statistics and draw charts, did storing the data in MySQL seem like the better option. So that's how it ended up.)
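One thing the script takes for granted: the novel table in the test_scrapy database has to exist before read_csv_to_mysql runs, since nothing in the code creates it. Below is a minimal sketch of how it could be set up with pymysql. The column names BookName, Pub, Score and CommentNum come from the insert statement further down; the column types are my own assumption.

# -*- coding: utf-8 -*-
import pymysql

# Assumed schema: column names match the INSERT used later in read_csv_to_mysql();
# the types are a guess and can be adjusted.
ddl = '''
CREATE TABLE IF NOT EXISTS novel (
  BookName   VARCHAR(255),
  Pub        VARCHAR(255),
  Score      DECIMAL(3, 1),
  CommentNum INT
) DEFAULT CHARSET=utf8
'''

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       passwd='123456', db='test_scrapy', charset='utf8')
try:
  with conn.cursor() as cur:
    cur.execute(ddl)
  conn.commit()
finally:
  conn.close()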

Let's just get straight to the code.

The charting functions are not finished yet; I'm posting this first and will update once they're done.

——————————————————————————————————————————————————

Update: charting added.

 

# -*- coding: utf-8 -*-
'''
Created on 2018-08-17

@author: zww
'''
import requests
import re
import random
import time
from lxml import etree
import pandas as pd
import matplotlib.pyplot as plt
import pymysql
from pymysql import charset
import csv
import codecs
# rating_list: book ratings, pl_list: number of reviews
title_list, pub_list, rating_list, pl_list = [], [], [], []


def scrapy_contents(currentPage):
  headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
  cookies = {
    'cookies': 'paste your own cookies here'}

  url = ''.join(['https://book.douban.com/tag/小說?start=',
    str((currentPage + 1) * 20), '&type=T'])
  res = requests.get(url, headers=headers, cookies=cookies)
  res.encoding = "utf-8"
  if (res.status_code == 200):
    print('\n第{}頁的數據爬取成功'.format(currentPage))
    print(url)
  else:
    print('\n o(╯□╰)o第{}頁的數據爬取失敗'.format(currentPage))
    print(url)
  x = etree.HTML(res.text)
  # Douban lists 20 books per page
  for i in range(1, 21):
    title = x.xpath(
      '//*[@id="subject_list"]/ul/li[{}]/div[2]/h2/a/@title'.format(i))
    pub = x.xpath(
      '//*[@id="subject_list"]/ul/li[{}]/div[2]/div[1]/text()'.format(i))
    rating = x.xpath(
      '//*[@id="subject_list"]/ul/li[{}]/div[2]/div[2]/span[2]/text()'.format(i))
    pl = x.xpath(
      '//*[@id="subject_list"]/ul/li[{}]/div[2]/div[2]/span[3]/text()'.format(i))
    # Some pages only have 19 entries, so guard against missing fields
    try:
      title_list.append(str(title[0]).strip())
      pub_list.append(str(pub[0]).strip())
      rating_list.append(str(rating[0]))
      # The scraped text looks like "(13376人評價)"; keep only the number
      num = re.findall(r"\d+", str(pl[0]))
      pl_list.append(num[0])
    except Exception as e:
      print('第%d條記錄獲取數據失敗' % i)
      print(e)
      continue


def draw_chart(name_list, num_list, title=u'評分比例'):

  plt.bar(range(len(num_list)), num_list,
    tick_label=name_list, facecolor='#ff9999', edgecolor='white')
  plt.title(title)
  plt.savefig(u'圖書評分的柱狀圖')  # save the figure
  plt.show()


def draw_pie(name_list, num_list, title=u'評分比例'):

  plt.title(title)
  # make the pie a perfect circle
  plt.axes(aspect='equal')

  # plt.pie returns (patches, l_text, p_text): l_text are the label texts outside
  # the pie, p_text are the percentage texts inside it
  patches, l_text, p_text = plt.pie(
    num_list, labels=name_list, autopct='%1.1f%%',
    pctdistance=0.8, textprops={'fontsize': 6, 'color': 'k'}, radius=1)

  plt.savefig(u'圖書評分的餅圖')  # save the figure
  plt.show()


def save_file(filename):
  infos = {'書名': title_list, '出版信息': pub_list,
           '評分': rating_list, '評論人數': pl_list}
  data = pd.DataFrame(
    infos, columns=['書名', '出版信息', '評分', '評論人數'])

  data.to_csv(filename, index=False)


def insert(cur, sql, args):
  cur.execute(sql, args)


def get_conn(host, port, user, passwd, db):
  conn = pymysql.connect(
    host=host, port=port, user=user, passwd=passwd, db=db, charset='utf8')
  return conn


def query(cur, sql):
  cur.execute(sql)
  result = cur.fetchall()
  return result


def read_csv_to_mysql(filename):
  with codecs.open(filename=filename, mode='r', encoding='utf-8') as f:
    reader = csv.reader(f)
    head = next(reader)  # skip the CSV header row
    conn = get_conn(
      'localhost', 3306, 'root', '123456', 'test_scrapy')
    cur = conn.cursor()
    sql = '''insert into novel(BookName,Pub,Score,CommentNum) values(%s,%s,%s,%s)'''
    for item in reader:
      args = tuple(item)
      insert(cur, sql=sql, args=args)
      conn.commit()
    cur.close()
    conn.close()


def Drawing(name_list, num_list):
  plt.rcParams['figure.figsize'] = (12, 8)  # figure size 1200x800
  plt.rcParams['font.sans-serif'] = ['SimHei']  # these two lines keep Chinese labels from rendering as boxes
  plt.rcParams['font.family'] = 'sans-serif'
  draw_chart(name_list, num_list)
  # draw_pie(name_list, num_list)


def main(scrapyPage):
  for i in range(1, scrapyPage + 1):
    scrapy_contents(i)
    # random delay so the IP doesn't get banned
    time.sleep(round(random.uniform(1, 2), 2))
    now = time.strftime('%Y-%m-%d %H-%M-%S', time.localtime())
    filename = now + "豆瓣圖書評分信息.csv"
    save_file(filename)
  print('scrapy done!')
  # load the CSV into MySQL
  read_csv_to_mysql(filename)


if __name__ == '__main__':
  main(48)

  conn = get_conn(
    'localhost', 3306, 'root', '123456', 'test_scrapy')
  cur = conn.cursor(pymysql.cursors.DictCursor)  # returns rows as dicts
  cur_list = conn.cursor()  # returns rows as tuples
  sql = '''SELECT DISTINCT(Score) from novel ORDER BY Score desc'''
  Scores = query(cur_list, sql)
  Scores_num = {}
  # count how many books share each distinct score
  for i in Scores:
    sql = 'SELECT count(*) as c from novel where Score =%s' % i
    num = query(cur, sql)
    num_value = num[0]['c']
    num_key = str(i[0])
    Scores_num[num_key] = num_value
  name_list = list(Scores_num.keys())
  num_list = list(Scores_num.values())
  cur.close()
  cur_list.close()
  conn.close()
  Drawing(name_list, num_list)
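As an aside, the score counting in the __main__ block could also be done straight from the CSV with pandas instead of running one SQL query per distinct score. A rough sketch of that alternative, not part of the original script: the filename is a placeholder, and Drawing() is the helper defined above.

# -*- coding: utf-8 -*-
# Hypothetical alternative: aggregate the score counts with pandas from the CSV.
import pandas as pd

df = pd.read_csv('豆瓣圖書評分信息.csv')  # placeholder: use the timestamped file written by save_file()
counts = df['評分'].value_counts().sort_index(ascending=False)  # books per score, highest score first
name_list = [str(score) for score in counts.index]
num_list = counts.tolist()
Drawing(name_list, num_list)  # reuses the Drawing() helper from the script above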

