Python小爬蟲——抓取豆瓣電影Top250數據


python抓取豆瓣電影Top250數據

1.豆瓣地址:https://movie.douban.com/top250?start=25&filter=

2.主要流程是抓取該網址下的Top250的數據,存入本地的txt文件中,並將數據持久化寫入數據庫中

環境準備:

1.本地安裝mysql數據庫,具體下載以及安裝參照:https://blog.csdn.net/chic_data/article/details/72286329

2.安裝好數據庫后創建database和table,並創建字段

如:我安裝的版本是mysqlV8.0

-- Table that receives one row per movie inserted by the crawler's upload() step.
-- Column order after ID mirrors the field order of each scraped record.
CREATE TABLE doubanTop250(
    ID int PRIMARY KEY AUTO_INCREMENT,  -- surrogate key generated by MySQL
    rankey int,                         -- position of the movie in the ranking
    name varchar(50),                   -- movie title
    alias varchar(100),                 -- alternative title(s)
    director varchar(50),               -- director name
    showYear varchar(50),               -- release year, stored as text
    makeCountry varchar(50),            -- producing country / region
    movieType varchar(50),              -- genre(s)
    movieScore float,                   -- average rating
    scoreNum int,                       -- number of people who rated the movie
    shortFilm varchar(255)              -- short one-line review / quote
)ENGINE=InnoDB DEFAULT CHARSET=utf8;

最后我們直接來看代碼:

  

  1 from urllib import request
  2 import re
  3 import pymysql
  4 class MovieTop(object):
  5     def __init__(self):
  6         self.start = 0
  7         self.param = '&filter'
  8         self.headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64) "
  9                                        "AppleWebKit/537.36 (KHTML, like Gecko) "
 10                                        "Chrome/65.0.3325.146 Safari/537.36"}
 11         self.movieList = []
 12         self.filePath = './DoubanTop250.txt'
 13 
 14     def get_page(self):
 15         try:
 16             url = 'https://movie.douban.com/top250?start=' + str(self.start) + '&filter='
 17             myRequest = request.Request(url, headers=self.headers)
 18             response = request.urlopen(myRequest)
 19             page = response.read().decode('utf-8')
 20             print('正在獲取第' + str((self.start+25)//25) + '頁數據...')
 21             self.start += 25
 22             return page
 23         except request.URLError as e:
 24             if hasattr(e, 'reason'):
 25                 print('獲取失敗,失敗原因:', e.reason)
 26 
 27     def get_page_info(self):
 28         patern = re.compile(u'<div.*?class="item">.*?'
 29                             + u'<div.*?class="pic">.*?'
 30                             + u'<em.*?class="">(.*?)</em>.*?'
 31                             + u'<div.*?class="info">.*?'
 32                             + u'<span.*?class="title">(.*?)</span>.*?'
 33                             + u'<span.*?class="other">(.*?)</span>.*?'
 34                             + u'<div.*?class="bd">.*?'
 35                             + u'<p.*?class="">.*?'
 36                             + u'導演:\s(.*?)\s.*?<br>'
 37                             + u'(.*?)&nbsp;/&nbsp;'
 38                             + u'(.*?)&nbsp;/&nbsp;(.*?)</p>.*?'
 39                             + u'<div.*?class="star">.*?'
 40                             + u'<span.*?class="rating_num".*?property="v:average">'
 41                             + u'(.*?)</span>.*?'
 42                             + u'<span>(.*?)人評價</span>.*?'
 43                             + u'<span.*?class="inq">(.*?)</span>'
 44                             , re.S)
 45 
 46         while self.start <= 225:
 47             page = self.get_page()
 48             movies = re.findall(patern, page)
 49             for movie in movies:
 50                 self.movieList.append([movie[0],
 51                                        movie[1],
 52                                        movie[2].lstrip('&nbsp;/&nbsp;'),
 53                                        movie[3],
 54                                        movie[4].lstrip(),
 55                                        movie[5],
 56                                        movie[6].rstrip(),
 57                                        movie[7],
 58                                        movie[8],
 59                                        movie[9]])
 60 
 61     def write_page(self):
 62         print('開始寫入文件...')
 63         file = open(self.filePath, 'w', encoding='utf-8')
 64         try:
 65             for movie in self.movieList:
 66                 file.write('電影排名:' + movie[0] + '\n')
 67                 file.write('電影名稱:' + movie[1] + '\n')
 68                 file.write('電影別名:' + movie[2] + '\n')
 69                 file.write('導演:' + movie[3] + '\n')
 70                 file.write('上映年份:' + movie[4] + '\n')
 71                 file.write('制作國家/地區:' + movie[5] + '\n')
 72                 file.write('電影類別:' + movie[6] + '\n')
 73                 file.write('評分:' + movie[7] + '\n')
 74                 file.write('參評人數:' + movie[8] + '\n')
 75                 file.write('簡短影評:' + movie[9] + '\n')
 76                 file.write('\n')
 77             print('成功寫入文件...')
 78         except Exception as e:
 79             print(e)
 80         finally:
 81             file.close()
 82 
 83     def upload(self):
 84         db = pymysql.connect("localhost", "root", "root", "PythonTest", charset='utf8')
 85         cursor = db.cursor()
 86 
 87         insertStr = "INSERT INTO doubanTop250(rankey, name, alias, director," \
 88                     "showYear, makeCountry, movieType, movieScore, scoreNum, shortFilm)" \
 89                     "VALUES (%d, '%s', '%s', '%s', '%s', '%s', '%s', %f, %d, '%s')"
 90 
 91         try:
 92             for movie in self.movieList:
 93                 insertSQL = insertStr % (int(movie[0]), str(movie[1]), str(movie[2]), str(movie[3]),
 94                                          str(movie[4]), str(movie[5]), str(movie[6]), float(movie[7]),
 95                                          int(movie[8]), str(movie[9]))
 96                 cursor.execute(insertSQL)
 97             db.commit()
 98             print('成功上傳至數據庫...')
 99         except Exception as e:
100             print(e)
101             db.rollback()
102         finally:
103             db.close()
104 
if __name__ == '__main__':
    # Run the full pipeline: crawl all pages, dump to text, then load into MySQL.
    crawler = MovieTop()
    crawler.get_page_info()
    crawler.write_page()
    crawler.upload()

執行結果:

 

參照原文地址:https://www.cnblogs.com/AlvinZH/p/8576841.html#_label0

 

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM