How to get the data: on WeChat, search for and follow 【靠谱杨阅读人生】, then reply 【电影】.
Compiling this was not easy and the resource is paid; thank you for your support!
Code:
import time
import traceback

import requests
from bs4 import BeautifulSoup
import pymysql

def get1905():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    templist = []
    dataRes = []

    # "Hottest" list (sort order o3)
    # 1905.com has 99 pages with 24 movies per page; range(1, 100) covers pages 1-99
    for i in range(1, 100):
        url = 'https://www.1905.com/vod/list/n_1/o3p' + str(i) + '.html'
        print(url)
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'lxml')
        movie_all = soup.find_all('div', class_="grid-2x grid-3x-md grid-6x-sm")
        for single in movie_all:
            part_soup = BeautifulSoup(str(single), 'lxml')
            # Movie title
            name = part_soup.find('a')['title']
            templist.append(name)
            # Rating: cards without an <i> tag carry no rating
            try:
                score = part_soup.find('i').text
            except AttributeError:
                score = "1905暂无评分"   # "no rating on 1905 yet"
            templist.append(score)
            # Detail-page path
            path = part_soup.find('a', class_="pic-pack-outer")['href']
            templist.append(path)
            # State: every title in this list is free to watch
            state = "免费"   # "free"
            templist.append(state)
            print(templist)
            dataRes.append(templist)
            templist = []
        print(len(dataRes))
    # ---------------------------------------------
    # "Best rated" list (sort order o4), same parsing as above
    templist = []
    for i in range(1, 100):
        url = 'https://www.1905.com/vod/list/n_1/o4p' + str(i) + '.html'
        print(url)
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'lxml')
        movie_all = soup.find_all('div', class_="grid-2x grid-3x-md grid-6x-sm")
        for single in movie_all:
            part_soup = BeautifulSoup(str(single), 'lxml')
            name = part_soup.find('a')['title']
            templist.append(name)
            try:
                score = part_soup.find('i').text
            except AttributeError:
                score = "1905暂无评分"
            templist.append(score)
            path = part_soup.find('a', class_="pic-pack-outer")['href']
            templist.append(path)
            state = "免费"
            templist.append(state)
            print(templist)
            dataRes.append(templist)
            templist = []
        print(len(dataRes))

    # ---------------------------------------------
    # "Newest" list (sort order o1), same parsing as above
    templist = []
    for i in range(1, 100):
        url = 'https://www.1905.com/vod/list/n_1/o1p' + str(i) + '.html'
        print(url)
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'lxml')
        movie_all = soup.find_all('div', class_="grid-2x grid-3x-md grid-6x-sm")
        for single in movie_all:
            part_soup = BeautifulSoup(str(single), 'lxml')
            name = part_soup.find('a')['title']
            templist.append(name)
            try:
                score = part_soup.find('i').text
            except AttributeError:
                score = "1905暂无评分"
            templist.append(score)
            path = part_soup.find('a', class_="pic-pack-outer")['href']
            templist.append(path)
            state = "免费"
            templist.append(state)
            print(templist)
            dataRes.append(templist)
            templist = []
        print(len(dataRes))

    # De-duplicate: the three sort orders overlap heavily
    old_list = dataRes
    new_list = []
    for i in old_list:
        if i not in new_list:
            new_list.append(i)
    print(len(new_list))
    print("Total: " + str(len(new_list)))
    return new_list


def insert_1905():
    cursor = None
    conn = None
    try:
        movie_list = get1905()
        print(f"{time.asctime()} start inserting 1905 movie data")
        conn, cursor = get_conn()
        sql = "insert into movie1905 (id,name,score,path,state) values(%s,%s,%s,%s,%s)"
        for item in movie_list:
            print(item)
            # Catch key conflicts so duplicates do not abort the whole run
            try:
                cursor.execute(sql, [0, item[0], item[1], item[2], item[3]])
            except pymysql.err.IntegrityError:
                print("Duplicate, skipped!")
            conn.commit()  # commit the transaction (insert/update/delete)
        print(f"{time.asctime()} finished inserting 1905 movie data")
    except Exception:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
    return

# Open the database connection and create a cursor
def get_conn():
    """
    :return: connection, cursor
    """
    # Create the connection
    conn = pymysql.connect(host="127.0.0.1",
                           user="root",
                           password="000429",
                           db="movierankings",
                           charset="utf8")
    # Create the cursor (result sets come back as tuples by default)
    cursor = conn.cursor()
    if conn is not None and cursor is not None:
        print("Database connected and cursor created!")
    else:
        print("Database connection failed!")
    return conn, cursor


# Close the cursor and the connection
def close_conn(conn, cursor):
    if cursor:
        cursor.close()
    if conn:
        conn.close()
    return 1

if __name__ == '__main__':
    # get1905()
    insert_1905()
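Note that insert_1905() assumes a movie1905 table already exists in the movierankings database; the post does not show its definition. Below is a minimal sketch of a compatible schema. The column types, the auto-increment id, and the unique key on name (which is what would make the IntegrityError branch actually fire for duplicates) are assumptions, not the author's actual DDL.

# Minimal schema sketch for movie1905 -- the types, the AUTO_INCREMENT id and
# the unique key on name are assumptions, not the author's actual table definition.
import pymysql

CREATE_SQL = """
CREATE TABLE IF NOT EXISTS movie1905 (
    id    INT AUTO_INCREMENT PRIMARY KEY,
    name  VARCHAR(255) NOT NULL,
    score VARCHAR(32),
    path  VARCHAR(255),
    state VARCHAR(32),
    UNIQUE KEY uk_name (name)
) DEFAULT CHARSET = utf8;
"""

def create_movie1905_table():
    conn = pymysql.connect(host="127.0.0.1", user="root", password="000429",
                           db="movierankings", charset="utf8")
    try:
        with conn.cursor() as cursor:
            cursor.execute(CREATE_SQL)
        conn.commit()
    finally:
        conn.close()

if __name__ == '__main__':
    create_movie1905_table()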
Run screenshot:
Database
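If you cannot see the screenshot, a quick way to confirm what landed in the database is a short query script (again assuming the schema sketched above):

# Quick check of the inserted data: row count plus the first few records.
import pymysql

conn = pymysql.connect(host="127.0.0.1", user="root", password="000429",
                       db="movierankings", charset="utf8")
try:
    with conn.cursor() as cursor:
        cursor.execute("SELECT COUNT(*) FROM movie1905")
        print("rows:", cursor.fetchone()[0])
        cursor.execute("SELECT id, name, score, path, state FROM movie1905 LIMIT 5")
        for row in cursor.fetchall():
            print(row)
finally:
    conn.close()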