Scraping ECVA paper titles, authors, abstracts, keywords, and more with a Python crawler, and storing them in a MySQL database


Website screenshot: (screenshots of the ECVA papers page omitted)
Source code:

import re
import requests
import pymysql
from bs4 import BeautifulSoup
import lxml
import traceback
import time
import json
from lxml import etree

def query(sql, *args):
    """
    Generic query helper.
    :param sql:
    :param args:
    :return: the query result as a tuple of tuples: ((), (), ...)
    """
    conn, cursor = get_conn()
    cursor.execute(sql, args or None)
    res = cursor.fetchall()
    close_conn(conn, cursor)
    return res

def get_paper():
    # detail pages look like:
    # https://www.ecva.net/papers/eccv_2020/papers_ECCV/html/343_ECCV_2020_paper.php
    url = 'https://www.ecva.net/papers.php'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    # headers must be passed as a keyword argument; passed positionally,
    # requests would treat the dict as query-string params
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    page_text = response.text
    # print(page_text)  # dump the page HTML if needed
    soup = BeautifulSoup(page_text, 'lxml')
    all_dt = soup.find_all('dt', class_='ptitle')
    print("dt:" + str(len(all_dt)))
    temp_res = []   # scratch list for the current paper
    res = []        # final result set
    link_res = []   # download/pdfinfo links
    for dt in all_dt:
        single_soup = BeautifulSoup(str(dt), 'lxml')
        title = single_soup.find('a').text
        # store the title (the anchor text starts with two whitespace characters)
        temp_res.append(title[2:])
        # the abstract and keywords are scraped later by get_further()
        # store the source link
        sourcelink = single_soup.find('a')['href']
        sourcelink = "https://www.ecva.net/" + sourcelink
        temp_res.append(sourcelink)
        res.append(temp_res)
        temp_res = []
    # scrape authors and file links: the <dd> tags alternate between
    # an author block and a link block
    all_dd = soup.find_all('dd')
    print("dd:" + str(len(all_dd)))
    flag = 0
    temp_link = []
    author = []  # flat list of authors
    for item in all_dd:
        if flag % 2 == 0:
            # save the authors
            author.append(item)
        else:
            linksoup = BeautifulSoup(str(item), 'lxml')
            link_list = linksoup.find_all('a')
            for i in link_list:
                if i.get('href') is None:
                    temp_link.append("fakelink")  # placeholder for a missing href
                else:
                    # turn relative paper links into absolute ones
                    if "http" not in str(i.get('href')) and "papers" in str(i.get('href')):
                        temp_link.append("https://www.ecva.net/" + str(i.get('href')))
                    else:
                        temp_link.append(i.get('href'))
            print(temp_link)
            link_res.append(temp_link)
            temp_link = []
        flag = flag + 1
    print("------------------------------")
    # merge the authors and the download/pdfinfo links into res
    for i in range(0, len(author)):
        # add the author string, stripping the <dd> tags and newlines
        new_author = str(author[i]).replace("<dd>", "")
        new_author = new_author.replace(" </dd>", "")
        new_author = new_author.replace("\n", "")
        res[i].append(new_author)
        if len(link_res[i]) == 2:
            res[i].append(link_res[i][0])  # download link
            res[i].append(link_res[i][1])  # pdfinfo link
        else:
            res[i].append(link_res[i][0])  # download link
            res[i].append(link_res[i][2])  # pdfinfo link
    print("----------------------")
    return res
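Each element of the list returned by get_paper() is itself a five-field list, and every function below indexes into it by position. A minimal usage sketch (the unpacked variable names are mine, for illustration):

rows = get_paper()
# each row is [title, sourcelink, author, download, pdfinfo]
title, sourcelink, author, download, pdfinfo = rows[0]
print(title)
print(pdfinfo)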
#############################################################
# scrape the abstract and keywords for each paper
def get_further():
    res = get_paper()
    temp_res = []
    further_res = []
    sql = "SELECT pdfinfo FROM pdf;"
    db_res = query(sql)  # returns a tuple of tuples, so index with [0] below
    # request every link in the result set
    # (starting at 1358 resumes a previously interrupted run)
    for i in range(1358, len(db_res)):
        url = db_res[i][0]  # get the url
        print(url)
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/91.0.4472.101 Safari/537.36"
        }
        try:
            response = requests.get(url, headers=headers)
            response.encoding = "utf-8"
            page_text = response.text
            # print(page_text)
            soup = BeautifulSoup(page_text, 'lxml')

            abstract = soup.find('p', id='Par1').text
            # strip the \n characters
            abstract = abstract.replace("\n", "")
            print("Abstract: " + abstract)
            keyword = soup.find_all('span', class_="Keyword")
            # find_keyword = re.compile('<span class="Keyword">(.*?)</span>')
            keyword_str = ""
            for items in keyword:
                # collect the text of every keyword span
                keyword_str = keyword_str + items.get_text()
            print("Keywords: " + keyword_str)
            # the keywords are separated by \xa0; replace with commas
            keyword_str = keyword_str.replace("\xa0", ",")
            # drop the trailing comma
            keyword_str = keyword_str[0:-1]
            # finally record the abstract and keywords
            temp_res.append(abstract)
            temp_res.append(keyword_str)
            further_res.append(temp_res)
            print(temp_res)
            print("~~~~~~~~~~~~~~~~~~~~~~~~~~~")
            temp_res = []
        except:
            print("Invalid link!")
            try:
                # fallback path; note that further_res only grows on success,
                # so indexing it with i can be misaligned
                if len(further_res[i][0]) == 0:
                    res[i].append("no abstract")
                else:
                    res[i].append(further_res[i][0])
                if len(further_res[i][1]) == 0:
                    res[i].append("no keyword")
                else:
                    res[i].append(further_res[i][1])
                print(res[i])
                # insert into the database
                # insert_paper_1(res[i], i)
            except:
                print("IndexError: list index out of range")
    return
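The keyword cleanup above is easy to verify in isolation: judging from the replace-and-trim the author does, the concatenated span texts carry a \xa0 after each keyword, including the last one. A standalone sketch with a made-up sample string:

raw = "Object Detection\xa0Transformer\xa0"  # concatenated keyword spans (illustrative)
cleaned = raw.replace("\xa0", ",")[0:-1]
print(cleaned)  # -> Object Detection,Transformer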
# connect to the database and get a cursor
def get_conn():
    """
    :return: connection, cursor
    """
    # create the connection
    conn = pymysql.connect(host="127.0.0.1",
                           user="root",
                           password="000429",
                           db="paperinfo",
                           charset="utf8")
    # create the cursor (result sets are returned as tuples by default)
    cursor = conn.cursor()
    if conn is not None and cursor is not None:
        print("Database connected, cursor created!")
    else:
        print("Database connection failed!")
    return conn, cursor

# close the connection and cursor
def close_conn(conn, cursor):
    if cursor:
        cursor.close()
    if conn:
        conn.close()
    return 1
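The script assumes the paperinfo database and the paper and pdf tables already exist; the post never shows their definitions. Below is a hypothetical helper whose schema at least matches the column names used by the INSERT statements, with a guessed unique key on title to account for the IntegrityError handling (all types and key choices here are assumptions):

def create_tables():
    # hypothetical helper; the real CREATE TABLE statements are not in the post
    conn, cursor = get_conn()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS paper (
            title      VARCHAR(500) NOT NULL,
            sourcelink VARCHAR(500),
            author     TEXT,
            download   VARCHAR(500),
            abstract   TEXT,
            keyword    TEXT,
            UNIQUE KEY uk_title (title(255))  -- guessed: something must raise IntegrityError on duplicates
        ) DEFAULT CHARSET=utf8
    """)
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS pdf (
            id      INT,  -- insert_pdf() always writes 0 here, so it cannot be a primary key as written
            pdfinfo VARCHAR(500)
        ) DEFAULT CHARSET=utf8
    """)
    conn.commit()
    close_conn(conn, cursor)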
def insert_paper_0():
    conn, cursor = get_conn()
    res = get_paper()
    print(f"{time.asctime()} start inserting paper details")
    try:
        sql = "insert into paper (title,sourcelink,author,download,abstract,keyword) values(%s,%s," \
              "%s,%s,%s,%s)"
        for item in res:
            print(item)
            # catch the exception so a duplicate key does not abort the loop
            try:
                cursor.execute(sql, [item[0], item[1], item[2], item[3], "", ""])
            except pymysql.err.IntegrityError:
                print("Duplicate!")
        print("###########################")
        conn.commit()  # commit the transaction (needed for update/delete/insert)
        print(f"{time.asctime()} finished inserting paper details")
    except:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
    return

#########################################
def insert_paper_1(res, count):
    conn, cursor = get_conn()
    print(f"{time.asctime()} start inserting one paper record")
    try:
        sql = "insert into paper (title,sourcelink,author,download,abstract,keyword) values(%s,%s," \
              "%s,%s,%s,%s)"
        print(res)
        # catch the exception so a duplicate key does not abort the insert
        try:
            cursor.execute(sql, [res[0], res[1], res[2], res[3], res[5], res[6]])
        except pymysql.err.IntegrityError:
            print("Duplicate!")
        print("###########################")
        conn.commit()  # commit the transaction
        print(f"{time.asctime()} finished inserting paper record no. " + str(count + 1))
    except:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
    return
# insert the pdfinfo links separately (renamed from the original "inseet_pdf" typo)
def insert_pdf():
    conn, cursor = get_conn()
    res = get_paper()
    print(f"{time.asctime()} start inserting pdfinfo")
    try:
        sql = "insert into pdf (id,pdfinfo) values(%s,%s)"
        for item in res:
            print(item)
            # catch the exception so a duplicate key does not abort the loop
            try:
                cursor.execute(sql, [0, item[4]])
            except pymysql.err.IntegrityError:
                print("Duplicate!")
        print("###########################")
        conn.commit()  # commit the transaction
        print(f"{time.asctime()} finished inserting pdfinfo")
    except:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
    return

if __name__ == '__main__':
    get_further()
    # insert_pdf()
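A note on running the script: the functions depend on each other in a specific order, because get_further() reads its URLs from the pdf table. My reading of the intended pipeline (inferred from the code, not stated in the post):

# inferred run order for a fresh database:
# insert_paper_0()  # 1) titles, source links, authors; abstract/keyword left empty
# insert_pdf()      # 2) fill the pdf table that get_further() queries
# get_further()     # 3) scrape abstracts/keywords; re-enable the insert_paper_1() call inside it to persist them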