Scraping ECVA paper titles, authors, abstracts, keywords, and more with a Python crawler, and storing them in a MySQL database


Website screenshot: (screenshots of the ECVA papers page omitted)
Source code:

import re
import requests
import pymysql
from bs4 import BeautifulSoup
import lxml
import traceback
import time
import json
from lxml import etree

def query(sql, *args):
    """
    Generic query helper.
    :param sql:
    :param args:
    :return: the query result as a tuple of tuples: ((), (), ...)
    """
    conn, cursor = get_conn()
    cursor.execute(sql, args or None)
    res = cursor.fetchall()
    close_conn(conn, cursor)
    return res

def get_paper():
    # detail pages look like:
    # https://www.ecva.net/papers/eccv_2020/papers_ECCV/html/343_ECCV_2020_paper.php
    url = 'https://www.ecva.net/papers.php'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    # headers must be passed as a keyword argument; passed positionally,
    # requests would treat the dict as query-string params
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    page_text = response.text
    # print(page_text)  # dump the page HTML if needed
    soup = BeautifulSoup(page_text, 'lxml')
    all_dt = soup.find_all('dt', class_='ptitle')
    print("dt:" + str(len(all_dt)))
    temp_res = []   # scratch list for the current paper
    res = []        # final result set
    link_res = []   # download/pdfinfo links
    for dt in all_dt:
        single_soup = BeautifulSoup(str(dt), 'lxml')
        title = single_soup.find('a').text
        # store the title (the anchor text starts with two whitespace characters)
        temp_res.append(title[2:])
        # the abstract and keywords are scraped later by get_further()
        # store the source link
        sourcelink = single_soup.find('a')['href']
        sourcelink = "https://www.ecva.net/" + sourcelink
        temp_res.append(sourcelink)
        res.append(temp_res)
        temp_res = []
    # scrape authors and file links: the <dd> tags alternate between
    # an author block and a link block
    all_dd = soup.find_all('dd')
    print("dd:" + str(len(all_dd)))
    flag = 0
    temp_link = []
    author = []  # flat list of authors
    for item in all_dd:
        if flag % 2 == 0:
            # save the authors
            author.append(item)
        else:
            linksoup = BeautifulSoup(str(item), 'lxml')
            link_list = linksoup.find_all('a')
            for i in link_list:
                if i.get('href') is None:
                    temp_link.append("fakelink")  # placeholder for a missing href
                else:
                    # turn relative paper links into absolute ones
                    if "http" not in str(i.get('href')) and "papers" in str(i.get('href')):
                        temp_link.append("https://www.ecva.net/" + str(i.get('href')))
                    else:
                        temp_link.append(i.get('href'))
            print(temp_link)
            link_res.append(temp_link)
            temp_link = []
        flag = flag + 1
    print("------------------------------")
    # merge the authors and the download/pdfinfo links into res
    for i in range(0, len(author)):
        # add the author string, stripping the <dd> tags and newlines
        new_author = str(author[i]).replace("<dd>", "")
        new_author = new_author.replace(" </dd>", "")
        new_author = new_author.replace("\n", "")
        res[i].append(new_author)
        if len(link_res[i]) == 2:
            res[i].append(link_res[i][0])  # download link
            res[i].append(link_res[i][1])  # pdfinfo link
        else:
            res[i].append(link_res[i][0])  # download link
            res[i].append(link_res[i][2])  # pdfinfo link
    print("----------------------")
    return res
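Each element of the list returned by get_paper() is itself a five-field list, and every function below indexes into it by position. A minimal usage sketch (the unpacked variable names are mine, for illustration):

rows = get_paper()
# each row is [title, sourcelink, author, download, pdfinfo]
title, sourcelink, author, download, pdfinfo = rows[0]
print(title)
print(pdfinfo)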
#############################################################
# scrape the abstract and keywords for each paper
def get_further():
    res = get_paper()
    temp_res = []
    further_res = []
    sql = "SELECT pdfinfo FROM pdf;"
    db_res = query(sql)  # returns a tuple of tuples, so index with [0] below
    # request every link in the result set
    # (starting at 1358 resumes a previously interrupted run)
    for i in range(1358, len(db_res)):
        url = db_res[i][0]  # get the url
        print(url)
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/91.0.4472.101 Safari/537.36"
        }
        try:
            response = requests.get(url, headers=headers)
            response.encoding = "utf-8"
            page_text = response.text
            # print(page_text)
            soup = BeautifulSoup(page_text, 'lxml')

            abstract = soup.find('p', id='Par1').text
            # strip the \n characters
            abstract = abstract.replace("\n", "")
            print("Abstract: " + abstract)
            keyword = soup.find_all('span', class_="Keyword")
            # find_keyword = re.compile('<span class="Keyword">(.*?)</span>')
            keyword_str = ""
            for items in keyword:
                # collect the text of every keyword span
                keyword_str = keyword_str + items.get_text()
            print("Keywords: " + keyword_str)
            # the keywords are separated by \xa0; replace with commas
            keyword_str = keyword_str.replace("\xa0", ",")
            # drop the trailing comma
            keyword_str = keyword_str[0:-1]
            # finally record the abstract and keywords
            temp_res.append(abstract)
            temp_res.append(keyword_str)
            further_res.append(temp_res)
            print(temp_res)
            print("~~~~~~~~~~~~~~~~~~~~~~~~~~~")
            temp_res = []
        except:
            print("Invalid link!")
            try:
                # fallback path; note that further_res only grows on success,
                # so indexing it with i can be misaligned
                if len(further_res[i][0]) == 0:
                    res[i].append("no abstract")
                else:
                    res[i].append(further_res[i][0])
                if len(further_res[i][1]) == 0:
                    res[i].append("no keyword")
                else:
                    res[i].append(further_res[i][1])
                print(res[i])
                # insert into the database
                # insert_paper_1(res[i], i)
            except:
                print("IndexError: list index out of range")
    return
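The keyword cleanup above is easy to verify in isolation: judging from the replace-and-trim the author does, the concatenated span texts carry a \xa0 after each keyword, including the last one. A standalone sketch with a made-up sample string:

raw = "Object Detection\xa0Transformer\xa0"  # concatenated keyword spans (illustrative)
cleaned = raw.replace("\xa0", ",")[0:-1]
print(cleaned)  # -> Object Detection,Transformer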
# connect to the database and get a cursor
def get_conn():
    """
    :return: connection, cursor
    """
    # create the connection
    conn = pymysql.connect(host="127.0.0.1",
                           user="root",
                           password="000429",
                           db="paperinfo",
                           charset="utf8")
    # create the cursor (result sets are returned as tuples by default)
    cursor = conn.cursor()
    if conn is not None and cursor is not None:
        print("Database connected, cursor created!")
    else:
        print("Database connection failed!")
    return conn, cursor

# close the connection and cursor
def close_conn(conn, cursor):
    if cursor:
        cursor.close()
    if conn:
        conn.close()
    return 1
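The script assumes the paperinfo database and the paper and pdf tables already exist; the post never shows their definitions. Below is a hypothetical helper whose schema at least matches the column names used by the INSERT statements, with a guessed unique key on title to account for the IntegrityError handling (all types and key choices here are assumptions):

def create_tables():
    # hypothetical helper; the real CREATE TABLE statements are not in the post
    conn, cursor = get_conn()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS paper (
            title      VARCHAR(500) NOT NULL,
            sourcelink VARCHAR(500),
            author     TEXT,
            download   VARCHAR(500),
            abstract   TEXT,
            keyword    TEXT,
            UNIQUE KEY uk_title (title(255))  -- guessed: something must raise IntegrityError on duplicates
        ) DEFAULT CHARSET=utf8
    """)
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS pdf (
            id      INT,  -- insert_pdf() always writes 0 here, so it cannot be a primary key as written
            pdfinfo VARCHAR(500)
        ) DEFAULT CHARSET=utf8
    """)
    conn.commit()
    close_conn(conn, cursor)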
def insert_paper_0():
    conn, cursor = get_conn()
    res = get_paper()
    print(f"{time.asctime()} start inserting paper details")
    try:
        sql = "insert into paper (title,sourcelink,author,download,abstract,keyword) values(%s,%s," \
              "%s,%s,%s,%s)"
        for item in res:
            print(item)
            # catch the exception so a duplicate key does not abort the loop
            try:
                cursor.execute(sql, [item[0], item[1], item[2], item[3], "", ""])
            except pymysql.err.IntegrityError:
                print("Duplicate!")
        print("###########################")
        conn.commit()  # commit the transaction (needed for update/delete/insert)
        print(f"{time.asctime()} finished inserting paper details")
    except:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
    return

#########################################
def insert_paper_1(res, count):
    conn, cursor = get_conn()
    print(f"{time.asctime()} start inserting one paper record")
    try:
        sql = "insert into paper (title,sourcelink,author,download,abstract,keyword) values(%s,%s," \
              "%s,%s,%s,%s)"
        print(res)
        # catch the exception so a duplicate key does not abort the insert
        try:
            cursor.execute(sql, [res[0], res[1], res[2], res[3], res[5], res[6]])
        except pymysql.err.IntegrityError:
            print("Duplicate!")
        print("###########################")
        conn.commit()  # commit the transaction
        print(f"{time.asctime()} finished inserting paper record no. " + str(count + 1))
    except:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
    return
# insert the pdfinfo links separately (renamed from the original "inseet_pdf" typo)
def insert_pdf():
    conn, cursor = get_conn()
    res = get_paper()
    print(f"{time.asctime()} start inserting pdfinfo")
    try:
        sql = "insert into pdf (id,pdfinfo) values(%s,%s)"
        for item in res:
            print(item)
            # catch the exception so a duplicate key does not abort the loop
            try:
                cursor.execute(sql, [0, item[4]])
            except pymysql.err.IntegrityError:
                print("Duplicate!")
        print("###########################")
        conn.commit()  # commit the transaction
        print(f"{time.asctime()} finished inserting pdfinfo")
    except:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
    return

if __name__ == '__main__':
    get_further()
    # insert_pdf()
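A note on running the script: the functions depend on each other in a specific order, because get_further() reads its URLs from the pdf table. My reading of the intended pipeline (inferred from the code, not stated in the post):

# inferred run order for a fresh database:
# insert_paper_0()  # 1) titles, source links, authors; abstract/keyword left empty
# insert_pdf()      # 2) fill the pdf table that get_further() queries
# get_further()     # 3) scrape abstracts/keywords; re-enable the insert_paper_1() call inside it to persist them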