
import time

import requests
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

# Location of the local chromedriver executable used to automate Chrome.
_CHROMEDRIVER_PATH = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
# Class name of the element whose text is saved as the article content.
_ARTICLE_CLASS = 'text-3zQ3cZD4'
# Class name tried when the element above cannot be read.
_FALLBACK_CLASS = 'topic_column-5QvrwcWi'
# Seconds to wait after opening a page before reading it.
_PAGE_LOAD_WAIT = 4
# Seconds to wait after leaving a page before moving on to the next item.
_BETWEEN_PAGES_WAIT = 6
# Seconds to wait for the JSON listing before giving up.
_REQUEST_TIMEOUT = 10


def _read_content(driver, url, number):
    """Open *url* in the browser and return the text of its content element.

    The regular article container is tried first; if anything in that attempt
    fails with a WebDriver error, the fallback container is read from the
    page that is currently loaded. Returns an empty string when neither
    container can be read, so one odd page does not abort the whole run.

    *number* is the 1-based position of the item, used only in the progress
    messages.
    """
    try:
        driver.get(url)
        time.sleep(_PAGE_LOAD_WAIT)
        content = driver.find_element(By.CLASS_NAME, _ARTICLE_CLASS).text
        print(content)
        driver.back()
        print(f'第{number}条新闻结束')
        time.sleep(_BETWEEN_PAGES_WAIT)
        return content
    except WebDriverException:
        # Fall through to the alternative page layout below.
        pass

    try:
        content = driver.find_element(By.CLASS_NAME, _FALLBACK_CLASS).text
        print(content)
        driver.back()
        time.sleep(_BETWEEN_PAGES_WAIT)
        print(f'第{number}条新闻格式不同')
        print('#-----------------------某些格式不符合------------------------#')
        return content
    except WebDriverException:
        print(f'第{number}条新闻正文获取失败')
        return ''


def _append_record(path, record):
    """Append the values of *record*, concatenated on one line, to *path*."""
    # str() keeps a non-string field (e.g. a numeric timestamp) from
    # failing the join.
    line = ''.join(str(value) for value in record.values()) + '\n'
    try:
        with open(path, 'a', encoding='utf-8') as f:
            f.write(line)
    except OSError:
        print('写入失败')


def grasp(urlT, max_items=None, output_path='./news.txt'):
    """Fetch a JSON news listing and save each article's metadata and text.

    The listing at *urlT* is expected to be a JSON object whose ``'data'``
    entry is a list of items with ``'title'``, ``'source'``, ``'url'`` and,
    optionally, ``'newsTime'``. Each item's page is opened in Chrome through
    Selenium, its text is read, and one line per item is appended to
    *output_path*.

    Args:
        urlT: URL of the JSON listing.
        max_items: Maximum number of items to process; ``None`` processes
            every item in the listing.
        output_path: File the records are appended to.

    Returns:
        A list with one dict per processed item, with the keys ``'title'``,
        ``'newsTime'``, ``'source'``, ``'url'`` and ``'contend'``.

    Raises:
        requests.RequestException: If the listing cannot be downloaded.
        KeyError: If the listing or an item lacks a required key.
    """
    response = requests.get(urlT, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    # Parse the payload once instead of re-decoding it for every field.
    items = response.json()['data']
    if max_items is not None:
        items = items[:max_items]

    records = []
    # NOTE(review): passing the driver path positionally is the Selenium 3
    # calling convention; newer Selenium releases expect a Service object.
    driver = webdriver.Chrome(_CHROMEDRIVER_PATH)
    try:
        for number, item in enumerate(items, start=1):
            print(f'第{number}条新闻开始')
            news_time = item.get('newsTime', 'None')
            print(item['title'])
            print(news_time)
            print(item['source'])
            print(item['url'])

            # A fresh dict per item, so earlier records are not overwritten.
            record = {
                'title': item['title'],
                'newsTime': news_time,
                'source': item['source'],
                'url': item['url'],
            }
            record['contend'] = _read_content(driver, item['url'], number)
            records.append(record)
            # Written per item so finished records survive a later failure.
            _append_record(output_path, record)
    finally:
        # Always close the browser, even when an item raises.
        driver.quit()
    return records


# Runs at module level (no __main__ guard), exactly as the original script did.
url = "https://shankapi.ifeng.com/spring/finance/index/newInfoIndex/75219"
grasp(url)
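# Note: the author's computer is slow (presumably the reason for the long waits). To scrape pages that use a different layout, add the matching rule in the except branch.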
# Hope this helps everyone.