爬蟲數據存儲為json格式
data為你的數據
def save_data(data):
    """Persist crawled data to name.json as UTF-8 JSON.

    data: any JSON-serializable object (typically the scraped records).
    """
    with open('name.json', 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII (e.g. Chinese) text readable
        # in the output file instead of \uXXXX escapes.
        json.dump(data, f, ensure_ascii=False)
案例:
聲明:該案例僅用於學習,請勿用於商業或非法用途,造成的一切後果與本人無關!
import json
import requests
from fake_useragent import UserAgent
from lxml import etree
# Fetch a page and return it as a structured (parsed) document.
def get_html(url):
    """Download *url* with a random Chrome User-Agent and return an
    lxml HTML element tree ready for xpath queries."""
    ua_headers = {
        'User-Agent': UserAgent().chrome,
    }
    resp = requests.get(url=url, headers=ua_headers)
    # Force UTF-8 so Chinese text decodes correctly regardless of the
    # server-declared charset.
    resp.encoding = 'utf-8'
    return etree.HTML(resp.text)
# Extract the wanted fields from a parsed list page.
def get_detail(response):
    """Pull news items out of a parsed list page.

    response: lxml HTML tree as returned by get_html().
    Appends one dict per item ({'title', 'href', 'content'}) to the
    module-level data_list. Returns None.
    """
    detail_list = response.xpath('//div[@class="content-left mb16"]/div')
    print(detail_list)
    for da in detail_list:
        # Each sub-xpath returns a (possibly empty) list. Non-news divs
        # (ads, pagination, layout) lack these children; the original
        # bare [0] indexing raised IndexError there — skip them instead.
        hrefs = da.xpath('./a/@href')      # news link
        titles = da.xpath('./em/a/text()') # news title
        contents = da.xpath('./p/text()')  # news summary
        if not (hrefs and titles and contents):
            continue
        href, title, content = hrefs[0], titles[0], contents[0]
        print(href, title, content)
        items = {}
        items['title'] = title
        items['href'] = href
        items['content'] = content
        data_list.append(items)
        print(data_list)
        print('------------------------')
# Persist the collected records to detail.json.
def wirte_json(data):
    """Write *data* to detail.json as UTF-8 JSON.

    NOTE(review): the name is a typo for write_json; kept as-is because
    the __main__ block calls it under this name.
    """
    with open('detail.json', 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps the Chinese titles/summaries
        # human-readable in the file instead of \uXXXX escapes.
        json.dump(data, f, ensure_ascii=False)
# Main driver: crawl list pages 1..10 and extract their items.
def main():
    """Crawl the first 10 list pages of the news section."""
    for i in range(1, 11):
        # Bug fix: the original passed the constant 1, so all ten
        # iterations re-fetched page 1; use the loop variable so pages
        # 1 through 10 are actually visited.
        html_data = get_html(
            'http://www.szhk.com/news/newlist/news/28148690427226187/{}.htm'.format(i))
        get_detail(html_data)
if __name__ == '__main__':
    # Module-level accumulator that get_detail() appends into.
    data_list = []
    main()
    # Store the collected data in JSON format.
    wirte_json(data_list)