# Scrape product info from http://bj.58.com/pbdn/0/pn2/, excluding Zhuanzhuan and promoted items
# coding:utf-8
# 爬取58同城二手電腦信息
# 進入http://bj.58.com/pbdn/0/pn2/頁面
# 爬取列表中除轉轉、推廣商品外的正常商品
from bs4 import BeautifulSoup
import requests
import time
def get_links_from(who_sells, page=2):
    """Collect detail-page URLs of normal listings from a 58.com list page.

    Zhuanzhuan (resale platform) and promoted items also match the
    ``tr > td.t > a.t`` selector; normal listings are told apart by the
    length of their canonical URL (53 characters, observed empirically).

    :param who_sells: seller-category segment of the URL (0 = personal).
    :param page: list page number (default 2, matching the original script).
    :return: list of detail-page URLs with query strings stripped.
    """
    list_view = 'http://bj.58.com/pbdn/{}/pn{}/'.format(str(who_sells), page)
    wb_data = requests.get(list_view, timeout=10)  # don't hang forever on a dead host
    soup = BeautifulSoup(wb_data.text, 'lxml')
    urls = []
    # Page analysis shows product links live under tr > td.t > a.t.
    for link in soup.select('tr td.t a.t'):
        href = link.get('href').split('?')[0]
        # Normal (non-Zhuanzhuan, non-promoted) links are exactly 53 chars long.
        if len(href) == 53:
            urls.append(href)
    return urls
def get_views(url):
    """Fetch the view counter for one listing via 58.com's counter API.

    :param url: detail-page URL whose last path segment is '<info_id>x.shtml'.
    :return: view count as a string (text after the last '=' in the response).
    """
    tail = url.split('/')[-1]
    # The original used str.strip('x.shtml'), which strips any of the
    # characters {x, ., s, h, t, m, l} from both ends — not the literal
    # suffix. It only worked because the ids are all digits. Remove the
    # exact suffix instead.
    suffix = 'x.shtml'
    info_id = tail[:-len(suffix)] if tail.endswith(suffix) else tail
    api = 'http://jst1.58.com/counter?infoid={}'.format(info_id)
    js = requests.get(api, timeout=10)  # bounded wait for the counter service
    views = js.text.split('=')[-1]
    return views
def get_item_info(who_sells=0):
    """Scrape and print details for every normal listing found.

    :param who_sells: 0 for personal sellers, anything else for merchants.
    """
    urls = get_links_from(who_sells)
    for url in urls:
        time.sleep(2)  # throttle requests to be polite to the server
        web_data = requests.get(url, timeout=10)
        soup = BeautifulSoup(web_data.text, 'lxml')
        # Look each fragment up once and guard the [0] indexing: the
        # original checked find_all('span', 'c_25d') but then indexed
        # select('.c_25d'), which can raise IndexError when the two
        # queries disagree; price/date were not guarded at all.
        price_tags = soup.find_all('span', 'price c_f50')
        area_tags = soup.select('.c_25d')
        date_tags = soup.select('.time')
        data = {
            'title': soup.title.text,
            'price': price_tags[0].text if price_tags else None,
            'area': list(area_tags[0].stripped_strings) if area_tags else None,
            'date': date_tags[0].text if date_tags else None,
            'cate': '個人' if who_sells == 0 else '商家',
            'views': get_views(url),
        }
        print(data)
if __name__ == '__main__':
    # Run the scraper only when executed as a script, not when imported.
    get_item_info()