"""Scrape job listings from sou.zhaopin.com and save them to ``zhilian.txt``."""

import json
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup


class ZhiLianSpider:
    """Fetch a range of search-result pages and collect the job rows on them.

    Attributes:
        url: Base search URL; the encoded query string is appended to it.
        jl: Value sent as the ``jl`` query parameter (the work location
            entered by the user).
        kw: Value sent as the ``kw`` query parameter (the search keyword).
        start_page: First page number to fetch (inclusive).
        end_page: Last page number to fetch (inclusive).
        items: Accumulated job records, one dict per parsed table row.
    """

    url = "https://sou.zhaopin.com/?"

    def __init__(self, jl: str, kw: str, start_page: int, end_page: int) -> None:
        self.jl = jl
        self.kw = kw
        self.start_page = start_page
        self.end_page = end_page
        self.items = []  # every job record parsed so far, across all pages

    def parse_content(self, content: str) -> None:
        """Parse one HTML page and append its job records to ``self.items``.

        Args:
            content: Decoded HTML of a search-result page.

        Raises:
            IndexError: If a row lacks any of the expected elements.
        """
        soup = BeautifulSoup(content, 'html.parser')
        # The first table under #listContent is skipped (presumably the
        # header row -- confirm against the page markup).
        table_list = soup.select('#listContent > table')[1:]
        for table in table_list:
            zwmc = table.select('.zwmc > div > a')[0].text
            gsmc = table.select('.gsmc > a')[0].text
            zwyx = table.select('.zwyx')[0].text
            gzdd = table.select('.gzdd')[0].text
            gxsj = table.select('.gxsj > span')[0].text
            item = {
                '職位名稱': zwmc,
                '公司名稱': gsmc,
                '職位月薪': zwyx,
                '工作地點': gzdd,
                '更新時間': gxsj,
            }
            self.items.append(item)

    def run(self) -> None:
        """Fetch every page in the configured range and write the results.

        All collected records are serialized as one JSON array to
        ``zhilian.txt`` in the current working directory.
        """
        for page in range(self.start_page, self.end_page + 1):
            request = self.handler_request(page)
            # Use the response as a context manager so the connection is
            # released even if reading or decoding fails.
            with urllib.request.urlopen(request) as response:
                content = response.read().decode()
            self.parse_content(content)

        with open("zhilian.txt", "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps the Chinese text human-readable in the
            # file instead of emitting \uXXXX escapes.
            json.dump(self.items, f, ensure_ascii=False)

    def handler_request(self, page: int) -> urllib.request.Request:
        """Build the GET request for one result page.

        Args:
            page: Page number sent as the ``p`` query parameter.

        Returns:
            A ``urllib.request.Request`` carrying a browser User-Agent header.
        """
        data = {
            'jl': self.jl,
            'kw': self.kw,
            'p': page,
        }
        # The parameters may contain Chinese characters, so they must be
        # percent-encoded before being appended to the URL.
        get_url = self.url + urllib.parse.urlencode(data)
        headers = {
            "user-agent": (
                "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/75.0.3770.100 Safari/537.36"
            ),
        }
        return urllib.request.Request(url=get_url, headers=headers)


def main() -> None:
    """Prompt for the search parameters on stdin and run the spider."""
    jl = input("請輸入工作地點:")
    kw = input("請輸入工作關鍵詞:")
    start_page = int(input("請輸入查詢起始頁面:"))
    end_page = int(input("查詢結束頁面:"))

    # Create the spider and start crawling.
    spider = ZhiLianSpider(jl, kw, start_page, end_page)
    spider.run()


if __name__ == '__main__':
    main()