1. Basic scraping with requests

"""普通爬取數據爬蟲,只要有反爬,cookie就不是很穩定。並不利於數據爬取""" import requests import re from lxml import etree headers = { "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36", "Referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=", # Cookie需要時常換取 "Cookie": "_ga=GA1.2.1553999204.1538311958; user_trace_token=20180930205504-0cebb367-c4b0-11e8-bb68-5254005c3644; " "LGUID=20180930205504-0cebbcd1-c4b0-11e8-bb68-5254005c3644; showExpriedIndex=1; showExpriedCompanyHome=1; " "showExpriedMyPublish=1; index_location_city=%E6%B7%B1%E5%9C%B3; sensorsdata2015jssdkcross=%7B%22distinct_" "id%22%3A%22166811f974d15e-026ab47692a8d1-181c7151-2073600-166811f974e549%22%2C%22%24device_id%22%3A%2216681" "1f974d15e-026ab47692a8d1-181c7151-2073600-166811f974e549%22%7D; LG_LOGIN_USER_ID=1d0d39f3227c1f914a3f9c4d95f" "4816a5c6667141cc1313edac4603b4bd6d789; hasDeliver=6; _gid=GA1.2.2026255269.1540465512; WEBTJ-ID=2018102519420" "2-166ab0808f9cb-06806b898438ec-181c7151-2073600-166ab0808fb7ef; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=15401" "21809,1540210602,1540465512,1540467723; LGSID=20181025194505-6ab63d2a-d84b-11e8-8168-5254005c3644; PRE_UTM=m_cf_" "cpt_baidu_pc; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fs%3Fie%3Dutf-8%26f%3D8%26rsv_bp%3D0%2" "6rsv_idx%3D1%26tn%3Dbaidu%26wd%3D%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591%26rsv_pq%3D8d0dc05a0000aada%26rsv_t%" "3D4664T41fswButqvfw6ZM6FGWfkWjtwR%252Fmpsskb6hctTVnUHewMo9o1%252BqRGk%26rqlang%3Dcn%26rsv_enter%3D1%26rsv_sug3%3D7%26r" "sv_sug1%3D8%26rsv_sug7%3D100; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_baidu_pc;" " _putrc=1D33894D7A6BEB76123F89F2B170EADC; JSESSIONID=ABAAABAAAGFABEF9CEC8B1F38F5075A286961D31667AC5C; login=true; unick=%E6%9D%A" "8%E7%A6%B9; gate_login_token=b0629019d50bbe97eb829d61be9770ad4b570c1e68e239c68ae16cc71c68c808; _gat=1; Hm_lpvt_4233e74dff0ae5bd0a" "3d81c6ccf756e6=1540469398; LGRID=20181025201301-5183464a-d84f-11e8-a347-525400f775ce; TG-TRACK-CODE=index_search; SEARCH_ID=06714" "3e245964eb7af08d8c8d316cd44" } def request_list_page(): # 獲取詳情頁面url url = "https://www.lagou.com/jobs/positionAjax.json?city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false" data = { "first": "false", "pn": 1, "kd": "Python" } for x in range(1, 31): data["pn"] = x rep = requests.post(url=url, headers=headers, data=data) # json方法,如果返回來的是json數據,自動轉換為字典 result = rep.json() # json在線解析分析數據格式 positions = result["content"]["positionResult"]["result"] for position in positions: positionId = position["positionId"] # 找到詳情頁url,並傳遞給解析函數 position_url = "https://www.lagou.com/jobs/{}.html".format(positionId) parse_position_detail(position_url) break break def parse_position_detail(url): # 頁面解析 res = requests.get(url, headers=headers) text = res.text html = etree.HTML(text) position_depart = html.xpath("//div[@class='company']/text()") position_names = html.xpath("//span[@class='name']/text()") job_requests = html.xpath("//dd[@class='job_request']//span/text()") salary_span = re.sub(r"[\s/]", "", job_requests[0].strip()) addr_span = re.sub(r"[\s/]", "", job_requests[1].strip()) exper_span = re.sub(r"[\s/]", "", job_requests[2].strip()) education_span = re.sub(r"[\s/]", "", job_requests[3].strip()) full_span = re.sub(r"[\s/]", "", job_requests[4].strip()) desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip() print(position_depart, position_names, salary_span, addr_span, exper_span, education_span, full_span, desc) if __name__ 
== '__main__': request_list_page()
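The hard-coded Cookie above is the weak point of this approach. A minimal sketch of one common workaround, not part of the original post and dependent on the site's current defenses still behaving as they did at the time: let a requests.Session pick up fresh cookies from the list page before calling the Ajax endpoint, so no Cookie header needs to be pasted in by hand.

import requests

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36",
    "Referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
}


def fetch_page(pn):
    """Fetch one page of results with cookies obtained fresh from the list page."""
    session = requests.Session()
    # Visiting the list page first lets the session capture the current
    # anti-scraping cookies, replacing the hard-coded Cookie header above.
    session.get(HEADERS["Referer"], headers=HEADERS, timeout=10)
    data = {"first": "false", "pn": pn, "kd": "Python"}
    rep = session.post(
        "https://www.lagou.com/jobs/positionAjax.json"
        "?city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false",
        headers=HEADERS, data=data, timeout=10)
    return rep.json()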
2. Fetching the data with Selenium

"""selenium自動化爬取,模擬人的動作,操作瀏覽器訪問""" from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from lxml import etree import random import csv import time import re class LagouSpider(object): # 獲取chromedriver.exe路徑 driver_path = r"E:\Program Files\chromedriver.exe" def __init__(self): # 實例化這個路徑 self.driver = webdriver.Chrome(executable_path=LagouSpider.driver_path) self.url = "https://www.lagou.com/jobs/list_python?city=全國&cl=false&fromSearch=true&labelWords=&suginput=" def run(self): # 頁面列表僅限一次,取出while循環 self.driver.get(self.url) while True: # 獲取頁面源碼 source = self.driver.page_source # 有異步加載,數據出來的慢,需要等待 WebDriverWait(driver=self.driver, timeout=10).until( EC.presence_of_element_located((By.XPATH, "//div[@class='pager_container']/span[last()]")) ) # 將完整的源碼傳遞給函數解析 self.parse_list_page(source) try: # 獲取下一頁點擊標簽 next_btn = self.driver.find_element_by_xpath("//div[@class='pager_container']/span[last()]") # 是最后一頁,停止循環 if "pager_next_disabled" in next_btn.get_attribute("class"): break else: # 繼續循環 next_btn.click() except: print(source) time.sleep(1) def parse_list_page(self, source): # 獲取詳情url html = etree.HTML(source) links = html.xpath("//a[@class='position_link']/@href") for link in links: # 將獲取的url傳遞給函數 self.request_detail_page(link) time.sleep(random.randint(0, 2)) def request_detail_page(self, url): # 打開新的頁面窗口 self.driver.execute_script("window.open('%s')" % url) # 移動到新的頁面窗口 self.driver.switch_to.window(self.driver.window_handles[1]) # 有異步加載,數據出來的慢,需要等待 WebDriverWait(self.driver, timeout=10).until( EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']/span[@class='name']")) ) source = self.driver.page_source # 將源碼傳遞給函數解析 self.parse_detail_page(source) # 關閉當前這個詳情頁 self.driver.close() # 繼續切換回職位列表頁 self.driver.switch_to.window(self.driver.window_handles[0]) def parse_detail_page(self, source): # 開始解析數據(不做介紹,前面有詳解) html = etree.HTML(source) position_name = html.xpath("//span[@class='name']/text()")[0] job_request_spans = html.xpath("//dd[@class='job_request']//span") salary = job_request_spans[0].xpath(".//text()")[0].strip() city = job_request_spans[1].xpath(".//text()")[0].strip() city = re.sub(r'[\s/]]', "", city) work_years = job_request_spans[2].xpath(".//text()")[0].strip() work_years = re.sub(r'[\s/]]', "", work_years) education = job_request_spans[3].xpath(".//text()")[0].strip() education = re.sub(r'[\s/]]', "", education) desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip() company_name = html.xpath("//h2[@class='fl']/text()")[0].strip() position = {(position_name, company_name, salary, city, work_years, education, desc)} with open("lagou.csv", "a+", encoding="utf-8", newline="") as fp: writer = csv.writer(fp) writer.writerows(position) if __name__ == '__main__': spider = LagouSpider() spider.run()
3. Scraping BOSS Zhipin (boss直聘) with Selenium

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree
import random
import time
import csv
import re


class BossSpider(object):
    # Path to chromedriver.exe
    driver_path = r"E:\Program Files\chromedriver.exe"

    def __init__(self):
        # Launch Chrome via the driver at that path
        self.driver = webdriver.Chrome(executable_path=BossSpider.driver_path)
        self.url = "https://www.zhipin.com/job_detail/?query=python&scity=100010000&industry=&position="

    def run(self):
        # Open the list page
        self.driver.get(self.url)
        while True:
            # Data loads asynchronously, so wait for the pager before reading the source
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='page']/a[last()]"))
            )
            # Grab the fully rendered source and hand it to the parser
            source = self.driver.page_source
            self.parse_list_page(source)
            try:
                # Locate the next-page link
                next_btn = self.driver.find_element_by_xpath("//div[@class='page']/a[last()]")
                # Last page reached: stop the loop
                if "next disabled" in next_btn.get_attribute("class"):
                    break
                else:
                    # Otherwise move on to the next page
                    next_btn.click()
            except:
                print(source)
            time.sleep(random.randint(1, 5))

    def parse_list_page(self, source):
        html = etree.HTML(source)
        # Extract the detail-page URLs
        links = html.xpath("//div[@class='info-primary']//a/@href")
        for link in links:
            url = "https://www.zhipin.com" + link
            # Hand each URL to the detail fetcher
            self.request_detail_page(url)
            time.sleep(random.randint(1, 5))

    def request_detail_page(self, url):
        # Open the detail page in a new tab
        self.driver.execute_script("window.open('%s')" % url)
        # Switch to the new tab
        self.driver.switch_to.window(self.driver.window_handles[1])
        # Data loads asynchronously, so wait for the job header to appear
        WebDriverWait(self.driver, timeout=10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='info-primary']//div[@class='name']"))
        )
        source = self.driver.page_source
        # Hand the rendered source to the parser
        self.parse_detail_page(source)
        # Close the detail tab
        self.driver.close()
        # Switch back to the list page
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        # Parse the fields
        html = etree.HTML(source)
        position_name = html.xpath("//div[@class='name']/h1/text()")[0].strip()
        company_name = html.xpath("//div[@class='info-company']//h3[@class='name']/a/text()")[0].strip()
        salary = html.xpath("//div[@class='name']/span[@class='badge']/text()")[0].strip()
        job_request_ps = html.xpath("//div[@class='info-primary']//p/text()")
        city = job_request_ps[0].strip()
        work_years = job_request_ps[1].strip()
        education = job_request_ps[2].strip()
        desc_tags = html.xpath("//div[@class='job-sec']/div[@class='text']")
        contents = ""
        for desc in desc_tags:
            tag_list = desc.xpath("./text()")
            for tag in tag_list:
                # Strip whitespace, backslashes, and non-breaking spaces
                text = re.sub(r"[\s\\\xa0]", "", tag)
                contents += text
        position = [(position_name, company_name, salary, city, work_years, education, contents)]
        # Append to the CSV in UTF-8 without extra blank lines
        with open("boss.csv", "a+", encoding="utf-8", newline="") as fp:
            writer = csv.writer(fp)
            writer.writerows(position)


if __name__ == '__main__':
    spider = BossSpider()
    spider.run()
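The bare except in run() only prints the page source and moves on, and detail pages can time out too. A minimal sketch of a retry helper with backoff; with_retries is a hypothetical name, not part of the original post, and a production version would also need to close any orphaned tab before retrying:

import time
from selenium.common.exceptions import TimeoutException


def with_retries(fn, url, attempts=3, base_delay=2):
    """Call fn(url); on TimeoutException, wait and retry up to `attempts` times."""
    for attempt in range(1, attempts + 1):
        try:
            return fn(url)
        except TimeoutException:
            if attempt == attempts:
                raise  # give up after the final attempt
            time.sleep(base_delay * attempt)  # linear backoff: 2s, 4s, ...

# Usage, assuming the BossSpider instance from above:
# with_retries(spider.request_detail_page, url)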
4. Preliminary 12306 ticket-grabbing implementation; the repository below includes the steps and documentation
https://github.com/yangyu57587720/12306GrabVotes