Python 3 example: fetching data with POST requests

The script below scrapes the team listing at tylaw.com.cn: it sends one POST request per page of the listing (incrementing the currentPage form field each time), extracts the detail-page hrefs with lxml/XPath, then requests each detail page and saves the name and email fields to a JSON file.


# coding=utf-8
import requests
from lxml import etree
import json

 

class TianYuan:

  def __init__(self):
    self.url = "http://www.tylaw.com.cn/CN/Team.aspx"
    self.headers = {
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
    }

  # First send POST requests to collect the hrefs and build the detail URLs,
  # then request each detail page's HTML.

  # The POST form data only changes in its currentPage value, and the next-page URL
  # cannot be obtained from anywhere else, so we increment currentPage in a loop and
  # send one request per value, using the returned hrefs to build the detail URLs.
  def get_href(self):
    """Send one POST request per listing page and collect the detail-page hrefs."""
    # Define an empty list to hold the href lists from every page
    href_lists = []
    tempdata = {
      "__VIEWSTATEGENERATOR": "CB7A4B54",
      "Lan": "CN",
      "MenuID": "00000000000000000004",
      "currentPage": 1
    }
    for x in range(9):

      response = requests.post(self.url, data=tempdata, headers=self.headers)
      # After each request, increment currentPage by 1
      tempdata['currentPage'] += 1

      # Decode the response body and parse it into an HTML tree
      r = response.content.decode()
      h = etree.HTML(r)

      # Extract the hrefs from the listing page
      href_list = h.xpath('//h3/a/@href')
      href_lists.append(href_list)
    return href_lists

  def get_url_list(self, href_lists):
    url_list = []
    for href_list in href_lists:
      for i in href_list:
        url = "http://www.tylaw.com.cn/CN/{}".format(i)
        url_list.append(url) 
    return url_list

  def parse_url(self, url):
    response = requests.get(url, headers=self.headers)
    return etree.HTML(response.content.decode())

  def get_content_list(self, html):
    content_list = []
    item = {}
    # Extract the name and email text nodes from fixed positions on the detail page
    item["name"] = html.xpath('//*[@id="containerLawyer"]/div/div/div[2]/div[2]/div[1]/div[1]/div/div/text()[2]')[0].strip()
    item["email"] = html.xpath('//*[@id="containerLawyer"]/div/div/div[2]/div[2]/div[1]/div[7]/div/div/text()')[0].strip()
    # print(item)
    content_list.append(item)
    return content_list

  def save_content(self, content_list):
    # Append each item to tianyuan.json as a JSON object followed by a comma
    with open("tianyuan.json", "a") as f:
      for content in content_list:
        json.dump(content, f, ensure_ascii=False, indent=2)
        f.write(',\n')

  def run(self):
    """run() implements the main logic"""
    href_lists = self.get_href()
    url_list = self.get_url_list(href_lists)
    for url in url_list:
      html = self.parse_url(url)
      content_list = self.get_content_list(html)
      self.save_content(content_list)

if __name__ == '__main__':
  tianyuan = TianYuan()
  tianyuan.run()
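
Note that save_content appends each record as a JSON object followed by a comma, so tianyuan.json is not itself a valid JSON document. Below is a minimal sketch (not part of the original script) of reading the records back, assuming the file was produced by the code above and is UTF-8 encoded: strip the trailing comma and wrap the objects in a JSON array.

# Minimal sketch: load the records written by save_content above.
# Assumes tianyuan.json holds comma-separated JSON objects with a trailing comma.
import json

with open("tianyuan.json", encoding="utf-8") as f:
  raw = f.read().strip().rstrip(',')   # drop the trailing comma left by save_content

records = json.loads('[' + raw + ']')  # wrap the objects in a JSON array
for record in records:
  print(record.get("name"), record.get("email"))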

