Scraping a contact list (通讯录) with Python


from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv

# 1. Create the browser object
# Newer Chrome versions kept throwing errors until GPU acceleration was disabled
chrome_opt = webdriver.ChromeOptions()
chrome_opt.add_argument('--disable-gpu')
path = r"chromedriver.exe"
driver = webdriver.Chrome(executable_path=path, chrome_options=chrome_opt)
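# Note: executable_path= and chrome_options= are the Selenium 3 keywords and were
# removed in Selenium 4; on Selenium 4 the equivalent setup would be roughly
# (a sketch, not part of the original script):
#   from selenium.webdriver.chrome.service import Service
#   driver = webdriver.Chrome(service=Service(path), options=chrome_opt)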


# 2. Drive the browser to the contact-list page
driver.get('http://111111111/tx.aspx?fid=0')


# Pull the text of every cell on the current page into a flat list
def get_content():
    cells = []
    # rows 2-29 of the table, 9 columns per row in this page layout
    for i in range(2, 30):
        for s in range(1, 10):
            # build the XPath for each <td> cell
            xpath = f'//*[@id="form1"]/table/tbody/tr[{i}]/td[{s}]'
            text = driver.find_element_by_xpath(xpath).text
            cells.append(text)
    return cells
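# A note on speed: calling find_element once per cell means roughly 250 driver
# round trips per page. A minimal sketch that grabs every cell in one call
# (assuming the same table layout; get_content_fast is not part of the
# original script and, unlike get_content, also picks up the header row):
def get_content_fast():
    # one locator call returns every <td> in the table
    tds = driver.find_elements(By.XPATH, '//*[@id="form1"]/table/tbody/tr/td')
    return [td.text for td in tds]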

# Regroup the flat list into rows of 9 cells and append them to the CSV
def sort_writer(*cells):
    step = 9
    rows = [cells[i:i + step] for i in range(0, len(cells), step)]
    # append mode: opening with "w" here would overwrite the file on every page
    with open("./zhaopin.csv", "a", newline='') as f:
        writer = csv.writer(f)
        writer.writerows(rows)


# Loop over the pages, writing each one before clicking through to the next
for i in range(1, 400):
    try:
        a = get_content()
        sort_writer(*a)
        driver.find_element_by_link_text("下一页").click()
    except Exception:
        print("Error encountered, stopping")
        driver.quit()
        break
    finally:
        time.sleep(1)
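WebDriverWait and expected_conditions are imported at the top but never used, and the fixed time.sleep(1) only hopes that the next page has finished loading. A minimal sketch of an explicit wait that could replace the sleep (wait_for_table is a hypothetical helper, not part of the original script):

def wait_for_table(driver, timeout=10):
    # block until the first data row of the contact table is present,
    # or raise TimeoutException after `timeout` seconds
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located(
            (By.XPATH, '//*[@id="form1"]/table/tbody/tr[2]')
        )
    )

Calling wait_for_table(driver) right after the 下一页 click would make the loop pause only as long as the page actually needs.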

  

All of the contact-list content sits inside tbody > tr > td, so the same data can also be pulled without a browser, using requests and PyQuery:

from pyquery import PyQuery as pq
import requests
import csv

url="http://localhost:8080/index.htm"
res=requests.get(url).content
opq=pq(res)

# Collect the text of every matched <td> into a flat list
listconters = []
conters = opq("tbody").eq(1).find("tr").children()
for td in conters:
    w = td.text
    listconters.append(w)

# Regroup the flat list into chunks of 9, giving a list of lists like [[...], [...], ...]
step = 9
listconter = [listconters[i:i + step] for i in range(0, len(listconters), step)]
print(listconter)

# writerow writes a single row, writerows writes one row per inner list;
# newline="" avoids the extra blank line between rows on Windows
with open("./通讯录.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(listconter)
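The cells hold Chinese text, and open() without an encoding argument falls back to the platform default codec (often GBK on a Chinese Windows install), while Excel also tends to misread a plain UTF-8 CSV. A minimal sketch of the same write with an explicit encoding, assuming the file is meant to be opened in Excel:

with open("./通讯录.csv", "w", newline="", encoding="utf-8-sig") as f:
    writer = csv.writer(f)
    writer.writerows(listconter)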


