python爬虫之妹子图
懂的人都懂!
import urllib.request
import os
import re
import time
# 关于re模块使用的链接：https://www.cnblogs.com/shenjianping/p/11647473.html
def url_open(url):
    """Fetch *url* and return the raw response body.

    The request is sent with a desktop-browser ``User-Agent`` header
    instead of urllib's default one (presumably because the target site
    rejects the default agent -- confirm before removing).

    Args:
        url: Address to fetch; anything ``urllib.request.Request`` accepts.

    Returns:
        The complete response body as ``bytes``.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the
            request cannot be completed.
    """
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE"}
    req = urllib.request.Request(url, headers=header)
    # The context manager closes the response (and its underlying socket)
    # even when read() raises, so connections are not leaked across the
    # many requests this script makes.
    with urllib.request.urlopen(req) as response:
        return response.read()
def find_resource(url):
    """Return the image addresses found on the page at *url*.

    The page is downloaded with ``url_open``, decoded as UTF-8, and
    scanned for ``<img alt="..." src="..." />`` tags.

    Args:
        url: Address of the HTML page to scan.

    Returns:
        A list with the ``src`` attribute value of every matching tag, in
        document order. The values are returned exactly as they appear in
        the markup.
    """
    # Site-specific pattern: adjust it when targeting a different page layout.
    img_pattern = re.compile('''<img alt=".*?" src="(.*?)" />''')
    page_text = url_open(url).decode("utf-8")
    return img_pattern.findall(page_text)
def save_imgs(floder, img_addrs):
    """Download every image in *img_addrs* into the current directory.

    Each file is named after the last ``/``-separated segment of its
    address and is overwritten if it already exists.

    Args:
        floder: Name of the destination folder. It is not used here: files
            are written relative to the current working directory, which
            ``main`` sets with ``os.chdir`` before calling this function.
            The parameter is kept so existing callers keep working.
        img_addrs: Iterable of image addresses, either absolute URLs or
            paths relative to ``https://www.xiuaa.com/``.
    """
    from urllib.parse import urljoin

    site_root = "https://www.xiuaa.com/"
    for each in img_addrs:
        # Pause between requests (presumably to avoid hammering the server).
        time.sleep(0.3)
        filename = each.split('/')[-1]
        # urljoin leaves absolute URLs untouched and resolves site-relative,
        # protocol-relative and slash-less relative paths correctly, which
        # the previous "starts with h" check plus concatenation did not.
        img_url = urljoin(site_root, each)
        # Download first so a failed request does not leave an empty file.
        img = url_open(img_url)
        # 'wb' rather than 'ab': appending on a re-run would glue a second
        # copy onto the existing file and corrupt the image.
        with open(filename, 'wb') as f:
            f.write(img)
def main(floder='download'):
    """Crawl ten gallery pages and save their images under *floder*.

    The folder is created if it does not exist, and the process's working
    directory is changed to it, because ``save_imgs`` writes files relative
    to the current directory.

    Args:
        floder: Destination folder name or path. Defaults to ``'download'``.
    """
    # Create the folder up front; os.chdir alone raises FileNotFoundError
    # on a fresh run where the folder has not been made yet.
    os.makedirs(floder, exist_ok=True)
    os.chdir(floder)
    for i in range(0, 10):  # page index
        # Site-specific URL: adjust it when targeting a different gallery.
        url = "https://www.xiuaa.com/xgmn/4492_" + str(i) + ".html"
        img_addrs = find_resource(url)
        save_imgs(floder, img_addrs)
        print("第", i+1, "张..")
    print("爬取完毕!")
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
2020.8.12尝试了可以运行