爬蟲練習--爬妹子圖


import urllib.request
import urllib.parse
import re,os,time
'''
拼接url,發送請求,得到響應內容,分析響應內容,保存數據
'''
def get_request(new_url: str) -> urllib.request.Request:
    """Build a request for ``new_url`` that carries a browser User-Agent.

    Args:
        new_url: Absolute URL of the page to fetch.

    Returns:
        A ``urllib.request.Request`` for ``new_url`` with the User-Agent
        header set; no body is attached, so it is sent as a GET.
    """
    # Replace urllib's default agent string with a desktop Chrome one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
    }
    return urllib.request.Request(url=new_url, headers=headers)

def get_content(request, encoding='gbk'):
    """Send ``request`` and return the response body as text.

    Args:
        request: A URL string or ``urllib.request.Request`` accepted by
            ``urllib.request.urlopen``.
        encoding: Codec used to decode the response bytes. Defaults to
            ``'gbk'``, the value this script has always used.

    Returns:
        The decoded response body.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid in ``encoding``.
    """
    # The context manager closes the connection even when read/decode fails.
    with urllib.request.urlopen(request) as response:
        return response.read().decode(encoding)

def parse_content(content):
    """Extract image entries from a list page and hand them to ``down_load``.

    The pattern looks for blocks shaped like::

        <div class="pic">
          <a target="_blank" href="https://www.meizitu.com/a/5521.html">
            <img src="http://host/path/limg.jpg" alt="<b>title</b>">
          </a>
        </div>

    Args:
        content: Decoded HTML of one list page.

    Returns:
        A list of ``(src, alt)`` string tuples, one per matched block
        (empty when nothing matched).
    """
    # re.S lets ".*?" span the line breaks between <div>, <a> and <img>.
    patten = re.compile(r'<div class="pic">.*?<img src="(.*?)" alt="(.*?)".*?</div>', re.S)
    ret = patten.findall(content)
    print(ret)
    # Nothing matched means nothing to download; skip the downloader call.
    if ret:
        down_load(ret)
    return ret
#
def down_load(ret, dirname='mz', delay=2):
    """Download every image described in ``ret`` into ``dirname``.

    Args:
        ret: Sequence of tuples whose first item is the image URL and whose
            last item is the ``alt`` text used to name the file.
        dirname: Directory the images are written to; created if missing.
        delay: Seconds to sleep after each download.

    Raises:
        urllib.error.URLError: If an image cannot be fetched.
        OSError: If the directory or a file cannot be written.
    """
    if not ret:
        return
    os.makedirs(dirname, exist_ok=True)

    for tp in ret:
        # Image address; give protocol-relative "//host/..." URLs a scheme,
        # because urlretrieve rejects URLs without one.
        image_url = tp[0]
        if image_url.startswith('//'):
            image_url = 'https:' + image_url

        # Take the extension from the URL path only, so a query string or
        # fragment never ends up in the file name.
        url_path = urllib.parse.urlsplit(image_url).path
        stem, ext = os.path.splitext(os.path.basename(url_path))

        # Image name: drop markup such as <b>...</b> from the alt text, then
        # replace characters that are not allowed in file names.
        image_name = re.sub(r'<[^>]+>', '', tp[-1]).strip()
        image_name = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', image_name)
        if not image_name:
            # Empty alt text: fall back to the name used in the URL.
            image_name = stem
        # NOTE(review): '.jpg' is an assumed default for extension-less URLs.
        filename = image_name + (ext or '.jpg')
        filepath = os.path.join(dirname, filename)

        print('正在下載圖片%s。。。。'%filename)
        urllib.request.urlretrieve(image_url, filepath)
        print('結束下載圖片%s。。。。'%filename)
        time.sleep(delay)

def main():
    """Prompt for a page range and scrape every list page in it.

    Reads the first and last page numbers from stdin. For each page in that
    inclusive range it builds the list-page URL, fetches it, and passes the
    HTML to ``parse_content``, sleeping 2 seconds between pages.

    Raises:
        ValueError: If either page number entered is not an integer.
    """
    # Ask for the first and last page numbers.
    start_page = int(input("請輸入起始頁碼"))
    end_page = int(input("請輸入結束頁碼"))
    url = 'https://www.meizitu.com/a/'
    for page in range(start_page, end_page + 1):
        print("正在下載第%s頁....."%page)
        # Build the list-page URL, e.g. .../a/list_1_3.html for page 3.
        new_url = url + 'list_1_' + str(page) + '.html'
        request = get_request(new_url)
        content = get_content(request)
        parse_content(content)
        print("結束下載第%s頁....." % page)
        time.sleep(2)


if __name__ == '__main__':
    main()


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM