標題亂碼問題有兩種解決方式:
1. 在讀取 response.text 之前設定整個回應的編碼,例如 response.encoding = 'utf-8'
2. 只對出現亂碼的字串重新編碼再解碼:.encode('iso-8859-1').decode('gbk')
爬取美女壁紙縮略圖並解決標題亂碼問題
第 1 頁:http://pic.netbian.com/4kmeinv/
第 2 頁:http://pic.netbian.com/4kmeinv/index_2.html
import os
from urllib import request

import requests
from lxml import etree
# Crawl the thumbnail listing pages of pic.netbian.com/4kmeinv and save every
# thumbnail into ./meinvs, naming each file after the image's alt text.

# Ask the user which range of listing pages to crawl (both ends inclusive).
start_page = int(input('start page num:'))
end_page = int(input('end page num:'))

# Output directory for the downloaded thumbnails. exist_ok=True replaces the
# original exists()-then-mkdir() pair, which could race with another process.
os.makedirs('./meinvs', exist_ok=True)

# Request headers sent with every listing-page request. The original script
# passed `headers` to requests.get() without defining it (NameError).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/120.0 Safari/537.36'
    ),
}

# Generic URL template for page 2 and later (must not be modified).
url = 'http://pic.netbian.com/4kmeinv/index_%d.html'

for page in range(start_page, end_page + 1):
    # Page 1 is served from the section root and does not fit the template.
    if page == 1:
        new_url = 'http://pic.netbian.com/4kmeinv/'
    else:
        new_url = url % page

    response = requests.get(url=new_url, headers=headers)
    # Approach 1 (not used here): set response.encoding before reading .text
    # so that the whole page is decoded with the chosen codec.
    page_text = response.text

    # Extract each image's name (alt) and source path (src).
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')
    for li in li_list:
        alt_values = li.xpath('./a/img/@alt')
        src_values = li.xpath('./a/img/@src')
        # Skip list items without an image instead of crashing on [0].
        if not alt_values or not src_values:
            continue

        # Approach 2: repair only the garbled title. Re-encoding with
        # ISO-8859-1 and decoding as GBK undoes the wrong decoding
        # (presumably response.text was decoded as ISO-8859-1 -- TODO confirm).
        img_name = alt_values[0].encode('iso-8859-1').decode('gbk') + '.jpg'
        img_src = 'http://pic.netbian.com' + src_values[0]
        img_path = './meinvs/' + img_name
        request.urlretrieve(img_src, img_path)
        print(img_name, '下載成功!!!')