京東的robots協議,網址:
User-agent: * #任何爬蟲的來源都應該遵守如下協議 Disallow: /?* #不允許爬取以?開頭的路徑 Disallow: /pop/*.html #不允許訪問/pop/....html的頁面 Disallow: /pinpai/*.html?* #不允許訪問 /pinpai/....html?...
接下來的4個爬蟲禁止訪問京東的任何資源(被認為是惡意爬蟲) User-agent: EtaoSpider Disallow: / User-agent: HuihuiSpider Disallow: / User-agent: GwdangSpider Disallow: / User-agent: WochachaSpider Disallow: /
Robots協議的遵守方式
1.京東商品頁面的爬取
2.亞馬遜商品頁面的爬取【改變頭部User-Agent為瀏覽器(偽裝自己)】
3.百度360搜索關鍵詞提交
直接使用params
4.網絡圖片的爬取和存儲[這個只是一張圖片的]
附上自己按照小甲魚方法寫的爬取地理網圖片,無限刷!!!
直接可用!!!
import urllib.request import os def url_oprn(url): rq=urllib.request.Request(url) rq.add_header=("User-Agent","Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36") response=urllib.request.urlopen(rq) html=response.read() return html def find_img(page_url): imglist=[] html=url_oprn(page_url).decode('utf-8') a=html.find("img src=") while a!=-1: b=html.find(".jpg",a,a+255) if b!=-1: imglist.append(html[a+9:b+9]) else: b=a+9 a=html.find("img src=",b) return imglist def save(img_addrs): for each in img_addrs: filename=each.split("/")[-1].split("@")[0] #print(filename) try: with open(filename,'wb') as f: img=url_oprn(each) f.write(img) except: continue def download_img(path="G:\\PY\\imgs",page=5): ok=True a=0 while ok: a+=1 path=path+str(a) if not os.path.exists(path): ok=False os.mkdir(path) os.chdir(path) url="http://www.dili360.com/cng/pic/" page_num=2019 for i in range(page): page_url=url+str(page_num)+'.htm' img_addrs=find_img(page_url) save(img_addrs) page_num-=1 if __name__=="__main__": download_img()
5.IP地址歸屬地的自動查詢
# Look up the geolocation of an IP address by scraping the ip138.com result
# page with an XPath query.  Requires the third-party `requests` and `lxml`
# packages (already used by this file).
import requests
from lxml import etree

url = "http://www.ip138.com/ips138.asp?ip="
ip = input("請輸入查詢ip")
if not ip:
    # Fall back to a default address when the user enters nothing.
    ip = "192.168.1.22"
try:
    response = requests.get(url + ip)
    response.raise_for_status()  # turn HTTP 4xx/5xx into an exception
    # Let requests guess the real encoding from the body (the site does not
    # always declare it in the headers).
    response.encoding = response.apparent_encoding
    dom = etree.HTML(response.text)
    # The location text lives in <ul class="ul1"><li>...</li></ul> on the
    # result page — presumably; verify if the site layout changes.
    textBD = dom.xpath('//ul[@class="ul1"]/li/text()')
    print("查詢的ip地址為:" + ip)
    print(textBD)
except Exception as e:
    # BUG FIX: the original bare `except:` also swallowed KeyboardInterrupt
    # and hid the failure reason entirely; narrow it and report the error.
    print("出現異常")
    print(e)