Crawling Girl-Picture Galleries with Multithreaded Python


A while back I saw someone posting Python code to crawl girl-picture sites, so I decided to write a tutorial of my own. Most tutorials lean on third-party modules; here we crawl with the standard library only, and extend the crawler with image screening and deduplication. It has been tested against real sites and runs rock solid; I have already pulled down tens of thousands of images, limited only by how big your disk is.

The original site has since been scraped into the ground and shut down, so treat the code below as reference only.

On the front end each picture is wrapped in an img tag, e.g. <img src="https://mtl.gzhuibei.com/images/img/10431/5.jpg" alt=..., so a plain regular expression is enough to pull the URLs out.
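For instance, a minimal sketch of that extraction (the HTML fragment below is made up to match the structure described above):

import re

# a hypothetical page fragment containing one of the img tags described above
html = '<li><img src="https://mtl.gzhuibei.com/images/img/10431/5.jpg" alt="demo"></li>'

# capture every .jpg URL that appears inside an img src attribute
for src in re.findall(r'<img src="([^"]+\.jpg)"', html):
    print(src)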

First, generate the page links. The code is as follows:

# Take a URL template plus a range, build each page URL, and return them as a list
def SplicingPage(page,start,end):
    url = []
    for each in range(start,end):
        temporary = page.format(each)
        url.append(temporary)
    return url
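A quick usage example, using the paging scheme the full script relies on later:

pages = SplicingPage("https://www.meitulu.com/item/10431_{}.html", 2, 5)
print(pages)
# ['https://www.meitulu.com/item/10431_2.html', 'https://www.meitulu.com/item/10431_3.html', 'https://www.meitulu.com/item/10431_4.html']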

Next, fetch pages with the built-in urllib (GetUserAgent, which builds a random request header, is defined further down in the full script):

# Fetch the page source for a URL using only the standard library
def GetPageURL(page):
    head = GetUserAgent(page)
    req = request.Request(url=page,headers=head,method="GET")
    respon = request.urlopen(req,timeout=3)
    if respon.status == 200:
        html = respon.read().decode("utf-8")
        return html

Finally, regex-match the image URLs and download them. The code is simple enough to read on its own:

    page_list = SplicingPage(str(args.url),2,100)
    for item in page_list:
            respon = GetPageURL(str(item))
            subject = re.findall(r'<img src="([^"]+\.jpg)"',respon,re.S)
            for each in subject:
                img_name = each.split("/")[-1]
                img_type = each.split("/")[-1].split(".")[1]
                save_name = str(random.randint(1111111,99999999)) + "." + img_type
                print("[+] 原始名稱: {} 保存為: {} 路徑: {}".format(img_name,save_name,each))
                urllib.request.urlretrieve(each,save_name,None)

You can also extract the image URLs with third-party libraries:

import requests
from lxml import etree

# assuming the gallery page is fetched with requests; adjust the URL and the XPath to the site's actual markup
response = requests.get("https://www.meitulu.com/t/youhuo/")
html = etree.HTML(response.content.decode())
src_list = html.xpath('//ul[@id="pins"]/li/a/img/@data-original')
alt_list = html.xpath('//ul[@id="pins"]/li/a/img/@alt')
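A minimal follow-up that downloads whatever those two XPath queries returned (whether the image host accepts a bare request without a Referer is an assumption here):

import os

os.makedirs("meizitu", exist_ok=True)
for src, alt in zip(src_list, alt_list):
    img = requests.get(src, timeout=10)                         # fetch the image itself
    with open(os.path.join("meizitu", src.split("/")[-1]), "wb") as fp:
        fp.write(img.content)
    print("saved {} ({})".format(src.split("/")[-1], alt))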

Here are some User-Agent strings you can rotate through to get past basic anti-crawler checks; a sketch of picking one at random follows the list.

    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    "UCWEB7.0.2.37/28/999",
    "NOKIA5700/ UCWEB7.0.2.37/28/999",
    "Openwave/ UCWEB7.0.2.37/28/999",
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
    # iPhone 6:
	"Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25"

And that is what a run looks like. Alright class, pants back on, let's keep studying!

Now let's extend this with a bit of image screening: automatically flagging NSFW pictures in Python. The basic idea is to read every pixel of the image into memory, mark skin-coloured pixels as white and clothing as black, work out how many pixels the figure occupies, then compare the skin area against the clothed area; if the ratio exceeds a predefined threshold the image is flagged. That is the principle, and a full implementation needs proper algorithms behind it, but Python already has a library for it: pip install Pillow porndetective. A rough sketch of the skin-ratio idea is shown below, followed by the library's usage.
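Here is a minimal sketch of that skin-ratio idea, assuming a crude RGB skin test (the thresholds are purely illustrative and not what porndetective actually uses):

from PIL import Image

def naive_skin_ratio(path):
    # very rough illustration only: count pixels that pass a loose "skin tone" test
    img = Image.open(path).convert("RGB")
    pixels = list(img.getdata())
    skin = 0
    for r, g, b in pixels:
        # reddish, not too dark, red dominates blue
        if r > 95 and g > 40 and b > 20 and r > b and (r - min(g, b)) > 15:
            skin += 1
    return skin / len(pixels)

# print("skin ratio: {:.2%}".format(naive_skin_ratio("c://1.jpg")))

And the library usage: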

>>> from porndetective import PornDetective
>>> test=PornDetective("c://1.jpg")
>>> test.parse()
c://1.jpg JPEG 1600×2400: result=True message='Porn Pic!!'
<porndetective.PornDetective object at 0x0000021ACBA0EFD0>
>>>
>>> test=PornDetective("c://2.jpg")
>>> test.parse()
c://2.jpg JPEG 1620×2430: result=False message='Total skin percentage lower than 15 (12.51)'
<porndetective.PornDetective object at 0x0000021ACBA5F5E0>
>>> test.result
False

The detection results are shown above. The accuracy is not great, and the first picture is arguably not NSFW in any strict sense. Still, you can crawl an entire site and run every image through this library, keeping the hits and deleting the rest so only the "premium" material remains.

The algorithm this library uses has obvious problems: by its logic a belly-dance photo would be flagged too. Machine learning generally gives far better accuracy; a hard-coded heuristic like this is merely passable and cannot do any deeper judgement. Whether a picture is NSFW cannot be decided from exposed skin alone; pose, degree of exposure, type of clothing and so on all matter. It is good enough for our purposes, though. If you want to sift the better material out of a huge pile of images, you can write it like this:

from PIL import Image
import os
from porndetective import PornDetective

if __name__ == "__main__":
    img_dic = os.listdir("./meizitu/")
    
    for each in img_dic:
        img = Image.open("./meizitu/{}".format(each))
        width = img.size[0]  # width
        height = img.size[1] # height
        img = img.resize((int(width*0.3), int(height*0.3)), Image.ANTIALIAS) # use Image.LANCZOS on newer Pillow
        img.save("image.jpg")

        test = PornDetective("./image.jpg")
        test.parse()
        if test.result == True:
            print("{} 圖片大贊,自動為你保留.".format(each))
        else:
            print("----> {} 圖片正常,自動清除,節約空間,存着真的是浪費資源老鐵".format(each))
            os.remove("./meizitu/"+str(each))

Next, deduplication. This one took me a while; I had no idea at first, then it came to me. The principle: compute a CRC32 checksum for every image, compare the checksums, keep the checksum-to-filename mapping so duplicates can be located in the directory, then delete the extras and keep a single copy. Here is the proof-of-concept code:

import zlib,os

def Find_Repeat_File(file_path,file_type):
    Catalogue = os.listdir(file_path)
    CatalogueDict = {}  # lookup dict: file name -> CRC32, so duplicates can be located later
    for each in Catalogue:
        path = (file_path + each)
        if os.path.splitext(path)[1] == file_type:
            with open(path,"rb") as fp:
                crc32 = zlib.crc32(fp.read())
                # print("[*] 文件名: {} CRC32校驗: {}".format(path,str(crc32)))
                CatalogueDict[each] = str(crc32)
    CatalogueList = []
    for value in CatalogueDict.values():
    # pull every CRC32 value out of the dict into the list CatalogueList
        CatalogueList.append(value)

    CountDict = {}
    for each in CatalogueList:
    # count how many times each checksum appears and store the counts in CountDict
        CountDict[each] = CatalogueList.count(each)
        
    RepeatFileFeatures = []
    for key,value in CountDict.items():
    # walk the counts; any checksum seen more than once goes into RepeatFileFeatures
        if value > 1:
            print("[-] checksum: {} occurrences: {}".format(key,value))
            RepeatFileFeatures.append(key)

    for key,value in CatalogueDict.items():
        if value == "1926471896":  # demo: hard-coded checksum of one known duplicate
            print("[*] duplicate file located at: {}".format(file_path + key))

if __name__ == "__main__":
    Find_Repeat_File("D://python/",".jpg")

Come on, kid, let's go talk some tech. Learn your craft and every day is a feast.

Final spider code:

import os,re,random,urllib,argparse
from urllib import request,parse

# Build a random request header
def GetUserAgent(url):
    UsrHead = ["Windows; U; Windows NT 6.1; en-us","Windows NT 5.1; x86_64","Ubuntu U; NT 18.04; x86_64",
    "Windows NT 10.0; WOW64","X11; Ubuntu i686;","X11; Centos x86_64;","compatible; MSIE 9.0; Windows NT 8.1;",
    "X11; Linux i686","Macintosh; U; Intel Mac OS X 10_6_8; en-us","compatible; MSIE 7.0; Windows Server 6.1",
    "Macintosh; Intel Mac OS X 10.6.8; U; en","compatible; MSIE 7.0; Windows NT 5.1","iPad; CPU OS 4_3_3;"]
    UsrFox = ["Chrome/60.0.3100.0","Auburn Browser","Safari/522.13","Chrome/80.0.1211.0","Firefox/74.0",
    "Gecko/20100101 Firefox/4.0.1","Presto/2.8.131 Version/11.11","Mobile/8J2 Safari/6533.18.5",
    "Version/4.0 Safari/534.13","wOSBrowser/233.70 Baidu Browser/534.6 TouchPad/1.0","BrowserNG/7.1.18124",
    "rident/4.0; SE 2.X MetaSr 1.0;","360SE/80.1","wOSBrowser/233.70","UCWEB7.0.2.37/28/999","Opera/UCWEB7.0.2.37"]
    UsrAgent = "Mozilla/5.0 (" + str(random.sample(UsrHead,1)[0]) + ") AppleWebKit/" + str(random.randint(100,1000)) \
    + ".36 (KHTML, like Gecko) " + str(random.sample(UsrFox,1)[0])
    
    UsrRefer = str(url + "/" + "".join(random.sample("abcdef23457sdadw",10)))
    UserAgent = {"User-Agent": UsrAgent,"Referer":UsrRefer}
    return UserAgent

# Fetch the page source using only the standard library
def GetPageURL(page):
    head = GetUserAgent(page)
    req = request.Request(url=page,headers=head,method="GET")
    respon = request.urlopen(req,timeout=3)
    if respon.status == 200:
        html = respon.read().decode("utf-8") # 或是gbk根據頁面屬性而定
        return html

# Take a URL template plus a range, build each page URL, and return them as a list
def SplicingPage(page,start,end):
    url = []
    for each in range(start,end):
        temporary = page.format(each)
        url.append(temporary)
    return url
 
if __name__ == "__main__":

    urls = "https://www.meitulu.com/item/{}_{}.html".format(str(random.randint(1000,20000)),"{}")
    
    page_list = SplicingPage(urls,2,100)
    for item in page_list:
        try:
            respon = GetPageURL(str(item))
            subject = re.findall(r'<img src="([^"]+\.jpg)"',respon,re.S)
            for each in subject:
                img_name = each.split("/")[-1]
                img_type = each.split("/")[-1].split(".")[1]
                save_name = str(random.randint(11111111,999999999)) + "." + img_type
                print("[+] 原始名稱: {} 保存為: {} 路徑: {}".format(img_name,save_name,each))
                #urllib.request.urlretrieve(each,save_name,None)  # 無請求體的下載圖片方式
                head = GetUserAgent(str(urls))                # 隨機彈出請求頭
                ret = urllib.request.Request(each,headers=head)   # each = 訪問圖片路徑
                respons = urllib.request.urlopen(ret,timeout=10)  # 打開圖片路徑
                with open(save_name,"wb") as fp:
                    fp.write(respons.read())
        except Exception:
            # Clean up: delete any jpg under 100 KB in the current directory
            for each in os.listdir():
                if each.endswith(".jpg"):
                    if int(os.stat(each).st_size / 1024) < 100:
                        print("[-] Removing {} automatically (smaller than 100 KB).".format(each))
                        os.remove(each)
            exit(1)

The final effect: high-concurrency downloading, with a clear division of labour (one script cleans up duplicates, one deletes undersized files, one does the crawling; the foreman role is all yours). All-nighter tonight.

The code above still leaves plenty to optimise. For example, right now we crawl random galleries; if we only want a specific subset of models we need to improve it. First collect the links we need: find all the A tags on the listing page and pull out their titles and hrefs.

from bs4 import BeautifulSoup
import requests

if __name__ == "__main__":

    get_url = []
    urls = requests.get("https://www.meitulu.com/t/youhuo/")
    soup = BeautifulSoup(urls.text,"html.parser")
    soup_ret = soup.select('div[class="boxs"] ul[class="img"] a')
    for each in soup_ret:
        if str(each["href"]).endswith("html"):
            get_url.append(each["href"])
            
    for item in get_url:
        for each in range(2,30):
            url = item.replace(".html","_{}.html".format(each))
            with open("url.log","a+") as fp:
                fp.write(url + "\n")

Then just loop over the saved URLs and crawl them. There is no multithreading here, so it will be a bit slow; a threaded variant is sketched after this block.

from bs4 import BeautifulSoup
import requests,random

def GetUserAgent(url):
    UsrHead = ["Windows; U; Windows NT 6.1; en-us","Windows NT 5.1; x86_64","Ubuntu U; NT 18.04; x86_64",
    "Windows NT 10.0; WOW64","X11; Ubuntu i686;","X11; Centos x86_64;","compatible; MSIE 9.0; Windows NT 8.1;",
    "X11; Linux i686","Macintosh; U; Intel Mac OS X 10_6_8; en-us","compatible; MSIE 7.0; Windows Server 6.1",
    "Macintosh; Intel Mac OS X 10.6.8; U; en","compatible; MSIE 7.0; Windows NT 5.1","iPad; CPU OS 4_3_3;"]
    UsrFox = ["Chrome/60.0.3100.0","Auburn Browser","Safari/522.13","Chrome/80.0.1211.0","Firefox/74.0",
    "Gecko/20100101 Firefox/4.0.1","Presto/2.8.131 Version/11.11","Mobile/8J2 Safari/6533.18.5",
    "Version/4.0 Safari/534.13","wOSBrowser/233.70 Baidu Browser/534.6 TouchPad/1.0","BrowserNG/7.1.18124",
    "rident/4.0; SE 2.X MetaSr 1.0;","360SE/80.1","wOSBrowser/233.70","UCWEB7.0.2.37/28/999","Opera/UCWEB7.0.2.37"]
    UsrAgent = "Mozilla/5.0 (" + str(random.sample(UsrHead,1)[0]) + ") AppleWebKit/" + str(random.randint(100,1000)) \
    + ".36 (KHTML, like Gecko) " + str(random.sample(UsrFox,1)[0])
    
    UsrRefer = str(url + "/" + "".join(random.sample("abcdef23457sdadw",10)))
    UserAgent = {"User-Agent": UsrAgent,"Referer":UsrRefer}
    return UserAgent

url = []

with open("url.log","r") as fp:
    files = fp.readlines()
    for i in files:
        
        url.append(i.replace("\n",""))
        
        
    for i in range(0,len(url)):
        aget = GetUserAgent(url[i])
        try:
            ret = requests.get(url[i],timeout=10,headers=aget)
            if ret.status_code == 200:
                soup = BeautifulSoup(ret.text,"html.parser")
                soup_ret = soup.select('div[class="content"] img')
                for x in soup_ret:
                    try:
                        down = x["src"]
                        save_name = str(random.randint(11111111,999999999)) + ".jpg"
                        print("xiazai -> {}".format(save_name))
                        img_download = requests.get(url=down, headers=aget, stream=True)
                        with open(save_name,"wb") as fp:
                            for chunk in img_download.iter_content(chunk_size=1024):
                                fp.write(chunk)
                    except Exception:
                        pass
        except Exception:
            pass
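A minimal threaded variant of that loop, assuming the same url list, GetUserAgent helper and imports from the block above, could hand each page to a small worker pool:

from concurrent.futures import ThreadPoolExecutor

def crawl_one(page_url):
    aget = GetUserAgent(page_url)
    try:
        ret = requests.get(page_url, timeout=10, headers=aget)
        if ret.status_code != 200:
            return
        soup = BeautifulSoup(ret.text, "html.parser")
        for x in soup.select('div[class="content"] img'):
            down = x["src"]
            save_name = str(random.randint(11111111, 999999999)) + ".jpg"
            img_download = requests.get(url=down, headers=aget, stream=True)
            with open(save_name, "wb") as fp:
                for chunk in img_download.iter_content(chunk_size=1024):
                    fp.write(chunk)
    except Exception:
        pass

with ThreadPoolExecutor(max_workers=16) as pool:
    pool.map(crawl_one, url)    # url is the list read from url.log above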

Crawlers for two other sites are published below. The first one targets wuso:

import os,urllib,random,argparse,sys
from urllib import request,parse
from bs4 import BeautifulSoup

def GetUserAgent(url):
    UsrHead = ["Windows; U; Windows NT 6.1; en-us","Windows NT 5.1; x86_64","Ubuntu U; NT 18.04; x86_64",
    "Windows NT 10.0; WOW64","X11; Ubuntu i686;","X11; Centos x86_64;","compatible; MSIE 9.0; Windows NT 8.1;",
    "X11; Linux i686","Macintosh; U; Intel Mac OS X 10_6_8; en-us","compatible; MSIE 7.0; Windows Server 6.1",
    "Macintosh; Intel Mac OS X 10.6.8; U; en","compatible; MSIE 7.0; Windows NT 5.1","iPad; CPU OS 4_3_3;"]
    UsrFox = ["Chrome/60.0.3100.0","Auburn Browser","Safari/522.13","Chrome/80.0.1211.0","Firefox/74.0",
    "Gecko/20100101 Firefox/4.0.1","Presto/2.8.131 Version/11.11","Mobile/8J2 Safari/6533.18.5",
    "Version/4.0 Safari/534.13","wOSBrowser/233.70 Baidu Browser/534.6 TouchPad/1.0","BrowserNG/7.1.18124",
    "rident/4.0; SE 2.X MetaSr 1.0;","360SE/80.1","wOSBrowser/233.70","UCWEB7.0.2.37/28/999","Opera/UCWEB7.0.2.37"]
    UsrAgent = "Mozilla/5.0 (" + str(random.sample(UsrHead,1)[0]) + ") AppleWebKit/" + str(random.randint(100,1000)) \
    + ".36 (KHTML, like Gecko) " + str(random.sample(UsrFox,1)[0])
    
    UsrRefer = url + str("/" + "".join(random.sample("abcdefghi123457sdadw",10)))
    UserAgent = {"User-Agent": UsrAgent,"Referer":UsrRefer}
    return UserAgent

def GetPageURL(page):
    head = GetUserAgent(page)
    req = request.Request(url=page,headers=head,method="GET")
    respon = request.urlopen(req,timeout=30)
    if respon.status == 200:
        html = respon.read().decode("utf-8")
        return html

if __name__ == "__main__":
    runt = []
    waibu = GetPageURL("https://xxx.me/forum.php?mod=forumdisplay&fid=48&typeid=114&filter=typeid&typeid=114")
    soup1 = BeautifulSoup(waibu,"html.parser")
    ret1 = soup1.select("div[id='threadlist'] ul[id='waterfall'] a")
    for x in ret1:
        runt.append(x.attrs["href"])
    for ss in runt:
        print("[+] 爬行: {}".format(ss))
        try:
            resp = []
            respon = GetPageURL(str(ss))
            soup = BeautifulSoup(respon,"html.parser")
            ret = soup.select("div[class='pct'] div[class='pcb'] td[class='t_f'] img")
            try:
                for i in ret:
                    url = "https://xxx.me/" + str(i.attrs["file"])
                    print(url)
                    resp.append(url)
            except Exception:
                pass
                
            for each in resp:
                try:
                    img_name = each.split("/")[-1]
                    print("down: {}".format(img_name))
                    head=GetUserAgent("https://wuso.me")
                    ret = urllib.request.Request(each,headers=head)
                    respons = urllib.request.urlopen(ret,timeout=60)
                    with open(img_name,"wb") as fp:
                        fp.write(respons.read())
                        fp.close()
                except Exception:
                    pass
        except Exception:
            pass

2.0

import os,urllib,random,argparse,sys
from urllib import request,parse
from bs4 import BeautifulSoup

def GetUserAgent(url):
    UsrHead = ["Windows; U; Windows NT 6.1; en-us","Windows NT 5.1; x86_64","Ubuntu U; NT 18.04; x86_64",
    "Windows NT 10.0; WOW64","X11; Ubuntu i686;","X11; Centos x86_64;","compatible; MSIE 9.0; Windows NT 8.1;",
    "X11; Linux i686","Macintosh; U; Intel Mac OS X 10_6_8; en-us","compatible; MSIE 7.0; Windows Server 6.1",
    "Macintosh; Intel Mac OS X 10.6.8; U; en","compatible; MSIE 7.0; Windows NT 5.1","iPad; CPU OS 4_3_3;"]
    UsrFox = ["Chrome/60.0.3100.0","Auburn Browser","Safari/522.13","Chrome/80.0.1211.0","Firefox/74.0",
    "Gecko/20100101 Firefox/4.0.1","Presto/2.8.131 Version/11.11","Mobile/8J2 Safari/6533.18.5",
    "Version/4.0 Safari/534.13","wOSBrowser/233.70 Baidu Browser/534.6 TouchPad/1.0","BrowserNG/7.1.18124",
    "rident/4.0; SE 2.X MetaSr 1.0;","360SE/80.1","wOSBrowser/233.70","UCWEB7.0.2.37/28/999","Opera/UCWEB7.0.2.37"]
    UsrAgent = "Mozilla/5.0 (" + str(random.sample(UsrHead,1)[0]) + ") AppleWebKit/" + str(random.randint(100,1000)) \
    + ".36 (KHTML, like Gecko) " + str(random.sample(UsrFox,1)[0])
    
    UsrRefer = url + str("/" + "".join(random.sample("abcdefghi123457sdadw",10)))
    UserAgent = {"User-Agent": UsrAgent,"Referer":UsrRefer}
    return UserAgent

def GetPageURL(page):
    head = GetUserAgent(page)
    req = request.Request(url=page,headers=head,method="GET")
    respon = request.urlopen(req,timeout=30)
    if respon.status == 200:
        html = respon.read().decode("utf-8")
        return html

# Collect every thread link on the current listing page
def getpage():
# https://.me/forum.php?mod=forumdisplay&fid=48&filter=typeid&typeid=17
    waibu = GetPageURL("https://.me/forum.php?mod=forumdisplay&fid=48&filter=typeid&typeid=17")
    soup1 = BeautifulSoup(waibu,"html.parser")
    ret1 = soup1.select("div[id='threadlist'] ul[id='waterfall'] a")
    for x in ret1:
        print(x.attrs["href"])

# Collect the image URLs inside a thread page
def get_page_image(url):
    respon = GetPageURL(str(url))
    soup = BeautifulSoup(respon,"html.parser")
    ret = soup.select("div[class='pcb'] div[class='pattl'] div[class='mbn savephotop'] img")
    resp = []
    try:
        for i in ret:
        
            url = "https://.me/" + str(i.attrs["file"])
            print(url)
            resp.append(url)
    except Exception:
        pass
    return resp

# Download
if __name__ == "__main__":
# https://.me/forum.php?mod=viewthread&tid=747730&extra=page%3D1%26filter%3Dtypeid%26typeid%3D17
# python main.py ""
    args = sys.argv
    user = str(args[1])
    resp = get_page_image(user)
    for each in resp:
        try:
            img_name = each.split("/")[-1]
            head=GetUserAgent("https://.me")
            ret = urllib.request.Request(each,headers=head)
            respons = urllib.request.urlopen(ret,timeout=10)
            with open(img_name,"wb") as fp:
                fp.write(respons.read())
                fp.close()
            print("down: {}".format(img_name))
        except Exception:
            pass

The second crawler: the worker itself is multithreaded, and a separate launcher spins it up in multiple processes, so it crawls extremely fast and pins the CPU at 100%. The launcher comes first (a subprocess-based alternative is sketched right after it), then the threaded worker.

import os,sys
import subprocess
# lis.log holds one model name per line.
fp = open("lis.log","r")
aaa = fp.readlines()

for i in aaa:
    nam = i.replace("\n","")
    cmd = "python thread.py " + nam
    os.popen(cmd)
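os.popen gives you no handle on the children; a sketch using subprocess.Popen instead (same lis.log format assumed) lets you cap how many run at once and wait for them to finish:

import subprocess, sys

with open("lis.log", "r") as fp:
    names = [line.strip() for line in fp if line.strip()]

procs = []
for name in names:
    # launch one worker process per model name
    procs.append(subprocess.Popen([sys.executable, "thread.py", name]))
    if len(procs) >= 20:                  # keep at most 20 children alive
        procs.pop(0).wait()

for p in procs:
    p.wait()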

The multithreaded worker code (thread.py):

import requests,random
from bs4 import BeautifulSoup
import os,re,random,urllib,argparse
from urllib import request,parse
import threading,sys

def GetUserAgent(url):
    head = ["Windows; U; Windows NT 6.1; en-us","Windows NT 6.3; x86_64","Windows U; NT 6.2; x86_64",
    "Windows NT 6.1; WOW64","X11; Linux i686;","X11; Linux x86_64;","compatible; MSIE 9.0; Windows NT 6.1;",
    "X11; Linux i686","Macintosh; U; Intel Mac OS X 10_6_8; en-us","compatible; MSIE 7.0; Windows NT 6.0",
    "Macintosh; Intel Mac OS X 10.6.8; U; en","compatible; MSIE 7.0; Windows NT 5.1","iPad; CPU OS 4_3_3;",]
    fox = ["Chrome/60.0.3100.0","Chrome/59.0.2100.0","Safari/522.13","Chrome/80.0.1211.0","Firefox/74.0",
    "Gecko/20100101 Firefox/4.0.1","Presto/2.8.131 Version/11.11","Mobile/8J2 Safari/6533.18.5",
    "Version/4.0 Safari/534.13","wOSBrowser/233.70 Safari/534.6 TouchPad/1.0","BrowserNG/7.1.18124"]
    agent = "Mozilla/5.0 (" + str(random.sample(head,1)[0]) + ") AppleWebKit/" + str(random.randint(100,1000)) \
    + ".36 (KHTML, like Gecko) " + str(random.sample(fox,1)[0])
    refer = url
    UserAgent = {"User-Agent": agent,"Referer":refer}
    return UserAgent

def run(user):
    head = GetUserAgent("aHR0cHM6Ly93d3cuYW1ldGFydC5jb20v")
    ret = requests.get("aHR0cHM6Ly93d3cuYW1ldGFydC5jb20vbW9kZWxzL3t9Lw==".format(user),headers=head,timeout=3)
    scan_url = []
    if ret.status_code == 200:
        soup = BeautifulSoup(ret.text,"html.parser")
        a = soup.select("div[class='thumbs'] a")
        for each in a:
            url = "aHR0cHM6Ly93d3cuYW1ldGFydC5jb20v" + str(each["href"])
            scan_url.append(url)

    rando = random.choice(scan_url)
    print("隨機編號: {}".format(rando))

    try:
        ret = requests.get(url=str(rando),headers=head,timeout=10)
        if ret.status_code == 200:
            soup = BeautifulSoup(ret.text,"html.parser")
            img = soup.select("div[class='container'] div div a")
            try:
                for each in img:
                    head = GetUserAgent(str(each["href"]))
                    down = requests.get(url=str(each["href"]),headers=head)
                    img_name = str(random.randint(100000000,9999999999)) + ".jpg"
                    print("[+] 圖片解析: {} 保存為: {}".format(each["href"],img_name))
                    with open(img_name,"wb") as fp:
                        fp.write(down.content)
            except Exception:
                pass
    except Exception:
        exit(1)

if __name__ == "__main__":
    args = sys.argv
    user = str(args[1])
    try:
        os.mkdir(user)
        os.chdir("D://python/test/" + user)
        for item in range(100):
            t = threading.Thread(target=run,args=(user,))
            t.start()
    except FileExistsError:
        exit(0)

Run 20 processes with 100 threads riding in each and you get roughly 1,500 concurrent requests per second. The dedup script keeps scanning in the background, so nothing is stored twice and only the best copy survives. Funny discovery: once you have this many girl pics, none of them look good any more. Ha.


After all that crawling we have tens of thousands of images, but what if we only want the sets of one particular model? Time to call in the AI face-recognition squad: with a little machine learning we can recognise a specific face and filter the collection down to just the model we want.

import cv2
import numpy as np

def Display_Face(img_path):
    img = cv2.imread(img_path)                                                  # read the image
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)                                # convert it to grayscale
    face_cascade = cv2.CascadeClassifier("haarcascade_frontalface_default.xml") # load the Haar cascade classifier
    face_cascade.load("haarcascade_frontalface_default.xml")
    faces = face_cascade.detectMultiScale(gray, 1.3, 5)
    for (x, y, w, h) in faces:
    # draw a bounding box on the original image (blue, line width 3)
        img = cv2.rectangle(img, (x, y), (x + w, y + h), (255, 0, 0), 3)
    cv2.namedWindow("img", 0)
    cv2.resizeWindow("img", 300, 400)
    cv2.imshow('img', img)
    cv2.waitKey()

def Return_Face(img_path):
    img = cv2.imread(img_path)  
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    face_cascade = cv2.CascadeClassifier("haarcascade_frontalface_default.xml")
    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.2, minNeighbors=5)
    if (len(faces) == 0):
        return None,None
    (x, y, w, h) = faces[0]
    return gray[y:y + w, x:x + h], faces[0]

ret = Return_Face("./meizi/172909315.jpg")
print(ret)
Display_Face("./meizi/172909315.jpg")

import cv2,os
import numpy as np

def Return_Face(img_path):
    img = cv2.imread(img_path)  
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    face_cascade = cv2.CascadeClassifier("haarcascade_frontalface_default.xml")
    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.2, minNeighbors=5)
    if (len(faces) == 0):
        return None,None
    (x, y, w, h) = faces[0]
    return gray[y:y + w, x:x + h], faces[0]


# Load images: walk a directory of face folders (one folder per person) and build the training data
def LoadImages(data):
    images=[]
    names=[]
    labels=[]
    label=0
    #walk every sub-folder
    for subdir in os.listdir(data):
        subpath=os.path.join(data,subdir)
        #print('path',subpath)
        #make sure it really is a folder
        if os.path.isdir(subpath):
            #each folder holds many photos of one person
            names.append(subdir)
            #walk the image files in the folder
            for filename in os.listdir(subpath):
                imgpath=os.path.join(subpath,filename)
                img=cv2.imread(imgpath,cv2.IMREAD_COLOR)
                gray_img=cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
                #cv2.imshow('1',img)
                #cv2.waitKey(0)
                images.append(gray_img)
                labels.append(label)
            label+=1
    images=np.asarray(images)
    #names=np.asarray(names)
    labels=np.asarray(labels)
    return images,labels,names

images,labels,names = LoadImages("./")
face_recognizer = cv2.face.LBPHFaceRecognizer_create()

# Create the LBPH recognizer and train it
face_recognizer.train(images, labels)
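Training alone does not filter anything yet; a minimal sketch of using the trained recognizer to pull out one model's photos (the folder layout, the ./meizi/ source directory and the confidence cut-off of 60 are assumptions):

import shutil

target = "some_model"                  # the folder name you trained on
keep_dir = "./filtered/"
os.makedirs(keep_dir, exist_ok=True)

for name in os.listdir("./meizi/"):
    path = os.path.join("./meizi/", name)
    face, rect = Return_Face(path)     # reuse the detector defined above
    if face is None:
        continue
    label, confidence = face_recognizer.predict(face)
    # for LBPH a lower confidence value means a closer match; 60 is an arbitrary cut-off
    if names[label] == target and confidence < 60:
        shutil.copy(path, keep_dir)
        print("matched {} (confidence {:.1f})".format(name, confidence))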

Other crawlers I collected: crawler-writing styles gathered from around the web, included for reference.
1

# -*- coding: UTF-8 -*-
import sys,requests
from bs4 import BeautifulSoup
sys.path.append("/Python")
import conf.mysql_db as mysqldb

image_count = 1
#Grab the info for every photo set on the listing page
def get_photo_info(url,layout_tablename):
    global PhotoNames
    html = get_html(url)
    # html = fread('ttb.html')
    soup = BeautifulSoup(html, "lxml")
    db = mysqldb.Database()
    icount = 1
    for ul in soup.find_all(class_ = 'ul960c'):
        for li in ul:
            if (str(li).strip()):
                PhotoName = li.span.string
                PhotoUrl = li.img['src']
                imageUrl = 'http://www.quantuwang.co'+li.a['href']
                print('第'+str(icount)+'套圖:'+PhotoName+' '+PhotoUrl+' '+imageUrl)
                sql = "insert into "+layout_tablename+"(picname,girlname,picpath,flodername) values('%s','%s'," \
                      "'%s','%s')" % (imageUrl,PhotoName,PhotoUrl,PhotoName)
                db.execute(sql)
                icount = icount + 1
    db.close()
    return True

#Work out every image inside a set and save its info
def get_images(image_tablename,pic_nums,pic_title,url,layout_count):
    global image_count
    db = mysqldb.Database()
    try:
        for i in range(1, int(pic_nums)):
            pic_url = url[:-5] + str(i) + '.jpg'
            sql = "insert into "+image_tablename+"(id,imageid,flodername,imagepath) " \
                  "values (" + str(i) + ","+str(image_count)+",'" + pic_title + "','" + pic_url + "')"
            db.execute(sql)
            print('第'+str(layout_count)+'套寫真'+str(image_count)+',第'+str(i)+'張圖片:'+pic_title+' url:'+pic_url)
            image_count = image_count + 1
    except Exception as e:
        print('Error',e)
    db.close()

#Get the per-page links from the index page
def get_image_pages(url):
    html = get_html(url)
    soup = BeautifulSoup(html, "lxml")
    # print(html)
    image_pages = []
    image_pages.append(url)
    try:
        for ul in soup.find_all(class_='c_page'):
            for li in ul.find_all('a'):
                image_pages.append('http://www.quantuwang.co/'+li.get('href'))
    except Exception as e:
        print('Error',e)
    return len(image_pages)

#Fetch a page: pass in a url, get back the html source
def get_html(url):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        # 'Accept - Encoding': 'gzip, deflate',
        # 'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
    }
    resp = requests.get(url,headers=headers)
    resp.encoding='utf-8'
    html = resp.text
    # fwrite(html)
    return html

#Build the paginated url
def handle_url(i,url):
    if i == 1:
        return url
    else:
        url = url[:-5] + "_" + format(i) + ".html"
        return url

def main():
    global image_count
    # image_count = 1391
    url = 'http://www.quantuwang.co/t/f4543e3a7d545391.html'
    layoyt_name = '糯美子Mini'
    layout_tablename = 'pc_dic_'+'nuomeizi'
    image_tablename = 'po_'+'nuomeizi'

    #Clone the table structure
    db = mysqldb.Database()
    try:
        sql = "create table if not exists "+layout_tablename+"(LIKE pc_dic_toxic)"
        db.execute(sql)
        print('創建表:'+layout_tablename)
        sql = "create table if not exists " + image_tablename + "(LIKE po_toxic)"
        db.execute(sql)
        print('創建表:'+image_tablename)
    except Exception as e:
        print('Error',e)
    db.close()

    #Step 1: scrape the listing page
    get_photo_info(url,layout_tablename)

    #Step 2: enumerate every image in each set and insert it into the database
    layout_count = 1
    db = mysqldb.Database()
    sql = 'select * from '+layout_tablename+' where ID>0'
    results = db.fetch_all(sql)
    for row in results:
        # work out how many images the set contains
        imgage_nums = get_image_pages(row['picname']) + 1
        get_images(image_tablename,imgage_nums,row['flodername'],row['picpath'],layout_count)
        layout_count = layout_count + 1
    db.close()

    #Update the master table
    db = mysqldb.Database()
    try:
        sql = "select max(imageid) as maxcount from "+image_tablename
        results = db.fetch_one(sql)
        sql = "insert into pc_dic_lanvshen(BeautyName,MinID,MaxID,TableName,IndexName,IndexType) values ('%s',%d,%d,'%s'," \
              "'%s',%d)" % (layoyt_name,1,int(results['maxcount']),image_tablename,layout_tablename,1)
        db.execute(sql)
        print('數據已更新到總表:'+layout_tablename+' '+image_tablename)
    except Exception as e:
        print('Error',e)
    db.close()

if __name__ == '__main__':
    main()

2

#!/usr/local/Cellar/python/3.7.3/bin
# -*- coding: UTF-8 -*-
# https://www.meitulu.com
import sys,requests,time,random,re
from bs4 import BeautifulSoup
sys.path.append("/Python")
import conf.mysql_db as mysqldb
album_count = 1
image_count = 1
#Grab the info for every photo set on the listing page
def get_photo_info(url,layout_tablename):
    global album_count
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept - Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
    }
    req = requests.get(url, headers=headers)
    req.encoding = 'utf-8'
    # print(req.text)
    soup = BeautifulSoup(req.text, "lxml")
    db = mysqldb.Database()
    for ul in soup.find_all(class_ = 'img'):
        for li in ul:
            if (str(li).strip()):
                AlbumName = li.img['alt']
                AlbumNums = re.findall(r"\d+\.?\d*", li.p.string)[0]
                AlbumUrl = li.a['href']
                PhotoUrl = li.img['src']
                print('第'+str(album_count)+'套圖:'+AlbumName+' '+AlbumUrl+' '+PhotoUrl)
                sql = "insert into "+layout_tablename+"(picname,girlname,picpath,imageid,flodername) values('%s','%s','%s','%s')" % (AlbumUrl,AlbumName,PhotoUrl,AlbumNums,AlbumName)
                db.execute(sql)
                album_count = album_count + 1
    db.close()
    return True

#Save the info for every single image
def get_images(image_tablename,image_nums,flodername,image_url,albumID):
    global image_count
    db = mysqldb.Database()
    for i in range(1, int(image_nums)+1):
        image_path = image_url[:-6] + '/' + str(i) + '.jpg'
        sql = "insert into " + image_tablename + "(imageid,flodername,imagepath,id) values('%s','%s','%s','%s')" % (image_count, flodername, image_path, i)
        db.execute(sql)
        print('第'+str(albumID)+'套寫真'+str(image_count)+',第'+str(i)+'張圖片:'+flodername+' url:'+image_path)
        image_count = image_count + 1
    db.close()

#Check whether the page exists
def get_html_status(url):
    req = requests.get(url).status_code
    if(req == 200):
        return True
    else:
        return False

def main():
    global album_count
    global image_count
    # image_count = 1391
    url = 'https://www.meitulu.com/t/dingziku/'
    album_name = '丁字褲美女'
    album_tablename = 'pc_dic_'+'dingziku'
    image_tablename = 'po_'+'dingziku'

    #Clone the table structure
    db = mysqldb.Database()
    try:
        sql = "create table if not exists "+album_tablename+"(LIKE pc_dic_toxic)"
        db.execute(sql)
        print('創建表:'+album_tablename)
        sql = "create table if not exists " + image_tablename + "(LIKE po_toxic)"
        db.execute(sql)
        print('創建表:'+image_tablename)
    except Exception as e:
        print('Error',e)
    db.close()

    #Step 1: scrape the listing page
    get_photo_info(url,album_tablename)
    for i in range(2,100):
        urls = url +str(i)+'.html'
        # urls = url +str(i)+'.html'
        if(get_html_status(urls)):
            get_photo_info(urls,album_tablename)
            time.sleep(random.randint(1, 3))
        else:
           break

    #Step 2: enumerate every image in each set and insert it into the database
    db = mysqldb.Database()
    sql = 'select * from '+album_tablename+' where ID>0'
    results = db.fetch_all(sql)
    for row in results:
        get_images(image_tablename,row['imageid'],row['flodername'],row['picpath'],row['ID'])
    db.close()

    #Update the master table
    db = mysqldb.Database()
    try:
        sql = "select max(imageid) as maxcount from "+image_tablename
        results = db.fetch_one(sql)
        sql = "insert into pc_dic_lanvshen(BeautyName,MinID,MaxID,TableName,IndexName,IndexType) values ('%s',%d,%d,'%s'," \
              "'%s',%d)" % (album_name,1,int(results['maxcount']),image_tablename,album_tablename,1)
        db.execute(sql)
        print('數據已更新到總表:'+album_tablename+' '+image_tablename)
    except Exception as e:
        print('Error',e)
    db.close()


if __name__ == '__main__':
    main()

3

#!/usr/local/Cellar/python/3.7.3/bin
# -*- coding: UTF-8 -*-
# https://www.lanvshen.com
import sys,requests,re,time,random
from bs4 import BeautifulSoup
sys.path.append("/Python")
import conf.mysql_db as mysqldb
layout_count = 1
image_count = 1
#Find the info for every photo set
def get_layout(url,layout_tablename):
    global layout_count
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept - Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
    }
    req = requests.get(url, headers=headers)
    req.encoding = 'utf-8'
    # print(req.text)
    soup = BeautifulSoup(req.text, "lxml")
    db = mysqldb.Database()
    try:
        for ul1 in soup.find_all(class_='hezi'):
            for ul2 in ul1:
                if(str(ul2).strip()):
                    for li in ul2:
                        if (str(li).strip()):
                            layout_url = li.a['href']
                            cover_url = li.img['src']
                            layout_nums = re.findall('(\d+)', li.span.string)[0]
                            layout_name = li.find_all("p", class_="biaoti")[0].a.string
                            print('第'+str(layout_count)+'套寫真:'+layout_name+" url:"+layout_url)
                            # print('寫真集:'+layout_name+' 圖片數:'+str(layout_nums)+' 鏈接:'+cover_url)
                            sql = "insert into "+layout_tablename+"(ID,picname,girlname,picpath,imageid,flodername) values (" +\
                                  str(layout_count)+ ",'" + layout_url + "','" + layout_name + "','"+cover_url+"',"+str(layout_nums)+",'" + layout_name + "')"
                            db.execute(sql)
                            layout_count=layout_count+1
    except Exception as e:
        print('Error',e)
    db.close()

#Find every image inside a set
def get_images(image_tablename,pic_nums,pic_title,url):
    global image_count
    global layout_count
    url_num = re.findall('(\d+)', url)[0]
    db = mysqldb.Database()
    for i in range(1, int(pic_nums)):
        pic_url = 'https://img.hywly.com/a/1/' + url_num + '/' + str(i) + '.jpg'
        sql = "insert into "+image_tablename+"(id,imageid,flodername,imagepath) " \
              "values (" + str(i) + ","+str(image_count)+",'" + pic_title + "','" + pic_url + "')"
        db.execute(sql)
        print('第'+str(layout_count)+'套寫真,第'+str(i)+'張圖片:'+pic_title+' url:'+pic_url)
        image_count = image_count + 1
    db.close()

#Check whether the page exists
def get_html_status(url):
    req = requests.get(url).status_code
    if(req == 200):
        return True
    else:
        return False

def main():
    global layout_count
    url='https://www.lanvshen.com/s/16/'
    layoyt_name = '蕾絲美女'
    layout_tablename = 'pc_dic_'+'leisi'
    image_tablename = 'po_'+'leisi'

    #Clone the table structure
    db = mysqldb.Database()
    try:
        sql = "create table if not exists "+layout_tablename+"(LIKE pc_dic_toxic)"
        db.execute(sql)
        print('創建表:'+layout_tablename)
        sql = "create table if not exists " + image_tablename + "(LIKE po_toxic)"
        db.execute(sql)
        print('創建表:'+image_tablename)
    except Exception as e:
        print('Error',e)
    db.close()

    #Find every set in the collection and insert it into the database
    get_layout(url,layout_tablename)
    for i in range(1,100):
        urls = url + 'index_'+str(i)+'.html'
        # urls = url +str(i)+'.html'
        if(get_html_status(urls)):
            get_layout(urls,layout_tablename)
            time.sleep(random.randint(1, 3))
        else:
           break

    #Enumerate every image in each set and insert it into the database
    layout_count = 1
    db = mysqldb.Database()
    sql = 'select * from '+layout_tablename+' order by ID'
    results = db.fetch_all(sql)
    for row in results:
        get_images(image_tablename,row['imageid'],row['flodername'],row['picname'])
        layout_count = layout_count + 1
    db.close()

    #Update the master table
    db = mysqldb.Database()
    try:
        sql = "select max(imageid) as maxcount from "+image_tablename
        results = db.fetch_one(sql)
        sql = "insert into pc_dic_lanvshen(BeautyName,MinID,MaxID,TableName,IndexName,IndexType) values ('%s',%d,%d,'%s'," \
              "'%s',%d)" % (layoyt_name,1,int(results['maxcount']),image_tablename,layout_tablename,1)
        db.execute(sql)
        print('數據已更新到總表:'+layout_tablename+' '+image_tablename)
    except Exception as e:
        print('Error',e)
    db.close()

if __name__ == '__main__':
    main()

Ahem. Quick, Python, help me up, I can still learn to operate an excavator. To be continued...

