python手機號前7位歸屬地爬蟲


需求分析

項目上需要用到手機號前7位,用來判斷號碼是否合法,以及查詢歸屬地。舊的數據是幾年前的,太舊了,打算用python爬蟲重新爬一份。

單線程版本

# coding:utf-8
import requests
from datetime import datetime


class PhoneInfoSpider:
    """Crawl the Taobao phone-segment lookup for generated phone numbers.

    For every prefix in ``phoneSections`` the spider builds 11-character
    numbers (prefix + 4-digit zero-padded middle + ``"0000"``), requests the
    lookup URL for each one and appends the parsed fields to ``./result.txt``.
    """

    def __init__(self, phoneSections):
        """Store the iterable of number prefixes (strings) to crawl."""
        self.phoneSections = phoneSections

    def phoneInfoHandler(self, textData):
        """Parse one response body and append a CSV line to ``./result.txt``.

        Bodies with fewer than 9 lines are ignored. Otherwise the text between
        the first pair of single quotes on lines 1, 2, 3 and 5 (0-based) is
        taken as number, province, mobile_area and postcode respectively,
        printed, and appended to the result file as one comma-separated line.

        Raises:
            IndexError: if one of those lines contains no single quote.
        """
        text = textData.splitlines(True)
        if len(text) < 9:
            return

        number = text[1].split('\'')[1]
        province = text[2].split('\'')[1]
        mobile_area = text[3].split('\'')[1]
        postcode = text[5].split('\'')[1]
        line_text = number + "," + province + "," + mobile_area + "," + postcode
        print(line_text)

        try:
            # The context manager closes (and flushes) the file on every
            # call; the previous code leaked one open handle per record.
            with open('./result.txt', 'a') as f:
                f.write(line_text + '\n')
        except Exception as e:
            print(Exception, ":", e)

    def requestPhoneInfo(self, phoneNum):
        """Fetch the lookup result for ``phoneNum`` and hand it to the parser.

        Any error (network failure, timeout, unexpected response shape) is
        printed and swallowed so a single bad lookup does not stop the crawl.
        """
        try:
            url = 'https://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel=' + phoneNum
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, timeout=10)
            self.phoneInfoHandler(response.text)
        except Exception as e:
            print(Exception, ":", e)

    def requestAllSections(self, last=0, limit=10):
        """Request every generated number for every configured prefix.

        Args:
            last: middle index to resume from for the FIRST prefix only (use
                it to continue after an abnormal exit); later prefixes always
                start at 0.
            limit: exclusive upper bound of the middle index. The default of
                10 keeps the original short run; pass 10000 for a full crawl.
        """
        # Phone numbers are generated automatically; the last four digits
        # are always padded with "0000".
        for head in self.phoneSections:
            head_begin = datetime.now()
            print(head + " begin time:" + str(head_begin))

            for i in range(last, limit):
                middle = str(i).zfill(4)
                phoneNum = head + middle + "0000"
                self.requestPhoneInfo(phoneNum)
            # The resume offset only applies to the first prefix.
            last = 0

            head_end = datetime.now()
            print(head + " end time:" + str(head_end))


# Script entry point: crawl every known prefix once and report timings.
if __name__ == '__main__':
    task_begin = datetime.now()
    print("phone check begin time:" + str(task_begin))

    # Prefixes grouped as: telecom, unicom, mobile, virtual operators.
    dx = ['133', '149', '153', '173', '177', '180', '181', '189', '199']
    lt = ['130', '131', '132', '145', '146', '155', '156', '166', '171', '175', '176', '185', '186']
    yd = ['134', '135', '136', '137', '138', '139', '147', '148', '150', '151', '152', '157', '158', '159', '172',
          '178', '182', '183', '184', '187', '188', '198']
    add = ['170']
    # dict.fromkeys drops repeated prefixes while keeping order, so no
    # segment is crawled (and written to the result file) twice.
    all_num = list(dict.fromkeys(dx + lt + yd + add))

    print(len(all_num))

    # Prefixes to crawl.
    spider = PhoneInfoSpider(all_num)
    spider.requestAllSections()

    task_end = datetime.now()
    print("phone check end time:" + str(task_end))

發現爬取一個號段,共10000次查詢,單線程版大概要1個半小時,太慢了。

多線程版本

# coding:utf-8
import requests
from datetime import datetime
import queue
import threading

# Number of worker threads started for each number prefix.
threadNum = 32


class MyThread(threading.Thread):
    """Thread that executes a zero-argument callable given at construction."""

    def __init__(self, func):
        """Remember ``func``; it is called when the thread runs."""
        super().__init__()
        self.func = func

    def run(self):
        """Call the stored callable on this thread."""
        target = self.func
        target()


def requestPhoneInfo():
    """Worker loop: drain the shared task queue and look up one number per task.

    Reads the module-level globals ``q`` (task queue), ``lock`` (guards the
    check-then-dequeue step) and ``phone_head`` (current number prefix).
    Each dequeued task ``p`` is mapped to the 4-digit middle part ``p - 1``,
    the number ``phone_head + middle + "0000"`` is requested, and the
    response body is passed to ``phoneInfoHandler``. The loop ends when the
    queue is empty. Request and parse errors are printed and skipped.
    """
    while True:
        # Emptiness check and dequeue happen under the lock so two workers
        # cannot both see "one item left" and then block on get(). The
        # ``with`` form also releases the lock if anything raises.
        with lock:
            if q.qsize() == 0:
                break
            print("queue size:" + str(q.qsize()))
            p = q.get()  # take one task

        # Derive the middle digits from the dequeued task itself. Reading
        # q.qsize() here, outside the lock, races with other workers and
        # yields duplicated or skipped numbers.
        middle = str(p - 1).zfill(4)
        phoneNum = phone_head + middle + "0000"
        print("phoneNum:" + phoneNum)

        try:
            url = 'https://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel=' + phoneNum
            # Without a timeout one stalled connection would keep this
            # worker (and the join() waiting on it) blocked forever.
            response = requests.get(url, timeout=10)
            phoneInfoHandler(response.text)
        except Exception as e:
            print(Exception, ":", e)


def phoneInfoHandler(textData):
    """Parse one response body and append a CSV line to ``./result.txt``.

    Bodies with fewer than 9 lines are ignored. Otherwise the text between
    the first pair of single quotes on lines 1, 2, 3 and 5 (0-based) is taken
    as number, province, mobile_area and postcode respectively, printed, and
    appended to the result file as one comma-separated line.

    Raises:
        IndexError: if one of those lines contains no single quote.
    """
    text = textData.splitlines(True)
    if len(text) < 9:
        return

    number = text[1].split('\'')[1]
    province = text[2].split('\'')[1]
    mobile_area = text[3].split('\'')[1]
    postcode = text[5].split('\'')[1]
    line_text = number + "," + province + "," + mobile_area + "," + postcode
    print(line_text)

    try:
        # The context manager closes (and flushes) the file on every call;
        # the previous code leaked one open handle per record.
        with open('./result.txt', 'a') as f:
            f.write(line_text + '\n')
    except Exception as e:
        print(Exception, ":", e)


# Script entry point: for each prefix, fill a task queue with 10000 tasks and
# drain it with ``threadNum`` worker threads, printing timings along the way.
if __name__ == '__main__':
    task_begin = datetime.now()
    print("phone check begin time:" + str(task_begin))

    dx = ['133', '149', '153', '173', '177', '180', '181', '189', '199']
    lt = ['130', '131', '132', '145', '155', '156', '166', '171', '175', '176', '185', '186']
    yd = ['134', '135', '136', '137', '138', '139', '147', '150', '151', '152', '157', '158', '159', '172', '178',
          '182', '183', '184', '187', '188', '198']
    # dict.fromkeys drops repeated prefixes while keeping order, so no
    # segment is crawled (and written to the result file) twice.
    all_num = list(dict.fromkeys(dx + lt + yd))
    print(len(all_num))

    for head in all_num:
        head_begin = datetime.now()
        print(head + " begin time:" + str(head_begin))

        # q, lock and phone_head are module-level names read by the worker
        # function requestPhoneInfo; they are rebuilt for every prefix.
        q = queue.Queue()
        threads = []
        lock = threading.Lock()
        phone_head = head

        for p in range(10000):
            q.put(p + 1)

        print(q.qsize())

        for i in range(threadNum):
            thread = MyThread(requestPhoneInfo)
            thread.start()
            threads.append(thread)
        # Wait for all workers to finish before moving to the next prefix.
        for thread in threads:
            thread.join()

        head_end = datetime.now()
        print(head + " end time:" + str(head_end))

    task_end = datetime.now()
    print("phone check end time:" + str(task_end))

多線程版的1個號碼段10000條數據,大概2、3分鐘就好,cpu使用飆升,大概維持在70%左右。
總共40多個號段,爬完大概1、2個小時,總數據41w左右。


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM