Python 寫入數據到 Excel:寫入百萬級數據、讀取數據,並找出重複數據


現實遇到的問題

在測試導入 Excel 數據的功能時,需要自己造測試數據。如果要寫入大批量數據,不可能手動一條條地造,所以寫了一個把數據寫入 Excel 的 demo。

下面為寫入excel的demo

import os
import random
from faker import Faker

# Current working directory; xw_toExcel prefixes the output file name with it.
file_path = os.getcwd()
print(file_path)
# Faker instance for the en_US locale; nothing in the visible code uses it.
f = Faker(locale='en_US')


class GenExcelUser(object):
    """Generate random buyer rows and write them to an .xlsx workbook."""

    def __init__(self, num=10, type=1, scenario=0):
        """Store the generation settings.

        :param num: number of phone numbers to generate. Only stored here;
            ``xw_toExcel`` takes its own ``num`` argument.
        :param type: 1 = imported users (three columns: buyer account, buyer
            phone number, user mark); 2 = blacklist users (phone number
            column only). Only stored; no method of this class reads it.
            (The name shadows the builtin but is kept for caller
            compatibility.)
        :param scenario: applies to imported users only. The default 0 means
            normal data, i.e. account, phone number and mark are all present.
            1: last row has no phone number; 2: last row has no nickname;
            3: no row has a phone number; 4: no row has a nickname;
            5: nickname and phone number are both empty.
            Only stored; no method of this class reads it.
        """
        # Bare file name. xw_toExcel joins it onto the working directory with
        # os.path.join, so no platform-specific separator is baked in here.
        self.dirname = '導入測試數據.xlsx'
        self.buername = 'autotest'  # prefix of every generated account name
        self.num = num
        self.type = type
        self.lable = '50w_mark'  # mark/label written into the third column
        self.scenario = scenario

    def gen_phone(self):
        """Return an 11-character phone string: '179' plus 8 random digits.

        ``random.sample`` draws without replacement, so the 8 generated
        digits are all different from each other.
        """
        digits = random.sample(range(0, 10), 8)
        return '179' + ''.join(str(d) for d in digits)

    def generate_random_str(self):
        """Return a random 7-character string of ASCII letters and digits.

        Characters are sampled without replacement, so no character repeats
        within one result.
        """
        import string

        # Same population and order as before: a-z, then A-Z, then 0-9.
        alphabet = (string.ascii_lowercase
                    + string.ascii_uppercase
                    + string.digits)
        return ''.join(random.sample(alphabet, 7))

    def gene_data(self):
        """Build one imported-user row.

        :return: dict with keys ``buername`` (prefix + '_' + random string),
            ``phone`` and ``lable`` (always the fixed ``self.lable``).
        """
        # An explicit dict instead of locals(): locals() also carried `self`
        # and would silently pick up any local added later.
        return {
            'buername': self.buername + '_' + self.generate_random_str(),
            'phone': self.gen_phone(),
            'lable': self.lable,
        }

    # NOTE: an earlier pandas-based writer (DataFrame.to_excel) was dropped;
    # per the original author it coped with ~100k rows but failed at the
    # million-row scale, hence the xlsxwriter implementation below.

    def xw_toExcel(self, num=100):
        """Write a header row plus ``num`` random rows using xlsxwriter.

        The workbook is created in the working directory (module-level
        ``file_path``) under the name ``self.dirname``.

        :param num: number of data rows to generate and write.
        """
        import xlsxwriter as xw

        fileName = os.path.join(file_path, self.dirname)

        # constant_memory makes xlsxwriter flush each row once the next one
        # starts instead of holding the whole sheet in memory. Rows are
        # written strictly top to bottom here, which that mode requires.
        # The context manager closes (and thereby saves) the workbook even
        # if a write raises.
        with xw.Workbook(fileName, {'constant_memory': True}) as workbook:
            worksheet1 = workbook.add_worksheet("sheet1")
            worksheet1.activate()
            worksheet1.write_row('A1', ['賬號', '手機號', '標記'])  # header
            # Data starts on spreadsheet row 2. Rows are generated one at a
            # time rather than collected into a list first.
            for row_number in range(2, num + 2):
                record = self.gene_data()
                worksheet1.write_row(
                    'A' + str(row_number),
                    [record["buername"], record["phone"], record["lable"]],
                )


if __name__ == '__main__':
    import time

    # Prompt for the row count before the timer starts, so the time spent
    # waiting for input is not measured.
    raw_count = input('請輸入需要導入數量: ')
    started = time.time()
    GenExcelUser().xw_toExcel(num=int(raw_count))
    elapsed = time.time() - started
    print('總計耗時%s' % elapsed)

由於寫入的數據是隨機生成的,無法保證其中沒有重複值,所以又寫了一個解析 Excel 文件的腳本,用來檢查是否存在重複數據。

代碼如下

import pandas as pd
import time

# Workbook that excel_one_line_to_list() reads.
file_path = r'./導入測試數據.xlsx'
# Second workbook path; not referenced anywhere in the visible code.
file_path1 = r'./測試.xlsx'


def excel_one_line_to_list():
    """
    Read the first column of the workbook at ``file_path`` into a flat list.

    Prints the number of values read and returns the list. ``usecols=[0]``
    selects the first column only (``usecols=[0, 1]`` would select two);
    ``names=None`` leaves pandas' default header handling in place.
    """
    frame = pd.read_excel(file_path, usecols=[0], names=None)
    # Each row comes back as a one-element list; keep just that element.
    first_column = [cells[0] for cells in frame.values.tolist()]

    print('列表數量為%d' % len(first_column))
    return first_column


def compar_data(data: list) -> list:
    """
    Return the values that occur more than once in ``data``.

    :param data: values to check; every element must be hashable.
    :return: each duplicated value exactly once, ordered by first appearance
        in ``data``. Empty list when there are no duplicates.
    """
    from collections import Counter

    # Counter keeps keys in first-seen order, so the result order matches
    # the order in which the duplicated values first show up in the input.
    occurrences = Counter(data)
    return [value for value, count in occurrences.items() if count >= 2]


if __name__ == '__main__':
    # Time the whole run: load the first column, find duplicates, report.
    began = time.time()
    duplicates = compar_data(excel_one_line_to_list())
    print('重復數量為%s,重復值列表為%s' % (len(duplicates), duplicates))
    finished = time.time()
    print('總計耗時%s' % (finished - began))


免責聲明!

本站轉載的文章僅供個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM