現實遇到的問題
在測試導入excel數據的功能時,需要自己造數據;大批量數據不可能一條一條手工造,所以寫了一個把數據寫入excel的demo
下面為寫入excel的demo
import os
import random
from faker import Faker
# Directory the script is launched from; GenExcelUser.xw_toExcel uses it to
# build the path of the generated workbook.
file_path = os.getcwd()
print(file_path)
# Faker instance using the en_US locale. NOTE(review): nothing in the code
# visible here references `f` -- confirm whether it is still needed.
f = Faker(locale='en_US')
class GenExcelUser(object):
    """Generate random user rows and write them to an .xlsx file.

    Each generated row holds a buyer account name, a phone number and a tag
    label; the workbook is meant to be fed to an Excel-import feature under
    test.
    """

    def __init__(self, num=10, type=1, scenario=0):
        """
        :param num: number of phone numbers to generate. It is only stored;
            ``xw_toExcel`` takes its own ``num`` argument.
        :param type: 1 = imported users (three columns: buyer account, buyer
            phone, user tag); 2 = blacklist users (phone column only).
            Stored, but no method of this class branches on it yet.
        :param scenario: only relevant to imported users. 0 (default) is
            normal data where account, phone and tag are all present.
            1: last row has no phone; 2: last row has no nickname;
            3: no row has a phone; 4: no row has a nickname;
            5: nickname and phone are both empty.
            Stored, but no method of this class branches on it yet.
        """
        # Bare file name. The directory separator is added by os.path.join in
        # xw_toExcel, so no backslash is embedded here: '\導' is an invalid
        # escape sequence and only acted as a separator on Windows.
        self.dirname = '導入測試數據.xlsx'
        self.buername = 'autotest'
        self.num = num
        self.type = type
        self.lable = '50w_mark'  # tag name written to the third column
        self.scenario = scenario

    def gen_phone(self):
        """Return a random 11-character phone number: '179' plus 8 digits."""
        # Draw the 8-digit suffix uniformly from all 10**8 values. Sampling
        # digits without replacement could never repeat a digit, which left
        # only 1,814,400 possible numbers and made collisions very likely
        # once a million rows were generated.
        return '179%08d' % random.randrange(10 ** 8)

    def generate_random_str(self):
        """
        :return: random 7-character string of ASCII letters and digits in
            which no character is repeated
        """
        import string

        # Lowercase, then uppercase, then the digits 0-9.
        alphabet = string.ascii_letters + string.digits
        return ''.join(random.sample(alphabet, 7))

    def gene_data(self):
        """
        Build one imported-user record.

        :return: dict with the keys ``buername`` (account prefix + '_' +
            random suffix), ``phone`` and ``lable`` (always ``self.lable``)
        """
        return {
            'buername': self.buername + '_' + self.generate_random_str(),
            'phone': self.gen_phone(),
            'lable': self.lable,
        }

    # NOTE: an earlier writer based on pandas DataFrame.to_excel was dropped.
    # According to the original author it coped with ~100k rows but raised
    # errors at the million-row scale; xw_toExcel replaces it.

    def xw_toExcel(self, num=100):
        """
        Write a header row plus ``num`` generated rows with xlsxwriter.

        The workbook is created at ``file_path`` (module-level directory)
        joined with ``self.dirname``.

        :param num: number of data rows to generate and write
        """
        import xlsxwriter as xw

        file_name = os.path.join(file_path, self.dirname)
        # constant_memory makes xlsxwriter flush each row once the next one
        # starts, and rows are generated one at a time, so memory use does not
        # grow with ``num``. The with-block closes (saves) the workbook even
        # if row generation raises.
        with xw.Workbook(file_name, {'constant_memory': True}) as workbook:
            worksheet = workbook.add_worksheet("sheet1")
            worksheet.activate()
            # Row 0 is the header; data rows follow in increasing order, as
            # constant_memory mode requires.
            worksheet.write_row(0, 0, ['賬號', '手機號', '標記'])
            for row_index in range(1, num + 1):
                record = self.gene_data()
                worksheet.write_row(
                    row_index,
                    0,
                    [record['buername'], record['phone'], record['lable']],
                )
if __name__ == '__main__':
    import time

    # Read the row count before starting the clock so that the time spent
    # typing is not counted.
    raw_count = input('請輸入需要導入數量: ')
    began = time.time()
    GenExcelUser().xw_toExcel(num=int(raw_count))
    elapsed = time.time() - began
    # Report the wall-clock seconds spent generating and writing the workbook.
    print('總計耗時%s' % elapsed)
由於寫入的數據是隨機生成的,無法保證其中沒有重複值,所以又寫了一個解析excel文件的腳本,用來檢查是否存在重複數據
代碼如下
import pandas as pd
import time
# Workbook that excel_one_line_to_list() reads, relative to the working directory.
file_path = r'./導入測試數據.xlsx'
# Second workbook path. NOTE(review): not referenced by the code visible
# here -- confirm whether it is still needed.
file_path1 = r'./測試.xlsx'
def excel_one_line_to_list():
    """
    Load the first column of the workbook at ``file_path`` into a list.

    Prints how many values were read, then returns them.
    """
    # usecols=[0] keeps only the first column; usecols=[0, 1] would keep the
    # first two.
    frame = pd.read_excel(file_path, usecols=[0], names=None)
    # Each row comes back as a one-element list; unwrap it.
    first_column = [row[0] for row in frame.values.tolist()]
    print('列表數量為%d' % len(first_column))
    return first_column
def compar_data(data: list) -> list:
    """
    Find the values that occur more than once in ``data``.

    :param data: values to check; every element must be hashable
    :return: each value seen at least twice, listed once, in order of first
        appearance; an empty list when there are no duplicates
    """
    from collections import Counter

    # Counter keeps keys in first-insertion order, so the result order matches
    # the order in which the duplicated values first appear in ``data``.
    occurrences = Counter(data)
    return [value for value, seen in occurrences.items() if seen >= 2]
if __name__ == '__main__':
    began = time.time()
    # Read the first column, then collect every value that appears twice or more.
    duplicates = compar_data(excel_one_line_to_list())
    print('重復數量為%s,重復值列表為%s' % (len(duplicates), duplicates))
    # The elapsed time covers reading, checking and printing the report above.
    print('總計耗時%s' % (time.time() - began))
