批量合併文件找到3種方式:
- open的方式,先讀後寫入
- pandas的concat方法
- HDF5
前2種方式已經嘗試,第3種方式尚待試驗。採用pandas的concat方法合併時,列會錯位,原因還有待進一步查明;第一種方式已實現。
1.有幾個文件為xlsx格式,需要先轉成csv文件。其他都為csv格式。轉換代碼:
import pandas as pd
import glob
def xlsx_to_csv():
    """Convert every ``*.xlsx`` file in the current directory to CSV.

    Each workbook is read with its first column as the index and written
    next to the source file, encoded as gb18030, under the same name with
    the extension replaced by ``.csv``.

    Returns:
        list[str]: names of the CSV files that were written, in sorted
        order of their source workbooks (empty when nothing matched).
    """
    written = []
    for xlsx_name in sorted(glob.glob('*.xlsx')):
        frame = pd.read_excel(xlsx_name, index_col=0)
        # Strip only the final extension: splitting on the first dot would
        # map "2019.01.xlsx" and "2019.02.xlsx" to the same "2019.csv".
        csv_name = xlsx_name.rsplit('.', 1)[0] + '.csv'
        frame.to_csv(csv_name, encoding='gb18030')
        written.append(csv_name)
    return written
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    xlsx_to_csv()
2.批量合併文件代碼
import os,sys
import time
import glob
import pandas as pd
import xlrd
from xlrd import XLRDError
class ArgsError(Exception):
    """Raised when an input path is wrong or a file cannot be accessed.

    Derives from ``Exception`` so that it can actually be raised with a
    message and caught; a plain class can do neither.
    """
class concatAndScreenData(object):
    """Merge the CSV files of one directory and extract rows for listed people.

    Workflow (see ``run``):
      1. ``concat_data`` concatenates every ``*.csv`` file of the working
         directory into ``數據源.csv``.
      2. ``read_person`` loads the names to keep from sheet ``ty`` of
         ``人員名單.xlsx``.
      3. ``screen_data`` appends to ``data.txt`` every merged row whose
         third column (index 2) is one of those names.

    All files are resolved against ``self.path`` (the current working
    directory when no path was given).
    """

    def __init__(self, path=None):
        """Store the working directory (``None`` means the current directory)."""
        self.path = path
        self.person_list = []

    def _base_dir(self):
        """Return the directory all input and output files are resolved against."""
        return self.path if self.path else os.getcwd()

    def set_path(self):
        """Default ``self.path`` to the current working directory when unset."""
        if not self.path:
            self.path = os.getcwd()

    def concat_data(self):
        """Concatenate all CSV files of the working directory into ``數據源.csv``.

        Files are joined as raw bytes in sorted name order. An earlier attempt
        with ``pd.concat`` produced misaligned columns (cause not identified),
        which is why the merge is done at byte level.

        The merged file is rebuilt from scratch on every call and is never
        used as one of its own inputs, so re-running does not duplicate data.
        Nothing is written when the directory has no other CSV file.
        """
        base = self._base_dir()
        merged_name = '數據源.csv'
        csv_list = sorted(
            name for name in os.listdir(base)
            if os.path.splitext(name)[1] == '.csv' and name != merged_name
        )
        if not csv_list:
            return
        with open(os.path.join(base, merged_name), 'wb') as merged:
            for name in csv_list:
                with open(os.path.join(base, name), 'rb') as part:
                    data = part.read()
                merged.write(data)
                # Keep the last row of this file from being glued onto the
                # first row of the next one when the newline is missing.
                if data and not data.endswith(b'\n'):
                    merged.write(b'\n')

    def read_person(self):
        """Append the first column of sheet ``ty`` to ``self.person_list``.

        The workbook ``人員名單.xlsx`` is looked up in the working directory;
        when it is absent the list is left unchanged.

        NOTE(review): xlrd 2.0+ no longer opens .xlsx workbooks - confirm the
        installed xlrd version supports this file.

        Returns:
            list: ``self.person_list`` (the same list object, possibly extended).
        """
        workbook_path = os.path.join(self._base_dir(), '人員名單.xlsx')
        if os.path.exists(workbook_path):
            table = xlrd.open_workbook(workbook_path).sheet_by_name('ty')
            rows = (table.row_values(i) for i in range(table.nrows))
            self.person_list.extend(row[0] for row in rows if row)
        return self.person_list

    def screen_data(self):
        """Append the rows of ``數據源.csv`` that belong to listed people to ``data.txt``.

        The merged file is read without a header, in chunks of 100000 rows,
        decoded as gb18030. A row is kept when its third column (index 2) is
        in ``self.person_list``; it is written as its values joined by commas.
        ``data.txt`` is opened in append mode with the platform default
        encoding, and characters that cannot be encoded are dropped.

        Prints ``文件不存在!`` when the merged file is missing.
        """
        base = self._base_dir()
        source = os.path.join(base, '數據源.csv')
        if not os.path.exists(source):
            print(u'文件不存在!')
            return
        if os.path.getsize(source) == 0:
            # read_csv raises on a zero-byte file; there is nothing to screen.
            return
        # Set membership keeps the per-row test O(1) for long name lists.
        wanted = set(self.person_list)
        reader = pd.read_csv(source, header=None, chunksize=100000,
                             encoding='gb18030', low_memory=False)
        with open(os.path.join(base, 'data.txt'), 'a', errors='ignore') as out:
            for chunk in reader:
                # With header=None the columns are 0..n-1 in file order, so
                # tuple position 2 is column 2 (the person name).
                for row in chunk.itertuples(index=False, name=None):
                    if row[2] in wanted:
                        out.write(','.join(map(str, row)) + '\n')

    def run(self, path=None):
        """Run the whole merge-and-screen workflow.

        Args:
            path: optional working directory; when given it replaces the one
                passed to the constructor.

        Raises:
            ArgsError: when a file or directory cannot be read or written.
        """
        if path is not None:
            self.path = path
        self.set_path()
        # Merge the inputs, then load the names. Both files are closed as
        # soon as they are written, so no delay is needed in between.
        try:
            self.concat_data()
            self.read_person()
        except OSError as err:
            # IOError and WindowsError are aliases of OSError on Python 3.
            raise ArgsError(u'文件路徑錯誤或未關閉') from err
        # Screen the merged data against the loaded names.
        self.screen_data()
# Module-level instance, created on import so other modules can reuse it.
app = concatAndScreenData()

# Run the full workflow only when this file is executed as a script.
if __name__ == "__main__":
    app.run(path=None)
