Python 實現合並
def merge_excel(dir):
    """Stack every Excel workbook found in a folder into one workbook.

    Each ``.xlsx`` / ``.xls`` file directly inside ``dir`` is read with
    ``pandas.read_excel`` and the resulting frames are concatenated top to
    bottom.  The result is written to ``合並結果表.xlsx`` inside the same
    folder.  Nothing is written when the folder contains no Excel files.

    Args:
        dir: Path of the folder to scan (sub-folders are not visited).
    """
    # Imported locally so the snippet is runnable on its own.
    import os

    import pandas as pd

    print('--- 執行合並 ---')
    output_name = "合並結果表.xlsx"
    frames = []  # one DataFrame per workbook
    print("路徑是:", dir, "\n有以下文件:")
    for name in os.listdir(path=dir):
        print(name)
        # Skip the result of a previous run, otherwise it would be merged
        # into itself, and skip "~$" lock files.
        if name == output_name or name.startswith('~$'):
            continue
        # Match on the real suffix; a substring test would also accept
        # names such as "xls_notes.txt".
        if name.lower().endswith(('.xlsx', '.xls')):
            frames.append(pd.read_excel(os.path.join(dir, name)))
    if frames:
        result = pd.concat(frames)  # vertical concatenation
        result.to_excel(os.path.join(dir, output_name))


merge_excel("D:/某文件夾")
Python 實現拆分
def split_excel(path, num):
    """Split one Excel workbook into files of at most ``num`` rows each.

    The pieces are written into the folder of the source workbook and are
    named after it with a 1-based counter appended (``test1.xlsx``,
    ``test2.xlsx``, ...).  A workbook without data rows still produces one
    (empty) piece.

    Args:
        path: Path of the workbook to split, including the file name.
        num: Maximum number of rows per output file; must be positive.

    Raises:
        ValueError: If ``num`` is not positive.
    """
    # Imported locally so the snippet is runnable on its own.
    import math
    import os

    import pandas as pd

    if num <= 0:
        raise ValueError("num must be a positive number of rows per file")

    out_dir, file_name = os.path.split(path)
    # splitext removes only the extension; str.strip('.xlsx') would strip
    # any leading/trailing '.', 'x', 'l' or 's' characters from the name.
    sheetname = os.path.splitext(file_name)[0]

    data = pd.read_excel(path)
    nrows = data.shape[0]
    # Ceiling division avoids a trailing empty file when nrows is an exact
    # multiple of num.
    count = max(1, math.ceil(nrows / num))
    for i in range(1, count + 1):
        sheetname_temp = sheetname + str(i) + '.xlsx'  # name of this piece
        begin = (i - 1) * num
        end = min(begin + num, nrows)
        print(sheetname_temp)
        data_temp = data.iloc[begin:end, :]  # [row range, all columns]
        data_temp.to_excel(os.path.join(out_dir, sheetname_temp))


split_excel("test.xlsx", 5)
以上轉自:https://mp.weixin.qq.com/s/0qwnhY5t_FPBFEEDvEuYiA
Python 操作PDF的幾種方法(合並、拆分、水印、加密)
一、前言
大家好,有關Python操作PDF的案例之前已經寫過一個👉PDF批量合並,這個案例初衷只是給大家提供一個便利的腳本,並沒有太多講解原理,其中涉及的就是PDF處理很實用的模塊PyPDF2,本文就好好剖析一下這個模塊,主要將涉及:
- os 模塊綜合應用
- glob 模塊綜合應用
- PyPDF2 模塊操作
二、基本操作
PyPDF2 導入模塊的代碼常常是:
from PyPDF2 import PdfFileReader, PdfFileWriter
這里導入了兩個類:
- PdfFileReader 可以理解為讀取器
- PdfFileWriter 可以理解為寫入器
from PyPDF2 import PdfFileReader, PdfFileWriter

# Folder that holds INV1.pdf .. INV5.pdf.
path = r'C:\Users\xxxxxx'

# A single writer collects the pages of all five source files.
pdf_writer = PdfFileWriter()
for i in range(1, 6):
    source_pdf = path + '/INV{}.pdf'.format(i)
    pdf_reader = PdfFileReader(source_pdf)
    page_total = pdf_reader.getNumPages()
    for page in range(page_total):
        current_page = pdf_reader.getPage(page)
        pdf_writer.addPage(current_page)

# Write every collected page into one merged document.
merged_pdf = path + r'\合並PDF\merge.pdf'
with open(merged_pdf, 'wb') as out:
    pdf_writer.write(out)
from PyPDF2 import PdfFileReader, PdfFileWriter

path = r'C:\Users\xxx'
# Raw strings are used for the file-name parts: '\I' is not a valid escape
# sequence and triggers a warning in a normal string literal.  The resulting
# paths are unchanged.
pdf_reader = PdfFileReader(path + r'\INV1.pdf')
for page in range(pdf_reader.getNumPages()):
    # A fresh writer per page, so each output file holds exactly one page.
    pdf_writer = PdfFileWriter()
    pdf_writer.addPage(pdf_reader.getPage(page))
    # Write the one-page document immediately; files are numbered from 1.
    with open(path + r'\INV1-{}.pdf'.format(page + 1), 'wb') as out:
        pdf_writer.write(out)
以上轉自:https://mp.weixin.qq.com/s/YOunnZoOMvH-Ga13vq3xvg
import os
from contextlib import ExitStack

from PyPDF2 import PdfFileReader, PdfFileWriter


def GetFileName(dir_path):
    """Return the path of every file under ``dir_path``, sub-folders included.

    Args:
        dir_path: Root folder handed to ``os.walk``.

    Returns:
        A list of paths, each built by joining the walked folder with a
        file name found in it.
    """
    file_list = [os.path.join(dirpath, filesname)
                 for dirpath, dirs, files in os.walk(dir_path)
                 for filesname in files]
    return file_list


def MergePDF(dir_path, file_name):
    """Merge all PDF files under ``dir_path`` into ``dir_path/file_name``.

    Files that do not end in ``.pdf`` are ignored, and so is an existing
    output file left over from a previous run, so that it is not merged
    into itself.

    Args:
        dir_path: Folder that is searched (recursively) for PDF files and
            that receives the merged document.
        file_name: File name of the merged document.
    """
    output = PdfFileWriter()
    outputPages = 0
    target = os.path.join(dir_path, file_name)
    target_key = os.path.normcase(os.path.abspath(target))
    pdf_files = [
        candidate for candidate in GetFileName(dir_path)
        if candidate.lower().endswith('.pdf')
        and os.path.normcase(os.path.abspath(candidate)) != target_key
    ]

    # The source streams stay open until the output has been written (the
    # writer may still need to read page data from them at that point);
    # ExitStack then closes all of them, also when an error occurs.
    with ExitStack() as stack:
        for pdf_file in pdf_files:
            print("文件:%s" % os.path.basename(pdf_file), end=' ')
            # Open the source PDF.
            source = stack.enter_context(open(pdf_file, "rb"))
            reader = PdfFileReader(source)
            # Number of pages in this source file.
            pageCount = reader.getNumPages()
            outputPages += pageCount
            print("頁數:%d" % pageCount)
            # Append every page to the output document.
            for iPage in range(pageCount):
                output.addPage(reader.getPage(iPage))
        print("\n合並后的總頁數:%d" % outputPages)
        # Write the merged document.
        print("PDF文件正在合並,請稍等......")
        with open(target, "wb") as outputfile:
            # Note the direction: the writer object writes into the file
            # object, the reverse of the usual ``file.write(data)`` call.
            output.write(outputfile)
        print("PDF文件合並完成")


if __name__ == '__main__':
    # Folder that contains the PDF files to merge.
    dir_path = r'C:\Scientific Research\Knowladge\Ophthalmology\Chinese Ophthalmology'
    # Name of the merged output file.
    file_name = "中華眼科學(第3版)合並版.pdf"
    MergePDF(dir_path, file_name)
以上轉自:https://mp.weixin.qq.com/s/ZlgpWMKpex9Iu2o64r077A
Python辦公自動化|從Word到Excel
# python-docx provides the Document class used to read the Word file.
from docx import Document

# Location of the Word document.
path = r'C:\Users\word.docx'

# Open the document and collect all tables it contains.
document = Document(path)
tables = document.tables

# Work with the first table.
table0 = tables[0]
# Module-level counter used as the running sequence number.
n = 0
# The loop advances three rows per entry and reads rows i and i + 1, so it
# stops before i + 1 would run past the last row.
for i in range(0, len(table0.rows) - 1, 3):
    # Date
    date = table0.cell(i, 1).text
    # Title
    title = table0.cell(i + 1, 1).text.strip()
    # Document number (read from table0, like the other fields)
    dfn = table0.cell(i, 3).text.strip()
    n += 1
    print(n, date, title, dfn)
import datetime

# Module-level counter used as the running sequence number.
n = 0
# The loop advances three rows per entry and reads rows i and i + 1, so it
# stops before i + 1 would run past the last row.
for i in range(0, len(table0.rows) - 1, 3):
    # Date; some entries have no date, which is reported as '-'.
    date = table0.cell(i, 1).text.strip()
    if '/' in date:
        # Parse together with the year: with '%d/%m' alone the year
        # defaults to 1900, which rejects 29/02 although 2020 is a leap year.
        date = datetime.datetime.strptime(date + '/2020', '%d/%m/%Y').strftime('%Y-%m-%d')
    else:
        date = '-'
    # Title
    title = table0.cell(i + 1, 1).text.strip()
    # Document number (read from table0, like the other fields)
    dfn = table0.cell(i, 3).text.strip()
    n += 1
    print(n, date, title, dfn)
# Running sequence number across all tables.
n = 0
for table in tables:
    # The loop advances three rows per entry and reads rows i and i + 1, so
    # it stops before i + 1 would run past the last row instead of relying
    # on the except clause to absorb the overrun.
    for i in range(0, len(table.rows) - 1, 3):
        try:
            # Date; entries without a date are reported as '-'.
            date = table.cell(i, 1).text.strip()
            if '/' in date:
                # Parse together with the year: with '%d/%m' alone the year
                # defaults to 1900, which rejects 29/02.
                date = datetime.datetime.strptime(date + '/2020', '%d/%m/%Y').strftime('%Y-%m-%d')
            else:
                date = '-'
            # Title
            title = table.cell(i + 1, 1).text.strip()
            # Document number
            dfn = table.cell(i, 3).text.strip()
            n += 1
            print(n, date, title, dfn)
        except Exception as error:
            # Best effort: report the malformed entry and move on (this
            # could also be written to a log for later inspection).
            print(error)
            continue
from openpyxl import Workbook

# Create a new workbook and take its active worksheet.
wb = Workbook()
sheet = wb.active

# The first row of the sheet is the header.
header = ['序號', '收文時間', '辦文編號', '文件標題', '文號', '備注']
sheet.append(header)

# One data row per extracted entry; the third and last columns are
# filled with a single space.
row = [n, date, ' ', title, dfn, ' ']
sheet.append(row)
流程的最后記得保存
# Save the workbook to the target .xlsx file.
wb.save(r'C:\Users\20200420.xlsx')
import datetime

from docx import Document
from openpyxl import Workbook

# Output workbook with a header row.
wb = Workbook()
sheet = wb.active
header = ['序號', '收文時間', '辦文編號', '文件標題', '文號', '備注']
sheet.append(header)

# Source Word document and all tables it contains.
path = r'C:\Users\word.docx'
document = Document(path)
tables = document.tables

# Running sequence number across all tables.
n = 0
for table in tables:
    # The loop advances three rows per entry and reads rows i and i + 1, so
    # it stops before i + 1 would run past the last row instead of relying
    # on the except clause to absorb the overrun.
    for i in range(0, len(table.rows) - 1, 3):
        try:
            # Date; entries without a date are reported as '-'.
            date = table.cell(i, 1).text.strip()
            if '/' in date:
                # Parse together with the year: with '%d/%m' alone the year
                # defaults to 1900, which rejects 29/02.
                date = datetime.datetime.strptime(date + '/2020', '%d/%m/%Y').strftime('%Y-%m-%d')
            else:
                date = '-'
            # Title
            title = table.cell(i + 1, 1).text.strip()
            # Document number
            dfn = table.cell(i, 3).text.strip()
            n += 1
            print(n, date, title, dfn)
            row = [n, date, ' ', title, dfn, ' ']
            sheet.append(row)
        except Exception as error:
            # Best effort: report the malformed entry and move on (this
            # could also be written to a log for later inspection).
            print(error)
            continue

wb.save(r'C:\Users\20200420.xlsx')
以上轉自:https://mp.weixin.qq.com/s/Gry1gjz-ZmKyQOFoEnOm3g
import glob
import os

from openpyxl import load_workbook, Workbook

path = 'C:/Users/xxxxxx'
output_name = '符合篩選條件的新表.xlsx'
new_workbook = Workbook()
new_sheet = new_workbook.active

# Tracks whether the header row has already been copied into the new
# sheet; it only has to be written once.
flag = 0
for file in glob.glob(path + '/*.xlsx'):
    # The result is saved into the same folder; skip it so a re-run does
    # not read its own previous output and duplicate rows.
    if os.path.basename(file) == output_name:
        continue
    # load_workbook opens an existing Excel file; Workbook creates a new one.
    workbook = load_workbook(file)
    sheet = workbook.active
    buy_mount = sheet['F']
    row_lst = []
    for cell in buy_mount:
        # Cells in this column are not guaranteed to hold numbers, hence
        # the isinstance check before comparing.
        if isinstance(cell.value, int) and cell.value > 50:
            print(cell.row)
            row_lst.append(cell.row)
    if not flag:
        # Copy the header (first row) of the source sheet.
        header = sheet[1]
        header_lst = [cell.value for cell in header]
        new_sheet.append(header_lst)
        flag = 1
    # Copy every matching row, cell value by cell value, into the new sheet.
    for row in row_lst:
        data_lst = [cell.value for cell in sheet[row]]
        new_sheet.append(data_lst)

new_workbook.save(path + '/' + output_name)
注意這一列有可能有的單元格cell的值value不是數值類型,因此需要用isinstance()進行判斷,當然也可以將單元格的值先用int()轉為整型再判斷。
以上轉自:https://mp.weixin.qq.com/s/RD3h6vJe7_aSPwCpU1p9Ig