轉換繁體EPUB文件為簡體

本文轉載自查看原文 2021-07-03 12:32 335

轉換繁體EPUB文件為簡體

最近看一本電子書，打開文件後發現內容全都是繁體中文，雖然大部分文字都認識，但看起來還是很費勁，所以想著把文件轉換成簡體中文。

epub文件格式

這種文件本身就是一個 zip 文件，只是後綴為 epub，通常文件包含了三部分：

mimetype
META-INF文件夾
OEBPS文件夾，裡面包含了 images、xhtml（電子書正文的內容）、css 文件等

現在需要把 xhtml 文件中的繁體中文轉換成簡體，這裡選擇了 opencc 庫，可以通過 Python 調用來轉換文本。opencc 項目地址：https://github.com/BYVoid/OpenCC

按照說明安裝依賴，先將文件解壓，找到需要修改的xhtml文件，轉換成簡體中文，重新壓縮，具體代碼如下：

# 將電子書中的繁體中文轉換為簡體中文
import opencc
import os
import zipfile
import shutil


def test_convert():
    """Print one sample conversion for each OpenCC direction.

    ``s2t.json`` converts Simplified Chinese to Traditional Chinese and
    ``t2s.json`` converts Traditional Chinese to Simplified Chinese. A fresh
    converter is built for each configuration and its output is printed.
    """
    samples = (
        ('s2t.json', '漢字acb'),  # Simplified -> Traditional
        ('t2s.json', '漢字123'),  # Traditional -> Simplified
    )
    for config_name, text in samples:
        print(opencc.OpenCC(config_name).convert(text))


def unzip_dir(zipfilename, unzipdirname):
    """Extract every entry of a zip archive into a directory.

    Args:
        zipfilename: Path of the zip archive to read.
        unzipdirname: Directory to extract into. It is created (including
            missing parents) when absent. If a regular file already exists
            at this path the user is asked on stdin whether to delete it.

    Returns:
        None. The function also returns early, without extracting, when the
        archive does not exist or the user answers "N" to the delete prompt.

    Raises:
        ValueError: If an archive entry would be written outside
            ``unzipdirname`` (absolute path or ``..`` components).
    """
    fullzipfilename = os.path.abspath(zipfilename)
    fullunzipdirname = os.path.abspath(unzipdirname)
    print("Start to unzip file %s to folder %s ..." % (zipfilename, unzipdirname))

    # Check input ...
    if not os.path.exists(fullzipfilename):
        print("Dir/File %s is not exist, Press any key to quit..." % fullzipfilename)
        input()
        return

    if os.path.isfile(fullunzipdirname):
        print("File %s is exist, are you sure to delet it first ? [Y/N]" % fullunzipdirname)
        while True:
            answer = input()
            if answer in ("N", "n"):
                return
            if answer in ("Y", "y"):
                os.remove(fullunzipdirname)
                print("Continue to unzip files ...")
                break
    os.makedirs(fullunzipdirname, exist_ok=True)

    # Prefix every extracted path must start with to stay inside the target.
    if fullunzipdirname.endswith(os.sep):
        root_prefix = fullunzipdirname
    else:
        root_prefix = fullunzipdirname + os.sep

    # Start extract files ...
    with zipfile.ZipFile(fullzipfilename, "r") as src_zip:
        for eachfile in src_zip.namelist():
            target = os.path.normpath(os.path.join(fullunzipdirname, eachfile))
            # Refuse entries that would escape the target directory.
            if target != fullunzipdirname and not target.startswith(root_prefix):
                raise ValueError("Unsafe path in zip archive: %r" % eachfile)

            if eachfile.endswith('/'):
                # Directory entry: it may already exist if one of its files
                # was extracted first, so tolerate that.
                print('Unzip directory %s ...' % target)
                os.makedirs(target, exist_ok=True)
                continue

            print("Unzip file %s ..." % eachfile)
            os.makedirs(os.path.dirname(target), exist_ok=True)
            with open(target, "wb") as fd:
                fd.write(src_zip.read(eachfile))
    print("Unzip file succeed!")


def zip_dir(dirname, zipfilename):
    """Create a zip archive from a directory tree or a single file.

    Args:
        dirname: Directory whose contents are archived (paths inside the
            archive are relative to it), or a single file, which is stored
            under its base name.
        zipfilename: Path of the archive to create; an existing file is
            overwritten.

    Entries are deflate-compressed, except a top-level file named
    ``mimetype``: it is written first and uncompressed, which is what the
    EPUB container format requires of that file.
    """
    if os.path.isfile(dirname):
        entries = [(dirname, os.path.basename(dirname))]
    else:
        entries = []
        for root, subdirs, files in os.walk(dirname):
            # Directories are recorded too so empty folders survive.
            for name in subdirs + files:
                path = os.path.join(root, name)
                entries.append((path, os.path.relpath(path, dirname)))
        # Stable sort: moves "mimetype" to the front, keeps the rest in order.
        entries.sort(key=lambda entry: entry[1] != "mimetype")

    with zipfile.ZipFile(zipfilename, "w", zipfile.ZIP_DEFLATED) as zf:
        for path, arcname in entries:
            if arcname == "mimetype":
                zf.write(path, arcname, compress_type=zipfile.ZIP_STORED)
            else:
                zf.write(path, arcname)


def convert_file_to_chinese(file_path):
    """Rewrite a UTF-8 text file in place, Traditional -> Simplified Chinese.

    Each line of the file is passed through an OpenCC ``t2s.json`` converter;
    the converted lines are then written back to the same path.
    """
    t2s = opencc.OpenCC('t2s.json')

    with open(file_path, mode='r', encoding='utf-8') as source:
        simplified_lines = [t2s.convert(raw_line) for raw_line in source]

    with open(file_path, mode='w', encoding='utf-8') as target:
        target.writelines(simplified_lines)


def convert_epub_simplified(file_path):
    """Convert a Traditional Chinese EPUB into a Simplified Chinese copy.

    The archive is unpacked into a temporary directory next to the source
    file, every content file reported by ``find_content_files`` is converted
    in place, and the tree is re-packed as ``<name>-簡體.epub`` in the same
    directory as the source. The temporary directory is always removed.

    Args:
        file_path: Path of the source ``.epub`` file.

    Raises:
        FileNotFoundError: If ``file_path`` is not an existing regular file.
    """
    import tempfile

    if not os.path.isfile(file_path):
        raise FileNotFoundError("請檢查文件路徑：{}".format(file_path))

    dir_name, file_name = os.path.split(file_path)
    new_file_name = os.path.splitext(file_name)[0] + "-簡體.epub"
    new_epub_file_path = os.path.join(dir_name, new_file_name)

    # A unique scratch directory avoids merging into, and later deleting,
    # a folder the user may already have next to the book.
    unzip_dir_path = tempfile.mkdtemp(prefix="unzip_", dir=dir_name or os.curdir)
    try:
        unzip_dir(file_path, unzip_dir_path)
        for content_file in find_content_files(unzip_dir_path):
            convert_file_to_chinese(content_file)
        zip_dir(unzip_dir_path, new_epub_file_path)
    finally:
        shutil.rmtree(unzip_dir_path, ignore_errors=True)


def find_content_files(folder_path):
    """Collect the paths of all files to convert under a folder.

    Walks ``folder_path`` recursively and returns a list with the joined path
    of every file whose name ends with ``'xhtml'`` (the EPUB body content).
    """
    return [
        os.path.join(parent, filename)
        for parent, _subdirs, filenames in os.walk(folder_path)
        for filename in filenames
        if filename.endswith('xhtml')
    ]


if __name__ == '__main__':
    # Test Traditional/Simplified conversion
    # test_convert()
    # Hard-coded Windows path of the EPUB to convert; edit before running.
    epub_file_path = "D:\\WorkSpace\\epub_file\\40%的工作沒意義，為什麼還搶著做？.epub"
    convert_epub_simplified(epub_file_path)

參考內容：

https://www.jianshu.com/p/d2edab6750df

https://blog.csdn.net/weixin_42108054/article/details/112963816

https://blog.csdn.net/weixin_39640444/article/details/110381545

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 簡體繁體轉換 js簡體繁體轉換簡體轉換繁體漢字簡體繁體轉換----Javascript c#簡體繁體轉換簡體、繁體相互轉換 oracle 繁體轉換為簡體 php 網站中文簡體繁體轉換類 Python 漢字簡體和繁體的相互轉換 java代碼實現簡體繁體轉換