Python字符處理_只保留文本裡的漢字

本文轉載自查看原文 2014-12-09 23:53 3318 python

 1 # -*- coding: utf-8 -*-
 2 
 3 def is_chinese(uchar):
 4     """判斷一個unicode是否是漢字"""
 5     if uchar >= u'\u4e00' and uchar <= u'\u9fa5':
 6         return True
 7     else:
 8         return False
 9 
def is_number(uchar):
    """Return True if ``uchar`` is a single ASCII digit ('0'..'9').

    Args:
        uchar: A unicode string expected to hold exactly one character.

    Returns:
        bool: True only for a one-character string in U+0030..U+0039;
        False for the empty string, for longer strings and for any other
        character.
    """
    # String comparison is lexicographic, so without the length guard a
    # string such as u'12abc' would be accepted while u'9x' is rejected.
    return len(uchar) == 1 and u'\u0030' <= uchar <= u'\u0039'
16 
def is_alphabet(uchar):
    """Return True if ``uchar`` is a single ASCII letter (A-Z or a-z).

    Args:
        uchar: A unicode string expected to hold exactly one character.

    Returns:
        bool: True only for a one-character string in U+0041..U+005A or
        U+0061..U+007A; False for the empty string, for longer strings and
        for any other character.
    """
    # String comparison is lexicographic, so without the length guard a
    # string such as u'a1' would be accepted while u'z1' is rejected.
    if len(uchar) != 1:
        return False
    return u'\u0041' <= uchar <= u'\u005a' or u'\u0061' <= uchar <= u'\u007a'
23 
def is_legal(uchar):
    """Return True if ``uchar`` is a Chinese character, a digit or a letter.

    The character is legal when at least one of ``is_chinese``,
    ``is_number`` or ``is_alphabet`` accepts it; otherwise False is returned.
    """
    accepted_by = (is_chinese, is_number, is_alphabet)
    return any(check(uchar) for check in accepted_by)

check_legal.py

 1 # -*- coding: utf-8 -*-
 2 """
 3 輸入文件格式需要是gb2312
 4 """
 5 import check_legal
 6 fileRead = file('input.txt', 'r')
 7 fileWrite = file('result.txt', 'w')
 8 while True:
 9     line = fileRead.readline()
10     # check end file
11     if len(line) == 0:
12         break
13     # ignore作用: 忽略非法字符
14     strBuffer = line.decode('gb2312', 'ignore')
15     str = ""
16     for oneWord in strBuffer:
17         if check_legal.is_chinese(oneWord):
18             str += oneWord
19     fileWrite.write(str.encode('gb2312'))
20     fileWrite.write('\n')
21 fileRead.close()
22 fileWrite.close()

reserveChineseCharacter

至於文件格式問題，可以用一些文本編輯器指定保存格式，例如EmEditor.

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 php正則表達式剔除字符串中的除了漢字的字符（只保留漢字） php正則表達式剔除字符串中 ,除了漢字的字符（只保留漢字） php 正則只保留漢字，剔除所有符號使用python的re模塊匹配文本里固定字符串並取出字符串后的數字 bat批處理查找替換：批處理如何查找並替換文本里特定字符串中的部分內容 Python 基礎 - 統計文本里單詞的個數以及出現的次數 python 字符串過濾filter只保留數字和字母/大小寫轉換 PHP文本處理之中文漢字字符串轉換為數組處理后台傳過來的json數據-顯示到微信小程序的富文本里 shell把字符串中的字母去掉,只保留數字怎么把Word文檔里的漢字英文字符都去掉只留數字