
# -*- coding: utf-8 -*-
"""Single-character classifiers: Chinese ideograph, ASCII digit, ASCII letter."""


def is_chinese(uchar):
    """Return True if ``uchar`` is a Chinese character.

    Only the range U+4E00..U+9FA5 (inclusive) is accepted; code points
    outside that range, including CJK extension blocks, yield False.

    The check is a plain string comparison, so it is meant for a
    one-character unicode string. An empty string yields False.
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'


def is_number(uchar):
    """Return True if ``uchar`` is an ASCII digit (U+0030..U+0039).

    Full-width digits and other Unicode numerals are not accepted.
    """
    return u'\u0030' <= uchar <= u'\u0039'


def is_alphabet(uchar):
    """Return True if ``uchar`` is an ASCII letter (A-Z or a-z)."""
    return (u'\u0041' <= uchar <= u'\u005a'
            or u'\u0061' <= uchar <= u'\u007a')


def is_legal(uchar):
    """Return True if ``uchar`` is a Chinese character, a digit or a letter.

    Returns False for everything else (punctuation, whitespace, symbols).
    """
    return is_chinese(uchar) or is_number(uchar) or is_alphabet(uchar)

# -*- coding: utf-8 -*-
"""Copy only the Chinese characters of each line of input.txt to result.txt.

The input file must be encoded as GB2312. The output is written as GB2312
too, with one output line per input line (a line with no Chinese
characters becomes an empty line).
"""
import io

import check_legal

# io.open decodes and encodes at the file boundary and works on both
# Python 2 and Python 3 (the file() builtin exists only in Python 2).
# errors='ignore' drops byte sequences that are not valid GB2312.
# The with-statement closes both files even if an error occurs midway.
with io.open('input.txt', 'r', encoding='gb2312', errors='ignore') as reader, \
        io.open('result.txt', 'w', encoding='gb2312') as writer:
    for line in reader:
        # Keep only the characters accepted by check_legal.is_chinese;
        # this also strips the line's own trailing newline.
        kept = u''.join(ch for ch in line if check_legal.is_chinese(ch))
        writer.write(kept + u'\n')
至于文件编码问题,可以用支持指定编码的文本编辑器(例如 EmEditor)把输入文件另存为 GB2312 编码。