Python中re模塊的使用


Python的re模塊


#預備知識點
#正則表達式regex
#特殊符號和字符 ---> 元字符



正則表達式基礎知識
通配符 含義 正則示例 匹配結果
reg1 | reg2 匹配正則表達式reg1或reg2 foo | bar foo
. 匹配任何字符(\n除外) a.a abc
^ 匹配字符串起始部分 ^a ab....
$ 匹配字符串終止部分 .txt$ a.txt
* 匹配0次或者多次前面出現的正則表達式 a* aaaaa
+ 匹配1次或者多次前面出現的正則表達式 [a-z]+ aasx
? 匹配0次或者1次前面出現的正則表達式 first? first
{N} 匹配N次前面出現的正則表達式 [0-9]{3} 123
{M,N} 匹配M~N次前面出現的正則表達式 [0-9]{5,9} 12345
[...] 匹配來自字符集的任意單個字符 [abc] b
[...x-y...] 匹配x~y范圍中的任意單個字符 [0-9] 9
[^...] 不匹配此字符集中任意單個字符 [^0-9] a
(*|+|?|{})? 匹配上面頻繁出現符號的非貪婪版 (*|+|?|{})? ({})
(...) 匹配封閉的正則表達式,然后另存為子組 ([0-1][0-9])? 12
\d 匹配任何十進制數字 \d.txt 1.txt
\w 匹配任何字母數字字符(含下划線) \w{2}\.txt a1.txt
\s 匹配任何空格字符 a\sb a b
\b 匹配任何單詞邊界(零寬度,不消耗字符) \bThe\b The
\N 匹配已保存的子組 ([0-9])\1 11
\. 匹配"."這個字符 a\.txt a.txt
常用正則表達式
正則表達式 描述 匹配結果
\d+(\.\d*)? 任意整數和浮點數 0.004 2 75.
\b[^\Wa-z0-9_][^\WA-Z0-9_]*\b 首字母只能大寫 Boo Foo
^http:\/\/([\w-]+(\.[\w-]+)+(\/[\w-.\/\?%&=\u4e00-\u9fa5]*)?)?$ 驗證網址 http://www.baidu.com/?id=1
^[\u4e00-\u9fa5]{0,}$ 驗證漢字 漢字漢字
\w+([-+.']\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)* 驗證電子郵件 example@163.com
^[1-9]([0-9]{16}|[0-9]{13})[xX0-9]$ 驗證身份證 14525419951215445X
^13[0-9]{1}[0-9]{8}|^15[9]{1}[0-9]{8} 驗證手機號 138459572***
^(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])$ 驗證IP 192.168.1.1
^[a-zA-Z0-9]+([a-zA-Z0-9\-\.]+)?\.(com|org|net|cn)$ 驗證域名 baidu.com
^([a-zA-Z]\:|\\)\\([^\\]+\\)*[^\/:*?"<>|]+\.txt(l)?$ 驗證文件路徑 C:\user\wo.txt
<(.*)>(.*)<\/(.*)>|<(.*)\/> HTML標簽匹配 xxxx

 


#re模塊
#常用的方法
compile(pattern, flags = 0)        使用任何可選的標記來編譯正則表達式的模式,然后返回一個正則表達式對象
match(pattern, string, flags = 0)     使用帶有可選標記的正則表達式的模式來匹配字符串。如果匹配成功,返回匹配對象,否則返回None
search(pattern, string ,flags = 0)     使用可選標記搜索字符串中第一次出現的正則表達式模式。如果匹配成功,則返回匹配對象,否則返回None
findall(pattern, string[,flags] )          查找字符串中所有(非重疊)出現的正則表達式模式,並返回一個匹配列表
finditer(pattern, string[,flags] )        與findall()相同,但返回的是一個迭代器。對於每一次匹配,迭代器都能返回一個匹配對象
split(pattern, string, maxsplit = 0)         根據正則表達式的模式分隔符,split函數將字符串分割為列表,返回匹配列表,分割最多操作maxsplit次(0表示不限次數)
group(num = 0)               返回整個匹配對象,或者編號為num的特定子組

 

import re
# Walk-through of the most common ``re`` functions. Each section rebinds
# ``regex`` / ``m`` and prints the result; the expected output is shown in
# the comment that follows the section.

# --- search(): find the first occurrence anywhere in the string ---
m = re.search(r'foo', 'asdasdfooasd')
# match() would find nothing here: it only tries at the start of the string,
# and the string starts with 'a'.
if m is not None:
    print(m.group())

# The pattern has to be a string literal; a raw string keeps the
# backslashes intact for the regex engine.
regex = r'<(.*)>(.*)<\/(.*)>|<(.*)\/>'
m = re.search(regex, "aa<a>aaaa</a>")
# Again only search() can find the tag, because the text starts with "aa".
if m is not None:
    print(m.group())

# --- match() with groups ---
regex = r'(foo\w)(\w)'
m = re.match(regex, 'fooasdfooasd')
if m is not None:
    print(m.group(1))
    print(m.groups())
# Output:
# fooa
# ('fooa', 's')

# --- findall(): list of all non-overlapping matches ---
regex = r'apple'
m = re.findall(regex, 'apple1 apple2 apple3')
print(m)
# Output:
# ['apple', 'apple', 'apple']

# --- finditer(): same matches, delivered lazily as match objects ---
regex = r'apple'
m = [g.group() for g in re.finditer(regex, 'apple1 apple2 apple3')]
print(m)
# Output:
# ['apple', 'apple', 'apple']

# --- split(): split on ", " or on a space that precedes " xxx" ---
# Named ``samples`` rather than ``list`` so the builtin is not shadowed.
samples = [
    'aaa, bbb ccc',
    'ddd, eee fff',
]
for sample in samples:
    print(re.split(r', |(?= (?:[a-z]{3})) ', sample))
# Output:
# ['aaa', 'bbb', 'ccc']
# ['ddd', 'eee', 'fff']

 

re模塊小實例:

__author__ = 'cq'

import  re
from random import randrange,choice,randint
from string import ascii_lowercase as lc
from time import ctime


#生成數據文件
def generate_data():
    """Create ./data.txt with 20-30 random records and print each record.

    Every record is one line of the form
    ``<ctime text>::<login>@<domain>.<tld>::<timestamp>-<login len>-<domain len>``.
    The login is 4-7 lowercase letters; the domain is at least as long as
    the login and at most 12 letters. Each record already ends with a
    newline, so the echoed console output is double-spaced.
    """
    domain_suffixes = ('com', 'edu', 'net', 'org', 'gov')

    def random_word(length):
        # One choice() call per character, drawn from lowercase ASCII letters.
        return ''.join([choice(lc) for _ in range(length)])

    with open('./data.txt', 'w') as out_file:
        record_total = randint(20, 30)
        for _ in range(record_total):
            timestamp = randint(100000000, 1200000000)  # seconds since epoch
            readable_time = ctime(timestamp)            # human-readable form
            name_len = randrange(4, 8)
            user = random_word(name_len)
            domain_len = randrange(name_len, 13)
            domain = random_word(domain_len)
            suffix = choice(domain_suffixes)

            record = "%s::%s@%s.%s::%d-%d-%d\n" % (
                readable_time, user, domain, suffix,
                timestamp, name_len, domain_len,
            )
            out_file.write(record)
            print(record)



#匹配指定日期的行
def match_date():
    """Print a (weekday, remainder-of-line) tuple for each weekday hit.

    Reads ./data.txt in full and reports every place where one of the
    three-letter weekday abbreviations occurs, together with the text that
    follows it up to the end of that line.
    """
    weekday_pattern = '(Mon|Tue|Wed|Thu|Fri|Sat|Sun)(.*)'
    with open('./data.txt', 'r') as source:
        contents = source.read()
    for hit in re.findall(weekday_pattern, contents):
        print(hit)




#匹配在某時間段內的記錄
def match_time_slot():
    """Print records dated on day 20-31 of a month in the years 2000-2020.

    Reads ./data.txt in full. Each hit is a tuple of
    (day-of-month, year, text after the first "::" that follows the year).
    Only hits whose day and year fall inside the ranges above are printed.
    """
    slot_pattern = ' ([0-9]{1,2}) .*([0-9]{4})::(.*)'
    with open('./data.txt', 'r') as source:
        hits = re.findall(slot_pattern, source.read())
    for hit in hits:
        day = int(hit[0])
        year = int(hit[1])
        if not (2000 <= year <= 2020):
            continue
        if not (20 <= day <= 31):
            continue
        print(hit)


#匹配某名單中人員的記錄
def match_name():
    """Print a (login, domain, tld) tuple for every address in ./data.txt.

    An address is recognised as ``::<login>@<domain>.<tld>`` where login and
    domain are 2-13 lowercase letters and tld is one of com/edu/net/org/gov.
    Every address found is printed; no filtering against a list of names is
    applied.
    """
    # Raw string: in a plain literal "\." is an invalid escape sequence,
    # which newer Python versions report as a SyntaxWarning.
    regex = r'::([a-z]{2,13})@([a-z]{2,13})\.(com|edu|net|org|gov)'
    with open('./data.txt', 'r') as f:
        for address in re.findall(regex, f.read()):
            print(address)



def main():
    """Generate the data file, then run each matcher under its own banner."""
    generate_data()

    # (banner, matcher) pairs, executed in the listed order.
    sections = (
        ("\n---------------match_date--------------------\n", match_date),
        ("\n---------------match_time_slot--------------------\n", match_time_slot),
        ("\n---------------match_name--------------------\n", match_name),
    )
    for banner, run_matcher in sections:
        print(banner)
        run_matcher()


# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

 

輸出結果
Sun Mar  5 00:55:55 1989::qvnc@ygeowwaf.com::605033755-4-8

Mon Oct 17 17:16:31 2005::yene@rtewqvvyfe.edu::1129540591-4-10

Tue Oct  7 06:33:30 2003::wlyi@coagmnososzy.edu::1065479610-4-12

Mon Oct 16 00:01:06 2006::zsgok@jkpiplcm.edu::1160928066-5-8

Wed Mar 15 06:37:35 2000::paok@anpekysphicu.com::953073455-4-12

Wed Mar 26 12:27:25 1980::bodqoe@iydohek.org::322892845-6-7

Mon Jun  5 13:54:28 1989::fgiy@oppcjnafx.gov::613029268-4-9

Sun Jul 25 05:27:23 2004::agmljfx@qvxgjqtkiwnl.org::1090704443-7-12

Mon Nov 14 16:15:36 2005::tctz@bcikib.gov::1131956136-4-6

Sun Jan 14 23:20:42 2007::qqlfkf@isslbh.com::1168788042-6-6

Sun Jul 27 02:00:13 1980::cpiqwau@drbpfsfglip.edu::333482413-7-11

Sun Feb 20 16:10:34 2005::aguqfd@hnrcaged.com::1108887034-6-8

Wed Jun 27 06:13:05 1979::kowyk@ruoackjavkpq.net::299283185-5-12

Wed Oct 12 19:52:54 1994::kqaol@mzewoas.edu::781962774-5-7

Thu Aug 23 01:46:59 1973::uofpdq@zdeidbobin.org::114889619-6-10

Sat Dec 21 11:36:20 1991::hodw@wfbw.org::693286580-4-4

Tue Jun 22 14:42:19 1993::azgagm@nfmguh.org::740731339-6-6

Sun Feb 23 04:50:57 2003::cysfu@fnzdo.com::1045947057-5-5

Fri Jun 10 13:38:02 1983::qdhqw@fcdsvlmnhx.net::424071482-5-10

Sat Jan 24 21:56:37 1998::dfyicjw@fklbymd.org::885650197-7-7

Sun Jun  3 07:48:45 2007::wptuyjk@tsngnle.edu::1180828125-7-7

Mon Nov 19 00:34:41 2001::ocjlb@nusyk.net::1006101281-5-5

Sat Dec  1 21:01:23 1973::bvhx@lmir.net::123598883-4-4

Sun Dec 16 17:42:51 1979::rpgs@hppau.org::314185371-4-5

Mon Jul 21 23:46:13 1986::fnsro@nmbcwdmie.org::522344773-5-9


---------------match_date--------------------

('Sun', ' Mar  5 00:55:55 1989::qvnc@ygeowwaf.com::605033755-4-8')
('Mon', ' Oct 17 17:16:31 2005::yene@rtewqvvyfe.edu::1129540591-4-10')
('Tue', ' Oct  7 06:33:30 2003::wlyi@coagmnososzy.edu::1065479610-4-12')
('Mon', ' Oct 16 00:01:06 2006::zsgok@jkpiplcm.edu::1160928066-5-8')
('Wed', ' Mar 15 06:37:35 2000::paok@anpekysphicu.com::953073455-4-12')
('Wed', ' Mar 26 12:27:25 1980::bodqoe@iydohek.org::322892845-6-7')
('Mon', ' Jun  5 13:54:28 1989::fgiy@oppcjnafx.gov::613029268-4-9')
('Sun', ' Jul 25 05:27:23 2004::agmljfx@qvxgjqtkiwnl.org::1090704443-7-12')
('Mon', ' Nov 14 16:15:36 2005::tctz@bcikib.gov::1131956136-4-6')
('Sun', ' Jan 14 23:20:42 2007::qqlfkf@isslbh.com::1168788042-6-6')
('Sun', ' Jul 27 02:00:13 1980::cpiqwau@drbpfsfglip.edu::333482413-7-11')
('Sun', ' Feb 20 16:10:34 2005::aguqfd@hnrcaged.com::1108887034-6-8')
('Wed', ' Jun 27 06:13:05 1979::kowyk@ruoackjavkpq.net::299283185-5-12')
('Wed', ' Oct 12 19:52:54 1994::kqaol@mzewoas.edu::781962774-5-7')
('Thu', ' Aug 23 01:46:59 1973::uofpdq@zdeidbobin.org::114889619-6-10')
('Sat', ' Dec 21 11:36:20 1991::hodw@wfbw.org::693286580-4-4')
('Tue', ' Jun 22 14:42:19 1993::azgagm@nfmguh.org::740731339-6-6')
('Sun', ' Feb 23 04:50:57 2003::cysfu@fnzdo.com::1045947057-5-5')
('Fri', ' Jun 10 13:38:02 1983::qdhqw@fcdsvlmnhx.net::424071482-5-10')
('Sat', ' Jan 24 21:56:37 1998::dfyicjw@fklbymd.org::885650197-7-7')
('Sun', ' Jun  3 07:48:45 2007::wptuyjk@tsngnle.edu::1180828125-7-7')
('Mon', ' Nov 19 00:34:41 2001::ocjlb@nusyk.net::1006101281-5-5')
('Sat', ' Dec  1 21:01:23 1973::bvhx@lmir.net::123598883-4-4')
('Sun', ' Dec 16 17:42:51 1979::rpgs@hppau.org::314185371-4-5')
('Mon', ' Jul 21 23:46:13 1986::fnsro@nmbcwdmie.org::522344773-5-9')

---------------match_time_slot--------------------

('25', '2004', 'agmljfx@qvxgjqtkiwnl.org::1090704443-7-12')
('20', '2005', 'aguqfd@hnrcaged.com::1108887034-6-8')
('23', '2003', 'cysfu@fnzdo.com::1045947057-5-5')

---------------match_name--------------------

('qvnc', 'ygeowwaf', 'com')
('yene', 'rtewqvvyfe', 'edu')
('wlyi', 'coagmnososzy', 'edu')
('zsgok', 'jkpiplcm', 'edu')
('paok', 'anpekysphicu', 'com')
('bodqoe', 'iydohek', 'org')
('fgiy', 'oppcjnafx', 'gov')
('agmljfx', 'qvxgjqtkiwnl', 'org')
('tctz', 'bcikib', 'gov')
('qqlfkf', 'isslbh', 'com')
('cpiqwau', 'drbpfsfglip', 'edu')
('aguqfd', 'hnrcaged', 'com')
('kowyk', 'ruoackjavkpq', 'net')
('kqaol', 'mzewoas', 'edu')
('uofpdq', 'zdeidbobin', 'org')
('hodw', 'wfbw', 'org')
('azgagm', 'nfmguh', 'org')
('cysfu', 'fnzdo', 'com')
('qdhqw', 'fcdsvlmnhx', 'net')
('dfyicjw', 'fklbymd', 'org')
('wptuyjk', 'tsngnle', 'edu')
('ocjlb', 'nusyk', 'net')
('bvhx', 'lmir', 'net')
('rpgs', 'hppau', 'org')
('fnsro', 'nmbcwdmie', 'org')

Process finished with exit code 0
View Code

 

 

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM