For the basics of regular expressions, see the companion article "Regular Expression Basics" (《正則表達式基礎知識》). This article uses regular expressions to match multi-line logs and parse the relevant information out of them.
Suppose we have SQL slow logs like the following:
SELECT * FROM open_app WHERE 1 and `client_id` = 'a08f5e32909cc9418f' and `is_valid` = '1' order by id desc limit 32700,100;
# Time: 160616 10:05:10
# User@Host: shuqin[qqqq] @ [1.1.1.1] Id: 46765069
# Schema: db_xxx Last_errno: 0 Killed: 0
# Query_time: 0.561383 Lock_time: 0.000048 Rows_sent: 100 Rows_examined: 191166 Rows_affected: 0
# Bytes_sent: 14653
SET timestamp=1466042710;
SELECT * FROM open_app WHERE 1 and `client_id` = 'a08f5e32909cc9418f' and `is_valid` = '1' order by id desc limit 36700,100;
# User@Host: shuqin[ssss] @ [2.2.2.2] Id: 46765069
# Schema: db_yyy Last_errno: 0 Killed: 0
# Query_time: 0.501094 Lock_time: 0.000042 Rows_sent: 100 Rows_examined: 192966 Rows_affected: 0
# Bytes_sent: 14966
SET timestamp=1466042727;
The task is to parse the relevant information out of these logs. The key points are as follows:
(1) By default a regex works in single-line mode. To match across multiple lines you need to enable "multiline mode" (MULTILINE), which makes ^ and $ match at every line boundary; and since the dot does not match newline characters by default, you also need to enable DOTALL mode so that it does.
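A minimal sketch (toy strings, unrelated to the log above) of what these two flags change:

```python
import re

text = "one\ntwo"

# By default '.' does not match a newline, so the pattern cannot bridge lines.
print(re.findall(r"one.two", text))             # []
# DOTALL lets '.' match the newline as well.
print(re.findall(r"one.two", text, re.DOTALL))  # ['one\ntwo']

# By default ^ matches only at the very start of the string.
print(re.findall(r"^\w+", text))                # ['one']
# MULTILINE makes ^ and $ also match at every line boundary.
print(re.findall(r"^\w+", text, re.MULTILINE))  # ['one', 'two']
```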
(2) To match each multi-line log entry individually, you must use non-greedy matching, i.e. add ? after .* ; otherwise the first match would swallow everything up to the end of the text.
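A toy illustration (the "A;"/"B;" payloads are placeholders, not real SQL): the greedy .* merges both records into one match, while the non-greedy .*? yields one match per record.

```python
import re

log = "A; SET timestamp=1466042710; B; SET timestamp=1466042727;"

# Greedy: .* backtracks from the end of the string to the LAST terminator,
# so both records collapse into a single match.
print(re.findall(r"(.*)SET timestamp=\d+;", log))
# ['A; SET timestamp=1466042710; B; ']

# Non-greedy: .*? stops at the FIRST terminator, giving one match per record.
print(re.findall(r"(.*?)SET timestamp=\d+;", log))
# ['A; ', ' B; ']
```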
(3) Divide and conquer. Writing one correct regex for a long string is not easy. The strategy adopted here is divide and conquer: split the whole string into several substrings and match each separately. Here every substring is a line; once a line is matched, you can do finer-grained matching within it.
(4) Whitespace turns up everywhere; use \s* or \s+ to make the patterns robust. Fixed literal strings act as markers in the regex, delimiting the substrings and making them easier to match.
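For instance, a pattern written with a literal single space breaks as soon as the log pads a field differently:

```python
import re

# A literal space is brittle: here the field happens to be padded with three spaces.
print(re.match(r"Rows_sent: (\d+)", "Rows_sent:   100"))             # None
# \s* tolerates any run of whitespace, including none at all.
print(re.match(r"Rows_sent:\s*(\d+)", "Rows_sent:   100").group(1))  # 100
print(re.match(r"Rows_sent:\s*(\d+)", "Rows_sent:100").group(1))     # 100
```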
(5) Python has two commonly used regex calls: re.findall and re.match. The former returns a list in which each element is a tuple matching one multi-line log entry; each tuple element holds the string captured by the corresponding group. re.match returns a Match object, from which group(n) retrieves the string captured by each capture group. The program below deliberately uses both: re.findall for the multi-line matches and re.match for the intra-line matches. Beginners often ask what the difference between the two is; trying them out is the quickest way to find out.
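The difference in miniature, on a shortened cost line:

```python
import re

line = "Query_time: 0.561383 Lock_time: 0.000048"

# re.match anchors at the start of the string and returns a Match (or None);
# group(n) retrieves the n-th capture group.
m = re.match(r"Query_time:\s*(\S+)\s+Lock_time:\s*(\S+)", line)
print(m.group(1), m.group(2))  # 0.561383 0.000048

# re.findall scans the whole string and returns a list; with several groups,
# each list element is a tuple of the captured strings.
print(re.findall(r"(\w+_time):\s*(\S+)", line))
# [('Query_time', '0.561383'), ('Lock_time', '0.000048')]
```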
(6) Use a map as the presentation structure. Once the results are parsed, they have to be displayed or turned into a report, and a composite structure combining maps and lists is usually a very good fit. In this example, to show the full details of all SQL logs you can build
{"tablename1": [{sqlobj11}, {sqlobj12}], ..., "tablenameN": [{sqlobjN1}, {sqlobjN2}] }, where each sqlobj has the form:
{"sql": "select xxx", "QueryTime": 0.5600, ...}
For a brief report, such as per-table SQL statistics, you can build:
{"tablename1": {"sql11": 98, "sql12": 16}, ..., "tablenameN": {"sqlN1": 75, "sqlN2": 23} }
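A minimal sketch of building both structures; the sqlobjs list and its field values here are made-up stand-ins for the parser's real output:

```python
# Hypothetical parsed results; the dict fields mirror the sqlobj shape above.
sqlobjs = [
    {"sql": "SELECT * FROM open_app ...", "table": "open_app"},
    {"sql": "SELECT * FROM open_app ...", "table": "open_app"},
    {"sql": "SELECT id FROM open_app ...", "table": "open_app"},
]

# Detail view: table name -> list of sqlobj dicts.
details = {}
for obj in sqlobjs:
    details.setdefault(obj["table"], []).append(obj)

# Brief report: table name -> {sql text: occurrence count}.
report = {}
for obj in sqlobjs:
    stat = report.setdefault(obj["table"], {})
    stat[obj["sql"]] = stat.get(obj["sql"], 0) + 1

print(report)
# {'open_app': {'SELECT * FROM open_app ...': 2, 'SELECT id FROM open_app ...': 1}}
```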
The Python implementation:
import re
import sys

globalRegex = r'^\s*(.*?)# (User@Host:.*?)# (Schema:.*?)# (Query_time:.*?)# Bytes_sent:(.*?)SET timestamp=(\d+);\s*$'
costRegex = r'Query_time:\s*(.*)\s*Lock_time:\s*(.*)\s*Rows_sent:\s*(\d+)\s*Rows_examined:\s*(\d+)\s*Rows_affected:\s*(\d+)\s*'
schemaRegex = r'Schema:\s*(.*)\s*Last_errno:(.*)\s*Killed:\s*(.*)\s*'

def readSlowSqlFile(slowSqlFilename):
    with open(slowSqlFilename) as f:
        return f.read()

def findInText(regex, text):
    return re.findall(regex, text, flags=re.DOTALL | re.MULTILINE)

def parseSql(sqlobj, sqlText):
    # the first capture group may also contain an optional "# Time: ..." line;
    # split it off from the SQL statement if present
    if '#' in sqlText:
        sqlobj['sql'] = sqlText.split('#')[0].strip()
        sqlobj['time'] = sqlText.split('#')[1].strip()
    else:
        sqlobj['sql'] = sqlText.strip()
        sqlobj['time'] = ''

def parseCost(sqlobj, costText):
    matched = re.match(costRegex, costText)
    sqlobj['Cost'] = costText
    if matched:
        sqlobj['QueryTime'] = matched.group(1).strip()
        sqlobj['LockTime'] = matched.group(2).strip()
        sqlobj['RowsSent'] = int(matched.group(3))
        sqlobj['RowsExamined'] = int(matched.group(4))
        sqlobj['RowsAffected'] = int(matched.group(5))

def parseSchema(sqlobj, schemaText):
    matched = re.match(schemaRegex, schemaText)
    sqlobj['Schema'] = schemaText
    if matched:
        sqlobj['Schema'] = matched.group(1).strip()
        sqlobj['LastErrno'] = int(matched.group(2))
        sqlobj['Killed'] = int(matched.group(3))

def parseSQLObj(matched):
    sqlobj = {}
    try:
        if matched:
            parseSql(sqlobj, matched[0].strip())
            sqlobj['UserHost'] = matched[1].strip()
            sqlobj['ByteSent'] = int(matched[4])
            sqlobj['timestamp'] = int(matched[5])
            parseCost(sqlobj, matched[3].strip())
            parseSchema(sqlobj, matched[2].strip())
    except (ValueError, IndexError):
        pass
    return sqlobj

if __name__ == '__main__':
    files = ['slow_sqls.txt']
    alltext = ''.join(readSlowSqlFile(f) for f in files)
    allmatched = findInText(globalRegex, alltext)
    tablenames = ['open_app']
    if not allmatched:
        print('No matched. exit.')
        sys.exit(1)
    sqlobjMap = {}
    for matched in allmatched:
        sqlobj = parseSQLObj(matched)
        if not sqlobj:
            continue
        for tablename in tablenames:
            if tablename in sqlobj['sql']:
                sqlobjMap.setdefault(tablename, []).append(sqlobj)
                break
    resultMap = {}
    for tablename, sqlobjlist in sqlobjMap.items():
        sqlstat = {}
        for sqlobj in sqlobjlist:
            sqlstat[sqlobj['sql']] = sqlstat.get(sqlobj['sql'], 0) + 1
        resultMap[tablename] = sqlstat
    with open('/tmp/res.txt', 'w') as f_res:
        f_res.write('-------------------------------------: \n')
        f_res.write('Brief results: \n')
        for tablename, sqlstat in resultMap.items():
            f_res.write('tablename: ' + tablename + '\n')
            sortedsqlstat = sorted(sqlstat.items(), key=lambda d: d[1], reverse=True)
            for sql, count in sortedsqlstat:
                f_res.write('sql = %s\ncounts: %d\n\n' % (sql, count))
            f_res.write('-------------------------------------: \n\n')
        f_res.write('-------------------------------------: \n')
        f_res.write('Detail results: \n')
        for tablename, sqlobjlist in sqlobjMap.items():
            f_res.write('tablename: ' + tablename + '\n')
            f_res.write('sqlinfo: \n')
            for sqlobj in sqlobjlist:
                f_res.write('sql: %s QueryTime: %s LockTime: %s\n' % (
                    sqlobj['sql'], sqlobj.get('QueryTime'), sqlobj.get('LockTime')))
                f_res.write(str(sqlobj) + '\n\n')
            f_res.write('-------------------------------------: \n')
Making it configurable
In fact, this can be made configurable. Given the keyword sets that separate the lines of a record and the fields within each line, we can split multi-line records and intra-line fields and extract the corresponding content.
The basic building block is the function matchOneLine: given a list of keywords that split one line's content in order, it matches the line and returns the content corresponding to each keyword. This function handles the intra-line matching.
Configuration format: a list of lists. Each inner list is the keyword list that splits and matches one line, and each keyword delimits one region of that line. To improve parsing performance, the regexes built from the keyword lists are precompiled, so the work is not repeated while parsing the strings.
See the following code:
#!/usr/bin/python
# _*_encoding:utf-8_*_
import re

# config line keywords to separate lines.
ksconf = [['S'],
          ['# User@Host:', 'Id:'],
          ['# Schema:', 'Last_errno:', 'Killed:'],
          ['# Query_time:', 'Lock_time:', 'Rows_sent:', 'Rows_examined:', 'Rows_affected:'],
          ['# Bytes_sent:'],
          ['SET timestamp=']]
files = ['slow_sqls.txt']

# ksconf = [['id:'], ['name:'], ['able:']]
# files = ['stu.txt']

globalConf = {'ksconf': ksconf, 'files': files}

def produceRegex(keywordlistInOneLine):
    ''' build the regex that matches the keywords of one line in order '''
    oneLineRegex = r'^\s*'
    oneLineRegex += '(.*?)'.join(keywordlistInOneLine)
    oneLineRegex += r'(.*?)\s*$'
    return oneLineRegex

def readFile(filename):
    with open(filename) as f:
        return f.read()

def readAllFiles(files):
    return ''.join(map(readFile, files))

def findInText(regex, text, linesConf):
    ''' return a list of maps, one per multi-line match; in each map the key
        is a cleaned line keyword and the value is its matched content '''
    matched = regex.findall(text)
    if empty(matched):
        return []
    linePatternMap = buildLinePatternMap(linesConf)
    return [buildOneMatchMap(linesConf, onematch, linePatternMap)
            for onematch in matched]

def buildOneMatchMap(linesConf, onematch, linePatternMap):
    sepLines = [ks[0] for ks in linesConf]
    lineMatchedMap = {}
    for i in range(len(sepLines)):
        # prepend the leading keyword so the line regex can re-match the line
        lineContent = sepLines[i] + onematch[i].strip()
        lineMatchedMap.update(matchOneLine(linesConf[i], lineContent, linePatternMap))
    return lineMatchedMap

def matchOneLine(keywordlistOneLine, lineContent, patternMap):
    ''' match lineContent against a list of keywords and return a map from
        cleaned keyword to the content matched after it. e.g.
        keywordlistOneLine = ["host:", "ip:"], lineContent = "host: qinhost ip: 1.1.1.1"
        returns {"host": "qinhost", "ip": "1.1.1.1"} '''
    if len(keywordlistOneLine) == 0 or lineContent.strip() == '':
        return {}
    linekey = getLineKey(keywordlistOneLine)
    if empty(patternMap):
        linePattern = getLinePattern(keywordlistOneLine)
    else:
        linePattern = patternMap.get(linekey)
    lineMatched = linePattern.findall(lineContent)
    if empty(lineMatched):
        return {}
    ksmatchedResult = {}
    kslen = len(keywordlistOneLine)
    if kslen == 1:
        ksmatchedResult[cleankey(keywordlistOneLine[0])] = lineMatched[0].strip()
    else:
        for i in range(kslen):
            ksmatchedResult[cleankey(keywordlistOneLine[i])] = lineMatched[0][i].strip()
    return ksmatchedResult

def empty(obj):
    return obj is None or len(obj) == 0

def cleankey(dirtykey):
    ''' strip '#', ':' and spaces from a keyword to form a clean key '''
    return re.sub(r'[# :]', '', dirtykey)

def printMatched(allMatched, linesConf):
    allks = []
    for kslist in linesConf:
        allks.extend(kslist)
    for matched in allMatched:
        for k in allks:
            print(cleankey(k), '=>', matched.get(cleankey(k)))
        print('\n')

def buildLinePatternMap(linesConf):
    linePatternMap = {}
    for keywordlistOneLine in linesConf:
        linePatternMap[getLineKey(keywordlistOneLine)] = getLinePattern(keywordlistOneLine)
    return linePatternMap

def getLineKey(keywordlistForOneLine):
    return '_'.join(keywordlistForOneLine)

def getLinePattern(keywordlistForOneLine):
    return re.compile(produceRegex(keywordlistForOneLine))

def testMatchOneLine():
    assert len(matchOneLine([], 'haha', {})) == 0
    assert len(matchOneLine(['host'], '', {})) == 0
    assert len(matchOneLine('', 'haha', {})) == 0
    assert len(matchOneLine(['host', 'ip'], 'host:qqq addr: 1.1.1.1', {})) == 0
    lineMatchMap1 = matchOneLine(['id:'], 'id: 123456',
                                 {'id:': re.compile(produceRegex(['id:']))})
    assert lineMatchMap1.get('id') == '123456'
    lineMatchMap2 = matchOneLine(['host:', 'ip:'], 'host: qinhost ip: 1.1.1.1 ',
                                 {'host:_ip:': re.compile(produceRegex(['host:', 'ip:']))})
    assert lineMatchMap2.get('host') == 'qinhost'
    assert lineMatchMap2.get('ip') == '1.1.1.1'
    print('testMatchOneLine passed.')

if __name__ == '__main__':
    testMatchOneLine()
    files = globalConf['files']
    linesConf = globalConf['ksconf']
    sepLines = [ks[0] for ks in linesConf]
    text = readAllFiles(files)
    wholeRegex = produceRegex(sepLines)
    print('wholeRegex: ', wholeRegex)
    compiledPattern = re.compile(wholeRegex, flags=re.DOTALL | re.MULTILINE)
    allMatched = findInText(compiledPattern, text, linesConf)
    printMatched(allMatched, linesConf)
To parse a multi-line text file like the one below instead, you only need to change the config to ksconf = [['id:'], ['name:'], ['able:']].
id:1
name:shu
able:swim,study
id:2
name:qin
able:sleep,run
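End to end, the same mechanics can be sketched in a few lines; this rebuilds the produceRegex construction inline and runs it on the sample data in memory rather than reading stu.txt:

```python
import re

ksconf = [['id:'], ['name:'], ['able:']]
text = "id:1\nname:shu\nable:swim,study\nid:2\nname:qin\nable:sleep,run\n"

# Same construction as produceRegex: join the leading keyword of each line
# with non-greedy capture groups.
sepLines = [ks[0] for ks in ksconf]
regex = r"^\s*" + "(.*?)".join(sepLines) + r"(.*?)\s*$"
# regex is now: ^\s*id:(.*?)name:(.*?)able:(.*?)\s*$

records = [dict(zip(["id", "name", "able"], (g.strip() for g in groups)))
           for groups in re.findall(regex, text, flags=re.DOTALL | re.MULTILINE)]
print(records)
# [{'id': '1', 'name': 'shu', 'able': 'swim,study'},
#  {'id': '2', 'name': 'qin', 'able': 'sleep,run'}]
```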
