問題描述
客戶報告我們一個API在查詢到第11頁時老是出現504Gateway Timout ERROR
問題排查
根據Code我們發現該API是一個mongo的SQL 語句
db['sxaimsubscriber'].find({'organizationId': '12615054', 'flowDiscovered': true, 'mappedBy': {'$regex': '.*DHCP.*'}, 'ipAddress': {'$exists': 1}, 'deletedTime': {'$exists': 0}},projection={'_id': 1, 'ipAddress': 1}).sort({'_id':1}).skip(30000).limit(3000)ionStats")
但奇怪的是,同樣的SQL,在調用分頁1~10頁的時候都可以在5s內返回數據,但到第11頁5分鍾也不返回。
同時測試過,如果不分頁,總查詢數據量也就在5W左右,在2分鍾之內也能返回數據。
根據官方文檔https://docs.mongodb.com/manual/reference/method/cursor.skip/#cursor.skip ,skip可能會引發慢查詢,於是我們分析查詢計划
db['sxaimsubscriber'].find({'organizationId': '12615054', 'flowDiscovered': true, 'mappedBy': {'$regex': '.*DHCP.*'}, 'ipAddress': {'$exists': 1}, 'deletedTime': {'$exists': 0}},projection={'_id': 1, 'ipAddress': 1}).sort({'_id':1}).skip(10000).limit(3000).explain("executionStats")
當我們skip10000條的時候,執行計划中發現是走了Index的

"executionStats" : { "executionSuccess" : true, "nReturned" : 3000, "executionTimeMillis" : 17843, "totalKeysExamined" : 151864, "totalDocsExamined" : 53312, "executionStages" : { "stage" : "SKIP", "nReturned" : 3000, "executionTimeMillisEstimate" : 1136, "works" : 164866, "advanced" : 3000, "needTime" : 161865, "needYield" : 0, "saveState" : 13470, "restoreState" : 13470, "isEOF" : 1, "invalidates" : 50, "skipAmount" : 0, "inputStage" : { "stage" : "PROJECTION", "nReturned" : 13000, "executionTimeMillisEstimate" : 1106, "works" : 164866, "advanced" : 13000, "needTime" : 151865, "needYield" : 0, "saveState" : 13470, "restoreState" : 13470, "isEOF" : 1, "invalidates" : 50, "transformBy" : { "_id" : 1, "ipAddress" : 1 }, "inputStage" : { "stage" : "SORT", "nReturned" : 13000, "executionTimeMillisEstimate" : 1056, "works" : 164866, "advanced" : 13000, "needTime" : 151865, "needYield" : 0, "saveState" : 13470, "restoreState" : 13470, "isEOF" : 1, "invalidates" : 50, "sortPattern" : { "_id" : -1 }, "memUsage" : 13239001, "memLimit" : 33554432, "limitAmount" : 13000, "inputStage" : { "stage" : "KEEP_MUTATIONS", "nReturned" : 46369, "executionTimeMillisEstimate" : 806, "works" : 151865, "advanced" : 46369, "needTime" : 105495, "needYield" : 0, "saveState" : 13470, "restoreState" : 13470, "isEOF" : 1, "invalidates" : 50, "inputStage" : { "stage" : "SORT_KEY_GENERATOR", "nReturned" : 0, "executionTimeMillisEstimate" : 796, "works" : 151865, "advanced" : 0, "needTime" : 105495, "needYield" : 0, "saveState" : 13470, "restoreState" : 13470, "isEOF" : 1, "invalidates" : 50, "inputStage" : { "stage" : "FETCH", "filter" : { "$and" : [ { "flowDiscovered" : { "$eq" : true } }, { "ipAddress" : { "$exists" : true } }, { "$not" : { "deletedTime" : { "$exists" : true } } } ] }, "nReturned" : 46369, "executionTimeMillisEstimate" : 707, "works" : 151864, "advanced" : 46369, "needTime" : 105494, "needYield" : 0, "saveState" : 13470, "restoreState" : 13470, "isEOF" : 1, "invalidates" : 50, "docsExamined" : 53312, "alreadyHasObj" : 0, "inputStage" : { "stage" : "IXSCAN", "filter" : { "mappedBy" : /.*DHCP.*/ }, "nReturned" : 53312, "executionTimeMillisEstimate" : 467, "works" : 151864, "advanced" : 53312, "needTime" : 98551, "needYield" : 0, "saveState" : 13470, "restoreState" : 13470, "isEOF" : 1, "invalidates" : 50, "keyPattern" : { "organizationId" : 1, "mappedBy" : 1 }, "indexName" : "organizationId_mapped", "isMultiKey" : false, "isUnique" : false, "isSparse" : false, "isPartial" : false, "indexVersion" : 1, "direction" : "forward", "indexBounds" : { "organizationId" : [ "[\"12615054\", \"12615054\"]" ], "mappedBy" : [ "[\"\", {})", "[/.*DHCP.*/, /.*DHCP.*/]" ] }, "keysExamined" : 151864, "dupsTested" : 0, "dupsDropped" : 0, "seenInvalidated" : 0 } } } } } } } }
但我們skip30000條的時候,發現根本沒有走Index

"executionStats" : { "executionSuccess" : true, "nReturned" : 3000, "executionTimeMillis" : 561807, "totalKeysExamined" : 6149055, "totalDocsExamined" : 6149055, "executionStages" : { "stage" : "LIMIT", "nReturned" : 3000, "executionTimeMillisEstimate" : 74140, "works" : 6290481, "advanced" : 3000, "needTime" : 6146055, "needYield" : 141425, "saveState" : 163864, "restoreState" : 163864, "isEOF" : 1, "invalidates" : 1219693, "limitAmount" : 3000, "inputStage" : { "stage" : "SKIP", "nReturned" : 3000, "executionTimeMillisEstimate" : 74000, "works" : 6290480, "advanced" : 3000, "needTime" : 6146055, "needYield" : 141425, "saveState" : 163864, "restoreState" : 163864, "isEOF" : 0, "invalidates" : 1219693, "skipAmount" : 0, "inputStage" : { "stage" : "PROJECTION", "nReturned" : 33000, "executionTimeMillisEstimate" : 73630, "works" : 6290480, "advanced" : 33000, "needTime" : 6116055, "needYield" : 141425, "saveState" : 163864, "restoreState" : 163864, "isEOF" : 0, "invalidates" : 1219693, "transformBy" : { "_id" : 1, "ipAddress" : 1 }, "inputStage" : { "stage" : "FETCH", "filter" : { "$and" : [ { "flowDiscovered" : { "$eq" : true } }, { "organizationId" : { "$eq" : "12615054" } }, { "mappedBy" : /.*DHCP.*/ }, { "ipAddress" : { "$exists" : true } }, { "$not" : { "deletedTime" : { "$exists" : true } } } ] }, "nReturned" : 33000, "executionTimeMillisEstimate" : 73188, "works" : 6290480, "advanced" : 33000, "needTime" : 6116055, "needYield" : 141425, "saveState" : 163864, "restoreState" : 163864, "isEOF" : 0, "invalidates" : 1219693, "docsExamined" : 6149055, "alreadyHasObj" : 0, "inputStage" : { "stage" : "IXSCAN", "nReturned" : 6149055, "executionTimeMillisEstimate" : 4814, "works" : 6149055, "advanced" : 6149055, "needTime" : 0, "needYield" : 0, "saveState" : 163864, "restoreState" : 163864, "isEOF" : 0, "invalidates" : 1219693, "keyPattern" : { "_id" : 1 }, "indexName" : "_id_", "isMultiKey" : false, "isUnique" : true, "isSparse" : false, "isPartial" : false, "indexVersion" : 1, "direction" : "backward", "indexBounds" : { "_id" : [ "[MaxKey, MinKey]" ] }, "keysExamined" : 6149055, "dupsTested" : 0, "dupsDropped" : 0, "seenInvalidated" : 0 } } } } }
同時查看mongo日志,發現這個查詢掃描了大量的rows:
2020-04-07T08:03:18.805-0700 I COMMAND [conn64846408] query sxa.sxaimsubscriber query: { $query: { organizationId: "12615054", ipAddress: { $exists: 1 }, mappedBy: { $regex: ".*DHCP.*" }, flowDiscovered: true, deletedTime: { $exists: 0 } }, $orderby: { _id: 1 } } planSummary: IXSCAN { organizationId: 1.0, ipAddress: 1.0 }, IXSCAN { organizationId: 1.0, ipAddress: 1.0 } cursorid:537636864055 ntoreturn:3000 ntoskip:24000 keysExamined:151858 docsExamined:151858 hasSortStage:1 keyUpdates:0 writeConflicts:0 numYields:1398 nreturned:3000 reslen:242289 locks:{ Global: { acquireCount: { r: 2798 } }, MMAPV1Journal: { acquireCount: { r: 1405 }, acquireWaitCount: { r: 1 }, timeAcquiringMicros: { r: 7865 } }, Database: { acquireCount: { r: 1399 } }, Collection: { acquireCount: { R: 1399 }, acquireWaitCount: { R: 6 }, timeAcquiringMicros: { R: 5365 } } } 898ms 2020-04-07T08:03:20.366-0700 I COMMAND [conn64846413] query sxa.sxaimsubscriber query: { $query: { organizationId: "12615054", ipAddress: { $exists: 1 }, mappedBy: { $regex: ".*DHCP.*" }, flowDiscovered: true, deletedTime: { $exists: 0 } }, $orderby: { _id: 1 } } planSummary: IXSCAN { organizationId: 1.0, ipAddress: 1.0 }, IXSCAN { organizationId: 1.0, ipAddress: 1.0 } cursorid:539455280546 ntoreturn:3000 ntoskip:27000 keysExamined:151858 docsExamined:151858 hasSortStage:1 keyUpdates:0 writeConflicts:0 numYields:1422 nreturned:3000 reslen:242313 locks:{ Global: { acquireCount: { r: 2846 } }, MMAPV1Journal: { acquireCount: { r: 1426 }, acquireWaitCount: { r: 1 }, timeAcquiringMicros: { r: 16191 } }, Database: { acquireCount: { r: 1423 } }, Collection: { acquireCount: { R: 1423 }, acquireWaitCount: { R: 3 }, timeAcquiringMicros: { R: 3438 } } } 891ms 2020-04-07T08:08:25.309-0700 I COMMAND [conn64846425] query sxa.sxaimsubscriber query: { $query: { organizationId: "12615054", ipAddress: { $exists: 1 }, mappedBy: { $regex: ".*DHCP.*" }, flowDiscovered: true, deletedTime: { $exists: 0 } }, $orderby: { _id: 1 } } planSummary: IXSCAN { _id: 1 } cursorid:540082639314 ntoreturn:3000 ntoskip:30000 keysExamined:6158182 docsExamined:6158182 fromMultiPlanner:1 replanned:1 keyUpdates:0 writeConflicts:0 numYields:272958 nreturned:3000 reslen:242407 locks:{ Global: { acquireCount: { r: 545918 } }, MMAPV1Journal: { acquireCount: { r: 273683 }, acquireWaitCount: { r: 283 }, timeAcquiringMicros: { r: 3151680 } }, Database: { acquireCount: { r: 272959 } }, Collection: { acquireCount: { R: 272959 }, acquireWaitCount: { R: 724 }, timeAcquiringMicros: { R: 1861964 } } } 304275ms
同樣,我們測試過,如果不加sort排序,在執行到第11頁的時候也能快速返回,所以發現問題就在sort排序上。
根據官方文檔
Add an index, or specify a smaller limit In MongoDB, sort operations can obtain the sort order by retrieving documents based on the ordering in an index.
If the query planner cannot obtain the sort order from an index, it will sort the results in memory.
Sort operations that use an index often have better performance than those that do not use an index.
In addition, sort operations that do not use an index will abort when they use 32 megabytes of memory.
因為我們skip3000條記錄后,超過了系統默認的32MB內存排序,所以mongo重新使用了其他算法排序,出現大量掃描,導致查詢非常慢。
解決方法
系統默認使用32MB 作為buffer
> use admin switched to db admin > db.runCommand( { getParameter : 1, "internalQueryExecMaxBlockingSortBytes" : 1 } ) { "internalQueryExecMaxBlockingSortBytes" : 33554432, "ok" : 1 }
提高buffer到50MB
> db.adminCommand({setParameter: 1, internalQueryExecMaxBlockingSortBytes:50151432}) { "was" : 33554432, "ok" : 1 }
同樣也可以修改config配置文件
setParameter=internalQueryExecMaxBlockingSortBytes=309715200
改完后,再試API就能快速返回了。
參考文章:
https://zhuanlan.zhihu.com/p/61242164
https://askubuntu.com/questions/501937/how-to-increase-buffered-data-limit
https://stackoverflow.com/a/38408403/2400151
https://studio3t.com/knowledge-base/articles/mongodb-query-performance/
https://docs.mongodb.com/manual/core/index-compound/#index-ascending-and-descending