基礎信息
MySQL版本:5.7.27-30-log Percona Server (GPL), wsrep_31.39
MySQL參數:
innodb_purge_threads = 4
innodb_read_io_threads = 8
innodb_write_io_threads = 8
innodb_thread_concurrency = 24
innodb_thread_sleep_delay = 148358
innodb_concurrency_tickets = 5000
gtid_mode = ON
thread_handling = one-thread-per-connection
涉及SQL:
SELECT
`iast_agent_method_pool`.`id`, `iast_agent_method_pool`.`agent_id`,
`iast_agent_method_pool`.`url`, `iast_agent_method_pool`.`uri`,
`iast_agent_method_pool`.`http_method`, `iast_agent_method_pool`.`http_scheme`,
`iast_agent_method_pool`.`http_protocol`, `iast_agent_method_pool`.`req_header`,
`iast_agent_method_pool`.`req_params`, `iast_agent_method_pool`.`req_data`,
`iast_agent_method_pool`.`res_header`, `iast_agent_method_pool`.`res_body`,
`iast_agent_method_pool`.`req_header_for_search`, `iast_agent_method_pool`.`context_path`,
`iast_agent_method_pool`.`method_pool`, `iast_agent_method_pool`.`pool_sign`,
`iast_agent_method_pool`.`clent_ip`, `iast_agent_method_pool`.`create_time`,
`iast_agent_method_pool`.`update_time`, `iast_agent_method_pool`.`uri_sha1`
FROM `iast_agent_method_pool` FORCE INDEX(PRIMARY)
WHERE (
`iast_agent_method_pool`.`agent_id` IN (
SELECT U0.`id`
FROM `iast_agent` U0
WHERE ((U0.`project_name` IS NULL
OR U0.`bind_project_id` = 806)
AND U0.`online` = 1
AND U0.`project_version_id` = 806
AND U0.`user_id` = 1)
)
AND `iast_agent_method_pool`.`pool_sign` = 'd94e7aaee21470b45ceef899dc35c2f46e4dbd24'
)
ORDER BY `iast_agent_method_pool`.`id` ASC LIMIT 1;
涉及表結構:
CREATE TABLE `iast_agent_method_pool` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`agent_id` int(11) DEFAULT NULL COMMENT 'Agent',
`url` varchar(2000) DEFAULT NULL COMMENT 'URL',
`uri` varchar(2000) DEFAULT NULL COMMENT 'URI',
`http_method` varchar(10) DEFAULT NULL COMMENT 'HTTP請求方法',
`http_scheme` varchar(20) DEFAULT NULL COMMENT '協議',
`http_protocol` varchar(255) DEFAULT NULL COMMENT 'HTTP協議',
`req_header` varchar(2000) DEFAULT NULL COMMENT '請求頭',
`req_params` varchar(2000) DEFAULT NULL COMMENT '請求參數',
`req_data` varchar(4000) DEFAULT NULL COMMENT '請求體',
`res_header` varchar(1000) DEFAULT NULL COMMENT '響應頭',
`res_body` varchar(1000) DEFAULT NULL COMMENT '響應體',
`context_path` varchar(255) DEFAULT NULL COMMENT '應用上下文',
`method_pool` json DEFAULT NULL COMMENT '方法池',
`clent_ip` varchar(255) DEFAULT NULL COMMENT '客戶端IP',
`create_time` int(11) DEFAULT NULL COMMENT '創建時間',
`update_time` int(11) DEFAULT NULL COMMENT '修改時間',
`pool_sign` varchar(40) DEFAULT NULL COMMENT '方法池簽名',
`req_header_for_search` text COMMENT '用於搜索功能的請求頭,請求方法、uri、參數、協議、請求頭的拼接',
`uri_sha1` varchar(40) NOT NULL DEFAULT '',
PRIMARY KEY (`id`) USING BTREE,
UNIQUE KEY `pool_sign` (`pool_sign`,`agent_id`) USING BTREE,
KEY `agent_id` (`agent_id`) USING BTREE,
KEY `update_time` (`update_time`) USING BTREE,
KEY `iast_agent__uri_sha_aac6d7_idx` (`uri_sha1`,`http_method`,`agent_id`),
KEY `iast_agent_method_pool_uri_sha1_a64012e4` (`uri_sha1`),
KEY `iast_agent_method_pool_agent_id_30df78eb` (`agent_id`)
) ENGINE=InnoDB AUTO_INCREMENT=106703 DEFAULT CHARSET=utf8mb4;
CREATE TABLE `iast_agent` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`token` varchar(255) DEFAULT NULL COMMENT 'agent唯一標識',
`version` varchar(255) DEFAULT NULL COMMENT '版本',
`latest_time` int(11) DEFAULT NULL COMMENT '更新時間',
`user_id` int(11) NOT NULL COMMENT '用戶ID',
`server_id` int(11) DEFAULT NULL COMMENT '服務器ID',
`is_running` int(1) DEFAULT NULL COMMENT 'agent運行狀態',
`control` int(1) DEFAULT NULL COMMENT 'agent控制位,1-安裝、2-卸載、0-無控制',
`is_control` int(1) DEFAULT NULL COMMENT '是否正處於控制中,0-否,1-是',
`bind_project_id` int(11) DEFAULT '0' COMMENT '捆綁項目ID,存在則為已捆綁',
`project_name` varchar(255) DEFAULT NULL COMMENT '項目名稱,用於先啟動agent后創建項目',
`project_version_id` int(11) DEFAULT '0' COMMENT '項目版本ID',
`online` tinyint(4) DEFAULT '0' COMMENT '1在線運行,0未運行,同token,僅一個online',
`is_core_running` int(11) DEFAULT NULL COMMENT '核心引擎是否啟動',
`language` varchar(10) DEFAULT 'JAVA' COMMENT '語言,JAVA、PYTHON',
`startup_time` int(11) NOT NULL DEFAULT '0' COMMENT '啟動時間',
`alias` varchar(255) NOT NULL DEFAULT '' COMMENT 'agent別名',
`register_time` int(11) NOT NULL DEFAULT '0' COMMENT '注冊時間',
`is_audit` int(11) NOT NULL DEFAULT '0' COMMENT '0:未審核;1:已審核',
PRIMARY KEY (`id`) USING BTREE,
KEY `user_id` (`user_id`) USING BTREE,
KEY `server_id` (`server_id`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=29519 DEFAULT CHARSET=utf8mb4;
存儲引擎信息
通過SHOW ENGINE INNODB STATUS輸出可發現,大量進程處於"not started sleeping before entering InnoDB"
---TRANSACTION 421116931020976, not started sleeping before entering InnoDB
mysql tables in use 2, locked 0
0 lock struct(s), heap size 1136, 0 row lock(s)
---TRANSACTION 421116931082032, not started sleeping before entering InnoDB
mysql tables in use 2, locked 0
0 lock struct(s), heap size 1136, 0 row lock(s)
---TRANSACTION 421116931076272, not started sleeping before entering InnoDB
mysql tables in use 2, locked 0
0 lock struct(s), heap size 1136, 0 row lock(s)
---TRANSACTION 421116931034800, not started sleeping before entering InnoDB
mysql tables in use 2, locked 0
只有少量進程(innodb_thread_concurrency)進入的INNODB存儲引擎層進行執行:
--------------
ROW OPERATIONS
--------------
24 queries inside InnoDB, 1182 queries in queue
25 read views open inside InnoDB
0 RW transactions active inside InnoDB
實例堆棧信息
從堆棧信息可以看到,上千個線程在使用srv_conc_enter_innodb方法:
Thread 539 (Thread 0x7f9036907700 (LWP 378610)):
#0 0x00007f9506517e3d in nanosleep () from /lib64/libpthread.so.0
#1 0x00000000010af8d0 in os_thread_sleep(unsigned long) ()
#2 0x000000000114a0b8 in srv_conc_enter_innodb(row_prebuilt_t*) ()
#3 0x000000000102306c in ha_innobase::index_read(unsigned char*, unsigned char const*, unsigned int, ha_rkey_function) ()
#4 0x000000000084e918 in handler::index_read_idx_map(unsigned char*, unsigned int, unsigned char const*, unsigned long, ha_rkey_function) ()
#5 0x0000000000851e5a in handler::ha_index_read_idx_map(unsigned char*, unsigned int, unsigned char const*, unsigned long, ha_rkey_function) ()
#6 0x0000000000d0e993 in read_const(TABLE*, st_table_ref*) ()
#7 0x0000000000d0ead0 in join_read_const_table(JOIN_TAB*, st_position*) ()
#8 0x0000000000d23f38 in JOIN::extract_func_dependent_tables() ()
#9 0x0000000000d2f00e in JOIN::make_join_plan() ()
#10 0x0000000000d332a6 in JOIN::optimize() ()
#11 0x0000000000d7e322 in st_select_lex::optimize(THD*) ()
#12 0x0000000000d7e5cf in handle_query(THD*, LEX*, Query_result*, unsigned long long, unsigned long long) ()
#13 0x0000000000d39e33 in execute_sqlcom_select(THD*, TABLE_LIST*) ()
#14 0x0000000000d40f38 in mysql_execute_command(THD*, bool) ()
#15 0x0000000000d419c5 in mysql_parse(THD*, Parser_state*, bool) ()
#16 0x0000000000d41b10 in wsrep_mysql_parse(THD*, char const*, unsigned int, Parser_state*, bool) [clone .clone.0] ()
#17 0x0000000000d43d2f in dispatch_command(THD*, COM_DATA const*, enum_server_command) ()
#18 0x0000000000d44efb in do_command(THD*) ()
#19 0x0000000000e24ce4 in handle_connection ()
#20 0x0000000000f9ebe4 in pfs_spawn_thread ()
#21 0x00007f9506510dd5 in start_thread () from /lib64/libpthread.so.0
#22 0x00007f9504db7ead in clone () from /lib64/libc.so.6
有24個線程在使用btr_copy_blob_prefix方法:
Thread 1081 (Thread 0x7f8fd756d700 (LWP 41431)):
#0 0x00007f9506514965 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1 0x00000000010ae7fb in os_event::wait_low(long) ()
#2 0x000000000115d6e9 in sync_array_wait_event(sync_array_t*, sync_cell_t*&) ()
#3 0x000000000115fb98 in rw_lock_s_lock_spin(rw_lock_t*, unsigned long, char const*, unsigned long) ()
#4 0x00000000011ea9d3 in buf_page_get_gen(page_id_t const&, page_size_t const&, unsigned long, buf_block_t*, unsigned long, char const*, unsigned long, mtr_t*, bool, dberr_t*) ()
#5 0x00000000011c63fe in btr_copy_blob_prefix(unsigned char*, unsigned long, unsigned long, unsigned long, unsigned long) ()
#6 0x00000000011c6a9a in btr_copy_externally_stored_field(unsigned long*, unsigned char const*, page_size_t const&, unsigned long, mem_block_info_t*) ()
#7 0x0000000001124900 in row_sel_store_mysql_field_func(unsigned char*, row_prebuilt_t*, unsigned char const*, unsigned long const*, unsigned long, mysql_row_templ_t const*, unsigned long) ()
#8 0x0000000001124d9b in row_sel_store_mysql_rec(unsigned char*, row_prebuilt_t*, unsigned char const*, dtuple_t const*, unsigned long, dict_index_t const*, unsigned long const*, bool) ()
#9 0x000000000112b18f in row_search_mvcc(unsigned char*, page_cur_mode_t, row_prebuilt_t*, unsigned long, unsigned long) ()
#10 0x0000000001022889 in ha_innobase::general_fetch(unsigned char*, unsigned int, unsigned int) ()
#11 0x0000000000851bbb in handler::ha_index_next(unsigned char*) ()
#12 0x0000000000d0a6fc in join_read_next(READ_RECORD*) ()
#13 0x0000000000d0f0c6 in sub_select(JOIN*, QEP_TAB*, bool) ()
#14 0x0000000000d0e3aa in JOIN::exec() ()
#15 0x0000000000d7e600 in handle_query(THD*, LEX*, Query_result*, unsigned long long, unsigned long long) ()
#16 0x0000000000d39e33 in execute_sqlcom_select(THD*, TABLE_LIST*) ()
#17 0x0000000000d40f38 in mysql_execute_command(THD*, bool) ()
#18 0x0000000000d419c5 in mysql_parse(THD*, Parser_state*, bool) ()
#19 0x0000000000d41b10 in wsrep_mysql_parse(THD*, char const*, unsigned int, Parser_state*, bool) [clone .clone.0] ()
#20 0x0000000000d43d2f in dispatch_command(THD*, COM_DATA const*, enum_server_command) ()
#21 0x0000000000d44efb in do_command(THD*) ()
#22 0x0000000000e24ce4 in handle_connection ()
#23 0x0000000000f9ebe4 in pfs_spawn_thread ()
#24 0x00007f9506510dd5 in start_thread () from /lib64/libpthread.so.0
#25 0x00007f9504db7ead in clone () from /lib64/libc.so.6
實例監控信息
在異常階段,數據庫實例監控信息如下:
1、Thread Running爆漲至1200,同時QPS下滑接近為0:
2、大量數根讀入Buffer Pool中:
3、Buffer Poll使用內存快速上漲:
4、故障時產生大量的s_os_wait和s_spin_waits等待
5、數據庫服務器CPU使用率超過30%
6、數據庫服務器磁盤使用率達到100%
7、數據庫服務器磁盤讀操作吞吐超過500MB/s,磁盤讀操作IO超過35k/s,平均每次IO讀16KB
異常原因分析
由於表iast_agent_method_pool上存在唯一索引pool_sign(pool_sign
,agent_id
),子查詢返回的agent_id
數量有限,雖然表上存在諸多大字段和JSON,甚至部分記錄總長度超過28MB,但異常期間出現的SQL正常情況下均能在50ms內返回。在異常期間,大量查詢甚至按主鍵查詢的SQL執行時間超過10分鍾,結合各種監控數據分析,懷疑該SQL執行計划出現異常,由於查詢中包含ORDER BY iast_agent_method_pool
.id
ASC LIMIT 1 語句,查詢優化器可能選擇使用主鍵索引掃描的方式來執行,導致查詢需要掃描大量數據才能找到滿足WHERE條件的第一條記錄,由於所需數據未緩存於buffer pool中,需要從磁盤讀取並加載到buffer pool,因此產生大量的磁盤讀操作。
由於參數innodb_thread_concurrency的限制,每次只有24個請求能進入到INNODB存儲引擎層進行處理,這些執行計划異常導致查詢耗時嚴重的SQL長期在INNODB存儲引擎層占用,其他SQL無法獲得進入INNODB存儲引擎層執行的機會。
問題優化建議
1、盡量避免使用TEXT或BLOB還是JSON等大字段列。
2、盡量避免使用ORDER BY語句,避免在不需要排序的場景中使用ORDER BY語句。
3、盡量避免使用子查詢,查詢優化器很難准確評估子查詢的返回記錄,很容易引發執行計划異常。
4、合理設置參數innodb_thread_concurrency的值,推薦使用默認值0(即不限制並發),當使用>0的值時,需考慮超慢查詢對集群的影響。