运行环境
linux:CentOS release 6.8 (Final)
kernel:2.6.32-642.6.2.el6.x86_64
mysql Server version: 5.7.21-20-log Percona Server (GPL),MGR单主模式
1.故障现象
解析binlog
mysqlbinlog -v --base64-output=DECODE-ROWS mysql-bin.000042 --include-gtids='1bb1b861-f776-11e6-3319-010204058070:113111655'
1bb1b861-f776-11e6-3319-010204058070:113111655 该GTID的binlog内容 ycc01 与ycc03服务器内容一致
ycc01.prod.bj2 mysql_3319 SET @@SESSION.GTID_NEXT= '1bb1b861-f776-11e6-3319-010204058070:113111655'/*!*/; # at 418534101 #190412 12:41:44 server id 58070 end_log_pos 418534182 Query thread_id=4163376 exec_time=0 error_code=0 SET TIMESTAMP=1555044104/*!*/; BEGIN /*!*/; # at 418534182 #190412 12:41:44 server id 58070 end_log_pos 418534262 Table_map: `ycc_cfcenter`.`config_center_client_list` mapped to number 108 # at 418534262 #190412 12:41:44 server id 58070 end_log_pos 418534338 Delete_rows: table id 108 flags: STMT_END_F ### DELETE FROM `ycc_cfcenter`.`config_center_client_list` ### WHERE ### @1=380183102 ### @2=206 ### @3='10.1.97.131' ### @4='5566' ### @5=1555044043209 # at 418534338 #190412 12:41:44 server id 58070 end_log_pos 418534365 Xid = 604665111 COMMIT/*!*/; ycc03.prod.bj2 mysql_3319 SET @@SESSION.GTID_NEXT= '1bb1b861-f776-11e6-3319-010204058070:113111655'/*!*/; # at 747896436 #190412 12:41:44 server id 58070 end_log_pos 747896500 Query thread_id=4163376 exec_time=0 error_code=0 SET TIMESTAMP=1555044104/*!*/; BEGIN /*!*/; # at 747896500 #190412 12:41:44 server id 58070 end_log_pos 747896580 Table_map: `ycc_cfcenter`.`config_center_client_list` mapped to number 108 # at 747896580 #190412 12:41:44 server id 58070 end_log_pos 747896656 Delete_rows: table id 108 flags: STMT_END_F ### DELETE FROM `ycc_cfcenter`.`config_center_client_list` ### WHERE ### @1=380183102 ### @2=206 ### @3='10.1.97.131' ### @4='5566' ### @5=1555044043209 # at 747896656 #190412 12:41:44 server id 58070 end_log_pos 747896683 Xid = 263747865 COMMIT/*!*/;
mysqlbinlog -v --base64-output=DECODE-ROWS mysql-bin.000042 --include-gtids='1bb1b861-f776-11e6-3319-010204058070:113111656'
1bb1b861-f776-11e6-3319-010204058070:113111656 该GTID的binlog内容 ycc01 与ycc03服务器内容不一致
ycc01.db.prod.bj2 mysql_3319 SET @@SESSION.GTID_NEXT= '1bb1b861-f776-11e6-3319-010204058070:113111656'/*!*/; # at 418534426 #190412 12:41:44 server id 58070 end_log_pos 418534507 Query thread_id=4163392 exec_time=0 error_code=0 SET TIMESTAMP=1555044104/*!*/; BEGIN /*!*/; # at 418534507 #190412 12:41:44 server id 58070 end_log_pos 418534587 Table_map: `ycc_cfcenter`.`config_center_client_list` mapped to number 108 # at 418534587 #190412 12:41:44 server id 58070 end_log_pos 418534663 Delete_rows: table id 108 flags: STMT_END_F ### DELETE FROM `ycc_cfcenter`.`config_center_client_list` ### WHERE ### @1=380183123 ### @2=136 ### @3='10.1.94.137' ### @4='5566' ### @5=1555044044493 # at 418534663 #190412 12:41:44 server id 58070 end_log_pos 418534690 Xid = 604665114 COMMIT/*!*/; ycc03.db.prod.bj2 mysql_3319
SET @@SESSION.GTID_NEXT= '1bb1b861-f776-11e6-3319-010204058070:113111656'/*!*/; # at 747896744 #190412 12:41:44 server id 58070 end_log_pos 747896808 Query thread_id=4163376 exec_time=1 error_code=0 SET TIMESTAMP=1555044104/*!*/; BEGIN /*!*/; # at 747896808 #190412 12:41:44 server id 58070 end_log_pos 747896888 Table_map: `ycc_cfcenter`.`config_center_client_list` mapped to number 108 # at 747896888 #190412 12:41:44 server id 58070 end_log_pos 747896964 Write_rows: table id 108 flags: STMT_END_F ### INSERT INTO `ycc_cfcenter`.`config_center_client_list` ### SET ### @1=380183774 ### @2=206 ### @3='10.1.97.131' ### @4='5566' ### @5=1555044104617 # at 747896964 #190412 12:41:44 server id 58070 end_log_pos 747896991 Xid = 263747869 COMMIT/*!*/;
2.原因排查
官方已经反馈社区版MySQL 5.7.26和MySQL 8.0.16 中会修复,如果是企业版客户可以申请最新的hotfix版本。
在未升级 MySQL 版本前,若再发生此类故障,在修复时需要人工检查,检查切换时binlog中 GTID 信息与新主节点对应 GTID 的信息是否一致。
如果不一致需要人工修复至一致状态,一致才可以将被踢出的原主节点加回集群。
参考文档
3.修复过程
3.1 确定故障点
查看故障节点error.log
2019
-
04
-12T12:
41
:
45.834694
+
08
:
00
3561943
[ERROR] Slave SQL
for
channel
'group_replication_applier'
: Could not execute Write_rows event on table ycc_cfcenter.config_center_client_list; Duplicate entry '
1
36
-
10.1
.
94.137
-
5566
' for key '
uniq_idx_project_id
', Error_code: 1062; handler error HA_ERR_FOUND_DUPP_KEY; the event'
s master log FIRST, end_log_pos
327
, Error_code:
1062
2019
-
04
-12T12:
41
:
45.834713
+
08
:
00
3561943
[Warning] Slave: Duplicate entry
'136-10.1.94.137-5566'
for
key
'uniq_idx_project_id'
Error_code:
1062
|
查看故障节点当前GTID
SHOW MASTER STATUS
FILE POSITION Binlog_Do_DB Binlog_Ignore_DB Executed_Gtid_Set
---------------- --------- ------------ ---------------- --------------------------------------------------
mysql-bin.
000042
747898223
1bb1b861-f776-11e6-
3319
-
010204058070
:
1
-
113111661
|
解析故障节点包含【1bb1b861-f776-11e6-3319-010204058070:1-113111661】 GTID 的binlog
##mysqlbinlog -vvv relay-bin-group_replication_applier.000053 --include-gtids="1bb1b861-f776-11e6-3319-010204058070:113111661"
SET @
@SESSION
.GTID_NEXT=
'1bb1b861-f776-11e6-3319-010204058070:113111661'
/*!*/
;
# at
703004277
#
190412
12
:
41
:
45
server id
58070
end_log_pos
81
Query thread_id=
4163376
exec_time=
0
error_code=
0
SET TIMESTAMP=
1555044105
/*!*/
;
BEGIN
/*!*/
;
# at
703004358
#
190412
12
:
41
:
45
server id
58070
end_log_pos
161
Table_map: `ycc_cfcenter`.`config_center_client_list` mapped to number
108
# at
703004438
#
190412
12
:
41
:
45
server id
58070
end_log_pos
327
Write_rows: table id
108
flags: STMT_END_F
BINLOG '
'
/*!*/
;
### INSERT INTO `ycc_cfcenter`.`config_center_client_list`
### SET
###
@1
=
380183781
/* LONGINT meta=0 nullable=0 is_null=0 */
###
@2
=
206
/* LONGINT meta=0 nullable=1 is_null=0 */
###
@3
=
'10.1.97.133'
/* VARSTRING(300) meta=300 nullable=1 is_null=0 */
###
@4
=
'5566'
/* VARSTRING(90) meta=90 nullable=1 is_null=0 */
###
@5
=
1555044105821
/* LONGINT meta=0 nullable=1 is_null=0 */
### INSERT INTO `ycc_cfcenter`.`config_center_client_list`
### SET
###
@1
=
380183788
/* LONGINT meta=0 nullable=0 is_null=0 */
###
@2
=
136
/* LONGINT meta=0 nullable=1 is_null=0 */
###
@3
=
'10.1.94.137'
/* VARSTRING(300) meta=300 nullable=1 is_null=0 */
###
@4
=
'5566'
/* VARSTRING(90) meta=90 nullable=1 is_null=0 */
###
@5
=
1555044105821
/* LONGINT meta=0 nullable=1 is_null=0 */
### INSERT INTO `ycc_cfcenter`.`config_center_client_list`
### SET
###
@1
=
380183795
/* LONGINT meta=0 nullable=0 is_null=0 */
###
@2
=
206
/* LONGINT meta=0 nullable=1 is_null=0 */
###
@3
=
'10.1.97.131'
/* VARSTRING(300) meta=300 nullable=1 is_null=0 */
###
@4
=
'5566'
/* VARSTRING(90) meta=90 nullable=1 is_null=0 */
###
@5
=
1555044105821
/* LONGINT meta=0 nullable=1 is_null=0 */
# at
703004604
#
190412
12
:
41
:
45
server id
58070
end_log_pos
354
Xid =
604665142
COMMIT
/*!*/
;
|
解析主节点binlog 确定相同内容binlog event 对应的GTID值
##mysqlbinlog -vvv mysql-bin.000042 --include-gtids="1bb1b861-f776-11e6-3319-010204058070:113111660-113111665'
SET @
@SESSION
.GTID_NEXT=
'1bb1b861-f776-11e6-3319-010204058070:113111662'
/*!*/
;
# at
418536376
#
190412
12
:
41
:
45
server id
58070
end_log_pos
418536457
Query thread_id=
4163376
exec_time=
0
error_code=
0
SET TIMESTAMP=
1555044105
/*!*/
;
BEGIN
/*!*/
;
# at
418536457
#
190412
12
:
41
:
45
server id
58070
end_log_pos
418536537
Table_map: `ycc_cfcenter`.`config_center_client_list` mapped to number
108
# at
418536537
#
190412
12
:
41
:
45
server id
58070
end_log_pos
418536703
Write_rows: table id
108
flags: STMT_END_F
BINLOG '
'
/*!*/
;
### INSERT INTO `ycc_cfcenter`.`config_center_client_list`
### SET
###
@1
=
380183781
/* LONGINT meta=0 nullable=0 is_null=0 */
###
@2
=
206
/* LONGINT meta=0 nullable=1 is_null=0 */
###
@3
=
'10.1.97.133'
/* VARSTRING(300) meta=300 nullable=1 is_null=0 */
###
@4
=
'5566'
/* VARSTRING(90) meta=90 nullable=1 is_null=0 */
###
@5
=
1555044105821
/* LONGINT meta=0 nullable=1 is_null=0 */
### INSERT INTO `ycc_cfcenter`.`config_center_client_list`
### SET
###
@1
=
380183788
/* LONGINT meta=0 nullable=0 is_null=0 */
###
@2
=
136
/* LONGINT meta=0 nullable=1 is_null=0 */
###
@3
=
'10.1.94.137'
/* VARSTRING(300) meta=300 nullable=1 is_null=0 */
###
@4
=
'5566'
/* VARSTRING(90) meta=90 nullable=1 is_null=0 */
###
@5
=
1555044105821
/* LONGINT meta=0 nullable=1 is_null=0 */
### INSERT INTO `ycc_cfcenter`.`config_center_client_list`
### SET
###
@1
=
380183795
/* LONGINT meta=0 nullable=0 is_null=0 */
###
@2
=
206
/* LONGINT meta=0 nullable=1 is_null=0 */
###
@3
=
'10.1.97.131'
/* VARSTRING(300) meta=300 nullable=1 is_null=0 */
###
@4
=
'5566'
/* VARSTRING(90) meta=90 nullable=1 is_null=0 */
###
@5
=
1555044105821
/* LONGINT meta=0 nullable=1 is_null=0 */
# at
418536703
#
190412
12
:
41
:
45
server id
58070
end_log_pos
418536730
Xid =
604665142
COMMIT
/*!*/
;
|
3.2 确定故障点结果汇总
1.故障节点提示 Duplicate entry '136-10.1.94.137-5566' for key 'uniq_idx_project_id' 有唯一约束冲突
2.故障节点当前GTID 【1bb1b861-f776-11e6-3319-010204058070:1-113111660】
3.故障节点与主节点GTID相差一个值,相同binlog event 主节点GTID【1bb1b861-f776-11e6-3319-010204058070:113111662】 故障节点【1bb1b861-f776-11e6-3319-010204058070:113111661】
3.3 故障节点修复过程
3.3.1 停止故障节点MGR服务
STOP group_replication;
SET GLOBAL super_read_only=0
3.3.2 删除冲突键数据
DELETE FROM config_center_client_list WHERE project_id=136 AND client_ip='10.1.94.137' AND PORT=5566
3.3.3 清除故障节点GTID信息
RESET MASTER
3.3.4 重置故障节点GTID信息,故障节点因唯一约束冲突未重做的binlog event,在主节点对应的GTID为【1bb1b861-f776-11e6-3319-010204058070:113111662】。
SET @@GLOBAL.GTID_PURGED='1bb1b861-f776-11e6-3319-010204058070:1-113111661';
START group_replication;
SHOW MASTER STATUS
3.3.5 在故障节点执行, 需要跳过的GTID只有一个,也可以用以下方法,这样就不需要reset masert。
STOP group_replication;
SET GLOBAL super_read_only=0
set sql_log_bin=0
DELETE FROM config_center_client_list WHERE project_id=136 AND client_ip='10.1.94.137' AND PORT=5566
set sql_log_bin=1
set session gtid_next="1bb1b861-f776-11e6-3319-010204058070:113111661";
begin;commit;
set session gtid_next="AUTOMATIC";
START group_replication;