基於kubeadm Etcd集群數據恢復方案
etcd HA集群中單台etcd故障不更換ip恢復方式
概述
單個master節點故障的場景:將90節點(10.53.6.90)作為故障節點從etcd集群中移除,再進行修復
查看當前節點狀態
/opt# kubectl -nkube-system get po|grep etcd
# 查看etcd集群狀態
/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl --endpoints=https://10.53.4.221:2379 --cert-file=/etc/kubernetes/pki/etcd/server.crt --key-file=/etc/kubernetes/pki/etcd/server.key --ca-file=/etc/kubernetes/pki/etcd/ca.crt cluster-health"
# 查看節點成員
/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl --endpoints=https://10.53.4.221:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member list"
模擬90節點宕機
# 模擬宕機節點,90節點上操作
/opt# rm -rf /var/lib/etcd/member/*
/opt# docker rm -vf `docker ps -a | grep etcd | awk '{print $1}'`
/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl --endpoints=https://10.53.4.221:2379 --cert-file=/etc/kubernetes/pki/etcd/server.crt --key-file=/etc/kubernetes/pki/etcd/server.key --ca-file=/etc/kubernetes/pki/etcd/ca.crt cluster-health"
/opt# kubectl -nkube-system get po|grep etcd
# 移除節點成員
/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl --endpoints=https://10.53.4.221:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member remove 8e363d6f244214f6"
修復節點
snap# systemctl stop kubelet
/opt# rm -rf /var/lib/etcd/member/*
/opt# docker rm -vf `docker ps -a | grep etcd | awk '{print $1}'`
snap# ll -h /var/lib/etcd/member/
# 添加
/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl --endpoints=https://10.53.4.221:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member add bj-idc1-10-53-6-90-10.53.6.90 --peer-urls='https://10.53.6.90:2380'"
# 查看
/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl --endpoints=https://10.53.4.221:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member list"
snap# systemctl start kubelet
# 90節點查看,數據已同步過來
snap# ll -h /var/lib/etcd/member/snap/
概述
只要是同一集群中的node,並且使用基於集群ca簽發的證書,就可以作為etcd的擴展節點來使用
清除故障節點信息
/opt# kubectl -nkube-system get po|grep etcd
/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl --endpoints=https://10.53.4.221:2379 --cert-file=/etc/kubernetes/pki/etcd/server.crt --key-file=/etc/kubernetes/pki/etcd/server.key --ca-file=/etc/kubernetes/pki/etcd/ca.crt cluster-health"
# 查看節點成員
/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl --endpoints=https://10.53.4.221:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member list"
# 移除節點成員
/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl --endpoints=https://10.53.4.221:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member remove 9de8f041daa634"
# 模擬宕機節點
/opt# rm -rf /etc/kubernetes/manifests/etcd.yaml
/opt# rm -rf /var/lib/etcd/member/*
/opt# docker rm -vf `docker ps -a | grep etcd | awk '{print $1}'`
使用集群ca簽發server/peer證書
~# mkdir ~/caadd103
# 需要依賴集群本身的ca證書
~/caadd103# cp ../openssl/ca.* .
~/caadd103# vi server.cnf
[ req ]
req_extensions = v3_req
distinguished_name = req_distinguished_name
[req_distinguished_name]
[ v3_req ]
basicConstraints = CA:FALSE
extendedKeyUsage = clientAuth, serverAuth
keyUsage = nonRepudiation, digitalSignature, keyEncipherment
subjectAltName = @alt_names
[alt_names]
IP.1 = 10.53.5.165
IP.2 = 10.53.4.221
IP.3 = 10.53.6.90
IP.4 = 10.53.4.103
~/caadd103# vi peer.cnf
[ req ]
req_extensions = v3_req
distinguished_name = req_distinguished_name
[req_distinguished_name]
[ v3_req ]
extendedKeyUsage = clientAuth, serverAuth
keyUsage = critical, digitalSignature, keyEncipherment
subjectAltName = @alt_names
[alt_names]
IP.1 = 10.53.5.165
IP.2 = 10.53.4.221
IP.3 = 10.53.6.90
IP.4 = 10.53.4.103
~/caadd103# openssl genrsa -out server.key 4096
~/caadd103# openssl req -new -key server.key -out server.csr -subj "/CN=10.53.5.165" -config server.cnf
~/caadd103# openssl x509 -req -in server.csr -CA ca.crt \
  -CAkey ca.key -CAcreateserial \
  -out server.crt -days 1825 \
  -extfile server.cnf -extensions v3_req
~/caadd103# openssl genrsa -out peer.key 4096
~/caadd103# openssl req -new -key peer.key -out peer.csr \
  -subj "/CN=10.53.5.165" \
  -config peer.cnf
~/caadd103# openssl x509 -req -in peer.csr \
  -CA ca.crt -CAkey ca.key -CAcreateserial \
  -out peer.crt -days 1825 \
  -extfile peer.cnf -extensions v3_req
~# cd ~
~# scp -i diamond.yaml -r ~/caadd103 ubuntu@10.53.4.103:/home/ubuntu
新節點103操作
caadd103# cd ~/caadd103
caadd103# cp ca.crt ca.key peer.crt peer.key server.crt server.key /etc/kubernetes/pki/etcd/
caadd103# ll /etc/kubernetes/pki/etcd/
加入成員
# 添加
~# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl --endpoints=https://10.53.4.221:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member add bj-idc1-10-53-4-103-10.53.4.103 --peer-urls='https://10.53.4.103:2380'"
# 查看
~# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl --endpoints=https://10.53.4.221:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member list"
添加etcd.yaml
caadd103# vi /etc/kubernetes/manifests/etcd.yaml
# 關鍵配置
- --advertise-client-urls=https://10.53.4.103:2379
- --initial-advertise-peer-urls=https://10.53.4.103:2380
- --initial-cluster=wangshile-vendor-4-10.53.5.165=https://10.53.5.165:2380,bj-idc1-10-53-4-221-10.53.4.221=https://10.53.4.221:2380,bj-idc1-10-53-4-103-10.53.4.103=https://10.53.4.103:2380
- --initial-cluster-state=existing
- --listen-client-urls=https://127.0.0.1:2379,https://10.53.4.103:2379
- --listen-peer-urls=https://10.53.4.103:2380
- --name=bj-idc1-10-53-4-103-10.53.4.103
caadd103# docker ps -a | grep etcd
caadd103# netstat -tnlp| grep etcd
~# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl --endpoints=https://10.53.4.221:2379 --cert-file=/etc/kubernetes/pki/etcd/server.crt --key-file=/etc/kubernetes/pki/etcd/server.key --ca-file=/etc/kubernetes/pki/etcd/ca.crt cluster-health"
~# kubectl -nkube-system get po| grep etcd
另外還得修改apiserver參數
- --etcd-servers=https://10.53.5.165:2379,https://10.53.4.221:2379,https://10.53.4.103:2379
# 原節點再次加入集群時,需要修改配置 - --initial-cluster-state=existing,並清除該節點數據
2020-06-18 06:35:08.560068 E | rafthttp: request cluster ID mismatch (got cdf818194e3a8c32 want 4ec9131dabe34047)
2020-06-18 06:35:08.560482 E | rafthttp: request sent was ignored (cluster ID mismatch: peer[22efed4061b48b44]=cdf818194e3a8c32, local=4ec9131dabe34047)
etcdmain: error validating peerURLs {ClusterID:e7d9e721043e9bfa Members:[&{ID:a11f7c7b82c99552 RaftAttributes:{PeerURLs:[https://localhost:2380]} Attributes:{Name:"infra1" ClientURLs:[https://localhost:2379]}} &{ID:f7bd378938cf704d RaftAttributes:{PeerURLs:[https://10.10.30.53:2380]} Attributes:{Name: ClientURLs:[]}} &{ID:842133e992b120ec RaftAttributes:{PeerURLs:[https://10.10.30.51:2380]} Attributes:{Name:"infra0" ClientURLs:[https://10.10.30.51:2379]}}] RemovedMemberIDs:[]}: unmatched member while checking PeerURLs
通過大神帶領排查問題為peer-url問題,進入pod中查看確實有問題
/ # etcdctl --endpoints=https://127.0.0.1:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member list
8e9e05c52164694d, started, wangshile-vendor-4-10.53.5.165, http://localhost:2380, https://10.53.5.165:2379
排查其他節點,均不存在peer-url為http://localhost:2380的問題
之後通過member update手動修改該成員的peer-url,解決該問題,可以保證etcd能夠進行擴展
docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl --endpoints=https://127.0.0.1:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member list"
docker run --rm --net=host -v '/etc/kubernetes/pki/etcd-certs:/etc/kubernetes/pki/etcd-certs' -v '/opt/sensetime/diamond/data/etcd-snapshot:/backup' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl --endpoints=https://127.0.0.1:2379 --cert=/etc/kubernetes/pki/etcd-certs/client.pem --key=/etc/kubernetes/pki/etcd-certs/client-key.pem --cacert=/etc/kubernetes/pki/etcd-certs/ca.pem member update 8e9e05c52164694d https://10.53.5.165:2380"
其他操作
# 數據恢復
docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' -v '/opt/sensetime/diamond/data/etcd-snapshot:/backup' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl snapshot restore --endpoints=https://127.0.0.1:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt '/backup/0617_snapshot.db';cp -r /default.etcd/member/* /var/lib/etcd/member/"
# 備份命令
docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' -v '/opt/sensetime/diamond/data/etcd-snapshot:/backup' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl --endpoints=https://127.0.0.1:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt snapshot save /backup/$(date +%m%d)_snapshot.db"
