kubeadm Etcd高可用恢復方案


基於kubeadm Etcd集群數據恢復方案

etcd HA集群中單台etcd故障不更換ip恢復方式

概述

模擬單個master節點的etcd故障:將90節點(10.53.6.90)作為故障節點從集群中移除,然後在不更換ip的情況下進行修復

 

 

查看當前節點狀態

/opt# kubectl -nkube-system get po|grep etcd
    
 # 查看etcd集群狀態
/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd'  -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl  --endpoints=https://10.53.4.221:2379 --cert-file=/etc/kubernetes/pki/etcd/server.crt --key-file=/etc/kubernetes/pki/etcd/server.key --ca-file=/etc/kubernetes/pki/etcd/ca.crt cluster-health"
    
 # 查看節點成員
/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl  --endpoints=https://10.53.4.221:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member list"

 模擬90節點宕機

# 模擬宕機節點,90節點上操作
/opt# rm -rf /var/lib/etcd/member/*
/opt# docker rm -vf `docker ps -a | grep etcd | awk '{print $1}'`

/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd'  -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl  --endpoints=https://10.53.4.221:2379 --cert-file=/etc/kubernetes/pki/etcd/server.crt --key-file=/etc/kubernetes/pki/etcd/server.key --ca-file=/etc/kubernetes/pki/etcd/ca.crt cluster-health"
/opt# kubectl -nkube-system get po|grep etcd

# 移除節點成員
/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl  --endpoints=https://10.53.4.221:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member remove 8e363d6f244214f6"

 修復節點

snap# systemctl stop kubelet

/opt# rm -rf /var/lib/etcd/member/*
/opt# docker rm -vf `docker ps -a | grep etcd | awk '{print $1}'`
snap# ll -h /var/lib/etcd/member/

# 添加
/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd'  --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl --endpoints=https://10.53.4.221:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt  member add bj-idc1-10-53-6-90-10.53.6.90 --peer-urls='https://10.53.6.90:2380'"

# 查看
/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl  --endpoints=https://10.53.4.221:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member list"

snap# systemctl start kubelet

# 90節點查看,數據已同步過來
snap# ll -h /var/lib/etcd/member/snap/

模擬etcd節點宕機,並將新節點加入集群

概述

只要是同一集群中的node,並且使用集群ca簽發的證書,都可以作為擴展etcd的節點來使用

 清除故障節點信息

/opt# kubectl -nkube-system get po|grep etcd
/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd'  -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl  --endpoints=https://10.53.4.221:2379 --cert-file=/etc/kubernetes/pki/etcd/server.crt --key-file=/etc/kubernetes/pki/etcd/server.key --ca-file=/etc/kubernetes/pki/etcd/ca.crt cluster-health"

# 查看節點成員
/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl  --endpoints=https://10.53.4.221:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member list"

# 移除節點成員
/opt# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl  --endpoints=https://10.53.4.221:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member remove 9de8f041daa634"

# 模擬宕機節點
/opt# rm -rf /etc/kubernetes/manifests/etcd.yaml
/opt# rm -rf /var/lib/etcd/member/*
/opt# docker rm -vf `docker ps -a | grep etcd | awk '{print $1}'`

 使用集群ca簽發etcd的server/peer證書

~# mkdir ~/caadd103
# 需要依賴集群本身的ca證書
~/caadd103# cp ../openssl/ca.* .
~/caadd103# vi server.cnf
[ req ]
req_extensions = v3_req
distinguished_name = req_distinguished_name
[req_distinguished_name]
[ v3_req ]
basicConstraints = CA:FALSE
extendedKeyUsage = clientAuth, serverAuth
keyUsage = nonRepudiation, digitalSignature, keyEncipherment
subjectAltName = @alt_names
[alt_names]
IP.1 = 10.53.5.165
IP.2 = 10.53.4.221
IP.3 = 10.53.6.90
IP.4 = 10.53.4.103

~/caadd103# vi peer.cnf
[ req ]
req_extensions = v3_req
distinguished_name = req_distinguished_name
[req_distinguished_name]
[ v3_req ]
extendedKeyUsage = clientAuth, serverAuth
keyUsage = critical, digitalSignature, keyEncipherment
subjectAltName = @alt_names
[alt_names]
IP.1 = 10.53.5.165
IP.2 = 10.53.4.221
IP.3 = 10.53.6.90
IP.4 = 10.53.4.103

~/caadd103# openssl genrsa -out server.key 4096
~/caadd103# openssl req -new -key server.key -out server.csr -subj "/CN=10.53.5.165" -config server.cnf
~/caadd103# openssl x509 -req -in server.csr -CA ca.crt \
    -CAkey ca.key -CAcreateserial \
    -out server.crt -days 1825 \
    -extfile server.cnf -extensions v3_req
~/caadd103# openssl genrsa -out peer.key 4096
~/caadd103# openssl req -new -key peer.key -out peer.csr \
    -subj "/CN=10.53.5.165" \
    -config peer.cnf
~/caadd103# openssl x509 -req -in peer.csr \
    -CA ca.crt -CAkey ca.key -CAcreateserial \
    -out peer.crt -days 1825 \
    -extfile peer.cnf -extensions v3_req
~# cd ~
~# scp -i diamond.yaml -r ~/caadd103 ubuntu@10.53.4.103:/home/ubuntu

 新節點103操作

caadd103# cd ~/caadd103
caadd103# cp ca.crt ca.key peer.crt peer.key server.crt server.key /etc/kubernetes/pki/etcd/
caadd103# ll /etc/kubernetes/pki/etcd/

 加入成員

# 添加
~# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd'  --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl --endpoints=https://10.53.4.221:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt  member add bj-idc1-10-53-4-103-10.53.4.103 --peer-urls='https://10.53.4.103:2380'"

# 查看
~# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl  --endpoints=https://10.53.4.221:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member list"

 添加etcd.yaml

caadd103# vi /etc/kubernetes/manifests/etcd.yaml
# 關鍵配置
    - --advertise-client-urls=https://10.53.4.103:2379
    - --initial-advertise-peer-urls=https://10.53.4.103:2380
    - --initial-cluster=wangshile-vendor-4-10.53.5.165=https://10.53.5.165:2380,bj-idc1-10-53-4-221-10.53.4.221=https://10.53.4.221:2380,bj-idc1-10-53-4-103-10.53.4.103=https://10.53.4.103:2380
    - --initial-cluster-state=existing
    - --listen-client-urls=https://127.0.0.1:2379,https://10.53.4.103:2379
    - --listen-peer-urls=https://10.53.4.103:2380
    - --name=bj-idc1-10-53-4-103-10.53.4.103
caadd103# docker ps -a | grep etcd
caadd103# netstat -tnlp| grep etcd

~# docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd'  -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl  --endpoints=https://10.53.4.221:2379 --cert-file=/etc/kubernetes/pki/etcd/server.crt --key-file=/etc/kubernetes/pki/etcd/server.key --ca-file=/etc/kubernetes/pki/etcd/ca.crt cluster-health"
~# kubectl -nkube-system get po| grep etcd

 另外還得修改apiserver參數

- --etcd-servers=https://10.53.5.165:2379,https://10.53.4.221:2379,https://10.53.4.103:2379

實驗總結

1. 模擬擴展中產生的問題:後加入的etcd,其 --initial-cluster 只能填寫當前集群中實際存在的成員(包含自身),多填會報錯

# 原節點再次加入集群時,需要將配置修改為 --initial-cluster-state=existing,並清除該節點數據

2020-06-18 06:35:08.560068 E | rafthttp: request cluster ID mismatch (got cdf818194e3a8c32 want 4ec9131dabe34047)
2020-06-18 06:35:08.560482 E | rafthttp: request sent was ignored (cluster ID mismatch: peer[22efed4061b48b44]=cdf818194e3a8c32, local=4ec9131dabe34047)

 2. 問題描述:etcd成員的peer URL被註冊為localhost

etcdmain: error validating peerURLs {ClusterID:e7d9e721043e9bfa Members:[&{ID:a11f7c7b82c99552 RaftAttributes:{PeerURLs:[https://localhost:2380]} Attributes:{Name:"infra1" ClientURLs:[https://localhost:2379]}} &{ID:f7bd378938cf704d RaftAttributes:{PeerURLs:[https://10.10.30.53:2380]} Attributes:{Name: ClientURLs:[]}} &{ID:842133e992b120ec RaftAttributes:{PeerURLs:[https://10.10.30.51:2380]} Attributes:{Name:"infra0" ClientURLs:[https://10.10.30.51:2379]}}] RemovedMemberIDs:[]}: unmatched member while checking PeerURLs

通過大神帶領排查問題為peer-url問題,進入pod中查看確實有問題
/ # etcdctl  --endpoints=https://127.0.0.1:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member list
8e9e05c52164694d, started, wangshile-vendor-4-10.53.5.165, http://localhost:2380, https://10.53.5.165:2379

排查其他節點,都不存在http://localhost:2380的問題
之後通過手動執行member update修改peer URL,解決了該問題,保證etcd能夠進行擴展
docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl  --endpoints=https://127.0.0.1:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt member list"

docker run --rm --net=host -v '/etc/kubernetes/pki/etcd-certs:/etc/kubernetes/pki/etcd-certs' -v '/opt/sensetime/diamond/data/etcd-snapshot:/backup' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl  --endpoints=https://127.0.0.1:2379 --cert=/etc/kubernetes/pki/etcd-certs/client.pem --key=/etc/kubernetes/pki/etcd-certs/client-key.pem --cacert=/etc/kubernetes/pki/etcd-certs/ca.pem member update 8e9e05c52164694d https://10.53.5.165:2380"

 其他操作

# 數據恢復
docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' -v '/opt/sensetime/diamond/data/etcd-snapshot:/backup' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl snapshot restore --endpoints=https://127.0.0.1:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt '/backup/0617_snapshot.db';cp -r /default.etcd/member/* /var/lib/etcd/member/"

# 備份命令
docker run --rm --net=host -v '/etc/kubernetes/pki/etcd:/etc/kubernetes/pki/etcd' -v '/opt/sensetime/diamond/data/etcd-snapshot:/backup' --env ETCDCTL_API=3 -v '/var/lib/etcd:/var/lib/etcd' 'registry.sensetime.com/diamond/etcd:3.3.10' /bin/sh -c "etcdctl --endpoints=https://127.0.0.1:2379 --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key --cacert=/etc/kubernetes/pki/etcd/ca.crt snapshot save /backup/$(date +%m%d)_snapshot.db"

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM