k8s cluster
# k8s cluster
192.168.71.201 k8s-master01 master01
192.168.71.202 k8s-master02 master02 harbor
192.168.71.203 k8s-master03 master03
192.168.71.204 k8s-node01 node01
192.168.71.205 k8s-node02 node02
k8s-master01 Ready master 170m v1.16.0
k8s-master02 Ready master 167m v1.16.0
k8s-master03 Ready master 164m v1.16.0
k8s-node01 Ready <none> 162m v1.16.0
k8s-node02 Ready <none> 162m v1.16.0
Cluster virtual IP: 192.168.71.200
The k8s HA cluster was deployed following this earlier document: https://www.cnblogs.com/lixinliang/p/12217033.html
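As a quick sanity check (not part of the original procedure), you can find out which node currently holds the virtual IP with a command along these lines:
# Run on each master node; the node whose output shows the address currently holds the VIP
ip addr | grep 192.168.71.200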
Reproducing the failure
# Creating the failure
Shut down the first master node (master01).
Because of the check_haproxy.sh health-check script, the virtual IP drifts over to the master02 node.
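A minimal sketch of reproducing and observing the failover (these commands are illustrative and assume shell access to the nodes):
# On master01: power the node off to simulate the failure
shutdown -h now
# On master02: keepalived should now hold the virtual IP
ip addr | grep 192.168.71.200
# From a surviving master: master01 shows NotReady after a short while
kubectl get nodes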
Fault recovery
# Get the ID of the failed etcd member
First, run the following on a healthy master (master02) to get the ID of the failed member in the etcd cluster:
ETCD=`docker ps|grep etcd|grep -v POD|awk '{print $1}'`
docker exec \
-it ${ETCD} \
etcdctl \
--endpoints https://127.0.0.1:2379 \
--ca-file /etc/kubernetes/pki/etcd/ca.crt \
--cert-file /etc/kubernetes/pki/etcd/peer.crt \
--key-file /etc/kubernetes/pki/etcd/peer.key \
cluster-health
The output reports an error: member 19c5f5e4748dc98b is unreachable
So the ID of the failed member is 19c5f5e4748dc98b.
# Remove the failed etcd member
Since the failed node has been reset, the etcd instance behind this ID is effectively lost and can never be contacted again, so run the following command to remove the failed member from the etcd cluster directly.
ETCD=`docker ps|grep etcd|grep -v POD|awk '{print $1}'`
docker exec \
-it ${ETCD} \
etcdctl \
--endpoints https://127.0.0.1:2379 \
--ca-file /etc/kubernetes/pki/etcd/ca.crt \
--cert-file /etc/kubernetes/pki/etcd/server.crt \
--key-file /etc/kubernetes/pki/etcd/server.key \
member remove 19c5f5e4748dc98b
# Check the etcd cluster status again
Checking again shows that only two etcd members remain in a healthy state.
docker exec \
-it ${ETCD} \
etcdctl \
--endpoints https://127.0.0.1:2379 \
--ca-file /etc/kubernetes/pki/etcd/ca.crt \
--cert-file /etc/kubernetes/pki/etcd/peer.crt \
--key-file /etc/kubernetes/pki/etcd/peer.key \
cluster-health
# Join the new node
+ Basic configuration (a rough sketch follows this list)
* Change the hostname back to master01's original hostname
* Keep the /etc/hosts file in sync with the other nodes
* Set up passwordless SSH login between the nodes
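A minimal sketch of these basic steps, run from master02 unless noted otherwise (hostnames and IPs follow the cluster layout above):
# On the rebuilt node: restore the original hostname
hostnamectl set-hostname k8s-master01
# On master02: sync the hosts file to the rebuilt node
scp /etc/hosts 192.168.71.201:/etc/hosts
# On master02: set up passwordless SSH login to the rebuilt node
ssh-copy-id root@192.168.71.201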
+ Prepare the keepalived and haproxy configuration files
Copy master01's original configuration over and start both services, roughly as sketched below.
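Assuming copies of master01's original keepalived and haproxy configuration are still available (for example backed up on master02; the /root/backup/master01 path below is purely hypothetical), the step could look like this:
# On master02: push the saved master01 configuration to the rebuilt node
scp /root/backup/master01/keepalived.conf 192.168.71.201:/etc/keepalived/keepalived.conf
scp /root/backup/master01/haproxy.cfg 192.168.71.201:/etc/haproxy/haproxy.cfg
# On master01: start and enable both services
systemctl enable --now keepalived haproxy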
+ Distribute the certificates
On the master02 node, distribute the certificates to master01:
#!/bin/bash
for index in 201; do
  ip=192.168.71.${index}
  ssh $ip "mkdir -p /etc/kubernetes/pki/etcd; mkdir -p ~/.kube/"
  scp /etc/kubernetes/pki/ca.crt $ip:/etc/kubernetes/pki/ca.crt
  scp /etc/kubernetes/pki/ca.key $ip:/etc/kubernetes/pki/ca.key
  scp /etc/kubernetes/pki/sa.key $ip:/etc/kubernetes/pki/sa.key
  scp /etc/kubernetes/pki/sa.pub $ip:/etc/kubernetes/pki/sa.pub
  scp /etc/kubernetes/pki/front-proxy-ca.crt $ip:/etc/kubernetes/pki/front-proxy-ca.crt
  scp /etc/kubernetes/pki/front-proxy-ca.key $ip:/etc/kubernetes/pki/front-proxy-ca.key
  scp /etc/kubernetes/pki/etcd/ca.crt $ip:/etc/kubernetes/pki/etcd/ca.crt
  scp /etc/kubernetes/pki/etcd/ca.key $ip:/etc/kubernetes/pki/etcd/ca.key
  scp /etc/kubernetes/admin.conf $ip:/etc/kubernetes/admin.conf
  scp /etc/kubernetes/admin.conf $ip:~/.kube/config
done
+ Prepare the kubeadm_master01.conf configuration file on master01
Next, join the new (re-initialized) node back into the cluster to restore the three-node HA master. Note that rebuilding the master reuses the kubeadm configuration file that was used when the HA masters were first deployed; that file is reused here directly.
Note: the file below needs two modifications; the original configuration file cannot be used as-is.
$ cat kubeadm_master01.conf
apiVersion: kubeadm.k8s.io/v1beta1
kind: InitConfiguration
localAPIEndpoint:
  advertiseAddress: 192.168.71.201
  bindPort: 6443
---
apiVersion: kubeadm.k8s.io/v1beta1
kind: ClusterConfiguration
kubernetesVersion: v1.16.0
controlPlaneEndpoint: "192.168.71.200:8443"
imageRepository: registry.aliyuncs.com/google_containers
apiServer:
  certSANs:
  - "master01"
  - "master02"
  - "master03"
  - 192.168.71.201
  - 192.168.71.202
  - 192.168.71.203
  - 192.168.71.200
networking:
  podSubnet: "10.244.0.0/16"
  serviceSubnet: "10.96.0.0/12"
certificatesDir: /etc/kubernetes/pki
clusterName: kubernetes
etcd:
  local:
    extraArgs:
      listen-client-urls: "https://127.0.0.1:2379,https://192.168.71.201:2379"
      advertise-client-urls: "https://192.168.71.201:2379"
      listen-peer-urls: "https://192.168.71.201:2380"
      initial-advertise-peer-urls: "https://192.168.71.201:2380"
      initial-cluster: "k8s-master01=https://192.168.71.201:2380,k8s-master02=https://192.168.71.202:2380,k8s-master03=https://192.168.71.203:2380"
      # Change 1: the line above must list the host=IP pairs of every etcd member, including the rebuilt node; otherwise etcd on the new node will fail to start
      initial-cluster-state: existing
      # Change 2: the line above must be changed from new to existing, because the cluster already exists; if the damaged master node is not master01, this change is not needed
    serverCertSANs:
    - master01
    - 192.168.71.201
    peerCertSANs:
    - master01
    - 192.168.71.201
---
apiVersion: kubeproxy.config.k8s.io/v1alpha1
kind: KubeProxyConfiguration
mode: ipvs
Run the following on master01:
# Generate all the certificates
kubeadm init phase certs all --config kubeadm_master01.conf
# Generate the local etcd static pod manifest
kubeadm init phase etcd local --config kubeadm_master01.conf
# Generate the kubelet kubeconfig file
kubeadm init phase kubeconfig kubelet --config kubeadm_master01.conf
# Start the kubelet
kubeadm init phase kubelet-start --config kubeadm_master01.conf
# Add master01's etcd member back into the etcd cluster
kubectl exec -n kube-system etcd-k8s-master02 -- etcdctl --ca-file /etc/kubernetes/pki/etcd/ca.crt --cert-file /etc/kubernetes/pki/etcd/peer.crt --key-file /etc/kubernetes/pki/etcd/peer.key --endpoints=https://192.168.71.202:2379 member add master1 https://192.168.71.201:2380
# Bring up kube-apiserver, kube-controller-manager and kube-scheduler
kubeadm init phase kubeconfig all --config kubeadm_master01.conf
kubeadm init phase control-plane all --config kubeadm_master01.conf
# Mark the node as a master
kubeadm init phase mark-control-plane --config kubeadm_master01.conf
# Check the nodes
kubectl get nodes
# Check the etcd cluster status again
[root@k8s-master02 ~]# docker exec -it ${ETCD} etcdctl --endpoints https://127.0.0.1:2379 --ca-file /etc/kubernetes/pki/etcd/ca.crt --cert-file /etc/kubernetes/pki/etcd/peer.crt --key-file /etc/kubernetes/pki/etcd/peer.key cluster-health
member 858768c8e151d5d8 is healthy: got healthy result from https://192.168.71.202:2379
member c79fe8ecd577a746 is healthy: got healthy result from https://192.168.71.203:2379
member e2892a4ec808af4e is healthy: got healthy result from https://192.168.71.201:2379
cluster is healthy
The etcd cluster is reported as healthy, proving that master01 has been restored.
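As an optional final check (not part of the original write-up), you can confirm that the control-plane pods are running again on the recovered master:
# List the kube-system pods scheduled on the recovered master
kubectl get pods -n kube-system -o wide | grep k8s-master01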