當前很多 PaaS 平台使用 k8s 搭建,但 k8s 本身很消耗資源;為了保證資源可控,我們自研了一個 PaaS 平台。
1. 整體架構:
ui --> controller --> service --> mapper --> mysql (後端,主要維護服務列表)
⬇
AgentService --> sqlite (虛機層面,主要維護當前虛機上運行的服務,同時向腳本下發 install, start, stop, uninstall, take-over, monitor_trace, installall, uninstallall 等命令)
⬇
腳本 (包括中間件 mysql, redis, kafka, elk, nacos, sentinel, mongodb, skywalking 等的維護,以及被接管的 java 服務的維護)
2. 整個平台的精華全在腳本,後端主要負責向腳本下發命令、維護庫的信息等。
3. 腳本簡述

agentctl.sh 舉例:
#!/bin/bash
#
# agentctl.sh - install, start, stop, uninstall and self-heal the
# sitesupport agent (a Java service), plus an optional node_exporter.
#
# Usage:
#   agentctl.sh {start|stop|restart|install|uninstall|check_status|self_healing}
#
# The helpers logInfo, logError, checkResult, checkSshpass, checkDepend,
# judgeMem and setLocalIp, and the variables localIp, linux_password and
# JDK_PKG_NAME, are not defined in this file; they are expected to be
# provided by the sourced common.sh.
# NOTE(review): several call sites look like they rely on logError
# terminating the script - confirm against common.sh.
#
# The script may be launched as "sh agentctl.sh" (bash in POSIX mode), so
# process substitution is deliberately not used anywhere below.

# Directory holding the downloaded packages (jar, scripts, tarballs).
readonly PACKAGE_FULL_WAY=/opt/download/packages
# File name of this control script.
readonly SH_HOME=agentctl.sh
# Directory the agent is installed into and run from.
readonly AGENT_INSTALL_HOME=/opt/agent
# File name of the agent jar.
readonly JAR_NAME=sitesupport-agent-0.0.1-SNAPSHOT.jar
# File name of the node_exporter tarball.
readonly NODE_EXPORTER=node_exporter-1.1.2.linux-amd64.tar.gz
# systemd unit file written for node_exporter.
readonly NODE_EXPORTER_UNIT=/usr/lib/systemd/system/node_exporter.service

# Pull in the shared helpers.
# shellcheck disable=SC1091
source "${PACKAGE_FULL_WAY}"/common.sh || exit

#######################################
# Prints the PID of every process whose command line mentions the agent
# jar, one per line. Prints nothing when no such process exists.
#######################################
function agentPids() {
  pgrep -f -- "${JAR_NAME}"
}

#######################################
# Sends SIGKILL to every agent process.
# Returns: 1 when no agent process exists, otherwise the status of kill.
#######################################
function killAgent() {
  local pids=""
  pids=$(agentPids)
  if [[ -z ${pids} ]]; then
    return 1
  fi
  # Word splitting is wanted here: one kill argument per PID.
  # shellcheck disable=SC2086
  kill -9 ${pids}
}

#######################################
# Starts the node_exporter systemd service if its unit file is installed.
#######################################
function startNodeExporter() {
  if [[ -e ${NODE_EXPORTER_UNIT} ]]; then
    systemctl start node_exporter.service
  fi
}

#######################################
# Makes sure root has an RSA key pair in /root/.ssh.
# An existing private key is never overwritten: when only the public half
# is missing it is re-derived from the private key.
#######################################
function createSshkey() {
  local ssh_dir=/root/.ssh
  local private_key="${ssh_dir}/id_rsa"

  if [[ -e ${private_key} && -e ${private_key}.pub ]]; then
    logInfo "ssh-key已存在!"
    return
  fi

  mkdir -p -m 700 "${ssh_dir}" || exit
  if [[ -e ${private_key} ]]; then
    # Running plain ssh-keygen here would stop at an interactive
    # "Overwrite (y/n)?" prompt, so derive the public key instead.
    if ! ssh-keygen -y -f "${private_key}" >"${private_key}.pub"; then
      logError "failed to derive ${private_key}.pub from ${private_key}"
      return 1
    fi
  elif ! ssh-keygen -f "${private_key}" -N ""; then
    logError "ssh-keygen failed for ${private_key}"
    return 1
  fi
  logInfo "ssh-key生成成功"
}

#######################################
# Removes any JDK found on PATH, installs the packaged JDK rpm and creates
# the JMX password file so that JMX can be reached remotely.
# Globals: PACKAGE_FULL_WAY, JDK_PKG_NAME (read)
# NOTE(review): the two JMX role passwords written below are the well-known
# template defaults - replace them before exposing JMX.
#######################################
function checkJdk() {
  logInfo "start check jdk...."
  if java -version &>/dev/null; then
    logInfo "start remove old jdk..."
    yum remove jdk -y
    # shellcheck disable=SC1091
    source /etc/profile
  fi

  logInfo "start install new jdk..."
  if ! rpm -ivh "${PACKAGE_FULL_WAY}/${JDK_PKG_NAME}"; then
    logError "jdk1.8.0_291 未安裝成功,請重新安裝!"
  fi

  # Allow remote JMX access.
  local jmxremote_conf=/usr/java/jdk1.8.0_291-amd64/jre/lib/management
  local password_file="${jmxremote_conf}/jmxremote.password"
  if [[ ! -d ${jmxremote_conf} ]]; then
    logError "${jmxremote_conf} 不存在!"
    return 1
  fi
  cp "${jmxremote_conf}/jmxremote.password.template" "${password_file}"
  chmod +w "${password_file}"
  printf '%s\n' "monitorRole QED" "controlRole R&D" >>"${password_file}"
  chmod 0400 "${password_file}"
  logInfo "the jdk is installed and the environment variables are configured"
}

#######################################
# Makes sure the cron daemon is running now and enabled at boot.
#######################################
function checkCrond() {
  if ! systemctl is-active --quiet crond.service; then
    systemctl start crond.service
  fi
  systemctl enable crond.service
}

#######################################
# Unpacks node_exporter under the agent install directory, registers it as
# a systemd service and starts it. Does nothing when the tarball is absent.
# Globals: PACKAGE_FULL_WAY, NODE_EXPORTER, AGENT_INSTALL_HOME,
#          NODE_EXPORTER_UNIT (read); localIp, linux_password (read, only
#          in the currently unreachable prometheus branch)
#######################################
function installNodeExporter() {
  logInfo "start install node exporter..."
  local archive="${PACKAGE_FULL_WAY}/${NODE_EXPORTER}"
  local target="${AGENT_INSTALL_HOME}/node_exporter"

  if [[ ! -e ${archive} ]]; then
    logInfo "node exporter不存在!"
    return
  fi

  # Unpack straight into the target directory; --strip-components drops the
  # versioned top-level directory of the tarball.
  mkdir -p "${target}"
  tar -zxf "${archive}" -C "${target}" --strip-components=1 >/dev/null 2>&1
  checkResult $? "tar node exporter package error"
  if [[ ! -d ${target} ]]; then
    logError "${AGENT_INSTALL_HOME}/node_exporter 不存在!"
  fi

  # Overwrite any previous unit file. \$MAINPID is escaped so that systemd,
  # not this shell, expands it.
  cat >"${NODE_EXPORTER_UNIT}" <<EOF
[Unit]
Description=node_exporter
After=network-online.target remote-fs.target nss-lookup.target
Wants=network-online.target

[Service]
Type=simple
ExecStart=${AGENT_INSTALL_HOME}/node_exporter/node_exporter
ExecReload=/bin/kill -s HUP \$MAINPID
ExecStop=/bin/kill -s TERM \$MAINPID

[Install]
WantedBy=multi-user.target
EOF
  systemctl daemon-reload
  systemctl enable node_exporter.service
  systemctl start node_exporter.service

  # Register this exporter with the prometheus server.
  # TODO: prometheus_ip is never populated yet, so only the first branch
  # below can run.
  local prometheus_ip=""
  if [[ -z ${prometheus_ip} || ${prometheus_ip} == "null" ]]; then
    echo "下次一定!"
    # Disabled fallback, kept for reference: read the prometheus install
    # IP from the external nacos_config file.
    # logInfo "nacos配置獲取失敗,開始從外部配置文件獲取配置..."
    # i=0
    # temp=""
    # while true
    # do
    #   i=`expr $i + 1`
    #   temp=`sed -n "/^${i} /p" ${AGENT_INSTALL_HOME}/nacos_config | cut -d ' ' -f 2`
    #   if [[ ${temp} = "" ]];then break;fi;
    #   if [[ ${temp} =~ ^prometheus ]];then
    #     sed -n "/^${i} /,/^}$/p" ${AGENT_INSTALL_HOME}/nacos_config | sed -n -e '/^{$/,/^}$/p' | jq -r ".install_ip" > ip.txt
    #   fi
    # done
    # prometheus_ip=`cat ip.txt` && rm -rf ip.txt
  else
    rm -rf temp.json
    # NOTE(review): "sshpass -p" exposes the password in the process list.
    # shellcheck disable=SC2154
    sshpass -p "${linux_password}" ssh -n -o StrictHostKeyChecking=no root@"${prometheus_ip}" "cd /opt/sitesupport/prometheus-standalone &>/dev/null || exit;sh prometheusctl.sh add_exporter -j node-${localIp}-exporter -h ${localIp} -p 9100"
    return 0
  fi
}

#######################################
# Creates the install directory, copies the jar, scripts and sqlite db into
# it and registers the once-a-minute self_healing cron job.
#######################################
function installAgent() {
  # Create the install directory; refuse to install over an existing one.
  if [[ -e ${AGENT_INSTALL_HOME} ]]; then
    logError "安裝目錄[${AGENT_INSTALL_HOME}]已存在,請檢查!"
  fi
  mkdir "${AGENT_INSTALL_HOME}"

  checkSshpass
  createSshkey
  checkCrond

  cp -f "${PACKAGE_FULL_WAY}/${JAR_NAME}" "${AGENT_INSTALL_HOME}"
  cp -f "${PACKAGE_FULL_WAY}/${SH_HOME}" "${AGENT_INSTALL_HOME}"
  cp -f "${PACKAGE_FULL_WAY}/common.sh" "${AGENT_INSTALL_HOME}"
  cp -f "${PACKAGE_FULL_WAY}/constant.sh" "${AGENT_INSTALL_HOME}"
  cp "${PACKAGE_FULL_WAY}/agent.db" "${AGENT_INSTALL_HOME}"
  chmod 755 "${AGENT_INSTALL_HOME}/${SH_HOME}"

  # Register the cron job. Any stale entry is dropped first so repeated
  # installs never accumulate duplicates. The command is written without
  # backticks: with them, cron would run the command's output as a second
  # command.
  sed -i '/agentctl.sh/d' /etc/crontab
  echo "*/1 * * * * root cd ${AGENT_INSTALL_HOME} && bash ${SH_HOME} self_healing" >>/etc/crontab
  logInfo "config jar finish"
}

#######################################
# Full installation: pre-checks, agent files, agent start, node_exporter.
#######################################
function install() {
  judgeMem 1024000
  checkDepend
  installAgent
  start
  installNodeExporter
}

#######################################
# Prints the start-up banner with the agent URL.
# Globals: localIp (read)
#######################################
function print() {
  echo -e "====================== sitesupport-agent 啟動完成 ======================\n = private: http://${localIp}:8888 =\n ========================================================================"
}

#######################################
# Starts the agent and polls it every 5 seconds for up to 60 seconds.
# On success, node_exporter is started as well and the banner is printed.
# If the agent is still not serving after the polling window, it is killed
# and the failure is reported through logError.
# NOTE(review): this launch enables JDWP remote debugging on port 18888
# while self_healing launches without it - confirm this is intended.
#######################################
function start() {
  local step=5
  local res=1
  local i=0

  for ((i = 0; i < 60; i += step)); do
    serviceIsAlive
    res=$?
    if ((res == 1)); then
      nohup java -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=18888 -jar "${AGENT_INSTALL_HOME}/${JAR_NAME}" >"${AGENT_INSTALL_HOME}/nohup.out" 2>&1 &
      logInfo "${JAR_NAME}服務啟動中..."
    elif ((res == 2)); then
      logInfo "${JAR_NAME}服務啟動中..."
    else
      logInfo "${JAR_NAME}服務已正常啟動!"
      # stop() kills node_exporter too, so bring it back on a successful
      # (re)start.
      startNodeExporter
      print
      return
    fi
    sleep "${step}"
  done

  # Only reached when the agent never became healthy.
  startNodeExporter
  killAgent
  logError "${JAR_NAME}服務啟動失敗!i will kill it!!"
}

#######################################
# Kills the agent, whatever state it is in, and the node_exporter process
# listening on port 9100.
#######################################
function stop() {
  if [[ -n $(agentPids) ]]; then
    killAgent
    logInfo "${JAR_NAME}停止成功"
  else
    logInfo "${JAR_NAME}沒有啟動"
  fi

  local node_pid=""
  node_pid=$(netstat -tnlp |
    awk '/9100/ && /node_exporter/ { split($7, owner, "/"); print owner[1]; exit }')
  if [[ -n ${node_pid} ]]; then
    kill -9 "${node_pid}"
  fi
}

#######################################
# Reports the health of the agent.
# Globals: localIp (read; refreshed through setLocalIp)
# Returns:
#   0 - a process exists, owns a listening TCP socket and answers HTTP on
#       port 8888
#   1 - no agent process exists
#   2 - a process exists but is not listening or not answering yet (it may
#       still be starting, or it may have failed)
#######################################
function serviceIsAlive() {
  setLocalIp

  local pids=""
  pids=$(agentPids)
  if [[ -z ${pids} ]]; then
    return 1
  fi

  local sockets=""
  sockets=$(netstat -tlnp)
  local pid=""
  local listening=1
  # Match the "PID/Program name" column only, so that a port number equal
  # to the PID cannot produce a false positive.
  for pid in ${pids}; do
    if printf '%s\n' "${sockets}" | grep -qE "[[:space:]]${pid}/"; then
      listening=0
      break
    fi
  done
  if ((listening != 0)); then
    return 2
  fi

  # A port is open; make sure the service actually answers.
  if ! curl http://"${localIp}":8888 &>/dev/null; then
    return 2
  fi
  return 0
}

#######################################
# Service self-healing, meant to be driven by the once-a-minute cron job:
# checks the agent every 5 seconds for 60 seconds, starts it whenever no
# process exists, and kills it at the end of the window if the last
# recorded state was "not serving".
#######################################
function self_healing() {
  local step=5
  local res=1
  local bool=1
  local i=0

  for ((i = 0; i < 60; i += step)); do
    serviceIsAlive
    res=$?
    if ((res == 1)); then
      logInfo "${JAR_NAME}服務開始啟動!"
      nohup java -jar "${AGENT_INSTALL_HOME}/${JAR_NAME}" >"${AGENT_INSTALL_HOME}/nohup.out" 2>&1 &
    elif ((res == 2)); then
      logInfo "${JAR_NAME}服務啟動中..."
      bool=1
    else
      logInfo "${JAR_NAME}服務已正常啟動!"
      bool=0
    fi
    sleep "${step}"
  done

  if ((bool == 1)); then
    killAgent
    logInfo "${JAR_NAME}服務啟動失敗!i will kill it!!"
  fi
}

#######################################
# Stops everything and removes the install directory, the cron job and the
# node_exporter systemd unit.
#######################################
function uninstall() {
  stop

  rm -rf "${AGENT_INSTALL_HOME:?}"
  # Files may have reappeared while the agent was shutting down; try again.
  if [[ -e ${AGENT_INSTALL_HOME} ]]; then
    rm -rf "${AGENT_INSTALL_HOME:?}"
  fi

  # Remove the cron job. /etc/crontab must not be sourced afterwards: it is
  # not a shell script, and cron notices the edit without any reload.
  sed -i '/agentctl.sh/d' /etc/crontab

  # Remove the exporter service.
  systemctl disable node_exporter.service &>/dev/null
  rm -f "${NODE_EXPORTER_UNIT}"
  systemctl daemon-reload
}

#######################################
# Exposes serviceIsAlive as a sub-command; the exit status is its result.
#######################################
function check_status() {
  serviceIsAlive
}

case "${1:-}" in
start)
  start
  ;;
stop)
  stop
  ;;
restart)
  stop
  start
  ;;
install)
  install
  ;;
uninstall)
  uninstall
  ;;
check_status)
  check_status
  ;;
self_healing)
  self_healing
  ;;
*)
  logError "Usage: $0 {start|stop|restart|install|uninstall|check_status|self_healing} {..}"
  ;;
esac
