1. Resource and role planning
2. Configuration
2.1 Back up the existing fully-distributed Hadoop cluster configuration
cd /opt/hadoop-3.1.1/etc/
cp -r hadoop hadoop-full
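A quick check that the backup is in place before any files are edited:
ls /opt/hadoop-3.1.1/etc/   # hadoop  hadoop-full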
2.2. Configure hadoop-env.sh
Comment out the earlier secondarynamenode entry and add entries for zkfc and jnn (journalnode).
vi + hadoop-env.sh
# append the following at the end of the file
export JAVA_HOME=/usr/java/jdk1.8.0_191-amd64
export HDFS_NAMENODE_USER=root
export HDFS_DATANODE_USER=root
#export HDFS_SECONDARYNAMENODE_USER=root
export HDFS_ZKFC_USER=root
export HDFS_JOURNALNODE_USER=root
2.3 Modify core-site.xml
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://mycluster</value> <!-- point clients at the HA nameservice defined in hdfs-site.xml, not at a single namenode -->
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/var/hadoop/ha</value>
</property>
<property>
<name>hadoop.http.staticuser.user</name>
<value>root</value>
</property>
<!-- ZooKeeper quorum used for automatic failover -->
<property>
<name>ha.zookeeper.quorum</name>
<value>node02:2181,node03:2181,node04:2181</value>
</property>
</configuration>
2.4 Modify hdfs-site.xml
<configuration>
<property>
<name>dfs.replication</name> <!-- number of block replicas -->
<value>2</value>
</property>
<property>
<name>dfs.nameservices</name>
<value>mycluster</value> <!-- logical nameservice ID; the name is up to you -->
</property>
<property>
<name>dfs.ha.namenodes.mycluster</name>
<value>nn1,nn2</value> <!-- two namenodes here; the official docs recommend three and no more than five -->
</property>
<!-- RPC address of each namenode -->
<property>
<name>dfs.namenode.rpc-address.mycluster.nn1</name>
<value>node01:8020</value>
</property>
<property>
<name>dfs.namenode.rpc-address.mycluster.nn2</name>
<value>node02:8020</value>
</property>
<!-- HTTP address of each namenode -->
<property>
<name>dfs.namenode.http-address.mycluster.nn1</name>
<value>node01:9870</value>
</property>
<property>
<name>dfs.namenode.http-address.mycluster.nn2</name>
<value>node02:9870</value>
</property>
<!-- journalnode configuration -->
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://node01:8485;node02:8485;node03:8485/mycluster</value>
</property>
<!-- failover proxy provider; copied straight from the official docs -->
<property>
<name>dfs.client.failover.proxy.provider.mycluster</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<!-- fencing configuration: shell(/bin/true) skips real fencing (see section 6.1); the private key below is what sshfence would use -->
<property>
<name>dfs.ha.fencing.methods</name>
<value>shell(/bin/true)</value>
</property>
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>/root/.ssh/id_rsa</value>
</property>
<!-- where the journalnodes store their edit logs -->
<property>
<name>dfs.journalnode.edits.dir</name>
<value>/var/hadoop/ha/journalnode</value>
</property>
<!-- enable automatic failover -->
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
</configuration>
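A quick sanity check that the new settings are being picked up (hdfs getconf prints the effective configuration):
hdfs getconf -confKey dfs.nameservices            # expect: mycluster
hdfs getconf -confKey dfs.ha.namenodes.mycluster  # expect: nn1,nn2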
2.5 Distribute the configuration
for i in {node02,node03,node04};do scp core-site.xml hdfs-site.xml hadoop-env.sh $i:`pwd`;done
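To confirm the files landed on every node, a quick checksum comparison (using the passwordless ssh already set up for this cluster):
md5sum hdfs-site.xml
for i in node02 node03 node04;do ssh $i "md5sum /opt/hadoop-3.1.1/etc/hadoop/hdfs-site.xml";done   # all four sums should match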
3. Set up the ZooKeeper cluster
3.1. Installation
Per the plan, ZooKeeper is installed on node02, node03, and node04.
On node02:
tar xf zookeeper-3.4.10.tar.gz -C /opt
3.2. Configure the zk environment variables
vi + /etc/profile
export ZOOKEEPER_HOME=/opt/zookeeper-3.4.10
PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$ZOOKEEPER_HOME/bin
Run source /etc/profile on every node.
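A quick way to confirm the variables took effect on a node:
source /etc/profile
which zkServer.sh   # should print /opt/zookeeper-3.4.10/bin/zkServer.sh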
3.3 Configure zk in the conf directory
cp zoo_sample.cfg zoo.cfg
Edit zoo.cfg:
dataDir=/var/zk   # change the default data directory
server.1=node02:2888:3888
server.2=node03:2888:3888
server.3=node04:2888:3888
3.4 Distribute the package
cd /opt
for i in {node03,node04};do scp -r zookeeper-3.4.10 $i:`pwd`;done
Then create the data directory on node02, node03, and node04.
3.5 Generate the id files
mkdir -p /var/zk   # create on node02, node03, and node04
[root@node02 opt]# echo 1 > /var/zk/myid
[root@node03 opt]# echo 2 > /var/zk/myid
[root@node04 opt]# echo 3 > /var/zk/myid
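Each myid must match its server.N entry in zoo.cfg; a quick cross-check from any node:
for i in node02 node03 node04;do ssh $i cat /var/zk/myid;done   # expect 1, 2, 3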
3.6. Start zk on every node
zkServer.sh start
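To confirm the process is up, run jps on each ZooKeeper node:
jps   # a QuorumPeerMain process should appear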
3.7 Check the role assigned to each node
[root@node02 conf]# zkServer.sh status
ZooKeeper JMX enabled by default
Using config: /opt/zookeeper-3.4.10/bin/../conf/zoo.cfg
Mode: follower
[root@node03 hadoop]# zkServer.sh status
ZooKeeper JMX enabled by default
Using config: /opt/zookeeper-3.4.10/bin/../conf/zoo.cfg
Mode: follower
[root@node04 ~]# zkServer.sh status
ZooKeeper JMX enabled by default
Using config: /opt/zookeeper-3.4.10/bin/../conf/zoo.cfg
Mode: leader
The cluster stays available only while more than half of its hosts are alive. A client can be started with zkCli.sh.
4. Start the high-availability cluster
4.1. Start the journalnode on node01, node02, and node03
hdfs --daemon start journalnode
Once the journalnode has started successfully, the ha directory is created.
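A quick check that the daemon is running and the directory exists:
jps                 # should list JournalNode
ls /var/hadoop/ha   # the journalnode edits directory configured above appears here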
4.2. Format the namenode on node01 or node02 (only one of them); here node01 is used
[root@node01 ~]# hdfs namenode -format
2018-12-14 20:14:46,691 INFO namenode.FSImage: Allocated new BlockPoolId: BP-1378401332-10.10.0.11-1544789686691
2018-12-14 20:14:46,712 INFO common.Storage: Storage directory /var/hadoop/ha/dfs/name has been successfully formatted.
2018-12-14 20:14:46,880 INFO namenode.FSImageFormatProtobuf: Saving image file /var/hadoop/ha/dfs/name/current/fsimage.ckpt_0000000000000000000 using no compression
2018-12-14 20:14:46,991 INFO namenode.FSImageFormatProtobuf: Image file /var/hadoop/ha/dfs/name/current/fsimage.ckpt_0000000000000000000 of size 389 bytes saved in 0 seconds .
2018-12-14 20:14:47,003 INFO namenode.NNStorageRetentionManager: Going to retain 1 images with txid >= 0
4.3 Then start this namenode
[root@node01 ha]# hadoop-daemon.sh start namenode
WARNING: Use of this script to start HDFS daemons is deprecated.
WARNING: Attempting to execute replacement "hdfs --daemon start" instead.
[root@node01 ha]# jps
1168 JournalNode
1907 Jps
1813 NameNode
4.4. Sync node02 from node01
[root@node02 hadoop]# hdfs namenode -bootstrapStandby
2018-12-14 20:28:58,949 INFO common.Storage: Storage directory /var/hadoop/ha/dfs/name has been successfully formatted.
2018-12-14 20:28:59,027 INFO namenode.FSEditLog: Edit logging is async:true
2018-12-14 20:28:59,211 INFO namenode.TransferFsImage: Opening connection to http://node01:9870/imagetransfer?getimage=1&txid=0&storageInfo=-64:1201005544:1544789686691:CID-19ab0096-4ecd-4232-a0d2-18591ba70e24&bootstrapstandby=true
2018-12-14 20:28:59,393 INFO common.Util: Combined time for file download and fsync to all disks took 0.00s. The file download took 0.00s at 0.00 KB/s. Synchronous (fsync) write to disk of /var/hadoop/ha/dfs/name/current/fsimage.ckpt_0000000000000000000 took 0.00s.
2018-12-14 20:28:59,393 INFO namenode.TransferFsImage: Downloaded file fsimage.ckpt_0000000000000000000 size 389 bytes.
2018-12-14 20:28:59,450 INFO namenode.NameNode: SHUTDOWN_MSG:
/************************************************************
SHUTDOWN_MSG: Shutting down NameNode at node02/10.10.0.12
************************************************************/
4.5. Verify the sync
[root@node01 hadoop]# cat /var/hadoop/ha/dfs/name/current/VERSION
#Fri Dec 14 20:14:46 CST 2018
namespaceID=1201005544
clusterID=CID-19ab0096-4ecd-4232-a0d2-18591ba70e24
cTime=1544789686691
storageType=NAME_NODE
blockpoolID=BP-1378401332-10.10.0.11-1544789686691
layoutVersion=-64
[root@node02 hadoop]# cat /var/hadoop/ha/dfs/name/current/VERSION
#Fri Dec 14 20:28:58 CST 2018
namespaceID=1201005544
clusterID=CID-19ab0096-4ecd-4232-a0d2-18591ba70e24
cTime=1544789686691
storageType=NAME_NODE
blockpoolID=BP-1378401332-10.10.0.11-1544789686691
layoutVersion=-64
Both namenodes now share the same namespaceID, clusterID, and blockpoolID.
4.6. Format zk
[root@node01 hadoop]# hdfs zkfc -formatZK
In the zk client, the root now contains an extra hadoop-ha znode.
node01 and node02 register under zk's mycluster znode; whichever registers first becomes the active node.
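This can be seen from the ZooKeeper client on any of the zk nodes:
zkCli.sh
ls /            # [hadoop-ha, zookeeper]
ls /hadoop-ha   # [mycluster]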
4.7 Start the Hadoop cluster
[root@node01 hadoop]# start-dfs.sh
Starting namenodes on [node01 node02]
Last login: Fri Dec 14 20:12:54 CST 2018 from 10.10.0.1 on pts/0
node01: namenode is running as process 1813. Stop it first.
Starting datanodes
4.8 Verify the role assignment of each node
The mycluster znode in zk now has two extra children.
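From zkCli.sh (the two children are created by the ZKFCs for leader election):
ls /hadoop-ha/mycluster   # [ActiveBreadCrumb, ActiveStandbyElectorLock]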
5. Verification
5.1 Get the node states
[root@node01 ~]# hdfs haadmin -getServiceState nn1
active
[root@node01 ~]# hdfs haadmin -getServiceState nn2
standby
5.2 Verify via the web UI
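Open the namenode HTTP addresses configured earlier; the overview page of each reports its HA state:
http://node01:9870   # Overview shows 'active'
http://node02:9870   # Overview shows 'standby'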
5.3 Simulate failure of node01's namenode
[root@node01 hadoop]# hdfs --daemon stop namenode
[root@node01 hadoop]# jps
1168 JournalNode
4488 Jps
3711 DFSZKFailoverController
[root@node01 hadoop]# hdfs haadmin -getServiceState nn2   # nn2 has now become active
active
5.4 Check from the zk client
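The election lock should now be held by node02's ZKFC. The lock znode stores the active namenode's identity in serialized form, so its content is only partly human-readable:
get /hadoop-ha/mycluster/ActiveStandbyElectorLock   # the readable parts include mycluster, nn2 and node02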
5.5 Checking the web pages again: node01 is no longer reachable and node02 has become active.
5.6 Verify non-preemption
If node01's namenode is started again, it does not preempt the current active node; only when node02's namenode dies does it switch from standby back to active.
[root@node01 hadoop]# hdfs --daemon start namenode
[root@node01 hadoop]# hdfs haadmin -getServiceState nn1
standby
[root@node01 hadoop]# hdfs haadmin -getServiceState nn2
active
Kill node02's namenode:
[root@node02 logs]# hdfs --daemon stop namenode
[root@node02 logs]# hdfs haadmin -getServiceState nn1
active
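To return the cluster to its normal two-namenode state, start node02's namenode again; by the same non-preemption behavior it comes back as standby:
[root@node02 logs]# hdfs --daemon start namenode
[root@node02 logs]# hdfs haadmin -getServiceState nn2   # standby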
6 Pitfalls
6.1 A standby namenode would not switch to active
A configuration item was wrong; the zkfc log showed an error during ssh fencing.
Solved with the help of this article: https://blog.csdn.net/u014034934/article/details/76558833
6.2 The cluster's datanodes would not start
The datanode logs were full of errors like:
Response message: This node has namespaceId '2127146492 and clusterId 'CID-7434d466-7cff-488c-b4fc-82ec6245cb34' but the requesting node expected '1201005544' and 'CID-19ab0096-4ecd-4232-a0d2-18591ba70e24'
at org.apache.hadoop.hdfs.server.namenode.EditLogFileInputStream$URLLog$1.run(EditLogFileInputStream.java:435)
at org.apache.hadoop.hdfs.server.namenode.EditLogFileInputStream$URLLog$1.run(EditLogFileInputStream.java:420)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1729)
at org.apache.hadoop.security.SecurityUtil.doAsUser(SecurityUtil.java:515)
at org.apache.hadoop.security.SecurityUtil.doAsCurrentUser(SecurityUtil.java:509)
at org.apache.hadoop.hdfs.server.namenode.EditLogFileInputStream$URLLog.getInputStream(EditLogFileInputStream.
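The mismatched IDs point to stale metadata left over from the earlier fully-distributed cluster. One possible recovery path, assuming nothing in the old storage directories needs to be kept: stop HDFS, wipe the stale directories on the affected nodes, then redo the format/bootstrap steps of section 4:
stop-dfs.sh
rm -rf /var/hadoop/ha   # on each affected node; this is the hadoop.tmp.dir configured above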
