Big Data Cluster Deployment


1. ZooKeeper deployment

wget http://apache.fayea.com/zookeeper/zookeeper-3.4.10/zookeeper-3.4.10.tar.gz

tar xf zookeeper-3.4.10.tar.gz 

cd zookeeper-3.4.10

cp -a conf/zoo_sample.cfg conf/zoo.cfg

[root@node1 opt]# cat /opt/zookeeper-3.4.10/conf/zoo.cfg
# The number of milliseconds of each tick
tickTime=2000
# The number of ticks that the initial
# synchronization phase can take
initLimit=10
# The number of ticks that can pass between
# sending a request and getting an acknowledgement
syncLimit=5
# the directory where the snapshot is stored.
# do not use /tmp for storage, /tmp here is just
# example sakes.
dataDir=/opt/zookeeper-3.4.10/data
dataLogDir=/opt/zookeeper-3.4.10/logs
# the port at which the clients will connect
clientPort=2181
# the maximum number of client connections.
# increase this if you need to handle more clients
#maxClientCnxns=60
#
# Be sure to read the maintenance section of the
# administrator guide before turning on autopurge.
#
# http://zookeeper.apache.org/doc/current/zookeeperAdmin.html#sc_maintenance
#
# The number of snapshots to retain in dataDir
#autopurge.snapRetainCount=3
# Purge task interval in hours
# Set to "0" to disable auto purge feature
#autopurge.purgeInterval=1
server.1=10.61.98.71:2888:3888
server.2=10.61.98.72:2888:3888
server.3=10.61.98.73:2888:3888
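The dataDir and dataLogDir configured above must exist before ZooKeeper starts, and the node1/node2/node3 hostnames used throughout this guide have to resolve on every machine. A minimal sketch, assuming the server.1/2/3 IPs above map to node1/node2/node3 (run on each node; adjust to your network):

# Assumed hostname mapping; adjust if your IPs differ
cat >> /etc/hosts <<'EOF'
10.61.98.71 node1
10.61.98.72 node2
10.61.98.73 node3
EOF

# Create the snapshot and transaction-log directories referenced in zoo.cfg
mkdir -p /opt/zookeeper-3.4.10/data /opt/zookeeper-3.4.10/logs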

scp -rp zookeeper-3.4.10 node2:/opt/

scp -rp zookeeper-3.4.10 node3:/opt/
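Each ZooKeeper server also needs a myid file in its dataDir whose content matches its server.N number in zoo.cfg; without it the quorum cannot form. A minimal sketch, followed by starting the ensemble on every node:

echo 1 > /opt/zookeeper-3.4.10/data/myid    # on node1
echo 2 > /opt/zookeeper-3.4.10/data/myid    # on node2
echo 3 > /opt/zookeeper-3.4.10/data/myid    # on node3

# On every node: start the server and check that one node is leader and two are followers
/opt/zookeeper-3.4.10/bin/zkServer.sh start
/opt/zookeeper-3.4.10/bin/zkServer.sh status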

2. Hadoop deployment

wget http://apache.01link.hk/hadoop/common/hadoop-2.6.5/hadoop-2.6.5.tar.gz

groupadd hadoop

useradd -m -g hadoop hadoop

tar xf hadoop-2.6.5.tar.gz 

cd hadoop-2.6.5

cd /opt/hadoop-2.6.5/etc/hadoop

[root@node1 hadoop]# egrep -v "^#|^$" core-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<!-- HDFS address; in HA mode this points to the nameservice -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://ns1</value>
</property>
<!-- Default base directory where the NameNode, DataNode, JournalNode, etc. store their data; each can also be configured separately -->
<property>
<name>hadoop.tmp.dir</name>
<value>/opt/hadoop-2.6.5/tmp</value>
</property>
<!-- Addresses and ports of the ZooKeeper ensemble. The number of nodes must be odd and at least three -->
<property>
<name>ha.zookeeper.quorum</name>
<value>node1:2181,node2:2181,node3:2181</value>
</property>
</configuration>

 

 

[root@node1 hadoop]# egrep -v "^#|^$" hdfs-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<!-- Replication factor; must not exceed the number of DataNodes -->
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
<!-- Logical nameservice name for the NameNode pair -->
<property>
<name>dfs.nameservices</name>
<value>ns1</value>
</property>
<!-- NameNodes that belong to the nameservice, each given an ID -->
<property>
<name>dfs.ha.namenodes.ns1</name>
<value>node1,node2</value>
</property>
<!-- RPC address and port of the NameNode on node1; RPC is used to talk to the DataNodes -->
<property>
<name>dfs.namenode.rpc-address.ns1.node1</name>
<value>node1:9000</value>
</property>
<!-- RPC address and port of the NameNode on node2; RPC is used to talk to the DataNodes -->
<property>
<name>dfs.namenode.rpc-address.ns1.node2</name>
<value>node2:9000</value>
</property>
<!-- HTTP address and port of the NameNode on node1, used by web clients -->
<property>
<name>dfs.namenode.http-address.ns1.node1</name>
<value>node1:50070</value>
</property>
<!-- HTTP address and port of the NameNode on node2, used by web clients -->
<property>
<name>dfs.namenode.http-address.ns1.node2</name>
<value>node2:50070</value>
</property>

<!-- JournalNodes used to share edit logs between the NameNodes -->
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://node1:8485;node2:8485;node3:8485/ns1</value>
</property>
<!-- Whether to fail over automatically to the other NameNode when the active one fails -->
<property>
<name>dfs.ha.automatic-failover.enabled.ns1</name>
<value>true</value>
</property>
<!-- Directory on the JournalNodes for storing edit logs -->
<property>
<name>dfs.journalnode.edits.dir</name>
<value>/opt/hadoop-2.6.5/tmp/data/dfs/journalnode</value>
</property>
<!-- Proxy class clients use to connect to the currently active NameNode -->
<property>
<name>dfs.client.failover.proxy.provider.ns1</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<!-- Fence the old active NameNode over SSH during a failover -->
<property>
<name>dfs.ha.fencing.methods</name>
<value>sshfence</value>
</property>
<!-- Location of the private key used for SSH fencing -->
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>/home/hadoop/.ssh/id_rsa</value>
</property>
<!-- SSH connect timeout in milliseconds -->
<property>
<name>dfs.ha.fencing.ssh.connect-timeout</name>
<value>30000</value>
</property>
</configuration>
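The sshfence method requires that the failover controller on each NameNode host can SSH to the other NameNode host without a password, using the private key configured above (here under the hadoop user's home directory). A minimal sketch, assuming the hadoop user created earlier; run it on both node1 and node2:

su - hadoop
ssh-keygen -t rsa -N '' -f ~/.ssh/id_rsa
ssh-copy-id hadoop@node1
ssh-copy-id hadoop@node2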

 

 

[root@node1 hadoop]# egrep -v "^#|^$" mapred-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<!-- Use YARN as the MapReduce resource scheduling framework -->
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>

 

[root@node1 hadoop]# egrep -v "^#|^$" yarn-site.xml
<?xml version="1.0"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<configuration>
<!-- Enable ResourceManager HA -->
<property>
<name>yarn.resourcemanager.ha.enabled</name>
<value>true</value>
</property>
<!-- Cluster ID for the ResourceManager pair -->
<property>
<name>yarn.resourcemanager.cluster-id</name>
<value>yrc</value>
</property>
<!-- Two ResourceManagers are used; list their IDs -->
<property>
<name>yarn.resourcemanager.ha.rm-ids</name>
<value>rm1,rm2</value>
</property>

<!-- Host of rm1 -->
<property>
<name>yarn.resourcemanager.hostname.rm1</name>
<value>node1</value>
</property>

<!-- Host of rm2 -->
<property>
<name>yarn.resourcemanager.hostname.rm2</name>
<value>node2</value>
</property>

<!-- This machine (node1) acts as rm1 -->
<property>
<name>yarn.resourcemanager.ha.id</name>
<value>rm1</value>
</property>

<!-- ZooKeeper ensemble -->
<property>
<name>yarn.resourcemanager.zk-address</name>
<value>node1:2181,node2:2181,node3:2181</value>
</property>

<!-- Auxiliary service run on the NodeManagers; mapreduce_shuffle is required for MapReduce -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
</configuration>

 

[root@node1 hadoop]# egrep -v "^#|^$" slaves
node1
node2
node3
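Before copying the tree to the other nodes, make sure JAVA_HOME is set explicitly in etc/hadoop/hadoop-env.sh, since the daemons will not start without it. A minimal sketch, assuming a JDK installed at /opt/jdk (the same path the HBase section uses later); adjust to your JDK path:

# Still in /opt/hadoop-2.6.5/etc/hadoop
sed -i 's#^export JAVA_HOME=.*#export JAVA_HOME=/opt/jdk#' hadoop-env.sh
grep '^export JAVA_HOME' hadoop-env.sh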

 

scp -r hadoop-2.6.5 node2:/opt/
scp -r hadoop-2.6.5 node3:/opt/

 

Edit yarn-site.xml

On node2, the standby ResourceManager node, change the following property so that that machine acts as rm2:

<property>
<name>yarn.resourcemanager.ha.id</name>
<value>rm2</value>
</property>

 

Start Hadoop

cd /opt/hadoop-2.6.5

1) Start the JournalNodes. They must be running before the NameNode is formatted. Either run ./sbin/hadoop-daemons.sh start journalnode once from node1, or log in to node1, node2 and node3 and run on each:

./sbin/hadoop-daemon.sh start journalnode

2) Format nn1 (node1) and the failover controller's znode in ZooKeeper:

./bin/hdfs namenode -format
./bin/hdfs zkfc -formatZK

3) Still on nn1, initialize the shared edits directory and start the NameNode:

./bin/hdfs namenode -initializeSharedEdits
./sbin/hadoop-daemon.sh start namenode

4) On nn2 (node2), pull the metadata from nn1 (note: nn2 must not be formatted), then start its NameNode:

./bin/hdfs namenode -bootstrapStandby
./sbin/hadoop-daemon.sh start namenode

5) Start HDFS, YARN and the ZKFailoverControllers:

./sbin/start-dfs.sh
./sbin/start-yarn.sh
./sbin/hadoop-daemon.sh start zkfc

(run hadoop-daemon.sh start zkfc on both node1 and node2 if the DFSZKFailoverController processes are not already running)

On node2, the standby ResourceManager has to be started manually:

./sbin/yarn-daemon.sh start resourcemanager

6) Check the NameNode and ResourceManager state

hdfs haadmin -getServiceState node1
hdfs haadmin -getServiceState node2

yarn rmadmin -getServiceState rm1
yarn rmadmin -getServiceState rm2

You can also use the web UIs: browse to ip:50070 for HDFS and ip:8088/cluster/cluster for YARN.

 

7) Test the failover
a. Active node ---> standby node

Kill the NameNode on the active node and check whether the standby NameNode switches to active.

Kill the ResourceManager on the active node and check whether the standby ResourceManager switches to active.

b. Standby node ---> active node

If the steps above succeed, test automatic failover in the other direction.

First restart the NameNode and ResourceManager that were killed on the original active node:

hadoop-daemon.sh start namenode
yarn-daemon.sh start resourcemanager

Then kill the NameNode and ResourceManager on the standby node (now active) and check whether the original node switches back to active. If it does, the Hadoop HA cluster is complete.

 

5. Installing and configuring HBase

1. Download and install

Download from: http://mirrors.hust.edu.cn/ap...

On the master188 machine, extract it under /home/hadoop/ (this HBase section was written for hosts named master188/master189/slave190; on the cluster built above these roles correspond to node1/node2/node3, so adjust hostnames and paths to your environment):

tar -zxvf hbase-1.3.1-bin.tar.gz

2. Configuration

Go to the hbase-1.3.1/conf/ directory and edit the configuration files:

cd hbase-1.3.1/conf/
1) vi hbase-env.sh

# Configure the JDK
export JAVA_HOME=/opt/jdk
# Directory where HBase keeps its pid files
export HBASE_PID_DIR=/home/hadoop/data/hbase/pids
# Disable the ZooKeeper bundled with HBase, since we use a standalone ZooKeeper ensemble
export HBASE_MANAGES_ZK=false
2) vi hbase-site.xml

<configuration>
<!-- Shared root directory for the HRegionServers; include the port number -->
<property>
<name>hbase.rootdir</name>
<value>hdfs://master188:9000/hbase</value>
</property>
<!-- HMaster host -->
<property>
<name>hbase.master</name>
<value>hdfs://master188:60000</value>
</property>
<!-- Enable distributed mode -->
<property>
<name>hbase.cluster.distributed</name>
<value>true</value>
</property>
<!-- ZooKeeper ensemble -->
<property>
<name>hbase.zookeeper.quorum</name>
<value>master188:2181,master189:2181,slave190:2181</value>
</property>
<!-- Path of the standalone ZooKeeper installation -->
<property>
<name>hbase.zookeeper.property.dataDir</name>
<value>/home/hadoop/zookeeper-3.4.11</value>
</property>
<!-- ZooKeeper client port -->
<property>
<name>hbase.zookeeper.property.clientPort</name>
<value>2181</value>
</property>
</configuration>
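Since the HDFS built earlier is HA, hbase.rootdir can also point at the nameservice (hdfs://ns1/hbase) instead of a single NameNode; in that case HBase must be able to resolve ns1, which means Hadoop's client configuration has to be on its classpath. A minimal sketch, assuming the paths used in this guide (Hadoop under /opt/hadoop-2.6.5, HBase under /home/hadoop/hbase-1.3.1):

# Make the HDFS client configuration visible to HBase so the ns1 nameservice resolves
ln -s /opt/hadoop-2.6.5/etc/hadoop/core-site.xml /home/hadoop/hbase-1.3.1/conf/core-site.xml
ln -s /opt/hadoop-2.6.5/etc/hadoop/hdfs-site.xml /home/hadoop/hbase-1.3.1/conf/hdfs-site.xml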
3) vi regionservers

Edit the regionservers file to list the machines that will run a RegionServer:

master188
master189
slave190
4) Create the directory for the pid files

Under /home/hadoop/:

mkdir -p data/hbase/pids

3. Copy HBase to the other machines

cd /home/hadoop/

scp -r hbase-1.3.1 hadoop@master189:/home/hadoop/
scp -r hbase-1.3.1 hadoop@slave190:/home/hadoop/

4. Start HBase

Start HBase on the active node (here "active node" means the node whose NameNode is in the active state, not the fixed machine naming used earlier in this article):

cd hbase-1.3.1/bin
./start-hbase.sh
# Check whether the HMaster and HRegionServer processes are running
jps

Note: the Hadoop cluster must already be running, and start-hbase.sh must be executed on the active node; otherwise the HMaster process disappears a few seconds after starting. The standby HMaster is not started automatically; it has to be started separately on the standby node with: ./hbase-daemon.sh start master

On the standby node, start an HMaster process to act as the backup HMaster:

cd hbase-1.3.1/bin

./hbase-daemon.sh start master

5. HA failover test

Open ip:16010 in a browser and check the HMaster status on the active and standby nodes. On the standby node's web UI you should see "Current Active Master: master188", meaning the current HBase master is the master188 machine.

Active node ---> standby node
(here the active node means the machine on which start-hbase.sh was run)

Kill the HMaster process on the active node and check in the browser whether the standby node's HBase master becomes active.

If that works, restart the killed HMaster process on the original active node:

cd hbase-1.3.1/bin/

./hbase-daemon.sh start master 

Then kill the HMaster process on the standby node and check in the browser whether the original node's HBase master becomes active again; if so, the HBase HA cluster setup is complete.

6. Basic HBase operations

# Start HBase
[root@vnet ~] start-hbase.sh

# Enter the HBase shell
[root@vnet ~] hbase shell

# List the existing tables
hbase(main):> list

# Create table t_user with column families cf1 and cf2 (use no more than about 3 column families)
hbase(main):> create 't_user','cf1','cf2'

# Describe table t_user
hbase(main):> describe 't_user'

# Disable a table
hbase(main):> disable 't_user'

# Drop a table (it must be disabled first)
hbase(main):> drop 't_user'

# Check whether a table exists
hbase(main):> exists 't_user'

# Scan the whole table
hbase(main):> scan 't_user'

# Insert data: table name, row key, column (family:qualifier), value.
# HBase is column-oriented, so columns can be added without limit.
hbase(main):> put 't_user','001','cf1:name','chenxj'
hbase(main):> put 't_user','001','cf1:age','18'
hbase(main):> put 't_user','001','cf2:sex','man'
hbase(main):> put 't_user','002','cf1:name','chenxj'
hbase(main):> put 't_user','002','cf1:address','fuzhou'
hbase(main):> put 't_user','002','cf2:sex','man'

# Get data by row key, or by row key plus column family / column
hbase(main):> get 't_user','001'
hbase(main):> get 't_user','002','cf1'
hbase(main):> get 't_user','001','cf1:age'
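A few more shell commands that are handy for cleaning up after the example above; a minimal sketch using the same table and row keys:

# Count the rows in the table
hbase(main):> count 't_user'
# Delete a single cell, then a whole row
hbase(main):> delete 't_user','001','cf1:age'
hbase(main):> deleteall 't_user','001'
# Disable and recreate the table in one step (removes all data)
hbase(main):> truncate 't_user'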

6. Cluster startup result

With the Hadoop + ZooKeeper + HBase HA cluster running, the processes on each node are as follows:

Role                        node1                     node2                     node3
HDFS master                 NameNode                  NameNode                  -
HDFS worker                 DataNode                  DataNode                  DataNode
YARN master                 ResourceManager           ResourceManager           -
YARN worker                 NodeManager               NodeManager               NodeManager
HBase master                HMaster                   HMaster                   -
HBase worker                HRegionServer             HRegionServer             HRegionServer
ZooKeeper (standalone)      QuorumPeerMain            QuorumPeerMain            QuorumPeerMain
NameNode edit-log sync      JournalNode               JournalNode               JournalNode
Active/standby failover     DFSZKFailoverController   DFSZKFailoverController   -

7. Summary

Points to watch out for:

1) The NameNode, ResourceManager and HMaster on the standby node must each be started separately:

hadoop-daemon.sh start namenode
yarn-daemon.sh start resourcemanager
hbase-daemon.sh start master

2) The -forcemanual flag can force a switch between the active and standby roles, but after a forced switch automatic failover no longer works and the ZKFC state must be reformatted with hdfs zkfc -formatZK:

hdfs haadmin -transitionToActive --forcemanual node2
yarn rmadmin -transitionToActive --forcemanual rm2

3) When the standby NameNode pulls the metadata from the active one (bootstrapStandby), HDFS on the active node must already be running.

4) HDFS cannot be browsed through a NameNode that is in standby state.

5) Before formatting the NameNode, the journalnode process must be started on every JournalNode machine: hadoop-daemon.sh start journalnode

6) When something goes wrong, first work out which component is at fault and read that component's logs (and those of related components). If a web UI is unreachable or there are other connectivity problems, check whether the firewall is disabled, whether the port is already in use, whether SSH works, and whether the cluster machines are on the same network segment.

3. Spark cluster installation
Spark on YARN mode

Installing Spark

Download and extract

Download the latest Spark release from the official download page. I used spark-1.3.0-bin-hadoop2.4.tgz.

Extract it under ~/workspace:

tar -zxvf spark-1.3.0-bin-hadoop2.4.tgz
mv spark-1.3.0-bin-hadoop2.4 spark-1.3.0    # rename; the original directory name is too long

 

Configure Spark

cd ~/workspace/spark-1.3.0/conf           # enter the Spark configuration directory
cp spark-env.sh.template spark-env.sh     # copy from the template
vi spark-env.sh                           # add the configuration below

Append the following to the end of spark-env.sh (this is my configuration; adjust it to your environment):

export SCALA_HOME=/home/spark/workspace/scala-2.10.4
export JAVA_HOME=/home/spark/workspace/jdk1.7.0_75
export HADOOP_HOME=/home/spark/workspace/hadoop-2.6.0
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
SPARK_MASTER_IP=master
SPARK_LOCAL_DIRS=/home/spark/workspace/spark-1.3.0
SPARK_DRIVER_MEMORY=1G

 

Note: when setting the number of CPU cores and the amount of memory for the Worker processes, keep the machine's actual hardware in mind; if the configured values exceed what the Worker node has available, the Worker process will fail to start. See the sketch below.
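A minimal sketch of the corresponding Worker resource settings in spark-env.sh; the values below are placeholders, size them to your machines:

SPARK_WORKER_CORES=2        # CPU cores each Worker may use
SPARK_WORKER_MEMORY=2g      # memory each Worker may use
SPARK_WORKER_INSTANCES=1    # number of Worker processes per node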

vi slaves and list the slave hostnames in the slaves file:

slave1
slave2

 

Distribute the configured spark-1.3.0 directory to all the slaves:

scp -r ~/workspace/spark-1.3.0 spark@slave1:~/workspace/
scp -r ~/workspace/spark-1.3.0 spark@slave2:~/workspace/

 

Start Spark

sbin/start-all.sh

Verify that Spark is installed correctly

Check with jps. On the master you should see the following processes:

$ jps
7949 Jps
7328 SecondaryNameNode
7805 Master
7137 NameNode
7475 ResourceManager

 

On each slave you should see:

$jps
3132 DataNode
3759 Worker
3858 Jps
3231 NodeManager

 

Open the Spark web UI at http://master:8080

Run the examples

# Run locally with two threads
./bin/run-example SparkPi 10 --master local[2]

# Run on the Spark standalone cluster
./bin/spark-submit \
--class org.apache.spark.examples.SparkPi \
--master spark://master:7077 \
lib/spark-examples-1.3.0-hadoop2.4.0.jar \
100

# Run on YARN in yarn-cluster mode (yarn-client also works)
./bin/spark-submit \
--class org.apache.spark.examples.SparkPi \
--master yarn-cluster \
lib/spark-examples*.jar \
10

Note that Spark on YARN supports two run modes, yarn-cluster and yarn-client. Broadly speaking, yarn-cluster is suited to production, while yarn-client is suited to interactive use and debugging, that is, when you want to see the application's output quickly.
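For the interactive case, a minimal sketch of starting a shell against YARN in yarn-client mode (assuming HADOOP_CONF_DIR is set as in spark-env.sh above; the executor sizes are placeholders):

# The driver runs locally, so results are printed in this shell
./bin/spark-shell --master yarn-client --num-executors 2 --executor-memory 1g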



Hive

Introduction to Hive

Hive is an open-source data warehouse tool built on Hadoop for storing and processing massive amounts of structured data. It stores the data in the Hadoop file system rather than in a database, but provides a database-like mechanism for storing and processing it, and uses HQL (a SQL-like language) to manage and process the data automatically. We can think of the structured data in Hive as ordinary tables, while the data is actually stored in distributed fashion in HDFS. Hive parses and transforms statements into a series of Hadoop MapReduce jobs, and executing those jobs carries out the data processing.

Hive was born out of Facebook's log-analysis needs. Faced with huge volumes of structured data, Hive accomplishes at low cost what previously required large-scale databases, with a relatively low learning curve and flexible, efficient application development.

Only about a year has passed since Hive released its first official stable version, 0.3.0, on 2009-04-29, and it is still maturing; related material is scarce online, and Chinese material even more so. This article records some experience of applying Hive to real workloads, in the hope of saving readers a few detours.

Preparation

Environment

JDK: 1.8
Hadoop release: 2.7.4
CentOS: 7.3

node1 (master)  192.168.252.121
node2 (slave1)  192.168.252.122
node3 (slave2)  192.168.252.123
node4 (mysql)   192.168.252.124

Prerequisites

Installing Apache Hive requires a Hadoop cluster to already be in place, and Hive only needs to be installed on a NameNode host of the cluster (one NameNode host is enough); it does not need to be installed on the DataNode machines. Also note that although editing the configuration files does not require Hadoop to be running, this article uses hadoop hdfs commands that do require a running cluster, and Hadoop must be running before Hive can be started, so it is best to start the Hadoop cluster first.

Install MySQL to store Hive's metadata (Hive's embedded Derby database could also be used, but Derby is generally not used in production). A single MySQL instance is enough; for high availability you can also deploy MySQL in master-slave mode.

Hadoop

Hadoop 2.7.4 quick cluster setup

MySQL (either of the following works)

Installing MySQL 5.7.19 (binary distribution) on CentOS 7.3

Setting up MySQL 5.7.19 master-slave replication, with an analysis of how replication is implemented

Installation

Download and extract

su hadoop
cd /home/hadoop/
wget https://mirrors.tuna.tsinghua.edu.cn/apache/hive/hive-2.3.0/apache-hive-2.3.0-bin.tar.gz
tar -zxvf apache-hive-2.3.0-bin.tar.gz
mv apache-hive-2.3.0-bin hive-2.3.0

Environment variables

To make the variables apply to all users, edit /etc/profile (vi /etc/profile).
To make them apply only to the current user, edit ~/.bashrc (vi ~/.bashrc).

sudo vi /etc/profile

# hive
export HIVE_HOME=/home/hadoop/hive-2.3.0/
export PATH=${HIVE_HOME}/bin:$PATH

Make the change take effect by running source /etc/profile.

Configure Hive for Hadoop HDFS

Create hive-site.xml

cd /home/hadoop/hive-2.3.0/conf
cp hive-default.xml.template hive-site.xml

Create HDFS directories

Create the HDFS directories with hadoop, because hive-site.xml contains the following default configuration:

<property>
<name>hive.metastore.warehouse.dir</name>
<value>/user/hive/warehouse</value>
<description>location of default database for the warehouse</description>
</property>

Go to the Hadoop installation directory, create the /user/hive/warehouse directory (plus tmp and log) with the hadoop command, and grant permissions so the directories can be written to:

cd /home/hadoop/hadoop-2.7.4
bin/hadoop fs -mkdir -p /user/hive/warehouse
bin/hadoop fs -mkdir -p /user/hive/tmp
bin/hadoop fs -mkdir -p /user/hive/log
bin/hadoop fs -chmod -R 777 /user/hive/warehouse
bin/hadoop fs -chmod -R 777 /user/hive/tmp
bin/hadoop fs -chmod -R 777 /user/hive/log

Check that the directories were created:

bin/hadoop fs -ls /user/hive

Edit hive-site.xml

Search for hive.exec.scratchdir and change its value to /user/hive/tmp:

<property>
<name>hive.exec.scratchdir</name>
<value>/user/hive/tmp</value>
</property>

Search for hive.querylog.location and change its value to /user/hive/log/hadoop:

<property>
<name>hive.querylog.location</name>
<value>/user/hive/log/hadoop</value>
<description>Location of Hive run time structured log file</description>
</property>

Search for javax.jdo.option.ConnectionURL and change its value to the MySQL address:

<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://192.168.252.124:3306/hive?createDatabaseIfNotExist=true</value>
<description>
JDBC connect string for a JDBC metastore.
To use SSL to encrypt/authenticate the connection, provide database-specific SSL flag in the connection URL.
For example, jdbc:postgresql://myhost/db?ssl=true for postgres database.
</description>
</property>

Search for javax.jdo.option.ConnectionDriverName and change its value to the MySQL driver class:

<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
<description>Driver class name for a JDBC metastore</description>
</property>

Search for javax.jdo.option.ConnectionUserName and change its value to the MySQL login user:

<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value>
<description>Username to use against metastore database</description>
</property>

Search for javax.jdo.option.ConnectionPassword and change its value to the MySQL login password:

<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>mima</value>
<description>password to use against metastore database</description>
</property>

Create the tmp directory

mkdir /home/hadoop/hive-2.3.0/tmp

Then, in hive-site.xml, replace every occurrence of ${system:java.io.tmpdir} with /home/hadoop/hive-2.3.0/tmp, and every occurrence of ${system:user.name} with ${user.name} (see the sketch below).
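A minimal sketch of making these replacements with sed instead of editing the large file by hand (paths as used above; back the file up first):

cd /home/hadoop/hive-2.3.0/conf
cp hive-site.xml hive-site.xml.bak
# replace every ${system:java.io.tmpdir} with the local tmp directory
sed -i 's#${system:java.io.tmpdir}#/home/hadoop/hive-2.3.0/tmp#g' hive-site.xml
# replace every ${system:user.name} with ${user.name}
sed -i 's#${system:user.name}#${user.name}#g' hive-site.xml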

Create hive-env.sh

cp hive-env.sh.template hive-env.sh

vi hive-env.sh

HADOOP_HOME=/home/hadoop/hadoop-2.7.4/
export HIVE_CONF_DIR=/home/hadoop/hive-2.3.0/conf
export HIVE_AUX_JARS_PATH=/home/hadoop/hive-2.3.0/lib

Download the MySQL JDBC driver

cd /home/hadoop/hive-2.3.0/lib
wget http://central.maven.org/maven2/mysql/mysql-connector-java/5.1.38/mysql-connector-java-5.1.38.jar

Initialize the metastore schema in MySQL

Initialize the MySQL database used by the metastore.

First make sure the hive database exists in MySQL (see the sketch below), then run the schema tool:
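A minimal sketch of preparing the metastore database on the MySQL host; the dedicated user and password below are placeholders (the hive-site.xml above connects as root/mima, and createDatabaseIfNotExist=true in the JDBC URL can also create the database automatically), so adjust to your setup:

# On node4, the MySQL host
mysql -uroot -p -e "
CREATE DATABASE IF NOT EXISTS hive DEFAULT CHARACTER SET latin1;
GRANT ALL PRIVILEGES ON hive.* TO 'hive'@'%' IDENTIFIED BY 'hive_password';
FLUSH PRIVILEGES;"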

cd /home/hadoop/hive-2.3.0/bin
./schematool -initSchema -dbType mysql

If you see output like the following, the initialization succeeded:

Starting metastore schema initialization to 2.3.0
Initialization script hive-schema-2.3.0.mysql.sql
Initialization script completed
schemaTool completed

Check the MySQL database

/usr/local/mysql/bin/mysql -uroot -p
mysql> show databases;
+--------------------+
| Database           |
+--------------------+
| information_schema |
| hive               |
| mysql              |
| performance_schema |
| sys                |
+--------------------+
5 rows in set (0.00 sec)

mysql> use hive;
Reading table information for completion of table and column names
You can turn off this feature to get a quicker startup with -A
Database changed

mysql> show tables;
(57 rows; the Tables_in_hive column lists, condensed here:)
AUX_TABLE, BUCKETING_COLS, CDS, COLUMNS_V2, COMPACTION_QUEUE, COMPLETED_COMPACTIONS,
COMPLETED_TXN_COMPONENTS, DATABASE_PARAMS, DBS, DB_PRIVS, DELEGATION_TOKENS, FUNCS,
FUNC_RU, GLOBAL_PRIVS, HIVE_LOCKS, IDXS, INDEX_PARAMS, KEY_CONSTRAINTS, MASTER_KEYS,
NEXT_COMPACTION_QUEUE_ID, NEXT_LOCK_ID, NEXT_TXN_ID, NOTIFICATION_LOG, NOTIFICATION_SEQUENCE,
NUCLEUS_TABLES, PARTITIONS, PARTITION_EVENTS, PARTITION_KEYS, PARTITION_KEY_VALS,
PARTITION_PARAMS, PART_COL_PRIVS, PART_COL_STATS, PART_PRIVS, ROLES, ROLE_MAP, SDS,
SD_PARAMS, SEQUENCE_TABLE, SERDES, SERDE_PARAMS, SKEWED_COL_NAMES, SKEWED_COL_VALUE_LOC_MAP,
SKEWED_STRING_LIST, SKEWED_STRING_LIST_VALUES, SKEWED_VALUES, SORT_COLS, TABLE_PARAMS,
TAB_COL_STATS, TBLS, TBL_COL_PRIVS, TBL_PRIVS, TXNS, TXN_COMPONENTS, TYPES, TYPE_FIELDS,
VERSION, WRITE_SET
57 rows in set (0.00 sec)

Start Hive

A simple test

Start Hive:

cd /home/hadoop/hive-2.3.0/bin
./hive

Create a database

hive>  create database ymq;
OK
Time taken: 0.742 seconds

Switch to the database

hive> use ymq;
OK
Time taken: 0.036 seconds

Create a table

hive> create table test (mykey string,myval string);
OK
Time taken: 0.569 seconds

Insert data

hive> insert into test values("1","www.ymq.io");
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
Query ID = hadoop_20170922011126_abadfa44-8ebe-4ffc-9615-4241707b3c03
Total jobs = 3
Launching Job 1 out of 3
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_1506006892375_0001, Tracking URL = http://node1:8088/proxy/application_1506006892375_0001/
Kill Command = /home/hadoop/hadoop-2.7.4//bin/hadoop job -kill job_1506006892375_0001
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
2017-09-22 01:12:12,763 Stage-1 map = 0%, reduce = 0%
2017-09-22 01:12:20,751 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 1.24 sec
MapReduce Total cumulative CPU time: 1 seconds 240 msec
Ended Job = job_1506006892375_0001
Stage-4 is selected by condition resolver.
Stage-3 is filtered out by condition resolver.
Stage-5 is filtered out by condition resolver.
Moving data to directory hdfs://node1:9000/user/hive/warehouse/ymq.db/test/.hive-staging_hive_2017-09-22_01-11-26_242_8022847052615616955-1/-ext-10000
Loading data to table ymq.test
MapReduce Jobs Launched:
Stage-Stage-1: Map: 1   Cumulative CPU: 1.24 sec   HDFS Read: 4056 HDFS Write: 77 SUCCESS
Total MapReduce CPU Time Spent: 1 seconds 240 msec
OK
Time taken: 56.642 seconds

Query the data

hive> select * from test;
OK
1    www.ymq.io
Time taken: 0.253 seconds, Fetched: 1 row(s)
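As a further sanity check, a minimal sketch of loading a small local file into a second table; the file name and its contents here are made up for illustration:

echo -e "1\twww.ymq.io\n2\texample.org" > /tmp/users.txt
hive -e "
create table if not exists ymq.test2 (mykey string, myval string)
row format delimited fields terminated by '\t';
load data local inpath '/tmp/users.txt' into table ymq.test2;
select * from ymq.test2;"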

Viewing the data in the web UI

You can see the newly written data in the HDFS web UI.


 

