一、准备环境: 创建Kafka Topic和HBase表
1. 在kerberos环境下创建Kafka Topic
1.1 因为kafka默认使用的协议为PLAINTEXT,在kerberos环境下需要变更其通信协议: 在${KAFKA_HOME}/config/producer.properties
和config/consumer.properties
下添加
security.protocol=SASL_PLAINTEXT
1.2 在执行前,需要在环境变量中添加KAFKA_OPT选项,否则kafka无法使用keytab:
export KAFKA_OPTS="$KAFKA_OPTS -Djava.security.auth.login.config=/usr/ndp/current/kafka_broker/conf/kafka_jaas.conf"
其中kafka_jaas.conf
内容如下:
cat /usr/ndp/current/kafka_broker/conf/kafka_jaas.conf KafkaServer { com.sun.security.auth.module.Krb5LoginModule required useKeyTab=true keyTab="/etc/security/keytabs/kafka.service.keytab" storeKey=true useTicketCache=false serviceName="kafka" principal="kafka/hzadg-mammut-platform3.server.163.org@BDMS.163.COM"; }; KafkaClient { com.sun.security.auth.module.Krb5LoginModule required useTicketCache=true renewTicket=true serviceName="kafka"; }; Client { com.sun.security.auth.module.Krb5LoginModule required useKeyTab=true keyTab="/etc/security/keytabs/kafka.service.keytab" storeKey=true useTicketCache=false serviceName="zookeeper" principal="kafka/hzadg-mammut-platform3.server.163.org@BDMS.163.COM"; };
1.3 创建新的topic:
bin/kafka-topics.sh --create --zookeeper hzadg-mammut-platform2.server.163.org:2181,hzadg-mammut-platform3.server.163.org:2181 --replication-factor 1 --partitions 1 --topic spark-test
1.4 创建生产者:
bin/kafka-console-producer.sh --broker-list hzadg-mammut-platform2.server.163.org:6667,hzadg-mammut-platform3.server.163.org:6667,hzadg-mammut-platform4.server.163.org:6667 --topic spark-test --producer.config ./config/producer.properties
1.5 测试消费者:
bin/kafka-console-consumer.sh --zookeeper hzadg-mammut-platform2.server.163.org:2181,hzadg-mammut-platform3.server.163.org:2181 --bootstrap-server hzadg-mammut-platform2.server.163.org:6667 --topic spark-test --from-beginning --new-consumer --consumer.config ./config/consumer.properties
2. 创建HBase表
2.1 kinit到hbase账号,否则无法创建hbase表
kinit -kt /etc/security/keytabs/hbase.service.keytab hbase/hzadg-mammut-platform2.server.163.org@BDMS.163.COM
./bin/hbase shell
> create 'recsys_logs', 'f'
二、编写Spark代码
编写简单的Spark Java程序,功能为: 从Kafka消费信息,同时将batch内统计的数量写入Hbase中,具体可以参考项目:
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.netease.spark.streaming.hbase; import com.netease.spark.utils.Consts; import com.netease.spark.utils.JConfig; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.client.HConnection; import org.apache.hadoop.hbase.client.HConnectionManager; import org.apache.hadoop.hbase.client.HTableInterface; import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.util.Bytes; import org.apache.kafka.clients.consumer.ConsumerRecord; import org.apache.kafka.common.serialization.StringDeserializer; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.VoidFunction; import org.apache.spark.streaming.Duration; import org.apache.spark.streaming.api.java.JavaDStream; import org.apache.spark.streaming.api.java.JavaInputDStream; import org.apache.spark.streaming.api.java.JavaStreamingContext; import org.apache.spark.streaming.kafka010.ConsumerStrategies; import org.apache.spark.streaming.kafka010.KafkaUtils; import org.apache.spark.streaming.kafka010.LocationStrategies; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.text.SimpleDateFormat; import java.util.Arrays; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; public class JavaKafkaToHBaseKerberos { private final static Logger LOGGER = LoggerFactory.getLogger(JavaKafkaToHBaseKerberos.class); private static HConnection connection = null; private static HTableInterface table = null; public static void openHBase(String tablename) throws IOException { Configuration conf = HBaseConfiguration.create(); synchronized (HConnection.class) { if (connection == null) connection = HConnectionManager.createConnection(conf); } synchronized (HTableInterface.class) { if (table == null) { table = connection.getTable("recsys_logs"); } } } public static void closeHBase() { if (table != null) try { table.close(); } catch (IOException e) { LOGGER.error("关闭 table 出错", e); } if (connection != null) try { connection.close(); } catch (IOException e) { LOGGER.error("关闭 connection 出错", e); } } public static void main(String[] args) throws Exception { String hbaseTable = JConfig.getInstance().getProperty(Consts.HBASE_TABLE); String kafkaBrokers = JConfig.getInstance().getProperty(Consts.KAFKA_BROKERS); String kafkaTopics = JConfig.getInstance().getProperty(Consts.KAFKA_TOPICS); String kafkaGroup = JConfig.getInstance().getProperty(Consts.KAFKA_GROUP); // open hbase try { openHBase(hbaseTable); } catch (IOException e) { LOGGER.error("建立HBase 连接失败", e); System.exit(-1); } SparkConf conf = new SparkConf().setAppName("JavaKafakaToHBase"); JavaStreamingContext ssc = new JavaStreamingContext(conf, new Duration(1000)); Set<String> topicsSet = new HashSet<>(Arrays.asList(kafkaTopics.split(","))); Map<String, Object> kafkaParams = new HashMap<>(); kafkaParams.put("bootstrap.servers", kafkaBrokers); kafkaParams.put("key.deserializer", StringDeserializer.class); kafkaParams.put("value.deserializer", StringDeserializer.class); kafkaParams.put("group.id", kafkaGroup); kafkaParams.put("auto.offset.reset", "earliest"); kafkaParams.put("enable.auto.commit", false); // 在kerberos环境下,这个配置需要增加 kafkaParams.put("security.protocol", "SASL_PLAINTEXT"); // Create direct kafka stream with brokers and topics final JavaInputDStream<ConsumerRecord<String, String>> stream = KafkaUtils.createDirectStream( ssc, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String>Subscribe(Arrays.asList(topicsSet.toArray(new String[0])), kafkaParams) ); JavaDStream<String> lines = stream.map(new Function<ConsumerRecord<String, String>, String>() { private static final long serialVersionUID = -1801798365843350169L; @Override public String call(ConsumerRecord<String, String> record) { return record.value(); } }).filter(new Function<String, Boolean>() { private static final long serialVersionUID = 7786877762996470593L; @Override public Boolean call(String msg) throws Exception { return msg.length() > 0; } }); JavaDStream<Long> nums = lines.count(); nums.foreachRDD(new VoidFunction<JavaRDD<Long>>() { private SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd HH:mm:ss"); @Override public void call(JavaRDD<Long> rdd) throws Exception { Long num = rdd.take(1).get(0); String ts = sdf.format(new Date()); Put put = new Put(Bytes.toBytes(ts)); put.add(Bytes.toBytes("f"), Bytes.toBytes("nums"), Bytes.toBytes(num)); table.put(put); } }); ssc.start(); ssc.awaitTermination(); closeHBase(); } }
三、 编译并在Yarn环境下运行
3.1 切到项目路径下,编译项目:
mvn clean package
3.2 运行Spark环境
- 由于executor需要访问kafka,所以需要将Kafka授权过的kerberos用户下发至executor中;
- 由于集群环境的hdfs也是kerberos加密的,需要通过spark.yarn.keytab/spark.yarn.principal配置可以访问Hdfs/HBase的keytab信息;
在项目目录下执行如下:
/usr/ndp/current/spark2_client/bin/spark-submit \ --files ./kafka_client_jaas.conf,./kafka.service.keytab \ --conf "spark.executor.extraJavaOptions=-Djava.security.auth.login.config=./kafka_client_jaas.conf" \ --driver-java-options "-Djava.security.auth.login.config=./kafka_client_jaas.conf" \ --conf spark.yarn.keytab=/etc/security/keytabs/hbase.service.keytab \ --conf spark.yarn.principal=hbase/hzadg-mammut-platform1.server.163.org@BDMS.163.COM \ --class com.netease.spark.streaming.hbase.JavaKafkaToHBaseKerberos \ --master yarn \ --deploy-mode client \ ./target/spark-demo-0.1.0-jar-with-dependencies.jar
其中kafka_client_jaas.conf
文件具体内容如下:
cat kafka_client_jaas.conf KafkaClient { com.sun.security.auth.module.Krb5LoginModule required useKeyTab=true renewTicket=true keyTab="./kafka.service.keytab" storeKey=true useTicketCache=false serviceName="kafka" principal="kafka/hzadg-mammut-platform1.server.163.org@BDMS.163.COM"; };
3.2 执行结果

