1. 寫在前面
flume-ng高可用常在大數據處理環節中第一個出現,對於處理日志文件有很好的作用。本篇博客將詳細介紹flume-ng的高可用負載均衡搭建。
2. flume-ng高可用負載均衡描述
在一般情況下,Flume-ng高可用采用server和client模式,client主要負責數據源source及數據流向端的sink指向配置,server主要負責數據流向sink詳細配置,client需要將server的信息統一管理,server和sink之間數據連接通過channels
3. 配置server,這里配置三個server
flume-server1.properties
# Collector agent "agent": one Avro source (r1), one memory channel (c1), one Kafka sink (k1)
agent.sources = r1
agent.sinks = k1
agent.channels = c1
# Avro source: receives events on ynjz003:52020 and hands them to c1
agent.sources.r1.channels = c1
agent.sources.r1.type = avro
agent.sources.r1.bind = ynjz003
agent.sources.r1.port = 52020
# Static interceptor: stamps every event with the header Collector=ynjz003
agent.sources.r1.interceptors = i1
agent.sources.r1.interceptors.i1.type = static
agent.sources.r1.interceptors.i1.key = Collector
agent.sources.r1.interceptors.i1.value = ynjz003
# Memory channel between source and sink (byteCapacity 134217728 = 128 MiB)
agent.channels.c1.type = memory
agent.channels.c1.capacity = 1024000
agent.channels.c1.transactionCapacity = 10000
agent.channels.c1.byteCapacity = 134217728
agent.channels.c1.byteCapacityBufferPercentage = 80
# Kafka sink: drains c1 into topic flume-kafka-meijs33
agent.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
agent.sinks.k1.channel = c1
agent.sinks.k1.brokerList = ynjz003:9092,ynjz004:9092,ynjz005:9092,ynjz006:9092,ynjz007:9092,ynjz008:9092,ynjz009:9092
agent.sinks.k1.topic = flume-kafka-meijs33
# NOTE(review): confirm the Flume version in use honours this un-prefixed producer property
agent.sinks.k1.serializer.class = kafka.serializer.StringEncoder
flume-server2.properties
# Collector agent "agent": one Avro source (r1), one memory channel (c1), one Kafka sink (k1)
agent.sources = r1
agent.sinks = k1
agent.channels = c1
# Avro source: receives events on ynjz004:52020 and hands them to c1
agent.sources.r1.channels = c1
agent.sources.r1.type = avro
agent.sources.r1.bind = ynjz004
agent.sources.r1.port = 52020
# Static interceptor: stamps every event with the header Collector=ynjz004
agent.sources.r1.interceptors = i1
agent.sources.r1.interceptors.i1.type = static
agent.sources.r1.interceptors.i1.key = Collector
agent.sources.r1.interceptors.i1.value = ynjz004
# Memory channel between source and sink (byteCapacity 134217728 = 128 MiB)
agent.channels.c1.type = memory
agent.channels.c1.capacity = 1024000
agent.channels.c1.transactionCapacity = 10000
agent.channels.c1.byteCapacity = 134217728
agent.channels.c1.byteCapacityBufferPercentage = 80
# Kafka sink: drains c1 into topic flume-kafka-meijs33
agent.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
agent.sinks.k1.channel = c1
agent.sinks.k1.brokerList = ynjz003:9092,ynjz004:9092,ynjz005:9092,ynjz006:9092,ynjz007:9092,ynjz008:9092,ynjz009:9092
agent.sinks.k1.topic = flume-kafka-meijs33
# NOTE(review): confirm the Flume version in use honours this un-prefixed producer property
agent.sinks.k1.serializer.class = kafka.serializer.StringEncoder
flume-server3.properties
# Collector agent "agent": one Avro source (r1), one memory channel (c1), one Kafka sink (k1)
agent.sources = r1
agent.sinks = k1
agent.channels = c1
# Avro source: receives events on ynjz005:52020 and hands them to c1
agent.sources.r1.channels = c1
agent.sources.r1.type = avro
agent.sources.r1.bind = ynjz005
agent.sources.r1.port = 52020
# Static interceptor: stamps every event with the header Collector=ynjz005
agent.sources.r1.interceptors = i1
agent.sources.r1.interceptors.i1.type = static
agent.sources.r1.interceptors.i1.key = Collector
agent.sources.r1.interceptors.i1.value = ynjz005
# Memory channel between source and sink (byteCapacity 134217728 = 128 MiB)
agent.channels.c1.type = memory
agent.channels.c1.capacity = 1024000
agent.channels.c1.transactionCapacity = 10000
agent.channels.c1.byteCapacity = 134217728
agent.channels.c1.byteCapacityBufferPercentage = 80
# Kafka sink: drains c1 into topic flume-kafka-meijs33
agent.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
agent.sinks.k1.channel = c1
agent.sinks.k1.brokerList = ynjz003:9092,ynjz004:9092,ynjz005:9092,ynjz006:9092,ynjz007:9092,ynjz008:9092,ynjz009:9092
agent.sinks.k1.topic = flume-kafka-meijs33
# NOTE(review): confirm the Flume version in use honours this un-prefixed producer property
agent.sinks.k1.serializer.class = kafka.serializer.StringEncoder
可以看出多個server配置的規律
3. 配置client,這里也配置一個client示例
flume-client.properties
# Client agent "agent": one source (r1), one memory channel (c1) and seven
# Avro sinks (k1..k7) grouped into a failover sink group (g1).
agent.channels = c1
agent.sources = r1
agent.sinks = k1 k2 k3 k4 k5 k6 k7
# sink group
agent.sinkgroups = g1
# Memory channel between the source and the sinks (byteCapacity 134217728 = 128 MiB)
agent.channels.c1.type = memory
agent.channels.c1.capacity = 102400
agent.channels.c1.transactionCapacity = 1000
agent.channels.c1.byteCapacity = 134217728
agent.channels.c1.byteCapacityBufferPercentage = 80
# Source: project-specific class reading from spoolDir
# NOTE(review): presumably a spooling-directory source for zip files; the
# property names below are interpreted by that custom class - confirm there.
agent.sources.r1.type = com.cbo.flume.source.zip.SpoolDirectorySource
agent.sources.r1.channels = c1
agent.sources.r1.spoolDir = /data/ynjz/workspace/zip
agent.sources.r1.fileHeader = true
agent.sources.r1.flumeBatchSize = 1000
agent.sources.r1.useFlumeEventFormat = false
agent.sources.r1.restart = true
agent.sources.r1.batchSize = 1000
agent.sources.r1.batchTimeout = 3000
# Each sink must be configured under its own name (k1..k7); reusing a name
# overwrites the earlier definition and leaves the later sinks unconfigured.
# sink1 -> ynjz003
agent.sinks.k1.channel = c1
agent.sinks.k1.type = avro
agent.sinks.k1.hostname = ynjz003
agent.sinks.k1.port = 52020
# sink2 -> ynjz004
agent.sinks.k2.channel = c1
agent.sinks.k2.type = avro
agent.sinks.k2.hostname = ynjz004
agent.sinks.k2.port = 52020
# sink3 -> ynjz005
agent.sinks.k3.channel = c1
agent.sinks.k3.type = avro
agent.sinks.k3.hostname = ynjz005
agent.sinks.k3.port = 52020
# sink4 -> ynjz006
agent.sinks.k4.channel = c1
agent.sinks.k4.type = avro
agent.sinks.k4.hostname = ynjz006
agent.sinks.k4.port = 52020
# sink5 -> ynjz007
agent.sinks.k5.channel = c1
agent.sinks.k5.type = avro
agent.sinks.k5.hostname = ynjz007
agent.sinks.k5.port = 52020
# sink6 -> ynjz008
agent.sinks.k6.channel = c1
agent.sinks.k6.type = avro
agent.sinks.k6.hostname = ynjz008
agent.sinks.k6.port = 52020
# sink7 -> ynjz009
agent.sinks.k7.channel = c1
agent.sinks.k7.type = avro
agent.sinks.k7.hostname = ynjz009
agent.sinks.k7.port = 52020
# Sink group membership
agent.sinkgroups.g1.sinks = k1 k2 k3 k4 k5 k6 k7
# Failover processor: priorities must be unique within the group; a larger
# value means higher priority, so k1 is preferred and k7 is the last resort.
agent.sinkgroups.g1.processor.type = failover
agent.sinkgroups.g1.processor.priority.k1 = 70
agent.sinkgroups.g1.processor.priority.k2 = 60
agent.sinkgroups.g1.processor.priority.k3 = 50
agent.sinkgroups.g1.processor.priority.k4 = 40
agent.sinkgroups.g1.processor.priority.k5 = 30
agent.sinkgroups.g1.processor.priority.k6 = 20
agent.sinkgroups.g1.processor.priority.k7 = 10
# Upper bound (ms) on the back-off applied to a failed sink
agent.sinkgroups.g1.processor.maxpenalty = 10000
這里需要注意sinkgroups配置:flume sinkgroups在常見的應用中有兩種方式,failover和load_balance。failover可以理解為容錯機制,在上面的配置中client同一時間只會往一個server寫入數據,當這個server掛了,failover機制會立即切換到另一個可用的server,所以這里的容錯機制很完善;但是面對大數據量時會影響數據寫入的能力,所以建議在大數據量的時候采用load_balance配置,下面是配置示例:
# Client agent "agent": one source (r1), one memory channel (c1) and seven
# Avro sinks (k1..k7) grouped into a load-balancing sink group (g1).
agent.channels = c1
agent.sources = r1
agent.sinks = k1 k2 k3 k4 k5 k6 k7
# sink group
agent.sinkgroups = g1
# Memory channel between the source and the sinks (byteCapacity 134217728 = 128 MiB)
agent.channels.c1.type = memory
agent.channels.c1.capacity = 102400
agent.channels.c1.transactionCapacity = 24000
agent.channels.c1.byteCapacity = 134217728
agent.channels.c1.byteCapacityBufferPercentage = 80
# Source: project-specific class reading from spoolDir
# NOTE(review): presumably a spooling-directory source for zip files; the
# property names below are interpreted by that custom class - confirm there.
agent.sources.r1.type = com.cbo.flume.source.zip.SpoolDirectorySource
agent.sources.r1.channels = c1
agent.sources.r1.spoolDir = /data/4G
# The backslash is doubled because the properties loader consumes one level
# of escaping; a single "\." would reach the source as ".", matching any char.
agent.sources.r1.includePattern = ([^ ]*\\.zip$)
agent.sources.r1.fileHeader = true
agent.sources.r1.flumeBatchSize = 10000
agent.sources.r1.useFlumeEventFormat = false
agent.sources.r1.restart = true
agent.sources.r1.batchSize = 10000
agent.sources.r1.batchTimeout = 3000
# sink1 -> ynjz003
agent.sinks.k1.channel = c1
agent.sinks.k1.type = avro
agent.sinks.k1.hostname = ynjz003
agent.sinks.k1.port = 52020
# sink2 -> ynjz004
agent.sinks.k2.channel = c1
agent.sinks.k2.type = avro
agent.sinks.k2.hostname = ynjz004
agent.sinks.k2.port = 52020
# sink3 -> ynjz005
agent.sinks.k3.channel = c1
agent.sinks.k3.type = avro
agent.sinks.k3.hostname = ynjz005
agent.sinks.k3.port = 52020
# sink4 -> ynjz006
agent.sinks.k4.channel = c1
agent.sinks.k4.type = avro
agent.sinks.k4.hostname = ynjz006
agent.sinks.k4.port = 52020
# sink5 -> ynjz007
agent.sinks.k5.channel = c1
agent.sinks.k5.type = avro
agent.sinks.k5.hostname = ynjz007
agent.sinks.k5.port = 52020
# sink6 -> ynjz008
agent.sinks.k6.channel = c1
agent.sinks.k6.type = avro
agent.sinks.k6.hostname = ynjz008
agent.sinks.k6.port = 52020
# sink7 -> ynjz009
agent.sinks.k7.channel = c1
agent.sinks.k7.type = avro
agent.sinks.k7.hostname = ynjz009
agent.sinks.k7.port = 52020
# Sink group membership
agent.sinkgroups.g1.sinks = k1 k2 k3 k4 k5 k6 k7
# Load-balancing processor: random sink selection, with back-off for failed sinks
agent.sinkgroups.g1.processor.type = load_balance
agent.sinkgroups.g1.processor.backoff = true
agent.sinkgroups.g1.processor.selector = random
在實際應用中多個client的配置基本上一致,只有監控文件目錄的配置不同,即 agent.sources.r1.spoolDir = /data/4G
4. 啟動flume-ng高可用集群
首先啟動每個server,各個server只有配置文件(這里是 flume-server-data.properties)不同:
# nohup keeps the agent running after the launching shell / SSH session closes
nohup ./bin/flume-ng agent --name agent --conf conf --conf-file conf/flume-server-data.properties -Dflume.root.logger=INFO,console > /data/ynjz/workspace/flume-server-data.log 2>&1 &
然後啟動每個client,各個client只有配置文件(這里是 flume-client-data.properties)不同:
# nohup keeps the agent running after the launching shell / SSH session closes
nohup ./bin/flume-ng agent --name agent --conf conf --conf-file conf/flume-client-data.properties -Dflume.root.logger=INFO,console > /data/ynjz/workspace/flume-client-data.log 2>&1 &
在平時應用中,可以隨時停止client;但如果server已停止或尚未啟動就啟動client,會導致client報錯,所以應先啟動server再啟動client。