Hadoop errors when multiple threads run MR jobs concurrently


 

At night, many jobs run in parallel, and a few of them fail at random. Check the logs, filtering on the keywords "Caused by" or "FAILED":

cat -n ads_channel.log | grep "Caused by"
  7732  Caused by: java.util.concurrent.ExecutionException: java.io.IOException: Rename cannot overwrite non empty destination directory /tmp/hadoop-hdfs/mapred/local/1576781334421
  7737  Caused by: java.io.IOException: Rename cannot overwrite non empty destination directory /tmp/hadoop-hdfs/mapred/local/1576781334421
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
java.io.IOException: java.util.concurrent.ExecutionException: org.apache.hadoop.fs.FileAlreadyExistsException: rename destination /tmp/hadoop-hdfs/mapred/local/1579374502408 already exists.
    at org.apache.hadoop.mapred.LocalDistributedCacheManager.setup(LocalDistributedCacheManager.java:143)
    at org.apache.hadoop.mapred.LocalJobRunner$Job.<init>(LocalJobRunner.java:171)
    at org.apache.hadoop.mapred.LocalJobRunner.submitJob(LocalJobRunner.java:758)
    at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:244)
    at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1307)
    at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1304)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:422)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1920)
    at org.apache.hadoop.mapreduce.Job.submit(Job.java:1304)
    at org.apache.hadoop.mapred.JobClient$1.run(JobClient.java:578)
    at org.apache.hadoop.mapred.JobClient$1.run(JobClient.java:573)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:422)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1920)
    at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:573)
    at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java:564)
    at org.apache.hadoop.hive.ql.exec.mr.ExecDriver.execute(ExecDriver.java:436)
    at org.apache.hadoop.hive.ql.exec.mr.MapRedTask.execute(MapRedTask.java:142)
    at org.apache.hadoop.hive.ql.exec.Task.executeTask(Task.java:214)
    at org.apache.hadoop.hive.ql.exec.TaskRunner.runSequential(TaskRunner.java:99)
    at org.apache.hadoop.hive.ql.Driver.launchTask(Driver.java:2052)
    at org.apache.hadoop.hive.ql.Driver.execute(Driver.java:1748)
    at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1501)
    at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1285)
    at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1275)
    at org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:226)
    at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:175)
    at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:389)
    at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:324)
    at org.apache.hadoop.hive.cli.CliDriver.executeDriver(CliDriver.java:726)
    at org.apache.hadoop.hive.cli.CliDriver.run(CliDriver.java:699)
    at org.apache.hadoop.hive.cli.CliDriver.main(CliDriver.java:634)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.hadoop.util.RunJar.run(RunJar.java:221)
    at org.apache.hadoop.util.RunJar.main(RunJar.java:136)
Caused by: java.util.concurrent.ExecutionException: org.apache.hadoop.fs.FileAlreadyExistsException: rename destination /tmp/hadoop-hdfs/mapred/local/1579374502408 already exists.
    at java.util.concurrent.FutureTask.report(FutureTask.java:122)
    at java.util.concurrent.FutureTask.get(FutureTask.java:192)
    at org.apache.hadoop.mapred.LocalDistributedCacheManager.setup(LocalDistributedCacheManager.java:139)
    ... 38 more
Caused by: org.apache.hadoop.fs.FileAlreadyExistsException: rename destination /tmp/hadoop-hdfs/mapred/local/1579374502408 already exists.
    at org.apache.hadoop.fs.FileSystem.rename(FileSystem.java:1310)
    at org.apache.hadoop.fs.DelegateToFileSystem.renameInternal(DelegateToFileSystem.java:193)
    at org.apache.hadoop.fs.AbstractFileSystem.renameInternal(AbstractFileSystem.java:744)
    at org.apache.hadoop.fs.FilterFs.renameInternal(FilterFs.java:236)
    at org.apache.hadoop.fs.AbstractFileSystem.rename(AbstractFileSystem.java:674)
    at org.apache.hadoop.fs.FileContext.rename(FileContext.java:932)
    at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:369)
    at org.apache.hadoop.yarn.util.FSDownload.call(FSDownload.java:60)
    at java.util.concurrent.FutureTask.run(FutureTask.java:266)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)
Job Submission failed with exception 'java.io.IOException(java.util.concurrent.ExecutionException: org.apache.hadoop.fs.FileAlreadyExistsException: rename destination /tmp/hadoop-hdfs/mapred/local/1579374502408 already exists.)'
FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.mr.MapRedTask

 

Extras: handy log-searching commands

cat -n ads_channel.log | grep "Caused by"
grep -e "Caused by" ads_channel.log
grep -E "Caused by|FAILED" ads_channel.log            # two keywords at once
grep "2019-12-21" ads_channel.log | grep "Caused by"  # narrow by date first
cat ads_channel.log | grep "Caused by" -B 10          # show the 10 lines before each match
cat ads_channel.log | grep "Caused by" -A 10          # show the 10 lines after each match
cat ads_channel.log | grep "Caused by" -C 10          # show the 10 lines before and after each match

Notes: -A means After the match, -B means Before the match, -C means Context on both sides.

vim ads_channel.log
:set nu        (show line numbers)
:7749          (jump to that line)

Follow the log in real time, filtering on multiple keywords:
tail -f ads_channel.log | grep -E "Caused by|FAILED"

Root cause:

This is a known Hadoop bug that surfaces when multiple threads or processes submit MR jobs concurrently:
        https://issues.apache.org/jira/browse/MAPREDUCE-6992
        https://issues.apache.org/jira/browse/MAPREDUCE-6441

During job submission a local directory named after the current millisecond timestamp is created (e.g. /tmp/hadoop-hdfs/mapred/local/1576781334421 in the stack trace above). When two MR jobs are submitted within the same millisecond, both compute the same directory name, and the concurrent rename onto it fails.
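As a rough illustration of the race (a hypothetical shell sketch using GNU mv and a stand-in path, not the actual Hadoop code path): two jobs that read the clock in the same millisecond compute the same destination directory, the first rename wins, and the second fails just like the log above.

base=/tmp/demo/mapred/local            # hypothetical stand-in for /tmp/hadoop-hdfs/mapred/local
ts=1576781334421                       # both jobs read the same millisecond timestamp
mkdir -p "$base" job1_staging job2_staging
touch job1_staging/part-00000 job2_staging/part-00000
mv -T job1_staging "$base/$ts"         # job 1: rename succeeds
mv -T job2_staging "$base/$ts"         # job 2 fails: "cannot move ... Directory not empty"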

MapReduce execution modes:

1-Local mode (implemented by LocalJobRunner)
When mapreduce.framework.name is set to local, the job does not request resources from the YARN cluster; it executes on the local node instead. Jobs run in local mode cannot exploit the cluster's parallelism. Note: local-mode jobs do not show up in the web UI.
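As a quick aside (a hedged sketch; in the Hive CLI, "set <property>;" with no value prints the current setting), you can check or force the framework for a session:

set mapreduce.framework.name;          -- print the current value (yarn or local)
set mapreduce.framework.name=yarn;     -- force cluster execution for this session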

Anyone with some Hive experience knows that Hive ultimately compiles SQL statements into distributed MapReduce job plans. For large datasets the time spent launching MapReduce is negligible: the data is big and spread across different machines, and processing it where it lives is one of Hive's strengths. For small datasets that sit on a single machine, however, local mode is very useful: instead of unavoidably launching a distributed job, the data is pulled back to the client and processed locally, avoiding the cost of merging distributed partial results. Small queries can therefore run locally, which is much faster than submitting them to the cluster.
Enabling local mode involves the following parameters:

hive.exec.mode.local.auto — whether Hive automatically decides, based on input size, to run the job locally.
hive.exec.mode.local.auto.inputbytes.max — maximum input size; local mode is chosen when the input is smaller than this value (default 128MB).
hive.exec.mode.local.auto.tasks.max — maximum number of input files; local mode is chosen when the number of input files is below this value (default 4).

 

A job actually runs in local mode only when all of the following hold (see the sketch below):

  1. The job's input size is smaller than hive.exec.mode.local.auto.inputbytes.max (default 128MB).
  2. The job's number of map tasks is smaller than hive.exec.mode.local.auto.tasks.max (default 4).
  3. The job's number of reduce tasks is 0 or 1.
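A minimal sketch of the knobs above as they would be set in the Hive CLI (the values shown are the defaults quoted in this section; 134217728 bytes = 128MB):

set hive.exec.mode.local.auto=true;                       -- let Hive decide automatically
set hive.exec.mode.local.auto.inputbytes.max=134217728;   -- input-size threshold (128MB)
set hive.exec.mode.local.auto.tasks.max=4;                -- max number of input files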

 

2-YARN mode (implemented by YARNRunner)
        When the client sets mapreduce.framework.name to yarn, it uses YARNRunner to communicate with the server side. YARNRunner is in turn implemented on top of ClientRMProtocol, which talks to the ResourceManager to submit applications, query their status, and so on. Depending on the job's characteristics, the job then executes in one of two ways:

3-Uber mode:

        A mode designed to reduce latency for small jobs: every task, Map Task or Reduce Task alike, runs sequentially inside a single Container, which is in fact the Container hosting the MRAppMaster.

4-Non-Uber mode:

        For long-running large jobs, resources are requested for Map Tasks first; only after the number of completed Map Tasks reaches a certain ratio are resources requested for Reduce Tasks.
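As a hedged aside, whether a small job is allowed to run uber-ized is governed by standard MR properties; the names and defaults below are from stock Hadoop 2.x and can be passed from the Hive CLI like any other job property:

set mapreduce.job.ubertask.enable=true;     -- off by default; allow small jobs to run inside the AM container
set mapreduce.job.ubertask.maxmaps=9;       -- at most 9 map tasks to qualify
set mapreduce.job.ubertask.maxreduces=1;    -- at most 1 reduce task to qualify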

Solutions:

    1-Without changing any source code, disable automatic local mode; depending on the cluster environment, set it temporarily when the job runs:

set hive.exec.mode.local.auto=false;
    2-Configure failure retries in the scheduler.
An Azkaban retry configuration looks like this:
type=command
command=xxxxxx
retries=3
retry.backoff=60000   # milliseconds

Reference: https://blog.csdn.net/weixin_39445556/article/details/80348976

This bug is tracked upstream and was fixed in release 2.7.1, so upgrading the cluster also resolves it:

This is a bug in Hadoop 2.6.0. It's been marked as fixed but it still happens occasionally (see: https://issues.apache.org/jira/browse/YARN-2624).

https://stackoverflow.com/questions/30857413/hadoop-complains-about-attempting-to-overwrite-nonempty-destination-directory

 

[hdfs@el-hadoop-1 logs]$ hadoop dfsadmin -report   ## check cluster status:
DEPRECATED: Use of this script to execute hdfs command is deprecated.
Instead use the hdfs command for it.

Configured Capacity: 1242537227061 (1.13 TB)
Present Capacity: 1154802876345 (1.05 TB)
DFS Remaining: 1125514018745 (1.02 TB)
DFS Used: 29288857600 (27.28 GB)
DFS Used%: 2.54%
Under replicated blocks: 0
Blocks with corrupt replicas: 0
Missing blocks: 0
Missing blocks (with replication factor 1): 0

-------------------------------------------------
Live datanodes (3):

Name: 172.26.0.106:50010 (el-hadoop-1)
Hostname: el-hadoop-1
Rack: /default
Decommission Status : Normal
Configured Capacity: 414179075687 (385.73 GB)
DFS Used: 9740627968 (9.07 GB)
Non DFS Used: 22051710567 (20.54 GB)
DFS Remaining: 360492523769 (335.73 GB)
DFS Used%: 2.35%
DFS Remaining%: 87.04%
Configured Cache Capacity: 4294967296 (4 GB)
Cache Used: 0 (0 B)
Cache Remaining: 4294967296 (4 GB)
Cache Used%: 0.00%
Cache Remaining%: 100.00%
Xceivers: 8
Last contact: Sat Dec 21 11:29:07 CST 2019


Name: 172.26.0.108:50010 (el-hadoop-2)
Hostname: el-hadoop-2
Rack: /default
Decommission Status : Normal
Configured Capacity: 414179075687 (385.73 GB)
DFS Used: 9774043136 (9.10 GB)
Non DFS Used: 0 (0 B)
DFS Remaining: 382510819168 (356.24 GB)
DFS Used%: 2.36%
DFS Remaining%: 92.35%
Configured Cache Capacity: 4294967296 (4 GB)
Cache Used: 0 (0 B)
Cache Remaining: 4294967296 (4 GB)
Cache Used%: 0.00%
Cache Remaining%: 100.00%
Xceivers: 8
Last contact: Sat Dec 21 11:29:06 CST 2019


Name: 172.26.0.109:50010 (el-hadoop-3)
Hostname: el-hadoop-3
Rack: /default
Decommission Status : Normal
Configured Capacity: 414179075687 (385.73 GB)
DFS Used: 9774186496 (9.10 GB)
Non DFS Used: 0 (0 B)
DFS Remaining: 382510675808 (356.24 GB)
DFS Used%: 2.36%
DFS Remaining%: 92.35%
Configured Cache Capacity: 4294967296 (4 GB)
Cache Used: 0 (0 B)
Cache Remaining: 4294967296 (4 GB)
Cache Used%: 0.00%
Cache Remaining%: 100.00%
Xceivers: 8
Last contact: Sat Dec 21 11:29:08 CST 2019

