更新hadoop fs 命令實現:
[ss@db csv]$ hadoop fs -count /my_rc/my_hive_db/*
18/01/14 15:40:19 INFO hdfs.PeerCache: SocketCache disabled.
3      2          0 /my_rc/my_hive_db/.hive-staging_hive_2017-08-19_16-52-39_153_7217997288202811839-170149
2      0          0 /my_rc/my_hive_db/.hive-staging_hive_2018-01-03_15-23-10_240_5147839610865108930-52517
1      0          0 /my_rc/my_hive_db/BusinessGtUser
4      1     321008 /my_rc/my_hive_db/ZJ2_SenseSta
1      1        143 /my_rc/my_hive_db/anthgain
1      1      27228 /my_rc/my_hive_db/anthgainpoint
1      1         70 /my_rc/my_hive_db/antvgain
1      1      27429 /my_rc/my_hive_db/antvgainpoint
通過hadoop fs -du 或者 hadoop fs -count只能統計指定的某個hdfs路徑(hive表目錄)的總文件個數及文件的大小,但是通過hadoop命令沒有辦法實現批量處理hive中多個表一次進行統計,如果一次性統計多個hive表目錄的文件個數、文件總大小只能通過java程序使用hadoop api實現。
package com.my.hdfsopt; import java.io.FileNotFoundException; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; public class HdfsPathMonitor { // submit shell /* * main類的路徑不需要指定,否則會被認為是參數傳遞進入。 * yarn jar /app/m_user1/service/Hangzhou_HdfsFileMananger.jar /hive_tenant_account/hivedbname/ */ public static void main(String[] args) throws Exception { System.out.println("the args is " + String.join(",", args)); String dirPath = args[0]; Configuration conf = new Configuration(); /* * <property> <name>fs.defaultFS</name> <value>hdfs://mycluster</value> * </property> */ conf.set("fs.defaultFS", "hdfs://mycluster"); FileSystem fileSystem = FileSystem.get(conf); Path path = new Path(dirPath); // 獲取文件列表 FileStatus[] files = fileSystem.listStatus(path); if (files == null || files.length == 0) { throw new FileNotFoundException("Cannot access " + dirPath + ": No such file or directory."); } System.out.println("dirpath \t total file size \t total file count"); for (int i = 0; i < files.length; i++) { String pathStr = files[i].getPath().toString(); FileSystem fs = files[i].getPath().getFileSystem(conf); long totalSize = fs.getContentSummary(files[i].getPath()).getLength(); long totalFileCount = listAll(conf, files[i].getPath()); fs.close(); System.out.println(("".equals(pathStr) ? "." 
: pathStr) + "\t" + totalSize + "\t" + totalFileCount); } } /** * @Title: listAll @Description: 列出目錄下所有文件 @return void 返回類型 @throws */ public static Long listAll(Configuration conf, Path path) throws IOException { long totalFileCount = 0; FileSystem fs = FileSystem.get(conf); if (fs.exists(path)) { FileStatus[] stats = fs.listStatus(path); for (int i = 0; i < stats.length; ++i) { if (!stats[i].isDir()) { // regular file // System.out.println(stats[i].getPath().toString()); totalFileCount++; } else { // dir // System.out.println(stats[i].getPath().toString()); totalFileCount += listAll(conf, stats[i].getPath()); } } } fs.close(); return totalFileCount; } }
執行命令:
yarn jar /app/m_user1/tommyduan_service/Hangzhou_HdfsFileMananger.jar /hive_tenant_account/hivedbname/
執行結果: