不多說,直接上代碼。
代碼版本1
1 package zhouls.bigdata.myWholeHadoop.HDFS.hdfs7; 2 3 import java.io.IOException; 4 import java.net.URI; 5 import java.net.URISyntaxException; 6 import org.apache.hadoop.conf.Configuration; 7 import org.apache.hadoop.fs.FSDataInputStream; 8 import org.apache.hadoop.fs.FSDataOutputStream; 9 import org.apache.hadoop.fs.FileStatus; 10 import org.apache.hadoop.fs.FileSystem; 11 import org.apache.hadoop.fs.FileUtil; 12 import org.apache.hadoop.fs.Path; 13 import org.apache.hadoop.fs.PathFilter; 14 import org.apache.hadoop.io.IOUtils; 15 /** 16 * function 合並小文件至 HDFS 17 * 18 * 19 */ 20 public class MergeSmallFilesToHDFS 21 { 22 private static FileSystem fs = null; //定義文件系統對象,是HDFS上的 23 private static FileSystem local = null; //定義文件系統對象,是本地上的 24 25 /** 26 * @function main 27 * @param args 28 * @throws IOException 29 * @throws URISyntaxException 30 */ 31 32 public static void main(String[] args) throws IOException,URISyntaxException{ 33 34 list(); 35 } 36 37 /** 38 * 39 * @throws IOException 40 * @throws URISyntaxException 41 */ 42 public static void list() throws IOException, URISyntaxException{ 43 // 讀取hadoop配置文件 44 Configuration conf = new Configuration(); 45 // 文件系統訪問接口和創建FileSystem對象,在本地上運行模式 46 URI uri = new URI("hdfs://HadoopMaster:9000"); 47 fs = FileSystem.get(uri, conf); 48 // 獲得本地文件系統 49 local = FileSystem.getLocal(conf); 50 // 過濾目錄下的 svn 文件 51 FileStatus[] dirstatus = local.globStatus(new Path("./data/mergeSmallFilesToHDFS/73/*"),new RegexExcludePathFilter("^.*svn$")); 52 // FileStatus[] dirstatus = local.globStatus(new Path("D://data/73/*"),new RegexExcludePathFilter("^.*svn$")); 53 //獲取D:\Data\tvdata目錄下的所有文件路徑 54 Path[] dirs = FileUtil.stat2Paths(dirstatus); 55 FSDataOutputStream out = null; 56 FSDataInputStream in = null; 57 for (Path dir : dirs) 58 {//比如拿2012-09-17為例 59 //將文件夾名稱2012-09-17的-去掉,直接,得到20120901文件夾名稱 60 String fileName = dir.getName().replace("-", "");//文件名稱 61 //只接受20120917日期目錄下的.txt文件 62 FileStatus[] localStatus = local.globStatus(new Path(dir+"/*"),new RegexAcceptPathFilter("^.*txt$")); 63 // 獲得20120917日期目錄下的所有文件 64 Path[] listedPaths = FileUtil.stat2Paths(localStatus); 65 // 輸出路徑 66 Path block = new Path("hdfs://HadoopMaster:9000/middle/tv/"+ fileName + ".txt"); 67 System.out.println("合並后的文件名稱:"+fileName+".txt"); 68 // 打開輸出流 69 out = fs.create(block); 70 //循環20120917日期目錄下的所有文件 71 for (Path p : listedPaths){//這是星型for循環,即listedPaths的值傳給Path p 72 in = local.open(p);// 打開輸入流 73 IOUtils.copyBytes(in, out, 4096, false); // 復制數據 74 // 關閉輸入流 75 in.close(); 76 } 77 if (out != null){ 78 // 關閉輸出流 79 out.close(); 80 } 81 //當循環完20120917日期目錄下的所有文件之后,接着依次20120918,20120919,,, 82 } 83 } 84 85 /** 86 * 87 * @function 過濾 regex 格式的文件 88 * 89 */ 90 public static class RegexExcludePathFilter implements PathFilter{ 91 private final String regex; 92 93 public RegexExcludePathFilter(String regex){ 94 this.regex = regex; 95 } 96 97 98 public boolean accept(Path path){ 99 // TODO Auto-generated method stub 100 boolean flag = path.toString().matches(regex); 101 return !flag; 102 } 103 104 } 105 106 /** 107 * 108 * @function 接受 regex 格式的文件 109 * 110 */ 111 public static class RegexAcceptPathFilter implements PathFilter{ 112 private final String regex; 113 114 public RegexAcceptPathFilter(String regex){ 115 this.regex = regex; 116 } 117 118 119 public boolean accept(Path path){ 120 // TODO Auto-generated method stub 121 boolean flag = path.toString().matches(regex); 122 return flag; 123 } 124 125 } 126 }
代碼版本2
1 package com.dajiangtai.Hadoop.HDFS; 2 3 import java.io.IOException; 4 import java.net.URI; 5 import java.net.URISyntaxException; 6 import org.apache.hadoop.conf.Configuration; 7 import org.apache.hadoop.fs.FSDataInputStream; 8 import org.apache.hadoop.fs.FSDataOutputStream; 9 import org.apache.hadoop.fs.FileStatus; 10 import org.apache.hadoop.fs.FileSystem; 11 import org.apache.hadoop.fs.FileUtil; 12 import org.apache.hadoop.fs.Path; 13 import org.apache.hadoop.fs.PathFilter; 14 import org.apache.hadoop.hdfs.DistributedFileSystem; 15 import org.apache.hadoop.io.IOUtils; 16 /** 17 * function 合並小文件至 HDFS , 文件與塊大小(比如128M)來比,小的話,稱為小文件。是一個相對概念!相對於數據塊而言的! 18 * @author 小講 19 * 我們利用通配符和PathFilter 對象,將本地多種格式的文件上傳至 HDFS文件系統,並過濾掉 txt文本格式以外的文件。 20 */ 21 public class MergeSmallFilesToHDFS { 22 private static FileSystem fs = null; 23 private static FileSystem local = null; 24 /** 25 * @function main 26 * @param args 27 * @throws IOException 28 * @throws URISyntaxException 29 */ 30 public static void main(String[] args) throws IOException, 31 URISyntaxException { 32 list(); 33 } 34 35 /** 36 * 37 * @throws IOException 38 * @throws URISyntaxException 39 */ 40 public static void list() throws IOException, URISyntaxException { 41 // 讀取hadoop文件系統的配置 42 Configuration conf = new Configuration(); 43 // conf=Configuration 44 // conf是Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml 45 46 //文件系統訪問接口 47 URI uri = new URI("hdfs://djt002:9000"); 48 // uri=URI 49 // uri是hdfs://djt002:9000 50 51 // URL、URI與Path三者的區別 52 // Hadoop文件系統中通過Hadoop Path對象來代表一個文件 53 // URL(相當於絕對路徑) -> (文件) -> URI(相當於相對路徑,即代表URL前面的那一部分) 54 // URI:如hdfs://dajiangtai:9000 55 // 如,URL.openStream 56 57 58 59 //獲得FileSystem實例,即HDFS 60 fs = FileSystem.get(uri, conf); 61 // fs=DistributedFileSystem 62 // fs是DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_1814566850_1, ugi=Administrator (auth:SIMPLE)]] 63 64 //獲得FileSystem實例,即Local 65 local = FileSystem.getLocal(conf); 66 // local=LocalFileSystem 67 // local是org.apache.hadoop.fs.LocalFileSystem@3ce1b8c5 68 // 為什么要獲取到Local呢,因為,我們要把本地D盤下data/73目錄下的文件要合並后,上傳到HDFS里,所以,我們需先獲取到Local,再來做合並工作啦! 69 70 71 // 18、列出文件或目錄內容(主要是存放文件或目錄的元數據,即大小,權限,副本,,,) 72 // public FileStatus[] listStatus(Path f) throws IOException 73 // public FileStatus[] listStatus(Path f,PathFilter filter) throws IOException 74 // PathFilter是路徑過濾器 75 // public FileStatus[] listStatus(Path[] files) throws IOException 76 // public FileStatus[] listStatus(Path[] files,PathFilter filter) 77 // 傳送Path數組和路徑過濾器 78 // 79 // 80 // 19、FileUtil中的stat2Paths(),將一個FileStatus元數據對象數組轉換為一個Path對象數組 81 // 82 // 20、(1)使用通配符來匹配多個目錄下的多個文件,也是列出文件或目錄內容(主要是存放文件或目錄的元數據,即大小,權限,副本,,,) 83 // public FileStatus[] globStatus(Path pathPattern) throws IOException 84 // public FileStatus[] globStatus(Path pathPattern,PathFilter filter) throws IOException 85 // 86 // (2)PathFilter對象 87 // public interface PathFilter{ 88 // boolean accpet(Path path); 89 // } 90 91 92 93 //過濾目錄下的 svn 文件,globStatus從第一個參數通配符合到文件,剔除滿足第二個參數到結果,因為PathFilter中accept是return! 94 FileStatus[] dirstatus = local.globStatus(new Path("D://data/73/*"),new RegexExcludePathFilter("^.*svn$"));//一般這是隱藏文件,所以得排除 95 //dirstatus=FileStatus[7] 96 // dirstatus是[DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17; isDirectory=true; modification_time=1427791478002; access_time=0; owner=; group=; permission=rwxrwxrwx; isSymlink=false} 97 // , DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18; isDirectory=true; modification_time=1427791505373; access_time=0; owner=; group=; permission=rwxrwxrwx; isSymlink=false} 98 // , DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-19; isDirectory=true; modification_time=1427791532277; access_time=0; owner=; group=; permission=rwxrwxrwx; isSymlink=false} 99 // , DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-20; isDirectory=true; modification_time=1427791553035; access_time=0; owner=; group=; permission=rwxrwxrwx; isSymlink=false} 100 // , DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-21; isDirectory=true; modification_time=1427791577709; access_time=0; owner=; group=; permission=rwxrwxrwx; isSymlink=false} 101 // , DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-22; isDirectory=true; modification_time=1427791602770; access_time=0; owner=; group=; permission=rwxrwxrwx; isSymlink=false} 102 // , DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-23; isDirectory=true; modification_time=1427791647177; access_time=0; owner=; group=; permission=rwxrwxrwx; isSymlink=false}] 103 104 105 // ^表示匹配我們字符串開始的位置 *代表0到多個字符 $代表字符串結束的位置 106 // RegexExcludePathFilter來只排除我們不需要的,即svn格式 107 // RegexExcludePathFilter這個方法我們自己寫 108 109 // 但是我們,最終是要處理文件里的東西,最終是要轉成Path類型,因為Path對象f,它對應着一個文件。 110 111 //獲取73目錄下的所有文件路徑,注意FIleUtil中stat2Paths()的使用,它將一個FileStatus對象數組轉換為Path對象數組。 112 Path[] dirs = FileUtil.stat2Paths(dirstatus);//dirstatus是FileStatus數組類型 113 // dirs=Path[7] 114 // dirs是 [file:/D:/data/73/2012-09-17 115 // , file:/D:/data/73/2012-09-18 116 // , file:/D:/data/73/2012-09-19 117 // , file:/D:/data/73/2012-09-20 118 // , file:/D:/data/73/2012-09-21 119 // , file:/D:/data/73/2012-09-22 120 // , file:/D:/data/73/2012-09-23] 121 122 123 FSDataOutputStream out = null;//輸出流 124 // out=HdfsDaDataOutputStream 125 // out是org.apache.hadoop.hdfs.client.HdfsDataOutputStream@2b11624e 126 127 FSDataInputStream in = null;//輸入流 128 // in=ChecksumFileSystem&FSDataBoundedInputStream 129 // in是org.apache.hadoop.fs.ChecksumFileSystem$FSDataBoundedInputStream@526d542f 130 131 // 很多人搞不清輸入流和輸出流,!!!! 132 // 其實啊,輸入流、輸出流都是針對內存的 133 // 往內存里寫,是輸入流。 134 // 內存往文件里寫,是輸出Luis。 135 // 136 // 比如一個文件A復制到另一文件B,那么,先寫到內存里,再寫到文件B。 137 // => 則文件A寫到內存里,叫輸入流。 138 // => 則內存里寫到文件B,叫輸出流 139 140 141 for (Path dir : dirs) {//for星型循環,即將dirs是Path對象數組,一一傳給Path dir 142 // dirs=Path[7] 143 // dirs是[file:/D:/data/73/2012-09-17 144 // , file:/D:/data/73/2012-09-18 145 // , file:/D:/data/73/2012-09-19 146 // , file:/D:/data/73/2012-09-20 147 // , file:/D:/data/73/2012-09-21 148 // , file:/D:/data/73/2012-09-22 149 // , file:/D:/data/73/2012-09-23] 150 151 // dir= Path 152 // 先傳,dir是file:/D:/data/73/2012-09-17 153 // 再傳,file:/D:/data/73/2012-09-18 154 // 再傳,file:/D:/data/73/2012-09-19 155 // 再傳,file:/D:/data/73/2012-09-20 156 // 再傳,file:/D:/data/73/2012-09-21 157 // 再傳,file:/D:/data/73/2012-09-22 158 // 再傳,file:/D:/data/73/2012-09-23 159 160 String fileName = dir.getName().replace("-", "");//文件名稱 161 // 先獲取到如2012-09-17,然后經過replace("-", ""),得到20120917 162 // 再獲取,20120918 163 // 再獲取,20120919 164 // 再獲取,20120920 165 // 再獲取,20120921 166 // 再獲取,20120922 167 // 再獲取,20120923 168 169 //只接受日期目錄下的.txt文件,^匹配輸入字符串的開始位置,$匹配輸入字符串的結束位置,*匹配0個或多個字符。 170 FileStatus[] localStatus = local.globStatus(new Path(dir+"/*"),new RegexAcceptPathFilter("^.*txt$")); 171 // 先獲取到,localStatus=FileStatus[23] 172 // localStatus是[DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917000000.txt; isDirectory=false; length=1111961; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917001500.txt; isDirectory=false; length=782533; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917003000.txt; isDirectory=false; length=593507; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917004500.txt; isDirectory=false; length=839019; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917010000.txt; isDirectory=false; length=866393; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917011500.txt; isDirectory=false; length=678491; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917013000.txt; isDirectory=false; length=593292; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917014500.txt; isDirectory=false; length=688620; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917020000.txt; isDirectory=false; length=674864; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917021500.txt; isDirectory=false; length=635052; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917023000.txt; isDirectory=false; length=547324; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917024500.txt; isDirectory=false; length=598814; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917030000.txt; isDirectory=false; length=542600; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917031500.txt; isDirectory=false; length=535446; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917033000.txt; isDirectory=false; length=592780; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917034500.txt; isDirectory=false; length=619410; replication=1; blocksize=33554432; modification_time=1398669216000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917040000.txt; isDirectory=false; length=590326; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917041500.txt; isDirectory=false; length=428487; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917043000.txt; isDirectory=false; length=598048; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917044500.txt; isDirectory=false; length=598792; replication=1; blocksize=33554432; modification_time=1398669216000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917050000.txt; isDirectory=false; length=575613; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917051500.txt; isDirectory=false; length=619080; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-17/ars10767@20120917053000.txt; isDirectory=false; length=587763; replication=1; blocksize=33554432; modification_time=1398669214000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}] 173 // 再獲取到,localStatus=FileStatus[23] 174 // localStatus是[DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918131500.txt; isDirectory=false; length=1722797; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918133000.txt; isDirectory=false; length=1922955; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918134500.txt; isDirectory=false; length=1388036; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918140000.txt; isDirectory=false; length=1888871; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918141500.txt; isDirectory=false; length=1685719; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918143000.txt; isDirectory=false; length=1541381; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918144500.txt; isDirectory=false; length=1723638; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918150000.txt; isDirectory=false; length=1629322; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918151500.txt; isDirectory=false; length=1658684; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918153000.txt; isDirectory=false; length=1548216; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918154500.txt; isDirectory=false; length=1510965; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918160000.txt; isDirectory=false; length=1559078; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918161500.txt; isDirectory=false; length=1752005; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918163000.txt; isDirectory=false; length=1901994; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918164500.txt; isDirectory=false; length=2234304; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918170000.txt; isDirectory=false; length=1912051; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918171500.txt; isDirectory=false; length=1711317; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918173000.txt; isDirectory=false; length=1799747; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918174500.txt; isDirectory=false; length=2038653; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918180000.txt; isDirectory=false; length=2341515; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918181500.txt; isDirectory=false; length=2396977; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918183000.txt; isDirectory=false; length=2382769; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}, DeprecatedRawLocalFileStatus{path=file:/D:/data/73/2012-09-18/ars10767@20120918184500.txt; isDirectory=false; length=2709048; replication=1; blocksize=33554432; modification_time=1398669244000; access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false}] 175 // 再獲取到,,,,不多贅述。 176 177 178 // FileStatus[] localStatus = local.listStatus(new Path(dir+"/*"),new RegexAcceptPathFilter("^.*txt$"));//試試,看有什么區別? 179 180 // 如果不設置過濾器,FileInputFormat 會使用一個默認的過濾器來排除隱藏文件。 181 // 如果通過調用 setInputPathFilter()設置了過濾器,它會在默認過濾器的基礎上進行過濾。換句話說,自定義的過濾器只能看到非隱藏文件。 182 183 184 //RegexAcceptPathFilter這個方法,我們自己寫 185 // RegexAcceptPathFilter來只接收我們需要,即txt格式 186 // 這里,我們還可以只接收別的格式,自己去改,一定要鍛煉學會改別人的代碼 187 188 189 // 獲得如2012-09-17日期目錄下的所有文件 190 Path[] listedPaths = FileUtil.stat2Paths(localStatus); 191 // 同樣,但是我們,最終是要處理文件里的東西,最終是要轉成Path類型,因為Path對象f,它對應着一個文件。 192 193 // 先獲取,listedPaths=Path[23] 194 // 先獲取2012-09-17下的所有,這個不多贅述啦! 195 196 // 再獲取,listedPaths=Path[23] 197 // listedPaths是[file:/D:/data/73/2012-09-18/ars10767@20120918131500.txt 198 // , file:/D:/data/73/2012-09-18/ars10767@20120918133000.txt 199 // , file:/D:/data/73/2012-09-18/ars10767@20120918134500.txt 200 // , file:/D:/data/73/2012-09-18/ars10767@20120918140000.txt 201 // , file:/D:/data/73/2012-09-18/ars10767@20120918141500.txt 202 // , file:/D:/data/73/2012-09-18/ars10767@20120918143000.txt 203 // , file:/D:/data/73/2012-09-18/ars10767@20120918144500.txt 204 // , file:/D:/data/73/2012-09-18/ars10767@20120918150000.txt 205 // , file:/D:/data/73/2012-09-18/ars10767@20120918151500.txt 206 // , file:/D:/data/73/2012-09-18/ars10767@20120918153000.txt 207 // , file:/D:/data/73/2012-09-18/ars10767@20120918154500.txt 208 // , file:/D:/data/73/2012-09-18/ars10767@20120918160000.txt 209 // , file:/D:/data/73/2012-09-18/ars10767@20120918161500.txt 210 // , file:/D:/data/73/2012-09-18/ars10767@20120918163000.txt 211 // , file:/D:/data/73/2012-09-18/ars10767@20120918164500.txt 212 // , file:/D:/data/73/2012-09-18/ars10767@20120918170000.txt 213 // , file:/D:/data/73/2012-09-18/ars10767@20120918171500.txt 214 // , file:/D:/data/73/2012-09-18/ars10767@20120918173000.txt 215 // , file:/D:/data/73/2012-09-18/ars10767@20120918174500.txt 216 // , file:/D:/data/73/2012-09-18/ars10767@20120918180000.txt 217 // , file:/D:/data/73/2012-09-18/ars10767@20120918181500.txt 218 // , file:/D:/data/73/2012-09-18/ars10767@20120918183000.txt 219 // , file:/D:/data/73/2012-09-18/ars10767@20120918184500.txt] 220 221 //輸出路徑 222 Path block = new Path("hdfs://djt002:9000/outData/MergeSmallFilesToHDFS/"+ fileName + ".txt"); 223 //fileName是"fileName" 224 // block=Path 225 // block是hdfs://djt002:9000/outData/MergeSmallFilesToHDFS/20120918.txt 226 227 // 打開輸出流 228 out = fs.create(block);//因為,合並小文件之后,比如這是,合並2012-09-17日期目錄下的所有小文件,之后,要上傳到HDFS里。 229 // 類似於,文件A寫到內存里,再內存里寫到文件B。而這行代碼out = fs.create(block);是相當於是,內存里寫到文件B。所以是輸出流,即是從內存里輸出的,所以叫輸出流。 230 // 這里,文件A是Local 文件B是HDFS 231 232 // 文件與塊大小(比如128M)來比,小的話,稱為小文件。是一個相對概念!相對於數據塊而言的! 233 234 // 很多人搞不清輸入流和輸出流,!!!! 235 // 其實啊,輸入流、輸出流都是針對內存的 236 // 往內存里寫,是輸入流。 237 // 內存往文件里寫,是輸出Luis。 238 // 239 // 比如一個文件A復制到另一文件B,那么,先寫到內存里,再寫到文件B。 240 // => 則文件A寫到內存里,叫輸入流。 241 // => 則內存里寫到文件B,叫輸出流 242 243 244 for (Path p : listedPaths) {//for星型循環,即將listedPaths的值一一傳給Path p 245 //先獲取2012-09-17下的所有,這個不多贅述啦! 246 //現在,獲取到2012-09-18下了 247 // p=Path 248 // p是file:/D:/data/73/2012-09-18/ars10767@20120918134500.txt 249 // 得一個一個來,這才叫做一一傳給Path p 250 251 in = local.open(p);// 打開輸入流in 252 // 類似於,文件A寫到內存里,再內存里寫到文件B。而這行代碼in = local.open(p);是相當於是,文件A寫到內存里。所以是輸如流,即是寫到內存里的,所以叫輸入流。 253 // 這里,文件A是Local 文件B是HDFS 254 255 IOUtils.copyBytes(in, out, 4096, false); // 復制數據,IOUtils.copyBytes可以方便地將數據寫入到文件,不需要自己去控制緩沖區,也不用自己去循環讀取輸入源。false表示不自動關閉數據流,那么就手動關閉。 256 // IOUtils.copyBytes這個方法很重要 257 //是否自動關閉輸入流和輸出流,若是false,就要單獨去關閉。則不在這個方法體里關閉輸入和輸出流了。 258 // 若是true,則在這個方法里關閉輸入和輸出流。不需單獨去關閉了 259 260 261 // 明白,IOUtils類的copyBytes將hdfs數據流拷貝到標准輸出流System.out中, 262 // copyBytes前兩個參數好理解,一個輸入,一個輸出,第三個是緩存大小,第四個指定拷貝完畢后是否關閉流。 263 // 要設置為false,標准輸出流不關閉,我們要手動關閉輸入流。即,設置為false表示關閉輸入流 264 265 // 主要是把最后的這個參數定義好, 就可以了。 定義為true還是false,則決定着是否在這個方法體里關閉 266 // 若定義為true,則在這個方法體里直接關閉輸入流、輸出流。不需單獨去關閉了 267 // 若定義為false,則不在這個方法體里直接關閉輸入流、輸出流。需單獨去關閉了 268 269 270 // 關閉輸入流 271 in.close();//若定義為false,則不在這個方法體里直接關閉輸入流、輸出流。需單獨去關閉了。這就是單獨在關閉輸入流!!!懂了嗎 272 } 273 if (out != null) {//這里為什么不為空,空指針,則說明里面還有資源。 274 // 關閉輸出流 275 out.close();//若定義為false,則不在這個方法體里直接關閉輸入流、輸出流。需單獨去關閉了。這就是單獨在關閉輸出流!!!懂了嗎 276 } 277 } 278 279 } 280 281 /** 282 * 283 * @function 過濾 regex 格式的文件 284 * 285 */ 286 public static class RegexExcludePathFilter implements PathFilter { 287 private final String regex;//變量 288 289 public RegexExcludePathFilter(String regex) {//這個是上面的那個,正在表達式 290 this.regex = regex;//將String regex的值,賦給RegexExcludePathFilter類里的private final String regex的值 291 } 292 293 public boolean accept(Path path) {//主要是實現accept方法 294 // TODO Auto-generated method stub 295 boolean flag = path.toString().matches(regex);//匹配正則表達式,這里是^.*svn$ 296 return !flag; 297 } 298 299 } 300 301 /** 302 * 303 * @function 接受 regex 格式的文件 304 * 305 */ 306 public static class RegexAcceptPathFilter implements PathFilter { 307 private final String regex;//變量 308 309 public RegexAcceptPathFilter(String regex) {//這個是上面的那個,正在表達式 310 this.regex = regex;//將String regex的值,賦給RegexAcceptPathFilter類里的private final String regex的值 311 } 312 313 public boolean accept(Path path) {//主要是實現accept方法 314 // TODO Auto-generated method stub 315 boolean flag = path.toString().matches(regex);//匹配正則表達式,這里是^.*txt$ 316 return flag; 317 } 318 319 } 320 }