如下的程序,將一個行數為fileLines的文本文件平均分為splitNum個小文本文件。注意換行符:Linux上是'\n',Windows上是'\r\n',輸出時應使用目標平台對應的換行符:
package kddcup2012.task2.FileSystem;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;

/**
 * Splits one large text file into a fixed number of line-based parts of (almost) equal size.
 *
 * <p>The input is streamed line by line through a large read buffer, so memory use is governed
 * by {@code bufferSize} rather than by the size of the input file.
 */
public class FileSplit {

    /**
     * Reads the hard-coded input file and writes {@code part0.txt} .. {@code part111.txt}.
     *
     * @param args unused
     * @throws IOException if the input cannot be read or an output file cannot be written
     */
    public static void main(String[] args) throws IOException {
        long timer = System.currentTimeMillis();

        int bufferSize = 20 * 1024 * 1024; // 20 MB read buffer
        int splitNum = 112;                // number of output parts
        int fileLines = 23669283;          // expected number of lines in the input file

        // Ceiling division, so that splitNum parts of this size always cover the whole file and
        // every part except possibly the last holds the same number of lines.
        long perSplitLines = (fileLines + splitNum - 1L) / splitNum;

        File file = new File("/media/Data/畢業設計/kdd cup/數據/userid_profile.txt");

        // try-with-resources closes the reader and each writer even when an I/O error occurs.
        try (BufferedReader input = new BufferedReader(
                new InputStreamReader(new BufferedInputStream(new FileInputStream(file))),
                bufferSize)) {
            for (int i = 0; i < splitNum; ++i) {
                // The last part drains the reader, so no line is lost if fileLines is too small.
                boolean lastPart = (i == splitNum - 1);
                try (FileWriter output = new FileWriter("/home/haoqiong/part" + i + ".txt")) {
                    String line;
                    // The counter is checked before readLine() so that a line is never consumed
                    // and then discarded at a part boundary.
                    for (long lineCounter = 0;
                            (lastPart || lineCounter < perSplitLines)
                                    && (line = input.readLine()) != null;
                            ++lineCounter) {
                        output.write(line);
                        output.write('\n'); // Linux line terminator
                    }
                }
            }
        }

        timer = System.currentTimeMillis() - timer;
        System.out.println("處理時間:" + timer);
    }
}
以上程序處理大文本文件只需要30MB左右的內存空間(這和所設的讀取緩沖大小有關),但是速度不是很快,在磁盤沒有其他程序占用的情況下,將200MB文件分割為112份需要20秒(機器配置:Centrino2 P7450 CPU,2GB DDR3內存,Ubuntu 11.10系統,硬盤最大讀寫速度大約60MB/S)。
另外,對於幾百兆到2GB大小的文件,使用內存映射文件的話,速度會快一些,但是由於單次映射的長度不能超過java中int類型的最大值,所以一次只能映射2GB以下的內容(更大的文件需要分段映射)。
java 讀取一個巨大的文本文件既能保證內存不溢出又能保證性能
package helloword.helloword;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;

/**
 * Three ways of reading a large file without loading it into memory at once: a memory-mapped
 * region ({@link #main1}), a {@link FileChannel} with a small {@link ByteBuffer}
 * ({@link #main2}, {@link #main}) and a {@link BufferedReader} ({@link #main3}).
 *
 * <p>Every variant prints what it reads to standard output, decoding bytes with the platform
 * default charset.
 */
public class ReadBig {

    /** Path of the file read by {@link #main1}, {@link #main2} and {@link #main3}. */
    public static String fff = "C:\\mq\\read\\from.xml";

    /**
     * Memory-maps the second half of {@link #fff} and prints it in chunks of up to 3 MB,
     * followed by the elapsed time.
     *
     * @param args unused
     * @throws Exception if the file cannot be opened or mapped (a single mapping is limited to
     *     {@code Integer.MAX_VALUE} bytes)
     */
    public static void main1(String[] args) throws Exception {
        final int BUFFER_SIZE = 0x300000; // 3 MB per chunk
        File f = new File(fff);
        try (RandomAccessFile raf = new RandomAccessFile(f, "r");
                FileChannel channel = raf.getChannel()) {
            long size = channel.size();
            long half = size / 2;
            // Map from the midpoint to the end of the file; "size - half" (rather than
            // "size / 2") keeps the last byte of an odd-length file.
            MappedByteBuffer inputBuffer =
                    channel.map(FileChannel.MapMode.READ_ONLY, half, size - half);
            byte[] dst = new byte[BUFFER_SIZE];
            long start = System.currentTimeMillis();
            while (inputBuffer.hasRemaining()) {
                // Full chunks are BUFFER_SIZE bytes; only the final chunk may be shorter.
                int length = Math.min(BUFFER_SIZE, inputBuffer.remaining());
                inputBuffer.get(dst, 0, length);
                // NOTE: a multi-byte character that straddles a chunk boundary is decoded
                // incorrectly; acceptable for this demo, not for real text processing.
                System.out.println(new String(dst, 0, length));
            }
            long end = System.currentTimeMillis();
            System.out.println("讀取文件文件一半內容花費:" + (end - start) + "毫秒");
        }
    }

    /**
     * Prints {@link #fff} by reading it through a {@link FileChannel} in 1 KB steps.
     *
     * @param args unused
     * @throws Exception if the file cannot be opened or read
     */
    public static void main2(String[] args) throws Exception {
        printThroughChannel(fff);
    }

    /**
     * Prints {@link #fff} line by line using a {@link BufferedReader}.
     *
     * @param args unused
     * @throws Exception if the file cannot be opened or read
     */
    public static void main3(String[] args) throws Exception {
        try (BufferedReader br = new BufferedReader(new FileReader(fff))) {
            String line;
            while ((line = br.readLine()) != null) {
                System.out.println(line);
            }
        }
    }

    /**
     * Prints {@code d:\filename} by reading it through a {@link FileChannel} in 1 KB steps.
     *
     * @param args unused
     * @throws Exception if the file cannot be opened or read
     */
    public static void main(String[] args) throws Exception {
        printThroughChannel("d:\\filename");
    }

    /** Streams the file at {@code path} to standard output through a 1 KB heap buffer. */
    private static void printThroughChannel(String path) throws Exception {
        ByteBuffer byteBuf = ByteBuffer.allocate(1024);
        try (RandomAccessFile raf = new RandomAccessFile(path, "r");
                FileChannel channel = raf.getChannel()) {
            while (channel.read(byteBuf) != -1) {
                // flip() sets limit to the number of bytes just read and position to 0.
                byteBuf.flip();
                // Treat the bytes as text and print them directly, as a simple example.
                System.out.print(new String(byteBuf.array(), 0, byteBuf.limit()));
                byteBuf.clear();
            }
        }
    }
}
java 讀取大容量文件,內存溢出?怎么按幾行讀取,讀取多次。
最佳答案
package helloword.helloword;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.util.Scanner;

/**
 * Examples of processing a large text file in bounded memory: in batches of a fixed number of
 * lines ({@link #main}) and as a line-by-line copy ({@link #largeFileIO}).
 */
public class TestPrint {

    /**
     * Reads the file in batches of 100 lines and hands each batch to
     * {@link #processBatch(CharSequence)}, including the final, possibly shorter batch.
     *
     * @param args unused
     * @throws IOException if the file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        String path = "你要讀的文件的路徑";
        final int batchLines = 100; // number of lines handled per batch

        // A StringBuilder avoids both the quadratic cost of repeated String concatenation and
        // the literal "null" prefix produced by concatenating onto a null String.
        StringBuilder app = new StringBuilder();
        int i = 0;

        // A buffered, charset-aware reader is used instead of RandomAccessFile.readLine(), which
        // is unbuffered and zero-extends each byte to a char (breaking non-ASCII text).
        // NOTE(review): UTF-8 is assumed here, matching largeFileIO; confirm for your input.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(path), "utf-8"))) {
            String str;
            while ((str = br.readLine()) != null) {
                app.append(str);
                if (++i >= batchLines) {
                    processBatch(app);
                    app.setLength(0);
                    i = 0;
                }
            }
        }
        if (i > 0) {
            // Lines left over after the last full batch.
            processBatch(app);
        }
    }

    /** Hook for the per-batch work; intentionally empty in this example. */
    private static void processBatch(CharSequence batch) {
        // Operate on the accumulated lines here.
    }

    /**
     * Copies {@code inputFile} to {@code outputFile} line by line through a 10 MB read buffer;
     * intended for text files larger than 2 GB. I/O errors are printed to standard error and
     * not propagated.
     *
     * @param inputFile path of the UTF-8 text file to read
     * @param outputFile path of the file to write (written with the platform default charset)
     */
    void largeFileIO(String inputFile, String outputFile) {
        // try-with-resources closes both streams even when the copy fails part-way.
        try (BufferedReader in = new BufferedReader(
                        new InputStreamReader(
                                new BufferedInputStream(new FileInputStream(new File(inputFile))),
                                "utf-8"),
                        10 * 1024 * 1024); // 10 MB buffer
                FileWriter fw = new FileWriter(outputFile)) {
            String line;
            // readLine() returning null is the end-of-stream signal; ready() only reports
            // whether a read would block and is not an EOF test.
            while ((line = in.readLine()) != null) {
                // NOTE(review): lines are joined with a single space, as in the original code;
                // this may have been meant to be a line terminator - confirm.
                fw.append(line).append(" ");
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }
}
jdk本身就支持超大文件的讀寫。
網上的文章基本分為兩大類:
一類是使用BufferedReader類讀寫超大文件;
另一類是使用RandomAccessFile類讀取。經過比較,最後使用了前一種方式進行超大文件的讀取,下面是相關代碼,其實很簡單:
-------------------------------------------------------------------
File file = new File(filepath); BufferedInputStream fis = new BufferedInputStream(new FileInputStream(file)); BufferedReader reader = new BufferedReader(new InputStreamReader(fis,"utf-8"),5*1024*1024);// 用5M的緩沖讀取文本文件 String line = ""; while((line = reader.readLine()) != null){ //TODO: write your business }
---------------------------------------------------------------------
注意代碼,在實例化BufferedReader時,增加一個分配緩存的參數即可