通常在使用爬蟲時,爬取到的網路文章格式各式各樣,處理起來比較麻煩。這裡我們使用 Apache Tika 來解析 PDF 格式的文章,範例如下:
1 package com.mengyao.tika.app; 2 3 import java.io.File; 4 import java.io.FileInputStream; 5 6 import org.apache.tika.metadata.Metadata; 7 import org.apache.tika.parser.ParseContext; 8 import org.apache.tika.parser.pdf.PDFParser; 9 import org.apache.tika.sax.BodyContentHandler; 10 11 public class PDFApp { 12 13 public static void main(String[] args) throws Exception { 14 //Tika默認是10*1024*1024,這里防止文件過大導致Tika報錯 15 BodyContentHandler handler = new BodyContentHandler(100*1024*1024); 16 17 Metadata metadata = new Metadata(); 18 FileInputStream inputstream = new FileInputStream(new File("D:/Nutch入門教程.pdf")); 19 ParseContext pcontext = new ParseContext(); 20 21 // 解析PDF文檔時應由超類AbstractParser的派生類PDFParser實現 22 PDFParser pdfparser = new PDFParser(); 23 pdfparser.parse(inputstream, handler, metadata, pcontext); 24 25 // 獲取PDF文檔的內容 26 System.out.println("PDF文檔內容:" + handler.toString()); 27 28 // 獲取PDF文檔的元數據 29 System.out.println("PDF文檔元數據:"); 30 String[] metadataNames = metadata.names(); 31 32 for (String name : metadataNames) { 33 System.out.println(name + " : " + metadata.get(name)); 34 } 35 36 } 37 38 }