C#讀取PDF文檔文字內容


C#讀取PDF文檔文字內容

通過iTextSharp讀取PDF文件內容,下載地址,下載后解壓itextsharp-dll-core.zip。

只能讀取英文和數字,文檔中包含的漢字無法正常讀取:

private string ReadPdfContent(string filepath)  
{  
    try  
    {  
        string pdffilename = filepath;  
        PdfReader pdfReader = new PdfReader(pdffilename);  
        int numberOfPages = pdfReader.NumberOfPages;  
        string text = string.Empty;  
  
        for (int i = 1; i <= numberOfPages; ++i)  
        {  
            byte[] bufferOfPageContent = pdfReader.GetPageContent(i);  
            text += System.Text.Encoding.UTF8.GetString(bufferOfPageContent);  
        }  
        pdfReader.Close();  
  
        return text;  
    }  
    catch (Exception ex)  
    {  
        StreamWriter log = File.AppendText(System.AppDomain.CurrentDomain.SetupInformation.ApplicationBase+"\\log.log");  
        log.WriteLine("出錯文件:" + e.FullPath + "原因:" + ex.ToString());  
        log.Flush();  
        log.Close();return null;  
    } 
}  

 

可以讀取中英文

private string OnCreated(string filepath)  
{  
    try  
    {  
        string pdffilename = filepath;  
        PdfReader pdfReader = new PdfReader(pdffilename);  
        int numberOfPages = pdfReader.NumberOfPages;  
        string text = string.Empty;  
  
        for (int i = 1; i <= numberOfPages; ++i)  
        {  
            iTextSharp.text.pdf.parser.ITextExtractionStrategy strategy = new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
            text += iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(pdfReader, i, strategy);
        }  
        pdfReader.Close();  
  
        return text;  
    }  
    catch (Exception ex)  
    {  
        StreamWriter wlog = File.AppendText(System.AppDomain.CurrentDomain.SetupInformation.ApplicationBase+"\\mylog.log");  
        wlog.WriteLine("出錯文件:" + e.FullPath + "原因:" + ex.ToString());  
        wlog.Flush();  
        wlog.Close();return null;  
    }  
  
} 

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM