PDF.js可以實現在html下直接瀏覽pdf文檔,是一款開源的pdf文檔讀取解析插件,非常強大,能將PDF文件渲染成Canvas。PDF.js主要包含兩個庫文件,一個pdf.js和一個pdf.worker.js,一個負責API解析,一個負責核心解析。
首先引入pdf.js文件<script type="text/javascript" src='pdf.js'></script>
PDF.js大部分用法都是基於Promise的,PDFJS.getDocument(url)方法返回的就是一個Promise:
PDFJS.getDocument('helloworld.pdf').then(function onDocumentLoaded(pdf) {
  // `pdf` is the value the getDocument() promise resolved with.
});
PDF的解析工作需要通過pdf.getPage(page)去執行,這個方法返回的也是一個Promise,因此可以去逐頁解析PDF:
pdf.getPage(1).then(function onPageLoaded(page) {
  // `page` is the value the getPage(1) promise resolved with.
});
官網地址:http://mozilla.github.io/pdf.js/
渲染頁面
每個PDF頁面都有自己的視口(viewport),它定義了以像素為單位的尺寸(72 DPI)和初始旋轉角度。默認情況下,視口會縮放到PDF的原始大小,但可以通過修改視口來改變這一行為。創建視口時,還會創建一個初始變換矩陣,它會考慮期望的縮放比例和旋轉,並轉換坐標系統(PDF文檔的(0,0)點位於左下角,而canvas的(0,0)點位於左上角)。
// Build a viewport for the page at 1.5x scale.
var scale = 1.5;
var viewport = page.getViewport(scale);

// Look up the target canvas and obtain its 2D drawing context.
var canvas = document.getElementById('the-canvas');
var context = canvas.getContext('2d');

// Size the canvas to the viewport dimensions.
canvas.height = viewport.height;
canvas.width = viewport.width;

// Render the page into the canvas using that viewport.
var renderContext = {
  canvasContext: context,
  viewport: viewport
};
page.render(renderContext);
還可以自定義canvas大小:
// Target width for the rendered page.
var desiredWidth = 100;

// Measure the page at scale 1, then derive the scale factor that
// makes the viewport exactly `desiredWidth` wide.
var viewport = page.getViewport(1);
var scale = desiredWidth / viewport.width;
var scaledViewport = page.getViewport(scale);
官方給出的示例:
// Load a PDF from a URL and render its first page into the canvas
// with id "the-canvas".
var url = '//cdn.mozilla.net/pdfjs/helloworld.pdf';

// Path of the worker script (pdf.worker.js).
PDFJS.workerSrc = '//mozilla.github.io/pdf.js/build/pdf.worker.js';

var loadingTask = PDFJS.getDocument(url);
loadingTask.promise.then(function(pdf) {
  console.log('PDF loaded');

  // Fetch the first page.
  var pageNumber = 1;
  pdf.getPage(pageNumber).then(function(page) {
    console.log('Page loaded');

    var scale = 1.5;
    var viewport = page.getViewport(scale);

    // Prepare the canvas using the viewport dimensions.
    var canvas = document.getElementById('the-canvas');
    var context = canvas.getContext('2d');
    canvas.height = viewport.height;
    canvas.width = viewport.width;

    // Render the page into the canvas context.
    var renderContext = {
      canvasContext: context,
      viewport: viewport
    };
    var renderTask = page.render(renderContext);
    // render() returns an object that contains the promise; wait on that
    // promise, as the paging example in this article does.
    renderTask.promise.then(function () {
      console.log('Page rendered');
    });
  });
}, function (reason) {
  // Loading the document failed.
  console.error(reason);
});
另外,PDF文件(通常是較小的文件)也可以用base64編碼的方式加載,例如:
// Base64-encoded PDF, decoded by atob() into a binary string.
// The encoded text is split into chunks only for readability.
var pdfData = atob([
  'JVBERi0xLjcKCjEgMCBvYmogICUgZW50cnkgcG9pbnQKPDwKICAvVHlwZSAvQ2F0YWxvZwog',
  'IC9QYWdlcyAyIDAgUgo+PgplbmRvYmoKCjIgMCBvYmoKPDwKICAvVHlwZSAvUGFnZXMKICAv',
  'TWVkaWFCb3ggWyAwIDAgMjAwIDIwMCBdCiAgL0NvdW50IDEKICAvS2lkcyBbIDMgMCBSIF0K',
  'Pj4KZW5kb2JqCgozIDAgb2JqCjw8CiAgL1R5cGUgL1BhZ2UKICAvUGFyZW50IDIgMCBSCiAg',
  'L1Jlc291cmNlcyA8PAogICAgL0ZvbnQgPDwKICAgICAgL0YxIDQgMCBSIAogICAgPj4KICA+',
  'PgogIC9Db250ZW50cyA1IDAgUgo+PgplbmRvYmoKCjQgMCBvYmoKPDwKICAvVHlwZSAvRm9u',
  'dAogIC9TdWJ0eXBlIC9UeXBlMQogIC9CYXNlRm9udCAvVGltZXMtUm9tYW4KPj4KZW5kb2Jq',
  'Cgo1IDAgb2JqICAlIHBhZ2UgY29udGVudAo8PAogIC9MZW5ndGggNDQKPj4Kc3RyZWFtCkJU',
  'CjcwIDUwIFRECi9GMSAxMiBUZgooSGVsbG8sIHdvcmxkISkgVGoKRVQKZW5kc3RyZWFtCmVu',
  'ZG9iagoKeHJlZgowIDYKMDAwMDAwMDAwMCA2NTUzNSBmIAowMDAwMDAwMDEwIDAwMDAwIG4g',
  'CjAwMDAwMDAwNzkgMDAwMDAgbiAKMDAwMDAwMDE3MyAwMDAwMCBuIAowMDAwMDAwMzAxIDAw',
  'MDAwIG4gCjAwMDAwMDAzODAgMDAwMDAgbiAKdHJhaWxlcgo8PAogIC9TaXplIDYKICAvUm9v',
  'dCAxIDAgUgo+PgpzdGFydHhyZWYKNDkyCiUlRU9G'
].join(''));
// Load a PDF from in-memory binary data (`pdfData`) and render its first
// page into the canvas with id "the-canvas".

// Path of the worker script (pdf.worker.js).
PDFJS.workerSrc = '//mozilla.github.io/pdf.js/build/pdf.worker.js';

// Pass the data via the `data` property instead of a URL.
var loadingTask = PDFJS.getDocument({data: pdfData});
loadingTask.promise.then(function(pdf) {
  console.log('PDF loaded');

  // Fetch the first page.
  var pageNumber = 1;
  pdf.getPage(pageNumber).then(function(page) {
    console.log('Page loaded');

    var scale = 1.5;
    var viewport = page.getViewport(scale);

    // Prepare the canvas using the viewport dimensions.
    var canvas = document.getElementById('the-canvas');
    var context = canvas.getContext('2d');
    canvas.height = viewport.height;
    canvas.width = viewport.width;

    // Render the page into the canvas context.
    var renderContext = {
      canvasContext: context,
      viewport: viewport
    };
    var renderTask = page.render(renderContext);
    // render() returns an object that contains the promise; wait on that
    // promise, as the paging example in this article does.
    renderTask.promise.then(function () {
      console.log('Page rendered');
    });
  });
}, function (reason) {
  // Loading the document failed.
  console.error(reason);
});
pdf翻頁處理:
// If absolute URL from the remote server is provided, configure the CORS
// header on that server.
var url = '//cdn.mozilla.net/pdfjs/tracemonkey.pdf';

// The workerSrc property shall be specified.
PDFJS.workerSrc = '//mozilla.github.io/pdf.js/build/pdf.worker.js';

// Shared viewer state:
//   pdfDoc         - the loaded document (null until loading finishes)
//   pageNum        - number of the page currently shown
//   pageRendering  - true while a page render is in progress
//   pageNumPending - page requested while a render was running, or null
//   scale          - scale passed to page.getViewport()
//   canvas / ctx   - target canvas and its 2D drawing context
var pdfDoc = null,
    pageNum = 1,
    pageRendering = false,
    pageNumPending = null,
    scale = 0.8,
    canvas = document.getElementById('the-canvas'),
    ctx = canvas.getContext('2d');

/**
 * Get page info from document, resize canvas accordingly, and render page.
 * @param num Page number.
 */
function renderPage(num) {
  pageRendering = true;

  // Fetch the page; getPage() returns a promise.
  pdfDoc.getPage(num).then(function(page) {
    var viewport = page.getViewport(scale);
    canvas.height = viewport.height;
    canvas.width = viewport.width;

    // Render the page into the canvas context.
    var renderContext = {
      canvasContext: ctx,
      viewport: viewport
    };
    var renderTask = page.render(renderContext);

    // Once rendering finishes, clear the busy flag and render any page
    // that was requested in the meantime.
    renderTask.promise.then(function() {
      pageRendering = false;
      if (pageNumPending !== null) {
        renderPage(pageNumPending);
        pageNumPending = null;
      }
    });
  });

  // Update the displayed page number right away, without waiting for the
  // render to complete.
  document.getElementById('page_num').textContent = num;
}
/**
 * Render the requested page immediately, or, if another render is still in
 * progress, remember it as the pending page instead.
 * @param num Page number.
 */
function queueRenderPage(num) {
  if (!pageRendering) {
    renderPage(num);
    return;
  }
  pageNumPending = num;
}
/**
 * Go to the previous page; does nothing when already on page 1.
 */
function onPrevPage() {
  if (pageNum > 1) {
    pageNum -= 1;
    queueRenderPage(pageNum);
  }
}
document.getElementById('prev').addEventListener('click', onPrevPage);
/**
 * Go to the next page; does nothing when already on the last page.
 */
function onNextPage() {
  var isLastPage = pageNum >= pdfDoc.numPages;
  if (isLastPage) {
    return;
  }
  pageNum += 1;
  queueRenderPage(pageNum);
}
document.getElementById('next').addEventListener('click', onNextPage);
// Load the document, store it in the shared `pdfDoc` variable, show the
// total page count, and render the initial page.
PDFJS.getDocument(url).then(function(loadedDoc) {
  pdfDoc = loadedDoc;

  var pageCountElement = document.getElementById('page_count');
  pageCountElement.textContent = pdfDoc.numPages;

  renderPage(pageNum);
});
關於page方法的使用:
解析結果,我們可以看下這個對象提供的方法:
方法 | 返回 |
---|---|
getAnnotations | A promise that is resolved with an {Array} of the annotation objects. |
getTextContent | A promise that is resolved with a TextContent object that represents the page text content. |
getViewport | Contains ‘width’ and ‘height’ properties along with transforms required for rendering. |
render | An object that contains the promise, which is resolved when the page finishes rendering. |
我們可以試試調用getTextContent方法,並將其結果打印出來:
// Fetch page 1, request its text content, and print the result.
// getTextContent() returns a promise, so the result is logged in its callback.
pdf.getPage(1).then(function(page) {
  page.getTextContent().then(function(textContent) {
    console.log(textContent);
  });
});
輸出格式大致如下:
{ "items": [ { "str": "xxx", "dir": "xxx", "width": xxx, "height": xxx, "transform": [ 48, 0, 0, 48, 45.32495, 679.04 ], "fontName": "g_d0_f1" }, { "str": " ", "dir": "ltr", "width": 9.600000000000001, "height": 2304, "transform": [ 48, 0, 0, 48, 285.325, 679.04 ], "fontName": "g_d0_f2" } ], "styles": { "g_d0_f1": { "fontFamily": "monospace", "ascent": 1.05810546875, "descent": -0.26171875, "vertical": false }, "g_d0_f2": { "fontFamily": "sans-serif", "ascent": 0.74365234375, "descent": -0.25634765625 } } }
PDF.js能將每頁文本的字符串、位置、字體都解析出來。
官網用的viewer.js:http://mozilla.github.io/pdf.js/web/viewer.html,首先底圖是一個Canvas,內容和PDF一樣(通過上面介紹的page.render方法可以得到),底圖之上是一個textLayer,這一層就是通過page.getTextContent()得到了字體的位置和樣式,再覆蓋在Canvas上。
我們可以直接使用官網viewer.html的demo,然後修改樣式、去掉用不到的功能,簡單粗暴。只需要在跳轉鏈接後面加上file參數就行,例:http://xxxx/viewer.html?file=xxxx.pdf