{"id":1632,"date":"2022-03-05T17:19:14","date_gmt":"2022-03-05T08:19:14","guid":{"rendered":"https:\/\/blog.wsd.sh\/?p=1632"},"modified":"2022-03-07T08:31:33","modified_gmt":"2022-03-06T23:31:33","slug":"pdfjs%ef%bc%9apraintext%e3%81%a8%e3%81%97%e3%81%a6%e6%8a%bd%e5%87%ba","status":"publish","type":"post","link":"https:\/\/blog.wsd.sh\/?p=1632","title":{"rendered":"<small>pdfjs\uff1apdf\u304b\u3089text\u3092plaintext\u3068\u3057\u3066\u62bd\u51fa<\/small>"},"content":{"rendered":"<p>\uff11\uff0e\u76ee\u7684<br \/>\n\u30fb\u30d6\u30e9\u30a6\u30b6\u304b\u3089pdf\u306etext\u3092plaintext\u3068\u3057\u3066\u62bd\u51fa\u3059\u308b<\/p>\n<p>\uff12\uff0e\u30e9\u30a4\u30d6\u30e9\u30ea<br \/>\n\u30fbmozilla\u306epdfjs v2.13.216<\/p>\n<pre>\r\n# wget https:\/\/github.com\/mozilla\/pdf.js\/releases\/download\/v2.13.216\/pdfjs-2.13.216-dist.zip\r\n<\/pre>\n<p>\uff13\uff0epdfjs\u306e\u8ab2\u984c<br \/>\n\u30fbgetTextContent\u3067plaintext\u306f\u53d6\u308a\u51fa\u305b\u308b\u304c\u3001\u53d6\u308a\u51fa\u3057\u9806\u5e8f\u304c\u3001\u66f8\u304d\u51fa\u3057\u9806\u306b\u306a\u308b<br \/>\n\u3010\u7406\u7531\u3011\u7de8\u96c6\u3059\u308b\u3068\u3001\u5909\u66f4\u3057\u305f\u7b87\u6240\u304c\u3001\u8907\u6570\u7e8f\u307e\u3063\u3066list\u306e\u5f8c\u308d\u5074\u306b\u79fb\u52d5\u3059\u308b<\/p>\n<p>\uff14\uff0esort() \u30e1\u30bd\u30c3\u30c9\u3067\u4e26\u3073\u66ff\u3048<br \/>\n\u30fbArray.prototype.sort()<br \/>\n\u30fb\u30c7\u30fc\u30bf\u5f62\u5f0f<\/p>\n<pre>\r\n let obj = {\"str\":textItem.str,\r\n            \"x\":parseInt(tx[4]),\r\n            \"y\":parseInt(tx[5])\r\n           };\r\n<\/pre>\n<p>\u30fb\uff59,x \u5ea7\u6a19\u3067sort\u3057\u3001\u4e0a\u304b\u3089\u3068\u5de6\u304b\u3089\u306e\u9806\u306b\u51fa\u529b\u3059\u308b<\/p>\n<pre>\r\nfinalObj.sort(compareY).sort(compareX);\r\n<\/pre>\n<p>\uff15\uff0e\u30bd\u30fc\u30c8\u9806\u3092\u5b9a\u7fa9\u3059\u308b\u6bd4\u8f03\u95a2\u6570<br \/>\n\u30fb\uff59\u5ea7\u6a19\u7528<\/p>\n<pre>\r\nfunction compareY( a, b 
){\r\n  if( a.y < b.y ){ return -1; }\r\n  if( a.y > b.y ){ return 1; }\r\n  return 0;\r\n}\r\n<\/pre>\n<p>\u30fb\uff58\u5ea7\u6a19\u7528<\/p>\n<pre>\r\nfunction compareX( a, b ){\r\n  if( a.y != b.y ) { return 0; }\r\n  if( a.x < b.x ) { return -1; }\r\n  if( a.x > b.x ) { return 1; }\r\n  return 0;\r\n}\r\n<\/pre>\n<p>\uff16\uff0ex , y \u5ea7\u6a19<br \/>\n\u30fbUtil.transform\u3067\u6c42\u3081\u308b<\/p>\n<pre>\r\nlet tx = Util.transform(viewport.transform, textItem.transform);\r\n<\/pre>\n<p>x = tx [ 4 ] , y = tx [ 5 ];<\/p>\n<p>\uff17\uff0ecode<br \/>\n\u30fb\u62bd\u51fa\u5074<\/p>\n<pre>\r\nfunction getPageText(pageNum, PDFDocumentInstance) {\r\n\r\n   return new Promise(function (resolve, reject) {\r\n      PDFDocumentInstance.getPage(pageNum).then(function (pdfPage) {\r\n\r\n         pdfPage.getTextContent({ normalizeWhitespace: true }).then(function (textContent) {\r\n\r\n            const scale = 1.0;\r\n            const viewport = pdfPage.getViewport({ scale: scale });\r\n            const Util = window.pdfjsLib.Util;\r\n\r\n            let finalObj = [];\r\n            let obj,tx;\r\n\r\n            textContent.items.forEach(function (textItem) {\r\n\r\n              tx = Util.transform(viewport.transform, textItem.transform);\r\n\r\n                obj = {\"str\":textItem.str,\r\n                     \"x\":parseInt(tx[4]),\r\n                     \"y\":parseInt(tx[5])\r\n                };\r\n                finalObj.push(obj);\r\n             });\r\n\r\n            finalObj.sort(compareY).sort(compareX);\r\n\r\n\u3000\u3000\u3000      let finalString = [];\r\n      \u3000\u3000\u3000for (let k = 0; k < finalObj.length; k++) {\r\n      \u3000\u3000\u3000    finalString.push( finalObj[k].str);\r\n  \u3000\u3000\u3000    }\r\n\r\n            resolve(finalString);\r\n         });\r\n      });\r\n   });\r\n\r\n   function compareX( a, b ){\r\n      if( a.y != b.y ) { return 0; }\r\n      if( a.x < b.x ) { return -1; }\r\n      if( a.x > b.x ) { return 1; 
}\r\n\r\n      return 0;\r\n   }\r\n\r\n   function compareY( a, b ){\r\n      if( a.y < b.y ){ return -1; }\r\n      if( a.y > b.y ){ return 1; }\r\n\r\n      return 0;\r\n   }\r\n\r\n}\r\n<\/pre>\n<p>\u30fb\u547c\u3073\u51fa\u3057\u5074<\/p>\n<pre>\r\nasync function api_analysisPdfText(pdf) {\r\n\r\n   const pagesPromises = [];\r\n\r\n   for (let i = 0; i < pdf.numPages; i++) {\r\n      const rows = await getPageText(i+1, pdf);\r\n\r\n      pagesPromises.push(rows);\r\n\r\n   }\r\n\r\n\u30fb\u30fb\u30fb\r\n}\r\n<\/pre>\n","protected":false},"excerpt":{"rendered":"<p>\uff11\uff0e\u76ee\u7684 \u30fb\u30d6\u30e9\u30a6\u30b6\u304b\u3089pdf\u306etext\u3092plaintext\u3068\u3057\u3066\u62bd\u51fa\u3059\u308b \uff12\uff0e\u30e9\u30a4\u30d6\u30e9\u30ea \u30fbmozilla\u306epdfjs v2.13.216 # wget https:\/\/github.com\/mozilla\/pdf.j&#8230;<\/p>\n","protected":false},"author":2,"featured_media":0,"comment_status":"closed","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"_mi_skip_tracking":false},"categories":[1],"tags":[],"_links":{"self":[{"href":"https:\/\/blog.wsd.sh\/index.php?rest_route=\/wp\/v2\/posts\/1632"}],"collection":[{"href":"https:\/\/blog.wsd.sh\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/blog.wsd.sh\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/blog.wsd.sh\/index.php?rest_route=\/wp\/v2\/users\/2"}],"replies":[{"embeddable":true,"href":"https:\/\/blog.wsd.sh\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=1632"}],"version-history":[{"count":46,"href":"https:\/\/blog.wsd.sh\/index.php?rest_route=\/wp\/v2\/posts\/1632\/revisions"}],"predecessor-version":[{"id":1635,"href":"https:\/\/blog.wsd.sh\/index.php?rest_route=\/wp\/v2\/posts\/1632\/revisions\/1635"}],"wp:attachment":[{"href":"https:\/\/blog.wsd.sh\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=1632"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/blog.wsd.sh\/index.php?rest_rou
te=%2Fwp%2Fv2%2Fcategories&post=1632"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/blog.wsd.sh\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=1632"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}