-
-
Save neko-fire/7038322 to your computer and use it in GitHub Desktop.
| <?php | |
| /* | |
| This program is free software; you can redistribute it and/or modify | |
| it under the terms of the GNU General Public License as published by | |
| the Free Software Foundation; either version 2 of the License, or | |
| (at your option) any later version. | |
| This program is distributed in the hope that it will be useful, | |
| but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| GNU General Public License for more details. | |
| You should have received a copy of the GNU General Public License | |
| along with this program; if not, write to the Free Software | |
| Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
| This code is an improved version of what can be found at: | |
| http://www.webcheatsheet.com/php/reading_clean_text_from_pdf.php | |
| AUTHOR: | |
| - Webcheatsheet.com (Original code) | |
| - Joeri Stegeman (joeri210 [at] yahoo [dot] com) (Class conversion and fixes/adjustments) | |
| DESCRIPTION: | |
| This is a class to convert PDF files into ASCII text or so called PDF text extraction. | |
| It will ignore anything that is not addressed as text within the PDF and any layout. | |
| Currently supported filters are: ASCIIHexDecode, ASCII85Decode, FlateDecode | |
| PURPOSE(S): | |
| Most likely for people that want their PDF to be searchable. | |
| SYNTAX: | |
| include('class.pdf2text.php'); | |
| $a = new PDF2Text(); | |
| $a->setFilename('test.pdf'); | |
| $a->decodePDF(); | |
| echo $a->output(); | |
| ALTERNATIVES: | |
| Other excellent options to search within a PDF: | |
| - Apache PDFbox (http://pdfbox.apache.org/). An open source Java solution | |
| - pdflib TET (http://www.pdflib.com/products/tet/) | |
| - Online converter: http://snowtide.com/PDFTextStream | |
| */ | |
| class PDF2Text { | |
| // Some settings | |
| var $multibyte = 2; // Use setUnicode(TRUE|FALSE) | |
| var $convertquotes = ENT_QUOTES; // ENT_COMPAT (double-quotes), ENT_QUOTES (Both), ENT_NOQUOTES (None) | |
| // Variables | |
| var $filename = ''; | |
| var $decodedtext = ''; | |
| function setFilename($filename) { | |
| // Reset | |
| $this->decodedtext = ''; | |
| $this->filename = $filename; | |
| } | |
| function output($echo = false) { | |
| if($echo) echo $this->decodedtext; | |
| else return $this->decodedtext; | |
| } | |
| function setUnicode($input) { | |
| // 4 for unicode. But 2 should work in most cases just fine | |
| if($input == true) $this->multibyte = 4; | |
| else $this->multibyte = 2; | |
| } | |
| function decodePDF() { | |
| // Read the data from pdf file | |
| $infile = @file_get_contents($this->filename, FILE_BINARY); | |
| if (empty($infile)) | |
| return ""; | |
| // Get all text data. | |
| $transformations = array(); | |
| $texts = array(); | |
| // Get the list of all objects. | |
| preg_match_all("#obj[\n|\r](.*)endobj[\n|\r]#ismU", $infile, $objects); | |
| $objects = @$objects[1]; | |
| // Select objects with streams. | |
| for ($i = 0; $i < count($objects); $i++) { | |
| $currentObject = $objects[$i]; | |
| // Check if an object includes data stream. | |
| if (preg_match("#stream[\n|\r](.*)endstream[\n|\r]#ismU", $currentObject, $stream)) { | |
| $stream = ltrim($stream[1]); | |
| // Check object parameters and look for text data. | |
| $options = $this->getObjectOptions($currentObject); | |
| if (!(empty($options["Length1"]) && empty($options["Type"]) && empty($options["Subtype"]))) | |
| continue; | |
| // Hack, length doesnt always seem to be correct | |
| unset($options["Length"]); | |
| // So, we have text data. Decode it. | |
| $data = $this->getDecodedStream($stream, $options); | |
| if (strlen($data)) { | |
| if (preg_match_all("#BT[\n|\r](.*)ET[\n|\r]#ismU", $data, $textContainers)) { | |
| $textContainers = @$textContainers[1]; | |
| $this->getDirtyTexts($texts, $textContainers); | |
| } else | |
| $this->getCharTransformations($transformations, $data); | |
| } | |
| } | |
| } | |
| // Analyze text blocks taking into account character transformations and return results. | |
| $this->decodedtext = $this->getTextUsingTransformations($texts, $transformations); | |
| } | |
| function decodeAsciiHex($input) { | |
| $output = ""; | |
| $isOdd = true; | |
| $isComment = false; | |
| for($i = 0, $codeHigh = -1; $i < strlen($input) && $input[$i] != '>'; $i++) { | |
| $c = $input[$i]; | |
| if($isComment) { | |
| if ($c == '\r' || $c == '\n') | |
| $isComment = false; | |
| continue; | |
| } | |
| switch($c) { | |
| case '\0': case '\t': case '\r': case '\f': case '\n': case ' ': break; | |
| case '%': | |
| $isComment = true; | |
| break; | |
| default: | |
| $code = hexdec($c); | |
| if($code === 0 && $c != '0') | |
| return ""; | |
| if($isOdd) | |
| $codeHigh = $code; | |
| else | |
| $output .= chr($codeHigh * 16 + $code); | |
| $isOdd = !$isOdd; | |
| break; | |
| } | |
| } | |
| if($input[$i] != '>') | |
| return ""; | |
| if($isOdd) | |
| $output .= chr($codeHigh * 16); | |
| return $output; | |
| } | |
| function decodeAscii85($input) { | |
| $output = ""; | |
| $isComment = false; | |
| $ords = array(); | |
| for($i = 0, $state = 0; $i < strlen($input) && $input[$i] != '~'; $i++) { | |
| $c = $input[$i]; | |
| if($isComment) { | |
| if ($c == '\r' || $c == '\n') | |
| $isComment = false; | |
| continue; | |
| } | |
| if ($c == '\0' || $c == '\t' || $c == '\r' || $c == '\f' || $c == '\n' || $c == ' ') | |
| continue; | |
| if ($c == '%') { | |
| $isComment = true; | |
| continue; | |
| } | |
| if ($c == 'z' && $state === 0) { | |
| $output .= str_repeat(chr(0), 4); | |
| continue; | |
| } | |
| if ($c < '!' || $c > 'u') | |
| return ""; | |
| $code = ord($input[$i]) & 0xff; | |
| $ords[$state++] = $code - ord('!'); | |
| if ($state == 5) { | |
| $state = 0; | |
| for ($sum = 0, $j = 0; $j < 5; $j++) | |
| $sum = $sum * 85 + $ords[$j]; | |
| for ($j = 3; $j >= 0; $j--) | |
| $output .= chr($sum >> ($j * 8)); | |
| } | |
| } | |
| if ($state === 1) | |
| return ""; | |
| elseif ($state > 1) { | |
| for ($i = 0, $sum = 0; $i < $state; $i++) | |
| $sum += ($ords[$i] + ($i == $state - 1)) * pow(85, 4 - $i); | |
| for ($i = 0; $i < $state - 1; $i++) | |
| $ouput .= chr($sum >> ((3 - $i) * 8)); | |
| } | |
| return $output; | |
| } | |
| function decodeFlate($input) { | |
| return gzuncompress($input); | |
| } | |
| function getObjectOptions($object) { | |
| $options = array(); | |
| if (preg_match("#<<(.*)>>#ismU", $object, $options)) { | |
| $options = explode("/", $options[1]); | |
| @array_shift($options); | |
| $o = array(); | |
| for ($j = 0; $j < @count($options); $j++) { | |
| $options[$j] = preg_replace("#\s+#", " ", trim($options[$j])); | |
| if (strpos($options[$j], " ") !== false) { | |
| $parts = explode(" ", $options[$j]); | |
| $o[$parts[0]] = $parts[1]; | |
| } else | |
| $o[$options[$j]] = true; | |
| } | |
| $options = $o; | |
| unset($o); | |
| } | |
| return $options; | |
| } | |
| function getDecodedStream($stream, $options) { | |
| $data = ""; | |
| if (empty($options["Filter"])) | |
| $data = $stream; | |
| else { | |
| $length = !empty($options["Length"]) ? $options["Length"] : strlen($stream); | |
| $_stream = substr($stream, 0, $length); | |
| foreach ($options as $key => $value) { | |
| if ($key == "ASCIIHexDecode") | |
| $_stream = $this->decodeAsciiHex($_stream); | |
| if ($key == "ASCII85Decode") | |
| $_stream = $this->decodeAscii85($_stream); | |
| if ($key == "FlateDecode") | |
| $_stream = $this->decodeFlate($_stream); | |
| if ($key == "Crypt") { // TO DO | |
| } | |
| } | |
| $data = $_stream; | |
| } | |
| return $data; | |
| } | |
| function getDirtyTexts(&$texts, $textContainers) { | |
| for ($j = 0; $j < count($textContainers); $j++) { | |
| if (preg_match_all("#\[(.*)\]\s*TJ[\n|\r]#ismU", $textContainers[$j], $parts)) | |
| $texts = array_merge($texts, @$parts[1]); | |
| elseif(preg_match_all("#T[d|w|m|f]\s*(\(.*\))\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts)) | |
| $texts = array_merge($texts, @$parts[1]); | |
| elseif(preg_match_all("#T[d|w|m|f]\s*(\[.*\])\s*Tj[\n|\r]#ismU", $textContainers[$j], $parts)) | |
| $texts = array_merge($texts, @$parts[1]); | |
| } | |
| } | |
| function getCharTransformations(&$transformations, $stream) { | |
| preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER); | |
| preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER); | |
| for ($j = 0; $j < count($chars); $j++) { | |
| $count = $chars[$j][1]; | |
| $current = explode("\n", trim($chars[$j][2])); | |
| for ($k = 0; $k < $count && $k < count($current); $k++) { | |
| if (preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim($current[$k]), $map)) | |
| $transformations[str_pad($map[1], 4, "0")] = $map[2]; | |
| } | |
| } | |
| for ($j = 0; $j < count($ranges); $j++) { | |
| $count = $ranges[$j][1]; | |
| $current = explode("\n", trim($ranges[$j][2])); | |
| for ($k = 0; $k < $count && $k < count($current); $k++) { | |
| if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", trim($current[$k]), $map)) { | |
| $from = hexdec($map[1]); | |
| $to = hexdec($map[2]); | |
| $_from = hexdec($map[3]); | |
| for ($m = $from, $n = 0; $m <= $to; $m++, $n++) | |
| $transformations[sprintf("%04X", $m)] = sprintf("%04X", $_from + $n); | |
| } elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", trim($current[$k]), $map)) { | |
| $from = hexdec($map[1]); | |
| $to = hexdec($map[2]); | |
| $parts = preg_split("#\s+#", trim($map[3])); | |
| for ($m = $from, $n = 0; $m <= $to && $n < count($parts); $m++, $n++) | |
| $transformations[sprintf("%04X", $m)] = sprintf("%04X", hexdec($parts[$n])); | |
| } | |
| } | |
| } | |
| } | |
| function getTextUsingTransformations($texts, $transformations) { | |
| $document = ""; | |
| for ($i = 0; $i < count($texts); $i++) { | |
| $isHex = false; | |
| $isPlain = false; | |
| $hex = ""; | |
| $plain = ""; | |
| for ($j = 0; $j < strlen($texts[$i]); $j++) { | |
| $c = $texts[$i][$j]; | |
| switch($c) { | |
| case "<": | |
| $hex = ""; | |
| $isHex = true; | |
| break; | |
| case ">": | |
| $hexs = str_split($hex, $this->multibyte); // 2 or 4 (UTF8 or ISO) | |
| for ($k = 0; $k < count($hexs); $k++) { | |
| $chex = str_pad($hexs[$k], 4, "0"); // Add tailing zero | |
| if (isset($transformations[$chex])) | |
| $chex = $transformations[$chex]; | |
| $document .= html_entity_decode("&#x".$chex.";"); | |
| } | |
| $isHex = false; | |
| break; | |
| case "(": | |
| $plain = ""; | |
| $isPlain = true; | |
| break; | |
| case ")": | |
| $document .= $plain; | |
| $isPlain = false; | |
| break; | |
| case "\\": | |
| $c2 = $texts[$i][$j + 1]; | |
| if (in_array($c2, array("\\", "(", ")"))) $plain .= $c2; | |
| elseif ($c2 == "n") $plain .= '\n'; | |
| elseif ($c2 == "r") $plain .= '\r'; | |
| elseif ($c2 == "t") $plain .= '\t'; | |
| elseif ($c2 == "b") $plain .= '\b'; | |
| elseif ($c2 == "f") $plain .= '\f'; | |
| elseif ($c2 >= '0' && $c2 <= '9') { | |
| $oct = preg_replace("#[^0-9]#", "", substr($texts[$i], $j + 1, 3)); | |
| $j += strlen($oct) - 1; | |
| $plain .= html_entity_decode("&#".octdec($oct).";", $this->convertquotes); | |
| } | |
| $j++; | |
| break; | |
| default: | |
| if ($isHex) | |
| $hex .= $c; | |
| if ($isPlain) | |
| $plain .= $c; | |
| break; | |
| } | |
| } | |
| $document .= "\n"; | |
| } | |
| return $document; | |
| } | |
| } | |
| ?> |
Hi,
the same thing happens to me as derUli. Some pdf read them well and others throw me weird text.
Example of failure: https://boe.es/boe/dias/2018/12/03/pdfs/BOE-A-2018-16469.pdf
Result:
\���������8(��GH����GH�IHEUHUR�GH��������UHVWULQJLGRV��GH�GLiORJR�FRPSHWLWLYR��GH� OLFLWDFLyQ�FRQ�QHJRFLDFLyQ�\�GH�DVRFLDFLyQ�SDUD�OD�LQQRYDFLyQ��/D�LQWHUYHQFLyQ�GH�OD�0HVD� VHUi�SRWHVWDWLYD�HQ�ORV�SURFHGLPLHQWRV�QHJRFLDGRV�HQ�TXH�QR�VHD�QHFHVDULR�SXEOLFDU� DQXQFLRV�GH�OLFLWDFLyQ��VDOYR�TXH�VH�IXQGDPHQWH�HQ�OD�H[LVWHQFLD�GH�XQD�LPSHULRVD� XUJHQFLD�SUHYLVWD�HQ�HO�DUWtFXOR���������E\f�GH�OD�/H\���������GH���GH�QRYLHPEUH��HQ�HO�TXH� VHUi�REOLJDWRULD��DVt�FRPR�HQ�ORV�SURFHGLPLHQWRV�DELHUWRV�VLPSOLILFDGRV�D�ORV�TXH�VH� UHILHUH�HO�DUWtFXOR�������GH�OD�FLWDGD�OH\��GH�PRGR�TXH�HQ�HVWRV�SURFHGLPLHQWRV�VROR�
Hello.
Your code is useful for me.
But in some documents the plain text result contains parts with random chars.
Example given
"WiIbi tYaeuWZttW/IRIWYtoCWVt[HVt WF33WeH[a WpoaHsWEeYWK33WeH[a WiIbiWtaHeu"
It seems like it tries to convert some binary objects (images) to plain text.