<?php
/**
 * Decode an ASCIIHexDecode-encoded PDF stream.
 *
 * Pairs of hexadecimal digits (either case) are turned into bytes. NUL, tab,
 * CR, FF, LF and space are skipped, and a '%' starts a comment that runs to
 * the next CR or LF. Decoding stops at the '>' end-of-data marker; if a single
 * digit is left over at that point it is treated as the high nibble of a final
 * byte whose low nibble is 0.
 *
 * @param string $input Encoded data, including the terminating '>'.
 * @return string Decoded bytes, or "" when an invalid character is met or the
 *                '>' marker is missing.
 */
function decodeAsciiHex ( $input )
{
    $output = "";
    $pendingHigh = null; // high nibble waiting for its partner, null when none
    $isComment = false;
    $length = strlen ( $input );
    for ( $i = 0 ; $i < $length && $input [ $i ] !== '>' ; $i++ )
    {
        $c = $input [ $i ];
        if ( $isComment )
        {
            // Double quotes are required here: '\r' in single quotes is the
            // two-character string backslash + r and can never equal $c.
            if ( $c === "\r" || $c === "\n" )
                $isComment = false;
            continue;
        }
        if ( $c === "\0" || $c === "\t" || $c === "\r" || $c === "\f" || $c === "\n" || $c === ' ' )
            continue;
        if ( $c === '%' )
        {
            $isComment = true;
            continue;
        }
        // The position of the digit in this table is its numeric value.
        $code = stripos ( "0123456789abcdef", $c );
        if ( $code === false )
            return "";
        if ( $pendingHigh === null )
            $pendingHigh = $code;
        else
        {
            $output .= chr ( $pendingHigh * 16 + $code );
            $pendingHigh = null;
        }
    }
    // The loop ran off the end of the input: the '>' marker never showed up.
    if ( $i >= $length )
        return "";
    // Odd number of digits: behave as if a trailing 0 had been supplied.
    if ( $pendingHigh !== null )
        $output .= chr ( $pendingHigh * 16 );
    return $output;
}
/**
 * Decode an ASCII85Decode-encoded PDF stream.
 *
 * Every group of five characters in the range '!'..'u' is a base-85 number
 * that yields four bytes; a lone 'z' between groups stands for four zero
 * bytes. NUL, tab, CR, FF, LF and space are skipped. Decoding stops at the
 * first '~' (start of the "~>" end marker) or at the end of the input. A final
 * partial group of n characters (n >= 2) yields n - 1 bytes.
 *
 * Note that '%' is a regular base-85 digit (value 4) and is decoded as data,
 * not treated as a comment introducer.
 *
 * @param string $input Encoded data, optionally followed by "~>".
 * @return string Decoded bytes, or "" when a character outside '!'..'u' is
 *                met or the final group consists of a single character.
 */
function decodeAscii85 ( $input )
{
    $output = "";
    $ords = array ();
    $state = 0; // number of digits collected for the current group
    $length = strlen ( $input );
    for ( $i = 0 ; $i < $length && $input [ $i ] !== '~' ; $i++ )
    {
        $c = $input [ $i ];
        // Double-quoted escapes: single-quoted '\n' would be backslash + n.
        if ( $c === "\0" || $c === "\t" || $c === "\r" || $c === "\f" || $c === "\n" || $c === ' ' )
            continue;
        if ( $c === 'z' && $state === 0 )
        {
            $output .= str_repeat ( chr ( 0 ), 4 );
            continue;
        }
        $code = ord ( $c ) - 33; // 33 is ord ( '!' ), the digit for zero
        if ( $code < 0 || $code > 84 )
            return "";
        $ords [ $state++ ] = $code;
        if ( $state === 5 )
        {
            $state = 0;
            for ( $sum = 0, $j = 0 ; $j < 5 ; $j++ )
                $sum = $sum * 85 + $ords [ $j ];
            for ( $j = 3 ; $j >= 0 ; $j-- )
                $output .= chr ( ( $sum >> ( $j * 8 ) ) & 0xFF );
        }
    }
    // One leftover digit cannot encode even a single byte.
    if ( $state === 1 )
        return "";
    if ( $state > 1 )
    {
        // Pad the partial group with the highest digit ('u' = 84); the bytes
        // that depend on the padding are the ones that get dropped below.
        for ( $j = $state ; $j < 5 ; $j++ )
            $ords [ $j ] = 84;
        for ( $sum = 0, $j = 0 ; $j < 5 ; $j++ )
            $sum = $sum * 85 + $ords [ $j ];
        for ( $j = 0 ; $j < $state - 1 ; $j++ )
            $output .= chr ( ( $sum >> ( ( 3 - $j ) * 8 ) ) & 0xFF );
    }
    return $output;
}
/**
 * Inflate a zlib-compressed stream.
 *
 * @param string $input Compressed data.
 * @return string|false Whatever gzuncompress() returns: the inflated data, or
 *                      false when the input cannot be uncompressed (the
 *                      warning gzuncompress() raises is suppressed).
 */
function decodeFlate ( $input )
{
    $inflated = @gzuncompress ( $input );

    return $inflated;
}
/**
 * Extract the entries of the first "<< ... >>" dictionary found in an object.
 *
 * The dictionary text is split on "/". Each resulting token is whitespace-
 * normalised; a token of the form "Name value ..." is stored as
 * Name => first value word, and a bare "Name" is stored as Name => true.
 * Array brackets at either end of a token are stripped, so
 * "/Filter[/ASCII85Decode /FlateDecode]" yields the keys "Filter",
 * "ASCII85Decode" and "FlateDecode" rather than "Filter[" and "FlateDecode]".
 *
 * Limitation: the match stops at the first ">>", so entries that follow a
 * nested dictionary are not seen.
 *
 * @param string $object Raw text of a PDF object.
 * @return array Map of entry name => value word or true; empty when the
 *               object contains no dictionary.
 */
function getObjectOptions ( $object )
{
    $options = array ();
    if ( !preg_match ( "#<<(.*)>>#ismU", $object, $match ) )
        return $options;

    $tokens = explode ( "/", $match [ 1 ] );
    // Whatever precedes the first "/" is not an entry.
    array_shift ( $tokens );
    foreach ( $tokens as $token )
    {
        $token = preg_replace ( "#\s+#", " ", trim ( $token ) );
        // Drop array delimiters left hanging on the token by the "/" split.
        $token = trim ( $token, "[] " );
        if ( $token === "" )
            continue;
        if ( strpos ( $token, " " ) !== false )
        {
            $parts = explode ( " ", $token );
            $options [ $parts [ 0 ] ] = $parts [ 1 ];
        }
        else
            $options [ $token ] = true;
    }
    return $options;
}
/**
 * Apply the decoders named in an object's options to its stream.
 *
 * When the options carry no non-empty "Filter" entry the stream is returned
 * untouched. Otherwise the stream is first cut to "Length" bytes (the whole
 * stream when "Length" is absent or empty) and every option key is checked,
 * in the order the keys appear, against the three supported filter names;
 * each hit runs the matching decoder on the current data.
 *
 * @param string $stream  Raw stream contents.
 * @param array  $options Option map as produced by getObjectOptions().
 * @return mixed The stream after all matching decoders have run; the type is
 *               whatever the last decoder returned.
 */
function getDecodedStream ( $stream, $options )
{
    // Unfiltered stream: hand it back as it is.
    if ( empty ( $options [ "Filter" ] ) )
        return $stream;

    if ( !empty ( $options [ "Length" ] ) )
        $length = $options [ "Length" ];
    else
        $length = strlen ( $stream );
    $decoded = substr ( $stream, 0, $length );

    // Filter name => decoding function, checked in this order for every key.
    $decoders = array (
        "ASCIIHexDecode" => "decodeAsciiHex",
        "ASCII85Decode" => "decodeAscii85",
        "FlateDecode" => "decodeFlate"
    );
    foreach ( array_keys ( $options ) as $name )
    {
        foreach ( $decoders as $filter => $decoder )
        {
            if ( $name == $filter )
                $decoded = $decoder ( $decoded );
        }
    }
    return $decoded;
}
/**
 * Collect the raw text operands found in a list of text containers.
 *
 * For each container the first pattern that matches wins: either the contents
 * of every "[ ... ]" array followed by TJ, or, failing that, every "( ... )"
 * string sitting between Td and Tj. The captured operands are appended to
 * $texts unprocessed.
 *
 * @param array $texts          Accumulator, extended in place.
 * @param array $textContainers List of content-stream fragments, indexed from 0.
 * @return void
 */
function getDirtyTexts ( &$texts, $textContainers )
{
    $patterns = array (
        "#\[(.*)\]\s*TJ#ismU",
        "#Td\s*(\(.*\))\s*Tj#ismU"
    );
    $total = count ( $textContainers );
    for ( $index = 0 ; $index < $total ; $index++ )
    {
        $container = $textContainers [ $index ];
        foreach ( $patterns as $pattern )
        {
            if ( preg_match_all ( $pattern, $container, $found ) )
            {
                $texts = array_merge ( $texts, $found [ 1 ] );
                // Only the first matching pattern contributes for a container.
                break;
            }
        }
    }
}
/**
 * Harvest character-code mappings from the bfchar / bfrange sections of a
 * decoded stream and add them to $transformations.
 *
 * Each section is read line by line (LF, CR or CRLF), at most as many lines
 * as the count announced in front of the section keyword:
 *  - bfchar  "<src> <dst>"           : src, right-padded with "0" to four
 *                                      characters, maps to dst as written;
 *  - bfrange "<lo> <hi> <dst>"       : lo..hi map to dst, dst+1, ...;
 *  - bfrange "<lo> <hi> [<d1> ...]"  : lo, lo+1, ... map to d1, d2, ...
 * Range keys and values are formatted as four upper-case hex digits.
 *
 * @param array  $transformations Map of source code => destination hex,
 *                                extended in place.
 * @param string $stream          Decoded stream contents.
 * @return void
 */
function getCharTransformations ( &$transformations, $stream )
{
    preg_match_all ( "#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER );
    preg_match_all ( "#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER );
    foreach ( $chars as $section )
    {
        $lines = preg_split ( "#\r\n|\r|\n#", trim ( $section [ 2 ] ) );
        $limit = min ( (int) $section [ 1 ], count ( $lines ) );
        for ( $k = 0 ; $k < $limit ; $k++ )
        {
            if ( preg_match ( "#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim ( $lines [ $k ] ), $map ) )
                $transformations [ str_pad ( $map [ 1 ], 4, "0" ) ] = $map [ 2 ];
        }
    }
    foreach ( $ranges as $section )
    {
        $lines = preg_split ( "#\r\n|\r|\n#", trim ( $section [ 2 ] ) );
        $limit = min ( (int) $section [ 1 ], count ( $lines ) );
        for ( $k = 0 ; $k < $limit ; $k++ )
        {
            $line = trim ( $lines [ $k ] );
            // Both patterns are anchored at the start of the line. Unanchored,
            // the three-token form would also match three consecutive
            // destinations inside a "[ ... ]" list and misread them as a range.
            if ( preg_match ( "#^<([0-9a-f]{1,4})>\s+<([0-9a-f]{1,4})>\s+<([0-9a-f]{1,4})>#is", $line, $map ) )
            {
                $from = hexdec ( $map [ 1 ] );
                $to = hexdec ( $map [ 2 ] );
                $_from = hexdec ( $map [ 3 ] );
                for ( $m = $from, $n = 0 ; $m <= $to ; $m++ , $n++ )
                    $transformations [ sprintf ( "%04X", $m ) ] = sprintf ( "%04X", $_from + $n );
            }
            elseif ( preg_match ( "#^<([0-9a-f]{1,4})>\s+<([0-9a-f]{1,4})>\s+\[(.*)\]#ismU", $line, $map ) )
            {
                $from = hexdec ( $map [ 1 ] );
                $to = hexdec ( $map [ 2 ] );
                $parts = preg_split ( "#\s+#", trim ( $map [ 3 ] ) );
                $available = count ( $parts );
                for ( $m = $from, $n = 0 ; $m <= $to && $n < $available ; $m++ , $n++ )
                {
                    // Hand hexdec() the digits only, without the "<" ">" delimiters.
                    $digits = trim ( $parts [ $n ], "<>" );
                    if ( $digits === "" )
                        continue;
                    $transformations [ sprintf ( "%04X", $m ) ] = sprintf ( "%04X", hexdec ( $digits ) );
                }
            }
        }
    }
}
/**
 * Turn raw text operands into readable text.
 *
 * Each operand is scanned for "( ... )" literal strings and "< ... >" hex
 * strings; everything outside of them (kerning numbers and the like) is
 * ignored. One line, terminated by "\n", is produced per operand.
 *
 * Literal strings: balanced nested parentheses are kept; the escapes \n \r \t
 * \b \f \\ \( \) produce the corresponding character; a backslash followed by
 * one to three octal digits produces the character with that code (low eight
 * bits); a backslash followed by a line break is a line continuation; a
 * backslash before any other character is dropped and the character kept.
 *
 * Hex strings: the digits are cut into groups of four (the last one
 * right-padded with "0"), each group is looked up in $transformations (as
 * written, then upper-cased, then lower-cased) and the result - or the group
 * itself when unmapped - is decoded four hex digits at a time as a code point.
 *
 * @param array $texts           Raw operands, e.g. from getDirtyTexts().
 * @param array $transformations Map of source code => destination hex, e.g.
 *                               from getCharTransformations().
 * @return string Extracted text.
 */
function getTextUsingTransformations ( $texts, $transformations )
{
    // "\b" is not an escape sequence in PHP strings, hence "\x08".
    $escapes = array (
        "n" => "\n",
        "r" => "\r",
        "t" => "\t",
        "b" => "\x08",
        "f" => "\f",
        "\\" => "\\",
        "(" => "(",
        ")" => ")"
    );
    $document = "";
    foreach ( $texts as $text )
    {
        $depth = 0; // parenthesis nesting level, 0 = outside a literal string
        $isHex = false;
        $hex = "";
        $plain = "";
        $length = strlen ( $text );
        for ( $j = 0 ; $j < $length ; $j++ )
        {
            $c = $text [ $j ];
            if ( $depth > 0 )
            {
                // Inside a literal string.
                if ( $c === "\\" )
                {
                    // A backslash ending the operand has nothing to escape.
                    if ( $j + 1 >= $length )
                        continue;
                    $c2 = $text [ $j + 1 ];
                    if ( isset ( $escapes [ $c2 ] ) )
                    {
                        $plain .= $escapes [ $c2 ];
                        $j++;
                    }
                    elseif ( preg_match ( "#^[0-7]{1,3}#", substr ( $text, $j + 1, 3 ), $octal ) )
                    {
                        $plain .= html_entity_decode ( "&#" . ( octdec ( $octal [ 0 ] ) & 0xFF ) . ";", ENT_QUOTES, "UTF-8" );
                        $j += strlen ( $octal [ 0 ] );
                    }
                    elseif ( $c2 === "\r" || $c2 === "\n" )
                    {
                        // Line continuation: swallow the line break (CRLF counts as one).
                        $j++;
                        if ( $c2 === "\r" && $j + 1 < $length && $text [ $j + 1 ] === "\n" )
                            $j++;
                    }
                    // Any other character: the backslash alone is dropped and
                    // the character is handled by the next iteration.
                }
                elseif ( $c === "(" )
                {
                    $depth++;
                    $plain .= $c;
                }
                elseif ( $c === ")" )
                {
                    $depth--;
                    if ( $depth === 0 )
                        $document .= $plain;
                    else
                        $plain .= $c;
                }
                else
                    $plain .= $c;
                continue;
            }
            if ( $isHex )
            {
                if ( $c !== ">" )
                {
                    $hex .= $c;
                    continue;
                }
                $isHex = false;
                // Anything that is not a hex digit (e.g. whitespace) is ignored.
                $hex = preg_replace ( "#[^0-9a-f]#i", "", $hex );
                if ( $hex === "" )
                    continue;
                foreach ( str_split ( $hex, 4 ) as $code )
                {
                    $code = str_pad ( $code, 4, "0" );
                    foreach ( array ( $code, strtoupper ( $code ), strtolower ( $code ) ) as $key )
                    {
                        if ( isset ( $transformations [ $key ] ) )
                        {
                            $code = $transformations [ $key ];
                            break;
                        }
                    }
                    // A destination may hold several four-digit units.
                    foreach ( str_split ( $code, 4 ) as $unit )
                        $document .= html_entity_decode ( "&#x" . $unit . ";", ENT_QUOTES, "UTF-8" );
                }
                continue;
            }
            // Outside of any string.
            if ( $c === "(" )
            {
                $plain = "";
                $depth = 1;
            }
            elseif ( $c === "<" )
            {
                $hex = "";
                $isHex = true;
            }
            elseif ( $c === "\\" )
                $j++; // skip the character after a stray backslash
        }
        $document .= "\n";
    }
    return $document;
}
/**
 * Extract the text of a PDF file.
 *
 * Every "obj ... endobj" section that contains a stream is examined. Objects
 * whose dictionary has a non-empty Length1, Type or Subtype entry are skipped.
 * The remaining streams are decoded; a decoded stream that contains marked
 * content blocks of the form " /P <</MCID n>> BDC ... EMC " contributes one
 * line of text per block, any other decoded stream is searched for character
 * mappings. Finally the collected text operands are converted with the
 * collected mappings.
 *
 * @param string $filename Path of the PDF file.
 * @return string Extracted text, or "" when the file cannot be read or is empty.
 */
function pdf2text ( $filename )
{
    // The second parameter of file_get_contents() is the use_include_path
    // flag, so the FILE_BINARY constant formerly passed here had no effect.
    // Read errors are silenced on purpose: an unreadable file yields "".
    $infile = @file_get_contents ( $filename );
    if ( empty ( $infile ) )
        return "";
    $transformations = array ();
    $texts = array ();
    preg_match_all ( "#obj(.*)endobj#ismU", $infile, $objects );
    $objects = isset ( $objects [ 1 ] ) ? $objects [ 1 ] : array ();
    foreach ( $objects as $currentObject )
    {
        if ( !preg_match ( "#stream(.*)endstream#ismU", $currentObject, $stream ) )
            continue;
        $stream = ltrim ( $stream [ 1 ] );
        $options = getObjectOptions ( $currentObject );
        if ( !( empty ( $options [ "Length1" ] ) && empty ( $options [ "Type" ] ) && empty ( $options [ "Subtype" ] ) ) )
            continue;
        $data = getDecodedStream ( $stream, $options );
        // A failed decode yields "" or false; neither carries any text.
        if ( !is_string ( $data ) || $data === "" )
            continue;
        // Split on marked-content blocks rather than on BT ... ET: the latter
        // cut badly around line breaks (e.g. "Tel CRLF : CRLF 0143507794 CRLF").
        if ( preg_match_all ( "# /p <</MCID [0-9]{1,2}>> BDC (.*) EMC #ismU", $data, $textContainers ) )
        {
            // Remove the BT ... ET wrappers, keeping what they enclose.
            $textContainers = preg_replace ( "#BT(.*)ET#ismU", "$1", $textContainers [ 1 ] );
            if ( !is_array ( $textContainers ) )
                $textContainers = array ();
            // Blank out every "TJ" (in any letter case) except the last
            // upper-case one, so that each block ends up as a single line.
            // NOTE(review): this also blanks "Tj" operators and any "tj"
            // inside string operands, exactly as the regex it replaces did.
            // Own loop variables are used here: sharing the object loop's
            // counter made the scan over the objects jump around.
            foreach ( $textContainers as $key => $container )
            {
                $last = strrpos ( $container, "TJ" );
                if ( $last === false )
                    $textContainers [ $key ] = str_ireplace ( "TJ", " ", $container );
                else
                    $textContainers [ $key ] = str_ireplace ( "TJ", " ", (string) substr ( $container, 0, $last ) )
                        . "TJ"
                        . str_ireplace ( "TJ", " ", (string) substr ( $container, $last + 2 ) );
            }
            getDirtyTexts ( $texts, array_values ( $textContainers ) );
        }
        else
            getCharTransformations ( $transformations, $data );
    }
    return getTextUsingTransformations ( $texts, $transformations );
}
?>
|