#! /bin/sh
eval '(exit $?0)' && eval 'PERL_BADLANG=x;export PERL_BADLANG;: \
;exec perl -x -S -- "$0" ${1+"$@"};#'if 0;
eval 'setenv PERL_BADLANG x;exec perl -x -S -- "$0" $argv:q;#'.q+
#!perl -w
package Htex::pdfboxes;  $0=~/(.*)/s;unshift@INC,'.';do($1);die$@if$@;__END__+if !1;
# This Perl script was generated by JustLib2 at Sun Feb  9 23:57:15 2003.
# Don't touch/remove any lines above; http://www.inf.bme.hu/~pts/justlib
package just; BEGIN{$INC{'just.pm'}='just.pm'}
BEGIN{ $just::VERSION=2 }
sub end(){1}
sub main(){}

BEGIN{$ INC{'strict.pm'}='strict.pm'} {
package strict;
use just;
# by pts@fazekas.hu at Wed Jan 10 12:42:08 CET 2001
require 5.002;
sub bits {
  (grep{'refs'eq$_}@_ && 2)|
  (grep{'subs'eq$_}@_ && 0x200)|
  (grep{'vars'eq$_}@_ && 0x400)|
  ($@ || 0x602)
}
sub import { shift; $^H |= bits @_ }
sub unimport { shift; $^H &= ~ bits @_ }
just::end}

BEGIN{$ INC{'integer.pm'}='integer.pm'} {
package integer;
use just;
# by pts@fazekas.hu at Wed Jan 10 12:42:08 CET 2001
sub import   { $^H |= 1 }
sub unimport { $^H &= ~1 }
just::end}

BEGIN{$ INC{'Pts/string.pm'}='Pts/string.pm'} {
package Pts::string;
# by pts@fazekas.hu at Sat Dec 21 21:32:18 CET 2002
use just;
use integer;
use strict;

#** @param $_[0] a string
#** @param $_[1] index of first bit to return. Bit 128 of byte 0 is index 0.
#** @param $_[2] number of bits to return (<=32)
#** @return an integer (negative on overflow), bit at $_[1] is its MSB
sub get_bits_msb($$$) {
  # assume: use integer;
  my $loop=$_[1];
  my $count=$_[2];
  my $ret=0;
  ($ret+=$ret+(1&(vec($_[0],$loop>>3,8)>>(7-($loop&7)))), $loop++) while $count--!=0;
  $ret
}

#** @param $_[0] a string
#** @return value if $_[0] represents a floating point numeric constant
#**   in the C language (without the LU etc. modifiers) -- or undef. Returns
#**   undef for integer constants
sub c_floatval($) {
  my $S=$_[0];
  no integer; # very important; has local scope
  return 0.0+$S if $S=~/\A[+-]?(?:[0-9]*\.[0-9]+|[0-9]+\.])(?:[eE][+-]?[0-9]+)?\Z(?!\n)/;
  undef
}

#** @param $_[0] a string
#** @return value if $_[0] represents a floating point or integer numeric
#**   constant in the C language (without the LU etc. modifiers) -- or undef
sub c_numval($) {
  my $S=$_[0];
  no integer; # very important; has local scope
  return 0+$S if $S=~/\A[+-]?(?:[0-9]*\.[0-9]+(?:[eE][+-]?[0-9]+)?|[0-9]+\.?)\Z(?!\n)/;
  undef
}

#** @param $_[0] a string
#** @return the integer value of $_[0] in C -- or undef
sub c_intval($) {
  my $S=$_[0];
  my $neg=1;
  $neg=-1 if $S=~s@\A([+-])@@ and '-'eq$1;
  return $neg*hex $1 if $S=~/\A0[xX]([0-9a-fA-F]+)\Z(?!\n)/;
  return $neg*oct $1 if $S=~/\A0([0-7]+)\Z(?!\n)/;
  return $neg*$1     if $S=~/\A([0-9]+)\Z(?!\n)/;
  undef
}

sub import {
  no strict 'refs';
  my $package = (caller())[0];
  shift; # my package
  for my $p (@_ ? @_ : qw{get_bits_msb c_floatval c_numval c_intval}) { *{$package."::$p"}=\&{$p} }
}

just::end}

BEGIN{$ INC{'vars.pm'}='vars.pm'} {
package vars;
use just;
# by pts@fazekas.hu at Wed Jan 10 12:42:08 CET 2001
require 5.002;
sub import {
  my $callpack = caller;
  my ($sym, $ch, $sym9);
  shift;
  for $sym0 (@_) {
    die("Can't declare another package's variables") if $sym0 =~ /::/;
    ($ch, $sym) = unpack('a1a*', $sym0);
    *{"${callpack}::$sym"} =
    (  $ch eq "\$" ? \$   {"${callpack}::$sym"}
     : $ch eq "\@" ? \@   {"${callpack}::$sym"}
     : $ch eq "\%" ? \%   {"${callpack}::$sym"}
     : $ch eq "\*" ? \*   {"${callpack}::$sym"}
     : $ch eq "\&" ? \&   {"${callpack}::$sym"}
     : die("'$ch$sym' is not a valid variable name\n")
    );
  }
}
just::end}

BEGIN{$ INC{'Htex/PDFread.pm'}='Htex/PDFread.pm'} {
package Htex::PDFread;
# by pts@fazekas.hu at Sat Dec 21 21:28:09 CET 2002
use just;
use integer;
use strict;
use Pts::string;
use vars qw($pdf_last_ref0);

my @pdf_classify;
#** @param $_[0] a string in PDF source format
#** @return a rewritten string, or "" if $_[0] is truncated, or undef if
#**   there is a parse error
sub pdf_rewrite($;$) {
  my $explicit_term_p=$_[1];
  my $L=length($_[0]);
  return "" if $L==0;
  my $S="$_[0]\n>>  "; # add sentinel
  my $I=0;
  my $O;
  my $RET="";
  if (!@pdf_classify) {
    # Dat: PDF whitespace(0) is  [\000\011\012\014\015\040]
    # Dat: PDF separators(10) are < > { } [ ] ( ) / %
    # Dat: PDF regular(40) character is any of [\000-\377] which is not whitespace or separator
    @pdf_classify=(40)x256;
    @pdf_classify[ord('<'),ord('>'),ord('{'),ord('}'),ord('['),ord(']'),
      ord('('),ord(')'),ord('/'),ord('%')]=(10,11,12,13,14,15,16,17,18,19);
    @pdf_classify[000,011,012,014,015,040]=(0,0,0,0,0,0);
  }
  while ($I<$L) {
    $O=$pdf_classify[vec($S,$I,8)];
    if ($O==0) { # whitespace
    } elsif (12<=$O and $O<=15) { # one-char token
      $RET.=" ".substr($S,$I,1);
    } elsif ($O==18 or $O==40) { # name or /name
      my $P=0;
      if ($O==18) { $I++; $RET.=" /" } else { $RET.=" "; $P=1 }
      my $T="";
      $T.=chr($O) while $pdf_classify[$O=vec($S,$I++,8)]==40;
      $I--;
      ## die $I;
      $T=~s@([^A-Za-z0-9_.-])@sprintf"#%02x",ord$1@ge; # make name safe
      $RET.=$T;
      return $RET if $P and ($T eq "stream" or $T eq "endobj" or $T eq "startxref");
      next
    } elsif ($O==11) { # `>'
      return "" if ++$I==$L; # only `>' has arrived
      return undef if vec($S,$I,8)!=62; # err(">> expected");
      $RET.=" >>";
    } elsif ($O==16) { # string
      my $T="";
      my $depth=1; $I++;
      while ($I<$L) {
        $O=vec($S,$I++,8); bcont:
        ## print chr($O),":$depth\n";
        if ($O==40) { $depth++ }
        elsif ($O==41) { last unless --$depth }
        elsif ($O==92) { # a backslash
          $O=vec($S,$I++,8);
          if (48<=$O && $O<=55) {
            my $P=$O-48; $O=vec($S,$I++,8);
            if (48<=$O && $O<=55) {
              my $Q=$O-48; $O=vec($S,$I++,8);
              if (48<=$O && $O<=55) { $T.=chr(255&($P<<6|$Q<<3|($O-48))) }
                               else { $T.=chr($P<<3|$Q); goto bcont }
            } else { $T.=chr($P); goto bcont }
          } elsif ($O==110) { $O=10 }
          elsif ($O==114) { $O=13 }
          elsif ($O==116) { $O=9 }
          elsif ($O== 98) { $O=8 }
          elsif ($O==102) { $O=12 }
        }
        $T.=chr($O)
      } # WHILE
      return "" if $depth; # err("unterminated string")
      $T=~s@([^A-Za-z0-9_.-])@sprintf"\\%03o",ord$1@ge; # make string safe
      $RET.=" ($T)"; next
    } elsif ($O==10) { # hex string
      $O=vec($S,++$I,8);
      if ($O==60) { $RET.=" <<"; $I++; next }
      # parse hexadecimal string
      my $half=0x100;
      my $T="";
      while (1) {
        1 until $pdf_classify[$O=vec($S,$I++,8)]; # skip whitespace
        if ($O==62) { $T.=chr($half&0xFF) if $half&0x1000; last } # '>'
        return undef if $pdf_classify[$O]!=40; # err("unexpected token in hex")
        if (65<=$O and $O<=70) { $half+=$O-55 }
        elsif (97<=$O and $O<=102) { $half+=$O-87 }
        elsif (48<=$O and $O<=57) { $half+=$O-48 }
        else { return undef } # err("illegal hex digit")
        if ($half&0x1000) { $T.=chr($half&0xFF); $half=0x100 }
                     else { $half<<=4 }
      }
      $T=~s@([^A-Za-z0-9_.-])@sprintf"\\%03o",ord$1@ge; # make string safe
      $RET.=" ($T)"; next
    } elsif ($O==19) { # single-line comment
      $I++ while ($O=vec($S,$I,8))!=13 && $O!=10;
      ## print STDERR "I=$I L=$L\n";
      next
    } else { return undef } # err("token expected") # $O==11, $O==17
    $I++
  } ## WHILE
  ## print STDERR "XI=$I L=$L\n";
  # die $explicit_term_p;
  return "" if $explicit_term_p;
  ($I>$L) ? "" : $RET
}

# Unit test:
#die unless pdf_rewrite("hello \n\t world\n\t") eq " hello world";
#die unless pdf_rewrite('(hel\)lo\n\bw(or)ld)') eq ' (hel\051lo\012\010w\050or\051ld)';
#die unless pdf_rewrite('(hel\)lo\n\bw(orld)') eq '';
#die unless pdf_rewrite('[ (hel\)lo\n\bw(or)ld)>>') eq ' [ (hel\051lo\012\010w\050or\051ld) >>';
#die unless pdf_rewrite('>') eq "";
#die unless pdf_rewrite('<') eq "";
#die unless pdf_rewrite('< ') eq "";
#die unless !defined pdf_rewrite('< <');
#die unless !defined pdf_rewrite('> >');
#die unless pdf_rewrite('[ (hel\)lo\n\bw(or)ld) <') eq "";
#die unless pdf_rewrite("<\n3\t1\r4f5C5 >]") eq ' (1O\134P) ]';
#die unless pdf_rewrite("<\n3\t1\r4f5C5") eq "";
#die unless !defined pdf_rewrite("<\n3\t1\r4f5C5]>");
#die unless pdf_rewrite("% he te\n<\n3\t1\r4f5C5 >]endobj<<") eq ' (1O\134P) ] endobj';
#die unless pdf_rewrite("") eq "";
#die unless pdf_rewrite("<<") eq " <<";
#die unless pdf_rewrite('%hello') eq '';
#die unless pdf_rewrite("alma\n%korte\n42") eq ' alma 42';
#die unless pdf_rewrite('/Size 42') eq ' /Size 42';
#die "OK";

#** Reads a single PDF indirect object (without its stream) from a PDF file.
#** Does some trivial transformations on it to make later regexp matching
#** easier. Stops at `stream', `endobj' and `startxref'.
#** @param $_[0] a filehandle (e.g \*STDIN), correctly positioned in the PDF
#**   file to the beginning of the object data (i.e just before `5 0 obj')
#** @return string containing PDF source code, or undef on error
sub pdf_read_obj($) {
  my $F=$_[0];  my $L=1024;  my $M;  my $S="";  my $RET;
  while (1) { # read as much data as necessary
    return undef if 0>($M=read $F, $S, $L, length($S));
    $RET=pdf_rewrite($S,1);
    ## print "($S)\n";
    return undef if !defined $RET; # parse error
    return $RET if length $RET; # OK, found object
    return undef if $M==0; # cannot read more, reached EOF
    $L<<=1;
  }
  #$S=~m@[\000\011\012\014\015\040]*(
  #  %[^\r\n]*[\r\n]|
  #  /?[^\000\011\012\014\015\040<>{}\[\]()/%]*(?=[\000\011\012\014\015\040<>{}\[\]()/%])| # unterminated
  #  <<|>>|\{|}|\[|]|
  #  <[a-fA-F0-9\000\011\012\014\015\040]*>| # hex string
  #  \((?:[^\\()]+|\\[\000-\377])*\)| # literal string, the easy way
  #  \( # an unfinished string, needs special care
  #)@gx
}

#** @param $_[0] a filehandle (e.g \*STDIN), containing a PDF file, positioned
#**  just before an `xref' table
#** @param $_[1] an xref table: $_[1][4][56] is the file offset of object 56
#**   from generation 4; will be extended
#** @return the `trailer' section after the `xref'; or undef
sub pdf_read_xref($$) {
  # made much faster at Wed Dec 18 09:50:23 CET 2002
  my $T;
  my $E;
  my $F=$_[0];
  my $XREF=$_[1];
  return undef if 8>read $F, $T, 1024;
  return undef unless $T=~s@\A\s*xref\s+(\d+)\s+(\d+)\s+(?=\S)@@;
  my ($first,$len,$flen);
  while (1) {
    ($first,$len)=($1+0,$2+0);
    ## print " $first + $len\n";
    $flen=($len*=20)-length($T)+20;
    return undef unless $flen<1 or $flen==read $F, $T, $flen, length($T);
    for (my $I=0;$I<$len;$I+=20, $first++) {
      $E=substr($T, $I, 20);
      return undef unless $E=~/\A(\d{10})\s(\d{5})\s([nf])\s\s/;
      ## print "($1 $2 $3)\n";
      $XREF->[$2+0][$first]=$1+0 if $3 eq 'n';
    }
    $E=substr($T, $len);
    last if $E!~s@\A\s*(\d+)\s*(\d+)\s+(?=\S)@@; # next section
    $T=$E;
  }
  
  # die(-length($T)+$len);
  ## die tell($F);
  return undef if length($T)!=$len and !seek $F, -length($T)+$len, 1;
  ## die tell($F);
  return undef unless defined($T=pdf_read_obj($F));
  $XREF->[0][0]=undef if defined $XREF->[0];
  $XREF->[0][0]=$1+0 if $T=~m@ /Prev (\d+)@; # remember /Prev xref table
  return undef unless $T=~m@\A trailer( .*) startxref\Z(?!\n)@s;
  $1
}

$pdf_last_ref0=0;
#** @param $_[0] a filehandle (e.g \*STDIN), containing a PDF file
#** @param $_[1] an xref table: $_[1][4][56] is the file offset of object 56
#**   from generation 4
#** @param $_[2] an object number
#** @param $_[3] a generation number
#** @return PDF source code of the reference, or undef
sub pdf_ref($$$$) {
  my $F=$_[0]; my $XREF=$_[1]; my $ON=$_[2]+0; my $GN=$_[3]+0;
  my $T;
  $pdf_last_ref0=$ON if $GN==0;
  ## print "REF $ON $GN;\n";
  until (ref $XREF->[$GN] and defined ($T=$XREF->[$GN][$ON])) {
    return undef if !ref $XREF->[0] or !defined $XREF->[0][0]; # no /Prev entry, `$ON $GN R' not found
    return undef unless seek $F, $XREF->[0][0], 0;
    return undef if !defined pdf_read_xref($F,$XREF);
  }
  ## print "REF at $T;\n";
  return undef unless seek $F, $T, 0;
  return undef unless defined($T=pdf_read_obj($F));
  ## print "REF=($T);\n";
  return undef unless $T=~s@\A (\d+) (\d+) obj\b(.*) (endobj|stream)\Z(?!\n)@$3@s;
  $T
}

#** Gets a key from a direct dict, and resolves it if it is an indirect object
#** @param $_[0] a filehandle (e.g \*STDIN), containing a PDF file
#** @param $_[1] an xref table: $_[1][4][56] is the file offset of object 56
#**   from generation 4
#** @param $_[2] a PDF source dict (`<< ... >>') or array
#** @param $_[3] a key (`/...')
sub pdf_get($$$$) {
  my $F=$_[0]; my $XREF=$_[1]; my $S=$_[2]; my $KEY=$_[3]; my $POS=0;
  my $DEPTH=0; my $IS_DICT; my $C=0; my $N=0;
  ## print "\n";
  while ($S=~/\G (\S+)/g) {
    $C=vec($1,0,8);  $POS=pos($S);
    ## print "($1) $DEPTH $N\n";
    if ($1 eq '>>' or $1 eq ']') {
      return undef if 0==$DEPTH--;
      last if !$DEPTH;
      $N++ if 1==$DEPTH;
    }
    elsif ($DEPTH==1 and !$IS_DICT and $KEY==$N) { $POS=pos($S)-=length($1)+1; goto do_ret }
    elsif ($1 eq '<<') { $IS_DICT=1 if 0==$DEPTH++ }
    elsif ($1 eq '[') {
      if (0==$DEPTH++) {
        $IS_DICT=0;
        return undef if $KEY!~/\A(\d+)\Z(?!\n)/; # err("non-numeric key in array")
      }
    }
    elsif (0==$DEPTH) { return undef } # not in a composite object
    elsif (1!=$DEPTH) { next }
    elsif (!$IS_DICT) { $N++ }
    elsif ($C==40) { $N++ } # `(': string or bare name
    elsif ($C>=47 and $C<=57) { # '/': /name 0..9: number
      ## print "TRY ($1) KEY=$KEY.\n";
      next if ($N++&1)==1 or $1 ne $KEY;
     do_ret:
      ## print substr($S,pos($S)),";;\n";
      return pdf_ref $F, $XREF, $1, $2 if $S=~/\G (\d+) (\d+) R\b/gc;
      ## print substr($S,pos($S)),"::\n";
      $DEPTH=0;
      while ($S=~/\G( \S+)/g) {
        if ($1 eq ' <<' or $1 eq ' [') { $DEPTH++ }
        elsif ($1 eq ' >>' or $1 eq ' ]') {
          ## die "($1)\n";
          return undef if 0==$DEPTH--; # err("nesting")
          return substr($S,$POS,pos($S)-$POS) if 0==$DEPTH;
        } elsif ($DEPTH==0) { return $1 }
      }
    } else { $N++ } # bare name
  }
  return undef if $POS!=length($S); # err("invalid source dict");
  "" # not found
}

# Unit test:
#die unless pdf_get(\*STDIN, 0, ' [ al makorte 42 ]', 0) eq ' al';
#die unless pdf_get(\*STDIN, 0, ' [ al makorte 42 ]', 1) eq ' makorte';
#die unless pdf_get(\*STDIN, 0, ' [ al makorte 42 ]', 2) eq ' 42';
#die unless pdf_get(\*STDIN, 0, ' [ al makorte 42 ]', 3) eq '';
#die unless pdf_get(\*STDIN, 0, ' [ << >> ]', 0) eq ' << >>';
#die unless pdf_get(\*STDIN, 0, ' [ << >> ]', 1) eq '';
#die unless pdf_get(\*STDIN, 0, ' [ << >> [ al makorte 42 ] ]', 1) eq ' [ al makorte 42 ]';
#die unless pdf_get(\*STDIN, 0, ' << /Alma [ 1 2 ] /Korte [ 3 4 ] >>', '/Korte') eq ' [ 3 4 ]';
#die unless !defined pdf_get(\*STDIN, 0, ' [ al makorte 42 ]', '/Name');
#die unless !defined pdf_get(\*STDIN, 0, ' << al makorte 42 >>', 42);
#die unless pdf_get(\*STDIN, 0, ' << al makorte 42 137 >>', 42) eq ' 137';
#die unless pdf_get(\*STDIN, 0, ' << al makorte >>', 'al') eq "";
#die "OK";

#** Reported boxes: /MediaBox /CropBox /BleedBox /TrimBox /ArtBox
#** @param $_[0] a filehandle (e.g \*STDIN), containing a PDF file
#** @param $_[1] an xref table: $_[1][4][56] is the file offset of object 56
#**   from generation 4
#** @param $_[2] a PDF source dict (`<< ... >>') of /Type/Catalog
#**   /Type/Pages or /Type/Page
#** @param $_[3] hashref to update. $_[3]{BleedBox}[2] will be the URX corner
#**   of the BleedBox
sub pdf_get_boxes($$$$) {
  my $F=$_[0]; my $XREF=$_[1]; my $S=$_[2]; my $bbi=$_[3];
  return if !defined $S;
  for my $name (qw{MediaBox CropBox BleedBox TrimBox ArtBox}) {
    my $box=pdf_get($F, $XREF, $S, "/$name");
    next if !defined $box or !length $box
         or $box!~m@ \[ ([0-9eE.-]+) ([0-9eE.-]+) ([0-9eE.-]+) ([0-9eE.-]+) \]\Z(?!\n)@
         or !defined c_numval($1) or !defined c_numval($2) or !defined c_numval($3) or !defined c_numval($4);
    ($bbi->{LLX},$bbi->{LLY},$bbi->{URX},$bbi->{URY})=($1+0,$2+0,$3+0,$4+0) if $name eq 'MediaBox';
    my $name2="Info.$name";
    ($bbi->{$name2}[0],$bbi->{$name2}[1],$bbi->{$name2}[2],$bbi->{$name2}[3])=($1+0,$2+0,$3+0,$4+0);
  }
}

sub import {
  no strict 'refs';
  my $package=(caller())[0];
  shift;
  for my $p (@_ ? @_ : qw{pdf_get_boxes pdf_get pdf_read_xref pdf_read_obj
    pdf_rewrite pdf_ref}) { *{$package."::$p"}=\&{$p} }
}

just::end}

BEGIN{$  INC{'Htex/pdfboxes.pm'}='Htex/pdfboxes.pm'}

package Htex::pdfboxes;
# pdfboxes.pl -- extract bounding box information from PDFs and update them 
# see perldoc(1) POD documentation later
# by pts@fazekas.hu at Sat Jan 25 14:26:26 2003
# docs at Tue Jan 28 11:29:55 CET 2003
#

use just +1;
use strict;
use integer;
use Htex::PDFread;
BEGIN { $Htex::pdfboxes::VERSION=0.05 }

#** Derived from pdf_read_xref()
#** @param $_[0] a filehandle (e.g \*STDIN), containing a PDF file, positioned
#**  just before an `xref' table
#** @param $_[1] a file offset
#** @param $_[2] number to add to /Size
#** @param $_[3] arrayref containing xref20 intervals
#** @param $_[4] retval for original trailer length
#** @param $_[5] retval for original trailer offset 
#** -param $_[6] arrayref, trailer offsets (not the topmost) will be pushed to
#** @return the `trailer' section after the `xref'; or undef
sub my_read_xref($$$$$$) {
  # made much faster at Wed Dec 18 09:50:23 CET 2002
  my $T;
  my $E;
  my($F,$fofs,$addsize,$xrefs)=@_;
  my($first,$len,$flen);
  my $ret_trailer;
 prev_xref:
  return undef unless seek $F, $fofs, 0;
  return undef if 8>read $F, $T, 1024;
  $fofs+=length($T);
  return undef unless $T=~s@\A\s*xref\s+(\d+)\s+(\d+)\s+(?=\S)@@;
  $fofs-=length($T);
  while (1) {
    ($first,$len)=($1+0,$2+0);
    # Now: ($fofs ... $fofs+20*$len) contains xref enties
    push @$xrefs, $fofs, $fofs+20*$len;
    ## print " $first + $len\n";
    $flen=($len*=20)-length($T)+20; # +20: two-number header of next section
    return undef unless $flen<1 or $flen==read $F, $T, $flen, length($T);
    for (my $I=0;$I<$len;$I+=20, $first++) {
      $E=substr($T, $I, 20);
      return undef unless $E=~/\A(\d{10})\s(\d{5})\s([nf])\s\s/;
      ## print "($1 $2 $3)\n";
      # $XREF->[$2+0][$first]=$1+0 if $3 eq 'n';
    }
    $E=substr($T, $len);
    $fofs+=length($T);
    last if $E!~s@\A\s*(\d+)\s*(\d+)\s+(?=\S)@@; # next section
    $fofs-=length($E);
    $T=$E;
  }
  return undef if length($T)!=$len and !seek $F, -length($T)+$len, 1;
  $fofs+=$len-length($T);
  return undef if 20>read $F, $T, 2048; # read trailer
  $T=~s@%.*@ @g; # assume there are no strings
  return undef unless $T=~s@(\s*>>\s*).*@@s;
  ($_[4],$_[5])=(length($T)+length($1),$fofs) if !defined $ret_trailer;
  # push @{$_[6]}, $fofs+length($1), $fofs+length($1)+length($2)
  push @$xrefs, $fofs+length($1), -($fofs+length($1)+length($2))
    if defined($ret_trailer)
    and $T=~m@\A(\s*trailer\s*<<.*?)(\s*/Prev\s+\d+[\000\011\012\014\015\040]*[<>{}\[\]()/]?)@s;
  return undef if $T!~s/\A\s*trailer\s*<<\s*//;
  if (!defined $ret_trailer) {
    return undef if $T!~s@/Size\s+(\d+)@"/Size ".($1+$addsize)@e;
    ## $T=~y/\r/\n/; die "$T";
    $ret_trailer=$T;
  }
  if ($T=~m@/Prev\s+(\d+)@) { $fofs=$1+0; goto prev_xref }
  $ret_trailer
}

#** Modifies the /Prev value in a PDF trailer, tries not to increase length
#** of $_[0].
#** @param $_[0] string containing a PDF trailer /Prev entry
#** @param $_[1] coderef to calculate modification
#** @return the modified string, or undef
sub modify_prev($$) {
  my($S,$N)=@_;
  my $lenS=length($S);
  my $T=$S;
  return $T unless $T=~s@(/Prev\s+)(\d+)@$1.$N->($2)@e; # preserve spaces
  if (length($T)>$lenS) { # Imp: test this
    $T=$S;
    die unless $T=~s@\s*/Prev\s+(\d+)[\000\011\012\014\015\040]*([<>{}\[\]()/]?)@
      "/Prev ".$N->($1).(length$2 ? $2 : " ")
    @;
    # ^^^ don't check for comments: $T=~s@\s*/Prev\s+(\d+)\s*(?:%.*)*
  }
  die unless length($T)>=$lenS or $T=~s@(/Prev\s+\d+)@$1.(" "x(length($T)-$lenS))@e;
  # Now: length($T)>=$lenS
  $T
}

#** @param $_[0] $bbi hashref
#** @return a multiline dump of all key--value pairs, sorted by key
sub all2($) {
  my $bbi=$_[0];
  my $RET="";
  for my $key (sort keys %$bbi) { if ($key=~/\AInfo[.](\w+Box)\Z(?!\n)/) { #1<length $key
    my $val=$bbi->{$key};
    $RET.="/$1 ".(ref($val)eq'ARRAY' ? "[@$val]\n" : "$val\n");
  } }
  $RET
}

sub usage() {
  die "This is pdfboxes.pl by pts\@fazekas.hu, version $Htex::pdfboxes::VERSION
This program is free software, licensed under the GNU GPL.
This software comes with absolutely NO WARRANTY. Use at your own risk!

I can extract /MediaBox, /CropBox and other bounding box information from PDF
files, and write back this information to the begininning of the PDF. Run this
to get more docs:
        perldoc '$0'

Usage: $0 [<action>] <input.pdf> [<output.pdf>]\n
Action is one of: -- --show\n"
}

just::main;

my $outfilename;
my $infilename;
my $show_p=0;
usage if !@ARGV;
if (@ARGV==1) {
  $show_p=1;
} else {
  if ($ARGV[0] eq '--') { shift @ARGV; }
  elsif ($ARGV[0] eq '--show') { $show_p=1; shift @ARGV; }
  elsif ($ARGV[0]=~/\A-/) { print STDERR "$0: unknown option: $ARGV[0]"; usage }
}
($infilename,$outfilename)=@ARGV; # $outfilename possibly undef
  
# Dat: reading PDF from STDIN not supported, because it is probably unseekable
die "$0: open $infilename: $!\n" if !open F, "< $infilename";
die unless binmode F;

my $head;
# vvv 2048: file header + max length(/Type/pdfboxes)
if (32>read F, $head, 2048) {
  IOerr: die "$0: I/O error in $infilename: $!\n"
}
if ($head!~/\A(%PDF-[!-~]+)/) {
  SYerr: die "$0: syntax error in $infilename. Not a PDF?\n";
}
my $firstline=$1;

my $binary_p=($head=~/\A[^\r\n]+[\r\n]+[ -~]*[^\n\r -~]/) ? 'Binary' : 'Clean7Bit';
#$head=pdf_rewrite($head,1);
$head=~s@\A(.*?)(\bendobj\s*).*@$1@s; # The first obj decides whether it is linearized
my $headprelen=length($1)+length($2);
goto SYerr if $head!~/[\n\r](\d+\s+\d+\s+obj)/g;
my $obj1ofs=pos($head)-length($1);
$head=substr($head,$obj1ofs);
my $page1obj;
my $linearized_p=0;
my $was_pdfboxes=0; # the obj # of the pdfboxes.pl already applied
#if ($head=~s@\A((\d+)\s+\d+\s+obj\s*<<.*?/Type\s*/pdfboxes\b.*?\bendobj\s*)@@s) {
if ($head=~m@\A(\d+)\s+\d+\s+obj\s*<<.*?/Type\s*/pdfboxes\b@s) {
  goto SYerr if !defined $headprelen;
  $obj1ofs=$headprelen; $head="";
  # ^^^ remove previous instance of /pdfboxes
  # ^^^ might not be a real obj at $obj1ofs
  $was_pdfboxes=$1+0;
} elsif ($head=~m@/Linearized\b@ and $head=~m@/O\s+(\d+)@) {
  $linearized_p=1;
  $page1obj=$1+0;
}
goto IOerr if !seek F, -1024, 2 and !seek F, 0, 0;
my $tailofs=tell F;
goto IOerr if 1>read F, $head, 1024;
goto SYerr if $head!~/startxref\s+(\d+)\s*%%EOF\s*\Z(?!\n)/
          and $head!~/startxref\s+(\d+)\s*%%EOF/;
# ^^^ Dat: some PDF files contain binary junk at the end

my $xref_ofs=$1+0;
my $xrefs=[];
my $oldtrailerlen;
my $oldtrailerofs;
# my @trailersofs;
my $trailerx=my_read_xref(\*F, $xref_ofs, $was_pdfboxes==0, $xrefs, $oldtrailerlen, $oldtrailerofs); # , \@trailersofs);
## print "(@trailersofs)\n";
goto SYerr if !defined $trailerx;
goto SYerr if $trailerx!~m@/Size\s+(\d+)@;
my $oldSize=$1-1;

# Imp: remove code duplication with package Htex::ImgBBox
goto IOerr if !seek \*F, $xref_ofs, 0;
my $xref=[];
my $trailer=pdf_read_xref(\*F,$xref);
goto SYerr if !defined $trailer;
my $pages;
my $type;
my $bbi={};
if (!defined $page1obj) { do_pdf_slow:
  my $root=pdf_get(\*F,$xref,$trailer,'/Root');
  goto IOerr if !defined $root; goto SYerr if !length $root;
  $type=pdf_get(\*F,$xref,$root,'/Type');
  goto IOerr if !defined $type; goto SYerr if $type ne ' /Catalog';
  # die $root;
  # vvv Dat: reading xref for /Pages in a linearized PDF is quite slow
  $pages=pdf_get(\*F,$xref,$root,'/Pages');
  goto IOerr if !defined $pages; goto SYerr if !length $pages;
  ## die $pages;
  my $kids;
  while (1) {
    $type=pdf_get(\*F,$xref,$pages,'/Type');
    goto IOerr if !defined $type;
    last if $type ne ' /Pages';
    pdf_get_boxes(\*F, $xref, $pages, $bbi);
    $kids=pdf_get(\*F,$xref,$pages,'/Kids');
    goto IOerr if !defined $kids; goto SYerr if !length $kids;
    ## die $kids;
    $pages=pdf_get(\*F,$xref,$kids,0);
    ## die $pages;
    goto IOerr if !defined $pages; goto SYerr if !length $pages;
  }
  goto SYerr if $type ne ' /Page';
  # Dat: cannot set $page1obj properly here, because it might be a direct object
  $bbi->{'Info.page1obj'}=$Htex::PDFread::pdf_last_ref0;
} else {
  # die $page1obj;
  $pages=pdf_ref(\*F, $xref, $page1obj, 0);
  goto IOerr if !defined $pages;
  $type=pdf_get(\*F,$xref,$pages,'/Type');
  goto IOerr if !defined $type;
  goto SYerr if $type ne ' /Page';
  my $mediabox=pdf_get(\*F,$xref,$pages,'/MediaBox');
  goto IOerr if !defined $mediabox;
  goto do_pdf_slow if !length $mediabox;
}
pdf_get_boxes(\*F, $xref, $pages, $bbi);
print all2($bbi) if $show_p;

# -- write it back

my @tmpfiles;
END { unlink @tmpfiles }

if (defined $outfilename) {
  my $inplace_p=0;
  
  if ($outfilename eq $infilename) {
    # die "$0: cannot modify file in place: $outfilename\n"
    $inplace_p=1;
    do { $outfilename.=".tmp.$$" } while -e $outfilename;
    push @tmpfiles, $outfilename;
  }
  if ($outfilename eq '-') {
    die unless open O, ">&STDOUT";
  } else {
    die "$0: open_w $outfilename: $!\n" if !open O, "> $outfilename";
  }
  die unless binmode O;
  # Dat: $newobj destroys the /Linearized property, but never mind.
  # my $newobj="$oldSize 0 obj\n<</Type/pdfboxes\n/Kis (".scalar(localtime).")>>\nendobj\n";
  my $newobj="$oldSize 0 obj\n<</Type/pdfboxes%\n".all2($bbi).">>\nendobj\n";

  my $outhead="$firstline\n";
  $outhead.="%\307\354\217\242\n" if $binary_p;
  my $delta=length($outhead)+length($newobj)-$obj1ofs;
  my $newxrefentry=sprintf"%d 1\n%010d 00000 n \n",
    $oldSize, length($outhead);
  my $newtrailer="trailer\n<<\n$trailerx\n>>\n";
  my $newtrailerlen;
  my $trailerdelta;

  do { # iterate until lengths are right
    $newtrailerlen=length($newtrailer);
    $trailerdelta=length($newxrefentry)+$newtrailerlen-$oldtrailerlen;
    $newtrailer=modify_prev($newtrailer, sub { $_[0]+$delta+($_[0]>$xref_ofs ? $trailerdelta : 0) } );
  } until length($newtrailer)==$newtrailerlen;

  if (!print O $outhead, $newobj) {
    OOerr: die "$0: write $outfilename: $!\n"
  }

  # die unless seek F, $obj1ofs, 0;
  my %xrefs=@$xrefs;
  ## print STDERR "@$xrefs\n";
  #** F has been dumped 0...$doneofs
  my $doneofs=$obj1ofs;
  goto IOerr unless seek F, $doneofs, 0;
  for my $from (sort {$a<=>$b} keys %xrefs) {
    ## print STDERR "xrefs $from $xrefs{$from} ($doneofs)\n";
    my $to=$xrefs{$from};
    
    # Copy
    die "$0: inconsistent offsets: $doneofs > $from" if $doneofs>$from;
    my $O=$from-4096;
    my $S;
    while ($doneofs<$O) {
      goto IOerr unless 4096==read F, $S, 4096;
      goto OOerr unless print O $S;
      $doneofs+=4096;
    }
    if ($doneofs<$from) {
      goto IOerr unless $from-$doneofs==read F, $S, $from-$doneofs;
      goto OOerr unless print O $S;
      $doneofs=$from;
    }
    if ($to<0) { # /Prev correction for the 2nd, 3rd etc. /Prevs
      # Example file: examples/lm2.pdf
      ## print STDERR "/Prev correction $from ... $to ($doneofs)\n";
      goto IOerr unless -$to-$doneofs==read F, $S, -$to-$doneofs;
      $S=modify_prev($S, sub { $_[0]+$delta+($_[0]>$xref_ofs ? $trailerdelta : 0) } );
      die "$0: modified /Prev too long\n" if length($S)!=-$to-$doneofs;
      goto OOerr unless print O $S;
      $doneofs=-$to;
    } else { # xref offset correction
      die "$0: inconsistent xref interval: $from > $to " if $from>$to or 0!=($to-$from)%20;
      while ($doneofs<$to) { # Imp: speed up with larger read bursts?
        goto IOerr unless 20==read F, $S, 20;
        goto SYerr unless $S=~/\A\d{10} \d{5} [nf]\s\s/;
        if ($was_pdfboxes and $doneofs==$oldtrailerofs-20) {
          $S=substr($newxrefentry,-20);
        } else {
          my $ofs=0+substr($S,0,10);
          $ofs+=$trailerdelta if $ofs>$xref_ofs;
          substr($S,0,10)=sprintf("%010d",$ofs+$delta);
        }
        goto OOerr unless print O $S;
        $doneofs+=20;
      }
      if ($doneofs==$oldtrailerofs) { # the xref entry we modify the trailer of
        ## print "$doneofs -> $oldtrailerofs\n";
        goto IOerr unless seek F, $oldtrailerlen, 1;
        $doneofs+=$oldtrailerlen;
        goto OOerr unless print O ($was_pdfboxes==0 ? $newxrefentry : ""), $newtrailer;
      }
    }
  }

  my $extra_trailer="trailer\n<<\n>>\n";
  $extra_trailer="" if $doneofs==$oldtrailerofs+$oldtrailerlen;
  { my $O=$tailofs-4096;
    my $S;
    while ($doneofs<$O) {
      goto IOerr unless 4096==read F, $S, 4096;
      goto OOerr unless print O $S;
      $doneofs+=4096;
    }
    if ($doneofs<$tailofs) {
      goto IOerr unless $tailofs-$doneofs==read F, $S, $tailofs-$doneofs;
      goto OOerr unless print O $S;
      $doneofs=$tailofs;
    }
    goto IOerr if 16>read(F, $S, 1025) or length($S)>1024;
    # ^^^ $tailofs is at most 1024 bytes before EOF
    goto SYerr unless $S=~s@
      ((?:\btrailer\s*<<.*?>>\s*)?)  startxref\s+(\d+)\s*%%EOF
      ( \s*\Z(?!\n) | )
    @@sx;
    # @length($1)?($extra_trailer=""):""@sex;
    goto OOerr unless print O $S;
  }

  die unless printf O "${extra_trailer}startxref\n%d\n%%%%EOF\n", $xref_ofs+$delta;
  die unless close O;
  if ($inplace_p) {
    die "$0: rename $outfilename -> $infilename: $!\n" if !rename $outfilename, $infilename;
  }   
}

just::end __END__

=begin man

.ds pts-dev \*[.T]
.do if '\*[.T]'ascii'  .ds pts-dev tty
.do if '\*[.T]'ascii8' .ds pts-dev tty
.do if '\*[.T]'latin1' .ds pts-dev tty
.do if '\*[.T]'nippon' .ds pts-dev tty
.do if '\*[.T]'utf8'   .ds pts-dev tty
.do if '\*[.T]'cp1047' .ds pts-dev tty
.do if '\*[pts-dev]'tty' \{\
.ll 79
.pl 33333v
.nr IN 2n
.\}
.ad n

=end

=head1 NAME

pdfboxes.pl - extract bounding box information from PDFs and update them 

=head1 SYNOPSIS

C<B<pdfboxes.pl>>
 S<[ C<-->>
 S<| C<--show> ]>
 S<I<input.pdf> I<output.pdf>>

=head1 DESCRIPTION

pdfboxes.pl is a standalone Perl script that can extract /MediaBox,
/CropBox and other bounding box information from PDF files, and
write back this information to the begininning of the PDF, so other programs,
such as pdftex.def of the graphicx package of LaTeX can read the bounding
box by simply reading the first few lines of the PDF output of pdfboxes.pl.

To get the bounding box from other file formats, use img_bbox.pl from the same
author.

=head1 COMMENTS

Breaks a linearized PDF (as defined by Appendix F of PDFRef.pdf) into a
non-linearized one.
