#!/usr/bin/perl
#
# OCR every page of a DjVu file and embed the recognised text.
#
# Usage: <script> FILE.djvu
#
# For each page of FILE.djvu the script:
#   1. extracts the page as PNM (ddjvu), thresholds it (localthresh),
#      cleans it (unpaper) and encodes it as a one-page DjVu (cjb2);
#   2. converts the cleaned page to TIFF (pnmtotiff) and runs tesseract
#      on it to produce hOCR, PDF and plain-text output;
#   3. prints the share of words hunspell accepts, as a rough quality figure;
#   4. converts the hOCR to a djvused script (hocr2djvused) and applies it
#      both to the one-page DjVu and to a copy of the input document;
#   5. appends the one-page DjVu to "<prefix>.bw.djvu" (djvm).
# Finally the per-page PDFs are merged with pdfunite.
#
# Everything is written under /srv/ftp/tmp/<basename of FILE>/, and the
# output of the external tools is appended to "<prefix>.log" in that
# directory.  Intermediate files are left in place (the clean-up calls
# below are commented out).
use strict;
use warnings;

use File::stat;
use File::Basename;
use File::Copy;
use File::Path "make_path";
use DateTime;
use POSIX ();
use POSIX qw(strftime);
use Time::HiRes ();

# Progress lines are printed in two parts ("Page i/n: " ... "left.\n");
# unbuffered output makes the first part visible while the page is processed.
$| = 1;

my $lang          = "bel";      # tesseract language
my $hunspell_dict = "be_BY";    # hunspell dictionary used for the quality figure

if ( $#ARGV < 0 ) {
    print "give a DJVU file as 1st argument \n";
    exit 0;
}

my $inputdjvu = $ARGV[0];
my $basedjvu  = basename($inputdjvu, '.djvu');
my $tmp       = "/srv/ftp/tmp/$basedjvu";

if (-d $tmp) {
    print "$tmp already exists.\n";
}
else {
    # make_path() expects a *reference* for the error option; it stores an
    # array ref of { path => message } hashes in the referenced scalar.
    make_path($tmp, { error => \my $err });
    if (!-d $tmp) {
        my @reasons;
        for my $diag (@{ $err || [] }) {
            my ($path, $message) = %$diag;
            push @reasons, ($path eq '' ? $message : "$path: $message");
        }
        my $details = @reasons ? ' (' . join('; ', @reasons) . ')' : '';
        die "Can not create output dir $tmp$details\n";
    }
}

my $outputprefix = "$tmp/$basedjvu";
my $outputdjvu   = $outputprefix . '.djvu';
my $logfile      = $outputprefix . '.log';
my $bwdjvu       = $outputprefix . '.bw.djvu';

# Make sure the log file exists and is writable: every external command
# below redirects into it, and a failing redirection would stop the shell
# from running the command at all.
open(my $log_fh, '>>', $logfile)
    or die "Can not create log file $logfile: $!\n";
close($log_fh);

copy($inputdjvu, $outputdjvu)
    or die "Can not copy $inputdjvu to $outputdjvu: $!";

my $nbpages = `djvused @{[ shell_quote($inputdjvu) ]} -e 'n'`;
$nbpages = '' unless defined $nbpages;
chomp($nbpages);
# The page count drives the main loop, so refuse anything but an integer.
die "Can not read the page count of $inputdjvu (djvused said '$nbpages')\n"
    unless $nbpages =~ /\A\d+\z/;

my @pdfs;
my $elapsedtime = 0;

for my $i (1 .. $nbpages) {
    my $start = Time::HiRes::time();
    print "Page $i/$nbpages: ";

    my $fileprefix = $outputprefix . '.' . sprintf("%04d", $i);

    # page extraction as an image
    my $splitpagepnm = $fileprefix . '.pnm';
    run_logged($logfile, 'ddjvu', '-format=pnm', "-page=$i",
               $inputdjvu, $splitpagepnm);

    my $threshed = $fileprefix . '.thr.pnm';
    run_logged($logfile, 'localthresh', '-b', 35, '-m', 3, '-n', 'yes',
               $splitpagepnm, $threshed);
    # unlink($splitpagepnm) or warn "Cannot unlink $splitpagepnm: $!";

    my $outpnm = $fileprefix . '.out.pnm';
    run_logged($logfile, 'unpaper', '-v', '--layout', 'none',
               $threshed, $outpnm);
    # unlink($threshed) or warn "Cannot unlink $threshed: $!";

    my $onepagedjvu = $fileprefix . '.djvu';
    run_logged($logfile, 'cjb2', '-clean', $outpnm, $onepagedjvu);

    my $outtiff = $fileprefix . '.out.tiff';
    run_to_file($logfile, $outtiff, 'pnmtotiff', $outpnm);
    # unlink($outpnm) or warn "Cannot unlink $outpnm: $!";

    # OCR: tesseract writes <fileprefix>.hocr, .pdf and .txt
    my $hocr    = $fileprefix . '.hocr.html';
    my $pdffile = $fileprefix . '.pdf';
    run_logged($logfile, 'tesseract', $outtiff, $fileprefix,
               '-l', $lang, 'hocr', 'pdf', 'txt');

    # Only hand existing, non-empty PDFs to pdfunite.
    if (!-e $pdffile) {
        print "$pdffile is missing!\n";
    }
    elsif (-z _) {
        print "$pdffile is empty!\n";
    }
    else {
        push @pdfs, $pdffile;
    }

    # Rough OCR quality: words accepted by hunspell / all words.
    my $q_txt        = shell_quote("$fileprefix.txt");
    my $words        = first_integer(scalar `cat $q_txt | wc -w`);
    my $correctwords = first_integer(
        scalar `hunspell -d $hunspell_dict -G $q_txt | wc -l`);
    print(sprintf("%.0f", $words ? 100 * $correctwords / $words : 0)
          . "% good, ");

    rename($fileprefix . '.hocr', $hocr)
        or warn "Cannot rename $fileprefix.hocr to $hocr: $!";
    unlink($outtiff) or warn "Cannot unlink $outtiff: $!";

    # hOCR -> djvused script, applied to the one-page DjVu first
    my $tempdjvused = $fileprefix . '.djvused.txt';
    run_to_file($logfile, $tempdjvused, 'hocr2djvused', $hocr);
    #unlink($hocr) or warn "Cannot unlink $hocr: $!";
    run_logged($logfile, 'djvused', $onepagedjvu, '-f', $tempdjvused, '-s');

    if ($i == 1) {
        rename($onepagedjvu, $bwdjvu)
            or warn "Cannot rename $onepagedjvu to $bwdjvu: $!";
    }
    else {
        run_logged($logfile, 'djvm', '-i', $bwdjvu, $onepagedjvu);
    }

    # The generated script addresses page 1; retarget it to page $i before
    # applying it to the copy of the full document.
    my $djvused = $fileprefix . ".out.djvused.txt";
    if (retarget_djvused_script($tempdjvused, $djvused, $i)) {
        # unlink($tempdjvused) or warn "Cannot unlink $tempdjvused: $!";
        # print "Writing the text in the DJVU file\n";
        run_logged($logfile, 'djvused', $outputdjvu, '-f', $djvused, '-s');
        # unlink($djvused) or warn "Cannot unlink $djvused: $!";
    }

    # Remaining-time estimate from the average time per page so far.
    $elapsedtime += Time::HiRes::time() - $start;
    my $estimatedduration = $elapsedtime / $i * $nbpages;
    my $timeleft          = int($estimatedduration - $elapsedtime);
    $timeleft = 0 if $timeleft < 0;
    printf("%d:%d:%d left.\n",
           int($timeleft / 3600),
           int($timeleft / 60) % 60,
           $timeleft % 60);
}

if (@pdfs) {
    my $status = run_logged($logfile, 'pdfunite', @pdfs, "$outputprefix.pdf");
    print "pdfunite returned $status\n";
}
else {
    print "No PDF pages were produced; skipping pdfunite\n";
}
#`rm -f -v $pdfs`;

# Quote a string so that a POSIX shell sees it as exactly one word.
sub shell_quote {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'$text'";
}

# Run a command with stdout and stderr appended to $log.
# Returns the raw wait status ($?); warns when it is non-zero.
sub run_logged {
    my ($log, @argv) = @_;
    my $command = join(' ', map { shell_quote($_) } @argv);
    my $status  = system("$command >> " . shell_quote($log) . " 2>&1");
    warn "Command failed (status $status): $command\n" if $status != 0;
    return $status;
}

# Run a command with stdout written to $out (truncating it) and stderr
# appended to $log.  Returns the raw wait status; warns when non-zero.
sub run_to_file {
    my ($log, $out, @argv) = @_;
    my $command = join(' ', map { shell_quote($_) } @argv);
    my $status  = system("$command > " . shell_quote($out)
                         . " 2>> " . shell_quote($log));
    warn "Command failed (status $status): $command\n" if $status != 0;
    return $status;
}

# Return the first run of digits found in $text, or 0 when there is none.
# Used on `wc` output, which may be padded with blanks or be empty.
sub first_integer {
    my ($text) = @_;
    return 0 unless defined $text && $text =~ /(\d+)/;
    return $1;
}

# Copy the djvused script $src to $dst, replacing the "select 1" command
# with "select $page".  Only a command at the start of a line is rewritten,
# so OCR text that happens to contain "select 1" is left alone.
# Returns 1 on success, 0 (after a warning) when a file cannot be handled.
sub retarget_djvused_script {
    my ($src, $dst, $page) = @_;

    my $in;
    if (!open($in, '<', $src)) {
        warn "Cannot read $src: $!\n";
        return 0;
    }
    my $out;
    if (!open($out, '>', $dst)) {
        warn "Cannot write $dst: $!\n";
        close($in);
        return 0;
    }

    while (my $line = <$in>) {
        $line =~ s/^(\s*)select 1\b/${1}select $page/;
        print {$out} $line;
    }

    close($in);
    if (!close($out)) {
        warn "Cannot finish writing $dst: $!\n";
        return 0;
    }
    return 1;
}