#!/bin/bash
#
# convert scans to .pdf, expects *.tiff, assumes 600 dpi
#

# tiff_findskew is from http://sourceforge.net/projects/pagetools
# the rest of the toolchain is part of debian since at least etch

# External programs this script shells out to.
# NOTE(review): bare "convert" and "pdfunite" are listed but never invoked in
# the visible script (all conversions go through "gm convert") -- confirm
# whether they are still required before trimming the list.
tools="convert gm mmv tiff_findskew pnmrotate tiffcp xmessage tesseract pdfunite"
missing=0

# Print one dot per tool checked; name every tool that cannot be found.
echo -n "checking kit "
# $tools is intentionally unquoted: it is a space-separated list.
for item in $tools ; do
  echo -n "."
  # command -v is the shell's own lookup; unlike which(1) it does not depend
  # on an external binary and has a well-defined exit status.
  if ! command -v "$item" > /dev/null 2>&1 ; then
    echo " missing $item "
    missing=1
  fi
done

# Refuse to start processing with an incomplete toolchain.
if [ "$missing" -eq 1 ] ; then
  echo "missing tools, exiting"
  exit 1
else
  echo " OK"
fi

# Recompress every scan in place from uncompressed to ZIP (tiffcp "zip:2").
echo -n "compressing originals with ZIP: "
for i in *.tiff ; do
  # An unmatched glob stays as the literal string "*.tiff"; skip it.
  [ -e "$i" ] || continue
  short=${i%.tiff}
  echo -n "$short "
  # Replace the scan only if tiffcp succeeded. No backup exists yet at this
  # point, so moving a truncated compressed.tiff over the scan would destroy
  # the only copy.
  if tiffcp -c zip:2 "$i" compressed.tiff ; then
    mv compressed.tiff "$i"
  else
    rm -f compressed.tiff
    echo "(tiffcp failed for $i, left unchanged) " >&2
  fi
done
echo "done."

# Back up the scans before the in-place processing steps below, in case one
# of them goes wrong.
echo -n "backing up original scan files: "
# Abort when the archive cannot be written (this includes the case where no
# *.tiff file exists): everything after this point overwrites the scans.
if ! tar cpf backed_up_original_scans.tar *.tiff ; then
  echo "FAILED"
  echo "could not create backed_up_original_scans.tar, exiting" >&2
  exit 1
fi
echo "backed_up_original_scans.tar DONE"


# Normalize, despeckle and reduce every scan to bilevel, in place.
echo -n "normalizing & despeckling: "
for i in *.tiff ; do
  # An unmatched glob stays as the literal string "*.tiff"; skip it.
  [ -e "$i" ] || continue
  short=${i%.tiff}
  echo -n "$short "
  # Write to a scratch file first; the scan is replaced only on success.
  gm convert "$i" -normalize -despeckle +dither -type bilevel out.tiff && mv out.tiff "$i"
done

echo

# Deskew every scan in place: measure the skew angle, rotate a PNM copy by
# that angle, then crop the result back into the original file.
echo -n "deskewing original scans: "
for item in *.tiff ; do
  # An unmatched glob stays as the literal string "*.tiff"; skip it.
  [ -e "$item" ] || continue
  short=${item%.tiff}
  echo -n "$short "
  skew=$(tiff_findskew "$item")
  # Without an angle pnmrotate would take the input file name as the angle.
  if [ -z "$skew" ] ; then
    echo "(no skew angle for $item, left unchanged) " >&2
    continue
  fi
  deskew_in="deskew_in_${RANDOM}.pnm"
  deskew_out="deskew_out_${RANDOM}.pnm"
  # Chain the steps so a failure part-way through never reaches the final
  # convert, which overwrites the scan.
  # NOTE(review): the crop geometry is hard-coded; presumably it matches the
  # page size of the 600 dpi scans -- confirm.
  if ! { gm convert -monochrome "$item" "$deskew_in" \
      && pnmrotate -noantialias "$skew" "$deskew_in" > "$deskew_out" \
      && gm convert -monochrome -crop 4910x7164+25+25 "$deskew_out" "$item" ; } ; then
    echo "(deskew failed for $item) " >&2
  fi
  rm -f "$deskew_in" "$deskew_out"
done
echo

# Recompress every (now bilevel) scan in place with tiffcp's g4 scheme,
# at the lowest scheduling priority.
echo -n "compressing tiff with g4 ... "
for i in *.tiff ; do
  # An unmatched glob stays as the literal string "*.tiff"; skip it.
  [ -e "$i" ] || continue
  short=${i%.tiff}
  echo -n "$short "
  # Write to a scratch file first; the scan is replaced only on success.
  nice -n 19 tiffcp -c g4 "$i" out.tiff && mv out.tiff "$i"
done
# Remove scratch files a failed run may have left behind.
rm -f data.tiff out.tiff
echo

# Assemble all pages into one multi-page TIFF and let tesseract produce
# document.pdf and document.txt from it.
echo "creating PDF with OCR ..."
# Collect the pages in numeric order (1, 2, ..., 10 rather than 1, 10, 2)
# into an array, so file names containing whitespace stay intact. The glob
# is "*.tiff", the same set of files every earlier step processed.
mapfile -t tiff_files < <(printf '%s\n' *.tiff | sort -n)
if [ ! -e "${tiff_files[0]}" ] ; then
  # The glob matched nothing and was passed through literally.
  echo "no .tiff pages found, nothing to OCR" >&2
else
  # Run the OCR only if the multi-page TIFF could be assembled.
  gm convert "${tiff_files[@]}" document.tiff \
    && tesseract document.tiff document -c user_defined_dpi=600 -l deu --psm 1 pdf txt
  # The multi-page TIFF is only an intermediate file.
  rm -f document.tiff
fi

# notify when processing done
# The quotes matter: unquoted, an empty or unset DISPLAY collapses the test
# to "[ -n ]", which is always true, so xmessage would run without a display.
if [ -n "$DISPLAY" ] ; then # we have an X11 display
  xmessage -buttons OK -default OK  -nearmouse "scan2page processing done, awaiting input"
fi

# Ask for the archive base name. An empty answer is asked again, because it
# would rename the results to hidden files (".pdf", ".txt", ".tar").
archive_name=""
while [ -z "$archive_name" ] ; do
  echo -n "Enter basename of archive: "
  # -r keeps backslashes literal. Give up on end-of-input with nothing typed
  # instead of looping forever.
  if ! read -r archive_name && [ -z "$archive_name" ] ; then
    echo
    echo "no archive name given, exiting" >&2
    exit 1
  fi
done
# Rename document.* to <archive_name>.*; "#1" is mmv's reference to the text
# matched by the "*" wildcard.
mmv "document*" "${archive_name}#1"

# keep the original scans around
cp backed_up_original_scans.tar "$archive_name.tar"

