summaryrefslogtreecommitdiff
path: root/pdf-to-text.sh
blob: e773e8b55a4fc91e58de8053662024b66f75814e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
#!/bin/sh
# pdf-to-text.sh - OCR a range of pages from a scanned PDF into a text file.
#
# Usage: pdf-to-text.sh FILE.pdf
#
# Every page from STARTPAGE to ENDPAGE (inclusive) is rendered to a monochrome
# TIFF with ImageMagick's `convert`, recognized with `tesseract`, and the
# resulting text is appended to OUTPUT. Edit the settings below to change the
# page range, the output file or the rendering resolution.
#
# Exit status: 0 on success, 2 on a usage error, 1 if the input is unreadable
# or if any page fails to convert or to be recognized.

STARTPAGE=1 # set to pagenumber of the first page of PDF you wish to convert
ENDPAGE=12 # set to pagenumber of the last page of PDF you wish to convert
SOURCE="${1:-}" # set to the file name of the PDF
OUTPUT=book.txt # set to the final output file
RESOLUTION=600 # set to the resolution the scanner used (the higher, the better)

if [ -z "$SOURCE" ]; then
    printf 'Usage: %s FILE.pdf\n' "${0##*/}" >&2
    exit 2
fi
if [ ! -r "$SOURCE" ]; then
    printf '%s: cannot read input file: %s\n' "${0##*/}" "$SOURCE" >&2
    exit 1
fi

# Keep the per-page scratch files in a private directory so that nothing in
# the current directory is clobbered and everything is removed on exit.
WORKDIR=$(mktemp -d) || exit 1
trap 'rm -rf -- "$WORKDIR"' EXIT
# Turn signals into a normal exit so the EXIT trap also runs in shells that
# do not fire it on a fatal signal.
trap 'exit 1' HUP INT TERM

# Text is appended, so an existing OUTPUT keeps its previous content.
touch "$OUTPUT" || exit 1

i=$STARTPAGE
while [ "$i" -le "$ENDPAGE" ]; do
    # The [N] selector is zero-based while the configured page numbers are
    # one-based, hence the "- 1". Quoting keeps paths with spaces intact.
    if ! convert -monochrome -density "$RESOLUTION" \
        "${SOURCE}[$((i - 1))]" "$WORKDIR/page.tif"; then
        printf '%s: failed to render page %s of %s\n' \
            "${0##*/}" "$i" "$SOURCE" >&2
        exit 1
    fi
    echo "processing page $i"
    # tesseract appends ".txt" to the given output base name.
    if ! tesseract "$WORKDIR/page.tif" "$WORKDIR/tempoutput"; then
        printf '%s: OCR failed on page %s\n' "${0##*/}" "$i" >&2
        exit 1
    fi
    cat "$WORKDIR/tempoutput.txt" >> "$OUTPUT" || exit 1
    i=$((i + 1))
done