diff options
Diffstat (limited to 'pdf-to-text.sh')
-rwxr-xr-x | pdf-to-text.sh | 14 |
1 files changed, 14 insertions, 0 deletions
#!/bin/sh
#
# pdf-to-text.sh - OCR a range of pages of a scanned PDF into one text file.
#
# Usage: pdf-to-text.sh FILE.pdf
#
# Every page in STARTPAGE..ENDPAGE is rasterized with ImageMagick `convert`,
# run through `tesseract`, and the recognized text is appended to OUTPUT.
#
# The settings below can be edited here or overridden from the environment,
# e.g.  ENDPAGE=40 OUTPUT=scan.txt pdf-to-text.sh scan.pdf
#
# Exit status: 0 if every page was converted, 1 if at least one page failed,
# 2 on a usage or setup error.

set -u

STARTPAGE=${STARTPAGE:-1}      # number of the first PDF page to convert
ENDPAGE=${ENDPAGE:-12}         # number of the last PDF page to convert
OUTPUT=${OUTPUT:-book.txt}     # file the recognized text is appended to
RESOLUTION=${RESOLUTION:-600}  # resolution the scanner used (the higher, the better)

# Print an error message to stderr and abort with status 2.
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 2
}

# Abort unless $2 (the value of the setting named $1) is a positive decimal
# integer. Leading zeros are rejected because $(( )) would read them as octal.
require_positive_int() {
  case $2 in
    '' | 0* | *[!0-9]*) die "$1 must be a positive integer, got '$2'" ;;
  esac
}

# OCR one page ($1, 1-based) of $SOURCE and append its text to $OUTPUT.
# Returns non-zero, appending nothing, if any step fails.
ocr_page() {
  # The [N] page selector passed to convert is 0-based.
  page_index=$(($1 - 1))
  page_image=$workdir/page.tif
  # tesseract adds ".txt" to the output base name it is given.
  text_base=$workdir/page

  # Drop leftovers from the previous page so that a failed run can never
  # append stale text a second time.
  rm -f -- "$page_image" "$text_base.txt"

  convert -monochrome -density "$RESOLUTION" \
    "${SOURCE}[${page_index}]" "$page_image" || return 1
  tesseract "$page_image" "$text_base" || return 1
  cat -- "$text_base.txt" >>"$OUTPUT"
}

if [ $# -lt 1 ] || [ -z "$1" ]; then
  printf 'Usage: %s FILE.pdf\n' "${0##*/}" >&2
  exit 2
fi
SOURCE=$1  # file name of the PDF

[ -r "$SOURCE" ] || die "cannot read '$SOURCE'"
require_positive_int STARTPAGE "$STARTPAGE"
require_positive_int ENDPAGE "$ENDPAGE"
require_positive_int RESOLUTION "$RESOLUTION"

for tool in convert tesseract; do
  command -v "$tool" >/dev/null 2>&1 ||
    die "required tool '$tool' not found in PATH"
done

# Keep intermediate files in a private scratch directory instead of
# overwriting page.tif / tempoutput.txt in the current directory.
workdir=$(mktemp -d "${TMPDIR:-/tmp}/pdf-to-text.XXXXXX") ||
  die "cannot create temporary directory"
trap 'rm -rf -- "$workdir"' EXIT
# Turn fatal signals into a normal exit so the EXIT trap also runs for them.
trap 'exit 129' HUP
trap 'exit 130' INT
trap 'exit 143' TERM

# Create the output file up front, even if no page ends up being converted.
touch -- "$OUTPUT" || die "cannot write to '$OUTPUT'"

status=0
page=$STARTPAGE
while [ "$page" -le "$ENDPAGE" ]; do
  printf 'processing page %s\n' "$page"
  if ! ocr_page "$page"; then
    # Keep going with the remaining pages, but report the failure.
    printf '%s: page %s failed, skipping\n' "${0##*/}" "$page" >&2
    status=1
  fi
  page=$((page + 1))
done

exit "$status"