summary | refs | log | tree | commit | diff
path: root/pdf-to-text.sh
diff options
context:
space:
mode:
Diffstat (limited to 'pdf-to-text.sh')
-rwxr-xr-x pdf-to-text.sh 14
1 files changed, 14 insertions, 0 deletions
diff --git a/pdf-to-text.sh b/pdf-to-text.sh
new file mode 100755
index 0000000..e773e8b
--- /dev/null
+++ b/pdf-to-text.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+STARTPAGE=1 # set to pagenumber of the first page of PDF you wish to convert
+ENDPAGE=12 # set to pagenumber of the last page of PDF you wish to convert
+SOURCE="$1" # set to the file name of the PDF
+OUTPUT=book.txt # set to the final output file
+RESOLUTION=600 # set to the resolution the scanner used (the higher, the better)
+
+touch $OUTPUT
+for i in `seq $STARTPAGE $ENDPAGE`; do
+ convert -monochrome -density $RESOLUTION $SOURCE\[$(($i - 1 ))\] page.tif
+ echo processing page $i
+ tesseract page.tif tempoutput
+ cat tempoutput.txt >> $OUTPUT
+done