Untitled
Posted by Anonymous on Mon 4th Jun 2018 01:33
raw | new post
view followups (newest first): Untitled by Anonymous

# OCR a scanned PDF into a searchable PDF (out.pdf) and plain text (out.txt).
# Uses pdfseparate/pdfunite/pdftotext, ImageMagick convert and tesseract
# (with the eng and ita language data).

# Split the source document into one file per page: page0001.pdf, page0002.pdf, ...
pdfseparate ../source.pdf page%04d.pdf

# Rasterise every page to TIFF at 200 dpi. Iterating over a glob (instead of
# generating command text from `ls`) keeps odd filenames intact and actually
# runs convert rather than merely printing the command lines.
for page in page*.pdf; do
  [ -e "$page" ] || continue            # glob matched nothing
  case "$page" in
    *.txt.pdf) continue ;;              # OCR output left over from an earlier run
  esac
  convert -quality 100 -density 200 "$page" "$page.tif"
done

# OCR each TIFF; the trailing "pdf" config makes tesseract write
# <tif>.txt.pdf, a PDF carrying the recognised text layer.
for tif in page*.tif; do
  [ -e "$tif" ] || continue
  tesseract -l eng+ita "$tif" "$tif.txt" pdf
done

# Merge the per-page results (zero-padded names keep page order) and
# extract the recognised text.
pdfunite page*.txt.pdf out.pdf
pdftotext out.pdf out.txt
# word_frequency: read text on stdin and print "count word" lines on stdout,
# most frequent word first.
#
# Apostrophes, periods and commas are treated as separators BEFORE the text is
# split into words, so "foo." and "foo" are counted as the same word. Any run
# of whitespace (spaces, tabs, newlines, form feeds between pages) separates
# words, and empty tokens are dropped rather than counted.
word_frequency() {
  tr "'.," '   ' \
    | tr -s '[:space:]' '[\n*]' \
    | tr '[:upper:]' '[:lower:]' \
    | sed '/^$/d' \
    | sort \
    | uniq -c \
    | sort -rn
}

word_frequency < out.txt > words_frequency.txt


# Render one page of a PDF to JPEG at 50 dpi. The bracket suffix selects pages
# by zero-based index: [0] is the first page, [0,10] selects pages 0 and 10.
# The argument is quoted so the shell does not treat [0] as a glob pattern.
convert -density 50 'file.pdf[0]' page.jpg