diff options
author | Florian Pritz <bluewind@xinu.at> | 2014-06-21 12:59:41 +0200 |
---|---|---|
committer | Florian Pritz <bluewind@xinu.at> | 2014-06-21 12:59:41 +0200 |
commit | cb286d9007d07621b7f4a3169a94135ee07aeafc (patch) | |
tree | 6523b422bae578ce0f28b808bd775ad4b927939f /split-kochabo.sh | |
parent | 726f26c346deb99e21e2c2df597cdb6a928686dc (diff) | |
download | bin-cb286d9007d07621b7f4a3169a94135ee07aeafc.tar.gz bin-cb286d9007d07621b7f4a3169a94135ee07aeafc.tar.xz |
split-kochabo.sh: add support for obstbox recipes
Signed-off-by: Florian Pritz <bluewind@xinu.at>
Diffstat (limited to 'split-kochabo.sh')
-rwxr-xr-x | split-kochabo.sh | 64 |
1 files changed, 50 insertions, 14 deletions
diff --git a/split-kochabo.sh b/split-kochabo.sh index 2878c4d..94081ec 100755 --- a/split-kochabo.sh +++ b/split-kochabo.sh @@ -3,10 +3,14 @@ # This script can extract recipes from the PDFs mailed to customers of # kochabo.at. # Recipes from 2013 need extra processing for the file name in get_name(). +# Recipes before 2014-18 (that's week 18) also need less processing (no obstbox recipe) # set -e +TMPDIR="`mktemp -d "/tmp/${0##*/}.XXXXXX"`" +trap "rm -rf '${TMPDIR}'" EXIT TERM + extract_pages() { local first=$1 local last=$2 @@ -16,31 +20,63 @@ extract_pages() { gs -dNOPAUSE -dQUIET -dBATCH -dFirstPage=$first -dLastPage=$last -sOutputFile="$output" -sDEVICE=pdfwrite "$@" } +get_obstbox_name() { + pdftotext "$1" /dev/stdout | sed -n '3p' | paste -sd " " +} + get_name() { pdftotext "$1" /dev/stdout | grep -B10 "Nährwertangaben pro Person" | sed '/[1-5]$/d; $d' | paste -sd " " # needed for older files with all uppercase names #| awk '{print tolower($0)}' sed 's/\b\(.\)/\u\1/g; s#\bMit\b#mit#g; s#\bUnd\b#und#g; s#\bAuf\b#auf#g; s#\s\+$##;' } +extract_obstbox() { + tmpfile="$TMPDIR/rezept-obstbox.pdf" + + extract_pages 4 4 "$tmpfile" "$file" + name="$(get_obstbox_name "$tmpfile")" + + if [[ -z $name ]]; then + rm "$tmpfile" + continue + fi + + mv "$tmpfile" "extracted/$name.pdf" + + pdfimages -j "extracted/$name.pdf" "$TMPDIR/$name" + mv "$TMPDIR/$name-002.jpg" "extracted/$name.jpg" + rm -f $TMPDIR/*.{jpg,ppm} +} + +extract_recipe() { + local i=$1 + local tmpfile="$TMPDIR/rezept-$i.pdf" + + # extract 2 pages for each $i starting on page 5 + extract_pages $((skip_pages+((i-1)*2)+1)) $((skip_pages+((i-1)*2)+2)) "$tmpfile" "$file" + name="$(get_name "$tmpfile")" + + if [[ -z $name ]]; then + rm "$tmpfile" + continue + fi + + mv "$tmpfile" "extracted/$name.pdf" + + pdfimages -j "extracted/$name.pdf" "$TMPDIR/$name" + mv "$TMPDIR/$name-000.jpg" "extracted/$name.jpg" + rm -f $TMPDIR/*.{jpg,ppm} +} + mkdir -p extracted skip_pages=4 for file; do - for i in 1 2 3 4 5; do - extract_pages $((skip_pages+((i-1)*2)+1)) $((skip_pages+((i-1)*2)+2)) rezept-$i.pdf "$file" - src=rezept-$i.pdf - name="$(get_name $src)" - - if [[ -z $name ]]; then - rm "rezept-$i.pdf" - continue - fi + extract_obstbox - mv "$src" "extracted/$name.pdf" - - pdfimages -j "extracted/$name.pdf" "extracted/$name" - rm -f extracted/*-{001,002,003,004}.{jpg,ppm} - mv "extracted/$name-000.jpg" "extracted/$name.jpg" + # extract normal recipes + for i in 1 2 3 4 5; do + extract_recipe $i done done |