#!/bin/bash
#
# split-kochabo.sh -- post-patch state of the script, reconstructed from the
# flattened diff of commit cb286d9007d07621b7f4a3169a94135ee07aeafc
# ("split-kochabo.sh: add support for obstbox recipes",
# Florian Pritz, 2014-06-21, Signed-off-by: Florian Pritz).
# NOTE(review): the shebang lies outside the diff hunks and is inferred from
# the bash-only syntax used below -- confirm against the full script.
#
# This script can extract recipes from the PDFs mailed to customers of
# kochabo.at.
# Recipes from 2013 need extra processing for the file name in get_name().
# Recipes before 2014-18 (that's week 18) also need less processing
# (no obstbox recipe).
#
# Usage:  split-kochabo.sh FILE.pdf...
# Output: ./extracted/<name>.pdf and ./extracted/<name>.jpg per recipe.
#
# pipefail is intentionally NOT enabled: get_name() relies on grep being
# allowed to find nothing (exit status 1) without aborting the script.

set -e

# Private scratch directory. It is deliberately not called TMPDIR, because
# that variable is consulted by mktemp and may be inherited by child tools.
workdir="$(mktemp -d "/tmp/${0##*/}.XXXXXX")"

# Remove the scratch directory on every exit path.
cleanup() {
  rm -rf -- "$workdir"
}
trap cleanup EXIT
# A TERM handler that merely cleans up would let the script keep running
# without its scratch directory; exit instead so the EXIT trap does the work.
trap 'exit 143' TERM

#######################################
# Write a page range of one or more PDFs to a new PDF using ghostscript.
# Arguments: $1 - first page (1-based, inclusive)
#            $2 - last page (inclusive)
#            $3 - output file
#            $4.. - passed through to gs (the input file(s))
# NOTE(review): the patch elides three lines of this function; the
# "output=$3" / "shift 3" pair is inferred from the use of "$output" and
# "$@" in the gs call -- confirm against the full script.
#######################################
extract_pages() {
  local first=$1
  local last=$2
  local output=$3
  shift 3

  gs -dNOPAUSE -dQUIET -dBATCH \
    -dFirstPage="$first" -dLastPage="$last" \
    -sOutputFile="$output" -sDEVICE=pdfwrite "$@"
}

#######################################
# Print the obstbox recipe name: the third line of the PDF's text.
# Arguments: $1 - PDF file
# Outputs:   the name on stdout (empty if the text has no third line)
#######################################
get_obstbox_name() {
  pdftotext "$1" /dev/stdout | sed -n '3p' | paste -sd " "
}

#######################################
# Print the name of a normal recipe.
# Takes up to ten text lines preceding "Nährwertangaben pro Person", drops
# lines ending in a digit 1-5 as well as the marker line itself, joins the
# remainder with spaces, capitalises every word and lower-cases
# "Mit"/"Und"/"Auf" again.
# Arguments: $1 - PDF file
# Outputs:   the name on stdout (empty if the marker line is not found)
# NOTE(review): the pipe joining paste to the final sed is not visible in
# the flattened patch; it is inferred because that sed would otherwise read
# from the terminal -- confirm against the full script.
#######################################
get_name() {
  # needed for older files with all uppercase names: insert
  #   | awk '{print tolower($0)}'
  # in front of the final sed.
  pdftotext "$1" /dev/stdout \
    | grep -B10 "Nährwertangaben pro Person" \
    | sed '/[1-5]$/d; $d' \
    | paste -sd " " \
    | sed 's/\b\(.\)/\u\1/g; s#\bMit\b#mit#g; s#\bUnd\b#und#g; s#\bAuf\b#auf#g; s#\s\+$##;'
}

#######################################
# Move an extracted PDF into extracted/ under its recipe name and save one
# of its embedded images next to it. Private helper of the extract_*
# functions.
# Globals:   workdir (read)
# Arguments: $1 - extracted temporary PDF
#            $2 - recipe name; when empty the PDF is discarded
#            $3 - pdfimages index of the picture to keep (e.g. 000)
# Returns:   0 on success or when the page range held no recipe
#######################################
_store_recipe() {
  local pdf=$1
  local name=$2
  local image_index=$3

  if [[ -z $name ]]; then
    rm -f -- "$pdf"
    # return, not continue: this function has no loop of its own, and the
    # caller's loop must simply carry on with its next step.
    return 0
  fi

  # A slash in the name would be taken as a directory separator.
  name=${name//\//-}

  mv -- "$pdf" "extracted/$name.pdf"

  pdfimages -j "extracted/$name.pdf" "$workdir/$name"
  mv -- "$workdir/$name-$image_index.jpg" "extracted/$name.jpg"
  rm -f -- "$workdir"/*.jpg "$workdir"/*.ppm
}

#######################################
# Extract the obstbox recipe (page 4) of the current input file.
# Globals:   file (read) - PDF being processed
#            workdir (read)
# Returns:   0, also when page 4 yields no name (no obstbox recipe)
#######################################
extract_obstbox() {
  local tmpfile="$workdir/rezept-obstbox.pdf"
  local name

  extract_pages 4 4 "$tmpfile" "$file"
  name="$(get_obstbox_name "$tmpfile")"

  # The obstbox picture is the third image that pdfimages writes.
  _store_recipe "$tmpfile" "$name" 002
}

#######################################
# Extract the i-th normal recipe of the current input file.
# Globals:   file (read) - PDF being processed
#            skip_pages (read) - number of pages preceding the first recipe
#            workdir (read)
# Arguments: $1 - recipe number, starting at 1
# Returns:   0, also when no recipe name is found in the page range
#######################################
extract_recipe() {
  local i=$1
  local tmpfile="$workdir/rezept-$i.pdf"
  local name

  # extract 2 pages for each $i starting on page 5 (with skip_pages=4)
  local first=$(( skip_pages + (i - 1) * 2 + 1 ))

  extract_pages "$first" "$(( first + 1 ))" "$tmpfile" "$file"
  name="$(get_name "$tmpfile")"

  # The recipe picture is the first image that pdfimages writes.
  _store_recipe "$tmpfile" "$name" 000
}

mkdir -p extracted
skip_pages=4

for file; do
  extract_obstbox

  # extract normal recipes
  for i in 1 2 3 4 5; do
    extract_recipe "$i"
  done
done