summaryrefslogtreecommitdiffstats
path: root/split-kochabo.sh
diff options
context:
space:
mode:
author Florian Pritz <bluewind@xinu.at> 2014-06-21 12:59:41 +0200
committer Florian Pritz <bluewind@xinu.at> 2014-06-21 12:59:41 +0200
commit cb286d9007d07621b7f4a3169a94135ee07aeafc (patch)
tree 6523b422bae578ce0f28b808bd775ad4b927939f /split-kochabo.sh
parent 726f26c346deb99e21e2c2df597cdb6a928686dc (diff)
download bin-cb286d9007d07621b7f4a3169a94135ee07aeafc.tar.gz
bin-cb286d9007d07621b7f4a3169a94135ee07aeafc.tar.xz
split-kochabo.sh: add support for obstbox recipes
Signed-off-by: Florian Pritz <bluewind@xinu.at>
Diffstat (limited to 'split-kochabo.sh')
-rwxr-xr-x split-kochabo.sh 64
1 files changed, 50 insertions, 14 deletions
diff --git a/split-kochabo.sh b/split-kochabo.sh
index 2878c4d..94081ec 100755
--- a/split-kochabo.sh
+++ b/split-kochabo.sh
@@ -3,10 +3,14 @@
# This script can extract recipes from the PDFs mailed to customers of
# kochabo.at.
# Recipes from 2013 need extra processing for the file name in get_name().
+# Recipes before 2014-18 (i.e. week 18 of 2014) need less processing because
+# they contain no obstbox recipe.
#
set -e
+# Scratch directory for intermediate PDFs and extracted images; created anew
+# for every run and named after the script.
+TMPDIR="$(mktemp -d "/tmp/${0##*/}.XXXXXX")"
+
+# Remove the scratch directory whenever the script exits. The path is read
+# when the handler runs instead of being pasted into the trap string, so a
+# quote character in the script name cannot break the command.
+cleanup_tmpdir() {
+  rm -rf -- "${TMPDIR:?}"
+}
+trap cleanup_tmpdir EXIT
+# A TERM handler that does not exit lets the script resume after the
+# directory is gone; exit instead so the EXIT trap does the cleanup once.
+trap 'exit 143' TERM
+
extract_pages() {
local first=$1
local last=$2
@@ -16,31 +20,63 @@ extract_pages() {
gs -dNOPAUSE -dQUIET -dBATCH -dFirstPage=$first -dLastPage=$last -sOutputFile="$output" -sDEVICE=pdfwrite "$@"
}
+# Print the third line of the text that pdftotext extracts from the PDF
+# given as $1. extract_obstbox uses this line as the output file name.
+get_obstbox_name() {
+  local pdf=$1
+
+  pdftotext "$pdf" /dev/stdout \
+    | sed -n '3p' \
+    | paste -sd " "
+}
+
# Print, on a single line, the text that extract_recipe uses as the file name
# for the PDF given as $1.
#
# From the pdftotext output it keeps the "Nährwertangaben pro Person" line
# together with up to 10 lines before it, deletes every line ending in a digit
# 1-5 as well as the last line of that selection (presumably the matched
# heading itself when there is a single match), and joins the remaining lines
# with spaces.
get_name() {
pdftotext "$1" /dev/stdout | grep -B10 "Nährwertangaben pro Person" | sed '/[1-5]$/d; $d' | paste -sd " "
# needed for older files with all uppercase names
#| awk '{print tolower($0)}' sed 's/\b\(.\)/\u\1/g; s#\bMit\b#mit#g; s#\bUnd\b#und#g; s#\bAuf\b#auf#g; s#\s\+$##;'
}
+# Extract the obstbox recipe of "$file" into extracted/.
+#
+# Globals:
+#   file   (read) - input PDF, set by the calling "for file" loop
+#   TMPDIR (read) - scratch directory for intermediate files
+# Outputs:
+#   extracted/<name>.pdf - page 4 of "$file"
+#   extracted/<name>.jpg - the image pdfimages numbers 002 on that page
+#   where <name> is whatever get_obstbox_name prints for that page.
+# When no name is found, only the scratch file is removed.
+extract_obstbox() {
+  local tmpfile="$TMPDIR/rezept-obstbox.pdf"
+  local name
+
+  extract_pages 4 4 "$tmpfile" "$file"
+  name="$(get_obstbox_name "$tmpfile")"
+
+  if [[ -z $name ]]; then
+    rm "$tmpfile"
+    # return, not continue: bash >= 4.4 ignores continue inside a function
+    # and would fall through to the mv below, while older versions would
+    # skip all normal recipes of this input file as well.
+    return 0
+  fi
+
+  mv "$tmpfile" "extracted/$name.pdf"
+
+  pdfimages -j "extracted/$name.pdf" "$TMPDIR/$name"
+  mv "$TMPDIR/$name-002.jpg" "extracted/$name.jpg"
+  # Drop the remaining images; the directory is quoted so that only the
+  # glob part is subject to expansion.
+  rm -f "$TMPDIR"/*.{jpg,ppm}
+}
+
+# Extract normal recipe number $1 of "$file" into extracted/.
+#
+# Arguments:
+#   $1 - 1-based recipe index
+# Globals:
+#   file       (read) - input PDF, set by the calling "for file" loop
+#   skip_pages (read) - number of pages preceding the first normal recipe
+#   TMPDIR     (read) - scratch directory for intermediate files
+# Outputs:
+#   extracted/<name>.pdf - the two pages of the recipe
+#   extracted/<name>.jpg - the image pdfimages numbers 000 in those pages
+#   where <name> is whatever get_name prints for those pages.
+# When no name is found, only the scratch file is removed.
+extract_recipe() {
+  local i=$1
+  local tmpfile="$TMPDIR/rezept-$i.pdf"
+  local name
+  # Every recipe spans 2 pages; recipe 1 starts right after the skipped pages.
+  local first=$((skip_pages + (i - 1) * 2 + 1))
+
+  extract_pages "$first" "$((first + 1))" "$tmpfile" "$file"
+  name="$(get_name "$tmpfile")"
+
+  if [[ -z $name ]]; then
+    rm "$tmpfile"
+    # return, not continue: bash >= 4.4 ignores continue inside a function
+    # and would fall through to the mv below with an empty name.
+    return 0
+  fi
+
+  mv "$tmpfile" "extracted/$name.pdf"
+
+  pdfimages -j "extracted/$name.pdf" "$TMPDIR/$name"
+  mv "$TMPDIR/$name-000.jpg" "extracted/$name.jpg"
+  # Drop the remaining images; the directory is quoted so that only the
+  # glob part is subject to expansion.
+  rm -f "$TMPDIR"/*.{jpg,ppm}
+}
+
mkdir -p extracted
skip_pages=4
for file; do
- for i in 1 2 3 4 5; do
- extract_pages $((skip_pages+((i-1)*2)+1)) $((skip_pages+((i-1)*2)+2)) rezept-$i.pdf "$file"
- src=rezept-$i.pdf
- name="$(get_name $src)"
-
- if [[ -z $name ]]; then
- rm "rezept-$i.pdf"
- continue
- fi
+ extract_obstbox
- mv "$src" "extracted/$name.pdf"
-
- pdfimages -j "extracted/$name.pdf" "extracted/$name"
- rm -f extracted/*-{001,002,003,004}.{jpg,ppm}
- mv "extracted/$name-000.jpg" "extracted/$name.jpg"
+ # extract normal recipes
+ for i in 1 2 3 4 5; do
+ extract_recipe $i
done
done