summary refs log tree commit diff stats
path: root/split-kochabo.sh
diff options
context:
space:
mode:
author Florian Pritz <bluewind@xinu.at> 2014-02-13 21:32:22 +0100
committer Florian Pritz <bluewind@xinu.at> 2014-02-13 21:32:22 +0100
commit f6c6b964fc8750fe46cf1d53d067ff135ec59708 (patch)
tree af1cbad7845d80939dd5983db3ea9c005b647cf0 /split-kochabo.sh
parent 457cbccf334faa04f790d4654af364c243c18e94 (diff)
download bin-f6c6b964fc8750fe46cf1d53d067ff135ec59708.tar.gz
bin-f6c6b964fc8750fe46cf1d53d067ff135ec59708.tar.xz
add split-kochabo.sh
Signed-off-by: Florian Pritz <bluewind@xinu.at>
Diffstat (limited to 'split-kochabo.sh')
-rwxr-xr-x split-kochabo.sh 48
1 files changed, 48 insertions, 0 deletions
diff --git a/split-kochabo.sh b/split-kochabo.sh
new file mode 100755
index 0000000..8f30f32
--- /dev/null
+++ b/split-kochabo.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+#
+# This script can extract recipes from the PDFs mailed to customers of
+# kochabo.at.
+# Recipes from 2013 need extra processing for the file name in get_name().
+#
+
+set -e
+
+extract_pages() {
+ local first=$1
+ local last=$2
+ local output=$3
+ shift 3
+
+ gs -dNOPAUSE -dQUIET -dBATCH -dFirstPage=$first -dLastPage=$last -sOutputFile="$output" -sDEVICE=pdfwrite "$@"
+}
+
+# Print the recipe title contained in the PDF given as $1 on a single line.
+#
+# The text of the PDF is searched for the line "Nährwertangaben pro Person";
+# that line and up to ten lines before it are kept. From those, every line
+# ending in a digit 1-5 and the final line are removed, and the remaining
+# lines are joined with single spaces.
+get_name() {
+ local pdf=$1
+
+ pdftotext "$pdf" /dev/stdout \
+  | grep -B10 "Nährwertangaben pro Person" \
+  | sed '/[1-5]$/d; $d' \
+  | paste -sd " "
+ # Older files have titles in all uppercase; append this stage for them:
+ #| awk '{print tolower($0)}' | sed 's/\b\(.\)/\u\1/g; s#\bMit\b#mit#g; s#\bUnd\b#und#g; s#\bAuf\b#auf#g; s#\s\+$##;'
+}
+
+mkdir -p extracted
+
+skip_pages=4
+for file; do
+ for i in 1 2 3 4 5; do
+ extract_pages $((skip_pages+((i-1)*2)+1)) $((skip_pages+((i-1)*2)+2)) rezept-$i.pdf "$file"
+ src=rezept-$i.pdf
+ name="$(get_name $src)"
+
+ if [[ -z $name ]]; then
+ rm "rezept-$i.pdf"
+ continue
+ fi
+
+ mv "$src" "extracted/$name.pdf"
+
+ pdfimages -j "extracted/$name.pdf" "extracted/$name"
+ rm -f extracted/*-{001,002,003,004}.{jpg,ppm}
+ mv "extracted/$name-000.jpg" "extracted/$name.jpg"
+ done
+done
+
+cd extracted
+generate-kochabo-index.sh