diff options
author | Florian Pritz <bluewind@xinu.at> | 2014-02-13 21:32:22 +0100 |
---|---|---|
committer | Florian Pritz <bluewind@xinu.at> | 2014-02-13 21:32:22 +0100 |
commit | f6c6b964fc8750fe46cf1d53d067ff135ec59708 (patch) | |
tree | af1cbad7845d80939dd5983db3ea9c005b647cf0 /split-kochabo.sh | |
parent | 457cbccf334faa04f790d4654af364c243c18e94 (diff) | |
download | bin-f6c6b964fc8750fe46cf1d53d067ff135ec59708.tar.gz bin-f6c6b964fc8750fe46cf1d53d067ff135ec59708.tar.xz |
add split-kochabo.sh
Signed-off-by: Florian Pritz <bluewind@xinu.at>
Diffstat (limited to 'split-kochabo.sh')
-rwxr-xr-x | split-kochabo.sh | 48 |
1 files changed, 48 insertions, 0 deletions
#!/bin/bash
#
# split-kochabo.sh (added as a new file, mode 100755)
#
# This script can extract recipes from the PDFs mailed to customers of
# kochabo.at.
# Recipes from 2013 need extra processing for the file name in get_name().
#
# Usage: split-kochabo.sh FILE.pdf...
#
# For every input PDF the first $skip_pages pages are skipped and the next
# five two-page ranges are each extracted. A range for which get_name()
# yields a name is stored as extracted/<name>.pdf, and the first embedded
# image (if it is a JPEG) as extracted/<name>.jpg. Finally
# generate-kochabo-index.sh is run from inside extracted/.
#

# NOTE: pipefail is deliberately NOT enabled. get_name() relies on grep
# finding nothing (exit status 1) to produce an empty name, which is how a
# page range without a recipe is detected and skipped.
set -eu

# Write pages $1..$2 of the given input file(s) to a new PDF.
#   $1    first page to keep (passed to gs as -dFirstPage)
#   $2    last page to keep (passed to gs as -dLastPage)
#   $3    output PDF path
#   $4..  input file(s) and any further gs arguments
extract_pages() {
  local first=$1
  local last=$2
  local output=$3
  shift 3

  gs -dNOPAUSE -dQUIET -dBATCH \
    -dFirstPage="$first" -dLastPage="$last" \
    -sOutputFile="$output" -sDEVICE=pdfwrite "$@"
}

# Print a recipe name derived from the text of the PDF $1.
# Takes up to 10 lines preceding "Nährwertangaben pro Person", drops lines
# ending in a digit 1-5 (presumably numbering -- TODO confirm) and the last
# line (normally the marker line itself), and joins the rest with spaces.
# When the marker is not found the captured result is empty.
get_name() {
  pdftotext "$1" /dev/stdout \
    | grep -B10 "Nährwertangaben pro Person" \
    | sed '/[1-5]$/d; $d' \
    | paste -sd " "
  # needed for older files with all uppercase names
  #| awk '{print tolower($0)}' | sed 's/\b\(.\)/\u\1/g; s#\bMit\b#mit#g; s#\bUnd\b#und#g; s#\bAuf\b#auf#g; s#\s\+$##;'
}

mkdir -p extracted

# Scratch directory for the intermediate per-recipe PDFs, so nothing is
# written to (or left behind in) the current directory; removed on any exit.
tmpdir=$(mktemp -d)
trap 'rm -rf -- "$tmpdir"' EXIT

skip_pages=4
for file; do
  for i in 1 2 3 4 5; do
    # Recipe i occupies two consecutive pages after the skipped ones.
    first_page=$((skip_pages + (i - 1) * 2 + 1))
    src=$tmpdir/rezept-$i.pdf
    extract_pages "$first_page" "$((first_page + 1))" "$src" "$file"

    name=$(get_name "$src")
    # A "/" in the title would be interpreted as a directory separator.
    name=${name//\//-}

    if [[ -z $name ]]; then
      rm -f -- "$src"
      continue
    fi

    mv -- "$src" "extracted/$name.pdf"

    pdfimages -j "extracted/$name.pdf" "extracted/$name"

    # Keep only the first image of this recipe, and only if it is a JPEG;
    # every other image pdfimages wrote for this recipe is discarded.
    first_img="extracted/$name-000.jpg"
    for img in "extracted/$name"-[0-9][0-9][0-9].{jpg,ppm,pbm}; do
      # Unmatched patterns stay literal, hence the existence check.
      if [[ -e $img && $img != "$first_img" ]]; then
        rm -f -- "$img"
      fi
    done

    if [[ -e $first_img ]]; then
      mv -- "$first_img" "extracted/$name.jpg"
    else
      # Do not abort the whole batch because one recipe lacks a JPEG.
      printf 'warning: no JPEG image found for "%s" (from %s)\n' \
        "$name" "$file" >&2
    fi
  done
done

cd extracted
generate-kochabo-index.sh