#!/bin/bash
#
# split-kochabo.sh
#
# This script can extract recipes from the PDFs mailed to customers of
# kochabo.at.
# Recipes from 2013 need extra processing for the file name in get_name().
#
# Usage: split-kochabo.sh FILE.pdf...
#
# For every input PDF the first $skip_pages pages are skipped and the next
# ten pages are treated as five two-page recipes. Each recipe for which a
# name can be determined is written to extracted/<name>.pdf and its first
# embedded JPEG image to extracted/<name>.jpg. Finally
# generate-kochabo-index.sh is run inside extracted/.
#
# Requires on PATH: gs, pdftotext, pdfimages, generate-kochabo-index.sh
#
# Provenance: commit f6c6b964fc8750fe46cf1d53d067ff135ec59708
# ("add split-kochabo.sh", Florian Pritz, Thu, 13 Feb 2014 21:32:22 +0100).
#

# Note: "pipefail" is intentionally NOT enabled. get_name() relies on the
# pipeline succeeding (with empty output) when grep finds no match, so that
# the caller can skip the page pair instead of aborting the whole run.
set -e

# Write a page range of one or more PDFs to a new PDF using Ghostscript.
# Arguments:
#   $1    - first page to extract (1-based)
#   $2    - last page to extract (inclusive)
#   $3    - path of the PDF to create
#   $4... - input file(s) / extra arguments passed through to gs
extract_pages() {
  local first=$1
  local last=$2
  local output=$3
  shift 3

  gs -dNOPAUSE -dQUIET -dBATCH \
    "-dFirstPage=$first" "-dLastPage=$last" \
    "-sOutputFile=$output" -sDEVICE=pdfwrite "$@"
}

# Print the recipe name found in a PDF on a single line.
# The name is built from the (up to ten) text lines preceding the line
# "Nährwertangaben pro Person": lines ending in a digit 1-5 and the final
# line of grep's output (the matched line itself) are dropped, the rest is
# joined with spaces. Prints an empty line if the marker is not found.
# Arguments:
#   $1 - path of the PDF to inspect
get_name() {
  pdftotext "$1" /dev/stdout \
    | grep -B10 "Nährwertangaben pro Person" \
    | sed '/[1-5]$/d; $d' \
    | paste -sd " "
  # needed for older files with all uppercase names
  #| awk '{print tolower($0)}' sed 's/\b\(.\)/\u\1/g; s#\bMit\b#mit#g; s#\bUnd\b#und#g; s#\bAuf\b#auf#g; s#\s\+$##;'
}

mkdir -p extracted

# Number of leading pages in each PDF that precede the first recipe.
skip_pages=4
for file; do
  for i in 1 2 3 4 5; do
    # Recipe i occupies two consecutive pages after the skipped ones.
    first_page=$((skip_pages + (i - 1) * 2 + 1))
    src="rezept-$i.pdf"

    extract_pages "$first_page" "$((first_page + 1))" "$src" "$file"
    name="$(get_name "$src")"

    # No recognizable recipe on this page pair: discard it.
    if [[ -z $name ]]; then
      rm -f -- "$src"
      continue
    fi

    # A "/" in the title would be interpreted as a directory separator and
    # make the mv below fail, so replace it.
    name=${name//\//-}

    mv "$src" "extracted/$name.pdf"

    pdfimages -j "extracted/$name.pdf" "extracted/$name"
    # Only the first image is kept as the recipe picture.
    rm -f extracted/*-{001,002,003,004}.{jpg,ppm}
    # pdfimages -j only emits .jpg for images stored as JPEG in the PDF;
    # do not abort the remaining recipes if the first image is something else.
    if [[ -e "extracted/${name}-000.jpg" ]]; then
      mv "extracted/${name}-000.jpg" "extracted/$name.jpg"
    else
      printf 'warning: no JPEG cover image found for "%s"\n' "$name" >&2
    fi
  done
done

cd extracted
generate-kochabo-index.sh