#!/bin/bash
#
# Extract the individual recipes from the PDFs mailed to customers of
# kochabo.at and build an HTML overview of the results.
#
# Usage: <this script> FILE.pdf...
#
# For every input PDF the script writes into ./extracted/:
#   <name>.pdf   the page(s) belonging to one recipe
#   <name>.jpg   a picture taken from those pages
# Afterwards it (re)writes ./index.html with one entry per extracted/*.jpg.
# Running it without arguments only regenerates index.html.
#
# Uses gs, pdftotext, pdfimages, and the GNU extensions of sed (\b, \u)
# and stat (-c).
#
# Recipes from 2013 need extra processing for the file name in get_name().
# Recipes before 2014-18 (that's week 18) also need less processing
# (no obstbox recipe).
#

set -euo pipefail

# Private scratch directory. Deliberately NOT called TMPDIR so the standard
# environment variable of that name is not clobbered.
WORK_DIR="$(mktemp -d "${TMPDIR:-/tmp}/${0##*/}.XXXXXX")"
readonly WORK_DIR
readonly OUT_DIR="extracted"
# Number of pages in front of the first regular recipe.
readonly SKIP_PAGES=4

# Remove the scratch directory; installed as the EXIT trap.
cleanup() {
  rm -rf -- "${WORK_DIR:?}"
}
trap cleanup EXIT
# Turn TERM into a real exit (the EXIT trap then cleans up) instead of
# deleting the scratch directory and carrying on.
trap 'exit 143' TERM

# Print a diagnostic to stderr.
warn() {
  printf '%s\n' "$*" >&2
}

#######################################
# Copy a page range of a PDF into a new PDF using Ghostscript.
# Arguments: $1 - first page, $2 - last page, $3 - output file,
#            remaining - input file(s) passed through to gs
#######################################
extract_pages() {
  local first=$1
  local last=$2
  local output=$3
  shift 3

  gs -dNOPAUSE -dQUIET -dBATCH \
    "-dFirstPage=$first" "-dLastPage=$last" \
    "-sOutputFile=$output" -sDEVICE=pdfwrite "$@"
}

# Replace "/" in a recipe name so it can be used as a single file name.
sanitize_name() {
  printf '%s' "${1//\//-}"
}

# Escape &, <, > and " so a string can be embedded in HTML text/attributes.
html_escape() {
  printf '%s' "$1" \
    | sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g' -e 's/"/\&quot;/g'
}

# Print the obstbox recipe name: the third line of the text of PDF $1.
get_obstbox_name() {
  pdftotext "$1" /dev/stdout | sed -n '3p' | paste -sd " "
}

#######################################
# Print the recipe name found in PDF $1.
# Takes up to 10 lines before "Nährwertangaben pro Person", deletes lines
# ending in a digit 1-5 and the final line, joins the rest with spaces, then
# upper-cases the first letter of every word except "mit", "und" and "auf".
# Older files with all-uppercase names additionally need
#   | awk '{print tolower($0)}'
# inserted before the final sed.
# Returns: non-zero (with empty output) when the marker line is missing.
#######################################
get_name() {
  pdftotext "$1" /dev/stdout \
    | grep -B10 "Nährwertangaben pro Person" \
    | sed '/[1-5]$/d; $d' \
    | paste -sd " " \
    | sed 's/\b\(.\)/\u\1/g; s#\bMit\b#mit#g; s#\bUnd\b#und#g; s#\bAuf\b#auf#g; s#\s\+$##;'
}

#######################################
# Extract the obstbox recipe (page 4) of an input PDF together with the
# largest JPEG contained in that page.
# Arguments: $1 - input PDF
# Returns:   0, also when no name could be determined (nothing is written)
#######################################
extract_obstbox() {
  local src=$1
  local tmpfile="$WORK_DIR/rezept-obstbox.pdf"

  extract_pages 4 4 "$tmpfile" "$src"

  local name
  name=$(get_obstbox_name "$tmpfile") || name=""
  name=$(sanitize_name "$name")
  if [[ -z "$name" ]]; then
    rm -f -- "$tmpfile"
    return 0
  fi

  mv -- "$tmpfile" "$OUT_DIR/$name.pdf"
  pdfimages -j "$OUT_DIR/$name.pdf" "$WORK_DIR/$name"

  # Pick the largest extracted JPEG as the recipe picture.
  local biggest_size=0
  local biggest_path=""
  local img size
  for img in "$WORK_DIR/$name"-*.jpg; do
    [[ -e "$img" ]] || continue   # glob did not match anything
    size=$(stat -c%s -- "$img")
    if (( size > biggest_size )); then
      biggest_path=$img
      biggest_size=$size
    fi
  done

  if [[ -n "$biggest_path" ]]; then
    mv -- "$biggest_path" "$OUT_DIR/$name.jpg"
  else
    warn "no JPEG image found for '$name'"
  fi

  rm -f -- "$WORK_DIR"/*.jpg "$WORK_DIR"/*.ppm
}

#######################################
# Extract regular recipe number $1 (two pages each, starting after
# SKIP_PAGES) of an input PDF together with its first JPEG.
# Arguments: $1 - 1-based recipe index, $2 - input PDF
# Returns:   0, also when no name could be determined (recipe is skipped)
#######################################
extract_recipe() {
  local i=$1
  local src=$2
  local tmpfile="$WORK_DIR/rezept-$i.pdf"
  local first=$(( SKIP_PAGES + (i - 1) * 2 + 1 ))

  extract_pages "$first" "$(( first + 1 ))" "$tmpfile" "$src"

  local name
  name=$(get_name "$tmpfile") || name=""
  name=$(sanitize_name "$name")
  if [[ -z "$name" ]]; then
    rm -f -- "$tmpfile"
    return 0
  fi

  mv -- "$tmpfile" "$OUT_DIR/$name.pdf"
  pdfimages -j "$OUT_DIR/$name.pdf" "$WORK_DIR/$name"

  if [[ -e "$WORK_DIR/$name-000.jpg" ]]; then
    mv -- "$WORK_DIR/$name-000.jpg" "$OUT_DIR/$name.jpg"
  else
    warn "no JPEG image found for '$name'"
  fi

  rm -f -- "$WORK_DIR"/*.jpg "$WORK_DIR"/*.ppm
}

#######################################
# Write an HTML overview with one entry per $OUT_DIR/*.jpg.
# NOTE(review): the original markup was lost (tags stripped from the source);
# only the text "Übersicht" in header and footer and the per-entry recipe
# name survived. The tags below are a reconstruction - confirm against the
# intended layout.
# Arguments: $1 - output HTML file
#######################################
write_index() {
  local output=$1
  local img base label

  cat <<'EOF' >"$output"
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Übersicht</title>
</head>
<body>
EOF

  for img in "$OUT_DIR"/*.jpg; do
    [[ -e "$img" ]] || continue   # no pictures extracted yet
    base=${img%.jpg}
    label=$(html_escape "${base##*/}")
    printf '<div>\n<a href="%s.pdf"><img src="%s" alt="%s"></a>\n%s\n</div>\n' \
      "$(html_escape "$base")" "$(html_escape "$img")" "$label" "$label" \
      >>"$output"
  done

  cat <<'EOF' >>"$output"
<p>Übersicht</p>
</body>
</html>
EOF
}

# Process every PDF given on the command line, then rebuild the overview.
main() {
  mkdir -p -- "$OUT_DIR"

  local file i
  for file in "$@"; do
    extract_obstbox "$file"

    # extract normal recipes
    for i in 1 2 3 4 5; do
      extract_recipe "$i" "$file"
    done
  done

  write_index "index.html"
}

main "$@"