#!/bin/bash
#
# This script can extract recipes from the PDFs mailed to customers of
# kochabo.at.
# Recipes from 2013 need extra processing for the file name in get_name().
#
# Usage: pass one or more PDF files as arguments. For every input file the
# script cuts out five two-page recipes (after skipping the first
# $skip_pages pages), stores each one as "extracted/<name>.pdf" together
# with its first embedded image as "extracted/<name>.jpg", and finally
# writes an HTML overview of everything in "extracted/" to $output.
#
# Requires: gs (Ghostscript), pdftotext and pdfimages, and a sed that
# supports \b and \u (GNU sed).
#
# NOTE: "pipefail" is deliberately not enabled. get_name() relies on grep
# finding nothing (exit status 1) to signal "no recipe on these pages" via
# an empty name instead of aborting the whole run.
set -e

# Number of leading pages of each input PDF that contain no recipe.
skip_pages=4

# File the HTML overview is written to.
output="index.html"

#######################################
# Copy a page range of one or more PDFs into a new PDF using Ghostscript.
# Arguments:
#   $1 - first page to copy
#   $2 - last page to copy
#   $3 - path of the PDF to create
#   $4... - input file(s) and/or further gs options, passed through as-is
# Returns: exit status of gs
#######################################
extract_pages() {
  local first=$1
  local last=$2
  local output=$3
  shift 3
  gs -dNOPAUSE -dQUIET -dBATCH \
    -dFirstPage="$first" -dLastPage="$last" \
    -sOutputFile="$output" -sDEVICE=pdfwrite "$@"
}

#######################################
# Derive a recipe title from the text of a PDF.
# Takes up to ten lines preceding the "Nährwertangaben pro Person" line,
# drops lines ending in a digit 1-5 as well as the marker line itself,
# joins the rest with spaces and title-cases the result (keeping
# "mit"/"und"/"auf" lower case).
# Arguments: $1 - path of the PDF
# Outputs:   the title on stdout; empty when the marker line is not found
#######################################
get_name() {
  # Needed for older files with all uppercase names: insert
  #   | awk '{print tolower($0)}'
  # in front of the final sed stage.
  pdftotext "$1" /dev/stdout \
    | grep -B10 "Nährwertangaben pro Person" \
    | sed '/[1-5]$/d; $d' \
    | paste -sd " " \
    | sed 's/\b\(.\)/\u\1/g; s#\bMit\b#mit#g; s#\bUnd\b#und#g; s#\bAuf\b#auf#g; s#\s\+$##;'
}

#######################################
# Escape the characters that are special in HTML text and attribute values.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout
#######################################
html_escape() {
  printf '%s\n' "$1" \
    | sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g' -e 's/"/\&quot;/g'
}

#######################################
# Write the HTML overview of all extracted recipes.
# Globals:   output (read)
# Outputs:   (re)creates the file named by $output
#
# NOTE(review): the HTML tags of the original here-documents were not
# recoverable; only the texts "Übersicht" (header and footer) and the
# per-recipe name survived. The markup below is a minimal reconstruction
# and should be checked against the intended page layout.
#######################################
write_index() {
  local img stem label
  {
    cat <<'EOF'
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Übersicht</title>
</head>
<body id="top">
EOF
    for img in extracted/*.jpg; do
      # Without any match the glob stays literal; do not list it.
      [[ -e $img ]] || continue
      stem=${img%.jpg}
      label=$(html_escape "${stem##*/}")
      stem=$(html_escape "$stem")
      printf '<div>\n<a href="%s.pdf"><img src="%s.jpg" alt="%s">\n%s\n</a>\n</div>\n' \
        "$stem" "$stem" "$label" "$label"
    done
    cat <<'EOF'
<p><a href="#top">Übersicht</a></p>
</body>
</html>
EOF
  } > "$output"
}

#######################################
# Extract the recipes of every PDF given as argument, then build the index.
# Globals:   skip_pages (read)
# Arguments: input PDF files
#######################################
main() {
  local file i first src name

  mkdir -p extracted

  for file; do
    for i in 1 2 3 4 5; do
      # Recipe i occupies two consecutive pages after the skipped ones.
      first=$(( skip_pages + (i - 1) * 2 + 1 ))
      src=rezept-$i.pdf
      extract_pages "$first" "$(( first + 1 ))" "$src" "$file"

      name=$(get_name "$src")
      # A slash in the title would be taken as a directory separator.
      name=${name//\//-}
      if [[ -z $name ]]; then
        # No recipe title found on these pages: discard the extract.
        rm "$src"
        continue
      fi

      mv "$src" "extracted/$name.pdf"
      pdfimages -j "extracted/$name.pdf" "extracted/$name"
      # Keep only the first image of this recipe; leave other recipes alone.
      rm -f "extracted/$name"-{001,002,003,004}.{jpg,ppm}
      mv "extracted/$name-000.jpg" "extracted/$name.jpg"
    done
  done

  write_index
}

main "$@"