blob: 8f30f327344328910c57aaa83f86ec8b39ef2d2b (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
|
#!/bin/bash
#
# This script can extract recipes from the PDFs mailed to customers of
# kochabo.at.
# Recipes from 2013 need extra processing for the file name in get_name().
#
# Abort the script as soon as a command fails unhandled.
# pipefail is not enabled here: get_name's pipeline contains a grep that
# exits non-zero when a page has no recipe, and the main loop handles that
# case by checking for an empty name.
set -o errexit
#######################################
# Copy an inclusive page range out of a PDF into a new PDF via Ghostscript.
# Arguments:
#   $1    - first page of the range
#   $2    - last page of the range
#   $3    - path of the PDF file to write
#   $4... - input file(s) / extra arguments, handed to gs unchanged
# Returns:
#   exit status of gs
#######################################
extract_pages() {
  local first=$1
  local last=$2
  local output=$3
  shift 3
  # Every expansion is quoted so each option reaches gs as exactly one
  # argument, regardless of whitespace or glob characters in the values.
  gs -dNOPAUSE -dQUIET -dBATCH \
    "-dFirstPage=$first" "-dLastPage=$last" \
    "-sOutputFile=$output" -sDEVICE=pdfwrite "$@"
}
#######################################
# Derive a recipe title from the text content of a PDF.
# Takes the "Nährwertangaben pro Person" line plus up to ten lines before
# it, drops every line ending in a digit 1-5 as well as the last line of
# that excerpt (the marker line), and joins what remains with spaces.
# Arguments:
#   $1 - path of the PDF to read
# Outputs:
#   the title on stdout; captured via $(...) it is empty when the marker
#   line does not occur in the PDF text
#######################################
get_name() {
  local pdf=$1
  # "-" makes pdftotext write to stdout; the trailing "-" gives paste the
  # explicit stdin operand that POSIX (and BSD paste) requires.
  pdftotext "$pdf" - \
    | grep -B10 "Nährwertangaben pro Person" \
    | sed '/[1-5]$/d; $d' \
    | paste -sd " " -
  # Needed for older files with all-uppercase names; append to the pipeline:
  #   | awk '{print tolower($0)}' \
  #   | sed 's/\b\(.\)/\u\1/g; s#\bMit\b#mit#g; s#\bUnd\b#und#g; s#\bAuf\b#auf#g; s#\s\+$##;'
}
# Main: for every PDF given on the command line, try to extract five
# two-page recipes that follow the first $skip_pages pages, store each as
# extracted/<title>.pdf with its first image as extracted/<title>.jpg, then
# build the index inside extracted/.
mkdir -p extracted
skip_pages=4
for file; do
  for i in 1 2 3 4 5; do
    src="rezept-$i.pdf"
    # Recipe i occupies two consecutive pages after the skipped ones.
    first_page=$((skip_pages + 2 * (i - 1) + 1))
    extract_pages "$first_page" "$((first_page + 1))" "$src" "$file"
    name="$(get_name "$src")"
    if [[ -z $name ]]; then
      # No title found on these pages: discard the temporary PDF.
      rm "$src"
      continue
    fi
    # A "/" in the title would be taken as a directory separator by mv.
    name=${name//\//-}
    mv "$src" "extracted/$name.pdf"
    pdfimages -j "extracted/$name.pdf" "extracted/$name"
    # Only the first image is kept; remove the other numbered images.
    rm -f extracted/*-{001,002,003,004}.{jpg,ppm}
    if [[ -f "extracted/$name-000.jpg" ]]; then
      mv "extracted/$name-000.jpg" "extracted/$name.jpg"
    else
      # Do not abort the whole run when the first image is not a JPEG.
      printf 'warning: no JPEG image extracted for "%s"\n' "$name" >&2
    fi
  done
done
cd extracted
generate-kochabo-index.sh
|