summaryrefslogtreecommitdiffstats
path: root/split-kochabo.sh
blob: 2878c4d36d4b360db63a66b8d319bc487f6cc35f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/bin/bash
#
# This script can extract recipes from the PDFs mailed to customers of
# kochabo.at.
# Recipes from 2013 need extra processing for the file name in get_name().
#

# Abort as soon as any command fails.
# NOTE(review): do not add "-o pipefail" without reworking get_name(): its
# grep exits non-zero when the extracted pages contain no recipe heading, and
# the main loop depends on that case yielding an empty name, not an error.
set -e

# Copy a page range of a PDF into a new PDF using Ghostscript's pdfwrite
# device.
#
# Arguments:
#   $1  - first page of the range (passed as -dFirstPage)
#   $2  - last page of the range (passed as -dLastPage)
#   $3  - path of the PDF to create
#   $4… - handed to gs unchanged (the input file(s))
# Returns: the exit status of gs.
extract_pages() {
	local first=$1
	local last=$2
	local output=$3
	shift 3

	# Every expansion is quoted so the options reach gs as single words,
	# whatever IFS the caller happens to have set.
	gs -dNOPAUSE -dQUIET -dBATCH \
		-dFirstPage="$first" -dLastPage="$last" \
		-sOutputFile="$output" -sDEVICE=pdfwrite "$@"
}

# Print the recipe title found in the PDF given as $1 on a single line.
# Prints an empty line (or nothing) when the nutrition heading is absent.
get_name() {
	local pdf=$1

	# Dump the PDF as text, keep the heading line plus up to ten lines
	# before it, drop every line ending in a digit 1-5 and the last line
	# (the heading itself when it occurs once), then join the rest with
	# single spaces.
	pdftotext "$pdf" /dev/stdout \
		| grep -B10 "Nährwertangaben pro Person" \
		| sed '/[1-5]$/d; $d' \
		| paste -s -d " "
	# needed for older files with all uppercase names
	#| awk '{print tolower($0)}' | sed 's/\b\(.\)/\u\1/g; s#\bMit\b#mit#g; s#\bUnd\b#und#g; s#\bAuf\b#auf#g; s#\s\+$##;'
}

mkdir -p extracted

# Number of leading pages of each input PDF that are skipped before the
# first extracted page pair.
skip_pages=4

# Extract one recipe from an input PDF into extracted/.
#
# Arguments:
#   $1 - input PDF
#   $2 - recipe number, starting at 1; recipe N is taken from the two pages
#        skip_pages + 2N - 1 and skip_pages + 2N
# Outputs:
#   extracted/<name>.pdf and, when the first embedded image is a JPEG,
#   extracted/<name>.jpg. Nothing is kept when get_name finds no title.
extract_recipe() {
	local file=$1
	local i=$2
	local first=$((skip_pages + (i - 1) * 2 + 1))
	local src=rezept-$i.pdf
	local name

	extract_pages "$first" "$((first + 1))" "$src" "$file"
	name=$(get_name "$src")
	# A "/" in the title would be read as a directory separator by the mv
	# calls below and make them fail.
	name=${name//\//-}

	# No title found: discard the temporary extract and move on.
	if [[ -z $name ]]; then
		rm "$src"
		return 0
	fi

	mv "$src" "extracted/$name.pdf"

	pdfimages -j "extracted/$name.pdf" "extracted/$name"
	# Only image 000 is kept as thumbnail. Note that this pattern is not
	# restricted to the current recipe.
	rm -f extracted/*-{001,002,003,004}.{jpg,ppm}
	# Image 000 is only a .jpg when pdfimages could write it as JPEG; a
	# missing thumbnail must not abort the remaining recipes under set -e.
	if [[ -e "extracted/$name-000.jpg" ]]; then
		mv "extracted/$name-000.jpg" "extracted/$name.jpg"
	else
		printf 'warning: no JPEG thumbnail found for "%s"\n' "$name" >&2
	fi
}

# Every PDF given on the command line is searched for five recipes.
for file; do
	for i in 1 2 3 4 5; do
		extract_recipe "$file" "$i"
	done
done

# Overview page, (re)created in the current directory.
output="index.html"

# Start the page with the static head, the inline stylesheet and the top
# link. The delimiter is quoted because the text contains nothing to expand.
cat >"$output" <<'EOF'
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">

<head>
        <title></title>
        <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
        <style type="text/css">
        html {
                min-height: 100%;
                background-color: #fff;
        }

        * {
                margin: 0;
                padding: 0;
        }

        a {
                text-decoration: none;
                color: #048;
        }

        a:hover {
                color: #06C;
        }

        img {
                border:0;
                width: 220px;
        }

        .thumbnail {
                line-height: 0;
        }

        div.thumbnail {
                display: inline-block;
                text-align: center;
                vertical-align: middle;
                width: 232px;
                padding: 20px 0;
        }

        div.thumbnail p {
                line-height: 1em;
                padding-top: 10px;
        }

        img.thumbnail {
                margin:2px;
                border: 1px solid #fff;
        }

        body {
                color: #444;
                font-family: "Bitstream Vera Sans", Monospace;
                margin-top: 30px;
                text-align: center;
        }
        </style>
</head>

<body>
	<a href="../">Übersicht</a><br />
EOF

# Append one thumbnail <div> per JPEG in extracted/ to the file named by $1.
# Each entry links to the PDF with the same base name and is captioned with
# that name.
# NOTE(review): names are inserted into the markup unescaped; a title
# containing "'", "&" or "<" will produce broken HTML.
append_thumbnails() {
	local out=$1
	local jpg stem caption

	for jpg in extracted/*.jpg; do
		# With no JPEG present the pattern stays unexpanded; do not emit
		# an entry for the literal "extracted/*.jpg".
		[[ -e $jpg ]] || continue
		stem=${jpg%.jpg}
		caption=${stem##*/}
		# <img> is self-closed like every other empty element of this
		# XHTML page.
		printf '%s\n' "<div class='thumbnail'><a href='$stem.pdf'><img src='$stem.jpg' /></a><p>$caption</p></div>" >>"$out"
	done
}

append_thumbnails "$output"

# Finish the page with the bottom link and the closing tags.
cat >>"$output" <<'EOF'
	<br /><a href="../">Übersicht</a><br />
</body>
</html>
EOF