blob: 2878c4d36d4b360db63a66b8d319bc487f6cc35f (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
|
#!/bin/bash
#
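# This script extracts recipes from the PDFs mailed to customers of
# kochabo.at. Pass the PDF files as arguments; each recipe found is written
# to extracted/<name>.pdf with a thumbnail in extracted/<name>.jpg, and an
# index.html listing all thumbnails is generated in the current directory.
# Recipes from 2013 need extra processing for the file name in get_name().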
#
# Stop at the first command that fails. Only errexit is enabled: get_name()
# contains a pipeline whose grep stage exits non-zero when a page holds no
# recipe, and the caller handles that case by checking for an empty name.
set -o errexit
# Write a range of pages from the given input(s) to a new PDF using
# Ghostscript's pdfwrite device.
#
# Arguments:
#   $1   - first page of the range (passed as -dFirstPage)
#   $2   - last page of the range (passed as -dLastPage)
#   $3   - path of the PDF to create (passed as -sOutputFile)
#   $4.. - input file(s) and any further arguments, handed to gs unchanged
# Returns:
#   the exit status of gs
extract_pages() {
  local first=$1
  local last=$2
  local output=$3
  shift 3
  # Every expansion is quoted so that no argument is word-split or globbed.
  gs -dNOPAUSE -dQUIET -dBATCH \
    "-dFirstPage=$first" "-dLastPage=$last" \
    "-sOutputFile=$output" -sDEVICE=pdfwrite "$@"
}
# Print the recipe title contained in a PDF.
#
# The text produced by pdftotext is searched for the line
# "Nährwertangaben pro Person". That line and up to ten lines before it are
# kept; lines ending in a digit 1-5 are dropped (presumably numbering --
# TODO confirm), the last remaining line (the marker itself) is dropped, and
# what is left is joined into a single line separated by spaces.
#
# Arguments:
#   $1 - path of the PDF to read
# Outputs:
#   the title on stdout; no title text when the marker line is absent
# Returns:
#   the exit status of paste (pipefail is not enabled in this script), so a
#   non-matching grep does not count as a failure
get_name() {
  # The explicit "-" operand makes paste read stdin on every implementation;
  # POSIX paste requires a file operand.
  pdftotext "$1" /dev/stdout \
    | grep -B10 "Nährwertangaben pro Person" \
    | sed '/[1-5]$/d; $d' \
    | paste -sd " " -
  # Needed for older files with all-uppercase names; append to the pipeline:
  #   | awk '{print tolower($0)}' \
  #   | sed 's/\b\(.\)/\u\1/g; s#\bMit\b#mit#g; s#\bUnd\b#und#g; s#\bAuf\b#auf#g; s#\s\+$##;'
}
# Split every PDF given on the command line into its recipes.
#
# After the first $skip_pages pages, up to five consecutive two-page ranges
# are copied out of each input file. A range for which get_name finds no
# title is discarded; otherwise the range is stored as extracted/<title>.pdf
# and its first embedded image becomes extracted/<title>.jpg.
mkdir -p extracted
skip_pages=4
for file; do
  for i in 1 2 3 4 5; do
    # Temporary PDF in the current directory holding the two-page range.
    src="rezept-$i.pdf"
    first_page=$((skip_pages + (i - 1) * 2 + 1))
    extract_pages "$first_page" "$((first_page + 1))" "$src" "$file"

    name=$(get_name "$src")
    if [[ -z $name ]]; then
      rm "$src"
      continue
    fi

    # A "/" in the title would be taken as a directory separator and make the
    # mv below fail (aborting the whole run under set -e), so replace it.
    name=${name//\//-}

    mv "$src" "extracted/$name.pdf"
    # -j stores DCT-encoded images as .jpg; other images come out as .ppm etc.
    pdfimages -j "extracted/$name.pdf" "extracted/$name"
    # Only the first image is kept as thumbnail; drop images 001-004.
    rm -f extracted/*-{001,002,003,004}.{jpg,ppm}
    # NOTE(review): assumes the first image of every recipe is a JPEG.
    mv "extracted/$name-000.jpg" "extracted/$name.jpg"
  done
done
# Generate index.html: a static header, one thumbnail entry per
# extracted/*.jpg linking to the PDF of the same name, and a static footer.

# Escape the characters that are special in HTML text and attribute values.
# Arguments: $1 - string to escape. Outputs: the escaped string on stdout.
html_escape() {
  printf '%s' "$1" | sed \
    -e 's/&/\&amp;/g' \
    -e 's/</\&lt;/g' \
    -e 's/>/\&gt;/g' \
    -e 's/"/\&quot;/g' \
    -e "s/'/\&#39;/g"
}

output="index.html"
# The delimiter is quoted so the template is copied literally.
cat <<'EOF' >"$output"
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<style type="text/css">
html {
min-height: 100%;
background-color: #fff;
}
* {
margin: 0;
padding: 0;
}
a {
text-decoration: none;
color: #048;
}
a:hover {
color: #06C;
}
img {
border:0;
width: 220px;
}
.thumbnail {
line-height: 0;
}
div.thumbnail {
display: inline-block;
text-align: center;
vertical-align: middle;
width: 232px;
padding: 20px 0;
}
div.thumbnail p {
line-height: 1em;
padding-top: 10px;
}
img.thumbnail {
margin:2px;
border: 1px solid #fff;
}
body {
color: #444;
font-family: "Bitstream Vera Sans", Monospace;
margin-top: 30px;
text-align: center;
}
</style>
</head>
<body>
<a href="../">Übersicht</a><br />
EOF
for i in extracted/*.jpg; do
  # With no thumbnails the glob stays unexpanded; do not emit an entry for
  # the literal pattern.
  [[ -e $i ]] || continue
  n=${i%.jpg}
  bn=${n##*/}
  # Titles come from PDF text and may contain quotes, "&" or "<"; escape
  # them so they cannot break the single-quoted attributes or the markup.
  n_html=$(html_escape "$n")
  bn_html=$(html_escape "$bn")
  printf '%s\n' "<div class='thumbnail'><a href='$n_html.pdf'><img src='$n_html.jpg'></a><p>$bn_html</p></div>" >> "$output"
done
cat <<'EOF' >>"$output"
<br /><a href="../">Übersicht</a><br />
</body>
</html>
EOF
|