summaryrefslogtreecommitdiffstats
path: root/urlencode
blob: c472ab8802fe916e40247c12921d7f1a69806e49 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
:
##########################################################################
# Title      :	urlencode - encode URL data
# Author     :	Heiner Steven (heiner.steven@odn.de)
# Date       :	2000-03-15
# Requires   :	awk
# Categories :	File Conversion, WWW, CGI
# SCCS-Id.   :	@(#) urlencode	1.4 06/10/29
##########################################################################
# Description
#	Encode data according to
#	    RFC 1738: "Uniform Resource Locators (URL)" and
#	    RFC 1866: "Hypertext Markup Language - 2.0" (HTML)
#
#	This encoding is used e.g. for the MIME type
#	"application/x-www-form-urlencoded"
#
# Notes
#    o	The default behaviour is not to encode the line endings. This
#	may not be what was intended, because the result will be
#	multiple lines of output (which cannot be used in a URL or an
#	HTTP "POST" request). If the desired output should be one
#	line, use the "-l" option.
#
#    o	The "-l" option assumes that the end-of-line is denoted by
#	the character LF (ASCII 10). This is not true for Windows or
#	Mac systems, where the end of a line is denoted by the two
#	characters CR LF (ASCII 13 10).
#	We use this for symmetry; data processed in the following way:
#		cat | urlencode -l | urldecode -l
#	should (and will) result in the original data
#
#    o	Large lines (or binary files) will break many AWK
#    	implementations. If you get the message
#		awk: record `...' too long
#		 record number xxx
#	consider using GNU AWK (gawk).
#
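#    o	Without "-l", urlencode terminates every output line (and thus
#	its output) with an EOL character. With "-l" the line endings
#	are written in encoded form only, so the output does not end
#	with a literal EOL character.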
#
# Thanks to Stefan Brozinski for pointing out a bug related to non-standard
# locales.
#
# See also
#	urldecode
##########################################################################

# Release number shown in the usage text.
VER='1.4'
# Name this script was invoked as, with the directory part removed;
# used in the usage text and as a prefix for diagnostics.
PN=`basename "$0"`

# awk interpreter to run. A value of AWK inherited from the environment
# (even an empty one) is kept; "awk" is used only when AWK is unset.
AWK=${AWK-awk}

# Usage - write the usage summary to standard error and terminate the
# script with exit status 1. Reads the globals PN (program name) and
# VER (version). Never returns to the caller.
Usage () {
    {
	echo "$PN - encode URL data, $VER"
	echo "usage: $PN [-l] [file ...]"
	echo "    -l:  encode line endings (result will be one line of output)"
	echo ""
	echo "The default is to encode each input line on its own."
    } >&2
    exit 1
}

# Msg - write each argument as a separate line to standard error, every
# line prefixed with the program name (global PN) and a colon.
# With no arguments nothing is written.
Msg () {
    for MsgLine in "$@"
    do
	echo "$PN: $MsgLine"
    done >&2
}

# Fatal - report all arguments through Msg (one line each), then
# terminate the script with exit status 1. Never returns to the caller.
Fatal () {
    Msg "$@"
    exit 1
}

# Parse the command line.
#
# The output of getopt(1) is captured in a variable first so that its
# exit status can be tested. In the form
#	set -- `getopt ...` || Usage
# the status seen by "||" is that of "set", which succeeds even when
# getopt rejected the arguments, so invalid options went undetected.
#
# NOTE: the getopt output is re-split by the shell on purpose (this is
# how the traditional getopt interface works); as a consequence file
# names containing whitespace or glob characters are not preserved.
Args=`getopt hl "$@" 2>/dev/null` || Usage
set -- $Args
[ $# -lt 1 ] && Usage			# getopt always prints at least "--"

# Walk over the normalized option list. EncodeEOL ends up as "yes" when
# -l was given, "no" otherwise; afterwards "$@" holds the file names.
EncodeEOL=no
while [ $# -gt 0 ]
do
    case "$1" in
	-l)	EncodeEOL=yes;;
	--)	shift; break;;		# end of options
	-h)	Usage;;
	-*)	Usage;;			# anything else is unsupported
	*)	break;;			# First file name
    esac
    shift
done

# Run awk in the C locale. The ord[] table and the character class used
# below assume one byte per character. LC_ALL takes precedence over the
# LC_* category variables and over LANG, so exporting LANG alone has no
# effect when the caller has LC_ALL (or LC_CTYPE) set.
LC_ALL=C LANG=C
export LC_ALL LANG

# Encode every input line read from the files in "$@" (or from standard
# input when there are none) and write the result to standard output.
$AWK '
    BEGIN {
	# Deliberately restricted to what very old awk implementations
	# offer: a character is converted to its numeric code with the
	# table ord[], and the two hexadecimal digits are looked up in
	# hextab[] instead of relying on printf("%02X").

	EOL = "%0A"		# "end of line" string (encoded)
	split ("1 2 3 4 5 6 7 8 9 A B C D E F", hextab, " ")
	hextab [0] = 0		# split() starts at index 1; add digit 0
	for ( i=1; i<=255; ++i ) ord [ sprintf ("%c", i) "" ] = i + 0
	# The shell inserts the value of its EncodeEOL variable here.
	if ("'"$EncodeEOL"'" == "yes") EncodeEOL = 1; else EncodeEOL = 0
    }
    {
	encoded = ""
	len = length ($0)	# loop invariant, computed once per line
	for ( i=1; i<=len; ++i ) {
	    c = substr ($0, i, 1)
	    if ( c ~ /[a-zA-Z0-9.-]/ ) {
		encoded = encoded c		# safe character
	    } else if ( c == " " ) {
		encoded = encoded "+"	# a blank is written as a plus sign
	    } else {
		# unsafe character, encode it as a two-digit hex-number
		lo = ord [c] % 16
		hi = int (ord [c] / 16);
		encoded = encoded "%" hextab [hi] hextab [lo]
	    }
	}
	if ( EncodeEOL ) {
	    # One long output line: the line ending is written in its
	    # encoded form and no literal newline is added.
	    printf ("%s", encoded EOL)
	} else {
	    print encoded
	}
    }
    END {
	# Nothing is appended at the end of the output. With the line
	# endings encoded, the output therefore has no final newline.
    }
' "$@"