* use encoding 'utf8';
[lhc/web/wiklou.git] / maintenance / entities2literals.pl
1 #!/usr/bin/env perl
2 # Takes STDIN and converts hexadecimal, decimal and named HTML
3 # entities to their respective literals.
4 #
5 # Usage: perl entities2literals.pl < file_to_convert [> outfile]
6 # Reference: http://www.w3.org/TR/REC-html40/sgml/entities.html
7 # Copyright 2005 Ævar Arnfjörð Bjarmason <avarab@gmail.com> No rights reserved
8
use utf8;
use strict;
use warnings;
# The `encoding` pragma was deprecated in Perl 5.18 and is a fatal error from
# Perl 5.26 on.  `use utf8` covers the source-encoding half of what it did and
# `use open ... :std` covers the STDIN/STDOUT half.
use open qw(:std :encoding(UTF-8));

# Highest code point Unicode defines; larger numeric references are not
# characters and are left untouched.
use constant MAX_CODEPOINT => 0x10FFFF;

# Returns the character for $codepoint, or nothing (undef in scalar context)
# when the number is above the Unicode range or is a UTF-16 surrogate, neither
# of which can be written out as valid UTF-8.
sub _char_for {
    my ($codepoint) = @_;

    return if $codepoint > MAX_CODEPOINT;
    return if $codepoint >= 0xD800 && $codepoint <= 0xDFFF;
    return chr $codepoint;
}

# Reads "<decimal code point> <entity name>" lines from the filehandle $fh and
# returns a hash reference mapping entity name to code point.  Lines that do
# not have exactly that shape (blank lines, stray text) are skipped.
sub load_entity_table {
    my ($fh) = @_;

    my %codepoint_for;
    while (my $line = <$fh>) {
        next
            unless $line
            =~ m{\A \s* ([0-9]+) [ ]+ ([A-Za-z][A-Za-z0-9]*) \s* \z}x;
        $codepoint_for{$2} = $1;
    }
    return \%codepoint_for;
}

# Returns a copy of $text in which decimal (&#38;), hexadecimal (&#x26;) and
# named (&amp;) character references are replaced by their literal characters.
# $codepoint_for is a hash reference mapping entity name to code point.
#
# All three forms are handled in ONE left-to-right pass, so the output of a
# replacement is never scanned again: "&amp;lt;" becomes "&lt;", not "<".
# (Running one substitution per entity, as this script used to, decoded such
# input twice.)  References with an unknown name or an unusable number are
# copied through unchanged.
sub decode_entities {
    my ($text, $codepoint_for) = @_;

    # An absurdly long hexadecimal reference overflows hex(); the result is
    # still rejected by _char_for, so the warnings would only be noise.
    no warnings qw(overflow portable);

    $text =~ s{
        (                                   # $1: the whole reference
            &
            (?: \#   ([0-9]+)               # $2: decimal digits
              | \# x ([0-9a-fA-F]+)         # $3: hexadecimal digits
              | ([A-Za-z][A-Za-z0-9]*)      # $4: entity name
            )
            ;
        )
    }{
        my ($reference, $decimal, $hexadecimal, $name) = ($1, $2, $3, $4);
        my $codepoint = defined $decimal     ? $decimal
                      : defined $hexadecimal ? hex($hexadecimal)
                      :                        $codepoint_for->{$name};
        my $char = defined $codepoint ? _char_for($codepoint) : undef;
        defined $char ? $char : $reference;
    }gex;

    return $text;
}

# Script entry point: loads the entity table that follows __DATA__, slurps
# everything from STDIN (or from the files named in @ARGV) and prints the
# decoded text to STDOUT.
sub main {
    my $codepoint_for = load_entity_table(\*DATA);

    # List-context <> reads every line of every input; joining on the empty
    # string reassembles the text exactly as it was read.
    my $input = join q{}, <>;

    print decode_entities($input, $codepoint_for);
    return;
}

# Run only when executed as a script, so the subs above can be loaded and
# exercised on their own without touching STDIN.
main() unless caller;
23
24 __DATA__
25 34 quot
26 38 amp
27 60 lt
28 62 gt
29 160 nbsp
30 161 iexcl
31 162 cent
32 163 pound
33 164 curren
34 165 yen
35 166 brvbar
36 167 sect
37 168 uml
38 169 copy
39 170 ordf
40 171 laquo
41 172 not
42 173 shy
43 174 reg
44 175 macr
45 176 deg
46 177 plusmn
47 178 sup2
48 179 sup3
49 180 acute
50 181 micro
51 182 para
52 183 middot
53 184 cedil
54 185 sup1
55 186 ordm
56 187 raquo
57 188 frac14
58 189 frac12
59 190 frac34
60 191 iquest
61 192 Agrave
62 193 Aacute
63 194 Acirc
64 195 Atilde
65 196 Auml
66 197 Aring
67 198 AElig
68 199 Ccedil
69 200 Egrave
70 201 Eacute
71 202 Ecirc
72 203 Euml
73 204 Igrave
74 205 Iacute
75 206 Icirc
76 207 Iuml
77 208 ETH
78 209 Ntilde
79 210 Ograve
80 211 Oacute
81 212 Ocirc
82 213 Otilde
83 214 Ouml
84 215 times
85 216 Oslash
86 217 Ugrave
87 218 Uacute
88 219 Ucirc
89 220 Uuml
90 221 Yacute
91 222 THORN
92 223 szlig
93 224 agrave
94 225 aacute
95 226 acirc
96 227 atilde
97 228 auml
98 229 aring
99 230 aelig
100 231 ccedil
101 232 egrave
102 233 eacute
103 234 ecirc
104 235 euml
105 236 igrave
106 237 iacute
107 238 icirc
108 239 iuml
109 240 eth
110 241 ntilde
111 242 ograve
112 243 oacute
113 244 ocirc
114 245 otilde
115 246 ouml
116 247 divide
117 248 oslash
118 249 ugrave
119 250 uacute
120 251 ucirc
121 252 uuml
122 253 yacute
123 254 thorn
124 255 yuml
125 338 OElig
126 339 oelig
127 352 Scaron
128 353 scaron
129 376 Yuml
130 402 fnof
131 710 circ
132 732 tilde
133 913 Alpha
134 914 Beta
135 915 Gamma
136 916 Delta
137 917 Epsilon
138 918 Zeta
139 919 Eta
140 920 Theta
141 921 Iota
142 922 Kappa
143 923 Lambda
144 924 Mu
145 925 Nu
146 926 Xi
147 927 Omicron
148 928 Pi
149 929 Rho
150 931 Sigma
151 932 Tau
152 933 Upsilon
153 934 Phi
154 935 Chi
155 936 Psi
156 937 Omega
157 945 alpha
158 946 beta
159 947 gamma
160 948 delta
161 949 epsilon
162 950 zeta
163 951 eta
164 952 theta
165 953 iota
166 954 kappa
167 955 lambda
168 956 mu
169 957 nu
170 958 xi
171 959 omicron
172 960 pi
173 961 rho
174 962 sigmaf
175 963 sigma
176 964 tau
177 965 upsilon
178 966 phi
179 967 chi
180 968 psi
181 969 omega
182 977 thetasym
183 978 upsih
184 982 piv
185 8194 ensp
186 8195 emsp
187 8201 thinsp
188 8204 zwnj
189 8205 zwj
190 8206 lrm
191 8207 rlm
192 8211 ndash
193 8212 mdash
194 8216 lsquo
195 8217 rsquo
196 8218 sbquo
197 8220 ldquo
198 8221 rdquo
199 8222 bdquo
200 8224 dagger
201 8225 Dagger
202 8226 bull
203 8230 hellip
204 8240 permil
205 8242 prime
206 8243 Prime
207 8249 lsaquo
208 8250 rsaquo
209 8254 oline
210 8260 frasl
211 8364 euro
212 8465 image
213 8472 weierp
214 8476 real
215 8482 trade
216 8501 alefsym
217 8592 larr
218 8593 uarr
219 8594 rarr
220 8595 darr
221 8596 harr
222 8629 crarr
223 8656 lArr
224 8657 uArr
225 8658 rArr
226 8659 dArr
227 8660 hArr
228 8704 forall
229 8706 part
230 8707 exist
231 8709 empty
232 8711 nabla
233 8712 isin
234 8713 notin
235 8715 ni
236 8719 prod
237 8721 sum
238 8722 minus
239 8727 lowast
240 8730 radic
241 8733 prop
242 8734 infin
243 8736 ang
244 8743 and
245 8744 or
246 8745 cap
247 8746 cup
248 8747 int
249 8756 there4
250 8764 sim
251 8773 cong
252 8776 asymp
253 8800 ne
254 8801 equiv
255 8804 le
256 8805 ge
257 8834 sub
258 8835 sup
259 8836 nsub
260 8838 sube
261 8839 supe
262 8853 oplus
263 8855 otimes
264 8869 perp
265 8901 sdot
266 8968 lceil
267 8969 rceil
268 8970 lfloor
269 8971 rfloor
270 9001 lang
271 9002 rang
272 9674 loz
273 9824 spades
274 9827 clubs
275 9829 hearts
276 9830 diams