#!/usr/bin/env perl
# entities2literals.pl - convert decimal and named HTML entities to their
# respective UTF-8 literals.
#
# Usage: perl entities2literals.pl < file_to_convert [> outfile]
#
# Provenance: maintenance/entities2literals.pl as introduced by commit
# a51e42ce575b7a9694f928e392c327909fbb3e4c (Sun, 3 Apr 2005 20:38:51 +0000),
# "NEW: A script to convert html entities to Unicode literals."
#
# Copyright 2005 Ævar Arnfjörð Bjarmason No rights reserved

use strict;
use warnings;

# Named entity => Unicode code point.
# NOTE(review): this table was reconstructed from the literal characters found
# in the original data section (which appeared to have had its entities
# decoded in transit); confirm the names against the upstream copy.
my %CODE_POINT_FOR = (
    aacute => 225, Aacute => 193,
    acirc  => 226, Acirc  => 194,
    agrave => 224, Agrave => 192,
    aring  => 229, Aring  => 197,
    atilde => 227, Atilde => 195,
    auml   => 228, Auml   => 196,
    aelig  => 230, AElig  => 198,
    ccedil => 231, Ccedil => 199,
    eth    => 240, ETH    => 208,
    eacute => 233, Eacute => 201,
    ecirc  => 234, Ecirc  => 202,
    egrave => 232, Egrave => 200,
    euml   => 235, Euml   => 203,
    iacute => 237, Iacute => 205,
    icirc  => 238, Icirc  => 206,
    igrave => 236, Igrave => 204,
    iuml   => 239, Iuml   => 207,
    ntilde => 241, Ntilde => 209,
    oacute => 243, Oacute => 211,
    ocirc  => 244, Ocirc  => 212,
    ograve => 242, Ograve => 210,
    oslash => 248, Oslash => 216,
    otilde => 245, Otilde => 213,
    ouml   => 246, Ouml   => 214,
    szlig  => 223,
    thorn  => 254, THORN  => 222,
    uacute => 250, Uacute => 218,
    ucirc  => 251, Ucirc  => 219,
    ugrave => 249, Ugrave => 217,
    uuml   => 252, Uuml   => 220,
    yacute => 253, Yacute => 221,
    yuml   => 255,
);

# Largest valid Unicode code point, and the surrogate range that must never be
# emitted as a character on its own.
my $MAX_CODE_POINT  = 0x10FFFF;
my $SURROGATE_FIRST = 0xD800;
my $SURROGATE_LAST  = 0xDFFF;

# entities_to_literals($text)
#
# Returns a copy of $text (a byte string) in which every well-formed decimal
# reference ("&#225;") and every named entity present in %CODE_POINT_FOR
# ("&aacute;") is replaced by the UTF-8 encoding of the character it denotes.
# Anything else -- unknown names, hex references, malformed references such as
# "&#;" or "&# 12;" -- is passed through untouched.  An undefined $text yields
# the empty string.
sub entities_to_literals {
    my ($text) = @_;
    return '' unless defined $text;

    # One pass over the text: $1 is the whole reference, $2 the decimal
    # digits (if numeric), $3 the entity name (if named).
    $text =~ s{(&(?:\#([0-9]+)|([A-Za-z][A-Za-z0-9]*));)}
              {_literal_or_original($1, $2, $3)}ge;

    return $text;
}

# Maps one matched reference to its UTF-8 bytes, or hands back the reference
# itself when it cannot be converted safely.
sub _literal_or_original {
    my ($entity, $decimal, $name) = @_;

    my $code_point = defined $decimal ? $decimal + 0 : $CODE_POINT_FOR{$name};

    # Unknown name: leave the entity exactly as written.
    return $entity unless defined $code_point;

    # Out-of-range or surrogate values have no valid literal form.
    return $entity if $code_point > $MAX_CODE_POINT;
    return $entity
        if $code_point >= $SURROGATE_FIRST && $code_point <= $SURROGATE_LAST;

    # Encode explicitly so the result stays a byte string; this keeps the
    # output encoding independent of which code points happen to occur.
    my $literal = chr $code_point;
    utf8::encode($literal);
    return $literal;
}

# Slurps the input (STDIN, or files named on the command line), converts it
# and writes the result to STDOUT.
sub main {
    local $/;    # slurp mode, restored when main() returns
    my $text = <>;
    $text = '' unless defined $text;

    print entities_to_literals($text)
        or die "entities2literals: cannot write output: $!\n";
    return;
}

# Run only when executed as a script, not when loaded from another file.
main() unless caller;

1;