#!/usr/bin/env perl
# Converts decimal (&#225;) and named (&aacute;) HTML entities in the input
# to their respective literal characters, written out as UTF-8.
# Usage: perl entities2literals.pl < file_to_convert [> outfile]
#
# Copyright 2005 Ævar Arnfjörð Bjarmason No rights reserved
#
# Provenance: maintenance/entities2literals.pl, added by commit
# a51e42ce575b7a9694f928e392c327909fbb3e4c
#   From:      Ævar Arnfjörð Bjarmason
#   Date:      Sun, 3 Apr 2005 20:38:51 +0000 (+0000)
#   Subject:   * NEW: A script to convert html entitites to Unicode literals.
#   X-Git-Tag: 1.5.0alpha1~405
#   X-Git-Url: https://git.cyclocoop.org/%7B%24www_url%7Dadmin/compta/exercices/journal.php?a=commitdiff_plain;h=a51e42ce575b7a9694f928e392c327909fbb3e4c;p=lhc%2Fweb%2Fwiklou.git

use strict;
use warnings;

# Named entity => decimal code point.
#
# NOTE(review): this table reached us with its entities already decoded
# (every pair read "<char>:<char>"), so the names and numbers below were
# reconstructed from the characters that were visible, using the HTML 4
# Latin-1 entity set.  The order matches the original table.
my %CODEPOINT_FOR = (
    aacute => 225, Aacute => 193, acirc  => 226, Acirc  => 194,
    agrave => 224, Agrave => 192, aring  => 229, Aring  => 197,
    atilde => 227, Atilde => 195, auml   => 228, Auml   => 196,
    aelig  => 230, AElig  => 198, ccedil => 231, Ccedil => 199,
    eth    => 240, ETH    => 208,
    eacute => 233, Eacute => 201, ecirc  => 234, Ecirc  => 202,
    egrave => 232, Egrave => 200, euml   => 235, Euml   => 203,
    iacute => 237, Iacute => 205, icirc  => 238, Icirc  => 206,
    igrave => 236, Igrave => 204, iuml   => 239, Iuml   => 207,
    ntilde => 241, Ntilde => 209,
    oacute => 243, Oacute => 211, ocirc  => 244, Ocirc  => 212,
    ograve => 242, Ograve => 210, oslash => 248, Oslash => 216,
    otilde => 245, Otilde => 213, ouml   => 246, Ouml   => 214,
    szlig  => 223, thorn  => 254, THORN  => 222,
    uacute => 250, Uacute => 218, ucirc  => 251, Ucirc  => 219,
    ugrave => 249, Ugrave => 217, uuml   => 252, Uuml   => 220,
    yacute => 253, Yacute => 221, yuml   => 255,
);

# entities_to_literals($bytes)
#
# Returns a copy of $bytes in which every well-formed decimal reference
# ("&#" digits ";") and every named entity listed in %CODEPOINT_FOR has been
# replaced by the UTF-8 encoding of the character it stands for.
#
# The argument is treated as a byte string: everything that is not a
# recognised entity is passed through untouched, whatever its encoding.
# Hexadecimal references, unknown names and malformed references are left
# exactly as written.  An undefined argument yields the empty string.
sub entities_to_literals {
    my ($text) = @_;
    return '' unless defined $text;

    # One pass over the text: $1 is the whole entity, $2 the digits of a
    # decimal reference, $3 the name of a named entity.
    $text =~ s{(&(?:\#([0-9]+)|([A-Za-z]+));)}{_literal_or_original($1, $2, $3)}ge;

    return $text;
}

# Returns the UTF-8 bytes for one matched entity, or the entity text itself
# when it does not denote a character we can emit.
sub _literal_or_original {
    my ($entity, $number, $name) = @_;

    my $code_point = defined $number ? $number : $CODEPOINT_FOR{$name};

    return $entity unless defined $code_point;

    # Zero, surrogates and anything above U+10FFFF are not characters;
    # keeping the reference is safer than writing an invalid byte sequence.
    return $entity if $code_point < 1 || $code_point > 0x10FFFF;
    return $entity if $code_point >= 0xD800 && $code_point <= 0xDFFF;

    my $literal = chr $code_point;
    utf8::encode($literal);    # characters -> UTF-8 bytes, in place

    return $literal;
}

# Reads all input (files named on the command line, else STDIN), converts
# it and writes the result to STDOUT.
sub main {
    local $/;                  # slurp mode, restored when main() returns

    my $input = join '', <>;

    # We are writing bytes we encoded ourselves; make sure no output layer
    # re-encodes them.
    binmode STDOUT;
    print entities_to_literals($input);

    return;
}

# Run only when executed as a script, so the file can also be loaded for
# its subs without consuming STDIN.
main() unless caller;