value(frequency, int)... key(ngram, string)
$ng_frequency = array_count_values($array_ngram);
//sort array by value(frequency) desc
arsort($ng_frequency);
//use only top frequent ngrams
$most_frequent = array_slice($ng_frequency, 0, $ng_number);
foreach ($most_frequent as $ng => $number_frequencey){
$sub_ng[] = $ng;
}
return $sub_ng;
}
/**
 * Score every candidate language model against the n-grams of a text.
 *
 * For each language, the score ("delta") is the sum, over the text's
 * n-grams, of the distance between the n-gram's position in $sub_ng and
 * its position in the language model, or a fixed penalty of 400 when the
 * model does not contain the n-gram. Lower is better.
 *
 * N-grams are matched the way PHP array keys are: exact string equality,
 * with integer-like strings ("123") equal to the matching integer (123).
 * This avoids the false matches of loose comparison (e.g. "1e1" == "10").
 *
 * @param array $sub_ng    N-grams of the text; the array key is used as the rank.
 * @param array $lm_ng     Map of language name => list of n-grams (key = rank).
 * @param int   $max_delta Score above which a language is abandoned.
 * @return array|string    Map of language name => delta, sorted by ascending
 *                         delta, or '' when no language qualifies.
 */
function compareNGrams($sub_ng, $lm_ng, $max_delta = 140000) {
    // Penalty added for each n-gram that is absent from the language model.
    $penalty = 400;
    $result = array();

    foreach ($lm_ng as $lm_basename => $language) {
        // A missing or malformed model cannot be scored: leave it out.
        if (!is_array($language)) {
            continue;
        }

        // Build an n-gram => rank lookup once per language instead of
        // scanning the model twice for every n-gram. The first occurrence
        // wins, which is what array_search() would have returned.
        $ranks = array();
        foreach ($language as $rank => $ngram) {
            if (!isset($ranks[$ngram])) {
                $ranks[$ngram] = $rank;
            }
        }

        $delta = 0;
        foreach ($sub_ng as $key => $existing_ngram) {
            if (isset($ranks[$existing_ngram])) {
                // Match: add the rank distance.
                $delta += abs($key - $ranks[$existing_ngram]);
            } else {
                // No match.
                $delta += $penalty;
            }
            // Abort: this language already differs too much.
            if ($delta > $max_delta) {
                break;
            }
        }

        // Keep only languages that stayed clearly below the abort threshold.
        if ($delta < ($max_delta - $penalty)) {
            $result[$lm_basename] = $delta;
        }
    }

    // Callers rely on a falsy '' (not an empty array) when nothing qualifies.
    if (!$result) {
        return '';
    }

    asort($result);
    return $result;
}
/**
 * Measure how much of a text lies OUTSIDE a given character range.
 *
 * Characters matching the class are removed (case-insensitively, in UTF-8
 * mode) and the length of what is left is divided by the original length.
 *
 * @param string $texte Text to inspect.
 * @param string $plage PCRE character class, e.g. '[\x{0400}-\x{0523}]'.
 *                      It is inserted into the pattern as-is (not escaped).
 * @return int|float    Share of characters not matched by $plage, from 0
 *                      (every character is in the range) to 1 (none is).
 *                      Returns 1.0 when the text is empty or the pattern
 *                      cannot be applied.
 */
function tester_plage_utf($texte, $plage) {
    $total = mb_strlen($texte, "UTF-8");
    // Nothing to measure: report "no character in range" rather than
    // dividing by zero.
    if ($total === 0) {
        return 1.0;
    }

    $reste = preg_replace("/".$plage."/ui", "", $texte);
    // preg_replace() yields null on failure (e.g. invalid UTF-8 input).
    // Counting that as length 0 would claim the whole text matched.
    if ($reste === null) {
        return 1.0;
    }

    return mb_strlen($reste, "UTF-8") / $total;
}
/**
 * Guess the candidate language(s) of a text from the script it is written in.
 *
 * The scripts are tried in a fixed order; the first one whose characters
 * make up a large enough share of the text wins.
 *
 * @param string $texte Text to inspect.
 * @return array|string|false List of language codes when the script is shared
 *                            by several languages, a single code when the
 *                            script identifies the language, or false when
 *                            no script matches.
 */
function detecter_plages_utf($texte) {
    // Each entry: character class, maximum share of characters allowed
    // OUTSIDE that class, and the value to return. Order matters: kana is
    // tested before the CJK ideographs.
    $plages = array(
        // Latin
        array(
            '[\x{0041}-\x{024F}\x{1E00}-\x{1EFF}]',
            0.5,
            array("fr", "en", "de", "it", "es", 'af','br','ca','ceb',"da","fi","nl","nr","pt","pt_BR","pt_PT","sk","ha","haw","hr","pl","cs","az","cy","et","ro","eu","hu","id","is","la","lt","lv","nb","nso","sl","so","sq","ss","st","sv","sw","tl","tlh","tn","tr","ts","ve","xh","zu"),
        ),
        // Cyrillic
        array('[\x{0400}-\x{0523}]', 0.5, array("ru","bg","kk","uk", "ky","mn", "uz","mk","sr")),
        // Greek / Coptic
        array('[\x{0370}-\x{03ff}\x{1F00}-\x{1FFE}]', 0.5, "el"),
        // Armenian
        array('[\x{0530}-\x{058A}]', 0.5, "hy"),
        // Hebrew
        array('[\x{0590}-\x{05F4}]', 0.5, "he"),
        // Arabic script: Arabic, Farsi, Pashto, Urdu
        array('[\x{0600}-\x{077F}]', 0.5, array("ar","fa", "ps","ur")),
        // Japanese (hiragana / katakana); a lower share is enough
        array('[\x{3040}-\x{30FF}]', 0.7, "ja"),
        // Chinese. The class used to contain an en dash instead of a hyphen,
        // which made U+2013 count as a Chinese character; it is now one
        // plain range.
        array('[\x{4E00}-\x{9FBF}]', 0.7, "zh"),
        // Thai
        array('[\x{0E00}-\x{0E5B}]', 0.5, "th"),
        // Hangul - Korean
        array('[\x{1100}-\x{11F8}\x{3130}-\x{318E}\x{AC00}-\x{D7A3}]', 0.5, "ko"),
    );

    foreach ($plages as $plage) {
        list($classe, $seuil, $langues) = $plage;
        if (tester_plage_utf($texte, $classe) < $seuil) {
            return $langues;
        }
    }

    return false;
}
/**
 * Detect the language of a text.
 *
 * First pass: find out which alphabet the text is written in, which narrows
 * down the possible answers (for example, "Arabic" characters can only be
 * Arabic, Farsi, Pashto or Urdu).
 * Second pass: run a classic trigram test, restricted to the languages that
 * the first pass left possible.
 *
 * Language models are read from $GLOBALS["ngrams"], keyed by language code.
 *
 * @param string $texte Text to analyse; HTML tags are stripped first.
 * @return string|false Detected language code, or false when the text is too
 *                      short or no language can be determined.
 */
function _detecter_langue($texte) {
    $texte = strip_tags($texte);
    $texte = str_replace("’", "'", $texte);
    $texte = str_replace("\"", " ", $texte);

    // Text too short: the language cannot be detected.
    if (mb_strlen($texte, "utf-8") < 6) {
        return false;
    }

    $possibles = detecter_plages_utf($texte);
    if (!$possibles) {
        // Always false on failure (this path used to return null).
        return false;
    }
    if (!is_array($possibles)) {
        // The script alone identifies the language.
        return $possibles;
    }

    // Keep only the candidates whose model is actually loaded, so that no
    // null model is handed to compareNGrams().
    $ngrams = array();
    foreach ($possibles as $lang) {
        if (isset($GLOBALS["ngrams"][$lang]) && is_array($GLOBALS["ngrams"][$lang])) {
            $ngrams[$lang] = $GLOBALS["ngrams"][$lang];
        }
    }
    if (!$ngrams) {
        return false;
    }

    $sub_ng = createNGrams($texte);
    $result_array = compareNGrams($sub_ng, $ngrams, 140000);
    if (!is_array($result_array) || !$result_array) {
        return false;
    }

    // compareNGrams() sorts by ascending delta: the first key is the best match.
    reset($result_array);
    return key($result_array);
}
?>