value(frequency, int)... key(ngram, string)
    $ng_frequency = array_count_values($array_ngram);

    //sort array by value(frequency) desc
    arsort($ng_frequency);

    //use only top frequent ngrams
    $most_frequent = array_slice($ng_frequency, 0, $ng_number);

    foreach ($most_frequent as $ng => $number_frequencey){
        $sub_ng[] = $ng;
    }

    return $sub_ng;
}

/**
 * Score every language model against the n-grams of the input text.
 *
 * For each model, every input n-gram adds the absolute difference between
 * its position in the input list and its position in the model, or a flat
 * penalty of 400 when the model does not contain it. A lower total means a
 * closer match.
 *
 * @param array $sub_ng    Input n-grams; the keys are used as positions.
 * @param array $lm_ng     Language models: name => list of n-grams; the
 *                         list keys are used as positions.
 * @param int   $max_delta Total above which a model is abandoned early.
 *
 * @return array|string Map name => total, sorted ascending by total, or an
 *                      empty string when no model stayed below the limit.
 */
function compareNGrams($sub_ng, $lm_ng, $max_delta = 140000)
{
    $result = array();

    foreach ($lm_ng as $lm_basename => $language) {
        // A missing or malformed model cannot be scored.
        if (!is_array($language)) {
            continue;
        }

        // Build an n-gram => position table once per model instead of
        // scanning the model twice for every input n-gram. Reversing first
        // makes the earliest position win for duplicated n-grams, which is
        // what array_search() returned. Key lookup is exact, so numeric
        // look-alikes such as "1e1" and "10" no longer match each other.
        $position_of = array_flip(array_reverse($language, true));

        $delta = 0;

        foreach ($sub_ng as $key => $existing_ngram) {
            if (isset($position_of[$existing_ngram])) {
                // match: distance between the two positions
                $delta += abs($key - $position_of[$existing_ngram]);
            } else {
                // no match: fixed penalty
                $delta += 400;
            }

            // abort: this language already differs too much
            if ($delta > $max_delta) {
                break;
            }
        }

        // include only non-aborted languages in the result
        if ($delta < ($max_delta - 400)) {
            $result[$lm_basename] = $delta;
        }
    }

    // Callers receive '' (not an empty array) when nothing qualified.
    if (!$result) {
        return '';
    }

    asort($result);

    return $result;
}

/**
 * Share of the characters of $texte that do NOT match a character class.
 *
 * 0 means every character matches $plage, 1 means none does.
 *
 * @param string $texte Text to inspect, expected to be UTF-8.
 * @param string $plage PCRE character class, e.g. "[\x{0400}-\x{0523}]";
 *                      it is applied with the "u" and "i" modifiers.
 *
 * @return int|float Ratio between 0 and 1. Returns 1 for an empty text and
 *                   when the pattern cannot be applied (preg_replace()
 *                   yields null, e.g. for text that is not valid UTF-8), so
 *                   such input is never reported as matching a range.
 */
function tester_plage_utf($texte, $plage)
{
    $total = mb_strlen($texte, "UTF-8");

    // Nothing to measure; also avoids a division by zero.
    if ($total === 0) {
        return 1.0;
    }

    $reste = preg_replace("/".$plage."/ui", "", $texte);

    // preg_replace() returns null on failure; treating that as an empty
    // remainder would wrongly mean "every character matched".
    if ($reste === null) {
        return 1.0;
    }

    return mb_strlen($reste, "UTF-8") / $total;
}

/**
 * Guess the candidate languages of a text from the script it is written in.
 *
 * The ranges are tried in order; the first one for which the share of
 * characters outside the range is below its threshold wins.
 *
 * @param string $texte Text to inspect, expected to be UTF-8.
 *
 * @return array|string|false List of language codes when the script is
 *                            shared by several languages, a single code
 *                            when the script identifies the language, or
 *                            false when no range matches.
 */
function detecter_plages_utf($texte)
{
    // Each entry: character class, maximum share of characters allowed
    // outside the class, value to return.
    $plages = array(
        // Latin
        array("[\x{0041}-\x{024F}\x{1E00}-\x{1EFF}]", 0.5, array("fr", "en", "de", "it", "es", 'af','br','ca','ceb',"da","fi","nl","nr","pt","pt_BR","pt_PT","sk","ha","haw","hr","pl","cs","az","cy","et","ro","eu","hu","id","is","la","lt","lv","nb","nso","sl","so","sq","ss","st","sv","sw","tl","tlh","tn","tr","ts","ve","xh","zu")),
        // Cyrillic
        array("[\x{0400}-\x{0523}]", 0.5, array("ru","bg","kk","uk", "ky","mn", "uz","mk","sr")),
        // Greek / Coptic
        array("[\x{0370}-\x{03ff}\x{1F00}-\x{1FFE}]", 0.5, "el"),
        // Armenian
        array("[\x{0530}-\x{058A}]", 0.5, "hy"),
        // Hebrew
        array("[\x{0590}-\x{05F4}]", 0.5, "he"),
        // Arabic, Farsi, Pashto, Urdu
        array("[\x{0600}-\x{077F}]", 0.5, array("ar","fa", "ps","ur")),
        // Japanese (hiragana / katakana); tested before Chinese
        array("[\x{3040}-\x{30FF}]", 0.7, "ja"),
        // Chinese. The second range used to be written with an en dash
        // (U+2013) instead of "-", which turned the dash itself into a
        // "Chinese" character and left U+9FBC..U+9FBE out of the class.
        array("[\x{4E00}-\x{9FBF}]", 0.7, "zh"),
        // Thai
        array("[\x{0E00}-\x{0E5B}]", 0.5, "th"),
        // Hangul (Korean)
        array("[\x{1100}-\x{11F8}\x{3130}-\x{318E}\x{AC00}-\x{D7A3}]", 0.5, "ko"),
    );

    foreach ($plages as $plage) {
        list($classe, $seuil, $langues) = $plage;

        if (tester_plage_utf($texte, $classe) < $seuil) {
            return $langues;
        }
    }

    return false;
}

/**
 * Detect the language of a text.
 *
 * First pass: find the script the text is written in, which narrows the
 * possible answers (Arabic characters, for example, can only be Arabic,
 * Farsi, Pashto or Urdu). Second pass: run the n-gram comparison, limited
 * to the languages left by the first pass.
 *
 * The language models are read from $GLOBALS["ngrams"], keyed by language
 * code; candidates without a model there are left out of the comparison.
 *
 * @param string $texte Text to analyse; HTML tags are stripped first.
 *
 * @return string|false Language code, or false when the text is shorter
 *                      than 6 characters or no language can be determined.
 */
function _detecter_langue($texte)
{
    $texte = strip_tags($texte);
    $texte = str_replace("’", "'", $texte);
    $texte = str_replace("\"", " ", $texte);

    // Text too short: the language cannot be detected.
    if (mb_strlen($texte, "utf-8") < 6) {
        return false;
    }

    $possibles = detecter_plages_utf($texte);

    // Unknown script: report failure with false, like every other path.
    if (!$possibles) {
        return false;
    }

    // The script alone identifies the language.
    if (!is_array($possibles)) {
        return $possibles;
    }

    // Keep only the candidates whose model is actually available.
    $ngrams = array();
    foreach ($possibles as $lang) {
        if (isset($GLOBALS["ngrams"][$lang])) {
            $ngrams[$lang] = $GLOBALS["ngrams"][$lang];
        }
    }

    if (!$ngrams) {
        return false;
    }

    $sub_ng = createNGrams($texte);
    $result_array = compareNGrams($sub_ng, $ngrams, 140000);

    if (!$result_array) {
        return false;
    }

    // compareNGrams() sorts ascending, so the first key is the best match.
    reset($result_array);

    return key($result_array);
}

?>