// value(frequency, int)... key(ngram, string)
	$ng_frequency = array_count_values($array_ngram);

	//sort array by value(frequency) desc
	arsort($ng_frequency);

	//use only top frequent ngrams
	$most_frequent = array_slice($ng_frequency, 0, $ng_number);
	$sub_ng = array();
	foreach ($most_frequent as $ng => $number_frequencey){
		$sub_ng[] = $ng;
	}

	return $sub_ng;
}

/**
 * Rank candidate languages by how closely their n-gram tables match the
 * n-grams found in a text.
 *
 * For each language, every text n-gram adds to a distance score: the absolute
 * difference between its key in $sub_ng and its key in the language table when
 * the table contains it, or a flat penalty of 400 when it does not.
 * A language is abandoned as soon as its score exceeds $max_delta.
 *
 * @param array $sub_ng
 *   set of n-grams found in the text
 * @param array $lm_ng
 *   sets of possible n-grams, indexed by language identifier
 * @param int $max_delta
 *   score above which a language is dropped
 * @return array|string
 *   language identifier => score, sorted by ascending score,
 *   or an empty string when no language was retained
 */
function compareNGrams($sub_ng, $lm_ng, $max_delta = 140000){
	$result = array();

	foreach ($lm_ng as $lm_basename => $language){
		// A missing or malformed table cannot be compared: ignore that language
		if (!is_array($language)){
			continue;
		}

		// Index the table once (n-gram => key) so each lookup is a hash access
		// instead of two linear scans; the first occurrence wins, which is the
		// key array_search() would have returned.
		$ranks = array();
		foreach ($language as $rank => $ngram){
			if ((is_string($ngram) OR is_int($ngram)) AND !isset($ranks[$ngram])){
				$ranks[$ngram] = $rank;
			}
		}

		$delta = 0;
		//compare each ngram of input text to current lm-array
		foreach ($sub_ng as $key => $existing_ngram){
			if (isset($ranks[$existing_ngram])){
				//match
				$delta += abs($key-$ranks[$existing_ngram]);
			} else {
				//no match
				$delta += 400;
			}
			//abort: this language already differs too much
			if ($delta>$max_delta){
				break;
			}
		}

		//include only non-aborted languages in result array
		if ($delta<($max_delta-400)){
			$result[$lm_basename] = $delta;
		}
	}

	// Callers test is_array() on the result: keep the historical '' for "nothing found"
	if (!count($result)){
		return '';
	}

	asort($result);

	return $result;
}

/**
 * Return the proportion of the text that is NOT inside the given UTF range.
 *
 * Every character matching the class is removed (case-insensitively, in
 * Unicode mode) and the length of what remains is divided by the length of
 * the original text.
 *
 * @param string $texte
 *   text to measure
 * @param string $plage
 *   PCRE character class, without delimiters (it is wrapped in "/.../ui")
 * @return float|int
 *   0 when every character is in the range, 1 when none is;
 *   1.0 for an empty text
 */
function tester_plage_utf($texte, $plage){
	$total = mb_strlen($texte, "UTF-8");

	// Nothing to measure: report "entirely outside the range" rather than
	// dividing by zero, so that an empty text never matches a range.
	if (!$total){
		return 1.0;
	}

	$hors_plage = preg_replace("/" . $plage . "/ui", "", $texte);
	if ($hors_plage === null){
		// preg_replace() failed. The historical code passed that null to
		// mb_strlen(), which counted 0 characters; keep the same result
		// without relying on the deprecated null argument.
		// NOTE(review): this makes a text that PCRE rejects look fully
		// in range - confirm whether returning 1.0 would be preferable.
		$hors_plage = '';
	}

	$test = mb_strlen($hors_plage, "UTF-8");

	return ($test/$total);
}

/**
 * Narrow the set of possible languages according to the UTF-8 ranges used in
 * the text.
 *
 * The ranges are tried in order; the first one for which the share of
 * characters outside the range is below its threshold decides the result.
 *
 * @param string $texte
 * @return array|bool|string
 *   list of candidate language codes, a single language code when the range
 *   designates only one language, or false when no range matches
 */
function detecter_plages_utf($texte){
	// Each entry: character class, maximum share of characters allowed
	// outside of it, value returned when the text passes the test.
	$ranges = array(
		// Latin
		array(
			'[\x{0041}-\x{024F}\x{1E00}-\x{1EFF}]',
			0.5,
			array("fr", "en", "de", "it", "es", 'af', 'br', 'ca', 'ceb', "da", "fi", "nl", "nr", "pt", "pt_BR", "pt_PT", "sk", "ha", "haw", "hr", "pl", "cs", "az", "cy", "et", "ro", "eu", "hu", "id", "is", "la", "lt", "lv", "nb", "nso", "sl", "so", "sq", "ss", "st", "sv", "sw", "tl", "tlh", "tn", "tr", "ts", "ve", "xh", "zu"),
		),
		// Cyrillic
		array('[\x{0400}-\x{0523}]', 0.5, array("ru", "bg", "kk", "uk", "ky", "mn", "uz", "mk", "sr")),
		// Greek/Coptic
		array('[\x{0370}-\x{03ff}\x{1F00}-\x{1FFE}]', 0.5, "el"),
		// Armenian
		array('[\x{0530}-\x{058A}]', 0.5, "hy"),
		// Hebrew
		array('[\x{0590}-\x{05F4}]', 0.5, "he"),
		// Arabic - Farsi - Pashto - Urdu
		array('[\x{0600}-\x{077F}]', 0.5, array("ar", "fa", "ps", "ur")),
		// Japanese (hiragana / katakana)
		array('[\x{3040}-\x{30FF}]', 0.7, "ja"),
		// Chinese. The class used to contain an en dash instead of "-", which
		// matched a literal en dash rather than forming a range.
		array('[\x{4E00}-\x{9FBF}]', 0.7, "zh"),
		// Thai
		array('[\x{0E00}-\x{0E5B}]', 0.5, "th"),
		// Hangul - Korean
		array('[\x{1100}-\x{11F8}\x{3130}-\x{318E}\x{AC00}-\x{D7A3}]', 0.5, "ko"),
	);

	foreach ($ranges as $range){
		list($class, $threshold, $languages) = $range;
		if (tester_plage_utf($texte, $class)<$threshold){
			return $languages;
		}
	}

	return false;
}

/**
 * Detect the language of a text.
 *
 * First pass: detect which alphabet the text is written in, which limits the
 * number of possible answers (for example, "Arabic" characters can only be
 * Arabic, Farsi, Pashto or Urdu).
 * Second pass: run a classic n-gram comparison, restricted to the languages
 * left by the first pass.
 *
 * @param string $texte
 * @return bool|string
 *   language code, or false when the language cannot be determined
 */
function _detecter_langue($texte){
	$texte = strip_tags($texte);
	$texte = str_replace("’", "'", $texte);
	$texte = str_replace("\"", " ", $texte);

	// If the mb_ functions are not available
	// or if the text is too short, the language cannot be detected
	if (
		!function_exists('mb_strlen')
		OR !function_exists('mb_substr')
		OR mb_strlen($texte, "utf-8")<6
	){
		return false;
	}

	$possibles = detecter_plages_utf($texte);
	if (!$possibles){
		return false;
	}
	// The alphabet alone designates a single language
	if (!is_array($possibles)){
		return $possibles;
	}

	// Keep only the candidates for which an n-gram table is actually loaded
	$ngrams = array();
	foreach ($possibles as $lang){
		if (isset($GLOBALS["ngrams"][$lang]) AND is_array($GLOBALS["ngrams"][$lang])){
			$ngrams[$lang] = $GLOBALS["ngrams"][$lang];
		}
	}

	$sub_ng = createNGrams($texte);
	$result_array = compareNGrams($sub_ng, $ngrams, 140000);

	// The result is sorted best first: return the first usable key.
	// (foreach replaces each(), which no longer exists in PHP 8.)
	if (is_array($result_array)){
		foreach ($result_array as $code => $delta){
			if ($code){
				return $code;
			}
		}
	}

	return false;
}

?>