www/plugins-dist/sites/inc/feedfinder.php

   1 <?php
   2 /**
   3  * adaptation en php de feedfinder.py :
   4  *
   5  * """Ultra-liberal feed finder, de Mark Pilgrim
   6  * <http://diveintomark.org/projects/feed_finder/>
   7  * Par: courcy.michael@wanadoo.fr
   8  *
   9  * adaptation en php, je ne reprends qu'une partie de cette algorithme
  10  *
  11  * 0) A chaque etape on verifie si les feed indiques sont reellement des feeds
  12  * 1) Si l'uri passe est un feed on retourne le resultat tout simplement
  13  * 2) Si le header de la page contient des balises LINK qui renvoient vers des feed on les retourne
  14  * 3) on cherche les liens <a> qui se termine par  ".rss", ".rdf", ".xml", ou ".atom"
  15  * 4) on cherche les liens <a> contenant "rss", "rdf", "xml", ou "atom"
  16  *
 * je n'integre pas l'interrogation avec xml_rpc de syndic8, mais on peut le faire assez facilement
 * dans la phase de test sur differentes url je n'ai constate aucune difference entre les reponses
 * donnees par feedfinder.py et les miennes donc je ne suis pas sur d'en voir l'interet
  20  *
  21  * Je ne me preoccupe pas comme l'auteur de savoir si mes liens de feed sont sur le meme serveur ou pas
  22  *
  23  * exemple d'utilisation
  24  *
  25  * print_r (get_feed_from_url("http://willy.boerland.com/myblog/"));
  26  *
  27  * on obtient
  28  *
  29  * Array
  30  * (
  31  *   [0] => http://willy.boerland.com/myblog/atom/feed
  32  *   [1] => http://willy.boerland.com/myblog/blogapi/rsd
  33  *   [2] => http://willy.boerland.com/myblog/rss.xml
  34  *   [3] => http://willy.boerland.com/myblog/node/feed
  35  * )
  36  */
  37 if (!defined('_ECRIRE_INC_VERSION')) return;
  38
  39 $verif_complete = 0; //mettez le a 1 si vous voulez controler la validite des feed trouves mais le temps d'execution
  40                      //est alors plus long
  41
  42 /**
  43  * une fonction qui permet de si un lien est un feed ou nom,
  44  * si c'est un feed elle retourne son type, si c'est pas un feed elle retourne 0,
  45  * cette verification est évidemment très très légère
  46  *
  47  * @param string $url
  48  *              URL à analyser
  49  * @return string|0
  50  *              Retourne son type (rss|atom|rdf) ou 0 si pas feed
  51  */
  52 function is_feed($url){
  53
  54         /**
  55          * méthode SPIP
  56          */
  57         if (function_exists('recuperer_page')) {
  58                 $buffer = recuperer_page($url);
  59                 if (preg_match("/<(\w*) .*/", $buffer, $matches)){
  60                         //ici on detecte la premiere balise
  61                         $type_feed = $matches[1];
  62                         switch ($type_feed) {
  63                                 case "rss": return "rss";
  64                                 case "feed": return "atom";
  65                                 case "rdf": return "rdf";
  66                         }
  67                 }
  68                 return '';
  69         }
  70
  71         $fp = @fopen($url, "r");
  72         if (!$fp )
  73                 return 0;
  74         //verifion la nature de ce fichier
  75         while (!feof($fp)) {
  76                 $buffer = fgets($fp, 4096);
  77                 if (preg_match("/<(\w*) .*/", $buffer, $matches)){
  78                         //ici on detecte la premiere balise
  79                         $type_feed = $matches[1];
  80                         switch ($type_feed) {
  81                                 case "rss": fclose($fp); return "rss";
  82                                 case "feed": fclose($fp); return "atom";
  83                                 case "rdf": fclose($fp); return "rdf";
  84                                 default : fclose($fp); return 0;
  85                         }
  86                 }
  87         }
  88 }
  89
/*****************test is_feed******************************
echo is_feed("http://contrib.spip.net/spip.php?page=backend") . "<br />"; //retourne rss
echo is_feed("http://liberation.fr/rss.php") . "<br />"; //retourne rss
echo is_feed("http://liberation.fr/rss.php") . "<br />"; //retourne rss
echo is_feed("http://willy.boerland.com/myblog/atom/feed") . "<br />"; //retourne atom
echo is_feed("http://spip.net/") . "<br />"; //retourne 0
************************************************************/
  97
  98 /**
  99  * fonction sans finesse mais efficace
 100  * on parcourt ligne par ligne a la recherche de balise <a> ou <link>
 101  * si dans le corps de celle-ci on trouve les mots rss, xml, atom ou rdf
 102  * alors on recupere la valeur href='<url>', on adapte celle-ci si elle
 103  * est relative et on verifie que c'est bien un feed si oui on l'ajoute
 104  * au tableau des feed si on ne trouve rien ou si aucun feed est trouve on retourne
 105  * un tableau vide
 106  *
 107  * @param string $url
 108  *              L'URL à analyser
 109  * @param $buffer
 110  * @return array $feed_list
 111  *              Le tableau des feed trouvés dans la page
 112  */
 113 function get_feed_from_url($url, $buffer=false){
 114         global $verif_complete;
 115         //j'ai prevenu ce sera pas fin
 116         if (!preg_match("/^http:\/\/.*/", $url)) $url = "http://www." . $url;
 117         if (!$buffer) $buffer = @file_get_contents($url);
 118
 119         $feed_list = array();
 120         //extraction des <link>
 121         if (preg_match_all("/<link [^>]*>/i", $buffer, $matches)){
 122                 //y a t-y rss atom rdf ou xml dans ces balises
 123                 foreach($matches[0] as $link){
 124                         if ((strpos($link, "rss")
 125                                 || strpos($link, "rdf")
 126                                 || strpos($link, "atom")
 127                                 || strpos($link, "xml"))
 128                                 && !strpos($link,'opensearch')){
 129                                 //voila un candidat on va extraire sa partie href et la placer dans notre tableau
 130                                 if (preg_match("/href=['|\"]?([^\s'\"]*)['|\"]?/",$link,$matches2)){
 131                                         //on aura pris soin de verifier si ce lien est relatif d'en faire un absolu
 132                                         if (!preg_match("/^http:\/\/.*/", $matches2[1])){
 133                                                 $matches2[1] = concat_url($url,$matches2[1]);
 134                                         }
 135                                         if($verif_complete){
 136                                                 if (is_feed($matches2[1])) $feed_list[] = $matches2[1];
 137                         }
 138                         else
 139                                 $feed_list[] = $matches2[1];
 140                                 }
 141                         }
 142                 }
 143         }
 144         //extraction des <a>
 145         if (preg_match_all("/<a [^>]*>/i", $buffer, $matches)){
 146                 //y a t-y rss atom rdf ou xml dans ces balises
 147                 foreach($matches[0] as $link){
 148                         if ((strpos($link, "rss")
 149                                 || strpos($link, "rdf")
 150                                 || strpos($link, "atom")
 151                                 || strpos($link, "xml"))
 152                                 && !strpos($link,'opensearch')){
 153                                 //voila un candidat on va extraire sa partie href et la placer dans notre tableau
 154                                 if (preg_match("/href=['|\"]?([^\s'\"]*)['|\"]?/",$link,$matches2)){
 155                                         //on aura pris soin de verifier si ce lien est relatif d'en faire un absolu
 156                                         if (!preg_match("/^http:\/\/.*/", $matches2[1])){
 157                                                 $matches2[1] = concat_url($url,$matches2[1]);
 158                                         }
 159                                         if($verif_complete){
 160                                                 if (is_feed($matches2[1])) $feed_list[] = $matches2[1];
 161                                         }
 162                                         else
 163                                                 $feed_list[] = $matches2[1];
 164                                 }
 165                         }
 166                 }
 167         }
 168         return $feed_list;
 169 }
 170 /************************************ getFeed ****************************
 171 print_r (get_feed_from_url("contrib.spip.net"));
 172 print_r (get_feed_from_url("http://liberation.fr/"));
 173 print_r (get_feed_from_url("cnn.com"));
 174 print_r (get_feed_from_url("http://willy.boerland.com/myblog/"));
 175 *****************************    Resultat *****************************************
 176 Array
 177 (
 178     [0] => http://www.spip-contrib.net/backend.php
 179 )
 180 Array
 181 (
 182     [0] => http://www.liberation.fr/rss.php
 183 )
 184 Array
 185 (
 186     [0] => http://rss.cnn.com/rss/cnn_topstories.rss
 187     [1] => http://rss.cnn.com/rss/cnn_latest.rss
 188     [2] => http://www.cnn.com/services/rss/
 189     [3] => http://www.cnn.com/services/rss/
 190     [4] => http://www.cnn.com/services/rss/
 191 )
 192 Array
 193 (
 194     [0] => http://willy.boerland.com/myblog/atom/feed
 195     [1] => http://willy.boerland.com/myblog/blogapi/rsd
 196     [2] => http://willy.boerland.com/myblog/rss.xml
 197     [3] => http://willy.boerland.com/myblog/node/feed
 198 )
 199 ************************************************************************/
 200
 201 /**
 202  * petite fonction qui prend en charge les problèmes de double slash
 203  * quand on concatène les liens
 204  */
 205 function concat_url($url1, $path){
 206         /**
 207          * méthode spip
 208          */
 209         if(function_exists('suivre_lien')) {
 210                 return suivre_lien($url1,$path);
 211         }
 212         $url = $url1 . "/" . $path;
 213         //cette operation peut tres facilement avoir genere // ou ///
 214         $url = str_replace("///", "/", $url);
 215         $url = str_replace("//", "/", $url);
 216         //cas particulier de http://
 217         $url = str_replace("http:/", "http://", $url);
 218         return $url;
 219 }
 220
 221 /****************************test concat**********************
 222 echo concat_url("http://spip.net" , "ecrire")."<br />";
 223 echo concat_url("http://spip.net/" , "ecrire")."<br />";
 224 echo concat_url("http://spip.net" , "/ecrire")."<br />";
 225 echo concat_url("http://spip.net/" , "/ecrire")."<br />";
 226 *************************************************************/
 227 ?>