www/ecrire/inc/feedfinder.php

   1 <?php
   2
    3 if (!defined('_ECRIRE_INC_VERSION')) return; // stop loading this file when the constant is missing (presumably defined by SPIP's bootstrap - confirm)
   4
   5 /**********************************
   6 adaptation en php de feedfinder.py :
   7
   8 """Ultra-liberal feed finder, de Mark Pilgrim
   9 <http://diveintomark.org/projects/feed_finder/>
  10
  11 Par: courcy.michael@wanadoo.fr
  12
   13 adaptation en php, je ne reprends qu'une partie de cet algorithme
  14
  15 0) A chaque etape on verifie si les feed indiques sont reellement des feeds
  16 1) Si l'uri passe est un feed on retourne le resultat tout simplement
  17 2) Si le header de la page contient des balises LINK qui renvoient vers des feed on les retourne
   18 3) on cherche les liens <a> qui se terminent par ".rss", ".rdf", ".xml", ou ".atom"
  19 4) on cherche les liens <a> contenant "rss", "rdf", "xml", ou "atom"
  20
  21 j'integre pas l'interrogation  avec xml_rpc de syndic8, mais on peut le faire assez facilement
   22 dans la phase de test sur differentes url je n'ai constate aucune difference entre les reponses
  23 donnees par feedfinder.py et les miennes donc je ne suis pas sur de voir l'interet
  24
  25 Je ne me preoccupe pas comme l'auteur de savoir si mes liens de feed sont sur le meme serveur ou pas
  26
  27 exemple d'utilisation
  28
  29 print_r (get_feed_from_url("http://willy.boerland.com/myblog/"));
  30
  31 on obtient
  32
  33 Array
  34 (
  35     [0] => http://willy.boerland.com/myblog/atom/feed
  36     [1] => http://willy.boerland.com/myblog/blogapi/rsd
  37     [2] => http://willy.boerland.com/myblog/rss.xml
  38     [3] => http://willy.boerland.com/myblog/node/feed
  39 )
  40
  41 *****************************************************************/
  42
   43 $verif_complete = 0; // set it to 1 to have every feed candidate validated with is_feed(); execution time
   44                      // is then longer (the flag is read through `global` in get_feed_from_url())
  45
  46 //une fonction qui permet de si un lien est un feed ou nom, si c'est un feed elle retourne son type
  47 //si c'est pas un feed elle retourne 0, cette verification est evidemment tres tres legere
  48 // http://doc.spip.org/@is_feed
  49 function is_feed($url){
  50
  51         # methode SPIP
  52         if (function_exists('recuperer_page')) {
  53                 $buffer = recuperer_page($url);
  54                 if (preg_match("/<(\w*) .*/", $buffer, $matches)){
  55                 //ici on detecte la premiere balise
  56                 $type_feed = $matches[1];
  57                 switch ($type_feed) {
  58                        case "rss": return "rss";
  59                        case "feed": return "atom";
  60                        case "rdf": return "rdf";
  61                 }
  62                 }
  63         return '';
  64         }
  65
  66       $fp = @fopen($url, "r");
  67       if (!$fp ) {
  68            return 0;
  69       }
  70       //verifion la nature de ce fichier
  71       while (!feof($fp)) {
  72            $buffer = fgets($fp, 4096);
  73            if (preg_match("/<(\w*) .*/", $buffer, $matches)){
  74                 //ici on detecte la premiere balise
  75                 $type_feed = $matches[1];
  76                 switch ($type_feed) {
  77                        case "rss": fclose($fp); return "rss";
  78                        case "feed": fclose($fp); return "atom";
  79                        case "rdf": fclose($fp); return "rdf";
  80                        default : fclose($fp); return 0;
  81                 }
  82            }
  83       }
  84 }
  85
  86 /*****************test is_feed******************************
   87 echo is_feed("http://spip-contrib.net/backend" . _EXTENSION_PHP) . "<br />"; //retourne rss
   88 echo is_feed("http://liberation.fr/rss.php") . "<br />"; //retourne rss
   89 echo is_feed("http://liberation.fr/rss.php") . "<br />"; //retourne rss
   90 echo is_feed("http://willy.boerland.com/myblog/atom/feed") . "<br />"; //retourne atom
   91 echo is_feed("http://spip.net/") . "<br />"; //retourne 0
   92 //pas trouve d'exemples avec rdf j'ai encore du mal a saisir ce que rdf apporte de plus que rss
   93 //mais bon j'ai pas approfondi
  94 ************************************************************/
  95
  96 //fonction sans finesse mais efficace
  97 //on parcourt ligne par ligne a la recherche de balise <a> ou <link>
  98 //si dans le corps de celle-ci on trouve les mots rss, xml, atom ou rdf
  99 //alors on recupere la valeur href='<url>', on adapte celle-ci si elle
 100 //est relative et on verifie que c'est bien un feed si oui on l'ajoute
 101 //au tableau des feed si on ne trouve rien ou si aucun feed est trouve on retourne
 102 //un tableau vide
 103 // http://doc.spip.org/@get_feed_from_url
 104 function get_feed_from_url($url, $buffer=false){
 105          global $verif_complete;
 106          //j'ai prevenu ce sera pas fin
 107          if (!preg_match("/^http:\/\/.*/", $url)) $url = "http://www." . $url;
 108          if (!$buffer) $buffer = @file_get_contents($url);
 109
 110          $feed_list = array();
 111          //extraction des <link>
 112          if (preg_match_all("/<link [^>]*>/i", $buffer, $matches)){
 113                     //y a t-y rss atom rdf ou xml dans ces balises
 114                     foreach($matches[0] as $link){
 115                       if (  strpos($link, "rss")
 116                          || strpos($link, "rdf")
 117                          || strpos($link, "atom")
 118                          || strpos($link, "xml") ){
 119                             //voila un candidat on va extraire sa partie href et la placer dans notre tableau
 120                             if (preg_match("/href=['|\"]?([^\s'\"]*)['|\"]?/",$link,$matches2)){
 121                                  //on aura pris soin de verifier si ce lien est relatif d'en faire un absolu
 122                                  if (!preg_match("/^http:\/\/.*/", $matches2[1])){
 123                                         $matches2[1] = concat_url($url,$matches2[1]);
 124                                  }
 125                                  if($verif_complete){
 126                                         if (is_feed($matches2[1])) $feed_list[] = $matches2[1];
 127                                  }else  $feed_list[] = $matches2[1];
 128                             }
 129                       }
 130                     }
 131                     //print_r($matches);
 132          }
 133          //extraction des <a>
 134          if (preg_match_all("/<a [^>]*>/i", $buffer, $matches)){
 135                     //y a t-y rss atom rdf ou xml dans ces balises
 136                     foreach($matches[0] as $link){
 137                        if (  strpos($link, "rss")
 138                          || strpos($link, "rdf")
 139                          || strpos($link, "atom")
 140                          || strpos($link, "xml") ){
 141                             //voila un candidat on va extraire sa partie href et la placer dans notre tableau
 142                             if (preg_match("/href=['|\"]?([^\s'\"]*)['|\"]?/",$link,$matches2)){
 143                                  //on aura pris soin de verifier si ce lien est relatif d'en faire un absolu
 144                                  if (!preg_match("/^http:\/\/.*/", $matches2[1])){
 145                                         $matches2[1] = concat_url($url,$matches2[1]);
 146                                  }
 147                                  if($verif_complete){
 148                                         if (is_feed($matches2[1])) $feed_list[] = $matches2[1];
 149                                  }else  $feed_list[] = $matches2[1];
 150                             }
 151                        }
 152                     }
 153          }
 154          return $feed_list;
 155 }
 156 /************************************ getFeed ****************************
 157 print_r (get_feed_from_url("spip-contrib.net"));
 158 print_r (get_feed_from_url("http://liberation.fr/"));
 159 print_r (get_feed_from_url("cnn.com"));
 160 print_r (get_feed_from_url("http://willy.boerland.com/myblog/"));
 161 *****************************    Resultat *****************************************
 162 Array
 163 (
 164     [0] => http://www.spip-contrib.net/backend.php
 165 )
 166 Array
 167 (
 168     [0] => http://www.liberation.fr/rss.php
 169 )
 170 Array
 171 (
 172     [0] => http://rss.cnn.com/rss/cnn_topstories.rss
 173     [1] => http://rss.cnn.com/rss/cnn_latest.rss
 174     [2] => http://www.cnn.com/services/rss/
 175     [3] => http://www.cnn.com/services/rss/
 176     [4] => http://www.cnn.com/services/rss/
 177 )
 178 Array
 179 (
 180     [0] => http://willy.boerland.com/myblog/atom/feed
 181     [1] => http://willy.boerland.com/myblog/blogapi/rsd
 182     [2] => http://willy.boerland.com/myblog/rss.xml
 183     [3] => http://willy.boerland.com/myblog/node/feed
 184 )
 185 ************************************************************************/
 186
 187 //petite fonction qui prend en charge les problemes de double slash
 188 //qunad on concatene les lien
 189 // http://doc.spip.org/@concat_url
 190 function concat_url($url1, $path){
 191         # methode spip
 192         if(function_exists('suivre_lien')) {
 193                 return suivre_lien($url1,$path);
 194         }
 195         $url = $url1 . "/" . $path;
 196         //cette operation peut tres facilement avoir genere // ou ///
 197         $url = str_replace("///", "/", $url);
 198         $url = str_replace("//", "/", $url);
 199         //cas particulier de http://
 200         $url = str_replace("http:/", "http://", $url);
 201         return $url;
 202 }
 203
 204 /****************************test concat**********************
 205 echo concat_url("http://spip.net" , "ecrire")."<br />";
 206 echo concat_url("http://spip.net/" , "ecrire")."<br />";
 207 echo concat_url("http://spip.net" , "/ecrire")."<br />";
 208 echo concat_url("http://spip.net/" , "/ecrire")."<br />";
 209 *************************************************************/
 210
 211
 212
 213
 214 ?>