SPIP 1.9.2c : /ecrire/inc/syndic.php source

[Sommaire] [Imprimer]
   1  <?php
   2  
   3  /***************************************************************************\
   4   *  SPIP, Systeme de publication pour l'internet                           *
   5   *                                                                         *
   6   *  Copyright (c) 2001-2007                                                *
   7   *  Arnaud Martin, Antoine Pitrou, Philippe Riviere, Emmanuel Saint-James  *
   8   *                                                                         *
   9   *  Ce programme est un logiciel libre distribue sous licence GNU/GPL.     *
  10   *  Pour plus de details voir le fichier COPYING.txt ou l'aide en ligne.   *
  11  \***************************************************************************/
  12  
  // Stop loading this file when _ECRIRE_INC_VERSION is not defined
  // (presumably it is set by SPIP's bootstrap -- to be confirmed).
  if (!defined("_ECRIRE_INC_VERSION")) {
      return;
  }
  14  
  /**
   * Syndicate at most one site from each of two queues.
   *
   * 1. The published site with the oldest date_syndic whose syndication is
   *    'sus' or 'off' and which has not been tried for more than
   *    _PERIODE_SYNDICATION_SUSPENDUE minutes; it is handed to syndic_a_jour()
   *    with failure status 'off'.
   * 2. The published site with the oldest date_syndic whose syndication is
   *    'oui' and which has not been tried for more than _PERIODE_SYNDICATION
   *    minutes; it is handed to syndic_a_jour() with failure status 'sus'.
   *
   * Both periods are expressed in minutes and may be overridden by defining
   * the constants before this function runs (e.g. in mes_options). Using a
   * period shorter than 20 minutes is strongly discouraged.
   *
   * @return mixed id_syndic of the last site processed, 0 when nothing was due
   * @see http://doc.spip.org/@executer_une_syndication
   */
  function executer_une_syndication() {
      // Only install the defaults when the constants are not already defined:
      // an unconditional define() raises a "constant already defined" warning
      // when the value was customised or when this function is called twice.
      if (!defined('_PERIODE_SYNDICATION'))
          define('_PERIODE_SYNDICATION', 2*60);
      if (!defined('_PERIODE_SYNDICATION_SUSPENDUE'))
          define('_PERIODE_SYNDICATION_SUSPENDUE', 24*60);

      // Each queue: SQL condition on syndication, minimum age in minutes,
      // status the site is left in if the attempt fails.
      $queues = array(
          array("syndication IN ('sus','off')", _PERIODE_SYNDICATION_SUSPENDUE, 'off'),
          array("syndication='oui'", _PERIODE_SYNDICATION, 'sus')
      );

      $id_syndic = 0;
      foreach ($queues as $queue) {
          list($condition, $periode, $statut_echec) = $queue;

          $where = $condition
              . " AND statut='publie'"
              . " AND date_syndic < DATE_SUB(NOW(), INTERVAL " . intval($periode) . " MINUTE)";
          $row = spip_fetch_array(spip_query("SELECT id_syndic FROM spip_syndic WHERE $where ORDER BY date_syndic LIMIT 1"));

          if ($row) {
              $id_syndic = $row["id_syndic"];
              syndic_a_jour($id_syndic, $statut_echec);
          }
      }

      return $id_syndic;
  }
  52  
  53  
  /**
   * Build a microformat link <a href="url" rel="tag">mot</a> for one keyword
   * taken from a <dc:subject> (or similar) feed element.
   *
   * @param string $mot  keyword; surrounding whitespace is removed
   * @param string $type value for the rel attribute; when empty, rel="tag" is kept
   * @param string $url  target for the href attribute; when empty, no href is added
   * @return string the link markup, or '' when the keyword is empty
   * @see http://doc.spip.org/@creer_tag
   */
  function creer_tag($mot,$type,$url) {
      $mot = trim($mot);
      if ($mot === '') {
          return '';
      }

      $lien = "<a rel=\"tag\">$mot</a>";
      if ($url) {
          $lien = inserer_attribut($lien, 'href', $url);
      }
      if ($type) {
          $lien = inserer_attribut($lien, 'rel', $type);
      }
      return $lien;
  }
  66  
  /**
   * Turn the keyword elements matched in a feed item into microformat links.
   *
   * @param array  $matches PREG_SET_ORDER matches; for each one, index 0 is the
   *                        whole element and index 3 its local name
   *                        (subject, category, directory, ...)
   * @param string $item    raw XML of the item, searched for del.icio.us style
   *                        <rdf ... resource="..."> URLs
   * @return array list of <a rel="..."> strings built by creer_tag()
   * @see http://doc.spip.org/@ajouter_tags
   */
  function ajouter_tags($matches, $item) {
      include_spip('inc/filtres');
      $tags = array();
      foreach ($matches as $match) {
          // The caller's pattern is case-insensitive, so normalise the element
          // name before comparing it.
          $element = strtolower($match[3]);
          $type = ($element == 'category' OR $element == 'directory')
              ? 'directory' : 'tag';

          $mot = supprimer_tags($match[0]);
          // An empty element carries no keyword: skip it, but keep processing
          // the elements that follow (a `break` here would drop them all).
          if (!strlen($mot)) continue;

          // Look for a URL in the attributes usually used for it
          $url = extraire_attribut($match[0], 'domain');
          if (!$url) $url = extraire_attribut($match[0], 'resource');
          if (!$url) $url = extraire_attribut($match[0], 'url');

          if (!$url) {
              ## special cases
              if (extraire_attribut($match[0], 'scheme') == 'urn:flickr:tags') {
                  // flickr: space-separated keywords, each with its own tag page
                  foreach (explode(' ', $mot) as $petit)
                      if ($t = creer_tag($petit, $type,
                      'http://www.flickr.com/photos/tags/'.rawurlencode($petit).'/'))
                          $tags[] = $t;
                  $mot = '';
              } else {
                  # del.icio.us style: the URL of each keyword is given by an
                  # <rdf ... resource=".../keyword"> element of the item
                  foreach (explode(' ', $mot) as $petit)
                      if (preg_match(',<rdf[^>]* resource=["\']([^>]*/'
                      .preg_quote(rawurlencode($petit),',').')["\'],i',
                      $item, $m)) {
                          $mot = '';
                          if ($t = creer_tag($petit, $type, $m[1]))
                              $tags[] = $t;
                      }
              }
          }

          // General case (creer_tag() returns '' when $mot was emptied above)
          if ($t = creer_tag($mot, $type, $url))
              $tags[] = $t;
      }
      return $tags;
  }
 106  
 107  
 /**
  * Put the content of the escaped CDATA blocks back into every entry of a table.
  *
  * Each entry first goes through filtrer_entites(); then every placeholder
  * "@@@SPIP_CDATA<n>@@@" is replaced, in order, by $echappe_cdata[<n>].
  *
  * @param array $table         values to restore, modified in place
  * @param array $echappe_cdata CDATA contents, keyed by placeholder number
  * @return void
  * @see http://doc.spip.org/@cdata_echappe_retour
  */
 function cdata_echappe_retour(&$table, &$echappe_cdata) {
     // Build the placeholder / content lists once, in the same order
     $marqueurs = array();
     $contenus = array();
     foreach ($echappe_cdata as $n => $e) {
         $marqueurs[] = "@@@SPIP_CDATA$n@@@";
         $contenus[] = $e;
     }

     foreach (array_keys($table) as $var) {
         $valeur = filtrer_entites($table[$var]);
         // array-form str_replace applies the pairs one after the other
         if ($marqueurs)
             $valeur = str_replace($marqueurs, $contenus, $valeur);
         $table[$var] = $valeur;
     }
 }
 118  
 119  
 /**
  * Collect the distinct author names found in an XML fragment.
  *
  * Looks at <author> and <creator> elements; when one contains a <name>
  * child, that child's content is used as the name.
  *
  * @param string $xml feed header or item
  * @return mixed comma-separated names, or false when no such element exists
  */
 function _analyser_backend_auteurs($xml) {
     if (!preg_match_all(',<(author|creator)>(.*)</\1>,Uims',
     $xml, $regs, PREG_SET_ORDER))
         return false;

     $auteurs = array();
     foreach ($regs as $reg) {
         $nom = $reg[2];
         if (preg_match(',<name>(.*)</name>,Uims', $nom, $r))
             $nom = $r[1];
         $auteurs[] = trim(textebrut(filtrer_entites($nom)));
     }
     return join(', ', array_unique($auteurs));
 }

 /**
  * Parse a syndication feed and return its items.
  *
  * Each item is an associative array with the keys url, titre, date,
  * lesauteurs, lang, item (raw XML of the item) and tags (list of <a> links),
  * plus, when present in the feed: lastbuilddate, descriptif, content, source,
  * url_source and enclosures.
  *
  * @param string $rss        feed content
  * @param string $url_syndic URL of the feed, used to make item URLs absolute
  * @return mixed array of items, or an error message (string) when the feed
  *               contains no <item>/<entry>
  * @see http://doc.spip.org/@analyser_backend
  */
 function analyser_backend($rss, $url_syndic='') {
     include_spip('inc/texte'); # for couper()

     $rss = pipeline('pre_syndication', $rss);

     // Escape the CDATA sections: each one is replaced by a placeholder and
     // put back by cdata_echappe_retour() once the item has been analysed
     $echappe_cdata = array();
     if (preg_match_all(',<!\[CDATA\[(.*)]]>,Uims', $rss,
     $regs, PREG_SET_ORDER)) {
         foreach ($regs as $n => $reg) {
             $echappe_cdata[$n] = $reg[1];
             $rss = str_replace($reg[0], "@@@SPIP_CDATA$n@@@", $rss);
         }
     }

     // remove the comments
     $rss = preg_replace(',<!--\s+.*\s-->,Ums', '', $rss);

     // simplify the feed by removing the "dc:" namespace prefix
     $rss = preg_replace(',<(/?)(dc):,i', '<\1', $rss);

     // look for author/lang at feed level, in case the items have none
     list($header) = preg_split(',<(item|entry)[:[:space:]>],', $rss, 2);

     $les_auteurs_du_site = _analyser_backend_auteurs($header);
     if ($les_auteurs_du_site === false)
         $les_auteurs_du_site = '';

     // always defined, so that items without a language can fall back on it
     $langue_du_site = '';
     if (preg_match(',<([^>]*xml:)?lang(uage)?'.'>([^<>]+)<,i',
     $header, $match))
         $langue_du_site = $match[3];

     $items = array();
     if (preg_match_all(',<(item|entry)([:[:space:]][^>]*)?'.
     '>(.*)</\1>,Uims',$rss,$r, PREG_PATTERN_ORDER))
         $items = $r[0];

     //
     // Analyse each <item>...</item> of the feed and turn it into an array
     //

     if (!count($items)) return _T('avis_echec_syndication_01');

     $articles = array();
     foreach ($items as $item) {
         $data = array();

         // URL (semi-mandatory, used as key)

         // guid is a URL only when flagged <guid ispermalink="true">;
         // beware, the default value is 'true', hence the gymnastics
         if (preg_match(',<guid.*>[[:space:]]*(https?:[^<]*)</guid>,Uims',
         $item, $regs) AND preg_match(',^(true|1)?$,i',
         (string) extraire_attribut($regs[0], 'ispermalink')))
             $data['url'] = $regs[1];

         // <link>, more classical
         else if (preg_match(
         ',<link[^>]*[[:space:]]rel=["\']?alternate[^>]*>(.*)</link>,Uims',
         $item, $regs))
             $data['url'] = $regs[1];
         else if (preg_match(',<link[^>]*[[:space:]]rel=.alternate[^>]*>,Uims',
         $item, $regs))
             $data['url'] = extraire_attribut($regs[0], 'href');
         else if (preg_match(',<link[^>]*>(.*)</link>,Uims', $item, $regs))
             $data['url'] = $regs[1];
         else if (preg_match(',<link[^>]*>,Uims', $item, $regs))
             $data['url'] = extraire_attribut($regs[0], 'href');

         // No link nor guid, but an enclosure
         else if (preg_match(',<enclosure[^>]*>,ims', $item, $regs)
         AND $url = extraire_attribut($regs[0], 'url'))
             $data['url'] = $url;

         // no url at all
         else
             $data['url'] = '';

         // Title (semi-mandatory); starts empty so the default below applies
         // when neither a <title> nor a <link title="..."> is found
         $data['titre'] = '';
         if (preg_match(",<title[^>]*>(.*?)</title>,ims",$item,$match))
             $data['titre'] = $match[1];
         else if (preg_match(',<link[[:space:]][^>]*>,Uims',$item,$mat)
         AND $title = extraire_attribut($mat[0], 'title'))
             $data['titre'] = $title;
         if (!strlen($data['titre'] = trim($data['titre'])))
             $data['titre'] = _T('ecrire:info_sans_titre');

         // Date: first element that yields a readable date wins
         $la_date = '';
         if (preg_match(',<(published|modified|issued)>([^<]*)<,Uims',
         $item,$match))
             $la_date = my_strtotime(trim($match[2]));
         if (!$la_date AND
         preg_match(',<(pubdate)>([^<]*)<,Uims',$item, $match))
             $la_date = my_strtotime(trim($match[2]));
         if (!$la_date AND
         preg_match(',<([a-z]+:date)>([^<]*)<,Uims',$item,$match))
             $la_date = my_strtotime(trim($match[2]));
         if (!$la_date AND
         preg_match(',<date>([^<]*)<,Uims',$item,$match))
             $la_date = my_strtotime(trim($match[1]));

         // No readable date: use the current time, so that the value is
         // always a usable timestamp for the SQL built from it later
         if (!$la_date)
             $la_date = time();

         // date validity check, so that an erroneous feed does not always
         // come first (note: this could be set site by site, but it would
         // probably be more cumbersome than useful)
         if (!empty($GLOBALS['controler_dates_rss'])) {
             if ($la_date < time() - 365 * 24 * 3600
             OR $la_date > time() + 48 * 3600)
                 $la_date = time();
         }

         $data['date'] = $la_date;

         // Honour <lastbuilddate> by forcing the date
         if (preg_match(',<(lastbuilddate|updated|modified)>([^<>]+)</\1>,i',
         $item, $regs)
         AND $lastbuilddate = my_strtotime(trim($regs[2]))
         // not in the future
         AND $lastbuilddate < time())
             $data['lastbuilddate'] = $lastbuilddate;

         // Author(s): those of the item, else those of the feed
         $auteurs = _analyser_backend_auteurs($item);
         $data['lesauteurs'] = ($auteurs === false)
             ? $les_auteurs_du_site : $auteurs;

         // Description
         if (preg_match(',<((description|summary)([:[:space:]][^>]*)?)'
         .'>(.*)</\2[:>[:space:]],Uims',$item,$match)) {
             $data['descriptif'] = trim($match[4]);
         }
         if (preg_match(',<((content)([:[:space:]][^>]*)?)'
         .'>(.*)</\2[:>[:space:]],Uims',$item,$match)) {
             $data['content'] = trim($match[4]);
         }

         // lang
         if (preg_match(',<([^>]*xml:)?lang(uage)?'.'>([^<>]+)<,i',
             $item, $match))
             $data['lang'] = trim($match[3]);
         else
             $data['lang'] = trim($langue_du_site);

         // source and url_source
         # <source url="http://www.truc.net/music/uatsap.mp3" length="19917" />
         # <source url="http://www.truc.net/rss">Site source</source>
         if (preg_match(',(<source[^>]*>)(([^<>]+)</source>)?,i',
         $item, $match)) {
             // group 3 is absent for the self-closing form
             $data['source'] = isset($match[3]) ? trim($match[3]) : '';
             $data['url_source'] = str_replace('&amp;', '&',
                 trim((string) extraire_attribut($match[1], 'url')));
         }

         // tags
         # from "<dc:subject>" (del.icio.us)
         # or <media:category> (flickr)
         # or <itunes:category> (apple)
         # build microformat tags <a rel="directory" href="url">title</a>
         # http://microformats.org/wiki/rel-directory
         $tags = array();
         if (preg_match_all(
         ',<(([a-z]+:)?(subject|category|directory|keywords?|tags?|type))[^>]*>'
         .'(.*?)</\1>,ims',
         $item, $matches, PREG_SET_ORDER))
             $tags = ajouter_tags($matches, $item); # array()

         // Attachments: when there is no relEnclosure microformat, look for
         // RSS <enclosure> elements and convert them to the microformat
         if (!afficher_enclosures(join(', ', $tags)))
             if (preg_match_all(',<enclosure[[:space:]][^<>]+>,i',
             $item, $matches, PREG_PATTERN_ORDER))
                 $data['enclosures'] = join(', ',
                     array_map('enclosure2microformat', $matches[0]));
         $data['item'] = $item;

         // Clean the data and put the CDATA contents back
         cdata_echappe_retour($data, $echappe_cdata);
         cdata_echappe_retour($tags, $echappe_cdata);

         // make the url absolute
         $data['url'] = url_absolue(filtrer_entites($data['url']), $url_syndic);

         // Find the microformats (overrides <category> and <dc:subject>)
         if (preg_match_all(
         ',<a[[:space:]]([^>]+[[:space:]])?rel=[^>]+>.*</a>,Uims',
         $data['item'], $regs, PREG_PATTERN_ORDER)) {
             $tags = $regs[0];
         }
         // Special case: Connotea tags written as <a class="postedtag">
         if (preg_match_all(
         ',<a[[:space:]][^>]+ class="postedtag"[^>]*>.*</a>,Uims',
         $data['item'], $regs, PREG_PATTERN_ORDER))
             $tags = preg_replace(', class="postedtag",i',
             ' rel="tag"', $regs[0]);

         $data['tags'] = $tags;

         $articles[] = $data;
     }

     return $articles;
 }
 345  
 /**
  * Insert or refresh one syndicated article.
  *
  * The row is identified by (id_syndic, url). It is created when missing,
  * then its title, authors, description, language, source and tags are
  * updated from $data, and the post_syndication pipeline is called.
  *
  * @param array  $data          item as produced by analyser_backend()
  * @param int    $now_id_syndic id of the syndicated site
  * @param string $statut        status given to a newly created row
  * @param string $url_site      not used by this function
  * @param string $url_syndic    feed URL, base for relative links in full-text mode
  * @param string $resume        'non' stores the full content; any other value
  *                              stores a plain-text summary cut at 300
  * @param string $documents     not used by this function
  * @return bool true when the article is new, false otherwise
  * @see http://doc.spip.org/@inserer_article_syndique
  */
 function inserer_article_syndique ($data, $now_id_syndic, $statut, $url_site, $url_syndic, $resume, $documents) {
     $ajout = false;

     // The id is interpolated into the queries: force it to an integer
     $id_syndic = intval($now_id_syndic);

     // Create the link when it is new - key=(id_syndic,url)
     $le_lien = substr($data['url'], 0,255);
     $n = spip_num_rows(spip_query("SELECT * FROM spip_syndic_articles WHERE url=" . _q($le_lien) . " AND id_syndic=$id_syndic"));
     if ($n == 0 and !spip_sql_error()) {
         // An empty or invalid date would produce "FROM_UNIXTIME()" and make
         // the INSERT fail: fall back on the current time
         $date = isset($data['date']) ? intval($data['date']) : 0;
         if ($date <= 0)
             $date = time();
         spip_query("INSERT INTO spip_syndic_articles (id_syndic, url, date, statut) VALUES ($id_syndic, " . _q($le_lien) . ", FROM_UNIXTIME($date), " . _q($statut) . ")");
         $ajout = true;
     }

     // Description, in summary mode or 'full text' mode:
     // data['descriptif'] has priority in summary mode,
     // data['content'] in "full syndication" mode
     $descriptif = isset($data['descriptif']) ? $data['descriptif'] : '';
     $content = isset($data['content']) ? $data['content'] : '';
     if ($resume != 'non') {
         // "summary" mode
         $desc = strlen($descriptif) ? $descriptif : $content;
         $desc = couper(trim(textebrut($desc)), 300);
     } else {
         // "full syndication" mode: pick the relevant content
         // and rewrite the relative links
         $desc = strlen($content) ? $content : $descriptif;
         $desc = liens_absolus($desc, $url_syndic);
     }

     // Update the date when a lastbuilddate is given
     $update_date = !empty($data['lastbuilddate'])
         ? "date = FROM_UNIXTIME(" . intval($data['lastbuilddate']) . ")," : '';

     // tags & enclosures (prepare spip_syndic_articles.tags)
     $tags = isset($data['enclosures']) ? $data['enclosures'] : '';
     # avoid duplicates (key = title+url) and go from an array to a string
     if (!empty($data['tags'])) {
         $vus = array();
         foreach ($data['tags'] as $tag) {
             $cle = supprimer_tags($tag).extraire_attribut($tag,'href');
             $vus[$cle] = $tag;
         }
         $tags .= ($tags ? ', ' : '') . join(', ', $vus);
     }

     $lang = isset($data['lang']) ? (string) $data['lang'] : '';
     $source = isset($data['source']) ? (string) $data['source'] : '';
     $url_source = isset($data['url_source']) ? (string) $data['url_source'] : '';

     // Update the content (title, authors, description, date?, source...)
     spip_query("UPDATE spip_syndic_articles SET"
         . " titre=" . _q($data['titre']) . ", "
         . $update_date
         . " lesauteurs=" . _q($data['lesauteurs']) . ","
         . " descriptif=" . _q($desc) . ","
         . " lang=" . _q(substr($lang, 0, 10)) . ","
         . " source=" . _q(substr($source, 0, 255)) . ","
         . " url_source=" . _q(substr($url_source, 0, 255)) . ","
         . " tags=" . _q($tags)
         . " WHERE id_syndic=$id_syndic AND url=" . _q($le_lien));

     // post_syndication entry point
     pipeline('post_syndication',
         array(
             $le_lien,
             $now_id_syndic,
             $data
         )
     );

     return $ajout;
 }
 407  
 /**
  * Refresh one syndicated site: fetch its feed and store the items.
  *
  * The site is first marked with $statut and the current date; it is set back
  * to 'oui' only when the feed could be fetched and analysed. Depending on the
  * site's 'miroir' and 'oubli' settings, links that left the feed are switched
  * off and/or deleted after about two months.
  *
  * @param int    $now_id_syndic id of the site in spip_syndic
  * @param string $statut        syndication status left on the site on failure
  * @return mixed error message (string) on failure, false on success, nothing
  *               when the site does not exist or is already being syndicated
  * @see http://doc.spip.org/@syndic_a_jour
  */
 function syndic_a_jour($now_id_syndic, $statut = 'off') {
     include_spip('inc/texte');

     // The id is interpolated into several queries: force it to an integer
     $id_syndic = intval($now_id_syndic);

     $result = spip_query("SELECT * FROM spip_syndic WHERE id_syndic=$id_syndic");

     if (!$row = spip_fetch_array($result))
         return;

     $url_syndic = $row['url_syndic'];
     $url_site = $row['url_site'];

     // 'dispo' = waiting for validation, 'publie' = online without validation
     $moderation = ($row['moderation'] == 'oui') ? 'dispo' : 'publie';

     // Critical section: allow only one simultaneous syndication for a
     // given site. The same name must be used to take and release the lock.
     $verrou = "syndication $url_syndic";
     if (!spip_get_lock($verrou))
         return;

     spip_query("UPDATE spip_syndic SET syndication=" . _q($statut) . ", date_syndic=NOW() WHERE id_syndic=$id_syndic");

     // Fetch the feed and analyse it
     include_spip('inc/distant');
     $rss = recuperer_page($url_syndic, true);
     if (!$rss)
         $articles = _T('avis_echec_syndication_02');
     else
         $articles = analyser_backend($rss, $url_syndic);

     // Store the items in the database
     if (is_array($articles)) {
         $urls = array();
         foreach ($articles as $data) {
             inserer_article_syndique ($data, $id_syndic, $moderation, $url_site, $url_syndic, $row['resume'], $row['documents']);
             $urls[] = $data['url'];
         }

         if (count($urls) > 0) {
             $liste_urls = join(",", array_map('_q', $urls));

             // automatic moderation of the links that left the feed
             if ($row['miroir'] == 'oui') {
                 spip_query("UPDATE spip_syndic_articles SET statut='off', maj=maj WHERE id_syndic=$id_syndic AND NOT (url IN (" . $liste_urls . "))");
             }

             // deletion, after two months, of the links that left the feed
             if ($row['oubli'] == 'oui') {
                 $time = time() - 61*24*3600; # two months
                 spip_query("DELETE FROM spip_syndic_articles WHERE id_syndic=$id_syndic AND UNIX_TIMESTAMP(maj) < $time AND UNIX_TIMESTAMP(date) < $time AND NOT (url IN (" . $liste_urls . "))");
             }
         }

         // Record that the syndication went well
         spip_query("UPDATE spip_syndic SET syndication='oui' WHERE id_syndic=$id_syndic");
     }

     // Do not forget to release the lock
     spip_release_lock($verrou);

     // Return the error if any
     if (!is_array($articles))
         return $articles;
     else
         return false; # all good
 }
 479  
 480  
 /**
  * Convert a feed date to a Unix timestamp.
  *
  * Handles the W3C date-time format (http://www.w3.org/TR/NOTE-datetime)
  * explicitly -- complete date and time with optional fraction and offset,
  * YYYY, and YYYY-MM -- and leaves anything else to strtotime().
  * A complete date without offset is read as GMT.
  *
  * @param string $la_date date as found in the feed
  * @return mixed timestamp (int), or false when the date cannot be read
  * @see http://doc.spip.org/@my_strtotime
  */
 function my_strtotime($la_date) {
     // feed values often carry surrounding whitespace or line breaks
     $la_date = trim((string) $la_date);

     // complete format; groups: 1 = date and time, 3 = fraction,
     // 4 = whole offset, 5 = sign, 6 = offset hours, 7 = offset minutes
     if (preg_match(
     ',^([0-9]+-[0-9]+-[0-9]+[T ][0-9]+:[0-9]+(:[0-9]+)?)(\.[0-9]+)?'
     .'(Z|([-+])([0-9][0-9]):([0-9]+))?$,',
     $la_date, $match)) {
         // offset in seconds east of GMT; stays 0 for "Z" or no offset
         $decalage = 0;
         if (isset($match[6]) AND strlen($match[6])) {
             $decalage = intval($match[6]) * 3600 + intval($match[7]) * 60;
             if ($match[5] == '-')
                 $decalage = -$decalage;
         }
         $s = strtotime(str_replace("T", " ", $match[1])." GMT");
         if ($s !== false)
             return $s - $decalage;
     }

     // YYYY
     if (preg_match(',^([0-9][0-9][0-9][0-9])$,', $la_date, $match))
         return strtotime($match[1]."-01-01");

     // YYYY-MM
     if (preg_match(',^([0-9][0-9][0-9][0-9]-[0-9][0-9])$,', $la_date, $match))
         return strtotime($match[1]."-01");

     // use strtotime as a last resort
     $s = strtotime($la_date);
     if ($s > 0)
         return $s;

     // error
     spip_log("Impossible de lire le format de date '$la_date'");
     return false;
 }
 512  
 513  
 /**
  * Periodic task: run one syndication, then, when both the 'activer_moteur'
  * and 'visiter_sites' metas are 'oui', one indexation of syndicated sites.
  *
  * @param mixed $t not used by this function
  * @return mixed result of executer_une_syndication() alone when indexation
  *               is off; otherwise a boolean, true only when both the
  *               syndication and the indexation returned a true value
  * @see http://doc.spip.org/@cron_syndic
  */
 function cron_syndic($t) {
     $syndication = executer_une_syndication();

     $indexer = ($GLOBALS['meta']['activer_moteur'] == 'oui')
         && ($GLOBALS['meta']["visiter_sites"] == 'oui');
     if (!$indexer) {
         return $syndication;
     }

     include_spip("inc/indexation");
     $indexation = executer_une_indexation_syndic();

     return $syndication && $indexation;
 }
 525  
 526  ?>
Code source de SPIP 1.9.2c

/ecrire/inc/ -> syndic.php (source)

Généré le : Wed Nov 21 10:20:27 2007	par Balluche grâce à PHPXref 0.7