dolibarr  7.0.0-beta
rssparser.class.php
Go to the documentation of this file.
1 <?php
2 /* Copyright (C) 2011-2012 Laurent Destailleur <eldy@users.sourceforge.net>
3  *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17 
/**
 * Class to parse RSS files.
 *
 * Loads an RSS or Atom feed (from a cache file or from its URL) and exposes the feed metadata
 * and its records through getters. Depending on the EXTERNALRSS_USE_SIMPLEXML setting the feed
 * is decoded with SimpleXML or with the event based xml_parser extension. In the second case
 * this object is its own handler (feed_start_element, feed_cdata, feed_end_element) and the
 * decoded data is accumulated in the public state properties declared below.
 */
class RssParser
{
    /** @var mixed Database handler received by the constructor */
    public $db;
    /** @var string Error code set when parser() fails */
    public $error;

    private $_format='';
    private $_urlRSS;
    private $_language;
    private $_generator;
    private $_copyright;
    private $_lastbuilddate;
    private $_imageurl;
    private $_link;
    private $_title;
    private $_description;
    private $_lastfetchdate;    // Last successful fetch
    private $_rssarray=array();

    // For parsing with xmlparser
    public $stack = array();    // parser stack
    public $_CONTENT_CONSTRUCTS = array('content', 'summary', 'info', 'title', 'tagline', 'copyright');

    // State of the xmlparser handlers. They are declared (instead of being created on the fly by
    // the handlers) so that reading them before the first write does not raise a notice.
    /** @var string|false Namespace prefix of the element being read, false when none */
    public $current_namespace = false;
    /** @var string Value of the "version" attribute of the root element ('1.0' for rdf) */
    public $feed_version = '';
    /** @var bool True while inside channel (rss) or feed (atom) */
    public $inchannel = false;
    /** @var bool True while inside an item (rss) or entry (atom) */
    public $initem = false;
    /** @var bool True while inside the rss textinput element */
    public $intextinput = false;
    /** @var bool True while inside the rss image element */
    public $inimage = false;
    /** @var string|false Name of the atom content construct being read, false when none */
    public $incontent = false;
    /** @var array Fields of the item/entry being read */
    public $current_item = array();
    /** @var array List of items/entries already read */
    public $items = array();
    /** @var array Fields read at channel/feed level */
    public $channel = array();
    /** @var array Fields read inside the rss textinput element */
    public $textinput = array();
    /** @var array Fields read inside the rss image element */
    public $image = array();


    /**
     * Constructor
     *
     * @param   mixed   $db     Database handler
     */
    public function __construct($db)
    {
        $this->db=$db;
    }

    /**
     * getFormat
     *
     * @return string   Format of the last parsed feed ('rss' or 'atom'), '' if nothing parsed
     */
    public function getFormat()
    {
        return $this->_format;
    }

    /**
     * getUrlRss
     *
     * @return string   Url given to parser()
     */
    public function getUrlRss()
    {
        return $this->_urlRSS;
    }

    /**
     * getLanguage
     *
     * @return string   Language declared by the feed
     */
    public function getLanguage()
    {
        return $this->_language;
    }

    /**
     * getGenerator
     *
     * @return string   Generator declared by the feed
     */
    public function getGenerator()
    {
        return $this->_generator;
    }

    /**
     * getCopyright
     *
     * @return string   Copyright declared by the feed
     */
    public function getCopyright()
    {
        return $this->_copyright;
    }

    /**
     * getLastBuildDate
     *
     * @return string   Last build date declared by the feed (raw string, not converted)
     */
    public function getLastBuildDate()
    {
        return $this->_lastbuilddate;
    }

    /**
     * getImageUrl
     *
     * @return string   Url of the feed image
     */
    public function getImageUrl()
    {
        return $this->_imageurl;
    }

    /**
     * getLink
     *
     * @return string   Link declared by the feed
     */
    public function getLink()
    {
        return $this->_link;
    }

    /**
     * getTitle
     *
     * @return string   Title of the feed
     */
    public function getTitle()
    {
        return $this->_title;
    }

    /**
     * getDescription
     *
     * @return string   Description of the feed
     */
    public function getDescription()
    {
        return $this->_description;
    }

    /**
     * getLastFetchDate
     *
     * @return mixed    Date of the cache file used, or date the feed was fetched and saved into cache
     */
    public function getLastFetchDate()
    {
        return $this->_lastfetchdate;
    }

    /**
     * getItems
     *
     * @return array    Records of the feed. Each record is an array with keys
     *                  link, title, description, pubDate, category, id, author
     */
    public function getItems()
    {
        return $this->_rssarray;
    }


    /**
     * Parse rss URL
     *
     * @param   string  $urlRSS         Url of the feed
     * @param   int     $maxNb          Max number of records to keep (0 = no limit)
     * @param   int     $cachedelay     A cache file is reused when its date is not older than now minus this
     *                                  value (0 = never use cache; $cachedir must also be defined)
     * @param   string  $cachedir       Directory of cache files ('' = no cache read nor written)
     * @return  int                     -1 if KO (error code into $this->error), 1 if OK
     */
    public function parser($urlRSS, $maxNb=0, $cachedelay=60, $cachedir='')
    {
        global $conf;

        include_once DOL_DOCUMENT_ROOT.'/core/lib/files.lib.php';

        $usesimplexml = ! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML);

        $rss='';
        $str='';    // This will contain content of feed

        // Check parameters
        if (! dol_is_url($urlRSS))
        {
            $this->error="ErrorBadUrl";
            return -1;
        }

        $this->_urlRSS = $urlRSS;
        $newpathofdestfile=$cachedir.'/'.dol_hash($this->_urlRSS,3);   // Force md5 hash (does not contains special chars)
        $newmask='0644';

        //dol_syslog("RssPArser::parser parse url=".$urlRSS." => cache file=".$newpathofdestfile);
        $nowgmt = dol_now();

        // Search into cache
        $foundintocache=0;
        if ($cachedelay > 0 && $cachedir)
        {
            $filedate=dol_filemtime($newpathofdestfile);
            if ($filedate >= ($nowgmt - $cachedelay))
            {
                //dol_syslog("RssParser::parser cache file ".$newpathofdestfile." is not older than now - cachedelay (".$nowgmt." - ".$cachedelay.") so we use it.");
                $foundintocache=1;

                $this->_lastfetchdate=$filedate;
            }
            else
            {
                dol_syslog(get_class($this)."::parser cache file ".$newpathofdestfile." is not found or older than now - cachedelay (".$nowgmt." - ".$cachedelay.") so we can't use it.");
            }
        }

        // Load file into $str
        if ($foundintocache)    // Cache file found and is not too old
        {
            $str = file_get_contents($newpathofdestfile);
        }
        else
        {
            try {
                ini_set("user_agent","Dolibarr ERP-CRM RSS reader");
                // Only override the timeouts when a value is configured (an empty value must not replace PHP settings)
                if (! empty($conf->global->MAIN_USE_RESPONSE_TIMEOUT))
                {
                    ini_set("max_execution_time", $conf->global->MAIN_USE_RESPONSE_TIMEOUT);
                    ini_set("default_socket_timeout", $conf->global->MAIN_USE_RESPONSE_TIMEOUT);
                }

                $opts = array('http'=>array('method'=>"GET"));
                if (! empty($conf->global->MAIN_USE_CONNECT_TIMEOUT)) $opts['http']['timeout']=$conf->global->MAIN_USE_CONNECT_TIMEOUT;
                if (! empty($conf->global->MAIN_PROXY_USE)) $opts['http']['proxy']='tcp://'.$conf->global->MAIN_PROXY_HOST.':'.$conf->global->MAIN_PROXY_PORT;
                //var_dump($opts);exit;
                $context = stream_context_create($opts);

                $str = file_get_contents($this->_urlRSS, false, $context);
            }
            catch (Exception $e) {
                print 'Error retrieving URL '.$this->_urlRSS.' - '.$e->getMessage();
            }
        }

        if ($str !== false)
        {
            // Convert $str into xml
            if ($usesimplexml)
            {
                //print 'xx'.LIBXML_NOCDATA;
                libxml_use_internal_errors(false);
                $rss = simplexml_load_string($str, "SimpleXMLElement", LIBXML_NOCDATA);
            }
            else
            {
                $xmlparser=xml_parser_create('');
                // xml_parser_create returns a resource before PHP 8 and an XMLParser object since PHP 8
                if (! is_resource($xmlparser) && ! is_object($xmlparser)) {
                    $this->error="ErrorFailedToCreateParser"; return -1;
                }

                xml_set_object($xmlparser, $this);
                xml_set_element_handler($xmlparser, 'feed_start_element', 'feed_end_element');
                xml_set_character_data_handler($xmlparser, 'feed_cdata');
                xml_parse($xmlparser, $str);    // Result not tested: what was read before a syntax error is kept
                xml_parser_free($xmlparser);
                $rss=$this;
                //var_dump($rss->_format);exit;
            }
        }

        // If $rss loaded
        if ($rss)
        {
            // Save file into cache
            if (empty($foundintocache) && $cachedir)
            {
                dol_syslog(get_class($this)."::parser cache file ".$newpathofdestfile." is saved onto disk.");
                if (! dol_is_dir($cachedir)) dol_mkdir($cachedir);
                $fp = fopen($newpathofdestfile, 'w');
                if ($fp)
                {
                    fwrite($fp, $str);
                    fclose($fp);
                    if (! empty($conf->global->MAIN_UMASK)) $newmask=$conf->global->MAIN_UMASK;
                    @chmod($newpathofdestfile, octdec($newmask));

                    $this->_lastfetchdate=$nowgmt;
                }
                else
                {
                    print 'Error, failed to open file '.$newpathofdestfile.' for write';
                }
            }

            unset($str);    // Free memory

            // Format is detected by the xmlparser handlers from the root element. With simplexml, or if
            // not detected, a feed with a channel is processed as rss and any other feed as atom.
            $format = ($usesimplexml ? '' : $this->_format);
            if (empty($format))
            {
                $format = (empty($rss->channel) ? 'atom' : 'rss');
            }
            $this->_format = $format;

            $items=array();

            // Save description entries
            if ($format == 'rss')
            {
                //var_dump($rss);
                if ($usesimplexml)
                {
                    if (!empty($rss->channel->language))      $this->_language = (string) $rss->channel->language;
                    if (!empty($rss->channel->generator))     $this->_generator = (string) $rss->channel->generator;
                    if (!empty($rss->channel->copyright))     $this->_copyright = (string) $rss->channel->copyright;
                    // SimpleXML is case sensitive: the rss element is lastBuildDate
                    if (!empty($rss->channel->lastBuildDate)) $this->_lastbuilddate = (string) $rss->channel->lastBuildDate;
                    if (!empty($rss->channel->image->url[0])) $this->_imageurl = (string) $rss->channel->image->url[0];
                    if (!empty($rss->channel->link))          $this->_link = (string) $rss->channel->link;
                    if (!empty($rss->channel->title))         $this->_title = (string) $rss->channel->title;
                    if (!empty($rss->channel->description))   $this->_description = (string) $rss->channel->description;

                    $items=$rss->channel->item;     // With simplexml (iterable list of item elements)
                }
                else
                {
                    //var_dump($rss->channel);
                    // Keys are in lower case because handlers convert element names with strtolower
                    if (!empty($rss->channel['language']))      $this->_language = (string) $rss->channel['language'];
                    if (!empty($rss->channel['generator']))     $this->_generator = (string) $rss->channel['generator'];
                    if (!empty($rss->channel['copyright']))     $this->_copyright = (string) $rss->channel['copyright'];
                    if (!empty($rss->channel['lastbuilddate'])) $this->_lastbuilddate = (string) $rss->channel['lastbuilddate'];
                    if (!empty($rss->image['url']))             $this->_imageurl = (string) $rss->image['url'];
                    if (!empty($rss->channel['link']))          $this->_link = (string) $rss->channel['link'];
                    if (!empty($rss->channel['title']))         $this->_title = (string) $rss->channel['title'];
                    if (!empty($rss->channel['description']))   $this->_description = (string) $rss->channel['description'];

                    $items=$rss->items;             // With xmlparse
                }
                //var_dump($items);exit;
            }
            else
            {
                //var_dump($rss);
                if ($usesimplexml)
                {
                    if (!empty($rss->generator))   $this->_generator = (string) $rss->generator;
                    if (!empty($rss->modified))    $this->_lastbuilddate = (string) $rss->modified;
                    // href is an attribute of the link element, not a child element
                    $feedhref = (isset($rss->link['href']) ? (string) $rss->link['href'] : '');
                    if ($feedhref !== '')          $this->_link = $feedhref;
                    if (!empty($rss->title))       $this->_title = (string) $rss->title;
                    if (!empty($rss->description)) $this->_description = (string) $rss->description;

                    $tmprss=xml2php($rss);
                    $items=(is_array($tmprss) && isset($tmprss['entry'])) ? $tmprss['entry'] : array();
                    // A lone entry can come back as an associative array instead of a list: wrap it so the loop sees a list of entries
                    if (is_array($items) && ! empty($items) && ! isset($items[0])) $items=array($items);
                }
                else
                {
                    //if (!empty($rss->channel['rss_language']))    $this->_language = (string) $rss->channel['rss_language'];
                    if (!empty($rss->channel['generator']))     $this->_generator = (string) $rss->channel['generator'];
                    //if (!empty($rss->channel['rss_copyright']))   $this->_copyright = (string) $rss->channel['rss_copyright'];
                    if (!empty($rss->channel['modified']))      $this->_lastbuilddate = (string) $rss->channel['modified'];
                    //if (!empty($rss->image['rss_url']))           $this->_imageurl = (string) $rss->image['rss_url'];
                    if (!empty($rss->channel['link']))          $this->_link = (string) $rss->channel['link'];
                    if (!empty($rss->channel['title']))         $this->_title = (string) $rss->channel['title'];
                    //if (!empty($rss->channel['rss_description'])) $this->_description = (string) $rss->channel['rss_description'];

                    $items=$rss->items;             // With xmlparse
                }
                //var_dump($items);exit;
            }

            $i = 0;

            // Loop on each record (array with xmlparse or xml2php, iterable SimpleXMLElement with simplexml on rss)
            if (is_array($items) || $items instanceof Traversable)
            {
                foreach($items as $item)
                {
                    //var_dump($item);exit;
                    $itemCategory=array();

                    if ($format == 'rss')
                    {
                        if ($usesimplexml)
                        {
                            $itemLink = (string) $item->link;
                            $itemTitle = (string) $item->title;
                            $itemDescription = (string) $item->description;
                            $itemPubDate = (string) $item->pubDate;
                            $itemId = '';
                            $itemAuthor = '';

                            // Loop on each category
                            foreach ($item->category as $cat)
                            {
                                $itemCategory[] = (string) $cat;
                            }
                        }
                        else
                        {
                            // Categories are not returned here: handlers concatenate the text of repeated elements
                            $itemLink = $this->getItemValue($item, 'link');
                            $itemTitle = $this->getItemValue($item, 'title');
                            $itemDescription = $this->getItemValue($item, 'description');
                            $itemPubDate = $this->getItemValue($item, 'pubdate');
                            $itemId = $this->getItemValue($item, 'guid');
                            $itemAuthor = $this->getItemValue($item, 'author');
                        }
                    }
                    else
                    {
                        if ($usesimplexml)
                        {
                            // With xml2php, link is an array of attributes (or a list of such arrays if several links)
                            if (isset($item['link']['href']))        $itemLink = (string) $item['link']['href'];
                            elseif (isset($item['link'][0]['href'])) $itemLink = (string) $item['link'][0]['href'];
                            else                                     $itemLink = '';
                        }
                        else
                        {
                            // With xmlparse, handlers store the href of the link directly as a string
                            $itemLink = $this->getItemValue($item, 'link');
                        }
                        $itemTitle = $this->getItemValue($item, 'title');
                        $itemDescription = $this->getItemValue($item, 'summary');
                        $itemPubDate = $this->getItemValue($item, 'created');
                        $itemId = $this->getItemValue($item, 'id');

                        // Author is a text, or a "name" sub element (key author_name with xmlparse, author/name with xml2php)
                        $itemAuthor = trim($this->getItemValue($item, 'author'));
                        if ($itemAuthor === '')
                        {
                            if (isset($item['author']['name']) && ! is_array($item['author']['name'])) $itemAuthor = (string) $item['author']['name'];
                            else $itemAuthor = $this->getItemValue($item, 'author_name');
                        }
                    }

                    // Add record to result array
                    $this->_rssarray[$i] = array(
                        'link'=>$itemLink,
                        'title'=>$itemTitle,
                        'description'=>$itemDescription,
                        'pubDate'=>$itemPubDate,
                        'category'=>$itemCategory,
                        'id'=>$itemId,
                        'author'=>$itemAuthor);
                    //var_dump($this->_rssarray);

                    $i++;

                    if ($maxNb > 0 && $i >= $maxNb) break;    // We get all records we want
                }
            }

            return 1;
        }
        else
        {
            $this->error='ErrorFailedToLoadRSSFile';
            return -1;
        }
    }


    /**
     * Return a field of a record stored as an array, with no warning if the field is missing.
     *
     * @param   mixed   $item   Record (an array is expected)
     * @param   string  $key    Key of the field
     * @return  string          Value as a string, '' if $item is not an array, if key is not set or if value is an array
     */
    private function getItemValue($item, $key)
    {
        if (! is_array($item) || ! isset($item[$key]) || is_array($item[$key])) return '';
        return (string) $item[$key];
    }


    /**
     * Triggered when opened tag is found
     *
     * @param   mixed   $p          Parser handle (not used)
     * @param   string  $element    Name of the tag (converted to lower case here)
     * @param   array   $attrs      Attributes of the tag (keys are converted to lower case here)
     * @return  void
     */
    function feed_start_element($p, $element, &$attrs)
    {
        $el = $element = strtolower($element);
        $attrs = array_change_key_case($attrs, CASE_LOWER);

        // check for a namespace, and split if found
        $ns = false;
        if (strpos($element, ':'))
        {
            list($ns, $el) = explode(':', $element, 2);
        }
        if ( $ns and $ns != 'rdf' )
        {
            $this->current_namespace = $ns;
        }

        // if feed type isn't set, then this is first element of feed identify feed from root element
        if (empty($this->_format))
        {
            // The version attribute is optional, so it is read only if present
            $version = (isset($attrs['version']) ? $attrs['version'] : '');

            if ( $el == 'rdf' ) {
                $this->_format = 'rss';
                $this->feed_version = '1.0';
            }
            elseif ( $el == 'rss' ) {
                $this->_format = 'rss';
                $this->feed_version = $version;
            }
            elseif ( $el == 'feed' ) {
                $this->_format = 'atom';
                $this->feed_version = $version;
                $this->inchannel = true;
            }
            return;
        }

        if ( $el == 'channel' )
        {
            $this->inchannel = true;
        }
        elseif ($el == 'item' or $el == 'entry' )
        {
            $this->initem = true;
            if ( isset($attrs['rdf:about']) ) {
                $this->current_item['about'] = $attrs['rdf:about'];
            }
        }

        // if we're in the default namespace of an RSS feed,
        // record textinput or image fields
        elseif (
            $this->_format == 'rss' and
            $this->current_namespace == '' and
            $el == 'textinput' )
        {
            $this->intextinput = true;
        }

        elseif (
            $this->_format == 'rss' and
            $this->current_namespace == '' and
            $el == 'image' )
        {
            $this->inimage = true;
        }

        // handle atom content constructs
        elseif ( $this->_format == 'atom' and in_array($el, $this->_CONTENT_CONSTRUCTS) )
        {
            // avoid clashing w/ RSS mod_content
            if ($el == 'content' ) {
                $el = 'atom_content';
            }

            $this->incontent = $el;
        }

        // if inside an Atom content construct (e.g. content or summary) field treat tags as text
        elseif ($this->_format == 'atom' and $this->incontent )
        {
            // if tags are inlined, then flatten
            // NOTE(review): map_attrs is not defined in this file, it must be provided elsewhere - confirm
            $attrs_str = join(' ', array_map('map_attrs', array_keys($attrs), array_values($attrs)));

            $this->append_content("<$element $attrs_str>");

            array_unshift($this->stack, $el);
        }

        // Atom support many links per containging element.
        // Magpie treats link elements of type rel='alternate'
        // as being equivalent to RSS's simple link element.
        //
        elseif ($this->_format == 'atom' and $el == 'link' )
        {
            // A link with no rel attribute is processed as rel='alternate' (default relation defined by RFC 4287)
            if ( ! isset($attrs['rel']) || $attrs['rel'] == 'alternate' )
            {
                $link_el = 'link';
            }
            else {
                $link_el = 'link_' . $attrs['rel'];
            }

            $this->append($link_el, isset($attrs['href']) ? $attrs['href'] : '');
        }
        // set stack[0] to current element
        else {
            array_unshift($this->stack, $el);
        }
    }


    /**
     * Triggered when CDATA is found
     *
     * @param   mixed   $p      Parser handle (not used)
     * @param   string  $text   Text found
     * @return  void
     */
    function feed_cdata($p, $text)
    {
        if ($this->_format == 'atom' and $this->incontent)
        {
            $this->append_content($text);
        }
        else
        {
            // Field name is the path of opened elements, from outermost to innermost, joined with '_'
            $current_el = join('_', array_reverse($this->stack));
            $this->append($current_el, $text);
        }
    }

    /**
     * Triggered when closed tag is found
     *
     * @param   mixed   $p      Parser handle (not used)
     * @param   string  $el     Name of the tag (converted to lower case here)
     * @return  void
     */
    function feed_end_element($p, $el)
    {
        $el = strtolower($el);

        if ($el == 'item' or $el == 'entry')
        {
            $this->items[] = $this->current_item;
            $this->current_item = array();
            $this->initem = false;
        }
        elseif ($this->_format == 'rss' and $this->current_namespace == '' and $el == 'textinput' )
        {
            $this->intextinput = false;
        }
        elseif ($this->_format == 'rss' and $this->current_namespace == '' and $el == 'image' )
        {
            $this->inimage = false;
        }
        elseif ($this->_format == 'atom' and in_array($el, $this->_CONTENT_CONSTRUCTS) )
        {
            $this->incontent = false;
        }
        elseif ($el == 'channel' or $el == 'feed' )
        {
            $this->inchannel = false;
        }
        elseif ($this->_format == 'atom' and $this->incontent ) {
            // balance tags properly
            // note: i don't think this is actually neccessary
            if ( isset($this->stack[0]) && $this->stack[0] == $el )
            {
                $this->append_content("</$el>");
            }
            else {
                $this->append_content("<$el />");
            }

            array_shift($this->stack);
        }
        else {
            array_shift($this->stack);
        }

        $this->current_namespace = false;
    }


    /**
     * To concat 2 string with no warning if an operand is not defined
     *
     * @param   string  $str1   Str1 (modified in place, initialized to "" if not set)
     * @param   string  $str2   Str2
     * @return  void
     */
    function concat(&$str1, $str2="")
    {
        if (!isset($str1) ) {
            $str1="";
        }
        $str1 .= $str2;
    }

    /**
     * Add text to the atom content construct being read ($this->incontent), into the
     * current item if inside an item, else into the channel if inside the channel.
     *
     * @param   string  $text   Text to add
     * @return  void
     */
    function append_content($text)
    {
        if ( $this->initem ) {
            $this->concat($this->current_item[ $this->incontent ], $text);
        }
        elseif ( $this->inchannel ) {
            $this->concat($this->channel[ $this->incontent ], $text);
        }
    }

    /**
     * smart append - field and namespace aware
     *
     * Text is added to field $el of the container being read (item, channel, textinput or image),
     * under a sub array named like the current namespace when there is one.
     *
     * @param   string  $el     Name of the field (nothing is done if empty)
     * @param   string  $text   Text to add
     * @return  void
     */
    function append($el, $text)
    {
        if (!$el) {
            return;
        }
        if ( $this->current_namespace )
        {
            if ( $this->initem ) {
                $this->concat($this->current_item[ $this->current_namespace ][ $el ], $text);
            }
            elseif ($this->inchannel) {
                $this->concat($this->channel[ $this->current_namespace][ $el ], $text);
            }
            elseif ($this->intextinput) {
                $this->concat($this->textinput[ $this->current_namespace][ $el ], $text);
            }
            elseif ($this->inimage) {
                $this->concat($this->image[ $this->current_namespace ][ $el ], $text);
            }
        }
        else {
            if ( $this->initem ) {
                $this->concat($this->current_item[ $el ], $text);
            }
            elseif ($this->intextinput) {
                $this->concat($this->textinput[ $el ], $text);
            }
            elseif ($this->inimage) {
                $this->concat($this->image[ $el ], $text);
            }
            elseif ($this->inchannel) {
                $this->concat($this->channel[ $el ], $text);
            }
        }
    }

}
727 
728 
/**
 * Function to convert an XML object into an array
 *
 * Each child element becomes a key of the returned array. An element found several times under
 * the same parent becomes an indexed list of values; an element found once is stored directly.
 * Attributes are added as keys of the element when the element is converted into an array (it has
 * children) or has an empty text. An element with a non empty text and no children stays a string
 * and its attributes are ignored, because a string can not carry keys.
 *
 * @param   SimpleXMLElement    $xml    Element to convert
 * @return  array|string                Array of children, or text of the element if it has no child
 */
function xml2php($xml)
{
    $fils = 0;
    $islist = array();      // Keys of $array already converted into an indexed list
    $array = array();
    foreach($xml->children() as $key => $value)
    {
        $child = xml2php($value);

        //To deal with the attributes
        foreach($value->attributes() as $ak=>$av)
        {
            if (! is_array($child))
            {
                if ($child !== '') break;   // Text element: keep the text, a key can not be set on a string
                $child = array();           // Empty element: attributes become its content
            }
            $child[$ak] = (string) $av;
        }

        if (! array_key_exists($key, $array))
        {
            //Add a simple element
            $array[$key] = $child;
        }
        elseif (empty($islist[$key]))
        {
            //If this element is already in the array we will create an indexed array
            $array[$key] = array($array[$key], $child);
            $islist[$key] = true;
        }
        else
        {
            //Add an element in an existing array
            $array[$key][] = $child;
        }

        $fils++;
    }


    if($fils==0)
    {
        return (string) $xml;
    }

    return $array;
}
784 
getUrlRss()
getUrlRss
feed_start_element($p, $element, &$attrs)
Triggered when opened tag is found.
if(GETPOST('cancel','alpha')) if(!GETPOST('confirmmassaction','alpha')&&$massaction!= 'presend'&&$massaction!= 'confirm_presend')
Draft customers invoices.
Definition: list.php:147
concat(&$str1, $str2="")
To concat 2 string with no warning if an operand is not defined.
getItems()
getItems
dol_is_dir($folder)
Test if filename is a directory.
Definition: files.lib.php:414
getFormat()
getFormat
append($el, $text)
smart append - field and namespace aware
feed_end_element($p, $el)
Triggered when closed tag is found.
dol_syslog($message, $level=LOG_INFO, $ident=0, $suffixinfilename='', $restricttologhandler='')
Write log message into outputs.
dol_is_url($url)
Return if path is an URL.
Definition: files.lib.php:439
getDescription()
getDescription
getLink()
getLink
xml2php($xml)
Function to convert an XML object into an array.
getCopyright()
getCopyright
append_content($text)
Adds text to the content construct being read, in the current item or else in the channel.
__construct($db)
Constructor.
getLanguage()
getLanguage
dol_now($mode='gmt')
Return date for now.
getImageUrl()
getImageUrl
print
Draft customers invoices.
Definition: index.php:91
getTitle()
getTitle
dol_filemtime($pathoffile)
Return time of a file.
Definition: files.lib.php:528
getGenerator()
getGenerator
dol_mkdir($dir, $dataroot='', $newmask=null)
Creation of a directory (this can create recursive subdir)
feed_cdata($p, $text)
Triggered when CDATA is found.
parser($urlRSS, $maxNb=0, $cachedelay=60, $cachedir='')
Parse rss URL.
dol_hash($chain, $type='0')
Returns a hash of a string.
getLastFetchDate()
getLastFetchDate
getLastBuildDate()
getLastBuildDate
Class to parse RSS files.