dolibarr  9.0.0
rssparser.class.php
Go to the documentation of this file.
1 <?php
2 /* Copyright (C) 2011-2012 Laurent Destailleur <eldy@users.sourceforge.net>
3  *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17 
/**
 * Class to parse RSS and Atom feeds.
 *
 * A feed is fetched (or read from a local cache file) by parser() and decoded
 * either with SimpleXML (when $conf->global->EXTERNALRSS_USE_SIMPLEXML is set)
 * or with the event based xml_parser extension, for which this class provides
 * the feed_start_element / feed_cdata / feed_end_element callbacks.
 * Results are exposed through the get*() accessors.
 */
class RssParser
{
    /**
     * @var mixed Database handler received by the constructor (only stored by this class)
     */
    public $db;

    /**
     * @var string Error code of the last failed call to parser() ('' if none)
     */
    public $error='';

    private $_format='';
    private $_urlRSS;
    private $_language;
    private $_generator;
    private $_copyright;
    private $_lastbuilddate;
    private $_imageurl;
    private $_link;
    private $_title;
    private $_description;
    private $_lastfetchdate;       // Last successful fetch
    private $_rssarray=array();

    // State used by the xml_parser callbacks. These members were created dynamically
    // (so publicly) by the callbacks; they are declared here with neutral defaults so
    // they are never read before being set and do not rely on dynamic properties.
    public $stack = array();       // parser stack
    public $_CONTENT_CONSTRUCTS = array('content', 'summary', 'info', 'title', 'tagline', 'copyright');
    public $current_namespace = false;
    public $feed_version = '';
    public $inchannel = false;
    public $initem = false;
    public $intextinput = false;
    public $inimage = false;
    public $incontent = false;
    public $current_item = array();
    public $items = array();
    public $channel = array();
    public $textinput = array();
    public $image = array();


    /**
     * Constructor
     *
     * @param   mixed   $db     Database handler
     */
    public function __construct($db)
    {
        $this->db = $db;
    }

    /**
     * getFormat
     *
     * @return  string      Detected feed format ('rss', 'atom' or '' if nothing parsed yet)
     */
    public function getFormat()
    {
        return $this->_format;
    }

    /**
     * getUrlRss
     *
     * @return  string      URL given to the last call of parser()
     */
    public function getUrlRss()
    {
        return $this->_urlRSS;
    }

    /**
     * getLanguage
     *
     * @return  string      Language of the feed
     */
    public function getLanguage()
    {
        return $this->_language;
    }

    /**
     * getGenerator
     *
     * @return  string      Generator of the feed
     */
    public function getGenerator()
    {
        return $this->_generator;
    }

    /**
     * getCopyright
     *
     * @return  string      Copyright of the feed
     */
    public function getCopyright()
    {
        return $this->_copyright;
    }

    /**
     * getLastBuildDate
     *
     * @return  string      Last build date as found in the feed
     */
    public function getLastBuildDate()
    {
        return $this->_lastbuilddate;
    }

    /**
     * getImageUrl
     *
     * @return  string      URL of the feed image
     */
    public function getImageUrl()
    {
        return $this->_imageurl;
    }

    /**
     * getLink
     *
     * @return  string      Link of the feed
     */
    public function getLink()
    {
        return $this->_link;
    }

    /**
     * getTitle
     *
     * @return  string      Title of the feed
     */
    public function getTitle()
    {
        return $this->_title;
    }

    /**
     * getDescription
     *
     * @return  string      Description of the feed
     */
    public function getDescription()
    {
        return $this->_description;
    }

    /**
     * getLastFetchDate
     *
     * @return  int         Date of the cache file used, or date the feed was fetched and cached
     */
    public function getLastFetchDate()
    {
        return $this->_lastfetchdate;
    }

    /**
     * getItems
     *
     * @return  array       List of records, each an array with keys link, title, description, pubDate, category, id, author
     */
    public function getItems()
    {
        return $this->_rssarray;
    }


    /**
     * Parse rss URL
     *
     * @param   string  $urlRSS         Url to parse
     * @param   int     $maxNb          Max nb of records to get (0 for no limit)
     * @param   int     $cachedelay     0=No cache, nb of seconds a cache file stays valid
     * @param   string  $cachedir       Directory where to save cache file ('' disables the cache)
     * @return  int                     -1 if KO ($this->error is set), 1 if OK
     */
    public function parser($urlRSS, $maxNb=0, $cachedelay=60, $cachedir='')
    {
        global $conf;

        include_once DOL_DOCUMENT_ROOT.'/core/lib/files.lib.php';

        $rss='';
        $str='';    // This will contain content of feed

        $this->error='';

        // Check parameters
        if (! dol_is_url($urlRSS))
        {
            $this->error="ErrorBadUrl";
            return -1;
        }

        // Forget everything learnt from a previous call, otherwise a second feed would be
        // decoded with the format/stack of the first one and items would pile up.
        $this->resetParsedData();

        $usesimplexml = ! empty($conf->global->EXTERNALRSS_USE_SIMPLEXML);

        $this->_urlRSS = $urlRSS;
        $newpathofdestfile=$cachedir.'/'.dol_hash($this->_urlRSS,3);   // Force md5 hash (does not contains special chars)
        $newmask='0644';

        $nowgmt = dol_now();

        // Search into cache
        $foundintocache=0;
        if ($cachedelay > 0 && $cachedir)
        {
            $filedate=dol_filemtime($newpathofdestfile);
            if ($filedate >= ($nowgmt - $cachedelay))
            {
                $foundintocache=1;

                $this->_lastfetchdate=$filedate;
            }
            else
            {
                dol_syslog(get_class($this)."::parser cache file ".$newpathofdestfile." is not found or older than now - cachedelay (".$nowgmt." - ".$cachedelay.") so we can't use it.");
            }
        }

        // Load file into $str
        if ($foundintocache)    // Cache file found and is not too old
        {
            $str = file_get_contents($newpathofdestfile);
        }
        else
        {
            ini_set("user_agent","Dolibarr ERP-CRM RSS reader");
            // Only override the timeouts when a value is configured: passing an unset
            // value would silently reset both settings.
            if (! empty($conf->global->MAIN_USE_RESPONSE_TIMEOUT))
            {
                ini_set("max_execution_time", $conf->global->MAIN_USE_RESPONSE_TIMEOUT);
                ini_set("default_socket_timeout", $conf->global->MAIN_USE_RESPONSE_TIMEOUT);
            }

            $opts = array('http'=>array('method'=>"GET"));
            if (! empty($conf->global->MAIN_USE_CONNECT_TIMEOUT)) $opts['http']['timeout']=$conf->global->MAIN_USE_CONNECT_TIMEOUT;
            if (! empty($conf->global->MAIN_PROXY_USE)) $opts['http']['proxy']='tcp://'.$conf->global->MAIN_PROXY_HOST.':'.$conf->global->MAIN_PROXY_PORT;
            $context = stream_context_create($opts);

            try {
                $str = file_get_contents($this->_urlRSS, false, $context);
            }
            catch (Exception $e) {
                // Log instead of printing: this is library code and the URL is not escaped for output
                dol_syslog(get_class($this)."::parser Error retrieving URL ".$this->_urlRSS." - ".$e->getMessage(), LOG_ERR);
                $str = false;
            }
        }

        // An empty answer is not a feed: handle it like a failed download
        if ($str !== false && $str !== '')
        {
            // Convert $str into xml
            if ($usesimplexml)
            {
                // NOTE(review): the feed is untrusted XML; entity loading relies on libxml defaults here
                libxml_use_internal_errors(false);
                $rss = simplexml_load_string($str, "SimpleXMLElement", LIBXML_NOCDATA);
            }
            else
            {
                $xmlparser=xml_parser_create('');
                // xml_parser_create returns a resource with PHP < 8 and an XMLParser object with PHP >= 8
                if (! is_resource($xmlparser) && ! is_object($xmlparser)) {
                    $this->error="ErrorFailedToCreateParser"; return -1;
                }

                xml_set_object($xmlparser, $this);
                xml_set_element_handler($xmlparser, 'feed_start_element', 'feed_end_element');
                xml_set_character_data_handler($xmlparser, 'feed_cdata');
                $status = xml_parse($xmlparser, $str);
                if (! $status)
                {
                    // Keep what was decoded before the error (best effort) but leave a trace of the problem
                    dol_syslog(get_class($this)."::parser XML error: ".xml_error_string(xml_get_error_code($xmlparser)), LOG_WARNING);
                }
                xml_parser_free($xmlparser);
                $rss=$this;
            }
        }

        // If $rss loaded
        if ($rss)
        {
            // Save file into cache
            if (empty($foundintocache) && $cachedir)
            {
                dol_syslog(get_class($this)."::parser cache file ".$newpathofdestfile." is saved onto disk.");
                if (! dol_is_dir($cachedir)) dol_mkdir($cachedir);
                $fp = fopen($newpathofdestfile, 'w');
                if ($fp)
                {
                    fwrite($fp, $str);
                    fclose($fp);
                    if (! empty($conf->global->MAIN_UMASK)) $newmask=$conf->global->MAIN_UMASK;
                    @chmod($newpathofdestfile, octdec($newmask));

                    $this->_lastfetchdate=$nowgmt;
                }
                else
                {
                    dol_syslog(get_class($this)."::parser Error, failed to open file ".$newpathofdestfile." for write", LOG_ERR);
                }
            }

            unset($str);    // Free memory

            // Detect format. It is kept in a local variable and in $this->_format, never
            // written on $rss (with SimpleXML that would add a child node to the document).
            $format = $usesimplexml ? '' : $this->_format;
            if (empty($format))    // If format not detected automatically
            {
                $format = empty($rss->channel) ? 'atom' : 'rss';
            }
            $this->_format = $format;

            $items=array();

            // Save description entries
            if ($format == 'rss')
            {
                if ($usesimplexml)
                {
                    if (!empty($rss->channel->language))      $this->_language = (string) $rss->channel->language;
                    if (!empty($rss->channel->generator))     $this->_generator = (string) $rss->channel->generator;
                    if (!empty($rss->channel->copyright))     $this->_copyright = (string) $rss->channel->copyright;
                    if (!empty($rss->channel->lastbuilddate)) $this->_lastbuilddate = (string) $rss->channel->lastbuilddate;
                    if (!empty($rss->channel->image->url[0])) $this->_imageurl = (string) $rss->channel->image->url[0];
                    if (!empty($rss->channel->link))          $this->_link = (string) $rss->channel->link;
                    if (!empty($rss->channel->title))         $this->_title = (string) $rss->channel->title;
                    if (!empty($rss->channel->description))   $this->_description = (string) $rss->channel->description;

                    $items=$rss->channel->item;    // With simplexml (a SimpleXMLElement, not an array)
                }
                else
                {
                    if (!empty($rss->channel['language']))      $this->_language = (string) $rss->channel['language'];
                    if (!empty($rss->channel['generator']))     $this->_generator = (string) $rss->channel['generator'];
                    if (!empty($rss->channel['copyright']))     $this->_copyright = (string) $rss->channel['copyright'];
                    if (!empty($rss->channel['lastbuilddate'])) $this->_lastbuilddate = (string) $rss->channel['lastbuilddate'];
                    if (!empty($rss->image['url']))             $this->_imageurl = (string) $rss->image['url'];
                    if (!empty($rss->channel['link']))          $this->_link = (string) $rss->channel['link'];
                    if (!empty($rss->channel['title']))         $this->_title = (string) $rss->channel['title'];
                    if (!empty($rss->channel['description']))   $this->_description = (string) $rss->channel['description'];

                    $items=$rss->items;            // With xmlparse
                }
            }
            else if ($format == 'atom')
            {
                if ($usesimplexml)
                {
                    if (!empty($rss->generator))   $this->_generator = (string) $rss->generator;
                    if (!empty($rss->modified))    $this->_lastbuilddate = (string) $rss->modified;
                    if (!empty($rss->link->href))  $this->_link = (string) $rss->link->href;
                    if (!empty($rss->title))       $this->_title = (string) $rss->title;
                    if (!empty($rss->description)) $this->_description = (string) $rss->description;

                    $tmprss=xml2php($rss);
                    $items=(is_array($tmprss) && isset($tmprss['entry'])) ? $tmprss['entry'] : array();
                    // A feed with a single <entry> gives the entry itself, not a list of entries
                    if (is_array($items) && count($items) && ! isset($items[0])) $items=array($items);
                }
                else
                {
                    if (!empty($rss->channel['generator'])) $this->_generator = (string) $rss->channel['generator'];
                    if (!empty($rss->channel['modified']))  $this->_lastbuilddate = (string) $rss->channel['modified'];
                    if (!empty($rss->channel['link']))      $this->_link = (string) $rss->channel['link'];
                    if (!empty($rss->channel['title']))     $this->_title = (string) $rss->channel['title'];

                    $items=$rss->items;            // With xmlparse
                }
            }

            $i = 0;

            // Loop on each record (array with xmlparse or xml2php, Traversable with simplexml)
            if (is_array($items) || $items instanceof Traversable)
            {
                foreach($items as $item)
                {
                    // Defaults, so every key of the record is always defined
                    $itemLink = '';
                    $itemTitle = '';
                    $itemDescription = '';
                    $itemPubDate = '';
                    $itemId = '';
                    $itemAuthor = '';
                    $itemCategory = array();

                    if ($format == 'rss' && $usesimplexml)
                    {
                        $itemLink = (string) $item->link;
                        $itemTitle = (string) $item->title;
                        $itemDescription = (string) $item->description;
                        $itemPubDate = (string) $item->pubDate;

                        // Loop on each category
                        foreach ($item->category as $cat)
                        {
                            $itemCategory[] = (string) $cat;
                        }
                    }
                    else if ($format == 'rss')
                    {
                        if (! is_array($item)) continue;

                        // Categories are left empty here: the callbacks concatenate the text of
                        // repeated elements, so several categories can not be told apart.
                        $itemLink = self::getItemField($item, 'link');
                        $itemTitle = self::getItemField($item, 'title');
                        $itemDescription = self::getItemField($item, 'description');
                        $itemPubDate = self::getItemField($item, 'pubdate');
                        $itemId = self::getItemField($item, 'guid');
                        $itemAuthor = self::getItemField($item, 'author');
                    }
                    else if ($format == 'atom')
                    {
                        if (! is_array($item)) continue;

                        // Same array layout whether entries come from xml2php or from xmlparse
                        $itemLink = self::getAtomLink(isset($item['link']) ? $item['link'] : '');
                        $itemTitle = self::getItemField($item, 'title');
                        $itemDescription = self::getItemField($item, 'summary');
                        // 'created' is Atom 0.3; Atom 1.0 (RFC 4287) uses 'published' and 'updated'
                        $itemPubDate = self::getItemField($item, 'created');
                        if ($itemPubDate === '') $itemPubDate = self::getItemField($item, 'published');
                        if ($itemPubDate === '') $itemPubDate = self::getItemField($item, 'updated');
                        $itemId = self::getItemField($item, 'id');
                        $itemAuthor = self::getItemField($item, 'author');
                        if ($itemAuthor === '' && isset($item['author']) && is_array($item['author'])) $itemAuthor = self::getItemField($item['author'], 'name');
                        if ($itemAuthor === '') $itemAuthor = self::getItemField($item, 'author_name');
                    }
                    else
                    {
                        dol_syslog(get_class($this)."::parser ErrorBadFeedFormat", LOG_ERR);
                        continue;
                    }

                    // Add record to result array
                    $this->_rssarray[$i] = array(
                        'link'=>$itemLink,
                        'title'=>$itemTitle,
                        'description'=>$itemDescription,
                        'pubDate'=>$itemPubDate,
                        'category'=>$itemCategory,
                        'id'=>$itemId,
                        'author'=>$itemAuthor);

                    $i++;

                    // 0 means no limit, otherwise stop once exactly $maxNb records are stored
                    if ($maxNb > 0 && $i >= $maxNb) break;
                }
            }

            return 1;
        }
        else
        {
            $this->error='ErrorFailedToLoadRSSFile';
            return -1;
        }
    }

    /**
     * Clear the feed data and the xml_parser state kept from a previous parser() call
     *
     * @return  void
     */
    private function resetParsedData()
    {
        $this->_format='';
        $this->_language=null;
        $this->_generator=null;
        $this->_copyright=null;
        $this->_lastbuilddate=null;
        $this->_imageurl=null;
        $this->_link=null;
        $this->_title=null;
        $this->_description=null;
        $this->_lastfetchdate=null;
        $this->_rssarray=array();

        $this->stack=array();
        $this->current_namespace=false;
        $this->feed_version='';
        $this->inchannel=false;
        $this->initem=false;
        $this->intextinput=false;
        $this->inimage=false;
        $this->incontent=false;
        $this->current_item=array();
        $this->items=array();
        $this->channel=array();
        $this->textinput=array();
        $this->image=array();
    }

    /**
     * Return a field of a decoded record as a string
     *
     * @param   array   $item       Decoded record
     * @param   string  $key        Key to read
     * @return  string              Value, or '' if the key is missing or is not a scalar
     */
    private static function getItemField($item, $key)
    {
        return (isset($item[$key]) && is_scalar($item[$key])) ? (string) $item[$key] : '';
    }

    /**
     * Return the URL of an Atom link, whatever the way it was decoded
     *
     * @param   mixed   $link       URL string (xmlparse), array of attributes or list of such arrays (xml2php)
     * @return  string              URL ('' if none). With several links, the first 'alternate' one is preferred.
     */
    private static function getAtomLink($link)
    {
        if (is_string($link)) return $link;
        if (! is_array($link)) return '';
        if (isset($link['href'])) return (string) $link['href'];

        $firsthref = '';
        foreach ($link as $candidate)
        {
            if (! is_array($candidate) || ! isset($candidate['href'])) continue;
            if (! isset($candidate['rel']) || $candidate['rel'] == 'alternate') return (string) $candidate['href'];
            if ($firsthref === '') $firsthref = (string) $candidate['href'];
        }
        return $firsthref;
    }


    // phpcs:disable PEAR.NamingConventions.ValidFunctionName.NotCamelCaps
    /**
     * Triggered when opened tag is found
     *
     * @param   mixed   $p          Parser handle given by the xml extension (not used)
     * @param   string  $element    Tag name
     * @param   array   $attrs      Attributes of tag (keys are lowercased in place)
     * @return  void
     */
    public function feed_start_element($p, $element, &$attrs)
    {
        // phpcs:enable
        $el = $element = strtolower($element);
        $attrs = array_change_key_case($attrs, CASE_LOWER);

        // check for a namespace, and split if found
        $ns = false;
        if (strpos($element, ':'))
        {
            list($ns, $el) = explode(':', $element, 2);
        }
        if ( $ns and $ns != 'rdf' )
        {
            $this->current_namespace = $ns;
        }

        // if feed type isn't set, then this is first element of feed identify feed from root element
        if (empty($this->_format))
        {
            // The version attribute is optional (an Atom 1.0 <feed> has none)
            $version = isset($attrs['version']) ? $attrs['version'] : '';

            if ( $el == 'rdf' ) {
                $this->_format = 'rss';
                $this->feed_version = '1.0';
            }
            elseif ( $el == 'rss' ) {
                $this->_format = 'rss';
                $this->feed_version = $version;
            }
            elseif ( $el == 'feed' ) {
                $this->_format = 'atom';
                $this->feed_version = $version;
                $this->inchannel = true;
            }
            return;
        }

        if ( $el == 'channel' )
        {
            $this->inchannel = true;
        }
        elseif ($el == 'item' or $el == 'entry' )
        {
            $this->initem = true;
            if ( isset($attrs['rdf:about']) ) {
                $this->current_item['about'] = $attrs['rdf:about'];
            }
        }

        // if we're in the default namespace of an RSS feed,
        // record textinput or image fields
        elseif (
            $this->_format == 'rss' and
            $this->current_namespace == '' and
            $el == 'textinput' )
        {
            $this->intextinput = true;
        }

        elseif (
            $this->_format == 'rss' and
            $this->current_namespace == '' and
            $el == 'image' )
        {
            $this->inimage = true;
        }

        // handle atom content constructs
        elseif ( $this->_format == 'atom' and in_array($el, $this->_CONTENT_CONSTRUCTS) )
        {
            // avoid clashing w/ RSS mod_content
            if ($el == 'content' ) {
                $el = 'atom_content';
            }

            $this->incontent = $el;
        }

        // if inside an Atom content construct (e.g. content or summary) field treat tags as text
        elseif ($this->_format == 'atom' and $this->incontent )
        {
            // if tags are inlined, then flatten: rebuild the attributes as name="value" pairs
            $pairs = array();
            foreach ($attrs as $attrname => $attrvalue)
            {
                $pairs[] = $attrname.'="'.$attrvalue.'"';
            }
            $attrs_str = join(' ', $pairs);

            $this->append_content("<$element $attrs_str>");

            array_unshift($this->stack, $el);
        }

        // Atom support many links per containing element.
        // Links of type rel='alternate' (also the meaning of a missing rel attribute)
        // are treated as being equivalent to RSS's simple link element.
        elseif ($this->_format == 'atom' and $el == 'link' )
        {
            $rel = isset($attrs['rel']) ? $attrs['rel'] : 'alternate';
            $link_el = ($rel == 'alternate') ? 'link' : 'link_' . $rel;

            $this->append($link_el, isset($attrs['href']) ? $attrs['href'] : '');
        }
        // set stack[0] to current element
        else {
            array_unshift($this->stack, $el);
        }
    }


    // phpcs:disable PEAR.NamingConventions.ValidFunctionName.NotCamelCaps
    /**
     * Triggered when CDATA is found
     *
     * @param   mixed   $p          Parser handle given by the xml extension (not used)
     * @param   string  $text       Text found
     * @return  void
     */
    public function feed_cdata($p, $text)
    {
        // phpcs:enable
        if ($this->_format == 'atom' and $this->incontent)
        {
            $this->append_content($text);
        }
        else
        {
            // Field name is the path of opened elements, outermost first (ie 'author_name')
            $current_el = join('_', array_reverse($this->stack));
            $this->append($current_el, $text);
        }
    }

    // phpcs:disable PEAR.NamingConventions.ValidFunctionName.NotCamelCaps
    /**
     * Triggered when closed tag is found
     *
     * @param   mixed   $p          Parser handle given by the xml extension (not used)
     * @param   string  $el         Tag name
     * @return  void
     */
    public function feed_end_element($p, $el)
    {
        // phpcs:enable
        $el = strtolower($el);

        if ($el == 'item' or $el == 'entry')
        {
            $this->items[] = $this->current_item;
            $this->current_item = array();
            $this->initem = false;
        }
        elseif ($this->_format == 'rss' and $this->current_namespace == '' and $el == 'textinput' )
        {
            $this->intextinput = false;
        }
        elseif ($this->_format == 'rss' and $this->current_namespace == '' and $el == 'image' )
        {
            $this->inimage = false;
        }
        elseif ($this->_format == 'atom' and in_array($el, $this->_CONTENT_CONSTRUCTS) )
        {
            $this->incontent = false;
        }
        elseif ($el == 'channel' or $el == 'feed' )
        {
            $this->inchannel = false;
        }
        elseif ($this->_format == 'atom' and $this->incontent ) {
            // balance tags properly
            if ( isset($this->stack[0]) && $this->stack[0] == $el )
            {
                $this->append_content("</$el>");
            }
            else {
                $this->append_content("<$el />");
            }

            array_shift($this->stack);
        }
        elseif ($this->_format == 'atom' and $el == 'link' ) {
            // Nothing to pop: feed_start_element does not push Atom links found outside a content construct
        }
        else {
            array_shift($this->stack);
        }

        $this->current_namespace = false;
    }


    /**
     * To concat 2 strings with no warning if the first operand is not defined
     *
     * @param   string  $str1       Str1 (modified in place, created as '' if not set)
     * @param   string  $str2       Str2
     * @return  void
     */
    public function concat(&$str1, $str2="")
    {
        if (!isset($str1) ) {
            $str1="";
        }
        $str1 .= $str2;
    }

    // phpcs:disable PEAR.NamingConventions.ValidFunctionName.NotCamelCaps
    /**
     * Append text to the Atom content construct being read ($this->incontent),
     * on the current item if inside one, else on the channel
     *
     * @param   string  $text       Text to append
     * @return  void
     */
    public function append_content($text)
    {
        // phpcs:enable
        if ( $this->initem ) {
            $this->concat($this->current_item[ $this->incontent ], $text);
        }
        elseif ( $this->inchannel ) {
            $this->concat($this->channel[ $this->incontent ], $text);
        }
    }

    /**
     * smart append - field and namespace aware
     *
     * @param   string  $el         Name of the field to append to (nothing is done if empty)
     * @param   string  $text       Text to append
     * @return  void
     */
    public function append($el, $text)
    {
        if (!$el) {
            return;
        }
        if ( $this->current_namespace )
        {
            if ( $this->initem ) {
                $this->concat($this->current_item[ $this->current_namespace ][ $el ], $text);
            }
            elseif ($this->inchannel) {
                $this->concat($this->channel[ $this->current_namespace][ $el ], $text);
            }
            elseif ($this->intextinput) {
                $this->concat($this->textinput[ $this->current_namespace][ $el ], $text);
            }
            elseif ($this->inimage) {
                $this->concat($this->image[ $this->current_namespace ][ $el ], $text);
            }
        }
        else {
            if ( $this->initem ) {
                $this->concat($this->current_item[ $el ], $text);
            }
            elseif ($this->intextinput) {
                $this->concat($this->textinput[ $el ], $text);
            }
            elseif ($this->inimage) {
                $this->concat($this->image[ $el ], $text);
            }
            elseif ($this->inchannel) {
                $this->concat($this->channel[ $el ], $text);
            }
        }
    }
}
738 
739 
/**
 * Function to convert an XML object into an array
 *
 * Each child element becomes a key of the returned array. An element found several
 * times under the same parent becomes an indexed list; an element found once stays
 * a single value, whatever its siblings are. Attributes are added as extra keys on
 * elements decoded as arrays (including elements with neither text nor children);
 * an element holding only text keeps its text and its attributes are ignored.
 *
 * @param   SimpleXMLElement    $xml        Xml
 * @return  array|string                    Array of children, or text content if the element has no child
 */
function xml2php($xml)
{
    $fils = 0;
    $islist = array();      // Keys of $array already converted into an indexed list
    $array = array();
    foreach($xml->children() as $key => $value)
    {
        $child = xml2php($value);

        // To deal with the attributes
        foreach($value->attributes() as $ak=>$av)
        {
            // An attribute can not be stored on a text value: an empty element becomes an
            // array of its attributes, a non empty text is kept unchanged.
            if ($child === '') $child = array();
            if (is_array($child)) $child[$ak] = (string) $av;
        }

        if (! array_key_exists($key, $array))
        {
            // Add a simple element
            $array[$key] = $child;
        }
        elseif (isset($islist[$key]))
        {
            // Add an element in an existing list
            $array[$key][] = $child;
        }
        else
        {
            // Second occurrence of this element: turn the value into an indexed list
            $array[$key] = array($array[$key], $child);
            $islist[$key] = true;
        }

        $fils++;
    }


    if ($fils==0)
    {
        return (string) $xml;
    }

    return $array;
}
print
Draft customers invoices.
Definition: index.php:91
getUrlRss()
getUrlRss
feed_start_element($p, $element, &$attrs)
Triggered when opened tag is found.
concat(&$str1, $str2="")
Concatenate two strings without raising a warning when the first operand is not defined.
getItems()
getItems
dol_is_dir($folder)
Test if filename is a directory.
Definition: files.lib.php:438
getFormat()
getFormat
append($el, $text)
smart append - field and namespace aware
feed_end_element($p, $el)
Triggered when closed tag is found.
dol_syslog($message, $level=LOG_INFO, $ident=0, $suffixinfilename='', $restricttologhandler='')
Write log message into outputs.
dol_is_url($url)
Return if path is an URL.
Definition: files.lib.php:475
getDescription()
getDescription
getLink()
getLink
xml2php($xml)
Function to convert an XML object into an array.
getCopyright()
getCopyright
append_content($text)
Append text to the Atom content construct currently being parsed, on the current item or else on the channel.
__construct($db)
Constructor.
getLanguage()
getLanguage
if(GETPOST('cancel', 'alpha')) if(! GETPOST( 'confirmmassaction', 'alpha') &&$massaction !='presend' &&$massaction !='confirm_presend')
Draft customers invoices.
Definition: list.php:156
dol_now($mode='gmt')
Return date for now.
getImageUrl()
getImageUrl
getTitle()
getTitle
dol_filemtime($pathoffile)
Return time of a file.
Definition: files.lib.php:564
getGenerator()
getGenerator
dol_mkdir($dir, $dataroot='', $newmask=null)
Creation of a directory (this can create recursive subdir)
feed_cdata($p, $text)
Triggered when CDATA is found.
parser($urlRSS, $maxNb=0, $cachedelay=60, $cachedir='')
Parse rss URL.
dol_hash($chain, $type='0')
Returns a hash of a string.
getLastFetchDate()
getLastFetchDate
getLastBuildDate()
getLastBuildDate
Class to parse RSS files.