dolibarr  18.0.0-alpha
geturl.lib.php
Go to the documentation of this file.
1 <?php
2 /* Copyright (C) 2008-2020 Laurent Destailleur <eldy@users.sourceforge.net>
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <https://www.gnu.org/licenses/>.
16  * or see https://www.gnu.org/
17  */
18 
/**
 * Get the content of a URL using cURL (through the proxy when MAIN_PROXY_USE is set).
 *
 * Redirections are never delegated to cURL: they are followed manually (5 hops at most) so that
 * the anti-SSRF checks on the target host are run again on every hop.
 *
 * @param	string		$url				URL to call
 * @param	string		$postorget			'POST', 'POSTALREADYFORMATED', 'PUT', 'PUTALREADYFORMATED', 'HEAD', 'DELETE'. Any other value means 'GET'.
 * @param	string		$param				Parameters (x=value1&y=value2), or raw body when $postorget is 'POSTALREADYFORMATED' or 'PUTALREADYFORMATED'.
 *											An array is tolerated for 'PUT' only (a warning is logged).
 * @param	int			$followlocation		0=Do not follow redirections, 1=Follow 301, 302, 303 and 307 redirections
 * @param	string[]	$addheaders			HTTP header lines to add to the request
 * @param	string[]	$allowedschemes		Allowed schemes. Only 'http' and 'https' are recognized.
 * @param	int			$localurl			0=Only external URLs are allowed, 1=Only local URLs are allowed, any other value=no check on the private/public range of the IP
 * @param	int			$ssl_verifypeer		-1=Verify certificate only when $dolibarr_main_prod is set, 0=Do not verify, 1=Verify. Forced to 0 when MAIN_CURL_DISABLE_VERIFYPEER is set.
 * @return	array							On cURL error: keys 'content', 'curl_error_no' and 'curl_error_msg'.
 *											Otherwise: keys of curl_getinfo() (like 'http_code') plus 'content' (when not empty), 'curl_error_no'='' and 'curl_error_msg'=''.
 *											When the URL is refused by a security check: 'http_code'=400 and 'content'=reason of the refusal.
 */
function getURLContent($url, $postorget = 'GET', $param = '', $followlocation = 1, $addheaders = array(), $allowedschemes = array('http', 'https'), $localurl = 0, $ssl_verifypeer = -1)
{
	global $conf, $dolibarr_main_prod;

	$USE_PROXY = empty($conf->global->MAIN_PROXY_USE) ? 0 : $conf->global->MAIN_PROXY_USE;
	$PROXY_HOST = empty($conf->global->MAIN_PROXY_HOST) ? 0 : $conf->global->MAIN_PROXY_HOST;
	$PROXY_PORT = empty($conf->global->MAIN_PROXY_PORT) ? 0 : $conf->global->MAIN_PROXY_PORT;
	$PROXY_USER = empty($conf->global->MAIN_PROXY_USER) ? 0 : $conf->global->MAIN_PROXY_USER;
	$PROXY_PASS = empty($conf->global->MAIN_PROXY_PASS) ? 0 : $conf->global->MAIN_PROXY_PASS;

	dol_syslog("getURLContent postorget=".$postorget." URL=".$url." param=".$param);

	// Setting the curl parameters
	$ch = curl_init();

	curl_setopt($ch, CURLOPT_VERBOSE, 1);
	curl_setopt($ch, CURLOPT_USERAGENT, 'Dolibarr geturl function');

	// We use @ here because this may return a warning if safe mode or open_basedir is on (following location is forbidden in such a case).
	// We force the value to false because redirections are managed by ourself in the loop below.
	@curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);

	if (is_array($addheaders) && count($addheaders)) {
		curl_setopt($ch, CURLOPT_HTTPHEADER, $addheaders);
	}
	curl_setopt($ch, CURLINFO_HEADER_OUT, true); // To be able to retrieve the request header and log it

	// By default use the TLS version decided by PHP.
	// You can force, if supported, a version like TLSv1 or TLSv1.2 (for example 6 for TLS 1.2)
	if (!empty($conf->global->MAIN_CURL_SSLVERSION)) {
		curl_setopt($ch, CURLOPT_SSLVERSION, $conf->global->MAIN_CURL_SSLVERSION);
	}

	// Turning on or off the check of the SSL certificate of the target
	if ($ssl_verifypeer < 0) {
		$ssl_verifypeer = ($dolibarr_main_prod ? true : false);
	}
	if (!empty($conf->global->MAIN_CURL_DISABLE_VERIFYPEER)) {
		$ssl_verifypeer = 0;
	}

	curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, ($ssl_verifypeer ? true : false));
	// CURLOPT_SSL_VERIFYHOST expects 2 (check that the certificate matches the host name) or 0. The value 1 (true) is no longer accepted.
	curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, ($ssl_verifypeer ? 2 : 0));

	// Restrict use to some protocols only
	$protocols = 0;
	if (is_array($allowedschemes)) {
		foreach ($allowedschemes as $allowedscheme) {
			if ($allowedscheme == 'http') {
				$protocols |= CURLPROTO_HTTP;
			}
			if ($allowedscheme == 'https') {
				$protocols |= CURLPROTO_HTTPS;
			}
		}
		curl_setopt($ch, CURLOPT_PROTOCOLS, $protocols);
		curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, $protocols);
	}

	curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, empty($conf->global->MAIN_USE_CONNECT_TIMEOUT) ? 5 : $conf->global->MAIN_USE_CONNECT_TIMEOUT);
	curl_setopt($ch, CURLOPT_TIMEOUT, empty($conf->global->MAIN_USE_RESPONSE_TIMEOUT) ? 30 : $conf->global->MAIN_USE_RESPONSE_TIMEOUT);

	// Limit the size of downloaded files when MAIN_SECURITY_MAXFILESIZE_DOWNLOADED is set
	$maxsize = getDolGlobalInt('MAIN_SECURITY_MAXFILESIZE_DOWNLOADED');
	if ($maxsize && defined('CURLOPT_MAXFILESIZE_LARGE')) {
		curl_setopt($ch, CURLOPT_MAXFILESIZE_LARGE, $maxsize);
	}
	if ($maxsize && defined('CURLOPT_MAXFILESIZE')) {
		curl_setopt($ch, CURLOPT_MAXFILESIZE, $maxsize);
	}

	curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // We want the response to be returned
	if ($postorget == 'POST') {
		curl_setopt($ch, CURLOPT_POST, 1); // POST
		curl_setopt($ch, CURLOPT_POSTFIELDS, $param); // Setting param x=a&y=z as POST fields
	} elseif ($postorget == 'POSTALREADYFORMATED') {
		curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'POST'); // HTTP request is 'POST' but param string is taken as it is
		curl_setopt($ch, CURLOPT_POSTFIELDS, $param); // param = content of post, like a xml string
	} elseif ($postorget == 'PUT') {
		$array_param = null;
		curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'PUT'); // HTTP request is 'PUT'
		if (!is_array($param)) {
			parse_str($param, $array_param);
		} else {
			dol_syslog("parameter param must be a string", LOG_WARNING);
			$array_param = $param;
		}
		curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($array_param)); // Setting param x=a&y=z as PUT fields
	} elseif ($postorget == 'PUTALREADYFORMATED') {
		curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'PUT'); // HTTP request is 'PUT' but param string is taken as it is
		curl_setopt($ch, CURLOPT_POSTFIELDS, $param); // param = content of put, like a xml string
	} elseif ($postorget == 'HEAD') {
		curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'HEAD'); // HTTP request is 'HEAD'
		curl_setopt($ch, CURLOPT_NOBODY, true);
	} elseif ($postorget == 'DELETE') {
		curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'DELETE'); // HTTP request is 'DELETE'
	} else {
		curl_setopt($ch, CURLOPT_POST, 0); // GET
	}

	if ($USE_PROXY) {
		// Never write the proxy password into the log
		dol_syslog("getURLContent set proxy to ".$PROXY_HOST.":".$PROXY_PORT." - ".$PROXY_USER.":".($PROXY_PASS ? '*****' : ''));
		curl_setopt($ch, CURLOPT_PROXY, $PROXY_HOST.":".$PROXY_PORT);
		if ($PROXY_USER) {
			curl_setopt($ch, CURLOPT_PROXYUSERPWD, $PROXY_USER.":".$PROXY_PASS);
		}
	}

	$newUrl = $url;
	$maxRedirection = 5;
	$info = array();
	$response = '';

	// Metadata servers of cloud providers. They are forbidden whatever is $localurl.
	$arrayofmetadataserver = array(
		'100.100.100.200' => 'Alibaba',
		'192.0.0.192' => 'Oracle',
		'192.80.8.124' => 'Packet',
		'100.88.222.5' => 'Tencent cloud',
	);

	do {
		if ($maxRedirection < 1) {
			break;
		}

		curl_setopt($ch, CURLOPT_URL, $newUrl);

		// Parse $newUrl to find the host to check
		$newUrlArray = parse_url($newUrl);
		if ((!is_array($newUrlArray) || empty($newUrlArray['host'])) && strpos((string) $newUrl, '://') === false) {
			// URL without scheme (like "www.example.com/page"): curl may still guess a host from it, so we
			// parse it as an http URL to find the host instead of silently skipping all the checks.
			$newUrlArray = parse_url('http://'.$newUrl);
		}
		if (!is_array($newUrlArray) || empty($newUrlArray['host'])) {
			$info['http_code'] = 400;
			$info['content'] = 'Error bad URL (no hostname found).';
			$response = ''; // Do not return the body of a previous redirection instead of the error
			break;
		}

		// Host names are case insensitive. We also remove the brackets of IPv6.
		$hosttocheck = strtolower(str_replace(array('[', ']'), '', $newUrlArray['host']));

		$denied = ''; // Reason of the refusal. Empty if the URL is accepted.

		if (in_array($hosttocheck, array('metadata.google.internal'), true)) {
			// Deny some reserved host names
			$denied = 'Error bad hostname '.$hosttocheck.' (Used by Google metadata). This value for hostname is not allowed.';
		} else {
			// Convert the host name $hosttocheck into an IP $iptocheck
			if (in_array($hosttocheck, array('localhost', 'localhost.domain'), true)) {
				$iptocheck = '127.0.0.1';
			} elseif (in_array($hosttocheck, array('ip6-localhost', 'ip6-loopback'), true)) {
				$iptocheck = '::1';
			} elseif (function_exists('gethostbyname')) {
				// gethostbyname() only resolves IPv4 and returns its argument unchanged on failure. TODO Resolve ip v6
				$iptocheck = gethostbyname($hosttocheck);
			} else {
				$iptocheck = $hosttocheck;
			}

			// Check that $iptocheck is an IP (v4 or v6). If not, we clear the value so the checks on IP are skipped.
			if (!filter_var($iptocheck, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4 | FILTER_FLAG_IPV6)) {
				$iptocheck = '0';
			}

			if ($iptocheck) {
				$ssrfserverips = array();
				if (!empty($conf->global->MAIN_SECURITY_ANTI_SSRF_SERVER_IP)) {
					$ssrfserverips = array_map('trim', explode(',', $conf->global->MAIN_SECURITY_ANTI_SSRF_SERVER_IP));
				}
				// False for ips like 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16, 0.0.0.0/8, 169.254.0.0/16, 127.0.0.0/8, 240.0.0.0/4, ::1/128, ::/128, ::ffff:0:0/96, fe80::/10...
				$ispublicip = (filter_var($iptocheck, FILTER_VALIDATE_IP, FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE) !== false);

				if ($localurl == 0) {
					// Only external url allowed (dangerous, may allow to get malware)
					if (!$ispublicip) {
						$denied = 'Error bad hostname IP (private or reserved range). Must be an external URL.';
					} elseif (!empty($_SERVER["SERVER_ADDR"]) && $iptocheck == $_SERVER["SERVER_ADDR"]) {
						$denied = 'Error bad hostname IP (IP is a local IP). Must be an external URL.';
					} elseif (in_array($iptocheck, $ssrfserverips, true)) {
						$denied = 'Error bad hostname IP (IP is a local IP defined into MAIN_SECURITY_ANTI_SSRF_SERVER_IP). Must be an external URL.';
					}
				} elseif ($localurl == 1) {
					// Only local url allowed (dangerous, may allow to get metadata on server or make internal port scanning)
					if ($ispublicip) {
						$denied = 'Error bad hostname '.$iptocheck.'. Must be a local URL.';
					} elseif (count($ssrfserverips) && !in_array($iptocheck, $ssrfserverips, true)) {
						$denied = 'Error bad hostname IP (IP is not a local IP defined into list MAIN_SECURITY_ANTI_SSRF_SERVER_IP). Must be a local URL in allowed list.';
					}
				}

				// Common check on ip (local and external)
				if ($denied === '' && isset($arrayofmetadataserver[$iptocheck])) {
					$denied = 'Error bad hostname IP (Used by '.$arrayofmetadataserver[$iptocheck].' metadata server). This IP is forbidden.';
				}

				// Set CURLOPT_CONNECT_TO so curl connects to the IP we have just checked and does not try
				// another resolution that may give a different result. Possible only on PHP v7+
				if ($denied === '' && defined('CURLOPT_CONNECT_TO')) {
					// Format is HOST:PORT:CONNECT-TO-HOST:CONNECT-TO-PORT. An empty port means "any port" / "same port".
					// It must stay empty (and not become 0) when the URL has no explicit port, otherwise the entry never matches.
					$porttouse = empty($newUrlArray['port']) ? '' : (string) $newUrlArray['port'];
					// An IPv6 target must be enclosed in brackets
					$targetip = (strpos($iptocheck, ':') !== false) ? '['.$iptocheck.']' : $iptocheck;
					$connect_to = array(sprintf("%s:%s:%s:%s", $newUrlArray['host'], $porttouse, $targetip, $porttouse));
					curl_setopt($ch, CURLOPT_CONNECT_TO, $connect_to);
				}
			}
		}

		if ($denied !== '') {
			$info['http_code'] = 400;
			$info['content'] = $denied;
			$response = ''; // Do not return the body of a previous redirection instead of the error
			break;
		}

		// Getting response from server
		$response = curl_exec($ch);

		$info = curl_getinfo($ch); // Reading of request must be done after sending request
		$http_code = $info['http_code'];

		if ($followlocation && ($http_code == 301 || $http_code == 302 || $http_code == 303 || $http_code == 307)) {
			$newUrl = $info['redirect_url'];
			$maxRedirection--;
			// "continue" jumps to the while condition: $http_code is not 0 here, so we loop on the new URL
			continue;
		}

		$http_code = 0; // Not a redirection to follow: ends the loop
	} while ($http_code);

	$request = curl_getinfo($ch, CURLINFO_HEADER_OUT); // Reading of request must be done after sending request

	dol_syslog("getURLContent request=".$request);
	if (!empty($conf->global->MAIN_GETURLCONTENT_OUTPUT_RESPONSE)) {
		// This may contain binary data, so we don't output the response by default.
		dol_syslog("getURLContent response =".$response);
	}
	dol_syslog("getURLContent response size=".strlen((string) $response));

	$rep = array();
	if (curl_errno($ch)) {
		$rep['content'] = $response;

		// Report the curl error to the caller
		$rep['curl_error_no'] = curl_errno($ch);
		$rep['curl_error_msg'] = curl_error($ch);

		dol_syslog("getURLContent response array is ".join(',', $rep));
	} else {
		// Start from the info of the request (or from the error set by the security checks)
		$rep = $info;
		dol_syslog("getURLContent http_code=".$rep['http_code']);

		if ($response) {
			$rep['content'] = $response;
		}
		$rep['curl_error_no'] = '';
		$rep['curl_error_msg'] = '';
	}

	curl_close($ch);

	return $rep;
}
319 
320 
/**
 * Extract a domain name from a URL.
 * For example with 'https://www.abc.mydomain.com/dir/page.html':
 * mode 0 returns 'mydomain', mode 1 returns 'mydomain.com', mode 2 returns 'abc.mydomain.com'.
 *
 * @param	string	$url	Full URL
 * @param	int		$mode	0=Second level name without the top level domain, 1=Last two labels, 2=Last three labels
 * @return	string			Domain name found
 */
function getDomainFromURL($url, $mode = 0)
{
	// Keep only the host part: drop the scheme, then everything starting at the first slash
	$withoutscheme = preg_replace('/^https?:\/\//i', '', $url);
	$host = preg_replace('/\/.*$/i', '', $withoutscheme);

	// Keep the last three labels in mode 2 ('abc.mydomain.com'), the last two otherwise ('mydomain.com')
	$keepthreelabels = ($mode == 2);
	$host = preg_replace(
		$keepthreelabels ? '/^.*\.([^\.]+)\.([^\.]+)\.([^\.]+)$/' : '/^.*\.([^\.]+)\.([^\.]+)$/',
		$keepthreelabels ? '\1.\2.\3' : '\1.\2',
		$host
	);

	// In mode 0, the top level domain (.com, .net, ...) is removed too
	return empty($mode) ? preg_replace('/\.[^\.]+$/', '', $host) : $host;
}
344 
/**
 * Return the root of a URL: the scheme (when it is http:// or https://) followed by the host part.
 * For example 'https://www.abc.mydomain.com/dir/page.html' gives 'https://www.abc.mydomain.com'.
 *
 * @param	string	$url	Full URL
 * @return	string			URL truncated before the first slash that follows the host
 */
function getRootURLFromURL($url)
{
	// Remember the scheme exactly as it was written, if there is one
	$matches = null;
	$scheme = preg_match('/^(https?:\/\/)/i', $url, $matches) ? $matches[1] : '';

	// Drop the scheme, then everything starting at the first slash
	$withoutscheme = preg_replace('/^https?:\/\//i', '', $url);
	$hostonly = preg_replace('/\/.*$/i', '', $withoutscheme);

	return $scheme.$hostonly;
}
366 
/**
 * Remove the HTML comments (<!-- ... -->) from a content.
 *
 * A comment ends at the first '-->' found after its opening '<!--', so two comments are never merged
 * into one. Comments containing hyphens, spanning several lines or empty are removed too.
 *
 * @param	string	$content	HTML content
 * @return	string				Content without HTML comments (the content unchanged if the regex engine fails)
 */
function removeHtmlComment($content)
{
	// Lazy match with the "s" modifier: the body of a comment may contain any character, including "-" and new lines
	$cleaned = preg_replace('/<!--.*?-->/s', '', $content);

	// preg_replace() returns null on a PCRE error (for example backtrack limit reached): keep the original content in such a case
	return ($cleaned === null) ? $content : $cleaned;
}
getRootURLFromURL
getRootURLFromURL($url)
Function to get the root URL from a long URL. For example: https://www.abc.mydomain.com/dir/page....
Definition: geturl.lib.php:353
getURLContent
getURLContent($url, $postorget='GET', $param='', $followlocation=1, $addheaders=array(), $allowedschemes=array('http', 'https'), $localurl=0, $ssl_verifypeer=-1)
Function to get a content from an URL (use proxy if proxy defined).
Definition: geturl.lib.php:41
dol_syslog
dol_syslog($message, $level=LOG_INFO, $ident=0, $suffixinfilename='', $restricttologhandler='', $logcontext=null)
Write log message into outputs.
Definition: functions.lib.php:1639
getDomainFromURL
getDomainFromURL($url, $mode=0)
Function to get the second level domain name.
Definition: geturl.lib.php:329
removeHtmlComment
removeHtmlComment($content)
Function to remove comments from HTML content.
Definition: geturl.lib.php:373
getDolGlobalInt
getDolGlobalInt($key, $default=0)
Return dolibarr global constant int value.
Definition: functions.lib.php:96