dolibarr 24.0.0-beta
privacy_guard.class.php
Go to the documentation of this file.
1<?php
2/* Copyright (C) 2026 Laurent Destailleur <eldy@users.sourceforge.net>
3 * Copyright (C) 2026 Nick Fragoulis
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <https://www.gnu.org/licenses/>.
17 */
18
29{
33 public $db;
34
38 private $map = [];
39
43 private $index = 0;
44
/**
 * Mask sensitive data in a text by replacing each detected value with a
 * placeholder token of the form [[TYPE_N]].
 *
 * The token map and the token counter are reset on every call, so the map only
 * ever describes the text of the most recent call.
 *
 * Patterns are applied sequentially, each one on the output of the previous one.
 * Order therefore matters: a value already tokenized by an earlier pattern can
 * no longer be seen by a later one.
 *
 * @param   string  $text   Text to mask
 * @return  string          Text with the detected values replaced by tokens
 */
public function mask($text)
{
    $this->map = [];
    $this->index = 0;
    $text = (string) $text;

    // Emails
    // Must run BEFORE the generic reference pattern: otherwise "john123" in
    // "john123@example.com" is tokenized as a reference, the email pattern can no
    // longer match, and the domain part stays in clear text.
    $text = $this->applyMaskPattern(
        $text,
        '/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/',
        'EMAIL'
    );

    // References / IDs (e.g. FA24-001, CUS-999)
    // Must contain at least one letter and one digit. The lookaheads are allowed to
    // cross hyphens so that "CUS-999" is detected, but NOT underscores, so that the
    // inner part of an existing token (e.g. EMAIL_1) is never masked a second time.
    $text = $this->applyMaskPattern(
        $text,
        '/\b(?=[A-Z0-9-]*[0-9])(?=[A-Z0-9-]*[A-Z])[A-Z0-9-_]{4,}\b/i',
        'REF'
    );

    // Credit cards (13-19 digits, various separators)
    // The callback validates the candidate with the Luhn algorithm before masking.
    $text = $this->applyMaskPattern(
        $text,
        '/\b(?:\d[ -]*?){13,19}\b/',
        'CC',
        [$this, 'maskCreditCardCallback']
    );

    // Banking identifiers
    $bankPatterns = [
        [
            'name' => 'IBAN (International Bank Account Number)',
            'regex' => '/\b[A-Z]{2}[0-9]{2}[a-zA-Z0-9]{4,30}\b/',
            'token' => 'IBAN'
        ],
        [
            'name' => 'SWIFT / BIC Codes (8 or 11 characters)',
            'regex' => '/\b[A-Z]{6}[A-Z0-9]{2}([A-Z0-9]{3})?\b/',
            'token' => 'SWIFT'
        ],
        [
            'name' => 'Generic bank account numbers (context-aware)',
            // Only digits preceded by a keyword are matched, to reduce false positives.
            // The whole match (keyword included) is replaced by the token.
            'regex' => '/(?i)(?:account\s+num(?:ber)?|bank\s+acct|acct\s*#)[:\s#]*\b(\d{8,17})\b/',
            'token' => 'BANKACCT'
        ],
    ];

    foreach ($bankPatterns as $pattern) {
        $text = $this->applyMaskPattern($text, $pattern['regex'], $pattern['token']);
    }

    // Vat numbers (EU: 2 letters + 2-12 chars) and other tax identifiers
    $taxPatterns = [
        [
            'name' => 'EU VAT Numbers',
            'regex' => '/\b(AT|BE|BG|CY|CZ|DE|DK|EE|EL|ES|FI|FR|GB|GR|HR|HU|IE|IT|LT|LU|LV|MT|NL|PL|PT|RO|SE|SI|SK)(?![a-z])[0-9A-Z]{2,12}\b/i',
            'token' => 'VAT'
        ],
        [
            'name' => 'Canadian GST/HST Numbers',
            'regex' => '/\b\d{9}\s*RT\s*\d{4}\b/i',
            'token' => 'TAXID'
        ],
        [
            'name' => 'Australian ABN (Australian Business Number)',
            'regex' => '/\b\d{2}\s*\d{3}\s*\d{3}\s*\d{3}\b/',
            'token' => 'TAXID'
        ],
        [
            'name' => 'Norwegian MVA (VAT) Numbers',
            'regex' => '/\b\d{9}\s*MVA\b/i',
            'token' => 'TAXID'
        ],
        [
            'name' => 'Swiss VAT Numbers (MWST/TVA/IVA)',
            'regex' => '/\bCHE-?\d{3}\.?\d{3}\.?\d{3}\s*(MWST|TVA|IVA)\b/i',
            'token' => 'TAXID'
        ],
    ];

    foreach ($taxPatterns as $pattern) {
        $text = $this->applyMaskPattern($text, $pattern['regex'], $pattern['token']);
    }

    // Phone numbers
    $phonePatterns = [
        [
            'name' => 'Generic International Numbers',
            // No word boundary in front of "+": "\b\+" can only succeed when the "+"
            // directly follows a word character, so "+30 210 ..." after a space or at
            // the start of the text was never matched. The "00" prefix keeps its boundary.
            'regex' => '/(?:\+|\b00)[0-9][0-9-.\s()]{8,}\b/',
            'token' => 'PHONE'
        ],
        [
            'name' => 'Greek National Numbers',
            // This pattern is specific to Greece.
            // Landlines: 10 digits starting with '2' (e.g., 210 123 4567).
            // Mobiles: 10 digits starting with '69' (e.g., 698 123 4567).
            // It matches numbers with optional separators like spaces, hyphens, or dots.
            'regex' => '/\b(?:2[0-9-.\s()]{9}|69[0-9-.\s()]{8})\b/',
            'token' => 'PHONE'
        ],
        [
            'name' => 'French National Numbers',
            // Standard 10-digit French numbers: "01 23 45 67 89", "01.23.45.67.89"
            // or "0123456789" (prefixes 01 to 09).
            'regex' => '/\b0[1-9](?:[\s.-]?\d){8}\b/',
            'token' => 'PHONE'
        ],
    ];

    foreach ($phonePatterns as $pattern) {
        $text = $this->applyMaskPattern($text, $pattern['regex'], $pattern['token']);
    }

    // Address patterns
    // The global $langs object is used to get the month names of the current language.
    global $langs;

    // Hardcoded exclusions (English, Greek, French, German, Spanish).
    // They are kept hardcoded because a document may be in English even if the ERP is in Greek.
    $hardcoded_excludes = [
        // English
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
        'Jan', 'Feb', 'Mar', 'Apr', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
        // Greek
        'Ιανουάριος', 'Φεβρουάριος', 'Μάρτιος', 'Απρίλιος', 'Μάιος', 'Ιούνιος',
        'Ιούλιος', 'Αύγουστος', 'Σεπτέμβριος', 'Οκτώβριος', 'Νοέμβριος', 'Δεκέμβριος',
        'Ιαν', 'Φεβ', 'Μαρ', 'Απρ', 'Μαι', 'Ιουν', 'Ιουλ', 'Αυγ', 'Σεπ', 'Οκτ', 'Νοε', 'Δεκ',
        // French
        'Janvier', 'Février', 'Mars', 'Avril', 'Mai', 'Juin',
        'Juillet', 'Août', 'Septembre', 'Octobre', 'Novembre', 'Décembre',
        // German
        'Januar', 'Februar', 'März', 'Juni', 'Juli', 'Oktober', 'Dezember',
        // Spanish
        'Enero', 'Febrero', 'Marzo', 'Abril', 'Mayo', 'Junio',
        'Julio', 'Agosto', 'Septiembre', 'Octubre', 'Noviembre', 'Diciembre',
        // ERP Noise (Common False Positives)
        'Page', 'Pag', 'Vol', 'Volume', 'Inv', 'Invoice', 'Tel', 'Fax', 'Mob',
        'Email', 'Vat', 'Tax', 'Sarl', 'Gmbh', 'Inc', 'Ltd', 'Total', 'Subtotal'
    ];

    // Dynamic translations: month names returned by $langs for the keys
    // Month01..Month12 and MonthShort01..MonthShort12.
    $dynamic_excludes = [];
    if (is_object($langs)) {
        for ($i = 1; $i <= 12; $i++) {
            $key = sprintf("%02d", $i); // 01, 02...
            $dynamic_excludes[] = $langs->trans('Month' . $key); // Full name
            $dynamic_excludes[] = $langs->trans('MonthShort' . $key); // Short name
        }
    }

    // Merge, deduplicate and drop empty entries
    $all_excludes_array = array_filter(array_unique(array_merge($hardcoded_excludes, $dynamic_excludes)));

    // Build the alternation "January|Feb|Μάρτιος|Page...", quoting each word so that
    // no special character can break the regex.
    $excluded_words_regex = implode('|', array_map(
        function (string $word): string {
            return preg_quote($word, '/');
        },
        $all_excludes_array
    ));

    // Address keywords
    $address_keywords = 'Street|St|Road|Rd|Avenue|Ave|Lane|Ln|Boulevard|Blvd|Rue|Via|Strasse|Platz|Drive|Dr|Court|Ct|Way|Plaza|Square|Sq|Οδός|Λεωφόρος|Διεύθυνση|Piazza|Avenida';

    $addressPatterns = [
        [
            'name' => 'Number First (e.g., 123 Main St)',
            'regex' => '/\b\d{1,5}\s+(?:[\p{L}\p{N}\.\'\-]+\s+){1,6}(?:' . $address_keywords . ')\b/ui',
            'token' => 'ADDR'
        ],
        [
            'name' => 'Keyword First (e.g., Rue de la Paix 12)',
            'regex' => '/\b(?:' . $address_keywords . ')\s+(?:[\p{L}\p{N}\.\'\-]+\s+){1,6}\d{1,5}\b/ui',
            'token' => 'ADDR'
        ],
        [
            'name' => 'Name First, Keyword Middle (e.g., Main St 12)',
            'regex' => '/\b(?:[\p{L}\p{N}\.\'\-]+\s+){1,4}(?:' . $address_keywords . ')\s+\d{1,5}\b/ui',
            'token' => 'ADDR'
        ],
        [
            'name' => 'Name First, No Keyword (Strict)',
            // Matches: "ΦΟΡΜΙΩΝΟΣ 101" or "Musterway 12"
            // Ignores: "January 2024", "Page 1", "Invoice 2023"
            // Logic:
            // 1. Negative lookahead: if the word is in the exclusion list, no match.
            // 2. \p{Lu}: the word must start with an uppercase letter (Unicode safe).
            'regex' => '/\b(?!(?:' . $excluded_words_regex . ')\b)\p{Lu}[\p{L}\p{N}\.\'\-]+\s+\d{1,5}\b/u',
            'token' => 'ADDR'
        ],
    ];

    foreach ($addressPatterns as $pattern) {
        $text = $this->applyMaskPattern($text, $pattern['regex'], $pattern['token']);
    }

    // Zip codes
    $zipCodePatterns = [
        [
            'name' => 'UK Postal Codes',
            // Matches UK postcodes like SW1A 0AA, M1 1AA, B33 8TH.
            'regex' => '/\b[A-Z]{1,2}\d[A-Z\d]? ?\d[A-Z]{2}\b/i',
            'token' => 'ZIP'
        ],
        [
            'name' => 'Canadian Postal Codes',
            // Matches Canadian codes like K1A 0B1 or V6A 1H1.
            'regex' => '/\b[A-CEGHJ-NPR-STV-Z]\d[A-CEGHJ-NPR-STV-Z][ -]?\d[A-CEGHJ-NPR-STV-Z]\d\b/i',
            'token' => 'ZIP'
        ],
        [
            'name' => 'French Postal Codes',
            // 5-digit French codes restricted to department prefixes 01-95 and Corsica (2A, 2B).
            // The 90-95 alternative needs three trailing digits: with only two it matched
            // 4-digit numbers (9000-9599) instead of 5-digit postal codes.
            'regex' => '/\b(0[1-9]\d{3}|[1-8]\d{4}|9[0-5]\d{3}|2[AB]\d{3})\b/',
            'token' => 'ZIP'
        ],
        [
            'name' => 'Greek Postal Codes',
            // Matches 5-digit Greek codes (e.g., 115 28). This is a generic 5-digit
            // pattern and may have false positives.
            'regex' => '/\b\d{3}\s?\d{2}\b/',
            'token' => 'ZIP'
        ],
        [
            'name' => 'US ZIP Codes',
            // Matches 5-digit US ZIP codes and ZIP+4 format.
            'regex' => '/\b\d{5}(?:-\d{4})?\b/',
            'token' => 'ZIP'
        ],
    ];

    foreach ($zipCodePatterns as $pattern) {
        $text = $this->applyMaskPattern($text, $pattern['regex'], $pattern['token']);
    }

    return $text;
}

/**
 * Apply one masking regex to a text, replacing every match with a token.
 *
 * preg_replace_callback() returns null when the PCRE engine fails (for example
 * on invalid UTF-8 with the /u modifier). In that case the text is returned as
 * it was before this pattern, instead of losing the whole content.
 *
 * @param   string          $text       Text to process
 * @param   string          $regex      PCRE pattern, delimiters included
 * @param   string          $type       Token type used for each match (e.g. 'EMAIL')
 * @param   callable|null   $callback   Optional custom callback; when null, each match is tokenized with $type
 * @return  string                      Processed text, or $text unchanged if the pattern failed
 */
private function applyMaskPattern($text, $regex, $type, $callback = null)
{
    if ($callback === null) {
        $callback = function (array $m) use ($type) {
            return $this->createToken($m[0], $type);
        };
    }

    $masked = preg_replace_callback($regex, $callback, $text);

    return ($masked === null) ? $text : $masked;
}
459
/**
 * Restore real data from a masked string.
 *
 * Every token recorded in the map is substituted literally by the value it
 * replaced. No quoting or escaping is applied to the restored values.
 *
 * @param   string  $jsonString     String containing [[TYPE_N]] tokens
 * @return  string                  String with the known tokens replaced by their original values
 */
public function unmask($jsonString)
{
    $hasTokens = !empty($this->map);

    if ($hasTokens) {
        // Keys are the tokens, values are the original strings
        $tokens = array_keys($this->map);
        $originals = array_values($this->map);
        $jsonString = str_replace($tokens, $originals, $jsonString);
    }

    return $jsonString;
}
479
/**
 * Callback function for preg_replace_callback to mask credit cards.
 *
 * @param   array   $matches    Match array; index 0 is the full candidate string
 * @return  string              A CC token when the candidate passes the Luhn check, else the candidate unchanged
 */
private function maskCreditCardCallback(array $matches)
{
    $candidate = $matches[0];

    // A digit run that fails the checksum is not a card number: leave it as is
    if (!$this->passesLuhnCheck($candidate)) {
        return $candidate;
    }

    return $this->createToken($candidate, 'CC');
}
497
/**
 * Validates a number string using the Luhn algorithm.
 *
 * Non-digit characters are ignored. The remaining digits must number
 * between 13 and 19 (inclusive) for the check to be attempted.
 *
 * @param   string  $number     Candidate number, separators allowed
 * @return  bool                True if the digits have a valid length and a Luhn sum divisible by 10
 */
private function passesLuhnCheck($number)
{
    // Keep digits only
    $clean = preg_replace('/\D/', '', $number);
    $length = strlen($clean);

    if ($length < 13 || $length > 19) {
        return false;
    }

    // Walk the digits from the rightmost one; position 0 is the check digit
    $total = 0;
    foreach (str_split(strrev($clean)) as $position => $char) {
        $value = (int) $char;

        // Every second digit (counting from the right) is doubled
        if ($position % 2 === 1) {
            $value *= 2;
            // Equivalent to adding the two digits of the doubled value
            if ($value > 9) {
                $value -= 9;
            }
        }

        $total += $value;
    }

    return ($total % 10) === 0;
}
537
/**
 * Unmasks a string from an AI response, handling cases where the AI might
 * have stripped the [[ and ]] delimiters from the tokens.
 *
 * Intact tokens are restored first through unmask(). Bare tokens (e.g. REF_1)
 * are then restored in one single pass. A bare token is only accepted when it
 * is not followed by another digit, so REF_1 is never replaced inside REF_10.
 *
 * @param   string  $text   Text returned by the AI
 * @return  string          Text with both intact and bare tokens replaced by their original values
 */
public function unmaskAiResponse($text)
{
    if (empty($this->map)) {
        return $text;
    }

    // Standard unmasking of intact [[TYPE_N]] tokens
    $text = $this->unmask($text);

    // Map each bare token (full token without the 2 leading and 2 trailing brackets)
    // to its original value, e.g. '[[REF_1]]' gives 'REF_1'.
    $bareTokens = [];
    foreach ($this->map as $fullToken => $originalValue) {
        $bareTokens[substr($fullToken, 2, -2)] = $originalValue;
    }

    $alternatives = [];
    foreach (array_keys($bareTokens) as $bareToken) {
        $alternatives[] = preg_quote((string) $bareToken, '/');
    }

    // The negative lookahead rejects a match that is only the prefix of a longer
    // token number. Replacing sequentially with str_replace() turned "REF_10" into
    // "<value of REF_1>0".
    $pattern = '/(?:' . implode('|', $alternatives) . ')(?![0-9])/';

    $restored = preg_replace_callback(
        $pattern,
        function (array $m) use ($bareTokens) {
            return $bareTokens[$m[0]];
        },
        $text
    );

    // preg_replace_callback() returns null on engine failure: keep what was already restored
    return ($restored === null) ? $text : $restored;
}
569
/**
 * Create a unique token and store the original value in the map.
 *
 * The counter is shared by all token types, so each token number is used once
 * per map.
 *
 * @param   string  $value  Original value to remember
 * @param   string  $type   Token type prefix (e.g. 'EMAIL')
 * @return  string          Token of the form [[TYPE_N]]
 */
private function createToken($value, $type)
{
    $sequence = ++$this->index;

    // Format: [[EMAIL_1]]
    $placeholder = '[[' . $type . '_' . $sequence . ']]';
    $this->map[$placeholder] = $value;

    return $placeholder;
}
585}
Class to manage privacy data masking and unmasking.
mask($text)
Mask sensitive GDPR data in the query.
maskCreditCardCallback(array $matches)
Callback function for preg_replace_callback to mask credit cards.
unmaskAiResponse($text)
Unmasks a string from an AI response, handling cases where the AI might have stripped the [[ and ]] delimiters.
passesLuhnCheck($number)
Validates a number string using the Luhn algorithm.
unmask($jsonString)
Restore real data from a masked string.
createToken($value, $type)
Create a unique token and store the original value in the map.