idna_convert.class.php 47 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212
  1. <?php
  2. namespace WGSSWITCHEPP\IDNA;
  3. // {{{ license
  4. /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 foldmethod=marker: */
  5. //
  6. // +----------------------------------------------------------------------+
  7. // | This library is free software; you can redistribute it and/or modify |
  8. // | it under the terms of the GNU Lesser General Public License as |
  9. // | published by the Free Software Foundation; either version 2.1 of the |
  10. // | License, or (at your option) any later version. |
  11. // | |
  12. // | This library is distributed in the hope that it will be useful, but |
  13. // | WITHOUT ANY WARRANTY; without even the implied warranty of |
  14. // | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
  15. // | Lesser General Public License for more details. |
  16. // | |
  17. // | You should have received a copy of the GNU Lesser General Public |
  18. // | License along with this library; if not, write to the Free Software |
  19. // | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 |
  20. // | USA. |
  21. // +----------------------------------------------------------------------+
  22. //
  23. // }}}
  24. /**
  25. * Encode/decode Internationalized Domain Names.
  26. *
  27. * The class allows to convert internationalized domain names
  28. * (see RFC 3490 for details) as they can be used with various registries worldwide
  29. * to be translated between their original (localized) form and their encoded form
  30. * as it will be used in the DNS (Domain Name System).
  31. *
  32. * The class provides two public methods, encode() and decode(), which do exactly
  33. * what you would expect them to do. You are allowed to use complete domain names,
  34. * simple strings and complete email addresses as well. That means, that you might
  35. * use any of the following notations:
  36. *
  37. * - www.nörgler.com
  38. * - xn--nrgler-wxa
  39. * - xn--brse-5qa.xn--knrz-1ra.info
  40. *
  41. * Unicode input might be given as either UTF-8 string, UCS-4 string or UCS-4
  42. * array. Unicode output is available in the same formats.
  43. * You can select your preferred format via {@link set_parameter()}.
  44. *
  45. * ACE input and output is always expected to be ASCII.
  46. *
  47. * @author Matthias Sommerfeld <mso@phlylabs.de>
  48. * @copyright 2004-2007 phlyLabs Berlin, http://phlylabs.de
  49. * @version 0.5.1
  50. *
  51. */
  52. class idna_convert {
  53. /**
  54. * Holds all relevant mapping tables, loaded from a separate file on construct
  55. * See RFC3454 for details
  56. *
  57. * @var array
  58. * @access private
  59. */
  60. var $NP = array();
  61. // Internal settings, do not mess with them
  62. var $_punycode_prefix = 'xn--';
  63. var $_invalid_ucs = 0x80000000;
  64. var $_max_ucs = 0x10FFFF;
  65. var $_base = 36;
  66. var $_tmin = 1;
  67. var $_tmax = 26;
  68. var $_skew = 38;
  69. var $_damp = 700;
  70. var $_initial_bias = 72;
  71. var $_initial_n = 0x80;
  72. var $_sbase = 0xAC00;
  73. var $_lbase = 0x1100;
  74. var $_vbase = 0x1161;
  75. var $_tbase = 0x11A7;
  76. var $_lcount = 19;
  77. var $_vcount = 21;
  78. var $_tcount = 28;
  79. var $_ncount = 588; // _vcount * _tcount
  80. var $_scount = 11172; // _lcount * _tcount * _vcount
  81. var $_error = false;
  82. // See {@link set_parameter()} for details of how to change the following
  83. // settings from within your script / application
  84. var $_api_encoding = 'utf8'; // Default input charset is UTF-8
  85. var $_allow_overlong = false; // Overlong UTF-8 encodings are forbidden
  86. var $_strict_mode = false; // Behave strict or not
  87. // The constructor
  88. function idna_convert($options = false) {
  89. $this->slast = $this->_sbase + $this->_lcount * $this->_vcount * $this->_tcount;
  90. if (function_exists('file_get_contents')) {
  91. $this->NP = unserialize(file_get_contents(dirname(__FILE__) . '/npdata.ser'));
  92. } else {
  93. $this->NP = unserialize(join('', file(dirname(__FILE__) . '/npdata.ser')));
  94. }
  95. // If parameters are given, pass these to the respective method
  96. if (is_array($options)) {
  97. return $this->set_parameter($options);
  98. }
  99. return true;
  100. }
  101. /**
  102. * Sets a new option value. Available options and values:
  103. * [encoding - Use either UTF-8, UCS4 as array or UCS4 as string as input ('utf8' for UTF-8,
  104. * 'ucs4_string' and 'ucs4_array' respectively for UCS4); The output is always UTF-8]
  105. * [overlong - Unicode does not allow unnecessarily long encodings of chars,
  106. * to allow this, set this parameter to true, else to false;
  107. * default is false.]
  108. * [strict - true: strict mode, good for registration purposes - Causes errors
  109. * on failures; false: loose mode, ideal for "wildlife" applications
  110. * by silently ignoring errors and returning the original input instead
  111. *
  112. * @param mixed Parameter to set (string: single parameter; array of Parameter => Value pairs)
  113. * @param string Value to use (if parameter 1 is a string)
  114. * @return boolean true on success, false otherwise
  115. * @access public
  116. */
  117. function set_parameter($option, $value = false) {
  118. if (!is_array($option)) {
  119. $option = array($option => $value);
  120. }
  121. foreach ($option as $k => $v) {
  122. switch ($k) {
  123. case 'encoding':
  124. switch ($v) {
  125. case 'utf8':
  126. case 'ucs4_string':
  127. case 'ucs4_array':
  128. $this->_api_encoding = $v;
  129. break;
  130. default:
  131. $this->_error('Set Parameter: Unknown parameter ' . $v . ' for option ' . $k);
  132. return false;
  133. }
  134. break;
  135. case 'overlong':
  136. $this->_allow_overlong = ($v) ? true : false;
  137. break;
  138. case 'strict':
  139. $this->_strict_mode = ($v) ? true : false;
  140. break;
  141. default:
  142. $this->_error('Set Parameter: Unknown option ' . $k);
  143. return false;
  144. }
  145. }
  146. return true;
  147. }
  148. /**
  149. * Decode a given ACE domain name
  150. * @param string Domain name (ACE string)
  151. * [@param string Desired output encoding, see {@link set_parameter}]
  152. * @return string Decoded Domain name (UTF-8 or UCS-4)
  153. * @access public
  154. */
  155. function decode($input, $one_time_encoding = false) {
  156. // Optionally set
  157. if ($one_time_encoding) {
  158. switch ($one_time_encoding) {
  159. case 'utf8':
  160. case 'ucs4_string':
  161. case 'ucs4_array':
  162. break;
  163. default:
  164. $this->_error('Unknown encoding ' . $one_time_encoding);
  165. return false;
  166. }
  167. }
  168. // Make sure to drop any newline characters around
  169. $input = trim($input);
  170. // Negotiate input and try to determine, whether it is a plain string,
  171. // an email address or something like a complete URL
  172. if (strpos($input, '@')) { // Maybe it is an email address
  173. // No no in strict mode
  174. if ($this->_strict_mode) {
  175. $this->_error('Only simple domain name parts can be handled in strict mode');
  176. return false;
  177. }
  178. list ($email_pref, $input) = explode('@', $input, 2);
  179. $arr = explode('.', $input);
  180. foreach ($arr as $k => $v) {
  181. if (preg_match('!^' . preg_quote($this->_punycode_prefix, '!') . '!', $v)) {
  182. $conv = $this->_decode($v);
  183. if ($conv)
  184. $arr[$k] = $conv;
  185. }
  186. }
  187. $input = join('.', $arr);
  188. $arr = explode('.', $email_pref);
  189. foreach ($arr as $k => $v) {
  190. if (preg_match('!^' . preg_quote($this->_punycode_prefix, '!') . '!', $v)) {
  191. $conv = $this->_decode($v);
  192. if ($conv)
  193. $arr[$k] = $conv;
  194. }
  195. }
  196. $email_pref = join('.', $arr);
  197. $return = $email_pref . '@' . $input;
  198. } elseif (preg_match('![:\./]!', $input)) { // Or a complete domain name (with or without paths / parameters)
  199. // No no in strict mode
  200. if ($this->_strict_mode) {
  201. $this->_error('Only simple domain name parts can be handled in strict mode');
  202. return false;
  203. }
  204. $parsed = parse_url($input);
  205. if (isset($parsed['host'])) {
  206. $arr = explode('.', $parsed['host']);
  207. foreach ($arr as $k => $v) {
  208. $conv = $this->_decode($v);
  209. if ($conv)
  210. $arr[$k] = $conv;
  211. }
  212. $parsed['host'] = join('.', $arr);
  213. $return = (empty($parsed['scheme']) ? '' : $parsed['scheme'] . (strtolower($parsed['scheme']) == 'mailto' ? ':' : '://'))
  214. . (empty($parsed['user']) ? '' : $parsed['user'] . (empty($parsed['pass']) ? '' : ':' . $parsed['pass']) . '@')
  215. . $parsed['host']
  216. . (empty($parsed['port']) ? '' : ':' . $parsed['port'])
  217. . (empty($parsed['path']) ? '' : $parsed['path'])
  218. . (empty($parsed['query']) ? '' : '?' . $parsed['query'])
  219. . (empty($parsed['fragment']) ? '' : '#' . $parsed['fragment']);
  220. } else { // parse_url seems to have failed, try without it
  221. $arr = explode('.', $input);
  222. foreach ($arr as $k => $v) {
  223. $conv = $this->_decode($v);
  224. $arr[$k] = ($conv) ? $conv : $v;
  225. }
  226. $return = join('.', $arr);
  227. }
  228. } else { // Otherwise we consider it being a pure domain name string
  229. $return = $this->_decode($input);
  230. if (!$return)
  231. $return = $input;
  232. }
  233. // The output is UTF-8 by default, other output formats need conversion here
  234. // If one time encoding is given, use this, else the objects property
  235. switch (($one_time_encoding) ? $one_time_encoding : $this->_api_encoding) {
  236. case 'utf8':
  237. return $return;
  238. break;
  239. case 'ucs4_string':
  240. return $this->_ucs4_to_ucs4_string($this->_utf8_to_ucs4($return));
  241. break;
  242. case 'ucs4_array':
  243. return $this->_utf8_to_ucs4($return);
  244. break;
  245. default:
  246. $this->_error('Unsupported output format');
  247. return false;
  248. }
  249. }
  250. /**
  251. * Encode a given UTF-8 domain name
  252. * @param string Domain name (UTF-8 or UCS-4)
  253. * [@param string Desired input encoding, see {@link set_parameter}]
  254. * @return string Encoded Domain name (ACE string)
  255. * @access public
  256. */
  257. function encode($decoded, $one_time_encoding = false) {
  258. // Forcing conversion of input to UCS4 array
  259. // If one time encoding is given, use this, else the objects property
  260. switch ($one_time_encoding ? $one_time_encoding : $this->_api_encoding) {
  261. case 'utf8':
  262. $decoded = $this->_utf8_to_ucs4($decoded);
  263. break;
  264. case 'ucs4_string':
  265. $decoded = $this->_ucs4_string_to_ucs4($decoded);
  266. case 'ucs4_array':
  267. break;
  268. default:
  269. $this->_error('Unsupported input format: ' . ($one_time_encoding ? $one_time_encoding : $this->_api_encoding));
  270. return false;
  271. }
  272. // No input, no output, what else did you expect?
  273. if (empty($decoded))
  274. return '';
  275. // Anchors for iteration
  276. $last_begin = 0;
  277. // Output string
  278. $output = '';
  279. foreach ($decoded as $k => $v) {
  280. // Make sure to use just the plain dot
  281. switch ($v) {
  282. case 0x3002:
  283. case 0xFF0E:
  284. case 0xFF61:
  285. $decoded[$k] = 0x2E;
  286. // Right, no break here, the above are converted to dots anyway
  287. // Stumbling across an anchoring character
  288. case 0x2E:
  289. case 0x2F:
  290. case 0x3A:
  291. case 0x3F:
  292. case 0x40:
  293. // Neither email addresses nor URLs allowed in strict mode
  294. if ($this->_strict_mode) {
  295. $this->_error('Neither email addresses nor URLs are allowed in strict mode.');
  296. return false;
  297. } else {
  298. // Skip first char
  299. if ($k) {
  300. $encoded = '';
  301. $encoded = $this->_encode(array_slice($decoded, $last_begin, (($k) - $last_begin)));
  302. if ($encoded) {
  303. $output .= $encoded;
  304. } else {
  305. $output .= $this->_ucs4_to_utf8(array_slice($decoded, $last_begin, (($k) - $last_begin)));
  306. }
  307. $output .= chr($decoded[$k]);
  308. }
  309. $last_begin = $k + 1;
  310. }
  311. }
  312. }
  313. // Catch the rest of the string
  314. if ($last_begin) {
  315. $inp_len = sizeof($decoded);
  316. $encoded = '';
  317. $encoded = $this->_encode(array_slice($decoded, $last_begin, (($inp_len) - $last_begin)));
  318. if ($encoded) {
  319. $output .= $encoded;
  320. } else {
  321. $output .= $this->_ucs4_to_utf8(array_slice($decoded, $last_begin, (($inp_len) - $last_begin)));
  322. }
  323. return $output;
  324. } else {
  325. if ($output = $this->_encode($decoded)) {
  326. return $output;
  327. } else {
  328. return $this->_ucs4_to_utf8($decoded);
  329. }
  330. }
  331. }
  332. /**
  333. * Use this method to get the last error ocurred
  334. * @param void
  335. * @return string The last error, that occured
  336. * @access public
  337. */
  338. function get_last_error() {
  339. return $this->_error;
  340. }
  341. /**
  342. * The actual decoding algorithm
  343. * @access private
  344. */
  345. function _decode($encoded) {
  346. // We do need to find the Punycode prefix
  347. if (!preg_match('!^' . preg_quote($this->_punycode_prefix, '!') . '!', $encoded)) {
  348. $this->_error('This is not a punycode string');
  349. return false;
  350. }
  351. $encode_test = preg_replace('!^' . preg_quote($this->_punycode_prefix, '!') . '!', '', $encoded);
  352. // If nothing left after removing the prefix, it is hopeless
  353. if (!$encode_test) {
  354. $this->_error('The given encoded string was empty');
  355. return false;
  356. }
  357. // Find last occurence of the delimiter
  358. $delim_pos = strrpos($encoded, '-');
  359. if ($delim_pos > strlen($this->_punycode_prefix)) {
  360. for ($k = strlen($this->_punycode_prefix); $k < $delim_pos; ++$k) {
  361. $decoded[] = ord($encoded[$k]);
  362. }
  363. } else {
  364. $decoded = array();
  365. }
  366. $deco_len = count($decoded);
  367. $enco_len = strlen($encoded);
  368. // Wandering through the strings; init
  369. $is_first = true;
  370. $bias = $this->_initial_bias;
  371. $idx = 0;
  372. $char = $this->_initial_n;
  373. for ($enco_idx = ($delim_pos) ? ($delim_pos + 1) : 0; $enco_idx < $enco_len; ++$deco_len) {
  374. for ($old_idx = $idx, $w = 1, $k = $this->_base; 1; $k += $this->_base) {
  375. $digit = $this->_decode_digit($encoded[$enco_idx++]);
  376. $idx += $digit * $w;
  377. $t = ($k <= $bias) ? $this->_tmin :
  378. (($k >= $bias + $this->_tmax) ? $this->_tmax : ($k - $bias));
  379. if ($digit < $t)
  380. break;
  381. $w = (int) ($w * ($this->_base - $t));
  382. }
  383. $bias = $this->_adapt($idx - $old_idx, $deco_len + 1, $is_first);
  384. $is_first = false;
  385. $char += (int) ($idx / ($deco_len + 1));
  386. $idx %= ($deco_len + 1);
  387. if ($deco_len > 0) {
  388. // Make room for the decoded char
  389. for ($i = $deco_len; $i > $idx; $i--) {
  390. $decoded[$i] = $decoded[($i - 1)];
  391. }
  392. }
  393. $decoded[$idx++] = $char;
  394. }
  395. return $this->_ucs4_to_utf8($decoded);
  396. }
  397. /**
  398. * The actual encoding algorithm
  399. * @access private
  400. */
  401. function _encode($decoded) {
  402. // We cannot encode a domain name containing the Punycode prefix
  403. $extract = strlen($this->_punycode_prefix);
  404. $check_pref = $this->_utf8_to_ucs4($this->_punycode_prefix);
  405. $check_deco = array_slice($decoded, 0, $extract);
  406. if ($check_pref == $check_deco) {
  407. $this->_error('This is already a punycode string');
  408. return false;
  409. }
  410. // We will not try to encode strings consisting of basic code points only
  411. $encodable = false;
  412. foreach ($decoded as $k => $v) {
  413. if ($v > 0x7a) {
  414. $encodable = true;
  415. break;
  416. }
  417. }
  418. if (!$encodable) {
  419. $this->_error('The given string does not contain encodable chars');
  420. return false;
  421. }
  422. // Do NAMEPREP
  423. $decoded = $this->_nameprep($decoded);
  424. if (!$decoded || !is_array($decoded))
  425. return false; // NAMEPREP failed
  426. $deco_len = count($decoded);
  427. if (!$deco_len)
  428. return false; // Empty array
  429. $codecount = 0; // How many chars have been consumed
  430. $encoded = '';
  431. // Copy all basic code points to output
  432. for ($i = 0; $i < $deco_len; ++$i) {
  433. $test = $decoded[$i];
  434. // Will match [-0-9a-zA-Z]
  435. if ((0x2F < $test && $test < 0x40) || (0x40 < $test && $test < 0x5B) || (0x60 < $test && $test <= 0x7B) || (0x2D == $test)) {
  436. $encoded .= chr($decoded[$i]);
  437. $codecount++;
  438. }
  439. }
  440. if ($codecount == $deco_len)
  441. return $encoded; // All codepoints were basic ones
  442. // Start with the prefix; copy it to output
  443. $encoded = $this->_punycode_prefix . $encoded;
  444. // If we have basic code points in output, add an hyphen to the end
  445. if ($codecount)
  446. $encoded .= '-';
  447. // Now find and encode all non-basic code points
  448. $is_first = true;
  449. $cur_code = $this->_initial_n;
  450. $bias = $this->_initial_bias;
  451. $delta = 0;
  452. while ($codecount < $deco_len) {
  453. // Find the smallest code point >= the current code point and
  454. // remember the last ouccrence of it in the input
  455. for ($i = 0, $next_code = $this->_max_ucs; $i < $deco_len; $i++) {
  456. if ($decoded[$i] >= $cur_code && $decoded[$i] <= $next_code) {
  457. $next_code = $decoded[$i];
  458. }
  459. }
  460. $delta += ($next_code - $cur_code) * ($codecount + 1);
  461. $cur_code = $next_code;
  462. // Scan input again and encode all characters whose code point is $cur_code
  463. for ($i = 0; $i < $deco_len; $i++) {
  464. if ($decoded[$i] < $cur_code) {
  465. $delta++;
  466. } elseif ($decoded[$i] == $cur_code) {
  467. for ($q = $delta, $k = $this->_base; 1; $k += $this->_base) {
  468. $t = ($k <= $bias) ? $this->_tmin :
  469. (($k >= $bias + $this->_tmax) ? $this->_tmax : $k - $bias);
  470. if ($q < $t)
  471. break;
  472. $encoded .= $this->_encode_digit(intval($t + (($q - $t) % ($this->_base - $t)))); //v0.4.5 Changed from ceil() to intval()
  473. $q = (int) (($q - $t) / ($this->_base - $t));
  474. }
  475. $encoded .= $this->_encode_digit($q);
  476. $bias = $this->_adapt($delta, $codecount + 1, $is_first);
  477. $codecount++;
  478. $delta = 0;
  479. $is_first = false;
  480. }
  481. }
  482. $delta++;
  483. $cur_code++;
  484. }
  485. return $encoded;
  486. }
  487. /**
  488. * Adapt the bias according to the current code point and position
  489. * @access private
  490. */
  491. function _adapt($delta, $npoints, $is_first) {
  492. $delta = intval($is_first ? ($delta / $this->_damp) : ($delta / 2));
  493. $delta += intval($delta / $npoints);
  494. for ($k = 0; $delta > (($this->_base - $this->_tmin) * $this->_tmax) / 2; $k += $this->_base) {
  495. $delta = intval($delta / ($this->_base - $this->_tmin));
  496. }
  497. return intval($k + ($this->_base - $this->_tmin + 1) * $delta / ($delta + $this->_skew));
  498. }
  499. /**
  500. * Encoding a certain digit
  501. * @access private
  502. */
  503. function _encode_digit($d) {
  504. return chr($d + 22 + 75 * ($d < 26));
  505. }
  506. /**
  507. * Decode a certain digit
  508. * @access private
  509. */
  510. function _decode_digit($cp) {
  511. $cp = ord($cp);
  512. return ($cp - 48 < 10) ? $cp - 22 : (($cp - 65 < 26) ? $cp - 65 : (($cp - 97 < 26) ? $cp - 97 : $this->_base));
  513. }
  514. /**
  515. * Internal error handling method
  516. * @access private
  517. */
  518. function _error($error = '') {
  519. $this->_error = $error;
  520. }
  521. /**
  522. * Do Nameprep according to RFC3491 and RFC3454
  523. * @param array Unicode Characters
  524. * @return string Unicode Characters, Nameprep'd
  525. * @access private
  526. */
  527. function _nameprep($input) {
  528. $output = array();
  529. $error = false;
  530. //
  531. // Mapping
  532. // Walking through the input array, performing the required steps on each of
  533. // the input chars and putting the result into the output array
  534. // While mapping required chars we apply the cannonical ordering
  535. foreach ($input as $v) {
  536. // Map to nothing == skip that code point
  537. if (in_array($v, $this->NP['map_nothing']))
  538. continue;
  539. // Try to find prohibited input
  540. if (in_array($v, $this->NP['prohibit']) || in_array($v, $this->NP['general_prohibited'])) {
  541. $this->_error('NAMEPREP: Prohibited input U+' . sprintf('%08X', $v));
  542. return false;
  543. }
  544. foreach ($this->NP['prohibit_ranges'] as $range) {
  545. if ($range[0] <= $v && $v <= $range[1]) {
  546. $this->_error('NAMEPREP: Prohibited input U+' . sprintf('%08X', $v));
  547. return false;
  548. }
  549. }
  550. //
  551. // Hangul syllable decomposition
  552. if (0xAC00 <= $v && $v <= 0xD7AF) {
  553. foreach ($this->_hangul_decompose($v) as $out) {
  554. $output[] = (int) $out;
  555. }
  556. // There's a decomposition mapping for that code point
  557. } elseif (isset($this->NP['replacemaps'][$v])) {
  558. foreach ($this->_apply_cannonical_ordering($this->NP['replacemaps'][$v]) as $out) {
  559. $output[] = (int) $out;
  560. }
  561. } else {
  562. $output[] = (int) $v;
  563. }
  564. }
  565. // Before applying any Combining, try to rearrange any Hangul syllables
  566. $output = $this->_hangul_compose($output);
  567. //
  568. // Combine code points
  569. //
  570. $last_class = 0;
  571. $last_starter = 0;
  572. $out_len = count($output);
  573. for ($i = 0; $i < $out_len; ++$i) {
  574. $class = $this->_get_combining_class($output[$i]);
  575. if ((!$last_class || $last_class > $class) && $class) {
  576. // Try to match
  577. $seq_len = $i - $last_starter;
  578. $out = $this->_combine(array_slice($output, $last_starter, $seq_len));
  579. // On match: Replace the last starter with the composed character and remove
  580. // the now redundant non-starter(s)
  581. if ($out) {
  582. $output[$last_starter] = $out;
  583. if (count($out) != $seq_len) {
  584. for ($j = $i + 1; $j < $out_len; ++$j) {
  585. $output[$j - 1] = $output[$j];
  586. }
  587. unset($output[$out_len]);
  588. }
  589. // Rewind the for loop by one, since there can be more possible compositions
  590. $i--;
  591. $out_len--;
  592. $last_class = ($i == $last_starter) ? 0 : $this->_get_combining_class($output[$i - 1]);
  593. continue;
  594. }
  595. }
  596. // The current class is 0
  597. if (!$class)
  598. $last_starter = $i;
  599. $last_class = $class;
  600. }
  601. return $output;
  602. }
  603. /**
  604. * Decomposes a Hangul syllable
  605. * (see http://www.unicode.org/unicode/reports/tr15/#Hangul
  606. * @param integer 32bit UCS4 code point
  607. * @return array Either Hangul Syllable decomposed or original 32bit value as one value array
  608. * @access private
  609. */
  610. function _hangul_decompose($char) {
  611. $sindex = (int) $char - $this->_sbase;
  612. if ($sindex < 0 || $sindex >= $this->_scount) {
  613. return array($char);
  614. }
  615. $result = array();
  616. $result[] = (int) $this->_lbase + $sindex / $this->_ncount;
  617. $result[] = (int) $this->_vbase + ($sindex % $this->_ncount) / $this->_tcount;
  618. $T = intval($this->_tbase + $sindex % $this->_tcount);
  619. if ($T != $this->_tbase)
  620. $result[] = $T;
  621. return $result;
  622. }
  623. /**
  624. * Ccomposes a Hangul syllable
  625. * (see http://www.unicode.org/unicode/reports/tr15/#Hangul
  626. * @param array Decomposed UCS4 sequence
  627. * @return array UCS4 sequence with syllables composed
  628. * @access private
  629. */
  630. function _hangul_compose($input) {
  631. $inp_len = count($input);
  632. if (!$inp_len)
  633. return array();
  634. $result = array();
  635. $last = (int) $input[0];
  636. $result[] = $last; // copy first char from input to output
  637. for ($i = 1; $i < $inp_len; ++$i) {
  638. $char = (int) $input[$i];
  639. $sindex = $last - $this->_sbase;
  640. $lindex = $last - $this->_lbase;
  641. $vindex = $char - $this->_vbase;
  642. $tindex = $char - $this->_tbase;
  643. // Find out, whether two current characters are LV and T
  644. if (0 <= $sindex && $sindex < $this->_scount && ($sindex % $this->_tcount == 0) && 0 <= $tindex && $tindex <= $this->_tcount) {
  645. // create syllable of form LVT
  646. $last += $tindex;
  647. $result[(count($result) - 1)] = $last; // reset last
  648. continue; // discard char
  649. }
  650. // Find out, whether two current characters form L and V
  651. if (0 <= $lindex && $lindex < $this->_lcount && 0 <= $vindex && $vindex < $this->_vcount) {
  652. // create syllable of form LV
  653. $last = (int) $this->_sbase + ($lindex * $this->_vcount + $vindex) * $this->_tcount;
  654. $result[(count($result) - 1)] = $last; // reset last
  655. continue; // discard char
  656. }
  657. // if neither case was true, just add the character
  658. $last = $char;
  659. $result[] = $char;
  660. }
  661. return $result;
  662. }
  663. /**
  664. * Returns the combining class of a certain wide char
  665. * @param integer Wide char to check (32bit integer)
  666. * @return integer Combining class if found, else 0
  667. * @access private
  668. */
  669. function _get_combining_class($char) {
  670. return isset($this->NP['norm_combcls'][$char]) ? $this->NP['norm_combcls'][$char] : 0;
  671. }
  672. /**
  673. * Apllies the cannonical ordering of a decomposed UCS4 sequence
  674. * @param array Decomposed UCS4 sequence
  675. * @return array Ordered USC4 sequence
  676. * @access private
  677. */
  678. function _apply_cannonical_ordering($input) {
  679. $swap = true;
  680. $size = count($input);
  681. while ($swap) {
  682. $swap = false;
  683. $last = $this->_get_combining_class(intval($input[0]));
  684. for ($i = 0; $i < $size - 1; ++$i) {
  685. $next = $this->_get_combining_class(intval($input[$i + 1]));
  686. if ($next != 0 && $last > $next) {
  687. // Move item leftward until it fits
  688. for ($j = $i + 1; $j > 0; --$j) {
  689. if ($this->_get_combining_class(intval($input[$j - 1])) <= $next)
  690. break;
  691. $t = intval($input[$j]);
  692. $input[$j] = intval($input[$j - 1]);
  693. $input[$j - 1] = $t;
  694. $swap = true;
  695. }
  696. // Reentering the loop looking at the old character again
  697. $next = $last;
  698. }
  699. $last = $next;
  700. }
  701. }
  702. return $input;
  703. }
  704. /**
  705. * Do composition of a sequence of starter and non-starter
  706. * @param array UCS4 Decomposed sequence
  707. * @return array Ordered USC4 sequence
  708. * @access private
  709. */
  710. function _combine($input) {
  711. $inp_len = count($input);
  712. foreach ($this->NP['replacemaps'] as $np_src => $np_target) {
  713. if ($np_target[0] != $input[0])
  714. continue;
  715. if (count($np_target) != $inp_len)
  716. continue;
  717. $hit = false;
  718. foreach ($input as $k2 => $v2) {
  719. if ($v2 == $np_target[$k2]) {
  720. $hit = true;
  721. } else {
  722. $hit = false;
  723. break;
  724. }
  725. }
  726. if ($hit)
  727. return $np_src;
  728. }
  729. return false;
  730. }
  /**
   * Convert an UTF-8 encoded string to its UCS-4 representation.
   *
   * By talking about UCS-4 "strings" we mean arrays of 32bit integers representing
   * each of the "chars". This is due to PHP not being able to handle strings with
   * bit depth different from 8. This applies to the reverse method _ucs4_to_utf8(), too.
   * The following UTF-8 encodings are supported:
   *   bytes bits representation
   *   1      7   0xxxxxxx
   *   2     11   110xxxxx 10xxxxxx
   *   3     16   1110xxxx 10xxxxxx 10xxxxxx
   *   4     21   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   *   5     26   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   *   6     31   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   * Each x represents a bit that can be used to store character data.
   * The five and six byte sequences are part of Annex D of ISO/IEC 10646-1:2000
   *
   * Malformed input is reported through $this->_error() and makes the method
   * return false. This includes input that ends in the middle of a multi-byte
   * sequence, which used to be accepted silently and produced a bogus value.
   *
   * @param string $input UTF-8 encoded byte string
   * @return mixed Array of integer code points, or false on malformed input
   * @access private
   */
  function _utf8_to_ucs4($input) {
      $output = array();
      $out_len = 0;
      $inp_len = strlen($input);
      $mode = 'next'; // 'next': a start byte is expected; 'add': continuation bytes are expected
      $test = 'none'; // 'range': the first continuation byte still has to pass the range check
      for ($k = 0; $k < $inp_len; ++$k) {
          $v = ord($input[$k]); // Extract byte from input string
          if ($v < 128) { // ASCII char - goes into the output as is
              if ('add' == $mode) {
                  // An ASCII byte may not interrupt a multi-byte sequence
                  $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte ' . $k);
                  return false;
              }
              $output[$out_len] = $v;
              ++$out_len;
              continue;
          }
          if ('next' == $mode) { // Start byte: determine the width of the Unicode char
              $start_byte = $v;
              $mode = 'add';
              $test = 'range';
              // $next_byte tells how many times the following continuation
              // byte must be shifted by 6 bits to the left
              if ($v >> 5 == 6) { // 110xxxxx 10xxxxxx
                  $next_byte = 0;
                  $v = ($v - 192) << 6;
              } elseif ($v >> 4 == 14) { // 1110xxxx 10xxxxxx 10xxxxxx
                  $next_byte = 1;
                  $v = ($v - 224) << 12;
              } elseif ($v >> 3 == 30) { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                  $next_byte = 2;
                  $v = ($v - 240) << 18;
              } elseif ($v >> 2 == 62) { // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                  $next_byte = 3;
                  $v = ($v - 248) << 24;
              } elseif ($v >> 1 == 126) { // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                  $next_byte = 4;
                  $v = ($v - 252) << 30;
              } else {
                  $this->_error('This might be UTF-8, but I don\'t understand it at byte ' . $k);
                  return false;
              }
              // Store the bits contributed by the start byte; continuation bytes add to it
              $output[$out_len] = (int) $v;
              ++$out_len;
              continue;
          }
          // From here on $mode is 'add': $v has to be a continuation byte
          if (!$this->_allow_overlong && $test == 'range') {
              $test = 'none';
              if (($v < 0xA0 && $start_byte == 0xE0) || ($v < 0x90 && $start_byte == 0xF0) || ($v > 0x8F && $start_byte == 0xF4)) {
                  $this->_error('Bogus UTF-8 character detected (out of legal range) at byte ' . $k);
                  return false;
              }
          }
          if ($v >> 6 != 2) { // Bit mask must be 10xxxxxx
              $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte ' . $k);
              return false;
          }
          $output[$out_len - 1] += ($v - 128) << ($next_byte * 6);
          --$next_byte;
          if ($next_byte < 0) {
              $mode = 'next'; // Sequence complete
          }
      } // for
      if ('add' == $mode) {
          // Input ended while continuation bytes were still outstanding
          $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte ' . $k);
          return false;
      }
      return $output;
  }
  /**
   * Convert a UCS-4 array (list of integer code points) into an UTF-8 string.
   *
   * Values below 128 are emitted as a single byte. Larger values are written
   * as a lead byte followed by one to five continuation bytes, depending on
   * their magnitude. Values of 1 << 31 and above are reported through
   * $this->_error() and make the method return false.
   *
   * @param array $input UCS-4 code points
   * @return mixed UTF-8 encoded string, or false if a value is too large
   * @access private
   */
  function _ucs4_to_utf8($input) {
      $utf8 = '';
      $position = 0; // 1-based position of the current value, used in the error message
      foreach ($input as $code) {
          ++$position;
          if ($code < 0x80) { // 7bit are transferred literally
              $utf8 .= chr($code);
              continue;
          }
          // Pick the lead byte marker and the number of continuation bytes
          if ($code < 0x800) {
              $lead = 192;
              $trail = 1;
          } elseif ($code < 0x10000) {
              $lead = 224;
              $trail = 2;
          } elseif ($code < 0x200000) {
              $lead = 240;
              $trail = 3;
          } elseif ($code < 0x4000000) {
              $lead = 248;
              $trail = 4;
          } elseif ($code < (1 << 31)) {
              $lead = 252;
              $trail = 5;
          } else {
              $this->_error('Conversion from UCS-4 to UTF-8 failed: malformed input at byte ' . $position);
              return false;
          }
          // Lead byte carries the topmost bits, every continuation byte the next 6 bits
          $utf8 .= chr($lead + ($code >> (6 * $trail)));
          for ($shift = 6 * ($trail - 1); $shift >= 0; $shift -= 6) {
              $utf8 .= chr(128 + (($code >> $shift) & 63));
          }
      }
      return $utf8;
  }
  /**
   * Convert a UCS-4 array into a UCS-4 string.
   *
   * Every array value becomes four bytes, most significant byte first.
   *
   * @param array $input UCS-4 code points
   * @return string Byte string with four bytes per input value
   * @access private
   */
  function _ucs4_to_ucs4_string($input) {
      $packed = '';
      foreach ($input as $code_point) {
          // Emit bits 31-24, 23-16, 15-8 and 7-0; the mask 255 keeps one byte each
          for ($shift = 24; $shift >= 0; $shift -= 8) {
              $packed .= chr(($code_point >> $shift) & 255);
          }
      }
      return $packed;
  }
  /**
   * Convert a UCS-4 string into a UCS-4 array.
   *
   * The input is read in groups of four bytes, most significant byte first.
   * A length that is not a multiple of four is reported through
   * $this->_error() and makes the method return false.
   *
   * @param string $input Byte string with four bytes per code point
   * @return mixed Array of integer code points (empty for empty input), or false on broken input
   * @access private
   */
  function _ucs4_string_to_ucs4($input) {
      $length = strlen($input);
      // Input length must be divisible by 4
      if ($length % 4) {
          $this->_error('Input UCS4 string is broken');
          return false;
      }
      $code_points = array();
      for ($offset = 0; $offset < $length; $offset += 4) {
          $value = 0;
          for ($byte = 0; $byte < 4; ++$byte) {
              // First byte of the group is shifted by 24 bits, the last one by 0
              $value += ord($input[$offset + $byte]) << (8 * (3 - $byte));
          }
          $code_points[] = $value;
      }
      return $code_points;
  }
  892. }
  /**
   * Adapter class for aligning the API of idna_convert with that of Net_IDNA
   * @author Matthias Sommerfeld <mso@phlylabs.de>
   */
  class Net_IDNA_php4 extends idna_convert {
      /**
       * Sets a new option value by forwarding both arguments to
       * set_parameters() on the object held in $this->IC.
       *
       * Options and values as listed by the original author:
       * [encoding - Use either UTF-8, UCS4 as array or UCS4 as string as input ('utf8' for UTF-8,
       *   'ucs4_string' and 'ucs4_array' respectively for UCS4); The output is always UTF-8]
       * [overlong - Unicode does not allow unnecessarily long encodings of chars,
       *   to allow this, set this parameter to true, else to false;
       *   default is false.]
       * [strict - true: strict mode, good for registration purposes - Causes errors
       *   on failures; false: loose mode, ideal for "wildlife" applications
       *   by silently ignoring errors and returning the original input instead]
       *
       * NOTE(review): neither the IC property nor set_parameters() is defined in
       * the part of the file reviewed here - confirm that both exist.
       *
       * @param mixed $option Parameter to set (string: single parameter; array of Parameter => Value pairs)
       * @param string $param Value to use (if parameter 1 is a string)
       * @return mixed Whatever set_parameters() returns (documented by the author as boolean)
       * @access public
       */
      function setParams($option, $param = false) {
          $converter = $this->IC;
          return $converter->set_parameters($option, $param);
      }
  }
  /**
   * Static helpers for converting text between Windows-1252 / ISO-8859-1 and
   * UTF-8 and for repairing text in which both encodings are mixed.
   */
  class owncloud_Encoding {
      // Values accepted as $option by toWin1252() and fixUTF8()
      const ICONV_TRANSLIT = "TRANSLIT"; // use iconv() with the //TRANSLIT suffix
      const ICONV_IGNORE = "IGNORE";     // use iconv() with the //IGNORE suffix
      const WITHOUT_ICONV = "";          // do not use iconv() at all

      // Windows-1252 byte value (128-159) => UTF-8 byte sequence
      protected static $win1252ToUtf8 = array(
          128 => "\xe2\x82\xac",
          130 => "\xe2\x80\x9a",
          131 => "\xc6\x92",
          132 => "\xe2\x80\x9e",
          133 => "\xe2\x80\xa6",
          134 => "\xe2\x80\xa0",
          135 => "\xe2\x80\xa1",
          136 => "\xcb\x86",
          137 => "\xe2\x80\xb0",
          138 => "\xc5\xa0",
          139 => "\xe2\x80\xb9",
          140 => "\xc5\x92",
          142 => "\xc5\xbd",
          145 => "\xe2\x80\x98",
          146 => "\xe2\x80\x99",
          147 => "\xe2\x80\x9c",
          148 => "\xe2\x80\x9d",
          149 => "\xe2\x80\xa2",
          150 => "\xe2\x80\x93",
          151 => "\xe2\x80\x94",
          152 => "\xcb\x9c",
          153 => "\xe2\x84\xa2",
          154 => "\xc5\xa1",
          155 => "\xe2\x80\xba",
          156 => "\xc5\x93",
          158 => "\xc5\xbe",
          159 => "\xc5\xb8"
      );

      // Two-byte sequence 0xC2 0x80-0x9F => UTF-8 sequence it is replaced with
      // by UTF8FixWin1252Chars()
      protected static $brokenUtf8ToUtf8 = array(
          "\xc2\x80" => "\xe2\x82\xac",
          "\xc2\x82" => "\xe2\x80\x9a",
          "\xc2\x83" => "\xc6\x92",
          "\xc2\x84" => "\xe2\x80\x9e",
          "\xc2\x85" => "\xe2\x80\xa6",
          "\xc2\x86" => "\xe2\x80\xa0",
          "\xc2\x87" => "\xe2\x80\xa1",
          "\xc2\x88" => "\xcb\x86",
          "\xc2\x89" => "\xe2\x80\xb0",
          "\xc2\x8a" => "\xc5\xa0",
          "\xc2\x8b" => "\xe2\x80\xb9",
          "\xc2\x8c" => "\xc5\x92",
          "\xc2\x8e" => "\xc5\xbd",
          "\xc2\x91" => "\xe2\x80\x98",
          "\xc2\x92" => "\xe2\x80\x99",
          "\xc2\x93" => "\xe2\x80\x9c",
          "\xc2\x94" => "\xe2\x80\x9d",
          "\xc2\x95" => "\xe2\x80\xa2",
          "\xc2\x96" => "\xe2\x80\x93",
          "\xc2\x97" => "\xe2\x80\x94",
          "\xc2\x98" => "\xcb\x9c",
          "\xc2\x99" => "\xe2\x84\xa2",
          "\xc2\x9a" => "\xc5\xa1",
          "\xc2\x9b" => "\xe2\x80\xba",
          "\xc2\x9c" => "\xc5\x93",
          "\xc2\x9e" => "\xc5\xbe",
          "\xc2\x9f" => "\xc5\xb8"
      );

      // UTF-8 byte sequence => single Windows-1252 byte (0x80-0x9F)
      protected static $utf8ToWin1252 = array(
          "\xe2\x82\xac" => "\x80",
          "\xe2\x80\x9a" => "\x82",
          "\xc6\x92" => "\x83",
          "\xe2\x80\x9e" => "\x84",
          "\xe2\x80\xa6" => "\x85",
          "\xe2\x80\xa0" => "\x86",
          "\xe2\x80\xa1" => "\x87",
          "\xcb\x86" => "\x88",
          "\xe2\x80\xb0" => "\x89",
          "\xc5\xa0" => "\x8a",
          "\xe2\x80\xb9" => "\x8b",
          "\xc5\x92" => "\x8c",
          "\xc5\xbd" => "\x8e",
          "\xe2\x80\x98" => "\x91",
          "\xe2\x80\x99" => "\x92",
          "\xe2\x80\x9c" => "\x93",
          "\xe2\x80\x9d" => "\x94",
          "\xe2\x80\xa2" => "\x95",
          "\xe2\x80\x93" => "\x96",
          "\xe2\x80\x94" => "\x97",
          "\xcb\x9c" => "\x98",
          "\xe2\x84\xa2" => "\x99",
          "\xc5\xa1" => "\x9a",
          "\xe2\x80\xba" => "\x9b",
          "\xc5\x93" => "\x9c",
          "\xc5\xbe" => "\x9e",
          "\xc5\xb8" => "\x9f"
      );

      /**
       * Convert text to UTF-8, leaving byte sequences that already look like
       * UTF-8 untouched.
       *
       * Arrays are converted element by element (recursively); values that are
       * neither array nor string are returned unchanged.
       *
       * @param mixed $text String, array of strings, or any other value
       * @return mixed Converted value of the same kind as the input
       */
      public static function toUTF8($text) {
          if (is_array($text)) {
              foreach ($text as $k => $v) {
                  $text[$k] = self::toUTF8($v);
              }
              return $text;
          }
          if (!is_string($text)) {
              return $text;
          }
          // Take the length in bytes even when mbstring overloads strlen()
          if (function_exists('mb_strlen') && ((int) ini_get('mbstring.func_overload')) & 2) {
              $max = mb_strlen($text, '8bit');
          } else {
              $max = strlen($text);
          }
          $buf = "";
          for ($i = 0; $i < $max; $i++) {
              $c1 = $text[$i];
              $code = ord($c1);
              if ($code >= 0xC0) { // Should be converted to UTF8, if it's not UTF8 already
                  // Number of continuation bytes this byte announces if it is a lead byte
                  if ($code <= 0xDF) {
                      $trail = 1; // looks like 2 bytes UTF8
                  } elseif ($code <= 0xEF) {
                      $trail = 2; // looks like 3 bytes UTF8
                  } elseif ($code <= 0xF7) {
                      $trail = 3; // looks like 4 bytes UTF8
                  } else {
                      $trail = 0; // doesn't look like UTF8, but should be converted
                  }
                  $sequence = $trail > 0 ? self::readContinuationBytes($text, $i + 1, $trail, $max) : false;
                  if ($sequence !== false) { // yeah, almost sure it's UTF8 already
                      $buf .= $c1 . $sequence;
                      $i += $trail;
                  } else { // not valid UTF8. Convert it.
                      $buf .= self::latin1ByteToUtf8($c1);
                  }
              } elseif (($code & 0xC0) == 0x80) { // stray 0x80-0xBF byte: needs conversion
                  if (isset(self::$win1252ToUtf8[$code])) { // found in Windows-1252 special cases
                      $buf .= self::$win1252ToUtf8[$code];
                  } else {
                      $buf .= self::latin1ByteToUtf8($c1);
                  }
              } else { // ASCII: it doesn't need conversion
                  $buf .= $c1;
              }
          }
          return $buf;
      }

      /**
       * Read $count continuation bytes (0x80-0xBF) starting at offset $start.
       *
       * @param string $text Byte string to read from
       * @param int $start Offset of the first expected continuation byte
       * @param int $count Number of continuation bytes expected
       * @param int $max Length of $text in bytes
       * @return mixed The bytes read, or false if the text is too short or a
       *               byte is not a continuation byte
       */
      protected static function readContinuationBytes($text, $start, $count, $max) {
          if ($start + $count > $max) {
              return false;
          }
          $bytes = '';
          for ($j = $start; $j < $start + $count; $j++) {
              $code = ord($text[$j]);
              if ($code < 0x80 || $code > 0xBF) {
                  return false;
              }
              $bytes .= $text[$j];
          }
          return $bytes;
      }

      /**
       * Encode a single byte with a value of 0x80 or above as the two-byte
       * UTF-8 sequence of the code point with the same number.
       *
       * Integer shifts are used instead of a float division so that no float
       * is handed to chr().
       *
       * @param string $byte Single byte, 0x80-0xFF
       * @return string Two-byte UTF-8 sequence
       */
      protected static function latin1ByteToUtf8($byte) {
          $code = ord($byte);
          return chr(0xC0 | ($code >> 6)) . chr(0x80 | ($code & 0x3F));
      }

      /**
       * Convert text from UTF-8 to Windows-1252.
       *
       * Arrays are converted element by element (recursively); values that are
       * neither array nor string are returned unchanged.
       *
       * @param mixed $text String, array of strings, or any other value
       * @param string $option One of the ICONV_* / WITHOUT_ICONV constants
       * @return mixed Converted value
       */
      public static function toWin1252($text, $option = self::WITHOUT_ICONV) {
          if (is_array($text)) {
              foreach ($text as $k => $v) {
                  $text[$k] = self::toWin1252($v, $option);
              }
              return $text;
          }
          if (is_string($text)) {
              return static::utf8_decode($text, $option);
          }
          return $text;
      }

      /**
       * Alias of toWin1252() with the default option.
       */
      public static function toISO8859($text) {
          return self::toWin1252($text);
      }

      /**
       * Alias of toWin1252() with the default option.
       */
      public static function toLatin1($text) {
          return self::toWin1252($text);
      }

      /**
       * Repair text by repeatedly decoding it to Windows-1252 and converting it
       * back to UTF-8 until the result no longer changes, followed by one more
       * such round trip.
       *
       * @param mixed $text String or array of strings (arrays are handled recursively)
       * @param string $option One of the ICONV_* / WITHOUT_ICONV constants
       * @return mixed Repaired text
       */
      public static function fixUTF8($text, $option = self::WITHOUT_ICONV) {
          if (is_array($text)) {
              foreach ($text as $k => $v) {
                  $text[$k] = self::fixUTF8($v, $option);
              }
              return $text;
          }
          $last = "";
          // Strict comparison: only identical strings count as "no more change"
          while ($last !== $text) {
              $last = $text;
              $text = self::toUTF8(static::utf8_decode($text, $option));
          }
          $text = self::toUTF8(static::utf8_decode($text, $option));
          return $text;
      }

      /**
       * If you received an UTF-8 string that was converted from Windows-1252 as it was ISO8859-1
       * (ignoring Windows-1252 chars from 80 to 9F) use this function to fix it.
       * See: http://en.wikipedia.org/wiki/Windows-1252
       *
       * @param mixed $text Text to fix (passed to str_replace() as subject)
       * @return mixed Text with the sequences listed in $brokenUtf8ToUtf8 replaced
       */
      public static function UTF8FixWin1252Chars($text) {
          return str_replace(array_keys(self::$brokenUtf8ToUtf8), array_values(self::$brokenUtf8ToUtf8), $text);
      }

      /**
       * Strip a leading UTF-8 byte order mark (EF BB BF), if present.
       *
       * @param string $str Input string
       * @return string Input without the leading byte order mark
       */
      public static function removeBOM($str = "") {
          if (substr($str, 0, 3) == pack("CCC", 0xef, 0xbb, 0xbf)) {
              $str = substr($str, 3);
          }
          return $str;
      }

      /**
       * Map an encoding label to 'UTF-8' or 'ISO-8859-1'.
       *
       * The label is upper-cased and stripped of everything except letters,
       * digits and whitespace before the lookup; unknown labels yield 'UTF-8'.
       *
       * @param string $encodingLabel Label such as 'utf8', 'latin-1' or 'Windows-1252'
       * @return string 'UTF-8' or 'ISO-8859-1'
       */
      public static function normalizeEncoding($encodingLabel) {
          $encoding = strtoupper($encodingLabel);
          $encoding = preg_replace('/[^a-zA-Z0-9\s]/', '', $encoding);
          $equivalences = array(
              'ISO88591' => 'ISO-8859-1',
              'ISO8859' => 'ISO-8859-1',
              'ISO' => 'ISO-8859-1',
              'LATIN1' => 'ISO-8859-1',
              'LATIN' => 'ISO-8859-1',
              'UTF8' => 'UTF-8',
              'UTF' => 'UTF-8',
              'WIN1252' => 'ISO-8859-1',
              'WINDOWS1252' => 'ISO-8859-1'
          );
          if (empty($equivalences[$encoding])) {
              return 'UTF-8';
          }
          return $equivalences[$encoding];
      }

      /**
       * Convert text to the encoding named by $encodingLabel.
       *
       * The label is resolved with normalizeEncoding(), so the result is either
       * UTF-8 (toUTF8) or Latin-1 (toLatin1). The conversions are called on this
       * class; no class named "Encoding" is defined in the reviewed source.
       *
       * @param string $encodingLabel Target encoding label
       * @param mixed $text Text to convert
       * @return mixed Converted text
       */
      public static function encode($encodingLabel, $text) {
          $encodingLabel = self::normalizeEncoding($encodingLabel);
          if ($encodingLabel == 'ISO-8859-1') {
              return self::toLatin1($text);
          }
          // normalizeEncoding() only ever yields 'ISO-8859-1' or 'UTF-8'
          return self::toUTF8($text);
      }

      /**
       * Decode UTF-8 text to Windows-1252.
       *
       * Without iconv (WITHOUT_ICONV, or iconv() missing) the text is first run
       * through toUTF8(), the sequences listed in $utf8ToWin1252 are replaced by
       * their single-byte form and the result is passed to mb_convert_encoding().
       * Otherwise iconv() is used, with a //TRANSLIT or //IGNORE suffix when the
       * matching option is given.
       *
       * @param string $text UTF-8 text
       * @param string $option One of the ICONV_* / WITHOUT_ICONV constants
       * @return mixed Decoded text as returned by mb_convert_encoding() / iconv()
       */
      protected static function utf8_decode($text, $option) {
          if ($option == self::WITHOUT_ICONV || !function_exists('iconv')) {
              $o = mb_convert_encoding(
                  str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), self::toUTF8($text)),
                  'ISO-8859-1',
                  'UTF-8'
              );
          } else {
              $o = iconv("UTF-8", "Windows-1252" . ($option == self::ICONV_TRANSLIT ? '//TRANSLIT' : ($option == self::ICONV_IGNORE ? '//IGNORE' : '')), $text);
          }
          return $o;
      }
  }
  1158. ?>