Punycode.php 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555
  1. <?php
  2. // {{{ license
  3. /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 foldmethod=marker: */
  4. //
  5. // +----------------------------------------------------------------------+
  6. // | This library is free software; you can redistribute it and/or modify |
  7. // | it under the terms of the GNU Lesser General Public License as |
  8. // | published by the Free Software Foundation; either version 2.1 of the |
  9. // | License, or (at your option) any later version. |
  10. // | |
  11. // | This library is distributed in the hope that it will be useful, but |
  12. // | WITHOUT ANY WARRANTY; without even the implied warranty of |
  13. // | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
  14. // | Lesser General Public License for more details. |
  15. // | |
  16. // | You should have received a copy of the GNU Lesser General Public |
  17. // | License along with this library; if not, write to the Free Software |
  18. // | Foundation, Inc., 51 Franklin St, Boston, MA 02110, United States |
  19. // +----------------------------------------------------------------------+
  20. //
  21. // }}}
  22. /*
  23. * @author Matthias Sommerfeld <mso@phlylabs.de>
  24. * @copyright 2004-2016 phlyLabs Berlin, http://phlylabs.de
  25. * @version 1.0.1 2016-01-24
  26. */
  27. namespace Mso\IdnaConvert;
  28. class Punycode implements PunycodeInterface
  29. {
  30. // Internal settings, do not touch!
  31. const punycodePrefix = 'xn--';
  32. const invalidUcs = 0x80000000;
  33. const maxUcs = 0x10FFFF;
  34. const base = 36;
  35. const tMin = 1;
  36. const tMax = 26;
  37. const skew = 38;
  38. const damp = 700;
  39. const initialBias = 72;
  40. const initialN = 0x80;
  41. const sBase = 0xAC00;
  42. const lBase = 0x1100;
  43. const vBase = 0x1161;
  44. const tBase = 0x11A7;
  45. const lCount = 19;
  46. const vCount = 21;
  47. const tCount = 28;
  48. const nCount = 588; // vCount * tCount
  49. const sCount = 11172; // lCount * tCount * vCount
  50. const sLast = self::sBase + self::lCount * self::vCount * self::tCount;
  51. protected static $isMbStringOverload = null;
  52. protected $NamePrepData;
  53. protected $UnicodeTranscoder;
  54. /**
  55. * the constructor
  56. *
  57. * @param $NamePrepData NamePrepDataInterface inject NamePrepData object
  58. * @param $UnicodeTranscoder UnicodeTranscoderInterface inject Unicode Transcoder
  59. * @since 0.5.2
  60. */
  61. public function __construct(NamePrepDataInterface $NamePrepData, UnicodeTranscoderInterface $UnicodeTranscoder)
  62. {
  63. // populate mbstring overloading cache if not set
  64. if (self::$isMbStringOverload === null) {
  65. self::$isMbStringOverload = (extension_loaded('mbstring') && (ini_get('mbstring.func_overload') & 0x02) === 0x02);
  66. }
  67. $this->NamePrepData = $NamePrepData;
  68. $this->UnicodeTranscoder = $UnicodeTranscoder;
  69. }
  70. /**
  71. * Returns the used prefix for punycode-encoded strings
  72. * @return string
  73. */
  74. public function getPunycodePrefix()
  75. {
  76. return self::punycodePrefix;
  77. }
  78. /**
  79. * Checks, whether or not the provided string is a valid punycode string
  80. * @param string $encoded
  81. * @return boolean
  82. */
  83. public function validate($encoded) {
  84. // Check for existence of the prefix
  85. if (strpos($encoded, self::punycodePrefix) !== 0) {
  86. return false;
  87. }
  88. // If nothing is left after the prefix, it is hopeless
  89. if (strlen(trim($encoded)) <= strlen(self::punycodePrefix)) {
  90. return false;
  91. }
  92. return true;
  93. }
  94. /**
  95. * The actual decoding algorithm
  96. * @param string
  97. * @return mixed
  98. */
  99. public function decode($encoded)
  100. {
  101. if (!$this->validate($encoded)) {
  102. return false;
  103. }
  104. $decoded = [];
  105. // Find last occurence of the delimiter
  106. $delim_pos = strrpos($encoded, '-');
  107. if ($delim_pos > self::byteLength(self::punycodePrefix)) {
  108. for ($k = self::byteLength(self::punycodePrefix); $k < $delim_pos; ++$k) {
  109. $decoded[] = ord($encoded{$k});
  110. }
  111. }
  112. $deco_len = count($decoded);
  113. $enco_len = self::byteLength($encoded);
  114. // Wandering through the strings; init
  115. $is_first = true;
  116. $bias = self::initialBias;
  117. $idx = 0;
  118. $char = self::initialN;
  119. for ($enco_idx = ($delim_pos) ? ($delim_pos + 1) : 0; $enco_idx < $enco_len; ++$deco_len) {
  120. for ($old_idx = $idx, $w = 1, $k = self::base; 1; $k += self::base) {
  121. $digit = $this->decodeDigit($encoded{$enco_idx++});
  122. $idx += $digit * $w;
  123. $t = ($k <= $bias) ? self::tMin :
  124. (($k >= $bias + self::tMax) ? self::tMax : ($k - $bias));
  125. if ($digit < $t) {
  126. break;
  127. }
  128. $w = (int) ($w * (self::base - $t));
  129. }
  130. $bias = $this->adapt($idx - $old_idx, $deco_len + 1, $is_first);
  131. $is_first = false;
  132. $char += (int) ($idx / ($deco_len + 1));
  133. $idx %= ($deco_len + 1);
  134. if ($deco_len > 0) {
  135. // Make room for the decoded char
  136. for ($i = $deco_len; $i > $idx; $i--) {
  137. $decoded[$i] = $decoded[($i - 1)];
  138. }
  139. }
  140. $decoded[$idx++] = $char;
  141. }
  142. return $this->UnicodeTranscoder->ucs4array_utf8($decoded);
  143. }
  144. /**
  145. * The actual encoding algorithm
  146. * @param array $decoded
  147. * @return mixed
  148. */
  149. public function encode($decoded)
  150. {
  151. // We cannot encode a domain name containing the Punycode prefix
  152. $extract = self::byteLength(self::punycodePrefix);
  153. $check_pref = $this->UnicodeTranscoder->utf8_ucs4array(self::punycodePrefix);
  154. $check_deco = array_slice($decoded, 0, $extract);
  155. if ($check_pref == $check_deco) {
  156. throw new \InvalidArgumentException('This is already a Punycode string');
  157. }
  158. // We will not try to encode strings consisting of basic code points only
  159. $encodable = false;
  160. foreach ($decoded as $k => $v) {
  161. if ($v > 0x7a) {
  162. $encodable = true;
  163. break;
  164. }
  165. }
  166. if (!$encodable) {
  167. return false;
  168. }
  169. // Do NAMEPREP
  170. $decoded = $this->namePrep($decoded);
  171. if (!$decoded || !is_array($decoded)) {
  172. return false; // NAMEPREP failed
  173. }
  174. $deco_len = count($decoded);
  175. if (!$deco_len) {
  176. return false; // Empty array
  177. }
  178. $codecount = 0; // How many chars have been consumed
  179. $encoded = '';
  180. // Copy all basic code points to output
  181. for ($i = 0; $i < $deco_len; ++$i) {
  182. $test = $decoded[$i];
  183. // Will match [-0-9a-zA-Z]
  184. if ((0x2F < $test && $test < 0x40)
  185. || (0x40 < $test && $test < 0x5B)
  186. || (0x60 < $test && $test <= 0x7B)
  187. || (0x2D == $test)) {
  188. $encoded .= chr($decoded[$i]);
  189. $codecount++;
  190. }
  191. }
  192. if ($codecount == $deco_len) {
  193. return $encoded; // All codepoints were basic ones
  194. }
  195. // Start with the prefix; copy it to output
  196. $encoded = self::punycodePrefix . $encoded;
  197. // If we have basic code points in output, add an hyphen to the end
  198. if ($codecount) {
  199. $encoded .= '-';
  200. }
  201. // Now find and encode all non-basic code points
  202. $is_first = true;
  203. $cur_code = self::initialN;
  204. $bias = self::initialBias;
  205. $delta = 0;
  206. while ($codecount < $deco_len) {
  207. // Find the smallest code point >= the current code point and
  208. // remember the last ouccrence of it in the input
  209. for ($i = 0, $next_code = self::maxUcs; $i < $deco_len; $i++) {
  210. if ($decoded[$i] >= $cur_code && $decoded[$i] <= $next_code) {
  211. $next_code = $decoded[$i];
  212. }
  213. }
  214. $delta += ($next_code - $cur_code) * ($codecount + 1);
  215. $cur_code = $next_code;
  216. // Scan input again and encode all characters whose code point is $cur_code
  217. for ($i = 0; $i < $deco_len; $i++) {
  218. if ($decoded[$i] < $cur_code) {
  219. $delta++;
  220. } elseif ($decoded[$i] == $cur_code) {
  221. for ($q = $delta, $k = self::base; 1; $k += self::base) {
  222. $t = ($k <= $bias)
  223. ? self::tMin
  224. : (($k >= $bias + self::tMax) ? self::tMax : $k - $bias);
  225. if ($q < $t) {
  226. break;
  227. }
  228. $encoded .= $this->encodeDigit(intval($t + (($q - $t) % (self::base - $t))));
  229. $q = (int) (($q - $t) / (self::base - $t));
  230. }
  231. $encoded .= $this->encodeDigit($q);
  232. $bias = $this->adapt($delta, $codecount + 1, $is_first);
  233. $codecount++;
  234. $delta = 0;
  235. $is_first = false;
  236. }
  237. }
  238. $delta++;
  239. $cur_code++;
  240. }
  241. return $encoded;
  242. }
  243. /**
  244. * Adapt the bias according to the current code point and position
  245. * @param int $delta
  246. * @param int $npoints
  247. * @param int $is_first
  248. * @return int
  249. */
  250. protected function adapt($delta, $npoints, $is_first)
  251. {
  252. $delta = intval($is_first ? ($delta / self::damp) : ($delta / 2));
  253. $delta += intval($delta / $npoints);
  254. for ($k = 0; $delta > ((self::base - self::tMin) * self::tMax) / 2; $k += self::base) {
  255. $delta = intval($delta / (self::base - self::tMin));
  256. }
  257. return intval($k + (self::base - self::tMin + 1) * $delta / ($delta + self::skew));
  258. }
  259. /**
  260. * Encoding a certain digit
  261. * @param int $d
  262. * @return string
  263. */
  264. protected function encodeDigit($d)
  265. {
  266. return chr($d + 22 + 75 * ($d < 26));
  267. }
  268. /**
  269. * Decode a certain digit
  270. * @param int $cp
  271. * @return int
  272. */
  273. protected function decodeDigit($cp)
  274. {
  275. $cp = ord($cp);
  276. if ($cp - 48 < 10) {
  277. return $cp - 22;
  278. }
  279. if ($cp - 65 < 26) {
  280. return $cp - 65;
  281. }
  282. if ($cp - 97 < 26) {
  283. return $cp - 97;
  284. }
  285. return self::base;
  286. }
  287. /**
  288. * Do Nameprep according to RFC3491 and RFC3454
  289. * @param array $input Unicode Characters
  290. * @return string Unicode Characters, Nameprep'd
  291. */
  292. protected function namePrep($input)
  293. {
  294. $output = [];
  295. //
  296. // Mapping
  297. // Walking through the input array, performing the required steps on each of
  298. // the input chars and putting the result into the output array
  299. // While mapping required chars we apply the canonical ordering
  300. foreach ($input as $v) {
  301. // Map to nothing == skip that code point
  302. if (in_array($v, $this->NamePrepData->mapToNothing)) {
  303. continue;
  304. }
  305. // Try to find prohibited input
  306. if (in_array($v, $this->NamePrepData->prohibit) || in_array($v, $this->NamePrepData->generalProhibited)) {
  307. throw new \InvalidArgumentException(sprintf('NAMEPREP: Prohibited input U+%08X', $v));
  308. }
  309. foreach ($this->NamePrepData->prohibitRanges as $range) {
  310. if ($range[0] <= $v && $v <= $range[1]) {
  311. throw new \InvalidArgumentException(sprintf('NAMEPREP: Prohibited input U+%08X', $v));
  312. }
  313. }
  314. if (0xAC00 <= $v && $v <= 0xD7AF) {
  315. // Hangul syllable decomposition
  316. foreach ($this->hangulDecompose($v) as $out) {
  317. $output[] = (int) $out;
  318. }
  319. } elseif (isset($this->NamePrepData->replaceMaps[$v])) {
  320. foreach ($this->applyCanonicalOrdering($this->NamePrepData->replaceMaps[$v]) as $out) {
  321. $output[] = (int) $out;
  322. }
  323. } else {
  324. $output[] = (int) $v;
  325. }
  326. }
  327. // Before applying any Combining, try to rearrange any Hangul syllables
  328. $output = $this->hangulCompose($output);
  329. //
  330. // Combine code points
  331. //
  332. $last_class = 0;
  333. $last_starter = 0;
  334. $out_len = count($output);
  335. for ($i = 0; $i < $out_len; ++$i) {
  336. $class = $this->getCombiningClass($output[$i]);
  337. if ((!$last_class || $last_class > $class) && $class) {
  338. // Try to match
  339. $seq_len = $i - $last_starter;
  340. $out = $this->combine(array_slice($output, $last_starter, $seq_len));
  341. // On match: Replace the last starter with the composed character and remove
  342. // the now redundant non-starter(s)
  343. if ($out) {
  344. $output[$last_starter] = $out;
  345. if (count($out) != $seq_len) {
  346. for ($j = $i + 1; $j < $out_len; ++$j) {
  347. $output[$j - 1] = $output[$j];
  348. }
  349. unset($output[$out_len]);
  350. }
  351. // Rewind the for loop by one, since there can be more possible compositions
  352. $i--;
  353. $out_len--;
  354. $last_class = ($i == $last_starter) ? 0 : $this->getCombiningClass($output[$i - 1]);
  355. continue;
  356. }
  357. }
  358. // The current class is 0
  359. if (!$class) {
  360. $last_starter = $i;
  361. }
  362. $last_class = $class;
  363. }
  364. return $output;
  365. }
  366. /**
  367. * Decomposes a Hangul syllable
  368. * (see http://www.unicode.org/unicode/reports/tr15/#Hangul
  369. * @param integer 32bit UCS4 code point
  370. * @return array Either Hangul Syllable decomposed or original 32bit value as one value array
  371. */
  372. protected function hangulDecompose($char)
  373. {
  374. $sindex = (int) $char - self::sBase;
  375. if ($sindex < 0 || $sindex >= self::sCount) {
  376. return [$char];
  377. }
  378. $result = [];
  379. $result[] = (int) self::lBase + $sindex / self::nCount;
  380. $result[] = (int) self::vBase + ($sindex % self::nCount) / self::tCount;
  381. $T = intval(self::tBase + $sindex % self::tCount);
  382. if ($T != self::tBase) {
  383. $result[] = $T;
  384. }
  385. return $result;
  386. }
  387. /**
  388. * Ccomposes a Hangul syllable
  389. * (see http://www.unicode.org/unicode/reports/tr15/#Hangul
  390. * @param array $input Decomposed UCS4 sequence
  391. * @return array UCS4 sequence with syllables composed
  392. */
  393. protected function hangulCompose($input)
  394. {
  395. $inp_len = count($input);
  396. if (!$inp_len) {
  397. return [];
  398. }
  399. $result = [];
  400. $last = (int) $input[0];
  401. $result[] = $last; // copy first char from input to output
  402. for ($i = 1; $i < $inp_len; ++$i) {
  403. $char = (int) $input[$i];
  404. $sindex = $last - self::sBase;
  405. $lindex = $last - self::lBase;
  406. $vindex = $char - self::vBase;
  407. $tindex = $char - self::tBase;
  408. // Find out, whether two current characters are LV and T
  409. if (0 <= $sindex && $sindex < self::sCount && ($sindex % self::tCount == 0) && 0 <= $tindex && $tindex <= self::tCount) {
  410. // create syllable of form LVT
  411. $last += $tindex;
  412. $result[(count($result) - 1)] = $last; // reset last
  413. continue; // discard char
  414. }
  415. // Find out, whether two current characters form L and V
  416. if (0 <= $lindex && $lindex < self::lCount && 0 <= $vindex && $vindex < self::vCount) {
  417. // create syllable of form LV
  418. $last = (int) self::sBase + ($lindex * self::vCount + $vindex) * self::tCount;
  419. $result[(count($result) - 1)] = $last; // reset last
  420. continue; // discard char
  421. }
  422. // if neither case was true, just add the character
  423. $last = $char;
  424. $result[] = $char;
  425. }
  426. return $result;
  427. }
  428. /**
  429. * Returns the combining class of a certain wide char
  430. * @param integer $char Wide char to check (32bit integer)
  431. * @return integer Combining class if found, else 0
  432. */
  433. protected function getCombiningClass($char)
  434. {
  435. return isset($this->NamePrepData->normalizeCombiningClasses[$char])
  436. ? $this->NamePrepData->normalizeCombiningClasses[$char]
  437. : 0;
  438. }
  439. /**
  440. * Applies the canonical ordering of a decomposed UCS4 sequence
  441. * @param array $input Decomposed UCS4 sequence
  442. * @return array Ordered USC4 sequence
  443. */
  444. protected function applyCanonicalOrdering($input)
  445. {
  446. $swap = true;
  447. $size = count($input);
  448. while ($swap) {
  449. $swap = false;
  450. $last = $this->getCombiningClass(intval($input[0]));
  451. for ($i = 0; $i < $size - 1; ++$i) {
  452. $next = $this->getCombiningClass(intval($input[$i + 1]));
  453. if ($next != 0 && $last > $next) {
  454. // Move item leftward until it fits
  455. for ($j = $i + 1; $j > 0; --$j) {
  456. if ($this->getCombiningClass(intval($input[$j - 1])) <= $next) {
  457. break;
  458. }
  459. $t = intval($input[$j]);
  460. $input[$j] = intval($input[$j - 1]);
  461. $input[$j - 1] = $t;
  462. $swap = true;
  463. }
  464. // Reentering the loop looking at the old character again
  465. $next = $last;
  466. }
  467. $last = $next;
  468. }
  469. }
  470. return $input;
  471. }
  472. /**
  473. * Do composition of a sequence of starter and non-starter
  474. * @param array $input UCS4 Decomposed sequence
  475. * @return array Ordered USC4 sequence
  476. */
  477. protected function combine($input)
  478. {
  479. $inp_len = count($input);
  480. if (0 == $inp_len) {
  481. return false;
  482. }
  483. foreach ($this->NamePrepData->replaceMaps as $np_src => $np_target) {
  484. if ($np_target[0] != $input[0]) {
  485. continue;
  486. }
  487. if (count($np_target) != $inp_len) {
  488. continue;
  489. }
  490. $hit = false;
  491. foreach ($input as $k2 => $v2) {
  492. if ($v2 == $np_target[$k2]) {
  493. $hit = true;
  494. } else {
  495. $hit = false;
  496. break;
  497. }
  498. }
  499. if ($hit) {
  500. return $np_src;
  501. }
  502. }
  503. return false;
  504. }
  505. /**
  506. * Gets the length of a string in bytes even if mbstring function
  507. * overloading is turned on
  508. *
  509. * @param string $string the string for which to get the length.
  510. * @return integer the length of the string in bytes.
  511. */
  512. protected static function byteLength($string)
  513. {
  514. if (self::$isMbStringOverload) {
  515. return mb_strlen($string, '8bit');
  516. }
  517. return strlen((binary) $string);
  518. }
  519. }