UnicodeTranscoder.php 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358
  1. <?php
  2. /**
  3. * UCTC - The Unicode Transcoder
  4. *
  5. * Converts between various flavours of Unicode representations like UCS-4 or UTF-8
  6. * Supported schemes:
  7. * - UCS-4 Little Endian / Big Endian / Array (partially)
  8. * - UTF-16 Little Endian / Big Endian (not yet)
  9. * - UTF-8
  10. * - UTF-7
  11. * - UTF-7 IMAP (modified UTF-7)
  12. *
  13. * @package IdnaConvert
  14. * @author Matthias Sommerfeld <mso@phlyLabs.de>
  15. * @copyright 2003-2016 phlyLabs Berlin, http://phlylabs.de
  16. * @version 0.1.1 2016-01-24
  17. */
  18. namespace Mso\IdnaConvert;
  19. class UnicodeTranscoder implements UnicodeTranscoderInterface
  20. {
  /**
   * Encoding names accepted by convert() as $from / $to. Each name maps to a
   * pair of methods called "<name>_ucs4array" and "ucs4array_<name>".
   */
  private static $mechs = ['ucs4', 'ucs4array', 'utf8', 'utf7', 'utf7imap'];
  // unsupported yet: 'ucs4le', 'ucs4be', 'utf16', 'utf16le', 'utf16be'
  /** Whether the UTF-8 decoder skips its legal-range check on the first continuation byte. */
  private static $allow_overlong = false;
  /**
   * Whether converters substitute the placeholder instead of throwing.
   * Defaults to strict so the public converters behave predictably when
   * they are called directly, without going through convert().
   */
  private static $safe_mode = false;
  /** Code point used as placeholder for broken input (U+FFFC unless convert() overrides it). */
  private static $safe_char = 0xFFFC;
  /**
   * Convert data from one Unicode representation into another.
   *
   * The input is first decoded into an array of code points (unless it is
   * already given as 'ucs4array') and then encoded into the target scheme
   * (unless the target is 'ucs4array').
   *
   * @param mixed $data The data to convert, usually a string; an array when converting from 'ucs4array'
   * @param string $from Original encoding of the data: 'ucs4', 'ucs4array', 'utf8', 'utf7' or 'utf7imap'
   * @param string $to Target encoding of the data, same set of names
   * @param bool $safe_mode SafeMode replaces invalid input with $safe_char instead of throwing
   * @param int $safe_char Unicode code point used as placeholder; a falsy value selects U+FFFC
   * @return mixed String, or array of code points when $to is 'ucs4array'
   * @access public
   * @throws \InvalidArgumentException On an unknown $from / $to; also propagated from the converters
   * @since 0.0.1
   */
  public static function convert($data, $from, $to, $safe_mode = false, $safe_char = 0xFFFC)
  {
    self::$safe_mode = (bool) $safe_mode;
    self::$safe_char = ($safe_char) ? $safe_char : 0xFFFC;
    // Derive the flag on every call. It used to be switched on by safe mode
    // and never switched off again, so one safe-mode conversion silently
    // disabled the range check for all later strict conversions.
    self::$allow_overlong = self::$safe_mode;

    // Strict comparison: the name is used to build a method name below, so
    // only the exact strings from the list may pass.
    if (!in_array($from, self::$mechs, true)) {
      throw new \InvalidArgumentException(sprintf('Invalid input format %s', $from));
    }
    if (!in_array($to, self::$mechs, true)) {
      throw new \InvalidArgumentException(sprintf('Invalid output format %s', $to));
    }

    if ('ucs4array' !== $from) {
      $decoder = $from . '_ucs4array';
      $data = self::$decoder($data);
    }
    if ('ucs4array' !== $to) {
      $encoder = 'ucs4array_' . $to;
      $data = self::$encoder($data);
    }
    return $data;
  }
  /**
   * Decode a UTF-8 encoded string into an array of code points.
   *
   * The decoder is a small state machine: in mode 'next' it expects an ASCII
   * byte or a lead byte, in mode 'add' it folds continuation bytes into the
   * code point that the lead byte started.
   *
   * In safe mode broken input is replaced by the configured placeholder code
   * point; otherwise an exception is thrown.
   *
   * @param string $input The UTF-8 string to convert
   * @return array Array of integer code points
   * @throws \InvalidArgumentException On malformed or out-of-range input when not in safe mode
   * @access public
   */
  public static function utf8_ucs4array($input)
  {
    $input = (string) $input;
    $start_byte = $next_byte = 0;
    $output = [];
    $out_len = 0;
    $inp_len = self::byteLength($input);
    $mode = 'next';
    $test = 'none'; // 'range': the next continuation byte is the first one and gets range-checked

    for ($k = 0; $k < $inp_len; ++$k) {
      $v = ord($input[$k]); // Extract byte from input string

      if ($v < 128) { // ASCII byte - copied as is
        $output[$out_len] = $v;
        ++$out_len;
        if ('add' === $mode) {
          // ASCII in the middle of a multi-byte sequence
          if (!self::$safe_mode) {
            throw new \InvalidArgumentException(sprintf('Conversion from UTF-8 to UCS-4 failed: malformed input at byte %d', $k));
          }
          // Replace the unfinished code point stored just before this byte
          $output[$out_len - 2] = self::$safe_char;
          $mode = 'next';
        }
        continue;
      }

      if ('next' === $mode) { // Expecting a lead byte; it determines the width of the character
        $start_byte = $v;
        $test = 'range';
        if (($v >> 5) === 6) { // 110xxxxx 10xxxxxx
          $next_byte = 0; // How many times later continuation bytes are shifted by 6 bits
          $v = ($v - 192) << 6;
        } elseif (($v >> 4) === 14) { // 1110xxxx 10xxxxxx 10xxxxxx
          $next_byte = 1;
          $v = ($v - 224) << 12;
        } elseif (($v >> 3) === 30) { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
          $next_byte = 2;
          $v = ($v - 240) << 18;
        } elseif (self::$safe_mode) {
          $output[$out_len] = self::$safe_char;
          ++$out_len;
          continue;
        } else {
          throw new \InvalidArgumentException(sprintf('This might be UTF-8, but I don\'t understand it at byte %d', $k));
        }

        if ($inp_len - $k - $next_byte < 2) {
          // Fewer bytes left than this lead byte announces. This used to
          // write a slot that was never counted and then ignore the rest of
          // the input, even outside safe mode.
          if (!self::$safe_mode) {
            throw new \InvalidArgumentException(sprintf('Conversion from UTF-8 to UCS-4 failed: malformed input at byte %d', $k));
          }
          $output[$out_len] = self::$safe_char;
          ++$out_len;
          // The continuation bytes that belong to the truncated sequence are
          // covered by the single placeholder.
          while ($k + 1 < $inp_len && (ord($input[$k + 1]) >> 6) === 2) {
            ++$k;
          }
          continue;
        }

        $mode = 'add';
        $output[$out_len] = (int) $v;
        ++$out_len;
        continue;
      }

      // Mode 'add': this byte has to be a continuation byte
      if (!self::$allow_overlong && 'range' === $test) {
        $test = 'none';
        $out_of_range = $start_byte < 0xC2                    // C0/C1: overlong two-byte forms
          || $start_byte > 0xF4                               // F5..F7: beyond U+10FFFF
          || ($v < 0xA0 && $start_byte === 0xE0)              // overlong three-byte form
          || ($v > 0x9F && $start_byte === 0xED)              // UTF-16 surrogates U+D800..U+DFFF
          || ($v < 0x90 && $start_byte === 0xF0)              // overlong four-byte form
          || ($v > 0x8F && $start_byte === 0xF4);             // beyond U+10FFFF
        if ($out_of_range) {
          throw new \InvalidArgumentException(sprintf('Bogus UTF-8 character detected (out of legal range) at byte %d', $k));
        }
      }

      if (($v >> 6) === 2) { // Bit mask must be 10xxxxxx
        $output[$out_len - 1] += ($v - 128) << ($next_byte * 6);
        --$next_byte;
        if ($next_byte < 0) {
          $mode = 'next';
        }
        continue;
      }

      if (!self::$safe_mode) {
        throw new \InvalidArgumentException(sprintf('Conversion from UTF-8 to UCS-4 failed: malformed input at byte %d', $k));
      }
      // The placeholder is already a code point; it must not go through ord()
      $output[$out_len - 1] = self::$safe_char;
      --$k; // Look at this byte again, now as a possible lead byte
      $mode = 'next';
    }

    return $output;
  }
  /**
   * Encode an array of code points as a UTF-8 string.
   *
   * Values from 0 up to 0x1FFFFF (everything that fits into four bytes) are
   * encoded. Any other value throws, or in safe mode is replaced by the
   * UTF-8 encoding of the placeholder code point.
   *
   * @param $input array Array of integer code points
   * @return string
   * @throws \InvalidArgumentException On a value that cannot be encoded when not in safe mode
   * @access public
   */
  public static function ucs4array_utf8($input)
  {
    $output = '';
    foreach ($input as $k => $v) {
      $encoded = self::encodeCodepointUtf8($v);
      if (null === $encoded) {
        if (!self::$safe_mode) {
          throw new \InvalidArgumentException(sprintf('Conversion from UCS-4 to UTF-8 failed: malformed input at byte %d', $k));
        }
        // The placeholder is a code point and has to be encoded as well;
        // appending it directly produced its decimal digits.
        $encoded = self::encodeCodepointUtf8(self::$safe_char);
        if (null === $encoded) {
          $encoded = "\xEF\xBF\xBC"; // U+FFFC, used when the placeholder itself is not encodable
        }
      }
      $output .= $encoded;
    }
    return $output;
  }

  /**
   * Encode one code point as UTF-8; returns null when it is negative or needs more than four bytes.
   */
  private static function encodeCodepointUtf8($cp)
  {
    if ($cp < 0 || $cp >= (1 << 21)) {
      return null;
    }
    if ($cp < 128) { // 7bit are transferred literally
      return chr($cp);
    }
    if ($cp < (1 << 11)) { // 2 bytes
      return chr(192 + ($cp >> 6)) . chr(128 + ($cp & 63));
    }
    if ($cp < (1 << 16)) { // 3 bytes
      return chr(224 + ($cp >> 12)) . chr(128 + (($cp >> 6) & 63)) . chr(128 + ($cp & 63));
    }
    // 4 bytes
    return chr(240 + ($cp >> 18)) . chr(128 + (($cp >> 12) & 63)) . chr(128 + (($cp >> 6) & 63)) . chr(128 + ($cp & 63));
  }
  /**
   * Decode an IMAP modified UTF-7 string into an array of code points.
   *
   * Modified UTF-7 uses '&' as shift character and ',' in place of '/' inside
   * the base64 sections. Only those sections are mapped back to standard
   * base64 before the string is handed to utf7_ucs4array(); a ',' in plain
   * text is a literal comma and stays one.
   *
   * @param string $input Modified UTF-7 string
   * @return array Array of integer code points
   * @access public
   */
  public static function utf7imap_ucs4array($input)
  {
    $input = (string) $input;
    // A section is '&' followed by a run of modified-base64 characters
    $standard = preg_replace_callback(
      '/&[A-Za-z0-9+,]*/',
      function ($section) {
        return strtr($section[0], ',', '/');
      },
      $input
    );
    if (null === $standard) {
      // preg_replace_callback() reports an engine error with null; decode the untouched input then
      $standard = $input;
    }
    return self::utf7_ucs4array($standard, '&');
  }
  /**
   * Decode a UTF-7 string into an array of code points.
   *
   * Plain bytes are copied as code points. A shift character opens a base64
   * section holding UTF-16BE code units:
   *  - the section ends at the first character outside the base64 alphabet,
   *  - a terminating '-' is absorbed, any other terminator is processed as
   *    ordinary text,
   *  - the shift character directly followed by '-' stands for the shift
   *    character itself,
   *  - a section still open at the end of the input is decoded as well.
   * Zero bytes in the input are ignored.
   *
   * @param string $input UTF-7 string
   * @param string $sc Shift character, '+' for UTF-7
   * @return array Array of integer code points
   * @access public
   */
  public static function utf7_ucs4array($input, $sc = '+')
  {
    $input = (string) $input;
    // Always the full base64 alphabet. The old pattern listed the shift
    // character instead of '+', which cut sections short for other shift chars.
    $alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';
    $output = [];
    $inp_len = self::byteLength($input);
    $shifted = false;
    $b64 = '';

    for ($k = 0; $k < $inp_len; ++$k) {
      $c = $input[$k];
      // Ignore zero bytes
      if ("\0" === $c) {
        continue;
      }

      if ($shifted) {
        if (false !== strpos($alphabet, $c)) {
          $b64 .= $c;
          continue;
        }
        // Sequence got terminated
        $shifted = false;
        if ('-' === $c && '' === $b64) {
          // Shift character followed by '-': a literal shift character
          $output[] = ord($sc);
          continue;
        }
        foreach (self::decodeUtf7Section($b64) as $codepoint) {
          $output[] = $codepoint;
        }
        $b64 = '';
        if ('-' === $c) {
          continue; // The explicit terminator is absorbed
        }
        // Any other terminator is ordinary text and is handled below
      }

      if ($sc === $c) {
        $shifted = true;
        continue;
      }
      $output[] = ord($c);
    }

    // Section that was still open when the input ended
    if ('' !== $b64) {
      foreach (self::decodeUtf7Section($b64) as $codepoint) {
        $output[] = $codepoint;
      }
    }
    return $output;
  }

  /**
   * Decode the base64 payload of one shifted section into code points.
   * The payload holds UTF-16BE code units; surrogate pairs are combined and
   * a dangling odd byte at the end is dropped.
   */
  private static function decodeUtf7Section($b64)
  {
    $bytes = (string) base64_decode($b64);
    $units = self::byteLength($bytes) >> 1;
    $codepoints = [];
    for ($i = 0; $i < $units; ++$i) {
      $unit = (ord($bytes[2 * $i]) << 8) | ord($bytes[2 * $i + 1]);
      if ($unit >= 0xD800 && $unit <= 0xDBFF && $i + 1 < $units) {
        $low = (ord($bytes[2 * $i + 2]) << 8) | ord($bytes[2 * $i + 3]);
        if ($low >= 0xDC00 && $low <= 0xDFFF) {
          $codepoints[] = 0x10000 + (($unit - 0xD800) << 10) + ($low - 0xDC00);
          ++$i; // The low surrogate has been consumed
          continue;
        }
      }
      $codepoints[] = $unit;
    }
    return $codepoints;
  }
  /**
   * Encode an array of code points as IMAP modified UTF-7.
   *
   * The string is produced by ucs4array_utf7() with '&' as shift character.
   * Inside the base64 sections '/' is then replaced by ','. Text outside the
   * sections is left alone, so a literal '/' stays a slash.
   *
   * @param $input array Array of integer code points
   * @return string Modified UTF-7 string
   * @access public
   */
  public static function ucs4array_utf7imap($input)
  {
    $utf7 = self::ucs4array_utf7($input, '&');
    // Every '&' in the encoder output opens a section that is closed by '-'
    $modified = preg_replace_callback(
      '/&[A-Za-z0-9+\/]*-/',
      function ($section) {
        return strtr($section[0], '/', ',');
      },
      $utf7
    );
    // preg_replace_callback() reports an engine error with null
    return (null === $modified) ? $utf7 : $modified;
  }
  /**
   * Encode an array of code points as UTF-7.
   *
   * Code points 0x20..0x7E other than the shift character are written as
   * they are. Runs of all other code points are written as one section:
   * shift character, base64 of their UTF-16BE code units without padding,
   * and a closing '-'. A shift character standing alone becomes the shift
   * character followed by '-'.
   *
   * Code points above U+FFFF are written as surrogate pairs. Values below 0
   * or above U+10FFFF have no UTF-16 form: they throw, or in safe mode are
   * replaced by the placeholder code point.
   *
   * @param $input array Array of integer code points
   * @param string $sc Shift character, '+' for UTF-7
   * @return string
   * @throws \InvalidArgumentException On a value outside 0..0x10FFFF when not in safe mode
   * @access public
   */
  public static function ucs4array_utf7($input, $sc = '+')
  {
    $output = '';
    $pending = ''; // UTF-16BE bytes collected for the section that is currently open
    $shift_code = ord($sc);

    // A single pass; shifting elements off the front of the array made the
    // old loop quadratic in the input length.
    foreach ($input as $k => $v) {
      if ($v < 0 || $v > 0x10FFFF) {
        if (!self::$safe_mode) {
          throw new \InvalidArgumentException(sprintf('Conversion from UCS-4 to UTF-7 failed: malformed input at position %d', $k));
        }
        $placeholder = self::$safe_char;
        $v = (is_int($placeholder) && $placeholder >= 0 && $placeholder <= 0x10FFFF) ? $placeholder : 0xFFFC;
      }

      if (0x20 <= $v && $v <= 0x7e && $v != $shift_code) {
        if ('' !== $pending) {
          $output .= self::closeUtf7Section($pending, $sc);
          $pending = '';
        }
        $output .= chr($v);
        continue;
      }

      if ($v > 0xFFFF) {
        // Outside the BMP: a high and a low surrogate instead of 16 truncated bits
        $offset = $v - 0x10000;
        $high = 0xD800 | ($offset >> 10);
        $low = 0xDC00 | ($offset & 0x3FF);
        $pending .= chr($high >> 8) . chr($high & 255) . chr($low >> 8) . chr($low & 255);
      } else {
        $pending .= chr(($v >> 8) & 255) . chr($v & 255);
      }
    }

    if ('' !== $pending) {
      $output .= self::closeUtf7Section($pending, $sc);
    }
    return $output;
  }

  /**
   * Turn the collected UTF-16BE bytes of one section into its UTF-7 text.
   */
  private static function closeUtf7Section($bytes, $sc)
  {
    if ($bytes === chr(0) . $sc) {
      return $sc . '-'; // Short form for a lone shift character
    }
    return $sc . rtrim(base64_encode($bytes), '=') . '-';
  }
  /**
   * Serialise an array of code points into a UCS-4 string.
   *
   * Every code point becomes four bytes, most significant byte first
   * (big-endian).
   *
   * @param $input array UCS-4 code points
   * @return string Four bytes per code point
   * @access public
   */
  public static function ucs4array_ucs4($input)
  {
    $bytes = '';
    foreach ($input as $codepoint) {
      // 'N' packs the low 32 bits in big-endian byte order
      $bytes .= pack('N', $codepoint);
    }
    return $bytes;
  }
  /**
   * Split a UCS-4 string into an array of code points.
   *
   * The string is read in groups of four bytes, most significant byte first
   * (big-endian), matching what ucs4array_ucs4() writes.
   *
   * @param $input string UCS-4 string, four bytes per code point
   * @return array Array of integer code points
   * @throws \InvalidArgumentException If the length is not a multiple of four
   * @access public
   */
  public static function ucs4_ucs4array($input)
  {
    $input = (string) $input;
    $inp_len = self::byteLength($input);
    // Input length must be divisible by 4
    if ($inp_len % 4) {
      throw new \InvalidArgumentException('Input UCS4 string is broken');
    }
    // Empty input - return empty output
    if (!$inp_len) {
      return [];
    }
    // 'N*' reads consecutive unsigned 32 bit big-endian values; unpack()
    // numbers its results from 1, so the keys are reset to a plain list.
    return array_values(unpack('N*', $input));
  }
  /**
   * Gets the length of a string in bytes even if mbstring function
   * overloading is turned on.
   *
   * When bit 0x02 of the mbstring.func_overload setting is set, the length
   * is taken with mb_strlen() in the '8bit' encoding; otherwise strlen() is
   * used.
   *
   * @param string $string the string for which to get the length.
   * @return integer the length of the string in bytes.
   */
  protected static function byteLength($string)
  {
    // Canonical cast; the (binary) spelling does the same conversion.
    $string = (string) $string;
    // ini_get() returns false when the setting does not exist; the int cast
    // turns that into 0 before the bit test.
    $overload = (int) ini_get('mbstring.func_overload');
    if (extension_loaded('mbstring') && ($overload & 0x02) === 0x02) {
      return mb_strlen($string, '8bit');
    }
    return strlen($string);
  }
  327. }