// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/hfsplus/unicode.c
 *
 * Copyright (C) 2001
 * Brad Boyer (flar@allandria.com)
 * (C) 2003 Ardis Technologies <roman@ardistech.com>
 *
 * Handler routines for unicode strings
 */
  11. #include <linux/types.h>
  12. #include <linux/nls.h>
  13. #include "hfsplus_fs.h"
  14. #include "hfsplus_raw.h"
  15. /* Fold the case of a unicode char, given the 16 bit value */
  16. /* Returns folded char, or 0 if ignorable */
  17. static inline u16 case_fold(u16 c)
  18. {
  19. u16 tmp;
  20. tmp = hfsplus_case_fold_table[c >> 8];
  21. if (tmp)
  22. tmp = hfsplus_case_fold_table[tmp + (c & 0xff)];
  23. else
  24. tmp = c;
  25. return tmp;
  26. }
  27. /* Compare unicode strings, return values like normal strcmp */
  28. int hfsplus_strcasecmp(const struct hfsplus_unistr *s1,
  29. const struct hfsplus_unistr *s2)
  30. {
  31. u16 len1, len2, c1, c2;
  32. const hfsplus_unichr *p1, *p2;
  33. len1 = be16_to_cpu(s1->length);
  34. len2 = be16_to_cpu(s2->length);
  35. p1 = s1->unicode;
  36. p2 = s2->unicode;
  37. if (len1 > HFSPLUS_MAX_STRLEN) {
  38. len1 = HFSPLUS_MAX_STRLEN;
  39. pr_err("invalid length %u has been corrected to %d\n",
  40. be16_to_cpu(s1->length), len1);
  41. }
  42. if (len2 > HFSPLUS_MAX_STRLEN) {
  43. len2 = HFSPLUS_MAX_STRLEN;
  44. pr_err("invalid length %u has been corrected to %d\n",
  45. be16_to_cpu(s2->length), len2);
  46. }
  47. while (1) {
  48. c1 = c2 = 0;
  49. while (len1 && !c1) {
  50. c1 = case_fold(be16_to_cpu(*p1));
  51. p1++;
  52. len1--;
  53. }
  54. while (len2 && !c2) {
  55. c2 = case_fold(be16_to_cpu(*p2));
  56. p2++;
  57. len2--;
  58. }
  59. if (c1 != c2)
  60. return (c1 < c2) ? -1 : 1;
  61. if (!c1 && !c2)
  62. return 0;
  63. }
  64. }
  65. /* Compare names as a sequence of 16-bit unsigned integers */
  66. int hfsplus_strcmp(const struct hfsplus_unistr *s1,
  67. const struct hfsplus_unistr *s2)
  68. {
  69. u16 len1, len2, c1, c2;
  70. const hfsplus_unichr *p1, *p2;
  71. int len;
  72. len1 = be16_to_cpu(s1->length);
  73. len2 = be16_to_cpu(s2->length);
  74. p1 = s1->unicode;
  75. p2 = s2->unicode;
  76. if (len1 > HFSPLUS_MAX_STRLEN) {
  77. len1 = HFSPLUS_MAX_STRLEN;
  78. pr_err("invalid length %u has been corrected to %d\n",
  79. be16_to_cpu(s1->length), len1);
  80. }
  81. if (len2 > HFSPLUS_MAX_STRLEN) {
  82. len2 = HFSPLUS_MAX_STRLEN;
  83. pr_err("invalid length %u has been corrected to %d\n",
  84. be16_to_cpu(s2->length), len2);
  85. }
  86. for (len = min(len1, len2); len > 0; len--) {
  87. c1 = be16_to_cpu(*p1);
  88. c2 = be16_to_cpu(*p2);
  89. if (c1 != c2)
  90. return c1 < c2 ? -1 : 1;
  91. p1++;
  92. p2++;
  93. }
  94. return len1 < len2 ? -1 :
  95. len1 > len2 ? 1 : 0;
  96. }
  /*
   * Constants for arithmetic composition/decomposition of Hangul syllables.
   * The S, L, V and T prefixes name the syllable block and the three jamo
   * ranges used by the composition and decomposition code in this file.
   */

  /* First code point of each range */
  #define Hangul_SBase	0xac00
  #define Hangul_LBase	0x1100
  #define Hangul_VBase	0x1161
  #define Hangul_TBase	0x11a7

  /* Number of values in each jamo range */
  #define Hangul_LCount	19
  #define Hangul_VCount	21
  #define Hangul_TCount	28

  /* Derived sizes: syllables per L value, and syllables in total (11172) */
  #define Hangul_NCount	(Hangul_VCount * Hangul_TCount)
  #define Hangul_SCount	(Hangul_LCount * Hangul_NCount)
  106. static u16 *hfsplus_compose_lookup(u16 *p, u16 cc)
  107. {
  108. int i, s, e;
  109. s = 1;
  110. e = p[1];
  111. if (!e || cc < p[s * 2] || cc > p[e * 2])
  112. return NULL;
  113. do {
  114. i = (s + e) / 2;
  115. if (cc > p[i * 2])
  116. s = i + 1;
  117. else if (cc < p[i * 2])
  118. e = i - 1;
  119. else
  120. return hfsplus_compose_table + p[i * 2 + 1];
  121. } while (s <= e);
  122. return NULL;
  123. }
  124. int hfsplus_uni2asc(struct super_block *sb,
  125. const struct hfsplus_unistr *ustr,
  126. char *astr, int *len_p)
  127. {
  128. const hfsplus_unichr *ip;
  129. struct nls_table *nls = HFSPLUS_SB(sb)->nls;
  130. u8 *op;
  131. u16 cc, c0, c1;
  132. u16 *ce1, *ce2;
  133. int i, len, ustrlen, res, compose;
  134. op = astr;
  135. ip = ustr->unicode;
  136. ustrlen = be16_to_cpu(ustr->length);
  137. if (ustrlen > HFSPLUS_MAX_STRLEN) {
  138. ustrlen = HFSPLUS_MAX_STRLEN;
  139. pr_err("invalid length %u has been corrected to %d\n",
  140. be16_to_cpu(ustr->length), ustrlen);
  141. }
  142. len = *len_p;
  143. ce1 = NULL;
  144. compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
  145. while (ustrlen > 0) {
  146. c0 = be16_to_cpu(*ip++);
  147. ustrlen--;
  148. /* search for single decomposed char */
  149. if (likely(compose))
  150. ce1 = hfsplus_compose_lookup(hfsplus_compose_table, c0);
  151. if (ce1)
  152. cc = ce1[0];
  153. else
  154. cc = 0;
  155. if (cc) {
  156. /* start of a possibly decomposed Hangul char */
  157. if (cc != 0xffff)
  158. goto done;
  159. if (!ustrlen)
  160. goto same;
  161. c1 = be16_to_cpu(*ip) - Hangul_VBase;
  162. if (c1 < Hangul_VCount) {
  163. /* compose the Hangul char */
  164. cc = (c0 - Hangul_LBase) * Hangul_VCount;
  165. cc = (cc + c1) * Hangul_TCount;
  166. cc += Hangul_SBase;
  167. ip++;
  168. ustrlen--;
  169. if (!ustrlen)
  170. goto done;
  171. c1 = be16_to_cpu(*ip) - Hangul_TBase;
  172. if (c1 > 0 && c1 < Hangul_TCount) {
  173. cc += c1;
  174. ip++;
  175. ustrlen--;
  176. }
  177. goto done;
  178. }
  179. }
  180. while (1) {
  181. /* main loop for common case of not composed chars */
  182. if (!ustrlen)
  183. goto same;
  184. c1 = be16_to_cpu(*ip);
  185. if (likely(compose))
  186. ce1 = hfsplus_compose_lookup(
  187. hfsplus_compose_table, c1);
  188. if (ce1)
  189. break;
  190. switch (c0) {
  191. case 0:
  192. c0 = 0x2400;
  193. break;
  194. case '/':
  195. c0 = ':';
  196. break;
  197. }
  198. res = nls->uni2char(c0, op, len);
  199. if (res < 0) {
  200. if (res == -ENAMETOOLONG)
  201. goto out;
  202. *op = '?';
  203. res = 1;
  204. }
  205. op += res;
  206. len -= res;
  207. c0 = c1;
  208. ip++;
  209. ustrlen--;
  210. }
  211. ce2 = hfsplus_compose_lookup(ce1, c0);
  212. if (ce2) {
  213. i = 1;
  214. while (i < ustrlen) {
  215. ce1 = hfsplus_compose_lookup(ce2,
  216. be16_to_cpu(ip[i]));
  217. if (!ce1)
  218. break;
  219. i++;
  220. ce2 = ce1;
  221. }
  222. cc = ce2[0];
  223. if (cc) {
  224. ip += i;
  225. ustrlen -= i;
  226. goto done;
  227. }
  228. }
  229. same:
  230. switch (c0) {
  231. case 0:
  232. cc = 0x2400;
  233. break;
  234. case '/':
  235. cc = ':';
  236. break;
  237. default:
  238. cc = c0;
  239. }
  240. done:
  241. res = nls->uni2char(cc, op, len);
  242. if (res < 0) {
  243. if (res == -ENAMETOOLONG)
  244. goto out;
  245. *op = '?';
  246. res = 1;
  247. }
  248. op += res;
  249. len -= res;
  250. }
  251. res = 0;
  252. out:
  253. *len_p = (char *)op - astr;
  254. return res;
  255. }
  256. /*
  257. * Convert one or more ASCII characters into a single unicode character.
  258. * Returns the number of ASCII characters corresponding to the unicode char.
  259. */
  260. static inline int asc2unichar(struct super_block *sb, const char *astr, int len,
  261. wchar_t *uc)
  262. {
  263. int size = HFSPLUS_SB(sb)->nls->char2uni(astr, len, uc);
  264. if (size <= 0) {
  265. *uc = '?';
  266. size = 1;
  267. }
  268. switch (*uc) {
  269. case 0x2400:
  270. *uc = 0;
  271. break;
  272. case ':':
  273. *uc = '/';
  274. break;
  275. }
  276. return size;
  277. }
  278. /* Decomposes a non-Hangul unicode character. */
  279. static u16 *hfsplus_decompose_nonhangul(wchar_t uc, int *size)
  280. {
  281. int off;
  282. off = hfsplus_decompose_table[(uc >> 12) & 0xf];
  283. if (off == 0 || off == 0xffff)
  284. return NULL;
  285. off = hfsplus_decompose_table[off + ((uc >> 8) & 0xf)];
  286. if (!off)
  287. return NULL;
  288. off = hfsplus_decompose_table[off + ((uc >> 4) & 0xf)];
  289. if (!off)
  290. return NULL;
  291. off = hfsplus_decompose_table[off + (uc & 0xf)];
  292. *size = off & 3;
  293. if (*size == 0)
  294. return NULL;
  295. return hfsplus_decompose_table + (off / 4);
  296. }
  297. /*
  298. * Try to decompose a unicode character as Hangul. Return 0 if @uc is not
  299. * precomposed Hangul, otherwise return the length of the decomposition.
  300. *
  301. * This function was adapted from sample code from the Unicode Standard
  302. * Annex #15: Unicode Normalization Forms, version 3.2.0.
  303. *
  304. * Copyright (C) 1991-2018 Unicode, Inc. All rights reserved. Distributed
  305. * under the Terms of Use in http://www.unicode.org/copyright.html.
  306. */
  307. static int hfsplus_try_decompose_hangul(wchar_t uc, u16 *result)
  308. {
  309. int index;
  310. int l, v, t;
  311. index = uc - Hangul_SBase;
  312. if (index < 0 || index >= Hangul_SCount)
  313. return 0;
  314. l = Hangul_LBase + index / Hangul_NCount;
  315. v = Hangul_VBase + (index % Hangul_NCount) / Hangul_TCount;
  316. t = Hangul_TBase + index % Hangul_TCount;
  317. result[0] = l;
  318. result[1] = v;
  319. if (t != Hangul_TBase) {
  320. result[2] = t;
  321. return 3;
  322. }
  323. return 2;
  324. }
  325. /* Decomposes a single unicode character. */
  326. static u16 *decompose_unichar(wchar_t uc, int *size, u16 *hangul_buffer)
  327. {
  328. u16 *result;
  329. /* Hangul is handled separately */
  330. result = hangul_buffer;
  331. *size = hfsplus_try_decompose_hangul(uc, result);
  332. if (*size == 0)
  333. result = hfsplus_decompose_nonhangul(uc, size);
  334. return result;
  335. }
  336. int hfsplus_asc2uni(struct super_block *sb,
  337. struct hfsplus_unistr *ustr, int max_unistr_len,
  338. const char *astr, int len)
  339. {
  340. int size, dsize, decompose;
  341. u16 *dstr, outlen = 0;
  342. wchar_t c;
  343. u16 dhangul[3];
  344. decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
  345. while (outlen < max_unistr_len && len > 0) {
  346. size = asc2unichar(sb, astr, len, &c);
  347. if (decompose)
  348. dstr = decompose_unichar(c, &dsize, dhangul);
  349. else
  350. dstr = NULL;
  351. if (dstr) {
  352. if (outlen + dsize > max_unistr_len)
  353. break;
  354. do {
  355. ustr->unicode[outlen++] = cpu_to_be16(*dstr++);
  356. } while (--dsize > 0);
  357. } else
  358. ustr->unicode[outlen++] = cpu_to_be16(c);
  359. astr += size;
  360. len -= size;
  361. }
  362. ustr->length = cpu_to_be16(outlen);
  363. if (len > 0)
  364. return -ENAMETOOLONG;
  365. return 0;
  366. }
  367. /*
  368. * Hash a string to an integer as appropriate for the HFS+ filesystem.
  369. * Composed unicode characters are decomposed and case-folding is performed
  370. * if the appropriate bits are (un)set on the superblock.
  371. */
  372. int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str)
  373. {
  374. struct super_block *sb = dentry->d_sb;
  375. const char *astr;
  376. const u16 *dstr;
  377. int casefold, decompose, size, len;
  378. unsigned long hash;
  379. wchar_t c;
  380. u16 c2;
  381. u16 dhangul[3];
  382. casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
  383. decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
  384. hash = init_name_hash(dentry);
  385. astr = str->name;
  386. len = str->len;
  387. while (len > 0) {
  388. int dsize;
  389. size = asc2unichar(sb, astr, len, &c);
  390. astr += size;
  391. len -= size;
  392. if (decompose)
  393. dstr = decompose_unichar(c, &dsize, dhangul);
  394. else
  395. dstr = NULL;
  396. if (dstr) {
  397. do {
  398. c2 = *dstr++;
  399. if (casefold)
  400. c2 = case_fold(c2);
  401. if (!casefold || c2)
  402. hash = partial_name_hash(c2, hash);
  403. } while (--dsize > 0);
  404. } else {
  405. c2 = c;
  406. if (casefold)
  407. c2 = case_fold(c2);
  408. if (!casefold || c2)
  409. hash = partial_name_hash(c2, hash);
  410. }
  411. }
  412. str->hash = end_name_hash(hash);
  413. return 0;
  414. }
  415. /*
  416. * Compare strings with HFS+ filename ordering.
  417. * Composed unicode characters are decomposed and case-folding is performed
  418. * if the appropriate bits are (un)set on the superblock.
  419. */
  420. int hfsplus_compare_dentry(const struct dentry *dentry,
  421. unsigned int len, const char *str, const struct qstr *name)
  422. {
  423. struct super_block *sb = dentry->d_sb;
  424. int casefold, decompose, size;
  425. int dsize1, dsize2, len1, len2;
  426. const u16 *dstr1, *dstr2;
  427. const char *astr1, *astr2;
  428. u16 c1, c2;
  429. wchar_t c;
  430. u16 dhangul_1[3], dhangul_2[3];
  431. casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
  432. decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
  433. astr1 = str;
  434. len1 = len;
  435. astr2 = name->name;
  436. len2 = name->len;
  437. dsize1 = dsize2 = 0;
  438. dstr1 = dstr2 = NULL;
  439. while (len1 > 0 && len2 > 0) {
  440. if (!dsize1) {
  441. size = asc2unichar(sb, astr1, len1, &c);
  442. astr1 += size;
  443. len1 -= size;
  444. if (decompose)
  445. dstr1 = decompose_unichar(c, &dsize1,
  446. dhangul_1);
  447. if (!decompose || !dstr1) {
  448. c1 = c;
  449. dstr1 = &c1;
  450. dsize1 = 1;
  451. }
  452. }
  453. if (!dsize2) {
  454. size = asc2unichar(sb, astr2, len2, &c);
  455. astr2 += size;
  456. len2 -= size;
  457. if (decompose)
  458. dstr2 = decompose_unichar(c, &dsize2,
  459. dhangul_2);
  460. if (!decompose || !dstr2) {
  461. c2 = c;
  462. dstr2 = &c2;
  463. dsize2 = 1;
  464. }
  465. }
  466. c1 = *dstr1;
  467. c2 = *dstr2;
  468. if (casefold) {
  469. c1 = case_fold(c1);
  470. if (!c1) {
  471. dstr1++;
  472. dsize1--;
  473. continue;
  474. }
  475. c2 = case_fold(c2);
  476. if (!c2) {
  477. dstr2++;
  478. dsize2--;
  479. continue;
  480. }
  481. }
  482. if (c1 < c2)
  483. return -1;
  484. else if (c1 > c2)
  485. return 1;
  486. dstr1++;
  487. dsize1--;
  488. dstr2++;
  489. dsize2--;
  490. }
  491. if (len1 < len2)
  492. return -1;
  493. if (len1 > len2)
  494. return 1;
  495. return 0;
  496. }