llvm.org GIT mirror llvm / fd7c224
Implement a case-folding version of DJB hash Summary: This patch implements a variant of the DJB hash function which folds the input according to the algorithm in the Dwarf 5 specification (Section 6.1.1.4.5), which in turn references the Unicode Standard (Section 5.18, "Case Mappings"). To achieve this, I have added a llvm::sys::unicode::foldCharSimple function, which performs this mapping. The implementation of this function was generated from the CaseFolding.txt file from the Unicode spec using a python script (which is also included in this patch). The script tries to optimize the function by coalescing adjacent mappings with the same shift and stride (terms I made up). Theoretically, it could be made a bit smarter and merge adjacent blocks that were interrupted by only one or two characters with exceptional mapping, but this would save only a couple of branches, while it would greatly complicate the implementation, so I deemed it was not worth it. Since we assume that the vast majority of the input characters will be US-ASCII, the folding hash function has a fast-path for handling these, and only whips out the full decode+fold+encode logic if we encounter a character outside of this range. It might be possible to implement the folding directly on utf8 sequences, but this would also bring a lot of complexity for the few cases where we will actually need to process non-ascii characters. Reviewers: JDevlieghere, aprantl, probinson, dblaikie Subscribers: mgorny, hintonda, echristo, clayborg, vleschuk, llvm-commits Differential Revision: https://reviews.llvm.org/D42740 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@325107 91177308-0d34-0410-b5e6-96231b3b80d8 Pavel Labath 2 years ago
8 changed file(s) with 1055 addition(s) and 1 deletion(s). Raw diff Collapse all Expand all
1919
2020 /// The Bernstein hash function used by the DWARF accelerator tables.
2121 uint32_t djbHash(StringRef Buffer, uint32_t H = 5381);
22
23 /// Computes the Bernstein hash after folding the input according to the Dwarf 5
24 /// standard case folding rules.
25 uint32_t caseFoldingDjbHash(StringRef Buffer, uint32_t H = 5381);
2226 } // namespace llvm
2327
2428 #endif // LLVM_SUPPORT_DJB_H
5959 /// * 1 for each of the remaining characters.
6060 int columnWidthUTF8(StringRef Text);
6161
62 /// Fold input unicode character according to the Simple unicode case folding
63 /// rules.
64 int foldCharSimple(int C);
65
6266 } // namespace unicode
6367 } // namespace sys
6468 } // namespace llvm
112112 Triple.cpp
113113 Twine.cpp
114114 Unicode.cpp
115 UnicodeCaseFold.cpp
115116 YAMLParser.cpp
116117 YAMLTraits.cpp
117118 raw_os_ostream.cpp
1111 //===----------------------------------------------------------------------===//
1212
1313 #include "llvm/Support/DJB.h"
14 #include "llvm/ADT/ArrayRef.h"
15 #include "llvm/Support/Compiler.h"
16 #include "llvm/Support/ConvertUTF.h"
17 #include "llvm/Support/Unicode.h"
18
19 using namespace llvm;
20
/// Performs a single step of the Bernstein (DJB) hash: scales the running
/// hash \p H by 33 (computed as (H << 5) + H) and adds the character \p C.
static inline uint32_t djbHashChar(char C, uint32_t H) {
  const uint32_t Times33 = (H << 5) + H;
  return Times33 + C;
}
1424
1525 uint32_t llvm::djbHash(StringRef Buffer, uint32_t H) {
1626 for (char C : Buffer.bytes())
17 H = ((H << 5) + H) + C;
27 H = djbHashChar(C, H);
1828 return H;
1929 }
30
31 static UTF32 chopOneUTF32(StringRef &Buffer) {
32 UTF32 C;
33 const UTF8 *const Begin8Const =
34 reinterpret_cast(Buffer.begin());
35 const UTF8 *Begin8 = Begin8Const;
36 UTF32 *Begin32 = &C;
37
38 // In lenient mode we will always end up with a "reasonable" value in C for
39 // non-empty input.
40 assert(!Buffer.empty());
41 ConvertUTF8toUTF32(&Begin8, reinterpret_cast(Buffer.end()),
42 &Begin32, &C + 1, lenientConversion);
43 Buffer = Buffer.drop_front(Begin8 - Begin8Const);
44 return C;
45 }
46
47 static StringRef toUTF8(UTF32 C, MutableArrayRef Storage) {
48 const UTF32 *Begin32 = &C;
49 UTF8 *Begin8 = Storage.begin();
50
51 // The case-folded output should always be a valid unicode character, so use
52 // strict mode here.
53 ConversionResult CR = ConvertUTF32toUTF8(&Begin32, &C + 1, &Begin8,
54 Storage.end(), strictConversion);
55 assert(CR == conversionOK && "Case folding produced invalid char?");
56 (void)CR;
57 return StringRef(reinterpret_cast(Storage.begin()),
58 Begin8 - Storage.begin());
59 }
60
61 static UTF32 foldCharDwarf(UTF32 C) {
62 // DWARF v5 addition to the unicode folding rules.
63 // Fold "Latin Small Letter Dotless I" and "Latin Capital Letter I With Dot
64 // Above" into "i".
65 if (C == 0x130 || C == 0x131)
66 return 'i';
67 return sys::unicode::foldCharSimple(C);
68 }
69
70 static uint32_t caseFoldingDjbHashCharSlow(StringRef &Buffer, uint32_t H) {
71 UTF32 C = chopOneUTF32(Buffer);
72
73 C = foldCharDwarf(C);
74
75 std::array Storage;
76 StringRef Folded = toUTF8(C, Storage);
77 return djbHash(Folded, H);
78 }
79
80 uint32_t llvm::caseFoldingDjbHash(StringRef Buffer, uint32_t H) {
81 while (!Buffer.empty()) {
82 unsigned char C = Buffer.front();
83 if (LLVM_LIKELY(C <= 0x7f)) {
84 // US-ASCII, encoded as one character in utf-8.
85 // This is by far the most common case, so handle this specially.
86 if (C >= 'A' && C <= 'Z')
87 C = 'a' + (C - 'A'); // fold uppercase into lowercase
88 H = djbHashChar(C, H);
89 Buffer = Buffer.drop_front();
90 continue;
91 }
92 H = caseFoldingDjbHashCharSlow(Buffer, H);
93 }
94 return H;
95 }
0 //===---------- Support/UnicodeCaseFold.cpp -------------------------------===//
1 //
2 // This file was generated by utils/unicode-case-fold.py from the Unicode
3 // case folding database at
4 // http://www.unicode.org/Public/9.0.0/ucd/CaseFolding.txt
5 //
6 // To regenerate this file, run:
7 // utils/unicode-case-fold.py \
8 // "http://www.unicode.org/Public/9.0.0/ucd/CaseFolding.txt" \
9 // > lib/Support/UnicodeCaseFold.cpp
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "llvm/Support/Unicode.h"
14
15 int llvm::sys::unicode::foldCharSimple(int C) {
16 if (C < 0x0041)
17 return C;
18 // 26 characters
19 if (C <= 0x005a)
20 return C + 32;
21 // MICRO SIGN
22 if (C == 0x00b5)
23 return 0x03bc;
24 if (C < 0x00c0)
25 return C;
26 // 23 characters
27 if (C <= 0x00d6)
28 return C + 32;
29 if (C < 0x00d8)
30 return C;
31 // 7 characters
32 if (C <= 0x00de)
33 return C + 32;
34 if (C < 0x0100)
35 return C;
36 // 24 characters
37 if (C <= 0x012e)
38 return C | 1;
39 if (C < 0x0132)
40 return C;
41 // 3 characters
42 if (C <= 0x0136)
43 return C | 1;
44 if (C < 0x0139)
45 return C;
46 // 8 characters
47 if (C <= 0x0147 && C % 2 == 1)
48 return C + 1;
49 if (C < 0x014a)
50 return C;
51 // 23 characters
52 if (C <= 0x0176)
53 return C | 1;
54 // LATIN CAPITAL LETTER Y WITH DIAERESIS
55 if (C == 0x0178)
56 return 0x00ff;
57 if (C < 0x0179)
58 return C;
59 // 3 characters
60 if (C <= 0x017d && C % 2 == 1)
61 return C + 1;
62 // LATIN SMALL LETTER LONG S
63 if (C == 0x017f)
64 return 0x0073;
65 // LATIN CAPITAL LETTER B WITH HOOK
66 if (C == 0x0181)
67 return 0x0253;
68 if (C < 0x0182)
69 return C;
70 // 2 characters
71 if (C <= 0x0184)
72 return C | 1;
73 // LATIN CAPITAL LETTER OPEN O
74 if (C == 0x0186)
75 return 0x0254;
76 // LATIN CAPITAL LETTER C WITH HOOK
77 if (C == 0x0187)
78 return 0x0188;
79 if (C < 0x0189)
80 return C;
81 // 2 characters
82 if (C <= 0x018a)
83 return C + 205;
84 // LATIN CAPITAL LETTER D WITH TOPBAR
85 if (C == 0x018b)
86 return 0x018c;
87 // LATIN CAPITAL LETTER REVERSED E
88 if (C == 0x018e)
89 return 0x01dd;
90 // LATIN CAPITAL LETTER SCHWA
91 if (C == 0x018f)
92 return 0x0259;
93 // LATIN CAPITAL LETTER OPEN E
94 if (C == 0x0190)
95 return 0x025b;
96 // LATIN CAPITAL LETTER F WITH HOOK
97 if (C == 0x0191)
98 return 0x0192;
99 // LATIN CAPITAL LETTER G WITH HOOK
100 if (C == 0x0193)
101 return 0x0260;
102 // LATIN CAPITAL LETTER GAMMA
103 if (C == 0x0194)
104 return 0x0263;
105 // LATIN CAPITAL LETTER IOTA
106 if (C == 0x0196)
107 return 0x0269;
108 // LATIN CAPITAL LETTER I WITH STROKE
109 if (C == 0x0197)
110 return 0x0268;
111 // LATIN CAPITAL LETTER K WITH HOOK
112 if (C == 0x0198)
113 return 0x0199;
114 // LATIN CAPITAL LETTER TURNED M
115 if (C == 0x019c)
116 return 0x026f;
117 // LATIN CAPITAL LETTER N WITH LEFT HOOK
118 if (C == 0x019d)
119 return 0x0272;
120 // LATIN CAPITAL LETTER O WITH MIDDLE TILDE
121 if (C == 0x019f)
122 return 0x0275;
123 if (C < 0x01a0)
124 return C;
125 // 3 characters
126 if (C <= 0x01a4)
127 return C | 1;
128 // LATIN LETTER YR
129 if (C == 0x01a6)
130 return 0x0280;
131 // LATIN CAPITAL LETTER TONE TWO
132 if (C == 0x01a7)
133 return 0x01a8;
134 // LATIN CAPITAL LETTER ESH
135 if (C == 0x01a9)
136 return 0x0283;
137 // LATIN CAPITAL LETTER T WITH HOOK
138 if (C == 0x01ac)
139 return 0x01ad;
140 // LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
141 if (C == 0x01ae)
142 return 0x0288;
143 // LATIN CAPITAL LETTER U WITH HORN
144 if (C == 0x01af)
145 return 0x01b0;
146 if (C < 0x01b1)
147 return C;
148 // 2 characters
149 if (C <= 0x01b2)
150 return C + 217;
151 if (C < 0x01b3)
152 return C;
153 // 2 characters
154 if (C <= 0x01b5 && C % 2 == 1)
155 return C + 1;
156 // LATIN CAPITAL LETTER EZH
157 if (C == 0x01b7)
158 return 0x0292;
159 if (C < 0x01b8)
160 return C;
161 // 2 characters
162 if (C <= 0x01bc && C % 4 == 0)
163 return C + 1;
164 // LATIN CAPITAL LETTER DZ WITH CARON
165 if (C == 0x01c4)
166 return 0x01c6;
167 // LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
168 if (C == 0x01c5)
169 return 0x01c6;
170 // LATIN CAPITAL LETTER LJ
171 if (C == 0x01c7)
172 return 0x01c9;
173 // LATIN CAPITAL LETTER L WITH SMALL LETTER J
174 if (C == 0x01c8)
175 return 0x01c9;
176 // LATIN CAPITAL LETTER NJ
177 if (C == 0x01ca)
178 return 0x01cc;
179 if (C < 0x01cb)
180 return C;
181 // 9 characters
182 if (C <= 0x01db && C % 2 == 1)
183 return C + 1;
184 if (C < 0x01de)
185 return C;
186 // 9 characters
187 if (C <= 0x01ee)
188 return C | 1;
189 // LATIN CAPITAL LETTER DZ
190 if (C == 0x01f1)
191 return 0x01f3;
192 if (C < 0x01f2)
193 return C;
194 // 2 characters
195 if (C <= 0x01f4)
196 return C | 1;
197 // LATIN CAPITAL LETTER HWAIR
198 if (C == 0x01f6)
199 return 0x0195;
200 // LATIN CAPITAL LETTER WYNN
201 if (C == 0x01f7)
202 return 0x01bf;
203 if (C < 0x01f8)
204 return C;
205 // 20 characters
206 if (C <= 0x021e)
207 return C | 1;
208 // LATIN CAPITAL LETTER N WITH LONG RIGHT LEG
209 if (C == 0x0220)
210 return 0x019e;
211 if (C < 0x0222)
212 return C;
213 // 9 characters
214 if (C <= 0x0232)
215 return C | 1;
216 // LATIN CAPITAL LETTER A WITH STROKE
217 if (C == 0x023a)
218 return 0x2c65;
219 // LATIN CAPITAL LETTER C WITH STROKE
220 if (C == 0x023b)
221 return 0x023c;
222 // LATIN CAPITAL LETTER L WITH BAR
223 if (C == 0x023d)
224 return 0x019a;
225 // LATIN CAPITAL LETTER T WITH DIAGONAL STROKE
226 if (C == 0x023e)
227 return 0x2c66;
228 // LATIN CAPITAL LETTER GLOTTAL STOP
229 if (C == 0x0241)
230 return 0x0242;
231 // LATIN CAPITAL LETTER B WITH STROKE
232 if (C == 0x0243)
233 return 0x0180;
234 // LATIN CAPITAL LETTER U BAR
235 if (C == 0x0244)
236 return 0x0289;
237 // LATIN CAPITAL LETTER TURNED V
238 if (C == 0x0245)
239 return 0x028c;
240 if (C < 0x0246)
241 return C;
242 // 5 characters
243 if (C <= 0x024e)
244 return C | 1;
245 // COMBINING GREEK YPOGEGRAMMENI
246 if (C == 0x0345)
247 return 0x03b9;
248 if (C < 0x0370)
249 return C;
250 // 2 characters
251 if (C <= 0x0372)
252 return C | 1;
253 // GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA
254 if (C == 0x0376)
255 return 0x0377;
256 // GREEK CAPITAL LETTER YOT
257 if (C == 0x037f)
258 return 0x03f3;
259 // GREEK CAPITAL LETTER ALPHA WITH TONOS
260 if (C == 0x0386)
261 return 0x03ac;
262 if (C < 0x0388)
263 return C;
264 // 3 characters
265 if (C <= 0x038a)
266 return C + 37;
267 // GREEK CAPITAL LETTER OMICRON WITH TONOS
268 if (C == 0x038c)
269 return 0x03cc;
270 if (C < 0x038e)
271 return C;
272 // 2 characters
273 if (C <= 0x038f)
274 return C + 63;
275 if (C < 0x0391)
276 return C;
277 // 17 characters
278 if (C <= 0x03a1)
279 return C + 32;
280 if (C < 0x03a3)
281 return C;
282 // 9 characters
283 if (C <= 0x03ab)
284 return C + 32;
285 // GREEK SMALL LETTER FINAL SIGMA
286 if (C == 0x03c2)
287 return 0x03c3;
288 // GREEK CAPITAL KAI SYMBOL
289 if (C == 0x03cf)
290 return 0x03d7;
291 // GREEK BETA SYMBOL
292 if (C == 0x03d0)
293 return 0x03b2;
294 // GREEK THETA SYMBOL
295 if (C == 0x03d1)
296 return 0x03b8;
297 // GREEK PHI SYMBOL
298 if (C == 0x03d5)
299 return 0x03c6;
300 // GREEK PI SYMBOL
301 if (C == 0x03d6)
302 return 0x03c0;
303 if (C < 0x03d8)
304 return C;
305 // 12 characters
306 if (C <= 0x03ee)
307 return C | 1;
308 // GREEK KAPPA SYMBOL
309 if (C == 0x03f0)
310 return 0x03ba;
311 // GREEK RHO SYMBOL
312 if (C == 0x03f1)
313 return 0x03c1;
314 // GREEK CAPITAL THETA SYMBOL
315 if (C == 0x03f4)
316 return 0x03b8;
317 // GREEK LUNATE EPSILON SYMBOL
318 if (C == 0x03f5)
319 return 0x03b5;
320 // GREEK CAPITAL LETTER SHO
321 if (C == 0x03f7)
322 return 0x03f8;
323 // GREEK CAPITAL LUNATE SIGMA SYMBOL
324 if (C == 0x03f9)
325 return 0x03f2;
326 // GREEK CAPITAL LETTER SAN
327 if (C == 0x03fa)
328 return 0x03fb;
329 if (C < 0x03fd)
330 return C;
331 // 3 characters
332 if (C <= 0x03ff)
333 return C + -130;
334 if (C < 0x0400)
335 return C;
336 // 16 characters
337 if (C <= 0x040f)
338 return C + 80;
339 if (C < 0x0410)
340 return C;
341 // 32 characters
342 if (C <= 0x042f)
343 return C + 32;
344 if (C < 0x0460)
345 return C;
346 // 17 characters
347 if (C <= 0x0480)
348 return C | 1;
349 if (C < 0x048a)
350 return C;
351 // 27 characters
352 if (C <= 0x04be)
353 return C | 1;
354 // CYRILLIC LETTER PALOCHKA
355 if (C == 0x04c0)
356 return 0x04cf;
357 if (C < 0x04c1)
358 return C;
359 // 7 characters
360 if (C <= 0x04cd && C % 2 == 1)
361 return C + 1;
362 if (C < 0x04d0)
363 return C;
364 // 48 characters
365 if (C <= 0x052e)
366 return C | 1;
367 if (C < 0x0531)
368 return C;
369 // 38 characters
370 if (C <= 0x0556)
371 return C + 48;
372 if (C < 0x10a0)
373 return C;
374 // 38 characters
375 if (C <= 0x10c5)
376 return C + 7264;
377 if (C < 0x10c7)
378 return C;
379 // 2 characters
380 if (C <= 0x10cd && C % 6 == 5)
381 return C + 7264;
382 if (C < 0x13f8)
383 return C;
384 // 6 characters
385 if (C <= 0x13fd)
386 return C + -8;
387 // CYRILLIC SMALL LETTER ROUNDED VE
388 if (C == 0x1c80)
389 return 0x0432;
390 // CYRILLIC SMALL LETTER LONG-LEGGED DE
391 if (C == 0x1c81)
392 return 0x0434;
393 // CYRILLIC SMALL LETTER NARROW O
394 if (C == 0x1c82)
395 return 0x043e;
396 if (C < 0x1c83)
397 return C;
398 // 2 characters
399 if (C <= 0x1c84)
400 return C + -6210;
401 // CYRILLIC SMALL LETTER THREE-LEGGED TE
402 if (C == 0x1c85)
403 return 0x0442;
404 // CYRILLIC SMALL LETTER TALL HARD SIGN
405 if (C == 0x1c86)
406 return 0x044a;
407 // CYRILLIC SMALL LETTER TALL YAT
408 if (C == 0x1c87)
409 return 0x0463;
410 // CYRILLIC SMALL LETTER UNBLENDED UK
411 if (C == 0x1c88)
412 return 0xa64b;
413 if (C < 0x1e00)
414 return C;
415 // 75 characters
416 if (C <= 0x1e94)
417 return C | 1;
418 // LATIN SMALL LETTER LONG S WITH DOT ABOVE
419 if (C == 0x1e9b)
420 return 0x1e61;
421 // LATIN CAPITAL LETTER SHARP S
422 if (C == 0x1e9e)
423 return 0x00df;
424 if (C < 0x1ea0)
425 return C;
426 // 48 characters
427 if (C <= 0x1efe)
428 return C | 1;
429 if (C < 0x1f08)
430 return C;
431 // 8 characters
432 if (C <= 0x1f0f)
433 return C + -8;
434 if (C < 0x1f18)
435 return C;
436 // 6 characters
437 if (C <= 0x1f1d)
438 return C + -8;
439 if (C < 0x1f28)
440 return C;
441 // 8 characters
442 if (C <= 0x1f2f)
443 return C + -8;
444 if (C < 0x1f38)
445 return C;
446 // 8 characters
447 if (C <= 0x1f3f)
448 return C + -8;
449 if (C < 0x1f48)
450 return C;
451 // 6 characters
452 if (C <= 0x1f4d)
453 return C + -8;
454 if (C < 0x1f59)
455 return C;
456 // 4 characters
457 if (C <= 0x1f5f && C % 2 == 1)
458 return C + -8;
459 if (C < 0x1f68)
460 return C;
461 // 8 characters
462 if (C <= 0x1f6f)
463 return C + -8;
464 if (C < 0x1f88)
465 return C;
466 // 8 characters
467 if (C <= 0x1f8f)
468 return C + -8;
469 if (C < 0x1f98)
470 return C;
471 // 8 characters
472 if (C <= 0x1f9f)
473 return C + -8;
474 if (C < 0x1fa8)
475 return C;
476 // 8 characters
477 if (C <= 0x1faf)
478 return C + -8;
479 if (C < 0x1fb8)
480 return C;
481 // 2 characters
482 if (C <= 0x1fb9)
483 return C + -8;
484 if (C < 0x1fba)
485 return C;
486 // 2 characters
487 if (C <= 0x1fbb)
488 return C + -74;
489 // GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
490 if (C == 0x1fbc)
491 return 0x1fb3;
492 // GREEK PROSGEGRAMMENI
493 if (C == 0x1fbe)
494 return 0x03b9;
495 if (C < 0x1fc8)
496 return C;
497 // 4 characters
498 if (C <= 0x1fcb)
499 return C + -86;
500 // GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
501 if (C == 0x1fcc)
502 return 0x1fc3;
503 if (C < 0x1fd8)
504 return C;
505 // 2 characters
506 if (C <= 0x1fd9)
507 return C + -8;
508 if (C < 0x1fda)
509 return C;
510 // 2 characters
511 if (C <= 0x1fdb)
512 return C + -100;
513 if (C < 0x1fe8)
514 return C;
515 // 2 characters
516 if (C <= 0x1fe9)
517 return C + -8;
518 if (C < 0x1fea)
519 return C;
520 // 2 characters
521 if (C <= 0x1feb)
522 return C + -112;
523 // GREEK CAPITAL LETTER RHO WITH DASIA
524 if (C == 0x1fec)
525 return 0x1fe5;
526 if (C < 0x1ff8)
527 return C;
528 // 2 characters
529 if (C <= 0x1ff9)
530 return C + -128;
531 if (C < 0x1ffa)
532 return C;
533 // 2 characters
534 if (C <= 0x1ffb)
535 return C + -126;
536 // GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
537 if (C == 0x1ffc)
538 return 0x1ff3;
539 // OHM SIGN
540 if (C == 0x2126)
541 return 0x03c9;
542 // KELVIN SIGN
543 if (C == 0x212a)
544 return 0x006b;
545 // ANGSTROM SIGN
546 if (C == 0x212b)
547 return 0x00e5;
548 // TURNED CAPITAL F
549 if (C == 0x2132)
550 return 0x214e;
551 if (C < 0x2160)
552 return C;
553 // 16 characters
554 if (C <= 0x216f)
555 return C + 16;
556 // ROMAN NUMERAL REVERSED ONE HUNDRED
557 if (C == 0x2183)
558 return 0x2184;
559 if (C < 0x24b6)
560 return C;
561 // 26 characters
562 if (C <= 0x24cf)
563 return C + 26;
564 if (C < 0x2c00)
565 return C;
566 // 47 characters
567 if (C <= 0x2c2e)
568 return C + 48;
569 // LATIN CAPITAL LETTER L WITH DOUBLE BAR
570 if (C == 0x2c60)
571 return 0x2c61;
572 // LATIN CAPITAL LETTER L WITH MIDDLE TILDE
573 if (C == 0x2c62)
574 return 0x026b;
575 // LATIN CAPITAL LETTER P WITH STROKE
576 if (C == 0x2c63)
577 return 0x1d7d;
578 // LATIN CAPITAL LETTER R WITH TAIL
579 if (C == 0x2c64)
580 return 0x027d;
581 if (C < 0x2c67)
582 return C;
583 // 3 characters
584 if (C <= 0x2c6b && C % 2 == 1)
585 return C + 1;
586 // LATIN CAPITAL LETTER ALPHA
587 if (C == 0x2c6d)
588 return 0x0251;
589 // LATIN CAPITAL LETTER M WITH HOOK
590 if (C == 0x2c6e)
591 return 0x0271;
592 // LATIN CAPITAL LETTER TURNED A
593 if (C == 0x2c6f)
594 return 0x0250;
595 // LATIN CAPITAL LETTER TURNED ALPHA
596 if (C == 0x2c70)
597 return 0x0252;
598 if (C < 0x2c72)
599 return C;
600 // 2 characters
601 if (C <= 0x2c75 && C % 3 == 2)
602 return C + 1;
603 if (C < 0x2c7e)
604 return C;
605 // 2 characters
606 if (C <= 0x2c7f)
607 return C + -10815;
608 if (C < 0x2c80)
609 return C;
610 // 50 characters
611 if (C <= 0x2ce2)
612 return C | 1;
613 if (C < 0x2ceb)
614 return C;
615 // 2 characters
616 if (C <= 0x2ced && C % 2 == 1)
617 return C + 1;
618 if (C < 0x2cf2)
619 return C;
620 // 2 characters
621 if (C <= 0xa640 && C % 31054 == 11506)
622 return C + 1;
623 if (C < 0xa642)
624 return C;
625 // 22 characters
626 if (C <= 0xa66c)
627 return C | 1;
628 if (C < 0xa680)
629 return C;
630 // 14 characters
631 if (C <= 0xa69a)
632 return C | 1;
633 if (C < 0xa722)
634 return C;
635 // 7 characters
636 if (C <= 0xa72e)
637 return C | 1;
638 if (C < 0xa732)
639 return C;
640 // 31 characters
641 if (C <= 0xa76e)
642 return C | 1;
643 if (C < 0xa779)
644 return C;
645 // 2 characters
646 if (C <= 0xa77b && C % 2 == 1)
647 return C + 1;
648 // LATIN CAPITAL LETTER INSULAR G
649 if (C == 0xa77d)
650 return 0x1d79;
651 if (C < 0xa77e)
652 return C;
653 // 5 characters
654 if (C <= 0xa786)
655 return C | 1;
656 // LATIN CAPITAL LETTER SALTILLO
657 if (C == 0xa78b)
658 return 0xa78c;
659 // LATIN CAPITAL LETTER TURNED H
660 if (C == 0xa78d)
661 return 0x0265;
662 if (C < 0xa790)
663 return C;
664 // 2 characters
665 if (C <= 0xa792)
666 return C | 1;
667 if (C < 0xa796)
668 return C;
669 // 10 characters
670 if (C <= 0xa7a8)
671 return C | 1;
672 // LATIN CAPITAL LETTER H WITH HOOK
673 if (C == 0xa7aa)
674 return 0x0266;
675 // LATIN CAPITAL LETTER REVERSED OPEN E
676 if (C == 0xa7ab)
677 return 0x025c;
678 // LATIN CAPITAL LETTER SCRIPT G
679 if (C == 0xa7ac)
680 return 0x0261;
681 // LATIN CAPITAL LETTER L WITH BELT
682 if (C == 0xa7ad)
683 return 0x026c;
684 // LATIN CAPITAL LETTER SMALL CAPITAL I
685 if (C == 0xa7ae)
686 return 0x026a;
687 // LATIN CAPITAL LETTER TURNED K
688 if (C == 0xa7b0)
689 return 0x029e;
690 // LATIN CAPITAL LETTER TURNED T
691 if (C == 0xa7b1)
692 return 0x0287;
693 // LATIN CAPITAL LETTER J WITH CROSSED-TAIL
694 if (C == 0xa7b2)
695 return 0x029d;
696 // LATIN CAPITAL LETTER CHI
697 if (C == 0xa7b3)
698 return 0xab53;
699 if (C < 0xa7b4)
700 return C;
701 // 2 characters
702 if (C <= 0xa7b6)
703 return C | 1;
704 if (C < 0xab70)
705 return C;
706 // 80 characters
707 if (C <= 0xabbf)
708 return C + -38864;
709 if (C < 0xff21)
710 return C;
711 // 26 characters
712 if (C <= 0xff3a)
713 return C + 32;
714 if (C < 0x10400)
715 return C;
716 // 40 characters
717 if (C <= 0x10427)
718 return C + 40;
719 if (C < 0x104b0)
720 return C;
721 // 36 characters
722 if (C <= 0x104d3)
723 return C + 40;
724 if (C < 0x10c80)
725 return C;
726 // 51 characters
727 if (C <= 0x10cb2)
728 return C + 64;
729 if (C < 0x118a0)
730 return C;
731 // 32 characters
732 if (C <= 0x118bf)
733 return C + 32;
734 if (C < 0x1e900)
735 return C;
736 // 34 characters
737 if (C <= 0x1e921)
738 return C + 34;
739
740 return C;
741 }
1818 ConvertUTFTest.cpp
1919 DataExtractorTest.cpp
2020 DebugTest.cpp
21 DJBTest.cpp
2122 EndianStreamTest.cpp
2223 EndianTest.cpp
2324 ErrnoTest.cpp
0 //===---------- llvm/unittest/Support/DJBTest.cpp -------------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "llvm/Support/DJB.h"
10 #include "llvm/ADT/Twine.h"
11 #include "gtest/gtest.h"
12
13 using namespace llvm;
14
15 TEST(DJBTest, caseFolding) {
16 struct TestCase {
17 StringLiteral One;
18 StringLiteral Two;
19 };
20
21 static constexpr TestCase Tests[] = {
22 {"ASDF", "asdf"},
23 {"qWeR", "QwEr"},
24 {"qqqqqqqqqqqqqqqqqqqq", "QQQQQQQQQQQQQQQQQQQQ"},
25
26 {"I", "i"},
27 // Latin Small Letter Dotless I
28 {u8"\u0130", "i"},
29 // Latin Capital Letter I With Dot Above
30 {u8"\u0131", "i"},
31
32 // Latin Capital Letter A With Grave
33 {u8"\u00c0", u8"\u00e0"},
34 // Latin Capital Letter A With Macron
35 {u8"\u0100", u8"\u0101"},
36 // Latin Capital Letter L With Acute
37 {u8"\u0139", u8"\u013a"},
38 // Cyrillic Capital Letter Ie
39 {u8"\u0415", u8"\u0435"},
40 // Latin Capital Letter A With Circumflex And Grave
41 {u8"\u1ea6", u8"\u1ea7"},
42 // Kelvin Sign
43 {u8"\u212a", u8"\u006b"},
44 // Glagolitic Capital Letter Chrivi
45 {u8"\u2c1d", u8"\u2c4d"},
46 // Fullwidth Latin Capital Letter M
47 {u8"\uff2d", u8"\uff4d"},
48 // Old Hungarian Capital Letter Ej
49 {u8"\U00010c92", u8"\U00010cd2"},
50 };
51
52 for (const TestCase &T : Tests) {
53 SCOPED_TRACE("Comparing '" + T.One + "' and '" + T.Two + "'");
54 EXPECT_EQ(caseFoldingDjbHash(T.One), caseFoldingDjbHash(T.Two));
55 }
56 }
57
58 TEST(DJBTest, knownValuesLowerCase) {
59 struct TestCase {
60 StringLiteral Text;
61 uint32_t Hash;
62 };
63 static constexpr TestCase Tests[] = {
64 {"", 5381u},
65 {"f", 177675u},
66 {"fo", 5863386u},
67 {"foo", 193491849u},
68 {"foob", 2090263819u},
69 {"fooba", 259229388u},
70 {"foobar", 4259602622u},
71 {"pneumonoultramicroscopicsilicovolcanoconiosis", 3999417781u},
72 };
73
74 for (const TestCase &T : Tests) {
75 SCOPED_TRACE("Text: '" + T.Text + "'");
76 EXPECT_EQ(T.Hash, djbHash(T.Text));
77 EXPECT_EQ(T.Hash, caseFoldingDjbHash(T.Text));
78 EXPECT_EQ(T.Hash, caseFoldingDjbHash(T.Text.upper()));
79 }
80 }
81
82 TEST(DJBTest, knownValuesUnicode) {
83 EXPECT_EQ(
84 2326183139u,
85 caseFoldingDjbHash(
86 u8"\u0130\u0131\u00c0\u00e0\u0100\u0101\u0139\u013a\u0415\u0435\u1ea6"
87 u8"\u1ea7\u212a\u006b\u2c1d\u2c4d\uff2d\uff4d\U00010c92\U00010cd2"));
88 }
0 #!/usr/bin/env python
1 """
2 Unicode case folding database conversion utility
3
4 Parses the database and generates a C++ function which implements the case
5 folding algorithm. The database entries are of the form:
6
7 ; ; ; #
8
9 can be one of four characters:
10 C - Common mappings
11 S - mappings for Simple case folding
12 F - mappings for Full case folding
13 T - special case for Turkish I characters
14
15 Right now this generates a function which implements simple case folding (C+S
16 entries).
17 """
18
19 import sys
20 import re
21 import urllib2
22
23 # This variable will hold the body of the mappings function
24 body = ""
25
# Generator over the Common (C) and Simple (S) case fold entries of the
# database `f` (any iterable of text lines). Lines that do not match the
# entry pattern are skipped. Yields one (from_char, to_char, from_name) tuple
# per entry and raises an Exception if the source characters are not strictly
# increasing.
def mappings(f):
    entry_pattern = re.compile(r'^(.*); [CS]; (.*); # (.*)')
    last_seen = -1
    for line in f:
        match = entry_pattern.match(line)
        if match is None:
            continue
        source, target, from_name = match.groups()
        from_char = int(source, 16)
        to_char = int(target, 16)

        if from_char <= last_seen:
            raise Exception("Duplicate or unsorted characters in input")
        yield from_char, to_char, from_name
        last_seen = from_char
42
# Returns the shift of a mapping, i.e. how far the folded character lies from
# the original one (to_char - from_char).
def shift(mapping):
    from_char, to_char = mapping[0], mapping[1]
    return to_char - from_char
46
# Returns the stride between two mappings: the distance between their source
# characters (from_char2 - from_char1).
def stride2(mapping1, mapping2):
    first_from = mapping1[0]
    second_from = mapping2[0]
    return second_from - first_from
50
# Returns the stride of a block (a list of mappings), measured between its
# first two entries. The block needs at least two mappings, and all of them
# are assumed to be spaced by the same stride.
def stride(block):
    head, following = block[0], block[1]
    return following[0] - head[0]
55
56
# Appends the C++ code handling one block of mappings to the global `body`.
#
# b is a list of mappings. All the mappings are assumed to have the same
# shift and the stride between adjacent mappings (if any) is constant.
# An empty list emits nothing.
def dump_block(b):
    global body

    if not b:
        # Nothing was collected (e.g. the input contained no C/S mappings).
        return

    if len(b) == 1:
        # Special case for handling blocks of length 1. We don't even need to
        # emit the "if (C < X) return C" check below as all characters in this
        # range will be caught by the "C < X" check emitted by the first
        # non-trivial block.
        body += " // {2}\n if (C == {0:#06x})\n return {1:#06x};\n".format(*b[0])
        return

    step = stride(b)
    delta = shift(b[0])
    first = b[0][0]
    last = first + step * (len(b) - 1)
    modulo = first % step

    # All characters before this block map to themselves.
    body += " if (C < {0:#06x})\n return C;\n".format(first)
    body += " // {0} characters\n".format(len(b))

    if step == 2 and delta == 1 and modulo == 0:
        # Special case:
        # We can elide the modulo-check because the expression "C|1" will map
        # the intervening characters to themselves.
        pattern = " if (C <= {0:#06x})\n return C | 1;\n"
    elif step == 1:
        # Another special case: X % 1 is always zero, so don't emit the
        # modulo-check.
        pattern = " if (C <= {0:#06x})\n return C + {3};\n"
    else:
        # Generic pattern: check upper bound (lower bound is checked by the
        # "if" emitted above) and modulo of C, return C+shift.
        pattern = " if (C <= {0:#06x} && C % {1} == {2})\n return C + {3};\n"

    body += pattern.format(last, step, modulo, delta)
93
94 current_block = []
95 f = urllib2.urlopen(sys.argv[1])
96 for m in mappings(f):
97 if len(current_block) == 0:
98 current_block.append(m)
99 continue
100
101 if shift(current_block[0]) != shift(m):
102 # Incompatible shift, start a new block.
103 dump_block(current_block)
104 current_block = [m]
105 continue
106
107 if len(current_block) == 1 or stride(current_block) == stride2(current_block[-1], m):
108 current_block.append(m)
109 continue
110
111 # Incompatible stride, start a new block.
112 dump_block(current_block)
113 current_block = [m]
114 f.close()
115
116 dump_block(current_block)
117
118 print '//===---------- Support/UnicodeCaseFold.cpp -------------------------------===//'
119 print '//'
120 print '// This file was generated by utils/unicode-case-fold.py from the Unicode'
121 print '// case folding database at'
122 print '// ', sys.argv[1]
123 print '//'
124 print '// To regenerate this file, run:'
125 print '// utils/unicode-case-fold.py \\'
126 print '// "{}" \\'.format(sys.argv[1])
127 print '// > lib/Support/UnicodeCaseFold.cpp'
128 print '//'
129 print '//===----------------------------------------------------------------------===//'
130 print ''
131 print '#include "llvm/Support/Unicode.h"'
132 print ''
133 print "int llvm::sys::unicode::foldCharSimple(int C) {"
134 print body
135 print " return C;"
136 print "}"