llvm.org GIT mirror llvm / 274527c
Resubmit r325107 (case folding DJB hash) The issue was that the hash function was generating different results depending on the signedness of char on the host platform. This commit fixes the issue by explicitly using an unsigned char type to prevent sign extension and adds some extra tests. The original commit message was: This patch implements a variant of the DJB hash function which folds the input according to the algorithm in the Dwarf 5 specification (Section 6.1.1.4.5), which in turn references the Unicode Standard (Section 5.18, "Case Mappings"). To achieve this, I have added a llvm::sys::unicode::foldCharSimple function, which performs this mapping. The implementation of this function was generated from the CaseFolding.txt file from the Unicode spec using a python script (which is also included in this patch). The script tries to optimize the function by coalescing adjacent mappings with the same shift and stride (terms I made up). Theoretically, it could be made a bit smarter and merge adjacent blocks that were interrupted by only one or two characters with exceptional mapping, but this would save only a couple of branches, while it would greatly complicate the implementation, so I deemed it was not worth it. Since we assume that the vast majority of the input characters will be US-ASCII, the folding hash function has a fast-path for handling these, and only whips out the full decode+fold+encode logic if we encounter a character outside of this range. It might be possible to implement the folding directly on utf8 sequences, but this would also bring a lot of complexity for the few cases where we will actually need to process non-ascii characters. Reviewers: JDevlieghere, aprantl, probinson, dblaikie Subscribers: mgorny, hintonda, echristo, clayborg, vleschuk, llvm-commits Differential Revision: https://reviews.llvm.org/D42740 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@325732 91177308-0d34-0410-b5e6-96231b3b80d8 Pavel Labath 2 years ago
8 changed file(s) with 1063 addition(s) and 2 deletion(s). Raw diff Collapse all Expand all
1919
2020 /// The Bernstein hash function used by the DWARF accelerator tables.
2121 uint32_t djbHash(StringRef Buffer, uint32_t H = 5381);
22
23 /// Computes the Bernstein hash after folding the input according to the Dwarf 5
24 /// standard case folding rules.
25 uint32_t caseFoldingDjbHash(StringRef Buffer, uint32_t H = 5381);
2226 } // namespace llvm
2327
2428 #endif // LLVM_SUPPORT_DJB_H
5959 /// * 1 for each of the remaining characters.
6060 int columnWidthUTF8(StringRef Text);
6161
62 /// Fold input unicode character according to the Simple unicode case folding
63 /// rules.
64 int foldCharSimple(int C);
65
6266 } // namespace unicode
6367 } // namespace sys
6468 } // namespace llvm
112112 Triple.cpp
113113 Twine.cpp
114114 Unicode.cpp
115 UnicodeCaseFold.cpp
115116 YAMLParser.cpp
116117 YAMLTraits.cpp
117118 raw_os_ostream.cpp
1111 //===----------------------------------------------------------------------===//
1212
1313 #include "llvm/Support/DJB.h"
14 #include "llvm/ADT/ArrayRef.h"
15 #include "llvm/Support/Compiler.h"
16 #include "llvm/Support/ConvertUTF.h"
17 #include "llvm/Support/Unicode.h"
18
19 using namespace llvm;
20
/// Mixes a single byte into a running Bernstein (DJB) hash and returns the new
/// hash value: H * 33 + C, computed with 32-bit unsigned wraparound.
///
/// \p C is an unsigned char, so bytes >= 0x80 contribute 128..255 regardless
/// of whether plain char is signed on the host.
static inline uint32_t djbHashChar(unsigned char C, uint32_t H) {
  // H * 33 is the same value as (H << 5) + H modulo 2^32.
  return H * 33u + C;
}
1424
1525 uint32_t llvm::djbHash(StringRef Buffer, uint32_t H) {
16 for (char C : Buffer.bytes())
17 H = ((H << 5) + H) + C;
26 for (unsigned char C : Buffer.bytes())
27 H = djbHashChar(C, H);
1828 return H;
1929 }
30
31 static UTF32 chopOneUTF32(StringRef &Buffer) {
32 UTF32 C;
33 const UTF8 *const Begin8Const =
34 reinterpret_cast(Buffer.begin());
35 const UTF8 *Begin8 = Begin8Const;
36 UTF32 *Begin32 = &C;
37
38 // In lenient mode we will always end up with a "reasonable" value in C for
39 // non-empty input.
40 assert(!Buffer.empty());
41 ConvertUTF8toUTF32(&Begin8, reinterpret_cast(Buffer.end()),
42 &Begin32, &C + 1, lenientConversion);
43 Buffer = Buffer.drop_front(Begin8 - Begin8Const);
44 return C;
45 }
46
47 static StringRef toUTF8(UTF32 C, MutableArrayRef Storage) {
48 const UTF32 *Begin32 = &C;
49 UTF8 *Begin8 = Storage.begin();
50
51 // The case-folded output should always be a valid unicode character, so use
52 // strict mode here.
53 ConversionResult CR = ConvertUTF32toUTF8(&Begin32, &C + 1, &Begin8,
54 Storage.end(), strictConversion);
55 assert(CR == conversionOK && "Case folding produced invalid char?");
56 (void)CR;
57 return StringRef(reinterpret_cast(Storage.begin()),
58 Begin8 - Storage.begin());
59 }
60
61 static UTF32 foldCharDwarf(UTF32 C) {
62 // DWARF v5 addition to the unicode folding rules.
63 // Fold "Latin Small Letter Dotless I" and "Latin Capital Letter I With Dot
64 // Above" into "i".
65 if (C == 0x130 || C == 0x131)
66 return 'i';
67 return sys::unicode::foldCharSimple(C);
68 }
69
70 static uint32_t caseFoldingDjbHashCharSlow(StringRef &Buffer, uint32_t H) {
71 UTF32 C = chopOneUTF32(Buffer);
72
73 C = foldCharDwarf(C);
74
75 std::array Storage;
76 StringRef Folded = toUTF8(C, Storage);
77 return djbHash(Folded, H);
78 }
79
80 uint32_t llvm::caseFoldingDjbHash(StringRef Buffer, uint32_t H) {
81 while (!Buffer.empty()) {
82 unsigned char C = Buffer.front();
83 if (LLVM_LIKELY(C <= 0x7f)) {
84 // US-ASCII, encoded as one character in utf-8.
85 // This is by far the most common case, so handle this specially.
86 if (C >= 'A' && C <= 'Z')
87 C = 'a' + (C - 'A'); // fold uppercase into lowercase
88 H = djbHashChar(C, H);
89 Buffer = Buffer.drop_front();
90 continue;
91 }
92 H = caseFoldingDjbHashCharSlow(Buffer, H);
93 }
94 return H;
95 }
0 //===---------- Support/UnicodeCaseFold.cpp -------------------------------===//
1 //
2 // This file was generated by utils/unicode-case-fold.py from the Unicode
3 // case folding database at
4 // http://www.unicode.org/Public/9.0.0/ucd/CaseFolding.txt
5 //
6 // To regenerate this file, run:
7 // utils/unicode-case-fold.py \
8 // "http://www.unicode.org/Public/9.0.0/ucd/CaseFolding.txt" \
9 // > lib/Support/UnicodeCaseFold.cpp
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "llvm/Support/Unicode.h"
14
/// Applies the simple case folding mapping to the code point \p C and returns
/// the folded code point; a value that matches none of the tests below is
/// returned unchanged.
///
/// The tests are ordered by ascending code point and must stay in that order:
/// each "if (C < X) return C;" guard rejects everything below the next mapped
/// range and thereby supplies the lower bound for the "C <= Y" test that
/// follows it. "C | 1" folds every even code point of a range to its odd
/// successor and leaves the odd ones untouched.
15 int llvm::sys::unicode::foldCharSimple(int C) {
16 if (C < 0x0041)
17 return C;
18 // 26 characters
19 if (C <= 0x005a)
20 return C + 32;
21 // MICRO SIGN
22 if (C == 0x00b5)
23 return 0x03bc;
24 if (C < 0x00c0)
25 return C;
26 // 23 characters
27 if (C <= 0x00d6)
28 return C + 32;
29 if (C < 0x00d8)
30 return C;
31 // 7 characters
32 if (C <= 0x00de)
33 return C + 32;
34 if (C < 0x0100)
35 return C;
36 // 24 characters
37 if (C <= 0x012e)
38 return C | 1;
39 if (C < 0x0132)
40 return C;
41 // 3 characters
42 if (C <= 0x0136)
43 return C | 1;
44 if (C < 0x0139)
45 return C;
46 // 8 characters
47 if (C <= 0x0147 && C % 2 == 1)
48 return C + 1;
49 if (C < 0x014a)
50 return C;
51 // 23 characters
52 if (C <= 0x0176)
53 return C | 1;
54 // LATIN CAPITAL LETTER Y WITH DIAERESIS
55 if (C == 0x0178)
56 return 0x00ff;
57 if (C < 0x0179)
58 return C;
59 // 3 characters
60 if (C <= 0x017d && C % 2 == 1)
61 return C + 1;
62 // LATIN SMALL LETTER LONG S
63 if (C == 0x017f)
64 return 0x0073;
65 // LATIN CAPITAL LETTER B WITH HOOK
66 if (C == 0x0181)
67 return 0x0253;
68 if (C < 0x0182)
69 return C;
70 // 2 characters
71 if (C <= 0x0184)
72 return C | 1;
73 // LATIN CAPITAL LETTER OPEN O
74 if (C == 0x0186)
75 return 0x0254;
76 // LATIN CAPITAL LETTER C WITH HOOK
77 if (C == 0x0187)
78 return 0x0188;
79 if (C < 0x0189)
80 return C;
81 // 2 characters
82 if (C <= 0x018a)
83 return C + 205;
84 // LATIN CAPITAL LETTER D WITH TOPBAR
85 if (C == 0x018b)
86 return 0x018c;
87 // LATIN CAPITAL LETTER REVERSED E
88 if (C == 0x018e)
89 return 0x01dd;
90 // LATIN CAPITAL LETTER SCHWA
91 if (C == 0x018f)
92 return 0x0259;
93 // LATIN CAPITAL LETTER OPEN E
94 if (C == 0x0190)
95 return 0x025b;
96 // LATIN CAPITAL LETTER F WITH HOOK
97 if (C == 0x0191)
98 return 0x0192;
99 // LATIN CAPITAL LETTER G WITH HOOK
100 if (C == 0x0193)
101 return 0x0260;
102 // LATIN CAPITAL LETTER GAMMA
103 if (C == 0x0194)
104 return 0x0263;
105 // LATIN CAPITAL LETTER IOTA
106 if (C == 0x0196)
107 return 0x0269;
108 // LATIN CAPITAL LETTER I WITH STROKE
109 if (C == 0x0197)
110 return 0x0268;
111 // LATIN CAPITAL LETTER K WITH HOOK
112 if (C == 0x0198)
113 return 0x0199;
114 // LATIN CAPITAL LETTER TURNED M
115 if (C == 0x019c)
116 return 0x026f;
117 // LATIN CAPITAL LETTER N WITH LEFT HOOK
118 if (C == 0x019d)
119 return 0x0272;
120 // LATIN CAPITAL LETTER O WITH MIDDLE TILDE
121 if (C == 0x019f)
122 return 0x0275;
123 if (C < 0x01a0)
124 return C;
125 // 3 characters
126 if (C <= 0x01a4)
127 return C | 1;
128 // LATIN LETTER YR
129 if (C == 0x01a6)
130 return 0x0280;
131 // LATIN CAPITAL LETTER TONE TWO
132 if (C == 0x01a7)
133 return 0x01a8;
134 // LATIN CAPITAL LETTER ESH
135 if (C == 0x01a9)
136 return 0x0283;
137 // LATIN CAPITAL LETTER T WITH HOOK
138 if (C == 0x01ac)
139 return 0x01ad;
140 // LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
141 if (C == 0x01ae)
142 return 0x0288;
143 // LATIN CAPITAL LETTER U WITH HORN
144 if (C == 0x01af)
145 return 0x01b0;
146 if (C < 0x01b1)
147 return C;
148 // 2 characters
149 if (C <= 0x01b2)
150 return C + 217;
151 if (C < 0x01b3)
152 return C;
153 // 2 characters
154 if (C <= 0x01b5 && C % 2 == 1)
155 return C + 1;
156 // LATIN CAPITAL LETTER EZH
157 if (C == 0x01b7)
158 return 0x0292;
159 if (C < 0x01b8)
160 return C;
161 // 2 characters
162 if (C <= 0x01bc && C % 4 == 0)
163 return C + 1;
164 // LATIN CAPITAL LETTER DZ WITH CARON
165 if (C == 0x01c4)
166 return 0x01c6;
167 // LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
168 if (C == 0x01c5)
169 return 0x01c6;
170 // LATIN CAPITAL LETTER LJ
171 if (C == 0x01c7)
172 return 0x01c9;
173 // LATIN CAPITAL LETTER L WITH SMALL LETTER J
174 if (C == 0x01c8)
175 return 0x01c9;
176 // LATIN CAPITAL LETTER NJ
177 if (C == 0x01ca)
178 return 0x01cc;
179 if (C < 0x01cb)
180 return C;
181 // 9 characters
182 if (C <= 0x01db && C % 2 == 1)
183 return C + 1;
184 if (C < 0x01de)
185 return C;
186 // 9 characters
187 if (C <= 0x01ee)
188 return C | 1;
189 // LATIN CAPITAL LETTER DZ
190 if (C == 0x01f1)
191 return 0x01f3;
192 if (C < 0x01f2)
193 return C;
194 // 2 characters
195 if (C <= 0x01f4)
196 return C | 1;
197 // LATIN CAPITAL LETTER HWAIR
198 if (C == 0x01f6)
199 return 0x0195;
200 // LATIN CAPITAL LETTER WYNN
201 if (C == 0x01f7)
202 return 0x01bf;
203 if (C < 0x01f8)
204 return C;
205 // 20 characters
206 if (C <= 0x021e)
207 return C | 1;
208 // LATIN CAPITAL LETTER N WITH LONG RIGHT LEG
209 if (C == 0x0220)
210 return 0x019e;
211 if (C < 0x0222)
212 return C;
213 // 9 characters
214 if (C <= 0x0232)
215 return C | 1;
216 // LATIN CAPITAL LETTER A WITH STROKE
217 if (C == 0x023a)
218 return 0x2c65;
219 // LATIN CAPITAL LETTER C WITH STROKE
220 if (C == 0x023b)
221 return 0x023c;
222 // LATIN CAPITAL LETTER L WITH BAR
223 if (C == 0x023d)
224 return 0x019a;
225 // LATIN CAPITAL LETTER T WITH DIAGONAL STROKE
226 if (C == 0x023e)
227 return 0x2c66;
228 // LATIN CAPITAL LETTER GLOTTAL STOP
229 if (C == 0x0241)
230 return 0x0242;
231 // LATIN CAPITAL LETTER B WITH STROKE
232 if (C == 0x0243)
233 return 0x0180;
234 // LATIN CAPITAL LETTER U BAR
235 if (C == 0x0244)
236 return 0x0289;
237 // LATIN CAPITAL LETTER TURNED V
238 if (C == 0x0245)
239 return 0x028c;
240 if (C < 0x0246)
241 return C;
242 // 5 characters
243 if (C <= 0x024e)
244 return C | 1;
245 // COMBINING GREEK YPOGEGRAMMENI
246 if (C == 0x0345)
247 return 0x03b9;
248 if (C < 0x0370)
249 return C;
250 // 2 characters
251 if (C <= 0x0372)
252 return C | 1;
253 // GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA
254 if (C == 0x0376)
255 return 0x0377;
256 // GREEK CAPITAL LETTER YOT
257 if (C == 0x037f)
258 return 0x03f3;
259 // GREEK CAPITAL LETTER ALPHA WITH TONOS
260 if (C == 0x0386)
261 return 0x03ac;
262 if (C < 0x0388)
263 return C;
264 // 3 characters
265 if (C <= 0x038a)
266 return C + 37;
267 // GREEK CAPITAL LETTER OMICRON WITH TONOS
268 if (C == 0x038c)
269 return 0x03cc;
270 if (C < 0x038e)
271 return C;
272 // 2 characters
273 if (C <= 0x038f)
274 return C + 63;
275 if (C < 0x0391)
276 return C;
277 // 17 characters
278 if (C <= 0x03a1)
279 return C + 32;
280 if (C < 0x03a3)
281 return C;
282 // 9 characters
283 if (C <= 0x03ab)
284 return C + 32;
285 // GREEK SMALL LETTER FINAL SIGMA
286 if (C == 0x03c2)
287 return 0x03c3;
288 // GREEK CAPITAL KAI SYMBOL
289 if (C == 0x03cf)
290 return 0x03d7;
291 // GREEK BETA SYMBOL
292 if (C == 0x03d0)
293 return 0x03b2;
294 // GREEK THETA SYMBOL
295 if (C == 0x03d1)
296 return 0x03b8;
297 // GREEK PHI SYMBOL
298 if (C == 0x03d5)
299 return 0x03c6;
300 // GREEK PI SYMBOL
301 if (C == 0x03d6)
302 return 0x03c0;
303 if (C < 0x03d8)
304 return C;
305 // 12 characters
306 if (C <= 0x03ee)
307 return C | 1;
308 // GREEK KAPPA SYMBOL
309 if (C == 0x03f0)
310 return 0x03ba;
311 // GREEK RHO SYMBOL
312 if (C == 0x03f1)
313 return 0x03c1;
314 // GREEK CAPITAL THETA SYMBOL
315 if (C == 0x03f4)
316 return 0x03b8;
317 // GREEK LUNATE EPSILON SYMBOL
318 if (C == 0x03f5)
319 return 0x03b5;
320 // GREEK CAPITAL LETTER SHO
321 if (C == 0x03f7)
322 return 0x03f8;
323 // GREEK CAPITAL LUNATE SIGMA SYMBOL
324 if (C == 0x03f9)
325 return 0x03f2;
326 // GREEK CAPITAL LETTER SAN
327 if (C == 0x03fa)
328 return 0x03fb;
329 if (C < 0x03fd)
330 return C;
331 // 3 characters
332 if (C <= 0x03ff)
333 return C + -130;
334 if (C < 0x0400)
335 return C;
336 // 16 characters
337 if (C <= 0x040f)
338 return C + 80;
339 if (C < 0x0410)
340 return C;
341 // 32 characters
342 if (C <= 0x042f)
343 return C + 32;
344 if (C < 0x0460)
345 return C;
346 // 17 characters
347 if (C <= 0x0480)
348 return C | 1;
349 if (C < 0x048a)
350 return C;
351 // 27 characters
352 if (C <= 0x04be)
353 return C | 1;
354 // CYRILLIC LETTER PALOCHKA
355 if (C == 0x04c0)
356 return 0x04cf;
357 if (C < 0x04c1)
358 return C;
359 // 7 characters
360 if (C <= 0x04cd && C % 2 == 1)
361 return C + 1;
362 if (C < 0x04d0)
363 return C;
364 // 48 characters
365 if (C <= 0x052e)
366 return C | 1;
367 if (C < 0x0531)
368 return C;
369 // 38 characters
370 if (C <= 0x0556)
371 return C + 48;
372 if (C < 0x10a0)
373 return C;
374 // 38 characters
375 if (C <= 0x10c5)
376 return C + 7264;
377 if (C < 0x10c7)
378 return C;
379 // 2 characters
380 if (C <= 0x10cd && C % 6 == 5)
381 return C + 7264;
382 if (C < 0x13f8)
383 return C;
384 // 6 characters
385 if (C <= 0x13fd)
386 return C + -8;
387 // CYRILLIC SMALL LETTER ROUNDED VE
388 if (C == 0x1c80)
389 return 0x0432;
390 // CYRILLIC SMALL LETTER LONG-LEGGED DE
391 if (C == 0x1c81)
392 return 0x0434;
393 // CYRILLIC SMALL LETTER NARROW O
394 if (C == 0x1c82)
395 return 0x043e;
396 if (C < 0x1c83)
397 return C;
398 // 2 characters
399 if (C <= 0x1c84)
400 return C + -6210;
401 // CYRILLIC SMALL LETTER THREE-LEGGED TE
402 if (C == 0x1c85)
403 return 0x0442;
404 // CYRILLIC SMALL LETTER TALL HARD SIGN
405 if (C == 0x1c86)
406 return 0x044a;
407 // CYRILLIC SMALL LETTER TALL YAT
408 if (C == 0x1c87)
409 return 0x0463;
410 // CYRILLIC SMALL LETTER UNBLENDED UK
411 if (C == 0x1c88)
412 return 0xa64b;
413 if (C < 0x1e00)
414 return C;
415 // 75 characters
416 if (C <= 0x1e94)
417 return C | 1;
418 // LATIN SMALL LETTER LONG S WITH DOT ABOVE
419 if (C == 0x1e9b)
420 return 0x1e61;
421 // LATIN CAPITAL LETTER SHARP S
422 if (C == 0x1e9e)
423 return 0x00df;
424 if (C < 0x1ea0)
425 return C;
426 // 48 characters
427 if (C <= 0x1efe)
428 return C | 1;
429 if (C < 0x1f08)
430 return C;
431 // 8 characters
432 if (C <= 0x1f0f)
433 return C + -8;
434 if (C < 0x1f18)
435 return C;
436 // 6 characters
437 if (C <= 0x1f1d)
438 return C + -8;
439 if (C < 0x1f28)
440 return C;
441 // 8 characters
442 if (C <= 0x1f2f)
443 return C + -8;
444 if (C < 0x1f38)
445 return C;
446 // 8 characters
447 if (C <= 0x1f3f)
448 return C + -8;
449 if (C < 0x1f48)
450 return C;
451 // 6 characters
452 if (C <= 0x1f4d)
453 return C + -8;
454 if (C < 0x1f59)
455 return C;
456 // 4 characters
457 if (C <= 0x1f5f && C % 2 == 1)
458 return C + -8;
459 if (C < 0x1f68)
460 return C;
461 // 8 characters
462 if (C <= 0x1f6f)
463 return C + -8;
464 if (C < 0x1f88)
465 return C;
466 // 8 characters
467 if (C <= 0x1f8f)
468 return C + -8;
469 if (C < 0x1f98)
470 return C;
471 // 8 characters
472 if (C <= 0x1f9f)
473 return C + -8;
474 if (C < 0x1fa8)
475 return C;
476 // 8 characters
477 if (C <= 0x1faf)
478 return C + -8;
479 if (C < 0x1fb8)
480 return C;
481 // 2 characters
482 if (C <= 0x1fb9)
483 return C + -8;
484 if (C < 0x1fba)
485 return C;
486 // 2 characters
487 if (C <= 0x1fbb)
488 return C + -74;
489 // GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
490 if (C == 0x1fbc)
491 return 0x1fb3;
492 // GREEK PROSGEGRAMMENI
493 if (C == 0x1fbe)
494 return 0x03b9;
495 if (C < 0x1fc8)
496 return C;
497 // 4 characters
498 if (C <= 0x1fcb)
499 return C + -86;
500 // GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
501 if (C == 0x1fcc)
502 return 0x1fc3;
503 if (C < 0x1fd8)
504 return C;
505 // 2 characters
506 if (C <= 0x1fd9)
507 return C + -8;
508 if (C < 0x1fda)
509 return C;
510 // 2 characters
511 if (C <= 0x1fdb)
512 return C + -100;
513 if (C < 0x1fe8)
514 return C;
515 // 2 characters
516 if (C <= 0x1fe9)
517 return C + -8;
518 if (C < 0x1fea)
519 return C;
520 // 2 characters
521 if (C <= 0x1feb)
522 return C + -112;
523 // GREEK CAPITAL LETTER RHO WITH DASIA
524 if (C == 0x1fec)
525 return 0x1fe5;
526 if (C < 0x1ff8)
527 return C;
528 // 2 characters
529 if (C <= 0x1ff9)
530 return C + -128;
531 if (C < 0x1ffa)
532 return C;
533 // 2 characters
534 if (C <= 0x1ffb)
535 return C + -126;
536 // GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
537 if (C == 0x1ffc)
538 return 0x1ff3;
539 // OHM SIGN
540 if (C == 0x2126)
541 return 0x03c9;
542 // KELVIN SIGN
543 if (C == 0x212a)
544 return 0x006b;
545 // ANGSTROM SIGN
546 if (C == 0x212b)
547 return 0x00e5;
548 // TURNED CAPITAL F
549 if (C == 0x2132)
550 return 0x214e;
551 if (C < 0x2160)
552 return C;
553 // 16 characters
554 if (C <= 0x216f)
555 return C + 16;
556 // ROMAN NUMERAL REVERSED ONE HUNDRED
557 if (C == 0x2183)
558 return 0x2184;
559 if (C < 0x24b6)
560 return C;
561 // 26 characters
562 if (C <= 0x24cf)
563 return C + 26;
564 if (C < 0x2c00)
565 return C;
566 // 47 characters
567 if (C <= 0x2c2e)
568 return C + 48;
569 // LATIN CAPITAL LETTER L WITH DOUBLE BAR
570 if (C == 0x2c60)
571 return 0x2c61;
572 // LATIN CAPITAL LETTER L WITH MIDDLE TILDE
573 if (C == 0x2c62)
574 return 0x026b;
575 // LATIN CAPITAL LETTER P WITH STROKE
576 if (C == 0x2c63)
577 return 0x1d7d;
578 // LATIN CAPITAL LETTER R WITH TAIL
579 if (C == 0x2c64)
580 return 0x027d;
581 if (C < 0x2c67)
582 return C;
583 // 3 characters
584 if (C <= 0x2c6b && C % 2 == 1)
585 return C + 1;
586 // LATIN CAPITAL LETTER ALPHA
587 if (C == 0x2c6d)
588 return 0x0251;
589 // LATIN CAPITAL LETTER M WITH HOOK
590 if (C == 0x2c6e)
591 return 0x0271;
592 // LATIN CAPITAL LETTER TURNED A
593 if (C == 0x2c6f)
594 return 0x0250;
595 // LATIN CAPITAL LETTER TURNED ALPHA
596 if (C == 0x2c70)
597 return 0x0252;
598 if (C < 0x2c72)
599 return C;
600 // 2 characters
601 if (C <= 0x2c75 && C % 3 == 2)
602 return C + 1;
603 if (C < 0x2c7e)
604 return C;
605 // 2 characters
606 if (C <= 0x2c7f)
607 return C + -10815;
608 if (C < 0x2c80)
609 return C;
610 // 50 characters
611 if (C <= 0x2ce2)
612 return C | 1;
613 if (C < 0x2ceb)
614 return C;
615 // 2 characters
616 if (C <= 0x2ced && C % 2 == 1)
617 return C + 1;
618 if (C < 0x2cf2)
619 return C;
620 // 2 characters
// Within [0x2cf2, 0xa640] only the two endpoints satisfy the modulo test:
// 0x2cf2 == 11506 and 0xa640 == 11506 + 31054. Both fold to C + 1.
621 if (C <= 0xa640 && C % 31054 == 11506)
622 return C + 1;
623 if (C < 0xa642)
624 return C;
625 // 22 characters
626 if (C <= 0xa66c)
627 return C | 1;
628 if (C < 0xa680)
629 return C;
630 // 14 characters
631 if (C <= 0xa69a)
632 return C | 1;
633 if (C < 0xa722)
634 return C;
635 // 7 characters
636 if (C <= 0xa72e)
637 return C | 1;
638 if (C < 0xa732)
639 return C;
640 // 31 characters
641 if (C <= 0xa76e)
642 return C | 1;
643 if (C < 0xa779)
644 return C;
645 // 2 characters
646 if (C <= 0xa77b && C % 2 == 1)
647 return C + 1;
648 // LATIN CAPITAL LETTER INSULAR G
649 if (C == 0xa77d)
650 return 0x1d79;
651 if (C < 0xa77e)
652 return C;
653 // 5 characters
654 if (C <= 0xa786)
655 return C | 1;
656 // LATIN CAPITAL LETTER SALTILLO
657 if (C == 0xa78b)
658 return 0xa78c;
659 // LATIN CAPITAL LETTER TURNED H
660 if (C == 0xa78d)
661 return 0x0265;
662 if (C < 0xa790)
663 return C;
664 // 2 characters
665 if (C <= 0xa792)
666 return C | 1;
667 if (C < 0xa796)
668 return C;
669 // 10 characters
670 if (C <= 0xa7a8)
671 return C | 1;
672 // LATIN CAPITAL LETTER H WITH HOOK
673 if (C == 0xa7aa)
674 return 0x0266;
675 // LATIN CAPITAL LETTER REVERSED OPEN E
676 if (C == 0xa7ab)
677 return 0x025c;
678 // LATIN CAPITAL LETTER SCRIPT G
679 if (C == 0xa7ac)
680 return 0x0261;
681 // LATIN CAPITAL LETTER L WITH BELT
682 if (C == 0xa7ad)
683 return 0x026c;
684 // LATIN CAPITAL LETTER SMALL CAPITAL I
685 if (C == 0xa7ae)
686 return 0x026a;
687 // LATIN CAPITAL LETTER TURNED K
688 if (C == 0xa7b0)
689 return 0x029e;
690 // LATIN CAPITAL LETTER TURNED T
691 if (C == 0xa7b1)
692 return 0x0287;
693 // LATIN CAPITAL LETTER J WITH CROSSED-TAIL
694 if (C == 0xa7b2)
695 return 0x029d;
696 // LATIN CAPITAL LETTER CHI
697 if (C == 0xa7b3)
698 return 0xab53;
699 if (C < 0xa7b4)
700 return C;
701 // 2 characters
702 if (C <= 0xa7b6)
703 return C | 1;
704 if (C < 0xab70)
705 return C;
706 // 80 characters
707 if (C <= 0xabbf)
708 return C + -38864;
709 if (C < 0xff21)
710 return C;
711 // 26 characters
712 if (C <= 0xff3a)
713 return C + 32;
714 if (C < 0x10400)
715 return C;
716 // 40 characters
717 if (C <= 0x10427)
718 return C + 40;
719 if (C < 0x104b0)
720 return C;
721 // 36 characters
722 if (C <= 0x104d3)
723 return C + 40;
724 if (C < 0x10c80)
725 return C;
726 // 51 characters
727 if (C <= 0x10cb2)
728 return C + 64;
729 if (C < 0x118a0)
730 return C;
731 // 32 characters
732 if (C <= 0x118bf)
733 return C + 32;
734 if (C < 0x1e900)
735 return C;
736 // 34 characters
737 if (C <= 0x1e921)
738 return C + 34;
739
// Above the last mapped range: nothing left to fold.
740 return C;
741 }
1818 ConvertUTFTest.cpp
1919 DataExtractorTest.cpp
2020 DebugTest.cpp
21 DJBTest.cpp
2122 EndianStreamTest.cpp
2223 EndianTest.cpp
2324 ErrnoTest.cpp
0 //===---------- llvm/unittest/Support/DJBTest.cpp -------------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "llvm/Support/DJB.h"
10 #include "llvm/ADT/Twine.h"
11 #include "gtest/gtest.h"
12
13 using namespace llvm;
14
15 TEST(DJBTest, caseFolding) {
16 struct TestCase {
17 StringLiteral One;
18 StringLiteral Two;
19 };
20
21 static constexpr TestCase Tests[] = {
22 {{"ASDF"}, {"asdf"}},
23 {{"qWeR"}, {"QwEr"}},
24 {{"qqqqqqqqqqqqqqqqqqqq"}, {"QQQQQQQQQQQQQQQQQQQQ"}},
25
26 {{"I"}, {"i"}},
27 // Latin Small Letter Dotless I
28 {{u8"\u0130"}, {"i"}},
29 // Latin Capital Letter I With Dot Above
30 {{u8"\u0131"}, {"i"}},
31
32 // Latin Capital Letter A With Grave
33 {{u8"\u00c0"}, {u8"\u00e0"}},
34 // Latin Capital Letter A With Macron
35 {{u8"\u0100"}, {u8"\u0101"}},
36 // Latin Capital Letter L With Acute
37 {{u8"\u0139"}, {u8"\u013a"}},
38 // Cyrillic Capital Letter Ie
39 {{u8"\u0415"}, {u8"\u0435"}},
40 // Latin Capital Letter A With Circumflex And Grave
41 {{u8"\u1ea6"}, {u8"\u1ea7"}},
42 // Kelvin Sign
43 {{u8"\u212a"}, {u8"\u006b"}},
44 // Glagolitic Capital Letter Chrivi
45 {{u8"\u2c1d"}, {u8"\u2c4d"}},
46 // Fullwidth Latin Capital Letter M
47 {{u8"\uff2d"}, {u8"\uff4d"}},
48 // Old Hungarian Capital Letter Ej
49 {{u8"\U00010c92"}, {u8"\U00010cd2"}},
50 };
51
52 for (const TestCase &T : Tests) {
53 SCOPED_TRACE("Comparing '" + T.One + "' and '" + T.Two + "'");
54 EXPECT_EQ(caseFoldingDjbHash(T.One), caseFoldingDjbHash(T.Two));
55 }
56 }
57
58 TEST(DJBTest, knownValuesLowerCase) {
59 struct TestCase {
60 StringLiteral Text;
61 uint32_t Hash;
62 };
63 static constexpr TestCase Tests[] = {
64 {{""}, 5381u},
65 {{"f"}, 177675u},
66 {{"fo"}, 5863386u},
67 {{"foo"}, 193491849u},
68 {{"foob"}, 2090263819u},
69 {{"fooba"}, 259229388u},
70 {{"foobar"}, 4259602622u},
71 {{"pneumonoultramicroscopicsilicovolcanoconiosis"}, 3999417781u},
72 };
73
74 for (const TestCase &T : Tests) {
75 SCOPED_TRACE("Text: '" + T.Text + "'");
76 EXPECT_EQ(T.Hash, djbHash(T.Text));
77 EXPECT_EQ(T.Hash, caseFoldingDjbHash(T.Text));
78 EXPECT_EQ(T.Hash, caseFoldingDjbHash(T.Text.upper()));
79 }
80 }
81
82 TEST(DJBTest, knownValuesUnicode) {
83 EXPECT_EQ(5866553u, djbHash(u8"\u0130"));
84 EXPECT_EQ(177678u, caseFoldingDjbHash(u8"\u0130"));
85 EXPECT_EQ(
86 1302161417u,
87 djbHash(
88 u8"\u0130\u0131\u00c0\u00e0\u0100\u0101\u0139\u013a\u0415\u0435\u1ea6"
89 u8"\u1ea7\u212a\u006b\u2c1d\u2c4d\uff2d\uff4d\U00010c92\U00010cd2"));
90 EXPECT_EQ(
91 1145571043u,
92 caseFoldingDjbHash(
93 u8"\u0130\u0131\u00c0\u00e0\u0100\u0101\u0139\u013a\u0415\u0435\u1ea6"
94 u8"\u1ea7\u212a\u006b\u2c1d\u2c4d\uff2d\uff4d\U00010c92\U00010cd2"));
95 }
0 #!/usr/bin/env python
1 """
2 Unicode case folding database conversion utility
3
4 Parses the database and generates a C++ function which implements the case
5 folding algorithm. The database entries are of the form:
6
7 <code>; <status>; <mapping>; # <name>
8
9 <status> can be one of four characters:
10 C - Common mappings
11 S - mappings for Simple case folding
12 F - mappings for Full case folding
13 T - special case for Turkish I characters
14
15 Right now this generates a function which implements simple case folding (C+S
16 entries).
17 """
18
import re
import sys

try:
    import urllib2
except ImportError:
    # Python 3 has no urllib2 module; urllib.request provides the same
    # urlopen() entry point, so expose it under the name used below.
    import urllib.request as urllib2
22
23 # This variable will hold the body of the mappings function
24 body = ""
25
# Generator over the Common (C) and Simple (S) case fold entries of f.
#
# f is iterated line by line; lines that are not C or S entries are skipped.
# Each matching line yields a (from_char, to_char, from_name) tuple, where the
# two characters are integers parsed from the hexadecimal fields.
#
# Raises an Exception if the from_char values are not strictly increasing.
def mappings(f):
    entry = re.compile(r'^(.*); [CS]; (.*); # (.*)')
    last_seen = -1
    for line in f:
        match = entry.match(line)
        if match is None:
            continue
        source = int(match.group(1), 16)
        target = int(match.group(2), 16)
        name = match.group(3)

        if not source > last_seen:
            raise Exception("Duplicate or unsorted characters in input")
        last_seen = source
        yield source, target, name
42
# Returns the shift of a mapping: how far the folded character (index 1) lies
# from the original character (index 0). Negative when folding moves down.
def shift(mapping):
    source = mapping[0]
    target = mapping[1]
    return target - source
46
# Returns the stride between two mappings: the distance from the original
# character of mapping1 to the original character of mapping2.
def stride2(mapping1, mapping2):
    start = mapping1[0]
    end = mapping2[0]
    return end - start
50
# Returns the stride of a block (a list of mappings), measured between its
# first two entries. The block needs at least two mappings, and every pair of
# neighbours in it is assumed to be the same distance apart.
def stride(block):
    first, second = block[0], block[1]
    return stride2(first, second)
55
56
# Appends the C++ checks for one block of mappings to the global "body".
#
# b is a list of mappings. All the mappings are assumed to have the same
# shift, and the stride between adjacent mappings (if any) is constant.
def dump_block(b):
    global body

    count = len(b)
    if count == 1:
        # A lone mapping becomes an exact-match test. No "if (C < X) return C"
        # guard is needed in front of it: characters in this range are caught
        # by the "C < X" guard that the next non-trivial block emits.
        body += " // {2}\n if (C == {0:#06x})\n return {1:#06x};\n".format(*b[0])
        return

    step = stride(b)
    delta = shift(b[0])
    first = b[0][0]
    last = first + step * (count - 1)
    modulo = first % step

    # Everything below this block maps to itself; this guard also acts as the
    # lower bound for the upper-bound test emitted below.
    body += " if (C < {0:#06x})\n return C;\n".format(first)
    body += " // {0} characters\n".format(count)

    if step == 1:
        # Contiguous range: X % 1 is always zero, so no modulo test is emitted.
        pattern = " if (C <= {0:#06x})\n return C + {3};\n"
    elif step == 2 and delta == 1 and modulo == 0:
        # Even characters folding to the following odd one: "C|1" maps the
        # intervening odd characters to themselves, so no modulo test either.
        pattern = " if (C <= {0:#06x})\n return C | 1;\n"
    else:
        # General form: upper bound plus modulo test, then apply the shift.
        pattern = " if (C <= {0:#06x} && C % {1} == {2})\n return C + {3};\n"

    body += pattern.format(last, step, modulo, delta)
93
# Script entry point.
#
# Downloads the case folding database from the URL given as the first command
# line argument, groups consecutive mappings into blocks that share a shift
# and a stride, lets dump_block() turn every block into C++ checks, and then
# writes the complete UnicodeCaseFold.cpp source to stdout.

if len(sys.argv) < 2:
    sys.stderr.write("usage: {0} <url-of-CaseFolding.txt>\n".format(sys.argv[0]))
    sys.exit(1)


# Yields the lines of stream as native strings. Python 3's urlopen() hands out
# bytes, which the str regular expression in mappings() cannot match against;
# lines that already are str (Python 2) pass through untouched.
def _text_lines(stream):
    for raw in stream:
        yield raw if isinstance(raw, str) else raw.decode("utf-8")


current_block = []
f = urllib2.urlopen(sys.argv[1])
try:
    for m in mappings(_text_lines(f)):
        if current_block:
            same_shift = shift(current_block[0]) == shift(m)
            # A block with a single entry has no stride yet, so any distance
            # to the next mapping is acceptable.
            same_stride = (len(current_block) == 1 or
                           stride(current_block) == stride2(current_block[-1], m))
            if same_shift and same_stride:
                current_block.append(m)
                continue

            # Incompatible shift or stride: flush and start a new block.
            dump_block(current_block)
            current_block = []

        current_block.append(m)
finally:
    # Release the connection even when parsing fails half-way through.
    f.close()

# Flush the last block; there is none if the input had no usable mappings.
if current_block:
    dump_block(current_block)

# sys.stdout.write() is used instead of the print statement so that the script
# parses under both Python 2 and Python 3. The emitted text is unchanged.
output_lines = [
    '//===---------- Support/UnicodeCaseFold.cpp -------------------------------===//',
    '//',
    '// This file was generated by utils/unicode-case-fold.py from the Unicode',
    '// case folding database at',
    '// ' + ' ' + sys.argv[1],
    '//',
    '// To regenerate this file, run:',
    '// utils/unicode-case-fold.py \\',
    '// "{}" \\'.format(sys.argv[1]),
    '// > lib/Support/UnicodeCaseFold.cpp',
    '//',
    '//===----------------------------------------------------------------------===//',
    '',
    '#include "llvm/Support/Unicode.h"',
    '',
    "int llvm::sys::unicode::foldCharSimple(int C) {",
    body,
    " return C;",
    "}",
]
sys.stdout.write("\n".join(output_lines) + "\n")