Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(756)

Side by Side Diff: icu4c/source/data/brkitr/rules/line_normal.txt

Issue 330940043: RBBI, add caching, remove reverse rules. Base URL: svn+ssh://source.icu-project.org/repos/icu/trunk/
Patch Set: Created 6 years, 5 months ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
View unified diff | Download patch
OLDNEW
1 # Copyright (C) 2016 and later: Unicode, Inc. and others. 1 # Copyright (C) 2016 and later: Unicode, Inc. and others.
2 # License & terms of use: http://www.unicode.org/copyright.html 2 # License & terms of use: http://www.unicode.org/copyright.html
3 # Copyright (c) 2002-2016 International Business Machines Corporation and 3 # Copyright (c) 2002-2016 International Business Machines Corporation and
4 # others. All Rights Reserved. 4 # others. All Rights Reserved.
5 # 5 #
6 # file: line_normal.txt 6 # file: line_normal.txt
7 # 7 #
8 # Line Breaking Rules 8 # Line Breaking Rules
9 # Implement default line breaking as defined by 9 # Implement default line breaking as defined by
10 # Unicode Standard Annex #14 Revision 37 for Unicode 9.0 10 # Unicode Standard Annex #14 Revision 37 for Unicode 9.0
11 # http://www.unicode.org/reports/tr14/ 11 # http://www.unicode.org/reports/tr14/
(...skipping 10 matching lines...) Expand all
22 # This tailors the line break behavior to correspond to CSS 22 # This tailors the line break behavior to correspond to CSS
23 # line-break=normal (BCP47 -u-lb-normal) as defined for languages other than 23 # line-break=normal (BCP47 -u-lb-normal) as defined for languages other than
24 # Chinese & Japanese. 24 # Chinese & Japanese.
25 # It sets characters of class CJ to behave like ID. 25 # It sets characters of class CJ to behave like ID.
26 26
27 # 27 #
28 # Character Classes defined by TR 14. 28 # Character Classes defined by TR 14.
29 # 29 #
30 30
31 !!chain; 31 !!chain;
32 !!quoted_literals_only;
32 33
33 $AI = [:LineBreak = Ambiguous:]; 34 $AI = [:LineBreak = Ambiguous:];
34 $AL = [:LineBreak = Alphabetic:]; 35 $AL = [:LineBreak = Alphabetic:];
35 $BA = [:LineBreak = Break_After:]; 36 $BA = [:LineBreak = Break_After:];
36 $BB = [:LineBreak = Break_Before:]; 37 $BB = [:LineBreak = Break_Before:];
37 $BK = [:LineBreak = Mandatory_Break:]; 38 $BK = [:LineBreak = Mandatory_Break:];
38 $B2 = [:LineBreak = Break_Both:]; 39 $B2 = [:LineBreak = Break_Both:];
39 $CB = [:LineBreak = Contingent_Break:]; 40 $CB = [:LineBreak = Contingent_Break:];
40 $CJ = [:LineBreak = Conditional_Japanese_Starter:]; 41 $CJ = [:LineBreak = Conditional_Japanese_Starter:];
41 $CL = [:LineBreak = Close_Punctuation:]; 42 $CL = [:LineBreak = Close_Punctuation:];
(...skipping 289 matching lines...) Expand 10 before | Expand all | Expand 10 after
331 $RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; 332 $RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
332 $RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}]; 333 $RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
333 $RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}]; 334 $RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
334 335
335 $RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}]; 336 $RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
336 $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); 337 $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
337 338
338 # LB 30b Do not break between an Emoji Base and an Emoji Modifier 339 # LB 30b Do not break between an Emoji Base and an Emoji Modifier
339 $EB $CM* $EM; 340 $EB $CM* $EM;
340 341
341 # 342 # LB 31 Break everywhere else.
342 # Reverse Rules. 343 # Match a single code point if no other rule applies.
343 # 344 .;
344 ## -------------------------------------------------
345
346 !!reverse;
347
348 # LB 9 Combining Marks.
349 # Stick together any combining sequences that don't match other rules.
350
351 ^$CM+ $CAN_CM?;
352
353 #
354 # Sequences of the form (shown forwards)
355 # [CANT_CM] <break> [CM] [whatever]
356 # The CM needs to behave as an AL
357 #
358 $AL_FOLLOW $CM+ / (
359 [$BK $CR $LF $NL $ZW {eof}] |
360 $SP+ $CM+ $SP |
361 $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
362 # LB14 says OP SP* x .
363 # becomes OP SP* x AL
364 # becomes OP SP* x CM+ AL_FOLLOW
365 #
366 # Further note: the $AL in [$AL {eof}] is only to work around
367 # a rule compiler bug which complains about
368 # empty sets otherwise.
369
370
371 # LB 4, 5, 6
372
373 $LB4Breaks [$LB4NonBreaks-$CM];
374 $LB4Breaks $CM+ $CAN_CM;
375 $LF $CR;
376
377
378 # LB 7 x SP
379 # x ZW
380 [$SP $ZW] [$LB4NonBreaks-$CM];
381 [$SP $ZW] $CM+ $CAN_CM;
382
383 # LB 8 ZW SP* <break>
384 # TODO: to implement this, we need more than one look-ahead hard break in play at a time.
385 # Requires an engine enhancement.
386 # / $SP* $ZW
387
388 # LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
389 #
390 ($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
391
392
393 # LB 9,10 Combining marks.
394 # X $CM needs to behave like X, where X is not $SP or controls.
395 # $CM not covered by the above needs to behave like $AL
396 # Stick together any combining sequences that don't match other rules.
397 ^$CM+ $CAN_CM;
398
399
400 # LB 11
401 #
402 $WJ $CM* $CAN_CM;
403 $WJ [$LB8NonBreaks-$CM];
404
405 $CANT_CM $CM* $WJ;
406 $CAN_CM $CM* $WJ;
407
408 # LB 12a
409 # [^SP BA HY] x GL
410 #
411 $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];
412
413 # LB 12
414 # GL x
415 #
416 $CANT_CM $CM* $GL;
417 $CAN_CM $CM* $GL;
418
419
420 # LB 13
421 $CL $CM+ $CAN_CM;
422 $CP $CM+ $CAN_CM;
423 $EX $CM+ $CAN_CM;
424 $IS $CM+ $CAN_CM;
425 $SY $CM+ $CAN_CM;
426
427 $CL [$LB8NonBreaks-$CM];
428 $CP [$LB8NonBreaks-$CM];
429 $EX [$LB8NonBreaks-$CM];
430 $IS [$LB8NonBreaks-$CM];
431 $SY [$LB8NonBreaks-$CM];
432
433
434 # LB 14 OP SP* x
435 #
436 . $SP* $CM* $OP;
437 $AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP+ $CM* $OP
438
439
440 # LB 15
441 $OP $SP* $CM* $QU;
442
443 # LB 16
444 $NS $SP* $CM* ($CL | $CP);
445
446 # LB 17
447 $B2 $SP* $CM* $B2;
448
449 # LB 18 break after spaces
450 # Nothing explicit needed here.
451
452
453 #
454 # LB 19
455 #
456 $QU $CM* $CAN_CM; # . x QU
457 $QU $LB18NonBreaks;
458
459
460 $CAN_CM $CM* $QU; # QU x .
461 $CANT_CM $CM* $QU;
462
463 #
464 # LB 20 Break before and after CB.
465 # nothing needed here.
466 #
467
468 # LB 21
469 ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
470
471 [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
472 [^$CB] $CM* $BB; #
473
474 # LB21a
475 [^$CB] $CM* ($HY | $BA) $CM* $HL;
476
477 # LB21b (reverse)
478 $HL $CM* $SY;
479
480 # LB 22
481 $IN $CM* ($ALPlus | $HL);
482 $IN $CM* $EX;
483 $IN $CM* ($ID | $EB | $EM);
484 $IN $CM* $IN;
485 $IN $CM* $NU;
486
487 # LB 23
488 $NU $CM* ($ALPlus | $HL);
489 ($ALPlus | $HL) $CM* $NU;
490
491 # LB23a
492 ($ID | $EB | $EM) $CM* $PR;
493 $PO $CM* ($ID | $EB | $EM);
494
495 # LB 24
496 ($ALPlus | $HL) $CM* ($PR | $PO);
497 ($PR | $PO) $CM* ($ALPlus | $HL);
498
499
500 # LB 25
501 ($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;
502
503 # LB 26
504 ($H3 | $H2 | $JV | $JL) $CM* $JL;
505 ($JT | $JV) $CM* ($H2 | $JV);
506 $JT $CM* ($H3 | $JT);
507
508 # LB 27
509 $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
510 $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
511 ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
512
513 # LB 28
514 ($ALPlus | $HL) $CM* ($ALPlus | $HL);
515
516
517 # LB 29
518 ($ALPlus | $HL) $CM* $IS;
519
520 # LB 30
521 $OP $CM* ($ALPlus | $HL | $NU);
522 ($ALPlus | $HL | $NU) $CM* $CP;
523
524 # LB 30a
525 # Pairs of Regional Indicators.
526 # The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
527 # the second with an even number. Stripping away the cruft they look like
528 # [^RI] RI / (RI RI)+ ^RI;
529 # [^RI] RI RI / (RI RI)+ ^RI;
530 #
531 [{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
532 [{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
533
534 # In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
535 $RI $CM* $RI;
536
537 # WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x RI".
538 $RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL));
539
540
541 # LB 30b Do not break between an Emoji Base and an Emoji Modifier
542 $EM $CM* $EB;
543
544 345
545 ## ------------------------------------------------- 346 ## -------------------------------------------------
546 347
547 !!safe_reverse; 348 !!safe_reverse;
548 349
549 # LB 9 350 # LB 9
550 ^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; 351 ^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
551 ^$CM+ $SP / .;
552 352
553 # LB 14 353 # LB 14
554 $SP+ $CM* $OP; 354 $SP+ $CM* $OP;
555 355
556 # LB 15 356 # LB 15
557 $SP+ $CM* $QU; 357 $SP+ $CM* $QU;
558 358
559 # LB 16 359 # LB 16
560 $SP+ $CM* ($CL | $CP); 360 $SP+ $CM* ($CL | $CP);
561 361
562 # LB 17 362 # LB 17
563 $SP+ $CM* $B2; 363 $SP+ $CM* $B2;
564 364
565 # LB 21 365 # LB 21
566 $CM* ($HY | $BA) $CM* $HL; 366 $CM* ($HY | $BA) $CM* $HL;
567 367
568 # LB 25 368 # LB 25
569 ($CM* ($IS | $SY))+ $CM* $NU; 369 ($CM* ($IS | $SY))+ $CM* $NU;
570 ($CL | $CP) $CM* ($NU | $IS | $SY); 370 ($CL | $CP) $CM* ($NU | $IS | $SY);
571 371
572 # LB 30 372 # LB 30
573 ($CM* $RI)+; 373 ($CM* $RI)+;
574 374
575 # For dictionary-based break 375 # For dictionary-based break
576 $dictionary $dictionary; 376 $dictionary $dictionary;
577
578 ## -------------------------------------------------
579
580 !!safe_forward;
581
582 # Skip forward over all character classes that are involved in
583 # rules containing patterns with possibly more than one char
584 # of context.
585 #
586 # It might be slightly more efficient to have specific rules
587 # instead of one generic one, but only if we could
588 # turn off rule chaining. We don't want to move more
589 # than necessary.
590 #
591 ^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary];
592 $dictionary $dictionary;
593
OLDNEW

Powered by Google App Engine
RSS Feeds Recent Issues | This issue
This is Rietveld f62528b