OLD | NEW |
1 # Copyright (C) 2016 and later: Unicode, Inc. and others. | 1 # Copyright (C) 2016 and later: Unicode, Inc. and others. |
2 # License & terms of use: http://www.unicode.org/copyright.html | 2 # License & terms of use: http://www.unicode.org/copyright.html |
3 # Copyright (c) 2002-2016 International Business Machines Corporation and | 3 # Copyright (c) 2002-2016 International Business Machines Corporation and |
4 # others. All Rights Reserved. | 4 # others. All Rights Reserved. |
5 # | 5 # |
6 # file: line_normal.txt | 6 # file: line_normal.txt |
7 # | 7 # |
8 # Line Breaking Rules | 8 # Line Breaking Rules |
9 # Implement default line breaking as defined by | 9 # Implement default line breaking as defined by |
10 # Unicode Standard Annex #14 Revision 37 for Unicode 9.0 | 10 # Unicode Standard Annex #14 Revision 37 for Unicode 9.0 |
11 # http://www.unicode.org/reports/tr14/ | 11 # http://www.unicode.org/reports/tr14/ |
(...skipping 10 matching lines...) Expand all Loading... |
22 # This tailors the line break behavior to correspond to CSS | 22 # This tailors the line break behavior to correspond to CSS |
23 # line-break=normal (BCP47 -u-lb-normal) as defined for languages other
than | 23 # line-break=normal (BCP47 -u-lb-normal) as defined for languages other
than |
24 # Chinese & Japanese. | 24 # Chinese & Japanese. |
25 # It sets characters of class CJ to behave like ID. | 25 # It sets characters of class CJ to behave like ID. |
26 | 26 |
27 # | 27 # |
28 # Character Classes defined by TR 14. | 28 # Character Classes defined by TR 14. |
29 # | 29 # |
30 | 30 |
31 !!chain; | 31 !!chain; |
| 32 !!quoted_literals_only; |
32 | 33 |
33 $AI = [:LineBreak = Ambiguous:]; | 34 $AI = [:LineBreak = Ambiguous:]; |
34 $AL = [:LineBreak = Alphabetic:]; | 35 $AL = [:LineBreak = Alphabetic:]; |
35 $BA = [:LineBreak = Break_After:]; | 36 $BA = [:LineBreak = Break_After:]; |
36 $BB = [:LineBreak = Break_Before:]; | 37 $BB = [:LineBreak = Break_Before:]; |
37 $BK = [:LineBreak = Mandatory_Break:]; | 38 $BK = [:LineBreak = Mandatory_Break:]; |
38 $B2 = [:LineBreak = Break_Both:]; | 39 $B2 = [:LineBreak = Break_Both:]; |
39 $CB = [:LineBreak = Contingent_Break:]; | 40 $CB = [:LineBreak = Contingent_Break:]; |
40 $CJ = [:LineBreak = Conditional_Japanese_Starter:]; | 41 $CJ = [:LineBreak = Conditional_Japanese_Starter:]; |
41 $CL = [:LineBreak = Close_Punctuation:]; | 42 $CL = [:LineBreak = Close_Punctuation:]; |
(...skipping 289 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
331 $RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY
$GL $QU $BA $HY $NS $CM] {eof}]; | 332 $RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY
$GL $QU $BA $HY $NS $CM] {eof}]; |
332 $RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY
$GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}]; | 333 $RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY
$GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}]; |
333 $RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $S
Y $GL $QU $BA $HY $NS $CM] {eof}]; | 334 $RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $S
Y $GL $QU $BA $HY $NS $CM] {eof}]; |
334 | 335 |
335 $RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $
HY $NS {eof}]; | 336 $RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $
HY $NS {eof}]; |
336 $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); | 337 $RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK); |
337 | 338 |
338 # LB 30b Do not break between an Emoji Base and an Emoji Modifier | 339 # LB 30b Do not break between an Emoji Base and an Emoji Modifier |
339 $EB $CM* $EM; | 340 $EB $CM* $EM; |
340 | 341 |
341 # | 342 # LB 31 Break everywhere else. |
342 # Reverse Rules. | 343 # Match a single code point if no other rule applies. |
343 # | 344 .; |
344 ## ------------------------------------------------- | |
345 | |
346 !!reverse; | |
347 | |
348 # LB 9 Combining Marks. | |
349 # Stick together any combining sequences that don't match other rules. | |
350 | |
351 ^$CM+ $CAN_CM?; | |
352 | |
353 # | |
354 # Sequences of the form (shown forwards) | |
355 # [CANT_CM] <break> [CM] [whatever] | |
356 # The CM needs to behave as an AL | |
357 # | |
358 $AL_FOLLOW $CM+ / ( | |
359 [$BK $CR $LF $NL $ZW {eof}] | | |
360 $SP+ $CM+ $SP | | |
361 $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, ne
ed to suppress this break. | |
362 # LB14 says OP SP* x . | |
363 # becomes OP SP* x AL | |
364 # becomes OP SP* x CM+ AL_F
OLLOW | |
365 # | |
366 # Further note: the $AL in [$AL
{eof}] is only to work around | |
367 # a rule compiler
bug which complains about | |
368 # empty sets other
wise. | |
369 | |
370 | |
371 # LB 4, 5, 6 | |
372 | |
373 $LB4Breaks [$LB4NonBreaks-$CM]; | |
374 $LB4Breaks $CM+ $CAN_CM; | |
375 $LF $CR; | |
376 | |
377 | |
378 # LB 7 x SP | |
379 # x ZW | |
380 [$SP $ZW] [$LB4NonBreaks-$CM]; | |
381 [$SP $ZW] $CM+ $CAN_CM; | |
382 | |
383 # LB 8 ZW SP* <break> | |
384 # TODO: to implement this, we need more than one look-ahead hard break in pl
ay at a time. | |
385 # Requires an engine enhancement. | |
386 # / $SP* $ZW | |
387 | |
388 # LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) | |
389 # | |
390 ($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?; | |
391 | |
392 | |
393 # LB 9,10 Combining marks. | |
394 # X $CM needs to behave like X, where X is not $SP or controls. | |
395 # $CM not covered by the above needs to behave like $AL | |
396 # Stick together any combining sequences that don't match other rules. | |
397 ^$CM+ $CAN_CM; | |
398 | |
399 | |
400 # LB 11 | |
401 # | |
402 $WJ $CM* $CAN_CM; | |
403 $WJ [$LB8NonBreaks-$CM]; | |
404 | |
405 $CANT_CM $CM* $WJ; | |
406 $CAN_CM $CM* $WJ; | |
407 | |
408 # LB 12a | |
409 # [^SP BA HY] x GL | |
410 # | |
411 $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]]; | |
412 | |
413 # LB 12 | |
414 # GL x | |
415 # | |
416 $CANT_CM $CM* $GL; | |
417 $CAN_CM $CM* $GL; | |
418 | |
419 | |
420 # LB 13 | |
421 $CL $CM+ $CAN_CM; | |
422 $CP $CM+ $CAN_CM; | |
423 $EX $CM+ $CAN_CM; | |
424 $IS $CM+ $CAN_CM; | |
425 $SY $CM+ $CAN_CM; | |
426 | |
427 $CL [$LB8NonBreaks-$CM]; | |
428 $CP [$LB8NonBreaks-$CM]; | |
429 $EX [$LB8NonBreaks-$CM]; | |
430 $IS [$LB8NonBreaks-$CM]; | |
431 $SY [$LB8NonBreaks-$CM]; | |
432 | |
433 | |
434 # LB 14 OP SP* x | |
435 # | |
436 . $SP* $CM* $OP; | |
437 $AL_FOLLOW? $CM+ $SP+ $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $S
P+ $CM* $OP | |
438 | |
439 | |
440 # LB 15 | |
441 $OP $SP* $CM* $QU; | |
442 | |
443 # LB 16 | |
444 $NS $SP* $CM* ($CL | $CP); | |
445 | |
446 # LB 17 | |
447 $B2 $SP* $CM* $B2; | |
448 | |
449 # LB 18 break after spaces | |
450 # Nothing explicit needed here. | |
451 | |
452 | |
453 # | |
454 # LB 19 | |
455 # | |
456 $QU $CM* $CAN_CM; # . x QU | |
457 $QU $LB18NonBreaks; | |
458 | |
459 | |
460 $CAN_CM $CM* $QU; # QU x . | |
461 $CANT_CM $CM* $QU; | |
462 | |
463 # | |
464 # LB 20 Break before and after CB. | |
465 # nothing needed here. | |
466 # | |
467 | |
468 # LB 21 | |
469 ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) | |
470 | |
471 [$LB20NonBreaks-$CM] $CM* $BB; # BB x . | |
472 [^$CB] $CM* $BB; # | |
473 | |
474 # LB21a | |
475 [^$CB] $CM* ($HY | $BA) $CM* $HL; | |
476 | |
477 # LB21b (reverse) | |
478 $HL $CM* $SY; | |
479 | |
480 # LB 22 | |
481 $IN $CM* ($ALPlus | $HL); | |
482 $IN $CM* $EX; | |
483 $IN $CM* ($ID | $EB | $EM); | |
484 $IN $CM* $IN; | |
485 $IN $CM* $NU; | |
486 | |
487 # LB 23 | |
488 $NU $CM* ($ALPlus | $HL); | |
489 ($ALPlus | $HL) $CM* $NU; | |
490 | |
491 # LB23a | |
492 ($ID | $EB | $EM) $CM* $PR; | |
493 $PO $CM* ($ID | $EB | $EM); | |
494 | |
495 # LB 24 | |
496 ($ALPlus | $HL) $CM* ($PR | $PO); | |
497 ($PR | $PO) $CM* ($ALPlus | $HL); | |
498 | |
499 | |
500 # LB 25 | |
501 ($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM*
($OP | $HY))? ($CM* ($PR | $PO))?; | |
502 | |
503 # LB 26 | |
504 ($H3 | $H2 | $JV | $JL) $CM* $JL; | |
505 ($JT | $JV) $CM* ($H2 | $JV); | |
506 $JT $CM* ($H3 | $JT); | |
507 | |
508 # LB 27 | |
509 $IN $CM* ($H3 | $H2 | $JT | $JV | $JL); | |
510 $PO $CM* ($H3 | $H2 | $JT | $JV | $JL); | |
511 ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; | |
512 | |
513 # LB 28 | |
514 ($ALPlus | $HL) $CM* ($ALPlus | $HL); | |
515 | |
516 | |
517 # LB 29 | |
518 ($ALPlus | $HL) $CM* $IS; | |
519 | |
520 # LB 30 | |
521 $OP $CM* ($ALPlus | $HL | $NU); | |
522 ($ALPlus | $HL | $NU) $CM* $CP; | |
523 | |
524 # LB 30a | |
525 # Pairs of Regional Indicators. | |
526 # The following two rules are nearly identical. The first matches only sequen
ces with an odd number of adjacent RIs, | |
527 # the second with an even number. Stripping away the cruft they look like | |
528 # [^RI] RI / (RI RI)+ ^RI; | |
529 # [^RI] RI RI / (RI RI)+ ^RI; | |
530 # | |
531 [{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ]
$CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; | |
532 [{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ]
$CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]]; | |
533 | |
534 # In general, adjacent RIs stay together. The hard-break rules, above, override t
his, forcing in the boundaries between pairs. | |
535 $RI $CM* $RI; | |
536 | |
537 # WJ, GL, QU, etc. are classes with rules like "WJ x " which includes "WJ x
RI". | |
538 $RI $CM* ([$WJ $GL $QU $BB] | (($HY | $BA)$CM* $HL)); | |
539 | |
540 | |
541 # LB 30b Do not break between an Emoji Base and an Emoji Modifier | |
542 $EM $CM* $EB; | |
543 | |
544 | 345 |
545 ## ------------------------------------------------- | 346 ## ------------------------------------------------- |
546 | 347 |
547 !!safe_reverse; | 348 !!safe_reverse; |
548 | 349 |
549 # LB 9 | 350 # LB 9 |
550 ^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; | 351 ^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; |
551 ^$CM+ $SP / .; | |
552 | 352 |
553 # LB 14 | 353 # LB 14 |
554 $SP+ $CM* $OP; | 354 $SP+ $CM* $OP; |
555 | 355 |
556 # LB 15 | 356 # LB 15 |
557 $SP+ $CM* $QU; | 357 $SP+ $CM* $QU; |
558 | 358 |
559 # LB 16 | 359 # LB 16 |
560 $SP+ $CM* ($CL | $CP); | 360 $SP+ $CM* ($CL | $CP); |
561 | 361 |
562 # LB 17 | 362 # LB 17 |
563 $SP+ $CM* $B2; | 363 $SP+ $CM* $B2; |
564 | 364 |
565 # LB 21 | 365 # LB 21 |
566 $CM* ($HY | $BA) $CM* $HL; | 366 $CM* ($HY | $BA) $CM* $HL; |
567 | 367 |
568 # LB 25 | 368 # LB 25 |
569 ($CM* ($IS | $SY))+ $CM* $NU; | 369 ($CM* ($IS | $SY))+ $CM* $NU; |
570 ($CL | $CP) $CM* ($NU | $IS | $SY); | 370 ($CL | $CP) $CM* ($NU | $IS | $SY); |
571 | 371 |
572 # LB 30 | 372 # LB 30 |
573 ($CM* $RI)+; | 373 ($CM* $RI)+; |
574 | 374 |
575 # For dictionary-based break | 375 # For dictionary-based break |
576 $dictionary $dictionary; | 376 $dictionary $dictionary; |
577 | |
578 ## ------------------------------------------------- | |
579 | |
580 !!safe_forward; | |
581 | |
582 # Skip forward over all character classes that are involved in | |
583 # rules containing patterns with possibly more than one char | |
584 # of context. | |
585 # | |
586 # It might be slightly more efficient to have specific rules | |
587 # instead of one generic one, but only if we could | |
588 # turn off rule chaining. We don't want to move more | |
589 # than necessary. | |
590 # | |
591 ^[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $
CL $CP $B2 $PR $HY $BA $RI $ZWJ $dictionary]; | |
592 $dictionary $dictionary; | |
593 | |
OLD | NEW |