OLD | NEW |
(Empty) | |
| 1 // © 2017 and later: Unicode, Inc. and others. |
| 2 // License & terms of use: http://www.unicode.org/copyright.html#License |
| 3 package com.ibm.icu.text; |
| 4 |
| 5 import java.nio.BufferOverflowException; |
| 6 import java.util.Arrays; |
| 7 |
| 8 /** |
| 9 * Records lengths of string edits but not replacement text. |
| 10 * Supports replacements, insertions, deletions in linear progression. |
| 11 * Does not support moving/reordering of text. |
| 12 * |
| 13 * @draft ICU 59 |
| 14 * @provisional This API might change or be removed in a future release. |
| 15 */ |
| 16 public final class Edits { |
| 17 // 0000uuuuuuuuuuuu records u+1 unchanged text units. |
| 18 private static final int MAX_UNCHANGED_LENGTH = 0x1000; |
| 19 private static final int MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1; |
| 20 |
| 21 // 0wwwcccccccccccc with w=1..6 records ccc+1 replacements of w:w text units
. |
| 22 // No length change. |
| 23 private static final int MAX_SHORT_WIDTH = 6; |
| 24 private static final int MAX_SHORT_CHANGE_LENGTH = 0xfff; |
| 25 private static final int MAX_SHORT_CHANGE = 0x6fff; |
| 26 |
| 27 // 0111mmmmmmnnnnnn records a replacement of m text units with n. |
| 28 // m or n = 61: actual length follows in the next edits array unit. |
| 29 // m or n = 62..63: actual length follows in the next two edits array units. |
| 30 // Bit 30 of the actual length is in the head unit. |
| 31 // Trailing units have bit 15 set. |
| 32 private static final int LENGTH_IN_1TRAIL = 61; |
| 33 private static final int LENGTH_IN_2TRAIL = 62; |
| 34 |
| 35 private static final int STACK_CAPACITY = 100; |
| 36 private char[] array; |
| 37 private int length; |
| 38 private int delta; |
| 39 |
| 40 /** |
| 41 * Constructs an empty object. |
| 42 * @draft ICU 59 |
| 43 * @provisional This API might change or be removed in a future release. |
| 44 */ |
| 45 public Edits() { |
| 46 array = new char[STACK_CAPACITY]; |
| 47 } |
| 48 |
| 49 /** |
| 50 * Resets the data but may not release memory. |
| 51 * @draft ICU 59 |
| 52 * @provisional This API might change or be removed in a future release. |
| 53 */ |
| 54 public void reset() { |
| 55 length = delta = 0; |
| 56 } |
| 57 |
| 58 private void setLastUnit(int last) { |
| 59 array[length - 1] = (char)last; |
| 60 } |
| 61 private int lastUnit() { |
| 62 return length > 0 ? array[length - 1] : 0xffff; |
| 63 } |
| 64 |
| 65 /** |
| 66 * Adds a record for an unchanged segment of text. |
| 67 * Normally called from inside ICU string transformation functions, not user
code. |
| 68 * @draft ICU 59 |
| 69 * @provisional This API might change or be removed in a future release. |
| 70 */ |
| 71 public void addUnchanged(int unchangedLength) { |
| 72 if(unchangedLength < 0) { |
| 73 throw new IllegalArgumentException( |
| 74 "addUnchanged(" + unchangedLength + "): length must not be n
egative"); |
| 75 } |
| 76 // Merge into previous unchanged-text record, if any. |
| 77 int last = lastUnit(); |
| 78 if(last < MAX_UNCHANGED) { |
| 79 int remaining = MAX_UNCHANGED - last; |
| 80 if (remaining >= unchangedLength) { |
| 81 setLastUnit(last + unchangedLength); |
| 82 return; |
| 83 } |
| 84 setLastUnit(MAX_UNCHANGED); |
| 85 unchangedLength -= remaining; |
| 86 } |
| 87 // Split large lengths into multiple units. |
| 88 while(unchangedLength >= MAX_UNCHANGED_LENGTH) { |
| 89 append(MAX_UNCHANGED); |
| 90 unchangedLength -= MAX_UNCHANGED_LENGTH; |
| 91 } |
| 92 // Write a small (remaining) length. |
| 93 if(unchangedLength > 0) { |
| 94 append(unchangedLength - 1); |
| 95 } |
| 96 } |
| 97 |
| 98 /** |
| 99 * Adds a record for a text replacement/insertion/deletion. |
| 100 * Normally called from inside ICU string transformation functions, not user
code. |
| 101 * @draft ICU 59 |
| 102 * @provisional This API might change or be removed in a future release. |
| 103 */ |
| 104 public void addReplace(int oldLength, int newLength) { |
| 105 if(oldLength == newLength && 0 < oldLength && oldLength <= MAX_SHORT_WID
TH) { |
| 106 // Replacement of short oldLength text units by same-length new text
. |
| 107 // Merge into previous short-replacement record, if any. |
| 108 int last = lastUnit(); |
| 109 if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE && |
| 110 (last >> 12) == oldLength && (last & 0xfff) < MAX_SHORT_CHAN
GE_LENGTH) { |
| 111 setLastUnit(last + 1); |
| 112 return; |
| 113 } |
| 114 append(oldLength << 12); |
| 115 return; |
| 116 } |
| 117 |
| 118 if(oldLength < 0 || newLength < 0) { |
| 119 throw new IllegalArgumentException( |
| 120 "addReplace(" + oldLength + ", " + newLength + |
| 121 "): both lengths must be non-negative"); |
| 122 } |
| 123 if (oldLength == 0 && newLength == 0) { |
| 124 return; |
| 125 } |
| 126 int newDelta = newLength - oldLength; |
| 127 if (newDelta != 0) { |
| 128 if ((newDelta > 0 && delta >= 0 && newDelta > (Integer.MAX_VALUE - d
elta)) || |
| 129 (newDelta < 0 && delta < 0 && newDelta < (Integer.MIN_VALUE
- delta))) { |
| 130 // Integer overflow or underflow. |
| 131 throw new IndexOutOfBoundsException(); |
| 132 } |
| 133 delta += newDelta; |
| 134 } |
| 135 |
| 136 int head = 0x7000; |
| 137 if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) { |
| 138 head |= oldLength << 6; |
| 139 head |= newLength; |
| 140 append(head); |
| 141 } else if ((array.length - length) >= 5 || growArray()) { |
| 142 int limit = length + 1; |
| 143 if(oldLength < LENGTH_IN_1TRAIL) { |
| 144 head |= oldLength << 6; |
| 145 } else if(oldLength <= 0x7fff) { |
| 146 head |= LENGTH_IN_1TRAIL << 6; |
| 147 array[limit++] = (char)(0x8000 | oldLength); |
| 148 } else { |
| 149 head |= (LENGTH_IN_2TRAIL + (oldLength >> 30)) << 6; |
| 150 array[limit++] = (char)(0x8000 | (oldLength >> 15)); |
| 151 array[limit++] = (char)(0x8000 | oldLength); |
| 152 } |
| 153 if(newLength < LENGTH_IN_1TRAIL) { |
| 154 head |= newLength; |
| 155 } else if(newLength <= 0x7fff) { |
| 156 head |= LENGTH_IN_1TRAIL; |
| 157 array[limit++] = (char)(0x8000 | newLength); |
| 158 } else { |
| 159 head |= LENGTH_IN_2TRAIL + (newLength >> 30); |
| 160 array[limit++] = (char)(0x8000 | (newLength >> 15)); |
| 161 array[limit++] = (char)(0x8000 | newLength); |
| 162 } |
| 163 array[length] = (char)head; |
| 164 length = limit; |
| 165 } |
| 166 } |
| 167 |
| 168 private void append(int r) { |
| 169 if(length < array.length || growArray()) { |
| 170 array[length++] = (char)r; |
| 171 } |
| 172 } |
| 173 |
| 174 private boolean growArray() { |
| 175 int newCapacity; |
| 176 if (array.length == STACK_CAPACITY) { |
| 177 newCapacity = 2000; |
| 178 } else if (array.length == Integer.MAX_VALUE) { |
| 179 throw new BufferOverflowException(); |
| 180 } else if (array.length >= (Integer.MAX_VALUE / 2)) { |
| 181 newCapacity = Integer.MAX_VALUE; |
| 182 } else { |
| 183 newCapacity = 2 * array.length; |
| 184 } |
| 185 // Grow by at least 5 units so that a maximal change record will fit. |
| 186 if ((newCapacity - array.length) < 5) { |
| 187 throw new BufferOverflowException(); |
| 188 } |
| 189 array = Arrays.copyOf(array, newCapacity); |
| 190 return true; |
| 191 } |
| 192 |
| 193 /** |
| 194 * How much longer is the new text compared with the old text? |
| 195 * @return new length minus old length |
| 196 * @draft ICU 59 |
| 197 * @provisional This API might change or be removed in a future release. |
| 198 */ |
| 199 public int lengthDelta() { return delta; } |
| 200 /** |
| 201 * @return true if there are any change edits |
| 202 * @draft ICU 59 |
| 203 * @provisional This API might change or be removed in a future release. |
| 204 */ |
| 205 public boolean hasChanges() { |
| 206 if (delta != 0) { |
| 207 return true; |
| 208 } |
| 209 for (int i = 0; i < length; ++i) { |
| 210 if (array[i] > MAX_UNCHANGED) { |
| 211 return true; |
| 212 } |
| 213 } |
| 214 return false; |
| 215 } |
| 216 |
| 217 /** |
| 218 * Access to the list of edits. |
| 219 * @see #getCoarseIterator |
| 220 * @see #getFineIterator |
| 221 * @draft ICU 59 |
| 222 * @provisional This API might change or be removed in a future release. |
| 223 */ |
| 224 public static final class Iterator { |
| 225 private final char[] array; |
| 226 private int index; |
| 227 private final int length; |
| 228 private int remaining; |
| 229 private final boolean onlyChanges_, coarse; |
| 230 |
| 231 private boolean changed; |
| 232 private int oldLength_, newLength_; |
| 233 private int srcIndex, replIndex, destIndex; |
| 234 |
| 235 private Iterator(char[] a, int len, boolean oc, boolean crs) { |
| 236 array = a; |
| 237 length = len; |
| 238 onlyChanges_ = oc; |
| 239 coarse = crs; |
| 240 } |
| 241 |
| 242 private int readLength(int head) { |
| 243 if (head < LENGTH_IN_1TRAIL) { |
| 244 return head; |
| 245 } else if (head < LENGTH_IN_2TRAIL) { |
| 246 assert(index < length); |
| 247 assert(array[index] >= 0x8000); |
| 248 return array[index++] & 0x7fff; |
| 249 } else { |
| 250 assert((index + 2) <= length); |
| 251 assert(array[index] >= 0x8000); |
| 252 assert(array[index + 1] >= 0x8000); |
| 253 int len = ((head & 1) << 30) | |
| 254 ((array[index] & 0x7fff) << 15) | |
| 255 (array[index + 1] & 0x7fff); |
| 256 index += 2; |
| 257 return len; |
| 258 } |
| 259 } |
| 260 |
| 261 private void updateIndexes() { |
| 262 srcIndex += oldLength_; |
| 263 if (changed) { |
| 264 replIndex += newLength_; |
| 265 } |
| 266 destIndex += newLength_; |
| 267 } |
| 268 |
| 269 private boolean noNext() { |
| 270 // No change beyond the string. |
| 271 changed = false; |
| 272 oldLength_ = newLength_ = 0; |
| 273 return false; |
| 274 } |
| 275 |
| 276 /** |
| 277 * Advances to the next edit. |
| 278 * @return true if there is another edit |
| 279 * @draft ICU 59 |
| 280 * @provisional This API might change or be removed in a future release. |
| 281 */ |
| 282 public boolean next() { |
| 283 return next(onlyChanges_); |
| 284 } |
| 285 |
| 286 private boolean next(boolean onlyChanges) { |
| 287 // We have an errorCode in case we need to start guarding against in
teger overflows. |
| 288 // It is also convenient for caller loops if we bail out when an err
or was set elsewhere. |
| 289 updateIndexes(); |
| 290 if (remaining > 0) { |
| 291 // Fine-grained iterator: Continue a sequence of equal-length ch
anges. |
| 292 --remaining; |
| 293 return true; |
| 294 } |
| 295 if (index >= length) { |
| 296 return noNext(); |
| 297 } |
| 298 int u = array[index++]; |
| 299 if (u <= MAX_UNCHANGED) { |
| 300 // Combine adjacent unchanged ranges. |
| 301 changed = false; |
| 302 oldLength_ = u + 1; |
| 303 while (index < length && (u = array[index]) <= MAX_UNCHANGED) { |
| 304 ++index; |
| 305 oldLength_ += u + 1; |
| 306 } |
| 307 newLength_ = oldLength_; |
| 308 if (onlyChanges) { |
| 309 updateIndexes(); |
| 310 if (index >= length) { |
| 311 return noNext(); |
| 312 } |
| 313 // already fetched u > MAX_UNCHANGED at index |
| 314 ++index; |
| 315 } else { |
| 316 return true; |
| 317 } |
| 318 } |
| 319 changed = true; |
| 320 if (u <= MAX_SHORT_CHANGE) { |
| 321 if (coarse) { |
| 322 int w = u >> 12; |
| 323 int len = (u & 0xfff) + 1; |
| 324 oldLength_ = newLength_ = len * w; |
| 325 } else { |
| 326 // Split a sequence of equal-length changes that was compres
sed into one unit. |
| 327 oldLength_ = newLength_ = u >> 12; |
| 328 remaining = u & 0xfff; |
| 329 return true; |
| 330 } |
| 331 } else { |
| 332 assert(u <= 0x7fff); |
| 333 oldLength_ = readLength((u >> 6) & 0x3f); |
| 334 newLength_ = readLength(u & 0x3f); |
| 335 if (!coarse) { |
| 336 return true; |
| 337 } |
| 338 } |
| 339 // Combine adjacent changes. |
| 340 while (index < length && (u = array[index]) > MAX_UNCHANGED) { |
| 341 ++index; |
| 342 if (u <= MAX_SHORT_CHANGE) { |
| 343 int w = u >> 12; |
| 344 int len = (u & 0xfff) + 1; |
| 345 len = len * w; |
| 346 oldLength_ += len; |
| 347 newLength_ += len; |
| 348 } else { |
| 349 assert(u <= 0x7fff); |
| 350 int oldLen = readLength((u >> 6) & 0x3f); |
| 351 int newLen = readLength(u & 0x3f); |
| 352 oldLength_ += oldLen; |
| 353 newLength_ += newLen; |
| 354 } |
| 355 } |
| 356 return true; |
| 357 } |
| 358 |
| 359 /** |
| 360 * Finds the edit that contains the source index. |
| 361 * The source index may be found in a non-change |
| 362 * even if normal iteration would skip non-changes. |
| 363 * Normal iteration can continue from a found edit. |
| 364 * |
| 365 * <p>The iterator state before this search logically does not matter. |
| 366 * (It may affect the performance of the search.) |
| 367 * |
| 368 * <p>The iterator state after this search is undefined |
| 369 * if the source index is out of bounds for the source string. |
| 370 * |
| 371 * @param i source index |
| 372 * @return true if the edit for the source index was found |
| 373 * @draft ICU 59 |
| 374 * @provisional This API might change or be removed in a future release. |
| 375 */ |
| 376 public boolean findSourceIndex(int i) { |
| 377 if (i < 0) { return false; } |
| 378 if (i < srcIndex) { |
| 379 // Reset the iterator to the start. |
| 380 index = remaining = oldLength_ = newLength_ = srcIndex = replInd
ex = destIndex = 0; |
| 381 } else if (i < (srcIndex + oldLength_)) { |
| 382 // The index is in the current span. |
| 383 return true; |
| 384 } |
| 385 while (next(false)) { |
| 386 if (i < (srcIndex + oldLength_)) { |
| 387 // The index is in the current span. |
| 388 return true; |
| 389 } |
| 390 if (remaining > 0) { |
| 391 // Is the index in one of the remaining compressed edits? |
| 392 // srcIndex is the start of the current span, before the rem
aining ones. |
| 393 int len = (remaining + 1) * oldLength_; |
| 394 if (i < (srcIndex + len)) { |
| 395 int n = (i - srcIndex) / oldLength_; // 1 <= n <= remai
ning |
| 396 len = n * oldLength_; |
| 397 srcIndex += len; |
| 398 replIndex += len; |
| 399 destIndex += len; |
| 400 remaining -= n; |
| 401 return true; |
| 402 } |
| 403 // Make next() skip all of these edits at once. |
| 404 oldLength_ = newLength_ = len; |
| 405 remaining = 0; |
| 406 } |
| 407 } |
| 408 return false; |
| 409 } |
| 410 |
| 411 /** |
| 412 * @return true if this edit replaces oldLength() units with newLength()
different ones. |
| 413 * false if oldLength units remain unchanged. |
| 414 * @draft ICU 59 |
| 415 * @provisional This API might change or be removed in a future release. |
| 416 */ |
| 417 public boolean hasChange() { return changed; } |
| 418 /** |
| 419 * @return the number of units in the original string which are replaced
or remain unchanged. |
| 420 * @draft ICU 59 |
| 421 * @provisional This API might change or be removed in a future release. |
| 422 */ |
| 423 public int oldLength() { return oldLength_; } |
| 424 /** |
| 425 * @return the number of units in the modified string, if hasChange() is
true. |
| 426 * Same as oldLength if hasChange() is false. |
| 427 * @draft ICU 59 |
| 428 * @provisional This API might change or be removed in a future release. |
| 429 */ |
| 430 public int newLength() { return newLength_; } |
| 431 |
| 432 /** |
| 433 * @return the current index into the source string |
| 434 * @draft ICU 59 |
| 435 * @provisional This API might change or be removed in a future release. |
| 436 */ |
| 437 public int sourceIndex() { return srcIndex; } |
| 438 /** |
| 439 * @return the current index into the replacement-characters-only string
, |
| 440 * not counting unchanged spans |
| 441 * @draft ICU 59 |
| 442 * @provisional This API might change or be removed in a future release. |
| 443 */ |
| 444 public int replacementIndex() { return replIndex; } |
| 445 /** |
| 446 * @return the current index into the full destination string |
| 447 * @draft ICU 59 |
| 448 * @provisional This API might change or be removed in a future release. |
| 449 */ |
| 450 public int destinationIndex() { return destIndex; } |
| 451 }; |
| 452 |
| 453 /** |
| 454 * Returns an Iterator for coarse-grained changes for simple string updates. |
| 455 * Skips non-changes. |
| 456 * @return an Iterator that merges adjacent changes. |
| 457 * @draft ICU 59 |
| 458 * @provisional This API might change or be removed in a future release. |
| 459 */ |
| 460 public Iterator getCoarseChangesIterator() { |
| 461 return new Iterator(array, length, true, true); |
| 462 } |
| 463 |
| 464 /** |
| 465 * Returns an Iterator for coarse-grained changes and non-changes for simple
string updates. |
| 466 * @return an Iterator that merges adjacent changes. |
| 467 * @draft ICU 59 |
| 468 * @provisional This API might change or be removed in a future release. |
| 469 */ |
| 470 public Iterator getCoarseIterator() { |
| 471 return new Iterator(array, length, false, true); |
| 472 } |
| 473 |
| 474 /** |
| 475 * Returns an Iterator for fine-grained changes for modifying styled text. |
| 476 * Skips non-changes. |
| 477 * @return an Iterator that separates adjacent changes. |
| 478 * @draft ICU 59 |
| 479 * @provisional This API might change or be removed in a future release. |
| 480 */ |
| 481 public Iterator getFineChangesIterator() { |
| 482 return new Iterator(array, length, true, false); |
| 483 } |
| 484 |
| 485 /** |
| 486 * Returns an Iterator for fine-grained changes and non-changes for modifyin
g styled text. |
| 487 * @return an Iterator that separates adjacent changes. |
| 488 * @draft ICU 59 |
| 489 * @provisional This API might change or be removed in a future release. |
| 490 */ |
| 491 public Iterator getFineIterator() { |
| 492 return new Iterator(array, length, false, false); |
| 493 } |
| 494 } |
OLD | NEW |