Left: | ||
Right: |
|
OLD | NEW |
---|---|
(Empty) | |
1 // © 2017 and later: Unicode, Inc. and others. | |
2 // License & terms of use: http://www.unicode.org/copyright.html | |
3 | |
4 // edits.cpp | |
5 // created: 2017feb08 Markus W. Scherer | |
6 | |
7 #include "unicode/utypes.h" | |
8 #include "unicode/edits.h" | |
9 #include "cmemory.h" | |
10 #include "uassert.h" | |
11 | |
12 U_NAMESPACE_BEGIN | |
13 | |
14 namespace { | |
15 | |
16 // 0000uuuuuuuuuuuu records u+1 unchanged text units. | |
17 const int32_t MAX_UNCHANGED_LENGTH = 0x1000; | |
18 const int32_t MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1; | |
19 | |
20 // 0wwwcccccccccccc with w=1..6 records ccc+1 replacements of w:w text units. | |
21 // No length change. | |
22 const int32_t MAX_SHORT_WIDTH = 6; | |
23 const int32_t MAX_SHORT_CHANGE_LENGTH = 0xfff; | |
24 const int32_t MAX_SHORT_CHANGE = 0x6fff; | |
25 | |
26 // 0111mmmmmmnnnnnn records a replacement of m text units with n. | |
27 // m or n = 61: actual length follows in the next edits array unit. | |
28 // m or n = 62..63: actual length follows in the next two edits array units. | |
29 // Bit 30 of the actual length is in the head unit. | |
30 // Trailing units have bit 15 set. | |
31 const int32_t LENGTH_IN_1TRAIL = 61; | |
32 const int32_t LENGTH_IN_2TRAIL = 62; | |
33 | |
34 } // namespace | |
35 | |
36 Edits::~Edits() { | |
37 if(array != stackArray) { | |
38 uprv_free(array); | |
39 } | |
40 } | |
41 | |
42 void Edits::reset() { | |
43 length = delta = 0; | |
44 } | |
45 | |
46 void Edits::addUnchanged(int32_t unchangedLength) { | |
47 if(U_FAILURE(errorCode) || unchangedLength == 0) { return; } | |
48 if(unchangedLength < 0) { | |
49 errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
50 return; | |
51 } | |
52 // Merge into previous unchanged-text record, if any. | |
53 int32_t last = lastUnit(); | |
54 if(last < MAX_UNCHANGED) { | |
55 int32_t remaining = MAX_UNCHANGED - last; | |
56 if (remaining >= unchangedLength) { | |
57 setLastUnit(last + unchangedLength); | |
58 return; | |
59 } | |
60 setLastUnit(MAX_UNCHANGED); | |
61 unchangedLength -= remaining; | |
62 } | |
63 // Split large lengths into multiple units. | |
64 while(unchangedLength >= MAX_UNCHANGED_LENGTH) { | |
65 append(MAX_UNCHANGED); | |
66 unchangedLength -= MAX_UNCHANGED_LENGTH; | |
67 } | |
68 // Write a small (remaining) length. | |
69 if(unchangedLength > 0) { | |
70 append(unchangedLength - 1); | |
71 } | |
72 } | |
73 | |
74 void Edits::addReplace(int32_t oldLength, int32_t newLength) { | |
75 if(U_FAILURE(errorCode)) { return; } | |
76 if(oldLength == newLength && 0 < oldLength && oldLength <= MAX_SHORT_WIDTH) { | |
77 // Replacement of short oldLength text units by same-length new text. | |
78 // Merge into previous short-replacement record, if any. | |
79 int32_t last = lastUnit(); | |
80 if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE && | |
81 (last >> 12) == oldLength && (last & 0xfff) < MAX_SHORT_CHANGE_L ENGTH) { | |
82 setLastUnit(last + 1); | |
83 return; | |
84 } | |
85 append(oldLength << 12); | |
86 return; | |
87 } | |
88 | |
89 if(oldLength < 0 || newLength < 0) { | |
90 errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |
91 return; | |
92 } | |
93 if (oldLength == 0 && newLength == 0) { | |
94 return; | |
95 } | |
96 int32_t newDelta = newLength - oldLength; | |
97 if (newDelta != 0) { | |
98 if ((newDelta > 0 && delta >= 0 && newDelta > (INT32_MAX - delta)) || | |
99 (newDelta < 0 && delta < 0 && newDelta < (INT32_MIN - delta))) { | |
100 // Integer overflow or underflow. | |
101 errorCode = U_INDEX_OUTOFBOUNDS_ERROR; | |
102 return; | |
103 } | |
104 delta += newDelta; | |
105 } | |
106 | |
107 int32_t head = 0x7000; | |
108 if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) { | |
109 head |= oldLength << 6; | |
110 head |= newLength; | |
111 append(head); | |
112 } else if ((capacity - length) >= 5 || growArray()) { | |
113 int32_t limit = length + 1; | |
114 if(oldLength < LENGTH_IN_1TRAIL) { | |
115 head |= oldLength << 6; | |
116 } else if(oldLength <= 0x7fff) { | |
117 head |= LENGTH_IN_1TRAIL << 6; | |
118 array[limit++] = (uint16_t)(0x8000 | oldLength); | |
119 } else { | |
120 head |= (LENGTH_IN_2TRAIL + (oldLength >> 30)) << 6; | |
121 array[limit++] = (uint16_t)(0x8000 | (oldLength >> 15)); | |
122 array[limit++] = (uint16_t)(0x8000 | oldLength); | |
123 } | |
124 if(newLength < LENGTH_IN_1TRAIL) { | |
125 head |= newLength; | |
126 } else if(newLength <= 0x7fff) { | |
127 head |= LENGTH_IN_1TRAIL; | |
128 array[limit++] = (uint16_t)(0x8000 | newLength); | |
129 } else { | |
130 head |= LENGTH_IN_2TRAIL + (newLength >> 30); | |
131 array[limit++] = (uint16_t)(0x8000 | (newLength >> 15)); | |
132 array[limit++] = (uint16_t)(0x8000 | newLength); | |
133 } | |
134 array[length] = (uint16_t)head; | |
135 length = limit; | |
136 } | |
137 } | |
138 | |
139 void Edits::append(int32_t r) { | |
140 if(length < capacity || growArray()) { | |
141 array[length++] = (uint16_t)r; | |
142 } | |
143 } | |
144 | |
145 UBool Edits::growArray() { | |
andy.heninger
2017/02/17 05:59:53
Too bad you can't use MaybeStackArray. I think it
markus.icu
2017/02/17 18:31:21
Acknowledged.
| |
146 int32_t newCapacity; | |
147 if (array == stackArray) { | |
148 newCapacity = 2000; | |
149 } else if (capacity == INT32_MAX) { | |
150 errorCode = U_BUFFER_OVERFLOW_ERROR; | |
151 return FALSE; | |
andy.heninger
2017/02/17 05:59:53
Not related to this code specifically, but maybe w
markus.icu
2017/02/17 18:31:22
Maybe. We should discuss our strategy in the team
| |
152 } else if (capacity >= (INT32_MAX / 2)) { | |
153 newCapacity = INT32_MAX; | |
154 } else { | |
155 newCapacity = 2 * capacity; | |
156 } | |
157 // Grow by at least 5 units so that a maximal change record will fit. | |
158 if ((newCapacity - capacity) < 5) { | |
159 errorCode = U_BUFFER_OVERFLOW_ERROR; | |
160 return FALSE; | |
161 } | |
162 uint16_t *newArray = (uint16_t *)uprv_malloc((size_t)newCapacity * 2); | |
andy.heninger
2017/02/17 05:59:53
uprv_realloc?
Maybe not, since it doesn't help whe
markus.icu
2017/02/17 18:31:22
Acknowledged.
| |
163 if (newArray == NULL) { | |
164 errorCode = U_MEMORY_ALLOCATION_ERROR; | |
165 return FALSE; | |
166 } | |
167 uprv_memcpy(newArray, array, (size_t)length * 2); | |
168 if (array != stackArray) { | |
169 uprv_free(array); | |
170 } | |
171 array = newArray; | |
172 capacity = newCapacity; | |
173 return TRUE; | |
174 } | |
175 | |
176 UBool Edits::copyErrorTo(UErrorCode &outErrorCode) { | |
177 if (U_FAILURE(outErrorCode)) { return TRUE; } | |
178 if (U_SUCCESS(errorCode)) { return FALSE; } | |
179 outErrorCode = errorCode; | |
180 return TRUE; | |
181 } | |
182 | |
183 UBool Edits::hasChanges() const { | |
184 if (delta != 0) { | |
185 return TRUE; | |
186 } | |
187 for (int32_t i = 0; i < length; ++i) { | |
188 if (array[i] > MAX_UNCHANGED) { | |
189 return TRUE; | |
190 } | |
191 } | |
192 return FALSE; | |
193 } | |
194 | |
195 Edits::Iterator::Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs) : | |
196 array(a), index(0), length(len), remaining(0), | |
197 onlyChanges_(oc), coarse(crs), | |
198 changed(FALSE), oldLength_(0), newLength_(0), | |
199 srcIndex(0), replIndex(0), destIndex(0) {} | |
200 | |
201 int32_t Edits::Iterator::readLength(int32_t head) { | |
202 if (head < LENGTH_IN_1TRAIL) { | |
203 return head; | |
204 } else if (head < LENGTH_IN_2TRAIL) { | |
205 U_ASSERT(index < length); | |
206 U_ASSERT(array[index] >= 0x8000); | |
207 return array[index++] & 0x7fff; | |
208 } else { | |
209 U_ASSERT((index + 2) <= length); | |
210 U_ASSERT(array[index] >= 0x8000); | |
211 U_ASSERT(array[index + 1] >= 0x8000); | |
212 int32_t len = ((head & 1) << 30) | | |
213 ((int32_t)(array[index] & 0x7fff) << 15) | | |
214 (array[index + 1] & 0x7fff); | |
215 index += 2; | |
216 return len; | |
217 } | |
218 } | |
219 | |
220 void Edits::Iterator::updateIndexes() { | |
221 srcIndex += oldLength_; | |
222 if (changed) { | |
223 replIndex += newLength_; | |
224 } | |
225 destIndex += newLength_; | |
226 } | |
227 | |
228 UBool Edits::Iterator::noNext() { | |
229 // No change beyond the string. | |
230 changed = FALSE; | |
231 oldLength_ = newLength_ = 0; | |
232 return FALSE; | |
233 } | |
234 | |
235 UBool Edits::Iterator::next(UBool onlyChanges, UErrorCode &errorCode) { | |
236 if (U_FAILURE(errorCode)) { return FALSE; } | |
237 // We have an errorCode in case we need to start guarding against integer ov erflows. | |
238 // It is also convenient for caller loops if we bail out when an error was s et elsewhere. | |
239 updateIndexes(); | |
240 if (remaining > 0) { | |
241 // Fine-grained iterator: Continue a sequence of equal-length changes. | |
242 --remaining; | |
243 return TRUE; | |
244 } | |
245 if (index >= length) { | |
246 return noNext(); | |
247 } | |
248 int32_t u = array[index++]; | |
249 if (u <= MAX_UNCHANGED) { | |
250 // Combine adjacent unchanged ranges. | |
251 changed = FALSE; | |
252 oldLength_ = u + 1; | |
253 while (index < length && (u = array[index]) <= MAX_UNCHANGED) { | |
254 ++index; | |
255 oldLength_ += u + 1; | |
256 } | |
257 newLength_ = oldLength_; | |
258 if (onlyChanges) { | |
259 updateIndexes(); | |
260 if (index >= length) { | |
261 return noNext(); | |
262 } | |
263 // already fetched u > MAX_UNCHANGED at index | |
264 ++index; | |
265 } else { | |
266 return TRUE; | |
267 } | |
268 } | |
269 changed = TRUE; | |
270 if (u <= MAX_SHORT_CHANGE) { | |
271 if (coarse) { | |
272 int32_t w = u >> 12; | |
273 int32_t len = (u & 0xfff) + 1; | |
274 oldLength_ = newLength_ = len * w; | |
275 } else { | |
276 // Split a sequence of equal-length changes that was compressed into one unit. | |
277 oldLength_ = newLength_ = u >> 12; | |
278 remaining = u & 0xfff; | |
279 return TRUE; | |
280 } | |
281 } else { | |
282 U_ASSERT(u <= 0x7fff); | |
283 oldLength_ = readLength((u >> 6) & 0x3f); | |
284 newLength_ = readLength(u & 0x3f); | |
285 if (!coarse) { | |
286 return TRUE; | |
287 } | |
288 } | |
289 // Combine adjacent changes. | |
290 while (index < length && (u = array[index]) > MAX_UNCHANGED) { | |
291 ++index; | |
292 if (u <= MAX_SHORT_CHANGE) { | |
293 int32_t w = u >> 12; | |
294 int32_t len = (u & 0xfff) + 1; | |
295 len = len * w; | |
296 oldLength_ += len; | |
297 newLength_ += len; | |
298 } else { | |
299 U_ASSERT(u <= 0x7fff); | |
300 int32_t oldLen = readLength((u >> 6) & 0x3f); | |
301 int32_t newLen = readLength(u & 0x3f); | |
302 oldLength_ += oldLen; | |
303 newLength_ += newLen; | |
304 } | |
305 } | |
306 return TRUE; | |
307 } | |
308 | |
309 UBool Edits::Iterator::findSourceIndex(int32_t i, UErrorCode &errorCode) { | |
310 if (U_FAILURE(errorCode) || i < 0) { return FALSE; } | |
311 if (i < srcIndex) { | |
312 // Reset the iterator to the start. | |
313 index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = des tIndex = 0; | |
314 } else if (i < (srcIndex + oldLength_)) { | |
315 // The index is in the current span. | |
316 return TRUE; | |
317 } | |
318 while (next(FALSE, errorCode)) { | |
319 if (i < (srcIndex + oldLength_)) { | |
320 // The index is in the current span. | |
321 return TRUE; | |
322 } | |
323 if (remaining > 0) { | |
324 // Is the index in one of the remaining compressed edits? | |
325 // srcIndex is the start of the current span, before the remaining o nes. | |
326 int32_t len = (remaining + 1) * oldLength_; | |
327 if (i < (srcIndex + len)) { | |
328 int32_t n = (i - srcIndex) / oldLength_; // 1 <= n <= remaining | |
329 len = n * oldLength_; | |
330 srcIndex += len; | |
331 replIndex += len; | |
332 destIndex += len; | |
333 remaining -= n; | |
334 return TRUE; | |
335 } | |
336 // Make next() skip all of these edits at once. | |
337 oldLength_ = newLength_ = len; | |
338 remaining = 0; | |
339 } | |
340 } | |
341 return FALSE; | |
342 } | |
343 | |
344 U_NAMESPACE_END | |
OLD | NEW |