LEFT | RIGHT |
1 // Copyright 2009 The Go Authors. All rights reserved. | 1 // Copyright 2009 The Go Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style | 2 // Use of this source code is governed by a BSD-style |
3 // license that can be found in the LICENSE file. | 3 // license that can be found in the LICENSE file. |
4 | 4 |
5 // Garbage collector. | 5 // Garbage collector. |
6 | 6 |
7 #include "runtime.h" | 7 #include "runtime.h" |
8 #include "arch.h" | 8 #include "arch_GOARCH.h" |
9 #include "malloc.h" | 9 #include "malloc.h" |
10 #include "stack.h" | 10 #include "stack.h" |
11 | 11 |
12 enum { | 12 enum { |
13 Debug = 0, | 13 Debug = 0, |
14 PtrSize = sizeof(void*), | 14 PtrSize = sizeof(void*), |
15 DebugMark = 0, // run second pass to check mark | 15 DebugMark = 0, // run second pass to check mark |
| 16 DataBlock = 8*1024, |
16 | 17 |
17 // Four bits per word (see #defines below). | 18 // Four bits per word (see #defines below). |
18 wordsPerBitmapWord = sizeof(void*)*8/4, | 19 wordsPerBitmapWord = sizeof(void*)*8/4, |
19 bitShift = sizeof(void*)*8/4, | 20 bitShift = sizeof(void*)*8/4, |
20 }; | 21 }; |
21 | 22 |
22 // Bits in per-word bitmap. | 23 // Bits in per-word bitmap. |
23 // #defines because enum might not be able to hold the values. | 24 // #defines because enum might not be able to hold the values. |
24 // | 25 // |
25 // Each word in the bitmap describes wordsPerBitmapWord words | 26 // Each word in the bitmap describes wordsPerBitmapWord words |
(...skipping 19 matching lines...) |
45 // /* then test bits & bitAllocated, bits & bitMarked, etc. */ | 46 // /* then test bits & bitAllocated, bits & bitMarked, etc. */ |
46 // | 47 // |
47 #define bitAllocated ((uintptr)1<<(bitShift*0)) | 48 #define bitAllocated ((uintptr)1<<(bitShift*0)) |
48 #define bitNoPointers ((uintptr)1<<(bitShift*1)) /* when bitAllocated is set */ | 49 #define bitNoPointers ((uintptr)1<<(bitShift*1)) /* when bitAllocated is set */ |
49 #define bitMarked ((uintptr)1<<(bitShift*2)) /* when bitAllocated is set */ | 50 #define bitMarked ((uintptr)1<<(bitShift*2)) /* when bitAllocated is set */ |
50 #define bitSpecial ((uintptr)1<<(bitShift*3)) /* when bitAllocated is set - has finalizer or being profiled */ | 51 #define bitSpecial ((uintptr)1<<(bitShift*3)) /* when bitAllocated is set - has finalizer or being profiled */ |
51 #define bitBlockBoundary ((uintptr)1<<(bitShift*1)) /* when bitAllocated is NOT set */ | 52 #define bitBlockBoundary ((uintptr)1<<(bitShift*1)) /* when bitAllocated is NOT set */ |
52 | 53 |
53 #define bitMask (bitBlockBoundary | bitAllocated | bitMarked | bitSpecial) | 54 #define bitMask (bitBlockBoundary | bitAllocated | bitMarked | bitSpecial) |
54 | 55 |
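For readers following the bitmap layout above, here is a small standalone C sketch (not part of this CL) of how a heap address maps to its bitmap word and 4-bit group. The toy arena, the helper name bitfor, and main() are assumptions of the sketch; the runtime does this arithmetic inline in scanblock and the sweep, against the real arena_start.

#include <stdint.h>
#include <stdio.h>

enum { wordsPerBitmapWord = sizeof(void*)*8/4, bitShift = sizeof(void*)*8/4 };
#define bitAllocated ((uintptr_t)1<<(bitShift*0))
#define bitMarked    ((uintptr_t)1<<(bitShift*2))

/* Map a word-aligned heap address to the bitmap word describing it and
   the shift of its 4-bit group; the bitmap grows down from arena_start. */
static uintptr_t*
bitfor(uintptr_t *arena_start, void *v, uintptr_t *shift)
{
    uintptr_t off = (uintptr_t*)v - arena_start;   /* word offset into the arena */
    *shift = off % wordsPerBitmapWord;
    return arena_start - off/wordsPerBitmapWord - 1;
}

int main(void)
{
    /* Toy arena: 16 heap words, with room for bitmap words just below it. */
    uintptr_t mem[2+16] = {0}, *arena_start = mem+2, *bitp, shift;

    bitp = bitfor(arena_start, arena_start+5, &shift);
    *bitp |= (bitAllocated|bitMarked)<<shift;      /* mark heap word 5 */
    printf("allocated=%d marked=%d\n",
        ((*bitp>>shift)&bitAllocated) != 0,
        ((*bitp>>shift)&bitMarked) != 0);
    return 0;
}

Placing the bitmap directly below arena_start is what lets a single subtraction locate the describing word, which both scanblock and sweepspan rely on.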
| 56 // Holding worldsema grants an M the right to try to stop the world. |
| 57 // The procedure is: |
| 58 // |
| 59 // runtime·semacquire(&runtime·worldsema); |
| 60 // m->gcing = 1; |
| 61 // runtime·stoptheworld(); |
| 62 // |
| 63 // ... do stuff ... |
| 64 // |
| 65 // m->gcing = 0; |
| 66 // runtime·semrelease(&runtime·worldsema); |
| 67 // runtime·starttheworld(); |
| 68 // |
| 69 uint32 runtime·worldsema = 1; |
55 static int32 gctrace; | 70 static int32 gctrace; |
56 | 71 |
57 typedef struct Workbuf Workbuf; | 72 typedef struct Workbuf Workbuf; |
58 struct Workbuf | 73 struct Workbuf |
59 { | 74 { |
60 Workbuf *next; | 75 Workbuf *next; |
| 76 uintptr pushcnt; |
61 uintptr nobj; | 77 uintptr nobj; |
62 » byte *obj[512-2]; | 78 » byte *obj[512-3]; |
63 }; | 79 }; |
64 | 80 |
65 typedef struct Finalizer Finalizer; | 81 typedef struct Finalizer Finalizer; |
66 struct Finalizer | 82 struct Finalizer |
67 { | 83 { |
68 void (*fn)(void*); | 84 void (*fn)(void*); |
69 void *arg; | 85 void *arg; |
70 int32 nret; | 86 int32 nret; |
71 }; | 87 }; |
72 | 88 |
73 typedef struct FinBlock FinBlock; | 89 typedef struct FinBlock FinBlock; |
74 struct FinBlock | 90 struct FinBlock |
75 { | 91 { |
76 FinBlock *alllink; | 92 FinBlock *alllink; |
77 FinBlock *next; | 93 FinBlock *next; |
78 int32 cnt; | 94 int32 cnt; |
79 int32 cap; | 95 int32 cap; |
80 Finalizer fin[1]; | 96 Finalizer fin[1]; |
81 }; | 97 }; |
82 | 98 |
83 extern byte data[]; | 99 extern byte data[]; |
84 extern byte etext[]; | 100 extern byte etext[]; |
85 extern byte end[]; | 101 extern byte ebss[]; |
86 | 102 |
87 static G *fing; | 103 static G *fing; |
88 static FinBlock *finq; // list of finalizers that are to be executed | 104 static FinBlock *finq; // list of finalizers that are to be executed |
89 static FinBlock *finc; // cache of free blocks | 105 static FinBlock *finc; // cache of free blocks |
90 static FinBlock *allfin; // list of all blocks | 106 static FinBlock *allfin; // list of all blocks |
91 static Lock finlock; | 107 static Lock finlock; |
92 static int32 fingwait; | 108 static int32 fingwait; |
93 | 109 |
94 static void runfinq(void); | 110 static void runfinq(void); |
95 static Workbuf* getempty(Workbuf*); | 111 static Workbuf* getempty(Workbuf*); |
96 static Workbuf* getfull(Workbuf*); | 112 static Workbuf* getfull(Workbuf*); |
97 static void putempty(Workbuf*); | 113 static void putempty(Workbuf*); |
98 static Workbuf* handoff(Workbuf*); | 114 static Workbuf* handoff(Workbuf*); |
99 | 115 |
100 static struct { | 116 // parallel for descriptor |
101 » Lock fmu; | 117 typedef struct Parfor Parfor; |
102 » Workbuf»*full; | 118 struct Parfor |
103 » Lock emu; | 119 { |
104 » Workbuf»*empty; | 120 » // executed for each element |
| 121 » void (*body)(Parfor*, uint32); |
| 122 » // number of idle threads |
| 123 » uint32 done; |
| 124 » // total number of threads |
| 125 » uint32 nthr; |
| 126 » // thread id sequencer |
| 127 » uint32 thrseq; |
| 128 » // iteration space [0, cnt) |
| 129 » uint32 cnt; |
| 130 » // arbitrary user context |
| 131 » void *ctx; |
| 132 » // if true, wait while all threads finish processing, |
| 133 » // otherwise parfor may return while other threads still working |
| 134 » bool wait; |
| 135 » byte pad[CacheLineSize]; |
| 136 » struct |
| 137 » { |
| 138 » » // the thread's iteration space [32lsb, 32msb) |
| 139 » » uint64 pos; |
| 140 » » byte pad[CacheLineSize-sizeof(uint64)]; |
| 141 » } thr[MaxGcproc]; |
| 142 }; |
| 143 |
| 144 typedef struct GcRoot GcRoot; |
| 145 struct GcRoot |
| 146 { |
| 147 » byte *p; |
| 148 » uintptr n; |
| 149 }; |
| 150 |
| 151 static struct |
| 152 { |
| 153 » uint64» full; // Workbuf* + ABA counter in 17msb |
| 154 » uint64» empty; // Workbuf* + ABA counter in 17msb |
105 uint32 nproc; | 155 uint32 nproc; |
| 156 byte pad0[CacheLineSize]; |
106 volatile uint32 nwait; | 157 volatile uint32 nwait; |
107 » volatile uint32»ndone; | 158 » byte pad1[CacheLineSize]; |
108 » volatile uint32 markdone; | 159 » volatile uint32 ndone; |
| 160 » volatile uint32 debugmarkdone; |
109 Note alldone; | 161 Note alldone; |
110 » MSpan» *spans; | 162 » Parfor» sweepfor; |
| 163 » Parfor» markfor; |
111 | 164 |
112 Lock; | 165 Lock; |
113 byte *chunk; | 166 byte *chunk; |
114 uintptr nchunk; | 167 uintptr nchunk; |
| 168 |
| 169 GcRoot *roots; |
| 170 uint32 nroot; |
| 171 uint32 rootcap; |
115 } work; | 172 } work; |
| 173 |
| 174 // lock-free stack |
| 175 // used to manage empty and free Workbuf's during mark phase |
| 176 static void lifopush(uint64 *a, Workbuf *b); |
| 177 static Workbuf* lifopop(uint64 *a); |
| 178 |
| 179 // parallel for over [0, n). |
| 180 // body() is executed for each iteration. |
| 181 // nthr - total number of worker threads. |
| 182 // ctx - arbitrary user context. |
| 183 // if wait=true, threads return from parfor() when all work is done; |
| 184 // otherwise, threads can return while other threads still finishing processing.········ |
| 185 static void parforsetup(Parfor *desc, void (*body)(Parfor*, uint32), |
| 186 uint32 nthr, uint32 n, void *ctx, bool wait); |
| 187 static void parfor(Parfor *desc); |
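The parfor() body itself is not in this hunk, so the standalone C sketch below only illustrates the conventions stated in the comments above: the iteration space [0, n) is split across nthr workers, and each worker's half-open range is packed into a uint64 with the begin in the 32 low bits and the end in the 32 high bits. The helper name packrange and the purely static split are assumptions of the sketch; the real descriptor additionally pads each slot to a cache line, and the parfor body (not shown here) coordinates threads through the done/nthr counters.

#include <stdint.h>
#include <stdio.h>

/* Pack a half-open range [begin, end): begin in the 32 low bits,
   end in the 32 high bits, matching the Parfor.pos comment above. */
static uint64_t
packrange(uint32_t begin, uint32_t end)
{
    return (uint64_t)begin | ((uint64_t)end<<32);
}

int main(void)
{
    enum { nthr = 4 };
    uint32_t n = 10, i, begin, end;
    uint64_t pos[nthr];

    /* Static split of [0, n) into nthr roughly equal chunks,
       in the spirit of what a setup routine like parforsetup does. */
    for(i=0; i<nthr; i++)
        pos[i] = packrange((uint32_t)((uint64_t)n*i/nthr),
                           (uint32_t)((uint64_t)n*(i+1)/nthr));

    /* Each worker unpacks its own range and runs the body over it;
       a full parallel for would typically also rebalance work between slots. */
    for(i=0; i<nthr; i++) {
        begin = (uint32_t)pos[i];
        end = (uint32_t)(pos[i]>>32);
        printf("thread %u: [%u, %u)\n", (unsigned)i, (unsigned)begin, (unsigned)end);
    }
    return 0;
}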
116 | 188 |
117 // scanblock scans a block of n bytes starting at pointer b for references | 189 // scanblock scans a block of n bytes starting at pointer b for references |
118 // to other objects, scanning any it finds recursively until there are no | 190 // to other objects, scanning any it finds recursively until there are no |
119 // unscanned objects left. Instead of using an explicit recursion, it keeps | 191 // unscanned objects left. Instead of using an explicit recursion, it keeps |
120 // a work list in the Workbuf* structures and loops in the main function | 192 // a work list in the Workbuf* structures and loops in the main function |
121 // body. Keeping an explicit work list is easier on the stack allocator and | 193 // body. Keeping an explicit work list is easier on the stack allocator and |
122 // more efficient. | 194 // more efficient. |
123 static void | 195 static void |
124 scanblock(byte *b, int64 n) | 196 scanblock(byte *b, int64 n) |
125 { | 197 { |
(...skipping 14 matching lines...) |
140 // Memory arena parameters. | 212 // Memory arena parameters. |
141 arena_start = runtime·mheap.arena_start; | 213 arena_start = runtime·mheap.arena_start; |
142 arena_used = runtime·mheap.arena_used; | 214 arena_used = runtime·mheap.arena_used; |
143 nproc = work.nproc; | 215 nproc = work.nproc; |
144 | 216 |
145 wbuf = nil; // current work buffer | 217 wbuf = nil; // current work buffer |
146 wp = nil; // storage for next queued pointer (write pointer) | 218 wp = nil; // storage for next queued pointer (write pointer) |
147 nobj = 0; // number of queued objects | 219 nobj = 0; // number of queued objects |
148 | 220 |
149 // Scanblock helpers pass b==nil. | 221 // Scanblock helpers pass b==nil. |
150 » // The main proc needs to return to make more | 222 » // Procs need to return to make more |
151 // calls to scanblock. But if work.nproc==1 then | 223 // calls to scanblock. But if work.nproc==1 then |
152 // might as well process blocks as soon as we | 224 // might as well process blocks as soon as we |
153 // have them. | 225 // have them. |
154 keepworking = b == nil || work.nproc == 1; | 226 keepworking = b == nil || work.nproc == 1; |
155 | 227 |
156 // Align b to a word boundary. | 228 // Align b to a word boundary. |
157 off = (uintptr)b & (PtrSize-1); | 229 off = (uintptr)b & (PtrSize-1); |
158 if(off != 0) { | 230 if(off != 0) { |
159 b += PtrSize - off; | 231 b += PtrSize - off; |
160 n -= PtrSize - off; | 232 n -= PtrSize - off; |
(...skipping 37 matching lines...) |
198 if(((xbits>>j) & (bitAllocated|bitBlockBoundary)) != 0) { | 270 if(((xbits>>j) & (bitAllocated|bitBlockBoundary)) != 0) { |
199 obj = (byte*)obj - (shift-j)*PtrSize; | 271 obj = (byte*)obj - (shift-j)*PtrSize; |
200 shift = j; | 272 shift = j; |
201 bits = xbits>>shift; | 273 bits = xbits>>shift; |
202 goto found; | 274 goto found; |
203 } | 275 } |
204 } | 276 } |
205 | 277 |
206 // Otherwise consult span table to find beginning. | 278 // Otherwise consult span table to find beginning. |
207 // (Manually inlined copy of MHeap_LookupMaybe.) | 279 // (Manually inlined copy of MHeap_LookupMaybe.) |
208 m->gcstats.nlookup++; | |
209 m->gcstats.naddrlookup++; | |
210 k = (uintptr)obj>>PageShift; | 280 k = (uintptr)obj>>PageShift; |
211 x = k; | 281 x = k; |
212 if(sizeof(void*) == 8) | 282 if(sizeof(void*) == 8) |
213 x -= (uintptr)arena_start>>PageShift; | 283 x -= (uintptr)arena_start>>PageShift; |
214 s = runtime·mheap.map[x]; | 284 s = runtime·mheap.map[x]; |
215 if(s == nil || k < s->start || k - s->start >= s->npages || s->state != MSpanInUse) | 285 if(s == nil || k < s->start || k - s->start >= s->npages || s->state != MSpanInUse) |
216 continue; | 286 continue; |
217 p = (byte*)((uintptr)s->start<<PageShift); | 287 p = (byte*)((uintptr)s->start<<PageShift); |
218 if(s->sizeclass == 0) { | 288 if(s->sizeclass == 0) { |
219 obj = p; | 289 obj = p; |
220 } else { | 290 } else { |
221 if((byte*)obj >= (byte*)s->limit) | 291 if((byte*)obj >= (byte*)s->limit) |
222 continue; | 292 continue; |
223 size = runtime·class_to_size[s->sizeclass]; | 293 size = runtime·class_to_size[s->sizeclass]; |
224 int32 i = ((byte*)obj - p)/size; | 294 int32 i = ((byte*)obj - p)/size; |
225 obj = p+i*size; | 295 obj = p+i*size; |
226 } | 296 } |
227 | 297 |
228 // Now that we know the object header, reload bits. | 298 // Now that we know the object header, reload bits. |
229 off = (uintptr*)obj - (uintptr*)arena_start; | 299 off = (uintptr*)obj - (uintptr*)arena_start; |
230 bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1; | 300 bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1; |
231 shift = off % wordsPerBitmapWord; | 301 shift = off % wordsPerBitmapWord; |
232 xbits = *bitp; | 302 xbits = *bitp; |
233 bits = xbits >> shift; | 303 bits = xbits >> shift; |
234 | 304 |
235 found: | 305 found: |
| 306 // If another proc wants a pointer, give it some. |
| 307 if(work.nwait > 0 && nobj > 4 && work.full == 0) { |
| 308 wbuf->nobj = nobj; |
| 309 wbuf = handoff(wbuf); |
| 310 nobj = wbuf->nobj; |
| 311 wp = wbuf->obj + nobj; |
| 312 } |
| 313 |
236 // Now we have bits, bitp, and shift correct for | 314 // Now we have bits, bitp, and shift correct for |
237 // obj pointing at the base of the object. | 315 // obj pointing at the base of the object. |
238 // Only care about allocated and not marked. | 316 // Only care about allocated and not marked. |
239 if((bits & (bitAllocated|bitMarked)) != bitAllocated) | 317 if((bits & (bitAllocated|bitMarked)) != bitAllocated) |
240 continue; | 318 continue; |
241 if(nproc == 1) | 319 if(nproc == 1) |
242 *bitp |= bitMarked<<shift; | 320 *bitp |= bitMarked<<shift; |
243 else { | 321 else { |
244 for(;;) { | 322 for(;;) { |
245 x = *bitp; | 323 x = *bitp; |
246 if(x & (bitMarked<<shift)) | 324 if(x & (bitMarked<<shift)) |
247 goto continue_obj; | 325 goto continue_obj; |
248 if(runtime·casp((void**)bitp, (void*)x, (void*)(x|(bitMarked<<shift)))) | 326 if(runtime·casp((void**)bitp, (void*)x, (void*)(x|(bitMarked<<shift)))) |
249 break; | 327 break; |
250 } | 328 } |
251 } | 329 } |
252 | 330 |
253 // If object has no pointers, don't need to scan further. | 331 // If object has no pointers, don't need to scan further. |
254 if((bits & bitNoPointers) != 0) | 332 if((bits & bitNoPointers) != 0) |
255 continue; | 333 continue; |
256 | 334 |
257 » » » // If another proc wants a pointer, give it some. | 335 » » » PREFETCH(obj); |
258 » » » if(nobj > 4 && work.nwait > 0 && work.full == nil) { | |
259 » » » » wbuf->nobj = nobj; | |
260 » » » » wbuf = handoff(wbuf); | |
261 » » » » nobj = wbuf->nobj; | |
262 » » » » wp = wbuf->obj + nobj; | |
263 » » » } | |
264 | 336 |
265 // If buffer is full, get a new one. | 337 // If buffer is full, get a new one. |
266 if(wbuf == nil || nobj >= nelem(wbuf->obj)) { | 338 if(wbuf == nil || nobj >= nelem(wbuf->obj)) { |
267 if(wbuf != nil) | 339 if(wbuf != nil) |
268 wbuf->nobj = nobj; | 340 wbuf->nobj = nobj; |
269 wbuf = getempty(wbuf); | 341 wbuf = getempty(wbuf); |
270 wp = wbuf->obj; | 342 wp = wbuf->obj; |
271 nobj = 0; | 343 nobj = 0; |
272 } | 344 } |
273 *wp++ = obj; | 345 *wp++ = obj; |
274 nobj++; | 346 nobj++; |
275 continue_obj:; | 347 continue_obj:; |
276 } | 348 } |
277 | 349 |
278 // Done scanning [b, b+n). Prepare for the next iteration of | 350 // Done scanning [b, b+n). Prepare for the next iteration of |
279 // the loop by setting b and n to the parameters for the next block. | 351 // the loop by setting b and n to the parameters for the next block. |
280 | 352 |
281 // Fetch b from the work buffer. | 353 // Fetch b from the work buffer. |
282 if(nobj == 0) { | 354 if(nobj == 0) { |
283 if(!keepworking) { | 355 if(!keepworking) { |
284 » » » » putempty(wbuf); | 356 » » » » if(wbuf != nil) |
| 357 » » » » » putempty(wbuf); |
285 return; | 358 return; |
286 } | 359 } |
287 // Emptied our buffer: refill. | 360 // Emptied our buffer: refill. |
288 wbuf = getfull(wbuf); | 361 wbuf = getfull(wbuf); |
289 if(wbuf == nil) | 362 if(wbuf == nil) |
290 return; | 363 return; |
291 nobj = wbuf->nobj; | 364 nobj = wbuf->nobj; |
292 wp = wbuf->obj + wbuf->nobj; | 365 wp = wbuf->obj + wbuf->nobj; |
293 } | 366 } |
294 b = *--wp; | 367 b = *--wp; |
295 nobj--; | 368 nobj--; |
296 | 369 |
297 » » // Figure out n = size of b. Start by loading bits for b. | 370 » » // Ask span about size class. |
298 » » off = (uintptr*)b - (uintptr*)arena_start; | |
299 » » bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1; | |
300 » » shift = off % wordsPerBitmapWord; | |
301 » » xbits = *bitp; | |
302 » » bits = xbits >> shift; | |
303 | |
304 » » // Might be small; look for nearby block boundary. | |
305 » » // A block boundary is marked by either bitBlockBoundary | |
306 » » // or bitAllocated being set (see notes near their definition). | |
307 » » enum { | |
308 » » » boundary = bitBlockBoundary|bitAllocated | |
309 » » }; | |
310 » » // Look for a block boundary both after and before b | |
311 » » // in the same bitmap word. | |
312 » » // | |
313 » » // A block boundary j words after b is indicated by | |
314 » » //» bits>>j & boundary | |
315 » » // assuming shift+j < bitShift. (If shift+j >= bitShift then | |
316 » » // we'll be bleeding other bit types like bitMarked into our test.) | |
317 » » // Instead of inserting the conditional shift+j < bitShift into the loop, | |
318 » » // we can let j range from 1 to bitShift as long as we first | |
319 » » // apply a mask to keep only the bits corresponding | |
320 » » // to shift+j < bitShift aka j < bitShift-shift. | |
321 » » bits &= (boundary<<(bitShift-shift)) - boundary; | |
322 | |
323 » » // A block boundary j words before b is indicated by | |
324 » » //» xbits>>(shift-j) & boundary | |
325 » » // (assuming shift >= j). There is no cleverness here | |
326 » » // to avoid the test, because when j gets too large the shift | |
327 » » // turns negative, which is undefined in C. | |
328 | |
329 » » for(j=1; j<bitShift; j++) { | |
330 » » » if(((bits>>j)&boundary) != 0 || shift>=j && ((xbits>>(shift-j))&boundary) != 0) { | |
331 » » » » n = j*PtrSize; | |
332 » » » » goto scan; | |
333 » » » } | |
334 » » } | |
335 | |
336 » » // Fall back to asking span about size class. | |
337 // (Manually inlined copy of MHeap_Lookup.) | 371 // (Manually inlined copy of MHeap_Lookup.) |
338 m->gcstats.nlookup++; | |
339 m->gcstats.nsizelookup++; | |
340 x = (uintptr)b>>PageShift; | 372 x = (uintptr)b>>PageShift; |
341 if(sizeof(void*) == 8) | 373 if(sizeof(void*) == 8) |
342 x -= (uintptr)arena_start>>PageShift; | 374 x -= (uintptr)arena_start>>PageShift; |
343 s = runtime·mheap.map[x]; | 375 s = runtime·mheap.map[x]; |
344 if(s->sizeclass == 0) | 376 if(s->sizeclass == 0) |
345 n = s->npages<<PageShift; | 377 n = s->npages<<PageShift; |
346 else | 378 else |
347 n = runtime·class_to_size[s->sizeclass]; | 379 n = runtime·class_to_size[s->sizeclass]; |
348 scan:; | |
349 } | 380 } |
350 } | 381 } |
351 | 382 |
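To make the work-list idea from the scanblock comment concrete, here is a minimal standalone C sketch that marks a small object graph with an explicit stack instead of recursion. The Node type, the marked field, and the fixed-size work array are assumptions of the sketch; scanblock itself chains Workbufs and records marks in the heap bitmap rather than in the objects.

#include <stddef.h>
#include <stdio.h>

/* Tiny object graph: each node has up to two outgoing pointers. */
typedef struct Node Node;
struct Node {
    Node *ref[2];
    int marked;
};

enum { MaxWork = 64 };

/* Mark everything reachable from root without recursion, keeping
   pending objects in an explicit work list. */
static void
mark(Node *root)
{
    Node *work[MaxWork], *n;
    int nobj = 0, i;

    work[nobj++] = root;
    while(nobj > 0) {
        n = work[--nobj];
        if(n == NULL || n->marked)
            continue;
        n->marked = 1;
        for(i=0; i<2; i++)
            if(nobj < MaxWork)
                work[nobj++] = n->ref[i];
    }
}

int main(void)
{
    Node a = {{0}}, b = {{0}}, c = {{0}}, d = {{0}};
    a.ref[0] = &b; b.ref[0] = &c; c.ref[0] = &a;   /* cycle a -> b -> c -> a */
    (void)d;                                       /* d stays unreachable */

    mark(&a);
    printf("a=%d b=%d c=%d d=%d\n", a.marked, b.marked, c.marked, d.marked);
    return 0;
}

Keeping the pending pointers in an explicit list is also what makes the handoff below possible: a half-full buffer can simply be passed to another idle proc.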
352 // debug_scanblock is the debug copy of scanblock. | 383 // debug_scanblock is the debug copy of scanblock. |
353 // it is simpler, slower, single-threaded, recursive, | 384 // it is simpler, slower, single-threaded, recursive, |
354 // and uses bitSpecial as the mark bit. | 385 // and uses bitSpecial as the mark bit. |
355 static void | 386 static void |
356 debug_scanblock(byte *b, int64 n) | 387 debug_scanblock(byte *b, int64 n) |
357 { | 388 { |
358 byte *obj, *p; | 389 byte *obj, *p; |
(...skipping 25 matching lines...) |
384 if((byte*)obj < runtime·mheap.arena_start || (byte*)obj >= runtime·mheap.arena_used) | 415 if((byte*)obj < runtime·mheap.arena_start || (byte*)obj >= runtime·mheap.arena_used) |
385 continue; | 416 continue; |
386 | 417 |
387 // Round down to word boundary. | 418 // Round down to word boundary. |
388 obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1)); | 419 obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1)); |
389 | 420 |
390 // Consult span table to find beginning. | 421 // Consult span table to find beginning. |
391 s = runtime·MHeap_LookupMaybe(&runtime·mheap, obj); | 422 s = runtime·MHeap_LookupMaybe(&runtime·mheap, obj); |
392 if(s == nil) | 423 if(s == nil) |
393 continue; | 424 continue; |
394 | |
395 | 425 |
396 p = (byte*)((uintptr)s->start<<PageShift); | 426 p = (byte*)((uintptr)s->start<<PageShift); |
397 if(s->sizeclass == 0) { | 427 if(s->sizeclass == 0) { |
398 obj = p; | 428 obj = p; |
399 size = (uintptr)s->npages<<PageShift; | 429 size = (uintptr)s->npages<<PageShift; |
400 } else { | 430 } else { |
401 if((byte*)obj >= (byte*)s->limit) | 431 if((byte*)obj >= (byte*)s->limit) |
402 continue; | 432 continue; |
403 size = runtime·class_to_size[s->sizeclass]; | 433 size = runtime·class_to_size[s->sizeclass]; |
404 int32 i = ((byte*)obj - p)/size; | 434 int32 i = ((byte*)obj - p)/size; |
(...skipping 17 matching lines...) |
422 runtime·printf("found unmarked block %p in %p\n", obj, vp+i); | 452 runtime·printf("found unmarked block %p in %p\n", obj, vp+i); |
423 | 453 |
424 // If object has no pointers, don't need to scan further. | 454 // If object has no pointers, don't need to scan further. |
425 if((bits & bitNoPointers) != 0) | 455 if((bits & bitNoPointers) != 0) |
426 continue; | 456 continue; |
427 | 457 |
428 debug_scanblock(obj, size); | 458 debug_scanblock(obj, size); |
429 } | 459 } |
430 } | 460 } |
431 | 461 |
| 462 static void |
| 463 markroot(Parfor *desc, uint32 i) |
| 464 { |
| 465 USED(desc); |
| 466 scanblock(work.roots[i].p, work.roots[i].n); |
| 467 } |
| 468 |
432 // Get an empty work buffer off the work.empty list, | 469 // Get an empty work buffer off the work.empty list, |
433 // allocating new buffers as needed. | 470 // allocating new buffers as needed. |
434 static Workbuf* | 471 static Workbuf* |
435 getempty(Workbuf *b) | 472 getempty(Workbuf *b) |
436 { | 473 { |
437 » if(work.nproc == 1) { | 474 » if(b != nil) |
438 » » // Put b on full list. | 475 » » lifopush(&work.full, b); |
439 » » if(b != nil) { | 476 » b = lifopop(&work.empty); |
440 » » » b->next = work.full; | 477 » if(b == nil) { |
441 » » » work.full = b; | 478 » » // Need to allocate. |
442 » » } | 479 » » runtime·lock(&work); |
443 » » // Grab from empty list if possible. | 480 » » if(work.nchunk < sizeof *b) { |
444 » » b = work.empty; | 481 » » » work.nchunk = 1<<20; |
445 » » if(b != nil) { | 482 » » » work.chunk = runtime·SysAlloc(work.nchunk); |
446 » » » work.empty = b->next; | 483 » » } |
447 » » » goto haveb; | 484 » » b = (Workbuf*)work.chunk; |
448 » » } | 485 » » work.chunk += sizeof *b; |
449 » } else { | 486 » » work.nchunk -= sizeof *b; |
450 » » // Put b on full list. | 487 » » runtime·unlock(&work); |
451 » » if(b != nil) { | 488 » } |
452 » » » runtime·lock(&work.fmu); | |
453 » » » b->next = work.full; | |
454 » » » work.full = b; | |
455 » » » runtime·unlock(&work.fmu); | |
456 » » } | |
457 » » // Grab from empty list if possible. | |
458 » » runtime·lock(&work.emu); | |
459 » » b = work.empty; | |
460 » » if(b != nil) | |
461 » » » work.empty = b->next; | |
462 » » runtime·unlock(&work.emu); | |
463 » » if(b != nil) | |
464 » » » goto haveb; | |
465 » } | |
466 | |
467 » // Need to allocate. | |
468 » runtime·lock(&work); | |
469 » if(work.nchunk < sizeof *b) { | |
470 » » work.nchunk = 1<<20; | |
471 » » work.chunk = runtime·SysAlloc(work.nchunk); | |
472 » } | |
473 » b = (Workbuf*)work.chunk; | |
474 » work.chunk += sizeof *b; | |
475 » work.nchunk -= sizeof *b; | |
476 » runtime·unlock(&work); | |
477 | |
478 haveb: | |
479 b->nobj = 0; | 489 b->nobj = 0; |
480 return b; | 490 return b; |
481 } | 491 } |
482 | 492 |
483 static void | 493 static void |
484 putempty(Workbuf *b) | 494 putempty(Workbuf *b) |
485 { | 495 { |
486 » if(b == nil) | 496 » lifopush(&work.empty, b); |
487 » » return; | |
488 | |
489 » if(work.nproc == 1) { | |
490 » » b->next = work.empty; | |
491 » » work.empty = b; | |
492 » » return; | |
493 » } | |
494 | |
495 » runtime·lock(&work.emu); | |
496 » b->next = work.empty; | |
497 » work.empty = b; | |
498 » runtime·unlock(&work.emu); | |
499 } | 497 } |
500 | 498 |
501 // Get a full work buffer off the work.full list, or return nil. | 499 // Get a full work buffer off the work.full list, or return nil. |
502 static Workbuf* | 500 static Workbuf* |
503 getfull(Workbuf *b) | 501 getfull(Workbuf *b) |
504 { | 502 { |
505 int32 i; | 503 int32 i; |
506 » Workbuf *b1; | 504 |
507 | 505 » if(b != nil) |
508 » if(work.nproc == 1) { | 506 » » lifopush(&work.empty, b); |
509 » » // Put b on empty list. | 507 » b = lifopop(&work.full); |
510 » » if(b != nil) { | 508 » if(b != nil || work.nproc == 1) |
511 » » » b->next = work.empty; | |
512 » » » work.empty = b; | |
513 » » } | |
514 » » // Grab from full list if possible. | |
515 » » // Since work.nproc==1, no one else is | |
516 » » // going to give us work. | |
517 » » b = work.full; | |
518 » » if(b != nil) | |
519 » » » work.full = b->next; | |
520 return b; | 509 return b; |
521 } | |
522 | |
523 putempty(b); | |
524 | |
525 // Grab buffer from full list if possible. | |
526 for(;;) { | |
527 b1 = work.full; | |
528 if(b1 == nil) | |
529 break; | |
530 runtime·lock(&work.fmu); | |
531 if(work.full != nil) { | |
532 b1 = work.full; | |
533 work.full = b1->next; | |
534 runtime·unlock(&work.fmu); | |
535 return b1; | |
536 } | |
537 runtime·unlock(&work.fmu); | |
538 } | |
539 | 510 |
540 runtime·xadd(&work.nwait, +1); | 511 runtime·xadd(&work.nwait, +1); |
541 for(i=0;; i++) { | 512 for(i=0;; i++) { |
542 » » b1 = work.full; | 513 » » if(work.full != 0) { |
543 » » if(b1 != nil) { | 514 » » » runtime·xadd(&work.nwait, -1); |
544 » » » runtime·lock(&work.fmu); | 515 » » » b = lifopop(&work.full); |
545 » » » if(work.full != nil) { | 516 » » » if(b != nil) |
546 » » » » runtime·xadd(&work.nwait, -1); | 517 » » » » return b; |
547 » » » » b1 = work.full; | 518 » » » runtime·xadd(&work.nwait, +1); |
548 » » » » work.full = b1->next; | |
549 » » » » runtime·unlock(&work.fmu); | |
550 » » » » return b1; | |
551 » » » } | |
552 » » » runtime·unlock(&work.fmu); | |
553 » » » continue; | |
554 } | 519 } |
555 if(work.nwait == work.nproc) | 520 if(work.nwait == work.nproc) |
556 return nil; | 521 return nil; |
557 » » if(i < 10) | 522 » » if(i < 10) { |
| 523 » » » m->gcstats.nprocyield++; |
558 runtime·procyield(20); | 524 runtime·procyield(20); |
559 » » else if(i < 20) | 525 » » } else if(i < 20) { |
| 526 » » » m->gcstats.nosyield++; |
560 runtime·osyield(); | 527 runtime·osyield(); |
561 » » else | 528 » » } else { |
| 529 » » » m->gcstats.nsleep++; |
562 runtime·usleep(100); | 530 runtime·usleep(100); |
| 531 } |
563 } | 532 } |
564 } | 533 } |
565 | 534 |
566 static Workbuf* | 535 static Workbuf* |
567 handoff(Workbuf *b) | 536 handoff(Workbuf *b) |
568 { | 537 { |
569 int32 n; | 538 int32 n; |
570 Workbuf *b1; | 539 Workbuf *b1; |
571 | 540 |
572 // Make new buffer with half of b's pointers. | 541 // Make new buffer with half of b's pointers. |
573 b1 = getempty(nil); | 542 b1 = getempty(nil); |
574 n = b->nobj/2; | 543 n = b->nobj/2; |
575 b->nobj -= n; | 544 b->nobj -= n; |
576 b1->nobj = n; | 545 b1->nobj = n; |
577 runtime·memmove(b1->obj, b->obj+b->nobj, n*sizeof b1->obj[0]); | 546 runtime·memmove(b1->obj, b->obj+b->nobj, n*sizeof b1->obj[0]); |
578 » m->gcstats.nhandoff += n; | 547 » m->gcstats.nhandoff += 1; |
| 548 » m->gcstats.nhandoffcnt += n; |
579 | 549 |
580 // Put b on full list - let first half of b get stolen. | 550 // Put b on full list - let first half of b get stolen. |
581 » runtime·lock(&work.fmu); | 551 » lifopush(&work.full, b); |
582 » b->next = work.full; | |
583 » work.full = b; | |
584 » runtime·unlock(&work.fmu); | |
585 | |
586 return b1; | 552 return b1; |
587 } | 553 } |
588 | 554 |
589 // Scanstack calls scanblock on each of gp's stack segments. | |
590 static void | 555 static void |
591 scanstack(void (*scanblock)(byte*, int64), G *gp) | 556 addroot(byte *p, uintptr n) |
| 557 { |
| 558 » uint32 cap; |
| 559 » GcRoot *new; |
| 560 |
| 561 » if(work.nroot >= work.rootcap) { |
| 562 » » cap = PageSize/sizeof(GcRoot); |
| 563 » » if(cap < 2*work.rootcap) |
| 564 » » » cap = 2*work.rootcap; |
| 565 » » new = (GcRoot*)runtime·SysAlloc(cap*sizeof(GcRoot)); |
| 566 » » if(work.roots != nil) { |
| 567 » » » runtime·memmove(new, work.roots, work.rootcap*sizeof(GcRoot)); |
| 568 » » » runtime·SysFree(work.roots, work.rootcap*sizeof(GcRoot)); |
| 569 » » } |
| 570 » » work.roots = new; |
| 571 » » work.rootcap = cap; |
| 572 » } |
| 573 » work.roots[work.nroot].p = p; |
| 574 » work.roots[work.nroot].n = n; |
| 575 » work.nroot++; |
| 576 } |
| 577 |
| 578 static void |
| 579 addstackroots(G *gp) |
592 { | 580 { |
593 M *mp; | 581 M *mp; |
594 int32 n; | 582 int32 n; |
595 Stktop *stk; | 583 Stktop *stk; |
596 byte *sp, *guard; | 584 byte *sp, *guard; |
597 | 585 |
598 stk = (Stktop*)gp->stackbase; | 586 stk = (Stktop*)gp->stackbase; |
599 guard = gp->stackguard; | 587 guard = gp->stackguard; |
600 | 588 |
601 if(gp == g) { | 589 if(gp == g) { |
(...skipping 12 matching lines...) |
614 // as schedlock and may have needed to start a new stack segment. | 602 // as schedlock and may have needed to start a new stack segment. |
615 // Use the stack segment and stack pointer at the time of | 603 // Use the stack segment and stack pointer at the time of |
616 // the system call instead, since that won't change underfoot. | 604 // the system call instead, since that won't change underfoot. |
617 if(gp->gcstack != nil) { | 605 if(gp->gcstack != nil) { |
618 stk = (Stktop*)gp->gcstack; | 606 stk = (Stktop*)gp->gcstack; |
619 sp = gp->gcsp; | 607 sp = gp->gcsp; |
620 guard = gp->gcguard; | 608 guard = gp->gcguard; |
621 } | 609 } |
622 } | 610 } |
623 | 611 |
624 if(Debug > 1) | |
625 runtime·printf("scanstack %d %p\n", gp->goid, sp); | |
626 n = 0; | 612 n = 0; |
627 while(stk) { | 613 while(stk) { |
628 if(sp < guard-StackGuard || (byte*)stk < sp) { | 614 if(sp < guard-StackGuard || (byte*)stk < sp) { |
629 runtime·printf("scanstack inconsistent: g%d#%d sp=%p not in [%p,%p]\n", gp->goid, n, sp, guard-StackGuard, stk); | 615 runtime·printf("scanstack inconsistent: g%d#%d sp=%p not in [%p,%p]\n", gp->goid, n, sp, guard-StackGuard, stk); |
630 runtime·throw("scanstack"); | 616 runtime·throw("scanstack"); |
631 } | 617 } |
632 » » scanblock(sp, (byte*)stk - sp); | 618 » » addroot(sp, (byte*)stk - sp); |
633 sp = stk->gobuf.sp; | 619 sp = stk->gobuf.sp; |
634 guard = stk->stackguard; | 620 guard = stk->stackguard; |
635 stk = (Stktop*)stk->stackbase; | 621 stk = (Stktop*)stk->stackbase; |
636 n++; | 622 n++; |
637 } | 623 } |
638 } | 624 } |
639 | 625 |
640 // Markfin calls scanblock on the blocks that have finalizers: | |
641 // the things pointed at cannot be freed until the finalizers have run. | |
642 static void | 626 static void |
643 markfin(void *v) | 627 addfinroots(void *v) |
644 { | 628 { |
645 uintptr size; | 629 uintptr size; |
646 | 630 |
647 size = 0; | 631 size = 0; |
648 if(!runtime·mlookup(v, &v, &size, nil) || !runtime·blockspecial(v)) | 632 if(!runtime·mlookup(v, &v, &size, nil) || !runtime·blockspecial(v)) |
649 runtime·throw("mark - finalizer inconsistency"); | 633 runtime·throw("mark - finalizer inconsistency"); |
650 | 634 |
651 // do not mark the finalizer block itself. just mark the things it points at. | 635 // do not mark the finalizer block itself. just mark the things it points at. |
652 » scanblock(v, size); | 636 » addroot(v, size); |
653 } | 637 } |
654 | 638 |
655 static void | 639 static void |
656 debug_markfin(void *v) | 640 addroots(void) |
657 { | |
658 » uintptr size; | |
659 | |
660 » if(!runtime·mlookup(v, &v, &size, nil)) | |
661 » » runtime·throw("debug_mark - finalizer inconsistency"); | |
662 » debug_scanblock(v, size); | |
663 } | |
664 | |
665 // Mark | |
666 static void | |
667 mark(void (*scan)(byte*, int64)) | |
668 { | 641 { |
669 G *gp; | 642 G *gp; |
670 FinBlock *fb; | 643 FinBlock *fb; |
| 644 byte *p; |
| 645 |
| 646 work.nroot = 0; |
671 | 647 |
672 // mark data+bss. | 648 // mark data+bss. |
673 » // skip runtime·mheap itself, which has no interesting pointers | 649 » for(p=data; p<ebss; p+=DataBlock) |
674 » // and is mostly zeroed and would not otherwise be paged in. | 650 » » addroot(p, p+DataBlock<ebss?DataBlock:ebss-p); |
675 » scan(data, (byte*)&runtime·mheap - data); | 651 |
676 » scan((byte*)(&runtime·mheap+1), end - (byte*)(&runtime·mheap+1)); | 652 » runtime·walkfintab(addfinroots); |
677 | 653 |
678 » // mark stacks | 654 » for(fb=allfin; fb; fb=fb->alllink) |
| 655 » » addroot((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0])); |
| 656 |
679 for(gp=runtime·allg; gp!=nil; gp=gp->alllink) { | 657 for(gp=runtime·allg; gp!=nil; gp=gp->alllink) { |
680 switch(gp->status){ | 658 switch(gp->status){ |
681 » » default: | 659 » » » default: |
682 » » » runtime·printf("unexpected G.status %d\n", gp->status); | 660 » » » » runtime·printf("unexpected G.status %d\n", gp->status); |
683 » » » runtime·throw("mark - bad status"); | 661 » » » » runtime·throw("mark - bad status"); |
684 » » case Gdead: | 662 » » » case Gdead: |
685 » » » break; | 663 » » » » break; |
686 » » case Grunning: | 664 » » » case Grunning: |
687 » » » if(gp != g) | 665 » » » » if(gp != g) |
688 » » » » runtime·throw("mark - world not stopped"); | 666 » » » » » runtime·throw("mark - world not stopped"); |
689 » » » scanstack(scan, gp); | 667 » » » » addstackroots(gp); |
690 » » » break; | 668 » » » » break; |
691 » » case Grunnable: | 669 » » » case Grunnable: |
692 » » case Gsyscall: | 670 » » » case Gsyscall: |
693 » » case Gwaiting: | 671 » » » case Gwaiting: |
694 » » » scanstack(scan, gp); | 672 » » » » addstackroots(gp); |
695 » » » break; | 673 » » » » break; |
696 » » } | 674 » » } |
697 » } | 675 » } |
698 | |
699 » // mark things pointed at by objects with finalizers | |
700 » if(scan == debug_scanblock) | |
701 » » runtime·walkfintab(debug_markfin); | |
702 » else | |
703 » » runtime·walkfintab(markfin); | |
704 | |
705 » for(fb=allfin; fb; fb=fb->alllink) | |
706 » » scanblock((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0])); | |
707 | |
708 » // in multiproc mode, join in the queued work. | |
709 » scan(nil, 0); | |
710 } | 676 } |
711 | 677 |
712 static bool | 678 static bool |
713 handlespecial(byte *p, uintptr size) | 679 handlespecial(byte *p, uintptr size) |
714 { | 680 { |
715 void (*fn)(void*); | 681 void (*fn)(void*); |
716 int32 nret; | 682 int32 nret; |
717 FinBlock *block; | 683 FinBlock *block; |
718 Finalizer *f; | 684 Finalizer *f; |
719 » | 685 |
720 if(!runtime·getfinalizer(p, true, &fn, &nret)) { | 686 if(!runtime·getfinalizer(p, true, &fn, &nret)) { |
721 runtime·setblockspecial(p, false); | 687 runtime·setblockspecial(p, false); |
722 runtime·MProf_Free(p, size); | 688 runtime·MProf_Free(p, size); |
723 return false; | 689 return false; |
724 } | 690 } |
725 | 691 |
726 runtime·lock(&finlock); | 692 runtime·lock(&finlock); |
727 if(finq == nil || finq->cnt == finq->cap) { | 693 if(finq == nil || finq->cnt == finq->cap) { |
728 if(finc == nil) { | 694 if(finc == nil) { |
729 finc = runtime·SysAlloc(PageSize); | 695 finc = runtime·SysAlloc(PageSize); |
730 finc->cap = (PageSize - sizeof(FinBlock)) / sizeof(Final
izer) + 1; | 696 finc->cap = (PageSize - sizeof(FinBlock)) / sizeof(Final
izer) + 1; |
731 finc->alllink = allfin; | 697 finc->alllink = allfin; |
732 allfin = finc; | 698 allfin = finc; |
733 } | 699 } |
734 block = finc; | 700 block = finc; |
735 finc = block->next; | 701 finc = block->next; |
736 block->next = finq; | 702 block->next = finq; |
737 finq = block; | 703 finq = block; |
738 } | 704 } |
739 f = &finq->fin[finq->cnt]; | 705 f = &finq->fin[finq->cnt]; |
740 finq->cnt++; | 706 finq->cnt++; |
741 f->fn = fn; | 707 f->fn = fn; |
742 f->nret = nret; | 708 f->nret = nret; |
743 f->arg = p; | 709 f->arg = p; |
744 » runtime·unlock(&finlock); | 710 » runtime·unlock(&finlock); |
745 return true; | 711 return true; |
746 } | 712 } |
747 | 713 |
748 // Sweep frees or collects finalizers for blocks not marked in the mark phase. | 714 // Sweep frees or collects finalizers for blocks not marked in the mark phase. |
749 // It clears the mark bits in preparation for the next GC round. | 715 // It clears the mark bits in preparation for the next GC round. |
750 static void | 716 static void |
751 sweep(void) | 717 sweepspan(Parfor *desc, uint32 spanidx) |
752 { | 718 { |
753 MSpan *s; | 719 MSpan *s; |
754 int32 cl, n, npages; | 720 int32 cl, n, npages; |
755 uintptr size; | 721 uintptr size; |
756 byte *p; | 722 byte *p; |
757 MCache *c; | 723 MCache *c; |
758 byte *arena_start; | 724 byte *arena_start; |
| 725 MLink *start, *end; |
| 726 int32 nfree; |
| 727 |
| 728 USED(desc); |
| 729 |
| 730 s = runtime·mheap.allspans[spanidx]; |
| 731 // Stamp newly unused spans. The scavenger will use that |
| 732 // info to potentially give back some pages to the OS. |
| 733 if(s->state == MSpanFree && s->unusedsince == 0) |
| 734 s->unusedsince = runtime·nanotime(); |
| 735 if(s->state != MSpanInUse) |
| 736 return; |
759 | 737 |
760 arena_start = runtime·mheap.arena_start; | 738 arena_start = runtime·mheap.arena_start; |
761 | 739 |
762 » for(;;) { | 740 » p = (byte*)(s->start << PageShift); |
763 » » s = work.spans; | 741 » cl = s->sizeclass; |
764 » » if(s == nil) | 742 » if(cl == 0) { |
765 » » » break; | 743 » » size = s->npages<<PageShift; |
766 » » if(!runtime·casp(&work.spans, s, s->allnext)) | 744 » » n = 1; |
| 745 » } else { |
| 746 » » // Chunk full of small blocks. |
| 747 » » size = runtime·class_to_size[cl]; |
| 748 » » npages = runtime·class_to_allocnpages[cl]; |
| 749 » » n = (npages << PageShift) / size; |
| 750 » } |
| 751 » c = m->mcache; |
| 752 » nfree = 0; |
| 753 » start = end = nil; |
| 754 |
| 755 » // Sweep through n objects of given size starting at p. |
| 756 » // This thread owns the span now, so it can manipulate |
| 757 » // the block bitmap without atomic operations. |
| 758 » for(; n > 0; n--, p += size) { |
| 759 » » uintptr off, *bitp, shift, bits; |
| 760 |
| 761 » » off = (uintptr*)p - (uintptr*)arena_start; |
| 762 » » bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1; |
| 763 » » PREFETCH((byte*)bitp - size); |
| 764 » » shift = off % wordsPerBitmapWord; |
| 765 » » bits = *bitp>>shift; |
| 766 |
| 767 » » if((bits & bitAllocated) == 0) |
767 continue; | 768 continue; |
768 | 769 |
769 » » if(s->state != MSpanInUse) | 770 » » if((bits & bitMarked) != 0) { |
| 771 » » » if(DebugMark) { |
| 772 » » » » if(!(bits & bitSpecial)) |
| 773 » » » » » runtime·printf("found spurious mark on %p\n", p); |
| 774 » » » » *bitp &= ~(bitSpecial<<shift); |
| 775 » » » } |
| 776 » » » *bitp &= ~(bitMarked<<shift); |
770 continue; | 777 continue; |
771 | 778 » » } |
772 » » p = (byte*)(s->start << PageShift); | 779 |
773 » » cl = s->sizeclass; | 780 » » // Special means it has a finalizer or is being profiled. |
| 781 » » // In DebugMark mode, the bit has been coopted so |
| 782 » » // we have to assume all blocks are special. |
| 783 » » if(DebugMark || (bits & bitSpecial) != 0) { |
| 784 » » » if(handlespecial(p, size)) |
| 785 » » » » continue; |
| 786 » » } |
| 787 |
| 788 » » // Mark freed; restore block boundary bit. |
| 789 » » *bitp = (*bitp & ~(bitMask<<shift)) | (bitBlockBoundary<<shift); |
| 790 |
774 if(cl == 0) { | 791 if(cl == 0) { |
775 » » » size = s->npages<<PageShift; | 792 » » » // Free large span. |
776 » » » n = 1; | 793 » » » runtime·unmarkspan(p, 1<<PageShift); |
777 » » } else { | 794 » » » *(uintptr*)p = 1;» // needs zeroing |
778 » » » // Chunk full of small blocks. | 795 » » » runtime·MHeap_Free(&runtime·mheap, s, 1); |
779 » » » size = runtime·class_to_size[cl]; | |
780 » » » npages = runtime·class_to_allocnpages[cl]; | |
781 » » » n = (npages << PageShift) / size; | |
782 » » } | |
783 | |
784 » » // Sweep through n objects of given size starting at p. | |
785 » » // This thread owns the span now, so it can manipulate | |
786 » » // the block bitmap without atomic operations. | |
787 » » for(; n > 0; n--, p += size) { | |
788 » » » uintptr off, *bitp, shift, bits; | |
789 | |
790 » » » off = (uintptr*)p - (uintptr*)arena_start; | |
791 » » » bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1; | |
792 » » » shift = off % wordsPerBitmapWord; | |
793 » » » bits = *bitp>>shift; | |
794 | |
795 » » » if((bits & bitAllocated) == 0) | |
796 » » » » continue; | |
797 | |
798 » » » if((bits & bitMarked) != 0) { | |
799 » » » » if(DebugMark) { | |
800 » » » » » if(!(bits & bitSpecial)) | |
801 » » » » » » runtime·printf("found spurious mark on %p\n", p); | |
802 » » » » » *bitp &= ~(bitSpecial<<shift); | |
803 » » » » } | |
804 » » » » *bitp &= ~(bitMarked<<shift); | |
805 » » » » continue; | |
806 » » » } | |
807 | |
808 » » » // Special means it has a finalizer or is being profiled. | |
809 » » » // In DebugMark mode, the bit has been coopted so | |
810 » » » // we have to assume all blocks are special. | |
811 » » » if(DebugMark || (bits & bitSpecial) != 0) { | |
812 » » » » if(handlespecial(p, size)) | |
813 » » » » » continue; | |
814 » » » } | |
815 | |
816 » » » // Mark freed; restore block boundary bit. | |
817 » » » *bitp = (*bitp & ~(bitMask<<shift)) | (bitBlockBoundary<<shift); | |
818 | |
819 » » » c = m->mcache; | |
820 » » » if(s->sizeclass == 0) { | |
821 » » » » // Free large span. | |
822 » » » » runtime·unmarkspan(p, 1<<PageShift); | |
823 » » » » *(uintptr*)p = 1;» // needs zeroing | |
824 » » » » runtime·MHeap_Free(&runtime·mheap, s, 1); | |
825 » » » } else { | |
826 » » » » // Free small object. | |
827 » » » » if(size > sizeof(uintptr)) | |
828 » » » » » ((uintptr*)p)[1] = 1;» // mark as "needs to be zeroed" | |
829 » » » » c->local_by_size[s->sizeclass].nfree++; | |
830 » » » » runtime·MCache_Free(c, p, s->sizeclass, size); | |
831 » » » } | |
832 c->local_alloc -= size; | 796 c->local_alloc -= size; |
833 c->local_nfree++; | 797 c->local_nfree++; |
834 » » } | 798 » » } else { |
| 799 » » » // Free small object. |
| 800 » » » if(nfree) { |
| 801 » » » » PREFETCH(p); |
| 802 » » » » if(size > sizeof(uintptr)) |
| 803 » » » » » ((uintptr*)end)[1] = 1; // mark as "needs to be zeroed" |
| 804 » » » » end->next = (MLink*)p; |
| 805 » » » » end = (MLink*)p; |
| 806 » » » } else { |
| 807 » » » » start = (MLink*)p; |
| 808 » » » » end = (MLink*)p; |
| 809 » » » } |
| 810 » » » nfree++; |
| 811 » » } |
| 812 » } |
| 813 |
| 814 » if(nfree) { |
| 815 » » if(size > sizeof(uintptr)) |
| 816 » » » ((uintptr*)end)[1] = 1; // mark as "needs to be zeroed" |
| 817 » » c->local_by_size[s->sizeclass].nfree += nfree; |
| 818 » » c->local_alloc -= size * nfree; |
| 819 » » c->local_nfree += nfree; |
| 820 » » c->local_cachealloc -= nfree * size; |
| 821 » » c->local_objects -= nfree; |
| 822 » » runtime·MCentral_FreeSpan(&runtime·mheap.central[cl], |
| 823 » » » » » s, nfree, start, end); |
835 } | 824 } |
836 } | 825 } |
837 | 826 |
838 void | 827 void |
839 runtime·gchelper(void) | 828 runtime·gchelper(void) |
840 { | 829 { |
| 830 // parallel mark for over gc roots |
| 831 parfor(&work.markfor); |
| 832 // help other threads scan secondary blocks |
841 scanblock(nil, 0); | 833 scanblock(nil, 0); |
| 834 |
842 if(DebugMark) { | 835 if(DebugMark) { |
843 » » while(runtime·atomicload(&work.markdone) == 0) | 836 » » // wait while the main thread executes mark(debug_scanblock) |
| 837 » » while(runtime·atomicload(&work.debugmarkdone) == 0) |
844 runtime·usleep(10); | 838 runtime·usleep(10); |
845 } | 839 } |
846 »······· | 840 |
847 » sweep(); | 841 » // parallel sweep for over all spans |
| 842 » parfor(&work.sweepfor); |
848 | 843 |
849 if(runtime·xadd(&work.ndone, +1) == work.nproc-1) | 844 if(runtime·xadd(&work.ndone, +1) == work.nproc-1) |
850 runtime·notewakeup(&work.alldone); | 845 runtime·notewakeup(&work.alldone); |
851 } | 846 } |
852 | |
853 // Semaphore, not Lock, so that the goroutine | |
854 // reschedules when there is contention rather | |
855 // than spinning. | |
856 static uint32 gcsema = 1; | |
857 | 847 |
858 // Initialized from $GOGC. GOGC=off means no gc. | 848 // Initialized from $GOGC. GOGC=off means no gc. |
859 // | 849 // |
860 // Next gc is after we've allocated an extra amount of | 850 // Next gc is after we've allocated an extra amount of |
861 // memory proportional to the amount already in use. | 851 // memory proportional to the amount already in use. |
862 // If gcpercent=100 and we're using 4M, we'll gc again | 852 // If gcpercent=100 and we're using 4M, we'll gc again |
863 // when we get to 8M. This keeps the gc cost in linear | 853 // when we get to 8M. This keeps the gc cost in linear |
864 // proportion to the allocation cost. Adjusting gcpercent | 854 // proportion to the allocation cost. Adjusting gcpercent |
865 // just changes the linear constant (and also the amount of | 855 // just changes the linear constant (and also the amount of |
866 // extra memory used). | 856 // extra memory used). |
867 static int32 gcpercent = -2; | 857 static int32 gcpercent = -2; |
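A quick standalone check of the arithmetic in the comment above, using the comment's own numbers: with GOGC=100 and 4 MB in use, the target for the next collection comes out to 8 MB, mirroring the next_gc computation later in runtime·gc.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t heap_alloc = 4ull<<20;   /* 4 MB currently in use */
    int32_t gcpercent = 100;          /* default when GOGC is unset */
    uint64_t next_gc = heap_alloc + heap_alloc*gcpercent/100;

    printf("next_gc = %llu MB\n", (unsigned long long)(next_gc>>20));   /* prints 8 */
    return 0;
}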
868 | 858 |
869 static void | 859 static void |
870 stealcache(void) | 860 stealcache(void) |
871 { | 861 { |
872 M *m; | 862 M *m; |
873 | 863 |
874 for(m=runtime·allm; m; m=m->alllink) | 864 for(m=runtime·allm; m; m=m->alllink) |
875 runtime·MCache_ReleaseAll(m->mcache); | 865 runtime·MCache_ReleaseAll(m->mcache); |
876 } | 866 } |
877 | 867 |
878 static void | 868 static void |
879 cachestats(GCStats *stats) | 869 cachestats(GCStats *stats) |
880 { | 870 { |
881 M *m; | 871 M *m; |
882 MCache *c; | 872 MCache *c; |
883 int32 i; | 873 int32 i; |
884 uint64 stacks_inuse; | 874 uint64 stacks_inuse; |
885 uint64 stacks_sys; | 875 uint64 stacks_sys; |
| 876 uint64 *src, *dst; |
886 | 877 |
887 if(stats) | 878 if(stats) |
888 runtime·memclr((byte*)stats, sizeof(*stats)); | 879 runtime·memclr((byte*)stats, sizeof(*stats)); |
889 stacks_inuse = 0; | 880 stacks_inuse = 0; |
890 stacks_sys = 0; | 881 stacks_sys = 0; |
891 for(m=runtime·allm; m; m=m->alllink) { | 882 for(m=runtime·allm; m; m=m->alllink) { |
892 runtime·purgecachedstats(m); | 883 runtime·purgecachedstats(m); |
893 stacks_inuse += m->stackalloc->inuse; | 884 stacks_inuse += m->stackalloc->inuse; |
894 stacks_sys += m->stackalloc->sys; | 885 stacks_sys += m->stackalloc->sys; |
895 if(stats) { | 886 if(stats) { |
896 » » » stats->nlookup += m->gcstats.nlookup; | 887 » » » src = (uint64*)&m->gcstats; |
897 » » » stats->nsizelookup += m->gcstats.nsizelookup; | 888 » » » dst = (uint64*)stats; |
898 » » » stats->naddrlookup += m->gcstats.naddrlookup; | 889 » » » for(i=0; i<sizeof(*stats)/sizeof(uint64); i++) |
899 » » » stats->nhandoff += m->gcstats.nhandoff; | 890 » » » » dst[i] += src[i]; |
900 runtime·memclr((byte*)&m->gcstats, sizeof(m->gcstats)); | 891 runtime·memclr((byte*)&m->gcstats, sizeof(m->gcstats)); |
901 } | 892 } |
902 c = m->mcache; | 893 c = m->mcache; |
903 for(i=0; i<nelem(c->local_by_size); i++) { | 894 for(i=0; i<nelem(c->local_by_size); i++) { |
904 mstats.by_size[i].nmalloc += c->local_by_size[i].nmalloc; | 895 mstats.by_size[i].nmalloc += c->local_by_size[i].nmalloc; |
905 c->local_by_size[i].nmalloc = 0; | 896 c->local_by_size[i].nmalloc = 0; |
906 mstats.by_size[i].nfree += c->local_by_size[i].nfree; | 897 mstats.by_size[i].nfree += c->local_by_size[i].nfree; |
907 c->local_by_size[i].nfree = 0; | 898 c->local_by_size[i].nfree = 0; |
908 } | 899 } |
909 } | 900 } |
910 mstats.stacks_inuse = stacks_inuse; | 901 mstats.stacks_inuse = stacks_inuse; |
911 mstats.stacks_sys = stacks_sys; | 902 mstats.stacks_sys = stacks_sys; |
912 } | 903 } |
913 | 904 |
914 void | 905 void |
915 runtime·gc(int32 force) | 906 runtime·gc(int32 force) |
916 { | 907 { |
917 int64 t0, t1, t2, t3; | 908 int64 t0, t1, t2, t3; |
918 uint64 heap0, heap1, obj0, obj1; | 909 uint64 heap0, heap1, obj0, obj1; |
| 910 GCStats stats; |
| 911 uint32 i; |
919 byte *p; | 912 byte *p; |
920 bool extra; | |
921 GCStats stats; | |
922 | 913 |
923 // The gc is turned off (via enablegc) until | 914 // The gc is turned off (via enablegc) until |
924 // the bootstrap has completed. | 915 // the bootstrap has completed. |
925 // Also, malloc gets called in the guts | 916 // Also, malloc gets called in the guts |
926 // of a number of libraries that might be | 917 // of a number of libraries that might be |
927 // holding locks. To avoid priority inversion | 918 // holding locks. To avoid priority inversion |
928 // problems, don't bother trying to run gc | 919 // problems, don't bother trying to run gc |
929 // while holding a lock. The next mallocgc | 920 // while holding a lock. The next mallocgc |
930 // without a lock will do the gc instead. | 921 // without a lock will do the gc instead. |
931 if(!mstats.enablegc || m->locks > 0 || runtime·panicking) | 922 if(!mstats.enablegc || m->locks > 0 || runtime·panicking) |
932 return; | 923 return; |
933 | 924 |
934 if(gcpercent == -2) { // first time through | 925 if(gcpercent == -2) { // first time through |
935 p = runtime·getenv("GOGC"); | 926 p = runtime·getenv("GOGC"); |
936 if(p == nil || p[0] == '\0') | 927 if(p == nil || p[0] == '\0') |
937 gcpercent = 100; | 928 gcpercent = 100; |
938 else if(runtime·strcmp(p, (byte*)"off") == 0) | 929 else if(runtime·strcmp(p, (byte*)"off") == 0) |
939 gcpercent = -1; | 930 gcpercent = -1; |
940 else | 931 else |
941 gcpercent = runtime·atoi(p); | 932 gcpercent = runtime·atoi(p); |
942 | 933 |
943 p = runtime·getenv("GOGCTRACE"); | 934 p = runtime·getenv("GOGCTRACE"); |
944 if(p != nil) | 935 if(p != nil) |
945 gctrace = runtime·atoi(p); | 936 gctrace = runtime·atoi(p); |
946 } | 937 } |
947 if(gcpercent < 0) | 938 if(gcpercent < 0) |
948 return; | 939 return; |
949 | 940 |
950 » runtime·semacquire(&gcsema); | 941 » runtime·semacquire(&runtime·worldsema); |
951 if(!force && mstats.heap_alloc < mstats.next_gc) { | 942 if(!force && mstats.heap_alloc < mstats.next_gc) { |
952 » » runtime·semrelease(&gcsema); | 943 » » runtime·semrelease(&runtime·worldsema); |
953 return; | 944 return; |
954 } | 945 } |
955 | 946 |
956 t0 = runtime·nanotime(); | 947 t0 = runtime·nanotime(); |
957 | 948 |
958 m->gcing = 1; | 949 m->gcing = 1; |
959 runtime·stoptheworld(); | 950 runtime·stoptheworld(); |
960 | 951 |
961 » cachestats(0); | 952 » heap0 = 0; |
962 » heap0 = mstats.heap_alloc; | 953 » obj0 = 0; |
963 » obj0 = mstats.nmalloc - mstats.nfree; | 954 » if(gctrace) { |
964 | 955 » » cachestats(0); |
965 » work.nproc = MaxGcproc+1; | 956 » » heap0 = mstats.heap_alloc; |
| 957 » » obj0 = mstats.nmalloc - mstats.nfree; |
| 958 » } |
| 959 |
| 960 » work.nproc = runtime·gcprocs(); |
966 work.nwait = 0; | 961 work.nwait = 0; |
967 work.ndone = 0; | 962 work.ndone = 0; |
968 » work.spans = runtime·mheap.allspans; | 963 » work.debugmarkdone = 0; |
969 » work.markdone = 0; | 964 » addroots(); |
970 » extra = false; | 965 » parforsetup(&work.markfor, markroot, work.nproc, work.nroot, nil, false); |
971 » if(runtime·gomaxprocs > 1 && runtime·ncpu > 1) { | 966 » parforsetup(&work.sweepfor, sweepspan, work.nproc, runtime·mheap.nspan, nil, true); |
| 967 » if(work.nproc > 1) { |
972 runtime·noteclear(&work.alldone); | 968 runtime·noteclear(&work.alldone); |
973 » » work.nproc += runtime·helpgc(&extra); | 969 » » runtime·helpgc(work.nproc); |
974 » } | 970 » } |
975 » work.nproc -= MaxGcproc; | 971 |
976 | 972 » parfor(&work.markfor); |
977 » mark(scanblock); | 973 » scanblock(nil, 0); |
| 974 |
978 if(DebugMark) { | 975 if(DebugMark) { |
979 » » mark(debug_scanblock); | 976 » » for(i=0; i<work.nroot; i++) |
980 » » runtime·xchg(&work.markdone, 1); | 977 » » » debug_scanblock(work.roots[i].p, work.roots[i].n); |
| 978 » » runtime·xchg(&work.debugmarkdone, 1); |
981 } | 979 } |
982 t1 = runtime·nanotime(); | 980 t1 = runtime·nanotime(); |
983 | 981 » // parallel sweep for over all spans |
984 » sweep(); | 982 » parfor(&work.sweepfor); |
985 » if(work.nproc > 1) | |
986 » » runtime·notesleep(&work.alldone); | |
987 t2 = runtime·nanotime(); | 983 t2 = runtime·nanotime(); |
988 | 984 |
989 stealcache(); | 985 stealcache(); |
990 cachestats(&stats); | 986 cachestats(&stats); |
991 | 987 |
992 mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*gcpercent/100; | 988 mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*gcpercent/100; |
993 m->gcing = 0; | 989 m->gcing = 0; |
994 | 990 |
995 m->locks++; // disable gc during the mallocs in newproc | |
996 if(finq != nil) { | 991 if(finq != nil) { |
| 992 m->locks++; // disable gc during the mallocs in newproc |
997 // kick off or wake up goroutine to run queued finalizers | 993 // kick off or wake up goroutine to run queued finalizers |
998 if(fing == nil) | 994 if(fing == nil) |
999 fing = runtime·newproc1((byte*)runfinq, nil, 0, 0, runtime·gc); | 995 fing = runtime·newproc1((byte*)runfinq, nil, 0, 0, runtime·gc); |
1000 else if(fingwait) { | 996 else if(fingwait) { |
1001 fingwait = 0; | 997 fingwait = 0; |
1002 runtime·ready(fing); | 998 runtime·ready(fing); |
1003 } | 999 } |
1004 » } | 1000 » » m->locks--; |
1005 » m->locks--; | 1001 » } |
| 1002 »······· |
| 1003 » if(work.nproc > 1) |
| 1004 » » runtime·notesleep(&work.alldone); |
1006 | 1005 |
1007 heap1 = mstats.heap_alloc; | 1006 heap1 = mstats.heap_alloc; |
1008 obj1 = mstats.nmalloc - mstats.nfree; | 1007 obj1 = mstats.nmalloc - mstats.nfree; |
1009 | 1008 |
1010 t3 = runtime·nanotime(); | 1009 t3 = runtime·nanotime(); |
| 1010 mstats.last_gc = t3; |
1011 mstats.pause_ns[mstats.numgc%nelem(mstats.pause_ns)] = t3 - t0; | 1011 mstats.pause_ns[mstats.numgc%nelem(mstats.pause_ns)] = t3 - t0; |
1012 mstats.pause_total_ns += t3 - t0; | 1012 mstats.pause_total_ns += t3 - t0; |
1013 mstats.numgc++; | 1013 mstats.numgc++; |
1014 if(mstats.debuggc) | 1014 if(mstats.debuggc) |
1015 runtime·printf("pause %D\n", t3-t0); | 1015 runtime·printf("pause %D\n", t3-t0); |
1016 | 1016 |
1017 if(gctrace) { | 1017 if(gctrace) { |
1018 » » runtime·printf("gc%d(%d): %D+%D+%D ms %D -> %D MB %D -> %D (%D-%D) objects %D pointer lookups (%D size, %D addr) %D handoff\n", | 1018 » » runtime·printf("gc%d(%d): %D+%D+%D ms %D -> %D MB %D -> %D (%D-%D) objects, %D(%D) steals, %D(%D) handoffs, %D/%D/%D yields\n", |
1019 mstats.numgc, work.nproc, (t1-t0)/1000000, (t2-t1)/1000000, (t3-t2)/1000000, | 1019 mstats.numgc, work.nproc, (t1-t0)/1000000, (t2-t1)/1000000, (t3-t2)/1000000, |
1020 heap0>>20, heap1>>20, obj0, obj1, | 1020 heap0>>20, heap1>>20, obj0, obj1, |
1021 mstats.nmalloc, mstats.nfree, | 1021 mstats.nmalloc, mstats.nfree, |
1022 » » » stats.nlookup, stats.nsizelookup, stats.naddrlookup, stats.nhandoff); | 1022 » » » stats.nsteal, stats.nstealcnt, |
1023 » } | 1023 » » » stats.nhandoff, stats.nhandoffcnt, |
1024 | 1024 » » » stats.nprocyield, stats.nosyield, stats.nsleep); |
1025 » runtime·semrelease(&gcsema); | 1025 » } |
1026 | 1026 »······· |
1027 » // If we could have used another helper proc, start one now, | 1027 » runtime·MProf_GC(); |
1028 » // in the hope that it will be available next time. | 1028 » runtime·semrelease(&runtime·worldsema); |
1029 » // It would have been even better to start it before the collection, | 1029 » runtime·starttheworld(); |
1030 » // but doing so requires allocating memory, so it's tricky to | 1030 |
1031 » // coordinate. This lazy approach works out in practice: | 1031 » // give the queued finalizers, if any, a chance to run |
1032 » // we don't mind if the first couple gc rounds don't have quite | 1032 » if(finq != nil) |
1033 » // the maximum number of procs. | |
1034 » runtime·starttheworld(extra); | |
1035 | |
1036 » // give the queued finalizers, if any, a chance to run»· | |
1037 » if(finq != nil)» | |
1038 runtime·gosched(); | 1033 runtime·gosched(); |
1039 | 1034 |
1040 if(gctrace > 1 && !force) | 1035 if(gctrace > 1 && !force) |
1041 runtime·gc(1); | 1036 runtime·gc(1); |
1042 } | 1037 } |
1043 | 1038 |
1044 void | 1039 void |
1045 runtime·UpdateMemStats(void) | 1040 runtime·ReadMemStats(MStats *stats) |
1046 { | 1041 { |
1047 » // Have to acquire gcsema to stop the world, | 1042 » // Have to acquire worldsema to stop the world, |
1048 // because stoptheworld can only be used by | 1043 // because stoptheworld can only be used by |
1049 // one goroutine at a time, and there might be | 1044 // one goroutine at a time, and there might be |
1050 // a pending garbage collection already calling it. | 1045 // a pending garbage collection already calling it. |
1051 » runtime·semacquire(&gcsema); | 1046 » runtime·semacquire(&runtime·worldsema); |
1052 m->gcing = 1; | 1047 m->gcing = 1; |
1053 runtime·stoptheworld(); | 1048 runtime·stoptheworld(); |
1054 cachestats(0); | 1049 cachestats(0); |
| 1050 *stats = mstats; |
1055 m->gcing = 0; | 1051 m->gcing = 0; |
1056 » runtime·semrelease(&gcsema); | 1052 » runtime·semrelease(&runtime·worldsema); |
1057 » runtime·starttheworld(false); | 1053 » runtime·starttheworld(); |
1058 } | 1054 } |
1059 | 1055 |
1060 static void | 1056 static void |
1061 runfinq(void) | 1057 runfinq(void) |
1062 { | 1058 { |
1063 Finalizer *f; | 1059 Finalizer *f; |
1064 FinBlock *fb, *next; | 1060 FinBlock *fb, *next; |
1065 byte *frame; | 1061 byte *frame; |
1066 uint32 framesz, framecap, i; | 1062 uint32 framesz, framecap, i; |
1067 | 1063 |
(...skipping 234 matching lines...) |
1302 uintptr n; | 1298 uintptr n; |
1303 | 1299 |
1304 n = (h->arena_used - h->arena_start) / wordsPerBitmapWord; | 1300 n = (h->arena_used - h->arena_start) / wordsPerBitmapWord; |
1305 n = (n+bitmapChunk-1) & ~(bitmapChunk-1); | 1301 n = (n+bitmapChunk-1) & ~(bitmapChunk-1); |
1306 if(h->bitmap_mapped >= n) | 1302 if(h->bitmap_mapped >= n) |
1307 return; | 1303 return; |
1308 | 1304 |
1309 runtime·SysMap(h->arena_start - n, n - h->bitmap_mapped); | 1305 runtime·SysMap(h->arena_start - n, n - h->bitmap_mapped); |
1310 h->bitmap_mapped = n; | 1306 h->bitmap_mapped = n; |
1311 } | 1307 } |
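The bitmap growth above rounds the needed size up to a whole bitmapChunk with the usual power-of-two mask idiom, (n+bitmapChunk-1) & ~(bitmapChunk-1), and then maps only the not-yet-mapped tail. A small self-contained sketch of just that round-up; the chunk size used here is an assumption for illustration, not necessarily the runtime's bitmapChunk.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Round n up to a multiple of chunk; chunk must be a power of two. */
static uintptr_t
roundup(uintptr_t n, uintptr_t chunk)
{
	assert(chunk != 0 && (chunk & (chunk-1)) == 0);
	return (n + chunk - 1) & ~(chunk - 1);
}

int
main(void)
{
	uintptr_t chunk = 8192;	/* assumed value, for illustration only */

	printf("%llu\n", (unsigned long long)roundup(1, chunk));	/* 8192 */
	printf("%llu\n", (unsigned long long)roundup(8192, chunk));	/* 8192 */
	printf("%llu\n", (unsigned long long)roundup(8193, chunk));	/* 16384 */
	return 0;
}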
| 1308 |
| 1309 #ifdef _64BIT |
 | 1310 // Amd64 uses 48-bit virtual addresses; the 47th bit is used as the kernel/user flag. |
 | 1311 // So we use the 17 most significant bits of a pointer as an ABA counter. |
| 1312 # define PTR_BITS 47 |
| 1313 #else |
| 1314 # define PTR_BITS 32 |
| 1315 #endif |
| 1316 #define PTR_MASK ((1ull<<PTR_BITS)-1) |
| 1317 |
| 1318 static void |
| 1319 lifopush(uint64 *a, Workbuf *b) |
| 1320 { |
| 1321 uint64 old, new; |
| 1322 |
| 1323 if((uint64)b != ((uint64)b&PTR_MASK)) { |
| 1324 runtime·printf("p=%p\n", b); |
| 1325 runtime·throw("lifopush: invalid pointer"); |
| 1326 } |
| 1327 |
| 1328 if(work.nproc == 1) { |
| 1329 b->next = (Workbuf*)(*a&PTR_MASK); |
| 1330 *a = (uint64)b; |
| 1331 return; |
| 1332 } |
| 1333 |
| 1334 b->pushcnt++; |
| 1335 new = (uint64)b|(((uint64)b->pushcnt)<<PTR_BITS); |
| 1336 old = runtime·atomicload64(a); |
| 1337 for(;;) { |
| 1338 b->next = (Workbuf*)(old&PTR_MASK); |
| 1339 if(runtime·cas64(a, &old, new)) |
| 1340 break; |
| 1341 } |
| 1342 } |
| 1343 |
| 1344 static Workbuf* |
| 1345 lifopop(uint64 *a) |
| 1346 { |
| 1347 Workbuf *b, *b2; |
| 1348 uint64 old, new; |
| 1349 |
| 1350 if(work.nproc == 1) { |
| 1351 b = (Workbuf*)(*a&PTR_MASK); |
| 1352 if(b == nil) |
| 1353 return nil; |
| 1354 *a = (uint64)b->next; |
| 1355 return b; |
| 1356 } |
| 1357 |
| 1358 old = runtime·atomicload64(a); |
| 1359 for(;;) { |
| 1360 if(old == 0) |
| 1361 return nil; |
| 1362 b = (Workbuf*)(old&PTR_MASK); |
| 1363 b2 = runtime·atomicloadp(&b->next); |
| 1364 new = 0; |
| 1365 if(b2 != nil) |
| 1366 new = (uint64)b2|(((uint64)b2->pushcnt)<<PTR_BITS); |
| 1367 if(runtime·cas64(a, &old, new)) |
| 1368 return b; |
| 1369 } |
| 1370 } |
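lifopush and lifopop above dodge the ABA problem by packing a per-buffer push counter into the 17 bits above the 47-bit pointer and compare-and-swapping the whole 64-bit head. The sketch below reproduces just that tagging scheme in portable C11 under the same user-pointers-fit-in-47-bits assumption; the Node type and <stdatomic.h> calls are stand-ins for the runtime's Workbuf and runtime·cas64/atomicload64, not the runtime's own code.

#include <stdint.h>
#include <stdatomic.h>
#include <stdio.h>

#define PTR_BITS 47
#define PTR_MASK ((1ULL << PTR_BITS) - 1)

typedef struct Node Node;
struct Node {
	Node     *next;
	uint64_t pushcnt;	/* bumped on every push; the 17 spare bits act as an ABA tag */
	int      val;
};

static _Atomic uint64_t head;	/* tagged pointer: counter<<PTR_BITS | pointer */

static void
push(Node *n)
{
	uint64_t old, new;

	n->pushcnt++;
	new = (uint64_t)(uintptr_t)n | (n->pushcnt << PTR_BITS);
	old = atomic_load(&head);
	for(;;) {
		n->next = (Node*)(uintptr_t)(old & PTR_MASK);
		if(atomic_compare_exchange_weak(&head, &old, new))
			break;	/* on failure, old is reloaded and we retry */
	}
}

static Node*
pop(void)
{
	uint64_t old, new;
	Node *n, *next;

	old = atomic_load(&head);
	for(;;) {
		n = (Node*)(uintptr_t)(old & PTR_MASK);
		if(n == NULL)
			return NULL;
		next = n->next;	/* the runtime reads this with an atomic load */
		new = 0;
		if(next != NULL)
			new = (uint64_t)(uintptr_t)next | (next->pushcnt << PTR_BITS);
		if(atomic_compare_exchange_weak(&head, &old, new))
			return n;
	}
}

int
main(void)
{
	Node a = {0}, b = {0};
	Node *x, *y;

	a.val = 1;
	b.val = 2;
	push(&a);
	push(&b);
	x = pop();
	y = pop();
	printf("%d %d\n", x->val, y->val);	/* 2 1: LIFO order */
	return 0;
}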
| 1371 |
| 1372 void |
| 1373 runtime·CTestLockFreeStack(bool isShort) |
| 1374 { |
| 1375 uint64 stack; |
| 1376 Workbuf *b; |
| 1377 |
| 1378 USED(isShort); |
| 1379 |
| 1380 for(work.nproc=1; work.nproc<=2; work.nproc++) { |
| 1381 // check the stack is initially empty |
| 1382 stack = 0; |
| 1383 if(lifopop(&stack) != nil) |
| 1384 runtime·panicstring("stack is not empty"); |
| 1385 |
| 1386 // push one element |
| 1387 b = (Workbuf*)runtime·mallocgc(sizeof(Workbuf), FlagNoGC, 0, 0); |
| 1388 b->nobj = 42; |
| 1389 lifopush(&stack, b); |
| 1390 |
| 1391 // push another |
| 1392 b = (Workbuf*)runtime·mallocgc(sizeof(Workbuf), FlagNoGC, 0, 0); |
| 1393 b->nobj = 43; |
| 1394 lifopush(&stack, b); |
| 1395 |
| 1396 // pop one element |
| 1397 b = lifopop(&stack); |
| 1398 if(b == nil) |
| 1399 runtime·panicstring("stack is empty"); |
| 1400 if(b->nobj != 43) |
| 1401 runtime·panicstring("no lifo"); |
| 1402 runtime·free(b); |
| 1403 |
| 1404 // pop another |
| 1405 b = lifopop(&stack); |
| 1406 if(b == nil) |
| 1407 runtime·panicstring("stack is empty"); |
| 1408 if(b->nobj != 42) |
| 1409 runtime·panicstring("no lifo"); |
| 1410 runtime·free(b); |
| 1411 |
| 1412 // check the stack is empty again |
| 1413 if(stack != 0) |
| 1414 runtime·panicstring("stack is not empty"); |
| 1415 if(lifopop(&stack) != nil) |
| 1416 runtime·panicstring("stack is not empty"); |
| 1417 } |
| 1418 } |
| 1419 |
| 1420 typedef struct StackTestCtx StackTestCtx; |
| 1421 struct StackTestCtx |
| 1422 { |
| 1423 uint32 niter; |
| 1424 uint32 waitsema; |
| 1425 uint64 stack[2]; |
| 1426 }; |
| 1427 |
| 1428 static void |
| 1429 stackTestProc(StackTestCtx *ctx) |
| 1430 { |
| 1431 int32 i, n; |
| 1432 Workbuf *b; |
| 1433 |
| 1434 for(i=0; i<ctx->niter; i++) { |
| 1435 n = runtime·fastrand1()%2; |
| 1436 b = lifopop(&ctx->stack[n]); |
| 1437 if(b==nil) |
| 1438 continue; |
| 1439 n = runtime·fastrand1()%2; |
| 1440 lifopush(&ctx->stack[n], b); |
| 1441 } |
| 1442 runtime·semrelease(&ctx->waitsema); |
| 1443 } |
| 1444 |
| 1445 void |
| 1446 runtime·CTestLockFreeStackStress(bool isShort) |
| 1447 { |
| 1448 StackTestCtx ctx, *arg; |
| 1449 Workbuf *b; |
| 1450 int32 i, sum, sum2, cnt, procs; |
| 1451 const int32 N = 100; |
| 1452 const int32 P = 8; |
| 1453 const int32 G = 16; |
| 1454 |
| 1455 procs = runtime·gomaxprocsfunc(P); |
| 1456 work.nproc = P; |
| 1457 ctx.niter = isShort ? 10000 : 100000; |
| 1458 ctx.waitsema = 0; |
| 1459 ctx.stack[0] = 0; |
| 1460 ctx.stack[1] = 0; |
| 1461 arg = &ctx; |
| 1462 sum = 0; |
| 1463 for(i=0; i<N; i++) { |
| 1464 b = (Workbuf*)runtime·mallocgc(sizeof(Workbuf), FlagNoGC, 0, 0); |
| 1465 b->nobj = i; |
| 1466 sum += i; |
| 1467 lifopush(&ctx.stack[i%2], b); |
| 1468 } |
| 1469 m->locks++; // disable gc during the mallocs (gc resets work.nproc) |
| 1470 for(i=0; i<G; i++) |
 | 1471 runtime·newproc1((byte*)stackTestProc, (byte*)&arg, sizeof(arg), 0, runtime·CTestLockFreeStackStress); |
| 1472 m->locks--; |
| 1473 for(i=0; i<G; i++) |
| 1474 runtime·semacquire(&ctx.waitsema); |
| 1475 sum2 = 0; |
| 1476 cnt = 0; |
| 1477 for(i=0; i<2; i++) { |
| 1478 while((b=lifopop(&ctx.stack[i])) != nil) { |
| 1479 cnt++; |
| 1480 sum2 += b->nobj; |
| 1481 b->next = (Workbuf*)123; |
| 1482 runtime·free(b); |
| 1483 } |
| 1484 } |
| 1485 if(cnt != N || sum2 != sum) |
| 1486 runtime·panicstring("stack corruption"); |
| 1487 runtime·gomaxprocsfunc(procs); |
| 1488 } |
| 1489 |
| 1490 static void |
 | 1491 parforsetup(Parfor *desc, void (*body)(Parfor*, uint32), uint32 nthr, uint32 n, void *ctx, bool wait) |
| 1492 { |
| 1493 uint32 i, d, begin, end; |
| 1494 |
| 1495 if(nthr == 0 || nthr > MaxGcproc || body == nil) { |
| 1496 runtime·printf("nthr=%d count=%d body=%p\n", |
| 1497 nthr, n, body); |
| 1498 runtime·throw("parallel for: invalid args"); |
| 1499 } |
| 1500 |
| 1501 desc->body = body; |
| 1502 desc->done = 0; |
| 1503 desc->nthr = nthr; |
| 1504 desc->thrseq = 0; |
| 1505 desc->cnt = n; |
| 1506 desc->ctx = ctx; |
| 1507 desc->wait = wait; |
| 1508 for(i=0; i<nthr; i++) { |
| 1509 begin = n / nthr * i; |
| 1510 end = n / nthr * (i+1); |
| 1511 d = n % nthr; |
| 1512 if(i < d) { |
| 1513 begin += i; |
| 1514 end += (i+1); |
| 1515 } else { |
| 1516 begin += d; |
| 1517 end += d; |
| 1518 } |
| 1519 desc->thr[i].pos = (uint64)begin | (((uint64)end)<<32); |
| 1520 } |
| 1521 } |
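parforsetup above splits the n iterations into nthr contiguous half-open ranges whose sizes differ by at most one: every thread gets n/nthr iterations and the first n%nthr threads take one extra. For example, n=10 and nthr=4 gives [0,3), [3,6), [6,8), [8,10). A minimal sketch of the same arithmetic; the sample n and nthr in main are illustrative only.

#include <stdint.h>
#include <stdio.h>

/* Print the [begin,end) range handed to each of nthr workers,
   mirroring the arithmetic in parforsetup. */
static void
split(uint32_t n, uint32_t nthr)
{
	uint32_t i, d, begin, end;

	d = n % nthr;
	for(i=0; i<nthr; i++) {
		begin = n / nthr * i;
		end = n / nthr * (i+1);
		if(i < d) {	/* the first n%nthr workers take one extra iteration */
			begin += i;
			end += i+1;
		} else {
			begin += d;
			end += d;
		}
		printf("worker %u: [%u, %u)\n", i, begin, end);
	}
}

int
main(void)
{
	split(10, 4);	/* illustrative: prints [0,3) [3,6) [6,8) [8,10) */
	return 0;
}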
| 1522 |
| 1523 static void |
| 1524 parfor(Parfor *desc) |
| 1525 { |
| 1526 uint32 tid, begin, end, begin2, try, victim, i; |
| 1527 uint64 *mypos, pos; |
| 1528 bool idle; |
| 1529 |
| 1530 // obtain 0-based thread index |
| 1531 tid = runtime·xadd(&desc->thrseq, 1) - 1; |
| 1532 if(tid >= desc->nthr) { |
| 1533 runtime·throw("parallel for: invalid tid"); |
| 1534 } |
| 1535 |
| 1536 // if single-threaded, just execute the for serially |
| 1537 if(desc->nthr==1) { |
| 1538 for(i=0; i<desc->cnt; i++) |
| 1539 desc->body(desc, i); |
| 1540 return; |
| 1541 } |
| 1542 |
| 1543 mypos = &desc->thr[tid].pos; |
| 1544 for(;;) { |
| 1545 for(;;) { |
 | 1546 // while we have work, bump the low index and execute the iteration |
| 1547 pos = runtime·xadd64(mypos, 1); |
| 1548 begin = (uint32)pos-1; |
| 1549 end = (uint32)(pos>>32); |
| 1550 if(begin>=end) |
| 1551 break; |
| 1552 desc->body(desc, begin); |
| 1553 } |
| 1554 |
| 1555 // out of work, need to steal something |
| 1556 idle = false; |
| 1557 for(try=0;; try++) { |
| 1558 // if we don't see any work for long enough, |
| 1559 // increment the done counter... |
| 1560 if(try > desc->nthr*4 && !idle) { |
| 1561 idle = true; |
| 1562 runtime·xadd(&desc->done, 1); |
| 1563 } |
| 1564 // ...if all threads have incremented the counter, |
| 1565 // we are done |
| 1566 if(desc->done + !idle == desc->nthr) { |
| 1567 if(!idle) |
| 1568 runtime·xadd(&desc->done, 1); |
| 1569 return; |
| 1570 } |
| 1571 // choose a random victim for stealing |
| 1572 victim = runtime·fastrand1() % (desc->nthr-1); |
| 1573 if(victim >= tid) |
| 1574 victim++; |
| 1575 pos = runtime·atomicload64(&desc->thr[victim].pos); |
| 1576 for(;;) { |
| 1577 // see if it has any work |
| 1578 begin = (uint32)pos; |
| 1579 end = (uint32)(pos>>32); |
| 1580 if(begin >= end-1) { |
| 1581 begin = end = 0; |
| 1582 break; |
| 1583 } |
| 1584 if(idle) { |
| 1585 runtime·xadd(&desc->done, -1); |
| 1586 idle = false; |
| 1587 } |
| 1588 begin2 = begin + (end-begin)/2; |
 | 1589 if(runtime·cas64(&desc->thr[victim].pos, &pos, (uint64)begin | (((uint64)begin2)<<32))) { |
| 1590 begin = begin2; |
| 1591 break; |
| 1592 } |
| 1593 } |
| 1594 if(begin < end) { |
| 1595 // has successfully stolen some work |
| 1596 if(idle) |
 | 1597 runtime·throw("parfor: should not be idle"); |
 | 1598 runtime·atomicstore64(mypos, (uint64)begin | ((uint64)end)<<32); |
| 1599 m->gcstats.nsteal += 1; |
| 1600 m->gcstats.nstealcnt += end-begin; |
| 1601 break; |
| 1602 } |
| 1603 // backoff |
| 1604 if(try < desc->nthr) { |
| 1605 } else if (try < desc->nthr*4) { |
| 1606 m->gcstats.nprocyield++; |
| 1607 runtime·procyield(20); |
 | 1608 // if the user asked not to wait for other workers, exit now |
 | 1609 // (assume that most of the work is already done) |
| 1610 } else if (!desc->wait) { |
| 1611 if(!idle) |
| 1612 runtime·xadd(&desc->done, 1); |
| 1613 return; |
| 1614 } else if (try < desc->nthr*6) { |
| 1615 m->gcstats.nosyield++; |
| 1616 runtime·osyield(); |
| 1617 } else { |
| 1618 m->gcstats.nsleep++; |
| 1619 runtime·usleep(1); |
| 1620 } |
| 1621 } |
| 1622 } |
| 1623 } |
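In parfor above, each worker's remaining range lives in a single 64-bit position word: begin in the low 32 bits, end in the high 32 bits. The owner claims the next iteration with one xadd64 on that word, and a thief uses cas64 to leave the victim the lower half of what remains and take the upper half itself. Below is a sequential sketch of just the encoding and the steal split, with illustrative values and no atomics; it is not the runtime's synchronized code path.

#include <stdint.h>
#include <stdio.h>

/* Pack a half-open range [begin,end) into one 64-bit position word,
   as parfor stores it in desc->thr[i].pos. */
static uint64_t
pack(uint32_t begin, uint32_t end)
{
	return (uint64_t)begin | ((uint64_t)end<<32);
}

int
main(void)
{
	uint64_t pos;
	uint32_t begin, end, begin2;

	pos = pack(0, 8);	/* this worker owns iterations 0..7 (illustrative) */

	/* Owner claims the next iteration; parfor does this with runtime·xadd64(mypos, 1). */
	pos += 1;
	begin = (uint32_t)pos - 1;	/* the iteration just claimed */
	end = (uint32_t)(pos>>32);
	printf("claimed %u, %u left\n", begin, end - (uint32_t)pos);

	/* A thief takes the upper half of what is left; parfor does this with runtime·cas64
	   on the victim's word, leaving the victim [begin, begin2) and taking [begin2, end). */
	begin = (uint32_t)pos;
	begin2 = begin + (end-begin)/2;
	printf("victim keeps [%u,%u), thief takes [%u,%u)\n", begin, begin2, begin2, end);
	return 0;
}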
| 1624 |
| 1625 static void |
| 1626 testParforBody(Parfor *desc, uint32 i) |
| 1627 { |
| 1628 uint64 *data; |
| 1629 |
| 1630 data = (uint64*)desc->ctx; |
| 1631 data[i] *= data[i]; |
| 1632 } |
| 1633 |
| 1634 static void |
| 1635 testParforProc(Parfor *desc) |
| 1636 { |
| 1637 parfor(desc); |
| 1638 } |
| 1639 |
| 1640 // Simple serial sanity test for parallelfor. |
| 1641 void |
| 1642 runtime·CTestParfor(bool isShort) |
| 1643 { |
| 1644 Parfor desc; |
| 1645 uint64 *data; |
| 1646 uint32 i, N; |
| 1647 |
| 1648 USED(isShort); |
| 1649 |
| 1650 N = 1000; |
| 1651 data = (uint64*)runtime·mal(N*sizeof(uint64)); |
| 1652 for(i=0; i<N; i++) |
| 1653 data[i] = i; |
| 1654 parforsetup(&desc, testParforBody, 1, N, data, true); |
| 1655 parfor(&desc); |
| 1656 for(i=0; i<N; i++) { |
| 1657 if(data[i] != i*i) |
| 1658 runtime·panicstring("incorrect result"); |
| 1659 } |
| 1660 runtime·free(data); |
| 1661 } |
| 1662 |
| 1663 // Test that iterations are properly distributed. |
| 1664 void |
| 1665 runtime·CTestParforSetup(bool isShort) |
| 1666 { |
| 1667 Parfor desc; |
| 1668 uint32 n, t, i, begin, end, size, end0, size0, sum; |
| 1669 |
| 1670 USED(isShort); |
| 1671 |
| 1672 for(n=0; n<100; n++) { |
| 1673 for(t=1; t<=MaxGcproc; t++) { |
| 1674 parforsetup(&desc, testParforBody, t, n, 0, true); |
| 1675 sum = 0; |
| 1676 size0 = 0; |
| 1677 end0 = 0; |
| 1678 for(i=0; i<t; i++) { |
| 1679 begin = (uint32)desc.thr[i].pos; |
| 1680 end = (uint32)(desc.thr[i].pos>>32); |
| 1681 size = end - begin; |
| 1682 sum += size; |
| 1683 if(i==0) { |
| 1684 size0 = size; |
| 1685 if(begin != 0) |
 | 1686 runtime·panicstring("incorrect begin"); |
| 1687 } else { |
| 1688 if(size != size0 && size != size0-1) |
 | 1689 runtime·panicstring("incorrect size"); |
| 1690 if(begin != end0) |
 | 1691 runtime·panicstring("incorrect begin/end"); |
| 1692 } |
| 1693 end0 = end; |
| 1694 } |
| 1695 if(sum != n) |
| 1696 runtime·panicstring("incorrect sum"); |
| 1697 } |
| 1698 } |
| 1699 } |
| 1700 |
| 1701 // Test that nonblocking parallelfor does not block. |
| 1702 void |
| 1703 runtime·CTestParforNonblock(bool isShort) |
| 1704 { |
| 1705 Parfor desc; |
| 1706 uint64 *data; |
| 1707 uint32 i, N; |
| 1708 |
| 1709 USED(isShort); |
| 1710 |
| 1711 N = 1000; |
| 1712 data = (uint64*)runtime·mal(N*sizeof(uint64)); |
| 1713 for(i=0; i<N; i++) |
| 1714 data[i] = i; |
| 1715 parforsetup(&desc, testParforBody, MaxGcproc, N, data, false); |
| 1716 for(i=0; i<MaxGcproc; i++) |
| 1717 parfor(&desc); |
| 1718 for(i=0; i<N; i++) { |
| 1719 if(data[i] != i*i) |
| 1720 runtime·panicstring("incorrect result"); |
| 1721 } |
| 1722 runtime·free(data); |
| 1723 } |
| 1724 |
 | 1725 // Test parallelfor with multiple workers running in parallel. |
| 1726 void |
| 1727 runtime·CTestParforParallel(bool isShort) |
| 1728 { |
| 1729 Parfor desc, *arg; |
| 1730 uint64 *data; |
| 1731 uint32 i, N; |
| 1732 int32 procs; |
| 1733 |
| 1734 procs = runtime·gomaxprocsfunc(MaxGcproc+1); |
| 1735 N = 10000; |
| 1736 if(isShort) |
| 1737 N /= 10; |
| 1738 data = (uint64*)runtime·mal(N*sizeof(uint64)); |
| 1739 for(i=0; i<N; i++) |
| 1740 data[i] = i; |
| 1741 parforsetup(&desc, testParforBody, MaxGcproc, N, data, true); |
| 1742 arg = &desc; |
| 1743 m->locks++; // disable gc during the mallocs in newproc |
| 1744 for(i=1; i<MaxGcproc; i++) |
 | 1745 runtime·newproc1((byte*)testParforProc, (byte*)&arg, sizeof(arg), 0, runtime·CTestParforParallel); |
| 1746 m->locks--; |
| 1747 parfor(&desc); |
| 1748 for(i=0; i<N; i++) { |
| 1749 if(data[i] != i*i) |
| 1750 runtime·panicstring("incorrect result"); |
| 1751 } |
| 1752 runtime·free(data); |
| 1753 runtime·gomaxprocsfunc(procs); |
| 1754 } |