Rietveld Code Review Tool

Delta Between Two Patch Sets: src/pkg/runtime/mgc0.c

Issue 5279048: code review 5279048: runtime: faster and more scalable GC (Closed)
Left Patch Set: diff -r 9e73a59f386c https://go.googlecode.com/hg/ Created 13 years, 5 months ago
Right Patch Set: diff -r f44057cc01b2 https://go.googlecode.com/hg/ Created 12 years, 11 months ago
LEFT | RIGHT
1 // Copyright 2009 The Go Authors. All rights reserved. 1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style 2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file. 3 // license that can be found in the LICENSE file.
4 4
5 // Garbage collector. 5 // Garbage collector.
6 6
7 #include "runtime.h" 7 #include "runtime.h"
8 #include "arch.h" 8 #include "arch_GOARCH.h"
9 #include "malloc.h" 9 #include "malloc.h"
10 #include "stack.h" 10 #include "stack.h"
11 11
12 enum { 12 enum {
13 Debug = 0, 13 Debug = 0,
14 PtrSize = sizeof(void*), 14 PtrSize = sizeof(void*),
15 DebugMark = 0, // run second pass to check mark 15 DebugMark = 0, // run second pass to check mark
16 DataBlock = 8*1024,
16 17
17 // Four bits per word (see #defines below). 18 // Four bits per word (see #defines below).
18 wordsPerBitmapWord = sizeof(void*)*8/4, 19 wordsPerBitmapWord = sizeof(void*)*8/4,
19 bitShift = sizeof(void*)*8/4, 20 bitShift = sizeof(void*)*8/4,
20 }; 21 };
21 22
22 // Bits in per-word bitmap. 23 // Bits in per-word bitmap.
23 // #defines because enum might not be able to hold the values. 24 // #defines because enum might not be able to hold the values.
24 // 25 //
25 // Each word in the bitmap describes wordsPerBitmapWord words 26 // Each word in the bitmap describes wordsPerBitmapWord words
(...skipping 19 matching lines...)
45 // /* then test bits & bitAllocated, bits & bitMarked, etc. */ 46 // /* then test bits & bitAllocated, bits & bitMarked, etc. */
46 // 47 //
47 #define bitAllocated ((uintptr)1<<(bitShift*0)) 48 #define bitAllocated ((uintptr)1<<(bitShift*0))
48 #define bitNoPointers ((uintptr)1<<(bitShift*1)) /* when bitAllocated is set */ 49 #define bitNoPointers ((uintptr)1<<(bitShift*1)) /* when bitAllocated is set */
49 #define bitMarked ((uintptr)1<<(bitShift*2)) /* when bitAllocated is set */ 50 #define bitMarked ((uintptr)1<<(bitShift*2)) /* when bitAllocated is set */
50 #define bitSpecial ((uintptr)1<<(bitShift*3)) /* when bitAllocated is set - has finalizer or being profiled */ 51 #define bitSpecial ((uintptr)1<<(bitShift*3)) /* when bitAllocated is set - has finalizer or being profiled */
51 #define bitBlockBoundary ((uintptr)1<<(bitShift*1)) /* when bitAllocated is NOT set */ 52 #define bitBlockBoundary ((uintptr)1<<(bitShift*1)) /* when bitAllocated is NOT set */
52 53
53 #define bitMask (bitBlockBoundary | bitAllocated | bitMarked | bitSpecial) 54 #define bitMask (bitBlockBoundary | bitAllocated | bitMarked | bitSpecial)
54 55
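
The lookup implied by this layout recurs throughout the file (scanblock and sweepspan below reload bits this way). A minimal standalone sketch of the index arithmetic, using standard C types; the helper name bitmap_word is ours, not the runtime's:

	#include <stdint.h>

	enum { wordsPerBitmapWord = sizeof(void*)*8/4 };	/* 4 bits per heap word */

	/* Return the bitmap word holding the bits for obj, and the shift of
	 * those bits within it. The bitmap grows down from arena_start, so the
	 * descriptor for heap word off sits off/wordsPerBitmapWord words below
	 * the arena; the word's flag bits are read by shifting the descriptor
	 * right by off % wordsPerBitmapWord and masking with bitAllocated,
	 * bitMarked, etc. (which are bitShift apart). */
	static uintptr_t*
	bitmap_word(uintptr_t *arena_start, void *obj, uintptr_t *shift)
	{
		uintptr_t off = (uintptr_t*)obj - arena_start;	/* word index within the arena */
		*shift = off % wordsPerBitmapWord;
		return arena_start - off/wordsPerBitmapWord - 1;
	}

	/* Usage, mirroring the pattern in scanblock:
	 *	uintptr_t shift, *bitp = bitmap_word(arena_start, obj, &shift);
	 *	uintptr_t bits = *bitp >> shift;
	 *	// then test bits & bitAllocated, bits & bitMarked, etc.
	 */
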
56 // Holding worldsema grants an M the right to try to stop the world.
57 // The procedure is:
58 //
59 // runtime·semacquire(&runtime·worldsema);
60 // m->gcing = 1;
61 // runtime·stoptheworld();
62 //
63 // ... do stuff ...
64 //
65 // m->gcing = 0;
66 // runtime·semrelease(&runtime·worldsema);
67 // runtime·starttheworld();
68 //
69 uint32 runtime·worldsema = 1;
55 static int32 gctrace; 70 static int32 gctrace;
56 71
57 typedef struct Workbuf Workbuf; 72 typedef struct Workbuf Workbuf;
58 struct Workbuf 73 struct Workbuf
59 { 74 {
60 Workbuf *next; 75 Workbuf *next;
76 uintptr pushcnt;
61 uintptr nobj; 77 uintptr nobj;
62 » byte *obj[512-2]; 78 » byte *obj[512-3];
63 }; 79 };
64 80
65 typedef struct Finalizer Finalizer; 81 typedef struct Finalizer Finalizer;
66 struct Finalizer 82 struct Finalizer
67 { 83 {
68 void (*fn)(void*); 84 void (*fn)(void*);
69 void *arg; 85 void *arg;
70 int32 nret; 86 int32 nret;
71 }; 87 };
72 88
73 typedef struct FinBlock FinBlock; 89 typedef struct FinBlock FinBlock;
74 struct FinBlock 90 struct FinBlock
75 { 91 {
76 FinBlock *alllink; 92 FinBlock *alllink;
77 FinBlock *next; 93 FinBlock *next;
78 int32 cnt; 94 int32 cnt;
79 int32 cap; 95 int32 cap;
80 Finalizer fin[1]; 96 Finalizer fin[1];
81 }; 97 };
82 98
83 extern byte data[]; 99 extern byte data[];
84 extern byte etext[]; 100 extern byte etext[];
85 extern byte end[]; 101 extern byte ebss[];
86 102
87 static G *fing; 103 static G *fing;
88 static FinBlock *finq; // list of finalizers that are to be executed 104 static FinBlock *finq; // list of finalizers that are to be executed
89 static FinBlock *finc; // cache of free blocks 105 static FinBlock *finc; // cache of free blocks
90 static FinBlock *allfin; // list of all blocks 106 static FinBlock *allfin; // list of all blocks
91 static Lock finlock; 107 static Lock finlock;
92 static int32 fingwait; 108 static int32 fingwait;
93 109
94 static void runfinq(void); 110 static void runfinq(void);
95 static Workbuf* getempty(Workbuf*); 111 static Workbuf* getempty(Workbuf*);
96 static Workbuf* getfull(Workbuf*); 112 static Workbuf* getfull(Workbuf*);
97 static void putempty(Workbuf*); 113 static void putempty(Workbuf*);
98 static Workbuf* handoff(Workbuf*); 114 static Workbuf* handoff(Workbuf*);
99 115
100 static struct { 116 // parallel for descriptor
101 » Lock fmu; 117 typedef struct Parfor Parfor;
102 » Workbuf»*full; 118 struct Parfor
103 » Lock emu; 119 {
104 » Workbuf»*empty; 120 » // executed for each element
121 » void (*body)(Parfor*, uint32);
122 » // number of idle threads
123 » uint32 done;
124 » // total number of threads
125 » uint32 nthr;
126 » // thread id sequencer
127 » uint32 thrseq;
128 » // iteration space [0, cnt)
129 » uint32 cnt;
130 » // arbitrary user context
131 » void *ctx;
132 » // if true, wait while all threads finish processing,
133 » // otherwise parfor may return while other threads still working
134 » bool wait;
135 » byte pad[CacheLineSize];
136 » struct
137 » {
138 » » // the thread's iteration space [32lsb, 32msb)
139 » » uint64 pos;
140 » » byte pad[CacheLineSize-sizeof(uint64)];
141 » } thr[MaxGcproc];
142 };
143
144 typedef struct GcRoot GcRoot;
145 struct GcRoot
146 {
147 » byte *p;
148 » uintptr n;
149 };
150
151 static struct
152 {
153 » uint64» full; // Workbuf* + ABA counter in 17msb
154 » uint64» empty; // Workbuf* + ABA counter in 17msb
105 uint32 nproc; 155 uint32 nproc;
156 byte pad0[CacheLineSize];
106 volatile uint32 nwait; 157 volatile uint32 nwait;
107 » volatile uint32»ndone; 158 » byte pad1[CacheLineSize];
108 » volatile uint32 markdone; 159 » volatile uint32 ndone;
160 » volatile uint32 debugmarkdone;
109 Note alldone; 161 Note alldone;
110 » MSpan» *spans; 162 » Parfor» sweepfor;
163 » Parfor» markfor;
111 164
112 Lock; 165 Lock;
113 byte *chunk; 166 byte *chunk;
114 uintptr nchunk; 167 uintptr nchunk;
168
169 GcRoot *roots;
170 uint32 nroot;
171 uint32 rootcap;
115 } work; 172 } work;
173
174 // lock-free stack
175 // used to manage empty and free Workbuf's during mark phase
176 static void lifopush(uint64 *a, Workbuf *b);
177 static Workbuf* lifopop(uint64 *a);
178
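
lifopush and lifopop are defined near the end of this file; to guard against ABA they pack a per-buffer push counter into the 17 most significant bits of the 64-bit list head (PTR_BITS is 47 on amd64). A standalone sketch of the same scheme with C11 atomics standing in for runtime·cas64 and runtime·atomicloadp; the Node type and function names here are illustrative, not the runtime's:

	#include <stddef.h>
	#include <stdint.h>
	#include <stdatomic.h>

	#define PTR_BITS 47			/* user-space pointers fit in 47 bits on amd64 */
	#define PTR_MASK ((1ULL<<PTR_BITS)-1)

	typedef struct Node Node;
	struct Node {
		Node		*next;
		uint64_t	pushcnt;	/* ABA tag; only its low 17 bits survive the shift */
	};

	static void
	push(_Atomic uint64_t *head, Node *n)
	{
		uint64_t old, tagged;

		n->pushcnt++;
		tagged = (uint64_t)(uintptr_t)n | (n->pushcnt<<PTR_BITS);
		old = atomic_load(head);
		do
			n->next = (Node*)(uintptr_t)(old & PTR_MASK);
		while(!atomic_compare_exchange_weak(head, &old, tagged));
	}

	static Node*
	pop(_Atomic uint64_t *head)
	{
		uint64_t old, tagged;
		Node *n;

		old = atomic_load(head);
		for(;;) {
			n = (Node*)(uintptr_t)(old & PTR_MASK);
			if(n == NULL)
				return NULL;
			/* Tag the replacement head with its own push count so that a
			 * concurrent pop-then-push of n makes this CAS fail (the
			 * runtime additionally loads b->next with an atomic load). */
			tagged = 0;
			if(n->next != NULL)
				tagged = (uint64_t)(uintptr_t)n->next | (n->next->pushcnt<<PTR_BITS);
			if(atomic_compare_exchange_weak(head, &old, tagged))
				return n;
		}
	}
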
179 // parallel for over [0, n).
180 // body() is executed for each iteration.
181 // nthr - total number of worker threads.
182 // ctx - arbitrary user context.
183 // if wait=true, threads return from parfor() when all work is done;
184 // otherwise, threads can return while other threads still finishing processing.········
185 static void parforsetup(Parfor *desc, void (*body)(Parfor*, uint32),
186 uint32 nthr, uint32 n, void *ctx, bool wait);
187 static void parfor(Parfor *desc);
116 188
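
Each worker's remaining iterations are kept in one 64-bit word, current index in the low 32 bits and end in the high 32 (see the Parfor struct above and parfor() at the end of this file). A standalone sketch of the claim step, with C11 atomic_fetch_add in place of runtime·xadd64; claim_iteration is our name, not the runtime's:

	#include <stdint.h>
	#include <stdatomic.h>
	#include <stdbool.h>

	/* Claim the next iteration from a packed [index, end) range.
	 * atomic_fetch_add returns the value before the increment, so the
	 * claimed index needs no adjustment (runtime·xadd64 returns the new
	 * value, which is why parfor() subtracts 1). Returns false once the
	 * local range is exhausted; the real parfor() then tries to steal
	 * half of another thread's range with a 64-bit CAS. */
	static bool
	claim_iteration(_Atomic uint64_t *pos, uint32_t *iter)
	{
		uint64_t p = atomic_fetch_add(pos, 1);
		uint32_t begin = (uint32_t)p;		/* index just claimed */
		uint32_t end = (uint32_t)(p>>32);	/* one past the last index */

		if(begin >= end)
			return false;
		*iter = begin;
		return true;
	}
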
117 // scanblock scans a block of n bytes starting at pointer b for references 189 // scanblock scans a block of n bytes starting at pointer b for references
118 // to other objects, scanning any it finds recursively until there are no 190 // to other objects, scanning any it finds recursively until there are no
119 // unscanned objects left. Instead of using an explicit recursion, it keeps 191 // unscanned objects left. Instead of using an explicit recursion, it keeps
120 // a work list in the Workbuf* structures and loops in the main function 192 // a work list in the Workbuf* structures and loops in the main function
121 // body. Keeping an explicit work list is easier on the stack allocator and 193 // body. Keeping an explicit work list is easier on the stack allocator and
122 // more efficient. 194 // more efficient.
123 static void 195 static void
124 scanblock(byte *b, int64 n) 196 scanblock(byte *b, int64 n)
125 { 197 {
(...skipping 14 matching lines...)
140 // Memory arena parameters. 212 // Memory arena parameters.
141 arena_start = runtime·mheap.arena_start; 213 arena_start = runtime·mheap.arena_start;
142 arena_used = runtime·mheap.arena_used; 214 arena_used = runtime·mheap.arena_used;
143 nproc = work.nproc; 215 nproc = work.nproc;
144 216
145 wbuf = nil; // current work buffer 217 wbuf = nil; // current work buffer
146 wp = nil; // storage for next queued pointer (write pointer) 218 wp = nil; // storage for next queued pointer (write pointer)
147 nobj = 0; // number of queued objects 219 nobj = 0; // number of queued objects
148 220
149 // Scanblock helpers pass b==nil. 221 // Scanblock helpers pass b==nil.
150 » // The main proc needs to return to make more 222 » // Procs need to return to make more
151 // calls to scanblock. But if work.nproc==1 then 223 // calls to scanblock. But if work.nproc==1 then
152 // might as well process blocks as soon as we 224 // might as well process blocks as soon as we
153 // have them. 225 // have them.
154 keepworking = b == nil || work.nproc == 1; 226 keepworking = b == nil || work.nproc == 1;
155 227
156 // Align b to a word boundary. 228 // Align b to a word boundary.
157 off = (uintptr)b & (PtrSize-1); 229 off = (uintptr)b & (PtrSize-1);
158 if(off != 0) { 230 if(off != 0) {
159 b += PtrSize - off; 231 b += PtrSize - off;
160 n -= PtrSize - off; 232 n -= PtrSize - off;
(...skipping 37 matching lines...)
198 if(((xbits>>j) & (bitAllocated|bitBlockBoundary)) != 0) { 270 if(((xbits>>j) & (bitAllocated|bitBlockBoundary)) != 0) {
199 obj = (byte*)obj - (shift-j)*PtrSize; 271 obj = (byte*)obj - (shift-j)*PtrSize;
200 shift = j; 272 shift = j;
201 bits = xbits>>shift; 273 bits = xbits>>shift;
202 goto found; 274 goto found;
203 } 275 }
204 } 276 }
205 277
206 // Otherwise consult span table to find beginning. 278 // Otherwise consult span table to find beginning.
207 // (Manually inlined copy of MHeap_LookupMaybe.) 279 // (Manually inlined copy of MHeap_LookupMaybe.)
208 m->gcstats.nlookup++;
209 m->gcstats.naddrlookup++;
210 k = (uintptr)obj>>PageShift; 280 k = (uintptr)obj>>PageShift;
211 x = k; 281 x = k;
212 if(sizeof(void*) == 8) 282 if(sizeof(void*) == 8)
213 x -= (uintptr)arena_start>>PageShift; 283 x -= (uintptr)arena_start>>PageShift;
214 s = runtime·mheap.map[x]; 284 s = runtime·mheap.map[x];
215 if(s == nil || k < s->start || k - s->start >= s->npages || s->state != MSpanInUse) 285 if(s == nil || k < s->start || k - s->start >= s->npages || s->state != MSpanInUse)
216 continue; 286 continue;
217 p = (byte*)((uintptr)s->start<<PageShift); 287 p = (byte*)((uintptr)s->start<<PageShift);
218 if(s->sizeclass == 0) { 288 if(s->sizeclass == 0) {
219 obj = p; 289 obj = p;
220 } else { 290 } else {
221 if((byte*)obj >= (byte*)s->limit) 291 if((byte*)obj >= (byte*)s->limit)
222 continue; 292 continue;
223 size = runtime·class_to_size[s->sizeclass]; 293 size = runtime·class_to_size[s->sizeclass];
224 int32 i = ((byte*)obj - p)/size; 294 int32 i = ((byte*)obj - p)/size;
225 obj = p+i*size; 295 obj = p+i*size;
226 } 296 }
227 297
228 // Now that we know the object header, reload bits. 298 // Now that we know the object header, reload bits.
229 off = (uintptr*)obj - (uintptr*)arena_start; 299 off = (uintptr*)obj - (uintptr*)arena_start;
230 bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1; 300 bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
231 shift = off % wordsPerBitmapWord; 301 shift = off % wordsPerBitmapWord;
232 xbits = *bitp; 302 xbits = *bitp;
233 bits = xbits >> shift; 303 bits = xbits >> shift;
234 304
235 found: 305 found:
306 // If another proc wants a pointer, give it some.
307 if(work.nwait > 0 && nobj > 4 && work.full == 0) {
308 wbuf->nobj = nobj;
309 wbuf = handoff(wbuf);
310 nobj = wbuf->nobj;
311 wp = wbuf->obj + nobj;
312 }
313
236 // Now we have bits, bitp, and shift correct for 314 // Now we have bits, bitp, and shift correct for
237 // obj pointing at the base of the object. 315 // obj pointing at the base of the object.
238 // Only care about allocated and not marked. 316 // Only care about allocated and not marked.
239 if((bits & (bitAllocated|bitMarked)) != bitAllocated) 317 if((bits & (bitAllocated|bitMarked)) != bitAllocated)
240 continue; 318 continue;
241 if(nproc == 1) 319 if(nproc == 1)
242 *bitp |= bitMarked<<shift; 320 *bitp |= bitMarked<<shift;
243 else { 321 else {
244 for(;;) { 322 for(;;) {
245 x = *bitp; 323 x = *bitp;
246 if(x & (bitMarked<<shift)) 324 if(x & (bitMarked<<shift))
247 goto continue_obj; 325 goto continue_obj;
248 if(runtime·casp((void**)bitp, (void*)x, (void*)(x|(bitMarked<<shift)))) 326 if(runtime·casp((void**)bitp, (void*)x, (void*)(x|(bitMarked<<shift))))
249 break; 327 break;
250 } 328 }
251 } 329 }
252 330
253 // If object has no pointers, don't need to scan further. 331 // If object has no pointers, don't need to scan further.
254 if((bits & bitNoPointers) != 0) 332 if((bits & bitNoPointers) != 0)
255 continue; 333 continue;
256 334
257 » » » // If another proc wants a pointer, give it some. 335 » » » PREFETCH(obj);
258 » » » if(nobj > 4 && work.nwait > 0 && work.full == nil) {
259 » » » » wbuf->nobj = nobj;
260 » » » » wbuf = handoff(wbuf);
261 » » » » nobj = wbuf->nobj;
262 » » » » wp = wbuf->obj + nobj;
263 » » » }
264 336
265 // If buffer is full, get a new one. 337 // If buffer is full, get a new one.
266 if(wbuf == nil || nobj >= nelem(wbuf->obj)) { 338 if(wbuf == nil || nobj >= nelem(wbuf->obj)) {
267 if(wbuf != nil) 339 if(wbuf != nil)
268 wbuf->nobj = nobj; 340 wbuf->nobj = nobj;
269 wbuf = getempty(wbuf); 341 wbuf = getempty(wbuf);
270 wp = wbuf->obj; 342 wp = wbuf->obj;
271 nobj = 0; 343 nobj = 0;
272 } 344 }
273 *wp++ = obj; 345 *wp++ = obj;
274 nobj++; 346 nobj++;
275 continue_obj:; 347 continue_obj:;
276 } 348 }
277 349
278 // Done scanning [b, b+n). Prepare for the next iteration of 350 // Done scanning [b, b+n). Prepare for the next iteration of
279 // the loop by setting b and n to the parameters for the next block. 351 // the loop by setting b and n to the parameters for the next block.
280 352
281 // Fetch b from the work buffer. 353 // Fetch b from the work buffer.
282 if(nobj == 0) { 354 if(nobj == 0) {
283 if(!keepworking) { 355 if(!keepworking) {
284 » » » » putempty(wbuf); 356 » » » » if(wbuf != nil)
357 » » » » » putempty(wbuf);
285 return; 358 return;
286 } 359 }
287 // Emptied our buffer: refill. 360 // Emptied our buffer: refill.
288 wbuf = getfull(wbuf); 361 wbuf = getfull(wbuf);
289 if(wbuf == nil) 362 if(wbuf == nil)
290 return; 363 return;
291 nobj = wbuf->nobj; 364 nobj = wbuf->nobj;
292 wp = wbuf->obj + wbuf->nobj; 365 wp = wbuf->obj + wbuf->nobj;
293 } 366 }
294 b = *--wp; 367 b = *--wp;
295 nobj--; 368 nobj--;
296 369
297 » » // Figure out n = size of b. Start by loading bits for b. 370 » » // Ask span about size class.
298 » » off = (uintptr*)b - (uintptr*)arena_start;
299 » » bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
300 » » shift = off % wordsPerBitmapWord;
301 » » xbits = *bitp;
302 » » bits = xbits >> shift;
303
304 » » // Might be small; look for nearby block boundary.
305 » » // A block boundary is marked by either bitBlockBoundary
306 » » // or bitAllocated being set (see notes near their definition).
307 » » enum {
308 » » » boundary = bitBlockBoundary|bitAllocated
309 » » };
310 » » // Look for a block boundary both after and before b
311 » » // in the same bitmap word.
312 » » //
313 » » // A block boundary j words after b is indicated by
314 » » //» bits>>j & boundary
315 » » // assuming shift+j < bitShift. (If shift+j >= bitShift then
316 » » // we'll be bleeding other bit types like bitMarked into our test.)
317 » » // Instead of inserting the conditional shift+j < bitShift into the loop,
318 » » // we can let j range from 1 to bitShift as long as we first
319 » » // apply a mask to keep only the bits corresponding
320 » » // to shift+j < bitShift aka j < bitShift-shift.
321 » » bits &= (boundary<<(bitShift-shift)) - boundary;
322
323 » » // A block boundary j words before b is indicated by
324 » » //» xbits>>(shift-j) & boundary
325 » » // (assuming shift >= j). There is no cleverness here
326 » » // avoid the test, because when j gets too large the shift
327 » » // turns negative, which is undefined in C.
328
329 » » for(j=1; j<bitShift; j++) {
330 » » » if(((bits>>j)&boundary) != 0 || shift>=j && ((xbits>>(shift-j))&boundary) != 0) {
331 » » » » n = j*PtrSize;
332 » » » » goto scan;
333 » » » }
334 » » }
335
336 » » // Fall back to asking span about size class.
337 // (Manually inlined copy of MHeap_Lookup.) 371 // (Manually inlined copy of MHeap_Lookup.)
338 m->gcstats.nlookup++;
339 m->gcstats.nsizelookup++;
340 x = (uintptr)b>>PageShift; 372 x = (uintptr)b>>PageShift;
341 if(sizeof(void*) == 8) 373 if(sizeof(void*) == 8)
342 x -= (uintptr)arena_start>>PageShift; 374 x -= (uintptr)arena_start>>PageShift;
343 s = runtime·mheap.map[x]; 375 s = runtime·mheap.map[x];
344 if(s->sizeclass == 0) 376 if(s->sizeclass == 0)
345 n = s->npages<<PageShift; 377 n = s->npages<<PageShift;
346 else 378 else
347 n = runtime·class_to_size[s->sizeclass]; 379 n = runtime·class_to_size[s->sizeclass];
348 scan:;
349 } 380 }
350 } 381 }
351 382
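
As a toy illustration of the worklist idea described in the comment above scanblock (explicit stack instead of recursion), detached from the bitmap and span machinery; the Node graph here is hypothetical:

	#include <stddef.h>

	typedef struct Node Node;
	struct Node {
		int	marked;
		Node	*edge[2];	/* outgoing pointers, possibly NULL */
	};

	enum { WorkCap = 128 };

	/* Mark everything reachable from root without recursing: unvisited
	 * nodes go onto an explicit work array and are popped back off in the
	 * same loop, the way scanblock queues pointers into Workbufs. */
	static void
	markall(Node *root)
	{
		Node *work[WorkCap];
		int nwork = 0, i;
		Node *p;

		work[nwork++] = root;
		while(nwork > 0) {
			p = work[--nwork];
			if(p == NULL || p->marked)
				continue;
			p->marked = 1;			/* the analogue of setting bitMarked */
			for(i = 0; i < 2; i++)
				if(nwork < WorkCap)	/* scanblock grabs a fresh Workbuf instead of dropping work */
					work[nwork++] = p->edge[i];
		}
	}
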
352 // debug_scanblock is the debug copy of scanblock. 383 // debug_scanblock is the debug copy of scanblock.
353 // it is simpler, slower, single-threaded, recursive, 384 // it is simpler, slower, single-threaded, recursive,
354 // and uses bitSpecial as the mark bit. 385 // and uses bitSpecial as the mark bit.
355 static void 386 static void
356 debug_scanblock(byte *b, int64 n) 387 debug_scanblock(byte *b, int64 n)
357 { 388 {
358 byte *obj, *p; 389 byte *obj, *p;
(...skipping 25 matching lines...)
384 if((byte*)obj < runtime·mheap.arena_start || (byte*)obj >= runtime·mheap.arena_used) 415 if((byte*)obj < runtime·mheap.arena_start || (byte*)obj >= runtime·mheap.arena_used)
385 continue; 416 continue;
386 417
387 // Round down to word boundary. 418 // Round down to word boundary.
388 obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1)); 419 obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1));
389 420
390 // Consult span table to find beginning. 421 // Consult span table to find beginning.
391 s = runtime·MHeap_LookupMaybe(&runtime·mheap, obj); 422 s = runtime·MHeap_LookupMaybe(&runtime·mheap, obj);
392 if(s == nil) 423 if(s == nil)
393 continue; 424 continue;
394
395 425
396 p = (byte*)((uintptr)s->start<<PageShift); 426 p = (byte*)((uintptr)s->start<<PageShift);
397 if(s->sizeclass == 0) { 427 if(s->sizeclass == 0) {
398 obj = p; 428 obj = p;
399 size = (uintptr)s->npages<<PageShift; 429 size = (uintptr)s->npages<<PageShift;
400 } else { 430 } else {
401 if((byte*)obj >= (byte*)s->limit) 431 if((byte*)obj >= (byte*)s->limit)
402 continue; 432 continue;
403 size = runtime·class_to_size[s->sizeclass]; 433 size = runtime·class_to_size[s->sizeclass];
404 int32 i = ((byte*)obj - p)/size; 434 int32 i = ((byte*)obj - p)/size;
(...skipping 17 matching lines...)
422 runtime·printf("found unmarked block %p in %p\n", obj, vp+i); 452 runtime·printf("found unmarked block %p in %p\n", obj, vp+i);
423 453
424 // If object has no pointers, don't need to scan further. 454 // If object has no pointers, don't need to scan further.
425 if((bits & bitNoPointers) != 0) 455 if((bits & bitNoPointers) != 0)
426 continue; 456 continue;
427 457
428 debug_scanblock(obj, size); 458 debug_scanblock(obj, size);
429 } 459 }
430 } 460 }
431 461
462 static void
463 markroot(Parfor *desc, uint32 i)
464 {
465 USED(desc);
466 scanblock(work.roots[i].p, work.roots[i].n);
467 }
468
432 // Get an empty work buffer off the work.empty list, 469 // Get an empty work buffer off the work.empty list,
433 // allocating new buffers as needed. 470 // allocating new buffers as needed.
434 static Workbuf* 471 static Workbuf*
435 getempty(Workbuf *b) 472 getempty(Workbuf *b)
436 { 473 {
437 » if(work.nproc == 1) { 474 » if(b != nil)
438 » » // Put b on full list. 475 » » lifopush(&work.full, b);
439 » » if(b != nil) { 476 » b = lifopop(&work.empty);
440 » » » b->next = work.full; 477 » if(b == nil) {
441 » » » work.full = b; 478 » » // Need to allocate.
442 » » } 479 » » runtime·lock(&work);
443 » » // Grab from empty list if possible. 480 » » if(work.nchunk < sizeof *b) {
444 » » b = work.empty; 481 » » » work.nchunk = 1<<20;
445 » » if(b != nil) { 482 » » » work.chunk = runtime·SysAlloc(work.nchunk);
446 » » » work.empty = b->next; 483 » » }
447 » » » goto haveb; 484 » » b = (Workbuf*)work.chunk;
448 » » } 485 » » work.chunk += sizeof *b;
449 » } else { 486 » » work.nchunk -= sizeof *b;
450 » » // Put b on full list. 487 » » runtime·unlock(&work);
451 » » if(b != nil) { 488 » }
452 » » » runtime·lock(&work.fmu);
453 » » » b->next = work.full;
454 » » » work.full = b;
455 » » » runtime·unlock(&work.fmu);
456 » » }
457 » » // Grab from empty list if possible.
458 » » runtime·lock(&work.emu);
459 » » b = work.empty;
460 » » if(b != nil)
461 » » » work.empty = b->next;
462 » » runtime·unlock(&work.emu);
463 » » if(b != nil)
464 » » » goto haveb;
465 » }
466
467 » // Need to allocate.
468 » runtime·lock(&work);
469 » if(work.nchunk < sizeof *b) {
470 » » work.nchunk = 1<<20;
471 » » work.chunk = runtime·SysAlloc(work.nchunk);
472 » }
473 » b = (Workbuf*)work.chunk;
474 » work.chunk += sizeof *b;
475 » work.nchunk -= sizeof *b;
476 » runtime·unlock(&work);
477
478 haveb:
479 b->nobj = 0; 489 b->nobj = 0;
480 return b; 490 return b;
481 } 491 }
482 492
483 static void 493 static void
484 putempty(Workbuf *b) 494 putempty(Workbuf *b)
485 { 495 {
486 » if(b == nil) 496 » lifopush(&work.empty, b);
487 » » return;
488
489 » if(work.nproc == 1) {
490 » » b->next = work.empty;
491 » » work.empty = b;
492 » » return;
493 » }
494
495 » runtime·lock(&work.emu);
496 » b->next = work.empty;
497 » work.empty = b;
498 » runtime·unlock(&work.emu);
499 } 497 }
500 498
501 // Get a full work buffer off the work.full list, or return nil. 499 // Get a full work buffer off the work.full list, or return nil.
502 static Workbuf* 500 static Workbuf*
503 getfull(Workbuf *b) 501 getfull(Workbuf *b)
504 { 502 {
505 int32 i; 503 int32 i;
506 » Workbuf *b1; 504
507 505 » if(b != nil)
508 » if(work.nproc == 1) { 506 » » lifopush(&work.empty, b);
509 » » // Put b on empty list. 507 » b = lifopop(&work.full);
510 » » if(b != nil) { 508 » if(b != nil || work.nproc == 1)
511 » » » b->next = work.empty;
512 » » » work.empty = b;
513 » » }
514 » » // Grab from full list if possible.
515 » » // Since work.nproc==1, no one else is
516 » » // going to give us work.
517 » » b = work.full;
518 » » if(b != nil)
519 » » » work.full = b->next;
520 return b; 509 return b;
521 }
522
523 putempty(b);
524
525 // Grab buffer from full list if possible.
526 for(;;) {
527 b1 = work.full;
528 if(b1 == nil)
529 break;
530 runtime·lock(&work.fmu);
531 if(work.full != nil) {
532 b1 = work.full;
533 work.full = b1->next;
534 runtime·unlock(&work.fmu);
535 return b1;
536 }
537 runtime·unlock(&work.fmu);
538 }
539 510
540 runtime·xadd(&work.nwait, +1); 511 runtime·xadd(&work.nwait, +1);
541 for(i=0;; i++) { 512 for(i=0;; i++) {
542 » » b1 = work.full; 513 » » if(work.full != 0) {
543 » » if(b1 != nil) { 514 » » » runtime·xadd(&work.nwait, -1);
544 » » » runtime·lock(&work.fmu); 515 » » » b = lifopop(&work.full);
545 » » » if(work.full != nil) { 516 » » » if(b != nil)
546 » » » » runtime·xadd(&work.nwait, -1); 517 » » » » return b;
547 » » » » b1 = work.full; 518 » » » runtime·xadd(&work.nwait, +1);
548 » » » » work.full = b1->next;
549 » » » » runtime·unlock(&work.fmu);
550 » » » » return b1;
551 » » » }
552 » » » runtime·unlock(&work.fmu);
553 » » » continue;
554 } 519 }
555 if(work.nwait == work.nproc) 520 if(work.nwait == work.nproc)
556 return nil; 521 return nil;
557 » » if(i < 10) 522 » » if(i < 10) {
523 » » » m->gcstats.nprocyield++;
558 runtime·procyield(20); 524 runtime·procyield(20);
559 » » else if(i < 20) 525 » » } else if(i < 20) {
526 » » » m->gcstats.nosyield++;
560 runtime·osyield(); 527 runtime·osyield();
561 » » else 528 » » } else {
529 » » » m->gcstats.nsleep++;
562 runtime·usleep(100); 530 runtime·usleep(100);
531 }
563 } 532 }
564 } 533 }
565 534
566 static Workbuf* 535 static Workbuf*
567 handoff(Workbuf *b) 536 handoff(Workbuf *b)
568 { 537 {
569 int32 n; 538 int32 n;
570 Workbuf *b1; 539 Workbuf *b1;
571 540
572 // Make new buffer with half of b's pointers. 541 // Make new buffer with half of b's pointers.
573 b1 = getempty(nil); 542 b1 = getempty(nil);
574 n = b->nobj/2; 543 n = b->nobj/2;
575 b->nobj -= n; 544 b->nobj -= n;
576 b1->nobj = n; 545 b1->nobj = n;
577 runtime·memmove(b1->obj, b->obj+b->nobj, n*sizeof b1->obj[0]); 546 runtime·memmove(b1->obj, b->obj+b->nobj, n*sizeof b1->obj[0]);
578 » m->gcstats.nhandoff += n; 547 » m->gcstats.nhandoff += 1;
548 » m->gcstats.nhandoffcnt += n;
579 549
580 // Put b on full list - let first half of b get stolen. 550 // Put b on full list - let first half of b get stolen.
581 » runtime·lock(&work.fmu); 551 » lifopush(&work.full, b);
582 » b->next = work.full;
583 » work.full = b;
584 » runtime·unlock(&work.fmu);
585
586 return b1; 552 return b1;
587 } 553 }
588 554
589 // Scanstack calls scanblock on each of gp's stack segments.
590 static void 555 static void
591 scanstack(void (*scanblock)(byte*, int64), G *gp) 556 addroot(byte *p, uintptr n)
557 {
558 » uint32 cap;
559 » GcRoot *new;
560
561 » if(work.nroot >= work.rootcap) {
562 » » cap = PageSize/sizeof(GcRoot);
563 » » if(cap < 2*work.rootcap)
564 » » » cap = 2*work.rootcap;
565 » » new = (GcRoot*)runtime·SysAlloc(cap*sizeof(GcRoot));
566 » » if(work.roots != nil) {
567 » » » runtime·memmove(new, work.roots, work.rootcap*sizeof(GcR oot));
568 » » » runtime·SysFree(work.roots, work.rootcap*sizeof(GcRoot)) ;
569 » » }
570 » » work.roots = new;
571 » » work.rootcap = cap;
572 » }
573 » work.roots[work.nroot].p = p;
574 » work.roots[work.nroot].n = n;
575 » work.nroot++;
576 }
577
578 static void
579 addstackroots(G *gp)
592 { 580 {
593 M *mp; 581 M *mp;
594 int32 n; 582 int32 n;
595 Stktop *stk; 583 Stktop *stk;
596 byte *sp, *guard; 584 byte *sp, *guard;
597 585
598 stk = (Stktop*)gp->stackbase; 586 stk = (Stktop*)gp->stackbase;
599 guard = gp->stackguard; 587 guard = gp->stackguard;
600 588
601 if(gp == g) { 589 if(gp == g) {
(...skipping 12 matching lines...)
614 // as schedlock and may have needed to start a new stack segment . 602 // as schedlock and may have needed to start a new stack segment .
615 // Use the stack segment and stack pointer at the time of 603 // Use the stack segment and stack pointer at the time of
616 // the system call instead, since that won't change underfoot. 604 // the system call instead, since that won't change underfoot.
617 if(gp->gcstack != nil) { 605 if(gp->gcstack != nil) {
618 stk = (Stktop*)gp->gcstack; 606 stk = (Stktop*)gp->gcstack;
619 sp = gp->gcsp; 607 sp = gp->gcsp;
620 guard = gp->gcguard; 608 guard = gp->gcguard;
621 } 609 }
622 } 610 }
623 611
624 if(Debug > 1)
625 runtime·printf("scanstack %d %p\n", gp->goid, sp);
626 n = 0; 612 n = 0;
627 while(stk) { 613 while(stk) {
628 if(sp < guard-StackGuard || (byte*)stk < sp) { 614 if(sp < guard-StackGuard || (byte*)stk < sp) {
629 runtime·printf("scanstack inconsistent: g%d#%d sp=%p not in [%p,%p]\n", gp->goid, n, sp, guard-StackGuard, stk); 615 runtime·printf("scanstack inconsistent: g%d#%d sp=%p not in [%p,%p]\n", gp->goid, n, sp, guard-StackGuard, stk);
630 runtime·throw("scanstack"); 616 runtime·throw("scanstack");
631 } 617 }
632 » » scanblock(sp, (byte*)stk - sp); 618 » » addroot(sp, (byte*)stk - sp);
633 sp = stk->gobuf.sp; 619 sp = stk->gobuf.sp;
634 guard = stk->stackguard; 620 guard = stk->stackguard;
635 stk = (Stktop*)stk->stackbase; 621 stk = (Stktop*)stk->stackbase;
636 n++; 622 n++;
637 } 623 }
638 } 624 }
639 625
640 // Markfin calls scanblock on the blocks that have finalizers:
641 // the things pointed at cannot be freed until the finalizers have run.
642 static void 626 static void
643 markfin(void *v) 627 addfinroots(void *v)
644 { 628 {
645 uintptr size; 629 uintptr size;
646 630
647 size = 0; 631 size = 0;
648 if(!runtime·mlookup(v, &v, &size, nil) || !runtime·blockspecial(v)) 632 if(!runtime·mlookup(v, &v, &size, nil) || !runtime·blockspecial(v))
649 runtime·throw("mark - finalizer inconsistency"); 633 runtime·throw("mark - finalizer inconsistency");
650 634
651 // do not mark the finalizer block itself. just mark the things it poin ts at. 635 // do not mark the finalizer block itself. just mark the things it poin ts at.
652 » scanblock(v, size); 636 » addroot(v, size);
653 } 637 }
654 638
655 static void 639 static void
656 debug_markfin(void *v) 640 addroots(void)
657 {
658 » uintptr size;
659
660 » if(!runtime·mlookup(v, &v, &size, nil))
661 » » runtime·throw("debug_mark - finalizer inconsistency");
662 » debug_scanblock(v, size);
663 }
664
665 // Mark
666 static void
667 mark(void (*scan)(byte*, int64))
668 { 641 {
669 G *gp; 642 G *gp;
670 FinBlock *fb; 643 FinBlock *fb;
644 byte *p;
645
646 work.nroot = 0;
671 647
672 // mark data+bss. 648 // mark data+bss.
673 » // skip runtime·mheap itself, which has no interesting pointers 649 » for(p=data; p<ebss; p+=DataBlock)
674 » // and is mostly zeroed and would not otherwise be paged in. 650 » » addroot(p, p+DataBlock<ebss?DataBlock:ebss-p);
675 » scan(data, (byte*)&runtime·mheap - data); 651
676 » scan((byte*)(&runtime·mheap+1), end - (byte*)(&runtime·mheap+1)); 652 » runtime·walkfintab(addfinroots);
677 653
678 » // mark stacks 654 » for(fb=allfin; fb; fb=fb->alllink)
655 » » addroot((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]));
656
679 for(gp=runtime·allg; gp!=nil; gp=gp->alllink) { 657 for(gp=runtime·allg; gp!=nil; gp=gp->alllink) {
680 switch(gp->status){ 658 switch(gp->status){
681 » » default: 659 » » » default:
682 » » » runtime·printf("unexpected G.status %d\n", gp->status); 660 » » » » runtime·printf("unexpected G.status %d\n", gp->status);
683 » » » runtime·throw("mark - bad status"); 661 » » » » runtime·throw("mark - bad status");
684 » » case Gdead: 662 » » » case Gdead:
685 » » » break; 663 » » » » break;
686 » » case Grunning: 664 » » » case Grunning:
687 » » » if(gp != g) 665 » » » » if(gp != g)
688 » » » » runtime·throw("mark - world not stopped"); 666 » » » » » runtime·throw("mark - world not stopped");
689 » » » scanstack(scan, gp); 667 » » » » addstackroots(gp);
690 » » » break; 668 » » » » break;
691 » » case Grunnable: 669 » » » case Grunnable:
692 » » case Gsyscall: 670 » » » case Gsyscall:
693 » » case Gwaiting: 671 » » » case Gwaiting:
694 » » » scanstack(scan, gp); 672 » » » » addstackroots(gp);
695 » » » break; 673 » » » » break;
696 » » } 674 » » }
697 » } 675 » }
698
699 » // mark things pointed at by objects with finalizers
700 » if(scan == debug_scanblock)
701 » » runtime·walkfintab(debug_markfin);
702 » else
703 » » runtime·walkfintab(markfin);
704
705 » for(fb=allfin; fb; fb=fb->alllink)
706 » » scanblock((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]));
707
708 » // in multiproc mode, join in the queued work.
709 » scan(nil, 0);
710 } 676 }
711 677
712 static bool 678 static bool
713 handlespecial(byte *p, uintptr size) 679 handlespecial(byte *p, uintptr size)
714 { 680 {
715 void (*fn)(void*); 681 void (*fn)(void*);
716 int32 nret; 682 int32 nret;
717 FinBlock *block; 683 FinBlock *block;
718 Finalizer *f; 684 Finalizer *f;
719 » 685
720 if(!runtime·getfinalizer(p, true, &fn, &nret)) { 686 if(!runtime·getfinalizer(p, true, &fn, &nret)) {
721 runtime·setblockspecial(p, false); 687 runtime·setblockspecial(p, false);
722 runtime·MProf_Free(p, size); 688 runtime·MProf_Free(p, size);
723 return false; 689 return false;
724 } 690 }
725 691
726 runtime·lock(&finlock); 692 runtime·lock(&finlock);
727 if(finq == nil || finq->cnt == finq->cap) { 693 if(finq == nil || finq->cnt == finq->cap) {
728 if(finc == nil) { 694 if(finc == nil) {
729 finc = runtime·SysAlloc(PageSize); 695 finc = runtime·SysAlloc(PageSize);
730 finc->cap = (PageSize - sizeof(FinBlock)) / sizeof(Finalizer) + 1; 696 finc->cap = (PageSize - sizeof(FinBlock)) / sizeof(Finalizer) + 1;
731 finc->alllink = allfin; 697 finc->alllink = allfin;
732 allfin = finc; 698 allfin = finc;
733 } 699 }
734 block = finc; 700 block = finc;
735 finc = block->next; 701 finc = block->next;
736 block->next = finq; 702 block->next = finq;
737 finq = block; 703 finq = block;
738 } 704 }
739 f = &finq->fin[finq->cnt]; 705 f = &finq->fin[finq->cnt];
740 finq->cnt++; 706 finq->cnt++;
741 f->fn = fn; 707 f->fn = fn;
742 f->nret = nret; 708 f->nret = nret;
743 f->arg = p; 709 f->arg = p;
744 » runtime·unlock(&finlock); 710 » runtime·unlock(&finlock);
745 return true; 711 return true;
746 } 712 }
747 713
748 // Sweep frees or collects finalizers for blocks not marked in the mark phase. 714 // Sweep frees or collects finalizers for blocks not marked in the mark phase.
749 // It clears the mark bits in preparation for the next GC round. 715 // It clears the mark bits in preparation for the next GC round.
750 static void 716 static void
751 sweep(void) 717 sweepspan(Parfor *desc, uint32 spanidx)
752 { 718 {
753 MSpan *s; 719 MSpan *s;
754 int32 cl, n, npages; 720 int32 cl, n, npages;
755 uintptr size; 721 uintptr size;
756 byte *p; 722 byte *p;
757 MCache *c; 723 MCache *c;
758 byte *arena_start; 724 byte *arena_start;
725 MLink *start, *end;
726 int32 nfree;
727
728 USED(desc);
729
730 s = runtime·mheap.allspans[spanidx];
731 // Stamp newly unused spans. The scavenger will use that
732 // info to potentially give back some pages to the OS.
733 if(s->state == MSpanFree && s->unusedsince == 0)
734 s->unusedsince = runtime·nanotime();
735 if(s->state != MSpanInUse)
736 return;
759 737
760 arena_start = runtime·mheap.arena_start; 738 arena_start = runtime·mheap.arena_start;
761 739
762 » for(;;) { 740 » p = (byte*)(s->start << PageShift);
763 » » s = work.spans; 741 » cl = s->sizeclass;
764 » » if(s == nil) 742 » if(cl == 0) {
765 » » » break; 743 » » size = s->npages<<PageShift;
766 » » if(!runtime·casp(&work.spans, s, s->allnext)) 744 » » n = 1;
745 » } else {
746 » » // Chunk full of small blocks.
747 » » size = runtime·class_to_size[cl];
748 » » npages = runtime·class_to_allocnpages[cl];
749 » » n = (npages << PageShift) / size;
750 » }
751 » c = m->mcache;
752 » nfree = 0;
753 » start = end = nil;
754
755 » // Sweep through n objects of given size starting at p.
756 » // This thread owns the span now, so it can manipulate
757 » // the block bitmap without atomic operations.
758 » for(; n > 0; n--, p += size) {
759 » » uintptr off, *bitp, shift, bits;
760
761 » » off = (uintptr*)p - (uintptr*)arena_start;
762 » » bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
763 » » PREFETCH((byte*)bitp - size);
764 » » shift = off % wordsPerBitmapWord;
765 » » bits = *bitp>>shift;
766
767 » » if((bits & bitAllocated) == 0)
767 continue; 768 continue;
768 769
769 » » if(s->state != MSpanInUse) 770 » » if((bits & bitMarked) != 0) {
771 » » » if(DebugMark) {
772 » » » » if(!(bits & bitSpecial))
773 » » » » » runtime·printf("found spurious mark on %p\n", p);
774 » » » » *bitp &= ~(bitSpecial<<shift);
775 » » » }
776 » » » *bitp &= ~(bitMarked<<shift);
770 continue; 777 continue;
771 778 » » }
772 » » p = (byte*)(s->start << PageShift); 779
773 » » cl = s->sizeclass; 780 » » // Special means it has a finalizer or is being profiled.
781 » » // In DebugMark mode, the bit has been coopted so
782 » » // we have to assume all blocks are special.
783 » » if(DebugMark || (bits & bitSpecial) != 0) {
784 » » » if(handlespecial(p, size))
785 » » » » continue;
786 » » }
787
788 » » // Mark freed; restore block boundary bit.
789 » » *bitp = (*bitp & ~(bitMask<<shift)) | (bitBlockBoundary<<shift);
790
774 if(cl == 0) { 791 if(cl == 0) {
775 » » » size = s->npages<<PageShift; 792 » » » // Free large span.
776 » » » n = 1; 793 » » » runtime·unmarkspan(p, 1<<PageShift);
777 » » } else { 794 » » » *(uintptr*)p = 1;» // needs zeroing
778 » » » // Chunk full of small blocks. 795 » » » runtime·MHeap_Free(&runtime·mheap, s, 1);
779 » » » size = runtime·class_to_size[cl];
780 » » » npages = runtime·class_to_allocnpages[cl];
781 » » » n = (npages << PageShift) / size;
782 » » }
783
784 » » // Sweep through n objects of given size starting at p.
785 » » // This thread owns the span now, so it can manipulate
786 » » // the block bitmap without atomic operations.
787 » » for(; n > 0; n--, p += size) {
788 » » » uintptr off, *bitp, shift, bits;
789
790 » » » off = (uintptr*)p - (uintptr*)arena_start;
791 » » » bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
792 » » » shift = off % wordsPerBitmapWord;
793 » » » bits = *bitp>>shift;
794
795 » » » if((bits & bitAllocated) == 0)
796 » » » » continue;
797
798 » » » if((bits & bitMarked) != 0) {
799 » » » » if(DebugMark) {
800 » » » » » if(!(bits & bitSpecial))
801 » » » » » » runtime·printf("found spurious mark on %p\n", p);
802 » » » » » *bitp &= ~(bitSpecial<<shift);
803 » » » » }
804 » » » » *bitp &= ~(bitMarked<<shift);
805 » » » » continue;
806 » » » }
807
808 » » » // Special means it has a finalizer or is being profiled.
809 » » » // In DebugMark mode, the bit has been coopted so
810 » » » // we have to assume all blocks are special.
811 » » » if(DebugMark || (bits & bitSpecial) != 0) {
812 » » » » if(handlespecial(p, size))
813 » » » » » continue;
814 » » » }
815
816 » » » // Mark freed; restore block boundary bit.
817 » » » *bitp = (*bitp & ~(bitMask<<shift)) | (bitBlockBoundary< <shift);
818
819 » » » c = m->mcache;
820 » » » if(s->sizeclass == 0) {
821 » » » » // Free large span.
822 » » » » runtime·unmarkspan(p, 1<<PageShift);
823 » » » » *(uintptr*)p = 1;» // needs zeroing
824 » » » » runtime·MHeap_Free(&runtime·mheap, s, 1);
825 » » » } else {
826 » » » » // Free small object.
827 » » » » if(size > sizeof(uintptr))
828 » » » » » ((uintptr*)p)[1] = 1;» // mark as "needs to be zeroed"
829 » » » » c->local_by_size[s->sizeclass].nfree++;
830 » » » » runtime·MCache_Free(c, p, s->sizeclass, size);
831 » » » }
832 c->local_alloc -= size; 796 c->local_alloc -= size;
833 c->local_nfree++; 797 c->local_nfree++;
834 » » } 798 » » } else {
799 » » » // Free small object.
800 » » » if(nfree) {
801 » » » » PREFETCH(p);
802 » » » » if(size > sizeof(uintptr))
803 » » » » » ((uintptr*)end)[1] = 1; // mark as "needs to be zeroed"
804 » » » » end->next = (MLink*)p;
805 » » » » end = (MLink*)p;
806 » » » } else {
807 » » » » start = (MLink*)p;
808 » » » » end = (MLink*)p;
809 » » » }
810 » » » nfree++;
811 » » }
812 » }
813
814 » if(nfree) {
815 » » if(size > sizeof(uintptr))
816 » » » ((uintptr*)end)[1] = 1; // mark as "needs to be zeroed"
817 » » c->local_by_size[s->sizeclass].nfree += nfree;
818 » » c->local_alloc -= size * nfree;
819 » » c->local_nfree += nfree;
820 » » c->local_cachealloc -= nfree * size;
821 » » c->local_objects -= nfree;
822 » » runtime·MCentral_FreeSpan(&runtime·mheap.central[cl],
823 » » » » » s, nfree, start, end);
835 } 824 }
836 } 825 }
837 826
838 void 827 void
839 runtime·gchelper(void) 828 runtime·gchelper(void)
840 { 829 {
830 // parallel mark for over gc roots
831 parfor(&work.markfor);
832 // help other threads scan secondary blocks
841 scanblock(nil, 0); 833 scanblock(nil, 0);
834
842 if(DebugMark) { 835 if(DebugMark) {
843 » » while(runtime·atomicload(&work.markdone) == 0) 836 » » // wait while the main thread executes mark(debug_scanblock)
837 » » while(runtime·atomicload(&work.debugmarkdone) == 0)
844 runtime·usleep(10); 838 runtime·usleep(10);
845 } 839 }
846 »······· 840
847 » sweep(); 841 » // parallel sweep for over all spans
842 » parfor(&work.sweepfor);
848 843
849 if(runtime·xadd(&work.ndone, +1) == work.nproc-1) 844 if(runtime·xadd(&work.ndone, +1) == work.nproc-1)
850 runtime·notewakeup(&work.alldone); 845 runtime·notewakeup(&work.alldone);
851 } 846 }
852
853 // Semaphore, not Lock, so that the goroutine
854 // reschedules when there is contention rather
855 // than spinning.
856 static uint32 gcsema = 1;
857 847
858 // Initialized from $GOGC. GOGC=off means no gc. 848 // Initialized from $GOGC. GOGC=off means no gc.
859 // 849 //
860 // Next gc is after we've allocated an extra amount of 850 // Next gc is after we've allocated an extra amount of
861 // memory proportional to the amount already in use. 851 // memory proportional to the amount already in use.
862 // If gcpercent=100 and we're using 4M, we'll gc again 852 // If gcpercent=100 and we're using 4M, we'll gc again
863 // when we get to 8M. This keeps the gc cost in linear 853 // when we get to 8M. This keeps the gc cost in linear
864 // proportion to the allocation cost. Adjusting gcpercent 854 // proportion to the allocation cost. Adjusting gcpercent
865 // just changes the linear constant (and also the amount of 855 // just changes the linear constant (and also the amount of
866 // extra memory used). 856 // extra memory used).
867 static int32 gcpercent = -2; 857 static int32 gcpercent = -2;
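
As a quick check of the trigger formula applied after sweeping below (mstats.next_gc = mstats.heap_alloc + mstats.heap_alloc*gcpercent/100), a standalone program; next_gc here is a hypothetical helper, not a runtime function:

	#include <stdio.h>
	#include <stdint.h>

	static uint64_t
	next_gc(uint64_t heap_alloc, int32_t gcpercent)
	{
		return heap_alloc + heap_alloc*gcpercent/100;
	}

	int
	main(void)
	{
		/* GOGC=100 with 4 MB live schedules the next collection at 8 MB;
		 * GOGC=50 would schedule it at 6 MB. */
		printf("%llu\n", (unsigned long long)next_gc(4<<20, 100));	/* 8388608 */
		printf("%llu\n", (unsigned long long)next_gc(4<<20, 50));	/* 6291456 */
		return 0;
	}
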
868 858
869 static void 859 static void
870 stealcache(void) 860 stealcache(void)
871 { 861 {
872 M *m; 862 M *m;
873 863
874 for(m=runtime·allm; m; m=m->alllink) 864 for(m=runtime·allm; m; m=m->alllink)
875 runtime·MCache_ReleaseAll(m->mcache); 865 runtime·MCache_ReleaseAll(m->mcache);
876 } 866 }
877 867
878 static void 868 static void
879 cachestats(GCStats *stats) 869 cachestats(GCStats *stats)
880 { 870 {
881 M *m; 871 M *m;
882 MCache *c; 872 MCache *c;
883 int32 i; 873 int32 i;
884 uint64 stacks_inuse; 874 uint64 stacks_inuse;
885 uint64 stacks_sys; 875 uint64 stacks_sys;
876 uint64 *src, *dst;
886 877
887 if(stats) 878 if(stats)
888 runtime·memclr((byte*)stats, sizeof(*stats)); 879 runtime·memclr((byte*)stats, sizeof(*stats));
889 stacks_inuse = 0; 880 stacks_inuse = 0;
890 stacks_sys = 0; 881 stacks_sys = 0;
891 for(m=runtime·allm; m; m=m->alllink) { 882 for(m=runtime·allm; m; m=m->alllink) {
892 runtime·purgecachedstats(m); 883 runtime·purgecachedstats(m);
893 stacks_inuse += m->stackalloc->inuse; 884 stacks_inuse += m->stackalloc->inuse;
894 stacks_sys += m->stackalloc->sys; 885 stacks_sys += m->stackalloc->sys;
895 if(stats) { 886 if(stats) {
896 » » » stats->nlookup += m->gcstats.nlookup; 887 » » » src = (uint64*)&m->gcstats;
897 » » » stats->nsizelookup += m->gcstats.nsizelookup; 888 » » » dst = (uint64*)stats;
898 » » » stats->naddrlookup += m->gcstats.naddrlookup; 889 » » » for(i=0; i<sizeof(*stats)/sizeof(uint64); i++)
899 » » » stats->nhandoff += m->gcstats.nhandoff; 890 » » » » dst[i] += src[i];
900 runtime·memclr((byte*)&m->gcstats, sizeof(m->gcstats)); 891 runtime·memclr((byte*)&m->gcstats, sizeof(m->gcstats));
901 } 892 }
902 c = m->mcache; 893 c = m->mcache;
903 for(i=0; i<nelem(c->local_by_size); i++) { 894 for(i=0; i<nelem(c->local_by_size); i++) {
904 mstats.by_size[i].nmalloc += c->local_by_size[i].nmalloc; 895 mstats.by_size[i].nmalloc += c->local_by_size[i].nmalloc;
905 c->local_by_size[i].nmalloc = 0; 896 c->local_by_size[i].nmalloc = 0;
906 mstats.by_size[i].nfree += c->local_by_size[i].nfree; 897 mstats.by_size[i].nfree += c->local_by_size[i].nfree;
907 c->local_by_size[i].nfree = 0; 898 c->local_by_size[i].nfree = 0;
908 } 899 }
909 } 900 }
910 mstats.stacks_inuse = stacks_inuse; 901 mstats.stacks_inuse = stacks_inuse;
911 mstats.stacks_sys = stacks_sys; 902 mstats.stacks_sys = stacks_sys;
912 } 903 }
913 904
914 void 905 void
915 runtime·gc(int32 force) 906 runtime·gc(int32 force)
916 { 907 {
917 int64 t0, t1, t2, t3; 908 int64 t0, t1, t2, t3;
918 uint64 heap0, heap1, obj0, obj1; 909 uint64 heap0, heap1, obj0, obj1;
910 GCStats stats;
911 uint32 i;
919 byte *p; 912 byte *p;
920 bool extra;
921 GCStats stats;
922 913
923 // The gc is turned off (via enablegc) until 914 // The gc is turned off (via enablegc) until
924 // the bootstrap has completed. 915 // the bootstrap has completed.
925 // Also, malloc gets called in the guts 916 // Also, malloc gets called in the guts
926 // of a number of libraries that might be 917 // of a number of libraries that might be
927 // holding locks. To avoid priority inversion 918 // holding locks. To avoid priority inversion
928 // problems, don't bother trying to run gc 919 // problems, don't bother trying to run gc
929 // while holding a lock. The next mallocgc 920 // while holding a lock. The next mallocgc
930 // without a lock will do the gc instead. 921 // without a lock will do the gc instead.
931 if(!mstats.enablegc || m->locks > 0 || runtime·panicking) 922 if(!mstats.enablegc || m->locks > 0 || runtime·panicking)
932 return; 923 return;
933 924
934 if(gcpercent == -2) { // first time through 925 if(gcpercent == -2) { // first time through
935 p = runtime·getenv("GOGC"); 926 p = runtime·getenv("GOGC");
936 if(p == nil || p[0] == '\0') 927 if(p == nil || p[0] == '\0')
937 gcpercent = 100; 928 gcpercent = 100;
938 else if(runtime·strcmp(p, (byte*)"off") == 0) 929 else if(runtime·strcmp(p, (byte*)"off") == 0)
939 gcpercent = -1; 930 gcpercent = -1;
940 else 931 else
941 gcpercent = runtime·atoi(p); 932 gcpercent = runtime·atoi(p);
942 933
943 p = runtime·getenv("GOGCTRACE"); 934 p = runtime·getenv("GOGCTRACE");
944 if(p != nil) 935 if(p != nil)
945 gctrace = runtime·atoi(p); 936 gctrace = runtime·atoi(p);
946 } 937 }
947 if(gcpercent < 0) 938 if(gcpercent < 0)
948 return; 939 return;
949 940
950 » runtime·semacquire(&gcsema); 941 » runtime·semacquire(&runtime·worldsema);
951 if(!force && mstats.heap_alloc < mstats.next_gc) { 942 if(!force && mstats.heap_alloc < mstats.next_gc) {
952 » » runtime·semrelease(&gcsema); 943 » » runtime·semrelease(&runtime·worldsema);
953 return; 944 return;
954 } 945 }
955 946
956 t0 = runtime·nanotime(); 947 t0 = runtime·nanotime();
957 948
958 m->gcing = 1; 949 m->gcing = 1;
959 runtime·stoptheworld(); 950 runtime·stoptheworld();
960 951
961 » cachestats(0); 952 » heap0 = 0;
962 » heap0 = mstats.heap_alloc; 953 » obj0 = 0;
963 » obj0 = mstats.nmalloc - mstats.nfree; 954 » if(gctrace) {
964 955 » » cachestats(0);
965 » work.nproc = MaxGcproc+1; 956 » » heap0 = mstats.heap_alloc;
957 » » obj0 = mstats.nmalloc - mstats.nfree;
958 » }
959
960 » work.nproc = runtime·gcprocs();
966 work.nwait = 0; 961 work.nwait = 0;
967 work.ndone = 0; 962 work.ndone = 0;
968 » work.spans = runtime·mheap.allspans; 963 » work.debugmarkdone = 0;
969 » work.markdone = 0; 964 » addroots();
970 » extra = false; 965 » parforsetup(&work.markfor, markroot, work.nproc, work.nroot, nil, false);
971 » if(runtime·gomaxprocs > 1 && runtime·ncpu > 1) { 966 » parforsetup(&work.sweepfor, sweepspan, work.nproc, runtime·mheap.nspan, nil, true);
967 » if(work.nproc > 1) {
972 runtime·noteclear(&work.alldone); 968 runtime·noteclear(&work.alldone);
973 » » work.nproc += runtime·helpgc(&extra); 969 » » runtime·helpgc(work.nproc);
974 » } 970 » }
975 » work.nproc -= MaxGcproc; 971
976 972 » parfor(&work.markfor);
977 » mark(scanblock); 973 » scanblock(nil, 0);
974
978 if(DebugMark) { 975 if(DebugMark) {
979 » » mark(debug_scanblock); 976 » » for(i=0; i<work.nroot; i++)
980 » » runtime·xchg(&work.markdone, 1); 977 » » » debug_scanblock(work.roots[i].p, work.roots[i].n);
978 » » runtime·xchg(&work.debugmarkdone, 1);
981 } 979 }
982 t1 = runtime·nanotime(); 980 t1 = runtime·nanotime();
983 981 » // parallel sweep for over all spans
984 » sweep(); 982 » parfor(&work.sweepfor);
985 » if(work.nproc > 1)
986 » » runtime·notesleep(&work.alldone);
987 t2 = runtime·nanotime(); 983 t2 = runtime·nanotime();
988 984
989 stealcache(); 985 stealcache();
990 cachestats(&stats); 986 cachestats(&stats);
991 987
992 mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*gcpercent/100; 988 mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*gcpercent/100;
993 m->gcing = 0; 989 m->gcing = 0;
994 990
995 m->locks++; // disable gc during the mallocs in newproc
996 if(finq != nil) { 991 if(finq != nil) {
992 m->locks++; // disable gc during the mallocs in newproc
997 // kick off or wake up goroutine to run queued finalizers 993 // kick off or wake up goroutine to run queued finalizers
998 if(fing == nil) 994 if(fing == nil)
999 fing = runtime·newproc1((byte*)runfinq, nil, 0, 0, runti me·gc); 995 fing = runtime·newproc1((byte*)runfinq, nil, 0, 0, runti me·gc);
1000 else if(fingwait) { 996 else if(fingwait) {
1001 fingwait = 0; 997 fingwait = 0;
1002 runtime·ready(fing); 998 runtime·ready(fing);
1003 } 999 }
1004 » } 1000 » » m->locks--;
1005 » m->locks--; 1001 » }
1002 »·······
1003 » if(work.nproc > 1)
1004 » » runtime·notesleep(&work.alldone);
1006 1005
1007 heap1 = mstats.heap_alloc; 1006 heap1 = mstats.heap_alloc;
1008 obj1 = mstats.nmalloc - mstats.nfree; 1007 obj1 = mstats.nmalloc - mstats.nfree;
1009 1008
1010 t3 = runtime·nanotime(); 1009 t3 = runtime·nanotime();
1010 mstats.last_gc = t3;
1011 mstats.pause_ns[mstats.numgc%nelem(mstats.pause_ns)] = t3 - t0; 1011 mstats.pause_ns[mstats.numgc%nelem(mstats.pause_ns)] = t3 - t0;
1012 mstats.pause_total_ns += t3 - t0; 1012 mstats.pause_total_ns += t3 - t0;
1013 mstats.numgc++; 1013 mstats.numgc++;
1014 if(mstats.debuggc) 1014 if(mstats.debuggc)
1015 runtime·printf("pause %D\n", t3-t0); 1015 runtime·printf("pause %D\n", t3-t0);
1016 1016
1017 if(gctrace) { 1017 if(gctrace) {
1018 » » runtime·printf("gc%d(%d): %D+%D+%D ms %D -> %D MB %D -> %D (%D-%D) objects %D pointer lookups (%D size, %D addr) %D handoff\n", 1018 » » runtime·printf("gc%d(%d): %D+%D+%D ms %D -> %D MB %D -> %D (%D-%D) objects, %D(%D) steals, %D(%D) handoffs, %D/%D/%D yields\n",
1019 mstats.numgc, work.nproc, (t1-t0)/1000000, (t2-t1)/10000 00, (t3-t2)/1000000, 1019 mstats.numgc, work.nproc, (t1-t0)/1000000, (t2-t1)/10000 00, (t3-t2)/1000000,
1020 heap0>>20, heap1>>20, obj0, obj1, 1020 heap0>>20, heap1>>20, obj0, obj1,
1021 mstats.nmalloc, mstats.nfree, 1021 mstats.nmalloc, mstats.nfree,
1022 » » » stats.nlookup, stats.nsizelookup, stats.naddrlookup, stats.nhandoff); 1022 » » » stats.nsteal, stats.nstealcnt,
1023 » } 1023 » » » stats.nhandoff, stats.nhandoffcnt,
1024 1024 » » » stats.nprocyield, stats.nosyield, stats.nsleep);
1025 » runtime·semrelease(&gcsema); 1025 » }
1026 1026 »·······
1027 » // If we could have used another helper proc, start one now, 1027 » runtime·MProf_GC();
1028 » // in the hope that it will be available next time. 1028 » runtime·semrelease(&runtime·worldsema);
1029 » // It would have been even better to start it before the collection, 1029 » runtime·starttheworld();
1030 » // but doing so requires allocating memory, so it's tricky to 1030
1031 » // coordinate. This lazy approach works out in practice: 1031 » // give the queued finalizers, if any, a chance to run
1032 » // we don't mind if the first couple gc rounds don't have quite 1032 » if(finq != nil)
1033 » // the maximum number of procs.
1034 » runtime·starttheworld(extra);
1035
1036 » // give the queued finalizers, if any, a chance to run»·
1037 » if(finq != nil)»
1038 runtime·gosched(); 1033 runtime·gosched();
1039 1034
1040 if(gctrace > 1 && !force) 1035 if(gctrace > 1 && !force)
1041 runtime·gc(1); 1036 runtime·gc(1);
1042 } 1037 }
1043 1038
1044 void 1039 void
1045 runtime·UpdateMemStats(void) 1040 runtime·ReadMemStats(MStats *stats)
1046 { 1041 {
1047 » // Have to acquire gcsema to stop the world, 1042 » // Have to acquire worldsema to stop the world,
1048 // because stoptheworld can only be used by 1043 // because stoptheworld can only be used by
1049 // one goroutine at a time, and there might be 1044 // one goroutine at a time, and there might be
1050 // a pending garbage collection already calling it. 1045 // a pending garbage collection already calling it.
1051 » runtime·semacquire(&gcsema); 1046 » runtime·semacquire(&runtime·worldsema);
1052 m->gcing = 1; 1047 m->gcing = 1;
1053 runtime·stoptheworld(); 1048 runtime·stoptheworld();
1054 cachestats(0); 1049 cachestats(0);
1050 *stats = mstats;
1055 m->gcing = 0; 1051 m->gcing = 0;
1056 » runtime·semrelease(&gcsema); 1052 » runtime·semrelease(&runtime·worldsema);
1057 » runtime·starttheworld(false); 1053 » runtime·starttheworld();
1058 } 1054 }
1059 1055
1060 static void 1056 static void
1061 runfinq(void) 1057 runfinq(void)
1062 { 1058 {
1063 Finalizer *f; 1059 Finalizer *f;
1064 FinBlock *fb, *next; 1060 FinBlock *fb, *next;
1065 byte *frame; 1061 byte *frame;
1066 uint32 framesz, framecap, i; 1062 uint32 framesz, framecap, i;
1067 1063
(...skipping 234 matching lines...)
1302 uintptr n; 1298 uintptr n;
1303 1299
1304 n = (h->arena_used - h->arena_start) / wordsPerBitmapWord; 1300 n = (h->arena_used - h->arena_start) / wordsPerBitmapWord;
1305 n = (n+bitmapChunk-1) & ~(bitmapChunk-1); 1301 n = (n+bitmapChunk-1) & ~(bitmapChunk-1);
1306 if(h->bitmap_mapped >= n) 1302 if(h->bitmap_mapped >= n)
1307 return; 1303 return;
1308 1304
1309 runtime·SysMap(h->arena_start - n, n - h->bitmap_mapped); 1305 runtime·SysMap(h->arena_start - n, n - h->bitmap_mapped);
1310 h->bitmap_mapped = n; 1306 h->bitmap_mapped = n;
1311 } 1307 }
1308
1309 #ifdef _64BIT
1310 // Amd64 uses 48-bit virtual addresses, 47-th bit is used as kernel/user flag.
1311 // So we use 17msb of pointers as ABA counter.
1312 # define PTR_BITS 47
1313 #else
1314 # define PTR_BITS 32
1315 #endif
1316 #define PTR_MASK ((1ull<<PTR_BITS)-1)
1317
1318 static void
1319 lifopush(uint64 *a, Workbuf *b)
1320 {
1321 uint64 old, new;
1322
1323 if((uint64)b != ((uint64)b&PTR_MASK)) {
1324 runtime·printf("p=%p\n", b);
1325 runtime·throw("lifopush: invalid pointer");
1326 }
1327
1328 if(work.nproc == 1) {
1329 b->next = (Workbuf*)(*a&PTR_MASK);
1330 *a = (uint64)b;
1331 return;
1332 }
1333
1334 b->pushcnt++;
1335 new = (uint64)b|(((uint64)b->pushcnt)<<PTR_BITS);
1336 old = runtime·atomicload64(a);
1337 for(;;) {
1338 b->next = (Workbuf*)(old&PTR_MASK);
1339 if(runtime·cas64(a, &old, new))
1340 break;
1341 }
1342 }
1343
1344 static Workbuf*
1345 lifopop(uint64 *a)
1346 {
1347 Workbuf *b, *b2;
1348 uint64 old, new;
1349
1350 if(work.nproc == 1) {
1351 b = (Workbuf*)(*a&PTR_MASK);
1352 if(b == nil)
1353 return nil;
1354 *a = (uint64)b->next;
1355 return b;
1356 }
1357
1358 old = runtime·atomicload64(a);
1359 for(;;) {
1360 if(old == 0)
1361 return nil;
1362 b = (Workbuf*)(old&PTR_MASK);
1363 b2 = runtime·atomicloadp(&b->next);
1364 new = 0;
1365 if(b2 != nil)
1366 new = (uint64)b2|(((uint64)b2->pushcnt)<<PTR_BITS);
1367 if(runtime·cas64(a, &old, new))
1368 return b;
1369 }
1370 }
1371
1372 void
1373 runtime·CTestLockFreeStack(bool isShort)
1374 {
1375 uint64 stack;
1376 Workbuf *b;
1377
1378 USED(isShort);
1379
1380 for(work.nproc=1; work.nproc<=2; work.nproc++) {
1381 // check the stack is initially empty
1382 stack = 0;
1383 if(lifopop(&stack) != nil)
1384 runtime·panicstring("stack is not empty");
1385
1386 // push one element
1387 b = (Workbuf*)runtime·mallocgc(sizeof(Workbuf), FlagNoGC, 0, 0);
1388 b->nobj = 42;
1389 lifopush(&stack, b);
1390
1391 // push another
1392 b = (Workbuf*)runtime·mallocgc(sizeof(Workbuf), FlagNoGC, 0, 0);
1393 b->nobj = 43;
1394 lifopush(&stack, b);
1395
1396 // pop one element
1397 b = lifopop(&stack);
1398 if(b == nil)
1399 runtime·panicstring("stack is empty");
1400 if(b->nobj != 43)
1401 runtime·panicstring("no lifo");
1402 runtime·free(b);
1403
1404 // pop another
1405 b = lifopop(&stack);
1406 if(b == nil)
1407 runtime·panicstring("stack is empty");
1408 if(b->nobj != 42)
1409 runtime·panicstring("no lifo");
1410 runtime·free(b);
1411
1412 // check the stack is empty again
1413 if(stack != 0)
1414 runtime·panicstring("stack is not empty");
1415 if(lifopop(&stack) != nil)
1416 runtime·panicstring("stack is not empty");
1417 }
1418 }
1419
1420 typedef struct StackTestCtx StackTestCtx;
1421 struct StackTestCtx
1422 {
1423 uint32 niter;
1424 uint32 waitsema;
1425 uint64 stack[2];
1426 };
1427
1428 static void
1429 stackTestProc(StackTestCtx *ctx)
1430 {
1431 int32 i, n;
1432 Workbuf *b;
1433
1434 for(i=0; i<ctx->niter; i++) {
1435 n = runtime·fastrand1()%2;
1436 b = lifopop(&ctx->stack[n]);
1437 if(b==nil)
1438 continue;
1439 n = runtime·fastrand1()%2;
1440 lifopush(&ctx->stack[n], b);
1441 }
1442 runtime·semrelease(&ctx->waitsema);
1443 }
1444
1445 void
1446 runtime·CTestLockFreeStackStress(bool isShort)
1447 {
1448 StackTestCtx ctx, *arg;
1449 Workbuf *b;
1450 int32 i, sum, sum2, cnt, procs;
1451 const int32 N = 100;
1452 const int32 P = 8;
1453 const int32 G = 16;
1454
1455 procs = runtime·gomaxprocsfunc(P);
1456 work.nproc = P;
1457 ctx.niter = isShort ? 10000 : 100000;
1458 ctx.waitsema = 0;
1459 ctx.stack[0] = 0;
1460 ctx.stack[1] = 0;
1461 arg = &ctx;
1462 sum = 0;
1463 for(i=0; i<N; i++) {
1464 b = (Workbuf*)runtime·mallocgc(sizeof(Workbuf), FlagNoGC, 0, 0);
1465 b->nobj = i;
1466 sum += i;
1467 lifopush(&ctx.stack[i%2], b);
1468 }
1469 m->locks++; // disable gc during the mallocs (gc resets work.nproc)
1470 for(i=0; i<G; i++)
1471 runtime·newproc1((byte*)stackTestProc, (byte*)&arg, sizeof(arg), 0, runtime·CTestLockFreeStackStress);
1472 m->locks--;
1473 for(i=0; i<G; i++)
1474 runtime·semacquire(&ctx.waitsema);
1475 sum2 = 0;
1476 cnt = 0;
1477 for(i=0; i<2; i++) {
1478 while((b=lifopop(&ctx.stack[i])) != nil) {
1479 cnt++;
1480 sum2 += b->nobj;
1481 b->next = (Workbuf*)123;
1482 runtime·free(b);
1483 }
1484 }
1485 if(cnt != N || sum2 != sum)
1486 runtime·panicstring("stack corruption");
1487 runtime·gomaxprocsfunc(procs);
1488 }
1489
1490 static void
1491 parforsetup(Parfor *desc, void (*body)(Parfor*, uint32), uint32 nthr, uint32 n, void *ctx, bool wait)
1492 {
1493 uint32 i, d, begin, end;
1494
1495 if(nthr == 0 || nthr > MaxGcproc || body == nil) {
1496 runtime·printf("nthr=%d count=%d body=%p\n",
1497 nthr, n, body);
1498 runtime·throw("parallel for: invalid args");
1499 }
1500
1501 desc->body = body;
1502 desc->done = 0;
1503 desc->nthr = nthr;
1504 desc->thrseq = 0;
1505 desc->cnt = n;
1506 desc->ctx = ctx;
1507 desc->wait = wait;
1508 for(i=0; i<nthr; i++) {
1509 begin = n / nthr * i;
1510 end = n / nthr * (i+1);
1511 d = n % nthr;
1512 if(i < d) {
1513 begin += i;
1514 end += (i+1);
1515 } else {
1516 begin += d;
1517 end += d;
1518 }
1519 desc->thr[i].pos = (uint64)begin | (((uint64)end)<<32);
1520 }
1521 }
1522
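As a concrete illustration of the split computed above (numbers chosen for this note, not taken from the CL): with n = 10 iterations and nthr = 3 threads, n/nthr = 3 and n%nthr = 1, so the first thread receives one extra iteration and the ranges come out as [0,4), [4,7) and [7,10); thr[i].pos then holds 0|(4<<32), 4|(7<<32) and 7|(10<<32). This is the distribution CTestParforSetup further down verifies: contiguous ranges whose sizes differ by at most one and sum to n.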
1523 static void
1524 parfor(Parfor *desc)
1525 {
1526 uint32 tid, begin, end, begin2, try, victim, i;
1527 uint64 *mypos, pos;
1528 bool idle;
1529
1530 // obtain 0-based thread index
1531 tid = runtime·xadd(&desc->thrseq, 1) - 1;
1532 if(tid >= desc->nthr) {
1533 runtime·throw("parallel for: invalid tid");
1534 }
1535
1536 // if single-threaded, just execute the loop serially
1537 if(desc->nthr==1) {
1538 for(i=0; i<desc->cnt; i++)
1539 desc->body(desc, i);
1540 return;
1541 }
1542
1543 mypos = &desc->thr[tid].pos;
1544 for(;;) {
1545 for(;;) {
1546 // while we have work, bump the low index and execute the iteration
1547 pos = runtime·xadd64(mypos, 1);
1548 begin = (uint32)pos-1;
1549 end = (uint32)(pos>>32);
1550 if(begin>=end)
1551 break;
1552 desc->body(desc, begin);
1553 }
1554
1555 // out of work, need to steal something
1556 idle = false;
1557 for(try=0;; try++) {
1558 // if we don't see any work for long enough,
1559 // increment the done counter...
1560 if(try > desc->nthr*4 && !idle) {
1561 idle = true;
1562 runtime·xadd(&desc->done, 1);
1563 }
1564 // ...if all threads have incremented the counter,
1565 // we are done
1566 if(desc->done + !idle == desc->nthr) {
1567 if(!idle)
1568 runtime·xadd(&desc->done, 1);
1569 return;
1570 }
1571 // choose a random victim for stealing
1572 victim = runtime·fastrand1() % (desc->nthr-1);
1573 if(victim >= tid)
1574 victim++;
1575 pos = runtime·atomicload64(&desc->thr[victim].pos);
1576 for(;;) {
1577 // see if it has any work
1578 begin = (uint32)pos;
1579 end = (uint32)(pos>>32);
1580 if(begin >= end-1) {
1581 begin = end = 0;
1582 break;
1583 }
1584 if(idle) {
1585 runtime·xadd(&desc->done, -1);
1586 idle = false;
1587 }
1588 begin2 = begin + (end-begin)/2;
1589 if(runtime·cas64(&desc->thr[victim].pos, &pos, (uint64)begin | (((uint64)begin2)<<32))) {
1590 begin = begin2;
1591 break;
1592 }
1593 }
1594 if(begin < end) {
1595 // has successfully stolen some work
1596 if(idle)
1597 runtime·throw("parfor: should not be idle");
1598 runtime·atomicstore64(mypos, (uint64)begin | ((uint64)end)<<32);
1599 m->gcstats.nsteal += 1;
1600 m->gcstats.nstealcnt += end-begin;
1601 break;
1602 }
1603 // backoff
1604 if(try < desc->nthr) {
1605 } else if (try < desc->nthr*4) {
1606 m->gcstats.nprocyield++;
1607 runtime·procyield(20);
1608 // if the caller asked not to wait for others, exit now
1609 // (assume that most work is already done)
1610 } else if (!desc->wait) {
1611 if(!idle)
1612 runtime·xadd(&desc->done, 1);
1613 return;
1614 } else if (try < desc->nthr*6) {
1615 m->gcstats.nosyield++;
1616 runtime·osyield();
1617 } else {
1618 m->gcstats.nsleep++;
1619 runtime·usleep(1);
1620 }
1621 }
1622 }
1623 }
1624
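To make the stealing arithmetic in parfor concrete, here is a minimal sketch (standard C; the helper name splitrange is hypothetical and not part of this change) of how a thief divides a victim's packed [begin, end) range. With begin = 10 and end = 20, the victim is left [10, 15) and the thief takes [15, 20):

	#include <stdint.h>

	/* A packed position word holds begin in the low 32 bits and end in the
	   high 32 bits.  Returns the new word a thief would cas64 into the
	   victim's pos; the stolen half is reported through the out parameters. */
	static uint64_t
	splitrange(uint64_t pos, uint32_t *stolenbegin, uint32_t *stolenend)
	{
		uint32_t begin = (uint32_t)pos;
		uint32_t end = (uint32_t)(pos >> 32);
		uint32_t mid = begin + (end - begin) / 2;

		*stolenbegin = mid;	/* thief will work [mid, end) */
		*stolenend = end;
		return (uint64_t)begin | ((uint64_t)mid << 32);	/* victim keeps [begin, mid) */
	}

The cas64 in the real loop is what makes this split safe: if the victim consumed iterations between the load and the CAS, pos has changed, the CAS fails, and the thief retries with the refreshed value.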
1625 static void
1626 testParforBody(Parfor *desc, uint32 i)
1627 {
1628 uint64 *data;
1629
1630 data = (uint64*)desc->ctx;
1631 data[i] *= data[i];
1632 }
1633
1634 static void
1635 testParforProc(Parfor *desc)
1636 {
1637 parfor(desc);
1638 }
1639
1640 // Simple serial sanity test for parallelfor.
1641 void
1642 runtime·CTestParfor(bool isShort)
1643 {
1644 Parfor desc;
1645 uint64 *data;
1646 uint32 i, N;
1647
1648 USED(isShort);
1649
1650 N = 1000;
1651 data = (uint64*)runtime·mal(N*sizeof(uint64));
1652 for(i=0; i<N; i++)
1653 data[i] = i;
1654 parforsetup(&desc, testParforBody, 1, N, data, true);
1655 parfor(&desc);
1656 for(i=0; i<N; i++) {
1657 if(data[i] != i*i)
1658 runtime·panicstring("incorrect result");
1659 }
1660 runtime·free(data);
1661 }
1662
1663 // Test that iterations are properly distributed.
1664 void
1665 runtime·CTestParforSetup(bool isShort)
1666 {
1667 Parfor desc;
1668 uint32 n, t, i, begin, end, size, end0, size0, sum;
1669
1670 USED(isShort);
1671
1672 for(n=0; n<100; n++) {
1673 for(t=1; t<=MaxGcproc; t++) {
1674 parforsetup(&desc, testParforBody, t, n, 0, true);
1675 sum = 0;
1676 size0 = 0;
1677 end0 = 0;
1678 for(i=0; i<t; i++) {
1679 begin = (uint32)desc.thr[i].pos;
1680 end = (uint32)(desc.thr[i].pos>>32);
1681 size = end - begin;
1682 sum += size;
1683 if(i==0) {
1684 size0 = size;
1685 if(begin != 0)
1686 runtime·panicstring("incorrect begin");
1687 } else {
1688 if(size != size0 && size != size0-1)
1689 runtime·panicstring("incorrect size");
1690 if(begin != end0)
1691 runtime·panicstring("incorrect begin/end");
1692 }
1693 end0 = end;
1694 }
1695 if(sum != n)
1696 runtime·panicstring("incorrect sum");
1697 }
1698 }
1699 }
1700
1701 // Test that nonblocking parallelfor does not block.
1702 void
1703 runtime·CTestParforNonblock(bool isShort)
1704 {
1705 Parfor desc;
1706 uint64 *data;
1707 uint32 i, N;
1708
1709 USED(isShort);
1710
1711 N = 1000;
1712 data = (uint64*)runtime·mal(N*sizeof(uint64));
1713 for(i=0; i<N; i++)
1714 data[i] = i;
1715 parforsetup(&desc, testParforBody, MaxGcproc, N, data, false);
1716 for(i=0; i<MaxGcproc; i++)
1717 parfor(&desc);
1718 for(i=0; i<N; i++) {
1719 if(data[i] != i*i)
1720 runtime·panicstring("incorrect result");
1721 }
1722 runtime·free(data);
1723 }
1724
1725 // Test parallelfor with multiple threads running in parallel.
1726 void
1727 runtime·CTestParforParallel(bool isShort)
1728 {
1729 Parfor desc, *arg;
1730 uint64 *data;
1731 uint32 i, N;
1732 int32 procs;
1733
1734 procs = runtime·gomaxprocsfunc(MaxGcproc+1);
1735 N = 10000;
1736 if(isShort)
1737 N /= 10;
1738 data = (uint64*)runtime·mal(N*sizeof(uint64));
1739 for(i=0; i<N; i++)
1740 data[i] = i;
1741 parforsetup(&desc, testParforBody, MaxGcproc, N, data, true);
1742 arg = &desc;
1743 m->locks++; // disable gc during the mallocs in newproc
1744 for(i=1; i<MaxGcproc; i++)
1745 runtime·newproc1((byte*)testParforProc, (byte*)&arg, sizeof(arg), 0, runtime·CTestParforParallel);
1746 m->locks--;
1747 parfor(&desc);
1748 for(i=0; i<N; i++) {
1749 if(data[i] != i*i)
1750 runtime·panicstring("incorrect result");
1751 }
1752 runtime·free(data);
1753 runtime·gomaxprocsfunc(procs);
1754 }