Rietveld Code Review Tool

Delta Between Two Patch Sets: src/pkg/runtime/mgc0.c

Issue 5279048: code review 5279048: runtime: faster and more scalable GC (Closed)
Left Patch Set: diff -r da9e7548e6ef https://go.googlecode.com/hg/ Created 13 years, 4 months ago
Right Patch Set: diff -r f44057cc01b2 https://go.googlecode.com/hg/ Created 12 years, 11 months ago
LEFT | RIGHT
1 // Copyright 2009 The Go Authors. All rights reserved. 1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style 2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file. 3 // license that can be found in the LICENSE file.
4 4
5 // Garbage collector. 5 // Garbage collector.
6 6
7 #include "runtime.h" 7 #include "runtime.h"
8 #include "arch.h" 8 #include "arch_GOARCH.h"
9 #include "malloc.h" 9 #include "malloc.h"
10 #include "stack.h" 10 #include "stack.h"
11 11
12 enum { 12 enum {
13 Debug = 0, 13 Debug = 0,
14 PtrSize = sizeof(void*), 14 PtrSize = sizeof(void*),
15 DebugMark = 0, // run second pass to check mark 15 DebugMark = 0, // run second pass to check mark
16 DataBlock = 8*1024,
16 17
17 // Four bits per word (see #defines below). 18 // Four bits per word (see #defines below).
18 wordsPerBitmapWord = sizeof(void*)*8/4, 19 wordsPerBitmapWord = sizeof(void*)*8/4,
19 bitShift = sizeof(void*)*8/4, 20 bitShift = sizeof(void*)*8/4,
20 }; 21 };
21 22
22 // Bits in per-word bitmap. 23 // Bits in per-word bitmap.
23 // #defines because enum might not be able to hold the values. 24 // #defines because enum might not be able to hold the values.
24 // 25 //
25 // Each word in the bitmap describes wordsPerBitmapWord words 26 // Each word in the bitmap describes wordsPerBitmapWord words
(...skipping 19 matching lines...)
45 // /* then test bits & bitAllocated, bits & bitMarked, etc. */ 46 // /* then test bits & bitAllocated, bits & bitMarked, etc. */
46 // 47 //
47 #define bitAllocated ((uintptr)1<<(bitShift*0)) 48 #define bitAllocated ((uintptr)1<<(bitShift*0))
48 #define bitNoPointers ((uintptr)1<<(bitShift*1)) /* when bitAllocated is set */ 49 #define bitNoPointers ((uintptr)1<<(bitShift*1)) /* when bitAllocated is set */
49 #define bitMarked ((uintptr)1<<(bitShift*2)) /* when bitAllocated is set */ 50 #define bitMarked ((uintptr)1<<(bitShift*2)) /* when bitAllocated is set */
50 #define bitSpecial ((uintptr)1<<(bitShift*3)) /* when bitAllocated is set - has finalizer or being profiled */ 51 #define bitSpecial ((uintptr)1<<(bitShift*3)) /* when bitAllocated is set - has finalizer or being profiled */
51 #define bitBlockBoundary ((uintptr)1<<(bitShift*1)) /* when bitAllocated is NOT set */ 52 #define bitBlockBoundary ((uintptr)1<<(bitShift*1)) /* when bitAllocated is NOT set */
52 53
53 #define bitMask (bitBlockBoundary | bitAllocated | bitMarked | bitSpecial) 54 #define bitMask (bitBlockBoundary | bitAllocated | bitMarked | bitSpecial)
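
A rough sketch of the lookup these definitions imply (not code from this patch; bitfor is a hypothetical helper name): the descriptor bits for a heap word live in a bitmap just below arena_start, and are found with the same off/bitp/shift computation that scanblock and sweepspan perform below.

// bitfor returns the four descriptor bits for the heap word at p,
// assuming p points into the arena [arena_start, arena_used).
static uintptr
bitfor(void *p, byte *arena_start)
{
        uintptr off, *bitp, shift;

        off = (uintptr*)p - (uintptr*)arena_start;      // word index into the arena
        bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
        shift = off % wordsPerBitmapWord;
        return (*bitp >> shift) & bitMask;      // then test bitAllocated, bitMarked, ...
}
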
54 55
56 // Holding worldsema grants an M the right to try to stop the world.
57 // The procedure is:
58 //
59 // runtime·semacquire(&runtime·worldsema);
60 // m->gcing = 1;
61 // runtime·stoptheworld();
62 //
63 // ... do stuff ...
64 //
65 // m->gcing = 0;
66 // runtime·semrelease(&runtime·worldsema);
67 // runtime·starttheworld();
68 //
69 uint32 runtime·worldsema = 1;
55 static int32 gctrace; 70 static int32 gctrace;
56 71
57 typedef struct Workbuf Workbuf; 72 typedef struct Workbuf Workbuf;
58 struct Workbuf 73 struct Workbuf
59 { 74 {
60 Workbuf *next; 75 Workbuf *next;
76 uintptr pushcnt;
61 uintptr nobj; 77 uintptr nobj;
62 » byte *obj[512-2]; 78 » byte *obj[512-3];
63 }; 79 };
64 80
65 typedef struct Finalizer Finalizer; 81 typedef struct Finalizer Finalizer;
66 struct Finalizer 82 struct Finalizer
67 { 83 {
68 void (*fn)(void*); 84 void (*fn)(void*);
69 void *arg; 85 void *arg;
70 int32 nret; 86 int32 nret;
71 }; 87 };
72 88
73 typedef struct FinBlock FinBlock; 89 typedef struct FinBlock FinBlock;
74 struct FinBlock 90 struct FinBlock
75 { 91 {
76 FinBlock *alllink; 92 FinBlock *alllink;
77 FinBlock *next; 93 FinBlock *next;
78 int32 cnt; 94 int32 cnt;
79 int32 cap; 95 int32 cap;
80 Finalizer fin[1]; 96 Finalizer fin[1];
81 }; 97 };
82 98
83 extern byte data[]; 99 extern byte data[];
84 extern byte end[]; 100 extern byte etext[];
101 extern byte ebss[];
85 102
86 static G *fing; 103 static G *fing;
87 static FinBlock *finq; // list of finalizers that are to be executed 104 static FinBlock *finq; // list of finalizers that are to be executed
88 static FinBlock *finc; // cache of free blocks 105 static FinBlock *finc; // cache of free blocks
89 static FinBlock *allfin; // list of all blocks 106 static FinBlock *allfin; // list of all blocks
90 static Lock finlock; 107 static Lock finlock;
91 static int32 fingwait; 108 static int32 fingwait;
92 109
93 static void runfinq(void); 110 static void runfinq(void);
94 static Workbuf* getempty(Workbuf*); 111 static Workbuf* getempty(Workbuf*);
95 static Workbuf* getfull(Workbuf*); 112 static Workbuf* getfull(Workbuf*);
96 static void putempty(Workbuf*); 113 static void putempty(Workbuf*);
97 static Workbuf* handoff(Workbuf*); 114 static Workbuf* handoff(Workbuf*);
98 115
99 /* 116 // parallel for descriptor
100 typedef struct GcThread GcThread;
101 struct GcThread
102 {
103 » uint64 spanpos;
104 };
105 */
106
107 static struct {
108 » Lock fmu;
109 » Workbuf»*full;
110 » Lock emu;
111 » Workbuf»*empty;
112 » uint32» nproc;
113 » volatile uint32»nwait;
114 » volatile uint32»ndone;
115 » volatile uint32 markdone;
116 » //volatile uint32 sweepdone;
117 » Note» alldone;
118 » uint32» tidseq;
119
120 » Lock;
121 » byte» *chunk;
122 » uintptr»nchunk;
123
124 » struct
125 » {
126 » » byte *p;
127 » » uintptr n;
128 » } roots[64*1024];
129 » uintptr nroots;
130 } work;
131
132 typedef struct Parfor Parfor; 117 typedef struct Parfor Parfor;
133 struct Parfor 118 struct Parfor
134 { 119 {
120 // executed for each element
121 void (*body)(Parfor*, uint32);
122 // number of idle threads
135 uint32 done; 123 uint32 done;
124 // total number of threads
125 uint32 nthr;
126 // thread id sequencer
127 uint32 thrseq;
128 // iteration space [0, cnt)
129 uint32 cnt;
130 // arbitrary user context
131 void *ctx;
132 // if true, wait while all threads finish processing,
133 // otherwise parfor may return while other threads are still working
134 bool wait;
136 byte pad[CacheLineSize]; 135 byte pad[CacheLineSize];
137 struct 136 struct
138 { 137 {
138 // the thread's iteration space [32lsb, 32msb)
139 uint64 pos; 139 uint64 pos;
140 byte pad[CacheLineSize-sizeof(uint64)]; 140 byte pad[CacheLineSize-sizeof(uint64)];
141 } thr[MaxGcproc]; 141 } thr[MaxGcproc];
142 }; 142 };
143 143
144 Parfor sweepfor; 144 typedef struct GcRoot GcRoot;
145 Parfor markfor; 145 struct GcRoot
146 {
147 » byte *p;
148 » uintptr n;
149 };
150
151 static struct
152 {
153 » uint64» full; // Workbuf* + ABA counter in 17msb
154 » uint64» empty; // Workbuf* + ABA counter in 17msb
155 » uint32» nproc;
156 » byte pad0[CacheLineSize];
157 » volatile uint32»nwait;
158 » byte pad1[CacheLineSize];
159 » volatile uint32 ndone;
160 » volatile uint32 debugmarkdone;
161 » Note» alldone;
162 » Parfor» sweepfor;
163 » Parfor» markfor;
164
165 » Lock;
166 » byte» *chunk;
167 » uintptr»nchunk;
168
169 » GcRoot» *roots;
170 » uint32» nroot;
171 » uint32» rootcap;
172 } work;
173
174 // lock-free stack
175 // used to manage empty and free Workbuf's during mark phase
176 static void» lifopush(uint64 *a, Workbuf *b);
177 static Workbuf*»lifopop(uint64 *a);
178
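
The lock-free LIFO packs a Workbuf pointer and an ABA counter into a single uint64, with the counter in the 17 most significant bits (see the comment on work.full above). A hedged sketch of the idea, assuming pointers fit in the low 47 bits and reusing the cas64/atomicload64 primitives that appear elsewhere in this file; the patch's actual lifopush/lifopop may differ in detail:

static void
lifopush(uint64 *a, Workbuf *b)
{
        uint64 old, new;

        b->pushcnt++;   // bump the ABA counter so a stale CAS attempt loses
        new = (uint64)(uintptr)b | ((uint64)b->pushcnt<<47);
        for(;;) {
                old = runtime·atomicload64(a);
                b->next = (Workbuf*)(uintptr)(old & (((uint64)1<<47)-1));
                if(runtime·cas64(a, &old, new))
                        break;
        }
}

static Workbuf*
lifopop(uint64 *a)
{
        uint64 old, new;
        Workbuf *b;

        for(;;) {
                old = runtime·atomicload64(a);
                if(old == 0)
                        return nil;
                b = (Workbuf*)(uintptr)(old & (((uint64)1<<47)-1));
                new = 0;
                if(b->next != nil)
                        new = (uint64)(uintptr)b->next | ((uint64)b->next->pushcnt<<47);
                if(runtime·cas64(a, &old, new))
                        return b;
        }
}
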
179 // parallel for over [0, n).
180 // body() is executed for each iteration.
181 // nthr - total number of worker threads.
182 // ctx - arbitrary user context.
183 // if wait=true, threads return from parfor() when all work is done;
184 // otherwise, threads can return while other threads are still finishing processing.
185 static void» parforsetup(Parfor *desc, void (*body)(Parfor*, uint32),
186 » » » uint32 nthr, uint32 n, void *ctx, bool wait);
187 static void» parfor(Parfor *desc);
146 188
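
parforsetup/parfor follow a setup-then-join pattern: one thread describes the iteration space, then every participating thread calls parfor on the same descriptor and the iterations are balanced by work stealing. A short usage sketch based on the calls made later in this change for the sweep phase:

// Coordinator: split the runtime·mheap.nspan sweep iterations across
// work.nproc threads; wait=true so parfor() returns only when every
// span has been processed.
parforsetup(&work.sweepfor, sweepspan, work.nproc, runtime·mheap.nspan, nil, true);

// Each worker (the coordinating M and every gchelper) then executes:
parfor(&work.sweepfor);         // runs sweepspan(&work.sweepfor, i) for its share of i
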
147 // scanblock scans a block of n bytes starting at pointer b for references 189 // scanblock scans a block of n bytes starting at pointer b for references
148 // to other objects, scanning any it finds recursively until there are no 190 // to other objects, scanning any it finds recursively until there are no
149 // unscanned objects left. Instead of using an explicit recursion, it keeps 191 // unscanned objects left. Instead of using an explicit recursion, it keeps
150 // a work list in the Workbuf* structures and loops in the main function 192 // a work list in the Workbuf* structures and loops in the main function
151 // body. Keeping an explicit work list is easier on the stack allocator and 193 // body. Keeping an explicit work list is easier on the stack allocator and
152 // more efficient. 194 // more efficient.
153 static void 195 static void
154 scanblock(byte *b, int64 n) 196 scanblock(byte *b, int64 n)
155 { 197 {
(...skipping 14 matching lines...)
170 // Memory arena parameters. 212 // Memory arena parameters.
171 arena_start = runtime·mheap.arena_start; 213 arena_start = runtime·mheap.arena_start;
172 arena_used = runtime·mheap.arena_used; 214 arena_used = runtime·mheap.arena_used;
173 nproc = work.nproc; 215 nproc = work.nproc;
174 216
175 wbuf = nil; // current work buffer 217 wbuf = nil; // current work buffer
176 wp = nil; // storage for next queued pointer (write pointer) 218 wp = nil; // storage for next queued pointer (write pointer)
177 nobj = 0; // number of queued objects 219 nobj = 0; // number of queued objects
178 220
179 // Scanblock helpers pass b==nil. 221 // Scanblock helpers pass b==nil.
180 » // The main proc needs to return to make more 222 » // Procs need to return to make more
181 // calls to scanblock. But if work.nproc==1 then 223 // calls to scanblock. But if work.nproc==1 then
182 // might as well process blocks as soon as we 224 // might as well process blocks as soon as we
183 // have them. 225 // have them.
184 keepworking = b == nil || work.nproc == 1; 226 keepworking = b == nil || work.nproc == 1;
185 227
186 // Align b to a word boundary. 228 // Align b to a word boundary.
187 off = (uintptr)b & (PtrSize-1); 229 off = (uintptr)b & (PtrSize-1);
188 if(off != 0) { 230 if(off != 0) {
189 b += PtrSize - off; 231 b += PtrSize - off;
190 n -= PtrSize - off; 232 n -= PtrSize - off;
191 } 233 }
192 234
193 for(;;) { 235 for(;;) {
194 m->gcstats.nscanblocks++;
195
196 // Each iteration scans the block b of length n, queueing pointe rs in 236 // Each iteration scans the block b of length n, queueing pointe rs in
197 // the work buffer. 237 // the work buffer.
198 if(Debug > 1) 238 if(Debug > 1)
199 runtime·printf("scanblock %p %D\n", b, n); 239 runtime·printf("scanblock %p %D\n", b, n);
200 240
201 vp = (void**)b; 241 vp = (void**)b;
202 n >>= (2+PtrSize/8); /* n /= PtrSize (4 or 8) */ 242 n >>= (2+PtrSize/8); /* n /= PtrSize (4 or 8) */
203 for(i=0; i<n; i++) { 243 for(i=0; i<n; i++) {
204 m->gcstats.nscanwords++;
205 obj = (byte*)vp[i]; 244 obj = (byte*)vp[i];
206 245
207 // Words outside the arena cannot be pointers. 246 // Words outside the arena cannot be pointers.
208 if((byte*)obj < arena_start || (byte*)obj >= arena_used) 247 if((byte*)obj < arena_start || (byte*)obj >= arena_used)
209 continue; 248 continue;
210 249
211 m->gcstats.nscanptr++;
212 // obj may be a pointer to a live object. 250 // obj may be a pointer to a live object.
213 // Try to find the beginning of the object. 251 // Try to find the beginning of the object.
214 252
215 // Round down to word boundary. 253 // Round down to word boundary.
216 obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1)); 254 obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1));
217 255
218 // Find bits for this word. 256 // Find bits for this word.
219 off = (uintptr*)obj - (uintptr*)arena_start; 257 off = (uintptr*)obj - (uintptr*)arena_start;
220 bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1; 258 bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
221 shift = off % wordsPerBitmapWord; 259 shift = off % wordsPerBitmapWord;
(...skipping 10 matching lines...)
232 if(((xbits>>j) & (bitAllocated|bitBlockBoundary)) != 0) { 270 if(((xbits>>j) & (bitAllocated|bitBlockBoundary)) != 0) {
233 obj = (byte*)obj - (shift-j)*PtrSize; 271 obj = (byte*)obj - (shift-j)*PtrSize;
234 shift = j; 272 shift = j;
235 bits = xbits>>shift; 273 bits = xbits>>shift;
236 goto found; 274 goto found;
237 } 275 }
238 } 276 }
239 277
240 // Otherwise consult span table to find beginning. 278 // Otherwise consult span table to find beginning.
241 // (Manually inlined copy of MHeap_LookupMaybe.) 279 // (Manually inlined copy of MHeap_LookupMaybe.)
242 m->gcstats.naddrlookup++;
243 k = (uintptr)obj>>PageShift; 280 k = (uintptr)obj>>PageShift;
244 x = k; 281 x = k;
245 if(sizeof(void*) == 8) 282 if(sizeof(void*) == 8)
246 x -= (uintptr)arena_start>>PageShift; 283 x -= (uintptr)arena_start>>PageShift;
247 s = runtime·mheap.map[x]; 284 s = runtime·mheap.map[x];
248 if(s == nil || k < s->start || k - s->start >= s->npages || s->state != MSpanInUse) 285 if(s == nil || k < s->start || k - s->start >= s->npages || s->state != MSpanInUse)
249 continue; 286 continue;
250 p = (byte*)((uintptr)s->start<<PageShift); 287 p = (byte*)((uintptr)s->start<<PageShift);
251 if(s->sizeclass == 0) { 288 if(s->sizeclass == 0) {
252 obj = p; 289 obj = p;
253 } else { 290 } else {
254 if((byte*)obj >= (byte*)s->limit) 291 if((byte*)obj >= (byte*)s->limit)
255 continue; 292 continue;
256 size = runtime·class_to_size[s->sizeclass]; 293 size = runtime·class_to_size[s->sizeclass];
257 int32 i = ((byte*)obj - p)/size; 294 int32 i = ((byte*)obj - p)/size;
258 obj = p+i*size; 295 obj = p+i*size;
259 } 296 }
260 297
261 // Now that we know the object header, reload bits. 298 // Now that we know the object header, reload bits.
262 off = (uintptr*)obj - (uintptr*)arena_start; 299 off = (uintptr*)obj - (uintptr*)arena_start;
263 bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1; 300 bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
264 shift = off % wordsPerBitmapWord; 301 shift = off % wordsPerBitmapWord;
265 xbits = *bitp; 302 xbits = *bitp;
266 bits = xbits >> shift; 303 bits = xbits >> shift;
267 304
268 found: 305 found:
306 // If another proc wants a pointer, give it some.
307 if(work.nwait > 0 && nobj > 4 && work.full == 0) {
308 wbuf->nobj = nobj;
309 wbuf = handoff(wbuf);
310 nobj = wbuf->nobj;
311 wp = wbuf->obj + nobj;
312 }
313
269 // Now we have bits, bitp, and shift correct for 314 // Now we have bits, bitp, and shift correct for
270 // obj pointing at the base of the object. 315 // obj pointing at the base of the object.
271 // Only care about allocated and not marked. 316 // Only care about allocated and not marked.
272 if((bits & (bitAllocated|bitMarked)) != bitAllocated) 317 if((bits & (bitAllocated|bitMarked)) != bitAllocated)
273 continue; 318 continue;
274 if(nproc == 1) 319 if(nproc == 1)
275 *bitp |= bitMarked<<shift; 320 *bitp |= bitMarked<<shift;
276 else { 321 else {
277 for(;;) { 322 for(;;) {
278 x = *bitp; 323 x = *bitp;
279 if(x & (bitMarked<<shift)) 324 if(x & (bitMarked<<shift))
280 goto continue_obj; 325 goto continue_obj;
281 if(runtime·casp((void**)bitp, (void*)x, (void*)(x|(bitMarked<<shift)))) 326 if(runtime·casp((void**)bitp, (void*)x, (void*)(x|(bitMarked<<shift))))
282 break; 327 break;
283 } 328 }
284 } 329 }
285 330
286 // If object has no pointers, don't need to scan further. 331 // If object has no pointers, don't need to scan further.
287 if((bits & bitNoPointers) != 0) 332 if((bits & bitNoPointers) != 0)
288 continue; 333 continue;
289 334
290 » » » // If another proc wants a pointer, give it some. 335 » » » PREFETCH(obj);
291 » » » if(nobj > 4 && work.nwait > 0 && work.full == nil) {
292 » » » » wbuf->nobj = nobj;
293 » » » » wbuf = handoff(wbuf);
294 » » » » nobj = wbuf->nobj;
295 » » » » wp = wbuf->obj + nobj;
296 » » » }
297 336
298 // If buffer is full, get a new one. 337 // If buffer is full, get a new one.
299 if(wbuf == nil || nobj >= nelem(wbuf->obj)) { 338 if(wbuf == nil || nobj >= nelem(wbuf->obj)) {
300 if(wbuf != nil) 339 if(wbuf != nil)
301 wbuf->nobj = nobj; 340 wbuf->nobj = nobj;
302 wbuf = getempty(wbuf); 341 wbuf = getempty(wbuf);
303 wp = wbuf->obj; 342 wp = wbuf->obj;
304 nobj = 0; 343 nobj = 0;
305 } 344 }
306 *wp++ = obj; 345 *wp++ = obj;
307 nobj++; 346 nobj++;
308 continue_obj:; 347 continue_obj:;
309 } 348 }
310 349
311 // Done scanning [b, b+n). Prepare for the next iteration of 350 // Done scanning [b, b+n). Prepare for the next iteration of
312 // the loop by setting b and n to the parameters for the next block. 351 // the loop by setting b and n to the parameters for the next block.
313 352
314 // Fetch b from the work buffer. 353 // Fetch b from the work buffer.
315 if(nobj == 0) { 354 if(nobj == 0) {
316 //runtime·printf("nobj==0\n");
317 if(!keepworking) { 355 if(!keepworking) {
318 » » » » putempty(wbuf); 356 » » » » if(wbuf != nil)
357 » » » » » putempty(wbuf);
319 return; 358 return;
320 } 359 }
321 // Emptied our buffer: refill. 360 // Emptied our buffer: refill.
322 wbuf = getfull(wbuf); 361 wbuf = getfull(wbuf);
323 if(wbuf == nil) 362 if(wbuf == nil)
324 return; 363 return;
325 nobj = wbuf->nobj; 364 nobj = wbuf->nobj;
326 wp = wbuf->obj + wbuf->nobj; 365 wp = wbuf->obj + wbuf->nobj;
327 //runtime·printf("getfull(%p) %d\n", wbuf, (uint32)wbuf->nobj);
328 } 366 }
329 b = *--wp; 367 b = *--wp;
330 nobj--; 368 nobj--;
331 369
332 » » // Figure out n = size of b. Start by loading bits for b. 370 » » // Ask span about size class.
333 » » off = (uintptr*)b - (uintptr*)arena_start;
334 » » bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
335 » » shift = off % wordsPerBitmapWord;
336 » » xbits = *bitp;
337 » » bits = xbits >> shift;
338
339 » » // Might be small; look for nearby block boundary.
340 » » // A block boundary is marked by either bitBlockBoundary
341 » » // or bitAllocated being set (see notes near their definition).
342 » » enum {
343 » » » boundary = bitBlockBoundary|bitAllocated
344 » » };
345 » » // Look for a block boundary both after and before b
346 » » // in the same bitmap word.
347 » » //
348 » » // A block boundary j words after b is indicated by
349 » » //» bits>>j & boundary
350 » » // assuming shift+j < bitShift. (If shift+j >= bitShift then
351 » » // we'll be bleeding other bit types like bitMarked into our test.)
352 » » // Instead of inserting the conditional shift+j < bitShift into the loop,
353 » » // we can let j range from 1 to bitShift as long as we first
354 » » // apply a mask to keep only the bits corresponding
355 » » // to shift+j < bitShift aka j < bitShift-shift.
356 » » bits &= (boundary<<(bitShift-shift)) - boundary;
357
358 » » // A block boundary j words before b is indicated by
359 » » //» xbits>>(shift-j) & boundary
360 » » // (assuming shift >= j). There is no cleverness here
361 » » // avoid the test, because when j gets too large the shift
362 » » // turns negative, which is undefined in C.
363
364 » » for(j=1; j<bitShift; j++) {
365 » » » if(((bits>>j)&boundary) != 0 || shift>=j && ((xbits>>(shift-j))&boundary) != 0) {
366 » » » » n = j*PtrSize;
367 » » » » goto scan;
368 » » » }
369 » » }
370
371 » » // Fall back to asking span about size class.
372 // (Manually inlined copy of MHeap_Lookup.) 371 // (Manually inlined copy of MHeap_Lookup.)
373 m->gcstats.nsizelookup++;
374 x = (uintptr)b>>PageShift; 372 x = (uintptr)b>>PageShift;
375 if(sizeof(void*) == 8) 373 if(sizeof(void*) == 8)
376 x -= (uintptr)arena_start>>PageShift; 374 x -= (uintptr)arena_start>>PageShift;
377 s = runtime·mheap.map[x]; 375 s = runtime·mheap.map[x];
378 if(s->sizeclass == 0) 376 if(s->sizeclass == 0)
379 n = s->npages<<PageShift; 377 n = s->npages<<PageShift;
380 else 378 else
381 n = runtime·class_to_size[s->sizeclass]; 379 n = runtime·class_to_size[s->sizeclass];
382 scan:;
383 } 380 }
384 } 381 }
385 382
386 // debug_scanblock is the debug copy of scanblock. 383 // debug_scanblock is the debug copy of scanblock.
387 // it is simpler, slower, single-threaded, recursive, 384 // it is simpler, slower, single-threaded, recursive,
388 // and uses bitSpecial as the mark bit. 385 // and uses bitSpecial as the mark bit.
389 static void 386 static void
390 debug_scanblock(byte *b, int64 n) 387 debug_scanblock(byte *b, int64 n)
391 { 388 {
392 byte *obj, *p; 389 byte *obj, *p;
(...skipping 25 matching lines...)
418 if((byte*)obj < runtime·mheap.arena_start || (byte*)obj >= runtime·mheap.arena_used) 415 if((byte*)obj < runtime·mheap.arena_start || (byte*)obj >= runtime·mheap.arena_used)
419 continue; 416 continue;
420 417
421 // Round down to word boundary. 418 // Round down to word boundary.
422 obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1)); 419 obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1));
423 420
424 // Consult span table to find beginning. 421 // Consult span table to find beginning.
425 s = runtime·MHeap_LookupMaybe(&runtime·mheap, obj); 422 s = runtime·MHeap_LookupMaybe(&runtime·mheap, obj);
426 if(s == nil) 423 if(s == nil)
427 continue; 424 continue;
428
429 425
430 p = (byte*)((uintptr)s->start<<PageShift); 426 p = (byte*)((uintptr)s->start<<PageShift);
431 if(s->sizeclass == 0) { 427 if(s->sizeclass == 0) {
432 obj = p; 428 obj = p;
433 size = (uintptr)s->npages<<PageShift; 429 size = (uintptr)s->npages<<PageShift;
434 } else { 430 } else {
435 if((byte*)obj >= (byte*)s->limit) 431 if((byte*)obj >= (byte*)s->limit)
436 continue; 432 continue;
437 size = runtime·class_to_size[s->sizeclass]; 433 size = runtime·class_to_size[s->sizeclass];
438 int32 i = ((byte*)obj - p)/size; 434 int32 i = ((byte*)obj - p)/size;
(...skipping 17 matching lines...)
456 runtime·printf("found unmarked block %p in %p\n", obj, vp+i); 452 runtime·printf("found unmarked block %p in %p\n", obj, vp+i);
457 453
458 // If object has no pointers, don't need to scan further. 454 // If object has no pointers, don't need to scan further.
459 if((bits & bitNoPointers) != 0) 455 if((bits & bitNoPointers) != 0)
460 continue; 456 continue;
461 457
462 debug_scanblock(obj, size); 458 debug_scanblock(obj, size);
463 } 459 }
464 } 460 }
465 461
462 static void
463 markroot(Parfor *desc, uint32 i)
464 {
465 USED(desc);
466 scanblock(work.roots[i].p, work.roots[i].n);
467 }
468
466 // Get an empty work buffer off the work.empty list, 469 // Get an empty work buffer off the work.empty list,
467 // allocating new buffers as needed. 470 // allocating new buffers as needed.
468 static Workbuf* 471 static Workbuf*
469 getempty(Workbuf *b) 472 getempty(Workbuf *b)
470 { 473 {
471 » if(work.nproc == 1) { 474 » if(b != nil)
472 » » // Put b on full list. 475 » » lifopush(&work.full, b);
473 » » if(b != nil) { 476 » b = lifopop(&work.empty);
474 » » » b->next = work.full; 477 » if(b == nil) {
475 » » » work.full = b; 478 » » // Need to allocate.
476 » » } 479 » » runtime·lock(&work);
477 » » // Grab from empty list if possible. 480 » » if(work.nchunk < sizeof *b) {
478 » » b = work.empty; 481 » » » work.nchunk = 1<<20;
479 » » if(b != nil) { 482 » » » work.chunk = runtime·SysAlloc(work.nchunk);
480 » » » work.empty = b->next; 483 » » }
481 » » » goto haveb; 484 » » b = (Workbuf*)work.chunk;
482 » » } 485 » » work.chunk += sizeof *b;
483 » } else { 486 » » work.nchunk -= sizeof *b;
484 » » // Put b on full list. 487 » » runtime·unlock(&work);
485 » » if(b != nil) { 488 » }
486 » » » runtime·lock(&work.fmu);
487 » » » b->next = work.full;
488 » » » work.full = b;
489 » » » runtime·unlock(&work.fmu);
490 » » }
491 » » // Grab from empty list if possible.
492 » » runtime·lock(&work.emu);
493 » » b = work.empty;
494 » » if(b != nil)
495 » » » work.empty = b->next;
496 » » runtime·unlock(&work.emu);
497 » » if(b != nil)
498 » » » goto haveb;
499 » }
500
501 » // Need to allocate.
502 » runtime·lock(&work);
503 » if(work.nchunk < sizeof *b) {
504 » » work.nchunk = 1<<20;
505 » » work.chunk = runtime·SysAlloc(work.nchunk);
506 » }
507 » b = (Workbuf*)work.chunk;
508 » work.chunk += sizeof *b;
509 » work.nchunk -= sizeof *b;
510 » runtime·unlock(&work);
511
512 haveb:
513 b->nobj = 0; 489 b->nobj = 0;
514 return b; 490 return b;
515 } 491 }
516 492
517 static void 493 static void
518 putempty(Workbuf *b) 494 putempty(Workbuf *b)
519 { 495 {
520 » if(b == nil) 496 » lifopush(&work.empty, b);
521 » » return;
522
523 » if(work.nproc == 1) {
524 » » b->next = work.empty;
525 » » work.empty = b;
526 » » return;
527 » }
528
529 » runtime·lock(&work.emu);
530 » b->next = work.empty;
531 » work.empty = b;
532 » runtime·unlock(&work.emu);
533 } 497 }
534 498
535 // Get a full work buffer off the work.full list, or return nil. 499 // Get a full work buffer off the work.full list, or return nil.
536 static Workbuf* 500 static Workbuf*
537 getfull(Workbuf *b) 501 getfull(Workbuf *b)
538 { 502 {
539 int32 i; 503 int32 i;
540 » Workbuf *b1; 504
541 505 » if(b != nil)
542 //runtime·printf("getfull nproc=%d b=%p(%d)\n", work.nproc, b, (uint32)(b?b->nobj:0)); 506 » » lifopush(&work.empty, b);
543 » if(work.nproc == 1) { 507 » b = lifopop(&work.full);
544 » » // Put b on empty list. 508 » if(b != nil || work.nproc == 1)
545 » » if(b != nil) {
546 » » » b->next = work.empty;
547 » » » work.empty = b;
548 » » }
549 » » // Grab from full list if possible.
550 » » // Since work.nproc==1, no one else is
551 » » // going to give us work.
552 » » b = work.full;
553 » » if(b != nil)
554 » » » work.full = b->next;
555 return b; 509 return b;
556 }
557
558 putempty(b);
559
560 // Grab buffer from full list if possible.
561 for(;;) {
562 b1 = work.full;
563 if(b1 == nil)
564 break;
565 runtime·lock(&work.fmu);
566 if(work.full != nil) {
567 b1 = work.full;
568 work.full = b1->next;
569 runtime·unlock(&work.fmu);
570 return b1;
571 }
572 runtime·unlock(&work.fmu);
573 }
574 510
575 runtime·xadd(&work.nwait, +1); 511 runtime·xadd(&work.nwait, +1);
576 for(i=0;; i++) { 512 for(i=0;; i++) {
577 » » b1 = work.full; 513 » » if(work.full != 0) {
578 » » if(b1 != nil) { 514 » » » runtime·xadd(&work.nwait, -1);
579 » » » runtime·lock(&work.fmu); 515 » » » b = lifopop(&work.full);
580 » » » if(work.full != nil) { 516 » » » if(b != nil)
581 » » » » runtime·xadd(&work.nwait, -1); 517 » » » » return b;
582 » » » » b1 = work.full; 518 » » » runtime·xadd(&work.nwait, +1);
583 » » » » work.full = b1->next;
584 » » » » runtime·unlock(&work.fmu);
585 » » » » return b1;
586 » » » }
587 » » » runtime·unlock(&work.fmu);
588 » » » continue;
589 } 519 }
590 if(work.nwait == work.nproc) 520 if(work.nwait == work.nproc)
591 return nil; 521 return nil;
592 » » if(i < 10) 522 » » if(i < 10) {
523 » » » m->gcstats.nprocyield++;
593 runtime·procyield(20); 524 runtime·procyield(20);
594 » » else if(i < 20) 525 » » } else if(i < 20) {
526 » » » m->gcstats.nosyield++;
595 runtime·osyield(); 527 runtime·osyield();
596 » » else 528 » » } else {
529 » » » m->gcstats.nsleep++;
597 runtime·usleep(100); 530 runtime·usleep(100);
531 }
598 } 532 }
599 } 533 }
600 534
601 static Workbuf* 535 static Workbuf*
602 handoff(Workbuf *b) 536 handoff(Workbuf *b)
603 { 537 {
604 int32 n; 538 int32 n;
605 Workbuf *b1; 539 Workbuf *b1;
606 540
607 // Make new buffer with half of b's pointers. 541 // Make new buffer with half of b's pointers.
608 b1 = getempty(nil); 542 b1 = getempty(nil);
609 n = b->nobj/2; 543 n = b->nobj/2;
610 b->nobj -= n; 544 b->nobj -= n;
611 b1->nobj = n; 545 b1->nobj = n;
612 runtime·memmove(b1->obj, b->obj+b->nobj, n*sizeof b1->obj[0]); 546 runtime·memmove(b1->obj, b->obj+b->nobj, n*sizeof b1->obj[0]);
613 m->gcstats.nhandoff += 1; 547 m->gcstats.nhandoff += 1;
614 m->gcstats.nhandoffcnt += n; 548 m->gcstats.nhandoffcnt += n;
615 549
616 // Put b on full list - let first half of b get stolen. 550 // Put b on full list - let first half of b get stolen.
617 » runtime·lock(&work.fmu); 551 » lifopush(&work.full, b);
618 » b->next = work.full;
619 » work.full = b;
620 » runtime·unlock(&work.fmu);
621
622 return b1; 552 return b1;
623 } 553 }
624 554
625 // Scanstack calls scanblock on each of gp's stack segments.
626 static void 555 static void
627 scanstack(void (*scanblock)(byte*, int64), G *gp) 556 addroot(byte *p, uintptr n)
557 {
558 » uint32 cap;
559 » GcRoot *new;
560
561 » if(work.nroot >= work.rootcap) {
562 » » cap = PageSize/sizeof(GcRoot);
563 » » if(cap < 2*work.rootcap)
564 » » » cap = 2*work.rootcap;
565 » » new = (GcRoot*)runtime·SysAlloc(cap*sizeof(GcRoot));
566 » » if(work.roots != nil) {
567 » » » runtime·memmove(new, work.roots, work.rootcap*sizeof(GcRoot));
568 » » » runtime·SysFree(work.roots, work.rootcap*sizeof(GcRoot));
569 » » }
570 » » work.roots = new;
571 » » work.rootcap = cap;
572 » }
573 » work.roots[work.nroot].p = p;
574 » work.roots[work.nroot].n = n;
575 » work.nroot++;
576 }
577
578 static void
579 addstackroots(G *gp)
628 { 580 {
629 M *mp; 581 M *mp;
630 int32 n; 582 int32 n;
631 Stktop *stk; 583 Stktop *stk;
632 byte *sp, *guard; 584 byte *sp, *guard;
633 585
634 stk = (Stktop*)gp->stackbase; 586 stk = (Stktop*)gp->stackbase;
635 guard = gp->stackguard; 587 guard = gp->stackguard;
636 588
637 if(gp == g) { 589 if(gp == g) {
(...skipping 12 matching lines...)
650 // as schedlock and may have needed to start a new stack segment. 602 // as schedlock and may have needed to start a new stack segment.
651 // Use the stack segment and stack pointer at the time of 603 // Use the stack segment and stack pointer at the time of
652 // the system call instead, since that won't change underfoot. 604 // the system call instead, since that won't change underfoot.
653 if(gp->gcstack != nil) { 605 if(gp->gcstack != nil) {
654 stk = (Stktop*)gp->gcstack; 606 stk = (Stktop*)gp->gcstack;
655 sp = gp->gcsp; 607 sp = gp->gcsp;
656 guard = gp->gcguard; 608 guard = gp->gcguard;
657 } 609 }
658 } 610 }
659 611
660 if(Debug > 1)
661 runtime·printf("scanstack %d %p\n", gp->goid, sp);
662 n = 0; 612 n = 0;
663 while(stk) { 613 while(stk) {
664 if(sp < guard-StackGuard || (byte*)stk < sp) { 614 if(sp < guard-StackGuard || (byte*)stk < sp) {
665 runtime·printf("scanstack inconsistent: g%d#%d sp=%p not in [%p,%p]\n", gp->goid, n, sp, guard-StackGuard, stk); 615 runtime·printf("scanstack inconsistent: g%d#%d sp=%p not in [%p,%p]\n", gp->goid, n, sp, guard-StackGuard, stk);
666 runtime·throw("scanstack"); 616 runtime·throw("scanstack");
667 } 617 }
668 scanblock(sp, (byte*)stk - sp);
669 sp = stk->gobuf.sp;
670 guard = stk->stackguard;
671 stk = (Stktop*)stk->stackbase;
672 n++;
673 }
674 }
675
676 // Markfin calls scanblock on the blocks that have finalizers:
677 // the things pointed at cannot be freed until the finalizers have run.
678 static void
679 markfin(void *v)
680 {
681 uintptr size;
682
683 size = 0;
684 if(!runtime·mlookup(v, &v, &size, nil) || !runtime·blockspecial(v))
685 runtime·throw("mark - finalizer inconsistency");
686
687 // do not mark the finalizer block itself. just mark the things it poin ts at.
688 scanblock(v, size);
689 }
690
691 static void
692 debug_markfin(void *v)
693 {
694 uintptr size;
695
696 if(!runtime·mlookup(v, &v, &size, nil))
697 runtime·throw("debug_mark - finalizer inconsistency");
698 debug_scanblock(v, size);
699 }
700
701 // Mark
702 static void
703 mark(void (*scan)(byte*, int64))
704 {
705 G *gp;
706 FinBlock *fb;
707
708 // mark data+bss.
709 // skip runtime·mheap itself, which has no interesting pointers
710 // and is mostly zeroed and would not otherwise be paged in.
711 //runtime·printf("mark globals\n");
712 scan(data, (byte*)&runtime·mheap - data);
713 scan((byte*)(&runtime·mheap+1), end - (byte*)(&runtime·mheap+1));
714
715 // mark stacks
716 //runtime·printf("mark stacks\n");
717 for(gp=runtime·allg; gp!=nil; gp=gp->alllink) {
718 switch(gp->status){
719 default:
720 runtime·printf("unexpected G.status %d\n", gp->status);
721 runtime·throw("mark - bad status");
722 case Gdead:
723 break;
724 case Grunning:
725 if(gp != g)
726 runtime·throw("mark - world not stopped");
727 scanstack(scan, gp);
728 break;
729 case Grunnable:
730 case Gsyscall:
731 case Gwaiting:
732 scanstack(scan, gp);
733 break;
734 }
735 }
736
737 // mark things pointed at by objects with finalizers
738 //runtime·printf("mark fin\n");
739 if(scan == debug_scanblock)
740 runtime·walkfintab(debug_markfin);
741 else
742 runtime·walkfintab(markfin);
743
744 //runtime·printf("mark fin blocks\n");
745 for(fb=allfin; fb; fb=fb->alllink)
746 scanblock((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]));
747
748 // in multiproc mode, join in the queued work.
749 //runtime·printf("mark help\n");
750 scan(nil, 0);
751 }
752
753 static void
754 addroot(byte *p, uintptr n)
755 {
756 if(work.nroots == nelem(work.roots))
757 runtime·throw("gc: root overflow");
758 work.roots[work.nroots].p = p;
759 work.roots[work.nroots].n = n;
760 work.nroots++;
761 }
762
763 static void
764 collectstackroots(G *gp)
765 {
766 M *mp;
767 int32 n;
768 Stktop *stk;
769 byte *sp, *guard;
770
771 stk = (Stktop*)gp->stackbase;
772 guard = gp->stackguard;
773
774 if(gp == g) {
775 // Scanning our own stack: start at &gp.
776 sp = (byte*)&gp;
777 } else if((mp = gp->m) != nil && mp->helpgc) {
778 // gchelper's stack is in active use and has no interesting pointers.
779 return;
780 } else {
781 // Scanning another goroutine's stack.
782 // The goroutine is usually asleep (the world is stopped).
783 sp = gp->sched.sp;
784
785 // The exception is that if the goroutine is about to enter or might
786 // have just exited a system call, it may be executing code such
787 // as schedlock and may have needed to start a new stack segment.
788 // Use the stack segment and stack pointer at the time of
789 // the system call instead, since that won't change underfoot.
790 if(gp->gcstack != nil) {
791 stk = (Stktop*)gp->gcstack;
792 sp = gp->gcsp;
793 guard = gp->gcguard;
794 }
795 }
796
797 n = 0;
798 while(stk) {
799 if(sp < guard-StackGuard || (byte*)stk < sp) {
800 runtime·printf("scanstack inconsistent: g%d#%d sp=%p not in [%p,%p]\n", gp->goid, n, sp, guard-StackGuard, stk);
801 runtime·throw("scanstack");
802 }
803 addroot(sp, (byte*)stk - sp); 618 addroot(sp, (byte*)stk - sp);
804 sp = stk->gobuf.sp; 619 sp = stk->gobuf.sp;
805 guard = stk->stackguard; 620 guard = stk->stackguard;
806 stk = (Stktop*)stk->stackbase; 621 stk = (Stktop*)stk->stackbase;
807 n++; 622 n++;
808 } 623 }
809 } 624 }
810 625
811 static void 626 static void
812 collectfinroots(void *v) 627 addfinroots(void *v)
813 { 628 {
814 uintptr size; 629 uintptr size;
815 » 630
816 size = 0; 631 size = 0;
817 if(!runtime·mlookup(v, &v, &size, nil) || !runtime·blockspecial(v)) 632 if(!runtime·mlookup(v, &v, &size, nil) || !runtime·blockspecial(v))
818 runtime·throw("mark - finalizer inconsistency"); 633 runtime·throw("mark - finalizer inconsistency");
819 » 634
820 // do not mark the finalizer block itself. just mark the things it poin ts at. 635 // do not mark the finalizer block itself. just mark the things it poin ts at.
821 addroot(v, size); 636 addroot(v, size);
822 } 637 }
823 638
824 static void 639 static void
825 collectroots(void) 640 addroots(void)
826 { 641 {
827 G *gp; 642 G *gp;
828 FinBlock *fb; 643 FinBlock *fb;
829 » byte *p, *e; 644 » byte *p;
830 645
831 » work.nroots = 0; 646 » work.nroot = 0;
832 »······· 647
833 » e = (byte*)&runtime·mheap; 648 » // mark data+bss.
834 » for(p=data; p<e; p+=4*1024) 649 » for(p=data; p<ebss; p+=DataBlock)
835 » » addroot(p, p+4*1024<e?4*1024:e-p); 650 » » addroot(p, p+DataBlock<ebss?DataBlock:ebss-p);
836 651
837 » e = end; 652 » runtime·walkfintab(addfinroots);
838 » for(p=(byte*)(&runtime·mheap+1); p<e; p+=4*1024) 653
839 » » addroot(p, p+4*1024<e?4*1024:e-p);
840
841 » runtime·walkfintab(collectfinroots);
842
843 for(fb=allfin; fb; fb=fb->alllink) 654 for(fb=allfin; fb; fb=fb->alllink)
844 addroot((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0])); 655 addroot((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]));
845 » 656
846 for(gp=runtime·allg; gp!=nil; gp=gp->alllink) { 657 for(gp=runtime·allg; gp!=nil; gp=gp->alllink) {
847 switch(gp->status){ 658 switch(gp->status){
848 default: 659 default:
849 runtime·printf("unexpected G.status %d\n", gp->status); 660 runtime·printf("unexpected G.status %d\n", gp->status);
850 runtime·throw("mark - bad status"); 661 runtime·throw("mark - bad status");
851 case Gdead: 662 case Gdead:
852 break; 663 break;
853 case Grunning: 664 case Grunning:
854 if(gp != g) 665 if(gp != g)
855 runtime·throw("mark - world not stopped"); 666 runtime·throw("mark - world not stopped");
856 » » » » collectstackroots(gp); 667 » » » » addstackroots(gp);
857 break; 668 break;
858 case Grunnable: 669 case Grunnable:
859 case Gsyscall: 670 case Gsyscall:
860 case Gwaiting: 671 case Gwaiting:
861 » » » » collectstackroots(gp); 672 » » » » addstackroots(gp);
862 break; 673 break;
863 } 674 }
864 } 675 }
865
866 //runtime·printf("collected %d gc roots\n", (uint32)work.nroots);
867 } 676 }
868 677
869 static bool 678 static bool
870 handlespecial(byte *p, uintptr size) 679 handlespecial(byte *p, uintptr size)
871 { 680 {
872 void (*fn)(void*); 681 void (*fn)(void*);
873 int32 nret; 682 int32 nret;
874 FinBlock *block; 683 FinBlock *block;
875 Finalizer *f; 684 Finalizer *f;
876 » 685
877 if(!runtime·getfinalizer(p, true, &fn, &nret)) { 686 if(!runtime·getfinalizer(p, true, &fn, &nret)) {
878 runtime·setblockspecial(p, false); 687 runtime·setblockspecial(p, false);
879 runtime·MProf_Free(p, size); 688 runtime·MProf_Free(p, size);
880 return false; 689 return false;
881 } 690 }
882 691
883 runtime·lock(&finlock); 692 runtime·lock(&finlock);
884 if(finq == nil || finq->cnt == finq->cap) { 693 if(finq == nil || finq->cnt == finq->cap) {
885 if(finc == nil) { 694 if(finc == nil) {
886 finc = runtime·SysAlloc(PageSize); 695 finc = runtime·SysAlloc(PageSize);
887 finc->cap = (PageSize - sizeof(FinBlock)) / sizeof(Finalizer) + 1; 696 finc->cap = (PageSize - sizeof(FinBlock)) / sizeof(Finalizer) + 1;
888 finc->alllink = allfin; 697 finc->alllink = allfin;
889 allfin = finc; 698 allfin = finc;
890 } 699 }
891 block = finc; 700 block = finc;
892 finc = block->next; 701 finc = block->next;
893 block->next = finq; 702 block->next = finq;
894 finq = block; 703 finq = block;
895 } 704 }
896 f = &finq->fin[finq->cnt]; 705 f = &finq->fin[finq->cnt];
897 finq->cnt++; 706 finq->cnt++;
898 f->fn = fn; 707 f->fn = fn;
899 f->nret = nret; 708 f->nret = nret;
900 f->arg = p; 709 f->arg = p;
901 runtime·unlock(&finlock); 710 runtime·unlock(&finlock);
902 return true; 711 return true;
903 } 712 }
904 713
905 static void sweepspan(uint32 sid) 714 // Sweep frees or collects finalizers for blocks not marked in the mark phase.
715 // It clears the mark bits in preparation for the next GC round.
716 static void
717 sweepspan(Parfor *desc, uint32 spanidx)
906 { 718 {
907 MSpan *s; 719 MSpan *s;
908 int32 cl, n, npages; 720 int32 cl, n, npages;
909 uintptr size; 721 uintptr size;
910 byte *p; 722 byte *p;
911 MCache *c; 723 MCache *c;
912 byte *arena_start; 724 byte *arena_start;
913 725 » MLink *start, *end;
914 » s = runtime·mheap.allspans[sid]; 726 » int32 nfree;
727
728 » USED(desc);
729
730 » s = runtime·mheap.allspans[spanidx];
731 » // Stamp newly unused spans. The scavenger will use that
732 » // info to potentially give back some pages to the OS.
733 » if(s->state == MSpanFree && s->unusedsince == 0)
734 » » s->unusedsince = runtime·nanotime();
915 if(s->state != MSpanInUse) 735 if(s->state != MSpanInUse)
916 return; 736 return;
917 737
918 arena_start = runtime·mheap.arena_start; 738 arena_start = runtime·mheap.arena_start;
919 739
920 m->gcstats.nspanfull++;
921 p = (byte*)(s->start << PageShift); 740 p = (byte*)(s->start << PageShift);
922 cl = s->sizeclass; 741 cl = s->sizeclass;
923 if(cl == 0) { 742 if(cl == 0) {
924 m->gcstats.nspanlarge++;
925 size = s->npages<<PageShift; 743 size = s->npages<<PageShift;
926 n = 1; 744 n = 1;
927 } else { 745 } else {
928 // Chunk full of small blocks. 746 // Chunk full of small blocks.
929 size = runtime·class_to_size[cl]; 747 size = runtime·class_to_size[cl];
930 npages = runtime·class_to_allocnpages[cl]; 748 npages = runtime·class_to_allocnpages[cl];
931 n = (npages << PageShift) / size; 749 n = (npages << PageShift) / size;
932 } 750 }
933 c = m->mcache; 751 c = m->mcache;
752 nfree = 0;
753 start = end = nil;
934 754
935 // Sweep through n objects of given size starting at p. 755 // Sweep through n objects of given size starting at p.
936 // This thread owns the span now, so it can manipulate 756 // This thread owns the span now, so it can manipulate
937 // the block bitmap without atomic operations. 757 // the block bitmap without atomic operations.
938 for(; n > 0; n--, p += size) { 758 for(; n > 0; n--, p += size) {
939 uintptr off, *bitp, shift, bits; 759 uintptr off, *bitp, shift, bits;
940 760
941 off = (uintptr*)p - (uintptr*)arena_start; 761 off = (uintptr*)p - (uintptr*)arena_start;
942 bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1; 762 bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
763 PREFETCH((byte*)bitp - size);
943 shift = off % wordsPerBitmapWord; 764 shift = off % wordsPerBitmapWord;
944 bits = *bitp>>shift; 765 bits = *bitp>>shift;
945 766
946 if((bits & bitAllocated) == 0) 767 if((bits & bitAllocated) == 0)
947 continue; 768 continue;
948 769
949 if((bits & bitMarked) != 0) { 770 if((bits & bitMarked) != 0) {
950 if(DebugMark) { 771 if(DebugMark) {
951 if(!(bits & bitSpecial)) 772 if(!(bits & bitSpecial))
952 runtime·printf("found spurious mark on %p\n", p); 773 runtime·printf("found spurious mark on %p\n", p);
953 *bitp &= ~(bitSpecial<<shift); 774 *bitp &= ~(bitSpecial<<shift);
954 } 775 }
955 *bitp &= ~(bitMarked<<shift); 776 *bitp &= ~(bitMarked<<shift);
956 continue; 777 continue;
957 } 778 }
958 779
959 // Special means it has a finalizer or is being profiled. 780 // Special means it has a finalizer or is being profiled.
960 // In DebugMark mode, the bit has been coopted so 781 // In DebugMark mode, the bit has been coopted so
961 // we have to assume all blocks are special. 782 // we have to assume all blocks are special.
962 if(DebugMark || (bits & bitSpecial) != 0) { 783 if(DebugMark || (bits & bitSpecial) != 0) {
963 if(handlespecial(p, size)) 784 if(handlespecial(p, size))
964 continue; 785 continue;
965 } 786 }
966 787
967 // Mark freed; restore block boundary bit. 788 // Mark freed; restore block boundary bit.
968 *bitp = (*bitp & ~(bitMask<<shift)) | (bitBlockBoundary<<shift); 789 *bitp = (*bitp & ~(bitMask<<shift)) | (bitBlockBoundary<<shift);
969 790
970 » » if(s->sizeclass == 0) { 791 » » if(cl == 0) {
971 // Free large span. 792 // Free large span.
972 runtime·unmarkspan(p, 1<<PageShift); 793 runtime·unmarkspan(p, 1<<PageShift);
973 *(uintptr*)p = 1; // needs zeroing 794 *(uintptr*)p = 1; // needs zeroing
974 runtime·MHeap_Free(&runtime·mheap, s, 1); 795 runtime·MHeap_Free(&runtime·mheap, s, 1);
796 c->local_alloc -= size;
797 c->local_nfree++;
975 } else { 798 } else {
976 // Free small object. 799 // Free small object.
977 » » » if(size > sizeof(uintptr)) 800 » » » if(nfree) {
978 » » » » ((uintptr*)p)[1] = 1;» // mark as "needs to be zeroed" 801 » » » » PREFETCH(p);
979 » » » c->local_by_size[s->sizeclass].nfree++; 802 » » » » if(size > sizeof(uintptr))
980 » » » runtime·MCache_Free(c, p, s->sizeclass, size); 803 » » » » » ((uintptr*)end)[1] = 1; // mark as "needs to be zeroed"
981 » » } 804 » » » » end->next = (MLink*)p;
982 » » c->local_alloc -= size; 805 » » » » end = (MLink*)p;
983 » » c->local_nfree++; 806 » » » } else {
984 » } 807 » » » » start = (MLink*)p;
985 } 808 » » » » end = (MLink*)p;
986
987 static void
988 parallelfor(Parfor *desc, uint32 nthr, uint32 tid, uint32 count, void (*body)(uint32 i))
989 {
990 » uint32 begin, end, begin2, try, victim;
991 » uint64 *mypos, pos;
992 » bool idle;
993
994 » if(nthr > MaxGcproc || tid > nthr || body == nil) {
995 » » runtime·printf("nthr=%d tid=%d count=%d body=%p\n",
996 » » » nthr, tid, count, body);
997 » » runtime·throw("parallel for: invalid args");
998 » }
999
1000 » if(nthr==1 || count<=1) {
1001 » » for(pos=0; pos<count; pos++)
1002 » » » body(pos);
1003 » » return;
1004 » }
1005
1006 » begin = count / nthr * tid;
1007 » end = count / nthr * (tid+1);
1008 » if(tid==nthr-1)
1009 » » end = count;
1010 » mypos = &desc->thr[tid].pos;
1011 » runtime·atomicstore64(mypos, (uint64)begin | ((uint64)end)<<32);
1012 » //runtime·printf("thr #%d(%d): %d-%d (%d)\n", tid, work.nproc, begin, end, runtime·mheap.nspan);
1013
1014 » for(;;) {
1015 » » //spanpos = runtime·atomicload64(&thr->spanpos);
1016 » » //begin = (uint32)spanpos;
1017 » » //end = (uint32)(spanpos>>32);
1018 » » //runtime·printf("thr #%d(%d): starting with %d-%d:%d\n", tid, work.nproc, begin, end, end-begin);
1019
1020 » » for(;;) {
1021 » » » pos = runtime·xadd64(mypos, 1);
1022 » » » begin = (uint32)pos-1;
1023 » » » end = (uint32)(pos>>32);
1024 » » » if(begin>=end)
1025 » » » » break;
1026 » » » body(begin);
1027 » » }
1028
1029 » » idle = false;
1030 » » for(try=0;; try++) {
1031 » » » if(try > nthr*4) {
1032 » » » » if(idle == false) {
1033 » » » » » runtime·xadd(&desc->done, 1);
1034 » » » » » idle = true;
1035 » » » » }
1036 » » » » if(desc->done == nthr)
1037 » » » » » return;
1038 } 809 }
1039 » » » victim = runtime·fastrand1() % (nthr-1); 810 » » » nfree++;
1040 » » » if(victim >= tid) 811 » » }
1041 » » » » victim++; 812 » }
1042 » » » pos = runtime·atomicload64(&desc->thr[victim].pos); 813
1043 » » » //begin = (uint32)spanpos; 814 » if(nfree) {
1044 » » » //end = (uint32)(spanpos>>32); 815 » » if(size > sizeof(uintptr))
1045 » » » //runtime·printf("thr #%d(%d): stealing from %d (%d-%d:%d)\n", tid, work.nproc, victim, begin, end, end-begin); 816 » » » ((uintptr*)end)[1] = 1; // mark as "needs to be zeroed"
1046 » » » for(;;) { 817 » » c->local_by_size[s->sizeclass].nfree += nfree;
1047 » » » » begin = (uint32)pos; 818 » » c->local_alloc -= size * nfree;
1048 » » » » end = (uint32)(pos>>32); 819 » » c->local_nfree += nfree;
1049 » » » » if(begin >= end-1) { 820 » » c->local_cachealloc -= nfree * size;
1050 » » » » » begin = end = 0; 821 » » c->local_objects -= nfree;
1051 » » » » » break; 822 » » runtime·MCentral_FreeSpan(&runtime·mheap.central[cl],
1052 » » » » } 823 » » » » » s, nfree, start, end);
1053 » » » » if(idle) { 824 » }
1054 » » » » » runtime·xadd(&desc->done, -1);
1055 » » » » » idle = false;
1056 » » » » }
1057 » » » » begin2 = begin + (end-begin)/2;
1058 » » » » if(runtime·cas64(&desc->thr[victim].pos, &pos, ( uint64)begin | (((uint64)begin2)<<32))) {
1059 » » » » » begin = begin2;
1060 » » » » » break;
1061 » » » » }
1062 » » » }
1063 » » » if(begin < end) {
1064 » » » » if(idle)
1065 » » » » » runtime·xadd(&desc->done, -1);
1066 » » » » runtime·atomicstore64(mypos, (uint64)begin | ((u int64)end)<<32);
1067 » » » » m->gcstats.nspansteals += 1;
1068 » » » » m->gcstats.nspanstolen += end-begin;
1069 » » » » break;
1070 » » » }
1071 » » » if(try<nthr) {
1072 » » » } else if (try<nthr*2) {
1073 » » » » runtime·procyield(20);
1074 » » » } else if (try<nthr*4) {
1075 » » » » runtime·osyield();
1076 » » » } else {
1077 » » » » runtime·usleep(1);
1078 » » » }
1079 » » }
1080 » }
1081 }
1082
1083 static void
1084 markcb(uint32 i)
1085 {
1086 » scanblock(work.roots[i].p, work.roots[i].n);
1087 }
1088
1089 static void
1090 parmark(uint32 tid)
1091 {
1092 » //uintptr i;
1093
1094 » parallelfor(&markfor, work.nproc, tid, work.nroots, markcb);
1095 » //» for(i=0; i<work.nroots; i++)
1096 » //» » scanblock(work.roots[i].p, work.roots[i].n);
1097 » scanblock(nil, 0);
1098 }
1099
1100 // Sweep frees or collects finalizers for blocks not marked in the mark phase.
1101 // It clears the mark bits in preparation for the next GC round.
1102 static void
1103 sweep(uint32 tid)
1104 {
1105 » //GcThread *thr;
1106 » //MSpan *s;
1107 » //uint64 spanpos;
1108
1109 » parallelfor(&sweepfor, work.nproc, tid, runtime·mheap.nspan, sweepspan);
1110
1111 /*
1112 » uint32 begin = runtime·mheap.nspan / work.nproc * tid;
1113 » uint32 end = runtime·mheap.nspan / work.nproc * (tid+1);
1114 » if(tid==work.nproc-1)
1115 » » end = runtime·mheap.nspan;
1116 » thr = &work.thr[tid];
1117 » runtime·atomicstore64(&thr->spanpos, (uint64)begin | ((uint64)end)<<32);
1118 //runtime·printf("thr #%d(%d): %d-%d (%d)\n", tid, work.nproc, begin, end, runtime·mheap.nspan);
1119
1120 » for(;;) {
1121 » » //spanpos = runtime·atomicload64(&thr->spanpos);
1122 » » //begin = (uint32)spanpos;
1123 » » //end = (uint32)(spanpos>>32);
1124 » » //runtime·printf("thr #%d(%d): starting with %d-%d:%d\n", tid, work.nproc, begin, end, end-begin);
1125
1126 » » for(;;) {
1127 » » » spanpos = runtime·xadd64(&thr->spanpos, 1);
1128 » » » begin = (uint32)spanpos-1;
1129 » » » end = (uint32)(spanpos>>32);
1130 » » » if(begin>=end)
1131 » » » » break;
1132
1133 » » » sweepspan(begin);
1134 » » }
1135
1136 » » if(work.nproc==1)
1137 » » » return;
1138
1139 » » uint32 try;
1140 » » bool idle = false;
1141 » » for(try=0;; try++) {
1142 » » » if(try > work.nproc*4) {
1143 » » » » if(idle == false) {
1144 » » » » » runtime·xadd(&work.sweepdone, 1);
1145 » » » » » idle = true;
1146 » » » » }
1147 » » » » if(work.sweepdone == work.nproc)
1148 » » » » » return;
1149 » » » }
1150 » » » uint32 victim = runtime·fastrand1() % (work.nproc-1);
1151 » » » if(victim >= tid)
1152 » » » » victim++;
1153 » » » spanpos = runtime·atomicload64(&work.thr[victim].spanpos);
1154 » » » //begin = (uint32)spanpos;
1155 » » » //end = (uint32)(spanpos>>32);
1156 » » » //runtime·printf("thr #%d(%d): stealing from %d (%d-%d:%d)\n", tid, work.nproc, victim, begin, end, end-begin);
1157 » » » for(;;) {
1158 » » » » begin = (uint32)spanpos;
1159 » » » » end = (uint32)(spanpos>>32);
1160 » » » » if(begin >= end-1) {
1161 » » » » » begin = end = 0;
1162 » » » » » break;
1163 » » » » }
1164 » » » » if(idle) {
1165 » » » » » runtime·xadd(&work.sweepdone, -1);
1166 » » » » » idle = false;
1167 » » » » }
1168 » » » » uint32 mybegin = begin + (end-begin)/2;
1169 » » » » if(runtime·cas64(&work.thr[victim].spanpos, &spanpos, (uint64)begin | (((uint64)mybegin)<<32))) {
1170 » » » » » begin = mybegin;
1171 » » » » » break;
1172 » » » » }
1173 » » » }
1174 » » » if(begin < end) {
1175 » » » » if(idle)
1176 » » » » » runtime·xadd(&work.sweepdone, -1);
1177 » » » » runtime·atomicstore64(&thr->spanpos, (uint64)begin | ((uint64)end)<<32);
1178 » » » » m->gcstats.nspansteals += 1;
1179 » » » » m->gcstats.nspanstolen += end-begin;
1180 » » » » break;
1181 » » » }
1182 » » » if(try<work.nproc) {
1183 » » » } else if (try<work.nproc*2) {
1184 » » » » runtime·procyield(20);
1185 » » » } else if (try<work.nproc*4) {
1186 » » » » runtime·osyield();
1187 » » » } else {
1188 » » » » runtime·usleep(1);
1189 » » » }
1190 » » }
1191 » }
1192 » */
1193 } 825 }
1194 826
1195 void 827 void
1196 runtime·gchelper(void) 828 runtime·gchelper(void)
1197 { 829 {
1198 » uint32 tid; 830 » // parallel mark for over gc roots
1199 831 » parfor(&work.markfor);
1200 » tid = runtime·xadd(&work.tidseq, 1); 832 » // help other threads scan secondary blocks
1201 833 » scanblock(nil, 0);
1202 » parmark(tid); 834
1203 » //scanblock(nil, 0);
1204 if(DebugMark) { 835 if(DebugMark) {
1205 » » while(runtime·atomicload(&work.markdone) == 0) 836 » » // wait while the main thread executes mark(debug_scanblock)
837 » » while(runtime·atomicload(&work.debugmarkdone) == 0)
1206 runtime·usleep(10); 838 runtime·usleep(10);
1207 } 839 }
1208 840
1209 » sweep(tid); 841 » // parallel sweep for over all spans
842 » parfor(&work.sweepfor);
1210 843
1211 if(runtime·xadd(&work.ndone, +1) == work.nproc-1) 844 if(runtime·xadd(&work.ndone, +1) == work.nproc-1)
1212 runtime·notewakeup(&work.alldone); 845 runtime·notewakeup(&work.alldone);
1213 } 846 }
1214
1215 // Semaphore, not Lock, so that the goroutine
1216 // reschedules when there is contention rather
1217 // than spinning.
1218 static uint32 gcsema = 1;
1219 847
1220 // Initialized from $GOGC. GOGC=off means no gc. 848 // Initialized from $GOGC. GOGC=off means no gc.
1221 // 849 //
1222 // Next gc is after we've allocated an extra amount of 850 // Next gc is after we've allocated an extra amount of
1223 // memory proportional to the amount already in use. 851 // memory proportional to the amount already in use.
1224 // If gcpercent=100 and we're using 4M, we'll gc again 852 // If gcpercent=100 and we're using 4M, we'll gc again
1225 // when we get to 8M. This keeps the gc cost in linear 853 // when we get to 8M. This keeps the gc cost in linear
1226 // proportion to the allocation cost. Adjusting gcpercent 854 // proportion to the allocation cost. Adjusting gcpercent
1227 // just changes the linear constant (and also the amount of 855 // just changes the linear constant (and also the amount of
1228 // extra memory used). 856 // extra memory used).
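
Concretely, the trigger is the single assignment made near the end of runtime·gc below; a worked example under the default GOGC=100, using the 4M/8M figures from the comment above:

// next_gc = live heap after collection + live heap * gcpercent/100
mstats.next_gc = mstats.heap_alloc + mstats.heap_alloc*gcpercent/100;

// With gcpercent = 100 and 4 MB live: next_gc = 4 MB + 4 MB*100/100 = 8 MB.
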
(...skipping 43 matching lines...)
1272 } 900 }
1273 mstats.stacks_inuse = stacks_inuse; 901 mstats.stacks_inuse = stacks_inuse;
1274 mstats.stacks_sys = stacks_sys; 902 mstats.stacks_sys = stacks_sys;
1275 } 903 }
1276 904
1277 void 905 void
1278 runtime·gc(int32 force) 906 runtime·gc(int32 force)
1279 { 907 {
1280 int64 t0, t1, t2, t3; 908 int64 t0, t1, t2, t3;
1281 uint64 heap0, heap1, obj0, obj1; 909 uint64 heap0, heap1, obj0, obj1;
910 GCStats stats;
911 uint32 i;
1282 byte *p; 912 byte *p;
1283 GCStats stats;
1284 913
1285 // The gc is turned off (via enablegc) until 914 // The gc is turned off (via enablegc) until
1286 // the bootstrap has completed. 915 // the bootstrap has completed.
1287 // Also, malloc gets called in the guts 916 // Also, malloc gets called in the guts
1288 // of a number of libraries that might be 917 // of a number of libraries that might be
1289 // holding locks. To avoid priority inversion 918 // holding locks. To avoid priority inversion
1290 // problems, don't bother trying to run gc 919 // problems, don't bother trying to run gc
1291 // while holding a lock. The next mallocgc 920 // while holding a lock. The next mallocgc
1292 // without a lock will do the gc instead. 921 // without a lock will do the gc instead.
1293 if(!mstats.enablegc || m->locks > 0 || runtime·panicking) 922 if(!mstats.enablegc || m->locks > 0 || runtime·panicking)
1294 return; 923 return;
1295 924
1296 if(gcpercent == -2) { // first time through 925 if(gcpercent == -2) { // first time through
1297 p = runtime·getenv("GOGC"); 926 p = runtime·getenv("GOGC");
1298 if(p == nil || p[0] == '\0') 927 if(p == nil || p[0] == '\0')
1299 gcpercent = 100; 928 gcpercent = 100;
1300 else if(runtime·strcmp(p, (byte*)"off") == 0) 929 else if(runtime·strcmp(p, (byte*)"off") == 0)
1301 gcpercent = -1; 930 gcpercent = -1;
1302 else 931 else
1303 gcpercent = runtime·atoi(p); 932 gcpercent = runtime·atoi(p);
1304 933
1305 p = runtime·getenv("GOGCTRACE"); 934 p = runtime·getenv("GOGCTRACE");
1306 if(p != nil) 935 if(p != nil)
1307 gctrace = runtime·atoi(p); 936 gctrace = runtime·atoi(p);
1308 } 937 }
1309 if(gcpercent < 0) 938 if(gcpercent < 0)
1310 return; 939 return;
1311 940
1312 » runtime·semacquire(&gcsema); 941 » runtime·semacquire(&runtime·worldsema);
1313 if(!force && mstats.heap_alloc < mstats.next_gc) { 942 if(!force && mstats.heap_alloc < mstats.next_gc) {
1314 » » runtime·semrelease(&gcsema); 943 » » runtime·semrelease(&runtime·worldsema);
1315 return; 944 return;
1316 } 945 }
1317 946
1318 t0 = runtime·nanotime(); 947 t0 = runtime·nanotime();
1319 948
1320 m->gcing = 1; 949 m->gcing = 1;
1321 runtime·stoptheworld(); 950 runtime·stoptheworld();
1322 951
1323 heap0 = 0; 952 heap0 = 0;
1324 obj0 = 0; 953 obj0 = 0;
1325 if(gctrace) { 954 if(gctrace) {
1326 cachestats(0); 955 cachestats(0);
1327 heap0 = mstats.heap_alloc; 956 heap0 = mstats.heap_alloc;
1328 obj0 = mstats.nmalloc - mstats.nfree; 957 obj0 = mstats.nmalloc - mstats.nfree;
1329 } 958 }
1330 959
1331 work.nproc = runtime·gcprocs(); 960 work.nproc = runtime·gcprocs();
1332 sweepfor.done = 0;
1333 markfor.done = 0;
1334
1335 work.nwait = 0; 961 work.nwait = 0;
1336 work.ndone = 0; 962 work.ndone = 0;
1337 » work.tidseq = 0; 963 » work.debugmarkdone = 0;
1338 » work.markdone = 0; 964 » addroots();
1339 » //work.sweepdone = 0; 965 » parforsetup(&work.markfor, markroot, work.nproc, work.nroot, nil, false);
1340 » collectroots(); 966 » parforsetup(&work.sweepfor, sweepspan, work.nproc, runtime·mheap.nspan, nil, true);
1341 if(work.nproc > 1) { 967 if(work.nproc > 1) {
1342 runtime·noteclear(&work.alldone); 968 runtime·noteclear(&work.alldone);
1343 » » runtime·helpgc(); 969 » » runtime·helpgc(work.nproc);
1344 » } 970 » }
1345 971
1346 //runtime·printf("mark\n"); 972 » parfor(&work.markfor);
1347 » parmark(0); 973 » scanblock(nil, 0);
1348 974
1349 » //mark(scanblock);
1350 if(DebugMark) { 975 if(DebugMark) {
1351 » » mark(debug_scanblock); 976 » » for(i=0; i<work.nroot; i++)
1352 » » runtime·xchg(&work.markdone, 1); 977 » » » debug_scanblock(work.roots[i].p, work.roots[i].n);
978 » » runtime·xchg(&work.debugmarkdone, 1);
1353 } 979 }
1354 t1 = runtime·nanotime(); 980 t1 = runtime·nanotime();
1355 981 » // parallel sweep for over all spans
1356 //runtime·printf("sweep\n"); 982 » parfor(&work.sweepfor);
1357 » sweep(0);
1358 » if(work.nproc > 1)
1359 » » runtime·notesleep(&work.alldone);
1360 t2 = runtime·nanotime(); 983 t2 = runtime·nanotime();
1361 984
1362 stealcache(); 985 stealcache();
1363 cachestats(&stats); 986 cachestats(&stats);
1364 987
1365 mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*gcpercent/100; 988 mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*gcpercent/100;
1366 m->gcing = 0; 989 m->gcing = 0;
1367 990
1368 m->locks++; // disable gc during the mallocs in newproc
1369 if(finq != nil) { 991 if(finq != nil) {
992 m->locks++; // disable gc during the mallocs in newproc
1370 // kick off or wake up goroutine to run queued finalizers 993 // kick off or wake up goroutine to run queued finalizers
1371 if(fing == nil) 994 if(fing == nil)
1372 fing = runtime·newproc1((byte*)runfinq, nil, 0, 0, runtime·gc); 995 fing = runtime·newproc1((byte*)runfinq, nil, 0, 0, runtime·gc);
1373 else if(fingwait) { 996 else if(fingwait) {
1374 fingwait = 0; 997 fingwait = 0;
1375 runtime·ready(fing); 998 runtime·ready(fing);
1376 } 999 }
1377 » } 1000 » » m->locks--;
1378 » m->locks--; 1001 » }
1002 »·······
1003 » if(work.nproc > 1)
1004 » » runtime·notesleep(&work.alldone);
1379 1005
1380 heap1 = mstats.heap_alloc; 1006 heap1 = mstats.heap_alloc;
1381 obj1 = mstats.nmalloc - mstats.nfree; 1007 obj1 = mstats.nmalloc - mstats.nfree;
1382 1008
1383 t3 = runtime·nanotime(); 1009 t3 = runtime·nanotime();
1010 mstats.last_gc = t3;
1384 mstats.pause_ns[mstats.numgc%nelem(mstats.pause_ns)] = t3 - t0; 1011 mstats.pause_ns[mstats.numgc%nelem(mstats.pause_ns)] = t3 - t0;
1385 mstats.pause_total_ns += t3 - t0; 1012 mstats.pause_total_ns += t3 - t0;
1386 mstats.numgc++; 1013 mstats.numgc++;
1387 if(mstats.debuggc) 1014 if(mstats.debuggc)
1388 runtime·printf("pause %D\n", t3-t0); 1015 runtime·printf("pause %D\n", t3-t0);
1389 1016
1390 if(gctrace) { 1017 if(gctrace) {
1391 » » runtime·printf("gc%d(%d): %D+%D+%D ms %D -> %D MB %D -> %D (%D-%D) objects %D pointer lookups (%D size, %D addr) %D(%D) handoff, scan block/word/ptr: %D/%D/%D\n", 1018 » » runtime·printf("gc%d(%d): %D+%D+%D ms %D -> %D MB %D -> %D (%D-%D) objects, %D(%D) steals, %D(%D) handoffs, %D/%D/%D yields\n",
1392 mstats.numgc, work.nproc, (t1-t0)/1000000, (t2-t1)/1000000, (t3-t2)/1000000, 1019 mstats.numgc, work.nproc, (t1-t0)/1000000, (t2-t1)/1000000, (t3-t2)/1000000,
1393 heap0>>20, heap1>>20, obj0, obj1, 1020 heap0>>20, heap1>>20, obj0, obj1,
1394 mstats.nmalloc, mstats.nfree, 1021 mstats.nmalloc, mstats.nfree,
1395 » » » stats.nsizelookup+stats.naddrlookup, stats.nsizelookup, stats.naddrlookup, 1022 » » » stats.nsteal, stats.nstealcnt,
1396 stats.nhandoff, stats.nhandoffcnt, 1023 stats.nhandoff, stats.nhandoffcnt,
1397 » » » stats.nscanblocks, stats.nscanwords, stats.nscanptr); 1024 » » » stats.nprocyield, stats.nosyield, stats.nsleep);
1398 » » runtime·printf(" sweep: span total/full/large %D/%D/%D steal count/amount %D/%D\n", 1025 » }
1399 » » » runtime·mheap.nspan, stats.nspanfull, stats.nspanlarge, stats.nspansteals, stats.nspanstolen); 1026 »·······
1400 » } 1027 » runtime·MProf_GC();
1401 1028 » runtime·semrelease(&runtime·worldsema);
1402 » runtime·semrelease(&gcsema);
1403 runtime·starttheworld(); 1029 runtime·starttheworld();
1404 1030
1405 // give the queued finalizers, if any, a chance to run 1031 // give the queued finalizers, if any, a chance to run
1406 if(finq != nil) 1032 if(finq != nil)
1407 runtime·gosched(); 1033 runtime·gosched();
1408 1034
1409 if(gctrace > 1 && !force) 1035 if(gctrace > 1 && !force)
1410 runtime·gc(1); 1036 runtime·gc(1);
1411 } 1037 }
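
A note on the pacing above, with illustrative numbers rather than measurements from this CL: under the default GOGC=100 the next collection is scheduled at double the live heap, since next_gc = heap_alloc + heap_alloc*gcpercent/100. A minimal sketch, assuming a 4 MB live heap after sweeping:

	// hypothetical values, for illustration only
	heap_alloc = 4<<20;                               // 4 MB live after this gc
	next_gc = heap_alloc + heap_alloc*gcpercent/100;  // 8 MB with GOGC=100, 12 MB with GOGC=200

GOGC=off sets gcpercent to -1, so the early return above skips collection entirely. With GOGCTRACE set, the new trace format would produce a line shaped roughly like (made-up values): gc5(2): 1+3+0 ms 4 -> 2 MB 60000 -> 35000 (180000-145000) objects, 12(340) steals, 5(120) handoffs, 40/10/3 yields.
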
1412 1038
1413 void 1039 void
1414 runtime·UpdateMemStats(void) 1040 runtime·ReadMemStats(MStats *stats)
1415 { 1041 {
1416 » // Have to acquire gcsema to stop the world, 1042 » // Have to acquire worldsema to stop the world,
1417 // because stoptheworld can only be used by 1043 // because stoptheworld can only be used by
1418 // one goroutine at a time, and there might be 1044 // one goroutine at a time, and there might be
1419 // a pending garbage collection already calling it. 1045 // a pending garbage collection already calling it.
1420 » runtime·semacquire(&gcsema); 1046 » runtime·semacquire(&runtime·worldsema);
1421 m->gcing = 1; 1047 m->gcing = 1;
1422 runtime·stoptheworld(); 1048 runtime·stoptheworld();
1423 cachestats(0); 1049 cachestats(0);
1050 *stats = mstats;
1424 m->gcing = 0; 1051 m->gcing = 0;
1425 » runtime·semrelease(&gcsema); 1052 » runtime·semrelease(&runtime·worldsema);
1426 runtime·starttheworld(); 1053 runtime·starttheworld();
1427 } 1054 }
1428 1055
1429 static void 1056 static void
1430 runfinq(void) 1057 runfinq(void)
1431 { 1058 {
1432 Finalizer *f; 1059 Finalizer *f;
1433 FinBlock *fb, *next; 1060 FinBlock *fb, *next;
1434 byte *frame; 1061 byte *frame;
1435 uint32 framesz, framecap, i; 1062 uint32 framesz, framecap, i;
(...skipping 235 matching lines...)
1671 uintptr n; 1298 uintptr n;
1672 1299
1673 n = (h->arena_used - h->arena_start) / wordsPerBitmapWord; 1300 n = (h->arena_used - h->arena_start) / wordsPerBitmapWord;
1674 n = (n+bitmapChunk-1) & ~(bitmapChunk-1); 1301 n = (n+bitmapChunk-1) & ~(bitmapChunk-1);
1675 if(h->bitmap_mapped >= n) 1302 if(h->bitmap_mapped >= n)
1676 return; 1303 return;
1677 1304
1678 runtime·SysMap(h->arena_start - n, n - h->bitmap_mapped); 1305 runtime·SysMap(h->arena_start - n, n - h->bitmap_mapped);
1679 h->bitmap_mapped = n; 1306 h->bitmap_mapped = n;
1680 } 1307 }
1308
1309 #ifdef _64BIT
1310 // Amd64 uses 48-bit virtual addresses, 47-th bit is used as kernel/user flag.
1311 // So we use 17msb of pointers as ABA counter.
1312 # define PTR_BITS 47
1313 #else
1314 # define PTR_BITS 32
1315 #endif
1316 #define PTR_MASK ((1ull<<PTR_BITS)-1)
1317
1318 static void
1319 lifopush(uint64 *a, Workbuf *b)
1320 {
1321 uint64 old, new;
1322
1323 if((uint64)b != ((uint64)b&PTR_MASK)) {
1324 runtime·printf("p=%p\n", b);
1325 runtime·throw("lifopush: invalid pointer");
1326 }
1327
1328 if(work.nproc == 1) {
1329 b->next = (Workbuf*)(*a&PTR_MASK);
1330 *a = (uint64)b;
1331 return;
1332 }
1333
1334 b->pushcnt++;
1335 new = (uint64)b|(((uint64)b->pushcnt)<<PTR_BITS);
1336 old = runtime·atomicload64(a);
1337 for(;;) {
1338 b->next = (Workbuf*)(old&PTR_MASK);
1339 if(runtime·cas64(a, &old, new))
1340 break;
1341 }
1342 }
1343
1344 static Workbuf*
1345 lifopop(uint64 *a)
1346 {
1347 Workbuf *b, *b2;
1348 uint64 old, new;
1349
1350 if(work.nproc == 1) {
1351 b = (Workbuf*)(*a&PTR_MASK);
1352 if(b == nil)
1353 return nil;
1354 *a = (uint64)b->next;
1355 return b;
1356 }
1357
1358 old = runtime·atomicload64(a);
1359 for(;;) {
1360 if(old == 0)
1361 return nil;
1362 b = (Workbuf*)(old&PTR_MASK);
1363 b2 = runtime·atomicloadp(&b->next);
1364 new = 0;
1365 if(b2 != nil)
1366 new = (uint64)b2|(((uint64)b2->pushcnt)<<PTR_BITS);
1367 if(runtime·cas64(a, &old, new))
1368 return b;
1369 }
1370 }
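
The stack above sidesteps the ABA problem by tagging each 48-bit Workbuf pointer with a 17-bit push counter in the otherwise unused high bits. A minimal sketch of the encoding (helper names are hypothetical, not part of this change):

	// pack a Workbuf pointer together with its push counter into one word
	static uint64
	taggedptr(Workbuf *b)
	{
		return (uint64)b | (((uint64)b->pushcnt)<<PTR_BITS);
	}

	// recover the plain pointer from a tagged word
	static Workbuf*
	ptrof(uint64 tagged)
	{
		return (Workbuf*)(tagged&PTR_MASK);
	}

Because the counter advances on every push, a cas64 racing with a pop followed by a re-push of the same buffer fails instead of silently succeeding on a stale head, which is what the retry loops in lifopush and lifopop rely on.
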
1371
1372 void
1373 runtime·CTestLockFreeStack(bool isShort)
1374 {
1375 uint64 stack;
1376 Workbuf *b;
1377
1378 USED(isShort);
1379
1380 for(work.nproc=1; work.nproc<=2; work.nproc++) {
1381 // check the stack is initially empty
1382 stack = 0;
1383 if(lifopop(&stack) != nil)
1384 runtime·panicstring("stack is not empty");
1385
1386 // push one element
1387 b = (Workbuf*)runtime·mallocgc(sizeof(Workbuf), FlagNoGC, 0, 0);
1388 b->nobj = 42;
1389 lifopush(&stack, b);
1390
1391 // push another
1392 b = (Workbuf*)runtime·mallocgc(sizeof(Workbuf), FlagNoGC, 0, 0);
1393 b->nobj = 43;
1394 lifopush(&stack, b);
1395
1396 // pop one element
1397 b = lifopop(&stack);
1398 if(b == nil)
1399 runtime·panicstring("stack is empty");
1400 if(b->nobj != 43)
1401 runtime·panicstring("no lifo");
1402 runtime·free(b);
1403
1404 // pop another
1405 b = lifopop(&stack);
1406 if(b == nil)
1407 runtime·panicstring("stack is empty");
1408 if(b->nobj != 42)
1409 runtime·panicstring("no lifo");
1410 runtime·free(b);
1411
1412 // check the stack is empty again
1413 if(stack != 0)
1414 runtime·panicstring("stack is not empty");
1415 if(lifopop(&stack) != nil)
1416 runtime·panicstring("stack is not empty");
1417 }
1418 }
1419
1420 typedef struct StackTestCtx StackTestCtx;
1421 struct StackTestCtx
1422 {
1423 uint32 niter;
1424 uint32 waitsema;
1425 uint64 stack[2];
1426 };
1427
1428 static void
1429 stackTestProc(StackTestCtx *ctx)
1430 {
1431 int32 i, n;
1432 Workbuf *b;
1433
1434 for(i=0; i<ctx->niter; i++) {
1435 n = runtime·fastrand1()%2;
1436 b = lifopop(&ctx->stack[n]);
1437 if(b==nil)
1438 continue;
1439 n = runtime·fastrand1()%2;
1440 lifopush(&ctx->stack[n], b);
1441 }
1442 runtime·semrelease(&ctx->waitsema);
1443 }
1444
1445 void
1446 runtime·CTestLockFreeStackStress(bool isShort)
1447 {
1448 StackTestCtx ctx, *arg;
1449 Workbuf *b;
1450 int32 i, sum, sum2, cnt, procs;
1451 const int32 N = 100;
1452 const int32 P = 8;
1453 const int32 G = 16;
1454
1455 procs = runtime·gomaxprocsfunc(P);
1456 work.nproc = P;
1457 ctx.niter = isShort ? 10000 : 100000;
1458 ctx.waitsema = 0;
1459 ctx.stack[0] = 0;
1460 ctx.stack[1] = 0;
1461 arg = &ctx;
1462 sum = 0;
1463 for(i=0; i<N; i++) {
1464 b = (Workbuf*)runtime·mallocgc(sizeof(Workbuf), FlagNoGC, 0, 0);
1465 b->nobj = i;
1466 sum += i;
1467 lifopush(&ctx.stack[i%2], b);
1468 }
1469 m->locks++; // disable gc during the mallocs (gc resets work.nproc)
1470 for(i=0; i<G; i++)
1471 runtime·newproc1((byte*)stackTestProc, (byte*)&arg, sizeof(arg), 0, runtime·CTestLockFreeStackStress);
1472 m->locks--;
1473 for(i=0; i<G; i++)
1474 runtime·semacquire(&ctx.waitsema);
1475 sum2 = 0;
1476 cnt = 0;
1477 for(i=0; i<2; i++) {
1478 while((b=lifopop(&ctx.stack[i])) != nil) {
1479 cnt++;
1480 sum2 += b->nobj;
1481 b->next = (Workbuf*)123;
1482 runtime·free(b);
1483 }
1484 }
1485 if(cnt != N || sum2 != sum)
1486 runtime·panicstring("stack corruption");
1487 runtime·gomaxprocsfunc(procs);
1488 }
1489
1490 static void
1491 parforsetup(Parfor *desc, void (*body)(Parfor*, uint32), uint32 nthr, uint32 n, void *ctx, bool wait)
1492 {
1493 uint32 i, d, begin, end;
1494
1495 if(nthr == 0 || nthr > MaxGcproc || body == nil) {
1496 runtime·printf("nthr=%d count=%d body=%p\n",
1497 nthr, n, body);
1498 runtime·throw("parallel for: invalid args");
1499 }
1500
1501 desc->body = body;
1502 desc->done = 0;
1503 desc->nthr = nthr;
1504 desc->thrseq = 0;
1505 desc->cnt = n;
1506 desc->ctx = ctx;
1507 desc->wait = wait;
1508 for(i=0; i<nthr; i++) {
1509 begin = n / nthr * i;
1510 end = n / nthr * (i+1);
1511 d = n % nthr;
1512 if(i < d) {
1513 begin += i;
1514 end += (i+1);
1515 } else {
1516 begin += d;
1517 end += d;
1518 }
1519 desc->thr[i].pos = (uint64)begin | (((uint64)end)<<32);
1520 }
1521 }
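
A worked example of the split above, with numbers chosen purely for illustration: for n=10 and nthr=3, n/nthr is 3 and d is 1, so thread 0 receives [0,4), thread 1 receives [4,7), and thread 2 receives [7,10). Every thread gets either floor(n/nthr) or floor(n/nthr)+1 iterations and the ranges tile [0,n) exactly, which is the property CTestParforSetup below checks.
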
1522
1523 static void
1524 parfor(Parfor *desc)
1525 {
1526 uint32 tid, begin, end, begin2, try, victim, i;
1527 uint64 *mypos, pos;
1528 bool idle;
1529
1530 // obtain 0-based thread index
1531 tid = runtime·xadd(&desc->thrseq, 1) - 1;
1532 if(tid >= desc->nthr) {
1533 runtime·throw("parallel for: invalid tid");
1534 }
1535
1536 // if single-threaded, just execute the for serially
1537 if(desc->nthr==1) {
1538 for(i=0; i<desc->cnt; i++)
1539 desc->body(desc, i);
1540 return;
1541 }
1542
1543 mypos = &desc->thr[tid].pos;
1544 for(;;) {
1545 for(;;) {
1546 // while have work, bump low index and execute the iteration
1547 pos = runtime·xadd64(mypos, 1);
1548 begin = (uint32)pos-1;
1549 end = (uint32)(pos>>32);
1550 if(begin>=end)
1551 break;
1552 desc->body(desc, begin);
1553 }
1554
1555 // out of work, need to steal something
1556 idle = false;
1557 for(try=0;; try++) {
1558 // if we don't see any work for long enough,
1559 // increment the done counter...
1560 if(try > desc->nthr*4 && !idle) {
1561 idle = true;
1562 runtime·xadd(&desc->done, 1);
1563 }
1564 // ...if all threads have incremented the counter,
1565 // we are done
1566 if(desc->done + !idle == desc->nthr) {
1567 if(!idle)
1568 runtime·xadd(&desc->done, 1);
1569 return;
1570 }
1571 // choose a random victim for stealing
1572 victim = runtime·fastrand1() % (desc->nthr-1);
1573 if(victim >= tid)
1574 victim++;
1575 pos = runtime·atomicload64(&desc->thr[victim].pos);
1576 for(;;) {
1577 // see if it has any work
1578 begin = (uint32)pos;
1579 end = (uint32)(pos>>32);
1580 if(begin >= end-1) {
1581 begin = end = 0;
1582 break;
1583 }
1584 if(idle) {
1585 runtime·xadd(&desc->done, -1);
1586 idle = false;
1587 }
1588 begin2 = begin + (end-begin)/2;
1589 if(runtime·cas64(&desc->thr[victim].pos, &pos, (uint64)begin | (((uint64)begin2)<<32))) {
1590 begin = begin2;
1591 break;
1592 }
1593 }
1594 if(begin < end) {
1595 // has successfully stolen some work
1596 if(idle)
1597 runtime·throw("parfor: should not be idle");
1598 runtime·atomicstore64(mypos, (uint64)begin | ((u int64)end)<<32);
1599 m->gcstats.nsteal += 1;
1600 m->gcstats.nstealcnt += end-begin;
1601 break;
1602 }
1603 // backoff
1604 if(try < desc->nthr) {
1605 } else if (try < desc->nthr*4) {
1606 m->gcstats.nprocyield++;
1607 runtime·procyield(20);
1608 // if a user asked to not wait for others, exit now
1609 // (assume that most work is already done)
1610 } else if (!desc->wait) {
1611 if(!idle)
1612 runtime·xadd(&desc->done, 1);
1613 return;
1614 } else if (try < desc->nthr*6) {
1615 m->gcstats.nosyield++;
1616 runtime·osyield();
1617 } else {
1618 m->gcstats.nsleep++;
1619 runtime·usleep(1);
1620 }
1621 }
1622 }
1623 }
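
A worked example of the stealing path, with illustrative numbers: if a victim's packed position holds begin=8, end=16, the thief computes begin2 = 8 + (16-8)/2 = 12 and tries to cas64 the victim's word from (8,16) to (8,12); on success the victim keeps [8,12) and the thief installs [12,16) in its own slot, so each successful steal takes half of the remaining range. Termination is detected through the done counter: a thread increments it only after failing to find work for a while, decrements it again if it later steals something, and the loop returns once all nthr threads have counted themselves done.
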
1624
1625 static void
1626 testParforBody(Parfor *desc, uint32 i)
1627 {
1628 uint64 *data;
1629
1630 data = (uint64*)desc->ctx;
1631 data[i] *= data[i];
1632 }
1633
1634 static void
1635 testParforProc(Parfor *desc)
1636 {
1637 parfor(desc);
1638 }
1639
1640 // Simple serial sanity test for parallelfor.
1641 void
1642 runtime·CTestParfor(bool isShort)
1643 {
1644 Parfor desc;
1645 uint64 *data;
1646 uint32 i, N;
1647
1648 USED(isShort);
1649
1650 N = 1000;
1651 data = (uint64*)runtime·mal(N*sizeof(uint64));
1652 for(i=0; i<N; i++)
1653 data[i] = i;
1654 parforsetup(&desc, testParforBody, 1, N, data, true);
1655 parfor(&desc);
1656 for(i=0; i<N; i++) {
1657 if(data[i] != i*i)
1658 runtime·panicstring("incorrect result");
1659 }
1660 runtime·free(data);
1661 }
1662
1663 // Test that iterations are properly distributed.
1664 void
1665 runtime·CTestParforSetup(bool isShort)
1666 {
1667 Parfor desc;
1668 uint32 n, t, i, begin, end, size, end0, size0, sum;
1669
1670 USED(isShort);
1671
1672 for(n=0; n<100; n++) {
1673 for(t=1; t<=MaxGcproc; t++) {
1674 parforsetup(&desc, testParforBody, t, n, 0, true);
1675 sum = 0;
1676 size0 = 0;
1677 end0 = 0;
1678 for(i=0; i<t; i++) {
1679 begin = (uint32)desc.thr[i].pos;
1680 end = (uint32)(desc.thr[i].pos>>32);
1681 size = end - begin;
1682 sum += size;
1683 if(i==0) {
1684 size0 = size;
1685 if(begin != 0)
1686 runtime·panicstring("incorrect begin");
1687 } else {
1688 if(size != size0 && size != size0-1)
1689 runtime·panicstring("incorrect size");
1690 if(begin != end0)
1691 runtime·panicstring("incorrect begin/end");
1692 }
1693 end0 = end;
1694 }
1695 if(sum != n)
1696 runtime·panicstring("incorrect sum");
1697 }
1698 }
1699 }
1700
1701 // Test that nonblocking parallelfor does not block.
1702 void
1703 runtime·CTestParforNonblock(bool isShort)
1704 {
1705 Parfor desc;
1706 uint64 *data;
1707 uint32 i, N;
1708
1709 USED(isShort);
1710
1711 N = 1000;
1712 data = (uint64*)runtime·mal(N*sizeof(uint64));
1713 for(i=0; i<N; i++)
1714 data[i] = i;
1715 parforsetup(&desc, testParforBody, MaxGcproc, N, data, false);
1716 for(i=0; i<MaxGcproc; i++)
1717 parfor(&desc);
1718 for(i=0; i<N; i++) {
1719 if(data[i] != i*i)
1720 runtime·panicstring("incorrect result");
1721 }
1722 runtime·free(data);
1723 }
1724
1725 // Test parallel parallelfor.
1726 void
1727 runtime·CTestParforParallel(bool isShort)
1728 {
1729 Parfor desc, *arg;
1730 uint64 *data;
1731 uint32 i, N;
1732 int32 procs;
1733
1734 procs = runtime·gomaxprocsfunc(MaxGcproc+1);
1735 N = 10000;
1736 if(isShort)
1737 N /= 10;
1738 data = (uint64*)runtime·mal(N*sizeof(uint64));
1739 for(i=0; i<N; i++)
1740 data[i] = i;
1741 parforsetup(&desc, testParforBody, MaxGcproc, N, data, true);
1742 arg = &desc;
1743 m->locks++; // disable gc during the mallocs in newproc
1744 for(i=1; i<MaxGcproc; i++)
1745 runtime·newproc1((byte*)testParforProc, (byte*)&arg, sizeof(arg), 0, runtime·CTestParforParallel);
1746 m->locks--;
1747 parfor(&desc);
1748 for(i=0; i<N; i++) {
1749 if(data[i] != i*i)
1750 runtime·panicstring("incorrect result");
1751 }
1752 runtime·free(data);
1753 runtime·gomaxprocsfunc(procs);
1754 }