OLD | NEW |
(Empty) | |
| 1 // Copyright 2011 The Go Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style |
| 3 // license that can be found in the LICENSE file. |
| 4 |
| 5 // Tests scheduler scalability on matrix multiplication workload with fork/join
style parallelism. |
| 6 // There are 3 variations as to how to join child goroutines: by means of |
| 7 // channels, by means of sync.WaitGroup or by means of direct shared memory noti
fications. |
| 8 // There are also 3 grain sizes: 32, 16 and 8, that relate to leaf tasks that do |
| 9 // 32x32, 16x16 and 8x8 matrix multiplication, respectively. |
 | 10 // Expected to scale almost linearly. |
| 11 |
| 12 package main |
| 13 |
| 14 import ( |
| 15 "container/vector" |
| 16 "runtime" |
| 17 "sync" |
| 18 ) |
| 19 |
// Matrix is a square matrix stored as a slice of row slices.
type Matrix [][]int

// makeMatrix allocates an n×n matrix with every cell zero.
func makeMatrix(n int) Matrix {
	rows := make(Matrix, n)
	for r := range rows {
		rows[r] = make([]int, n)
	}
	return rows
}
| 29 |
| 30 func makeMatrixInit(n int) Matrix { |
| 31 M := makeMatrix(n) |
| 32 for i := 0; i != len(M); i += 1 { |
| 33 for j := 0; j != len(M[i]); j += 1 { |
| 34 M[i][j] = i*len(M) + j |
| 35 } |
| 36 } |
| 37 return M |
| 38 } |
| 39 |
| 40 // Parallel recursive cache-oblivious matrix multiplication. |
| 41 // Synchronization is based on channels. |
| 42 func matmultParallelImplChan(sync chan<- int, A, B, C Matrix, threshold, i0, i1,
j0, j1, k0, k1 int) { |
| 43 di := i1 - i0 |
| 44 dj := j1 - j0 |
| 45 dk := k1 - k0 |
| 46 if di >= dj && di >= dk && di >= threshold { |
| 47 mi := i0 + di/2 |
| 48 sync0 := make(chan int, 1) |
| 49 go matmultParallelImplChan(sync0, A, B, C, threshold, i0, mi, j0
, j1, k0, k1) |
| 50 matmultParallelImplChan(nil, A, B, C, threshold, mi, i1, j0, j1,
k0, k1) |
| 51 <-sync0 |
| 52 } else if dj >= dk && dj >= threshold { |
| 53 mj := j0 + dj/2 |
| 54 sync0 := make(chan int, 1) |
| 55 go matmultParallelImplChan(sync0, A, B, C, threshold, i0, i1, j0
, mj, k0, k1) |
| 56 matmultParallelImplChan(nil, A, B, C, threshold, i0, i1, mj, j1,
k0, k1) |
| 57 <-sync0 |
| 58 } else if dk >= threshold { |
| 59 mk := k0 + dk/2 |
| 60 matmultParallelImplChan(nil, A, B, C, threshold, i0, i1, j0, j1,
k0, mk) |
| 61 matmultParallelImplChan(nil, A, B, C, threshold, i0, i1, j0, j1,
mk, k1) |
| 62 } else { |
| 63 for i := 0; i != 1000; i += 1 { |
| 64 } |
| 65 for i := i0; i < i1; i += 1 { |
| 66 for j := j0; j < j1; j += 1 { |
| 67 for k := k0; k < k1; k += 1 { |
| 68 C[i][j] += A[i][k] * B[k][j] |
| 69 } |
| 70 } |
| 71 } |
| 72 } |
| 73 if sync != nil { |
| 74 sync <- 0 |
| 75 } |
| 76 } |
| 77 |
// Parallel recursive cache-oblivious matrix multiplication.
// Synchronization is based on *racy* reads and writes (formally it's incorrect).
// The parent spin-waits on a plain bool that the child sets with no
// happens-before edge, so under the Go memory model neither termination
// of the wait loop nor visibility of the child's writes to C is
// guaranteed. This is deliberate — the file benchmarks it as the
// "fast but incorrect" join strategy — so do not "fix" it.
func matmultParallelImplSync(sync *bool, A, B, C Matrix, threshold, i0, i1, j0, j1, k0, k1 int) {
	di := i1 - i0
	dj := j1 - j0
	dk := k1 - k0
	if di >= dj && di >= dk && di >= threshold {
		// Halve the i dimension; the first half runs in a child goroutine
		// that reports completion through the racy sync0 flag.
		mi := i0 + di/2
		sync0 := false
		go matmultParallelImplSync(&sync0, A, B, C, threshold, i0, mi, j0, j1, k0, k1)
		matmultParallelImplSync(nil, A, B, C, threshold, mi, i1, j0, j1, k0, k1)
		// Busy-wait for the child, yielding the processor between polls.
		for {
			if sync0 == true {
				break
			}
			runtime.Gosched()
		}
	} else if dj >= dk && dj >= threshold {
		// Halve the j dimension, same forking and busy-wait scheme.
		mj := j0 + dj/2
		sync0 := false
		go matmultParallelImplSync(&sync0, A, B, C, threshold, i0, i1, j0, mj, k0, k1)
		matmultParallelImplSync(nil, A, B, C, threshold, i0, i1, mj, j1, k0, k1)
		for {
			if sync0 == true {
				break
			}
			runtime.Gosched()
		}
	} else if dk >= threshold {
		// Both k halves accumulate into the same C cells: run sequentially.
		mk := k0 + dk/2
		matmultParallelImplSync(nil, A, B, C, threshold, i0, i1, j0, j1, k0, mk)
		matmultParallelImplSync(nil, A, B, C, threshold, i0, i1, j0, j1, mk, k1)
	} else {
		// Leaf task: direct triple-loop multiply of the sub-block.
		for i := i0; i < i1; i += 1 {
			for j := j0; j < j1; j += 1 {
				for k := k0; k < k1; k += 1 {
					C[i][j] += A[i][k] * B[k][j]
				}
			}
		}
	}
	// Racy completion flag for the parent (nil at the recursion root).
	if sync != nil {
		*sync = true
	}
}
| 123 |
| 124 // Parallel recursive cache-oblivious matrix multiplication. |
| 125 // Synchronization is based on semaphores. |
| 126 func matmultParallelImplWg(wg *sync.WaitGroup, A, B, C Matrix, threshold, i0, i1
, j0, j1, k0, k1 int) { |
| 127 di := i1 - i0 |
| 128 dj := j1 - j0 |
| 129 dk := k1 - k0 |
| 130 if di >= dj && di >= dk && di >= threshold { |
| 131 mi := i0 + di/2 |
| 132 wg0 := &sync.WaitGroup{} |
| 133 wg0.Add(1) |
| 134 go matmultParallelImplWg(wg0, A, B, C, threshold, i0, mi, j0, j1
, k0, k1) |
| 135 matmultParallelImplWg(nil, A, B, C, threshold, mi, i1, j0, j1, k
0, k1) |
| 136 wg0.Wait() |
| 137 } else if dj >= dk && dj >= threshold { |
| 138 mj := j0 + dj/2 |
| 139 wg0 := &sync.WaitGroup{} |
| 140 wg0.Add(1) |
| 141 go matmultParallelImplWg(wg0, A, B, C, threshold, i0, i1, j0, mj
, k0, k1) |
| 142 matmultParallelImplWg(nil, A, B, C, threshold, i0, i1, mj, j1, k
0, k1) |
| 143 wg0.Wait() |
| 144 } else if dk >= threshold { |
| 145 mk := k0 + dk/2 |
| 146 matmultParallelImplWg(nil, A, B, C, threshold, i0, i1, j0, j1, k
0, mk) |
| 147 matmultParallelImplWg(nil, A, B, C, threshold, i0, i1, j0, j1, m
k, k1) |
| 148 } else { |
| 149 for i := i0; i < i1; i += 1 { |
| 150 for j := j0; j < j1; j += 1 { |
| 151 for k := k0; k < k1; k += 1 { |
| 152 C[i][j] += A[i][k] * B[k][j] |
| 153 } |
| 154 } |
| 155 } |
| 156 } |
| 157 if wg != nil { |
| 158 wg.Done() |
| 159 } |
| 160 } |
| 161 |
| 162 func benchmarkMatmult(impl func(A, B, C Matrix, threshold, n int), threshold int
) { |
| 163 N := 128 |
| 164 A := makeMatrixInit(N) |
| 165 B := makeMatrixInit(N) |
| 166 C := makeMatrix(N) |
| 167 for benchmarking { |
| 168 impl(A, B, C, threshold, N) |
| 169 totalWork += uint64(N * N * N) |
| 170 } |
| 171 } |
| 172 |
| 173 // Idiomatic channel-based |
| 174 func benchmarkMatmultChan(threshold int) { |
| 175 impl := func(A, B, C Matrix, threshold, n int) { |
| 176 matmultParallelImplChan(nil, A, B, C, threshold, 0, n, 0, n, 0,
n) |
| 177 } |
| 178 benchmarkMatmult(impl, threshold) |
| 179 } |
| 180 |
| 181 // Fast but incorrect |
| 182 func benchmarkMatmultSync(threshold int) { |
| 183 impl := func(A, B, C Matrix, threshold, n int) { |
| 184 matmultParallelImplSync(nil, A, B, C, threshold, 0, n, 0, n, 0,
n) |
| 185 } |
| 186 benchmarkMatmult(impl, threshold) |
| 187 } |
| 188 |
| 189 // Correct but slow (WaitGroup-based) |
| 190 func benchmarkMatmultWg(threshold int) { |
| 191 impl := func(A, B, C Matrix, threshold, n int) { |
| 192 matmultParallelImplWg(nil, A, B, C, threshold, 0, n, 0, n, 0, n) |
| 193 } |
| 194 benchmarkMatmult(impl, threshold) |
| 195 } |
| 196 |
| 197 func SuiteMatmult(benchmarks *vector.Vector) { |
| 198 benchmarks.Push(&Benchmark{"matmult-chan32", func() { |
| 199 benchmarkMatmultChan(32) |
| 200 }}) |
| 201 benchmarks.Push(&Benchmark{"matmult-chan16", func() { |
| 202 benchmarkMatmultChan(16) |
| 203 }}) |
| 204 benchmarks.Push(&Benchmark{"matmult-chan8", func() { |
| 205 benchmarkMatmultChan(8) |
| 206 }}) |
| 207 |
| 208 benchmarks.Push(&Benchmark{"matmult-sync32", func() { |
| 209 benchmarkMatmultSync(32) |
| 210 }}) |
| 211 benchmarks.Push(&Benchmark{"matmult-sync16", func() { |
| 212 benchmarkMatmultSync(16) |
| 213 }}) |
| 214 benchmarks.Push(&Benchmark{"matmult-sync8", func() { |
| 215 benchmarkMatmultSync(8) |
| 216 }}) |
| 217 |
| 218 benchmarks.Push(&Benchmark{"matmult-wg32", func() { |
| 219 benchmarkMatmultWg(32) |
| 220 }}) |
| 221 benchmarks.Push(&Benchmark{"matmult-wg16", func() { |
| 222 benchmarkMatmultWg(16) |
| 223 }}) |
| 224 benchmarks.Push(&Benchmark{"matmult-wg8", func() { |
| 225 benchmarkMatmultWg(8) |
| 226 }}) |
| 227 } |
OLD | NEW |