Background
I am currently working on my bachelor thesis and basically my task is to optimise a given code in Go, i.e. make it run as fast as possible. First, I optimised the serial function and then tried to introduce parallelism via goroutines. After researching on the internet I now understand the difference between concurrency and parallelism thanks to the following slides from talks.golang. I visited some parallel programming courses where we parallelised a c/c++ code with help of pthread/openmp, thus I tried to apply these paradigms in Go. That said, in this particular case I am optimising a function which computes the moving average of a slice with length len:=n+(window_size-1)
(it equals either 9393 or 10175), hence we have n
windows of which we compute the corresponding arithmetic average and save that properly in the output slice.
Note that this task is inherently embarrassing parallel.
My optimisation attempts and results
In moving_avg_concurrent2
I split up the slice into num_goroutines
smaller pieces and ran each with one goroutine. This function performed with one goroutine, out of some reason (could not find out why yet, but we are getting tangent here), better than moving_avg_serial4
but with more than one goroutine it started to perform worse than moving_avg_serial4
.
In moving_avg_concurrent3
I adopted the master/worker paradigm. The performance was worse than moving_avg_serial4
when using one goroutine. Here we at least I got a better performance when increasing num_goroutines
but still not better than moving_avg_serial4
.
To compare the performances of moving_avg_serial4
, moving_avg_concurrent2
and moving_avg_concurrent3
I wrote a benchmark and I tabulated the results:
fct & num_goroutines | timing in ns/op | percentage
---------------------------------------------------------------------
serial4 | 4357893 | 100.00%
concur2_1 | 5174818 | 118.75%
concur2_4 | 9986386 | 229.16%
concur2_8 | 18973443 | 435.38%
concur2_32 | 75602438 | 1734.84%
concur3_1 | 32423150 | 744.01%
concur3_4 | 21083897 | 483.81%
concur3_8 | 16427430 | 376.96%
concur3_32 | 15157314 | 347.81%
Question
Since as mentioned above this problem is embarrassingly parallel I was expecting to see a tremendous performance increase but that was not the case.
Why does moving_avg_concurrent2
not scale at all?
And why is moving_avg_concurrent3
that much slower than moving_avg_serial4
?
I know that goroutines are cheap but still are not free, but is it possible that this generates that much overhead such that we are even slower than moving_avg_serial4
?
Code
Functions:
// returns a slice containing the moving average of the input (given, i.e. not optimised)
func moving_avg_serial(input []float64, window_size int) []float64 {
first_time := true
var output = make([]float64, len(input))
if len(input) > 0 {
var buffer = make([]float64, window_size)
// initialise buffer with NaN
for i := range buffer {
buffer[i] = math.NaN()
}
for i, val := range input {
old_val := buffer[int((math.Mod(float64(i), float64(window_size))))]
buffer[int((math.Mod(float64(i), float64(window_size))))] = val
if !NaN_in_slice(buffer) && first_time {
sum := 0.0
for _, entry := range buffer {
sum += entry
}
output[i] = sum / float64(window_size)
first_time = false
} else if i > 0 && !math.IsNaN(output[i-1]) && !NaN_in_slice(buffer) {
output[i] = output[i-1] + (val-old_val)/float64(window_size) // solution without loop
} else {
output[i] = math.NaN()
}
}
} else { // empty input
fmt.Println("moving_avg is panicking!")
panic(fmt.Sprintf("%v", input))
}
return output
}
// returns a slice containing the moving average of the input
// reordering the control structures to exploid the short-circuit evaluation
func moving_avg_serial4(input []float64, window_size int) []float64 {
first_time := true
var output = make([]float64, len(input))
if len(input) > 0 {
var buffer = make([]float64, window_size)
// initialise buffer with NaN
for i := range buffer {
buffer[i] = math.NaN()
}
for i := range input {
// fmt.Printf("in mvg_avg4: i=%v
", i)
old_val := buffer[int((math.Mod(float64(i), float64(window_size))))]
buffer[int((math.Mod(float64(i), float64(window_size))))] = input[i]
if first_time && !NaN_in_slice(buffer) {
sum := 0.0
for j := range buffer {
sum += buffer[j]
}
output[i] = sum / float64(window_size)
first_time = false
} else if i > 0 && !math.IsNaN(output[i-1]) /* && !NaN_in_slice(buffer)*/ {
output[i] = output[i-1] + (input[i]-old_val)/float64(window_size) // solution without loop
} else {
output[i] = math.NaN()
}
}
} else { // empty input
fmt.Println("moving_avg is panicking!")
panic(fmt.Sprintf("%v", input))
}
return output
}
// returns a slice containing the moving average of the input
// splitting up slice into smaller pieces for the goroutines but without using the serial version, i.e. we only have NaN's in the beginning, thus hope to reduce some overhead
// still does not scale (decreasing performance with increasing size and num_goroutines)
func moving_avg_concurrent2(input []float64, window_size, num_goroutines int) []float64 {
var output = make([]float64, window_size-1, len(input))
for i := 0; i < window_size-1; i++ {
output[i] = math.NaN()
}
if len(input) > 0 {
num_items := len(input) - (window_size - 1)
var barrier_wg sync.WaitGroup
n := num_items / num_goroutines
go_avg := make([][]float64, num_goroutines)
for i := 0; i < num_goroutines; i++ {
go_avg[i] = make([]float64, 0, num_goroutines)
}
for i := 0; i < num_goroutines; i++ {
barrier_wg.Add(1)
go func(go_id int) {
defer barrier_wg.Done()
// computing boundaries
var start, stop int
start = go_id*int(n) + (window_size - 1) // starting index
// ending index
if go_id != (num_goroutines - 1) {
stop = start + n // Ending index
} else {
stop = num_items + (window_size - 1) // Ending index
}
loc_avg := moving_avg_serial4(input[start-(window_size-1):stop], window_size)
loc_avg = make([]float64, stop-start)
current_sum := 0.0
for i := start - (window_size - 1); i < start+1; i++ {
current_sum += input[i]
}
loc_avg[0] = current_sum / float64(window_size)
idx := 1
for i := start + 1; i < stop; i++ {
loc_avg[idx] = loc_avg[idx-1] + (input[i]-input[i-(window_size)])/float64(window_size)
idx++
}
go_avg[go_id] = append(go_avg[go_id], loc_avg...)
}(i)
}
barrier_wg.Wait()
for i := 0; i < num_goroutines; i++ {
output = append(output, go_avg[i]...)
}
} else { // empty input
fmt.Println("moving_avg is panicking!")
panic(fmt.Sprintf("%v", input))
}
return output
}
// returns a slice containing the moving average of the input
// change of paradigm, we opt for a master worker pattern and spawn all windows which each will be computed by a goroutine
func compute_window_avg(input, output []float64, start, end int) {
sum := 0.0
size := end - start
for _, val := range input[start:end] {
sum += val
}
output[end-1] = sum / float64(size)
}
func moving_avg_concurrent3(input []float64, window_size, num_goroutines int) []float64 {
var output = make([]float64, window_size-1, len(input))
for i := 0; i < window_size-1; i++ {
output[i] = math.NaN()
}
if len(input) > 0 {
num_windows := len(input) - (window_size - 1)
var output = make([]float64, len(input))
for i := 0; i < window_size-1; i++ {
output[i] = math.NaN()
}
pending := make(chan *Work)
done := make(chan *Work)
// creating work
go func() {
for i := 0; i < num_windows; i++ {
pending <- NewWork(compute_window_avg, input, output, i, i+window_size)
}
}()
// start goroutines which work through pending till there is nothing left
for i := 0; i < num_goroutines; i++ {
go func() {
Worker(pending, done)
}()
}
// wait till every work is done
for i := 0; i < num_windows; i++ {
<-done
}
return output
} else { // empty input
fmt.Println("moving_avg is panicking!")
panic(fmt.Sprintf("%v", input))
}
return output
}
Benchmarks:
//############### BENCHMARKS ###############
var import_data_res11 []float64
func benchmarkMoving_avg_serial(b *testing.B, window int) {
var r []float64
for n := 0; n < b.N; n++ {
r = moving_avg_serial(BackTest_res.F["Trading DrawDowns"], window)
}
import_data_res11 = r
}
var import_data_res14 []float64
func benchmarkMoving_avg_serial4(b *testing.B, window int) {
var r []float64
for n := 0; n < b.N; n++ {
r = moving_avg_serial4(BackTest_res.F["Trading DrawDowns"], window)
}
import_data_res14 = r
}
var import_data_res16 []float64
func benchmarkMoving_avg_concurrent2(b *testing.B, window, num_goroutines int) {
var r []float64
for n := 0; n < b.N; n++ {
r = moving_avg_concurrent2(BackTest_res.F["Trading DrawDowns"], window, num_goroutines)
}
import_data_res16 = r
}
var import_data_res17 []float64
func benchmarkMoving_avg_concurrent3(b *testing.B, window, num_goroutines int) {
var r []float64
for n := 0; n < b.N; n++ {
r = moving_avg_concurrent3(BackTest_res.F["Trading DrawDowns"], window, num_goroutines)
}
import_data_res17 = r
}
func BenchmarkMoving_avg_serial_261x10(b *testing.B) {
benchmarkMoving_avg_serial(b, 261*10)
}
func BenchmarkMoving_avg_serial4_261x10(b *testing.B) {
benchmarkMoving_avg_serial4(b, 261*10)
}
func BenchmarkMoving_avg_concurrent2_261x10_1(b *testing.B) {
benchmarkMoving_avg_concurrent2(b, 261*10, 1)
}
func BenchmarkMoving