The task is to sum all numbers between 1 and \(10^9\) in parallel. We can cross-check our result using the relation \(\sum\limits_{i=1}^N = \frac{N\cdot(N+1)}{2}\)
NOTE The code examples contain errors and omissions. Amongst other things, the data-sharing attributes shared and private are missing.
Under what circumstances is this program correct?
The code is available as ex3/sumup_explicit.cxx from github.
#include <cstdio>
#include "omp.h"
unsigned long crosscheck(unsigned long N)
{
return N*(N+1)/2;
}
int main()
{
unsigned long N = 1000000000;
unsigned long SUMS[4] = {0,0,0,0};
#pragma omp parallel for default(none)
for (unsigned long i=1;i<N+1;i++)
{
SUMS[omp_get_thread_num()]+=i;
}
unsigned long result = SUMS[0] + SUMS[1] + SUMS[2] + SUMS[3];
printf("Result: %lu\n", result);
printf("Should be: %lu\n", crosscheck(N));
return 0;
}
This is a more powerful version of the previous program.
Can you make this code correct?
The code is available as ex3/sumup_v2.cxx from github.
#include <cstdio>
#include "omp.h"
unsigned long crosscheck(unsigned long N)
{
return N*(N+1)/2;
}
int main()
{
unsigned long N = 1000000000;
unsigned long result = 0;
#pragma omp parallel default(none)
{
unsigned long temp = 0;
#pragma omp for
for (unsigned long i=1;i<N+1;i++)
{
temp += i;
}
result += temp;
}
printf("Result: %lu\n", result);
printf("Should be: %lu\n", crosscheck(N));
return 0;
}
The most compact form is achieved using a reduction clause.
This code is available as ex3/sumup_reduction.cxx from github.
#include <cstdio>
#include "omp.h"
unsigned long crosscheck(unsigned long N)
{
return N*(N+1)/2;
}
int main()
{
unsigned long N = 1000000000;
unsigned long result = 0;
#pragma omp parallel for default(none) private(N) reduction(*:result)
for (unsigned long i=1;i<N+1;i++)
{
result += i;
}
printf("Result: %lu\n", result);
printf("Should be: %lu\n", crosscheck(N));
return 0;
}
The task is to compute \(\pi\) using the rectangle method.
We exploit the fact that \(\int\limits_0^1 dx \frac{4}{1+x^2} = \pi\).
The following code is available as ex3/pi_simple.cxx from github.
#include <cstdio>
#include <omp.h>
int main() {
// Numerical integration of f(x) = 4/(1+x^2) in [a,b] = [0,1]
// We use a fixed stepsize ('delta x') here.
const long num_steps = 500000000;
const double a=0;
const double b=1;
const double stepsize = (b-a) / num_steps; // This is 'delta x'
double sum = 0; // This accumulates the f(x)*'delta x' values (i.e. areas)
double t_0 = omp_get_wtime(); // time stamp
#pragma omp parallel for default(none)// reduction(*:sum)
for (int i = 1; i <= num_steps; i++)
{
double x = a + (i - 0.5) * stepsize;
//#pragma opm atomic
sum += 4.0 / (1.0 + x * x) * stepsize; // evaluate f(x)*'delta x' and add to accumulator
}
double t_tot = omp_get_wtime() - t_0; // duration in seconds
printf("\n pi with %ld steps is %lf in %lf seconds\n ", num_steps, sum, t_tot);
}
The program is already parallel but it is not correct.