#include <cstdio> // For printf
#include <omp.h>  // OpenMP library functions

int main()
{
    // This is a "character array" of 105 characters (plus a terminating null byte).
    char mysecret[] = "Zpv!uijol!zpvs!qbjo!boe!zpvs!ifbsucsfbl!bsf!voqsfdfefoufe!jo!uif!ijtupsz!pg!uif!xpsme-!cvu!uifo!zpv!sfbe/";
    // The original message was encoded by shifting all characters by one, i.e.
    // A becomes B, c becomes d, and so forth. This shift is easy to implement
    // in byte form: all you need to do is add 1 to a character.
    // To printf a single character, use e.g.
    //   printf("%c\n", mysecret[0]);
    // To printf the whole array, do e.g. (note the subtle difference between %c and %s)
    //   printf("%s\n", mysecret);
    #pragma omp parallel for
    for (int i = 0; i < 105; i++)
    {
        mysecret[i] -= 1; // Undo the shift; each thread handles a disjoint set of characters
        printf("Thread [%d] sees %c\n", omp_get_thread_num(), mysecret[i]); // Output order varies from run to run
    }
    printf("%s\n", mysecret);
    return 0;
}
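For completeness, a minimal sketch of the matching encoder (my own illustration, not part of the handout): it applies the opposite shift, adding 1 to each character, which turns the plaintext above into the secret string.

#include <cstdio>

int main()
{
    char msg[] = "You think your pain and your heartbreak are unprecedented in the history of the world, but then you read.";
    for (int i = 0; msg[i] != '\0'; i++)
        msg[i] += 1; // Shift every character up by one
    printf("%s\n", msg);
    return 0;
}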
Solution: You think your pain and your heartbreak are unprecedented in the history of the world, but then you read.
The point here is to figure out that the innermost and outermost loops must be reversed, and that the middle loop (over lines) parallelises perfectly.
A good indicator that a loop is parallelisable is that the program's result is invariant under reversal of the iteration order, as in the sketch below.
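To illustrate that indicator (my own minimal example, not from the handout): the first loop below gives the same result when run backwards and is a safe candidate for parallelisation; the second has a loop-carried dependence, so reversing it changes the result and a naive #pragma omp parallel for would be wrong.

#include <cstdio>

int main()
{
    double a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    // Invariant under reversal: each iteration touches only a[i],
    // so this loop can safely carry #pragma omp parallel for.
    for (int i = 0; i < 8; i++)
        a[i] = 2.0 * a[i];
    // NOT invariant under reversal: each iteration reads the value
    // written by the previous one, so this must not be parallelised naively.
    double running = 0.0;
    for (int i = 0; i < 8; i++)
    {
        running += a[i];
        a[i] = running;
    }
    printf("Last prefix sum: %f\n", a[7]);
    return 0;
}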
I also want them to use omp_get_wtime to measure the execution time and to make a graph varying OMP_NUM_THREADS.
They also need to remind themselves how to copy files back and forth and, ideally, how to use Slurm (example commands follow the batch script below).
The only additional technical hurdle is copying CImg.h over and telling the compiler the include path; a possible compile line follows. Also, they need GCC 9.3, but that's in the exercise sheet.
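A compile line along these lines should work (the include path and file names are my guesses; adjust to wherever CImg.h was copied):

g++ -O2 -fopenmp -I$HOME/CImg decoder.cpp -o decoder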
#define cimg_display 0 // This gets around the need to link libX11
#include "CImg.h"
#include <cstdio>
#include <omp.h>

using namespace cimg_library;

int main(int argc, char *argv[])
{
    if (argc < 2) // Guard against a missing command-line argument
    {
        printf("Usage: %s <encoded image>\n", argv[0]);
        return 1;
    }
    CImg<float> img(argv[1]);
    const int w = img.width();
    const int h = img.height();
    double start_time = omp_get_wtime();
    // Decode
    for (int r = 300 - 1; r >= 0; r--) // This is just the outermost encoder loop in reverse
    {
        #pragma omp parallel for
        for (int y = 0; y < h; y++) // This is the loop that can be parallelised
        {
            for (int x = w - 1; x >= 0; x--) // This loop iterates along the x-direction --- now reversed wrt encoding
            {
                // We swap the colour values of two pixels:
                // 1. store the value at coordinates [(x+r+y)%w, y] in a temporary variable 'buf'
                float buf = img((x + r + y) % w, y); // (x+r+y)%w ensures that the first coordinate is < w
                // 2. replace the colour value at [(x+r+y)%w, y] with that of [x,y]
                img((x + r + y) % w, y) = img(x, y);
                // 3. replace the colour value at [x,y] with the original value at [(x+r+y)%w, y]
                img(x, y) = buf;
            }
        }
    }
    double time = omp_get_wtime() - start_time;
    printf("Decoding took %f s\n", time);
    // Write decoded image
    img.save("decoded.pgm");
    return 0;
}
#!/bin/bash
#SBATCH -t 5 # Request 5 minutes of time
#SBATCH -p par7.q # This selects machines from the queue par7.q
#SBATCH -N 1 # We want just one compute node
#SBATCH --exclusive # We want exclusive access to the node, with no other jobs sharing it
#SBATCH --mail-type=END # We want an email notification at the end of the job
#SBATCH --mail-user=username@durham.ac.uk # The email notification goes to this address
# Get rid of all currently loaded modules and only load gcc 9.3
module purge
module load gcc/9.3.0
# These are the commands we run: the same decoder with 1, 2, 4, ... 64 threads
OMP_PLACES=cores OMP_PROC_BIND=close OMP_NUM_THREADS=1 ./decoder encoded.pgm
OMP_PLACES=cores OMP_PROC_BIND=close OMP_NUM_THREADS=2 ./decoder encoded.pgm
OMP_PLACES=cores OMP_PROC_BIND=close OMP_NUM_THREADS=4 ./decoder encoded.pgm
OMP_PLACES=cores OMP_PROC_BIND=close OMP_NUM_THREADS=8 ./decoder encoded.pgm
OMP_PLACES=cores OMP_PROC_BIND=close OMP_NUM_THREADS=16 ./decoder encoded.pgm
OMP_PLACES=cores OMP_PROC_BIND=close OMP_NUM_THREADS=32 ./decoder encoded.pgm
OMP_PLACES=cores OMP_PROC_BIND=close OMP_NUM_THREADS=64 ./decoder encoded.pgm
exit 0
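For the copy-files-and-Slurm reminder, commands along these lines would do; the host name, directory, and script name here are placeholders, not from the exercise sheet:

# Copy the sources and the encoded image to the cluster
scp decoder.cpp CImg.h encoded.pgm username@cluster.dur.ac.uk:~/decode/

# Submit the batch script above and check on the job
sbatch decode.slurm
squeue -u username

# Copy the result back to the local machine afterwards
scp username@cluster.dur.ac.uk:~/decode/decoded.pgm .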
from matplotlib import pyplot as plt

X = [1, 2, 4, 8, 16, 32, 64]  # OMP_NUM_THREADS values
Y = [13.165511, 6.657706, 3.542630, 2.037756, 1.027711, 1.015704, 0.933740]  # Measured run times in seconds
Ideal = [Y[0] / x for x in X]  # Perfect scaling: single-thread time divided by thread count
eff = [i / y * 100 for (y, i) in zip(Y, Ideal)]  # Parallel efficiency: ideal time over measured time

plt.plot(X, Y, label="Measured scaling")
plt.plot(X, Ideal, label="Ideal scaling")
plt.yscale("log")
plt.legend()
plt.xlabel(r"$N_\mathrm{threads}$")  # Raw strings so that \m is not treated as an escape
plt.ylabel(r"$t$ [s]")
plt.savefig("scaling.png")
plt.clf()

plt.plot(X, eff)
plt.xlabel(r"$N_\mathrm{threads}$")
plt.ylabel("Scaling efficiency [%]")
plt.savefig("scalingeff.png")
This is really contrived and meant to show bad practice. The program only works correctly when the number of threads is 4, because SUMS has exactly one slot per thread: with more threads, the indexing via omp_get_thread_num() would write past the end of the array. Next week we will do this again, but with atomic and reduction (a sketch of the reduction version follows the code below).
#include <cstdio>
#include <omp.h>

int main()
{
    unsigned long N = 1000000000;
    unsigned long SUMS[4] = {0, 0, 0, 0};
    #pragma omp parallel for num_threads(4)
    for (unsigned long i = 1; i < N + 1; i++)
    {
        // Each thread accumulates into its own slot, indexed by its thread number
        SUMS[omp_get_thread_num()] += i;
    }
    unsigned long result = SUMS[0] + SUMS[1] + SUMS[2] + SUMS[3];
    printf("Result: %lu\n", result); // Expect N*(N+1)/2 = 500000000500000000
    return 0;
}
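As a hedged preview of the reduction variant mentioned above (a minimal sketch of mine, assuming next week's version looks roughly like this):

#include <cstdio>

int main()
{
    unsigned long N = 1000000000;
    unsigned long sum = 0;
    // reduction(+:sum) gives each thread a private copy of 'sum' and
    // combines the copies safely after the loop -- no per-thread array,
    // and the result stays correct for any number of threads.
    #pragma omp parallel for reduction(+:sum)
    for (unsigned long i = 1; i <= N; i++)
    {
        sum += i;
    }
    printf("Result: %lu\n", sum); // Expect N*(N+1)/2 = 500000000500000000
    return 0;
}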