The benchmark simply does 1000x1000 single-precision matrix multiplications. The code isn't really optimised, but it is built with g++ and -O3 enabled.

I'm getting 230 Mflops for single-threaded runs and 514 Mflops when 4 threads work concurrently on matrices of the same size.

The rather low Mflops figures probably indicate that further optimization is possible. However, the main point of this test is to monitor temperature under load; the benchmark mainly serves to generate a synthetic load.

During some of the early runs, at one point I got a peak reading of 78 deg C without a heat sink (this was during the 4-concurrent-threads phase of the run). But comparison is difficult, as I need to gather temperature data in sync with the run to make a sensible comparison. Hence, I wrote the code below.

As it turns out, without a heat sink my Pi 4 idles at 54 deg C. During the single-thread phase of the run it goes up to about 62 deg C, and when all 4 threads are running the matrix multiplication concurrently it goes up to 74 deg C.

Then I fitted a 28mm x 28mm x 11mm heat sink; it adheres to the SoC with some low-cost heat sink compound.

In the run with the heat sink, my Pi 4 idles at 47-48 deg C. During the single-thread phase of the run it stayed below 54 deg C, and when all 4 threads are running the matrix multiplication concurrently it goes up to 60 deg C. That makes it about 14 deg C lower than without the heat sink at peak load.

and about 5 deg C lower at idle

I think these temperatures are still rather warm; to get lower temperatures, a fan blowing at the heat sink would be needed.

this experiment is done without a fan.

Pi 4B with the 28x28x11mm heat sink sitting on top.

To build this:

Code: Select all

`c++ -pthread -O3 -o mat main.cpp`

https://github.com/mtrebi/matrix-multiplication-threading

Code: Select all

```
/*
MIT License
Copyright (c) 2016 Mariano Trebino
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#include <iostream>
#include <random>
#include <chrono>
#include <thread>
#include <time.h>
#include <unistd.h>
#include <stdlib.h>
/// Edge length of the square matrices being multiplied (rows == columns).
static constexpr long MATRIX_SIZE = 1000;
/// Number of worker threads launched by the multi-threaded benchmark.
static constexpr int THREADS_NUMBER = 4;
/// Default number of benchmark repetitions when none is given on the command line.
static constexpr long N_EXECUTIONS = 10;
/**
 * Square MATRIX_SIZE x MATRIX_SIZE matrix of floats, stored as an array of
 * separately heap-allocated rows (elements[row][column]).
 *
 * The matrix owns its storage: calling an initialize_* function again releases
 * the previous rows first, and the destructor frees whatever is still
 * allocated. Copying is disabled because a member-wise copy of the row
 * pointers would end in a double free.
 */
struct Matrix {
  float** elements = nullptr;

  Matrix() = default;
  ~Matrix() { release(); }
  Matrix(const Matrix&) = delete;
  Matrix& operator=(const Matrix&) = delete;

  /// Frees all rows (if any) and returns the matrix to the empty state.
  void release() {
    if (elements == nullptr) {
      return;
    }
    for (long i = 0; i < MATRIX_SIZE; ++i) {
      delete[] elements[i];
    }
    delete[] elements;
    elements = nullptr;
  }

  /// (Re)allocates the matrix with every element set to 0.0f.
  void initialize_zero() {
    allocate_zeroed();
  }

  /// (Re)allocates the matrix and fills it with uniform random values in
  /// [-1e9, 1e9), seeded from std::random_device.
  void initialize_random() {
    std::random_device rd;
    std::mt19937 mt(rd());
    // The bounds used to be (-1e9, -1e9): a zero-width range, so every
    // "random" element came out as exactly -1e9.
    std::uniform_real_distribution<double> dist(-1e9, 1e9);
    allocate_zeroed();
    for (long i = 0; i < MATRIX_SIZE; ++i) {
      for (long j = 0; j < MATRIX_SIZE; ++j) {
        elements[i][j] = static_cast<float>(dist(mt));
      }
    }
  }

  /// Writes the matrix to std::cout, one row per line, tab separated and
  /// framed by '|'. An uninitialised matrix prints only the leading newline.
  void print() const {
    std::cout << std::endl;
    if (elements == nullptr) {
      return;
    }
    for (long i = 0; i < MATRIX_SIZE; ++i) {
      std::cout << "|\t";
      for (long j = 0; j < MATRIX_SIZE; ++j) {
        std::cout << elements[i][j] << "\t";
      }
      std::cout << "|" << std::endl;
    }
  }

 private:
  /// Releases any previous storage, then allocates zero-filled rows.
  void allocate_zeroed() {
    release();
    // Value-initialise the row table so release() stays safe even if a later
    // row allocation throws (delete[] on a null row is a no-op).
    elements = new float*[MATRIX_SIZE]();
    for (long i = 0; i < MATRIX_SIZE; ++i) {
      elements[i] = new float[MATRIX_SIZE]();
    }
  }
};
// Computes r = m1 * m2 on the calling thread.
void multiply(Matrix& r, const Matrix& m1, const Matrix& m2);
// Runs multiply() once and stores the wall time it took, in milliseconds, in elapsed_time.
void single_execution(Matrix& r, long long& elapsed_time, const Matrix& m1, const Matrix& m2);
// Computes r = m1 * m2 with THREADS_NUMBER threads and stores the wall time, in milliseconds, in elapsed_time.
void multithreading_execution(Matrix& r, long long& elapsed_time, const Matrix& m1, const Matrix& m2);
// Worker body: computes the share of result cells that belongs to thread_number.
void multiply_threading(Matrix& result, const int thread_number, const Matrix& m1, const Matrix& m2);
// Calls execution_function `count` times on freshly initialised matrices and prints average time and MFlops.
void benchmark_execution(void(*execution_function)(Matrix& r, long long& elapsed_time, const Matrix& m1, const Matrix& m2), int count);
// Current CLOCK_MONOTONIC reading converted to milliseconds.
long long milliseconds_now();
/**
 * Entry point of the benchmark.
 *
 * Usage:
 *   mat              single-threaded run, then multi-threaded run,
 *                    N_EXECUTIONS repetitions each
 *   mat MODE         MODE starting with 's' selects the single-threaded run,
 *                    anything else the multi-threaded run
 *   mat MODE COUNT   as above, with COUNT repetitions (positive integer)
 *
 * Returns 0 on success and 1 when the arguments are not usable.
 */
int main(int argc, char *argv[]) {
  if (argc > 3) {
    std::cerr << "usage: " << argv[0] << " [s|m [count]]" << std::endl;
    return 1;
  }

  int count = N_EXECUTIONS;
  if (argc == 3) {
    // atoi() cannot report errors and yields 0 for garbage, which would make
    // the averaging in benchmark_execution divide by zero. Parse strictly.
    char* end = nullptr;
    const long parsed = strtol(argv[2], &end, 10);
    const int narrowed = static_cast<int>(parsed);
    if (end == argv[2] || *end != '\0' || parsed <= 0 || narrowed != parsed) {
      std::cerr << "count must be a positive integer, got '" << argv[2] << "'" << std::endl;
      return 1;
    }
    count = narrowed;
  }

  // Without a mode argument both variants run, one after the other.
  const bool no_mode = (argc <= 1);
  const bool single_selected = !no_mode && (*argv[1] == 's');

  if (no_mode || single_selected) {
    std::cout << "Single execution" << std::endl;
    benchmark_execution(single_execution, count);
  }
  if (no_mode || !single_selected) {
    std::cout << "Multi thread execution" << std::endl;
    benchmark_execution(multithreading_execution, count);
  }

  std::cout << "End of program" << std::endl;
  return 0;
}
/**
 * Runs `execution_function` `counts` times, each time on two freshly
 * randomised input matrices and a zeroed result matrix, then prints the
 * matrix size, the loop count, the average time per run in milliseconds and
 * the resulting MFlops figure to std::cout.
 *
 * @param execution_function  Multiplication driver; it must store the time it
 *                            took (in milliseconds) in its `elapsed_time`
 *                            argument.
 * @param counts              Number of repetitions. Values <= 0 are rejected
 *                            with a message on std::cerr and nothing is run.
 */
void benchmark_execution(void(*execution_function)(Matrix& r, long long& elapsed_time, const Matrix& m1, const Matrix& m2), int counts) {
  if (counts <= 0) {
    // Averaging over zero runs would divide by zero further down.
    std::cerr << "benchmark skipped: loop count must be positive, got " << counts << std::endl;
    return;
  }

  Matrix m1, m2, r;
  long long total_time = 0;
  for (int i = 0; i < counts; ++i) {
    long long elapsed_time = 0;
    m1.initialize_random();
    m2.initialize_random();
    r.initialize_zero();
    execution_function(r, elapsed_time, m1, m2);
    total_time += elapsed_time;
  }

  std::cout << "matrix size:\t" << MATRIX_SIZE << "\n";
  std::cout << "loops:\t" << counts << "\n";
  const long long average_ms = total_time / counts;
  std::cout << "\tAverage execution took\t" << average_ms << " ms" << std::endl;

  // n^3 multiplications plus n^2 * (n - 1) additions per matrix product.
  // Computed in long long: the total is ~2e9 for n = 1000, right at the edge
  // of a 32-bit int/long.
  const long long n = MATRIX_SIZE;
  const long long nops = n * n * n + n * n * (n - 1);

  if (total_time > 0) {
    // Operations per millisecond divided by 1000 gives millions per second.
    const double mflops = static_cast<double>(nops) * counts / static_cast<double>(total_time) / 1000.0;
    std::cout << "MFlops:\t" << static_cast<long long>(mflops) << "\n";
  } else {
    // Every run finished within the 1 ms timer resolution.
    std::cout << "MFlops:\tn/a (elapsed time below 1 ms)\n";
  }
}
/**
 * Plain triple-loop matrix product: r = m1 * m2.
 *
 * Every cell of `r` is the dot product of one row of `m1` with one column of
 * `m2`, accumulated in single precision. All three matrices must already be
 * allocated with MATRIX_SIZE x MATRIX_SIZE elements.
 */
void multiply(Matrix& r, const Matrix& m1, const Matrix& m2) {
  for (int row = 0; row < MATRIX_SIZE; ++row) {
    const float* const lhs_row = m1.elements[row];
    float* const out_row = r.elements[row];
    for (int col = 0; col < MATRIX_SIZE; ++col) {
      float dot = 0.0f;
      int k = 0;
      while (k < MATRIX_SIZE) {
        dot += lhs_row[k] * m2.elements[k][col];
        ++k;
      }
      out_row[col] = dot;
    }
  }
}
/**
 * Times one single-threaded multiplication.
 *
 * @param r             Receives m1 * m2.
 * @param elapsed_time  Receives the duration of the multiplication in
 *                      milliseconds, as measured by milliseconds_now().
 */
void single_execution(Matrix& r, long long& elapsed_time, const Matrix& m1, const Matrix& m2) {
  const long long began = milliseconds_now();
  multiply(r, m1, m2);
  const long long finished = milliseconds_now();
  elapsed_time = finished - began;
}
/**
 * Worker routine for the multi-threaded product: computes the slice of result
 * cells assigned to `thread_number`.
 *
 * The MATRIX_SIZE * MATRIX_SIZE cells are numbered consecutively and split
 * into THREADS_NUMBER equal ranges; thread 0 additionally takes the remainder
 * when the cell count is not evenly divisible. A cell number is mapped to
 * row = number % MATRIX_SIZE and col = number / MATRIX_SIZE.
 */
void multiply_threading(Matrix& result, const int thread_number, const Matrix& m1, const Matrix& m2) {
  const int total_cells = (MATRIX_SIZE * MATRIX_SIZE);
  const int cells_per_thread = total_cells / THREADS_NUMBER;
  const int leftover = total_cells % THREADS_NUMBER;

  // Thread 0 starts at cell 0 and absorbs the leftover cells; every other
  // thread's range is shifted behind them.
  const int first_cell = (thread_number == 0)
      ? 0
      : cells_per_thread * thread_number + leftover;
  const int end_cell = cells_per_thread * (thread_number + 1) + leftover;

  int cell = first_cell;
  while (cell < end_cell) {
    const int row = cell % MATRIX_SIZE;
    const int col = cell / MATRIX_SIZE;
    const float* const lhs_row = m1.elements[row];

    float dot = 0.0f;
    for (int k = 0; k < MATRIX_SIZE; ++k) {
      dot += lhs_row[k] * m2.elements[k][col];
    }
    result.elements[row][col] = dot;
    ++cell;
  }
}
/**
 * Times one multi-threaded multiplication.
 *
 * Starts THREADS_NUMBER threads running multiply_threading(), each with its
 * own thread index, waits for all of them and reports the wall time.
 *
 * @param r             Receives m1 * m2.
 * @param elapsed_time  Receives the duration in milliseconds, thread start-up
 *                      and join included.
 */
void multithreading_execution(Matrix& r, long long& elapsed_time, const Matrix& m1, const Matrix& m2) {
  const long long began = milliseconds_now();

  std::thread workers[THREADS_NUMBER];
  int worker_id = 0;
  for (std::thread& worker : workers) {
    worker = std::thread(multiply_threading, std::ref(r), worker_id, std::ref(m1), std::ref(m2));
    ++worker_id;
  }
  for (std::thread& worker : workers) {
    worker.join();
  }

  const long long finished = milliseconds_now();
  elapsed_time = finished - began;
}
/**
 * Returns the current CLOCK_MONOTONIC reading in whole milliseconds.
 *
 * Only differences between two return values are meaningful; the origin of
 * the clock is unspecified.
 */
long long milliseconds_now() {
  struct timespec now = {};
  clock_gettime(CLOCK_MONOTONIC, &now);
  // Widen before multiplying: where time_t is 32 bits wide, tv_sec * 1000
  // overflows once the monotonic clock passes roughly 24.8 days.
  return static_cast<long long>(now.tv_sec) * 1000LL + now.tv_nsec / 1000000L;
}
```

Note that this script doesn't draw the graphs. It saves the elapsed time (in milliseconds) and the SoC temperature (the raw value read from /sys/class/thermal/thermal_zone0/temp) to the file named by the variable `logfile` ("bench-nosink.log"); you would need a separate utility to graph it.

Code: Select all

```
#!/usr/bin/python3.6
from threading import Thread
from threading import Event
import threading
import time
import signal
import sys
import subprocess
import pickle
# Path of the tab-separated log file. Set it to "" to disable file logging
# (samples are then only printed).
#logfile = ""
logfile = "bench-nosink.log"
# Open the log only when a path is configured: open("") raises
# FileNotFoundError, which would defeat the "" = no log file convention.
lf = open(logfile, 'w+') if logfile else None
# Pending annotations (benchmark start marker, command, program output) that
# get attached to the next temperature sample.
msg = list()
def signal_handler(sig, frame):
    """SIGINT handler: stop the sampling thread and exit the script.

    `sig` and `frame` are the standard arguments passed to a signal handler;
    neither is used.
    """
    print('You pressed Ctrl+C!')
    # Signal the sampler loop first so it stops waiting, then leave.
    stopFlag.set()
    sys.exit(0)
class MyThread(Thread):
    """Background thread that calls sample() once per second until stopped."""

    def __init__(self, event):
        """Store `event`; setting it ends the sampling loop."""
        Thread.__init__(self)
        self.stopped = event

    def run(self):
        # Event.wait() doubles as the 1-second timer: it returns False on
        # timeout (take a sample) and True once the event has been set (stop).
        while not self.stopped.wait(1.0):
            sample()
# Reference instant for the elapsed-time column of the log.
beg = time.perf_counter()


def sample():
    """Record one temperature sample.

    Builds a record of [elapsed milliseconds since `beg`, raw text read from
    /sys/class/thermal/thermal_zone0/temp, *pending messages], writes it as
    one tab-separated line to the log file (unless `logfile` is "") and
    prints it.
    """
    ms = int((time.perf_counter() - beg) * 1000)
    with open("/sys/class/thermal/thermal_zone0/temp") as f:
        temp = f.read().strip()

    m = [ms, temp]
    # Drain the queue one item at a time. Iterating and then calling clear()
    # could drop a message that bench() appends from another thread between
    # the two steps.
    while msg:
        m.append(msg.pop(0))

    if not logfile == "":
        lf.write('\t'.join(str(field) for field in m))
        lf.write("\n")
        # Keep the file current so samples survive an interrupted run.
        lf.flush()
    print(m)
def bench(cmd):
    """Run the benchmark command `cmd` and queue its details for the log.

    Queues the marker "starting" and every element of `cmd` before the
    program starts, then blocks until the program has finished and queues
    its complete stdout, split into lines, as a single list entry.

    cmd -- argument list handed to subprocess.Popen, e.g. ["./mat", "s", "10"]
    """
    msg.append("starting")
    msg.extend(cmd)
    with subprocess.Popen(cmd, stdout=subprocess.PIPE) as p:
        # read() returns only once the child has closed its stdout.
        output = p.stdout.read().decode("utf-8")
        msg.append(output.split('\n'))
if __name__ == '__main__':
    stopFlag = Event()
    signal.signal(signal.SIGINT, signal_handler)
    print('Press Ctrl+C')

    # Start sampling the temperature once per second in the background.
    thread = MyThread(stopFlag)
    thread.start()
    #signal.pause()

    # Single-threaded phase first, then the 4-thread phase.
    for cmd in (["./mat", 's', '10'], ["./mat", 'm', '25']):
        # Hand Thread the callable and its arguments separately. Writing
        # target=bench(cmd) runs bench() right here in the main thread and
        # gives Thread a target of None.
        bt = threading.Thread(target=bench, args=(cmd,), daemon=True)
        bt.start()
        bt.join()

    # Sampling continues for another 5 minutes after the benchmarks
    # (presumably to capture the cool-down).
    time.sleep(300)
    # this will stop the timer
    stopFlag.set()
    # Wait for the sampler to finish its last write before closing the log.
    thread.join()
    if lf is not None:
        lf.close()