'Why parallel matrix multiplication takes so long time?

I create test code where I am computing in parallel one complex matrix.

I am computing on CPU.

I observed that it takes around 3 seconds to finish all the blocks.

Can someone explain why it takes so long time ?

Code

Utils.hpp

#pragma once

#include <chrono>
#include <armadillo>

namespace utils
{
    class watch : std::chrono::steady_clock {
        time_point start_ = now();
    public: auto elapsed_sec() const {return std::chrono::duration<double>(now() - start_).count();}
    };

    void op_herk(arma::cx_mat && A, arma::cx_mat & C)
    {
        using blas_int = int;
        using T = double;

        const char uplo = 'U';
        const char trans_A = 'N';
        const auto n = blas_int(C.n_cols);
        const auto k = blas_int(A.n_cols);
        const T local_alpha = T(1);
        const T local_beta  = T(0);
        const blas_int lda = n;

        arma::blas::herk<T>( &uplo, &trans_A, &n, &k, &local_alpha, A.mem, &lda, &local_beta, C.memptr(), &n);
        arma::herk_helper::inplace_conj_copy_upper_tri_to_lower_tri(C);
    }
}

ThreadPoll

#pragma once

#include <boost/thread.hpp>
#include <boost/asio.hpp>
#include <boost/asio/thread_pool.hpp>

class ThreadPool {
public:
    explicit ThreadPool(size_t size = boost::thread::hardware_concurrency()) : threadPool(size)
    { }

    template<typename F>
    void addTask(F &&f)
    {
        boost::asio::post(threadPool, std::forward<F>(f));
    }
    void wait()
    {
        threadPool.wait();
    }

    ~ThreadPool()
    {
        threadPool.join();
    }
private:
    boost::asio::thread_pool threadPool;
};

main.cpp

#include <armadillo>
#include "Utils.h"
#include "ThreadPool.h"

int main() {
    ThreadPool threadPool;
    arma::cx_mat test (256, 30000 , arma::fill::randu);
    arma::vec averageTime(30, arma::fill::zeros);
    std::vector<arma::cx_mat > results(30);
    for(auto &it : results)
        it.set_size(256, 256);

    {
        for(int i = 0; i < 30; ++i)
        {
            threadPool.addTask([i = i, &results, &averageTime, test = test.submat(arma::span::all, arma::span(0, 20000)), _ = utils::watch() ]() {
                utils::op_herk(test, results[i]);
                arma::vec r = arma::sort(arma::eig_sym(results[i]), "descent");
                std::cout << _.elapsed_sec() << '\n';
                averageTime[i] = _.elapsed_sec();
            });
        }
        threadPool.wait();
        std::cout << "average " << arma::sum(averageTime)/averageTime.size() <<std::endl;
    }
    return 0;
}

Parameters : gcc 9.4 computer : Intel 6 Cores , 12 threads; armadillo 10.7.3 openblas 0.3.17

CMAKE parameters : set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -msse2 -O3 -mtune=native -flto")

My results :

1.16084
1.16434
1.16571
1.16601
1.17055
1.17118
1.17382
1.17511
1.1767
1.17981
1.18254
1.18537
2.40071
2.40225
2.4025
2.40511
2.40545
2.40565
2.40583
2.40941
2.40972
2.40974
2.41172
2.41291
3.23446
3.23592
3.23734
3.23972
3.24305
3.24484
3.24728
average 2.14871


Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source