Running time scales with the number of threads when running a function received from Python inside an OpenMP parallel block

Here are the files for test.

# CMakeLists.txt

cmake_minimum_required(VERSION 3.16)
project(CALLBACK_TEST)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Prefer the imported OpenMP target over hand-written -fopenmp compile/link
# flags: find_package picks the correct flags for the active toolchain.
find_package(OpenMP REQUIRED)

add_subdirectory(pybind11)
# pybind11_add_module already builds position-independent code, so no
# explicit -fPIC is needed.
pybind11_add_module(callback callback.cpp)

target_compile_options(callback PRIVATE -O3)
target_link_libraries(callback PRIVATE OpenMP::OpenMP_CXX)

# Symlink the built extension next to the sources so `import callback`
# works when running callback.py from the source directory.
add_custom_command(TARGET callback POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E create_symlink $<TARGET_FILE:callback> ${CMAKE_CURRENT_SOURCE_DIR}/callback.so
)
// callback.cpp

#include <cmath>
#include <functional>
#include <utility>
#include <vector>

#include <pybind11/pybind11.h>
#include <pybind11/functional.h>

namespace py = pybind11;

class C
{
public:
  /// @brief Holds a unary float callback and a buffer of `s` floats, all 1.
  /// @param f callback applied element-wise by apply()
  /// @param s number of elements in the buffer
  // Members initialize in declaration order (v_ before f_); the list below
  // matches that order (the original listed f_ first, which is misleading
  // and triggers -Wreorder). The std::function is moved, not copied.
  C(std::function<float(float)> f, size_t s) : v_(s, 1), f_(std::move(f)) {}

  /// @brief Applies the stored callback to every element in parallel.
  /// NOTE(review): when f_ wraps a Python callable (via pybind11's
  /// functional caster), each invocation must re-acquire the GIL, so the
  /// parallel loop is effectively serialized and extra threads only add
  /// lock contention — confirm against the pybind11 GIL documentation.
  void apply()
  {
#pragma omp parallel for
    for (size_t i = 0; i < v_.size(); i++)
      v_[i] = f_(v_[i]);
  }

  /// @brief Computes log(1 + x) element-wise with no callback indirection.
  void apply_direct()
  {
#pragma omp parallel for
    for (size_t i = 0; i < v_.size(); i++)
      v_[i] = std::log1p(v_[i]); // log1p is more accurate than log(1 + x)
  }

  /// @brief Read-only view of the buffer (added so results are observable).
  const std::vector<float> &values() const { return v_; }

private:
  std::vector<float> v_;
  std::function<float(float)> f_;
};

// Python bindings for the benchmark.
//
// NOTE(review): `apply` releases the GIL for the duration of the C++ call
// (py::gil_scoped_release), but when the stored std::function wraps a
// Python callable, the wrapper pybind11 generates presumably re-acquires
// the GIL on every single invocation inside the OpenMP loop — that per-call
// GIL contention would explain why runtime grows with the thread count;
// confirm against the pybind11 GIL documentation.
PYBIND11_MODULE(callback, m)
{
  py::class_<C>(m, "C")
      // Any Python callable converts to std::function<float(float)>
      // (enabled by pybind11/functional.h).
      .def(py::init<std::function<float(float)>, size_t>())
      // GIL released while the C++ loop runs; a Python-backed callback
      // still re-takes it per call.
      .def("apply", &C::apply, py::call_guard<py::gil_scoped_release>())
      // Pure C++ path: no callback, no GIL interaction inside the loop.
      .def("apply_direct", &C::apply_direct);
  // C++-implemented log1p exposed to Python; when passed back in as the
  // callback it still goes through the Python/std::function bridge.
  m.def("log1p", [](float x) -> float
        { return log(1 + x); });
}
# callback.py

import math
import time

from callback import C, log1p


def run(n, func):
    """Time ``n`` construct+apply cycles on a 1000-element buffer.

    Args:
        n: number of iterations to run.
        func: a callable applied via ``C.apply()``; any falsy value (e.g.
            ``None``) selects the pure-C++ ``C.apply_direct()`` path instead.

    Prints the elapsed time in seconds.
    """
    # perf_counter is the right clock for benchmarking: monotonic and
    # high-resolution (time.time is wall clock and can jump).
    start = time.perf_counter()
    if func:
        for _ in range(n):
            c = C(func, 1000)
            c.apply()
    else:
        for _ in range(n):
            c = C(func, 1000)
            c.apply_direct()
    end = time.perf_counter()
    print(end - start)


if __name__ == "__main__":
    # Number of construct+apply iterations per timing run.
    n = 1000
    print("Python")
    run(n, lambda x: math.log(x + 1))  # pure-Python callback
    print("C++")
    run(n, log1p)  # C++-backed callable, still crosses the binding layer
    print("Direct")
    run(n, None)  # falsy -> apply_direct(): everything stays in C++

I run the Python script on a server with 48 CPU cores. Here are the running times. They show that (1) the running time increases as OMP_NUM_THREADS increases, especially when a Python or C++ callback is passed in from Python, and (2) keeping everything inside C++ is much faster — which seems to contradict the "no overhead" claim in the documentation.

$ python callback.py
Python
19.612852573394775
C++
19.268250226974487
Direct
0.04382634162902832
$ OMP_NUM_THREADS=4 python callback.py
Python
6.042902708053589
C++
5.48648738861084
Direct
0.03322458267211914
$ OMP_NUM_THREADS=1 python callback.py
Python
0.5964927673339844
C++
0.38849639892578125
Direct
0.020793914794921875

And when OpenMP is turned off:

$ python callback.py
Python
0.8492450714111328
C++
0.26660943031311035
Direct
0.010872125625610352

So what goes wrong here?



Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source