'Running time scales with the number of threads when running a function received from Python inside OpenMP parallel block
Here are the files for test.
# CMakeLists.txt
cmake_minimum_required(VERSION 3.16)
project(CALLBACK_TEST)
set(CMAKE_CXX_STANDARD 17)
add_compile_options(-O3 -fopenmp -fPIC)
add_link_options(-fopenmp)
add_subdirectory(pybind11)
pybind11_add_module(callback callback.cpp)
add_custom_command(TARGET callback POST_BUILD
COMMAND ${CMAKE_COMMAND} -E create_symlink $<TARGET_FILE:callback> ${CMAKE_CURRENT_SOURCE_DIR}/callback.so
)
// callback.cpp
#include <cmath>
#include <functional>
#include <vector>
#include <pybind11/pybind11.h>
#include <pybind11/functional.h>
namespace py = pybind11;
class C
{
public:
C(std::function<float(float)> f, size_t s) : f_(f), v_(s, 1) {}
void apply()
{
#pragma omp parallel for
for (size_t i = 0; i < v_.size(); i++)
v_[i] = f_(v_[i]);
}
void apply_direct()
{
#pragma omp parallel for
for (size_t i = 0; i < v_.size(); i++)
v_[i] = log(1 + v_[i]);
}
private:
std::vector<float> v_;
std::function<float(float)> f_;
};
PYBIND11_MODULE(callback, m)
{
py::class_<C>(m, "C")
.def(py::init<std::function<float(float)>, size_t>())
.def("apply", &C::apply, py::call_guard<py::gil_scoped_release>())
.def("apply_direct", &C::apply_direct);
m.def("log1p", [](float x) -> float
{ return log(1 + x); });
}
# callback.py
import math
import time
from callback import C, log1p
def run(n, func):
start = time.time()
if func:
for _ in range(n):
c = C(func, 1000)
c.apply()
else:
for _ in range(n):
c = C(func, 1000)
c.apply_direct()
end = time.time()
print(end - start)
if __name__ == "__main__":
n = 1000
one = 1
print("Python")
run(n, lambda x: math.log(x + 1))
print("C++")
run(n, log1p)
print("Direct")
run(n, None)
I run the Python script on a server with 48 CPU cores. Here is the running time. It shows 1. the running time increases when OMP_NUM_THREADS increases especially when accepting the Python/C++ callback from Python, and 2. keeping everything inside C++ is much faster, which seems to contradict the "no overhead" claim as in the documentation.
$ python callback.py
Python
19.612852573394775
C++
19.268250226974487
Direct
0.04382634162902832
$ OMP_NUM_THREADS=4 python callback.py
Python
6.042902708053589
C++
5.48648738861084
Direct
0.03322458267211914
$ OMP_NUM_THREADS=1 python callback.py
Python
0.5964927673339844
C++
0.38849639892578125
Direct
0.020793914794921875
And when OpenMP is turned off:
$ python callback.py
Python
0.8492450714111328
C++
0.26660943031311035
Direct
0.010872125625610352
So what goes wrong here?
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|
