diff --git a/CMakeLists.txt b/CMakeLists.txt index e3f5c27..b5dc3f1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -129,7 +129,7 @@ endif() # Set platform / compiler specific compile flags and checks if(APPLE) # Need to investigate how to reliably detect and use OpenMP on macOS... - set(USE_OPENMP_DEFAULT OFF) +# set(USE_OPENMP_DEFAULT OFF) # Among others, setting CMAKE_FIND_FRAMEWORK to LAST fixed issues # with installed Mono that contains old headers (libpng, ...). @@ -163,7 +163,7 @@ if(APPLE) endif() elseif(UNIX) - set(USE_OPENMP_DEFAULT ON) +# set(USE_OPENMP_DEFAULT ON) # assume libstdc++ set(STD_CXX_FS stdc++fs) @@ -186,13 +186,37 @@ endif() # OpenMP option and compile flags -option(USE_OPENMP "Use OpenMP (e.g. for parallel computation in Eigen)" ${USE_OPENMP_DEFAULT}) -if(USE_OPENMP) - message(STATUS "OpenMP Enabled") - set(BASALT_CXX_FLAGS "${BASALT_CXX_FLAGS} -fopenmp") -else() - message(STATUS "OpenMP Disabled") -endif() +# +# Note: OpenMP and TBB don't mix well, so we disable Eigen's parallelization. +# It's trying to paralellize matrix products during SC, which we run in a parallel_reduce using TBB. +# Turns out using OpenMP can slow down the computby factor 10-100x! So for now we discable it completely. +# One place where Eigen's parallelization could still have been useful is the CG solver in the mapper. +# We could in the future investiagte other implementations (paralellized with TBB) or selectively enabling +# Eigen's parallelization just for CG, setting number of threads to 1 everywhere else. +# Another way to ensure Eigen doesn't use OpenMP regardless of how it was built is setting the environment +# variable OMP_NUM_THREADS=1 beofre running the application. +# +# See: https://eigen.tuxfamily.org/dox/TopicMultiThreading.html +# +# If we enable BLAS / LAPACK either directly or via thirdparty libs like ceres, +# make sure to disable OpenMP for the linked BLAS library. In particular on Ubuntu it seems OpenBLAS is often installed, +# and it can have similar issues in multithreaded applications if it's own parallelization with OpenMP is enabled. +# You can set the environment varaibles OPENBLAS_NUM_THREADS=1 or OMP_NUM_THREADS=1. This is also mentioned in the ceres +# installation documentation. +# +# See also: https://github.com/xianyi/OpenBLAS/wiki/faq#multi-threaded +# +# Set EIGEN_DONT_PARALLELIZE to be sure it doesn't use openmp, +# just in case some dependency enables openmp without us noticing. +set(BASALT_CXX_FLAGS "${BASALT_CXX_FLAGS} -DEIGEN_DONT_PARALLELIZE") + +#option(USE_OPENMP "Use OpenMP (e.g. for parallel computation in Eigen)" ${USE_OPENMP_DEFAULT}) +#if(USE_OPENMP) +# message(STATUS "OpenMP Enabled") +# set(BASALT_CXX_FLAGS "${BASALT_CXX_FLAGS} -fopenmp") +#else() +# message(STATUS "OpenMP Disabled") +#endif() # setup combined compiler flags diff --git a/include/basalt/optimization/accumulator.h b/include/basalt/optimization/accumulator.h index 94d90b6..0f20653 100644 --- a/include/basalt/optimization/accumulator.h +++ b/include/basalt/optimization/accumulator.h @@ -201,6 +201,10 @@ class SparseHashAccumulator { VectorX res; if (iterative_solver) { + // NOTE: since we have to disable Eigen's parallelization with OpenMP + // (interferes with TBB), the current CG is single-threaded, and we + // can expect a substantial speedup by switching to a parallel + // implementation of CG. Eigen::ConjugateGradient, Eigen::Lower | Eigen::Upper> cg; diff --git a/src/rs_t265_vio.cpp b/src/rs_t265_vio.cpp index 5b353ea..5f012b6 100644 --- a/src/rs_t265_vio.cpp +++ b/src/rs_t265_vio.cpp @@ -137,6 +137,12 @@ int main(int argc, char** argv) { app.add_option("--num-threads", num_threads, "Number of threads."); app.add_option("--step-by-step", step_by_step, "Path to config file."); + try { + app.parse(argc, argv); + } catch (const CLI::ParseError& e) { + return app.exit(e); + } + // global thread limit is in effect until global_control object is destroyed std::unique_ptr tbb_global_control; if (num_threads > 0) { @@ -144,12 +150,6 @@ int main(int argc, char** argv) { tbb::global_control::max_allowed_parallelism, num_threads); } - try { - app.parse(argc, argv); - } catch (const CLI::ParseError& e) { - return app.exit(e); - } - if (!config_path.empty()) { vio_config.load(config_path); } else { diff --git a/src/vio.cpp b/src/vio.cpp index 90bab30..e40ba1f 100644 --- a/src/vio.cpp +++ b/src/vio.cpp @@ -222,6 +222,12 @@ int main(int argc, char** argv) { "Save trajectory. Supported formats "); app.add_option("--use-imu", use_imu, "Use IMU."); + try { + app.parse(argc, argv); + } catch (const CLI::ParseError& e) { + return app.exit(e); + } + // global thread limit is in effect until global_control object is destroyed std::unique_ptr tbb_global_control; if (num_threads > 0) { @@ -229,12 +235,6 @@ int main(int argc, char** argv) { tbb::global_control::max_allowed_parallelism, num_threads); } - try { - app.parse(argc, argv); - } catch (const CLI::ParseError& e) { - return app.exit(e); - } - if (!config_path.empty()) { vio_config.load(config_path); diff --git a/thirdparty/basalt-headers b/thirdparty/basalt-headers index b44542a..4b62bcb 160000 --- a/thirdparty/basalt-headers +++ b/thirdparty/basalt-headers @@ -1 +1 @@ -Subproject commit b44542afd6c1aaccfc7e029771d025d066501030 +Subproject commit 4b62bcbdb343cc9464ba1de800ddcfd9942a9a6f