diff --git a/.vscode/settings.json b/.vscode/settings.json index 334f1f4..f319482 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -64,9 +64,11 @@ "Klemens", "Kohlhoff", "Kulukundis", + "LAPACK", "Lelbach", "Lemire", "Lib", + "libopenblas", "LIBPFM", "libunifex", "liburing", @@ -272,5 +274,7 @@ "variant": "cpp", "vector": "cpp", "version": "cpp" - } + }, + "C_Cpp.errorSquiggles": "disabled", + "cSpell.enabled": false } \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 2a028c1..99042bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ cmake_minimum_required(VERSION 3.25.2 FATAL_ERROR) # ------------------------------------------------------------------------------ project( less_slow - VERSION 0.10.5 + VERSION 0.10.12 LANGUAGES C CXX ASM DESCRIPTION "Learning how to write Less Slow code, from numerical micro-kernels and SIMD to coroutines, ranges, and polymorphic state machines" @@ -72,11 +72,22 @@ else () set(_SHOULD_USE_INTEL_TBB OFF) endif () +# Probe for BLAS support +set(CMAKE_FIND_LIBRARY_PREFIXES ";lib") +find_package(BLAS QUIET) +if (BLAS_FOUND) + set(_SHOULD_USE_BLAS ON) +else () + set(_SHOULD_USE_BLAS OFF) +endif () + option(USE_INTEL_TBB "Use Intel TBB for parallel STL algorithms" ${_SHOULD_USE_INTEL_TBB}) option(USE_NVIDIA_CCCL "Use Nvidia CCCL for CUDA acceleration" ${_SHOULD_USE_NVIDIA_CCCL}) +option(USE_BLAS "Use BLAS for linear algebra" ${_SHOULD_USE_BLAS}) message(STATUS "USE_INTEL_TBB: ${USE_INTEL_TBB}") message(STATUS "USE_NVIDIA_CCCL: ${USE_NVIDIA_CCCL}") +message(STATUS "USE_BLAS: ${USE_BLAS}") # ------------------------------------------------------------------------------ # Dependencies @@ -102,20 +113,21 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -w") # ~~~ # # Moreover, CMake sometimes fails to find it on Windows: https://stackoverflow.com/a/78335726/2766161 -set(CMAKE_FIND_LIBRARY_PREFIXES ";lib") -find_package(BLAS REQUIRED) +if (USE_BLAS) + find_package(BLAS REQUIRED) -include(CheckFunctionExists) -check_function_exists(openblas_set_num_threads LESS_SLOW_HAS_OPENBLAS_SET_NUM_THREADS) -if (LESS_SLOW_HAS_OPENBLAS_SET_NUM_THREADS) - add_definitions(-DLESS_SLOW_HAS_OPENBLAS_SET_NUM_THREADS) + include(CheckFunctionExists) + check_function_exists(openblas_set_num_threads LESS_SLOW_HAS_OPENBLAS_SET_NUM_THREADS) + if (LESS_SLOW_HAS_OPENBLAS_SET_NUM_THREADS) + add_definitions(-D LESS_SLOW_HAS_OPENBLAS_SET_NUM_THREADS) + endif () endif () # GTest (required by Google Benchmark) FetchContent_Declare( GoogleTest GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG v1.15.2 + GIT_TAG v1.17.0 ) FetchContent_MakeAvailable(GoogleTest) @@ -123,7 +135,7 @@ FetchContent_MakeAvailable(GoogleTest) FetchContent_Declare( GoogleBenchmark GIT_REPOSITORY https://github.com/google/benchmark.git - GIT_TAG v1.9.1 + GIT_TAG v1.9.4 ) # Suppress building tests/docs/etc. 
for faster builds:
@@ -223,7 +235,7 @@ endif ()
 FetchContent_Declare(
     VictorZverovichFMT
     GIT_REPOSITORY https://github.com/fmtlib/fmt.git
-    GIT_TAG 11.1.2
+    GIT_TAG 11.2.0
 )
 FetchContent_MakeAvailable(VictorZverovichFMT)
@@ -259,7 +271,7 @@ FetchContent_MakeAvailable(MetaLibUnifEx)
 FetchContent_Declare(
     AshVardanianStringZilla
     GIT_REPOSITORY https://github.com/ashvardanian/stringzilla
-    GIT_TAG v3.12.5
+    GIT_TAG v3.12.6
 )
 FetchContent_MakeAvailable(AshVardanianStringZilla)
@@ -276,7 +288,7 @@ FetchContent_MakeAvailable(HanaDusikovaCTRE)
 FetchContent_Declare(
     GoogleAbseil
     GIT_REPOSITORY https://github.com/abseil/abseil-cpp.git
-    GIT_TAG 20240722.0 # LTS version
+    GIT_TAG 20250512.1 # LTS version
 )
 FetchContent_MakeAvailable(GoogleAbseil)
@@ -284,7 +296,7 @@ FetchContent_MakeAvailable(GoogleAbseil)
 FetchContent_Declare(
     NielsLohmannJSON
     GIT_REPOSITORY https://github.com/nlohmann/json.git
-    GIT_TAG v3.11.3
+    GIT_TAG v3.12.0
 )
 FetchContent_MakeAvailable(NielsLohmannJSON)
@@ -292,10 +304,18 @@ FetchContent_MakeAvailable(NielsLohmannJSON)
 FetchContent_Declare(
     YaoyuanGuoYYJSON
     GIT_REPOSITORY https://github.com/ibireme/yyjson.git
-    GIT_TAG 0.10.0
+    GIT_TAG 0.11.1
 )
 FetchContent_MakeAvailable(YaoyuanGuoYYJSON)
+# Daniel Lemire's simdjson for SIMD-accelerated JSON parsing
+FetchContent_Declare(
+    DanielLemireSimdJSON
+    GIT_REPOSITORY https://github.com/simdjson/simdjson.git
+    GIT_TAG v3.13.0
+)
+FetchContent_MakeAvailable(DanielLemireSimdJSON)
+
 # Chris Kohlhoff's ASIO standalone, avoiding Boost... integration is a bit tricky:
 # https://github.com/cpm-cmake/CPM.cmake/blob/master/examples/asio-standalone/CMakeLists.txt
 FetchContent_Declare(
@@ -343,7 +363,10 @@ set_target_properties(less_slow PROPERTIES POSITION_INDEPENDENT_CODE ON)
 if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|AMD64|x64")
     set_source_files_properties(less_slow_amd64.S PROPERTIES LANGUAGE ASM)
     target_sources(less_slow PRIVATE less_slow_amd64.S)
-elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64")
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64|arm64")
+    if (APPLE)
+        set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -march=armv8.6-a+bf16")
+    endif ()
     set_source_files_properties(less_slow_aarch64.S PROPERTIES LANGUAGE ASM)
     target_sources(less_slow PRIVATE less_slow_aarch64.S)
 endif ()
@@ -428,6 +451,7 @@ endif ()
 # ------------------------------------------------------------------------------
 target_compile_definitions(less_slow PRIVATE USE_NVIDIA_CCCL=$<BOOL:${USE_NVIDIA_CCCL}>)
 target_compile_definitions(less_slow PRIVATE USE_INTEL_TBB=$<BOOL:${USE_INTEL_TBB}>)
+target_compile_definitions(less_slow PRIVATE USE_BLAS=$<BOOL:${USE_BLAS}>)
 target_link_libraries(
     less_slow
     PRIVATE Threads::Threads
@@ -438,6 +462,7 @@ target_link_libraries(
     unifex
     stringzilla
     yyjson
+    simdjson
     ctre
     asio
     # There is no `absl` shortcut:
@@ -445,9 +470,12 @@
     absl::flat_hash_map
     nlohmann_json::nlohmann_json
     Eigen3::Eigen
-    ${BLAS_LIBRARIES}
 )
 
+if (USE_BLAS)
+    target_link_libraries(less_slow PRIVATE ${BLAS_LIBRARIES})
+endif ()
+
 if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
     # target_include_directories(less_slow PRIVATE ${LIBURING_INCLUDE_DIRS})
     target_link_libraries(less_slow PRIVATE ${LIBURING_LIBRARIES})
diff --git a/README.md b/README.md
index 93e3735..59e7fa9 100644
--- a/README.md
+++ b/README.md
@@ -1,18 +1,18 @@
-# Playing Around _Less Slow_ Coding Practices for C++, C, and Assembly Code
+# Playing Around _Less Slow_ Coding Practices for C++, CUDA, and Assembly Code
 
-> The benchmarks in this repository don’t aim to cover every topic entirely, but they help form a mindset and intuition for
performance-oriented software design. -> It also provides an example of using some non-[STL](https://en.wikipedia.org/wiki/Standard_Template_Library) but de facto standard libraries in C++, importing them via CMake and compiling from source. -> For higher-level abstractions and languages, check out [`less_slow.rs`](https://github.com/ashvardanian/less_slow.rs) and [`less_slow.py`](https://github.com/ashvardanian/less_slow.py). -> I needed many of these measurements to reconsider my own coding habits, but hopefully they’re helpful to others as well. -> Most of the code is organized in very long, ordered, and `#pragma`-sectioned files — not necessarily the preferred form for everyone. +> The benchmarks in this repository don't aim to cover every topic entirely, but they help form a mindset and intuition for performance-oriented software design. +> It also provides an example of using some non-[STL](https://en.wikipedia.org/wiki/Standard_Template_Library) but de facto standard libraries in C++, importing them via CMake and compiling from source. +> For higher-level abstractions and languages, check out [`less_slow.rs`](https://github.com/ashvardanian/less_slow.rs) and [`less_slow.py`](https://github.com/ashvardanian/less_slow.py). +> I needed many of these measurements to reconsider my own coding habits, but hopefully they're helpful to others as well. +> Most of the code is organized in very long, ordered, and nested `#pragma` sections — not necessarily the preferred form for everyone. -Much of modern code suffers from common pitfalls — bugs, security vulnerabilities, and __performance bottlenecks__. -University curricula and coding bootcamps tend to stick to traditional coding styles and standard features, rarely exposing the more fun, unusual, and efficient design opportunities. +Much of modern code suffers from common pitfalls — bugs, security vulnerabilities, and __performance bottlenecks__. +University curricula and coding bootcamps tend to stick to traditional coding styles and standard features, rarely exposing the more fun, unusual, and potentially efficient design opportunities. +This repository explores just that. ![Less Slow C++](https://github.com/ashvardanian/ashvardanian/blob/master/repositories/less_slow.cpp.jpg?raw=true) -This repository offers practical minimalistic examples of writing efficient C and C++ code. -It leverages C++20 features and is designed primarily for GCC and Clang compilers on Linux, though it may work on other platforms. +The code leverages C++20 and CUDA features and is designed primarily for GCC, Clang, and NVCC compilers on Linux, though it may work on other platforms. The topics range from basic micro-kernels executing in a few nanoseconds to more complex constructs involving parallel algorithms, coroutines, and polymorphism. Some of the highlights include: @@ -24,7 +24,7 @@ Some of the highlights include: - __Scaling AI?__ Measure the gap between theoretical [ALU](https://en.wikipedia.org/wiki/Arithmetic_logic_unit) throughput and your [BLAS](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms). - __How many if conditions are too many?__ Test your CPU's branch predictor with just 10 lines of code. - __Prefer recursion to iteration?__ Measure the depth at which your algorithm will [`SEGFAULT`](https://en.wikipedia.org/wiki/Segmentation_fault). -- __Why avoid exceptions?__ Take `std::error_code` or [`std::variant`](https://en.cppreference.com/w/cpp/utility/variant)-like wrappers? 
+- __Why avoid exceptions?__ Take `std::error_code` or [`std::variant`](https://en.cppreference.com/w/cpp/utility/variant)-like [ADTs](https://en.wikipedia.org/wiki/Algebraic_data_type)? - __Scaling to many cores?__ Learn how to use [OpenMP](https://en.wikipedia.org/wiki/OpenMP), Intel's oneTBB, or your custom thread pool. - __How to handle [JSON](https://www.json.org/json-en.html) avoiding memory allocations?__ Is it easier with C++ 20 or old-school C 99 tools? - __How to properly use STL's associative containers__ with custom keys and transparent comparators? @@ -40,6 +40,7 @@ Some of the highlights include: - __What are Encrypted Enclaves__ and what's the latency of Intel SGX, AMD SEV, and ARM Realm? 🔜 #31 To read, jump to the [`less_slow.cpp` source file](https://github.com/ashvardanian/less_slow.cpp/blob/main/less_slow.cpp) and read the code snippets and comments. +Keep in mind, that most modern IDEs have a navigation bar to help you view and jump between `#pragma region` sections. Follow the instructions below to run the code in your environment and compare it to the comments as you read through the source. ## Running the Benchmarks @@ -85,10 +86,22 @@ The build will pull and compile several third-party dependencies from the source - Nvidia's [CCCL](https://github.com/nvidia/cccl) for GPU-accelerated algorithms. - Nvidia's [CUTLASS](https://github.com/nvidia/cutlass) for GPU-accelerated Linear Algebra. -To build without Parallel STL, Intel TBB, and CUDA: +To build without Parallel STL, Intel TBB, BLAS, and CUDA: ```sh -cmake -B build_release -D CMAKE_BUILD_TYPE=Release -D USE_INTEL_TBB=OFF -D USE_NVIDIA_CCCL=OFF +cmake -B build_release -D CMAKE_BUILD_TYPE=Release -D USE_INTEL_TBB=OFF -D USE_NVIDIA_CCCL=OFF -D USE_BLAS=OFF +cmake --build build_release --config Release +``` + +To build on MacOS, pulling key dependencies from [Homebrew](https://brew.sh): + +```sh +brew install openblas +cmake -B build_release \ + -D CMAKE_BUILD_TYPE=Release \ + -D CMAKE_C_FLAGS="-I$(brew --prefix openblas)/include" \ + -D CMAKE_CXX_FLAGS="-I$(brew --prefix openblas)/include" \ + -D CMAKE_EXE_LINKER_FLAGS="-L$(brew --prefix openblas)/lib" cmake --build build_release --config Release ``` diff --git a/VERSION b/VERSION index 8877d79..5111446 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.10.5 \ No newline at end of file +0.10.12 \ No newline at end of file diff --git a/less_slow.cpp b/less_slow.cpp index dafffb3..59cb311 100644 --- a/less_slow.cpp +++ b/less_slow.cpp @@ -398,21 +398,16 @@ class aligned_array { type_ *data_ = nullptr; std::size_t size_ = 0; + std::size_t alignment_ = 0; public: -#if defined(_MSC_VER) //! MSVC doesn't support `std::aligned_alloc` yet - aligned_array(std::size_t size, std::size_t alignment = 64) : size_(size) { - data_ = static_cast(_aligned_malloc(sizeof(type_) * size_, alignment)); - if (!data_) throw std::bad_alloc(); + aligned_array(std::size_t size, std::size_t alignment = 64) : size_(size), alignment_(alignment) { + // With `std::aligned_alloc` in C++17, an exception won't be raised, which may be preferred in + // some environments. MSVC was late to adopt it, and developers would often use a combination + // of lower-level `posix_memalign` and `_aligned_malloc`/`_aligned_free` across Unix and Windows. 
+        data_ = (type_ *)::operator new(sizeof(type_) * size_, std::align_val_t(alignment_));
     }
-    ~aligned_array() noexcept { _aligned_free(data_); }
-#else
-    aligned_array(std::size_t size, std::size_t alignment = 64) : size_(size) {
-        data_ = static_cast<type_ *>(std::aligned_alloc(alignment, sizeof(type_) * size_));
-        if (!data_) throw std::bad_alloc();
-    }
-    ~aligned_array() noexcept { std::free(data_); }
-#endif
+    ~aligned_array() noexcept { ::operator delete(data_, sizeof(type_) * size_, std::align_val_t(alignment_)); }
 
     aligned_array(aligned_array const &) = delete;
     aligned_array &operator=(aligned_array const &) = delete;
@@ -992,15 +987,138 @@ static void branch_cost(bm::State &state) {
     for (auto _ : state) {
         std::int32_t random = random_values[(++iteration) & (count - 1)];
         bm::DoNotOptimize( //
-            variable =     //
-            (random & 1)   //
+            variable =     // ! Fun fact: multiplication compiles to a jump,
+            (random & 1)   // ! but replacing it with a bitwise operation results in a conditional move.
                 ? (variable + random)
-                : (variable * random));
+                : (variable ^ random));
     }
 }
 BENCHMARK(branch_cost)->RangeMultiplier(4)->Range(256, 32 * 1024);
+/**
+ * It's hard to tell whether the code above compiles into a conditional move or a jump,
+ * so let's define explicit inline assembly kernels and compare both.
+ */
+#if defined(__GNUC__) && !defined(__clang__) //! GCC-style inline assembly, skipped for Clang and MSVC
+
+#if defined(__x86_64__) || defined(__i386__)
+
+static void branch_cost_cmov(bm::State &state) {
+    auto const count = static_cast<std::size_t>(state.range(0));
+    aligned_array<std::int32_t> random_values(count);
+    std::generate_n(random_values.begin(), count, &std::rand);
+    std::int32_t variable = 0;
+    std::size_t iteration = 0;
+
+    for (auto _ : state) {
+        std::int32_t const random = random_values[(++iteration) & (count - 1)];
+        std::int32_t sum; // early-clobber temp for the LEA result
+
+        asm volatile(                            //
+            "leal (%[var],%[rnd],1), %[sum]\n\t" // sum := variable + random
+            "xorl %[rnd], %[var]\n\t"            // var := variable ^ random
+            "testl $1, %[rnd]\n\t"               // if (random & 1) var := sum
+            "cmovne %[sum], %[var]\n\t"
+            : [var] "+r"(variable), [sum] "=&r"(sum)
+            : [rnd] "r"(random)
+            : "cc");
+        bm::DoNotOptimize(variable);
+    }
+}
+
+static void branch_cost_jump(bm::State &state) {
+    auto const count = static_cast<std::size_t>(state.range(0));
+    aligned_array<std::int32_t> random_values(count);
+    std::generate_n(random_values.begin(), count, &std::rand);
+    std::int32_t variable = 0;
+    std::size_t iteration = 0;
+
+    for (auto _ : state) {
+        std::int32_t const random = random_values[(++iteration) & (count - 1)];
+
+        asm volatile(                 //
+            "testl $1, %[rnd]\n\t"
+            "jnz 1f\n\t"              // if odd -> jump to add
+            "xorl %[rnd], %[var]\n\t" // even: var ^= rnd
+            "jmp 2f\n\t"
+            "1:\n\t"
+            "addl %[rnd], %[var]\n\t" // odd: var += rnd
+            "2:\n\t"
+            : [var] "+r"(variable)
+            : [rnd] "r"(random)
+            : "cc");
+        bm::DoNotOptimize(variable);
+    }
+}
+
+BENCHMARK(branch_cost_cmov)->RangeMultiplier(4)->Range(256, 32 * 1024);
+BENCHMARK(branch_cost_jump)->RangeMultiplier(4)->Range(256, 32 * 1024);
+
+#elif defined(__aarch64__)
+
+static void branch_cost_csel(bm::State &state) {
+    auto const count = static_cast<std::size_t>(state.range(0));
+    aligned_array<std::int32_t> random_values(count);
+    std::generate_n(random_values.begin(), count, &std::rand);
+    std::int32_t variable = 0;
+    std::size_t iteration = 0;
+
+    for (auto _ : state) {
+        std::int32_t const random = random_values[(++iteration) & (count - 1)];
+        std::int32_t sum;
+
+        asm volatile(                           //
+            "add %w[sum], %w[var], %w[rnd]\n\t" // sum := variable + random
+            "eor %w[var], %w[var], %w[rnd]\n\t" // var := variable ^ random
+            "tst %w[rnd], #1\n\t"               // if (random & 1) var := sum
+            "csel %w[var], %w[sum], %w[var], NE\n\t"
+            : [var] "+r"(variable), [sum] "=&r"(sum)
+            : [rnd] "r"(random)
+            : "cc");
+        bm::DoNotOptimize(variable);
+    }
+}
+
+static void branch_cost_branch(bm::State &state) {
+    auto const count = static_cast<std::size_t>(state.range(0));
+    aligned_array<std::int32_t> random_values(count);
+    std::generate_n(random_values.begin(), count, &std::rand);
+    std::int32_t variable = 0;
+    std::size_t iteration = 0;
+
+    for (auto _ : state) {
+        std::int32_t const random = random_values[(++iteration) & (count - 1)];
+
+        asm volatile(                           //
+            "tst %w[rnd], #1\n\t"
+            "b.ne 1f\n\t"                       // if odd -> jump to add
+            "eor %w[var], %w[var], %w[rnd]\n\t" // even: var ^= rnd
+            "b 2f\n\t"
+            "1:\n\t"
+            "add %w[var], %w[var], %w[rnd]\n\t" // odd: var += rnd
+            "2:\n\t"
+            : [var] "+r"(variable)
+            : [rnd] "r"(random)
+            : "cc");
+        bm::DoNotOptimize(variable);
+    }
+}
+
+BENCHMARK(branch_cost_csel)->RangeMultiplier(4)->Range(256, 32 * 1024);
+BENCHMARK(branch_cost_branch)->RangeMultiplier(4)->Range(256, 32 * 1024);
+
+#endif
+
+#endif // __GNUC__ && !__clang__
+
+/**
+ * The results are quite interesting. On Intel:
+ * - `branch_cost` runs at @b 0.7ns up to 4K elements, beyond that it jumps to @b 3.7ns.
+ * - `branch_cost_cmov` consistently runs at @b 1.3ns, regardless of the size.
+ * - `branch_cost_jump` performs similarly to `branch_cost`, but slightly worse.
+ */
+
 #pragma endregion // Branch Prediction
 #pragma region Cache Misses
@@ -1074,28 +1192,54 @@ BENCHMARK(cache_misses_cost)
 * value. This optimization is crucial for performance, especially when dealing
 * with heavy objects.
 */
-#include <optional> // `std::optional`
+struct heavy_t {
+    std::uint64_t data[8];
+
+    heavy_t() noexcept { std::iota(data, data + 8, 0); }
 
-std::optional<std::string> make_heavy_object_mutable() {
-    std::string x(1024, 'x');
+    heavy_t(heavy_t &&) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); }
+    heavy_t(heavy_t const &) { std::this_thread::sleep_for(std::chrono::milliseconds(2)); }
+    heavy_t &operator=(heavy_t &&) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(1));
+        return *this;
+    }
+    heavy_t &operator=(heavy_t const &) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(2));
+        return *this;
+    }
+};
+
+heavy_t make_heavy_object() { return heavy_t {}; }
+
+heavy_t make_named_heavy_object() {
+    heavy_t const x; //! Even with `const`, RVO is possible
     return x;
 }
 
-std::optional<std::string> make_heavy_object_immutable() {
-    std::string const x(1024, 'x'); //! `const` is the only difference
-    return x;
+heavy_t make_conditional_heavy_object() {
+    heavy_t x;
+    heavy_t &x1 = x;
+    heavy_t &x2 = x;
+    static std::size_t counter = 0; //! Condition prevents RVO
+    if (counter++ % 2 == 0) { return x1; }
+    else { return x2; }
 }
 
-static void rvo_friendly(bm::State &state) {
-    for (auto _ : state) bm::DoNotOptimize(make_heavy_object_mutable());
+static void rvo_trivial(bm::State &state) {
+    for (auto _ : state) bm::DoNotOptimize(make_heavy_object());
 }
 
-static void rvo_impossible(bm::State &state) {
-    for (auto _ : state) bm::DoNotOptimize(make_heavy_object_immutable());
+static void rvo_likely(bm::State &state) {
+    for (auto _ : state) bm::DoNotOptimize(make_named_heavy_object());
 }
 
-BENCHMARK(rvo_friendly);
-BENCHMARK(rvo_impossible);
+static void rvo_banned(bm::State &state) {
+    for (auto _ : state) bm::DoNotOptimize(make_conditional_heavy_object());
+}
+
+BENCHMARK(rvo_trivial);
+BENCHMARK(rvo_likely);
+BENCHMARK(rvo_banned);
 
 /**
  * Despite intuition, marking a local object as `const` hurts our performance.
@@ -1334,13 +1478,23 @@ BENCHMARK(integral_division_by_const);
  * Since 64-bit doubles can exactly represent all 32-bit signed integers,
  * this method introduces @b no precision loss, making it a safe and efficient
  * alternative when division performance is critical.
+ *
+ * - The `float` can fit 24-bit integers exactly in its significand/mantissa.
+ * - The `double` can fit 53-bit integers exactly in its significand/mantissa.
  */
+static void integral_division_with_floats(bm::State &state) {
+    std::int32_t a = std::rand(), b = std::rand(), c = 0;
+    for (auto _ : state)
+        bm::DoNotOptimize(c = static_cast<std::int32_t>(static_cast<float>(++a) / static_cast<float>(++b)));
+}
+
 static void integral_division_with_doubles(bm::State &state) {
     std::int32_t a = std::rand(), b = std::rand(), c = 0;
     for (auto _ : state)
        bm::DoNotOptimize(c = static_cast<std::int32_t>(static_cast<double>(++a) / static_cast<double>(++b)));
 }
 
+BENCHMARK(integral_division_with_floats);
 BENCHMARK(integral_division_with_doubles);
 
 /**
@@ -1357,7 +1511,7 @@ BENCHMARK(integral_division_with_doubles);
 * while the internal logic remains identical.
 */
-#if defined(__GNUC__) && !defined(__clang__)
+#if defined(__GNUC__)
 
 #if defined(__x86_64__) || defined(__i386__)
 [[gnu::target("arch=core2")]]
@@ -2633,8 +2787,8 @@ class strided_ptr {
     strided_ptr operator--(int) noexcept { strided_ptr temp = *this; --(*this); return temp; }
     strided_ptr &operator+=(difference_type offset) noexcept { data_ += offset * stride_; return *this; }
     strided_ptr &operator-=(difference_type offset) noexcept { data_ -= offset * stride_; return *this; }
-    strided_ptr operator+(difference_type offset) noexcept { strided_ptr temp = *this; return temp += offset; }
-    strided_ptr operator-(difference_type offset) noexcept { strided_ptr temp = *this; return temp -= offset; }
+    strided_ptr operator+(difference_type offset) const noexcept { strided_ptr temp = *this; return temp += offset; }
+    strided_ptr operator-(difference_type offset) const noexcept { strided_ptr temp = *this; return temp -= offset; }
     friend difference_type operator-(strided_ptr const &a, strided_ptr const &b) noexcept { assert(a.stride_ == b.stride_); return (a.data_ - b.data_) / static_cast<difference_type>(a.stride_); }
     friend bool operator==(strided_ptr const &a, strided_ptr const &b) noexcept { return a.data_ == b.data_; }
     friend bool operator<(strided_ptr const &a, strided_ptr const &b) noexcept { return a.data_ < b.data_; }
@@ -2909,6 +3063,20 @@ std::size_t parse_size_string(std::string const &str) {
 
 #pragma endregion // Non Uniform Memory Access
 
 #pragma region Memory Bound Linear Algebra
+
+#if !defined(USE_BLAS)
+#if defined(__has_include)
+#if __has_include(<cblas.h>)
+#define USE_BLAS 1
+#else
+#define USE_BLAS 0
+#endif // __has_include(<cblas.h>)
+#else
+#define USE_BLAS 0
+#endif // defined(__has_include)
+#endif // !defined(USE_BLAS)
+
+#if USE_BLAS
 #include <cblas.h>
 /**
  * ! OpenBLAS defines a `SIZE` macro for internal use, which conflicts with `fmt`
@@ -2952,6 +3120,8 @@ static void cblas_tops(bm::State &state) {
 BENCHMARK(cblas_tops)->RangeMultiplier(2)->Range(8, 16384)->Complexity(benchmark::oNCubed);
 BENCHMARK(cblas_tops)->RangeMultiplier(2)->Range(8, 16384)->Complexity(benchmark::oNCubed);
 
+#endif // USE_BLAS
+
 /**
  * Eigen is a high-level C++ library for linear algebra that provides a
  * convenient templated API for matrix operations.
@@ -5050,6 +5220,124 @@ BENCHMARK(json_nlohmann)
     ->Name("json_nlohmann")
     ->Threads(physical_cores());
 
+/**
+ * simdjson is designed for high-performance JSON parsing using SIMD instructions.
+ * It provides On-Demand parsing, which is particularly efficient for selective data extraction.
+ */
+#include <simdjson.h>
+
+bool contains_xss_in_simdjson_ondemand(simdjson::ondemand::value element) noexcept {
+
+    // Handle objects
+    if (element.type() == simdjson::ondemand::json_type::object) {
+        simdjson::ondemand::object obj;
+        if (element.get_object().get(obj) == simdjson::SUCCESS) {
+            for (auto sub : obj) {
+                simdjson::ondemand::value val;
+                if (sub.value().get(val) == simdjson::SUCCESS)
+                    if (contains_xss_in_simdjson_ondemand(val)) return true;
+            }
+        }
+        return false;
+    }
+    // Handle arrays
+    else if (element.type() == simdjson::ondemand::json_type::array) {
+        simdjson::ondemand::array arr;
+        if (element.get_array().get(arr) == simdjson::SUCCESS) {
+            for (auto sub : arr) {
+                simdjson::ondemand::value val;
+                if (sub.get(val) == simdjson::SUCCESS)
+                    if (contains_xss_in_simdjson_ondemand(val)) return true;
+            }
+        }
+        return false;
+    }
+    // Handle strings
+    else if (element.type() == simdjson::ondemand::json_type::string) {
+        std::string_view str;
+        if (element.get_string().get(str) == simdjson::SUCCESS)
+            return str.find("<script") != std::string_view::npos;
+    }
+    return false;
+}
+
+bool contains_xss_in_simdjson_dom(simdjson::dom::element element) noexcept {
+    if (element.is_object()) {
+        for (auto [key, val] : element.get_object())
+            if (contains_xss_in_simdjson_dom(val)) return true;
+    }
+    else if (element.is_array()) {
+        for (auto val : element.get_array())
+            if (contains_xss_in_simdjson_dom(val)) return true;
+    }
+    else if (element.is_string()) {
+        std::string_view str = element.get_string();
+        return str.find("<script") != std::string_view::npos;
+    }
+    return false;
+}
+
+static void json_simdjson_ondemand(bm::State &state) {
+    std::size_t bytes_processed = 0;
+    std::size_t iteration = 0;
+
+    // Pre-allocate padded strings outside the hot path
+    simdjson::padded_string padded_strings[3] = {
+        simdjson::padded_string(packets_json[0]),
+        simdjson::padded_string(packets_json[1]),
+        simdjson::padded_string(packets_json[2]),
+    };
+
+    // On-demand parser reuses internal buffers
+    simdjson::ondemand::parser parser;
+    simdjson::ondemand::document doc;
+
+    for (auto _ : state) {
+        std::size_t const packet_index = iteration++ % 3;
+        bytes_processed += packets_json[packet_index].size();
+
+        auto error = parser.iterate(padded_strings[packet_index]).get(doc);
+        if (error == simdjson::SUCCESS) {
+            simdjson::ondemand::value root;
+            if (doc.get_value().get(root) == simdjson::SUCCESS)
+                bm::DoNotOptimize(contains_xss_in_simdjson_ondemand(root));
+        }
+    }
+
+    state.SetBytesProcessed(bytes_processed);
+}
+
+static void json_simdjson_dom(bm::State &state) {
+    std::size_t bytes_processed = 0;
+    std::size_t iteration = 0;
+
+    // Pre-allocate padded strings outside the hot path
+    simdjson::padded_string padded_strings[3] = {
+        simdjson::padded_string(packets_json[0]),
+        simdjson::padded_string(packets_json[1]),
+        simdjson::padded_string(packets_json[2]),
+    };
+
+    // Reuse the state
+    simdjson::dom::parser parser;
+    simdjson::dom::element doc;
+
+    for (auto _ : state) {
+        std::size_t const packet_index = iteration++ % 3;
+        bytes_processed += packets_json[packet_index].size();
+
+        auto error = parser.parse(padded_strings[packet_index]).get(doc);
+        if (error == simdjson::SUCCESS) bm::DoNotOptimize(contains_xss_in_simdjson_dom(doc));
+    }
+
+    state.SetBytesProcessed(bytes_processed);
+}
+
+BENCHMARK(json_simdjson_ondemand)->MinTime(10)->Name("json_simdjson");
+BENCHMARK(json_simdjson_dom)->MinTime(10)->Name("json_simdjson");
+BENCHMARK(json_simdjson_ondemand)->MinTime(10)->Name("json_simdjson")->Threads(physical_cores());
+BENCHMARK(json_simdjson_dom)->MinTime(10)->Name("json_simdjson")->Threads(physical_cores()); + /** * The results for the single-threaded case and the multi-threaded case without * Simultaneous Multi-Threading @b (SMT), with 96 threads on 96 Sapphire Rapids @@ -6043,7 +6331,7 @@ struct log_printf_t { // `std::chrono::high_resolution_clock` is usually just an alias to either `system_clock` or // `steady_clock`. There is debate on whether using it is a good idea at all. // https://en.cppreference.com/w/cpp/chrono/high_resolution_clock -#if defined(_MSC_VER) +#if defined(_MSC_VER) || defined(__APPLE__) auto now = std::chrono::system_clock::now(); #else auto now = std::chrono::high_resolution_clock::now(); @@ -6074,7 +6362,10 @@ struct log_printf_t { BENCHMARK(logging)->Name("log_printf")->MinTime(2); -#if !defined(_MSC_VER) +/** + * Formatting `std::chrono` with `std::format` fails on both Windows and MacOS. + */ +#if !defined(_MSC_VER) && !defined(__APPLE__) #if defined(__cpp_lib_format) #include // `std::format_to_n` diff --git a/less_slow_aarch64.S b/less_slow_aarch64.S index b5e7ba6..57e1a83 100644 --- a/less_slow_aarch64.S +++ b/less_slow_aarch64.S @@ -3,21 +3,27 @@ # Micro-kernels for building a performance-first mindset for 64-bit ARM (NEON). # ---------------------------------------------------------------------------- - .section .text - .global i32_add_asm_kernel +#ifdef __APPLE__ + #define SYMBOL_NAME(name) _##name // Add underscore on macOS +#else + #define SYMBOL_NAME(name) name // No underscore on GNU-based systems +#endif + + .text + .global SYMBOL_NAME(i32_add_asm_kernel) - .global tops_f64_neon_asm_kernel - .global tops_f32_neon_asm_kernel - .global tops_f16_neon_asm_kernel - .global tops_bf16_neon_asm_kernel - .global tops_i8_neon_asm_kernel - .global tops_u8_neon_asm_kernel + .global SYMBOL_NAME(tops_f64_neon_asm_kernel) + .global SYMBOL_NAME(tops_f32_neon_asm_kernel) + .global SYMBOL_NAME(tops_f16_neon_asm_kernel) + .global SYMBOL_NAME(tops_bf16_neon_asm_kernel) + .global SYMBOL_NAME(tops_i8_neon_asm_kernel) + .global SYMBOL_NAME(tops_u8_neon_asm_kernel) # ---------------------------------------------------------------------------- # Simple function that adds two 32-bit integers. # AArch64 ABI: W0 = 'a', W1 = 'b'. Return in W0. # ---------------------------------------------------------------------------- -i32_add_asm_kernel: +SYMBOL_NAME(i32_add_asm_kernel): add w0, w0, w1 ret @@ -26,7 +32,7 @@ i32_add_asm_kernel: # Each FMLA vD.2d, vN.2d, vM.2d => 2 multiplies + 2 adds = 4 FLOPs. # We'll do 10 instructions => 10 × 4 = 40 FLOPs total, returning 40 in W0. # ---------------------------------------------------------------------------- -tops_f64_neon_asm_kernel: +SYMBOL_NAME(tops_f64_neon_asm_kernel): fmla v0.2d, v1.2d, v2.2d fmla v3.2d, v4.2d, v5.2d fmla v6.2d, v7.2d, v8.2d @@ -47,7 +53,7 @@ tops_f64_neon_asm_kernel: # Let's do 10 instructions => 10 × 8 = 80 FLOPs total. # Return 80 in W0. # ---------------------------------------------------------------------------- -tops_f32_neon_asm_kernel: +SYMBOL_NAME(tops_f32_neon_asm_kernel): fmla v0.4s, v1.4s, v2.4s fmla v3.4s, v4.4s, v5.4s fmla v6.4s, v7.4s, v8.4s @@ -68,7 +74,7 @@ tops_f32_neon_asm_kernel: # Each FMLA vD.8h, vN.8h, vM.8h => 8 multiplies + 8 adds = 16 FLOPs. # We'll do 10 instructions => 160 FLOPs total, returning 160 in W0. 
# ---------------------------------------------------------------------------- -tops_f16_neon_asm_kernel: +SYMBOL_NAME(tops_f16_neon_asm_kernel): fmla v0.8h, v1.8h, v2.8h fmla v3.8h, v4.8h, v5.8h fmla v6.8h, v7.8h, v8.8h @@ -89,7 +95,7 @@ tops_f16_neon_asm_kernel: # bfmmla vD.4s, vN.8h, vM.8h => 8 multiplies + 8 adds = 16 FLOPs. # We'll do 10 instructions => 160 FLOPs total, returning 160 in W0. # ---------------------------------------------------------------------------- -tops_bf16_neon_asm_kernel: +SYMBOL_NAME(tops_bf16_neon_asm_kernel): bfmmla v0.4s, v1.8h, v2.8h bfmmla v3.4s, v4.8h, v5.8h bfmmla v6.4s, v7.8h, v8.8h @@ -110,7 +116,7 @@ tops_bf16_neon_asm_kernel: # sdot vD.4s, vN.16b, vM.16b => 16 multiplies + 16 adds = 32 FLOPs. # We'll do 10 instructions => 320 FLOPs total, returning 320 in W0. # ---------------------------------------------------------------------------- -tops_i8_neon_asm_kernel: +SYMBOL_NAME(tops_i8_neon_asm_kernel): sdot v0.4s, v1.16b, v2.16b sdot v3.4s, v4.16b, v5.16b sdot v6.4s, v7.16b, v8.16b @@ -131,7 +137,7 @@ tops_i8_neon_asm_kernel: # udot vD.4s, vN.16b, vM.16b => 16 multiplies + 16 adds = 32 FLOPs. # We'll do 10 instructions => 320 FLOPs total, returning 320 in W0. # ---------------------------------------------------------------------------- -tops_u8_neon_asm_kernel: +SYMBOL_NAME(tops_u8_neon_asm_kernel): udot v0.4s, v1.16b, v2.16b udot v3.4s, v4.16b, v5.16b udot v6.4s, v7.16b, v8.16b @@ -148,5 +154,7 @@ tops_u8_neon_asm_kernel: # ---------------------------------------------------------------------------- # Tell the linker/assembler that we do NOT need an executable stack: +#ifdef __linux__ .section .note.GNU-stack, "", @progbits +#endif # ---------------------------------------------------------------------------- \ No newline at end of file
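
A side note on the `aligned_array` rewrite in less_slow.cpp above: it switches from platform-specific `_aligned_malloc`/`std::aligned_alloc` branching to the portable C++17 aligned `operator new`/`operator delete`. Below is a minimal, self-contained sketch of that idiom; the names and sizes are illustrative and not taken from the repository.

#include <cstddef> // `std::size_t`
#include <cstdint> // `std::uint8_t`
#include <new>     // `std::align_val_t`, aligned `operator new`/`operator delete`

int aligned_allocation_sketch() {
    constexpr std::size_t count = 1024;
    constexpr std::align_val_t alignment {64}; // one cache line on most x86 and Arm cores
    // Over-aligned allocation through the global aligned `operator new`; throws `std::bad_alloc` on failure
    auto *buffer = static_cast<std::uint8_t *>(::operator new(count, alignment));
    buffer[0] = 42; // the returned pointer is guaranteed to be 64-byte aligned
    int first = buffer[0];
    // Sized and aligned deallocation must mirror the allocation
    ::operator delete(buffer, count, alignment);
    return first;
}

Unlike `std::aligned_alloc`, this path reports failure by throwing rather than returning `nullptr`, which is what the destructor-paired `::operator delete` call in `aligned_array` relies on.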
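
On the `integral_division_with_floats` addition: the exactness claim in the comment follows from significand widths, and it can be checked at compile time. A small illustrative snippet, with the helper name `divide_via_double` assumed for the example rather than taken from the repository:

#include <cstdint> // `std::int32_t`
#include <limits>  // `std::numeric_limits`

// `float` carries a 24-bit significand and `double` a 53-bit one (both counts include the implicit bit),
// so every 32-bit integer survives a round-trip through `double`, but not necessarily through `float`.
static_assert(std::numeric_limits<float>::digits == 24, "binary32 significand width");
static_assert(std::numeric_limits<double>::digits == 53, "binary64 significand width");
static_assert(std::numeric_limits<double>::digits > 32, "every `std::int32_t` is exactly representable as `double`");

constexpr std::int32_t divide_via_double(std::int32_t a, std::int32_t b) {
    return static_cast<std::int32_t>(static_cast<double>(a) / static_cast<double>(b));
}
static_assert(divide_via_double(2'000'000'000, 3) == 666'666'666, "truncation matches integer division");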
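
On the `SYMBOL_NAME` macro added to less_slow_aarch64.S: Mach-O prefixes C symbols with an underscore, and the compiler applies that prefix automatically to any `extern "C"` declaration, so only the assembly labels change while the C++ side stays identical across Linux and macOS. A hedged sketch of such a binding follows; the actual declaration in less_slow.cpp may differ, and the snippet only links when the assembly file is assembled into the same target.

#include <cstdint> // `std::int32_t`

// Assumed signature, matching the comment in the .S file: W0 = 'a', W1 = 'b', result in W0.
extern "C" std::int32_t i32_add_asm_kernel(std::int32_t a, std::int32_t b);

std::int32_t call_asm_kernel() {
    return i32_add_asm_kernel(2, 3); // expected to return 5 once linked against less_slow_aarch64.S
}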