From a66cfe21e9b29c621fd782e780049faea42bddb5 Mon Sep 17 00:00:00 2001 From: bmanga Date: Sun, 20 Apr 2025 12:54:10 +0200 Subject: [PATCH 01/19] Fix: Aligned allocation (#42) Uses `operator new` overload with alignment. Stores the alignment for the matching `delete` call. Closes #41 --------- Co-authored-by: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> --- less_slow.cpp | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/less_slow.cpp b/less_slow.cpp index dafffb3..5845230 100644 --- a/less_slow.cpp +++ b/less_slow.cpp @@ -398,21 +398,16 @@ class aligned_array { type_ *data_ = nullptr; std::size_t size_ = 0; + std::size_t alignment_ = 0; public: -#if defined(_MSC_VER) //! MSVC doesn't support `std::aligned_alloc` yet - aligned_array(std::size_t size, std::size_t alignment = 64) : size_(size) { - data_ = static_cast(_aligned_malloc(sizeof(type_) * size_, alignment)); - if (!data_) throw std::bad_alloc(); + aligned_array(std::size_t size, std::size_t alignment = 64) : size_(size), alignment_(alignment) { + // With `std::aligned_alloc` in C++17, an exception won't be raised, which may be preferred in + // some environments. MSVC was late to adopt it, and developers would often use a combination + // of lower-level `posix_memalign` and `_aligned_malloc`/`_aligned_free` across Unix and Windows. 
+ data_ = (type_ *)::operator new(sizeof(type_) * size_, std::align_val_t(alignment_)); } - ~aligned_array() noexcept { _aligned_free(data_); } -#else - aligned_array(std::size_t size, std::size_t alignment = 64) : size_(size) { - data_ = static_cast(std::aligned_alloc(alignment, sizeof(type_) * size_)); - if (!data_) throw std::bad_alloc(); - } - ~aligned_array() noexcept { std::free(data_); } -#endif + ~aligned_array() noexcept { ::operator delete(data_, sizeof(type_) * size_, std::align_val_t(alignment_)); } aligned_array(aligned_array const &) = delete; aligned_array &operator=(aligned_array const &) = delete; From ab7bf3f22b2b4a55c412c4208d022b52d8334755 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 20 Apr 2025 11:03:56 +0000 Subject: [PATCH 02/19] Docs: Notes on `#pragma region`s --- README.md | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 93e3735..349855a 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,18 @@ -# Playing Around _Less Slow_ Coding Practices for C++, C, and Assembly Code +# Playing Around _Less Slow_ Coding Practices for C++, CUDA, and Assembly Code -> The benchmarks in this repository don’t aim to cover every topic entirely, but they help form a mindset and intuition for performance-oriented software design. -> It also provides an example of using some non-[STL](https://en.wikipedia.org/wiki/Standard_Template_Library) but de facto standard libraries in C++, importing them via CMake and compiling from source. -> For higher-level abstractions and languages, check out [`less_slow.rs`](https://github.com/ashvardanian/less_slow.rs) and [`less_slow.py`](https://github.com/ashvardanian/less_slow.py). -> I needed many of these measurements to reconsider my own coding habits, but hopefully they’re helpful to others as well. 
-> Most of the code is organized in very long, ordered, and `#pragma`-sectioned files — not necessarily the preferred form for everyone. +> The benchmarks in this repository don't aim to cover every topic entirely, but they help form a mindset and intuition for performance-oriented software design. +> It also provides an example of using some non-[STL](https://en.wikipedia.org/wiki/Standard_Template_Library) but de facto standard libraries in C++, importing them via CMake and compiling from source. +> For higher-level abstractions and languages, check out [`less_slow.rs`](https://github.com/ashvardanian/less_slow.rs) and [`less_slow.py`](https://github.com/ashvardanian/less_slow.py). +> I needed many of these measurements to reconsider my own coding habits, but hopefully they're helpful to others as well. +> Most of the code is organized in very long, ordered, and nested `#pragma` sections — not necessarily the preferred form for everyone. -Much of modern code suffers from common pitfalls — bugs, security vulnerabilities, and __performance bottlenecks__. -University curricula and coding bootcamps tend to stick to traditional coding styles and standard features, rarely exposing the more fun, unusual, and efficient design opportunities. +Much of modern code suffers from common pitfalls — bugs, security vulnerabilities, and __performance bottlenecks__. +University curricula and coding bootcamps tend to stick to traditional coding styles and standard features, rarely exposing the more fun, unusual, and potentially efficient design opportunities. +This repository explores just that. ![Less Slow C++](https://github.com/ashvardanian/ashvardanian/blob/master/repositories/less_slow.cpp.jpg?raw=true) -This repository offers practical minimalistic examples of writing efficient C and C++ code. -It leverages C++20 features and is designed primarily for GCC and Clang compilers on Linux, though it may work on other platforms. 
+The code leverages C++20 and CUDA features and is designed primarily for GCC, Clang, and NVCC compilers on Linux, though it may work on other platforms. The topics range from basic micro-kernels executing in a few nanoseconds to more complex constructs involving parallel algorithms, coroutines, and polymorphism. Some of the highlights include: @@ -40,6 +40,7 @@ Some of the highlights include: - __What are Encrypted Enclaves__ and what's the latency of Intel SGX, AMD SEV, and ARM Realm? 🔜 #31 To read, jump to the [`less_slow.cpp` source file](https://github.com/ashvardanian/less_slow.cpp/blob/main/less_slow.cpp) and read the code snippets and comments. +Keep in mind, that most modern IDEs have a navigation bar to help you view and jump between `#pragma region` sections. Follow the instructions below to run the code in your environment and compare it to the comments as you read through the source. ## Running the Benchmarks From 966d168e7a07451348a234e17acc1dbb145d7ccb Mon Sep 17 00:00:00 2001 From: TinySemVer Date: Sun, 20 Apr 2025 11:04:57 +0000 Subject: [PATCH 03/19] Release: v0.10.6 [skip ci] ### Patch - Docs: Notes on `#pragma region`s (ab7bf3f) - Fix: Aligned allocation (#42) (a66cfe2) --- CMakeLists.txt | 2 +- VERSION | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2a028c1..92f7b29 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ cmake_minimum_required(VERSION 3.25.2 FATAL_ERROR) # ------------------------------------------------------------------------------ project( less_slow - VERSION 0.10.5 + VERSION 0.10.6 LANGUAGES C CXX ASM DESCRIPTION "Learning how to write Less Slow code, from numerical micro-kernels and SIMD to coroutines, ranges, and polymorphic state machines" diff --git a/VERSION b/VERSION index 8877d79..6348c0f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.10.5 \ No newline at end of file +0.10.6 \ No newline at end of file From ecff6e35e71a77cea020391a427eb8880efc6dbe 
Mon Sep 17 00:00:00 2001 From: bmanga Date: Sun, 20 Apr 2025 21:30:21 +0200 Subject: [PATCH 04/19] Improve: Include Asm tests into macOS Arm builds (#45) --- CMakeLists.txt | 5 ++++- less_slow.cpp | 2 +- less_slow_aarch64.S | 38 +++++++++++++++++++++++--------------- 3 files changed, 28 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 92f7b29..aebb41e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -343,7 +343,10 @@ set_target_properties(less_slow PROPERTIES POSITION_INDEPENDENT_CODE ON) if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|AMD64|x64") set_source_files_properties(less_slow_amd64.S PROPERTIES LANGUAGE ASM) target_sources(less_slow PRIVATE less_slow_amd64.S) -elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64") +elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64|arm64") + if (APPLE) + set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -march=armv8.6-a+bf16") + endif() set_source_files_properties(less_slow_aarch64.S PROPERTIES LANGUAGE ASM) target_sources(less_slow PRIVATE less_slow_aarch64.S) endif () diff --git a/less_slow.cpp b/less_slow.cpp index 5845230..dab29da 100644 --- a/less_slow.cpp +++ b/less_slow.cpp @@ -1352,7 +1352,7 @@ BENCHMARK(integral_division_with_doubles); * while the internal logic remains identical. */ -#if defined(__GNUC__) && !defined(__clang__) +#if defined(__GNUC__) #if defined(__x86_64__) || defined(__i386__) [[gnu::target("arch=core2")]] diff --git a/less_slow_aarch64.S b/less_slow_aarch64.S index b5e7ba6..57e1a83 100644 --- a/less_slow_aarch64.S +++ b/less_slow_aarch64.S @@ -3,21 +3,27 @@ # Micro-kernels for building a performance-first mindset for 64-bit ARM (NEON). 
# ---------------------------------------------------------------------------- - .section .text - .global i32_add_asm_kernel +#ifdef __APPLE__ + #define SYMBOL_NAME(name) _##name // Add underscore on macOS +#else + #define SYMBOL_NAME(name) name // No underscore on GNU-based systems +#endif + + .text + .global SYMBOL_NAME(i32_add_asm_kernel) - .global tops_f64_neon_asm_kernel - .global tops_f32_neon_asm_kernel - .global tops_f16_neon_asm_kernel - .global tops_bf16_neon_asm_kernel - .global tops_i8_neon_asm_kernel - .global tops_u8_neon_asm_kernel + .global SYMBOL_NAME(tops_f64_neon_asm_kernel) + .global SYMBOL_NAME(tops_f32_neon_asm_kernel) + .global SYMBOL_NAME(tops_f16_neon_asm_kernel) + .global SYMBOL_NAME(tops_bf16_neon_asm_kernel) + .global SYMBOL_NAME(tops_i8_neon_asm_kernel) + .global SYMBOL_NAME(tops_u8_neon_asm_kernel) # ---------------------------------------------------------------------------- # Simple function that adds two 32-bit integers. # AArch64 ABI: W0 = 'a', W1 = 'b'. Return in W0. # ---------------------------------------------------------------------------- -i32_add_asm_kernel: +SYMBOL_NAME(i32_add_asm_kernel): add w0, w0, w1 ret @@ -26,7 +32,7 @@ i32_add_asm_kernel: # Each FMLA vD.2d, vN.2d, vM.2d => 2 multiplies + 2 adds = 4 FLOPs. # We'll do 10 instructions => 10 × 4 = 40 FLOPs total, returning 40 in W0. # ---------------------------------------------------------------------------- -tops_f64_neon_asm_kernel: +SYMBOL_NAME(tops_f64_neon_asm_kernel): fmla v0.2d, v1.2d, v2.2d fmla v3.2d, v4.2d, v5.2d fmla v6.2d, v7.2d, v8.2d @@ -47,7 +53,7 @@ tops_f64_neon_asm_kernel: # Let's do 10 instructions => 10 × 8 = 80 FLOPs total. # Return 80 in W0. 
# ---------------------------------------------------------------------------- -tops_f32_neon_asm_kernel: +SYMBOL_NAME(tops_f32_neon_asm_kernel): fmla v0.4s, v1.4s, v2.4s fmla v3.4s, v4.4s, v5.4s fmla v6.4s, v7.4s, v8.4s @@ -68,7 +74,7 @@ tops_f32_neon_asm_kernel: # Each FMLA vD.8h, vN.8h, vM.8h => 8 multiplies + 8 adds = 16 FLOPs. # We'll do 10 instructions => 160 FLOPs total, returning 160 in W0. # ---------------------------------------------------------------------------- -tops_f16_neon_asm_kernel: +SYMBOL_NAME(tops_f16_neon_asm_kernel): fmla v0.8h, v1.8h, v2.8h fmla v3.8h, v4.8h, v5.8h fmla v6.8h, v7.8h, v8.8h @@ -89,7 +95,7 @@ tops_f16_neon_asm_kernel: # bfmmla vD.4s, vN.8h, vM.8h => 8 multiplies + 8 adds = 16 FLOPs. # We'll do 10 instructions => 160 FLOPs total, returning 160 in W0. # ---------------------------------------------------------------------------- -tops_bf16_neon_asm_kernel: +SYMBOL_NAME(tops_bf16_neon_asm_kernel): bfmmla v0.4s, v1.8h, v2.8h bfmmla v3.4s, v4.8h, v5.8h bfmmla v6.4s, v7.8h, v8.8h @@ -110,7 +116,7 @@ tops_bf16_neon_asm_kernel: # sdot vD.4s, vN.16b, vM.16b => 16 multiplies + 16 adds = 32 FLOPs. # We'll do 10 instructions => 320 FLOPs total, returning 320 in W0. # ---------------------------------------------------------------------------- -tops_i8_neon_asm_kernel: +SYMBOL_NAME(tops_i8_neon_asm_kernel): sdot v0.4s, v1.16b, v2.16b sdot v3.4s, v4.16b, v5.16b sdot v6.4s, v7.16b, v8.16b @@ -131,7 +137,7 @@ tops_i8_neon_asm_kernel: # udot vD.4s, vN.16b, vM.16b => 16 multiplies + 16 adds = 32 FLOPs. # We'll do 10 instructions => 320 FLOPs total, returning 320 in W0. 
# ---------------------------------------------------------------------------- -tops_u8_neon_asm_kernel: +SYMBOL_NAME(tops_u8_neon_asm_kernel): udot v0.4s, v1.16b, v2.16b udot v3.4s, v4.16b, v5.16b udot v6.4s, v7.16b, v8.16b @@ -148,5 +154,7 @@ tops_u8_neon_asm_kernel: # ---------------------------------------------------------------------------- # Tell the linker/assembler that we do NOT need an executable stack: +#ifdef __linux__ .section .note.GNU-stack, "", @progbits +#endif # ---------------------------------------------------------------------------- \ No newline at end of file From 35824e648514dd6a8a8882a74c3382cf94e11c68 Mon Sep 17 00:00:00 2001 From: TinySemVer Date: Sun, 20 Apr 2025 19:30:33 +0000 Subject: [PATCH 05/19] Release: v0.10.7 [skip ci] ### Patch - Improve: Include Asm tests into macOS Arm builds (#45) (ecff6e3) --- CMakeLists.txt | 2 +- VERSION | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index aebb41e..032e913 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ cmake_minimum_required(VERSION 3.25.2 FATAL_ERROR) # ------------------------------------------------------------------------------ project( less_slow - VERSION 0.10.6 + VERSION 0.10.7 LANGUAGES C CXX ASM DESCRIPTION "Learning how to write Less Slow code, from numerical micro-kernels and SIMD to coroutines, ranges, and polymorphic state machines" diff --git a/VERSION b/VERSION index 6348c0f..a988815 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.10.6 \ No newline at end of file +0.10.7 \ No newline at end of file From 4d00abad86a2359b1c1b4512b9c98a3e45e81500 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 22 Apr 2025 12:22:51 +0100 Subject: [PATCH 06/19] Fix: Can't `std::format(time)` on macOS Co-authored-by: Armin Stepanyan <12305910+ab-10@users.noreply.github.com> --- less_slow.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git 
a/less_slow.cpp b/less_slow.cpp index dab29da..8ce7d1f 100644 --- a/less_slow.cpp +++ b/less_slow.cpp @@ -6038,7 +6038,7 @@ struct log_printf_t { // `std::chrono::high_resolution_clock` is usually just an alias to either `system_clock` or // `steady_clock`. There is debate on whether using it is a good idea at all. // https://en.cppreference.com/w/cpp/chrono/high_resolution_clock -#if defined(_MSC_VER) +#if defined(_MSC_VER) || defined(__APPLE__) auto now = std::chrono::system_clock::now(); #else auto now = std::chrono::high_resolution_clock::now(); @@ -6069,7 +6069,10 @@ struct log_printf_t { BENCHMARK(logging)->Name("log_printf")->MinTime(2); -#if !defined(_MSC_VER) +/** + * Formatting `std::chrono` with `std::format` fails on both Windows and MacOS. + */ +#if !defined(_MSC_VER) && !defined(__APPLE__) #if defined(__cpp_lib_format) #include // `std::format_to_n` From 91207235346c21682b3aaebc385727df65d6b5d2 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 22 Apr 2025 12:23:26 +0100 Subject: [PATCH 07/19] Fix: Missing `const` qualifiers in `strided_ptr` Co-authored-by: Armin Stepanyan <12305910+ab-10@users.noreply.github.com> --- less_slow.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/less_slow.cpp b/less_slow.cpp index 8ce7d1f..9c124e7 100644 --- a/less_slow.cpp +++ b/less_slow.cpp @@ -2628,8 +2628,8 @@ class strided_ptr { strided_ptr operator--(int) noexcept { strided_ptr temp = *this; --(*this); return temp; } strided_ptr &operator+=(difference_type offset) noexcept { data_ += offset * stride_; return *this; } strided_ptr &operator-=(difference_type offset) noexcept { data_ -= offset * stride_; return *this; } - strided_ptr operator+(difference_type offset) noexcept { strided_ptr temp = *this; return temp += offset; } - strided_ptr operator-(difference_type offset) noexcept { strided_ptr temp = *this; return temp -= offset; } + strided_ptr operator+(difference_type offset) const 
noexcept { strided_ptr temp = *this; return temp += offset; } + strided_ptr operator-(difference_type offset) const noexcept { strided_ptr temp = *this; return temp -= offset; } friend difference_type operator-(strided_ptr const &a, strided_ptr const &b) noexcept { assert(a.stride_ == b.stride_); return (a.data_ - b.data_) / static_cast(a.stride_); } friend bool operator==(strided_ptr const &a, strided_ptr const &b) noexcept { return a.data_ == b.data_; } friend bool operator<(strided_ptr const &a, strided_ptr const &b) noexcept { return a.data_ < b.data_; } From be4a0bedc85c49a53b4260706d9fa9e08ac100e2 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 22 Apr 2025 12:25:29 +0100 Subject: [PATCH 08/19] Docs: OpenBLAS installation on MacOS Co-authored-by: Armin Stepanjans <12305910+ab-10@users.noreply.github.com> --- README.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 349855a..c7071e2 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ Some of the highlights include: - __Scaling AI?__ Measure the gap between theoretical [ALU](https://en.wikipedia.org/wiki/Arithmetic_logic_unit) throughput and your [BLAS](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms). - __How many if conditions are too many?__ Test your CPU's branch predictor with just 10 lines of code. - __Prefer recursion to iteration?__ Measure the depth at which your algorithm will [`SEGFAULT`](https://en.wikipedia.org/wiki/Segmentation_fault). -- __Why avoid exceptions?__ Take `std::error_code` or [`std::variant`](https://en.cppreference.com/w/cpp/utility/variant)-like wrappers? +- __Why avoid exceptions?__ Take `std::error_code` or [`std::variant`](https://en.cppreference.com/w/cpp/utility/variant)-like [ADTs](https://en.wikipedia.org/wiki/Algebraic_data_type)? 
- __Scaling to many cores?__ Learn how to use [OpenMP](https://en.wikipedia.org/wiki/OpenMP), Intel's oneTBB, or your custom thread pool. - __How to handle [JSON](https://www.json.org/json-en.html) avoiding memory allocations?__ Is it easier with C++ 20 or old-school C 99 tools? - __How to properly use STL's associative containers__ with custom keys and transparent comparators? @@ -93,6 +93,18 @@ cmake -B build_release -D CMAKE_BUILD_TYPE=Release -D USE_INTEL_TBB=OFF -D USE_N cmake --build build_release --config Release ``` +To build on MacOS, pulling key dependencies from [Homebrew](https://brew.sh): + +```sh +brew install openblas +cmake -B build_release \ + -D CMAKE_BUILD_TYPE=Release \ + -D CMAKE_C_FLAGS="-I$(brew --prefix openblas)/include" \ + -D CMAKE_CXX_FLAGS="-I$(brew --prefix openblas)/include" \ + -D CMAKE_EXE_LINKER_FLAGS="-L$(brew --prefix openblas)/lib" +cmake --build build_release --config Release +``` + To control the output or run specific benchmarks, use the following flags: ```sh From 2c44f558ec4b6fdf80a56e522b51bed84b9b3146 Mon Sep 17 00:00:00 2001 From: TinySemVer Date: Tue, 22 Apr 2025 11:25:47 +0000 Subject: [PATCH 09/19] Release: v0.10.8 [skip ci] ### Patch - Docs: OpenBLAS installation on MacOS (be4a0be) - Fix: Missing `const` qualifiers in `strided_ptr` (9120723) - Fix: Can't `std::format(time)` on macOS (4d00aba) --- CMakeLists.txt | 2 +- VERSION | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 032e913..5f4ff7d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ cmake_minimum_required(VERSION 3.25.2 FATAL_ERROR) # ------------------------------------------------------------------------------ project( less_slow - VERSION 0.10.7 + VERSION 0.10.8 LANGUAGES C CXX ASM DESCRIPTION "Learning how to write Less Slow code, from numerical micro-kernels and SIMD to coroutines, ranges, and polymorphic state machines" diff --git a/VERSION b/VERSION index a988815..933f18d 100644 --- 
a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.10.7 \ No newline at end of file +0.10.8 \ No newline at end of file From a27448f5676e42eb74ac8f8a2ae334afaa9bea58 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 19 May 2025 06:37:28 +0000 Subject: [PATCH 10/19] Make: `USE_BLAS` option --- .vscode/settings.json | 1 + CMakeLists.txt | 32 ++++++++++++++++++++++++-------- README.md | 4 ++-- less_slow.cpp | 16 ++++++++++++++++ 4 files changed, 43 insertions(+), 10 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 334f1f4..7b828ca 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -64,6 +64,7 @@ "Klemens", "Kohlhoff", "Kulukundis", + "LAPACK", "Lelbach", "Lemire", "Lib", diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f4ff7d..7605651 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,11 +72,22 @@ else () set(_SHOULD_USE_INTEL_TBB OFF) endif () +# Probe for BLAS support +set(CMAKE_FIND_LIBRARY_PREFIXES ";lib") +find_package(BLAS QUIET) +if (BLAS_FOUND) + set(_SHOULD_USE_BLAS ON) +else () + set(_SHOULD_USE_BLAS OFF) +endif () + option(USE_INTEL_TBB "Use Intel TBB for parallel STL algorithms" ${_SHOULD_USE_INTEL_TBB}) option(USE_NVIDIA_CCCL "Use Nvidia CCCL for CUDA acceleration" ${_SHOULD_USE_NVIDIA_CCCL}) +option(USE_BLAS "Use BLAS for linear algebra" ${_SHOULD_USE_BLAS}) message(STATUS "USE_INTEL_TBB: ${USE_INTEL_TBB}") message(STATUS "USE_NVIDIA_CCCL: ${USE_NVIDIA_CCCL}") +message(STATUS "USE_BLAS: ${USE_BLAS}") # ------------------------------------------------------------------------------ # Dependencies @@ -102,13 +113,14 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -w") # ~~~ # # Moreover, CMake sometimes fails to find it on Windows: https://stackoverflow.com/a/78335726/2766161 -set(CMAKE_FIND_LIBRARY_PREFIXES ";lib") -find_package(BLAS REQUIRED) +if (USE_BLAS) + find_package(BLAS REQUIRED) -include(CheckFunctionExists) -check_function_exists(openblas_set_num_threads 
LESS_SLOW_HAS_OPENBLAS_SET_NUM_THREADS) -if (LESS_SLOW_HAS_OPENBLAS_SET_NUM_THREADS) - add_definitions(-DLESS_SLOW_HAS_OPENBLAS_SET_NUM_THREADS) + include(CheckFunctionExists) + check_function_exists(openblas_set_num_threads LESS_SLOW_HAS_OPENBLAS_SET_NUM_THREADS) + if (LESS_SLOW_HAS_OPENBLAS_SET_NUM_THREADS) + add_definitions(-D LESS_SLOW_HAS_OPENBLAS_SET_NUM_THREADS) + endif () endif () # GTest (required by Google Benchmark) @@ -346,7 +358,7 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|AMD64|x64") elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64|arm64") if (APPLE) set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -march=armv8.6-a+bf16") - endif() + endif () set_source_files_properties(less_slow_aarch64.S PROPERTIES LANGUAGE ASM) target_sources(less_slow PRIVATE less_slow_aarch64.S) endif () @@ -431,6 +443,7 @@ endif () # ------------------------------------------------------------------------------ target_compile_definitions(less_slow PRIVATE USE_NVIDIA_CCCL=$) target_compile_definitions(less_slow PRIVATE USE_INTEL_TBB=$) +target_compile_definitions(less_slow PRIVATE USE_BLAS=$) target_link_libraries( less_slow PRIVATE Threads::Threads @@ -448,9 +461,12 @@ target_link_libraries( absl::flat_hash_map nlohmann_json::nlohmann_json Eigen3::Eigen - ${BLAS_LIBRARIES} ) +if (USE_BLAS) + target_link_libraries(less_slow PRIVATE ${BLAS_LIBRARIES}) +endif () + if (CMAKE_SYSTEM_NAME STREQUAL "Linux") # target_include_directories(less_slow PRIVATE ${LIBURING_INCLUDE_DIRS}) target_link_libraries(less_slow PRIVATE ${LIBURING_LIBRARIES}) diff --git a/README.md b/README.md index c7071e2..59e7fa9 100644 --- a/README.md +++ b/README.md @@ -86,10 +86,10 @@ The build will pull and compile several third-party dependencies from the source - Nvidia's [CCCL](https://github.com/nvidia/cccl) for GPU-accelerated algorithms. - Nvidia's [CUTLASS](https://github.com/nvidia/cutlass) for GPU-accelerated Linear Algebra. 
-To build without Parallel STL, Intel TBB, and CUDA: +To build without Parallel STL, Intel TBB, BLAS, and CUDA: ```sh -cmake -B build_release -D CMAKE_BUILD_TYPE=Release -D USE_INTEL_TBB=OFF -D USE_NVIDIA_CCCL=OFF +cmake -B build_release -D CMAKE_BUILD_TYPE=Release -D USE_INTEL_TBB=OFF -D USE_NVIDIA_CCCL=OFF -D USE_BLAS=OFF cmake --build build_release --config Release ``` diff --git a/less_slow.cpp b/less_slow.cpp index 9c124e7..fbe5373 100644 --- a/less_slow.cpp +++ b/less_slow.cpp @@ -2904,6 +2904,20 @@ std::size_t parse_size_string(std::string const &str) { #pragma endregion // Non Uniform Memory Access #pragma region Memory Bound Linear Algebra + +#if !defined(USE_BLAS) +#if defined(__has_include) +#if __has_include() +#define USE_BLAS 1 +#else +#define USE_BLAS 0 +#endif // __has_include() +#else +#define USE_BLAS 0 +#endif // defined(__has_include) +#endif // !defined(USE_BLAS) + +#if USE_BLAS #include /** * ! OpenBLAS defines a `SIZE` macro for internal use, which conflicts with `fmt` @@ -2947,6 +2961,8 @@ static void cblas_tops(bm::State &state) { BENCHMARK(cblas_tops)->RangeMultiplier(2)->Range(8, 16384)->Complexity(benchmark::oNCubed); BENCHMARK(cblas_tops)->RangeMultiplier(2)->Range(8, 16384)->Complexity(benchmark::oNCubed); +#endif // USE_BLAS + /** * Eigen is a high-level C++ library for linear algebra that provides a * convenient templated API for matrix operations. 
From b21507f7143f8175b92d0b2b2d827b3bd4bb081b Mon Sep 17 00:00:00 2001 From: TinySemVer Date: Mon, 19 May 2025 06:37:43 +0000 Subject: [PATCH 11/19] Release: v0.10.9 [skip ci] ### Patch - Make: `USE_BLAS` option (a27448f) --- CMakeLists.txt | 2 +- VERSION | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7605651..6274579 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ cmake_minimum_required(VERSION 3.25.2 FATAL_ERROR) # ------------------------------------------------------------------------------ project( less_slow - VERSION 0.10.8 + VERSION 0.10.9 LANGUAGES C CXX ASM DESCRIPTION "Learning how to write Less Slow code, from numerical micro-kernels and SIMD to coroutines, ranges, and polymorphic state machines" diff --git a/VERSION b/VERSION index 933f18d..75955dd 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.10.8 \ No newline at end of file +0.10.9 \ No newline at end of file From 8aa99216d9a6844b3c5efe5669cc477d1c598af9 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 12 Aug 2025 11:16:34 +0000 Subject: [PATCH 12/19] Improve: Division via `float`s --- less_slow.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/less_slow.cpp b/less_slow.cpp index fbe5373..537562d 100644 --- a/less_slow.cpp +++ b/less_slow.cpp @@ -1329,13 +1329,23 @@ BENCHMARK(integral_division_by_const); * Since 64-bit doubles can exactly represent all 32-bit signed integers, * this method introduces @b no precision loss, making it a safe and efficient * alternative when division performance is critical. + * + * - The `float` can fit 24-bit integers exactly in its significand/mantissa. + * - The `double` can fit 52-bit integers exactly in its significand/mantissa. 
*/ +static void integral_division_with_floats(bm::State &state) { + std::int32_t a = std::rand(), b = std::rand(), c = 0; + for (auto _ : state) + bm::DoNotOptimize(c = static_cast(static_cast(++a) / static_cast(++b))); +} + static void integral_division_with_doubles(bm::State &state) { std::int32_t a = std::rand(), b = std::rand(), c = 0; for (auto _ : state) bm::DoNotOptimize(c = static_cast(static_cast(++a) / static_cast(++b))); } +BENCHMARK(integral_division_with_floats); BENCHMARK(integral_division_with_doubles); /** From c0a3b121c08b24c64b09af9f2b32f6924ef5f62e Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 12 Aug 2025 11:20:52 +0000 Subject: [PATCH 13/19] Improve: `jmp` vs `cmov` --- less_slow.cpp | 173 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 161 insertions(+), 12 deletions(-) diff --git a/less_slow.cpp b/less_slow.cpp index 537562d..4126e0e 100644 --- a/less_slow.cpp +++ b/less_slow.cpp @@ -996,6 +996,129 @@ static void branch_cost(bm::State &state) { BENCHMARK(branch_cost)->RangeMultiplier(4)->Range(256, 32 * 1024); +/** + * It's hard to reason if the above code should compile into a conditional move or a jump, + * so let's define explicit inline assembly kernels and compare both. + */ +#if defined(__GNUC__) && !defined(__clang__) //! 
GCC/Clang inline asm note in your code, keep MSVC out + +#if defined(__x86_64__) || defined(__i386__) + +static void branch_cost_cmov(bm::State &state) { + auto const count = static_cast(state.range(0)); + aligned_array random_values(count); + std::generate_n(random_values.begin(), count, &std::rand); + std::int32_t variable = 0; + std::size_t iteration = 0; + + for (auto _ : state) { + std::int32_t const random = random_values[(++iteration) & (count - 1)]; + std::int32_t sum; // early-clobber temp for LEA result + + asm volatile( // + "leal (%[var],%[rnd],1), %[sum]\n\t" // sum := variable + random + "imull %[rnd], %[var]\n\t" // var := variable * random + "testl $1, %[rnd]\n\t" // if (random & 1) var := sum + "cmovne %[sum], %[var]\n\t" + : [var] "+r"(variable), [sum] "=&r"(sum) + : [rnd] "r"(random) + : "cc"); + bm::DoNotOptimize(variable); + } +} + +static void branch_cost_jump(bm::State &state) { + auto const count = static_cast(state.range(0)); + aligned_array random_values(count); + std::generate_n(random_values.begin(), count, &std::rand); + std::int32_t variable = 0; + std::size_t iteration = 0; + + for (auto _ : state) { + std::int32_t const random = random_values[(++iteration) & (count - 1)]; + + asm volatile( // + "testl $1, %[rnd]\n\t" + "jnz 1f\n\t" // if odd -> jump to add + "imull %[rnd], %[var]\n\t" // even: var *= rnd + "jmp 2f\n\t" + "1:\n\t" + "addl %[rnd], %[var]\n\t" // odd: var += rnd + "2:\n\t" + : [var] "+r"(variable) + : [rnd] "r"(random) + : "cc"); + bm::DoNotOptimize(variable); + } +} + +BENCHMARK(branch_cost_cmov)->RangeMultiplier(4)->Range(256, 32 * 1024); +BENCHMARK(branch_cost_jump)->RangeMultiplier(4)->Range(256, 32 * 1024); + +#elif defined(__aarch64__) + +static void branch_cost_csel(bm::State &state) { + auto const count = static_cast(state.range(0)); + aligned_array random_values(count); + std::generate_n(random_values.begin(), count, &std::rand); + std::int32_t variable = 0; + std::size_t iteration = 0; + + for (auto _ : state) 
{ + std::int32_t const random = random_values[(++iteration) & (count - 1)]; + std::int32_t sum; + + asm volatile( // + "add %w[sum], %w[var], %w[rnd]\n\t" // sum := variable + random + "mul %w[var], %w[var], %w[rnd]\n\t" // var := variable * random + "tst %w[rnd], #1\n\t" // if (random & 1) var := sum + "csel %w[var], %w[sum], %w[var], NE\n\t" + : [var] "+r"(variable), [sum] "=&r"(sum) + : [rnd] "r"(random) + : "cc"); + bm::DoNotOptimize(variable); + } +} + +static void branch_cost_branch(bm::State &state) { + auto const count = static_cast(state.range(0)); + aligned_array random_values(count); + std::generate_n(random_values.begin(), count, &std::rand); + std::int32_t variable = 0; + std::size_t iteration = 0; + + for (auto _ : state) { + std::int32_t const random = random_values[(++iteration) & (count - 1)]; + + asm volatile( // + "tst %w[rnd], #1\n\t" + "b.ne 1f\n\t" // if odd -> jump to add + "mul %w[var], %w[var], %w[rnd]\n\t" // even: var *= rnd + "b 2f\n\t" + "1:\n\t" + "add %w[var], %w[var], %w[rnd]\n\t" // odd: var += rnd + "2:\n\t" + : [var] "+r"(variable) + : [rnd] "r"(random) + : "cc"); + bm::DoNotOptimize(variable); + } +} + +BENCHMARK(branch_cost_csel)->RangeMultiplier(4)->Range(256, 32 * 1024); +BENCHMARK(branch_cost_branch)->RangeMultiplier(4)->Range(256, 32 * 1024); + +#endif + +#endif // __GNUC__ && !__clang__ + +/** + * Results are quite interesting. On Intel: + * - `branch_cost` up to 4K runs at @b 0.7ns, beyond that it jumps to @b 3.7ns. + * - `branch_cost_cmov` consistently runs at @b 1.3ns, regardless of the size. + * - `branch_cost_jump` has similar, but slightly worse performance than `branch_cost`. + */ + #pragma endregion // Branch Prediction #pragma region Cache Misses @@ -1069,28 +1192,54 @@ BENCHMARK(cache_misses_cost) * value. This optimization is crucial for performance, especially when dealing * with heavy objects. 
 */
-#include <optional> // `std::optional`
+struct heavy_t {
+    std::uint64_t data[8];
+
+    heavy_t() noexcept { std::iota(data, data + 8, 0); }
+
+    heavy_t(heavy_t &&) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); }
+    heavy_t(heavy_t const &) { std::this_thread::sleep_for(std::chrono::milliseconds(2)); }
+    heavy_t &operator=(heavy_t &&) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(1));
+        return *this;
+    }
+    heavy_t &operator=(heavy_t const &) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(2));
+        return *this;
+    }
+};
 
-std::optional<std::string> make_heavy_object_mutable() {
-    std::string x(1024, 'x');
+heavy_t make_heavy_object() { return heavy_t {}; }
+
+heavy_t make_named_heavy_object() {
+    heavy_t const x; //! Even with `const`, RVO is possible
     return x;
 }
 
-std::optional<std::string> make_heavy_object_immutable() {
-    std::string const x(1024, 'x'); //! `const` is the only difference
-    return x;
+heavy_t make_conditional_heavy_object() {
+    heavy_t x;
+    heavy_t &x1 = x;
+    heavy_t &x2 = x;
+    static std::size_t counter = 0; //! Condition prevents RVO
+    if (counter++ % 2 == 0) { return x1; }
+    else { return x2; }
+}
+
+static void rvo_trivial(bm::State &state) {
+    for (auto _ : state) bm::DoNotOptimize(make_heavy_object());
 }
 
-static void rvo_friendly(bm::State &state) {
-    for (auto _ : state) bm::DoNotOptimize(make_heavy_object_mutable());
+static void rvo_likely(bm::State &state) {
+    for (auto _ : state) bm::DoNotOptimize(make_named_heavy_object());
 }
 
-static void rvo_impossible(bm::State &state) {
-    for (auto _ : state) bm::DoNotOptimize(make_heavy_object_immutable());
+static void rvo_banned(bm::State &state) {
+    for (auto _ : state) bm::DoNotOptimize(make_conditional_heavy_object());
 }
 
-BENCHMARK(rvo_friendly);
-BENCHMARK(rvo_impossible);
+BENCHMARK(rvo_trivial);
+BENCHMARK(rvo_likely);
+BENCHMARK(rvo_banned);
 
 /**
  * Despite intuition, marking a local object as `const` hurts our performance.
From 3f8cd5f34026b2608b109b37d73aff23cb541eee Mon Sep 17 00:00:00 2001 From: TinySemVer Date: Tue, 12 Aug 2025 11:21:24 +0000 Subject: [PATCH 14/19] Release: v0.10.10 [skip ci] ### Patch - Improve: `jmp` vs `cmov` (c0a3b12) - Improve: Division via `float`s (8aa9921) --- CMakeLists.txt | 2 +- VERSION | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6274579..5bb55d4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ cmake_minimum_required(VERSION 3.25.2 FATAL_ERROR) # ------------------------------------------------------------------------------ project( less_slow - VERSION 0.10.9 + VERSION 0.10.10 LANGUAGES C CXX ASM DESCRIPTION "Learning how to write Less Slow code, from numerical micro-kernels and SIMD to coroutines, ranges, and polymorphic state machines" diff --git a/VERSION b/VERSION index 75955dd..cd47247 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.10.9 \ No newline at end of file +0.10.10 \ No newline at end of file From 89e72b303479ee5f98489c29eeb1d0b01e3c3782 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 12 Aug 2025 11:37:38 +0000 Subject: [PATCH 15/19] Make: Bump dependencies --- CMakeLists.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6274579..97bbd71 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -127,7 +127,7 @@ endif () FetchContent_Declare( GoogleTest GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG v1.15.2 + GIT_TAG v1.17.0 ) FetchContent_MakeAvailable(GoogleTest) @@ -135,7 +135,7 @@ FetchContent_MakeAvailable(GoogleTest) FetchContent_Declare( GoogleBenchmark GIT_REPOSITORY https://github.com/google/benchmark.git - GIT_TAG v1.9.1 + GIT_TAG v1.9.4 ) # Suppress building tests/docs/etc. 
for faster builds: @@ -235,7 +235,7 @@ endif () FetchContent_Declare( VictorZverovichFMT GIT_REPOSITORY https://github.com/fmtlib/fmt.git - GIT_TAG 11.1.2 + GIT_TAG 11.2.0 ) FetchContent_MakeAvailable(VictorZverovichFMT) @@ -271,7 +271,7 @@ FetchContent_MakeAvailable(MetaLibUnifEx) FetchContent_Declare( AshVardanianStringZilla GIT_REPOSITORY https://github.com/ashvardanian/stringzilla - GIT_TAG v3.12.5 + GIT_TAG v3.12.6 ) FetchContent_MakeAvailable(AshVardanianStringZilla) @@ -288,7 +288,7 @@ FetchContent_MakeAvailable(HanaDusikovaCTRE) FetchContent_Declare( GoogleAbseil GIT_REPOSITORY https://github.com/abseil/abseil-cpp.git - GIT_TAG 20240722.0 # LTS version + GIT_TAG 20250512.1 # LTS version ) FetchContent_MakeAvailable(GoogleAbseil) @@ -296,7 +296,7 @@ FetchContent_MakeAvailable(GoogleAbseil) FetchContent_Declare( NielsLohmannJSON GIT_REPOSITORY https://github.com/nlohmann/json.git - GIT_TAG v3.11.3 + GIT_TAG v3.12.0 ) FetchContent_MakeAvailable(NielsLohmannJSON) @@ -304,7 +304,7 @@ FetchContent_MakeAvailable(NielsLohmannJSON) FetchContent_Declare( YaoyuanGuoYYJSON GIT_REPOSITORY https://github.com/ibireme/yyjson.git - GIT_TAG 0.10.0 + GIT_TAG 0.11.1 ) FetchContent_MakeAvailable(YaoyuanGuoYYJSON) From 787f985c40c00818eadb4b3992c1211cdeb5edc5 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 12 Aug 2025 12:01:17 +0000 Subject: [PATCH 16/19] Improve: Parsing via `simdjson` --- .vscode/settings.json | 5 +- CMakeLists.txt | 9 ++++ less_slow.cpp | 118 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 131 insertions(+), 1 deletion(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 7b828ca..f319482 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -68,6 +68,7 @@ "Lelbach", "Lemire", "Lib", + "libopenblas", "LIBPFM", "libunifex", "liburing", @@ -273,5 +274,7 @@ "variant": "cpp", "vector": "cpp", "version": "cpp" - } + }, + "C_Cpp.errorSquiggles": "disabled", + 
"cSpell.enabled": false } \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 97bbd71..449df1e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -308,6 +308,14 @@ FetchContent_Declare( ) FetchContent_MakeAvailable(YaoyuanGuoYYJSON) +# Daniel Lemire's simdjson for SIMD-accelerated JSON parsing +FetchContent_Declare( + DanielLemireSimdJSON + GIT_REPOSITORY https://github.com/simdjson/simdjson.git + GIT_TAG v3.13.0 +) +FetchContent_MakeAvailable(DanielLemireSimdJSON) + # Chris Karloff's ASIO standalone, avoiding Boost... integration is a bit tricky: # https://github.com/cpm-cmake/CPM.cmake/blob/master/examples/asio-standalone/CMakeLists.txt FetchContent_Declare( @@ -454,6 +462,7 @@ target_link_libraries( unifex stringzilla yyjson + simdjson ctre asio # There is no `absl` shortcut: diff --git a/less_slow.cpp b/less_slow.cpp index 4126e0e..4de62c0 100644 --- a/less_slow.cpp +++ b/less_slow.cpp @@ -5220,6 +5220,124 @@ BENCHMARK(json_nlohmann) ->Name("json_nlohmann") ->Threads(physical_cores()); +/** + * simdjson is designed for high-performance JSON parsing using SIMD instructions. + * It provides On-Demand parsing which is particularly efficient for selective data extraction. 
+ */
+#include <simdjson.h>
+
+bool contains_xss_in_simdjson_ondemand(simdjson::ondemand::value element) noexcept {
+
+    // Handle objects
+    if (element.type() == simdjson::ondemand::json_type::object) {
+        simdjson::ondemand::object obj;
+        if (element.get_object().get(obj) == simdjson::SUCCESS) {
+            for (auto sub : obj) {
+                simdjson::ondemand::value val;
+                if (sub.value().get(val) == simdjson::SUCCESS)
+                    if (contains_xss_in_simdjson_ondemand(val)) return true;
+            }
+        }
+        return false;
+    }
+    // Handle arrays
+    else if (element.type() == simdjson::ondemand::json_type::array) {
+        simdjson::ondemand::array arr;
+        if (element.get_array().get(arr) == simdjson::SUCCESS) {
+            for (auto sub : arr) {
+                simdjson::ondemand::value val;
+                if (sub.get(val) == simdjson::SUCCESS)
+                    if (contains_xss_in_simdjson_ondemand(val)) return true;
+            }
+        }
+        return false;
+    }
+    // Handle strings
+    else if (element.type() == simdjson::ondemand::json_type::string) {
+        std::string_view str;
+        if (element.get_string().get(str) == simdjson::SUCCESS)
+            return str.find("<script>") != std::string_view::npos;
+    }
+    return false;
+}
+
+bool contains_xss_in_simdjson_dom(simdjson::dom::element element) noexcept {
+    if (element.is_object()) {
+        for (auto [key, val] : element.get_object())
+            if (contains_xss_in_simdjson_dom(val)) return true;
+    }
+    else if (element.is_array()) {
+        for (auto val : element.get_array())
+            if (contains_xss_in_simdjson_dom(val)) return true;
+    }
+    else if (element.is_string()) {
+        std::string_view str = element.get_string();
+        return str.find("<script>") != std::string_view::npos;
+    }
+    return false;
+}
+
+static void json_simdjson_ondemand(bm::State &state) {
+    std::size_t bytes_processed = 0;
+    std::size_t iteration = 0;
+
+    // Pre-allocate padded strings outside the hot path
+    simdjson::padded_string padded_strings[3] = {
+        simdjson::padded_string(packets_json[0]),
+        simdjson::padded_string(packets_json[1]),
+        simdjson::padded_string(packets_json[2]),
+    };
+
+    // On-demand parser reuses internal buffers
+    
simdjson::ondemand::parser parser; + simdjson::ondemand::document doc; + + for (auto _ : state) { + std::size_t const packet_index = iteration++ % 3; + bytes_processed += packets_json[packet_index].size(); + + auto error = parser.iterate(padded_strings[packet_index]).get(doc); + if (error == simdjson::SUCCESS) { + simdjson::ondemand::value root; + if (doc.get_value().get(root) == simdjson::SUCCESS) + bm::DoNotOptimize(contains_xss_in_simdjson_ondemand(root)); + } + } + + state.SetBytesProcessed(bytes_processed); +} + +static void json_simdjson_dom(bm::State &state) { + std::size_t bytes_processed = 0; + std::size_t iteration = 0; + + // Pre-allocate padded strings outside the hot path + simdjson::padded_string padded_strings[3] = { + simdjson::padded_string(packets_json[0]), + simdjson::padded_string(packets_json[1]), + simdjson::padded_string(packets_json[2]), + }; + + // Reuse the state + simdjson::dom::parser parser; + simdjson::dom::element doc; + + for (auto _ : state) { + std::size_t const packet_index = iteration++ % 3; + bytes_processed += packets_json[packet_index].size(); + + auto error = parser.parse(padded_strings[packet_index]).get(doc); + if (error == simdjson::SUCCESS) bm::DoNotOptimize(contains_xss_in_simdjson_dom(doc)); + } + + state.SetBytesProcessed(bytes_processed); +} + +BENCHMARK(json_simdjson_ondemand)->MinTime(10)->Name("json_simdjson"); +BENCHMARK(json_simdjson_dom)->MinTime(10)->Name("json_simdjson"); +BENCHMARK(json_simdjson_ondemand)->MinTime(10)->Name("json_simdjson")->Threads(physical_cores()); +BENCHMARK(json_simdjson_dom)->MinTime(10)->Name("json_simdjson")->Threads(physical_cores()); + /** * The results for the single-threaded case and the multi-threaded case without * Simultaneous Multi-Threading @b (SMT), with 96 threads on 96 Sapphire Rapids From b454daebc09b1e67adef25940ca9b25393e023de Mon Sep 17 00:00:00 2001 From: TinySemVer Date: Tue, 12 Aug 2025 12:01:30 +0000 Subject: [PATCH 17/19] Release: v0.10.11 [skip ci] ### Patch - 
Improve: Parsing via `simdjson` (787f985) - Make: Bump dependencies (89e72b3) --- CMakeLists.txt | 2 +- VERSION | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2561ca4..57d9aed 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ cmake_minimum_required(VERSION 3.25.2 FATAL_ERROR) # ------------------------------------------------------------------------------ project( less_slow - VERSION 0.10.10 + VERSION 0.10.11 LANGUAGES C CXX ASM DESCRIPTION "Learning how to write Less Slow code, from numerical micro-kernels and SIMD to coroutines, ranges, and polymorphic state machines" diff --git a/VERSION b/VERSION index cd47247..25fb08c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.10.10 \ No newline at end of file +0.10.11 \ No newline at end of file From c459c42f87df560e497d0d7fff5c7eef254f2ead Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 13 Aug 2025 21:38:51 +0000 Subject: [PATCH 18/19] Improve: Bitwise ops for branches Replacing multiplication with XOR results in a different assembly - GCC compiles a `cmov` instead of `jmp`. --- less_slow.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/less_slow.cpp b/less_slow.cpp index 4de62c0..59cb311 100644 --- a/less_slow.cpp +++ b/less_slow.cpp @@ -987,10 +987,10 @@ static void branch_cost(bm::State &state) { for (auto _ : state) { std::int32_t random = random_values[(++iteration) & (count - 1)]; bm::DoNotOptimize( // - variable = // - (random & 1) // + variable = // ! Fun fact: multiplication compiles to a jump, + (random & 1) // ! but replacing with a bitwise operation results in a conditional move. ? 
(variable + random) - : (variable * random)); + : (variable ^ random)); } } @@ -1017,7 +1017,7 @@ static void branch_cost_cmov(bm::State &state) { asm volatile( // "leal (%[var],%[rnd],1), %[sum]\n\t" // sum := variable + random - "imull %[rnd], %[var]\n\t" // var := variable * random + "xorl %[rnd], %[var]\n\t" // var := variable ^ random "testl $1, %[rnd]\n\t" // if (random & 1) var := sum "cmovne %[sum], %[var]\n\t" : [var] "+r"(variable), [sum] "=&r"(sum) @@ -1039,8 +1039,8 @@ static void branch_cost_jump(bm::State &state) { asm volatile( // "testl $1, %[rnd]\n\t" - "jnz 1f\n\t" // if odd -> jump to add - "imull %[rnd], %[var]\n\t" // even: var *= rnd + "jnz 1f\n\t" // if odd -> jump to add + "xorl %[rnd], %[var]\n\t" // even: var ^= rnd "jmp 2f\n\t" "1:\n\t" "addl %[rnd], %[var]\n\t" // odd: var += rnd @@ -1070,7 +1070,7 @@ static void branch_cost_csel(bm::State &state) { asm volatile( // "add %w[sum], %w[var], %w[rnd]\n\t" // sum := variable + random - "mul %w[var], %w[var], %w[rnd]\n\t" // var := variable * random + "eor %w[var], %w[var], %w[rnd]\n\t" // var := variable ^ random "tst %w[rnd], #1\n\t" // if (random & 1) var := sum "csel %w[var], %w[sum], %w[var], NE\n\t" : [var] "+r"(variable), [sum] "=&r"(sum) @@ -1093,7 +1093,7 @@ static void branch_cost_branch(bm::State &state) { asm volatile( // "tst %w[rnd], #1\n\t" "b.ne 1f\n\t" // if odd -> jump to add - "mul %w[var], %w[var], %w[rnd]\n\t" // even: var *= rnd + "eor %w[var], %w[var], %w[rnd]\n\t" // even: var ^= rnd "b 2f\n\t" "1:\n\t" "add %w[var], %w[var], %w[rnd]\n\t" // odd: var += rnd From cccf5fc90e3a207715fc26f67b927ac7b620292b Mon Sep 17 00:00:00 2001 From: TinySemVer Date: Wed, 13 Aug 2025 21:39:09 +0000 Subject: [PATCH 19/19] Release: v0.10.12 [skip ci] ### Patch - Improve: Bitwise ops for branches (c459c42) --- CMakeLists.txt | 2 +- VERSION | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 57d9aed..99042bc 100644 --- a/CMakeLists.txt 
+++ b/CMakeLists.txt @@ -7,7 +7,7 @@ cmake_minimum_required(VERSION 3.25.2 FATAL_ERROR) # ------------------------------------------------------------------------------ project( less_slow - VERSION 0.10.11 + VERSION 0.10.12 LANGUAGES C CXX ASM DESCRIPTION "Learning how to write Less Slow code, from numerical micro-kernels and SIMD to coroutines, ranges, and polymorphic state machines" diff --git a/VERSION b/VERSION index 25fb08c..5111446 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.10.11 \ No newline at end of file +0.10.12 \ No newline at end of file