From a66cfe21e9b29c621fd782e780049faea42bddb5 Mon Sep 17 00:00:00 2001 From: bmanga Date: Sun, 20 Apr 2025 12:54:10 +0200 Subject: [PATCH 01/19] Fix: Aligned allocation (#42) Uses `operator new` overload with alignment. Stores the alignment for the matching `delete` call. Closes #41 --------- Co-authored-by: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> --- less_slow.cpp | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/less_slow.cpp b/less_slow.cpp index dafffb3..5845230 100644 --- a/less_slow.cpp +++ b/less_slow.cpp @@ -398,21 +398,16 @@ class aligned_array { type_ *data_ = nullptr; std::size_t size_ = 0; + std::size_t alignment_ = 0; public: -#if defined(_MSC_VER) //! MSVC doesn't support `std::aligned_alloc` yet - aligned_array(std::size_t size, std::size_t alignment = 64) : size_(size) { - data_ = static_cast(_aligned_malloc(sizeof(type_) * size_, alignment)); - if (!data_) throw std::bad_alloc(); + aligned_array(std::size_t size, std::size_t alignment = 64) : size_(size), alignment_(alignment) { + // With `std::aligned_alloc` in C++17, an exception won't be raised, which may be preferred in + // some environments. MSVC was late to adopt it, and developers would often use a combination + // of lower-level `posix_memalign` and `_aligned_malloc`/`_aligned_free` across Unix and Windows. 
+ data_ = (type_ *)::operator new(sizeof(type_) * size_, std::align_val_t(alignment_)); } - ~aligned_array() noexcept { _aligned_free(data_); } -#else - aligned_array(std::size_t size, std::size_t alignment = 64) : size_(size) { - data_ = static_cast(std::aligned_alloc(alignment, sizeof(type_) * size_)); - if (!data_) throw std::bad_alloc(); - } - ~aligned_array() noexcept { std::free(data_); } -#endif + ~aligned_array() noexcept { ::operator delete(data_, sizeof(type_) * size_, std::align_val_t(alignment_)); } aligned_array(aligned_array const &) = delete; aligned_array &operator=(aligned_array const &) = delete; From ab7bf3f22b2b4a55c412c4208d022b52d8334755 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 20 Apr 2025 11:03:56 +0000 Subject: [PATCH 02/19] Docs: Notes on `#pragma region`s --- README.md | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 93e3735..349855a 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,18 @@ -# Playing Around _Less Slow_ Coding Practices for C++, C, and Assembly Code +# Playing Around _Less Slow_ Coding Practices for C++, CUDA, and Assembly Code -> The benchmarks in this repository don’t aim to cover every topic entirely, but they help form a mindset and intuition for performance-oriented software design. -> It also provides an example of using some non-[STL](https://en.wikipedia.org/wiki/Standard_Template_Library) but de facto standard libraries in C++, importing them via CMake and compiling from source. -> For higher-level abstractions and languages, check out [`less_slow.rs`](https://github.com/ashvardanian/less_slow.rs) and [`less_slow.py`](https://github.com/ashvardanian/less_slow.py). -> I needed many of these measurements to reconsider my own coding habits, but hopefully they’re helpful to others as well. 
-> Most of the code is organized in very long, ordered, and `#pragma`-sectioned files — not necessarily the preferred form for everyone. +> The benchmarks in this repository don't aim to cover every topic entirely, but they help form a mindset and intuition for performance-oriented software design. +> It also provides an example of using some non-[STL](https://en.wikipedia.org/wiki/Standard_Template_Library) but de facto standard libraries in C++, importing them via CMake and compiling from source. +> For higher-level abstractions and languages, check out [`less_slow.rs`](https://github.com/ashvardanian/less_slow.rs) and [`less_slow.py`](https://github.com/ashvardanian/less_slow.py). +> I needed many of these measurements to reconsider my own coding habits, but hopefully they're helpful to others as well. +> Most of the code is organized in very long, ordered, and nested `#pragma` sections — not necessarily the preferred form for everyone. -Much of modern code suffers from common pitfalls — bugs, security vulnerabilities, and __performance bottlenecks__. -University curricula and coding bootcamps tend to stick to traditional coding styles and standard features, rarely exposing the more fun, unusual, and efficient design opportunities. +Much of modern code suffers from common pitfalls — bugs, security vulnerabilities, and __performance bottlenecks__. +University curricula and coding bootcamps tend to stick to traditional coding styles and standard features, rarely exposing the more fun, unusual, and potentially efficient design opportunities. +This repository explores just that. ![Less Slow C++](https://github.com/ashvardanian/ashvardanian/blob/master/repositories/less_slow.cpp.jpg?raw=true) -This repository offers practical minimalistic examples of writing efficient C and C++ code. -It leverages C++20 features and is designed primarily for GCC and Clang compilers on Linux, though it may work on other platforms. 
+The code leverages C++20 and CUDA features and is designed primarily for GCC, Clang, and NVCC compilers on Linux, though it may work on other platforms. The topics range from basic micro-kernels executing in a few nanoseconds to more complex constructs involving parallel algorithms, coroutines, and polymorphism. Some of the highlights include: @@ -40,6 +40,7 @@ Some of the highlights include: - __What are Encrypted Enclaves__ and what's the latency of Intel SGX, AMD SEV, and ARM Realm? 🔜 #31 To read, jump to the [`less_slow.cpp` source file](https://github.com/ashvardanian/less_slow.cpp/blob/main/less_slow.cpp) and read the code snippets and comments. +Keep in mind, that most modern IDEs have a navigation bar to help you view and jump between `#pragma region` sections. Follow the instructions below to run the code in your environment and compare it to the comments as you read through the source. ## Running the Benchmarks From 966d168e7a07451348a234e17acc1dbb145d7ccb Mon Sep 17 00:00:00 2001 From: TinySemVer Date: Sun, 20 Apr 2025 11:04:57 +0000 Subject: [PATCH 03/19] Release: v0.10.6 [skip ci] ### Patch - Docs: Notes on `#pragma region`s (ab7bf3f) - Fix: Aligned allocation (#42) (a66cfe2) --- CMakeLists.txt | 2 +- VERSION | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2a028c1..92f7b29 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ cmake_minimum_required(VERSION 3.25.2 FATAL_ERROR) # ------------------------------------------------------------------------------ project( less_slow - VERSION 0.10.5 + VERSION 0.10.6 LANGUAGES C CXX ASM DESCRIPTION "Learning how to write Less Slow code, from numerical micro-kernels and SIMD to coroutines, ranges, and polymorphic state machines" diff --git a/VERSION b/VERSION index 8877d79..6348c0f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.10.5 \ No newline at end of file +0.10.6 \ No newline at end of file From ecff6e35e71a77cea020391a427eb8880efc6dbe 
Mon Sep 17 00:00:00 2001 From: bmanga Date: Sun, 20 Apr 2025 21:30:21 +0200 Subject: [PATCH 04/19] Improve: Include Asm tests into macOS Arm builds (#45) --- CMakeLists.txt | 5 ++++- less_slow.cpp | 2 +- less_slow_aarch64.S | 38 +++++++++++++++++++++++--------------- 3 files changed, 28 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 92f7b29..aebb41e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -343,7 +343,10 @@ set_target_properties(less_slow PROPERTIES POSITION_INDEPENDENT_CODE ON) if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|AMD64|x64") set_source_files_properties(less_slow_amd64.S PROPERTIES LANGUAGE ASM) target_sources(less_slow PRIVATE less_slow_amd64.S) -elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64") +elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64|arm64") + if (APPLE) + set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -march=armv8.6-a+bf16") + endif() set_source_files_properties(less_slow_aarch64.S PROPERTIES LANGUAGE ASM) target_sources(less_slow PRIVATE less_slow_aarch64.S) endif () diff --git a/less_slow.cpp b/less_slow.cpp index 5845230..dab29da 100644 --- a/less_slow.cpp +++ b/less_slow.cpp @@ -1352,7 +1352,7 @@ BENCHMARK(integral_division_with_doubles); * while the internal logic remains identical. */ -#if defined(__GNUC__) && !defined(__clang__) +#if defined(__GNUC__) #if defined(__x86_64__) || defined(__i386__) [[gnu::target("arch=core2")]] diff --git a/less_slow_aarch64.S b/less_slow_aarch64.S index b5e7ba6..57e1a83 100644 --- a/less_slow_aarch64.S +++ b/less_slow_aarch64.S @@ -3,21 +3,27 @@ # Micro-kernels for building a performance-first mindset for 64-bit ARM (NEON). 
# ---------------------------------------------------------------------------- - .section .text - .global i32_add_asm_kernel +#ifdef __APPLE__ + #define SYMBOL_NAME(name) _##name // Add underscore on macOS +#else + #define SYMBOL_NAME(name) name // No underscore on GNU-based systems +#endif + + .text + .global SYMBOL_NAME(i32_add_asm_kernel) - .global tops_f64_neon_asm_kernel - .global tops_f32_neon_asm_kernel - .global tops_f16_neon_asm_kernel - .global tops_bf16_neon_asm_kernel - .global tops_i8_neon_asm_kernel - .global tops_u8_neon_asm_kernel + .global SYMBOL_NAME(tops_f64_neon_asm_kernel) + .global SYMBOL_NAME(tops_f32_neon_asm_kernel) + .global SYMBOL_NAME(tops_f16_neon_asm_kernel) + .global SYMBOL_NAME(tops_bf16_neon_asm_kernel) + .global SYMBOL_NAME(tops_i8_neon_asm_kernel) + .global SYMBOL_NAME(tops_u8_neon_asm_kernel) # ---------------------------------------------------------------------------- # Simple function that adds two 32-bit integers. # AArch64 ABI: W0 = 'a', W1 = 'b'. Return in W0. # ---------------------------------------------------------------------------- -i32_add_asm_kernel: +SYMBOL_NAME(i32_add_asm_kernel): add w0, w0, w1 ret @@ -26,7 +32,7 @@ i32_add_asm_kernel: # Each FMLA vD.2d, vN.2d, vM.2d => 2 multiplies + 2 adds = 4 FLOPs. # We'll do 10 instructions => 10 × 4 = 40 FLOPs total, returning 40 in W0. # ---------------------------------------------------------------------------- -tops_f64_neon_asm_kernel: +SYMBOL_NAME(tops_f64_neon_asm_kernel): fmla v0.2d, v1.2d, v2.2d fmla v3.2d, v4.2d, v5.2d fmla v6.2d, v7.2d, v8.2d @@ -47,7 +53,7 @@ tops_f64_neon_asm_kernel: # Let's do 10 instructions => 10 × 8 = 80 FLOPs total. # Return 80 in W0. 
# ---------------------------------------------------------------------------- -tops_f32_neon_asm_kernel: +SYMBOL_NAME(tops_f32_neon_asm_kernel): fmla v0.4s, v1.4s, v2.4s fmla v3.4s, v4.4s, v5.4s fmla v6.4s, v7.4s, v8.4s @@ -68,7 +74,7 @@ tops_f32_neon_asm_kernel: # Each FMLA vD.8h, vN.8h, vM.8h => 8 multiplies + 8 adds = 16 FLOPs. # We'll do 10 instructions => 160 FLOPs total, returning 160 in W0. # ---------------------------------------------------------------------------- -tops_f16_neon_asm_kernel: +SYMBOL_NAME(tops_f16_neon_asm_kernel): fmla v0.8h, v1.8h, v2.8h fmla v3.8h, v4.8h, v5.8h fmla v6.8h, v7.8h, v8.8h @@ -89,7 +95,7 @@ tops_f16_neon_asm_kernel: # bfmmla vD.4s, vN.8h, vM.8h => 8 multiplies + 8 adds = 16 FLOPs. # We'll do 10 instructions => 160 FLOPs total, returning 160 in W0. # ---------------------------------------------------------------------------- -tops_bf16_neon_asm_kernel: +SYMBOL_NAME(tops_bf16_neon_asm_kernel): bfmmla v0.4s, v1.8h, v2.8h bfmmla v3.4s, v4.8h, v5.8h bfmmla v6.4s, v7.8h, v8.8h @@ -110,7 +116,7 @@ tops_bf16_neon_asm_kernel: # sdot vD.4s, vN.16b, vM.16b => 16 multiplies + 16 adds = 32 FLOPs. # We'll do 10 instructions => 320 FLOPs total, returning 320 in W0. # ---------------------------------------------------------------------------- -tops_i8_neon_asm_kernel: +SYMBOL_NAME(tops_i8_neon_asm_kernel): sdot v0.4s, v1.16b, v2.16b sdot v3.4s, v4.16b, v5.16b sdot v6.4s, v7.16b, v8.16b @@ -131,7 +137,7 @@ tops_i8_neon_asm_kernel: # udot vD.4s, vN.16b, vM.16b => 16 multiplies + 16 adds = 32 FLOPs. # We'll do 10 instructions => 320 FLOPs total, returning 320 in W0. 
# ---------------------------------------------------------------------------- -tops_u8_neon_asm_kernel: +SYMBOL_NAME(tops_u8_neon_asm_kernel): udot v0.4s, v1.16b, v2.16b udot v3.4s, v4.16b, v5.16b udot v6.4s, v7.16b, v8.16b @@ -148,5 +154,7 @@ tops_u8_neon_asm_kernel: # ---------------------------------------------------------------------------- # Tell the linker/assembler that we do NOT need an executable stack: +#ifdef __linux__ .section .note.GNU-stack, "", @progbits +#endif # ---------------------------------------------------------------------------- \ No newline at end of file From 35824e648514dd6a8a8882a74c3382cf94e11c68 Mon Sep 17 00:00:00 2001 From: TinySemVer Date: Sun, 20 Apr 2025 19:30:33 +0000 Subject: [PATCH 05/19] Release: v0.10.7 [skip ci] ### Patch - Improve: Include Asm tests into macOS Arm builds (#45) (ecff6e3) --- CMakeLists.txt | 2 +- VERSION | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index aebb41e..032e913 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ cmake_minimum_required(VERSION 3.25.2 FATAL_ERROR) # ------------------------------------------------------------------------------ project( less_slow - VERSION 0.10.6 + VERSION 0.10.7 LANGUAGES C CXX ASM DESCRIPTION "Learning how to write Less Slow code, from numerical micro-kernels and SIMD to coroutines, ranges, and polymorphic state machines" diff --git a/VERSION b/VERSION index 6348c0f..a988815 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.10.6 \ No newline at end of file +0.10.7 \ No newline at end of file From 4d00abad86a2359b1c1b4512b9c98a3e45e81500 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 22 Apr 2025 12:22:51 +0100 Subject: [PATCH 06/19] Fix: Can't `std::format(time)` on macOS Co-authored-by: Armin Stepanyan <12305910+ab-10@users.noreply.github.com> --- less_slow.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git 
a/less_slow.cpp b/less_slow.cpp index dab29da..8ce7d1f 100644 --- a/less_slow.cpp +++ b/less_slow.cpp @@ -6038,7 +6038,7 @@ struct log_printf_t { // `std::chrono::high_resolution_clock` is usually just an alias to either `system_clock` or // `steady_clock`. There is debate on whether using it is a good idea at all. // https://en.cppreference.com/w/cpp/chrono/high_resolution_clock -#if defined(_MSC_VER) +#if defined(_MSC_VER) || defined(__APPLE__) auto now = std::chrono::system_clock::now(); #else auto now = std::chrono::high_resolution_clock::now(); @@ -6069,7 +6069,10 @@ struct log_printf_t { BENCHMARK(logging)->Name("log_printf")->MinTime(2); -#if !defined(_MSC_VER) +/** + * Formatting `std::chrono` with `std::format` fails on both Windows and MacOS. + */ +#if !defined(_MSC_VER) && !defined(__APPLE__) #if defined(__cpp_lib_format) #include // `std::format_to_n` From 91207235346c21682b3aaebc385727df65d6b5d2 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 22 Apr 2025 12:23:26 +0100 Subject: [PATCH 07/19] Fix: Missing `const` qualifiers in `strided_ptr` Co-authored-by: Armin Stepanyan <12305910+ab-10@users.noreply.github.com> --- less_slow.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/less_slow.cpp b/less_slow.cpp index 8ce7d1f..9c124e7 100644 --- a/less_slow.cpp +++ b/less_slow.cpp @@ -2628,8 +2628,8 @@ class strided_ptr { strided_ptr operator--(int) noexcept { strided_ptr temp = *this; --(*this); return temp; } strided_ptr &operator+=(difference_type offset) noexcept { data_ += offset * stride_; return *this; } strided_ptr &operator-=(difference_type offset) noexcept { data_ -= offset * stride_; return *this; } - strided_ptr operator+(difference_type offset) noexcept { strided_ptr temp = *this; return temp += offset; } - strided_ptr operator-(difference_type offset) noexcept { strided_ptr temp = *this; return temp -= offset; } + strided_ptr operator+(difference_type offset) const 
noexcept { strided_ptr temp = *this; return temp += offset; } + strided_ptr operator-(difference_type offset) const noexcept { strided_ptr temp = *this; return temp -= offset; } friend difference_type operator-(strided_ptr const &a, strided_ptr const &b) noexcept { assert(a.stride_ == b.stride_); return (a.data_ - b.data_) / static_cast(a.stride_); } friend bool operator==(strided_ptr const &a, strided_ptr const &b) noexcept { return a.data_ == b.data_; } friend bool operator<(strided_ptr const &a, strided_ptr const &b) noexcept { return a.data_ < b.data_; } From be4a0bedc85c49a53b4260706d9fa9e08ac100e2 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 22 Apr 2025 12:25:29 +0100 Subject: [PATCH 08/19] Docs: OpenBLAS installation on MacOS Co-authored-by: Armin Stepanjans <12305910+ab-10@users.noreply.github.com> --- README.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 349855a..c7071e2 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ Some of the highlights include: - __Scaling AI?__ Measure the gap between theoretical [ALU](https://en.wikipedia.org/wiki/Arithmetic_logic_unit) throughput and your [BLAS](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms). - __How many if conditions are too many?__ Test your CPU's branch predictor with just 10 lines of code. - __Prefer recursion to iteration?__ Measure the depth at which your algorithm will [`SEGFAULT`](https://en.wikipedia.org/wiki/Segmentation_fault). -- __Why avoid exceptions?__ Take `std::error_code` or [`std::variant`](https://en.cppreference.com/w/cpp/utility/variant)-like wrappers? +- __Why avoid exceptions?__ Take `std::error_code` or [`std::variant`](https://en.cppreference.com/w/cpp/utility/variant)-like [ADTs](https://en.wikipedia.org/wiki/Algebraic_data_type)? 
- __Scaling to many cores?__ Learn how to use [OpenMP](https://en.wikipedia.org/wiki/OpenMP), Intel's oneTBB, or your custom thread pool. - __How to handle [JSON](https://www.json.org/json-en.html) avoiding memory allocations?__ Is it easier with C++ 20 or old-school C 99 tools? - __How to properly use STL's associative containers__ with custom keys and transparent comparators? @@ -93,6 +93,18 @@ cmake -B build_release -D CMAKE_BUILD_TYPE=Release -D USE_INTEL_TBB=OFF -D USE_N cmake --build build_release --config Release ``` +To build on MacOS, pulling key dependencies from [Homebrew](https://brew.sh): + +```sh +brew install openblas +cmake -B build_release \ + -D CMAKE_BUILD_TYPE=Release \ + -D CMAKE_C_FLAGS="-I$(brew --prefix openblas)/include" \ + -D CMAKE_CXX_FLAGS="-I$(brew --prefix openblas)/include" \ + -D CMAKE_EXE_LINKER_FLAGS="-L$(brew --prefix openblas)/lib" +cmake --build build_release --config Release +``` + To control the output or run specific benchmarks, use the following flags: ```sh From 2c44f558ec4b6fdf80a56e522b51bed84b9b3146 Mon Sep 17 00:00:00 2001 From: TinySemVer Date: Tue, 22 Apr 2025 11:25:47 +0000 Subject: [PATCH 09/19] Release: v0.10.8 [skip ci] ### Patch - Docs: OpenBLAS installation on MacOS (be4a0be) - Fix: Missing `const` qualifiers in `strided_ptr` (9120723) - Fix: Can't `std::format(time)` on macOS (4d00aba) --- CMakeLists.txt | 2 +- VERSION | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 032e913..5f4ff7d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ cmake_minimum_required(VERSION 3.25.2 FATAL_ERROR) # ------------------------------------------------------------------------------ project( less_slow - VERSION 0.10.7 + VERSION 0.10.8 LANGUAGES C CXX ASM DESCRIPTION "Learning how to write Less Slow code, from numerical micro-kernels and SIMD to coroutines, ranges, and polymorphic state machines" diff --git a/VERSION b/VERSION index a988815..933f18d 100644 --- 
a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.10.7 \ No newline at end of file +0.10.8 \ No newline at end of file From a27448f5676e42eb74ac8f8a2ae334afaa9bea58 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 19 May 2025 06:37:28 +0000 Subject: [PATCH 10/19] Make: `USE_BLAS` option --- .vscode/settings.json | 1 + CMakeLists.txt | 32 ++++++++++++++++++++++++-------- README.md | 4 ++-- less_slow.cpp | 16 ++++++++++++++++ 4 files changed, 43 insertions(+), 10 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 334f1f4..7b828ca 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -64,6 +64,7 @@ "Klemens", "Kohlhoff", "Kulukundis", + "LAPACK", "Lelbach", "Lemire", "Lib", diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f4ff7d..7605651 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,11 +72,22 @@ else () set(_SHOULD_USE_INTEL_TBB OFF) endif () +# Probe for BLAS support +set(CMAKE_FIND_LIBRARY_PREFIXES ";lib") +find_package(BLAS QUIET) +if (BLAS_FOUND) + set(_SHOULD_USE_BLAS ON) +else () + set(_SHOULD_USE_BLAS OFF) +endif () + option(USE_INTEL_TBB "Use Intel TBB for parallel STL algorithms" ${_SHOULD_USE_INTEL_TBB}) option(USE_NVIDIA_CCCL "Use Nvidia CCCL for CUDA acceleration" ${_SHOULD_USE_NVIDIA_CCCL}) +option(USE_BLAS "Use BLAS for linear algebra" ${_SHOULD_USE_BLAS}) message(STATUS "USE_INTEL_TBB: ${USE_INTEL_TBB}") message(STATUS "USE_NVIDIA_CCCL: ${USE_NVIDIA_CCCL}") +message(STATUS "USE_BLAS: ${USE_BLAS}") # ------------------------------------------------------------------------------ # Dependencies @@ -102,13 +113,14 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -w") # ~~~ # # Moreover, CMake sometimes fails to find it on Windows: https://stackoverflow.com/a/78335726/2766161 -set(CMAKE_FIND_LIBRARY_PREFIXES ";lib") -find_package(BLAS REQUIRED) +if (USE_BLAS) + find_package(BLAS REQUIRED) -include(CheckFunctionExists) -check_function_exists(openblas_set_num_threads 
LESS_SLOW_HAS_OPENBLAS_SET_NUM_THREADS) -if (LESS_SLOW_HAS_OPENBLAS_SET_NUM_THREADS) - add_definitions(-DLESS_SLOW_HAS_OPENBLAS_SET_NUM_THREADS) + include(CheckFunctionExists) + check_function_exists(openblas_set_num_threads LESS_SLOW_HAS_OPENBLAS_SET_NUM_THREADS) + if (LESS_SLOW_HAS_OPENBLAS_SET_NUM_THREADS) + add_definitions(-D LESS_SLOW_HAS_OPENBLAS_SET_NUM_THREADS) + endif () endif () # GTest (required by Google Benchmark) @@ -346,7 +358,7 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|AMD64|x64") elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64|arm64") if (APPLE) set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -march=armv8.6-a+bf16") - endif() + endif () set_source_files_properties(less_slow_aarch64.S PROPERTIES LANGUAGE ASM) target_sources(less_slow PRIVATE less_slow_aarch64.S) endif () @@ -431,6 +443,7 @@ endif () # ------------------------------------------------------------------------------ target_compile_definitions(less_slow PRIVATE USE_NVIDIA_CCCL=$) target_compile_definitions(less_slow PRIVATE USE_INTEL_TBB=$) +target_compile_definitions(less_slow PRIVATE USE_BLAS=$) target_link_libraries( less_slow PRIVATE Threads::Threads @@ -448,9 +461,12 @@ target_link_libraries( absl::flat_hash_map nlohmann_json::nlohmann_json Eigen3::Eigen - ${BLAS_LIBRARIES} ) +if (USE_BLAS) + target_link_libraries(less_slow PRIVATE ${BLAS_LIBRARIES}) +endif () + if (CMAKE_SYSTEM_NAME STREQUAL "Linux") # target_include_directories(less_slow PRIVATE ${LIBURING_INCLUDE_DIRS}) target_link_libraries(less_slow PRIVATE ${LIBURING_LIBRARIES}) diff --git a/README.md b/README.md index c7071e2..59e7fa9 100644 --- a/README.md +++ b/README.md @@ -86,10 +86,10 @@ The build will pull and compile several third-party dependencies from the source - Nvidia's [CCCL](https://github.com/nvidia/cccl) for GPU-accelerated algorithms. - Nvidia's [CUTLASS](https://github.com/nvidia/cutlass) for GPU-accelerated Linear Algebra. 
-To build without Parallel STL, Intel TBB, and CUDA: +To build without Parallel STL, Intel TBB, BLAS, and CUDA: ```sh -cmake -B build_release -D CMAKE_BUILD_TYPE=Release -D USE_INTEL_TBB=OFF -D USE_NVIDIA_CCCL=OFF +cmake -B build_release -D CMAKE_BUILD_TYPE=Release -D USE_INTEL_TBB=OFF -D USE_NVIDIA_CCCL=OFF -D USE_BLAS=OFF cmake --build build_release --config Release ``` diff --git a/less_slow.cpp b/less_slow.cpp index 9c124e7..fbe5373 100644 --- a/less_slow.cpp +++ b/less_slow.cpp @@ -2904,6 +2904,20 @@ std::size_t parse_size_string(std::string const &str) { #pragma endregion // Non Uniform Memory Access #pragma region Memory Bound Linear Algebra + +#if !defined(USE_BLAS) +#if defined(__has_include) +#if __has_include() +#define USE_BLAS 1 +#else +#define USE_BLAS 0 +#endif // __has_include() +#else +#define USE_BLAS 0 +#endif // defined(__has_include) +#endif // !defined(USE_BLAS) + +#if USE_BLAS #include /** * ! OpenBLAS defines a `SIZE` macro for internal use, which conflicts with `fmt` @@ -2947,6 +2961,8 @@ static void cblas_tops(bm::State &state) { BENCHMARK(cblas_tops)->RangeMultiplier(2)->Range(8, 16384)->Complexity(benchmark::oNCubed); BENCHMARK(cblas_tops)->RangeMultiplier(2)->Range(8, 16384)->Complexity(benchmark::oNCubed); +#endif // USE_BLAS + /** * Eigen is a high-level C++ library for linear algebra that provides a * convenient templated API for matrix operations. 
From b21507f7143f8175b92d0b2b2d827b3bd4bb081b Mon Sep 17 00:00:00 2001 From: TinySemVer Date: Mon, 19 May 2025 06:37:43 +0000 Subject: [PATCH 11/19] Release: v0.10.9 [skip ci] ### Patch - Make: `USE_BLAS` option (a27448f) --- CMakeLists.txt | 2 +- VERSION | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7605651..6274579 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ cmake_minimum_required(VERSION 3.25.2 FATAL_ERROR) # ------------------------------------------------------------------------------ project( less_slow - VERSION 0.10.8 + VERSION 0.10.9 LANGUAGES C CXX ASM DESCRIPTION "Learning how to write Less Slow code, from numerical micro-kernels and SIMD to coroutines, ranges, and polymorphic state machines" diff --git a/VERSION b/VERSION index 933f18d..75955dd 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.10.8 \ No newline at end of file +0.10.9 \ No newline at end of file From 8aa99216d9a6844b3c5efe5669cc477d1c598af9 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 12 Aug 2025 11:16:34 +0000 Subject: [PATCH 12/19] Improve: Division via `float`s --- less_slow.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/less_slow.cpp b/less_slow.cpp index fbe5373..537562d 100644 --- a/less_slow.cpp +++ b/less_slow.cpp @@ -1329,13 +1329,23 @@ BENCHMARK(integral_division_by_const); * Since 64-bit doubles can exactly represent all 32-bit signed integers, * this method introduces @b no precision loss, making it a safe and efficient * alternative when division performance is critical. + * + * - The `float` can fit 24-bit integers exactly in its significand/mantissa. + * - The `double` can fit 52-bit integers exactly in its significand/mantissa. 
*/ +static void integral_division_with_floats(bm::State &state) { + std::int32_t a = std::rand(), b = std::rand(), c = 0; + for (auto _ : state) + bm::DoNotOptimize(c = static_cast(static_cast(++a) / static_cast(++b))); +} + static void integral_division_with_doubles(bm::State &state) { std::int32_t a = std::rand(), b = std::rand(), c = 0; for (auto _ : state) bm::DoNotOptimize(c = static_cast(static_cast(++a) / static_cast(++b))); } +BENCHMARK(integral_division_with_floats); BENCHMARK(integral_division_with_doubles); /** From c0a3b121c08b24c64b09af9f2b32f6924ef5f62e Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 12 Aug 2025 11:20:52 +0000 Subject: [PATCH 13/19] Improve: `jmp` vs `cmov` --- less_slow.cpp | 173 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 161 insertions(+), 12 deletions(-) diff --git a/less_slow.cpp b/less_slow.cpp index 537562d..4126e0e 100644 --- a/less_slow.cpp +++ b/less_slow.cpp @@ -996,6 +996,129 @@ static void branch_cost(bm::State &state) { BENCHMARK(branch_cost)->RangeMultiplier(4)->Range(256, 32 * 1024); +/** + * It's hard to reason if the above code should compile into a conditional move or a jump, + * so let's define explicit inline assembly kernels and compare both. + */ +#if defined(__GNUC__) && !defined(__clang__) //! 
GCC/Clang inline asm note in your code, keep MSVC out + +#if defined(__x86_64__) || defined(__i386__) + +static void branch_cost_cmov(bm::State &state) { + auto const count = static_cast(state.range(0)); + aligned_array random_values(count); + std::generate_n(random_values.begin(), count, &std::rand); + std::int32_t variable = 0; + std::size_t iteration = 0; + + for (auto _ : state) { + std::int32_t const random = random_values[(++iteration) & (count - 1)]; + std::int32_t sum; // early-clobber temp for LEA result + + asm volatile( // + "leal (%[var],%[rnd],1), %[sum]\n\t" // sum := variable + random + "imull %[rnd], %[var]\n\t" // var := variable * random + "testl $1, %[rnd]\n\t" // if (random & 1) var := sum + "cmovne %[sum], %[var]\n\t" + : [var] "+r"(variable), [sum] "=&r"(sum) + : [rnd] "r"(random) + : "cc"); + bm::DoNotOptimize(variable); + } +} + +static void branch_cost_jump(bm::State &state) { + auto const count = static_cast(state.range(0)); + aligned_array random_values(count); + std::generate_n(random_values.begin(), count, &std::rand); + std::int32_t variable = 0; + std::size_t iteration = 0; + + for (auto _ : state) { + std::int32_t const random = random_values[(++iteration) & (count - 1)]; + + asm volatile( // + "testl $1, %[rnd]\n\t" + "jnz 1f\n\t" // if odd -> jump to add + "imull %[rnd], %[var]\n\t" // even: var *= rnd + "jmp 2f\n\t" + "1:\n\t" + "addl %[rnd], %[var]\n\t" // odd: var += rnd + "2:\n\t" + : [var] "+r"(variable) + : [rnd] "r"(random) + : "cc"); + bm::DoNotOptimize(variable); + } +} + +BENCHMARK(branch_cost_cmov)->RangeMultiplier(4)->Range(256, 32 * 1024); +BENCHMARK(branch_cost_jump)->RangeMultiplier(4)->Range(256, 32 * 1024); + +#elif defined(__aarch64__) + +static void branch_cost_csel(bm::State &state) { + auto const count = static_cast(state.range(0)); + aligned_array random_values(count); + std::generate_n(random_values.begin(), count, &std::rand); + std::int32_t variable = 0; + std::size_t iteration = 0; + + for (auto _ : state) 
{ + std::int32_t const random = random_values[(++iteration) & (count - 1)]; + std::int32_t sum; + + asm volatile( // + "add %w[sum], %w[var], %w[rnd]\n\t" // sum := variable + random + "mul %w[var], %w[var], %w[rnd]\n\t" // var := variable * random + "tst %w[rnd], #1\n\t" // if (random & 1) var := sum + "csel %w[var], %w[sum], %w[var], NE\n\t" + : [var] "+r"(variable), [sum] "=&r"(sum) + : [rnd] "r"(random) + : "cc"); + bm::DoNotOptimize(variable); + } +} + +static void branch_cost_branch(bm::State &state) { + auto const count = static_cast(state.range(0)); + aligned_array random_values(count); + std::generate_n(random_values.begin(), count, &std::rand); + std::int32_t variable = 0; + std::size_t iteration = 0; + + for (auto _ : state) { + std::int32_t const random = random_values[(++iteration) & (count - 1)]; + + asm volatile( // + "tst %w[rnd], #1\n\t" + "b.ne 1f\n\t" // if odd -> jump to add + "mul %w[var], %w[var], %w[rnd]\n\t" // even: var *= rnd + "b 2f\n\t" + "1:\n\t" + "add %w[var], %w[var], %w[rnd]\n\t" // odd: var += rnd + "2:\n\t" + : [var] "+r"(variable) + : [rnd] "r"(random) + : "cc"); + bm::DoNotOptimize(variable); + } +} + +BENCHMARK(branch_cost_csel)->RangeMultiplier(4)->Range(256, 32 * 1024); +BENCHMARK(branch_cost_branch)->RangeMultiplier(4)->Range(256, 32 * 1024); + +#endif + +#endif // __GNUC__ && !__clang__ + +/** + * Results are quite interesting. On Intel: + * - `branch_cost` up to 4K runs at @b 0.7ns, beyond that it jumps to @b 3.7ns. + * - `branch_cost_cmov` consistently runs at @b 1.3ns, regardless of the size. + * - `branch_cost_jump` has similar, but slightly worse performance than `branch_cost`. + */ + #pragma endregion // Branch Prediction #pragma region Cache Misses @@ -1069,28 +1192,54 @@ BENCHMARK(cache_misses_cost) * value. This optimization is crucial for performance, especially when dealing * with heavy objects. 
 */
-#include <optional> // `std::optional`
+struct heavy_t {
+    std::uint64_t data[8];
+
+    heavy_t() noexcept { std::iota(data, data + 8, 0); }
+
+    heavy_t(heavy_t &&) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); }
+    heavy_t(heavy_t const &) { std::this_thread::sleep_for(std::chrono::milliseconds(2)); }
+    heavy_t &operator=(heavy_t &&) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(1));
+        return *this;
+    }
+    heavy_t &operator=(heavy_t const &) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(2));
+        return *this;
+    }
+};
 
-std::optional<std::string> make_heavy_object_mutable() {
-    std::string x(1024, 'x');
+heavy_t make_heavy_object() { return heavy_t {}; }
+
+heavy_t make_named_heavy_object() {
+    heavy_t const x; //! Even with `const`, RVO is possible
     return x;
 }
 
-std::optional<std::string> make_heavy_object_immutable() {
-    std::string const x(1024, 'x'); //! `const` is the only difference
-    return x;
+heavy_t make_conditional_heavy_object() {
+    heavy_t x;
+    heavy_t &x1 = x;
+    heavy_t &x2 = x;
+    static std::size_t counter = 0; //! Condition prevents RVO
+    if (counter++ % 2 == 0) { return x1; }
+    else { return x2; }
+}
+
+static void rvo_trivial(bm::State &state) {
+    for (auto _ : state) bm::DoNotOptimize(make_heavy_object());
 }
 
-static void rvo_friendly(bm::State &state) {
-    for (auto _ : state) bm::DoNotOptimize(make_heavy_object_mutable());
+static void rvo_likely(bm::State &state) {
+    for (auto _ : state) bm::DoNotOptimize(make_named_heavy_object());
 }
 
-static void rvo_impossible(bm::State &state) {
-    for (auto _ : state) bm::DoNotOptimize(make_heavy_object_immutable());
+static void rvo_banned(bm::State &state) {
+    for (auto _ : state) bm::DoNotOptimize(make_conditional_heavy_object());
 }
 
-BENCHMARK(rvo_friendly);
-BENCHMARK(rvo_impossible);
+BENCHMARK(rvo_trivial);
+BENCHMARK(rvo_likely);
+BENCHMARK(rvo_banned);
 
 /**
  * Despite intuition, marking a local object as `const` hurts our performance.
From 3f8cd5f34026b2608b109b37d73aff23cb541eee Mon Sep 17 00:00:00 2001 From: TinySemVer Date: Tue, 12 Aug 2025 11:21:24 +0000 Subject: [PATCH 14/19] Release: v0.10.10 [skip ci] ### Patch - Improve: `jmp` vs `cmov` (c0a3b12) - Improve: Division via `float`s (8aa9921) --- CMakeLists.txt | 2 +- VERSION | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6274579..5bb55d4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ cmake_minimum_required(VERSION 3.25.2 FATAL_ERROR) # ------------------------------------------------------------------------------ project( less_slow - VERSION 0.10.9 + VERSION 0.10.10 LANGUAGES C CXX ASM DESCRIPTION "Learning how to write Less Slow code, from numerical micro-kernels and SIMD to coroutines, ranges, and polymorphic state machines" diff --git a/VERSION b/VERSION index 75955dd..cd47247 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.10.9 \ No newline at end of file +0.10.10 \ No newline at end of file From 89e72b303479ee5f98489c29eeb1d0b01e3c3782 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 12 Aug 2025 11:37:38 +0000 Subject: [PATCH 15/19] Make: Bump dependencies --- CMakeLists.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6274579..97bbd71 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -127,7 +127,7 @@ endif () FetchContent_Declare( GoogleTest GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG v1.15.2 + GIT_TAG v1.17.0 ) FetchContent_MakeAvailable(GoogleTest) @@ -135,7 +135,7 @@ FetchContent_MakeAvailable(GoogleTest) FetchContent_Declare( GoogleBenchmark GIT_REPOSITORY https://github.com/google/benchmark.git - GIT_TAG v1.9.1 + GIT_TAG v1.9.4 ) # Suppress building tests/docs/etc. 
for faster builds: @@ -235,7 +235,7 @@ endif () FetchContent_Declare( VictorZverovichFMT GIT_REPOSITORY https://github.com/fmtlib/fmt.git - GIT_TAG 11.1.2 + GIT_TAG 11.2.0 ) FetchContent_MakeAvailable(VictorZverovichFMT) @@ -271,7 +271,7 @@ FetchContent_MakeAvailable(MetaLibUnifEx) FetchContent_Declare( AshVardanianStringZilla GIT_REPOSITORY https://github.com/ashvardanian/stringzilla - GIT_TAG v3.12.5 + GIT_TAG v3.12.6 ) FetchContent_MakeAvailable(AshVardanianStringZilla) @@ -288,7 +288,7 @@ FetchContent_MakeAvailable(HanaDusikovaCTRE) FetchContent_Declare( GoogleAbseil GIT_REPOSITORY https://github.com/abseil/abseil-cpp.git - GIT_TAG 20240722.0 # LTS version + GIT_TAG 20250512.1 # LTS version ) FetchContent_MakeAvailable(GoogleAbseil) @@ -296,7 +296,7 @@ FetchContent_MakeAvailable(GoogleAbseil) FetchContent_Declare( NielsLohmannJSON GIT_REPOSITORY https://github.com/nlohmann/json.git - GIT_TAG v3.11.3 + GIT_TAG v3.12.0 ) FetchContent_MakeAvailable(NielsLohmannJSON) @@ -304,7 +304,7 @@ FetchContent_MakeAvailable(NielsLohmannJSON) FetchContent_Declare( YaoyuanGuoYYJSON GIT_REPOSITORY https://github.com/ibireme/yyjson.git - GIT_TAG 0.10.0 + GIT_TAG 0.11.1 ) FetchContent_MakeAvailable(YaoyuanGuoYYJSON) From 787f985c40c00818eadb4b3992c1211cdeb5edc5 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 12 Aug 2025 12:01:17 +0000 Subject: [PATCH 16/19] Improve: Parsing via `simdjson` --- .vscode/settings.json | 5 +- CMakeLists.txt | 9 ++++ less_slow.cpp | 118 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 131 insertions(+), 1 deletion(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 7b828ca..f319482 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -68,6 +68,7 @@ "Lelbach", "Lemire", "Lib", + "libopenblas", "LIBPFM", "libunifex", "liburing", @@ -273,5 +274,7 @@ "variant": "cpp", "vector": "cpp", "version": "cpp" - } + }, + "C_Cpp.errorSquiggles": "disabled", + 
"cSpell.enabled": false } \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 97bbd71..449df1e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -308,6 +308,14 @@ FetchContent_Declare( ) FetchContent_MakeAvailable(YaoyuanGuoYYJSON) +# Daniel Lemire's simdjson for SIMD-accelerated JSON parsing +FetchContent_Declare( + DanielLemireSimdJSON + GIT_REPOSITORY https://github.com/simdjson/simdjson.git + GIT_TAG v3.13.0 +) +FetchContent_MakeAvailable(DanielLemireSimdJSON) + # Chris Karloff's ASIO standalone, avoiding Boost... integration is a bit tricky: # https://github.com/cpm-cmake/CPM.cmake/blob/master/examples/asio-standalone/CMakeLists.txt FetchContent_Declare( @@ -454,6 +462,7 @@ target_link_libraries( unifex stringzilla yyjson + simdjson ctre asio # There is no `absl` shortcut: diff --git a/less_slow.cpp b/less_slow.cpp index 4126e0e..4de62c0 100644 --- a/less_slow.cpp +++ b/less_slow.cpp @@ -5220,6 +5220,124 @@ BENCHMARK(json_nlohmann) ->Name("json_nlohmann") ->Threads(physical_cores()); +/** + * simdjson is designed for high-performance JSON parsing using SIMD instructions. + * It provides On-Demand parsing which is particularly efficient for selective data extraction. 
+ */
+#include <simdjson.h>
+
+bool contains_xss_in_simdjson_ondemand(simdjson::ondemand::value element) noexcept {
+
+    // Handle objects
+    if (element.type() == simdjson::ondemand::json_type::object) {
+        simdjson::ondemand::object obj;
+        if (element.get_object().get(obj) == simdjson::SUCCESS) {
+            for (auto sub : obj) {
+                simdjson::ondemand::value val;
+                if (sub.value().get(val) == simdjson::SUCCESS)
+                    if (contains_xss_in_simdjson_ondemand(val)) return true;
+            }
+        }
+        return false;
+    }
+    // Handle arrays
+    else if (element.type() == simdjson::ondemand::json_type::array) {
+        simdjson::ondemand::array arr;
+        if (element.get_array().get(arr) == simdjson::SUCCESS) {
+            for (auto sub : arr) {
+                simdjson::ondemand::value val;
+                if (sub.get(val) == simdjson::SUCCESS)
+                    if (contains_xss_in_simdjson_ondemand(val)) return true;
+            }
+        }
+        return false;
+    }
+    // Handle strings
+    else if (element.type() == simdjson::ondemand::json_type::string) {
+        std::string_view str;
+        if (element.get_string().get(str) == simdjson::SUCCESS)
+            return str.find("<script>") != std::string_view::npos;
+    }
+    return false;
+}
+
+bool contains_xss_in_simdjson_dom(simdjson::dom::element element) noexcept {
+    if (element.is_object()) {
+        for (auto [key, val] : element.get_object())
+            if (contains_xss_in_simdjson_dom(val)) return true;
+    }
+    else if (element.is_array()) {
+        for (auto val : element.get_array())
+            if (contains_xss_in_simdjson_dom(val)) return true;
+    }
+    else if (element.is_string()) {
+        std::string_view str = element.get_string();
+        return str.find("<script>") != std::string_view::npos;
+    }
+    return false;
+}
+
+static void json_simdjson_ondemand(bm::State &state) {
+    std::size_t bytes_processed = 0;
+    std::size_t iteration = 0;
+
+    // Pre-allocate padded strings outside the hot path
+    simdjson::padded_string padded_strings[3] = {
+        simdjson::padded_string(packets_json[0]),
+        simdjson::padded_string(packets_json[1]),
+        simdjson::padded_string(packets_json[2]),
+    };
+
+    // On-demand parser reuses internal buffers
+    
simdjson::ondemand::parser parser; + simdjson::ondemand::document doc; + + for (auto _ : state) { + std::size_t const packet_index = iteration++ % 3; + bytes_processed += packets_json[packet_index].size(); + + auto error = parser.iterate(padded_strings[packet_index]).get(doc); + if (error == simdjson::SUCCESS) { + simdjson::ondemand::value root; + if (doc.get_value().get(root) == simdjson::SUCCESS) + bm::DoNotOptimize(contains_xss_in_simdjson_ondemand(root)); + } + } + + state.SetBytesProcessed(bytes_processed); +} + +static void json_simdjson_dom(bm::State &state) { + std::size_t bytes_processed = 0; + std::size_t iteration = 0; + + // Pre-allocate padded strings outside the hot path + simdjson::padded_string padded_strings[3] = { + simdjson::padded_string(packets_json[0]), + simdjson::padded_string(packets_json[1]), + simdjson::padded_string(packets_json[2]), + }; + + // Reuse the state + simdjson::dom::parser parser; + simdjson::dom::element doc; + + for (auto _ : state) { + std::size_t const packet_index = iteration++ % 3; + bytes_processed += packets_json[packet_index].size(); + + auto error = parser.parse(padded_strings[packet_index]).get(doc); + if (error == simdjson::SUCCESS) bm::DoNotOptimize(contains_xss_in_simdjson_dom(doc)); + } + + state.SetBytesProcessed(bytes_processed); +} + +BENCHMARK(json_simdjson_ondemand)->MinTime(10)->Name("json_simdjson"); +BENCHMARK(json_simdjson_dom)->MinTime(10)->Name("json_simdjson"); +BENCHMARK(json_simdjson_ondemand)->MinTime(10)->Name("json_simdjson")->Threads(physical_cores()); +BENCHMARK(json_simdjson_dom)->MinTime(10)->Name("json_simdjson")->Threads(physical_cores()); + /** * The results for the single-threaded case and the multi-threaded case without * Simultaneous Multi-Threading @b (SMT), with 96 threads on 96 Sapphire Rapids From b454daebc09b1e67adef25940ca9b25393e023de Mon Sep 17 00:00:00 2001 From: TinySemVer Date: Tue, 12 Aug 2025 12:01:30 +0000 Subject: [PATCH 17/19] Release: v0.10.11 [skip ci] ### Patch - 
Improve: Parsing via `simdjson` (787f985) - Make: Bump dependencies (89e72b3) --- CMakeLists.txt | 2 +- VERSION | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2561ca4..57d9aed 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ cmake_minimum_required(VERSION 3.25.2 FATAL_ERROR) # ------------------------------------------------------------------------------ project( less_slow - VERSION 0.10.10 + VERSION 0.10.11 LANGUAGES C CXX ASM DESCRIPTION "Learning how to write Less Slow code, from numerical micro-kernels and SIMD to coroutines, ranges, and polymorphic state machines" diff --git a/VERSION b/VERSION index cd47247..25fb08c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.10.10 \ No newline at end of file +0.10.11 \ No newline at end of file From c459c42f87df560e497d0d7fff5c7eef254f2ead Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 13 Aug 2025 21:38:51 +0000 Subject: [PATCH 18/19] Improve: Bitwise ops for branches Replacing multiplication with XOR results in a different assembly - GCC compiles a `cmov` instead of `jmp`. --- less_slow.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/less_slow.cpp b/less_slow.cpp index 4de62c0..59cb311 100644 --- a/less_slow.cpp +++ b/less_slow.cpp @@ -987,10 +987,10 @@ static void branch_cost(bm::State &state) { for (auto _ : state) { std::int32_t random = random_values[(++iteration) & (count - 1)]; bm::DoNotOptimize( // - variable = // - (random & 1) // + variable = // ! Fun fact: multiplication compiles to a jump, + (random & 1) // ! but replacing with a bitwise operation results in a conditional move. ? 
(variable + random) - : (variable * random)); + : (variable ^ random)); } } @@ -1017,7 +1017,7 @@ static void branch_cost_cmov(bm::State &state) { asm volatile( // "leal (%[var],%[rnd],1), %[sum]\n\t" // sum := variable + random - "imull %[rnd], %[var]\n\t" // var := variable * random + "xorl %[rnd], %[var]\n\t" // var := variable ^ random "testl $1, %[rnd]\n\t" // if (random & 1) var := sum "cmovne %[sum], %[var]\n\t" : [var] "+r"(variable), [sum] "=&r"(sum) @@ -1039,8 +1039,8 @@ static void branch_cost_jump(bm::State &state) { asm volatile( // "testl $1, %[rnd]\n\t" - "jnz 1f\n\t" // if odd -> jump to add - "imull %[rnd], %[var]\n\t" // even: var *= rnd + "jnz 1f\n\t" // if odd -> jump to add + "xorl %[rnd], %[var]\n\t" // even: var ^= rnd "jmp 2f\n\t" "1:\n\t" "addl %[rnd], %[var]\n\t" // odd: var += rnd @@ -1070,7 +1070,7 @@ static void branch_cost_csel(bm::State &state) { asm volatile( // "add %w[sum], %w[var], %w[rnd]\n\t" // sum := variable + random - "mul %w[var], %w[var], %w[rnd]\n\t" // var := variable * random + "eor %w[var], %w[var], %w[rnd]\n\t" // var := variable ^ random "tst %w[rnd], #1\n\t" // if (random & 1) var := sum "csel %w[var], %w[sum], %w[var], NE\n\t" : [var] "+r"(variable), [sum] "=&r"(sum) @@ -1093,7 +1093,7 @@ static void branch_cost_branch(bm::State &state) { asm volatile( // "tst %w[rnd], #1\n\t" "b.ne 1f\n\t" // if odd -> jump to add - "mul %w[var], %w[var], %w[rnd]\n\t" // even: var *= rnd + "eor %w[var], %w[var], %w[rnd]\n\t" // even: var ^= rnd "b 2f\n\t" "1:\n\t" "add %w[var], %w[var], %w[rnd]\n\t" // odd: var += rnd From cccf5fc90e3a207715fc26f67b927ac7b620292b Mon Sep 17 00:00:00 2001 From: TinySemVer Date: Wed, 13 Aug 2025 21:39:09 +0000 Subject: [PATCH 19/19] Release: v0.10.12 [skip ci] ### Patch - Improve: Bitwise ops for branches (c459c42) --- CMakeLists.txt | 2 +- VERSION | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 57d9aed..99042bc 100644 --- a/CMakeLists.txt 
+++ b/CMakeLists.txt @@ -7,7 +7,7 @@ cmake_minimum_required(VERSION 3.25.2 FATAL_ERROR) # ------------------------------------------------------------------------------ project( less_slow - VERSION 0.10.11 + VERSION 0.10.12 LANGUAGES C CXX ASM DESCRIPTION "Learning how to write Less Slow code, from numerical micro-kernels and SIMD to coroutines, ranges, and polymorphic state machines" diff --git a/VERSION b/VERSION index 25fb08c..5111446 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.10.11 \ No newline at end of file +0.10.12 \ No newline at end of file