From 8d69f05ddf190fd8ea6e2d896772f5aac607f6b8 Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Thu, 28 Nov 2024 10:59:01 +0100
Subject: [PATCH 01/23] MAINT: Refactor buffered loop to do an initial setup

This reorganizes the loop to do a larger initial setup rather than
figuring things out on every iteration.  That setup often shrinks the
buffersize to fit the actual iteration size better.  We try to do that
smartly, although the logic is different.
It also means that "reduce" mode is not necessarily used (although it is
likely that it was often effectively unused before as well).

This is not generally faster, but in some cases it will not use
buffering unnecessarily, and it should be much better at buffer re-use
(at least with some smaller follow-up fixes).  In those cases things
will be much faster.

(As of writing, the setup time seems slightly slower.  That is probably
not unexpected, since finding the best size is extra work that only
amortizes when there are many iterations.)
---
 numpy/_core/src/multiarray/nditer_api.c    | 1039 +++++---------
 numpy/_core/src/multiarray/nditer_constr.c |  366 ++++++-
 numpy/_core/src/multiarray/nditer_impl.h   |   33 +-
 3 files changed, 635 insertions(+), 803 deletions(-)

diff --git a/numpy/_core/src/multiarray/nditer_api.c b/numpy/_core/src/multiarray/nditer_api.c
index 552a59a0092e..751130fedc91 100644
--- a/numpy/_core/src/multiarray/nditer_api.c
+++ b/numpy/_core/src/multiarray/nditer_api.c
@@ -17,13 +17,7 @@
 #include "nditer_impl.h"
 #include "templ_common.h"
 #include "ctors.h"
-#include "refcount.h"

-/* Internal helper functions private to this file */
-static npy_intp
-npyiter_checkreducesize(NpyIter *iter, npy_intp count,
-                        npy_intp *reduce_innersize,
-                        npy_intp *reduce_outerdim);

 /*NUMPY_API
  * Removes an axis from iteration. This requires that NPY_ITER_MULTI_INDEX
@@ -814,6 +808,7 @@ NpyIter_IsFirstVisit(NpyIter *iter, int iop)
      * We only need to check the outer level of this two-level loop,
      * because of the requirement that EXTERNAL_LOOP be enabled.
      */
+    // TODO: Do I need to change this? Chance is no, so long REDUCE_OUTERSTRIDES still makes sense!
     if (itflags&NPY_ITFLAG_BUFFER) {
         NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
         /* The outer reduce loop */
@@ -828,6 +823,9 @@ NpyIter_IsFirstVisit(NpyIter *iter, int iop)

 /*NUMPY_API
  * Whether the iteration could be done with no buffering.
+ *
+ * Note that the iterator may use buffering to increase the inner loop size
+ * even when buffering is not required.
*/ NPY_NO_EXPORT npy_bool NpyIter_RequiresBuffering(NpyIter *iter) @@ -1588,10 +1586,12 @@ NpyIter_DebugPrint(NpyIter *iter) if (itflags&NPY_ITFLAG_REDUCE) { printf("| REDUCE Pos: %d\n", (int)NBF_REDUCE_POS(bufferdata)); - printf("| REDUCE OuterSize: %d\n", + printf("| BUFFER CoreSize: %d\n", + (int)NBF_CORESIZE(bufferdata)); + printf("| BUFFER Reduce outersize: %d\n", (int)NBF_REDUCE_OUTERSIZE(bufferdata)); - printf("| REDUCE OuterDim: %d\n", - (int)NBF_REDUCE_OUTERDIM(bufferdata)); + printf("| BUFFER OuterDim: %d\n", + (int)NBF_OUTERDIM(bufferdata)); } printf("| Strides: "); for (iop = 0; iop < nop; ++iop) @@ -1884,8 +1884,91 @@ npyiter_goto_iterindex(NpyIter *iter, npy_intp iterindex) NIT_ADVANCE_AXISDATA(axisdata, -1); } } + if (itflags&NPY_ITFLAG_BUFFER) { + /* Find the remainder if chunking to the buffers coresize */ + npy_intp fact = iterindex / NIT_BUFFERDATA(iter)->coresize; + npy_intp offset = iterindex - fact * NIT_BUFFERDATA(iter)->coresize; + NIT_BUFFERDATA(iter)->coreoffset = offset; + } } + +/* + * This helper fills the bufferdata copy information for an operand. It + * is very specific to copy from and to buffers. + */ +static inline void +npyiter_fill_buffercopy_params( + int nop, int iop, int ndim, npy_uint32 opitflags, npy_intp transfersize, + NpyIter_BufferData *bufferdata, + NpyIter_AxisData *axisdata, + NpyIter_AxisData *outer_axisdata, + int *ndim_transfer, + npy_intp *op_transfersize, + npy_intp *buf_stride, + npy_intp *op_strides[], npy_intp *op_shape[], npy_intp *op_coords[]) +{ + /* + * Set up if we had to do the full generic copy. + * NOTE: Except the transfersize itself everything here is fixed + * and we could create it once early on. + */ + static npy_intp zero = 0; // TODO: better way? + *ndim_transfer = ndim; + *op_transfersize = transfersize; + + if ((opitflags & NPY_OP_ITFLAG_REDUCE) && (NAD_STRIDES(outer_axisdata)[iop] != 0)) { + /* + * Reduce with inner stride ==0 (outer !=0), we buffer the outer stride + * which also means buffering only outersize items. + */ + assert(NAD_STRIDES(axisdata)[iop] == 0); + *ndim_transfer = 1; + *op_transfersize = NBF_REDUCE_OUTERSIZE(bufferdata); + *buf_stride = NBF_REDUCE_OUTERSTRIDES(bufferdata)[iop]; + + *op_shape = op_transfersize; + *op_coords = &zero; + *op_strides = &NAD_STRIDES(outer_axisdata)[iop]; + return; + } + + /* + * The copy is now a typical copy into a contiguous buffer. + * If it is a reduce, we only copy the inner part (i.e. less). + * The buffer strides are now always contiguous. + */ + *buf_stride = NBF_STRIDES(bufferdata)[iop]; + + if (opitflags & NPY_OP_ITFLAG_REDUCE) { + /* Outer dim is reduced, so omit it from copying */ + ndim_transfer -= 1; + if (*op_transfersize > bufferdata->coresize) { + *op_transfersize = bufferdata->coresize; + } + /* copy setup is identical to non-reduced now. */ + } + + if (opitflags & NPY_OP_ITFLAG_SINGLESTRIDE) { + /* Flatten the copy into a single stride. */ + *ndim_transfer = 1; + *op_shape = op_transfersize; + *op_coords = &zero; + *op_strides = &NAD_STRIDES(axisdata)[iop]; + if ((*op_strides)[0] == 0) { + *op_transfersize = 1; + *buf_stride = 0; + } + } + else { + /* We do a full multi-dimensional copy */ + *op_shape = &NAD_SHAPE(axisdata); + *op_coords = &NAD_INDEX(axisdata); + *op_strides = &NAD_STRIDES(axisdata)[iop]; + } +} + + /* * This gets called after the buffers have been exhausted, and * their data needs to be written back to the arrays. 
The multi-index @@ -1901,21 +1984,17 @@ npyiter_copy_from_buffers(NpyIter *iter) npyiter_opitflags *op_itflags = NIT_OPITFLAGS(iter); NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter); - NpyIter_AxisData *axisdata = NIT_AXISDATA(iter), - *reduce_outeraxisdata = NULL; + NpyIter_AxisData *axisdata = NIT_AXISDATA(iter); + NpyIter_AxisData *outer_axisdata = NULL; PyArray_Descr **dtypes = NIT_DTYPES(iter); - npy_intp transfersize = NBF_SIZE(bufferdata); - npy_intp *strides = NBF_STRIDES(bufferdata), - *ad_strides = NAD_STRIDES(axisdata); - npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); + npy_intp *strides = NBF_STRIDES(bufferdata); + npy_intp transfersize = bufferdata->size; + char **ad_ptrs = NAD_PTRS(axisdata); char **buffers = NBF_BUFFERS(bufferdata); char *buffer; - npy_intp reduce_outerdim = 0; - npy_intp *reduce_outerstrides = NULL; - npy_intp axisdata_incr = NIT_AXISDATA_SIZEOF(itflags, ndim, nop) / NPY_SIZEOF_INTP; @@ -1924,17 +2003,25 @@ npyiter_copy_from_buffers(NpyIter *iter) return 0; } - NPY_IT_DBG_PRINT("Iterator: Copying buffers to outputs\n"); - - if (itflags&NPY_ITFLAG_REDUCE) { - reduce_outerdim = NBF_REDUCE_OUTERDIM(bufferdata); - reduce_outerstrides = NBF_REDUCE_OUTERSTRIDES(bufferdata); - reduce_outeraxisdata = NIT_INDEX_AXISDATA(axisdata, reduce_outerdim); + if (itflags & NPY_ITFLAG_REDUCE) { + npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); + outer_axisdata = NIT_INDEX_AXISDATA(axisdata, bufferdata->outerdim); transfersize *= NBF_REDUCE_OUTERSIZE(bufferdata); } + NPY_IT_DBG_PRINT("Iterator: Copying buffers to outputs\n"); + NpyIter_TransferInfo *transferinfo = NBF_TRANSFERINFO(bufferdata); for (iop = 0; iop < nop; ++iop) { + if (op_itflags[iop]&NPY_OP_ITFLAG_BUFNEVER) { + continue; + } + + /* Currently, we always trash the buffer if there are references */ + if (PyDataType_REFCHK(dtypes[iop])) { + NIT_OPITFLAGS(iter)[iop] &= ~NPY_OP_ITFLAG_BUF_REUSABLE; + } + buffer = buffers[iop]; /* * Copy the data back to the arrays. If the type has refs, @@ -1943,73 +2030,26 @@ npyiter_copy_from_buffers(NpyIter *iter) * The flag USINGBUFFER is set when the buffer was used, so * only copy back when this flag is on. */ - if ((transferinfo[iop].write.func != NULL) && - (op_itflags[iop]&NPY_OP_ITFLAG_USINGBUFFER)) { - npy_intp op_transfersize; - - npy_intp src_stride, *dst_strides, *dst_coords, *dst_shape; - int ndim_transfer; - + if (transferinfo[iop].write.func != NULL) { NPY_IT_DBG_PRINT1("Iterator: Operand %d was buffered\n", (int)iop); - /* - * If this operand is being reduced in the inner loop, - * its buffering stride was set to zero, and just - * one element was copied. - */ - if (op_itflags[iop]&NPY_OP_ITFLAG_REDUCE) { - if (strides[iop] == 0) { - if (reduce_outerstrides[iop] == 0) { - op_transfersize = 1; - src_stride = 0; - dst_strides = &src_stride; - dst_coords = &NAD_INDEX(reduce_outeraxisdata); - dst_shape = &NAD_SHAPE(reduce_outeraxisdata); - ndim_transfer = 1; - } - else { - op_transfersize = NBF_REDUCE_OUTERSIZE(bufferdata); - src_stride = reduce_outerstrides[iop]; - dst_strides = - &NAD_STRIDES(reduce_outeraxisdata)[iop]; - dst_coords = &NAD_INDEX(reduce_outeraxisdata); - dst_shape = &NAD_SHAPE(reduce_outeraxisdata); - ndim_transfer = ndim - reduce_outerdim; - } - } - else { - if (reduce_outerstrides[iop] == 0) { - op_transfersize = NBF_SIZE(bufferdata); - src_stride = strides[iop]; - dst_strides = &ad_strides[iop]; - dst_coords = &NAD_INDEX(axisdata); - dst_shape = &NAD_SHAPE(axisdata); - ndim_transfer = reduce_outerdim ? 
- reduce_outerdim : 1; - } - else { - op_transfersize = transfersize; - src_stride = strides[iop]; - dst_strides = &ad_strides[iop]; - dst_coords = &NAD_INDEX(axisdata); - dst_shape = &NAD_SHAPE(axisdata); - ndim_transfer = ndim; - } - } - } - else { - op_transfersize = transfersize; - src_stride = strides[iop]; - dst_strides = &ad_strides[iop]; - dst_coords = &NAD_INDEX(axisdata); - dst_shape = &NAD_SHAPE(axisdata); - ndim_transfer = ndim; - } + int ndim_transfer; + npy_intp op_transfersize; + npy_intp src_stride; + npy_intp *dst_strides; + npy_intp *dst_coords; + npy_intp *dst_shape; - NPY_IT_DBG_PRINT2("Iterator: Copying buffer to " - "operand %d (%d items)\n", - (int)iop, (int)op_transfersize); + npyiter_fill_buffercopy_params(nop, iop, ndim, op_itflags[iop], + transfersize, bufferdata, axisdata, outer_axisdata, + &ndim_transfer, &op_transfersize, &src_stride, + &dst_strides, &dst_shape, &dst_coords); + + NPY_IT_DBG_PRINT( + "Iterator: Copying buffer to operand %d (%zd items):\n" + " transfer ndim: %d, inner stride: %zd, inner shape: %zd, buffer stride: %zd\n", + iop, op_transfersize, ndim_transfer, dst_strides[0], dst_shape[0], src_stride); /* WRITEMASKED operand */ if (op_itflags[iop] & NPY_OP_ITFLAG_WRITEMASKED) { @@ -2019,11 +2059,11 @@ npyiter_copy_from_buffers(NpyIter *iter) * The mask pointer may be in the buffer or in * the array, detect which one. */ - if ((op_itflags[maskop]&NPY_OP_ITFLAG_USINGBUFFER) != 0) { - maskptr = (npy_bool *)buffers[maskop]; + if ((op_itflags[maskop]&NPY_OP_ITFLAG_BUFNEVER)) { + maskptr = (npy_bool *)ad_ptrs[maskop]; } else { - maskptr = (npy_bool *)ad_ptrs[maskop]; + maskptr = (npy_bool *)buffers[maskop]; } if (PyArray_TransferMaskedStridedToNDim(ndim_transfer, @@ -2057,11 +2097,11 @@ npyiter_copy_from_buffers(NpyIter *iter) * The flag USINGBUFFER is set when the buffer was used, so * only decrement refs when this flag is on. */ - else if (transferinfo[iop].clear.func != NULL && - (op_itflags[iop]&NPY_OP_ITFLAG_USINGBUFFER)) { + else if (transferinfo[iop].clear.func != NULL) { NPY_IT_DBG_PRINT1( "Iterator: clearing refs of operand %d\n", (int)iop); npy_intp buf_stride = dtypes[iop]->elsize; + // TODO: transfersize is too large for reductions if (transferinfo[iop].clear.func( NULL, transferinfo[iop].clear.descr, buffer, transfersize, buf_stride, transferinfo[iop].clear.auxdata) < 0) { @@ -2091,509 +2131,157 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs) npyiter_opitflags *op_itflags = NIT_OPITFLAGS(iter); NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter); NpyIter_AxisData *axisdata = NIT_AXISDATA(iter), - *reduce_outeraxisdata = NULL; + *outer_axisdata = NULL; - PyArray_Descr **dtypes = NIT_DTYPES(iter); PyArrayObject **operands = NIT_OPERANDS(iter); - npy_intp *strides = NBF_STRIDES(bufferdata), - *ad_strides = NAD_STRIDES(axisdata); + npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); char **ptrs = NBF_PTRS(bufferdata), **ad_ptrs = NAD_PTRS(axisdata); char **buffers = NBF_BUFFERS(bufferdata); - npy_intp iterindex, iterend, transfersize, - singlestridesize, reduce_innersize = 0, reduce_outerdim = 0; - int is_onestride = 0, any_buffered = 0; - - npy_intp *reduce_outerstrides = NULL; - char **reduce_outerptrs = NULL; - - /* - * Have to get this flag before npyiter_checkreducesize sets - * it for the next iteration. 
- */ - npy_bool reuse_reduce_loops = (prev_dataptrs != NULL) && - ((itflags&NPY_ITFLAG_REUSE_REDUCE_LOOPS) != 0); + npy_intp iterindex, iterend, transfersize; npy_intp axisdata_incr = NIT_AXISDATA_SIZEOF(itflags, ndim, nop) / NPY_SIZEOF_INTP; - NPY_IT_DBG_PRINT("Iterator: Copying inputs to buffers\n"); - - /* Calculate the size if using any buffers */ + /* Fetch the maximum size we may wish to copy (or use if unbuffered) */ iterindex = NIT_ITERINDEX(iter); iterend = NIT_ITEREND(iter); transfersize = NBF_BUFFERSIZE(bufferdata); - if (transfersize > iterend - iterindex) { - transfersize = iterend - iterindex; - } - - /* If last time around, the reduce loop structure was full, we reuse it */ - if (reuse_reduce_loops) { - npy_intp full_transfersize, prev_reduce_outersize; + outer_axisdata = NIT_INDEX_AXISDATA(axisdata, bufferdata->outerdim); + npy_intp remaining_outersize = ( + outer_axisdata->shape - outer_axisdata->index); - prev_reduce_outersize = NBF_REDUCE_OUTERSIZE(bufferdata); - reduce_outerstrides = NBF_REDUCE_OUTERSTRIDES(bufferdata); - reduce_outerptrs = NBF_REDUCE_OUTERPTRS(bufferdata); - reduce_outerdim = NBF_REDUCE_OUTERDIM(bufferdata); - reduce_outeraxisdata = NIT_INDEX_AXISDATA(axisdata, reduce_outerdim); - reduce_innersize = NBF_SIZE(bufferdata); - NBF_REDUCE_POS(bufferdata) = 0; - /* - * Try to do make the outersize as big as possible. This allows - * it to shrink when processing the last bit of the outer reduce loop, - * then grow again at the beginning of the next outer reduce loop. - */ - NBF_REDUCE_OUTERSIZE(bufferdata) = (NAD_SHAPE(reduce_outeraxisdata)- - NAD_INDEX(reduce_outeraxisdata)); - full_transfersize = NBF_REDUCE_OUTERSIZE(bufferdata)*reduce_innersize; - /* If the full transfer size doesn't fit in the buffer, truncate it */ - if (full_transfersize > NBF_BUFFERSIZE(bufferdata)) { - NBF_REDUCE_OUTERSIZE(bufferdata) = transfersize/reduce_innersize; - transfersize = NBF_REDUCE_OUTERSIZE(bufferdata)*reduce_innersize; - } - else { - transfersize = full_transfersize; - } - if (prev_reduce_outersize < NBF_REDUCE_OUTERSIZE(bufferdata)) { - /* - * If the previous time around less data was copied it may not - * be safe to reuse the buffers even if the pointers match. - */ - reuse_reduce_loops = 0; - } - NBF_BUFITEREND(bufferdata) = iterindex + reduce_innersize; + NPY_IT_DBG_PRINT("Iterator: Copying inputs to buffers\n"); + NPY_IT_DBG_PRINT(" Max transfersize=%zd, coresize=%zd\n", + transfersize, bufferdata->coresize); - NPY_IT_DBG_PRINT3("Reused reduce transfersize: %d innersize: %d " - "itersize: %d\n", - (int)transfersize, - (int)reduce_innersize, - (int)NpyIter_GetIterSize(iter)); - NPY_IT_DBG_PRINT1("Reduced reduce outersize: %d", - (int)NBF_REDUCE_OUTERSIZE(bufferdata)); - } /* - * If there are any reduction operands, we may have to make - * the size smaller so we don't copy the same value into - * a buffer twice, as the buffering does not have a mechanism - * to combine values itself. + * If there is a coreoffset just copy to the end of a single coresize + * NB: Also if the size is shrunk, we definitely won't set buffer re-use. 
*/ - else if (itflags&NPY_ITFLAG_REDUCE) { - NPY_IT_DBG_PRINT("Iterator: Calculating reduce loops\n"); - transfersize = npyiter_checkreducesize(iter, transfersize, - &reduce_innersize, - &reduce_outerdim); - NPY_IT_DBG_PRINT3("Reduce transfersize: %d innersize: %d " - "itersize: %d\n", - (int)transfersize, - (int)reduce_innersize, - (int)NpyIter_GetIterSize(iter)); - - reduce_outerstrides = NBF_REDUCE_OUTERSTRIDES(bufferdata); - reduce_outerptrs = NBF_REDUCE_OUTERPTRS(bufferdata); - reduce_outeraxisdata = NIT_INDEX_AXISDATA(axisdata, reduce_outerdim); - NBF_SIZE(bufferdata) = reduce_innersize; - NBF_REDUCE_POS(bufferdata) = 0; - NBF_REDUCE_OUTERDIM(bufferdata) = reduce_outerdim; - NBF_BUFITEREND(bufferdata) = iterindex + reduce_innersize; - if (reduce_innersize == 0) { - NBF_REDUCE_OUTERSIZE(bufferdata) = 0; - return 0; - } - else { - NBF_REDUCE_OUTERSIZE(bufferdata) = transfersize/reduce_innersize; - } + if (bufferdata->coreoffset) { + prev_dataptrs = NULL; /* No way we can re-use the buffers safely. */ + transfersize = bufferdata->coresize - bufferdata->coreoffset; + NPY_IT_DBG_PRINT(" Shrunk transfersize due to coreoffset=%zd: %zd\n", + bufferdata->coreoffset, transfersize); } - else { - NBF_SIZE(bufferdata) = transfersize; - NBF_BUFITEREND(bufferdata) = iterindex + transfersize; + else if (transfersize > bufferdata->coresize * remaining_outersize) { + /* + * Shrink transfersize to not go beyond outer axis size. If not + * a reduction, it is unclear that this is necessary. + */ + transfersize = bufferdata->coresize * remaining_outersize; + NPY_IT_DBG_PRINT(" Shrunk transfersize outer size: %zd\n", transfersize); } - /* Calculate the maximum size if using a single stride and no buffers */ - singlestridesize = NAD_SHAPE(axisdata)-NAD_INDEX(axisdata); - if (singlestridesize > iterend - iterindex) { - singlestridesize = iterend - iterindex; - } - if (singlestridesize >= transfersize) { - is_onestride = 1; + /* And ensure that we don't go beyond the iterator end (if ranged) */ + if (transfersize > iterend - iterindex) { + transfersize = iterend - iterindex; + NPY_IT_DBG_PRINT(" Shrunk transfersize to itersize: %zd\n", transfersize); } - NpyIter_TransferInfo *transferinfo = NBF_TRANSFERINFO(bufferdata); - for (iop = 0; iop < nop; ++iop) { + bufferdata->size = transfersize; + NBF_BUFITEREND(bufferdata) = iterindex + transfersize; - switch (op_itflags[iop]& - (NPY_OP_ITFLAG_BUFNEVER| - NPY_OP_ITFLAG_CAST| - NPY_OP_ITFLAG_REDUCE)) { - /* Never need to buffer this operand */ - case NPY_OP_ITFLAG_BUFNEVER: - ptrs[iop] = ad_ptrs[iop]; - if (itflags&NPY_ITFLAG_REDUCE) { - reduce_outerstrides[iop] = reduce_innersize * - strides[iop]; - reduce_outerptrs[iop] = ptrs[iop]; - } - /* - * Should not adjust the stride - ad_strides[iop] - * could be zero, but strides[iop] was initialized - * to the first non-trivial stride. - */ - /* The flag NPY_OP_ITFLAG_USINGBUFFER can be ignored here */ - assert(!(op_itflags[iop] & NPY_OP_ITFLAG_USINGBUFFER)); - break; - /* Never need to buffer this operand */ - case NPY_OP_ITFLAG_BUFNEVER|NPY_OP_ITFLAG_REDUCE: - ptrs[iop] = ad_ptrs[iop]; - reduce_outerptrs[iop] = ptrs[iop]; - reduce_outerstrides[iop] = 0; - /* - * Should not adjust the stride - ad_strides[iop] - * could be zero, but strides[iop] was initialized - * to the first non-trivial stride. 
- */ - /* The flag NPY_OP_ITFLAG_USINGBUFFER can be ignored here */ - assert(!(op_itflags[iop] & NPY_OP_ITFLAG_USINGBUFFER)); - break; - /* Just a copy */ - case 0: - /* Do not reuse buffer if it did not exist */ - if (!(op_itflags[iop] & NPY_OP_ITFLAG_USINGBUFFER) && - (prev_dataptrs != NULL)) { - prev_dataptrs[iop] = NULL; - } - /* - * No copyswap or cast was requested, so all we're - * doing is copying the data to fill the buffer and - * produce a single stride. If the underlying data - * already does that, no need to copy it. - */ - if (is_onestride) { - ptrs[iop] = ad_ptrs[iop]; - strides[iop] = ad_strides[iop]; - /* Signal that the buffer is not being used */ - op_itflags[iop] &= (~NPY_OP_ITFLAG_USINGBUFFER); - } - /* If some other op is reduced, we have a double reduce loop */ - else if ((itflags&NPY_ITFLAG_REDUCE) && - (reduce_outerdim == 1) && - (transfersize/reduce_innersize <= - NAD_SHAPE(reduce_outeraxisdata) - - NAD_INDEX(reduce_outeraxisdata))) { - ptrs[iop] = ad_ptrs[iop]; - reduce_outerptrs[iop] = ptrs[iop]; - strides[iop] = ad_strides[iop]; - reduce_outerstrides[iop] = - NAD_STRIDES(reduce_outeraxisdata)[iop]; - /* Signal that the buffer is not being used */ - op_itflags[iop] &= (~NPY_OP_ITFLAG_USINGBUFFER); - } - else { - /* In this case, the buffer is being used */ - ptrs[iop] = buffers[iop]; - strides[iop] = dtypes[iop]->elsize; - if (itflags&NPY_ITFLAG_REDUCE) { - reduce_outerstrides[iop] = reduce_innersize * - strides[iop]; - reduce_outerptrs[iop] = ptrs[iop]; - } - /* Signal that the buffer is being used */ - op_itflags[iop] |= NPY_OP_ITFLAG_USINGBUFFER; - } - break; - /* Just a copy, but with a reduction */ - case NPY_OP_ITFLAG_REDUCE: - /* Do not reuse buffer if it did not exist */ - if (!(op_itflags[iop] & NPY_OP_ITFLAG_USINGBUFFER) && - (prev_dataptrs != NULL)) { - prev_dataptrs[iop] = NULL; - } - if (ad_strides[iop] == 0) { - strides[iop] = 0; - /* It's all in one stride in the inner loop dimension */ - if (is_onestride) { - NPY_IT_DBG_PRINT1("reduce op %d all one stride\n", (int)iop); - ptrs[iop] = ad_ptrs[iop]; - reduce_outerstrides[iop] = 0; - /* Signal that the buffer is not being used */ - op_itflags[iop] &= (~NPY_OP_ITFLAG_USINGBUFFER); - } - /* It's all in one stride in the reduce outer loop */ - else if ((reduce_outerdim > 0) && - (transfersize/reduce_innersize <= - NAD_SHAPE(reduce_outeraxisdata) - - NAD_INDEX(reduce_outeraxisdata))) { - NPY_IT_DBG_PRINT1("reduce op %d all one outer stride\n", - (int)iop); - ptrs[iop] = ad_ptrs[iop]; - /* Outer reduce loop advances by one item */ - reduce_outerstrides[iop] = - NAD_STRIDES(reduce_outeraxisdata)[iop]; - /* Signal that the buffer is not being used */ - op_itflags[iop] &= (~NPY_OP_ITFLAG_USINGBUFFER); - } - /* In this case, the buffer is being used */ - else { - NPY_IT_DBG_PRINT1("reduce op %d must buffer\n", (int)iop); - ptrs[iop] = buffers[iop]; - /* Both outer and inner reduce loops have stride 0 */ - if (NAD_STRIDES(reduce_outeraxisdata)[iop] == 0) { - reduce_outerstrides[iop] = 0; - } - /* Outer reduce loop advances by one item */ - else { - reduce_outerstrides[iop] = dtypes[iop]->elsize; - } - /* Signal that the buffer is being used */ - op_itflags[iop] |= NPY_OP_ITFLAG_USINGBUFFER; - } + if (transfersize == 0) { + return 0; + } - } - else if (is_onestride) { - NPY_IT_DBG_PRINT1("reduce op %d all one stride in dim 0\n", (int)iop); - ptrs[iop] = ad_ptrs[iop]; - strides[iop] = ad_strides[iop]; - reduce_outerstrides[iop] = 0; - /* Signal that the buffer is not being used */ - op_itflags[iop] &= 
(~NPY_OP_ITFLAG_USINGBUFFER); - } - else { - /* It's all in one stride in the reduce outer loop */ - if ((reduce_outerdim == 1) && - (transfersize/reduce_innersize <= - NAD_SHAPE(reduce_outeraxisdata) - - NAD_INDEX(reduce_outeraxisdata))) { - ptrs[iop] = ad_ptrs[iop]; - strides[iop] = ad_strides[iop]; - /* Outer reduce loop advances by one item */ - reduce_outerstrides[iop] = - NAD_STRIDES(reduce_outeraxisdata)[iop]; - /* Signal that the buffer is not being used */ - op_itflags[iop] &= (~NPY_OP_ITFLAG_USINGBUFFER); - } - /* In this case, the buffer is being used */ - else { - ptrs[iop] = buffers[iop]; - strides[iop] = dtypes[iop]->elsize; + NPY_IT_DBG_PRINT("Iterator: Buffer transfersize=%zd\n", transfersize); - if (NAD_STRIDES(reduce_outeraxisdata)[iop] == 0) { - /* Reduction in outer reduce loop */ - reduce_outerstrides[iop] = 0; - } - else { - /* Advance to next items in outer reduce loop */ - reduce_outerstrides[iop] = reduce_innersize * - dtypes[iop]->elsize; - } - /* Signal that the buffer is being used */ - op_itflags[iop] |= NPY_OP_ITFLAG_USINGBUFFER; - } - } - reduce_outerptrs[iop] = ptrs[iop]; - break; - default: - /* In this case, the buffer is always being used */ - any_buffered = 1; - - /* Signal that the buffer is being used */ - op_itflags[iop] |= NPY_OP_ITFLAG_USINGBUFFER; - - if (!(op_itflags[iop]&NPY_OP_ITFLAG_REDUCE)) { - ptrs[iop] = buffers[iop]; - strides[iop] = dtypes[iop]->elsize; - if (itflags&NPY_ITFLAG_REDUCE) { - reduce_outerstrides[iop] = reduce_innersize * - strides[iop]; - reduce_outerptrs[iop] = ptrs[iop]; - } - } - /* The buffer is being used with reduction */ - else { - ptrs[iop] = buffers[iop]; - if (ad_strides[iop] == 0) { - NPY_IT_DBG_PRINT1("cast op %d has innermost stride 0\n", (int)iop); - strides[iop] = 0; - /* Both outer and inner reduce loops have stride 0 */ - if (NAD_STRIDES(reduce_outeraxisdata)[iop] == 0) { - NPY_IT_DBG_PRINT1("cast op %d has outermost stride 0\n", (int)iop); - reduce_outerstrides[iop] = 0; - } - /* Outer reduce loop advances by one item */ - else { - NPY_IT_DBG_PRINT1("cast op %d has outermost stride !=0\n", (int)iop); - reduce_outerstrides[iop] = dtypes[iop]->elsize; - } - } - else { - NPY_IT_DBG_PRINT1("cast op %d has innermost stride !=0\n", (int)iop); - strides[iop] = dtypes[iop]->elsize; - - if (NAD_STRIDES(reduce_outeraxisdata)[iop] == 0) { - NPY_IT_DBG_PRINT1("cast op %d has outermost stride 0\n", (int)iop); - /* Reduction in outer reduce loop */ - reduce_outerstrides[iop] = 0; - } - else { - NPY_IT_DBG_PRINT1("cast op %d has outermost stride !=0\n", (int)iop); - /* Advance to next items in outer reduce loop */ - reduce_outerstrides[iop] = reduce_innersize * - dtypes[iop]->elsize; - } - } - reduce_outerptrs[iop] = ptrs[iop]; - } - break; + if (itflags & NPY_ITFLAG_REDUCE) { + NBF_REDUCE_OUTERSIZE(bufferdata) = transfersize / bufferdata->coresize; + if (NBF_REDUCE_OUTERSIZE(bufferdata) > 1) { + /* WARNING: bufferdata->size does not include reduce-outersize */ + bufferdata->size = bufferdata->coresize; } + NBF_REDUCE_POS(bufferdata) = 0; + } - /* - * If OP_ITFLAG_USINGBUFFER is enabled and the read func is not NULL, - * the buffer needs to be read. 
- */ - if (op_itflags[iop] & NPY_OP_ITFLAG_USINGBUFFER && - transferinfo[iop].read.func != NULL) { - npy_intp src_itemsize; - npy_intp op_transfersize; - - npy_intp dst_stride, *src_strides, *src_coords, *src_shape; - int ndim_transfer; - - npy_bool skip_transfer = 0; + NpyIter_TransferInfo *transferinfo = NBF_TRANSFERINFO(bufferdata); + for (iop = 0; iop < nop; ++iop) { + NPY_IT_DBG_PRINT("Iterator: buffer prep for op=%d @ %p inner-stride=%zd\n", + iop, ad_ptrs[iop], NBF_STRIDES(bufferdata)[iop]); - src_itemsize = PyArray_DTYPE(operands[iop])->elsize; + if (op_itflags[iop]&NPY_OP_ITFLAG_BUFNEVER) { + ptrs[iop] = ad_ptrs[iop]; + NBF_REDUCE_OUTERPTRS(bufferdata)[iop] = ptrs[iop]; + NPY_IT_DBG_PRINT(" unbuffered op (skipping)\n"); + continue; + } + else { + /* Pointers currently never change here. */ + ptrs[iop] = buffers[iop]; + NBF_REDUCE_OUTERPTRS(bufferdata)[iop] = ptrs[iop]; + } - /* If we reach here, buffering is required */ - any_buffered = 1; + if (!(op_itflags[iop]&NPY_OP_ITFLAG_READ)) { + NPY_IT_DBG_PRINT(" non-reading op (skipping)\n"); + continue; + } + if (op_itflags[iop]&NPY_OP_ITFLAG_BUF_REUSABLE + && prev_dataptrs && prev_dataptrs[iop] == ad_ptrs[iop]) { + NPY_IT_DBG_PRINT2("Iterator: skipping operands %d " + "copy (%d items) because the data pointer didn't change\n", + (int)iop, (int)transfersize); + continue; + } + else if (transfersize == NBF_BUFFERSIZE(bufferdata) + || (transfersize >= NBF_CORESIZE(bufferdata) + && op_itflags[iop]&NPY_OP_ITFLAG_REDUCE + && NAD_STRIDES(outer_axisdata)[iop] == 0)) { /* - * If this operand is being reduced in the inner loop, - * set its buffering stride to zero, and just copy - * one element. + * If we have a fully copy or a reduce with 0 stride outer and + * a copy larger than the coresize, this is now re-usable. + * NB: With a core-offset, we always copy less than the core-size. */ - if (op_itflags[iop]&NPY_OP_ITFLAG_REDUCE) { - if (ad_strides[iop] == 0) { - strides[iop] = 0; - if (reduce_outerstrides[iop] == 0) { - op_transfersize = 1; - dst_stride = 0; - src_strides = &dst_stride; - src_coords = &NAD_INDEX(reduce_outeraxisdata); - src_shape = &NAD_SHAPE(reduce_outeraxisdata); - ndim_transfer = 1; - - /* - * When we're reducing a single element, and - * it's still the same element, don't overwrite - * it even when reuse reduce loops is unset. - * This preserves the precision of the - * intermediate calculation. - */ - if (prev_dataptrs && - prev_dataptrs[iop] == ad_ptrs[iop]) { - NPY_IT_DBG_PRINT1("Iterator: skipping operand %d" - " copy because it's a 1-element reduce\n", - (int)iop); - - skip_transfer = 1; - } - } - else { - op_transfersize = NBF_REDUCE_OUTERSIZE(bufferdata); - dst_stride = reduce_outerstrides[iop]; - src_strides = &NAD_STRIDES(reduce_outeraxisdata)[iop]; - src_coords = &NAD_INDEX(reduce_outeraxisdata); - src_shape = &NAD_SHAPE(reduce_outeraxisdata); - ndim_transfer = ndim - reduce_outerdim; - } - } - else { - if (reduce_outerstrides[iop] == 0) { - op_transfersize = NBF_SIZE(bufferdata); - dst_stride = strides[iop]; - src_strides = &ad_strides[iop]; - src_coords = &NAD_INDEX(axisdata); - src_shape = &NAD_SHAPE(axisdata); - ndim_transfer = reduce_outerdim ? 
reduce_outerdim : 1; - } - else { - op_transfersize = transfersize; - dst_stride = strides[iop]; - src_strides = &ad_strides[iop]; - src_coords = &NAD_INDEX(axisdata); - src_shape = &NAD_SHAPE(axisdata); - ndim_transfer = ndim; - } - } - } - else { - op_transfersize = transfersize; - dst_stride = strides[iop]; - src_strides = &ad_strides[iop]; - src_coords = &NAD_INDEX(axisdata); - src_shape = &NAD_SHAPE(axisdata); - ndim_transfer = ndim; - } + NPY_IT_DBG_PRINT(" marking operand %d for buffer reuse\n", iop); + NIT_OPITFLAGS(iter)[iop] |= NPY_OP_ITFLAG_BUF_REUSABLE; + } - /* - * If the whole buffered loop structure remains the same, - * and the source pointer for this data didn't change, - * we don't have to copy the data again. - */ - if (reuse_reduce_loops && prev_dataptrs[iop] == ad_ptrs[iop]) { - NPY_IT_DBG_PRINT2("Iterator: skipping operands %d " - "copy (%d items) because loops are reused and the data " - "pointer didn't change\n", - (int)iop, (int)op_transfersize); - skip_transfer = 1; - } + int ndim_transfer; + npy_intp op_transfersize; + npy_intp dst_stride; + npy_intp *src_strides; + npy_intp *src_coords; + npy_intp *src_shape; + npy_intp src_itemsize = PyArray_DTYPE(operands[iop])->elsize; - /* - * Copy data to the buffers if necessary. - * - * We always copy if the operand has references. In that case - * a "write" function must be in use that either copies or clears - * the buffer. - * This write from buffer call does not check for skip-transfer - * so we have to assume the buffer is cleared. For dtypes that - * do not have references, we can assume that the write function - * will leave the source (buffer) unmodified. - */ - if (!skip_transfer || PyDataType_REFCHK(dtypes[iop])) { - NPY_IT_DBG_PRINT2("Iterator: Copying operand %d to " - "buffer (%d items)\n", - (int)iop, (int)op_transfersize); - - if (PyArray_TransferNDimToStrided( - ndim_transfer, ptrs[iop], dst_stride, - ad_ptrs[iop], src_strides, axisdata_incr, - src_coords, axisdata_incr, - src_shape, axisdata_incr, - op_transfersize, src_itemsize, - &transferinfo[iop].read) < 0) { - return -1; - } - } - } - } + npyiter_fill_buffercopy_params(nop, iop, ndim, op_itflags[iop], + transfersize, bufferdata, axisdata, outer_axisdata, + &ndim_transfer, &op_transfersize, &dst_stride, + &src_strides, &src_shape, &src_coords); - /* - * If buffering wasn't needed, we can grow the inner - * loop to as large as possible. - * - * TODO: Could grow REDUCE loop too with some more logic above. - */ - if (!any_buffered && (itflags&NPY_ITFLAG_GROWINNER) && - !(itflags&NPY_ITFLAG_REDUCE)) { - if (singlestridesize > transfersize) { - NPY_IT_DBG_PRINT2("Iterator: Expanding inner loop size " - "from %d to %d since buffering wasn't needed\n", - (int)NBF_SIZE(bufferdata), (int)singlestridesize); - NBF_SIZE(bufferdata) = singlestridesize; - NBF_BUFITEREND(bufferdata) = iterindex + singlestridesize; + /* + * Copy data to the buffers if necessary. + * + * We always copy if the operand has references. In that case + * a "write" function must be in use that either copies or clears + * the buffer. + * This write from buffer call does not check for skip-transfer + * so we have to assume the buffer is cleared. For dtypes that + * do not have references, we can assume that the write function + * will leave the source (buffer) unmodified. 
+ */ + NPY_IT_DBG_PRINT( + "Iterator: Copying operand %d to buffer (%zd items):\n" + " transfer ndim: %d, inner stride: %zd, inner shape: %zd, buffer stride: %zd\n", + iop, op_transfersize, ndim_transfer, src_strides[0], src_shape[0], dst_stride); + + if (PyArray_TransferNDimToStrided( + ndim_transfer, ptrs[iop], dst_stride, + ad_ptrs[iop], src_strides, axisdata_incr, + src_coords, axisdata_incr, + src_shape, axisdata_incr, + op_transfersize, src_itemsize, + &transferinfo[iop].read) < 0) { + return -1; } } - NPY_IT_DBG_PRINT1("Any buffering needed: %d\n", any_buffered); - NPY_IT_DBG_PRINT1("Iterator: Finished copying inputs to buffers " - "(buffered size is %d)\n", (int)NBF_SIZE(bufferdata)); + "(buffered size is %zd)\n", transfersize); return 0; } @@ -2631,13 +2319,16 @@ npyiter_clear_buffers(NpyIter *iter) PyArray_Descr **dtypes = NIT_DTYPES(iter); npyiter_opitflags *op_itflags = NIT_OPITFLAGS(iter); for (int iop = 0; iop < nop; ++iop, ++buffers) { - if (transferinfo[iop].clear.func == NULL || - !(op_itflags[iop]&NPY_OP_ITFLAG_USINGBUFFER)) { + if (transferinfo[iop].clear.func == NULL) { continue; } if (*buffers == 0) { continue; } + assert(!(op_itflags[iop]&NPY_OP_ITFLAG_BUFNEVER)); + /* Buffer cannot be re-used (not that we should ever try!) */ + op_itflags[iop] &= ~NPY_OP_ITFLAG_BUF_REUSABLE; + int itemsize = dtypes[iop]->elsize; if (transferinfo[iop].clear.func(NULL, dtypes[iop], *buffers, NBF_SIZE(bufferdata), itemsize, @@ -2652,236 +2343,6 @@ npyiter_clear_buffers(NpyIter *iter) } -/* - * This checks how much space can be buffered without encountering the - * same value twice, or for operands whose innermost stride is zero, - * without encountering a different value. By reducing the buffered - * amount to this size, reductions can be safely buffered. - * - * Reductions are buffered with two levels of looping, to avoid - * frequent copying to the buffers. The return value is the over-all - * buffer size, and when the flag NPY_ITFLAG_REDUCE is set, reduce_innersize - * receives the size of the inner of the two levels of looping. - * - * The value placed in reduce_outerdim is the index into the AXISDATA - * for where the second level of the double loop begins. - * - * The return value is always a multiple of the value placed in - * reduce_innersize. - */ -static npy_intp -npyiter_checkreducesize(NpyIter *iter, npy_intp count, - npy_intp *reduce_innersize, - npy_intp *reduce_outerdim) -{ - npy_uint32 itflags = NIT_ITFLAGS(iter); - int idim, ndim = NIT_NDIM(iter); - int iop, nop = NIT_NOP(iter); - - NpyIter_AxisData *axisdata; - npy_intp sizeof_axisdata; - npy_intp coord, shape, *strides; - npy_intp reducespace = 1, factor; - npy_bool nonzerocoord; - - npyiter_opitflags *op_itflags = NIT_OPITFLAGS(iter); - char stride0op[NPY_MAXARGS]; - - /* Default to no outer axis */ - *reduce_outerdim = 0; - - /* If there's only one dimension, no need to calculate anything */ - if (ndim == 1 || count == 0) { - *reduce_innersize = count; - return count; - } - - sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); - axisdata = NIT_AXISDATA(iter); - - /* Indicate which REDUCE operands have stride 0 in the inner loop */ - strides = NAD_STRIDES(axisdata); - for (iop = 0; iop < nop; ++iop) { - stride0op[iop] = (op_itflags[iop]&NPY_OP_ITFLAG_REDUCE) && - (strides[iop] == 0); - NPY_IT_DBG_PRINT2("Iterator: Operand %d has stride 0 in " - "the inner loop? 
%d\n", iop, (int)stride0op[iop]); - } - shape = NAD_SHAPE(axisdata); - coord = NAD_INDEX(axisdata); - reducespace += (shape-coord-1); - factor = shape; - NIT_ADVANCE_AXISDATA(axisdata, 1); - - /* Initialize nonzerocoord based on the first coordinate */ - nonzerocoord = (coord != 0); - - /* Go forward through axisdata, calculating the space available */ - for (idim = 1; idim < ndim && reducespace < count; - ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { - NPY_IT_DBG_PRINT2("Iterator: inner loop reducespace %d, count %d\n", - (int)reducespace, (int)count); - - strides = NAD_STRIDES(axisdata); - for (iop = 0; iop < nop; ++iop) { - /* - * If a reduce stride switched from zero to non-zero, or - * vice versa, that's the point where the data will stop - * being the same element or will repeat, and if the - * buffer starts with an all zero multi-index up to this - * point, gives us the reduce_innersize. - */ - if((stride0op[iop] && (strides[iop] != 0)) || - (!stride0op[iop] && - (strides[iop] == 0) && - (op_itflags[iop]&NPY_OP_ITFLAG_REDUCE))) { - NPY_IT_DBG_PRINT1("Iterator: Reduce operation limits " - "buffer to %d\n", (int)reducespace); - /* - * If we already found more elements than count, or - * the starting coordinate wasn't zero, the two-level - * looping is unnecessary/can't be done, so return. - */ - if (count <= reducespace) { - *reduce_innersize = count; - NIT_ITFLAGS(iter) |= NPY_ITFLAG_REUSE_REDUCE_LOOPS; - return count; - } - else if (nonzerocoord) { - if (reducespace < count) { - count = reducespace; - } - *reduce_innersize = count; - /* NOTE: This is similar to the (coord != 0) case below. */ - NIT_ITFLAGS(iter) &= ~NPY_ITFLAG_REUSE_REDUCE_LOOPS; - return count; - } - else { - *reduce_innersize = reducespace; - break; - } - } - } - /* If we broke out of the loop early, we found reduce_innersize */ - if (iop != nop) { - NPY_IT_DBG_PRINT2("Iterator: Found first dim not " - "reduce (%d of %d)\n", iop, nop); - break; - } - - shape = NAD_SHAPE(axisdata); - coord = NAD_INDEX(axisdata); - if (coord != 0) { - nonzerocoord = 1; - } - reducespace += (shape-coord-1) * factor; - factor *= shape; - } - - /* - * If there was any non-zero coordinate, the reduction inner - * loop doesn't fit in the buffersize, or the reduction inner loop - * covered the entire iteration size, can't do the double loop. - */ - if (nonzerocoord || count < reducespace || idim == ndim) { - if (reducespace < count) { - count = reducespace; - } - *reduce_innersize = count; - /* In this case, we can't reuse the reduce loops */ - NIT_ITFLAGS(iter) &= ~NPY_ITFLAG_REUSE_REDUCE_LOOPS; - return count; - } - - coord = NAD_INDEX(axisdata); - if (coord != 0) { - /* - * In this case, it is only safe to reuse the buffer if the amount - * of data copied is not more than the current axes, as is the - * case when reuse_reduce_loops was active already. - * It should be in principle OK when the idim loop returns immediately. - */ - NIT_ITFLAGS(iter) &= ~NPY_ITFLAG_REUSE_REDUCE_LOOPS; - } - else { - /* In this case, we can reuse the reduce loops */ - NIT_ITFLAGS(iter) |= NPY_ITFLAG_REUSE_REDUCE_LOOPS; - } - - *reduce_innersize = reducespace; - count /= reducespace; - - NPY_IT_DBG_PRINT2("Iterator: reduce_innersize %d count /ed %d\n", - (int)reducespace, (int)count); - - /* - * Continue through the rest of the dimensions. If there are - * two separated reduction axes, we may have to cut the buffer - * short again. 
- */ - *reduce_outerdim = idim; - reducespace = 1; - factor = 1; - /* Indicate which REDUCE operands have stride 0 at the current level */ - strides = NAD_STRIDES(axisdata); - for (iop = 0; iop < nop; ++iop) { - stride0op[iop] = (op_itflags[iop]&NPY_OP_ITFLAG_REDUCE) && - (strides[iop] == 0); - NPY_IT_DBG_PRINT2("Iterator: Operand %d has stride 0 in " - "the outer loop? %d\n", iop, (int)stride0op[iop]); - } - shape = NAD_SHAPE(axisdata); - reducespace += (shape-coord-1) * factor; - factor *= shape; - NIT_ADVANCE_AXISDATA(axisdata, 1); - ++idim; - - for (; idim < ndim && reducespace < count; - ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { - NPY_IT_DBG_PRINT2("Iterator: outer loop reducespace %d, count %d\n", - (int)reducespace, (int)count); - strides = NAD_STRIDES(axisdata); - for (iop = 0; iop < nop; ++iop) { - /* - * If a reduce stride switched from zero to non-zero, or - * vice versa, that's the point where the data will stop - * being the same element or will repeat, and if the - * buffer starts with an all zero multi-index up to this - * point, gives us the reduce_innersize. - */ - if((stride0op[iop] && (strides[iop] != 0)) || - (!stride0op[iop] && - (strides[iop] == 0) && - (op_itflags[iop]&NPY_OP_ITFLAG_REDUCE))) { - NPY_IT_DBG_PRINT1("Iterator: Reduce operation limits " - "buffer to %d\n", (int)reducespace); - /* - * This terminates the outer level of our double loop. - */ - if (count <= reducespace) { - return count * (*reduce_innersize); - } - else { - return reducespace * (*reduce_innersize); - } - } - } - - shape = NAD_SHAPE(axisdata); - coord = NAD_INDEX(axisdata); - if (coord != 0) { - nonzerocoord = 1; - } - reducespace += (shape-coord-1) * factor; - factor *= shape; - } - - if (reducespace < count) { - count = reducespace; - } - return count * (*reduce_innersize); -} - NPY_NO_EXPORT npy_bool npyiter_has_writeback(NpyIter *iter) { diff --git a/numpy/_core/src/multiarray/nditer_constr.c b/numpy/_core/src/multiarray/nditer_constr.c index d906e37e5dff..dc27682a8a67 100644 --- a/numpy/_core/src/multiarray/nditer_constr.c +++ b/numpy/_core/src/multiarray/nditer_constr.c @@ -99,6 +99,8 @@ npyiter_get_priority_subtype(int nop, PyArrayObject **op, static int npyiter_allocate_transfer_functions(NpyIter *iter); +static void +npyiter_find_buffering_setup(NpyIter *iter); /*NUMPY_API * Allocate a new iterator for multiple array objects, and advanced @@ -472,14 +474,16 @@ NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags, } } - /* If buffering is set without delayed allocation */ + /* If buffering is set prepare it */ if (itflags & NPY_ITFLAG_BUFFER) { + npyiter_find_buffering_setup(iter); + if (!npyiter_allocate_transfer_functions(iter)) { NpyIter_Deallocate(iter); return NULL; } if (!(itflags & NPY_ITFLAG_DELAYBUF)) { - /* Allocate the buffers */ + /* Allocate the buffers if that is not delayed */ if (!npyiter_allocate_buffers(iter, NULL)) { NpyIter_Deallocate(iter); return NULL; @@ -1931,6 +1935,363 @@ operand_different_than_broadcast: { } } + +/* + * At this point we (presumably) use a buffered iterator and here we want + * to find out the best way to buffer the iterator in a fashion that we don't + * have to figure out a lot of things on every iteration. + * + * How do we iterate? + * ------------------ + * There are currently two modes of "buffered" iteration: + * 1. The normal mode, where we either buffer each operand or not and + * then do a normal 1-D loop on those buffers (or operands). + * 2. The "reduce" mode. 
In reduce mode (ITFLAG_REDUCE) we internally use a + * a double iteration where: + * - One outer iteration with stride == 0 and a single stride core != 0. + * - One outer iteration with stride != 0 and a core of strides == 0. + * This setup allows filling the buffer with only the stride != 0 and then + * doing the double loop on the buffer (either ). + * + * Only a writeable (reduce) operand require this reduce mode because for + * reading it is OK if the buffer holds duplicates. + * The reason for the reduce mode is that it allows for a larger core size. + * If we use the reduce-mode, we can apply it also to read-only operands. + * + * Best buffer and core size + * ------------------------- + * We don't want to figure out the complicated copies every time we fill buffers + * so we want to find a the outer iteration dimension so that: + * - Its core size is smaller than the buffersize if buffering is needed. + * - Allows for reductions to work (with or without reduce-mode) + * - Optimizes for iteration overhead. We estimate the total overhead with: + * `(1 + n_buffers) / size`. + * The `size` is the `min(core_size * outer_dim_size, buffersize)` and + * estimates how often we fill buffers (size is unbounded if no buffering + * is needed). + * NOTE: We could tweak this, it is not optimized/proven to be best. + * + * NOTE: The function does not attempt + */ +static void +npyiter_find_buffering_setup(NpyIter *iter) +{ + int nop = iter->nop; + int ndim = iter->ndim; + npy_uint32 itflags = iter->itflags; + NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter); + + /* + * We check two things here, first whether the core is single strided + * and second, whether we found a reduce stride dimension for the operand. + * That is an outer dimension a reduce would have to take place on. + */ + int op_single_stride_dims[NPY_MAXARGS]; + int op_reduce_outer_dim[NPY_MAXARGS]; + + /* + * Note that this code requires/uses the fact that there is always one + * axisdata even for ndim == 0 (i.e. no need to worry about it). + */ + npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); + NpyIter_AxisData *axisdata = NIT_AXISDATA(iter); + npyiter_opitflags *op_itflags = NIT_OPITFLAGS(iter); + + /* + * We can only continue as long as we are within the maximum allowed size. + * When no buffering is needed and GROWINNER is set, we don't have to + * worry about this maximum. + */ + npy_intp maximum_size = NBF_BUFFERSIZE(bufferdata); + + /* The cost factor defined by: (1 + n_buffered) */ + int cost = 1; + + for (int iop = 0; iop < nop; ++iop) { + op_single_stride_dims[iop] = 1; + op_reduce_outer_dim[iop] = 0; + if (op_itflags[iop] & NPY_OP_ITFLAG_CAST) { + cost += 1; + } + } + + /* + * Once a reduce operand reaches a ==0/!=0 stride flip, this dimension + * becomes the outer reduce dimension. + * We may continue growing, but only if strides align for any such operand. + */ + int outer_reduce_dim = 0; + + npy_intp size = axisdata->shape; /* the current total size */ + + int best_dim = 0; + int best_cost = cost; + /* The size of the "outer" iteration and all previous dimensions: */ + npy_intp best_size = size; + npy_intp best_coresize = 1; + + NPY_IT_DBG_PRINT("Iterator: discovering best core size\n"); + for (int idim = 1; idim < ndim; idim++) { + if (outer_reduce_dim) { + /* Cannot currently expand beyond reduce dim! */ + break; + } + if (size >= maximum_size && + (cost > 1 || !(itflags & NPY_ITFLAG_GROWINNER))) { + /* Exceeded buffer size, can only improve without buffers and growinner. 
*/ + break; + } + + npy_intp *prev_strides = NAD_STRIDES(axisdata); + npy_intp prev_shape = NAD_SHAPE(axisdata); + NIT_ADVANCE_AXISDATA(axisdata, 1); + npy_intp *strides = NAD_STRIDES(axisdata); + + int iop; + for (iop = 0; iop < nop; iop++) { + /* Check that we set things up nicely (if shape is ever 1) */ + assert((axisdata->shape == 1) ? (prev_strides[iop] == strides[iop]) : 1); + /* + * Best case: the strides collaps for this operand, all fine. + * Keep track at which single-stride or outer dims we are. + */ + if (prev_strides[iop] * prev_shape == strides[iop]) { + if (op_single_stride_dims[iop] == idim) { + op_single_stride_dims[iop] += 1; + } + continue; + } + + if (op_single_stride_dims[iop] == idim) { + /* + * Operand now requires buffering (if it was not already). + * NOTE: This is technically not true since we may still use + * an outer reduce at this point. + * So it prefers a non-reduce setup, which seems not + * ideal, but OK. + */ + if (!(op_itflags[iop] & NPY_OP_ITFLAG_CAST)) { + cost += 1; + } + } + + /* + * If this operand is a reduction operand and this is not the + * outer_reduce_dim, then we must stop. + */ + if (op_itflags[iop] & NPY_OP_ITFLAG_REDUCE) { + /* + * We swap between !=0/==0 and thus make it a reduce + * (it is OK if another op started a reduce at this dimension) + */ + if (strides[iop] == 0 || prev_strides[iop] == 0) { + if (outer_reduce_dim == 0 || outer_reduce_dim == idim) { + op_reduce_outer_dim[iop] = idim; + outer_reduce_dim = idim; + } + } + } + /* For clarity: op_reduce_outer_dim[iop] if set always matches. */ + assert(!op_reduce_outer_dim[iop] || op_reduce_outer_dim[iop] == outer_reduce_dim); + } + if (iop != nop) { + /* Including this dimension is invalid due to a reduction. */ + break; + } + + npy_intp coresize = size; /* if we iterate here, this is the core */ + size *= axisdata->shape; + + double bufsize = size; + if (bufsize > maximum_size && + (cost > 1 || !(itflags & NPY_ITFLAG_GROWINNER))) { + /* If we need buffering, limit size in cost calculation. */ + bufsize = maximum_size; + } + + NPY_IT_DBG_PRINT(" dim=%d, n_buffered=%d, cost=%g @bufsize=%g (prev scaled cost=%g)\n", + idim, cost - 1, cost * (double)best_size, bufsize, best_cost * bufsize); + + /* + * Compare cost (use double to avoid overflows), as explained above + * the cost is compared via the other buffersize. + */ + if (cost * (double)best_size <= best_cost * bufsize) { + /* This dimension is better! */ + best_cost = cost; + best_coresize = coresize; + best_size = size; + best_dim = idim; + } + } + + npy_bool using_reduce = outer_reduce_dim && (best_dim == outer_reduce_dim); + npy_bool iterator_must_buffer = 0; + + /* We found the best chunking store the information */ + NIT_BUFFERDATA(iter)->coresize = best_coresize; + NIT_BUFFERDATA(iter)->outerdim = best_dim; + + /* + * We found the best dimensions to iterate on and now need to fill + * in all the buffer information related to the iteration. + * This includes filling in information about reduce outer dims + * (we do this even if it is not a reduce for simplicity). + */ + axisdata = NIT_AXISDATA(iter); + NpyIter_AxisData *reduce_axisdata = NIT_INDEX_AXISDATA(axisdata, outer_reduce_dim); + + NPY_IT_DBG_PRINT("Iterator: Found core size=%zd, outer=%zd at dim=%d:\n", + best_coresize, reduce_axisdata->shape, best_dim); + + /* If we are not using a reduce axes mark it and shrink. 
*/ + if (using_reduce) { + assert(NIT_ITFLAGS(iter) & NPY_ITFLAG_REDUCE); + NPY_IT_DBG_PRINT(" using reduce logic\n"); + } + else { + NIT_ITFLAGS(iter) &= ~NPY_ITFLAG_REDUCE; + NPY_IT_DBG_PRINT(" not using reduce logic\n"); + } + + for (int iop = 0; iop < nop; iop++) { + /* We need to fill in the following information */ + npy_bool is_reduce_op; + npy_bool op_is_buffered = (op_itflags[iop]&NPY_OP_ITFLAG_CAST) != 0; + + /* + * Figure out if this is iterated as a reduce op. Even one marked + * for reduction may not be iterated as one. + */ + if (!using_reduce) { + is_reduce_op = 0; + } + else if (op_reduce_outer_dim[iop] == best_dim) { + /* This op *must* use reduce semantics. */ + is_reduce_op = 1; + } + else if (op_single_stride_dims[iop] == best_dim && !op_is_buffered) { + /* + * Optimization: This operand is not buffered and we might as well + * iterate it as an unbuffered reduce operand (if not yet buffered). + */ + is_reduce_op = 1; + } + else if (NAD_STRIDES(reduce_axisdata)[iop] == 0 + && op_single_stride_dims[iop] <= best_dim) { + /* + * Optimization: If the outer (reduce) stride is 0 on the operand + * then we can iterate this in a reduce way: buffer the core only + * and repeat it in the "outer" dimension. + */ + is_reduce_op = 1; + } + else { + is_reduce_op = 0; + } + + /* + * See if the operand is a single stride (if we use reduce logic) + * we don't need to worry about the outermost dimension. + * If it is not a single stride, we must buffer the operand. + */ + if (op_single_stride_dims[iop] + is_reduce_op > best_dim) { + NIT_OPITFLAGS(iter)[iop] |= NPY_OP_ITFLAG_SINGLESTRIDE; + } + else { + op_is_buffered = 1; + } + + npy_intp inner_stride; + npy_intp reduce_outer_stride; + if (op_is_buffered) { + npy_intp itemsize = NIT_DTYPES(iter)[iop]->elsize; + /* + * A buffered operand has a stride of itemsize unless we use + * reduce logic. In that case, either the inner or outer stride + * is 0. + */ + if (!is_reduce_op + && NIT_OPITFLAGS(iter)[iop] & NPY_OP_ITFLAG_SINGLESTRIDE + && NAD_STRIDES(axisdata)[iop] == 0) { + /* This op is always 0 strides, so even the buffer is that. */ + inner_stride = 0; + reduce_outer_stride = 0; + } + else if (!is_reduce_op) { + /* normal buffered op */ + inner_stride = itemsize; + reduce_outer_stride = itemsize * best_coresize; + } + else if (NAD_STRIDES(reduce_axisdata)[iop] == 0) { + inner_stride = itemsize; + reduce_outer_stride = 0; + } + else { + inner_stride = 0; + reduce_outer_stride = itemsize; + } + } + else { + inner_stride = NAD_STRIDES(axisdata)[iop]; + reduce_outer_stride = NAD_STRIDES(reduce_axisdata)[iop]; + } + + if (!using_reduce) { + /* invalidate for now, since we should not use it */ + reduce_outer_stride = NPY_MIN_INTP; + } + + NPY_IT_DBG_PRINT( + "Iterator: op=%d (buffered=%d, reduce=%d, single-stride=%d):\n" + " inner stride: %zd\n" + " reduce outer stride: %zd (if iterator uses reduce)\n", + iop, op_is_buffered, is_reduce_op, + (NIT_OPITFLAGS(iter)[iop] & NPY_OP_ITFLAG_SINGLESTRIDE) != 0, + inner_stride, reduce_outer_stride); + + NBF_STRIDES(bufferdata)[iop] = inner_stride; + NBF_REDUCE_OUTERSTRIDES(bufferdata)[iop] = reduce_outer_stride; + + /* The actual reduce usage may have changed! 
*/ + if (is_reduce_op) { + NIT_OPITFLAGS(iter)[iop] |= NPY_OP_ITFLAG_REDUCE; + } + else { + NIT_OPITFLAGS(iter)[iop] &= ~NPY_OP_ITFLAG_REDUCE; + } + + if (!op_is_buffered) { + NIT_OPITFLAGS(iter)[iop] |= NPY_OP_ITFLAG_BUFNEVER; + } + else { + iterator_must_buffer = 1; + } + } + + /* + * If we buffer or do not have grow-inner, make sure that the size is + * below the maximum_size, but a multiple of the coresize. + */ + if (iterator_must_buffer || !(itflags & NPY_ITFLAG_GROWINNER)) { + if (maximum_size < best_size) { + best_size = best_coresize * (maximum_size / best_coresize); + } + } + /* + * Set the buffersize to either the: + * - the largest we amount trivially iterate (no buffering!). + * - the largest multiple of the coresize that is smaller than the + * requested/default buffersize. + */ + NIT_BUFFERDATA(iter)->buffersize = best_size; + /* Core size is 0 (unless the user applies a range explicitly). */ + NIT_BUFFERDATA(iter)->coreoffset = 0; + + return; +} + + /* * Replaces the AXISDATA for the iop'th operand, broadcasting * the dimensions as necessary. Assumes the replacement array is @@ -3105,6 +3466,7 @@ npyiter_allocate_transfer_functions(NpyIter *iter) * Reduction operands may be buffered with a different stride, * so we must pass NPY_MAX_INTP to the transfer function factory. */ + // TODO: This should not be the case anymore!? (was it ever?!?) op_stride = (flags & NPY_OP_ITFLAG_REDUCE) ? NPY_MAX_INTP : strides[iop]; diff --git a/numpy/_core/src/multiarray/nditer_impl.h b/numpy/_core/src/multiarray/nditer_impl.h index c907464ec53e..42ff6dadb8ca 100644 --- a/numpy/_core/src/multiarray/nditer_impl.h +++ b/numpy/_core/src/multiarray/nditer_impl.h @@ -55,13 +55,14 @@ /********** PRINTF DEBUG TRACING **************/ #define NPY_IT_DBG_TRACING 0 +/* TODO: Can remove the n-args macros, old C89 didn't have variadic macros. */ #if NPY_IT_DBG_TRACING -#define NPY_IT_DBG_PRINT(s) printf("%s", s) -#define NPY_IT_DBG_PRINT1(s, p1) printf(s, p1) -#define NPY_IT_DBG_PRINT2(s, p1, p2) printf(s, p1, p2) -#define NPY_IT_DBG_PRINT3(s, p1, p2, p3) printf(s, p1, p2, p3) +#define NPY_IT_DBG_PRINT(...) printf(__VA_ARGS__) +#define NPY_IT_DBG_PRINT1(s, p1) NPY_IT_DBG_PRINT(s, p1) +#define NPY_IT_DBG_PRINT2(s, p1, p2) NPY_IT_DBG_PRINT(s, p1, p2) +#define NPY_IT_DBG_PRINT3(s, p1, p2, p3) NPY_IT_DBG_PRINT(s, p1, p2, p3) #else -#define NPY_IT_DBG_PRINT(s) +#define NPY_IT_DBG_PRINT(...) #define NPY_IT_DBG_PRINT1(s, p1) #define NPY_IT_DBG_PRINT2(s, p1, p2) #define NPY_IT_DBG_PRINT3(s, p1, p2, p3) @@ -105,6 +106,7 @@ #define NPY_ITFLAG_REDUCE (1 << 12) /* Reduce iteration doesn't need to recalculate reduce loops next time */ #define NPY_ITFLAG_REUSE_REDUCE_LOOPS (1 << 13) + /* * Offset of (combined) ArrayMethod flags for all transfer functions. * For now, we use the top 8 bits. @@ -119,7 +121,7 @@ #define NPY_OP_ITFLAG_READ 0x0002 /* The operand needs type conversion/byte swapping/alignment */ #define NPY_OP_ITFLAG_CAST 0x0004 -/* The operand never needs buffering */ +/* The operand never needs buffering (implies BUF_SINGLESTRIDE) */ #define NPY_OP_ITFLAG_BUFNEVER 0x0008 /* The operand is being reduced */ #define NPY_OP_ITFLAG_REDUCE 0x0020 @@ -127,12 +129,17 @@ #define NPY_OP_ITFLAG_VIRTUAL 0x0040 /* The operand requires masking when copying buffer -> array */ #define NPY_OP_ITFLAG_WRITEMASKED 0x0080 -/* The operand's data pointer is pointing into its buffer */ -#define NPY_OP_ITFLAG_USINGBUFFER 0x0100 +/* + * Whether the buffer is *fully* filled and thus ready for re-usable. 
+ * (Must check if the start pointer matches until copy-from-buffer checks) + */ +#define NPY_OP_ITFLAG_BUF_REUSABLE 0x0100 /* The operand must be copied (with UPDATEIFCOPY if also ITFLAG_WRITE) */ #define NPY_OP_ITFLAG_FORCECOPY 0x0200 /* The operand has temporary data, write it back at dealloc */ #define NPY_OP_ITFLAG_HAS_WRITEBACK 0x0400 +/* Whether the buffer filling can use a single stride (minus reduce if reduce) */ +#define NPY_OP_ITFLAG_SINGLESTRIDE 0x0800 /* * The data layout of the iterator is fully specified by @@ -176,7 +183,7 @@ typedef npy_int16 npyiter_opitflags; (NPY_PTR_ALIGNED(sizeof(npyiter_opitflags) * nop)) #define NIT_BUFFERDATA_SIZEOF(itflags, ndim, nop) \ ((itflags&NPY_ITFLAG_BUFFER) ? ( \ - (NPY_SIZEOF_PY_INTPTR_T)*(6 + 5*nop) + sizeof(NpyIter_TransferInfo) * nop) : 0) + (NPY_SIZEOF_PY_INTPTR_T)*(8 + 5*nop) + sizeof(NpyIter_TransferInfo) * nop) : 0) /* Byte offsets of the iterator members starting from iter->iter_flexdata */ #define NIT_PERM_OFFSET() \ @@ -249,7 +256,7 @@ struct NpyIter_TransferInfo_tag { struct NpyIter_BufferData_tag { npy_intp buffersize, size, bufiterend, - reduce_pos, reduce_outersize, reduce_outerdim; + reduce_pos, coresize, outersize, coreoffset, outerdim; Py_intptr_t bd_flexdata; }; @@ -257,8 +264,10 @@ struct NpyIter_BufferData_tag { #define NBF_SIZE(bufferdata) ((bufferdata)->size) #define NBF_BUFITEREND(bufferdata) ((bufferdata)->bufiterend) #define NBF_REDUCE_POS(bufferdata) ((bufferdata)->reduce_pos) -#define NBF_REDUCE_OUTERSIZE(bufferdata) ((bufferdata)->reduce_outersize) -#define NBF_REDUCE_OUTERDIM(bufferdata) ((bufferdata)->reduce_outerdim) +#define NBF_CORESIZE(bufferdata) ((bufferdata)->coresize) +#define NBF_COREOFFSET(bufferdata) ((bufferdata)->coreoffset) +#define NBF_REDUCE_OUTERSIZE(bufferdata) ((bufferdata)->outersize) +#define NBF_OUTERDIM(bufferdata) ((bufferdata)->outerdim) #define NBF_STRIDES(bufferdata) ( \ &(bufferdata)->bd_flexdata + 0) #define NBF_PTRS(bufferdata) ((char **) \ From 7d19f2dcbd8bb6fe51a448f2f96ba4fbfd74ad0a Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Sat, 30 Nov 2024 19:50:12 +0100 Subject: [PATCH 02/23] TST: Fixup tests to pass (for now not correct) --- numpy/_core/tests/test_einsum.py | 3 ++- numpy/_core/tests/test_nditer.py | 17 +++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/numpy/_core/tests/test_einsum.py b/numpy/_core/tests/test_einsum.py index 636c97f03e87..1130721ecdb9 100644 --- a/numpy/_core/tests/test_einsum.py +++ b/numpy/_core/tests/test_einsum.py @@ -5,7 +5,8 @@ import numpy as np from numpy.testing import ( assert_, assert_equal, assert_array_equal, assert_almost_equal, - assert_raises, suppress_warnings, assert_raises_regex, assert_allclose + assert_raises, suppress_warnings, assert_raises_regex, assert_allclose, + assert_array_almost_equal_nulp ) # Setup for optimize einsum diff --git a/numpy/_core/tests/test_nditer.py b/numpy/_core/tests/test_nditer.py index 83aaa0a7795d..cad3f1a2bf34 100644 --- a/numpy/_core/tests/test_nditer.py +++ b/numpy/_core/tests/test_nditer.py @@ -2681,8 +2681,8 @@ def test_iter_buffering_reduction(): it = np.nditer([p, None], ['delay_bufalloc', 'reduce_ok', 'buffered', 'external_loop'], [['readonly'], ['readwrite', 'allocate']], - op_axes=[[-1, 0], [-1, -1]], - itershape=(2, 2)) + op_axes=[[-1, 0], [1, 0]], + itershape=(2, 2), op_dtypes=["float64", "int64"]) with it: it.operands[1].fill(0) it.reset() @@ -3296,7 +3296,7 @@ def test_debug_print(capfd): expected = """ ------ BEGIN ITERATOR DUMP ------ | Iterator 
Address: - | ItFlags: BUFFER REDUCE REUSE_REDUCE_LOOPS + | ItFlags: BUFFER REDUCE | NDim: 2 | NOp: 2 | IterSize: 50 @@ -3321,18 +3321,19 @@ def test_debug_print(capfd): | BufferData: | BufferSize: 50 | Size: 5 - | BufIterEnd: 5 + | BufIterEnd: 50 | REDUCE Pos: 0 - | REDUCE OuterSize: 10 - | REDUCE OuterDim: 1 + | BUFFER CoreSize: 5 + | BUFFER Reduce outersize: 10 + | BUFFER OuterDim: 1 | Strides: 8 4 | Ptrs: | REDUCE Outer Strides: 40 0 | REDUCE Outer Ptrs: | ReadTransferFn: - | ReadTransferData: + | ReadTransferData: 0x0 0x0 | WriteTransferFn: - | WriteTransferData: + | WriteTransferData: 0x0 0x0 | Buffers: | | AxisData[0]: From 809cf0cdde7555171739e70fe4de0ec750a1df1f Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Mon, 2 Dec 2024 09:44:16 +0100 Subject: [PATCH 03/23] BUG: Avoid a coresize of 0 --- numpy/_core/src/multiarray/nditer_constr.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/numpy/_core/src/multiarray/nditer_constr.c b/numpy/_core/src/multiarray/nditer_constr.c index dc27682a8a67..a96413467c3c 100644 --- a/numpy/_core/src/multiarray/nditer_constr.c +++ b/numpy/_core/src/multiarray/nditer_constr.c @@ -2101,6 +2101,9 @@ npyiter_find_buffering_setup(NpyIter *iter) npy_intp coresize = size; /* if we iterate here, this is the core */ size *= axisdata->shape; + if (size == 0) { + break; /* Avoid a zero coresize. */ + } double bufsize = size; if (bufsize > maximum_size && @@ -2129,6 +2132,7 @@ npyiter_find_buffering_setup(NpyIter *iter) npy_bool iterator_must_buffer = 0; /* We found the best chunking store the information */ + assert(best_coresize != 0); NIT_BUFFERDATA(iter)->coresize = best_coresize; NIT_BUFFERDATA(iter)->outerdim = best_dim; From 9302b303957d63b06a1bb5a85bd559339b361ea3 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Mon, 2 Dec 2024 09:54:42 +0100 Subject: [PATCH 04/23] TST: The nullpointer doesn't print uniformly accross platforms --- numpy/_core/tests/test_nditer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numpy/_core/tests/test_nditer.py b/numpy/_core/tests/test_nditer.py index cad3f1a2bf34..80e0b8e0f1f3 100644 --- a/numpy/_core/tests/test_nditer.py +++ b/numpy/_core/tests/test_nditer.py @@ -3331,9 +3331,9 @@ def test_debug_print(capfd): | REDUCE Outer Strides: 40 0 | REDUCE Outer Ptrs: | ReadTransferFn: - | ReadTransferData: 0x0 0x0 + | ReadTransferData: | WriteTransferFn: - | WriteTransferData: 0x0 0x0 + | WriteTransferData: | Buffers: | | AxisData[0]: From 54d967f21cc61be228eb4f3b72e7d67d8ba85e3f Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Mon, 2 Dec 2024 10:13:54 +0100 Subject: [PATCH 05/23] BUG: Add missing pointer dereference The old code just did bad things. But a too large ndim would not have mattered, since the copy would have finished early anyway. 
--- numpy/_core/src/multiarray/nditer_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/_core/src/multiarray/nditer_api.c b/numpy/_core/src/multiarray/nditer_api.c index 751130fedc91..728d23823b14 100644 --- a/numpy/_core/src/multiarray/nditer_api.c +++ b/numpy/_core/src/multiarray/nditer_api.c @@ -1942,7 +1942,7 @@ npyiter_fill_buffercopy_params( if (opitflags & NPY_OP_ITFLAG_REDUCE) { /* Outer dim is reduced, so omit it from copying */ - ndim_transfer -= 1; + *ndim_transfer -= 1; if (*op_transfersize > bufferdata->coresize) { *op_transfersize = bufferdata->coresize; } From 87ad5a27bfa11a54c2837c6967349bdadb6d5984 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Mon, 2 Dec 2024 12:55:42 +0100 Subject: [PATCH 06/23] BUG: Fix coreoffset mechanism and unset buf-reuse on different (partial) copy --- numpy/_core/src/multiarray/nditer_api.c | 16 ++++++++++++---- numpy/_core/src/umath/ufunc_object.c | 1 - numpy/_core/tests/test_nditer.py | 2 +- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/numpy/_core/src/multiarray/nditer_api.c b/numpy/_core/src/multiarray/nditer_api.c index 728d23823b14..426c4b9d93ff 100644 --- a/numpy/_core/src/multiarray/nditer_api.c +++ b/numpy/_core/src/multiarray/nditer_api.c @@ -1571,6 +1571,8 @@ NpyIter_DebugPrint(NpyIter *iter) printf("VIRTUAL "); if ((NIT_OPITFLAGS(iter)[iop])&NPY_OP_ITFLAG_WRITEMASKED) printf("WRITEMASKED "); + if ((NIT_OPITFLAGS(iter)[iop])&NPY_OP_ITFLAG_SINGLESTRIDE) + printf("SINGLESTRIDE "); printf("\n"); } printf("|\n"); @@ -1583,11 +1585,11 @@ NpyIter_DebugPrint(NpyIter *iter) printf("| BufferSize: %d\n", (int)NBF_BUFFERSIZE(bufferdata)); printf("| Size: %d\n", (int)NBF_SIZE(bufferdata)); printf("| BufIterEnd: %d\n", (int)NBF_BUFITEREND(bufferdata)); + printf("| BUFFER CoreSize: %d\n", + (int)NBF_CORESIZE(bufferdata)); if (itflags&NPY_ITFLAG_REDUCE) { printf("| REDUCE Pos: %d\n", (int)NBF_REDUCE_POS(bufferdata)); - printf("| BUFFER CoreSize: %d\n", - (int)NBF_CORESIZE(bufferdata)); printf("| BUFFER Reduce outersize: %d\n", (int)NBF_REDUCE_OUTERSIZE(bufferdata)); printf("| BUFFER OuterDim: %d\n", @@ -1886,8 +1888,8 @@ npyiter_goto_iterindex(NpyIter *iter, npy_intp iterindex) } if (itflags&NPY_ITFLAG_BUFFER) { /* Find the remainder if chunking to the buffers coresize */ - npy_intp fact = iterindex / NIT_BUFFERDATA(iter)->coresize; - npy_intp offset = iterindex - fact * NIT_BUFFERDATA(iter)->coresize; + npy_intp fact = NIT_ITERINDEX(iter) / NIT_BUFFERDATA(iter)->coresize; + npy_intp offset = NIT_ITERINDEX(iter) - fact * NIT_BUFFERDATA(iter)->coresize; NIT_BUFFERDATA(iter)->coreoffset = offset; } } @@ -2194,6 +2196,7 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs) if (NBF_REDUCE_OUTERSIZE(bufferdata) > 1) { /* WARNING: bufferdata->size does not include reduce-outersize */ bufferdata->size = bufferdata->coresize; + NBF_BUFITEREND(bufferdata) = iterindex + bufferdata->coresize; } NBF_REDUCE_POS(bufferdata) = 0; } @@ -2239,6 +2242,11 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs) NPY_IT_DBG_PRINT(" marking operand %d for buffer reuse\n", iop); NIT_OPITFLAGS(iter)[iop] |= NPY_OP_ITFLAG_BUF_REUSABLE; } + else if (prev_dataptrs == NULL || prev_dataptrs[iop] != ad_ptrs[iop] || + bufferdata->coreoffset) { + /* Copying at a different offset, so must unset re-use. 
*/ + NIT_OPITFLAGS(iter)[iop] &= ~NPY_OP_ITFLAG_BUF_REUSABLE; + } int ndim_transfer; npy_intp op_transfersize; diff --git a/numpy/_core/src/umath/ufunc_object.c b/numpy/_core/src/umath/ufunc_object.c index 8748ad5e4974..fad233d8746c 100644 --- a/numpy/_core/src/umath/ufunc_object.c +++ b/numpy/_core/src/umath/ufunc_object.c @@ -2493,7 +2493,6 @@ reduce_loop(PyArrayMethod_Context *context, dataptrs_copy[3] = dataptrs[2]; strides_copy[3] = strides[2]; } - retval = strided_loop(context, dataptrs_copy, countptr, strides_copy, auxdata); if (retval < 0) { diff --git a/numpy/_core/tests/test_nditer.py b/numpy/_core/tests/test_nditer.py index 80e0b8e0f1f3..1cba8c296fab 100644 --- a/numpy/_core/tests/test_nditer.py +++ b/numpy/_core/tests/test_nditer.py @@ -3321,7 +3321,7 @@ def test_debug_print(capfd): | BufferData: | BufferSize: 50 | Size: 5 - | BufIterEnd: 50 + | BufIterEnd: 5 | REDUCE Pos: 0 | BUFFER CoreSize: 5 | BUFFER Reduce outersize: 10 From 67cddf7c24226b516c5770a0f17f11571596f972 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Mon, 2 Dec 2024 13:04:04 +0100 Subject: [PATCH 07/23] TST: Forgot one more nditer debug_print() fixup --- numpy/_core/tests/test_nditer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/_core/tests/test_nditer.py b/numpy/_core/tests/test_nditer.py index 1cba8c296fab..338cb0242e4a 100644 --- a/numpy/_core/tests/test_nditer.py +++ b/numpy/_core/tests/test_nditer.py @@ -3322,8 +3322,8 @@ def test_debug_print(capfd): | BufferSize: 50 | Size: 5 | BufIterEnd: 5 - | REDUCE Pos: 0 | BUFFER CoreSize: 5 + | REDUCE Pos: 0 | BUFFER Reduce outersize: 10 | BUFFER OuterDim: 1 | Strides: 8 4 From facd23702fa3aa2aff5638483c01ebf1a929bb1a Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Mon, 2 Dec 2024 14:12:06 +0100 Subject: [PATCH 08/23] Apply suggestions from code review --- numpy/_core/src/multiarray/nditer_api.c | 1 - numpy/_core/src/multiarray/nditer_constr.c | 14 ++------------ numpy/_core/src/multiarray/nditer_impl.h | 1 - numpy/_core/src/umath/ufunc_object.c | 1 + numpy/_core/tests/test_einsum.py | 3 +-- numpy/_core/tests/test_nditer.py | 4 ++-- 6 files changed, 6 insertions(+), 18 deletions(-) diff --git a/numpy/_core/src/multiarray/nditer_api.c b/numpy/_core/src/multiarray/nditer_api.c index 426c4b9d93ff..4a5e599b9c00 100644 --- a/numpy/_core/src/multiarray/nditer_api.c +++ b/numpy/_core/src/multiarray/nditer_api.c @@ -808,7 +808,6 @@ NpyIter_IsFirstVisit(NpyIter *iter, int iop) * We only need to check the outer level of this two-level loop, * because of the requirement that EXTERNAL_LOOP be enabled. */ - // TODO: Do I need to change this? Chance is no, so long REDUCE_OUTERSTRIDES still makes sense! if (itflags&NPY_ITFLAG_BUFFER) { NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter); /* The outer reduce loop */ diff --git a/numpy/_core/src/multiarray/nditer_constr.c b/numpy/_core/src/multiarray/nditer_constr.c index a96413467c3c..e3347b9a3126 100644 --- a/numpy/_core/src/multiarray/nditer_constr.c +++ b/numpy/_core/src/multiarray/nditer_constr.c @@ -2018,7 +2018,6 @@ npyiter_find_buffering_setup(NpyIter *iter) /* * Once a reduce operand reaches a ==0/!=0 stride flip, this dimension * becomes the outer reduce dimension. - * We may continue growing, but only if strides align for any such operand. */ int outer_reduce_dim = 0; @@ -2051,10 +2050,7 @@ npyiter_find_buffering_setup(NpyIter *iter) for (iop = 0; iop < nop; iop++) { /* Check that we set things up nicely (if shape is ever 1) */ assert((axisdata->shape == 1) ? 
(prev_strides[iop] == strides[iop]) : 1); - /* - * Best case: the strides collaps for this operand, all fine. - * Keep track at which single-stride or outer dims we are. - */ + /* Best case: the strides collapse for this operand. */ if (prev_strides[iop] * prev_shape == strides[iop]) { if (op_single_stride_dims[iop] == idim) { op_single_stride_dims[iop] += 1; @@ -2095,7 +2091,7 @@ npyiter_find_buffering_setup(NpyIter *iter) assert(!op_reduce_outer_dim[iop] || op_reduce_outer_dim[iop] == outer_reduce_dim); } if (iop != nop) { - /* Including this dimension is invalid due to a reduction. */ + /* Including this dimension was invalid due to a reduction. */ break; } @@ -2282,12 +2278,6 @@ npyiter_find_buffering_setup(NpyIter *iter) best_size = best_coresize * (maximum_size / best_coresize); } } - /* - * Set the buffersize to either the: - * - the largest we amount trivially iterate (no buffering!). - * - the largest multiple of the coresize that is smaller than the - * requested/default buffersize. - */ NIT_BUFFERDATA(iter)->buffersize = best_size; /* Core size is 0 (unless the user applies a range explicitly). */ NIT_BUFFERDATA(iter)->coreoffset = 0; diff --git a/numpy/_core/src/multiarray/nditer_impl.h b/numpy/_core/src/multiarray/nditer_impl.h index 42ff6dadb8ca..89be150fb8b4 100644 --- a/numpy/_core/src/multiarray/nditer_impl.h +++ b/numpy/_core/src/multiarray/nditer_impl.h @@ -106,7 +106,6 @@ #define NPY_ITFLAG_REDUCE (1 << 12) /* Reduce iteration doesn't need to recalculate reduce loops next time */ #define NPY_ITFLAG_REUSE_REDUCE_LOOPS (1 << 13) - /* * Offset of (combined) ArrayMethod flags for all transfer functions. * For now, we use the top 8 bits. diff --git a/numpy/_core/src/umath/ufunc_object.c b/numpy/_core/src/umath/ufunc_object.c index fad233d8746c..8748ad5e4974 100644 --- a/numpy/_core/src/umath/ufunc_object.c +++ b/numpy/_core/src/umath/ufunc_object.c @@ -2493,6 +2493,7 @@ reduce_loop(PyArrayMethod_Context *context, dataptrs_copy[3] = dataptrs[2]; strides_copy[3] = strides[2]; } + retval = strided_loop(context, dataptrs_copy, countptr, strides_copy, auxdata); if (retval < 0) { diff --git a/numpy/_core/tests/test_einsum.py b/numpy/_core/tests/test_einsum.py index 1130721ecdb9..636c97f03e87 100644 --- a/numpy/_core/tests/test_einsum.py +++ b/numpy/_core/tests/test_einsum.py @@ -5,8 +5,7 @@ import numpy as np from numpy.testing import ( assert_, assert_equal, assert_array_equal, assert_almost_equal, - assert_raises, suppress_warnings, assert_raises_regex, assert_allclose, - assert_array_almost_equal_nulp + assert_raises, suppress_warnings, assert_raises_regex, assert_allclose ) # Setup for optimize einsum diff --git a/numpy/_core/tests/test_nditer.py b/numpy/_core/tests/test_nditer.py index 338cb0242e4a..e331a3f31f55 100644 --- a/numpy/_core/tests/test_nditer.py +++ b/numpy/_core/tests/test_nditer.py @@ -2681,8 +2681,8 @@ def test_iter_buffering_reduction(): it = np.nditer([p, None], ['delay_bufalloc', 'reduce_ok', 'buffered', 'external_loop'], [['readonly'], ['readwrite', 'allocate']], - op_axes=[[-1, 0], [1, 0]], - itershape=(2, 2), op_dtypes=["float64", "int64"]) + op_axes=[[-1, 0], [-1, -1]], + itershape=(2, 2)) with it: it.operands[1].fill(0) it.reset() From 22f335ce4a7f4e7add289487343d01e4ed79c949 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Mon, 2 Dec 2024 14:36:49 +0100 Subject: [PATCH 09/23] MAINT: Remove bufnever logic from allocate-arrays (we do it later anyway) --- numpy/_core/src/multiarray/nditer_constr.c | 73 +--------------------- 1 file changed, 2 
insertions(+), 71 deletions(-) diff --git a/numpy/_core/src/multiarray/nditer_constr.c b/numpy/_core/src/multiarray/nditer_constr.c index e3347b9a3126..91a994d6a15e 100644 --- a/numpy/_core/src/multiarray/nditer_constr.c +++ b/numpy/_core/src/multiarray/nditer_constr.c @@ -3043,18 +3043,13 @@ npyiter_allocate_arrays(NpyIter *iter, int **op_axes) { npy_uint32 itflags = NIT_ITFLAGS(iter); - int idim, ndim = NIT_NDIM(iter); + int ndim = NIT_NDIM(iter); int iop, nop = NIT_NOP(iter); int check_writemasked_reductions = 0; - NpyIter_BufferData *bufferdata = NULL; PyArrayObject **op = NIT_OPERANDS(iter); - if (itflags & NPY_ITFLAG_BUFFER) { - bufferdata = NIT_BUFFERDATA(iter); - } - if (flags & NPY_ITER_COPY_IF_OVERLAP) { /* * Perform operand memory overlap checks, if requested. @@ -3223,15 +3218,8 @@ npyiter_allocate_arrays(NpyIter *iter, */ npyiter_replace_axisdata(iter, iop, op[iop], 0, NULL); - /* - * New arrays need no cast, and in the case - * of scalars, always have stride 0 so never need buffering - */ - op_itflags[iop] |= NPY_OP_ITFLAG_BUFNEVER; + /* New arrays need no cast */ op_itflags[iop] &= ~NPY_OP_ITFLAG_CAST; - if (itflags & NPY_ITFLAG_BUFFER) { - NBF_STRIDES(bufferdata)[iop] = 0; - } } /* * Make a temporary copy if, @@ -3323,63 +3311,6 @@ npyiter_allocate_arrays(NpyIter *iter, } } } - - /* - * If no alignment, byte swap, or casting is needed, - * the inner stride of this operand works for the whole - * array, we can set NPY_OP_ITFLAG_BUFNEVER. - */ - if ((itflags & NPY_ITFLAG_BUFFER) && - !(op_itflags[iop] & NPY_OP_ITFLAG_CAST)) { - NpyIter_AxisData *axisdata = NIT_AXISDATA(iter); - if (ndim <= 1) { - op_itflags[iop] |= NPY_OP_ITFLAG_BUFNEVER; - NBF_STRIDES(bufferdata)[iop] = NAD_STRIDES(axisdata)[iop]; - } - else if (PyArray_NDIM(op[iop]) > 0) { - npy_intp stride, shape, innerstride = 0, innershape; - npy_intp sizeof_axisdata = - NIT_AXISDATA_SIZEOF(itflags, ndim, nop); - /* Find stride of the first non-empty shape */ - for (idim = 0; idim < ndim; ++idim) { - innershape = NAD_SHAPE(axisdata); - if (innershape != 1) { - innerstride = NAD_STRIDES(axisdata)[iop]; - break; - } - NIT_ADVANCE_AXISDATA(axisdata, 1); - } - ++idim; - NIT_ADVANCE_AXISDATA(axisdata, 1); - /* Check that everything could have coalesced together */ - for (; idim < ndim; ++idim) { - stride = NAD_STRIDES(axisdata)[iop]; - shape = NAD_SHAPE(axisdata); - if (shape != 1) { - /* - * If N times the inner stride doesn't equal this - * stride, the multi-dimensionality is needed. - */ - if (innerstride*innershape != stride) { - break; - } - else { - innershape *= shape; - } - } - NIT_ADVANCE_AXISDATA(axisdata, 1); - } - /* - * If we looped all the way to the end, one stride works. - * Set that stride, because it may not belong to the first - * dimension. 
- */ - if (idim == ndim) { - op_itflags[iop] |= NPY_OP_ITFLAG_BUFNEVER; - NBF_STRIDES(bufferdata)[iop] = innerstride; - } - } - } } if (check_writemasked_reductions) { From ac282fcc584243bc1f01941af103951ccd707274 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Mon, 2 Dec 2024 18:14:40 +0100 Subject: [PATCH 10/23] MAINT: Rename to BUF_SINGLESTRIDE and use the new gap in the flags --- numpy/_core/src/multiarray/nditer_api.c | 6 +++--- numpy/_core/src/multiarray/nditer_constr.c | 6 +++--- numpy/_core/src/multiarray/nditer_impl.h | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/numpy/_core/src/multiarray/nditer_api.c b/numpy/_core/src/multiarray/nditer_api.c index 4a5e599b9c00..7c3fa4ccffdf 100644 --- a/numpy/_core/src/multiarray/nditer_api.c +++ b/numpy/_core/src/multiarray/nditer_api.c @@ -1570,8 +1570,8 @@ NpyIter_DebugPrint(NpyIter *iter) printf("VIRTUAL "); if ((NIT_OPITFLAGS(iter)[iop])&NPY_OP_ITFLAG_WRITEMASKED) printf("WRITEMASKED "); - if ((NIT_OPITFLAGS(iter)[iop])&NPY_OP_ITFLAG_SINGLESTRIDE) - printf("SINGLESTRIDE "); + if ((NIT_OPITFLAGS(iter)[iop])&NPY_OP_ITFLAG_BUF_SINGLESTRIDE) + printf("BUF_SINGLESTRIDE "); printf("\n"); } printf("|\n"); @@ -1950,7 +1950,7 @@ npyiter_fill_buffercopy_params( /* copy setup is identical to non-reduced now. */ } - if (opitflags & NPY_OP_ITFLAG_SINGLESTRIDE) { + if (opitflags & NPY_OP_ITFLAG_BUF_SINGLESTRIDE) { /* Flatten the copy into a single stride. */ *ndim_transfer = 1; *op_shape = op_transfersize; diff --git a/numpy/_core/src/multiarray/nditer_constr.c b/numpy/_core/src/multiarray/nditer_constr.c index 91a994d6a15e..16b0e50e9fbd 100644 --- a/numpy/_core/src/multiarray/nditer_constr.c +++ b/numpy/_core/src/multiarray/nditer_constr.c @@ -2196,7 +2196,7 @@ npyiter_find_buffering_setup(NpyIter *iter) * If it is not a single stride, we must buffer the operand. */ if (op_single_stride_dims[iop] + is_reduce_op > best_dim) { - NIT_OPITFLAGS(iter)[iop] |= NPY_OP_ITFLAG_SINGLESTRIDE; + NIT_OPITFLAGS(iter)[iop] |= NPY_OP_ITFLAG_BUF_SINGLESTRIDE; } else { op_is_buffered = 1; @@ -2212,7 +2212,7 @@ npyiter_find_buffering_setup(NpyIter *iter) * is 0. */ if (!is_reduce_op - && NIT_OPITFLAGS(iter)[iop] & NPY_OP_ITFLAG_SINGLESTRIDE + && NIT_OPITFLAGS(iter)[iop] & NPY_OP_ITFLAG_BUF_SINGLESTRIDE && NAD_STRIDES(axisdata)[iop] == 0) { /* This op is always 0 strides, so even the buffer is that. 
*/ inner_stride = 0; @@ -2247,7 +2247,7 @@ npyiter_find_buffering_setup(NpyIter *iter) " inner stride: %zd\n" " reduce outer stride: %zd (if iterator uses reduce)\n", iop, op_is_buffered, is_reduce_op, - (NIT_OPITFLAGS(iter)[iop] & NPY_OP_ITFLAG_SINGLESTRIDE) != 0, + (NIT_OPITFLAGS(iter)[iop] & NPY_OP_ITFLAG_BUF_SINGLESTRIDE) != 0, inner_stride, reduce_outer_stride); NBF_STRIDES(bufferdata)[iop] = inner_stride; diff --git a/numpy/_core/src/multiarray/nditer_impl.h b/numpy/_core/src/multiarray/nditer_impl.h index 89be150fb8b4..8e5379c12ddd 100644 --- a/numpy/_core/src/multiarray/nditer_impl.h +++ b/numpy/_core/src/multiarray/nditer_impl.h @@ -122,6 +122,8 @@ #define NPY_OP_ITFLAG_CAST 0x0004 /* The operand never needs buffering (implies BUF_SINGLESTRIDE) */ #define NPY_OP_ITFLAG_BUFNEVER 0x0008 +/* Whether the buffer filling can use a single stride (minus reduce if reduce) */ +#define NPY_OP_ITFLAG_BUF_SINGLESTRIDE 0x0010 /* The operand is being reduced */ #define NPY_OP_ITFLAG_REDUCE 0x0020 /* The operand is for temporary use, does not have a backing array */ @@ -137,8 +139,6 @@ #define NPY_OP_ITFLAG_FORCECOPY 0x0200 /* The operand has temporary data, write it back at dealloc */ #define NPY_OP_ITFLAG_HAS_WRITEBACK 0x0400 -/* Whether the buffer filling can use a single stride (minus reduce if reduce) */ -#define NPY_OP_ITFLAG_SINGLESTRIDE 0x0800 /* * The data layout of the iterator is fully specified by From 6ed3b99e9ba74ab624ef6d17456c1cb4e8458e35 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 3 Dec 2024 10:22:38 +0100 Subject: [PATCH 11/23] MAINT: Remove unnecessary fixed-stride logic --- numpy/_core/src/multiarray/nditer_api.c | 73 +++---------------------- 1 file changed, 8 insertions(+), 65 deletions(-) diff --git a/numpy/_core/src/multiarray/nditer_api.c b/numpy/_core/src/multiarray/nditer_api.c index 7c3fa4ccffdf..d50b1375037d 100644 --- a/numpy/_core/src/multiarray/nditer_api.c +++ b/numpy/_core/src/multiarray/nditer_api.c @@ -1326,8 +1326,10 @@ NpyIter_GetAxisStrideArray(NpyIter *iter, int axis) /*NUMPY_API * Get an array of strides which are fixed. Any strides which may - * change during iteration receive the value NPY_MAX_INTP. Once - * the iterator is ready to iterate, call this to get the strides + * change during iteration receive the value NPY_MAX_INTP + * (as of NumPy 2.3, `NPY_MAX_INTP` will never happen but must be supported; + * we could guarantee this, but not sure if we should). + * Once the iterator is ready to iterate, call this to get the strides * which will always be fixed in the inner loop, then choose optimized * inner loop functions which take advantage of those fixed strides. 
* @@ -1337,75 +1339,16 @@ NPY_NO_EXPORT void NpyIter_GetInnerFixedStrideArray(NpyIter *iter, npy_intp *out_strides) { npy_uint32 itflags = NIT_ITFLAGS(iter); - int ndim = NIT_NDIM(iter); - int iop, nop = NIT_NOP(iter); + int nop = NIT_NOP(iter); NpyIter_AxisData *axisdata0 = NIT_AXISDATA(iter); - npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); if (itflags&NPY_ITFLAG_BUFFER) { - NpyIter_BufferData *data = NIT_BUFFERDATA(iter); - npyiter_opitflags *op_itflags = NIT_OPITFLAGS(iter); - npy_intp stride, *strides = NBF_STRIDES(data), - *ad_strides = NAD_STRIDES(axisdata0); - PyArray_Descr **dtypes = NIT_DTYPES(iter); - - for (iop = 0; iop < nop; ++iop) { - stride = strides[iop]; - /* - * Operands which are always/never buffered have fixed strides, - * and everything has fixed strides when ndim is 0 or 1 - */ - if (ndim <= 1 || (op_itflags[iop]& - (NPY_OP_ITFLAG_CAST|NPY_OP_ITFLAG_BUFNEVER))) { - out_strides[iop] = stride; - } - /* If it's a reduction, 0-stride inner loop may have fixed stride */ - else if (stride == 0 && (itflags&NPY_ITFLAG_REDUCE)) { - /* If it's a reduction operand, definitely fixed stride */ - if (op_itflags[iop]&NPY_OP_ITFLAG_REDUCE) { - out_strides[iop] = stride; - } - /* - * Otherwise it's guaranteed to be a fixed stride if the - * stride is 0 for all the dimensions. - */ - else { - NpyIter_AxisData *axisdata = axisdata0; - int idim; - for (idim = 0; idim < ndim; ++idim) { - if (NAD_STRIDES(axisdata)[iop] != 0) { - break; - } - NIT_ADVANCE_AXISDATA(axisdata, 1); - } - /* If all the strides were 0, the stride won't change */ - if (idim == ndim) { - out_strides[iop] = stride; - } - else { - out_strides[iop] = NPY_MAX_INTP; - } - } - } - /* - * Inner loop contiguous array means its stride won't change when - * switching between buffering and not buffering - */ - else if (ad_strides[iop] == dtypes[iop]->elsize) { - out_strides[iop] = ad_strides[iop]; - } - /* - * Otherwise the strides can change if the operand is sometimes - * buffered, sometimes not. - */ - else { - out_strides[iop] = NPY_MAX_INTP; - } - } + /* If there is buffering we wrote the strides into the bufferdata. */ + memcpy(out_strides, NBF_STRIDES(NIT_BUFFERDATA(iter)), nop*NPY_SIZEOF_INTP); } else { - /* If there's no buffering, the strides are always fixed */ + /* If there's no buffering, the strides come from the operands. */ memcpy(out_strides, NAD_STRIDES(axisdata0), nop*NPY_SIZEOF_INTP); } } From 162a951e84fb150a13b561c49d25f136e7bdba03 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 3 Dec 2024 10:52:26 +0100 Subject: [PATCH 12/23] MAINT: Fill in correct buffer transferstride always Also clarifies the comment, since it may be a bit confusing: The outer stride being zero means that there must be nonzero inner strides (otherwise they are all zero and we don't use reduce logic). But the inner stride being zero is not meaningful. 
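To picture the two stride layouts described in this commit message from the
Python side, here is a small illustrative sketch (not part of the patch; it
only uses plain broadcasting, and the printed strides assume default
float64/C-order arrays):

    import numpy as np

    arr = np.ones((100, 10, 10))[::2]       # slicing keeps all three axes
    out_last = np.zeros((50, 10))            # result shape of arr.sum(-1)
    out_mid = np.zeros((50, 10))             # result shape of arr.sum(-2)

    # Reduced axis is the innermost one: the inner stride is 0 while the
    # outer strides are not.
    print(np.broadcast_to(out_last[:, :, None], arr.shape).strides)  # (80, 8, 0)

    # Reduced axis is in the middle: that (outer) stride is 0 while the
    # core still has a nonzero inner stride, so the core can be buffered.
    print(np.broadcast_to(out_mid[:, None, :], arr.shape).strides)   # (80, 0, 8)

These are the two cases the comment distinguishes: when the outer stride is 0
the inner strides cannot all be 0 as well, since a fully broadcast operand
does not use the reduce logic at all.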
--- numpy/_core/src/multiarray/nditer_api.c | 5 +++-- numpy/_core/src/multiarray/nditer_constr.c | 19 ++++++++++++++----- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/numpy/_core/src/multiarray/nditer_api.c b/numpy/_core/src/multiarray/nditer_api.c index d50b1375037d..72e37629039b 100644 --- a/numpy/_core/src/multiarray/nditer_api.c +++ b/numpy/_core/src/multiarray/nditer_api.c @@ -1863,8 +1863,9 @@ npyiter_fill_buffercopy_params( if ((opitflags & NPY_OP_ITFLAG_REDUCE) && (NAD_STRIDES(outer_axisdata)[iop] != 0)) { /* - * Reduce with inner stride ==0 (outer !=0), we buffer the outer stride - * which also means buffering only outersize items. + * Reduce with all inner strides ==0 (outer !=0). We buffer the outer + * stride which also means buffering only outersize items. + * (If the outer stride is 0, some inner ones are guaranteed nonzero.) */ assert(NAD_STRIDES(axisdata)[iop] == 0); *ndim_transfer = 1; diff --git a/numpy/_core/src/multiarray/nditer_constr.c b/numpy/_core/src/multiarray/nditer_constr.c index 16b0e50e9fbd..5149b210d22d 100644 --- a/numpy/_core/src/multiarray/nditer_constr.c +++ b/numpy/_core/src/multiarray/nditer_constr.c @@ -3381,19 +3381,28 @@ npyiter_allocate_transfer_functions(NpyIter *iter) npy_intp *strides = NAD_STRIDES(axisdata), op_stride; NpyIter_TransferInfo *transferinfo = NBF_TRANSFERINFO(bufferdata); + npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); + NpyIter_AxisData *reduce_axisdata = NIT_INDEX_AXISDATA(axisdata, bufferdata->outerdim); + npy_intp *reduce_strides = NAD_STRIDES(reduce_axisdata); + /* combined cast flags, the new cast flags for each cast: */ NPY_ARRAYMETHOD_FLAGS cflags = PyArrayMethod_MINIMAL_FLAGS; NPY_ARRAYMETHOD_FLAGS nc_flags; for (iop = 0; iop < nop; ++iop) { npyiter_opitflags flags = op_itflags[iop]; + /* - * Reduction operands may be buffered with a different stride, - * so we must pass NPY_MAX_INTP to the transfer function factory. + * Reduce operands buffer the outer stride if it is nonzero; compare + * `npyiter_fill_buffercopy_params`. + * (Inner strides cannot _all_ be zero if the outer is, but some can be.) */ - // TODO: This should not be the case anymore!? (was it ever?!?) - op_stride = (flags & NPY_OP_ITFLAG_REDUCE) ? NPY_MAX_INTP : - strides[iop]; + if ((op_itflags[iop] & NPY_OP_ITFLAG_REDUCE) && reduce_strides[iop] != 0) { + op_stride = reduce_strides[iop]; + } + else { + op_stride = strides[iop]; + } /* * If we have determined that a buffer may be needed, From 0cc37b91cf381e463773f236626028a9a0207a29 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 3 Dec 2024 11:06:37 +0100 Subject: [PATCH 13/23] MAINT: Don't use static zero (initialize by caller instead) --- numpy/_core/src/multiarray/nditer_api.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/numpy/_core/src/multiarray/nditer_api.c b/numpy/_core/src/multiarray/nditer_api.c index 72e37629039b..bfb01b1ff548 100644 --- a/numpy/_core/src/multiarray/nditer_api.c +++ b/numpy/_core/src/multiarray/nditer_api.c @@ -1857,7 +1857,6 @@ npyiter_fill_buffercopy_params( * NOTE: Except the transfersize itself everything here is fixed * and we could create it once early on. */ - static npy_intp zero = 0; // TODO: better way? 
*ndim_transfer = ndim; *op_transfersize = transfersize; @@ -1873,7 +1872,7 @@ npyiter_fill_buffercopy_params( *buf_stride = NBF_REDUCE_OUTERSTRIDES(bufferdata)[iop]; *op_shape = op_transfersize; - *op_coords = &zero; + assert(*op_coords == 0); /* initialized by caller currently */ *op_strides = &NAD_STRIDES(outer_axisdata)[iop]; return; } @@ -1898,7 +1897,7 @@ npyiter_fill_buffercopy_params( /* Flatten the copy into a single stride. */ *ndim_transfer = 1; *op_shape = op_transfersize; - *op_coords = &zero; + assert(*op_coords == 0); /* initialized by caller currently */ *op_strides = &NAD_STRIDES(axisdata)[iop]; if ((*op_strides)[0] == 0) { *op_transfersize = 1; @@ -1979,11 +1978,12 @@ npyiter_copy_from_buffers(NpyIter *iter) NPY_IT_DBG_PRINT1("Iterator: Operand %d was buffered\n", (int)iop); + npy_intp zero = 0; /* used as coord for 1-D copies */ int ndim_transfer; npy_intp op_transfersize; npy_intp src_stride; npy_intp *dst_strides; - npy_intp *dst_coords; + npy_intp *dst_coords = &zero; npy_intp *dst_shape; npyiter_fill_buffercopy_params(nop, iop, ndim, op_itflags[iop], @@ -2191,11 +2191,12 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs) NIT_OPITFLAGS(iter)[iop] &= ~NPY_OP_ITFLAG_BUF_REUSABLE; } + npy_intp zero = 0; /* used as coord for 1-D copies */ int ndim_transfer; npy_intp op_transfersize; npy_intp dst_stride; npy_intp *src_strides; - npy_intp *src_coords; + npy_intp *src_coords = &zero; npy_intp *src_shape; npy_intp src_itemsize = PyArray_DTYPE(operands[iop])->elsize; From 786e54618a089b348d264b144ff1b44d5f1c6525 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 3 Dec 2024 11:28:21 +0100 Subject: [PATCH 14/23] DOC: Improve the function documentation somewhat. --- numpy/_core/src/multiarray/nditer_constr.c | 24 ++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/numpy/_core/src/multiarray/nditer_constr.c b/numpy/_core/src/multiarray/nditer_constr.c index 5149b210d22d..b204a9f1a6a2 100644 --- a/numpy/_core/src/multiarray/nditer_constr.c +++ b/numpy/_core/src/multiarray/nditer_constr.c @@ -1948,8 +1948,9 @@ operand_different_than_broadcast: { * then do a normal 1-D loop on those buffers (or operands). * 2. The "reduce" mode. In reduce mode (ITFLAG_REDUCE) we internally use a * a double iteration where: - * - One outer iteration with stride == 0 and a single stride core != 0. - * - One outer iteration with stride != 0 and a core of strides == 0. + * - One outer iteration with stride == 0 and a core with at least one + * stride != 0 (all of them if this is a reduce operand). + * - One outer iteration with stride != 0 and a core of all strides == 0. * This setup allows filling the buffer with only the stride != 0 and then * doing the double loop on the buffer (either ). * @@ -1958,6 +1959,20 @@ operand_different_than_broadcast: { * The reason for the reduce mode is that it allows for a larger core size. * If we use the reduce-mode, we can apply it also to read-only operands. * + * The functino here finds an outer dimension and it's "core" to use that + * works with reductions. + * While iterating, we will buffer while making sure that: + * - Never buffer beyond the outer dimension (optimize chance of re-use). + * - Never buffer beyond the end of the core (all but outer dimension) + * if the user manually set the iterator to the middle of the core. + * + * These two things mean that the buffer always looks the same, although it + * may be smaller and it also means that we can optimize buffer copies for + * the inner-sizes. 
+ * (In the original version, the buffersize was always fully used for non + * reductions, which meant a lot of work on each buffer copy, less buffer + * re-use, and no fixed-strides into the buffer.) + * * Best buffer and core size * ------------------------- * We don't want to figure out the complicated copies every time we fill buffers @@ -1971,7 +1986,8 @@ operand_different_than_broadcast: { * is needed). * NOTE: We could tweak this, it is not optimized/proven to be best. * - * NOTE: The function does not attempt + * In theory, the reduction axis could also span multiple axes, but we do + * not try to discover this. */ static void npyiter_find_buffering_setup(NpyIter *iter) @@ -3395,7 +3411,7 @@ npyiter_allocate_transfer_functions(NpyIter *iter) /* * Reduce operands buffer the outer stride if it is nonzero; compare * `npyiter_fill_buffercopy_params`. - * (Inner strides cannot _all_ be zero if the outer is, but some can be.) + * (Inner strides cannot _all_ be zero if the outer is, but some.) */ if ((op_itflags[iop] & NPY_OP_ITFLAG_REDUCE) && reduce_strides[iop] != 0) { op_stride = reduce_strides[iop]; From d09be245375b5e1b1c9769415325592fd423761a Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 3 Dec 2024 11:40:04 +0100 Subject: [PATCH 15/23] BUG: Fix typo in new assert --- numpy/_core/src/multiarray/nditer_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numpy/_core/src/multiarray/nditer_api.c b/numpy/_core/src/multiarray/nditer_api.c index bfb01b1ff548..b204cbd9e71e 100644 --- a/numpy/_core/src/multiarray/nditer_api.c +++ b/numpy/_core/src/multiarray/nditer_api.c @@ -1872,7 +1872,7 @@ npyiter_fill_buffercopy_params( *buf_stride = NBF_REDUCE_OUTERSTRIDES(bufferdata)[iop]; *op_shape = op_transfersize; - assert(*op_coords == 0); /* initialized by caller currently */ + assert(**op_coords == 0); /* initialized by caller currently */ *op_strides = &NAD_STRIDES(outer_axisdata)[iop]; return; } @@ -1897,7 +1897,7 @@ npyiter_fill_buffercopy_params( /* Flatten the copy into a single stride. 
*/ *ndim_transfer = 1; *op_shape = op_transfersize; - assert(*op_coords == 0); /* initialized by caller currently */ + assert(**op_coords == 0); /* initialized by caller currently */ *op_strides = &NAD_STRIDES(axisdata)[iop]; if ((*op_strides)[0] == 0) { *op_transfersize = 1; From 92ecbaf29b2854a95108f6e8f7ceb627471cacf0 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Thu, 5 Dec 2024 18:09:12 +0100 Subject: [PATCH 16/23] MAINT: Reinstante the CONTIG flag (as buggy as it was) --- numpy/_core/src/multiarray/nditer_api.c | 15 +++-- numpy/_core/src/multiarray/nditer_constr.c | 26 ++++++-- numpy/_core/src/multiarray/nditer_impl.h | 2 + numpy/_core/tests/test_nditer.py | 77 +++++++++++++++++++++- 4 files changed, 110 insertions(+), 10 deletions(-) diff --git a/numpy/_core/src/multiarray/nditer_api.c b/numpy/_core/src/multiarray/nditer_api.c index b204cbd9e71e..b3d21e8d35c3 100644 --- a/numpy/_core/src/multiarray/nditer_api.c +++ b/numpy/_core/src/multiarray/nditer_api.c @@ -1515,6 +1515,8 @@ NpyIter_DebugPrint(NpyIter *iter) printf("WRITEMASKED "); if ((NIT_OPITFLAGS(iter)[iop])&NPY_OP_ITFLAG_BUF_SINGLESTRIDE) printf("BUF_SINGLESTRIDE "); + if ((NIT_OPITFLAGS(iter)[iop])&NPY_OP_ITFLAG_CONTIG) + printf("CONTIG "); printf("\n"); } printf("|\n"); @@ -1532,9 +1534,9 @@ NpyIter_DebugPrint(NpyIter *iter) if (itflags&NPY_ITFLAG_REDUCE) { printf("| REDUCE Pos: %d\n", (int)NBF_REDUCE_POS(bufferdata)); - printf("| BUFFER Reduce outersize: %d\n", + printf("| REDUCE OuterSize: %d\n", (int)NBF_REDUCE_OUTERSIZE(bufferdata)); - printf("| BUFFER OuterDim: %d\n", + printf("| REDUCE OuterDim: %d\n", (int)NBF_OUTERDIM(bufferdata)); } printf("| Strides: "); @@ -1894,12 +1896,17 @@ npyiter_fill_buffercopy_params( } if (opitflags & NPY_OP_ITFLAG_BUF_SINGLESTRIDE) { - /* Flatten the copy into a single stride. */ *ndim_transfer = 1; *op_shape = op_transfersize; assert(**op_coords == 0); /* initialized by caller currently */ *op_strides = &NAD_STRIDES(axisdata)[iop]; - if ((*op_strides)[0] == 0) { + if ((*op_strides)[0] == 0 && ( + !(opitflags & NPY_OP_ITFLAG_CONTIG) || + (opitflags & NPY_OP_ITFLAG_WRITE))) { + /* + * If the user didn't force contig, optimize single element. + * (Unless CONTIG was requested and this is not a write/reduce!) + */ *op_transfersize = 1; *buf_stride = 0; } diff --git a/numpy/_core/src/multiarray/nditer_constr.c b/numpy/_core/src/multiarray/nditer_constr.c index b204a9f1a6a2..260fb68a4fa0 100644 --- a/numpy/_core/src/multiarray/nditer_constr.c +++ b/numpy/_core/src/multiarray/nditer_constr.c @@ -1010,6 +1010,10 @@ npyiter_check_per_op_flags(npy_uint32 op_flags, npyiter_opitflags *op_itflags) *op_itflags |= NPY_OP_ITFLAG_VIRTUAL; } + if (op_flags & NPY_ITER_CONTIG) { + *op_itflags |= NPY_OP_ITFLAG_CONTIG; + } + return 1; } @@ -2175,6 +2179,11 @@ npyiter_find_buffering_setup(NpyIter *iter) npy_bool is_reduce_op; npy_bool op_is_buffered = (op_itflags[iop]&NPY_OP_ITFLAG_CAST) != 0; + /* If contig was requested and this is not writeable avoid zero strides */ + npy_bool avoid_zero_strides = ( + (op_itflags[iop] & NPY_OP_ITFLAG_CONTIG) != 0 + && !(op_itflags[iop] & NPY_OP_ITFLAG_WRITE)); + /* * Figure out if this is iterated as a reduce op. Even one marked * for reduction may not be iterated as one. @@ -2189,16 +2198,19 @@ npyiter_find_buffering_setup(NpyIter *iter) else if (op_single_stride_dims[iop] == best_dim && !op_is_buffered) { /* * Optimization: This operand is not buffered and we might as well - * iterate it as an unbuffered reduce operand (if not yet buffered). 
+ * iterate it as an unbuffered reduce operand. */ is_reduce_op = 1; } else if (NAD_STRIDES(reduce_axisdata)[iop] == 0 - && op_single_stride_dims[iop] <= best_dim) { + && op_single_stride_dims[iop] <= best_dim + && !avoid_zero_strides) { /* * Optimization: If the outer (reduce) stride is 0 on the operand * then we can iterate this in a reduce way: buffer the core only * and repeat it in the "outer" dimension. + * If user requested contig, we may have to avoid 0 strides, this + * is incompatible with the reduce path. */ is_reduce_op = 1; } @@ -2229,7 +2241,8 @@ npyiter_find_buffering_setup(NpyIter *iter) */ if (!is_reduce_op && NIT_OPITFLAGS(iter)[iop] & NPY_OP_ITFLAG_BUF_SINGLESTRIDE - && NAD_STRIDES(axisdata)[iop] == 0) { + && NAD_STRIDES(axisdata)[iop] == 0 + && !avoid_zero_strides) { /* This op is always 0 strides, so even the buffer is that. */ inner_stride = 0; reduce_outer_stride = 0; @@ -3310,11 +3323,16 @@ npyiter_allocate_arrays(NpyIter *iter, } /* Here we can finally check for contiguous iteration */ - if (op_flags[iop] & NPY_ITER_CONTIG) { + if (op_itflags[iop] & NPY_OP_ITFLAG_CONTIG) { NpyIter_AxisData *axisdata = NIT_AXISDATA(iter); npy_intp stride = NAD_STRIDES(axisdata)[iop]; if (stride != op_dtype[iop]->elsize) { + /* + * Need to copy to buffer (cast) to ensure contiguous + * NOTE: This is the wrong place in case of axes reorder + * (there is an xfailing test for this). + */ NPY_IT_DBG_PRINT("Iterator: Setting NPY_OP_ITFLAG_CAST " "because of NPY_ITER_CONTIG\n"); op_itflags[iop] |= NPY_OP_ITFLAG_CAST; diff --git a/numpy/_core/src/multiarray/nditer_impl.h b/numpy/_core/src/multiarray/nditer_impl.h index 8e5379c12ddd..42610d8efd30 100644 --- a/numpy/_core/src/multiarray/nditer_impl.h +++ b/numpy/_core/src/multiarray/nditer_impl.h @@ -139,6 +139,8 @@ #define NPY_OP_ITFLAG_FORCECOPY 0x0200 /* The operand has temporary data, write it back at dealloc */ #define NPY_OP_ITFLAG_HAS_WRITEBACK 0x0400 +/* Whether the user request a contiguous operand */ +#define NPY_OP_ITFLAG_CONTIG 0x0800 /* * The data layout of the iterator is fully specified by diff --git a/numpy/_core/tests/test_nditer.py b/numpy/_core/tests/test_nditer.py index e331a3f31f55..90bec417c082 100644 --- a/numpy/_core/tests/test_nditer.py +++ b/numpy/_core/tests/test_nditer.py @@ -2320,6 +2320,79 @@ def test_iter_buffering_growinner(): assert_equal(i[0].size, a.size) +@pytest.mark.parametrize("read_or_readwrite", ["readonly", "readwrite"]) +def test_iter_contig_flag_reduce_error(read_or_readwrite): + # Test that a non-contiguous operand is rejected without buffering. + # NOTE: This is true even for a reduction, where we return a 0-stride + # below! + with pytest.raises(TypeError, match="Iterator operand required buffering"): + it = np.nditer( + (np.zeros(()),), flags=["external_loop", "reduce_ok"], + op_flags=[(read_or_readwrite, "contig"),], itershape=(10,)) + + +@pytest.mark.parametrize("arr", [ + lambda: np.zeros(()), + lambda: np.zeros((20, 1))[::20], + lambda: np.zeros((1, 20))[:, ::20] + ]) +def test_iter_contig_flag_single_operand_strides(arr): + """ + Tests the strides with the contig flag for both broadcast and non-broadcast + operands in 3 cases where the logic is needed: + 1. When everything has a zero stride, the broadcast op needs to repeated + 2. When the reduce axis is the last axis (first to iterate). + 3. When the reduce axis is the first axis (last to iterate). + + NOTE: The semantics of the cast flag are not clearly defined when + it comes to reduction. It is unclear that there are any users. 
+ """ + first_op = np.ones((10, 10)) + broadcast_op = arr() + red_op = arr() + # Add a first operand to ensure no axis-reordering and the result shape. + iterator = np.nditer( + (first_op, broadcast_op, red_op), + flags=["external_loop", "reduce_ok", "buffered", "delay_bufalloc"], + op_flags=[("readonly", "contig")] * 2 + [("readwrite", "contig")]) + + with iterator: + iterator.reset() + try: + for f, b, r in iterator: + # The first operand is contigouos, we should have a view + assert np.shares_memory(f, first_op) + # Although broadcast, the second op always has a contiguous stride + assert b.strides[0] == 8 + assert not np.shares_memory(b, broadcast_op) + # The reduction has a contiguous stride or a 0 stride + if red_op.ndim == 0 or red_op.shape[-1] == 1: + assert r.strides[0] == 0 + else: + # The stride is 8, although it was not originally: + assert r.strides[0] == 8 + # If the reduce stride is 0, buffering makes no difference, but we + # do it anyway right now: + assert not np.shares_memory(r, red_op) + finally: + iterator.debug_print() + + +@pytest.mark.xfail(reason="The contig flag was always buggy.") +def test_iter_contig_flag_incorrect(): + # This case does the wrong thing... + iterator = np.nditer( + (np.ones((10, 10)).T, np.ones((1, 10))), + flags=["external_loop", "reduce_ok", "buffered", "delay_bufalloc"], + op_flags=[("readonly", "contig")] * 2) + + with iterator: + iterator.reset() + for a, b in iterator: + assert a.strides == 8 + assert b.strides == 8 # should be 8 but is 0 due to axis reorder + + @pytest.mark.slow def test_iter_buffered_reduce_reuse(): # large enough array for all views, including negative strides. @@ -3324,8 +3397,8 @@ def test_debug_print(capfd): | BufIterEnd: 5 | BUFFER CoreSize: 5 | REDUCE Pos: 0 - | BUFFER Reduce outersize: 10 - | BUFFER OuterDim: 1 + | REDUCE OuterSize: 10 + | REDUCE OuterDim: 1 | Strides: 8 4 | Ptrs: | REDUCE Outer Strides: 40 0 From 5bad3576d8fd0a74c3f4b0e6d66568a8f5f262e3 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Thu, 5 Dec 2024 22:04:41 +0100 Subject: [PATCH 17/23] remove debug print (was this part of the sanitizer problem?!) 
--- numpy/_core/tests/test_nditer.py | 33 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/numpy/_core/tests/test_nditer.py b/numpy/_core/tests/test_nditer.py index 90bec417c082..e3a37674e124 100644 --- a/numpy/_core/tests/test_nditer.py +++ b/numpy/_core/tests/test_nditer.py @@ -2358,24 +2358,21 @@ def test_iter_contig_flag_single_operand_strides(arr): with iterator: iterator.reset() - try: - for f, b, r in iterator: - # The first operand is contigouos, we should have a view - assert np.shares_memory(f, first_op) - # Although broadcast, the second op always has a contiguous stride - assert b.strides[0] == 8 - assert not np.shares_memory(b, broadcast_op) - # The reduction has a contiguous stride or a 0 stride - if red_op.ndim == 0 or red_op.shape[-1] == 1: - assert r.strides[0] == 0 - else: - # The stride is 8, although it was not originally: - assert r.strides[0] == 8 - # If the reduce stride is 0, buffering makes no difference, but we - # do it anyway right now: - assert not np.shares_memory(r, red_op) - finally: - iterator.debug_print() + for f, b, r in iterator: + # The first operand is contigouos, we should have a view + assert np.shares_memory(f, first_op) + # Although broadcast, the second op always has a contiguous stride + assert b.strides[0] == 8 + assert not np.shares_memory(b, broadcast_op) + # The reduction has a contiguous stride or a 0 stride + if red_op.ndim == 0 or red_op.shape[-1] == 1: + assert r.strides[0] == 0 + else: + # The stride is 8, although it was not originally: + assert r.strides[0] == 8 + # If the reduce stride is 0, buffering makes no difference, but we + # do it anyway right now: + assert not np.shares_memory(r, red_op) @pytest.mark.xfail(reason="The contig flag was always buggy.") From 67cb5c501066ddd448bdde4d4c186ad7319099c6 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Fri, 6 Dec 2024 12:43:47 +0100 Subject: [PATCH 18/23] TST: See if deleting a, b makes santizer tests pass --- numpy/_core/tests/test_nditer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/numpy/_core/tests/test_nditer.py b/numpy/_core/tests/test_nditer.py index e3a37674e124..527265d4217d 100644 --- a/numpy/_core/tests/test_nditer.py +++ b/numpy/_core/tests/test_nditer.py @@ -2386,8 +2386,10 @@ def test_iter_contig_flag_incorrect(): with iterator: iterator.reset() for a, b in iterator: - assert a.strides == 8 - assert b.strides == 8 # should be 8 but is 0 due to axis reorder + # Remove a and b from locals (pytest may want to format them) + a, b = a.strides, b.strides + assert a == 8 + assert b == 8 # should be 8 but is 0 due to axis reorder @pytest.mark.slow From 5508cc5c2633321880b9983ad3aed636e9c18ef3 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 10 Dec 2024 14:14:50 +0100 Subject: [PATCH 19/23] MAINT: Move buffersize init (new code always limits it to itersize) --- numpy/_core/src/multiarray/nditer_constr.c | 39 ++++++++-------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/numpy/_core/src/multiarray/nditer_constr.c b/numpy/_core/src/multiarray/nditer_constr.c index 260fb68a4fa0..1a87c53aa523 100644 --- a/numpy/_core/src/multiarray/nditer_constr.c +++ b/numpy/_core/src/multiarray/nditer_constr.c @@ -100,7 +100,7 @@ static int npyiter_allocate_transfer_functions(NpyIter *iter); static void -npyiter_find_buffering_setup(NpyIter *iter); +npyiter_find_buffering_setup(NpyIter *iter, npy_intp buffersize); /*NUMPY_API * Allocate a new iterator for multiple array objects, and 
advanced @@ -255,28 +255,6 @@ NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags, NPY_IT_TIME_POINT(c_fill_axisdata); - if (itflags & NPY_ITFLAG_BUFFER) { - /* - * If buffering is enabled and no buffersize was given, use a default - * chosen to be big enough to get some amortization benefits, but - * small enough to be cache-friendly. - */ - if (buffersize <= 0) { - buffersize = NPY_BUFSIZE; - } - /* No point in a buffer bigger than the iteration size */ - if (buffersize > NIT_ITERSIZE(iter)) { - buffersize = NIT_ITERSIZE(iter); - } - NBF_BUFFERSIZE(bufferdata) = buffersize; - - /* - * Initialize for use in FirstVisit, which may be called before - * the buffers are filled and the reduce pos is updated. - */ - NBF_REDUCE_POS(bufferdata) = 0; - } - /* * If an index was requested, compute the strides for it. * Note that we must do this before changing the order of the @@ -476,7 +454,13 @@ NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags, /* If buffering is set prepare it */ if (itflags & NPY_ITFLAG_BUFFER) { - npyiter_find_buffering_setup(iter); + npyiter_find_buffering_setup(iter, buffersize); + + /* + * Initialize for use in FirstVisit, which may be called before + * the buffers are filled and the reduce pos is updated. + */ + NBF_REDUCE_POS(bufferdata) = 0; if (!npyiter_allocate_transfer_functions(iter)) { NpyIter_Deallocate(iter); @@ -1994,7 +1978,7 @@ operand_different_than_broadcast: { * not try to discover this. */ static void -npyiter_find_buffering_setup(NpyIter *iter) +npyiter_find_buffering_setup(NpyIter *iter, npy_intp buffersize) { int nop = iter->nop; int ndim = iter->ndim; @@ -2021,8 +2005,11 @@ npyiter_find_buffering_setup(NpyIter *iter) * We can only continue as long as we are within the maximum allowed size. * When no buffering is needed and GROWINNER is set, we don't have to * worry about this maximum. + * + * If the user passed no buffersize, default to one small enough that it + * should be cache friendly and big enough to amortize overheads. */ - npy_intp maximum_size = NBF_BUFFERSIZE(bufferdata); + npy_intp maximum_size = buffersize <= 0 ? NPY_BUFSIZE : buffersize; /* The cost factor defined by: (1 + n_buffered) */ int cost = 1; From be6749dcad0ea99cbc59707596666a57a0245830 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Thu, 12 Dec 2024 14:14:25 +0100 Subject: [PATCH 20/23] DOC: Explain buffer-setup cost more (maybe should rethink it...) --- numpy/_core/src/multiarray/nditer_constr.c | 51 ++++++++++++---------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/numpy/_core/src/multiarray/nditer_constr.c b/numpy/_core/src/multiarray/nditer_constr.c index 1a87c53aa523..91ef0fbf5513 100644 --- a/numpy/_core/src/multiarray/nditer_constr.c +++ b/numpy/_core/src/multiarray/nditer_constr.c @@ -1927,55 +1927,60 @@ operand_different_than_broadcast: { /* * At this point we (presumably) use a buffered iterator and here we want * to find out the best way to buffer the iterator in a fashion that we don't - * have to figure out a lot of things on every iteration. + * have to figure out a lot of things on every outer iteration. * * How do we iterate? * ------------------ * There are currently two modes of "buffered" iteration: * 1. The normal mode, where we either buffer each operand or not and - * then do a normal 1-D loop on those buffers (or operands). + * then do a 1-D loop on those buffers (or operands). * 2. The "reduce" mode. 
In reduce mode (ITFLAG_REDUCE) we internally use a - * a double iteration where: + * a double iteration where for "reduce" operands we have: * - One outer iteration with stride == 0 and a core with at least one - * stride != 0 (all of them if this is a reduce operand). + * stride != 0 (all of them if this is a true reduce/writeable operand). * - One outer iteration with stride != 0 and a core of all strides == 0. * This setup allows filling the buffer with only the stride != 0 and then - * doing the double loop on the buffer (either ). + * doing the double loop. * * Only a writeable (reduce) operand require this reduce mode because for - * reading it is OK if the buffer holds duplicates. - * The reason for the reduce mode is that it allows for a larger core size. - * If we use the reduce-mode, we can apply it also to read-only operands. + * reading it is OK if the buffer holds duplicates elements. + * The goal of the reduce mode is that it allows for larger core sizes and + * buffers since the zero strides do not allow a single 1-d iteration. + * If do we use the reduce-mode, we can apply it also to read-only operands + * as an optimization. * * The functino here finds an outer dimension and it's "core" to use that * works with reductions. - * While iterating, we will buffer while making sure that: + * While iterating, we will fill the buffers making sure that we: * - Never buffer beyond the outer dimension (optimize chance of re-use). * - Never buffer beyond the end of the core (all but outer dimension) - * if the user manually set the iterator to the middle of the core. + * if the user manually moved the iterator to the middle of the core. + * (Not typical and if, should usually happen once.) * - * These two things mean that the buffer always looks the same, although it - * may be smaller and it also means that we can optimize buffer copies for - * the inner-sizes. - * (In the original version, the buffersize was always fully used for non - * reductions, which meant a lot of work on each buffer copy, less buffer - * re-use, and no fixed-strides into the buffer.) + * This means that the data stored in the buffer has always the same structure + * (except when manually moved). And we can simplify optimize the buffer + * filling in some cases and it is easy to decide whether a buffer content + * may be re-usable (this can happen with broadcasted operands). * * Best buffer and core size * ------------------------- * We don't want to figure out the complicated copies every time we fill buffers - * so we want to find a the outer iteration dimension so that: - * - Its core size is smaller than the buffersize if buffering is needed. + * so here we want to find a the outer iteration dimension so that: + * - Its core size is <= the maximum buffersize if buffering is needed. * - Allows for reductions to work (with or without reduce-mode) * - Optimizes for iteration overhead. We estimate the total overhead with: * `(1 + n_buffers) / size`. * The `size` is the `min(core_size * outer_dim_size, buffersize)` and - * estimates how often we fill buffers (size is unbounded if no buffering - * is needed). - * NOTE: We could tweak this, it is not optimized/proven to be best. + * estimates how often we fill buffers (unbounded if not buffering). * - * In theory, the reduction axis could also span multiple axes, but we do - * not try to discover this. + * TODO: Probably should tweak or simplify? The formula is clearly not + * the actual cost (Buffers add a constant total cost as well). 
+ * Right now, it mostly rejects growing the core size when we are already + * close to the maximum buffersize (even overhead wise not worth it). + * That may be good enough, but maybe it can be spelled simpler? + * + * In theory, the reduction could also span multiple axes if other operands + * are buffered. We do not try to discover this. */ static void npyiter_find_buffering_setup(NpyIter *iter, npy_intp buffersize) From 426934d2a7a303c7b9bfba38d1b18649e54c06a5 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Sun, 15 Dec 2024 18:04:32 +0100 Subject: [PATCH 21/23] DOC: Expand/change iteration explanation based on Marten's review --- numpy/_core/src/multiarray/nditer_constr.c | 78 ++++++++++++++++------ 1 file changed, 57 insertions(+), 21 deletions(-) diff --git a/numpy/_core/src/multiarray/nditer_constr.c b/numpy/_core/src/multiarray/nditer_constr.c index 91ef0fbf5513..05ae60165297 100644 --- a/numpy/_core/src/multiarray/nditer_constr.c +++ b/numpy/_core/src/multiarray/nditer_constr.c @@ -1941,37 +1941,73 @@ operand_different_than_broadcast: { * - One outer iteration with stride != 0 and a core of all strides == 0. * This setup allows filling the buffer with only the stride != 0 and then * doing the double loop. + * An Example for these two cases is: + * arr = np.ones((100, 10, 10))[::2, :, :] + * arr.sum(-1) + * arr.sum(-2) + * Where the slice prevents the iterator from collapsing axes and the + * result has stride 0 either along the last or the second to last axis. + * In both cases we can buffer 10x10 elements in reduce mode. + * (This iteration needs no buffer, add a cast to ensure actual buffering.) * * Only a writeable (reduce) operand require this reduce mode because for - * reading it is OK if the buffer holds duplicates elements. - * The goal of the reduce mode is that it allows for larger core sizes and + * reading it is OK if the buffer holds duplicated elements. + * The benefit of the reduce mode is that it allows for larger core sizes and * buffers since the zero strides do not allow a single 1-d iteration. - * If do we use the reduce-mode, we can apply it also to read-only operands - * as an optimization. + * If we use reduce-mode, we can apply it also to read-only operands as an + * optimization. * - * The functino here finds an outer dimension and it's "core" to use that - * works with reductions. + * The function here finds the first "outer" dimension and it's "core" to use + * that works with reductions. * While iterating, we will fill the buffers making sure that we: - * - Never buffer beyond the outer dimension (optimize chance of re-use). - * - Never buffer beyond the end of the core (all but outer dimension) - * if the user manually moved the iterator to the middle of the core. - * (Not typical and if, should usually happen once.) + * - Never buffer beyond the first outer dimension (optimize chance of re-use). + * - If the iterator is manually set to an offset into what is part of the + * core (see second example below), then we only fill the buffer to finish + * that one core. This re-aligns us with the core and is necessary for + * reductions. (Such manual setting should be rare or happens exactly once + * for splitting the iteration into worker chunks.) + * + * And examples for these two constraints: + * Given the iteration shape is (100, 10, 10) and the core size 10 with a + * buffer size of 60 (due to limits), making dimension 1 the "outer" one. 
+ * The first iterations/buffers would then range (excluding end-point): + * - (0, 0, 0) -> (0, 6, 0) + * - (0, 6, 0) -> (1, 0, 0) # Buffer only holds 40 of 60 possible elements. + * - (1, 0, 0) -> (1, 6, 0) + * - ... + * If the user limits to a range starting from 75, we use: + * - (0, 7, 5) -> (0, 8, 0) # Only 5 elements to re-align with core. + * - (0, 8, 0) -> (1, 0, 0) + * - ... # continue as above * * This means that the data stored in the buffer has always the same structure - * (except when manually moved). And we can simplify optimize the buffer - * filling in some cases and it is easy to decide whether a buffer content - * may be re-usable (this can happen with broadcasted operands). + * (except when manually moved), which allows us to fill the buffer more simply + * and optimally in some cases, and makes it easier to determine whether buffer + * content is re-usable (e.g., because it represents broadcasted operands). * * Best buffer and core size * ------------------------- - * We don't want to figure out the complicated copies every time we fill buffers - * so here we want to find a the outer iteration dimension so that: - * - Its core size is <= the maximum buffersize if buffering is needed. - * - Allows for reductions to work (with or without reduce-mode) - * - Optimizes for iteration overhead. We estimate the total overhead with: - * `(1 + n_buffers) / size`. - * The `size` is the `min(core_size * outer_dim_size, buffersize)` and - * estimates how often we fill buffers (unbounded if not buffering). + * To avoid having to figure out what to copy every time we fill buffers, + * we here want to find the outer iteration dimension such that: + * - Its core size is <= the maximum buffersize if buffering is needed; + * - Reductions are possible (with or without reduce mode); + * - Iteration overhead is minimized. We estimate the total overhead with + * the number "outer" iterations: + * + * N_o = full_iterator_size / min(core_size * outer_dim_size, buffersize) + * + * This is approximately how often `iternext()` is called when the user + * is using an external-loop and how often we would fill buffers. + * The total overhead is then estimated as: + * + * (1 + n_buffers) * N_o + * + * Since the iterator size is a constant, we can estimate the overhead as: + * + * (1 + n_buffers) / min(core_size * outer_dim_size, buffersize) + * + * And when comparing two options multiply by the others divisor/size to + * avoid the division. * * TODO: Probably should tweak or simplify? The formula is clearly not * the actual cost (Buffers add a constant total cost as well). 
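As a rough worked example of the overhead estimate described above (the helper
and the choice of which operands need buffering are purely illustrative, not
part of the patch or of NumPy; the shape and buffersize follow the example in
the comment):

    # Hypothetical comparison of two candidate setups for a (100, 10, 10)
    # iteration with a 60-element buffersize limit.
    itersize = 100 * 10 * 10
    buffersize = 60

    def estimated_overhead(n_buffers, core_size, outer_dim_size):
        # n_outer: roughly how often buffers are filled / iternext() is called
        n_outer = itersize / min(core_size * outer_dim_size, buffersize)
        return (1 + n_buffers) * n_outer

    # Small core that leaves one operand unbuffered:
    estimated_overhead(n_buffers=1, core_size=10, outer_dim_size=10)   # ~333
    # Larger core that forces a second operand into a buffer; the usable size
    # is already capped by the buffersize, so the extra buffer only adds cost
    # and this candidate loses the comparison:
    estimated_overhead(n_buffers=2, core_size=50, outer_dim_size=2)    # 500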
From 1836fd14eba0e0c6466c884c5c8f6ab41392fda4 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Sun, 15 Dec 2024 22:11:00 +0100 Subject: [PATCH 22/23] MAINT: More review comments (mostly code clarifications/simplifications) --- numpy/_core/src/multiarray/nditer_api.c | 2 +- numpy/_core/src/multiarray/nditer_constr.c | 85 ++++++++++------------ numpy/_core/src/multiarray/nditer_impl.h | 4 +- 3 files changed, 42 insertions(+), 49 deletions(-) diff --git a/numpy/_core/src/multiarray/nditer_api.c b/numpy/_core/src/multiarray/nditer_api.c index b3d21e8d35c3..5aa160cdcb9a 100644 --- a/numpy/_core/src/multiarray/nditer_api.c +++ b/numpy/_core/src/multiarray/nditer_api.c @@ -2185,7 +2185,7 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs) && op_itflags[iop]&NPY_OP_ITFLAG_REDUCE && NAD_STRIDES(outer_axisdata)[iop] == 0)) { /* - * If we have a fully copy or a reduce with 0 stride outer and + * If we have a full copy or a reduce with 0 stride outer and * a copy larger than the coresize, this is now re-usable. * NB: With a core-offset, we always copy less than the core-size. */ diff --git a/numpy/_core/src/multiarray/nditer_constr.c b/numpy/_core/src/multiarray/nditer_constr.c index 05ae60165297..0ad5905595f7 100644 --- a/numpy/_core/src/multiarray/nditer_constr.c +++ b/numpy/_core/src/multiarray/nditer_constr.c @@ -2027,8 +2027,9 @@ npyiter_find_buffering_setup(NpyIter *iter, npy_intp buffersize) NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter); /* - * We check two things here, first whether the core is single strided - * and second, whether we found a reduce stride dimension for the operand. + * We check two things here, first how many operand dimensions can be + * iterated using a single stride (all dimensions are consistent), + * and second, whether we found a reduce dimension for the operand. * That is an outer dimension a reduce would have to take place on. */ int op_single_stride_dims[NPY_MAXARGS]; @@ -2094,19 +2095,17 @@ npyiter_find_buffering_setup(NpyIter *iter, npy_intp buffersize) NIT_ADVANCE_AXISDATA(axisdata, 1); npy_intp *strides = NAD_STRIDES(axisdata); - int iop; - for (iop = 0; iop < nop; iop++) { + for (int iop = 0; iop < nop; iop++) { /* Check that we set things up nicely (if shape is ever 1) */ assert((axisdata->shape == 1) ? (prev_strides[iop] == strides[iop]) : 1); - /* Best case: the strides collapse for this operand. */ - if (prev_strides[iop] * prev_shape == strides[iop]) { - if (op_single_stride_dims[iop] == idim) { + + if (op_single_stride_dims[iop] == idim) { + /* Best case: the strides still collapse for this operand. */ + if (prev_strides[iop] * prev_shape == strides[iop]) { op_single_stride_dims[iop] += 1; + continue; } - continue; - } - if (op_single_stride_dims[iop] == idim) { /* * Operand now requires buffering (if it was not already). * NOTE: This is technically not true since we may still use @@ -2120,28 +2119,19 @@ npyiter_find_buffering_setup(NpyIter *iter, npy_intp buffersize) } /* - * If this operand is a reduction operand and this is not the - * outer_reduce_dim, then we must stop. + * If this operand is a reduction operand and the stride swapped + * between !=0 and ==0 then this is the `outer_reduce_dim` and + * we will never continue further (see break at start of op loop). 
*/ - if (op_itflags[iop] & NPY_OP_ITFLAG_REDUCE) { - /* - * We swap between !=0/==0 and thus make it a reduce - * (it is OK if another op started a reduce at this dimension) - */ - if (strides[iop] == 0 || prev_strides[iop] == 0) { - if (outer_reduce_dim == 0 || outer_reduce_dim == idim) { - op_reduce_outer_dim[iop] = idim; - outer_reduce_dim = idim; - } - } + if ((op_itflags[iop] & NPY_OP_ITFLAG_REDUCE) + && (strides[iop] == 0 || prev_strides[iop] == 0)) { + assert(outer_reduce_dim == 0 || outer_reduce_dim == idim); + op_reduce_outer_dim[iop] = idim; + outer_reduce_dim = idim; } /* For clarity: op_reduce_outer_dim[iop] if set always matches. */ assert(!op_reduce_outer_dim[iop] || op_reduce_outer_dim[iop] == outer_reduce_dim); } - if (iop != nop) { - /* Including this dimension was invalid due to a reduction. */ - break; - } npy_intp coresize = size; /* if we iterate here, this is the core */ size *= axisdata->shape; @@ -2209,7 +2199,7 @@ npyiter_find_buffering_setup(NpyIter *iter, npy_intp buffersize) /* If contig was requested and this is not writeable avoid zero strides */ npy_bool avoid_zero_strides = ( - (op_itflags[iop] & NPY_OP_ITFLAG_CONTIG) != 0 + (op_itflags[iop] & NPY_OP_ITFLAG_CONTIG) && !(op_itflags[iop] & NPY_OP_ITFLAG_WRITE)); /* @@ -2267,26 +2257,29 @@ npyiter_find_buffering_setup(NpyIter *iter, npy_intp buffersize) * reduce logic. In that case, either the inner or outer stride * is 0. */ - if (!is_reduce_op - && NIT_OPITFLAGS(iter)[iop] & NPY_OP_ITFLAG_BUF_SINGLESTRIDE - && NAD_STRIDES(axisdata)[iop] == 0 - && !avoid_zero_strides) { - /* This op is always 0 strides, so even the buffer is that. */ - inner_stride = 0; - reduce_outer_stride = 0; - } - else if (!is_reduce_op) { - /* normal buffered op */ - inner_stride = itemsize; - reduce_outer_stride = itemsize * best_coresize; - } - else if (NAD_STRIDES(reduce_axisdata)[iop] == 0) { - inner_stride = itemsize; - reduce_outer_stride = 0; + if (is_reduce_op) { + if (NAD_STRIDES(reduce_axisdata)[iop] == 0) { + inner_stride = itemsize; + reduce_outer_stride = 0; + } + else { + inner_stride = 0; + reduce_outer_stride = itemsize; + } } else { - inner_stride = 0; - reduce_outer_stride = itemsize; + if (NIT_OPITFLAGS(iter)[iop] & NPY_OP_ITFLAG_BUF_SINGLESTRIDE + && NAD_STRIDES(axisdata)[iop] == 0 + && !avoid_zero_strides) { + /* This op is always 0 strides, so even the buffer is that. */ + inner_stride = 0; + reduce_outer_stride = 0; + } + else { + /* normal buffered op */ + inner_stride = itemsize; + reduce_outer_stride = itemsize * best_coresize; + } } } else { diff --git a/numpy/_core/src/multiarray/nditer_impl.h b/numpy/_core/src/multiarray/nditer_impl.h index 42610d8efd30..c8ac9e4fcce4 100644 --- a/numpy/_core/src/multiarray/nditer_impl.h +++ b/numpy/_core/src/multiarray/nditer_impl.h @@ -131,7 +131,7 @@ /* The operand requires masking when copying buffer -> array */ #define NPY_OP_ITFLAG_WRITEMASKED 0x0080 /* - * Whether the buffer is *fully* filled and thus ready for re-usable. + * Whether the buffer is *fully* filled and thus ready for reuse. 
 * (Must check if the start pointer matches until copy-from-buffer checks)
 */
#define NPY_OP_ITFLAG_BUF_REUSABLE 0x0100
@@ -139,7 +139,7 @@
 #define NPY_OP_ITFLAG_FORCECOPY 0x0200
 /* The operand has temporary data, write it back at dealloc */
 #define NPY_OP_ITFLAG_HAS_WRITEBACK 0x0400
-/* Whether the user request a contiguous operand */
+/* Whether the user requested a contiguous operand */
 #define NPY_OP_ITFLAG_CONTIG 0x0800
 /*

From e75898373487a95131396c966262096666161930 Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Mon, 16 Dec 2024 21:49:40 +0100
Subject: [PATCH 23/23] MAINT: Smaller changes based on review

---
 numpy/_core/src/multiarray/nditer_api.c    | 8 ++++----
 numpy/_core/src/multiarray/nditer_constr.c | 7 ++-----
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/numpy/_core/src/multiarray/nditer_api.c b/numpy/_core/src/multiarray/nditer_api.c
index 5aa160cdcb9a..344b33254980 100644
--- a/numpy/_core/src/multiarray/nditer_api.c
+++ b/numpy/_core/src/multiarray/nditer_api.c
@@ -1940,7 +1940,7 @@ npyiter_copy_from_buffers(NpyIter *iter)
     PyArray_Descr **dtypes = NIT_DTYPES(iter);

     npy_intp *strides = NBF_STRIDES(bufferdata);
-    npy_intp transfersize = bufferdata->size;
+    npy_intp transfersize = NBF_SIZE(bufferdata);

     char **ad_ptrs = NAD_PTRS(axisdata);
     char **buffers = NBF_BUFFERS(bufferdata);
@@ -1956,7 +1956,7 @@ npyiter_copy_from_buffers(NpyIter *iter)

     if (itflags & NPY_ITFLAG_REDUCE) {
         npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
-        outer_axisdata = NIT_INDEX_AXISDATA(axisdata, bufferdata->outerdim);
+        outer_axisdata = NIT_INDEX_AXISDATA(axisdata, NBF_OUTERDIM(bufferdata));
         transfersize *= NBF_REDUCE_OUTERSIZE(bufferdata);
     }

@@ -2082,8 +2082,8 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs)
     npyiter_opitflags *op_itflags = NIT_OPITFLAGS(iter);

     NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
-    NpyIter_AxisData *axisdata = NIT_AXISDATA(iter),
-                     *outer_axisdata = NULL;
+    NpyIter_AxisData *axisdata = NIT_AXISDATA(iter);
+    NpyIter_AxisData *outer_axisdata = NULL;

     PyArrayObject **operands = NIT_OPERANDS(iter);

diff --git a/numpy/_core/src/multiarray/nditer_constr.c b/numpy/_core/src/multiarray/nditer_constr.c
index 0ad5905595f7..41060f6b206e 100644
--- a/numpy/_core/src/multiarray/nditer_constr.c
+++ b/numpy/_core/src/multiarray/nditer_constr.c
@@ -2035,10 +2035,6 @@ npyiter_find_buffering_setup(NpyIter *iter, npy_intp buffersize)
     int op_single_stride_dims[NPY_MAXARGS];
     int op_reduce_outer_dim[NPY_MAXARGS];

-    /*
-     * Note that this code requires/uses the fact that there is always one
-     * axisdata even for ndim == 0 (i.e. no need to worry about it).
-     */
     npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
     NpyIter_AxisData *axisdata = NIT_AXISDATA(iter);
     npyiter_opitflags *op_itflags = NIT_OPITFLAGS(iter);
@@ -2072,6 +2068,7 @@ npyiter_find_buffering_setup(NpyIter *iter, npy_intp buffersize)

     npy_intp size = axisdata->shape;  /* the current total size */

+    /* Note that there is always one axisdata that we use (even with ndim == 0) */
     int best_dim = 0;
     int best_cost = cost;
     /* The size of the "outer" iteration and all previous dimensions: */
@@ -2329,7 +2326,7 @@ npyiter_find_buffering_setup(NpyIter *iter, npy_intp buffersize)
         }
     }
     NIT_BUFFERDATA(iter)->buffersize = best_size;
-    /* Core size is 0 (unless the user applies a range explicitly). */
+    /* Core offset starts at 0 initially; if needed it is set in npyiter_goto_iterindex. */
     NIT_BUFFERDATA(iter)->coreoffset = 0;

     return;