diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md
index a3563925..8b80f1c9 100644
--- a/.claude/CLAUDE.md
+++ b/.claude/CLAUDE.md
@@ -44,9 +44,44 @@ np Static API class (like `import numpy as np`)
| Decision | Rationale |
|----------|-----------|
| Unmanaged memory | Benchmarked fastest ~5y ago; Span/Memory immature then |
+| C-order only | Row-major (C-order) layout only, tracked via the `ArrayFlags.C_CONTIGUOUS` flag. No F-order/column-major support; the `order` parameter on `ravel`, `flatten`, `copy`, `reshape` is accepted but ignored. |
| Regen templating | ~200K lines generated for type-specific code |
| TensorEngine abstract | Future GPU/SIMD backends possible |
| View semantics | Slicing returns views (shared memory), not copies |
+| Shape readonly struct | Immutable after construction (NumPy-aligned). Contains `ArrayFlags` for cached O(1) property access |
+| Broadcast write protection | Broadcast views are read-only (`IsWriteable = false`), matching NumPy behavior |
+
+## Shape Architecture (NumPy-Aligned)
+
+Shape is a `readonly struct` with cached `ArrayFlags` computed at construction:
+
+```csharp
+public readonly partial struct Shape
+{
+ internal readonly int[] dimensions; // Dimension sizes
+ internal readonly int[] strides; // Stride values (0 = broadcast dimension)
+ internal readonly int offset; // Base offset into storage
+ internal readonly int bufferSize; // Size of underlying buffer
+ internal readonly int _flags; // Cached ArrayFlags bitmask
+}
+```
+
+**ArrayFlags enum** (matches NumPy's `ndarraytypes.h`):
+| Flag | Value | Meaning |
+|------|-------|---------|
+| `C_CONTIGUOUS` | 0x0001 | Data is row-major contiguous |
+| `F_CONTIGUOUS` | 0x0002 | Reserved (always false for NumSharp) |
+| `OWNDATA` | 0x0004 | Array owns its data buffer |
+| `ALIGNED` | 0x0100 | Always true for managed allocations |
+| `WRITEABLE` | 0x0400 | False for broadcast views |
+| `BROADCASTED` | 0x1000 | Has stride=0 with dim > 1 |
+
+**Key Shape properties:**
+- `IsContiguous` — O(1) check via `C_CONTIGUOUS` flag
+- `IsBroadcasted` — O(1) check via `BROADCASTED` flag
+- `IsWriteable` — False for broadcast views (prevents corruption)
+- `IsSliced` — True if offset != 0, different size, or non-contiguous
+- `IsSimpleSlice` — IsSliced && !IsBroadcasted (fast offset path)
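+
+A minimal usage sketch of these flags and properties (assumes the public `Shape` property on `NDArray` and `np.broadcast_to`, both referenced elsewhere in this file):
+
+```csharp
+var a = np.arange(12).reshape(3, 4);                        // owns its data, C-contiguous
+var bcast = np.broadcast_to(np.arange(4), new Shape(3, 4)); // stride-0 broadcast view
+
+Console.WriteLine(a.Shape.IsContiguous);      // True  (C_CONTIGUOUS flag set)
+Console.WriteLine(bcast.Shape.IsBroadcasted); // True  (stride=0 on a dim > 1)
+Console.WriteLine(bcast.Shape.IsWriteable);   // False (broadcast views are read-only)
+```
+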
## Critical: View Semantics
@@ -109,9 +144,12 @@ nd["..., -1"] // Ellipsis fills dimensions
### Math Functions (`Math/`)
| Function | File |
|----------|------|
+| `np.add`, `np.subtract`, `np.multiply`, `np.divide` | `np.math.cs` |
+| `np.mod`, `np.true_divide` | `np.math.cs` |
+| `np.positive`, `np.negative`, `np.convolve` | `np.math.cs` |
| `np.sum` | `np.sum.cs` |
-| `np.prod` | `NDArray.prod.cs` |
-| `np.cumsum` | `NDArray.cumsum.cs` |
+| `np.prod`, `nd.prod()` | `np.math.cs`, `NDArray.prod.cs` |
+| `np.cumsum`, `nd.cumsum()` | `APIs/np.cumsum.cs`, `Math/NDArray.cumsum.cs` |
| `np.power` | `np.power.cs` |
| `np.sqrt` | `np.sqrt.cs` |
| `np.abs`, `np.absolute` | `np.absolute.cs` |
@@ -131,9 +169,9 @@ nd["..., -1"] // Ellipsis fills dimensions
| `np.mean`, `nd.mean()` | `np.mean.cs`, `NDArray.mean.cs` |
| `np.std`, `nd.std()` | `np.std.cs`, `NDArray.std.cs` |
| `np.var`, `nd.var()` | `np.var.cs`, `NDArray.var.cs` |
-| `np.amax`, `nd.amax()` | `Sorting/np.amax.cs`, `NDArray.amax.cs` |
-| `np.amin`, `nd.amin()` | `Sorting/np.min.cs`, `NDArray.amin.cs` |
-| `np.argmax`, `nd.argmax()` | `Sorting/np.argmax.cs`, `NDArray.argmax.cs` |
+| `np.amax`, `nd.amax()` | `Sorting_Searching_Counting/np.amax.cs`, `NDArray.amax.cs` |
+| `np.amin`, `nd.amin()` | `Sorting_Searching_Counting/np.min.cs`, `NDArray.amin.cs` |
+| `np.argmax`, `nd.argmax()` | `Sorting_Searching_Counting/np.argmax.cs`, `NDArray.argmax.cs` |
| `np.argmin`, `nd.argmin()` | `Sorting_Searching_Counting/np.argmax.cs`, `NDArray.argmin.cs` |
### Sorting & Searching (`Sorting_Searching_Counting/`)
@@ -168,7 +206,7 @@ nd["..., -1"] // Ellipsis fills dimensions
| `np.swapaxes` | `np.swapaxes.cs`, `NdArray.swapaxes.cs` |
| `np.moveaxis` | `np.moveaxis.cs` |
| `np.rollaxis` | `np.rollaxis.cs` |
-| `nd.roll()` | `NDArray.roll.cs` | Partial: only Int32/Single/Double with axis; no-axis returns null |
+| `np.roll`, `nd.roll()` | `np.roll.cs`, `NDArray.roll.cs` | Fully implemented (all dtypes, with/without axis) |
| `np.atleast_1d/2d/3d` | `np.atleastd.cs` |
| `np.unique`, `nd.unique()` | `np.unique.cs`, `NDArray.unique.cs` |
| `np.repeat` | `np.repeat.cs` |
@@ -244,7 +282,6 @@ nd["..., -1"] // Ellipsis fills dimensions
| Function | File |
|----------|------|
| `np.size` | `np.size.cs` |
-| `np.cumsum` | `np.cumsum.cs` |
---
@@ -334,22 +371,97 @@ Create issues on `SciSharp/NumSharp` via `gh issue create`. `GH_TOKEN` is availa
## Build & Test
```bash
+# Build (silent, errors only)
dotnet build -v q --nologo "-clp:NoSummary;ErrorsOnly" -p:WarningLevel=0
-dotnet test -v q --nologo "-clp:ErrorsOnly" test/NumSharp.UnitTest/NumSharp.UnitTest.csproj
+```
+
+### Running Tests
+
+Tests use **TUnit** framework with source-generated test discovery.
+
+```bash
+# Run from test directory
+cd test/NumSharp.UnitTest
+
+# All tests (includes OpenBugs - expected failures)
+dotnet test --no-build
+
+# Exclude OpenBugs (CI-style - only real failures)
+dotnet test --no-build -- --treenode-filter "/*/*/*/*[Category!=OpenBugs]"
+
+# Run ONLY OpenBugs tests
+dotnet test --no-build -- --treenode-filter "/*/*/*/*[Category=OpenBugs]"
+```
+
+### Output Formatting
+
+```bash
+# Results only (no messages, no stack traces)
+dotnet test --no-build 2>&1 | grep -E "^(failed|skipped|Test run| total:| failed:| succeeded:| skipped:| duration:)"
+
+# Results with messages (no stack traces)
+dotnet test --no-build 2>&1 | grep -v "^ at " | grep -v "^ at " | grep -v "^ ---" | grep -v "^ from K:" | sed 's/TUnit.Engine.Exceptions.TestFailedException: //' | sed 's/AssertFailedException: //'
+
+# Detailed output (shows passed tests too)
+dotnet test --no-build -- --output Detailed
```
## Test Categories
-Tests are filtered by `[TestCategory]` attributes. Adding new bug reproductions or platform-specific tests only requires the right attribute — no CI workflow changes.
+Tests use typed category attributes defined in `TestCategory.cs`. Adding new bug reproductions or platform-specific tests only requires the right attribute — no CI workflow changes.
+
+| Category | Attribute | Purpose | CI Behavior |
+|----------|-----------|---------|-------------|
+| `OpenBugs` | `[OpenBugs]` | Known-failing bug reproductions. Remove when fixed. | **EXCLUDED** via filter |
+| `Misaligned` | `[Misaligned]` | Documents NumSharp vs NumPy behavioral differences. | Runs (tests pass) |
+| `WindowsOnly` | `[WindowsOnly]` | Requires GDI+/System.Drawing.Common | Runtime platform check |
+
+### How CI Excludes OpenBugs
+
+The CI pipeline (`.github/workflows/build-and-release.yml`) uses TUnit's `--treenode-filter` to exclude `OpenBugs`:
-| Category | Purpose | CI filter |
-|----------|---------|-----------|
-| `OpenBugs` | Known-failing bug reproductions. Remove category when fixed. | `TestCategory!=OpenBugs` (all platforms) |
-| `WindowsOnly` | Requires GDI+/System.Drawing.Common | `TestCategory!=WindowsOnly` (Linux/macOS) |
+```yaml
+env:
+ TEST_FILTER: '/*/*/*/*[Category!=OpenBugs]'
-Apply at class level (`[TestClass][TestCategory("OpenBugs")]`) or individual method level (`[TestMethod][TestCategory("OpenBugs")]`).
+- name: Test
+ run: dotnet run ... -- --treenode-filter "${{ env.TEST_FILTER }}"
+```
+
+This filter excludes all tests with `[OpenBugs]` or `[Category("OpenBugs")]` from CI runs. Tests pass locally when the bug is fixed — then remove the `[OpenBugs]` attribute.
+
+### Usage
-**OpenBugs files**: `OpenBugs.cs` (broadcast bugs), `OpenBugs.Bitmap.cs` (bitmap bugs). When a bug is fixed, the test starts passing — remove the `OpenBugs` category and move to a permanent test class.
+```csharp
+// Class-level (all tests in class)
+[OpenBugs]
+public class BroadcastBugTests { ... }
+
+// Method-level
+[Test]
+[OpenBugs]
+public async Task BroadcastWriteCorruptsData() { ... }
+
+// Documenting behavioral differences (NOT excluded from CI)
+[Test]
+[Misaligned]
+public void BroadcastSlice_MaterializesInNumSharp() { ... }
+```
+
+### Local Filtering
+
+```bash
+# Exclude OpenBugs (same as CI)
+dotnet test -- --treenode-filter "/*/*/*/*[Category!=OpenBugs]"
+
+# Run ONLY OpenBugs tests (to verify fixes)
+dotnet test -- --treenode-filter "/*/*/*/*[Category=OpenBugs]"
+
+# Run ONLY Misaligned tests
+dotnet test -- --treenode-filter "/*/*/*/*[Category=Misaligned]"
+```
+
+**OpenBugs files**: `OpenBugs.cs` (general bugs), `OpenBugs.Bitmap.cs` (bitmap bugs), `OpenBugs.ApiAudit.cs` (API audit failures).
## CI Pipeline
@@ -387,9 +499,14 @@ NumSharp uses unsafe in many places, hence include `#:property AllowUnsafeBlocks
|--------|----------------|
| `shape.dimensions` | Raw int[] of dimension sizes |
| `shape.strides` | Raw int[] of stride values |
-| `shape.size` | Total element count |
-| `shape.ViewInfo` | Slice/view metadata (null if not a view) |
-| `shape.BroadcastInfo` | Broadcast metadata (null if not broadcast) |
+| `shape.size` | Internal field: total element count |
+| `shape.offset` | Base offset into storage (NumPy-aligned) |
+| `shape.bufferSize` | Size of underlying buffer |
+| `shape._flags` | Cached ArrayFlags bitmask |
+| `shape.IsWriteable` | False for broadcast views (NumPy behavior) |
+| `shape.IsBroadcasted` | Has any stride=0 with dimension > 1 |
+| `shape.IsSimpleSlice` | IsSliced && !IsBroadcasted |
+| `shape.OriginalSize` | Product of non-broadcast dimensions |
| `arr.Storage` | Underlying `UnmanagedStorage` |
| `arr.GetTypeCode` | `NPTypeCode` of the array |
| `arr.Array` | `IArraySlice` — raw data access |
@@ -399,7 +516,18 @@ NumSharp uses unsafe in many places, hence include `#:property AllowUnsafeBlocks
| `NPTypeCode.GetPriority()` | Type priority for promotion |
| `NPTypeCode.AsNumpyDtypeName()` | NumPy dtype name (e.g. "int32") |
| `Shape.NewScalar()` | Create scalar shapes |
-| `Shape.ComputeHashcode()` | Recalculate shape hash |
+
+### Common Public NDArray Properties
+
+| Property | Description |
+|----------|-------------|
+| `nd.shape` | Dimensions as `int[]` |
+| `nd.ndim` | Number of dimensions |
+| `nd.size` | Total element count |
+| `nd.dtype` | Element type as `Type` |
+| `nd.typecode` | Element type as `NPTypeCode` |
+| `nd.T` | Transpose (swaps axes) |
+| `nd.flat` | 1D iterator over elements |
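+
+A short inspection sketch using these properties (output values assume a fresh `np.arange(12)`):
+
+```csharp
+var nd = np.arange(12).reshape(3, 4);
+Console.WriteLine(string.Join(",", nd.shape)); // 3,4
+Console.WriteLine(nd.ndim);                    // 2
+Console.WriteLine(nd.size);                    // 12
+Console.WriteLine(nd.dtype);                   // System.Int32
+var t = nd.T;                                  // transposed view with shape (4, 3)
+```
+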
## Adding New Features
@@ -433,7 +561,7 @@ A: Yes, 1-to-1 matching.
A: Anything that can use the capabilities - porting Python ML code, standalone .NET scientific computing, integration with TensorFlow.NET/ML.NET.
**Q: Are there areas of known fragility?**
-A: Slicing/broadcasting system is complex with ViewInfo and BroadcastInfo interactions - fragile but working.
+A: Slicing/broadcasting system is complex — offset/stride calculations with contiguity detection require careful handling. The `readonly struct Shape` with `ArrayFlags` simplifies this but edge cases remain.
**Q: How is NumPy compatibility validated?**
A: Written by hand based on NumPy docs and original tests. Testing philosophy: run actual NumPy code, observe output, replicate 1-to-1 in C#.
@@ -455,7 +583,7 @@ A: Implementations that differ from original NumPy 2.x behavior. A comprehensive
A: `NDArray` (user-facing API), `UnmanagedStorage` (raw memory management), and `Shape` (dimensions, strides, coordinate translation). They work together: NDArray wraps Storage which uses Shape for offset calculations.
**Q: What is Shape responsible for?**
-A: Dimensions, strides, coordinate-to-offset translation, contiguity tracking, and slice/broadcast info. Key properties: `IsScalar`, `IsContiguous`, `IsSliced`, `IsBroadcasted`. Methods: `GetOffset(coords)`, `GetCoordinates(offset)`.
+A: Shape is a `readonly struct` containing dimensions, strides, offset, bufferSize, and cached `ArrayFlags`. Key properties: `IsScalar`, `IsContiguous`, `IsSliced`, `IsBroadcasted`, `IsWriteable`, `IsSimpleSlice`. Methods: `GetOffset(coords)`, `GetCoordinates(offset)`. NumPy-aligned: broadcast views are read-only (`IsWriteable = false`).
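+
+For intuition, a simplified row-major offset calculation (illustrative only, not NumSharp's exact code):
+
+```csharp
+// For shape (3, 4) the element strides are [4, 1].
+static int GetOffsetSketch(int[] strides, int baseOffset, params int[] coords)
+{
+    int linear = baseOffset;
+    for (int i = 0; i < coords.Length; i++)
+        linear += coords[i] * strides[i]; // a stride of 0 (broadcast dim) contributes nothing
+    return linear;
+}
+// GetOffsetSketch(new[] { 4, 1 }, 0, 1, 2) == 6, i.e. element [1, 2] of a (3, 4) array
+```
+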
**Q: How does slicing work internally?**
A: The `Slice` class parses Python notation (e.g., "1:5:2") into `Start`, `Stop`, `Step`. It converts to `SliceDef` (absolute indices) for computation. `SliceDef.Merge()` handles recursive slicing (slice of a slice).
@@ -502,10 +630,10 @@ A: Core ops (`dot`, `matmul`) in `LinearAlgebra/`. Advanced decompositions (`inv
## Q&A - Development
**Q: What's in the test suite?**
-A: MSTest framework in `test/NumSharp.UnitTest/`. Many tests adapted from NumPy's own test suite. Decent coverage but gaps in edge cases.
+A: TUnit framework in `test/NumSharp.UnitTest/`. Many tests adapted from NumPy's own test suite. Decent coverage but gaps in edge cases. Uses source-generated test discovery (no special flags needed).
**Q: What .NET version is targeted?**
-A: Library and tests multi-target `net8.0` and `net10.0`. Dropped `netstandard2.0` in the dotnet810 branch upgrade.
+A: Library multi-targets `net8.0` and `net10.0`. Tests currently target `net10.0` only (TUnit requires .NET 9+ runtime). Dropped `netstandard2.0` in the dotnet810 branch upgrade.
**Q: What are the main dependencies?**
A: No external runtime dependencies. `System.Memory` and `System.Runtime.CompilerServices.Unsafe` (previously NuGet packages) are built into the .NET 8+ runtime.
diff --git a/.claude/PLAN_Base_Property_Storage_Level.md b/.claude/PLAN_Base_Property_Storage_Level.md
new file mode 100644
index 00000000..9a183288
--- /dev/null
+++ b/.claude/PLAN_Base_Property_Storage_Level.md
@@ -0,0 +1,332 @@
+# Plan: NumPy-compatible `.base` Property (Storage-Level Tracking)
+
+## Overview
+
+Implement NumPy's `.base` property by tracking view ownership at the `UnmanagedStorage` level, not `NDArray` level. This is architecturally cleaner because `UnmanagedStorage` is where data sharing actually occurs.
+
+## Design
+
+### Core Concept
+
+```
+NumPy: ndarray.base → the ndarray that owns the data (or None)
+NumSharp: UnmanagedStorage._baseStorage → the storage that owns the data
+ NDArray.@base → computed property that wraps _baseStorage
+```
+
+### Architecture
+
+```
+Original Array (owns data):
+ NDArray a
+ └── Storage A (_baseStorage = null, InternalArray = DATA)
+
+View (shares data):
+ NDArray b = a[2:5]
+ └── Storage B (_baseStorage = A, InternalArray = DATA) ← same DATA
+
+View of View (chains to original):
+ NDArray c = b[1:2]
+ └── Storage C (_baseStorage = A, InternalArray = DATA) ← still points to A
+```
+
+## Implementation
+
+### Step 1: Add `_baseStorage` field to UnmanagedStorage
+
+**File:** `src/NumSharp.Core/Backends/Unmanaged/UnmanagedStorage.cs`
+
+```csharp
+public partial class UnmanagedStorage : ICloneable
+{
+ // ... existing fields ...
+
+    /// <summary>
+    /// The original storage this is a view of, or null if this storage owns its data.
+    /// Always points to the ultimate owner (not intermediate views).
+    /// </summary>
+ internal UnmanagedStorage? _baseStorage;
+
+ // ... rest of class ...
+}
+```
+
+### Step 2: Update Alias methods to set `_baseStorage`
+
+**File:** `src/NumSharp.Core/Backends/Unmanaged/UnmanagedStorage.Cloning.cs`
+
+```csharp
+public UnmanagedStorage Alias()
+{
+ var r = new UnmanagedStorage();
+ r._shape = _shape;
+ r._typecode = _typecode;
+ r._dtype = _dtype;
+ if (InternalArray != null)
+ r.SetInternalArray(InternalArray);
+ r.Count = _shape.size;
+ r._baseStorage = _baseStorage ?? this; // ← ADD: chain to original
+ return r;
+}
+
+public UnmanagedStorage Alias(Shape shape)
+{
+ var r = new UnmanagedStorage();
+ r._typecode = _typecode;
+ r._dtype = _dtype;
+ if (InternalArray != null)
+ r.SetInternalArray(InternalArray);
+ r._shape = shape;
+ r.Count = shape.size;
+ r._baseStorage = _baseStorage ?? this; // ← ADD: chain to original
+ return r;
+}
+
+public UnmanagedStorage Alias(ref Shape shape)
+{
+ var r = new UnmanagedStorage();
+ r._shape = shape;
+ r._typecode = _typecode;
+ r._dtype = _dtype;
+ if (InternalArray != null)
+ r.SetInternalArray(InternalArray);
+ r.Count = shape.size;
+ r._baseStorage = _baseStorage ?? this; // ← ADD: chain to original
+ return r;
+}
+```
+
+### Step 3: Update GetView to propagate `_baseStorage`
+
+**File:** `src/NumSharp.Core/Backends/Unmanaged/UnmanagedStorage.Slicing.cs`
+
+Review `GetView()` - it calls `Alias()` internally, so `_baseStorage` should propagate automatically. Verify during implementation.
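+
+Why the chain always collapses to the original owner, shown on a simplified stand-in class (not the real NumSharp types):
+
+```csharp
+class StorageModel
+{
+    public StorageModel? Base;                                    // stands in for _baseStorage
+    public StorageModel Alias() => new() { Base = Base ?? this }; // same rule as the plan above
+}
+
+// var owner  = new StorageModel(); // Base == null  (owns its data)
+// var view   = owner.Alias();      // Base == owner
+// var nested = view.Alias();       // Base == owner, never the intermediate view
+```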
+
+### Step 4: Update CreateBroadcastedUnsafe
+
+**File:** `src/NumSharp.Core/Backends/Unmanaged/UnmanagedStorage.cs` lines 148-153
+
+Only the overload that takes `UnmanagedStorage` needs updating (the one taking `IArraySlice` creates owned data):
+
+```csharp
+public static UnmanagedStorage CreateBroadcastedUnsafe(UnmanagedStorage storage, Shape shape)
+{
+ var ret = new UnmanagedStorage();
+ ret._Allocate(shape, storage.InternalArray);
+ ret._baseStorage = storage._baseStorage ?? storage; // ← ADD: track original
+ return ret;
+}
+```
+
+The `IArraySlice` overload (line 135) stays unchanged - it creates owned data.
+
+### Step 5: Add computed `@base` property to NDArray
+
+**File:** `src/NumSharp.Core/Backends/NDArray.cs`
+
+```csharp
+/// <summary>
+/// NumPy-compatible: The array owning the memory, or null if this array owns its data.
+/// </summary>
+/// <remarks>
+/// https://numpy.org/doc/stable/reference/generated/numpy.ndarray.base.html
+/// Note: Unlike NumPy, this returns a new NDArray wrapper each time.
+/// For checking view status, use: arr.@base != null
+/// </remarks>
+public NDArray? @base => Storage._baseStorage is { } bs
+ ? WrapOwned(bs)
+ : null;
+```
+
+### Step 6: Add WrapOwned factory (if not exists)
+
+**File:** `src/NumSharp.Core/Backends/NDArray.cs`
+
+```csharp
+/// <summary>
+/// Wrap an UnmanagedStorage in an NDArray. Used for Clone, Scalar, computed .base, etc.
+/// </summary>
+internal static NDArray WrapOwned(UnmanagedStorage storage)
+ => new NDArray(storage);
+```
+
+### Step 7: Remove manual @base assignments
+
+Remove all the manual `view.@base = source.@base ?? source` assignments we added earlier:
+
+- `NDArray.Indexing.cs`
+- `NDArray.Indexing.Selection.Getter.cs`
+- `Default.Transpose.cs`
+- `np.expand_dims.cs`
+- `np.broadcast_to.cs`
+- `NdArray.ReShape.cs`
+- `NDArray.cs` (view method)
+- `Default.Reduction.*.cs`
+
+These are now handled automatically by `Alias()`.
+
+### Step 8: Remove `@base` field from NDArray
+
+The field we added:
+```csharp
+public NDArray? @base; // ← REMOVE this field
+```
+
+Replace with computed property from Step 5.
+
+## Files to Modify
+
+| File | Change |
+|------|--------|
+| `Backends/Unmanaged/UnmanagedStorage.cs` | Add `_baseStorage` field, update `CreateBroadcastedUnsafe(storage, shape)` |
+| `Backends/Unmanaged/UnmanagedStorage.Cloning.cs` | Set `_baseStorage` in all 3 `Alias()` methods |
+| `Backends/NDArray.cs` | Replace `@base` field with computed property |
+
+### Files to Revert (remove manual @base assignments)
+
+| File | Lines to Revert |
+|------|-----------------|
+| `Selection/NDArray.Indexing.cs` | Lines 63-66, 80-83 |
+| `Selection/NDArray.Indexing.Selection.Getter.cs` | Lines 52-56, 117-121 |
+| `Backends/Default/ArrayManipulation/Default.Transpose.cs` | Lines 192-194 |
+| `Manipulation/np.expand_dims.cs` | Lines 11-13 |
+| `Creation/np.broadcast_to.cs` | Lines 70-73, 111-114, 153-156 |
+| `Creation/NdArray.ReShape.cs` | Lines 37-39, 63-65, 89-91, 107-109 |
+| `Backends/NDArray.cs` | Lines 476-479 (view method) |
+| `Backends/Default/Math/Reduction/*.cs` | Various (7 files) |
+
+## Investigation Results
+
+### 1. CreateBroadcastedUnsafe Location
+
+**File:** `UnmanagedStorage.cs` lines 135-153
+
+Two overloads:
+```csharp
+// Creates new storage from raw slice - OWNS data, no _baseStorage
+public static UnmanagedStorage CreateBroadcastedUnsafe(IArraySlice arraySlice, Shape shape)
+
+// Creates view of existing storage - SHARES data, needs _baseStorage
+public static UnmanagedStorage CreateBroadcastedUnsafe(UnmanagedStorage storage, Shape shape)
+```
+
+**Action:** Only update the second overload (line 148) to propagate `_baseStorage`.
+
+### 2. GetView() Path Analysis
+
+**File:** `UnmanagedStorage.Slicing.cs`
+
+All paths go through `Alias()`:
+- Line 49: `view.Alias(view.Shape.ExpandDimension(axis))`
+- Line 98: `return Alias()`
+- Line 116: `return Alias(slicedShape)`
+
+**Exception:** Line 108 creates `new UnmanagedStorage(clonedData, cleanShape)` for broadcast materialization - this is a COPY (owns data), correctly should NOT set `_baseStorage`.
+
+**Action:** No changes needed - Alias() handles it.
+
+### 3. Clone() Behavior
+
+**File:** `UnmanagedStorage.Cloning.cs` line 200
+
+```csharp
+public UnmanagedStorage Clone() => new UnmanagedStorage(CloneData(), _shape.Clone(...));
+```
+
+Uses constructor, not Alias(). Creates owned data.
+
+**Action:** No changes needed - Clone() should NOT set `_baseStorage`.
+
+### 4. np.* Functions
+
+All use constructors like `new NDArray(dtype, shape)` or `new UnmanagedStorage(...)`.
+These create owned data, `_baseStorage` stays null by default.
+
+**Action:** No changes needed.
+
+### 5. NDArray Generic Class
+
+Inherits from NDArray. Uses `base(storage)` constructors.
+
+**Action:** No changes needed - inherits behavior from NDArray.
+
+## Behavior Comparison
+
+| Scenario | NumPy | NumSharp (this plan) |
+|----------|-------|---------------------|
+| `a = np.arange(10)` | `a.base is None` | `a.@base == null` |
+| `b = a[2:5]` | `b.base is a` | `b.@base != null` (wraps a's storage) |
+| `c = b[1:2]` | `c.base is a` | `c.@base` wraps a's storage |
+| `d = a.copy()` | `d.base is None` | `d.@base == null` |
+| `c.base is a` | `True` | `False` (different wrapper) |
+
+**Note:** Object identity (`is`) won't match, but semantic equivalence does.
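+
+In code, the semantic check under this plan would look roughly like this (hypothetical usage, mirroring the tests below):
+
+```csharp
+var a = np.arange(10);
+var b = a["2:5"];
+bool bIsView     = b.@base != null;                              // true: b is a view
+bool sharesWithA = ReferenceEquals(b.@base!.Storage, a.Storage); // true: same owning storage
+```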
+
+## Testing
+
+```csharp
+[Test]
+public void Base_OriginalArray_IsNull()
+{
+ var a = np.arange(10);
+ Assert.That(a.@base, Is.Null);
+}
+
+[Test]
+public void Base_SliceOfOriginal_PointsToOriginalStorage()
+{
+ var a = np.arange(10);
+ var b = a["2:5"];
+ Assert.That(b.@base, Is.Not.Null);
+ Assert.That(b.@base!.Storage, Is.SameAs(a.Storage));
+}
+
+[Test]
+public void Base_SliceOfSlice_PointsToOriginalStorage()
+{
+ var a = np.arange(10);
+ var b = a["2:7"];
+ var c = b["1:3"];
+ Assert.That(c.@base, Is.Not.Null);
+ Assert.That(c.@base!.Storage, Is.SameAs(a.Storage));
+}
+
+[Test]
+public void Base_Copy_IsNull()
+{
+ var a = np.arange(10);
+ var b = np.copy(a);
+ Assert.That(b.@base, Is.Null);
+}
+
+[Test]
+public void Base_Reshape_PointsToOriginalStorage()
+{
+ var a = np.arange(12);
+ var b = a.reshape(3, 4);
+ Assert.That(b.@base, Is.Not.Null);
+ Assert.That(b.@base!.Storage, Is.SameAs(a.Storage));
+}
+
+[Test]
+public void Base_Transpose_PointsToOriginalStorage()
+{
+ var a = np.arange(12).reshape(3, 4);
+ var b = a.T;
+ // b.@base points to reshape's storage, which points to a's storage
+ Assert.That(b.@base, Is.Not.Null);
+}
+```
+
+## Rollback Plan
+
+If issues arise:
+1. Remove `_baseStorage` field from UnmanagedStorage
+2. Revert Alias() methods
+3. Go back to NDArray-level tracking with manual assignments
+
+## Open Questions
+
+1. Should `@base` cache the created NDArray to avoid repeated allocations?
+2. Should we add a `bool OwnsData` property for cleaner checks?
+3. Impact on memory/GC - does `_baseStorage` reference prevent collection?
diff --git a/.github/workflows/build-and-release.yml b/.github/workflows/build-and-release.yml
index 19d5f6c5..e008b2de 100644
--- a/.github/workflows/build-and-release.yml
+++ b/.github/workflows/build-and-release.yml
@@ -18,24 +18,12 @@ env:
# NU5048: PackageIconUrl deprecated — cosmetic NuGet warning
DOTNET_NOWARN: CS1570%3BCS1571%3BCS1572%3BCS1573%3BCS1574%3BCS1587%3BCS1591%3BCS1711%3BCS1734%3BCS8981%3BNU5048
- # Exclude tests by category. Tag tests with [TestCategory("...")] instead of
- # adding individual Name!= exclusions here.
- # OpenBugs — known-failing bug reproductions (pass when fixed, then remove category)
- # WindowsOnly — require GDI+/System.Drawing.Common (excluded on Linux/macOS)
- TEST_FILTER: TestCategory!=OpenBugs
-
jobs:
test:
strategy:
fail-fast: false
matrix:
- include:
- - os: windows-latest
- extra_filter: ''
- - os: ubuntu-latest
- extra_filter: '& TestCategory!=WindowsOnly'
- - os: macos-latest
- extra_filter: '& TestCategory!=WindowsOnly'
+ os: [ windows-latest, ubuntu-latest, macos-latest ]
runs-on: ${{ matrix.os }}
@@ -53,8 +41,18 @@ jobs:
- name: Build
run: dotnet build test/NumSharp.UnitTest/NumSharp.UnitTest.csproj --configuration Release -p:NoWarn=${{ env.DOTNET_NOWARN }}
- - name: Test
- run: dotnet test test/NumSharp.UnitTest/NumSharp.UnitTest.csproj --configuration Release --no-build --verbosity normal --logger trx --filter "${{ env.TEST_FILTER }} ${{ matrix.extra_filter }}"
+ # NOTE: Test project currently only targets net10.0. See TODO in NumSharp.UnitTest.csproj
+ # to re-enable net8.0 when TUnit compatibility is resolved.
+ #
+ # Test filtering:
+ # - OpenBugs: excluded via --treenode-filter (known-failing bug reproductions)
+ # - WindowsOnly: auto-skipped at runtime via [SkipOnNonWindows] attribute
+ - name: Test (net10.0)
+ shell: bash
+ run: |
+ dotnet run --project test/NumSharp.UnitTest/NumSharp.UnitTest.csproj \
+ --configuration Release --no-build --framework net10.0 \
+ -- --treenode-filter '/*/*/*/*[Category!=OpenBugs]' --report-trx
- name: Upload Test Results
uses: actions/upload-artifact@v4
diff --git a/SciSharp.NumSharp.sln.DotSettings b/SciSharp.NumSharp.sln.DotSettings
index c7177296..b0e3b222 100644
--- a/SciSharp.NumSharp.sln.DotSettings
+++ b/SciSharp.NumSharp.sln.DotSettings
@@ -2,5 +2,9 @@
NO_INDENT
NO_INDENT
LIVE_MONITOR
+ True
+ True
+ True
+ True
True
True
\ No newline at end of file
diff --git a/benchmark/.gitignore b/benchmark/.gitignore
new file mode 100644
index 00000000..215b544e
--- /dev/null
+++ b/benchmark/.gitignore
@@ -0,0 +1,14 @@
+# Generated benchmark outputs (recreated by run-benchmarks.ps1)
+# README.md and benchmark-report.md are tracked (results documentation)
+# Only JSON and CSV are ignored as intermediate/machine-readable outputs
+/benchmark-report.json
+/benchmark-report.csv
+
+# Benchmark run outputs
+*.txt
+
+# Python cache
+__pycache__/
+*.pyc
+
+
diff --git a/benchmark/CLAUDE.md b/benchmark/CLAUDE.md
new file mode 100644
index 00000000..647356e8
--- /dev/null
+++ b/benchmark/CLAUDE.md
@@ -0,0 +1,1003 @@
+# NumSharp Benchmark Suite - Development Guide
+
+This document provides comprehensive guidance for working with the NumSharp benchmark infrastructure. It covers architecture, patterns, API usage, extending benchmarks, and troubleshooting.
+
+---
+
+## Table of Contents
+
+1. [Overview](#overview)
+2. [Directory Structure](#directory-structure)
+3. [Architecture](#architecture)
+4. [Infrastructure Classes](#infrastructure-classes)
+5. [Benchmark Categories](#benchmark-categories)
+6. [Python Benchmark System](#python-benchmark-system)
+7. [Report Generation](#report-generation)
+8. [Running Benchmarks](#running-benchmarks)
+9. [Adding New Benchmarks](#adding-new-benchmarks)
+10. [Type System](#type-system)
+11. [NumSharp API Patterns](#numsharp-api-patterns)
+12. [Common Issues & Solutions](#common-issues--solutions)
+13. [Performance Interpretation](#performance-interpretation)
+14. [CI Integration](#ci-integration)
+
+---
+
+## Overview
+
+The benchmark suite provides fair, reproducible performance comparisons between NumSharp and NumPy. It was designed with these principles:
+
+- **Matching methodology**: Same operations, same array sizes, same random seeds
+- **Comprehensive type coverage**: All 12 NumSharp-supported data types
+- **Categorical organization**: Operations grouped by type (arithmetic, unary, reduction, etc.)
+- **Automated reporting**: JSON export and Markdown report generation
+- **Cross-platform**: Runs on Windows, Linux, macOS
+
+### Key Metrics
+
+| Metric | Current Coverage |
+|--------|-----------------|
+| Operations | 132+ |
+| Data Types | 12 (all NumSharp types) |
+| Suites | 12 (dispatch, fusion, NumSharp baseline, dynamic emission, arithmetic, unary, reduction, broadcast, creation, manipulation, slicing, multidim) |
+| Array Sizes | 5 (Scalar, 100, 1K, 100K, 10M) |
+
+---
+
+## Directory Structure
+
+```
+benchmark/
+├── CLAUDE.md # This file (development guide)
+├── run-benchmarks.ps1 # PowerShell benchmark runner
+├── README.md # Benchmark results (== benchmark-report.md)
+├── benchmark-report.md # Generated report (after running)
+├── benchmark-report.json # JSON results (after running)
+│
+├── scripts/ # Helper scripts
+│ └── merge-results.py # Merges NumPy and NumSharp results
+│
+├── NumSharp.Benchmark.Python/ # Python/NumPy benchmarks
+│ └── numpy_benchmark.py # NumPy benchmark implementation
+│
+└── NumSharp.Benchmark.GraphEngine/ # C# BenchmarkDotNet project
+ ├── README.md # C# benchmark documentation
+ ├── Program.cs # Entry point with interactive menu
+ ├── NumSharp.Benchmark.GraphEngine.csproj
+ │
+ ├── Infrastructure/ # Base classes and configuration
+ │ ├── BenchmarkConfig.cs # BenchmarkDotNet configurations
+ │ ├── BenchmarkBase.cs # Base class for all benchmarks
+ │ ├── TypeParameterSource.cs # Type parameter sources (NPTypeCode)
+ │ └── ArraySizeSource.cs # Standard array size constants
+ │
+ └── Benchmarks/ # Benchmark implementations
+ ├── DispatchBenchmarks.cs # Original: DynamicMethod dispatch
+ ├── FusionBenchmarks.cs # Original: Kernel fusion patterns
+ ├── NumSharpBenchmarks.cs # Original: NumSharp baseline
+ ├── DynamicEmissionBenchmarks.cs # Original: Per-op DynMethod
+ │
+ ├── Arithmetic/ # Binary arithmetic operations
+ │ ├── AddBenchmarks.cs
+ │ ├── SubtractBenchmarks.cs
+ │ ├── MultiplyBenchmarks.cs
+ │ ├── DivideBenchmarks.cs
+ │ └── ModuloBenchmarks.cs
+ │
+ ├── Unary/ # Unary operations
+ │ ├── MathBenchmarks.cs # sqrt, abs, sign, floor, ceil, around, clip
+ │ ├── ExpLogBenchmarks.cs # exp, exp2, expm1, log, log2, log10, log1p
+ │ ├── TrigBenchmarks.cs # sin, cos, tan
+ │ └── PowerBenchmarks.cs # power with scalar exponents
+ │
+ ├── Reduction/ # Reduction operations
+ │ ├── SumBenchmarks.cs # sum, cumsum
+ │ ├── MeanBenchmarks.cs # mean
+ │ ├── VarStdBenchmarks.cs # var, std
+ │ ├── MinMaxBenchmarks.cs # amin, amax, argmin, argmax
+ │ └── ProdBenchmarks.cs # prod
+ │
+ ├── Broadcasting/ # Broadcasting operations
+ │ └── BroadcastBenchmarks.cs # Scalar, row, column, 3D, broadcast_to
+ │
+ ├── MultiDim/ # Multi-dimensional comparisons
+ │ └── MultiDimBenchmarks.cs # 1D vs 2D vs 3D performance
+ │
+ ├── Slicing/ # View and slice operations
+ │ └── SliceBenchmarks.cs # Contiguous, strided, reversed, copy
+ │
+ ├── Creation/ # Array creation
+ │ └── CreationBenchmarks.cs # zeros, ones, empty, full, copy, *_like
+ │
+ └── Manipulation/ # Shape manipulation
+ ├── ReshapeBenchmarks.cs # reshape, transpose, ravel, flatten
+ ├── StackBenchmarks.cs # concatenate, stack, hstack, vstack, dstack
+ └── DimsBenchmarks.cs # squeeze, expand_dims, swapaxes, moveaxis
+```
+
+---
+
+## Architecture
+
+### Three-Tier Design
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│ Report Generation Layer │
+│ run-benchmarks.ps1 → benchmark-report.md │
+└─────────────────────────────────────────────────────────────────┘
+ ↑
+┌─────────────────────────────────────────────────────────────────┐
+│ Benchmark Execution Layer │
+│ ┌─────────────────────┐ ┌─────────────────────────────────┐ │
+│ │ C# / BenchmarkDotNet │ │ Python / NumPy │ │
+│ │ NumSharp.Benchmark │ │ numpy_benchmark.py │ │
+│ └─────────────────────┘ └─────────────────────────────────┘ │
+└─────────────────────────────────────────────────────────────────┘
+ ↑
+┌─────────────────────────────────────────────────────────────────┐
+│ Infrastructure Layer │
+│ BenchmarkBase → TypeParameterSource → ArraySizeSource │
+└─────────────────────────────────────────────────────────────────┘
+```
+
+### Data Flow
+
+```
+1. Setup Phase:
+ - CreateRandomArray(N, dtype, seed) → NDArray
+ - Same seed ensures reproducibility across C#/Python
+
+2. Benchmark Phase:
+ - BenchmarkDotNet / Python benchmark() wrapper
+ - Warmup iterations excluded
+ - Statistical analysis (mean, stddev, min, max)
+
+3. Export Phase:
+ - C#: JSON via BenchmarkDotNet exporters
+ - Python: JSON via --output flag
+
+4. Report Phase:
+ - PowerShell merges results
+ - Generates Markdown report with tables
+```
+
+---
+
+## Infrastructure Classes
+
+### BenchmarkBase
+
+Base class providing array creation helpers.
+
+```csharp
+public abstract class BenchmarkBase
+{
+ // Override in derived class
+ public virtual int N { get; set; } = ArraySizeSource.Large;
+
+ // Reproducible random seed
+ protected const int Seed = 42;
+
+ // Create typed random arrays
+ protected static NDArray CreateRandomArray(int n, NPTypeCode dtype, int seed = Seed);
+ protected static NDArray CreatePositiveArray(int n, NPTypeCode dtype, int seed = Seed);
+ protected static NDArray CreateRandomArray2D(int rows, int cols, NPTypeCode dtype, int seed = Seed);
+ protected static NDArray CreateRandomArray3D(int d1, int d2, int d3, NPTypeCode dtype, int seed = Seed);
+
+ // Create scalar values for each type
+ protected static object GetScalar(NPTypeCode dtype, double value = 42.0);
+}
+```
+
+### TypedBenchmarkBase
+
+Extension that adds dtype parameterization.
+
+```csharp
+public abstract class TypedBenchmarkBase : BenchmarkBase
+{
+ [ParamsSource(nameof(Types))]
+ public NPTypeCode DType { get; set; }
+
+ // Override to customize available types
+    public virtual IEnumerable<NPTypeCode> Types => TypeParameterSource.CommonTypes;
+}
+```
+
+### TypeParameterSource
+
+Static class providing type collections.
+
+```csharp
+public static class TypeParameterSource
+{
+ // All 12 NumSharp types
+    public static IEnumerable<NPTypeCode> AllNumericTypes;
+
+    // Fast subset: int32, int64, float32, float64
+    public static IEnumerable<NPTypeCode> CommonTypes;
+
+    // All except bool and char
+    public static IEnumerable<NPTypeCode> ArithmeticTypes;
+
+    // float32, float64, decimal (for sqrt, log, trig)
+    public static IEnumerable<NPTypeCode> TranscendentalTypes;
+
+    // Minimal: int32, float64
+    public static IEnumerable<NPTypeCode> MinimalTypes;
+
+    // Integer types only
+    public static IEnumerable<NPTypeCode> IntegerTypes;
+
+    // Floating types only
+    public static IEnumerable<NPTypeCode> FloatingTypes;
+
+ // Helpers
+ public static string GetDtypeName(NPTypeCode code); // NumPy dtype name
+ public static string GetShortName(NPTypeCode code); // Display name
+}
+```
+
+### ArraySizeSource
+
+Standard array sizes for consistency.
+
+```csharp
+public static class ArraySizeSource
+{
+ public const int Small = 1_000; // L1 cache, per-element overhead
+ public const int Medium = 100_000; // L2/L3 cache, typical use
+ public const int Large = 10_000_000; // Memory-bound throughput
+
+    public static IEnumerable<int> StandardSizes; // All three
+    public static IEnumerable<int> QuickSizes;    // Large only
+
+ // 2D/3D size tuples
+ public static IEnumerable<(int, int)> Matrix2DSizes;
+ public static IEnumerable<(int, int, int)> Tensor3DSizes;
+}
+```
+
+---
+
+## Benchmark Categories
+
+### Arithmetic (`Benchmarks/Arithmetic/`)
+
+Binary arithmetic operations between arrays and scalars.
+
+| File | Operations | Notes |
+|------|------------|-------|
+| `AddBenchmarks.cs` | `+`, `np.add`, scalar add, row/col broadcast | Tests both operator and function syntax |
+| `SubtractBenchmarks.cs` | `-`, scalar subtract | Tests both directions (a-b, scalar-a) |
+| `MultiplyBenchmarks.cs` | `*`, square, scalar multiply | Tests self-multiplication |
+| `DivideBenchmarks.cs` | `/`, scalar divide | Uses positive arrays to avoid div-by-zero |
+| `ModuloBenchmarks.cs` | `%`, scalar modulo | Limited to types supporting modulo |
+
+**Type coverage**: `ArithmeticTypes` (excludes bool, char)
+
+### Unary (`Benchmarks/Unary/`)
+
+Single-input mathematical functions.
+
+| File | Operations | Notes |
+|------|------------|-------|
+| `MathBenchmarks.cs` | sqrt, abs, sign, floor, ceil, around, clip | Basic math operations |
+| `ExpLogBenchmarks.cs` | exp, exp2, expm1, log, log2, log10, log1p | Exponential/logarithmic |
+| `TrigBenchmarks.cs` | sin, cos, tan | Trigonometric (expensive!) |
+| `PowerBenchmarks.cs` | power(a, 2), power(a, 3), power(a, 0.5) | Scalar exponents only |
+
+**Type coverage**: `TranscendentalTypes` (float32, float64, decimal)
+
+**Note**: NumSharp `np.power` only supports scalar exponents (`ValueType`), not element-wise NDArray exponents.
+
+### Reduction (`Benchmarks/Reduction/`)
+
+Operations that reduce array dimensionality.
+
+| File | Operations | Notes |
+|------|------------|-------|
+| `SumBenchmarks.cs` | sum, sum(axis=0), sum(axis=1), cumsum | Full and axis reductions |
+| `MeanBenchmarks.cs` | mean, mean(axis) | Returns floating-point |
+| `VarStdBenchmarks.cs` | var, std (full and axis) | Multiple passes required |
+| `MinMaxBenchmarks.cs` | amin, amax, argmin, argmax | Value and index operations |
+| `ProdBenchmarks.cs` | prod (full and axis) | Uses small arrays to avoid overflow |
+
+**Type coverage**: `ArithmeticTypes` for sum/minmax, `TranscendentalTypes` for var/std
+
+### Broadcasting (`Benchmarks/Broadcasting/`)
+
+Operations involving shape broadcasting.
+
+| File | Operations | Notes |
+|------|------------|-------|
+| `BroadcastBenchmarks.cs` | scalar, row, column, 3D, broadcast_to | All broadcast patterns |
+
+**Broadcasting patterns**:
+- Scalar: `(N,M) + scalar`
+- Row: `(N,M) + (M,)` → broadcasts row across all rows
+- Column: `(N,M) + (N,1)` → broadcasts column across all columns
+- 3D: `(D,D,D) + (D,D)` → broadcasts 2D across first dimension
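+
+A rough NumSharp sketch of the patterns above (uses the `np.ones(new Shape(...), typeCode)` form documented under NumSharp API Patterns; exact overloads may differ):
+
+```csharp
+var m   = np.ones(new Shape(1000, 500), NPTypeCode.Double); // (N, M)
+var row = np.ones(new Shape(500), NPTypeCode.Double);       // (M,)
+var col = np.ones(new Shape(1000, 1), NPTypeCode.Double);   // (N, 1)
+
+var r1 = m + 2.0;  // scalar broadcast
+var r2 = m + row;  // row broadcast
+var r3 = m + col;  // column broadcast
+```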
+
+### Creation (`Benchmarks/Creation/`)
+
+Array allocation and initialization.
+
+| File | Operations | Notes |
+|------|------------|-------|
+| `CreationBenchmarks.cs` | zeros, ones, empty, full, arange, linspace, copy, *_like | All creation patterns |
+
+**Key insight**: `zeros` and `empty` are O(1) lazy allocation (~0.01ms), while `ones` and `full` require initialization (~8-16ms for 10M elements).
+
+### Manipulation (`Benchmarks/Manipulation/`)
+
+Shape and dimension manipulation.
+
+| File | Operations | Notes |
+|------|------------|-------|
+| `ReshapeBenchmarks.cs` | reshape, transpose, ravel, flatten | View vs copy semantics |
+| `StackBenchmarks.cs` | concatenate, stack, hstack, vstack, dstack | Combining arrays |
+| `DimsBenchmarks.cs` | squeeze, expand_dims, swapaxes, moveaxis, rollaxis | Dimension manipulation |
+
+**Key insight**: View operations (reshape, transpose, ravel) are O(1), while copy operations (flatten, concatenate) are O(N).
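+
+A quick way to see the difference (relies on the view semantics documented in the main CLAUDE.md; the indexer write syntax is assumed):
+
+```csharp
+var a = np.arange(1_000_000);
+var v = a.reshape(1000, 1000); // view: shares a's buffer, O(1)
+var f = a.flatten();           // copy: new buffer, O(N)
+
+v[0, 0] = -1;
+// a[0] now reads -1 (v is a view of a); f[0] is unchanged (f is a copy)
+```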
+
+### Slicing (`Benchmarks/Slicing/`)
+
+Array slicing and view operations.
+
+| File | Operations | Notes |
+|------|------------|-------|
+| `SliceBenchmarks.cs` | contiguous, strided, reversed, row/col slices, copy | View creation and operations |
+
+**Key insight**: Strided slices are slower than contiguous due to non-sequential memory access.
+
+### MultiDim (`Benchmarks/MultiDim/`)
+
+Comparing 1D, 2D, and 3D array performance.
+
+| File | Operations | Notes |
+|------|------------|-------|
+| `MultiDimBenchmarks.cs` | add, sum, sqrt on 1D/2D/3D | Same total elements, different shapes |
+
+---
+
+## Python Benchmark System
+
+### Structure
+
+```python
+# Configuration
+ARRAY_SIZES = {'small': 1_000, 'medium': 100_000, 'large': 10_000_000}
+DTYPES = {'int32': np.int32, 'float64': np.float64, ...}
+COMMON_DTYPES = ['int32', 'int64', 'float32', 'float64']
+
+# Benchmark function
+def benchmark(func, n, warmup=10, iterations=50) -> BenchmarkResult
+
+# Suite functions
+def run_arithmetic_benchmarks(n, dtype_name, iterations) -> List[BenchmarkResult]
+def run_unary_benchmarks(n, dtype_name, iterations) -> List[BenchmarkResult]
+def run_reduction_benchmarks(n, dtype_name, iterations) -> List[BenchmarkResult]
+def run_broadcast_benchmarks(n, iterations) -> List[BenchmarkResult]
+def run_creation_benchmarks(n, dtype_name, iterations) -> List[BenchmarkResult]
+def run_manipulation_benchmarks(n, iterations) -> List[BenchmarkResult]
+def run_slicing_benchmarks(n, iterations) -> List[BenchmarkResult]
+def run_dispatch_benchmarks(n, iterations) -> List[BenchmarkResult]
+def run_fusion_benchmarks(n, iterations) -> List[BenchmarkResult]
+```
+
+### Command-Line Interface
+
+```bash
+cd NumSharp.Benchmark.Python
+python numpy_benchmark.py # All benchmarks
+python numpy_benchmark.py --suite dispatch # Specific suite
+python numpy_benchmark.py --quick # 10 iterations
+python numpy_benchmark.py --type int32 # Specific dtype
+python numpy_benchmark.py --size large # 10M elements
+python numpy_benchmark.py --output results.json # JSON export
+```
+
+### Result Format
+
+```python
+@dataclass
+class BenchmarkResult:
+ name: str # "np.sum (float64)"
+ category: str # "Sum"
+ suite: str # "Reduction"
+ dtype: str # "float64"
+ n: int # 10000000
+ mean_ms: float # 5.248
+ stddev_ms: float # 0.395
+ min_ms: float # 4.821
+ max_ms: float # 6.134
+ iterations: int # 50
+ ops_per_sec: float # 190.55
+```
+
+---
+
+## Report Generation
+
+### PowerShell Script (`run-benchmarks.ps1`)
+
+```powershell
+# Parameters
+-Quick # Fewer iterations
+-Suite # Specific suite (all, arithmetic, reduction, etc.)
+-OutputPath # Report file path
+-SkipCSharp # Skip C# benchmarks
+-SkipPython # Skip Python benchmarks
+-Type # Specific dtype
+-Size # Array size preset
+```
+
+### Report Sections
+
+1. **Environment** - .NET SDK, Python, NumPy versions, CPU
+2. **Executive Summary** - Operations tested, suites, array size
+3. **NumPy Baseline Performance** - Grouped by suite with tables
+4. **NumSharp Results** - BenchmarkDotNet tables (when C# runs)
+5. **Performance Comparison Guide** - Legend and interpretation
+6. **Key Insights** - Improvement areas and advantages
+7. **Reproduction** - Command examples
+8. **Type Coverage Matrix** - NumPy dtype to C# type mapping
+
+### Status Icons
+
+| Ratio (NumSharp ÷ NumPy) | Icon | Meaning |
+|-------|------|---------|
+| ≤ 1.0 | ✅ | NumSharp faster or equal |
+| 1.0 - 2.0 | 🟡 | Close to NumPy |
+| 2.0 - 5.0 | 🟠 | Slower |
+| > 5.0 | 🔴 | Much slower |
+
+---
+
+## Running Benchmarks
+
+### Quick Start
+
+```powershell
+# Full suite with report
+.\run-benchmarks.ps1
+
+# Quick NumPy-only
+.\run-benchmarks.ps1 -Quick -SkipCSharp
+
+# Specific suite
+.\run-benchmarks.ps1 -Suite arithmetic -Type float64
+```
+
+### C# Interactive Menu
+
+```bash
+cd NumSharp.Benchmark.GraphEngine
+dotnet run -c Release -f net10.0
+```
+
+Menu options:
+1. Dispatch Mechanism Comparison
+2. Fusion Pattern Benchmarks
+3. NumSharp Current Performance
+4. DynamicMethod Emission
+5. Arithmetic Operations
+6. Unary Operations
+7. Reduction Operations
+8. Broadcasting Operations
+9. Array Creation Operations
+10. Shape Manipulation
+11. Slicing Operations
+12. Multi-dimensional Arrays
+A. All Benchmarks
+Q. Quick smoke test
+
+### C# Command-Line
+
+```bash
+# Filter by name pattern
+dotnet run -c Release -- --filter "*Add*"
+
+# Specific job type
+dotnet run -c Release -- --job Short --filter "*"
+
+# JSON export
+dotnet run -c Release -- --exporters json
+
+# Multiple filters
+dotnet run -c Release -- --filter "*Arithmetic*,*Reduction*"
+```
+
+### Python Standalone
+
+```bash
+cd NumSharp.Benchmark.Python
+
+# All with JSON
+python numpy_benchmark.py --output ../benchmark-report.json
+
+# Quick arithmetic only
+python numpy_benchmark.py --quick --suite arithmetic
+
+# Specific type and size
+python numpy_benchmark.py --type float32 --size medium
+```
+
+---
+
+## Adding New Benchmarks
+
+### C# Benchmark Template
+
+```csharp
+using BenchmarkDotNet.Attributes;
+using NumSharp;
+using NumSharp.Benchmark.GraphEngine.Infrastructure;
+
+namespace NumSharp.Benchmark.GraphEngine.Benchmarks.YourCategory;
+
+/// <summary>
+/// Brief description of what this class benchmarks.
+/// </summary>
+[BenchmarkCategory("YourCategory", "SubCategory")]
+public class YourBenchmarks : TypedBenchmarkBase
+{
+ private NDArray _a = null!;
+ private NDArray _b = null!;
+
+ // Array sizes to test
+ [Params(ArraySizeSource.Small, ArraySizeSource.Medium, ArraySizeSource.Large)]
+ public override int N { get; set; }
+
+ // Types to test (override for custom selection)
+    public override IEnumerable<NPTypeCode> Types => TypeParameterSource.CommonTypes;
+
+ [GlobalSetup]
+ public void Setup()
+ {
+ _a = CreateRandomArray(N, DType);
+ _b = CreateRandomArray(N, DType, seed: 43); // Different seed
+ }
+
+ [GlobalCleanup]
+ public void Cleanup()
+ {
+ _a = null!;
+ _b = null!;
+ GC.Collect();
+ }
+
+ [Benchmark(Description = "Operation description")]
+ [BenchmarkCategory("SubCategory")]
+ public NDArray YourOperation() => np.your_operation(_a);
+
+ [Benchmark(Description = "Another operation")]
+ [BenchmarkCategory("AnotherSubCategory")]
+ public NDArray AnotherOperation() => _a + _b;
+}
+```
+
+### Python Benchmark Template
+
+```python
+def run_your_benchmarks(n: int, dtype_name: str, iterations: int) -> List[BenchmarkResult]:
+ """Benchmark your operations."""
+ results = []
+ dtype = DTYPES[dtype_name]
+
+ # Setup
+ np.random.seed(42)
+ a = create_random_array(n, dtype_name, seed=42)
+ b = create_random_array(n, dtype_name, seed=43)
+
+ # Benchmark 1
+ def your_operation(): return np.your_operation(a)
+ r = benchmark(your_operation, n, iterations=iterations)
+ r.name = f"np.your_operation ({dtype_name})"
+ r.category = "YourCategory"
+ r.suite = "YourSuite"
+ r.dtype = dtype_name
+ results.append(r)
+
+ # Benchmark 2
+ def another_operation(): return a + b
+ r = benchmark(another_operation, n, iterations=iterations)
+ r.name = f"a + b ({dtype_name})"
+ r.category = "AnotherCategory"
+ r.suite = "YourSuite"
+ r.dtype = dtype_name
+ results.append(r)
+
+ return results
+```
+
+### Adding to main()
+
+```python
+# In main():
+if args.suite in ["yoursuite", "all"]:
+ for dtype in dtypes_to_run:
+ results = run_your_benchmarks(args.n, dtype, args.iterations)
+ all_results.extend(results)
+```
+
+### Updating Program.cs Menu
+
+```csharp
+// Add to menu:
+Console.WriteLine("13. Your New Benchmarks");
+
+// Add to switch:
+"13" => ["--filter", "*YourBenchmarks*"],
+```
+
+### Updating run-benchmarks.ps1
+
+```powershell
+# Add to $filter switch:
+'yoursuite' { "*YourBenchmarks*" }
+
+# Add to ValidateSet:
+[ValidateSet('all', ..., 'yoursuite')]
+```
+
+---
+
+## Type System
+
+### NumSharp NPTypeCode to NumPy dtype Mapping
+
+| NPTypeCode | C# Type | NumPy dtype | Size (bytes) |
+|------------|---------|-------------|--------------|
+| Boolean | bool | bool | 1 |
+| Byte | byte | uint8 | 1 |
+| Int16 | short | int16 | 2 |
+| UInt16 | ushort | uint16 | 2 |
+| Int32 | int | int32 | 4 |
+| UInt32 | uint | uint32 | 4 |
+| Int64 | long | int64 | 8 |
+| UInt64 | ulong | uint64 | 8 |
+| Char | char | uint16 | 2 |
+| Single | float | float32 | 4 |
+| Double | double | float64 | 8 |
+| Decimal | decimal | float128* | 16 |
+
+*Note: NumPy's float128 is platform-dependent and not exactly equivalent to C# decimal.
+
+### Type Selection Guidelines
+
+| Operation Type | Use Types |
+|----------------|-----------|
+| Arithmetic (+, -, *, /) | `ArithmeticTypes` |
+| Transcendental (sqrt, exp, log, trig) | `TranscendentalTypes` |
+| Reduction (sum, mean) | `ArithmeticTypes` |
+| Variance/StdDev | `TranscendentalTypes` |
+| Comparison | `AllNumericTypes` |
+| Quick benchmarks | `CommonTypes` or `MinimalTypes` |
+
+---
+
+## NumSharp API Patterns
+
+### Array Creation
+
+```csharp
+// NumSharp requires Shape, not int
+np.zeros(new Shape(N), typeCode); // NOT np.zeros(N, typeCode)
+np.ones(new Shape(N), typeCode);
+np.empty(new Shape(N), typeCode);
+np.full(new Shape(N), value, typeCode);
+
+// Scalar creation
+NDArray.Scalar(value, typeCode); // NOT np.array(value)
+```
+
+### Rounding
+
+```csharp
+// NumSharp uses np.around or np.round_, NOT np.round
+np.around(_a);
+np.round_(_a);
+```
+
+### Power
+
+```csharp
+// NumSharp np.power only accepts ValueType exponents
+np.power(_a, 2); // OK
+np.power(_a, 0.5); // OK
+np.power(_a, _b);   // NOT SUPPORTED - only scalar exponents (for squaring, use _a * _a)
+```
+
+### Type Conversion
+
+```csharp
+// astype for conversion
+_a.astype(NPTypeCode.Float64);
+_a.astype(np.float64);
+
+// copy vs view
+_a.copy(); // Always creates copy
+np.copy(_a); // Same as above
+```
+
+### Slicing Syntax
+
+```csharp
+// String-based slicing
+_arr["100:1000"]; // Contiguous slice
+_arr["::2"]; // Every 2nd element
+_arr["::-1"]; // Reversed
+_arr["10:100, :"]; // 2D row slice
+_arr[":, 10:100"]; // 2D column slice (strided)
+```
+
+---
+
+## Common Issues & Solutions
+
+### Runtime Error (Python): Type object has no attribute 'type'
+
+**Python error**: `AttributeError: type object 'numpy.int32' has no attribute 'type'`
+
+**Cause**: Using `dtype.type(5)` instead of `dtype(5)`
+
+**Fix**:
+```python
+# Wrong
+scalar = dtype.type(5)
+
+# Correct
+scalar = dtype(5)
+```
+
+### Runtime Error (Python): Cannot reshape array
+
+**Python error**: `ValueError: cannot reshape array of size X into shape (Y,Z)`
+
+**Cause**: Integer division means `rows * cols != n`
+
+**Fix**:
+```python
+rows = int(np.sqrt(n))
+cols = n // rows
+actual_n = rows * cols # May be slightly less than n
+arr_1d = np.random.random(actual_n) # Use actual_n
+```
+
+### Build Error: CS8377 - Type must be non-nullable value type
+
+**C# error**: `np.array(params T[])` fails when T is object
+
+**Cause**: `GetScalar` returns `object`, not `ValueType`
+
+**Fix**:
+```csharp
+// Wrong
+_scalar = np.array(GetScalar(DType, 5.0)).astype(DType);
+
+// Correct
+_scalar = NDArray.Scalar(GetScalar(DType, 5.0), DType);
+```
+
+### Build Error: CS0117 - 'np' does not contain definition for 'round'
+
+**Cause**: NumSharp uses `np.around` or `np.round_`, not `np.round`
+
+**Fix**:
+```csharp
+// Wrong
+np.round(_a);
+
+// Correct
+np.around(_a);
+// or
+np.round_(_a);
+```
+
+### Build Error: File is locked
+
+**Error**: `The file is locked by: "NumSharp.Benchmark.GraphEngine (PID)"`
+
+**Cause**: Previous benchmark run is still executing
+
+**Fix**: Wait for process to complete or kill it manually
+
+### PowerShell Variable Conflict
+
+**Error**: `The value Microsoft.PowerShell.Commands.GroupInfo is not a valid value for the Suite variable`
+
+**Cause**: Local variable `$suite` conflicts with parameter `$Suite`
+
+**Fix**: Rename local variable to `$suiteGroup` or similar
+
+---
+
+## Performance Interpretation
+
+### What the Numbers Mean
+
+| Time Range | Interpretation |
+|------------|----------------|
+| < 0.1 ms | O(1) operation (view creation, metadata) |
+| 0.1 - 1 ms | Small overhead + minimal computation |
+| 1 - 10 ms | Cache-efficient operations |
+| 10 - 50 ms | Memory-bound operations |
+| 50 - 200 ms | Complex operations (trig, var/std) |
+| > 200 ms | Multi-pass or unoptimized |
+
+### Memory Bandwidth Expectations
+
+For 10M elements (assuming roughly 8 GB/s of effective read bandwidth; the exact floor is machine-dependent):
+- float32 (40 MB) read: ~5 ms theoretical minimum
+- float64 (80 MB) read: ~10 ms theoretical minimum
+- Operations should be within 2-5x of theoretical minimum
+
+### What Affects Performance
+
+1. **Memory layout**: Contiguous > strided > random access
+2. **Cache utilization**: L1 > L2 > L3 > RAM
+3. **SIMD**: Vectorized operations are 4-8x faster
+4. **Type size**: Smaller types = more elements per cache line
+5. **Complexity**: Arithmetic < transcendental < trig
+
+### NumPy Advantages (Why It's Fast)
+
+1. **BLAS/LAPACK**: Highly optimized linear algebra
+2. **AVX/SSE**: SIMD vectorization throughout
+3. **C implementation**: No GC overhead
+4. **Memory pools**: Reduced allocation overhead
+5. **Expression templates**: Some kernel fusion
+
+### NumSharp Advantages
+
+1. **.NET integration**: Type safety, tooling, ecosystem
+2. **Unmanaged memory**: No GC pressure for large arrays
+3. **View semantics**: Zero-copy slicing
+4. **Future potential**: SIMD via DynamicMethod emission
+
+---
+
+## CI Integration
+
+### JSON Export
+
+```bash
+# Python (from benchmark/ directory)
+python NumSharp.Benchmark.Python/numpy_benchmark.py --output benchmark-report.json
+
+# C#
+cd NumSharp.Benchmark.GraphEngine
+dotnet run -c Release -- --exporters json
+# Results in BenchmarkDotNet.Artifacts/results/*.json
+```
+
+### CI Workflow Example
+
+```yaml
+name: Benchmarks
+
+on:
+ push:
+ branches: [master]
+ schedule:
+ - cron: '0 0 * * 0' # Weekly
+
+jobs:
+ benchmark:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Setup .NET
+ uses: actions/setup-dotnet@v4
+ with:
+ dotnet-version: '10.0.x'
+
+ - name: Setup Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.12'
+
+ - name: Install NumPy
+ run: pip install numpy tabulate
+
+ - name: Run NumPy benchmarks
+ run: |
+ cd benchmark
+ python NumSharp.Benchmark.Python/numpy_benchmark.py --quick --output benchmark-report.json
+
+ - name: Run NumSharp benchmarks
+ run: |
+ cd benchmark/NumSharp.Benchmark.GraphEngine
+ dotnet run -c Release -- --job Short --exporters json
+
+ - name: Upload results
+ uses: actions/upload-artifact@v4
+ with:
+ name: benchmark-results
+ path: |
+ benchmark/benchmark-report.json
+ benchmark/NumSharp.Benchmark.GraphEngine/BenchmarkDotNet.Artifacts/
+```
+
+### Regression Detection
+
+Compare current results against baseline:
+
+```python
+import json
+
+def check_regression(current_file, baseline_file, threshold=1.2):
+ """Alert if any operation is >threshold slower than baseline."""
+ current = json.load(open(current_file))
+ baseline = json.load(open(baseline_file))
+
+ baseline_map = {r['name']: r['mean_ms'] for r in baseline}
+
+ regressions = []
+ for r in current:
+ if r['name'] in baseline_map:
+ ratio = r['mean_ms'] / baseline_map[r['name']]
+ if ratio > threshold:
+ regressions.append((r['name'], ratio))
+
+ return regressions
+```
+
+---
+
+## Quick Reference
+
+### Common Commands
+
+```powershell
+# Full benchmark with report
+.\run-benchmarks.ps1
+
+# Quick NumPy only
+.\run-benchmarks.ps1 -Quick -SkipCSharp
+
+# Specific suite
+.\run-benchmarks.ps1 -Suite arithmetic
+
+# C# interactive
+cd NumSharp.Benchmark.GraphEngine && dotnet run -c Release
+
+# C# specific filter
+dotnet run -c Release -- --filter "*Sum*" --job Short
+
+# Python specific
+python NumSharp.Benchmark.Python/numpy_benchmark.py --suite reduction --type float64 --quick
+```
+
+### File Locations
+
+| Item | Location |
+|------|----------|
+| C# benchmarks | `NumSharp.Benchmark.GraphEngine/Benchmarks/` |
+| Infrastructure | `NumSharp.Benchmark.GraphEngine/Infrastructure/` |
+| Python benchmarks | `NumSharp.Benchmark.Python/numpy_benchmark.py` |
+| Helper scripts | `scripts/merge-results.py` |
+| Report generator | `run-benchmarks.ps1` |
+| Generated report | `benchmark-report.md` (also `README.md`) |
+| Results JSON | `benchmark-report.json` |
+| C# JSON | `NumSharp.Benchmark.GraphEngine/BenchmarkDotNet.Artifacts/results/` |
+
+### Type Shorthand
+
+| Short | Full | NumPy |
+|-------|------|-------|
+| i32 | Int32 | int32 |
+| i64 | Int64 | int64 |
+| f32 | Single | float32 |
+| f64 | Double | float64 |
+| u8 | Byte | uint8 |
+| bool | Boolean | bool |
+
+---
+
+*Last updated: 2026-02-13*
+*Benchmark suite version: 2.0 (comprehensive coverage)*
diff --git a/benchmark/NumSharp.Benchmark.Exploration/.gitignore b/benchmark/NumSharp.Benchmark.Exploration/.gitignore
new file mode 100644
index 00000000..8eaa4989
--- /dev/null
+++ b/benchmark/NumSharp.Benchmark.Exploration/.gitignore
@@ -0,0 +1,13 @@
+# Build outputs
+bin/
+obj/
+
+# Generated benchmark results (keep .gitkeep only)
+Results/*.csv
+Results/*.json
+Results/*.md
+Results/*.cs
+Results/*.txt
+
+# BenchmarkDotNet artifacts
+BenchmarkDotNet.Artifacts/
diff --git a/benchmark/NumSharp.Benchmark.Exploration/BenchmarkDotNet/BroadcastBenchmarks.cs b/benchmark/NumSharp.Benchmark.Exploration/BenchmarkDotNet/BroadcastBenchmarks.cs
new file mode 100644
index 00000000..b83f1243
--- /dev/null
+++ b/benchmark/NumSharp.Benchmark.Exploration/BenchmarkDotNet/BroadcastBenchmarks.cs
@@ -0,0 +1,261 @@
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Diagnosers;
+using NumSharp.Benchmark.Exploration.Infrastructure;
+
+namespace NumSharp.Benchmark.Exploration.BenchmarkDotNet;
+
+/// <summary>
+/// BenchmarkDotNet validation benchmarks for broadcasting scenarios.
+/// Provides statistically rigorous validation of Stopwatch results.
+/// </summary>
+[MemoryDiagnoser]
+[DisassemblyDiagnoser(maxDepth: 3)]
+public unsafe class BroadcastBenchmarks
+{
+ private double* _lhs = null!;
+ private double* _rhs = null!;
+ private double* _result = null!;
+
+ [Params(1_000, 100_000, 1_000_000, 10_000_000)]
+ public int Size { get; set; }
+
+ [GlobalSetup]
+ public void Setup()
+ {
+        _lhs = SimdImplementations.AllocateAligned<double>(Size);
+        _rhs = SimdImplementations.AllocateAligned<double>(Size);
+        _result = SimdImplementations.AllocateAligned<double>(Size);
+
+ SimdImplementations.FillRandom(_lhs, Size, 42);
+ SimdImplementations.FillRandom(_rhs, Size, 43);
+ }
+
+ [GlobalCleanup]
+ public void Cleanup()
+ {
+ SimdImplementations.FreeAligned(_lhs);
+ SimdImplementations.FreeAligned(_rhs);
+ SimdImplementations.FreeAligned(_result);
+ }
+
+ [Benchmark(Baseline = true, Description = "Scalar loop")]
+ public void ScalarLoop()
+ {
+ SimdImplementations.AddScalarLoop_Float64(_lhs, _rhs, _result, Size);
+ }
+
+ [Benchmark(Description = "SIMD Vector256")]
+ public void SimdFull()
+ {
+ SimdImplementations.AddFull_Float64(_lhs, _rhs, _result, Size);
+ }
+}
+
+/// <summary>
+/// BenchmarkDotNet benchmarks for scalar broadcast (S2).
+/// </summary>
+[MemoryDiagnoser]
+public unsafe class ScalarBroadcastBenchmarks
+{
+ private double* _lhs = null!;
+ private double* _result = null!;
+ private double _scalar = 42.5;
+
+ [Params(1_000, 100_000, 1_000_000, 10_000_000)]
+ public int Size { get; set; }
+
+ [GlobalSetup]
+ public void Setup()
+ {
+ _lhs = SimdImplementations.AllocateAligned<double>(Size);
+ _result = SimdImplementations.AllocateAligned<double>(Size);
+ SimdImplementations.FillRandom(_lhs, Size, 42);
+ }
+
+ [GlobalCleanup]
+ public void Cleanup()
+ {
+ SimdImplementations.FreeAligned(_lhs);
+ SimdImplementations.FreeAligned(_result);
+ }
+
+ [Benchmark(Baseline = true, Description = "Scalar loop")]
+ public void ScalarLoop()
+ {
+ for (int i = 0; i < Size; i++)
+ {
+ _result[i] = _lhs[i] + _scalar;
+ }
+ }
+
+ [Benchmark(Description = "SIMD Vector256")]
+ public void SimdScalar()
+ {
+ SimdImplementations.AddScalar_Float64(_lhs, _scalar, _result, Size);
+ }
+}
+
+/// <summary>
+/// BenchmarkDotNet benchmarks for row broadcast (S4).
+/// </summary>
+[MemoryDiagnoser]
+public unsafe class RowBroadcastBenchmarks
+{
+ private double* _matrix = null!;
+ private double* _row = null!;
+ private double* _result = null!;
+ private int _rows;
+ private int _cols;
+
+ [Params(100, 316, 1000, 3162)]
+ public int MatrixDim { get; set; }
+
+ [GlobalSetup]
+ public void Setup()
+ {
+ _rows = MatrixDim;
+ _cols = MatrixDim;
+ int size = _rows * _cols;
+
+ _matrix = SimdImplementations.AllocateAligned<double>(size);
+ _row = SimdImplementations.AllocateAligned<double>(_cols);
+ _result = SimdImplementations.AllocateAligned<double>(size);
+
+ SimdImplementations.FillRandom(_matrix, size, 42);
+ SimdImplementations.FillRandom(_row, _cols, 43);
+ }
+
+ [GlobalCleanup]
+ public void Cleanup()
+ {
+ SimdImplementations.FreeAligned(_matrix);
+ SimdImplementations.FreeAligned(_row);
+ SimdImplementations.FreeAligned(_result);
+ }
+
+ [Benchmark(Baseline = true, Description = "Scalar nested loops")]
+ public void ScalarLoop()
+ {
+ SimdImplementations.AddRowBroadcastScalar_Float64(_matrix, _row, _result, _rows, _cols);
+ }
+
+ [Benchmark(Description = "SIMD-CHUNK (SIMD inner)")]
+ public void SimdChunk()
+ {
+ SimdImplementations.AddRowBroadcast_Float64(_matrix, _row, _result, _rows, _cols);
+ }
+}
+
+/// <summary>
+/// BenchmarkDotNet benchmarks for different dtypes.
+/// </summary>
+[MemoryDiagnoser]
+public unsafe class DtypeBenchmarks
+{
+ private const int Size = 1_000_000;
+
+ private byte* _byteLhs = null!;
+ private byte* _byteRhs = null!;
+ private byte* _byteResult = null!;
+
+ private short* _shortLhs = null!;
+ private short* _shortRhs = null!;
+ private short* _shortResult = null!;
+
+ private int* _intLhs = null!;
+ private int* _intRhs = null!;
+ private int* _intResult = null!;
+
+ private long* _longLhs = null!;
+ private long* _longRhs = null!;
+ private long* _longResult = null!;
+
+ private float* _floatLhs = null!;
+ private float* _floatRhs = null!;
+ private float* _floatResult = null!;
+
+ private double* _doubleLhs = null!;
+ private double* _doubleRhs = null!;
+ private double* _doubleResult = null!;
+
+ [GlobalSetup]
+ public void Setup()
+ {
+ _byteLhs = SimdImplementations.AllocateAligned<byte>(Size);
+ _byteRhs = SimdImplementations.AllocateAligned<byte>(Size);
+ _byteResult = SimdImplementations.AllocateAligned<byte>(Size);
+ SimdImplementations.FillRandom(_byteLhs, Size, 42);
+ SimdImplementations.FillRandom(_byteRhs, Size, 43);
+
+ _shortLhs = SimdImplementations.AllocateAligned<short>(Size);
+ _shortRhs = SimdImplementations.AllocateAligned<short>(Size);
+ _shortResult = SimdImplementations.AllocateAligned<short>(Size);
+ SimdImplementations.FillRandom(_shortLhs, Size, 42);
+ SimdImplementations.FillRandom(_shortRhs, Size, 43);
+
+ _intLhs = SimdImplementations.AllocateAligned<int>(Size);
+ _intRhs = SimdImplementations.AllocateAligned<int>(Size);
+ _intResult = SimdImplementations.AllocateAligned<int>(Size);
+ SimdImplementations.FillRandom(_intLhs, Size, 42);
+ SimdImplementations.FillRandom(_intRhs, Size, 43);
+
+ _longLhs = SimdImplementations.AllocateAligned<long>(Size);
+ _longRhs = SimdImplementations.AllocateAligned<long>(Size);
+ _longResult = SimdImplementations.AllocateAligned<long>(Size);
+ SimdImplementations.FillRandom(_longLhs, Size, 42);
+ SimdImplementations.FillRandom(_longRhs, Size, 43);
+
+ _floatLhs = SimdImplementations.AllocateAligned<float>(Size);
+ _floatRhs = SimdImplementations.AllocateAligned<float>(Size);
+ _floatResult = SimdImplementations.AllocateAligned<float>(Size);
+ SimdImplementations.FillRandom(_floatLhs, Size, 42);
+ SimdImplementations.FillRandom(_floatRhs, Size, 43);
+
+ _doubleLhs = SimdImplementations.AllocateAligned<double>(Size);
+ _doubleRhs = SimdImplementations.AllocateAligned<double>(Size);
+ _doubleResult = SimdImplementations.AllocateAligned<double>(Size);
+ SimdImplementations.FillRandom(_doubleLhs, Size, 42);
+ SimdImplementations.FillRandom(_doubleRhs, Size, 43);
+ }
+
+ [GlobalCleanup]
+ public void Cleanup()
+ {
+ SimdImplementations.FreeAligned(_byteLhs);
+ SimdImplementations.FreeAligned(_byteRhs);
+ SimdImplementations.FreeAligned(_byteResult);
+ SimdImplementations.FreeAligned(_shortLhs);
+ SimdImplementations.FreeAligned(_shortRhs);
+ SimdImplementations.FreeAligned(_shortResult);
+ SimdImplementations.FreeAligned(_intLhs);
+ SimdImplementations.FreeAligned(_intRhs);
+ SimdImplementations.FreeAligned(_intResult);
+ SimdImplementations.FreeAligned(_longLhs);
+ SimdImplementations.FreeAligned(_longRhs);
+ SimdImplementations.FreeAligned(_longResult);
+ SimdImplementations.FreeAligned(_floatLhs);
+ SimdImplementations.FreeAligned(_floatRhs);
+ SimdImplementations.FreeAligned(_floatResult);
+ SimdImplementations.FreeAligned(_doubleLhs);
+ SimdImplementations.FreeAligned(_doubleRhs);
+ SimdImplementations.FreeAligned(_doubleResult);
+ }
+
+ [Benchmark(Description = "byte SIMD")]
+ public void ByteSimd() => SimdImplementations.AddFull_Byte(_byteLhs, _byteRhs, _byteResult, Size);
+
+ [Benchmark(Description = "int16 SIMD")]
+ public void Int16Simd() => SimdImplementations.AddFull_Int16(_shortLhs, _shortRhs, _shortResult, Size);
+
+ [Benchmark(Description = "int32 SIMD")]
+ public void Int32Simd() => SimdImplementations.AddFull_Int32(_intLhs, _intRhs, _intResult, Size);
+
+ [Benchmark(Description = "int64 SIMD")]
+ public void Int64Simd() => SimdImplementations.AddFull_Int64(_longLhs, _longRhs, _longResult, Size);
+
+ [Benchmark(Description = "float32 SIMD")]
+ public void Float32Simd() => SimdImplementations.AddFull_Float32(_floatLhs, _floatRhs, _floatResult, Size);
+
+ [Benchmark(Description = "float64 SIMD")]
+ public void Float64Simd() => SimdImplementations.AddFull_Float64(_doubleLhs, _doubleRhs, _doubleResult, Size);
+}
diff --git a/benchmark/NumSharp.Benchmark.Exploration/Infrastructure/BenchFramework.cs b/benchmark/NumSharp.Benchmark.Exploration/Infrastructure/BenchFramework.cs
new file mode 100644
index 00000000..d6a9c386
--- /dev/null
+++ b/benchmark/NumSharp.Benchmark.Exploration/Infrastructure/BenchFramework.cs
@@ -0,0 +1,328 @@
+using System.Diagnostics;
+using System.Runtime;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace NumSharp.Benchmark.Exploration.Infrastructure;
+
+/// <summary>
+/// Lightweight Stopwatch-based benchmark runner for fast iteration during exploration.
+/// Provides consistent measurement methodology with warmup, GC control, and statistics.
+/// </summary>
+public static class BenchFramework
+{
+ /// Default warmup iterations to trigger JIT compilation.
+ public const int DefaultWarmup = 5;
+
+ /// Default measurement iterations.
+ public const int DefaultMeasure = 20;
+
+ /// Quick mode: still enough iterations for stable results.
+ public const int QuickMeasure = 10;
+
+ /// <summary>
+ /// Run a benchmark with specified warmup and measurement iterations.
+ /// </summary>
+ /// <param name="action">The action to benchmark (should be self-contained).</param>
+ /// <param name="scenario">Scenario identifier (e.g., "S1_contiguous").</param>
+ /// <param name="strategy">Strategy identifier (e.g., "SIMD-FULL").</param>
+ /// <param name="dtype">Data type name (e.g., "float64").</param>
+ /// <param name="size">Number of elements processed.</param>
+ /// <param name="elementBytes">Size of each element in bytes.</param>
+ /// <param name="warmup">Number of warmup iterations (default: 5).</param>
+ /// <param name="measure">Number of measurement iterations (default: 20).</param>
+ /// <param name="suite">Suite name for grouping.</param>
+ /// <returns>Benchmark result with statistics.</returns>
+ public static BenchResult Run(
+ Action action,
+ string scenario,
+ string strategy,
+ string dtype,
+ int size,
+ int elementBytes,
+ int warmup = DefaultWarmup,
+ int measure = DefaultMeasure,
+ string suite = "")
+ {
+ // Force full GC before test to minimize interference
+ ForceGC();
+
+ // Warmup phase - trigger JIT, fill caches
+ for (int i = 0; i < warmup; i++)
+ {
+ action();
+ }
+
+ // Let any background GC settle
+ Thread.Sleep(1);
+
+ // Measurement phase
+ var times = new double[measure];
+ for (int i = 0; i < measure; i++)
+ {
+ var sw = Stopwatch.StartNew();
+ action();
+ sw.Stop();
+ times[i] = sw.Elapsed.TotalMicroseconds;
+ }
+
+ // Calculate statistics
+ var mean = times.Average();
+ var variance = times.Select(t => (t - mean) * (t - mean)).Average();
+ var stddev = Math.Sqrt(variance);
+ var min = times.Min();
+ var max = times.Max();
+ var gbps = BenchResult.CalculateGBps(size, elementBytes, mean);
+
+ return new BenchResult
+ {
+ Scenario = scenario,
+ Strategy = strategy,
+ Dtype = dtype,
+ Size = size,
+ MeanUs = mean,
+ StdDevUs = stddev,
+ MinUs = min,
+ MaxUs = max,
+ GBps = gbps,
+ Reps = measure,
+ Timestamp = DateTime.UtcNow.ToString("O"),
+ Suite = suite
+ };
+ }
+
+ /// <summary>
+ /// Run a benchmark that returns a result (to prevent dead code elimination).
+ /// </summary>
+ public static BenchResult Run<T>(
+ Func<T> func,
+ string scenario,
+ string strategy,
+ string dtype,
+ int size,
+ int elementBytes,
+ int warmup = DefaultWarmup,
+ int measure = DefaultMeasure,
+ string suite = "")
+ {
+ T result = default!;
+ // Use block lambda to ensure it resolves as Action, not Func
+ // (assignment expressions return a value, which would cause infinite recursion)
+ return Run(
+ () => { result = func(); },
+ scenario, strategy, dtype, size, elementBytes, warmup, measure, suite);
+ }
+
+ ///
+ /// Run a benchmark with a setup action that runs before each measurement.
+ /// Useful when the benchmark modifies its inputs (e.g., in-place operations).
+ ///
+ public static BenchResult RunWithSetup(
+ Action setup,
+ Action benchmark,
+ string scenario,
+ string strategy,
+ string dtype,
+ int size,
+ int elementBytes,
+ int warmup = DefaultWarmup,
+ int measure = DefaultMeasure,
+ string suite = "")
+ {
+ ForceGC();
+
+ // Warmup
+ for (int i = 0; i < warmup; i++)
+ {
+ setup();
+ benchmark();
+ }
+
+ Thread.Sleep(1);
+
+ // Measurement
+ var times = new double[measure];
+ for (int i = 0; i < measure; i++)
+ {
+ setup();
+
+ var sw = Stopwatch.StartNew();
+ benchmark();
+ sw.Stop();
+
+ times[i] = sw.Elapsed.TotalMicroseconds;
+ }
+
+ var mean = times.Average();
+ var variance = times.Select(t => (t - mean) * (t - mean)).Average();
+ var stddev = Math.Sqrt(variance);
+
+ return new BenchResult
+ {
+ Scenario = scenario,
+ Strategy = strategy,
+ Dtype = dtype,
+ Size = size,
+ MeanUs = mean,
+ StdDevUs = stddev,
+ MinUs = times.Min(),
+ MaxUs = times.Max(),
+ GBps = BenchResult.CalculateGBps(size, elementBytes, mean),
+ Reps = measure,
+ Timestamp = DateTime.UtcNow.ToString("O"),
+ Suite = suite
+ };
+ }
+
+ ///
+ /// Run a comparative benchmark with baseline and optimized versions.
+ ///
+ public static (BenchResult Baseline, BenchResult Optimized, double Speedup) RunComparison(
+ Action baseline,
+ Action optimized,
+ string scenario,
+ string dtype,
+ int size,
+ int elementBytes,
+ int warmup = DefaultWarmup,
+ int measure = DefaultMeasure,
+ string suite = "")
+ {
+ var baseResult = Run(baseline, scenario, "Baseline", dtype, size, elementBytes, warmup, measure, suite);
+ var optResult = Run(optimized, scenario, "Optimized", dtype, size, elementBytes, warmup, measure, suite);
+
+ var speedup = baseResult.MeanUs / optResult.MeanUs;
+
+ return (
+ baseResult,
+ optResult with { SpeedupVsBaseline = speedup },
+ speedup
+ );
+ }
+
+ ///
+ /// Force a full garbage collection to reduce interference.
+ ///
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ public static void ForceGC()
+ {
+ GC.Collect(GC.MaxGeneration, GCCollectionMode.Forced, blocking: true);
+ GC.WaitForPendingFinalizers();
+ GC.Collect(GC.MaxGeneration, GCCollectionMode.Forced, blocking: true);
+ }
+
+ ///
+ /// Print a formatted header for console output.
+ ///
+ public static void PrintHeader(string title)
+ {
+ Console.WriteLine();
+ Console.WriteLine(new string('=', 80));
+ Console.WriteLine($" {title}");
+ Console.WriteLine(new string('=', 80));
+ Console.WriteLine();
+ Console.WriteLine($"{"Scenario",-12} {"Strategy",-10} {"Dtype",-8} {"Size",12} | {"Mean (us)",12} ± {"StdDev",8} | {"GB/s",8}");
+ Console.WriteLine(new string('-', 80));
+ }
+
+ ///
+ /// Print a result line.
+ ///
+ public static void PrintResult(BenchResult result)
+ {
+ Console.WriteLine(result.ToString());
+ }
+
+ ///
+ /// Print a section divider.
+ ///
+ public static void PrintDivider(string? label = null)
+ {
+ if (label != null)
+ {
+ Console.WriteLine($"\n--- {label} ---");
+ }
+ else
+ {
+ Console.WriteLine(new string('-', 80));
+ }
+ }
+
+ ///
+ /// Print environment information.
+ ///
+ public static void PrintEnvironment()
+ {
+ Console.WriteLine("Environment:");
+ Console.WriteLine($" .NET Version: {RuntimeInformation.FrameworkDescription}");
+ Console.WriteLine($" OS: {RuntimeInformation.OSDescription}");
+ Console.WriteLine($" Architecture: {RuntimeInformation.ProcessArchitecture}");
+ Console.WriteLine($" Processor Count: {Environment.ProcessorCount}");
+ Console.WriteLine($" SIMD Enabled: {System.Numerics.Vector.IsHardwareAccelerated}");
+ Console.WriteLine($" Vector256 Supported: {System.Runtime.Intrinsics.X86.Avx2.IsSupported}");
+ Console.WriteLine();
+ }
+}
+
+/// <summary>
+/// Standard array sizes for benchmarking.
+/// </summary>
+public static class ArraySizes
+{
+ /// Tiny (32 elements, ~256 B for float64); fits trivially in L1 cache.
+ public const int Tiny = 32;
+
+ /// Fits in L1 cache (~8 KB for float64).
+ public const int Small = 1_000;
+
+ /// Fits in L2/L3 cache (~800 KB for float64).
+ public const int Medium = 100_000;
+
+ /// Exceeds L3 cache (~8 MB for float64).
+ public const int Large = 1_000_000;
+
+ /// Memory-bound (~80 MB for float64).
+ public const int Huge = 10_000_000;
+
+ /// Large memory-bound (~800 MB for float64).
+ public const int Massive = 100_000_000;
+
+ /// Standard sizes for systematic testing.
+ public static readonly int[] Standard = [Small, Medium, Large, Huge];
+
+ /// Quick sizes for fast iteration.
+ public static readonly int[] Quick = [Medium, Huge];
+
+ /// Threshold discovery sizes (powers of 2).
+ public static readonly int[] Thresholds = [
+ 8, 16, 32, 64, 128, 256, 512,
+ 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072
+ ];
+
+ /// All sizes including edge cases.
+ public static readonly int[] All = [Tiny, Small, Medium, Large, Huge, Massive];
+}
+
+/// <summary>
+/// Standard dtypes for benchmarking.
+/// </summary>
+public static class Dtypes
+{
+ /// All numeric types to test.
+ public static readonly string[] All = ["byte", "int16", "int32", "int64", "float32", "float64"];
+
+ /// Common types for quick tests.
+ public static readonly string[] Common = ["int32", "float64"];
+
+ /// All integer types.
+ public static readonly string[] Integer = ["byte", "int16", "int32", "int64"];
+
+ /// All floating types.
+ public static readonly string[] Float = ["float32", "float64"];
+
+ /// Types most benefiting from SIMD (more elements per vector).
+ public static readonly string[] SmallElements = ["byte", "int16"];
+
+ /// Types where SIMD benefit is smaller.
+ public static readonly string[] LargeElements = ["int64", "float64"];
+}
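+
+// Illustrative usage only (not part of the original suite): a minimal sketch of how BenchFramework.RunComparison
+// might be wired to the SimdImplementations helpers in this project. The class and method names below are
+// assumptions added purely for documentation.
+internal static class BenchFrameworkUsageSketch
+{
+    public static unsafe void CompareS1Strategies()
+    {
+        const int size = ArraySizes.Medium;
+        var lhs = SimdImplementations.AllocateAligned<double>(size);
+        var rhs = SimdImplementations.AllocateAligned<double>(size);
+        var dst = SimdImplementations.AllocateAligned<double>(size);
+        SimdImplementations.FillRandom(lhs, size, 42);
+        SimdImplementations.FillRandom(rhs, size, 43);
+
+        // Baseline scalar loop vs SIMD-FULL; speedup is baseline mean / optimized mean.
+        var (baseline, optimized, speedup) = BenchFramework.RunComparison(
+            () => SimdImplementations.AddScalarLoop_Float64(lhs, rhs, dst, size),
+            () => SimdImplementations.AddFull_Float64(lhs, rhs, dst, size),
+            scenario: "S1_contiguous", dtype: "float64", size: size, elementBytes: sizeof(double));
+
+        BenchFramework.PrintHeader("S1 contiguous: scalar vs SIMD");
+        BenchFramework.PrintResult(baseline);
+        BenchFramework.PrintResult(optimized);
+        Console.WriteLine($"Speedup: {speedup:F2}x");
+
+        SimdImplementations.FreeAligned(lhs);
+        SimdImplementations.FreeAligned(rhs);
+        SimdImplementations.FreeAligned(dst);
+    }
+}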
diff --git a/benchmark/NumSharp.Benchmark.Exploration/Infrastructure/BenchResult.cs b/benchmark/NumSharp.Benchmark.Exploration/Infrastructure/BenchResult.cs
new file mode 100644
index 00000000..727756e1
--- /dev/null
+++ b/benchmark/NumSharp.Benchmark.Exploration/Infrastructure/BenchResult.cs
@@ -0,0 +1,124 @@
+using System.Text.Json.Serialization;
+
+namespace NumSharp.Benchmark.Exploration.Infrastructure;
+
+/// <summary>
+/// Result of a single benchmark run.
+/// </summary>
+public record BenchResult
+{
+ /// Broadcasting scenario identifier (S1-S7).
+ public required string Scenario { get; init; }
+
+ /// SIMD strategy used (FULL, SCALAR, CHUNK, GATHER, LOOP).
+ public required string Strategy { get; init; }
+
+ /// Data type tested (byte, int16, int32, int64, float32, float64).
+ public required string Dtype { get; init; }
+
+ /// Number of elements in the array.
+ public required int Size { get; init; }
+
+ /// Mean execution time in microseconds.
+ public required double MeanUs { get; init; }
+
+ /// Standard deviation in microseconds.
+ public required double StdDevUs { get; init; }
+
+ /// Minimum execution time in microseconds.
+ public required double MinUs { get; init; }
+
+ /// Maximum execution time in microseconds.
+ public required double MaxUs { get; init; }
+
+ /// Throughput in GB/s (based on read+write bytes).
+ public required double GBps { get; init; }
+
+ /// Number of measurement repetitions.
+ public required int Reps { get; init; }
+
+ /// ISO 8601 timestamp when benchmark was run.
+ public required string Timestamp { get; init; }
+
+ /// Suite name for grouping.
+ public string Suite { get; init; } = "";
+
+ /// Additional notes or flags.
+ public string Notes { get; init; } = "";
+
+ /// Baseline comparison ratio (this / baseline).
+ [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
+ public double? SpeedupVsBaseline { get; init; }
+
+ /// <summary>
+ /// Calculate throughput given element count and element size.
+ /// Assumes read LHS + read RHS + write result = 3 * elements * elementSize bytes.
+ /// </summary>
+ public static double CalculateGBps(int elements, int elementBytes, double microseconds)
+ {
+ // Binary op: read 2 inputs + write 1 output = 3 arrays
+ var totalBytes = (long)elements * elementBytes * 3;
+ var seconds = microseconds / 1_000_000.0;
+ return seconds > 0 ? (totalBytes / 1e9) / seconds : 0;
+ }
+
+ ///
+ /// Create a formatted one-line summary for console output.
+ ///
+ public override string ToString()
+ {
+ var speedup = SpeedupVsBaseline.HasValue ? $" ({SpeedupVsBaseline:F2}x)" : "";
+ return $"{Scenario,-12} {Strategy,-10} {Dtype,-8} {Size,12:N0} | {MeanUs,12:F2} us ± {StdDevUs,8:F2} | {GBps,8:F2} GB/s{speedup}";
+ }
+}
+
+/// <summary>
+/// Aggregated results for multiple scenarios/dtypes/sizes.
+/// </summary>
+public class BenchResultSet
+{
+ public required string SuiteName { get; init; }
+ public required string Description { get; init; }
+ public required DateTime StartTime { get; init; }
+ public DateTime EndTime { get; set; }
+ public required string CpuModel { get; init; }
+ public required string DotNetVersion { get; init; }
+ public List<BenchResult> Results { get; } = new();
+
+ public TimeSpan Duration => EndTime - StartTime;
+
+ ///
+ /// Add a result and return this for fluent chaining.
+ ///
+ public BenchResultSet Add(BenchResult result)
+ {
+ Results.Add(result);
+ return this;
+ }
+}
+
+/// <summary>
+/// Element size lookup for each dtype.
+/// </summary>
+public static class DtypeInfo
+{
+ public static int GetElementSize(string dtype) => dtype.ToLowerInvariant() switch
+ {
+ "byte" or "uint8" or "bool" or "boolean" => 1,
+ "int16" or "short" or "uint16" or "ushort" => 2,
+ "int32" or "int" or "uint32" or "uint" or "float32" or "single" or "float" => 4,
+ "int64" or "long" or "uint64" or "ulong" or "float64" or "double" => 8,
+ "decimal" => 16,
+ _ => throw new ArgumentException($"Unknown dtype: {dtype}")
+ };
+
+ ///
+ /// Get the number of elements that fit in a Vector256.
+ ///
+ public static int GetVector256Count(string dtype) => 32 / GetElementSize(dtype);
+
+ ///
+ /// Get the number of elements that fit in a Vector128.
+ ///
+ public static int GetVector128Count(string dtype) => 16 / GetElementSize(dtype);
+}
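+
+// Illustrative check only (not part of the original file): demonstrates the throughput formula used by
+// BenchResult.CalculateGBps and the per-vector lane counts from DtypeInfo. The class name is an assumption
+// added purely for documentation.
+internal static class BenchResultFormulaSketch
+{
+    public static void Demo()
+    {
+        // 1M float64 elements in 1000 us: 3 arrays * 1_000_000 * 8 bytes over 1 ms = 24 GB/s.
+        double gbps = BenchResult.CalculateGBps(elements: 1_000_000, elementBytes: 8, microseconds: 1_000);
+        Console.WriteLine($"{gbps:F1} GB/s"); // 24.0
+
+        // Vector256 is 32 bytes wide: 4 float64 lanes vs 32 byte lanes.
+        Console.WriteLine(DtypeInfo.GetVector256Count("float64")); // 4
+        Console.WriteLine(DtypeInfo.GetVector256Count("byte"));    // 32
+    }
+}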
diff --git a/benchmark/NumSharp.Benchmark.Exploration/Infrastructure/OutputFormatters.cs b/benchmark/NumSharp.Benchmark.Exploration/Infrastructure/OutputFormatters.cs
new file mode 100644
index 00000000..251e2480
--- /dev/null
+++ b/benchmark/NumSharp.Benchmark.Exploration/Infrastructure/OutputFormatters.cs
@@ -0,0 +1,281 @@
+using System.Globalization;
+using System.Text;
+using System.Text.Json;
+using System.Text.Json.Serialization;
+
+namespace NumSharp.Benchmark.Exploration.Infrastructure;
+
+/// <summary>
+/// Output formatters for benchmark results.
+/// Supports CSV, JSON, and Markdown table formats.
+/// </summary>
+public static class OutputFormatters
+{
+ private static readonly JsonSerializerOptions JsonOptions = new()
+ {
+ WriteIndented = true,
+ PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
+ DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull
+ };
+
+ /// <summary>
+ /// Export results to a CSV file.
+ /// </summary>
+ public static void ExportCsv(IEnumerable<BenchResult> results, string filePath)
+ {
+ var sb = new StringBuilder();
+
+ // Header
+ sb.AppendLine("Scenario,Strategy,Dtype,Size,MeanUs,StdDevUs,MinUs,MaxUs,GBps,Reps,Timestamp,Suite,Notes,SpeedupVsBaseline");
+
+ // Data rows
+ foreach (var r in results)
+ {
+ sb.AppendLine(string.Join(",",
+ Escape(r.Scenario),
+ Escape(r.Strategy),
+ Escape(r.Dtype),
+ r.Size.ToString(CultureInfo.InvariantCulture),
+ r.MeanUs.ToString("F4", CultureInfo.InvariantCulture),
+ r.StdDevUs.ToString("F4", CultureInfo.InvariantCulture),
+ r.MinUs.ToString("F4", CultureInfo.InvariantCulture),
+ r.MaxUs.ToString("F4", CultureInfo.InvariantCulture),
+ r.GBps.ToString("F4", CultureInfo.InvariantCulture),
+ r.Reps.ToString(CultureInfo.InvariantCulture),
+ Escape(r.Timestamp),
+ Escape(r.Suite),
+ Escape(r.Notes),
+ r.SpeedupVsBaseline?.ToString("F4", CultureInfo.InvariantCulture) ?? ""
+ ));
+ }
+
+ Directory.CreateDirectory(Path.GetDirectoryName(filePath)!);
+ File.WriteAllText(filePath, sb.ToString());
+ Console.WriteLine($"CSV exported to: {filePath}");
+ }
+
+ /// <summary>
+ /// Export results to a JSON file.
+ /// </summary>
+ public static void ExportJson(BenchResultSet resultSet, string filePath)
+ {
+ var json = JsonSerializer.Serialize(resultSet, JsonOptions);
+ Directory.CreateDirectory(Path.GetDirectoryName(filePath)!);
+ File.WriteAllText(filePath, json);
+ Console.WriteLine($"JSON exported to: {filePath}");
+ }
+
+ /// <summary>
+ /// Export results to a JSON file (simple array format).
+ /// </summary>
+ public static void ExportJson(IEnumerable<BenchResult> results, string filePath)
+ {
+ var json = JsonSerializer.Serialize(results.ToList(), JsonOptions);
+ Directory.CreateDirectory(Path.GetDirectoryName(filePath)!);
+ File.WriteAllText(filePath, json);
+ Console.WriteLine($"JSON exported to: {filePath}");
+ }
+
+ /// <summary>
+ /// Generate a Markdown table from results.
+ /// </summary>
+ public static string ToMarkdownTable(IEnumerable<BenchResult> results, string title = "")
+ {
+ var sb = new StringBuilder();
+
+ if (!string.IsNullOrEmpty(title))
+ {
+ sb.AppendLine($"## {title}");
+ sb.AppendLine();
+ }
+
+ // Header
+ sb.AppendLine("| Scenario | Strategy | Dtype | Size | Mean (us) | StdDev | GB/s | Speedup |");
+ sb.AppendLine("|----------|----------|-------|-----:|----------:|-------:|-----:|--------:|");
+
+ // Data rows
+ foreach (var r in results)
+ {
+ var speedup = r.SpeedupVsBaseline.HasValue ? $"{r.SpeedupVsBaseline:F2}x" : "-";
+ sb.AppendLine($"| {r.Scenario} | {r.Strategy} | {r.Dtype} | {r.Size:N0} | {r.MeanUs:F2} | {r.StdDevUs:F2} | {r.GBps:F2} | {speedup} |");
+ }
+
+ return sb.ToString();
+ }
+
+ /// <summary>
+ /// Generate a compact comparison table (baseline vs optimized).
+ /// </summary>
+ public static string ToComparisonTable(IEnumerable<BenchResult> results, string title = "")
+ {
+ var sb = new StringBuilder();
+
+ if (!string.IsNullOrEmpty(title))
+ {
+ sb.AppendLine($"## {title}");
+ sb.AppendLine();
+ }
+
+ // Group by scenario/dtype/size, show baseline and optimized side-by-side
+ var grouped = results
+ .GroupBy(r => (r.Scenario, r.Dtype, r.Size))
+ .OrderBy(g => g.Key.Scenario)
+ .ThenBy(g => g.Key.Dtype)
+ .ThenBy(g => g.Key.Size);
+
+ sb.AppendLine("| Scenario | Dtype | Size | Baseline (us) | Optimized (us) | Speedup |");
+ sb.AppendLine("|----------|-------|-----:|--------------:|---------------:|--------:|");
+
+ foreach (var g in grouped)
+ {
+ var baseline = g.FirstOrDefault(r => r.Strategy == "Baseline");
+ var optimized = g.FirstOrDefault(r => r.Strategy != "Baseline" && r.SpeedupVsBaseline.HasValue);
+
+ if (baseline != null && optimized != null)
+ {
+ var icon = optimized.SpeedupVsBaseline switch
+ {
+ > 2.0 => " ✅",
+ > 1.2 => " 🟡",
+ > 1.0 => " 🟢",
+ _ => " 🔴"
+ };
+
+ sb.AppendLine($"| {g.Key.Scenario} | {g.Key.Dtype} | {g.Key.Size:N0} | {baseline.MeanUs:F2} | {optimized.MeanUs:F2} | {optimized.SpeedupVsBaseline:F2}x{icon} |");
+ }
+ }
+
+ return sb.ToString();
+ }
+
+ /// <summary>
+ /// Generate a pivot table with dtypes as columns and sizes as rows.
+ /// </summary>
+ public static string ToPivotTable(IEnumerable<BenchResult> results, string scenario, string metric = "speedup")
+ {
+ var filtered = results.Where(r => r.Scenario == scenario && r.SpeedupVsBaseline.HasValue).ToList();
+ if (!filtered.Any()) return "";
+
+ var sb = new StringBuilder();
+ sb.AppendLine($"### {scenario} - Speedup Matrix");
+ sb.AppendLine();
+
+ var dtypes = filtered.Select(r => r.Dtype).Distinct().OrderBy(d => d).ToList();
+ var sizes = filtered.Select(r => r.Size).Distinct().OrderBy(s => s).ToList();
+
+ // Header
+ sb.Append("| Size |");
+ foreach (var d in dtypes) sb.Append($" {d} |");
+ sb.AppendLine();
+
+ sb.Append("|-----:|");
+ foreach (var _ in dtypes) sb.Append("------:|");
+ sb.AppendLine();
+
+ // Rows
+ foreach (var size in sizes)
+ {
+ sb.Append($"| {size:N0} |");
+ foreach (var dtype in dtypes)
+ {
+ var result = filtered.FirstOrDefault(r => r.Dtype == dtype && r.Size == size);
+ if (result != null)
+ {
+ var value = metric switch
+ {
+ "speedup" => $"{result.SpeedupVsBaseline:F2}x",
+ "mean" => $"{result.MeanUs:F1}",
+ "gbps" => $"{result.GBps:F1}",
+ _ => "-"
+ };
+ sb.Append($" {value} |");
+ }
+ else
+ {
+ sb.Append(" - |");
+ }
+ }
+ sb.AppendLine();
+ }
+
+ return sb.ToString();
+ }
+
+ ///
+ /// Export results to Markdown file.
+ ///
+ public static void ExportMarkdown(BenchResultSet resultSet, string filePath)
+ {
+ var sb = new StringBuilder();
+
+ sb.AppendLine($"# {resultSet.SuiteName}");
+ sb.AppendLine();
+ sb.AppendLine($"_{resultSet.Description}_");
+ sb.AppendLine();
+ sb.AppendLine("## Environment");
+ sb.AppendLine();
+ sb.AppendLine($"- **CPU**: {resultSet.CpuModel}");
+ sb.AppendLine($"- **.NET**: {resultSet.DotNetVersion}");
+ sb.AppendLine($"- **Start**: {resultSet.StartTime:u}");
+ sb.AppendLine($"- **Duration**: {resultSet.Duration:hh\\:mm\\:ss}");
+ sb.AppendLine();
+
+ // Group results by suite
+ var grouped = resultSet.Results.GroupBy(r => r.Suite).OrderBy(g => g.Key);
+
+ foreach (var group in grouped)
+ {
+ var suiteName = string.IsNullOrEmpty(group.Key) ? "Results" : group.Key;
+ sb.AppendLine(ToMarkdownTable(group, suiteName));
+ sb.AppendLine();
+ }
+
+ Directory.CreateDirectory(Path.GetDirectoryName(filePath)!);
+ File.WriteAllText(filePath, sb.ToString());
+ Console.WriteLine($"Markdown exported to: {filePath}");
+ }
+
+ /// <summary>
+ /// Print a summary of results to console.
+ /// </summary>
+ public static void PrintSummary(IEnumerable<BenchResult> results)
+ {
+ var list = results.ToList();
+
+ Console.WriteLine();
+ Console.WriteLine("=== Summary ===");
+ Console.WriteLine($"Total benchmarks: {list.Count}");
+
+ var withSpeedup = list.Where(r => r.SpeedupVsBaseline.HasValue).ToList();
+ if (withSpeedup.Any())
+ {
+ var avgSpeedup = withSpeedup.Average(r => r.SpeedupVsBaseline!.Value);
+ var maxSpeedup = withSpeedup.Max(r => r.SpeedupVsBaseline!.Value);
+ var minSpeedup = withSpeedup.Min(r => r.SpeedupVsBaseline!.Value);
+ var wins = withSpeedup.Count(r => r.SpeedupVsBaseline > 1.0);
+
+ Console.WriteLine($"Comparisons: {withSpeedup.Count}");
+ Console.WriteLine($" Wins (speedup > 1.0): {wins} ({100.0 * wins / withSpeedup.Count:F1}%)");
+ Console.WriteLine($" Avg speedup: {avgSpeedup:F2}x");
+ Console.WriteLine($" Max speedup: {maxSpeedup:F2}x");
+ Console.WriteLine($" Min speedup: {minSpeedup:F2}x");
+
+ // Best result
+ var best = withSpeedup.OrderByDescending(r => r.SpeedupVsBaseline).First();
+ Console.WriteLine($" Best: {best.Scenario} {best.Dtype} {best.Size:N0} = {best.SpeedupVsBaseline:F2}x");
+
+ // Worst result
+ var worst = withSpeedup.OrderBy(r => r.SpeedupVsBaseline).First();
+ Console.WriteLine($" Worst: {worst.Scenario} {worst.Dtype} {worst.Size:N0} = {worst.SpeedupVsBaseline:F2}x");
+ }
+ }
+
+ private static string Escape(string s)
+ {
+ if (s.Contains(',') || s.Contains('"') || s.Contains('\n'))
+ {
+ return $"\"{s.Replace("\"", "\"\"")}\"";
+ }
+ return s;
+ }
+}
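+
+// Illustrative usage only (not part of the original file): a minimal sketch of the export flow — print a
+// console summary, then write CSV and a Markdown table side by side. The class name and output paths are
+// assumptions added purely for documentation.
+internal static class OutputFormattersUsageSketch
+{
+    public static void Export(List<BenchResult> results)
+    {
+        OutputFormatters.PrintSummary(results);
+
+        // ExportCsv creates the Results/ directory before writing, so the Markdown write below can reuse it.
+        OutputFormatters.ExportCsv(results, Path.Combine("Results", "exploration.csv"));
+        File.WriteAllText(
+            Path.Combine("Results", "exploration.md"),
+            OutputFormatters.ToMarkdownTable(results, "Exploration results"));
+    }
+}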
diff --git a/benchmark/NumSharp.Benchmark.Exploration/Infrastructure/SimdImplementations.cs b/benchmark/NumSharp.Benchmark.Exploration/Infrastructure/SimdImplementations.cs
new file mode 100644
index 00000000..4f745206
--- /dev/null
+++ b/benchmark/NumSharp.Benchmark.Exploration/Infrastructure/SimdImplementations.cs
@@ -0,0 +1,632 @@
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace NumSharp.Benchmark.Exploration.Infrastructure;
+
+/// <summary>
+/// SIMD implementations for benchmarking different strategies.
+/// These are raw implementations without NumSharp overhead for isolated testing.
+/// </summary>
+public static unsafe class SimdImplementations
+{
+ #region SIMD-FULL: Contiguous arrays, same shape
+
+ ///
+ /// Add two contiguous float64 arrays using Vector256.
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static void AddFull_Float64(double* lhs, double* rhs, double* result, int count)
+ {
+ int i = 0;
+
+ if (Avx2.IsSupported && count >= Vector256<double>.Count)
+ {
+ int vectorCount = count - Vector256<double>.Count + 1;
+ for (; i < vectorCount; i += Vector256<double>.Count)
+ {
+ var va = Avx.LoadVector256(lhs + i);
+ var vb = Avx.LoadVector256(rhs + i);
+ var vr = Avx.Add(va, vb);
+ Avx.Store(result + i, vr);
+ }
+ }
+
+ // Scalar tail
+ for (; i < count; i++)
+ {
+ result[i] = lhs[i] + rhs[i];
+ }
+ }
+
+ ///
+ /// Add two contiguous float32 arrays using Vector256.
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static void AddFull_Float32(float* lhs, float* rhs, float* result, int count)
+ {
+ int i = 0;
+
+ if (Avx2.IsSupported && count >= Vector256<float>.Count)
+ {
+ int vectorCount = count - Vector256<float>.Count + 1;
+ for (; i < vectorCount; i += Vector256<float>.Count)
+ {
+ var va = Avx.LoadVector256(lhs + i);
+ var vb = Avx.LoadVector256(rhs + i);
+ var vr = Avx.Add(va, vb);
+ Avx.Store(result + i, vr);
+ }
+ }
+
+ for (; i < count; i++)
+ {
+ result[i] = lhs[i] + rhs[i];
+ }
+ }
+
+ ///
+ /// Add two contiguous int32 arrays using Vector256.
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static void AddFull_Int32(int* lhs, int* rhs, int* result, int count)
+ {
+ int i = 0;
+
+ if (Avx2.IsSupported && count >= Vector256<int>.Count)
+ {
+ int vectorCount = count - Vector256<int>.Count + 1;
+ for (; i < vectorCount; i += Vector256<int>.Count)
+ {
+ var va = Avx2.LoadVector256(lhs + i);
+ var vb = Avx2.LoadVector256(rhs + i);
+ var vr = Avx2.Add(va, vb);
+ Avx2.Store(result + i, vr);
+ }
+ }
+
+ for (; i < count; i++)
+ {
+ result[i] = lhs[i] + rhs[i];
+ }
+ }
+
+ ///
+ /// Add two contiguous int64 arrays using Vector256.
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static void AddFull_Int64(long* lhs, long* rhs, long* result, int count)
+ {
+ int i = 0;
+
+ if (Avx2.IsSupported && count >= Vector256<long>.Count)
+ {
+ int vectorCount = count - Vector256<long>.Count + 1;
+ for (; i < vectorCount; i += Vector256<long>.Count)
+ {
+ var va = Avx2.LoadVector256(lhs + i);
+ var vb = Avx2.LoadVector256(rhs + i);
+ var vr = Avx2.Add(va, vb);
+ Avx2.Store(result + i, vr);
+ }
+ }
+
+ for (; i < count; i++)
+ {
+ result[i] = lhs[i] + rhs[i];
+ }
+ }
+
+ ///
+ /// Add two contiguous int16 arrays using Vector256.
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static void AddFull_Int16(short* lhs, short* rhs, short* result, int count)
+ {
+ int i = 0;
+
+ if (Avx2.IsSupported && count >= Vector256<short>.Count)
+ {
+ int vectorCount = count - Vector256<short>.Count + 1;
+ for (; i < vectorCount; i += Vector256<short>.Count)
+ {
+ var va = Avx2.LoadVector256(lhs + i);
+ var vb = Avx2.LoadVector256(rhs + i);
+ var vr = Avx2.Add(va, vb);
+ Avx2.Store(result + i, vr);
+ }
+ }
+
+ for (; i < count; i++)
+ {
+ result[i] = (short)(lhs[i] + rhs[i]);
+ }
+ }
+
+ ///
+ /// Add two contiguous byte arrays using Vector256.
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static void AddFull_Byte(byte* lhs, byte* rhs, byte* result, int count)
+ {
+ int i = 0;
+
+ if (Avx2.IsSupported && count >= Vector256<byte>.Count)
+ {
+ int vectorCount = count - Vector256<byte>.Count + 1;
+ for (; i < vectorCount; i += Vector256<byte>.Count)
+ {
+ var va = Avx2.LoadVector256(lhs + i);
+ var vb = Avx2.LoadVector256(rhs + i);
+ var vr = Avx2.Add(va, vb);
+ Avx2.Store(result + i, vr);
+ }
+ }
+
+ for (; i < count; i++)
+ {
+ result[i] = (byte)(lhs[i] + rhs[i]);
+ }
+ }
+
+ #endregion
+
+ #region SIMD-SCALAR: One operand is a scalar
+
+ ///
+ /// Add a scalar to a contiguous float64 array using Vector256.
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static void AddScalar_Float64(double* lhs, double scalar, double* result, int count)
+ {
+ int i = 0;
+
+ if (Avx2.IsSupported && count >= Vector256<double>.Count)
+ {
+ var vs = Vector256.Create(scalar);
+ int vectorCount = count - Vector256<double>.Count + 1;
+ for (; i < vectorCount; i += Vector256<double>.Count)
+ {
+ var va = Avx.LoadVector256(lhs + i);
+ var vr = Avx.Add(va, vs);
+ Avx.Store(result + i, vr);
+ }
+ }
+
+ for (; i < count; i++)
+ {
+ result[i] = lhs[i] + scalar;
+ }
+ }
+
+ ///
+ /// Add a scalar to a contiguous float32 array using Vector256.
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static void AddScalar_Float32(float* lhs, float scalar, float* result, int count)
+ {
+ int i = 0;
+
+ if (Avx2.IsSupported && count >= Vector256<float>.Count)
+ {
+ var vs = Vector256.Create(scalar);
+ int vectorCount = count - Vector256<float>.Count + 1;
+ for (; i < vectorCount; i += Vector256<float>.Count)
+ {
+ var va = Avx.LoadVector256(lhs + i);
+ var vr = Avx.Add(va, vs);
+ Avx.Store(result + i, vr);
+ }
+ }
+
+ for (; i < count; i++)
+ {
+ result[i] = lhs[i] + scalar;
+ }
+ }
+
+ ///
+ /// Add a scalar to a contiguous int32 array using Vector256.
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static void AddScalar_Int32(int* lhs, int scalar, int* result, int count)
+ {
+ int i = 0;
+
+ if (Avx2.IsSupported && count >= Vector256<int>.Count)
+ {
+ var vs = Vector256.Create(scalar);
+ int vectorCount = count - Vector256<int>.Count + 1;
+ for (; i < vectorCount; i += Vector256<int>.Count)
+ {
+ var va = Avx2.LoadVector256(lhs + i);
+ var vr = Avx2.Add(va, vs);
+ Avx2.Store(result + i, vr);
+ }
+ }
+
+ for (; i < count; i++)
+ {
+ result[i] = lhs[i] + scalar;
+ }
+ }
+
+ ///
+ /// Add a scalar to a contiguous int64 array using Vector256.
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static void AddScalar_Int64(long* lhs, long scalar, long* result, int count)
+ {
+ int i = 0;
+
+ if (Avx2.IsSupported && count >= Vector256<long>.Count)
+ {
+ var vs = Vector256.Create(scalar);
+ int vectorCount = count - Vector256<long>.Count + 1;
+ for (; i < vectorCount; i += Vector256<long>.Count)
+ {
+ var va = Avx2.LoadVector256(lhs + i);
+ var vr = Avx2.Add(va, vs);
+ Avx2.Store(result + i, vr);
+ }
+ }
+
+ for (; i < count; i++)
+ {
+ result[i] = lhs[i] + scalar;
+ }
+ }
+
+ ///
+ /// Add a scalar to a contiguous int16 array using Vector256.
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static void AddScalar_Int16(short* lhs, short scalar, short* result, int count)
+ {
+ int i = 0;
+
+ if (Avx2.IsSupported && count >= Vector256<short>.Count)
+ {
+ var vs = Vector256.Create(scalar);
+ int vectorCount = count - Vector256<short>.Count + 1;
+ for (; i < vectorCount; i += Vector256<short>.Count)
+ {
+ var va = Avx2.LoadVector256(lhs + i);
+ var vr = Avx2.Add(va, vs);
+ Avx2.Store(result + i, vr);
+ }
+ }
+
+ for (; i < count; i++)
+ {
+ result[i] = (short)(lhs[i] + scalar);
+ }
+ }
+
+ ///
+ /// Add a scalar to a contiguous byte array using Vector256.
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static void AddScalar_Byte(byte* lhs, byte scalar, byte* result, int count)
+ {
+ int i = 0;
+
+ if (Avx2.IsSupported && count >= Vector256<byte>.Count)
+ {
+ var vs = Vector256.Create(scalar);
+ int vectorCount = count - Vector256<byte>.Count + 1;
+ for (; i < vectorCount; i += Vector256<byte>.Count)
+ {
+ var va = Avx2.LoadVector256(lhs + i);
+ var vr = Avx2.Add(va, vs);
+ Avx2.Store(result + i, vr);
+ }
+ }
+
+ for (; i < count; i++)
+ {
+ result[i] = (byte)(lhs[i] + scalar);
+ }
+ }
+
+ #endregion
+
+ #region SIMD-CHUNK: Row broadcast (inner dimension contiguous)
+
+ ///
+ /// Add a row vector to each row of a matrix using SIMD on inner dimension.
+ /// matrix[M, N] + row[N] = result[M, N]
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static void AddRowBroadcast_Float64(double* matrix, double* row, double* result, int rows, int cols)
+ {
+ for (int r = 0; r < rows; r++)
+ {
+ var matrixRow = matrix + r * cols;
+ var resultRow = result + r * cols;
+ AddFull_Float64(matrixRow, row, resultRow, cols);
+ }
+ }
+
+ ///
+ /// Add a row vector to each row of a matrix using SIMD on inner dimension.
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static void AddRowBroadcast_Float32(float* matrix, float* row, float* result, int rows, int cols)
+ {
+ for (int r = 0; r < rows; r++)
+ {
+ var matrixRow = matrix + r * cols;
+ var resultRow = result + r * cols;
+ AddFull_Float32(matrixRow, row, resultRow, cols);
+ }
+ }
+
+ ///
+ /// Add a row vector to each row of a matrix using SIMD on inner dimension.
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static void AddRowBroadcast_Int32(int* matrix, int* row, int* result, int rows, int cols)
+ {
+ for (int r = 0; r < rows; r++)
+ {
+ var matrixRow = matrix + r * cols;
+ var resultRow = result + r * cols;
+ AddFull_Int32(matrixRow, row, resultRow, cols);
+ }
+ }
+
+ #endregion
+
+ #region Scalar loops (baseline)
+
+ ///
+ /// Scalar add baseline for float64.
+ ///
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ public static void AddScalarLoop_Float64(double* lhs, double* rhs, double* result, int count)
+ {
+ for (int i = 0; i < count; i++)
+ {
+ result[i] = lhs[i] + rhs[i];
+ }
+ }
+
+ ///
+ /// Scalar add baseline for float32.
+ ///
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ public static void AddScalarLoop_Float32(float* lhs, float* rhs, float* result, int count)
+ {
+ for (int i = 0; i < count; i++)
+ {
+ result[i] = lhs[i] + rhs[i];
+ }
+ }
+
+ ///
+ /// Scalar add baseline for int32.
+ ///
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ public static void AddScalarLoop_Int32(int* lhs, int* rhs, int* result, int count)
+ {
+ for (int i = 0; i < count; i++)
+ {
+ result[i] = lhs[i] + rhs[i];
+ }
+ }
+
+ ///
+ /// Scalar add baseline for int64.
+ ///
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ public static void AddScalarLoop_Int64(long* lhs, long* rhs, long* result, int count)
+ {
+ for (int i = 0; i < count; i++)
+ {
+ result[i] = lhs[i] + rhs[i];
+ }
+ }
+
+ ///
+ /// Scalar add baseline for int16.
+ ///
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ public static void AddScalarLoop_Int16(short* lhs, short* rhs, short* result, int count)
+ {
+ for (int i = 0; i < count; i++)
+ {
+ result[i] = (short)(lhs[i] + rhs[i]);
+ }
+ }
+
+ ///
+ /// Scalar add baseline for byte.
+ ///
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ public static void AddScalarLoop_Byte(byte* lhs, byte* rhs, byte* result, int count)
+ {
+ for (int i = 0; i < count; i++)
+ {
+ result[i] = (byte)(lhs[i] + rhs[i]);
+ }
+ }
+
+ ///
+ /// Row broadcast baseline - scalar inner loop.
+ ///
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ public static void AddRowBroadcastScalar_Float64(double* matrix, double* row, double* result, int rows, int cols)
+ {
+ for (int r = 0; r < rows; r++)
+ {
+ for (int c = 0; c < cols; c++)
+ {
+ result[r * cols + c] = matrix[r * cols + c] + row[c];
+ }
+ }
+ }
+
+ ///
+ /// Column broadcast baseline - scalar loop with strided access.
+ /// matrix[M, N] + col[M, 1] = result[M, N]
+ ///
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ public static void AddColBroadcastScalar_Float64(double* matrix, double* col, double* result, int rows, int cols)
+ {
+ for (int r = 0; r < rows; r++)
+ {
+ var colVal = col[r];
+ for (int c = 0; c < cols; c++)
+ {
+ result[r * cols + c] = matrix[r * cols + c] + colVal;
+ }
+ }
+ }
+
+ ///
+ /// Column broadcast with SIMD - each row uses same scalar from col.
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static void AddColBroadcast_Float64(double* matrix, double* col, double* result, int rows, int cols)
+ {
+ for (int r = 0; r < rows; r++)
+ {
+ var colVal = col[r];
+ var matrixRow = matrix + r * cols;
+ var resultRow = result + r * cols;
+ AddScalar_Float64(matrixRow, colVal, resultRow, cols);
+ }
+ }
+
+ #endregion
+
+ #region Array allocation helpers
+
+ /// <summary>
+ /// Result of pinned array allocation.
+ /// </summary>
+ public readonly struct PinnedArray<T> : IDisposable where T : unmanaged
+ {
+ public readonly T[] Array;
+ public readonly GCHandle Handle;
+ public readonly IntPtr Pointer;
+
+ public PinnedArray(int count)
+ {
+ Array = GC.AllocateArray<T>(count, pinned: true);
+ Handle = GCHandle.Alloc(Array, GCHandleType.Pinned);
+ Pointer = Handle.AddrOfPinnedObject();
+ }
+
+ public T* Ptr => (T*)Pointer;
+
+ public void Dispose()
+ {
+ if (Handle.IsAllocated)
+ Handle.Free();
+ }
+ }
+
+ /// <summary>
+ /// Allocate a pinned array and return a struct with the array, handle, and pointer.
+ /// </summary>
+ public static PinnedArray<T> AllocatePinned<T>(int count) where T : unmanaged
+ {
+ return new PinnedArray<T>(count);
+ }
+
+ /// <summary>
+ /// Allocate aligned memory using NativeMemory.
+ /// </summary>
+ public static T* AllocateAligned<T>(int count, nuint alignment = 32) where T : unmanaged
+ {
+ return (T*)NativeMemory.AlignedAlloc((nuint)(count * sizeof(T)), alignment);
+ }
+
+ /// <summary>
+ /// Free aligned memory.
+ /// </summary>
+ public static void FreeAligned<T>(T* ptr) where T : unmanaged
+ {
+ NativeMemory.AlignedFree(ptr);
+ }
+
+ ///
+ /// Fill array with random values.
+ ///
+ public static void FillRandom(double* ptr, int count, int seed = 42)
+ {
+ var rng = new Random(seed);
+ for (int i = 0; i < count; i++)
+ {
+ ptr[i] = rng.NextDouble() * 100;
+ }
+ }
+
+ ///
+ /// Fill array with random float values.
+ ///
+ public static void FillRandom(float* ptr, int count, int seed = 42)
+ {
+ var rng = new Random(seed);
+ for (int i = 0; i < count; i++)
+ {
+ ptr[i] = (float)(rng.NextDouble() * 100);
+ }
+ }
+
+ ///
+ /// Fill array with random int values.
+ ///
+ public static void FillRandom(int* ptr, int count, int seed = 42)
+ {
+ var rng = new Random(seed);
+ for (int i = 0; i < count; i++)
+ {
+ ptr[i] = rng.Next(0, 100);
+ }
+ }
+
+ ///
+ /// Fill array with random long values.
+ ///
+ public static void FillRandom(long* ptr, int count, int seed = 42)
+ {
+ var rng = new Random(seed);
+ for (int i = 0; i < count; i++)
+ {
+ ptr[i] = rng.NextInt64(0, 100);
+ }
+ }
+
+ ///
+ /// Fill array with random short values.
+ ///
+ public static void FillRandom(short* ptr, int count, int seed = 42)
+ {
+ var rng = new Random(seed);
+ for (int i = 0; i < count; i++)
+ {
+ ptr[i] = (short)rng.Next(0, 100);
+ }
+ }
+
+ ///
+ /// Fill array with random byte values.
+ ///
+ public static void FillRandom(byte* ptr, int count, int seed = 42)
+ {
+ var rng = new Random(seed);
+ var bytes = new byte[count];
+ rng.NextBytes(bytes);
+ for (int i = 0; i < count; i++)
+ {
+ ptr[i] = bytes[i];
+ }
+ }
+
+ #endregion
+}
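+
+// Illustrative usage only (not part of the original file): a minimal sketch of the two allocation helpers
+// above — NativeMemory-backed aligned buffers (paired with an explicit FreeAligned) and the PinnedArray<T>
+// wrapper (released via Dispose). The class name is an assumption added purely for documentation.
+internal static unsafe class AllocationUsageSketch
+{
+    public static void Demo()
+    {
+        // Aligned native buffer: caller owns the memory and must free it.
+        double* buffer = SimdImplementations.AllocateAligned<double>(1_000);
+        SimdImplementations.FillRandom(buffer, 1_000, 42);
+        SimdImplementations.AddScalar_Float64(buffer, 1.0, buffer, 1_000); // in-place add
+        SimdImplementations.FreeAligned(buffer);
+
+        // Pinned managed array: GC-tracked, pointer stays valid until Dispose releases the handle.
+        using var pinned = SimdImplementations.AllocatePinned<float>(1_000);
+        SimdImplementations.FillRandom(pinned.Ptr, 1_000, 43);
+    }
+}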
diff --git a/benchmark/NumSharp.Benchmark.Exploration/Integration/NumSharpBroadcast.cs b/benchmark/NumSharp.Benchmark.Exploration/Integration/NumSharpBroadcast.cs
new file mode 100644
index 00000000..aa9fcc60
--- /dev/null
+++ b/benchmark/NumSharp.Benchmark.Exploration/Integration/NumSharpBroadcast.cs
@@ -0,0 +1,314 @@
+using NumSharp.Benchmark.Exploration.Infrastructure;
+
+namespace NumSharp.Benchmark.Exploration.Integration;
+
+/// <summary>
+/// Test NumSharp broadcasting performance through actual TensorEngine.
+/// Measures overhead of NumSharp vs isolated implementations.
+/// </summary>
+public static class NumSharpBroadcast
+{
+ private const string Suite = "NumSharpIntegration";
+ private const int Seed = 42;
+
+ /// <summary>
+ /// Run all NumSharp integration benchmarks.
+ /// </summary>
+ public static List<BenchResult> RunAll(bool quick = false)
+ {
+ var results = new List<BenchResult>();
+ var warmup = quick ? 2 : BenchFramework.DefaultWarmup;
+ var measure = quick ? 5 : BenchFramework.DefaultMeasure;
+
+ var sizes = quick
+ ? new[] { 100_000, 1_000_000 }
+ : new[] { 1_000, 100_000, 1_000_000, 10_000_000 };
+
+ BenchFramework.PrintHeader($"{Suite}: NumSharp Broadcasting Performance");
+
+ long initialMemory = GC.GetTotalMemory(true);
+ const long MemoryThresholdBytes = 4L * 1024 * 1024 * 1024; // 4 GB
+
+ foreach (var size in sizes)
+ {
+ BenchFramework.PrintDivider($"Size: {size:N0}");
+
+ results.AddRange(TestContiguousAdd(size, warmup, measure));
+ CleanupAndCheckMemory(initialMemory, MemoryThresholdBytes, $"after ContiguousAdd size={size}");
+
+ results.AddRange(TestScalarBroadcast(size, warmup, measure));
+ CleanupAndCheckMemory(initialMemory, MemoryThresholdBytes, $"after ScalarBroadcast size={size}");
+
+ results.AddRange(TestRowBroadcast(size, warmup, measure));
+ CleanupAndCheckMemory(initialMemory, MemoryThresholdBytes, $"after RowBroadcast size={size}");
+ }
+
+ OutputFormatters.PrintSummary(results);
+ return results;
+ }
+
+ ///
+ /// Force GC and check memory usage.
+ ///
+ private static void CleanupAndCheckMemory(long initialMemory, long threshold, string context)
+ {
+ // Force full GC collection
+ GC.Collect(GC.MaxGeneration, GCCollectionMode.Forced, blocking: true);
+ GC.WaitForPendingFinalizers();
+ GC.Collect(GC.MaxGeneration, GCCollectionMode.Forced, blocking: true);
+
+ long currentMemory = GC.GetTotalMemory(false);
+ long memoryGrowth = currentMemory - initialMemory;
+
+ if (memoryGrowth > threshold)
+ {
+ Console.WriteLine($"[WARNING] Memory leak detected {context}: {memoryGrowth / 1024 / 1024} MB above baseline");
+ }
+ }
+
+ /// <summary>
+ /// Test contiguous array addition (S1 scenario).
+ /// </summary>
+ public static List<BenchResult> TestContiguousAdd(int size, int warmup, int measure)
+ {
+ var results = new List<BenchResult>();
+ int elementBytes = 8;
+
+ // Create NumSharp arrays
+ np.random.seed(Seed);
+ var a = np.random.rand(size);
+ var b = np.random.rand(size);
+ NDArray result = null!;
+
+ // NumSharp contiguous add
+ var numsharp = BenchFramework.Run(
+ () => result = a + b,
+ "S1_contiguous", "NumSharp", "float64", size, elementBytes, warmup, measure, Suite);
+ results.Add(numsharp);
+ BenchFramework.PrintResult(numsharp);
+
+ // Also test np.add function
+ var npAdd = BenchFramework.Run(
+ () => result = np.add(a, b),
+ "S1_contiguous", "np.add", "float64", size, elementBytes, warmup, measure, Suite);
+ npAdd = npAdd with { SpeedupVsBaseline = numsharp.MeanUs / npAdd.MeanUs };
+ results.Add(npAdd);
+ BenchFramework.PrintResult(npAdd);
+
+ // Cleanup: release references to allow GC
+ a = null!;
+ b = null!;
+ result = null!;
+
+ return results;
+ }
+
+ /// <summary>
+ /// Test scalar broadcast (S2 scenario).
+ /// </summary>
+ public static List<BenchResult> TestScalarBroadcast(int size, int warmup, int measure)
+ {
+ var results = new List<BenchResult>();
+ int elementBytes = 8;
+
+ np.random.seed(Seed);
+ var a = np.random.rand(size);
+ double scalar = 42.5;
+ NDArray result = null!;
+
+ // NumSharp scalar add
+ var numsharp = BenchFramework.Run(
+ () => result = a + scalar,
+ "S2_scalar", "NumSharp", "float64", size, elementBytes, warmup, measure, Suite);
+ results.Add(numsharp);
+ BenchFramework.PrintResult(numsharp);
+
+ // Cleanup
+ a = null!;
+ result = null!;
+
+ return results;
+ }
+
+ /// <summary>
+ /// Test row broadcast (S4 scenario).
+ /// </summary>
+ public static List<BenchResult> TestRowBroadcast(int totalElements, int warmup, int measure)
+ {
+ var results = new List<BenchResult>();
+ int elementBytes = 8;
+
+ // Create square-ish matrix
+ int rows = (int)Math.Sqrt(totalElements);
+ int cols = totalElements / rows;
+ int actualSize = rows * cols;
+
+ np.random.seed(Seed);
+ var matrix = np.random.rand(rows, cols);
+ var row = np.random.rand(cols);
+ NDArray result = null!;
+
+ // NumSharp row broadcast
+ var numsharp = BenchFramework.Run(
+ () => result = matrix + row,
+ "S4_rowBC", "NumSharp", "float64", actualSize, elementBytes, warmup, measure, Suite);
+ results.Add(numsharp);
+ BenchFramework.PrintResult(numsharp);
+
+ // Cleanup
+ matrix = null!;
+ row = null!;
+ result = null!;
+
+ return results;
+ }
+
+ /// <summary>
+ /// Test various NumSharp operations to measure overhead.
+ /// </summary>
+ public static List<BenchResult> TestOperationOverhead(int size, int warmup, int measure)
+ {
+ var results = new List<BenchResult>();
+ int elementBytes = 8;
+
+ BenchFramework.PrintDivider($"Operation Overhead: Size = {size:N0}");
+
+ np.random.seed(Seed);
+ var a = np.random.rand(size);
+ var b = np.random.rand(size);
+ NDArray result = null!;
+
+ // Test different operations
+ var ops = new (string name, Func<NDArray> op)[]
+ {
+ ("Add", () => a + b),
+ ("Sub", () => a - b),
+ ("Mul", () => a * b),
+ ("Div", () => a / b),
+ ("np.sqrt", () => np.sqrt(a)),
+ ("np.exp", () => np.exp(a)),
+ ("np.sum", () => np.sum(a)),
+ ("np.mean", () => np.mean(a)),
+ };
+
+ foreach (var (name, op) in ops)
+ {
+ var r = BenchFramework.Run(
+ () => result = op(),
+ "Overhead", name, "float64", size, elementBytes, warmup, measure, Suite);
+ results.Add(r);
+ BenchFramework.PrintResult(r);
+
+ // Force GC between operations to prevent accumulation
+ result = null!;
+ GC.Collect(GC.MaxGeneration, GCCollectionMode.Forced, blocking: true);
+ GC.WaitForPendingFinalizers();
+ }
+
+ // Cleanup
+ a = null!;
+ b = null!;
+
+ return results;
+ }
+
+ /// <summary>
+ /// Compare NumSharp overhead vs raw pointer operations.
+ /// </summary>
+ public static unsafe List<BenchResult> CompareOverhead(int size, int warmup, int measure)
+ {
+ var results = new List<BenchResult>();
+ int elementBytes = 8;
+
+ BenchFramework.PrintDivider($"NumSharp vs Raw Pointer: Size = {size:N0}");
+
+ // Raw pointer implementation
+ var lhsRaw = SimdImplementations.AllocateAligned<double>(size);
+ var rhsRaw = SimdImplementations.AllocateAligned<double>(size);
+ var resultRaw = SimdImplementations.AllocateAligned<double>(size);
+ SimdImplementations.FillRandom(lhsRaw, size, 42);
+ SimdImplementations.FillRandom(rhsRaw, size, 43);
+
+ // NumSharp arrays
+ np.random.seed(42);
+ var lhsNp = np.random.rand(size);
+ np.random.seed(43);
+ var rhsNp = np.random.rand(size);
+ NDArray resultNp = null!;
+
+ // Raw SIMD
+ var raw = BenchFramework.Run(
+ () => SimdImplementations.AddFull_Float64(lhsRaw, rhsRaw, resultRaw, size),
+ "Overhead", "RawSIMD", "float64", size, elementBytes, warmup, measure, Suite);
+ results.Add(raw);
+ BenchFramework.PrintResult(raw);
+
+ // NumSharp
+ var numsharp = BenchFramework.Run(
+ () => resultNp = lhsNp + rhsNp,
+ "Overhead", "NumSharp", "float64", size, elementBytes, warmup, measure, Suite);
+ numsharp = numsharp with
+ {
+ SpeedupVsBaseline = raw.MeanUs / numsharp.MeanUs,
+ Notes = $"Overhead: {numsharp.MeanUs / raw.MeanUs:F2}x"
+ };
+ results.Add(numsharp);
+ BenchFramework.PrintResult(numsharp);
+
+ // Cleanup raw memory
+ SimdImplementations.FreeAligned(lhsRaw);
+ SimdImplementations.FreeAligned(rhsRaw);
+ SimdImplementations.FreeAligned(resultRaw);
+
+ // Cleanup NumSharp arrays
+ lhsNp = null!;
+ rhsNp = null!;
+ resultNp = null!;
+
+ return results;
+ }
+
+ /// <summary>
+ /// Test different dtypes through NumSharp.
+ /// </summary>
+ public static List<BenchResult> TestDtypes(int size, int warmup, int measure)
+ {
+ var results = new List<BenchResult>();
+
+ BenchFramework.PrintDivider($"NumSharp Dtype Performance: Size = {size:N0}");
+
+ var dtypes = new[]
+ {
+ (name: "byte", type: typeof(byte), bytes: 1),
+ (name: "int16", type: typeof(short), bytes: 2),
+ (name: "int32", type: typeof(int), bytes: 4),
+ (name: "int64", type: typeof(long), bytes: 8),
+ (name: "float32", type: typeof(float), bytes: 4),
+ (name: "float64", type: typeof(double), bytes: 8),
+ };
+
+ NDArray result = null!;
+
+ foreach (var (name, type, bytes) in dtypes)
+ {
+ np.random.seed(Seed);
+ var a = np.random.randint(0, 100, new Shape(size), type);
+ var b = np.random.randint(0, 100, new Shape(size), type);
+
+ var r = BenchFramework.Run(
+ () => result = a + b,
+ "DtypePerf", "Add", name, size, bytes, warmup, measure, Suite);
+ results.Add(r);
+ BenchFramework.PrintResult(r);
+
+ // Cleanup after each dtype test
+ a = null!;
+ b = null!;
+ result = null!;
+ GC.Collect(GC.MaxGeneration, GCCollectionMode.Forced, blocking: true);
+ GC.WaitForPendingFinalizers();
+ }
+
+ return results;
+ }
+}
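+
+// Illustrative driver only (not part of the original file): a minimal sketch of how the integration suite
+// might be run and exported using the Infrastructure helpers. The class name, method name, and output path
+// are assumptions added purely for documentation.
+internal static class NumSharpBroadcastUsageSketch
+{
+    public static void RunQuickAndExport()
+    {
+        BenchFramework.PrintEnvironment();
+        var results = NumSharpBroadcast.RunAll(quick: true);
+        OutputFormatters.ExportCsv(results, Path.Combine("Results", "numsharp-integration.csv"));
+    }
+}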
diff --git a/benchmark/NumSharp.Benchmark.Exploration/Isolated/BroadcastScenarios.cs b/benchmark/NumSharp.Benchmark.Exploration/Isolated/BroadcastScenarios.cs
new file mode 100644
index 00000000..e24753bd
--- /dev/null
+++ b/benchmark/NumSharp.Benchmark.Exploration/Isolated/BroadcastScenarios.cs
@@ -0,0 +1,608 @@
+using System.Runtime.InteropServices;
+using NumSharp.Benchmark.Exploration.Infrastructure;
+
+namespace NumSharp.Benchmark.Exploration.Isolated;
+
+/// <summary>
+/// Benchmark all 7 broadcasting scenarios to identify which benefit from SIMD.
+///
+/// Scenarios:
+/// S1: Both contiguous, same shape — SIMD-FULL
+/// S2: Scalar broadcast (right) — SIMD-SCALAR
+/// S3: Scalar broadcast (left) — SIMD-SCALAR
+/// S4: Row broadcast (M,N) + (N,) — SIMD-CHUNK
+/// S5: Column broadcast (M,N) + (M,1) — SIMD per-row scalar
+/// S6: Mutual broadcast (M,1) + (1,N) — Outer product
+/// S7: Strided arrays a[::2] + b[::2] — SCALAR/GATHER
+/// </summary>
+public static unsafe class BroadcastScenarios
+{
+ private const string Suite = "BroadcastScenarios";
+
+ /// <summary>
+ /// Run all broadcast scenario benchmarks.
+ /// </summary>
+ public static List<BenchResult> RunAll(int[]? sizes = null, string[]? dtypes = null, bool quick = false)
+ {
+ sizes ??= quick ? ArraySizes.Quick : ArraySizes.Standard;
+ dtypes ??= quick ? Dtypes.Common : Dtypes.All;
+ var warmup = quick ? 2 : BenchFramework.DefaultWarmup;
+ var measure = quick ? BenchFramework.QuickMeasure : BenchFramework.DefaultMeasure;
+
+ var results = new List<BenchResult>();
+
+ BenchFramework.PrintHeader($"{Suite}: All 7 Broadcasting Scenarios");
+
+ foreach (var dtype in dtypes)
+ {
+ BenchFramework.PrintDivider($"dtype={dtype}");
+
+ foreach (var size in sizes)
+ {
+ // S1: Both contiguous, same shape
+ results.AddRange(RunS1_Contiguous(dtype, size, warmup, measure));
+
+ // S2: Scalar broadcast (right)
+ results.AddRange(RunS2_ScalarRight(dtype, size, warmup, measure));
+
+ // S3: Scalar broadcast (left) — same as S2 due to commutativity
+ // Skipped for brevity, same perf characteristics
+
+ // S4: Row broadcast
+ results.AddRange(RunS4_RowBroadcast(dtype, size, warmup, measure));
+
+ // S5: Column broadcast
+ results.AddRange(RunS5_ColBroadcast(dtype, size, warmup, measure));
+ }
+ }
+
+ OutputFormatters.PrintSummary(results);
+ return results;
+ }
+
+ #region S1: Contiguous same shape
+
+ /// <summary>
+ /// S1: Both operands contiguous with same shape — ideal for SIMD-FULL.
+ /// </summary>
+ public static List<BenchResult> RunS1_Contiguous(string dtype, int size, int warmup, int measure)
+ {
+ var results = new List<BenchResult>();
+ var elementBytes = DtypeInfo.GetElementSize(dtype);
+
+ switch (dtype.ToLowerInvariant())
+ {
+ case "float64" or "double":
+ {
+ var lhs = SimdImplementations.AllocateAligned<double>(size);
+ var rhs = SimdImplementations.AllocateAligned<double>(size);
+ var result = SimdImplementations.AllocateAligned<double>(size);
+ SimdImplementations.FillRandom(lhs, size, 42);
+ SimdImplementations.FillRandom(rhs, size, 43);
+
+ // Baseline: scalar loop
+ var baseline = BenchFramework.Run(
+ () => SimdImplementations.AddScalarLoop_Float64(lhs, rhs, result, size),
+ "S1_contiguous", "Scalar", dtype, size, elementBytes, warmup, measure, Suite);
+ results.Add(baseline);
+ BenchFramework.PrintResult(baseline);
+
+ // SIMD-FULL
+ var simd = BenchFramework.Run(
+ () => SimdImplementations.AddFull_Float64(lhs, rhs, result, size),
+ "S1_contiguous", "SIMD-FULL", dtype, size, elementBytes, warmup, measure, Suite);
+ simd = simd with { SpeedupVsBaseline = baseline.MeanUs / simd.MeanUs };
+ results.Add(simd);
+ BenchFramework.PrintResult(simd);
+
+ SimdImplementations.FreeAligned(lhs);
+ SimdImplementations.FreeAligned(rhs);
+ SimdImplementations.FreeAligned(result);
+ break;
+ }
+
+ case "float32" or "single" or "float":
+ {
+                var lhs = SimdImplementations.AllocateAligned<float>(size);
+                var rhs = SimdImplementations.AllocateAligned<float>(size);
+                var result = SimdImplementations.AllocateAligned<float>(size);
+ SimdImplementations.FillRandom(lhs, size, 42);
+ SimdImplementations.FillRandom(rhs, size, 43);
+
+ var baseline = BenchFramework.Run(
+ () => SimdImplementations.AddScalarLoop_Float32(lhs, rhs, result, size),
+ "S1_contiguous", "Scalar", dtype, size, elementBytes, warmup, measure, Suite);
+ results.Add(baseline);
+ BenchFramework.PrintResult(baseline);
+
+ var simd = BenchFramework.Run(
+ () => SimdImplementations.AddFull_Float32(lhs, rhs, result, size),
+ "S1_contiguous", "SIMD-FULL", dtype, size, elementBytes, warmup, measure, Suite);
+ simd = simd with { SpeedupVsBaseline = baseline.MeanUs / simd.MeanUs };
+ results.Add(simd);
+ BenchFramework.PrintResult(simd);
+
+ SimdImplementations.FreeAligned(lhs);
+ SimdImplementations.FreeAligned(rhs);
+ SimdImplementations.FreeAligned(result);
+ break;
+ }
+
+ case "int32" or "int":
+ {
+                var lhs = SimdImplementations.AllocateAligned<int>(size);
+                var rhs = SimdImplementations.AllocateAligned<int>(size);
+                var result = SimdImplementations.AllocateAligned<int>(size);
+ SimdImplementations.FillRandom(lhs, size, 42);
+ SimdImplementations.FillRandom(rhs, size, 43);
+
+ var baseline = BenchFramework.Run(
+ () => SimdImplementations.AddScalarLoop_Int32(lhs, rhs, result, size),
+ "S1_contiguous", "Scalar", dtype, size, elementBytes, warmup, measure, Suite);
+ results.Add(baseline);
+ BenchFramework.PrintResult(baseline);
+
+ var simd = BenchFramework.Run(
+ () => SimdImplementations.AddFull_Int32(lhs, rhs, result, size),
+ "S1_contiguous", "SIMD-FULL", dtype, size, elementBytes, warmup, measure, Suite);
+ simd = simd with { SpeedupVsBaseline = baseline.MeanUs / simd.MeanUs };
+ results.Add(simd);
+ BenchFramework.PrintResult(simd);
+
+ SimdImplementations.FreeAligned(lhs);
+ SimdImplementations.FreeAligned(rhs);
+ SimdImplementations.FreeAligned(result);
+ break;
+ }
+
+ case "int64" or "long":
+ {
+                var lhs = SimdImplementations.AllocateAligned<long>(size);
+                var rhs = SimdImplementations.AllocateAligned<long>(size);
+                var result = SimdImplementations.AllocateAligned<long>(size);
+ SimdImplementations.FillRandom(lhs, size, 42);
+ SimdImplementations.FillRandom(rhs, size, 43);
+
+ var baseline = BenchFramework.Run(
+ () => SimdImplementations.AddScalarLoop_Int64(lhs, rhs, result, size),
+ "S1_contiguous", "Scalar", dtype, size, elementBytes, warmup, measure, Suite);
+ results.Add(baseline);
+ BenchFramework.PrintResult(baseline);
+
+ var simd = BenchFramework.Run(
+ () => SimdImplementations.AddFull_Int64(lhs, rhs, result, size),
+ "S1_contiguous", "SIMD-FULL", dtype, size, elementBytes, warmup, measure, Suite);
+ simd = simd with { SpeedupVsBaseline = baseline.MeanUs / simd.MeanUs };
+ results.Add(simd);
+ BenchFramework.PrintResult(simd);
+
+ SimdImplementations.FreeAligned(lhs);
+ SimdImplementations.FreeAligned(rhs);
+ SimdImplementations.FreeAligned(result);
+ break;
+ }
+
+ case "int16" or "short":
+ {
+                var lhs = SimdImplementations.AllocateAligned<short>(size);
+                var rhs = SimdImplementations.AllocateAligned<short>(size);
+                var result = SimdImplementations.AllocateAligned<short>(size);
+ SimdImplementations.FillRandom(lhs, size, 42);
+ SimdImplementations.FillRandom(rhs, size, 43);
+
+ var baseline = BenchFramework.Run(
+ () => SimdImplementations.AddScalarLoop_Int16(lhs, rhs, result, size),
+ "S1_contiguous", "Scalar", dtype, size, elementBytes, warmup, measure, Suite);
+ results.Add(baseline);
+ BenchFramework.PrintResult(baseline);
+
+ var simd = BenchFramework.Run(
+ () => SimdImplementations.AddFull_Int16(lhs, rhs, result, size),
+ "S1_contiguous", "SIMD-FULL", dtype, size, elementBytes, warmup, measure, Suite);
+ simd = simd with { SpeedupVsBaseline = baseline.MeanUs / simd.MeanUs };
+ results.Add(simd);
+ BenchFramework.PrintResult(simd);
+
+ SimdImplementations.FreeAligned(lhs);
+ SimdImplementations.FreeAligned(rhs);
+ SimdImplementations.FreeAligned(result);
+ break;
+ }
+
+ case "byte" or "uint8":
+ {
+                var lhs = SimdImplementations.AllocateAligned<byte>(size);
+                var rhs = SimdImplementations.AllocateAligned<byte>(size);
+                var result = SimdImplementations.AllocateAligned<byte>(size);
+ SimdImplementations.FillRandom(lhs, size, 42);
+ SimdImplementations.FillRandom(rhs, size, 43);
+
+ var baseline = BenchFramework.Run(
+ () => SimdImplementations.AddScalarLoop_Byte(lhs, rhs, result, size),
+ "S1_contiguous", "Scalar", dtype, size, elementBytes, warmup, measure, Suite);
+ results.Add(baseline);
+ BenchFramework.PrintResult(baseline);
+
+ var simd = BenchFramework.Run(
+ () => SimdImplementations.AddFull_Byte(lhs, rhs, result, size),
+ "S1_contiguous", "SIMD-FULL", dtype, size, elementBytes, warmup, measure, Suite);
+ simd = simd with { SpeedupVsBaseline = baseline.MeanUs / simd.MeanUs };
+ results.Add(simd);
+ BenchFramework.PrintResult(simd);
+
+ SimdImplementations.FreeAligned(lhs);
+ SimdImplementations.FreeAligned(rhs);
+ SimdImplementations.FreeAligned(result);
+ break;
+ }
+ }
+
+ return results;
+ }
+
+ #endregion
+
+ #region S2: Scalar broadcast (right)
+
+ ///
+ /// S2: Array + scalar — ideal for SIMD-SCALAR strategy.
+ ///
+ public static List RunS2_ScalarRight(string dtype, int size, int warmup, int measure)
+ {
+ var results = new List();
+ var elementBytes = DtypeInfo.GetElementSize(dtype);
+
+ switch (dtype.ToLowerInvariant())
+ {
+ case "float64" or "double":
+ {
+                var lhs = SimdImplementations.AllocateAligned<double>(size);
+                var result = SimdImplementations.AllocateAligned<double>(size);
+ SimdImplementations.FillRandom(lhs, size, 42);
+ double scalar = 42.5;
+
+ // Baseline: scalar loop
+ var baseline = BenchFramework.Run(
+ () =>
+ {
+ for (int i = 0; i < size; i++) result[i] = lhs[i] + scalar;
+ },
+ "S2_scalar", "Scalar", dtype, size, elementBytes, warmup, measure, Suite);
+ results.Add(baseline);
+ BenchFramework.PrintResult(baseline);
+
+ // SIMD-SCALAR
+ var simd = BenchFramework.Run(
+ () => SimdImplementations.AddScalar_Float64(lhs, scalar, result, size),
+ "S2_scalar", "SIMD-SCALAR", dtype, size, elementBytes, warmup, measure, Suite);
+ simd = simd with { SpeedupVsBaseline = baseline.MeanUs / simd.MeanUs };
+ results.Add(simd);
+ BenchFramework.PrintResult(simd);
+
+ SimdImplementations.FreeAligned(lhs);
+ SimdImplementations.FreeAligned(result);
+ break;
+ }
+
+ case "float32" or "single" or "float":
+ {
+                var lhs = SimdImplementations.AllocateAligned<float>(size);
+                var result = SimdImplementations.AllocateAligned<float>(size);
+ SimdImplementations.FillRandom(lhs, size, 42);
+ float scalar = 42.5f;
+
+ var baseline = BenchFramework.Run(
+ () =>
+ {
+ for (int i = 0; i < size; i++) result[i] = lhs[i] + scalar;
+ },
+ "S2_scalar", "Scalar", dtype, size, elementBytes, warmup, measure, Suite);
+ results.Add(baseline);
+ BenchFramework.PrintResult(baseline);
+
+ var simd = BenchFramework.Run(
+ () => SimdImplementations.AddScalar_Float32(lhs, scalar, result, size),
+ "S2_scalar", "SIMD-SCALAR", dtype, size, elementBytes, warmup, measure, Suite);
+ simd = simd with { SpeedupVsBaseline = baseline.MeanUs / simd.MeanUs };
+ results.Add(simd);
+ BenchFramework.PrintResult(simd);
+
+ SimdImplementations.FreeAligned(lhs);
+ SimdImplementations.FreeAligned(result);
+ break;
+ }
+
+ case "int32" or "int":
+ {
+                var lhs = SimdImplementations.AllocateAligned<int>(size);
+                var result = SimdImplementations.AllocateAligned<int>(size);
+ SimdImplementations.FillRandom(lhs, size, 42);
+ int scalar = 42;
+
+ var baseline = BenchFramework.Run(
+ () =>
+ {
+ for (int i = 0; i < size; i++) result[i] = lhs[i] + scalar;
+ },
+ "S2_scalar", "Scalar", dtype, size, elementBytes, warmup, measure, Suite);
+ results.Add(baseline);
+ BenchFramework.PrintResult(baseline);
+
+ var simd = BenchFramework.Run(
+ () => SimdImplementations.AddScalar_Int32(lhs, scalar, result, size),
+ "S2_scalar", "SIMD-SCALAR", dtype, size, elementBytes, warmup, measure, Suite);
+ simd = simd with { SpeedupVsBaseline = baseline.MeanUs / simd.MeanUs };
+ results.Add(simd);
+ BenchFramework.PrintResult(simd);
+
+ SimdImplementations.FreeAligned(lhs);
+ SimdImplementations.FreeAligned(result);
+ break;
+ }
+
+ case "int64" or "long":
+ {
+                var lhs = SimdImplementations.AllocateAligned<long>(size);
+                var result = SimdImplementations.AllocateAligned<long>(size);
+ SimdImplementations.FillRandom(lhs, size, 42);
+ long scalar = 42;
+
+ var baseline = BenchFramework.Run(
+ () =>
+ {
+ for (int i = 0; i < size; i++) result[i] = lhs[i] + scalar;
+ },
+ "S2_scalar", "Scalar", dtype, size, elementBytes, warmup, measure, Suite);
+ results.Add(baseline);
+ BenchFramework.PrintResult(baseline);
+
+ var simd = BenchFramework.Run(
+ () => SimdImplementations.AddScalar_Int64(lhs, scalar, result, size),
+ "S2_scalar", "SIMD-SCALAR", dtype, size, elementBytes, warmup, measure, Suite);
+ simd = simd with { SpeedupVsBaseline = baseline.MeanUs / simd.MeanUs };
+ results.Add(simd);
+ BenchFramework.PrintResult(simd);
+
+ SimdImplementations.FreeAligned(lhs);
+ SimdImplementations.FreeAligned(result);
+ break;
+ }
+
+ case "int16" or "short":
+ {
+                var lhs = SimdImplementations.AllocateAligned<short>(size);
+                var result = SimdImplementations.AllocateAligned<short>(size);
+ SimdImplementations.FillRandom(lhs, size, 42);
+ short scalar = 42;
+
+ var baseline = BenchFramework.Run(
+ () =>
+ {
+ for (int i = 0; i < size; i++) result[i] = (short)(lhs[i] + scalar);
+ },
+ "S2_scalar", "Scalar", dtype, size, elementBytes, warmup, measure, Suite);
+ results.Add(baseline);
+ BenchFramework.PrintResult(baseline);
+
+ var simd = BenchFramework.Run(
+ () => SimdImplementations.AddScalar_Int16(lhs, scalar, result, size),
+ "S2_scalar", "SIMD-SCALAR", dtype, size, elementBytes, warmup, measure, Suite);
+ simd = simd with { SpeedupVsBaseline = baseline.MeanUs / simd.MeanUs };
+ results.Add(simd);
+ BenchFramework.PrintResult(simd);
+
+ SimdImplementations.FreeAligned(lhs);
+ SimdImplementations.FreeAligned(result);
+ break;
+ }
+
+ case "byte" or "uint8":
+ {
+                var lhs = SimdImplementations.AllocateAligned<byte>(size);
+                var result = SimdImplementations.AllocateAligned<byte>(size);
+ SimdImplementations.FillRandom(lhs, size, 42);
+ byte scalar = 42;
+
+ var baseline = BenchFramework.Run(
+ () =>
+ {
+ for (int i = 0; i < size; i++) result[i] = (byte)(lhs[i] + scalar);
+ },
+ "S2_scalar", "Scalar", dtype, size, elementBytes, warmup, measure, Suite);
+ results.Add(baseline);
+ BenchFramework.PrintResult(baseline);
+
+ var simd = BenchFramework.Run(
+ () => SimdImplementations.AddScalar_Byte(lhs, scalar, result, size),
+ "S2_scalar", "SIMD-SCALAR", dtype, size, elementBytes, warmup, measure, Suite);
+ simd = simd with { SpeedupVsBaseline = baseline.MeanUs / simd.MeanUs };
+ results.Add(simd);
+ BenchFramework.PrintResult(simd);
+
+ SimdImplementations.FreeAligned(lhs);
+ SimdImplementations.FreeAligned(result);
+ break;
+ }
+ }
+
+ return results;
+ }
+
+ #endregion
+
+ #region S4: Row broadcast
+
+ ///
+ /// S4: (M,N) + (N,) row broadcast — ideal for SIMD-CHUNK strategy.
+ ///
+ public static List RunS4_RowBroadcast(string dtype, int totalElements, int warmup, int measure)
+ {
+ // Use square-ish dimensions
+ int rows = (int)Math.Sqrt(totalElements);
+ int cols = totalElements / rows;
+ int actualSize = rows * cols;
+
+ var results = new List();
+ var elementBytes = DtypeInfo.GetElementSize(dtype);
+
+ switch (dtype.ToLowerInvariant())
+ {
+ case "float64" or "double":
+ {
+                var matrix = SimdImplementations.AllocateAligned<double>(actualSize);
+                var row = SimdImplementations.AllocateAligned<double>(cols);
+                var result = SimdImplementations.AllocateAligned<double>(actualSize);
+ SimdImplementations.FillRandom(matrix, actualSize, 42);
+ SimdImplementations.FillRandom(row, cols, 43);
+
+ // Baseline: nested scalar loops
+ var baseline = BenchFramework.Run(
+ () => SimdImplementations.AddRowBroadcastScalar_Float64(matrix, row, result, rows, cols),
+ "S4_rowBC", "Scalar", dtype, actualSize, elementBytes, warmup, measure, Suite);
+ results.Add(baseline);
+ BenchFramework.PrintResult(baseline);
+
+ // SIMD-CHUNK: SIMD on inner dimension
+ var simd = BenchFramework.Run(
+ () => SimdImplementations.AddRowBroadcast_Float64(matrix, row, result, rows, cols),
+ "S4_rowBC", "SIMD-CHUNK", dtype, actualSize, elementBytes, warmup, measure, Suite);
+ simd = simd with { SpeedupVsBaseline = baseline.MeanUs / simd.MeanUs };
+ results.Add(simd);
+ BenchFramework.PrintResult(simd);
+
+ SimdImplementations.FreeAligned(matrix);
+ SimdImplementations.FreeAligned(row);
+ SimdImplementations.FreeAligned(result);
+ break;
+ }
+
+ case "float32" or "single" or "float":
+ {
+                var matrix = SimdImplementations.AllocateAligned<float>(actualSize);
+                var row = SimdImplementations.AllocateAligned<float>(cols);
+                var result = SimdImplementations.AllocateAligned<float>(actualSize);
+ SimdImplementations.FillRandom(matrix, actualSize, 42);
+ SimdImplementations.FillRandom(row, cols, 43);
+
+ var baseline = BenchFramework.Run(
+ () =>
+ {
+ for (int r = 0; r < rows; r++)
+ for (int c = 0; c < cols; c++)
+ result[r * cols + c] = matrix[r * cols + c] + row[c];
+ },
+ "S4_rowBC", "Scalar", dtype, actualSize, elementBytes, warmup, measure, Suite);
+ results.Add(baseline);
+ BenchFramework.PrintResult(baseline);
+
+ var simd = BenchFramework.Run(
+ () => SimdImplementations.AddRowBroadcast_Float32(matrix, row, result, rows, cols),
+ "S4_rowBC", "SIMD-CHUNK", dtype, actualSize, elementBytes, warmup, measure, Suite);
+ simd = simd with { SpeedupVsBaseline = baseline.MeanUs / simd.MeanUs };
+ results.Add(simd);
+ BenchFramework.PrintResult(simd);
+
+ SimdImplementations.FreeAligned(matrix);
+ SimdImplementations.FreeAligned(row);
+ SimdImplementations.FreeAligned(result);
+ break;
+ }
+
+ case "int32" or "int":
+ {
+                var matrix = SimdImplementations.AllocateAligned<int>(actualSize);
+                var row = SimdImplementations.AllocateAligned<int>(cols);
+                var result = SimdImplementations.AllocateAligned<int>(actualSize);
+ SimdImplementations.FillRandom(matrix, actualSize, 42);
+ SimdImplementations.FillRandom(row, cols, 43);
+
+ var baseline = BenchFramework.Run(
+ () =>
+ {
+ for (int r = 0; r < rows; r++)
+ for (int c = 0; c < cols; c++)
+ result[r * cols + c] = matrix[r * cols + c] + row[c];
+ },
+ "S4_rowBC", "Scalar", dtype, actualSize, elementBytes, warmup, measure, Suite);
+ results.Add(baseline);
+ BenchFramework.PrintResult(baseline);
+
+ var simd = BenchFramework.Run(
+ () => SimdImplementations.AddRowBroadcast_Int32(matrix, row, result, rows, cols),
+ "S4_rowBC", "SIMD-CHUNK", dtype, actualSize, elementBytes, warmup, measure, Suite);
+ simd = simd with { SpeedupVsBaseline = baseline.MeanUs / simd.MeanUs };
+ results.Add(simd);
+ BenchFramework.PrintResult(simd);
+
+ SimdImplementations.FreeAligned(matrix);
+ SimdImplementations.FreeAligned(row);
+ SimdImplementations.FreeAligned(result);
+ break;
+ }
+
+ // Other dtypes similar pattern...
+ default:
+ Console.WriteLine($" S4 row broadcast: dtype {dtype} not implemented yet");
+ break;
+ }
+
+ return results;
+ }
+
+ #endregion
+
+ #region S5: Column broadcast
+
+ ///
+ /// S5: (M,N) + (M,1) column broadcast — use SIMD-SCALAR per row.
+ ///
+ public static List RunS5_ColBroadcast(string dtype, int totalElements, int warmup, int measure)
+ {
+ int rows = (int)Math.Sqrt(totalElements);
+ int cols = totalElements / rows;
+ int actualSize = rows * cols;
+
+ var results = new List();
+ var elementBytes = DtypeInfo.GetElementSize(dtype);
+
+ switch (dtype.ToLowerInvariant())
+ {
+ case "float64" or "double":
+ {
+                var matrix = SimdImplementations.AllocateAligned<double>(actualSize);
+                var col = SimdImplementations.AllocateAligned<double>(rows);
+                var result = SimdImplementations.AllocateAligned<double>(actualSize);
+ SimdImplementations.FillRandom(matrix, actualSize, 42);
+ SimdImplementations.FillRandom(col, rows, 43);
+
+ // Baseline: nested scalar loops
+ var baseline = BenchFramework.Run(
+ () => SimdImplementations.AddColBroadcastScalar_Float64(matrix, col, result, rows, cols),
+ "S5_colBC", "Scalar", dtype, actualSize, elementBytes, warmup, measure, Suite);
+ results.Add(baseline);
+ BenchFramework.PrintResult(baseline);
+
+ // SIMD per row (using scalar broadcast within each row)
+ var simd = BenchFramework.Run(
+ () => SimdImplementations.AddColBroadcast_Float64(matrix, col, result, rows, cols),
+ "S5_colBC", "SIMD-SCALAR", dtype, actualSize, elementBytes, warmup, measure, Suite);
+ simd = simd with { SpeedupVsBaseline = baseline.MeanUs / simd.MeanUs };
+ results.Add(simd);
+ BenchFramework.PrintResult(simd);
+
+ SimdImplementations.FreeAligned(matrix);
+ SimdImplementations.FreeAligned(col);
+ SimdImplementations.FreeAligned(result);
+ break;
+ }
+
+ default:
+ Console.WriteLine($" S5 col broadcast: dtype {dtype} not implemented yet");
+ break;
+ }
+
+ return results;
+ }
+
+ #endregion
+}
diff --git a/benchmark/NumSharp.Benchmark.Exploration/Isolated/CombinedOptimizations.cs b/benchmark/NumSharp.Benchmark.Exploration/Isolated/CombinedOptimizations.cs
new file mode 100644
index 00000000..35d42e28
--- /dev/null
+++ b/benchmark/NumSharp.Benchmark.Exploration/Isolated/CombinedOptimizations.cs
@@ -0,0 +1,344 @@
+using System.Buffers;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using NumSharp.Benchmark.Exploration.Infrastructure;
+
+namespace NumSharp.Benchmark.Exploration.Isolated;
+
+/// <summary>
+/// Test combinations of single-threaded optimizations (NO PARALLEL).
+/// C1: Scalar baseline
+/// C2: SIMD only
+/// C3: SIMD + ArrayPool (chained ops)
+/// C4: In-place (+=)
+/// C5: SIMD + In-place
+/// C6: SIMD + Buffer reuse
+/// </summary>
+public static unsafe class CombinedOptimizations
+{
+ private const string Suite = "CombinedOptimizations";
+
+ ///
+ /// Run all combined optimization tests.
+ ///
+ public static List RunAll(bool quick = false)
+ {
+ var results = new List();
+ var warmup = quick ? 2 : BenchFramework.DefaultWarmup;
+ var measure = quick ? 5 : BenchFramework.DefaultMeasure;
+
+ var sizes = quick
+ ? new[] { 100_000, 10_000_000 }
+ : new[] { 1_000, 100_000, 1_000_000, 10_000_000 };
+
+ BenchFramework.PrintHeader($"{Suite}: Single-Threaded Optimization Combinations");
+
+ // Test 1: Single operation optimizations
+ foreach (var size in sizes)
+ {
+ BenchFramework.PrintDivider($"Single Operation: Size = {size:N0}");
+ results.AddRange(TestSingleOperation(size, warmup, measure));
+ }
+
+ // Test 2: Chained operations (a + b + c + d)
+ foreach (var size in sizes)
+ {
+ BenchFramework.PrintDivider($"Chained Operations (a+b+c+d): Size = {size:N0}");
+ results.AddRange(TestChainedOperations(size, warmup, measure));
+ }
+
+ // Test 3: In-place operations (+=)
+ foreach (var size in sizes)
+ {
+ BenchFramework.PrintDivider($"In-Place Operations (+=): Size = {size:N0}");
+ results.AddRange(TestInPlaceOperations(size, warmup, measure));
+ }
+
+ OutputFormatters.PrintSummary(results);
+ return results;
+ }
+
+ ///
+ /// Test single operation optimizations.
+ ///
+ public static List TestSingleOperation(int size, int warmup, int measure)
+ {
+ var results = new List();
+ int elementBytes = 8;
+
+        var lhs = SimdImplementations.AllocateAligned<double>(size);
+        var rhs = SimdImplementations.AllocateAligned<double>(size);
+        var result = SimdImplementations.AllocateAligned<double>(size);
+ SimdImplementations.FillRandom(lhs, size, 42);
+ SimdImplementations.FillRandom(rhs, size, 43);
+
+ // C1: Scalar baseline
+ var c1 = BenchFramework.Run(
+ () => SimdImplementations.AddScalarLoop_Float64(lhs, rhs, result, size),
+ "Single", "C1_Scalar", "float64", size, elementBytes, warmup, measure, Suite);
+ results.Add(c1);
+ BenchFramework.PrintResult(c1);
+
+ // C2: SIMD only
+ var c2 = BenchFramework.Run(
+ () => SimdImplementations.AddFull_Float64(lhs, rhs, result, size),
+ "Single", "C2_SIMD", "float64", size, elementBytes, warmup, measure, Suite);
+ c2 = c2 with { SpeedupVsBaseline = c1.MeanUs / c2.MeanUs };
+ results.Add(c2);
+ BenchFramework.PrintResult(c2);
+
+ SimdImplementations.FreeAligned(lhs);
+ SimdImplementations.FreeAligned(rhs);
+ SimdImplementations.FreeAligned(result);
+
+ return results;
+ }
+
+ ///
+ /// Test chained operations: a + b + c + d
+ /// Compare naive (4 allocations) vs pooled (1 allocation reused).
+ ///
+ public static List TestChainedOperations(int size, int warmup, int measure)
+ {
+ var results = new List();
+ int elementBytes = 8;
+
+        var a = SimdImplementations.AllocateAligned<double>(size);
+        var b = SimdImplementations.AllocateAligned<double>(size);
+        var c = SimdImplementations.AllocateAligned<double>(size);
+        var d = SimdImplementations.AllocateAligned<double>(size);
+ SimdImplementations.FillRandom(a, size, 42);
+ SimdImplementations.FillRandom(b, size, 43);
+ SimdImplementations.FillRandom(c, size, 44);
+ SimdImplementations.FillRandom(d, size, 45);
+
+ // Baseline: Scalar with allocations
+ var baseline = BenchFramework.RunWithSetup(
+ () => { }, // No setup needed
+ () =>
+ {
+ var t1 = new double[size];
+ var t2 = new double[size];
+ var result = new double[size];
+
+ fixed (double* pt1 = t1, pt2 = t2, pr = result)
+ {
+ SimdImplementations.AddScalarLoop_Float64(a, b, pt1, size);
+ SimdImplementations.AddScalarLoop_Float64(pt1, c, pt2, size);
+ SimdImplementations.AddScalarLoop_Float64(pt2, d, pr, size);
+ }
+ },
+ "Chained", "0_ScalarAlloc", "float64", size, elementBytes, warmup, measure, Suite);
+ results.Add(baseline);
+ BenchFramework.PrintResult(baseline);
+
+ // C1: SIMD with new allocations each time
+ var c1 = BenchFramework.RunWithSetup(
+ () => { },
+ () =>
+ {
+ var t1 = new double[size];
+ var t2 = new double[size];
+ var result = new double[size];
+
+ fixed (double* pt1 = t1, pt2 = t2, pr = result)
+ {
+ SimdImplementations.AddFull_Float64(a, b, pt1, size);
+ SimdImplementations.AddFull_Float64(pt1, c, pt2, size);
+ SimdImplementations.AddFull_Float64(pt2, d, pr, size);
+ }
+ },
+ "Chained", "C1_SimdAlloc", "float64", size, elementBytes, warmup, measure, Suite);
+ c1 = c1 with { SpeedupVsBaseline = baseline.MeanUs / c1.MeanUs };
+ results.Add(c1);
+ BenchFramework.PrintResult(c1);
+
+ // C2: SIMD with pre-allocated buffers (simulates NumSharp output allocation)
+        var t1Pre = SimdImplementations.AllocateAligned<double>(size);
+        var t2Pre = SimdImplementations.AllocateAligned<double>(size);
+        var resultPre = SimdImplementations.AllocateAligned<double>(size);
+
+ var c2 = BenchFramework.Run(
+ () =>
+ {
+ SimdImplementations.AddFull_Float64(a, b, t1Pre, size);
+ SimdImplementations.AddFull_Float64(t1Pre, c, t2Pre, size);
+ SimdImplementations.AddFull_Float64(t2Pre, d, resultPre, size);
+ },
+ "Chained", "C2_SimdPreAlloc", "float64", size, elementBytes, warmup, measure, Suite);
+ c2 = c2 with { SpeedupVsBaseline = baseline.MeanUs / c2.MeanUs };
+ results.Add(c2);
+ BenchFramework.PrintResult(c2);
+
+ // C3: SIMD with ArrayPool
+ var c3 = BenchFramework.Run(
+ () =>
+ {
+                var pool = ArrayPool<double>.Shared;
+ var t1Arr = pool.Rent(size);
+ var t2Arr = pool.Rent(size);
+ var resultArr = pool.Rent(size);
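+                // Rent returns an array with at least 'size' elements (often larger); only the
+                // first 'size' elements are touched below, so the extra capacity is harmless.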
+
+ fixed (double* pt1 = t1Arr, pt2 = t2Arr, pr = resultArr)
+ {
+ SimdImplementations.AddFull_Float64(a, b, pt1, size);
+ SimdImplementations.AddFull_Float64(pt1, c, pt2, size);
+ SimdImplementations.AddFull_Float64(pt2, d, pr, size);
+ }
+
+ pool.Return(t1Arr);
+ pool.Return(t2Arr);
+ pool.Return(resultArr);
+ },
+ "Chained", "C3_SimdPool", "float64", size, elementBytes, warmup, measure, Suite);
+ c3 = c3 with { SpeedupVsBaseline = baseline.MeanUs / c3.MeanUs };
+ results.Add(c3);
+ BenchFramework.PrintResult(c3);
+
+ // C4: SIMD with single buffer reuse (ping-pong)
+        var buf1 = SimdImplementations.AllocateAligned<double>(size);
+        var buf2 = SimdImplementations.AllocateAligned<double>(size);
+
+ var c4 = BenchFramework.Run(
+ () =>
+ {
+ SimdImplementations.AddFull_Float64(a, b, buf1, size);
+ SimdImplementations.AddFull_Float64(buf1, c, buf2, size);
+ SimdImplementations.AddFull_Float64(buf2, d, buf1, size);
+ // Result is in buf1
+ },
+ "Chained", "C4_PingPong", "float64", size, elementBytes, warmup, measure, Suite);
+ c4 = c4 with { SpeedupVsBaseline = baseline.MeanUs / c4.MeanUs };
+ results.Add(c4);
+ BenchFramework.PrintResult(c4);
+
+ SimdImplementations.FreeAligned(a);
+ SimdImplementations.FreeAligned(b);
+ SimdImplementations.FreeAligned(c);
+ SimdImplementations.FreeAligned(d);
+ SimdImplementations.FreeAligned(t1Pre);
+ SimdImplementations.FreeAligned(t2Pre);
+ SimdImplementations.FreeAligned(resultPre);
+ SimdImplementations.FreeAligned(buf1);
+ SimdImplementations.FreeAligned(buf2);
+
+ return results;
+ }
+
+ ///
+ /// Test in-place operations (a += b).
+ ///
+ public static List TestInPlaceOperations(int size, int warmup, int measure)
+ {
+ var results = new List();
+ int elementBytes = 8;
+
+        var a = SimdImplementations.AllocateAligned<double>(size);
+        var b = SimdImplementations.AllocateAligned<double>(size);
+        var result = SimdImplementations.AllocateAligned<double>(size);
+ SimdImplementations.FillRandom(b, size, 43);
+
+ // Baseline: Out-of-place scalar
+ var baseline = BenchFramework.RunWithSetup(
+ () => SimdImplementations.FillRandom(a, size, 42),
+ () => SimdImplementations.AddScalarLoop_Float64(a, b, result, size),
+ "InPlace", "0_OutOfPlace", "float64", size, elementBytes, warmup, measure, Suite);
+ results.Add(baseline);
+ BenchFramework.PrintResult(baseline);
+
+ // C1: In-place scalar (a += b)
+ var c1 = BenchFramework.RunWithSetup(
+ () => SimdImplementations.FillRandom(a, size, 42),
+ () => AddInPlace_Scalar(a, b, size),
+ "InPlace", "C1_InPlace", "float64", size, elementBytes, warmup, measure, Suite);
+ c1 = c1 with { SpeedupVsBaseline = baseline.MeanUs / c1.MeanUs };
+ results.Add(c1);
+ BenchFramework.PrintResult(c1);
+
+ // C2: Out-of-place SIMD
+ var c2 = BenchFramework.RunWithSetup(
+ () => SimdImplementations.FillRandom(a, size, 42),
+ () => SimdImplementations.AddFull_Float64(a, b, result, size),
+ "InPlace", "C2_SimdOut", "float64", size, elementBytes, warmup, measure, Suite);
+ c2 = c2 with { SpeedupVsBaseline = baseline.MeanUs / c2.MeanUs };
+ results.Add(c2);
+ BenchFramework.PrintResult(c2);
+
+ // C3: In-place SIMD (a += b)
+ var c3 = BenchFramework.RunWithSetup(
+ () => SimdImplementations.FillRandom(a, size, 42),
+ () => AddInPlace_Simd(a, b, size),
+ "InPlace", "C3_SimdIn", "float64", size, elementBytes, warmup, measure, Suite);
+ c3 = c3 with { SpeedupVsBaseline = baseline.MeanUs / c3.MeanUs };
+ results.Add(c3);
+ BenchFramework.PrintResult(c3);
+
+ SimdImplementations.FreeAligned(a);
+ SimdImplementations.FreeAligned(b);
+ SimdImplementations.FreeAligned(result);
+
+ return results;
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static void AddInPlace_Scalar(double* a, double* b, int count)
+ {
+ for (int i = 0; i < count; i++)
+ {
+ a[i] += b[i];
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void AddInPlace_Simd(double* a, double* b, int count)
+ {
+ // In-place: a += b, so result goes back to a
+ SimdImplementations.AddFull_Float64(a, b, a, count);
+ }
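+
+    // Rough traffic arithmetic (illustrative): an out-of-place float64 add streams three arrays
+    // (read a, read b, write result), roughly 24 bytes/element, while a += b touches only two
+    // arrays, roughly 16 bytes/element, so once the operation is memory-bound the in-place form
+    // moves about two-thirds of the data.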
+
+ ///
+ /// Test ArrayPool overhead specifically.
+ ///
+ public static List TestPoolOverhead(int size, int iterations, int warmup, int measure)
+ {
+ var results = new List();
+
+ BenchFramework.PrintDivider($"ArrayPool Overhead (size={size:N0}, iterations={iterations})");
+
+ // Baseline: New allocation each time
+ var baseline = BenchFramework.Run(
+ () =>
+ {
+ for (int i = 0; i < iterations; i++)
+ {
+ var arr = new double[size];
+ // Touch array to ensure allocation
+ arr[0] = 1.0;
+ arr[size - 1] = 1.0;
+ }
+ },
+ "PoolOverhead", "New", "float64", size, 8, warmup, measure, Suite);
+ results.Add(baseline);
+ BenchFramework.PrintResult(baseline);
+
+ // ArrayPool
+        var pool = ArrayPool<double>.Shared;
+ var pooled = BenchFramework.Run(
+ () =>
+ {
+ for (int i = 0; i < iterations; i++)
+ {
+ var arr = pool.Rent(size);
+ arr[0] = 1.0;
+ arr[size - 1] = 1.0;
+ pool.Return(arr);
+ }
+ },
+ "PoolOverhead", "ArrayPool", "float64", size, 8, warmup, measure, Suite);
+ pooled = pooled with { SpeedupVsBaseline = baseline.MeanUs / pooled.MeanUs };
+ results.Add(pooled);
+ BenchFramework.PrintResult(pooled);
+
+ return results;
+ }
+}
diff --git a/benchmark/NumSharp.Benchmark.Exploration/Isolated/DispatchOverhead.cs b/benchmark/NumSharp.Benchmark.Exploration/Isolated/DispatchOverhead.cs
new file mode 100644
index 00000000..a437de5e
--- /dev/null
+++ b/benchmark/NumSharp.Benchmark.Exploration/Isolated/DispatchOverhead.cs
@@ -0,0 +1,303 @@
+using System.Runtime.CompilerServices;
+using NumSharp.Benchmark.Exploration.Infrastructure;
+
+namespace NumSharp.Benchmark.Exploration.Isolated;
+
+/// <summary>
+/// Measure the overhead of different dispatch mechanisms.
+/// Goal: Determine acceptable dispatch overhead budget.
+/// </summary>
+public static unsafe class DispatchOverhead
+{
+ private const string Suite = "DispatchOverhead";
+
+ ///
+ /// Run all dispatch overhead benchmarks.
+ ///
+ public static List RunAll(bool quick = false)
+ {
+ var results = new List();
+ var warmup = quick ? 3 : BenchFramework.DefaultWarmup;
+ var measure = quick ? 15 : 30; // More iterations for small overhead measurement
+
+ var sizes = quick
+ ? new[] { 1_000, 100_000 }
+ : new[] { 100, 1_000, 10_000, 100_000, 1_000_000 };
+
+ BenchFramework.PrintHeader($"{Suite}: Dispatch Mechanism Overhead");
+
+ foreach (var size in sizes)
+ {
+ BenchFramework.PrintDivider($"Size: {size:N0}");
+ results.AddRange(MeasureDispatchOverhead(size, warmup, measure));
+ }
+
+ OutputFormatters.PrintSummary(results);
+ return results;
+ }
+
+ ///
+ /// Measure overhead of various dispatch mechanisms.
+ ///
+ public static List MeasureDispatchOverhead(int size, int warmup, int measure)
+ {
+ var results = new List();
+ int elementBytes = 8;
+
+        var lhs = SimdImplementations.AllocateAligned<double>(size);
+        var rhs = SimdImplementations.AllocateAligned<double>(size);
+        var result = SimdImplementations.AllocateAligned<double>(size);
+ SimdImplementations.FillRandom(lhs, size, 42);
+ SimdImplementations.FillRandom(rhs, size, 43);
+
+ // Test 1: Direct function call (baseline)
+ var baseline = BenchFramework.Run(
+ () => SimdImplementations.AddFull_Float64(lhs, rhs, result, size),
+ "Dispatch", "1_Direct", "float64", size, elementBytes, warmup, measure, Suite);
+ results.Add(baseline);
+ BenchFramework.PrintResult(baseline);
+
+ // Test 2: Single if/else dispatch
+ var ifElse = BenchFramework.Run(
+ () => DispatchIfElse(lhs, rhs, result, size, isContiguous: true),
+ "Dispatch", "2_IfElse", "float64", size, elementBytes, warmup, measure, Suite);
+ ifElse = ifElse with { SpeedupVsBaseline = baseline.MeanUs / ifElse.MeanUs };
+ results.Add(ifElse);
+ BenchFramework.PrintResult(ifElse);
+
+ // Test 3: Nested if/else dispatch (simulate shape checks)
+ var nested = BenchFramework.Run(
+ () => DispatchNested(lhs, rhs, result, size, isContiguous: true, isBroadcast: false, isScalar: false),
+ "Dispatch", "3_Nested", "float64", size, elementBytes, warmup, measure, Suite);
+ nested = nested with { SpeedupVsBaseline = baseline.MeanUs / nested.MeanUs };
+ results.Add(nested);
+ BenchFramework.PrintResult(nested);
+
+ // Test 4: Switch dispatch
+ var switchDisp = BenchFramework.Run(
+ () => DispatchSwitch(lhs, rhs, result, size, scenario: 0),
+ "Dispatch", "4_Switch", "float64", size, elementBytes, warmup, measure, Suite);
+ switchDisp = switchDisp with { SpeedupVsBaseline = baseline.MeanUs / switchDisp.MeanUs };
+ results.Add(switchDisp);
+ BenchFramework.PrintResult(switchDisp);
+
+ // Test 5: Virtual method dispatch
+ IAddOperation operation = new SimdAddOperation();
+ var virtualDisp = BenchFramework.Run(
+ () => operation.Execute(lhs, rhs, result, size),
+ "Dispatch", "5_Virtual", "float64", size, elementBytes, warmup, measure, Suite);
+ virtualDisp = virtualDisp with { SpeedupVsBaseline = baseline.MeanUs / virtualDisp.MeanUs };
+ results.Add(virtualDisp);
+ BenchFramework.PrintResult(virtualDisp);
+
+ // Test 6: Delegate invocation
+ AddDelegate del = SimdImplementations.AddFull_Float64;
+ var delegateDisp = BenchFramework.Run(
+ () => del(lhs, rhs, result, size),
+ "Dispatch", "6_Delegate", "float64", size, elementBytes, warmup, measure, Suite);
+ delegateDisp = delegateDisp with { SpeedupVsBaseline = baseline.MeanUs / delegateDisp.MeanUs };
+ results.Add(delegateDisp);
+ BenchFramework.PrintResult(delegateDisp);
+
+ // Test 7: Function pointer
+        delegate* managed<double*, double*, double*, int, void> funcPtr = &SimdImplementations.AddFull_Float64;
+ var funcPtrDisp = BenchFramework.Run(
+ () => funcPtr(lhs, rhs, result, size),
+ "Dispatch", "7_FuncPtr", "float64", size, elementBytes, warmup, measure, Suite);
+ funcPtrDisp = funcPtrDisp with { SpeedupVsBaseline = baseline.MeanUs / funcPtrDisp.MeanUs };
+ results.Add(funcPtrDisp);
+ BenchFramework.PrintResult(funcPtrDisp);
+
+ SimdImplementations.FreeAligned(lhs);
+ SimdImplementations.FreeAligned(rhs);
+ SimdImplementations.FreeAligned(result);
+
+ return results;
+ }
+
+ #region Dispatch Implementations
+
+ private delegate void AddDelegate(double* lhs, double* rhs, double* result, int count);
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static void DispatchIfElse(double* lhs, double* rhs, double* result, int size, bool isContiguous)
+ {
+ if (isContiguous)
+ {
+ SimdImplementations.AddFull_Float64(lhs, rhs, result, size);
+ }
+ else
+ {
+ SimdImplementations.AddScalarLoop_Float64(lhs, rhs, result, size);
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static void DispatchNested(double* lhs, double* rhs, double* result, int size,
+ bool isContiguous, bool isBroadcast, bool isScalar)
+ {
+ if (isScalar)
+ {
+ // Scalar path (not used in this test)
+ SimdImplementations.AddScalar_Float64(lhs, 0.0, result, size);
+ }
+ else if (isContiguous && !isBroadcast)
+ {
+ SimdImplementations.AddFull_Float64(lhs, rhs, result, size);
+ }
+ else if (isBroadcast)
+ {
+ // Broadcast path (not used in this test)
+ SimdImplementations.AddScalarLoop_Float64(lhs, rhs, result, size);
+ }
+ else
+ {
+ SimdImplementations.AddScalarLoop_Float64(lhs, rhs, result, size);
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static void DispatchSwitch(double* lhs, double* rhs, double* result, int size, int scenario)
+ {
+ switch (scenario)
+ {
+ case 0: // S1: Contiguous
+ SimdImplementations.AddFull_Float64(lhs, rhs, result, size);
+ break;
+ case 1: // S2: Scalar broadcast
+ SimdImplementations.AddScalar_Float64(lhs, 0.0, result, size);
+ break;
+ case 2: // S4: Row broadcast
+ SimdImplementations.AddScalarLoop_Float64(lhs, rhs, result, size);
+ break;
+ default:
+ SimdImplementations.AddScalarLoop_Float64(lhs, rhs, result, size);
+ break;
+ }
+ }
+
+ private interface IAddOperation
+ {
+ void Execute(double* lhs, double* rhs, double* result, int size);
+ }
+
+ private class SimdAddOperation : IAddOperation
+ {
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public void Execute(double* lhs, double* rhs, double* result, int size)
+ {
+ SimdImplementations.AddFull_Float64(lhs, rhs, result, size);
+ }
+ }
+
+ #endregion
+
+ ///
+ /// Measure overhead of property checks (simulated shape properties).
+ ///
+ public static List MeasurePropertyCheckOverhead(int iterations, int warmup, int measure)
+ {
+ var results = new List();
+
+ BenchFramework.PrintDivider("Property Check Overhead");
+
+ // Simulate shape-like object with properties
+ var shape = new SimulatedShape(1000, 1000, isContiguous: true, isBroadcast: false);
+
+ // Test 1: No property checks (baseline)
+ bool result1 = false;
+ var baseline = BenchFramework.Run(
+ () =>
+ {
+ for (int i = 0; i < iterations; i++)
+ {
+ result1 = true; // Just assign
+ }
+ },
+ "PropCheck", "0_None", "N/A", iterations, 1, warmup, measure, Suite);
+ results.Add(baseline);
+ BenchFramework.PrintResult(baseline);
+
+ // Test 2: Single property check
+ var single = BenchFramework.Run(
+ () =>
+ {
+ for (int i = 0; i < iterations; i++)
+ {
+ result1 = shape.IsContiguous;
+ }
+ },
+ "PropCheck", "1_Single", "N/A", iterations, 1, warmup, measure, Suite);
+ single = single with { SpeedupVsBaseline = baseline.MeanUs / single.MeanUs };
+ results.Add(single);
+ BenchFramework.PrintResult(single);
+
+ // Test 3: Multiple property checks
+ var multi = BenchFramework.Run(
+ () =>
+ {
+ for (int i = 0; i < iterations; i++)
+ {
+ result1 = shape.IsContiguous && !shape.IsBroadcast && shape.Size > 0;
+ }
+ },
+ "PropCheck", "2_Multi", "N/A", iterations, 1, warmup, measure, Suite);
+ multi = multi with { SpeedupVsBaseline = baseline.MeanUs / multi.MeanUs };
+ results.Add(multi);
+ BenchFramework.PrintResult(multi);
+
+ // Test 4: Computed property
+ var computed = BenchFramework.Run(
+ () =>
+ {
+ for (int i = 0; i < iterations; i++)
+ {
+ result1 = shape.IsContiguousComputed;
+ }
+ },
+ "PropCheck", "3_Computed", "N/A", iterations, 1, warmup, measure, Suite);
+ computed = computed with { SpeedupVsBaseline = baseline.MeanUs / computed.MeanUs };
+ results.Add(computed);
+ BenchFramework.PrintResult(computed);
+
+ // Keep result to prevent optimization
+ _ = result1;
+
+ return results;
+ }
+
+ private class SimulatedShape
+ {
+ private readonly int[] _dimensions;
+ private readonly int[] _strides;
+ private readonly bool _isContiguous;
+ private readonly bool _isBroadcast;
+
+ public SimulatedShape(int rows, int cols, bool isContiguous, bool isBroadcast)
+ {
+ _dimensions = [rows, cols];
+ _strides = [cols, 1];
+ _isContiguous = isContiguous;
+ _isBroadcast = isBroadcast;
+ }
+
+ public bool IsContiguous => _isContiguous;
+ public bool IsBroadcast => _isBroadcast;
+ public int Size => _dimensions[0] * _dimensions[1];
+
+ // Computed property that checks strides
+ public bool IsContiguousComputed
+ {
+ get
+ {
+ int expected = 1;
+ for (int i = _dimensions.Length - 1; i >= 0; i--)
+ {
+ if (_strides[i] != expected) return false;
+ expected *= _dimensions[i];
+ }
+ return true;
+ }
+ }
+ }
+}
diff --git a/benchmark/NumSharp.Benchmark.Exploration/Isolated/MemoryPatterns.cs b/benchmark/NumSharp.Benchmark.Exploration/Isolated/MemoryPatterns.cs
new file mode 100644
index 00000000..6006ea5e
--- /dev/null
+++ b/benchmark/NumSharp.Benchmark.Exploration/Isolated/MemoryPatterns.cs
@@ -0,0 +1,333 @@
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+using NumSharp.Benchmark.Exploration.Infrastructure;
+
+namespace NumSharp.Benchmark.Exploration.Isolated;
+
+/// <summary>
+/// Test memory access patterns and cache behavior.
+/// Investigates when gather/scatter and prefetch help.
+/// </summary>
+public static unsafe class MemoryPatterns
+{
+ private const string Suite = "MemoryPatterns";
+
+ ///
+ /// Run all memory pattern tests.
+ ///
+ public static List RunAll(bool quick = false)
+ {
+ var results = new List();
+ var warmup = quick ? 2 : BenchFramework.DefaultWarmup;
+ var measure = quick ? 5 : BenchFramework.DefaultMeasure;
+
+ var sizes = quick
+ ? new[] { 100_000, 1_000_000 }
+ : new[] { 10_000, 100_000, 1_000_000, 10_000_000 };
+
+ BenchFramework.PrintHeader($"{Suite}: Memory Access Patterns & Cache Behavior");
+
+ // Test 1: Strided access patterns
+ foreach (var size in sizes)
+ {
+ results.AddRange(TestStridedAccess(size, warmup, measure));
+ }
+
+ // Test 2: Gather vs Copy-then-SIMD
+ foreach (var size in sizes)
+ {
+ results.AddRange(TestGatherVsCopy(size, warmup, measure));
+ }
+
+ OutputFormatters.PrintSummary(results);
+ return results;
+ }
+
+ ///
+ /// Test strided access performance degradation.
+ ///
+ public static List TestStridedAccess(int totalElements, int warmup, int measure)
+ {
+ var results = new List();
+ int elementBytes = 8;
+
+ BenchFramework.PrintDivider($"Strided Access: {totalElements:N0} total elements");
+
+ var strides = new[] { 1, 2, 4, 8, 16, 32, 64 };
+
+ foreach (var stride in strides)
+ {
+ // Allocate enough to hold strided access
+ int actualSize = totalElements * stride;
+ if (actualSize > 100_000_000) continue; // Skip if too large
+
+            var src = SimdImplementations.AllocateAligned<double>(actualSize);
+            var dst = SimdImplementations.AllocateAligned<double>(totalElements);
+ SimdImplementations.FillRandom(src, actualSize, 42);
+
+ // Measure strided read + sequential write
+ var result = BenchFramework.Run(
+ () => StridedRead(src, dst, totalElements, stride),
+ "Strided", $"Stride{stride}", "float64", totalElements, elementBytes, warmup, measure, Suite);
+ results.Add(result);
+
+ // Calculate effective bandwidth
+ var seqBandwidth = results.FirstOrDefault(r => r.Strategy == "Stride1")?.GBps ?? result.GBps;
+ var efficiency = result.GBps / seqBandwidth;
+ Console.WriteLine($" Stride {stride,2}: {result.MeanUs,10:F2} us | {result.GBps,8:F2} GB/s | Efficiency: {efficiency * 100:F1}%");
+
+ SimdImplementations.FreeAligned(src);
+ SimdImplementations.FreeAligned(dst);
+ }
+
+ return results;
+ }
+
+ ///
+ /// Compare Gather intrinsic vs Copy-to-contiguous-then-SIMD.
+ ///
+ public static List TestGatherVsCopy(int totalElements, int warmup, int measure)
+ {
+ var results = new List();
+ int elementBytes = 8;
+
+ BenchFramework.PrintDivider($"Gather vs Copy: {totalElements:N0} elements");
+
+ // Test with stride=2 (every other element)
+ int stride = 2;
+ int actualSize = totalElements * stride;
+
+        var src = SimdImplementations.AllocateAligned<double>(actualSize);
+        var other = SimdImplementations.AllocateAligned<double>(totalElements);
+        var result = SimdImplementations.AllocateAligned<double>(totalElements);
+        var tempContiguous = SimdImplementations.AllocateAligned<double>(totalElements);
+ SimdImplementations.FillRandom(src, actualSize, 42);
+ SimdImplementations.FillRandom(other, totalElements, 43);
+
+ // Method 1: Scalar strided loop
+ var scalar = BenchFramework.Run(
+ () => StridedAdd_Scalar(src, other, result, totalElements, stride),
+ "GatherVsCopy", "1_Scalar", "float64", totalElements, elementBytes, warmup, measure, Suite);
+ results.Add(scalar);
+ BenchFramework.PrintResult(scalar);
+
+ // Method 2: Copy to contiguous, then SIMD
+ var copySimd = BenchFramework.Run(
+ () =>
+ {
+ // Copy strided to contiguous
+ for (int i = 0; i < totalElements; i++)
+ {
+ tempContiguous[i] = src[i * stride];
+ }
+ // SIMD add
+ SimdImplementations.AddFull_Float64(tempContiguous, other, result, totalElements);
+ },
+ "GatherVsCopy", "2_CopySimd", "float64", totalElements, elementBytes, warmup, measure, Suite);
+ copySimd = copySimd with { SpeedupVsBaseline = scalar.MeanUs / copySimd.MeanUs };
+ results.Add(copySimd);
+ BenchFramework.PrintResult(copySimd);
+
+ // Method 3: AVX2 Gather (if supported)
+ if (Avx2.IsSupported && totalElements >= 4)
+ {
+ var gather = BenchFramework.Run(
+ () => StridedAdd_Gather(src, other, result, totalElements, stride),
+ "GatherVsCopy", "3_Gather", "float64", totalElements, elementBytes, warmup, measure, Suite);
+ gather = gather with { SpeedupVsBaseline = scalar.MeanUs / gather.MeanUs };
+ results.Add(gather);
+ BenchFramework.PrintResult(gather);
+ }
+ else
+ {
+ Console.WriteLine(" 3_Gather: AVX2 not supported");
+ }
+
+ SimdImplementations.FreeAligned(src);
+ SimdImplementations.FreeAligned(other);
+ SimdImplementations.FreeAligned(result);
+ SimdImplementations.FreeAligned(tempContiguous);
+
+ return results;
+ }
+
+ #region Implementation
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static void StridedRead(double* src, double* dst, int count, int stride)
+ {
+ for (int i = 0; i < count; i++)
+ {
+ dst[i] = src[i * stride];
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static void StridedAdd_Scalar(double* strided, double* contiguous, double* result, int count, int stride)
+ {
+ for (int i = 0; i < count; i++)
+ {
+ result[i] = strided[i * stride] + contiguous[i];
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static void StridedAdd_Gather(double* strided, double* contiguous, double* result, int count, int stride)
+ {
+ // Using AVX2 GatherVector256 for double
+ // Each gather loads 4 doubles at specified indices
+
+ int i = 0;
+        int vecSize = Vector256<double>.Count; // 4
+
+ if (Avx2.IsSupported && count >= vecSize)
+ {
+            // Element-index vector [0, stride, 2*stride, 3*stride]; the sizeof(double)
+            // scaling comes from the gather's scale argument (8), not from the indices.
+ var indices = Vector256.Create(0, stride, stride * 2, stride * 3);
+
+ int vectorCount = count - vecSize + 1;
+ for (; i < vectorCount; i += vecSize)
+ {
+ // Gather from strided source
+ var gathered = Avx2.GatherVector256(strided + i * stride, indices, 8);
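+                // Lanes load strided[(i+0)*stride] .. strided[(i+3)*stride] in a single 256-bit gather.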
+
+ // Load contiguous
+ var cont = Avx.LoadVector256(contiguous + i);
+
+ // Add
+ var sum = Avx.Add(gathered, cont);
+
+ // Store
+ Avx.Store(result + i, sum);
+ }
+ }
+
+ // Scalar tail
+ for (; i < count; i++)
+ {
+ result[i] = strided[i * stride] + contiguous[i];
+ }
+ }
+
+ #endregion
+
+ ///
+ /// Test cache line effects.
+ ///
+ public static List TestCacheLines(int warmup, int measure)
+ {
+ var results = new List();
+
+ BenchFramework.PrintDivider("Cache Line Effects");
+
+ // Test accessing elements at different cache line boundaries
+ // Typical cache line is 64 bytes = 8 doubles
+ var sizes = new[] { 8, 16, 32, 64, 128, 256, 512 }; // Elements per "chunk"
+
+ int totalIterations = 1_000_000;
+
+ foreach (var chunkSize in sizes)
+ {
+ int totalElements = chunkSize * 1000; // 1000 chunks
+            var data = SimdImplementations.AllocateAligned<double>(totalElements);
+ SimdImplementations.FillRandom(data, totalElements, 42);
+
+ double sum = 0;
+ var result = BenchFramework.Run(
+ () =>
+ {
+ sum = 0;
+ for (int chunk = 0; chunk < totalElements / chunkSize; chunk++)
+ {
+ // Access first element of each chunk
+ sum += data[chunk * chunkSize];
+ }
+ },
+ "CacheLine", $"Chunk{chunkSize}", "float64", totalIterations / chunkSize, 8, warmup, measure, Suite);
+ results.Add(result);
+ Console.WriteLine($" Chunk size {chunkSize,4}: {result.MeanUs,10:F2} us | sum={sum:F2}");
+
+ SimdImplementations.FreeAligned(data);
+ }
+
+ return results;
+ }
+
+ ///
+ /// Test non-temporal (streaming) stores.
+ ///
+ public static List TestNonTemporalStores(int size, int warmup, int measure)
+ {
+ var results = new List();
+ int elementBytes = 8;
+
+ BenchFramework.PrintDivider($"Non-Temporal Stores: {size:N0} elements");
+
+        var src = SimdImplementations.AllocateAligned<double>(size);
+        var dst = SimdImplementations.AllocateAligned<double>(size);
+ SimdImplementations.FillRandom(src, size, 42);
+
+ // Normal stores
+ var normal = BenchFramework.Run(
+ () => CopyNormal(src, dst, size),
+ "NTStore", "Normal", "float64", size, elementBytes, warmup, measure, Suite);
+ results.Add(normal);
+ BenchFramework.PrintResult(normal);
+
+ // Non-temporal stores (streaming)
+ if (Avx2.IsSupported)
+ {
+ var nt = BenchFramework.Run(
+ () => CopyNonTemporal(src, dst, size),
+ "NTStore", "NonTemp", "float64", size, elementBytes, warmup, measure, Suite);
+ nt = nt with { SpeedupVsBaseline = normal.MeanUs / nt.MeanUs };
+ results.Add(nt);
+ BenchFramework.PrintResult(nt);
+ }
+
+ SimdImplementations.FreeAligned(src);
+ SimdImplementations.FreeAligned(dst);
+
+ return results;
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static void CopyNormal(double* src, double* dst, int count)
+ {
+ int i = 0;
+        if (Avx2.IsSupported && count >= Vector256<double>.Count)
+        {
+            int vectorCount = count - Vector256<double>.Count + 1;
+            for (; i < vectorCount; i += Vector256<double>.Count)
+ {
+ var v = Avx.LoadVector256(src + i);
+ Avx.Store(dst + i, v);
+ }
+ }
+ for (; i < count; i++)
+ {
+ dst[i] = src[i];
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static void CopyNonTemporal(double* src, double* dst, int count)
+ {
+ int i = 0;
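+        // StoreAlignedNonTemporal below requires a 32-byte-aligned destination; dst comes from
+        // AllocateAligned and i advances in whole 256-bit vectors, so dst + i stays aligned
+        // (assuming the allocator returns 32-byte-aligned buffers).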
+        if (Avx2.IsSupported && count >= Vector256<double>.Count)
+        {
+            int vectorCount = count - Vector256<double>.Count + 1;
+            for (; i < vectorCount; i += Vector256<double>.Count)
+ {
+ var v = Avx.LoadVector256(src + i);
+ Avx.StoreAlignedNonTemporal(dst + i, v);
+ }
+ }
+ for (; i < count; i++)
+ {
+ dst[i] = src[i];
+ }
+ }
+}
diff --git a/benchmark/NumSharp.Benchmark.Exploration/Isolated/SimdStrategies.cs b/benchmark/NumSharp.Benchmark.Exploration/Isolated/SimdStrategies.cs
new file mode 100644
index 00000000..67587431
--- /dev/null
+++ b/benchmark/NumSharp.Benchmark.Exploration/Isolated/SimdStrategies.cs
@@ -0,0 +1,340 @@
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+using NumSharp.Benchmark.Exploration.Infrastructure;
+
+namespace NumSharp.Benchmark.Exploration.Isolated;
+
+/// <summary>
+/// Compare different SIMD strategies for row broadcast scenario.
+/// This is the most complex common scenario where multiple strategies are viable.
+/// </summary>
+public static unsafe class SimdStrategies
+{
+ private const string Suite = "SimdStrategies";
+
+ ///
+ /// Run all strategy comparisons.
+ ///
+ public static List RunAll(bool quick = false)
+ {
+ var results = new List();
+ var warmup = quick ? 2 : BenchFramework.DefaultWarmup;
+ var measure = quick ? 5 : BenchFramework.DefaultMeasure;
+
+ // Test with different matrix shapes
+ var shapes = quick
+ ? new[] { (100, 100), (1000, 1000) }
+ : new[] { (100, 100), (316, 316), (500, 500), (1000, 1000), (3162, 3162) };
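+        // 316^2 is roughly 1e5 and 3162^2 roughly 1e7 elements, so these 2-D shapes line up
+        // with the 1-D element counts used by the other suites (illustrative note).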
+
+ BenchFramework.PrintHeader($"{Suite}: Row Broadcast Strategy Comparison");
+
+ foreach (var (rows, cols) in shapes)
+ {
+ BenchFramework.PrintDivider($"Shape: ({rows}, {cols}) = {rows * cols:N0} elements");
+ results.AddRange(CompareStrategies_Float64(rows, cols, warmup, measure));
+ }
+
+ OutputFormatters.PrintSummary(results);
+ return results;
+ }
+
+ ///
+ /// Compare all strategies for row broadcast with float64.
+ /// matrix[M, N] + row[N] = result[M, N]
+ ///
+ public static List CompareStrategies_Float64(int rows, int cols, int warmup, int measure)
+ {
+ var results = new List();
+ int size = rows * cols;
+ int elementBytes = 8;
+
+        var matrix = SimdImplementations.AllocateAligned<double>(size);
+        var row = SimdImplementations.AllocateAligned<double>(cols);
+        var result = SimdImplementations.AllocateAligned<double>(size);
+ SimdImplementations.FillRandom(matrix, size, 42);
+ SimdImplementations.FillRandom(row, cols, 43);
+
+ // Strategy 1: Pure scalar with GetOffset-like indexing
+ var s1 = BenchFramework.Run(
+ () => Strategy1_ScalarGetOffset(matrix, row, result, rows, cols),
+ "S4_rowBC", "1_ScalarOfs", "float64", size, elementBytes, warmup, measure, Suite);
+ results.Add(s1);
+ BenchFramework.PrintResult(s1);
+
+ // Strategy 2: Nested loops with scalar inner
+ var s2 = BenchFramework.Run(
+ () => Strategy2_NestedScalar(matrix, row, result, rows, cols),
+ "S4_rowBC", "2_Nested", "float64", size, elementBytes, warmup, measure, Suite);
+ s2 = s2 with { SpeedupVsBaseline = s1.MeanUs / s2.MeanUs };
+ results.Add(s2);
+ BenchFramework.PrintResult(s2);
+
+ // Strategy 3: Nested loops with SIMD inner (SIMD-CHUNK)
+ var s3 = BenchFramework.Run(
+ () => Strategy3_NestedSimdInner(matrix, row, result, rows, cols),
+ "S4_rowBC", "3_SimdChunk", "float64", size, elementBytes, warmup, measure, Suite);
+ s3 = s3 with { SpeedupVsBaseline = s1.MeanUs / s3.MeanUs };
+ results.Add(s3);
+ BenchFramework.PrintResult(s3);
+
+ // Strategy 4: Pretend contiguous (incorrect but shows ceiling)
+ // This strategy is NOT correct for broadcasting but shows max SIMD throughput
+ var s4 = BenchFramework.Run(
+ () => Strategy4_FlatSimd(matrix, row, result, rows, cols),
+ "S4_rowBC", "4_FlatSimd*", "float64", size, elementBytes, warmup, measure, Suite);
+ s4 = s4 with { SpeedupVsBaseline = s1.MeanUs / s4.MeanUs, Notes = "INCORRECT - ceiling only" };
+ results.Add(s4);
+ BenchFramework.PrintResult(s4);
+
+ // Strategy 5: Copy row to temp buffer, use flat SIMD
+        var rowExpanded = SimdImplementations.AllocateAligned<double>(size);
+ var s5 = BenchFramework.Run(
+ () => Strategy5_ExpandThenSimd(matrix, row, rowExpanded, result, rows, cols),
+ "S4_rowBC", "5_Expand", "float64", size, elementBytes, warmup, measure, Suite);
+ s5 = s5 with { SpeedupVsBaseline = s1.MeanUs / s5.MeanUs };
+ results.Add(s5);
+ BenchFramework.PrintResult(s5);
+
+ // Strategy 6: Unrolled SIMD inner loop
+ var s6 = BenchFramework.Run(
+ () => Strategy6_UnrolledSimdInner(matrix, row, result, rows, cols),
+ "S4_rowBC", "6_Unrolled", "float64", size, elementBytes, warmup, measure, Suite);
+ s6 = s6 with { SpeedupVsBaseline = s1.MeanUs / s6.MeanUs };
+ results.Add(s6);
+ BenchFramework.PrintResult(s6);
+
+ SimdImplementations.FreeAligned(matrix);
+ SimdImplementations.FreeAligned(row);
+ SimdImplementations.FreeAligned(result);
+ SimdImplementations.FreeAligned(rowExpanded);
+
+ return results;
+ }
+
+ #region Strategy Implementations
+
+ ///
+ /// Strategy 1: Pure scalar with linear index calculation (simulates GetOffset).
+ ///
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static void Strategy1_ScalarGetOffset(double* matrix, double* row, double* result, int rows, int cols)
+ {
+ for (int i = 0; i < rows * cols; i++)
+ {
+ int c = i % cols; // Simulates GetOffset calculation
+ result[i] = matrix[i] + row[c];
+ }
+ }
+
+ ///
+ /// Strategy 2: Nested loops with direct indexing, scalar inner.
+ ///
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static void Strategy2_NestedScalar(double* matrix, double* row, double* result, int rows, int cols)
+ {
+ for (int r = 0; r < rows; r++)
+ {
+ var offset = r * cols;
+ for (int c = 0; c < cols; c++)
+ {
+ result[offset + c] = matrix[offset + c] + row[c];
+ }
+ }
+ }
+
+ ///
+ /// Strategy 3: Nested loops with SIMD inner (SIMD-CHUNK strategy).
+ ///
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static void Strategy3_NestedSimdInner(double* matrix, double* row, double* result, int rows, int cols)
+ {
+ for (int r = 0; r < rows; r++)
+ {
+ var matrixRow = matrix + r * cols;
+ var resultRow = result + r * cols;
+
+ int c = 0;
+            if (Avx2.IsSupported && cols >= Vector256<double>.Count)
+            {
+                int vectorCount = cols - Vector256<double>.Count + 1;
+                for (; c < vectorCount; c += Vector256<double>.Count)
+ {
+ var vm = Avx.LoadVector256(matrixRow + c);
+ var vr = Avx.LoadVector256(row + c);
+ var vres = Avx.Add(vm, vr);
+ Avx.Store(resultRow + c, vres);
+ }
+ }
+
+ // Scalar tail
+ for (; c < cols; c++)
+ {
+ resultRow[c] = matrixRow[c] + row[c];
+ }
+ }
+ }
+
+ ///
+ /// Strategy 4: Flat SIMD ignoring broadcast (INCORRECT but shows ceiling).
+ /// This pretends both arrays are the same shape - shows max possible throughput.
+ ///
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static void Strategy4_FlatSimd(double* matrix, double* row, double* result, int rows, int cols)
+ {
+ // NOTE: This is INCORRECT for broadcasting - it ignores row repetition
+ // It's only here to show the maximum throughput ceiling
+ int size = rows * cols;
+ SimdImplementations.AddFull_Float64(matrix, matrix, result, size); // Using matrix twice
+ }
+
+ ///
+ /// Strategy 5: Expand row to full matrix, then flat SIMD.
+ /// Trades memory for simpler SIMD loop.
+ ///
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static void Strategy5_ExpandThenSimd(double* matrix, double* row, double* rowExpanded, double* result, int rows, int cols)
+ {
+ // Step 1: Expand row to full matrix
+ for (int r = 0; r < rows; r++)
+ {
+ Buffer.MemoryCopy(row, rowExpanded + r * cols, cols * sizeof(double), cols * sizeof(double));
+ }
+
+ // Step 2: Flat SIMD add
+ SimdImplementations.AddFull_Float64(matrix, rowExpanded, result, rows * cols);
+ }
+
+ ///
+ /// Strategy 6: Unrolled SIMD inner loop (2x unroll).
+ ///
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ private static void Strategy6_UnrolledSimdInner(double* matrix, double* row, double* result, int rows, int cols)
+ {
+ const int Unroll = 2;
+        int vectorSize = Vector256<double>.Count;
+ int unrollSize = vectorSize * Unroll;
+
+ for (int r = 0; r < rows; r++)
+ {
+ var matrixRow = matrix + r * cols;
+ var resultRow = result + r * cols;
+
+ int c = 0;
+
+ if (Avx2.IsSupported && cols >= unrollSize)
+ {
+ int vectorCount = cols - unrollSize + 1;
+ for (; c < vectorCount; c += unrollSize)
+ {
+ // Load 2 vectors from matrix
+ var vm0 = Avx.LoadVector256(matrixRow + c);
+ var vm1 = Avx.LoadVector256(matrixRow + c + vectorSize);
+
+ // Load 2 vectors from row
+ var vr0 = Avx.LoadVector256(row + c);
+ var vr1 = Avx.LoadVector256(row + c + vectorSize);
+
+ // Add
+ var vres0 = Avx.Add(vm0, vr0);
+ var vres1 = Avx.Add(vm1, vr1);
+
+ // Store
+ Avx.Store(resultRow + c, vres0);
+ Avx.Store(resultRow + c + vectorSize, vres1);
+ }
+ }
+
+ // Non-unrolled SIMD
+ if (Avx2.IsSupported)
+ {
+ int vectorCount = cols - vectorSize + 1;
+ for (; c < vectorCount; c += vectorSize)
+ {
+ var vm = Avx.LoadVector256(matrixRow + c);
+ var vr = Avx.LoadVector256(row + c);
+ var vres = Avx.Add(vm, vr);
+ Avx.Store(resultRow + c, vres);
+ }
+ }
+
+ // Scalar tail
+ for (; c < cols; c++)
+ {
+ resultRow[c] = matrixRow[c] + row[c];
+ }
+ }
+ }
+
+ #endregion
+
+ ///
+ /// Test Vector256 vs Vector128 performance.
+ ///
+ public static List CompareVectorWidths(int size, int warmup, int measure)
+ {
+ var results = new List();
+ int elementBytes = 8;
+
+        var lhs = SimdImplementations.AllocateAligned<double>(size);
+        var rhs = SimdImplementations.AllocateAligned<double>(size);
+        var result = SimdImplementations.AllocateAligned<double>(size);
+ SimdImplementations.FillRandom(lhs, size, 42);
+ SimdImplementations.FillRandom(rhs, size, 43);
+
+ BenchFramework.PrintDivider($"Vector Width Comparison (size={size:N0})");
+
+ // Scalar baseline
+ var scalar = BenchFramework.Run(
+ () => SimdImplementations.AddScalarLoop_Float64(lhs, rhs, result, size),
+ "VecWidth", "Scalar", "float64", size, elementBytes, warmup, measure, Suite);
+ results.Add(scalar);
+ BenchFramework.PrintResult(scalar);
+
+ // Vector128
+ var v128 = BenchFramework.Run(
+ () => AddVector128(lhs, rhs, result, size),
+ "VecWidth", "Vector128", "float64", size, elementBytes, warmup, measure, Suite);
+ v128 = v128 with { SpeedupVsBaseline = scalar.MeanUs / v128.MeanUs };
+ results.Add(v128);
+ BenchFramework.PrintResult(v128);
+
+ // Vector256
+ var v256 = BenchFramework.Run(
+ () => SimdImplementations.AddFull_Float64(lhs, rhs, result, size),
+ "VecWidth", "Vector256", "float64", size, elementBytes, warmup, measure, Suite);
+ v256 = v256 with { SpeedupVsBaseline = scalar.MeanUs / v256.MeanUs };
+ results.Add(v256);
+ BenchFramework.PrintResult(v256);
+
+ SimdImplementations.FreeAligned(lhs);
+ SimdImplementations.FreeAligned(rhs);
+ SimdImplementations.FreeAligned(result);
+
+ return results;
+ }
+
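+    /// <summary>
+    /// Vector128 (SSE2) add used as the 128-bit comparison point; 2 doubles per vector.
+    /// </summary>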
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private static void AddVector128(double* lhs, double* rhs, double* result, int count)
+ {
+ int i = 0;
+
+        if (Sse2.IsSupported && count >= Vector128<double>.Count)
+        {
+            int vectorCount = count - Vector128<double>.Count + 1;
+            for (; i < vectorCount; i += Vector128<double>.Count)
+ {
+ var va = Sse2.LoadVector128(lhs + i);
+ var vb = Sse2.LoadVector128(rhs + i);
+ var vr = Sse2.Add(va, vb);
+ Sse2.Store(result + i, vr);
+ }
+ }
+
+ for (; i < count; i++)
+ {
+ result[i] = lhs[i] + rhs[i];
+ }
+ }
+}
diff --git a/benchmark/NumSharp.Benchmark.Exploration/Isolated/SizeThresholds.cs b/benchmark/NumSharp.Benchmark.Exploration/Isolated/SizeThresholds.cs
new file mode 100644
index 00000000..91737244
--- /dev/null
+++ b/benchmark/NumSharp.Benchmark.Exploration/Isolated/SizeThresholds.cs
@@ -0,0 +1,372 @@
+using System.Runtime.InteropServices;
+using NumSharp.Benchmark.Exploration.Infrastructure;
+
+namespace NumSharp.Benchmark.Exploration.Isolated;
+
+/// <summary>
+/// Discover size thresholds where SIMD starts helping.
+/// Tests fine-grained sizes to find crossover points per dtype.
+/// </summary>
+public static unsafe class SizeThresholds
+{
+ private const string Suite = "SizeThresholds";
+
+    /// <summary>
+ /// Size range optimized for threshold discovery.
+    /// </summary>
+ public static readonly int[] ThresholdSizes = [
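+        // Dense, roughly geometric spacing so the scalar/SIMD crossover can be pinpointed per dtype.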
+ 8, 16, 24, 32, 48, 64, 96, 128,
+ 192, 256, 384, 512, 768, 1024,
+ 1536, 2048, 3072, 4096, 6144, 8192,
+ 12288, 16384, 24576, 32768, 49152, 65536,
+ 98304, 131072
+ ];
+
+    /// <summary>
+ /// Run threshold discovery for all dtypes.
+    /// </summary>
+ public static List RunAll(string[]? dtypes = null, bool quick = false)
+ {
+ dtypes ??= quick ? Dtypes.Common : Dtypes.All;
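+        // Quick mode keeps the small (32..1024) and large (>=16384) sizes and skips the mid range.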
+ var sizes = quick ? ThresholdSizes.Where(s => s >= 32 && (s <= 1024 || s >= 16384)).ToArray() : ThresholdSizes;
+ var warmup = quick ? 2 : BenchFramework.DefaultWarmup;
+ var measure = quick ? 5 : BenchFramework.DefaultMeasure;
+
+ var results = new List();
+
+ BenchFramework.PrintHeader($"{Suite}: SIMD Threshold Discovery");
+
+ foreach (var dtype in dtypes)
+ {
+ BenchFramework.PrintDivider($"dtype={dtype}");
+ results.AddRange(FindThreshold(dtype, sizes, warmup, measure));
+ }
+
+ // Summary: find crossover points
+ PrintCrossoverSummary(results);
+
+ return results;
+ }
+
+    /// <summary>
+ /// Find the threshold where SIMD becomes faster for a specific dtype.
+    /// </summary>
+ public static List FindThreshold(string dtype, int[] sizes, int warmup, int measure)
+ {
+ var results = new List();
+ var elementBytes = DtypeInfo.GetElementSize(dtype);
+
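+        // Each dtype case allocates its own typed buffers and compares scalar vs SIMD at each size.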
+ switch (dtype.ToLowerInvariant())
+ {
+ case "float64" or "double":
+ {
+ foreach (var size in sizes)
+ {
+ var lhs = SimdImplementations.AllocateAligned(size);
+ var rhs = SimdImplementations.AllocateAligned(size);
+ var result = SimdImplementations.AllocateAligned(size);
+ SimdImplementations.FillRandom(lhs, size, 42);
+ SimdImplementations.FillRandom(rhs, size, 43);
+
+ var baseline = BenchFramework.Run(
+ () => SimdImplementations.AddScalarLoop_Float64(lhs, rhs, result, size),
+ "Threshold", "Scalar", dtype, size, elementBytes, warmup, measure, Suite);
+
+ var simd = BenchFramework.Run(
+ () => SimdImplementations.AddFull_Float64(lhs, rhs, result, size),
+ "Threshold", "SIMD", dtype, size, elementBytes, warmup, measure, Suite);
+ simd = simd with { SpeedupVsBaseline = baseline.MeanUs / simd.MeanUs };
+
+ results.Add(baseline);
+ results.Add(simd);
+
+ // Compact output for threshold discovery
+ var winner = simd.SpeedupVsBaseline > 1.0 ? "SIMD" : "Scalar";
+ var icon = simd.SpeedupVsBaseline > 1.0 ? "✓" : " ";
+ Console.WriteLine($" {size,8:N0} | Scalar: {baseline.MeanUs,10:F2} us | SIMD: {simd.MeanUs,10:F2} us | {simd.SpeedupVsBaseline,6:F2}x {icon}");
+
+ SimdImplementations.FreeAligned(lhs);
+ SimdImplementations.FreeAligned(rhs);
+ SimdImplementations.FreeAligned(result);
+ }
+ break;
+ }
+
+ case "float32" or "single" or "float":
+ {
+ foreach (var size in sizes)
+ {
+ var lhs = SimdImplementations.AllocateAligned(size);
+ var rhs = SimdImplementations.AllocateAligned(size);
+ var result = SimdImplementations.AllocateAligned