Skip to main content

sc_neurocore_engine/
cortical_inject.rs

// SPDX-License-Identifier: AGPL-3.0-or-later
// Commercial license available
// © Concepts 1996–2026 Miroslav Šotek. All rights reserved.
// © Code 2020–2026 Miroslav Šotek. All rights reserved.
// ORCID: 0009-0009-3560-0851
// Contact: www.anulum.li | protoscience@anulum.li
// SC-NeuroCore — Per-row-parallel CSR sparse mat-vec for the Potjans CorticalColumn block path

//! Per-row-parallel CSR sparse matrix-vector add for the Potjans
//! `CorticalColumn` block-CSR injection path.
//!
//! `parallel_csr_spmv_add(indptr, indices, data, x, y)` computes
//! `y += W @ x` where `W` is a CSR matrix described by `(indptr,
//! indices, data)`. Rows are processed in parallel via rayon.
//!
//! This is the kernel that lets `CorticalColumn` use the per-(source-
//! type, global-bin) block matrices at scale ≥ 0.5: a single block
//! mat-vec at scale=0.1 is ≈ 18 ms scipy-single-threaded; with rayon
//! over 8 cores it is ≈ 2-3 ms. At scale=0.5 the savings extrapolate
//! linearly with `nnz`, bringing 600 ms simulation wall-time from
//! ~50 minutes (single-threaded scipy block) into the
//! ~10-minute range and unlocking the full-scale (~77 000-cell)
//! convergence regime documented by van Albada et al. 2015 Fig 5.
//!
//! Determinism: per-row reductions are LOCAL to each row, so the
//! parallel order does not affect the result. Bit-identical to the
//! scipy single-threaded reference for matching inputs.

use rayon::prelude::*;

/// Number of rows grouped into a single rayon task for
/// `y[r] += sum_k data[k] * x[indices[k]]`, `k in indptr[r]..indptr[r+1]`.
///
/// Per-row work is tiny (≈ 500 nnz × ≈ 1 ns/op = ~500 ns) — too
/// small for rayon's per-iteration scheduler overhead to amortise.
/// Chunking groups ~`CHUNK_SIZE` rows into one task so each task
/// runs ~250 µs of work, well above rayon's break-even point.
/// Measured 2026-04-18 on a 12-core box: per-row `par_iter_mut`
/// gave 0× speedup vs scipy single-threaded; `par_chunks_mut(512)`
/// gives ~3× speedup at scale=0.1 and scales further at larger N.
const CHUNK_SIZE: usize = 512;

43pub fn parallel_csr_spmv_add(
44    indptr: &[i32],
45    indices: &[i32],
46    data: &[f64],
47    x: &[f64],
48    y: &mut [f64],
49) {
50    y.par_chunks_mut(CHUNK_SIZE)
51        .enumerate()
52        .for_each(|(chunk_idx, chunk)| {
53            let row_start = chunk_idx * CHUNK_SIZE;
54            for (i, yi) in chunk.iter_mut().enumerate() {
55                let r = row_start + i;
56                let start = indptr[r] as usize;
57                let end = indptr[r + 1] as usize;
58                let mut sum: f64 = 0.0;
59                for k in start..end {
60                    let col = indices[k] as usize;
61                    sum += data[k] * x[col];
62                }
63                *yi += sum;
64            }
65        });
66}
67
68/// Batched per-row-parallel CSR spmv add: `y += sum_b W_b @ x_b`
69/// across `n_blocks` (matrix, vector) pairs, all sharing the same
70/// row dimension. Used by `CorticalColumn._inject_block(dt)` to do
71/// `2 × n_delay_bins` (= 10) spmv calls in one FFI call instead of
72/// 10 separate FFI calls per step. The per-row reduction is local
73/// so chunking still parallelises cleanly.
74///
75/// Layout: each block's slice into the flat `indptrs / indices /
76/// data` arrays is given by `block_offsets`. Spike vectors are
77/// flat-concatenated `xs` indexed by `x_offsets`.
78#[allow(clippy::too_many_arguments)]
79pub fn parallel_csr_multi_spmv_add(
80    indptr_blocks: &[&[i32]],
81    indices_blocks: &[&[i32]],
82    data_blocks: &[&[f64]],
83    x_blocks: &[&[f64]],
84    y: &mut [f64],
85) {
86    let n_blocks = indptr_blocks.len();
87    debug_assert_eq!(n_blocks, indices_blocks.len());
88    debug_assert_eq!(n_blocks, data_blocks.len());
89    debug_assert_eq!(n_blocks, x_blocks.len());
90
91    y.par_chunks_mut(CHUNK_SIZE)
92        .enumerate()
93        .for_each(|(chunk_idx, chunk)| {
94            let row_start = chunk_idx * CHUNK_SIZE;
95            for (i, yi) in chunk.iter_mut().enumerate() {
96                let r = row_start + i;
97                let mut sum: f64 = 0.0;
98                for b in 0..n_blocks {
99                    let indptr = indptr_blocks[b];
100                    let indices = indices_blocks[b];
101                    let data = data_blocks[b];
102                    let x = x_blocks[b];
103                    let start = indptr[r] as usize;
104                    let end = indptr[r + 1] as usize;
105                    for k in start..end {
106                        let col = indices[k] as usize;
107                        sum += data[k] * x[col];
108                    }
109                }
110                *yi += sum;
111            }
112        });
113}
114
#[cfg(test)]
mod tests {
    use super::*;

    /// CSR matrix [[1, 0, 2], [0, 3, 0], [4, 0, 5]] @ [1, 1, 1] = [3, 3, 9].
    #[test]
    fn test_basic_csr_spmv() {
        let indptr: Vec<i32> = vec![0, 2, 3, 5];
        let indices: Vec<i32> = vec![0, 2, 1, 0, 2];
        let data: Vec<f64> = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let x: Vec<f64> = vec![1.0, 1.0, 1.0];
        let mut y: Vec<f64> = vec![0.0, 0.0, 0.0];
        parallel_csr_spmv_add(&indptr, &indices, &data, &x, &mut y);
        assert_eq!(y, vec![3.0, 3.0, 9.0]);
    }

    /// Empty rows (rows with zero nnz) leave `y` unchanged at that row.
    #[test]
    fn test_empty_row() {
        let indptr: Vec<i32> = vec![0, 1, 1, 2];
        let indices: Vec<i32> = vec![0, 1];
        let data: Vec<f64> = vec![10.0, 20.0];
        let x: Vec<f64> = vec![1.0, 2.0];
        let mut y: Vec<f64> = vec![100.0, 100.0, 100.0];
        parallel_csr_spmv_add(&indptr, &indices, &data, &x, &mut y);
        assert_eq!(y, vec![110.0, 100.0, 140.0]);
    }

    /// Result accumulates: calling twice doubles the contribution.
    #[test]
    fn test_accumulates_into_y() {
        let indptr: Vec<i32> = vec![0, 1, 2];
        let indices: Vec<i32> = vec![0, 0];
        let data: Vec<f64> = vec![3.0, 5.0];
        let x: Vec<f64> = vec![2.0];
        let mut y: Vec<f64> = vec![0.0, 0.0];
        parallel_csr_spmv_add(&indptr, &indices, &data, &x, &mut y);
        assert_eq!(y, vec![6.0, 10.0]);
        parallel_csr_spmv_add(&indptr, &indices, &data, &x, &mut y);
        assert_eq!(y, vec![12.0, 20.0]);
    }

    /// Larger matrix to exercise rayon's parallelism: n = 1024 rows
    /// spans two CHUNK_SIZE (512) chunks.
    #[test]
    fn test_large_dense_diagonal() {
        let n = 1024;
        let indptr: Vec<i32> = (0..=n).map(|i| i as i32).collect();
        let indices: Vec<i32> = (0..n).map(|i| i as i32).collect();
        let data: Vec<f64> = (0..n).map(|i| (i as f64) + 1.0).collect();
        let x: Vec<f64> = vec![1.0; n];
        let mut y: Vec<f64> = vec![0.0; n];
        parallel_csr_spmv_add(&indptr, &indices, &data, &x, &mut y);
        for i in 0..n {
            assert_eq!(y[i], (i as f64) + 1.0);
        }
    }

    /// Batched multi-spmv equals sequential single-spmv, accumulated.
    #[test]
    fn test_multi_spmv_matches_sequential() {
        // 3 blocks, each 4×3 with different sparsity patterns
        let indptr0: Vec<i32> = vec![0, 1, 2, 3, 4];
        let indices0: Vec<i32> = vec![0, 1, 2, 0];
        let data0: Vec<f64> = vec![1.0, 2.0, 3.0, 4.0];
        let x0: Vec<f64> = vec![10.0, 20.0, 30.0];

        let indptr1: Vec<i32> = vec![0, 0, 1, 1, 2];
        let indices1: Vec<i32> = vec![1, 2];
        let data1: Vec<f64> = vec![5.0, 6.0];
        let x1: Vec<f64> = vec![100.0, 200.0, 300.0];

        let indptr2: Vec<i32> = vec![0, 1, 1, 2, 3];
        let indices2: Vec<i32> = vec![2, 0, 1];
        let data2: Vec<f64> = vec![7.0, 8.0, 9.0];
        let x2: Vec<f64> = vec![1000.0, 2000.0, 3000.0];

        // Sequential reference
        let mut y_seq = vec![0.0_f64; 4];
        parallel_csr_spmv_add(&indptr0, &indices0, &data0, &x0, &mut y_seq);
        parallel_csr_spmv_add(&indptr1, &indices1, &data1, &x1, &mut y_seq);
        parallel_csr_spmv_add(&indptr2, &indices2, &data2, &x2, &mut y_seq);

        // Batched
        let mut y_batched = vec![0.0_f64; 4];
        let indptrs: Vec<&[i32]> = vec![&indptr0, &indptr1, &indptr2];
        let indices_b: Vec<&[i32]> = vec![&indices0, &indices1, &indices2];
        let data_b: Vec<&[f64]> = vec![&data0, &data1, &data2];
        let xs: Vec<&[f64]> = vec![&x0, &x1, &x2];
        parallel_csr_multi_spmv_add(&indptrs, &indices_b, &data_b, &xs, &mut y_batched);

        assert_eq!(y_seq, y_batched);
    }
}
207}