Skip to main content

sc_neurocore_engine/simd/
sve.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later | Commercial license available
2// © Concepts 1996–2026 Miroslav Šotek. All rights reserved.
3// © Code 2020–2026 Miroslav Šotek. All rights reserved.
4// ORCID: 0009-0009-3560-0851
5// Contact: www.anulum.li | protoscience@anulum.li
6// SC-NeuroCore — Scalar fallback for ARM SVE targets
7
//! Scalar fallback for ARM SVE targets.
//! Hardware SVE intrinsics are not yet implemented; every operation in this
//! module runs portable scalar code, either by delegating to helpers in
//! `crate::bitstream` or through plain iterator loops.
//!
//! SVE operates on variable-length vectors (128–2048 bits depending on
//! hardware).  When Rust stabilises `core::arch::aarch64` SVE intrinsics,
//! replace the bodies below with predicated vector loops.
//!
//! Build with:
//!   RUSTFLAGS="-C target-feature=+sve" cargo build --target aarch64-unknown-linux-gnu
18
/// Pack a `u8` bit array into `u64` words (variant compiled for SVE-enabled
/// AArch64 builds).
///
/// No SVE instructions are issued yet: the call is forwarded to
/// `crate::bitstream::pack_fast` and the `data` field of its result is
/// returned.
///
/// # Safety
/// Caller must ensure the target CPU supports SVE.
#[cfg(all(target_arch = "aarch64", target_feature = "sve"))]
pub unsafe fn pack_sve(bits: &[u8]) -> Vec<u64> {
    // Planned vector path: consume VL bytes per iteration with svld1_u8 and
    // gather the bits (svptrue_b8 / svld1_u8 / svlsr_n_u8_x / svorr_u8_x).
    // Until the SVE intrinsics are stabilised (they are nightly-only as of
    // Rust 1.82) the portable packer does the work.
    let packed = crate::bitstream::pack_fast(bits);
    packed.data
}
31
32/// Pack u8 bit array into u64 words (portable fallback).
33///
34/// # Safety
35/// No hardware requirements in fallback mode.
36#[cfg(not(all(target_arch = "aarch64", target_feature = "sve")))]
37pub unsafe fn pack_sve(bits: &[u8]) -> Vec<u64> {
38    crate::bitstream::pack_fast(bits).data
39}
40
/// Count the set bits in a slice of packed words (variant compiled for
/// SVE-enabled AArch64 builds).
///
/// No SVE instructions are issued yet: the count is produced by
/// `crate::bitstream::popcount_words_portable`.
///
/// # Safety
/// Caller must ensure the target CPU supports SVE.
#[cfg(all(target_arch = "aarch64", target_feature = "sve"))]
pub unsafe fn popcount_sve(data: &[u64]) -> u64 {
    // The intended vector path uses the per-element popcount intrinsic
    // svcnt_u64_x; it is pending stabilisation of the
    // core::arch::aarch64 SVE intrinsics.
    let total = crate::bitstream::popcount_words_portable(data);
    total
}
51
52/// Count set bits (portable fallback).
53///
54/// # Safety
55/// No hardware requirements in fallback mode.
56#[cfg(not(all(target_arch = "aarch64", target_feature = "sve")))]
57pub unsafe fn popcount_sve(data: &[u64]) -> u64 {
58    crate::bitstream::popcount_words_portable(data)
59}
60
/// Fused AND + popcount (variant compiled for SVE-enabled AArch64 builds).
///
/// Returns the number of bits set in `a[i] & b[i]`, summed over the indices
/// present in both slices; surplus words of the longer slice are ignored.
/// The body is currently scalar code.
///
/// # Safety
/// Caller must ensure the target CPU supports SVE.
#[cfg(all(target_arch = "aarch64", target_feature = "sve"))]
pub unsafe fn fused_and_popcount_sve(a: &[u64], b: &[u64]) -> u64 {
    // Intended vector path: svand_u64_x + svcnt_u64_x inside one predicated
    // loop, pending intrinsic stabilisation.
    let mut total = 0u64;
    // `zip` ends with the shorter slice, so only the common prefix is visited.
    for (wa, wb) in a.iter().zip(b.iter()) {
        total += u64::from((wa & wb).count_ones());
    }
    total
}
76
/// Fused AND + popcount (variant compiled when the build is not AArch64 with
/// the `sve` target feature).
///
/// Returns the number of bits set in `a[i] & b[i]`, summed over the indices
/// present in both slices; surplus words of the longer slice are ignored.
///
/// # Safety
/// No hardware requirements in fallback mode.
#[cfg(not(all(target_arch = "aarch64", target_feature = "sve")))]
pub unsafe fn fused_and_popcount_sve(a: &[u64], b: &[u64]) -> u64 {
    // `zip` ends with the shorter slice, so only the common prefix is visited.
    a.iter()
        .zip(b.iter())
        .fold(0u64, |acc, (wa, wb)| acc + u64::from((wa & wb).count_ones()))
}
90
/// Fused XOR + popcount (portable implementation, no SVE instructions).
///
/// Returns the number of bits set in `a[i] ^ b[i]`, summed over the indices
/// present in both slices; surplus words of the longer slice are ignored.
///
/// # Safety
/// No hardware requirements (portable implementation).
pub unsafe fn fused_xor_popcount_sve(a: &[u64], b: &[u64]) -> u64 {
    let mut differing = 0u64;
    // `zip` ends with the shorter slice, so only the common prefix is visited.
    for (wa, wb) in a.iter().zip(b.iter()) {
        differing += u64::from((wa ^ wb).count_ones());
    }
    differing
}
103
104// --- f64 operations (portable fallback, SVE intrinsics pending stabilisation) ---
105
/// Dot product of two `f64` slices (portable implementation).
///
/// Multiplies elements pairwise and sums the products in index order. Only
/// the indices present in both slices contribute; surplus elements of the
/// longer slice are ignored.
///
/// # Safety
/// No hardware requirements (portable implementation).
pub unsafe fn dot_f64_sve(a: &[f64], b: &[f64]) -> f64 {
    // `zip` ends with the shorter slice, so only the common prefix is visited.
    a.iter().zip(b.iter()).map(|(x, y)| x * y).sum()
}
112
/// Largest value in an `f64` slice (portable implementation).
///
/// The running maximum starts at `f64::NEG_INFINITY`, which is therefore the
/// result for an empty slice. Elements are combined with `f64::max`.
///
/// # Safety
/// No hardware requirements (portable implementation).
pub unsafe fn max_f64_sve(a: &[f64]) -> f64 {
    let mut best = f64::NEG_INFINITY;
    for &value in a {
        best = best.max(value);
    }
    best
}
118
/// Sum of all elements of an `f64` slice, accumulated in index order
/// (portable implementation).
///
/// # Safety
/// No hardware requirements (portable implementation).
pub unsafe fn sum_f64_sve(a: &[f64]) -> f64 {
    a.iter().copied().sum::<f64>()
}
124
/// Multiply every element of `y` by `alpha`, in place (portable
/// implementation).
///
/// # Safety
/// No hardware requirements (portable implementation).
pub unsafe fn scale_f64_sve(alpha: f64, y: &mut [f64]) {
    y.iter_mut().for_each(|elem| *elem *= alpha);
}
132
133/// Hamming distance between two packed bitstream slices.
134///
135/// # Safety
136/// No hardware requirements (portable implementation).
137pub unsafe fn hamming_distance_sve(a: &[u64], b: &[u64]) -> u64 {
138    fused_xor_popcount_sve(a, b)
139}
140
141/// In-place softmax (portable fallback for SVE).
142///
143/// # Safety
144/// No hardware requirements (portable implementation).
145pub unsafe fn softmax_inplace_f64_sve(scores: &mut [f64]) {
146    if scores.is_empty() {
147        return;
148    }
149    let max_val = max_f64_sve(scores);
150    for s in scores.iter_mut() {
151        *s = (*s - max_val).exp();
152    }
153    let exp_sum = sum_f64_sve(scores);
154    if exp_sum > 0.0 {
155        scale_f64_sve(1.0 / exp_sum, scores);
156    }
157}
158
/// Unit tests for the SVE entry points.
///
/// Expected values are computed by hand or with scalar `count_ones`, so the
/// tests hold whichever cfg variant of a function is compiled in.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn sve_popcount_matches_portable() {
        // 64 set bits + 0 set bits + 32 set bits (alternating pattern).
        let data: Vec<u64> = vec![u64::MAX, 0x0, 0xAAAA_AAAA_AAAA_AAAA];
        let got = unsafe { popcount_sve(&data) };
        assert_eq!(got, 64 + 32);
    }

    #[test]
    fn sve_softmax_sums_to_one() {
        let mut scores: Vec<f64> = (0..20).map(|i| (i as f64 * 0.5) - 5.0).collect();
        unsafe { softmax_inplace_f64_sve(&mut scores) };
        let total: f64 = scores.iter().sum();
        assert!((total - 1.0).abs() < 1e-10);
        assert!(scores.iter().all(|&s| s >= 0.0));
    }

    #[test]
    fn sve_softmax_empty_is_noop() {
        let mut scores: Vec<f64> = Vec::new();
        unsafe { softmax_inplace_f64_sve(&mut scores) };
        assert!(scores.is_empty());
    }

    #[test]
    fn sve_softmax_uniform_input() {
        // Equal scores: every exponential is exp(0) = 1, the sum is 4, and
        // scaling by 1/4 gives exactly 0.25 each.
        let mut scores = [1.0f64; 4];
        unsafe { softmax_inplace_f64_sve(&mut scores) };
        assert_eq!(scores, [0.25; 4]);
    }

    #[test]
    fn sve_hamming_distance() {
        let a = vec![0xFFu64, 0x00];
        let b = vec![0x0Fu64, 0x00];
        let expected = u64::from((0xFFu64 ^ 0x0F).count_ones());
        let got = unsafe { hamming_distance_sve(&a, &b) };
        assert_eq!(got, expected);
    }

    #[test]
    fn sve_fused_and_popcount() {
        let a = vec![0xFFu64, 0xF0];
        let b = vec![0x0Fu64, 0xFF];
        let expected =
            u64::from((0xFFu64 & 0x0F).count_ones()) + u64::from((0xF0u64 & 0xFF).count_ones());
        let got = unsafe { fused_and_popcount_sve(&a, &b) };
        assert_eq!(got, expected);
    }

    #[test]
    fn sve_fused_and_popcount_uses_common_prefix() {
        // The third word of `a` has no partner in `b` and must be ignored.
        let a = [0xFFu64, 0xF0, u64::MAX];
        let b = [0x0Fu64, 0xFF];
        let got = unsafe { fused_and_popcount_sve(&a, &b) };
        assert_eq!(got, 8);
    }

    #[test]
    fn sve_fused_xor_popcount() {
        // 0xFFFF ^ 0x00FF = 0xFF00 (8 bits); the surplus words are ignored.
        let a = [0xFFFFu64, 0x1, 0x1];
        let b = [0x00FFu64];
        let got = unsafe { fused_xor_popcount_sve(&a, &b) };
        assert_eq!(got, 8);
    }

    #[test]
    fn sve_dot_f64() {
        let got = unsafe { dot_f64_sve(&[1.0, 2.0, 3.0], &[4.0, 5.0, 6.0]) };
        assert_eq!(got, 32.0);
        let truncated = unsafe { dot_f64_sve(&[1.0, 2.0, 3.0], &[2.0]) };
        assert_eq!(truncated, 2.0);
    }

    #[test]
    fn sve_max_f64() {
        let got = unsafe { max_f64_sve(&[-1.0, 3.5, 2.0]) };
        assert_eq!(got, 3.5);
        let empty = unsafe { max_f64_sve(&[]) };
        assert_eq!(empty, f64::NEG_INFINITY);
    }

    #[test]
    fn sve_sum_f64() {
        let got = unsafe { sum_f64_sve(&[1.5, 2.5, -1.0]) };
        assert_eq!(got, 3.0);
        let empty = unsafe { sum_f64_sve(&[]) };
        assert_eq!(empty, 0.0);
    }

    #[test]
    fn sve_scale_f64() {
        let mut values = [1.0f64, -2.0, 0.5];
        unsafe { scale_f64_sve(2.0, &mut values) };
        assert_eq!(values, [2.0, -4.0, 1.0]);
    }
}