Skip to main content

sc_neurocore_engine/simd/
sve.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Commercial license available
3// © Concepts 1996–2026 Miroslav Šotek. All rights reserved.
4// © Code 2020–2026 Miroslav Šotek. All rights reserved.
5// ORCID: 0009-0009-3560-0851
6// Contact: www.anulum.li | protoscience@anulum.li
7// SC-NeuroCore — Scalar fallback for ARM SVE targets
8
9//! Scalar fallback for ARM SVE targets.
//! Hardware SVE intrinsics are not yet implemented; every operation in this
//! file runs portable scalar code, either through a `crate::bitstream` helper
//! or through a plain iterator loop.
12//!
13//! SVE operates on variable-length vectors (128–2048 bits depending on
14//! hardware).  When Rust stabilises `core::arch::aarch64` SVE intrinsics,
15//! replace the bodies below with predicated vector loops.
16//!
17//! Build with:
18//!   RUSTFLAGS="-C target-feature=+sve" cargo build --target aarch64-unknown-linux-gnu
19
/// Pack a `u8` bit array into `u64` words (variant compiled for SVE builds).
///
/// No SVE instructions are issued yet: the input is handed to
/// `crate::bitstream::pack_fast` and the `data` field of its result is
/// returned.
///
/// # Safety
/// Caller must ensure the target CPU supports SVE.
#[cfg(all(target_arch = "aarch64", target_feature = "sve"))]
pub unsafe fn pack_sve(bits: &[u8]) -> Vec<u64> {
    // Planned vector path: consume VL bytes per iteration with a
    // svptrue_b8 / svld1_u8 / svlsr_n_u8_x / svorr_u8_x pipeline.  The SVE
    // intrinsics are nightly-only as of Rust 1.82, so the portable packer is
    // used until they are stabilised.
    let packed = crate::bitstream::pack_fast(bits);
    packed.data
}
32
33/// Pack u8 bit array into u64 words (portable fallback).
34///
35/// # Safety
36/// No hardware requirements in fallback mode.
37#[cfg(not(all(target_arch = "aarch64", target_feature = "sve")))]
38pub unsafe fn pack_sve(bits: &[u8]) -> Vec<u64> {
39    crate::bitstream::pack_fast(bits).data
40}
41
/// Count the set bits across `data` (variant compiled for SVE builds).
///
/// No SVE instructions are issued yet; the work is delegated to
/// `crate::bitstream::popcount_words_portable`.
///
/// # Safety
/// Caller must ensure the target CPU supports SVE.
#[cfg(all(target_arch = "aarch64", target_feature = "sve"))]
pub unsafe fn popcount_sve(data: &[u64]) -> u64 {
    // Planned vector path: svcnt_u64_x (BCNT) gives a per-element popcount.
    // It is blocked on stabilisation of the core::arch::aarch64 SVE
    // intrinsics, so the portable routine is used for now.
    crate::bitstream::popcount_words_portable(data)
}
52
53/// Count set bits (portable fallback).
54///
55/// # Safety
56/// No hardware requirements in fallback mode.
57#[cfg(not(all(target_arch = "aarch64", target_feature = "sve")))]
58pub unsafe fn popcount_sve(data: &[u64]) -> u64 {
59    crate::bitstream::popcount_words_portable(data)
60}
61
/// Popcount of the word-wise AND of `a` and `b` (variant compiled for SVE
/// builds).
///
/// Words are paired by index.  When the slices differ in length, only the
/// common prefix contributes to the result.  No SVE instructions are issued
/// yet.
///
/// # Safety
/// Caller must ensure the target CPU supports SVE.
#[cfg(all(target_arch = "aarch64", target_feature = "sve"))]
pub unsafe fn fused_and_popcount_sve(a: &[u64], b: &[u64]) -> u64 {
    // Planned vector path: svand_u64_x + svcnt_u64_x inside one predicated
    // loop, pending intrinsic stabilisation.
    //
    // `zip` ends at the shorter input, which limits the work to the common
    // prefix without slicing either argument.
    a.iter()
        .zip(b)
        .map(|(lhs, rhs)| u64::from((lhs & rhs).count_ones()))
        .sum()
}
77
/// Popcount of the word-wise AND of `a` and `b` (variant compiled when SVE is
/// not enabled for the target).
///
/// Words are paired by index.  When the slices differ in length, only the
/// common prefix contributes to the result.
///
/// # Safety
/// No hardware requirements in fallback mode.
#[cfg(not(all(target_arch = "aarch64", target_feature = "sve")))]
pub unsafe fn fused_and_popcount_sve(a: &[u64], b: &[u64]) -> u64 {
    let mut total = 0u64;
    // `zip` ends at the shorter input, so surplus words in the longer slice
    // are never visited.
    for (lhs, rhs) in a.iter().zip(b) {
        total += u64::from((lhs & rhs).count_ones());
    }
    total
}
91
/// Popcount of the word-wise XOR of `a` and `b`.
///
/// This is a portable implementation; it does not use SVE instructions.
/// Words are paired by index, and when the slices differ in length only the
/// common prefix contributes to the result.
///
/// # Safety
/// No hardware requirements (portable implementation).
pub unsafe fn fused_xor_popcount_sve(a: &[u64], b: &[u64]) -> u64 {
    // `zip` ends at the shorter input, so no explicit truncation is needed.
    a.iter().zip(b).fold(0u64, |differing, (lhs, rhs)| {
        differing + u64::from((lhs ^ rhs).count_ones())
    })
}
104
105// --- f64 operations (portable fallback, SVE intrinsics pending stabilisation) ---
106
/// Dot product of two `f64` slices.
///
/// Elements are paired by index and the products are accumulated in order.
/// When the slices differ in length, only the common prefix is used.
///
/// # Safety
/// No hardware requirements (portable implementation).
pub unsafe fn dot_f64_sve(a: &[f64], b: &[f64]) -> f64 {
    // `zip` ends at the shorter input, so surplus elements are ignored.
    a.iter().zip(b).map(|(lhs, rhs)| lhs * rhs).sum()
}
113
/// Largest value in `a`, computed by folding with `f64::max`.
///
/// The fold starts from `f64::NEG_INFINITY`, which is therefore the result
/// for an empty slice.
///
/// # Safety
/// No hardware requirements (portable implementation).
pub unsafe fn max_f64_sve(a: &[f64]) -> f64 {
    a.iter()
        .fold(f64::NEG_INFINITY, |largest, &candidate| largest.max(candidate))
}
119
/// Sum of all elements of `a`, accumulated in slice order.
///
/// # Safety
/// No hardware requirements (portable implementation).
pub unsafe fn sum_f64_sve(a: &[f64]) -> f64 {
    a.iter().copied().sum()
}
125
/// Multiply every element of `y` by `alpha`, in place.
///
/// # Safety
/// No hardware requirements (portable implementation).
pub unsafe fn scale_f64_sve(alpha: f64, y: &mut [f64]) {
    y.iter_mut().for_each(|elem| *elem *= alpha);
}
133
/// Hamming distance between two packed bitstream slices.
///
/// Counts the bit positions at which corresponding words of `a` and `b`
/// differ.  When the slices differ in length, only the common prefix is
/// compared.
///
/// # Safety
/// No hardware requirements (portable implementation).
pub unsafe fn hamming_distance_sve(a: &[u64], b: &[u64]) -> u64 {
    // XOR marks the differing bits of each word pair; their popcounts add up
    // to the distance.  `zip` ends at the shorter input.
    a.iter()
        .zip(b)
        .map(|(lhs, rhs)| u64::from((lhs ^ rhs).count_ones()))
        .sum()
}
141
/// In-place softmax over `scores` (portable fallback for SVE).
///
/// Each element is replaced by `exp(score - max)`, and the slice is then
/// divided by the sum of those exponentials.  An empty slice is left
/// untouched.  The normalisation step is skipped unless the sum compares
/// greater than zero.
///
/// # Safety
/// No hardware requirements (portable implementation).
pub unsafe fn softmax_inplace_f64_sve(scores: &mut [f64]) {
    if scores.is_empty() {
        return;
    }

    // Shift by the maximum before exponentiating so the largest exponent is
    // exp(0) = 1.
    let peak = scores.iter().copied().fold(f64::NEG_INFINITY, f64::max);
    scores
        .iter_mut()
        .for_each(|score| *score = (*score - peak).exp());

    let total: f64 = scores.iter().sum();
    // A NaN total fails this comparison, in which case the exponentials are
    // left unscaled.
    if total > 0.0 {
        let reciprocal = 1.0 / total;
        scores.iter_mut().for_each(|score| *score *= reciprocal);
    }
}
159
#[cfg(test)]
mod tests {
    use super::*;

    /// An all-ones word, a zero word and an alternating-bit word together
    /// hold 64 + 0 + 32 set bits.
    #[test]
    fn sve_popcount_matches_portable() {
        let words = [u64::MAX, 0, 0xAAAA_AAAA_AAAA_AAAA];
        // SAFETY: the function under test documents either no hardware
        // requirement or SVE support, which an SVE-enabled build implies.
        let counted = unsafe { popcount_sve(&words) };
        assert_eq!(counted, 64 + 32);
    }

    /// Softmax output is a probability vector: non-negative, summing to one.
    #[test]
    fn sve_softmax_sums_to_one() {
        let mut probs: Vec<f64> = (0..20).map(|i| (i as f64 * 0.5) - 5.0).collect();
        // SAFETY: documented as having no hardware requirements.
        unsafe { softmax_inplace_f64_sve(&mut probs) };
        let total: f64 = probs.iter().sum();
        assert!((total - 1.0).abs() < 1e-10);
        assert!(probs.iter().all(|&p| p >= 0.0));
    }

    /// Hamming distance equals the popcount of the XOR of the inputs.
    #[test]
    fn sve_hamming_distance() {
        let lhs = [0xFFu64, 0x00];
        let rhs = [0x0Fu64, 0x00];
        let want = u64::from((0xFFu64 ^ 0x0F).count_ones());
        // SAFETY: documented as having no hardware requirements.
        let got = unsafe { hamming_distance_sve(&lhs, &rhs) };
        assert_eq!(got, want);
    }

    /// Fused AND + popcount equals the sum of the per-word AND popcounts.
    #[test]
    fn sve_fused_and_popcount() {
        let lhs = [0xFFu64, 0xF0];
        let rhs = [0x0Fu64, 0xFF];
        let want =
            u64::from((0xFFu64 & 0x0F).count_ones()) + u64::from((0xF0u64 & 0xFF).count_ones());
        // SAFETY: the function under test documents either no hardware
        // requirement or SVE support, which an SVE-enabled build implies.
        let got = unsafe { fused_and_popcount_sve(&lhs, &rhs) };
        assert_eq!(got, want);
    }
}