sc_neurocore_engine/simd/
/// Returns the packed `u64` words for `bits` on AArch64 builds that enable the
/// `sve` target feature.
///
/// The body contains no SVE-specific code: it forwards to
/// `crate::bitstream::pack_fast` and hands back the `data` field of its result.
/// NOTE(review): presumably each input byte carries one bit — confirm against
/// `pack_fast`.
///
/// # Safety
///
/// Nothing in this body performs an unsafe operation; the `unsafe` qualifier is
/// part of the public signature, so callers still need an `unsafe` block.
#[cfg(all(target_arch = "aarch64", target_feature = "sve"))]
pub unsafe fn pack_sve(bits: &[u8]) -> Vec<u64> {
    use crate::bitstream::pack_fast;

    pack_fast(bits).data
}
32
33#[cfg(not(all(target_arch = "aarch64", target_feature = "sve")))]
38pub unsafe fn pack_sve(bits: &[u8]) -> Vec<u64> {
39 crate::bitstream::pack_fast(bits).data
40}
41
/// Population count over `data` on AArch64 builds that enable the `sve` target
/// feature.
///
/// The body contains no SVE-specific code: it returns whatever
/// `crate::bitstream::popcount_words_portable` computes for `data` (the in-file
/// test expects the total number of set bits across all words).
///
/// # Safety
///
/// Nothing in this body performs an unsafe operation; the `unsafe` qualifier is
/// part of the public signature, so callers still need an `unsafe` block.
#[cfg(all(target_arch = "aarch64", target_feature = "sve"))]
pub unsafe fn popcount_sve(data: &[u64]) -> u64 {
    use crate::bitstream::popcount_words_portable;

    popcount_words_portable(data)
}
52
53#[cfg(not(all(target_arch = "aarch64", target_feature = "sve")))]
58pub unsafe fn popcount_sve(data: &[u64]) -> u64 {
59 crate::bitstream::popcount_words_portable(data)
60}
61
/// Counts the bits that are set in both `a` and `b`, i.e. the population count
/// of the word-wise AND, on AArch64 builds that enable the `sve` target feature.
///
/// The body is plain iterator code with no SVE intrinsics. Words are paired by
/// position; if the slices differ in length, the surplus words of the longer
/// one are ignored.
///
/// # Safety
///
/// Nothing in this body performs an unsafe operation; the `unsafe` qualifier is
/// part of the public signature, so callers still need an `unsafe` block.
#[cfg(all(target_arch = "aarch64", target_feature = "sve"))]
pub unsafe fn fused_and_popcount_sve(a: &[u64], b: &[u64]) -> u64 {
    let mut shared_bits = 0u64;
    // `zip` stops at the end of the shorter slice, which gives the same
    // pairing as truncating both inputs to the common length first.
    for (&lhs, &rhs) in a.iter().zip(b.iter()) {
        shared_bits += u64::from((lhs & rhs).count_ones());
    }
    shared_bits
}
77
/// Counts the bits that are set in both `a` and `b`, i.e. the population count
/// of the word-wise AND, on every target that is *not* AArch64 with the `sve`
/// target feature.
///
/// Words are paired by position; if the slices differ in length, the surplus
/// words of the longer one are ignored.
///
/// # Safety
///
/// Nothing in this body performs an unsafe operation; the `unsafe` qualifier is
/// part of the public signature, so callers still need an `unsafe` block.
#[cfg(not(all(target_arch = "aarch64", target_feature = "sve")))]
pub unsafe fn fused_and_popcount_sve(a: &[u64], b: &[u64]) -> u64 {
    let mut shared_bits = 0u64;
    // `zip` stops at the end of the shorter slice, which gives the same
    // pairing as truncating both inputs to the common length first.
    for (&lhs, &rhs) in a.iter().zip(b.iter()) {
        shared_bits += u64::from((lhs & rhs).count_ones());
    }
    shared_bits
}
91
/// Counts the bit positions at which `a` and `b` differ, i.e. the population
/// count of the word-wise XOR.
///
/// Words are paired by position; if the slices differ in length, the surplus
/// words of the longer one are ignored. The body is plain iterator code with
/// no SVE intrinsics.
///
/// # Safety
///
/// Nothing in this body performs an unsafe operation; the `unsafe` qualifier is
/// part of the public signature, so callers still need an `unsafe` block.
pub unsafe fn fused_xor_popcount_sve(a: &[u64], b: &[u64]) -> u64 {
    // `zip` ends with the shorter input, matching a truncation of both slices
    // to their common length.
    a.iter().zip(b).fold(0u64, |differing, (&lhs, &rhs)| {
        differing + u64::from((lhs ^ rhs).count_ones())
    })
}
104
/// Dot product of `a` and `b`: the sum of the element-wise products.
///
/// Elements are paired by position and accumulated front to back; if the
/// slices differ in length, the surplus elements of the longer one are
/// ignored. The body is plain iterator code with no SVE intrinsics.
///
/// # Safety
///
/// Nothing in this body performs an unsafe operation; the `unsafe` qualifier is
/// part of the public signature, so callers still need an `unsafe` block.
pub unsafe fn dot_f64_sve(a: &[f64], b: &[f64]) -> f64 {
    // `zip` ends with the shorter input, so no explicit truncation is needed.
    let products = a.iter().zip(b.iter()).map(|(x, y)| x * y);
    products.sum()
}
113
/// Largest value in `a`, combined with `f64::max` starting from negative
/// infinity.
///
/// An empty slice therefore yields `f64::NEG_INFINITY`. Because `f64::max`
/// returns the other operand when one side is NaN, NaN elements do not
/// influence the result. The body contains no SVE intrinsics.
///
/// # Safety
///
/// Nothing in this body performs an unsafe operation; the `unsafe` qualifier is
/// part of the public signature, so callers still need an `unsafe` block.
pub unsafe fn max_f64_sve(a: &[f64]) -> f64 {
    let mut largest = f64::NEG_INFINITY;
    for &value in a {
        largest = largest.max(value);
    }
    largest
}
119
/// Sum of all elements of `a`, accumulated front to back via `Iterator::sum`.
///
/// An empty slice sums to zero. The body contains no SVE intrinsics.
///
/// # Safety
///
/// Nothing in this body performs an unsafe operation; the `unsafe` qualifier is
/// part of the public signature, so callers still need an `unsafe` block.
pub unsafe fn sum_f64_sve(a: &[f64]) -> f64 {
    a.iter().copied().sum::<f64>()
}
125
/// Multiplies every element of `y` by `alpha` in place.
///
/// An empty slice is left untouched. The body contains no SVE intrinsics.
///
/// # Safety
///
/// Nothing in this body performs an unsafe operation; the `unsafe` qualifier is
/// part of the public signature, so callers still need an `unsafe` block.
pub unsafe fn scale_f64_sve(alpha: f64, y: &mut [f64]) {
    y.iter_mut().for_each(|element| *element *= alpha);
}
133
134pub unsafe fn hamming_distance_sve(a: &[u64], b: &[u64]) -> u64 {
139 fused_xor_popcount_sve(a, b)
140}
141
142pub unsafe fn softmax_inplace_f64_sve(scores: &mut [f64]) {
147 if scores.is_empty() {
148 return;
149 }
150 let max_val = max_f64_sve(scores);
151 for s in scores.iter_mut() {
152 *s = (*s - max_val).exp();
153 }
154 let exp_sum = sum_f64_sve(scores);
155 if exp_sum > 0.0 {
156 scale_f64_sve(1.0 / exp_sum, scores);
157 }
158}
159
// Unit tests for the `*_sve` entry points. Every function under test is an
// `unsafe fn` by signature, so each call sits in an `unsafe` block; the calls
// only pass ordinary borrowed slices.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn sve_popcount_matches_portable() {
        // All ones (64 bits) + zero (0 bits) + alternating pattern (32 bits).
        let data = [0xFFFF_FFFF_FFFF_FFFFu64, 0x0, 0xAAAA_AAAA_AAAA_AAAA];
        let expected = 64 + 32;
        let got = unsafe { popcount_sve(&data) };
        assert_eq!(got, expected);
    }

    #[test]
    fn sve_softmax_sums_to_one() {
        let mut scores: Vec<f64> = (0..20).map(|i| (i as f64 * 0.5) - 5.0).collect();
        unsafe { softmax_inplace_f64_sve(&mut scores) };
        let sum: f64 = scores.iter().sum();
        assert!((sum - 1.0).abs() < 1e-10);
        assert!(scores.iter().all(|&s| s >= 0.0));
    }

    #[test]
    fn sve_softmax_empty_is_noop() {
        let mut scores: [f64; 0] = [];
        unsafe { softmax_inplace_f64_sve(&mut scores) };
        assert!(scores.is_empty());
    }

    #[test]
    fn sve_softmax_equal_scores_are_uniform() {
        // Equal inputs give exp(0) = 1 each, so every output is exactly 1/2.
        let mut scores = [2.0, 2.0];
        unsafe { softmax_inplace_f64_sve(&mut scores) };
        assert_eq!(scores, [0.5, 0.5]);
    }

    #[test]
    fn sve_hamming_distance() {
        let a = [0xFFu64, 0x00];
        let b = [0x0Fu64, 0x00];
        let expected = (0xFFu64 ^ 0x0F).count_ones() as u64;
        let got = unsafe { hamming_distance_sve(&a, &b) };
        assert_eq!(got, expected);
    }

    #[test]
    fn sve_fused_and_popcount() {
        let a = [0xFFu64, 0xF0];
        let b = [0x0Fu64, 0xFF];
        let expected = (0xFFu64 & 0x0F).count_ones() as u64 + (0xF0u64 & 0xFF).count_ones() as u64;
        let got = unsafe { fused_and_popcount_sve(&a, &b) };
        assert_eq!(got, expected);
    }

    #[test]
    fn sve_fused_and_popcount_ignores_surplus_words() {
        // Only the first word of `a` has a partner in `b`.
        let a = [0xFFu64, 0xFF];
        let b = [0x0Fu64];
        assert_eq!(unsafe { fused_and_popcount_sve(&a, &b) }, 4);
    }

    #[test]
    fn sve_fused_xor_popcount() {
        // 0xFF ^ 0x0F has 4 bits set; u64::MAX ^ 0 has 64; the third word of
        // `a` has no partner and is ignored.
        let a = [0xFFu64, u64::MAX, 0x1];
        let b = [0x0Fu64, 0x0];
        assert_eq!(unsafe { fused_xor_popcount_sve(&a, &b) }, 68);
    }

    #[test]
    fn sve_dot_product() {
        assert_eq!(unsafe { dot_f64_sve(&[1.0, 2.0, 3.0], &[4.0, 5.0, 6.0]) }, 32.0);
        // Length mismatch: only the first pair contributes.
        assert_eq!(unsafe { dot_f64_sve(&[1.0, 2.0], &[3.0]) }, 3.0);
    }

    #[test]
    fn sve_max_and_sum() {
        let values = [1.0, -2.0, 3.5];
        assert_eq!(unsafe { max_f64_sve(&values) }, 3.5);
        assert_eq!(unsafe { max_f64_sve(&[]) }, f64::NEG_INFINITY);
        assert_eq!(unsafe { sum_f64_sve(&values) }, 2.5);
    }

    #[test]
    fn sve_scale_in_place() {
        let mut y = [1.0, -2.0, 0.5];
        unsafe { scale_f64_sve(2.0, &mut y) };
        assert_eq!(y, [2.0, -4.0, 1.0]);
    }
}