sc_neurocore_engine/simd/
sve.rs1#[cfg(all(target_arch = "aarch64", target_feature = "sve"))]
24pub unsafe fn pack_sve(bits: &[u8]) -> Vec<u64> {
25 crate::bitstream::pack_fast(bits).data
30}
31
32#[cfg(not(all(target_arch = "aarch64", target_feature = "sve")))]
37pub unsafe fn pack_sve(bits: &[u8]) -> Vec<u64> {
38 crate::bitstream::pack_fast(bits).data
39}
40
41#[cfg(all(target_arch = "aarch64", target_feature = "sve"))]
46pub unsafe fn popcount_sve(data: &[u64]) -> u64 {
47 crate::bitstream::popcount_words_portable(data)
50}
51
52#[cfg(not(all(target_arch = "aarch64", target_feature = "sve")))]
57pub unsafe fn popcount_sve(data: &[u64]) -> u64 {
58 crate::bitstream::popcount_words_portable(data)
59}
60
/// Counts the bits set in `a[i] & b[i]`, summed over all word pairs.
///
/// Only the first `min(a.len(), b.len())` words of each slice take part;
/// any surplus words in the longer slice are ignored. Empty input yields 0.
///
/// The same body is used on every target: the previous SVE / non-SVE `cfg`
/// variants were identical, so a single definition is kept until a dedicated
/// SVE implementation exists.
///
/// # Safety
///
/// The body consists solely of safe slice iteration, so there are no
/// preconditions; the function is `unsafe` only to keep the signature shared
/// by the SIMD backends.
pub unsafe fn fused_and_popcount_sve(a: &[u64], b: &[u64]) -> u64 {
    // `zip` stops at the end of the shorter slice, which gives the same
    // truncation as slicing both inputs to the common length first.
    a.iter()
        .zip(b)
        .map(|(wa, wb)| u64::from((wa & wb).count_ones()))
        .sum()
}
90
/// Counts the bits set in `a[i] ^ b[i]`, summed over all word pairs.
///
/// Only the first `min(a.len(), b.len())` words of each slice take part;
/// any surplus words in the longer slice are ignored. Empty input yields 0.
///
/// # Safety
///
/// The body consists solely of safe slice iteration, so there are no
/// preconditions; the function is `unsafe` only to keep the signature shared
/// by the SIMD backends.
pub unsafe fn fused_xor_popcount_sve(a: &[u64], b: &[u64]) -> u64 {
    // `zip` ends with the shorter slice, so no explicit length clamp is needed.
    a.iter()
        .zip(b)
        .map(|(lhs, rhs)| u64::from((lhs ^ rhs).count_ones()))
        .sum()
}
103
/// Dot product of `a` and `b`.
///
/// Pairs are multiplied and accumulated in slice order. Only the first
/// `min(a.len(), b.len())` elements of each slice take part; surplus
/// elements in the longer slice are ignored.
///
/// # Safety
///
/// The body consists solely of safe slice iteration, so there are no
/// preconditions; the function is `unsafe` only to keep the signature shared
/// by the SIMD backends.
pub unsafe fn dot_f64_sve(a: &[f64], b: &[f64]) -> f64 {
    // `zip` ends with the shorter slice, so no explicit length clamp is needed.
    let products = a.iter().zip(b).map(|(x, y)| x * y);
    products.sum()
}
112
/// Largest element of `a`.
///
/// The running maximum starts at `f64::NEG_INFINITY`, which is therefore the
/// result for an empty slice. Elements are combined with `f64::max`, which
/// returns the other operand when one of them is NaN, so NaN entries do not
/// affect the result.
///
/// # Safety
///
/// The body consists solely of safe slice iteration, so there are no
/// preconditions; the function is `unsafe` only to keep the signature shared
/// by the SIMD backends.
pub unsafe fn max_f64_sve(a: &[f64]) -> f64 {
    let mut best = f64::NEG_INFINITY;
    for &value in a {
        best = best.max(value);
    }
    best
}
118
/// Sum of all elements of `a`, accumulated in slice order.
///
/// # Safety
///
/// The body consists solely of safe slice iteration, so there are no
/// preconditions; the function is `unsafe` only to keep the signature shared
/// by the SIMD backends.
pub unsafe fn sum_f64_sve(a: &[f64]) -> f64 {
    let total: f64 = a.iter().copied().sum();
    total
}
124
/// Multiplies every element of `y` by `alpha`, in place.
///
/// # Safety
///
/// The body consists solely of safe slice iteration, so there are no
/// preconditions; the function is `unsafe` only to keep the signature shared
/// by the SIMD backends.
pub unsafe fn scale_f64_sve(alpha: f64, y: &mut [f64]) {
    y.iter_mut().for_each(|elem| *elem *= alpha);
}
132
133pub unsafe fn hamming_distance_sve(a: &[u64], b: &[u64]) -> u64 {
138 fused_xor_popcount_sve(a, b)
139}
140
141pub unsafe fn softmax_inplace_f64_sve(scores: &mut [f64]) {
146 if scores.is_empty() {
147 return;
148 }
149 let max_val = max_f64_sve(scores);
150 for s in scores.iter_mut() {
151 *s = (*s - max_val).exp();
152 }
153 let exp_sum = sum_f64_sve(scores);
154 if exp_sum > 0.0 {
155 scale_f64_sve(1.0 / exp_sum, scores);
156 }
157}
158
#[cfg(test)]
mod tests {
    use super::*;

    /// All-ones word (64 bits) + zero word + alternating-bit word (32 bits).
    #[test]
    fn sve_popcount_matches_portable() {
        let words = [u64::MAX, 0x0, 0xAAAA_AAAA_AAAA_AAAA];
        let want: u64 = 64 + 32;
        // SAFETY: the function is called with a valid slice.
        let have = unsafe { popcount_sve(&words) };
        assert_eq!(have, want);
    }

    /// Softmax output must be a probability vector: non-negative, summing to 1.
    #[test]
    fn sve_softmax_sums_to_one() {
        // Twenty evenly spaced scores from -5.0 to 4.5.
        let mut probs: Vec<f64> = (0..20).map(|i| i as f64 * 0.5 - 5.0).collect();
        // SAFETY: the function is called with a valid mutable slice.
        unsafe { softmax_inplace_f64_sve(&mut probs) };

        let total: f64 = probs.iter().sum();
        assert!((total - 1.0).abs() < 1e-10);
        assert!(probs.iter().all(|&p| p >= 0.0));
    }

    /// Hamming distance equals the popcount of the XOR of the inputs.
    #[test]
    fn sve_hamming_distance() {
        let lhs = [0xFFu64, 0x00];
        let rhs = [0x0Fu64, 0x00];
        let want = u64::from((0xFFu64 ^ 0x0F).count_ones());
        // SAFETY: the function is called with valid slices.
        let have = unsafe { hamming_distance_sve(&lhs, &rhs) };
        assert_eq!(have, want);
    }

    /// Fused AND + popcount equals the sum of per-word popcounts of `a & b`.
    #[test]
    fn sve_fused_and_popcount() {
        let lhs = [0xFFu64, 0xF0];
        let rhs = [0x0Fu64, 0xFF];
        let want = u64::from((0xFFu64 & 0x0F).count_ones())
            + u64::from((0xF0u64 & 0xFF).count_ones());
        // SAFETY: the function is called with valid slices.
        let have = unsafe { fused_and_popcount_sve(&lhs, &rhs) };
        assert_eq!(have, want);
    }
}
197}