Skip to main content

sc_neurocore_engine/
optimizer.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Commercial license available
3// © Concepts 1996–2026 Miroslav Šotek. All rights reserved.
4// © Code 2020–2026 Miroslav Šotek. All rights reserved.
5// ORCID: 0009-0009-3560-0851
6// Contact: www.anulum.li | protoscience@anulum.li
7// SC-NeuroCore — Rust SC-Optimizer Acceleration
8
9//! Accelerates SC-Optimizer hot paths:
10//! - Simulated annealing design-space search
11//! - Pareto frontier extraction (O(N²) dominance check)
12//! - Resource estimation batch evaluation
13
14use rayon::prelude::*;
15
16// ── Resource estimation ──────────────────────────────────────────────
17
18/// Pre-computed candidate config for one layer.
19#[derive(Clone, Debug)]
20pub struct Candidate {
21    pub bitstream_length: u32,
22    pub decorrelator: u8, // 0=None,1=LFSR,2=Sobol,3=Halton,4=SCC
23    pub mode: u8,         // 0=SC, 1=Deterministic, 2=Hybrid
24    pub luts: i64,
25    pub power: f64,
26    pub accuracy: f64,
27    pub latency: i64,
28}
29
30/// Estimate resources for a single (mac_count, length, decorr, mode) tuple.
31pub fn estimate_resources(mac_count: i64, length: u32, decorr: u8, mode: u8) -> Candidate {
32    let len = length as f64;
33    let log2_len = (len.log2()).floor() as i64;
34
35    if mode == 1 {
36        // Deterministic
37        return Candidate {
38            bitstream_length: length,
39            decorrelator: decorr,
40            mode,
41            luts: mac_count * 120,
42            power: mac_count as f64 * 0.5,
43            accuracy: 1.0,
44            latency: 1,
45        };
46    }
47
48    if mode == 2 {
49        // Hybrid
50        let sc_frac = 0.7_f64;
51        let det_frac = 0.3_f64;
52        let sc_macs = (mac_count as f64 * sc_frac) as i64;
53        let det_macs = (mac_count as f64 * det_frac) as i64;
54        let mut luts = sc_macs * 2 + log2_len * 5 + det_macs * 120;
55        let power = sc_macs as f64 * 0.01 * (len / 256.0) + det_macs as f64 * 0.5;
56        let mut accuracy = 0.95_f64;
57
58        match decorr {
59            2 => {
60                luts += (sc_macs as f64 * 15.0) as i64;
61                accuracy = 0.97;
62            }
63            1 => {
64                luts += 16;
65                accuracy = 0.96;
66            }
67            _ => {}
68        }
69
70        return Candidate {
71            bitstream_length: length,
72            decorrelator: decorr,
73            mode,
74            luts,
75            power,
76            accuracy: accuracy.min(1.0),
77            latency: length as i64,
78        };
79    }
80
81    // SC mode
82    let mut luts = mac_count * 2 + log2_len * 5;
83    let power = mac_count as f64 * 0.01 * (len / 256.0);
84
85    let accuracy = match decorr {
86        2 => {
87            luts += mac_count * 15;
88            1.0 - 1.0 / len
89        } // Sobol
90        3 => {
91            luts += mac_count * 12;
92            1.0 - 1.2 / len
93        } // Halton
94        4 => {
95            luts += mac_count * 8;
96            1.0 - 1.5 / len
97        } // SCC
98        1 => {
99            luts += 16;
100            1.0 - 1.0 / len.sqrt()
101        } // LFSR
102        _ => 1.0 - 2.0 / len.sqrt(), // None
103    };
104
105    Candidate {
106        bitstream_length: length,
107        decorrelator: decorr,
108        mode,
109        luts,
110        power,
111        accuracy: accuracy.clamp(0.1, 1.0),
112        latency: length as i64,
113    }
114}
115
116/// Batch-generate all candidates for a layer.
117pub fn generate_candidates(mac_count: i64) -> Vec<Candidate> {
118    let lengths: &[u32] = &[64, 128, 256, 512, 1024, 2048];
119    let decorrs: &[u8] = &[0, 1, 2, 3, 4];
120    let mut result = Vec::with_capacity(1 + lengths.len() * decorrs.len() * 2);
121
122    // Deterministic
123    result.push(estimate_resources(mac_count, 1, 0, 1));
124
125    // SC + Hybrid
126    for &mode in &[0u8, 2] {
127        for &length in lengths {
128            for &decorr in decorrs {
129                result.push(estimate_resources(mac_count, length, decorr, mode));
130            }
131        }
132    }
133    result
134}
135
136// ── Simulated annealing ──────────────────────────────────────────────
137
138/// PRNG (xoshiro256++)
139struct Xoshiro256pp([u64; 4]);
140
141impl Xoshiro256pp {
142    fn new(seed: u64) -> Self {
143        let mut s = [0u64; 4];
144        let mut z = seed.wrapping_add(0x9E3779B97F4A7C15);
145        for slot in s.iter_mut() {
146            z = (z ^ (z >> 30)).wrapping_mul(0xBF58476D1CE4E5B9);
147            z = (z ^ (z >> 27)).wrapping_mul(0x94D049BB133111EB);
148            *slot = z ^ (z >> 31);
149        }
150        Self(s)
151    }
152
153    fn next_u64(&mut self) -> u64 {
154        let result = (self.0[0].wrapping_add(self.0[3]))
155            .rotate_left(23)
156            .wrapping_add(self.0[0]);
157        let t = self.0[1] << 17;
158        self.0[2] ^= self.0[0];
159        self.0[3] ^= self.0[1];
160        self.0[1] ^= self.0[2];
161        self.0[0] ^= self.0[3];
162        self.0[2] ^= t;
163        self.0[3] = self.0[3].rotate_left(45);
164        result
165    }
166
167    fn next_usize(&mut self, bound: usize) -> usize {
168        (self.next_u64() % bound as u64) as usize
169    }
170
171    fn next_f64(&mut self) -> f64 {
172        (self.next_u64() >> 11) as f64 * (1.0 / (1u64 << 53) as f64)
173    }
174}
175
176/// SA result.
177#[derive(Clone, Debug)]
178pub struct SAResult {
179    pub best_config: Vec<usize>, // index into candidates per layer
180    pub best_score: f64,
181    pub pareto_luts: Vec<i64>,
182    pub pareto_power: Vec<f64>,
183    pub pareto_score: Vec<f64>,
184}
185
186/// Run SA design-space search.
187///
188/// * `candidates` — Vec of candidate lists, one per layer
189/// * `weights` — per-layer scoring weight (2.0 for critical, 1.0 otherwise)
190/// * `max_luts`, `max_power`, `max_latency` — budget constraints
191pub fn simulated_annealing(
192    candidates: &[Vec<Candidate>],
193    weights: &[f64],
194    max_luts: i64,
195    max_power: f64,
196    max_latency: i64,
197    t_init: f64,
198    t_min: f64,
199    alpha: f64,
200    max_iter: usize,
201    seed: u64,
202) -> Option<SAResult> {
203    let n = candidates.len();
204    let mut rng = Xoshiro256pp::new(seed);
205
206    // Start from cheapest
207    let mut current: Vec<usize> = (0..n)
208        .map(|i| {
209            candidates[i]
210                .iter()
211                .enumerate()
212                .min_by_key(|(_, c)| c.luts)
213                .map(|(idx, _)| idx)
214                .unwrap_or(0)
215        })
216        .collect();
217
218    if !is_feasible(candidates, &current, max_luts, max_power, max_latency) {
219        return None;
220    }
221
222    let weight_sum: f64 = weights.iter().sum();
223    let score_fn = |cfg: &[usize]| -> f64 {
224        let mut total = 0.0;
225        for i in 0..n {
226            total += candidates[i][cfg[i]].accuracy * weights[i];
227        }
228        total / weight_sum
229    };
230
231    let mut best = current.clone();
232    let mut best_score = score_fn(&best);
233    let mut current_score = best_score;
234    let mut t = t_init;
235
236    let mut pareto_luts = Vec::new();
237    let mut pareto_power = Vec::new();
238    let mut pareto_score = Vec::new();
239
240    for _ in 0..max_iter {
241        if t <= t_min {
242            break;
243        }
244
245        let layer = rng.next_usize(n);
246        let cand_idx = rng.next_usize(candidates[layer].len());
247
248        let old = current[layer];
249        current[layer] = cand_idx;
250
251        if !is_feasible(candidates, &current, max_luts, max_power, max_latency) {
252            current[layer] = old;
253            t *= alpha;
254            continue;
255        }
256
257        let trial_score = score_fn(&current);
258        let delta = trial_score - current_score;
259
260        if delta > 0.0 || rng.next_f64() < (delta / t).exp() {
261            current_score = trial_score;
262
263            if current_score > best_score {
264                best = current.clone();
265                best_score = current_score;
266            }
267
268            let luts: i64 = (0..n).map(|i| candidates[i][current[i]].luts).sum();
269            let power: f64 = (0..n).map(|i| candidates[i][current[i]].power).sum();
270            pareto_luts.push(luts);
271            pareto_power.push(power);
272            pareto_score.push(current_score);
273        } else {
274            current[layer] = old;
275        }
276
277        t *= alpha;
278    }
279
280    Some(SAResult {
281        best_config: best,
282        best_score,
283        pareto_luts,
284        pareto_power,
285        pareto_score,
286    })
287}
288
289fn is_feasible(
290    candidates: &[Vec<Candidate>],
291    config: &[usize],
292    max_luts: i64,
293    max_power: f64,
294    max_latency: i64,
295) -> bool {
296    let n = candidates.len();
297    let mut total_luts: i64 = 0;
298    let mut total_power: f64 = 0.0;
299    let mut max_lat: i64 = 0;
300    for i in 0..n {
301        let c = &candidates[i][config[i]];
302        total_luts += c.luts;
303        total_power += c.power;
304        if c.latency > max_lat {
305            max_lat = c.latency;
306        }
307    }
308    total_luts <= max_luts
309        && total_power <= max_power
310        && (max_latency <= 0 || max_lat <= max_latency)
311}
312
313// ── Pareto extraction ────────────────────────────────────────────────
314
315/// Extract non-dominated Pareto frontier (parallelized for large N).
316pub fn extract_pareto(luts: &[i64], power: &[f64], score: &[f64]) -> Vec<usize> {
317    let n = luts.len();
318    if n == 0 {
319        return vec![];
320    }
321
322    let indices: Vec<usize> = (0..n)
323        .into_par_iter()
324        .filter(|&i| {
325            for j in 0..n {
326                if j == i {
327                    continue;
328                }
329                if luts[j] <= luts[i]
330                    && power[j] <= power[i]
331                    && score[j] >= score[i]
332                    && (luts[j] < luts[i] || power[j] < power[i] || score[j] > score[i])
333                {
334                    return false;
335                }
336            }
337            true
338        })
339        .collect();
340
341    indices
342}
343
344// ── Tests ────────────────────────────────────────────────────────────
345
346#[cfg(test)]
347mod tests {
348    use super::*;
349
350    #[test]
351    fn test_estimate_deterministic() {
352        let c = estimate_resources(100, 1, 0, 1);
353        assert_eq!(c.luts, 12000);
354        assert!((c.accuracy - 1.0).abs() < 1e-10);
355        assert_eq!(c.latency, 1);
356    }
357
358    #[test]
359    fn test_estimate_sc_sobol() {
360        let c = estimate_resources(10, 256, 2, 0);
361        assert!(c.luts > 0);
362        assert!(c.accuracy > 0.99);
363    }
364
365    #[test]
366    fn test_estimate_hybrid() {
367        let c = estimate_resources(10, 256, 2, 2);
368        assert!((c.accuracy - 0.97).abs() < 1e-10);
369    }
370
371    #[test]
372    fn test_generate_candidates_count() {
373        let cands = generate_candidates(10);
374        // 1 deterministic + 6 lengths × 5 decorrs × 2 modes = 61
375        assert_eq!(cands.len(), 61);
376    }
377
378    #[test]
379    fn test_sa_finds_solution() {
380        let cands: Vec<Vec<Candidate>> = (0..3).map(|_| generate_candidates(10)).collect();
381        let weights = vec![1.0, 1.0, 2.0];
382        let result = simulated_annealing(
383            &cands, &weights, 100_000, 1000.0, 0, 1.0, 0.001, 0.95, 500, 42,
384        );
385        assert!(result.is_some());
386        let r = result.unwrap();
387        assert!(r.best_score > 0.5);
388    }
389
390    #[test]
391    fn test_sa_infeasible() {
392        let cands: Vec<Vec<Candidate>> = (0..3).map(|_| generate_candidates(10)).collect();
393        let weights = vec![1.0, 1.0, 1.0];
394        // Budget too tight for even cheapest
395        let result = simulated_annealing(&cands, &weights, 1, 0.001, 0, 1.0, 0.001, 0.95, 100, 42);
396        assert!(result.is_none());
397    }
398
399    #[test]
400    fn test_pareto_single_point() {
401        let indices = extract_pareto(&[100], &[1.0], &[0.9]);
402        assert_eq!(indices, vec![0]);
403    }
404
405    #[test]
406    fn test_pareto_dominated() {
407        // Point 1 dominates point 0 (lower luts+power, higher score)
408        let indices = extract_pareto(&[200, 100], &[2.0, 1.0], &[0.8, 0.9]);
409        assert_eq!(indices, vec![1]);
410    }
411
412    #[test]
413    fn test_pareto_both_nondominated() {
414        // Neither dominates: lower luts but lower score
415        let indices = extract_pareto(&[100, 200], &[1.0, 2.0], &[0.8, 0.95]);
416        assert_eq!(indices.len(), 2);
417    }
418
419    #[test]
420    fn test_xoshiro_reproducible() {
421        let mut rng1 = Xoshiro256pp::new(42);
422        let mut rng2 = Xoshiro256pp::new(42);
423        for _ in 0..100 {
424            assert_eq!(rng1.next_u64(), rng2.next_u64());
425        }
426    }
427}