Skip to main content

sc_neurocore_engine/ir/
sv_target.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Commercial license available
3// Copyright (C) 2020-2026 Miroslav Sotek. All rights reserved.
4// ORCID: 0009-0009-3560-0851
5// Contact: www.anulum.li | protoscience@anulum.li
6// SC-NeuroCore - SystemVerilog target metadata
7
8//! Target metadata and resource estimates for SystemVerilog emission.
9
10use crate::ir::graph::{ScConst, ScGraph, ScOp};
11
/// Zynq UltraScale+ MPSoC device SKUs known to the first target lane.
///
/// Each variant selects one row of the per-device budget table exposed by the
/// `SkuKind` accessor methods (part string, LUT/FF/DSP/BRAM/URAM budgets).
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SkuKind {
    /// ZU3EG device.
    Zu3eg,
    /// ZU9EG device.
    Zu9eg,
}
18
19/// SystemVerilog emission target.
20#[derive(Debug, Clone, PartialEq, Eq)]
21pub enum SvTarget {
22    Generic,
23    Zynq7 {
24        device: String,
25        clock_mhz: u32,
26    },
27    ZynqUltraScalePlus {
28        sku: SkuKind,
29        clock_mhz: u32,
30        dsp_budget: u32,
31        bram_36k_budget: u32,
32        uram_budget: u32,
33        prefer_uram_over_bram_above_bits: u64,
34    },
35}
36
37/// Conservative compiler-side resource estimate.
38#[derive(Debug, Clone, PartialEq)]
39pub struct ResourceReport {
40    pub target_name: String,
41    pub device_part: String,
42    pub clock_mhz: u32,
43    pub lut_estimated: u32,
44    pub ff_estimated: u32,
45    pub dsp_estimated: u32,
46    pub bram_36k_estimated: u32,
47    pub uram_estimated: u32,
48    pub critical_path_estimate_ns: f64,
49    pub dsp_budget: u32,
50    pub bram_36k_budget: u32,
51    pub uram_budget: u32,
52    pub fits_dsp_budget: bool,
53    pub fits_bram_budget: bool,
54    pub fits_uram_budget: bool,
55    pub dense_fold_plan: Option<DenseFoldPlan>,
56}
57
/// Deterministic time-multiplexing schedule for a dense layer whose
/// one-DSP-per-MAC footprint exceeds the target's DSP budget.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct DenseFoldPlan {
    /// Number of inputs of the dense layer.
    pub n_inputs: usize,
    /// Number of outputs of the dense layer.
    pub n_outputs: usize,
    /// `n_inputs * n_outputs`, saturated to `u32::MAX`.
    pub mac_count: u32,
    /// DSP budget the plan was computed against.
    pub dsp_budget: u32,
    /// Number of outputs handled per pass (0 for a degenerate plan).
    pub output_parallelism: u32,
    /// Number of inputs handled per pass (0 for a degenerate plan).
    pub input_parallelism: u32,
    /// `output_parallelism * input_parallelism`.
    pub dsp_per_cycle: u32,
    /// `ceil(n_inputs / input_parallelism)` (0 for a degenerate plan).
    pub input_fold_factor: u32,
    /// `ceil(n_outputs / output_parallelism)` (0 for a degenerate plan).
    pub output_fold_factor: u32,
    /// `input_fold_factor * output_fold_factor`.
    pub compute_cycles: u32,
    /// `true` when `mac_count` exceeds `dsp_budget`.
    pub fold_required: bool,
    /// `true` when the folded schedule stays within `dsp_budget`.
    pub fits_dsp_budget: bool,
}
75
76impl SkuKind {
77    pub fn as_str(self) -> &'static str {
78        match self {
79            Self::Zu3eg => "ZU3EG",
80            Self::Zu9eg => "ZU9EG",
81        }
82    }
83
84    pub fn device_part(self) -> &'static str {
85        match self {
86            Self::Zu3eg => "xczu3eg-sbva484-1-e",
87            Self::Zu9eg => "xczu9eg-ffvb1156-2-e",
88        }
89    }
90
91    pub fn lut_budget(self) -> u32 {
92        match self {
93            Self::Zu3eg => 70_560,
94            Self::Zu9eg => 274_080,
95        }
96    }
97
98    pub fn ff_budget(self) -> u32 {
99        match self {
100            Self::Zu3eg => 141_120,
101            Self::Zu9eg => 548_160,
102        }
103    }
104
105    pub fn dsp_budget(self) -> u32 {
106        match self {
107            Self::Zu3eg => 360,
108            Self::Zu9eg => 2_520,
109        }
110    }
111
112    pub fn bram_36k_budget(self) -> u32 {
113        match self {
114            Self::Zu3eg => 216,
115            Self::Zu9eg => 912,
116        }
117    }
118
119    pub fn uram_budget(self) -> u32 {
120        0
121    }
122}
123
124impl SvTarget {
125    pub fn zynq_ultrascale_plus(sku: SkuKind, clock_mhz: u32) -> Self {
126        Self::ZynqUltraScalePlus {
127            sku,
128            clock_mhz,
129            dsp_budget: sku.dsp_budget(),
130            bram_36k_budget: sku.bram_36k_budget(),
131            uram_budget: sku.uram_budget(),
132            prefer_uram_over_bram_above_bits: 1_u64 << 20,
133        }
134    }
135
136    pub fn target_name(&self) -> String {
137        match self {
138            Self::Generic => "generic".to_string(),
139            Self::Zynq7 { device, .. } => format!("zynq7:{device}"),
140            Self::ZynqUltraScalePlus { sku, .. } => {
141                format!("zynq-ultrascale-plus:{}", sku.as_str())
142            }
143        }
144    }
145
146    pub fn clock_mhz(&self) -> u32 {
147        match self {
148            Self::Generic => 100,
149            Self::Zynq7 { clock_mhz, .. } | Self::ZynqUltraScalePlus { clock_mhz, .. } => {
150                *clock_mhz
151            }
152        }
153    }
154
155    pub fn device_part(&self) -> String {
156        match self {
157            Self::Generic => "generic".to_string(),
158            Self::Zynq7 { device, .. } => device.clone(),
159            Self::ZynqUltraScalePlus { sku, .. } => sku.device_part().to_string(),
160        }
161    }
162
163    pub fn dsp_primitive(&self) -> &'static str {
164        match self {
165            Self::ZynqUltraScalePlus { .. } => "DSP48E2",
166            Self::Generic | Self::Zynq7 { .. } => "generic",
167        }
168    }
169
170    pub fn dsp_attribute(&self) -> Option<&'static str> {
171        match self {
172            Self::ZynqUltraScalePlus { .. } => Some(
173                "(* use_dsp = \"yes\", sc_target_dsp = \"DSP48E2\", sc_target_family = \"zynq_ultrascale_plus\" *)",
174            ),
175            Self::Generic | Self::Zynq7 { .. } => None,
176        }
177    }
178
179    pub fn ram_style_for_bits(&self, bits: u64) -> Option<&'static str> {
180        match self {
181            Self::ZynqUltraScalePlus {
182                uram_budget,
183                prefer_uram_over_bram_above_bits,
184                ..
185            } if *uram_budget > 0 && bits >= *prefer_uram_over_bram_above_bits => Some("ultra"),
186            Self::ZynqUltraScalePlus { .. } if bits >= 1_024 => Some("block"),
187            Self::ZynqUltraScalePlus { .. } => Some("distributed"),
188            Self::Generic | Self::Zynq7 { .. } => None,
189        }
190    }
191
192    pub fn dense_fold_plan(&self, n_inputs: usize, n_outputs: usize) -> Option<DenseFoldPlan> {
193        let dsp_budget = match self {
194            Self::ZynqUltraScalePlus { dsp_budget, .. } => *dsp_budget,
195            Self::Generic | Self::Zynq7 { .. } => return None,
196        };
197        Some(plan_dense_fold(n_inputs, n_outputs, dsp_budget))
198    }
199
200    pub fn header_comment(&self) -> String {
201        match self {
202            Self::Generic => String::new(),
203            Self::Zynq7 { device, clock_mhz } => format!(
204                "// Target: Zynq-7 device={device}, clock={clock_mhz} MHz\n\n"
205            ),
206            Self::ZynqUltraScalePlus { sku, clock_mhz, .. } => format!(
207                "// Target: Zynq UltraScale+ MPSoC {}, part={}, clock={} MHz\n// DSP primitive: DSP48E2\n\n",
208                sku.as_str(),
209                sku.device_part(),
210                clock_mhz
211            ),
212        }
213    }
214
215    pub fn estimate_graph(&self, graph: &ScGraph) -> ResourceReport {
216        let mut lut_estimated = 128_u32;
217        let mut ff_estimated = 128_u32;
218        let mut dsp_estimated = 0_u32;
219        let mut bram_bits = 0_u64;
220        let uram_bits = 0_u64;
221        let mut critical_path_estimate_ns = 2.5_f64;
222        let mut dense_fold_plan: Option<DenseFoldPlan> = None;
223
224        for op in &graph.ops {
225            match op {
226                ScOp::DenseForward { params, .. } => {
227                    let macs = saturating_u32(params.n_inputs.saturating_mul(params.n_neurons));
228                    dsp_estimated = dsp_estimated.saturating_add(macs);
229                    if let Some(plan) = self.dense_fold_plan(params.n_inputs, params.n_neurons) {
230                        if plan.fold_required {
231                            dense_fold_plan = Some(plan);
232                        }
233                    }
234                    lut_estimated = lut_estimated.saturating_add(220).saturating_add(macs * 6);
235                    ff_estimated = ff_estimated.saturating_add(180).saturating_add(macs * 4);
236                    bram_bits = bram_bits.saturating_add(
237                        (params.n_inputs as u64)
238                            .saturating_mul(params.n_neurons as u64)
239                            .saturating_mul(params.data_width as u64),
240                    );
241                    critical_path_estimate_ns = critical_path_estimate_ns.max(4.0);
242                }
243                ScOp::DclsLayer { params, .. } => {
244                    let taps = saturating_u32(params.n_taps);
245                    dsp_estimated = dsp_estimated.saturating_add(taps);
246                    lut_estimated = lut_estimated.saturating_add(320).saturating_add(taps * 48);
247                    ff_estimated = ff_estimated.saturating_add(220).saturating_add(taps * 32);
248                    bram_bits = bram_bits.saturating_add(
249                        (params.delay_depth as u64).saturating_mul(params.n_taps as u64),
250                    );
251                    critical_path_estimate_ns = critical_path_estimate_ns.max(4.5);
252                }
253                ScOp::LifStep { .. } => {
254                    dsp_estimated = dsp_estimated.saturating_add(2);
255                    lut_estimated = lut_estimated.saturating_add(180);
256                    ff_estimated = ff_estimated.saturating_add(96);
257                    critical_path_estimate_ns = critical_path_estimate_ns.max(3.2);
258                }
259                ScOp::KuramotoStep { .. } => {
260                    dsp_estimated = dsp_estimated.saturating_add(4);
261                    lut_estimated = lut_estimated.saturating_add(512);
262                    ff_estimated = ff_estimated.saturating_add(256);
263                    critical_path_estimate_ns = critical_path_estimate_ns.max(5.0);
264                }
265                ScOp::Constant { value, .. } => {
266                    bram_bits = bram_bits.saturating_add(constant_bits(value));
267                }
268                ScOp::Input { .. }
269                | ScOp::Output { .. }
270                | ScOp::Encode { .. }
271                | ScOp::BitwiseAnd { .. }
272                | ScOp::Popcount { .. }
273                | ScOp::BitwiseXor { .. }
274                | ScOp::Reduce { .. }
275                | ScOp::GraphForward { .. }
276                | ScOp::SoftmaxAttention { .. }
277                | ScOp::Scale { .. }
278                | ScOp::Offset { .. }
279                | ScOp::DivConst { .. } => {
280                    lut_estimated = lut_estimated.saturating_add(16);
281                    ff_estimated = ff_estimated.saturating_add(8);
282                }
283            }
284        }
285
286        let bram_36k_estimated = ceil_div_u64(bram_bits, 36_864).min(u64::from(u32::MAX)) as u32;
287        let uram_estimated = ceil_div_u64(uram_bits, 294_912).min(u64::from(u32::MAX)) as u32;
288        let (dsp_budget, bram_budget, uram_budget) = match self {
289            Self::ZynqUltraScalePlus {
290                dsp_budget,
291                bram_36k_budget,
292                uram_budget,
293                ..
294            } => (*dsp_budget, *bram_36k_budget, *uram_budget),
295            Self::Zynq7 { .. } | Self::Generic => (u32::MAX, u32::MAX, u32::MAX),
296        };
297
298        ResourceReport {
299            target_name: self.target_name(),
300            device_part: self.device_part(),
301            clock_mhz: self.clock_mhz(),
302            lut_estimated,
303            ff_estimated,
304            dsp_estimated,
305            bram_36k_estimated,
306            uram_estimated,
307            critical_path_estimate_ns,
308            dsp_budget,
309            bram_36k_budget: bram_budget,
310            uram_budget,
311            fits_dsp_budget: dsp_estimated <= dsp_budget,
312            fits_bram_budget: bram_36k_estimated <= bram_budget,
313            fits_uram_budget: uram_estimated <= uram_budget,
314            dense_fold_plan,
315        }
316    }
317}
318
319fn plan_dense_fold(n_inputs: usize, n_outputs: usize, dsp_budget: u32) -> DenseFoldPlan {
320    let mac_count = saturating_u32(n_inputs.saturating_mul(n_outputs));
321    if n_inputs == 0 || n_outputs == 0 || dsp_budget == 0 {
322        return DenseFoldPlan {
323            n_inputs,
324            n_outputs,
325            mac_count,
326            dsp_budget,
327            output_parallelism: 0,
328            input_parallelism: 0,
329            dsp_per_cycle: 0,
330            input_fold_factor: 0,
331            output_fold_factor: 0,
332            compute_cycles: 0,
333            fold_required: mac_count > dsp_budget,
334            fits_dsp_budget: dsp_budget >= 1,
335        };
336    }
337
338    let n_inputs_u32 = saturating_u32(n_inputs).max(1);
339    let n_outputs_u32 = saturating_u32(n_outputs).max(1);
340    let output_parallelism = if dsp_budget >= n_inputs_u32 {
341        (dsp_budget / n_inputs_u32).clamp(1, n_outputs_u32)
342    } else {
343        1
344    };
345    let input_parallelism = (dsp_budget / output_parallelism).clamp(1, n_inputs_u32);
346    let dsp_per_cycle = output_parallelism.saturating_mul(input_parallelism);
347    let input_fold_factor = ceil_div_u64(n_inputs_u32 as u64, input_parallelism as u64) as u32;
348    let output_fold_factor = ceil_div_u64(n_outputs_u32 as u64, output_parallelism as u64) as u32;
349    let compute_cycles = input_fold_factor.saturating_mul(output_fold_factor);
350    DenseFoldPlan {
351        n_inputs,
352        n_outputs,
353        mac_count,
354        dsp_budget,
355        output_parallelism,
356        input_parallelism,
357        dsp_per_cycle,
358        input_fold_factor,
359        output_fold_factor,
360        compute_cycles,
361        fold_required: mac_count > dsp_budget,
362        fits_dsp_budget: dsp_per_cycle <= dsp_budget,
363    }
364}
365
366fn constant_bits(value: &ScConst) -> u64 {
367    match value {
368        ScConst::F64(_) | ScConst::I64(_) => 16,
369        ScConst::U64(_) => 32,
370        ScConst::F64Vec(values) => values.len() as u64 * 16,
371        ScConst::I64Vec(values) => values.len() as u64 * 16,
372    }
373}
374
/// Divides `value` by `divisor`, rounding up.
///
/// A zero `value` yields zero without touching `divisor`; subtracting one
/// before dividing keeps the computation free of overflow for any non-zero
/// `value`.
fn ceil_div_u64(value: u64, divisor: u64) -> u64 {
    match value.checked_sub(1) {
        None => 0,
        Some(predecessor) => predecessor / divisor + 1,
    }
}
382
/// Narrows a `usize` to `u32`, clamping values that do not fit to `u32::MAX`.
fn saturating_u32(value: usize) -> u32 {
    u32::try_from(value).unwrap_or(u32::MAX)
}
386
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ir::builder::ScGraphBuilder;
    use crate::ir::graph::{DenseParams, ScConst, ScType};

    /// 16-bit fixed-point type with 8 fractional bits, used by every tensor
    /// in these tests.
    fn fixed_q8_8() -> ScType {
        ScType::FixedPoint { width: 16, frac: 8 }
    }

    /// ZU3EG target at 250 MHz, the device exercised by these tests.
    fn zu3eg_250mhz() -> SvTarget {
        SvTarget::zynq_ultrascale_plus(SkuKind::Zu3eg, 250)
    }

    #[test]
    fn ultrascale_plus_sku_budgets_match_target_baseline_table() {
        let zu3 = SkuKind::Zu3eg;
        assert_eq!(zu3.dsp_budget(), 360);
        assert_eq!(zu3.bram_36k_budget(), 216);
        assert_eq!(zu3.uram_budget(), 0);

        let zu9 = SkuKind::Zu9eg;
        assert_eq!(zu9.dsp_budget(), 2_520);
        assert_eq!(zu9.bram_36k_budget(), 912);
        assert_eq!(zu9.uram_budget(), 0);
    }

    #[test]
    fn ultrascale_plus_uses_device_family_dsp_primitive() {
        let target = zu3eg_250mhz();
        assert_eq!(target.dsp_primitive(), "DSP48E2");

        let attribute = target.dsp_attribute().unwrap();
        assert!(attribute.contains("DSP48E2"));
        // The forbidden primitive name is assembled at runtime so it never
        // appears verbatim in this file.
        assert!(!attribute.contains(&format!("DSP{}", 58)));
    }

    #[test]
    fn ultrascale_plus_resource_report_tracks_dense_mac_budget() {
        let mut builder = ScGraphBuilder::new("resource_dense");
        let inputs = builder.input(
            "inputs",
            ScType::Vec {
                element: Box::new(fixed_q8_8()),
                count: 4,
            },
        );
        let weights = builder.constant(
            ScConst::I64Vec(vec![128; 12]),
            ScType::Vec {
                element: Box::new(fixed_q8_8()),
                count: 12,
            },
        );
        let leak = builder.constant(ScConst::I64(16), fixed_q8_8());
        let gain = builder.constant(ScConst::I64(1), fixed_q8_8());
        let layer = DenseParams {
            n_inputs: 4,
            n_neurons: 3,
            ..DenseParams::default()
        };
        let dense = builder.dense_forward(inputs, weights, leak, gain, layer);
        builder.output("spikes", dense);
        let graph = builder.build();

        let report = zu3eg_250mhz().estimate_graph(&graph);
        assert!(report.dsp_estimated >= 12);
        assert!(report.bram_36k_estimated <= report.bram_36k_budget);
        assert!(report.fits_dsp_budget);
        assert_eq!(report.device_part, "xczu3eg-sbva484-1-e");
    }

    #[test]
    fn dense_fold_plan_maps_shd_scale_dense_into_zu3eg_budget() {
        let plan = zu3eg_250mhz()
            .dense_fold_plan(64, 32)
            .expect("UltraScale+ target must expose dense folding");

        assert_eq!(plan.mac_count, 2_048);
        assert_eq!(plan.dsp_budget, 360);
        assert_eq!(plan.output_parallelism, 5);
        assert_eq!(plan.input_parallelism, 64);
        assert_eq!(plan.dsp_per_cycle, 320);
        assert_eq!(plan.output_fold_factor, 7);
        assert_eq!(plan.input_fold_factor, 1);
        assert_eq!(plan.compute_cycles, 7);
        assert!(plan.fold_required);
        assert!(plan.fits_dsp_budget);
    }

    #[test]
    fn resource_report_carries_dense_fold_plan_when_unfurled_budget_fails() {
        let mut builder = ScGraphBuilder::new("folded_resource_dense");
        let inputs = builder.input(
            "inputs",
            ScType::Vec {
                element: Box::new(fixed_q8_8()),
                count: 64,
            },
        );
        let weights = builder.constant(
            ScConst::I64Vec(vec![128; 64 * 32]),
            ScType::Vec {
                element: Box::new(fixed_q8_8()),
                count: 64 * 32,
            },
        );
        let leak = builder.constant(ScConst::I64(16), fixed_q8_8());
        let gain = builder.constant(ScConst::I64(1), fixed_q8_8());
        let layer = DenseParams {
            n_inputs: 64,
            n_neurons: 32,
            ..DenseParams::default()
        };
        let dense = builder.dense_forward(inputs, weights, leak, gain, layer);
        builder.output("spikes", dense);

        let report = zu3eg_250mhz().estimate_graph(&builder.build());
        let plan = report
            .dense_fold_plan
            .as_ref()
            .expect("over-budget dense graph must carry fold plan");
        assert!(!report.fits_dsp_budget);
        assert_eq!(plan.dsp_per_cycle, 320);
        assert!(plan.fits_dsp_budget);
    }
}
513}