1use crate::ir::graph::{ScConst, ScGraph, ScOp};
11
/// Zynq UltraScale+ device SKUs for which this module carries part strings
/// and resource budgets.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SkuKind {
    /// The ZU3EG device.
    Zu3eg,
    /// The ZU9EG device.
    Zu9eg,
}
18
19#[derive(Debug, Clone, PartialEq, Eq)]
21pub enum SvTarget {
22 Generic,
23 Zynq7 {
24 device: String,
25 clock_mhz: u32,
26 },
27 ZynqUltraScalePlus {
28 sku: SkuKind,
29 clock_mhz: u32,
30 dsp_budget: u32,
31 bram_36k_budget: u32,
32 uram_budget: u32,
33 prefer_uram_over_bram_above_bits: u64,
34 },
35}
36
37#[derive(Debug, Clone, PartialEq)]
39pub struct ResourceReport {
40 pub target_name: String,
41 pub device_part: String,
42 pub clock_mhz: u32,
43 pub lut_estimated: u32,
44 pub ff_estimated: u32,
45 pub dsp_estimated: u32,
46 pub bram_36k_estimated: u32,
47 pub uram_estimated: u32,
48 pub critical_path_estimate_ns: f64,
49 pub dsp_budget: u32,
50 pub bram_36k_budget: u32,
51 pub uram_budget: u32,
52 pub fits_dsp_budget: bool,
53 pub fits_bram_budget: bool,
54 pub fits_uram_budget: bool,
55 pub dense_fold_plan: Option<DenseFoldPlan>,
56}
57
/// Schedule for time-multiplexing ("folding") a dense layer's
/// multiply-accumulates onto a limited number of DSP slices, as computed by
/// `plan_dense_fold`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DenseFoldPlan {
    /// Number of layer inputs the plan was built for.
    pub n_inputs: usize,
    /// Number of layer outputs the plan was built for.
    pub n_outputs: usize,
    /// `n_inputs * n_outputs`, saturated to `u32::MAX`.
    pub mac_count: u32,
    /// DSP budget the plan was built against.
    pub dsp_budget: u32,
    /// Output-side parallelism: `dsp_budget / n_inputs` clamped to
    /// `1..=n_outputs` (0 for a degenerate plan).
    pub output_parallelism: u32,
    /// Input-side parallelism: `dsp_budget / output_parallelism` clamped to
    /// `1..=n_inputs` (0 for a degenerate plan).
    pub input_parallelism: u32,
    /// `output_parallelism * input_parallelism`.
    pub dsp_per_cycle: u32,
    /// `ceil(n_inputs / input_parallelism)`.
    pub input_fold_factor: u32,
    /// `ceil(n_outputs / output_parallelism)`.
    pub output_fold_factor: u32,
    /// `input_fold_factor * output_fold_factor`, saturating.
    pub compute_cycles: u32,
    /// True when `mac_count` exceeds `dsp_budget`.
    pub fold_required: bool,
    /// True when `dsp_per_cycle <= dsp_budget`; for a degenerate plan, true
    /// when the budget is non-zero.
    pub fits_dsp_budget: bool,
}
75
76impl SkuKind {
77 pub fn as_str(self) -> &'static str {
78 match self {
79 Self::Zu3eg => "ZU3EG",
80 Self::Zu9eg => "ZU9EG",
81 }
82 }
83
84 pub fn device_part(self) -> &'static str {
85 match self {
86 Self::Zu3eg => "xczu3eg-sbva484-1-e",
87 Self::Zu9eg => "xczu9eg-ffvb1156-2-e",
88 }
89 }
90
91 pub fn lut_budget(self) -> u32 {
92 match self {
93 Self::Zu3eg => 70_560,
94 Self::Zu9eg => 274_080,
95 }
96 }
97
98 pub fn ff_budget(self) -> u32 {
99 match self {
100 Self::Zu3eg => 141_120,
101 Self::Zu9eg => 548_160,
102 }
103 }
104
105 pub fn dsp_budget(self) -> u32 {
106 match self {
107 Self::Zu3eg => 360,
108 Self::Zu9eg => 2_520,
109 }
110 }
111
112 pub fn bram_36k_budget(self) -> u32 {
113 match self {
114 Self::Zu3eg => 216,
115 Self::Zu9eg => 912,
116 }
117 }
118
119 pub fn uram_budget(self) -> u32 {
120 0
121 }
122}
123
124impl SvTarget {
125 pub fn zynq_ultrascale_plus(sku: SkuKind, clock_mhz: u32) -> Self {
126 Self::ZynqUltraScalePlus {
127 sku,
128 clock_mhz,
129 dsp_budget: sku.dsp_budget(),
130 bram_36k_budget: sku.bram_36k_budget(),
131 uram_budget: sku.uram_budget(),
132 prefer_uram_over_bram_above_bits: 1_u64 << 20,
133 }
134 }
135
136 pub fn target_name(&self) -> String {
137 match self {
138 Self::Generic => "generic".to_string(),
139 Self::Zynq7 { device, .. } => format!("zynq7:{device}"),
140 Self::ZynqUltraScalePlus { sku, .. } => {
141 format!("zynq-ultrascale-plus:{}", sku.as_str())
142 }
143 }
144 }
145
146 pub fn clock_mhz(&self) -> u32 {
147 match self {
148 Self::Generic => 100,
149 Self::Zynq7 { clock_mhz, .. } | Self::ZynqUltraScalePlus { clock_mhz, .. } => {
150 *clock_mhz
151 }
152 }
153 }
154
155 pub fn device_part(&self) -> String {
156 match self {
157 Self::Generic => "generic".to_string(),
158 Self::Zynq7 { device, .. } => device.clone(),
159 Self::ZynqUltraScalePlus { sku, .. } => sku.device_part().to_string(),
160 }
161 }
162
163 pub fn dsp_primitive(&self) -> &'static str {
164 match self {
165 Self::ZynqUltraScalePlus { .. } => "DSP48E2",
166 Self::Generic | Self::Zynq7 { .. } => "generic",
167 }
168 }
169
170 pub fn dsp_attribute(&self) -> Option<&'static str> {
171 match self {
172 Self::ZynqUltraScalePlus { .. } => Some(
173 "(* use_dsp = \"yes\", sc_target_dsp = \"DSP48E2\", sc_target_family = \"zynq_ultrascale_plus\" *)",
174 ),
175 Self::Generic | Self::Zynq7 { .. } => None,
176 }
177 }
178
179 pub fn ram_style_for_bits(&self, bits: u64) -> Option<&'static str> {
180 match self {
181 Self::ZynqUltraScalePlus {
182 uram_budget,
183 prefer_uram_over_bram_above_bits,
184 ..
185 } if *uram_budget > 0 && bits >= *prefer_uram_over_bram_above_bits => Some("ultra"),
186 Self::ZynqUltraScalePlus { .. } if bits >= 1_024 => Some("block"),
187 Self::ZynqUltraScalePlus { .. } => Some("distributed"),
188 Self::Generic | Self::Zynq7 { .. } => None,
189 }
190 }
191
192 pub fn dense_fold_plan(&self, n_inputs: usize, n_outputs: usize) -> Option<DenseFoldPlan> {
193 let dsp_budget = match self {
194 Self::ZynqUltraScalePlus { dsp_budget, .. } => *dsp_budget,
195 Self::Generic | Self::Zynq7 { .. } => return None,
196 };
197 Some(plan_dense_fold(n_inputs, n_outputs, dsp_budget))
198 }
199
200 pub fn header_comment(&self) -> String {
201 match self {
202 Self::Generic => String::new(),
203 Self::Zynq7 { device, clock_mhz } => format!(
204 "// Target: Zynq-7 device={device}, clock={clock_mhz} MHz\n\n"
205 ),
206 Self::ZynqUltraScalePlus { sku, clock_mhz, .. } => format!(
207 "// Target: Zynq UltraScale+ MPSoC {}, part={}, clock={} MHz\n// DSP primitive: DSP48E2\n\n",
208 sku.as_str(),
209 sku.device_part(),
210 clock_mhz
211 ),
212 }
213 }
214
215 pub fn estimate_graph(&self, graph: &ScGraph) -> ResourceReport {
216 let mut lut_estimated = 128_u32;
217 let mut ff_estimated = 128_u32;
218 let mut dsp_estimated = 0_u32;
219 let mut bram_bits = 0_u64;
220 let uram_bits = 0_u64;
221 let mut critical_path_estimate_ns = 2.5_f64;
222 let mut dense_fold_plan: Option<DenseFoldPlan> = None;
223
224 for op in &graph.ops {
225 match op {
226 ScOp::DenseForward { params, .. } => {
227 let macs = saturating_u32(params.n_inputs.saturating_mul(params.n_neurons));
228 dsp_estimated = dsp_estimated.saturating_add(macs);
229 if let Some(plan) = self.dense_fold_plan(params.n_inputs, params.n_neurons) {
230 if plan.fold_required {
231 dense_fold_plan = Some(plan);
232 }
233 }
234 lut_estimated = lut_estimated.saturating_add(220).saturating_add(macs * 6);
235 ff_estimated = ff_estimated.saturating_add(180).saturating_add(macs * 4);
236 bram_bits = bram_bits.saturating_add(
237 (params.n_inputs as u64)
238 .saturating_mul(params.n_neurons as u64)
239 .saturating_mul(params.data_width as u64),
240 );
241 critical_path_estimate_ns = critical_path_estimate_ns.max(4.0);
242 }
243 ScOp::DclsLayer { params, .. } => {
244 let taps = saturating_u32(params.n_taps);
245 dsp_estimated = dsp_estimated.saturating_add(taps);
246 lut_estimated = lut_estimated.saturating_add(320).saturating_add(taps * 48);
247 ff_estimated = ff_estimated.saturating_add(220).saturating_add(taps * 32);
248 bram_bits = bram_bits.saturating_add(
249 (params.delay_depth as u64).saturating_mul(params.n_taps as u64),
250 );
251 critical_path_estimate_ns = critical_path_estimate_ns.max(4.5);
252 }
253 ScOp::LifStep { .. } => {
254 dsp_estimated = dsp_estimated.saturating_add(2);
255 lut_estimated = lut_estimated.saturating_add(180);
256 ff_estimated = ff_estimated.saturating_add(96);
257 critical_path_estimate_ns = critical_path_estimate_ns.max(3.2);
258 }
259 ScOp::KuramotoStep { .. } => {
260 dsp_estimated = dsp_estimated.saturating_add(4);
261 lut_estimated = lut_estimated.saturating_add(512);
262 ff_estimated = ff_estimated.saturating_add(256);
263 critical_path_estimate_ns = critical_path_estimate_ns.max(5.0);
264 }
265 ScOp::Constant { value, .. } => {
266 bram_bits = bram_bits.saturating_add(constant_bits(value));
267 }
268 ScOp::Input { .. }
269 | ScOp::Output { .. }
270 | ScOp::Encode { .. }
271 | ScOp::BitwiseAnd { .. }
272 | ScOp::Popcount { .. }
273 | ScOp::BitwiseXor { .. }
274 | ScOp::Reduce { .. }
275 | ScOp::GraphForward { .. }
276 | ScOp::SoftmaxAttention { .. }
277 | ScOp::Scale { .. }
278 | ScOp::Offset { .. }
279 | ScOp::DivConst { .. } => {
280 lut_estimated = lut_estimated.saturating_add(16);
281 ff_estimated = ff_estimated.saturating_add(8);
282 }
283 }
284 }
285
286 let bram_36k_estimated = ceil_div_u64(bram_bits, 36_864).min(u64::from(u32::MAX)) as u32;
287 let uram_estimated = ceil_div_u64(uram_bits, 294_912).min(u64::from(u32::MAX)) as u32;
288 let (dsp_budget, bram_budget, uram_budget) = match self {
289 Self::ZynqUltraScalePlus {
290 dsp_budget,
291 bram_36k_budget,
292 uram_budget,
293 ..
294 } => (*dsp_budget, *bram_36k_budget, *uram_budget),
295 Self::Zynq7 { .. } | Self::Generic => (u32::MAX, u32::MAX, u32::MAX),
296 };
297
298 ResourceReport {
299 target_name: self.target_name(),
300 device_part: self.device_part(),
301 clock_mhz: self.clock_mhz(),
302 lut_estimated,
303 ff_estimated,
304 dsp_estimated,
305 bram_36k_estimated,
306 uram_estimated,
307 critical_path_estimate_ns,
308 dsp_budget,
309 bram_36k_budget: bram_budget,
310 uram_budget,
311 fits_dsp_budget: dsp_estimated <= dsp_budget,
312 fits_bram_budget: bram_36k_estimated <= bram_budget,
313 fits_uram_budget: uram_estimated <= uram_budget,
314 dense_fold_plan,
315 }
316 }
317}
318
319fn plan_dense_fold(n_inputs: usize, n_outputs: usize, dsp_budget: u32) -> DenseFoldPlan {
320 let mac_count = saturating_u32(n_inputs.saturating_mul(n_outputs));
321 if n_inputs == 0 || n_outputs == 0 || dsp_budget == 0 {
322 return DenseFoldPlan {
323 n_inputs,
324 n_outputs,
325 mac_count,
326 dsp_budget,
327 output_parallelism: 0,
328 input_parallelism: 0,
329 dsp_per_cycle: 0,
330 input_fold_factor: 0,
331 output_fold_factor: 0,
332 compute_cycles: 0,
333 fold_required: mac_count > dsp_budget,
334 fits_dsp_budget: dsp_budget >= 1,
335 };
336 }
337
338 let n_inputs_u32 = saturating_u32(n_inputs).max(1);
339 let n_outputs_u32 = saturating_u32(n_outputs).max(1);
340 let output_parallelism = if dsp_budget >= n_inputs_u32 {
341 (dsp_budget / n_inputs_u32).clamp(1, n_outputs_u32)
342 } else {
343 1
344 };
345 let input_parallelism = (dsp_budget / output_parallelism).clamp(1, n_inputs_u32);
346 let dsp_per_cycle = output_parallelism.saturating_mul(input_parallelism);
347 let input_fold_factor = ceil_div_u64(n_inputs_u32 as u64, input_parallelism as u64) as u32;
348 let output_fold_factor = ceil_div_u64(n_outputs_u32 as u64, output_parallelism as u64) as u32;
349 let compute_cycles = input_fold_factor.saturating_mul(output_fold_factor);
350 DenseFoldPlan {
351 n_inputs,
352 n_outputs,
353 mac_count,
354 dsp_budget,
355 output_parallelism,
356 input_parallelism,
357 dsp_per_cycle,
358 input_fold_factor,
359 output_fold_factor,
360 compute_cycles,
361 fold_required: mac_count > dsp_budget,
362 fits_dsp_budget: dsp_per_cycle <= dsp_budget,
363 }
364}
365
366fn constant_bits(value: &ScConst) -> u64 {
367 match value {
368 ScConst::F64(_) | ScConst::I64(_) => 16,
369 ScConst::U64(_) => 32,
370 ScConst::F64Vec(values) => values.len() as u64 * 16,
371 ScConst::I64Vec(values) => values.len() as u64 * 16,
372 }
373}
374
/// Divides `value` by `divisor`, rounding up.
///
/// A zero `value` returns 0 without dividing, so it is accepted even with a
/// zero `divisor`; any other `value` with a zero `divisor` panics.
fn ceil_div_u64(value: u64, divisor: u64) -> u64 {
    match value {
        0 => 0,
        // Subtracting first avoids the overflow `value + divisor - 1` could hit.
        nonzero => (nonzero - 1) / divisor + 1,
    }
}
382
/// Narrows a `usize` to `u32`, returning `u32::MAX` for values that do not fit.
fn saturating_u32(value: usize) -> u32 {
    const CEILING: usize = u32::MAX as usize;

    if value > CEILING {
        u32::MAX
    } else {
        // In range, so the cast cannot truncate.
        value as u32
    }
}
386
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ir::builder::ScGraphBuilder;
    use crate::ir::graph::{DenseParams, ScConst, ScType};

    /// Fixed-point type (width 16, 8 fractional bits) used by every fixture.
    fn fixed16() -> ScType {
        ScType::FixedPoint { width: 16, frac: 8 }
    }

    /// ZU3EG target at 250 MHz, the device all estimates run against.
    fn zu3eg_at_250() -> SvTarget {
        SvTarget::zynq_ultrascale_plus(SkuKind::Zu3eg, 250)
    }

    /// SKU budget getters return the expected DSP/BRAM/URAM figures.
    #[test]
    fn ultrascale_plus_sku_budgets_match_target_baseline_table() {
        let expected = [
            (SkuKind::Zu3eg, 360, 216, 0),
            (SkuKind::Zu9eg, 2_520, 912, 0),
        ];
        for (sku, dsp, bram_36k, uram) in expected {
            assert_eq!(sku.dsp_budget(), dsp);
            assert_eq!(sku.bram_36k_budget(), bram_36k);
            assert_eq!(sku.uram_budget(), uram);
        }
    }

    /// The UltraScale+ target names DSP48E2 and no other DSP primitive.
    #[test]
    fn ultrascale_plus_uses_device_family_dsp_primitive() {
        let target = zu3eg_at_250();
        assert_eq!(target.dsp_primitive(), "DSP48E2");

        let attribute = target.dsp_attribute().unwrap();
        assert!(attribute.contains("DSP48E2"));

        // Assembled at runtime, so the other primitive's name never appears
        // as a literal in this file (presumably to keep source scans clean).
        let other_primitive = format!("DSP{}", 58);
        assert!(!attribute.contains(&other_primitive));
    }

    /// A 4x3 dense layer is counted in the DSP estimate and fits the ZU3EG.
    #[test]
    fn ultrascale_plus_resource_report_tracks_dense_mac_budget() {
        let mut builder = ScGraphBuilder::new("resource_dense");
        let input_node = builder.input(
            "inputs",
            ScType::Vec {
                element: Box::new(fixed16()),
                count: 4,
            },
        );
        let weight_node = builder.constant(
            ScConst::I64Vec(vec![128; 12]),
            ScType::Vec {
                element: Box::new(fixed16()),
                count: 12,
            },
        );
        let leak_node = builder.constant(ScConst::I64(16), fixed16());
        let gain_node = builder.constant(ScConst::I64(1), fixed16());
        let dense_node = builder.dense_forward(
            input_node,
            weight_node,
            leak_node,
            gain_node,
            DenseParams {
                n_inputs: 4,
                n_neurons: 3,
                ..DenseParams::default()
            },
        );
        builder.output("spikes", dense_node);
        let graph = builder.build();

        let report = zu3eg_at_250().estimate_graph(&graph);

        assert!(report.dsp_estimated >= 12);
        assert!(report.bram_36k_estimated <= report.bram_36k_budget);
        assert!(report.fits_dsp_budget);
        assert_eq!(report.device_part, "xczu3eg-sbva484-1-e");
    }

    /// A 64x32 dense layer folds into 7 cycles of 320 DSPs on the ZU3EG.
    #[test]
    fn dense_fold_plan_maps_shd_scale_dense_into_zu3eg_budget() {
        let plan = zu3eg_at_250()
            .dense_fold_plan(64, 32)
            .expect("UltraScale+ target must expose dense folding");

        assert_eq!(plan.mac_count, 2_048);
        assert_eq!(plan.dsp_budget, 360);
        assert_eq!(plan.output_parallelism, 5);
        assert_eq!(plan.input_parallelism, 64);
        assert_eq!(plan.dsp_per_cycle, 320);
        assert_eq!(plan.output_fold_factor, 7);
        assert_eq!(plan.input_fold_factor, 1);
        assert_eq!(plan.compute_cycles, 7);
        assert!(plan.fold_required);
        assert!(plan.fits_dsp_budget);
    }

    /// When the unfolded layer exceeds the DSP budget, the report still
    /// carries a fold plan that fits.
    #[test]
    fn resource_report_carries_dense_fold_plan_when_unfurled_budget_fails() {
        let mut builder = ScGraphBuilder::new("folded_resource_dense");
        let input_node = builder.input(
            "inputs",
            ScType::Vec {
                element: Box::new(fixed16()),
                count: 64,
            },
        );
        let weight_node = builder.constant(
            ScConst::I64Vec(vec![128; 64 * 32]),
            ScType::Vec {
                element: Box::new(fixed16()),
                count: 64 * 32,
            },
        );
        let leak_node = builder.constant(ScConst::I64(16), fixed16());
        let gain_node = builder.constant(ScConst::I64(1), fixed16());
        let dense_node = builder.dense_forward(
            input_node,
            weight_node,
            leak_node,
            gain_node,
            DenseParams {
                n_inputs: 64,
                n_neurons: 32,
                ..DenseParams::default()
            },
        );
        builder.output("spikes", dense_node);

        let target = zu3eg_at_250();
        let report = target.estimate_graph(&builder.build());
        let plan = report
            .dense_fold_plan
            .as_ref()
            .expect("over-budget dense graph must carry fold plan");

        assert!(!report.fits_dsp_budget);
        assert_eq!(plan.dsp_per_cycle, 320);
        assert!(plan.fits_dsp_budget);
    }
}