PandA Publications - panda.deib.polimi.it

This page contains a list of publications related to the PandA framework.

The publication list is organized by year. Inside each year, most recent publications appear first.

2025

C. Silvano, F. Ferrandi, S. Curzel, D. Ielmini, S. Perri, F. Spagnolo, P. Corsonello, S. F. Schifano, C. Zambelli, A. Garofalo, F. Conti, and L. Benini, “Multi-Partner Project: Architectures and Design Methodologies to Accelerate AI Workloads. The ICSC Flagship 2 Project,” in Proceedings -Design, Automation and Test in Europe, DATE, 2025, p. 1–7.
[BibTeX]

@conference{11311_1297602,
author = {Silvano, Cristina and Ferrandi, Fabrizio and Curzel, Serena and Ielmini, Daniele and Perri, Stefania and Spagnolo, Fanny and Corsonello, Pasquale and Schifano, Sebastiano Fabio and Zambelli, Cristian and Garofalo, Angelo and Conti, Francesco and Benini, Luca},
title = {Multi-Partner Project: Architectures and Design Methodologies to Accelerate AI Workloads. The ICSC Flagship 2 Project},
year = {2025},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
booktitle = {Proceedings -Design, Automation and Test in Europe, DATE},
doi = {10.23919/date64628.2025.10993254},
pages = {1--7}
}

S. Curzel and M. Gribaudo, “Custom Floating-Point Computations for the Optimization of ODE Solvers on FPGA,” in 16th Workshop on Parallel Programming and Run-Time Management Techniques for Many-Core Architectures and 14th Workshop on Design Tools and Architectures for Multicore Embedded Computing Platforms (PARMA-DITAM 2025), 2025, p. 1–13.
[BibTeX]

@conference{11311_1285808,
author = {Curzel, Serena and Gribaudo, Marco},
title = {Custom Floating-Point Computations for the Optimization of ODE Solvers on FPGA},
year = {2025},
publisher = {Schloss Dagstuhl- Leibniz-Zentrum fur Informatik GmbH, Dagstuhl Publishing},
volume = {127},
booktitle = {16th Workshop on Parallel Programming and Run-Time Management Techniques for Many-Core Architectures and 14th Workshop on Design Tools and Architectures for Multicore Embedded Computing Platforms (PARMA-DITAM 2025)},
doi = {10.4230/oasics.parma-ditam.2025.2},
pages = {1--13}
}

C. Silvano, D. Ielmini, F. Ferrandi, L. Fiorin, S. Curzel, L. Benini, F. Conti, A. Garofalo, C. Zambelli, E. Calore, S. Schifano, M. Palesi, G. Ascia, D. Patti, N. Petra, D. De Caro, L. Lavagno, T. Urso, V. Cardellini, G. Cardarilli, R. Birke, and S. Perri, “A Survey on Deep Learning Hardware Accelerators for Heterogeneous HPC Platforms,” ACM COMPUTING SURVEYS, vol. 57, iss. 11, p. 1–39, 2025.
[BibTeX]

@article{11311_1297601,
author = {Silvano, Cristina and Ielmini, Daniele and Ferrandi, Fabrizio and Fiorin, Leandro and Curzel, Serena and Benini, Luca and Conti, Francesco and Garofalo, Angelo and Zambelli, Cristian and Calore, Enrico and Schifano, Sebastiano and Palesi, Maurizio and Ascia, Giuseppe and Patti, Davide and Petra, Nicola and De Caro, Davide and Lavagno, Luciano and Urso, Teodoro and Cardellini, Valeria and Cardarilli, Gian and Birke, Robert and Perri, Stefania},
title = {A Survey on Deep Learning Hardware Accelerators for Heterogeneous HPC Platforms},
year = {2025},
journal = {ACM COMPUTING SURVEYS},
volume = {57},
url = {https://dl.acm.org/doi/10.1145/3729215},
doi = {10.1145/3729215},
pages = {1--39},
number = {11}
}

S. Curzel, S. Jovic, M. Fiorito, A. Tumeo, and F. Ferrandi, “Pre-Scheduling of Affine Loops for HLS Pipelining,” in Euro-Par 2024: Parallel Processing Workshops, 2025, p. 1–12.
[BibTeX]

@conference{11311_1277195,
author = {Curzel, Serena and Jovic, Sofija and Fiorito, Michele and Tumeo, Antonino and Ferrandi, Fabrizio},
title = {Pre-Scheduling of Affine Loops for HLS Pipelining},
year = {2025},
publisher = {Springer},
volume = {15386},
booktitle = {Euro-Par 2024: Parallel Processing Workshops},
doi = {10.1007/978-3-031-90203-1_53},
pages = {1--12}
}

M. Fiorito, S. Curzel, and F. Ferrandi, “Augmented Co-Simulation for Fast Functional and System-Level Verification of HLS Accelerators,” in 2025 IEEE/ACM International Conference On Computer Aided Design (ICCAD), 2025, p. 1–9.
[BibTeX]

@conference{11311_1300850,
author = {Fiorito, Michele and Curzel, Serena and Ferrandi, Fabrizio},
title = {Augmented Co-Simulation for Fast Functional and System-Level Verification of HLS Accelerators},
year = {2025},
booktitle = {2025 IEEE/ACM International Conference On Computer Aided Design (ICCAD)},
doi = {10.1109/iccad66269.2025.11240905},
pages = {1--9}
}

A. Limaye, N. Bohm Agostini, C. Barone, V. G. Castellana, M. Fiorito, F. Ferrandi, A. Marquez, and A. Tumeo, “A Synthesis Methodology for Intelligent Memory Interfaces in Accelerator Systems,” in Proceedings of the 30th Asia and South Pacific Design Automation Conference, 2025, p. 1016–1022.
[BibTeX]

@conference{11311_1285593,
author = {Limaye, Ankur and Bohm Agostini, Nicolas and Barone, Claudio and Castellana, Vito Giovanni and Fiorito, Michele and Ferrandi, Fabrizio and Marquez, Andres and Tumeo, Antonino},
title = {A Synthesis Methodology for Intelligent Memory Interfaces in Accelerator Systems},
year = {2025},
publisher = {IEEE},
booktitle = {Proceedings of the 30th Asia and South Pacific Design Automation Conference},
doi = {10.1145/3658617.3697553},
pages = {1016--1022}
}

2024

M. Fiorito, S. Curzel, G. Gozzi, and F. Ferrandi, “A DNN-based Background Segmentation Accelerator for FPGA-equipped satellites,” in CF ’24 Companion: Proceedings of the 21st ACM International Conference on Computing Frontiers Workshops and Special Sessions, 2024, p. 128–132.
[BibTeX]

@conference{11311_1268546,
author = {Fiorito, Michele and Curzel, Serena and Gozzi, Giovanni and Ferrandi, Fabrizio},
title = {A DNN-based Background Segmentation Accelerator for FPGA-equipped satellites},
year = {2024},
booktitle = {CF '24 Companion: Proceedings of the 21st ACM International Conference on Computing Frontiers Workshops and Special Sessions},
doi = {10.1145/3637543.3652979},
pages = {128--132}
}

F. Ferrandi, M. Fiorito, C. Barone, G. Gozzi, and S. Curzel, “High-Level Synthesis Developments in the Context of European Space Technology Research (Invited Talk),” in 15th Workshop on Parallel Programming and Run-Time Management Techniques for Many-Core Architectures and 13th Workshop on Design Tools and Architectures for Multicore Embedded Computing Platforms (PARMA-DITAM 2024), 2024, p. 1–12.
[BibTeX]

@conference{11311_1262109,
author = {Ferrandi, Fabrizio and Fiorito, Michele and Barone, Claudio and Gozzi, Giovanni and Curzel, Serena},
title = {High-Level Synthesis Developments in the Context of European Space Technology Research (Invited Talk)},
year = {2024},
volume = {116},
booktitle = {15th Workshop on Parallel Programming and Run-Time Management Techniques for Many-Core Architectures and 13th Workshop on Design Tools and Architectures for Multicore Embedded Computing Platforms (PARMA-DITAM 2024)},
doi = {10.4230/oasics.parma-ditam.2024.1},
pages = {1--12}
}

S. Curzel, “Modern High-Level Synthesis: Improving Productivity with a Multi-level Approach,” in Special Topics in Information Technology, Springer, 2024, p. 15–25.
[BibTeX]

@inbook{11311_1262560,
author = {Curzel, Serena},
title = {Modern High-Level Synthesis: Improving Productivity with a Multi-level Approach},
year = {2024},
publisher = {Springer},
booktitle = {Special Topics in Information Technology},
doi = {10.1007/978-3-031-51500-2_2},
isbn = {9783031514999},
isbn = {9783031515002},
pages = {15--25}
}

C. Pilato, S. Banik, J. Beranek, F. Brocheton, J. Castrillon, R. Cevasco, R. Cmar, S. Curzel, F. Ferrandi, K. F. A. Friebel, A. Galizia, M. Grasso, P. Silva, J. Martinovic, G. Palermo, M. Paolino, A. Parodi, A. Parodi, F. Pintus, R. Polig, D. Poulet, F. Regazzoni, B. Ringlein, R. Rocco, K. Slaninova, T. Slooff, S. Soldavini, F. Suchert, M. Tibaldi, B. Weiss, and C. Hagleitner, “A System Development Kit for Big Data Applications on FPGA-based Clusters: The EVEREST Approach,” in Proceedings -Design, Automation and Test in Europe, DATE, 2024, p. 1–6.
[BibTeX]

@conference{11311_1272283,
author = {Pilato, C. and Banik, S. and Beranek, J. and Brocheton, F. and Castrillon, J. and Cevasco, R. and Cmar, R. and Curzel, S. and Ferrandi, F. and Friebel, K. F. A. and Galizia, A. and Grasso, M. and Silva, P. and Martinovic, J. and Palermo, G. and Paolino, M. and Parodi, A. and Parodi, A. and Pintus, F. and Polig, R. and Poulet, D. and Regazzoni, F. and Ringlein, B. and Rocco, R. and Slaninova, K. and Slooff, T. and Soldavini, S. and Suchert, F. and Tibaldi, M. and Weiss, B. and Hagleitner, C.},
title = {A System Development Kit for Big Data Applications on FPGA-based Clusters: The EVEREST Approach},
year = {2024},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
booktitle = {Proceedings -Design, Automation and Test in Europe, DATE},
doi = {10.23919/DATE58400.2024.10546518},
pages = {1--6}
}

S. Soldavini, F. Suchert, S. Curzel, M. Fiorito, K. Friebel, F. Ferrandi, R. Cmar, J. Castrillon, and C. Pilato, “Etna: MLIR-Based System-Level Design and Optimization for Transparent Application Execution on CPU-FPGA Nodes,” in Proceedings – 2024 IEEE 32nd Annual International Symposium on Field-Programmable Custom Computing Machines, FCCM 2024, 2024, p. 224–224.
[BibTeX]

@conference{11311_1297708,
author = {Soldavini, S. and Suchert, F. and Curzel, S. and Fiorito, M. and Friebel, K. and Ferrandi, F. and Cmar, R. and Castrillon, J. and Pilato, C.},
title = {Etna: MLIR-Based System-Level Design and Optimization for Transparent Application Execution on CPU-FPGA Nodes},
year = {2024},
publisher = {IEEE},
booktitle = {Proceedings - 2024 IEEE 32nd Annual International Symposium on Field-Programmable Custom Computing Machines, FCCM 2024},
keywords = {FPGA; HBM; HLS; MLIR},
doi = {10.1109/FCCM60383.2024.00012},
pages = {224--224}
}

C. Barone, R. Kushwah, A. Limaye, V. G. Castellana, G. Gozzi, M. Fiorito, F. Ferrandi, and A. Tumeo, “To Cache or not to Cache? Exploring the Design Space of Tunable, HLS-generated Accelerators,” in PROCEEDINGS OF THE INTERNATIONAL SYMPOSIUM ON MEMORY SYSTEMS, MEMSYS 2024, 2024, p. 210–218.
[BibTeX]

@conference{11311_1285590,
author = {Barone, Claudio and Kushwah, Rishika and Limaye, Ankur and Castellana, Vito Giovanni and Gozzi, Giovanni and Fiorito, Michele and Ferrandi, Fabrizio and Tumeo, Antonino},
title = {To Cache or not to Cache? Exploring the Design Space of Tunable, HLS-generated Accelerators},
year = {2024},
publisher = {Association for Computing Machinery},
booktitle = {PROCEEDINGS OF THE INTERNATIONAL SYMPOSIUM ON MEMORY SYSTEMS, MEMSYS 2024},
doi = {10.1145/3695794.3695815},
pages = {210--218}
}

A. Limaye, C. Barone, N. B. Agostini, M. Minutoli, J. Manzano, V. G. Castellana, G. Gozzi, M. Fiorito, S. Curzel, F. Ferrandi, and A. Tumeo, “Towards Automated Generation of Chiplet-Based Systems Invited Paper,” in 2024 29th Asia and South Pacific Design Automation Conference (ASP-DAC), 2024, p. 771–776.
[BibTeX]

@conference{11311_1263097,
author = {Limaye, Ankur and Barone, Claudio and Agostini, Nicolas Bohm and Minutoli, Marco and Manzano, Joseph and Castellana, Vito Giovanni and Gozzi, Giovanni and Fiorito, Michele and Curzel, Serena and Ferrandi, Fabrizio and Tumeo, Antonino},
title = {Towards Automated Generation of Chiplet-Based Systems Invited Paper},
year = {2024},
booktitle = {2024 29th Asia and South Pacific Design Automation Conference (ASP-DAC)},
doi = {10.1109/asp-dac58780.2024.10473980},
pages = {771--776}
}

G. Gozzi, M. Fiorito, S. Curzel, C. Barone, V. G. Castellana, M. Minutoli, A. Tumeo, and F. Ferrandi, “SPARTA: High-Level Synthesis of Parallel Multi-Threaded Accelerators,” ACM TRANSACTIONS ON RECONFIGURABLE TECHNOLOGY AND SYSTEMS, vol. 18, iss. 1, p. 1–30, 2024.
[BibTeX]

@article{11311_1270384,
author = {Gozzi, Giovanni and Fiorito, Michele and Curzel, Serena and Barone, Claudio and Castellana, Vito Giovanni and Minutoli, Marco and Tumeo, Antonino and Ferrandi, Fabrizio},
title = {SPARTA: High-Level Synthesis of Parallel Multi-Threaded Accelerators},
year = {2024},
journal = {ACM TRANSACTIONS ON RECONFIGURABLE TECHNOLOGY AND SYSTEMS},
volume = {18},
url = {https://dl.acm.org/doi/10.1145/3677035},
doi = {10.1145/3677035},
pages = {1--30},
number = {1}
}

2023

S. Curzel, M. Fiorito, P. L. Cueva, T. Jorge, T. Tsiodras, and F. Ferrandi, “Exploration of Synthesis Methods from Simulink Models to FPGA for Aerospace Applications,” in CF ’23: Proceedings of the 20th ACM International Conference on Computing Frontiers, 2023, p. 243–249.
[BibTeX]

@conference{11311_1247638,
author = {Curzel, Serena and Fiorito, Michele and Cueva, Patricia Lopez and Jorge, Tiago and Tsiodras, Thanassis and Ferrandi, Fabrizio},
title = {Exploration of Synthesis Methods from Simulink Models to FPGA for Aerospace Applications},
year = {2023},
booktitle = {CF '23: Proceedings of the 20th ACM International Conference on Computing Frontiers},
doi = {10.1145/3587135.3592766},
isbn = {9798400701405},
pages = {243--249}
}

L. E. Pozzoni, F. Ferrandi, L. Mendola, A. A. Palazzo, and F. Pappalardo, “Using High-Level Synthesis to model System Verilog procedural timing controls,” in Design, Automation & Test in Europe Conference & Exhibition (DATE), 2023, p. 1–6.
[BibTeX]

@conference{11311_1251607,
author = {Pozzoni, Luca Ezio and Ferrandi, Fabrizio and Mendola, Loris and Palazzo, Alfio Antonino and Pappalardo, Francesco},
title = {Using High-Level Synthesis to model System Verilog procedural timing controls},
year = {2023},
booktitle = {Design, Automation & Test in Europe Conference & Exhibition (DATE)},
doi = {10.23919/DATE56975.2023.10136907},
isbn = {979-8-3503-9624-9},
pages = {1--6}
}

M. Fiorito, S. Curzel, and F. Ferrandi, “TrueFloat: A Templatized Arithmetic Library for HLS Floating-Point Operators,” in SAMOS 2023: Embedded Computer Systems: Architectures, Modeling, and Simulation, 2023, p. 486–493.
[BibTeX]

@conference{11311_1254959,
author = {Fiorito, Michele and Curzel, Serena and Ferrandi, Fabrizio},
title = {TrueFloat: A Templatized Arithmetic Library for HLS Floating-Point Operators},
year = {2023},
volume = {14385},
booktitle = {SAMOS 2023: Embedded Computer Systems: Architectures, Modeling, and Simulation},
doi = {10.1007/978-3-031-46077-7_35},
isbn = {978-3-031-46076-0},
isbn = {978-3-031-46077-7},
pages = {486--493}
}

V. G. Castellana, N. B. Agostini, A. Limaye, V. Amatya, M. Minutoli, J. Manzano, A. Tumeo, S. Curzel, M. Fiorito, and F. Ferrandi, “Towards On-Chip Learning for Low Latency Reasoning with End-to-End Synthesis,” in ASPDAC ’23: Proceedings of the 28th Asia and South Pacific Design Automation Conference, 2023, p. 632–638.
[BibTeX] [Abstract]

The Software Defined Architectures (SODA) Synthesizer is an open-source compiler-based tool able to automatically generate domain-specialized systems targeting Application-Specific Integrated Circuits (ASICs) or Field Programmable Gate Arrays (FPGAs) starting from high-level programming. SODA is composed of a frontend, SODA-OPT, which leverages the multilevel intermediate representation (MLIR) framework to interface with productive programming tools (e.g., machine learning frameworks), identify kernels suitable for acceleration, and perform high-level optimizations, and of a state-of-the-art high-level synthesis backend, Bambu from the PandA framework, to generate custom accelerators. One specific application of the SODA Synthesizer is the generation of accelerators to enable ultra-low latency inference and control on autonomous systems for scientific discovery (e.g., electron microscopes, sensors in particle accelerators, etc.). This paper provides an overview of the flow in the context of the generation of accelerators for edge processing to be integrated in transmission electron microscopy (TEM) devices, focusing on use cases from precision material synthesis. We show the tool in action with an example of design space exploration for inference on reconfigurable devices with a conventional deep neural network model (LeNet). Finally, we discuss the research directions and opportunities enabled by SODA in the area of autonomous control for scientific experimental workflows.

@conference{11311_1229391,
author = {Castellana, Vito Giovanni and Agostini, Nicolas Bohm and Limaye, Ankur and Amatya, Vinay and Minutoli, Marco and Manzano, Joseph and Tumeo, Antonino and Curzel, Serena and Fiorito, Michele and Ferrandi, Fabrizio},
title = {Towards On-Chip Learning for Low Latency Reasoning with End-to-End Synthesis},
year = {2023},
booktitle = {ASPDAC '23: Proceedings of the 28th Asia and South Pacific Design Automation Conference},
abstract = {The Software Defined Architectures (SODA) Synthesizer is an open-source compiler-based tool able to automatically generate domain-specialized systems targeting Application-Specific Integrated Circuits (ASICs) or Field Programmable Gate Arrays (FPGAs) starting from high-level programming. SODA is composed of a frontend, SODA-OPT, which leverages the multilevel intermediate representation (MLIR) framework to interface with productive programming tools (e.g., machine learning frameworks), identify kernels suitable for acceleration, and perform high-level optimizations, and of a state-of-the-art high-level synthesis backend, Bambu from the PandA framework, to generate custom accelerators. One specific application of the SODA Synthesizer is the generation of accelerators to enable ultra-low latency inference and control on autonomous systems for scientific discovery (e.g., electron microscopes, sensors in particle accelerators, etc.). This paper provides an overview of the flow in the context of the generation of accelerators for edge processing to be integrated in transmission electron microscopy (TEM) devices, focusing on use cases from precision material synthesis. We show the tool in action with an example of design space exploration for inference on reconfigurable devices with a conventional deep neural network model (LeNet). Finally, we discuss the research directions and opportunities enabled by SODA in the area of autonomous control for scientific experimental workflows.},
doi = {10.1145/3566097.3568360},
isbn = {9781450397834},
pages = {632--638}
}

N. Ibellaatti, E. Lepape, A. Kilic, K. Akyel, K. Chouayakh, F. Ferrandi, C. Barone, S. Curzel, M. Fiorito, G. Gozzi, M. Masmano, A. R. Navarro, M. Muñioz, V. N. Gallego, P. L. Cueva, J. Letrillard, and F. Wartel, “HERMES: qualification of High pErformance pRogrammable Microprocessor and dEvelopment of Software ecosystem,” in 2023 Design, Automation & Test in Europe Conference & Exhibition (DATE), 2023, p. 1–5.
[BibTeX]

@conference{11311_1249326,
author = {Ibellaatti, Nadia and Lepape, Edouard and Kilic, Alp and Akyel, Kaya and Chouayakh, Kassem and Ferrandi, Fabrizio and Barone, Claudio and Curzel, Serena and Fiorito, Michele and Gozzi, Giovanni and Masmano, Miguel and Navarro, Ana Risquez and Muñioz, Manuel and Gallego, Vicente Nicolau and Cueva, Patricia Lopez and Letrillard, Jean-noel and Wartel, Franck},
title = {HERMES: qualification of High pErformance pRogrammable Microprocessor and dEvelopment of Software ecosystem},
year = {2023},
booktitle = {2023 Design, Automation & Test in Europe Conference & Exhibition (DATE)},
doi = {10.23919/DATE56975.2023.10136921},
pages = {1--5}
}

2022

S. Curzel, S. Jovic, M. Fiorito, A. Tumeo, and F. Ferrandi, “Higher-Level Synthesis: experimenting with MLIR polyhedral representations for accelerator design,” in IMPACT 2022: 12th International Workshop on Polyhedral Compilation Techniques, 2022, p. 1–10.
[BibTeX]

@conference{11311_1220331,
author = {Curzel, S. and Jovic, S. and Fiorito, M. and Tumeo, A. and Ferrandi, F.},
title = {Higher-Level Synthesis: experimenting with MLIR polyhedral representations for accelerator design},
year = {2022},
booktitle = {IMPACT 2022: 12th International Workshop on Polyhedral Compilation Techniques},
pages = {1--10}
}

S. Curzel, N. Bohm Agostini, V. G. Castellana, M. Minutoli, A. Limaye, J. Manzano, J. J. Zhang, D. Brooks, G. Wei, F. Ferrandi, and A. Tumeo, “End-to-End Synthesis of Dynamically Controlled Machine Learning Accelerators,” IEEE TRANSACTIONS ON COMPUTERS, vol. 71, iss. 12, p. 3074–3087, 2022.
[BibTeX] [Abstract]

Edge systems are required to autonomously make real-time decisions based on large quantities of input data under strict power, performance, area, and other constraints. Meeting these constraints is only possible by specializing systems through hardware accelerators purposefully built for machine learning and data analysis algorithms. However, data science evolves at a quick pace, and manual design of custom accelerators has high non-recurrent engineering costs: general solutions are needed to automatically and rapidly transition from the formulation of a new algorithm to the deployment of a dedicated hardware implementation. Our solution is the SOftware Defined Architectures (SODA) Synthesizer, an end-to-end, multi-level, modular, extensible compiler toolchain providing a direct path from machine learning tools to hardware. The SODA Synthesizer frontend is based on the multilevel intermediate representation (MLIR) framework; it ingests pre-trained machine learning models, identifies kernels suited for acceleration, performs high-level optimizations, and prepares them for hardware synthesis. In the backend, SODA leverages state-of-the-art high-level synthesis techniques to generate highly efficient accelerators, targeting both field programmable devices (FPGAs) and application-specific circuits (ASICs). In this paper, we describe how the SODA Synthesizer can also assemble the generated accelerators (based on the finite state machine with datapath model) in a custom system driven by a distributed controller, building a coarse-grained dataflow architecture that does not require a host processor to orchestrate parallel execution of multiple accelerators. We show the effectiveness of our approach by automatically generating ASIC accelerators for layers of popular deep neural networks (DNNs). Our high-level optimizations result in up to 74x speedup on isolated accelerators for individual DNN layers, and our dynamically scheduled architecture yields an additional 3x performance improvement when combining accelerators to handle streaming inputs.

@article{11311_1222754,
author = {Curzel, S. and Bohm Agostini, N. and Castellana, V. G. and Minutoli, M. and Limaye, A. and Manzano, J. and Zhang, J. J. and Brooks, D. and Wei, G. and Ferrandi, F. and Tumeo, A.},
title = {End-to-End Synthesis of Dynamically Controlled Machine Learning Accelerators},
year = {2022},
journal = {IEEE TRANSACTIONS ON COMPUTERS},
volume = {71},
abstract = {Edge systems are required to autonomously make real-time decisions based on large quantities of input data under strict power, performance, area, and other constraints. Meeting these constraints is only possible by specializing systems through hardware accelerators purposefully built for machine learning and data analysis algorithms. However, data science evolves at a quick pace, and manual design of custom accelerators has high non-recurrent engineering costs: general solutions are needed to automatically and rapidly transition from the formulation of a new algorithm to the deployment of a dedicated hardware implementation. Our solution is the SOftware Defined Architectures (SODA) Synthesizer, an end-to-end, multi-level, modular, extensible compiler toolchain providing a direct path from machine learning tools to hardware. The SODA Synthesizer frontend is based on the multilevel intermediate representation (MLIR) framework; it ingests pre-trained machine learning models, identifies kernels suited for acceleration, performs high-level optimizations, and prepares them for hardware synthesis. In the backend, SODA leverages state-of-the-art high-level synthesis techniques to generate highly efficient accelerators, targeting both field programmable devices (FPGAs) and application-specific circuits (ASICs). In this paper, we describe how the SODA Synthesizer can also assemble the generated accelerators (based on the finite state machine with datapath model) in a custom system driven by a distributed controller, building a coarse-grained dataflow architecture that does not require a host processor to orchestrate parallel execution of multiple accelerators. We show the effectiveness of our approach by automatically generating ASIC accelerators for layers of popular deep neural networks (DNNs). Our high-level optimizations result in up to 74x speedup on isolated accelerators for individual DNN layers, and our dynamically scheduled architecture yields an additional 3x performance improvement when combining accelerators to handle streaming inputs.},
doi = {10.1109/TC.2022.3211430},
pages = {3074--3087},
number = {12}
}

N. Bohm Agostini, S. Curzel, J. Zhang, A. Limaye, C. Tan, V. Amatya, M. Minutoli, V. G. Castellana, J. Manzano, D. Brooks, G. Wei, and A. Tumeo, “Bridging Python to Silicon: The SODA Toolchain,” IEEE MICRO, vol. 42, iss. 5, p. 78–88, 2022.
[BibTeX] [Abstract]

Systems performing scientific computing, data analysis, and machine learning tasks have a growing demand for application-specific accelerators that can provide high computational performance while meeting strict size and power requirements. However, the algorithms and applications that need to be accelerated are evolving at a rate that is incompatible with manual design processes based on hardware description languages. Agile hardware design tools based on compiler techniques can help by quickly producing an application-specific integrated circuit (ASIC) accelerator starting from a high-level algorithmic description. We present the software-defined accelerator (SODA) synthesizer, a modular and open-source hardware compiler that provides automated end-to-end synthesis from high-level software frameworks to ASIC implementation, relying on multilevel representations to progressively lower and optimize the input code. Our approach does not require the application developer to write any register-transfer level code, and it is able to reach up to 364 giga floating point operations per second (GFLOPS)/W efficiency (32-bit precision) on typical convolutional neural network operators.

@article{11311_1220338,
author = {Bohm Agostini, N. and Curzel, S. and Zhang, J. and Limaye, A. and Tan, C. and Amatya, V. and Minutoli, M. and Castellana, V. G. and Manzano, J. and Brooks, D. and Wei, G. and Tumeo, A.},
title = {Bridging Python to Silicon: The SODA Toolchain},
year = {2022},
journal = {IEEE MICRO},
volume = {42},
abstract = {Systems performing scientific computing, data analysis, and machine learning tasks have a growing demand for application-specific accelerators that can provide high computational performance while meeting strict size and power requirements. However, the algorithms and applications that need to be accelerated are evolving at a rate that is incompatible with manual design processes based on hardware description languages. Agile hardware design tools based on compiler techniques can help by quickly producing an application-specific integrated circuit (ASIC) accelerator starting from a high-level algorithmic description. We present the software-defined accelerator (SODA) synthesizer, a modular and open-source hardware compiler that provides automated end-to-end synthesis from high-level software frameworks to ASIC implementation, relying on multilevel representations to progressively lower and optimize the input code. Our approach does not require the application developer to write any register-transfer level code, and it is able to reach up to 364 giga floating point operations per second (GFLOPS)/W efficiency (32-bit precision) on typical convolutional neural network operators.},
doi = {10.1109/MM.2022.3178580},
pages = {78--88},
number = {5}
}

N. B. Agostini, S. Curzel, A. Limaye, V. Amatya, M. Minutoli, V. G. Castellana, J. Manzano, A. Tumeo, and F. Ferrandi, “The SODA approach: leveraging high-level synthesis for hardware/software co-design and hardware specialization: invited,” in DAC ’22: Proceedings of the 59th ACM/IEEE Design Automation Conference, 2022, p. 1359–1362.
[BibTeX] [Abstract]

Novel “converged” applications combine phases of scientific simulation with data analysis and machine learning. Each computational phase can benefit from specialized accelerators. However, algorithms evolve so quickly that mapping them on existing accelerators is suboptimal or even impossible. This paper presents the SODA (Software Defined Accelerators) framework, a modular, multi-level, open-source, no-human-in-the-loop, hardware synthesizer that enables end-to-end generation of specialized accelerators. SODA is composed of SODA-Opt, a high-level frontend developed in MLIR that interfaces with domain-specific programming frameworks and allows performing system level design, and Bambu, a state-of-the-art high-level synthesis engine that can target different device technologies. The framework implements design space exploration as compiler optimization passes. We show how the modular, yet tight, integration of the high-level optimizer and lower-level HLS tools enables the generation of accelerators optimized for the computational patterns of converged applications. We then discuss some of the research opportunities that such a framework allows, including system-level design, profile driven optimization, and supporting new optimization metrics.

@conference{11311_1220191,
author = {Agostini, Nicolas Bohm and Curzel, Serena and Limaye, Ankur and Amatya, Vinay and Minutoli, Marco and Castellana, Vito Giovanni and Manzano, Joseph and Tumeo, Antonino and Ferrandi, Fabrizio},
title = {The SODA approach: leveraging high-level synthesis for hardware/software co-design and hardware specialization: invited},
year = {2022},
booktitle = {DAC '22: Proceedings of the 59th ACM/IEEE Design Automation Conference},
abstract = {Novel "converged" applications combine phases of scientific simulation with data analysis and machine learning. Each computational phase can benefit from specialized accelerators. However, algorithms evolve so quickly that mapping them on existing accelerators is suboptimal or even impossible. This paper presents the SODA (Software Defined Accelerators) framework, a modular, multi-level, open-source, no-human-in-the-loop, hardware synthesizer that enables end-to-end generation of specialized accelerators. SODA is composed of SODA-Opt, a high-level frontend developed in MLIR that interfaces with domain-specific programming frameworks and allows performing system level design, and Bambu, a state-of-the-art high-level synthesis engine that can target different device technologies. The framework implements design space exploration as compiler optimization passes. We show how the modular, yet tight, integration of the high-level optimizer and lower-level HLS tools enables the generation of accelerators optimized for the computational patterns of converged applications. We then discuss some of the research opportunities that such a framework allows, including system-level design, profile driven optimization, and supporting new optimization metrics.},
doi = {10.1145/3489517.3530628},
isbn = {9781450391429},
pages = {1359--1362}
}

N. B. Agostini, S. Curzel, V. Amatya, C. Tan, M. Minutoli, V. G. Castellana, J. Manzano, D. Kaeli, and A. Tumeo, “An MLIR-based compiler flow for system-level design and hardware acceleration,” in ICCAD ’22: Proceedings of the 41st IEEE/ACM International Conference on Computer-Aided Design, 2022, p. 1–9.
[BibTeX] [Abstract]

The generation of custom hardware accelerators for applications implemented within high-level productive programming frameworks requires considerable manual effort. To automate this process, we introduce SODA-OPT, a compiler tool that extends the MLIR infrastructure. SODA-OPT automatically searches, outlines, tiles, and pre-optimizes relevant code regions to generate high-quality accelerators through high-level synthesis. SODA-OPT can support any high-level programming framework and domain-specific language that interface with the MLIR infrastructure. By leveraging MLIR, SODA-OPT solves compiler optimization problems with specialized abstractions. Backend synthesis tools connect to SODA-OPT through progressive intermediate representation lowerings. SODAOPT interfaces to a design space exploration engine to identify the combination of compiler optimization passes and options that provides high-performance generated designs for different backends and targets. We demonstrate the practical applicability of the compilation flow by exploring the automatic generation of accelerators for deep neural networks operators outlined at arbitrary granularity and by combining outlining with tiling on large convolution layers. Experimental results with kernels from the PolyBench benchmark show that our high-level optimizations improve execution delays of synthesized accelerators up to 60x. We also show that for the selected kernels, our solution outperforms the current of state-ofthe art in more than 70% of the benchmarks and provides better average speedup in 55% of them. SODA-OPT is an open source project available at https://gitlab.pnnl.gov/sodalite/soda-opt.

@conference{11311_1229389,
author = {Agostini, N. B. and Curzel, S. and Amatya, V. and Tan, C. and Minutoli, M. and Castellana, V. G. and Manzano, J. and Kaeli, D. and Tumeo, A.},
title = {An MLIR-based compiler flow for system-level design and hardware acceleration},
year = {2022},
booktitle = {ICCAD '22: Proceedings of the 41st IEEE/ACM International Conference on Computer-Aided Design},
abstract = {The generation of custom hardware accelerators for applications implemented within high-level productive programming frameworks requires considerable manual effort. To automate this process, we introduce SODA-OPT, a compiler tool that extends the MLIR infrastructure. SODA-OPT automatically searches, outlines, tiles, and pre-optimizes relevant code regions to generate high-quality accelerators through high-level synthesis. SODA-OPT can support any high-level programming framework and domain-specific language that interface with the MLIR infrastructure. By leveraging MLIR, SODA-OPT solves compiler optimization problems with specialized abstractions. Backend synthesis tools connect to SODA-OPT through progressive intermediate representation lowerings. SODAOPT interfaces to a design space exploration engine to identify the combination of compiler optimization passes and options that provides high-performance generated designs for different backends and targets. We demonstrate the practical applicability of the compilation flow by exploring the automatic generation of accelerators for deep neural networks operators outlined at arbitrary granularity and by combining outlining with tiling on large convolution layers. Experimental results with kernels from the PolyBench benchmark show that our high-level optimizations improve execution delays of synthesized accelerators up to 60x. We also show that for the selected kernels, our solution outperforms the current of state-ofthe art in more than 70% of the benchmarks and provides better average speedup in 55% of them. SODA-OPT is an open source project available at https://gitlab.pnnl.gov/sodalite/soda-opt.},
doi = {10.1145/3508352.3549424},
isbn = {9781450392174},
pages = {1--9}
}

N. B. Agostini, A. Limaye, M. Minutoli, V. G. Castellana, J. Manzano, A. Tumeo, S. Curzel, and F. Ferrandi, “SODA synthesizer: An open-source, multi-level, modular, extensible compiler from high-level frameworks to silicon,” in ICCAD ’22: Proceedings of the 41st IEEE/ACM International Conference on Computer-Aided Design, 2022, p. 1–7.
[BibTeX] [Abstract]

The SODA Synthesizer is an open-source, modular, end-to-end hardware compiler framework. The SODA frontend, developed in MLIR, performs system-level design, code partitioning, and highlevel optimizations to prepare the specifications for the hardware synthesis. The backend is based on a state-of-the-art high-level synthesis tool and generates the final hardware design. The backend can interface with logic synthesis tools for field programmable gate arrays or with commercial and open-source logic synthesis tools for application-specific integrated circuits. We discuss the opportunities and challenges in integrating with commercial and open-source tools both at the frontend and backend, and highlight the role that an end-to-end compiler framework like SODA can play in an open-source hardware design ecosystem.

@conference{11311_1229392,
author = {Agostini, N. B. and Limaye, A. and Minutoli, M. and Castellana, V. G. and Manzano, J. and Tumeo, A. and Curzel, S. and Ferrandi, F.},
title = {SODA synthesizer: An open-source, multi-level, modular, extensible compiler from high-level frameworks to silicon},
year = {2022},
booktitle = {ICCAD '22: Proceedings of the 41st IEEE/ACM International Conference on Computer-Aided Design},
abstract = {The SODA Synthesizer is an open-source, modular, end-to-end hardware compiler framework. The SODA frontend, developed in MLIR, performs system-level design, code partitioning, and highlevel optimizations to prepare the specifications for the hardware synthesis. The backend is based on a state-of-the-art high-level synthesis tool and generates the final hardware design. The backend can interface with logic synthesis tools for field programmable gate arrays or with commercial and open-source logic synthesis tools for application-specific integrated circuits. We discuss the opportunities and challenges in integrating with commercial and open-source tools both at the frontend and backend, and highlight the role that an end-to-end compiler framework like SODA can play in an open-source hardware design ecosystem.},
doi = {10.1145/3508352.3561101},
isbn = {9781450392174},
pages = {1--7}
}

2021

M. Minutoli, V. G. Castellana, N. Saporetti, S. Devecchi, M. Lattuada, P. Fezzardi, A. Tumeo, and F. Ferrandi, “Svelto: High-Level Synthesis of Multi-Threaded Accelerators for Graph Analytics,” IEEE TRANSACTIONS ON COMPUTERS, vol. 71, iss. 3, p. 520–533, 2021.
[BibTeX]

@article{11311_1161042,
author = {Minutoli, Marco and Castellana, Vito Giovanni and Saporetti, Nicola and Devecchi, Stefano and Lattuada, Marco and Fezzardi, Pietro and Tumeo, Antonino and Ferrandi, Fabrizio},
title = {Svelto: High-Level Synthesis of Multi-Threaded Accelerators for Graph Analytics},
year = {2021},
journal = {IEEE TRANSACTIONS ON COMPUTERS},
volume = {71},
doi = {10.1109/TC.2021.3057860},
pages = {520--533},
number = {3}
}

C. Pilato, S. Bohm, F. Brocheton, J. Castrillon, R. Cevasco, V. Cima, R. Cmar, D. Diamantopoulos, F. Ferrandi, J. Martinovic, G. Palermo, M. Paolino, A. Parodi, L. Pittaluga, D. Raho, F. Regazzoni, K. Slaninova, and C. Hagleitner, “EVEREST: A design environment for extreme-scale big data analytics on heterogeneous platforms,” in PROCEEDINGS OF THE 2021 DESIGN, AUTOMATION & TEST IN EUROPE CONFERENCE & EXHIBITION (DATE 2021), 2021, p. 1320–1325.
[BibTeX]

@conference{11311_1181959,
author = {Pilato, C. and Bohm, S. and Brocheton, F. and Castrillon, J. and Cevasco, R. and Cima, V. and Cmar, R. and Diamantopoulos, D. and Ferrandi, F. and Martinovic, J. and Palermo, G. and Paolino, M. and Parodi, A. and Pittaluga, L. and Raho, D. and Regazzoni, F. and Slaninova, K. and Hagleitner, C.},
title = {EVEREST: A design environment for extreme-scale big data analytics on heterogeneous platforms},
year = {2021},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
volume = {2021-},
booktitle = {PROCEEDINGS OF THE 2021 DESIGN, AUTOMATION & TEST IN EUROPE CONFERENCE & EXHIBITION (DATE 2021)},
doi = {10.23919/DATE51398.2021.9473940},
pages = {1320--1325}
}

V. G. Castellana, A. Tumeo, and F. Ferrandi, “High-Level Synthesis of Parallel Specifications Coupling Static and Dynamic Controllers,” in Proceedings of International Symposium on Parallel and Distributed Processing (IPDPS), 2021, p. 192–202.
[BibTeX]

@conference{11311_1180263,
author = {Castellana, Vito Giovanni and Tumeo, Antonino and Ferrandi, Fabrizio},
title = {High-Level Synthesis of Parallel Specifications Coupling Static and Dynamic Controllers},
year = {2021},
booktitle = {Proceedings of International Symposium on Parallel and Distributed Processing (IPDPS)},
doi = {10.1109/IPDPS49936.2021.00028},
isbn = {978-1-6654-4066-0},
pages = {192--202}
}

F. Ferrandi, V. G. Castellana, S. Curzel, P. Fezzardi, M. Fiorito, M. Lattuada, M. Minutoli, C. Pilato, and A. Tumeo, “Invited: Bambu: an Open-Source Research Framework for the High-Level Synthesis of Complex Applications,” in Proceedings of the 2021 58th ACM/IEEE Design Automation Conference (DAC), 2021, p. 1327–1330.
[BibTeX]

@conference{11311_1189606,
author = {Ferrandi, Fabrizio and Castellana, Vito Giovanni and Curzel, Serena and Fezzardi, Pietro and Fiorito, Michele and Lattuada, Marco and Minutoli, Marco and Pilato, Christian and Tumeo, Antonino},
title = {Invited: Bambu: an Open-Source Research Framework for the High-Level Synthesis of Complex Applications},
year = {2021},
booktitle = {Proceedings of the 2021 58th ACM/IEEE Design Automation Conference (DAC)},
doi = {10.1109/DAC18074.2021.9586110},
isbn = {978-1-6654-3274-0},
pages = {1327--1330}
}

S. Curzel, N. B. Agostini, S. Song, I. Dagli, A. Limaye, C. Tan, M. Minutoli, V. G. Castellana, V. Amatya, J. Manzano, A. Das, F. Ferrandi, and A. Tumeo, “Automated Generation of Integrated Digital and Spiking Neuromorphic Machine Learning Accelerators,” in Proceedings of the 2021 IEEE/ACM International Conference On Computer Aided Design (ICCAD), 2021, p. 1–7.
[BibTeX]

@conference{11311_1194314,
author = {Curzel, Serena and Agostini, Nicolas Bohm and Song, Shihao and Dagli, Ismet and Limaye, Ankur and Tan, Cheng and Minutoli, Marco and Castellana, Vito Giovanni and Amatya, Vinay and Manzano, Joseph and Das, Anup and Ferrandi, Fabrizio and Tumeo, Antonino},
title = {Automated Generation of Integrated Digital and Spiking Neuromorphic Machine Learning Accelerators},
year = {2021},
booktitle = {Proceedings of the 2021 IEEE/ACM International Conference On Computer Aided Design (ICCAD)},
url = {https://ieeexplore.ieee.org/document/9643474},
doi = {10.1109/ICCAD51958.2021.9643474},
isbn = {978-1-6654-4507-8},
pages = {1--7}
}

2020

M. Siracusa and F. Ferrandi, “Tensor Optimization for High-Level Synthesis Design Flows,” IEEE TRANSACTIONS ON COMPUTER-AIDED DESIGN OF INTEGRATED CIRCUITS AND SYSTEMS, vol. 39, iss. 11, p. 4217–4228, 2020.
[BibTeX]

@article{11311_1148653,
author = {Siracusa, Marco and Ferrandi, Fabrizio},
title = {Tensor Optimization for High-Level Synthesis Design Flows},
year = {2020},
journal = {IEEE TRANSACTIONS ON COMPUTER-AIDED DESIGN OF INTEGRATED CIRCUITS AND SYSTEMS},
volume = {39},
doi = {10.1109/TCAD.2020.3012318},
pages = {4217--4228},
number = {11}
}

P. Fezzardi and F. Ferrandi, “Automated Bug Detection for High-level Synthesis of Multi-threaded Irregular Applications,” ACM TRANSACTIONS ON PARALLEL COMPUTING, vol. 7, iss. 4, p. 1–26, 2020.
[BibTeX]

@article{11311_1148640,
author = {Fezzardi, Pietro and Ferrandi, Fabrizio},
title = {Automated Bug Detection for High-level Synthesis of Multi-threaded Irregular Applications},
year = {2020},
journal = {ACM TRANSACTIONS ON PARALLEL COMPUTING},
volume = {7},
doi = {10.1145/3418086},
pages = {1--26},
number = {4}
}

2019

V. G. Castellana, M. Minutoli, A. Tumeo, M. Lattuada, P. Fezzardi, and F. Ferrandi, “Software defined architectures for data analytics,” in ASPDAC ’19 Proceedings of the 24th Asia and South Pacific Design Automation Conference, 2019, p. 711–718.
[BibTeX]

@conference{11311_1074488,
author = {Castellana, Vito Giovanni and Minutoli, Marco and Tumeo, Antonino and Lattuada, Marco and Fezzardi, Pietro and Ferrandi, Fabrizio},
title = {Software defined architectures for data analytics},
year = {2019},
booktitle = {ASPDAC '19 Proceedings of the 24th Asia and South Pacific Design Automation Conference},
doi = {10.1145/3287624.3288754},
isbn = {9781450360074},
pages = {711--718}
}

M. Lattuada and F. Ferrandi, “A Design Flow Engine for the Support of Customized Dynamic High Level Synthesis Flows,” ACM TRANSACTIONS ON RECONFIGURABLE TECHNOLOGY AND SYSTEMS, vol. 12, iss. 4, p. 19:1–19:26, 2019.
[BibTeX]

@article{11311_1114656,
author = {Lattuada, Marco and Ferrandi, Fabrizio},
title = {A Design Flow Engine for the Support of Customized Dynamic High Level Synthesis Flows},
year = {2019},
journal = {ACM TRANSACTIONS ON RECONFIGURABLE TECHNOLOGY AND SYSTEMS},
volume = {12},
keywords = {High level synthesis; Compilation steps},
url = {http://doi.acm.org/10.1145/3356475},
doi = {10.1145/3356475},
pages = {19:1--19:26},
number = {4}
}

2018

P. Fezzardi, C. Pilato, and F. Ferrandi, “Enabling Automated Bug Detection for IP-based Designs using High-Level Synthesis,” IEEE DESIGN & TEST, vol. 35, iss. 5, p. 54–62, 2018.
[BibTeX]

@article{11311_1052325,
author = {Fezzardi, Pietro and Pilato, Christian and Ferrandi, Fabrizio},
title = {Enabling Automated Bug Detection for IP-based Designs using High-Level Synthesis},
year = {2018},
journal = {IEEE DESIGN & TEST},
volume = {35},
url = {https://ieeexplore.ieee.org/document/8332976/},
doi = {10.1109/MDAT.2018.2824121},
pages = {54--62},
number = {5}
}

M. Lattuada, F. Ferrandi, and M. Perrotin, “Data Transfers Analysis in Computer Assisted Design Flow of FPGA Accelerators for Aerospace Systems,” IEEE TRANSACTIONS ON MULTI-SCALE COMPUTING SYSTEMS, vol. 4, iss. 1, p. 3–16, 2018.
[BibTeX]

@article{11311_1020704,
author = {Lattuada, Marco and Ferrandi, Fabrizio and Perrotin, Maxime},
title = {Data Transfers Analysis in Computer Assisted Design Flow of FPGA Accelerators for Aerospace Systems},
year = {2018},
journal = {IEEE TRANSACTIONS ON MULTI-SCALE COMPUTING SYSTEMS},
volume = {4},
url = {http://ieeexplore.ieee.org/document/7914673/},
doi = {10.1109/TMSCS.2017.2699647},
pages = {3--16},
number = {1}
}

2017

M. Lattuada and F. Ferrandi, “Exploiting Vectorization in High Level Synthesis of Nested Irregular Loops,” JOURNAL OF SYSTEMS ARCHITECTURE, vol. 75, p. 1–14, 2017.
[BibTeX]

@article{11311_1010813,
author = {Lattuada, Marco and Ferrandi, Fabrizio},
title = {Exploiting Vectorization in High Level Synthesis of Nested Irregular Loops},
year = {2017},
journal = {JOURNAL OF SYSTEMS ARCHITECTURE},
volume = {75},
url = {http://dx.doi.org/10.1016/j.sysarc.2017.03.001},
doi = {10.1016/j.sysarc.2017.03.001},
pages = {1--14}
}

P. Fezzardi, M. Lattuada, and F. Ferrandi, “Using Efficient Path Profiling to Optimize Memory Consumption of On-Chip Debugging for High-Level Synthesis,” ACM TRANSACTIONS ON EMBEDDED COMPUTING SYSTEMS, vol. 1, iss. Special Issue on ESWEEK 2017, p. 1–19, 2017.
[BibTeX]

@article{11311_1030731,
author = {Fezzardi, Pietro and Lattuada, Marco and Ferrandi, Fabrizio},
title = {Using Efficient Path Profiling to Optimize Memory Consumption of On-Chip Debugging for High-Level Synthesis},
year = {2017},
journal = {ACM TRANSACTIONS ON EMBEDDED COMPUTING SYSTEMS},
volume = {1},
pages = {1--19},
number = {Special Issue on ESWEEK 2017}
}

2016

P. Fezzardi and F. Ferrandi, “Automated bug detection for pointers and memory accesses in High-Level Synthesis compilers,” in FPL 2016 – 26th International Conference on Field-Programmable Logic and Applications, 2016, p. 1–9.
[BibTeX]

@conference{11311_998431,
author = {Fezzardi, Pietro and Ferrandi, Fabrizio},
title = {Automated bug detection for pointers and memory accesses in High-Level Synthesis compilers},
year = {2016},
booktitle = {FPL 2016 - 26th International Conference on Field-Programmable Logic and Applications},
doi = {10.1109/FPL.2016.7577369},
isbn = {978-2-8399-1844-2},
isbn = {978-2-8399-1844-2},
pages = {1--9}
}

M. Minutoli, V. G. Castellana, A. Tumeo, F. Ferrandi, and M. Lattuada, “A Dynamically Scheduled Architecture for the Synthesis of Graph Database Queries,” in 2016 IEEE 24th Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM), 2016, p. 136–136.
[BibTeX]

@conference{11311_995143,
author = {Minutoli, M. and Castellana, VITO GIOVANNI and Tumeo, Antonino and Ferrandi, Fabrizio and Lattuada, Marco},
title = {A Dynamically Scheduled Architecture for the Synthesis of Graph Database Queries},
year = {2016},
booktitle = {2016 IEEE 24th Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM)},
keywords = {Databases;Dynamic scheduling;High level synthesis;Memory architecture;Pipeline processing;Registers},
doi = {10.1109/FCCM.2016.41},
pages = {136--136}
}

M. Minutoli, V. G. Castellana, A. Tumeo, M. Lattuada, and F. Ferrandi, “Enabling the high level synthesis of data analytics accelerators,” in CODES ’16 Proceedings of the Eleventh IEEE/ACM/IFIP International Conference on Hardware/Software Codesign and System Synthesis, 2016, p. 1–3.
[BibTeX]

@conference{11311_999155,
author = {Minutoli, Marco and Castellana, VITO GIOVANNI and Tumeo, Antonino and Lattuada, Marco and Ferrandi, Fabrizio},
title = {Enabling the high level synthesis of data analytics accelerators},
year = {2016},
publisher = {ACM},
booktitle = {CODES '16 Proceedings of the Eleventh IEEE/ACM/IFIP International Conference on Hardware/Software Codesign and System Synthesis},
doi = {10.1145/2968456.2976764},
isbn = {9781450344838},
isbn = {9781450344838},
pages = {1--3}
}

M. Minutoli, V. G. Castellana, A. Tumeo, M. Lattuada, and F. Ferrandi, “Efficient synthesis of graph methods: a dynamically scheduled architecture,” in ICCAD ’16 Proceedings of the 35th International Conference on Computer-Aided Design, New York, 2016, p. 1–8.
[BibTeX]

@conference{11311_1000685,
author = {Minutoli, Marco and Castellana, VITO GIOVANNI and Tumeo, Antonino and Lattuada, Marco and Ferrandi, Fabrizio},
title = {Efficient synthesis of graph methods: a dynamically scheduled architecture},
year = {2016},
publisher = {ACM},
address = {New York},
booktitle = {ICCAD '16 Proceedings of the 35th International Conference on Computer-Aided Design},
doi = {10.1145/2966986.2967030},
isbn = {9781450344661},
isbn = {978-145034466-1},
pages = {1--8}
}

R. Nane, V. M. Sima, C. Pilato, J. Choi, B. Fort, A. Canis, Y. T. Chen, H. Hsiao, S. Brown, F. Ferrandi, J. Anderson, and K. Bertels, “A Survey and Evaluation of FPGA High-Level Synthesis Tools,” IEEE TRANSACTIONS ON COMPUTER-AIDED DESIGN OF INTEGRATED CIRCUITS AND SYSTEMS, vol. 35, iss. 10, p. 1591–1604, 2016.
[BibTeX]

@article{11311_998432,
author = {Nane, Razvan and Sima, Vlad Mihai and Pilato, Christian and Choi, Jongsok and Fort, Blair and Canis, Andrew and Chen, Yu Ting and Hsiao, Hsuan and Brown, Stephen and Ferrandi, Fabrizio and Anderson, Jason and Bertels, Koen},
title = {A Survey and Evaluation of FPGA High-Level Synthesis Tools},
year = {2016},
journal = {IEEE TRANSACTIONS ON COMPUTER-AIDED DESIGN OF INTEGRATED CIRCUITS AND SYSTEMS},
volume = {35},
doi = {10.1109/TCAD.2015.2513673},
pages = {1591--1604},
number = {10}
}

M. Lattuada, F. Ferrandi, and M. Perrotin, “Computer Assisted Design and Integration of FPGA Accelerators in Aerospace Systems,” in Proceedings of IEEE Aerospace Conference 2016, 2016, p. 1–11.
[BibTeX] [Abstract]

The integration of Field Programmable Gate Arrays (FPGAs) in an aerospace system allows to improve its efficiency and its flexibility thanks to their programmability. To exploit these devices, the designer has to identify the functionalities that have to be executed on them and provide their implementation by means of Hardware Description Languages. Generating these descriptions for a software developer could be a very difficult task because of the different programming paradigms of software programs and hardware descriptions. To facilitate the developer in this activity, High Level Synthesis techniques have been developed aiming at (semi-)automatically generating hardware implementations of specifications written in high level languages (e.g., C). State of the art tools implementing such methodologies have not been designed for the integration with aerospace systems design flows, so significant adaptations could be required to the designer for integrating the hardware implementations with the rest of the design solution. In this paper the integration of a High Level Synthesis design flow in the TASTE framework (http://taste.tuxfamily.org) is presented. TASTE is a set of freely available tools for the development of real time embedded systems developed by the European Space Agency together with a set of its industrial partners. This framework allows to integrate specifications described in different languages (e.g., C, ADA, Simulink, SDL) by means of formal languages (AADL and ASN.1) and to early verify the correctness of the produced solutions. TASTE has been extended with Bambu (http://panda.dei.polimi.it), a tool for the High Level Synthesis developed at Politecnico di Milano. In this way the TASTE users have the possibility to specify which functionalities, provided by means of high level languages such C, have to be implemented in hardware on the FPGA without having to directly provide the hardware implementations. Thanks to the integration of the High Level Synthesis tool indeed, the framework is able not only to produce the hardware implementations, but also to integrate them in the rest of the aerospace system by automatically generating the whole architecture to be implemented on the FPGA. This architecture contains not only the implementation of the hardware accelerators, but also of the components required to transfer the data from and to the rest of the system and to correctly manage their size and endianness. The application of the extended framework to a real case study shows its effective usability.

@conference{11311_977752,
author = {Lattuada, Marco and Ferrandi, Fabrizio and Perrotin, Maxime},
title = {Computer Assisted Design and Integration of FPGA Accelerators in Aerospace Systems},
year = {2016},
booktitle = {Proceedings of IEEE Aerospace Conference 2016},
abstract = {The integration of Field Programmable Gate Arrays (FPGAs) in an aerospace system allows to improve its efficiency and its flexibility thanks to their programmability. To exploit these devices, the designer has to identify the functionalities that have to be executed on them and provide their implementation by means of Hardware Description Languages. Generating these descriptions for a software developer could be a very difficult task because of the different programming paradigms of software programs and hardware descriptions. To facilitate the developer in this activity, High Level Synthesis techniques have been developed aiming at (semi-)automatically generating hardware implementations of specifications written in high level languages (e.g., C). State of the art tools implementing such methodologies have not been designed for the integration with aerospace systems design flows, so significant adaptations could be required to the designer for integrating the hardware implementations with the rest of the design solution.
In this paper the integration of a High Level Synthesis design flow in the TASTE framework (http://taste.tuxfamily.org) is presented. TASTE is a set of freely available tools for the development of real time embedded systems developed by the European Space Agency together with a set of its industrial partners. This framework allows to integrate specifications described in different languages (e.g., C, ADA, Simulink, SDL) by means of formal languages (AADL and ASN.1) and to early verify the correctness of the produced solutions. TASTE has been extended with Bambu (http://panda.dei.polimi.it), a tool for the High Level Synthesis developed at Politecnico di Milano. In this way the TASTE users have the possibility to specify which functionalities, provided by means of high level languages such C, have to be implemented in hardware on the FPGA without having to directly provide the hardware implementations. Thanks to the integration of the High Level Synthesis tool indeed, the framework is able not only to produce the hardware implementations, but also to integrate them in the rest of the aerospace system by automatically generating the whole architecture to be implemented on the FPGA. This architecture contains not only the implementation of the hardware accelerators, but also of the components required to transfer the data from and to the rest of the system and to correctly manage their size and endianness.
The application of the extended framework to a real case study shows its effective usability.},
doi = {10.1109/AERO.2016.7500675},
isbn = {978-146737676-1},
pages = {1--11}
}

2015

M. Minutoli, V. G. Castellana, A. Tumeo, and F. Ferrandi, “Inter-procedural resource sharing in High Level Synthesis through function proxies,” in Field Programmable Logic and Applications (FPL), 2015 25th International Conference on, 2015, p. 1–8.
[BibTeX] [Abstract]

Modular design is becoming increasingly important in High Level Synthesis (HLS) flows. Current HLS flows generate hierarchical and modular designs that mimic the structure and call graph of the input specification by translating functions into modules. Function calls are translated by instantiating the callee module in the data-path of its caller, allowing for resource sharing when the same function is called multiple times. However, if two different callers invoke the same function, current HLS flows cannot share the instance of the module between the two callers, even if they invoke the function in a mutually exclusive way. In this paper, we propose a methodology that enables sharing of (sub)modules across modules boundaries. Sharing is obtained through function proxies, which act as forwarders of function calls in the original specification to shared modules without reducing performance. Building on the concept of function proxies, we propose a methodology and the related components to perform HLS of function calls through function pointers, without requiring complete static knowledge of the alias set (point-to set). We show that module sharing through function proxies provides valuable area savings and no significant impacts on the execution delays, and that our synthesis approach for function pointers enables dynamic polymorphism.

@conference{11311_966133,
author = {Minutoli, Marco and Castellana, VITO GIOVANNI and Tumeo, Antonino and Ferrandi, Fabrizio},
title = {Inter-procedural resource sharing in High Level Synthesis through function proxies},
year = {2015},
booktitle = {Field Programmable Logic and Applications (FPL), 2015 25th International Conference on},
abstract = {Modular design is becoming increasingly important in High Level Synthesis (HLS) flows. Current HLS flows generate hierarchical and modular designs that mimic the structure and call graph of the input specification by translating functions into modules. Function calls are translated by instantiating the callee module in the data-path of its caller, allowing for resource sharing when the same function is called multiple times. However, if two different callers invoke the same function, current HLS flows cannot share the instance of the module between the two callers, even if they invoke the function in a mutually exclusive way. In this paper, we propose a methodology that enables sharing of (sub)modules across modules boundaries. Sharing is obtained through function proxies, which act as forwarders of function calls in the original specification to shared modules without reducing performance. Building on the concept of function proxies, we propose a methodology and the related components to perform HLS of function calls through function pointers, without requiring complete static knowledge of the alias set (point-to set). We show that module sharing through function proxies provides valuable area savings and no significant impacts on the execution delays, and that our synthesis approach for function pointers enables dynamic polymorphism.},
doi = {10.1109/FPL.2015.7293958},
isbn = {978-0-9934-2800-5},
isbn = {978-0-9934-2800-5},
pages = {1--8}
}

M. Minutoli, V. G. Castellana, A. Tumeo, and F. Ferrandi, “Function Proxies for Improved Resource Sharing in High Level Synthesis,” in Field-Programmable Custom Computing Machines (FCCM), 2015 IEEE 23rd Annual International Symposium on, 2015, p. 100–100.
[BibTeX] [Abstract]

The current generation of High Level Synthesis (HLS) tools usually generates hierarchical and modular designs, mimicking the structure of the call graph of the original high-level input specification. The standard approach is to progressively synthesize functions into modules by navigating the application call graph from the leaves up to the top function. In the synthesized architecture, function calls corresponds to the instantiation of the related module into the data path generated for the caller. Our work introduces a methodology that enables sharing of (sub)modules across modules boundaries.

@conference{11311_964843,
author = {Minutoli, Marco and Castellana, VITO GIOVANNI and Tumeo, Antonino and Ferrandi, Fabrizio},
title = {Function Proxies for Improved Resource Sharing in High Level Synthesis},
year = {2015},
booktitle = {Field-Programmable Custom Computing Machines (FCCM), 2015 IEEE 23rd Annual International Symposium on},
abstract = {The current generation of High Level Synthesis (HLS) tools usually generates hierarchical and modular designs, mimicking the structure of the call graph of the original high-level input specification. The standard approach is to progressively synthesize functions into modules by navigating the application call graph from the leaves up to the top function. In the synthesized architecture, function calls corresponds to the instantiation of the related module into the data path generated for the caller. Our work introduces a methodology that enables sharing of (sub)modules across modules boundaries.},
doi = {10.1109/FCCM.2015.60},
isbn = {978-1-4799-9969-9},
isbn = {978-1-4799-9969-9},
pages = {100--100}
}

M. Lattuada, C. Pilato, and F. Ferrandi, “Performance Estimation of Task Graphs Based on Path Profiling,” INTERNATIONAL JOURNAL OF PARALLEL PROGRAMMING, vol. 44, iss. 4, p. 1–37, 2015.
[BibTeX] [Abstract]

Correctly estimating the speed-up of a parallel embedded application is crucial to efficiently compare different parallelization techniques, task graph transformations or mapping and scheduling solutions. Unfortunately, especially in case of control-dominated applications, task correlations may heavily affect the execution time of the solutions and usually this is not properly taken into account during performance analysis. We propose a methodology that combines a single profiling of the initial sequential specification with different decisions in terms of partitioning, mapping, and scheduling in order to better estimate the actual speed-up of these solutions. We validated our approach on a multi-processor simulation platform: experimental results show that our methodology, effectively identifying the correlations among tasks, significantly outperforms existing approaches for speed-up estimation. Indeed, we obtained an absolute error less than 5 % in average, even when compiling the code with different optimization levels.

@article{11311_977870,
author = {Lattuada, Marco and Pilato, Christian and Ferrandi, Fabrizio},
title = {Performance Estimation of Task Graphs Based on Path Profiling},
year = {2015},
journal = {INTERNATIONAL JOURNAL OF PARALLEL PROGRAMMING},
volume = {44},
abstract = {Correctly estimating the speed-up of a parallel embedded application is crucial to efficiently compare different parallelization techniques, task graph transformations or mapping and scheduling solutions. Unfortunately, especially in case of control-dominated applications, task correlations may heavily affect the execution time of the solutions and usually this is not properly taken into account during performance analysis. We propose a methodology that combines a single profiling of the initial sequential specification with different decisions in terms of partitioning, mapping, and scheduling in order to better estimate the actual speed-up of these solutions. We validated our approach on a multi-processor simulation platform: experimental results show that our methodology, effectively identifying the correlations among tasks, significantly outperforms existing approaches for speed-up estimation. Indeed, we obtained an absolute error less than 5 % in average, even when compiling the code with different optimization levels.},
keywords = {Hierarchical Task Graph; Path profiling; Performance estimation; Software; Information Systems; Theoretical Computer Science},
doi = {10.1007/s10766-015-0372-7},
pages = {1--37},
number = {4}
}

P. Fezzardi, M. Castellana, and F. Ferrandi, “Trace-based automated logical debugging for high-level synthesis generated circuits,” in 33rd IEEE International Conference on Computer Design, ICCD 2015,, 2015, p. 251–258.
[BibTeX] [Abstract]

In this paper we present an approach for debugging hardware designs generated by High-Level Synthesis (HLS), relieving users from the burden of identifying the signals to trace and from the error-prone task of manually checking the traces. The necessary steps are performed after HLS, independently of it and without affecting the synthesized design. For this reason our methodology should be easily adaptable to any HLS tools. The proposed approach makes full use of HLS compile time informations. The executions of the simulated design and the original C program can be compared, checking if there are discrepancies between values of C variables and signals in the design. The detection is completely automated, that is, it does not need any input but the program itself and the user does not have to know anything about the overall compilation process. The design can be validated on a given set of test cases and the discrepancies are detected by the tool. Relationships between the original high-level source code and the generated HDL are kept by the compiler and shown to the user. The granularity of such discrepancy analysis is per-operation and it includes the temporary variables inserted by the compiler. As a consequence the design can be debugged as is, with no restrictions on optimizations available during HLS. We show how this methodology can be used to identify different kind of bugs: 1) introduced by the HLS tool used for the synthesis; 2) introduced using buggy libraries of hardware components for HLS; 3) undefined behavior bugs in the original high-level source code.

@conference{11311_973455,
author = {Fezzardi, Pietro and Castellana, Michele and Ferrandi, Fabrizio},
title = {Trace-based automated logical debugging for high-level synthesis generated circuits},
year = {2015},
booktitle = {33rd {IEEE} International Conference on Computer Design, {ICCD} 2015,},
abstract = {In this paper we present an approach for debugging hardware designs generated by High-Level Synthesis (HLS), relieving users from the burden of identifying the signals to trace and from the error-prone task of manually checking the traces. The necessary steps are performed after HLS, independently of it and without affecting the synthesized design. For this reason our methodology should be easily adaptable to any HLS tools. The proposed approach makes full use of HLS compile time informations. The executions of the simulated design and the original C program can be compared, checking if there are discrepancies between values of C variables and signals in the design. The detection is completely automated, that is, it does not need any input but the program itself and the user does not have to know anything about the overall compilation process. The design can be validated on a given set of test cases and the discrepancies are detected by the tool. Relationships between the original high-level source code and the generated HDL are kept by the compiler and shown to the user. The granularity of such discrepancy analysis is per-operation and it includes the temporary variables inserted by the compiler. As a consequence the design can be debugged as is, with no restrictions on optimizations available during HLS. We show how this methodology can be used to identify different kind of bugs: 1) introduced by the HLS tool used for the synthesis; 2) introduced using buggy libraries of hardware components for HLS; 3) undefined behavior bugs in the original high-level source code.},
keywords = {Computer bugs;Controllability;Debugging;Hardware;Layout;Observability;Optimization},
doi = {10.1109/ICCD.2015.7357111},
isbn = {978-1-4673-7166-7},
isbn = {978-1-4673-7166-7},
pages = {251--258}
}

M. Lattuada and F. Ferrandi, “Exploiting Outer Loops Vectorization in High Level Synthesis,” in Architecture of Computing Systems – ARCS 2015, 2015, p. 31–42.
[BibTeX] [Abstract]

Synthesis of DoAll loops is a key aspect of High Level Synthesis since they allow to easily exploit the potential parallelism provided by programmable devices. This type of parallelism can be implemented in several ways: by duplicating the implementation of body loop, by exploiting loop pipelining or by applying vectorization. In this paper a methodology for the synthesis of complex DoAll loops based on outer vectorization is proposed. Vectorization is not limited to the innermost loops: complex constructs such as nested loops, conditional constructs and function calls are supported. Experimental results on parallel benchmarks show up to 7.35x speed-up and up to 40 % reduction of area-delay product.

@conference{11311_964118,
author = {Lattuada, Marco and Ferrandi, Fabrizio},
title = {Exploiting Outer Loops Vectorization in High Level Synthesis},
year = {2015},
publisher = {Springer International Publishing},
booktitle = {Architecture of Computing Systems – ARCS 2015},
abstract = {Synthesis of DoAll loops is a key aspect of High Level Synthesis since they allow to easily exploit the potential parallelism provided by programmable devices. This type of parallelism can be implemented in several ways: by duplicating the implementation of body loop, by exploiting loop pipelining or by applying vectorization.
In this paper a methodology for the synthesis of complex DoAll loops based on outer vectorization is proposed. Vectorization is not limited to the innermost loops: complex constructs such as nested loops, conditional constructs and function calls are supported. Experimental results on parallel benchmarks show up to 7.35x speed-up and up to 40 % reduction of area-delay product.},
doi = {10.1007/978-3-319-16086-3_3},
isbn = {978-3-319-16085-6},
isbn = {978-3-319-16086-3},
isbn = {978-3-319-16085-6},
isbn = {978-3-319-16086-3},
pages = {31--42}
}

V. G. Castellana, M. Minutoli, A. Morari, A. Tumeo, M. Lattuada, and F. Ferrandi, “High level synthesis of RDF queries for graph analytics,” in Proceedings of the IEEE/ACM International Conference on Computer-Aided Design, ICCAD, Piscataway, NJ, 2015, p. 323–330.
[BibTeX] [Abstract]

In this paper we present a set of techniques that enable the synthesis of efficient custom accelerators for memory intensive, irregular applications. To address the challenges of irregular applications (large memory footprint, unpredictable fine-grained data accesses, and high synchronization intensity), and exploit their opportunities (thread level parallelism, memory level parallelism), we propose a novel accelerator design that employs an adaptive and Distributed Controller (DC) architecture, and a Memory Interface Controller (MIC) that supports concurrent and atomic memory operations on a multi-ported/multi-banked shared memory. Among the multitude of algorithms that may benefit from our solution, we focus on the acceleration of graph analytics applications and, in particular, on the synthesis of SPARQL queries on Resource Description Framework (RDF) databases. We achieve this objective by incorporating the synthesis techniques into Bambu, an Open Source high-level synthesis tools, and interfacing it with GEMS, the Graph database Engine for Multithreaded Systems. The GEMS’ front-end generates optimized C implementations of the input queries, modeled as graph pattern matching algorithms, which are then automatically synthesized by Bambu. We validate our approach by synthesizing several SPARQL queries from the Lehigh University Benchmark (LUBM).

@conference{11311_977866,
author = {Castellana, VITO GIOVANNI and Minutoli, Marco and Morari, Alessandro and Tumeo, Antonino and Lattuada, Marco and Ferrandi, Fabrizio},
title = {High level synthesis of RDF queries for graph analytics},
year = {2015},
publisher = {IEEE Press},
address = {Piscataway, NJ},
booktitle = {Proceedings of the IEEE/ACM International Conference on Computer-Aided Design, ICCAD},
abstract = {In this paper we present a set of techniques that enable the synthesis of efficient custom accelerators for memory intensive, irregular applications. To address the challenges of irregular applications (large memory footprint, unpredictable fine-grained data accesses, and high synchronization intensity), and exploit their opportunities (thread level parallelism, memory level parallelism), we propose a novel accelerator design that employs an adaptive and Distributed Controller (DC) architecture, and a Memory Interface Controller (MIC) that supports concurrent and atomic memory operations on a multi-ported/multi-banked shared memory. Among the multitude of algorithms that may benefit from our solution, we focus on the acceleration of graph analytics applications and, in particular, on the synthesis of SPARQL queries on Resource Description Framework (RDF) databases. We achieve this objective by incorporating the synthesis techniques into Bambu, an Open Source high-level synthesis tools, and interfacing it with GEMS, the Graph database Engine for Multithreaded Systems. The GEMS' front-end generates optimized C implementations of the input queries, modeled as graph pattern matching algorithms, which are then automatically synthesized by Bambu. We validate our approach by synthesizing several SPARQL queries from the Lehigh University Benchmark (LUBM).},
keywords = {graph theory;high level synthesis;memory architecture;multi-threading;query languages;shared memory systems;Bambu;DC architecture;GEMS;LUBM;Lehigh University Benchmark;MIC;RDF databases;RDF queries;SPARQL queries;accelerator design;adaptive architecture;atomic memory operations;concurrent memory operations;distributed controller architecture;graph analytics;graph database engine;graph pattern matching algorithms;memory intensive irregular applications;memory interface controller;multiported/multibanked shared memory;multithreaded systems;open source high-level synthesis tools;resource description framework databases;Acceleration;Computer architecture;Databases;Field programmable gate arrays;Parallel processing;Program processors;Resource description framework},
doi = {10.1109/ICCAD.2015.7372587},
isbn = {978-1-4673-8388-2},
isbn = {978-1-4673-8388-2},
pages = {323--330}
}

M. Lattuada and F. Ferrandi, “Code transformations based on speculative SDC scheduling,” in Proceedings of the IEEE/ACM International Conference on Computer-Aided Design, ICCAD, 2015, p. 71–77.
[BibTeX] [Abstract]

Code motion and speculations are usually exploited in the High Level Synthesis of control dominated applications to improve the performances of the synthesized designs. Selecting the transformations to be applied is not a trivial task: their effects can indeed indirectly spread across the whole design, potentially worsening the quality of the results. In this paper we propose a code transformation flow, based on a new extension of the System of Difference Constraints (SDC) scheduling algorithm, which introduces a large number of transformations, whose profitability is guaranteed by SDC formulation. Experimental results show that the proposed technique in average reduces the execution time of control dominated applications by 37% with respect to a commercial tool without increasing the area usage.

@conference{11311_973456,
author = {Lattuada, Marco and Ferrandi, Fabrizio},
title = {Code transformations based on speculative SDC scheduling},
year = {2015},
publisher = {IEEE, 345 E 47TH ST, NEW YORK, NY 10017 USA},
booktitle = {Proceedings of the IEEE/ACM International Conference on Computer-Aided Design, ICCAD},
abstract = {Code motion and speculations are usually exploited in the High Level Synthesis of control dominated applications to improve the performances of the synthesized designs. Selecting the transformations to be applied is not a trivial task: their effects can indeed indirectly spread across the whole design, potentially worsening the quality of the results.
In this paper we propose a code transformation flow, based on a new extension of the System of Difference Constraints (SDC) scheduling algorithm, which introduces a large number of transformations, whose profitability is guaranteed by SDC formulation. Experimental results show that the proposed technique in average reduces the execution time of control dominated applications by 37% with respect to a commercial tool without increasing the area usage.},
doi = {10.1109/ICCAD.2015.7372552},
isbn = {978-1-4673-8388-2},
pages = {71--77}
}

M. Lattuada and F. Ferrandi, “Modeling Resolution of Resources Contention in Synchronous Data Flow Graphs,” JOURNAL OF SIGNAL PROCESSING SYSTEMS FOR SIGNAL, IMAGE, AND VIDEO TECHNOLOGY, vol. 80, iss. 1, p. 39–47, 2015.
[BibTeX] [Abstract]

Synchronous Data Flow graphs are widely adopted in the designing of streaming applications, but were originally formulated to describe only how an application is partitioned and which data are exchanged among different tasks. Since Synchronous Data Flow graphs are often used to describe and evaluate complete design solutions, missing information (e.g., mapping, scheduling, etc.) has to be included in them by means of further actors and channels to obtain accurate evaluations. To address this issue preserving the simplicity of the representation, techniques that model data transfer delays by means of ad-hoc actors have been proposed, but they model independently each communication ignoring contentions. Moreover, they do not usually consider at all delays due to buffer contentions, potentially overestimating the throughput of a design solution. In this paper a technique to extend Synchronous Data Flow graphs by adding ad-hoc actors and channels to model resolution of resources contentions is proposed. The results show that the number of added actors and channels is limited but that they can significantly increase the Synchronous Data Flow graph accuracy.

@article{11311_869737,
author = {Lattuada, Marco and Ferrandi, Fabrizio},
title = {Modeling Resolution of Resources Contention in Synchronous Data Flow Graphs},
year = {2015},
journal = {JOURNAL OF SIGNAL PROCESSING SYSTEMS FOR SIGNAL, IMAGE, AND VIDEO TECHNOLOGY},
volume = {80},
abstract = {Synchronous Data Flow graphs are widely adopted in the designing of streaming applications, but were originally formulated to describe only how an application is partitioned and which data are exchanged among different tasks. Since Synchronous Data Flow graphs are often used to describe and evaluate complete design solutions, missing information (e.g., mapping, scheduling, etc.) has to be included in them by means of further actors and channels to obtain accurate evaluations. To address this issue preserving the simplicity of the representation, techniques that model data transfer delays by means of ad-hoc actors have been proposed, but they model independently each communication ignoring contentions. Moreover, they do not usually consider at all delays due to buffer contentions, potentially overestimating the throughput of a design solution. In this paper a technique to extend Synchronous Data Flow graphs by adding ad-hoc actors and channels to model resolution of resources contentions is proposed. The results show that the number of added actors and channels is limited but that they can significantly increase the Synchronous Data Flow graph accuracy.},
doi = {10.1007/s11265-014-0923-y},
pages = {39--47},
number = {1}
}

2014

V. G. Castellana, T. Antonino, and F. Ferrandi, “An adaptive Memory Interface Controller for improving bandwidth utilization of hybrid and reconfigurable systems,” in Design, Automation & Test in Europe Conference & Exhibition (DATE), 2014, 2014, p. 1–4.
[BibTeX] [Abstract]

Data mining, bioinformatics, knowledge discovery, social network analysis, are emerging irregular applications that exploits data structures based on pointers or linked lists, such as graphs, unbalanced trees or unstructured grids. These applications are characterized by unpredictable memory accesses and generally are memory bandwidth bound, but also presents large amounts of inherent dynamic parallelism because they can potentially spawn concurrent activities for each one of the element they are exploring. Hybrid architectures, which integrate general purpose processors with reconfigurable devices, appears promising target platforms for accelerating irregular applications. These systems often connect to distributed and multi-ported memories, potentially enabling parallel memory operations. However, these memory architectures introduce several challenges, such as the necessity to manage concurrency and synchronization to avoid structural conflicts on shared memory locations and to guarantee consistency. In this paper we present an adaptive Memory Interface Controller (MIC) that addresses these issues. The MIC is a general and customizable solution that can target several different memory structures, and is suitable for High Level Synthesis frameworks. It implements a dynamic arbitration scheme, which avoids conflicts on memory resources at runtime, and supports atomic memory operations, commonly exploited for synchronization directives in parallel programming paradigms. The MIC simultaneously maps multiple accesses to different memory ports, allowing fine grained parallelism exploitation and ensuring correctness also in the presence of irregular and statically unpredictable memory access patterns. We evaluated the effectiveness of our approach on a typical irregular kernel, graph Breadth First Search (BFS), exploring different design alternatives.

@conference{11311_869736,
author = {Castellana, VITO GIOVANNI and Antonino, Tumeo and Ferrandi, Fabrizio},
title = {An adaptive Memory Interface Controller for improving bandwidth utilization of hybrid and reconfigurable systems},
year = {2014},
booktitle = {Design, Automation & Test in Europe Conference & Exhibition (DATE), 2014},
abstract = {Data mining, bioinformatics, knowledge discovery, social network analysis, are emerging irregular applications that exploits data structures based on pointers or linked lists, such as graphs, unbalanced trees or unstructured grids. These applications are characterized by unpredictable memory accesses and generally are memory bandwidth bound, but also presents large amounts of inherent dynamic parallelism because they can potentially spawn concurrent activities for each one of the element they are exploring. Hybrid architectures, which integrate general purpose processors with reconfigurable devices, appears promising target platforms for accelerating irregular applications. These systems often connect to distributed and multi-ported memories, potentially enabling parallel memory operations. However, these memory architectures introduce several challenges, such as the necessity to manage concurrency and synchronization to avoid structural conflicts on shared memory locations and to guarantee consistency. In this paper we present an adaptive Memory Interface Controller (MIC) that addresses these issues. The MIC is a general and customizable solution that can target several different memory structures, and is suitable for High Level Synthesis frameworks. It implements a dynamic arbitration scheme, which avoids conflicts on memory resources at runtime, and supports atomic memory operations, commonly exploited for synchronization directives in parallel programming paradigms. The MIC simultaneously maps multiple accesses to different memory ports, allowing fine grained parallelism exploitation and ensuring correctness also in the presence of irregular and statically unpredictable memory access patterns. We evaluated the effectiveness of our approach on a typical irregular kernel, graph Breadth First Search (BFS), exploring different design alternatives.},
doi = {10.7873/DATE.2014.192},
isbn = {9783981537024},
pages = {1--4}
}

2013

C. Pilato and F. Ferrandi, “Bambu: A Modular Framework for the High Level Synthesis of Memory-Intensive Applications,” in Proceedings of the 23rd International Conference on Field Programmable Logic and Applications (FPL), 2013, p. 1–4.
[BibTeX]

@conference{11311_767090,
author = {Pilato, Christian and Ferrandi, Fabrizio},
title = {Bambu: A Modular Framework for the High Level Synthesis of Memory-Intensive Applications},
year = {2013},
booktitle = {Proceedings of the 23rd International Conference on Field Programmable Logic and Applications (FPL)},
doi = {10.1109/FPL.2013.6645550},
isbn = {9781479900046},
pages = {1--4}
}

S. Lovergine and F. Ferrandi, “Harnessing Adaptivity Analysis for the Automatic Design of Efficient Embedded and HPC Systems,” in Proceedings IPDPS 2013, 2013, p. 2298–2301.
[BibTeX]

@conference{11311_768471,
author = {Lovergine, Silvia and Ferrandi, Fabrizio},
title = {Harnessing Adaptivity Analysis for the Automatic Design of Efficient Embedded and HPC Systems},
year = {2013},
booktitle = {Proceedings IPDPS 2013},
doi = {10.1109/IPDPSW.2013.230},
isbn = {9780769549798},
pages = {2298--2301}
}

S. Lovergine and F. Ferrandi, “Dynamic AC-scheduling for hardware cores with unknown and uncertain information,” in Proceedings ICCD 2013, 2013, p. 475–478.
[BibTeX] [Abstract]

Modern hardware cores necessarily have to deal with many sources of unknown or uncertain information. Components with variable latency and unpredictable behavior are becoming predominant in hardware designs. Conventional hardware cores underperform when dealing with unknown or uncertain information. Common High-Level Synthesis (HLS) approaches, which require to specify the complete behavior at design-time, present significant restrictions in supporting this kind of conditions. The literature proposes several dynamic scheduling techniques to improve the cores performance by handling inherent uncertainty of applications. However, they do not address other sources of unknown information. In this paper, we propose the dynamic Activating Conditions (AC)-scheduling: a methodology for the design automation of hardware cores which can dynamically adapt the instructions scheduling according to behaviors unknown at design-time. Neither assumptions about components latency nor worst case approach are required. Experimental results show significant performance increase, with limited area overhead, with respect to state-of-the-art approaches.

@conference{11311_768467,
author = {Lovergine, Silvia and Ferrandi, Fabrizio},
title = {Dynamic AC-scheduling for hardware cores with unknown and uncertain information},
year = {2013},
booktitle = {Proceedings ICCD 2013},
abstract = {Modern hardware cores necessarily have to deal with many sources of unknown or uncertain information. Components with variable latency and unpredictable behavior are becoming predominant in hardware designs. Conventional hardware cores underperform when dealing with unknown or uncertain information. Common High-Level Synthesis (HLS) approaches, which require to specify the complete behavior at design-time, present significant restrictions in supporting this kind of conditions. The literature proposes several dynamic scheduling techniques to improve the cores performance by handling inherent uncertainty of applications. However, they do not address other sources of unknown information. In this paper, we propose the dynamic Activating Conditions (AC)-scheduling: a methodology for the design automation of hardware cores which can dynamically adapt the instructions scheduling according to behaviors unknown at design-time. Neither assumptions about components latency nor worst case approach are required. Experimental results show significant performance increase, with limited area overhead, with respect to state-of-the-art approaches.},
doi = {10.1109/ICCD.2013.6657086},
isbn = {9781479929870},
pages = {475--478}
}

M. Lattuada and F. Ferrandi, “Modeling pipelined application with Synchronous Data Flow graphs,” in Proceedings ICSAMOS, 2013, p. 49–55.
[BibTeX]

@conference{11311_768470,
author = {Lattuada, Marco and Ferrandi, Fabrizio},
title = {Modeling pipelined application with Synchronous Data Flow graphs},
year = {2013},
booktitle = {Proceedings ICSAMOS},
doi = {10.1109/SAMOS.2013.6621105},
isbn = {9781479901036},
pages = {49--55}
}

V. G. Castellana and F. Ferrandi, “An automated flow for the High Level Synthesis of coarse grained parallel applications,” in 2013 International Conference on Field-Programmable Technology (FPT), 2013, p. 294–301.
[BibTeX] [Abstract]

High Level Synthesis (HLS) provides a way to significantly enhance the productivity of embedded system designers, by enabling the automatic or semiautomatic generation of hardware accelerators starting from high level descriptions with (usually software) programming languages. Typical HLS approaches build a centralized Finite State Machine (FSM) to control the generated datapath, performing the operations according to a pre-determined, static schedule. However, FSM-based approaches are only able to extract parallelism within a single execution flow. In the presence of coarse grained parallelism, in the form of concurrent function calls or parallel control structures, they either serialize all the operations, or build excessively complex controllers, aiming at executing as many operation as possible in a single control step (i.e., they try to extract as much instruction level parallelism as possible). The resulting controllers occupy an excessive amount of area or lead to very low operating frequencies. In this paper we propose a methodology for the HLS of accelerators supporting parallel execution and dynamic scheduling. The approach exploits an adaptive distributed controller, composed of a set of communicating elements associated with each operation. This controller design enables supporting multiple concurrent execution flows, thus increasing parallelism exploitation beyond instruction level parallelism. The approach also supports variable latency operations, such as memory accesses and speculative operations. We apply our methodology on a set of typical HLS benchmarks, and demonstrate valuable speed ups with limited area overheads with respect to conventional FSM-based flows.

@conference{11311_775515,
author = {Castellana, VITO GIOVANNI and Ferrandi, Fabrizio},
title = {An automated flow for the High Level Synthesis of coarse grained parallel applications},
year = {2013},
booktitle = {2013 International Conference on Field-Programmable Technology (FPT)},
abstract = {High Level Synthesis (HLS) provides a way to significantly enhance the productivity of embedded system designers, by enabling the automatic or semiautomatic generation of hardware accelerators starting from high level descriptions with (usually software) programming languages. Typical HLS approaches build a centralized Finite State Machine (FSM) to control the generated datapath, performing the operations according to a pre-determined, static schedule. However, FSM-based approaches are only able to extract parallelism within a single execution flow. In the presence of coarse grained parallelism, in the form of concurrent function calls or parallel control structures, they either serialize all the operations, or build excessively complex controllers, aiming at executing as many operation as possible in a single control step (i.e., they try to extract as much instruction level parallelism as possible). The resulting controllers occupy an excessive amount of area or lead to very low operating frequencies. In this paper we propose a methodology for the HLS of accelerators supporting parallel execution and dynamic scheduling. The approach exploits an adaptive distributed controller, composed of a set of communicating elements associated with each operation. This controller design enables supporting multiple concurrent execution flows, thus increasing parallelism exploitation beyond instruction level parallelism. The approach also supports variable latency operations, such as memory accesses and speculative operations. We apply our methodology on a set of typical HLS benchmarks, and demonstrate valuable speed ups with limited area overheads with respect to conventional FSM-based flows.},
doi = {10.1109/FPT.2013.6718370},
isbn = {9781479921980},
isbn = {9781479921997},
pages = {294--301}
}

F. Ferrandi, P. L. Lanzi, C. Pilato, D. Sciuto, and A. Tumeo, “Ant Colony Optimization for Mapping, Scheduling and Placing in Reconﬁgurable Systems,” in Proceedings of the NASA/ESA Conference on Adaptive Hardware and Systems (AHS 2013), 2013, p. 47–54.
[BibTeX]

@conference{11311_741174,
author = {Ferrandi, Fabrizio and Lanzi, PIER LUCA and Pilato, Christian and Sciuto, Donatella and Tumeo, Antonino},
title = {Ant Colony Optimization for Mapping, Scheduling and Placing
in Reconﬁgurable Systems},
year = {2013},
booktitle = {Proceedings of the NASA/ESA Conference on Adaptive Hardware and Systems (AHS 2013)},
doi = {10.1109/AHS.2013.6604225},
isbn = {9781467363839},
pages = {47--54}
}

V. G. Castellana and F. Ferrandi, “Scheduling independent liveness analysis for register binding in high level synthesis,” in Proceedings DATE 2013, 2013, p. 1571–1574.
[BibTeX] [Abstract]

Classical techniques for register allocation and binding require the definition of the program execution order, since a partial ordering relation between operations must be induced to perform liveness analysis through data-flow equations. In High Level Synthesis (HLS) flows this is commonly obtained through the scheduling task. However for some HLS approaches, such a relation can be difficult to be computed, or not statically computable at all, and adopting conventional register binding techniques, even when feasible, cannot guarantee maximum performances. To overcome these issues we introduce a novel scheduling-independent liveness analysis methodology, suitable for dynamic scheduling architectures. Such liveness analysis is exploited in register binding using standard graph coloring techniques, and unlike other approaches it avoids the insertion of structural dependencies, introduced to prevent run-time resource conflicts in dynamic scheduling environments. The absence of additional dependencies avoids performance degradation and makes parallelism exploitation independent from the register binding task, while on average not impacting on area, as shown through the experimental results.

@conference{11311_768465,
author = {Castellana, VITO GIOVANNI and Ferrandi, Fabrizio},
title = {Scheduling independent liveness analysis for register binding in high level synthesis},
year = {2013},
booktitle = {Proceedings DATE 2013},
abstract = {Classical techniques for register allocation and binding require the definition of the program execution order, since a partial ordering relation between operations must be induced to perform liveness analysis through data-flow equations. In High Level Synthesis (HLS) flows this is commonly obtained through the scheduling task. However for some HLS approaches, such a relation can be difficult to be computed, or not statically computable at all, and adopting conventional register binding techniques, even when feasible, cannot guarantee maximum performances. To overcome these issues we introduce a novel scheduling-independent liveness analysis methodology, suitable for dynamic scheduling architectures. Such liveness analysis is exploited in register binding using standard graph coloring techniques, and unlike other approaches it avoids the insertion of structural dependencies, introduced to prevent run-time resource conflicts in dynamic scheduling environments. The absence of additional dependencies avoids performance degradation and makes parallelism exploitation independent from the register binding task, while on average not impacting on area, as shown through the experimental results.},
doi = {10.7873/DATE.2013.319},
isbn = {9781450321532},
pages = {1571--1574}
}

2012

K. Bertels, A. Lattanzi, E. Ciavattini, F. Bettarelli, M. T. Chiaradia, R. Nutricato, A. Morea, A. M. Antola, F. Ferrandi, M. Lattuada, C. Pilato, D. Sciuto, R. J. Meeuws, Y. Yankova, V. M. Sima, K. Sigdel, W. Luk, J. G. F. Coutinho, Y. M. Lam, T. Todman, A. Michelotti, and A. Cerruto, “The hArtes Tool Chain,” in Hardware/Software Co-design for Heterogeneous Multi-core Platforms: The hArtes Toolchain, Springer, 2012, p. 9–109.
[BibTeX] [Abstract]

This chapter describes the different design steps needed to go from legacy code to a transformed application that can be efficiently mapped on the hArtes platform.

@inbook{11311_630182,
author = {Bertels, K. and Lattanzi, A. and Ciavattini, E. and Bettarelli, F. and Chiaradia, M. T. and Nutricato, R. and Morea, A. and Antola, ANNA MARIA and Ferrandi, Fabrizio and Lattuada, Marco and Pilato, Christian and Sciuto, Donatella and Meeuws, R. J. and Yankova, Y. and Sima, V. M. and Sigdel, K. and Luk, W. and Coutinho, J. G. F. and Lam, Y. M. and Todman, T. and Michelotti, A. and Cerruto, A.},
title = {The hArtes Tool Chain},
year = {2012},
publisher = {Springer},
booktitle = {Hardware/Software Co-design for Heterogeneous Multi-core Platforms: The hArtes Toolchain},
abstract = {This chapter describes the different design steps needed to go from legacy code to a transformed application that can be efficiently mapped on the hArtes platform.},
doi = {10.1007/978-94-007-1406-9_2},
isbn = {9789400714052},
pages = {9--109}
}

F. Bettarelli, E. Ciavattini, A. Lattanzi, G. Beltrame, F. Ferrandi, L. Fossati, C. Pilato, D. Sciuto, R. J. Meeuws, S. A. Ostadzadeh, Z. Nawaz, Y. Lu, T. Marconi, M. Sabeghi, V. M. Sima, and K. Sigdel, “Extensions of the hArtes Tool Chain,” in Hardware/Software Co-design for Heterogeneous Multi-core Platforms: The hArtes Toolchain, Springer, 2012, p. 193–227.
[BibTeX] [Abstract]

In this chapter, we describe functionality which has also been developed in the context of the hArtes project but that were not included in the final release or that are separately released. The development of the tools described here was often initiated after certain limitations of the current toolset were identified. This was the case of the memory analyser QUAD which does a detailed analysis of the memory accesses. Other tools, such as the rSesame tool, were developed and explored in parallel with the hArtes tool chain. This tool assumes a KPN-version of the application and then allows for high level simulation and experimentation with different mappings and partitionings. Finally, ReSP was developed to validate the partitioning results before a real implementation was possible.

@inbook{11311_630184,
author = {Bettarelli, F. and Ciavattini, E. and Lattanzi, A. and Beltrame, G. and Ferrandi, Fabrizio and Fossati, L. and Pilato, Christian and Sciuto, Donatella and Meeuws, R. J. and Ostadzadeh, S. A. and Nawaz, Z. and Lu, Y. and Marconi, T. and Sabeghi, M. and Sima, V. M. and Sigdel, K.},
title = {Extensions of the hArtes Tool Chain},
year = {2012},
publisher = {Springer},
booktitle = {Hardware/Software Co-design for Heterogeneous Multi-core Platforms: The hArtes Toolchain},
abstract = {In this chapter, we describe functionality which has also been developed in the context of the hArtes project but that were not included in the final release or that are separately released. The development of the tools described here was often initiated after certain limitations of the current toolset were identified. This was the case of the memory analyser QUAD which does a detailed analysis of the memory accesses. Other tools, such as the rSesame tool, were developed and explored in parallel with the hArtes tool chain. This tool assumes a KPN-version of the application and then allows for high level simulation and experimentation with different mappings and partitionings. Finally, ReSP was developed to validate the partitioning results before a real implementation was possible.},
doi = {10.1007/978-94-007-1406-9_6},
isbn = {9789400714052},
pages = {193--227}
}

S. Lovergine and F. Ferrandi, “Instructions activating conditions for hardware-based auto-scheduling,” in Proceedings of the 9th conference on Computing Frontiers – CF ’12, 2012, p. 253–256.
[BibTeX]

@conference{11311_665711,
author = {Lovergine, Silvia and Ferrandi, Fabrizio},
title = {Instructions activating conditions for hardware-based auto-scheduling},
year = {2012},
booktitle = {Proceedings of the 9th conference on Computing Frontiers - CF '12},
doi = {10.1145/2212908.2212946},
isbn = {9781450312158},
pages = {253--256}
}

M. Lattuada and F. Ferrandi, “Performance estimation of embedded software with confidence levels,” in 17th Asia and South Pacific Design Automation Conference, 2012, p. 573–578.
[BibTeX] [Abstract]

Since time constraints are a very critical aspect of an embedded system, performance evaluation can not be postponed to the end of the design flow, but it has to be introduced since its early stages. Estimation techniques based on mathematical models are usually preferred during this phase since they provide quite accurate estimation of the application performance in a fast way. However, the estimation error has to be considered during design space exploration to evaluate if a solution can be accepted (e.g., by discarding solutions whose estimated time is too close to constraint). Evaluate if the possible error can be significant analyzing a punctual estimation is not a trivial task. In this paper we propose a methodology, based on statistical analysis, that provides a prediction interval on the estimation and a confidence level on meeting a time constraint. This information can drive design space exploration reducing the number of solutions to be validated. The results show how the produced intervals effectively capture the estimation error introduced by a linear model.

@conference{11311_665733,
author = {Lattuada, Marco and Ferrandi, Fabrizio},
title = {Performance estimation of embedded software with confidence levels},
year = {2012},
booktitle = {17th Asia and South Pacific Design Automation Conference},
abstract = {Since time constraints are a very critical aspect of an embedded system, performance evaluation can not be postponed to the end of the design flow, but it has to be introduced since its early stages. Estimation techniques based on mathematical models are usually preferred during this phase since they provide quite accurate estimation of the application performance in a fast way. However, the estimation error has to be considered during design space exploration to evaluate if a solution can be accepted (e.g., by discarding solutions whose estimated time is too close to constraint). Evaluate if the possible error can be significant analyzing a punctual estimation is not a trivial task. In this paper we propose a methodology, based on statistical analysis, that provides a prediction interval on the estimation and a confidence level on meeting a time constraint. This information can drive design space exploration reducing the number of solutions to be validated. The results show how the produced intervals effectively capture the estimation error introduced by a linear model.},
doi = {10.1109/ASPDAC.2012.6165022},
isbn = {9781467307703},
isbn = {9781467307710},
isbn = {9781467307727},
pages = {573--578}
}

2011

C. Pilato, F. Ferrandi, and D. Sciuto, “A design methodology to implement memory accesses in High-Level Synthesis,” in 2011 Proceedings of the 9th International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS), 2011, p. 49–58.
[BibTeX]

@conference{11311_608571,
author = {Pilato, Christian and Ferrandi, Fabrizio and Sciuto, Donatella},
title = {A design methodology to implement memory accesses in High-Level Synthesis},
year = {2011},
booktitle = {2011 Proceedings of the 9th International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS)},
doi = {10.1145/2039370.2039381},
isbn = {978-145030715-4},
pages = {49--58}
}

M. Elhoj, A. Reis, R. Ribas, F. Ferrandi, C. Pilato, F. Moll, M. Miranda, P. Dobrovolny, N. Woolaway, A. Grasset, P. Bonnot, G. Desoli, and D. Pandini, “SYNAPTIC Project: Regularity Applied to Enhance Manufacturability and Yield at Several Abstraction Levels,” in ERDIAP 2011 – Workshop on Exploiting Regularity in the Design of IPs, Architectures and Platforms, 2011, p. 189–192.
[BibTeX]

@conference{11311_630186,
author = {Elhoj, M. and Reis, A. and Ribas, R. and Ferrandi, Fabrizio and Pilato, Christian and Moll, F. and Miranda, M. and Dobrovolny, P. and Woolaway, N. and Grasset, A. and Bonnot, P. and Desoli, G. and Pandini, D.},
title = {SYNAPTIC Project: Regularity Applied to Enhance Manufacturability and Yield at Several Abstraction Levels},
year = {2011},
booktitle = {ERDIAP 2011 - Workshop on Exploiting Regularity in the Design of IPs, Architectures and Platforms},
url = {http://www.vde-verlag.de/proceedings-en/563333026.html},
pages = {189--192}
}

C. Pilato, F. Ferrandi, and D. Pandini, “A Design Methodology for the Automatic Sizing of Standard-Cell Libraries,” in Proc. ACM Great Lakes Symposium on VLSI, 2011, p. 151–156.
[BibTeX]

@conference{11311_630180,
author = {Pilato, Christian and Ferrandi, Fabrizio and Pandini, D.},
title = {A Design Methodology for the Automatic Sizing of Standard-Cell Libraries},
year = {2011},
booktitle = {Proc. ACM Great Lakes Symposium on VLSI},
doi = {10.1145/1973009.1973040},
pages = {151--156}
}

C. Pilato, F. Ferrandi, and D. Pandini, “Evaluating Static CMOS Complex Cells in Technology Mapping,” in ERDIAP 2011 – Workshop on Exploiting Regularity in the Design of IPs, Architectures and Platforms, 2011, p. 222–229.
[BibTeX]

@conference{11311_630185,
author = {Pilato, Christian and Ferrandi, Fabrizio and Pandini, D.},
title = {Evaluating Static CMOS Complex Cells in Technology Mapping},
year = {2011},
booktitle = {ERDIAP 2011 - Workshop on Exploiting Regularity in the Design of IPs, Architectures and Platforms},
url = {http://www.vde-verlag.de/proceedings-en/563333031.html},
pages = {222--229}
}

K. Georgi, S. Vlad Mihai, B. Koen, J. G. F. Coutinho, L. Wayne, M. Giacomo, T. Raffaele, and F. Ferrandi, “hArtes: Holistic Approach to Reconfigurable Real-Time Embedded SystemsReconfigurable Computing,” in Reconfigurable Computing, New York: -New York, NY: Plenum -SPRINGER, 233 SPRING STREET, NEW YORK, USA, NY, 10013, 2011, p. 91–115.
[BibTeX]

@inbook{11311_693770,
author = {Georgi, Kuzmanov and Vlad Mihai, Sima and Koen, Bertels and Coutinho, José Gabriel F. and Wayne, Luk and Giacomo, Marchiori and Raffaele, Tripiccione and Ferrandi, Fabrizio},
title = {hArtes: Holistic Approach to Reconfigurable Real-Time Embedded SystemsReconfigurable Computing},
year = {2011},
publisher = {-New York, NY: Plenum -SPRINGER, 233 SPRING STREET, NEW YORK, USA, NY, 10013},
address = {New York},
booktitle = {Reconfigurable Computing},
doi = {10.1007/978-1-4614-0061-5_5},
isbn = {9781461400608},
pages = {91--115}
}

S. Cecchi, A. Primavera, F. Piazza, F. Bettarelli, E. Ciavattini, R. Toppi, J. G. F. Coutinho, W. Luk, C. Pilato, F. Ferrandi, V. M. Sima, and K. Bertels, “The hArtes CarLab: A New Approach to Advanced Algorithms Development for Automotive Audio,” AES, vol. 59, iss. 11, p. 858–869, 2011.
[BibTeX] [Abstract]

In the last decade automotive audio has been gaining great attention by the scientific and industrial communities. In this context, a new approach to test and develop advanced audio algorithms for a heterogeneous embedded platform has been proposed within the European hArtes project. A real audio laboratory installed in a real car (hArtes CarLab) has been developed employing professional audio equipment. The algorithms can be tested and validated on a PC exploiting each application as a plug-in of a real time framework. Then a set of tools (hArtes Toolchain) can be used to generate code for the embedded platform starting from plug-in implementation. An overview of the entire system is here presented, showing its effectiveness.

@article{11311_630187,
author = {Cecchi, S. and Primavera, A. and Piazza, F. and Bettarelli, F. and Ciavattini, E. and Toppi, R. and Coutinho, J. G. F. and Luk, W. and Pilato, Christian and Ferrandi, Fabrizio and Sima, V. M. and Bertels, K.},
title = {The hArtes CarLab: A New Approach to Advanced Algorithms Development for Automotive Audio},
year = {2011},
journal = {AES},
volume = {59},
abstract = {In the last decade automotive audio has been gaining great attention by the scientific and industrial communities. In this context, a new approach to test and develop advanced audio algorithms for a heterogeneous embedded platform has been proposed within the European hArtes project. A real audio laboratory installed in a real car (hArtes CarLab) has been developed employing professional audio equipment. The algorithms can be tested and validated on a PC exploiting each application as a plug-in of a real time framework. Then a set of tools (hArtes Toolchain) can be used to generate code for the embedded platform starting from plug-in implementation. An overview of the entire system is here presented, showing its effectiveness.},
keywords = {Embedded platforms; Entire system; Industrial communities; Plug-ins; Real time, Algorithms, Audio equipment},
url = {http://www.aes.org/e-lib/browse.cfm?elib=16153},
pages = {858--869},
number = {11}
}

C. Pilato, V. G. Castellana, S. Lovergine, and F. Ferrandi, “A Runtime Adaptive Controller for Supporting Hardware Components with Variable Latency,” in Proc. of 2011 NASA/ESA Conference on Adaptive Hardware and Systems, 2011, p. 153–160.
[BibTeX]

@conference{11311_630181,
author = {Pilato, Christian and Castellana, VITO GIOVANNI and Lovergine, Silvia and Ferrandi, Fabrizio},
title = {A Runtime Adaptive Controller for Supporting Hardware Components with Variable Latency},
year = {2011},
booktitle = {Proc. of 2011 NASA/ESA Conference on Adaptive Hardware and Systems},
doi = {10.1109/AHS.2011.5963930},
pages = {153--160}
}

2010

K. Bertels, V. M. Sima, Y. Yankova, G. Kuzmanov, W. Luk, J. G. F. Coutinho, F. Ferrandi, C. Pilato, M. Lattuada, D. Sciuto, and A. Michelotti, “hArtes: Hardware-Software Codesign for Heterogeneous Multicore Platforms,” IEEE MICRO, vol. 30, iss. 5, p. 88–97, 2010.
[BibTeX] [Abstract]

Developing heterogeneous multicore platforms requires choosing the best hardware configuration for mapping the application, and modifying that application so that different parts execute on the most appropriate hardware component. The hArtes toolchain provides the option of automatic or semi-automatic support for this mapping. During test and validation on several computation-intensive applications, hArtes achieved substantial speedups and drastically reduced development times.

@article{11311_575576,
author = {Bertels, K. and Sima, V. M. and Yankova, Y. and Kuzmanov, G. and Luk, W. and Coutinho, J. G. F. and Ferrandi, Fabrizio and Pilato, Christian and Lattuada, Marco and Sciuto, Donatella and Michelotti, A.},
title = {hArtes: Hardware-Software Codesign for Heterogeneous Multicore Platforms},
year = {2010},
journal = {IEEE MICRO},
volume = {30},
abstract = {Developing heterogeneous multicore platforms requires choosing the best hardware configuration for mapping the application, and modifying that application so that different parts execute on the most appropriate hardware component. The hArtes toolchain provides the option of automatic or semi-automatic support for this mapping. During test and validation on several computation-intensive applications, hArtes achieved substantial speedups and drastically reduced development times.},
keywords = {INF},
url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5640604},
doi = {10.1109/MM.2010.91},
pages = {88--97},
number = {5}
}

M. Lattuada and F. Ferrandi, “Performance modeling of embedded applications with zero architectural knowledge,” in CODES/ISSS ’10 Proceedings of the eighth IEEE/ACM/IFIP international conference on Hardware/software codesign and system synthesis, New York, NY, USA, 2010, p. 277–286.
[BibTeX] [Abstract]

Performance estimation is a key step in the development of an embedded system. Normally, the performance evaluation is performed using a simulator or a performance mathematical model of the target architecture. However, both these approaches are usually based on the knowledge of the architectural details of the target. In this paper we present a methodology for automatically building an analytical model to estimate the performance of an application on a generic processor without requiring any information about the processor architecture but the one provided by the GNU GCC Intermediate Representation. The proposed methodology exploits the linear regression technique based on an application analysis performed on the Register Transfer Level internal representation of the GNU GCC compiler. The benefits of working with this type of model and with this intermediate representation are three: we take into account most of the compiler optimizations, we implicitly consider some architectural characteristics of the target processor and we can easily estimate the performance of portions of the specification. We validate our approach by evaluating with cross-validation technique the accuracy and the generality of the performance models built for the ARM926EJ-S and the LEON3 processors

@conference{11311_575582,
author = {Lattuada, Marco and Ferrandi, Fabrizio},
title = {Performance modeling of embedded applications with zero architectural knowledge},
year = {2010},
publisher = {ACM},
address = {New York, NY, USA},
booktitle = {CODES/ISSS '10 Proceedings of the eighth IEEE/ACM/IFIP international conference on Hardware/software codesign and system synthesis},
abstract = {Performance estimation is a key step in the development of an embedded system. Normally, the performance evaluation is performed using a simulator or a performance mathematical model of the target architecture. However, both these approaches are usually based on the knowledge of the architectural details of the target.
In this paper we present a methodology for automatically building an analytical model to estimate the performance of an application on a generic processor without requiring any information about the processor architecture but the one provided by the GNU GCC Intermediate Representation. The proposed methodology exploits the linear regression technique based on an application analysis performed on the Register Transfer Level internal representation of the GNU GCC compiler. The benefits of working with this type of model and with this intermediate representation are three: we take into account most of the compiler optimizations, we implicitly consider some architectural characteristics of the target processor and we can easily estimate the performance of portions of the specification. We validate our approach by evaluating with cross-validation technique the accuracy and the generality of the performance models built for the ARM926EJ-S and the LEON3 processors},
url = {http://dl.acm.org/citation.cfm?id=1879010},
doi = {10.1145/1878961.1879010},
isbn = {9781605589053},
pages = {277--286}
}

C. Pilato, F. Ferrandi, and D. Pandini, “A Fast Heuristic for Extending Standard Cell Libraries with Regular Macro Cells,” in IEEE Computer Society Annual Symposium on VLSI, 2010, ISVLSI 2010, 2010, p. 23–28.
[BibTeX]

@conference{11311_575577,
author = {Pilato, Christian and Ferrandi, Fabrizio and Pandini, D.},
title = {A Fast Heuristic for Extending Standard Cell Libraries with Regular Macro Cells},
year = {2010},
booktitle = {IEEE Computer Society Annual Symposium on VLSI, 2010, ISVLSI 2010},
keywords = {INF},
url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5572753},
doi = {10.1109/ISVLSI.2010.69},
pages = {23--28}
}

F. Ferrandi, C. Pilato, D. Sciuto, and A. Tumeo, “Mapping and scheduling of parallel C applications with Ant Colony Optimization onto heterogeneous reconfigurable MPSoCs,” in Proceedings of the Asia and South Pacific Design Automation Conference (ASP-DAC 2010), 2010, p. 799–804.
[BibTeX]

@conference{11311_572916,
author = {Ferrandi, Fabrizio and Pilato, Christian and Sciuto, Donatella and Tumeo, Antonino},
title = {Mapping and scheduling of parallel C applications with Ant Colony Optimization onto heterogeneous reconfigurable MPSoCs},
year = {2010},
booktitle = {Proceedings of the Asia and South Pacific Design Automation Conference (ASP-DAC 2010)},
keywords = {INF},
doi = {10.1109/ASPDAC.2010.5419782},
isbn = {9781424457656},
pages = {799--804}
}

S. Cecchi, A. Primavera, F. Piazza, F. Bettarelli, E. Ciavattini, R. Toppi, J. G. F. Coutinho, W. Luk, C. Pilato, F. Ferrandi, V. M. Sima, and K. Bertels, “The hArtes CarLab: A New Approach to Advanced Algorithms Development for Automotive Audio,” in Proceedings of 129th Audio Engineering Society Convention (AES), 2010, p. 1–12.
[BibTeX]

@conference{11311_575580,
author = {Cecchi, S. and Primavera, A. and Piazza, F. and Bettarelli, F. and Ciavattini, E. and Toppi, R. and Coutinho, J. G. F. and Luk, W. and Pilato, Christian and Ferrandi, Fabrizio and Sima, V. M. and Bertels, K.},
title = {The hArtes CarLab: A New Approach to Advanced Algorithms Development for Automotive Audio},
year = {2010},
booktitle = {Proceedings of 129th Audio Engineering Society Convention (AES)},
keywords = {INF},
url = {http://www.aes.org/e-lib/browse.cfm?elib=15605},
pages = {1--12}
}

M. Lattuada and F. Ferrandi, “Fine grain analysis of simulators accuracy for calibrating performance models,” in Rapid System Prototyping (RSP), 2010 21st IEEE International Symposium on, 2010, p. 1–7.
[BibTeX] [Abstract]

In embedded system design, the tuning and validation of a cycle accurate simulator is a difficult task. The designer has to assure that the estimation error of the simulator meets the design constraints on every application. If an application is not correctly estimated, the designer has to identify on which parts of the application the simulator introduces an estimation error and consequently fix the simulator. However, detecting which are the mispredicted parts of a very large application can be a difficult process which requires a lot of time. In this paper we propose a methodology which helps the designer to fast and automatically isolate the portions of the application mispredicted by a simulator. This is accomplished by recursively analyzing the application source code trace highlighting the mispredicted sections of source code. The results obtained applying the methodology to the TSIM simulator show how our methodology is able to fast analyze large applications isolating small portions of mispredicted code.

@conference{11311_575579,
author = {Lattuada, Marco and Ferrandi, Fabrizio},
title = {Fine grain analysis of simulators accuracy for calibrating performance models},
year = {2010},
booktitle = {Rapid System Prototyping (RSP), 2010 21st IEEE International Symposium on},
abstract = {In embedded system design, the tuning and validation of a cycle accurate simulator is a difficult task. The designer has to assure that the estimation error of the simulator meets the design constraints on every application. If an application is not correctly estimated, the designer has to identify on which parts of the application the simulator introduces an estimation error and consequently fix the simulator. However, detecting which are the mispredicted parts of a very large application can be a difficult process which requires a lot of time. In this paper we propose a methodology which helps the designer to fast and automatically isolate the portions of the application mispredicted by a simulator. This is accomplished by recursively analyzing the application source code trace highlighting the mispredicted sections of source code. The results obtained applying the methodology to the TSIM simulator show how our methodology is able to fast analyze large applications isolating small portions of mispredicted code.},
doi = {10.1109/RSP.2010.5656414},
isbn = {9781424470723},
isbn = {9781424470730},
pages = {1--7}
}

M. Lattuada and F. Ferrandi, “Combining Target-independent Analysis with Dynamic Profiling to Build the Performance Model of a DSP,” in Computer and Information Technology (CIT), 2010 IEEE 10th International Conference on, 2010, p. 1895–1901.
[BibTeX] [Abstract]

Fast and accurate performance estimation is a key aspect of heterogeneous embedded systems design flow, since cycle-accurate simulators, when exist, are usually too slow to be used during design space exploration. Performance estimation techniques are usually based on combination of estimation of the single processing elements which compose the system. Architectural characteristics of Digital Signal Processors (DSP), such as the presence of Single Instruction Multiple Data operations or of special hardware units to control loop executions, introduce peculiar aspects in the performance estimation problem. In this paper we present a methodology to estimate the performance of a function on a given dataset on a DSP. Estimation is performed combining the host profiling data with the function GNU GCC GIMPLE representation. Starting from the results of this analysis, we build a performance model of a DSP by exploiting the Linear Regression Technique. Use of GIMPLE representation allows to take directly into account the target-independent optimizations performed by the DSP compiler. We validate our approach by building a performance model of the MagicV DSP and by testing the model on a set of significative benchmarks.

@conference{11311_575581,
author = {Lattuada, Marco and Ferrandi, Fabrizio},
title = {Combining Target-independent Analysis with Dynamic Profiling to Build the Performance Model of a DSP},
year = {2010},
booktitle = {Computer and Information Technology (CIT), 2010 IEEE 10th International Conference on},
abstract = {Fast and accurate performance estimation is a key aspect of heterogeneous embedded systems design flow, since cycle-accurate simulators, when exist, are usually too slow to be used during design space exploration. Performance estimation techniques are usually based on combination of estimation of the single processing elements which compose the system. Architectural characteristics of Digital Signal Processors (DSP), such as the presence of Single Instruction Multiple Data operations or of special hardware units to control loop executions, introduce peculiar aspects in the performance estimation problem. In this paper we present a methodology to estimate the performance of a function on a given dataset on a DSP. Estimation is performed combining the host profiling data with the function GNU GCC GIMPLE representation. Starting from the results of this analysis, we build a performance model of a DSP by exploiting the Linear Regression Technique. Use of GIMPLE representation allows to take directly into account the target-independent optimizations performed by the DSP compiler. We validate our approach by building a performance model of the MagicV DSP and by testing the model on a set of significative benchmarks.},
url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5578004},
doi = {10.1109/CIT.2010.324},
isbn = {9781424475476},
pages = {1895--1901}
}

F. Ferrandi, P. L. Lanzi, C. Pilato, D. Sciuto, and A. Tumeo, “Ant Colony Heuristic for Mapping and Scheduling Tasks and Communications on Heterogeneous Embedded Systems,” IEEE TRANSACTIONS ON COMPUTER-AIDED DESIGN OF INTEGRATED CIRCUITS AND SYSTEMS, vol. 29, iss. 6, p. 911–924, 2010.
[BibTeX] [Abstract]

To exploit the power of modern heterogeneous multiprocessor embedded platforms on partitioned applications, the designer usually needs to efficiently map and schedule all the tasks and the communications of the application, respecting the constraints imposed by the target architecture. Since the problem is heavily constrained, common methods used to explore such design space usually fail, obtaining low-quality solutions. In this paper, we propose an ant colony optimization (ACO) heuristic that, given a model of the target architecture and the application, efficiently executes both scheduling and mapping to optimize the application performance. We compare our approach with several other heuristics, including simulated annealing, tabu search, and genetic algorithms, on the performance to reach the optimum value and on the potential to explore the design space. We show that our approach obtains better results than other heuristics by at least 16% on average, despite an overhead in execution time. Finally, we validate the approach by scheduling and mapping a JPEG encoder on a realistic target architecture.

@article{11311_572925,
author = {Ferrandi, Fabrizio and Lanzi, PIER LUCA and Pilato, Christian and Sciuto, Donatella and Tumeo, Antonino},
title = {Ant Colony Heuristic for Mapping and Scheduling Tasks and Communications on Heterogeneous Embedded Systems},
year = {2010},
journal = {IEEE TRANSACTIONS ON COMPUTER-AIDED DESIGN OF INTEGRATED CIRCUITS AND SYSTEMS},
volume = {29},
abstract = {To exploit the power of modern heterogeneous multiprocessor embedded platforms on partitioned applications, the designer usually needs to efficiently map and schedule all the tasks and the communications of the application, respecting the constraints imposed by the target architecture. Since the problem is heavily constrained, common methods used to explore such design space usually fail, obtaining low-quality solutions. In this paper, we propose an ant colony optimization (ACO) heuristic that, given a model of the target architecture and the application, efficiently executes both scheduling and mapping to optimize the application performance. We compare our approach with several other heuristics, including simulated annealing, tabu search, and genetic algorithms, on the performance to reach the optimum value and on the potential to explore the design space. We show that our approach obtains better results than other heuristics by at least 16% on average, despite an overhead in execution time. Finally, we validate the approach by scheduling and mapping a JPEG encoder on a realistic target architecture.},
keywords = {INF},
doi = {10.1109/TCAD.2010.2048354},
pages = {911--924},
number = {6}
}

2009

R. Muhammad, F. Ferrandi, and B. Koen, “hArtes design flow for heterogeneous platforms,” in Quality of Electronic Design, 2009. ISQED 2009. Quality Electronic Design, 2009, p. 330–338.
[BibTeX]

@conference{11311_553638,
author = {Muhammad, Rashid and Ferrandi, Fabrizio and Koen, Bertels},
title = {hArtes design flow for heterogeneous platforms},
year = {2009},
booktitle = {Quality of Electronic Design, 2009. ISQED 2009. Quality Electronic Design},
keywords = {XML;embedded systems;hardware description languages;hardware-software codesign;optimisation;program compilers;reconfigurable architectures;task analysis;C pragma notations;H.264 video encoding application;HDL compilation;HDL generation;XML architecture description file;algorithm exploration and translation;design space exploration;hArtes design flow;hardware/software co-design;holistic approach to reconfigurable real time embedded systems design flow;holistic tool-chain;information exchange;reconfigurable heterogeneous platforms;system synthesis;task assignment;task optimization;task partitioning;Algorithm design and analysis;Application software;Computer architecture;Design optimization;Embedded software;Hardware design languages;Partitioning algorithms;Real time systems;Space exploration;XML;Design space exploration;application partitioning;simulation;system synthesis},
url = {http://dx.doi.org/10.1109/ISQED.2009.4810316},
doi = {10.1109/ISQED.2009.4810316},
pages = {330--338}
}

M. Branca, L. Camerini, F. Ferrandi, P. L. Lanzi, C. Pilato, D. Sciuto, and A. Tumeo, “Mapping pipelined applications onto heterogeneous embedded systems: a Bayesian Optimization Algorithm based approach,” in Proceedings of the 7th IEEE/ACM International Conference on Hardware/Software Codesign and System Synthesis, 2009, p. 443–452.
[BibTeX] [Abstract]

In this paper we propose a flow based on the Bayesian Optimization Algorithm (BOA) for mapping pipelined applications on a heterogeneous multiprocessor platform on Field Programmable Gate Array (FPGA) with customizable processors. BOA is a Probabilistic Model Building Genetic Algorithm (PMBGA) that, substituting the classical mutation and crossover operators with the construction and the sampling of a Bayesian network, is able to identify correlated sub-structures within the problem to be maintained while generating new solutions. The paper introduces the model adopted for pipelined applications and then shows why BOA fits the problem better than other search algorithms, like Genetic Algorithm (GA), Simulated Annealing (SA) and Tabu Search (TS). We also show that our algorithm is able to cope with data parallel pipelined algorithms. We finally validate our flow on realistic applications like JPEG and ADPCM coding by executing the resulting mapping on our platform.

@conference{11311_553649,
author = {Branca, M. and Camerini, L. and Ferrandi, Fabrizio and Lanzi, PIER LUCA and Pilato, Christian and Sciuto, Donatella and Tumeo, Antonino},
title = {Mapping pipelined applications onto heterogeneous embedded systems: a Bayesian Optimization Algorithm based approach},
year = {2009},
booktitle = {Proceedings of the 7th IEEE/ACM International Conference on Hardware/Software Codesign and System Synthesis},
abstract = {In this paper we propose a flow based on the Bayesian Optimization Algorithm (BOA) for mapping pipelined applications on a heterogeneous multiprocessor platform on Field Programmable Gate Array (FPGA) with customizable processors. BOA is a Probabilistic Model Building Genetic Algorithm (PMBGA) that, substituting the classical mutation and crossover operators with the construction and the sampling of a Bayesian network, is able to identify correlated sub-structures within the problem to be maintained while generating new solutions.
The paper introduces the model adopted for pipelined applications and then shows why BOA fits the problem better than other search algorithms, like Genetic Algorithm (GA), Simulated Annealing (SA) and Tabu Search (TS). We also show that our algorithm is able to cope with data parallel pipelined algorithms. We finally validate our flow on realistic applications like JPEG and ADPCM coding by executing the resulting mapping on our platform.},
keywords = {INF},
url = {http://doi.acm.org/10.1145/1629435.1629495},
doi = {10.1145/1629435.1629495},
pages = {443--452}
}

M. Lattuada, C. Pilato, A. Tumeo, and F. Ferrandi, “Performance Modeling of Parallel Applications on MPSoCs,” in System-on-Chip, 2009. SOC 2009. International Symposium on, 2009, p. 64–68.
[BibTeX] [Abstract]

In this paper we present a new technique for automatically measuring the performance of tasks, functions or arbitrary parts of a program on a multiprocessor embedded system. The technique instruments the tasks described by OpenMP, used to represent the task parallelism, while ad hoc pragmas in the source indicate other pieces of code to profile. The annotations and the instrumentation are completely target-independent, so the same code can be measured on different target architectures, on simulators or on prototypes. We validate the approach on a single and on a dual LEON 3 platform synthesized on FPGA, demonstrating a low instrumentation overhead. We show how the information obtained with this technique can be easily exploited in a hardware/software design space exploration tool, by estimating, with good accuracy, the speed-up of a parallel application given the profiling on the single processor prototype.

@conference{11311_553648,
author = {Lattuada, Marco and Pilato, Christian and Tumeo, Antonino and Ferrandi, Fabrizio},
title = {Performance Modeling of Parallel Applications on MPSoCs},
year = {2009},
booktitle = {System-on-Chip, 2009. SOC 2009. International Symposium on},
abstract = {In this paper we present a new technique for automatically measuring the performance of tasks, functions or arbitrary parts of a program on a multiprocessor embedded system. The technique instruments the tasks described by OpenMP, used to represent the task parallelism, while ad hoc pragmas in the source indicate other pieces of code to profile. The annotations and the instrumentation are completely target-independent, so the same code can be measured on different target architectures, on simulators or on prototypes. We validate the approach on a single and on a dual LEON 3 platform synthesized on FPGA, demonstrating a low instrumentation overhead. We show how the information obtained with this technique can be easily exploited in a hardware/software design space exploration tool, by estimating, with good accuracy, the speed-up of a parallel application given the profiling on the single processor prototype.},
keywords = {embedded systems;field programmable gate arrays;hardware-software codesign;logic design;multiprocessing systems;system-on-chip;FPGA;MPSoC design;OpenMP;ad hoc pragmas;dual LEON 3 platform;hardware-software design;multiprocessor embedded system;performance modeling;single processor prototype;task parallelism;Application software;Computer architecture;Embedded system;Field programmable gate arrays;Hardware;Instruments;Software design;Software prototyping;Space exploration;Virtual prototyping},
url = {http://doi.ieeecomputersociety.org/10.1109/SOCC.2009.5335675},
doi = {10.1109/SOCC.2009.5335675},
pages = {64--68}
}

M. Branca, L. Camerini, F. Ferrandi, P. L. Lanzi, C. Pilato, D. Sciuto, and A. Tumeo, “Evolutionary algorithms for the mapping of pipelined applications onto heterogeneous embedded systems,” in Proceedings of the 11th Annual Conference on Genetic and Evolutionary Computation, 2009, p. 1435–1442.
[BibTeX]

@conference{11311_553642,
author = {Branca, M. and Camerini, L. and Ferrandi, Fabrizio and Lanzi, PIER LUCA and Pilato, Christian and Sciuto, Donatella and Tumeo, Antonino},
title = {Evolutionary algorithms for the mapping of pipelined applications onto heterogeneous embedded systems},
year = {2009},
booktitle = {Proceedings of the 11th Annual Conference on Genetic and Evolutionary Computation},
keywords = {INF},
url = {http://doi.acm.org/10.1145/1569901.1570094},
doi = {10.1145/1569901.1570094},
pages = {1435--1442}
}

F. Ferrandi, M. Lattuada, C. Pilato, and A. Tumeo, “Performance Estimation for Task Graphs Combining Sequential Path Profiling and Control Dependence Regions,” in Proceedings of the 7th IEEE/ACM International Conference on Formal Methods and Models for Codesign, 2009, p. 131–140.
[BibTeX] [Abstract]

The speed-up estimation of parallelized code is crucial to efficiently compare different parallelization techniques or task graph transformations. Unfortunately, most of the time, during the parallelization of a specification, the information that can be extracted by profiling the corresponding sequential code (e.g. the most executed paths) are not properly taken into account. In particular, correlating sequential path profiling with the corresponding parallelized code can help in the identification of code hot spots, opening new possibilities for automatic parallelization. For this reason, starting from a well-known profiling technique, the Efficient Path Profiling, we propose a methodology that estimates the speed-up of a parallelized specification, just using the corresponding hierarchical task graph representation and the information coming from the dynamic profiling of the initial sequential specification. Experimental results show that the proposed solution outperforms existing approaches.

@conference{11311_553643,
author = {Ferrandi, Fabrizio and Lattuada, Marco and Pilato, Christian and Tumeo, Antonino},
title = {Performance Estimation for Task Graphs Combining Sequential Path Profiling and Control Dependence Regions},
year = {2009},
booktitle = {Proceedings of the 7th IEEE/ACM International Conference on Formal Methods and Models for Codesign},
abstract = {The speed-up estimation of parallelized code is crucial to efficiently compare different parallelization techniques or task graph transformations. Unfortunately, most of the time, during the parallelization of a specification, the information that can be extracted by profiling the corresponding sequential code (e.g. the most executed paths) are not properly taken into account. In particular, correlating sequential path profiling with the corresponding parallelized code can help in the identification of code hot spots, opening new possibilities for automatic parallelization. For this reason, starting from a well-known profiling technique, the Efficient Path Profiling, we propose a methodology that estimates the speed-up of a parallelized specification, just using the corresponding hierarchical task graph representation and the information coming from the dynamic profiling of the initial sequential specification. Experimental results show that the proposed solution outperforms existing approaches.},
keywords = {INF},
url = {http://doi.ieeecomputersociety.org/10.1109/MEMCOD.2009.5185389},
doi = {10.1109/MEMCOD.2009.5185389},
pages = {131--140}
}

2008

C. Pilato, D. Loiacono, F. Ferrandi, P. L. Lanzi, and D. Sciuto, “High-level synthesis with multi-objective genetic algorithm: A comparative encoding analysis,” in Proceedings of the IEEE Congress on Evolutionary Computation 2008, CEC 2008, 2008, p. 3334–3341.
[BibTeX]

@conference{11311_544245,
author = {Pilato, Christian and Loiacono, Daniele and Ferrandi, Fabrizio and Lanzi, PIER LUCA and Sciuto, Donatella},
title = {High-level synthesis with multi-objective genetic algorithm: A comparative encoding analysis},
year = {2008},
booktitle = {Proceedings of the IEEE Congress on Evolutionary Computation 2008, CEC 2008},
doi = {10.1109/CEC.2008.4631249},
isbn = {9781424418220},
pages = {3334--3341}
}

F. Ferrandi, P. L. Lanzi, D. Loiacono, C. Pilato, and D. Sciuto, “A Multi-objective Genetic Algorithm for Design Space Exploration in High-Level Synthesis,” in IEEE Computer Society Annual on VLSI, 2008, ISVLSI 2008, 2008, p. 417–422.
[BibTeX]

@conference{11311_544244,
author = {Ferrandi, Fabrizio and Lanzi, PIER LUCA and Loiacono, Daniele and Pilato, Christian and Sciuto, Donatella},
title = {A Multi-objective Genetic Algorithm for Design Space Exploration in High-Level Synthesis},
year = {2008},
booktitle = {IEEE Computer Society Annual on VLSI, 2008, ISVLSI 2008},
doi = {10.1109/ISVLSI.2008.73},
pages = {417--422}
}

C. Pilato, A. Tumeo, G. Palermo, F. Ferrandi, P. L. Lanzi, and D. Sciuto, “Improving Evolutionary Exploration to Area-Time Optimization of FPGA Designs,” JOURNAL OF SYSTEMS ARCHITECTURE, vol. 54, iss. 11, p. 1046–1057, 2008.
[BibTeX] [Abstract]

This paper presents a new methodology based on evolutionary multi-objective optimization (EMO) to synthesize multiple complex modules on reprogrammable devices. It starts from a behavioral description written in a common high-level language (for instance C) to automatically produce the register-transfer level (RTL) design in a hardware description language (e.g. Verilog). Since all high-level synthesis problems (scheduling, resource allocation and binding) are notoriously NP-complete and interdependent, these problems should be considered simultaneously. This drives to a wide design space, that needs to be thoroughly explored to obtain solutions able to satisfy the design constraints (e.g. area and performance). Since evolutionary algorithms are good candidates to tackle such complex explorations, in this paper we provide a solution based on the non-dominated sorting genetic algorithm (NSGA-II) to explore the design space and obtain the best solutions in terms of performance given the area constraints of a target reprogrammable device, for instance a Field Programmable Gate Array (FPGA). To further reduce the time needed for the exploration, that theoretically requires the complete logic synthesis of each visited point, the evaluation of the solutions have been speed-up by using two techniques: a good cost estimation model and a technique to exploit fitness inheritance by substituting the expensive actual evaluations with estimations based on closeness in an hypothetical design space. We show on the JPEG case study that the proposed approach provides good results in terms of trade-off between total area occupied and execution time. The results shows also that the Pareto-optimal set obtained by applying the proposed fitness inheritance model well approximates the set obtained without the proposed technique and reduces the overall execution time up to the 25% in average.

@article{11311_544405,
author = {Pilato, Christian and Tumeo, Antonino and Palermo, Gianluca and Ferrandi, Fabrizio and Lanzi, PIER LUCA and Sciuto, Donatella},
title = {Improving Evolutionary Exploration to Area-Time Optimization of FPGA Designs},
year = {2008},
journal = {JOURNAL OF SYSTEMS ARCHITECTURE},
volume = {54},
abstract = {This paper presents a new methodology based on evolutionary multi-objective optimization (EMO) to synthesize multiple complex modules on reprogrammable devices. It starts from a behavioral description written in a common high-level language (for instance C) to automatically produce the register-transfer level (RTL) design in a hardware description language (e.g. Verilog). Since all high-level synthesis problems (scheduling, resource allocation and binding) are notoriously NP-complete and interdependent, these problems should be considered simultaneously. This drives to a wide design space, that needs to be thoroughly explored to obtain solutions able to satisfy the design constraints (e.g. area and performance). Since evolutionary algorithms are good candidates to tackle such complex explorations, in this paper we provide a solution based on the non-dominated sorting genetic algorithm (NSGA-II) to explore the design space and obtain the best solutions in terms of performance given the area constraints of a target reprogrammable device, for instance a Field Programmable Gate Array (FPGA). To further reduce the time needed for the exploration, that theoretically requires the complete logic synthesis of each visited point, the evaluation of the solutions have been speed-up by using two techniques: a good cost estimation model and a technique to exploit fitness inheritance by substituting the expensive actual evaluations with estimations based on closeness in an hypothetical design space. We show on the JPEG case study that the proposed approach provides good results in terms of trade-off between total area occupied and execution time. The results shows also that the Pareto-optimal set obtained by applying the proposed fitness inheritance model well approximates the set obtained without the proposed technique and reduces the overall execution time up to the 25% in average.},
doi = {10.1016/j.sysarc.2008.04.010},
pages = {1046--1057},
number = {11}
}

A. Tumeo, C. Pilato, F. Ferrandi, D. Sciuto, and P. L. Lanzi, “Ant Colony Optimization for Mapping and Scheduling in Heterogeneous Multiprocessor Systems,” in Proceedings IEEE International Conference in Embedded Computer Systems: Architectures, Modeling, and Simulation, 2008, SAMOS VIII, 2008, p. 142–149.
[BibTeX]

@conference{11311_544246,
author = {Tumeo, Antonino and Pilato, Christian and Ferrandi, Fabrizio and Sciuto, Donatella and Lanzi, PIER LUCA},
title = {Ant Colony Optimization for Mapping and Scheduling in Heterogeneous Multiprocessor Systems},
year = {2008},
booktitle = {Proceedings IEEE International Conference in Embedded Computer Systems: Architectures, Modeling, and Simulation, 2008, SAMOS VIII},
doi = {10.1109/ICSAMOS.2008.4664857},
pages = {142--149}
}

2007

F. Ferrandi, P. L. Lanzi, G. Palermo, C. Pilato, D. Sciuto, and A. Tumeo, “An Evolutionary Approach to Area-Time Optimization of FPGA designs,” in Proceedings of IC-SAMOS 2007. International Conference on Embedded Computer Systems: Architectures, Modeling and Simulation, 2007., 2007, p. 145–152.
[BibTeX]

@conference{11311_248119,
author = {Ferrandi, Fabrizio and Lanzi, PIER LUCA and Palermo, Gianluca and Pilato, Christian and Sciuto, Donatella and Tumeo, Antonino},
title = {An Evolutionary Approach to Area-Time Optimization of FPGA designs},
year = {2007},
publisher = {IEEE},
booktitle = {Proceedings of IC-SAMOS 2007. International Conference on Embedded Computer Systems: Architectures, Modeling and Simulation, 2007.},
url = {http://dx.doi.org/10.1109/ICSAMOS.2007.4285745},
url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4285745},
doi = {10.1109/ICSAMOS.2007.4285745},
isbn = {1424410584},
pages = {145--152}
}

F. Ferrandi, L. Fossati, M. Lattuada, G. Palermo, D. Sciuto, and A. Tumeo, “Partitioning and Mapping for the hArtes European Project,” in Proc. Workshop on Directions in FPGAs and Reconfigurable Systems: Design, Programming and Technologies for adaptive heterogeneous Systems-on-Chip and their European Dimensions, held during Design Automation and Test in Europe 2007 (DATE ’07),, 2007, p. 47–52.
[BibTeX]

@conference{11311_268434,
author = {Ferrandi, Fabrizio and Fossati, Luca and Lattuada, Marco and Palermo, Gianluca and Sciuto, Donatella and Tumeo, Antonino},
title = {Partitioning and Mapping for the hArtes European Project},
year = {2007},
booktitle = {Proc. Workshop on Directions in FPGAs and Reconfigurable Systems: Design, Programming and Technologies for adaptive heterogeneous Systems-on-Chip and their European Dimensions, held during Design Automation and Test in Europe 2007 (DATE '07),},
pages = {47--52}
}

F. Ferrandi, P. L. Lanzi, G. Palermo, C. Pilato, D. Sciuto, and A. Tumeo, “Fitness Inheritance in Evolutionary and Multi-Objective High-Level Synthesis,” in Proceedings of the IEEE CEC 2007 – Congress on Evolutionary Computation, 2007, p. 2459–2466.
[BibTeX] [Abstract]

The high-level synthesis process allows the automatic design and implementation of digital circuits starting from a behavioral description. Evolutionary algorithms are very widely adopted to approach this problem or just part of it. Neverthless, some concerns regarding execution times exist. In evolutionary high-level synthesis, design solutions have to be evaluated to extract information about some figures of merit (such as performance, area, etc.) and to allow the genetic algorithm to evolve and converge to Pareto-optimal solutions. Since the execution time of such evaluations increases with the complexity of the specification, the overall methodology could lead to unacceptable execution time. This paper presents a model to exploit fitness inheritance in a multi-objective optimization algorithm (i.e. NSGA-II) by substituting the expensive real evaluations with estimations based on closeness in an hypothetical design space. The estimations are based on the measure of the distance between individuals and a weighted average of the fitnesses of the closest ones. The results shows that the Pareto-optimal set obtained by applying the proposed model well approximates the set obtained without fitness inheritance. Moreover, the overall execution time is reduced up to the 25% in average.

@conference{11311_253573,
author = {Ferrandi, Fabrizio and Lanzi, PIER LUCA and Palermo, Gianluca and Pilato, Christian and Sciuto, Donatella and Tumeo, Antonino},
title = {Fitness Inheritance in Evolutionary and Multi-Objective High-Level Synthesis},
year = {2007},
publisher = {IEEE},
booktitle = {Proceedings of the IEEE CEC 2007 - Congress on Evolutionary Computation},
abstract = {The high-level synthesis process allows the automatic design and implementation of digital circuits starting from a behavioral description. Evolutionary algorithms are very widely adopted to approach this problem or just part of it. Neverthless, some concerns regarding execution times exist. In evolutionary high-level synthesis, design solutions have to be evaluated to extract information about some figures of merit (such as performance, area, etc.) and to allow the genetic algorithm to evolve and converge to Pareto-optimal solutions. Since the execution time of such evaluations increases with the complexity of the specification, the overall methodology could lead to unacceptable execution time. This paper presents a model to exploit fitness inheritance in a multi-objective optimization algorithm (i.e. NSGA-II) by substituting the expensive real evaluations with estimations based on closeness in an hypothetical design space. The estimations are based on the measure of the distance between individuals and a weighted average of the fitnesses of the closest ones. The results shows that the Pareto-optimal set obtained by applying the proposed model well approximates the set obtained without fitness inheritance. Moreover, the overall execution time is reduced up to the 25% in average.},
url = {http://dx.doi.org/10.1109/CEC.2007.4424920},
url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4424920},
doi = {10.1109/CEC.2007.4424920},
isbn = {9781424413393},
isbn = {9781424413409},
pages = {2459--2466}
}

F. Ferrandi, L. Fossati, M. Lattuada, G. Palermo, D. Sciuto, and A. Tumeo, “Automatic parallelization of sequential specifications for symmetric MPSoCs,” in Embedded System Design: Topics, Techniques and Trends, 2007, p. 179–192.
[BibTeX]

@conference{11311_240811,
author = {Ferrandi, Fabrizio and Fossati, Luca and Lattuada, Marco and Palermo, Gianluca and Sciuto, Donatella and Tumeo, Antonino},
title = {Automatic parallelization of sequential specifications for symmetric MPSoCs},
year = {2007},
publisher = {Springer US},
booktitle = {Embedded System Design: Topics, Techniques and Trends},
url = {http://link.springer.com/chapter/10.1007/978-0-387-72258-0_16?null},
doi = {10.1007/978-0-387-72258-0_16},
isbn = {9780387722573},
isbn = {9780387722580},
pages = {179--192}
}

F. Ferrandi, D. Pandini, A. Rosiello, and D. Sciuto, “A Hash-based Approach for Functional Regularity Extraction During Logic Synthesis,” in VLSI, 2007. ISVLSI ’07. IEEE Computer Society Annual Symposium on, 2007, p. 92–97.
[BibTeX] [Abstract]

Performance, power, and functionality, yield and manufacturability are rapidly becoming additional critical factors that must be considered at higher levels of abstraction. A possible solution to improve yield and manufacturability is based on the detection of regularity at logic level This paper focuses its attention on regularity extraction, after technology independent logic synthesis, to detect recurring functionalities during logic synthesis and thus constraining the physical design phase to exploit the regular netlist produced. A fast heuristic to the template identification is proposed and analyzed on a standard set of benchmarks both sequential and combinational.

@conference{11311_259699,
author = {Ferrandi, Fabrizio and Pandini, D. and Rosiello, A. and Sciuto, Donatella},
title = {A Hash-based Approach for Functional Regularity Extraction During Logic Synthesis},
year = {2007},
publisher = {IEEE Compurter Soc.},
booktitle = {VLSI, 2007. ISVLSI '07. IEEE Computer Society Annual Symposium on},
abstract = {Performance, power, and functionality, yield and manufacturability are rapidly becoming additional critical factors that must be considered at higher levels of abstraction. A possible solution to improve yield and manufacturability is based on the detection of regularity at logic level This paper focuses its attention on regularity extraction, after technology independent logic synthesis, to detect recurring functionalities during logic synthesis and thus constraining the physical design phase to exploit the regular netlist produced. A fast heuristic to the template identification is proposed and analyzed on a standard set of benchmarks both sequential and combinational.},
keywords = {cryptography;design for manufacture;high level synthesis;integrated circuit yield;logic design;functional regularity extraction;hash-based approach;integrated circuit yield;logic level;logic synthesis;regular netlist;template identification;Circuit synthesis;Clustering algorithms;Hardware design languages;Latches;Libraries;Logic circuits;Logic design;Manufacturing;Minimization;Phase detection},
doi = {10.1109/ISVLSI.2007.5},
isbn = {9781595936059},
pages = {92--97}
}

2006

R. Cordone, F. Ferrandi, G. Palermo, M. D. Santambrogio, and D. Sciuto, “Using Speculative Computation and Parallelizing Techniques to Improve Scheduling of Control based Designs,” in IEEE Proc. Asia and South Pacific Conference on Design Automation 2006, 2006, p. 898–904.
[BibTeX] [Abstract]

Recent research results have seen the application of parallelizing techniques to high-level synthesis. In particular, the effect of speculative code transformations on mixed control-data flow designs has demonstrated effective results on schedule lengths. In this paper we first analyze the use of the control and data dependence graph as an intermediate representation that provides the possibility of extracting the maximum parallelism. Then we analyze the scheduling problem by formulating an approach based on Integer Linear Programming (ILP) to minimize the number of control steps given the amount of resources. We improve the already proposed ILP scheduling approaches by introducing a new conditional resource sharing constraint which is then extended to the case of speculative computation. The ILP formulation has been solved by using a Branch and Cut framework which provides better results than standard branch and bound techniques

@conference{11311_257923,
author = {Cordone, R. and Ferrandi, Fabrizio and Palermo, Gianluca and Santambrogio, MARCO DOMENICO and Sciuto, Donatella},
title = {Using Speculative Computation and Parallelizing Techniques to Improve Scheduling of Control based Designs},
year = {2006},
publisher = {IEEE},
booktitle = {IEEE Proc. Asia and South Pacific Conference on Design Automation 2006},
abstract = {Recent research results have seen the application of parallelizing techniques to high-level synthesis. In particular, the effect of speculative code transformations on mixed control-data flow designs has demonstrated effective results on schedule lengths. In this paper we first analyze the use of the control and data dependence graph as an intermediate representation that provides the possibility of extracting the maximum parallelism. Then we analyze the scheduling problem by formulating an approach based on Integer Linear Programming (ILP) to minimize the number of control steps given the amount of resources. We improve the already proposed ILP scheduling approaches by introducing a new conditional resource sharing constraint which is then extended to the case of speculative computation. The ILP formulation has been solved by using a Branch and Cut framework which provides better results than standard branch and bound techniques},
url = {http://dl.acm.org/citation.cfm?id=1118502},
url = {http://dx.doi.org/10.1145/1118299.1118502},
doi = {10.1109/ASPDAC.2006.1594800},
isbn = {0780394518},
pages = {898--904}
}

F. Bruschi and F. Ferrandi, “A systemC based framework for the early evaluation of communication architectures,” in Proc. Forum on Specification & Design Languages, FDL’06, 2006, p. 319–326.
[BibTeX]

@conference{11311_251538,
author = {Bruschi, Francesco and Ferrandi, Fabrizio},
title = {A systemC based framework for the early evaluation of communication architectures},
year = {2006},
booktitle = {Proc. Forum on Specification & Design Languages, FDL’06},
keywords = {INF},
isbn = {9783000197109},
pages = {319--326}
}

2004

F. Ferrandi, P. L. Lanzi, D. Sciuto, and M. Tanelli, “System-level metrics for hardware/software architectural mapping,” in Electronic Design, Test and Applications, Proceedings. DELTA 2004. Second IEEE International Workshop on, 2004, p. 231–236.
[BibTeX] [Abstract]

The current trend in Embedded Systems (ES) design is moving towards the integration of increasingly complex applications on a single chip, while having to meet strict market demands which force to face always shortening design times. In general, the ideal design methodology shall support the exploration of the highest possible number of alternatives (in terms of HW-SW architectures) starting in the early design stages as this will prevent costly correction efforts in the deployment phase. The present paper will propose a new methodology for tackling the design exploration problem, with the aim of providing a solution in terms of optimal partitioning with respect of the overall system performance.

@conference{11311_262750,
author = {Ferrandi, Fabrizio and Lanzi, PIER LUCA and Sciuto, Donatella and Tanelli, Mara},
title = {System-level metrics for hardware/software architectural mapping},
year = {2004},
booktitle = {Electronic Design, Test and Applications, Proceedings. DELTA 2004. Second IEEE International Workshop on},
abstract = {The current trend in Embedded Systems (ES) design is moving towards the integration of increasingly complex applications on a single chip, while having to meet strict market demands which force to face always shortening design times. In general, the ideal design methodology shall support the exploration of the highest possible number of alternatives (in terms of HW-SW architectures) starting in the early design stages as this will prevent costly correction efforts in the deployment phase. The present paper will propose a new methodology for tackling the design exploration problem, with the aim of providing a solution in terms of optimal partitioning with respect of the overall system performance.},
keywords = {computer architecture;embedded systems;software architecture;software metrics;software performance evaluation;systems analysis;HW-SW architectures;communication performance estimation;embedded systems design;hardware performance estimation;hardware/software architectural mapping;optimal partitioning;software performance estimation;system level metrics;Application software;Computer architecture;Delay estimation;Design methodology;Embedded system;Hardware;Modeling;Performance analysis;Software performance;System performance},
doi = {10.1109/DELTA.2004.10060},
pages = {231--236}
}

F. Ferrandi, P. L. Lanzi, and D. Sciuto, “System Level Hardware–Software Design Exploration with XCS,” in Genetic and Evolutionary Computation – GECCO 2004: Genetic and Evolutionary Computation Conference, Seattle, WA, USA, June 26-30, 2004. Proceedings, Part II, 2004, p. 763–773.
[BibTeX] [Abstract]

The current trend in Embedded Systems (ES) design is moving towards the integration of increasingly complex applications on a single chip. An Embedded System has to satisfy both performance constraints and cost limits; it is composed of both dedicated elements, i.e. hardware (HW) components, and programmable units, i.e. software (SW) components, Hardware (HW) and software (SW) components have to interact with each other for accomplishing a specific task. One of the aims of codesign is to support the exploration of the most significant architectural alternatives in terms of decomposition between hardware (HW) and software (SW) components. In this paper, we propose a novel approach to support the exploration of feasible hardware-software (HW-SW) configurations. The approach exploits the learning classifier system XCS both to identify existing relationships among the system components and to support HW-SW partitioning decisions. We validate the approach by applying it to the design of a Digital Sound Spatializer.

@conference{11311_268312,
author = {Ferrandi, Fabrizio and Lanzi, PIER LUCA and Sciuto, Donatella},
title = {System Level Hardware--Software Design Exploration with XCS},
year = {2004},
publisher = {Springer Verlag},
journal = {LECTURE NOTES IN COMPUTER SCIENCE},
booktitle = {Genetic and Evolutionary Computation -- GECCO 2004: Genetic and Evolutionary Computation Conference, Seattle, WA, USA, June 26-30, 2004. Proceedings, Part II},
abstract = {The current trend in Embedded Systems (ES) design is moving towards the integration of increasingly complex applications on a single chip. An Embedded System has to satisfy both performance constraints and cost limits; it is composed of both dedicated elements, i.e. hardware (HW) components, and programmable units, i.e. software (SW) components, Hardware (HW) and software (SW) components have to interact with each other for accomplishing a specific task. One of the aims of codesign is to support the exploration of the most significant architectural alternatives in terms of decomposition between hardware (HW) and software (SW) components. In this paper, we propose a novel approach to support the exploration of feasible hardware-software (HW-SW) configurations. The approach exploits the learning classifier system XCS both to identify existing relationships among the system components and to support HW-SW partitioning decisions. We validate the approach by applying it to the design of a Digital Sound Spatializer.},
url = {http://dx.doi.org/10.1007/978-3-540-24855-2_91},
doi = {10.1007/978-3-540-24855-2_91},
isbn = {9783540223436},
pages = {763--773}
}

2003

F. Ferrandi, P. L. Lanzi, and D. Sciuto, “Mining Interesting Patterns from Hardware-Software Codesign Data with the Learning Classifier System XCS,” in Proceedings of the IEEE Congress on Evolutionary Computation (CEC 2003), 2003, p. 1486–1492.
[BibTeX]

@conference{11311_268104,
author = {Ferrandi, Fabrizio and Lanzi, PIER LUCA and Sciuto, Donatella},
title = {Mining Interesting Patterns from Hardware-Software Codesign Data with the Learning Classifier System XCS},
year = {2003},
booktitle = {Proceedings of the IEEE Congress on Evolutionary Computation (CEC 2003)},
isbn = {0780378040},
pages = {1486--1492}
}

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009

2008

2007

2006

2004

2003

A framework for Hardware-Software Co-Design of Embedded Systems