<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
  <channel>
    <title>Shader Slang</title>
    <description>Slang project web site</description>
    <link>http://shader-slang.org/</link>
    <atom:link href="http://shader-slang.org/feed.xml" rel="self" type="application/rss+xml"/>
    <pubDate>Sat, 11 Apr 2026 03:12:50 +0000</pubDate>
    <lastBuildDate>Sat, 11 Apr 2026 03:12:50 +0000</lastBuildDate>
    <generator>Jekyll v3.10.0</generator>
    
     
      <item>
        <title>Shading Languages Symposium</title>
        <description>&lt;h2 id=&quot;shading-language-symposium&quot;&gt;Shading Languages Symposium&lt;/h2&gt;

&lt;p&gt;This ground-breaking Shading Languages Symposium brings together shading language implementors, developers, researchers and technical artists to explore the future of shading languages and real-time rendering technologies.&lt;/p&gt;

&lt;p&gt;The symposium will focus on the evolving landscape of shading languages and the emerging technologies that are shaping the next generation of graphics programming.&lt;/p&gt;

&lt;p&gt;If you work with shading languages, this event will provide a unique opportunity to connect with your peers, learn from leaders in their fields, and advance your expertise.&lt;/p&gt;

&lt;p&gt;The Symposium is organised by the Khronos Group and will take place immediately after &lt;a href=&quot;https://vulkan.org/events/vulkanised-2026&quot;&gt;Vulkanised 2026&lt;/a&gt;. We value a diversity of voices, perspectives and experiences and are dedicated to providing a harassment-free experience for everyone.&lt;/p&gt;

&lt;p&gt;&lt;a class=&quot;btn btn-primary&quot; href=&quot;https://www.khronos.org/events/shading-languages-symposium-2026&quot;&gt;Learn More&lt;/a&gt;&lt;/p&gt;
</description>
        <pubDate>Fri, 17 Oct 2025 08:00:00 +0000</pubDate>
        <link>http://shader-slang.org/event/2025/10/17/shading-language-symposium/</link>
        <guid isPermaLink="true">http://shader-slang.org/event/2025/10/17/shading-language-symposium/</guid>
        
        <category>slang</category>
        
        
        <category>event</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/events/2026-02-12-symposium-thumbnail-for-slang-website-events.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>SIGGRAPH 2025 Materials Now Available: Recordings and Resources from Our Events</title>
        <description>&lt;p&gt;Recordings and materials from all three of our SIGGRAPH 2025 sessions are now available! Whether you missed the events or want to revisit the content, you can now access everything from our hands-on lab, Birds of a Feather session, and neural shading course.&lt;/p&gt;

&lt;h3 id=&quot;hands-on-class-introduction-to-slang&quot;&gt;Hands-On Class: Introduction to Slang&lt;/h3&gt;

&lt;p&gt;Our hands-on lab covered everything from language fundamentals to advanced features like modules, generics, and automatic differentiation. You can now see the recording, or work through the entire session at your own pace.&lt;/p&gt;

&lt;p&gt;Download the lab materials and walk through the examples on your own computer here:&lt;br /&gt;
&lt;a href=&quot;https://developer.download.nvidia.com/ProGraphics/nvpro-samples/SlangLab/Lab.zip&quot;&gt;Lab Materials&lt;/a&gt;&lt;br /&gt;
&lt;a href=&quot;https://developer.download.nvidia.com/ProGraphics/nvpro-samples/SlangLab/Slides.pdf&quot;&gt;Lab Slides&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;And view the recording here:&lt;br /&gt;
&lt;a href=&quot;https://youtu.be/F7OS9Zvztmw?si=vXLCu0x-s-jHNagX&quot;&gt;Lab Recording&lt;/a&gt;&lt;/p&gt;

&lt;h3 id=&quot;birds-of-a-feather-developing-with-slang&quot;&gt;Birds of a Feather: Developing with Slang&lt;/h3&gt;

&lt;p&gt;Our Birds of a Feather session covered the latest developments, language roadmap, and featured discussions with developers using Slang in production.&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://www.khronos.org/assets/uploads/developers/presentations/Slang_BOF_SIGGRAPH_2025.pdf&quot;&gt;Presentation Materials&lt;/a&gt;&lt;br /&gt;
&lt;a href=&quot;https://www.youtube.com/watch?v=Y7uBfTxFnnA&quot;&gt;Recording&lt;/a&gt;&lt;/p&gt;

&lt;h3 id=&quot;neural-shading-course&quot;&gt;Neural Shading Course&lt;/h3&gt;

&lt;p&gt;Our intensive course on neural shading techniques provides a deep dive into the cutting edge of graphics programming, showing how to implement neural networks in Slang while leveraging automatic differentiation and modern GPU capabilities.&lt;/p&gt;

&lt;p&gt;You can find the slides for this course, along with all example code in our repository, or watch the recordings:&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://github.com/shader-slang/neural-shading-s25&quot;&gt;Neural Shading Course Materials&lt;/a&gt;&lt;br /&gt;
&lt;a href=&quot;https://youtube.com/playlist?list=PLPTS9gmXL0u_BA3bG67IYQHgrMl48Xml3&amp;amp;si=-b5OVpGnqywCvfHA&quot;&gt;Course Recordings&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;These materials provide comprehensive coverage of modern shader programming with Slang, from basic concepts to advanced techniques. Whether you’re new to Slang or looking to expand your skills, these resources offer valuable insights into the future of shader development.&lt;/p&gt;
</description>
        <pubDate>Mon, 06 Oct 2025 17:00:00 +0000</pubDate>
        <link>http://shader-slang.org/blog/2025/10/06/siggraph-roundup/</link>
        <guid isPermaLink="true">http://shader-slang.org/blog/2025/10/06/siggraph-roundup/</guid>
        
        <category>slang</category>
        
        <category>siggraph</category>
        
        <category>events</category>
        
        
        <category>blog</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/posts/neuralshading-thumbnail.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Getting Started with Slang: Automatic Differentiation</title>
        <description>
</description>
        <pubDate>Mon, 06 Oct 2025 08:00:00 +0000</pubDate>
        <link>http://shader-slang.org/video/2025/10/06/getting-started-with-shader-slang-automatic-differentiation/</link>
        <guid isPermaLink="true">http://shader-slang.org/video/2025/10/06/getting-started-with-shader-slang-automatic-differentiation/</guid>
        
        <category>slang</category>
        
        
        <category>video</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/events/2025-10-21-Slang-Auto-Diff-Banner.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Getting Started with Slang: Automatic Differentiation</title>
        <description>&lt;h2 id=&quot;presentations&quot;&gt;Presentations&lt;/h2&gt;
&lt;p&gt;PowerPoint Slides - &lt;a href=&quot;/images/events/2025-10-06-Getting-Started-with-Slang-Autodiff.pdf&quot;&gt;Getting Started with Shader Slang Autodiff&lt;/a&gt; (pdf)&lt;/p&gt;

&lt;h2 id=&quot;automatic-differentiation&quot;&gt;Automatic Differentiation&lt;/h2&gt;

&lt;p&gt;Join us for a focused deep dive on automatic differentiation in Slang with Shannon Woods. This session distills the core concepts of autodiff, emphasizing how forward and reverse modes work, how gradients propagate, and how to reason about correctness and efficiency with clear, minimal examples.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;What we’ll cover:&lt;/strong&gt;&lt;/p&gt;
&lt;ul&gt;
  &lt;li&gt;Forward and backward gradient propagation: intuition and step‑by‑step examples&lt;/li&gt;
  &lt;li&gt;How to invoke autodiff from Slang&lt;/li&gt;
  &lt;li&gt;Diff pairs and the forward/backward operators&lt;/li&gt;
  &lt;li&gt;Differentiable vs. non‑differentiable types; making custom structs differentiable&lt;/li&gt;
  &lt;li&gt;Providing custom derivatives for fine‑grained control&lt;/li&gt;
  &lt;li&gt;Handling buffer access and gradient accumulation patterns&lt;/li&gt;
  &lt;li&gt;Practical techniques for debugging and validating gradients
&lt;br /&gt;&lt;br /&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;strong&gt;Who should attend:&lt;/strong&gt;&lt;/p&gt;
&lt;ul&gt;
  &lt;li&gt;Graphics engineers exploring optimization or inverse problems&lt;/li&gt;
  &lt;li&gt;Researchers and practitioners in differentiable rendering or neural graphics&lt;/li&gt;
  &lt;li&gt;Developers curious about practical, GPU‑friendly autodiff concepts&lt;/li&gt;
  &lt;li&gt;Bring your questions—there will be time for discussion and hands‑on gradient debugging tips.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;br /&gt;&lt;a class=&quot;btn btn-primary&quot; href=&quot;https://khronosgroup.zoom.us/webinar/register/WN_ynYZnewkRvmoM5Fgbx-iYA&quot; target=&quot;_blank&quot;&gt;Register&lt;/a&gt;&lt;/p&gt;
</description>
        <pubDate>Mon, 06 Oct 2025 08:00:00 +0000</pubDate>
        <link>http://shader-slang.org/event/2025/10/06/getting-started-with-slang-automatic-differentiation/</link>
        <guid isPermaLink="true">http://shader-slang.org/event/2025/10/06/getting-started-with-slang-automatic-differentiation/</guid>
        
        <category>slang</category>
        
        
        <category>event</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/events/2025-10-21-Slang-Auto-Diff-Banner.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Graphics Programming Conference 2025</title>
        <description>&lt;p&gt;&lt;strong&gt;Location&lt;/strong&gt;: Breda, The Netherlands&lt;br /&gt;
&lt;strong&gt;Website&lt;/strong&gt;: &lt;a href=&quot;https://graphicsprogrammingconference.com/&quot; target=&quot;_blank&quot;&gt;https://graphicsprogrammingconference.com/&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;In its second year, Graphics Programming Conference will have the latest advances in real-time rendering in games, interactive applications and other new developments.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Wednesday, November 19th at 16:00&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;Slang Working Group Chair, Shannon Woods, will lead a session on Slang, the open-source, cross-platform shading language hosted by the Khronos Group. 
This presentation will explore how Slang’s modern features enable neural graphics techniques across diverse GPU platforms. We will examine Slang’s key architectural advantages: its modular system for scalable shader development, built-in automatic differentiation capabilities, and seamless cross-platform compilation. We will explore Slang’s automatic differentiation system, enabling gradient-based optimization without requiring external ML frameworks. By understanding Slang’s language design and capabilities, graphics programmers can leverage its neural graphics features to build more sophisticated and performant shader systems, unlocking new possibilities in real-time rendering across all target platforms.&lt;/p&gt;

&lt;p&gt;&lt;a class=&quot;btn btn-primary&quot; href=&quot;https://graphicsprogrammingconference.com/&quot; target=&quot;_blank&quot;&gt;View the GPC2025 Website&lt;/a&gt;&lt;/p&gt;
</description>
        <pubDate>Fri, 12 Sep 2025 08:00:00 +0000</pubDate>
        <link>http://shader-slang.org/event/2025/09/12/graphics-programming-conference-2025/</link>
        <guid isPermaLink="true">http://shader-slang.org/event/2025/09/12/graphics-programming-conference-2025/</guid>
        
        <category>slang</category>
        
        
        <category>event</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/events/2025-11-18-slang-graphics-programming-conference-2025.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Getting Started with Slang: Modularization</title>
        <description>
</description>
        <pubDate>Tue, 09 Sep 2025 08:00:00 +0000</pubDate>
        <link>http://shader-slang.org/video/2025/09/09/getting-started-with-slang-modularization/</link>
        <guid isPermaLink="true">http://shader-slang.org/video/2025/09/09/getting-started-with-slang-modularization/</guid>
        
        <category>modularization</category>
        
        
        <category>video</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/events/2025-07-getting-started-with-slang-modularization-thumbnail.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Slang at SIGGRAPH 2025</title>
        <description>&lt;p&gt;We’re excited to announce that Slang will have a significant presence at &lt;strong&gt;SIGGRAPH 2025&lt;/strong&gt; in Vancouver this August! This year’s conference will feature three major Slang events: a hands-on lab for learning the language, a comprehensive course on neural shading techniques, and a Birds of a Feather session for community discussion and updates.&lt;/p&gt;

&lt;h2 id=&quot;hands-on-class-introduction-to-slang&quot;&gt;&lt;a href=&quot;https://s2025.conference-schedule.org/?post_type=page&amp;amp;p=14&amp;amp;id=gensubcur_104&amp;amp;sess=sess287&quot;&gt;Hands-On Class: Introduction to Slang&lt;/a&gt;&lt;/h2&gt;

&lt;p&gt;&lt;strong&gt;Sunday, August 10, 4:00-5:30 PM PDT&lt;/strong&gt;&lt;br /&gt;
&lt;strong&gt;West Building, Rooms 121-122&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;Join us for a comprehensive hands-on lab that will take you from Slang basics to advanced features. This interactive session, led by NVIDIA’s Nia Bickford, Tristan Lorach, and Chris Hebert, will cover:&lt;/p&gt;

&lt;ul&gt;
  &lt;li&gt;&lt;strong&gt;Language Fundamentals&lt;/strong&gt;: Modern shader programming constructs and syntax&lt;/li&gt;
  &lt;li&gt;&lt;strong&gt;Advanced Features&lt;/strong&gt;: Modules, generics, and interfaces for scalable code organization&lt;/li&gt;
  &lt;li&gt;&lt;strong&gt;SlangPy Integration&lt;/strong&gt;: Python-based development and rapid prototyping&lt;/li&gt;
  &lt;li&gt;&lt;strong&gt;Automatic Differentiation&lt;/strong&gt;: Neural graphics and machine learning capabilities&lt;/li&gt;
  &lt;li&gt;&lt;strong&gt;Cross-Platform Development&lt;/strong&gt;: Writing shaders that work across multiple GPU APIs&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;The lab is designed for graphics developers of all experience levels, from those new to shader programming to experienced developers looking to modernize their workflow. Computers will be provided, so no preparation is required—just bring your curiosity and enthusiasm for modern shader development!&lt;/p&gt;

&lt;h2 id=&quot;birds-of-a-feather-developing-with-slang&quot;&gt;&lt;a href=&quot;https://s2025.conference-schedule.org/?post_type=page&amp;amp;p=14&amp;amp;id=bof_177&amp;amp;sess=sess558&quot;&gt;Birds of a Feather: Developing with Slang&lt;/a&gt;&lt;/h2&gt;

&lt;p&gt;&lt;strong&gt;Wednesday, August 13, 2:30-3:30 PM PDT&lt;/strong&gt;&lt;br /&gt;
&lt;strong&gt;British Ballroom, Fairmont Waterfront&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;This community-focused session will provide the latest updates on Slang development and foster discussions about the future of shader programming. The session will include:&lt;/p&gt;

&lt;ul&gt;
  &lt;li&gt;&lt;strong&gt;Community Update&lt;/strong&gt;: Latest developments and ecosystem growth&lt;/li&gt;
  &lt;li&gt;&lt;strong&gt;Language Roadmap&lt;/strong&gt;: Upcoming features and development priorities&lt;/li&gt;
  &lt;li&gt;&lt;strong&gt;Industry Perspectives&lt;/strong&gt;: Real-world experiences from companies using Slang&lt;/li&gt;
  &lt;li&gt;&lt;strong&gt;Open Discussion&lt;/strong&gt;: Q&amp;amp;A and community feedback&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;This is your opportunity to connect with the Slang Working Group, share your experiences, and help shape the future direction of the language. Whether you’re actively using Slang or just curious about modern shader development, this session will provide valuable insights and networking opportunities.&lt;/p&gt;

&lt;h2 id=&quot;an-introduction-to-neural-shading&quot;&gt;&lt;a href=&quot;https://s2025.conference-schedule.org/?post_type=page&amp;amp;p=14&amp;amp;id=gensub_420&amp;amp;sess=sess208&quot;&gt;An Introduction to Neural Shading&lt;/a&gt;&lt;/h2&gt;

&lt;p&gt;&lt;strong&gt;Thursday, August 14, 9:00 AM-12:15 PM PDT&lt;/strong&gt;&lt;br /&gt;
&lt;strong&gt;West Building, Rooms 109-110&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;This intensive course will teach you how to implement neural shading techniques using Slang, where traditional graphics algorithms are replaced with neural networks. This course covers:&lt;/p&gt;

&lt;ul&gt;
  &lt;li&gt;&lt;strong&gt;Neural Shading Fundamentals&lt;/strong&gt;: Theory and practical implementation of neural networks in graphics&lt;/li&gt;
  &lt;li&gt;&lt;strong&gt;Automatic Differentiation&lt;/strong&gt;: Deep dive into Slang’s autodiff capabilities for gradient-based optimization&lt;/li&gt;
  &lt;li&gt;&lt;strong&gt;MLP Implementation&lt;/strong&gt;: How to build and optimize Multi-Layer Perceptrons in Slang&lt;/li&gt;
  &lt;li&gt;&lt;strong&gt;Hardware Acceleration&lt;/strong&gt;: Leveraging modern GPU tensor cores and cooperative vectors&lt;/li&gt;
  &lt;li&gt;&lt;strong&gt;Production Deployment&lt;/strong&gt;: Real-world considerations for shipping neural shading techniques&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;The course includes interactive samples written in Python and Slang, allowing you to follow along and experiment with the techniques in real-time. This is your opportunity to learn from the experts who are pushing the boundaries of what’s possible with neural graphics.&lt;/p&gt;

&lt;h2 id=&quot;latest-slang-release&quot;&gt;Latest Slang Release&lt;/h2&gt;

&lt;p&gt;We’re also excited to announce our latest Slang release, which brings significant improvements and new features to the language. This release continues our commitment to making shader programming more accessible and powerful: Slang Release &lt;a href=&quot;https://github.com/shader-slang/slang/releases/tag/v2025.14.3&quot;&gt;v2025.14.3&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Highlights of this release include:&lt;/p&gt;

&lt;h3 id=&quot;language-enhancements-and-new-functionality&quot;&gt;Language Enhancements and New Functionality&lt;/h3&gt;

&lt;ul&gt;
  &lt;li&gt;Support for default implementations in interface methods&lt;/li&gt;
  &lt;li&gt;New &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;override&lt;/code&gt; keyword requirement for overriding default interface methods&lt;/li&gt;
  &lt;li&gt;Added control arguments for floating-point denormal mode&lt;/li&gt;
  &lt;li&gt;Extended &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;expand&lt;/code&gt; operator support for concrete tuple values&lt;/li&gt;
&lt;/ul&gt;

&lt;h3 id=&quot;compiler-architecture-improvements&quot;&gt;Compiler Architecture Improvements&lt;/h3&gt;

&lt;p&gt;We’ve redesigned how AST deserialization and deduplication is implemented in the compiler, enabling on-demand deserialization of the core module. This architectural improvement leads to a significant performance boost, with over 3x speedup in &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;createGlobalSession&lt;/code&gt; and reduced end-to-end compile times for small to medium shaders. In our benchmarks, the compile time for a typical fragment shader with &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;slangc&lt;/code&gt; dropped from 260ms to 80ms.&lt;/p&gt;

&lt;h3 id=&quot;improvements-to-code-generation-debugging-and-platform-support&quot;&gt;Improvements to Code Generation, Debugging, and Platform Support&lt;/h3&gt;

&lt;ul&gt;
  &lt;li&gt;Added support for &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;SPV_EXT_fragment_invocation_density&lt;/code&gt; (&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;SPV_NV_shading_rate&lt;/code&gt;)&lt;/li&gt;
  &lt;li&gt;Implemented GLSL/SPIR-V built-in variable &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;DeviceIndex&lt;/code&gt;&lt;/li&gt;
  &lt;li&gt;Added MSVC-style bitfield packing&lt;/li&gt;
  &lt;li&gt;Improved matrix type handling:
    &lt;ul&gt;
      &lt;li&gt;Automatic lowering of unsupported matrix types for GLSL/WGSL/Metal targets&lt;/li&gt;
      &lt;li&gt;Conversion of &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;int&lt;/code&gt;/&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;uint&lt;/code&gt;/&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;bool&lt;/code&gt; matrices to arrays for SPIR-V&lt;/li&gt;
    &lt;/ul&gt;
  &lt;/li&gt;
  &lt;li&gt;Enhanced reflection API with combined texture-sampler flag to differentiate &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;Texture2D&lt;/code&gt; from &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;Sampler2D&lt;/code&gt;&lt;/li&gt;
  &lt;li&gt;Added &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;mad&lt;/code&gt; operation support in WGSL&lt;/li&gt;
  &lt;li&gt;Improved debugging capabilities:
    &lt;ul&gt;
      &lt;li&gt;Added &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;DebugGlobalVariable&lt;/code&gt; instructions to SPIR-V output&lt;/li&gt;
      &lt;li&gt;Updated to 1-based argument indexing for &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;DebugLocalVariable&lt;/code&gt;&lt;/li&gt;
    &lt;/ul&gt;
  &lt;/li&gt;
&lt;/ul&gt;

&lt;h3 id=&quot;language-server-improvements&quot;&gt;Language Server Improvements&lt;/h3&gt;

&lt;ul&gt;
  &lt;li&gt;Auto-sort completion suggestions by relevance and context&lt;/li&gt;
  &lt;li&gt;Show function signature assistance when working with generic types and functions&lt;/li&gt;
  &lt;li&gt;Intelligent auto-completion when implementing interface methods with override keyword&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Apart from these changes, we also landed many smaller fixes that improve the compiler’s performance, stability and consistency.&lt;/p&gt;

&lt;p&gt;This release demonstrates our ongoing investment in Slang’s capabilities and our commitment to the graphics development community. Whether you’re working on traditional rendering pipelines or exploring neural graphics techniques, these new features will help you write better, more maintainable shader code.&lt;/p&gt;

&lt;h2 id=&quot;cant-make-it-to-siggraph&quot;&gt;Can’t Make It to SIGGRAPH?&lt;/h2&gt;

&lt;p&gt;If you can’t attend SIGGRAPH 2025 in person, you can still stay connected with the Slang community:&lt;/p&gt;

&lt;ul&gt;
  &lt;li&gt;&lt;strong&gt;Try Slang Online&lt;/strong&gt;: Experiment with &lt;a href=&quot;https://shader-slang.org/slang-playground&quot;&gt;Slang in your browser&lt;/a&gt;&lt;/li&gt;
  &lt;li&gt;&lt;strong&gt;Join the Community&lt;/strong&gt;: Connect with other developers on our &lt;a href=&quot;https://khr.io/slangdiscord&quot;&gt;Discord server&lt;/a&gt;&lt;/li&gt;
  &lt;li&gt;&lt;strong&gt;Explore Examples&lt;/strong&gt;: Check out our &lt;a href=&quot;https://github.com/shader-slang/slang&quot;&gt;GitHub repository&lt;/a&gt; for tutorials and sample code&lt;/li&gt;
  &lt;li&gt;&lt;strong&gt;Follow Updates&lt;/strong&gt;: Stay informed about Slang developments through our &lt;a href=&quot;https://shader-slang.org/blog&quot;&gt;blog&lt;/a&gt; and social media&lt;/li&gt;
&lt;/ul&gt;

&lt;h2 id=&quot;looking-forward&quot;&gt;Looking Forward&lt;/h2&gt;

&lt;p&gt;SIGGRAPH 2025 marks an exciting milestone for Slang as we continue to build a modern, accessible shader programming ecosystem. Whether you’re attending the hands-on lab, the neural shading course, the Birds of a Feather session, or all three, we’re looking forward to meeting you and sharing the latest developments in shader language technology.&lt;/p&gt;

&lt;p&gt;See you in Vancouver!&lt;/p&gt;

&lt;hr /&gt;

&lt;p&gt;&lt;em&gt;For more information about SIGGRAPH 2025, visit the &lt;a href=&quot;https://s2025.siggraph.org/&quot;&gt;official conference website&lt;/a&gt;.&lt;/em&gt;&lt;/p&gt;
</description>
        <pubDate>Fri, 08 Aug 2025 17:00:00 +0000</pubDate>
        <link>http://shader-slang.org/blog/2025/08/08/slang-at-siggraph-2025/</link>
        <guid isPermaLink="true">http://shader-slang.org/blog/2025/08/08/slang-at-siggraph-2025/</guid>
        
        <category>slang</category>
        
        <category>siggraph</category>
        
        
        <category>blog</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/posts/siggraph-logo.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Slang: Using AI coding assistants</title>
        <description>&lt;h2 id=&quot;introduction&quot;&gt;Introduction&lt;/h2&gt;

&lt;p&gt;Large Language Models (LLMs) aid coding by boosting productivity. llms.txt is a proposed standard for websites to offer LLM-friendly content, mainly to assist LLMs during inference.
Slang now supports the llms.txt standard. You can use LLMs to accelerate your development with the help of coding assistants like &lt;a href=&quot;https://cursor.com/agents&quot;&gt;Cursor&lt;/a&gt;, or any tool of your choice, by providing relevant context about Slang’s
documentation within your projects. We provide three different versions of llms.txt:&lt;/p&gt;

&lt;ol&gt;
  &lt;li&gt;&lt;a href=&quot;/docs/llms.txt&quot;&gt;llms.txt&lt;/a&gt; - contains links with brief descriptions for agents to navigate. Use this if you’re looking for a basic understanding of Slang.&lt;/li&gt;
  &lt;li&gt;&lt;a href=&quot;/docs/llms-full.txt&quot;&gt;llms-full.txt&lt;/a&gt; - contains the entire Slang docs compressed into the llms.txt format. Use this for more detailed documentation or for detailed explanations of Slang.&lt;/li&gt;
  &lt;li&gt;&lt;a href=&quot;/docs/llms-slangpy-full.txt&quot;&gt;llms-SlangPy-full.txt&lt;/a&gt; - contains SlangPy related information. Use this for SlangPy related topics.&lt;/li&gt;
&lt;/ol&gt;

&lt;h2 id=&quot;usage&quot;&gt;Usage&lt;/h2&gt;

&lt;p&gt;You can download these and use them with Cursor, or simply include them as part of the context to get started. The SlangPy version of the file is maintained separately due to its distinct repository and focus on Python interfaces. These files are available for download and can be integrated with Cursor, or alternatively, included as contextual resources to facilitate initial setup.
Integrating llms-full.txt directly into the prompt significantly enhances the quality of the output.&lt;/p&gt;

&lt;p&gt;Below is an illustration of utilizing llms.txt within Cursor for the development of a Slang-based application. 
This demonstrates how users can access high-quality documentation concerning generics and gain insight into their implementation. Here we can see the following happen:&lt;/p&gt;

&lt;ol&gt;
  &lt;li&gt;The user provides llms-full.txt as part of the prompt and posts a query about how to use Slang features.&lt;/li&gt;
  &lt;li&gt;The LLM goes through the relevant documentation and finds answers on how generics and interfaces can help reduce code duplication.&lt;/li&gt;
  &lt;li&gt;The LLM then provides an implementation showing how this can be done.&lt;/li&gt;
  &lt;li&gt;Note: Given the current state of LLMs, output may not be 100% accurate, compilable code, so you will need to review it closely.&lt;/li&gt;
&lt;/ol&gt;

&lt;video width=&quot;480&quot; controls=&quot;&quot;&gt;
  &lt;source src=&quot;/images/posts/2025-08-01-slang-ai-assistant.webm&quot; type=&quot;video/webm&quot; /&gt;
  Your browser does not support the video tag.
&lt;/video&gt;

&lt;p&gt;We encourage you to try out this integration and experience firsthand how it can streamline your development workflow, 
and we welcome your feedback to help us further improve this valuable tool. Please start a discussion on our Slang Discord or file issues on our GitHub if you have suggestions.&lt;/p&gt;
</description>
        <pubDate>Fri, 01 Aug 2025 00:00:00 +0000</pubDate>
        <link>http://shader-slang.org/blog/2025/08/01/slang-ai-assistant/</link>
        <guid isPermaLink="true">http://shader-slang.org/blog/2025/08/01/slang-ai-assistant/</guid>
        
        <category>slang</category>
        
        
        <category>blog</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/posts/2025-08-01-slang-ai-assistant.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Getting Started with Slang: Modularization</title>
        <description>&lt;p&gt;Tired of wrestling with preprocessor macros and copy-pasting shader code? Discover how Slang’s modern module system can revolutionize your shader development workflow!&lt;/p&gt;

&lt;p&gt;Join us for an introductory talk where we’ll explore:&lt;/p&gt;

&lt;h4 id=&quot;modules--visibility-control&quot;&gt;Modules &amp;amp; Visibility Control&lt;/h4&gt;
&lt;ul&gt;
  &lt;li&gt;Learn how to organize your shader code into clean, reusable modules with proper access control (public, internal, private) that actually makes sense.&lt;br /&gt;
&lt;br /&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;h4 id=&quot;compilation-pipeline-magic&quot;&gt;Compilation Pipeline Magic&lt;/h4&gt;
&lt;ul&gt;
  &lt;li&gt;See how modules enable separate compilation, and how that translates to faster compile times when you need to compile many different shader variants specialized from the same source code.&lt;br /&gt;
&lt;br /&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;h4 id=&quot;specialization-without-preprocessor-pain&quot;&gt;Specialization Without Preprocessor Pain&lt;/h4&gt;
&lt;ul&gt;
  &lt;li&gt;Discover how Slang’s language features let you specialize shaders at link-time without the mess of #ifdef spaghetti. Write cleaner, more maintainable code that works synergistically with modules and separate compilation.&lt;br /&gt;
&lt;br /&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;h4 id=&quot;generics-vs-templates-the-slang-way&quot;&gt;Generics vs Templates: The Slang Way&lt;/h4&gt;
&lt;ul&gt;
  &lt;li&gt;Understand the difference between Slang’s type-safe generics and C++ templates, and why Slang generics are the key to enabling true separate compilation.&lt;br /&gt;
&lt;br /&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Whether you’re building a rendering engine or just want to write better shaders, this talk will show you how Slang modules can make your code more modular, faster to compile, and easier to maintain. Perfect for developers looking to move beyond traditional HLSL workflows!&lt;/p&gt;

&lt;h3 id=&quot;prerequisites&quot;&gt;Prerequisites&lt;/h3&gt;

&lt;p&gt;No prior Slang experience required - we’ll start from the basics and build up to practical examples you can use immediately.&lt;/p&gt;

&lt;h3 id=&quot;speaker&quot;&gt;Speaker&lt;/h3&gt;

&lt;ul&gt;
  &lt;li&gt;Yong He, NVIDIA&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;br /&gt;
&lt;a class=&quot;btn btn-primary&quot; href=&quot;https://khronosgroup.zoom.us/webinar/register/WN_a5cHAItHR9CQY27fYvv3gA&quot;&gt;Register&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The Khronos Group is dedicated to providing a harassment-free conference experience for everyone. Visit our &lt;a href=&quot;https://www.khronos.org/about/code-of-conduct&quot;&gt;Code of Conduct&lt;/a&gt; page to learn more.&lt;/p&gt;
</description>
        <pubDate>Mon, 28 Jul 2025 08:00:00 +0000</pubDate>
        <link>http://shader-slang.org/event/2025/07/28/getting-started-with-slang-modularization/</link>
        <guid isPermaLink="true">http://shader-slang.org/event/2025/07/28/getting-started-with-slang-modularization/</guid>
        
        <category>slang</category>
        
        
        <category>event</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/events/2025-07-getting-started-with-slang-modularization-thumbnail.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Neural Graphics: Speeding It Up with Wave Intrinsics</title>
        <description>&lt;p&gt;In our journey through neural graphics, we started with &lt;a href=&quot;https://shader-slang.org/blog/featured/2025/04/04/neural-gfx-in-an-afternoon/&quot;&gt;Neural Graphics in an Afternoon&lt;/a&gt;, exploring the exciting possibilities of representing and rendering scenes with machine learning approaches. We then delved into &lt;a href=&quot;https://shader-slang.org/blog/2025/04/30/neural-graphics-first-principles-performance/&quot;&gt;Neural Graphics: First Principles to Performance&lt;/a&gt;, laying down some initial strategies for making these techniques practical. Now, we’re ready to roll up our sleeves and explore more advanced performance optimizations, using our familiar 2D differentiable Gaussian splatting example as a testbed.&lt;/p&gt;

&lt;p&gt;Let’s look at the modifications added to a new example in the SlangPy samples repository, &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;balloted-splatting&lt;/code&gt;. This example starts with the same Python code as its predecessor, &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;diff-splatting&lt;/code&gt;, which we walked through in our previous blog post.&lt;/p&gt;

&lt;h2 id=&quot;recapping-the-2d-differentiable-gaussian-splatting-example&quot;&gt;Recapping the 2D Differentiable Gaussian Splatting Example&lt;/h2&gt;

&lt;p&gt;As a quick refresher, these examples implement a 2D Gaussian splatting algorithm. We represent a scene (or in this case, a 2D image) with a collection of 2D Gaussian “blobs,” each defined by parameters like center, covariance (shape/rotation), and color. We then render an image by splatting these Gaussians onto a canvas, and Slang’s automatic differentiation capabilities allow us to compute how the loss function (the difference between our rendered image and a target) changes with respect to each Gaussian’s parameters. This enables us to train the Gaussians to reconstruct a target image.&lt;/p&gt;

&lt;p&gt;The Python script (&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;main.py&lt;/code&gt;) driving this process is nearly identical between the two examples, with one key difference: the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;balloted-splatting&lt;/code&gt; example uses SlangPy’s ability to set a specific call group shape to explicitly match the wavefront size. For example, when kicking off the backward propagation of our loss calculation, we now call&lt;/p&gt;

&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;perPixelLoss&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;call_group_shape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Shape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;WORKGROUP_X&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;WORKGROUP_Y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;))).&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bwds&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;per_pixel_loss&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dispatch_ids&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;input_image&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This code uses the WORKGROUP_X and WORKGROUP_Y values to define the dispatch shape according to the available workgroup dimensions. We’ll be using wave intrinsics, which allow different threads within a single subgroup to share certain information and do calculations collaboratively, so we want to ensure that the work is organized into appropriately sized groups for our hardware to process. In general, the goal is to saturate all the available threads with work, so that none of the compute units are left idle.&lt;/p&gt;

&lt;p&gt;The number of threads available in a single subgroup can vary from one hardware architecture to another; for ease of explanation, this example uses a set of compile-time constants to define its dispatch size, and assumes only one subgroup per workgroup. If you wished to deploy code like this to different systems with different GPUs, you’d need to do some additional work to determine the correct dimensions at runtime. Additionally, using only a single subgroup for each workgroup has potential downsides: this code will be vulnerable to stalls where there are operations like memory reads which introduce latency. If multiple subgroups are being processed, GPUs are able to swap between them to make efficient use of their available cycles while waiting on operations to complete. For now, when running this example, you’ll want to take a moment to ensure that WORKGROUP_X and WORKGROUP_Y are set to values that, when multiplied together, give the subgroup size for your hardware. (On NVIDIA and AMD RDNA systems, this value is 32.)&lt;/p&gt;

&lt;p&gt;That said, most of the difference between the previous example and this one shows up in the Slang shader code itself (&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;diffsplatting2d.slang&lt;/code&gt; vs. &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;ballotsplatting2d.slang&lt;/code&gt;), specifically in how Gaussians are culled, sorted (or not), and rasterized.&lt;/p&gt;

&lt;h2 id=&quot;the-diff-splatting-approach-a-straightforward-staged-pipeline&quot;&gt;The &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;diff-splatting&lt;/code&gt; Approach: A Straightforward Staged Pipeline&lt;/h2&gt;

&lt;p&gt;The &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;diff-splatting&lt;/code&gt; example implements the rendering for each tile (a small patch of pixels processed by a GPU workgroup) through a multi-stage process within its main &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;splatBlobs&lt;/code&gt; Slang function:&lt;/p&gt;

&lt;ol&gt;
  &lt;li&gt;&lt;strong&gt;Coarse Rasterization (&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;coarseRasterize&lt;/code&gt;):&lt;/strong&gt; This initial stage identifies which Gaussians potentially affect the current tile. Indices of intersecting Gaussians are stored in &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;groupshared&lt;/code&gt; memory, using an &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;Atomic&amp;lt;uint&amp;gt;&lt;/code&gt; (&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;blobCountAT&lt;/code&gt;) to safely manage concurrent writes from multiple threads.&lt;/li&gt;
  &lt;li&gt;&lt;strong&gt;Padding (&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;padBuffer&lt;/code&gt;):&lt;/strong&gt; The shared list of blob indices is then padded.&lt;/li&gt;
  &lt;li&gt;&lt;strong&gt;Sorting (&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;bitonicSort&lt;/code&gt;):&lt;/strong&gt; A workgroup-level bitonic sort arranges the intersecting blob indices. This sorting ensures Gaussians are composited in the right order.&lt;/li&gt;
  &lt;li&gt;&lt;strong&gt;Fine Rasterization (&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;fineRasterize&lt;/code&gt;):&lt;/strong&gt; With a sorted list of relevant Gaussians, each pixel within the tile iterates through them. It evaluates each Gaussian’s contribution and blends it with the pixel’s current color. This function also has an associated custom backward pass (&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;fineRasterize_bwd&lt;/code&gt;) for the differentiation process, which “undoes” the blending operations to propagate gradients.&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;This staged pipeline is logical and relatively straightforward to follow. However, explicit multi-stage processing involving &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;groupshared&lt;/code&gt; memory and a full sort can introduce performance overhead and synchronization points.&lt;/p&gt;

&lt;h2 id=&quot;balloted-splatting-harnessing-gpu-wave-intrinsics&quot;&gt;&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;balloted-splatting&lt;/code&gt;: Harnessing GPU Wave Intrinsics&lt;/h2&gt;

&lt;p&gt;The &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;balloted-splatting&lt;/code&gt; example presents a more sophisticated and often more performant approach by leveraging &lt;strong&gt;wave intrinsics&lt;/strong&gt; (also known as subgroup operations in Vulkan, or warp-level primitives in CUDA). These are GPU hardware commands allowing threads within a small, fixed-size group (a “wave” or “subgroup,” typically 32 or 64 threads) to communicate and coordinate with very high efficiency.&lt;/p&gt;

&lt;p&gt;You can see this in action in the new &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;cullAndApplyBlobs&lt;/code&gt; function, which effectively replaces the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;coarseRasterize&lt;/code&gt;, &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;padBuffer&lt;/code&gt;, &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;bitonicSort&lt;/code&gt;, and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;fineRasterize&lt;/code&gt; sequence from the previous example.&lt;/p&gt;

&lt;div class=&quot;language-hlsl highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;cm&quot;&gt;/*
 * cullAndApplyBlobs finds blobs which intersect the current tile and evaluates them in a single pass using
 * wave intrinsics.
 *
 * This uses the multiplicative alpha blending algorithm laid out in the original GS paper (https://repo-sam.inria.fr/fungraph/3d-gaussian-splatting/)
 * This is represented as a &apos;state transition&apos; (transformPixelState) as we go through the blobs in order, so that we can
 * concisely represent the &apos;state undo&apos; operation in the custom backwards pass (fineRasterize_bwd).
 *
 * In Slang, custom derivative functions can be defined using the `[BackwardDerivative(custom_fn)]` attribute.
 */&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;BackwardDerivative&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;fineRasterize_bwd&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)]&lt;/span&gt;
&lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;cullAndApplyBlobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Blobs&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;OBB&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tileBounds&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;uint&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;localIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;no_diff&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float2&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;uv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;PixelState&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pixelState&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;PixelState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;uint&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;numIntersectingBlobs&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
    
    &lt;span class=&quot;c1&quot;&gt;// Traverse the list in workgroup-sized chunks. Each lane in the workgroup/wave will be responsible for&lt;/span&gt;
    &lt;span class=&quot;c1&quot;&gt;// determining if one gaussian in the chunk intersects the current tile.&lt;/span&gt;
    &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uint&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;wgStart&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;numGaussians&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;Gaussian2D&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;count&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;wgStart&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;numGaussians&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;wgStart&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;WG_SIZE&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
    &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
        &lt;span class=&quot;c1&quot;&gt;// lane 0 will load the blob represented at position wgStart, and other lanes will get the subsequent blobs&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;Gaussian2D&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;coarseBlob&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;Gaussian2D&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;load&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;wgStart&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;localIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;bool&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;intersects&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;coarseBlob&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bounds&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;().&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;intersects&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tileBounds&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;

        &lt;span class=&quot;c1&quot;&gt;// All lanes write to the ballot bitmask to indicate whether intersection is true;&lt;/span&gt;
        &lt;span class=&quot;c1&quot;&gt;// so all lanes will have the same value for intersectionMask&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;uint&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;intersectionMask&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;WaveActiveBallot&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;intersects&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;).&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;

        &lt;span class=&quot;k&quot;&gt;while&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;intersectionMask&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;!=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
        &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
            &lt;span class=&quot;c1&quot;&gt;// identify the next lane with intersects == true in this chunk&lt;/span&gt;
            &lt;span class=&quot;n&quot;&gt;uint&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;idxInChunk&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;firstbitlow&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;intersectionMask&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
            &lt;span class=&quot;n&quot;&gt;uint16_t&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobIdx&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;wgStart&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;idxInChunk&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;// then get the index for that blob&lt;/span&gt;
         
            &lt;span class=&quot;n&quot;&gt;intersectionMask&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;&amp;amp;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;intersectionMask&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;// remove the least significant 1 bit from the mask&lt;/span&gt;

            &lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobEval&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;eval&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;uv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
            &lt;span class=&quot;n&quot;&gt;pixelState&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;transformPixelState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;pixelState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobEval&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;

            &lt;span class=&quot;n&quot;&gt;intersectingBlobList&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;min&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;numIntersectingBlobs&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;++&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;GAUSSIANS_PER_BLOCK&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)]&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
        &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;

        &lt;span class=&quot;c1&quot;&gt;// if ALL the blobs processed in this chunk are below the alpha threshold,&lt;/span&gt;
        &lt;span class=&quot;c1&quot;&gt;// stop processing blobs.&lt;/span&gt;
        &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;WaveActiveAllTrue&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;pixelState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;value&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;a&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;/&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;255&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;))&lt;/span&gt;
        &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
            &lt;span class=&quot;k&quot;&gt;break&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
        &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
    &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;

    &lt;span class=&quot;n&quot;&gt;intersectingBlobCount&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;numIntersectingBlobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;maxCount&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;localIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pixelState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;finalCount&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;finalVal&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;localIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pixelState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;value&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
    &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pixelState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;value&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;One thing to note here is that wave intrinsics like WaveActiveBallot are not universally supported by all combinations of graphics hardware and API. Under the hood, Slang keeps track of what capabilities are required in order to use optional features, and it will provide a warning if you attempt to compile for a profile that can’t support the necessary capabilities. For example, if you were to compile this shader with ‘-profile sm_5_0’, you’d get this warning:&lt;/p&gt;

&lt;div class=&quot;language-plaintext highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;myshader.slang(9): warning 41012: entry point &apos;computeMain&apos; uses additional capabilities that are not part of the specified profile &apos;sm_5_0&apos;. The profile setting is automatically updated to include these capabilities: &apos;sm_6_0&apos;  
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;So how does this shader use wave intrinsics?&lt;/p&gt;

&lt;p&gt;Instead of a multi-pass approach—first identifying intersecting blobs for the current tile, sorting them, and then calculating colors from the shorter list of blobs—we’re now using a single pass through the set of Gaussians to process them all, in workgroup-sized chunks. Within each chunk, each lane (a thread within the wave) is assigned a single Gaussian, and tests whether it intersects the current tile bounds. The crucial improvement here is the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;WaveActiveBallot(intersects).x&lt;/code&gt; call. This takes the boolean intersection result from each active lane in the wave, and creates a bitmask. All of the lanes in the wave can access the bitmask, and can therefore understand which Gaussians in the chunk being processed are relevant. The code then iterates through the set bits of this mask, which we’ve called &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;intersectionMask&lt;/code&gt;. For each intersecting Gaussian, its contribution is evaluated and immediately alpha-blended. We still store the indices for the intersecting blobs, because we will still need them during the custom backward pass.&lt;br /&gt;
One benefit of this approach is that we no longer need to do an explicit workgroup-wide sort. Because we keep the blobs in order during processing, we maintain the needed order for alpha blending. Additionally, we no longer need to use an atomic counter– and thereby introduce the possibility of contention– when we increment the number of intersecting blobs and write the index to the blob list. This might look problematic at first glance, because all of the lanes are writing to the same &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;intersectingBlobList&lt;/code&gt; in shared memory. But we don’t need to worry about data collisions here because of how we’re coming up with this data. Each lane has its own copy of numIntersectingBlobs, so that variable does not need to be atomically incremented. And each lane also will be operating on the same value in &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;intersectionMask&lt;/code&gt;, calculated using &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;WaveActiveBallot&lt;/code&gt;. For this reason, all lanes are storing the same indices in the same order into &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;intersectingBlobList&lt;/code&gt;, so while technically this is a data race, it’s a benign one.&lt;br /&gt;
We’ve also changed the type for a couple of our storage parameters: &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;intersectingBlobList&lt;/code&gt; and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;maxCount&lt;/code&gt; have both been changed from &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;uint&lt;/code&gt; to &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;uint16_t&lt;/code&gt;, which reduces their memory footprint in &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;groupshared&lt;/code&gt; memory. As we noted in the previous post, workgroup shared memory is very small. One potential side effect of requesting very large amounts of shared memory for a workgroup is that fewer workgroups can be scheduled simultaneously on a single unit. This is inefficient, because that means that a chunk of the available compute hardware will sit idle.&lt;/p&gt;

&lt;h2 id=&quot;the-performance-payoff&quot;&gt;The Performance Payoff&lt;/h2&gt;

&lt;p&gt;Why undertake this refactoring? The shift to a wave intrinsic-based approach in &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;balloted-splatting&lt;/code&gt; is squarely aimed at &lt;strong&gt;improving performance and efficiency&lt;/strong&gt;:&lt;/p&gt;

&lt;ul&gt;
  &lt;li&gt;&lt;strong&gt;Reduced Synchronization Overhead:&lt;/strong&gt; Wave operations are generally tightly coupled with the hardware and can involve less synchronization overhead than operations requiring coordination across an entire workgroup using shared memory.&lt;/li&gt;
  &lt;li&gt;&lt;strong&gt;Eliminating the Bottleneck of Sorting:&lt;/strong&gt; Explicitly sorting data in shared memory (like the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;bitonicSort&lt;/code&gt;) is computationally intensive and can be a significant performance bottleneck. The ballot-based approach sidesteps this.&lt;/li&gt;
  &lt;li&gt;&lt;strong&gt;Better Hardware Utilization:&lt;/strong&gt; Wave intrinsics are designed to map directly onto efficient GPU hardware pathways, allowing for faster execution of tasks like voting (balloting), data exchanges (shuffling), and other coordinated operations within a subgroup.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;This performance benefit is easily observable when running the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;diff-splatting&lt;/code&gt; and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;balloted-splatting&lt;/code&gt; examples side-by-side. On my Windows 11 system, equipped with an RTX 5090, the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;diff-splatting&lt;/code&gt; example takes 47 seconds to complete 10000 iterations, averaging 211 iterations per second. &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;balloted-splatting&lt;/code&gt; completes the same number of iterations in 37 seconds, a 21% reduction in execution time, and averages 266.4 iterations per second. Similarly, on an integrated GPU, the execution time drops from around 1 hour 20 minutes for &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;diff-splatting&lt;/code&gt; to 1 hour and 6 minutes for &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;balloted-splatting&lt;/code&gt;.&lt;/p&gt;

&lt;h2 id=&quot;looking-ahead&quot;&gt;Looking Ahead&lt;/h2&gt;

&lt;p&gt;The evolution from &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;diff-splatting&lt;/code&gt; to &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;balloted-splatting&lt;/code&gt; demonstrates how subgroup-specific techniques like WaveActiveBallot can provide significant performance benefits by reducing duplicate work, and allowing simultaneously executing threads to work collaboratively. That is, the same compute optimization techniques already available to traditional graphics can also be a great benefit to neural graphics approaches. 
The examples we’ve explored here are just the beginning—there’s a rich landscape of GPU-specific techniques waiting to be applied to neural rendering pipelines, and Slang provides a powerful foundation for exploring them.&lt;/p&gt;
</description>
        <pubDate>Thu, 17 Jul 2025 00:00:00 +0000</pubDate>
        <link>http://shader-slang.org/blog/2025/07/17/ng-wave-intrinsics/</link>
        <guid isPermaLink="true">http://shader-slang.org/blog/2025/07/17/ng-wave-intrinsics/</guid>
        
        <category>slang</category>
        
        
        <category>blog</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/posts/wave-graphic.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Getting Started with Slang: Reflections API</title>
        <description>
</description>
        <pubDate>Tue, 08 Jul 2025 08:00:00 +0000</pubDate>
        <link>http://shader-slang.org/video/2025/07/08/getting-started-with-slang-reflections-api/</link>
        <guid isPermaLink="true">http://shader-slang.org/video/2025/07/08/getting-started-with-slang-reflections-api/</guid>
        
        <category>gaussian</category>
        
        <category>splat</category>
        
        <category>api</category>
        
        
        <category>video</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/events/2025-06-04-getting-started-with-slang-reflectsion-api-video.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Slang at HPG 2025: Bridging Graphics and Machine Learning with Automatic Differentiation</title>
        <description>&lt;p&gt;We’re excited to announce that Slang will be presented at &lt;a href=&quot;https://www.highperformancegraphics.org/2025/index.html&quot;&gt;&lt;strong&gt;High-Performance Graphics (HPG) 2025 in Copenhagen&lt;/strong&gt;&lt;/a&gt;! Our presentation, part of the Hot 3D track, will demonstrate how Slang’s automatic differentiation capabilities are making neural graphics techniques more accessible to real-time graphics developers.&lt;/p&gt;

&lt;h2 id=&quot;why-this-matters&quot;&gt;Why This Matters&lt;/h2&gt;

&lt;p&gt;Neural techniques like Gaussian splatting and learned materials are delivering unprecedented visual quality, but implementing them has traditionally meant juggling separate codebases in ML frameworks and graphics APIs. Slang changes this by bringing automatic differentiation directly to shader programming, enabling developers to implement gradient-based optimization without leaving their familiar graphics environment.&lt;/p&gt;

&lt;h2 id=&quot;join-us-at-hpg-2025&quot;&gt;Join Us at HPG 2025&lt;/h2&gt;

&lt;p&gt;For the complete conference schedule, visit &lt;a href=&quot;https://highperformancegraphics.org/2025/program/schedule/&quot;&gt;https://highperformancegraphics.org/2025/program/schedule/&lt;/a&gt;.&lt;/p&gt;

&lt;h2 id=&quot;cant-make-it&quot;&gt;Can’t Make It?&lt;/h2&gt;

&lt;p&gt;Stay connected with Slang’s neural graphics developments:&lt;/p&gt;

&lt;ul&gt;
  &lt;li&gt;Try our &lt;a href=&quot;https://github.com/shader-slang/slangpy-samples/tree/main/experiments/diff-splatting&quot;&gt;2D Gaussian Splatting examples&lt;/a&gt;&lt;/li&gt;
  &lt;li&gt;Experiment with &lt;a href=&quot;https://shader-slang.org/slang-playground/?demo=gsplat2d-diff&quot;&gt;Slang in your browser&lt;/a&gt;&lt;/li&gt;
  &lt;li&gt;Join us on the &lt;a href=&quot;https://khr.io/slangdiscord&quot;&gt;Slang Discord&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;We’re looking forward to demonstrating how Slang is making neural graphics more accessible to everyone. See you at HPG 2025!&lt;/p&gt;
</description>
        <pubDate>Mon, 16 Jun 2025 17:00:00 +0000</pubDate>
        <link>http://shader-slang.org/blog/2025/06/16/slang-at-hpg/</link>
        <guid isPermaLink="true">http://shader-slang.org/blog/2025/06/16/slang-at-hpg/</guid>
        
        <category>slang</category>
        
        
        <category>blog</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/posts/2025-06-16-hpg2025.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Getting Started with Slang: Reflections API</title>
        <description>&lt;p&gt;We know it: many developers that are just getting started with Slang find themselves daunted by the reflection API. During this meet-up we will explain the underlying mental model for how Slang reflects shader code and show specific code examples for how to use the reflection API to access the information that developers most often need. Along the way we will call out caveats and pitfalls that we’ve seen bite developers before.&lt;/p&gt;

&lt;h3 id=&quot;prerequisites&quot;&gt;Prerequisites&lt;/h3&gt;

&lt;p&gt;This will be a practical nuts-and-bolts presentation that assumes attendees already have experience with GPU shader programming and the Vulkan API. Prior experience with the Slang reflection API and the features of the Slang language will be beneficial but is not required.&lt;/p&gt;

&lt;h3 id=&quot;speaker&quot;&gt;Speaker&lt;/h3&gt;

&lt;ul&gt;
  &lt;li&gt;Theresa Foley, NVIDIA&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;br /&gt;
&lt;a class=&quot;btn btn-primary&quot; href=&quot;https://khronosgroup.zoom.us/webinar/register/WN_ZLgU3umkSj6OvIXG_6Ds8A&quot;&gt;Register&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The Khronos Group is dedicated to providing a harassment-free conference experience for everyone. Visit our &lt;a href=&quot;https://www.khronos.org/about/code-of-conduct&quot;&gt;Code of Conduct&lt;/a&gt; page to learn more.&lt;/p&gt;
</description>
        <pubDate>Wed, 04 Jun 2025 08:00:00 +0000</pubDate>
        <link>http://shader-slang.org/event/2025/06/04/getting-started-with-slang-reflections-api/</link>
        <guid isPermaLink="true">http://shader-slang.org/event/2025/06/04/getting-started-with-slang-reflections-api/</guid>
        
        <category>slang</category>
        
        
        <category>event</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/events/2025-07-slang-reflections-meetup-thumbnail.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Getting Started with Slang: Draw Your First Splat</title>
        <description>
</description>
        <pubDate>Tue, 20 May 2025 08:00:00 +0000</pubDate>
        <link>http://shader-slang.org/video/2025/05/20/getting-started-with-slang-draw-your-first-splat/</link>
        <guid isPermaLink="true">http://shader-slang.org/video/2025/05/20/getting-started-with-slang-draw-your-first-splat/</guid>
        
        <category>gaussian</category>
        
        <category>splat</category>
        
        <category>neural</category>
        
        
        <category>video</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/events/2025-05-20-getting-started-with-slang-draw-your-first-splat.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Getting Started with Slang: Draw Your First Splat</title>
        <description>&lt;p&gt;Join us for an engaging hands-on session where we’ll walk through creating your first Gaussian splat using Slang. Perfect for graphics programmers interested in getting started with neural techniques, this hands-on introduction will take you from installation through basic shader compilation to rendering your first splat.&lt;/p&gt;

&lt;h2 id=&quot;well-cover-everything-you-need-to-get-started&quot;&gt;We’ll cover everything you need to get started&lt;/h2&gt;

&lt;ul&gt;
  &lt;li&gt;Setting up Slang for development&lt;/li&gt;
  &lt;li&gt;Understanding the basics of Gaussian splats&lt;/li&gt;
  &lt;li&gt;Writing and compiling your first Slang shader&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;br /&gt;
This beginner-friendly session includes step-by-step code examples and demonstrations, with time for Q&amp;amp;A. By the end, you’ll understand how to use Slang and be ready to explore the training process.&lt;/p&gt;

&lt;h2 id=&quot;prerequisites&quot;&gt;Prerequisites&lt;/h2&gt;

&lt;ul&gt;
  &lt;li&gt;Basic understanding of computer graphics concepts&lt;/li&gt;
  &lt;li&gt;Familiarity with shader or kernel programming languages (HLSL, GLSL, etc)&lt;/li&gt;
  &lt;li&gt;No prior Slang or Gaussian splatting experience required&lt;/li&gt;
  &lt;li&gt;Optional: Have Slang compiler installed if you want to follow along&lt;/li&gt;
&lt;/ul&gt;

&lt;h2 id=&quot;speakers&quot;&gt;Speakers&lt;/h2&gt;

&lt;ul&gt;
  &lt;li&gt;Shannon Woods, Slang Working Group Chair. NVIDIA.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;br /&gt;
&lt;a class=&quot;btn btn-primary&quot; href=&quot;https://khronosgroup.zoom.us/webinar/register/WN_H57vtAYZS3OSZwWXxkJ7uw&quot;&gt;Register&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The Khronos Group is dedicated to providing a harassment-free conference experience for everyone. Visit our &lt;a href=&quot;https://www.khronos.org/about/code-of-conduct&quot;&gt;Code of Conduct&lt;/a&gt; page to learn more.&lt;/p&gt;
</description>
        <pubDate>Wed, 30 Apr 2025 17:00:00 +0000</pubDate>
        <link>http://shader-slang.org/event/2025/04/30/getting-started-with-slang/</link>
        <guid isPermaLink="true">http://shader-slang.org/event/2025/04/30/getting-started-with-slang/</guid>
        
        <category>slang</category>
        
        
        <category>event</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/events/2025-05-gaussian-splat-meetup-image.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Neural Graphics: From First Principles to Performance</title>
        <description>&lt;p&gt;In my &lt;a href=&quot;https://shader-slang.org/blog/2025/04/04/neural-gfx-in-an-afternoon/&quot;&gt;last blog post&lt;/a&gt;, I gave an introduction to how gradient descent is used to drive gaussian splatting representations– essentially, going through a list of blobs in 2D space, calculating their color values at a specific texture coordinate, and blending them together, and iteratively adjusting them to be as close as possible to an ideal target image. Notably, this simplified version had significant performance and quality limitations. In this post, I’ll take you through the changes needed to go from that simple pedagogical example to an implementation that achieves real-time performance.&lt;/p&gt;

&lt;h2 id=&quot;a-more-efficient-algorithm&quot;&gt;A More Efficient Algorithm&lt;/h2&gt;

&lt;p&gt;The key limitation of our previous implementation was that it evaluated every gaussian blob for every pixel. This is extremely inefficient since most gaussians only affect a small region of pixels. We can do much better by breaking this into three logical steps:&lt;/p&gt;

&lt;ol&gt;
  &lt;li&gt;For each gaussian, determine which tiles of the image it will affect (coarse rasterization)&lt;/li&gt;
  &lt;li&gt;For each tile, sort its affecting gaussians back-to-front&lt;/li&gt;
  &lt;li&gt;For each pixel in the tile, accumulate the color contributions only from gaussians that affect that tile (fine rasterization)&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;While we could implement these as separate compute kernels, we can achieve better performance by combining them into a single kernel using workgroup-level collaboration between threads. Let’s look at how this works.&lt;/p&gt;

&lt;h2 id=&quot;tile-based-rasterization-with-workgroups&quot;&gt;Tile-based Rasterization with Workgroups&lt;/h2&gt;

&lt;p&gt;The core optimization in our approach is to divide the image into tiles, where each tile is processed by a compute workgroup - a collection of threads that execute together and can share data efficiently. This allows us to:&lt;/p&gt;

&lt;ol&gt;
  &lt;li&gt;Build a per-tile shortlist of only the gaussians that affect that tile&lt;/li&gt;
  &lt;li&gt;Sort just those gaussians that affect the tile&lt;/li&gt;
  &lt;li&gt;Process pixels within the tile using only the relevant gaussians&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;A workgroup is a collection of threads that execute simultaneously on the same compute unit. These threads can collaborate using special operations and share data quickly through a small amount of on-chip memory. We’ll use this capability to efficiently build and process our per-tile gaussian lists.&lt;/p&gt;

&lt;p&gt;Here’s how the implementation works in Slang:&lt;/p&gt;

&lt;div class=&quot;language-hlsl highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;c1&quot;&gt;// ----- Constants and definitions --------&lt;/span&gt;

&lt;span class=&quot;k&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;const&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;int&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;GAUSSIANS_PER_BLOCK&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;512&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
&lt;span class=&quot;k&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;const&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;int&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;WG_X&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;8&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
&lt;span class=&quot;k&quot;&gt;static&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;const&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;int&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;WG_Y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;

&lt;span class=&quot;c1&quot;&gt;// -----------------------------------------&lt;/span&gt;

&lt;span class=&quot;c1&quot;&gt;// Some types to hold state info on the &apos;blobs&apos; buffer.&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;// This makes it easy to make sure we&apos;re not accidentally using the buffer&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;// in the wrong state.&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;//&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;// The actual data is in the &apos;blobs&apos; object.&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;//&lt;/span&gt;
&lt;span class=&quot;k&quot;&gt;struct&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;InitializedShortList&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;int&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_dummy&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;};&lt;/span&gt;
&lt;span class=&quot;k&quot;&gt;struct&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;FilledShortList&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;int&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_dummy&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;};&lt;/span&gt;
&lt;span class=&quot;k&quot;&gt;struct&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;PaddedShortList&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;int&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_dummy&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;};&lt;/span&gt;
&lt;span class=&quot;k&quot;&gt;struct&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;SortedShortList&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;int&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_dummy&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;};&lt;/span&gt;

&lt;span class=&quot;cm&quot;&gt;/*
 * coarseRasterize() calculates a subset of blobs that intersect with the current tile. Expects the blob counters to be reset before calling.
 *
 * The coarse rasterization step determines a subset of blobs that intersect with the tile.
 * Each thread in the workgroup takes a subset of blobs and uses bounding-box intersection tests to determine
 * if the tile associated with this workgroup overlaps with the blob&apos;s bounds.
 *
 * Note: This is a simplistic implementation, so there is a limit to the number of blobs in the short-list (GAUSSIANS_PER_BLOCK).
 * In practice, if the number of blobs per tile exceeds this, GAUSSIANS_PER_BLOCK must be increased manually.
 * A more sophisticated implementation would perform multiple passes to handle this case.
 *
 */&lt;/span&gt;
&lt;span class=&quot;n&quot;&gt;FilledShortList&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;coarseRasterize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;InitializedShortList&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;sList&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;Blobs&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobset&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;OBB&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;tileBounds&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;uint&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;localIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
    &lt;span class=&quot;nb&quot;&gt;GroupMemoryBarrierWithGroupSync&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;();&lt;/span&gt;

    &lt;span class=&quot;n&quot;&gt;Gaussian2D&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gaussian&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;uint&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;numGaussians&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;Gaussian2D&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;count&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blobset&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
    &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uint&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;localIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;numGaussians&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;WG_X&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;WG_Y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;))&lt;/span&gt;
    &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;gaussian&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;Gaussian2D&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;load&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blobset&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;localIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;OBB&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;bounds&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gaussian&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bounds&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;();&lt;/span&gt;
        &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bounds&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;intersects&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tileBounds&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;))&lt;/span&gt;
        &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
            &lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blobCountAT&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;++&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
        &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
    &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;

    &lt;span class=&quot;nb&quot;&gt;GroupMemoryBarrierWithGroupSync&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;();&lt;/span&gt;

    &lt;span class=&quot;n&quot;&gt;blobCount&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobCountAT&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;load&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;();&lt;/span&gt;

    &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;};&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Up at the top of this block, we define a few constants. &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;WG_X&lt;/code&gt; and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;WG_Y&lt;/code&gt; describe the dimensions of our workgroup– we’re going to process blocks 8 pixels wide, and 4 pixels tall. These dimensions are chosen because most GPUs can execute 32 or 64 threads simultaneously. The maximum number of blobs we’ll add to the short list for each workgroup is set somewhat arbitrarily – we found 512 was a threshold that gave a good balance between performance and image quality.&lt;/p&gt;

&lt;p&gt;You’ll also see that the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;coarseRasterize&lt;/code&gt; function takes an &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;InitializedShortList&lt;/code&gt; parameter, but doesn’t appear to do anything with it. That’s because this implementation uses a set of sentinel struct types to enforce the correct ordering of the steps in the rasterization algorithm – essentially, this helps us catch bugs at compile time rather than runtime. It doesn’t affect how our gaussian splatting implementation works, so I won’t go deeper into it here.&lt;/p&gt;

&lt;p&gt;Inside the function, the first thing that we do is to call &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;GroupMemoryBarrierWithGroupSync&lt;/code&gt;. This is a memory barrier, which tells the GPU to wait here until all of the load and store operations being done by this workgroup on its shared memory have completed. This is important to avoid data races. The first barrier ensures that no threads start writing into the short list (the state that the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;InitializedShortList&lt;/code&gt; parameter stands for) until the calling function has finished initializing it, while the barrier at the end of the function makes sure that all the threads have finished adding blobs to the list before the final count is retrieved with &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;blobCount = blobCountAT.load();&lt;/code&gt;.&lt;/p&gt;

&lt;p&gt;Then, we begin building our short list of relevant gaussian blobs. We want to split this work up across all of our workgroup threads, so that we can process the list of blobs efficiently and without duplicated work. The way we do this is that we ask each thread to start by accessing the blob in the global list that corresponds to its local thread index– an identifier that, similar to the texture coordinates we generated in the simpler example using SlangPy’s &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;grid&lt;/code&gt; generator, allows each thread to understand where it sits within the workgroup. In this case, we’re using a 1-dimensional dispatch shape, and will need to do some math later to figure out what pixel we’re calculating within the image. I’ll explain that in more detail when we get there. Each thread will check its assigned first blob, and then skip ahead by the number of threads in the workgroup. So, with our WG_X (8) and WG_Y (4) describing a total thread group size of 32, we have threads examining blobs at that stride:
&lt;br /&gt;Thread 0 checks blobs 0, 32, 64, …&lt;br /&gt;
Thread 1 checks blobs 1, 33, 65, …&lt;br /&gt;
And so on…&lt;/p&gt;

&lt;p&gt;Whenever a blob is identified that intersects with the current tile, it’s added to the shortlist using this line:&lt;/p&gt;
&lt;div class=&quot;language-hlsl highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;        &lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blobCountAT&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;++&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The shortlist &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;blobs&lt;/code&gt; and its index incrementor &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;blobCountAT&lt;/code&gt; didn’t appear in the excerpt above – that’s because they’re using workgroup shared memory, so they’re declared a bit differently, like this:&lt;/p&gt;

&lt;div class=&quot;language-hlsl highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;c1&quot;&gt;// ----- Shared memory declarations --------&lt;/span&gt;

&lt;span class=&quot;c1&quot;&gt;// Note: In Slang, the &apos;groupshared&apos; identifier is used to define&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;// workgroup-level shared memory. This is equivalent to &apos;__shared__&apos; in CUDA&lt;/span&gt;

&lt;span class=&quot;c1&quot;&gt;// blobCountAT is used when storing blob IDs into the blobs buffer. It needs to be atomic&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;// since multiple threads will be in contention to increment it.&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;//&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;// Atomic&amp;lt;T&amp;gt; is the most portable way to express atomic operations. Slang supports basic&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;// operations like +, -, ++, etc. on Atomic&amp;lt;T&amp;gt; types.&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;//&lt;/span&gt;
&lt;span class=&quot;k&quot;&gt;groupshared&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;Atomic&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uint&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;gt;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobCountAT&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;

&lt;span class=&quot;c1&quot;&gt;// This is used after the coarse rasterization step as a non-atomic&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;// location to store the blob count, since atomics are not necessary after the coarse&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;// rasterization step.&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;//&lt;/span&gt;
&lt;span class=&quot;k&quot;&gt;groupshared&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;uint&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobCount&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;

&lt;span class=&quot;c1&quot;&gt;// The blobs buffer is used to store the indices of the blobs that intersect&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;// with the current tile.&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;//&lt;/span&gt;
&lt;span class=&quot;k&quot;&gt;groupshared&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;uint&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;GAUSSIANS_PER_BLOCK&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;];&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Using the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;groupshared&lt;/code&gt; identifier tells Slang that these variables need to be in the fast local memory available to all the threads in a workgroup. This shared memory space is much faster to access than global GPU memory, but it’s very limited in space– sometimes only on the order of tens of kilobytes.&lt;/p&gt;

&lt;p&gt;Importantly, we declare the index incrementor, &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;blobCountAT&lt;/code&gt;, to be atomic– this ensures that only one thread has access to read or write the variable at a time, preventing multiple threads from trying to simultaneously increment it.&lt;/p&gt;

&lt;p&gt;After the threads in the workgroup finish iterating across the full list of blobs to identify the ones relevant to the current tile, we issue another &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;GroupMemoryBarrierWithGroupSync&lt;/code&gt; to make sure all the threads in the workgroup finish, before finally writing out the final count of blobs in our shortlist to the non-atomic &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;blobCount&lt;/code&gt; variable.&lt;/p&gt;

&lt;p&gt;When &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;coarseRasterize()&lt;/code&gt; completes, we have a list of just the gaussian blobs which affect the current tile, so we need not iterate through the entire list for each pixel. Because we’re no longer operating on the full list of gaussians, we can no longer assume that the list stays sorted in the back-to-front order needed for alpha blending. Because we built the short list in individual workgroup threads, we don’t know what order they were added to the list using &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;blobs[blobCountAT++] = i;&lt;/code&gt;, so we will need to take an additional step for sorting them, which is done with &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;bitonicSort&lt;/code&gt;, a sorting algorithm which makes similar use of workgroup shared memory to allow the workgroup threads to collaboratively sort the list.&lt;/p&gt;

&lt;p&gt;Now that we have created a reduced list of gaussians to evaluate per-pixel, our rasterization could potentially be much faster! But you may have noticed that the maximum number of &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;GAUSSIANS_PER_BLOCK&lt;/code&gt; is defined as 512– more than twice as many as the total list of gaussian blobs we used in our simplified example. Is that going to be a problem?&lt;/p&gt;

&lt;h2 id=&quot;differential-propagation-and-intermediate-value-storage&quot;&gt;Differential Propagation and Intermediate Value Storage&lt;/h2&gt;

&lt;p&gt;There’s a second problem that was causing poor performance in our simplified example, but it’s less obvious, because it’s a byproduct of Slang doing derivative propagation for you.&lt;/p&gt;

&lt;p&gt;Looking back at the simplified implementation, the differentiable function we used to calculate blob colors was &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;simpleSplatBlobs()&lt;/code&gt;:&lt;/p&gt;

&lt;div class=&quot;language-hlsl highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;cm&quot;&gt;/* simpleSplatBlobs() is a naive implementation of the computation of color for a pixel.
 * It will iterate over all of the Gaussians for each pixel, to determine their contributions
 * to the pixel color, so this will become prohibitively slow even with a very small number of
 * blobs, but it reduces the number of steps involved in determining the pixel color.
 */&lt;/span&gt;
 &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Differentiable&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;
 &lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;simpleSplatBlobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;GradInOutTensor&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;gt;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobsBuffer&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;uint2&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pixelCoord&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;int2&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;texSize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
 &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;Blobs&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blobsBuffer&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;};&lt;/span&gt;

    &lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;result&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;};&lt;/span&gt;
    &lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobColor&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;};&lt;/span&gt;

    &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uint&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;SIMPLE_BLOBCOUNT&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;++&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
    &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;Gaussian2D&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gaussian&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;Gaussian2D&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;load&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;blobColor&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gaussian&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;eval&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;pixelCoord&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;/&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;texSize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;));&lt;/span&gt;
        
        &lt;span class=&quot;n&quot;&gt;result&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;alphaBlend&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;result&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobColor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
        &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;result&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;a&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;/&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;255&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
            &lt;span class=&quot;k&quot;&gt;continue&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
    &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;

    &lt;span class=&quot;c1&quot;&gt;// Blend with background&lt;/span&gt;
    &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;result&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rgb&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;result&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;a&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;result&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;a&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;Because this function is differentiable, we need to be able to propagate its variables’ values through a chain of derivatives – that is, we need to know the values of &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;result&lt;/code&gt;, &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;blobColor&lt;/code&gt;, and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;gaussian&lt;/code&gt; at each step through the list of blobs, for every pixel we calculate. Storing all of that information is costly – especially because we’re doing these calculations on the GPU, and very little memory is available to us without significant latency in accessing it.&lt;/p&gt;

&lt;p&gt;We can avoid needing to do all of this storage of intermediate values if, instead, we provide a way for Slang to recalculate the values as it progresses through the backward propagation. To do this, we provide a user-defined backwards form for part of our rasterization algorithm.&lt;/p&gt;

&lt;div class=&quot;language-hlsl highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;cm&quot;&gt;/*
 * fineRasterize() produces the per-pixel final color from a sorted list of blobs that overlap the current tile.
 *
 * The fine rasterization is where the calculation of the per-pixel color happens.
 * This uses the multiplicative alpha blending algorithm laid out in the original GS paper (https://repo-sam.inria.fr/fungraph/3d-gaussian-splatting/)
 * This is represented as a &apos;state transition&apos; (transformPixelState) as we go through the blobs in order, so that we can
 * concisely represent the &apos;state undo&apos; operation in the backwards pass.
 *
 * In Slang, custom derivative functions can be defined using the `[BackwardDerivative(custom_fn)]` attribute.
 */&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;BackwardDerivative&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;fineRasterize_bwd&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)]&lt;/span&gt;
&lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;fineRasterize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;SortedShortList&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;Blobs&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobset&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;uint&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;localIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;no_diff&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float2&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;uv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
    &lt;span class=&quot;nb&quot;&gt;GroupMemoryBarrierWithGroupSync&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;();&lt;/span&gt;

    &lt;span class=&quot;n&quot;&gt;PixelState&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pixelState&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;PixelState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;uint&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;count&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobCount&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
    &lt;span class=&quot;c1&quot;&gt;// The forward rasterization&lt;/span&gt;
    &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uint&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;count&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;++&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;pixelState&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;transformPixelState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;pixelState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;eval&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blobset&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;],&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;uv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;localIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;));&lt;/span&gt;

    &lt;span class=&quot;n&quot;&gt;maxCount&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;localIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pixelState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;finalCount&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;finalVal&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;localIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pixelState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;value&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
    &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pixelState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;value&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;

&lt;span class=&quot;cm&quot;&gt;/*
 * fineRasterize_bwd() is the user-provided backwards pass for the fine rasterization step.
 *
 * This is implemented as a custom derivative function because applying auto-diff directly to a function
 * with a loop can result in excessive state caching (a necessary part of standard automatic differentiation methods).
 *
 * For Gaussian splatting, there is a &apos;state undo&apos; (undoPixelState) operation available. fineRasterize_bwd takes advantage of this
 * to recreate the states at each step of the forward pass instead of letting auto-diff store them.
 *
 * While it is important to represent the backwards loop explicitly in this way, the contents of the loop body (loading, evaluation,
 * blending, etc.) can still be differentiated automatically (and it would be tedious to do so manually).
 *
 * The loop body therefore invokes `bwd_diff` to backprop the derivatives via auto-diff.
 */&lt;/span&gt;
&lt;span class=&quot;kt&quot;&gt;void&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;fineRasterize_bwd&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;SortedShortList&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;Blobs&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobset&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;uint&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;localIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float2&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;uv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dOut&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
    &lt;span class=&quot;nb&quot;&gt;GroupMemoryBarrierWithGroupSync&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;();&lt;/span&gt;

    &lt;span class=&quot;n&quot;&gt;PixelState&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pixelState&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;finalVal&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;localIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;],&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;maxCount&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;localIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;};&lt;/span&gt;

    &lt;span class=&quot;n&quot;&gt;PixelState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Differential&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dColor&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dOut&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;};&lt;/span&gt;

    &lt;span class=&quot;c1&quot;&gt;// The backwards pass manually performs an &apos;undo&apos; to reproduce the state at each step.&lt;/span&gt;
    &lt;span class=&quot;c1&quot;&gt;// The inner loop body still uses auto-diff, so the bulk of the computation is still&lt;/span&gt;
    &lt;span class=&quot;c1&quot;&gt;// handled by the auto-diff engine.&lt;/span&gt;
    &lt;span class=&quot;c1&quot;&gt;//&lt;/span&gt;
    &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uint&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobCount&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;&amp;gt;&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_i&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;--&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
    &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;uint&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;_i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;var&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobID&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;];&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;var&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gval&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;eval&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blobset&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobID&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;uv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;localIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;var&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;prevState&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;undoPixelState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;pixelState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gval&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;

        &lt;span class=&quot;n&quot;&gt;var&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dpState&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;diffPair&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;prevState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;var&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dpGVal&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;diffPair&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;gval&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;

        &lt;span class=&quot;c1&quot;&gt;// Once we have the previous state, we can continue with the backpropagation via auto-diff within&lt;/span&gt;
        &lt;span class=&quot;c1&quot;&gt;// the loop body. Note that the `bwd_diff` calls write back the differentials to dpState and dpGVal,&lt;/span&gt;
        &lt;span class=&quot;c1&quot;&gt;// which can then be obtained via `getDifferential()` (or simply &apos;.d&apos;)&lt;/span&gt;
        &lt;span class=&quot;c1&quot;&gt;//&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;bwd_diff&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;transformPixelState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dpState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dpGVal&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dColor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;bwd_diff&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;eval&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blobset&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobID&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;uv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;localIdx&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dpGVal&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;getDifferential&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;());&lt;/span&gt;

        &lt;span class=&quot;n&quot;&gt;pixelState&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;prevState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;dColor&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dpState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;getDifferential&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;();&lt;/span&gt;
    &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The first thing you’ll notice here is that, rather than simply being annotated as &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;[Differentiable]&lt;/code&gt;, our &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;fineRasterize()&lt;/code&gt; function uses &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;[BackwardDerivative(fineRasterize_bwd)]&lt;/code&gt; to indicate that, rather than Slang generating the backwards form of this function automatically, we instead want to provide the backward form of this function ourselves. Whereas before, we were storing the pixel state value for each iteration of the loop so that it could be replayed backwards, we now can use our domain-specific knowledge to reproduce the required value at each iteration instead.&lt;/p&gt;

&lt;p&gt;Manually providing a backwards derivative form might seem like it defeats the purpose of using autodiff in Slang in the first place, but one of the very useful things about Slang is that it allows you to mix automatic and user-defined differentiation in a single propagation chain. That is, we can call &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;fineRasterize()&lt;/code&gt; from within an automatically differentiated function (in this case, our top-level splatting function, &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;splatBlobs()&lt;/code&gt;), provide a user-defined backwards form for just that function, and even invoke automatic differentiation on parts of that user-defined function using &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;bwd_diff()&lt;/code&gt;. This way, we can get the benefits of avoiding that automatic caching of intermediate values during our pixel blending loop, but not have to take on all of the work of doing the derivatives for our full rasterization algorithm ourselves.&lt;/p&gt;

&lt;p&gt;So, in the code above, the backward form of &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;fineRasterize()&lt;/code&gt; loops backward over all of our blobs, evaluates each one, and performs an “undo” operation, which we define in &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;undoPixelState&lt;/code&gt;.&lt;/p&gt;

&lt;div class=&quot;language-hlsl highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;cm&quot;&gt;/*
 * undoPixelState() reverses the alpha blending operation and restores the previous pixel
 * state.
 */&lt;/span&gt;
&lt;span class=&quot;n&quot;&gt;PixelState&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;undoPixelState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;PixelState&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;nextState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;uint&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;index&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gval&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
    &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;index&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;&amp;gt;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;nextState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;finalCount&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
        &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;nextState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;value&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;nextState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;finalCount&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;};&lt;/span&gt;

    &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;undoAlphaBlend&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;nextState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;value&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gval&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;nextState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;finalCount&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;};&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;

&lt;span class=&quot;c1&quot;&gt;// …&lt;/span&gt;

&lt;span class=&quot;cm&quot;&gt;/*
 * undoAlphaBlend() implements the reverse of the alpha blending algorithm.
 *
 * Takes a pixel value &apos;pixel&apos; and the same &apos;gval&apos; contribution &amp;amp;
 * computes the previous pixel value.
 *
 * This is a critical piece of the backwards pass.
 */&lt;/span&gt;
&lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;undoAlphaBlend&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pixel&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gval&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;gval&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;preMult&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;gval&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;

    &lt;span class=&quot;n&quot;&gt;var&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;oldPixelAlpha&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pixel&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;a&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;/&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gval&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;a&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
    &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;pixel&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rgb&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gval&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rgb&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;oldPixelAlpha&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;oldPixelAlpha&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;One thing to note about undoing an alpha blend: because alpha values are all within the range [0.0, 1.0], our undo is only possible if the pixel never becomes fully opaque. This is handled inside the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;transformPixelState&lt;/code&gt; function called by &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;fineRasterize&lt;/code&gt;:&lt;/p&gt;

&lt;div class=&quot;language-hlsl highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;cm&quot;&gt;/*
 * transformPixelState() applies the alpha blending operation to the pixel state &amp;amp;
 * updates the counter accordingly.
 *
 * This state transition also stops further blending once the pixel is effectively opaque.
 * This is important to avoid the alpha becoming too low (or even 0), at which point
 * the blending is not reversible.
 *
 */&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Differentiable&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;
&lt;span class=&quot;n&quot;&gt;PixelState&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;transformPixelState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;PixelState&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pixel&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gval&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;var&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;newState&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;alphaBlend&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;pixel&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;value&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gval&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;

    &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;pixel&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;value&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;a&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;/&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;255&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
        &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pixel&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;value&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pixel&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;finalCount&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;};&lt;/span&gt;

    &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;newState&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pixel&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;finalCount&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;};&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
&lt;h2 id=&quot;local-index-mapping&quot;&gt;Local Index Mapping&lt;/h2&gt;
&lt;p&gt;There’s one other notable difference between the simplified and full versions of this 2D Gaussian splatter, which I mentioned above: the dispatch shape.&lt;/p&gt;

&lt;p&gt;In the simplified version, we initiated the backward derivative propagation with this line of SlangPy:&lt;/p&gt;

&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;perPixelLoss&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bwds&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;per_pixel_loss&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
                         &lt;span class=&quot;n&quot;&gt;spy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;grid&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;input_image&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;width&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;input_image&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;height&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)),&lt;/span&gt;
                         &lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;input_image&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;Recall that the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;spy.grid()&lt;/code&gt; function is a generator, which produces a grid-shaped set of IDs for the individual threads running the dispatched work.&lt;/p&gt;

&lt;p&gt;By contrast, in this more complex version, we want to ensure that the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;coarseRasterize()&lt;/code&gt; and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;bitonicSort()&lt;/code&gt; functions can operate collaboratively on a set of pixels within a workgroup, so we create a mapping of pixels to thread IDs:&lt;/p&gt;

&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;k&quot;&gt;def&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;calcCompressedDispatchIDs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x_max&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;int&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;y_max&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;int&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;wg_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;int&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;wg_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;int&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;):&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;local_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;arange&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;wg_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dtype&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uint32&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;local_y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;arange&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;wg_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dtype&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uint32&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;local_xv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;local_yv&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;meshgrid&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;local_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;local_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;indexing&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;ij&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;local_xyv&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;stack&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;([&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;local_xv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;local_yv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;],&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;axis&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=-&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;local_xyv&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tile&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;local_xyv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reshape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;wg_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;wg_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;2&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;).&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;astype&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uint32&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt;
                        &lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x_max&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;//&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;wg_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y_max&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;//&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;wg_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;))&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;local_xyv&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;local_xyv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reshape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x_max&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;y_max&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;2&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;))&lt;/span&gt;

    &lt;span class=&quot;n&quot;&gt;group_x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;arange&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x_max&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;//&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;wg_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dtype&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uint32&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;group_y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;arange&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y_max&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;//&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;wg_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dtype&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uint32&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;group_xv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;group_yv&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;meshgrid&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;group_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;group_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;indexing&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;ij&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;group_xyv&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;stack&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;([&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;group_xv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;group_yv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;],&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;axis&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=-&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;group_xyv&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tile&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;group_xyv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[:,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;:,&lt;/span&gt; &lt;span class=&quot;bp&quot;&gt;None&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;bp&quot;&gt;None&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;:],&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;wg_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;wg_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;))&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;group_xyv&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;group_xyv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;reshape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x_max&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;y_max&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;2&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)).&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;astype&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uint32&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;

    &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;group_xyv&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;array&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;([&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;wg_x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;wg_y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;])[&lt;/span&gt;&lt;span class=&quot;bp&quot;&gt;None&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;:]&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;local_xyv&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;).&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;astype&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uint32&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;))&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;What’s happening here is that we’re using some utility functions from NumPy to construct a grid of IDs manually, rather than asking SlangPy to generate it for us. We’re also providing the values in a single array, because, behind the scenes, SlangPy currently only supports a 1D dispatch shape – more general dispatch support is planned to be added soon. &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;x_max&lt;/code&gt; and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;y_max&lt;/code&gt; represent the size of the full image, while &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;wg_x&lt;/code&gt; and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;wg_y&lt;/code&gt; are the dimensions of the tile (and the workgroup that will calculate the pixel values within that tile). The IDs we create tell each thread both where it’s located within its workgroup and which workgroup it belongs to within the full work dispatch, and, from those, which pixel coordinates it’s responsible for calculating. We can then provide this set of IDs (passed below as &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;dispatch_ids&lt;/code&gt;) directly to our &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;perPixelLoss&lt;/code&gt; function at dispatch:&lt;/p&gt;

&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;perPixelLoss&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bwds&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;per_pixel_loss&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dispatch_ids&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;input_image&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;

&lt;h2 id=&quot;results&quot;&gt;Results&lt;/h2&gt;
&lt;p&gt;With these optimizations, we’re now able to operate on a much larger set of Gaussian blobs. The full diff-splatting experiment uses 40,960 blobs in total (and correspondingly defines their maximum size to be smaller, since we don’t need to cover as much ground with each blob). And even with this much larger number of blobs, overall execution is much faster. On the same system I used to generate the last post’s example image, all 10,000 iterations now take less than 3 minutes (compared to around 40 minutes for the simplified version). And as you can see, the image quality is orders of magnitude better.&lt;/p&gt;

&lt;p&gt;&lt;img src=&quot;/images/posts/fullsplat-jeep.gif&quot; alt=&quot;An animation of the full differentiable 2D splatter in action&quot; class=&quot;img-fluid&quot; /&gt;&lt;/p&gt;

&lt;p&gt;This optimization journey may feel familiar to graphics developers – we’ve applied many classic optimization patterns like tile-based processing, efficient memory management, and parallel workgroup coordination. The same patterns that have served graphics developers for years are equally crucial in neural graphics applications. The other part of our optimization equation is the mixing of automatic and user-defined differentiation – a particular strength of Slang – which allows us to use what we know about our color accumulation operation to avoid storage overhead. But whether you’re rendering traditional polygons or training neural representations, the underlying challenges of efficient hardware utilization remain remarkably similar.&lt;/p&gt;

&lt;p&gt;You can start experimenting with the code for this 2D Gaussian splatting example in a couple of ways: to try out the sample from SlangPy, check out the code in our &lt;a href=&quot;https://github.com/shader-slang/slangpy/tree/main/experiments/diff-splatting&quot;&gt;SlangPy experiments set&lt;/a&gt;. If you want to try it out in your browser, head over to the Slang Playground, which uses &lt;a href=&quot;https://shader-slang.org/slang-playground/?demo=gsplat2d-diff&quot;&gt;the same Slang code in WebGPU&lt;/a&gt;.&lt;/p&gt;

</description>
        <pubDate>Wed, 30 Apr 2025 17:00:00 +0000</pubDate>
        <link>http://shader-slang.org/blog/2025/04/30/neural-graphics-first-principles-performance/</link>
        <guid isPermaLink="true">http://shader-slang.org/blog/2025/04/30/neural-graphics-first-principles-performance/</guid>
        
        <category>slang</category>
        
        
        <category>blog</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/posts/2025-04-30-abstractsplats.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Welcome Our Newest Slang Committers!</title>
        <description>&lt;p&gt;We’re thrilled to celebrate two outstanding members of the Slang community who have recently become committers: fairywreath and juliusikkala! Their elevation to committer status recognizes their significant contributions to Slang and their ongoing commitment to improving the shader programming ecosystem.&lt;/p&gt;

&lt;h2 id=&quot;meet-our-new-committers&quot;&gt;Meet Our New Committers&lt;/h2&gt;

&lt;h3 id=&quot;fairywreath&quot;&gt;fairywreath&lt;/h3&gt;
&lt;p&gt;fairywreath has demonstrated exceptional expertise in cross-platform shader development and advanced GPU features. Their contributions have significantly expanded Slang’s capabilities:&lt;/p&gt;

&lt;ul&gt;
  &lt;li&gt;&lt;strong&gt;Advanced GPU Features&lt;/strong&gt;: Implemented cooperative matrix support, subgroup operations, and shader intrinsics, enabling more powerful GPU computations&lt;/li&gt;
  &lt;li&gt;&lt;strong&gt;Cross-Platform Support&lt;/strong&gt;: Added crucial features for Metal and WGSL targets, including wave operations and structured buffer support&lt;/li&gt;
  &lt;li&gt;&lt;strong&gt;Graphics Pipeline Enhancement&lt;/strong&gt;: Contributed mesh shader improvements, SPIRV debugging capabilities, and floating-point pack/unpack intrinsics&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Their work has been particularly impactful in making Slang more powerful and consistent across different graphics APIs and hardware platforms.&lt;/p&gt;

&lt;h3 id=&quot;juliusikkala&quot;&gt;juliusikkala&lt;/h3&gt;
&lt;p&gt;juliusikkala has made substantial contributions across multiple areas of Slang, demonstrating deep technical expertise in compiler development and language design. Their work has significantly improved Slang’s capabilities in several key areas:&lt;/p&gt;

&lt;ul&gt;
  &lt;li&gt;&lt;strong&gt;Language Enhancement&lt;/strong&gt;: Implemented the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;defer&lt;/code&gt; statement, providing developers with more powerful control flow options, and added support for specialization constants in compute shader thread configurations&lt;/li&gt;
  &lt;li&gt;&lt;strong&gt;Cross-Platform Compatibility&lt;/strong&gt;: Improved GLSL compatibility and SPIR-V output, making Slang more reliable across different graphics APIs&lt;/li&gt;
  &lt;li&gt;&lt;strong&gt;Compiler Robustness&lt;/strong&gt;: Fixed critical issues in generic specialization, optimization, and type handling, enhancing the compiler’s reliability and performance&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Their consistent attention to detail and commitment to code quality have helped make Slang more robust and feature-rich for all users.&lt;/p&gt;

&lt;h2 id=&quot;why-committers-matter&quot;&gt;Why Committers Matter&lt;/h2&gt;

&lt;p&gt;Committers play a crucial role in the Slang ecosystem. They help review code, mentor new contributors, and shape the future direction of the project.&lt;/p&gt;

&lt;h2 id=&quot;join-the-slang-community&quot;&gt;Join the Slang Community&lt;/h2&gt;

&lt;p&gt;Interested in becoming a Slang committer? The path to becoming a committer starts with getting involved in the community. Visit our &lt;a href=&quot;https://shader-slang.org/community&quot;&gt;Community Page&lt;/a&gt; to learn more.&lt;/p&gt;

&lt;p&gt;Welcome again to our new committers, and thank you to all our community members who make Slang better every day!&lt;/p&gt;

</description>
        <pubDate>Thu, 24 Apr 2025 17:00:00 +0000</pubDate>
        <link>http://shader-slang.org/blog/2025/04/24/welcome-new-committers/</link>
        <guid isPermaLink="true">http://shader-slang.org/blog/2025/04/24/welcome-new-committers/</guid>
        
        <category>slang</category>
        
        
        <category>blog</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/posts/slang-celebration.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Neural Graphics in an Afternoon</title>
        <description>&lt;p&gt;(For the next article in this series, click &lt;a href=&quot;https://shader-slang.org/blog/2025/04/30/neural-graphics-first-principles-performance/&quot;&gt;here&lt;/a&gt;)&lt;/p&gt;

&lt;p&gt;The intersection of computer graphics and machine learning is creating exciting new possibilities, from scene reconstruction with NeRFs and Gaussian splats to learning complex material properties. But getting started with neural graphics can seem daunting. Between understanding graphics APIs, shader programming, and automatic differentiation, there’s a lot to learn. That’s why the Slang team is introducing &lt;a href=&quot;https://slangpy.shader-slang.org/en/latest/&quot;&gt;SlangPy&lt;/a&gt;, a new Python package that makes it dramatically easier to build neural graphics applications with Slang. With just a few lines of Python code, you can now:&lt;/p&gt;

&lt;ul&gt;
  &lt;li&gt;Seamlessly call Slang functions on the GPU from Python&lt;/li&gt;
  &lt;li&gt;Leverage automatic differentiation without writing complex derivative code&lt;/li&gt;
  &lt;li&gt;Eliminate graphics API boilerplate and reduce potential bugs&lt;/li&gt;
  &lt;li&gt;Integrate with popular ML frameworks like PyTorch&lt;/li&gt;
  &lt;li&gt;Rapidly prototype and experiment with neural graphics techniques&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;In this article, I’ll show you how to write your first neural graphics program with Slang and SlangPy by walking through our 2D Gaussian Splatting example.&lt;/p&gt;

&lt;h2 id=&quot;example-2d-gaussian-splatting&quot;&gt;Example: 2D Gaussian Splatting&lt;/h2&gt;

&lt;p&gt;Our concrete example, which you can see in action on the &lt;a href=&quot;https://shader-slang.org/slang-playground/?demo=gsplat2d-diff.slang&quot;&gt;Slang playground&lt;/a&gt;, uses 2D Gaussian splats (think of them as fuzzy circular blobs of color) to represent an image. Each splat has properties for:&lt;/p&gt;

&lt;ul&gt;
  &lt;li&gt;Position (where it’s centered)&lt;/li&gt;
  &lt;li&gt;Sigma (how fuzzy/spread out it is)&lt;/li&gt;
  &lt;li&gt;Color&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Why are Gaussian splats so powerful? Their mathematical properties make them particularly well-suited for representing visual information. Each Gaussian splat naturally creates smooth gradients from its center outward, which is perfect for capturing how light and color blend in real-world scenes. And because of this smoothness, they are well suited to optimization techniques like the one we are about to explore. In more advanced applications, these properties allow Gaussian splats to represent complex 3D scenes with remarkably high visual quality while maintaining real-time performance – a sweet spot that’s made them increasingly popular in computer graphics applications from virtual production to AR/VR.&lt;/p&gt;

&lt;p&gt;The challenge is: how do we determine the right parameters for thousands of splats to recreate a specific image? To do this, we can use a technique common in machine learning called gradient descent. Gradient descent can be used to find an optimal solution to a problem by making small adjustments to its inputs and checking whether they bring the result closer to our desired output. The basic idea is that we start with random splat properties, and define a “loss function”, which measures how different the resulting image is from what we want it to be, and then use gradient descent to adjust the splat properties until the difference is minimized.&lt;/p&gt;

&lt;h2 id=&quot;the-challenge-computing-gradients&quot;&gt;The Challenge: Computing Gradients&lt;/h2&gt;

&lt;p&gt;That’s where things get a little tricky. There’s a mathematical operation to express how a function changes as you change one of its inputs – the derivative. A gradient is a collection of partial derivatives of a function with respect to each of its input parameters. If that sounds scary: don’t worry, Slang is here to help!&lt;/p&gt;

&lt;p&gt;Without Slang, calculating derivatives of our loss function with respect to every parameter can get very laborious. For complex graphics operations, this means:&lt;/p&gt;

&lt;ul&gt;
  &lt;li&gt;Writing both the function itself, and a corresponding function (the derivative of the original) which calculates the gradients. These are referred to as the “forward” and “backward” forms of the function.&lt;/li&gt;
  &lt;li&gt;Making sure that any changes made to the original (forward) form of the function are also done correctly to its differential (backward) form.&lt;/li&gt;
  &lt;li&gt;Actually doing the derivatives, which can get extremely complex for an arbitrary shader function.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Slang makes this entire process much easier, because it can automatically calculate the backward form of your shader functions for you. You can take advantage of the power of gradient descent without having to wade hip-deep (or even dip your toes) into calculus.&lt;/p&gt;

&lt;h2 id=&quot;the-code&quot;&gt;The Code&lt;/h2&gt;

&lt;p&gt;Let’s take a look at what it looks like to do this in the code. I’ll first go through a simplified version of the 2D Gaussian splatting example, so it’s very clear how the mechanism works. You can find this example in the SlangPy repository &lt;a href=&quot;https://github.com/shader-slang/slangpy-samples/tree/main/examples/simplified-splatting&quot;&gt;here&lt;/a&gt;. First, we’ll check out the Python side of things. With SlangPy, this code is pretty succinct.&lt;/p&gt;

&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;c1&quot;&gt;# SPDX-License-Identifier: Apache-2.0
&lt;/span&gt;
&lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;slangpy&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;as&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;spy&lt;/span&gt;
&lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;pathlib&lt;/span&gt;
&lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;imageio&lt;/span&gt;
&lt;span class=&quot;kn&quot;&gt;import&lt;/span&gt; &lt;span class=&quot;nn&quot;&gt;numpy&lt;/span&gt; &lt;span class=&quot;k&quot;&gt;as&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;

&lt;span class=&quot;c1&quot;&gt;# Create a device, which will handle setup and invocation of the Slang
# compiler for us. We give it both the slangpy PATH and the local include
# PATH so that it can find Slang shader files
&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;device&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;spy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Device&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;compiler_options&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
        &lt;span class=&quot;s&quot;&gt;&quot;include_paths&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;
            &lt;span class=&quot;n&quot;&gt;spy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;SHADER_PATH&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
            &lt;span class=&quot;n&quot;&gt;pathlib&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Path&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;__file__&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;).&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;parent&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;absolute&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(),&lt;/span&gt;
        &lt;span class=&quot;p&quot;&gt;],&lt;/span&gt;
    &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;

&lt;span class=&quot;c1&quot;&gt;# Load our Slang module -- we&apos;ll take a look at this in just a moment
&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;module&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;spy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;load_from_file&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;device&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;s&quot;&gt;&quot;simplediffsplatting2d.slang&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;

&lt;span class=&quot;c1&quot;&gt;# Create a buffer to store Gaussian blobs. We&apos;re going to make a very small one,
# because right now this code is not very efficient, and will take a while to run.
# For now, we are going to create 200 blobs, and each blob will be comprised of 9
# floats:
#   blob center x and y (2 floats)
#   sigma (a 2x2 covariance matrix - 4 floats)
#   color (3 floats)
&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;NUM_BLOBS&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;200&lt;/span&gt;
&lt;span class=&quot;n&quot;&gt;FLOATS_PER_BLOB&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;9&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;# SlangPy lets us create a Tensor and initialize it easily using numpy to generate
# random values. This Tensor includes storage for gradients, because we call .with_grads()
# on the created spy.Tensor.
&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;spy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;from_numpy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;device&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;random&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rand&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;NUM_BLOBS&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;FLOATS_PER_BLOB&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;).&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;astype&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;float32&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;).&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;with_grads&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;

&lt;span class=&quot;c1&quot;&gt;# Load our target image from a file, using the imageio package,
# and store its width and height in W, H
&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;image&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;imageio&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;imread&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;./jeep.jpg&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
&lt;span class=&quot;n&quot;&gt;W&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;image&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;
&lt;span class=&quot;n&quot;&gt;H&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;image&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;

&lt;span class=&quot;c1&quot;&gt;# Convert the image from RGB_u8 to RGBA_f32 -- we&apos;re going
# to be using texture values during derivative propagation,
# so we need to be dealing with floats here.
&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;image&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;image&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;/&lt;/span&gt; &lt;span class=&quot;mf&quot;&gt;256.0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;).&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;astype&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;float32&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
&lt;span class=&quot;n&quot;&gt;image&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;concatenate&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;([&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;image&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ones&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;((&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;W&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;H&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;),&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dtype&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;np&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;float32&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)],&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;axis&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=-&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
&lt;span class=&quot;n&quot;&gt;input_image&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;device&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_texture&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;data&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;image&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;width&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;W&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;height&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;H&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
    &lt;span class=&quot;nb&quot;&gt;format&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;spy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Format&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rgba32_float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;usage&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;spy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;TextureUsage&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shader_resource&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;

&lt;span class=&quot;c1&quot;&gt;# Create a per_pixel_loss Tensor to hold the calculated loss, and create gradient storage
&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;per_pixel_loss&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;spy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;empty&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;device&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dtype&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;float4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;W&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;H&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;))&lt;/span&gt;
&lt;span class=&quot;n&quot;&gt;per_pixel_loss&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;per_pixel_loss&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;with_grads&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;()&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;# Set per-pixel loss&apos; derivative to 1 (using a 1-line function in the slang file)
&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ones&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;per_pixel_loss&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;grad_in&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;

&lt;span class=&quot;c1&quot;&gt;# Create storage for the Adam update moments
# The Adam optimization algorithm helps us update the inputs to the function being optimized
# in an efficient manner. It stores two &quot;moments&quot;: the first is a moving average of the
# gradient of the loss function. The second is a moving average of the squares of these
# gradients. This allows us to &quot;step&quot; in the desired direction while maintaining momentum toward
# the goal
&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;adam_first_moment&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;spy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;zeros_like&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
&lt;span class=&quot;n&quot;&gt;adam_second_moment&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;spy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Tensor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;zeros_like&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;

&lt;span class=&quot;c1&quot;&gt;# Pre-allocate a texture to send data to tev occasionally.
&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;current_render&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;device&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;create_texture&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;width&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;W&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;height&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;H&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
    &lt;span class=&quot;nb&quot;&gt;format&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;spy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Format&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rgba32_float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;usage&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;spy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;TextureUsage&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shader_resource&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;|&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;spy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;TextureUsage&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;unordered_access&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;

&lt;span class=&quot;n&quot;&gt;iterations&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;10000&lt;/span&gt;
&lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;iter&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;in&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;range&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;iterations&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;):&lt;/span&gt;
    &lt;span class=&quot;c1&quot;&gt;# Back-propagate the unit per-pixel loss with auto-diff.
&lt;/span&gt;    &lt;span class=&quot;n&quot;&gt;module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;perPixelLoss&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bwds&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;per_pixel_loss&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;spy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;grid&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;input_image&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;width&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;input_image&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;height&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)),&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;input_image&lt;/span&gt;
    &lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;

    &lt;span class=&quot;c1&quot;&gt;# Update the parameters using the Adam algorithm
&lt;/span&gt;    &lt;span class=&quot;n&quot;&gt;module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;adamUpdate&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;grad_out&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;adam_first_moment&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;adam_second_moment&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;

    &lt;span class=&quot;c1&quot;&gt;# Every 50 iterations, render the blobs out to a texture, and hand it off to tev
&lt;/span&gt;    &lt;span class=&quot;c1&quot;&gt;# so that you can visualize the iteration towards ideal
&lt;/span&gt;    &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;iter&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;%&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;50&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;==&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;renderBlobsToTexture&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
            &lt;span class=&quot;n&quot;&gt;current_render&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;spy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;grid&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;input_image&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;width&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;input_image&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;height&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;))&lt;/span&gt;
        &lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;spy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tev&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;show_async&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;current_render&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;sa&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;optimization_&lt;/span&gt;&lt;span class=&quot;si&quot;&gt;{&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;iter&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;//&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;50&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;&lt;span class=&quot;si&quot;&gt;:&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;03&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;d&lt;/span&gt;&lt;span class=&quot;si&quot;&gt;}&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
    
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This is the entire Python file for setting up, initializing a set of 2D Gaussian blobs, and kicking off the derivative propagation that calculates the ideal values for all those blob parameters. The setup should be fairly straightforward and explained by the comments, so let’s take a closer look at the “meat” of this file, iterating through our gradient descent.&lt;/p&gt;

&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;n&quot;&gt;iterations&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;10000&lt;/span&gt;
&lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;iter&lt;/span&gt; &lt;span class=&quot;ow&quot;&gt;in&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;range&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;iterations&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;):&lt;/span&gt;
    &lt;span class=&quot;c1&quot;&gt;# Back-propagate the unit per-pixel loss with auto-diff.
&lt;/span&gt;    &lt;span class=&quot;n&quot;&gt;module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;perPixelLoss&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;bwds&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;per_pixel_loss&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
                             &lt;span class=&quot;n&quot;&gt;spy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;grid&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;input_image&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;width&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;input_image&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;height&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)),&lt;/span&gt;
                             &lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;input_image&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;What the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;module.perPixelLoss.bwds()&lt;/code&gt; call is doing is going into the Slang module we loaded above, finding the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;perPixelLoss()&lt;/code&gt; function defined within it, and invoking the backwards differential form. The parameters we pass are:&lt;/p&gt;

&lt;ul&gt;
  &lt;li&gt;&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;per_pixel_loss&lt;/code&gt; - A tensor we created to store the loss value for each pixel of the calculated image&lt;/li&gt;
  &lt;li&gt;&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;spy.grid(shape=(input_image.width, input_image.height))&lt;/code&gt; - This is part of what makes SlangPy so helpful. Much like the thread ID of a traditional compute kernel, SlangPy has a way for your kernel to know what thread it’s operating on in the context of the full dispatch. But what makes it especially handy for ML use cases is that SlangPy’s generator functions support arbitrary dimensionality, as opposed to the 3D-maximum in most traditional compute paradigms. There are &lt;a href=&quot;https://slangpy.shader-slang.org/en/latest/generators.html&quot;&gt;several generator methods&lt;/a&gt; provided by SlangPy; &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;grid()&lt;/code&gt; is the one we want here because we can be explicit about the shape of the work we’re dispatching. We’re computing the values of a width x height image, and so we want to consider our compute threads in that context, so we provide those values to the grid function, and it will generate appropriate identifier information for each of the invocations of the kernel.&lt;/li&gt;
  &lt;li&gt;&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;blobs&lt;/code&gt; - The tensor full of all the blob parameters, which also has storage for gradients associated with each of the blobs. Those gradients will give us the information we need to know which direction to adjust each parameter to get closer to our desired target output.&lt;/li&gt;
  &lt;li&gt;&lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;input_image&lt;/code&gt; - The target image that we’re trying to get our blobs to look like&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;When this call finishes, per_pixel_loss will contain values representing the results of the loss function for each pixel based on the “calculated image” that results from all of our current blob parameters, and blobs will have a gradient associated with each blob, indicating which direction the parameters should move in order to get closer to the target. The input image will be unchanged.&lt;/p&gt;

&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;    &lt;span class=&quot;c1&quot;&gt;# Update the parameters using the Adam algorithm
&lt;/span&gt;    &lt;span class=&quot;n&quot;&gt;module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;adamUpdate&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;grad_out&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;adam_first_moment&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;adam_second_moment&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This line calls into a Slang function in our module which provides an &lt;a href=&quot;https://optimization.cbe.cornell.edu/index.php?title=Adam&quot;&gt;optimization algorithm&lt;/a&gt; for updating our blobs based on the information stored in the blob gradients. It calculates moving averages of these gradients, so that we can update our blob parameters efficiently. You can read more about how Adam works in &lt;a href=&quot;https://arxiv.org/pdf/1412.6980&quot;&gt;the paper&lt;/a&gt; that introduced it, and you’ll see the implementation in our Slang module in a moment. Don’t worry – it’s fewer than thirty lines of Slang code!&lt;/p&gt;

&lt;div class=&quot;language-python highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;    &lt;span class=&quot;c1&quot;&gt;# Every 50 iterations, render the blobs out to a texture, and hand it off to tev
&lt;/span&gt;    &lt;span class=&quot;c1&quot;&gt;# so that you can visualize the iteration towards ideal
&lt;/span&gt;    &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;iter&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;%&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;50&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;==&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;:&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;module&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;renderBlobsToTexture&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
            &lt;span class=&quot;n&quot;&gt;current_render&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;spy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;grid&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;shape&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;input_image&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;width&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;input_image&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;height&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;))&lt;/span&gt;
        &lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;spy&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;tev&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;show_async&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;current_render&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;name&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;=&lt;/span&gt;&lt;span class=&quot;sa&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;optimization_&lt;/span&gt;&lt;span class=&quot;si&quot;&gt;{&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;iter&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;//&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;50&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;&lt;span class=&quot;si&quot;&gt;:&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;03&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;d&lt;/span&gt;&lt;span class=&quot;si&quot;&gt;}&lt;/span&gt;&lt;span class=&quot;s&quot;&gt;&quot;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;And then finally, we use one last function in our Slang module to render the results of our blobs out to a texture, instead of just keeping them in memory, so that we can visualize the results of the iterations as we go on. We’re doing 10 thousand iterations, though, and looking at every one of them would be overkill, so we’ll only render out every 50th iteration.&lt;/p&gt;

&lt;p&gt;Ok! Now, for the Slang side of things.&lt;/p&gt;

&lt;p&gt;There’s a bit more to the Slang code, but let’s first take a look at the functions that we called out to from SlangPy just a moment ago. The workhorse of the module is that &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;perPixelLoss()&lt;/code&gt; function and its helpers:&lt;/p&gt;

&lt;div class=&quot;language-hlsl highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;c1&quot;&gt;// simpleSplatBlobs() is a naive implementation of the computation of color for a pixel.&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;// It will iterate over all of the Gaussians for each pixel, to determine their contributions&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;// to the pixel color, so this will become prohibitively slow with even a very small number of&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;// blobs, but it reduces the number of steps involved in determining the pixel color.&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;//&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Differentiable&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;
&lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;simpleSplatBlobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;GradInOutTensor&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;gt;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobsBuffer&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;uint2&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pixelCoord&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;int2&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;texSize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;Blobs&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blobsBuffer&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;};&lt;/span&gt;
    
    &lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;result&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;};&lt;/span&gt;
    &lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobColor&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;};&lt;/span&gt;
    
    &lt;span class=&quot;c1&quot;&gt;// iterate over the full list of Gaussian blobs&lt;/span&gt;
    &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uint&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;SIMPLE_BLOBCOUNT&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;++&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
    &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
        &lt;span class=&quot;c1&quot;&gt;// first, calculate the color of the current blob&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;Gaussian2D&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gaussian&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;Gaussian2D&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;load&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;blobColor&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;gaussian&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;eval&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;pixelCoord&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;/&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;texSize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;));&lt;/span&gt;
    
        &lt;span class=&quot;c1&quot;&gt;// then, blend with the blobs we&apos;ve accumulated so far&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;result&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;alphaBlend&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;result&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobColor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
    &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
    
    &lt;span class=&quot;c1&quot;&gt;// Blend with background&lt;/span&gt;
    &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;result&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rgb&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;result&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;a&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;result&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;a&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;

&lt;span class=&quot;c1&quot;&gt;//&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;// loss() implements the standard L2 loss function to quantify the difference between&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;// the rendered image and the target texture.&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;//&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Differentiable&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;
&lt;span class=&quot;n&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;loss&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uint2&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pixelCoord&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;int2&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;imageSize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;Blobs&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;Texture2D&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt;&lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;gt;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;targetTexture&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;int&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;texWidth&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;int&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;texHeight&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;targetTexture&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;GetDimensions&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;texWidth&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;texHeight&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
    &lt;span class=&quot;kt&quot;&gt;int2&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;texSize&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;int2&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;texWidth&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;texHeight&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;

    &lt;span class=&quot;c1&quot;&gt;// Splat the blobs and calculate the color for this pixel.&lt;/span&gt;
    &lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;color&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;simpleSplatBlobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blobsBuffer&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pixelCoord&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;imageSize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
    &lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;targetColor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
    
    &lt;span class=&quot;n&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;weight&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
    &lt;span class=&quot;k&quot;&gt;if&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;pixelCoord&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;&amp;gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;imageSize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;||&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pixelCoord&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;&amp;gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;imageSize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
    &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
        &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
    &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
    &lt;span class=&quot;k&quot;&gt;else&lt;/span&gt;
    &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;targetColor&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;no_diff&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;targetTexture&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;pixelCoord&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;];&lt;/span&gt;
        &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;nb&quot;&gt;dot&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;color&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rgb&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;targetColor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rgb&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;color&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rgb&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;targetColor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;rgb&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
    &lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
    
    &lt;span class=&quot;k&quot;&gt;return&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;

&lt;span class=&quot;c1&quot;&gt;// Differentiable function to compute per-pixel loss&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;// Parameters:&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;// output: a 2-dimensional tensor of float4 values, representing the output texture&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;// pixelCoord: the coordinates of the output pixel whose loss is being calculated&lt;/span&gt;
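&lt;span class=&quot;c1&quot;&gt;// blobsBuffer: a 1-dimensional tensor of floats, containing the Gaussian blobs&lt;/span&gt;
&lt;span class=&quot;c1&quot;&gt;// targetTexture: the texture holding the target image that the rendered blobs are compared against&lt;/span&gt;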

&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;Differentiable&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt;
&lt;span class=&quot;kt&quot;&gt;void&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;perPixelLoss&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;GradInOutTensor&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt;&lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;2&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;gt;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
                  &lt;span class=&quot;n&quot;&gt;uint2&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pixelCoord&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
                  &lt;span class=&quot;n&quot;&gt;GradInOutTensor&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;gt;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobsBuffer&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
                  &lt;span class=&quot;kt&quot;&gt;Texture2D&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt;&lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;gt;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;targetTexture&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;uint2&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;imageSize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;targetTexture&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;GetDimensions&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;imageSize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;imageSize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;set&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;pixelCoord&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pixelCoord&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;},&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;loss&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;pixelCoord&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;imageSize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blobsBuffer&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;},&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;targetTexture&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;));&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;You can see in this code block that &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;simpleSplatBlobs()&lt;/code&gt; is doing most of the work: iterating over our entire list of Gaussian blobs, and accumulating their contributions to the color of the pixel we are currently calculating. Keep in mind that &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;perPixelLoss()&lt;/code&gt; is going to be invoked once for each pixel in the output image, so the function is figuring out the loss value for just a single pixel. &lt;/p&gt;

&lt;p&gt;You might wonder if iterating over our entire list of Gaussians for each pixel in the image might be slow. It is. There are some clever things that we can do to speed up this calculation considerably, which I’ll cover in a &lt;a href=&quot;https://shader-slang.org/blog/2025/04/30/neural-graphics-first-principles-performance/&quot;&gt;follow-up blog post&lt;/a&gt;, but for now, let’s just focus on the simple– but slow– version.&lt;/p&gt;

&lt;p&gt;This set of functions is responsible for calculating all of the output pixels, as well as the difference between those values and our ideal target image, so they’re invoked not just for propagating loss derivatives (the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;module.perPixelLoss.bwds&lt;/code&gt; call we made in Python), but also during the rendering of our output texture, via &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;renderBlobsToTexture&lt;/code&gt;, which looks like this:&lt;/p&gt;

&lt;div class=&quot;language-hlsl highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;kt&quot;&gt;void&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;renderBlobsToTexture&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
&lt;span class=&quot;err&quot;&gt;    &lt;/span&gt;&lt;span class=&quot;kt&quot;&gt;RWTexture2D&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt;&lt;span class=&quot;kt&quot;&gt;float4&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;gt;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
&lt;span class=&quot;err&quot;&gt;    &lt;/span&gt;&lt;span class=&quot;n&quot;&gt;GradInOutTensor&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;gt;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;blobsBuffer&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
&lt;span class=&quot;err&quot;&gt;    &lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uint2&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pixelCoord&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
&lt;span class=&quot;err&quot;&gt;    &lt;/span&gt;&lt;span class=&quot;n&quot;&gt;uint2&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;imageSize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
&lt;span class=&quot;err&quot;&gt;    &lt;/span&gt;&lt;span class=&quot;n&quot;&gt;output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;GetDimensions&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;imageSize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;x&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;imageSize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;y&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
&lt;span class=&quot;err&quot;&gt;    &lt;/span&gt;&lt;span class=&quot;n&quot;&gt;output&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;pixelCoord&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;simpleSplatBlobs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;blobsBuffer&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;pixelCoord&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;imageSize&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;As you can see, this function just takes the result of &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;simpleSplatBlobs&lt;/code&gt;, and writes the value to the appropriate pixel location in the output texture.&lt;/p&gt;

&lt;p&gt;The other piece of the equation is the Adam update algorithm:&lt;/p&gt;

&lt;div class=&quot;language-hlsl highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;kt&quot;&gt;void&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;adamUpdate&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;k&quot;&gt;inout&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;val&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
                &lt;span class=&quot;k&quot;&gt;inout&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dVal&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
                &lt;span class=&quot;k&quot;&gt;inout&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;firstMoment&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;
                &lt;span class=&quot;k&quot;&gt;inout&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;secondMoment&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
&lt;span class=&quot;err&quot;&gt;    &lt;/span&gt;&lt;span class=&quot;c1&quot;&gt;// Read &amp;amp; reset the derivative&lt;/span&gt;
&lt;span class=&quot;err&quot;&gt;    &lt;/span&gt;&lt;span class=&quot;n&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;g_t&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;dVal&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;

&lt;span class=&quot;err&quot;&gt;    &lt;/span&gt;&lt;span class=&quot;n&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;g_t_2&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;g_t&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;g_t&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;

&lt;span class=&quot;err&quot;&gt;    &lt;/span&gt;&lt;span class=&quot;c1&quot;&gt;//&lt;/span&gt;
&lt;span class=&quot;err&quot;&gt;    &lt;/span&gt;&lt;span class=&quot;c1&quot;&gt;// Perform a gradient update using Adam optimizer rules for&lt;/span&gt;
&lt;span class=&quot;err&quot;&gt;    &lt;/span&gt;&lt;span class=&quot;c1&quot;&gt;// a smoother optimization.&lt;/span&gt;
&lt;span class=&quot;err&quot;&gt;    &lt;/span&gt;&lt;span class=&quot;c1&quot;&gt;//&lt;/span&gt;
&lt;span class=&quot;err&quot;&gt;    &lt;/span&gt;&lt;span class=&quot;n&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;m_t_prev&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;firstMoment&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
&lt;span class=&quot;err&quot;&gt;    &lt;/span&gt;&lt;span class=&quot;n&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;v_t_prev&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;secondMoment&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
&lt;span class=&quot;err&quot;&gt;    &lt;/span&gt;&lt;span class=&quot;n&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;m_t&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ADAM_BETA_1&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;m_t_prev&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ADAM_BETA_1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;g_t&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
&lt;span class=&quot;err&quot;&gt;    &lt;/span&gt;&lt;span class=&quot;n&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;v_t&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ADAM_BETA_2&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;v_t_prev&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ADAM_BETA_2&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;g_t_2&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;

 &lt;span class=&quot;err&quot;&gt;   &lt;/span&gt;&lt;span class=&quot;n&quot;&gt;firstMoment&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;m_t&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
&lt;span class=&quot;err&quot;&gt;    &lt;/span&gt;&lt;span class=&quot;n&quot;&gt;secondMoment&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;v_t&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;

&lt;span class=&quot;err&quot;&gt;    &lt;/span&gt;&lt;span class=&quot;n&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;m_t_hat&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;m_t&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;/&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ADAM_BETA_1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
&lt;span class=&quot;err&quot;&gt;    &lt;/span&gt;&lt;span class=&quot;n&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;v_t_hat&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;v_t&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;/&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ADAM_BETA_2&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;

&lt;span class=&quot;err&quot;&gt;    &lt;/span&gt;&lt;span class=&quot;n&quot;&gt;float&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;update&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;ADAM_ETA&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;/&lt;/span&gt; &lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;sqrt&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;v_t_hat&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;ADAM_EPSILON&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;))&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;m_t_hat&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
&lt;span class=&quot;err&quot;&gt;    &lt;/span&gt;&lt;span class=&quot;n&quot;&gt;val&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;-=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;update&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
&lt;span class=&quot;err&quot;&gt;    &lt;/span&gt;&lt;span class=&quot;n&quot;&gt;dVal&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;f&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This function isn’t marked as differentiable, because we don’t need to do any derivatives here– it’s just a straightforward update of all the blob parameters based on our gradients.&lt;/p&gt;

&lt;p&gt;And… that’s essentially it! Other than a few utility functions, this is all you need to write code that trains itself to match an output image. Your first neural graphics shader!&lt;/p&gt;

&lt;p&gt;&lt;img src=&quot;/images/posts/splatting-jeep-final.gif&quot; alt=&quot;An animation of the low-fi simplified 2D splatter in action&quot; class=&quot;img-fluid&quot; /&gt;&lt;/p&gt;

&lt;p&gt;Now, there are some notable shortcomings in this example– primarily, as mentioned before, that it takes quite a long time to execute. Because we look through our entire list of Gaussian blobs once for every pixel being calculated, at every iteration, it takes about 40 minutes (for me, on a system with a six-year-old graphics card) for all 10,000 iterations to complete. And this is with a very small number of blobs; I limited the number of blobs used to generate the image to 200, because going beyond that point starts to hang my GPU. And because of the small number of blobs, you can see that the image is pretty fuzzy. We could counter this with more, smaller blobs, but doing that will require some clever changes to improve execution speed. Thankfully, this is exactly the sort of work that GPUs are good at! And now that we’ve got the hang of how gradient descent and gaussian splatting work, we can dive into the optimization work in a follow-on blog post.&lt;/p&gt;

&lt;p&gt;If you have any questions or comments on this example code, or things you’d like to see covered in future walkthrough blog posts, please join us on the &lt;a href=&quot;https://khr.io/slangdiscord&quot;&gt;Slang Discord&lt;/a&gt; – I and the rest of the Slang team can be found hanging out and answering questions there!&lt;/p&gt;
</description>
        <pubDate>Fri, 04 Apr 2025 17:00:00 +0000</pubDate>
        <link>http://shader-slang.org/blog/2025/04/04/neural-gfx-in-an-afternoon/</link>
        <guid isPermaLink="true">http://shader-slang.org/blog/2025/04/04/neural-gfx-in-an-afternoon/</guid>
        
        <category>slang</category>
        
        
        <category>blog</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/posts/2025-04-04-splatterjeep.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Now Available: Matrix-vector operations using tensor cores</title>
        <description>&lt;p&gt;Just released, the &lt;a href=&quot;https://github.khronos.org/SPIRV-Registry/extensions/NV/SPV_NV_cooperative_vector.html&quot;&gt;SPV_NV_cooperative_vector extension&lt;/a&gt; in SPIR-V provides a new set of types to SPIR-V, and Slang has added experimental support for these cooperative vectors as well, so that you can make use of these types when targeting SPIR-V. Cooperative vectors enable shaders to do matrix operations utilizing hardware specialized for tensor operations on GPUs where they are available.&lt;/p&gt;

&lt;p&gt;A cooperative vector is an opaque vector type. Unlike normal vector types, they can have arbitrary length and support a relatively limited set of operations. These types are intended to help accelerate the evaluation of small neural networks, where each invocation is performing its own independent evaluation of the network. Behind the scenes, the hardware will try to share calculations as much as possible within a given subgroup, which can improve the performance compared to a traditional implementation. Therefore, while using cooperative vectors to perform multiplications with different matrices in a subgroup is supported by this feature, it’s best to reduce the data divergence as much as possible to achieve best performance.&lt;/p&gt;

&lt;p&gt;Slang’s support for cooperative vectors is currently an experimental feature. For more details on what you can expect from experimental features, and how they are expected to progress to stable status, please check out our &lt;a href=&quot;https://shader-slang.org/community/language-change-process/&quot;&gt;language change process documentation&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;Cooperative vector is currently available as a vendor extension in SPIR-V (SPV_NV_cooperative_vector), and developers will need a driver that supports this extension in order to use it. Cooperative vectors are permitted in any stage in SPIR-V with this extension. If cooperative vector support is added to other target platforms, they may have different restrictions.&lt;/p&gt;

&lt;p&gt;Full documentation of cooperative vector operations in Slang can be found in the &lt;a href=&quot;https://github.com/shader-slang/slang/tree/master/docs/proposals/019-cooperative-vector.md&quot;&gt;cooperative vector feature proposal&lt;/a&gt;.&lt;/p&gt;

&lt;h2 id=&quot;code-examples&quot;&gt;Code Examples&lt;/h2&gt;
&lt;p&gt;The following code snippet shows a simple example of how to use cooperative vectors in a Slang compute shader:&lt;/p&gt;

&lt;div class=&quot;language-hlsl highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;c1&quot;&gt;// Slang shader with Cooperative Vector support&lt;/span&gt;

&lt;span class=&quot;kt&quot;&gt;ByteAddressBuffer&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;inputBuffer1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
&lt;span class=&quot;kt&quot;&gt;ByteAddressBuffer&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;inputBuffer2&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;
&lt;span class=&quot;kt&quot;&gt;RWStructuredBuffer&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;int32_t&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;gt;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;outputBuffer&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;

&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;numthreads&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)]&lt;/span&gt;
&lt;span class=&quot;kt&quot;&gt;void&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;computeMain&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;int&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;threadID&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;SV_DispatchThreadID&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;CoopVec&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;int32_t&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;32&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;gt;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;lhs&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;coopVecLoad&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;32&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;int32_t&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;gt;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inputBuffer1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;CoopVec&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;int32_t&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;32&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;gt;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rhs&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;coopVecLoad&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;32&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;int32_t&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;gt;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;inputBuffer2&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;let&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;result&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;lhs&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;rhs&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;

    &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;int&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;result&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;getCount&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;();&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;++&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;outputBuffer&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;threadID&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;32&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;result&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;];&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;This compute shader loads 32 values from each of two shader parameters, &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;inputBuffer1&lt;/code&gt; and &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;inputBuffer2&lt;/code&gt;, adds them element-wise, and stores the sum in a local variable, &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;result&lt;/code&gt;, whose type is &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;CoopVec&amp;lt;int32_t,32&amp;gt;&lt;/code&gt;.&lt;/p&gt;

&lt;p&gt;Note that the variable &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;result&lt;/code&gt; is a vector with 32 elements, which is much longer than a traditional vector, whose size ranges up to 4.&lt;/p&gt;

&lt;p&gt;The following code snippet shows another example that performs a matrix multiplication with a cooperative vector:&lt;/p&gt;

&lt;div class=&quot;language-hlsl highlighter-rouge&quot;&gt;&lt;div class=&quot;highlight&quot;&gt;&lt;pre class=&quot;highlight&quot;&gt;&lt;code&gt;&lt;span class=&quot;c1&quot;&gt;// Slang shader with Cooperative Vector support&lt;/span&gt;

&lt;span class=&quot;kt&quot;&gt;ByteAddressBuffer&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;input&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;// [1 2 3 4 …]&lt;/span&gt;
&lt;span class=&quot;kt&quot;&gt;ByteAddressBuffer&lt;/span&gt; &lt;span class=&quot;kt&quot;&gt;matrix&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;// [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]&lt;/span&gt;
&lt;span class=&quot;kt&quot;&gt;RWStructuredBuffer&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;int32_t&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;gt;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;outputBuffer&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt;

&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;nb&quot;&gt;numthreads&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;16&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;1&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)]&lt;/span&gt;
&lt;span class=&quot;kt&quot;&gt;void&lt;/span&gt; &lt;span class=&quot;nf&quot;&gt;computeMain&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;int&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;threadID&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;:&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;SV_DispatchThreadID&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;{&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;CoopVec&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;int8_t&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;4&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;gt;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;vec&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;coopVecLoad&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;int8_t&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;gt;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;input&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;threadID&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;4&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;*&lt;/span&gt; &lt;span class=&quot;kr&quot;&gt;sizeof&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;float&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;));&lt;/span&gt;
    &lt;span class=&quot;n&quot;&gt;let&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;result&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;coopVecMatMul&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;int32_t&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;4&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;4&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;&amp;gt;&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;vec&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;// input vector&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;CoopVecComponentType&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;::&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;SignedInt8&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;// vector interpretation&lt;/span&gt;
        &lt;span class=&quot;kt&quot;&gt;matrix&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;// input matrix&lt;/span&gt;
        &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;// matrix offset in bytes&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;CoopVecComponentType&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;::&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;SignedInt8&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;// matrix interpretation&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;CoopVecMatrixLayout&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;::&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;RowMajor&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;// matrix layout&lt;/span&gt;
        &lt;span class=&quot;nb&quot;&gt;false&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;,&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;// matrix transpose&lt;/span&gt;
        &lt;span class=&quot;mi&quot;&gt;4&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;// matrix stride&lt;/span&gt;
    &lt;span class=&quot;p&quot;&gt;);&lt;/span&gt;

    &lt;span class=&quot;k&quot;&gt;for&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;(&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;int&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;mi&quot;&gt;0&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;&amp;lt;&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;result&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;.&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;getCount&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;();&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;++&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;)&lt;/span&gt;
        &lt;span class=&quot;n&quot;&gt;outputBuffer&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;threadID&lt;/span&gt;&lt;span class=&quot;o&quot;&gt;*&lt;/span&gt;&lt;span class=&quot;mi&quot;&gt;4&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;+&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;]&lt;/span&gt; &lt;span class=&quot;o&quot;&gt;=&lt;/span&gt; &lt;span class=&quot;n&quot;&gt;result&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;[&lt;/span&gt;&lt;span class=&quot;n&quot;&gt;i&lt;/span&gt;&lt;span class=&quot;p&quot;&gt;];&lt;/span&gt; &lt;span class=&quot;c1&quot;&gt;// [30 70 110 150 …]&lt;/span&gt;
&lt;span class=&quot;p&quot;&gt;}&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;This compute shader loads 4 values and stores them in a cooperative vector variable, &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;vec&lt;/code&gt;.
The &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;coopVecMatMul()&lt;/code&gt; function performs matrix multiplication with a given vector and a given matrix.&lt;/p&gt;

&lt;p&gt;Note that most cooperative vector functions take “interpretation” parameters. If a target platform doesn’t directly support the &lt;code class=&quot;language-plaintext highlighter-rouge&quot;&gt;int8&lt;/code&gt; type, the interpretation parameter can allow you to use smaller data types for the calculation.&lt;/p&gt;

&lt;p&gt;For more practical examples of Cooperative Vector usage, please see the RTX Neural Shading and RTX Neural Texture Compression (NTC) SDKs that will be released soon within RTX Kit. NTC contains implementations of neural texture decompression using Cooperative Vectors in Slang, as well as fallback implementations of the same using traditional shader math. On recent NVIDIA GPUs, using Cooperative Vectors can provide up to 4x speedup in decompression compared to using DP4a instructions.&lt;/p&gt;

</description>
        <pubDate>Thu, 30 Jan 2025 20:00:00 +0000</pubDate>
        <link>http://shader-slang.org/blog/2025/01/30/coop-vec-available/</link>
        <guid isPermaLink="true">http://shader-slang.org/blog/2025/01/30/coop-vec-available/</guid>
        
        <category>slang</category>
        
        
        <category>blog</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/posts/2025-01-30-coop-vec-available.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Slang Development Update - January 2025</title>
        <description>&lt;p&gt;Happy new year, Slang enthusiasts! As we beckon in 2025, we wanted to take a moment to highlight the many improvements we’ve been able to land in response to the incredible and enthusiastic engagement and feedback from the Slang community! Take a look at what’s new:&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Improvement on specialization constants and push constants&lt;/strong&gt;&lt;/p&gt;
&lt;ul&gt;
  &lt;li&gt;Specialization constants are now supported for WGSL and Metal targets.&lt;/li&gt;
  &lt;li&gt;Fixed reflection API issues around specialization constants and push constants.&lt;/li&gt;
  &lt;li&gt;Added more validation checks on invalid use of specialization/push constant attributes.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;strong&gt;Improvement on SPIRV pointer support&lt;/strong&gt;&lt;/p&gt;
&lt;ul&gt;
  &lt;li&gt;Added loadAligned and storeAligned intrinsics for aligned load/stores.&lt;/li&gt;
  &lt;li&gt;Now supports pointers to a struct with trailing unsized arrays.&lt;/li&gt;
  &lt;li&gt;Dynamic dispatch via interface-typed pointers is now supported.&lt;/li&gt;
  &lt;li&gt;Reflection on pointer element type now correctly reports the scalar layout.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;strong&gt;SPIRV/GLSL improvements&lt;/strong&gt;&lt;/p&gt;
&lt;ul&gt;
  &lt;li&gt;Added SV_DrawIndex semantic that maps to the DrawIndex builtin.&lt;/li&gt;
  &lt;li&gt;Added Sampler2DShadow and friends to combined texture comparison-sampler types.&lt;/li&gt;
  &lt;li&gt;Explicit GLSL binding locations (as in [vk::binding()] and in layout(binding=…) qualifiers) can now use compile time expressions. (contribution from community)&lt;/li&gt;
  &lt;li&gt;-fvk-invert-y option now works for mesh shader position outputs.&lt;/li&gt;
  &lt;li&gt;Added nonuniformEXT intrinsics when consuming GLSL code.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;strong&gt;Reflection API and binding improvement&lt;/strong&gt;&lt;/p&gt;
&lt;ul&gt;
  &lt;li&gt;Explicit [vk::location(N)] binding is now supported for WGSL and Metal targets.&lt;/li&gt;
  &lt;li&gt;IMetadata::isParameterLocationUsed now supports reporting usage info on stage varying inputs/outputs.&lt;/li&gt;
  &lt;li&gt;Now supports querying both user defined and system builtin attributes on types and functions with the new findAttributeByName API.&lt;/li&gt;
  &lt;li&gt;Attribute::getArgumentValueByFloat now correctly returns the value even if the argument in the source code is in the form of an integer literal.&lt;/li&gt;
  &lt;li&gt;WGSL backend now respects explicit binding qualifiers.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;These improvements are available in Slang v2025.2.&lt;/p&gt;

&lt;p&gt;Thank you to everyone in the Slang community for your feedback and your contributions! We’d also like to recognize GitHub users &lt;a href=&quot;https://github.com/juliusikkala&quot;&gt;@juliusikkala&lt;/a&gt; and &lt;a href=&quot;https://github.com/fairywreath&quot;&gt;@fairywreath&lt;/a&gt; for landing a number of fixes and improvements.&lt;/p&gt;

&lt;p&gt;We’re also happy to share that the video and slides from last month’s Slang Birds of a Feather session are now available! This was a great session, with a full room of developers who brought insightful questions – give it a watch!&lt;/p&gt;
</description>
        <pubDate>Fri, 10 Jan 2025 19:00:00 +0000</pubDate>
        <link>http://shader-slang.org/blog/2025/01/10/january-dev-update/</link>
        <guid isPermaLink="true">http://shader-slang.org/blog/2025/01/10/january-dev-update/</guid>
        
        <category>slang</category>
        
        
        <category>blog</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/posts/2025-01-10-dev-updates.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Documentation Update: Reflection API</title>
        <description>&lt;p&gt;The Slang reflection API gives developers the ability to examine shader parameters, types, and their layouts from their application during runtime – a big benefit for things like dynamic shader parameter binding. Based on user feedback, we’ve given the documentation for reflection a major update, making it easier to understand how to interact with and use it to your best advantage.&lt;/p&gt;

&lt;p&gt;The &lt;a href=&quot;https://shader-slang.com/slang/user-guide/reflection&quot;&gt;basic reflection API documentation&lt;/a&gt; provides an overview of best practices for retrieving reflection information from a compiled shader, and how Slang reports out the individual variables and types and their layouts. It also references a &lt;a href=&quot;https://github.com/shader-slang/slang/tree/master/examples/reflection-api&quot;&gt;simple example&lt;/a&gt; to demonstrate how the code is used in practice, and provides coverage of details like how to calculate offsets into parameter blocks, how Slang handles global parameter declarations, and how to avoid common pitfalls.&lt;/p&gt;

&lt;p&gt;For developers targeting multiple API environments, we’ve also put together an overview of one strategy for handling parameter passing in a cross-platform safe way in our &lt;a href=&quot;https://shader-slang.com/docs/shader-cursors/&quot;&gt;shader cursors documentation&lt;/a&gt;. Because the way in which different GPU APIs accept parameters from the caller varies significantly, handing that information off to your shader program is something that many developers wrestle with. Slang’s reflection API was designed with this understanding in mind, to help you manage shader parameters wherever you need to deploy, across the wide range of targets that Slang supports.&lt;/p&gt;

&lt;p&gt;We’re continuing to add to our reflection API documentation, so stay tuned for more updates! And if there are areas where you’d like to see more detailed documentation, please let us know – you can join us at any time on our &lt;a href=&quot;https://khr.io/slangdiscord&quot;&gt;Discord server&lt;/a&gt;!&lt;/p&gt;

</description>
        <pubDate>Wed, 18 Dec 2024 18:00:00 +0000</pubDate>
        <link>http://shader-slang.org/blog/2024/12/18/reflection-api-doc-update/</link>
        <guid isPermaLink="true">http://shader-slang.org/blog/2024/12/18/reflection-api-doc-update/</guid>
        
        <category>slang</category>
        
        
        <category>blog</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/posts/2024-12-18-reflection-api-doc-update.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Slang Support Now Available on Godbolt Compiler Explorer</title>
        <description>&lt;p&gt;&lt;img src=&quot;/images/posts/2024-12-17-slang-support-in-godbolt.webp&quot; alt=&quot;&quot; class=&quot;img-fluid&quot; /&gt;&lt;/p&gt;

&lt;p&gt;Great news: Slang support is now officially available on &lt;a href=&quot;https://godbolt.org/z/193GxYrn6&quot;&gt;Godbolt Compiler Explorer&lt;/a&gt;!&lt;/p&gt;

&lt;p&gt;Godbolt Compiler Explorer is a widely-used online tool that allows developers to write code in various programming languages and see the resulting assembly code in real-time. This powerful resource helps users understand how high-level code translates to lower-level instructions, making it an invaluable educational and debugging tool. For Slang, this provides an avenue for viewing the SPIR-V, HLSL, GLSL, WGSL, or other output for a given Slang program directly in a compiler explorer tool which may already be familiar to you.&lt;/p&gt;

&lt;p&gt;The Slang team is very grateful for the efforts of LunarG engineer Spencer Fricke in integrating Slang support into the Compiler Explorer!&lt;/p&gt;

</description>
        <pubDate>Tue, 17 Dec 2024 18:00:00 +0000</pubDate>
        <link>http://shader-slang.org/blog/2024/12/17/slang-support-in-godbolt/</link>
        <guid isPermaLink="true">http://shader-slang.org/blog/2024/12/17/slang-support-in-godbolt/</guid>
        
        <category>slang</category>
        
        
        <category>blog</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/posts/2024-12-17-slang-support-in-godbolt.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Slang and the 3D Shading Language Landscape</title>
        <description>
</description>
        <pubDate>Wed, 04 Dec 2024 10:00:00 +0000</pubDate>
        <link>http://shader-slang.org/video/2024/12/04/slang-and-the-4d-shading-language-landscape/</link>
        <guid isPermaLink="true">http://shader-slang.org/video/2024/12/04/slang-and-the-4d-shading-language-landscape/</guid>
        
        <category>slang</category>
        
        
        <category>video</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/events/2024-12-04-slang-and-the-3d-shading-landscape.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Khronos Hosts Open Source Slang Shading Language and Compiler</title>
        <description>&lt;p&gt;The Khronos Group has announced the launch of the new Slang™ Initiative. This initiative will oversee and advance the open-source Slang shading language and compiler, building on 15 years of research, development, and deployment experience. Supported by NVIDIA since 2017, Slang has been widely adopted in production projects across the industry.&lt;/p&gt;
</description>
        <pubDate>Thu, 21 Nov 2024 10:00:00 +0000</pubDate>
        <link>http://shader-slang.org/news/2024/11/21/khronos-hosts-open-source-slang-shading-language-and-compiler/</link>
        <guid isPermaLink="true">http://shader-slang.org/news/2024/11/21/khronos-hosts-open-source-slang-shading-language-and-compiler/</guid>
        
        <category>khronos</category>
        
        
        <category>news</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/posts/khronos-slang-logo.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>There’s a lot going on with Slang!</title>
        <description>&lt;p&gt;If you’ve been following Slang developments, you’ve probably noticed a lot of changes happening recently! We’ve been hard at work, and we’re excited to share the latest updates with the Slang community.&lt;/p&gt;

&lt;h3 id=&quot;slang-now-a-khronos-open-source-project&quot;&gt;Slang: Now a Khronos Open-Source Project!&lt;/h3&gt;

&lt;p&gt;Probably the most noticeable change around here is the new look and feel of the website, which reflects a big move for the Slang project: we have just finished migrating the Slang open source project to Khronos, the industry consortium dedicated to building an open ecosystem of acceleration standards, formats and open source.&lt;/p&gt;

&lt;p&gt;With Slang now hosted at Khronos, it’s no longer tied to any single company, fostering even broader community and industry collaboration. We’ve opened up Slang’s development and decision-making processes, added new documentation on &lt;a href=&quot;https://shader-slang.com/community/&quot;&gt;community and developer practices&lt;/a&gt;, and launched a &lt;a href=&quot;https://khr.io/slangdiscord&quot;&gt;Slang Discord&lt;/a&gt; for ongoing technical conversations. Slang participation is open to everyone, and it’s now easier than ever to get involved—whether as a Slang contributor, committer, or code owner!&lt;/p&gt;

&lt;p&gt;But there’s a lot more than just license and governance changes going on—we’ve introduced many new features in the last few months. Here is a detailed look at the recent advancements, all available in the &lt;a href=&quot;https://github.com/shader-slang/slang/releases/tag/v2024.14.5&quot;&gt;v2024.14.5 release&lt;/a&gt;:&lt;/p&gt;

&lt;h3 id=&quot;metal-support&quot;&gt;Metal Support&lt;/h3&gt;

&lt;p&gt;One of the biggest enhancements in recent months is &lt;a href=&quot;https://shader-slang.com/slang/user-guide/metal-target-specific&quot;&gt;support for Apple’s Metal shading language&lt;/a&gt; as a backend target. This opens up new possibilities for developers targeting iOS and macOS platforms, bringing the power of Slang shaders to even more applications and games. Slang’s Metal support includes vertex, fragment, compute, mesh, and amplification shaders.&lt;/p&gt;

&lt;h3 id=&quot;webgpu-support&quot;&gt;WebGPU Support&lt;/h3&gt;

&lt;p&gt;But Metal Shading Language is not the only new backend: Slang can now also &lt;a href=&quot;https://shader-slang.com/slang/user-guide/wgsl-target-specific&quot;&gt;compile to WGSL&lt;/a&gt; for WebGPU! WebGPU is the latest graphics API for the web, providing high-performance and flexible access to GPU capabilities. With Slang’s new WebGPU support, developers can now compile and run Slang shaders directly in the browser, making it easier than ever to create immersive and interactive web experiences.&lt;/p&gt;

&lt;h3 id=&quot;live-playground&quot;&gt;Live Playground&lt;/h3&gt;

&lt;p&gt;To make it easier for developers to experiment with and learn Slang, we have launched a &lt;a href=&quot;http://try.shader-slang.com&quot;&gt;live playground&lt;/a&gt;. This online tool allows you to compile Slang shaders to various targets, including GLSL, HLSL, MSL, and WGSL, and run them directly in your browser.&lt;/p&gt;

&lt;p&gt;The live playground runs the full Slang compiler as a WebAssembly module, compressed to just 5MB! This allows all shader compilations to run locally in the browser, with no data sent to the cloud. When WebGPU is available in the browser, the playground can execute Slang code by cross-compiling it to WGSL.&lt;/p&gt;

&lt;p&gt;&lt;img src=&quot;/images/posts/2024-11-slang-blog-live-playground.webp&quot; alt=&quot;&quot; class=&quot;img-fluid&quot; /&gt;
The live playground provides an interactive and user-friendly environment where you can test and refine your shaders in real-time – including leveraging the integrated language server support. Whether you are a seasoned developer or just starting out, the live playground is a valuable resource for exploring the capabilities of Slang.&lt;/p&gt;

&lt;h3 id=&quot;focus-on-auto-diff-performance&quot;&gt;Focus on Auto-Diff Performance&lt;/h3&gt;

&lt;p&gt;The recent release of a &lt;a href=&quot;https://github.com/google/slang-gaussian-rasterization&quot;&gt;Slang implementation of the original Gaussian Splatting rasterizer&lt;/a&gt; has given us the opportunity to scrutinize and improve the performance of Slang’s automatic differentiation, with the end result that we have achieved performance parity between Slang’s auto-diff code and the hand-written CUDA implementation, showing the potential to easily create high performance neural shaders.&lt;/p&gt;

&lt;p&gt;&lt;img src=&quot;/images/posts/2024-11-slang-blog-falcor-tiger.webp&quot; alt=&quot;&quot; class=&quot;img-fluid&quot; /&gt;
NVIDIA Falcor research rendering framework using Slang&lt;/p&gt;

&lt;h3 id=&quot;atomict&quot;&gt;Atomic&amp;lt;T&amp;gt;&lt;/h3&gt;

&lt;p&gt;Atomic operations are implemented differently across different shading languages. While HLSL allows atomic operations on arbitrary types, Metal Shading Language (MSL) and WGSL restrict them to be used only on atomic types. In order to be able to use atomics across all supported backends, Slang has added the Atomic&amp;lt;T&amp;gt; type, allowing developers to use atomics directly in Slang and have them be translated cleanly into the relevant target languages.&lt;/p&gt;

&lt;h3 id=&quot;new-type-system-features&quot;&gt;New Type-System Features&lt;/h3&gt;

&lt;p&gt;We have further expanded Slang’s type system to support &lt;a href=&quot;https://shader-slang.com/slang/user-guide/interfaces-generics.html#generics&quot;&gt;where clauses&lt;/a&gt;,  &lt;a href=&quot;https://shader-slang.com/slang/user-guide/interfaces-generics.html#variadic-generics&quot;&gt;variadic generics&lt;/a&gt;, &lt;a href=&quot;https://shader-slang.com/slang/user-guide/convenience-features.html#tuple-types&quot;&gt;Tuple types&lt;/a&gt;, &lt;a href=&quot;https://github.com/shader-slang/slang/blob/master/docs/proposals/009-ifunc.md&quot;&gt;IFunc interfaces&lt;/a&gt;, and &lt;a href=&quot;https://shader-slang.com/slang/user-guide/interfaces-generics.html#extensions-to-interfaces&quot;&gt;extensions on generic types&lt;/a&gt;. These additions make it easier to implement more advanced shader architectures using Slang, with code that is much easier to maintain than preprocessor macro expansion or external code generation (meta-programming) logic.&lt;/p&gt;

&lt;h3 id=&quot;unicode-support&quot;&gt;Unicode support&lt;/h3&gt;
&lt;p&gt;Not everyone writes ASCII-only code, and so Slang now accepts Unicode, including in identifier names.&lt;/p&gt;

&lt;h3 id=&quot;improved-documentation-and-sample-material&quot;&gt;Improved Documentation and Sample Material&lt;/h3&gt;

&lt;p&gt;We understand the importance of comprehensive documentation and high-quality sample material in helping developers get the most out of Slang, so we’ve been expanding and improving the &lt;a href=&quot;https://shader-slang.com/docs/&quot;&gt;Slang Developer Documentation Suite&lt;/a&gt;, adding samples and reference material. Please let us know where you’d like to see more!&lt;/p&gt;

&lt;h3 id=&quot;join-us&quot;&gt;Join Us!&lt;/h3&gt;

&lt;p&gt;We invite you to join us on Slang’s ongoing journey to push the boundaries of shading language power. Whether you are a developer utilizing Slang in your projects, a contributor helping to shape its future, or simply an enthusiast following our progress, your support and engagement are invaluable to us.&lt;/p&gt;

&lt;p&gt;Stay tuned for more updates, and be sure to explore the new features and resources we have introduced. Together, let’s unlock the full potential of Slang and create amazing graphics experiences across diverse platforms.&lt;/p&gt;

&lt;p&gt;Thank you for being a part of the Slang community!&lt;/p&gt;
</description>
        <pubDate>Wed, 20 Nov 2024 10:00:00 +0000</pubDate>
        <link>http://shader-slang.org/blog/2024/11/20/theres-a-lot-going-on-with-slang/</link>
        <guid isPermaLink="true">http://shader-slang.org/blog/2024/11/20/theres-a-lot-going-on-with-slang/</guid>
        
        <category>slang</category>
        
        
        <category>blog</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/posts/2024-11-slang-blog-falcor-tiger.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Using Slang with Vulkan</title>
        <description>
</description>
        <pubDate>Sat, 31 Aug 2024 10:00:00 +0000</pubDate>
        <link>http://shader-slang.org/video/2024/08/31/using-slang-with-vulkan-siggraph-2024-khronos-bof/</link>
        <guid isPermaLink="true">http://shader-slang.org/video/2024/08/31/using-slang-with-vulkan-siggraph-2024-khronos-bof/</guid>
        
        <category>vulkan</category>
        
        
        <category>video</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/posts/2024-siggraph-slang-vulkan.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Towards a Next-Generation Shading Language: Our Journey with Slang</title>
        <description>
</description>
        <pubDate>Mon, 05 Feb 2024 10:00:00 +0000</pubDate>
        <link>http://shader-slang.org/video/2024/02/05/toward-a-next-generation-shading-language-out-journey-with-slang/</link>
        <guid isPermaLink="true">http://shader-slang.org/video/2024/02/05/toward-a-next-generation-shading-language-out-journey-with-slang/</guid>
        
        <category>vulkan</category>
        
        
        <category>video</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/posts/20204-vulkanised-video-slang.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Slang Shading Language Advances</title>
        <description>
</description>
        <pubDate>Mon, 05 Feb 2024 10:00:00 +0000</pubDate>
        <link>http://shader-slang.org/video/2024/02/05/slang-shading-language-advances/</link>
        <guid isPermaLink="true">http://shader-slang.org/video/2024/02/05/slang-shading-language-advances/</guid>
        
        <category>slang</category>
        
        
        <category>video</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/posts/2022-slang-video-nvidia.webp" medium="image" type="image/webp" />
        
      </item>
      
    
     
      <item>
        <title>Differentiable Slang: A Shading Language for Renderers That Learn</title>
        <description>
</description>
        <pubDate>Sun, 22 Oct 2023 10:00:00 +0000</pubDate>
        <link>http://shader-slang.org/blog/2023/10/22/differentiable-slang/</link>
        <guid isPermaLink="true">http://shader-slang.org/blog/2023/10/22/differentiable-slang/</guid>
        
        <category>vulkan</category>
        
        
        <category>blog</category>
        
        
        <media:content xmlns:media="http://search.yahoo.com/mrss/" url="http://shader-slang.org/images/posts/primal-and-derivative-zero-day.jpg" medium="image" type="image/jpeg" />
        
      </item>
      
    
  </channel>
</rss>