Skip to main content

bevy_render/batching/
gpu_preprocessing.rs

1//! Batching functionality when GPU preprocessing is in use.
2
3use alloc::sync::Arc;
4use core::{
5    any::TypeId,
6    marker::PhantomData,
7    mem,
8    ops::Range,
9    sync::atomic::{AtomicU32, Ordering},
10};
11
12use bevy_app::{App, Plugin};
13use bevy_derive::{Deref, DerefMut};
14use bevy_ecs::{
15    prelude::Entity,
16    query::{Has, With},
17    resource::Resource,
18    schedule::IntoScheduleConfigs as _,
19    system::{Query, Res, ResMut, StaticSystemParam},
20    world::{FromWorld, World},
21};
22use bevy_encase_derive::ShaderType;
23use bevy_log::{error, info_once};
24use bevy_math::UVec4;
25use bevy_platform::collections::{hash_map::Entry, HashMap, HashSet};
26use bevy_tasks::ComputeTaskPool;
27use bevy_utils::{default, TypeIdMap};
28use bytemuck::{Pod, Zeroable};
29use encase::{internal::WriteInto, ShaderSize};
30use nonmax::NonMaxU32;
31use wgpu::{BindingResource, BufferUsages, DownlevelFlags, Features};
32
33use crate::{
34    occlusion_culling::OcclusionCulling,
35    render_phase::{
36        BinnedPhaseItem, BinnedRenderPhaseBatch, BinnedRenderPhaseBatchSet,
37        BinnedRenderPhaseBatchSets, CachedRenderPipelinePhaseItem, PhaseItem,
38        PhaseItemBatchSetKey as _, PhaseItemExtraIndex, RenderMultidrawableBatchSet,
39        SortedPhaseItem, SortedRenderPhase, UnbatchableBinnedEntityIndices, ViewBinnedRenderPhases,
40        ViewSortedRenderPhases,
41    },
42    render_resource::{
43        AtomicPod, AtomicRawBufferVec, AtomicSparseBufferVec, Buffer, GpuArrayBufferable,
44        PartialBufferVec, PipelineCache, RawBufferVec, SparseBufferUpdateBindGroups,
45        SparseBufferUpdateJobs, SparseBufferUpdatePipelines, UninitBufferVec,
46    },
47    renderer::{RenderAdapter, RenderAdapterInfo, RenderDevice, RenderQueue, WgpuWrapper},
48    sync_world::{MainEntity, MainEntityHashMap},
49    view::{ExtractedView, NoIndirectDrawing, RetainedViewEntity},
50    GpuResourceAppExt, Render, RenderApp, RenderDebugFlags, RenderSystems,
51};
52
53use super::{BatchSetMeta, GetBatchData, GetFullBatchData};
54
55#[derive(Default)]
56pub struct BatchingPlugin {
57    /// Debugging flags that can optionally be set when constructing the renderer.
58    pub debug_flags: RenderDebugFlags,
59}
60
61impl Plugin for BatchingPlugin {
62    fn build(&self, app: &mut App) {
63        let Some(render_app) = app.get_sub_app_mut(RenderApp) else {
64            return;
65        };
66
67        render_app
68            .insert_resource(IndirectParametersBuffersSettings {
69                allow_copies_from_indirect_parameter_buffers: self
70                    .debug_flags
71                    .contains(RenderDebugFlags::ALLOW_COPIES_FROM_INDIRECT_PARAMETERS),
72            })
73            .init_gpu_resource::<IndirectParametersBuffers>()
74            .allow_ambiguous_resource::<IndirectParametersBuffers>()
75            .init_gpu_resource::<BinUnpackingBuffers>()
76            .add_systems(
77                Render,
78                write_indirect_parameters_buffers.in_set(RenderSystems::PrepareResourcesFlush),
79            )
80            .add_systems(
81                Render,
82                clear_indirect_parameters_buffers.in_set(RenderSystems::PrepareViews),
83            );
84    }
85
86    fn finish(&self, app: &mut App) {
87        let Some(render_app) = app.get_sub_app_mut(RenderApp) else {
88            return;
89        };
90
91        render_app.init_gpu_resource::<GpuPreprocessingSupport>();
92    }
93}
94
95/// Records whether GPU preprocessing and/or GPU culling are supported on the
96/// device.
97///
98/// No GPU preprocessing is supported on WebGL because of the lack of compute
99/// shader support.  GPU preprocessing is supported on DirectX 12, but due to [a
100/// `wgpu` limitation] GPU culling is not.
101///
102/// [a `wgpu` limitation]: https://github.com/gfx-rs/wgpu/issues/2471
103#[derive(Clone, Copy, PartialEq, Resource)]
104pub struct GpuPreprocessingSupport {
105    /// The maximum amount of GPU preprocessing available on this platform.
106    pub max_supported_mode: GpuPreprocessingMode,
107}
108
109impl GpuPreprocessingSupport {
110    /// Returns true if this GPU preprocessing support level isn't `None`.
111    #[inline]
112    pub fn is_available(&self) -> bool {
113        self.max_supported_mode != GpuPreprocessingMode::None
114    }
115
116    /// Returns the given GPU preprocessing mode, capped to the current
117    /// preprocessing mode.
118    pub fn min(&self, mode: GpuPreprocessingMode) -> GpuPreprocessingMode {
119        match (self.max_supported_mode, mode) {
120            (GpuPreprocessingMode::None, _) | (_, GpuPreprocessingMode::None) => {
121                GpuPreprocessingMode::None
122            }
123            (mode, GpuPreprocessingMode::Culling) | (GpuPreprocessingMode::Culling, mode) => mode,
124            (GpuPreprocessingMode::PreprocessingOnly, GpuPreprocessingMode::PreprocessingOnly) => {
125                GpuPreprocessingMode::PreprocessingOnly
126            }
127        }
128    }
129
130    /// Returns true if GPU culling is supported on this platform.
131    pub fn is_culling_supported(&self) -> bool {
132        self.max_supported_mode == GpuPreprocessingMode::Culling
133    }
134}
135
136/// The amount of GPU preprocessing (compute and indirect draw) that we do.
137#[derive(Clone, Copy, PartialEq)]
138pub enum GpuPreprocessingMode {
139    /// No GPU preprocessing is in use at all.
140    ///
141    /// This is used when GPU compute isn't available.
142    None,
143
144    /// GPU preprocessing is in use, but GPU culling isn't.
145    ///
146    /// This is used when the [`NoIndirectDrawing`] component is present on the
147    /// camera.
148    PreprocessingOnly,
149
150    /// Both GPU preprocessing and GPU culling are in use.
151    ///
152    /// This is used by default.
153    Culling,
154}
155
156/// The GPU buffers holding the data needed to render batches.
157///
158/// For example, in the 3D PBR pipeline this holds `MeshUniform`s, which are the
159/// `BD` type parameter in that mode.
160///
161/// We have a separate *buffer data input* type (`BDI`) here, which a compute
162/// shader is expected to expand to the full buffer data (`BD`) type. GPU
163/// uniform building is generally faster and uses less system RAM to VRAM bus
164/// bandwidth, but only implemented for some pipelines (for example, not in the
165/// 2D pipeline at present) and only when compute shader is available.
166#[derive(Resource)]
167pub struct BatchedInstanceBuffers<BD, BDI>
168where
169    BD: GpuArrayBufferable + Sync + Send + 'static,
170    BDI: AtomicPod,
171{
172    /// The uniform data inputs for the current frame.
173    ///
174    /// These are uploaded during the extraction phase.
175    pub current_input_buffer: InstanceInputUniformBuffer<BDI>,
176
177    /// The uniform data inputs for the previous frame.
178    ///
179    /// The indices don't generally line up between `current_input_buffer`
180    /// and `previous_input_buffer`, because, among other reasons, entities
181    /// can spawn or despawn between frames. Instead, each current buffer
182    /// data input uniform is expected to contain the index of the
183    /// corresponding buffer data input uniform in this list.
184    pub previous_input_buffer: PreviousInstanceInputUniformBuffer<BDI>,
185
186    /// The data needed to render buffers for each phase.
187    ///
188    /// The keys of this map are the type IDs of each phase: e.g. `Opaque3d`,
189    /// `AlphaMask3d`, etc.
190    pub phase_instance_buffers: TypeIdMap<UntypedPhaseBatchedInstanceBuffers<BD>>,
191}
192
193impl<BD, BDI> Default for BatchedInstanceBuffers<BD, BDI>
194where
195    BD: GpuArrayBufferable + Sync + Send + 'static,
196    BDI: AtomicPod,
197{
198    fn default() -> Self {
199        BatchedInstanceBuffers {
200            current_input_buffer: InstanceInputUniformBuffer::new(),
201            previous_input_buffer: PreviousInstanceInputUniformBuffer::new(),
202            phase_instance_buffers: TypeIdMap::default(),
203        }
204    }
205}
206
207/// The GPU buffers holding the data needed to render batches for a single
208/// phase.
209///
210/// These are split out per phase so that we can run the phases in parallel.
211/// This is the version of the structure that has a type parameter, which
212/// enables Bevy's scheduler to run the batching operations for the different
213/// phases in parallel.
214///
215/// See the documentation for [`BatchedInstanceBuffers`] for more information.
216#[derive(Resource)]
217pub struct PhaseBatchedInstanceBuffers<PI, BD>
218where
219    PI: PhaseItem,
220    BD: GpuArrayBufferable + Sync + Send + 'static,
221{
222    /// The buffers for this phase.
223    pub buffers: UntypedPhaseBatchedInstanceBuffers<BD>,
224    phantom: PhantomData<PI>,
225}
226
227impl<PI, BD> Default for PhaseBatchedInstanceBuffers<PI, BD>
228where
229    PI: PhaseItem,
230    BD: GpuArrayBufferable + Sync + Send + 'static,
231{
232    fn default() -> Self {
233        PhaseBatchedInstanceBuffers {
234            buffers: UntypedPhaseBatchedInstanceBuffers::default(),
235            phantom: PhantomData,
236        }
237    }
238}
239
240/// The GPU buffers holding the data needed to render batches for a single
241/// phase, without a type parameter for that phase.
242///
243/// Since this structure doesn't have a type parameter, it can be placed in
244/// [`BatchedInstanceBuffers::phase_instance_buffers`].
245pub struct UntypedPhaseBatchedInstanceBuffers<BD>
246where
247    BD: GpuArrayBufferable + Sync + Send + 'static,
248{
249    /// A storage area for the buffer data that the GPU compute shader is
250    /// expected to write to.
251    ///
252    /// There will be one entry for each index.
253    pub data_buffer: UninitBufferVec<BD>,
254
255    /// The index of the buffer data in the current input buffer that
256    /// corresponds to each instance.
257    ///
258    /// This is keyed off each view. Each view has a separate buffer.
259    pub work_item_buffers: HashMap<RetainedViewEntity, PreprocessWorkItemBuffers>,
260
261    /// A buffer that holds the number of indexed meshes that weren't visible in
262    /// the previous frame, when GPU occlusion culling is in use.
263    ///
264    /// There's one set of [`LatePreprocessWorkItemIndirectParameters`] per
265    /// view. Bevy uses this value to determine how many threads to dispatch to
266    /// check meshes that weren't visible next frame to see if they became newly
267    /// visible this frame.
268    pub late_indexed_indirect_parameters_buffer:
269        RawBufferVec<LatePreprocessWorkItemIndirectParameters>,
270
271    /// A buffer that holds the number of non-indexed meshes that weren't
272    /// visible in the previous frame, when GPU occlusion culling is in use.
273    ///
274    /// There's one set of [`LatePreprocessWorkItemIndirectParameters`] per
275    /// view. Bevy uses this value to determine how many threads to dispatch to
276    /// check meshes that weren't visible next frame to see if they became newly
277    /// visible this frame.
278    pub late_non_indexed_indirect_parameters_buffer:
279        RawBufferVec<LatePreprocessWorkItemIndirectParameters>,
280}
281
282/// Holds the GPU buffer of instance input data, which is the data about each
283/// mesh instance that the CPU provides.
284///
285/// `BDI` is the *buffer data input* type, which the GPU mesh preprocessing
286/// shader is expected to expand to the full *buffer data* type.
287pub struct InstanceInputUniformBuffer<BDI>
288where
289    BDI: AtomicPod,
290{
291    /// The buffer containing the data that will be uploaded to the GPU.
292    buffer: AtomicSparseBufferVec<BDI>,
293
294    /// Indices of slots that are free within the buffer.
295    ///
296    /// When adding data, we preferentially overwrite these slots first before
297    /// growing the buffer itself.
298    free_uniform_indices: Vec<u32>,
299}
300
301impl<BDI> InstanceInputUniformBuffer<BDI>
302where
303    BDI: AtomicPod,
304{
305    /// Creates a new, empty buffer.
306    pub fn new() -> InstanceInputUniformBuffer<BDI> {
307        InstanceInputUniformBuffer {
308            buffer: AtomicSparseBufferVec::new(
309                BufferUsages::STORAGE,
310                8,
311                Arc::from("instance input uniform buffer"),
312            ),
313            free_uniform_indices: vec![],
314        }
315    }
316
317    /// Clears the buffer and entity list out.
318    pub fn clear(&mut self) {
319        self.buffer.clear();
320        self.free_uniform_indices.clear();
321    }
322
323    /// Returns the [`AtomicSparseBufferVec`] corresponding to this input
324    /// uniform buffer.
325    #[inline]
326    pub fn buffer(&self) -> &AtomicSparseBufferVec<BDI> {
327        &self.buffer
328    }
329
330    /// Adds a new piece of buffered data to the uniform buffer and returns its
331    /// index.
332    pub fn add(&mut self, element: BDI) -> u32 {
333        match self.free_uniform_indices.pop() {
334            Some(uniform_index) => {
335                self.buffer.set(uniform_index, element);
336                uniform_index
337            }
338            None => self.buffer.push(element),
339        }
340    }
341
342    /// Removes a piece of buffered data from the uniform buffer.
343    ///
344    /// This simply marks the data as free.
345    pub fn remove(&mut self, uniform_index: u32) {
346        self.free_uniform_indices.push(uniform_index);
347    }
348
349    /// Returns the piece of buffered data at the given index.
350    ///
351    /// Returns [`None`] if the index is out of bounds or the data is removed.
352    pub fn get(&self, uniform_index: u32) -> Option<BDI> {
353        if uniform_index >= self.buffer.len() || self.free_uniform_indices.contains(&uniform_index)
354        {
355            None
356        } else {
357            Some(self.get_unchecked(uniform_index))
358        }
359    }
360
361    /// Returns the piece of buffered data at the given index.
362    /// Can return data that has previously been removed.
363    ///
364    /// # Panics
365    /// if `uniform_index` is not in bounds of [`Self::buffer`].
366    pub fn get_unchecked(&self, uniform_index: u32) -> BDI {
367        self.buffer.get(uniform_index)
368    }
369
370    /// Stores a piece of buffered data at the given index.
371    ///
372    /// # Panics
373    /// if `uniform_index` is not in bounds of [`Self::buffer`].
374    pub fn set(&self, uniform_index: u32, element: BDI) {
375        self.buffer.set(uniform_index, element);
376    }
377
378    // Ensures that the buffers are nonempty, which the GPU requires before an
379    // upload can take place.
380    pub fn ensure_nonempty(&mut self) {
381        if self.buffer.is_empty() {
382            self.buffer.push(default());
383        }
384    }
385
386    /// Returns the number of instances in this buffer.
387    pub fn len(&self) -> usize {
388        self.buffer.len() as usize
389    }
390
391    /// Returns true if this buffer has no instances or false if it contains any
392    /// instances.
393    pub fn is_empty(&self) -> bool {
394        self.buffer.is_empty()
395    }
396
397    /// Consumes this [`InstanceInputUniformBuffer`] and returns the raw buffer
398    /// ready to be uploaded to the GPU.
399    pub fn into_buffer(self) -> AtomicSparseBufferVec<BDI> {
400        self.buffer
401    }
402}
403
404impl<BDI> Default for InstanceInputUniformBuffer<BDI>
405where
406    BDI: AtomicPod,
407{
408    fn default() -> Self {
409        Self::new()
410    }
411}
412
413/// Stores the input uniforms for the previous frame.
414///
415/// This doesn't use a sparse buffer because it's cleared out every frame and
416/// only ever pushed onto. The length is stored in an atomic field, so multiple
417/// threads can push simultaneously.
418///
419/// The [`AtomicRawBufferVec`] serves as a backing store only. We reserve a
420/// large size, enough to hold all push operations that could possibly occur on
421/// the worker threads, and only synchronize the changed portion of the buffer
422/// to the GPU on each frame.
423pub struct PreviousInstanceInputUniformBuffer<BDI>
424where
425    BDI: AtomicPod,
426{
427    /// The buffer containing the data that will be uploaded to the GPU.
428    buffer: AtomicRawBufferVec<BDI>,
429
430    /// The number of elements pushed since the last [`Self::reserve`].
431    atomic_len: AtomicU32,
432}
433
434impl<BDI> PreviousInstanceInputUniformBuffer<BDI>
435where
436    BDI: AtomicPod,
437{
438    /// Creates a new, empty buffer.
439    pub fn new() -> PreviousInstanceInputUniformBuffer<BDI> {
440        PreviousInstanceInputUniformBuffer {
441            buffer: AtomicRawBufferVec::with_label(
442                BufferUsages::STORAGE,
443                "previous instance input uniform buffer",
444            ),
445            atomic_len: AtomicU32::new(0),
446        }
447    }
448
449    /// Writes the buffer to the GPU.
450    fn write_buffer(&mut self, render_device: &RenderDevice, render_queue: &RenderQueue) {
451        // `Self::ensure_nonempty` must have been called first.
452        debug_assert!(!self.buffer.is_empty());
453        // Only write the modified portion of this buffer. Typically, that
454        // portion will be much smaller than the full size of the buffer.
455        self.buffer.write_buffer_range(
456            0..(self.atomic_len.load(Ordering::Relaxed) as usize).max(1),
457            render_device,
458            render_queue,
459        );
460    }
461
462    /// Clears out the buffer in preparation for a new frame.
463    pub fn clear(&mut self) {
464        // Don't actually clear the underlying buffer out, as then we'd have to
465        // grow it again and that would be slow.
466        self.atomic_len.store(0, Ordering::Relaxed);
467    }
468
469    /// Pre-allocates capacity for concurrent [`Self::push`] calls.
470    pub fn reserve(&mut self, capacity: u32) {
471        self.buffer.grow(capacity);
472        *self.atomic_len.get_mut() = 0;
473    }
474
475    /// Appends a value and returns its index. Thread-safe.
476    ///
477    /// [`Self::reserve`] must have been called first with sufficient capacity.
478    pub fn push(&self, value: BDI) -> u32 {
479        let index = self.atomic_len.fetch_add(1, Ordering::Relaxed);
480        debug_assert!(
481            (index as usize) < self.buffer.len() as usize,
482            "push exceeded pre-allocated capacity"
483        );
484        self.buffer.set(index, value);
485        index
486    }
487
488    /// Pushes a dummy element onto the backing store of this buffer, if this
489    /// buffer is empty.
490    pub fn ensure_nonempty(&mut self) {
491        if self.buffer.is_empty() {
492            self.buffer.push(default());
493        }
494    }
495
496    /// Returns the GPU buffer, if allocated.
497    pub fn buffer(&self) -> Option<&Buffer> {
498        self.buffer.buffer()
499    }
500}
501
502impl<BDI> Default for PreviousInstanceInputUniformBuffer<BDI>
503where
504    BDI: AtomicPod,
505{
506    fn default() -> Self {
507        Self::new()
508    }
509}
510
511/// The buffer of GPU preprocessing work items for a single view.
512#[cfg_attr(
513    not(target_arch = "wasm32"),
514    expect(
515        clippy::large_enum_variant,
516        reason = "See https://github.com/bevyengine/bevy/issues/19220"
517    )
518)]
519pub enum PreprocessWorkItemBuffers {
520    /// The work items we use if we aren't using indirect drawing.
521    ///
522    /// Because we don't have to separate indexed from non-indexed meshes in
523    /// direct mode, we only have a single buffer here.
524    Direct(RawBufferVec<PreprocessWorkItem>),
525
526    /// The buffer of work items we use if we are using indirect drawing.
527    ///
528    /// We need to separate out indexed meshes from non-indexed meshes in this
529    /// case because the indirect parameters for these two types of meshes have
530    /// different sizes.
531    Indirect {
532        /// The buffer of work items corresponding to indexed meshes.
533        indexed: PartialBufferVec<PreprocessWorkItem>,
534        /// The buffer of work items corresponding to non-indexed meshes.
535        non_indexed: PartialBufferVec<PreprocessWorkItem>,
536        /// The work item buffers we use when GPU occlusion culling is in use.
537        gpu_occlusion_culling: Option<GpuOcclusionCullingWorkItemBuffers>,
538    },
539}
540
541/// The work item buffers we use when GPU occlusion culling is in use.
542pub struct GpuOcclusionCullingWorkItemBuffers {
543    /// The buffer of work items corresponding to indexed meshes.
544    pub late_indexed: UninitBufferVec<PreprocessWorkItem>,
545    /// The buffer of work items corresponding to non-indexed meshes.
546    pub late_non_indexed: UninitBufferVec<PreprocessWorkItem>,
547    /// The offset into the
548    /// [`UntypedPhaseBatchedInstanceBuffers::late_indexed_indirect_parameters_buffer`]
549    /// where this view's indirect dispatch counts for indexed meshes live.
550    pub late_indirect_parameters_indexed_offset: u32,
551    /// The offset into the
552    /// [`UntypedPhaseBatchedInstanceBuffers::late_non_indexed_indirect_parameters_buffer`]
553    /// where this view's indirect dispatch counts for non-indexed meshes live.
554    pub late_indirect_parameters_non_indexed_offset: u32,
555}
556
557/// A GPU-side data structure that stores the number of workgroups to dispatch
558/// for the second phase of GPU occlusion culling.
559///
560/// The late mesh preprocessing phase checks meshes that weren't visible frame
561/// to determine if they're potentially visible this frame.
562#[derive(Clone, Copy, ShaderType, Pod, Zeroable)]
563#[repr(C)]
564pub struct LatePreprocessWorkItemIndirectParameters {
565    /// The number of workgroups to dispatch.
566    ///
567    /// This will be equal to `work_item_count / 64`, rounded *up*.
568    dispatch_x: u32,
569    /// The number of workgroups along the abstract Y axis to dispatch: always
570    /// 1.
571    dispatch_y: u32,
572    /// The number of workgroups along the abstract Z axis to dispatch: always
573    /// 1.
574    dispatch_z: u32,
575    /// The actual number of work items.
576    ///
577    /// The GPU indirect dispatch doesn't read this, but it's used internally to
578    /// determine the actual number of work items that exist in the late
579    /// preprocessing work item buffer.
580    work_item_count: u32,
581    /// Padding to 64-byte boundaries for some hardware.
582    pad: UVec4,
583}
584
585impl Default for LatePreprocessWorkItemIndirectParameters {
586    fn default() -> LatePreprocessWorkItemIndirectParameters {
587        LatePreprocessWorkItemIndirectParameters {
588            dispatch_x: 0,
589            dispatch_y: 1,
590            dispatch_z: 1,
591            work_item_count: 0,
592            pad: default(),
593        }
594    }
595}
596
597/// Returns the set of work item buffers for the given view, first creating it
598/// if necessary.
599///
600/// Bevy uses work item buffers to tell the mesh preprocessing compute shader
601/// which meshes are to be drawn.
602///
603/// You may need to call this function if you're implementing your own custom
604/// render phases. See the `specialized_mesh_pipeline` example.
605pub fn get_or_create_work_item_buffer<'a, I>(
606    work_item_buffers: &'a mut HashMap<RetainedViewEntity, PreprocessWorkItemBuffers>,
607    view: RetainedViewEntity,
608    no_indirect_drawing: bool,
609    enable_gpu_occlusion_culling: bool,
610) -> &'a mut PreprocessWorkItemBuffers
611where
612    I: 'static,
613{
614    let preprocess_work_item_buffers = match work_item_buffers.entry(view) {
615        Entry::Occupied(occupied_entry) => occupied_entry.into_mut(),
616        Entry::Vacant(vacant_entry) => {
617            if no_indirect_drawing {
618                vacant_entry.insert(PreprocessWorkItemBuffers::Direct(RawBufferVec::new(
619                    BufferUsages::STORAGE,
620                )))
621            } else {
622                vacant_entry.insert(PreprocessWorkItemBuffers::Indirect {
623                    indexed: PartialBufferVec::new(
624                        BufferUsages::STORAGE,
625                        "indexed preprocess work item buffer".to_owned(),
626                    ),
627                    non_indexed: PartialBufferVec::new(
628                        BufferUsages::STORAGE,
629                        "non-indexed preprocess work item buffer".to_owned(),
630                    ),
631                    // We fill this in below if `enable_gpu_occlusion_culling`
632                    // is set.
633                    gpu_occlusion_culling: None,
634                })
635            }
636        }
637    };
638
639    // Initialize the GPU occlusion culling buffers if necessary.
640    if let PreprocessWorkItemBuffers::Indirect {
641        ref mut gpu_occlusion_culling,
642        ..
643    } = *preprocess_work_item_buffers
644    {
645        match (
646            enable_gpu_occlusion_culling,
647            gpu_occlusion_culling.is_some(),
648        ) {
649            (false, false) | (true, true) => {}
650            (false, true) => {
651                *gpu_occlusion_culling = None;
652            }
653            (true, false) => {
654                *gpu_occlusion_culling = Some(GpuOcclusionCullingWorkItemBuffers {
655                    late_indexed: UninitBufferVec::new(BufferUsages::STORAGE),
656                    late_non_indexed: UninitBufferVec::new(BufferUsages::STORAGE),
657                    late_indirect_parameters_indexed_offset: 0,
658                    late_indirect_parameters_non_indexed_offset: 0,
659                });
660            }
661        }
662    }
663
664    preprocess_work_item_buffers
665}
666
667/// Initializes work item buffers for a phase in preparation for a new frame.
668pub fn init_work_item_buffers(
669    work_item_buffers: &mut PreprocessWorkItemBuffers,
670    late_indexed_indirect_parameters_buffer: &'_ mut RawBufferVec<
671        LatePreprocessWorkItemIndirectParameters,
672    >,
673    late_non_indexed_indirect_parameters_buffer: &'_ mut RawBufferVec<
674        LatePreprocessWorkItemIndirectParameters,
675    >,
676) {
677    // Add the offsets for indirect parameters that the late phase of mesh
678    // preprocessing writes to.
679    if let PreprocessWorkItemBuffers::Indirect {
680        gpu_occlusion_culling:
681            Some(GpuOcclusionCullingWorkItemBuffers {
682                ref mut late_indirect_parameters_indexed_offset,
683                ref mut late_indirect_parameters_non_indexed_offset,
684                ..
685            }),
686        ..
687    } = *work_item_buffers
688    {
689        *late_indirect_parameters_indexed_offset = late_indexed_indirect_parameters_buffer
690            .push(LatePreprocessWorkItemIndirectParameters::default())
691            as u32;
692        *late_indirect_parameters_non_indexed_offset = late_non_indexed_indirect_parameters_buffer
693            .push(LatePreprocessWorkItemIndirectParameters::default())
694            as u32;
695    }
696}
697
698impl PreprocessWorkItemBuffers {
699    /// Adds a new work item to the appropriate buffer.
700    ///
701    /// `indexed` specifies whether the work item corresponds to an indexed
702    /// mesh.
703    pub fn push(&mut self, indexed: bool, preprocess_work_item: PreprocessWorkItem) {
704        match *self {
705            PreprocessWorkItemBuffers::Direct(ref mut buffer) => {
706                buffer.push(preprocess_work_item);
707            }
708            PreprocessWorkItemBuffers::Indirect {
709                indexed: ref mut indexed_buffer,
710                non_indexed: ref mut non_indexed_buffer,
711                ref mut gpu_occlusion_culling,
712            } => {
713                if indexed {
714                    indexed_buffer.push_init(preprocess_work_item);
715                } else {
716                    non_indexed_buffer.push_init(preprocess_work_item);
717                }
718
719                if let Some(ref mut gpu_occlusion_culling) = *gpu_occlusion_culling {
720                    if indexed {
721                        gpu_occlusion_culling.late_indexed.add();
722                    } else {
723                        gpu_occlusion_culling.late_non_indexed.add();
724                    }
725                }
726            }
727        }
728    }
729
730    /// Clears out the GPU work item buffers in preparation for a new frame.
731    pub fn clear(&mut self) {
732        match *self {
733            PreprocessWorkItemBuffers::Direct(ref mut buffer) => {
734                buffer.clear();
735            }
736            PreprocessWorkItemBuffers::Indirect {
737                indexed: ref mut indexed_buffer,
738                non_indexed: ref mut non_indexed_buffer,
739                ref mut gpu_occlusion_culling,
740            } => {
741                indexed_buffer.clear();
742                non_indexed_buffer.clear();
743
744                if let Some(ref mut gpu_occlusion_culling) = *gpu_occlusion_culling {
745                    gpu_occlusion_culling.late_indexed.clear();
746                    gpu_occlusion_culling.late_non_indexed.clear();
747                    gpu_occlusion_culling.late_indirect_parameters_indexed_offset = 0;
748                    gpu_occlusion_culling.late_indirect_parameters_non_indexed_offset = 0;
749                }
750            }
751        }
752    }
753}
754
755/// One invocation of the preprocessing shader: i.e. one mesh instance in a
756/// view.
757#[derive(Clone, Copy, Default, Pod, Zeroable, ShaderType)]
758#[repr(C)]
759pub struct PreprocessWorkItem {
760    /// The index of the batch input data in the input buffer that the shader
761    /// reads from.
762    pub input_index: u32,
763
764    /// In direct mode, the index of the mesh uniform; in indirect mode, the
765    /// index of the [`IndirectParametersGpuMetadata`].
766    ///
767    /// In indirect mode, this is the index of the
768    /// [`IndirectParametersGpuMetadata`] in the
769    /// `IndirectParametersBuffers::indexed_metadata` or
770    /// `IndirectParametersBuffers::non_indexed_metadata`.
771    pub output_or_indirect_parameters_index: u32,
772}
773
774/// The `wgpu` indirect parameters structure that specifies a GPU draw command.
775///
776/// This is the variant for indexed meshes. We generate the instances of this
777/// structure in the `build_indirect_params.wgsl` compute shader.
778#[derive(Clone, Copy, Debug, Pod, Zeroable, ShaderType)]
779#[repr(C)]
780pub struct IndirectParametersIndexed {
781    /// The number of indices that this mesh has.
782    pub index_count: u32,
783    /// The number of instances we are to draw.
784    pub instance_count: u32,
785    /// The offset of the first index for this mesh in the index buffer slab.
786    pub first_index: u32,
787    /// The offset of the first vertex for this mesh in the vertex buffer slab.
788    pub base_vertex: u32,
789    /// The index of the first mesh instance in the `MeshUniform` buffer.
790    pub first_instance: u32,
791}
792
793/// The `wgpu` indirect parameters structure that specifies a GPU draw command.
794///
795/// This is the variant for non-indexed meshes. We generate the instances of
796/// this structure in the `build_indirect_params.wgsl` compute shader.
797#[derive(Clone, Copy, Debug, Pod, Zeroable, ShaderType)]
798#[repr(C)]
799pub struct IndirectParametersNonIndexed {
800    /// The number of vertices that this mesh has.
801    pub vertex_count: u32,
802    /// The number of instances we are to draw.
803    pub instance_count: u32,
804    /// The offset of the first vertex for this mesh in the vertex buffer slab.
805    pub base_vertex: u32,
806    /// The index of the first mesh instance in the `Mesh` buffer.
807    pub first_instance: u32,
808}
809
810/// A structure, initialized on CPU and read on GPU, that contains metadata
811/// about each batch.
812///
813/// Each batch will have one instance of this structure.
814#[derive(Clone, Copy, Default, Pod, Zeroable, ShaderType)]
815#[repr(C)]
816pub struct IndirectParametersCpuMetadata {
817    /// The index of the first instance of this mesh in the array of
818    /// `MeshUniform`s.
819    ///
820    /// Note that this is the *first* output index in this batch. Since each
821    /// instance of this structure refers to arbitrarily many instances, the
822    /// `MeshUniform`s corresponding to this batch span the indices
823    /// `base_output_index..(base_output_index + instance_count)`.
824    pub base_output_index: u32,
825
826    /// The index of the batch set that this batch belongs to in the
827    /// [`IndirectBatchSet`] buffer.
828    ///
829    /// A *batch set* is a set of meshes that may be multi-drawn together.
830    /// Multiple batches (and therefore multiple instances of
831    /// [`IndirectParametersGpuMetadata`] structures) can be part of the same
832    /// batch set.
833    pub batch_set_index: u32,
834}
835
836/// A structure, written and read on GPU, that records how many instances of
837/// each mesh are actually to be drawn.
838///
839/// The GPU mesh preprocessing shader increments the
840/// [`Self::early_instance_count`] and [`Self::late_instance_count`] as it
841/// determines that meshes are visible.  The indirect parameter building shader
842/// reads this metadata in order to construct the indirect draw parameters.
843///
844/// Each batch will have one instance of this structure.
845#[derive(Clone, Copy, Default, Pod, Zeroable, ShaderType)]
846#[repr(C)]
847pub struct IndirectParametersGpuMetadata {
848    /// The index of the first mesh in this batch in the array of
849    /// `MeshInputUniform`s.
850    pub mesh_index: u32,
851
852    /// The number of instances that were judged visible last frame.
853    ///
854    /// The CPU sets this value to 0, and the GPU mesh preprocessing shader
855    /// increments it as it culls mesh instances.
856    pub early_instance_count: u32,
857
858    /// The number of instances that have been judged potentially visible this
859    /// frame that weren't in the last frame's potentially visible set.
860    ///
861    /// The CPU sets this value to 0, and the GPU mesh preprocessing shader
862    /// increments it as it culls mesh instances.
863    pub late_instance_count: u32,
864}
865
866/// A structure, shared between CPU and GPU, that holds the number of on-GPU
867/// indirect draw commands for each *batch set*.
868///
869/// A *batch set* is a set of meshes that may be multi-drawn together.
870///
871/// If the current hardware and driver support `multi_draw_indirect_count`, the
872/// indirect parameters building shader increments
873/// [`Self::indirect_parameters_count`] as it generates indirect parameters. The
874/// `multi_draw_indirect_count` command reads
875/// [`Self::indirect_parameters_count`] in order to determine how many commands
876/// belong to each batch set.
877#[derive(Clone, Copy, Default, Pod, Zeroable, ShaderType)]
878#[repr(C)]
879pub struct IndirectBatchSet {
880    /// The number of indirect parameter commands (i.e. batches) in this batch
881    /// set.
882    ///
883    /// The CPU sets this value to 0 before uploading this structure to GPU. The
884    /// indirect parameters building shader increments this value as it creates
885    /// indirect parameters. Then the `multi_draw_indirect_count` command reads
886    /// this value in order to determine how many indirect draw commands to
887    /// process.
888    pub indirect_parameters_count: u32,
889
890    /// The offset within the `IndirectParametersBuffers::indexed_data` or
891    /// `IndirectParametersBuffers::non_indexed_data` of the first indirect draw
892    /// command for this batch set.
893    ///
894    /// The CPU fills out this value.
895    pub indirect_parameters_base: u32,
896}
897
898/// The buffers containing all the information that indirect draw commands
899/// (`multi_draw_indirect`, `multi_draw_indirect_count`) use to draw the scene.
900///
901/// In addition to the indirect draw buffers themselves, this structure contains
902/// the buffers that store [`IndirectParametersGpuMetadata`], which are the
903/// structures that culling writes to so that the indirect parameter building
904/// pass can determine how many meshes are actually to be drawn.
905///
906/// These buffers will remain empty if indirect drawing isn't in use.
907#[derive(Resource, Deref, DerefMut, Default)]
908pub struct IndirectParametersBuffers {
909    /// A mapping from a phase type ID to the indirect parameters buffers for
910    /// that phase.
911    ///
912    /// Examples of phase type IDs are `Opaque3d` and `AlphaMask3d`.
913    #[deref]
914    pub buffers: TypeIdMap<UntypedPhaseIndirectParametersBuffers>,
915}
916
917/// Configuration for [`IndirectParametersBuffers`].
918#[derive(Resource)]
919pub struct IndirectParametersBuffersSettings {
920    /// If true, this sets the `COPY_SRC` flag on indirect draw parameters so
921    /// that they can be read back to CPU.
922    ///
923    /// This is a debugging feature that may reduce performance. It primarily
924    /// exists for the `occlusion_culling` example.
925    pub allow_copies_from_indirect_parameter_buffers: bool,
926}
927
928/// GPU-side information needed to unpack bins belonging to a single batch set.
929#[derive(Clone, Copy, Pod, Zeroable, ShaderType)]
930#[repr(C)]
931pub struct GpuBinUnpackingMetadata {
932    /// The index of the first `PreprocessWorkItem` that the compute shader
933    /// dispatch is to write to.
934    base_output_work_item_index: u32,
935    /// The index of the first GPU indirect parameters command for this batch
936    /// set.
937    base_indirect_parameters_index: u32,
938    /// The number of binned mesh instances in the `binned_mesh_instances`
939    /// array.
940    binned_mesh_instance_count: u32,
941    /// Padding.
942    pad: [u32; 61],
943}
944
945impl Default for GpuBinUnpackingMetadata {
946    fn default() -> GpuBinUnpackingMetadata {
947        GpuBinUnpackingMetadata {
948            base_output_work_item_index: 0,
949            base_indirect_parameters_index: 0,
950            binned_mesh_instance_count: 0,
951            pad: [0; _],
952        }
953    }
954}
955
956/// CPU-side information needed to construct the bind groups and issue the
957/// dispatch for the `unpack_bins` shader, for a single batch set.
958pub struct BinUnpackingJob {
959    /// The GPU buffer of `GpuRenderBinnedMeshInstance`s corresponding to the
960    /// mesh instances that this batch set contains.
961    pub render_binned_mesh_instance_buffer: Buffer,
962    /// The GPU buffer that maps each bin index to the index of the indirect
963    /// drawing parameters for that bin, relative to the first such indirect
964    /// drawing parameters for this batch set.
965    pub bin_index_to_indirect_parameters_offset_buffer: Buffer,
966    /// The index of this batch set's [`GpuBinUnpackingMetadata`] in the
967    /// [`BinUnpackingBuffers::bin_unpacking_metadata`] buffer.
968    pub bin_unpacking_metadata_index: BinUnpackingMetadataIndex,
969    /// The total number of mesh instances in this batch set.
970    pub mesh_instance_count: u32,
971}
972
973/// The buffers containing all the information that indirect draw commands use
974/// to draw the scene, for a single phase.
975///
976/// This is the version of the structure that has a type parameter, so that the
977/// batching for different phases can run in parallel.
978///
979/// See the [`IndirectParametersBuffers`] documentation for more information.
980#[derive(Resource)]
981pub struct PhaseIndirectParametersBuffers<PI>
982where
983    PI: PhaseItem,
984{
985    /// The indirect draw buffers for the phase.
986    pub buffers: UntypedPhaseIndirectParametersBuffers,
987    phantom: PhantomData<PI>,
988}
989
990impl<PI> FromWorld for PhaseIndirectParametersBuffers<PI>
991where
992    PI: PhaseItem,
993{
994    fn from_world(world: &mut World) -> Self {
995        let settings = world.resource::<IndirectParametersBuffersSettings>();
996        PhaseIndirectParametersBuffers {
997            buffers: UntypedPhaseIndirectParametersBuffers::new(
998                settings.allow_copies_from_indirect_parameter_buffers,
999            ),
1000            phantom: PhantomData,
1001        }
1002    }
1003}
1004
1005impl<PI> PhaseIndirectParametersBuffers<PI>
1006where
1007    PI: PhaseItem,
1008{
1009    /// Allocates a single set of indirect parameters in the appropriate buffer.
1010    fn allocate(&mut self, no_indirect_drawing: bool, item_is_indexed: bool) -> Option<u32> {
1011        if no_indirect_drawing {
1012            None
1013        } else if item_is_indexed {
1014            Some(self.buffers.indexed.allocate(1))
1015        } else {
1016            Some(self.buffers.non_indexed.allocate(1))
1017        }
1018    }
1019}
1020
1021/// The buffers containing all the information that indirect draw commands use
1022/// to draw the scene, for a single phase.
1023///
1024/// This is the version of the structure that doesn't have a type parameter, so
1025/// that it can be inserted into [`IndirectParametersBuffers::buffers`]
1026///
1027/// See the [`IndirectParametersBuffers`] documentation for more information.
1028pub struct UntypedPhaseIndirectParametersBuffers {
1029    /// Information that indirect draw commands use to draw indexed meshes in
1030    /// the scene.
1031    pub indexed: MeshClassIndirectParametersBuffers<IndirectParametersIndexed>,
1032    /// Information that indirect draw commands use to draw non-indexed meshes
1033    /// in the scene.
1034    pub non_indexed: MeshClassIndirectParametersBuffers<IndirectParametersNonIndexed>,
1035}
1036
1037impl UntypedPhaseIndirectParametersBuffers {
1038    /// Creates the indirect parameters buffers.
1039    pub fn new(
1040        allow_copies_from_indirect_parameter_buffers: bool,
1041    ) -> UntypedPhaseIndirectParametersBuffers {
1042        UntypedPhaseIndirectParametersBuffers {
1043            non_indexed: MeshClassIndirectParametersBuffers::new(
1044                allow_copies_from_indirect_parameter_buffers,
1045            ),
1046            indexed: MeshClassIndirectParametersBuffers::new(
1047                allow_copies_from_indirect_parameter_buffers,
1048            ),
1049        }
1050    }
1051
1052    /// Reserves space for `count` new batches.
1053    ///
1054    /// The `indexed` parameter specifies whether the meshes that these batches
1055    /// correspond to are indexed or not.
1056    pub fn allocate(&mut self, indexed: bool, count: u32) -> u32 {
1057        if indexed {
1058            self.indexed.allocate(count)
1059        } else {
1060            self.non_indexed.allocate(count)
1061        }
1062    }
1063
1064    /// Returns the number of batches currently allocated.
1065    ///
1066    /// The `indexed` parameter specifies whether the meshes that these batches
1067    /// correspond to are indexed or not.
1068    fn batch_count(&self, indexed: bool) -> usize {
1069        if indexed {
1070            self.indexed.batch_count()
1071        } else {
1072            self.non_indexed.batch_count()
1073        }
1074    }
1075
1076    /// Returns the number of batch sets currently allocated.
1077    ///
1078    /// The `indexed` parameter specifies whether the meshes that these batch
1079    /// sets correspond to are indexed or not.
1080    pub fn batch_set_count(&self, indexed: bool) -> usize {
1081        if indexed {
1082            self.indexed.batch_sets.len()
1083        } else {
1084            self.non_indexed.batch_sets.len()
1085        }
1086    }
1087
1088    /// Adds a new batch set to `Self::indexed_batch_sets` or
1089    /// `Self::non_indexed_batch_sets` as appropriate.
1090    ///
1091    /// `indexed` specifies whether the meshes that these batch sets correspond
1092    /// to are indexed or not. `indirect_parameters_base` specifies the offset
1093    /// within `Self::indexed_data` or `Self::non_indexed_data` of the first
1094    /// batch in this batch set.
1095    #[inline]
1096    pub fn add_batch_set(&mut self, indexed: bool, indirect_parameters_base: u32) {
1097        if indexed {
1098            self.indexed.batch_sets.push(IndirectBatchSet {
1099                indirect_parameters_base,
1100                indirect_parameters_count: 0,
1101            });
1102        } else {
1103            self.non_indexed.batch_sets.push(IndirectBatchSet {
1104                indirect_parameters_base,
1105                indirect_parameters_count: 0,
1106            });
1107        }
1108    }
1109
1110    /// Returns the index that a newly-added batch set will have.
1111    ///
1112    /// The `indexed` parameter specifies whether the meshes in such a batch set
1113    /// are indexed or not.
1114    pub fn get_next_batch_set_index(&self, indexed: bool) -> Option<NonMaxU32> {
1115        NonMaxU32::new(self.batch_set_count(indexed) as u32)
1116    }
1117
1118    /// Clears out the buffers in preparation for a new frame.
1119    pub fn clear(&mut self) {
1120        self.indexed.clear();
1121        self.non_indexed.clear();
1122    }
1123}
1124
1125/// A resource, part of the render world, that holds all GPU buffers used for
1126/// the bin unpacking shader.
1127#[derive(Resource)]
1128pub struct BinUnpackingBuffers {
1129    /// A buffer containing all the uniforms needed to run the bin unpacking
1130    /// compute shader for each batch set.
1131    pub bin_unpacking_metadata: RawBufferVec<GpuBinUnpackingMetadata>,
1132    /// Per-view-phase buffers for the bin unpacking shader.
1133    pub view_phase_buffers: HashMap<BinUnpackingBuffersKey, ViewPhaseBinUnpackingBuffers>,
1134}
1135
1136impl Default for BinUnpackingBuffers {
1137    fn default() -> Self {
1138        let mut bin_unpacking_metadata = RawBufferVec::new(BufferUsages::UNIFORM);
1139        bin_unpacking_metadata.set_label(Some("bin unpacking metadata buffer"));
1140        BinUnpackingBuffers {
1141            bin_unpacking_metadata,
1142            view_phase_buffers: HashMap::default(),
1143        }
1144    }
1145}
1146
1147/// GPU buffers for the bin unpacking shader that are specific to each phase of
1148/// each view.
1149#[derive(Default)]
1150pub struct ViewPhaseBinUnpackingBuffers {
1151    /// Metadata that describes each unpacking job, specific to indexed meshes.
1152    pub indexed_unpacking_jobs: Vec<BinUnpackingJob>,
1153    /// Metadata that describes each unpacking job, specific to non-indexed
1154    /// meshes.
1155    pub non_indexed_unpacking_jobs: Vec<BinUnpackingJob>,
1156}
1157
1158/// A key used to look up the bin unpacking buffers for a specific phase of a
1159/// specific view.
1160#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
1161pub struct BinUnpackingBuffersKey {
1162    /// The ID of the phase.
1163    pub phase: TypeId,
1164    /// The entity ID of the view.
1165    pub view: RetainedViewEntity,
1166}
1167
1168/// The index of the metadata corresponding to one bin unpacking job in the
1169/// [`BinUnpackingBuffers::bin_unpacking_metadata`] buffer.
1170#[derive(Clone, Copy, Debug, Deref, DerefMut)]
1171pub struct BinUnpackingMetadataIndex(pub NonMaxU32);
1172
1173impl BinUnpackingMetadataIndex {
1174    /// Returns the byte offset within the
1175    /// [`BinUnpackingBuffers::bin_unpacking_metadata`] buffer corresponding to
1176    /// this index.
1177    pub fn uniform_offset(&self) -> u32 {
1178        self.get() * size_of::<GpuBinUnpackingMetadata>() as u32
1179    }
1180}
1181
1182/// The buffers containing all the information that indirect draw commands use
1183/// to draw the scene, for a single mesh class (indexed or non-indexed), for a
1184/// single phase.
1185pub struct MeshClassIndirectParametersBuffers<IP>
1186where
1187    IP: Clone + ShaderSize + WriteInto,
1188{
1189    /// The GPU buffer that stores the indirect draw parameters for the meshes.
1190    ///
1191    /// The indirect parameters building shader writes to this buffer, while the
1192    /// `multi_draw_indirect` or `multi_draw_indirect_count` commands read from
1193    /// it to perform the draws.
1194    indirect_draw_parameters: UninitBufferVec<IP>,
1195
1196    /// The GPU buffer that holds the data used to construct indirect draw
1197    /// parameters for meshes.
1198    ///
1199    /// The GPU mesh preprocessing shader writes to this buffer, and the
1200    /// indirect parameters building shader reads this buffer to construct the
1201    /// indirect draw parameters.
1202    cpu_metadata: RawBufferVec<IndirectParametersCpuMetadata>,
1203
1204    /// The GPU buffer that holds data built by the GPU used to construct
1205    /// indirect draw parameters for meshes.
1206    ///
1207    /// The GPU mesh preprocessing shader writes to this buffer, and the
1208    /// indirect parameters building shader reads this buffer to construct the
1209    /// indirect draw parameters.
1210    gpu_metadata: UninitBufferVec<IndirectParametersGpuMetadata>,
1211
1212    /// The GPU buffer that holds the number of indirect draw commands for each
1213    /// phase of each view, for meshes.
1214    ///
1215    /// The indirect parameters building shader writes to this buffer, and the
1216    /// `multi_draw_indirect_count` command reads from it in order to know how
1217    /// many indirect draw commands to process.
1218    batch_sets: RawBufferVec<IndirectBatchSet>,
1219}
1220
1221impl<IP> MeshClassIndirectParametersBuffers<IP>
1222where
1223    IP: Clone + ShaderSize + WriteInto,
1224{
1225    fn new(
1226        allow_copies_from_indirect_parameter_buffers: bool,
1227    ) -> MeshClassIndirectParametersBuffers<IP> {
1228        let mut indirect_parameter_buffer_usages = BufferUsages::STORAGE | BufferUsages::INDIRECT;
1229        if allow_copies_from_indirect_parameter_buffers {
1230            indirect_parameter_buffer_usages |= BufferUsages::COPY_SRC;
1231        }
1232
1233        MeshClassIndirectParametersBuffers {
1234            indirect_draw_parameters: UninitBufferVec::new(indirect_parameter_buffer_usages),
1235            cpu_metadata: RawBufferVec::new(BufferUsages::STORAGE),
1236            gpu_metadata: UninitBufferVec::new(BufferUsages::STORAGE),
1237            batch_sets: RawBufferVec::new(indirect_parameter_buffer_usages),
1238        }
1239    }
1240
1241    /// Returns the GPU buffer that stores the indirect draw parameters for
1242    /// indexed meshes.
1243    ///
1244    /// The indirect parameters building shader writes to this buffer, while the
1245    /// `multi_draw_indirect` or `multi_draw_indirect_count` commands read from
1246    /// it to perform the draws.
1247    #[inline]
1248    pub fn data_buffer(&self) -> Option<&Buffer> {
1249        self.indirect_draw_parameters.buffer()
1250    }
1251
1252    /// Returns the GPU buffer that holds the CPU-constructed data used to
1253    /// construct indirect draw parameters for meshes.
1254    ///
1255    /// The CPU writes to this buffer, and the indirect parameters building
1256    /// shader reads this buffer to construct the indirect draw parameters.
1257    #[inline]
1258    pub fn cpu_metadata_buffer(&self) -> Option<&Buffer> {
1259        self.cpu_metadata.buffer()
1260    }
1261
1262    /// Returns the GPU buffer that holds the GPU-constructed data used to
1263    /// construct indirect draw parameters for meshes.
1264    ///
1265    /// The GPU mesh preprocessing shader writes to this buffer, and the
1266    /// indirect parameters building shader reads this buffer to construct the
1267    /// indirect draw parameters.
1268    #[inline]
1269    pub fn gpu_metadata_buffer(&self) -> Option<&Buffer> {
1270        self.gpu_metadata.buffer()
1271    }
1272
1273    /// Returns the GPU buffer that holds the number of indirect draw commands
1274    /// for each phase of each view.
1275    ///
1276    /// The indirect parameters building shader writes to this buffer, and the
1277    /// `multi_draw_indirect_count` command reads from it in order to know how
1278    /// many indirect draw commands to process.
1279    #[inline]
1280    pub fn batch_sets_buffer(&self) -> Option<&Buffer> {
1281        self.batch_sets.buffer()
1282    }
1283
1284    /// Reserves space for `count` new batches.
1285    ///
1286    /// This allocates in the [`Self::cpu_metadata`], [`Self::gpu_metadata`],
1287    /// and [`Self::indirect_draw_parameters`] buffers.
1288    fn allocate(&mut self, count: u32) -> u32 {
1289        let length = self.indirect_draw_parameters.len();
1290        self.cpu_metadata.reserve_internal(count as usize);
1291        self.gpu_metadata.add_multiple(count as usize);
1292        for _ in 0..count {
1293            self.indirect_draw_parameters.add();
1294            self.cpu_metadata
1295                .push(IndirectParametersCpuMetadata::default());
1296        }
1297        length as u32
1298    }
1299
1300    /// Sets the [`IndirectParametersCpuMetadata`] for the mesh at the given
1301    /// index.
1302    pub fn set(&mut self, index: u32, value: IndirectParametersCpuMetadata) {
1303        self.cpu_metadata.set(index, value);
1304    }
1305
1306    /// Returns the number of batches corresponding to meshes that are currently
1307    /// allocated.
1308    #[inline]
1309    pub fn batch_count(&self) -> usize {
1310        self.indirect_draw_parameters.len()
1311    }
1312
1313    /// Clears out all the buffers in preparation for a new frame.
1314    pub fn clear(&mut self) {
1315        self.indirect_draw_parameters.clear();
1316        self.cpu_metadata.clear();
1317        self.gpu_metadata.clear();
1318        self.batch_sets.clear();
1319    }
1320}
1321
1322impl FromWorld for GpuPreprocessingSupport {
1323    fn from_world(world: &mut World) -> Self {
1324        let adapter = world.resource::<RenderAdapter>();
1325        let device = world.resource::<RenderDevice>();
1326
1327        // Filter Android drivers that are incompatible with GPU preprocessing:
1328        // - We filter out Adreno 730 and earlier GPUs (except 720, as it's newer
1329        //   than 730).
1330        // - We filter out Mali GPUs with driver versions lower than 48.
1331        // - We limit Pixel 10 GPUs (all versions for now) to preprocessing only (no culling)
1332        fn is_non_supported_android_device(adapter_info: &RenderAdapterInfo) -> bool {
1333            crate::get_adreno_model(adapter_info).is_some_and(|model| model != 720 && model <= 730)
1334                || crate::get_mali_driver_version(adapter_info).is_some_and(|version| version < 48)
1335        }
1336        fn is_preprocessing_only_android_device(adapter_info: &RenderAdapterInfo) -> bool {
1337            crate::get_pixel10_driver_version(adapter_info).is_some()
1338        }
1339
1340        let culling_feature_support = device
1341            .features()
1342            .contains(Features::INDIRECT_FIRST_INSTANCE | Features::IMMEDIATES);
1343        // Depth downsampling for occlusion culling requires 12 textures
1344        // and the early occlusion culling pass requires 10 storage buffers
1345        let limit_support = device.limits().max_storage_textures_per_shader_stage >= 12 &&
1346            device.limits().max_storage_buffers_per_shader_stage >= 10 &&
1347            // Even if the adapter supports compute, we might be simulating a lack of
1348            // compute via device limits (see `WgpuSettingsPriority::WebGL2` and
1349            // `wgpu::Limits::downlevel_webgl2_defaults()`). This will have set all the
1350            // `max_compute_*` limits to zero, so we arbitrarily pick one as a canary.
1351            device.limits().max_compute_workgroup_storage_size != 0;
1352
1353        let downlevel_support = adapter
1354            .get_downlevel_capabilities()
1355            .flags
1356            .contains(DownlevelFlags::COMPUTE_SHADERS);
1357
1358        let adapter_info = RenderAdapterInfo(WgpuWrapper::new(adapter.get_info()));
1359
1360        let max_supported_mode = if device.limits().max_compute_workgroup_size_x == 0
1361            || is_non_supported_android_device(&adapter_info)
1362            || adapter_info.backend == wgpu::Backend::Gl
1363        {
1364            info_once!(
1365                "GPU preprocessing is not supported on this device. \
1366                Falling back to CPU preprocessing.",
1367            );
1368            GpuPreprocessingMode::None
1369        } else if !(culling_feature_support && limit_support && downlevel_support)
1370            || is_preprocessing_only_android_device(&adapter_info)
1371        {
1372            info_once!("Some GPU preprocessing are limited on this device.");
1373            GpuPreprocessingMode::PreprocessingOnly
1374        } else {
1375            info_once!("GPU preprocessing is fully supported on this device.");
1376            GpuPreprocessingMode::Culling
1377        };
1378
1379        GpuPreprocessingSupport { max_supported_mode }
1380    }
1381}
1382
1383impl<BD, BDI> BatchedInstanceBuffers<BD, BDI>
1384where
1385    BD: GpuArrayBufferable + Sync + Send + 'static,
1386    BDI: AtomicPod,
1387{
1388    /// Creates new buffers.
1389    pub fn new() -> Self {
1390        Self::default()
1391    }
1392
1393    /// Clears out the buffers in preparation for a new frame.
1394    pub fn clear(&mut self) {
1395        for phase_instance_buffer in self.phase_instance_buffers.values_mut() {
1396            phase_instance_buffer.clear();
1397        }
1398    }
1399}
1400
1401impl<BD> UntypedPhaseBatchedInstanceBuffers<BD>
1402where
1403    BD: GpuArrayBufferable + Sync + Send + 'static,
1404{
1405    pub fn new() -> Self {
1406        UntypedPhaseBatchedInstanceBuffers {
1407            data_buffer: UninitBufferVec::new(BufferUsages::STORAGE),
1408            work_item_buffers: HashMap::default(),
1409            late_indexed_indirect_parameters_buffer: RawBufferVec::new(
1410                BufferUsages::STORAGE | BufferUsages::INDIRECT,
1411            ),
1412            late_non_indexed_indirect_parameters_buffer: RawBufferVec::new(
1413                BufferUsages::STORAGE | BufferUsages::INDIRECT,
1414            ),
1415        }
1416    }
1417
1418    /// Returns the binding of the buffer that contains the per-instance data.
1419    ///
1420    /// This buffer needs to be filled in via a compute shader.
1421    pub fn instance_data_binding(&self) -> Option<BindingResource<'_>> {
1422        self.data_buffer
1423            .buffer()
1424            .map(|buffer| buffer.as_entire_binding())
1425    }
1426
1427    /// Clears out the buffers in preparation for a new frame.
1428    pub fn clear(&mut self) {
1429        self.data_buffer.clear();
1430        self.late_indexed_indirect_parameters_buffer.clear();
1431        self.late_non_indexed_indirect_parameters_buffer.clear();
1432
1433        // Clear each individual set of buffers, but don't depopulate the hash
1434        // table. We want to avoid reallocating these vectors every frame.
1435        for view_work_item_buffers in self.work_item_buffers.values_mut() {
1436            view_work_item_buffers.clear();
1437        }
1438    }
1439}
1440
1441impl<BD> Default for UntypedPhaseBatchedInstanceBuffers<BD>
1442where
1443    BD: GpuArrayBufferable + Sync + Send + 'static,
1444{
1445    fn default() -> Self {
1446        Self::new()
1447    }
1448}
1449
1450/// Information about a single render batch set that we're building up during a
1451/// sorted render phase.
1452struct SortedRenderBatchSet<F>
1453where
1454    F: GetBatchData,
1455{
1456    /// The index of the first phase item in this batch in the list of phase
1457    /// items.
1458    phase_item_start_index: u32,
1459
1460    /// The index of the first instance in this batch in the instance buffer.
1461    instance_start_index: u32,
1462
1463    /// True if the mesh in question has an index buffer; false otherwise.
1464    indexed: bool,
1465
1466    /// The index of the indirect parameters for this batch in the
1467    /// [`IndirectParametersBuffers`].
1468    ///
1469    /// If CPU culling is being used, then this will be `None`.
1470    indirect_parameters_index_range: Option<Range<u32>>,
1471
1472    /// Metadata that can be used to determine whether an instance can be placed
1473    /// into this batch.
1474    ///
1475    /// If `None`, the item inside is unbatchable.
1476    meta: Option<(BatchSetMeta<F::BatchSetCompareData>, F::BatchCompareData)>,
1477}
1478
1479impl<F> SortedRenderBatchSet<F>
1480where
1481    F: GetBatchData,
1482{
1483    /// Finalizes this batch and updates the [`SortedRenderPhase`] with the
1484    /// appropriate indices.
1485    ///
1486    /// `instance_end_index` is the index of the last instance in this batch
1487    /// plus one.
1488    fn flush<I>(
1489        self,
1490        instance_end_index: u32,
1491        phase: &mut SortedRenderPhase<I>,
1492        phase_indirect_parameters_buffers: &mut UntypedPhaseIndirectParametersBuffers,
1493    ) where
1494        I: CachedRenderPipelinePhaseItem + SortedPhaseItem,
1495    {
1496        let (batch_range, batch_extra_index) =
1497            phase.items[self.phase_item_start_index as usize].batch_range_and_extra_index_mut();
1498        *batch_range = self.instance_start_index..instance_end_index;
1499        *batch_extra_index = match self.indirect_parameters_index_range {
1500            Some(ref indirect_parameters_index_range) => {
1501                PhaseItemExtraIndex::IndirectParametersIndex {
1502                    range: (*indirect_parameters_index_range).clone(),
1503                    batch_set_index: None,
1504                }
1505            }
1506            None => PhaseItemExtraIndex::None,
1507        };
1508        if let Some(ref indirect_parameters_index_range) = self.indirect_parameters_index_range {
1509            phase_indirect_parameters_buffers
1510                .add_batch_set(self.indexed, indirect_parameters_index_range.start);
1511        }
1512    }
1513}
1514
1515/// A system that runs early in extraction and clears out all the
1516/// [`BatchedInstanceBuffers`] for the frame.
1517///
1518/// We have to run this during extraction because, if GPU preprocessing is in
1519/// use, the extraction phase will write to the mesh input uniform buffers
1520/// directly, so the buffers need to be cleared before then.
1521pub fn clear_batched_gpu_instance_buffers<GFBD>(
1522    gpu_batched_instance_buffers: Option<
1523        ResMut<BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>>,
1524    >,
1525) where
1526    GFBD: GetFullBatchData,
1527{
1528    // Don't clear the entire table, because that would delete the buffers, and
1529    // we want to reuse those allocations.
1530    if let Some(mut gpu_batched_instance_buffers) = gpu_batched_instance_buffers {
1531        gpu_batched_instance_buffers.clear();
1532    }
1533}
1534
1535/// A system that removes GPU preprocessing work item buffers that correspond to
1536/// deleted [`ExtractedView`]s.
1537///
1538/// This is a separate system from [`clear_batched_gpu_instance_buffers`]
1539/// because [`ExtractedView`]s aren't created until after the extraction phase
1540/// is completed.
1541pub fn delete_old_work_item_buffers<GFBD>(
1542    mut gpu_batched_instance_buffers: ResMut<
1543        BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
1544    >,
1545    extracted_views: Query<&ExtractedView>,
1546) where
1547    GFBD: GetFullBatchData,
1548{
1549    let retained_view_entities: HashSet<_> = extracted_views
1550        .iter()
1551        .map(|extracted_view| extracted_view.retained_view_entity)
1552        .collect();
1553    for phase_instance_buffers in gpu_batched_instance_buffers
1554        .phase_instance_buffers
1555        .values_mut()
1556    {
1557        phase_instance_buffers
1558            .work_item_buffers
1559            .retain(|retained_view_entity, _| {
1560                retained_view_entities.contains(retained_view_entity)
1561            });
1562    }
1563}
1564
1565/// Batch the items in a sorted render phase, when GPU instance buffer building
1566/// is in use. This means comparing metadata needed to draw each phase item and
1567/// trying to combine the draws into a batch.
1568pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
1569    mut phase_batched_instance_buffers: ResMut<PhaseBatchedInstanceBuffers<I, GFBD::BufferData>>,
1570    mut phase_indirect_parameters_buffers: ResMut<PhaseIndirectParametersBuffers<I>>,
1571    mut sorted_render_phases: ResMut<ViewSortedRenderPhases<I>>,
1572    mut views: Query<(
1573        &ExtractedView,
1574        Has<NoIndirectDrawing>,
1575        Has<OcclusionCulling>,
1576    )>,
1577    system_param_item: StaticSystemParam<GFBD::Param>,
1578) where
1579    I: CachedRenderPipelinePhaseItem + SortedPhaseItem,
1580    GFBD: GetFullBatchData,
1581{
1582    // We only process GPU-built batch data in this function.
1583    let UntypedPhaseBatchedInstanceBuffers {
1584        ref mut data_buffer,
1585        ref mut work_item_buffers,
1586        ref mut late_indexed_indirect_parameters_buffer,
1587        ref mut late_non_indexed_indirect_parameters_buffer,
1588    } = phase_batched_instance_buffers.buffers;
1589
1590    for (extracted_view, no_indirect_drawing, gpu_occlusion_culling) in &mut views {
1591        let Some(phase) = sorted_render_phases.get_mut(&extracted_view.retained_view_entity) else {
1592            continue;
1593        };
1594
1595        // Create the work item buffer if necessary.
1596        let work_item_buffer = get_or_create_work_item_buffer::<I>(
1597            work_item_buffers,
1598            extracted_view.retained_view_entity,
1599            no_indirect_drawing,
1600            gpu_occlusion_culling,
1601        );
1602
1603        // Initialize those work item buffers in preparation for this new frame.
1604        init_work_item_buffers(
1605            work_item_buffer,
1606            late_indexed_indirect_parameters_buffer,
1607            late_non_indexed_indirect_parameters_buffer,
1608        );
1609
1610        // Walk through the list of phase items, building up batches as we go.
1611        let mut batch_set: Option<SortedRenderBatchSet<GFBD>> = None;
1612
1613        for current_index in 0..phase.items.len() {
1614            // Get the index of the input data, and comparison metadata, for
1615            // this entity.
1616            let item = &phase.items[current_index];
1617            let entity = item.main_entity();
1618            let item_is_indexed = item.indexed();
1619            let current_batch_input_index =
1620                GFBD::get_index_and_compare_data(&system_param_item, entity);
1621
1622            // Unpack that index and metadata. Note that it's possible for index
1623            // and/or metadata to not be present, which signifies that this
1624            // entity is unbatchable. In that case, we break the batch here.
1625            // If the index isn't present the item is not part of this pipeline and so will be skipped.
1626            let Some((current_input_index, current_meta)) = current_batch_input_index else {
1627                // Break a batch if we need to.
1628                if let Some(batch_set) = batch_set.take() {
1629                    batch_set.flush(
1630                        data_buffer.len() as u32,
1631                        phase,
1632                        &mut phase_indirect_parameters_buffers.buffers,
1633                    );
1634                }
1635
1636                continue;
1637            };
1638            let current_meta = if I::AUTOMATIC_BATCHING {
1639                current_meta.map(|(batch_set_meta, batch_meta)| {
1640                    (
1641                        BatchSetMeta::new(&phase.items[current_index], batch_set_meta),
1642                        batch_meta,
1643                    )
1644                })
1645            } else {
1646                None
1647            };
1648
1649            // Determine if this entity can be included in the batch we're
1650            // building up.
1651            let can_batch = match batch_set.as_ref() {
1652                None => SortedPhaseItemBatchability::BreakBatchSet,
1653                Some(batch_set) => match (&current_meta, &batch_set.meta) {
1654                    (
1655                        &Some((ref current_batch_set_key, ref current_bin_key)),
1656                        &Some((ref batch_set_key, ref bin_key)),
1657                    ) => {
1658                        if *current_batch_set_key == *batch_set_key {
1659                            if *current_bin_key == *bin_key {
1660                                SortedPhaseItemBatchability::BatchOk
1661                            } else {
1662                                SortedPhaseItemBatchability::BreakBatch
1663                            }
1664                        } else {
1665                            SortedPhaseItemBatchability::BreakBatchSet
1666                        }
1667                    }
1668                    _ => SortedPhaseItemBatchability::BreakBatchSet,
1669                },
1670            };
1671
1672            // Make space in the data buffer for this instance.
1673            let output_index = data_buffer.add() as u32;
1674
1675            // If we can't batch, break the existing batch or batch set and make
1676            // a new one.
1677            match can_batch {
1678                SortedPhaseItemBatchability::BreakBatchSet => {
1679                    // Flush the existing batch set.
1680                    if let Some(batch_set) = batch_set.take() {
1681                        batch_set.flush(
1682                            output_index,
1683                            phase,
1684                            &mut phase_indirect_parameters_buffers.buffers,
1685                        );
1686                    }
1687
1688                    let indirect_parameters_index = phase_indirect_parameters_buffers
1689                        .allocate(no_indirect_drawing, item_is_indexed);
1690
1691                    // Start a new batch.
1692                    if let Some(indirect_parameters_index) = indirect_parameters_index {
1693                        GFBD::write_batch_indirect_parameters_metadata(
1694                            item_is_indexed,
1695                            output_index,
1696                            None,
1697                            &mut phase_indirect_parameters_buffers.buffers,
1698                            indirect_parameters_index,
1699                        );
1700                    }
1701
1702                    batch_set = Some(SortedRenderBatchSet {
1703                        phase_item_start_index: current_index as u32,
1704                        instance_start_index: output_index,
1705                        indexed: item_is_indexed,
1706                        indirect_parameters_index_range: indirect_parameters_index
1707                            .map(|i| i..(i + 1)),
1708                        meta: current_meta,
1709                    });
1710                }
1711
1712                SortedPhaseItemBatchability::BreakBatch => {
1713                    // Allocate the indirect parameters.
1714                    let maybe_indirect_parameters_index = phase_indirect_parameters_buffers
1715                        .allocate(no_indirect_drawing, item_is_indexed);
1716
1717                    if let (&mut Some(ref mut batch_set), Some(indirect_parameters_index)) =
1718                        (&mut batch_set, maybe_indirect_parameters_index)
1719                    {
1720                        GFBD::write_batch_indirect_parameters_metadata(
1721                            item_is_indexed,
1722                            output_index,
1723                            None,
1724                            &mut phase_indirect_parameters_buffers.buffers,
1725                            indirect_parameters_index,
1726                        );
1727
1728                        batch_set.meta = current_meta;
1729
1730                        let indirect_parameters_index_range = batch_set
1731                            .indirect_parameters_index_range
1732                            .as_mut()
1733                            .expect("Can't allocate in a multidraw set if we aren't multidrawing");
1734                        debug_assert_eq!(
1735                            indirect_parameters_index,
1736                            indirect_parameters_index_range.end
1737                        );
1738                        indirect_parameters_index_range.end += 1;
1739                    }
1740                }
1741
1742                SortedPhaseItemBatchability::BatchOk => {}
1743            };
1744
1745            // Add a new preprocessing work item so that the preprocessing
1746            // shader will copy the per-instance data over.
1747            if let Some(batch_set) = batch_set.as_ref() {
1748                work_item_buffer.push(
1749                    item_is_indexed,
1750                    PreprocessWorkItem {
1751                        input_index: current_input_index.into(),
1752                        output_or_indirect_parameters_index: match (
1753                            no_indirect_drawing,
1754                            &batch_set.indirect_parameters_index_range,
1755                        ) {
1756                            (true, _) => output_index,
1757                            (false, Some(indirect_parameters_index_range)) => {
1758                                indirect_parameters_index_range.end - 1
1759                            }
1760                            (false, None) => 0,
1761                        },
1762                    },
1763                );
1764            }
1765        }
1766
1767        // Flush the final batch set if necessary.
1768        if let Some(batch_set) = batch_set.take() {
1769            batch_set.flush(
1770                data_buffer.len() as u32,
1771                phase,
1772                &mut phase_indirect_parameters_buffers.buffers,
1773            );
1774        }
1775    }
1776}
1777
1778/// How a single sorted phase item can be batched with the previous phase item.
1779#[derive(Clone, Copy, PartialEq)]
1780enum SortedPhaseItemBatchability {
1781    /// The item can be batched with the previous item.
1782    BatchOk,
1783    /// The item can't be batched with the previous item, but can still go in
1784    /// the same batch set.
1785    ///
1786    /// That is, the item can be multi-drawn with the previous item.
1787    BreakBatch,
1788    /// The item needs to create a new batch set.
1789    BreakBatchSet,
1790}
1791
1792/// Creates batches for a render phase that uses bins.
1793pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
1794    mut phase_batched_instance_buffers: ResMut<PhaseBatchedInstanceBuffers<BPI, GFBD::BufferData>>,
1795    phase_indirect_parameters_buffers: ResMut<PhaseIndirectParametersBuffers<BPI>>,
1796    mut binned_render_phases: ResMut<ViewBinnedRenderPhases<BPI>>,
1797    mut views: Query<
1798        (
1799            &ExtractedView,
1800            Has<NoIndirectDrawing>,
1801            Has<OcclusionCulling>,
1802        ),
1803        With<ExtractedView>,
1804    >,
1805    param: StaticSystemParam<GFBD::Param>,
1806) where
1807    BPI: BinnedPhaseItem,
1808    GFBD: GetFullBatchData,
1809{
1810    let system_param_item = param.into_inner();
1811
1812    let phase_indirect_parameters_buffers = phase_indirect_parameters_buffers.into_inner();
1813
1814    let UntypedPhaseBatchedInstanceBuffers {
1815        ref mut data_buffer,
1816        ref mut work_item_buffers,
1817        ref mut late_indexed_indirect_parameters_buffer,
1818        ref mut late_non_indexed_indirect_parameters_buffer,
1819    } = phase_batched_instance_buffers.buffers;
1820
1821    for (extracted_view, no_indirect_drawing, gpu_occlusion_culling) in &mut views {
1822        let Some(phase) = binned_render_phases.get_mut(&extracted_view.retained_view_entity) else {
1823            continue;
1824        };
1825
1826        // Create the work item buffer if necessary; otherwise, just mark it as
1827        // used this frame.
1828        let work_item_buffer = get_or_create_work_item_buffer::<BPI>(
1829            work_item_buffers,
1830            extracted_view.retained_view_entity,
1831            no_indirect_drawing,
1832            gpu_occlusion_culling,
1833        );
1834
1835        // Initialize those work item buffers in preparation for this new frame.
1836        init_work_item_buffers(
1837            work_item_buffer,
1838            late_indexed_indirect_parameters_buffer,
1839            late_non_indexed_indirect_parameters_buffer,
1840        );
1841
1842        // We prepare unbatchables, batchables, and multidrawables in that
1843        // order. This is because:
1844        //
1845        // 1. The `PreprocessWorkItem`s are stored in a `PartialBufferVec`.
1846        // 2. `PreprocessWorkItem`s corresponding to multidrawable mesh
1847        // instances are built on GPU via the `unpack_bins` shader.
1848        // 3. `PreprocessWorkItem`s corresponding to unbatchable and
1849        // batchable-but-not-multidrawable mesh instances are currently built on
1850        // the CPU.
1851        // 4. The `PartialBufferVec`s type enforces that CPU-initialized values
1852        // precede the uninitialized (i.e. GPU-initialized) ones.
1853        //
1854        // Thus, we have to make sure the preprocessing work items that the GPU
1855        // will build follow the preprocessing work items that the CPU built. We
1856        // do so by preparing the items in the order listed above.
1857
1858        // Prepare unbatchables.
1859
1860        for (key, unbatchables) in &mut phase.unbatchable_meshes {
1861            // Allocate the indirect parameters if necessary.
1862            let mut indirect_parameters_offset = if no_indirect_drawing {
1863                None
1864            } else if key.0.indexed() {
1865                Some(
1866                    phase_indirect_parameters_buffers
1867                        .buffers
1868                        .indexed
1869                        .allocate(unbatchables.entities.len() as u32),
1870                )
1871            } else {
1872                Some(
1873                    phase_indirect_parameters_buffers
1874                        .buffers
1875                        .non_indexed
1876                        .allocate(unbatchables.entities.len() as u32),
1877                )
1878            };
1879
1880            for main_entity in unbatchables.entities.keys() {
1881                let Some(input_index) = GFBD::get_binned_index(&system_param_item, *main_entity)
1882                else {
1883                    continue;
1884                };
1885                let output_index = data_buffer.add() as u32;
1886
1887                if let Some(ref mut indirect_parameters_index) = indirect_parameters_offset {
1888                    // We're in indirect mode, so add an indirect parameters
1889                    // index.
1890                    GFBD::write_batch_indirect_parameters_metadata(
1891                        key.0.indexed(),
1892                        output_index,
1893                        None,
1894                        &mut phase_indirect_parameters_buffers.buffers,
1895                        *indirect_parameters_index,
1896                    );
1897                    work_item_buffer.push(
1898                        key.0.indexed(),
1899                        PreprocessWorkItem {
1900                            input_index: input_index.into(),
1901                            output_or_indirect_parameters_index: *indirect_parameters_index,
1902                        },
1903                    );
1904                    unbatchables
1905                        .buffer_indices
1906                        .add(UnbatchableBinnedEntityIndices {
1907                            instance_index: *indirect_parameters_index,
1908                            extra_index: PhaseItemExtraIndex::IndirectParametersIndex {
1909                                range: *indirect_parameters_index..(*indirect_parameters_index + 1),
1910                                batch_set_index: None,
1911                            },
1912                        });
1913                    phase_indirect_parameters_buffers
1914                        .buffers
1915                        .add_batch_set(key.0.indexed(), *indirect_parameters_index);
1916                    *indirect_parameters_index += 1;
1917                } else {
1918                    work_item_buffer.push(
1919                        key.0.indexed(),
1920                        PreprocessWorkItem {
1921                            input_index: input_index.into(),
1922                            output_or_indirect_parameters_index: output_index,
1923                        },
1924                    );
1925                    unbatchables
1926                        .buffer_indices
1927                        .add(UnbatchableBinnedEntityIndices {
1928                            instance_index: output_index,
1929                            extra_index: PhaseItemExtraIndex::None,
1930                        });
1931                }
1932            }
1933        }
1934
1935        // Prepare batchables.
1936
1937        for (key, bin) in &phase.batchable_meshes {
1938            let mut batch: Option<BinnedRenderPhaseBatch> = None;
1939            for (&main_entity, &input_index) in bin.entities() {
1940                let output_index = data_buffer.add() as u32;
1941
1942                match batch {
1943                    Some(ref mut batch) => {
1944                        batch.instance_range.end = output_index + 1;
1945
1946                        // Append to the current batch.
1947                        //
1948                        // If we're in indirect mode, then we write the first
1949                        // output index of this batch, so that we have a
1950                        // tightly-packed buffer if GPU culling discards some of
1951                        // the instances. Otherwise, we can just write the
1952                        // output index directly.
1953                        work_item_buffer.push(
1954                            key.0.indexed(),
1955                            PreprocessWorkItem {
1956                                input_index: *input_index,
1957                                output_or_indirect_parameters_index: match (
1958                                    no_indirect_drawing,
1959                                    &batch.extra_index,
1960                                ) {
1961                                    (true, _) => output_index,
1962                                    (
1963                                        false,
1964                                        PhaseItemExtraIndex::IndirectParametersIndex {
1965                                            range: indirect_parameters_range,
1966                                            ..
1967                                        },
1968                                    ) => indirect_parameters_range.start,
1969                                    (false, &PhaseItemExtraIndex::DynamicOffset(_))
1970                                    | (false, &PhaseItemExtraIndex::None) => 0,
1971                                },
1972                            },
1973                        );
1974                    }
1975
1976                    None if !no_indirect_drawing => {
1977                        // Start a new batch, in indirect mode.
1978                        let indirect_parameters_index = phase_indirect_parameters_buffers
1979                            .buffers
1980                            .allocate(key.0.indexed(), 1);
1981                        let batch_set_index = phase_indirect_parameters_buffers
1982                            .buffers
1983                            .get_next_batch_set_index(key.0.indexed());
1984
1985                        GFBD::write_batch_indirect_parameters_metadata(
1986                            key.0.indexed(),
1987                            output_index,
1988                            batch_set_index,
1989                            &mut phase_indirect_parameters_buffers.buffers,
1990                            indirect_parameters_index,
1991                        );
1992                        work_item_buffer.push(
1993                            key.0.indexed(),
1994                            PreprocessWorkItem {
1995                                input_index: *input_index,
1996                                output_or_indirect_parameters_index: indirect_parameters_index,
1997                            },
1998                        );
1999                        batch = Some(BinnedRenderPhaseBatch {
2000                            representative_entity: (Entity::PLACEHOLDER, main_entity),
2001                            instance_range: output_index..output_index + 1,
2002                            extra_index: PhaseItemExtraIndex::IndirectParametersIndex {
2003                                range: indirect_parameters_index..(indirect_parameters_index + 1),
2004                                batch_set_index: None,
2005                            },
2006                        });
2007                    }
2008
2009                    None => {
2010                        // Start a new batch, in direct mode.
2011                        work_item_buffer.push(
2012                            key.0.indexed(),
2013                            PreprocessWorkItem {
2014                                input_index: *input_index,
2015                                output_or_indirect_parameters_index: output_index,
2016                            },
2017                        );
2018                        batch = Some(BinnedRenderPhaseBatch {
2019                            representative_entity: (Entity::PLACEHOLDER, main_entity),
2020                            instance_range: output_index..output_index + 1,
2021                            extra_index: PhaseItemExtraIndex::None,
2022                        });
2023                    }
2024                }
2025            }
2026
2027            if let Some(batch) = batch {
2028                match phase.batch_sets {
2029                    BinnedRenderPhaseBatchSets::DynamicUniforms(_) => {
2030                        error!("Dynamic uniform batch sets shouldn't be used here");
2031                    }
2032                    BinnedRenderPhaseBatchSets::Direct(ref mut vec) => {
2033                        vec.push(batch);
2034                    }
2035                    BinnedRenderPhaseBatchSets::MultidrawIndirect(ref mut vec) => {
2036                        // The Bevy renderer will never mark a mesh as batchable
2037                        // but not multidrawable if multidraw is in use.
2038                        // However, custom render pipelines might do so, such as
2039                        // the `specialized_mesh_pipeline` example.
2040                        vec.push(BinnedRenderPhaseBatchSet {
2041                            first_batch: batch,
2042                            batch_count: 1,
2043                            bin_key: key.1.clone(),
2044                            index: phase_indirect_parameters_buffers
2045                                .buffers
2046                                .batch_set_count(key.0.indexed())
2047                                as u32,
2048                            // Unused.
2049                            first_work_item_index: 0,
2050                        });
2051                    }
2052                }
2053            }
2054        }
2055
2056        // Prepare multidrawables.
2057
2058        if let (
2059            &mut BinnedRenderPhaseBatchSets::MultidrawIndirect(ref mut batch_sets),
2060            &mut PreprocessWorkItemBuffers::Indirect {
2061                indexed: ref mut indexed_work_item_buffer,
2062                non_indexed: ref mut non_indexed_work_item_buffer,
2063                gpu_occlusion_culling: ref mut gpu_occlusion_culling_buffers,
2064            },
2065        ) = (&mut phase.batch_sets, &mut *work_item_buffer)
2066        {
2067            // Initialize the state for both indexed and non-indexed meshes.
2068            let mut indexed_preparer: MultidrawableBatchSetPreparer<BPI, GFBD> =
2069                MultidrawableBatchSetPreparer::new(
2070                    phase_indirect_parameters_buffers.buffers.batch_count(true) as u32,
2071                    phase_indirect_parameters_buffers
2072                        .buffers
2073                        .indexed
2074                        .batch_sets
2075                        .len() as u32,
2076                );
2077            let mut non_indexed_preparer: MultidrawableBatchSetPreparer<BPI, GFBD> =
2078                MultidrawableBatchSetPreparer::new(
2079                    phase_indirect_parameters_buffers.buffers.batch_count(false) as u32,
2080                    phase_indirect_parameters_buffers
2081                        .buffers
2082                        .non_indexed
2083                        .batch_sets
2084                        .len() as u32,
2085                );
2086
2087            // Prepare each batch set.
2088            for (batch_set_key, bins) in &phase.multidrawable_meshes {
2089                if batch_set_key.indexed() {
2090                    indexed_preparer.prepare_multidrawable_binned_batch_set(
2091                        bins,
2092                        data_buffer,
2093                        indexed_work_item_buffer,
2094                        &mut phase_indirect_parameters_buffers.buffers.indexed,
2095                        batch_sets,
2096                    );
2097                } else {
2098                    non_indexed_preparer.prepare_multidrawable_binned_batch_set(
2099                        bins,
2100                        data_buffer,
2101                        non_indexed_work_item_buffer,
2102                        &mut phase_indirect_parameters_buffers.buffers.non_indexed,
2103                        batch_sets,
2104                    );
2105                }
2106            }
2107
2108            // Reserve space in the occlusion culling buffers, if necessary.
2109            if let Some(gpu_occlusion_culling_buffers) = gpu_occlusion_culling_buffers {
2110                gpu_occlusion_culling_buffers
2111                    .late_indexed
2112                    .add_multiple(indexed_preparer.work_item_count);
2113                gpu_occlusion_culling_buffers
2114                    .late_non_indexed
2115                    .add_multiple(non_indexed_preparer.work_item_count);
2116            }
2117        }
2118    }
2119}
2120
2121/// The state that [`batch_and_prepare_binned_render_phase`] uses to construct
2122/// multidrawable batch sets.
2123///
2124/// The [`batch_and_prepare_binned_render_phase`] system maintains two of these:
2125/// one for indexed meshes and one for non-indexed meshes.
2126struct MultidrawableBatchSetPreparer<BPI, GFBD>
2127where
2128    BPI: BinnedPhaseItem,
2129    GFBD: GetFullBatchData,
2130{
2131    /// The offset in the indirect parameters buffer at which the next indirect
2132    /// parameters will be written.
2133    indirect_parameters_index: u32,
2134    /// The number of batch sets we've built so far for this mesh class.
2135    batch_set_index: u32,
2136    /// The number of work items we've emitted so far for this mesh class.
2137    work_item_count: usize,
2138    phantom: PhantomData<(BPI, GFBD)>,
2139}
2140
2141impl<BPI, GFBD> MultidrawableBatchSetPreparer<BPI, GFBD>
2142where
2143    BPI: BinnedPhaseItem,
2144    GFBD: GetFullBatchData,
2145{
2146    /// Creates a new [`MultidrawableBatchSetPreparer`] that will start writing
2147    /// indirect parameters and batch sets at the given indices.
2148    #[inline]
2149    fn new(initial_indirect_parameters_index: u32, initial_batch_set_index: u32) -> Self {
2150        MultidrawableBatchSetPreparer {
2151            indirect_parameters_index: initial_indirect_parameters_index,
2152            batch_set_index: initial_batch_set_index,
2153            work_item_count: 0,
2154            phantom: PhantomData,
2155        }
2156    }
2157
2158    /// Creates batch sets and writes the GPU data needed to draw all visible
2159    /// entities of one mesh class in the given batch set.
2160    ///
2161    /// The *mesh class* represents whether the mesh has indices or not.
2162    #[inline]
2163    fn prepare_multidrawable_binned_batch_set<IP>(
2164        &mut self,
2165        batch_set: &RenderMultidrawableBatchSet<BPI>,
2166        data_buffer: &mut UninitBufferVec<GFBD::BufferData>,
2167        work_item_buffer: &mut PartialBufferVec<PreprocessWorkItem>,
2168        mesh_class_buffers: &mut MeshClassIndirectParametersBuffers<IP>,
2169        batch_sets: &mut Vec<BinnedRenderPhaseBatchSet<BPI::BinKey>>,
2170    ) where
2171        IP: Clone + ShaderSize + WriteInto,
2172    {
2173        let current_indexed_batch_set_index = self.batch_set_index;
2174        let current_output_index = data_buffer.len() as u32;
2175        let first_work_item_index = work_item_buffer.len() as u32;
2176
2177        let indirect_parameters_base = self.indirect_parameters_index;
2178
2179        // We're going to write the first entity into the batch set. Do this
2180        // here so that we can preload the bin into cache as a side effect.
2181        let Some((first_bin_key, first_bin_index)) = batch_set.bin_key_to_bin_index.iter().next()
2182        else {
2183            return;
2184        };
2185        let first_bin = batch_set
2186            .bin(*first_bin_index)
2187            .expect("At least one bin must be present in each batch set");
2188        let first_bin_len = first_bin.entity_to_binned_mesh_instance_index.len();
2189        let first_bin_entity = batch_set
2190            .representative_entity()
2191            .unwrap_or(MainEntity::from(Entity::PLACEHOLDER));
2192
2193        // Calculate where the mesh uniform (not the mesh input uniform) should
2194        // go for each mesh instance in our bins. This entails performing a
2195        // prefix sum on the number of elements in each bin. First, initialize
2196        // each base output index to zero.
2197        //
2198        // TODO: Eventually, this should be done on GPU with a prefix sum. We
2199        // don't want any per-bin work to be done on CPU for bins that didn't
2200        // change since the last frame.
2201        let cpu_metadata_offset = mesh_class_buffers.cpu_metadata.len() as u32;
2202        for _ in 0..batch_set.bin_count() {
2203            mesh_class_buffers
2204                .cpu_metadata
2205                .push(IndirectParametersCpuMetadata {
2206                    // We fill this in later.
2207                    base_output_index: 0,
2208                    batch_set_index: self.batch_set_index,
2209                });
2210        }
2211
2212        // Next, traverse each bin and allocate the position of each mesh
2213        // uniform in it. Additionally, reserve space for the mesh instances in
2214        // the buffers.
2215        for bin_index in batch_set.bin_key_to_bin_index.values() {
2216            let bin = batch_set.bin(*bin_index).expect("Bin not present");
2217
2218            // Allocate the indirect parameters.
2219            let indirect_parameters_offset = *batch_set
2220                .gpu_buffers
2221                .bin_index_to_indirect_parameters_offset_buffer
2222                .get(bin_index.0)
2223                .unwrap();
2224            mesh_class_buffers.cpu_metadata.values_mut()
2225                [cpu_metadata_offset as usize + indirect_parameters_offset as usize]
2226                .base_output_index = data_buffer.len() as u32;
2227
2228            // Reserve space for the appropriate number of entities in the work
2229            // item buffer and data buffer. Also, advance the output index and
2230            // work item count.
2231            let bin_entity_count = bin.entity_to_binned_mesh_instance_index.len();
2232            work_item_buffer.push_multiple_uninit(bin_entity_count);
2233            data_buffer.add_multiple(bin_entity_count);
2234            self.work_item_count += bin_entity_count;
2235        }
2236
2237        // Reserve space for the bins in this batch set in the GPU buffers.
2238        let bin_count = batch_set.bin_count();
2239        mesh_class_buffers.gpu_metadata.add_multiple(bin_count);
2240        mesh_class_buffers
2241            .indirect_draw_parameters
2242            .add_multiple(bin_count);
2243
2244        // Write the information the GPU will need about this batch set.
2245        mesh_class_buffers.batch_sets.push(IndirectBatchSet {
2246            indirect_parameters_base,
2247            indirect_parameters_count: 0,
2248        });
2249
2250        self.indirect_parameters_index += bin_count as u32;
2251        self.batch_set_index += 1;
2252
2253        // Record the batch set. The render node later processes this record to
2254        // render the batches.
2255        batch_sets.push(BinnedRenderPhaseBatchSet {
2256            first_batch: BinnedRenderPhaseBatch {
2257                representative_entity: (Entity::PLACEHOLDER, first_bin_entity),
2258                instance_range: current_output_index..(current_output_index + first_bin_len as u32),
2259                extra_index: PhaseItemExtraIndex::maybe_indirect_parameters_index(NonMaxU32::new(
2260                    indirect_parameters_base,
2261                )),
2262            },
2263            bin_key: (*first_bin_key).clone(),
2264            batch_count: self.indirect_parameters_index - indirect_parameters_base,
2265            index: current_indexed_batch_set_index,
2266            first_work_item_index,
2267        });
2268    }
2269}
2270
2271/// A system that gathers up the per-phase GPU buffers and inserts them into the
2272/// [`BatchedInstanceBuffers`] and [`IndirectParametersBuffers`] tables.
2273///
2274/// This runs after the [`batch_and_prepare_binned_render_phase`] or
2275/// [`batch_and_prepare_sorted_render_phase`] systems. It takes the per-phase
2276/// [`PhaseBatchedInstanceBuffers`] and [`PhaseIndirectParametersBuffers`]
2277/// resources and inserts them into the global [`BatchedInstanceBuffers`] and
2278/// [`IndirectParametersBuffers`] tables.
2279///
2280/// This system exists so that the [`batch_and_prepare_binned_render_phase`] and
2281/// [`batch_and_prepare_sorted_render_phase`] can run in parallel with one
2282/// another. If those two systems manipulated [`BatchedInstanceBuffers`] and
2283/// [`IndirectParametersBuffers`] directly, then they wouldn't be able to run in
2284/// parallel.
2285pub fn collect_buffers_for_phase<PI, GFBD>(
2286    mut phase_batched_instance_buffers: ResMut<PhaseBatchedInstanceBuffers<PI, GFBD::BufferData>>,
2287    mut phase_indirect_parameters_buffers: ResMut<PhaseIndirectParametersBuffers<PI>>,
2288    mut batched_instance_buffers: ResMut<
2289        BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
2290    >,
2291    mut indirect_parameters_buffers: ResMut<IndirectParametersBuffers>,
2292    indirect_parameters_buffers_settings: Res<IndirectParametersBuffersSettings>,
2293) where
2294    PI: PhaseItem,
2295    GFBD: GetFullBatchData + Send + Sync + 'static,
2296{
2297    // Insert the `PhaseBatchedInstanceBuffers` into the global table. Replace
2298    // the contents of the per-phase resource with the old batched instance
2299    // buffers in order to reuse allocations.
2300    let untyped_phase_batched_instance_buffers =
2301        mem::take(&mut phase_batched_instance_buffers.buffers);
2302    if let Some(mut old_untyped_phase_batched_instance_buffers) = batched_instance_buffers
2303        .phase_instance_buffers
2304        .insert(TypeId::of::<PI>(), untyped_phase_batched_instance_buffers)
2305    {
2306        old_untyped_phase_batched_instance_buffers.clear();
2307        phase_batched_instance_buffers.buffers = old_untyped_phase_batched_instance_buffers;
2308    }
2309
2310    // Insert the `PhaseIndirectParametersBuffers` into the global table.
2311    // Replace the contents of the per-phase resource with the old indirect
2312    // parameters buffers in order to reuse allocations.
2313    let untyped_phase_indirect_parameters_buffers = mem::replace(
2314        &mut phase_indirect_parameters_buffers.buffers,
2315        UntypedPhaseIndirectParametersBuffers::new(
2316            indirect_parameters_buffers_settings.allow_copies_from_indirect_parameter_buffers,
2317        ),
2318    );
2319    if let Some(mut old_untyped_phase_indirect_parameters_buffers) = indirect_parameters_buffers
2320        .insert(
2321            TypeId::of::<PI>(),
2322            untyped_phase_indirect_parameters_buffers,
2323        )
2324    {
2325        old_untyped_phase_indirect_parameters_buffers.clear();
2326        phase_indirect_parameters_buffers.buffers = old_untyped_phase_indirect_parameters_buffers;
2327    }
2328}
2329
2330/// A system that writes all instance buffers to the GPU.
2331pub fn write_batched_instance_buffers<GFBD>(
2332    render_device: Res<RenderDevice>,
2333    render_queue: Res<RenderQueue>,
2334    gpu_array_buffer: ResMut<BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>>,
2335    pipeline_cache: Res<PipelineCache>,
2336    mut bin_unpacking_buffers: ResMut<BinUnpackingBuffers>,
2337    mut sparse_buffer_update_jobs: ResMut<SparseBufferUpdateJobs>,
2338    mut sparse_buffer_update_bind_groups: ResMut<SparseBufferUpdateBindGroups>,
2339    sparse_buffer_update_pipelines: Res<SparseBufferUpdatePipelines>,
2340) where
2341    GFBD: GetFullBatchData,
2342{
2343    let BatchedInstanceBuffers {
2344        current_input_buffer,
2345        previous_input_buffer,
2346        phase_instance_buffers,
2347    } = gpu_array_buffer.into_inner();
2348
2349    let render_device = &*render_device;
2350    let render_queue = &*render_queue;
2351
2352    ComputeTaskPool::get().scope(|scope| {
2353        scope.spawn(async {
2354            #[cfg(feature = "trace")]
2355            let _span = bevy_log::info_span!("write_current_input_buffers").entered();
2356            current_input_buffer
2357                .buffer
2358                .write_buffers(render_device, render_queue);
2359        });
2360        scope.spawn(async {
2361            #[cfg(feature = "trace")]
2362            let _span = bevy_log::info_span!("write_previous_input_buffers").entered();
2363            previous_input_buffer.write_buffer(render_device, render_queue);
2364        });
2365
2366        for phase_instance_buffers in phase_instance_buffers.values_mut() {
2367            let UntypedPhaseBatchedInstanceBuffers {
2368                ref mut data_buffer,
2369                ref mut work_item_buffers,
2370                ref mut late_indexed_indirect_parameters_buffer,
2371                ref mut late_non_indexed_indirect_parameters_buffer,
2372            } = *phase_instance_buffers;
2373
2374            scope.spawn(async {
2375                #[cfg(feature = "trace")]
2376                let _span = bevy_log::info_span!("write_phase_instance_buffers").entered();
2377                data_buffer.write_buffer(render_device);
2378                late_indexed_indirect_parameters_buffer.write_buffer(render_device, render_queue);
2379                late_non_indexed_indirect_parameters_buffer
2380                    .write_buffer(render_device, render_queue);
2381            });
2382
2383            for phase_work_item_buffers in work_item_buffers.values_mut() {
2384                scope.spawn(async {
2385                    #[cfg(feature = "trace")]
2386                    let _span = bevy_log::info_span!("write_work_item_buffers").entered();
2387                    match *phase_work_item_buffers {
2388                        PreprocessWorkItemBuffers::Direct(ref mut buffer_vec) => {
2389                            buffer_vec.write_buffer(render_device, render_queue);
2390                        }
2391                        PreprocessWorkItemBuffers::Indirect {
2392                            ref mut indexed,
2393                            ref mut non_indexed,
2394                            ref mut gpu_occlusion_culling,
2395                        } => {
2396                            indexed.write_buffer(render_device, render_queue);
2397                            non_indexed.write_buffer(render_device, render_queue);
2398
2399                            if let Some(GpuOcclusionCullingWorkItemBuffers {
2400                                ref mut late_indexed,
2401                                ref mut late_non_indexed,
2402                                late_indirect_parameters_indexed_offset: _,
2403                                late_indirect_parameters_non_indexed_offset: _,
2404                            }) = *gpu_occlusion_culling
2405                            {
2406                                if !late_indexed.is_empty() {
2407                                    late_indexed.write_buffer(render_device);
2408                                }
2409                                if !late_non_indexed.is_empty() {
2410                                    late_non_indexed.write_buffer(render_device);
2411                                }
2412                            }
2413                        }
2414                    }
2415                });
2416            }
2417        }
2418    });
2419
2420    // Create the resources necessary to perform sparse uploads of the current
2421    // input buffer if necessary.
2422    current_input_buffer.buffer.prepare_to_populate_buffers(
2423        render_device,
2424        &pipeline_cache,
2425        &mut sparse_buffer_update_jobs,
2426        &mut sparse_buffer_update_bind_groups,
2427        &sparse_buffer_update_pipelines,
2428    );
2429
2430    bin_unpacking_buffers
2431        .bin_unpacking_metadata
2432        .write_buffer(render_device, render_queue);
2433}
2434
2435/// Writes the bin data for each render phase to the GPU.
2436///
2437/// The bin data consists of the IDs of the mesh instances, as well as the
2438/// metadata needed for the `unpack_bins` shader to unpack them.
2439pub fn write_binned_instance_buffers<BPI, GFBD>(
2440    mut views: Query<&ExtractedView>,
2441    mut view_binned_render_phases: ResMut<ViewBinnedRenderPhases<BPI>>,
2442    bin_unpacking_buffers: ResMut<BinUnpackingBuffers>,
2443    render_device: Res<RenderDevice>,
2444    render_queue: Res<RenderQueue>,
2445) where
2446    BPI: BinnedPhaseItem,
2447    GFBD: GetFullBatchData,
2448{
2449    let bin_unpacking_buffers = bin_unpacking_buffers.into_inner();
2450
2451    let phase_type_id = TypeId::of::<BPI>();
2452
2453    // Record all the `RetainedViewEntity` keys that we saw so that we can
2454    // delete buffers corresponding to views that went away.
2455    let mut all_seen_view_entities = HashSet::new();
2456
2457    for extracted_view in &mut views {
2458        all_seen_view_entities.insert(extracted_view.retained_view_entity);
2459
2460        let Some(view_binned_render_phase) =
2461            view_binned_render_phases.get_mut(&extracted_view.retained_view_entity)
2462        else {
2463            continue;
2464        };
2465
2466        // Since we currently only perform GPU-side bin unpacking for multidrawn
2467        // batch sets, we bail out for all other types of batch sets.
2468        let BinnedRenderPhaseBatchSets::MultidrawIndirect(ref batch_sets) =
2469            view_binned_render_phase.batch_sets
2470        else {
2471            continue;
2472        };
2473
2474        // Get or create the bin unpacking buffers for this (view, phase)
2475        // combination.
2476        let view_phase_bin_unpacking_buffers = bin_unpacking_buffers
2477            .view_phase_buffers
2478            .entry(BinUnpackingBuffersKey {
2479                phase: phase_type_id,
2480                view: extracted_view.retained_view_entity,
2481            })
2482            .or_default();
2483
2484        // Clear out the list of jobs.
2485        view_phase_bin_unpacking_buffers
2486            .indexed_unpacking_jobs
2487            .clear();
2488        view_phase_bin_unpacking_buffers
2489            .non_indexed_unpacking_jobs
2490            .clear();
2491
2492        // Our goal is to extract the output work item location and indirect
2493        // parameters info from the flat `batch_sets` list and to use it to
2494        // build each batch set's `GpuBinUnpackingMetadata`. To do that, we
2495        // first loop over each batch set in the `batch_set` list and add the
2496        // extracted entry to the
2497        // `representative_entity_to_batch_set_bin_unpacking_metadata` table.
2498
2499        // We use the *representative entity* as the key for the later loop to
2500        // find the `BatchSetBinUnpackingMetadata`, because it's a unique value
2501        // that can be fetched from the `BinnedRenderPhaseBatchSet`.
2502        let mut representative_entity_to_batch_set_bin_unpacking_metadata =
2503            MainEntityHashMap::default();
2504
2505        for batch_set in batch_sets {
2506            let main_entity = batch_set.first_batch.representative_entity.1;
2507            if *main_entity != Entity::PLACEHOLDER
2508                && let PhaseItemExtraIndex::IndirectParametersIndex {
2509                    range: ref indirect_parameters_range,
2510                    ..
2511                } = batch_set.first_batch.extra_index
2512            {
2513                // Record the batch set bin unpacking metadata for later passes
2514                // to use.
2515                representative_entity_to_batch_set_bin_unpacking_metadata.insert(
2516                    main_entity,
2517                    BatchSetBinUnpackingMetadata {
2518                        base_output_work_item_index: batch_set.first_work_item_index,
2519                        base_indirect_parameters_index: indirect_parameters_range.start,
2520                    },
2521                );
2522            }
2523        }
2524
2525        // Now loop over all the batch sets in the phase. Look up the
2526        // corresponding `BatchSetBinUnpackingMetadata`, and use it to prepare
2527        // the `GpuBinUnpackingMetadata` and the `BinUnpackingJob`s. Also, kick
2528        // off writes for all the associated GPU buffers that we'd been building
2529        // up in earlier phases.
2530        for (batch_set_key, batch_set) in view_binned_render_phase.multidrawable_meshes.iter_mut() {
2531            let Some(representative_entity) = batch_set.representative_entity() else {
2532                continue;
2533            };
2534            let Some(bin_unpacking_metadata) =
2535                representative_entity_to_batch_set_bin_unpacking_metadata
2536                    .get(&representative_entity)
2537            else {
2538                continue;
2539            };
2540
2541            // Write the various buffers to the GPU.
2542
2543            batch_set
2544                .gpu_buffers
2545                .render_binned_mesh_instance_buffer
2546                .write_buffer(&render_device, &render_queue);
2547            batch_set
2548                .gpu_buffers
2549                .bin_index_to_indirect_parameters_offset_buffer
2550                .write_buffer(&render_device, &render_queue);
2551
2552            let (
2553                Some(render_bin_entry_buffer),
2554                Some(bin_index_to_indirect_parameters_offset_buffer),
2555            ) = (
2556                batch_set
2557                    .gpu_buffers
2558                    .render_binned_mesh_instance_buffer
2559                    .buffer(),
2560                batch_set
2561                    .gpu_buffers
2562                    .bin_index_to_indirect_parameters_offset_buffer
2563                    .buffer(),
2564            )
2565            else {
2566                continue;
2567            };
2568
2569            let binned_mesh_instance_count = batch_set
2570                .gpu_buffers
2571                .render_binned_mesh_instance_buffer
2572                .len() as u32;
2573
2574            // Build up the `GpuBinUnpackingMetadata` for this batch set.
2575            let gpu_bin_unpacking_metadata_index = bin_unpacking_buffers
2576                .bin_unpacking_metadata
2577                .push(GpuBinUnpackingMetadata {
2578                    base_output_work_item_index: bin_unpacking_metadata.base_output_work_item_index,
2579                    base_indirect_parameters_index: bin_unpacking_metadata
2580                        .base_indirect_parameters_index,
2581                    binned_mesh_instance_count,
2582                    pad: [0; _],
2583                });
2584
2585            let Some(gpu_bin_unpacking_metadata_index) =
2586                NonMaxU32::new(gpu_bin_unpacking_metadata_index as u32)
2587            else {
2588                continue;
2589            };
2590
2591            // Create the [`BinUnpackingJob`].
2592            let job = BinUnpackingJob {
2593                render_binned_mesh_instance_buffer: render_bin_entry_buffer.clone(),
2594                bin_index_to_indirect_parameters_offset_buffer:
2595                    bin_index_to_indirect_parameters_offset_buffer.clone(),
2596                bin_unpacking_metadata_index: BinUnpackingMetadataIndex(
2597                    gpu_bin_unpacking_metadata_index,
2598                ),
2599                mesh_instance_count: binned_mesh_instance_count,
2600            };
2601
2602            if batch_set_key.indexed() {
2603                view_phase_bin_unpacking_buffers
2604                    .indexed_unpacking_jobs
2605                    .push(job);
2606            } else {
2607                view_phase_bin_unpacking_buffers
2608                    .non_indexed_unpacking_jobs
2609                    .push(job);
2610            }
2611        }
2612    }
2613
2614    // Delete buffers corresponding to dead views.
2615    bin_unpacking_buffers
2616        .view_phase_buffers
2617        .retain(|bin_unpacking_buffers_key, _| {
2618            bin_unpacking_buffers_key.phase != phase_type_id
2619                || all_seen_view_entities.contains(&bin_unpacking_buffers_key.view)
2620        });
2621}
2622
2623/// Clears out the [`BinUnpackingBuffers`] in preparation for a new frame.
2624pub fn clear_bin_unpacking_buffers(mut bin_unpacking_buffers: ResMut<BinUnpackingBuffers>) {
2625    bin_unpacking_buffers.bin_unpacking_metadata.clear();
2626}
2627
2628/// CPU-side metadata needed to drive the bin unpacking compute shader for a
2629/// single batch set.
2630struct BatchSetBinUnpackingMetadata {
2631    /// The index of the first [`PreprocessWorkItem`] that the compute shader
2632    /// dispatch is to write to.
2633    base_output_work_item_index: u32,
2634    /// The index of the first GPU indirect parameters command for the batch
2635    /// set.
2636    base_indirect_parameters_index: u32,
2637}
2638
2639pub fn clear_indirect_parameters_buffers(
2640    mut indirect_parameters_buffers: ResMut<IndirectParametersBuffers>,
2641) {
2642    for phase_indirect_parameters_buffers in indirect_parameters_buffers.values_mut() {
2643        phase_indirect_parameters_buffers.clear();
2644    }
2645}
2646
2647pub fn write_indirect_parameters_buffers(
2648    render_device: Res<RenderDevice>,
2649    render_queue: Res<RenderQueue>,
2650    mut indirect_parameters_buffers: ResMut<IndirectParametersBuffers>,
2651) {
2652    let render_device = &*render_device;
2653    let render_queue = &*render_queue;
2654    ComputeTaskPool::get().scope(|scope| {
2655        for phase_indirect_parameters_buffers in indirect_parameters_buffers.values_mut() {
2656            scope.spawn(async {
2657                #[cfg(feature = "trace")]
2658                let _span = bevy_log::info_span!("indexed_data").entered();
2659                phase_indirect_parameters_buffers
2660                    .indexed
2661                    .indirect_draw_parameters
2662                    .write_buffer(render_device);
2663            });
2664            scope.spawn(async {
2665                #[cfg(feature = "trace")]
2666                let _span = bevy_log::info_span!("non_indexed_data").entered();
2667                phase_indirect_parameters_buffers
2668                    .non_indexed
2669                    .indirect_draw_parameters
2670                    .write_buffer(render_device);
2671            });
2672
2673            scope.spawn(async {
2674                #[cfg(feature = "trace")]
2675                let _span = bevy_log::info_span!("indexed_cpu_metadata").entered();
2676                phase_indirect_parameters_buffers
2677                    .indexed
2678                    .cpu_metadata
2679                    .write_buffer(render_device, render_queue);
2680            });
2681            scope.spawn(async {
2682                #[cfg(feature = "trace")]
2683                let _span = bevy_log::info_span!("non_indexed_cpu_metadata").entered();
2684                phase_indirect_parameters_buffers
2685                    .non_indexed
2686                    .cpu_metadata
2687                    .write_buffer(render_device, render_queue);
2688            });
2689
2690            scope.spawn(async {
2691                #[cfg(feature = "trace")]
2692                let _span = bevy_log::info_span!("non_indexed_gpu_metadata").entered();
2693                phase_indirect_parameters_buffers
2694                    .non_indexed
2695                    .gpu_metadata
2696                    .write_buffer(render_device);
2697            });
2698            scope.spawn(async {
2699                #[cfg(feature = "trace")]
2700                let _span = bevy_log::info_span!("indexed_gpu_metadata").entered();
2701                phase_indirect_parameters_buffers
2702                    .indexed
2703                    .gpu_metadata
2704                    .write_buffer(render_device);
2705            });
2706
2707            scope.spawn(async {
2708                #[cfg(feature = "trace")]
2709                let _span = bevy_log::info_span!("indexed_batch_sets").entered();
2710                phase_indirect_parameters_buffers
2711                    .indexed
2712                    .batch_sets
2713                    .write_buffer(render_device, render_queue);
2714            });
2715            scope.spawn(async {
2716                #[cfg(feature = "trace")]
2717                let _span = bevy_log::info_span!("non_indexed_batch_sets").entered();
2718                phase_indirect_parameters_buffers
2719                    .non_indexed
2720                    .batch_sets
2721                    .write_buffer(render_device, render_queue);
2722            });
2723        }
2724    });
2725}
2726
2727#[cfg(test)]
2728mod tests {
2729    use bytemuck::{Pod, Zeroable};
2730
2731    use crate::impl_atomic_pod;
2732
2733    use super::*;
2734
2735    #[derive(Clone, Copy, Default, PartialEq, Debug, Pod, Zeroable)]
2736    #[repr(C)]
2737    struct TestData(u32);
2738
2739    impl_atomic_pod!(TestData, TestDataBlob);
2740
2741    #[test]
2742    fn instance_buffer_correct_behavior() {
2743        let mut instance_buffer = InstanceInputUniformBuffer::new();
2744
2745        let index = instance_buffer.add(TestData(2));
2746        instance_buffer.remove(index);
2747        assert_eq!(instance_buffer.get_unchecked(index), TestData(2));
2748        assert_eq!(instance_buffer.get(index), None);
2749
2750        instance_buffer.add(TestData(5));
2751        assert_eq!(instance_buffer.buffer().len(), 1);
2752    }
2753}