`buffer_copy_offset` Not Respected in `wgpu-core` #6827

kpreid · 2024-12-26T01:36:51Z

Description
I use copy_buffer_to_texture to copy data from a buffer to single texels of a 3D texture. On my new Apple Silicon machine, and no others, data is apparently written to incorrect portions of the texture. The same problem does not occur when the entire texture is copied.

Repro steps

Run these tests:

[dependencies]
bytemuck = "1.21.0"
itertools = "0.13.0"
pollster = "0.4.0"
wgpu = { git = "https://github.com/gfx-rs/wgpu/" } # or release 23.0.1

use std::sync::Arc;

/// Runs the shared test body with `use_many_writes = false`, i.e. the texture is filled
/// by `single_write`.
#[test]
fn test_single_write() {
    pollster::block_on(run_test(false))
}

/// Runs the shared test body with `use_many_writes = true`, i.e. the texture is filled
/// by `many_writes` (one buffer-to-texture copy per texel).
#[test]
fn test_scatter() {
    pollster::block_on(run_test(true))
}

/// Shared body of both tests: creates a 4×4×4 `Rgba8Uint` 3D texture, fills it, reads it
/// back, and asserts that every texel equals `texel_for_cube` of its coordinates.
///
/// `use_many_writes` selects `many_writes` (one copy per texel) instead of `single_write`
/// (one `write_texture` call for the whole texture).
///
/// # Panics
///
/// Panics if no adapter or device can be obtained, if mapping the readback buffer fails,
/// or if any texel read back differs from its expected value.
async fn run_test(use_many_writes: bool) {
    // Acquire a device and queue; the adapter info is printed so that failures can be
    // attributed to a specific GPU/backend.
    let (device, queue) = {
        let instance = wgpu::Instance::new(wgpu::InstanceDescriptor::default());
        let adapter = wgpu::util::initialize_adapter_from_env_or_default(&instance, None)
            .await
            .expect("no adapter");
        dbg!(adapter.get_info());
        let (device, queue) = adapter
            .request_device(&wgpu::DeviceDescriptor::default(), None)
            .await
            .expect("failed to request_device");
        let device = Arc::new(device);
        (device, queue)
    };

    let size = wgpu::Extent3d {
        width: 4,
        height: 4,
        depth_or_array_layers: 4,
    };
    // COPY_DST is needed for the writes under test, COPY_SRC for the readback below.
    let texture = {
        device.create_texture(&wgpu::TextureDescriptor {
            size,
            mip_level_count: 1,
            sample_count: 1,
            dimension: wgpu::TextureDimension::D3,
            format: wgpu::TextureFormat::Rgba8Uint,
            view_formats: &[],
            usage: wgpu::TextureUsages::TEXTURE_BINDING
                | wgpu::TextureUsages::COPY_DST
                | wgpu::TextureUsages::COPY_SRC,
            label: None,
        })
    };

    // Fill the texture using the write strategy under test.
    if use_many_writes {
        many_writes(&texture, &device, &queue);
    } else {
        single_write(&texture, &queue);
    }

    // Read the whole texture back into CPU memory, one `[u8; 4]` per texel.
    let light_texels: Vec<[u8; 4]> = {
        let tc = TextureCopyParameters::from_texture(&texture);
        let temp_buffer = tc.copy_texture_to_new_buffer(&device, &queue, &texture);

        // The mapping callback stores its result in this cell; it is inspected after
        // the blocking poll below.
        let result_cell =
            Arc::new(std::sync::OnceLock::<Result<(), wgpu::BufferAsyncError>>::new());
        temp_buffer.slice(..).map_async(wgpu::MapMode::Read, {
            let result_cell = result_cell.clone();
            move |result| result_cell.set(result).unwrap()
        });
        device.poll(wgpu::Maintain::Wait);
        result_cell
            .get()
            .as_ref()
            .expect("cell not set")
            .as_ref()
            .expect("mapping failed");

        // Each output element is a whole `[u8; 4]` texel, hence `components` is 1.
        tc.copy_mapped_to_vec(1, &temp_buffer)
    };

    // Compare every texel against its expected value, in the same z/y/x order in which
    // `texel_iter` enumerates coordinates.
    let mut wrong_texels = Vec::new();
    for (zyx_index, cube) in texel_iter(&texture).enumerate() {
        // NOTE(review): no cast appears in the statement below; this `allow` looks like
        // a leftover — confirm before removing.
        #[allow(clippy::cast_possible_wrap)]
        let expected = texel_for_cube(cube);
        let actual = light_texels[zyx_index];
        if expected != actual {
            println!("{:?}", (cube, expected, actual));
            wrong_texels.push((cube, expected, actual));
        }
    }

    let volume = size.width * size.height * size.depth_or_array_layers;
    assert!(
        wrong_texels.is_empty(),
        "out of {volume}, {len} were wrong",
        len = wrong_texels.len(),
    );
}

// -------------------------------------------------------------------------------------------------

/// The bytes of one texel: `COMPONENTS` `u8` values.
type Texel = [u8; COMPONENTS];

/// Returns the texel value expected at the given `[x, y, z]` position.
///
/// Each of the first three channels is `10` plus the corresponding coordinate
/// (truncated to `u8`); the fourth channel is always `10`.
pub fn texel_for_cube(point: [u32; 3]) -> [u8; 4] {
    let [x, y, z] = point;
    let channel = |coordinate: u32| 10 + (coordinate as u8);
    [channel(x), channel(y), channel(z), 10]
}

/// Number of `u8` components in one `Texel`.
const COMPONENTS: usize = 4;

fn texel_iter(texture: &wgpu::Texture) -> impl Iterator<Item = [u32; 3]> {
    itertools::iproduct!(
        0..texture.depth_or_array_layers(),
        0..texture.height(),
        0..texture.width()
    )
    .map(|(z, y, x)| [x, y, z])
}

fn compute_data(texture: &wgpu::Texture) -> Vec<Texel> {
    let mut data = Vec::new();
    for point in texel_iter(texture) {
        data.push(texel_for_cube(point));
    }
    data
}

/// Uploads the full expected contents of `texture` with a single
/// `Queue::write_texture` call covering the whole texture.
pub fn single_write(texture: &wgpu::Texture, queue: &wgpu::Queue) {
    let texels = compute_data(texture);

    // Destination: the whole of mip level 0, starting at the origin.
    let destination = wgpu::ImageCopyTexture {
        texture,
        mip_level: 0,
        origin: wgpu::Origin3d::ZERO,
        aspect: wgpu::TextureAspect::All,
    };

    // Source layout: rows of `width` texels, `height` rows per image.
    let source_layout = wgpu::ImageDataLayout {
        offset: 0,
        bytes_per_row: Some(texture.width() * COMPONENTS as u32),
        rows_per_image: Some(texture.height()),
    };

    queue.write_texture(
        destination,
        texels.as_flattened(),
        source_layout,
        texture.size(),
    );
}

/// Fills `texture` with its expected contents using one `copy_buffer_to_texture`
/// command per texel.
///
/// All texel data lives in a single staging buffer; the copy for the texel at
/// enumeration index `index` reads from byte offset `index * COMPONENTS` and writes a
/// 1×1×1 extent at that texel's coordinates.
pub fn many_writes(texture: &wgpu::Texture, device: &wgpu::Device, queue: &wgpu::Queue) {
    let data: Vec<Texel> = compute_data(texture);

    // Staging buffer sized to hold every texel back to back.
    let copy_buffer_2 = device.create_buffer(&wgpu::BufferDescriptor {
        label: None,
        size: u64::try_from(data.len() * COMPONENTS).unwrap(),
        usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::COPY_SRC,
        mapped_at_creation: false,
    });

    let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor::default());

    for (index, cube) in texel_iter(texture).enumerate() {
        encoder.copy_buffer_to_texture(
            wgpu::ImageCopyBuffer {
                buffer: &copy_buffer_2,
                layout: wgpu::ImageDataLayout {
                    // Source offsets advance by one texel (`COMPONENTS` bytes) per copy.
                    offset: (index * COMPONENTS) as u64,
                    // No row/image strides are given; each copy is a single texel.
                    bytes_per_row: None,
                    rows_per_image: None,
                },
            },
            wgpu::ImageCopyTexture {
                texture,
                mip_level: 0,
                origin: wgpu::Origin3d {
                    x: cube[0],
                    y: cube[1],
                    z: cube[2],
                },
                aspect: wgpu::TextureAspect::All,
            },
            wgpu::Extent3d {
                width: 1,
                height: 1,
                depth_or_array_layers: 1,
            },
        );
    }

    // The staging buffer is filled after the copies are recorded but before the
    // command buffer is submitted.
    queue.write_buffer(&copy_buffer_2, 0, data.as_flattened());
    queue.submit([encoder.finish()]);
}

// -------------------------------------------------------------------------------------------------

/// Elements of GPU-to-CPU copying: the texture extent and texel byte size from which
/// the readback buffer layout is computed.
#[derive(Clone, Copy, Debug)]
struct TextureCopyParameters {
    /// Extent of the texture being copied.
    pub size: wgpu::Extent3d,
    /// Number of bytes occupied by one texel.
    pub byte_size_of_texel: u32,
}
impl TextureCopyParameters {
    /// Captures the size and per-texel byte size of `texture`.
    ///
    /// # Panics
    ///
    /// Panics if the format's block dimensions are not `(1, 1)`, or if
    /// `block_copy_size(None)` returns `None` for the format.
    pub fn from_texture(texture: &wgpu::Texture) -> Self {
        let format = texture.format();
        assert_eq!(
            format.block_dimensions(),
            (1, 1),
            "compressed texture format {format:?} not supported",
        );

        Self {
            size: texture.size(),
            // `expect()` takes a plain string and performs no interpolation, so the
            // message is built with `panic!` to actually name the offending format.
            byte_size_of_texel: format.block_copy_size(None).unwrap_or_else(|| {
                panic!("non-color texture format {format:?} not supported")
            }),
        }
    }

    /// Bytes in one row of texels with no padding.
    pub fn dense_bytes_per_row(&self) -> u32 {
        self.size.width * self.byte_size_of_texel
    }

    /// [`Self::dense_bytes_per_row`] rounded up to the next multiple of
    /// `wgpu::COPY_BYTES_PER_ROW_ALIGNMENT`.
    pub fn padded_bytes_per_row(&self) -> u32 {
        self.dense_bytes_per_row()
            .div_ceil(wgpu::COPY_BYTES_PER_ROW_ALIGNMENT)
            * wgpu::COPY_BYTES_PER_ROW_ALIGNMENT
    }

    /// Creates a `COPY_DST | MAP_READ` buffer, records a copy of the whole of `texture`
    /// into it using padded rows, submits that copy to `queue`, and returns the buffer.
    ///
    /// The returned buffer is not mapped; the caller is responsible for mapping it.
    #[track_caller]
    pub fn copy_texture_to_new_buffer(
        &self,
        device: &wgpu::Device,
        queue: &wgpu::Queue,
        texture: &wgpu::Texture,
    ) -> wgpu::Buffer {
        let padded_bytes_per_row = self.padded_bytes_per_row();

        // One padded row per (row, layer) pair.
        let temp_buffer = device.create_buffer(&wgpu::BufferDescriptor {
            label: Some("GPU-to-CPU image copy buffer"),
            size: u64::from(padded_bytes_per_row)
                * u64::from(self.size.height)
                * u64::from(self.size.depth_or_array_layers),
            usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
            mapped_at_creation: false,
        });

        {
            let mut encoder =
                device.create_command_encoder(&wgpu::CommandEncoderDescriptor::default());
            encoder.copy_texture_to_buffer(
                texture.as_image_copy(),
                wgpu::ImageCopyBuffer {
                    buffer: &temp_buffer,
                    layout: wgpu::ImageDataLayout {
                        offset: 0,
                        bytes_per_row: Some(padded_bytes_per_row),
                        rows_per_image: Some(self.size.height),
                    },
                },
                texture.size(),
            );
            queue.submit(Some(encoder.finish()));
        }

        temp_buffer
    }

    /// Given a mapped buffer, make a [`Vec<C>`] of it.
    ///
    /// `size_of::<C>() * components` must be equal to the byte size of a texel.
    ///
    /// # Panics
    ///
    /// Panics if that size requirement is not met, or if the mapped range is shorter
    /// than the padded row layout requires.
    pub fn copy_mapped_to_vec<C>(&self, components: usize, buffer: &wgpu::Buffer) -> Vec<C>
    where
        C: bytemuck::AnyBitPattern,
    {
        assert_eq!(
            u32::try_from(components * size_of::<C>()).ok(),
            Some(self.byte_size_of_texel),
            "Texture format does not match requested format",
        );

        // Row strides do not change between iterations, so compute them once.
        let padded_row_len = self.padded_bytes_per_row() as usize;
        let dense_row_len = self.dense_bytes_per_row() as usize;
        let row_count = self.row_count();

        // Copy the mapped buffer data into a Rust vector, removing row padding if present
        // by copying it one row at a time. Given the assertion above, each row holds
        // `width * components` elements of `C`.
        let mut texel_vector: Vec<C> =
            Vec::with_capacity(row_count * self.size.width as usize * components);
        {
            let mapped: &[u8] = &buffer.slice(..).get_mapped_range();
            for row in 0..row_count {
                let byte_start_of_row = padded_row_len * row;
                texel_vector.extend(bytemuck::cast_slice::<u8, C>(
                    &mapped[byte_start_of_row..][..dense_row_len],
                ));
            }
        }

        texel_vector
    }

    /// Total number of texel rows across all depth slices / array layers.
    fn row_count(&self) -> usize {
        self.size.height as usize * self.size.depth_or_array_layers as usize
    }
}

Expected vs observed behavior
The two tests should both pass, indicating identical data was written to the texture. Instead I get:

[tests/main.rs:19:9] adapter.get_info() = AdapterInfo {
    name: "Apple M4 Max",
    vendor: 0,
    device: 0,
    device_type: IntegratedGpu,
    driver: "",
    driver_info: "",
    backend: Metal,
}
([0, 1, 0], [10, 11, 10, 10], [10, 10, 10, 10])
([0, 2, 0], [10, 12, 10, 10], [10, 10, 10, 10])
([0, 3, 0], [10, 13, 10, 10], [10, 10, 10, 10])
([0, 1, 1], [10, 11, 11, 10], [10, 10, 11, 10])
([0, 2, 1], [10, 12, 11, 10], [10, 10, 11, 10])
([0, 3, 1], [10, 13, 11, 10], [10, 10, 11, 10])
([0, 1, 2], [10, 11, 12, 10], [10, 10, 12, 10])
([0, 2, 2], [10, 12, 12, 10], [10, 10, 12, 10])
([0, 3, 2], [10, 13, 12, 10], [10, 10, 12, 10])
([0, 1, 3], [10, 11, 13, 10], [10, 10, 13, 10])
([0, 2, 3], [10, 12, 13, 10], [10, 10, 13, 10])
([0, 3, 3], [10, 13, 13, 10], [10, 10, 13, 10])
thread 'test_scatter' panicked at tests/main.rs:87:5:
out of 64, 12 were wrong

which indicates that every texel with x=0 received the data belonging to y=0 instead of the data from the correct y position. However, changing the test data size suggests that the problem has more to do with 16-byte stride than the coordinates.

In my actual application, where the single-texel writes are to arbitrary locations in a bigger texture, the incorrect writes seem to be deterministic but not following any simple pattern, unlike the output of this test. None of them has ever written garbage data; only valid data to the wrong texel.

Platform

MacBook Pro
macOS Sequoia 15.1
Apple M4 Max CPU
target: aarch64-apple-darwin
rustc 1.83.0
wgpu 23.0.1 or git 7c75ac7 (same results)

The same bug does not appear on my other macOS machine with x86 CPU and macOS 14.7, nor does it appear with WebGPU.

The text was updated successfully, but these errors were encountered:

kpreid · 2024-12-28T05:09:41Z

Further observation: if I make sure that the small copies are each a multiple of 16 texels (64 bytes) long, then the problem apparently does not occur.

gpu: Make LightTexture::update_scatter() work in chunks instead of single texels. This is a workaround for <gfx-rs/wgpu#6827>. However, it should significantly improve throughput too; copying single texels at a time seems to be quite expensive relative to the cost of copying additional texels.

cwfitzgerald · 2025-01-09T01:27:44Z

Okay I think this may be a deeper problem. Turning this into a PR failed in CI. I initially thought this was because we were missing the AgilitySDK and the validation was wrong/out of date, but I think that is just masking the problem.

[2025-01-09T00:12:55Z ERROR wgpu_test::expectations] Validation Error: ID3D12CommandList::CopyTextureRegion: D3D12_PLACED_SUBRESOURCE_FOOTPRINT::Offset must be a multiple of 512, aka. D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT. Offset is 4. [ RESOURCE_MANIPULATION ERROR #864: COPYTEXTUREREGION_INVALIDSRCOFFSET]

We aren't respecting wgpu_hal::Alignments::buffer_copy_offset in wgpu-core, and this affects the Metal backend too. According to Apple's Metal feature set tables, "Buffer alignment for copying an existing texture to a buffer" is 16 bytes on Apple GPUs, which could explain the exact behavior you're seeing.

github-project-automation bot moved this to Todo in WebGPU for Firefox Dec 26, 2024

github-project-automation bot added this to WebGPU for Firefox Dec 26, 2024

kpreid mentioned this issue Dec 28, 2024

gpu: Make LightTexture::update_scatter() work in chunks instead of single texels. kpreid/all-is-cubes#564

Merged

cwfitzgerald added type: bug Something isn't working area: correctness We're behaving incorrectly api: dx12 Issues with DX12 or DXGI api: metal Issues with Metal labels Jan 9, 2025

cwfitzgerald changed the title ~~copy_buffer_to_texture() writes to the wrong location — on Apple Silicon only~~ buffer_copy_offset Not Respected in wgpu-core Jan 9, 2025

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

`buffer_copy_offset` Not Respected in `wgpu-core` #6827

`buffer_copy_offset` Not Respected in `wgpu-core` #6827

kpreid commented Dec 26, 2024

kpreid commented Dec 28, 2024

cwfitzgerald commented Jan 9, 2025 •

edited

Loading

buffer_copy_offset Not Respected in wgpu-core #6827

buffer_copy_offset Not Respected in wgpu-core #6827

Comments

kpreid commented Dec 26, 2024

kpreid commented Dec 28, 2024

cwfitzgerald commented Jan 9, 2025 • edited Loading

`buffer_copy_offset` Not Respected in `wgpu-core` #6827

`buffer_copy_offset` Not Respected in `wgpu-core` #6827

cwfitzgerald commented Jan 9, 2025 •

edited

Loading