Hi,

I'm trying to synchronize a host stage into my pipeline, where I basically edit some data on the host while a command buffer is executing on the device. From reading the specification, I think I have the synchronization right (execution/memory dependencies and availability/visibility operations), but it works on neither NV nor AMD hardware. I produced a minimal example that shows how I'm trying to implement it:

  1. A buffer is filled with increasing integers and uploaded to the device (device local buffer).
  2. This buffer is copied to a host visible one.
  3. The first event is set.
  4. The second event is waited for.
  5. Meanwhile the host waits for the first event.
  6. After it has been set, it increments the numbers in the host visible buffer.
  7. Then it sets the second event.
  8. The device then continues to copy the host visible buffer back to the device local buffer.
  9. Finally, this buffer is copied again to a host visible one for checking.


What happens?

On NV the first part works: the correct data arrives on the host side, but the altered data never arrives on the device side. On AMD not even the first part works; the data never arrives on the host.

The buffers:

  • device_buffer is a buffer with VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT memory and VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT usage
  • host_buffer is a buffer with VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT memory and VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT usage
  • result_buffer is a buffer with VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT memory and VK_BUFFER_USAGE_TRANSFER_DST_BIT usage


The code:

Code :
	VkCommandBufferBeginInfo begin_info = {};
	begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
 
	std::vector<int> data(NUMBER_COUNT);
 
	for(int i = 0; i < NUMBER_COUNT; i++)
		data[i] = i;
 
	VkMemoryBarrier barrier = {};
	barrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
 
	vkBeginCommandBuffer(command_buffer, &begin_info);
 
	vkCmdUpdateBuffer(command_buffer, device_buffer, 0, BUFFER_SIZE, data.data());
 
	barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
	barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
	vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 1, &barrier, 0, nullptr, 0, nullptr);
 
	copyWholeBuffer(command_buffer, host_buffer, device_buffer);
 
	barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
	barrier.dstAccessMask = VK_ACCESS_HOST_READ_BIT;
	vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0, 1, &barrier, 0, nullptr, 0, nullptr);
 
	vkCmdSetEvent(command_buffer, device_to_host_sync_event, VK_PIPELINE_STAGE_TRANSFER_BIT);
 
	barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT;
	barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
	vkCmdWaitEvents(command_buffer, 1, &host_to_device_sync_event, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 1, &barrier, 0, nullptr, 0, nullptr);
 
	copyWholeBuffer(command_buffer, device_buffer, host_buffer);
 
	barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
	barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
	vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 1, &barrier, 0, nullptr, 0, nullptr);
 
	copyWholeBuffer(command_buffer, result_buffer, device_buffer);
 
	barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
	barrier.dstAccessMask = VK_ACCESS_HOST_READ_BIT;
	vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0, 1, &barrier, 0, nullptr, 0, nullptr);
 
	vkEndCommandBuffer(command_buffer);
 
	VkSubmitInfo submitInfo = {};
	submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
	submitInfo.commandBufferCount = 1;
	submitInfo.pCommandBuffers = &command_buffer;
 
	std::vector<int> wrong(NUMBER_COUNT, -1);
 
	vkQueueSubmit(device.getGraphicsQueue(), 1, &submitInfo, VK_NULL_HANDLE);
 
	while(vkGetEventStatus(device, device_to_host_sync_event) != VK_EVENT_SET)
		std::this_thread::sleep_for(std::chrono::microseconds(10));
 
	int* numbers;
 
	vkMapMemory(device, host_buffer, 0, 2 * BUFFER_SIZE, 0, reinterpret_cast<void**>(&numbers));
 
	for(int i = 0; i < NUMBER_COUNT; i++)
	{
		numbers[i]++;
 
		if(numbers[i] != data[i] + 1)
			wrong[i] = numbers[i];
	}
 
	for(int i = 0; i < NUMBER_COUNT; i++)
	{
		if(wrong[i] != -1)
			std::cout << "Wrong 1: " << i << " " << wrong[i] << " " << numbers[i] << " " << data[i] + 1 << std::endl;
	}
 
	vkUnmapMemory(device, host_buffer);
 
	vkSetEvent(device, host_to_device_sync_event);
 
	vkDeviceWaitIdle(device);
 
	vkMapMemory(device, result_buffer, 0, BUFFER_SIZE, 0, reinterpret_cast<void**>(&numbers));
 
	for(int i = 0; i < NUMBER_COUNT; i++)
	{
		if(numbers[i] != data[i] + 1)
			std::cout << "Wrong 2: " << i << " " << numbers[i] << " " << data[i] + 1 << std::endl;
	}
 
	vkUnmapMemory(device, result_buffer);

What am I doing wrong? Thanks for your help!

Cheers