Correct Device - Host - Device synchronization with VkEvent

Hi,

I’m trying to synchronize a host stage into my pipeline, where I basically edit some data on the host during the execution of a command buffer on the device. From reading the specification I think I’m doing the correct synchronization, execution/memory dependencies and availability/visibility operations, but it works on neither NV nor AMD hardware. I produced a minimal example that shows how I’m trying to implement it:

1. A buffer is filled with increasing integers and uploaded to the device (device local buffer).
2. This buffer is copied to a host visible one.
3. The first event is set.
4. The second event is waited for.
5. Meanwhile the host waits for the first event.
6. After it has been set, it increments the numbers in the host visible buffer.
7. Then it sets the second event.
8. The device then continues to copy the host visible buffer back to the device local buffer.
9. Finally, this buffer is copied again to a host visible one for checking.

What happens?

On NV the first part works: the correct data arrives at the host side, but the altered data never arrives at the device side. On AMD not even the first part works; the data never arrives on the host in the first place.

The buffers:

- device_buffer is a buffer with VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT memory and VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT usage
- host_buffer is a buffer with VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT memory and VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT usage
- result_buffer is a buffer with VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT memory and VK_BUFFER_USAGE_TRANSFER_DST_BIT usage

The code:


	// Records the complete device-side sequence into command_buffer:
	// upload -> copy to host_buffer -> set first event -> wait for second event
	// -> copy back to device_buffer -> copy to result_buffer.
	// Only sType is set here; every other begin-info field stays zero-initialized.
	VkCommandBufferBeginInfo begin_info = {};
	begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;

	// Reference data with data[i] == i. The host-side checks after submission
	// compare against data[i] + 1.
	std::vector<int> data(NUMBER_COUNT);

	for(int i = 0; i < NUMBER_COUNT; i++)
		data[i] = i;

	// One global memory barrier struct, reused for every dependency below;
	// only the access masks are rewritten before each use.
	VkMemoryBarrier barrier = {};
	barrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;

	vkBeginCommandBuffer(command_buffer, &begin_info);

	// Step 1: write the reference data into device_buffer from within the command buffer.
	// NOTE(review): the Vulkan spec limits vkCmdUpdateBuffer to dataSize <= 65536 bytes
	// (and a multiple of 4); BUFFER_SIZE is not visible here - confirm it fits.
	vkCmdUpdateBuffer(command_buffer, device_buffer, 0, BUFFER_SIZE, data.data());

	// Transfer write (the update) -> transfer read (the copy that follows).
	barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
	barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
	vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 1, &barrier, 0, nullptr, 0, nullptr);

	// Step 2: copy device_buffer into host_buffer.
	// copyWholeBuffer is not shown; presumably its arguments are
	// (command buffer, destination, source), which matches the step list in the post.
	copyWholeBuffer(command_buffer, host_buffer, device_buffer);

	// Transfer write (into host_buffer) -> host read, with the host as destination stage.
	barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
	barrier.dstAccessMask = VK_ACCESS_HOST_READ_BIT;
	vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0, 1, &barrier, 0, nullptr, 0, nullptr);

	// Step 3: set the first event with the transfer stage as stage mask; the host polls
	// this event after submission.
	// NOTE(review): per the conclusion at the end of this post (Vulkan-Docs issue #755),
	// an event cannot provide device-to-host synchronization; a fence has to be used.
	vkCmdSetEvent(command_buffer, device_to_host_sync_event, VK_PIPELINE_STAGE_TRANSFER_BIT);

	// Step 4: wait for the second event, which the host sets with vkSetEvent once it has
	// modified host_buffer. Source stage is HOST, destination stage is TRANSFER, and the
	// barrier expresses host write -> transfer read.
	barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT;
	barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
	vkCmdWaitEvents(command_buffer, 1, &host_to_device_sync_event, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 1, &barrier, 0, nullptr, 0, nullptr);

	// Step 8: copy the host-modified host_buffer back into device_buffer
	// (same presumed (command buffer, destination, source) argument order).
	copyWholeBuffer(command_buffer, device_buffer, host_buffer);

	// Transfer write (into device_buffer) -> transfer read (the final copy).
	barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
	barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
	vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 1, &barrier, 0, nullptr, 0, nullptr);

	// Step 9: copy device_buffer into result_buffer so the host can verify the round trip.
	copyWholeBuffer(command_buffer, result_buffer, device_buffer);

	// Transfer write (into result_buffer) -> host read for the final check.
	barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
	barrier.dstAccessMask = VK_ACCESS_HOST_READ_BIT;
	vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0, 1, &barrier, 0, nullptr, 0, nullptr);

	vkEndCommandBuffer(command_buffer);

	// Submits the recorded command buffer, then performs the host half of the handshake:
	// wait for the first event, increment host_buffer, set the second event, and finally
	// verify result_buffer.
	// NOTE(review): none of the Vulkan return codes in this section are checked.
	VkSubmitInfo submitInfo = {};
	submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
	submitInfo.commandBufferCount = 1;
	submitInfo.pCommandBuffers = &command_buffer;

	// wrong[i] stays -1 unless the first check below records a mismatch at index i.
	std::vector<int> wrong(NUMBER_COUNT, -1);

	// Submitted without a fence (VK_NULL_HANDLE); completion is detected only through
	// the event polling and vkDeviceWaitIdle below.
	vkQueueSubmit(device.getGraphicsQueue(), 1, &submitInfo, VK_NULL_HANDLE);

	// Step 5: poll the first event, sleeping 10 microseconds between queries.
	while(vkGetEventStatus(device, device_to_host_sync_event) != VK_EVENT_SET)
		std::this_thread::sleep_for(std::chrono::microseconds(10));

	int* numbers;

	// NOTE(review): vkMapMemory takes a VkDeviceMemory handle, while host_buffer is
	// described in the post as a buffer - presumably a wrapper or shorthand; confirm.
	// NOTE(review): the mapped size is 2 * BUFFER_SIZE here but BUFFER_SIZE for
	// result_buffer below - confirm which one is intended.
	vkMapMemory(device, host_buffer, 0, 2 * BUFFER_SIZE, 0, reinterpret_cast<void**>(&numbers));

	// Step 6: increment every value in place. Because the increment happens before the
	// comparison, the check passes exactly when the value read from the device was data[i].
	for(int i = 0; i < NUMBER_COUNT; i++)
	{
		numbers[i]++;

		if(numbers[i] != data[i] + 1)
			wrong[i] = numbers[i];
	}

	// Report the mismatches recorded above ("Wrong 1" = device -> host direction).
	for(int i = 0; i < NUMBER_COUNT; i++)
	{
		if(wrong[i] != -1)
			std::cout << "Wrong 1: " << i << " " << wrong[i] << " " << numbers[i] << " " << data[i] + 1 << std::endl;
	}

	vkUnmapMemory(device, host_buffer);

	// Step 7: set the second event from the host; the recorded vkCmdWaitEvents waits on it.
	vkSetEvent(device, host_to_device_sync_event);

	// Block until the device is idle before reading result_buffer.
	vkDeviceWaitIdle(device);

	vkMapMemory(device, result_buffer, 0, BUFFER_SIZE, 0, reinterpret_cast<void**>(&numbers));

	// Final check ("Wrong 2" = host -> device direction): every value should be the
	// incremented reference value data[i] + 1.
	for(int i = 0; i < NUMBER_COUNT; i++)
	{
		if(numbers[i] != data[i] + 1)
			std::cout << "Wrong 2: " << i << " " << numbers[i] << " " << data[i] + 1 << std::endl;
	}

	vkUnmapMemory(device, result_buffer);

What am I doing wrong? Thanks for your help!

Cheers

I’ve uploaded a working example: Vulkan Device - Host - Device synchronization with VkEvent · GitHub

The interesting bits start at line 292! Please have a look and let me know whether it works for you.

I opened an issue on github: Device - Host - Device synchronization with VkEvent · Issue #755 · KhronosGroup/Vulkan-Docs · GitHub

After a bit of discussion there, the conclusion is that Device to Host synchronization is not possible with an event and a fence has to be used.