From bf6b6f9df5cd7c1e3dda4af8e4b3546c7109f24f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mestre?= Date: Tue, 14 Jan 2025 13:24:55 +0000 Subject: [PATCH] [CUDA][HIP] Fix kernel arguments being overriden when added out of order In the Cuda and Hip adapter, when kernel arguments are added out of order (e.g. argument at index 1 is added before argument at index 0), the existing arguments are currently being overwritten. This happens because some of the argument sizes might not be known when adding them out of order and the code relies on those sizes to choose where to store the argument. This commit avoids this issue by storing the arguments in the same order that they are added and accessing them using pointer offsets. --- source/adapters/cuda/kernel.hpp | 31 ++- source/adapters/hip/kernel.hpp | 65 +++--- .../update/local_memory_update.cpp | 220 ++++++++++++++++++ .../kernel/urKernelSetArgLocal.cpp | 99 ++++++++ 4 files changed, 377 insertions(+), 38 deletions(-) diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp index d1b3b61244..a6194e9a57 100644 --- a/source/adapters/cuda/kernel.hpp +++ b/source/adapters/cuda/kernel.hpp @@ -68,6 +68,8 @@ struct ur_kernel_handle_t_ { args_size_t ParamSizes; /// Byte offset into /p Storage allocation for each parameter. args_index_t Indices; + /// Position in the Storage array where the next argument should added. + size_t InsertPos = 0; /// Aligned size in bytes for each local memory parameter after padding has /// been added. Zero if the argument at the index isn't a local memory /// argument. @@ -101,6 +103,7 @@ struct ur_kernel_handle_t_ { /// Implicit offset argument is kept at the back of the indices collection. void addArg(size_t Index, size_t Size, const void *Arg, size_t LocalSize = 0) { + // Expand storage to accommodate this Index if needed. if (Index + 2 > Indices.size()) { // Move implicit offset argument index with the end Indices.resize(Index + 2, Indices.back()); @@ -109,14 +112,21 @@ struct ur_kernel_handle_t_ { AlignedLocalMemSize.resize(Index + 1); OriginalLocalMemSize.resize(Index + 1); } - ParamSizes[Index] = Size; - // calculate the insertion point on the array - size_t InsertPos = std::accumulate(std::begin(ParamSizes), - std::begin(ParamSizes) + Index, 0); - // Update the stored value for the argument - std::memcpy(&Storage[InsertPos], Arg, Size); - Indices[Index] = &Storage[InsertPos]; - AlignedLocalMemSize[Index] = LocalSize; + + // Copy new argument to storage if it hasn't been added before. + if (ParamSizes[Index] == 0) { + ParamSizes[Index] = Size; + std::memcpy(&Storage[InsertPos], Arg, Size); + Indices[Index] = &Storage[InsertPos]; + AlignedLocalMemSize[Index] = LocalSize; + InsertPos += Size; + } + // Otherwise, update the existing argument. + else { + std::memcpy(Indices[Index], Arg, Size); + AlignedLocalMemSize[Index] = LocalSize; + assert(Size == ParamSizes[Index]); + } } /// Returns the padded size and offset of a local memory argument. @@ -177,10 +187,7 @@ struct ur_kernel_handle_t_ { AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize; // Store new offset into local data - const size_t InsertPos = - std::accumulate(std::begin(ParamSizes), - std::begin(ParamSizes) + SuccIndex, size_t{0}); - std::memcpy(&Storage[InsertPos], &SuccAlignedLocalOffset, + std::memcpy(Indices[SuccIndex], &SuccAlignedLocalOffset, sizeof(size_t)); } } diff --git a/source/adapters/hip/kernel.hpp b/source/adapters/hip/kernel.hpp index c6d30e81ad..61dd89cc99 100644 --- a/source/adapters/hip/kernel.hpp +++ b/source/adapters/hip/kernel.hpp @@ -63,6 +63,8 @@ struct ur_kernel_handle_t_ { args_size_t ParamSizes; /// Byte offset into /p Storage allocation for each parameter. args_index_t Indices; + /// Position in the Storage array where the next argument should added. + size_t InsertPos = 0; /// Aligned size in bytes for each local memory parameter after padding has /// been added. Zero if the argument at the index isn't a local memory /// argument. @@ -95,22 +97,30 @@ struct ur_kernel_handle_t_ { /// Implicit offset argument is kept at the back of the indices collection. void addArg(size_t Index, size_t Size, const void *Arg, size_t LocalSize = 0) { + // Expand storage to accommodate this Index if needed. if (Index + 2 > Indices.size()) { - // Move implicit offset argument Index with the end + // Move implicit offset argument index with the end Indices.resize(Index + 2, Indices.back()); // Ensure enough space for the new argument ParamSizes.resize(Index + 1); AlignedLocalMemSize.resize(Index + 1); OriginalLocalMemSize.resize(Index + 1); } - ParamSizes[Index] = Size; - // calculate the insertion point on the array - size_t InsertPos = std::accumulate(std::begin(ParamSizes), - std::begin(ParamSizes) + Index, 0); - // Update the stored value for the argument - std::memcpy(&Storage[InsertPos], Arg, Size); - Indices[Index] = &Storage[InsertPos]; - AlignedLocalMemSize[Index] = LocalSize; + + // Copy new argument to storage if it hasn't been added before. + if (ParamSizes[Index] == 0) { + ParamSizes[Index] = Size; + std::memcpy(&Storage[InsertPos], Arg, Size); + Indices[Index] = &Storage[InsertPos]; + AlignedLocalMemSize[Index] = LocalSize; + InsertPos += Size; + } + // Otherwise, update the existing argument. + else { + std::memcpy(Indices[Index], Arg, Size); + AlignedLocalMemSize[Index] = LocalSize; + assert(Size == ParamSizes[Index]); + } } /// Returns the padded size and offset of a local memory argument. @@ -151,20 +161,11 @@ struct ur_kernel_handle_t_ { return std::make_pair(AlignedLocalSize, AlignedLocalOffset); } - void addLocalArg(size_t Index, size_t Size) { - // Get the aligned argument size and offset into local data - auto [AlignedLocalSize, AlignedLocalOffset] = - calcAlignedLocalArgument(Index, Size); - - // Store argument details - addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), - AlignedLocalSize); - - // For every existing local argument which follows at later argument - // indices, update the offset and pointer into the kernel local memory. - // Required as padding will need to be recalculated. + // Iterate over all existing local argument which follows StartIndex + // index, update the offset and pointer into the kernel local memory. + void updateLocalArgOffset(size_t StartIndex) { const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg - for (auto SuccIndex = Index + 1; SuccIndex < NumArgs; SuccIndex++) { + for (auto SuccIndex = StartIndex; SuccIndex < NumArgs; SuccIndex++) { const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex]; if (OriginalLocalSize == 0) { // Skip if successor argument isn't a local memory arg @@ -179,14 +180,26 @@ struct ur_kernel_handle_t_ { AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize; // Store new offset into local data - const size_t InsertPos = - std::accumulate(std::begin(ParamSizes), - std::begin(ParamSizes) + SuccIndex, size_t{0}); - std::memcpy(&Storage[InsertPos], &SuccAlignedLocalOffset, + std::memcpy(Indices[SuccIndex], &SuccAlignedLocalOffset, sizeof(size_t)); } } + void addLocalArg(size_t Index, size_t Size) { + // Get the aligned argument size and offset into local data + auto [AlignedLocalSize, AlignedLocalOffset] = + calcAlignedLocalArgument(Index, Size); + + // Store argument details + addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), + AlignedLocalSize); + + // For every existing local argument which follows at later argument + // indices, update the offset and pointer into the kernel local memory. + // Required as padding will need to be recalculated. + updateLocalArgOffset(Index + 1); + } + void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) { assert(hMem && "Invalid mem handle"); // To avoid redundancy we are not storing mem obj with index i at index diff --git a/test/conformance/exp_command_buffer/update/local_memory_update.cpp b/test/conformance/exp_command_buffer/update/local_memory_update.cpp index c467c9783a..d570ce72b0 100644 --- a/test/conformance/exp_command_buffer/update/local_memory_update.cpp +++ b/test/conformance/exp_command_buffer/update/local_memory_update.cpp @@ -1105,3 +1105,223 @@ TEST_P(LocalMemoryMultiUpdateTest, UpdateWithoutBlocking) { uint32_t *new_Y = (uint32_t *)shared_ptrs[4]; Validate(new_output, new_X, new_Y, new_A, global_size, local_size); } + +struct LocalMemoryUpdateTestBaseOutOfOrder : LocalMemoryUpdateTestBase { + virtual void SetUp() override { + program_name = "saxpy_usm_local_mem"; + UUR_RETURN_ON_FATAL_FAILURE( + urUpdatableCommandBufferExpExecutionTest::SetUp()); + + if (backend == UR_PLATFORM_BACKEND_LEVEL_ZERO) { + GTEST_SKIP() + << "Local memory argument update not supported on Level Zero."; + } + + // HIP has extra args for local memory so we define an offset for arg + // indices here for updating + hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 3 : 0; + ur_device_usm_access_capability_flags_t shared_usm_flags; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + const size_t allocation_size = + sizeof(uint32_t) * global_size * local_size; + for (auto &shared_ptr : shared_ptrs) { + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, &shared_ptr)); + ASSERT_NE(shared_ptr, nullptr); + + std::vector pattern(allocation_size); + uur::generateMemFillPattern(pattern); + std::memcpy(shared_ptr, pattern.data(), allocation_size); + } + + std::array index_order{}; + if (backend != UR_PLATFORM_BACKEND_HIP) { + index_order = {3, 2, 4, 5, 1, 0}; + } else { + index_order = {9, 8, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; + } + size_t current_index = 0; + + // Index 3 is A + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(A), nullptr, &A)); + // Index 2 is output + ASSERT_SUCCESS(urKernelSetArgPointer( + kernel, index_order[current_index++], nullptr, shared_ptrs[0])); + + // Index 4 is X + ASSERT_SUCCESS(urKernelSetArgPointer( + kernel, index_order[current_index++], nullptr, shared_ptrs[1])); + // Index 5 is Y + ASSERT_SUCCESS(urKernelSetArgPointer( + kernel, index_order[current_index++], nullptr, shared_ptrs[2])); + + // Index 1 is local_mem_b arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_b_size, nullptr)); + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + } + + // Index 0 is local_mem_a arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_a_size, nullptr)); + + // Hip has extra args for local mem at index 1-3 + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + } + } +}; + +struct LocalMemoryUpdateTestOutOfOrder : LocalMemoryUpdateTestBaseOutOfOrder { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE( + LocalMemoryUpdateTestBaseOutOfOrder::SetUp()); + + // Append kernel command to command-buffer and close command-buffer + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, + &global_size, &local_size, 0, nullptr, 0, nullptr, 0, nullptr, + nullptr, nullptr, &command_handle)); + ASSERT_NE(command_handle, nullptr); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + } + + void TearDown() override { + if (command_handle) { + EXPECT_SUCCESS(urCommandBufferReleaseCommandExp(command_handle)); + } + + UUR_RETURN_ON_FATAL_FAILURE( + LocalMemoryUpdateTestBaseOutOfOrder::TearDown()); + } + + ur_exp_command_buffer_command_handle_t command_handle = nullptr; +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(LocalMemoryUpdateTestOutOfOrder); + +// Test updating A,X,Y parameters to new values and local memory to larger +// values when the kernel arguments were added out of order. +TEST_P(LocalMemoryUpdateTestOutOfOrder, UpdateAllParameters) { + // Run command-buffer prior to update and verify output + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); + + // Update inputs + std::array + new_input_descs; + std::array + new_value_descs; + + size_t new_local_size = local_size * 4; + size_t new_local_mem_a_size = new_local_size * sizeof(uint32_t); + + // New local_mem_a at index 0 + new_value_descs[0] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 0, // argIndex + new_local_mem_a_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }; + + // New local_mem_b at index 1 + new_value_descs[1] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 1 + hip_arg_offset, // argIndex + local_mem_b_size, // argSize + nullptr, // pProperties + nullptr, // hArgValue + }; + + // New A at index 3 + uint32_t new_A = 33; + new_value_descs[2] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 3 + (2 * hip_arg_offset), // argIndex + sizeof(new_A), // argSize + nullptr, // pProperties + &new_A, // hArgValue + }; + + // New X at index 4 + new_input_descs[0] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 4 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[3], // pArgValue + }; + + // New Y at index 5 + new_input_descs[1] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 5 + (2 * hip_arg_offset), // argIndex + nullptr, // pProperties + &shared_ptrs[4], // pArgValue + }; + + // Update kernel inputs + ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + kernel, // hNewKernel + 0, // numNewMemObjArgs + new_input_descs.size(), // numNewPointerArgs + new_value_descs.size(), // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + new_input_descs.data(), // pNewPointerArgList + new_value_descs.data(), // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize + }; + + // Update kernel and enqueue command-buffer again + ASSERT_SUCCESS( + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + // Verify that update occurred correctly + uint32_t *new_output = (uint32_t *)shared_ptrs[0]; + uint32_t *new_X = (uint32_t *)shared_ptrs[3]; + uint32_t *new_Y = (uint32_t *)shared_ptrs[4]; + Validate(new_output, new_X, new_Y, new_A, global_size, local_size); +} \ No newline at end of file diff --git a/test/conformance/kernel/urKernelSetArgLocal.cpp b/test/conformance/kernel/urKernelSetArgLocal.cpp index 467fa7b920..094559ffbc 100644 --- a/test/conformance/kernel/urKernelSetArgLocal.cpp +++ b/test/conformance/kernel/urKernelSetArgLocal.cpp @@ -236,3 +236,102 @@ TEST_P(urKernelSetArgLocalMultiTest, Overwrite) { Validate(output, X, Y, A, global_size, new_local_size); } + +// Tests that adding arguments out of order (e.g. index 1 before index 0) works. +struct urKernelSetArgLocalOutOfOrder : urKernelSetArgLocalMultiTest { + void SetUp() override { + program_name = "saxpy_usm_local_mem"; + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); + + ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, + sizeof(backend), &backend, nullptr)); + + // HIP has extra args for local memory so we define an offset for arg indices here for updating + hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 3 : 0; + ur_device_usm_access_capability_flags_t shared_usm_flags; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + const size_t allocation_size = + sizeof(uint32_t) * global_size * local_size; + for (auto &shared_ptr : shared_ptrs) { + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, &shared_ptr)); + ASSERT_NE(shared_ptr, nullptr); + + std::vector pattern(allocation_size); + uur::generateMemFillPattern(pattern); + std::memcpy(shared_ptr, pattern.data(), allocation_size); + } + + std::array index_order{}; + if (backend != UR_PLATFORM_BACKEND_HIP) { + index_order = {3, 2, 4, 5, 1, 0}; + } else { + index_order = {9, 8, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; + } + size_t current_index = 0; + + // Index 3 is A + ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++], + sizeof(A), nullptr, &A)); + // Index 2 is output + ASSERT_SUCCESS(urKernelSetArgPointer( + kernel, index_order[current_index++], nullptr, shared_ptrs[0])); + + // Index 4 is X + ASSERT_SUCCESS(urKernelSetArgPointer( + kernel, index_order[current_index++], nullptr, shared_ptrs[1])); + // Index 5 is Y + ASSERT_SUCCESS(urKernelSetArgPointer( + kernel, index_order[current_index++], nullptr, shared_ptrs[2])); + + // Index 1 is local_mem_b arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_b_size, nullptr)); + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + } + + // Index 0 is local_mem_a arg + ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++], + local_mem_a_size, nullptr)); + + // Hip has extra args for local mem at index 1-3 + if (backend == UR_PLATFORM_BACKEND_HIP) { + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + ASSERT_SUCCESS(urKernelSetArgValue( + kernel, index_order[current_index++], sizeof(hip_local_offset), + nullptr, &hip_local_offset)); + } + } +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urKernelSetArgLocalOutOfOrder); +TEST_P(urKernelSetArgLocalOutOfOrder, Success) { + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, + &global_offset, &global_size, + &local_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size, local_size); +}