Skip to content

Commit

Permalink
[CUDA][HIP] Fix kernel arguments being overriden when added out of order
Browse files Browse the repository at this point in the history
In the Cuda and Hip adapter, when kernel arguments are added out of order
(e.g. argument at index 1 is added before argument at index 0), the
existing arguments are currently being overwritten. This happens because
some of the argument sizes might not be known when adding them out of
order and the code relies on those sizes to choose where to store the
argument.

This commit avoids this issue by storing the arguments in the same order that
they are added and accessing them using pointer offsets.
  • Loading branch information
fabiomestre committed Jan 15, 2025
1 parent ef70004 commit bf6b6f9
Show file tree
Hide file tree
Showing 4 changed files with 377 additions and 38 deletions.
31 changes: 19 additions & 12 deletions source/adapters/cuda/kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ struct ur_kernel_handle_t_ {
args_size_t ParamSizes;
/// Byte offset into /p Storage allocation for each parameter.
args_index_t Indices;
/// Position in the Storage array where the next argument should added.
size_t InsertPos = 0;
/// Aligned size in bytes for each local memory parameter after padding has
/// been added. Zero if the argument at the index isn't a local memory
/// argument.
Expand Down Expand Up @@ -101,6 +103,7 @@ struct ur_kernel_handle_t_ {
/// Implicit offset argument is kept at the back of the indices collection.
void addArg(size_t Index, size_t Size, const void *Arg,
size_t LocalSize = 0) {
// Expand storage to accommodate this Index if needed.
if (Index + 2 > Indices.size()) {
// Move implicit offset argument index with the end
Indices.resize(Index + 2, Indices.back());
Expand All @@ -109,14 +112,21 @@ struct ur_kernel_handle_t_ {
AlignedLocalMemSize.resize(Index + 1);
OriginalLocalMemSize.resize(Index + 1);
}
ParamSizes[Index] = Size;
// calculate the insertion point on the array
size_t InsertPos = std::accumulate(std::begin(ParamSizes),
std::begin(ParamSizes) + Index, 0);
// Update the stored value for the argument
std::memcpy(&Storage[InsertPos], Arg, Size);
Indices[Index] = &Storage[InsertPos];
AlignedLocalMemSize[Index] = LocalSize;

// Copy new argument to storage if it hasn't been added before.
if (ParamSizes[Index] == 0) {
ParamSizes[Index] = Size;
std::memcpy(&Storage[InsertPos], Arg, Size);
Indices[Index] = &Storage[InsertPos];
AlignedLocalMemSize[Index] = LocalSize;
InsertPos += Size;
}
// Otherwise, update the existing argument.
else {
std::memcpy(Indices[Index], Arg, Size);
AlignedLocalMemSize[Index] = LocalSize;
assert(Size == ParamSizes[Index]);
}
}

/// Returns the padded size and offset of a local memory argument.
Expand Down Expand Up @@ -177,10 +187,7 @@ struct ur_kernel_handle_t_ {
AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize;

// Store new offset into local data
const size_t InsertPos =
std::accumulate(std::begin(ParamSizes),
std::begin(ParamSizes) + SuccIndex, size_t{0});
std::memcpy(&Storage[InsertPos], &SuccAlignedLocalOffset,
std::memcpy(Indices[SuccIndex], &SuccAlignedLocalOffset,
sizeof(size_t));
}
}
Expand Down
65 changes: 39 additions & 26 deletions source/adapters/hip/kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ struct ur_kernel_handle_t_ {
args_size_t ParamSizes;
/// Byte offset into /p Storage allocation for each parameter.
args_index_t Indices;
/// Position in the Storage array where the next argument should added.
size_t InsertPos = 0;
/// Aligned size in bytes for each local memory parameter after padding has
/// been added. Zero if the argument at the index isn't a local memory
/// argument.
Expand Down Expand Up @@ -95,22 +97,30 @@ struct ur_kernel_handle_t_ {
/// Implicit offset argument is kept at the back of the indices collection.
void addArg(size_t Index, size_t Size, const void *Arg,
size_t LocalSize = 0) {
// Expand storage to accommodate this Index if needed.
if (Index + 2 > Indices.size()) {
// Move implicit offset argument Index with the end
// Move implicit offset argument index with the end
Indices.resize(Index + 2, Indices.back());
// Ensure enough space for the new argument
ParamSizes.resize(Index + 1);
AlignedLocalMemSize.resize(Index + 1);
OriginalLocalMemSize.resize(Index + 1);
}
ParamSizes[Index] = Size;
// calculate the insertion point on the array
size_t InsertPos = std::accumulate(std::begin(ParamSizes),
std::begin(ParamSizes) + Index, 0);
// Update the stored value for the argument
std::memcpy(&Storage[InsertPos], Arg, Size);
Indices[Index] = &Storage[InsertPos];
AlignedLocalMemSize[Index] = LocalSize;

// Copy new argument to storage if it hasn't been added before.
if (ParamSizes[Index] == 0) {
ParamSizes[Index] = Size;
std::memcpy(&Storage[InsertPos], Arg, Size);
Indices[Index] = &Storage[InsertPos];
AlignedLocalMemSize[Index] = LocalSize;
InsertPos += Size;
}
// Otherwise, update the existing argument.
else {
std::memcpy(Indices[Index], Arg, Size);
AlignedLocalMemSize[Index] = LocalSize;
assert(Size == ParamSizes[Index]);
}
}

/// Returns the padded size and offset of a local memory argument.
Expand Down Expand Up @@ -151,20 +161,11 @@ struct ur_kernel_handle_t_ {
return std::make_pair(AlignedLocalSize, AlignedLocalOffset);
}

void addLocalArg(size_t Index, size_t Size) {
// Get the aligned argument size and offset into local data
auto [AlignedLocalSize, AlignedLocalOffset] =
calcAlignedLocalArgument(Index, Size);

// Store argument details
addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset),
AlignedLocalSize);

// For every existing local argument which follows at later argument
// indices, update the offset and pointer into the kernel local memory.
// Required as padding will need to be recalculated.
// Iterate over all existing local argument which follows StartIndex
// index, update the offset and pointer into the kernel local memory.
void updateLocalArgOffset(size_t StartIndex) {
const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg
for (auto SuccIndex = Index + 1; SuccIndex < NumArgs; SuccIndex++) {
for (auto SuccIndex = StartIndex; SuccIndex < NumArgs; SuccIndex++) {
const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex];
if (OriginalLocalSize == 0) {
// Skip if successor argument isn't a local memory arg
Expand All @@ -179,14 +180,26 @@ struct ur_kernel_handle_t_ {
AlignedLocalMemSize[SuccIndex] = SuccAlignedLocalSize;

// Store new offset into local data
const size_t InsertPos =
std::accumulate(std::begin(ParamSizes),
std::begin(ParamSizes) + SuccIndex, size_t{0});
std::memcpy(&Storage[InsertPos], &SuccAlignedLocalOffset,
std::memcpy(Indices[SuccIndex], &SuccAlignedLocalOffset,
sizeof(size_t));
}
}

void addLocalArg(size_t Index, size_t Size) {
// Get the aligned argument size and offset into local data
auto [AlignedLocalSize, AlignedLocalOffset] =
calcAlignedLocalArgument(Index, Size);

// Store argument details
addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset),
AlignedLocalSize);

// For every existing local argument which follows at later argument
// indices, update the offset and pointer into the kernel local memory.
// Required as padding will need to be recalculated.
updateLocalArgOffset(Index + 1);
}

void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) {
assert(hMem && "Invalid mem handle");
// To avoid redundancy we are not storing mem obj with index i at index
Expand Down
220 changes: 220 additions & 0 deletions test/conformance/exp_command_buffer/update/local_memory_update.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1105,3 +1105,223 @@ TEST_P(LocalMemoryMultiUpdateTest, UpdateWithoutBlocking) {
uint32_t *new_Y = (uint32_t *)shared_ptrs[4];
Validate(new_output, new_X, new_Y, new_A, global_size, local_size);
}

struct LocalMemoryUpdateTestBaseOutOfOrder : LocalMemoryUpdateTestBase {
virtual void SetUp() override {
program_name = "saxpy_usm_local_mem";
UUR_RETURN_ON_FATAL_FAILURE(
urUpdatableCommandBufferExpExecutionTest::SetUp());

if (backend == UR_PLATFORM_BACKEND_LEVEL_ZERO) {
GTEST_SKIP()
<< "Local memory argument update not supported on Level Zero.";
}

// HIP has extra args for local memory so we define an offset for arg
// indices here for updating
hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 3 : 0;
ur_device_usm_access_capability_flags_t shared_usm_flags;
ASSERT_SUCCESS(
uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags));
if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) {
GTEST_SKIP() << "Shared USM is not supported.";
}

const size_t allocation_size =
sizeof(uint32_t) * global_size * local_size;
for (auto &shared_ptr : shared_ptrs) {
ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr,
allocation_size, &shared_ptr));
ASSERT_NE(shared_ptr, nullptr);

std::vector<uint8_t> pattern(allocation_size);
uur::generateMemFillPattern(pattern);
std::memcpy(shared_ptr, pattern.data(), allocation_size);
}

std::array<size_t, 12> index_order{};
if (backend != UR_PLATFORM_BACKEND_HIP) {
index_order = {3, 2, 4, 5, 1, 0};
} else {
index_order = {9, 8, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
}
size_t current_index = 0;

// Index 3 is A
ASSERT_SUCCESS(urKernelSetArgValue(kernel, index_order[current_index++],
sizeof(A), nullptr, &A));
// Index 2 is output
ASSERT_SUCCESS(urKernelSetArgPointer(
kernel, index_order[current_index++], nullptr, shared_ptrs[0]));

// Index 4 is X
ASSERT_SUCCESS(urKernelSetArgPointer(
kernel, index_order[current_index++], nullptr, shared_ptrs[1]));
// Index 5 is Y
ASSERT_SUCCESS(urKernelSetArgPointer(
kernel, index_order[current_index++], nullptr, shared_ptrs[2]));

// Index 1 is local_mem_b arg
ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++],
local_mem_b_size, nullptr));
if (backend == UR_PLATFORM_BACKEND_HIP) {
ASSERT_SUCCESS(urKernelSetArgValue(
kernel, index_order[current_index++], sizeof(hip_local_offset),
nullptr, &hip_local_offset));
ASSERT_SUCCESS(urKernelSetArgValue(
kernel, index_order[current_index++], sizeof(hip_local_offset),
nullptr, &hip_local_offset));
ASSERT_SUCCESS(urKernelSetArgValue(
kernel, index_order[current_index++], sizeof(hip_local_offset),
nullptr, &hip_local_offset));
}

// Index 0 is local_mem_a arg
ASSERT_SUCCESS(urKernelSetArgLocal(kernel, index_order[current_index++],
local_mem_a_size, nullptr));

// Hip has extra args for local mem at index 1-3
if (backend == UR_PLATFORM_BACKEND_HIP) {
ASSERT_SUCCESS(urKernelSetArgValue(
kernel, index_order[current_index++], sizeof(hip_local_offset),
nullptr, &hip_local_offset));
ASSERT_SUCCESS(urKernelSetArgValue(
kernel, index_order[current_index++], sizeof(hip_local_offset),
nullptr, &hip_local_offset));
ASSERT_SUCCESS(urKernelSetArgValue(
kernel, index_order[current_index++], sizeof(hip_local_offset),
nullptr, &hip_local_offset));
}
}
};

struct LocalMemoryUpdateTestOutOfOrder : LocalMemoryUpdateTestBaseOutOfOrder {
void SetUp() override {
UUR_RETURN_ON_FATAL_FAILURE(
LocalMemoryUpdateTestBaseOutOfOrder::SetUp());

// Append kernel command to command-buffer and close command-buffer
ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp(
updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset,
&global_size, &local_size, 0, nullptr, 0, nullptr, 0, nullptr,
nullptr, nullptr, &command_handle));
ASSERT_NE(command_handle, nullptr);

ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle));
}

void TearDown() override {
if (command_handle) {
EXPECT_SUCCESS(urCommandBufferReleaseCommandExp(command_handle));
}

UUR_RETURN_ON_FATAL_FAILURE(
LocalMemoryUpdateTestBaseOutOfOrder::TearDown());
}

ur_exp_command_buffer_command_handle_t command_handle = nullptr;
};

UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(LocalMemoryUpdateTestOutOfOrder);

// Test updating A,X,Y parameters to new values and local memory to larger
// values when the kernel arguments were added out of order.
TEST_P(LocalMemoryUpdateTestOutOfOrder, UpdateAllParameters) {
// Run command-buffer prior to update and verify output
ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
nullptr, nullptr));
ASSERT_SUCCESS(urQueueFinish(queue));

uint32_t *output = (uint32_t *)shared_ptrs[0];
uint32_t *X = (uint32_t *)shared_ptrs[1];
uint32_t *Y = (uint32_t *)shared_ptrs[2];
Validate(output, X, Y, A, global_size, local_size);

// Update inputs
std::array<ur_exp_command_buffer_update_pointer_arg_desc_t, 2>
new_input_descs;
std::array<ur_exp_command_buffer_update_value_arg_desc_t, 3>
new_value_descs;

size_t new_local_size = local_size * 4;
size_t new_local_mem_a_size = new_local_size * sizeof(uint32_t);

// New local_mem_a at index 0
new_value_descs[0] = {
UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
nullptr, // pNext
0, // argIndex
new_local_mem_a_size, // argSize
nullptr, // pProperties
nullptr, // hArgValue
};

// New local_mem_b at index 1
new_value_descs[1] = {
UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
nullptr, // pNext
1 + hip_arg_offset, // argIndex
local_mem_b_size, // argSize
nullptr, // pProperties
nullptr, // hArgValue
};

// New A at index 3
uint32_t new_A = 33;
new_value_descs[2] = {
UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
nullptr, // pNext
3 + (2 * hip_arg_offset), // argIndex
sizeof(new_A), // argSize
nullptr, // pProperties
&new_A, // hArgValue
};

// New X at index 4
new_input_descs[0] = {
UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
nullptr, // pNext
4 + (2 * hip_arg_offset), // argIndex
nullptr, // pProperties
&shared_ptrs[3], // pArgValue
};

// New Y at index 5
new_input_descs[1] = {
UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
nullptr, // pNext
5 + (2 * hip_arg_offset), // argIndex
nullptr, // pProperties
&shared_ptrs[4], // pArgValue
};

// Update kernel inputs
ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = {
UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype
nullptr, // pNext
kernel, // hNewKernel
0, // numNewMemObjArgs
new_input_descs.size(), // numNewPointerArgs
new_value_descs.size(), // numNewValueArgs
n_dimensions, // newWorkDim
nullptr, // pNewMemObjArgList
new_input_descs.data(), // pNewPointerArgList
new_value_descs.data(), // pNewValueArgList
nullptr, // pNewGlobalWorkOffset
nullptr, // pNewGlobalWorkSize
nullptr, // pNewLocalWorkSize
};

// Update kernel and enqueue command-buffer again
ASSERT_SUCCESS(
urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc));
ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
nullptr, nullptr));
ASSERT_SUCCESS(urQueueFinish(queue));

// Verify that update occurred correctly
uint32_t *new_output = (uint32_t *)shared_ptrs[0];
uint32_t *new_X = (uint32_t *)shared_ptrs[3];
uint32_t *new_Y = (uint32_t *)shared_ptrs[4];
Validate(new_output, new_X, new_Y, new_A, global_size, local_size);
}
Loading

0 comments on commit bf6b6f9

Please sign in to comment.