diff --git a/CMakeLists.txt b/CMakeLists.txt index 86ad9d39d8..f7ec7b2208 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -588,7 +588,7 @@ ENDIF() ENDFOREACH() add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES} SOURCES ${INSTANCE_FILES}) -add_subdirectory(library) +#add_subdirectory(library) if(NOT GPU_ARCHS AND USER_GPU_TARGETS) rocm_package_setup_component(tests @@ -601,20 +601,20 @@ if(NOT GPU_ARCHS AND USER_GPU_TARGETS) PACKAGE_NAME examples ) add_subdirectory(example) - if(BUILD_TESTING) - add_subdirectory(test) - endif() + #if(BUILD_TESTING) + #add_subdirectory(test) + #endif() endif() rocm_package_setup_component(profiler LIBRARY_NAME composablekernel PACKAGE_NAME ckprofiler ) -add_subdirectory(profiler) +#add_subdirectory(profiler) -if(CK_USE_CODEGEN AND (SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS)) - add_subdirectory(codegen) -endif() +#if(CK_USE_CODEGEN AND (SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS)) + #add_subdirectory(codegen) +#endif() #Create an interface target for the include only files and call it "composablekernels" include(CMakePackageConfigHelpers) diff --git a/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp b/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp index 7b491173a6..b27d0a3421 100644 --- a/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp +++ b/example/01_gemm/gemm_xdl_bf16_pk_i4_v3.cpp @@ -8,16 +8,35 @@ using ADataType = ck::bhalf_t; using BDataType = ck::pk_i4_t; using AccDataType = float; -using CShuffleDataType = ck::bhalf_t; +using CShuffleDataType = float; using CDataType = ck::bhalf_t; using ALayout = Row; using BLayout = Col; using CLayout = Row; +struct Scale +{ + template + __host__ __device__ constexpr void + operator()(E& e, const C& c) const; + + template <> + __host__ __device__ constexpr void operator()( + ck::bhalf_t& e, const float& c) const + { + const float x0_f = c * scale_; + + e = ck::type_convert(x0_f); + } + + float scale_ = 16.0; +}; + using AElementOp = PassThrough; using BElementOp = PassThrough; -using CElementOp = PassThrough; +using CElementOp = Scale; +//using CElementOp = PassThrough; static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; static constexpr bool PermuteA = false; @@ -121,7 +140,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) b_k_n.GenerateTensorValue(GeneratorTensor_1{1}); break; default: - a_m_k.GenerateTensorValue(GeneratorTensor_3{0, 1.0}); + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); b_k_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); } @@ -149,7 +168,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) { for(int jj = 0; jj < K1; jj++) { - b_k_n_permute(j * N * K1 + i * K1 + jj) = b_k_n(i * K + (j * K1 + jj)); + //b_k_n_permute(j * N * K1 + i * K1 + jj) = ck::bit_cast(b_k_n(i * K + (j * K1 + jj))) + 0x88; + b_k_n_permute(j * N * K1 + i * K1 + jj) = ck::bit_cast(b_k_n(i * K + (j * K1 + jj))); } } } @@ -165,6 +185,57 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) } } +#if 1 + // vector pk_i4x4 permute + for(int i = 0; i < N; i++) + { + for(int j = 0; j < K; j += 8) + { + int input[8]; + + for(int k = 0; k < 4; k++) + { + int i4x2 = b_k_n_permute(j + k * 2, i).data; + input[k * 2 + 0] = (i4x2 >> 4) & 0xf; + input[k * 2 + 1] = (i4x2 >> 0) & 0xf; + } + + // permute 01234567->04261537 + { + int hi = input[4]; + int lo = input[0]; + int i4x2 = (hi << 4) | lo; + + b_k_n_permute(j + 0, i) = i4x2; + } + + { + int hi = input[6]; + int lo = input[2]; + int i4x2 = (hi << 4) | lo; + + b_k_n_permute(j + 2, i) = i4x2; + } + + { + int hi = input[5]; + int lo = input[1]; + int i4x2 = (hi << 4) | lo; + + b_k_n_permute(j + 4, i) = i4x2; + } + + { + int hi = input[7]; + int lo = input[3]; + int i4x2 = (hi << 4) | lo; + + b_k_n_permute(j + 6, i) = i4x2; + } + } + } +#endif + a_m_k_device_buf.ToDevice(a_m_k.mData.data()); b_k_n_device_buf.ToDevice(b_k_n_permute.mData.data()); DeviceMem workspace; diff --git a/include/ck/library/utility/host_tensor_generator.hpp b/include/ck/library/utility/host_tensor_generator.hpp index 6a90523c33..92f6707ec8 100644 --- a/include/ck/library/utility/host_tensor_generator.hpp +++ b/include/ck/library/utility/host_tensor_generator.hpp @@ -89,7 +89,7 @@ struct GeneratorTensor_1 template ck::pk_i4_t operator()(Is...) { - int t = value + 8; + int t = value; ck::pk_i4_t r = ((t << 4) + t) & 0xff; return r; } @@ -144,8 +144,8 @@ struct GeneratorTensor_2 template ck::pk_i4_t operator()(Is...) { - int hi = std::rand() % (max_value - min_value) + min_value + 8; - int lo = std::rand() % (max_value - min_value) + min_value + 8; + int hi = std::rand() % (max_value - min_value) + min_value; + int lo = std::rand() % (max_value - min_value) + min_value; ck::pk_i4_t r = ((hi << 4) + lo) & 0xff; return r; } diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp index 5e522fb2ea..a8d192888c 100644 --- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp @@ -12,6 +12,83 @@ namespace ck { +__device__ inline bhalf8_t pki4_to_bhalf8(uint32_t q) +{ + uint32_t i4x8 = q; + uint32_t bhalfx2_0, bhalfx2_1, bhalfx2_2, bhalfx2_3; + float tmp_0, tmp_2; + vector_type res; + + //i4x8 = 0x01000000; + + asm volatile ( + "v_cvt_off_f32_i4 %[v_tmp_0], %[v_src], src0_sel:BYTE_0\n" + "v_cvt_off_f32_i4 %[v_dst_0], %[v_src], src0_sel:BYTE_2\n" + "v_mov_b32 %[v_dst_0], %[v_tmp_0], dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1\n" + "v_cvt_off_f32_i4 %[v_tmp_0], %[v_src], src0_sel:BYTE_1\n" + "v_cvt_off_f32_i4 %[v_dst_1], %[v_src], src0_sel:BYTE_3\n" + "v_mov_b32 %[v_dst_1], %[v_tmp_0], dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1\n" + "v_lshrrev_b32 %[v_tmp_2], 4, %[v_src]\n" + "v_cvt_off_f32_i4 %[v_tmp_0], %[v_tmp_2], src0_sel:BYTE_0\n" + "v_cvt_off_f32_i4 %[v_dst_2], %[v_tmp_2], src0_sel:BYTE_2\n" + "v_mov_b32 %[v_dst_2], %[v_tmp_0], dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1\n" + "v_cvt_off_f32_i4 %[v_tmp_0], %[v_tmp_2], src0_sel:BYTE_1\n" + "v_cvt_off_f32_i4 %[v_dst_3], %[v_tmp_2], src0_sel:BYTE_3\n" + "v_mov_b32 %[v_dst_3], %[v_tmp_0], dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1" + : [v_tmp_0]"+v"(tmp_0), [v_tmp_2]"+v"(tmp_2), + [v_dst_0]"+v"(bhalfx2_0), [v_dst_1]"+v"(bhalfx2_1), + [v_dst_2]"+v"(bhalfx2_2), [v_dst_3]"+v"(bhalfx2_3), + [v_src]"+v"(i4x8) + : + ); + + res.template AsType()(Number<0>{}) = bit_cast(bhalfx2_0); + res.template AsType()(Number<1>{}) = bit_cast(bhalfx2_1); + res.template AsType()(Number<2>{}) = bit_cast(bhalfx2_2); + res.template AsType()(Number<3>{}) = bit_cast(bhalfx2_3); + + //if(threadIdx.x == 0 and blockIdx.x == 0) + //printf("%x %x %x %x\n", bhalfx2_0, bhalfx2_1, bhalfx2_2, bhalfx2_3); + + return res.template AsType()[Number<0>{}]; +} + +__device__ inline half8_t pki4_to_half8(uint32_t q) +{ + uint32_t i4x8 = q; + uint32_t halfx2_0, halfx2_1, halfx2_2, halfx2_3; + float tmp_0, tmp_1, tmp_2; + vector_type res; + + asm volatile ( + "v_cvt_off_f32_i4 %[v_tmp_0], %[v_src], src0_sel:BYTE_0\n" + "v_cvt_off_f32_i4 %[v_tmp_1], %[v_src], src0_sel:BYTE_2\n" + "v_cvt_pkrtz_f16_f32 %[v_dst_0], %[v_tmp_0], %[v_tmp_1]\n" + "v_cvt_off_f32_i4 %[v_tmp_0], %[v_src], src0_sel:BYTE_1\n" + "v_cvt_off_f32_i4 %[v_tmp_1], %[v_src], src0_sel:BYTE_3\n" + "v_cvt_pkrtz_f16_f32 %[v_dst_1], %[v_tmp_0], %[v_tmp_1]\n" + "v_lshrrev_b32 %[v_tmp_2], 4, %[v_src]\n" + "v_cvt_off_f32_i4 %[v_tmp_0], %[v_tmp_2], src0_sel:BYTE_0\n" + "v_cvt_off_f32_i4 %[v_tmp_1], %[v_tmp_2], src0_sel:BYTE_2\n" + "v_cvt_pkrtz_f16_f32 %[v_dst_2], %[v_tmp_0], %[v_tmp_1]\n" + "v_cvt_off_f32_i4 %[v_tmp_0], %[v_tmp_2], src0_sel:BYTE_1\n" + "v_cvt_off_f32_i4 %[v_tmp_1], %[v_tmp_2], src0_sel:BYTE_3\n" + "v_cvt_pkrtz_f16_f32 %[v_dst_3], %[v_tmp_0], %[v_tmp_1]\n" + : [v_tmp_0]"+v"(tmp_0), [v_tmp_1]"+v"(tmp_1), [v_tmp_2]"+v"(tmp_2), + [v_dst_0]"+v"(halfx2_0), [v_dst_1]"+v"(halfx2_1), + [v_dst_2]"+v"(halfx2_2), [v_dst_3]"+v"(halfx2_3), + [v_src]"+v"(i4x8) + : + ); + + res.template AsType()(Number<0>{}) = bit_cast(halfx2_0); + res.template AsType()(Number<1>{}) = bit_cast(halfx2_1); + res.template AsType()(Number<2>{}) = bit_cast(halfx2_2); + res.template AsType()(Number<3>{}) = bit_cast(halfx2_3); + + return res.template AsType()[Number<0>{}]; +} + // Fast int4x4 to half8_t data type conversion based on paper // [Who Says Elephants Can't Run: Bringing Large Scale MoE Models into Cloud Scale Production] // (https://arxiv.org/abs/2211.10017) and implementation: @@ -166,6 +243,8 @@ struct PassThroughPack8 result.template AsType()(Number<1>{}) = pki4_to_half4(bit_cast(x) >> 8); y = result.template AsType()[Number<0>{}]; +#elif 1 + y = pki4_to_half8(bit_cast(x)); #else vector_type dst; vector_type src{x}; @@ -185,13 +264,15 @@ struct PassThroughPack8 __host__ __device__ constexpr void operator()(ck::bhalf8_t& y, const ck::pk_i4x4_t& x) const { -#if 1 +#if 0 vector_type result; result.template AsType()(Number<0>{}) = pki4_to_bhalf4(bit_cast(x)); result.template AsType()(Number<1>{}) = pki4_to_bhalf4(bit_cast(x) >> 16); y = result.template AsType()[Number<0>{}]; +#elif 1 + y = pki4_to_bhalf8(bit_cast(x)); #else vector_type dst; vector_type src{x}; diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp index 7e2482807d..d24ecdb3ac 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp @@ -76,7 +76,7 @@ struct ReferenceGemm : public device::BaseOperator i4 = (i4x2 >> 0) & 0xf; else i4 = (i4x2 >> 4) & 0xf; - i4 = i4 - 8; + i4 = (i4 > 7) ? i4 - 16 : i4; v_a = type_convert(i4); } else @@ -92,7 +92,7 @@ struct ReferenceGemm : public device::BaseOperator i4 = (i4x2 >> 0) & 0xf; else i4 = (i4x2 >> 4) & 0xf; - i4 = i4 - 8; + i4 = (i4 > 7) ? i4 - 16 : i4; v_b = type_convert(i4); } else diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh index 6089fc7a7e..94da211b05 100755 --- a/script/cmake-ck-dev.sh +++ b/script/cmake-ck-dev.sh @@ -17,7 +17,7 @@ fi cmake \ -D CMAKE_PREFIX_PATH=/opt/rocm/ \ -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ --D CMAKE_CXX_FLAGS="-Xclang -mllvm -Xclang -enable-post-misched=0 -std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \ +-D CMAKE_HIP_FLAGS="-gline-tables-only -save-temps -Xclang -mllvm -Xclang -enable-post-misched=0 -std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \ -D CMAKE_BUILD_TYPE=Release \ -D BUILD_DEV=ON \ -D GPU_TARGETS=$GPU_TARGETS \