diff --git a/src/plugins/intel_gpu/src/graph/fully_connected.cpp b/src/plugins/intel_gpu/src/graph/fully_connected.cpp index d9f88477a124d0..90ca2127e864ad 100644 --- a/src/plugins/intel_gpu/src/graph/fully_connected.cpp +++ b/src/plugins/intel_gpu/src/graph/fully_connected.cpp @@ -214,6 +214,10 @@ std::vector fully_connected_inst::calc_output_layouts(fully_connected_no } kernel_impl_params fully_connected_inst::get_fake_aligned_params(kernel_impl_params const& orig_impl_param) { + if (can_apply_single_batch_optimization(orig_impl_param)) { + return std::move(orig_impl_param); + } + // fc_tiled_opt kernel is optimized for row shape aligned by 8. // Thus, use fake aligned shape at kernel execution for better performance. const auto& orig_input_layout = orig_impl_param.get_input_layout(); @@ -326,6 +330,32 @@ std::string fully_connected_inst::to_string(fully_connected_node const& node) { return primitive_description.str(); } +bool fully_connected_inst::can_apply_single_batch_optimization(const kernel_impl_params& impl_param) { + if (impl_param.output_layouts.empty() || impl_param.output_layouts[0].is_dynamic()) + return false; + + // Only support i4/u4 weight so far + if (impl_param.weights_layout) { + auto weights_layout_dt = impl_param.weights_layout.value().data_type; + if (weights_layout_dt != data_types::i4 && weights_layout_dt != data_types::u4) { + return false; + } + } + + // Don't support swiglu fused + if (impl_param.fused_desc.size() > 0) { + for (const auto& f : impl_param.fused_desc) { + if (f.is_type()) + return false; + } + } + + // Single batch + auto shape = impl_param.output_layouts[0].get_partial_shape().to_shape(); + auto shape_size = ov::shape_size(shape); + return one_of(shape_size, shape) && (shape_size % 16 == 0); +} + fully_connected_inst::typed_primitive_inst(network& network, fully_connected_node const& node) : parent(network, node) { } } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp index b33a391dadea4c..b6a8d9e1a8d13a 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp @@ -183,6 +183,11 @@ struct fully_connected_impl : typed_primitive_impl_ocl { if (with_zp) { params.has_decompression_zp = true; params.decompression_zero_point = convert_data_tensor(updated_impl_param.input_layouts[3]); + if (updated_impl_param.input_layouts[3].get_linear_size() == 1 && + primitive->decompression_zero_point_scalar.has_value()) { + params.scalar_zp = true; + params.zp_value = primitive->decompression_zero_point_scalar.value(); + } } else if (primitive->decompression_zero_point_scalar.has_value()) { params.has_decompression_zp = true; params.scalar_zp = true; @@ -203,7 +208,9 @@ struct fully_connected_impl : typed_primitive_impl_ocl { params.quantization = kernel_selector::QuantizationType::NONE; } - params.dynamic_quantization_group_size = impl_param.get_program().get_config().get_dynamic_quantization_group_size(); + params.single_batch_optimized = fully_connected_inst::can_apply_single_batch_optimization(updated_impl_param); + params.dynamic_quantization_group_size = + impl_param.get_program().get_config().get_dynamic_quantization_group_size(); return params; } diff --git a/src/plugins/intel_gpu/src/graph/include/fully_connected_inst.h b/src/plugins/intel_gpu/src/graph/include/fully_connected_inst.h index d57d90a9be799a..f706f1292ae5b4 100644 --- 
a/src/plugins/intel_gpu/src/graph/include/fully_connected_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/fully_connected_inst.h @@ -49,6 +49,7 @@ class typed_primitive_inst : public typed_primitive_inst_base= 32" +# endif +// CW (channel-wise) quantization: a single scale group covers the whole K dimension +# if WEIGHTS_K == DECOMPRESSION_GROUP_SIZE_SRC && WEIGHTS_K > 128 +# define SINGLE_GROUP_NUM +# endif +# ifdef SINGLE_GROUP_NUM +# define SCALE_GROUP_NUM (WEIGHTS_K / 128) +# define DECOMPRESSION_GROUP_SIZE 128 +# else +# define SCALE_GROUP_NUM (WEIGHTS_K / DECOMPRESSION_GROUP_SIZE_SRC) +# define DECOMPRESSION_GROUP_SIZE DECOMPRESSION_GROUP_SIZE_SRC +# endif +#else +# define SCALE_GROUP_NUM (WEIGHTS_K / 128) +# define DECOMPRESSION_GROUP_SIZE 128 +#endif + +#if KERNEL_LAYOUT_OS_IS_YX_OSV16 && WEIGHTS_K % 32 != 0 +# error "fully_connected_gpu_gemv.cl - KERNEL_LAYOUT_OS_IS_YX_OSV16 requires WEIGHTS_K % 32 == 0" +#endif + +#if KERNEL_LAYOUT_OS_IS_YX_OSV16 +# define INPUT_TILE_SIZE 2 +#elif KERNEL_LAYOUT_OS_IS_YX_OSV32_ISV2 +# define INPUT_TILE_SIZE 1 +#elif KERNEL_LAYOUT_OS_IS_YX_OSV64_ISV2 +# define INPUT_TILE_SIZE 1 +#else +# error "fully_connected_gpu_gemv.cl - Unsupported layout!" +#endif + +// Macros for vectorized types. +#define GEMV_INPUT_VEC_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, INPUT_TILE_SIZE) +#define GEMV_ACCUMULATOR_VEC_TYPE MAKE_VECTOR_TYPE(float, 8) +#define GEMV_FILTER_VEC_TYPE MAKE_VECTOR_TYPE(half, 16) +#define GEMV_FILTER_PACKED_VEC_TYPE MAKE_VECTOR_TYPE(char, 16) +#define GEMV_OUTPUT_VEC_TYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, 1) +#define TO_GEMV_OUTPUT_VEC_TYPE(x) CAT(convert_, GEMV_OUTPUT_VEC_TYPE)(x) +#define TO_GEMV_FILTER_VEC_TYPE(x) CAT(convert_, GEMV_FILTER_VEC_TYPE)(x) +#define TO_GEMV_FILTER_PACKED_VEC_TYPE(x) CAT(convert_, GEMV_FILTER_PACKED_VEC_TYPE)(x) + +#define GEMV_INPUT_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, INPUT_TILE_SIZE, ptr, offset) +#define GEMV_FILTER_BLOCK_READ(ptr, offset) BLOCK_READN(FILTER_TYPE, 16, ptr, offset) + +inline int get_4bit_weight_index(int k, int n, int K, int N, int OSV) { + return (n / OSV) * (OSV * K / 2) + (n % OSV) + (k / 2) * OSV; +} + +inline int get_4bit_weight_index_no_isv(int k, int n, int K, int N, int OSV) { + return (n / OSV) * (OSV * K / 2) + (k / 2) * OSV; +} + +inline void thread_task_splitter(const int group_num, const int thr_num, const int thr_id, int* n_start, int* n_end) { + if (thr_num <= 1 || group_num == 0) { + *n_start = 0; + *n_end = group_num; + } else { + int num = (group_num + thr_num - 1) / thr_num; + int num_minus = num - 1; + int last = group_num - num_minus * thr_num; + *n_end = thr_id < last ? num : num_minus; + *n_start = thr_id <= last ?
thr_id * num : last * num + (thr_id - last) * num_minus; + } + *n_end += *n_start; +} + +__attribute__((intel_reqd_sub_group_size(SUBGROUP_SIZE))) KERNEL(fully_connected_gpu_gemv)( + OPTIONAL_SHAPE_INFO_ARG __global INPUT0_TYPE* input, +#if DECOMPRESSION_SCALE_TERM + const __global DECOMPRESSION_SCALE_TYPE* scales, +#endif +#if DECOMPRESSION_ZP_TERM && !DECOMPRESSION_ZP_SCALAR + const __global DECOMPRESSION_ZP_TYPE* zps, +#endif + __global OUTPUT_TYPE* output, + const __global FILTER_TYPE* weights +#if BIAS_TERM + , + const __global BIAS_TYPE* bias +#endif +#if HAS_FUSED_OPS_DECLS + , + FUSED_OPS_DECLS +#endif +) { +#if KERNEL_LAYOUT_OS_IS_YX_OSV16 + // global:[N, M, 16] + // local: [16, 1, 16] + int n = get_global_id(0); // N + int thr_id = get_local_id(2); // 0~15 + int thr_num = get_local_size(2); // 16 + int wi_id = get_sub_group_local_id(); // 0~15 + + int gk0, gk1; + thread_task_splitter(SCALE_GROUP_NUM, thr_num, thr_id, &gk0, &gk1); + +# if DECOMPRESSION_ZP_SCALAR + char zp_scalar_value = (char)(DECOMPRESSION_ZP_VALUE); +# endif + + __local float all_sum_even[16][16]; // [wi_id, thr_id] + __local float all_sum_odd[16][16]; + + // Scale layout is byfx + scales += n; +# if DECOMPRESSION_ZP_TERM && !DECOMPRESSION_ZP_SCALAR + zps += n; +# endif + + float sum_all = 0; + for (int gk = gk0; gk < gk1; gk++) { + __global INPUT0_TYPE* A = input + gk * DECOMPRESSION_GROUP_SIZE; + const __global FILTER_TYPE* B = + weights + get_4bit_weight_index_no_isv(gk * DECOMPRESSION_GROUP_SIZE, n, WEIGHTS_K, WEIGHTS_N, 16); + + GEMV_ACCUMULATOR_VEC_TYPE sum = 0; +# ifdef SINGLE_GROUP_NUM + float scale_1 = convert_float(scales[0]); +# else + float scale_1 = convert_float(scales[gk * WEIGHTS_N]); +# endif + +# if DECOMPRESSION_ZP_TERM && !DECOMPRESSION_ZP_SCALAR +# ifdef SINGLE_GROUP_NUM + GEMV_FILTER_VEC_TYPE zpx16 = (GEMV_FILTER_VEC_TYPE)(zps[0]); +# else + GEMV_FILTER_VEC_TYPE zpx16 = (GEMV_FILTER_VEC_TYPE)(zps[gk * WEIGHTS_N]); +# endif +# elif DECOMPRESSION_ZP_SCALAR + GEMV_FILTER_VEC_TYPE zpx16 = (GEMV_FILTER_VEC_TYPE)(zp_scalar_value); +# else + GEMV_FILTER_VEC_TYPE zpx16 = (GEMV_FILTER_VEC_TYPE)0; +# endif + + __attribute__((opencl_unroll_hint(4))) for (int g = 0; g < DECOMPRESSION_GROUP_SIZE; g += 32, B += 16 * 16) { + GEMV_INPUT_VEC_TYPE input_value = GEMV_INPUT_BLOCK_READ(A, g); + GEMV_FILTER_PACKED_VEC_TYPE bx16 = TO_GEMV_FILTER_PACKED_VEC_TYPE(GEMV_FILTER_BLOCK_READ(B, 0)); + +# if WEI_UINT4 + GEMV_FILTER_VEC_TYPE i4x16_even = TO_GEMV_FILTER_VEC_TYPE((bx16 & (char16)0xF)) - zpx16; + GEMV_FILTER_VEC_TYPE i4x16_odd = TO_GEMV_FILTER_VEC_TYPE(as_char16(as_uchar16(bx16) >> 4)) - zpx16; +# else + char16 i4x16_even_c16 = (bx16 & (char16)0xF); + char16 i4x16_odd_c16 = (as_char16(as_uchar16(bx16) >> 4)); + i4x16_even_c16 = select(i4x16_even_c16, i4x16_even_c16 - (char16)16, i4x16_even_c16 > (char16)7); + i4x16_odd_c16 = select(i4x16_odd_c16, i4x16_odd_c16 - (char16)16, i4x16_odd_c16 > (char16)7); + GEMV_FILTER_VEC_TYPE i4x16_even = TO_GEMV_FILTER_VEC_TYPE(i4x16_even_c16) - zpx16; + GEMV_FILTER_VEC_TYPE i4x16_odd = TO_GEMV_FILTER_VEC_TYPE(i4x16_odd_c16) - zpx16; +# endif + + sum[0] += as_half(sub_group_broadcast(input_value.s0, 0)) * i4x16_even.s0 + + as_half(sub_group_broadcast(input_value.s0, 4)) * i4x16_even.s2 + + as_half(sub_group_broadcast(input_value.s0, 8)) * i4x16_even.s4 + + as_half(sub_group_broadcast(input_value.s0, 12)) * i4x16_even.s6; + sum[1] += as_half(sub_group_broadcast(input_value.s0, 1)) * i4x16_odd.s0 + + as_half(sub_group_broadcast(input_value.s0, 5)) * i4x16_odd.s2 + + 
as_half(sub_group_broadcast(input_value.s0, 9)) * i4x16_odd.s4 + + as_half(sub_group_broadcast(input_value.s0, 13)) * i4x16_odd.s6; + + sum[2] += as_half(sub_group_broadcast(input_value.s0, 2)) * i4x16_even.s1 + + as_half(sub_group_broadcast(input_value.s0, 6)) * i4x16_even.s3 + + as_half(sub_group_broadcast(input_value.s0, 10)) * i4x16_even.s5 + + as_half(sub_group_broadcast(input_value.s0, 14)) * i4x16_even.s7; + sum[3] += as_half(sub_group_broadcast(input_value.s0, 3)) * i4x16_odd.s1 + + as_half(sub_group_broadcast(input_value.s0, 7)) * i4x16_odd.s3 + + as_half(sub_group_broadcast(input_value.s0, 11)) * i4x16_odd.s5 + + as_half(sub_group_broadcast(input_value.s0, 15)) * i4x16_odd.s7; + + sum[4] += as_half(sub_group_broadcast(input_value.s1, 0)) * i4x16_even.s8 + + as_half(sub_group_broadcast(input_value.s1, 4)) * i4x16_even.sa + + as_half(sub_group_broadcast(input_value.s1, 8)) * i4x16_even.sc + + as_half(sub_group_broadcast(input_value.s1, 12)) * i4x16_even.se; + sum[5] += as_half(sub_group_broadcast(input_value.s1, 1)) * i4x16_odd.s8 + + as_half(sub_group_broadcast(input_value.s1, 5)) * i4x16_odd.sa + + as_half(sub_group_broadcast(input_value.s1, 9)) * i4x16_odd.sc + + as_half(sub_group_broadcast(input_value.s1, 13)) * i4x16_odd.se; + + sum[6] += as_half(sub_group_broadcast(input_value.s1, 2)) * i4x16_even.s9 + + as_half(sub_group_broadcast(input_value.s1, 6)) * i4x16_even.sb + + as_half(sub_group_broadcast(input_value.s1, 10)) * i4x16_even.sd + + as_half(sub_group_broadcast(input_value.s1, 14)) * i4x16_even.sf; + sum[7] += as_half(sub_group_broadcast(input_value.s1, 3)) * i4x16_odd.s9 + + as_half(sub_group_broadcast(input_value.s1, 7)) * i4x16_odd.sb + + as_half(sub_group_broadcast(input_value.s1, 11)) * i4x16_odd.sd + + as_half(sub_group_broadcast(input_value.s1, 15)) * i4x16_odd.sf; + } + + sum_all += (sum[0] + sum[1] + sum[2] + sum[3] + sum[4] + sum[5] + sum[6] + sum[7]) * scale_1; + } + + all_sum_even[wi_id][thr_id] = sum_all; + barrier(CLK_LOCAL_MEM_FENCE); + + float2 sum_value; + sum_value[0] = as_float(intel_sub_group_block_read((const __local uint*)all_sum_even[thr_id])); + sum_value[0] = sub_group_reduce_add(sum_value[0]); + if (wi_id == 0) { + int cur_n = n + thr_id; +# if BIAS_TERM + sum_value[0] += bias[cur_n]; +# endif +# if HAS_FUSED_OPS + for (int i = 0; i < 1; i++) { + FUSED_OPS_VEC + output[cur_n + i] = FUSED_OPS_RESULT_VEC; + } +# else + for (int i = 0; i < 1; i++) { + output[cur_n + i] = TO_GEMV_OUTPUT_VEC_TYPE(ACTIVATION_TYPED(sum_value[i], ACTIVATION_PARAMS_TYPED)); + } +# endif + } +} + +#elif KERNEL_LAYOUT_OS_IS_YX_OSV32_ISV2 + // global:[N//2, M, 16] + // local: [16, 1, 16] + int n = get_global_id(0) * 2; // N + int thr_id = get_local_id(2); // 0~15 + int thr_num = get_local_size(2); // 16 + int wi_id = get_sub_group_local_id(); // 0~15 + + int gk0, gk1; + thread_task_splitter(SCALE_GROUP_NUM, thr_num, thr_id, &gk0, &gk1); + +# if DECOMPRESSION_ZP_SCALAR + char zp_scalar_value = (char)(DECOMPRESSION_ZP_VALUE); +# endif + + __local float all_sum_even[16][16]; // [wi_id, thr_id] + __local float all_sum_odd[16][16]; + + // Scale layout is fbyx + scales += (n / 32) * 32 + (n % 32) / 2; +# if DECOMPRESSION_ZP_TERM && !DECOMPRESSION_ZP_SCALAR + zps += (n / 32) * 32 + (n % 32) / 2; +# endif + + float2 sum_all = 0; + for (int gk = gk0; gk < gk1; gk++) { + __global INPUT0_TYPE* A = input + gk * DECOMPRESSION_GROUP_SIZE; + const __global FILTER_TYPE* B = + weights + get_4bit_weight_index(gk * DECOMPRESSION_GROUP_SIZE, n, WEIGHTS_K, WEIGHTS_N, 32); + + 
GEMV_ACCUMULATOR_VEC_TYPE sum = 0; +# ifdef SINGLE_GROUP_NUM + float scale_0 = convert_float(scales[0]); + float scale_1 = convert_float(scales[16]); +# else + float scale_0 = convert_float(scales[gk * WEIGHTS_N]); + float scale_1 = convert_float(scales[gk * WEIGHTS_N + 16]); +# endif + +# if DECOMPRESSION_ZP_TERM && !DECOMPRESSION_ZP_SCALAR +# ifdef SINGLE_GROUP_NUM + half zp0 = zps[0]; + half zp1 = zps[16]; +# else + half zp0 = zps[gk * WEIGHTS_N]; + half zp1 = zps[gk * WEIGHTS_N + 16]; +# endif + GEMV_FILTER_VEC_TYPE zpx16 = {zp0, zp1, zp0, zp1, zp0, zp1, zp0, zp1, zp0, zp1, zp0, zp1, zp0, zp1, zp0, zp1}; +# elif DECOMPRESSION_ZP_SCALAR + GEMV_FILTER_VEC_TYPE zpx16 = (GEMV_FILTER_VEC_TYPE)(zp_scalar_value); +# else + GEMV_FILTER_VEC_TYPE zpx16 = (GEMV_FILTER_VEC_TYPE)0; +# endif + + __attribute__((opencl_unroll_hint(4))) for (int g = 0; g < DECOMPRESSION_GROUP_SIZE; g += 16, B += 16 * 16) { + // read 16 elements of A + GEMV_INPUT_VEC_TYPE input_value = GEMV_INPUT_BLOCK_READ(A, g); + + // read 16x16 int8 = (16x2)x16 int4 + + GEMV_FILTER_PACKED_VEC_TYPE bx16 = TO_GEMV_FILTER_PACKED_VEC_TYPE(GEMV_FILTER_BLOCK_READ(B, 0)); + +# if WEI_UINT4 + GEMV_FILTER_VEC_TYPE i4x16_even = TO_GEMV_FILTER_VEC_TYPE(bx16 & (char16)0xF) - zpx16; + GEMV_FILTER_VEC_TYPE i4x16_odd = TO_GEMV_FILTER_VEC_TYPE(as_char16(as_uchar16(bx16) >> 4)) - zpx16; +# else + char16 i4x16_even_c16 = (bx16 & (char16)0xF); + char16 i4x16_odd_c16 = (as_char16(as_uchar16(bx16) >> 4)); + i4x16_even_c16 = select(i4x16_even_c16, i4x16_even_c16 - (char16)16, i4x16_even_c16 > (char16)7); + i4x16_odd_c16 = select(i4x16_odd_c16, i4x16_odd_c16 - (char16)16, i4x16_odd_c16 > (char16)7); + GEMV_FILTER_VEC_TYPE i4x16_even = TO_GEMV_FILTER_VEC_TYPE(i4x16_even_c16) - zpx16; + GEMV_FILTER_VEC_TYPE i4x16_odd = TO_GEMV_FILTER_VEC_TYPE(i4x16_odd_c16) - zpx16; +# endif + + sum[0] += as_half(sub_group_broadcast(input_value, 0)) * i4x16_even.s0 + + as_half(sub_group_broadcast(input_value, 4)) * i4x16_even.s4 + + as_half(sub_group_broadcast(input_value, 8)) * i4x16_even.s8 + + as_half(sub_group_broadcast(input_value, 12)) * i4x16_even.sc; + + sum[1] += as_half(sub_group_broadcast(input_value, 0)) * i4x16_even.s1 + + as_half(sub_group_broadcast(input_value, 4)) * i4x16_even.s5 + + as_half(sub_group_broadcast(input_value, 8)) * i4x16_even.s9 + + as_half(sub_group_broadcast(input_value, 12)) * i4x16_even.sd; + + sum[2] += as_half(sub_group_broadcast(input_value, 1)) * i4x16_odd.s0 + + as_half(sub_group_broadcast(input_value, 5)) * i4x16_odd.s4 + + as_half(sub_group_broadcast(input_value, 9)) * i4x16_odd.s8 + + as_half(sub_group_broadcast(input_value, 13)) * i4x16_odd.sc; + + sum[3] += as_half(sub_group_broadcast(input_value, 1)) * i4x16_odd.s1 + + as_half(sub_group_broadcast(input_value, 5)) * i4x16_odd.s5 + + as_half(sub_group_broadcast(input_value, 9)) * i4x16_odd.s9 + + as_half(sub_group_broadcast(input_value, 13)) * i4x16_odd.sd; + + sum[4] += as_half(sub_group_broadcast(input_value, 2)) * i4x16_even.s2 + + as_half(sub_group_broadcast(input_value, 6)) * i4x16_even.s6 + + as_half(sub_group_broadcast(input_value, 10)) * i4x16_even.sa + + as_half(sub_group_broadcast(input_value, 14)) * i4x16_even.se; + + sum[5] += as_half(sub_group_broadcast(input_value, 2)) * i4x16_even.s3 + + as_half(sub_group_broadcast(input_value, 6)) * i4x16_even.s7 + + as_half(sub_group_broadcast(input_value, 10)) * i4x16_even.sb + + as_half(sub_group_broadcast(input_value, 14)) * i4x16_even.sf; + + sum[6] += as_half(sub_group_broadcast(input_value, 3)) * i4x16_odd.s2 + + 
as_half(sub_group_broadcast(input_value, 7)) * i4x16_odd.s6 + + as_half(sub_group_broadcast(input_value, 11)) * i4x16_odd.sa + + as_half(sub_group_broadcast(input_value, 15)) * i4x16_odd.se; + + sum[7] += as_half(sub_group_broadcast(input_value, 3)) * i4x16_odd.s3 + + as_half(sub_group_broadcast(input_value, 7)) * i4x16_odd.s7 + + as_half(sub_group_broadcast(input_value, 11)) * i4x16_odd.sb + + as_half(sub_group_broadcast(input_value, 15)) * i4x16_odd.sf; + } + + sum_all[0] += (sum[0] + sum[2] + sum[4] + sum[6]) * scale_0; + sum_all[1] += (sum[1] + sum[3] + sum[5] + sum[7]) * scale_1; + } + + all_sum_even[wi_id][thr_id] = sum_all[0]; + all_sum_odd[wi_id][thr_id] = sum_all[1]; + barrier(CLK_LOCAL_MEM_FENCE); + + float2 sum_value; + sum_value[0] = as_float(intel_sub_group_block_read((const __local uint*)all_sum_even[thr_id])); + sum_value[1] = as_float(intel_sub_group_block_read((const __local uint*)all_sum_odd[thr_id])); + sum_value[0] = sub_group_reduce_add(sum_value[0]); + sum_value[1] = sub_group_reduce_add(sum_value[1]); + + if (wi_id == 0) { + int cur_n = n + thr_id; + + // bias +# if BIAS_TERM + sum_value[0] += bias[cur_n]; + sum_value[1] += bias[cur_n + 16]; +# endif + +// fused_op +# if HAS_FUSED_OPS + for (int i = 0; i < 2; i++) { + FUSED_OPS_VEC + output[cur_n + 16 * i] = FUSED_OPS_RESULT_VEC; + } +# else + for (int i = 0; i < 2; i++) { + output[cur_n + 16 * i] = TO_GEMV_OUTPUT_VEC_TYPE(ACTIVATION_TYPED(sum_value[i], ACTIVATION_PARAMS_TYPED)); + } +# endif + } +} +#elif KERNEL_LAYOUT_OS_IS_YX_OSV64_ISV2 + // global:[N//4, M, 16] + // local: [16, 1, 16] + int n = get_global_id(0) * 4; // N + int thr_id = get_local_id(2); // 0~15 + int thr_num = get_local_size(2); // 16 + int wi_id = get_sub_group_local_id(); // 0~15 + + int gk0, gk1; + thread_task_splitter(SCALE_GROUP_NUM, thr_num, thr_id, &gk0, &gk1); + + __local float all_sum_0[16][16]; // [wi_id, thr_id] + __local float all_sum_1[16][16]; // [wi_id, thr_id] + __local float all_sum_2[16][16]; // [wi_id, thr_id] + __local float all_sum_3[16][16]; // [wi_id, thr_id] + + scales += (n / 64) * 64 + (n % 64) / 4; +# if DECOMPRESSION_ZP_TERM && !DECOMPRESSION_ZP_SCALAR + zps += (n / 64) * 64 + (n % 64) / 4; +# endif + + float4 sum_all = 0; + for (int gk = gk0; gk < gk1; gk++) { + __global INPUT0_TYPE* A = input + gk * DECOMPRESSION_GROUP_SIZE; + const __global FILTER_TYPE* B = + weights + get_4bit_weight_index(gk * DECOMPRESSION_GROUP_SIZE, n, WEIGHTS_K, WEIGHTS_N, 64); + + GEMV_ACCUMULATOR_VEC_TYPE sum = 0; +# ifdef SINGLE_GROUP_NUM + float scale_0 = convert_float(scales[0]); + float scale_1 = convert_float(scales[16]); + float scale_2 = convert_float(scales[2 * 16]); + float scale_3 = convert_float(scales[3 * 16]); +# else + float scale_0 = convert_float(scales[gk * WEIGHTS_N]); + float scale_1 = convert_float(scales[gk * WEIGHTS_N + 1 * 16]); + float scale_2 = convert_float(scales[gk * WEIGHTS_N + 2 * 16]); + float scale_3 = convert_float(scales[gk * WEIGHTS_N + 3 * 16]); +# endif +# if DECOMPRESSION_ZP_TERM && !DECOMPRESSION_ZP_SCALAR +# ifdef SINGLE_GROUP_NUM + half zp0 = zps[0]; + half zp1 = zps[1 * 16]; + half zp2 = zps[2 * 16]; + half zp3 = zps[3 * 16]; +# else + half zp0 = zps[gk * WEIGHTS_N]; + half zp1 = zps[gk * WEIGHTS_N + 1 * 16]; + half zp2 = zps[gk * WEIGHTS_N + 2 * 16]; + half zp3 = zps[gk * WEIGHTS_N + 3 * 16]; +# endif + GEMV_FILTER_VEC_TYPE zpx16 = {zp0, zp1, zp2, zp3, zp0, zp1, zp2, zp3, zp0, zp1, zp2, zp3, zp0, zp1, zp2, zp3}; +# elif DECOMPRESSION_ZP_SCALAR + half zp_scalar_value = (half)(DECOMPRESSION_ZP_VALUE); 
+ GEMV_FILTER_VEC_TYPE zpx16 = (GEMV_FILTER_VEC_TYPE)(zp_scalar_value); +# else + GEMV_FILTER_VEC_TYPE zpx16 = (GEMV_FILTER_VEC_TYPE)0; +# endif + + __attribute__((opencl_unroll_hint(2))) for (int g = 0; g < DECOMPRESSION_GROUP_SIZE; g += 16, B += 16 * 32) { + // read 16 elements of A + GEMV_INPUT_VEC_TYPE input_value = GEMV_INPUT_BLOCK_READ(A, g); + GEMV_FILTER_PACKED_VEC_TYPE bx16 = TO_GEMV_FILTER_PACKED_VEC_TYPE(GEMV_FILTER_BLOCK_READ(B, 0)); + GEMV_FILTER_PACKED_VEC_TYPE bx16_second = + TO_GEMV_FILTER_PACKED_VEC_TYPE(GEMV_FILTER_BLOCK_READ(B, 16 * 16)); + +# if WEI_UINT4 + GEMV_FILTER_VEC_TYPE i4x16_even = convert_half16((bx16 & (char16)0xF)) - zpx16; + GEMV_FILTER_VEC_TYPE i4x16_odd = convert_half16(as_char16(as_uchar16(bx16) >> 4)) - zpx16; + GEMV_FILTER_VEC_TYPE i4x16_even_second = convert_half16((bx16_second & (char16)0xF)) - zpx16; + GEMV_FILTER_VEC_TYPE i4x16_odd_second = convert_half16(as_char16(as_uchar16(bx16_second) >> 4)) - zpx16; +# else + char16 i4x16_even_c16 = (bx16 & (char16)0xF); + char16 i4x16_odd_c16 = (as_char16(as_uchar16(bx16) >> 4)); + i4x16_even_c16 = select(i4x16_even_c16, i4x16_even_c16 - (char16)16, i4x16_even_c16 > (char16)7); + i4x16_odd_c16 = select(i4x16_odd_c16, i4x16_odd_c16 - (char16)16, i4x16_odd_c16 > (char16)7); + + char16 i4x16_even_c16_second = (bx16_second & (char16)0xF); + char16 i4x16_odd_c16_second = (as_char16(as_uchar16(bx16_second) >> 4)); + i4x16_even_c16_second = + select(i4x16_even_c16_second, i4x16_even_c16_second - (char16)16, i4x16_even_c16_second > (char16)7); + i4x16_odd_c16_second = + select(i4x16_odd_c16_second, i4x16_odd_c16_second - (char16)16, i4x16_odd_c16_second > (char16)7); + + GEMV_FILTER_VEC_TYPE i4x16_even = convert_half16(i4x16_even_c16) - zpx16; + GEMV_FILTER_VEC_TYPE i4x16_odd = convert_half16(i4x16_odd_c16) - zpx16; + GEMV_FILTER_VEC_TYPE i4x16_even_second = convert_half16(i4x16_even_c16_second) - zpx16; + GEMV_FILTER_VEC_TYPE i4x16_odd_second = convert_half16(i4x16_odd_c16_second) - zpx16; +# endif + + sum[0] += as_half(sub_group_broadcast(input_value, 0)) * i4x16_even.s0 + + as_half(sub_group_broadcast(input_value, 2)) * i4x16_even.s4 + + as_half(sub_group_broadcast(input_value, 4)) * i4x16_even.s8 + + as_half(sub_group_broadcast(input_value, 6)) * i4x16_even.sc; + sum[0] += as_half(sub_group_broadcast(input_value, 8)) * i4x16_even_second.s0 + + as_half(sub_group_broadcast(input_value, 10)) * i4x16_even_second.s4 + + as_half(sub_group_broadcast(input_value, 12)) * i4x16_even_second.s8 + + as_half(sub_group_broadcast(input_value, 14)) * i4x16_even_second.sc; + sum[1] += as_half(sub_group_broadcast(input_value, 0)) * i4x16_even.s1 + + as_half(sub_group_broadcast(input_value, 2)) * i4x16_even.s5 + + as_half(sub_group_broadcast(input_value, 4)) * i4x16_even.s9 + + as_half(sub_group_broadcast(input_value, 6)) * i4x16_even.sd; + sum[1] += as_half(sub_group_broadcast(input_value, 8)) * i4x16_even_second.s1 + + as_half(sub_group_broadcast(input_value, 10)) * i4x16_even_second.s5 + + as_half(sub_group_broadcast(input_value, 12)) * i4x16_even_second.s9 + + as_half(sub_group_broadcast(input_value, 14)) * i4x16_even_second.sd; + sum[2] += as_half(sub_group_broadcast(input_value, 0)) * i4x16_even.s2 + + as_half(sub_group_broadcast(input_value, 2)) * i4x16_even.s6 + + as_half(sub_group_broadcast(input_value, 4)) * i4x16_even.sa + + as_half(sub_group_broadcast(input_value, 6)) * i4x16_even.se; + sum[2] += as_half(sub_group_broadcast(input_value, 8)) * i4x16_even_second.s2 + + as_half(sub_group_broadcast(input_value, 10)) * 
i4x16_even_second.s6 + + as_half(sub_group_broadcast(input_value, 12)) * i4x16_even_second.sa + + as_half(sub_group_broadcast(input_value, 14)) * i4x16_even_second.se; + sum[3] += as_half(sub_group_broadcast(input_value, 0)) * i4x16_even.s3 + + as_half(sub_group_broadcast(input_value, 2)) * i4x16_even.s7 + + as_half(sub_group_broadcast(input_value, 4)) * i4x16_even.sb + + as_half(sub_group_broadcast(input_value, 6)) * i4x16_even.sf; + sum[3] += as_half(sub_group_broadcast(input_value, 8)) * i4x16_even_second.s3 + + as_half(sub_group_broadcast(input_value, 10)) * i4x16_even_second.s7 + + as_half(sub_group_broadcast(input_value, 12)) * i4x16_even_second.sb + + as_half(sub_group_broadcast(input_value, 14)) * i4x16_even_second.sf; + sum[4] += as_half(sub_group_broadcast(input_value, 1)) * i4x16_odd.s0 + + as_half(sub_group_broadcast(input_value, 3)) * i4x16_odd.s4 + + as_half(sub_group_broadcast(input_value, 5)) * i4x16_odd.s8 + + as_half(sub_group_broadcast(input_value, 7)) * i4x16_odd.sc; + sum[4] += as_half(sub_group_broadcast(input_value, 9)) * i4x16_odd_second.s0 + + as_half(sub_group_broadcast(input_value, 11)) * i4x16_odd_second.s4 + + as_half(sub_group_broadcast(input_value, 13)) * i4x16_odd_second.s8 + + as_half(sub_group_broadcast(input_value, 15)) * i4x16_odd_second.sc; + sum[5] += as_half(sub_group_broadcast(input_value, 1)) * i4x16_odd.s1 + + as_half(sub_group_broadcast(input_value, 3)) * i4x16_odd.s5 + + as_half(sub_group_broadcast(input_value, 5)) * i4x16_odd.s9 + + as_half(sub_group_broadcast(input_value, 7)) * i4x16_odd.sd; + sum[5] += as_half(sub_group_broadcast(input_value, 9)) * i4x16_odd_second.s1 + + as_half(sub_group_broadcast(input_value, 11)) * i4x16_odd_second.s5 + + as_half(sub_group_broadcast(input_value, 13)) * i4x16_odd_second.s9 + + as_half(sub_group_broadcast(input_value, 15)) * i4x16_odd_second.sd; + sum[6] += as_half(sub_group_broadcast(input_value, 1)) * i4x16_odd.s2 + + as_half(sub_group_broadcast(input_value, 3)) * i4x16_odd.s6 + + as_half(sub_group_broadcast(input_value, 5)) * i4x16_odd.sa + + as_half(sub_group_broadcast(input_value, 7)) * i4x16_odd.se; + sum[6] += as_half(sub_group_broadcast(input_value, 9)) * i4x16_odd_second.s2 + + as_half(sub_group_broadcast(input_value, 11)) * i4x16_odd_second.s6 + + as_half(sub_group_broadcast(input_value, 13)) * i4x16_odd_second.sa + + as_half(sub_group_broadcast(input_value, 15)) * i4x16_odd_second.se; + sum[7] += as_half(sub_group_broadcast(input_value, 1)) * i4x16_odd.s3 + + as_half(sub_group_broadcast(input_value, 3)) * i4x16_odd.s7 + + as_half(sub_group_broadcast(input_value, 5)) * i4x16_odd.sb + + as_half(sub_group_broadcast(input_value, 7)) * i4x16_odd.sf; + sum[7] += as_half(sub_group_broadcast(input_value, 9)) * i4x16_odd_second.s3 + + as_half(sub_group_broadcast(input_value, 11)) * i4x16_odd_second.s7 + + as_half(sub_group_broadcast(input_value, 13)) * i4x16_odd_second.sb + + as_half(sub_group_broadcast(input_value, 15)) * i4x16_odd_second.sf; + } + + sum_all[0] += (sum[0] + sum[4]) * scale_0; + sum_all[1] += (sum[1] + sum[5]) * scale_1; + sum_all[2] += (sum[2] + sum[6]) * scale_2; + sum_all[3] += (sum[3] + sum[7]) * scale_3; + } + + all_sum_0[wi_id][thr_id] = sum_all[0]; + all_sum_1[wi_id][thr_id] = sum_all[1]; + all_sum_2[wi_id][thr_id] = sum_all[2]; + all_sum_3[wi_id][thr_id] = sum_all[3]; + barrier(CLK_LOCAL_MEM_FENCE); + + float4 sum_value; + sum_value[0] = as_float(intel_sub_group_block_read((const __local uint*)all_sum_0[thr_id])); + sum_value[1] = as_float(intel_sub_group_block_read((const __local 
uint*)all_sum_1[thr_id])); + sum_value[2] = as_float(intel_sub_group_block_read((const __local uint*)all_sum_2[thr_id])); + sum_value[3] = as_float(intel_sub_group_block_read((const __local uint*)all_sum_3[thr_id])); + + for (int i = 0; i < 4; i++) { + sum_value[i] = sub_group_reduce_add(sum_value[i]); + } + + if (wi_id == 0) { + int cur_n = n + thr_id; +# if BIAS_TERM + for (int i = 0; i < 4; i++) { + sum_value[i] += bias[cur_n + 16 * i]; + } +# endif +# if HAS_FUSED_OPS + for (int i = 0; i < 4; i++) { + FUSED_OPS_VEC + output[cur_n + 16 * i] = FUSED_OPS_RESULT_VEC; + } +# else + for (int i = 0; i < 4; i++) { + output[cur_n + 16 * i] = TO_GEMV_OUTPUT_VEC_TYPE(ACTIVATION_TYPED(sum_value[i], ACTIVATION_PARAMS_TYPED)); + } +# endif + } +} +#endif + +#undef INPUT_TILE_SIZE +#undef GEMV_FILTER_BLOCK_READ +#undef GEMV_INPUT_BLOCK_READ +#undef TO_GEMV_FILTER_PACKED_VEC_TYPE +#undef TO_GEMV_FILTER_VEC_TYPE +#undef TO_GEMV_OUTPUT_VEC_TYPE +#undef GEMV_OUTPUT_VEC_TYPE +#undef GEMV_FILTER_PACKED_VEC_TYPE +#undef GEMV_ACCUMULATOR_VEC_TYPE +#undef GEMV_INPUT_VEC_TYPE +#undef SUBGROUP_SIZE +#undef KERNEL_LAYOUT_OS_IS_YX_OSV16 +#undef KERNEL_LAYOUT_OS_IS_YX_OSV32_ISV2 +#undef KERNEL_LAYOUT_OS_IS_YX_OSV64_ISV2 +#undef DECOMPRESSION_GROUP_SIZE_SRC \ No newline at end of file diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp index 7354cfef609437..2fee3d845433e8 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp @@ -18,7 +18,8 @@ static std::vector available_quantize_grp_size = {128, 64, 32}; namespace kernel_selector { -static std::pair get_input_bf_size(const fully_connected_params& params) { +namespace fc_kernel_bf_tiled_utils { +std::pair get_input_bf_size(const fully_connected_params& params) { auto& input = params.inputs[0]; size_t input_f = input.Feature().v; size_t input_batch = input.Batch().v; @@ -36,11 +37,12 @@ static std::pair get_input_bf_size(const fully_connected_params& return {input_batch, input_f}; } -static std::pair get_output_aligned_bf_size(const fully_connected_params& params, - bool needs_align, - uint32_t align_b = 1, - int32_t align_f = 1) { - size_t output_f = (needs_align == true) ? CeilDiv(params.outputs[0].Feature().v, align_f) : params.outputs[0].Feature().v; +std::pair get_output_aligned_bf_size(const fully_connected_params& params, + bool needs_align, + uint32_t align_b, + int32_t align_f) { + size_t output_f = + (needs_align == true) ? CeilDiv(params.outputs[0].Feature().v, align_f) : params.outputs[0].Feature().v; size_t output_b = params.outputs[0].Batch().v; // 3D output if (params.outputs[0].GetLayout() == DataLayout::bfyx) { @@ -53,11 +55,11 @@ static std::pair get_output_aligned_bf_size(const fully_connecte return {output_b, output_f}; } -static size_t get_scale_group_size(const fully_connected_params& params) { +size_t get_scale_group_size(const fully_connected_params& params) { return params.weights.IFM().v / params.decompression_scale.Feature().v; } -static bool is_8bit_asym_wei(const fully_connected_params& params) { +bool is_8bit_asym_wei(const fully_connected_params& params) { auto weight_type = params.weights.GetDType(); // UINT8 weight type is supported by FC dyn-quantize(with SLM). 
if (weight_type == WeightsType::UINT8 && params.has_decompression_zp) @@ -66,7 +68,7 @@ static bool is_8bit_asym_wei(const fully_connected_params& params) { return false; } -static bool is_weight_dyn_quantizable(const fully_connected_params& params) { +bool is_weight_dyn_quantizable(const fully_connected_params& params) { auto weight_type = params.weights.GetDType(); if (weight_type == WeightsType::INT4 || weight_type == WeightsType::UINT4) return true; @@ -77,16 +79,16 @@ static bool is_weight_dyn_quantizable(const fully_connected_params& params) { return false; } -static bool is_per_token_dynamic_quantize(const fully_connected_params& params) { +bool is_per_token_dynamic_quantize(const fully_connected_params& params) { auto dynamic_quantization_group_size = params.dynamic_quantization_group_size; if (dynamic_quantization_group_size == UINT64_MAX) return true; return false; - } +} // DYNAMIC_QUANTIZE -static size_t get_dynamic_quantize_group_size(const fully_connected_params& params) { +size_t get_dynamic_quantize_group_size(const fully_connected_params& params) { auto dynamic_quantization_group_size = params.dynamic_quantization_group_size; size_t scale_group_size = get_scale_group_size(params); @@ -107,22 +109,23 @@ static size_t get_dynamic_quantize_group_size(const fully_connected_params& para dynamic_quantization_group_size = zp_group_size; } - GPU_DEBUG_LOG << "FC dyn-quantize by per-token. Actual dyn_quan_group_size(" << dynamic_quantization_group_size - << ") : From scale_group_size (" << scale_group_size << ", zp_group_size(" << zp_group_size - << "), zp_group_num(" << zp_group_num << "), ifm_size (" << get_input_bf_size(params).second << ")" << std::endl; + GPU_DEBUG_LOG << "FC dyn-quantize by per-token. Actual dyn_quan_group_size(" + << dynamic_quantization_group_size << ") : From scale_group_size (" << scale_group_size + << ", zp_group_size(" << zp_group_size << "), zp_group_num(" << zp_group_num + << "), ifm_size (" << get_input_bf_size(params).second << ")" << std::endl; return (size_t)dynamic_quantization_group_size; } } // Grouped-size dyn-quan : use aligned sizes which are in 'available_quantize_grp_size' for (auto group_size : available_quantize_grp_size) { - if (dynamic_quantization_group_size >= group_size && - (scale_group_size % group_size) == 0) { + if (dynamic_quantization_group_size >= group_size && (scale_group_size % group_size) == 0) { dynamic_quantization_group_size = group_size; if (dynamic_quantization_group_size > scale_group_size) { - GPU_DEBUG_TRACE_DETAIL << " Scale group size " << scale_group_size << " is smaller than FC dyn-quan group size " - << dynamic_quantization_group_size << ". Reduce FC dyn-quan group size to scale size." << std::endl; + GPU_DEBUG_TRACE_DETAIL << " Scale group size " << scale_group_size + << " is smaller than FC dyn-quan group size " << dynamic_quantization_group_size + << ". Reduce FC dyn-quan group size to scale size." 
<< std::endl; dynamic_quantization_group_size = scale_group_size; } @@ -133,16 +136,16 @@ static size_t get_dynamic_quantize_group_size(const fully_connected_params& para return 0; } -static bool should_dynamic_quantize(const fully_connected_params& params) { +bool should_dynamic_quantize(const fully_connected_params& params) { size_t dynamic_quantization_group_size = get_dynamic_quantize_group_size(params); if (params.inputs[0].GetFirstElementOffset() != 0) return false; if (dynamic_quantization_group_size < min_quantize_grp_size) { - GPU_DEBUG_TRACE_DETAIL << "Set dynamic_quantize_group_size " << dynamic_quantization_group_size - << " is smaller than minimum supported size 32" << std::endl; - return false; + GPU_DEBUG_TRACE_DETAIL << "Set dynamic_quantize_group_size " << dynamic_quantization_group_size + << " is smaller than minimum supported size 32" << std::endl; + return false; } const size_t scale_group_size = get_scale_group_size(params); @@ -156,45 +159,47 @@ static bool should_dynamic_quantize(const fully_connected_params& params) { if ((scale_group_size % simd == 0) && (input_f % dynamic_quantization_group_size == 0) && (params.is_shape_agnostic || (params.inputs[0].Batch().v > 1 && input_b > min_slm_size)) && params.inputs[0].GetDType() == Datatype::F16 && is_weight_dyn_quantizable(params)) { - GPU_DEBUG_TRACE_DETAIL << " Dynamic quantizing for FC : scale_group_size: " << scale_group_size << - ", Dyn-quan group size: " << dynamic_quantization_group_size << - ", Type(I:" << kernel_selector::toString(params.inputs[0].GetDType()) << - ", O:" << kernel_selector::toString(params.outputs[0].GetDType()) << - ", W:" << kernel_selector::toString(params.weights.GetDType()) << - "), Format(W:" << kernel_selector::toString(params.weights.GetLayout()) << - ") B: " << params.inputs[0].Batch().v << ", F: " << params.inputs[0].Feature().v << - ", Y: " << params.inputs[0].Y().v << std ::endl; + GPU_DEBUG_TRACE_DETAIL << " Dynamic quantizing for FC : scale_group_size: " << scale_group_size + << ", Dyn-quan group size: " << dynamic_quantization_group_size + << ", Type(I:" << kernel_selector::toString(params.inputs[0].GetDType()) + << ", O:" << kernel_selector::toString(params.outputs[0].GetDType()) + << ", W:" << kernel_selector::toString(params.weights.GetDType()) + << "), Format(W:" << kernel_selector::toString(params.weights.GetLayout()) + << ") B: " << params.inputs[0].Batch().v << ", F: " << params.inputs[0].Feature().v + << ", Y: " << params.inputs[0].Y().v << std ::endl; return true; } return false; } -static bool is_weight_vertical(const fully_connected_params& params, size_t output_f) { +bool is_weight_vertical(const fully_connected_params& params, size_t output_f) { size_t min_num_threads = params.engineInfo.computeUnitsCount * simd; - GPU_DEBUG_TRACE_DETAIL << "out_ofm (== weight N dim) size " << output_f << " is small compared to the available threads. " + GPU_DEBUG_TRACE_DETAIL << "out_ofm (== weight N dim) size " << output_f + << " is small compared to the available threads. " << "(computeUnitsCount : " << params.engineInfo.computeUnitsCount << " min_num_threads : " << min_num_threads << ")" << std::endl; GPU_DEBUG_TRACE_DETAIL << "Use ofm_tile size 1 if the batch size is 1." 
<< std::endl; - return (params.weights.IFM().v >= params.weights.OFM().v * 3 - && output_f / 2 /*most frequently used tile_ofm*/ <= min_num_threads); + return (params.weights.IFM().v >= params.weights.OFM().v * 3 && + output_f / 2 /*most frequently used tile_ofm*/ <= min_num_threads); } -static bool is_weight_horizontal(const fully_connected_params& params, size_t output_f) { +bool is_weight_horizontal(const fully_connected_params& params, size_t output_f) { size_t min_num_threads = params.engineInfo.computeUnitsCount * simd; - GPU_DEBUG_TRACE_DETAIL << "out_ofm (== weight N dim) size " << output_f << " is large compared to the available threads. " + GPU_DEBUG_TRACE_DETAIL << "out_ofm (== weight N dim) size " << output_f + << " is large compared to the available threads. " << "(computeUnitsCount : " << params.engineInfo.computeUnitsCount << " min_num_threads : " << min_num_threads << ")" << std::endl; - return (params.weights.OFM().v > params.weights.IFM().v * 3 - && output_f / 4 /* tile_ofm=4 */ > min_num_threads * 1.5); + return (params.weights.OFM().v > params.weights.IFM().v * 3 && + output_f / 4 /* tile_ofm=4 */ > min_num_threads * 1.5); } -static bool is_weight_small_kn(const fully_connected_params& params, size_t output_f) { +bool is_weight_small_kn(const fully_connected_params& params, size_t output_f) { size_t min_num_threads = params.engineInfo.computeUnitsCount * simd; return output_f / 2 /*most frequently used tile_ofm*/ <= min_num_threads; } -static bool is_swiglu_fused(const fully_connected_params& params) { +bool is_swiglu_fused(const fully_connected_params& params) { bool swiglu_fused = false; if (!params.fused_ops.empty()) { for (auto p : params.fused_ops) { @@ -206,13 +211,15 @@ static bool is_swiglu_fused(const fully_connected_params& params) { OPENVINO_ASSERT(params.fused_ops.size() == 1); return swiglu_fused; } -static bool is_suitable_outer_ofm(const fully_connected_params& params, size_t output_f) { +bool is_suitable_outer_ofm(const fully_connected_params& params, size_t output_f) { if (is_swiglu_fused(params)) return true; size_t min_num_threads = params.engineInfo.computeUnitsCount * simd; - return (params.weights.OFM().v > params.weights.IFM().v * 6 - && output_f / 8 /* tile_ofm=4 and outer_ofm=2 */ > min_num_threads * 1.5); + return (params.weights.OFM().v > params.weights.IFM().v * 6 && + output_f / 8 /* tile_ofm=4 and outer_ofm=2 */ > min_num_threads * 1.5); } +} // namespace fc_kernel_bf_tiled_utils +using namespace fc_kernel_bf_tiled_utils; FullyConnected_bf_tiled::FullyConnected_bf_tiled() : FullyConnectedKernelBase("fully_connected_gpu_bf_tiled") { for (unsigned tile_b = 1; tile_b <= 32; ++tile_b) @@ -281,6 +288,10 @@ bool FullyConnected_bf_tiled::Validate(const Params& params) const { auto& output = fc_params.outputs[0]; auto& weights = fc_params.weights; + if (fc_params.single_batch_optimized) { + return false; + } + // Block reads must be aligned to 4 bytes, for fp16 we can correct for offset misalignment, // but we need to ensure that batch pitch preserves alignment. 
if (input.GetDType() == Datatype::F16) { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.h index 7c6c53d6af4c95..e8a9e14ac8af3d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.h @@ -87,4 +87,25 @@ class FullyConnected_bf_tiled : public FullyConnectedKernelBase { std::vector auto_tune_params; }; + +namespace fc_kernel_bf_tiled_utils { +using namespace kernel_selector; +std::pair get_input_bf_size(const fully_connected_params& params); +std::pair get_output_aligned_bf_size(const fully_connected_params& params, + bool needs_align, + uint32_t align_b = 1, + int32_t align_f = 1); +size_t get_scale_group_size(const fully_connected_params& params); +bool is_8bit_asym_wei(const fully_connected_params& params); +bool is_weight_dyn_quantizable(const fully_connected_params& params); +bool is_per_token_dynamic_quantize(const fully_connected_params& params); +size_t get_dynamic_quantize_group_size(const fully_connected_params& params); +bool should_dynamic_quantize(const fully_connected_params& params); +bool is_weight_vertical(const fully_connected_params& params, size_t output_f); +bool is_weight_horizontal(const fully_connected_params& params, size_t output_f); +bool is_weight_small_kn(const fully_connected_params& params, size_t output_f); +bool is_swiglu_fused(const fully_connected_params& params); +bool is_suitable_outer_ofm(const fully_connected_params& params, size_t output_f); +}; // namespace fc_kernel_bf_tiled_utils + } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_gemv.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_gemv.cpp new file mode 100644 index 00000000000000..3f37bf5fb75163 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_gemv.cpp @@ -0,0 +1,247 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "fully_connected_kernel_gemv.h" +#include "fully_connected_kernel_bf_tiled.h" + +#include "common_types.h" +#include "kernel_selector_utils.h" +#include "swiglu/swiglu_kernel_base.h" + +using namespace kernel_selector::fc_kernel_bf_tiled_utils; +static constexpr size_t simd = 16; + +namespace kernel_selector { +ParamsKey FullyConnected_GEMV::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F16); + k.EnableInputWeightsType(WeightsType::INT4); + k.EnableInputWeightsType(WeightsType::UINT4); + k.EnableInputLayout(DataLayout::bf); + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bf); + k.EnableOutputLayout(DataLayout::bfyx); + k.EnableBiasPerOutput(); + k.EnableNonBiasTerm(); + k.EnableTensorOffset(); + k.EnableDifferentInputWeightsTypes(); + k.EnableDynamicShapesSupport(); + k.EnableWeightsCompression(); + k.EnableBatching(); + k.EnableBiasPerFeature(); + k.EnableDifferentTypes(); + + return k; +} + +DeviceFeaturesKey FullyConnected_GEMV::get_required_device_features_key(const Params& params) const { + auto k = get_common_subgroups_device_features_key(params); + k.requires_subgroup_broadcast(); + k.requires_blocked_read_write(); + 
k.requires_blocked_read_write_char(); + k.requires_blocked_read_write_short(); + + return k; +} + +bool FullyConnected_GEMV::Validate(const Params& params) const { + if (!Parent::Validate(params)) + return false; + + const auto& fc_params = static_cast(params); + const auto& input = fc_params.inputs[0]; + const auto& output = fc_params.outputs[0]; + const auto& weights = fc_params.weights; + + if (!fc_params.compressed) { + return false; + } + const size_t scale_group_size = weights.IFM().v / fc_params.decompression_scale.Feature().v; + if (scale_group_size == 0 || scale_group_size % 16 != 0) { + return false; + } + + // Data type re-check: only support f16:int4:f16 + if (input.GetDType() != Datatype::F16 || output.GetDType() != Datatype::F16 || + (weights.GetDType() != WeightsType::INT4 && weights.GetDType() != WeightsType::UINT4)) { + return false; + } + + // Only support vector data as input, the data size should be aligned by 16 elements + auto input_size = get_input_bf_size(fc_params); + if (input_size.first > 1 || input_size.second == 0 || input_size.second % 16 != 0 || weights.IFM().v % 16 != 0) { + return false; + } + + auto wl = weights.GetLayout(); + auto wo = weights.OFM().v; + + auto& fc_input = fc_params.inputs[0]; + if (is_swiglu_fused(fc_params)) { + return false; + } + + if (input_size.first != 0 && fc_input.is_dynamic()) { + if (input_size.first != 1) { + return false; + } + if (!(wl == WeightsLayout::os_is_yx_osv32_isv2 && wo % 32 == 0) && + !(wl == WeightsLayout::os_is_yx_osv64_isv2 && wo % 64 == 0) && + !(wl == WeightsLayout::os_iyx_osv16 && wo % 16 == 0)) { + return false; + } + } + + if (input.GetLayout() == DataLayout::bfyx) { + // Padding on input is not supported. + if (input.X().pad.Total() != 0) + return false; + if (input.Y().pad.Total() != 0) + return false; + } + + // We don't support 4d output + if (fc_params.outputs[0].GetLayout() == DataLayout::bfyx && fc_params.outputs[0].X().v > 1) + return false; + + return true; +} + +FullyConnected_GEMV::DispatchData FullyConnected_GEMV::SetDefault(const fully_connected_params& params, + int, + int /*kernel_number*/) const { + auto dispatchData = Parent::SetDefault(params); + + std::vector global = {params.weights.OFM().v, 1, 16}; + if (params.weights.GetLayout() == WeightsLayout::os_iyx_osv16) { + global[0] = params.weights.OFM().v; + } else if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2) { + global[0] = params.weights.OFM().v / 2; + // if (is_swiglu_fused(params)) { + // global[0] = params.weights.OFM().v / 4; + // } + } else if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2) { + global[0] = params.weights.OFM().v / 4; + } + + dispatchData.gws = global; + dispatchData.lws = {16, 1, 16}; + + return dispatchData; +} + +KernelsPriority FullyConnected_GEMV::GetKernelsPriority(const Params& /*params*/) const { + return FORCE_PRIORITY_9; +} + +JitConstants FullyConnected_GEMV::GetJitConstants(const fully_connected_params& params, + const FullyConnectedKernelBase::DispatchData& dispatchData) const { + JitConstants jit = Parent::GetJitConstants(params, dispatchData); + + // TODO: SWIGLU support + // if (is_swiglu_fused(params)) { + // auto split_length = params.fused_ops[0].GetOpParams()->split_length; + // auto split_to_glu_idx = params.fused_ops[0].GetOpParams()->split_to_glu_idx; + // jit.AddConstant(MakeJitConstant("SWIGLU_LENGTH", split_length)); + // jit.AddConstant(MakeJitConstant("SWIGLU_SPLIT_TO_GLU_IDX", split_to_glu_idx)); + // } + + if (params.weights.GetLayout() == 
WeightsLayout::os_iyx_osv16) { + jit.AddConstant(MakeJitConstant("FILTER_LAYOUT_OS_IS_YX_TYPE", 0)); + } else if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2) { + jit.AddConstant(MakeJitConstant("FILTER_LAYOUT_OS_IS_YX_TYPE", 1)); + } else if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2) { + jit.AddConstant(MakeJitConstant("FILTER_LAYOUT_OS_IS_YX_TYPE", 2)); + } else { + OPENVINO_ASSERT(false, "GEMV doesn't support this weights layout: ", params.weights.GetLayout()); + } + + if (params.weights.GetDType() == WeightsType::UINT4) { + jit.AddConstant(MakeJitConstant("WEI_UINT4", 1)); + } else if (params.weights.GetDType() == WeightsType::INT4) { + jit.AddConstant(MakeJitConstant("WEI_UINT4", 0)); + } else { + OPENVINO_ASSERT(false, "GEMV only supports INT4 and UINT4 weights, doesn't support ", static_cast<int>(params.weights.GetDType())); + } + + jit.AddConstant(MakeJitConstant("SIMD", simd)); + jit.AddConstant(MakeJitConstant("WEIGHTS_K", params.weights.IFM().v)); + jit.AddConstant(MakeJitConstant("WEIGHTS_N", params.weights.OFM().v)); + + auto activation_dt = GetActivationType(params); + jit.Merge(MakeTypeJitConstants(activation_dt, "ACTIVATION")); + jit.Merge(MakeActivationJitConstants(params.activations, activation_dt, "_TYPED")); + + if (!params.fused_ops.empty() && !is_swiglu_fused(params)) { + std::vector<std::string> idx_order = {"0", "0", "(cur_n + 16 * i)", "0"}; + if (params.weights.GetLayout() == WeightsLayout::os_iyx_osv16) { + idx_order = {"0", "0", "(cur_n + i)", "0"}; + } + FusedOpsConfiguration conf_vec = {"_VEC", idx_order, "sum_value[i]", activation_dt, 1}; + jit.Merge(MakeFusedOpsJitConstants(params, {conf_vec})); + } + return jit; +} + +KernelsData FullyConnected_GEMV::GetTunedKernelsDataByIndex(const Params& params, const int autoTuneIndex) const { + auto& fc_params = static_cast<const fully_connected_params&>(params); + auto output_f = get_output_aligned_bf_size(fc_params, false).second; + + WeightsLayout weights_layout = WeightsLayout::os_iyx_osv16; + if (is_swiglu_fused(fc_params)) { + weights_layout = WeightsLayout::os_is_yx_osv32_isv2; + } else if (fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16 && + (fc_params.weights.GetLayout() == WeightsLayout::oiyx || + fc_params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2) && + (fc_params.weights.GetDType() == WeightsType::INT4 || + fc_params.weights.GetDType() == WeightsType::UINT4) && + is_weight_horizontal(fc_params, output_f)) { + // Large N + small K case (horizontal weight) to use osv64_isv2 + weights_layout = WeightsLayout::os_is_yx_osv64_isv2; + } else if (fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16 && + (fc_params.weights.GetDType() == WeightsType::INT4 || + fc_params.weights.GetDType() == WeightsType::UINT4) && + (fc_params.weights.GetLayout() == WeightsLayout::oiyx || + fc_params.weights.GetLayout() == WeightsLayout::os_iyx_osv16) && + is_weight_vertical(fc_params, output_f)) { + // Large K + Small N case (vertical weight) to use osv16 + weights_layout = WeightsLayout::os_iyx_osv16; + } else if (fc_params.compressed && + fc_params.inputs[0].GetDType() == Datatype::F16 + // ioyx => os_is_yx_osv32_isv2 is not supported yet + && (fc_params.weights.GetLayout() == WeightsLayout::oiyx || + fc_params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2) && + (fc_params.weights.GetDType() == WeightsType::INT4 || + fc_params.weights.GetDType() == WeightsType::UINT4)) { + weights_layout = WeightsLayout::os_iyx_osv16; + } + + if ((fc_params.weights.GetLayout() ==
WeightsLayout::os_iyx_osv16) || + (fc_params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2) || + (fc_params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2)) { + weights_layout = fc_params.weights.GetLayout(); + } + + KernelsData kernels_data; + kernels_data = GetCommonKernelsData(params, + fc_params.inputs[0].GetLayout(), + weights_layout, + EXE_MODE_DEFAULT, + autoTuneIndex, + 0); + return kernels_data; +} + +KernelsData FullyConnected_GEMV::GetKernelsData(const Params& params) const { + KernelsData res = {}; + KernelsData kds = GetTunedKernelsDataByIndex(params, -1); + if (!kds.empty()) { + res.emplace_back(kds[0]); + } + + return res; +} + +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_gemv.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_gemv.h new file mode 100644 index 00000000000000..dfd14ec7077d08 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_gemv.h @@ -0,0 +1,36 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include "fully_connected_kernel_base.h" + +namespace kernel_selector { + +class FullyConnected_GEMV : public FullyConnectedKernelBase { +public: + using Parent = FullyConnectedKernelBase; + + FullyConnected_GEMV() : Parent("fully_connected_gpu_gemv") {} + + using FullyConnectedKernelBase::GetTunedKernelsDataByIndex; + KernelsData GetTunedKernelsDataByIndex(const Params& params, const int autoTuneIndex = -1) const override; + KernelsData GetKernelsData(const Params& params) const override; + KernelsPriority GetKernelsPriority(const Params& params) const override; + ParamsKey GetSupportedKey() const override; + DeviceFeaturesKey get_required_device_features_key(const Params& params) const override; + +protected: + DispatchData SetDefault(const fully_connected_params& params, + int autoTuneIndex = -1, + int kernel_number = 0) const override; + std::vector GetSupportedFusedOps() const override { + return {FusedOpType::ACTIVATION, FusedOpType::ELTWISE, FusedOpType::SWIGLU}; + } + bool Validate(const Params& params) const override; + JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& dispatchData) const override; +}; +} // namespace kernel_selector \ No newline at end of file diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_selector.cpp index ebbc11fb2a859c..ca6e131bac6144 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_selector.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_selector.cpp @@ -20,6 +20,7 @@ #include "fully_connected_kernel_imad.h" #include "fully_connected_kernel_fs_byx_fsv32.h" #include "fully_connected_kernel_bf_tiled.h" +#include "fully_connected_kernel_gemv.h" namespace kernel_selector { @@ -41,6 +42,7 @@ fully_connected_kernel_selector::fully_connected_kernel_selector() { Attach(); Attach(); Attach(); + Attach(); } KernelsData fully_connected_kernel_selector::GetBestKernels(const Params& params) const { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_params.h 
b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_params.h index c75ee645c06b30..bb7348523864ea 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_params.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_params.h @@ -16,6 +16,7 @@ struct fully_connected_params : public weight_bias_params { QuantizationType quantization = QuantizationType::NONE; size_t dynamic_quantization_group_size = 0; + bool single_batch_optimized = 0; ParamsKey GetParamsKey() const override { ParamsKey k = weight_bias_params::GetParamsKey(); diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp index 3d3c0c6a114f1e..770dbab6194a04 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp @@ -1984,6 +1984,875 @@ class fully_connected_gpu_tests: public ::testing::Test { } } + void test_compressed_int4_scale_gemv(bool is_caching_test, + bool is_dynamic, + long int batch_num, + long int scales_group_size = 128, + bool is_uint4 = false, + bool is_wei_dyn = false) { + tests::random_generator rg(GET_SUITE_NAME); + auto& engine = get_test_engine(); + auto supports_immad = engine.get_device_info().supports_immad; + + if (engine.get_device_info().dev_type == device_type::discrete_gpu || supports_immad) + GTEST_SKIP(); + + long int ifm_num = 1024; + long int ofm_num = 14336; + auto wei_type = is_uint4 ? data_types::u4 : data_types::i4; + + auto input_mem = engine.allocate_memory({{batch_num, ifm_num}, data_types::f16, format::bfyx}); + auto weights_mem = engine.allocate_memory({{ofm_num, ifm_num}, wei_type, format::bfyx}); + auto scale_mem = + engine.allocate_memory({{ofm_num, ifm_num / scales_group_size}, data_types::f16, format::bfyx}); + auto dcomp_zp_mem = engine.allocate_memory({{1, 1, 1, 1}, data_types::u8, format::bfyx}); + auto bias_mem = engine.allocate_memory({{batch_num, ofm_num}, data_types::f16, format::bfyx}); + + set_values(dcomp_zp_mem, {1}); + + auto input_data = rg.generate_random_1d(batch_num * ifm_num, -2.0f, 2.0f); + set_values(input_mem, input_data); + + auto weigths_data = rg.generate_random_1d(ofm_num * ifm_num / 2, 0, 255); + set_values(weights_mem, weigths_data); + + auto scale_data = rg.generate_random_1d(ofm_num * ifm_num / scales_group_size, -2.0f, 2.0f); + set_values(scale_mem, scale_data); + + auto bias_data = rg.generate_random_1d(batch_num * ofm_num, -2.0f, 2.0f); + set_values(bias_mem, bias_data); + + auto in_layout = is_dynamic ? layout{{-1, ifm_num}, data_types::f16, format::bfyx} + : layout{{batch_num, ifm_num}, data_types::f16, format::bfyx}; + + if (is_dynamic && is_wei_dyn) { + // ifm_num is dynamic + in_layout = layout{{-1, -1}, data_types::f16, format::bfyx}; + } + + auto dcomp_zp_name = supports_immad ? 
"dcomp_zp" : ""; + + auto fc_prim = fully_connected("fc_prim", + input_info("input"), + "weights", + "bias", + "scale", + dcomp_zp_name, + data_types::f16, + 2, + 2); + + fc_prim.decompression_zero_point_scalar = 1; + + auto get_ref_results = [&]() { + topology topology(input_layout("input", in_layout), + data("weights", weights_mem), + data("bias", bias_mem), + data("scale", scale_mem), + data("dcomp_zp", dcomp_zp_mem), + fc_prim); + + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + ov::intel_gpu::ImplementationDesc fc_impl_desc = {format::bfyx, + "fully_connected_gpu_bfyx_ref", + impl_types::ocl}; + config.set_property( + ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{{"fc_prim", fc_impl_desc}})); + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); + + network network(engine, topology, config); + network.set_input_data("input", input_mem); + + auto outputs = network.execute(); + OPENVINO_ASSERT(outputs.size() == 1); + OPENVINO_ASSERT(outputs.begin()->first == "fc_prim"); + + auto output_layout = outputs.begin()->second.get_layout(); + auto output_mem = outputs.begin()->second.get_memory(); + + return engine.reinterpret_buffer(*output_mem, output_layout); + }; + + topology topology(input_layout("input", in_layout), + data("weights", weights_mem), + data("bias", bias_mem), + data("scale", scale_mem), + data("dcomp_zp", dcomp_zp_mem), + fc_prim); + + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); + + network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); + + network->set_input_data("input", input_mem); + + auto outputs = network->execute(); + if (batch_num == 1) { + // Wait for switching to gemv kernel due to it is async switch for dynamic shape + network->get_program()->get_compilation_context().wait_all(); + outputs = network->execute(); + } + ASSERT_EQ(outputs.size(), size_t(1)); + ASSERT_EQ(outputs.begin()->first, "fc_prim"); + + auto inst = network->get_primitive("fc_prim"); + auto impl = inst->get_impl(); + ASSERT_TRUE(impl != NULL); + auto fc_kernels = impl->get_kernels(); + + for (auto& it : fc_kernels) { + auto kernel_name = it->get_id(); + if (batch_num == 1) { + ASSERT_TRUE(kernel_name.find("fully_connected_gpu_gemv") != kernel_name.npos); + } else { + ASSERT_TRUE(kernel_name.find("fully_connected_gpu_gemv") == kernel_name.npos); + } + } + if (batch_num == 1) { + ASSERT_EQ(fc_kernels.size(), 1); + } else if (batch_num > 1) { + ASSERT_EQ(fc_kernels.size(), 2); + } else { + ASSERT_TRUE(false); + } + + auto output_mem = outputs.begin()->second.get_memory(); + cldnn::mem_lock output_ptr(output_mem, get_test_stream()); + + auto ref_output_mem = get_ref_results(); + cldnn::mem_lock output_ptr_ref(ref_output_mem, get_test_stream()); + + for (size_t i = 0; i < output_ptr_ref.size() / batch_num; i++) { + EXPECT_NEAR(output_ptr_ref[i], output_ptr[i], 30.0) << "i = " << i; + } + } + + void test_compressed_int4_scale_zp_gemv(bool is_caching_test, + bool is_dynamic, + long int batch_num, + long int scales_group_size = 128, + bool is_uint4 = false, + bool is_wei_dyn = false) { + tests::random_generator rg(GET_SUITE_NAME); + auto& engine = get_test_engine(); + auto supports_immad = engine.get_device_info().supports_immad; + + if 
(engine.get_device_info().dev_type == device_type::discrete_gpu || supports_immad) + GTEST_SKIP(); + + long int ifm_num = 256; + long int ofm_num = 512; + auto wei_type = is_uint4 ? data_types::u4 : data_types::i4; + + auto input_mem = engine.allocate_memory({{batch_num, ifm_num}, data_types::f16, format::bfyx}); + auto weights_mem = engine.allocate_memory({{ofm_num, ifm_num}, wei_type, format::bfyx}); + auto scale_mem = + engine.allocate_memory({{ofm_num, ifm_num / scales_group_size}, data_types::f16, format::bfyx}); + auto dcomp_zp_mem = + engine.allocate_memory({{ofm_num, ifm_num / scales_group_size}, data_types::f16, format::bfyx}); + auto bias_mem = engine.allocate_memory({{batch_num, ofm_num}, data_types::f16, format::bfyx}); + + auto input_data = rg.generate_random_1d(batch_num * ifm_num, -2.0f, 2.0f); + set_values(input_mem, input_data); + + auto weigths_data = rg.generate_random_1d(ofm_num * ifm_num / 2, 0, 255); + set_values(weights_mem, weigths_data); + + auto scale_data = rg.generate_random_1d(ofm_num * ifm_num / scales_group_size, -2.0f, 2.0f); + set_values(scale_mem, scale_data); + + auto dcomp_zp_mem_data = rg.generate_random_1d(ofm_num * ifm_num / scales_group_size, 2.0f, 4.0f); + set_values(dcomp_zp_mem, dcomp_zp_mem_data); + + auto bias_data = rg.generate_random_1d(batch_num * ofm_num, -1.0f, 1.0f); + set_values(bias_mem, bias_data); + + auto in_layout = is_dynamic ? layout{{-1, ifm_num}, data_types::f16, format::bfyx} + : layout{{batch_num, ifm_num}, data_types::f16, format::bfyx}; + + if (is_dynamic && is_wei_dyn) { + // ifm_num is dynamic + in_layout = layout{{-1, -1}, data_types::f16, format::bfyx}; + } + + auto fc_prim = fully_connected("fc_prim", + input_info("input"), + "weights", + "", + "scale", + "dcomp_zp", + data_types::f16, + 2, + 2); + + auto get_ref_results = [&]() { + topology topology(input_layout("input", in_layout), + data("weights", weights_mem), + data("bias", bias_mem), + data("scale", scale_mem), + data("dcomp_zp", dcomp_zp_mem), + fc_prim); + + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + ov::intel_gpu::ImplementationDesc fc_impl_desc = {format::bfyx, + "fully_connected_gpu_bfyx_ref", + impl_types::ocl}; + config.set_property( + ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{{"fc_prim", fc_impl_desc}})); + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); + + network network(engine, topology, config); + network.set_input_data("input", input_mem); + + auto outputs = network.execute(); + OPENVINO_ASSERT(outputs.size() == 1); + OPENVINO_ASSERT(outputs.begin()->first == "fc_prim"); + + auto output_layout = outputs.begin()->second.get_layout(); + auto output_mem = outputs.begin()->second.get_memory(); + + return engine.reinterpret_buffer(*output_mem, output_layout); + }; + + topology topology(input_layout("input", in_layout), + data("weights", weights_mem), + data("bias", bias_mem), + data("scale", scale_mem), + data("dcomp_zp", dcomp_zp_mem), + fc_prim); + + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); + + network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); + network->set_input_data("input", input_mem); + + auto outputs = network->execute(); + if (batch_num == 1) { + // Wait for switching to 
gemv kernel due to it is async switch for dynamic shape + network->get_program()->get_compilation_context().wait_all(); + outputs = network->execute(); + } + + ASSERT_EQ(outputs.size(), size_t(1)); + ASSERT_EQ(outputs.begin()->first, "fc_prim"); + + auto inst = network->get_primitive("fc_prim"); + auto impl = inst->get_impl(); + ASSERT_TRUE(impl != NULL); + auto fc_kernels = impl->get_kernels(); + + for (auto& it : fc_kernels) { + auto kernel_name = it->get_id(); + if (batch_num == 1) { + ASSERT_TRUE(kernel_name.find("fully_connected_gpu_gemv") != kernel_name.npos); + } else { + ASSERT_TRUE(kernel_name.find("fully_connected_gpu_gemv") == kernel_name.npos); + } + } + if (batch_num == 1) { + ASSERT_EQ(fc_kernels.size(), 1); + } else if (batch_num > 1) { + ASSERT_EQ(fc_kernels.size(), 2); + } else { + ASSERT_TRUE(false); + } + + auto output_mem = outputs.begin()->second.get_memory(); + cldnn::mem_lock output_ptr(output_mem, get_test_stream()); + + auto ref_output_mem = get_ref_results(); + cldnn::mem_lock output_ptr_ref(ref_output_mem, get_test_stream()); + + for (size_t i = 0; i < output_ptr_ref.size() / batch_num; i++) { + EXPECT_NEAR(output_ptr_ref[i], output_ptr[i], 10.0) << "i = " << i; + } + } + + void test_compressed_int4_scale_activation_gemv(bool is_caching_test, + bool is_dynamic, + long int batch_num, + long int scales_group_size = 128, + bool is_wei_dyn = false) { + tests::random_generator rg(GET_SUITE_NAME); + auto& engine = get_test_engine(); + auto supports_immad = engine.get_device_info().supports_immad; + + if (engine.get_device_info().dev_type == device_type::discrete_gpu || supports_immad) + GTEST_SKIP(); + + long int ifm_num = 256; + long int ofm_num = 512; + + auto input_mem = engine.allocate_memory({{batch_num, ifm_num}, data_types::f16, format::bfyx}); + auto weights_mem = engine.allocate_memory({{ofm_num, ifm_num}, data_types::i4, format::bfyx}); + auto scale_mem = + engine.allocate_memory({{ofm_num, ifm_num / scales_group_size}, data_types::f16, format::bfyx}); + auto dcomp_zp_mem = engine.allocate_memory({{1, 1, 1, 1}, data_types::u8, format::bfyx}); + auto bias_mem = engine.allocate_memory({{batch_num, ofm_num}, data_types::f16, format::bfyx}); + + set_values(dcomp_zp_mem, {8}); + + auto input_data = rg.generate_random_1d(batch_num * ifm_num, -2.0f, 2.0f); + set_values(input_mem, input_data); + + auto weigths_data = rg.generate_random_1d(ofm_num * ifm_num / 2, 0, 10); + set_values(weights_mem, weigths_data); + + auto scale_data = rg.generate_random_1d(ofm_num * ifm_num / scales_group_size, -4.0f, 4.0f); + set_values(scale_mem, scale_data); + + auto bias_data = rg.generate_random_1d(batch_num * ofm_num, -2.0f, 2.0f); + set_values(bias_mem, bias_data); + + auto in_layout = is_dynamic ? layout{{-1, ifm_num}, data_types::f16, format::bfyx} + : layout{{batch_num, ifm_num}, data_types::f16, format::bfyx}; + + if (is_dynamic && is_wei_dyn) { + // ifm_num is dynamic + in_layout = layout{{-1, -1}, data_types::f16, format::bfyx}; + } + + auto dcomp_zp_name = supports_immad ? 
"dcomp_zp" : ""; + + auto fc_prim = fully_connected("fc_prim", + input_info("input"), + "weights", + "bias", + "scale", + dcomp_zp_name, + data_types::f16, + 2, + 2); + + fc_prim.decompression_zero_point_scalar = 8; + + auto get_ref_results = [&]() { + topology topology(input_layout("input", in_layout), + data("weights", weights_mem), + data("bias", bias_mem), + data("scale", scale_mem), + data("dcomp_zp", dcomp_zp_mem), + fc_prim, + activation("out", input_info("fc_prim"), activation_func::relu)); + + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + ov::intel_gpu::ImplementationDesc fc_impl_desc = {format::bfyx, + "fully_connected_gpu_bfyx_ref", + impl_types::ocl}; + config.set_property( + ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{{"fc_prim", fc_impl_desc}})); + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); + + network network(engine, topology, config); + network.set_input_data("input", input_mem); + + auto outputs = network.execute(); + OPENVINO_ASSERT(outputs.size() == 1); + OPENVINO_ASSERT(outputs.begin()->first == "out"); + + auto output_layout = outputs.begin()->second.get_layout(); + auto output_mem = outputs.begin()->second.get_memory(); + + return engine.reinterpret_buffer(*output_mem, output_layout); + }; + + topology topology(input_layout("input", in_layout), + data("weights", weights_mem), + data("bias", bias_mem), + data("scale", scale_mem), + data("dcomp_zp", dcomp_zp_mem), + fc_prim, + activation("out", input_info("fc_prim"), activation_func::relu)); + + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); + + network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); + network->set_input_data("input", input_mem); + + auto outputs = network->execute(); + if (batch_num == 1) { + // Wait for switching to gemv kernel due to it is async switch for dynamic shape + network->get_program()->get_compilation_context().wait_all(); + outputs = network->execute(); + } + + ASSERT_EQ(outputs.size(), size_t(1)); + ASSERT_EQ(outputs.begin()->first, "out"); + auto inst = network->get_primitive("fc_prim"); + auto impl = inst->get_impl(); + ASSERT_TRUE(impl != NULL); + auto fc_kernels = impl->get_kernels(); + + for (auto& it : fc_kernels) { + auto kernel_name = it->get_id(); + if (batch_num == 1) { + ASSERT_TRUE(kernel_name.find("fully_connected_gpu_gemv") != kernel_name.npos); + } else { + ASSERT_TRUE(kernel_name.find("fully_connected_gpu_gemv") == kernel_name.npos); + } + } + + auto output_mem = outputs.begin()->second.get_memory(); + cldnn::mem_lock output_ptr(output_mem, get_test_stream()); + + auto ref_output_mem = get_ref_results(); + cldnn::mem_lock output_ptr_ref(ref_output_mem, get_test_stream()); + + for (size_t i = 0; i < output_ptr_ref.size(); i++) + ASSERT_NEAR(output_ptr_ref[i], output_ptr[i], 9.0) << "i = " << i; + } + + void test_compressed_int4_scale_large_n_gemv(bool is_caching_test, + bool is_dynamic, + long int batch_num, + bool is_dyn_quan = false) { + tests::random_generator rg(GET_SUITE_NAME); + auto& engine = get_test_engine(); + + auto supports_immad = engine.get_device_info().supports_immad; + if (engine.get_device_info().dev_type == device_type::discrete_gpu || supports_immad) + GTEST_SKIP(); + + long int ifm_num = 
4096; + long int ofm_num = 14336; + long int scales_group_size = 128; + + auto input_mem = engine.allocate_memory({{batch_num, 1, ifm_num}, data_types::f16, format::bfyx}); + auto weights_mem = engine.allocate_memory({{ofm_num, ifm_num}, data_types::i4, format::bfyx}); + auto scale_mem = + engine.allocate_memory({{ofm_num, ifm_num / scales_group_size}, data_types::f16, format::bfyx}); + auto dcomp_zp_mem = engine.allocate_memory({{1, 1, 1, 1}, data_types::u8, format::bfyx}); + + set_values(dcomp_zp_mem, {2}); + + auto input_data = rg.generate_random_1d(batch_num * ifm_num, -1.0f, 1.0f); + set_values(input_mem, input_data); + + auto weights_data = rg.generate_random_1d(ofm_num * ifm_num / 2, 0, 10); + set_values(weights_mem, weights_data); + + auto scale_data = rg.generate_random_1d(ofm_num * ifm_num / scales_group_size, -1.0f, 1.0f); + set_values(scale_mem, scale_data); + + auto in_layout = is_dynamic ? layout{{-1, 1, ifm_num}, data_types::f16, format::bfyx} + : layout{{batch_num, 1, ifm_num}, data_types::f16, format::bfyx}; + + if (is_dynamic) { + // ifm_num is dynamic + in_layout = layout{{-1, -1, -1}, data_types::f16, format::bfyx}; + } + + auto dcomp_zp_name = supports_immad ? "dcomp_zp" : ""; + + auto fc_prim = fully_connected("fc_prim", + input_info("input"), + "weights", + "", + "scale", + dcomp_zp_name, + data_types::f16, + 3, + 2); + + fc_prim.decompression_zero_point_scalar = 2; + + auto get_ref_results = [&]() { + topology topology(input_layout("input", in_layout), + data("weights", weights_mem), + data("scale", scale_mem), + data("dcomp_zp", dcomp_zp_mem), + fc_prim); + + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + ov::intel_gpu::ImplementationDesc fc_impl_desc = {format::bfyx, + "fully_connected_gpu_bfyx_ref", + impl_types::ocl}; + config.set_property( + ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{{"fc_prim", fc_impl_desc}})); + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); + + network network(engine, topology, config); + network.set_input_data("input", input_mem); + + auto outputs = network.execute(); + OPENVINO_ASSERT(outputs.size() == 1); + OPENVINO_ASSERT(outputs.begin()->first == "fc_prim"); + + auto output_layout = outputs.begin()->second.get_layout(); + auto output_mem = outputs.begin()->second.get_memory(); + + return engine.reinterpret_buffer(*output_mem, output_layout); + }; + + topology topology(input_layout("input", in_layout), + data("weights", weights_mem), + data("scale", scale_mem), + data("dcomp_zp", dcomp_zp_mem), + fc_prim); + + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + if (is_dyn_quan) { + config.set_user_property(ov::hint::dynamic_quantization_group_size(32)); + } else { + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); + } + + network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); + + network->set_input_data("input", input_mem); + + auto outputs = network->execute(); + if (batch_num == 1) { + // Wait for the switch to the gemv kernel, which happens asynchronously for dynamic shapes + network->get_program()->get_compilation_context().wait_all(); + outputs = network->execute(); + } + + auto inst = network->get_primitive("fc_prim"); + auto impl = inst->get_impl(); +
ASSERT_TRUE(impl != NULL); + auto fc_kernels = impl->get_kernels(); + + for (auto& it : fc_kernels) { + auto kernel_name = it->get_id(); + if (batch_num == 1) { + ASSERT_TRUE(kernel_name.find("fully_connected_gpu_gemv") != kernel_name.npos); + } else { + ASSERT_TRUE(kernel_name.find("fully_connected_gpu_gemv") == kernel_name.npos); + } + } + if (batch_num == 1) { + ASSERT_EQ(fc_kernels.size(), 1); + + } else if (batch_num > 1) { + ASSERT_EQ(fc_kernels.size(), 2); + } else { + ASSERT_TRUE(false); + } + + ASSERT_EQ(outputs.size(), size_t(1)); + ASSERT_EQ(outputs.begin()->first, "fc_prim"); + + auto output_mem = outputs.begin()->second.get_memory(); + cldnn::mem_lock output_ptr(output_mem, get_test_stream()); + + auto ref_output_mem = get_ref_results(); + cldnn::mem_lock output_ptr_ref(ref_output_mem, get_test_stream()); + + for (size_t i = 0; i < output_ptr_ref.size(); i++) { + EXPECT_NEAR(output_ptr_ref[i], output_ptr[i], 9.0) << "i = " << i; + } + } + + void test_compressed_int4_scale_reuse_gemv(bool is_caching_test, + bool is_dynamic, + long int batch_num, + long int scales_group_size = 128) { + tests::random_generator rg(GET_SUITE_NAME); + auto& engine = get_test_engine(); + auto supports_immad = engine.get_device_info().supports_immad; + + if (engine.get_device_info().dev_type == device_type::discrete_gpu || supports_immad) + GTEST_SKIP(); + + long int ifm_num = 1024; + long int ofm_num = 256; + + auto input_mem = engine.allocate_memory({{batch_num, ifm_num}, data_types::f16, format::bfyx}); + auto weights_mem1 = engine.allocate_memory({{ofm_num, ifm_num}, data_types::i4, format::bfyx}); + auto weights_mem2 = engine.allocate_memory({{ofm_num, ifm_num}, data_types::i4, format::bfyx}); + auto scale_mem = + engine.allocate_memory({{ofm_num, ifm_num / scales_group_size}, data_types::f16, format::bfyx}); + auto dcomp_zp_mem = engine.allocate_memory({{1, 1, 1, 1}, data_types::u8, format::bfyx}); + + set_values(dcomp_zp_mem, {8}); + + auto input_data = rg.generate_random_1d(batch_num * ifm_num, -2.0f, 2.0f); + set_values(input_mem, input_data); + + auto weigths_data = rg.generate_random_1d(ofm_num * ifm_num / 2, 0, 10); + set_values(weights_mem1, weigths_data); + set_values(weights_mem2, weigths_data); + + auto scale_data = rg.generate_random_1d(ofm_num * ifm_num / scales_group_size, -4.0f, 4.0f); + set_values(scale_mem, scale_data); + + auto in_layout = is_dynamic ? layout{{-1, ifm_num}, data_types::f16, format::bfyx} + : layout{{batch_num, ifm_num}, data_types::f16, format::bfyx}; + + auto dcomp_zp_name = supports_immad ? 
"dcomp_zp" : ""; + + auto fc_prim1 = fully_connected("fc_prim1", + input_info("input"), + "weights1", + "", + "scale", + dcomp_zp_name, + data_types::f16, + 2, + 2); + auto fc_prim2 = fully_connected("fc_prim2", + input_info("input"), + "weights2", + "", + "scale", + dcomp_zp_name, + data_types::f16, + 2, + 2); + + fc_prim1.decompression_zero_point_scalar = 8; + fc_prim2.decompression_zero_point_scalar = 8; + + auto get_ref_results = [&]() { + topology topology(input_layout("input", in_layout), + data("weights1", weights_mem1), + data("weights2", weights_mem2), + data("scale", scale_mem), + data("dcomp_zp", dcomp_zp_mem), + fc_prim1, + fc_prim2); + + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + ov::intel_gpu::ImplementationDesc fc_impl = {in_layout.format, "", impl_types::ocl}; + config.set_property(ov::intel_gpu::force_implementations( + ov::intel_gpu::ImplForcingMap{{"fc_prim1", fc_impl}, {"fc_prim2", fc_impl}})); + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); + + network network(engine, topology, config); + network.set_input_data("input", input_mem); + + auto outputs = network.execute(); + OPENVINO_ASSERT(outputs.size() == 2); + + std::vector res{ + engine.reinterpret_buffer(*outputs.at("fc_prim1").get_memory(), outputs.at("fc_prim1").get_layout()), + engine.reinterpret_buffer(*outputs.at("fc_prim2").get_memory(), outputs.at("fc_prim2").get_layout()), + }; + + return res; + }; + + topology topology(input_layout("input", in_layout), + data("weights1", weights_mem1), + data("weights2", weights_mem2), + data("scale", scale_mem), + data("dcomp_zp", dcomp_zp_mem), + fc_prim1, + fc_prim2); + + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); + + network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); + network->set_input_data("input", input_mem); + + auto outputs = network->execute(); + if (batch_num == 1) { + // Wait for switching to gemv kernel due to it is async switch for dynamic shape + network->get_program()->get_compilation_context().wait_all(); + outputs = network->execute(); + } + ASSERT_EQ(outputs.size(), size_t(2)); + + auto inst = network->get_primitive("fc_prim1"); + auto impl = inst->get_impl(); + ASSERT_TRUE(impl != NULL); + auto fc_kernels = impl->get_kernels(); + + for (auto& it : fc_kernels) { + auto kernel_name = it->get_id(); + if (batch_num == 1) { + ASSERT_TRUE(kernel_name.find("fully_connected_gpu_gemv") != kernel_name.npos); + } else { + ASSERT_TRUE(kernel_name.find("fully_connected_gpu_gemv") == kernel_name.npos); + } + } + if (batch_num == 1) { + ASSERT_EQ(fc_kernels.size(), 1); + + } else if (batch_num > 1) { + ASSERT_EQ(fc_kernels.size(), 2); + } else { + ASSERT_TRUE(false); + } + + std::vector output_mem{ + engine.reinterpret_buffer(*outputs.at("fc_prim1").get_memory(), outputs.at("fc_prim1").get_layout()), + engine.reinterpret_buffer(*outputs.at("fc_prim2").get_memory(), outputs.at("fc_prim2").get_layout()), + }; + auto ref_output_mem = get_ref_results(); + + for (size_t i = 0; i < 2; i++) { + cldnn::mem_lock output_ptr(output_mem[i], get_test_stream()); + cldnn::mem_lock output_ptr_ref(ref_output_mem[i], get_test_stream()); + + for (size_t i = 0; i < output_ptr_ref.size(); i++) + ASSERT_NEAR(output_ptr_ref[i], output_ptr[i], 
9.0) << "i = " << i; + } + } + +void test_compressed_int4_scale_dynamic_batch_gemv(bool is_caching_test, + long int scales_group_size = 128, + bool is_wei_dyn = false) { + tests::random_generator rg(GET_SUITE_NAME); + auto& engine = get_test_engine(); + auto supports_immad = engine.get_device_info().supports_immad; + + if (engine.get_device_info().dev_type == device_type::discrete_gpu || supports_immad) + GTEST_SKIP(); + + long int ifm_num = 1024; + long int ofm_num = 1024; + long int batch_num_list[] = {4, 1, 2, 1}; + + auto input_mem = engine.allocate_memory({{batch_num_list[0], ifm_num}, data_types::f16, format::bfyx}); + auto weights_mem = engine.allocate_memory({{ofm_num, ifm_num}, data_types::i4, format::bfyx}); + auto scale_mem = + engine.allocate_memory({{ofm_num, ifm_num / scales_group_size}, data_types::f16, format::bfyx}); + auto dcomp_zp_mem = engine.allocate_memory({{1, 1, 1, 1}, data_types::u8, format::bfyx}); + + set_values(dcomp_zp_mem, {8}); + + auto weigths_data = rg.generate_random_1d(ofm_num * ifm_num / 2, 0, 5); + set_values(weights_mem, weigths_data); + + auto scale_data = rg.generate_random_1d(ofm_num * ifm_num / scales_group_size, -2.0f, 2.0f); + set_values(scale_mem, scale_data); + + auto in_layout = layout{{-1, ifm_num}, data_types::f16, format::bfyx}; + if (is_wei_dyn) { + // ifm_num is dynamic + in_layout = layout{{-1, -1}, data_types::f16, format::bfyx}; + } + + auto dcomp_zp_name = supports_immad ? "dcomp_zp" : ""; + + auto fc_prim = fully_connected("fc_prim", + input_info("input"), + "weights", + "", + "scale", + dcomp_zp_name, + data_types::f16, + 2, + 2); + + fc_prim.decompression_zero_point_scalar = 8; + + auto get_ref_results = [&]() { + topology topology(input_layout("input", in_layout), + data("weights", weights_mem), + data("scale", scale_mem), + data("dcomp_zp", dcomp_zp_mem), + fc_prim); + + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + ov::intel_gpu::ImplementationDesc fc_impl_desc = {format::bfyx, + "fully_connected_gpu_bfyx_ref", + impl_types::ocl}; + config.set_property( + ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{{"fc_prim", fc_impl_desc}})); + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); + + network network(engine, topology, config); + network.set_input_data("input", input_mem); + + auto outputs = network.execute(); + OPENVINO_ASSERT(outputs.size() == 1); + OPENVINO_ASSERT(outputs.begin()->first == "fc_prim"); + + auto output_layout = outputs.begin()->second.get_layout(); + auto output_mem = outputs.begin()->second.get_memory(); + + return engine.reinterpret_buffer(*output_mem, output_layout); + }; + + topology topology(input_layout("input", in_layout), + data("weights", weights_mem), + data("scale", scale_mem), + data("dcomp_zp", dcomp_zp_mem), + fc_prim); + + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + config.set_user_property(ov::hint::dynamic_quantization_group_size(0)); + + network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); + + for(const auto& batch_num : batch_num_list) { + input_mem = engine.allocate_memory({{batch_num, ifm_num}, data_types::f16, format::bfyx}); + auto input_data = rg.generate_random_1d(batch_num * ifm_num, -1.0f, 1.0f); + set_values(input_mem, input_data); + + network->set_input_data("input", input_mem); + + auto outputs 
= network->execute(); + if (batch_num == 1) { + // Wait for switching to gemv kernel due to it is async switch for dynamic shape + network->get_program()->get_compilation_context().wait_all(); + outputs = network->execute(); + } + + ASSERT_EQ(outputs.size(), size_t(1)); + ASSERT_EQ(outputs.begin()->first, "fc_prim"); + + auto inst = network->get_primitive("fc_prim"); + auto impl = inst->get_impl(); + ASSERT_TRUE(impl != NULL); + auto fc_kernels = impl->get_kernels(); + for (auto& it : fc_kernels) { + auto kernel_name = it->get_id(); + if (batch_num == 1) { + ASSERT_TRUE(kernel_name.find("fully_connected_gpu_gemv") != kernel_name.npos); + } else { + ASSERT_TRUE(kernel_name.find("fully_connected_gpu_gemv") == kernel_name.npos); + } + } + if (batch_num == 1) { + ASSERT_EQ(fc_kernels.size(), 1); + + } else if (batch_num > 1) { + ASSERT_EQ(fc_kernels.size(), 2); + } else { + ASSERT_TRUE(false); + } + + auto output_mem = outputs.begin()->second.get_memory(); + cldnn::mem_lock output_ptr(output_mem, get_test_stream()); + + auto ref_output_mem = get_ref_results(); + cldnn::mem_lock output_ptr_ref(ref_output_mem, get_test_stream()); + + for (size_t i = 0; i < output_ptr_ref.size(); i++) + ASSERT_NEAR(output_ptr_ref[i], output_ptr[i], 9.0) << "i = " << i; + } + } + void test_compressed_int8_scale_zp_bias(bool is_caching_test) { auto& engine = get_test_engine(); @@ -4199,6 +5068,111 @@ TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_wzp_sta this->test_compressed_int4_scale_dyn_quan_weight_i4(false, 320, 1024, 1024, 32, 32, true); } +// Test gemv for INT4 weight compression +TEST_F(fully_connected_gpu_tests, gemv_compressed_uint4_scale_dynamic_cached_b1) { + this->test_compressed_int4_scale_gemv(true, true, 1, 128, true); +} + +TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_dynamic_cached_b1) { + this->test_compressed_int4_scale_gemv(true, true, 1); +} + +TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_dynamic_cached_b10) { + this->test_compressed_int4_scale_gemv(true, true, 10); +} + +TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_dynamic_b1g32) { + this->test_compressed_int4_scale_gemv(false, true, 1, 32); +} + +TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_dynamic_b48g32) { + this->test_compressed_int4_scale_gemv(false, true, 48, 32); +} + +TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_dynamic_b1g64) { + this->test_compressed_int4_scale_gemv(false, true, 1, 64); +} + +TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_dynamic_b1g128) { + this->test_compressed_int4_scale_gemv(false, true, 1, 128); +} + +TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_b1g32) { + this->test_compressed_int4_scale_gemv(false, false, 1, 32); +} + +TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_b1g64) { + this->test_compressed_int4_scale_gemv(false, false, 1, 64); +} + +TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_b1g128) { + this->test_compressed_int4_scale_gemv(false, false, 1, 128); +} + +TEST_F(fully_connected_gpu_tests, gemv_compressed_uint4_scale_zp_dynamic_cached_b1) { + this->test_compressed_int4_scale_zp_gemv(true, true, 1, 128, true); +} + +TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_zp_dynamic_cached_b1) { + this->test_compressed_int4_scale_zp_gemv(true, true, 1); +} + +TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_zp_dynamic_b1g128) { + this->test_compressed_int4_scale_zp_gemv(false, true, 1, 128); +} + 
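+// Test gemv with fused ReLU activation, large N, weight/scale reuse across two FCs, and dynamic batch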
+TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_relu_b1g128) { + this->test_compressed_int4_scale_activation_gemv(false, false, 1, 128); +} + +TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_relu_b2g128) { + this->test_compressed_int4_scale_activation_gemv(false, false, 2, 128); +} + +TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_relu_dynamic_b1g128) { + this->test_compressed_int4_scale_activation_gemv(false, true, 1, 128); +} + +TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_large_n_b1) { + this->test_compressed_int4_scale_large_n_gemv(false, false, 1); +} + +TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_large_n_cached_b1) { + this->test_compressed_int4_scale_large_n_gemv(true, false, 1); +} + +TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_large_n_dynamic_b1) { + this->test_compressed_int4_scale_large_n_gemv(false, true, 1); +} + +TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_large_n_dynamic_b6) { + this->test_compressed_int4_scale_large_n_gemv(false, true, 6); +} + +TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_large_n_dynamic_cached_b1) { + this->test_compressed_int4_scale_large_n_gemv(true, true, 1); +} + +TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_large_n_dyn_quan) { + this->test_compressed_int4_scale_large_n_gemv(false, false, 1, true); +} + +TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_scale_large_n_dyn_quan_dynamic) { + this->test_compressed_int4_scale_large_n_gemv(true, false, 1, true); +} + +TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_reuse_scale_b1) { + this->test_compressed_int4_scale_reuse_gemv(false, true, 1); +} + +TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_reuse_scale_b12) { + this->test_compressed_int4_scale_reuse_gemv(false, true, 12); +} + +TEST_F(fully_connected_gpu_tests, gemv_compressed_int4_dynamic_batch) { + this->test_compressed_int4_scale_dynamic_batch_gemv(false, 128, false); +} + // Test weight zp for INT8 ASYM TEST_F(fully_connected_gpu_tests, compressed_int8_scale_dynamic_quantize_wzp_128_large_input_1025) { this->test_compressed_int8_scale_dyn_quan_weight_u8(true, 1025, 3584, 4608, 128, 128, true);