Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 58 additions & 26 deletions aiter/configs/model_configs/dsv3_bf16_tuned_gemm.csv
Original file line number Diff line number Diff line change
@@ -1,27 +1,59 @@
cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw
256,1,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,14,7.4498,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0117,0.49,494.63
256,1,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,63.0464,auto,0.0,3.67,3675.33
256,2,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,7.2696,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0234,1.01,508.93
256,2,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,64.5459,auto,0.0,7.18,3590.67
256,4,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,11,7.7755,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0322,1.89,479.64
256,4,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,64.2502,auto,0.0,14.42,3608.65
256,8,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,7.8799,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0303,3.73,480.82
256,8,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,64.6376,auto,0.0,28.67,3589.91
256,16,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,16,7.8452,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0312,7.48,498.09
256,16,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,65.0337,auto,0.0,57.0,3573.78
256,32,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,14,7.7283,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0288,15.2,536.36
256,32,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,64.518,auto,0.0,114.9,3613.92
256,48,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,14,7.9838,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0282,22.06,548.95
256,48,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,65.2215,auto,0.0,170.5,3586.38
256,64,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,16,7.9784,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0302,29.44,579.1
256,64,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,65.4327,auto,0.0,226.6,3586.22
256,80,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,14,8.5025,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0281,34.53,571.34
256,80,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,84.1977,auto,0.0,220.12,2795.83
256,96,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,16,8.6162,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0321,40.89,591.38
256,96,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,86.0249,auto,0.0,258.53,2745.12
256,112,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,12,9.9659,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0278,41.24,535.12
256,112,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,86.1104,auto,0.0,301.32,2751.06
256,128,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,12,10.1315,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0279,46.37,549.83
256,128,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,86.9964,auto,0.0,340.86,2731.63
256,256,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,8,12.4846,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0223,75.25,598.43
256,256,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,120.6481,auto,0.0,491.57,2019.21
256,1,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.4782,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0195,0.49,492.75
256,2,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.4609,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0332,0.98,495.88
256,4,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.5195,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0293,1.95,495.96
256,8,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.5756,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0273,3.88,500.13
256,16,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.6807,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0308,7.65,508.75
256,32,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,7.8114,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0326,15.03,530.65
256,48,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,14,8.172,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0277,21.56,536.31
256,64,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,16,8.4622,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0312,27.76,545.99
256,80,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,16,8.7848,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0322,33.42,552.98
256,96,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,16,8.9146,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0306,39.52,571.58
256,112,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,180,16,9.5363,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0296,43.1,559.23
256,128,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,179,8,10.2913,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0224,45.65,541.29
256,256,256,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,287,8,12.8749,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k8_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.023,72.97,580.28
256,1,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,9.2196,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0265,3.28,3286.06
256,2,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,182,16,9.2149,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k16_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0282,6.57,3289.75
256,80,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,6,15.4972,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0178,156.3,2049.56
256,112,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,19.3948,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0106,174.85,1668.3
256,128,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,3,19.5107,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0106,198.64,1673.61
256,256,2112,7168,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,28.0092,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.006,276.73,1250.62
256,1,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,194,8,6.3014,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0169,1.5,1499.1
256,2,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,194,8,6.4095,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0163,2.94,1475.25
256,4,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,194,8,6.5084,flydsl_gemm2_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0136,5.8,1455.66
256,8,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,6.6916,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0064,11.28,1421.32
256,16,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,286,4,6.7833,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0077,22.26,1412.98
256,32,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,176,4,7.7348,flydsl_gemm2_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0077,39.04,1258.22
256,48,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,3,8.9019,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0058,50.89,1109.83
256,64,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,2,8.9485,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0027,67.5,1120.52
256,80,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,4,9.4262,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0077,80.09,1079.38
256,96,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,3,2,9.8468,_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,0.0027,92.01,1048.25
256,112,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,10.092,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0028,104.73,1037.39
256,128,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,2,10.0569,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0028,120.11,1055.68
256,256,3072,1536,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,12.3425,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,195.74,955.76
256,1,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,119,8,8.3747,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0151,3.51,3508.01
256,2,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,119,8,8.5564,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0158,6.86,3435.67
256,4,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,244,8,9.2613,flydsl_gemm2_abf16_wbf16_bf16_t64x128x64_split_k8_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0139,12.68,3178.16
256,8,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,114,4,9.2997,flydsl_gemm2_abf16_wbf16_bf16_t32x128x128_split_k4_block_m_warp1_block_n_warp4_async_copyTrue_b_to_ldsFalse_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0084,25.26,3172.96
256,16,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,286,4,9.4278,flydsl_gemm2_abf16_wbf16_bf16_t64x64x128_split_k4_block_m_warp2_block_n_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0088,49.83,3145.49
256,32,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,11.3011,auto,0.0,83.14,2650.18
256,48,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,12.3303,auto,0.0,114.29,2452.89
256,64,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,12.0008,auto,0.0,156.58,2544.81
256,80,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,2,13.9539,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0034,168.33,2209.75
256,96,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,2,14.1437,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0034,199.28,2200.95
256,112,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,asm,6,1,15.4791,_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,0.0,212.44,2030.13
256,128,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,15.4483,auto,0.0,243.27,2053.26
256,256,7168,2048,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,21.9002,auto,0.0,343.2,1556.09
256,1,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,45.8589,auto,0.0,5.05,5052.81
256,2,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,45.5282,auto,0.0,10.18,5090.54
256,4,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,45.1821,auto,0.0,20.51,5131.6
256,8,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,45.1407,auto,0.0,41.06,5140.44
256,16,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,39.1932,auto,0.0,94.58,5930.01
256,32,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,41.086,auto,0.0,180.44,5674.99
256,48,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,65.4891,auto,0.0,169.8,3571.73
256,64,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,66.227,auto,0.0,223.88,3543.2
256,80,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,75.8314,auto,0.0,244.41,3104.28
256,96,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,75.5926,auto,0.0,294.21,3123.97
256,112,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,77.7086,auto,0.0,333.9,3048.51
256,128,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,77.9492,auto,0.0,380.42,3048.67
256,256,16160,7168,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,135.7516,auto,0.0,436.88,1794.55
Loading
Loading