Skip to content
12 changes: 6 additions & 6 deletions aiter/configs/model_configs/dsv3_fp4_tuned_fmoe.csv
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,tflops,bw,_tag
256,1,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,13.307,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_kb14_fq,20.3%,7.6176,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,1.3%,20.9246,0,4.74,67614.8,
256,2,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,16.3539,flydsl_moe1_afp4_wfp4_bf16_t16x128x256_w3_kb7_bnt0_go_fq,18.7%,9.3549,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,1.3%,25.7088,0,7.71,55033.07,
256,4,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,21.7862,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w3_kb2_bnt0_go_fq,16.6%,11.8142,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic,1.3%,33.6004,0,11.8,42108.94,
256,1,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,13.307,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_kb14_fp4,20.3%,7.6176,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,1.3%,20.9246,0,4.74,67614.8,
256,2,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,16.3539,flydsl_moe1_afp4_wfp4_bf16_t16x128x256_w3_kb7_bnt0_go_fp4,18.7%,9.3549,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,1.3%,25.7088,0,7.71,55033.07,
256,4,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,21.7862,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w3_kb2_bnt0_go_fp4,16.6%,11.8142,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic,1.3%,33.6004,0,11.8,42108.94,
256,8,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,31.3946,flydsl_moe1_afp4_wfp4_bf16_t32x32x256_w3,0.0%,18.8511,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,1.2%,50.2457,0,15.78,28160.88,
256,16,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,52.7618,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w4,0.0%,30.7153,flydsl_moe2_afp4_wfp4_bf16_t16x256x256_atomic_sbm32,1.3%,83.4771,0,18.99,16952.38,
256,32,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,86.6761,flydsl_moe1_afp4_wfp4_bf16_t32x32x256_w3,0.0%,49.9872,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,1.2%,136.6633,0,23.2,10357.42,
Expand All @@ -15,9 +15,9 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,
256,8192,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,273.7589,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w4_bnt0,0.0%,515.3227,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_persist,0.0%,789.0816,0,1028.73,2016.21,
256,16384,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,425.3491,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0%,1027.5233,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_persist_sbm128,0.0%,1452.8724,0,1117.44,1216.29,
256,32768,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,669.5871999999999,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0%,2017.8465,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_sbm128,0.0%,2687.4337,0,1208.21,788.65,
256,1,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,16.6555,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w3_kb4_go_fq,23.1%,8.2174,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic,2.6%,24.8729,0,7.97,113762.52,
256,2,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,22.244,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_kb4_go_fq,20.6%,14.059,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic,2.7%,36.303,0,10.92,77944.67,
256,4,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,28.5005,flydsl_moe1_afp4_wfp4_bf16_t32x32x256_w3_fq,19.5%,19.299,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic,2.9%,47.7995,0,16.58,59198.7,
256,1,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,16.6555,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w3_kb4_go_fp4,23.1%,8.2174,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic,2.6%,24.8729,0,7.97,113762.52,
256,2,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,22.244,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_kb4_go_fp4,20.6%,14.059,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic,2.7%,36.303,0,10.92,77944.67,
256,4,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,28.5005,flydsl_moe1_afp4_wfp4_bf16_t32x32x256_w3_fp4,19.5%,19.299,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic,2.9%,47.7995,0,16.58,59198.7,
256,8,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,54.3584,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w3,0.0%,30.4539,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic,3.0%,84.8123,0,18.69,33364.91,
256,16,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,96.3987,flydsl_moe1_afp4_wfp4_bf16_t32x32x256_w3,0.0%,51.0459,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic,2.9%,147.4446,0,21.51,19193.15,
256,32,7168,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,163.055,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w3,0.0%,89.1386,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_atomic_persist,2.9%,252.1936,0,25.15,11222.61,
Expand Down
15 changes: 15 additions & 0 deletions aiter/configs/model_configs/gptoss_fp4_tuned_fmoe.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,tflops,bw,_tag
256,512,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,214.3544,flydsl_moe1_afp8_wfp4_bf16_t32x128x256_w2_gui_fp8,0.0%,111.631,flydsl_moe2_afp8_wfp4_bf16_t32x256x256_atomic_bnt2_persist,0.0%,325.9854,0,355.73,11131.16,
256,1024,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,235.2077,flydsl_moe1_afp8_wfp4_bf16_t64x256x256_gui_fp8,0.0%,125.5088,flydsl_moe2_afp8_wfp4_bf16_t64x256x256_atomic_bnt2,0.0%,360.7165,0,642.97,10072.5,
256,2048,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,312.5584,flydsl_moe1_afp8_wfp4_bf16_t64x256x256_w2_bnt0_gui_fp8,0.0%,172.1029,flydsl_moe2_afp8_wfp4_bf16_t64x128x256_atomic_persist,0.0%,484.6613,0,957.07,7516.08,
256,4096,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,442.3352,flydsl_moe1_afp8_wfp4_bf16_t128x256x256_w2_bnt0_gui_fp8,0.0%,256.1523,flydsl_moe2_afp8_wfp4_bf16_t64x128x256_atomic_persist_sbm128,0.0%,698.4875,0,1328.17,5242.22,
256,8192,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,714.6281,flydsl_moe1_afp8_wfp4_bf16_t128x256x256_bnt0_gui_fp8,0.0%,413.5452,flydsl_moe2_afp8_wfp4_bf16_t64x256x256_atomic_persist_sbm128,0.0%,1128.1733,0,1644.63,3279.08,
256,16384,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,1356.5778,flydsl_moe1_afp8_wfp4_bf16_t128x256x256_w2_bnt0_gui,0.0%,731.3886,flydsl_moe2_afp8_wfp4_bf16_t64x256x256_atomic_sbm128,0.0%,2087.9664,0,1777.26,1807.92,
256,32768,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,2474.7814,flydsl_moe1_afp8_wfp4_bf16_t64x256x256_w2_bnt0_gui_fp8,0.0%,1348.5732,flydsl_moe2_afp8_wfp4_bf16_t64x256x256_atomic_xcd4_persist,0.0%,3823.3546,0,1941.15,1026.81,
256,512,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,527.0715,cktile_a8w4_bm32,0.0,117.3402,cktile_a8w4_bm32,0.0,644.4117,0,0.0,0.0,flydsl_fallback
256,1024,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,286.4048,cktile_a8w4_bm32,0.0,142.6674,cktile_a8w4_bm32,0.0,429.0722,0,0.0,0.0,flydsl_fallback
256,2048,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,446.6267,cktile_a8w4_bm32,0.0,181.4069,cktile_a8w4_bm32,0.0,628.0336,0,0.0,0.0,flydsl_fallback
256,4096,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,786.0193,cktile_a8w4_bm32,0.0,275.9191,cktile_a8w4_bm32,0.0,1061.9384,0,0.0,0.0,flydsl_fallback
256,8192,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,1478.4233,cktile_a8w4_bm32,0.0,480.9397,cktile_a8w4_bm32,0.0,1959.363,0,0.0,0.0,flydsl_fallback
256,16384,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,2752.7649,cktile_a8w4_bm32,0.0,908.23,cktile_a8w4_bm32,0.0,3660.9949,0,0.0,0.0,flydsl_fallback
256,32768,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,5411.816400000001,cktile_a8w4_bm32,0.0,1750.1288,cktile_a8w4_bm32,0.0,7161.9452,0,0.0,0.0,flydsl_fallback
8 changes: 8 additions & 0 deletions aiter/configs/model_configs/gptoss_fp4_untuned_fmoe.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1
512,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0
1024,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0
2048,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0
4096,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0
8192,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0
16384,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0
32768,3072,3072,128,4,ActivationType.Swiglu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0
Loading
Loading