Skip to content

Instantly share code, notes, and snippets.

@davidberard98
Created September 9, 2025 04:01
Show Gist options
  • Select an option

  • Save davidberard98/df769c0789b813853e7b448a2bb6517a to your computer and use it in GitHub Desktop.

Select an option

Save davidberard98/df769c0789b813853e7b448a2bb6517a to your computer and use it in GitHub Desktop.
metric side_a_speedup side_b_speedup ratio_b_over_a improvement_percent
tritonbench_gemm_fwd[x_(2816, 2816, 2816)-triton_tutorial_matmul]_speedup 0.9140271292993862 0.6970010513896804 0.7625605729273494 -23.743942707265063
tritonbench_layer_norm_bwd[x_(4096, 7680)-liger_layer_norm]_speedup 0.9977561823241726 0.7614043701113448 0.7631166647724796 -23.688333522752036
tritonbench_gemm_fwd[x_(2688, 2688, 2688)-triton_tutorial_matmul]_speedup 0.9499290949872015 0.7257081424105393 0.7639603273971904 -23.603967260280957
tritonbench_layer_norm_bwd[x_(4096, 6656)-liger_layer_norm]_speedup 0.9266213852456668 0.7133837487030577 0.7698761976164888 -23.012380238351117
tritonbench_layer_norm_bwd[x_(4096, 7168)-liger_layer_norm]_speedup 0.9561411985507111 0.7366730997908861 0.770464760756583 -22.953523924341702
tritonbench_gemm_fwd[x_(4096, 4096, 4096)-triton_tutorial_matmul]_speedup 0.932466383549378 0.7262403068410183 0.7788380574928906 -22.11619425071094
tritonbench_gemm_fwd[x_(3328, 3328, 3328)-triton_tutorial_matmul]_speedup 0.8662131367630359 0.681059896289598 0.7862497893239766 -21.37502106760234
tritonbench_gemm_fwd[x_(3584, 3584, 3584)-triton_tutorial_matmul]_speedup 0.9239252216948058 0.733860809232369 0.7942859357018185 -20.57140642981815
tritonbench_gemm_fwd[x_(3456, 3456, 3456)-triton_tutorial_matmul]_speedup 0.9320134486363466 0.7484312598877756 0.8030262449354406 -19.697375506455938
tritonbench_layer_norm_bwd[x_(4096, 5632)-liger_layer_norm]_speedup 0.930560755740277 0.751740837248541 0.8078363853314643 -19.21636146685357
tritonbench_int4_gemm_fwd[x_(16, 1, 1280, 8192)-triton]_speedup 0.28203342708712165 0.22792606435496146 0.8081526601616406 -19.184733983835944
tritonbench_layer_norm_bwd[x_(4096, 4608)-liger_layer_norm]_speedup 0.8456937502177758 0.684227485420111 0.8090724156870198 -19.09275843129802
tritonbench_gemm_fwd[x_(2048, 2048, 2048)-triton_tutorial_matmul]_speedup 0.9319148704288177 0.7546689412125357 0.8098045917705736 -19.019540822942638
tritonbench_gemm_fwd[x_(3840, 3840, 3840)-triton_tutorial_matmul]_speedup 0.8105280714900104 0.6570415943890352 0.8106339774033763 -18.936602259662372
tritonbench_gemm_fwd[x_(3968, 3968, 3968)-triton_tutorial_matmul]_speedup 0.9490344862049137 0.7725833045822535 0.8140729507857303 -18.59270492142697
tritonbench_layer_norm_bwd[x_(4096, 5120)-liger_layer_norm]_speedup 0.8879154716173385 0.7232902955772791 0.8145936394821518 -18.54063605178482
tritonbench_gemm_fwd[x_(3712, 3712, 3712)-triton_tutorial_matmul]_speedup 0.9085162379243885 0.7425105464487084 0.8172782339532648 -18.27217660467352
tritonbench_gemm_fwd[x_(2560, 2560, 2560)-triton_tutorial_matmul]_speedup 0.7961751678560129 0.6511723660373726 0.8178757543906923 -18.212424560930774
tritonbench_gemm_fwd[x_(1920, 1920, 1920)-triton_tutorial_matmul]_speedup 0.9405162764891412 0.7780898839661873 0.8273008170264992 -17.269918297350085
tritonbench_int4_gemm_fwd[x_(4, 1, 1280, 8192)-triton]_speedup 0.1679442538532255 0.1389541663585856 0.8273826771115627 -17.261732288843735
tritonbench_flex_attention_fwd[x_ (8, 16, 128, 16, 128, 128) | noop-compiled]_speedup 68.5115324058302 56.9197164006804 0.830804893744234 -16.919510625576596
tritonbench_gemm_fwd[x_(3200, 3200, 3200)-triton_tutorial_matmul]_speedup 0.7812828527457611 0.65284305712294 0.8356039747046425 -16.439602529535748
tritonbench_layer_norm_bwd[x_(4096, 6144)-liger_layer_norm]_speedup 0.9004269870019902 0.7682271948151851 0.8531809973544108 -14.681900264558923
tritonbench_flex_attention_fwd[x_ (8, 16, 256, 16, 256, 128) | noop-compiled]_speedup 69.96805342165906 60.322940955645684 0.8621497670102728 -13.785023298972721
tritonbench_layer_norm_bwd[x_(4096, 8192)-liger_layer_norm]_speedup 0.8485369968865842 0.7324222510850372 0.8631588885015147 -13.684111149848533
tritonbench_int4_gemm_fwd[x_(4, 1, 8192, 1024)-triton]_speedup 0.5294117507190358 0.4653614445949409 0.8790160852359187 -12.098391476408132
tritonbench_gemm_fwd[x_(3072, 3072, 3072)-triton_tutorial_matmul]_speedup 0.7059819343391301 0.6206473856851741 0.879126441480634 -12.087355851936598
tritonbench_gemm_fwd[x_(2944, 2944, 2944)-triton_tutorial_matmul]_speedup 0.739337929071021 0.651223991104509 0.880820482080194 -11.917951791980597
tritonbench_low_mem_dropout_fwd[x_131072-triton_dropout]_speedup 1.1411764333272758 1.0159574130472777 0.8902719889553776 -10.972801104462238
tritonbench_int4_gemm_fwd[x_(4, 1, 8192, 3584)-triton]_speedup 0.40917662245713743 0.3694379307298832 0.9028813242344582 -9.711867576554178
tritonbench_gemm_fwd[x_average-triton_tutorial_matmul]_speedup 0.8152680793731638 0.737620404647682 0.9047581075599294 -9.524189244007065
tritonbench_flash_attention_bwd[x_(4, 48, 256, 256, 64)-triton_tutorial_flash_v2]_speedup 1.1715653142489653 1.0602748521449032 0.9050070356722664 -9.499296432773363
tritonbench_int4_gemm_fwd[x_(64, 1, 1280, 8192)-triton]_speedup 0.6068667108994487 0.5493393127889478 0.9052058762207628 -9.479412377923724
tritonbench_low_mem_dropout_fwd[x_128-triton_dropout]_speedup 1.3142857083461572 1.1920529967260118 0.9069968494339348 -9.300315056606523
tritonbench_gemm_fwd[x_(768, 768, 768)-triton_tutorial_matmul]_speedup 0.7506631543411684 0.6814621521028467 0.9078135088446467 -9.218649115535326
tritonbench_gemm_fwd[x_(1664, 1664, 1664)-triton_tutorial_matmul]_speedup 0.7782101619705566 0.7095238128235636 0.9117380464769724 -8.826195352302758
tritonbench_int4_gemm_fwd[x_(16, 1, 7168, 8192)-triton]_speedup 0.7101668982371787 0.6478675515015323 0.912275062537708 -8.772493746229204
tritonbench_int4_gemm_fwd[x_(4, 1, 7168, 8192)-triton]_speedup 0.32786386474682905 0.3005894020035852 0.9168116231280786 -8.318837687192138
tritonbench_flex_attention_fwd[x_average-compiled]_speedup 36.83800164133012 33.99696962589044 0.9228776836729329 -7.712231632706712
tritonbench_gemm_fwd[x_(896, 896, 896)-triton_tutorial_matmul]_speedup 0.7410071781770686 0.6858513020404576 0.9255663402987561 -7.443365970124393
tritonbench_flex_attention_fwd[x_ (8, 16, 512, 16, 512, 128) | noop-compiled]_speedup 64.86869697263923 60.380211059209174 0.9308065966652107 -6.91934033347893
tritonbench_gemm_fwd[x_(1536, 1536, 1536)-triton_tutorial_matmul]_speedup 0.730069981078846 0.6806282770379919 0.9322781304227951 -6.7721869577204945
tritonbench_rms_norm_bwd[x_(2048, 32768)-liger_rms]_speedup 0.41037518147105817 0.3833887990796067 0.934239730836757 -6.576026916324295
tritonbench_int4_gemm_fwd[x_(16, 1, 8192, 3584)-triton]_speedup 0.8228571246952967 0.7712766378318567 0.9373153791643474 -6.268462083565263
tritonbench_rope_fwd[x_(512, 2048)-liger_rotary_pos_emb]_speedup 2.9203251678557436 2.7407974538612794 0.9385247519794917 -6.14752480205083
tritonbench_gemm_fwd[x_(1792, 1792, 1792)-triton_tutorial_matmul]_speedup 0.872135045701087 0.8224718713121524 0.943055637273455 -5.694436272654501
tritonbench_layer_norm_bwd[x_average-liger_layer_norm]_speedup 0.82141756801026 0.7750321206641942 0.9435300033107079 -5.646999668929209
tritonbench_gemm_fwd[x_(2432, 2432, 2432)-triton_tutorial_matmul]_speedup 0.7975687041153187 0.7538847120719476 0.9452285529535336 -5.4771447046466415
tritonbench_fp8_gemm_blockwise_fwd[x_(2048, 13312, 6656)-_triton]_speedup 1.0857333658872872 1.0339598273810182 0.952314684126928 -4.768531587307201
tritonbench_int4_gemm_fwd[x_(1, 1, 1280, 8192)-triton]_speedup 0.1614145565778853 0.15407071496453845 0.9545032259231011 -4.54967740768989
tritonbench_int4_gemm_fwd[x_(1, 1, 7168, 8192)-triton]_speedup 0.2988505865438298 0.28612386109569515 0.9574144203786994 -4.258557962130061
tritonbench_fp8_gemm_blockwise_fwd[x_(2048, 8192, 2048)-_triton]_speedup 1.26924851413472 1.2155476980927016 0.9576908576657837 -4.230914233421634
tritonbench_int4_gemm_fwd[x_(64, 1, 8192, 1024)-triton]_speedup 1.7644882168603548 1.6909518005694775 0.9583242236540834 -4.167577634591657
tritonbench_welford_fwd[x_1536-test_welford]_speedup 0.5788942570534981 0.555729510384026 0.9599844939084767 -4.001550609152327
tritonbench_jsd_bwd[x_(4, 2048, 4096)-liger_jsd]_speedup 5.84087672479919 5.617193698195862 0.9617038610567461 -3.82961389432539
tritonbench_int4_gemm_fwd[x_(1, 1, 8192, 3584)-triton]_speedup 0.36904025620309866 0.355106892630371 0.9622443260903777 -3.775567390962231
tritonbench_swiglu_fwd[x_(4, 1024, 4096)-liger_swiglu]_speedup 1.0462246840051916 1.0129992106039154 0.9682425066916971 -3.175749330830291
tritonbench_int4_gemm_fwd[x_(64, 1, 7168, 8192)-triton]_speedup 1.6631204792164707 1.6106007885767748 0.9684209945725406 -3.1579005427459395
tritonbench_fp8_gemm_blockwise_fwd[x_(16384, 8192, 13312)-_triton]_speedup 1.1669015889187855 1.130393005476213 0.9687132284425114 -3.12867715574886
tritonbench_embedding_fwd[x_(32, 512, 768, 1024)-liger_embedding]_speedup 0.9918367518256614 0.9618420457051134 0.9697584243925856 -3.024157560741436
tritonbench_int4_gemm_fwd[x_(64, 1, 8192, 3584)-triton]_speedup 1.8534189738874451 1.797447509105221 0.9698009648272744 -3.0199035172725597
tritonbench_fp8_gemm_blockwise_fwd[x_(16, 13312, 13312)-_triton]_speedup 0.8056885030248641 0.7823240512278228 0.9710006389450486 -2.899936105495138
tritonbench_softmax_fwd[x_(4096, 2304)-triton_softmax]_speedup 4.389227513387762 4.274605855094556 0.9738856876423942 -2.611431235760575
tritonbench_embedding_bwd[x_(32, 512, 768, 1024)-liger_embedding]_speedup 1.5752113151232225 1.5345444286760197 0.9741832184312226 -2.5816781568777425
tritonbench_gemm_fwd[x_(2304, 2304, 2304)-triton_tutorial_matmul]_speedup 0.6848072399241188 0.6679774944336473 0.9754241127878023 -2.4575887212197722
tritonbench_int4_gemm_fwd[x_average-triton]_speedup 1.4782812454926078 1.4428288824136488 0.976017849656785 -2.3982150343215047
tritonbench_rope_bwd[x_(8192, 1024)-liger_rotary_pos_emb]_speedup 3.779593710528573 3.68905360892575 0.9760450173915225 -2.3954982608477504
tritonbench_flash_attention_fwd[x_(4, 48, 256, 256, 64)-triton_tutorial_flash_v2]_speedup 1.5866013003515287 1.5505617599604 0.9772850681622763 -2.271493183772366
tritonbench_softmax_fwd[x_(4096, 2688)-triton_softmax]_speedup 4.6219965357295525 4.517985801308563 0.9774965788881597 -2.250342111184034
tritonbench_softmax_fwd[x_(4096, 2816)-triton_softmax]_speedup 4.658969883582606 4.554883537383809 0.9776589356017132 -2.234106439828676
tritonbench_fp8_gemm_blockwise_fwd[x_(16384, 4096, 16384)-_triton]_speedup 1.2012822160879075 1.1747973730388783 0.977952855129014 -2.2047144870986024
tritonbench_layer_norm_fwd[x_(4096, 2048)-liger_layer_norm]_speedup 1.2887168185753521 1.263274276167788 0.98025746072307 -1.9742539276930038
tritonbench_layer_norm_fwd[x_(4096, 3072)-liger_layer_norm]_speedup 1.3011648021651314 1.275693316849076 0.9804240898050186 -1.9575910194981394
tritonbench_cross_entropy_fwd[x_(8, 2048, 16384)-liger_cross_entropy_loss]_speedup 0.7846806722903867 0.770913718905545 0.9824553428279842 -1.754465717201581
tritonbench_int4_gemm_fwd[x_(4, 4096, 8192, 1024)-triton]_speedup 2.305555954814678 2.265124604925214 0.9824635139281562 -1.753648607184377
tritonbench_fp8_gemm_blockwise_fwd[x_(32, 2304, 16384)-_triton]_speedup 0.7328071938281417 0.7200729762640198 0.9826226902910176 -1.7377309708982436
tritonbench_addmm_fwd[x_(19735, 512, 1536)-triton_addmm]_speedup 0.9567631388377106 0.9402866337972583 0.9827789090406763 -1.722109095932367
tritonbench_int4_gemm_fwd[x_(64, 4096, 8192, 1024)-triton]_speedup 2.3054914096591363 2.2664821713213605 0.9830798596020173 -1.692014039798273
tritonbench_embedding_bwd[x_(32, 512, 768, 4096)-liger_embedding]_speedup 1.7441508259835 1.7148204380701824 0.9831835713538257 -1.6816428646174275
tritonbench_int4_gemm_fwd[x_(1, 4096, 7168, 8192)-triton]_speedup 2.2401300715511874 2.202515916613222 0.9832089415629696 -1.6791058437030437
tritonbench_softmax_fwd[x_(4096, 2560)-triton_softmax]_speedup 4.579807852034335 4.5032803590872685 0.9832902393682142 -1.6709760631785753
tritonbench_int4_gemm_fwd[x_(1, 4096, 8192, 1024)-triton]_speedup 2.2835847346665705 2.247043149957335 0.9839981481070065 -1.6001851892993524
tritonbench_flash_attention_fwd[x_(4, 48, 512, 512, 64)-triton_tutorial_flash_v2]_speedup 1.2648766191909557 1.2446808106601426 0.9840333766753228 -1.5966623324677243
tritonbench_welford_fwd[x_2048-test_welford]_speedup 0.5732197634009926 0.5640685186388528 0.9840353641893919 -1.596463581060814
tritonbench_softmax_fwd[x_(4096, 3328)-triton_softmax]_speedup 4.7831514881183725 4.7068702992869165 0.9840521068544571 -1.5947893145542902
tritonbench_flex_attention_bwd[x_ (8, 16, 512, 16, 512, 128) | noop-compiled]_speedup 14.490887938291525 14.261208026571001 0.9841500456908783 -1.5849954309121728
tritonbench_welford_fwd[x_4096-test_welford]_speedup 0.5831165809963016 0.5738943549924704 0.9841845930910175 -1.5815406908982532
tritonbench_int4_gemm_fwd[x_(16, 4096, 8192, 1024)-triton]_speedup 2.3047366206535775 2.2685297896745937 0.9842902522333696 -1.5709747766630366
tritonbench_int4_gemm_fwd[x_(16, 4096, 7168, 8192)-triton]_speedup 2.252789647456481 2.217507372841485 0.9843384069813925 -1.5661593018607478
tritonbench_int4_gemm_fwd[x_(4, 4096, 1280, 8192)-triton]_speedup 2.2308745214777237 2.196160367845063 0.9844392173121123 -1.5560782687887742
tritonbench_int4_gemm_fwd[x_(4, 4096, 7168, 8192)-triton]_speedup 2.2497090438986955 2.21495932247359 0.9845536819441837 -1.544631805581631
tritonbench_int4_gemm_fwd[x_(1, 4096, 8192, 3584)-triton]_speedup 2.2398816432538218 2.2057306708117443 0.9847532245532102 -1.5246775446789762
tritonbench_int4_gemm_fwd[x_(16, 4096, 1280, 8192)-triton]_speedup 2.2445335020741055 2.210451156546966 0.98481539905925 -1.5184600940750026
tritonbench_int4_gemm_fwd[x_(64, 4096, 7168, 8192)-triton]_speedup 2.253285955307447 2.2195966746560525 0.9850488214458347 -1.4951178554165279
tritonbench_jsd_bwd[x_(4, 2048, 8192)-liger_jsd]_speedup 6.232600341610404 6.140791662763295 0.9852696027636858 -1.4730397236314197
tritonbench_int4_gemm_fwd[x_(4, 4096, 8192, 3584)-triton]_speedup 2.255883735393586 2.2231088371781573 0.9854713708418531 -1.45286291581469
tritonbench_int4_gemm_fwd[x_(16, 4096, 8192, 3584)-triton]_speedup 2.2584417633926916 2.226136226144642 0.9856956518553219 -1.430434814467807
tritonbench_int4_gemm_fwd[x_(64, 4096, 8192, 3584)-triton]_speedup 2.259118018304066 2.226830591463728 0.9857079503688009 -1.4292049631199122
tritonbench_int4_gemm_fwd[x_(64, 4096, 1280, 8192)-triton]_speedup 2.246857902367221 2.215660836050993 0.9861152473045316 -1.3884752695468405
tritonbench_rms_norm_fwd[x_(2048, 8192)-liger_rms]_speedup 4.1390307845034275 4.084436100164419 0.9868097902186639 -1.3190209781336115
tritonbench_gemm_fwd[x_(512, 512, 512)-triton_tutorial_matmul]_speedup 0.8659793628841556 0.8549618867547684 0.9872774380064978 -1.272256199350219
tritonbench_addmm_fwd[x_(20211, 512, 1536)-triton_addmm]_speedup 0.9767350221083531 0.9643139509678258 0.987283069758556 -1.2716930241444047
tritonbench_int4_gemm_fwd[x_(1, 4096, 1280, 8192)-triton]_speedup 2.1639886828643653 2.13663507443488 0.987359634250361 -1.2640365749639004
tritonbench_int4_gemm_fwd[x_(16, 1, 8192, 1024)-triton]_speedup 0.7797148045887564 0.7700617692883619 0.987619787076525 -1.238021292347502
tritonbench_fused_linear_cross_entropy_fwd[x_(8192, 4096)-liger_lm_head_ce]_speedup 0.2976182658950201 0.29406473800016614 0.9880601149120752 -1.1939885087924762
tritonbench_softmax_fwd[x_(4096, 4608)-triton_softmax]_speedup 4.806057230774488 4.749138880806348 0.9881569554345553 -1.1843044565444716
tritonbench_jsd_bwd[x_average-liger_jsd]_speedup 5.938695265363019 5.868801187512297 0.988230735081092 -1.1769264918907951
tritonbench_flex_attention_bwd[x_ (8, 16, 2048, 16, 2048, 128) | noop-compiled]_speedup 13.553821439634765 13.395468996890449 0.9883167678245152 -1.1683232175484837
tritonbench_low_mem_dropout_fwd[x_32-triton_dropout]_speedup 1.0519480715829528 1.0397351127402403 0.9883901504526411 -1.1609849547358886
tritonbench_flex_attention_bwd[x_ (8, 16, 4096, 16, 4096, 128) | noop-compiled]_speedup 14.11732890413987 13.95664601650124 0.9886180389555486 -1.1381961044451439
tritonbench_fp8_gemm_blockwise_fwd[x_(32, 8192, 13312)-_triton]_speedup 0.5186952939068512 0.5134849392060832 0.989954883412334 -1.0045116587666048
tritonbench_fp8_gemm_blockwise_fwd[x_(1, 8192, 16384)-_triton]_speedup 0.6522481508217816 0.6458392681672043 0.9901741650834845 -0.9825834916515452
tritonbench_flash_attention_fwd[x_(4, 48, 1024, 1024, 64)-triton_tutorial_flash_v2]_speedup 1.0528501360605702 1.0425385725815968 0.9902060482059146 -0.9793951794085376
tritonbench_softmax_fwd[x_(4096, 256)-triton_softmax]_speedup 3.697761068414634 3.66420665910683 0.9909257497477548 -0.9074250252245175
tritonbench_welford_fwd[x_6144-test_welford]_speedup 0.6511465258915132 0.6455501462419818 0.9914053451458269 -0.85946548541731
tritonbench_jsd_bwd[x_(4, 2048, 16384)-liger_jsd]_speedup 6.047223104930588 5.996224194120004 0.9915665570914687 -0.8433442908531319
tritonbench_geglu_fwd[x_(8, 2048, 4096)-liger_geglu]_speedup 0.9768367639626843 0.968691766854009 0.9916618646951474 -0.8338135304852612
tritonbench_flash_attention_fwd[x_(4, 48, 4096, 4096, 64)-triton_tutorial_flash_v2]_speedup 0.8395386446203711 0.8326111147902348 0.9917484086354728 -0.8251591364527244
tritonbench_embedding_fwd[x_(8, 2048, 4096, 8192)-liger_embedding]_speedup 1.1672694439894962 1.1577173077224865 0.9918166826723722 -0.8183317327627782
tritonbench_swiglu_bwd[x_(4, 1024, 4096)-liger_swiglu]_speedup 1.0070808227345693 0.9990891775665965 0.9920645443865441 -0.7935455613455877
tritonbench_embedding_bwd[x_(32, 512, 768, 65536)-liger_embedding]_speedup 1.6027353473422985 1.5903428800535209 0.9922679266359682 -0.7732073364031811
tritonbench_embedding_bwd[x_(32, 512, 768, 2048)-liger_embedding]_speedup 1.623049557424206 1.6107474270181354 0.9924203605799972 -0.7579639420002837
tritonbench_gemm_fwd[x_(2176, 2176, 2176)-triton_tutorial_matmul]_speedup 0.6703096817260172 0.6652542265385779 0.9924580304816398 -0.754196951836017
tritonbench_cross_entropy_bwd[x_(8, 2048, 4096)-liger_cross_entropy_loss]_speedup 1.459957710600525 1.4493076339893123 0.9927052156826981 -0.7294784317301906
tritonbench_flex_attention_bwd[x_ (8, 16, 1024, 16, 1024, 128) | noop-compiled]_speedup 12.574965173923344 12.4840832431269 0.9927727886686394 -0.7227211331360617
tritonbench_embedding_bwd[x_(8, 2048, 4096, 2048)-liger_embedding]_speedup 0.7019496773187089 0.6968945382602288 0.9927984309676021 -0.7201569032397881
tritonbench_fused_linear_jsd_bwd[x_(2048, 4096)-liger_lm_head_jsd]_speedup 72.34693312118209 71.85800450962729 0.9932418889030744 -0.675811109692559
tritonbench_embedding_bwd[x_(32, 512, 768, 131072)-liger_embedding]_speedup 1.3783263689131826 1.3693405310390288 0.9934806167270533 -0.6519383272946655
tritonbench_layer_norm_fwd[x_(4096, 4096)-liger_layer_norm]_speedup 1.3869731407708001 1.3780330985252722 0.9935542787508058 -0.6445721249194181
tritonbench_fused_linear_jsd_fwd[x_(1024, 4096)-liger_lm_head_jsd]_speedup 0.18110404205443445 0.1799668713781164 0.9937208984216032 -0.627910157839684
tritonbench_embedding_fwd[x_(8, 2048, 4096, 4096)-liger_embedding]_speedup 1.1722008243103719 1.1648690255906224 0.9937452708036928 -0.6254729196307207
tritonbench_softmax_fwd[x_(4096, 10624)-triton_softmax]_speedup 4.806125051945107 4.7768660254969895 0.9939121379215309 -0.608786207846912
tritonbench_rms_norm_fwd[x_(2048, 16384)-liger_rms]_speedup 4.014764900047293 3.9906541344303212 0.9939944763349187 -0.6005523665081336
tritonbench_softmax_fwd[x_(4096, 10496)-triton_softmax]_speedup 4.8002694057591 4.771558596002546 0.9940189169961776 -0.5981083003822385
tritonbench_jsd_bwd[x_(4, 2048, 131072)-liger_jsd]_speedup 5.86343850762021 5.829686144550078 0.9942435888043736 -0.5756411195626421
tritonbench_softmax_fwd[x_(4096, 8960)-triton_softmax]_speedup 4.84670627864951 4.81916866319178 0.9943182825872826 -0.5681717412717391
tritonbench_layer_norm_fwd[x_(4096, 3584)-liger_layer_norm]_speedup 1.3216080624587836 1.3142655553607079 0.9944442627836158 -0.5555737216384249
tritonbench_softmax_fwd[x_(4096, 12160)-triton_softmax]_speedup 4.736299715750312 4.71045922144424 0.9945441598173905 -0.5455840182609473
tritonbench_softmax_fwd[x_(4096, 11136)-triton_softmax]_speedup 4.781424641491178 4.755690807487796 0.9946179567947021 -0.5382043205297893
tritonbench_softmax_fwd[x_(4096, 11392)-triton_softmax]_speedup 4.770553221641905 4.745044676254643 0.9946529167159186 -0.53470832840814
tritonbench_rms_norm_fwd[x_(2048, 4096)-liger_rms]_speedup 4.190583109611227 4.168903787619387 0.9948266574305332 -0.5173342569466777
tritonbench_softmax_fwd[x_(4096, 12544)-triton_softmax]_speedup 4.733559912920285 4.709968617746254 0.9950161621257527 -0.49838378742472766
tritonbench_softmax_fwd[x_(4096, 11776)-triton_softmax]_speedup 4.746322404460557 4.724394296838585 0.9953799793285505 -0.46200206714495096
tritonbench_embedding_bwd[x_average-liger_embedding]_speedup 1.382536358109996 1.376477924137093 0.9956178845226282 -0.4382115477371773
tritonbench_softmax_fwd[x_(4096, 10880)-triton_softmax]_speedup 4.785956091349303 4.765177147975516 0.995658350603896 -0.43416493961040414
tritonbench_softmax_fwd[x_(4096, 6144)-triton_softmax]_speedup 4.7547759504036575 4.734423367595345 0.995719549560146 -0.42804504398540333
tritonbench_softmax_fwd[x_(4096, 7040)-triton_softmax]_speedup 4.734405371199078 4.714174240784348 0.9957267853450398 -0.42732146549602223
tritonbench_softmax_fwd[x_(4096, 6912)-triton_softmax]_speedup 4.729772710337059 4.7095881305073 0.9957324419024101 -0.4267558097589941
tritonbench_softmax_fwd[x_(4096, 11264)-triton_softmax]_speedup 4.762084479070741 4.742306744154943 0.9958468324107393 -0.41531675892606534
tritonbench_swiglu_fwd[x_average-liger_swiglu]_speedup 1.0854025059401498 1.0809491805245712 0.9958970746877711 -0.4102925312228889
tritonbench_fused_linear_cross_entropy_fwd[x_average-liger_lm_head_ce]_speedup 0.3033311701457682 0.3020976783382171 0.9959335144919055 -0.40664855080945195
tritonbench_fused_linear_cross_entropy_fwd[x_(16384, 4096)-liger_lm_head_ce]_speedup 0.3276516362325097 0.32635314713300895 0.9960369827099557 -0.3963017290044335
tritonbench_cross_entropy_bwd[x_(8, 2048, 32768)-liger_cross_entropy_loss]_speedup 2.067881461172406 2.0603816821306244 0.9963732065001785 -0.36267934998215345
tritonbench_layer_norm_fwd[x_(4096, 6144)-liger_layer_norm]_speedup 1.4937146830647252 1.4884832313371217 0.9964976900964313 -0.3502309903568701
tritonbench_softmax_fwd[x_(4096, 7296)-triton_softmax]_speedup 4.743258721709375 4.72734150216041 0.996644243866329 -0.3355756133671006
tritonbench_geglu_fwd[x_(8, 8192, 4096)-liger_geglu]_speedup 1.010679097558023 1.0073528672373722 0.9967089154918829 -0.32910845081171036
tritonbench_softmax_fwd[x_(4096, 5376)-triton_softmax]_speedup 4.704591109168532 4.68945039629067 0.9967817154506043 -0.3218284549395656
tritonbench_jsd_bwd[x_(4, 2048, 65536)-liger_jsd]_speedup 5.83901322404002 5.820836903427041 0.9968870903497624 -0.3112909650237583
tritonbench_softmax_fwd[x_(4096, 8064)-triton_softmax]_speedup 4.718631441876037 4.704225424210067 0.9969469923973883 -0.3053007602611735
tritonbench_fused_linear_jsd_fwd[x_(8192, 4096)-liger_lm_head_jsd]_speedup 0.4368753069040728 0.43560309470064196 0.9970879283325799 -0.2912071667420135
tritonbench_layer_norm_fwd[x_(4096, 2560)-liger_layer_norm]_speedup 1.2999036121530863 1.2961538616529273 0.9971153626583528 -0.2884637341647234
tritonbench_layer_norm_fwd[x_(4096, 7168)-liger_layer_norm]_speedup 1.5472921493625484 1.5428789436691595 0.9971477877043406 -0.2852212295659351
tritonbench_layer_norm_bwd[x_(4096, 3584)-liger_layer_norm]_speedup 0.8482097369719747 0.8458383403262112 0.9972042331720582 -0.27957668279418035
tritonbench_embedding_bwd[x_(32, 512, 768, 32768)-liger_embedding]_speedup 1.8581105588508908 1.8529918422700513 0.9972452034371921 -0.27547965628078597
tritonbench_softmax_fwd[x_(4096, 4864)-triton_softmax]_speedup 4.8053096718337125 4.79228639118974 0.9972898144899364 -0.2710185510063634
tritonbench_softmax_fwd[x_(4096, 5248)-triton_softmax]_speedup 4.720122584500994 4.707403211173603 0.99730528750055 -0.26947124994499694
tritonbench_softmax_fwd[x_(4096, 3712)-triton_softmax]_speedup 4.708535787733428 4.696132768140148 0.9973658436183936 -0.2634156381606356
tritonbench_rope_bwd[x_(8192, 8192)-liger_rotary_pos_emb]_speedup 3.683395119523763 3.674097639565173 0.9974758396379175 -0.25241603620824726
tritonbench_layer_norm_bwd[x_(4096, 1024)-liger_layer_norm]_speedup 0.5290930064141324 0.5277857068921588 0.9975291687734947 -0.24708312265052657
tritonbench_rope_fwd[x_(8192, 16384)-liger_rotary_pos_emb]_speedup 3.059157084130118 3.0518462106450714 0.9976101673487208 -0.23898326512792334
tritonbench_fused_linear_jsd_fwd[x_average-liger_lm_head_jsd]_speedup 0.3200001155405728 0.319252002746645 0.9976621483630904 -0.23378516369095825
tritonbench_layer_norm_fwd[x_(4096, 15872)-liger_layer_norm]_speedup 1.6072518451964959 1.603581081261356 0.9977161239875938 -0.22838760124062052
tritonbench_cross_entropy_bwd[x_average-liger_cross_entropy_loss]_speedup 1.8523424396283736 1.8485287262616856 0.9979411402097697 -0.20588597902303496
tritonbench_softmax_fwd[x_(4096, 1920)-triton_softmax]_speedup 4.282816445123302 4.274021224045633 0.9979463931759943 -0.20536068240056826
tritonbench_layer_norm_bwd[x_(4096, 4096)-liger_layer_norm]_speedup 0.9220282487249086 0.9201527684469013 0.9979659188527022 -0.20340811472977505
tritonbench_layer_norm_fwd[x_(4096, 13824)-liger_layer_norm]_speedup 1.5635449008164877 1.5604263183935931 0.9980054410837412 -0.19945589162587707
tritonbench_rope_fwd[x_average-liger_rotary_pos_emb]_speedup 2.8729174114752363 2.8672685862944247 0.9980337669442746 -0.19662330557254082
tritonbench_softmax_fwd[x_(4096, 3456)-triton_softmax]_speedup 4.769346262810168 4.760299638205606 0.9981031730333559 -0.18968269666440607
tritonbench_kl_div_fwd[x_(8, 512, 4096)-liger_kl_div]_speedup 3.3988630452880417 3.392518838391171 0.9981334326178085 -0.18665673821914863
tritonbench_fused_linear_jsd_fwd[x_(2048, 4096)-liger_lm_head_jsd]_speedup 0.2938437998056872 0.2933265062482356 0.9982395627956292 -0.17604372043708016
tritonbench_swiglu_fwd[x_(4, 2048, 4096)-liger_swiglu]_speedup 1.0362195709580697 1.0344047632594906 0.9982486263052326 -0.17513736947674108
tritonbench_softmax_fwd[x_(4096, 12416)-triton_softmax]_speedup 4.733883915413517 4.725776992500469 0.9982874690089777 -0.17125309910223097
tritonbench_softmax_fwd[x_(4096, 5632)-triton_softmax]_speedup 4.739318097960334 4.73145020272092 0.9983398676609615 -0.16601323390385003
tritonbench_gemm_fwd[x_(1280, 1280, 1280)-triton_tutorial_matmul]_speedup 0.7659574654732583 0.7647058640428881 0.9983659648390569 -0.16340351609430703
tritonbench_embedding_bwd[x_(8, 2048, 4096, 32768)-liger_embedding]_speedup 1.318370903513246 1.3162619174307915 0.9984003089898037 -0.15996910101963024
tritonbench_kl_div_bwd[x_(8, 512, 65536)-liger_kl_div]_speedup 1.049876268629992 1.048225737674776 0.9984278804993183 -0.15721195006817013
tritonbench_cross_entropy_bwd[x_(8, 2048, 65536)-liger_cross_entropy_loss]_speedup 2.102465675464798 2.099592500744546 0.9986334260988035 -0.1366573901196544
tritonbench_layer_norm_bwd[x_(4096, 1536)-liger_layer_norm]_speedup 0.7695472708107032 0.7685243941703278 0.9986708072665922 -0.13291927334078135
tritonbench_kl_div_bwd[x_(8, 512, 32768)-liger_kl_div]_speedup 1.040329815222385 1.038960602369394 0.9986838665652409 -0.13161334347591147
tritonbench_swiglu_bwd[x_(4, 8192, 4096)-liger_swiglu]_speedup 1.0440015237717635 1.0427002368110767 0.9987535583703121 -0.12464416296879
tritonbench_softmax_fwd[x_(4096, 1152)-triton_softmax]_speedup 3.702936239789029 3.698629847531816 0.9988370331060688 -0.11629668939312499
tritonbench_rope_fwd[x_(2048, 2048)-liger_rotary_pos_emb]_speedup 3.1046127795441985 3.101151516079867 0.9988851223292202 -0.11148776707797792
tritonbench_rope_bwd[x_(8192, 16384)-liger_rotary_pos_emb]_speedup 3.950680118765617 3.946288582297316 0.9988884100113696 -0.11115899886303682
tritonbench_layer_norm_fwd[x_(4096, 6656)-liger_layer_norm]_speedup 1.5235389994650754 1.5218623737094041 0.9988995189776829 -0.11004810223170791
tritonbench_fused_linear_cross_entropy_bwd[x_(32768, 4096)-liger_lm_head_ce]_speedup 545.8503787322865 545.2638739867466 0.9989255210431438 -0.10744789568561952
tritonbench_kl_div_bwd[x_(8, 512, 4096)-liger_kl_div]_speedup 0.909461730400436 0.9085766402205332 0.999026797774643 -0.09732022253570172
tritonbench_layer_norm_bwd[x_(4096, 15360)-liger_layer_norm]_speedup 0.8131962749993826 0.8124056846552841 0.9990277988618441 -0.09722011381558815
tritonbench_kl_div_fwd[x_(8, 512, 32768)-liger_kl_div]_speedup 4.439526200000614 4.435228023848629 0.9990318389939935 -0.0968161006006496
tritonbench_cross_entropy_bwd[x_(8, 2048, 131072)-liger_cross_entropy_loss]_speedup 2.1176590619772035 2.1156363561161515 0.9990448387573949 -0.09551612426050848
tritonbench_rope_bwd[x_(8192, 4096)-liger_rotary_pos_emb]_speedup 3.643590254083874 3.6402345627512167 0.9990790151749648 -0.09209848250352204
tritonbench_softmax_fwd[x_(4096, 9856)-triton_softmax]_speedup 4.7720544919120504 4.767679589064829 0.9990832244571732 -0.09167755428267643
tritonbench_rope_fwd[x_(8192, 2048)-liger_rotary_pos_emb]_speedup 2.76027415756235 2.7577556399826313 0.9990875842630274 -0.09124157369726493
tritonbench_fp8_gemm_blockwise_fwd[x_(8, 8192, 6656)-_triton]_speedup 0.6797052263812687 0.6791085997974049 0.9991222274587467 -0.08777725412533188
tritonbench_fused_linear_cross_entropy_bwd[x_average-liger_lm_head_ce]_speedup 274.3719450071362 274.15314598855383 0.9992025459506193 -0.0797454049380697
tritonbench_softmax_fwd[x_(4096, 9600)-triton_softmax]_speedup 4.773057600077714 4.769677672575322 0.9992918737242272 -0.07081262757727691
tritonbench_swiglu_bwd[x_(4, 2048, 4096)-liger_swiglu]_speedup 1.025330008712361 1.024612403141267 0.9993001223362269 -0.06998776637731075
tritonbench_fp8_gemm_blockwise_fwd[x_(16, 4096, 6656)-_triton]_speedup 0.6867133315738065 0.6862334443739752 0.9993011826365281 -0.0698817363471882
tritonbench_softmax_fwd[x_(4096, 7552)-triton_softmax]_speedup 4.7310522497496725 4.728140970486364 0.9993846444491365 -0.061535555086345006
tritonbench_cross_entropy_bwd[x_(8, 2048, 16384)-liger_cross_entropy_loss]_speedup 1.8262628228609377 1.8252333287889049 0.9994362837269939 -0.05637162730061096
tritonbench_fused_linear_cross_entropy_bwd[x_(8192, 4096)-liger_lm_head_ce]_speedup 162.13003878566073 162.03914270537632 0.999439363112689 -0.05606368873110057
tritonbench_fused_linear_cross_entropy_bwd[x_(16384, 4096)-liger_lm_head_ce]_speedup 304.236982358219 304.0771152625708 0.9994745310237796 -0.05254689762204112
tritonbench_embedding_fwd[x_(8, 2048, 4096, 65536)-liger_embedding]_speedup 1.026657270685912 1.026154111739422 0.999509905631746 -0.049009436825397934
tritonbench_fused_linear_cross_entropy_bwd[x_(4096, 4096)-liger_lm_head_ce]_speedup 85.27038015237868 85.2324519995216 0.9995552013162214 -0.04447986837785578
tritonbench_fused_linear_cross_entropy_fwd[x_(32768, 4096)-liger_lm_head_ce]_speedup 0.34583686593487384 0.3456936632123823 0.9995859240682614 -0.04140759317385889
tritonbench_softmax_fwd[x_(4096, 4096)-triton_softmax]_speedup 4.843709408932528 4.841731192063543 0.9995915905142169 -0.04084094857831344
tritonbench_embedding_bwd[x_(8, 2048, 4096, 131072)-liger_embedding]_speedup 1.146237444033417 1.1457769464996226 0.9995982529307592 -0.04017470692407876
tritonbench_geglu_bwd[x_(8, 8192, 4096)-liger_geglu]_speedup 1.006057404513583 1.005675944262156 0.9996208364952976 -0.037916350470235116
tritonbench_jsd_fwd[x_(4, 2048, 32768)-liger_jsd]_speedup 0.5966337270213812 0.5964091993597125 0.9996236758810307 -0.03763241189692623
tritonbench_softmax_fwd[x_(4096, 1024)-triton_softmax]_speedup 3.891304442854918 3.889980425528898 0.9996597497457566 -0.0340250254243446
tritonbench_softmax_fwd[x_(4096, 8320)-triton_softmax]_speedup 4.813267037991012 4.811982893999397 0.9997332074074678 -0.026679259253215015
tritonbench_softmax_fwd[x_(4096, 1792)-triton_softmax]_speedup 4.238035319881156 4.237139320654531 0.9997885814631082 -0.021141853689177204
tritonbench_layer_norm_fwd[x_(4096, 14848)-liger_layer_norm]_speedup 1.5778408380350717 1.5775421284151177 0.999810684567953 -0.01893154320470325
tritonbench_fused_linear_jsd_fwd[x_(4096, 4096)-liger_lm_head_jsd]_speedup 0.36817731339809684 0.3681115386595862 0.9998213503762532 -0.017864962374680715
tritonbench_softmax_fwd[x_(4096, 9728)-triton_softmax]_speedup 4.7617134595589095 4.7609071199530915 0.9998306618798745 -0.01693381201255395
tritonbench_jsd_bwd[x_(4, 2048, 32768)-liger_jsd]_speedup 5.809019689177697 5.808074522017501 0.9998372931732428 -0.01627068267572307
tritonbench_fused_linear_jsd_bwd[x_average-liger_lm_head_jsd]_speedup 132.45637725539433 132.4414294735621 0.9998871493986022 -0.01128506013977626
tritonbench_geglu_bwd[x_(8, 2048, 4096)-liger_geglu]_speedup 1.005061170978461 1.004953022395565 0.9998923960192485 -0.010760398075149169
tritonbench_embedding_fwd[x_(8, 2048, 4096, 16384)-liger_embedding]_speedup 1.0995817766077767 1.0994639394464667 0.9998928345632705 -0.010716543672950607
tritonbench_geglu_fwd[x_average-liger_geglu]_speedup 0.9886903479750182 0.9886316639399085 0.9999406446768395 -0.005935532316048153
tritonbench_jsd_fwd[x_(4, 2048, 8192)-liger_jsd]_speedup 4.318643381007405 4.318417557455946 0.9999477096088896 -0.005229039111043221
tritonbench_embedding_bwd[x_(8, 2048, 4096, 1024)-liger_embedding]_speedup 0.6214689170662767 0.6214719929551952 1.0000049493849716 0.000494938497164199
tritonbench_kl_div_fwd[x_(8, 512, 131072)-liger_kl_div]_speedup 4.609827578188277 4.609881546749964 1.0000117072842252 0.0011707284225170866
tritonbench_embedding_fwd[x_(8, 2048, 4096, 32768)-liger_embedding]_speedup 1.0540788557929333 1.0541133298645684 1.0000327054009723 0.0032705400972332654
tritonbench_geglu_bwd[x_(8, 4096, 4096)-liger_geglu]_speedup 1.0050813435606616 1.005114430073657 1.000032919239032 0.003291923903203653
tritonbench_geglu_bwd[x_average-liger_geglu]_speedup 1.0051995983941768 1.0052582013719065 1.000058299841965 0.00582998419649261
tritonbench_kl_div_fwd[x_(8, 512, 65536)-liger_kl_div]_speedup 4.554194224645726 4.554687300879865 1.0001082686002873 0.010826860028734231
tritonbench_layer_norm_bwd[x_(4096, 13312)-liger_layer_norm]_speedup 0.8014388352424096 0.8015363743315401 1.0001217049696638 0.012170496966379396
tritonbench_kl_div_fwd[x_average-liger_kl_div]_speedup 4.182155010548579 4.182722500695426 1.0001356932360028 0.013569323600282068
tritonbench_kl_div_bwd[x_average-liger_kl_div]_speedup 1.0074821474270703 1.0076686962509545 1.0001851634039973 0.01851634039973238
tritonbench_embedding_bwd[x_(8, 2048, 4096, 65536)-liger_embedding]_speedup 1.2320577136922453 1.232347133126388 1.0002349073675092 0.023490736750919616
tritonbench_fused_linear_cross_entropy_fwd[x_(4096, 4096)-liger_lm_head_ce]_speedup 0.24221791252066915 0.24227916500731086 1.0002528817377885 0.025288173778847955
tritonbench_layer_norm_bwd[x_(4096, 3072)-liger_layer_norm]_speedup 0.6744258090810452 0.6746351946532081 1.0003104649456522 0.031046494565223348
tritonbench_softmax_fwd[x_(4096, 6528)-triton_softmax]_speedup 4.7400084845565305 4.741488199870676 1.0003121756678215 0.031217566782149042
tritonbench_layer_norm_fwd[x_(4096, 15360)-liger_layer_norm]_speedup 1.594165511469532 1.5946794004195854 1.000322356083077 0.032235608307695784
tritonbench_softmax_fwd[x_(4096, 3840)-triton_softmax]_speedup 4.724627898263088 4.726230648503993 1.0003392331153729 0.033923311537287226
tritonbench_softmax_fwd[x_(4096, 9984)-triton_softmax]_speedup 4.748948474087353 4.751051230053391 1.0004427834872314 0.04427834872313685
tritonbench_kl_div_fwd[x_(8, 512, 8192)-liger_kl_div]_speedup 3.871929801151736 3.8741954297891623 1.0005851419715184 0.0585141971518377
tritonbench_softmax_fwd[x_(4096, 11648)-triton_softmax]_speedup 4.752620352111677 4.75553960643853 1.000614241010342 0.06142410103420204
tritonbench_geglu_bwd[x_(8, 1024, 4096)-liger_geglu]_speedup 1.0045984745240017 1.0052894087562478 1.000687771532376 0.06877715323760913
tritonbench_geglu_fwd[x_(8, 1024, 4096)-liger_geglu]_speedup 1.003999372956363 1.0046981299428182 1.000695973528746 0.06959735287459523
tritonbench_softmax_fwd[x_(4096, 10112)-triton_softmax]_speedup 4.772511678530406 4.776044619948735 1.0007402687843012 0.07402687843012234
tritonbench_fused_linear_jsd_bwd[x_(8192, 4096)-liger_lm_head_jsd]_speedup 277.1352570404711 277.34149027019527 1.0007441609267853 0.07441609267853089
tritonbench_layer_norm_fwd[x_(4096, 11776)-liger_layer_norm]_speedup 1.5010150953880834 1.502144996349043 1.0007527578932625 0.07527578932624568
tritonbench_cross_entropy_bwd[x_(8, 2048, 8192)-liger_cross_entropy_loss]_speedup 1.5398279056943722 1.5410208558005736 1.0007747295017773 0.07747295017772604
tritonbench_fused_linear_jsd_bwd[x_(1024, 4096)-liger_lm_head_jsd]_speedup 36.40959286517057 36.441034048842056 1.000863541204317 0.08635412043169755
tritonbench_embedding_bwd[x_(8, 2048, 4096, 16384)-liger_embedding]_speedup 1.3624161280411369 1.3635964634521145 1.0008663545496006 0.08663545496006009
tritonbench_cross_entropy_fwd[x_(8, 2048, 32768)-liger_cross_entropy_loss]_speedup 1.5662939318660936 1.5677367420919375 1.000921161856335 0.0921161856334951
tritonbench_softmax_fwd[x_(4096, 3072)-triton_softmax]_speedup 4.651960799233028 4.6563514605003204 1.000943830237782 0.09438302377819952
tritonbench_layer_norm_fwd[x_(4096, 13312)-liger_layer_norm]_speedup 1.548273779966417 1.5497825725283565 1.0009744998471601 0.09744998471601374
tritonbench_kl_div_bwd[x_(8, 512, 131072)-liger_kl_div]_speedup 1.0516921828121812 1.05275370605673 1.001009347850918 0.10093478509181075
tritonbench_softmax_fwd[x_(4096, 8832)-triton_softmax]_speedup 4.849098346600481 4.85402871695745 1.0010167602313995 0.10167602313995161
tritonbench_softmax_fwd[x_(4096, 2176)-triton_softmax]_speedup 4.119675416155235 4.123867309979537 1.001017530120908 0.10175301209081056
tritonbench_kl_div_bwd[x_(8, 512, 8192)-liger_kl_div]_speedup 0.977701304160121 0.9787044568454528 1.0010260318576476 0.10260318576476024
tritonbench_cross_entropy_fwd[x_(8, 2048, 131072)-liger_cross_entropy_loss]_speedup 1.2902193308199654 1.291687079102783 1.0011375959480353 0.11375959480353082
tritonbench_layer_norm_fwd[x_(4096, 12800)-liger_layer_norm]_speedup 1.5363461978672297 1.5381002868413862 1.0011417276760872 0.1141727676087223
tritonbench_softmax_fwd[x_(4096, 6272)-triton_softmax]_speedup 4.715404462181992 4.720990577492836 1.00118465242073 0.11846524207299591
tritonbench_softmax_fwd[x_(4096, 7680)-triton_softmax]_speedup 4.729837224860923 4.735453323432211 1.0011873767117756 0.11873767117756007
tritonbench_layer_norm_fwd[x_(4096, 12288)-liger_layer_norm]_speedup 1.5263274361145989 1.5282355888011274 1.0012501594621046 0.12501594621046053
tritonbench_embedding_bwd[x_(8, 2048, 4096, 8192)-liger_embedding]_speedup 1.1172489800914096 1.1186571082995123 1.0012603530933522 0.12603530933521512
tritonbench_softmax_fwd[x_(4096, 3584)-triton_softmax]_speedup 4.794310848447796 4.800582246070135 1.0013080915736554 0.13080915736554122
tritonbench_fused_linear_jsd_bwd[x_(4096, 4096)-liger_lm_head_jsd]_speedup 143.9337259947535 144.12518906558375 1.0013302168724323 0.13302168724322883
tritonbench_embedding_fwd[x_(8, 2048, 4096, 131072)-liger_embedding]_speedup 1.013173633421558 1.0145547851678913 1.0013631935344283 0.13631935344282642
tritonbench_layer_norm_bwd[x_(4096, 11264)-liger_layer_norm]_speedup 0.7862715305293678 0.7873440516206237 1.0013640594242728 0.13640594242727566
tritonbench_softmax_fwd[x_average-triton_softmax]_speedup 4.612606979649023 4.618910905640853 1.0013666731242534 0.13666731242534258
tritonbench_layer_norm_fwd[x_(4096, 1536)-liger_layer_norm]_speedup 1.324438216370136 1.3263010128542327 1.001406480469283 0.14064804692830446
tritonbench_rope_fwd[x_(8192, 8192)-liger_rotary_pos_emb]_speedup 2.7786435110372643 2.7826140508328794 1.0014289489745063 0.14289489745062944
tritonbench_softmax_fwd[x_(4096, 9472)-triton_softmax]_speedup 4.7874040683679455 4.794387082431816 1.0014586222437354 0.14586222437353857
tritonbench_layer_norm_bwd[x_(4096, 10240)-liger_layer_norm]_speedup 0.824169486277523 0.8254487124226256 1.0015521396586526 0.15521396586526404
tritonbench_layer_norm_bwd[x_(4096, 2048)-liger_layer_norm]_speedup 0.7178101773692543 0.7189265104750217 1.0015551926414008 0.15551926414008044
tritonbench_layer_norm_bwd[x_(4096, 11776)-liger_layer_norm]_speedup 0.8152547467515124 0.8165229312593294 1.001555568382607 0.15555683826069533
tritonbench_softmax_fwd[x_(4096, 7168)-triton_softmax]_speedup 4.724536999817203 4.732280363614023 1.0016389677543258 0.16389677543258152
tritonbench_softmax_fwd[x_(4096, 8192)-triton_softmax]_speedup 4.801966609607765 4.809959473111806 1.0016644979346687 0.16644979346687183
tritonbench_softmax_fwd[x_(4096, 7808)-triton_softmax]_speedup 4.717948888034661 4.725881244103081 1.0016813145408459 0.16813145408458574
tritonbench_layer_norm_bwd[x_(4096, 2560)-liger_layer_norm]_speedup 0.7975934797014604 0.798965722273639 1.0017204786737879 0.1720478673787884
tritonbench_softmax_fwd[x_(4096, 9344)-triton_softmax]_speedup 4.770282368562259 4.778503261218973 1.0017233555629521 0.17233555629521113
tritonbench_softmax_fwd[x_(4096, 5760)-triton_softmax]_speedup 4.743856038483515 4.7524795858012245 1.0018178349527795 0.181783495277954
tritonbench_softmax_fwd[x_(4096, 3200)-triton_softmax]_speedup 4.759289317370831 4.768734728697427 1.0019846264214536 0.19846264214535836
tritonbench_layer_norm_fwd[x_(4096, 10240)-liger_layer_norm]_speedup 1.3990848499476018 1.4018872069899893 1.0020029929153278 0.20029929153277948
tritonbench_softmax_fwd[x_(4096, 6656)-triton_softmax]_speedup 4.716166406396282 4.72574042445641 1.0020300424614244 0.20300424614243529
tritonbench_layer_norm_bwd[x_(4096, 9728)-liger_layer_norm]_speedup 0.7959315339394065 0.797638273637922 1.0021443297893577 0.21443297893577462
tritonbench_embedding_bwd[x_(32, 512, 768, 16384)-liger_embedding]_speedup 2.05536976100857 2.0598662882402925 1.0021876974727486 0.21876974727486065
tritonbench_swiglu_fwd[x_(4, 4096, 4096)-liger_swiglu]_speedup 1.031227144245513 1.0335169618477316 1.002220478402839 0.222047840283901
tritonbench_layer_norm_fwd[x_(4096, 5120)-liger_layer_norm]_speedup 1.3626104360456581 1.365781768993298 1.0023273951701437 0.23273951701436868
tritonbench_layer_norm_bwd[x_(4096, 15872)-liger_layer_norm]_speedup 0.8407269947723515 0.8428121219531028 1.002480147769391 0.24801477693909835
tritonbench_fp8_gemm_blockwise_fwd[x_(1, 2304, 2048)-_triton]_speedup 0.4527272829256398 0.4538534801813155 1.0024875842436487 0.24875842436486995
tritonbench_layer_norm_fwd[x_average-liger_layer_norm]_speedup 1.4394843139428997 1.4431554005843856 1.002550279017234 0.2550279017234036
tritonbench_layer_norm_bwd[x_(4096, 14848)-liger_layer_norm]_speedup 0.8211980105172973 0.8232965928296331 1.0025555131472053 0.2555513147205257
tritonbench_cross_entropy_fwd[x_(8, 2048, 65536)-liger_cross_entropy_loss]_speedup 1.3704670101517349 1.373975794129196 1.0025602834300058 0.2560283430005805
tritonbench_embedding_fwd[x_(32, 512, 768, 8192)-liger_embedding]_speedup 1.139267040597558 1.142259461865306 1.0026266197134768 0.26266197134767744
tritonbench_kl_div_fwd[x_(8, 512, 16384)-liger_kl_div]_speedup 4.2185892140170775 4.229823864513758 1.0026631297637019 0.2663129763701866
tritonbench_kl_div_bwd[x_(8, 512, 16384)-liger_kl_div]_speedup 1.0158315833373073 1.0187910343388409 1.0029133284001772 0.29133284001772175
tritonbench_softmax_fwd[x_(4096, 1536)-triton_softmax]_speedup 4.230993859177122 4.243831466435976 1.0030341824370672 0.3034182437067212
tritonbench_jsd_fwd[x_average-liger_jsd]_speedup 1.8483005164956614 1.8540384469756717 1.0031044359014134 0.3104435901413405
tritonbench_layer_norm_fwd[x_(4096, 14336)-liger_layer_norm]_speedup 1.5879594696267045 1.5930779456378217 1.0032233039375498 0.3223303937549771
tritonbench_jsd_fwd[x_(4, 2048, 65536)-liger_jsd]_speedup 0.5836544078511092 0.5855549917429285 1.003256351474868 0.32563514748680245
tritonbench_softmax_fwd[x_(4096, 4736)-triton_softmax]_speedup 4.740719905003004 4.756316527824568 1.0032899270857796 0.3289927085779576
tritonbench_flash_attention_bwd[x_(4, 48, 8192, 8192, 64)-triton_tutorial_flash_v2]_speedup 0.733803464153566 0.7363038084201633 1.0034073759374813 0.3407375937481305
tritonbench_softmax_fwd[x_(4096, 4992)-triton_softmax]_speedup 4.731550879906828 4.748532020150231 1.0035889163351315 0.35889163351314757
tritonbench_layer_norm_bwd[x_(4096, 12800)-liger_layer_norm]_speedup 0.7947606970832846 0.797740430106766 1.0037492204061131 0.37492204061131407
tritonbench_layer_norm_bwd[x_(4096, 14336)-liger_layer_norm]_speedup 0.842127752068173 0.8454350806203466 1.0039273477736022 0.3927347773602152
tritonbench_softmax_fwd[x_(4096, 12032)-triton_softmax]_speedup 4.7164388297008975 4.734987872991382 1.0039328493298112 0.39328493298111944
tritonbench_layer_norm_fwd[x_(4096, 11264)-liger_layer_norm]_speedup 1.482790675512065 1.4886762650375007 1.0039692652662542 0.3969265266254185
tritonbench_layer_norm_bwd[x_(4096, 12288)-liger_layer_norm]_speedup 0.8321095839575321 0.8354403010126406 1.0040027384845966 0.4002738484596646
tritonbench_softmax_fwd[x_(4096, 11520)-triton_softmax]_speedup 4.741840585917306 4.7608962850411265 1.004018629217611 0.40186292176109806
tritonbench_softmax_fwd[x_(4096, 10368)-triton_softmax]_speedup 4.766377893215445 4.785597906548868 1.0040324149205166 0.4032414920516647
tritonbench_softmax_fwd[x_(4096, 7424)-triton_softmax]_speedup 4.704832889050918 4.724151126299808 1.0041060411080376 0.41060411080375836
tritonbench_addmm_fwd[x_(20116, 512, 1536)-triton_addmm]_speedup 0.9349881433425802 0.9389131288678308 1.0041978987147568 0.41978987147568425
tritonbench_embedding_bwd[x_(32, 512, 768, 8192)-liger_embedding]_speedup 1.9230117175791341 1.9311369627068171 1.0042252707320534 0.42252707320533656
tritonbench_layer_norm_bwd[x_(4096, 10752)-liger_layer_norm]_speedup 0.7610238132335864 0.7642799956253006 1.0042786865996725 0.42786865996724543
tritonbench_layer_norm_bwd[x_(4096, 9216)-liger_layer_norm]_speedup 0.7849627250786703 0.7883650876654468 1.0043344256715319 0.4334425671531861
tritonbench_fp8_gemm_blockwise_fwd[x_(4096, 2304, 13312)-_triton]_speedup 1.0467104845623845 1.0513010722901142 1.0043857282366375 0.43857282366375205
tritonbench_softmax_fwd[x_(4096, 8576)-triton_softmax]_speedup 4.805144853604173 4.8263395333571975 1.0044108305574029 0.44108305574028783
tritonbench_jsd_fwd[x_(4, 2048, 131072)-liger_jsd]_speedup 0.5803724078855633 0.582991093573832 1.0045120781978751 0.45120781978751356
tritonbench_softmax_fwd[x_(4096, 2048)-triton_softmax]_speedup 4.44495959761037 4.465358007018123 1.0045891101954492 0.4589110195449164
tritonbench_embedding_bwd[x_(8, 2048, 4096, 4096)-liger_embedding]_speedup 0.8608665137784937 0.864849888095587 1.0046271683859669 0.46271683859668844
tritonbench_softmax_fwd[x_(4096, 12288)-triton_softmax]_speedup 4.699630279131025 4.721539409487938 1.0046618838197128 0.46618838197127754
tritonbench_rope_fwd[x_(8192, 4096)-liger_rotary_pos_emb]_speedup 2.7608437436638824 2.773923236296803 1.0047374983328694 0.47374983328694054
tritonbench_softmax_fwd[x_(4096, 10752)-triton_softmax]_speedup 4.767417369666216 4.790636509485369 1.004870381176796 0.4870381176796057
tritonbench_softmax_fwd[x_(4096, 9216)-triton_softmax]_speedup 4.783775327704205 4.807331407443259 1.0049241609660542 0.4924160966054192
tritonbench_embedding_fwd[x_(32, 512, 768, 131072)-liger_embedding]_speedup 1.0078739839898685 1.0128928325216937 1.004979638934579 0.4979638934579089
tritonbench_softmax_fwd[x_(4096, 1664)-triton_softmax]_speedup 4.291609329719174 4.313698454825873 1.0051470493723027 0.5147049372302703
tritonbench_jsd_fwd[x_(4, 2048, 4096)-liger_jsd]_speedup 4.40278497730151 4.42577579513686 1.0052218806854931 0.5221880685493119
tritonbench_welford_fwd[x_average-test_welford]_speedup 0.6227756046563793 0.6260622761513978 1.0052774570333916 0.5277457033391553
tritonbench_layer_norm_bwd[x_(4096, 13824)-liger_layer_norm]_speedup 0.8296436567564022 0.8340657754385716 1.005330142219683 0.5330142219682976
tritonbench_softmax_fwd[x_(4096, 2944)-triton_softmax]_speedup 4.6239386860860865 4.6489001657884135 1.0053983154615433 0.5398315461543346
tritonbench_softmax_fwd[x_(4096, 11904)-triton_softmax]_speedup 4.711164124480445 4.73671675400913 1.0054238461776162 0.5423846177616154
tritonbench_softmax_fwd[x_(4096, 10240)-triton_softmax]_speedup 4.735487501259944 4.76140740556327 1.0054735450777623 0.5473545077762321
tritonbench_softmax_fwd[x_(4096, 7936)-triton_softmax]_speedup 4.725209592297598 4.751226174732133 1.0055059107805384 0.5505910780538414
tritonbench_softmax_fwd[x_(4096, 3968)-triton_softmax]_speedup 4.797603053522679 4.825449614425635 1.0058042652950434 0.5804265295043409
tritonbench_softmax_fwd[x_(4096, 8704)-triton_softmax]_speedup 4.80324234065995 4.831205961652928 1.0058218218048802 0.5821821804880223
tritonbench_softmax_fwd[x_(4096, 8448)-triton_softmax]_speedup 4.822759842576639 4.851329415634804 1.0059239053966453 0.5923905396645335
tritonbench_softmax_fwd[x_(4096, 1280)-triton_softmax]_speedup 3.949664436740285 3.9733331315467852 1.0059925837208679 0.5992583720867861
tritonbench_softmax_fwd[x_(4096, 11008)-triton_softmax]_speedup 4.7446319204623855 4.773812596621726 1.00615025077783 0.6150250777829935
tritonbench_softmax_fwd[x_(4096, 896)-triton_softmax]_speedup 3.640496008656105 3.663244187765001 1.006248648276171 0.6248648276170998
tritonbench_softmax_fwd[x_(4096, 9088)-triton_softmax]_speedup 4.801837657779311 4.833025637001571 1.0064950090871425 0.6495009087142511
tritonbench_softmax_fwd[x_(4096, 6784)-triton_softmax]_speedup 4.703628843923576 4.73419768095767 1.006498990045438 0.6498990045437969
tritonbench_addmm_fwd[x_(35901, 512, 1536)-triton_addmm]_speedup 0.997324288184425 1.0039594463274348 1.0066529595454743 0.6652959545474291
tritonbench_softmax_fwd[x_(4096, 6400)-triton_softmax]_speedup 4.723123133145658 4.754611789620588 1.0066669141555828 0.6666914155582848
tritonbench_softmax_fwd[x_(4096, 12672)-triton_softmax]_speedup 4.721836564039016 4.753655545795351 1.0067386876535849 0.6738687653584874
tritonbench_softmax_fwd[x_(4096, 5888)-triton_softmax]_speedup 4.715533067140289 4.747577411623739 1.0067954871755112 0.679548717551115
tritonbench_layer_norm_bwd[x_(4096, 8704)-liger_layer_norm]_speedup 0.7473509424429097 0.7524342867577843 1.0068018169593236 0.6801816959323581
tritonbench_embedding_fwd[x_average-liger_embedding]_speedup 1.0631471799655985 1.0705210641286214 1.0069359015402377 0.693590154023771
tritonbench_addmm_fwd[x_(34579, 512, 1536)-triton_addmm]_speedup 0.9780111861096646 0.984882223476552 1.007025520223566 0.7025520223566062
tritonbench_embedding_fwd[x_(8, 2048, 4096, 2048)-liger_embedding]_speedup 1.0816326238366767 1.089476605368574 1.007251983121657 0.7251983121657091
tritonbench_softmax_fwd[x_(4096, 5504)-triton_softmax]_speedup 4.747796403849664 4.783144026000836 1.0074450585375798 0.7445058537579818
tritonbench_swiglu_bwd[x_average-liger_swiglu]_speedup 0.9790424839901848 0.9863691719658316 1.0074835240507503 0.7483524050750257
tritonbench_addmm_fwd[x_(34181, 512, 1536)-triton_addmm]_speedup 0.978531411689928 0.986080049985375 1.007714252404438 0.7714252404438016
tritonbench_softmax_fwd[x_(4096, 6016)-triton_softmax]_speedup 4.695456670666228 4.731828199794541 1.0077461111196138 0.7746111119613763
tritonbench_softmax_fwd[x_(4096, 5120)-triton_softmax]_speedup 4.726419981844305 4.763750622401199 1.007898291032175 0.7898291032174987
tritonbench_layer_norm_fwd[x_(4096, 10752)-liger_layer_norm]_speedup 1.433584486263541 1.4456081357819957 1.0083871230706416 0.83871230706416
tritonbench_addmm_fwd[x_(15168, 512, 1536)-triton_addmm]_speedup 1.062910319441614 1.071862874208646 1.0084226812020558 0.8422681202055804
tritonbench_addmm_fwd[x_(19410, 512, 1536)-triton_addmm]_speedup 0.9295607581120797 0.9374248705539081 1.0084600305824014 0.8460030582401412
tritonbench_low_mem_dropout_fwd[x_512-triton_dropout]_speedup 1.1328671293090333 1.142857083461572 1.0088182928907399 0.8818292890739876
tritonbench_softmax_fwd[x_(4096, 1408)-triton_softmax]_speedup 4.0820436115963 4.1182793958742305 1.008876873381507 0.8876873381507
tritonbench_addmm_fwd[x_(20203, 512, 1536)-triton_addmm]_speedup 0.9681853302614809 0.9771112957135731 1.0092192735968035 0.9219273596803523
tritonbench_layer_norm_fwd[x_(4096, 8192)-liger_layer_norm]_speedup 1.548311658650188 1.5626054338570805 1.0092318462675363 0.9231846267536259
tritonbench_rms_norm_fwd[x_average-liger_rms]_speedup 3.847226719983393 3.8838816763777557 1.0095276309565975 0.9527630956597477
tritonbench_embedding_fwd[x_(32, 512, 768, 2048)-liger_embedding]_speedup 1.0298136740277062 1.0396927052908154 1.0095930278576233 0.9593027857623282
tritonbench_flash_attention_bwd[x_(4, 48, 4096, 4096, 64)-triton_tutorial_flash_v2]_speedup 0.7921999801376013 0.7998079457312222 1.0096035922549498 0.9603592254949778
tritonbench_layer_norm_fwd[x_(4096, 9216)-liger_layer_norm]_speedup 1.3074408646456794 1.3200911291638657 1.0096755921130052 0.9675592113005171
tritonbench_cross_entropy_fwd[x_average-liger_cross_entropy_loss]_speedup 1.1053884899686743 1.1165040485382238 1.0100557936602583 1.0055793660258328
tritonbench_addmm_fwd[x_(36032, 512, 1536)-triton_addmm]_speedup 1.0095308075570502 1.0198020155756784 1.0101742393018034 1.017423930180339
tritonbench_softmax_fwd[x_(4096, 4352)-triton_softmax]_speedup 4.742286622266863 4.79169217029805 1.0104180856127947 1.041808561279467
tritonbench_addmm_fwd[x_(34308, 512, 1536)-triton_addmm]_speedup 0.9368729894359722 0.9466334719440221 1.0104181491174444 1.041814911744443
tritonbench_geglu_fwd[x_(8, 4096, 4096)-liger_geglu]_speedup 0.9632461574230026 0.973783891725435 1.010939814523241 1.093981452324111
tritonbench_layer_norm_fwd[x_(4096, 9728)-liger_layer_norm]_speedup 1.3652754675847865 1.3803695401246143 1.0110556974751255 1.105569747512547
tritonbench_addmm_fwd[x_(20224, 512, 1536)-triton_addmm]_speedup 0.9512961469455794 0.9619047504656151 1.011151736033093 1.1151736033093052
tritonbench_rope_bwd[x_(8192, 2048)-liger_rotary_pos_emb]_speedup 3.63594888262531 3.6789117578881734 1.0118161384138717 1.1816138413871746
tritonbench_flash_attention_bwd[x_(4, 48, 1024, 1024, 64)-triton_tutorial_flash_v2]_speedup 1.0373973436104333 1.0498217593139236 1.0119765254654016 1.1976525465401622
tritonbench_jsd_fwd[x_(4, 2048, 16384)-liger_jsd]_speedup 0.6077141979069992 0.6150820445847508 1.012123867935169 1.2123867935168953
tritonbench_swiglu_fwd[x_(4, 8192, 4096)-liger_swiglu]_speedup 1.2279386245518242 1.2428757863871474 1.0121644205473013 1.216442054730127
tritonbench_rope_bwd[x_average-liger_rotary_pos_emb]_speedup 3.3778462099213566 3.4192723817871746 1.0122640787328154 1.226407873281543
tritonbench_layer_norm_fwd[x_(4096, 7680)-liger_layer_norm]_speedup 1.5377525265343155 1.5572193562701542 1.0126592734526092 1.2659273452609199
tritonbench_addmm_fwd[x_(19747, 512, 1536)-triton_addmm]_speedup 0.9435514858719904 0.955617786474094 1.0127881740241789 1.2788174024178867
tritonbench_flash_attention_bwd[x_(4, 48, 2048, 2048, 64)-triton_tutorial_flash_v2]_speedup 0.8909412769322257 0.9025480255041217 1.0130275124437622 1.3027512443762213
tritonbench_addmm_fwd[x_(35884, 512, 1536)-triton_addmm]_speedup 0.9721679841265889 0.9848747263821526 1.0130705212093356 1.3070521209335606
tritonbench_softmax_fwd[x_(4096, 4480)-triton_softmax]_speedup 4.695550208266197 4.757378851953863 1.0131674970866718 1.3167497086671753
tritonbench_flash_attention_bwd[x_average-triton_tutorial_flash_v2]_speedup 0.9229942897273619 0.9352210650263427 1.013246859092262 1.3246859092262087
tritonbench_addmm_fwd[x_(35678, 512, 1536)-triton_addmm]_speedup 0.9883297103910513 1.0017228207040139 1.0135512574115204 1.3551257411520412
tritonbench_softmax_fwd[x_(4096, 512)-triton_softmax]_speedup 3.8666667137031587 3.919161412365276 1.0135762150060879 1.3576215006087855
tritonbench_addmm_fwd[x_(27456, 512, 1536)-triton_addmm]_speedup 0.9739365000114044 0.9872340865204358 1.0136534430210549 1.3653443021054867
tritonbench_addmm_fwd[x_(35656, 512, 1536)-triton_addmm]_speedup 0.9676084153816562 0.9812114584684797 1.0140584175071048 1.4058417507104792
tritonbench_addmm_fwd[x_(34533, 512, 1536)-triton_addmm]_speedup 0.9951219140995763 1.0092269453233496 1.0141741740624173 1.4174174062417277
tritonbench_softmax_fwd[x_(4096, 640)-triton_softmax]_speedup 3.7427823262592943 3.796344619742883 1.0143108224883388 1.4310822488338815
tritonbench_addmm_fwd[x_(20067, 512, 1536)-triton_addmm]_speedup 0.9601263039619264 0.9740829229863445 1.0145362323340446 1.4536232334044552
tritonbench_addmm_fwd[x_(35917, 512, 1536)-triton_addmm]_speedup 1.0124360297632633 1.0274820877767536 1.014861243151341 1.4861243151341075
tritonbench_addmm_fwd[x_(33894, 512, 1536)-triton_addmm]_speedup 0.957884447357298 0.9721385178670049 1.0148807829055293 1.48807829055293
tritonbench_layer_norm_fwd[x_(4096, 4608)-liger_layer_norm]_speedup 1.2568759951486166 1.2757718150038089 1.0150339571510059 1.5033957151005861
tritonbench_addmm_fwd[x_(19632, 512, 1536)-triton_addmm]_speedup 0.9706227567365846 0.9856401909341301 1.0154719576615296 1.5471957661529645
tritonbench_addmm_fwd[x_average-triton_addmm]_speedup 0.9740244352307735 0.9892682240810557 1.0156503146111222 1.5650314611122207
tritonbench_addmm_fwd[x_(35503, 512, 1536)-triton_addmm]_speedup 0.9799031627423144 0.9953177977503902 1.015730773809258 1.5730773809258025
tritonbench_layer_norm_fwd[x_(4096, 8704)-liger_layer_norm]_speedup 1.2585900524837856 1.2784646708286294 1.0157911770442025 1.579117704420252
tritonbench_addmm_fwd[x_(35405, 512, 1536)-triton_addmm]_speedup 0.9916911550414265 1.007442262797223 1.0158830777865904 1.5883077786590372
tritonbench_flash_attention_fwd[x_(4, 48, 2048, 2048, 64)-triton_tutorial_flash_v2]_speedup 0.8595862685933638 0.8733226103174755 1.0159801781694233 1.598017816942332
tritonbench_rms_norm_fwd[x_(2048, 1024)-liger_rms]_speedup 3.790908916125762 3.851851798621251 1.016076060871906 1.607606087190594
tritonbench_addmm_fwd[x_(35504, 512, 1536)-triton_addmm]_speedup 0.9707799517133658 0.9866831204153033 1.016381847064177 1.6381847064177002
tritonbench_softmax_fwd[x_(4096, 4224)-triton_softmax]_speedup 4.6997534033538235 4.777500119143809 1.0165427223765622 1.654272237656218
tritonbench_welford_fwd[x_5120-test_welford]_speedup 0.6651642838158686 0.676330828342283 1.0167876490035739 1.678764900357388
tritonbench_welford_fwd[x_8192-test_welford]_speedup 0.6786546582257925 0.6903228050317051 1.0171930549131079 1.7193054913107852
tritonbench_addmm_fwd[x_(34516, 512, 1536)-triton_addmm]_speedup 0.9671729316858216 0.9840398940521813 1.017439448327984 1.7439448327984053
tritonbench_welford_fwd[x_2560-test_welford]_speedup 0.6243278675589694 0.6353740248782801 1.0176928788434505 1.76928788434505
tritonbench_addmm_fwd[x_(35380, 512, 1536)-triton_addmm]_speedup 0.962448236198496 0.979561700465237 1.0177811788967854 1.7781178896785432
tritonbench_flash_attention_bwd[x_(4, 48, 16384, 16384, 64)-triton_tutorial_flash_v2]_speedup 0.7171653746765837 0.7299916500526769 1.017884682988045 1.788468298804502
tritonbench_addmm_fwd[x_(35844, 512, 1536)-triton_addmm]_speedup 0.9670972775894111 0.9844443838893157 1.017937292040718 1.7937292040717923
tritonbench_addmm_fwd[x_(33887, 512, 1536)-triton_addmm]_speedup 0.9566074532897889 0.9738154929117872 1.0179886112770913 1.7988611277091282
tritonbench_addmm_fwd[x_(20068, 512, 1536)-triton_addmm]_speedup 0.9229264954723729 0.9397686534541081 1.0182486450051638 1.8248645005163766
tritonbench_embedding_fwd[x_(32, 512, 768, 4096)-liger_embedding]_speedup 1.0897284506061946 1.1096849668291047 1.0183132928316303 1.8313292831630301
tritonbench_addmm_fwd[x_(20120, 512, 1536)-triton_addmm]_speedup 0.9250097949755752 0.9419768132723346 1.0183425282509657 1.8342528250965717
tritonbench_low_mem_dropout_fwd[x_32768-triton_dropout]_speedup 1.1412429490007971 1.1633986996484713 1.0194137021105563 1.9413702110556263
tritonbench_layer_norm_fwd[x_(4096, 5632)-liger_layer_norm]_speedup 1.4241586108050595 1.4519589158171715 1.0195205118314714 1.952051183147141
tritonbench_flex_attention_fwd[x_ (8, 16, 1024, 16, 1024, 128) | noop-compiled]_speedup 33.75424574773952 34.41452670770589 1.0195614194700406 1.9561419470040642
tritonbench_low_mem_dropout_fwd[x_average-triton_dropout]_speedup 1.1309832684928391 1.1532152554267951 1.0196572200078455 1.9657220007845533
tritonbench_flash_attention_bwd[x_(4, 48, 512, 512, 64)-triton_tutorial_flash_v2]_speedup 1.1834707176637906 1.2068437876858717 1.0197495972424397 1.9749597242439743
tritonbench_rms_norm_bwd[x_(2048, 8192)-liger_rms]_speedup 1.251090082277544 1.275799403540684 1.0197502335068935 1.9750233506893533
tritonbench_softmax_fwd[x_(4096, 384)-triton_softmax]_speedup 3.7785233653950767 3.8533329996094268 1.0197986427448036 1.9798642744803585
tritonbench_addmm_fwd[x_(35605, 512, 1536)-triton_addmm]_speedup 0.9858983764689899 1.0054266652990116 1.019807608264822 1.9807608264821974
tritonbench_fp8_gemm_blockwise_fwd[x_(4096, 13312, 2304)-_triton]_speedup 1.1024444558335045 1.1261183518136761 1.0214740033884728 2.147400338847283
tritonbench_embedding_fwd[x_(8, 2048, 4096, 1024)-liger_embedding]_speedup 1.0005964853196214 1.0220877729956892 1.0214784760803979 2.147847608039788
tritonbench_addmm_fwd[x_(35916, 512, 1536)-triton_addmm]_speedup 0.987603268367134 1.0094152189554142 1.022085741599806 2.208574159980592
tritonbench_addmm_fwd[x_(34839, 512, 1536)-triton_addmm]_speedup 0.980745818496503 1.0024771059466575 1.0221579200648225 2.2157920064822534
tritonbench_welford_fwd[x_1024-test_welford]_speedup 0.5970861890354304 0.6106643179322852 1.022740651427208 2.2740651427207936
tritonbench_welford_fwd[x_3072-test_welford]_speedup 0.6314564635200584 0.6458466985689951 1.022788958353072 2.278895835307204
tritonbench_addmm_fwd[x_(35541, 512, 1536)-triton_addmm]_speedup 0.977724025654443 1.0002475915751519 1.0230367315619893 2.3036731561989265
tritonbench_softmax_fwd[x_(4096, 2432)-triton_softmax]_speedup 4.410058014013073 4.511810769195866 1.023072883590073 2.307288359007309
tritonbench_embedding_fwd[x_(32, 512, 768, 32768)-liger_embedding]_speedup 1.0423011112738387 1.0671834731553886 1.0238725274418448 2.387252744184476
tritonbench_embedding_fwd[x_(32, 512, 768, 16384)-liger_embedding]_speedup 1.0745573396983834 1.1015843549962028 1.0251517664989431 2.515176649894313
tritonbench_rms_norm_bwd[x_(2048, 16384)-liger_rms]_speedup 1.0398610995367572 1.0670877385212463 1.026182957509054 2.6182957509053972
tritonbench_layer_norm_fwd[x_(4096, 1024)-liger_layer_norm]_speedup 1.2779782357994327 1.3117117908903853 1.0263960325348191 2.6396032534819147
tritonbench_welford_fwd[x_7168-test_welford]_speedup 0.6446894570653673 0.6628415565030984 1.0281563460341971 2.8156346034197144
tritonbench_addmm_fwd[x_(34238, 512, 1536)-triton_addmm]_speedup 0.9480106875908492 0.9750872360890065 1.0285614380223562 2.8561438022356223
tritonbench_addmm_fwd[x_(35791, 512, 1536)-triton_addmm]_speedup 0.9834911256985909 1.0119017846153278 1.0288875600138805 2.8887560013880487
tritonbench_addmm_fwd[x_(35561, 512, 1536)-triton_addmm]_speedup 0.9846714731370093 1.0131383666231903 1.0289100418391222 2.891004183912216
tritonbench_addmm_fwd[x_(33660, 512, 1536)-triton_addmm]_speedup 1.0714285856267163 1.1030390497760243 1.0295030994817242 2.950309948172425
tritonbench_rope_fwd[x_(8192, 1024)-liger_rotary_pos_emb]_speedup 2.8344004655420574 2.920879033381363 1.0305103562078222 3.0510356207822165
tritonbench_addmm_fwd[x_(35249, 512, 1536)-triton_addmm]_speedup 0.9769024975658357 1.008231483720672 1.0320697165099888 3.20697165099888
tritonbench_rms_norm_fwd[x_(2048, 2048)-liger_rms]_speedup 3.4503675375024785 3.566794127379919 1.0337432428899198 3.374324288991981
tritonbench_addmm_fwd[x_(33961, 512, 1536)-triton_addmm]_speedup 0.9677419171293598 1.000498756939776 1.0338487351127497 3.384873511274966
tritonbench_rms_norm_bwd[x_(2048, 1024)-liger_rms]_speedup 0.37369613521198625 0.38715769196902494 1.0360227347532036 3.602273475320361
tritonbench_addmm_fwd[x_(35410, 512, 1536)-triton_addmm]_speedup 0.9786769555207153 1.014345777535752 1.0364459608595349 3.6445960859534887
tritonbench_cross_entropy_fwd[x_(8, 2048, 4096)-liger_cross_entropy_loss]_speedup 0.6053642375147344 0.6278791520845198 1.0371923433439316 3.719234334393162
tritonbench_grouped_gemm_fwd[x_512-triton]_speedup 0.2094707526130375 0.21734036949282884 1.03756904857419 3.7569048574189967
tritonbench_flex_attention_bwd[x_average-compiled]_speedup 12.163566977108449 12.63428161781806 1.038698733816773 3.8698733816773023
tritonbench_flex_attention_fwd[x_ (8, 16, 2048, 16, 2048, 128) | noop-compiled]_speedup 27.4930922339489 28.569285680023714 1.039144139804904 3.9144139804903944
tritonbench_rms_norm_fwd[x_(2048, 32768)-liger_rms]_speedup 3.4977050721101732 3.6406501100512374 1.0408682364562045 4.086823645620452
tritonbench_flash_attention_fwd[x_(4, 48, 8192, 8192, 64)-triton_tutorial_flash_v2]_speedup 0.8232426925500342 0.8573266584936939 1.0414020874428693 4.140208744286933
tritonbench_flex_attention_fwd[x_ (8, 16, 4096, 16, 4096, 128) | noop-compiled]_speedup 30.108392348824133 31.36907620385871 1.0418715101234495 4.187151012344947
tritonbench_flash_attention_fwd[x_(4, 48, 16384, 16384, 64)-triton_tutorial_flash_v2]_speedup 0.8722730039719995 0.9098174590839943 1.0430420922589965 4.304209225899647
tritonbench_embedding_fwd[x_(32, 512, 768, 65536)-liger_embedding]_speedup 1.019785613466018 1.064760307798593 1.0441021070886816 4.410210708868156
tritonbench_swiglu_bwd[x_(4, 4096, 4096)-liger_swiglu]_speedup 0.8397575807420454 0.8790748703443867 1.0468198090782328 4.68198090782328
tritonbench_gemm_fwd[x_(1024, 1024, 1024)-triton_tutorial_matmul]_speedup 0.6286307179214503 0.6587472818876933 1.047908196509745 4.790819650974498
tritonbench_grouped_gemm_fwd[x_256-triton]_speedup 0.18229469443114404 0.1913846180187703 1.0498638954687718 4.986389546877179
tritonbench_cross_entropy_fwd[x_(8, 2048, 8192)-liger_cross_entropy_loss]_speedup 1.015305757169131 1.0668318049153616 1.050749291415322 5.074929141532203
tritonbench_gemm_fwd[x_(1152, 1152, 1152)-triton_tutorial_matmul]_speedup 0.6487985282176437 0.6852886373662823 1.0562425892809635 5.624258928096348
tritonbench_grouped_gemm_fwd[x_average-triton]_speedup 0.1766186330633104 0.1867622298116646 1.0574322005126047 5.743220051260467
tritonbench_fp8_gemm_blockwise_fwd[x_average-_triton]_speedup 0.7490762171158447 0.7955222829950928 1.0620044593834237 6.200445938342369
tritonbench_int4_gemm_fwd[x_(1, 1, 8192, 1024)-triton]_speedup 0.46376812205499474 0.49293562790301565 1.0628924336558048 6.289243365580477
tritonbench_softmax_fwd[x_(4096, 768)-triton_softmax]_speedup 3.6040725346487035 3.836492754042984 1.064488219135394 6.448821913539393
tritonbench_grouped_gemm_fwd[x_1024-triton]_speedup 0.1502182176822012 0.16030534723584264 1.06714984180535 6.714984180535
tritonbench_rope_bwd[x_(2048, 2048)-liger_rotary_pos_emb]_speedup 2.3423039086195816 2.505056012250436 1.069483768964366 6.948376896436592
tritonbench_rms_norm_bwd[x_average-liger_rms]_speedup 0.7417710056318009 0.7957658492801478 1.0727917959025064 7.2791795902506395
tritonbench_gemm_fwd[x_(384, 384, 384)-triton_tutorial_matmul]_speedup 0.8941176933832611 0.9617021455777318 1.0755878702486439 7.5587870248643885
tritonbench_grouped_gemm_fwd[x_128-triton]_speedup 0.16449086752685893 0.1780185844992166 1.0822399272114531 8.223992721145311
tritonbench_low_mem_dropout_fwd[x_524288-triton_dropout]_speedup 0.9678714756167762 1.0486725498470697 1.083483268456489 8.348326845648902
tritonbench_flash_attention_fwd[x_average-triton_tutorial_flash_v2]_speedup 1.0137224294991736 1.099296923022081 1.0844160995482612 8.441609954826124
tritonbench_gemm_fwd[x_(1408, 1408, 1408)-triton_tutorial_matmul]_speedup 0.7003105432871565 0.7605863047047715 1.0860700470603932 8.607004706039323
tritonbench_flex_attention_bwd[x_ (8, 16, 128, 16, 128, 128) | noop-compiled]_speedup 21.133578614875113 23.182881310623575 1.09696903364516 9.696903364516007
tritonbench_flex_attention_bwd[x_ (8, 16, 256, 16, 256, 128) | noop-compiled]_speedup 21.43795374600298 23.793965348831318 1.1098990897518664 10.989908975186635
tritonbench_rope_bwd[x_(512, 2048)-liger_rotary_pos_emb]_speedup 2.3189012301652254 2.5975147498715248 1.1201489378167468 12.014893781674685
tritonbench_low_mem_dropout_fwd[x_8192-triton_dropout]_speedup 1.142011791011773 1.3013698534562392 1.1395415211109876 13.954152111098761
tritonbench_low_mem_dropout_fwd[x_2048-triton_dropout]_speedup 1.156462589747948 1.32167833448748 1.142863025751262 14.286302575126197
tritonbench_gemm_fwd[x_(640, 640, 640)-triton_tutorial_matmul]_speedup 0.7739938791848584 0.8862876381142197 1.1450835232025671 14.508352320256712
tritonbench_flash_attention_bwd[x_(4, 48, 128, 128, 64)-triton_tutorial_flash_v2]_speedup 0.8574108463957293 0.9961766913578586 1.1618428849430291 16.184288494302912
tritonbench_gemm_fwd[x_(256, 256, 256)-triton_tutorial_matmul]_speedup 0.8888888817027578 1.0338164006689101 1.1630434601550295 16.304346015502947
tritonbench_fp8_gemm_blockwise_fwd[x_(4, 4096, 2304)-_triton]_speedup 0.45796851179520687 0.5385405598811152 1.175933598076582 17.593359807658192
tritonbench_rms_norm_bwd[x_(2048, 2048)-liger_rms]_speedup 0.5401493724877225 0.6363636188707251 1.1781252580927315 17.81252580927315
tritonbench_fp8_gemm_blockwise_fwd[x_(4, 13312, 2048)-_triton]_speedup 0.5509157536024145 0.6530612582193145 1.1854103897900448 18.541038979004476
tritonbench_fp8_gemm_blockwise_fwd[x_(128, 2304, 6656)-_triton]_speedup 0.7800324847897045 0.9375609547782697 1.2019511662146156 20.195116621461562
tritonbench_rms_norm_bwd[x_(2048, 4096)-liger_rms]_speedup 0.8354541628057371 1.0247978436995997 1.2266356304431862 22.663563044318625
tritonbench_fp8_gemm_blockwise_fwd[x_(128, 8192, 2304)-_triton]_speedup 0.46483180210391334 0.6297827945382797 1.3548616761757009 35.48616761757009
tritonbench_fp8_gemm_blockwise_fwd[x_(8, 2304, 2304)-_triton]_speedup 0.4835466514737925 0.6571428571428571 1.3590061168657959 35.90061168657959
tritonbench_fp8_gemm_blockwise_fwd[x_(64, 4096, 2048)-_triton]_speedup 0.3534421589365737 0.5155279476907165 1.4585921194059637 45.85921194059637
tritonbench_fp8_gemm_blockwise_fwd[x_(64, 13312, 2048)-_triton]_speedup 0.48988137172834756 0.7657952003408762 1.563225802277557 56.322580227755694
tritonbench_flash_attention_fwd[x_(4, 48, 128, 128, 64)-triton_tutorial_flash_v2]_speedup 0.8108107706545644 1.4835163982891106 1.82967031517289 82.96703151728899
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment