Skip to content

Instantly share code, notes, and snippets.

@davidberard98
Last active September 19, 2025 18:16
Show Gist options
  • Select an option

  • Save davidberard98/50a197b08511070eb1fef76013350f30 to your computer and use it in GitHub Desktop.

Select an option

Save davidberard98/50a197b08511070eb1fef76013350f30 to your computer and use it in GitHub Desktop.
metric side_a_speedup side_b_speedup ratio_b_over_a improvement_percent
tritonbench_rope_bwd[x_(2048, 2048)-liger_rotary_pos_emb]_speedup 3.872456642947213 2.7721369338334196 0.7158600313530245 -28.413996864697555
tritonbench_flex_attention_fwd[x_ (8, 16, 256, 16, 256, 128) | noop-compiled]_speedup 80.91770255783995 65.44677234875213 0.8088066057235227 -19.11933942764773
tritonbench_flex_attention_bwd[x_ (8, 16, 128, 16, 128, 128) | noop-compiled]_speedup 26.4510668357185 22.165953910167254 0.8379984840624729 -16.20015159375271
tritonbench_flash_attention_fwd[x_(4, 48, 128, 128, 64)-triton_tutorial_flash_v2]_speedup 1.5028248392486048 1.2918659460892634 0.8596250955867761 -14.037490441322387
tritonbench_flex_attention_fwd[x_ (8, 16, 512, 16, 512, 128) | noop-compiled]_speedup 68.91569269467183 61.62544280408594 0.8942149515512374 -10.578504844876257
tritonbench_gemm_fwd[x_(1280, 1280, 1280)-triton_tutorial_matmul]_speedup 0.7517605254921877 0.6761268526603815 0.8993912685395554 -10.060873146044457
tritonbench_flex_attention_fwd[x_ (8, 16, 128, 16, 128, 128) | noop-compiled]_speedup 74.69016768218381 67.24201043480133 0.9002792806802183 -9.972071931978165
tritonbench_fp8_gemm_blockwise_fwd[x_(64, 13312, 2048)-_triton]_speedup 0.7259858665437773 0.6570363458826082 0.9050263595496436 -9.497364045035638
tritonbench_gemm_fwd[x_(384, 384, 384)-triton_tutorial_matmul]_speedup 0.89019609476801 0.8134920854904238 0.9138347048157118 -8.616529518428816
tritonbench_gemm_fwd[x_(896, 896, 896)-triton_tutorial_matmul]_speedup 0.7415458998327725 0.6811593897250264 0.9185667264543382 -8.143327354566177
tritonbench_flex_attention_fwd[x_average-compiled]_speedup 39.41781812574853 36.34228982561435 0.9219761913172667 -7.802380868273328
tritonbench_flex_attention_bwd[x_ (8, 16, 256, 16, 256, 128) | noop-compiled]_speedup 23.819493344175164 21.989429822120723 0.9231695025745807 -7.683049742541925
tritonbench_rms_norm_bwd[x_(2048, 32768)-liger_rms]_speedup 0.41033445340348207 0.38039704742117975 0.9270414518352305 -7.295854816476954
tritonbench_fp8_gemm_blockwise_fwd[x_(1, 2304, 2048)-_triton]_speedup 0.5301339302390289 0.4935834243893425 0.9310542039193637 -6.8945796080636335
tritonbench_swiglu_fwd[x_(4, 1024, 4096)-liger_swiglu]_speedup 1.0282033889499047 0.960015832888271 0.9336828133475876 -6.631718665241237
tritonbench_flex_attention_bwd[x_average-compiled]_speedup 13.099646110268003 12.240713832015242 0.9344308791991337 -6.556912080086629
tritonbench_fp8_gemm_blockwise_fwd[x_(8, 2304, 2304)-_triton]_speedup 0.5310853440839427 0.4965787209404082 0.9350262184262415 -6.497378157375855
tritonbench_fp8_gemm_blockwise_fwd[x_(4096, 13312, 2304)-_triton]_speedup 1.1652799755934045 1.101271991190709 0.9450707248529692 -5.492927514703084
tritonbench_gemm_fwd[x_(768, 768, 768)-triton_tutorial_matmul]_speedup 0.6883117275814891 0.6530612441873078 0.9487870364812171 -5.121296351878291
tritonbench_welford_fwd[x_1536-test_welford]_speedup 0.5786131110905074 0.5509859878398601 0.9522528564923483 -4.774714350765175
tritonbench_fp8_gemm_blockwise_fwd[x_(64, 4096, 2048)-_triton]_speedup 0.48101262737814 0.45898618455916523 0.954208181728961 -4.579181827103895
tritonbench_rms_norm_fwd[x_(2048, 2048)-liger_rms]_speedup 3.62355216867367 3.4634146820911513 0.9558064906676551 -4.419350933234489
tritonbench_layer_norm_fwd[x_(4096, 1024)-liger_layer_norm]_speedup 1.3716980744576996 1.3121596542105796 0.9565950981810202 -4.340490181897982
tritonbench_fp8_gemm_blockwise_fwd[x_(2048, 13312, 6656)-_triton]_speedup 1.0943520171180818 1.0476445539650099 0.9573195256896646 -4.268047431033539
tritonbench_swiglu_bwd[x_(4, 1024, 4096)-liger_swiglu]_speedup 1.0125297698478177 0.9733073502658477 0.9612629467794659 -3.8737053220534112
tritonbench_fp8_gemm_blockwise_fwd[x_(32, 2304, 16384)-_triton]_speedup 0.7418017809921026 0.7142857385908088 0.9629064756834458 -3.7093524316554216
tritonbench_flex_attention_bwd[x_ (8, 16, 512, 16, 512, 128) | noop-compiled]_speedup 14.46750965910237 13.952769297870766 0.964420942279603 -3.5579057720397045
tritonbench_welford_fwd[x_2048-test_welford]_speedup 0.573423364009273 0.5550247932852954 0.967914508060261 -3.2085491939738975
tritonbench_softmax_fwd[x_(4096, 1152)-triton_softmax]_speedup 3.8399280905049586 3.7268040474126396 0.9705400620985475 -2.9459937901452515
tritonbench_softmax_fwd[x_(4096, 1664)-triton_softmax]_speedup 4.27961420988512 4.162913917189848 0.9727311185139735 -2.7268881486026464
tritonbench_fp8_gemm_blockwise_fwd[x_(16, 13312, 13312)-_triton]_speedup 0.7964477277481811 0.7764127871887845 0.9748446258789112 -2.5155374121088814
tritonbench_softmax_fwd[x_(4096, 2304)-triton_softmax]_speedup 4.371457742591337 4.264560418975628 0.9755465270602519 -2.445347293974809
tritonbench_flash_attention_fwd[x_(4, 48, 2048, 2048, 64)-triton_tutorial_flash_v2]_speedup 0.8602836245386501 0.8394570627967098 0.9757910517556241 -2.4208948244375916
tritonbench_layer_norm_fwd[x_(4096, 1536)-liger_layer_norm]_speedup 1.3276836548361237 1.2963482786678653 0.9763984620477035 -2.360153795229647
tritonbench_rms_norm_bwd[x_(2048, 16384)-liger_rms]_speedup 1.026982534867518 1.0035272889300886 0.9771610079615861 -2.2838992038413886
tritonbench_softmax_fwd[x_(4096, 2688)-triton_softmax]_speedup 4.60295484235613 4.505405708986644 0.9788072799515989 -2.1192720048401115
tritonbench_gemm_fwd[x_(3712, 3712, 3712)-triton_tutorial_matmul]_speedup 0.9031034318033014 0.8840483070543161 0.9789003960366573 -2.109960396334265
tritonbench_swiglu_bwd[x_(4, 4096, 4096)-liger_swiglu]_speedup 0.8317691716450402 0.814533313135485 0.9792780748588377 -2.0721925141162334
tritonbench_rope_bwd[x_average-liger_rotary_pos_emb]_speedup 3.5458751006000697 3.472904093845866 0.9794208750494752 -2.057912495052483
tritonbench_welford_fwd[x_6144-test_welford]_speedup 0.6534903344689879 0.6400794047059176 0.9794779982875069 -2.0522001712493076
tritonbench_gemm_fwd[x_(2688, 2688, 2688)-triton_tutorial_matmul]_speedup 0.9456165236626565 0.9272812389394656 0.9806102323041344 -1.9389767695865645
tritonbench_embedding_bwd[x_(32, 512, 768, 16384)-liger_embedding]_speedup 2.0761310621157105 2.036851046961825 0.9810801852201678 -1.8919814779832178
tritonbench_cross_entropy_fwd[x_(8, 2048, 16384)-liger_cross_entropy_loss]_speedup 0.7860623482826301 0.7714934331802106 0.9814659547881293 -1.8534045211870742
tritonbench_gemm_fwd[x_(2176, 2176, 2176)-triton_tutorial_matmul]_speedup 0.6874999909951017 0.6747868596457358 0.9815081723405339 -1.849182765946611
tritonbench_embedding_bwd[x_(32, 512, 768, 2048)-liger_embedding]_speedup 1.595283276447927 1.5666847009839169 0.9820730425208944 -1.7926957479105554
tritonbench_rms_norm_bwd[x_(2048, 8192)-liger_rms]_speedup 1.270231263588272 1.2490090643698293 0.9832926492782958 -1.670735072170415
tritonbench_swiglu_fwd[x_average-liger_swiglu]_speedup 1.082621605023014 1.065415528210507 0.9841070261920911 -1.5892973807908883
tritonbench_swiglu_bwd[x_average-liger_swiglu]_speedup 0.9784946601845061 0.9632974446018294 0.9844687802590449 -1.553121974095506
tritonbench_flash_attention_fwd[x_average-triton_tutorial_flash_v2]_speedup 1.0857202236476184 1.0689801371189696 0.9845815835755473 -1.541841642445274
tritonbench_gemm_fwd[x_(512, 512, 512)-triton_tutorial_matmul]_speedup 0.8625429106470411 0.8494208787064684 0.9847868068027721 -1.5213193197227914
tritonbench_gemm_fwd[x_(1408, 1408, 1408)-triton_tutorial_matmul]_speedup 0.7433071044420512 0.7324633044858829 0.9854114135444623 -1.4588586455537733
tritonbench_softmax_fwd[x_(4096, 3712)-triton_softmax]_speedup 4.768472862818265 4.70628912272967 0.986959401494456 -1.3040598505543954
tritonbench_gemm_fwd[x_(2304, 2304, 2304)-triton_tutorial_matmul]_speedup 0.681637319614989 0.6734348639529706 0.9879665396450837 -1.2033460354916259
tritonbench_gemm_fwd[x_(1152, 1152, 1152)-triton_tutorial_matmul]_speedup 0.6261180374292956 0.619389609587523 0.9892537390083855 -1.0746260991614465
tritonbench_welford_fwd[x_4096-test_welford]_speedup 0.5862376091293723 0.5803147217893215 0.9898967803364801 -1.0103219663519902
tritonbench_fp8_gemm_blockwise_fwd[x_(16384, 8192, 13312)-_triton]_speedup 1.1589687544326976 1.1474460847701715 0.9900578254431317 -0.99421745568683
tritonbench_flex_attention_bwd[x_ (8, 16, 4096, 16, 4096, 128) | noop-compiled]_speedup 13.958826780459077 13.82357734417826 0.990310830673094 -0.9689169326906
tritonbench_embedding_bwd[x_(32, 512, 768, 32768)-liger_embedding]_speedup 1.8492561671923893 1.8317803677285527 0.9905498222615804 -0.9450177738419607
tritonbench_gemm_fwd[x_(1536, 1536, 1536)-triton_tutorial_matmul]_speedup 0.6968010721219631 0.6909090864685826 0.9915442356662316 -0.8455764333768379
tritonbench_embedding_bwd[x_(32, 512, 768, 65536)-liger_embedding]_speedup 1.5996391411486017 1.5864827395160452 0.9917753940284871 -0.8224605971512866
tritonbench_flash_attention_bwd[x_(4, 48, 8192, 8192, 64)-triton_tutorial_flash_v2]_speedup 0.748064081684358 0.7421104759515026 0.992041315873033 -0.7958684126966986
tritonbench_embedding_bwd[x_(32, 512, 768, 8192)-liger_embedding]_speedup 1.9464324876929522 1.9310420636305135 0.9920930090513026 -0.7906990948697401
tritonbench_flash_attention_bwd[x_(4, 48, 16384, 16384, 64)-triton_tutorial_flash_v2]_speedup 0.7128300880706262 0.7072219817239129 0.9921326183608602 -0.7867381639139803
tritonbench_layer_norm_bwd[x_(4096, 1024)-liger_layer_norm]_speedup 0.5068161329841744 0.5028674783247186 0.9922089010146425 -0.7791098985357459
tritonbench_embedding_bwd[x_(8, 2048, 4096, 2048)-liger_embedding]_speedup 0.6984162670443088 0.6931901400125959 0.9925171745299837 -0.748282547001633
tritonbench_rms_norm_fwd[x_(2048, 1024)-liger_rms]_speedup 3.573065664827788 3.546511559043127 0.992568256988375 -0.7431743011624969
tritonbench_layer_norm_fwd[x_(4096, 11776)-liger_layer_norm]_speedup 1.5105322777640664 1.4996615998558793 0.9928034123678058 -0.7196587632194218
tritonbench_fused_linear_cross_entropy_fwd[x_(16384, 4096)-liger_lm_head_ce]_speedup 0.33129807766263464 0.3290352983427091 0.993169959403659 -0.6830040596340958
tritonbench_softmax_fwd[x_(4096, 10496)-triton_softmax]_speedup 4.8010782255092135 4.769148231434905 0.9933494118248987 -0.6650588175101335
tritonbench_layer_norm_fwd[x_(4096, 9216)-liger_layer_norm]_speedup 1.316494054026588 1.307750436484222 0.9933584071150015 -0.6641592884998548
tritonbench_fp8_gemm_blockwise_fwd[x_(2048, 8192, 2048)-_triton]_speedup 1.257534304247788 1.249207062395763 0.9933781195281141 -0.6621880471885944
tritonbench_cross_entropy_bwd[x_(8, 2048, 8192)-liger_cross_entropy_loss]_speedup 1.5512598486379368 1.5410141733222444 0.9933952552664285 -0.6604744733571533
tritonbench_cross_entropy_bwd[x_(8, 2048, 4096)-liger_cross_entropy_loss]_speedup 1.4758064430952629 1.4662991781095551 0.993557918770318 -0.644208122968204
tritonbench_embedding_bwd[x_average-liger_embedding]_speedup 1.3788076289243911 1.3701926454156363 0.9937518597025204 -0.6248140297479554
tritonbench_gemm_fwd[x_(1792, 1792, 1792)-triton_tutorial_matmul]_speedup 0.8901234821633761 0.8848485246157024 0.9940739036174472 -0.5926096382552815
tritonbench_softmax_fwd[x_(4096, 7936)-triton_softmax]_speedup 4.752804805062711 4.724799614997022 0.9941076498584883 -0.5892350141511682
tritonbench_softmax_fwd[x_(4096, 9088)-triton_softmax]_speedup 4.831076714901533 4.802812588485966 0.9941495181957294 -0.5850481804270613
tritonbench_layer_norm_fwd[x_(4096, 4096)-liger_layer_norm]_speedup 1.3825974961544958 1.3746835142719656 0.9942760044737953 -0.5723995526204728
tritonbench_embedding_bwd[x_(8, 2048, 4096, 1024)-liger_embedding]_speedup 0.6206148682877406 0.617080355298678 0.9943048206389025 -0.5695179361097535
tritonbench_kl_div_bwd[x_(8, 512, 4096)-liger_kl_div]_speedup 0.9225020140104235 0.9173714649789185 0.9944384413761865 -0.5561558623813467
tritonbench_softmax_fwd[x_(4096, 7680)-triton_softmax]_speedup 4.728850123065216 4.703344121181108 0.994606299370813 -0.5393700629186982
tritonbench_layer_norm_fwd[x_(4096, 14336)-liger_layer_norm]_speedup 1.590535839211541 1.5822611550161025 0.9947975493595037 -0.5202450640496292
tritonbench_flash_attention_fwd[x_(4, 48, 1024, 1024, 64)-triton_tutorial_flash_v2]_speedup 1.0564868009346586 1.051018089412294 0.994823682115549 -0.5176317884451054
tritonbench_flex_attention_bwd[x_ (8, 16, 2048, 16, 2048, 128) | noop-compiled]_speedup 13.530005919819493 13.461528084988394 0.9949388170827932 -0.5061182917206808
tritonbench_swiglu_bwd[x_(4, 8192, 4096)-liger_swiglu]_speedup 1.0405088367894777 1.035281034078711 0.9949757248319994 -0.5024275168000636
tritonbench_flash_attention_fwd[x_(4, 48, 512, 512, 64)-triton_tutorial_flash_v2]_speedup 1.269646764682159 1.2633093492643095 0.9950085207995344 -0.49914792004656094
tritonbench_softmax_fwd[x_(4096, 8192)-triton_softmax]_speedup 4.828960863746952 4.80623323621277 0.9952934744812683 -0.4706525518731741
tritonbench_rope_bwd[x_(8192, 1024)-liger_rotary_pos_emb]_speedup 3.660442954472158 3.643227109121236 0.9952967863274885 -0.4703213672511475
tritonbench_softmax_fwd[x_(4096, 9984)-triton_softmax]_speedup 4.775740741336714 4.7540616393377615 0.9954605781232412 -0.453942187675882
tritonbench_embedding_bwd[x_(32, 512, 768, 131072)-liger_embedding]_speedup 1.3724070616233144 1.366375712362822 0.9956052767220839 -0.4394723277916124
tritonbench_layer_norm_fwd[x_(4096, 14848)-liger_layer_norm]_speedup 1.5761075390136778 1.5693789158030425 0.9957308603353003 -0.42691396646996616
tritonbench_fused_linear_cross_entropy_fwd[x_(8192, 4096)-liger_lm_head_ce]_speedup 0.30486699890441094 0.30357620321176726 0.9957660366740829 -0.42339633259170784
tritonbench_softmax_fwd[x_(4096, 11392)-triton_softmax]_speedup 4.767256156174415 4.747089616790845 0.9957697806195182 -0.4230219380481848
tritonbench_softmax_fwd[x_(4096, 6016)-triton_softmax]_speedup 4.721392923233391 4.701573056188471 0.9958021144676631 -0.4197885532336909
tritonbench_embedding_bwd[x_(8, 2048, 4096, 16384)-liger_embedding]_speedup 1.3652479735431031 1.3596798650252062 0.9959215405363712 -0.4078459463628836
tritonbench_softmax_fwd[x_(4096, 7424)-triton_softmax]_speedup 4.720852055783301 4.701819511380732 0.995968409054621 -0.4031590945378971
tritonbench_layer_norm_fwd[x_(4096, 7680)-liger_layer_norm]_speedup 1.553024983805427 1.5468416746686708 0.996018538528849 -0.3981461471150971
tritonbench_fused_linear_cross_entropy_bwd[x_(4096, 4096)-liger_lm_head_ce]_speedup 85.64285174915061 85.3046579242822 0.996051114390037 -0.3948885609962982
tritonbench_swiglu_fwd[x_(4, 2048, 4096)-liger_swiglu]_speedup 1.0368469134979423 1.0330299756990873 0.9963187065041472 -0.36812934958527777
tritonbench_layer_norm_fwd[x_(4096, 9728)-liger_layer_norm]_speedup 1.3681481248970226 1.3631202916596163 0.9963250812204382 -0.3674918779561831
tritonbench_softmax_fwd[x_(4096, 7552)-triton_softmax]_speedup 4.7266372050522705 4.709476607293861 0.9963693854607528 -0.3630614539247179
tritonbench_gemm_fwd[x_(1024, 1024, 1024)-triton_tutorial_matmul]_speedup 0.6487068647842971 0.6464208379895781 0.9964760249677963 -0.35239750322036967
tritonbench_layer_norm_fwd[x_(4096, 8704)-liger_layer_norm]_speedup 1.2627420186677483 1.2583440277247029 0.996517110480187 -0.3482889519813015
tritonbench_gemm_fwd[x_(3200, 3200, 3200)-triton_tutorial_matmul]_speedup 0.7927434146792091 0.7901010354658564 0.9966667913420361 -0.33332086579639375
tritonbench_softmax_fwd[x_(4096, 6144)-triton_softmax]_speedup 4.75322383494346 4.737516527323237 0.996695441206713 -0.3304558793286949
tritonbench_gemm_fwd[x_average-triton_tutorial_matmul]_speedup 0.8100316891520294 0.8074681757352743 0.9968352924322768 -0.3164707567723246
tritonbench_layer_norm_bwd[x_(4096, 8192)-liger_layer_norm]_speedup 1.0535418855482095 1.0502839163434845 0.9969076035329818 -0.3092396467018199
tritonbench_flex_attention_bwd[x_ (8, 16, 1024, 16, 1024, 128) | noop-compiled]_speedup 12.570266342869415 12.532452196796532 0.9969917784523051 -0.3008221547694867
tritonbench_layer_norm_fwd[x_(4096, 8192)-liger_layer_norm]_speedup 1.5622476868330748 1.5576408012932401 0.9970511170676312 -0.2948882932368835
tritonbench_softmax_fwd[x_(4096, 9472)-triton_softmax]_speedup 4.7890072008932885 4.774970624283648 0.9970690007300422 -0.2930999269957768
tritonbench_fused_linear_cross_entropy_fwd[x_average-liger_lm_head_ce]_speedup 0.3074590956700557 0.30656587213808706 0.99709482157286 -0.29051784271399983
tritonbench_softmax_fwd[x_(4096, 6656)-triton_softmax]_speedup 4.735732118043402 4.722039642482404 0.9971086887476533 -0.2891311252346651
tritonbench_embedding_fwd[x_(8, 2048, 4096, 2048)-liger_embedding]_speedup 1.086724679813746 1.0836383921795132 0.9971600096219755 -0.2839990378024515
tritonbench_embedding_bwd[x_(8, 2048, 4096, 65536)-liger_embedding]_speedup 1.232110230330958 1.2286176508507851 0.9971653676804269 -0.28346323195731493
tritonbench_fp8_gemm_blockwise_fwd[x_(32, 8192, 13312)-_triton]_speedup 0.516865874461181 0.5154721231020777 0.9973034564130273 -0.26965435869726884
tritonbench_softmax_fwd[x_(4096, 3456)-triton_softmax]_speedup 4.694013224022822 4.681383425216347 0.9973093815028389 -0.26906184971611236
tritonbench_layer_norm_bwd[x_(4096, 9728)-liger_layer_norm]_speedup 0.8093814523063336 0.8073455195568475 0.997484581906733 -0.2515418093267052
tritonbench_layer_norm_fwd[x_(4096, 12288)-liger_layer_norm]_speedup 1.5294116805342963 1.525576305681538 0.9974922547659513 -0.25077452340487305
tritonbench_layer_norm_bwd[x_(4096, 14848)-liger_layer_norm]_speedup 0.8189254484018068 0.8168957313158525 0.9975214873467231 -0.24785126532769208
tritonbench_layer_norm_bwd[x_(4096, 12800)-liger_layer_norm]_speedup 0.8097976628269815 0.8078085647125293 0.997543709736691 -0.24562902633089578
tritonbench_layer_norm_fwd[x_(4096, 5120)-liger_layer_norm]_speedup 1.353982382938206 1.350706226478775 0.9975803551798645 -0.2419644820135458
tritonbench_geglu_fwd[x_(8, 1024, 4096)-liger_geglu]_speedup 1.0056968057805804 1.0033373442991351 0.9976539037731019 -0.23460962268980756
tritonbench_softmax_fwd[x_(4096, 2816)-triton_softmax]_speedup 4.560869571819394 4.550737137142392 0.9977783985011087 -0.2221601498891279
tritonbench_rope_bwd[x_(8192, 16384)-liger_rotary_pos_emb]_speedup 3.9495622296723334 3.942054787043776 0.9980991709480723 -0.19008290519276771
tritonbench_jsd_fwd[x_(4, 2048, 8192)-liger_jsd]_speedup 4.312912571954319 4.304769696203691 0.9981119775523439 -0.1888022447656068
tritonbench_gemm_fwd[x_(2944, 2944, 2944)-triton_tutorial_matmul]_speedup 0.6941931548606403 0.6929388051060109 0.9981930825075894 -0.18069174924105758
tritonbench_embedding_bwd[x_(8, 2048, 4096, 131072)-liger_embedding]_speedup 1.145863575426287 1.1438362646126634 0.9982307572584552 -0.17692427415447742
tritonbench_jsd_bwd[x_(4, 2048, 16384)-liger_jsd]_speedup 6.045495949378903 6.0348060814293625 0.998231763276487 -0.1768236723513006
tritonbench_cross_entropy_bwd[x_(8, 2048, 16384)-liger_cross_entropy_loss]_speedup 1.8301455684489667 1.8269875863707186 0.9982744639919957 -0.17255360080042914
tritonbench_layer_norm_bwd[x_(4096, 5120)-liger_layer_norm]_speedup 0.8940909713450915 0.8925575411894171 0.9982849282625375 -0.1715071737462548
tritonbench_layer_norm_bwd[x_(4096, 2048)-liger_layer_norm]_speedup 0.6915506509477481 0.6903735571302598 0.9982978921126382 -0.17021078873618034
tritonbench_layer_norm_fwd[x_(4096, 15360)-liger_layer_norm]_speedup 1.5932422708692313 1.5905816543728648 0.9983300615700367 -0.16699384299633246
tritonbench_layer_norm_fwd[x_average-liger_layer_norm]_speedup 1.4432388991739507 1.4408733342284876 0.9983609332129164 -0.16390667870835873
tritonbench_softmax_fwd[x_(4096, 10624)-triton_softmax]_speedup 4.807671776019647 4.799946880670175 0.9983932149053929 -0.16067850946070994
tritonbench_layer_norm_fwd[x_(4096, 3584)-liger_layer_norm]_speedup 1.3280462142819194 1.326133966127583 0.9985601042088958 -0.1439895791104151
tritonbench_softmax_fwd[x_(4096, 8576)-triton_softmax]_speedup 4.80920217446447 4.80250488091991 0.9986074002918569 -0.1392599708143094
tritonbench_geglu_fwd[x_(8, 8192, 4096)-liger_geglu]_speedup 1.0122192460713488 1.0108586085748021 0.9986557877635427 -0.13442122364573095
tritonbench_rope_bwd[x_(8192, 8192)-liger_rotary_pos_emb]_speedup 3.6799477862867565 3.6754025950315388 0.9987648761560816 -0.12351238439184398
tritonbench_softmax_fwd[x_(4096, 4992)-triton_softmax]_speedup 4.7787371972204316 4.772996253668429 0.9987986484054111 -0.12013515945888953
tritonbench_rope_fwd[x_(8192, 16384)-liger_rotary_pos_emb]_speedup 3.058421423562507 3.0548653659873573 0.9988372898686384 -0.11627101313615995
tritonbench_kl_div_bwd[x_(8, 512, 16384)-liger_kl_div]_speedup 1.0200919757227167 1.018925272263689 0.998856276211563 -0.11437237884369678
tritonbench_swiglu_fwd[x_(4, 8192, 4096)-liger_swiglu]_speedup 1.2400881439021776 1.2386811897519423 0.9988654402050744 -0.11345597949256048
tritonbench_layer_norm_fwd[x_(4096, 10240)-liger_layer_norm]_speedup 1.3920863490112292 1.390512647991649 0.9988695377835592 -0.1130462216440753
tritonbench_fused_linear_jsd_fwd[x_(4096, 4096)-liger_lm_head_jsd]_speedup 0.3717545343344134 0.3713412407951508 0.9988882622776812 -0.11117377223187619
tritonbench_rms_norm_fwd[x_(2048, 16384)-liger_rms]_speedup 3.992656975337204 3.9882982165919745 0.9989083062301235 -0.10916937698765006
tritonbench_embedding_bwd[x_(8, 2048, 4096, 4096)-liger_embedding]_speedup 0.8633951689084083 0.8624601592779951 0.9989170548271711 -0.10829451728289197
tritonbench_softmax_fwd[x_(4096, 4224)-triton_softmax]_speedup 4.715607639143241 4.710672376403699 0.9989534195553983 -0.1046580444601708
tritonbench_layer_norm_bwd[x_(4096, 4608)-liger_layer_norm]_speedup 0.8777714573325273 0.8768641001791483 0.9989662945339595 -0.10337054660405354
tritonbench_embedding_bwd[x_(8, 2048, 4096, 32768)-liger_embedding]_speedup 1.3170397871856907 1.3156913315887218 0.9989761466509297 -0.10238533490702917
tritonbench_embedding_fwd[x_(8, 2048, 4096, 131072)-liger_embedding]_speedup 1.0153899148973713 1.0143564701967627 0.9989822188644516 -0.10177811355484012
tritonbench_embedding_bwd[x_(8, 2048, 4096, 8192)-liger_embedding]_speedup 1.118684666217797 1.1176017210913967 0.9990319478230969 -0.09680521769031003
tritonbench_layer_norm_bwd[x_(4096, 10240)-liger_layer_norm]_speedup 0.8222178949502688 0.8214493456073374 0.9990652729067909 -0.09347270932090757
tritonbench_softmax_fwd[x_(4096, 11904)-triton_softmax]_speedup 4.734621241039573 4.7303481611370035 0.9990974822092356 -0.09025177907644455
tritonbench_jsd_fwd[x_(4, 2048, 16384)-liger_jsd]_speedup 0.6087472947033401 0.6082043930405621 0.9991081657898084 -0.08918342101915666
tritonbench_jsd_bwd[x_(4, 2048, 32768)-liger_jsd]_speedup 5.811029960415806 5.806049348948052 0.9991429038394773 -0.08570961605226524
tritonbench_embedding_fwd[x_(8, 2048, 4096, 8192)-liger_embedding]_speedup 1.168778296560491 1.1677973853256465 0.9991607379793661 -0.08392620206338908
tritonbench_kl_div_bwd[x_average-liger_kl_div]_speedup 1.010764280007287 1.0099473117545639 0.9991917321685357 -0.080826783146426
tritonbench_layer_norm_bwd[x_(4096, 7680)-liger_layer_norm]_speedup 1.0064029284606988 1.0056621520748075 0.9992639365755579 -0.07360634244421016
tritonbench_softmax_fwd[x_(4096, 4864)-triton_softmax]_speedup 4.79613259306054 4.792723156593179 0.9992891279794277 -0.07108720205722951
tritonbench_layer_norm_fwd[x_(4096, 15872)-liger_layer_norm]_speedup 1.603117736297034 1.602006138588575 0.9993066025761611 -0.06933974238388929
tritonbench_geglu_bwd[x_(8, 1024, 4096)-liger_geglu]_speedup 1.0028581137786496 1.0021648729698867 0.9993087349055282 -0.06912650944718157
tritonbench_layer_norm_bwd[x_(4096, 15360)-liger_layer_norm]_speedup 0.8259637394059474 0.8254207346724636 0.9993425804214185 -0.0657419578581453
tritonbench_layer_norm_bwd[x_(4096, 11776)-liger_layer_norm]_speedup 0.823121498475445 0.822600770845257 0.9993673745235029 -0.06326254764971173
tritonbench_geglu_fwd[x_(8, 2048, 4096)-liger_geglu]_speedup 0.9455933746434811 0.9450191824295952 0.9993927704769479 -0.060722952305214406
tritonbench_softmax_fwd[x_(4096, 11520)-triton_softmax]_speedup 4.739759763553262 4.737073905955592 0.9994333346558357 -0.05666653441642744
tritonbench_softmax_fwd[x_(4096, 11136)-triton_softmax]_speedup 4.781281553821182 4.77876546650166 0.9994737629877684 -0.05262370122316451
tritonbench_softmax_fwd[x_(4096, 3072)-triton_softmax]_speedup 4.664489906078971 4.662052019066326 0.9994773518515997 -0.05226481484003065
tritonbench_kl_div_bwd[x_(8, 512, 65536)-liger_kl_div]_speedup 1.0491676041213225 1.0486588755866386 0.9995151122349895 -0.04848877650105443
tritonbench_layer_norm_bwd[x_(4096, 12288)-liger_layer_norm]_speedup 0.830807532916175 0.8304109925223229 0.9995227048646751 -0.047729513532490664
tritonbench_layer_norm_bwd[x_average-liger_layer_norm]_speedup 0.8245709273827535 0.8241876952942216 0.9995352345373754 -0.04647654626246389
tritonbench_layer_norm_bwd[x_(4096, 10752)-liger_layer_norm]_speedup 0.7753725016942574 0.7750146417019049 0.9995384670057675 -0.046153299423246086
tritonbench_layer_norm_fwd[x_(4096, 12800)-liger_layer_norm]_speedup 1.536606194974814 1.535935117663141 0.999563273066406 -0.04367269335939783
tritonbench_softmax_fwd[x_(4096, 3840)-triton_softmax]_speedup 4.733108195536526 4.731081310361444 0.9995717644534318 -0.042823554656823415
tritonbench_embedding_fwd[x_(8, 2048, 4096, 4096)-liger_embedding]_speedup 1.1653020137586945 1.1648238581420198 0.9995896723673099 -0.04103276326901106
tritonbench_softmax_fwd[x_(4096, 12544)-triton_softmax]_speedup 4.71202900530713 4.710194201981066 0.9996106128964831 -0.038938710351688055
tritonbench_softmax_fwd[x_(4096, 12416)-triton_softmax]_speedup 4.722057049946883 4.720319779612497 0.999632094590131 -0.03679054098689738
tritonbench_fused_linear_jsd_bwd[x_(8192, 4096)-liger_lm_head_jsd]_speedup 278.4738465208261 278.3737789332258 0.9996406571430297 -0.035934285697025725
tritonbench_welford_fwd[x_5120-test_welford]_speedup 0.6678498838780406 0.6676428193351723 0.9996899534642937 -0.03100465357063209
tritonbench_fused_linear_cross_entropy_fwd[x_(4096, 4096)-liger_lm_head_ce]_speedup 0.2440582650790855 0.24399030082625134 0.999721524477722 -0.027847552227799888
tritonbench_layer_norm_fwd[x_(4096, 6656)-liger_layer_norm]_speedup 1.5185034074796486 1.5180967350214107 0.9997321886429528 -0.02678113570472318
tritonbench_jsd_bwd[x_(4, 2048, 65536)-liger_jsd]_speedup 5.839072768844507 5.8376351181951005 0.9997537878518868 -0.024621214811315628
tritonbench_softmax_fwd[x_(4096, 4736)-triton_softmax]_speedup 4.758445813518913 4.757439239505333 0.9997884658031158 -0.021153419688424435
tritonbench_embedding_bwd[x_(32, 512, 768, 1024)-liger_embedding]_speedup 1.5369960296655287 1.5366741836896918 0.9997906006459191 -0.020939935408093024
tritonbench_softmax_fwd[x_(4096, 6528)-triton_softmax]_speedup 4.735801664724452 4.73487417655724 0.9998041539251696 -0.019584607483036365
tritonbench_jsd_bwd[x_average-liger_jsd]_speedup 5.935903350659348 5.934793560392529 0.9998130376791434 -0.018696232085657627
tritonbench_jsd_bwd[x_(4, 2048, 8192)-liger_jsd]_speedup 6.230539310543682 6.229525043010322 0.9998372103147406 -0.01627896852594013
tritonbench_layer_norm_bwd[x_(4096, 3584)-liger_layer_norm]_speedup 0.8469750824808776 0.8468788479889653 0.9998863786032165 -0.011362139678350314
tritonbench_layer_norm_bwd[x_(4096, 7168)-liger_layer_norm]_speedup 0.964829539803206 0.9647260701422596 0.9998927586100158 -0.010724138998419175
tritonbench_layer_norm_bwd[x_(4096, 15872)-liger_layer_norm]_speedup 0.8385049460067674 0.8384167483974786 0.9998948156362001 -0.010518436379991503
tritonbench_softmax_fwd[x_(4096, 5632)-triton_softmax]_speedup 4.740278482436041 4.739817731653933 0.9999028009042474 -0.00971990957525648
tritonbench_kl_div_fwd[x_(8, 512, 131072)-liger_kl_div]_speedup 4.606177431922028 4.605788730153373 0.9999156129405782 -0.00843870594218199
tritonbench_geglu_bwd[x_(8, 2048, 4096)-liger_geglu]_speedup 1.0049163941036896 1.0048615893819843 0.999945463401705 -0.005453659829501856
tritonbench_fused_linear_jsd_fwd[x_(2048, 4096)-liger_lm_head_jsd]_speedup 0.29443544579373526 0.2944339194004598 0.9999948158644034 -0.000518413559658093
tritonbench_jsd_fwd[x_average-liger_jsd]_speedup 1.8500953664338733 1.85012355233838 1.0000152348387106 0.001523483871057607
tritonbench_fused_linear_cross_entropy_bwd[x_average-liger_lm_head_ce]_speedup 274.75736848423674 274.77246200413526 1.000054933994971 0.005493399497091822
tritonbench_fused_linear_cross_entropy_bwd[x_(32768, 4096)-liger_lm_head_ce]_speedup 546.4995953339925 546.5323976201926 1.0000600225260552 0.0060022526055192316
tritonbench_fused_linear_jsd_fwd[x_average-liger_lm_head_jsd]_speedup 0.3226564725420707 0.3226796220922283 1.0000717467403495 0.007174674034948758
tritonbench_layer_norm_fwd[x_(4096, 13312)-liger_layer_norm]_speedup 1.5477865387090335 1.5478999089296361 1.0000732466769593 0.007324667695929321
tritonbench_softmax_fwd[x_(4096, 3968)-triton_softmax]_speedup 4.814963249463494 4.815333170558845 1.000076827397466 0.00768273974660616
tritonbench_softmax_fwd[x_(4096, 11008)-triton_softmax]_speedup 4.75019160454774 4.7508300647681745 1.0001344072562932 0.013440725629321193
tritonbench_fused_linear_cross_entropy_fwd[x_(32768, 4096)-liger_lm_head_ce]_speedup 0.3496130410340915 0.3496616861716205 1.0001391399399322 0.013913993993219798
tritonbench_cross_entropy_bwd[x_average-liger_cross_entropy_loss]_speedup 1.8539619767060653 1.8542330588149405 1.0001462177284546 0.014621772845457848
tritonbench_layer_norm_bwd[x_(4096, 4096)-liger_layer_norm]_speedup 0.6794676895397839 0.6795684974718376 1.0001483630989458 0.014836309894583621
tritonbench_layer_norm_bwd[x_(4096, 3072)-liger_layer_norm]_speedup 0.8417222331936004 0.841860396351664 1.000164143410516 0.016414341051596004
tritonbench_softmax_fwd[x_(4096, 9600)-triton_softmax]_speedup 4.79830461721342 4.7991239857506285 1.0001707620925668 0.017076209256683406
tritonbench_softmax_fwd[x_(4096, 5888)-triton_softmax]_speedup 4.716451977771094 4.717301293102251 1.000180075051153 0.0180075051152917
tritonbench_layer_norm_bwd[x_(4096, 8704)-liger_layer_norm]_speedup 0.7799972108794566 0.7801772814683975 1.0002308605549215 0.023086055492149704
tritonbench_layer_norm_bwd[x_(4096, 11264)-liger_layer_norm]_speedup 0.7937575743235263 0.7939519291076897 1.0002448540844842 0.024485408448415846
tritonbench_softmax_fwd[x_(4096, 12672)-triton_softmax]_speedup 4.751350171537092 4.7525308350003375 1.0002484900966295 0.02484900966295278
tritonbench_addmm_fwd[x_(35901, 512, 1536)-triton_addmm]_speedup 1.0024183980688852 1.0026901077216068 1.0002710541359228 0.027105413592276584
tritonbench_layer_norm_fwd[x_(4096, 6144)-liger_layer_norm]_speedup 1.5019641719267829 1.5024006658692226 1.0002906154158655 0.029061541586550277
tritonbench_jsd_fwd[x_(4, 2048, 32768)-liger_jsd]_speedup 0.5968401452435037 0.5970274795421712 1.0003138768398212 0.03138768398212033
tritonbench_jsd_fwd[x_(4, 2048, 131072)-liger_jsd]_speedup 0.5811393535470355 0.5813228993817452 1.0003158379028876 0.03158379028875835
tritonbench_softmax_fwd[x_(4096, 8064)-triton_softmax]_speedup 4.719322610085519 4.7208983804798645 1.0003338975790674 0.033389757906743256
tritonbench_layer_norm_bwd[x_(4096, 13824)-liger_layer_norm]_speedup 0.8239021020553735 0.8241977641173305 1.000358855816995 0.03588558169949074
tritonbench_softmax_fwd[x_(4096, 5120)-triton_softmax]_speedup 4.725899237106533 4.727604510948316 1.0003608358444027 0.03608358444027182
tritonbench_jsd_bwd[x_(4, 2048, 131072)-liger_jsd]_speedup 5.832342907344473 5.8344601887043055 1.0003630241557242 0.03630241557242009
tritonbench_kl_div_bwd[x_(8, 512, 8192)-liger_kl_div]_speedup 0.9809439173222789 0.9813287655113241 1.0003923243543786 0.0392324354378637
tritonbench_cross_entropy_fwd[x_(8, 2048, 131072)-liger_cross_entropy_loss]_speedup 1.2906769833897092 1.2911915644223544 1.0003986907950382 0.03986907950381813
tritonbench_layer_norm_bwd[x_(4096, 14336)-liger_layer_norm]_speedup 0.8402587902742877 0.8406082186713911 1.0004158580679523 0.04158580679523016
tritonbench_kl_div_bwd[x_(8, 512, 131072)-liger_kl_div]_speedup 1.052250229404886 1.052688556518659 1.0004165616709066 0.04165616709066455
tritonbench_fused_linear_jsd_bwd[x_average-liger_lm_head_jsd]_speedup 132.66779222704378 132.73292379231944 1.000490937281629 0.0490937281629078
tritonbench_softmax_fwd[x_(4096, 10880)-triton_softmax]_speedup 4.761572095100628 4.76420478922783 1.0005529043926293 0.055290439262933866
tritonbench_welford_fwd[x_average-test_welford]_speedup 0.6241323950778657 0.624489383755402 1.000571975882604 0.05719758826039367
tritonbench_fp8_gemm_blockwise_fwd[x_(1, 8192, 16384)-_triton]_speedup 0.6607601723998078 0.6611523124844976 1.0005934681009383 0.059346810093829916
tritonbench_softmax_fwd[x_(4096, 11648)-triton_softmax]_speedup 4.75219274383081 4.755052515869277 1.0006017794716302 0.060177947163020384
tritonbench_fp8_gemm_blockwise_fwd[x_(16, 4096, 6656)-_triton]_speedup 0.6817548815393024 0.6821651730605418 1.000601816770733 0.06018167707328992
tritonbench_layer_norm_bwd[x_(4096, 6144)-liger_layer_norm]_speedup 0.953681119421572 0.954279462644334 1.0006274038675789 0.06274038675788685
tritonbench_softmax_fwd[x_(4096, 640)-triton_softmax]_speedup 3.598503874402092 3.600985057118775 1.000689503972563 0.06895039725629548
tritonbench_fused_linear_cross_entropy_bwd[x_(8192, 4096)-liger_lm_head_ce]_speedup 162.3205156621521 162.43469045018367 1.0007033909889074 0.07033909889073708
tritonbench_softmax_fwd[x_(4096, 12032)-triton_softmax]_speedup 4.709313445258512 4.712673519869383 1.0007134956400607 0.07134956400607351
tritonbench_softmax_fwd[x_(4096, 1536)-triton_softmax]_speedup 4.2254758476138825 4.228529887982024 1.0007227683882907 0.07227683882906621
tritonbench_softmax_fwd[x_(4096, 4480)-triton_softmax]_speedup 4.711020041177997 4.714453033944093 1.000728715381401 0.07287153814010683
tritonbench_rope_fwd[x_(8192, 8192)-liger_rotary_pos_emb]_speedup 2.7782865692045577 2.7803489445779532 1.0007423191675962 0.07423191675961505
tritonbench_softmax_fwd[x_(4096, 5760)-triton_softmax]_speedup 4.744207746137763 4.747756104352998 1.0007479348302408 0.07479348302408262
tritonbench_fused_linear_jsd_fwd[x_(1024, 4096)-liger_lm_head_jsd]_speedup 0.18093557760265308 0.18108129339309353 1.0008053462584372 0.08053462584372006
tritonbench_fused_linear_jsd_fwd[x_(8192, 4096)-liger_lm_head_jsd]_speedup 0.44350033243748105 0.44386203478020897 1.000815562731915 0.0815562731915076
tritonbench_fused_linear_cross_entropy_bwd[x_(16384, 4096)-liger_lm_head_ce]_speedup 304.56651119165184 304.81810202188257 1.0008260620290994 0.08260620290994058
tritonbench_kl_div_fwd[x_(8, 512, 32768)-liger_kl_div]_speedup 4.42930805617185 4.433021988912163 1.0008384905030794 0.0838490503079381
tritonbench_layer_norm_bwd[x_(4096, 5632)-liger_layer_norm]_speedup 0.9024844561689532 0.9032699926645171 1.000870415540339 0.08704155403389535
tritonbench_swiglu_bwd[x_(4, 2048, 4096)-liger_swiglu]_speedup 1.0291708624556895 1.0300680809272738 1.0008717876732767 0.08717876732766694
tritonbench_layer_norm_fwd[x_(4096, 2048)-liger_layer_norm]_speedup 1.2957907384923155 1.296928271143072 1.0008778675575967 0.08778675575966677
tritonbench_embedding_fwd[x_(8, 2048, 4096, 65536)-liger_embedding]_speedup 1.021507216691437 1.0224168195220864 1.000890451693132 0.0890451693132066
tritonbench_geglu_fwd[x_average-liger_geglu]_speedup 0.9795143663275336 0.9804472685764447 1.0009524130334186 0.0952413033418642
tritonbench_softmax_fwd[x_(4096, 8832)-triton_softmax]_speedup 4.829291283311924 4.83396217131708 1.000967199477343 0.09671994773430281
tritonbench_cross_entropy_fwd[x_(8, 2048, 65536)-liger_cross_entropy_loss]_speedup 1.372374166809067 1.3737238807194068 1.000983488281099 0.09834882810990653
tritonbench_kl_div_bwd[x_(8, 512, 32768)-liger_kl_div]_speedup 1.0396299394620934 1.0407109356681534 1.0010397894145098 0.1039789414509773
tritonbench_cross_entropy_bwd[x_(8, 2048, 131072)-liger_cross_entropy_loss]_speedup 2.1155103413392733 2.117737797288932 1.001052916597065 0.10529165970649679
tritonbench_fused_linear_jsd_bwd[x_(2048, 4096)-liger_lm_head_jsd]_speedup 71.8774387858073 71.95404878193854 1.0010658420420284 0.10658420420284198
tritonbench_fused_linear_jsd_bwd[x_(1024, 4096)-liger_lm_head_jsd]_speedup 36.48583862561098 36.526825037851054 1.0011233512448663 0.11233512448662886
tritonbench_softmax_fwd[x_(4096, 4608)-triton_softmax]_speedup 4.737780474572858 4.743251054023396 1.0011546713656105 0.11546713656105378
tritonbench_rope_fwd[x_(8192, 4096)-liger_rotary_pos_emb]_speedup 2.762818928836894 2.766066368751536 1.0011754081603925 0.11754081603925215
tritonbench_softmax_fwd[x_(4096, 3200)-triton_softmax]_speedup 4.6703648365937465 4.675889521212759 1.0011829235642844 0.11829235642843994
tritonbench_softmax_fwd[x_(4096, 4352)-triton_softmax]_speedup 4.796080916990599 4.801956118966497 1.001225000594774 0.12250005947740394
tritonbench_geglu_bwd[x_average-liger_geglu]_speedup 1.002764451281167 1.0039962519361727 1.0012284047898106 0.12284047898105666
tritonbench_jsd_fwd[x_(4, 2048, 65536)-liger_jsd]_speedup 0.5832776076697858 0.5840057103951928 1.0012482953499893 0.12482953499892702
tritonbench_gemm_fwd[x_(4096, 4096, 4096)-triton_tutorial_matmul]_speedup 0.9068648900137336 0.9080325756896083 1.0012876071052403 0.12876071052403404
tritonbench_layer_norm_bwd[x_(4096, 9216)-liger_layer_norm]_speedup 0.7856755294396502 0.7867362919796942 1.001350128011242 0.13501280112420133
tritonbench_softmax_fwd[x_(4096, 5248)-triton_softmax]_speedup 4.691878087517371 4.698629866747876 1.0014390355215042 0.14390355215041506
tritonbench_layer_norm_bwd[x_(4096, 13312)-liger_layer_norm]_speedup 0.813214212282992 0.8143985189662989 1.001456328068815 0.1456328068814905
tritonbench_embedding_fwd[x_(8, 2048, 4096, 16384)-liger_embedding]_speedup 1.0986306471481364 1.1002790811321956 1.0015004442014597 0.1500444201459672
tritonbench_softmax_fwd[x_(4096, 9856)-triton_softmax]_speedup 4.774046472705178 4.781455387521872 1.0015519151015921 0.15519151015921384
tritonbench_layer_norm_bwd[x_(4096, 6656)-liger_layer_norm]_speedup 0.9352037135943546 0.9366566672652529 1.0015536226490311 0.1553622649031139
tritonbench_softmax_fwd[x_(4096, 5504)-triton_softmax]_speedup 4.737640772509983 4.745117178729238 1.001578086346824 0.15780863468239925
tritonbench_jsd_bwd[x_(4, 2048, 4096)-liger_jsd]_speedup 5.8569392074287165 5.866285582068034 1.0015957779837399 0.15957779837398878
tritonbench_fused_linear_jsd_bwd[x_(4096, 4096)-liger_lm_head_jsd]_speedup 143.83404497593068 144.07704241626234 1.0016894292333385 0.16894292333384797
tritonbench_jsd_fwd[x_(4, 2048, 4096)-liger_jsd]_speedup 4.417655225485256 4.425411135466918 1.0017556621297468 0.175566212974676
tritonbench_softmax_fwd[x_(4096, 9216)-triton_softmax]_speedup 4.803214051280067 4.81168611045731 1.0017638312777222 0.1763831277722172
tritonbench_layer_norm_fwd[x_(4096, 10752)-liger_layer_norm]_speedup 1.433993338475147 1.4365229329910985 1.0017640211067098 0.17640211067098122
tritonbench_layer_norm_bwd[x_(4096, 1536)-liger_layer_norm]_speedup 0.6562133984349304 0.6573857403373233 1.0017865253973615 0.17865253973614603
tritonbench_softmax_fwd[x_(4096, 2048)-triton_softmax]_speedup 4.322033987486602 4.329954841867878 1.0018326682307934 0.18326682307934217
tritonbench_kl_div_fwd[x_(8, 512, 65536)-liger_kl_div]_speedup 4.548820331922483 4.557575726239857 1.001924761515844 0.19247615158439757
tritonbench_softmax_fwd[x_(4096, 12288)-triton_softmax]_speedup 4.7203158462319825 4.729529104208536 1.0019518308258775 0.19518308258774653
tritonbench_embedding_fwd[x_(8, 2048, 4096, 32768)-liger_embedding]_speedup 1.0535452591351346 1.0556678715000651 1.0020147329661688 0.20147329661688307
tritonbench_layer_norm_bwd[x_(4096, 2560)-liger_layer_norm]_speedup 0.7354784659876081 0.7369633850758623 1.0020189837730467 0.20189837730466564
tritonbench_kl_div_fwd[x_(8, 512, 8192)-liger_kl_div]_speedup 3.868215014136774 3.8764998651190323 1.0021417762332188 0.21417762332187618
tritonbench_kl_div_fwd[x_average-liger_kl_div]_speedup 4.1746045332607595 4.183618747769876 1.0021592978298415 0.21592978298414778
tritonbench_softmax_fwd[x_(4096, 7808)-triton_softmax]_speedup 4.709918097082515 4.7213116712876 1.0024190599433442 0.24190599433442372
tritonbench_softmax_fwd[x_(4096, 2176)-triton_softmax]_speedup 4.237798664838354 4.248447163051464 1.0025127428307208 0.2512742830720782
tritonbench_geglu_bwd[x_(8, 8192, 4096)-liger_geglu]_speedup 1.0031404314980366 1.0056660201532144 1.0025176820471748 0.25176820471748407
tritonbench_softmax_fwd[x_(4096, 3328)-triton_softmax]_speedup 4.773967174689948 4.786604549481771 1.0026471432101214 0.2647143210121383
tritonbench_softmax_fwd[x_(4096, 7040)-triton_softmax]_speedup 4.735270741519857 4.747839572191535 1.0026543003257389 0.26543003257388786
tritonbench_softmax_fwd[x_(4096, 2944)-triton_softmax]_speedup 4.732699131638342 4.746758934951704 1.0029707790253075 0.2970779025307513
tritonbench_addmm_fwd[x_(20203, 512, 1536)-triton_addmm]_speedup 0.9656384328215393 0.9685781062199709 1.0030442796170012 0.30442796170011555
tritonbench_geglu_bwd[x_(8, 4096, 4096)-liger_geglu]_speedup 1.0001428657442923 1.003292525239605 1.0031492095811418 0.3149209581141843
tritonbench_softmax_fwd[x_(4096, 8960)-triton_softmax]_speedup 4.825668094786236 4.8410719351643925 1.0031920637879757 0.3192063787975652
tritonbench_softmax_fwd[x_average-triton_softmax]_speedup 4.598279266841951 4.612989688000419 1.0031991143436076 0.3199114343607645
tritonbench_embedding_bwd[x_(32, 512, 768, 4096)-liger_embedding]_speedup 1.7234042999595416 1.7290340240187698 1.0032666299250619 0.32666299250618813
tritonbench_welford_fwd[x_1024-test_welford]_speedup 0.595690544867554 0.5976483581592229 1.0032866281134347 0.32866281134347197
tritonbench_softmax_fwd[x_(4096, 1920)-triton_softmax]_speedup 4.288438598548208 4.3028504470998135 1.003360628401322 0.3360628401321897
tritonbench_layer_norm_fwd[x_(4096, 2560)-liger_layer_norm]_speedup 1.2916666804468893 1.2960152272125567 1.0033666168149225 0.3366616814922452
tritonbench_cross_entropy_bwd[x_(8, 2048, 65536)-liger_cross_entropy_loss]_speedup 2.0957948047944557 2.1029817137159093 1.003429204474127 0.342920447412709
tritonbench_gemm_fwd[x_(2560, 2560, 2560)-triton_tutorial_matmul]_speedup 0.7873936596456409 0.7902414329625181 1.003616708468492 0.3616708468491936
tritonbench_softmax_fwd[x_(4096, 8320)-triton_softmax]_speedup 4.823275901000963 4.840931579791913 1.0036605160379246 0.36605160379246104
tritonbench_addmm_fwd[x_(20116, 512, 1536)-triton_addmm]_speedup 0.9334638339271351 0.9369333643651827 1.0037168343453127 0.37168343453126695
tritonbench_kl_div_fwd[x_(8, 512, 16384)-liger_kl_div]_speedup 4.216765946916615 4.2325194302025855 1.0037359159802288 0.3735915980228821
tritonbench_softmax_fwd[x_(4096, 10368)-triton_softmax]_speedup 4.766170734240107 4.784026085127714 1.0037462675768902 0.37462675768902276
tritonbench_layer_norm_fwd[x_(4096, 13824)-liger_layer_norm]_speedup 1.5595980467872133 1.5654630366189963 1.0037605778257193 0.3760577825719258
tritonbench_cross_entropy_fwd[x_(8, 2048, 32768)-liger_cross_entropy_loss]_speedup 1.5672431964551443 1.573185119612579 1.0037913217111896 0.3791321711189566
tritonbench_addmm_fwd[x_(20067, 512, 1536)-triton_addmm]_speedup 0.9643555725955517 0.9680599689479571 1.003841317930517 0.38413179305170075
tritonbench_softmax_fwd[x_(4096, 10112)-triton_softmax]_speedup 4.723815039281187 4.742738630635089 1.0040059975245732 0.40059975245732016
tritonbench_softmax_fwd[x_(4096, 11264)-triton_softmax]_speedup 4.74410991006552 4.763389958154881 1.0040639969256309 0.4063996925630864
tritonbench_softmax_fwd[x_(4096, 9728)-triton_softmax]_speedup 4.759988411338069 4.780213519193115 1.0042489825829977 0.42489825829976624
tritonbench_softmax_fwd[x_(4096, 8448)-triton_softmax]_speedup 4.829355993790675 4.850016638947823 1.0042781367088514 0.42781367088513544
tritonbench_swiglu_fwd[x_(4, 4096, 4096)-liger_swiglu]_speedup 1.0253479737420317 1.029935114502727 1.0044737405038744 0.44737405038743905
tritonbench_softmax_fwd[x_(4096, 6400)-triton_softmax]_speedup 4.736075201701759 4.757289835405113 1.0044793701112962 0.4479370111296177
tritonbench_softmax_fwd[x_(4096, 9344)-triton_softmax]_speedup 4.778900354366162 4.80077550035501 1.0045774434214476 0.45774434214476134
tritonbench_softmax_fwd[x_(4096, 11776)-triton_softmax]_speedup 4.724915907675665 4.746988041032503 1.0046714341139875 0.4671434113987516
tritonbench_gemm_fwd[x_(640, 640, 640)-triton_tutorial_matmul]_speedup 0.7801857869680611 0.7838709671362366 1.0047234648845589 0.4723464884558881
tritonbench_softmax_fwd[x_(4096, 1280)-triton_softmax]_speedup 3.8212558063546456 3.839743856433038 1.0048382131464864 0.48382131464863587
tritonbench_softmax_fwd[x_(4096, 10240)-triton_softmax]_speedup 4.7278449154269415 4.750892570752123 1.0048748754955936 0.48748754955936224
tritonbench_softmax_fwd[x_(4096, 6912)-triton_softmax]_speedup 4.71230134788611 4.735458010338793 1.0049140877764684 0.4914087776468401
tritonbench_softmax_fwd[x_(4096, 8704)-triton_softmax]_speedup 4.806738796801383 4.830622124987059 1.0049687177097222 0.49687177097221813
tritonbench_addmm_fwd[x_(20224, 512, 1536)-triton_addmm]_speedup 0.9456732125776249 0.9505688411409998 1.0051768713528755 0.5176871352875478
tritonbench_kl_div_fwd[x_(8, 512, 4096)-liger_kl_div]_speedup 3.378340418494809 3.3963067459922427 1.0053180926939975 0.5318092693997523
tritonbench_softmax_fwd[x_(4096, 10752)-triton_softmax]_speedup 4.765968692811354 4.7917434325137975 1.0054080799442346 0.5408079944234645
tritonbench_addmm_fwd[x_(35916, 512, 1536)-triton_addmm]_speedup 0.9936724913049736 0.9992676293895483 1.005630766810528 0.5630766810527987
tritonbench_softmax_fwd[x_(4096, 12160)-triton_softmax]_speedup 4.713819820437702 4.740697181991932 1.0057018219995804 0.5701821999580359
tritonbench_softmax_fwd[x_(4096, 7168)-triton_softmax]_speedup 4.698772099813405 4.726291597497385 1.005856742378519 0.5856742378518964
tritonbench_gemm_fwd[x_(3328, 3328, 3328)-triton_tutorial_matmul]_speedup 0.8480888249786462 0.8531017721820877 1.0059108752005637 0.5910875200563659
tritonbench_rms_norm_fwd[x_average-liger_rms]_speedup 3.823401588007052 3.8461253503667536 1.005943336538589 0.5943336538589028
tritonbench_addmm_fwd[x_(20068, 512, 1536)-triton_addmm]_speedup 0.9354207960694176 0.941293062469078 1.0062776735607495 0.6277673560749486
tritonbench_fp8_gemm_blockwise_fwd[x_(8, 8192, 6656)-_triton]_speedup 0.6842993402782463 0.6886632750171143 1.006377230667932 0.6377230667931943
tritonbench_layer_norm_fwd[x_(4096, 5632)-liger_layer_norm]_speedup 1.4376743793168425 1.4469272967907743 1.0064360314178573 0.6436031417857313
tritonbench_softmax_fwd[x_(4096, 7296)-triton_softmax]_speedup 4.713908058993972 4.744495155284071 1.0064886917409726 0.6488691740972552
tritonbench_softmax_fwd[x_(4096, 6272)-triton_softmax]_speedup 4.694673098802581 4.727826146374548 1.0070618436841583 0.7061843684158298
tritonbench_gemm_fwd[x_(3584, 3584, 3584)-triton_tutorial_matmul]_speedup 0.9203578001767542 0.9269044054406707 1.0071131089046665 0.7113108904666543
tritonbench_rope_fwd[x_(8192, 2048)-liger_rotary_pos_emb]_speedup 2.7812572340747423 2.8015301502456733 1.0072891194394233 0.7289119439423342
tritonbench_addmm_fwd[x_(35380, 512, 1536)-triton_addmm]_speedup 0.9708029246700546 0.9779032117366308 1.007313829497361 0.7313829497360924
tritonbench_cross_entropy_bwd[x_(8, 2048, 32768)-liger_cross_entropy_loss]_speedup 2.0552548539204967 2.0703779040822834 1.0073582359545041 0.7358235954504133
tritonbench_softmax_fwd[x_(4096, 5376)-triton_softmax]_speedup 4.688833915949177 4.72355309805354 1.0074046517165527 0.740465171655269
tritonbench_softmax_fwd[x_(4096, 6784)-triton_softmax]_speedup 4.695564642000827 4.731657909012764 1.007686672373561 0.768667237356091
tritonbench_addmm_fwd[x_(34181, 512, 1536)-triton_addmm]_speedup 0.9731969042601993 0.9809594741113515 1.0079763610192052 0.797636101920518
tritonbench_geglu_fwd[x_(8, 4096, 4096)-liger_geglu]_speedup 0.954548038814724 0.9625739390022466 1.0084080631473389 0.8408063147338884
tritonbench_fp8_gemm_blockwise_fwd[x_(4096, 2304, 13312)-_triton]_speedup 1.0539714623873773 1.0628738835015592 1.0084465485374878 0.844654853748783
tritonbench_addmm_fwd[x_(34238, 512, 1536)-triton_addmm]_speedup 0.9607605158251642 0.9688888827567239 1.0084603465667805 0.8460346566780519
tritonbench_rope_bwd[x_(8192, 2048)-liger_rotary_pos_emb]_speedup 3.622706284472685 3.6538819615315137 1.0086056319808345 0.8605631980834527
tritonbench_gemm_fwd[x_(2048, 2048, 2048)-triton_tutorial_matmul]_speedup 0.948164129761847 0.9563794834815925 1.0086644848311326 0.866448483113258
tritonbench_welford_fwd[x_3072-test_welford]_speedup 0.6320777366511622 0.6375825919542389 1.0087091428536026 0.8709142853602581
tritonbench_addmm_fwd[x_(19632, 512, 1536)-triton_addmm]_speedup 0.9677419617551467 0.9763500231086938 1.0088949964905263 0.8894996490526275
tritonbench_rope_bwd[x_(8192, 4096)-liger_rotary_pos_emb]_speedup 3.6390519976385236 3.6717788002594793 1.0089932220375508 0.8993222037550819
tritonbench_layer_norm_fwd[x_(4096, 7168)-liger_layer_norm]_speedup 1.529836614127413 1.543926730737488 1.0092102100838474 0.921021008384737
tritonbench_addmm_fwd[x_(33660, 512, 1536)-triton_addmm]_speedup 1.0754609716402375 1.0853970846130292 1.0092389340336894 0.9238934033689361
tritonbench_layer_norm_fwd[x_(4096, 11264)-liger_layer_norm]_speedup 1.4705882796996042 1.4842767991442158 1.0093081929412682 0.9308192941268167
tritonbench_gemm_fwd[x_(3840, 3840, 3840)-triton_tutorial_matmul]_speedup 0.8356968585206377 0.8437144094564455 1.0095938507534905 0.9593850753490507
tritonbench_addmm_fwd[x_(36032, 512, 1536)-triton_addmm]_speedup 1.0209654286796948 1.0307768695450366 1.0096099638535556 0.9609963853555614
tritonbench_addmm_fwd[x_(27456, 512, 1536)-triton_addmm]_speedup 0.9811715444777841 0.9906400958542367 1.0096502506923926 0.9650250692392559
tritonbench_cross_entropy_fwd[x_average-liger_cross_entropy_loss]_speedup 1.1071305427328388 1.1180387949988726 1.0098527245387954 0.9852724538795377
tritonbench_embedding_fwd[x_(8, 2048, 4096, 1024)-liger_embedding]_speedup 1.0056649210152915 1.0163735433788625 1.0106483005818279 1.0648300581827863
tritonbench_gemm_fwd[x_(2816, 2816, 2816)-triton_tutorial_matmul]_speedup 0.8953938922197581 0.9050966584712193 1.0108363105173828 1.0836310517382763
tritonbench_addmm_fwd[x_(19410, 512, 1536)-triton_addmm]_speedup 0.9406250011368684 0.9509569358001994 1.0109841165723252 1.098411657232523
tritonbench_gemm_fwd[x_(3968, 3968, 3968)-triton_tutorial_matmul]_speedup 0.9233261606744085 0.9335506901344728 1.0110735836322413 1.1073583632241313
tritonbench_addmm_fwd[x_(35410, 512, 1536)-triton_addmm]_speedup 0.9987959858720636 1.0107003570485003 1.0119187214854921 1.1918721485492112
tritonbench_addmm_fwd[x_(35561, 512, 1536)-triton_addmm]_speedup 0.9832442229393471 0.9950762798319264 1.0120336907316965 1.2033690731696467
tritonbench_addmm_fwd[x_(20120, 512, 1536)-triton_addmm]_speedup 0.9256942419076779 0.9370048255167637 1.0122184875923792 1.2218487592379201
tritonbench_addmm_fwd[x_(35678, 512, 1536)-triton_addmm]_speedup 0.9997579237684245 1.0120600887587325 1.0123051437731416 1.2305143773141625
tritonbench_flash_attention_bwd[x_(4, 48, 512, 512, 64)-triton_tutorial_flash_v2]_speedup 1.187992438988762 1.202978791738349 1.0126148553288299 1.2614855328829888
tritonbench_addmm_fwd[x_(35917, 512, 1536)-triton_addmm]_speedup 1.0231046437803402 1.0362694199317053 1.0128674776635962 1.2867477663596194
tritonbench_addmm_fwd[x_(35656, 512, 1536)-triton_addmm]_speedup 0.9672170169227955 0.9803294459145921 1.0135568634157346 1.3556863415734588
tritonbench_addmm_fwd[x_(35405, 512, 1536)-triton_addmm]_speedup 0.9934513769083727 1.007358293402241 1.0139985879703006 1.3998587970300624
tritonbench_addmm_fwd[x_(33894, 512, 1536)-triton_addmm]_speedup 0.9668874512566172 0.9804853316109574 1.0140635606933026 1.4063560693302612
tritonbench_flash_attention_bwd[x_(4, 48, 1024, 1024, 64)-triton_tutorial_flash_v2]_speedup 1.0355204247925078 1.0505172483618674 1.014482402481211 1.448240248121091
tritonbench_addmm_fwd[x_(20211, 512, 1536)-triton_addmm]_speedup 0.9688958359887372 0.9830441266404071 1.0146024888601486 1.4602488860148588
tritonbench_addmm_fwd[x_average-triton_addmm]_speedup 0.976674213890852 0.9911525119898416 1.0148240814522085 1.4824081452208482
tritonbench_softmax_fwd[x_(4096, 4096)-triton_softmax]_speedup 4.770261391391121 4.842377041394896 1.0151177564680045 1.5117756468004506
tritonbench_addmm_fwd[x_(33961, 512, 1536)-triton_addmm]_speedup 0.9787490455686836 0.9935531374577089 1.015125523704009 1.5125523704009103
tritonbench_softmax_fwd[x_(4096, 256)-triton_softmax]_speedup 3.4013843353309237 3.453608094825279 1.0153536778987002 1.5353677898700235
tritonbench_rms_norm_fwd[x_(2048, 8192)-liger_rms]_speedup 4.0773583474471655 4.143132265590126 1.0161315029335456 1.6131502933545594
tritonbench_flash_attention_fwd[x_(4, 48, 256, 256, 64)-triton_tutorial_flash_v2]_speedup 1.53685906973334 1.5619967345596684 1.016356519163914 1.6356519163913896
tritonbench_layer_norm_fwd[x_(4096, 4608)-liger_layer_norm]_speedup 1.2598958786185914 1.2809047154389495 1.0166750579765313 1.667505797653135
tritonbench_flash_attention_bwd[x_(4, 48, 256, 256, 64)-triton_tutorial_flash_v2]_speedup 1.1660351178870558 1.1857693065401007 1.0169241803701459 1.692418037014587
tritonbench_addmm_fwd[x_(35844, 512, 1536)-triton_addmm]_speedup 0.9718855037594868 0.9884264650184711 1.017019454652837 1.7019454652837052
tritonbench_welford_fwd[x_2560-test_welford]_speedup 0.6247007314039265 0.6353532249754771 1.0170521547935611 1.7052154793561147
tritonbench_flash_attention_bwd[x_(4, 48, 2048, 2048, 64)-triton_tutorial_flash_v2]_speedup 0.8925026471777469 0.9083892772478239 1.0178000929412516 1.7800092941251622
tritonbench_addmm_fwd[x_(35605, 512, 1536)-triton_addmm]_speedup 1.0019441217758047 1.0200049199779049 1.0180257539413375 1.8025753941337541
tritonbench_softmax_fwd[x_(4096, 896)-triton_softmax]_speedup 3.641237019616522 3.707216423825704 1.018120052019061 1.8120052019060973
tritonbench_softmax_fwd[x_(4096, 3584)-triton_softmax]_speedup 4.704447641458184 4.79024044470145 1.0182365305731562 1.8236530573156218
tritonbench_addmm_fwd[x_(34533, 512, 1536)-triton_addmm]_speedup 0.9953894899082237 1.0136476889854003 1.0183427685969038 1.8342768596903758
tritonbench_addmm_fwd[x_(35503, 512, 1536)-triton_addmm]_speedup 0.9831284491089135 1.0014673625430912 1.018653629086616 1.8653629086615942
tritonbench_addmm_fwd[x_(34839, 512, 1536)-triton_addmm]_speedup 0.9865853205617965 1.0057001093005653 1.0193746940486446 1.9374694048644558
tritonbench_embedding_fwd[x_average-liger_embedding]_speedup 1.0554445315973007 1.0759858979400547 1.0194622888534624 1.946228885346235
tritonbench_gemm_fwd[x_(3072, 3072, 3072)-triton_tutorial_matmul]_speedup 0.7108400254661341 0.7247222556736624 1.0195293310874343 1.9529331087434265
tritonbench_addmm_fwd[x_(15168, 512, 1536)-triton_addmm]_speedup 1.0528301505510524 1.0733996056912702 1.019537296808466 1.9537296808465943
tritonbench_layer_norm_fwd[x_(4096, 3072)-liger_layer_norm]_speedup 1.2915643225648366 1.317195300397192 1.0198449100711116 1.9844910071111554
tritonbench_addmm_fwd[x_(33887, 512, 1536)-triton_addmm]_speedup 0.951427847907326 0.9703666585587187 1.019905671978226 1.9905671978226014
tritonbench_addmm_fwd[x_(34308, 512, 1536)-triton_addmm]_speedup 0.9288480337079285 0.9474858153242829 1.0200654799708764 2.0065479970876376
tritonbench_fp8_gemm_blockwise_fwd[x_average-_triton]_speedup 0.7893782863407978 0.8052217833653433 1.0200708548723691 2.007085487236915
tritonbench_softmax_fwd[x_(4096, 2432)-triton_softmax]_speedup 4.396335690154272 4.488188543312112 1.0208930481272271 2.0893048127227143
tritonbench_addmm_fwd[x_(34579, 512, 1536)-triton_addmm]_speedup 0.9865853205617965 1.0074570623383048 1.0211555365172302 2.1155536517230233
tritonbench_embedding_fwd[x_(32, 512, 768, 65536)-liger_embedding]_speedup 1.021826978973435 1.0439469775502799 1.0216474990698203 2.164749906982033
tritonbench_addmm_fwd[x_(35504, 512, 1536)-triton_addmm]_speedup 0.9699355696334175 0.9921530703386081 1.0229061613995531 2.290616139955315
tritonbench_gemm_fwd[x_(2432, 2432, 2432)-triton_tutorial_matmul]_speedup 0.7775446078052627 0.7956359681092646 1.0232672956926132 2.326729569261321
tritonbench_rms_norm_fwd[x_(2048, 4096)-liger_rms]_speedup 4.187429980134267 4.286210915292205 1.023589871502704 2.3589871502704085
tritonbench_addmm_fwd[x_(19735, 512, 1536)-triton_addmm]_speedup 0.9394177786802789 0.9619652124805717 1.0240014978553718 2.4001497855371756
tritonbench_gemm_fwd[x_(3456, 3456, 3456)-triton_tutorial_matmul]_speedup 0.9290246842066845 0.9513232119492733 1.0240020831756802 2.40020831756802
tritonbench_softmax_fwd[x_(4096, 2560)-triton_softmax]_speedup 4.475117508427814 4.583173957871456 1.0241460585649749 2.414605856497487
tritonbench_softmax_fwd[x_(4096, 512)-triton_softmax]_speedup 3.837349559712514 3.9307230645141638 1.0243328118402237 2.4332811840223734
tritonbench_fp8_gemm_blockwise_fwd[x_(16384, 4096, 16384)-_triton]_speedup 1.2194818878551292 1.2494677504980742 1.0245890184524882 2.4589018452488176
tritonbench_addmm_fwd[x_(35791, 512, 1536)-triton_addmm]_speedup 0.988625444897008 1.0130541139681863 1.0247097312711015 2.4709731271101543
tritonbench_rope_fwd[x_average-liger_rotary_pos_emb]_speedup 2.8457374437689253 2.9181759066525537 1.0254550759917225 2.5455075991722476
tritonbench_addmm_fwd[x_(35249, 512, 1536)-triton_addmm]_speedup 0.9779820968841367 1.0029629854475677 1.0255432984336015 2.554329843360148
tritonbench_addmm_fwd[x_(34516, 512, 1536)-triton_addmm]_speedup 0.9625332333799332 0.9871889797853208 1.0256154754457765 2.561547544577647
tritonbench_embedding_fwd[x_(32, 512, 768, 16384)-liger_embedding]_speedup 1.073636395578419 1.10205994081131 1.0264740887603554 2.6474088760355396
tritonbench_rope_fwd[x_(2048, 2048)-liger_rotary_pos_emb]_speedup 3.0217040638397648 3.1063478403326963 1.028011934558996 2.8011934558995977
tritonbench_embedding_fwd[x_(32, 512, 768, 8192)-liger_embedding]_speedup 1.0875763279052024 1.1188222148338822 1.0287298336006108 2.8729833600610766
tritonbench_welford_fwd[x_7168-test_welford]_speedup 0.6535596026749115 0.6730924202432981 1.0298868190268216 2.9886819026821554
tritonbench_addmm_fwd[x_(19747, 512, 1536)-triton_addmm]_speedup 0.9352686987937017 0.9634485501182053 1.0301302196479467 3.013021964794671
tritonbench_flash_attention_fwd[x_(4, 48, 4096, 4096, 64)-triton_tutorial_flash_v2]_speedup 0.8091591199657246 0.8339146685645856 1.0305941662004743 3.059416620047428
tritonbench_addmm_fwd[x_(35884, 512, 1536)-triton_addmm]_speedup 0.9643202187036429 0.9942899844440457 1.0310786449968785 3.1078644996878513
tritonbench_softmax_fwd[x_(4096, 1792)-triton_softmax]_speedup 4.220402943636216 4.355297148975286 1.0319623995956289 3.1962399595628854
tritonbench_rope_fwd[x_(8192, 1024)-liger_rotary_pos_emb]_speedup 2.8334756794862375 2.924669544603786 1.0321844531003999 3.2184453100399857
tritonbench_flash_attention_fwd[x_(4, 48, 16384, 16384, 64)-triton_tutorial_flash_v2]_speedup 0.8471208324311976 0.874834584203284 1.0327152287030283 3.2715228703028343
tritonbench_embedding_fwd[x_(32, 512, 768, 1024)-liger_embedding]_speedup 0.9932156880461214 1.0259209119083823 1.032928621905479 3.292862190547896
tritonbench_addmm_fwd[x_(35541, 512, 1536)-triton_addmm]_speedup 0.9864439698779935 1.0202419553590478 1.0342624482617442 3.4262448261744183
tritonbench_flash_attention_bwd[x_average-triton_tutorial_flash_v2]_speedup 0.9270976882359618 0.9588675771972968 1.0342681136674876 3.426811366748761
tritonbench_embedding_fwd[x_(32, 512, 768, 32768)-liger_embedding]_speedup 1.044725786397817 1.081650472774965 1.0353439025416067 3.5343902541606687
tritonbench_softmax_fwd[x_(4096, 1408)-triton_softmax]_speedup 3.956780941059147 4.09861328542786 1.035845387066272 3.584538706627205
tritonbench_gemm_fwd[x_(1664, 1664, 1664)-triton_tutorial_matmul]_speedup 0.8002662671032473 0.8290712869917317 1.0359942947398633 3.5994294739863264
tritonbench_flex_attention_fwd[x_ (8, 16, 2048, 16, 2048, 128) | noop-compiled]_speedup 28.56819643229996 29.63050695034608 1.0371850746883358 3.718507468833576
tritonbench_flash_attention_fwd[x_(4, 48, 8192, 8192, 64)-triton_tutorial_flash_v2]_speedup 0.8033807376466122 0.8354446620616406 1.0399112437136033 3.991124371360333
tritonbench_cross_entropy_fwd[x_(8, 2048, 4096)-liger_cross_entropy_loss]_speedup 0.6066079613770747 0.6319600871162818 1.041793262458433 4.179326245843296
tritonbench_flex_attention_fwd[x_ (8, 16, 4096, 16, 4096, 128) | noop-compiled]_speedup 31.275174513001716 32.626690198489314 1.0432136896606523 4.321368966065231
tritonbench_embedding_fwd[x_(32, 512, 768, 131072)-liger_embedding]_speedup 0.989763793313489 1.0340357278066306 1.0447297979500036 4.472979795000365
tritonbench_cross_entropy_fwd[x_(8, 2048, 8192)-liger_cross_entropy_loss]_speedup 1.0198186000834084 1.0666786849424028 1.045949431452968 4.594943145296804
tritonbench_welford_fwd[x_8192-test_welford]_speedup 0.6756810326049216 0.7071695152662154 1.046602584861525 4.660258486152502
tritonbench_rms_norm_fwd[x_(2048, 32768)-liger_rms]_speedup 3.486346391622217 3.6491844635919373 1.0467073703178273 4.670737031782735
tritonbench_embedding_fwd[x_(32, 512, 768, 4096)-liger_embedding]_speedup 1.0583523795810728 1.1116333308686848 1.0503432999401412 5.0343299940141195
tritonbench_gemm_fwd[x_(1920, 1920, 1920)-triton_tutorial_matmul]_speedup 0.9384098409927033 0.9895349390481165 1.0544805646979685 5.448056469796847
tritonbench_softmax_fwd[x_(4096, 768)-triton_softmax]_speedup 3.646258458506749 3.8601893404120466 1.058671343334479 5.867134333447899
tritonbench_flash_attention_bwd[x_(4, 48, 4096, 4096, 64)-triton_tutorial_flash_v2]_speedup 0.7650221036392292 0.810132938705313 1.0589667080878977 5.896670808789772
tritonbench_softmax_fwd[x_(4096, 1024)-triton_softmax]_speedup 3.6956520638796606 3.927165261853107 1.062644749552101 6.264474955210098
tritonbench_embedding_fwd[x_(32, 512, 768, 2048)-liger_embedding]_speedup 1.0024722067409508 1.072351369109586 1.0697068326670254 6.970683266702538
tritonbench_softmax_fwd[x_(4096, 384)-triton_softmax]_speedup 3.5155278874343403 3.8394648321585056 1.0921446095996061 9.214460959960613
tritonbench_gemm_fwd[x_(256, 256, 256)-triton_tutorial_matmul]_speedup 0.8652173803010095 0.9504504669853755 1.0985106039533248 9.851060395332478
tritonbench_flex_attention_fwd[x_ (8, 16, 1024, 16, 1024, 128) | noop-compiled]_speedup 30.97561112599094 34.16689586844004 1.1030257233495342 10.30257233495342
tritonbench_rope_fwd[x_(512, 2048)-liger_rotary_pos_emb]_speedup 2.7627119388936148 3.1051724131009792 1.123958082413945 12.395808241394501
tritonbench_rms_norm_bwd[x_average-liger_rms]_speedup 0.7644359463282543 0.8662195324343752 1.1331486131637436 13.31486131637436
tritonbench_flash_attention_bwd[x_(4, 48, 128, 128, 64)-triton_tutorial_flash_v2]_speedup 0.9088146036474087 1.0638205973095058 1.1705584318737845 17.05584318737845
tritonbench_rms_norm_bwd[x_(2048, 4096)-liger_rms]_speedup 0.9124579177369803 1.0770083158592108 1.1803375201459567 18.03375201459567
tritonbench_fp8_gemm_blockwise_fwd[x_(128, 8192, 2304)-_triton]_speedup 0.5843780401856269 0.6916099466462786 1.1834974949205648 18.34974949205648
tritonbench_fp8_gemm_blockwise_fwd[x_(4, 13312, 2048)-_triton]_speedup 0.6565566384407119 0.7775446078052627 1.1842765152019343 18.427651520193432
tritonbench_rope_bwd[x_(512, 2048)-liger_rotary_pos_emb]_speedup 2.2880597028756555 2.802905985395061 1.2250143568685474 22.50143568685474
tritonbench_fp8_gemm_blockwise_fwd[x_(4, 4096, 2304)-_triton]_speedup 0.5023298795703591 0.6164705490810757 1.2272225367289333 22.722253672893334
tritonbench_fp8_gemm_blockwise_fwd[x_(128, 2304, 6656)-_triton]_speedup 0.7445652213210681 1.0165631522376146 1.365311087770049 36.531108777004896
tritonbench_rms_norm_bwd[x_(2048, 1024)-liger_rms]_speedup 0.4071146095013766 0.598566305701288 1.470264863337615 47.0264863337615
tritonbench_rms_norm_bwd[x_(2048, 2048)-liger_rms]_speedup 0.5594948988718965 0.8888091723246544 1.5885920928264952 58.859209282649516
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment