This repository has been archived by the owner on Nov 4, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor: move JuliaSIMD deps to extensions (#175)
* fix: remove LV.vmap! usage
* fix: remove LV handling for bias_activation
* fix: remove LV usage in dropout
* refactor: move LV and octavian behind an extension
* docs: add docs for loading packages
* refactor: move SLEEFPirates to an ext
* fix: enzyme rules for batched matmul
* fix: patch more enzyme issues
* feat: add a preference to disable loop vectorization
* fix: incorrect dispatch called
* fix: enzyme segfault bypass
- Loading branch information
Showing 26 changed files with 354 additions and 304 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
name = "LuxLib" | ||
uuid = "82251201-b29d-42c6-8e01-566dec8acb11" | ||
authors = ["Avik Pal <[email protected]> and contributors"] | ||
version = "1.3.3" | ||
version = "1.3.4" | ||
|
||
[deps] | ||
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" | ||
|
@@ -15,16 +15,14 @@ ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" | |
Hwloc = "0e44f5e4-bd66-52a0-8798-143a42290a1d" | ||
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" | ||
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" | ||
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" | ||
LuxCore = "bb33d45b-7691-41d6-9220-0943567d0623" | ||
MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40" | ||
Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" | ||
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" | ||
Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4" | ||
Preferences = "21216c6a-2e73-6563-6e65-726566657250" | ||
Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588" | ||
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" | ||
Reexport = "189a3867-3050-52da-a836-e630ba90ab69" | ||
SLEEFPirates = "476501e8-09a2-5ece-8869-fb82de89a1fa" | ||
Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3" | ||
StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" | ||
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" | ||
|
@@ -36,7 +34,10 @@ BLISBLAS = "6f275bd8-fec0-4d39-945b-7e95a765fa1e" | |
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" | ||
MKL = "33e6dc65-8f57-5167-99aa-e5a354878fb2" | ||
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" | ||
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" | ||
Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4" | ||
ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" | ||
SLEEFPirates = "476501e8-09a2-5ece-8869-fb82de89a1fa" | ||
Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" | ||
cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" | ||
|
||
|
@@ -46,7 +47,10 @@ LuxLibBLISBLASExt = "BLISBLAS" | |
LuxLibCUDAExt = "CUDA" | ||
LuxLibMKLExt = "MKL" | ||
LuxLibEnzymeExt = "Enzyme" | ||
LuxLibLoopVectorizationExt = "LoopVectorization" | ||
LuxLibOctavianExt = ["Octavian", "LoopVectorization"] | ||
LuxLibReverseDiffExt = "ReverseDiff" | ||
LuxLibSLEEFPiratesExt = "SLEEFPirates" | ||
LuxLibTrackerAMDGPUExt = ["AMDGPU", "Tracker"] | ||
LuxLibTrackerExt = "Tracker" | ||
LuxLibcuDNNExt = ["CUDA", "cuDNN"] | ||
|
@@ -75,6 +79,7 @@ MLDataDevices = "1.2" | |
Markdown = "1.10" | ||
NNlib = "0.9.24" | ||
Octavian = "0.3.28" | ||
Preferences = "1.4.3" | ||
Polyester = "0.7.15" | ||
Random = "1.10" | ||
Reexport = "1" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
module LuxLibLoopVectorizationExt | ||
|
||
using LoopVectorization: LoopVectorization, @tturbo, @turbo, indices | ||
using Polyester: @batch | ||
using Static: True | ||
|
||
using LuxLib: LuxLib, Utils | ||
|
||
# Signal to LuxLib (via the static `True`) that the LoopVectorization extension is loaded.
function Utils.is_extension_loaded(::Val{:LoopVectorization})
    return True()
end
|
||
# When the extension is loaded (`::True`), forward all arguments to
# `LoopVectorization.check_args` and return its result.
function Utils.can_loopvec_args_check(::True, args...)
    return LoopVectorization.check_args(args...)
end
|
||
# matmul | ||
# Generates two in-place kernels with identical bodies:
#   * `serial_matmul_loopvec!` -- `@turbo thread=false`
#   * `matmul_loopvec!`        -- `@turbo thread=true`
# Each overwrites `C` with `α * A * B + β * C` and returns `nothing`, matching the
# other in-place kernels in this extension (`matmuladd_loopvec!`).
for serial in (true, false)
    opname = serial ? :serial_matmul_loopvec! : :matmul_loopvec!
    @eval @inline function LuxLib.Impl.$(opname)(
            C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α::Number, β::Number)
        if !iszero(β) # Special case this because Base.FastMath.mul_fast(NaN, false) = NaN
            # General update: the previous contents of `C` contribute through `β`.
            @turbo thread=$(!serial) for K in indices((C, B), 2), J in indices((C, A), 1)
                Cⱼₖ = zero(eltype(C))
                for I in indices((A, B), (2, 1))
                    Cⱼₖ += A[J, I] * B[I, K]
                end
                C[J, K] = α * Cⱼₖ + β * C[J, K]
            end
        else
            # β == 0: never read `C`, so stale values (e.g. NaN) cannot leak into
            # the result.
            @turbo thread=$(!serial) for K in indices((C, B), 2), J in indices((C, A), 1)
                Cⱼₖ = zero(eltype(C))
                for I in indices((A, B), (2, 1))
                    Cⱼₖ += A[J, I] * B[I, K]
                end
                C[J, K] = α * Cⱼₖ
            end
        end
        return
    end
end
|
||
# Fused matmul + bias: overwrites `C` with `A * B`, adding `bias[J]` to every entry
# of row `J`. Uses the threaded `@tturbo` loop and returns `nothing`.
@inline function LuxLib.Impl.matmuladd_loopvec!(
        C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, bias::AbstractVector)
    @tturbo for K in indices((C, B), 2), J in indices((C, A), 1)
        acc = zero(eltype(C))
        for I in indices((A, B), (2, 1))
            acc += A[J, I] * B[I, K]
        end
        C[J, K] = bias[J] + acc
    end
    return nothing
end
|
||
# batched matmul | ||
# Batched matmul: for every index along dimension 3 of `z`, runs the serial
# LoopVectorization kernel on the matching batch views, with the batch loop
# parallelised by Polyester's `@batch`.
#
# Three layouts are handled:
#   * `x` and `y` have the same batch size -> pair the views index by index;
#   * `x` has a single batch              -> reuse `batchview(x, 1)` for every index;
#   * otherwise                           -> reuse `batchview(y, 1)` for every index
#     (the caller is expected to guarantee `size(y, 3) == 1` in this case).
function LuxLib.Impl.batched_matmul_loopvec_impl!(
        z::AbstractArray{zT, 3}, x::AbstractArray{xT, 3},
        y::AbstractArray{yT, 3}, α::Number=true, β::Number=false) where {zT, xT, yT}
    nx, ny = size(x, 3), size(y, 3)
    if nx == ny
        @batch for b in axes(z, 3)
            LuxLib.Impl.serial_matmul_loopvec!(
                Utils.batchview(z, b), Utils.batchview(x, b),
                Utils.batchview(y, b), α, β)
        end
    elseif nx == 1
        @batch for b in axes(z, 3)
            LuxLib.Impl.serial_matmul_loopvec!(
                Utils.batchview(z, b), Utils.batchview(x, 1),
                Utils.batchview(y, b), α, β)
        end
    else # has to be ny == 1
        @batch for b in axes(z, 3)
            LuxLib.Impl.serial_matmul_loopvec!(
                Utils.batchview(z, b), Utils.batchview(x, b),
                Utils.batchview(y, 1), α, β)
        end
    end
end
|
||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
module LuxLibOctavianExt | ||
|
||
using Octavian: Octavian | ||
using Static: True | ||
|
||
using LuxLib: LuxLib, Utils | ||
|
||
# Signal to LuxLib (via the static `True`) that the Octavian extension is loaded.
function Utils.is_extension_loaded(::Val{:Octavian})
    return True()
end
|
||
# In-place matmul delegated to `Octavian.matmul!(C, A, B, α, β)`. Whatever Octavian
# returns is discarded; this method always returns `nothing`.
@inline function LuxLib.Impl.matmul_octavian!(
        C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix,
        α::Number, β::Number)
    Octavian.matmul!(C, A, B, α, β)
    return nothing
end
|
||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
module LuxLibSLEEFPiratesExt | ||
|
||
using ChainRulesCore: ChainRulesCore | ||
using NNlib: NNlib | ||
using SLEEFPirates: SLEEFPirates | ||
|
||
using LuxLib: Numeric, Impl | ||
|
||
const CRC = ChainRulesCore | ||
|
||
# Scalar wrappers that forward directly to the SLEEFPirates kernels. These are
# functions local to this module; `tanh` is not imported from Base here.
function sigmoid_fast(x::Number)
    return SLEEFPirates.sigmoid_fast(x)
end
function softplus(x::Number)
    return SLEEFPirates.softplus(x)
end
function tanh(x::Number)
    return SLEEFPirates.tanh(x)
end
function tanh_fast(x::Number)
    return SLEEFPirates.tanh_fast(x)
end

# Activations composed from the wrappers above.
function logsigmoid(x::Number)
    return -softplus(-x)
end
function swish(x::Number)
    return Base.FastMath.mul_fast(x, sigmoid_fast(x))
end
function lisht(x::Number)
    return Base.FastMath.mul_fast(x, tanh_fast(x))
end
|
||
# For every activation defined in this module, register
#   1. a scalar rule via `CRC.@scalar_rule`, and
#   2. an `rrule` for `Broadcast.broadcasted(act, x)`.
# In each derivative expression `x` is the input and `Ω` the primal output.
for (act, deriv) in [
    #! format: off
    (:sigmoid_fast, :(conj(Base.FastMath.mul_fast(Ω, Base.FastMath.sub_fast(1, Ω))))),
    (:softplus, :(sigmoid_fast(x))),
    (:logsigmoid, :(sigmoid_fast(-x))),
    (:swish, :(Base.FastMath.add_fast(Ω, Base.FastMath.mul_fast(sigmoid_fast(x), Base.FastMath.sub_fast(1, Ω))))),
    (:lisht, :(Base.FastMath.add_fast(x, Base.FastMath.mul_fast(tanh_fast(x), Base.FastMath.sub_fast(1, Ω))))),
    (:tanh, :(conj(Base.FastMath.sub_fast(1, Base.FastMath.mul_fast(Ω, Ω))))),
    (:tanh_fast, :(conj(Base.FastMath.sub_fast(1, Base.FastMath.mul_fast(Ω, Ω)))))
    #! format: on
]
    @eval CRC.@scalar_rule($act(x), $(deriv))

    # The pullback is named `∇broadcasted_<activation>`.
    pullback = Symbol(:∇broadcasted_, act)
    @eval function CRC.rrule(::typeof(Broadcast.broadcasted), ::typeof($act),
            x::Union{Numeric, Broadcast.Broadcasted})
        Ω = $(act).(x)
        function $(pullback)(dΩ)
            # In-place accumulation into an existing tangent buffer.
            add! = dx -> @.(dx += dΩ * $(deriv))
            # Deferred out-of-place tangent.
            lazy = CRC.@thunk @.(dΩ * $(deriv))
            return CRC.NoTangent(), CRC.NoTangent(), CRC.InplaceableThunk(add!, lazy)
        end
        return Ω, $(pullback)
    end
end
|
||
# Register, for each stock activation on the left, the SLEEFPirates-backed function
# from this module that `Impl.sleefpirates_fast_act` should return for it.
for (stock, fast) in (
    #! format: off
    (NNlib.sigmoid_fast, sigmoid_fast),
    (NNlib.softplus,     softplus),
    (NNlib.logsigmoid,   logsigmoid),
    (NNlib.swish,        swish),
    (NNlib.lisht,        lisht),
    (Base.tanh,          tanh),
    (NNlib.tanh_fast,    tanh_fast),
    #! format: on
)
    @eval function Impl.sleefpirates_fast_act(::typeof($stock))
        return $fast
    end
end
|
||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
98a2d7a
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@JuliaRegistrator register
98a2d7a
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Registration pull request created: JuliaRegistries/General/117587
Tip: Release Notes
Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.
To add them here just re-invoke and the PR will be updated.
Tagging
After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.
This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:
98a2d7a
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LuxLib Benchmarks
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
6417
ns5375
ns1.19
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
6041
ns5250
ns1.15
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
7167
ns7708.5
ns0.93
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
5292
ns5416
ns0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
103542
ns113361
ns0.91
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU
637131
ns601544
ns1.06
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
10166.5
ns9729.5
ns1.04
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
9958
ns9938
ns1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
10291.5
ns10167
ns1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
9979.5
ns11063
ns0.90
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
494284
ns544547
ns0.91
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
719725
ns629346
ns1.14
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s)
1583
ns1500
ns1.06
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s)
1542
ns1458
ns1.06
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s)
1666
ns1771
ns0.94
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s)
1500
ns1583
ns0.95
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA
20684
ns20770
ns1.00
bias_activation(32, act=relu)(32 x 128)/forward/GPU/AMDGPU
33302
ns30997
ns1.07
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
3812.5
ns4104
ns0.93
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
4125
ns4500
ns0.92
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
4250
ns4500
ns0.94
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
4334
ns4333
ns1.00
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA
134278.5
ns134970
ns0.99
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/AMDGPU
143062.5
ns138579
ns1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
58000
ns57666.5
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
46417
ns46875
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
46875
ns47125
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
83750
ns81458
ns1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
37449
ns36587
ns1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
70883
ns69420
ns1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2037500
ns2030375
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2083416.5
ns2088625
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2090916.5
ns2086625
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1996979.5
ns1998562
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
220080
ns217216
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1213928
ns930850
ns1.30
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
173708
ns175083
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
146625
ns147291
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
165062.5
ns150021
ns1.10
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
172000
ns151750
ns1.13
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
167869.5
ns166825
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
196051.5
ns262570
ns0.75
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1113854.5
ns1115103.5
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1110541
ns1110771
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1118667
ns1113771
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1124479.5
ns1136250
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
644177
ns639845.5
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
899376
ns864075
ns1.04
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
5333
ns3792
ns1.41
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
4875
ns4479
ns1.09
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
6750
ns6583
ns1.03
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
4416
ns6375
ns0.69
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
83066
ns85209.5
ns0.97
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU
64020
ns59531
ns1.08
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8584
ns8417
ns1.02
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8750
ns8750
ns1
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8875
ns9042
ns0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8584
ns8958
ns0.96
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
552192.5
ns557500.5
ns0.99
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
372446
ns370833
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
17229.5
ns17958
ns0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
17250
ns16458
ns1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
21542
ns21125
ns1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
17208.5
ns17292
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
63166
ns63776.5
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
79573.5
ns82870
ns0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
220583
ns212625
ns1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
218875
ns213042
ns1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
223125
ns212771
ns1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
219625
ns212291
ns1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
329089
ns329859
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
423777
ns405232
ns1.05
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s)
583
ns667
ns0.87
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s)
625
ns625
ns1
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s)
833
ns875
ns0.95
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s)
834
ns709
ns1.18
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA
19066
ns19101
ns1.00
bias_activation(2, act=relu)(2 x 128)/forward/GPU/AMDGPU
27311
ns26409
ns1.03
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
1417
ns1458
ns0.97
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
1417
ns1334
ns1.06
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
1583
ns1583
ns1
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
1375
ns1375
ns1
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA
116071.5
ns117126.5
ns0.99
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/AMDGPU
118732
ns115676
ns1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7375
ns7375
ns1
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6000
ns6041
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
6083
ns6084
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10334
ns9958
ns1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
24482
ns23587
ns1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
52122
ns52723
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
229541.5
ns229167
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
268417
ns230667
ns1.16
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
241500
ns267875
ns0.90
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
251250
ns257458
ns0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
189293
ns182744
ns1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
588480
ns548449.5
ns1.07
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s)
3917
ns3917
ns1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s)
3917
ns3958
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s)
3958
ns3958
ns1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s)
4042
ns3917
ns1.03
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA
23660.5
ns22860
ns1.04
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/AMDGPU
43502
ns39504
ns1.10
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
16833
ns17042
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
16834
ns16875
ns1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
16959
ns17083
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
16666
ns16875
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA
188039
ns185787.5
ns1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/AMDGPU
166010.5
ns162052
ns1.02
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
929291
ns491583
ns1.89
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
838708
ns385625
ns2.17
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
841584
ns386458
ns2.18
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
1269208
ns844083
ns1.50
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA
113941
ns113763
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/AMDGPU
396441
ns388657
ns1.02
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
2610729.5
ns2155583
ns1.21
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
2330541.5
ns1863374.5
ns1.25
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
2324458
ns1865167
ns1.25
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
3478334
ns3377520.5
ns1.03
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA
232093
ns229580
ns1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/AMDGPU
630643.5
ns610962
ns1.03
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
6000
ns6500
ns0.92
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
7042
ns5500
ns1.28
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
7333.5
ns7667
ns0.96
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
6584
ns5167
ns1.27
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
82915
ns84720.5
ns0.98
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU
62131.5
ns59932
ns1.04
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
11875
ns11229
ns1.06
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
11417
ns11395.5
ns1.00
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
12417
ns12334
ns1.01
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
9813
ns10667
ns0.92
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
585345.5
ns602168
ns0.97
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
388046
ns383917
ns1.01
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s)
500
ns500
ns1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s)
542
ns500
ns1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s)
542
ns542
ns1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s)
500
ns500
ns1
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA
23179.5
ns23328
ns0.99
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/AMDGPU
41949
ns41367
ns1.01
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
2083
ns2084
ns1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
2250
ns2166
ns1.04
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
2167
ns2167
ns1
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
2083
ns2084
ns1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA
226220
ns228927.5
ns0.99
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/AMDGPU
166171
ns165900
ns1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
8583
ns9584
ns0.90
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
8542
ns8333
ns1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
10709
ns9895.5
ns1.08
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
8833
ns8542
ns1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
100758
ns105241
ns0.96
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU
72575
ns71955
ns1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
17228.5
ns17688
ns0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
18583
ns16666.5
ns1.11
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
18500
ns18708
ns0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
17750
ns17562
ns1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
582511
ns595171
ns0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
371318.5
ns358129
ns1.04
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
459
ns542
ns0.85
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
625
ns458
ns1.36
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
583
ns583
ns1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
500
ns458
ns1.09
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
34079
ns34578
ns0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU
44423
ns41387
ns1.07
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
9479
ns9229
ns1.03
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
9750
ns8958.5
ns1.09
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
10333
ns9750
ns1.06
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
9562.5
ns8104
ns1.18
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
262881
ns257823
ns1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU
351422
ns349944
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s)
396583
ns397270.5
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s)
288042
ns288083
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s)
287666
ns288666.5
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s)
756167
ns751792
ns1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA
112987
ns112022
ns1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/AMDGPU
77780.5
ns74609
ns1.04
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s)
1455709
ns1454270.5
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s)
1130291
ns1130500
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s)
1133250
ns1131583
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s)
2358000
ns2437959
ns0.97
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA
202802
ns200057
ns1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/AMDGPU
268682
ns302285
ns0.89
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
7354.5
ns7750
ns0.95
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
8000
ns7083.5
ns1.13
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
8687.5
ns8312.5
ns1.05
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
7750
ns6687.5
ns1.16
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
137305
ns139766
ns0.98
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU
64461
ns60383
ns1.07
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
12812.5
ns13479.5
ns0.95
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
15041.5
ns12750
ns1.18
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
15353.5
ns15125
ns1.02
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
12333.5
ns14625.5
ns0.84
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
906003
ns923489
ns0.98
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
413373
ns407432
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
26000
ns25625
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
27562.5
ns23666
ns1.16
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
27042
ns29417
ns0.92
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
26021
ns24041
ns1.08
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
186382.5
ns186240.5
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
146484
ns120505
ns1.22
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
146500
ns152187
ns0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
157750
ns145250
ns1.09
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
129416
ns146917
ns0.88
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
155812.5
ns103958
ns1.50
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1016426
ns1013659
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
551090
ns535240
ns1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
84667
ns74583
ns1.14
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
80167
ns79584
ns1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
78063
ns76791.5
ns1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
80521
ns76083
ns1.06
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
190829
ns190594.5
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
124858.5
ns121316.5
ns1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
219479
ns273562.5
ns0.80
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
281750
ns304084
ns0.93
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
278146
ns303333
ns0.92
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
320791.5
ns307583
ns1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1021778
ns1045024
ns0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
643542
ns624192
ns1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
13125
ns12417
ns1.06
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
13666.5
ns12896
ns1.06
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
14041.5
ns14000
ns1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
13459
ns12500
ns1.08
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
136741.5
ns138416
ns0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU
226473
ns226152
ns1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
27083.5
ns27792
ns0.97
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
26125
ns26458
ns0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
27833.5
ns28437.5
ns0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
26604.5
ns33937.5
ns0.78
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
919419
ns924126.5
ns0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
633979.5
ns610976
ns1.04
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
14000
ns11124.5
ns1.26
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
14708.5
ns10333
ns1.42
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
17583.5
ns12479.5
ns1.41
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
14792
ns11125
ns1.33
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
119245
ns118543.5
ns1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU
233827
ns233176
ns1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
26875
ns22291.5
ns1.21
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
25958.5
ns22417
ns1.16
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
26583
ns24167
ns1.10
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
26541
ns28562.5
ns0.93
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
676576
ns668341
ns1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
589361.5
ns569113
ns1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
182375
ns68709
ns2.65
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
183208
ns62750
ns2.92
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
185583
ns67520.5
ns2.75
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
183459
ns64417
ns2.85
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
102955
ns102389
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
232900.5
ns230751
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
583500
ns506375
ns1.15
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
595083
ns510167
ns1.17
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
597520.5
ns475209
ns1.26
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
624167
ns647896
ns0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
493717.5
ns492781
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
657463
ns593680
ns1.11
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
6750
ns7958
ns0.85
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
7645.5
ns6750
ns1.13
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
8167
ns8208
ns1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
7542
ns7562.5
ns1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
135360
ns137965
ns0.98
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU
62767
ns62687
ns1.00
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
15375
ns16125
ns0.95
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
14917
ns16250
ns0.92
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
16187.5
ns16250
ns1.00
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
15292
ns14833
ns1.03
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
885601
ns900927
ns0.98
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU
392428
ns388286
ns1.01
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s)
6153416.5
ns6150354
ns1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s)
6381624.5
ns6368167
ns1.00
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s)
6371521
ns6373937.5
ns1.00
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s)
11926500
ns11915167
ns1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA
346494
ns345749
ns1.00
batchedmm(512, Bsize=4)/forward/GPU/AMDGPU
392843
ns388426
ns1.01
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s)
19117208.5
ns19083437.5
ns1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s)
19977084
ns19960479.5
ns1.00
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s)
19957021
ns19966834
ns1.00
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s)
36558729
ns37142104
ns0.98
batchedmm(512, Bsize=4)/zygote/GPU/CUDA
1005649
ns1072087
ns0.94
batchedmm(512, Bsize=4)/zygote/GPU/AMDGPU
1105996
ns1035750.5
ns1.07
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
1750
ns958
ns1.83
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
1834
ns1000
ns1.83
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
1833
ns1042
ns1.76
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
1834
ns958
ns1.91
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA
23503
ns23415
ns1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/AMDGPU
197739
ns200906
ns0.98
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
4834
ns3917
ns1.23
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
4958
ns4000
ns1.24
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
4917
ns4041
ns1.22
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
4916
ns5458
ns0.90
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA
276337.5
ns270573.5
ns1.02
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/AMDGPU
502208
ns486775
ns1.03
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
8062.5
ns8687
ns0.93
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
8416
ns7459
ns1.13
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
9459
ns9334
ns1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
8145.5
ns7834
ns1.04
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
115989
ns116220
ns1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU
71584
ns71133
ns1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
11562.5
ns12125
ns0.95
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
12438
ns11958
ns1.04
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
12541
ns13000
ns0.96
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
12875
ns11750
ns1.10
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
604320
ns609643.5
ns0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
353160
ns341729
ns1.03
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s)
250
ns292
ns0.86
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s)
333
ns292
ns1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s)
333
ns292
ns1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s)
292
ns291
ns1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA
22648
ns22413
ns1.01
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/AMDGPU
43592
ns44053
ns0.99
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
2917
ns3000
ns0.97
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
2917
ns2917
ns1
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
3041
ns3208
ns0.95
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
3000
ns2916
ns1.03
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA
197848
ns194923.5
ns1.02
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/AMDGPU
146363.5
ns154488.5
ns0.95
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
14604
ns11625
ns1.26
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
15458.5
ns10500
ns1.47
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
15896
ns12875
ns1.23
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
15000.5
ns11875
ns1.26
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
117481
ns115370
ns1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU
236802
ns231793
ns1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
26500
ns22667
ns1.17
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
25625
ns22104.5
ns1.16
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
26041.5
ns23625
ns1.10
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
25958
ns26729
ns0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
561217
ns555861
ns1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
566814
ns545740
ns1.04
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s)
4291
ns4334
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s)
4209
ns4333
ns0.97
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s)
4208
ns4208
ns1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s)
4375
ns4250
ns1.03
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA
24363
ns23923
ns1.02
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/AMDGPU
44754
ns44864
ns1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
16250
ns16500
ns0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
16125
ns16333
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
16292
ns16166
ns1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
16416
ns16292
ns1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA
321227
ns319806
ns1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/AMDGPU
190786
ns186077
ns1.03
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
5916
ns2125
ns2.78
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
5875
ns2084
ns2.82
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
5792
ns2209
ns2.62
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
5750
ns2000
ns2.88
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
34700.5
ns35327
ns0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU
200434
ns199242
ns1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
22292
ns17104
ns1.30
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
21292
ns20167
ns1.06
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
21792
ns19000
ns1.15
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
22208
ns23083.5
ns0.96
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
283315.5
ns284984
ns0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
598489
ns583431
ns1.03
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s)
59729
ns59458
ns1.00
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s)
64229
ns65666
ns0.98
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s)
66833
ns66125
ns1.01
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s)
50958
ns52833
ns0.96
batchedmm(16, Bsize=512)/forward/GPU/CUDA
66908
ns66304
ns1.01
batchedmm(16, Bsize=512)/forward/GPU/AMDGPU
115781
ns110241
ns1.05
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s)
198937.5
ns153041
ns1.30
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s)
144625
ns155229
ns0.93
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s)
167291.5
ns130209
ns1.28
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s)
303249.5
ns286334
ns1.06
batchedmm(16, Bsize=512)/zygote/GPU/CUDA
208882.5
ns210129.5
ns0.99
batchedmm(16, Bsize=512)/zygote/GPU/AMDGPU
529218
ns511145
ns1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
84291
ns106521
ns0.79
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
83875
ns78958
ns1.06
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
88125
ns84042
ns1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
81562.5
ns115521
ns0.71
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
193291
ns191513.5
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
182771
ns267630
ns0.68
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1875250
ns1894896
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1914792
ns1902375
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1928375
ns1878334
ns1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1916625
ns1895250
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
505449
ns507442
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
857542
ns825763
ns1.04
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s)
292
ns292
ns1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s)
292
ns292
ns1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s)
292
ns292
ns1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s)
333
ns291
ns1.14
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA
21535
ns21516
ns1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/AMDGPU
36788
ns35507
ns1.04
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s)
1833
ns1792
ns1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s)
1875
ns1875
ns1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s)
1834
ns1834
ns1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s)
1834
ns1833
ns1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA
243998
ns245735
ns0.99
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/AMDGPU
166221
ns164548
ns1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
11229
ns10916
ns1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
9791.5
ns8291
ns1.18
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
11125
ns11146
ns1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
10479.5
ns9500
ns1.10
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
114440.5
ns114788
ns1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU
233386
ns232004
ns1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
10458
ns8916
ns1.17
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
10250
ns8854.5
ns1.16
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
9917
ns10917
ns0.91
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
10145.5
ns9583
ns1.06
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
491014
ns491693
ns1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
561274
ns536332
ns1.05
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
58375
ns57958
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
46917
ns46625
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
46625
ns46750
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
83708
ns83166
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
38960
ns38476.5
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
72876
ns71814
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1897625
ns1905145.5
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1964750
ns1949542
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1985854
ns1958500
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1899833
ns1874958
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
212091
ns212675
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
994598
ns968925.5
ns1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
266354
ns267500
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
269729
ns271479.5
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
271041.5
ns271209
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
268271
ns268209
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
193629.5
ns194219.5
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
271156
ns271267
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
693917
ns585333.5
ns1.19
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
692541
ns600292
ns1.15
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
687708
ns671042
ns1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
593833
ns845604.5
ns0.70
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
991006
ns991966
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
863163
ns831153
ns1.04
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
2180687.5
ns2211666
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
2214917
ns2203958
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
2212041
ns2229083
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
2208479
ns2173792
ns1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
154859
ns161646
ns0.96
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
451844.5
ns470965
ns0.96
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5453666
ns5493104.5
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5518208
ns5515875
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5522375
ns5526542
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5522209
ns6852458
ns0.81
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
930442
ns959137
ns0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1495900
ns1437405
ns1.04
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
999875
ns478292
ns2.09
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
913333
ns345625
ns2.64
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
912895.5
ns346750
ns2.63
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
1334562.5
ns908542
ns1.47
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA
46425
ns46909
ns0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/AMDGPU
399125
ns393175
ns1.02
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
2620166
ns2137500
ns1.23
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
2328541
ns1869334
ns1.25
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
2329395.5
ns1859271
ns1.25
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
3468667
ns3380209
ns1.03
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA
247327
ns264095.5
ns0.94
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/AMDGPU
658089
ns632907.5
ns1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
58083
ns57458
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
46625
ns46166
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
46542
ns46250
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
84000
ns78667
ns1.07
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
29007
ns28560
ns1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
73392
ns73147
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2036000
ns2029292
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2096916
ns2078187.5
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2092208
ns2063250
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1992542
ns1963958
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
225482
ns230846.5
ns0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1028937.5
ns980522
ns1.05
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
58417
ns58083.5
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
47208
ns46584
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
47375
ns46917
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
83541
ns79958
ns1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
48550
ns48944
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
71593.5
ns71428.5
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1926354.5
ns1871729
ns1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1987291
ns1973604
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1972375
ns1944167
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1890375
ns1876792
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
231977
ns238010
ns0.97
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
931260
ns881607.5
ns1.06
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
292
ns292
ns1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
375
ns292
ns1.28
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
375
ns375
ns1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
333
ns291
ns1.14
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
33752
ns34878
ns0.97
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU
44343
ns47028
ns0.94
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
6542
ns6270.5
ns1.04
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7187.5
ns6187.5
ns1.16
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7625
ns7375
ns1.03
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
6209
ns6125
ns1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
203191.5
ns211705.5
ns0.96
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU
350064
ns332741
ns1.05
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s)
250
ns250
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s)
292
ns291
ns1.00
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s)
292
ns292
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s)
292
ns250
ns1.17
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA
32755
ns32902
ns1.00
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/AMDGPU
36558
ns36327
ns1.01
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s)
3375
ns2667
ns1.27
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s)
3333
ns2667
ns1.25
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s)
3000
ns4292
ns0.70
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s)
3208
ns3167
ns1.01
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA
185298.5
ns187662.5
ns0.99
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/AMDGPU
144480
ns136635
ns1.06
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1465479.5
ns467208
ns3.14
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1410667
ns469417
ns3.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1427770.5
ns466875
ns3.06
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1410417
ns464979.5
ns3.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
136084
ns137312
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
354201
ns361475
ns0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5012687.5
ns4027749.5
ns1.24
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5023959
ns4071500
ns1.23
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5034167
ns4067417
ns1.24
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5021667
ns5516750
ns0.91
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
673868
ns690445
ns0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1145811
ns1091915
ns1.05
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s)
49876625
ns49879250
ns1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s)
35509791
ns35487583
ns1.00
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s)
35514916
ns35512833.5
ns1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s)
97103375
ns96974083
ns1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA
1608361
ns1622377
ns0.99
batchedmm(512, Bsize=32)/forward/GPU/AMDGPU
1576726
ns1579230
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s)
154443875
ns154423062.5
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s)
112320833.5
ns112364750
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s)
112445042
ns112377416
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s)
296071750
ns299989812
ns0.99
batchedmm(512, Bsize=32)/zygote/GPU/CUDA
6483041.5
ns6468945
ns1.00
batchedmm(512, Bsize=32)/zygote/GPU/AMDGPU
6222525
ns7230228
ns0.86
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s)
48042
ns19104.5
ns2.51
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s)
47667
ns18375
ns2.59
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s)
47916
ns17375.5
ns2.76
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s)
47583
ns15083
ns3.15
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA
19626
ns19621
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/AMDGPU
28463
ns28854
ns0.99
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s)
50583.5
ns11062.5
ns4.57
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s)
50167
ns8833
ns5.68
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s)
51000
ns9291
ns5.49
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s)
50667
ns17667
ns2.87
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA
245482
ns252067.5
ns0.97
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/AMDGPU
140773
ns138484
ns1.02
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
8667
ns7937.5
ns1.09
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
8750
ns8125
ns1.08
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
11167
ns10375
ns1.08
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
9666.5
ns8708
ns1.11
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
118847
ns120230.5
ns0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU
237489
ns235119
ns1.01
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
10791
ns9708
ns1.11
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
10458
ns9084
ns1.15
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
10333
ns9792
ns1.06
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
10709
ns10667
ns1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
584310
ns599437
ns0.97
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
572469
ns557070
ns1.03
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
9125
ns9291.5
ns0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
9896
ns8812.5
ns1.12
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
10667
ns9917
ns1.08
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
9292
ns8958.5
ns1.04
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
115727.5
ns118821
ns0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU
73908
ns71593
ns1.03
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
13874.5
ns13687.5
ns1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
13750
ns13604.5
ns1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
14333
ns14395.5
ns1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
14375.5
ns14750
ns0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
559680.5
ns570663
ns0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
337060
ns323504
ns1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
959
ns542
ns1.77
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
1042
ns625
ns1.67
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
1042
ns584
ns1.78
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
1083
ns500
ns2.17
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
33675
ns35088
ns0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU
206546
ns203871
ns1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
8917
ns7562.5
ns1.18
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
8437.5
ns7667
ns1.10
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
8791
ns7875
ns1.12
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
9250
ns8520.5
ns1.09
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
225862.5
ns227876
ns0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
576667
ns569945
ns1.01
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
23667
ns16458
ns1.44
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
23292
ns17041
ns1.37
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
23813
ns16209
ns1.47
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
23666
ns10979
ns2.16
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA
20529
ns20941
ns0.98
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/AMDGPU
187811
ns182992
ns1.03
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
53583.5
ns35666
ns1.50
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
52145.5
ns35167
ns1.48
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
53584
ns36000
ns1.49
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
53667
ns57833
ns0.93
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA
260507
ns265749
ns0.98
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/AMDGPU
549086
ns534293
ns1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1444541.5
ns447500
ns3.23
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1445459
ns488042
ns2.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1414666.5
ns455709
ns3.10
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1401396
ns496916
ns2.82
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
195236
ns195513
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
321861
ns328714
ns0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5007208
ns4024209
ns1.24
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5006958
ns4055021
ns1.23
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5015812.5
ns4053917
ns1.24
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5020500
ns5501562.5
ns0.91
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
510108
ns521631.5
ns0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1117899
ns1059038
ns1.06
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s)
828285625
ns836727208
ns0.99
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s)
541921375
ns553913292
ns0.98
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s)
542359625
ns540736625
ns1.00
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s)
1558200021
ns1517196875
ns1.03
batchedmm(512, Bsize=512)/forward/GPU/CUDA
22535776.5
ns22767789
ns0.99
batchedmm(512, Bsize=512)/forward/GPU/AMDGPU
12173703
ns10331681
ns1.18
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s)
3903695416
ns3773348667
ns1.03
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s)
1771980416
ns1782084291
ns0.99
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s)
1773568584
ns1780399750
ns1.00
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s)
5228367459
ns4786718666
ns1.09
batchedmm(512, Bsize=512)/zygote/GPU/CUDA
119027931
ns118657187
ns1.00
batchedmm(512, Bsize=512)/zygote/GPU/AMDGPU
68450588
ns67063298
ns1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
75916.5
ns76542
ns0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
87437.5
ns76584
ns1.14
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
84417
ns79583
ns1.06
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
81083
ns76708.5
ns1.06
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
192111.5
ns195943.5
ns0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
126607
ns123300.5
ns1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
282646
ns191292
ns1.48
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
283042
ns252042
ns1.12
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
236875
ns199562.5
ns1.19
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
276458
ns225542
ns1.23
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
995625
ns1004442
ns0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
612404
ns590764
ns1.04
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s)
199947208.5
ns199694520.5
ns1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s)
139420500
ns138856500
ns1.00
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s)
138954958
ns139241166
ns1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s)
389188834
ns393790959
ns0.99
batchedmm(512, Bsize=128)/forward/GPU/CUDA
5832800
ns5842492
ns1.00
batchedmm(512, Bsize=128)/forward/GPU/AMDGPU
2958637.5
ns4746717.5
ns0.62
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s)
618298396
ns617676375.5
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s)
439277916
ns439446917
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s)
439303895.5
ns439765166.5
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s)
1200068000
ns1174222000
ns1.02
batchedmm(512, Bsize=128)/zygote/GPU/CUDA
26614249.5
ns26723523
ns1.00
batchedmm(512, Bsize=128)/zygote/GPU/AMDGPU
16011697.5
ns15854720
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7417
ns7292
ns1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6125
ns6125
ns1
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
6125
ns5959
ns1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10125
ns9834
ns1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
26885
ns26896.5
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
54341
ns55173
ns0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
214083
ns213041.5
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
232833
ns227729
ns1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
230000
ns220416.5
ns1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
207709
ns206125
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
215596
ns219868
ns0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
546726.5
ns541982
ns1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
7417
ns8521
ns0.87
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
8875.5
ns7458
ns1.19
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
10750
ns11167
ns0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
10459
ns9250
ns1.13
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
111291
ns115361
ns0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU
72956
ns74069
ns0.98
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7792
ns7562.5
ns1.03
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7833.5
ns7958
ns0.98
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
8125
ns8167
ns0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
8375
ns7395.5
ns1.13
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
492517.5
ns495697
ns0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
322723
ns309298
ns1.04
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
417
ns417
ns1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
500
ns459
ns1.09
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
459
ns500
ns0.92
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
583
ns375
ns1.55
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
25272
ns26124
ns0.97
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU
45194
ns45334
ns1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
9646
ns9584
ns1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
9541
ns9062.5
ns1.05
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
11104
ns9792
ns1.13
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
10333
ns9542
ns1.08
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
247083
ns247606
ns1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU
383457
ns382304
ns1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
351000
ns112312.5
ns3.13
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
354459
ns103229
ns3.43
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
352250
ns104104.5
ns3.38
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
351625
ns155083
ns2.27
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA
23168
ns23501
ns0.99
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/AMDGPU
198701
ns192539
ns1.03
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
826000
ns536562
ns1.54
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
820458
ns554250
ns1.48
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
822083.5
ns535291.5
ns1.54
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
827750
ns910854
ns0.91
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA
214195.5
ns221242
ns0.97
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/AMDGPU
578901
ns560216.5
ns1.03
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s)
5229.5
ns5416.5
ns0.97
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s)
5875
ns6208.5
ns0.95
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s)
6958.5
ns6021
ns1.16
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s)
4667
ns4000
ns1.17
batchedmm(16, Bsize=32)/forward/GPU/CUDA
17091
ns17520
ns0.98
batchedmm(16, Bsize=32)/forward/GPU/AMDGPU
74219
ns73648
ns1.01
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s)
13458.5
ns11562.5
ns1.16
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s)
10625
ns11062
ns0.96
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s)
13041
ns11000
ns1.19
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s)
18542
ns16666
ns1.11
batchedmm(16, Bsize=32)/zygote/GPU/CUDA
202239.5
ns207455.5
ns0.97
batchedmm(16, Bsize=32)/zygote/GPU/AMDGPU
330217
ns330387
ns1.00
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s)
39833.5
ns39667
ns1.00
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s)
51209
ns51291
ns1.00
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s)
52458.5
ns52958.5
ns0.99
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s)
13459
ns13625
ns0.99
batchedmm(16, Bsize=128)/forward/GPU/CUDA
19993
ns20356
ns0.98
batchedmm(16, Bsize=128)/forward/GPU/AMDGPU
99666.5
ns98364
ns1.01
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s)
38229.5
ns36375.5
ns1.05
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s)
35125
ns31417
ns1.12
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s)
34187.5
ns31229.5
ns1.09
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s)
59417
ns57000
ns1.04
batchedmm(16, Bsize=128)/zygote/GPU/CUDA
178995.5
ns184178
ns0.97
batchedmm(16, Bsize=128)/zygote/GPU/AMDGPU
362888
ns355254
ns1.02
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s)
3500
ns1750
ns2
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s)
3667
ns2042
ns1.80
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s)
3833
ns2208
ns1.74
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s)
3709
ns1875
ns1.98
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA
19015
ns19575
ns0.97
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/AMDGPU
29645
ns29099.5
ns1.02
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s)
4291
ns2208
ns1.94
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s)
4500
ns2167
ns2.08
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s)
4458
ns2375
ns1.88
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s)
4292
ns2208
ns1.94
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA
194611
ns198996.5
ns0.98
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/AMDGPU
126757
ns128571
ns0.99
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
5916
ns4583
ns1.29
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
5062.5
ns4417
ns1.15
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
6375
ns6729
ns0.95
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
4625
ns3958
ns1.17
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
138395
ns143699.5
ns0.96
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU
65944
ns61955.5
ns1.06
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
9625
ns8334
ns1.15
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8500
ns8083.5
ns1.05
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
9333
ns8709
ns1.07
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
10666
ns8583
ns1.24
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
807046.5
ns836045.5
ns0.97
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU
378457
ns364891
ns1.04
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
207583
ns54833
ns3.79
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
209042
ns55833
ns3.74
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
213208
ns55583
ns3.84
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
204125
ns56000
ns3.65
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
35332
ns36570
ns0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
203930.5
ns202568
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
603500
ns476729
ns1.27
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
623479.5
ns494500
ns1.26
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
658604.5
ns494208
ns1.33
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
586375
ns641625
ns0.91
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
254148
ns259886
ns0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
767213
ns705894
ns1.09
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s)
3324167
ns3310333
ns1.00
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s)
2328667
ns2334062.5
ns1.00
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s)
2334417
ns2333375
ns1.00
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s)
6324542
ns6300479
ns1.00
batchedmm(128, Bsize=128)/forward/GPU/CUDA
206559
ns204581.5
ns1.01
batchedmm(128, Bsize=128)/forward/GPU/AMDGPU
377105
ns373097
ns1.01
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s)
11496208.5
ns11459729
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s)
8303562.5
ns8305729.5
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s)
8348416.5
ns8342854
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s)
21193020.5
ns21088292
ns1.00
batchedmm(128, Bsize=128)/zygote/GPU/CUDA
736080.5
ns744676
ns0.99
batchedmm(128, Bsize=128)/zygote/GPU/AMDGPU
2044820.5
ns1994797.5
ns1.03
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
3917
ns4833
ns0.81
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
5292
ns4646
ns1.14
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
6292
ns7520.5
ns0.84
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
7125
ns4917
ns1.45
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
129442
ns133339
ns0.97
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU
57067
ns61520
ns0.93
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
8500
ns7083
ns1.20
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7375
ns7291.5
ns1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7833
ns7500
ns1.04
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
8291.5
ns7416.5
ns1.12
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
711410
ns725863
ns0.98
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU
364581
ns353680
ns1.03
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
117312.5
ns100459
ns1.17
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
101437.5
ns123042
ns0.82
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
102687.5
ns102417
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
98458.5
ns121458.5
ns0.81
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
149616
ns151940.5
ns0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
210473
ns233346
ns0.90
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2008250
ns2033271
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2022459
ns2026417
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2039937.5
ns1997458.5
ns1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2036625
ns2041833
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
661994.5
ns678763
ns0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
963831
ns931831
ns1.03
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s)
33416
ns32666
ns1.02
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s)
35459
ns36562.5
ns0.97
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s)
34709
ns36167
ns0.96
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s)
750
ns667
ns1.12
batchedmm(2, Bsize=4)/forward/GPU/CUDA
15265
ns15627
ns0.98
batchedmm(2, Bsize=4)/forward/GPU/AMDGPU
78737
ns70121
ns1.12
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s)
3959
ns2604.5
ns1.52
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s)
2917
ns2958
ns0.99
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s)
4708
ns2937.5
ns1.60
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s)
3666
ns2167
ns1.69
batchedmm(2, Bsize=4)/zygote/GPU/CUDA
136137.5
ns139744
ns0.97
batchedmm(2, Bsize=4)/zygote/GPU/AMDGPU
321796.5
ns289641
ns1.11
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7250
ns7208
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6042
ns6000
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
6083
ns5916
ns1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10042
ns9917
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
34970
ns35855
ns0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
56516
ns53911
ns1.05
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
221584
ns212958.5
ns1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
220959
ns222708
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
234583
ns219917
ns1.07
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
207333
ns206209
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
237194
ns243430
ns0.97
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
540189
ns513269
ns1.05
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s)
3750
ns3750
ns1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s)
3750
ns3750
ns1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s)
3833
ns3750
ns1.02
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s)
3958
ns3791
ns1.04
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA
21681
ns21959
ns0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/AMDGPU
39383
ns35557
ns1.11
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s)
14458
ns14500
ns1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s)
14458
ns14500
ns1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s)
14541
ns14500
ns1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s)
14625
ns14459
ns1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA
297631.5
ns302419
ns0.98
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/AMDGPU
190215
ns179841
ns1.06
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
129834
ns128041
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
118271
ns144417
ns0.82
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
106750
ns106917
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
101666.5
ns151959
ns0.67
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
150106
ns140874
ns1.07
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
241781
ns236762
ns1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1921708.5
ns1924583
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1924583
ns1920500
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1932000
ns1914229.5
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1922750
ns1928875
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
653385
ns673452
ns0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
928325
ns899671
ns1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
18875
ns17333
ns1.09
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
17292
ns17354.5
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
20937
ns21208
ns0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
18459
ns17375
ns1.06
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
104073.5
ns108833.5
ns0.96
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
91301
ns91100
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
239083.5
ns216917
ns1.10
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
224791
ns252646
ns0.89
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
224958.5
ns222166
ns1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
218500
ns229125
ns0.95
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
493640.5
ns508535.5
ns0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
439080
ns419764
ns1.05
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s)
26166
ns24271
ns1.08
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s)
29167
ns30791.5
ns0.95
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s)
28958
ns29437.5
ns0.98
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s)
1416
ns1584
ns0.89
batchedmm(16, Bsize=4)/forward/GPU/CUDA
15781
ns16398
ns0.96
batchedmm(16, Bsize=4)/forward/GPU/AMDGPU
72756
ns76093
ns0.96
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s)
6208
ns4500
ns1.38
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s)
5041
ns4916
ns1.03
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s)
6875
ns5125
ns1.34
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s)
6417
ns4625
ns1.39
batchedmm(16, Bsize=4)/zygote/GPU/CUDA
199155.5
ns204364
ns0.97
batchedmm(16, Bsize=4)/zygote/GPU/AMDGPU
324216
ns331675
ns0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
221875
ns222666
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
223375
ns220666.5
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
225375
ns225667
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
223542
ns220583
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
216803
ns222506.5
ns0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
267771
ns267871
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
508542
ns495084
ns1.03
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
511042
ns511812.5
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
509500
ns500854
ns1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
557354
ns675750
ns0.82
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1017707.5
ns1053634
ns0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
811461
ns780999
ns1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
19104
ns20375
ns0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
19584
ns20000
ns0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
22063
ns23875
ns0.92
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
19792
ns18792
ns1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
111072
ns114286
ns0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
90009
ns89858
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
221854
ns212375
ns1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
220250
ns213041
ns1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
218166.5
ns214458
ns1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
220146
ns212541
ns1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
700847.5
ns727333.5
ns0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
494855
ns469036
ns1.06
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
6292
ns6666
ns0.94
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
7000
ns6604.5
ns1.06
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
7375
ns8750.5
ns0.84
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
6834
ns6208
ns1.10
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
130925
ns137142
ns0.95
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU
63498
ns60974
ns1.04
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
11041.5
ns9791
ns1.13
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
9959
ns10084
ns0.99
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
10895.5
ns10750
ns1.01
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
10459
ns10750
ns0.97
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
770540.5
ns794651.5
ns0.97
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU
375452
ns370101.5
ns1.01
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
4104
ns4666
ns0.88
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
7041
ns4708
ns1.50
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
7166
ns7437.5
ns0.96
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
6166
ns4917
ns1.25
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
131485.5
ns138544.5
ns0.95
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU
62607
ns59692
ns1.05
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7416.5
ns7458
ns0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7750
ns7166
ns1.08
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
8125
ns7791
ns1.04
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
8083
ns7708
ns1.05
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
737449
ns755761
ns0.98
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
380902
ns376523
ns1.01
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s)
14481917
ns14498417
ns1.00
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s)
10107542
ns10124125
ns1.00
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s)
10094750
ns10094833
ns1.00
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s)
27859959
ns27748583.5
ns1.00
batchedmm(128, Bsize=512)/forward/GPU/CUDA
533975
ns532665
ns1.00
batchedmm(128, Bsize=512)/forward/GPU/AMDGPU
867906.5
ns866850
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s)
46387667
ns46333437
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s)
33363354
ns33447541.5
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s)
33478875
ns33510458
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s)
85752792
ns85445667
ns1.00
batchedmm(128, Bsize=512)/zygote/GPU/CUDA
2651799
ns2636151
ns1.01
batchedmm(128, Bsize=512)/zygote/GPU/AMDGPU
5191497.5
ns5189385.5
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
185208.5
ns66458
ns2.79
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
185916
ns65687.5
ns2.83
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
188604
ns70500
ns2.68
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
187271
ns66500
ns2.82
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
117719.5
ns118172.5
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
236051
ns237313
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
634875
ns467958
ns1.36
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
627937.5
ns480333.5
ns1.31
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
601166
ns474916.5
ns1.27
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
587625
ns686583.5
ns0.86
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
694993
ns715446
ns0.97
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
698169.5
ns655875
ns1.06
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
541
ns542
ns1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
625
ns625
ns1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
584
ns583
ns1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
584
ns500
ns1.17
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
31826
ns32877
ns0.97
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU
48104.5
ns47579
ns1.01
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
9541
ns8750
ns1.09
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
9687.5
ns9208
ns1.05
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
10542
ns9104.5
ns1.16
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
10938
ns9750
ns1.12
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
276120
ns280778.5
ns0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
371078
ns355484
ns1.04
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
26250
ns9500
ns2.76
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
26333
ns9500
ns2.77
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
26583
ns9500
ns2.80
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
26458
ns9500
ns2.79
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA
22942
ns23273
ns0.99
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/AMDGPU
206526
ns200655
ns1.03
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
67125
ns50209
ns1.34
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
67333
ns50250
ns1.34
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
68792
ns50500
ns1.36
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
66875
ns72375
ns0.92
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA
273858
ns278469.5
ns0.98
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/AMDGPU
554115
ns491037
ns1.13
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
207166
ns54917
ns3.77
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
211667
ns55667
ns3.80
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
211167
ns55584
ns3.80
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
202875
ns56000
ns3.62
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
27563
ns28169
ns0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
206546
ns203240
ns1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
609937.5
ns518854
ns1.18
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
669750
ns500625
ns1.34
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
664812.5
ns497750
ns1.34
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
609042
ns643417
ns0.95
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
233231.5
ns238777
ns0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
798562
ns758938
ns1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
664875
ns655042
ns1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
636687.5
ns613083
ns1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
648791.5
ns652541
ns0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
629792
ns678416.5
ns0.93
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
185894.5
ns192069
ns0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
349393
ns269704
ns1.30
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2244229
ns2167104.5
ns1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2225354
ns2233125
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2256708
ns2241292
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2271792
ns2230208.5
ns1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
900927
ns929752.5
ns0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1235829
ns1217770.5
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
19333
ns19500
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
21166.5
ns19208.5
ns1.10
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
22375
ns23542
ns0.95
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
19958
ns20000
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
106770.5
ns111306
ns0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
89387
ns91551
ns0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
227250
ns220459
ns1.03
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
262312.5
ns226458
ns1.16
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
231250
ns223104.5
ns1.04
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
222770.5
ns219708
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
700957
ns714110
ns0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
516550
ns487481
ns1.06
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
500
ns625
ns0.80
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
584
ns583
ns1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
584
ns584
ns1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
584
ns500
ns1.17
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
22928
ns23491
ns0.98
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU
44243
ns43771
ns1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
9583
ns9417
ns1.02
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
9958.5
ns9291.5
ns1.07
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
13229.5
ns9708
ns1.36
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
10875
ns9646
ns1.13
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
258192
ns261581
ns0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
395479
ns381618
ns1.04
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
8062.5
ns8917
ns0.90
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
9208
ns7583
ns1.21
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
10459
ns11854.5
ns0.88
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
8333
ns9042
ns0.92
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
112863.5
ns115935.5
ns0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU
72315
ns70456.5
ns1.03
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7500
ns8125
ns0.92
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7750
ns7542
ns1.03
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
14875
ns8000
ns1.86
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
8917
ns7292
ns1.22
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
472419
ns484010
ns0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU
321811
ns302215
ns1.06
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
1979.5
ns1417
ns1.40
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
2500
ns1667
ns1.50
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
2542
ns1959
ns1.30
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
2416
ns1500
ns1.61
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA
19845
ns20030
ns0.99
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/AMDGPU
191508
ns184144
ns1.04
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
6666
ns3708
ns1.80
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
6459
ns3625
ns1.78
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
7292
ns3833
ns1.90
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
7292
ns4917
ns1.48
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA
208409
ns213101.5
ns0.98
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/AMDGPU
543621
ns524324.5
ns1.04
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s)
754167
ns148729
ns5.07
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s)
751000
ns128917
ns5.83
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s)
749375
ns129917
ns5.77
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s)
747104
ns235541
ns3.17
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA
22303
ns22778
ns0.98
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/AMDGPU
47829
ns46868
ns1.02
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s)
792250
ns143645.5
ns5.52
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s)
811750
ns130875
ns6.20
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s)
789500
ns138417
ns5.70
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s)
794229.5
ns290021
ns2.74
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA
206590.5
ns211960
ns0.97
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/AMDGPU
233541
ns223578
ns1.04
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7250
ns7167
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
5917
ns5958
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
6000
ns5958.5
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10209
ns10000
ns1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
32976
ns33236
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
57267
ns57207
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
228458.5
ns221249.5
ns1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
269270.5
ns238542
ns1.13
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
235021
ns264500
ns0.89
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
213146
ns213250
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
254662
ns259447
ns0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
552652
ns530542
ns1.04
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
12417
ns13209
ns0.94
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
13250
ns12166
ns1.09
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
14458
ns13584
ns1.06
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
13000
ns12667
ns1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
131273.5
ns135078
ns0.97
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU
231363
ns227730.5
ns1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
24854.5
ns23917
ns1.04
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
24916
ns24083.5
ns1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
25542
ns24750
ns1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
24458
ns30146
ns0.81
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
813324
ns833527
ns0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU
634495
ns615374.5
ns1.03
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
8875
ns9271
ns0.96
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
9958
ns9541
ns1.04
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
11167
ns10375
ns1.08
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
9542
ns9250
ns1.03
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
116553
ns119628
ns0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU
74930
ns74940
ns1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
13770.5
ns14041
ns0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
14917
ns13958
ns1.07
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
15916
ns14750
ns1.08
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
16437.5
ns13459
ns1.22
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
621843
ns638262
ns0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU
356836
ns344824
ns1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
9145.5
ns9666.5
ns0.95
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
9354
ns9208
ns1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
10750
ns10959
ns0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
10125
ns9083.5
ns1.11
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
116468
ns118521
ns0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU
74383.5
ns79399
ns0.94
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
12916
ns13416
ns0.96
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
12959
ns12416
ns1.04
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
20541
ns13479.5
ns1.52
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
14500
ns12708
ns1.14
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
515709
ns530027
ns0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU
328534
ns317163
ns1.04
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s)
31062
ns30896
ns1.01
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s)
33146
ns33813
ns0.98
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s)
30750
ns32249.5
ns0.95
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s)
1833
ns1875
ns0.98
batchedmm(2, Bsize=128)/forward/GPU/CUDA
16169
ns16425
ns0.98
batchedmm(2, Bsize=128)/forward/GPU/AMDGPU
77564
ns76663
ns1.01
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s)
5562.5
ns5417
ns1.03
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s)
5312.5
ns5000
ns1.06
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s)
7208
ns5479.5
ns1.32
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s)
7834
ns6270.5
ns1.25
batchedmm(2, Bsize=128)/zygote/GPU/CUDA
134922
ns138278
ns0.98
batchedmm(2, Bsize=128)/zygote/GPU/AMDGPU
340125
ns340566
ns1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
292
ns333
ns0.88
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
375
ns375
ns1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
375
ns375
ns1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
375
ns291
ns1.29
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
24307
ns25574
ns0.95
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU
45845
ns45666
ns1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
6166.5
ns6458
ns0.95
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
6708
ns6375
ns1.05
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8167
ns6791.5
ns1.20
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
7083
ns6458.5
ns1.10
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
179926.5
ns185923.5
ns0.97
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU
372385.5
ns365402.5
ns1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
5834
ns2084
ns2.80
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
5833
ns2084
ns2.80
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
5875
ns2083
ns2.82
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
5958
ns2000
ns2.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
25187
ns26453
ns0.95
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU
201636
ns203645.5
ns0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
21041
ns18041
ns1.17
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
21709
ns17166.5
ns1.26
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
23458
ns17750
ns1.32
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
26125
ns23458.5
ns1.11
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
262884
ns268326
ns0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU
615780.5
ns600702.5
ns1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
192083.5
ns147875
ns1.30
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
158917
ns155437.5
ns1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
154416.5
ns155125
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
146417
ns151708
ns0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
184640
ns190890.5
ns0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
215472.5
ns271146.5
ns0.79
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1319792
ns1321937.5
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1328249.5
ns1330625
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1347250
ns1308375
ns1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1337000
ns1285166
ns1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
844907
ns867140
ns0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1041340
ns1006962
ns1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
24292
ns25500
ns0.95
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
24916
ns23542
ns1.06
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
28000
ns28708.5
ns0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
24833.5
ns24416.5
ns1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
224694.5
ns226899
ns0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU
130334
ns128029
ns1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
117583
ns125062.5
ns0.94
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
131375
ns165729.5
ns0.79
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
160499.5
ns125854.5
ns1.28
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
164750
ns180062
ns0.91
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
967206
ns998018.5
ns0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU
585053
ns568743
ns1.03
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
250
ns375
ns0.67
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
334
ns375
ns0.89
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
375
ns375
ns1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
375
ns250
ns1.50
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
22932
ns23453
ns0.98
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU
47870
ns44533
ns1.07
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
6292
ns6895.5
ns0.91
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
6833
ns6458
ns1.06
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
9416
ns6958
ns1.35
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
7500
ns6520.5
ns1.15
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
196587.5
ns201834
ns0.97
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
380031
ns372536
ns1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
5875
ns5645.5
ns1.04
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
6292
ns5375
ns1.17
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
7187.5
ns7979
ns0.90
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
6562
ns5166
ns1.27
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
134586
ns139838.5
ns0.96
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU
230170
ns229750
ns1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
9833
ns9958
ns0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
10000
ns10042
ns1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
11187.5
ns10417
ns1.07
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
11083
ns10854.5
ns1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
840176
ns866511
ns0.97
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
631290
ns603858
ns1.05
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
1542
ns708
ns2.18
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
1625
ns708
ns2.30
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
1625
ns750
ns2.17
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
1625
ns667
ns2.44
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA
22272
ns22827
ns0.98
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/AMDGPU
204933
ns202368
ns1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
5750
ns4834
ns1.19
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
6125
ns4833
ns1.27
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
6417
ns5125
ns1.25
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
5875
ns6291
ns0.93
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA
216977
ns222098
ns0.98
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/AMDGPU
491814.5
ns471721
ns1.04
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
8250
ns8750
ns0.94
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
8562.5
ns7834
ns1.09
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
9895.5
ns9375
ns1.06
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
9209
ns7646
ns1.20
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
115063
ns117939.5
ns0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU
73999
ns74409
ns0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8167
ns8792
ns0.93
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
9250
ns8583
ns1.08
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
9833.5
ns8875
ns1.11
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
10333
ns8083
ns1.28
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
548589
ns568724.5
ns0.96
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU
340367
ns335106
ns1.02
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s)
127271
ns126042
ns1.01
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s)
128750
ns129208
ns1.00
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s)
131062
ns129542
ns1.01
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s)
181979.5
ns180792
ns1.01
batchedmm(128, Bsize=4)/forward/GPU/CUDA
46303.5
ns46423
ns1.00
batchedmm(128, Bsize=4)/forward/GPU/AMDGPU
102121
ns101850
ns1.00
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s)
338125
ns315875
ns1.07
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s)
339792
ns334166.5
ns1.02
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s)
346083
ns323291.5
ns1.07
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s)
595417
ns609395.5
ns0.98
batchedmm(128, Bsize=4)/zygote/GPU/CUDA
181951
ns187684
ns0.97
batchedmm(128, Bsize=4)/zygote/GPU/AMDGPU
410627.5
ns405833.5
ns1.01
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s)
397708
ns397500
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s)
288375
ns287979.5
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s)
287937.5
ns288375
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s)
756708
ns756000
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA
43092
ns43964
ns0.98
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/AMDGPU
85671
ns79439
ns1.08
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s)
1456291.5
ns1461000
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s)
1133125
ns1133834
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s)
1127937.5
ns1129645.5
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s)
2360208
ns2449292
ns0.96
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA
248595.5
ns254140
ns0.98
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/AMDGPU
266317
ns254646
ns1.05
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
643479.5
ns626500
ns1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
654166
ns657208.5
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
652750
ns649750.5
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
650625
ns642417
ns1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
172424.5
ns185720.5
ns0.93
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
315089
ns264649
ns1.19
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2449417
ns2452625
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2455020.5
ns2465208.5
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2465625
ns2459375
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2469208.5
ns2376375
ns1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
922065
ns949649
ns0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1363193.5
ns1323598
ns1.03
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s)
32917
ns32458
ns1.01
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s)
35374.5
ns36521
ns0.97
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s)
34417
ns34833
ns0.99
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s)
1000
ns959
ns1.04
batchedmm(2, Bsize=32)/forward/GPU/CUDA
15534
ns15902
ns0.98
batchedmm(2, Bsize=32)/forward/GPU/AMDGPU
78366
ns74499.5
ns1.05
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s)
2937.5
ns3125
ns0.94
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s)
3375
ns3250
ns1.04
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s)
5208
ns3375
ns1.54
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s)
4625
ns3062.5
ns1.51
batchedmm(2, Bsize=32)/zygote/GPU/CUDA
133935.5
ns137187.5
ns0.98
batchedmm(2, Bsize=32)/zygote/GPU/AMDGPU
318886
ns314258
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1464209
ns436500
ns3.35
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1500333
ns438625
ns3.42
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1501333
ns438791
ns3.42
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1442563
ns445917
ns3.24
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
41738
ns42826
ns0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
318625
ns374379.5
ns0.85
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5128625
ns4140000
ns1.24
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5291041
ns4271375
ns1.24
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5297084
ns4270687.5
ns1.24
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4998791.5
ns5468750
ns0.91
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
230499.5
ns236201.5
ns0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1198280
ns1135862
ns1.05
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s)
3709
ns3750
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s)
3750
ns3791
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s)
3750
ns3750
ns1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s)
3916
ns3709
ns1.06
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA
33583
ns34158
ns0.98
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/AMDGPU
36778.5
ns41117
ns0.89
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s)
15417
ns15375
ns1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s)
15500
ns15334
ns1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s)
15791
ns15500
ns1.02
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s)
16000
ns15250
ns1.05
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA
252278
ns255579
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/AMDGPU
161662
ns158606
ns1.02
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s)
404625
ns404792
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s)
296000
ns295917
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s)
295916
ns295958
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s)
760625
ns759750
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA
113161.5
ns113245
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/AMDGPU
95859
ns91962
ns1.04
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
1479249.5
ns1482854
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
1158584
ns1158625
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
1160500
ns1150334
ns1.01
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
2383354
ns2466708
ns0.97
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA
228888
ns236768.5
ns0.97
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/AMDGPU
265922
ns298578
ns0.89
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
958
ns584
ns1.64
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
1042
ns625
ns1.67
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
1042
ns584
ns1.78
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
1083
ns542
ns2.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
24404
ns25569
ns0.95
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU
207859
ns202679
ns1.03
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
7917
ns8083
ns0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8542
ns7792
ns1.10
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
9917
ns8375
ns1.18
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
12895.5
ns8437.5
ns1.53
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
202191
ns207068.5
ns0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU
620871
ns593474
ns1.05
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s)
835834
ns829375
ns1.01
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s)
615542
ns617667
ns1.00
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s)
617791.5
ns618667
ns1.00
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s)
1549375
ns1544417
ns1.00
batchedmm(128, Bsize=32)/forward/GPU/CUDA
130350.5
ns130866
ns1.00
batchedmm(128, Bsize=32)/forward/GPU/AMDGPU
215532
ns211214
ns1.02
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s)
2690375
ns2686104.5
ns1.00
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s)
2000479.5
ns1994542
ns1.00
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s)
2007416.5
ns1998375
ns1.00
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s)
4941104
ns4960479
ns1.00
batchedmm(128, Bsize=32)/zygote/GPU/CUDA
232712
ns234509
ns0.99
batchedmm(128, Bsize=32)/zygote/GPU/AMDGPU
872871.5
ns831293.5
ns1.05
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
291
ns292
ns1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
375
ns375
ns1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
375
ns375
ns1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
375
ns250
ns1.50
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
31625
ns32562
ns0.97
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU
47950
ns48691
ns0.98
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
6084
ns6333
ns0.96
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
6708
ns6375
ns1.05
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7666
ns6667
ns1.15
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
8083
ns6104.5
ns1.32
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
221856.5
ns227701
ns0.97
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU
352319
ns346728
ns1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1741791.5
ns1760625
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1752167
ns1749875
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1739042
ns1744292
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1719916
ns1755166
ns0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
183055.5
ns189332
ns0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
415606.5
ns413433
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
4361125
ns4360416
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
4365916.5
ns4366917
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
4399333
ns4349104
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4394333
ns5705104
ns0.77
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
827645.5
ns849205
ns0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1239667.5
ns1205562.5
ns1.03
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s)
7083
ns9604
ns0.74
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s)
7395.5
ns6916
ns1.07
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s)
7041
ns8208
ns0.86
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s)
6854.5
ns6854
ns1.00
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA
22223.5
ns22924.5
ns0.97
bias_activation(512, act=relu)(512 x 128)/forward/GPU/AMDGPU
47178
ns46437
ns1.02
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
45292
ns50604.5
ns0.90
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
51167
ns52166
ns0.98
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
49250
ns45458.5
ns1.08
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
49437
ns33312.5
ns1.48
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA
204846
ns211538
ns0.97
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/AMDGPU
235841
ns226508
ns1.04
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s)
22125
ns21646
ns1.02
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s)
25125
ns26083.5
ns0.96
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s)
24833
ns24958.5
ns0.99
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s)
5458.5
ns5291.5
ns1.03
batchedmm(2, Bsize=512)/forward/GPU/CUDA
17859
ns18121
ns0.99
batchedmm(2, Bsize=512)/forward/GPU/AMDGPU
82154
ns73668
ns1.12
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s)
11792
ns12125
ns0.97
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s)
10750
ns10667
ns1.01
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s)
12583
ns10833
ns1.16
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s)
19708.5
ns18042
ns1.09
batchedmm(2, Bsize=512)/zygote/GPU/CUDA
216235
ns221707
ns0.98
batchedmm(2, Bsize=512)/zygote/GPU/AMDGPU
331099
ns322703
ns1.03
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s)
406250
ns405917
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s)
297333
ns296791.5
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s)
296833.5
ns297167
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s)
762833
ns756709
ns1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA
46303.5
ns46696
ns0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/AMDGPU
97252
ns90770
ns1.07
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
1477458
ns1487375
ns0.99
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
1164395.5
ns1163500
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
1164416
ns1157209
ns1.01
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
2386333
ns2472417
ns0.97
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA
268961
ns283340.5
ns0.95
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/AMDGPU
282959
ns269032
ns1.05
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1488416
ns436458
ns3.41
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1526958
ns443270.5
ns3.44
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1529250
ns440750
ns3.47
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1466395.5
ns449000
ns3.27
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
52650
ns53940
ns0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
326982
ns323133
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5119459
ns4138541
ns1.24
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5285084
ns4268354.5
ns1.24
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5297709
ns4258750
ns1.24
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4955208
ns5475229.5
ns0.91
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
250192
ns255597
ns0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
1186136
ns1132896.5
ns1.05
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
28292
ns9333
ns3.03
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
28292
ns8000
ns3.54
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
28333
ns8000
ns3.54
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
28417
ns13250
ns2.14
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA
23514.5
ns23885
ns0.98
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/AMDGPU
207227
ns202528
ns1.02
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
66542
ns49625
ns1.34
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
66750
ns49667
ns1.34
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
66500
ns49583
ns1.34
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
66208
ns71667
ns0.92
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA
333506.5
ns336641
ns0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/AMDGPU
576948.5
ns508895.5
ns1.13
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
124875
ns108270.5
ns1.15
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
81875
ns86167
ns0.95
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
89166
ns86500
ns1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
86750
ns146083
ns0.59
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
191648
ns192063
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU
233116
ns267851
ns0.87
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2025145.5
ns2018917
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2021978.5
ns2016937.5
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2030542
ns2011375
ns1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1995125
ns2024000.5
ns0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
506195
ns511598
ns0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU
881973
ns860237
ns1.03
This comment was automatically generated by workflow using github-action-benchmark.