Skip to content
This repository has been archived by the owner on Nov 4, 2024. It is now read-only.

Commit

Permalink
refactor: move JuliaSIMD deps to extensions (#175)
Browse files Browse the repository at this point in the history
* fix: remove LV.vmap! usage

* fix: remove LV handling for bias_activation

* fix: remove LV usage in dropout

* refactor: move LV and octavian behind an extension

* docs: add docs for loading packages

* refactor: move SLEEFPirates to an ext

* fix: enzyme rules for batched matmul

* fix: patch more enzyme issues

* feat: add a preference to disable loop vectorization

* fix: incorrect dispatch called

* fix: enzyme segfault bypass
  • Loading branch information
avik-pal authored Oct 18, 2024
1 parent 604783f commit 98a2d7a
Show file tree
Hide file tree
Showing 26 changed files with 354 additions and 304 deletions.
25 changes: 24 additions & 1 deletion .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ concurrency:

jobs:
ci:
name: Julia ${{ matrix.version }} - ${{ matrix.test_group }} - ${{ matrix.os }} - ${{ matrix.blas_backend }}
name: Julia ${{ matrix.version }} - ${{ matrix.test_group }} - ${{ matrix.os }} - ${{ matrix.blas_backend }} - ${{ matrix.loopvec }}
if: ${{ !contains(github.event.head_commit.message, '[skip tests]') }}
runs-on: ${{ matrix.os }}
strategy:
Expand All @@ -43,27 +43,49 @@ jobs:
- "others"
blas_backend:
- "default"
loopvec:
- "true"
include:
- os: ubuntu-latest
test_group: "dense"
blas_backend: "blis"
version: "1.10"
loopvec: "true"
- os: ubuntu-latest
test_group: "dense"
blas_backend: "mkl"
version: "1.10"
loopvec: "true"
- os: ubuntu-latest
test_group: "dense"
blas_backend: "default"
version: "1.10"
loopvec: "false"
- os: ubuntu-latest
test_group: "batched_ops"
blas_backend: "default"
version: "1.10"
loopvec: "false"
- os: ubuntu-latest
test_group: "other_ops"
blas_backend: "default"
version: "1.10"
loopvec: "false"
- os: macos-latest
test_group: "dense"
blas_backend: "appleaccelerate"
version: "1.10"
loopvec: "true"
- os: macos-latest
test_group: "all"
blas_backend: "default"
version: "1.10"
loopvec: "true"
- os: windows-latest
test_group: "all"
blas_backend: "default"
version: "1.10"
loopvec: "true"
steps:
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v2
Expand All @@ -84,6 +106,7 @@ jobs:
env:
LUXLIB_TEST_GROUP: ${{ matrix.test_group }}
LUXLIB_BLAS_BACKEND: ${{ matrix.blas_backend }}
LUXLIB_LOAD_LOOPVEC: ${{ matrix.loopvec }}
- uses: julia-actions/julia-processcoverage@v1
with:
directories: src,ext
Expand Down
13 changes: 9 additions & 4 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "LuxLib"
uuid = "82251201-b29d-42c6-8e01-566dec8acb11"
authors = ["Avik Pal <[email protected]> and contributors"]
version = "1.3.3"
version = "1.3.4"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
Expand All @@ -15,16 +15,14 @@ ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
Hwloc = "0e44f5e4-bd66-52a0-8798-143a42290a1d"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
LuxCore = "bb33d45b-7691-41d6-9220-0943567d0623"
MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40"
Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
SLEEFPirates = "476501e8-09a2-5ece-8869-fb82de89a1fa"
Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Expand All @@ -36,7 +34,10 @@ BLISBLAS = "6f275bd8-fec0-4d39-945b-7e95a765fa1e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
MKL = "33e6dc65-8f57-5167-99aa-e5a354878fb2"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
SLEEFPirates = "476501e8-09a2-5ece-8869-fb82de89a1fa"
Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"

Expand All @@ -46,7 +47,10 @@ LuxLibBLISBLASExt = "BLISBLAS"
LuxLibCUDAExt = "CUDA"
LuxLibMKLExt = "MKL"
LuxLibEnzymeExt = "Enzyme"
LuxLibLoopVectorizationExt = "LoopVectorization"
LuxLibOctavianExt = ["Octavian", "LoopVectorization"]
LuxLibReverseDiffExt = "ReverseDiff"
LuxLibSLEEFPiratesExt = "SLEEFPirates"
LuxLibTrackerAMDGPUExt = ["AMDGPU", "Tracker"]
LuxLibTrackerExt = "Tracker"
LuxLibcuDNNExt = ["CUDA", "cuDNN"]
Expand Down Expand Up @@ -75,6 +79,7 @@ MLDataDevices = "1.2"
Markdown = "1.10"
NNlib = "0.9.24"
Octavian = "0.3.28"
Preferences = "1.4.3"
Polyester = "0.7.15"
Random = "1.10"
Reexport = "1"
Expand Down
2 changes: 2 additions & 0 deletions benchmarks/Project.toml
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
LuxLib = "82251201-b29d-42c6-8e01-566dec8acb11"
MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
Expand Down
1 change: 1 addition & 0 deletions benchmarks/runbenchmarks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ using Pkg
using BenchmarkTools
using InteractiveUtils
using LinearAlgebra
using Octavian, LoopVectorization

const SUITE = BenchmarkGroup()
BenchmarkTools.DEFAULT_PARAMETERS.seconds = 5
Expand Down
72 changes: 72 additions & 0 deletions ext/LuxLibLoopVectorizationExt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
module LuxLibLoopVectorizationExt

using LoopVectorization: LoopVectorization, @tturbo, @turbo, indices
using Polyester: @batch
using Static: True

using LuxLib: LuxLib, Utils

# Report to LuxLib that the LoopVectorization extension has been loaded.
Utils.is_extension_loaded(::Val{:LoopVectorization}) = True()

# With the extension present, argument eligibility is decided by
# `LoopVectorization.check_args`.
Utils.can_loopvec_args_check(::True, args...) = LoopVectorization.check_args(args...)

# matmul
# Both kernels are generated from a single template; they differ only in the
# `thread` option handed to `@turbo`. Each one writes `α * (A * B) + β * C`
# into `C`.
for (kernel, threaded) in ((:serial_matmul_loopvec!, false), (:matmul_loopvec!, true))
    @eval @inline function LuxLib.Impl.$(kernel)(
            C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α::Number, β::Number)
        if iszero(β)
            # Special case: the old contents of `C` are never read here, because
            # Base.FastMath.mul_fast(NaN, false) = NaN
            @turbo thread=$(threaded) for n in indices((C, B), 2), m in indices((C, A), 1)
                acc = zero(eltype(C))
                for k in indices((A, B), (2, 1))
                    acc += A[m, k] * B[k, n]
                end
                C[m, n] = α * acc
            end
        else
            @turbo thread=$(threaded) for n in indices((C, B), 2), m in indices((C, A), 1)
                acc = zero(eltype(C))
                for k in indices((A, B), (2, 1))
                    acc += A[m, k] * B[k, n]
                end
                C[m, n] = α * acc + β * C[m, n]
            end
        end
    end
end

# Fused `C = A * B .+ bias`, where `bias` is indexed by the row of `C`.
@inline function LuxLib.Impl.matmuladd_loopvec!(
        C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, bias::AbstractVector)
    @tturbo for n in indices((C, B), 2), m in indices((C, A), 1)
        acc = zero(eltype(C))
        for k in indices((A, B), (2, 1))
            acc += A[m, k] * B[k, n]
        end
        C[m, n] = bias[m] + acc
    end
    return nothing
end

# batched matmul
# The loop over the third dimension of `z` is run with `@batch`; every slice is
# multiplied with the serial kernel. When one operand has a single slice along
# dimension 3, that slice is reused for every batch index.
function LuxLib.Impl.batched_matmul_loopvec_impl!(
        z::AbstractArray{zT, 3}, x::AbstractArray{xT, 3},
        y::AbstractArray{yT, 3}, α::Number=true, β::Number=false) where {zT, xT, yT}
    if size(x, 3) == size(y, 3)
        @batch for b in axes(z, 3)
            LuxLib.Impl.serial_matmul_loopvec!(
                Utils.batchview(z, b), Utils.batchview(x, b), Utils.batchview(y, b), α, β)
        end
    elseif size(x, 3) == 1
        @batch for b in axes(z, 3)
            LuxLib.Impl.serial_matmul_loopvec!(
                Utils.batchview(z, b), Utils.batchview(x, 1), Utils.batchview(y, b), α, β)
        end
    else # has to be size(y, 3) == 1
        @batch for b in axes(z, 3)
            LuxLib.Impl.serial_matmul_loopvec!(
                Utils.batchview(z, b), Utils.batchview(x, b), Utils.batchview(y, 1), α, β)
        end
    end
end

end
16 changes: 16 additions & 0 deletions ext/LuxLibOctavianExt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
module LuxLibOctavianExt

using Octavian: Octavian
using Static: True

using LuxLib: LuxLib, Utils

# Report to LuxLib that the Octavian extension has been loaded.
Utils.is_extension_loaded(::Val{:Octavian}) = True()

# Delegate to `Octavian.matmul!` with the given `α`/`β` scaling factors.
# The result is written into `C`; nothing is returned.
@inline function LuxLib.Impl.matmul_octavian!(
        C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α::Number, β::Number)
    Octavian.matmul!(C, A, B, α, β)
    return nothing
end

end
58 changes: 58 additions & 0 deletions ext/LuxLibSLEEFPiratesExt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
module LuxLibSLEEFPiratesExt

using ChainRulesCore: ChainRulesCore
using NNlib: NNlib
using SLEEFPirates: SLEEFPirates

using LuxLib: Numeric, Impl

const CRC = ChainRulesCore

# Scalar activations local to this extension, built on SLEEFPirates.
# They are new functions defined in this module; the mapping from the
# corresponding NNlib/Base functions is registered at the bottom of the file.
sigmoid_fast(x::Number) = SLEEFPirates.sigmoid_fast(x)
softplus(x::Number) = SLEEFPirates.softplus(x)
logsigmoid(x::Number) = -softplus(-x)
swish(x::Number) = Base.FastMath.mul_fast(x, sigmoid_fast(x))
lisht(x::Number) = Base.FastMath.mul_fast(x, tanh_fast(x))
tanh(x::Number) = SLEEFPirates.tanh(x)
tanh_fast(x::Number) = SLEEFPirates.tanh_fast(x)

# For every activation `f` paired with its derivative expression `dfdx`
# (written in terms of the input `x` and the primal output `Ω`), define
#   1. a scalar rule via `CRC.@scalar_rule`, and
#   2. an `rrule` for `broadcasted(f, x)`.
for (f, dfdx) in [
    #! format: off
    (:sigmoid_fast, :(conj(Base.FastMath.mul_fast(Ω, Base.FastMath.sub_fast(1, Ω))))),
    (:softplus, :(sigmoid_fast(x))),
    (:logsigmoid, :(sigmoid_fast(-x))),
    (:swish, :(Base.FastMath.add_fast(Ω, Base.FastMath.mul_fast(sigmoid_fast(x), Base.FastMath.sub_fast(1, Ω))))),
    (:lisht, :(Base.FastMath.add_fast(x, Base.FastMath.mul_fast(tanh_fast(x), Base.FastMath.sub_fast(1, Ω))))),
    (:tanh, :(conj(Base.FastMath.sub_fast(1, Base.FastMath.mul_fast(Ω, Ω))))),
    (:tanh_fast, :(conj(Base.FastMath.sub_fast(1, Base.FastMath.mul_fast(Ω, Ω)))))
    #! format: on
]
    @eval CRC.@scalar_rule($f(x), $(dfdx))

    ∇f = Symbol(:∇broadcasted_, f)
    @eval function CRC.rrule(::typeof(Broadcast.broadcasted), ::typeof($f),
            x::Union{Numeric, Broadcast.Broadcasted})
        Ω = $(f).(x)
        function $(∇f)(dΩ)
            # The in-place branch accumulates `dΩ * df/dx` into `dx`; the lazy
            # branch materialises the same product. Both must use `dΩ`.
            ∂x = CRC.InplaceableThunk(
                dx -> @.(dx += dΩ * $(dfdx)), CRC.@thunk @.(dΩ * $(dfdx)))
            return CRC.NoTangent(), CRC.NoTangent(), ∂x
        end
        return Ω, $(∇f)
    end
end

# Register the replacement of each base activation with its counterpart
# defined above.
for (fbase, ffast) in [
    #! format: off
    (NNlib.sigmoid_fast, sigmoid_fast),
    (NNlib.softplus, softplus),
    (NNlib.logsigmoid, logsigmoid),
    (NNlib.swish, swish),
    (NNlib.lisht, lisht),
    (Base.tanh, tanh),
    (NNlib.tanh_fast, tanh_fast)
    #! format: on
]
    @eval Impl.sleefpirates_fast_act(::typeof($fbase)) = $ffast
end

end
3 changes: 3 additions & 0 deletions src/LuxLib.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
module LuxLib

using Compat: @compat
using Preferences: @load_preference
using Reexport: @reexport
using Static: Static, known

Expand All @@ -15,6 +16,8 @@ const Numeric = Union{AbstractArray{<:T}, T} where {T <: Number}
const ∂∅ = NoTangent()
const CRC = ChainRulesCore

const DISABLE_LOOP_VECTORIZATION = @load_preference("disable_loop_vectorization", false)

include("utils.jl")
include("traits.jl")
include("impl/Impl.jl")
Expand Down
2 changes: 1 addition & 1 deletion src/api/activation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ generic implementation.
This function doesn't replace `σ` with `NNlib.fast_act(σ, ...)`, that needs to be
done by the user if needed.
!!! tip
!!! tip "Load `SLEEFPirates.jl` to get faster activations"
Certain activation functions are replaced with specialized implementations from
[SLEEFPirates.jl](https://github.com/JuliaSIMD/SLEEFPirates.jl) for FP32. This might
Expand Down
5 changes: 5 additions & 0 deletions src/api/batched_mul.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@
Computes the batched matrix multiplication of `x` and `y`. For more details see the NNlib
documentation on `NNlib.batched_mul`. This function is mostly a wrapper around `batched_mul`
but attempts to be faster on CPUs.
!!! tip "Load `LoopVectorization.jl` to get faster batched matrix multiplication"
On CPUs loading LoopVectorization adds faster implementations of batched matrix
multiplication.
"""
function batched_matmul(x::AbstractMatrix, y::AbstractArray{yT, 3}) where {yT}
return batched_matmul(expand_batchdim(x), y)
Expand Down
5 changes: 5 additions & 0 deletions src/api/dense.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ multiple operations.
- For small CPU Arrays, we use LoopVectorization.jl. On `x86_64` we use Octavian for
medium sized matrices. This is overridden if special BLAS implementations are loaded
(currently `MKL`, `AppleAccelerate`, and `BLISBLAS`).
!!! tip "Load `Octavian.jl`"
Loading `Octavian.jl` enables a polyalgorithm that uses different backends based on the
input sizes.
"""
function fused_dense_bias_activation::F, weight::AbstractMatrix, x::AbstractMatrix,
b::Optional{<:AbstractVector}) where {F}
Expand Down
5 changes: 1 addition & 4 deletions src/impl/Impl.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@ using ForwardDiff: ForwardDiff

using KernelAbstractions: KernelAbstractions, @kernel, @Const, @index

using LoopVectorization: LoopVectorization, @turbo, @tturbo, indices
using Octavian: Octavian
using Polyester: @batch

using LinearAlgebra: LinearAlgebra, mul!
Expand All @@ -31,15 +29,14 @@ using ..Utils: Utils, NotaNumber, batchview, concrete_bias_act_output_eltype, co
copy_drop_gradients, eltype_mismatch, expand_batchdim,
maybe_reduce_BLAS_threads, ofeltype_array, only_derivative, remove_tracking,
reset_BLAS_threads, run_ka_kernel, safe_eltype, safe_vec, safe_warning,
unsafe_known, unrolled_mapreduce, @enzyme_alternative
unsafe_known, unrolled_mapreduce, can_loopvec_args, @enzyme_alternative
using ..Traits: activation_intermediate_not_needed, activation_has_rrule, is_mutable_array,
fuse_cpu_activation
using ..System: explicit_blas_loaded, use_octavian, fits_in_l1cache, fits_in_l2cache,
fits_in_l3cache

const CRC = ChainRulesCore
const KA = KernelAbstractions
const LV = LoopVectorization

include("activation.jl")
include("batched_mul.jl")
Expand Down
Loading

3 comments on commit 98a2d7a

@avik-pal
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/117587

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v1.3.4 -m "<description of version>" 98a2d7ad69cba4a97a848d8e0e4f7419c543fda2
git push origin v1.3.4

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LuxLib Benchmarks

Benchmark suite Current: 98a2d7a Previous: 604783f Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 6417 ns 5375 ns 1.19
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6041 ns 5250 ns 1.15
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7167 ns 7708.5 ns 0.93
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5292 ns 5416 ns 0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 103542 ns 113361 ns 0.91
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 637131 ns 601544 ns 1.06
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10166.5 ns 9729.5 ns 1.04
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9958 ns 9938 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10291.5 ns 10167 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9979.5 ns 11063 ns 0.90
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 494284 ns 544547 ns 0.91
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 719725 ns 629346 ns 1.14
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1583 ns 1500 ns 1.06
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 1542 ns 1458 ns 1.06
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 1666 ns 1771 ns 0.94
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 1500 ns 1583 ns 0.95
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 20684 ns 20770 ns 1.00
bias_activation(32, act=relu)(32 x 128)/forward/GPU/AMDGPU 33302 ns 30997 ns 1.07
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 3812.5 ns 4104 ns 0.93
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 4125 ns 4500 ns 0.92
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4250 ns 4500 ns 0.94
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 4334 ns 4333 ns 1.00
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 134278.5 ns 134970 ns 0.99
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/AMDGPU 143062.5 ns 138579 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58000 ns 57666.5 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46417 ns 46875 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46875 ns 47125 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83750 ns 81458 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 37449 ns 36587 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 70883 ns 69420 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2037500 ns 2030375 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2083416.5 ns 2088625 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2090916.5 ns 2086625 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1996979.5 ns 1998562 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 220080 ns 217216 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1213928 ns 930850 ns 1.30
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 173708 ns 175083 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 146625 ns 147291 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 165062.5 ns 150021 ns 1.10
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 172000 ns 151750 ns 1.13
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 167869.5 ns 166825 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 196051.5 ns 262570 ns 0.75
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1113854.5 ns 1115103.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1110541 ns 1110771 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1118667 ns 1113771 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1124479.5 ns 1136250 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 644177 ns 639845.5 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 899376 ns 864075 ns 1.04
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5333 ns 3792 ns 1.41
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4875 ns 4479 ns 1.09
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6750 ns 6583 ns 1.03
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4416 ns 6375 ns 0.69
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 83066 ns 85209.5 ns 0.97
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 64020 ns 59531 ns 1.08
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8584 ns 8417 ns 1.02
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8750 ns 8750 ns 1
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8875 ns 9042 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8584 ns 8958 ns 0.96
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 552192.5 ns 557500.5 ns 0.99
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 372446 ns 370833 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17229.5 ns 17958 ns 0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17250 ns 16458 ns 1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 21542 ns 21125 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 17208.5 ns 17292 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 63166 ns 63776.5 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 79573.5 ns 82870 ns 0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 220583 ns 212625 ns 1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 218875 ns 213042 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 223125 ns 212771 ns 1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 219625 ns 212291 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 329089 ns 329859 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 423777 ns 405232 ns 1.05
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 583 ns 667 ns 0.87
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 625 ns 625 ns 1
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 833 ns 875 ns 0.95
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 834 ns 709 ns 1.18
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 19066 ns 19101 ns 1.00
bias_activation(2, act=relu)(2 x 128)/forward/GPU/AMDGPU 27311 ns 26409 ns 1.03
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1417 ns 1458 ns 0.97
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1417 ns 1334 ns 1.06
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1583 ns 1583 ns 1
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1375 ns 1375 ns 1
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 116071.5 ns 117126.5 ns 0.99
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/AMDGPU 118732 ns 115676 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7375 ns 7375 ns 1
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6000 ns 6041 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6083 ns 6084 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10334 ns 9958 ns 1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 24482 ns 23587 ns 1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 52122 ns 52723 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 229541.5 ns 229167 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 268417 ns 230667 ns 1.16
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 241500 ns 267875 ns 0.90
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 251250 ns 257458 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 189293 ns 182744 ns 1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 588480 ns 548449.5 ns 1.07
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 3917 ns 3958 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 3958 ns 3958 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4042 ns 3917 ns 1.03
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 23660.5 ns 22860 ns 1.04
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/AMDGPU 43502 ns 39504 ns 1.10
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16833 ns 17042 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16834 ns 16875 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16959 ns 17083 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16666 ns 16875 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 188039 ns 185787.5 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/AMDGPU 166010.5 ns 162052 ns 1.02
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 929291 ns 491583 ns 1.89
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 838708 ns 385625 ns 2.17
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 841584 ns 386458 ns 2.18
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 1269208 ns 844083 ns 1.50
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 113941 ns 113763 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/AMDGPU 396441 ns 388657 ns 1.02
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2610729.5 ns 2155583 ns 1.21
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 2330541.5 ns 1863374.5 ns 1.25
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 2324458 ns 1865167 ns 1.25
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3478334 ns 3377520.5 ns 1.03
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 232093 ns 229580 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 630643.5 ns 610962 ns 1.03
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6000 ns 6500 ns 0.92
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 7042 ns 5500 ns 1.28
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7333.5 ns 7667 ns 0.96
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6584 ns 5167 ns 1.27
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 82915 ns 84720.5 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 62131.5 ns 59932 ns 1.04
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 11875 ns 11229 ns 1.06
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11417 ns 11395.5 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 12417 ns 12334 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9813 ns 10667 ns 0.92
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 585345.5 ns 602168 ns 0.97
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 388046 ns 383917 ns 1.01
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 500 ns 500 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 542 ns 500 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 542 ns 542 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 500 ns 500 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 23179.5 ns 23328 ns 0.99
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/AMDGPU 41949 ns 41367 ns 1.01
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2083 ns 2084 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2250 ns 2166 ns 1.04
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2167 ns 2167 ns 1
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2083 ns 2084 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 226220 ns 228927.5 ns 0.99
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/AMDGPU 166171 ns 165900 ns 1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 8583 ns 9584 ns 0.90
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 8542 ns 8333 ns 1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 10709 ns 9895.5 ns 1.08
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 8833 ns 8542 ns 1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 100758 ns 105241 ns 0.96
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 72575 ns 71955 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 17228.5 ns 17688 ns 0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 18583 ns 16666.5 ns 1.11
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 18500 ns 18708 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 17750 ns 17562 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 582511 ns 595171 ns 0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 371318.5 ns 358129 ns 1.04
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 459 ns 542 ns 0.85
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 625 ns 458 ns 1.36
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 583 ns 583 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 500 ns 458 ns 1.09
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 34079 ns 34578 ns 0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 44423 ns 41387 ns 1.07
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9479 ns 9229 ns 1.03
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9750 ns 8958.5 ns 1.09
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10333 ns 9750 ns 1.06
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9562.5 ns 8104 ns 1.18
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 262881 ns 257823 ns 1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 351422 ns 349944 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 396583 ns 397270.5 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 288042 ns 288083 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 287666 ns 288666.5 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 756167 ns 751792 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 112987 ns 112022 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/AMDGPU 77780.5 ns 74609 ns 1.04
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1455709 ns 1454270.5 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 1130291 ns 1130500 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 1133250 ns 1131583 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2358000 ns 2437959 ns 0.97
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 202802 ns 200057 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/AMDGPU 268682 ns 302285 ns 0.89
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7354.5 ns 7750 ns 0.95
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 8000 ns 7083.5 ns 1.13
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8687.5 ns 8312.5 ns 1.05
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7750 ns 6687.5 ns 1.16
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 137305 ns 139766 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 64461 ns 60383 ns 1.07
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 12812.5 ns 13479.5 ns 0.95
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 15041.5 ns 12750 ns 1.18
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15353.5 ns 15125 ns 1.02
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 12333.5 ns 14625.5 ns 0.84
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 906003 ns 923489 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 413373 ns 407432 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 26000 ns 25625 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 27562.5 ns 23666 ns 1.16
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 27042 ns 29417 ns 0.92
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 26021 ns 24041 ns 1.08
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 186382.5 ns 186240.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 146484 ns 120505 ns 1.22
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 146500 ns 152187 ns 0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 157750 ns 145250 ns 1.09
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 129416 ns 146917 ns 0.88
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 155812.5 ns 103958 ns 1.50
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1016426 ns 1013659 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 551090 ns 535240 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 84667 ns 74583 ns 1.14
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 80167 ns 79584 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 78063 ns 76791.5 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 80521 ns 76083 ns 1.06
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 190829 ns 190594.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 124858.5 ns 121316.5 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 219479 ns 273562.5 ns 0.80
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 281750 ns 304084 ns 0.93
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 278146 ns 303333 ns 0.92
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 320791.5 ns 307583 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1021778 ns 1045024 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 643542 ns 624192 ns 1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 13125 ns 12417 ns 1.06
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 13666.5 ns 12896 ns 1.06
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 14041.5 ns 14000 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 13459 ns 12500 ns 1.08
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 136741.5 ns 138416 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 226473 ns 226152 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 27083.5 ns 27792 ns 0.97
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 26125 ns 26458 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 27833.5 ns 28437.5 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26604.5 ns 33937.5 ns 0.78
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 919419 ns 924126.5 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 633979.5 ns 610976 ns 1.04
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 14000 ns 11124.5 ns 1.26
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 14708.5 ns 10333 ns 1.42
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 17583.5 ns 12479.5 ns 1.41
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 14792 ns 11125 ns 1.33
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 119245 ns 118543.5 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 233827 ns 233176 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 26875 ns 22291.5 ns 1.21
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 25958.5 ns 22417 ns 1.16
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 26583 ns 24167 ns 1.10
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26541 ns 28562.5 ns 0.93
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 676576 ns 668341 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 589361.5 ns 569113 ns 1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 182375 ns 68709 ns 2.65
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 183208 ns 62750 ns 2.92
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 185583 ns 67520.5 ns 2.75
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 183459 ns 64417 ns 2.85
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 102955 ns 102389 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 232900.5 ns 230751 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 583500 ns 506375 ns 1.15
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 595083 ns 510167 ns 1.17
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 597520.5 ns 475209 ns 1.26
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 624167 ns 647896 ns 0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 493717.5 ns 492781 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 657463 ns 593680 ns 1.11
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6750 ns 7958 ns 0.85
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7645.5 ns 6750 ns 1.13
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8167 ns 8208 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7542 ns 7562.5 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 135360 ns 137965 ns 0.98
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 62767 ns 62687 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 15375 ns 16125 ns 0.95
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14917 ns 16250 ns 0.92
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 16187.5 ns 16250 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 15292 ns 14833 ns 1.03
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 885601 ns 900927 ns 0.98
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 392428 ns 388286 ns 1.01
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 6153416.5 ns 6150354 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 6381624.5 ns 6368167 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 6371521 ns 6373937.5 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 11926500 ns 11915167 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA 346494 ns 345749 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/AMDGPU 392843 ns 388426 ns 1.01
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 19117208.5 ns 19083437.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 19977084 ns 19960479.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 19957021 ns 19966834 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 36558729 ns 37142104 ns 0.98
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1005649 ns 1072087 ns 0.94
batchedmm(512, Bsize=4)/zygote/GPU/AMDGPU 1105996 ns 1035750.5 ns 1.07
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1750 ns 958 ns 1.83
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1834 ns 1000 ns 1.83
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1833 ns 1042 ns 1.76
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1834 ns 958 ns 1.91
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 23503 ns 23415 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/AMDGPU 197739 ns 200906 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4834 ns 3917 ns 1.23
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4958 ns 4000 ns 1.24
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4917 ns 4041 ns 1.22
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4916 ns 5458 ns 0.90
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 276337.5 ns 270573.5 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 502208 ns 486775 ns 1.03
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8062.5 ns 8687 ns 0.93
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 8416 ns 7459 ns 1.13
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9459 ns 9334 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 8145.5 ns 7834 ns 1.04
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 115989 ns 116220 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 71584 ns 71133 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 11562.5 ns 12125 ns 0.95
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 12438 ns 11958 ns 1.04
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 12541 ns 13000 ns 0.96
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 12875 ns 11750 ns 1.10
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 604320 ns 609643.5 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 353160 ns 341729 ns 1.03
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 250 ns 292 ns 0.86
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 333 ns 292 ns 1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 333 ns 292 ns 1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 292 ns 291 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 22648 ns 22413 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/AMDGPU 43592 ns 44053 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2917 ns 3000 ns 0.97
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2917 ns 2917 ns 1
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 3041 ns 3208 ns 0.95
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 3000 ns 2916 ns 1.03
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 197848 ns 194923.5 ns 1.02
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/AMDGPU 146363.5 ns 154488.5 ns 0.95
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 14604 ns 11625 ns 1.26
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 15458.5 ns 10500 ns 1.47
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 15896 ns 12875 ns 1.23
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 15000.5 ns 11875 ns 1.26
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 117481 ns 115370 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 236802 ns 231793 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 26500 ns 22667 ns 1.17
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 25625 ns 22104.5 ns 1.16
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 26041.5 ns 23625 ns 1.10
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 25958 ns 26729 ns 0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 561217 ns 555861 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 566814 ns 545740 ns 1.04
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4291 ns 4334 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4209 ns 4333 ns 0.97
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4208 ns 4208 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4375 ns 4250 ns 1.03
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 24363 ns 23923 ns 1.02
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/AMDGPU 44754 ns 44864 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16250 ns 16500 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16125 ns 16333 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16292 ns 16166 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16416 ns 16292 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 321227 ns 319806 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/AMDGPU 190786 ns 186077 ns 1.03
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5916 ns 2125 ns 2.78
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 5875 ns 2084 ns 2.82
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5792 ns 2209 ns 2.62
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5750 ns 2000 ns 2.88
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 34700.5 ns 35327 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 200434 ns 199242 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 22292 ns 17104 ns 1.30
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 21292 ns 20167 ns 1.06
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 21792 ns 19000 ns 1.15
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 22208 ns 23083.5 ns 0.96
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 283315.5 ns 284984 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 598489 ns 583431 ns 1.03
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 59729 ns 59458 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 64229 ns 65666 ns 0.98
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 66833 ns 66125 ns 1.01
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 50958 ns 52833 ns 0.96
batchedmm(16, Bsize=512)/forward/GPU/CUDA 66908 ns 66304 ns 1.01
batchedmm(16, Bsize=512)/forward/GPU/AMDGPU 115781 ns 110241 ns 1.05
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 198937.5 ns 153041 ns 1.30
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 144625 ns 155229 ns 0.93
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 167291.5 ns 130209 ns 1.28
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 303249.5 ns 286334 ns 1.06
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 208882.5 ns 210129.5 ns 0.99
batchedmm(16, Bsize=512)/zygote/GPU/AMDGPU 529218 ns 511145 ns 1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 84291 ns 106521 ns 0.79
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 83875 ns 78958 ns 1.06
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 88125 ns 84042 ns 1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 81562.5 ns 115521 ns 0.71
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 193291 ns 191513.5 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 182771 ns 267630 ns 0.68
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1875250 ns 1894896 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1914792 ns 1902375 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1928375 ns 1878334 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1916625 ns 1895250 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 505449 ns 507442 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 857542 ns 825763 ns 1.04
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 333 ns 291 ns 1.14
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 21535 ns 21516 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/AMDGPU 36788 ns 35507 ns 1.04
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1833 ns 1792 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1875 ns 1875 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1834 ns 1834 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1834 ns 1833 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 243998 ns 245735 ns 0.99
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/AMDGPU 166221 ns 164548 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 11229 ns 10916 ns 1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 9791.5 ns 8291 ns 1.18
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 11125 ns 11146 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 10479.5 ns 9500 ns 1.10
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 114440.5 ns 114788 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 233386 ns 232004 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10458 ns 8916 ns 1.17
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10250 ns 8854.5 ns 1.16
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9917 ns 10917 ns 0.91
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10145.5 ns 9583 ns 1.06
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 491014 ns 491693 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 561274 ns 536332 ns 1.05
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58375 ns 57958 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46917 ns 46625 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46625 ns 46750 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83708 ns 83166 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 38960 ns 38476.5 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 72876 ns 71814 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1897625 ns 1905145.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1964750 ns 1949542 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1985854 ns 1958500 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1899833 ns 1874958 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 212091 ns 212675 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 994598 ns 968925.5 ns 1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 266354 ns 267500 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 269729 ns 271479.5 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 271041.5 ns 271209 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 268271 ns 268209 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 193629.5 ns 194219.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 271156 ns 271267 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 693917 ns 585333.5 ns 1.19
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 692541 ns 600292 ns 1.15
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 687708 ns 671042 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 593833 ns 845604.5 ns 0.70
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 991006 ns 991966 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 863163 ns 831153 ns 1.04
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2180687.5 ns 2211666 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2214917 ns 2203958 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2212041 ns 2229083 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2208479 ns 2173792 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 154859 ns 161646 ns 0.96
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 451844.5 ns 470965 ns 0.96
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5453666 ns 5493104.5 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5518208 ns 5515875 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5522375 ns 5526542 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5522209 ns 6852458 ns 0.81
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 930442 ns 959137 ns 0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1495900 ns 1437405 ns 1.04
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 999875 ns 478292 ns 2.09
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 913333 ns 345625 ns 2.64
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 912895.5 ns 346750 ns 2.63
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 1334562.5 ns 908542 ns 1.47
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 46425 ns 46909 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/AMDGPU 399125 ns 393175 ns 1.02
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2620166 ns 2137500 ns 1.23
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 2328541 ns 1869334 ns 1.25
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 2329395.5 ns 1859271 ns 1.25
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3468667 ns 3380209 ns 1.03
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 247327 ns 264095.5 ns 0.94
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 658089 ns 632907.5 ns 1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58083 ns 57458 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46625 ns 46166 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46542 ns 46250 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 84000 ns 78667 ns 1.07
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 29007 ns 28560 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 73392 ns 73147 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2036000 ns 2029292 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2096916 ns 2078187.5 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2092208 ns 2063250 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1992542 ns 1963958 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 225482 ns 230846.5 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1028937.5 ns 980522 ns 1.05
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58417 ns 58083.5 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 47208 ns 46584 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47375 ns 46917 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83541 ns 79958 ns 1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 48550 ns 48944 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 71593.5 ns 71428.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1926354.5 ns 1871729 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1987291 ns 1973604 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1972375 ns 1944167 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1890375 ns 1876792 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 231977 ns 238010 ns 0.97
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 931260 ns 881607.5 ns 1.06
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 292 ns 292 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 292 ns 1.28
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 333 ns 291 ns 1.14
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 33752 ns 34878 ns 0.97
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 44343 ns 47028 ns 0.94
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6542 ns 6270.5 ns 1.04
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7187.5 ns 6187.5 ns 1.16
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7625 ns 7375 ns 1.03
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6209 ns 6125 ns 1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 203191.5 ns 211705.5 ns 0.96
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 350064 ns 332741 ns 1.05
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 250 ns 250 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 291 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 292 ns 250 ns 1.17
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 32755 ns 32902 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/AMDGPU 36558 ns 36327 ns 1.01
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 3375 ns 2667 ns 1.27
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 3333 ns 2667 ns 1.25
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 3000 ns 4292 ns 0.70
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 3208 ns 3167 ns 1.01
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 185298.5 ns 187662.5 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/AMDGPU 144480 ns 136635 ns 1.06
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1465479.5 ns 467208 ns 3.14
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1410667 ns 469417 ns 3.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1427770.5 ns 466875 ns 3.06
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1410417 ns 464979.5 ns 3.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 136084 ns 137312 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 354201 ns 361475 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5012687.5 ns 4027749.5 ns 1.24
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5023959 ns 4071500 ns 1.23
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5034167 ns 4067417 ns 1.24
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5021667 ns 5516750 ns 0.91
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 673868 ns 690445 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1145811 ns 1091915 ns 1.05
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 49876625 ns 49879250 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 35509791 ns 35487583 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 35514916 ns 35512833.5 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 97103375 ns 96974083 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1608361 ns 1622377 ns 0.99
batchedmm(512, Bsize=32)/forward/GPU/AMDGPU 1576726 ns 1579230 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 154443875 ns 154423062.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 112320833.5 ns 112364750 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 112445042 ns 112377416 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 296071750 ns 299989812 ns 0.99
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6483041.5 ns 6468945 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/AMDGPU 6222525 ns 7230228 ns 0.86
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 48042 ns 19104.5 ns 2.51
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 47667 ns 18375 ns 2.59
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 47916 ns 17375.5 ns 2.76
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 47583 ns 15083 ns 3.15
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 19626 ns 19621 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/AMDGPU 28463 ns 28854 ns 0.99
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 50583.5 ns 11062.5 ns 4.57
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 50167 ns 8833 ns 5.68
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 51000 ns 9291 ns 5.49
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 50667 ns 17667 ns 2.87
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 245482 ns 252067.5 ns 0.97
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/AMDGPU 140773 ns 138484 ns 1.02
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8667 ns 7937.5 ns 1.09
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 8750 ns 8125 ns 1.08
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 11167 ns 10375 ns 1.08
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 9666.5 ns 8708 ns 1.11
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 118847 ns 120230.5 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 237489 ns 235119 ns 1.01
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10791 ns 9708 ns 1.11
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10458 ns 9084 ns 1.15
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10333 ns 9792 ns 1.06
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10709 ns 10667 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 584310 ns 599437 ns 0.97
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 572469 ns 557070 ns 1.03
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 9125 ns 9291.5 ns 0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 9896 ns 8812.5 ns 1.12
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 10667 ns 9917 ns 1.08
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9292 ns 8958.5 ns 1.04
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 115727.5 ns 118821 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 73908 ns 71593 ns 1.03
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13874.5 ns 13687.5 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 13750 ns 13604.5 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 14333 ns 14395.5 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 14375.5 ns 14750 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 559680.5 ns 570663 ns 0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 337060 ns 323504 ns 1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 959 ns 542 ns 1.77
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 1042 ns 625 ns 1.67
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 1042 ns 584 ns 1.78
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 1083 ns 500 ns 2.17
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 33675 ns 35088 ns 0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 206546 ns 203871 ns 1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8917 ns 7562.5 ns 1.18
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8437.5 ns 7667 ns 1.10
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8791 ns 7875 ns 1.12
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9250 ns 8520.5 ns 1.09
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 225862.5 ns 227876 ns 0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 576667 ns 569945 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 23667 ns 16458 ns 1.44
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 23292 ns 17041 ns 1.37
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 23813 ns 16209 ns 1.47
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 23666 ns 10979 ns 2.16
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 20529 ns 20941 ns 0.98
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/AMDGPU 187811 ns 182992 ns 1.03
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 53583.5 ns 35666 ns 1.50
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 52145.5 ns 35167 ns 1.48
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 53584 ns 36000 ns 1.49
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 53667 ns 57833 ns 0.93
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 260507 ns 265749 ns 0.98
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 549086 ns 534293 ns 1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1444541.5 ns 447500 ns 3.23
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1445459 ns 488042 ns 2.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1414666.5 ns 455709 ns 3.10
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1401396 ns 496916 ns 2.82
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 195236 ns 195513 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 321861 ns 328714 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5007208 ns 4024209 ns 1.24
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5006958 ns 4055021 ns 1.23
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5015812.5 ns 4053917 ns 1.24
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5020500 ns 5501562.5 ns 0.91
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 510108 ns 521631.5 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1117899 ns 1059038 ns 1.06
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 828285625 ns 836727208 ns 0.99
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 541921375 ns 553913292 ns 0.98
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 542359625 ns 540736625 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 1558200021 ns 1517196875 ns 1.03
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22535776.5 ns 22767789 ns 0.99
batchedmm(512, Bsize=512)/forward/GPU/AMDGPU 12173703 ns 10331681 ns 1.18
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 3903695416 ns 3773348667 ns 1.03
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 1771980416 ns 1782084291 ns 0.99
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 1773568584 ns 1780399750 ns 1.00
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 5228367459 ns 4786718666 ns 1.09
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 119027931 ns 118657187 ns 1.00
batchedmm(512, Bsize=512)/zygote/GPU/AMDGPU 68450588 ns 67063298 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 75916.5 ns 76542 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 87437.5 ns 76584 ns 1.14
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 84417 ns 79583 ns 1.06
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 81083 ns 76708.5 ns 1.06
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 192111.5 ns 195943.5 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 126607 ns 123300.5 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 282646 ns 191292 ns 1.48
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 283042 ns 252042 ns 1.12
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 236875 ns 199562.5 ns 1.19
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 276458 ns 225542 ns 1.23
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 995625 ns 1004442 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 612404 ns 590764 ns 1.04
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 199947208.5 ns 199694520.5 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 139420500 ns 138856500 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 138954958 ns 139241166 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 389188834 ns 393790959 ns 0.99
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5832800 ns 5842492 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/AMDGPU 2958637.5 ns 4746717.5 ns 0.62
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 618298396 ns 617676375.5 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 439277916 ns 439446917 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 439303895.5 ns 439765166.5 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 1200068000 ns 1174222000 ns 1.02
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 26614249.5 ns 26723523 ns 1.00
batchedmm(512, Bsize=128)/zygote/GPU/AMDGPU 16011697.5 ns 15854720 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7417 ns 7292 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6125 ns 6125 ns 1
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6125 ns 5959 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10125 ns 9834 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 26885 ns 26896.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 54341 ns 55173 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 214083 ns 213041.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 232833 ns 227729 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 230000 ns 220416.5 ns 1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 207709 ns 206125 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 215596 ns 219868 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 546726.5 ns 541982 ns 1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 7417 ns 8521 ns 0.87
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 8875.5 ns 7458 ns 1.19
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 10750 ns 11167 ns 0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 10459 ns 9250 ns 1.13
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 111291 ns 115361 ns 0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 72956 ns 74069 ns 0.98
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7792 ns 7562.5 ns 1.03
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7833.5 ns 7958 ns 0.98
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8125 ns 8167 ns 0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8375 ns 7395.5 ns 1.13
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 492517.5 ns 495697 ns 0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 322723 ns 309298 ns 1.04
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 417 ns 417 ns 1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 500 ns 459 ns 1.09
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 459 ns 500 ns 0.92
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 583 ns 375 ns 1.55
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 25272 ns 26124 ns 0.97
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 45194 ns 45334 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9646 ns 9584 ns 1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9541 ns 9062.5 ns 1.05
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 11104 ns 9792 ns 1.13
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 10333 ns 9542 ns 1.08
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 247083 ns 247606 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 383457 ns 382304 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 351000 ns 112312.5 ns 3.13
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 354459 ns 103229 ns 3.43
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 352250 ns 104104.5 ns 3.38
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 351625 ns 155083 ns 2.27
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 23168 ns 23501 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/AMDGPU 198701 ns 192539 ns 1.03
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 826000 ns 536562 ns 1.54
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 820458 ns 554250 ns 1.48
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 822083.5 ns 535291.5 ns 1.54
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 827750 ns 910854 ns 0.91
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 214195.5 ns 221242 ns 0.97
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 578901 ns 560216.5 ns 1.03
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 5229.5 ns 5416.5 ns 0.97
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 5875 ns 6208.5 ns 0.95
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 6958.5 ns 6021 ns 1.16
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 4667 ns 4000 ns 1.17
batchedmm(16, Bsize=32)/forward/GPU/CUDA 17091 ns 17520 ns 0.98
batchedmm(16, Bsize=32)/forward/GPU/AMDGPU 74219 ns 73648 ns 1.01
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 13458.5 ns 11562.5 ns 1.16
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 10625 ns 11062 ns 0.96
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 13041 ns 11000 ns 1.19
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 18542 ns 16666 ns 1.11
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 202239.5 ns 207455.5 ns 0.97
batchedmm(16, Bsize=32)/zygote/GPU/AMDGPU 330217 ns 330387 ns 1.00
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 39833.5 ns 39667 ns 1.00
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 51209 ns 51291 ns 1.00
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 52458.5 ns 52958.5 ns 0.99
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 13459 ns 13625 ns 0.99
batchedmm(16, Bsize=128)/forward/GPU/CUDA 19993 ns 20356 ns 0.98
batchedmm(16, Bsize=128)/forward/GPU/AMDGPU 99666.5 ns 98364 ns 1.01
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 38229.5 ns 36375.5 ns 1.05
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 35125 ns 31417 ns 1.12
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 34187.5 ns 31229.5 ns 1.09
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 59417 ns 57000 ns 1.04
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 178995.5 ns 184178 ns 0.97
batchedmm(16, Bsize=128)/zygote/GPU/AMDGPU 362888 ns 355254 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 3500 ns 1750 ns 2
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 3667 ns 2042 ns 1.80
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 3833 ns 2208 ns 1.74
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 3709 ns 1875 ns 1.98
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 19015 ns 19575 ns 0.97
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/AMDGPU 29645 ns 29099.5 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 4291 ns 2208 ns 1.94
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 4500 ns 2167 ns 2.08
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 4458 ns 2375 ns 1.88
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 4292 ns 2208 ns 1.94
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 194611 ns 198996.5 ns 0.98
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/AMDGPU 126757 ns 128571 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5916 ns 4583 ns 1.29
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5062.5 ns 4417 ns 1.15
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6375 ns 6729 ns 0.95
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4625 ns 3958 ns 1.17
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 138395 ns 143699.5 ns 0.96
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 65944 ns 61955.5 ns 1.06
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9625 ns 8334 ns 1.15
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8500 ns 8083.5 ns 1.05
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9333 ns 8709 ns 1.07
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10666 ns 8583 ns 1.24
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 807046.5 ns 836045.5 ns 0.97
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 378457 ns 364891 ns 1.04
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 207583 ns 54833 ns 3.79
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 209042 ns 55833 ns 3.74
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 213208 ns 55583 ns 3.84
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 204125 ns 56000 ns 3.65
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 35332 ns 36570 ns 0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 203930.5 ns 202568 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 603500 ns 476729 ns 1.27
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 623479.5 ns 494500 ns 1.26
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 658604.5 ns 494208 ns 1.33
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 586375 ns 641625 ns 0.91
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 254148 ns 259886 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 767213 ns 705894 ns 1.09
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 3324167 ns 3310333 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 2328667 ns 2334062.5 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 2334417 ns 2333375 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 6324542 ns 6300479 ns 1.00
batchedmm(128, Bsize=128)/forward/GPU/CUDA 206559 ns 204581.5 ns 1.01
batchedmm(128, Bsize=128)/forward/GPU/AMDGPU 377105 ns 373097 ns 1.01
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 11496208.5 ns 11459729 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 8303562.5 ns 8305729.5 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 8348416.5 ns 8342854 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 21193020.5 ns 21088292 ns 1.00
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 736080.5 ns 744676 ns 0.99
batchedmm(128, Bsize=128)/zygote/GPU/AMDGPU 2044820.5 ns 1994797.5 ns 1.03
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3917 ns 4833 ns 0.81
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5292 ns 4646 ns 1.14
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6292 ns 7520.5 ns 0.84
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 7125 ns 4917 ns 1.45
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 129442 ns 133339 ns 0.97
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 57067 ns 61520 ns 0.93
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8500 ns 7083 ns 1.20
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7375 ns 7291.5 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7833 ns 7500 ns 1.04
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8291.5 ns 7416.5 ns 1.12
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 711410 ns 725863 ns 0.98
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 364581 ns 353680 ns 1.03
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 117312.5 ns 100459 ns 1.17
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 101437.5 ns 123042 ns 0.82
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 102687.5 ns 102417 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 98458.5 ns 121458.5 ns 0.81
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 149616 ns 151940.5 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 210473 ns 233346 ns 0.90
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2008250 ns 2033271 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2022459 ns 2026417 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2039937.5 ns 1997458.5 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2036625 ns 2041833 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 661994.5 ns 678763 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 963831 ns 931831 ns 1.03
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 33416 ns 32666 ns 1.02
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 35459 ns 36562.5 ns 0.97
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 34709 ns 36167 ns 0.96
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 750 ns 667 ns 1.12
batchedmm(2, Bsize=4)/forward/GPU/CUDA 15265 ns 15627 ns 0.98
batchedmm(2, Bsize=4)/forward/GPU/AMDGPU 78737 ns 70121 ns 1.12
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 3959 ns 2604.5 ns 1.52
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 2917 ns 2958 ns 0.99
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 4708 ns 2937.5 ns 1.60
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 3666 ns 2167 ns 1.69
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 136137.5 ns 139744 ns 0.97
batchedmm(2, Bsize=4)/zygote/GPU/AMDGPU 321796.5 ns 289641 ns 1.11
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7250 ns 7208 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6042 ns 6000 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6083 ns 5916 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10042 ns 9917 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 34970 ns 35855 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 56516 ns 53911 ns 1.05
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 221584 ns 212958.5 ns 1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 220959 ns 222708 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 234583 ns 219917 ns 1.07
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 207333 ns 206209 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 237194 ns 243430 ns 0.97
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 540189 ns 513269 ns 1.05
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3750 ns 3750 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3750 ns 3750 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3833 ns 3750 ns 1.02
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3958 ns 3791 ns 1.04
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 21681 ns 21959 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/AMDGPU 39383 ns 35557 ns 1.11
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14458 ns 14500 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 14458 ns 14500 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14541 ns 14500 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14625 ns 14459 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 297631.5 ns 302419 ns 0.98
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/AMDGPU 190215 ns 179841 ns 1.06
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 129834 ns 128041 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 118271 ns 144417 ns 0.82
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 106750 ns 106917 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 101666.5 ns 151959 ns 0.67
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 150106 ns 140874 ns 1.07
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 241781 ns 236762 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1921708.5 ns 1924583 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1924583 ns 1920500 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1932000 ns 1914229.5 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1922750 ns 1928875 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 653385 ns 673452 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 928325 ns 899671 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 18875 ns 17333 ns 1.09
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17292 ns 17354.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 20937 ns 21208 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 18459 ns 17375 ns 1.06
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 104073.5 ns 108833.5 ns 0.96
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 91301 ns 91100 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 239083.5 ns 216917 ns 1.10
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 224791 ns 252646 ns 0.89
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 224958.5 ns 222166 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 218500 ns 229125 ns 0.95
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 493640.5 ns 508535.5 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 439080 ns 419764 ns 1.05
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 26166 ns 24271 ns 1.08
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 29167 ns 30791.5 ns 0.95
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 28958 ns 29437.5 ns 0.98
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 1416 ns 1584 ns 0.89
batchedmm(16, Bsize=4)/forward/GPU/CUDA 15781 ns 16398 ns 0.96
batchedmm(16, Bsize=4)/forward/GPU/AMDGPU 72756 ns 76093 ns 0.96
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 6208 ns 4500 ns 1.38
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 5041 ns 4916 ns 1.03
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 6875 ns 5125 ns 1.34
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 6417 ns 4625 ns 1.39
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 199155.5 ns 204364 ns 0.97
batchedmm(16, Bsize=4)/zygote/GPU/AMDGPU 324216 ns 331675 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 221875 ns 222666 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 223375 ns 220666.5 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 225375 ns 225667 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 223542 ns 220583 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 216803 ns 222506.5 ns 0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 267771 ns 267871 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 508542 ns 495084 ns 1.03
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 511042 ns 511812.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 509500 ns 500854 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 557354 ns 675750 ns 0.82
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1017707.5 ns 1053634 ns 0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 811461 ns 780999 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 19104 ns 20375 ns 0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 19584 ns 20000 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 22063 ns 23875 ns 0.92
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 19792 ns 18792 ns 1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 111072 ns 114286 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 90009 ns 89858 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 221854 ns 212375 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 220250 ns 213041 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 218166.5 ns 214458 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 220146 ns 212541 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 700847.5 ns 727333.5 ns 0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 494855 ns 469036 ns 1.06
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6292 ns 6666 ns 0.94
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 7000 ns 6604.5 ns 1.06
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7375 ns 8750.5 ns 0.84
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6834 ns 6208 ns 1.10
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 130925 ns 137142 ns 0.95
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 63498 ns 60974 ns 1.04
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 11041.5 ns 9791 ns 1.13
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9959 ns 10084 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10895.5 ns 10750 ns 1.01
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10459 ns 10750 ns 0.97
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 770540.5 ns 794651.5 ns 0.97
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 375452 ns 370101.5 ns 1.01
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4104 ns 4666 ns 0.88
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 7041 ns 4708 ns 1.50
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7166 ns 7437.5 ns 0.96
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6166 ns 4917 ns 1.25
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 131485.5 ns 138544.5 ns 0.95
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 62607 ns 59692 ns 1.05
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7416.5 ns 7458 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7750 ns 7166 ns 1.08
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8125 ns 7791 ns 1.04
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8083 ns 7708 ns 1.05
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 737449 ns 755761 ns 0.98
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 380902 ns 376523 ns 1.01
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 14481917 ns 14498417 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 10107542 ns 10124125 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 10094750 ns 10094833 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 27859959 ns 27748583.5 ns 1.00
batchedmm(128, Bsize=512)/forward/GPU/CUDA 533975 ns 532665 ns 1.00
batchedmm(128, Bsize=512)/forward/GPU/AMDGPU 867906.5 ns 866850 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 46387667 ns 46333437 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 33363354 ns 33447541.5 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 33478875 ns 33510458 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 85752792 ns 85445667 ns 1.00
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2651799 ns 2636151 ns 1.01
batchedmm(128, Bsize=512)/zygote/GPU/AMDGPU 5191497.5 ns 5189385.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 185208.5 ns 66458 ns 2.79
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 185916 ns 65687.5 ns 2.83
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 188604 ns 70500 ns 2.68
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 187271 ns 66500 ns 2.82
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 117719.5 ns 118172.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 236051 ns 237313 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 634875 ns 467958 ns 1.36
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 627937.5 ns 480333.5 ns 1.31
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 601166 ns 474916.5 ns 1.27
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 587625 ns 686583.5 ns 0.86
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 694993 ns 715446 ns 0.97
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 698169.5 ns 655875 ns 1.06
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 541 ns 542 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 625 ns 625 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 584 ns 583 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 584 ns 500 ns 1.17
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 31826 ns 32877 ns 0.97
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 48104.5 ns 47579 ns 1.01
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9541 ns 8750 ns 1.09
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9687.5 ns 9208 ns 1.05
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10542 ns 9104.5 ns 1.16
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10938 ns 9750 ns 1.12
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 276120 ns 280778.5 ns 0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 371078 ns 355484 ns 1.04
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 26250 ns 9500 ns 2.76
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 26333 ns 9500 ns 2.77
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 26583 ns 9500 ns 2.80
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 26458 ns 9500 ns 2.79
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 22942 ns 23273 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/AMDGPU 206526 ns 200655 ns 1.03
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 67125 ns 50209 ns 1.34
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 67333 ns 50250 ns 1.34
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 68792 ns 50500 ns 1.36
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 66875 ns 72375 ns 0.92
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 273858 ns 278469.5 ns 0.98
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 554115 ns 491037 ns 1.13
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 207166 ns 54917 ns 3.77
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 211667 ns 55667 ns 3.80
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 211167 ns 55584 ns 3.80
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 202875 ns 56000 ns 3.62
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 27563 ns 28169 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 206546 ns 203240 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 609937.5 ns 518854 ns 1.18
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 669750 ns 500625 ns 1.34
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 664812.5 ns 497750 ns 1.34
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 609042 ns 643417 ns 0.95
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 233231.5 ns 238777 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 798562 ns 758938 ns 1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 664875 ns 655042 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 636687.5 ns 613083 ns 1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 648791.5 ns 652541 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 629792 ns 678416.5 ns 0.93
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 185894.5 ns 192069 ns 0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 349393 ns 269704 ns 1.30
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2244229 ns 2167104.5 ns 1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2225354 ns 2233125 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2256708 ns 2241292 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2271792 ns 2230208.5 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 900927 ns 929752.5 ns 0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1235829 ns 1217770.5 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 19333 ns 19500 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 21166.5 ns 19208.5 ns 1.10
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 22375 ns 23542 ns 0.95
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 19958 ns 20000 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 106770.5 ns 111306 ns 0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 89387 ns 91551 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 227250 ns 220459 ns 1.03
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 262312.5 ns 226458 ns 1.16
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 231250 ns 223104.5 ns 1.04
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 222770.5 ns 219708 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 700957 ns 714110 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 516550 ns 487481 ns 1.06
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 500 ns 625 ns 0.80
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 584 ns 583 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 584 ns 584 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 584 ns 500 ns 1.17
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 22928 ns 23491 ns 0.98
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 44243 ns 43771 ns 1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9583 ns 9417 ns 1.02
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9958.5 ns 9291.5 ns 1.07
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 13229.5 ns 9708 ns 1.36
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 10875 ns 9646 ns 1.13
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 258192 ns 261581 ns 0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 395479 ns 381618 ns 1.04
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 8062.5 ns 8917 ns 0.90
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 9208 ns 7583 ns 1.21
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 10459 ns 11854.5 ns 0.88
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 8333 ns 9042 ns 0.92
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 112863.5 ns 115935.5 ns 0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 72315 ns 70456.5 ns 1.03
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7500 ns 8125 ns 0.92
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7750 ns 7542 ns 1.03
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 14875 ns 8000 ns 1.86
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8917 ns 7292 ns 1.22
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 472419 ns 484010 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 321811 ns 302215 ns 1.06
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1979.5 ns 1417 ns 1.40
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 2500 ns 1667 ns 1.50
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 2542 ns 1959 ns 1.30
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 2416 ns 1500 ns 1.61
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 19845 ns 20030 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/AMDGPU 191508 ns 184144 ns 1.04
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 6666 ns 3708 ns 1.80
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 6459 ns 3625 ns 1.78
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 7292 ns 3833 ns 1.90
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 7292 ns 4917 ns 1.48
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 208409 ns 213101.5 ns 0.98
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 543621 ns 524324.5 ns 1.04
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 754167 ns 148729 ns 5.07
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 751000 ns 128917 ns 5.83
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 749375 ns 129917 ns 5.77
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 747104 ns 235541 ns 3.17
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 22303 ns 22778 ns 0.98
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/AMDGPU 47829 ns 46868 ns 1.02
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 792250 ns 143645.5 ns 5.52
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 811750 ns 130875 ns 6.20
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 789500 ns 138417 ns 5.70
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 794229.5 ns 290021 ns 2.74
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 206590.5 ns 211960 ns 0.97
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/AMDGPU 233541 ns 223578 ns 1.04
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7250 ns 7167 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5917 ns 5958 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6000 ns 5958.5 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10209 ns 10000 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 32976 ns 33236 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 57267 ns 57207 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 228458.5 ns 221249.5 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 269270.5 ns 238542 ns 1.13
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 235021 ns 264500 ns 0.89
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 213146 ns 213250 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 254662 ns 259447 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 552652 ns 530542 ns 1.04
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 12417 ns 13209 ns 0.94
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 13250 ns 12166 ns 1.09
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 14458 ns 13584 ns 1.06
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 13000 ns 12667 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 131273.5 ns 135078 ns 0.97
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 231363 ns 227730.5 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 24854.5 ns 23917 ns 1.04
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24916 ns 24083.5 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 25542 ns 24750 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 24458 ns 30146 ns 0.81
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 813324 ns 833527 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 634495 ns 615374.5 ns 1.03
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 8875 ns 9271 ns 0.96
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 9958 ns 9541 ns 1.04
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 11167 ns 10375 ns 1.08
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 9542 ns 9250 ns 1.03
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 116553 ns 119628 ns 0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 74930 ns 74940 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 13770.5 ns 14041 ns 0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14917 ns 13958 ns 1.07
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15916 ns 14750 ns 1.08
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 16437.5 ns 13459 ns 1.22
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 621843 ns 638262 ns 0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 356836 ns 344824 ns 1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 9145.5 ns 9666.5 ns 0.95
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 9354 ns 9208 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 10750 ns 10959 ns 0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 10125 ns 9083.5 ns 1.11
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 116468 ns 118521 ns 0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 74383.5 ns 79399 ns 0.94
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12916 ns 13416 ns 0.96
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12959 ns 12416 ns 1.04
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 20541 ns 13479.5 ns 1.52
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 14500 ns 12708 ns 1.14
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 515709 ns 530027 ns 0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 328534 ns 317163 ns 1.04
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 31062 ns 30896 ns 1.01
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 33146 ns 33813 ns 0.98
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 30750 ns 32249.5 ns 0.95
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 1833 ns 1875 ns 0.98
batchedmm(2, Bsize=128)/forward/GPU/CUDA 16169 ns 16425 ns 0.98
batchedmm(2, Bsize=128)/forward/GPU/AMDGPU 77564 ns 76663 ns 1.01
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 5562.5 ns 5417 ns 1.03
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 5312.5 ns 5000 ns 1.06
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 7208 ns 5479.5 ns 1.32
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 7834 ns 6270.5 ns 1.25
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 134922 ns 138278 ns 0.98
batchedmm(2, Bsize=128)/zygote/GPU/AMDGPU 340125 ns 340566 ns 1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 292 ns 333 ns 0.88
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 375 ns 291 ns 1.29
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 24307 ns 25574 ns 0.95
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 45845 ns 45666 ns 1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6166.5 ns 6458 ns 0.95
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6708 ns 6375 ns 1.05
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8167 ns 6791.5 ns 1.20
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 7083 ns 6458.5 ns 1.10
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 179926.5 ns 185923.5 ns 0.97
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 372385.5 ns 365402.5 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5834 ns 2084 ns 2.80
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 5833 ns 2084 ns 2.80
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 5875 ns 2083 ns 2.82
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 5958 ns 2000 ns 2.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 25187 ns 26453 ns 0.95
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 201636 ns 203645.5 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 21041 ns 18041 ns 1.17
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 21709 ns 17166.5 ns 1.26
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 23458 ns 17750 ns 1.32
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26125 ns 23458.5 ns 1.11
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 262884 ns 268326 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 615780.5 ns 600702.5 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 192083.5 ns 147875 ns 1.30
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 158917 ns 155437.5 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 154416.5 ns 155125 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 146417 ns 151708 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 184640 ns 190890.5 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 215472.5 ns 271146.5 ns 0.79
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1319792 ns 1321937.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1328249.5 ns 1330625 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1347250 ns 1308375 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1337000 ns 1285166 ns 1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 844907 ns 867140 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1041340 ns 1006962 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 24292 ns 25500 ns 0.95
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 24916 ns 23542 ns 1.06
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 28000 ns 28708.5 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 24833.5 ns 24416.5 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 224694.5 ns 226899 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 130334 ns 128029 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 117583 ns 125062.5 ns 0.94
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 131375 ns 165729.5 ns 0.79
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 160499.5 ns 125854.5 ns 1.28
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 164750 ns 180062 ns 0.91
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 967206 ns 998018.5 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 585053 ns 568743 ns 1.03
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 250 ns 375 ns 0.67
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 334 ns 375 ns 0.89
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 375 ns 250 ns 1.50
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 22932 ns 23453 ns 0.98
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 47870 ns 44533 ns 1.07
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6292 ns 6895.5 ns 0.91
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6833 ns 6458 ns 1.06
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9416 ns 6958 ns 1.35
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 7500 ns 6520.5 ns 1.15
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 196587.5 ns 201834 ns 0.97
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 380031 ns 372536 ns 1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5875 ns 5645.5 ns 1.04
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6292 ns 5375 ns 1.17
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7187.5 ns 7979 ns 0.90
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 6562 ns 5166 ns 1.27
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 134586 ns 139838.5 ns 0.96
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 230170 ns 229750 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9833 ns 9958 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10000 ns 10042 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 11187.5 ns 10417 ns 1.07
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 11083 ns 10854.5 ns 1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 840176 ns 866511 ns 0.97
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 631290 ns 603858 ns 1.05
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1542 ns 708 ns 2.18
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1625 ns 708 ns 2.30
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1625 ns 750 ns 2.17
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1625 ns 667 ns 2.44
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 22272 ns 22827 ns 0.98
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/AMDGPU 204933 ns 202368 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 5750 ns 4834 ns 1.19
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 6125 ns 4833 ns 1.27
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 6417 ns 5125 ns 1.25
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 5875 ns 6291 ns 0.93
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 216977 ns 222098 ns 0.98
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 491814.5 ns 471721 ns 1.04
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8250 ns 8750 ns 0.94
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 8562.5 ns 7834 ns 1.09
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9895.5 ns 9375 ns 1.06
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 9209 ns 7646 ns 1.20
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 115063 ns 117939.5 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 73999 ns 74409 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8167 ns 8792 ns 0.93
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9250 ns 8583 ns 1.08
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9833.5 ns 8875 ns 1.11
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10333 ns 8083 ns 1.28
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 548589 ns 568724.5 ns 0.96
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 340367 ns 335106 ns 1.02
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 127271 ns 126042 ns 1.01
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 128750 ns 129208 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 131062 ns 129542 ns 1.01
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 181979.5 ns 180792 ns 1.01
batchedmm(128, Bsize=4)/forward/GPU/CUDA 46303.5 ns 46423 ns 1.00
batchedmm(128, Bsize=4)/forward/GPU/AMDGPU 102121 ns 101850 ns 1.00
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 338125 ns 315875 ns 1.07
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 339792 ns 334166.5 ns 1.02
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 346083 ns 323291.5 ns 1.07
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 595417 ns 609395.5 ns 0.98
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 181951 ns 187684 ns 0.97
batchedmm(128, Bsize=4)/zygote/GPU/AMDGPU 410627.5 ns 405833.5 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 397708 ns 397500 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 288375 ns 287979.5 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 287937.5 ns 288375 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 756708 ns 756000 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 43092 ns 43964 ns 0.98
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/AMDGPU 85671 ns 79439 ns 1.08
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1456291.5 ns 1461000 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 1133125 ns 1133834 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 1127937.5 ns 1129645.5 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2360208 ns 2449292 ns 0.96
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 248595.5 ns 254140 ns 0.98
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/AMDGPU 266317 ns 254646 ns 1.05
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 643479.5 ns 626500 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 654166 ns 657208.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 652750 ns 649750.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 650625 ns 642417 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 172424.5 ns 185720.5 ns 0.93
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 315089 ns 264649 ns 1.19
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2449417 ns 2452625 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2455020.5 ns 2465208.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2465625 ns 2459375 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2469208.5 ns 2376375 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 922065 ns 949649 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1363193.5 ns 1323598 ns 1.03
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 32917 ns 32458 ns 1.01
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 35374.5 ns 36521 ns 0.97
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 34417 ns 34833 ns 0.99
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 1000 ns 959 ns 1.04
batchedmm(2, Bsize=32)/forward/GPU/CUDA 15534 ns 15902 ns 0.98
batchedmm(2, Bsize=32)/forward/GPU/AMDGPU 78366 ns 74499.5 ns 1.05
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 2937.5 ns 3125 ns 0.94
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 3375 ns 3250 ns 1.04
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 5208 ns 3375 ns 1.54
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 4625 ns 3062.5 ns 1.51
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 133935.5 ns 137187.5 ns 0.98
batchedmm(2, Bsize=32)/zygote/GPU/AMDGPU 318886 ns 314258 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1464209 ns 436500 ns 3.35
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1500333 ns 438625 ns 3.42
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1501333 ns 438791 ns 3.42
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1442563 ns 445917 ns 3.24
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 41738 ns 42826 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 318625 ns 374379.5 ns 0.85
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5128625 ns 4140000 ns 1.24
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5291041 ns 4271375 ns 1.24
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5297084 ns 4270687.5 ns 1.24
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4998791.5 ns 5468750 ns 0.91
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 230499.5 ns 236201.5 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1198280 ns 1135862 ns 1.05
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3709 ns 3750 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3750 ns 3791 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3750 ns 3750 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3916 ns 3709 ns 1.06
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 33583 ns 34158 ns 0.98
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/AMDGPU 36778.5 ns 41117 ns 0.89
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15417 ns 15375 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15500 ns 15334 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 15791 ns 15500 ns 1.02
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 16000 ns 15250 ns 1.05
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 252278 ns 255579 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/AMDGPU 161662 ns 158606 ns 1.02
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 404625 ns 404792 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 296000 ns 295917 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 295916 ns 295958 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 760625 ns 759750 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 113161.5 ns 113245 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/AMDGPU 95859 ns 91962 ns 1.04
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1479249.5 ns 1482854 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 1158584 ns 1158625 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 1160500 ns 1150334 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2383354 ns 2466708 ns 0.97
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 228888 ns 236768.5 ns 0.97
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/AMDGPU 265922 ns 298578 ns 0.89
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 958 ns 584 ns 1.64
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 1042 ns 625 ns 1.67
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 1042 ns 584 ns 1.78
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 1083 ns 542 ns 2.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 24404 ns 25569 ns 0.95
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 207859 ns 202679 ns 1.03
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7917 ns 8083 ns 0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8542 ns 7792 ns 1.10
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9917 ns 8375 ns 1.18
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 12895.5 ns 8437.5 ns 1.53
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 202191 ns 207068.5 ns 0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 620871 ns 593474 ns 1.05
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 835834 ns 829375 ns 1.01
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 615542 ns 617667 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 617791.5 ns 618667 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 1549375 ns 1544417 ns 1.00
batchedmm(128, Bsize=32)/forward/GPU/CUDA 130350.5 ns 130866 ns 1.00
batchedmm(128, Bsize=32)/forward/GPU/AMDGPU 215532 ns 211214 ns 1.02
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 2690375 ns 2686104.5 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 2000479.5 ns 1994542 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 2007416.5 ns 1998375 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 4941104 ns 4960479 ns 1.00
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 232712 ns 234509 ns 0.99
batchedmm(128, Bsize=32)/zygote/GPU/AMDGPU 872871.5 ns 831293.5 ns 1.05
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 291 ns 292 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 375 ns 250 ns 1.50
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 31625 ns 32562 ns 0.97
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 47950 ns 48691 ns 0.98
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6084 ns 6333 ns 0.96
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6708 ns 6375 ns 1.05
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7666 ns 6667 ns 1.15
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8083 ns 6104.5 ns 1.32
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 221856.5 ns 227701 ns 0.97
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 352319 ns 346728 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1741791.5 ns 1760625 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1752167 ns 1749875 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1739042 ns 1744292 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1719916 ns 1755166 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 183055.5 ns 189332 ns 0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 415606.5 ns 413433 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4361125 ns 4360416 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4365916.5 ns 4366917 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4399333 ns 4349104 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4394333 ns 5705104 ns 0.77
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 827645.5 ns 849205 ns 0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1239667.5 ns 1205562.5 ns 1.03
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 7083 ns 9604 ns 0.74
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 7395.5 ns 6916 ns 1.07
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 7041 ns 8208 ns 0.86
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 6854.5 ns 6854 ns 1.00
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 22223.5 ns 22924.5 ns 0.97
bias_activation(512, act=relu)(512 x 128)/forward/GPU/AMDGPU 47178 ns 46437 ns 1.02
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 45292 ns 50604.5 ns 0.90
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 51167 ns 52166 ns 0.98
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 49250 ns 45458.5 ns 1.08
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 49437 ns 33312.5 ns 1.48
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 204846 ns 211538 ns 0.97
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/AMDGPU 235841 ns 226508 ns 1.04
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 22125 ns 21646 ns 1.02
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 25125 ns 26083.5 ns 0.96
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 24833 ns 24958.5 ns 0.99
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 5458.5 ns 5291.5 ns 1.03
batchedmm(2, Bsize=512)/forward/GPU/CUDA 17859 ns 18121 ns 0.99
batchedmm(2, Bsize=512)/forward/GPU/AMDGPU 82154 ns 73668 ns 1.12
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 11792 ns 12125 ns 0.97
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 10750 ns 10667 ns 1.01
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 12583 ns 10833 ns 1.16
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 19708.5 ns 18042 ns 1.09
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 216235 ns 221707 ns 0.98
batchedmm(2, Bsize=512)/zygote/GPU/AMDGPU 331099 ns 322703 ns 1.03
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 406250 ns 405917 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 297333 ns 296791.5 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 296833.5 ns 297167 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 762833 ns 756709 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 46303.5 ns 46696 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/AMDGPU 97252 ns 90770 ns 1.07
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1477458 ns 1487375 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 1164395.5 ns 1163500 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 1164416 ns 1157209 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2386333 ns 2472417 ns 0.97
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 268961 ns 283340.5 ns 0.95
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/AMDGPU 282959 ns 269032 ns 1.05
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1488416 ns 436458 ns 3.41
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1526958 ns 443270.5 ns 3.44
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1529250 ns 440750 ns 3.47
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1466395.5 ns 449000 ns 3.27
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 52650 ns 53940 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 326982 ns 323133 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5119459 ns 4138541 ns 1.24
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5285084 ns 4268354.5 ns 1.24
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5297709 ns 4258750 ns 1.24
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4955208 ns 5475229.5 ns 0.91
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 250192 ns 255597 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1186136 ns 1132896.5 ns 1.05
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 28292 ns 9333 ns 3.03
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 28292 ns 8000 ns 3.54
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 28333 ns 8000 ns 3.54
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 28417 ns 13250 ns 2.14
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 23514.5 ns 23885 ns 0.98
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/AMDGPU 207227 ns 202528 ns 1.02
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 66542 ns 49625 ns 1.34
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 66750 ns 49667 ns 1.34
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 66500 ns 49583 ns 1.34
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 66208 ns 71667 ns 0.92
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 333506.5 ns 336641 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 576948.5 ns 508895.5 ns 1.13
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 124875 ns 108270.5 ns 1.15
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 81875 ns 86167 ns 0.95
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 89166 ns 86500 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 86750 ns 146083 ns 0.59
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 191648 ns 192063 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 233116 ns 267851 ns 0.87
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2025145.5 ns 2018917 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2021978.5 ns 2016937.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2030542 ns 2011375 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1995125 ns 2024000.5 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 506195 ns 511598 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 881973 ns 860237 ns 1.03

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.