-
Notifications
You must be signed in to change notification settings - Fork 69
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
amd gpu give different results when nested loop is used #517
Comments
Can you also print the output? |
The output is cpu: [18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0;;;]
cuda: [18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0;;;]
amd: [6.0; 6.0; 6.0; 6.0; 6.0; 6.0; 6.0; 6.0; 6.0; 6.0;;;]
|
Yeah that's concerning... @pxl-th any ideas? |
AMDGPU on Windows, right? Could one accumulate the sum differently to discern whether the p loop or the q loop is not being executed? |
An error occurs if the kernel function changes to @kernel function kernel_xx!(tensor, Nx::Int64, Ny::Int64, Nz::Int64)
i, j, k = @index(Global, NTuple)
sum = zero(eltype(tensor))
for p in (-Nx):Nx
for q in (-Ny):Ny
sum += 2.0
end
end
@inbounds tensor[i, j, k] = sum
end the error msg:
|
Testing on an RX 7800 XT under Ubuntu 22.04, I can reproduce the issue. The following does not error for me, though: @kernel function kernel_xx!(tensor, Nx::Int64, Ny::Int64, Nz::Int64)
i, j, k = @index(Global, NTuple)
sum = zero(eltype(tensor))
for p in (-Nx):Nx
for q in (-Ny):Ny
sum += 2.0
end
end
@inbounds tensor[i, j, k] = sum
end and produces the same wrong results as the original version. From testing, it seems that @kernel function kernel_xx!(tensor, Nx::Int64, Ny::Int64, Nz::Int64)
i, j, k = @index(Global, NTuple)
sum = zero(eltype(tensor))
for p in (-Nx):Nx
for q in (-1):Ny
sum += 2.0
end
end
@inbounds tensor[i, j, k] = sum
end EDIT: |
Also the "plain" AMDGPU version works fine: using AMDGPU
"""
    compute_amdgpu(tensor, kernel_fun, Nx, Ny, Nz)

Launch `kernel_fun(tensor, Nx, Ny, Nz)` with `@roc` using a fixed workgroup shape of
`(16, 4, 2)` and enough workgroups per dimension to cover `size(tensor)`, then block
until the device has finished. Returns `nothing`.
"""
function compute_amdgpu(tensor, kernel_fun, Nx, Ny, Nz)
    # Workitems per workgroup along each of the three dimensions.
    workgroup_shape = (16, 4, 2)
    # Ceiling division so that a partially filled trailing workgroup is still launched.
    workgroup_count = cld.(size(tensor), workgroup_shape)

    @roc groupsize=workgroup_shape gridsize=workgroup_count kernel_fun(tensor, Nx, Ny, Nz)

    # Wait for the launch to complete before handing control back to the caller.
    AMDGPU.synchronize()
    return nothing
end
# Device kernel written directly against the AMDGPU intrinsics.
# Each workitem accumulates 2.0 once per (p, q) pair with p in (-Nx):Nx and
# q in (-Ny):Ny, then stores the total into tensor[i, j, k].
# `Nz` is accepted but not used in the body.
function kernel_xx2!(tensor, Nx, Ny, Nz)
# Global index along each dimension, built from the workgroup index, the
# workgroup size and the workitem index within the workgroup.
i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
j = (workgroupIdx().y - 1) * workgroupDim().y + workitemIdx().y
k = (workgroupIdx().z - 1) * workgroupDim().z + workitemIdx().z
# Local accumulator; note that this name shadows `Base.sum` inside the kernel.
sum = zero(eltype(tensor))
# Single `for` statement iterating over every (p, q) combination.
for p in (-Nx):Nx, q in (-Ny):Ny
sum += 2.0
end
# Workitems whose indices fall outside the array skip the store, which is
# what makes the `@inbounds` write below safe.
if i ∈ axes(tensor, 1) && j ∈ axes(tensor, 2) && k ∈ axes(tensor, 3)
@inbounds tensor[i, j, k] = sum
end
return nothing
end
# Extents of the output array: tensor2 below is created with size (nx, ny, nz).
nx, ny, nz = 10, 1, 1
# Loop half-widths forwarded to the kernel. With Nx = Ny = 1 the kernel visits
# 3 * 3 (p, q) pairs and adds 2.0 for each, so every entry should be 18.0.
Nx, Ny, Nz = 1, 1, 1
# Zero-filled Float64 array created through AMDGPU.zeros.
tensor2 = AMDGPU.zeros(Float64, nx, ny, nz)
# Run the plain-AMDGPU kernel over tensor2 and wait for it to finish.
compute_amdgpu(tensor2, kernel_xx2!, Nx, Ny, Nz)
println("amdgpu:", tensor2) |
This seems to appear only when using
|
Or more generally, when passing |
Here's optimized LLVM IR for: @kernel function kernel_xx!(tensor, Nx::Int64, Ny::Int64, Nz::Int64)
idx = @index(Global)
res = zero(eltype(tensor))
for p in (-Nx):Nx
for q in (-Ny):Ny
res += 2.0
end
end
@inbounds tensor[idx] = res
end @vchuravy in %29 = icmp sgt i64 %.fca.1.1.0.1.0.extract, 0
br i1 %29, label %pass6, label %fail5
; ModuleID = 'start'
source_filename = "start"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:10:11:12:13"
target triple = "amdgcn-amd-amdhsa"
; Function Attrs: nounwind readnone speculatable willreturn
declare i32 @llvm.amdgcn.workgroup.id.x() #0
; Function Attrs: nounwind readnone speculatable willreturn
declare i32 @llvm.amdgcn.workitem.id.x() #0
; Function Attrs: cold noreturn nounwind
declare void @llvm.amdgcn.endpgm() #1
; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
declare i64 @llvm.smax.i64(i64, i64) #2
define amdgpu_kernel void @_Z14gpu_kernel_xx_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILl3E5TupleI5OneToI5Int64ES4_IS5_ES4_IS5_EEE7NDRangeILl3ES0_S0_S2_ILl3ES3_IS4_IS5_ES4_IS5_ES4_IS5_EEES2_ILl3ES3_IS4_IS5_ES4_IS5_ES4_IS5_EEEEE14ROCDeviceArrayI7Float64Ll3ELl1EES5_S5_S5_({ i64, i64, i64, i64, i64, i64, i32, i32, i64, i64, i64, i64 } %state, { [1 x [3 x [1 x i64]]], [2 x [1 x [3 x [1 x i64]]]] } %0, { [3 x i64], i8 addrspace(1)*, i64 } %1, i64 signext %2, i64 signext %3, i64 signext %4) local_unnamed_addr #3 {
conversion:
%.fca.0.0.0.0.extract = extractvalue { [1 x [3 x [1 x i64]]], [2 x [1 x [3 x [1 x i64]]]] } %0, 0, 0, 0, 0
%.fca.0.0.1.0.extract = extractvalue { [1 x [3 x [1 x i64]]], [2 x [1 x [3 x [1 x i64]]]] } %0, 0, 0, 1, 0
%.fca.0.0.2.0.extract = extractvalue { [1 x [3 x [1 x i64]]], [2 x [1 x [3 x [1 x i64]]]] } %0, 0, 0, 2, 0
%.fca.1.0.0.0.0.extract = extractvalue { [1 x [3 x [1 x i64]]], [2 x [1 x [3 x [1 x i64]]]] } %0, 1, 0, 0, 0, 0
%.fca.1.0.0.1.0.extract = extractvalue { [1 x [3 x [1 x i64]]], [2 x [1 x [3 x [1 x i64]]]] } %0, 1, 0, 0, 1, 0
%.fca.1.1.0.0.0.extract = extractvalue { [1 x [3 x [1 x i64]]], [2 x [1 x [3 x [1 x i64]]]] } %0, 1, 1, 0, 0, 0
%.fca.1.1.0.1.0.extract = extractvalue { [1 x [3 x [1 x i64]]], [2 x [1 x [3 x [1 x i64]]]] } %0, 1, 1, 0, 1, 0
%.fca.1.1.0.2.0.extract = extractvalue { [1 x [3 x [1 x i64]]], [2 x [1 x [3 x [1 x i64]]]] } %0, 1, 1, 0, 2, 0
%.fca.1.extract = extractvalue { [3 x i64], i8 addrspace(1)*, i64 } %1, 1
%5 = call i32 @llvm.amdgcn.workitem.id.x(), !range !7
%6 = call i64 @llvm.smax.i64(i64 %.fca.1.0.0.1.0.extract, i64 0)
%7 = icmp sgt i64 %.fca.1.0.0.0.0.extract, 0
br i1 %7, label %pass, label %fail
L674: ; preds = %L674.preheader, %L706
%value_phi17 = phi i64 [ %10, %L706 ], [ %65, %L674.preheader ]
%value_phi18 = phi double [ %value_phi26, %L706 ], [ 0.000000e+00, %L674.preheader ]
br i1 %.not127.not, label %L706, label %L693
L693: ; preds = %L693, %L674
%value_phi22 = phi double [ %8, %L693 ], [ %value_phi18, %L674 ]
%value_phi23 = phi i64 [ %9, %L693 ], [ %67, %L674 ]
%8 = fadd double %value_phi22, 2.000000e+00
%.not128 = icmp eq i64 %value_phi23, %value_phi19
%9 = add i64 %value_phi23, 1
br i1 %.not128, label %L706, label %L693
L706: ; preds = %L693, %L674
%value_phi26 = phi double [ %value_phi18, %L674 ], [ %8, %L693 ]
%.not129 = icmp eq i64 %value_phi17, %value_phi
%10 = add i64 %value_phi17, 1
br i1 %.not129, label %L732, label %L674
L732: ; preds = %pass10, %L706
%value_phi29 = phi double [ 0.000000e+00, %pass10 ], [ %value_phi26, %L706 ]
%11 = add i64 %64, %32
%reass.add137 = add i64 %11, %reass.mul135
%reass.mul138 = mul i64 %reass.add137, %60
%12 = add i64 %reass.mul138, %31
%13 = add i64 %12, %reass.mul
%14 = bitcast i8 addrspace(1)* %.fca.1.extract to double addrspace(1)*
%15 = getelementptr inbounds double, double addrspace(1)* %14, i64 %13
store double %value_phi29, double addrspace(1)* %15, align 8, !tbaa !8
br label %L738
L738: ; preds = %pass6, %L732
ret void
fail: ; preds = %conversion
%state.i.fca.0.extract.i = extractvalue { i64, i64, i64, i64, i64, i64, i32, i32, i64, i64, i64, i64 } %state, 0
%16 = inttoptr i64 %state.i.fca.0.extract.i to i32*
store i32 1, i32* %16, align 1
call void @llvm.amdgcn.endpgm()
unreachable
pass: ; preds = %conversion
%17 = call i32 @llvm.amdgcn.workgroup.id.x(), !range !11
%18 = zext i32 %17 to i64
%19 = udiv i64 %18, %.fca.1.0.0.0.0.extract
%20 = mul i64 %19, %.fca.1.0.0.0.0.extract
%21 = icmp sgt i64 %.fca.1.0.0.1.0.extract, 0
br i1 %21, label %pass2, label %fail1
fail1: ; preds = %pass
%state.i.fca.0.extract.i28 = extractvalue { i64, i64, i64, i64, i64, i64, i32, i32, i64, i64, i64, i64 } %state, 0
%22 = inttoptr i64 %state.i.fca.0.extract.i28 to i32*
store i32 1, i32* %22, align 1
call void @llvm.amdgcn.endpgm()
unreachable
pass2: ; preds = %pass
%23 = udiv i64 %19, %6
%24 = mul i64 %23, %6
%25 = sub i64 %19, %24
%26 = call i64 @llvm.smax.i64(i64 %.fca.1.1.0.1.0.extract, i64 0)
%27 = icmp sgt i64 %.fca.1.1.0.0.0.extract, 0
br i1 %27, label %pass4, label %fail3
fail3: ; preds = %pass2
%state.i.fca.0.extract.i42 = extractvalue { i64, i64, i64, i64, i64, i64, i32, i32, i64, i64, i64, i64 } %state, 0
%28 = inttoptr i64 %state.i.fca.0.extract.i42 to i32*
store i32 1, i32* %28, align 1
call void @llvm.amdgcn.endpgm()
unreachable
pass4: ; preds = %pass2
%29 = icmp sgt i64 %.fca.1.1.0.1.0.extract, 0
br i1 %29, label %pass6, label %fail5
fail5: ; preds = %pass4
%state.i.fca.0.extract.i56 = extractvalue { i64, i64, i64, i64, i64, i64, i32, i32, i64, i64, i64, i64 } %state, 0
%30 = inttoptr i64 %state.i.fca.0.extract.i56 to i32*
store i32 1, i32* %30, align 1
call void @llvm.amdgcn.endpgm()
unreachable
pass6: ; preds = %pass4
%31 = zext i32 %5 to i64
%32 = udiv i64 %31, %.fca.1.1.0.0.0.extract
%33 = udiv i64 %32, %26
%34 = mul i64 %33, %26
%35 = add i64 %20, %32
%reass.add = sub i64 %18, %35
%reass.mul = mul i64 %reass.add, %.fca.1.1.0.0.0.extract
%36 = add nuw nsw i64 %31, 1
%37 = add i64 %36, %reass.mul
%38 = mul i64 %25, %.fca.1.1.0.1.0.extract
%39 = add i64 %38, 1
%40 = add i64 %39, %32
%41 = sub i64 %40, %34
%42 = mul i64 %23, %.fca.1.1.0.2.0.extract
%43 = add i64 %42, 1
%44 = add i64 %43, %33
%45 = icmp sgt i64 %37, 0
%46 = icmp sle i64 %37, %.fca.0.0.0.0.extract
%47 = and i1 %45, %46
%48 = icmp sgt i64 %41, 0
%49 = icmp sle i64 %41, %.fca.0.0.1.0.extract
%50 = and i1 %48, %49
%51 = icmp sgt i64 %44, 0
%52 = icmp sle i64 %44, %.fca.0.0.2.0.extract
%53 = and i1 %51, %52
%54 = and i1 %47, %50
%55 = and i1 %53, %54
br i1 %55, label %pass10, label %L738
pass10: ; preds = %pass6
%56 = udiv i64 %19, %.fca.1.0.0.1.0.extract
%57 = mul i64 %56, %.fca.1.0.0.1.0.extract
%58 = udiv i64 %32, %.fca.1.1.0.1.0.extract
%59 = mul i64 %56, %.fca.1.1.0.2.0.extract
%60 = call i64 @llvm.smax.i64(i64 %.fca.0.0.0.0.extract, i64 0)
%61 = call i64 @llvm.smax.i64(i64 %.fca.0.0.1.0.extract, i64 0)
%62 = add i64 %57, %58
%reass.add134 = sub i64 %19, %62
%reass.mul135 = mul i64 %reass.add134, %.fca.1.1.0.1.0.extract
%63 = add i64 %58, %59
%64 = mul i64 %63, %61
%65 = sub i64 0, %2
%.not = icmp sgt i64 %65, %2
%66 = sext i1 %.not to i64
%value_phi = xor i64 %66, %2
%.not125.not = icmp slt i64 %value_phi, %65
br i1 %.not125.not, label %L732, label %L674.preheader
L674.preheader: ; preds = %pass10
%67 = sub i64 0, %3
%.not126 = icmp sgt i64 %67, %3
%68 = sext i1 %.not126 to i64
%value_phi19 = xor i64 %68, %3
%.not127.not = icmp slt i64 %value_phi19, %67
br label %L674
}
attributes #0 = { nounwind readnone speculatable willreturn }
attributes #1 = { cold noreturn nounwind }
attributes #2 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
attributes #3 = { "amdgpu-unsafe-fp-atomics"="true" "target-cpu"="gfx1100" "target-features"="+wavefrontsize32,-wavefrontsize64" }
!llvm.module.flags = !{!0, !1, !2, !3}
!opencl.ocl.version = !{!4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4}
!llvm.ident = !{!5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5}
!julia.kernel = !{!6}
!0 = !{i32 2, !"Dwarf Version", i32 4}
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = !{i32 1, !"wchar_size", i32 4}
!3 = !{i32 7, !"PIC Level", i32 1}
!4 = !{i32 2, i32 0}
!5 = !{!"clang version 15.0.0 (/cache/yggdrasil/downloads/clones/llvm-project.git-974efd367bc513231526d317489c66cb27727ef3caa41108e3819c131a8acf57 f3d695fc2985a8dfdd5f4219d351fdeac3038867)"}
!6 = !{void ({ i64, i64, i64, i64, i64, i64, i32, i32, i64, i64, i64, i64 }, { [1 x [3 x [1 x i64]]], [2 x [1 x [3 x [1 x i64]]]] }, { [3 x i64], i8 addrspace(1)*, i64 }, i64, i64, i64)* @_Z14gpu_kernel_xx_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILl3E5TupleI5OneToI5Int64ES4_IS5_ES4_IS5_EEE7NDRangeILl3ES0_S0_S2_ILl3ES3_IS4_IS5_ES4_IS5_ES4_IS5_EEES2_ILl3ES3_IS4_IS5_ES4_IS5_ES4_IS5_EEEEE14ROCDeviceArrayI7Float64Ll3ELl1EES5_S5_S5_}
!7 = !{i32 0, i32 1023}
!8 = !{!9, !9, i64 0, i64 0}
!9 = !{!"custom_tbaa_addrspace(1)", !10, i64 0}
!10 = !{!"custom_tbaa"}
!11 = !{i32 0, i32 -2}
; ModuleID = 'start'
source_filename = "start"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:10:11:12:13"
target triple = "amdgcn-amd-amdhsa"
; Function Attrs: nounwind readnone speculatable willreturn
declare i32 @llvm.amdgcn.workgroup.id.x() #0
; Function Attrs: nounwind readnone speculatable willreturn
declare i32 @llvm.amdgcn.workitem.id.x() #0
define amdgpu_kernel void @_Z14gpu_kernel_xx_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILl1E5TupleI5OneToI5Int64EEE7NDRangeILl1ES0_S0_S2_ILl1ES3_IS4_IS5_EEES2_ILl1ES3_IS4_IS5_EEEEE14ROCDeviceArrayI7Float64Ll3ELl1EES5_S5_S5_({ i64, i64, i64, i64, i64, i64, i32, i32, i64, i64, i64, i64 } %state, { [1 x [1 x [1 x i64]]], [2 x [1 x [1 x [1 x i64]]]] } %0, { [3 x i64], i8 addrspace(1)*, i64 } %1, i64 signext %2, i64 signext %3, i64 signext %4) local_unnamed_addr #1 {
conversion:
%.fca.0.0.0.0.extract = extractvalue { [1 x [1 x [1 x i64]]], [2 x [1 x [1 x [1 x i64]]]] } %0, 0, 0, 0, 0
%.fca.1.1.0.0.0.extract = extractvalue { [1 x [1 x [1 x i64]]], [2 x [1 x [1 x [1 x i64]]]] } %0, 1, 1, 0, 0, 0
%.fca.1.extract = extractvalue { [3 x i64], i8 addrspace(1)*, i64 } %1, 1
%5 = call i32 @llvm.amdgcn.workgroup.id.x(), !range !7
%6 = call i32 @llvm.amdgcn.workitem.id.x(), !range !8
%7 = add nuw nsw i32 %6, 1
%8 = zext i32 %7 to i64
%9 = zext i32 %5 to i64
%10 = mul i64 %.fca.1.1.0.0.0.extract, %9
%11 = add i64 %10, %8
%12 = icmp slt i64 %11, 1
%13 = icmp sgt i64 %11, %.fca.0.0.0.0.extract
%14 = or i1 %12, %13
br i1 %14, label %L299, label %L103
L103: ; preds = %conversion
%15 = sub i64 0, %2
%.not = icmp sgt i64 %15, %2
%16 = sext i1 %.not to i64
%value_phi = xor i64 %16, %2
%.not6.not = icmp slt i64 %value_phi, %15
br i1 %.not6.not, label %L293, label %L235.preheader
L235.preheader: ; preds = %L103
%17 = sub i64 0, %3
%.not7 = icmp sgt i64 %17, %3
%18 = sext i1 %.not7 to i64
%value_phi5 = xor i64 %18, %3
%.not8.not = icmp slt i64 %value_phi5, %17
br label %L235
L235: ; preds = %L267, %L235.preheader
%value_phi3 = phi i64 [ %21, %L267 ], [ %15, %L235.preheader ]
%value_phi4 = phi double [ %value_phi12, %L267 ], [ 0.000000e+00, %L235.preheader ]
br i1 %.not8.not, label %L267, label %L254
L254: ; preds = %L254, %L235
%value_phi8 = phi double [ %19, %L254 ], [ %value_phi4, %L235 ]
%value_phi9 = phi i64 [ %20, %L254 ], [ %17, %L235 ]
%19 = fadd double %value_phi8, 2.000000e+00
%.not9 = icmp eq i64 %value_phi9, %value_phi5
%20 = add i64 %value_phi9, 1
br i1 %.not9, label %L267, label %L254
L267: ; preds = %L254, %L235
%value_phi12 = phi double [ %value_phi4, %L235 ], [ %19, %L254 ]
%.not10 = icmp eq i64 %value_phi3, %value_phi
%21 = add i64 %value_phi3, 1
br i1 %.not10, label %L293, label %L235
L293: ; preds = %L267, %L103
%value_phi15 = phi double [ 0.000000e+00, %L103 ], [ %value_phi12, %L267 ]
%22 = add nsw i64 %8, -1
%23 = add i64 %22, %10
%24 = bitcast i8 addrspace(1)* %.fca.1.extract to double addrspace(1)*
%25 = getelementptr inbounds double, double addrspace(1)* %24, i64 %23
store double %value_phi15, double addrspace(1)* %25, align 8, !tbaa !9
br label %L299
L299: ; preds = %L293, %conversion
ret void
}
attributes #0 = { nounwind readnone speculatable willreturn }
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "target-cpu"="gfx1100" "target-features"="+wavefrontsize32,-wavefrontsize64" }
!llvm.module.flags = !{!0, !1, !2, !3}
!opencl.ocl.version = !{!4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4}
!llvm.ident = !{!5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5, !5}
!julia.kernel = !{!6}
!0 = !{i32 2, !"Dwarf Version", i32 4}
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = !{i32 1, !"wchar_size", i32 4}
!3 = !{i32 7, !"PIC Level", i32 1}
!4 = !{i32 2, i32 0}
!5 = !{!"clang version 15.0.0 (/cache/yggdrasil/downloads/clones/llvm-project.git-974efd367bc513231526d317489c66cb27727ef3caa41108e3819c131a8acf57 f3d695fc2985a8dfdd5f4219d351fdeac3038867)"}
!6 = !{void ({ i64, i64, i64, i64, i64, i64, i32, i32, i64, i64, i64, i64 }, { [1 x [1 x [1 x i64]]], [2 x [1 x [1 x [1 x i64]]]] }, { [3 x i64], i8 addrspace(1)*, i64 }, i64, i64, i64)* @_Z14gpu_kernel_xx_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILl1E5TupleI5OneToI5Int64EEE7NDRangeILl1ES0_S0_S2_ILl1ES3_IS4_IS5_EEES2_ILl1ES3_IS4_IS5_EEEEE14ROCDeviceArrayI7Float64Ll3ELl1EES5_S5_S5_}
!7 = !{i32 0, i32 -2}
!8 = !{i32 0, i32 1023}
!9 = !{!10, !10, i64 0, i64 0}
!10 = !{!"custom_tbaa_addrspace(1)", !11, i64 0}
!11 = !{!"custom_tbaa"} |
On my computer, both linear and Cartesian indexing also give the wrong results: cpu:[18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0; 18.0;;;]
amd:[6.0; 6.0; 6.0; 6.0; 6.0; 6.0; 6.0; 6.0; 6.0; 6.0;;;]
|
This time the kernel runs without error (though the results are still wrong).
|
Did you change how you launch the code? |
I just tried ndrange=length(x), and it gives the correct results. I am using ndrange=size(x) because I want to know the Cartesian indices, but it seems I can convert the linear index to a CartesianIndex manually: idx = @index(Global)
i, j = Tuple(CartesianIndices(tensor)[idx]) so using ndrange=length(x) solves the problem. |
You can also access size of the array within the kernel and compute x[2, 2] == x[4] |
thanks~ |
So this is smelling more and more like a compiler bug... The below code is an attempt of mine to remove the KA syntax sugar. Next step is the inline
|
This is now KA free:
|
Hi, I noticed that the following script produces different results depending on the backend. On my machine, the output is:
Is there a mistake in the kernel function?
The text was updated successfully, but these errors were encountered: