; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_60 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,LOWER-ARGS,SM_60
; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,LOWER-ARGS,SM_70
; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_60 -mattr=ptx77 -passes=nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,LOWER-ARGS,SM_60
; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -passes=nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,LOWER-ARGS,SM_70
; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -passes=nvptx-copy-byval-args -S | FileCheck %s --check-prefixes=COMMON,COPY
; RUN: llc < %s -mcpu=sm_60 -mattr=ptx77 | FileCheck %s --check-prefixes=PTX,PTX_60
; RUN: llc < %s -mcpu=sm_70 -mattr=ptx77 | FileCheck %s --check-prefixes=PTX,PTX_70
;; Module-level target description shared by every kernel in this test.
source_filename = "<stdin>"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

;; Aggregate of two i32 fields (8 bytes); it is the byval kernel argument type
;; used by the struct-based tests in this file.
%struct.S = type { i32, i32 }

;; External function taking a pointer (Itanium-mangled escape(void*)). The
;; escape_ptr* tests pass a byval-derived pointer to it so the pointer escapes.
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
declare dso_local void @_Z6escapePv(ptr noundef) local_unnamed_addr #0

;; External function taking an i32 (Itanium-mangled escape(int)). It is not
;; called by any function visible in this part of the file.
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
declare dso_local void @_Z6escapei(i32 noundef) local_unnamed_addr #0

;; Generic-address-space memcpy intrinsic used by the memcpy_* tests.
; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1

;; memmove intrinsic; no use is visible in this part of the file (presumably
;; exercised by tests further down - verify).
; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) #1

;; memset intrinsic; no use is visible in this part of the file (presumably
;; exercised by tests further down - verify).
; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2

;; Read-only use of a byval kernel argument: the kernel loads the first i32 of
;; %s and stores it to %out.
;; Expected results encoded in the assertions below:
;;  * nvptx-lower-args redirects the load to the param address space (101)
;;    through llvm.nvvm.internal.addrspace.wrap and creates no local copy.
;;  * nvptx-copy-byval-args leaves the function body unchanged.
;;  * llc reads the value with a single ld.param.b32 from read_only_param_1.
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @read_only(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only(
; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; LOWER-ARGS-NEXT:  [[ENTRY:.*:]]
; LOWER-ARGS-NEXT:    [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; LOWER-ARGS-NEXT:    [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4
; LOWER-ARGS-NEXT:    store i32 [[I]], ptr [[OUT]], align 4
; LOWER-ARGS-NEXT:    ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @read_only(
; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; COPY-NEXT:  [[ENTRY:.*:]]
; COPY-NEXT:    [[I:%.*]] = load i32, ptr [[S]], align 4
; COPY-NEXT:    store i32 [[I]], ptr [[OUT]], align 4
; COPY-NEXT:    ret void
;
; PTX-LABEL: read_only(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<2>;
; PTX-NEXT:    .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0: // %entry
; PTX-NEXT:    ld.param.b64 %rd1, [read_only_param_0];
; PTX-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; PTX-NEXT:    ld.param.b32 %r1, [read_only_param_1];
; PTX-NEXT:    st.global.b32 [%rd2], %r1;
; PTX-NEXT:    ret;
entry:
  %i = load i32, ptr %s, align 4
  store i32 %i, ptr %out, align 4
  ret void
}

;; Read-only use of a byval kernel argument through a GEP: the kernel loads the
;; i32 at byte offset 4 of %s and stores it to %out.
;; Expected results encoded in the assertions below:
;;  * nvptx-lower-args rewrites both the GEP and the load to operate on the
;;    wrapped param-space (101) pointer; the expected GEP is `inbounds` only,
;;    whereas the source GEP is `inbounds nuw`.
;;  * nvptx-copy-byval-args leaves the function body unchanged.
;;  * llc folds the offset into the load: ld.param.b32 from
;;    read_only_gep_param_1+4.
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @read_only_gep(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only_gep(
; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; LOWER-ARGS-NEXT:  [[ENTRY:.*:]]
; LOWER-ARGS-NEXT:    [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; LOWER-ARGS-NEXT:    [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
; LOWER-ARGS-NEXT:    [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
; LOWER-ARGS-NEXT:    store i32 [[I]], ptr [[OUT]], align 4
; LOWER-ARGS-NEXT:    ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep(
; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COPY-NEXT:  [[ENTRY:.*:]]
; COPY-NEXT:    [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 4
; COPY-NEXT:    [[I:%.*]] = load i32, ptr [[B]], align 4
; COPY-NEXT:    store i32 [[I]], ptr [[OUT]], align 4
; COPY-NEXT:    ret void
;
; PTX-LABEL: read_only_gep(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<2>;
; PTX-NEXT:    .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0: // %entry
; PTX-NEXT:    ld.param.b64 %rd1, [read_only_gep_param_0];
; PTX-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; PTX-NEXT:    ld.param.b32 %r1, [read_only_gep_param_1+4];
; PTX-NEXT:    st.global.b32 [%rd2], %r1;
; PTX-NEXT:    ret;
entry:
  %b = getelementptr inbounds nuw i8, ptr %s, i64 4
  %i = load i32, ptr %b, align 4
  store i32 %i, ptr %out, align 4
  ret void
}

;; TODO: This test has been disabled because the addrspacecast is not legal on
;; sm_60, and not well supported within nvptx-lower-args. We should determine
;; in what cases it is safe to make assumptions about the address of a byval
;; parameter and improve our handling of addrspacecast in nvptx-lower-args.

; define dso_local ptx_kernel void @read_only_gep_asc(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
; entry:
;   %b = getelementptr inbounds nuw i8, ptr %s, i64 4
;   %asc = addrspacecast ptr %b to ptr addrspace(101)
;   %i = load i32, ptr addrspace(101) %asc, align 4
;   store i32 %i, ptr %out, align 4
;   ret void
; }
;
; define dso_local ptx_kernel void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
; entry:
;   %b = getelementptr inbounds nuw i8, ptr %s, i64 4
;   %asc = addrspacecast ptr %b to ptr addrspace(101)
;   %asc0 = addrspacecast ptr addrspace(101) %asc to ptr
;   %i = load i32, ptr %asc0, align 4
;   store i32 %i, ptr %out, align 4
;   ret void
; }

;; The byval pointer %s escapes by being passed directly to the external
;; function @_Z6escapePv.
;; Expected results encoded in the assertions below (identical for every opt
;; invocation, hence the shared prefix):
;;  * an alloca of %struct.S is created, the 8 bytes of the argument are copied
;;    into it from the param address space (101), and the call receives the
;;    alloca instead of %s.
;;  * llc copies the two 32-bit fields from the param space into an 8-byte
;;    local depot and passes the %SP-based address of that depot to the call.
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
; COMMON-LABEL: define dso_local ptx_kernel void @escape_ptr(
; COMMON-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; COMMON-NEXT:  [[ENTRY:.*:]]
; COMMON-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT:    [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT:    call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR6:[0-9]+]]
; COMMON-NEXT:    ret void
;
; PTX-LABEL: escape_ptr(
; PTX:       {
; PTX-NEXT:    .local .align 4 .b8 __local_depot2[8];
; PTX-NEXT:    .reg .b64 %SP;
; PTX-NEXT:    .reg .b64 %SPL;
; PTX-NEXT:    .reg .b32 %r<3>;
; PTX-NEXT:    .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0: // %entry
; PTX-NEXT:    mov.b64 %SPL, __local_depot2;
; PTX-NEXT:    cvta.local.u64 %SP, %SPL;
; PTX-NEXT:    add.u64 %rd1, %SP, 0;
; PTX-NEXT:    add.u64 %rd2, %SPL, 0;
; PTX-NEXT:    ld.param.b32 %r1, [escape_ptr_param_1+4];
; PTX-NEXT:    st.local.b32 [%rd2+4], %r1;
; PTX-NEXT:    ld.param.b32 %r2, [escape_ptr_param_1];
; PTX-NEXT:    st.local.b32 [%rd2], %r2;
; PTX-NEXT:    { // callseq 0, 0
; PTX-NEXT:    .param .b64 param0;
; PTX-NEXT:    st.param.b64 [param0], %rd1;
; PTX-NEXT:    call.uni _Z6escapePv, (param0);
; PTX-NEXT:    } // callseq 0
; PTX-NEXT:    ret;
entry:
  call void @_Z6escapePv(ptr noundef nonnull %s) #0
  ret void
}

;; A pointer derived from the byval argument (%s + 4 bytes) escapes by being
;; passed to the external function @_Z6escapePv.
;; Expected results encoded in the assertions below (identical for every opt
;; invocation):
;;  * the argument is copied into an alloca of %struct.S and the GEP is rebased
;;    onto that alloca before the call.
;;  * llc fills an 8-byte local depot from the param space, adds 4 to its
;;    %SP-based address and passes the result to the call.
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
; COMMON-LABEL: define dso_local ptx_kernel void @escape_ptr_gep(
; COMMON-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT:  [[ENTRY:.*:]]
; COMMON-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT:    [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT:    [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
; COMMON-NEXT:    call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR6]]
; COMMON-NEXT:    ret void
;
; PTX-LABEL: escape_ptr_gep(
; PTX:       {
; PTX-NEXT:    .local .align 4 .b8 __local_depot3[8];
; PTX-NEXT:    .reg .b64 %SP;
; PTX-NEXT:    .reg .b64 %SPL;
; PTX-NEXT:    .reg .b32 %r<3>;
; PTX-NEXT:    .reg .b64 %rd<4>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0: // %entry
; PTX-NEXT:    mov.b64 %SPL, __local_depot3;
; PTX-NEXT:    cvta.local.u64 %SP, %SPL;
; PTX-NEXT:    add.u64 %rd1, %SP, 0;
; PTX-NEXT:    add.u64 %rd2, %SPL, 0;
; PTX-NEXT:    ld.param.b32 %r1, [escape_ptr_gep_param_1+4];
; PTX-NEXT:    st.local.b32 [%rd2+4], %r1;
; PTX-NEXT:    ld.param.b32 %r2, [escape_ptr_gep_param_1];
; PTX-NEXT:    st.local.b32 [%rd2], %r2;
; PTX-NEXT:    add.s64 %rd3, %rd1, 4;
; PTX-NEXT:    { // callseq 1, 0
; PTX-NEXT:    .param .b64 param0;
; PTX-NEXT:    st.param.b64 [param0], %rd3;
; PTX-NEXT:    call.uni _Z6escapePv, (param0);
; PTX-NEXT:    } // callseq 1
; PTX-NEXT:    ret;
entry:
  %b = getelementptr inbounds nuw i8, ptr %s, i64 4
  call void @_Z6escapePv(ptr noundef nonnull %b) #0
  ret void
}

;; The byval pointer %s escapes by being stored (as a pointer value) to %out.
;; Expected results encoded in the assertions below (identical for every opt
;; invocation):
;;  * the argument is copied into an alloca of %struct.S and the address of
;;    that alloca is what gets stored to %out.
;;  * llc fills an 8-byte local depot from the param space and stores its
;;    %SP-based address with st.global.b64.
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @escape_ptr_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
; COMMON-LABEL: define dso_local ptx_kernel void @escape_ptr_store(
; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT:  [[ENTRY:.*:]]
; COMMON-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT:    [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT:    store ptr [[S1]], ptr [[OUT]], align 8
; COMMON-NEXT:    ret void
;
; PTX-LABEL: escape_ptr_store(
; PTX:       {
; PTX-NEXT:    .local .align 4 .b8 __local_depot4[8];
; PTX-NEXT:    .reg .b64 %SP;
; PTX-NEXT:    .reg .b64 %SPL;
; PTX-NEXT:    .reg .b32 %r<3>;
; PTX-NEXT:    .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0: // %entry
; PTX-NEXT:    mov.b64 %SPL, __local_depot4;
; PTX-NEXT:    cvta.local.u64 %SP, %SPL;
; PTX-NEXT:    ld.param.b64 %rd1, [escape_ptr_store_param_0];
; PTX-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; PTX-NEXT:    add.u64 %rd3, %SP, 0;
; PTX-NEXT:    add.u64 %rd4, %SPL, 0;
; PTX-NEXT:    ld.param.b32 %r1, [escape_ptr_store_param_1+4];
; PTX-NEXT:    st.local.b32 [%rd4+4], %r1;
; PTX-NEXT:    ld.param.b32 %r2, [escape_ptr_store_param_1];
; PTX-NEXT:    st.local.b32 [%rd4], %r2;
; PTX-NEXT:    st.global.b64 [%rd2], %rd3;
; PTX-NEXT:    ret;
entry:
  store ptr %s, ptr %out, align 8
  ret void
}

;; A pointer derived from the byval argument (%s + 4 bytes) escapes by being
;; stored to %out.
;; Expected results encoded in the assertions below (identical for every opt
;; invocation):
;;  * the argument is copied into an alloca of %struct.S, the GEP is rebased
;;    onto that alloca, and the rebased pointer is stored.
;;  * llc fills an 8-byte local depot from the param space, adds 4 to its
;;    %SP-based address and stores the result with st.global.b64.
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
; COMMON-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store(
; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT:  [[ENTRY:.*:]]
; COMMON-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT:    [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT:    [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
; COMMON-NEXT:    store ptr [[B]], ptr [[OUT]], align 8
; COMMON-NEXT:    ret void
;
; PTX-LABEL: escape_ptr_gep_store(
; PTX:       {
; PTX-NEXT:    .local .align 4 .b8 __local_depot5[8];
; PTX-NEXT:    .reg .b64 %SP;
; PTX-NEXT:    .reg .b64 %SPL;
; PTX-NEXT:    .reg .b32 %r<3>;
; PTX-NEXT:    .reg .b64 %rd<6>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0: // %entry
; PTX-NEXT:    mov.b64 %SPL, __local_depot5;
; PTX-NEXT:    cvta.local.u64 %SP, %SPL;
; PTX-NEXT:    ld.param.b64 %rd1, [escape_ptr_gep_store_param_0];
; PTX-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; PTX-NEXT:    add.u64 %rd3, %SP, 0;
; PTX-NEXT:    add.u64 %rd4, %SPL, 0;
; PTX-NEXT:    ld.param.b32 %r1, [escape_ptr_gep_store_param_1+4];
; PTX-NEXT:    st.local.b32 [%rd4+4], %r1;
; PTX-NEXT:    ld.param.b32 %r2, [escape_ptr_gep_store_param_1];
; PTX-NEXT:    st.local.b32 [%rd4], %r2;
; PTX-NEXT:    add.s64 %rd5, %rd3, 4;
; PTX-NEXT:    st.global.b64 [%rd2], %rd5;
; PTX-NEXT:    ret;
entry:
  %b = getelementptr inbounds nuw i8, ptr %s, i64 4
  store ptr %b, ptr %out, align 8
  ret void
}

;; The byval pointer %s escapes as an integer: it is converted with ptrtoint
;; and the resulting i64 is stored to %out.
;; Expected results encoded in the assertions below (identical for every opt
;; invocation):
;;  * the argument is copied into an alloca of %struct.S and the ptrtoint is
;;    applied to that alloca.
;;  * llc fills an 8-byte local depot from the param space and stores its
;;    %SP-based address with st.global.b64.
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
; COMMON-LABEL: define dso_local ptx_kernel void @escape_ptrtoint(
; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT:  [[ENTRY:.*:]]
; COMMON-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT:    [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT:    [[I:%.*]] = ptrtoint ptr [[S1]] to i64
; COMMON-NEXT:    store i64 [[I]], ptr [[OUT]], align 8
; COMMON-NEXT:    ret void
;
; PTX-LABEL: escape_ptrtoint(
; PTX:       {
; PTX-NEXT:    .local .align 4 .b8 __local_depot6[8];
; PTX-NEXT:    .reg .b64 %SP;
; PTX-NEXT:    .reg .b64 %SPL;
; PTX-NEXT:    .reg .b32 %r<3>;
; PTX-NEXT:    .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0: // %entry
; PTX-NEXT:    mov.b64 %SPL, __local_depot6;
; PTX-NEXT:    cvta.local.u64 %SP, %SPL;
; PTX-NEXT:    ld.param.b64 %rd1, [escape_ptrtoint_param_0];
; PTX-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; PTX-NEXT:    add.u64 %rd3, %SP, 0;
; PTX-NEXT:    add.u64 %rd4, %SPL, 0;
; PTX-NEXT:    ld.param.b32 %r1, [escape_ptrtoint_param_1+4];
; PTX-NEXT:    st.local.b32 [%rd4+4], %r1;
; PTX-NEXT:    ld.param.b32 %r2, [escape_ptrtoint_param_1];
; PTX-NEXT:    st.local.b32 [%rd4], %r2;
; PTX-NEXT:    st.global.b64 [%rd2], %rd3;
; PTX-NEXT:    ret;
entry:
  %i = ptrtoint ptr %s to i64
  store i64 %i, ptr %out, align 8
  ret void
}

;; The byval argument is the source of a volatile memcpy (last operand i1 true)
;; of 16 bytes into %out.
;; Expected results encoded in the assertions below:
;;  * nvptx-lower-args retargets the memcpy source to the wrapped param-space
;;    (101) pointer, producing llvm.memcpy.p0.p101, with no local copy.
;;  * nvptx-copy-byval-args leaves the function body unchanged.
;;  * llc expands the copy into 16 byte-sized ld.param.b8 / st.volatile.b8
;;    pairs, from offset 15 down to offset 0.
;; NOTE(review): the copy length (16) is larger than %struct.S, which holds two
;; i32 fields (8 bytes). Presumably only the lowering shape matters to this
;; test - confirm the over-read is intentional.
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @memcpy_from_param(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @memcpy_from_param(
; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; LOWER-ARGS-NEXT:  [[ENTRY:.*:]]
; LOWER-ARGS-NEXT:    [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; LOWER-ARGS-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr [[OUT]], ptr addrspace(101) [[S3]], i64 16, i1 true)
; LOWER-ARGS-NEXT:    ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @memcpy_from_param(
; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COPY-NEXT:  [[ENTRY:.*:]]
; COPY-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr [[OUT]], ptr [[S]], i64 16, i1 true)
; COPY-NEXT:    ret void
;
; PTX-LABEL: memcpy_from_param(
; PTX:       {
; PTX-NEXT:    .reg .b16 %rs<17>;
; PTX-NEXT:    .reg .b64 %rd<2>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0: // %entry
; PTX-NEXT:    ld.param.b64 %rd1, [memcpy_from_param_param_0];
; PTX-NEXT:    ld.param.b8 %rs1, [memcpy_from_param_param_1+15];
; PTX-NEXT:    st.volatile.b8 [%rd1+15], %rs1;
; PTX-NEXT:    ld.param.b8 %rs2, [memcpy_from_param_param_1+14];
; PTX-NEXT:    st.volatile.b8 [%rd1+14], %rs2;
; PTX-NEXT:    ld.param.b8 %rs3, [memcpy_from_param_param_1+13];
; PTX-NEXT:    st.volatile.b8 [%rd1+13], %rs3;
; PTX-NEXT:    ld.param.b8 %rs4, [memcpy_from_param_param_1+12];
; PTX-NEXT:    st.volatile.b8 [%rd1+12], %rs4;
; PTX-NEXT:    ld.param.b8 %rs5, [memcpy_from_param_param_1+11];
; PTX-NEXT:    st.volatile.b8 [%rd1+11], %rs5;
; PTX-NEXT:    ld.param.b8 %rs6, [memcpy_from_param_param_1+10];
; PTX-NEXT:    st.volatile.b8 [%rd1+10], %rs6;
; PTX-NEXT:    ld.param.b8 %rs7, [memcpy_from_param_param_1+9];
; PTX-NEXT:    st.volatile.b8 [%rd1+9], %rs7;
; PTX-NEXT:    ld.param.b8 %rs8, [memcpy_from_param_param_1+8];
; PTX-NEXT:    st.volatile.b8 [%rd1+8], %rs8;
; PTX-NEXT:    ld.param.b8 %rs9, [memcpy_from_param_param_1+7];
; PTX-NEXT:    st.volatile.b8 [%rd1+7], %rs9;
; PTX-NEXT:    ld.param.b8 %rs10, [memcpy_from_param_param_1+6];
; PTX-NEXT:    st.volatile.b8 [%rd1+6], %rs10;
; PTX-NEXT:    ld.param.b8 %rs11, [memcpy_from_param_param_1+5];
; PTX-NEXT:    st.volatile.b8 [%rd1+5], %rs11;
; PTX-NEXT:    ld.param.b8 %rs12, [memcpy_from_param_param_1+4];
; PTX-NEXT:    st.volatile.b8 [%rd1+4], %rs12;
; PTX-NEXT:    ld.param.b8 %rs13, [memcpy_from_param_param_1+3];
; PTX-NEXT:    st.volatile.b8 [%rd1+3], %rs13;
; PTX-NEXT:    ld.param.b8 %rs14, [memcpy_from_param_param_1+2];
; PTX-NEXT:    st.volatile.b8 [%rd1+2], %rs14;
; PTX-NEXT:    ld.param.b8 %rs15, [memcpy_from_param_param_1+1];
; PTX-NEXT:    st.volatile.b8 [%rd1+1], %rs15;
; PTX-NEXT:    ld.param.b8 %rs16, [memcpy_from_param_param_1];
; PTX-NEXT:    st.volatile.b8 [%rd1], %rs16;
; PTX-NEXT:    ret;
entry:
  tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true)
  ret void
}

;; Same scenario as a volatile 16-byte memcpy out of a byval argument, but here
;; the byval parameter carries no explicit `align` attribute in the source.
;; Expected results encoded in the assertions below:
;;  * nvptx-lower-args retargets the memcpy source to the wrapped param-space
;;    (101) pointer, and the expected output signature shows `align 4` on the
;;    byval parameter.
;;  * nvptx-copy-byval-args leaves both the body and the signature (no align)
;;    unchanged.
;;  * llc expands the copy into 16 byte-sized ld.param.b8 / st.volatile.b8
;;    pairs, from offset 15 down to offset 0.
;; NOTE(review): the copy length (16) is larger than %struct.S, which holds two
;; i32 fields (8 bytes). Presumably only the lowering shape matters to this
;; test - confirm the over-read is intentional.
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @memcpy_from_param_noalign (ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) %s) local_unnamed_addr #0 {
; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign(
; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; LOWER-ARGS-NEXT:  [[ENTRY:.*:]]
; LOWER-ARGS-NEXT:    [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; LOWER-ARGS-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr [[OUT]], ptr addrspace(101) [[S3]], i64 16, i1 true)
; LOWER-ARGS-NEXT:    ret void
;
; COPY-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign(
; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COPY-NEXT:  [[ENTRY:.*:]]
; COPY-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr [[OUT]], ptr [[S]], i64 16, i1 true)
; COPY-NEXT:    ret void
;
; PTX-LABEL: memcpy_from_param_noalign(
; PTX:       {
; PTX-NEXT:    .reg .b16 %rs<17>;
; PTX-NEXT:    .reg .b64 %rd<2>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0: // %entry
; PTX-NEXT:    ld.param.b64 %rd1, [memcpy_from_param_noalign_param_0];
; PTX-NEXT:    ld.param.b8 %rs1, [memcpy_from_param_noalign_param_1+15];
; PTX-NEXT:    st.volatile.b8 [%rd1+15], %rs1;
; PTX-NEXT:    ld.param.b8 %rs2, [memcpy_from_param_noalign_param_1+14];
; PTX-NEXT:    st.volatile.b8 [%rd1+14], %rs2;
; PTX-NEXT:    ld.param.b8 %rs3, [memcpy_from_param_noalign_param_1+13];
; PTX-NEXT:    st.volatile.b8 [%rd1+13], %rs3;
; PTX-NEXT:    ld.param.b8 %rs4, [memcpy_from_param_noalign_param_1+12];
; PTX-NEXT:    st.volatile.b8 [%rd1+12], %rs4;
; PTX-NEXT:    ld.param.b8 %rs5, [memcpy_from_param_noalign_param_1+11];
; PTX-NEXT:    st.volatile.b8 [%rd1+11], %rs5;
; PTX-NEXT:    ld.param.b8 %rs6, [memcpy_from_param_noalign_param_1+10];
; PTX-NEXT:    st.volatile.b8 [%rd1+10], %rs6;
; PTX-NEXT:    ld.param.b8 %rs7, [memcpy_from_param_noalign_param_1+9];
; PTX-NEXT:    st.volatile.b8 [%rd1+9], %rs7;
; PTX-NEXT:    ld.param.b8 %rs8, [memcpy_from_param_noalign_param_1+8];
; PTX-NEXT:    st.volatile.b8 [%rd1+8], %rs8;
; PTX-NEXT:    ld.param.b8 %rs9, [memcpy_from_param_noalign_param_1+7];
; PTX-NEXT:    st.volatile.b8 [%rd1+7], %rs9;
; PTX-NEXT:    ld.param.b8 %rs10, [memcpy_from_param_noalign_param_1+6];
; PTX-NEXT:    st.volatile.b8 [%rd1+6], %rs10;
; PTX-NEXT:    ld.param.b8 %rs11, [memcpy_from_param_noalign_param_1+5];
; PTX-NEXT:    st.volatile.b8 [%rd1+5], %rs11;
; PTX-NEXT:    ld.param.b8 %rs12, [memcpy_from_param_noalign_param_1+4];
; PTX-NEXT:    st.volatile.b8 [%rd1+4], %rs12;
; PTX-NEXT:    ld.param.b8 %rs13, [memcpy_from_param_noalign_param_1+3];
; PTX-NEXT:    st.volatile.b8 [%rd1+3], %rs13;
; PTX-NEXT:    ld.param.b8 %rs14, [memcpy_from_param_noalign_param_1+2];
; PTX-NEXT:    st.volatile.b8 [%rd1+2], %rs14;
; PTX-NEXT:    ld.param.b8 %rs15, [memcpy_from_param_noalign_param_1+1];
; PTX-NEXT:    st.volatile.b8 [%rd1+1], %rs15;
; PTX-NEXT:    ld.param.b8 %rs16, [memcpy_from_param_noalign_param_1];
; PTX-NEXT:    st.volatile.b8 [%rd1], %rs16;
; PTX-NEXT:    ret;
entry:
  tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true)
  ret void
}

;; The byval argument is the destination of a volatile memcpy (last operand
;; i1 true) of 16 bytes read from %in, i.e. the kernel writes to its argument.
;; Expected results encoded in the assertions below (identical for every opt
;; invocation):
;;  * the argument is first copied into an alloca of %struct.S, and the
;;    original memcpy is retargeted to write into that alloca.
;;  * llc copies the two 32-bit fields into a local depot, then assembles the
;;    16 source bytes from byte-sized volatile loads into two 64-bit values
;;    that are stored with st.volatile.b64 at [%SP] and [%SP+8].
;; NOTE(review): the copy length (16) is larger than %struct.S (8 bytes), and
;; the expected depot is declared as 8 bytes while the second store targets
;; [%SP+8]. Presumably only the lowering shape matters to this test - confirm
;; the out-of-bounds write is intentional.
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
; COMMON-LABEL: define dso_local ptx_kernel void @memcpy_to_param(
; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT:  [[ENTRY:.*:]]
; COMMON-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT:    [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr [[S1]], ptr [[IN]], i64 16, i1 true)
; COMMON-NEXT:    ret void
;
; PTX-LABEL: memcpy_to_param(
; PTX:       {
; PTX-NEXT:    .local .align 8 .b8 __local_depot9[8];
; PTX-NEXT:    .reg .b64 %SP;
; PTX-NEXT:    .reg .b64 %SPL;
; PTX-NEXT:    .reg .b32 %r<3>;
; PTX-NEXT:    .reg .b64 %rd<48>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0: // %entry
; PTX-NEXT:    mov.b64 %SPL, __local_depot9;
; PTX-NEXT:    cvta.local.u64 %SP, %SPL;
; PTX-NEXT:    ld.param.b64 %rd1, [memcpy_to_param_param_0];
; PTX-NEXT:    add.u64 %rd3, %SPL, 0;
; PTX-NEXT:    ld.param.b32 %r1, [memcpy_to_param_param_1+4];
; PTX-NEXT:    st.local.b32 [%rd3+4], %r1;
; PTX-NEXT:    ld.param.b32 %r2, [memcpy_to_param_param_1];
; PTX-NEXT:    st.local.b32 [%rd3], %r2;
; PTX-NEXT:    ld.volatile.b8 %rd4, [%rd1];
; PTX-NEXT:    ld.volatile.b8 %rd5, [%rd1+1];
; PTX-NEXT:    shl.b64 %rd6, %rd5, 8;
; PTX-NEXT:    or.b64 %rd7, %rd6, %rd4;
; PTX-NEXT:    ld.volatile.b8 %rd8, [%rd1+2];
; PTX-NEXT:    shl.b64 %rd9, %rd8, 16;
; PTX-NEXT:    ld.volatile.b8 %rd10, [%rd1+3];
; PTX-NEXT:    shl.b64 %rd11, %rd10, 24;
; PTX-NEXT:    or.b64 %rd12, %rd11, %rd9;
; PTX-NEXT:    or.b64 %rd13, %rd12, %rd7;
; PTX-NEXT:    ld.volatile.b8 %rd14, [%rd1+4];
; PTX-NEXT:    ld.volatile.b8 %rd15, [%rd1+5];
; PTX-NEXT:    shl.b64 %rd16, %rd15, 8;
; PTX-NEXT:    or.b64 %rd17, %rd16, %rd14;
; PTX-NEXT:    ld.volatile.b8 %rd18, [%rd1+6];
; PTX-NEXT:    shl.b64 %rd19, %rd18, 16;
; PTX-NEXT:    ld.volatile.b8 %rd20, [%rd1+7];
; PTX-NEXT:    shl.b64 %rd21, %rd20, 24;
; PTX-NEXT:    or.b64 %rd22, %rd21, %rd19;
; PTX-NEXT:    or.b64 %rd23, %rd22, %rd17;
; PTX-NEXT:    shl.b64 %rd24, %rd23, 32;
; PTX-NEXT:    or.b64 %rd25, %rd24, %rd13;
; PTX-NEXT:    st.volatile.b64 [%SP], %rd25;
; PTX-NEXT:    ld.volatile.b8 %rd26, [%rd1+8];
; PTX-NEXT:    ld.volatile.b8 %rd27, [%rd1+9];
; PTX-NEXT:    shl.b64 %rd28, %rd27, 8;
; PTX-NEXT:    or.b64 %rd29, %rd28, %rd26;
; PTX-NEXT:    ld.volatile.b8 %rd30, [%rd1+10];
; PTX-NEXT:    shl.b64 %rd31, %rd30, 16;
; PTX-NEXT:    ld.volatile.b8 %rd32, [%rd1+11];
; PTX-NEXT:    shl.b64 %rd33, %rd32, 24;
; PTX-NEXT:    or.b64 %rd34, %rd33, %rd31;
; PTX-NEXT:    or.b64 %rd35, %rd34, %rd29;
; PTX-NEXT:    ld.volatile.b8 %rd36, [%rd1+12];
; PTX-NEXT:    ld.volatile.b8 %rd37, [%rd1+13];
; PTX-NEXT:    shl.b64 %rd38, %rd37, 8;
; PTX-NEXT:    or.b64 %rd39, %rd38, %rd36;
; PTX-NEXT:    ld.volatile.b8 %rd40, [%rd1+14];
; PTX-NEXT:    shl.b64 %rd41, %rd40, 16;
; PTX-NEXT:    ld.volatile.b8 %rd42, [%rd1+15];
; PTX-NEXT:    shl.b64 %rd43, %rd42, 24;
; PTX-NEXT:    or.b64 %rd44, %rd43, %rd41;
; PTX-NEXT:    or.b64 %rd45, %rd44, %rd39;
; PTX-NEXT:    shl.b64 %rd46, %rd45, 32;
; PTX-NEXT:    or.b64 %rd47, %rd46, %rd35;
; PTX-NEXT:    st.volatile.b64 [%SP+8], %rd47;
; PTX-NEXT:    ret;
entry:
  tail call void @llvm.memcpy.p0.p0.i64(ptr %s, ptr %in, i64 16, i1 true)
  ret void
}

;; The kernel writes to its byval argument: it loads an i32 from %in and stores
;; it into the first field of %s. The i1 argument %b is not used by the body.
;; Expected results encoded in the assertions below (identical for every opt
;; invocation):
;;  * the argument is copied into an alloca of %struct.S and the store is
;;    retargeted to that alloca.
;;  * the llc output for the kernel body is a bare `ret`: neither the local
;;    copy nor the store appears in the expected output.
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define dso_local ptx_kernel void @copy_on_store(ptr nocapture noundef readonly %in, ptr nocapture noundef byval(%struct.S) align 4 %s, i1 noundef zeroext %b) local_unnamed_addr #0 {
; COMMON-LABEL: define dso_local ptx_kernel void @copy_on_store(
; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT:  [[BB:.*:]]
; COMMON-NEXT:    [[S1:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT:    [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT:    [[I:%.*]] = load i32, ptr [[IN]], align 4
; COMMON-NEXT:    store i32 [[I]], ptr [[S1]], align 4
; COMMON-NEXT:    ret void
;
; PTX-LABEL: copy_on_store(
; PTX:       {
; PTX-EMPTY:
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0: // %bb
; PTX-NEXT:    ret;
bb:
  %i = load i32, ptr %in, align 4
  store i32 %i, ptr %s, align 4
  ret void
}

;; Read-only use of two byval i32 arguments through a select: %cond picks
;; between %input1 and %input2, the chosen pointer is loaded, and the value is
;; stored to %out. Expectations differ by target and pass:
;;  * nvptx-lower-args on sm_60 copies both arguments into allocas and selects
;;    between the allocas.
;;  * nvptx-lower-args on sm_70 creates no allocas; each argument is wrapped
;;    into the param address space (101) and addrspacecast back to a generic
;;    pointer, and the select operates on those generic pointers.
;;  * nvptx-copy-byval-args (run with sm_70) copies both arguments into
;;    allocas, like the sm_60 lowering.
;;  * llc on sm_60 loads both parameter values and selects between the values
;;    with selp.b32.
;;  * llc on sm_70 materialises both parameter addresses with mov.b64, selects
;;    between the addresses with selp.b64 and performs one ld.param.b32 through
;;    the selected register.
define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) {
; SM_60-LABEL: define ptx_kernel void @test_select(
; SM_60-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
; SM_60-NEXT:  [[BB:.*:]]
; SM_60-NEXT:    [[INPUT24:%.*]] = alloca i32, align 4
; SM_60-NEXT:    [[INPUT25:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; SM_60-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT24]], ptr addrspace(101) align 4 [[INPUT25]], i64 4, i1 false)
; SM_60-NEXT:    [[INPUT11:%.*]] = alloca i32, align 4
; SM_60-NEXT:    [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; SM_60-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
; SM_60-NEXT:    [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]]
; SM_60-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
; SM_60-NEXT:    store i32 [[VALLOADED]], ptr [[OUT]], align 4
; SM_60-NEXT:    ret void
;
; SM_70-LABEL: define ptx_kernel void @test_select(
; SM_70-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
; SM_70-NEXT:  [[BB:.*:]]
; SM_70-NEXT:    [[TMP0:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; SM_70-NEXT:    [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP0]] to ptr
; SM_70-NEXT:    [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; SM_70-NEXT:    [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; SM_70-NEXT:    [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]]
; SM_70-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
; SM_70-NEXT:    store i32 [[VALLOADED]], ptr [[OUT]], align 4
; SM_70-NEXT:    ret void
;
; COPY-LABEL: define ptx_kernel void @test_select(
; COPY-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
; COPY-NEXT:  [[BB:.*:]]
; COPY-NEXT:    [[INPUT23:%.*]] = alloca i32, align 4
; COPY-NEXT:    [[INPUT24:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; COPY-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false)
; COPY-NEXT:    [[INPUT11:%.*]] = alloca i32, align 4
; COPY-NEXT:    [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; COPY-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
; COPY-NEXT:    [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]]
; COPY-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
; COPY-NEXT:    store i32 [[VALLOADED]], ptr [[OUT]], align 4
; COPY-NEXT:    ret void
;
; PTX_60-LABEL: test_select(
; PTX_60:       {
; PTX_60-NEXT:    .reg .pred %p<2>;
; PTX_60-NEXT:    .reg .b16 %rs<3>;
; PTX_60-NEXT:    .reg .b32 %r<4>;
; PTX_60-NEXT:    .reg .b64 %rd<3>;
; PTX_60-EMPTY:
; PTX_60-NEXT:  // %bb.0: // %bb
; PTX_60-NEXT:    ld.param.b8 %rs1, [test_select_param_3];
; PTX_60-NEXT:    and.b16 %rs2, %rs1, 1;
; PTX_60-NEXT:    setp.ne.b16 %p1, %rs2, 0;
; PTX_60-NEXT:    ld.param.b64 %rd1, [test_select_param_2];
; PTX_60-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; PTX_60-NEXT:    ld.param.b32 %r1, [test_select_param_1];
; PTX_60-NEXT:    ld.param.b32 %r2, [test_select_param_0];
; PTX_60-NEXT:    selp.b32 %r3, %r2, %r1, %p1;
; PTX_60-NEXT:    st.global.b32 [%rd2], %r3;
; PTX_60-NEXT:    ret;
;
; PTX_70-LABEL: test_select(
; PTX_70:       {
; PTX_70-NEXT:    .reg .pred %p<2>;
; PTX_70-NEXT:    .reg .b16 %rs<3>;
; PTX_70-NEXT:    .reg .b32 %r<2>;
; PTX_70-NEXT:    .reg .b64 %rd<6>;
; PTX_70-EMPTY:
; PTX_70-NEXT:  // %bb.0: // %bb
; PTX_70-NEXT:    ld.param.b8 %rs1, [test_select_param_3];
; PTX_70-NEXT:    and.b16 %rs2, %rs1, 1;
; PTX_70-NEXT:    setp.ne.b16 %p1, %rs2, 0;
; PTX_70-NEXT:    mov.b64 %rd1, test_select_param_0;
; PTX_70-NEXT:    ld.param.b64 %rd2, [test_select_param_2];
; PTX_70-NEXT:    cvta.to.global.u64 %rd3, %rd2;
; PTX_70-NEXT:    mov.b64 %rd4, test_select_param_1;
; PTX_70-NEXT:    selp.b64 %rd5, %rd1, %rd4, %p1;
; PTX_70-NEXT:    ld.param.b32 %r1, [%rd5];
; PTX_70-NEXT:    st.global.b32 [%rd3], %r1;
; PTX_70-NEXT:    ret;
bb:
  %ptrnew = select i1 %cond, ptr %input1, ptr %input2
  %valloaded = load i32, ptr %ptrnew, align 4
  store i32 %valloaded, ptr %out, align 4
  ret void
}

; Kernel that selects between two byval(i32) arguments with %cond and stores
; the constant 1 through the selected pointer. %out is part of the signature
; but is never used in the body.
;
; Expected output, as encoded by the autogenerated assertions below:
;  - COMMON checks (shared by every opt invocation of this file): both byval
;    arguments are copied from addrspace(101) into fresh allocas with memcpy,
;    and the select chooses between the two allocas instead of the arguments.
;  - PTX checks (shared by both llc invocations): an 8-byte local depot
;    receives the two parameter values, and the store of 1 is a st.local
;    through the address picked by selp.
define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) {
; COMMON-LABEL: define ptx_kernel void @test_select_write(
; COMMON-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
; COMMON-NEXT:  [[BB:.*:]]
; COMMON-NEXT:    [[INPUT23:%.*]] = alloca i32, align 4
; COMMON-NEXT:    [[INPUT24:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; COMMON-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false)
; COMMON-NEXT:    [[INPUT11:%.*]] = alloca i32, align 4
; COMMON-NEXT:    [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; COMMON-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
; COMMON-NEXT:    [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]]
; COMMON-NEXT:    store i32 1, ptr [[PTRNEW]], align 4
; COMMON-NEXT:    ret void
;
; PTX-LABEL: test_select_write(
; PTX:       {
; PTX-NEXT:    .local .align 4 .b8 __local_depot12[8];
; PTX-NEXT:    .reg .b64 %SP;
; PTX-NEXT:    .reg .b64 %SPL;
; PTX-NEXT:    .reg .pred %p<2>;
; PTX-NEXT:    .reg .b16 %rs<3>;
; PTX-NEXT:    .reg .b32 %r<3>;
; PTX-NEXT:    .reg .b64 %rd<6>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0: // %bb
; PTX-NEXT:    mov.b64 %SPL, __local_depot12;
; PTX-NEXT:    cvta.local.u64 %SP, %SPL;
; PTX-NEXT:    ld.param.b8 %rs1, [test_select_write_param_3];
; PTX-NEXT:    and.b16 %rs2, %rs1, 1;
; PTX-NEXT:    setp.ne.b16 %p1, %rs2, 0;
; PTX-NEXT:    ld.param.b32 %r1, [test_select_write_param_1];
; PTX-NEXT:    st.b32 [%SP], %r1;
; PTX-NEXT:    ld.param.b32 %r2, [test_select_write_param_0];
; PTX-NEXT:    st.b32 [%SP+4], %r2;
; PTX-NEXT:    add.u64 %rd2, %SPL, 4;
; PTX-NEXT:    add.u64 %rd4, %SPL, 0;
; PTX-NEXT:    selp.b64 %rd5, %rd2, %rd4, %p1;
; PTX-NEXT:    st.local.b32 [%rd5], 1;
; PTX-NEXT:    ret;
bb:
  %ptrnew = select i1 %cond, ptr %input1, ptr %input2
  store i32 1, ptr %ptrnew, align 4
  ret void
}

; Kernel that chooses, by control flow on %cond, the address of one field of
; one of two byval(%struct.S) arguments (field 0 of %input1 on the %first path,
; field 1 of %input2 on the %second path), merges the two addresses with a phi,
; loads an i32 through the merged pointer and stores it to %inout. The byval
; data is only read, never written.
;
; Expected output differs per configuration, as encoded by the autogenerated
; assertions below:
;  - SM_60 and COPY checks: both arguments are copied from addrspace(101) into
;    allocas and the GEPs are rebased onto those copies.
;  - SM_70 checks: no allocas; each argument is wrapped into addrspace(101) and
;    addrspacecast back to a generic pointer that the GEPs then use.
;  - PTX_60 checks: the field values are read with ld.param from fixed
;    parameter offsets (param_0 and param_1+4); no local depot is declared.
;  - PTX_70 checks: the parameter address is materialized in a register on
;    each path and a single ld.param through that register follows the merge.
define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, ptr %inout, i1 %cond) {
; SM_60-LABEL: define ptx_kernel void @test_phi(
; SM_60-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
; SM_60-NEXT:  [[BB:.*:]]
; SM_60-NEXT:    [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
; SM_60-NEXT:    [[INPUT25:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; SM_60-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false)
; SM_60-NEXT:    [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
; SM_60-NEXT:    [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; SM_60-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false)
; SM_60-NEXT:    br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; SM_60:       [[FIRST]]:
; SM_60-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0
; SM_60-NEXT:    br label %[[MERGE:.*]]
; SM_60:       [[SECOND]]:
; SM_60-NEXT:    [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT24]], i32 0, i32 1
; SM_60-NEXT:    br label %[[MERGE]]
; SM_60:       [[MERGE]]:
; SM_60-NEXT:    [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
; SM_60-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
; SM_60-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT]], align 4
; SM_60-NEXT:    ret void
;
; SM_70-LABEL: define ptx_kernel void @test_phi(
; SM_70-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
; SM_70-NEXT:  [[BB:.*:]]
; SM_70-NEXT:    [[TMP0:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; SM_70-NEXT:    [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP0]] to ptr
; SM_70-NEXT:    [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; SM_70-NEXT:    [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; SM_70-NEXT:    br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; SM_70:       [[FIRST]]:
; SM_70-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
; SM_70-NEXT:    br label %[[MERGE:.*]]
; SM_70:       [[SECOND]]:
; SM_70-NEXT:    [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT2_PARAM_GEN]], i32 0, i32 1
; SM_70-NEXT:    br label %[[MERGE]]
; SM_70:       [[MERGE]]:
; SM_70-NEXT:    [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
; SM_70-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
; SM_70-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT]], align 4
; SM_70-NEXT:    ret void
;
; COPY-LABEL: define ptx_kernel void @test_phi(
; COPY-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
; COPY-NEXT:  [[BB:.*:]]
; COPY-NEXT:    [[INPUT23:%.*]] = alloca [[STRUCT_S]], align 8
; COPY-NEXT:    [[INPUT24:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; COPY-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT23]], ptr addrspace(101) align 8 [[INPUT24]], i64 8, i1 false)
; COPY-NEXT:    [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
; COPY-NEXT:    [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; COPY-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false)
; COPY-NEXT:    br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; COPY:       [[FIRST]]:
; COPY-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0
; COPY-NEXT:    br label %[[MERGE:.*]]
; COPY:       [[SECOND]]:
; COPY-NEXT:    [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT23]], i32 0, i32 1
; COPY-NEXT:    br label %[[MERGE]]
; COPY:       [[MERGE]]:
; COPY-NEXT:    [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
; COPY-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
; COPY-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT]], align 4
; COPY-NEXT:    ret void
;
; PTX_60-LABEL: test_phi(
; PTX_60:       {
; PTX_60-NEXT:    .reg .pred %p<2>;
; PTX_60-NEXT:    .reg .b16 %rs<3>;
; PTX_60-NEXT:    .reg .b32 %r<5>;
; PTX_60-NEXT:    .reg .b64 %rd<3>;
; PTX_60-EMPTY:
; PTX_60-NEXT:  // %bb.0: // %bb
; PTX_60-NEXT:    ld.param.b8 %rs1, [test_phi_param_3];
; PTX_60-NEXT:    and.b16 %rs2, %rs1, 1;
; PTX_60-NEXT:    setp.ne.b16 %p1, %rs2, 0;
; PTX_60-NEXT:    ld.param.b64 %rd2, [test_phi_param_2];
; PTX_60-NEXT:    cvta.to.global.u64 %rd1, %rd2;
; PTX_60-NEXT:    ld.param.b32 %r4, [test_phi_param_0];
; PTX_60-NEXT:    @%p1 bra $L__BB13_2;
; PTX_60-NEXT:  // %bb.1: // %second
; PTX_60-NEXT:    ld.param.b32 %r4, [test_phi_param_1+4];
; PTX_60-NEXT:  $L__BB13_2: // %merge
; PTX_60-NEXT:    st.global.b32 [%rd1], %r4;
; PTX_60-NEXT:    ret;
;
; PTX_70-LABEL: test_phi(
; PTX_70:       {
; PTX_70-NEXT:    .reg .pred %p<2>;
; PTX_70-NEXT:    .reg .b16 %rs<3>;
; PTX_70-NEXT:    .reg .b32 %r<2>;
; PTX_70-NEXT:    .reg .b64 %rd<8>;
; PTX_70-EMPTY:
; PTX_70-NEXT:  // %bb.0: // %bb
; PTX_70-NEXT:    ld.param.b8 %rs1, [test_phi_param_3];
; PTX_70-NEXT:    and.b16 %rs2, %rs1, 1;
; PTX_70-NEXT:    setp.ne.b16 %p1, %rs2, 0;
; PTX_70-NEXT:    mov.b64 %rd7, test_phi_param_0;
; PTX_70-NEXT:    ld.param.b64 %rd6, [test_phi_param_2];
; PTX_70-NEXT:    cvta.to.global.u64 %rd1, %rd6;
; PTX_70-NEXT:    @%p1 bra $L__BB13_2;
; PTX_70-NEXT:  // %bb.1: // %second
; PTX_70-NEXT:    mov.b64 %rd2, test_phi_param_1;
; PTX_70-NEXT:    add.s64 %rd7, %rd2, 4;
; PTX_70-NEXT:  $L__BB13_2: // %merge
; PTX_70-NEXT:    ld.param.b32 %r1, [%rd7];
; PTX_70-NEXT:    st.global.b32 [%rd1], %r1;
; PTX_70-NEXT:    ret;
bb:
  br i1 %cond, label %first, label %second

first:                                            ; preds = %bb
  %ptr1 = getelementptr inbounds %struct.S, ptr %input1, i32 0, i32 0
  br label %merge

second:                                           ; preds = %bb
  %ptr2 = getelementptr inbounds %struct.S, ptr %input2, i32 0, i32 1
  br label %merge

merge:                                            ; preds = %second, %first
  %ptrnew = phi ptr [ %ptr1, %first ], [ %ptr2, %second ]
  %valloaded = load i32, ptr %ptrnew, align 4
  store i32 %valloaded, ptr %inout, align 4
  ret void
}

; Kernel with the same branch/phi shape as @test_phi (field 0 of %input1 on
; the %first path, field 1 of %input2 on the %second path), except that the
; merged pointer is the destination of a store of the constant 1 rather than
; the source of a load. There is no separate output pointer parameter.
;
; Expected output, as encoded by the autogenerated assertions below:
;  - COMMON checks (shared by every opt invocation of this file): both byval
;    arguments are copied from addrspace(101) into allocas and the GEPs are
;    rebased onto those copies.
;  - PTX checks (shared by both llc invocations): an 8-byte local depot
;    receives one i32 from each argument (param_1+4 and param_0), and the store
;    of 1 is a st.local through the address chosen by the branch.
define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, i1 %cond) {
; COMMON-LABEL: define ptx_kernel void @test_phi_write(
; COMMON-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
; COMMON-NEXT:  [[BB:.*:]]
; COMMON-NEXT:    [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
; COMMON-NEXT:    [[INPUT25:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; COMMON-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false)
; COMMON-NEXT:    [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT:    [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; COMMON-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false)
; COMMON-NEXT:    br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; COMMON:       [[FIRST]]:
; COMMON-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0
; COMMON-NEXT:    br label %[[MERGE:.*]]
; COMMON:       [[SECOND]]:
; COMMON-NEXT:    [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT24]], i32 0, i32 1
; COMMON-NEXT:    br label %[[MERGE]]
; COMMON:       [[MERGE]]:
; COMMON-NEXT:    [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
; COMMON-NEXT:    store i32 1, ptr [[PTRNEW]], align 4
; COMMON-NEXT:    ret void
;
; PTX-LABEL: test_phi_write(
; PTX:       {
; PTX-NEXT:    .local .align 4 .b8 __local_depot14[8];
; PTX-NEXT:    .reg .b64 %SP;
; PTX-NEXT:    .reg .b64 %SPL;
; PTX-NEXT:    .reg .pred %p<2>;
; PTX-NEXT:    .reg .b16 %rs<3>;
; PTX-NEXT:    .reg .b32 %r<3>;
; PTX-NEXT:    .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0: // %bb
; PTX-NEXT:    mov.b64 %SPL, __local_depot14;
; PTX-NEXT:    cvta.local.u64 %SP, %SPL;
; PTX-NEXT:    ld.param.b8 %rs1, [test_phi_write_param_2];
; PTX-NEXT:    and.b16 %rs2, %rs1, 1;
; PTX-NEXT:    setp.ne.b16 %p1, %rs2, 0;
; PTX-NEXT:    add.u64 %rd1, %SPL, 0;
; PTX-NEXT:    ld.param.b32 %r1, [test_phi_write_param_1+4];
; PTX-NEXT:    st.b32 [%SP], %r1;
; PTX-NEXT:    add.u64 %rd6, %SPL, 4;
; PTX-NEXT:    ld.param.b32 %r2, [test_phi_write_param_0];
; PTX-NEXT:    st.b32 [%SP+4], %r2;
; PTX-NEXT:    @%p1 bra $L__BB14_2;
; PTX-NEXT:  // %bb.1: // %second
; PTX-NEXT:    mov.b64 %rd6, %rd1;
; PTX-NEXT:  $L__BB14_2: // %merge
; PTX-NEXT:    st.local.b32 [%rd6], 1;
; PTX-NEXT:    ret;
bb:
  br i1 %cond, label %first, label %second

first:                                            ; preds = %bb
  %ptr1 = getelementptr inbounds %struct.S, ptr %input1, i32 0, i32 0
  br label %merge

second:                                           ; preds = %bb
  %ptr2 = getelementptr inbounds %struct.S, ptr %input2, i32 0, i32 1
  br label %merge

merge:                                            ; preds = %second, %first
  %ptrnew = phi ptr [ %ptr1, %first ], [ %ptr2, %second ]
  store i32 1, ptr %ptrnew, align 4
  ret void
}

; Kernel whose only action is to forward its byval(i32) argument, again as a
; byval(i32) argument, to the non-kernel function @device_func.
;
; Expected output, as encoded by the autogenerated assertions below:
;  - COMMON checks (shared by every opt invocation of this file): the argument
;    is copied from addrspace(101) into an alloca and the call passes that
;    alloca instead of the original argument.
;  - PTX checks (shared by both llc invocations): the parameter value is
;    loaded once, stored to a 4-byte local depot, and also written to the
;    outgoing param0 slot of the call.
define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) {
; COMMON-LABEL: define ptx_kernel void @test_forward_byval_arg(
; COMMON-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR3]] {
; COMMON-NEXT:    [[INPUT1:%.*]] = alloca i32, align 4
; COMMON-NEXT:    [[INPUT2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; COMMON-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT1]], ptr addrspace(101) align 4 [[INPUT2]], i64 4, i1 false)
; COMMON-NEXT:    call void @device_func(ptr byval(i32) align 4 [[INPUT1]])
; COMMON-NEXT:    ret void
;
; PTX-LABEL: test_forward_byval_arg(
; PTX:       {
; PTX-NEXT:    .local .align 4 .b8 __local_depot15[4];
; PTX-NEXT:    .reg .b64 %SP;
; PTX-NEXT:    .reg .b64 %SPL;
; PTX-NEXT:    .reg .b32 %r<2>;
; PTX-NEXT:    .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    mov.b64 %SPL, __local_depot15;
; PTX-NEXT:    add.u64 %rd2, %SPL, 0;
; PTX-NEXT:    ld.param.b32 %r1, [test_forward_byval_arg_param_0];
; PTX-NEXT:    st.local.b32 [%rd2], %r1;
; PTX-NEXT:    { // callseq 2, 0
; PTX-NEXT:    .param .align 4 .b8 param0[4];
; PTX-NEXT:    st.param.b32 [param0], %r1;
; PTX-NEXT:    call.uni device_func, (param0);
; PTX-NEXT:    } // callseq 2
; PTX-NEXT:    ret;
  call void @device_func(ptr byval(i32) align 4 %input)
  ret void
}

; Non-kernel function (it is not marked ptx_kernel) that takes a byval(i32)
; argument and passes it, again as byval(i32), to a call to itself. It is the
; callee used by @test_forward_byval_arg. The self-call is unconditional, so
; this function is meaningful only as a compile-time test input.
;
; Expected output, as encoded by the autogenerated assertions below:
;  - COMMON checks (shared by every opt invocation of this file): the body is
;    left as written; no alloca or memcpy is introduced and the call still
;    receives the original argument.
;  - PTX checks (shared by both llc invocations): the i32 is loaded from the
;    incoming parameter and written to the outgoing param0 slot; no local
;    depot is declared.
define void @device_func(ptr byval(i32) align 4 %input) {
; COMMON-LABEL: define void @device_func(
; COMMON-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR3]] {
; COMMON-NEXT:    call void @device_func(ptr byval(i32) align 4 [[INPUT]])
; COMMON-NEXT:    ret void
;
; PTX-LABEL: device_func(
; PTX:       {
; PTX-NEXT:    .reg .b32 %r<2>;
; PTX-NEXT:    .reg .b64 %rd<2>;
; PTX-EMPTY:
; PTX-NEXT:  // %bb.0:
; PTX-NEXT:    ld.param.b32 %r1, [device_func_param_0];
; PTX-NEXT:    { // callseq 3, 0
; PTX-NEXT:    .param .align 4 .b8 param0[4];
; PTX-NEXT:    st.param.b32 [param0], %r1;
; PTX-NEXT:    call.uni device_func, (param0);
; PTX-NEXT:    } // callseq 3
; PTX-NEXT:    ret;
  call void @device_func(ptr byval(i32) align 4 %input)
  ret void
}

; Attribute groups referenced by the declarations at the top of this file:
;   #0 - the @_Z6escapePv and @_Z6escapei declarations
;   #1 - the llvm.memcpy and llvm.memmove intrinsic declarations
;   #2 - the llvm.memset intrinsic declaration
; NOTE(review): #0 lists +ptx78 in "target-features" while the tool command
; lines at the top of the file pass -mattr=ptx77 - confirm the mismatch is
; intentional or harmless.
attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "no-trapping-math"="true" "target-cpu"="sm_60" "target-features"="+ptx78,+sm_60" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }

; Module-level named metadata: four module flags (!0-!3) and two producer
; identification strings (!20, !21).
!llvm.module.flags = !{!0, !1, !2, !3}
!llvm.ident = !{!20, !21}

!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 8]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{i32 7, !"frame-pointer", i32 2}
!20 = !{!"clang version 20.0.0git"}
!21 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
