; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s

; i16 multiply in an amdgpu_ps shader where both operands are `inreg`
; arguments. The expected code keeps the computation in scalar registers and
; performs it with a 32-bit s_mul_i32 on every target; all targets except
; gfx700 first mask both inputs with 0xffff.
define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
; GFX7-LABEL: s_mul_i16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mul_i32 s0, s0, s1
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX8-NEXT:    s_mul_i32 s0, s0, s1
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX9-NEXT:    s_mul_i32 s0, s0, s1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s1
; GFX10PLUS-NEXT:    ; return to shader part epilog
;
; GFX12-LABEL: s_mul_i16:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX12-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    s_mul_i32 s0, s0, s1
; GFX12-NEXT:    ; return to shader part epilog
  %result = mul i16 %num, %den
  ret i16 %result
}

; i16 multiply in a function with the default calling convention and plain
; (non-inreg) arguments. The expected code multiplies in vector registers:
; gfx700 masks both inputs with 0xffff and uses v_mul_u32_u24, while the newer
; targets emit a single v_mul_lo_u16.
define i16 @v_mul_i16(i16 %num, i16 %den) {
; GFX7-LABEL: v_mul_i16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_mul_i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_mul_lo_u16 v0, v0, v1
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_mul_i16:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_mul_lo_u16 v0, v0, v1
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %result = mul i16 %num, %den
  ret i16 %result
}

; i16 multiply in an amdgpu_ps shader with `inreg zeroext` arguments and a
; `zeroext` return value. The expected code multiplies with s_mul_i32 and then
; masks the product with 0xffff on every target; all targets except gfx700
; also mask both inputs before the multiply.
define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inreg zeroext %den) {
; GFX7-LABEL: s_mul_i16_zeroext:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mul_i32 s0, s0, s1
; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16_zeroext:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX8-NEXT:    s_mul_i32 s0, s0, s1
; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16_zeroext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX9-NEXT:    s_mul_i32 s0, s0, s1
; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i16_zeroext:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s1
; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX10PLUS-NEXT:    ; return to shader part epilog
;
; GFX12-LABEL: s_mul_i16_zeroext:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX12-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT:    s_mul_i32 s0, s0, s1
; GFX12-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX12-NEXT:    ; return to shader part epilog
  %result = mul i16 %num, %den
  ret i16 %result
}

; i16 multiply with `zeroext` arguments and a `zeroext` return value, default
; calling convention. In the expected code gfx700 multiplies with
; v_mul_u32_u24 without masking the inputs first and then masks the product;
; gfx801 and gfx900 return the v_mul_lo_u16 result with no extra mask; gfx1010
; and newer follow v_mul_lo_u16 with a 0xffff mask.
; NOTE(review): the missing mask on gfx801/gfx900 presumably relies on the
; 16-bit multiply already clearing the upper half of the register there - this
; file does not establish that; confirm against the ISA documentation.
define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
; GFX7-LABEL: v_mul_i16_zeroext:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16_zeroext:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16_zeroext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_mul_i16_zeroext:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_mul_lo_u16 v0, v0, v1
; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_mul_i16_zeroext:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_mul_lo_u16 v0, v0, v1
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %result = mul i16 %num, %den
  ret i16 %result
}

; i16 multiply in an amdgpu_ps shader with `inreg signext` arguments and a
; `signext` return value. The expected code multiplies with s_mul_i32 and then
; sign-extends the low 16 bits with s_sext_i32_i16 on every target; all
; targets except gfx700 also mask both inputs with 0xffff before the multiply.
define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inreg signext %den) {
; GFX7-LABEL: s_mul_i16_signext:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mul_i32 s0, s0, s1
; GFX7-NEXT:    s_sext_i32_i16 s0, s0
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16_signext:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX8-NEXT:    s_mul_i32 s0, s0, s1
; GFX8-NEXT:    s_sext_i32_i16 s0, s0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16_signext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX9-NEXT:    s_mul_i32 s0, s0, s1
; GFX9-NEXT:    s_sext_i32_i16 s0, s0
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i16_signext:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s1
; GFX10PLUS-NEXT:    s_sext_i32_i16 s0, s0
; GFX10PLUS-NEXT:    ; return to shader part epilog
;
; GFX12-LABEL: s_mul_i16_signext:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX12-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT:    s_mul_i32 s0, s0, s1
; GFX12-NEXT:    s_sext_i32_i16 s0, s0
; GFX12-NEXT:    ; return to shader part epilog
  %result = mul i16 %num, %den
  ret i16 %result
}

; i16 multiply with `signext` arguments and a `signext` return value, default
; calling convention. Every target's expected code ends with a
; `v_bfe_i32 v0, v0, 0, 16` applied to the product. gfx700 masks both inputs
; with 0xffff and multiplies with v_mul_u32_u24; the newer targets use
; v_mul_lo_u16.
define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) {
; GFX7-LABEL: v_mul_i16_signext:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16_signext:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16_signext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_mul_i16_signext:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_mul_lo_u16 v0, v0, v1
; GFX10PLUS-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_mul_i16_signext:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_mul_lo_u16 v0, v0, v1
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %result = mul i16 %num, %den
  ret i16 %result
}

; i32 multiply in an amdgpu_ps shader with `inreg` arguments. The expected
; code is a single s_mul_i32 on every tested target.
define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) {
; GCN-LABEL: s_mul_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_mul_i32 s0, s0, s1
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s1
; GFX10PLUS-NEXT:    ; return to shader part epilog
;
; GFX12-LABEL: s_mul_i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_mul_i32 s0, s0, s1
; GFX12-NEXT:    ; return to shader part epilog
  %result = mul i32 %num, %den
  ret i32 %result
}

; i32 multiply with plain arguments and the default calling convention. The
; expected code is a single v_mul_lo_u32 on every tested target; only the
; wait instructions at function entry differ between targets.
define i32 @v_mul_i32(i32 %num, i32 %den) {
; GCN-LABEL: v_mul_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_mul_lo_u32 v0, v0, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_mul_i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_mul_lo_u32 v0, v0, v1
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_mul_i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v1
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %result = mul i32 %num, %den
  ret i32 %result
}

; <2 x i32> multiply in an amdgpu_ps shader with `inreg` arguments. The
; expected code is one s_mul_i32 per vector element (s0*s2 and s1*s3) on every
; tested target.
define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) {
; GCN-LABEL: s_mul_v2i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_mul_i32 s0, s0, s2
; GCN-NEXT:    s_mul_i32 s1, s1, s3
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_v2i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s2
; GFX10PLUS-NEXT:    s_mul_i32 s1, s1, s3
; GFX10PLUS-NEXT:    ; return to shader part epilog
;
; GFX12-LABEL: s_mul_v2i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_mul_i32 s0, s0, s2
; GFX12-NEXT:    s_mul_i32 s1, s1, s3
; GFX12-NEXT:    ; return to shader part epilog
  %result = mul <2 x i32> %num, %den
  ret <2 x i32> %result
}

; <2 x i32> multiply with plain arguments and the default calling convention.
; The expected code is one v_mul_lo_u32 per vector element (v0*v2 and v1*v3)
; on every tested target.
define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) {
; GCN-LABEL: v_mul_v2i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_mul_lo_u32 v0, v0, v2
; GCN-NEXT:    v_mul_lo_u32 v1, v1, v3
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_mul_v2i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_mul_lo_u32 v0, v0, v2
; GFX10PLUS-NEXT:    v_mul_lo_u32 v1, v1, v3
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_mul_v2i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v2
; GFX12-NEXT:    v_mul_lo_u32 v1, v1, v3
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %result = mul <2 x i32> %num, %den
  ret <2 x i32> %result
}

; i33 multiply (an integer width just over 32 bits) with `inreg` arguments,
; using the amdgpu_cs calling convention. The expected code treats the value
; as two 32-bit halves: gfx1200 emits a single s_mul_u64, while the older
; targets combine 32-bit partial products. gfx700 and gfx801 obtain the high
; half of the low product with v_mul_hi_u32 followed by v_readfirstlane_b32;
; gfx900 and newer use s_mul_hi_u32.
define amdgpu_cs i33 @s_mul_i33(i33 inreg %num,  i33 inreg %den) {
; GFX7-LABEL: s_mul_i33:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT:    s_mul_i32 s4, s0, s2
; GFX7-NEXT:    s_mul_i32 s0, s0, s3
; GFX7-NEXT:    s_mul_i32 s1, s1, s2
; GFX7-NEXT:    v_readfirstlane_b32 s5, v0
; GFX7-NEXT:    s_add_u32 s0, s0, s5
; GFX7-NEXT:    s_add_u32 s1, s1, s0
; GFX7-NEXT:    s_mov_b32 s0, s4
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i33:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT:    s_mul_i32 s4, s0, s2
; GFX8-NEXT:    s_mul_i32 s0, s0, s3
; GFX8-NEXT:    s_mul_i32 s1, s1, s2
; GFX8-NEXT:    v_readfirstlane_b32 s5, v0
; GFX8-NEXT:    s_add_u32 s0, s0, s5
; GFX8-NEXT:    s_add_u32 s1, s1, s0
; GFX8-NEXT:    s_mov_b32 s0, s4
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i33:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mul_i32 s4, s0, s2
; GFX9-NEXT:    s_mul_hi_u32 s5, s0, s2
; GFX9-NEXT:    s_mul_i32 s0, s0, s3
; GFX9-NEXT:    s_add_u32 s0, s0, s5
; GFX9-NEXT:    s_mul_i32 s1, s1, s2
; GFX9-NEXT:    s_add_u32 s1, s1, s0
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i33:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_mul_hi_u32 s4, s0, s2
; GFX10PLUS-NEXT:    s_mul_i32 s3, s0, s3
; GFX10PLUS-NEXT:    s_mul_i32 s1, s1, s2
; GFX10PLUS-NEXT:    s_add_i32 s3, s4, s3
; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s2
; GFX10PLUS-NEXT:    s_add_i32 s1, s3, s1
; GFX10PLUS-NEXT:    ; return to shader part epilog
;
; GFX12-LABEL: s_mul_i33:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_mul_u64 s[0:1], s[0:1], s[2:3]
; GFX12-NEXT:    ; return to shader part epilog
  %result = mul i33 %num, %den
  ret i33 %result
}

; i64 multiply in an amdgpu_ps shader with `inreg` arguments. In the expected
; code gfx1200 emits a single s_mul_u64; the older targets build the result
; from 32-bit pieces: the low product s0*s2, its high half, and the two cross
; products s0*s3 and s1*s2 added into the upper dword. gfx700 and gfx801 get
; the high half via v_mul_hi_u32 + v_readfirstlane_b32, gfx900 and newer via
; s_mul_hi_u32.
define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) {
; GFX7-LABEL: s_mul_i64:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT:    s_mul_i32 s4, s0, s2
; GFX7-NEXT:    s_mul_i32 s0, s0, s3
; GFX7-NEXT:    s_mul_i32 s1, s1, s2
; GFX7-NEXT:    v_readfirstlane_b32 s5, v0
; GFX7-NEXT:    s_add_u32 s0, s0, s5
; GFX7-NEXT:    s_add_u32 s1, s1, s0
; GFX7-NEXT:    s_mov_b32 s0, s4
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT:    s_mul_i32 s4, s0, s2
; GFX8-NEXT:    s_mul_i32 s0, s0, s3
; GFX8-NEXT:    s_mul_i32 s1, s1, s2
; GFX8-NEXT:    v_readfirstlane_b32 s5, v0
; GFX8-NEXT:    s_add_u32 s0, s0, s5
; GFX8-NEXT:    s_add_u32 s1, s1, s0
; GFX8-NEXT:    s_mov_b32 s0, s4
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mul_i32 s4, s0, s2
; GFX9-NEXT:    s_mul_hi_u32 s5, s0, s2
; GFX9-NEXT:    s_mul_i32 s0, s0, s3
; GFX9-NEXT:    s_add_u32 s0, s0, s5
; GFX9-NEXT:    s_mul_i32 s1, s1, s2
; GFX9-NEXT:    s_add_u32 s1, s1, s0
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i64:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_mul_hi_u32 s4, s0, s2
; GFX10PLUS-NEXT:    s_mul_i32 s3, s0, s3
; GFX10PLUS-NEXT:    s_mul_i32 s1, s1, s2
; GFX10PLUS-NEXT:    s_add_i32 s3, s4, s3
; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s2
; GFX10PLUS-NEXT:    s_add_i32 s1, s3, s1
; GFX10PLUS-NEXT:    ; return to shader part epilog
;
; GFX12-LABEL: s_mul_i64:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_mul_u64 s[0:1], s[0:1], s[2:3]
; GFX12-NEXT:    ; return to shader part epilog
  %result = mul i64 %num, %den
  ret i64 %result
}

; i64 multiply with plain arguments and the default calling convention. The
; expected code differs by target generation: gfx700/gfx801/gfx900 chain three
; v_mad_u64_u32; gfx1010 and gfx1100 use one v_mad_u64_u32 for the low product
; plus two v_mul_lo_u32 and a v_add3_u32 for the upper dword; gfx1200 uses no
; mad at all (v_mul_hi_u32, three v_mul_lo_u32 and a v_add3_u32).
define i64 @v_mul_i64(i64 %num, i64 %den) {
; GCN-LABEL: v_mul_i64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v4, v0
; GCN-NEXT:    v_mov_b32_e32 v5, v1
; GCN-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
; GCN-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2]
; GCN-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4]
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v4, v0
; GFX10-NEXT:    v_mov_b32_e32 v5, v1
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v4, v2, 0
; GFX10-NEXT:    v_mul_lo_u32 v3, v4, v3
; GFX10-NEXT:    v_mul_lo_u32 v2, v5, v2
; GFX10-NEXT:    v_add3_u32 v1, v1, v3, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_mul_i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v2, 0
; GFX11-NEXT:    v_mul_lo_u32 v3, v4, v3
; GFX11-NEXT:    v_mul_lo_u32 v2, v5, v2
; GFX11-NEXT:    v_add3_u32 v1, v1, v3, v2
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_mul_i64:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_mul_hi_u32 v4, v0, v2
; GFX12-NEXT:    v_mul_lo_u32 v3, v0, v3
; GFX12-NEXT:    v_mul_lo_u32 v1, v1, v2
; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v2
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT:    v_add3_u32 v1, v4, v3, v1
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %result = mul i64 %num, %den
  ret i64 %result
}

; i96 multiply in an amdgpu_ps shader with `inreg` arguments; the product is
; bitcast to <3 x i32> for the return. The expected code on every target
; assembles the result from 32-bit partial products and leaves the three
; result dwords in s0, s1 and s2. gfx700 and gfx801 compute the high halves of
; partial products with v_mul_hi_u32 + v_readfirstlane_b32; gfx900 and newer
; use s_mul_hi_u32.
define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
; GFX7-LABEL: s_mul_i96:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_mov_b32_e32 v0, s3
; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    v_mul_hi_u32 v1, s0, v1
; GFX7-NEXT:    s_mul_i32 s5, s0, s5
; GFX7-NEXT:    v_readfirstlane_b32 s7, v0
; GFX7-NEXT:    s_mul_i32 s8, s1, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s1
; GFX7-NEXT:    s_add_u32 s5, s8, s5
; GFX7-NEXT:    s_mul_i32 s2, s2, s3
; GFX7-NEXT:    v_mul_hi_u32 v0, v0, s3
; GFX7-NEXT:    s_mul_i32 s6, s0, s3
; GFX7-NEXT:    s_add_u32 s2, s2, s5
; GFX7-NEXT:    s_mul_i32 s0, s0, s4
; GFX7-NEXT:    v_readfirstlane_b32 s4, v1
; GFX7-NEXT:    s_add_u32 s0, s0, s7
; GFX7-NEXT:    s_addc_u32 s2, s4, s2
; GFX7-NEXT:    s_mul_i32 s1, s1, s3
; GFX7-NEXT:    v_readfirstlane_b32 s3, v0
; GFX7-NEXT:    s_add_u32 s1, s1, s0
; GFX7-NEXT:    s_addc_u32 s2, s3, s2
; GFX7-NEXT:    s_mov_b32 s0, s6
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i96:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v0, s3
; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s4
; GFX8-NEXT:    v_mul_hi_u32 v1, s0, v1
; GFX8-NEXT:    s_mul_i32 s5, s0, s5
; GFX8-NEXT:    v_readfirstlane_b32 s7, v0
; GFX8-NEXT:    s_mul_i32 s8, s1, s4
; GFX8-NEXT:    v_mov_b32_e32 v0, s1
; GFX8-NEXT:    s_add_u32 s5, s8, s5
; GFX8-NEXT:    s_mul_i32 s2, s2, s3
; GFX8-NEXT:    v_mul_hi_u32 v0, v0, s3
; GFX8-NEXT:    s_mul_i32 s6, s0, s3
; GFX8-NEXT:    s_add_u32 s2, s2, s5
; GFX8-NEXT:    s_mul_i32 s0, s0, s4
; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
; GFX8-NEXT:    s_add_u32 s0, s0, s7
; GFX8-NEXT:    s_addc_u32 s2, s4, s2
; GFX8-NEXT:    s_mul_i32 s1, s1, s3
; GFX8-NEXT:    v_readfirstlane_b32 s3, v0
; GFX8-NEXT:    s_add_u32 s1, s1, s0
; GFX8-NEXT:    s_addc_u32 s2, s3, s2
; GFX8-NEXT:    s_mov_b32 s0, s6
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i96:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mul_i32 s5, s0, s5
; GFX9-NEXT:    s_mul_i32 s8, s1, s4
; GFX9-NEXT:    s_add_u32 s5, s8, s5
; GFX9-NEXT:    s_mul_i32 s2, s2, s3
; GFX9-NEXT:    s_mul_hi_u32 s7, s0, s3
; GFX9-NEXT:    s_add_u32 s2, s2, s5
; GFX9-NEXT:    s_mul_i32 s5, s0, s4
; GFX9-NEXT:    s_mul_i32 s6, s0, s3
; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s4
; GFX9-NEXT:    s_add_u32 s4, s5, s7
; GFX9-NEXT:    s_addc_u32 s0, s0, s2
; GFX9-NEXT:    s_mul_i32 s2, s1, s3
; GFX9-NEXT:    s_mul_hi_u32 s3, s1, s3
; GFX9-NEXT:    s_add_u32 s1, s2, s4
; GFX9-NEXT:    s_addc_u32 s2, s3, s0
; GFX9-NEXT:    s_mov_b32 s0, s6
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i96:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_mul_i32 s6, s0, s5
; GFX10PLUS-NEXT:    s_mul_i32 s7, s1, s4
; GFX10PLUS-NEXT:    s_mul_i32 s2, s2, s3
; GFX10PLUS-NEXT:    s_add_i32 s6, s6, s7
; GFX10PLUS-NEXT:    s_mul_hi_u32 s7, s0, s3
; GFX10PLUS-NEXT:    s_add_i32 s6, s6, s2
; GFX10PLUS-NEXT:    s_mul_i32 s2, s0, s4
; GFX10PLUS-NEXT:    s_mul_i32 s5, s0, s3
; GFX10PLUS-NEXT:    s_mul_hi_u32 s0, s0, s4
; GFX10PLUS-NEXT:    s_add_u32 s2, s2, s7
; GFX10PLUS-NEXT:    s_mul_i32 s4, s1, s3
; GFX10PLUS-NEXT:    s_addc_u32 s0, s0, s6
; GFX10PLUS-NEXT:    s_mul_hi_u32 s3, s1, s3
; GFX10PLUS-NEXT:    s_add_u32 s1, s4, s2
; GFX10PLUS-NEXT:    s_addc_u32 s2, s3, s0
; GFX10PLUS-NEXT:    s_mov_b32 s0, s5
; GFX10PLUS-NEXT:    ; return to shader part epilog
;
; GFX12-LABEL: s_mul_i96:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_mul_i32 s6, s0, s5
; GFX12-NEXT:    s_mul_i32 s7, s1, s4
; GFX12-NEXT:    s_mul_i32 s2, s2, s3
; GFX12-NEXT:    s_add_co_i32 s6, s6, s7
; GFX12-NEXT:    s_mul_hi_u32 s7, s0, s3
; GFX12-NEXT:    s_add_co_i32 s6, s6, s2
; GFX12-NEXT:    s_mul_i32 s2, s0, s4
; GFX12-NEXT:    s_mul_i32 s5, s0, s3
; GFX12-NEXT:    s_mul_hi_u32 s0, s0, s4
; GFX12-NEXT:    s_add_co_u32 s2, s2, s7
; GFX12-NEXT:    s_mul_i32 s4, s1, s3
; GFX12-NEXT:    s_add_co_ci_u32 s0, s0, s6
; GFX12-NEXT:    s_mul_hi_u32 s3, s1, s3
; GFX12-NEXT:    s_add_co_u32 s1, s4, s2
; GFX12-NEXT:    s_add_co_ci_u32 s2, s3, s0
; GFX12-NEXT:    s_mov_b32 s0, s5
; GFX12-NEXT:    ; return to shader part epilog
  %result = mul i96 %num, %den
  ; Reinterpret the 96-bit product as three 32-bit lanes for the return value.
  %cast = bitcast i96 %result to <3 x i32>
  ret <3 x i32> %cast
}

; i96 multiply with plain arguments and the default calling convention; the
; i96 product is returned directly. In the expected code gfx700/gfx801/gfx900
; use only v_mad_u64_u32 (plus register moves); gfx1010 and gfx1100 compute
; the upper-dword terms with three v_mul_lo_u32 and a v_add3_u32 and the rest
; with v_mad_u64_u32; gfx1200 has the same shape but the mad is spelled
; v_mad_co_u64_u32.
define i96 @v_mul_i96(i96 %num, i96 %den) {
; GCN-LABEL: v_mul_i96:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v6, v0
; GCN-NEXT:    v_mov_b32_e32 v7, v1
; GCN-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0
; GCN-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v7, v4, v[0:1]
; GCN-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0
; GCN-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v2, v3, v[8:9]
; GCN-NEXT:    v_mov_b32_e32 v2, v8
; GCN-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v6, v4, v[1:2]
; GCN-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[1:2]
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i96:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v6, v0
; GFX10-NEXT:    v_mov_b32_e32 v7, v1
; GFX10-NEXT:    v_mul_lo_u32 v2, v2, v3
; GFX10-NEXT:    v_mul_lo_u32 v5, v6, v5
; GFX10-NEXT:    v_mul_lo_u32 v8, v7, v4
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v6, v3, 0
; GFX10-NEXT:    v_add3_u32 v2, v5, v8, v2
; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s4, v6, v4, v[1:2]
; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s4, v7, v3, v[1:2]
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_mul_i96:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
; GFX11-NEXT:    v_mul_lo_u32 v2, v2, v3
; GFX11-NEXT:    v_mul_lo_u32 v5, v6, v5
; GFX11-NEXT:    v_mul_lo_u32 v8, v7, v4
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v6, v3, 0
; GFX11-NEXT:    v_add3_u32 v2, v5, v8, v2
; GFX11-NEXT:    v_mad_u64_u32 v[1:2], null, v6, v4, v[1:2]
; GFX11-NEXT:    v_mad_u64_u32 v[1:2], null, v7, v3, v[1:2]
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_mul_i96:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
; GFX12-NEXT:    v_mul_lo_u32 v2, v2, v3
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT:    v_mul_lo_u32 v5, v6, v5
; GFX12-NEXT:    v_mul_lo_u32 v8, v7, v4
; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v6, v3, 0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_add3_u32 v2, v5, v8, v2
; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], null, v6, v4, v[1:2]
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], null, v7, v3, v[1:2]
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %result = mul i96 %num, %den
  ret i96 %result
}

; i128 multiply in an amdgpu_ps shader with `inreg` arguments; the product is
; bitcast to <4 x i32> for the return. The expected code on every target
; assembles the result from 32-bit partial products with
; add / add-with-carry chains and leaves the four result dwords in s0-s3.
; gfx700 and gfx801 compute the high halves of partial products with
; v_mul_hi_u32 + v_readfirstlane_b32; gfx900 and newer use s_mul_hi_u32.
; NOTE(review): the s_cselect_b32 ..., 1, 0 / s_cmp_lg_u32 ..., 0 pair in each
; sequence looks like it saves and later re-materialises a carry bit - confirm
; against the AMDGPU ISA documentation before relying on that reading.
define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
; GFX7-LABEL: s_mul_i128:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mul_hi_u32 v2, s1, v1
; GFX7-NEXT:    s_mul_i32 s10, s0, s6
; GFX7-NEXT:    v_readfirstlane_b32 s9, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT:    v_readfirstlane_b32 s13, v2
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mul_hi_u32 v2, v2, s4
; GFX7-NEXT:    s_mul_i32 s12, s1, s5
; GFX7-NEXT:    v_readfirstlane_b32 s11, v0
; GFX7-NEXT:    s_add_u32 s10, s12, s10
; GFX7-NEXT:    v_mul_hi_u32 v1, s0, v1
; GFX7-NEXT:    v_mov_b32_e32 v0, s1
; GFX7-NEXT:    s_addc_u32 s11, s13, s11
; GFX7-NEXT:    s_mul_i32 s12, s2, s4
; GFX7-NEXT:    v_readfirstlane_b32 s13, v2
; GFX7-NEXT:    s_add_u32 s10, s12, s10
; GFX7-NEXT:    v_mul_hi_u32 v0, v0, s4
; GFX7-NEXT:    s_addc_u32 s11, s13, s11
; GFX7-NEXT:    s_mul_i32 s12, s0, s5
; GFX7-NEXT:    v_readfirstlane_b32 s13, v1
; GFX7-NEXT:    s_add_u32 s9, s12, s9
; GFX7-NEXT:    s_addc_u32 s10, s13, s10
; GFX7-NEXT:    s_mul_i32 s13, s1, s4
; GFX7-NEXT:    s_cselect_b32 s12, 1, 0
; GFX7-NEXT:    v_readfirstlane_b32 s14, v0
; GFX7-NEXT:    s_add_u32 s9, s13, s9
; GFX7-NEXT:    s_mul_i32 s8, s0, s4
; GFX7-NEXT:    s_addc_u32 s10, s14, s10
; GFX7-NEXT:    s_mul_i32 s0, s0, s7
; GFX7-NEXT:    s_addc_u32 s0, s11, s0
; GFX7-NEXT:    s_mul_i32 s1, s1, s6
; GFX7-NEXT:    s_cmp_lg_u32 s12, 0
; GFX7-NEXT:    s_addc_u32 s0, s0, s1
; GFX7-NEXT:    s_mul_i32 s2, s2, s5
; GFX7-NEXT:    s_add_u32 s0, s2, s0
; GFX7-NEXT:    s_mul_i32 s3, s3, s4
; GFX7-NEXT:    s_add_u32 s3, s3, s0
; GFX7-NEXT:    s_mov_b32 s0, s8
; GFX7-NEXT:    s_mov_b32 s1, s9
; GFX7-NEXT:    s_mov_b32 s2, s10
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i128:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_mul_hi_u32 v2, s1, v1
; GFX8-NEXT:    s_mul_i32 s10, s0, s6
; GFX8-NEXT:    v_readfirstlane_b32 s9, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s6
; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT:    v_readfirstlane_b32 s13, v2
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    v_mul_hi_u32 v2, v2, s4
; GFX8-NEXT:    s_mul_i32 s12, s1, s5
; GFX8-NEXT:    v_readfirstlane_b32 s11, v0
; GFX8-NEXT:    s_add_u32 s10, s12, s10
; GFX8-NEXT:    v_mul_hi_u32 v1, s0, v1
; GFX8-NEXT:    v_mov_b32_e32 v0, s1
; GFX8-NEXT:    s_addc_u32 s11, s13, s11
; GFX8-NEXT:    s_mul_i32 s12, s2, s4
; GFX8-NEXT:    v_readfirstlane_b32 s13, v2
; GFX8-NEXT:    s_add_u32 s10, s12, s10
; GFX8-NEXT:    v_mul_hi_u32 v0, v0, s4
; GFX8-NEXT:    s_addc_u32 s11, s13, s11
; GFX8-NEXT:    s_mul_i32 s12, s0, s5
; GFX8-NEXT:    v_readfirstlane_b32 s13, v1
; GFX8-NEXT:    s_add_u32 s9, s12, s9
; GFX8-NEXT:    s_addc_u32 s10, s13, s10
; GFX8-NEXT:    s_mul_i32 s13, s1, s4
; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
; GFX8-NEXT:    v_readfirstlane_b32 s14, v0
; GFX8-NEXT:    s_add_u32 s9, s13, s9
; GFX8-NEXT:    s_mul_i32 s8, s0, s4
; GFX8-NEXT:    s_addc_u32 s10, s14, s10
; GFX8-NEXT:    s_mul_i32 s0, s0, s7
; GFX8-NEXT:    s_addc_u32 s0, s11, s0
; GFX8-NEXT:    s_mul_i32 s1, s1, s6
; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
; GFX8-NEXT:    s_addc_u32 s0, s0, s1
; GFX8-NEXT:    s_mul_i32 s2, s2, s5
; GFX8-NEXT:    s_add_u32 s0, s2, s0
; GFX8-NEXT:    s_mul_i32 s3, s3, s4
; GFX8-NEXT:    s_add_u32 s3, s3, s0
; GFX8-NEXT:    s_mov_b32 s0, s8
; GFX8-NEXT:    s_mov_b32 s1, s9
; GFX8-NEXT:    s_mov_b32 s2, s10
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mul_i32 s10, s0, s6
; GFX9-NEXT:    s_mul_i32 s12, s1, s5
; GFX9-NEXT:    s_mul_hi_u32 s11, s0, s6
; GFX9-NEXT:    s_mul_hi_u32 s13, s1, s5
; GFX9-NEXT:    s_add_u32 s10, s12, s10
; GFX9-NEXT:    s_addc_u32 s11, s13, s11
; GFX9-NEXT:    s_mul_i32 s12, s2, s4
; GFX9-NEXT:    s_mul_hi_u32 s13, s2, s4
; GFX9-NEXT:    s_add_u32 s10, s12, s10
; GFX9-NEXT:    s_mul_hi_u32 s9, s0, s4
; GFX9-NEXT:    s_addc_u32 s11, s13, s11
; GFX9-NEXT:    s_mul_i32 s12, s0, s5
; GFX9-NEXT:    s_mul_hi_u32 s13, s0, s5
; GFX9-NEXT:    s_add_u32 s9, s12, s9
; GFX9-NEXT:    s_addc_u32 s10, s13, s10
; GFX9-NEXT:    s_mul_i32 s13, s1, s4
; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s14, s1, s4
; GFX9-NEXT:    s_add_u32 s9, s13, s9
; GFX9-NEXT:    s_mul_i32 s8, s0, s4
; GFX9-NEXT:    s_addc_u32 s10, s14, s10
; GFX9-NEXT:    s_mul_i32 s0, s0, s7
; GFX9-NEXT:    s_addc_u32 s0, s11, s0
; GFX9-NEXT:    s_mul_i32 s1, s1, s6
; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
; GFX9-NEXT:    s_addc_u32 s0, s0, s1
; GFX9-NEXT:    s_mul_i32 s2, s2, s5
; GFX9-NEXT:    s_add_u32 s0, s2, s0
; GFX9-NEXT:    s_mul_i32 s3, s3, s4
; GFX9-NEXT:    s_add_u32 s3, s3, s0
; GFX9-NEXT:    s_mov_b32 s0, s8
; GFX9-NEXT:    s_mov_b32 s1, s9
; GFX9-NEXT:    s_mov_b32 s2, s10
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i128:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_mul_i32 s9, s0, s6
; GFX10PLUS-NEXT:    s_mul_i32 s11, s1, s5
; GFX10PLUS-NEXT:    s_mul_hi_u32 s10, s0, s6
; GFX10PLUS-NEXT:    s_mul_hi_u32 s12, s1, s5
; GFX10PLUS-NEXT:    s_add_u32 s9, s11, s9
; GFX10PLUS-NEXT:    s_mul_i32 s11, s2, s4
; GFX10PLUS-NEXT:    s_addc_u32 s10, s12, s10
; GFX10PLUS-NEXT:    s_mul_hi_u32 s12, s2, s4
; GFX10PLUS-NEXT:    s_mul_hi_u32 s8, s0, s4
; GFX10PLUS-NEXT:    s_add_u32 s9, s11, s9
; GFX10PLUS-NEXT:    s_mul_i32 s11, s0, s5
; GFX10PLUS-NEXT:    s_addc_u32 s10, s12, s10
; GFX10PLUS-NEXT:    s_mul_hi_u32 s12, s0, s5
; GFX10PLUS-NEXT:    s_add_u32 s8, s11, s8
; GFX10PLUS-NEXT:    s_addc_u32 s9, s12, s9
; GFX10PLUS-NEXT:    s_mul_i32 s12, s1, s4
; GFX10PLUS-NEXT:    s_mul_hi_u32 s13, s1, s4
; GFX10PLUS-NEXT:    s_cselect_b32 s11, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s8, s12, s8
; GFX10PLUS-NEXT:    s_mul_i32 s12, s0, s7
; GFX10PLUS-NEXT:    s_addc_u32 s7, s13, s9
; GFX10PLUS-NEXT:    s_addc_u32 s9, s10, s12
; GFX10PLUS-NEXT:    s_mul_i32 s1, s1, s6
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s11, 0
; GFX10PLUS-NEXT:    s_mul_i32 s2, s2, s5
; GFX10PLUS-NEXT:    s_addc_u32 s1, s9, s1
; GFX10PLUS-NEXT:    s_mul_i32 s3, s3, s4
; GFX10PLUS-NEXT:    s_add_i32 s1, s1, s2
; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s4
; GFX10PLUS-NEXT:    s_add_i32 s3, s1, s3
; GFX10PLUS-NEXT:    s_mov_b32 s1, s8
; GFX10PLUS-NEXT:    s_mov_b32 s2, s7
; GFX10PLUS-NEXT:    ; return to shader part epilog
;
; GFX12-LABEL: s_mul_i128:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_mul_i32 s9, s0, s6
; GFX12-NEXT:    s_mul_i32 s11, s1, s5
; GFX12-NEXT:    s_mul_hi_u32 s10, s0, s6
; GFX12-NEXT:    s_mul_hi_u32 s12, s1, s5
; GFX12-NEXT:    s_add_co_u32 s9, s11, s9
; GFX12-NEXT:    s_mul_i32 s11, s2, s4
; GFX12-NEXT:    s_add_co_ci_u32 s10, s12, s10
; GFX12-NEXT:    s_mul_hi_u32 s12, s2, s4
; GFX12-NEXT:    s_mul_hi_u32 s8, s0, s4
; GFX12-NEXT:    s_add_co_u32 s9, s11, s9
; GFX12-NEXT:    s_mul_i32 s11, s0, s5
; GFX12-NEXT:    s_add_co_ci_u32 s10, s12, s10
; GFX12-NEXT:    s_mul_hi_u32 s12, s0, s5
; GFX12-NEXT:    s_add_co_u32 s8, s11, s8
; GFX12-NEXT:    s_add_co_ci_u32 s9, s12, s9
; GFX12-NEXT:    s_mul_i32 s12, s1, s4
; GFX12-NEXT:    s_mul_hi_u32 s13, s1, s4
; GFX12-NEXT:    s_cselect_b32 s11, 1, 0
; GFX12-NEXT:    s_add_co_u32 s8, s12, s8
; GFX12-NEXT:    s_mul_i32 s12, s0, s7
; GFX12-NEXT:    s_add_co_ci_u32 s7, s13, s9
; GFX12-NEXT:    s_add_co_ci_u32 s9, s10, s12
; GFX12-NEXT:    s_mul_i32 s1, s1, s6
; GFX12-NEXT:    s_cmp_lg_u32 s11, 0
; GFX12-NEXT:    s_mul_i32 s2, s2, s5
; GFX12-NEXT:    s_add_co_ci_u32 s1, s9, s1
; GFX12-NEXT:    s_mul_i32 s3, s3, s4
; GFX12-NEXT:    s_add_co_i32 s1, s1, s2
; GFX12-NEXT:    s_mul_i32 s0, s0, s4
; GFX12-NEXT:    s_add_co_i32 s3, s1, s3
; GFX12-NEXT:    s_mov_b32 s1, s8
; GFX12-NEXT:    s_mov_b32 s2, s7
; GFX12-NEXT:    ; return to shader part epilog
  %result = mul i128 %num, %den
  ; Reinterpret the 128-bit product as four 32-bit lanes for the return value.
  %cast = bitcast i128 %result to <4 x i32>
  ret <4 x i32> %cast
}

; 128-bit integer multiply whose operands are plain (non-inreg) arguments of a
; function with no explicit calling convention; the body is a single
; 'mul i128' whose result is returned directly.
;
; Per the expected output below, the inputs are read from v0-v7 and the
; function returns through s_setpc_b64 s[30:31]. The checks for the gfx7, gfx8
; and gfx9 runs expect the wide product to be assembled mostly from
; v_mad_u64_u32 chains, with carries passed through vcc and s[4:5]. The checks
; for the gfx10, gfx11 and gfx12 runs instead expect the highest-limb partial
; products to come from v_mul_lo_u32 and to be summed by a final v_add3_u32.
;
; The assertion lines in this function are autogenerated (see the note on the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX7-LABEL: v_mul_i128:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v8, v0
; GFX7-NEXT:    v_mov_b32_e32 v9, v1
; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
; GFX7-NEXT:    v_mov_b32_e32 v10, v2
; GFX7-NEXT:    v_mul_lo_u32 v7, v8, v7
; GFX7-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
; GFX7-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
; GFX7-NEXT:    v_mul_lo_u32 v6, v9, v6
; GFX7-NEXT:    v_mov_b32_e32 v2, v11
; GFX7-NEXT:    v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
; GFX7-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
; GFX7-NEXT:    v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5]
; GFX7-NEXT:    v_addc_u32_e32 v6, vcc, v7, v6, vcc
; GFX7-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
; GFX7-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i128:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v8, v0
; GFX8-NEXT:    v_mov_b32_e32 v9, v1
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
; GFX8-NEXT:    v_mov_b32_e32 v10, v2
; GFX8-NEXT:    v_mul_lo_u32 v7, v8, v7
; GFX8-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
; GFX8-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
; GFX8-NEXT:    v_mul_lo_u32 v6, v9, v6
; GFX8-NEXT:    v_mov_b32_e32 v2, v11
; GFX8-NEXT:    v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
; GFX8-NEXT:    v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5]
; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, v7, v6, vcc
; GFX8-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v8, v0
; GFX9-NEXT:    v_mov_b32_e32 v9, v1
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
; GFX9-NEXT:    v_mov_b32_e32 v10, v2
; GFX9-NEXT:    v_mul_lo_u32 v7, v8, v7
; GFX9-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
; GFX9-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
; GFX9-NEXT:    v_mul_lo_u32 v6, v9, v6
; GFX9-NEXT:    v_mov_b32_e32 v2, v11
; GFX9-NEXT:    v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[4:5], v12, v7, s[4:5]
; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v6, vcc
; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v8, v0
; GFX10-NEXT:    v_mov_b32_e32 v9, v1
; GFX10-NEXT:    v_mov_b32_e32 v10, v2
; GFX10-NEXT:    v_mul_lo_u32 v3, v3, v4
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v8, v6, 0
; GFX10-NEXT:    v_mul_lo_u32 v7, v8, v7
; GFX10-NEXT:    v_mul_lo_u32 v6, v9, v6
; GFX10-NEXT:    v_mad_u64_u32 v[11:12], s4, v9, v5, v[0:1]
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v8, v4, 0
; GFX10-NEXT:    v_mad_u64_u32 v[11:12], s4, v10, v4, v[11:12]
; GFX10-NEXT:    v_mov_b32_e32 v2, v11
; GFX10-NEXT:    v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
; GFX10-NEXT:    v_mul_lo_u32 v5, v10, v5
; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s4, v9, v4, v[1:2]
; GFX10-NEXT:    v_add_co_ci_u32_e64 v7, s4, v12, v7, s4
; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v7, v6, vcc_lo
; GFX10-NEXT:    v_add3_u32 v3, v4, v5, v3
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_mul_i128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1
; GFX11-NEXT:    v_mov_b32_e32 v10, v2
; GFX11-NEXT:    v_mul_lo_u32 v3, v3, v4
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v8, v6, 0
; GFX11-NEXT:    v_mul_lo_u32 v7, v8, v7
; GFX11-NEXT:    v_mul_lo_u32 v6, v9, v6
; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v9, v5, v[0:1]
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v8, v4, 0
; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v10, v4, v[11:12]
; GFX11-NEXT:    v_mov_b32_e32 v2, v11
; GFX11-NEXT:    v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
; GFX11-NEXT:    v_mul_lo_u32 v5, v10, v5
; GFX11-NEXT:    v_mad_u64_u32 v[1:2], s0, v9, v4, v[1:2]
; GFX11-NEXT:    v_add_co_ci_u32_e64 v7, s0, v12, v7, s0
; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v7, v6, vcc_lo
; GFX11-NEXT:    v_add3_u32 v3, v4, v5, v3
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_mul_i128:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1
; GFX12-NEXT:    v_mov_b32_e32 v10, v2
; GFX12-NEXT:    v_mul_lo_u32 v3, v3, v4
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v8, v6, 0
; GFX12-NEXT:    v_mul_lo_u32 v7, v8, v7
; GFX12-NEXT:    v_mul_lo_u32 v6, v9, v6
; GFX12-NEXT:    v_mad_co_u64_u32 v[11:12], null, v9, v5, v[0:1]
; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v8, v4, 0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_mad_co_u64_u32 v[11:12], null, v10, v4, v[11:12]
; GFX12-NEXT:    v_mov_b32_e32 v2, v11
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
; GFX12-NEXT:    v_mul_lo_u32 v5, v10, v5
; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2]
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_add_co_ci_u32_e64 v7, s0, v12, v7, s0
; GFX12-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v7, v6, vcc_lo
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_add3_u32 v3, v4, v5, v3
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %result = mul i128 %num, %den
  ret i128 %result
}

define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX7-LABEL: s_mul_i256:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mov_b32 s16, s0
; GFX7-NEXT:    v_mov_b32_e32 v0, s8
; GFX7-NEXT:    v_mul_hi_u32 v0, s16, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s9
; GFX7-NEXT:    v_mul_hi_u32 v2, s1, v1
; GFX7-NEXT:    v_mul_hi_u32 v1, s16, v1
; GFX7-NEXT:    v_readfirstlane_b32 s17, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, s10
; GFX7-NEXT:    v_mul_hi_u32 v0, s16, v0
; GFX7-NEXT:    v_readfirstlane_b32 s21, v2
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mul_hi_u32 v3, v2, s8
; GFX7-NEXT:    s_mul_i32 s18, s16, s10
; GFX7-NEXT:    s_mul_i32 s20, s1, s9
; GFX7-NEXT:    v_readfirstlane_b32 s19, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, s1
; GFX7-NEXT:    s_add_u32 s18, s20, s18
; GFX7-NEXT:    s_addc_u32 s19, s21, s19
; GFX7-NEXT:    s_mul_i32 s21, s2, s8
; GFX7-NEXT:    v_readfirstlane_b32 s23, v1
; GFX7-NEXT:    v_mul_hi_u32 v1, v0, s8
; GFX7-NEXT:    s_cselect_b32 s20, 1, 0
; GFX7-NEXT:    v_readfirstlane_b32 s22, v3
; GFX7-NEXT:    s_add_u32 s18, s21, s18
; GFX7-NEXT:    s_addc_u32 s19, s22, s19
; GFX7-NEXT:    s_mul_i32 s22, s16, s9
; GFX7-NEXT:    s_cselect_b32 s21, 1, 0
; GFX7-NEXT:    s_add_u32 s17, s22, s17
; GFX7-NEXT:    s_addc_u32 s22, s23, s18
; GFX7-NEXT:    v_readfirstlane_b32 s23, v1
; GFX7-NEXT:    v_mov_b32_e32 v1, s12
; GFX7-NEXT:    v_mul_hi_u32 v3, s16, v1
; GFX7-NEXT:    s_mul_i32 s18, s1, s8
; GFX7-NEXT:    s_cselect_b32 s25, 1, 0
; GFX7-NEXT:    s_add_u32 s18, s18, s17
; GFX7-NEXT:    s_addc_u32 s17, s23, s22
; GFX7-NEXT:    v_mov_b32_e32 v4, s11
; GFX7-NEXT:    v_readfirstlane_b32 s23, v3
; GFX7-NEXT:    v_mul_hi_u32 v3, v2, s10
; GFX7-NEXT:    v_mul_hi_u32 v5, s1, v4
; GFX7-NEXT:    s_mul_i32 s22, s16, s12
; GFX7-NEXT:    s_mul_i32 s24, s1, s11
; GFX7-NEXT:    v_readfirstlane_b32 s28, v3
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    v_readfirstlane_b32 s27, v5
; GFX7-NEXT:    v_mul_hi_u32 v5, v3, s9
; GFX7-NEXT:    s_cselect_b32 s26, 1, 0
; GFX7-NEXT:    s_add_u32 s24, s24, s22
; GFX7-NEXT:    s_addc_u32 s23, s27, s23
; GFX7-NEXT:    v_readfirstlane_b32 s29, v5
; GFX7-NEXT:    v_mov_b32_e32 v5, s4
; GFX7-NEXT:    v_mul_hi_u32 v6, v5, s8
; GFX7-NEXT:    s_mul_i32 s27, s2, s10
; GFX7-NEXT:    s_cselect_b32 s22, 1, 0
; GFX7-NEXT:    s_add_u32 s24, s27, s24
; GFX7-NEXT:    v_mul_hi_u32 v0, v0, s10
; GFX7-NEXT:    s_addc_u32 s27, s28, s23
; GFX7-NEXT:    s_mul_i32 s28, s3, s9
; GFX7-NEXT:    s_cselect_b32 s23, 1, 0
; GFX7-NEXT:    s_add_u32 s28, s28, s24
; GFX7-NEXT:    v_readfirstlane_b32 s30, v6
; GFX7-NEXT:    v_mul_hi_u32 v6, s16, v4
; GFX7-NEXT:    s_addc_u32 s27, s29, s27
; GFX7-NEXT:    s_mul_i32 s29, s4, s8
; GFX7-NEXT:    s_cselect_b32 s24, 1, 0
; GFX7-NEXT:    s_add_u32 s28, s29, s28
; GFX7-NEXT:    v_readfirstlane_b32 s33, v0
; GFX7-NEXT:    v_mul_hi_u32 v0, v2, s9
; GFX7-NEXT:    s_addc_u32 s27, s30, s27
; GFX7-NEXT:    s_mul_i32 s30, s16, s11
; GFX7-NEXT:    s_cselect_b32 s29, 1, 0
; GFX7-NEXT:    v_readfirstlane_b32 s31, v6
; GFX7-NEXT:    s_add_u32 s19, s30, s19
; GFX7-NEXT:    s_addc_u32 s28, s31, s28
; GFX7-NEXT:    s_mul_i32 s31, s1, s10
; GFX7-NEXT:    s_cselect_b32 s30, 1, 0
; GFX7-NEXT:    s_add_u32 s19, s31, s19
; GFX7-NEXT:    v_readfirstlane_b32 s34, v0
; GFX7-NEXT:    v_mul_hi_u32 v0, v3, s8
; GFX7-NEXT:    s_addc_u32 s28, s33, s28
; GFX7-NEXT:    s_mul_i32 s33, s2, s9
; GFX7-NEXT:    s_cselect_b32 s31, 1, 0
; GFX7-NEXT:    s_add_u32 s19, s33, s19
; GFX7-NEXT:    s_addc_u32 s28, s34, s28
; GFX7-NEXT:    s_mul_i32 s34, s3, s8
; GFX7-NEXT:    s_cselect_b32 s33, 1, 0
; GFX7-NEXT:    v_readfirstlane_b32 s35, v0
; GFX7-NEXT:    s_add_u32 s19, s34, s19
; GFX7-NEXT:    v_mov_b32_e32 v0, s14
; GFX7-NEXT:    s_addc_u32 s28, s35, s28
; GFX7-NEXT:    v_mul_hi_u32 v0, s16, v0
; GFX7-NEXT:    s_cselect_b32 s34, 1, 0
; GFX7-NEXT:    s_cmp_lg_u32 s26, 0
; GFX7-NEXT:    s_addc_u32 s19, s25, s19
; GFX7-NEXT:    v_mov_b32_e32 v2, s13
; GFX7-NEXT:    s_cselect_b32 s25, 1, 0
; GFX7-NEXT:    s_cmp_lg_u32 s21, 0
; GFX7-NEXT:    v_mul_hi_u32 v6, s1, v2
; GFX7-NEXT:    s_addc_u32 s20, s20, 0
; GFX7-NEXT:    v_readfirstlane_b32 s26, v0
; GFX7-NEXT:    v_mul_hi_u32 v0, s2, v1
; GFX7-NEXT:    s_cmp_lg_u32 s25, 0
; GFX7-NEXT:    s_addc_u32 s20, s20, s28
; GFX7-NEXT:    s_mul_i32 s25, s16, s14
; GFX7-NEXT:    s_mul_i32 s28, s1, s13
; GFX7-NEXT:    s_cselect_b32 s21, 1, 0
; GFX7-NEXT:    v_readfirstlane_b32 s35, v6
; GFX7-NEXT:    s_add_u32 s25, s28, s25
; GFX7-NEXT:    s_addc_u32 s26, s35, s26
; GFX7-NEXT:    v_readfirstlane_b32 s35, v0
; GFX7-NEXT:    v_mul_hi_u32 v0, v3, s11
; GFX7-NEXT:    s_mul_i32 s28, s2, s12
; GFX7-NEXT:    s_add_u32 s25, s28, s25
; GFX7-NEXT:    s_addc_u32 s26, s35, s26
; GFX7-NEXT:    v_readfirstlane_b32 s35, v0
; GFX7-NEXT:    v_mul_hi_u32 v0, v5, s10
; GFX7-NEXT:    s_mul_i32 s28, s3, s11
; GFX7-NEXT:    s_add_u32 s25, s28, s25
; GFX7-NEXT:    s_addc_u32 s26, s35, s26
; GFX7-NEXT:    v_readfirstlane_b32 s35, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, s5
; GFX7-NEXT:    v_mul_hi_u32 v6, v0, s9
; GFX7-NEXT:    s_mul_i32 s28, s4, s10
; GFX7-NEXT:    s_add_u32 s25, s28, s25
; GFX7-NEXT:    v_mul_hi_u32 v1, s1, v1
; GFX7-NEXT:    s_addc_u32 s26, s35, s26
; GFX7-NEXT:    v_readfirstlane_b32 s35, v6
; GFX7-NEXT:    v_mov_b32_e32 v6, s6
; GFX7-NEXT:    v_mul_hi_u32 v6, v6, s8
; GFX7-NEXT:    s_mul_i32 s28, s5, s9
; GFX7-NEXT:    s_add_u32 s25, s28, s25
; GFX7-NEXT:    v_mul_hi_u32 v2, s16, v2
; GFX7-NEXT:    v_readfirstlane_b32 s36, v1
; GFX7-NEXT:    v_mul_hi_u32 v1, s2, v4
; GFX7-NEXT:    s_addc_u32 s26, s35, s26
; GFX7-NEXT:    s_mul_i32 s28, s6, s8
; GFX7-NEXT:    v_readfirstlane_b32 s35, v6
; GFX7-NEXT:    s_add_u32 s25, s28, s25
; GFX7-NEXT:    s_addc_u32 s26, s35, s26
; GFX7-NEXT:    s_mul_i32 s28, s16, s13
; GFX7-NEXT:    v_readfirstlane_b32 s35, v2
; GFX7-NEXT:    s_add_u32 s27, s28, s27
; GFX7-NEXT:    v_readfirstlane_b32 s37, v1
; GFX7-NEXT:    v_mul_hi_u32 v1, v3, s10
; GFX7-NEXT:    s_addc_u32 s25, s35, s25
; GFX7-NEXT:    s_mul_i32 s35, s1, s12
; GFX7-NEXT:    s_cselect_b32 s28, 1, 0
; GFX7-NEXT:    s_add_u32 s27, s35, s27
; GFX7-NEXT:    s_addc_u32 s25, s36, s25
; GFX7-NEXT:    s_mul_i32 s36, s2, s11
; GFX7-NEXT:    s_cselect_b32 s35, 1, 0
; GFX7-NEXT:    s_add_u32 s27, s36, s27
; GFX7-NEXT:    v_readfirstlane_b32 s38, v1
; GFX7-NEXT:    v_mul_hi_u32 v1, v5, s9
; GFX7-NEXT:    s_addc_u32 s25, s37, s25
; GFX7-NEXT:    s_mul_i32 s37, s3, s10
; GFX7-NEXT:    s_cselect_b32 s36, 1, 0
; GFX7-NEXT:    s_add_u32 s27, s37, s27
; GFX7-NEXT:    v_mul_hi_u32 v0, v0, s8
; GFX7-NEXT:    s_addc_u32 s25, s38, s25
; GFX7-NEXT:    s_mul_i32 s38, s4, s9
; GFX7-NEXT:    s_cselect_b32 s37, 1, 0
; GFX7-NEXT:    v_readfirstlane_b32 s39, v1
; GFX7-NEXT:    s_add_u32 s27, s38, s27
; GFX7-NEXT:    s_addc_u32 s25, s39, s25
; GFX7-NEXT:    s_mul_i32 s39, s5, s8
; GFX7-NEXT:    s_cselect_b32 s38, 1, 0
; GFX7-NEXT:    v_readfirstlane_b32 s40, v0
; GFX7-NEXT:    s_add_u32 s27, s39, s27
; GFX7-NEXT:    s_addc_u32 s25, s40, s25
; GFX7-NEXT:    s_cselect_b32 s39, 1, 0
; GFX7-NEXT:    s_cmp_lg_u32 s31, 0
; GFX7-NEXT:    s_addc_u32 s30, s30, 0
; GFX7-NEXT:    s_cmp_lg_u32 s33, 0
; GFX7-NEXT:    s_addc_u32 s30, s30, 0
; GFX7-NEXT:    s_cmp_lg_u32 s34, 0
; GFX7-NEXT:    s_addc_u32 s30, s30, 0
; GFX7-NEXT:    s_cmp_lg_u32 s21, 0
; GFX7-NEXT:    s_addc_u32 s21, s30, s27
; GFX7-NEXT:    s_cselect_b32 s27, 1, 0
; GFX7-NEXT:    s_cmp_lg_u32 s23, 0
; GFX7-NEXT:    s_addc_u32 s22, s22, 0
; GFX7-NEXT:    s_cmp_lg_u32 s24, 0
; GFX7-NEXT:    s_addc_u32 s22, s22, 0
; GFX7-NEXT:    s_cmp_lg_u32 s29, 0
; GFX7-NEXT:    s_addc_u32 s22, s22, 0
; GFX7-NEXT:    s_cmp_lg_u32 s27, 0
; GFX7-NEXT:    s_addc_u32 s22, s22, s25
; GFX7-NEXT:    s_mul_i32 s16, s16, s15
; GFX7-NEXT:    s_addc_u32 s15, s26, s16
; GFX7-NEXT:    s_mul_i32 s1, s1, s14
; GFX7-NEXT:    s_cmp_lg_u32 s39, 0
; GFX7-NEXT:    s_addc_u32 s1, s15, s1
; GFX7-NEXT:    s_mul_i32 s2, s2, s13
; GFX7-NEXT:    s_cmp_lg_u32 s38, 0
; GFX7-NEXT:    s_addc_u32 s1, s1, s2
; GFX7-NEXT:    s_mul_i32 s3, s3, s12
; GFX7-NEXT:    s_cmp_lg_u32 s37, 0
; GFX7-NEXT:    s_addc_u32 s1, s1, s3
; GFX7-NEXT:    s_mul_i32 s4, s4, s11
; GFX7-NEXT:    s_cmp_lg_u32 s36, 0
; GFX7-NEXT:    s_addc_u32 s1, s1, s4
; GFX7-NEXT:    s_mul_i32 s5, s5, s10
; GFX7-NEXT:    s_cmp_lg_u32 s35, 0
; GFX7-NEXT:    s_addc_u32 s1, s1, s5
; GFX7-NEXT:    s_mul_i32 s6, s6, s9
; GFX7-NEXT:    s_cmp_lg_u32 s28, 0
; GFX7-NEXT:    s_addc_u32 s1, s1, s6
; GFX7-NEXT:    s_mul_i32 s7, s7, s8
; GFX7-NEXT:    s_mul_i32 s0, s0, s8
; GFX7-NEXT:    s_add_u32 s7, s7, s1
; GFX7-NEXT:    s_mov_b32 s1, s18
; GFX7-NEXT:    s_mov_b32 s2, s17
; GFX7-NEXT:    s_mov_b32 s3, s19
; GFX7-NEXT:    s_mov_b32 s4, s20
; GFX7-NEXT:    s_mov_b32 s5, s21
; GFX7-NEXT:    s_mov_b32 s6, s22
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i256:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_mov_b32 s16, s0
; GFX8-NEXT:    v_mov_b32_e32 v0, s8
; GFX8-NEXT:    v_mul_hi_u32 v0, s16, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, s9
; GFX8-NEXT:    v_mul_hi_u32 v2, s1, v1
; GFX8-NEXT:    v_mul_hi_u32 v1, s16, v1
; GFX8-NEXT:    v_readfirstlane_b32 s17, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s10
; GFX8-NEXT:    v_mul_hi_u32 v0, s16, v0
; GFX8-NEXT:    v_readfirstlane_b32 s21, v2
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    v_mul_hi_u32 v3, v2, s8
; GFX8-NEXT:    s_mul_i32 s18, s16, s10
; GFX8-NEXT:    s_mul_i32 s20, s1, s9
; GFX8-NEXT:    v_readfirstlane_b32 s19, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s1
; GFX8-NEXT:    s_add_u32 s18, s20, s18
; GFX8-NEXT:    s_addc_u32 s19, s21, s19
; GFX8-NEXT:    s_mul_i32 s21, s2, s8
; GFX8-NEXT:    v_readfirstlane_b32 s23, v1
; GFX8-NEXT:    v_mul_hi_u32 v1, v0, s8
; GFX8-NEXT:    s_cselect_b32 s20, 1, 0
; GFX8-NEXT:    v_readfirstlane_b32 s22, v3
; GFX8-NEXT:    s_add_u32 s18, s21, s18
; GFX8-NEXT:    s_addc_u32 s19, s22, s19
; GFX8-NEXT:    s_mul_i32 s22, s16, s9
; GFX8-NEXT:    s_cselect_b32 s21, 1, 0
; GFX8-NEXT:    s_add_u32 s17, s22, s17
; GFX8-NEXT:    s_addc_u32 s22, s23, s18
; GFX8-NEXT:    v_readfirstlane_b32 s23, v1
; GFX8-NEXT:    v_mov_b32_e32 v1, s12
; GFX8-NEXT:    v_mul_hi_u32 v3, s16, v1
; GFX8-NEXT:    s_mul_i32 s18, s1, s8
; GFX8-NEXT:    s_cselect_b32 s25, 1, 0
; GFX8-NEXT:    s_add_u32 s18, s18, s17
; GFX8-NEXT:    s_addc_u32 s17, s23, s22
; GFX8-NEXT:    v_mov_b32_e32 v4, s11
; GFX8-NEXT:    v_readfirstlane_b32 s23, v3
; GFX8-NEXT:    v_mul_hi_u32 v3, v2, s10
; GFX8-NEXT:    v_mul_hi_u32 v5, s1, v4
; GFX8-NEXT:    s_mul_i32 s22, s16, s12
; GFX8-NEXT:    s_mul_i32 s24, s1, s11
; GFX8-NEXT:    v_readfirstlane_b32 s28, v3
; GFX8-NEXT:    v_mov_b32_e32 v3, s3
; GFX8-NEXT:    v_readfirstlane_b32 s27, v5
; GFX8-NEXT:    v_mul_hi_u32 v5, v3, s9
; GFX8-NEXT:    s_cselect_b32 s26, 1, 0
; GFX8-NEXT:    s_add_u32 s24, s24, s22
; GFX8-NEXT:    s_addc_u32 s23, s27, s23
; GFX8-NEXT:    v_readfirstlane_b32 s29, v5
; GFX8-NEXT:    v_mov_b32_e32 v5, s4
; GFX8-NEXT:    v_mul_hi_u32 v6, v5, s8
; GFX8-NEXT:    s_mul_i32 s27, s2, s10
; GFX8-NEXT:    s_cselect_b32 s22, 1, 0
; GFX8-NEXT:    s_add_u32 s24, s27, s24
; GFX8-NEXT:    v_mul_hi_u32 v0, v0, s10
; GFX8-NEXT:    s_addc_u32 s27, s28, s23
; GFX8-NEXT:    s_mul_i32 s28, s3, s9
; GFX8-NEXT:    s_cselect_b32 s23, 1, 0
; GFX8-NEXT:    s_add_u32 s28, s28, s24
; GFX8-NEXT:    v_readfirstlane_b32 s30, v6
; GFX8-NEXT:    v_mul_hi_u32 v6, s16, v4
; GFX8-NEXT:    s_addc_u32 s27, s29, s27
; GFX8-NEXT:    s_mul_i32 s29, s4, s8
; GFX8-NEXT:    s_cselect_b32 s24, 1, 0
; GFX8-NEXT:    s_add_u32 s28, s29, s28
; GFX8-NEXT:    v_readfirstlane_b32 s33, v0
; GFX8-NEXT:    v_mul_hi_u32 v0, v2, s9
; GFX8-NEXT:    s_addc_u32 s27, s30, s27
; GFX8-NEXT:    s_mul_i32 s30, s16, s11
; GFX8-NEXT:    s_cselect_b32 s29, 1, 0
; GFX8-NEXT:    v_readfirstlane_b32 s31, v6
; GFX8-NEXT:    s_add_u32 s19, s30, s19
; GFX8-NEXT:    s_addc_u32 s28, s31, s28
; GFX8-NEXT:    s_mul_i32 s31, s1, s10
; GFX8-NEXT:    s_cselect_b32 s30, 1, 0
; GFX8-NEXT:    s_add_u32 s19, s31, s19
; GFX8-NEXT:    v_readfirstlane_b32 s34, v0
; GFX8-NEXT:    v_mul_hi_u32 v0, v3, s8
; GFX8-NEXT:    s_addc_u32 s28, s33, s28
; GFX8-NEXT:    s_mul_i32 s33, s2, s9
; GFX8-NEXT:    s_cselect_b32 s31, 1, 0
; GFX8-NEXT:    s_add_u32 s19, s33, s19
; GFX8-NEXT:    s_addc_u32 s28, s34, s28
; GFX8-NEXT:    s_mul_i32 s34, s3, s8
; GFX8-NEXT:    s_cselect_b32 s33, 1, 0
; GFX8-NEXT:    v_readfirstlane_b32 s35, v0
; GFX8-NEXT:    s_add_u32 s19, s34, s19
; GFX8-NEXT:    v_mov_b32_e32 v0, s14
; GFX8-NEXT:    s_addc_u32 s28, s35, s28
; GFX8-NEXT:    v_mul_hi_u32 v0, s16, v0
; GFX8-NEXT:    s_cselect_b32 s34, 1, 0
; GFX8-NEXT:    s_cmp_lg_u32 s26, 0
; GFX8-NEXT:    s_addc_u32 s19, s25, s19
; GFX8-NEXT:    v_mov_b32_e32 v2, s13
; GFX8-NEXT:    s_cselect_b32 s25, 1, 0
; GFX8-NEXT:    s_cmp_lg_u32 s21, 0
; GFX8-NEXT:    v_mul_hi_u32 v6, s1, v2
; GFX8-NEXT:    s_addc_u32 s20, s20, 0
; GFX8-NEXT:    v_readfirstlane_b32 s26, v0
; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v1
; GFX8-NEXT:    s_cmp_lg_u32 s25, 0
; GFX8-NEXT:    s_addc_u32 s20, s20, s28
; GFX8-NEXT:    s_mul_i32 s25, s16, s14
; GFX8-NEXT:    s_mul_i32 s28, s1, s13
; GFX8-NEXT:    s_cselect_b32 s21, 1, 0
; GFX8-NEXT:    v_readfirstlane_b32 s35, v6
; GFX8-NEXT:    s_add_u32 s25, s28, s25
; GFX8-NEXT:    s_addc_u32 s26, s35, s26
; GFX8-NEXT:    v_readfirstlane_b32 s35, v0
; GFX8-NEXT:    v_mul_hi_u32 v0, v3, s11
; GFX8-NEXT:    s_mul_i32 s28, s2, s12
; GFX8-NEXT:    s_add_u32 s25, s28, s25
; GFX8-NEXT:    s_addc_u32 s26, s35, s26
; GFX8-NEXT:    v_readfirstlane_b32 s35, v0
; GFX8-NEXT:    v_mul_hi_u32 v0, v5, s10
; GFX8-NEXT:    s_mul_i32 s28, s3, s11
; GFX8-NEXT:    s_add_u32 s25, s28, s25
; GFX8-NEXT:    s_addc_u32 s26, s35, s26
; GFX8-NEXT:    v_readfirstlane_b32 s35, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NEXT:    v_mul_hi_u32 v6, v0, s9
; GFX8-NEXT:    s_mul_i32 s28, s4, s10
; GFX8-NEXT:    s_add_u32 s25, s28, s25
; GFX8-NEXT:    v_mul_hi_u32 v1, s1, v1
; GFX8-NEXT:    s_addc_u32 s26, s35, s26
; GFX8-NEXT:    v_readfirstlane_b32 s35, v6
; GFX8-NEXT:    v_mov_b32_e32 v6, s6
; GFX8-NEXT:    v_mul_hi_u32 v6, v6, s8
; GFX8-NEXT:    s_mul_i32 s28, s5, s9
; GFX8-NEXT:    s_add_u32 s25, s28, s25
; GFX8-NEXT:    v_mul_hi_u32 v2, s16, v2
; GFX8-NEXT:    v_readfirstlane_b32 s36, v1
; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v4
; GFX8-NEXT:    s_addc_u32 s26, s35, s26
; GFX8-NEXT:    s_mul_i32 s28, s6, s8
; GFX8-NEXT:    v_readfirstlane_b32 s35, v6
; GFX8-NEXT:    s_add_u32 s25, s28, s25
; GFX8-NEXT:    s_addc_u32 s26, s35, s26
; GFX8-NEXT:    s_mul_i32 s28, s16, s13
; GFX8-NEXT:    v_readfirstlane_b32 s35, v2
; GFX8-NEXT:    s_add_u32 s27, s28, s27
; GFX8-NEXT:    v_readfirstlane_b32 s37, v1
; GFX8-NEXT:    v_mul_hi_u32 v1, v3, s10
; GFX8-NEXT:    s_addc_u32 s25, s35, s25
; GFX8-NEXT:    s_mul_i32 s35, s1, s12
; GFX8-NEXT:    s_cselect_b32 s28, 1, 0
; GFX8-NEXT:    s_add_u32 s27, s35, s27
; GFX8-NEXT:    s_addc_u32 s25, s36, s25
; GFX8-NEXT:    s_mul_i32 s36, s2, s11
; GFX8-NEXT:    s_cselect_b32 s35, 1, 0
; GFX8-NEXT:    s_add_u32 s27, s36, s27
; GFX8-NEXT:    v_readfirstlane_b32 s38, v1
; GFX8-NEXT:    v_mul_hi_u32 v1, v5, s9
; GFX8-NEXT:    s_addc_u32 s25, s37, s25
; GFX8-NEXT:    s_mul_i32 s37, s3, s10
; GFX8-NEXT:    s_cselect_b32 s36, 1, 0
; GFX8-NEXT:    s_add_u32 s27, s37, s27
; GFX8-NEXT:    v_mul_hi_u32 v0, v0, s8
; GFX8-NEXT:    s_addc_u32 s25, s38, s25
; GFX8-NEXT:    s_mul_i32 s38, s4, s9
; GFX8-NEXT:    s_cselect_b32 s37, 1, 0
; GFX8-NEXT:    v_readfirstlane_b32 s39, v1
; GFX8-NEXT:    s_add_u32 s27, s38, s27
; GFX8-NEXT:    s_addc_u32 s25, s39, s25
; GFX8-NEXT:    s_mul_i32 s39, s5, s8
; GFX8-NEXT:    s_cselect_b32 s38, 1, 0
; GFX8-NEXT:    v_readfirstlane_b32 s40, v0
; GFX8-NEXT:    s_add_u32 s27, s39, s27
; GFX8-NEXT:    s_addc_u32 s25, s40, s25
; GFX8-NEXT:    s_cselect_b32 s39, 1, 0
; GFX8-NEXT:    s_cmp_lg_u32 s31, 0
; GFX8-NEXT:    s_addc_u32 s30, s30, 0
; GFX8-NEXT:    s_cmp_lg_u32 s33, 0
; GFX8-NEXT:    s_addc_u32 s30, s30, 0
; GFX8-NEXT:    s_cmp_lg_u32 s34, 0
; GFX8-NEXT:    s_addc_u32 s30, s30, 0
; GFX8-NEXT:    s_cmp_lg_u32 s21, 0
; GFX8-NEXT:    s_addc_u32 s21, s30, s27
; GFX8-NEXT:    s_cselect_b32 s27, 1, 0
; GFX8-NEXT:    s_cmp_lg_u32 s23, 0
; GFX8-NEXT:    s_addc_u32 s22, s22, 0
; GFX8-NEXT:    s_cmp_lg_u32 s24, 0
; GFX8-NEXT:    s_addc_u32 s22, s22, 0
; GFX8-NEXT:    s_cmp_lg_u32 s29, 0
; GFX8-NEXT:    s_addc_u32 s22, s22, 0
; GFX8-NEXT:    s_cmp_lg_u32 s27, 0
; GFX8-NEXT:    s_addc_u32 s22, s22, s25
; GFX8-NEXT:    s_mul_i32 s16, s16, s15
; GFX8-NEXT:    s_addc_u32 s15, s26, s16
; GFX8-NEXT:    s_mul_i32 s1, s1, s14
; GFX8-NEXT:    s_cmp_lg_u32 s39, 0
; GFX8-NEXT:    s_addc_u32 s1, s15, s1
; GFX8-NEXT:    s_mul_i32 s2, s2, s13
; GFX8-NEXT:    s_cmp_lg_u32 s38, 0
; GFX8-NEXT:    s_addc_u32 s1, s1, s2
; GFX8-NEXT:    s_mul_i32 s3, s3, s12
; GFX8-NEXT:    s_cmp_lg_u32 s37, 0
; GFX8-NEXT:    s_addc_u32 s1, s1, s3
; GFX8-NEXT:    s_mul_i32 s4, s4, s11
; GFX8-NEXT:    s_cmp_lg_u32 s36, 0
; GFX8-NEXT:    s_addc_u32 s1, s1, s4
; GFX8-NEXT:    s_mul_i32 s5, s5, s10
; GFX8-NEXT:    s_cmp_lg_u32 s35, 0
; GFX8-NEXT:    s_addc_u32 s1, s1, s5
; GFX8-NEXT:    s_mul_i32 s6, s6, s9
; GFX8-NEXT:    s_cmp_lg_u32 s28, 0
; GFX8-NEXT:    s_addc_u32 s1, s1, s6
; GFX8-NEXT:    s_mul_i32 s7, s7, s8
; GFX8-NEXT:    s_mul_i32 s0, s0, s8
; GFX8-NEXT:    s_add_u32 s7, s7, s1
; GFX8-NEXT:    s_mov_b32 s1, s18
; GFX8-NEXT:    s_mov_b32 s2, s17
; GFX8-NEXT:    s_mov_b32 s3, s19
; GFX8-NEXT:    s_mov_b32 s4, s20
; GFX8-NEXT:    s_mov_b32 s5, s21
; GFX8-NEXT:    s_mov_b32 s6, s22
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i256:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s16, s0
; GFX9-NEXT:    s_mul_i32 s18, s16, s10
; GFX9-NEXT:    s_mul_i32 s20, s1, s9
; GFX9-NEXT:    s_mul_hi_u32 s19, s16, s10
; GFX9-NEXT:    s_mul_hi_u32 s21, s1, s9
; GFX9-NEXT:    s_add_u32 s18, s20, s18
; GFX9-NEXT:    s_addc_u32 s19, s21, s19
; GFX9-NEXT:    s_mul_i32 s21, s2, s8
; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s22, s2, s8
; GFX9-NEXT:    s_add_u32 s18, s21, s18
; GFX9-NEXT:    s_mul_hi_u32 s17, s16, s8
; GFX9-NEXT:    s_addc_u32 s19, s22, s19
; GFX9-NEXT:    s_mul_i32 s22, s16, s9
; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s23, s16, s9
; GFX9-NEXT:    s_add_u32 s17, s22, s17
; GFX9-NEXT:    s_addc_u32 s18, s23, s18
; GFX9-NEXT:    s_mul_i32 s23, s1, s8
; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s24, s1, s8
; GFX9-NEXT:    s_add_u32 s17, s23, s17
; GFX9-NEXT:    s_addc_u32 s18, s24, s18
; GFX9-NEXT:    s_mul_i32 s24, s16, s12
; GFX9-NEXT:    s_mul_i32 s26, s1, s11
; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s25, s16, s12
; GFX9-NEXT:    s_mul_hi_u32 s27, s1, s11
; GFX9-NEXT:    s_add_u32 s24, s26, s24
; GFX9-NEXT:    s_addc_u32 s25, s27, s25
; GFX9-NEXT:    s_mul_i32 s27, s2, s10
; GFX9-NEXT:    s_cselect_b32 s26, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s28, s2, s10
; GFX9-NEXT:    s_add_u32 s24, s27, s24
; GFX9-NEXT:    s_addc_u32 s25, s28, s25
; GFX9-NEXT:    s_mul_i32 s28, s3, s9
; GFX9-NEXT:    s_cselect_b32 s27, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s29, s3, s9
; GFX9-NEXT:    s_add_u32 s24, s28, s24
; GFX9-NEXT:    s_addc_u32 s25, s29, s25
; GFX9-NEXT:    s_mul_i32 s29, s4, s8
; GFX9-NEXT:    s_cselect_b32 s28, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s30, s4, s8
; GFX9-NEXT:    s_add_u32 s24, s29, s24
; GFX9-NEXT:    s_addc_u32 s25, s30, s25
; GFX9-NEXT:    s_mul_i32 s30, s16, s11
; GFX9-NEXT:    s_cselect_b32 s29, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s31, s16, s11
; GFX9-NEXT:    s_add_u32 s19, s30, s19
; GFX9-NEXT:    s_addc_u32 s24, s31, s24
; GFX9-NEXT:    s_mul_i32 s31, s1, s10
; GFX9-NEXT:    s_cselect_b32 s30, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s33, s1, s10
; GFX9-NEXT:    s_add_u32 s19, s31, s19
; GFX9-NEXT:    s_addc_u32 s24, s33, s24
; GFX9-NEXT:    s_mul_i32 s33, s2, s9
; GFX9-NEXT:    s_cselect_b32 s31, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s34, s2, s9
; GFX9-NEXT:    s_add_u32 s19, s33, s19
; GFX9-NEXT:    s_addc_u32 s24, s34, s24
; GFX9-NEXT:    s_mul_i32 s34, s3, s8
; GFX9-NEXT:    s_cselect_b32 s33, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s35, s3, s8
; GFX9-NEXT:    s_add_u32 s19, s34, s19
; GFX9-NEXT:    s_addc_u32 s24, s35, s24
; GFX9-NEXT:    s_cselect_b32 s34, 1, 0
; GFX9-NEXT:    s_cmp_lg_u32 s23, 0
; GFX9-NEXT:    s_addc_u32 s19, s22, s19
; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
; GFX9-NEXT:    s_cmp_lg_u32 s21, 0
; GFX9-NEXT:    s_addc_u32 s20, s20, 0
; GFX9-NEXT:    s_cmp_lg_u32 s22, 0
; GFX9-NEXT:    s_addc_u32 s20, s20, s24
; GFX9-NEXT:    s_mul_i32 s22, s16, s14
; GFX9-NEXT:    s_mul_i32 s24, s1, s13
; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s23, s16, s14
; GFX9-NEXT:    s_mul_hi_u32 s35, s1, s13
; GFX9-NEXT:    s_add_u32 s22, s24, s22
; GFX9-NEXT:    s_addc_u32 s23, s35, s23
; GFX9-NEXT:    s_mul_i32 s24, s2, s12
; GFX9-NEXT:    s_mul_hi_u32 s35, s2, s12
; GFX9-NEXT:    s_add_u32 s22, s24, s22
; GFX9-NEXT:    s_addc_u32 s23, s35, s23
; GFX9-NEXT:    s_mul_i32 s24, s3, s11
; GFX9-NEXT:    s_mul_hi_u32 s35, s3, s11
; GFX9-NEXT:    s_add_u32 s22, s24, s22
; GFX9-NEXT:    s_addc_u32 s23, s35, s23
; GFX9-NEXT:    s_mul_i32 s24, s4, s10
; GFX9-NEXT:    s_mul_hi_u32 s35, s4, s10
; GFX9-NEXT:    s_add_u32 s22, s24, s22
; GFX9-NEXT:    s_addc_u32 s23, s35, s23
; GFX9-NEXT:    s_mul_i32 s24, s5, s9
; GFX9-NEXT:    s_mul_hi_u32 s35, s5, s9
; GFX9-NEXT:    s_add_u32 s22, s24, s22
; GFX9-NEXT:    s_addc_u32 s23, s35, s23
; GFX9-NEXT:    s_mul_i32 s24, s6, s8
; GFX9-NEXT:    s_mul_hi_u32 s35, s6, s8
; GFX9-NEXT:    s_add_u32 s22, s24, s22
; GFX9-NEXT:    s_addc_u32 s23, s35, s23
; GFX9-NEXT:    s_mul_i32 s24, s16, s13
; GFX9-NEXT:    s_mul_hi_u32 s35, s16, s13
; GFX9-NEXT:    s_add_u32 s24, s24, s25
; GFX9-NEXT:    s_addc_u32 s22, s35, s22
; GFX9-NEXT:    s_mul_i32 s35, s1, s12
; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s36, s1, s12
; GFX9-NEXT:    s_add_u32 s24, s35, s24
; GFX9-NEXT:    s_addc_u32 s22, s36, s22
; GFX9-NEXT:    s_mul_i32 s36, s2, s11
; GFX9-NEXT:    s_cselect_b32 s35, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s37, s2, s11
; GFX9-NEXT:    s_add_u32 s24, s36, s24
; GFX9-NEXT:    s_addc_u32 s22, s37, s22
; GFX9-NEXT:    s_mul_i32 s37, s3, s10
; GFX9-NEXT:    s_cselect_b32 s36, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s38, s3, s10
; GFX9-NEXT:    s_add_u32 s24, s37, s24
; GFX9-NEXT:    s_addc_u32 s22, s38, s22
; GFX9-NEXT:    s_mul_i32 s38, s4, s9
; GFX9-NEXT:    s_cselect_b32 s37, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s39, s4, s9
; GFX9-NEXT:    s_add_u32 s24, s38, s24
; GFX9-NEXT:    s_addc_u32 s22, s39, s22
; GFX9-NEXT:    s_mul_i32 s39, s5, s8
; GFX9-NEXT:    s_cselect_b32 s38, 1, 0
; GFX9-NEXT:    s_mul_hi_u32 s40, s5, s8
; GFX9-NEXT:    s_add_u32 s24, s39, s24
; GFX9-NEXT:    s_addc_u32 s22, s40, s22
; GFX9-NEXT:    s_cselect_b32 s39, 1, 0
; GFX9-NEXT:    s_cmp_lg_u32 s31, 0
; GFX9-NEXT:    s_addc_u32 s30, s30, 0
; GFX9-NEXT:    s_cmp_lg_u32 s33, 0
; GFX9-NEXT:    s_addc_u32 s30, s30, 0
; GFX9-NEXT:    s_cmp_lg_u32 s34, 0
; GFX9-NEXT:    s_addc_u32 s30, s30, 0
; GFX9-NEXT:    s_cmp_lg_u32 s21, 0
; GFX9-NEXT:    s_addc_u32 s21, s30, s24
; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
; GFX9-NEXT:    s_cmp_lg_u32 s27, 0
; GFX9-NEXT:    s_addc_u32 s26, s26, 0
; GFX9-NEXT:    s_cmp_lg_u32 s28, 0
; GFX9-NEXT:    s_addc_u32 s26, s26, 0
; GFX9-NEXT:    s_cmp_lg_u32 s29, 0
; GFX9-NEXT:    s_addc_u32 s26, s26, 0
; GFX9-NEXT:    s_cmp_lg_u32 s24, 0
; GFX9-NEXT:    s_addc_u32 s22, s26, s22
; GFX9-NEXT:    s_mul_i32 s16, s16, s15
; GFX9-NEXT:    s_addc_u32 s15, s23, s16
; GFX9-NEXT:    s_mul_i32 s1, s1, s14
; GFX9-NEXT:    s_cmp_lg_u32 s39, 0
; GFX9-NEXT:    s_addc_u32 s1, s15, s1
; GFX9-NEXT:    s_mul_i32 s2, s2, s13
; GFX9-NEXT:    s_cmp_lg_u32 s38, 0
; GFX9-NEXT:    s_addc_u32 s1, s1, s2
; GFX9-NEXT:    s_mul_i32 s3, s3, s12
; GFX9-NEXT:    s_cmp_lg_u32 s37, 0
; GFX9-NEXT:    s_addc_u32 s1, s1, s3
; GFX9-NEXT:    s_mul_i32 s4, s4, s11
; GFX9-NEXT:    s_cmp_lg_u32 s36, 0
; GFX9-NEXT:    s_addc_u32 s1, s1, s4
; GFX9-NEXT:    s_mul_i32 s5, s5, s10
; GFX9-NEXT:    s_cmp_lg_u32 s35, 0
; GFX9-NEXT:    s_addc_u32 s1, s1, s5
; GFX9-NEXT:    s_mul_i32 s6, s6, s9
; GFX9-NEXT:    s_cmp_lg_u32 s25, 0
; GFX9-NEXT:    s_addc_u32 s1, s1, s6
; GFX9-NEXT:    s_mul_i32 s7, s7, s8
; GFX9-NEXT:    s_mul_i32 s0, s0, s8
; GFX9-NEXT:    s_add_u32 s7, s7, s1
; GFX9-NEXT:    s_mov_b32 s1, s17
; GFX9-NEXT:    s_mov_b32 s2, s18
; GFX9-NEXT:    s_mov_b32 s3, s19
; GFX9-NEXT:    s_mov_b32 s4, s20
; GFX9-NEXT:    s_mov_b32 s5, s21
; GFX9-NEXT:    s_mov_b32 s6, s22
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i256:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_mul_i32 s17, s0, s10
; GFX10PLUS-NEXT:    s_mul_i32 s19, s1, s9
; GFX10PLUS-NEXT:    s_mul_hi_u32 s18, s0, s10
; GFX10PLUS-NEXT:    s_mul_hi_u32 s20, s1, s9
; GFX10PLUS-NEXT:    s_add_u32 s17, s19, s17
; GFX10PLUS-NEXT:    s_addc_u32 s18, s20, s18
; GFX10PLUS-NEXT:    s_mul_i32 s20, s2, s8
; GFX10PLUS-NEXT:    s_mul_hi_u32 s21, s2, s8
; GFX10PLUS-NEXT:    s_cselect_b32 s19, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s17, s20, s17
; GFX10PLUS-NEXT:    s_mul_hi_u32 s16, s0, s8
; GFX10PLUS-NEXT:    s_addc_u32 s18, s21, s18
; GFX10PLUS-NEXT:    s_mul_i32 s21, s0, s9
; GFX10PLUS-NEXT:    s_mul_hi_u32 s22, s0, s9
; GFX10PLUS-NEXT:    s_cselect_b32 s20, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s16, s21, s16
; GFX10PLUS-NEXT:    s_addc_u32 s17, s22, s17
; GFX10PLUS-NEXT:    s_mul_i32 s22, s1, s8
; GFX10PLUS-NEXT:    s_mul_hi_u32 s23, s1, s8
; GFX10PLUS-NEXT:    s_cselect_b32 s21, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s16, s22, s16
; GFX10PLUS-NEXT:    s_addc_u32 s17, s23, s17
; GFX10PLUS-NEXT:    s_mul_i32 s23, s0, s12
; GFX10PLUS-NEXT:    s_mul_i32 s25, s1, s11
; GFX10PLUS-NEXT:    s_mul_hi_u32 s24, s0, s12
; GFX10PLUS-NEXT:    s_mul_hi_u32 s26, s1, s11
; GFX10PLUS-NEXT:    s_cselect_b32 s22, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s23, s25, s23
; GFX10PLUS-NEXT:    s_addc_u32 s24, s26, s24
; GFX10PLUS-NEXT:    s_mul_i32 s26, s2, s10
; GFX10PLUS-NEXT:    s_mul_hi_u32 s27, s2, s10
; GFX10PLUS-NEXT:    s_cselect_b32 s25, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s23, s26, s23
; GFX10PLUS-NEXT:    s_addc_u32 s24, s27, s24
; GFX10PLUS-NEXT:    s_mul_i32 s27, s3, s9
; GFX10PLUS-NEXT:    s_mul_hi_u32 s28, s3, s9
; GFX10PLUS-NEXT:    s_cselect_b32 s26, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s23, s27, s23
; GFX10PLUS-NEXT:    s_addc_u32 s24, s28, s24
; GFX10PLUS-NEXT:    s_mul_i32 s28, s4, s8
; GFX10PLUS-NEXT:    s_mul_hi_u32 s29, s4, s8
; GFX10PLUS-NEXT:    s_cselect_b32 s27, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s23, s28, s23
; GFX10PLUS-NEXT:    s_addc_u32 s24, s29, s24
; GFX10PLUS-NEXT:    s_mul_i32 s29, s0, s11
; GFX10PLUS-NEXT:    s_mul_hi_u32 s30, s0, s11
; GFX10PLUS-NEXT:    s_cselect_b32 s28, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s18, s29, s18
; GFX10PLUS-NEXT:    s_addc_u32 s23, s30, s23
; GFX10PLUS-NEXT:    s_mul_i32 s30, s1, s10
; GFX10PLUS-NEXT:    s_mul_hi_u32 s31, s1, s10
; GFX10PLUS-NEXT:    s_cselect_b32 s29, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s18, s30, s18
; GFX10PLUS-NEXT:    s_addc_u32 s23, s31, s23
; GFX10PLUS-NEXT:    s_mul_i32 s31, s2, s9
; GFX10PLUS-NEXT:    s_mul_hi_u32 s33, s2, s9
; GFX10PLUS-NEXT:    s_cselect_b32 s30, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s18, s31, s18
; GFX10PLUS-NEXT:    s_addc_u32 s23, s33, s23
; GFX10PLUS-NEXT:    s_mul_i32 s33, s3, s8
; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s3, s8
; GFX10PLUS-NEXT:    s_cselect_b32 s31, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s18, s33, s18
; GFX10PLUS-NEXT:    s_addc_u32 s23, s34, s23
; GFX10PLUS-NEXT:    s_cselect_b32 s33, 1, 0
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s22, 0
; GFX10PLUS-NEXT:    s_mul_hi_u32 s22, s0, s14
; GFX10PLUS-NEXT:    s_addc_u32 s18, s21, s18
; GFX10PLUS-NEXT:    s_cselect_b32 s21, 1, 0
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s20, 0
; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s1, s13
; GFX10PLUS-NEXT:    s_addc_u32 s19, s19, 0
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s21, 0
; GFX10PLUS-NEXT:    s_mul_i32 s21, s0, s14
; GFX10PLUS-NEXT:    s_addc_u32 s19, s19, s23
; GFX10PLUS-NEXT:    s_mul_i32 s23, s1, s13
; GFX10PLUS-NEXT:    s_cselect_b32 s20, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s21, s23, s21
; GFX10PLUS-NEXT:    s_mul_i32 s23, s2, s12
; GFX10PLUS-NEXT:    s_addc_u32 s22, s34, s22
; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s2, s12
; GFX10PLUS-NEXT:    s_add_u32 s21, s23, s21
; GFX10PLUS-NEXT:    s_mul_i32 s23, s3, s11
; GFX10PLUS-NEXT:    s_addc_u32 s22, s34, s22
; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s3, s11
; GFX10PLUS-NEXT:    s_add_u32 s21, s23, s21
; GFX10PLUS-NEXT:    s_mul_i32 s23, s4, s10
; GFX10PLUS-NEXT:    s_addc_u32 s22, s34, s22
; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s4, s10
; GFX10PLUS-NEXT:    s_add_u32 s21, s23, s21
; GFX10PLUS-NEXT:    s_mul_i32 s23, s5, s9
; GFX10PLUS-NEXT:    s_addc_u32 s22, s34, s22
; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s5, s9
; GFX10PLUS-NEXT:    s_add_u32 s21, s23, s21
; GFX10PLUS-NEXT:    s_mul_i32 s23, s6, s8
; GFX10PLUS-NEXT:    s_addc_u32 s22, s34, s22
; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s6, s8
; GFX10PLUS-NEXT:    s_add_u32 s21, s23, s21
; GFX10PLUS-NEXT:    s_mul_i32 s23, s0, s13
; GFX10PLUS-NEXT:    s_addc_u32 s22, s34, s22
; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s0, s13
; GFX10PLUS-NEXT:    s_add_u32 s23, s23, s24
; GFX10PLUS-NEXT:    s_addc_u32 s21, s34, s21
; GFX10PLUS-NEXT:    s_mul_i32 s34, s1, s12
; GFX10PLUS-NEXT:    s_mul_hi_u32 s35, s1, s12
; GFX10PLUS-NEXT:    s_cselect_b32 s24, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s23, s34, s23
; GFX10PLUS-NEXT:    s_addc_u32 s21, s35, s21
; GFX10PLUS-NEXT:    s_mul_i32 s35, s2, s11
; GFX10PLUS-NEXT:    s_mul_hi_u32 s36, s2, s11
; GFX10PLUS-NEXT:    s_cselect_b32 s34, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s23, s35, s23
; GFX10PLUS-NEXT:    s_addc_u32 s21, s36, s21
; GFX10PLUS-NEXT:    s_mul_i32 s36, s3, s10
; GFX10PLUS-NEXT:    s_mul_hi_u32 s37, s3, s10
; GFX10PLUS-NEXT:    s_cselect_b32 s35, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s23, s36, s23
; GFX10PLUS-NEXT:    s_addc_u32 s21, s37, s21
; GFX10PLUS-NEXT:    s_mul_i32 s37, s4, s9
; GFX10PLUS-NEXT:    s_mul_hi_u32 s38, s4, s9
; GFX10PLUS-NEXT:    s_cselect_b32 s36, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s23, s37, s23
; GFX10PLUS-NEXT:    s_addc_u32 s21, s38, s21
; GFX10PLUS-NEXT:    s_mul_i32 s38, s5, s8
; GFX10PLUS-NEXT:    s_mul_hi_u32 s39, s5, s8
; GFX10PLUS-NEXT:    s_cselect_b32 s37, 1, 0
; GFX10PLUS-NEXT:    s_add_u32 s23, s38, s23
; GFX10PLUS-NEXT:    s_addc_u32 s21, s39, s21
; GFX10PLUS-NEXT:    s_cselect_b32 s38, 1, 0
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s30, 0
; GFX10PLUS-NEXT:    s_mul_i32 s1, s1, s14
; GFX10PLUS-NEXT:    s_addc_u32 s29, s29, 0
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s31, 0
; GFX10PLUS-NEXT:    s_mul_i32 s2, s2, s13
; GFX10PLUS-NEXT:    s_addc_u32 s29, s29, 0
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s33, 0
; GFX10PLUS-NEXT:    s_mul_i32 s3, s3, s12
; GFX10PLUS-NEXT:    s_addc_u32 s29, s29, 0
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s20, 0
; GFX10PLUS-NEXT:    s_mul_i32 s4, s4, s11
; GFX10PLUS-NEXT:    s_addc_u32 s20, s29, s23
; GFX10PLUS-NEXT:    s_cselect_b32 s23, 1, 0
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s26, 0
; GFX10PLUS-NEXT:    s_mul_i32 s26, s0, s15
; GFX10PLUS-NEXT:    s_addc_u32 s25, s25, 0
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s27, 0
; GFX10PLUS-NEXT:    s_mul_i32 s5, s5, s10
; GFX10PLUS-NEXT:    s_addc_u32 s25, s25, 0
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s28, 0
; GFX10PLUS-NEXT:    s_mul_i32 s6, s6, s9
; GFX10PLUS-NEXT:    s_addc_u32 s25, s25, 0
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s23, 0
; GFX10PLUS-NEXT:    s_mul_i32 s7, s7, s8
; GFX10PLUS-NEXT:    s_addc_u32 s15, s25, s21
; GFX10PLUS-NEXT:    s_addc_u32 s21, s22, s26
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s38, 0
; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s8
; GFX10PLUS-NEXT:    s_addc_u32 s1, s21, s1
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s37, 0
; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s2
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s36, 0
; GFX10PLUS-NEXT:    s_mov_b32 s2, s17
; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s3
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s35, 0
; GFX10PLUS-NEXT:    s_mov_b32 s3, s18
; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s4
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s34, 0
; GFX10PLUS-NEXT:    s_mov_b32 s4, s19
; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s5
; GFX10PLUS-NEXT:    s_cmp_lg_u32 s24, 0
; GFX10PLUS-NEXT:    s_mov_b32 s5, s20
; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s6
; GFX10PLUS-NEXT:    s_mov_b32 s6, s15
; GFX10PLUS-NEXT:    s_add_i32 s7, s1, s7
; GFX10PLUS-NEXT:    s_mov_b32 s1, s16
; GFX10PLUS-NEXT:    ; return to shader part epilog
;
; GFX12-LABEL: s_mul_i256:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_mul_i32 s17, s0, s10
; GFX12-NEXT:    s_mul_i32 s19, s1, s9
; GFX12-NEXT:    s_mul_hi_u32 s18, s0, s10
; GFX12-NEXT:    s_mul_hi_u32 s20, s1, s9
; GFX12-NEXT:    s_add_co_u32 s17, s19, s17
; GFX12-NEXT:    s_add_co_ci_u32 s18, s20, s18
; GFX12-NEXT:    s_mul_i32 s20, s2, s8
; GFX12-NEXT:    s_mul_hi_u32 s21, s2, s8
; GFX12-NEXT:    s_cselect_b32 s19, 1, 0
; GFX12-NEXT:    s_add_co_u32 s17, s20, s17
; GFX12-NEXT:    s_mul_hi_u32 s16, s0, s8
; GFX12-NEXT:    s_add_co_ci_u32 s18, s21, s18
; GFX12-NEXT:    s_mul_i32 s21, s0, s9
; GFX12-NEXT:    s_mul_hi_u32 s22, s0, s9
; GFX12-NEXT:    s_cselect_b32 s20, 1, 0
; GFX12-NEXT:    s_add_co_u32 s16, s21, s16
; GFX12-NEXT:    s_add_co_ci_u32 s17, s22, s17
; GFX12-NEXT:    s_mul_i32 s22, s1, s8
; GFX12-NEXT:    s_mul_hi_u32 s23, s1, s8
; GFX12-NEXT:    s_cselect_b32 s21, 1, 0
; GFX12-NEXT:    s_add_co_u32 s16, s22, s16
; GFX12-NEXT:    s_add_co_ci_u32 s17, s23, s17
; GFX12-NEXT:    s_mul_i32 s23, s0, s12
; GFX12-NEXT:    s_mul_i32 s25, s1, s11
; GFX12-NEXT:    s_mul_hi_u32 s24, s0, s12
; GFX12-NEXT:    s_mul_hi_u32 s26, s1, s11
; GFX12-NEXT:    s_cselect_b32 s22, 1, 0
; GFX12-NEXT:    s_add_co_u32 s23, s25, s23
; GFX12-NEXT:    s_add_co_ci_u32 s24, s26, s24
; GFX12-NEXT:    s_mul_i32 s26, s2, s10
; GFX12-NEXT:    s_mul_hi_u32 s27, s2, s10
; GFX12-NEXT:    s_cselect_b32 s25, 1, 0
; GFX12-NEXT:    s_add_co_u32 s23, s26, s23
; GFX12-NEXT:    s_add_co_ci_u32 s24, s27, s24
; GFX12-NEXT:    s_mul_i32 s27, s3, s9
; GFX12-NEXT:    s_mul_hi_u32 s28, s3, s9
; GFX12-NEXT:    s_cselect_b32 s26, 1, 0
; GFX12-NEXT:    s_add_co_u32 s23, s27, s23
; GFX12-NEXT:    s_add_co_ci_u32 s24, s28, s24
; GFX12-NEXT:    s_mul_i32 s28, s4, s8
; GFX12-NEXT:    s_mul_hi_u32 s29, s4, s8
; GFX12-NEXT:    s_cselect_b32 s27, 1, 0
; GFX12-NEXT:    s_add_co_u32 s23, s28, s23
; GFX12-NEXT:    s_add_co_ci_u32 s24, s29, s24
; GFX12-NEXT:    s_mul_i32 s29, s0, s11
; GFX12-NEXT:    s_mul_hi_u32 s30, s0, s11
; GFX12-NEXT:    s_cselect_b32 s28, 1, 0
; GFX12-NEXT:    s_add_co_u32 s18, s29, s18
; GFX12-NEXT:    s_add_co_ci_u32 s23, s30, s23
; GFX12-NEXT:    s_mul_i32 s30, s1, s10
; GFX12-NEXT:    s_mul_hi_u32 s31, s1, s10
; GFX12-NEXT:    s_cselect_b32 s29, 1, 0
; GFX12-NEXT:    s_add_co_u32 s18, s30, s18
; GFX12-NEXT:    s_add_co_ci_u32 s23, s31, s23
; GFX12-NEXT:    s_mul_i32 s31, s2, s9
; GFX12-NEXT:    s_mul_hi_u32 s33, s2, s9
; GFX12-NEXT:    s_cselect_b32 s30, 1, 0
; GFX12-NEXT:    s_add_co_u32 s18, s31, s18
; GFX12-NEXT:    s_add_co_ci_u32 s23, s33, s23
; GFX12-NEXT:    s_mul_i32 s33, s3, s8
; GFX12-NEXT:    s_mul_hi_u32 s34, s3, s8
; GFX12-NEXT:    s_cselect_b32 s31, 1, 0
; GFX12-NEXT:    s_add_co_u32 s18, s33, s18
; GFX12-NEXT:    s_add_co_ci_u32 s23, s34, s23
; GFX12-NEXT:    s_cselect_b32 s33, 1, 0
; GFX12-NEXT:    s_cmp_lg_u32 s22, 0
; GFX12-NEXT:    s_mul_hi_u32 s22, s0, s14
; GFX12-NEXT:    s_add_co_ci_u32 s18, s21, s18
; GFX12-NEXT:    s_cselect_b32 s21, 1, 0
; GFX12-NEXT:    s_cmp_lg_u32 s20, 0
; GFX12-NEXT:    s_mul_hi_u32 s34, s1, s13
; GFX12-NEXT:    s_add_co_ci_u32 s19, s19, 0
; GFX12-NEXT:    s_cmp_lg_u32 s21, 0
; GFX12-NEXT:    s_mul_i32 s21, s0, s14
; GFX12-NEXT:    s_add_co_ci_u32 s19, s19, s23
; GFX12-NEXT:    s_mul_i32 s23, s1, s13
; GFX12-NEXT:    s_cselect_b32 s20, 1, 0
; GFX12-NEXT:    s_add_co_u32 s21, s23, s21
; GFX12-NEXT:    s_mul_i32 s23, s2, s12
; GFX12-NEXT:    s_add_co_ci_u32 s22, s34, s22
; GFX12-NEXT:    s_mul_hi_u32 s34, s2, s12
; GFX12-NEXT:    s_add_co_u32 s21, s23, s21
; GFX12-NEXT:    s_mul_i32 s23, s3, s11
; GFX12-NEXT:    s_add_co_ci_u32 s22, s34, s22
; GFX12-NEXT:    s_mul_hi_u32 s34, s3, s11
; GFX12-NEXT:    s_add_co_u32 s21, s23, s21
; GFX12-NEXT:    s_mul_i32 s23, s4, s10
; GFX12-NEXT:    s_add_co_ci_u32 s22, s34, s22
; GFX12-NEXT:    s_mul_hi_u32 s34, s4, s10
; GFX12-NEXT:    s_add_co_u32 s21, s23, s21
; GFX12-NEXT:    s_mul_i32 s23, s5, s9
; GFX12-NEXT:    s_add_co_ci_u32 s22, s34, s22
; GFX12-NEXT:    s_mul_hi_u32 s34, s5, s9
; GFX12-NEXT:    s_add_co_u32 s21, s23, s21
; GFX12-NEXT:    s_mul_i32 s23, s6, s8
; GFX12-NEXT:    s_add_co_ci_u32 s22, s34, s22
; GFX12-NEXT:    s_mul_hi_u32 s34, s6, s8
; GFX12-NEXT:    s_add_co_u32 s21, s23, s21
; GFX12-NEXT:    s_mul_i32 s23, s0, s13
; GFX12-NEXT:    s_add_co_ci_u32 s22, s34, s22
; GFX12-NEXT:    s_mul_hi_u32 s34, s0, s13
; GFX12-NEXT:    s_add_co_u32 s23, s23, s24
; GFX12-NEXT:    s_add_co_ci_u32 s21, s34, s21
; GFX12-NEXT:    s_mul_i32 s34, s1, s12
; GFX12-NEXT:    s_mul_hi_u32 s35, s1, s12
; GFX12-NEXT:    s_cselect_b32 s24, 1, 0
; GFX12-NEXT:    s_add_co_u32 s23, s34, s23
; GFX12-NEXT:    s_add_co_ci_u32 s21, s35, s21
; GFX12-NEXT:    s_mul_i32 s35, s2, s11
; GFX12-NEXT:    s_mul_hi_u32 s36, s2, s11
; GFX12-NEXT:    s_cselect_b32 s34, 1, 0
; GFX12-NEXT:    s_add_co_u32 s23, s35, s23
; GFX12-NEXT:    s_add_co_ci_u32 s21, s36, s21
; GFX12-NEXT:    s_mul_i32 s36, s3, s10
; GFX12-NEXT:    s_mul_hi_u32 s37, s3, s10
; GFX12-NEXT:    s_cselect_b32 s35, 1, 0
; GFX12-NEXT:    s_add_co_u32 s23, s36, s23
; GFX12-NEXT:    s_add_co_ci_u32 s21, s37, s21
; GFX12-NEXT:    s_mul_i32 s37, s4, s9
; GFX12-NEXT:    s_mul_hi_u32 s38, s4, s9
; GFX12-NEXT:    s_cselect_b32 s36, 1, 0
; GFX12-NEXT:    s_add_co_u32 s23, s37, s23
; GFX12-NEXT:    s_add_co_ci_u32 s21, s38, s21
; GFX12-NEXT:    s_mul_i32 s38, s5, s8
; GFX12-NEXT:    s_mul_hi_u32 s39, s5, s8
; GFX12-NEXT:    s_cselect_b32 s37, 1, 0
; GFX12-NEXT:    s_add_co_u32 s23, s38, s23
; GFX12-NEXT:    s_add_co_ci_u32 s21, s39, s21
; GFX12-NEXT:    s_cselect_b32 s38, 1, 0
; GFX12-NEXT:    s_cmp_lg_u32 s30, 0
; GFX12-NEXT:    s_mul_i32 s1, s1, s14
; GFX12-NEXT:    s_add_co_ci_u32 s29, s29, 0
; GFX12-NEXT:    s_cmp_lg_u32 s31, 0
; GFX12-NEXT:    s_mul_i32 s2, s2, s13
; GFX12-NEXT:    s_add_co_ci_u32 s29, s29, 0
; GFX12-NEXT:    s_cmp_lg_u32 s33, 0
; GFX12-NEXT:    s_mul_i32 s3, s3, s12
; GFX12-NEXT:    s_add_co_ci_u32 s29, s29, 0
; GFX12-NEXT:    s_cmp_lg_u32 s20, 0
; GFX12-NEXT:    s_mul_i32 s4, s4, s11
; GFX12-NEXT:    s_add_co_ci_u32 s20, s29, s23
; GFX12-NEXT:    s_cselect_b32 s23, 1, 0
; GFX12-NEXT:    s_cmp_lg_u32 s26, 0
; GFX12-NEXT:    s_mul_i32 s26, s0, s15
; GFX12-NEXT:    s_add_co_ci_u32 s25, s25, 0
; GFX12-NEXT:    s_cmp_lg_u32 s27, 0
; GFX12-NEXT:    s_mul_i32 s5, s5, s10
; GFX12-NEXT:    s_add_co_ci_u32 s25, s25, 0
; GFX12-NEXT:    s_cmp_lg_u32 s28, 0
; GFX12-NEXT:    s_mul_i32 s6, s6, s9
; GFX12-NEXT:    s_add_co_ci_u32 s25, s25, 0
; GFX12-NEXT:    s_cmp_lg_u32 s23, 0
; GFX12-NEXT:    s_mul_i32 s7, s7, s8
; GFX12-NEXT:    s_add_co_ci_u32 s15, s25, s21
; GFX12-NEXT:    s_add_co_ci_u32 s21, s22, s26
; GFX12-NEXT:    s_cmp_lg_u32 s38, 0
; GFX12-NEXT:    s_mul_i32 s0, s0, s8
; GFX12-NEXT:    s_add_co_ci_u32 s1, s21, s1
; GFX12-NEXT:    s_cmp_lg_u32 s37, 0
; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, s2
; GFX12-NEXT:    s_cmp_lg_u32 s36, 0
; GFX12-NEXT:    s_mov_b32 s2, s17
; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, s3
; GFX12-NEXT:    s_cmp_lg_u32 s35, 0
; GFX12-NEXT:    s_mov_b32 s3, s18
; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, s4
; GFX12-NEXT:    s_cmp_lg_u32 s34, 0
; GFX12-NEXT:    s_mov_b32 s4, s19
; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, s5
; GFX12-NEXT:    s_cmp_lg_u32 s24, 0
; GFX12-NEXT:    s_mov_b32 s5, s20
; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, s6
; GFX12-NEXT:    s_mov_b32 s6, s15
; GFX12-NEXT:    s_add_co_i32 s7, s1, s7
; GFX12-NEXT:    s_mov_b32 s1, s16
; GFX12-NEXT:    ; return to shader part epilog
  %result = mul i256 %num, %den
  %cast = bitcast i256 %result to <8 x i32>
  ret <8 x i32> %cast
}

define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX7-LABEL: v_mul_i256:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v16, v0
; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0
; GFX7-NEXT:    v_mov_b32_e32 v17, v1
; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19]
; GFX7-NEXT:    v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0
; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1]
; GFX7-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s[4:5]
; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19]
; GFX7-NEXT:    v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1]
; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19]
; GFX7-NEXT:    v_addc_u32_e32 v25, vcc, 0, v24, vcc
; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
; GFX7-NEXT:    v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21]
; GFX7-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s[4:5]
; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
; GFX7-NEXT:    v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20]
; GFX7-NEXT:    v_addc_u32_e32 v21, vcc, 0, v21, vcc
; GFX7-NEXT:    v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20]
; GFX7-NEXT:    v_addc_u32_e32 v21, vcc, 0, v21, vcc
; GFX7-NEXT:    v_mov_b32_e32 v18, v23
; GFX7-NEXT:    v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20]
; GFX7-NEXT:    v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1]
; GFX7-NEXT:    v_addc_u32_e32 v21, vcc, 0, v21, vcc
; GFX7-NEXT:    v_mov_b32_e32 v0, v20
; GFX7-NEXT:    v_mov_b32_e32 v1, v23
; GFX7-NEXT:    v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1]
; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19]
; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1]
; GFX7-NEXT:    v_mul_lo_u32 v20, v6, v9
; GFX7-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[8:9]
; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1]
; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19]
; GFX7-NEXT:    v_mul_lo_u32 v23, v5, v10
; GFX7-NEXT:    v_mul_lo_u32 v26, v4, v11
; GFX7-NEXT:    v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1]
; GFX7-NEXT:    v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0
; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19]
; GFX7-NEXT:    v_mul_lo_u32 v13, v2, v13
; GFX7-NEXT:    v_mov_b32_e32 v2, v22
; GFX7-NEXT:    v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11]
; GFX7-NEXT:    v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2]
; GFX7-NEXT:    v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
; GFX7-NEXT:    v_mul_lo_u32 v12, v3, v12
; GFX7-NEXT:    v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19]
; GFX7-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[14:15]
; GFX7-NEXT:    v_addc_u32_e64 v18, s[8:9], 0, v6, s[8:9]
; GFX7-NEXT:    v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11]
; GFX7-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2]
; GFX7-NEXT:    v_addc_u32_e64 v3, s[8:9], v9, v3, s[8:9]
; GFX7-NEXT:    v_mul_lo_u32 v10, v16, v15
; GFX7-NEXT:    v_mul_lo_u32 v9, v17, v14
; GFX7-NEXT:    v_addc_u32_e64 v4, s[8:9], v25, v4, s[8:9]
; GFX7-NEXT:    v_addc_u32_e64 v5, s[8:9], v18, v5, s[8:9]
; GFX7-NEXT:    v_addc_u32_e64 v6, s[8:9], v21, v6, s[8:9]
; GFX7-NEXT:    v_addc_u32_e64 v10, s[8:9], v24, v10, s[8:9]
; GFX7-NEXT:    v_addc_u32_e64 v9, s[8:9], v10, v9, s[14:15]
; GFX7-NEXT:    v_addc_u32_e64 v9, s[8:9], v9, v13, s[12:13]
; GFX7-NEXT:    v_addc_u32_e64 v9, s[8:9], v9, v12, s[10:11]
; GFX7-NEXT:    v_addc_u32_e64 v9, s[6:7], v9, v26, s[6:7]
; GFX7-NEXT:    v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5]
; GFX7-NEXT:    v_addc_u32_e32 v9, vcc, v9, v20, vcc
; GFX7-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i256:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v16, v0
; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0
; GFX8-NEXT:    v_mov_b32_e32 v17, v1
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19]
; GFX8-NEXT:    v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1]
; GFX8-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s[4:5]
; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19]
; GFX8-NEXT:    v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1]
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19]
; GFX8-NEXT:    v_addc_u32_e32 v25, vcc, 0, v24, vcc
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
; GFX8-NEXT:    v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21]
; GFX8-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s[4:5]
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
; GFX8-NEXT:    v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20]
; GFX8-NEXT:    v_addc_u32_e32 v21, vcc, 0, v21, vcc
; GFX8-NEXT:    v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20]
; GFX8-NEXT:    v_addc_u32_e32 v21, vcc, 0, v21, vcc
; GFX8-NEXT:    v_mov_b32_e32 v18, v23
; GFX8-NEXT:    v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20]
; GFX8-NEXT:    v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1]
; GFX8-NEXT:    v_addc_u32_e32 v21, vcc, 0, v21, vcc
; GFX8-NEXT:    v_mov_b32_e32 v0, v20
; GFX8-NEXT:    v_mov_b32_e32 v1, v23
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1]
; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19]
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1]
; GFX8-NEXT:    v_mul_lo_u32 v20, v6, v9
; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[8:9]
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1]
; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19]
; GFX8-NEXT:    v_mul_lo_u32 v23, v5, v10
; GFX8-NEXT:    v_mul_lo_u32 v26, v4, v11
; GFX8-NEXT:    v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1]
; GFX8-NEXT:    v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0
; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19]
; GFX8-NEXT:    v_mul_lo_u32 v13, v2, v13
; GFX8-NEXT:    v_mov_b32_e32 v2, v22
; GFX8-NEXT:    v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11]
; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2]
; GFX8-NEXT:    v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
; GFX8-NEXT:    v_mul_lo_u32 v12, v3, v12
; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19]
; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[14:15]
; GFX8-NEXT:    v_addc_u32_e64 v18, s[8:9], 0, v6, s[8:9]
; GFX8-NEXT:    v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11]
; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2]
; GFX8-NEXT:    v_addc_u32_e64 v3, s[8:9], v9, v3, s[8:9]
; GFX8-NEXT:    v_mul_lo_u32 v10, v16, v15
; GFX8-NEXT:    v_mul_lo_u32 v9, v17, v14
; GFX8-NEXT:    v_addc_u32_e64 v4, s[8:9], v25, v4, s[8:9]
; GFX8-NEXT:    v_addc_u32_e64 v5, s[8:9], v18, v5, s[8:9]
; GFX8-NEXT:    v_addc_u32_e64 v6, s[8:9], v21, v6, s[8:9]
; GFX8-NEXT:    v_addc_u32_e64 v10, s[8:9], v24, v10, s[8:9]
; GFX8-NEXT:    v_addc_u32_e64 v9, s[8:9], v10, v9, s[14:15]
; GFX8-NEXT:    v_addc_u32_e64 v9, s[8:9], v9, v13, s[12:13]
; GFX8-NEXT:    v_addc_u32_e64 v9, s[8:9], v9, v12, s[10:11]
; GFX8-NEXT:    v_addc_u32_e64 v9, s[6:7], v9, v26, s[6:7]
; GFX8-NEXT:    v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5]
; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v9, v20, vcc
; GFX8-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i256:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v16, v0
; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0
; GFX9-NEXT:    v_mov_b32_e32 v17, v1
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19]
; GFX9-NEXT:    v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1]
; GFX9-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s[4:5]
; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19]
; GFX9-NEXT:    v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1]
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19]
; GFX9-NEXT:    v_addc_co_u32_e32 v25, vcc, 0, v24, vcc
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
; GFX9-NEXT:    v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21]
; GFX9-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s[4:5]
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
; GFX9-NEXT:    v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20]
; GFX9-NEXT:    v_addc_co_u32_e32 v21, vcc, 0, v21, vcc
; GFX9-NEXT:    v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20]
; GFX9-NEXT:    v_addc_co_u32_e32 v21, vcc, 0, v21, vcc
; GFX9-NEXT:    v_mov_b32_e32 v18, v23
; GFX9-NEXT:    v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20]
; GFX9-NEXT:    v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1]
; GFX9-NEXT:    v_addc_co_u32_e32 v21, vcc, 0, v21, vcc
; GFX9-NEXT:    v_mov_b32_e32 v0, v20
; GFX9-NEXT:    v_mov_b32_e32 v1, v23
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1]
; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19]
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1]
; GFX9-NEXT:    v_mul_lo_u32 v20, v6, v9
; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[8:9]
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1]
; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19]
; GFX9-NEXT:    v_mul_lo_u32 v23, v5, v10
; GFX9-NEXT:    v_mul_lo_u32 v26, v4, v11
; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1]
; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[8:9], 0, v6, s[8:9]
; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0
; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19]
; GFX9-NEXT:    v_mul_lo_u32 v13, v2, v13
; GFX9-NEXT:    v_mov_b32_e32 v2, v22
; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11]
; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2]
; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[8:9], 0, v6, s[8:9]
; GFX9-NEXT:    v_mul_lo_u32 v12, v3, v12
; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19]
; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[14:15]
; GFX9-NEXT:    v_addc_co_u32_e64 v18, s[8:9], 0, v6, s[8:9]
; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11]
; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2]
; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[8:9], v9, v3, s[8:9]
; GFX9-NEXT:    v_mul_lo_u32 v10, v16, v15
; GFX9-NEXT:    v_mul_lo_u32 v9, v17, v14
; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[8:9], v25, v4, s[8:9]
; GFX9-NEXT:    v_addc_co_u32_e64 v5, s[8:9], v18, v5, s[8:9]
; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[8:9], v21, v6, s[8:9]
; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[8:9], v24, v10, s[8:9]
; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[8:9], v10, v9, s[14:15]
; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[8:9], v9, v13, s[12:13]
; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[8:9], v9, v12, s[10:11]
; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[6:7], v9, v26, s[6:7]
; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[4:5], v9, v23, s[4:5]
; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v20, vcc
; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i256:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v16, v0
; GFX10-NEXT:    v_mov_b32_e32 v17, v1
; GFX10-NEXT:    v_mul_lo_u32 v27, v6, v9
; GFX10-NEXT:    v_mul_lo_u32 v28, v5, v10
; GFX10-NEXT:    v_mul_lo_u32 v7, v7, v8
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v16, v14, 0
; GFX10-NEXT:    v_mad_u64_u32 v[18:19], s4, v16, v12, 0
; GFX10-NEXT:    v_mul_lo_u32 v30, v17, v14
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v17, v13, v[0:1]
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v2, v12, v[0:1]
; GFX10-NEXT:    v_mad_u64_u32 v[18:19], s4, v17, v11, v[18:19]
; GFX10-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s4
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, v3, v11, v[0:1]
; GFX10-NEXT:    v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
; GFX10-NEXT:    v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
; GFX10-NEXT:    v_mad_u64_u32 v[20:21], s4, v16, v10, 0
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v4, v10, v[0:1]
; GFX10-NEXT:    v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
; GFX10-NEXT:    v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v5, v9, v[0:1]
; GFX10-NEXT:    v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
; GFX10-NEXT:    v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
; GFX10-NEXT:    v_mad_u64_u32 v[22:23], s4, v6, v8, v[0:1]
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v17, v9, v[20:21]
; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT:    v_mov_b32_e32 v20, v22
; GFX10-NEXT:    v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
; GFX10-NEXT:    v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v16, v13, v[19:20]
; GFX10-NEXT:    v_mov_b32_e32 v20, v18
; GFX10-NEXT:    v_mov_b32_e32 v19, v22
; GFX10-NEXT:    v_mul_lo_u32 v22, v16, v15
; GFX10-NEXT:    v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
; GFX10-NEXT:    v_mad_u64_u32 v[14:15], s6, v16, v11, v[19:20]
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, v16, v8, 0
; GFX10-NEXT:    v_mul_lo_u32 v20, v4, v11
; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s6
; GFX10-NEXT:    v_mad_u64_u32 v[18:19], s5, v2, v11, v[24:25]
; GFX10-NEXT:    v_mul_lo_u32 v25, v3, v12
; GFX10-NEXT:    v_mad_u64_u32 v[11:12], s6, v17, v10, v[14:15]
; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s6, 0, v6, s6
; GFX10-NEXT:    v_mul_lo_u32 v24, v2, v13
; GFX10-NEXT:    v_mad_u64_u32 v[18:19], s7, v3, v10, v[18:19]
; GFX10-NEXT:    v_mov_b32_e32 v13, v1
; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s6, v2, v9, v[11:12]
; GFX10-NEXT:    v_mov_b32_e32 v14, v21
; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s6, 0, v6, s6
; GFX10-NEXT:    v_mad_u64_u32 v[10:11], s6, v4, v9, v[18:19]
; GFX10-NEXT:    v_mad_u64_u32 v[12:13], s8, v16, v9, v[13:14]
; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s8
; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s8, v3, v8, v[1:2]
; GFX10-NEXT:    v_add_co_ci_u32_e64 v14, s8, 0, v6, s8
; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s8, v5, v8, v[10:11]
; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s9, v17, v8, v[12:13]
; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s9, v9, v3, s9
; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s9, v29, v4, s9
; GFX10-NEXT:    v_add_co_ci_u32_e64 v5, s9, v14, v5, s9
; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s9, v26, v6, s9
; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s9, v23, v22, s9
; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s8, v9, v30, s8
; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s6, v9, v24, s6
; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s6, v9, v25, s7
; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s5, v9, v20, s5
; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s4
; GFX10-NEXT:    v_add_nc_u32_e32 v7, v8, v7
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_mul_i256:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
; GFX11-NEXT:    v_mul_lo_u32 v7, v7, v8
; GFX11-NEXT:    v_mul_lo_u32 v27, v6, v9
; GFX11-NEXT:    v_mul_lo_u32 v28, v5, v10
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v16, v14, 0
; GFX11-NEXT:    v_mad_u64_u32 v[18:19], null, v16, v12, 0
; GFX11-NEXT:    v_mul_lo_u32 v30, v17, v14
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v17, v13, v[0:1]
; GFX11-NEXT:    v_mad_u64_u32 v[18:19], s0, v17, v11, v[18:19]
; GFX11-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s0
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v2, v12, v[0:1]
; GFX11-NEXT:    v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
; GFX11-NEXT:    v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
; GFX11-NEXT:    v_mad_u64_u32 v[20:21], null, v16, v10, 0
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v3, v11, v[0:1]
; GFX11-NEXT:    v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
; GFX11-NEXT:    v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v10, v[0:1]
; GFX11-NEXT:    v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
; GFX11-NEXT:    v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v5, v9, v[0:1]
; GFX11-NEXT:    v_mad_u64_u32 v[22:23], null, v6, v8, v[0:1]
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], s0, v17, v9, v[20:21]
; GFX11-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s0
; GFX11-NEXT:    v_mov_b32_e32 v20, v22
; GFX11-NEXT:    v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
; GFX11-NEXT:    v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], s0, v16, v13, v[19:20]
; GFX11-NEXT:    v_mov_b32_e32 v20, v18
; GFX11-NEXT:    v_mov_b32_e32 v19, v22
; GFX11-NEXT:    v_mul_lo_u32 v22, v16, v15
; GFX11-NEXT:    v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
; GFX11-NEXT:    v_mad_u64_u32 v[14:15], s2, v16, v11, v[19:20]
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v16, v8, 0
; GFX11-NEXT:    v_mul_lo_u32 v20, v4, v11
; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s2
; GFX11-NEXT:    v_mad_u64_u32 v[18:19], s1, v2, v11, v[24:25]
; GFX11-NEXT:    v_mul_lo_u32 v25, v3, v12
; GFX11-NEXT:    v_mad_u64_u32 v[11:12], s2, v17, v10, v[14:15]
; GFX11-NEXT:    v_mov_b32_e32 v14, v21
; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
; GFX11-NEXT:    v_mad_u64_u32 v[18:19], s3, v3, v10, v[18:19]
; GFX11-NEXT:    v_mul_lo_u32 v24, v2, v13
; GFX11-NEXT:    v_mov_b32_e32 v13, v1
; GFX11-NEXT:    v_mad_u64_u32 v[1:2], s2, v2, v9, v[11:12]
; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
; GFX11-NEXT:    v_mad_u64_u32 v[10:11], s2, v4, v9, v[18:19]
; GFX11-NEXT:    v_mad_u64_u32 v[12:13], s4, v16, v9, v[13:14]
; GFX11-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s4
; GFX11-NEXT:    v_mad_u64_u32 v[3:4], s4, v3, v8, v[1:2]
; GFX11-NEXT:    v_add_co_ci_u32_e64 v14, s4, 0, v6, s4
; GFX11-NEXT:    v_mad_u64_u32 v[5:6], s4, v5, v8, v[10:11]
; GFX11-NEXT:    v_mad_u64_u32 v[1:2], s5, v17, v8, v[12:13]
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, s5, v9, v3, s5
; GFX11-NEXT:    v_add_co_ci_u32_e64 v4, s5, v29, v4, s5
; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, s5, v14, v5, s5
; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, s5, v26, v6, s5
; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s5, v23, v22, s5
; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s4, v9, v30, s4
; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s2, v9, v24, s2
; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s2, v9, v25, s3
; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, s1, v9, v20, s1
; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s0
; GFX11-NEXT:    v_add_nc_u32_e32 v7, v8, v7
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_mul_i256:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
; GFX12-NEXT:    v_mul_lo_u32 v27, v6, v9
; GFX12-NEXT:    v_mul_lo_u32 v7, v7, v8
; GFX12-NEXT:    v_mul_lo_u32 v28, v5, v10
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v16, v14, 0
; GFX12-NEXT:    v_mad_co_u64_u32 v[18:19], null, v16, v12, 0
; GFX12-NEXT:    v_mul_lo_u32 v30, v17, v14
; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1]
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
; GFX12-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1]
; GFX12-NEXT:    v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX12-NEXT:    v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
; GFX12-NEXT:    v_mad_co_u64_u32 v[20:21], null, v16, v10, 0
; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1]
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-NEXT:    v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
; GFX12-NEXT:    v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1]
; GFX12-NEXT:    v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT:    v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1]
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1]
; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
; GFX12-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT:    v_mov_b32_e32 v20, v22
; GFX12-NEXT:    v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT:    v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20]
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX12-NEXT:    v_mov_b32_e32 v19, v22
; GFX12-NEXT:    v_mul_lo_u32 v22, v16, v15
; GFX12-NEXT:    v_mad_co_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v16, v8, 0
; GFX12-NEXT:    v_mov_b32_e32 v20, v18
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-NEXT:    v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[19:20]
; GFX12-NEXT:    v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[24:25]
; GFX12-NEXT:    v_mul_lo_u32 v20, v4, v11
; GFX12-NEXT:    v_mul_lo_u32 v25, v3, v12
; GFX12-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s2
; GFX12-NEXT:    v_mul_lo_u32 v24, v2, v13
; GFX12-NEXT:    v_mov_b32_e32 v13, v1
; GFX12-NEXT:    v_mad_co_u64_u32 v[11:12], s2, v17, v10, v[14:15]
; GFX12-NEXT:    v_mad_co_u64_u32 v[18:19], s3, v3, v10, v[18:19]
; GFX12-NEXT:    v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
; GFX12-NEXT:    v_mov_b32_e32 v14, v21
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12]
; GFX12-NEXT:    v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
; GFX12-NEXT:    v_mad_co_u64_u32 v[10:11], s2, v4, v9, v[18:19]
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_mad_co_u64_u32 v[12:13], s4, v16, v9, v[13:14]
; GFX12-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s4
; GFX12-NEXT:    v_mad_co_u64_u32 v[3:4], s4, v3, v8, v[1:2]
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_add_co_ci_u32_e64 v14, s4, 0, v6, s4
; GFX12-NEXT:    v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11]
; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13]
; GFX12-NEXT:    v_add_co_ci_u32_e64 v3, s5, v9, v3, s5
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_add_co_ci_u32_e64 v4, s5, v29, v4, s5
; GFX12-NEXT:    v_add_co_ci_u32_e64 v5, s5, v14, v5, s5
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_add_co_ci_u32_e64 v6, s5, v26, v6, s5
; GFX12-NEXT:    v_add_co_ci_u32_e64 v9, s5, v23, v22, s5
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_add_co_ci_u32_e64 v9, s4, v9, v30, s4
; GFX12-NEXT:    v_add_co_ci_u32_e64 v9, s2, v9, v24, s2
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_add_co_ci_u32_e64 v9, s2, v9, v25, s3
; GFX12-NEXT:    v_add_co_ci_u32_e64 v9, s1, v9, v20, s1
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
; GFX12-NEXT:    v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_add_nc_u32_e32 v7, v8, v7
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %result = mul i256 %num, %den
  ret i256 %result
}

; 64-bit multiply of a zero-extended 32-bit value by a constant, where the
; value is loaded through a pointer that is not marked inreg. In the expected
; output the load address is v[2:3] and the store address is v[0:1].
; The expected code is one 32x32->64 multiply-add with a zero addend:
;   - gfx700/gfx801/gfx900 first move 0x50 into a VGPR,
;   - gfx1010/gfx1100 pass 0x50 as a literal operand of v_mad_u64_u32,
;   - gfx1200 uses v_mad_co_u64_u32 with the same literal.
define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX7-LABEL: s_mul_u64_zext_with_vregs:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b64 s[0:1], 0
; GFX7-NEXT:    buffer_load_dword v2, v[2:3], s[0:3], 0 addr64
; GFX7-NEXT:    v_mov_b32_e32 v3, 0x50
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v2, v3, 0
; GFX7-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: s_mul_u64_zext_with_vregs:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    flat_load_dword v2, v[2:3]
; GFX8-NEXT:    v_mov_b32_e32 v3, 0x50
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0
; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: s_mul_u64_zext_with_vregs:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dword v2, v[2:3], off
; GFX9-NEXT:    v_mov_b32_e32 v3, 0x50
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0
; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: s_mul_u64_zext_with_vregs:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dword v2, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, 0x50, v2, 0
; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: s_mul_u64_zext_with_vregs:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, 0x50, v2, 0
; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: s_mul_u64_zext_with_vregs:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    global_load_b32 v2, v[2:3], off
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0
; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
  %val = load i32, ptr addrspace(1) %in, align 4
  %ext = zext i32 %val to i64
  ; 80 is the 0x50 immediate that appears in the expected assembly above.
  %mul = mul i64 %ext, 80
  store i64 %mul, ptr addrspace(1) %out, align 8
  ret void
}

; Kernel variant of the zero-extended 32-bit times constant test. The expected
; output fetches both pointers with one scalar load, then loads the multiplied
; value with a second scalar load, so the multiply operand is in an SGPR.
; Expected lowering of the 64-bit product by target:
;   - gfx700/gfx801: low half from a scalar multiply; high half from
;     v_mul_hi_u32, copied back to an SGPR with v_readfirstlane_b32,
;   - gfx900/gfx1010/gfx1100: s_mul_i32 for the low half and s_mul_hi_u32 for
;     the high half,
;   - gfx1200: the high register is set to 0 and a single s_mul_u64 is used.
define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX7-LABEL: s_mul_u64_zext_with_sregs:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7-NEXT:    v_mov_b32_e32 v0, 0x50
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s3, s[2:3], 0x0
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mul_hi_u32 v0, s3, v0
; GFX7-NEXT:    s_mul_i32 s4, s3, 0x50
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    v_readfirstlane_b32 s5, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: s_mul_u64_zext_with_sregs:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    v_mov_b32_e32 v0, 0x50
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v0
; GFX8-NEXT:    s_mulk_i32 s2, 0x50
; GFX8-NEXT:    v_readfirstlane_b32 s3, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: s_mul_u64_zext_with_sregs:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s3, s[2:3], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_i32 s2, s3, 0x50
; GFX9-NEXT:    s_mul_hi_u32 s3, s3, 0x50
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: s_mul_u64_zext_with_sregs:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s3, s[2:3], 0x0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mul_i32 s2, s3, 0x50
; GFX10-NEXT:    s_mul_hi_u32 s3, s3, 0x50
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: s_mul_u64_zext_with_sregs:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b32 s3, s[2:3], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mul_i32 s2, s3, 0x50
; GFX11-NEXT:    s_mul_hi_u32 s3, s3, 0x50
; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: s_mul_u64_zext_with_sregs:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX12-NEXT:    v_mov_b32_e32 v2, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
; GFX12-NEXT:    s_mov_b32 s3, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_mul_u64 s[2:3], s[2:3], 0x50
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
  %val = load i32, ptr addrspace(1) %in, align 4
  %ext = zext i32 %val to i64
  ; 80 is the 0x50 immediate that appears in the expected assembly above.
  %mul = mul i64 %ext, 80
  store i64 %mul, ptr addrspace(1) %out, align 8
  ret void
}

; 64-bit multiply of a sign-extended 32-bit value by a constant, with the
; value loaded through a pointer that is not marked inreg (address in v[2:3]
; in the expected output). Despite "u64" in the name, the IR uses sext.
; Expected lowering by target:
;   - gfx700/gfx801/gfx900: arithmetic shift right by 31 to form the high
;     word, then two v_mad_u64_u32 (the second accumulates into the high half),
;   - gfx1010/gfx1100: shift, one v_mad_u64_u32, then v_mul_lo_u32 and
;     v_add_nc_u32 to fold the high-word product into the result,
;   - gfx1200: a single signed v_mad_co_i64_i32.
define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX7-LABEL: s_mul_u64_sext_with_vregs:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b64 s[0:1], 0
; GFX7-NEXT:    buffer_load_dword v4, v[2:3], s[0:3], 0 addr64
; GFX7-NEXT:    v_mov_b32_e32 v5, 0x50
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
; GFX7-NEXT:    v_ashrrev_i32_e32 v4, 31, v4
; GFX7-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v4, v5, v[3:4]
; GFX7-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: s_mul_u64_sext_with_vregs:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    flat_load_dword v4, v[2:3]
; GFX8-NEXT:    v_mov_b32_e32 v5, 0x50
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v4
; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4]
; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: s_mul_u64_sext_with_vregs:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dword v4, v[2:3], off
; GFX9-NEXT:    v_mov_b32_e32 v5, 0x50
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v4
; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4]
; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: s_mul_u64_sext_with_vregs:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dword v2, v[2:3], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v2
; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, 0x50, v2, 0
; GFX10-NEXT:    v_mul_lo_u32 v4, 0x50, v4
; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v4
; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: s_mul_u64_sext_with_vregs:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_ashrrev_i32_e32 v4, 31, v2
; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, 0x50, v2, 0
; GFX11-NEXT:    v_mul_lo_u32 v4, 0x50, v4
; GFX11-NEXT:    v_add_nc_u32_e32 v3, v3, v4
; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: s_mul_u64_sext_with_vregs:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    global_load_b32 v2, v[2:3], off
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0
; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
  %val = load i32, ptr addrspace(1) %in, align 4
  %ext = sext i32 %val to i64
  ; 80 is the 0x50 immediate that appears in the expected assembly above.
  %mul = mul i64 %ext, 80
  store i64 %mul, ptr addrspace(1) %out, align 8
  ret void
}

; Kernel variant of the sign-extended 32-bit times constant test. The expected
; output reads the multiplied value with a scalar load, so it is in an SGPR.
; Despite "u64" in the name, the IR uses sext.
; Every target first forms the high word with s_ashr_i32 by 31. After that:
;   - gfx700/gfx801: scalar multiplies for the low half and for the high
;     word; the unsigned high half of the low product comes from v_mul_hi_u32
;     plus v_readfirstlane_b32; the two are combined with s_add_u32,
;   - gfx900: same shape but with s_mul_hi_u32 instead of the vector multiply,
;   - gfx1010/gfx1100: s_mul_hi_u32, two s_mulk_i32 and s_add_i32,
;   - gfx1200: a single s_mul_u64 on the sign-extended register pair.
define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX7-LABEL: s_mul_u64_sext_with_sregs:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7-NEXT:    v_mov_b32_e32 v0, 0x50
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s3, s[2:3], 0x0
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mul_hi_u32 v0, s3, v0
; GFX7-NEXT:    s_ashr_i32 s5, s3, 31
; GFX7-NEXT:    s_mul_i32 s4, s3, 0x50
; GFX7-NEXT:    s_mulk_i32 s5, 0x50
; GFX7-NEXT:    v_readfirstlane_b32 s3, v0
; GFX7-NEXT:    s_add_u32 s5, s5, s3
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: s_mul_u64_sext_with_sregs:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    v_mov_b32_e32 v0, 0x50
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v0
; GFX8-NEXT:    s_ashr_i32 s3, s2, 31
; GFX8-NEXT:    s_mulk_i32 s2, 0x50
; GFX8-NEXT:    s_mulk_i32 s3, 0x50
; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
; GFX8-NEXT:    s_add_u32 s3, s3, s4
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: s_mul_u64_sext_with_sregs:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s3, s[2:3], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
; GFX9-NEXT:    s_mul_i32 s2, s3, 0x50
; GFX9-NEXT:    s_mul_hi_u32 s3, s3, 0x50
; GFX9-NEXT:    s_mulk_i32 s4, 0x50
; GFX9-NEXT:    s_add_u32 s3, s4, s3
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: s_mul_u64_sext_with_sregs:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_ashr_i32 s3, s2, 31
; GFX10-NEXT:    s_mul_hi_u32 s4, s2, 0x50
; GFX10-NEXT:    s_mulk_i32 s3, 0x50
; GFX10-NEXT:    s_mulk_i32 s2, 0x50
; GFX10-NEXT:    s_add_i32 s3, s4, s3
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: s_mul_u64_sext_with_sregs:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_ashr_i32 s3, s2, 31
; GFX11-NEXT:    s_mul_hi_u32 s4, s2, 0x50
; GFX11-NEXT:    s_mulk_i32 s3, 0x50
; GFX11-NEXT:    s_mulk_i32 s2, 0x50
; GFX11-NEXT:    s_add_i32 s3, s4, s3
; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: s_mul_u64_sext_with_sregs:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX12-NEXT:    v_mov_b32_e32 v2, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_ashr_i32 s3, s2, 31
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT:    s_mul_u64 s[2:3], s[2:3], 0x50
; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT:    s_nop 0
; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT:    s_endpgm
  %val = load i32, ptr addrspace(1) %in, align 4
  %ext = sext i32 %val to i64
  ; 80 is the 0x50 immediate that appears in the expected assembly above.
  %mul = mul i64 %ext, 80
  store i64 %mul, ptr addrspace(1) %out, align 8
  ret void
}
