commit 84a775d (llvm.org GIT mirror llvm)

R600/SI: adjust writemask to only the used components

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179165 91177308-0d34-0410-b5e6-96231b3b80d8
5 changed files with 156 additions and 33 deletions.
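The writemask values that the updated CHECK lines in the test below now verify encode which of the four IMAGE_SAMPLE result components a shader actually reads: bit 0 = x (sub0), bit 1 = y (sub1), bit 2 = z (sub2), bit 3 = w (sub3), exactly as SubIdx2Lane and the `Writemask |= 1 << Lane` loop in adjustWritemask compute it. A minimal standalone sketch of that encoding (illustration only, not code from this patch):

  #include <cstdio>

  // Hypothetical standalone illustration: build the 4-bit MIMG writemask
  // from the result lanes that are actually extracted.
  // bit 0 = x (sub0), bit 1 = y (sub1), bit 2 = z (sub2), bit 3 = w (sub3).
  int main() {
    unsigned UsedLanes[] = {0, 2};     // shader reads only .x and .z
    unsigned Writemask = 0;
    for (unsigned Lane : UsedLanes)
      Writemask |= 1u << Lane;         // same update as in adjustWritemask()
    std::printf("writemask = %u\n", Writemask);  // prints 5 instead of the full 15
    return 0;
  }

With that encoding, a sample whose result is only read as .x and .z gets writemask 5 instead of 15, which is what the new CHECK values (15, 3, 2, 1, 4, 8, ...) record for each call in the test.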
SIISelLowering.cpp:

     Operand = SDValue(Node, 0);
   }

-SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
-                                          SelectionDAG &DAG) const {
+/// \brief Try to fold the Node's operands into the Node
+SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
+                                       SelectionDAG &DAG) const {

   // Original encoding (either e32 or e64)
   int Opcode = Node->getMachineOpcode();
@@ ... @@
   return DAG.getMachineNode(Desc->Opcode, Node->getDebugLoc(),
                             Node->getVTList(), Ops.data(), Ops.size());
 }
+
+/// \brief Helper function for adjustWritemask
+unsigned SubIdx2Lane(unsigned Idx) {
+  switch (Idx) {
+  default: return 0;
+  case AMDGPU::sub0: return 0;
+  case AMDGPU::sub1: return 1;
+  case AMDGPU::sub2: return 2;
+  case AMDGPU::sub3: return 3;
+  }
+}
+
+/// \brief Adjust the writemask of MIMG instructions
+void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
+                                       SelectionDAG &DAG) const {
+  SDNode *Users[4] = { };
+  unsigned Writemask = 0;
+
+  // Try to figure out the used register components
+  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
+       I != E; ++I) {
+
+    // Abort if we can't understand the usage
+    if (!I->isMachineOpcode() ||
+        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
+      return;
+
+    unsigned Lane = SubIdx2Lane(I->getConstantOperandVal(1));
+
+    // Abort if we have more than one user per component
+    if (Users[Lane])
+      return;
+
+    Users[Lane] = *I;
+    Writemask |= 1 << Lane;
+  }
+
+  // Abort if all components are used
+  if (Writemask == 0xf)
+    return;
+
+  // Adjust the writemask in the node
+  std::vector<SDValue> Ops;
+  Ops.push_back(DAG.getTargetConstant(Writemask, MVT::i32));
+  for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
+    Ops.push_back(Node->getOperand(i));
+  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size());
+
+  // Update the users of the node with the new indices
+  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
+
+    SDNode *User = Users[i];
+    if (!User)
+      continue;
+
+    SDValue Op = DAG.getTargetConstant(Idx, MVT::i32);
+    DAG.UpdateNodeOperands(User, User->getOperand(0), Op);
+
+    switch (Idx) {
+    default: break;
+    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
+    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
+    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
+    }
+  }
+}
+
+/// \brief Fold the instructions after selecting them
+SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
+                                          SelectionDAG &DAG) const {
+
+  if (AMDGPU::isMIMG(Node->getMachineOpcode()) != -1)
+    adjustWritemask(Node, DAG);
+
+  return foldOperands(Node, DAG);
+}
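One detail of adjustWritemask worth spelling out: because the `continue` skips the sub-register advance for unused lanes, the surviving EXTRACT_SUBREG users are compacted onto consecutive sub-registers of the shrunken result. A hypothetical standalone sketch of just that re-indexing (illustration only, not code from this patch):

  #include <cstdio>

  // Hypothetical sketch of the re-indexing loop at the end of
  // adjustWritemask(): only lanes that actually have a user consume a
  // sub-register index, so a user that read .y and .w of the full result
  // now reads sub0 and sub1 of the reduced result.
  int main() {
    bool Used[4] = {false, true, false, true};  // only .y and .w have users
    unsigned NewIdx = 0;                        // stands in for AMDGPU::sub0
    for (unsigned Lane = 0; Lane < 4; ++Lane) {
      if (!Used[Lane])
        continue;                               // unused lane: index does not advance
      std::printf("lane %u -> sub%u\n", Lane, NewIdx);  // y->sub0, w->sub1
      ++NewIdx;                                 // advance to the next sub-register
    }
    return 0;
  }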
SIISelLowering.h:

   void ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
                        unsigned RegClass, bool &ScalarSlotUsed) const;

+  SDNode *foldOperands(MachineSDNode *N, SelectionDAG &DAG) const;
+  void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
+
 public:
   SITargetLowering(TargetMachine &tm);

SIInstrInfo.h:

   int getVOPe64(uint16_t Opcode);
   int getCommuteRev(uint16_t Opcode);
   int getCommuteOrig(uint16_t Opcode);
+  int isMIMG(uint16_t Opcode);

 } // End namespace AMDGPU

SIInstrInfo.td:

   let ValueCols = [["1"]];
 }

+def isMIMG : InstrMapping {
+  let FilterClass = "MIMG_Load_Helper";
+  let RowFields = ["Inst"];
+  let ColFields = ["Size"];
+  let KeyCol = ["8"];
+  let ValueCols = [["8"]];
+}
+
 include "SIInstructions.td"
llvm.SI.sample.ll:

 ;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s

-;CHECK: IMAGE_SAMPLE
-;CHECK: IMAGE_SAMPLE
-;CHECK: IMAGE_SAMPLE
-;CHECK: IMAGE_SAMPLE
-;CHECK: IMAGE_SAMPLE
-;CHECK: IMAGE_SAMPLE_C
-;CHECK: IMAGE_SAMPLE_C
-;CHECK: IMAGE_SAMPLE_C
-;CHECK: IMAGE_SAMPLE
-;CHECK: IMAGE_SAMPLE
-;CHECK: IMAGE_SAMPLE_C
-;CHECK: IMAGE_SAMPLE_C
-;CHECK: IMAGE_SAMPLE_C
-;CHECK: IMAGE_SAMPLE
-;CHECK: IMAGE_SAMPLE
-;CHECK: IMAGE_SAMPLE
+;CHECK: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 15
+;CHECK: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 3
+;CHECK: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 2
+;CHECK: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 1
+;CHECK: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 4
+;CHECK: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 8
+;CHECK: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 5
+;CHECK: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 9
+;CHECK: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 6
+;CHECK: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 10
+;CHECK: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 12
+;CHECK: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 7
+;CHECK: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 11
+;CHECK: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 13
+;CHECK: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 14
+;CHECK: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 8

 define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
   %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
@@ ... @@
   %res16 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v16,
                                              <8 x i32> undef, <4 x i32> undef, i32 16)
   %e1 = extractelement <4 x float> %res1, i32 0
-  %e2 = extractelement <4 x float> %res2, i32 0
-  %e3 = extractelement <4 x float> %res3, i32 0
-  %e4 = extractelement <4 x float> %res4, i32 0
-  %e5 = extractelement <4 x float> %res5, i32 0
-  %e6 = extractelement <4 x float> %res6, i32 0
-  %e7 = extractelement <4 x float> %res7, i32 0
-  %e8 = extractelement <4 x float> %res8, i32 0
-  %e9 = extractelement <4 x float> %res9, i32 0
-  %e10 = extractelement <4 x float> %res10, i32 0
-  %e11 = extractelement <4 x float> %res11, i32 0
-  %e12 = extractelement <4 x float> %res12, i32 0
-  %e13 = extractelement <4 x float> %res13, i32 0
-  %e14 = extractelement <4 x float> %res14, i32 0
-  %e15 = extractelement <4 x float> %res15, i32 0
-  %e16 = extractelement <4 x float> %res16, i32 0
+  %e2 = extractelement <4 x float> %res2, i32 1
+  %e3 = extractelement <4 x float> %res3, i32 2
+  %e4 = extractelement <4 x float> %res4, i32 3
+  %t0 = extractelement <4 x float> %res5, i32 0
+  %t1 = extractelement <4 x float> %res5, i32 1
+  %e5 = fadd float %t0, %t1
+  %t2 = extractelement <4 x float> %res6, i32 0
+  %t3 = extractelement <4 x float> %res6, i32 2
+  %e6 = fadd float %t2, %t3
+  %t4 = extractelement <4 x float> %res7, i32 0
+  %t5 = extractelement <4 x float> %res7, i32 3
+  %e7 = fadd float %t4, %t5
+  %t6 = extractelement <4 x float> %res8, i32 1
+  %t7 = extractelement <4 x float> %res8, i32 2
+  %e8 = fadd float %t6, %t7
+  %t8 = extractelement <4 x float> %res9, i32 1
+  %t9 = extractelement <4 x float> %res9, i32 3
+  %e9 = fadd float %t8, %t9
+  %t10 = extractelement <4 x float> %res10, i32 2
+  %t11 = extractelement <4 x float> %res10, i32 3
+  %e10 = fadd float %t10, %t11
+  %t12 = extractelement <4 x float> %res11, i32 0
+  %t13 = extractelement <4 x float> %res11, i32 1
+  %t14 = extractelement <4 x float> %res11, i32 2
+  %t15 = fadd float %t12, %t13
+  %e11 = fadd float %t14, %t15
+  %t16 = extractelement <4 x float> %res12, i32 0
+  %t17 = extractelement <4 x float> %res12, i32 1
+  %t18 = extractelement <4 x float> %res12, i32 3
+  %t19 = fadd float %t16, %t17
+  %e12 = fadd float %t18, %t19
+  %t20 = extractelement <4 x float> %res13, i32 0
+  %t21 = extractelement <4 x float> %res13, i32 2
+  %t22 = extractelement <4 x float> %res13, i32 3
+  %t23 = fadd float %t20, %t21
+  %e13 = fadd float %t22, %t23
+  %t24 = extractelement <4 x float> %res14, i32 1
+  %t25 = extractelement <4 x float> %res14, i32 2
+  %t26 = extractelement <4 x float> %res14, i32 3
+  %t27 = fadd float %t24, %t25
+  %e14 = fadd float %t26, %t27
+  %t28 = extractelement <4 x float> %res15, i32 0
+  %t29 = extractelement <4 x float> %res15, i32 1
+  %t30 = extractelement <4 x float> %res15, i32 2
+  %t31 = extractelement <4 x float> %res15, i32 3
+  %t32 = fadd float %t28, %t29
+  %t33 = fadd float %t30, %t31
+  %e15 = fadd float %t32, %t33
+  %e16 = extractelement <4 x float> %res16, i32 3
   %s1 = fadd float %e1, %e2
   %s2 = fadd float %s1, %e3
   %s3 = fadd float %s2, %e4