NEON matrix palette skinning

Introduction

What is ARM NEON? – The ARM® NEON™ general-purpose SIMD engine … – in other words it is an extended instruction set similar to the x86 CPU SSE/SSE2 etc.

Why?

A friend of mine asked me from time to time: “What do you think about ARM NEON optimization for your 3D math functions?”
My answers were:

  • The FPS in my project is in the normal range
  • The profiler doesn’t show hot spots in the math functions
  • NEON requires data to be aligned to 16 bytes, and my code was not ready for this

A few weeks ago I added FSAA (full-screen anti-aliasing) to the game, and the FPS immediately fell below 20. That was a problem. After one week of optimizations the FPS was back up to 25. FSAA ate all of my GPU power, so the only way left to speed things up was to optimize the CPU-side code.

Usually, when I ran the Xcode profiler, I saw ~10% of CPU time inside the matrix palette skinning block. This code already looked well optimized, so my attention shifted to other places. One week ago my friend came to me and said: “Hey, yesterday I spent a lot of time learning the ARM NEON assembly instructions, and I feel like I can help you write that code. Let’s try to optimize your matrix palette skinning block.”
We sat together near my laptop and we started.

C/C++

Plain C++ code for matrix palette skinning:

Structures:

1
2
3
4
5
6
// Skinned vertex output: position followed by normal.
// The article hands this layout straight to a GL vertex-buffer update
// (the author writes "glSubData"; presumably glBufferSubData — TODO confirm).
struct PN
{
  Math::Vec3f p;  // position
  Math::Vec3f n;  // normal
};

Math functions:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
// Transforms a position and a normal by `mat` for a vertex that has a single
// bone influence, so no blend weight is applied.
//   outPN.p = inV * (rows 0..2 of mat) + row 3 of mat   (translation included)
//   outPN.n = inN * (rows 0..2 of mat)                  (translation excluded)
// Only the first three columns of `mat` are used. outPN is overwritten.
forceinline
void transformPointNormal4x3Weight_NoW(const Matrix44f& mat,const Vec3f& inV, const Vec3f& inN, BaseRenderScene::PN& outPN)
{
  // One output component per matrix column: position first, then normal.
  for (int col = 0; col < 3; ++col)
  {
    outPN.p.vec[col] = (inV.vec[0]*mat.mat[0][col] + inV.vec[1]*mat.mat[1][col] + inV.vec[2]*mat.mat[2][col] + mat.mat[3][col]);
    outPN.n.vec[col] = (inN.vec[0]*mat.mat[0][col] + inN.vec[1]*mat.mat[1][col] + inN.vec[2]*mat.mat[2][col]);
  }
}

// Transforms a position and a normal by `mat` and scales both results by the
// blend weight `w`. This is the "set" step for the first bone influence.
//   outPN.p = (inV * (rows 0..2 of mat) + row 3 of mat) * w
//   outPN.n = (inN * (rows 0..2 of mat)) * w
// Only the first three columns of `mat` are used. outPN is overwritten.
forceinline
void transformPointNormal4x3Weight(const Matrix44f& mat,const Vec3f& inV, const Vec3f& inN, BaseRenderScene::PN& outPN,float w )
{
  // One output component per matrix column: position first, then normal.
  for (int col = 0; col < 3; ++col)
  {
    outPN.p.vec[col] = (inV.vec[0]*mat.mat[0][col] + inV.vec[1]*mat.mat[1][col] + inV.vec[2]*mat.mat[2][col] + mat.mat[3][col])*w;
    outPN.n.vec[col] = (inN.vec[0]*mat.mat[0][col] + inN.vec[1]*mat.mat[1][col] + inN.vec[2]*mat.mat[2][col])*w;
  }
}

// Transforms a position and a normal by `mat`, scales both by the blend
// weight `w`, and adds the result to what outPN already holds. This is the
// "accumulate" step for every bone influence after the first.
//   outPN.p += (inV * (rows 0..2 of mat) + row 3 of mat) * w
//   outPN.n += (inN * (rows 0..2 of mat)) * w
// Only the first three columns of `mat` are used.
forceinline
void transformPointNormal4x3AddWeighted(const Matrix44f& mat,const Vec3f& inV, const Vec3f& inN, BaseRenderScene::PN& outPN,float w )
{
  // One output component per matrix column: position first, then normal.
  for (int col = 0; col < 3; ++col)
  {
    outPN.p.vec[col] += (inV.vec[0]*mat.mat[0][col] + inV.vec[1]*mat.mat[1][col] + inV.vec[2]*mat.mat[2][col] + mat.mat[3][col])*w;
    outPN.n.vec[col] += (inN.vec[0]*mat.mat[0][col] + inN.vec[1]*mat.mat[1][col] + inN.vec[2]*mat.mat[2][col])*w;
  }
}

Processing of a single vertex:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
// Scalar C++ path: skin vertex `v` through its `wCount` bone influences and
// write the blended position/normal to skinTempPN[v].
const Vec3f& vx = pVerticies[v];
const Vec3f& vxN = pNormals[v];

// First influence (slot 0) of this vertex.
float w = pVertexWeight[v].vec[0];
int boneIndex = pVertexBones[v].vec[0];
const Matrix44f& boneTM = pBoneTMList[boneIndex];
if( wCount==1 )
{
  // Single influence: the weight is not applied.
  transformPointNormal4x3Weight_NoW(boneTM,vx,vxN,skinTempPN[v]);
}
else
{
  // first influence: overwrite the output (no add)
  transformPointNormal4x3Weight(boneTM,vx,vxN,skinTempPN[v],w);
  for(size_t i=1;i<wCount;i++)
  {
   // remaining influences: accumulate into the output
   w = pVertexWeight[v].vec[i];
   boneIndex = pVertexBones[v].vec[i];
   // Distinct name so the slot-0 matrix reference above is not shadowed.
   const Matrix44f& nextBoneTM = pBoneTMList[boneIndex];
   transformPointNormal4x3AddWeighted(nextBoneTM,vx,vxN,skinTempPN[v],w);
  }
}

Note: To use this code you need to store the number of blend weights for each vertex. In my case almost 30% of the vertices had only one blend weight.

ASM

The code below is an ARM NEON assembly version of the algorithm above.

Note: For ARM NEON all data must be aligned to 16 bytes.
Because of this limitation all my input vertex data is aligned to 16 bytes, so the position and normal components of my input vertices were converted to Vec4f.
The output data is still aligned to 4 bytes, because it goes directly to glSubData (of course you may keep the output position and normal as two Vec4f values, but in that case you need to send 8 more bytes to the GPU for each vertex).

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
// Enable the NEON code path when the compiler reports NEON support.
#if defined(__ARM_NEON__)
#define USE_NEON
#endif

// NOTE(review): the #endif that closes this conditional is not visible in this excerpt.
#if defined(USE_NEON)


#ifdef __thumb__
#error "This file should be compiled in ARM mode only."
// Note in Xcode, right click file, Get Info->Build, Other compiler flags = "-marm"
#endif

// Register names are kept as string literals so they can be pasted into the
// inline-asm text by adjacent string-literal concatenation.

// Output position accumulator.
#define OP  "q0"

// Single-precision views of the first three lanes of OP (q0 overlaps s0-s3);
// used to store x, y, z individually.
#define OPS0  "s0"
#define OPS1  "s1"
#define OPS2  "s2"

// Output normal accumulator.
#define ON  "q1"

// Single-precision views of the first three lanes of ON (q1 overlaps s4-s7).
#define ONS0  "s4"
#define ONS1  "s5"
#define ONS2  "s6"


// Input position and input normal.
#define IP  "q2"
#define IN  "q3"

// Scalar lanes of IP (q2 is d4:d5).
#define IPX "d4[0]"
#define IPY "d4[1]"
#define IPZ "d5[0]"
#define IPW "d5[1]"

// Scalar lanes of IN (q3 is d6:d7).
#define INX "d6[0]"
#define INY "d6[1]"
#define INZ "d7[0]"
#define INW "d7[1]"

// Blend weights and their four scalar lanes (q4 is d8:d9).
#define WQ "q4"
#define W0D "d8[0]"
#define W1D "d8[1]"
#define W2D "d9[0]"
#define W3D "d9[1]"

// Unquoted names for the matrix row registers. The visible macros and
// functions spell q8-q11 literally and do not reference these.
#define QM0 q8
#define QM1 q9
#define QM2 q10
#define QM3 q11

// Scratch register holding the unweighted transform result.
#define QT  "q14"


// outP = mt.row0*pos.x + mt.row1*pos.y + mt.row2*pos.z + mt.row3*pos.w
// q8-q11 are expected to hold the four matrix rows (see mat_load).
// The translation row is scaled by the position's w lane, so the input
// position presumably carries w == 1 — TODO confirm against the data setup.
#define mat_pos(_RES) \
"vmul.f32 " _RES ", q8, " IPX "\n\t" \
"vmla.f32 " _RES ", q9, " IPY "\n\t" \
"vmla.f32 " _RES ", q10, " IPZ "\n\t" \
"vmla.f32 " _RES ", q11, " IPW "\n\t"


// _RES = (transformed position) * weight lane _WD; _QT is used as scratch.
#define mat_pos_w_set(_RES,_QT,_WD) \
mat_pos(_QT) \
"vmul.f32 " _RES ", " _QT ", " _WD "\n\t"


// _RES += (transformed position) * weight lane _WD; _QT is used as scratch.
#define mat_pos_w_add(_RES,_QT,_WD) \
mat_pos(_QT) \
"vmla.f32 " _RES ", " _QT ", " _WD "\n\t"



// outN = mt.row0*nor.x + mt.row1*nor.y + mt.row2*nor.z
// (the fourth matrix row is not applied to normals)
#define mat_nor(_RES) \
"vmul.f32 " _RES ", q8, " INX "\n\t" \
"vmla.f32 " _RES ", q9, " INY "\n\t" \
"vmla.f32 " _RES ", q10, " INZ "\n\t"


// _RES = (transformed normal) * weight lane _WD; _QT is used as scratch.
#define mat_nor_w_set(_RES,_QT,_WD) \
mat_nor(_QT) \
"vmul.f32 " _RES ", " _QT ", " _WD "\n\t"


// _RES += (transformed normal) * weight lane _WD; _QT is used as scratch.
#define mat_nor_w_add(_RES,_QT,_WD) \
mat_nor(_QT) \
"vmla.f32 " _RES ", " _QT ", " _WD "\n\t"


// Store the first three lanes of OP followed by the first three lanes of ON
// as six consecutive floats (24 bytes) starting at the address in register _R.
// Every string literal is separated from the adjacent macro name by a space:
// since C++11, `"text"NAME` is lexed as a literal with a user-defined suffix,
// so NAME would not be expanded as a macro (clang rejects it by default).
#define STORE3_P3N3(_R) \
"fsts " OPS0 ",[" _R "] \n\t" \
"fsts " OPS1 ",[" _R ",#4] \n\t" \
"fsts " OPS2 ",[" _R ",#8] \n\t" \
"fsts " ONS0 ",[" _R ",#12] \n\t" \
"fsts " ONS1 ",[" _R ",#16] \n\t" \
"fsts " ONS2 ",[" _R ",#20] \n\t"



// Load 64 bytes (four q registers) from the address in _R into q8-q11;
// the mat_pos / mat_nor macros read these as the matrix rows.
#define mat_load(_R) \
"vldmia " _R ", { q8-q11 } \n\t"


// Skins one vertex that has a single bone influence (no blend weight).
//   mat0    - bone matrix; 64 bytes are loaded from it into q8-q11
//   posnorm - input position followed by input normal; 32 bytes are loaded
//             into q2-q3
//   outPN   - destination; six floats are written (position x,y,z, then
//             normal x,y,z)
// NOTE(review): mat_pos scales the fourth matrix row by the position's w
// lane, so the input presumably stores w == 1 — confirm against the data setup.
__attribute__((always_inline))
void clalcSkin1(
                const Matrix44f* mat0,
                const Vec4f* posnorm,
                Vec3f* outPN)
{
  //
  asm volatile
  (
   // q4-q7 need to be preserved
   "vldmia %1, { " IP " - " IN " } \n\t"        // pos norm
   // OP p temp
   // ON n temp
   //
   // mat0
   mat_load("%0")
   mat_pos(OP)
   mat_nor(ON)
   STORE3_P3N3("%2")
   : // no output
   : "r" (mat0), "r" (posnorm), "r" (outPN)
   // WQ and QT are listed as clobbered although no instruction above references them.
   : "memory", IP, IN, WQ, QT, OP, ON, "q8", "q9", "q10", "q11" //clobber
   );
}

// Skins one vertex blended between two bone influences.
//   mat0, mat1 - bone matrices; each is loaded (64 bytes) into q8-q11 in turn
//   posnorm    - input position followed by input normal; 32 bytes are
//                loaded into q2-q3
//   weight     - blend weights; 16 bytes are loaded into q4, lanes 0 and 1
//                are used
//   outPN      - destination; six floats are written (position x,y,z, then
//                normal x,y,z)
// NOTE(review): mat_pos scales the fourth matrix row by the position's w
// lane, so the input presumably stores w == 1 — confirm against the data setup.
__attribute__((always_inline))
void clalcSkin2(
                const Matrix44f* mat0,
                const Matrix44f* mat1,
                const Vec4f* posnorm,
                const Vec4f* weight,
                Vec3f* outPN)
{
  //
  asm volatile
  (
   // q4-q7 need to be preserved
   // q4 (WQ) is parked in q15 here and copied back after the stores.
   "vmov q15," WQ "\n\t"
   //
   "vldmia %2, { " IP " - " IN " } \n\t"        // pos norm
   "vldmia %3, { " WQ " } \n\t"     // weights
   // QT intermediate temp
   // OP p temp
   // ON n temp
   //
   // mat0
   mat_load("%0")
   mat_pos_w_set(OP,QT,W0D)
   mat_nor_w_set(ON,QT,W0D)
   // mat 1
   mat_load("%1")
   mat_pos_w_add(OP,QT,W1D)
   mat_nor_w_add(ON,QT,W1D)
   // output pos3f,norm3f
   STORE3_P3N3("%4")
   // restore q4 (WQ)
   "vmov " WQ ", q15 \n\t"
   : // no output
   : "r" (mat0), "r" (mat1), "r" (posnorm), "r" (weight), "r" (outPN)
   // WQ is saved/restored by hand above and is also declared clobbered here.
   : "memory", IP, IN, WQ, QT , OP, ON, "q8", "q9", "q10", "q11", "q15" //clobber
   );
}

// Skins one vertex blended between three bone influences.
//   mat0..mat2 - bone matrices; each is loaded (64 bytes) into q8-q11 in turn
//   posnorm    - input position followed by input normal; 32 bytes are
//                loaded into q2-q3
//   weight     - blend weights; 16 bytes are loaded into q4, lanes 0..2 are
//                used
//   outPN      - destination; six floats are written (position x,y,z, then
//                normal x,y,z)
// NOTE(review): mat_pos scales the fourth matrix row by the position's w
// lane, so the input presumably stores w == 1 — confirm against the data setup.
__attribute__((always_inline))
void clalcSkin3(
                const Matrix44f* mat0,
                const Matrix44f* mat1,
                const Matrix44f* mat2,
                const Vec4f* posnorm,
                const Vec4f* weight,
                Vec3f* outPN)
{
  //
  asm volatile
  (
   // q4-q7 need to be preserved
   // q4 (WQ) is parked in q15 here and copied back after the stores.
   "vmov q15," WQ "\n\t"
   //
   "vldmia %3, { " IP " - " IN " } \n\t"        // pos norm
   "vldmia %4, { " WQ " } \n\t"     // weights
   // QT intermediate temp
   // OP p temp
   // ON n temp
   //
   // mat0
   mat_load("%0")
   mat_pos_w_set(OP,QT,W0D)
   mat_nor_w_set(ON,QT,W0D)
   // mat 1
   mat_load("%1")
   mat_pos_w_add(OP,QT,W1D)
   mat_nor_w_add(ON,QT,W1D)
   // mat 2
   mat_load("%2")
   mat_pos_w_add(OP,QT,W2D)
   mat_nor_w_add(ON,QT,W2D)
   // output pos,normal
   STORE3_P3N3("%5")
   
   // restore q4 (WQ)
   "vmov " WQ ", q15 \n\t"
   
   : // no output
   : "r" (mat0), "r" (mat1), "r" (mat2),"r" (posnorm), "r" (weight), "r" (outPN)
   // WQ is saved/restored by hand above and is also declared clobbered here.
   : "memory", IP, IN, WQ, QT, OP, ON, "q8", "q9", "q10", "q11", "q15" //clobber
   );
}
// Skins one vertex blended between four bone influences.
//   mat0..mat3 - bone matrices; each is loaded (64 bytes) into q8-q11 in turn
//   posnorm    - input position followed by input normal; 32 bytes are
//                loaded into q2-q3
//   weight     - blend weights; 16 bytes are loaded into q4, all four lanes
//                are used
//   outPN      - destination; six floats are written (position x,y,z, then
//                normal x,y,z)
// NOTE(review): mat_pos scales the fourth matrix row by the position's w
// lane, so the input presumably stores w == 1 — confirm against the data setup.
__attribute__((always_inline))
void clalcSkin4(
                const Matrix44f* mat0,
                const Matrix44f* mat1,
                const Matrix44f* mat2,
                const Matrix44f* mat3,
                const Vec4f* posnorm,
                const Vec4f* weight,
                Vec3f* outPN)
{
  //
  asm volatile
  (
   // q4-q7 need to be preserved
   // q4 (WQ) is parked in q15 here and copied back after the stores.
   "vmov q15," WQ "\n\t"
   //
   "vldmia %4, { " IP " - " IN " } \n\t"        // pos norm
   "vldmia %5, { " WQ " } \n\t"     // weights
   // QT intermediate temp
   // OP p temp
   // ON n temp
   //
   // mat0
   mat_load("%0")
   mat_pos_w_set(OP,QT,W0D)
   mat_nor_w_set(ON,QT,W0D)
   // mat 1
   mat_load("%1")
   mat_pos_w_add(OP,QT,W1D)
   mat_nor_w_add(ON,QT,W1D)
   // mat 2
   mat_load("%2")
   mat_pos_w_add(OP,QT,W2D)
   mat_nor_w_add(ON,QT,W2D)
   // mat 3
   mat_load("%3")
   mat_pos_w_add(OP,QT,W3D)
   mat_nor_w_add(ON,QT,W3D)
   // output pos,normal
   STORE3_P3N3("%6")
   
   // restore q4 (WQ)
   "vmov " WQ ", q15\n\t"
   
   : // no output
   : "r" (mat0), "r" (mat1), "r" (mat2), "r" (mat3), "r" (posnorm), "r" (weight), "r" (outPN)
   // WQ is saved/restored by hand above and is also declared clobbered here.
   : "memory", IP, IN, WQ, QT, OP, ON, "q8", "q9", "q10", "q11", "q15" //clobber
   );
}

Code to process single vertex

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#if defined(USE_NEON)
 
  //
  if( wCount==1)
  {
    int boneIndex0 = pVertexBones[v].vec[0];
    const Matrix44f* boneTM0 = &pBoneTMList[boneIndex0];
    clalcSkin1(boneTM0,&pVerticiesNormals[v].pn[0],&skinTempPN[v].p);
  }
  else if( wCount==2)
  {
    int boneIndex0 = pVertexBones[v].vec[0];
    const Matrix44f* boneTM0 = &pBoneTMList[boneIndex0];
    int boneIndex1 = pVertexBones[v].vec[1];
    const Matrix44f* boneTM1 = &pBoneTMList[boneIndex1];
    clalcSkin2(boneTM0,boneTM1,&pVerticiesNormals[v].pn[0],&pVertexWeight[v],&skinTempPN[v].p);
  }
  else if( wCount==3)
  {
    int boneIndex0 = pVertexBones[v].vec[0];
    const Matrix44f* boneTM0 = &pBoneTMList[boneIndex0];
    int boneIndex1 = pVertexBones[v].vec[1];
    const Matrix44f* boneTM1 = &pBoneTMList[boneIndex1];
    int boneIndex2 = pVertexBones[v].vec[2];
    const Matrix44f* boneTM2 = &pBoneTMList[boneIndex2];
    clalcSkin3(boneTM0,boneTM1,boneTM2,&pVerticiesNormals[v].pn[0],&pVertexWeight[v],&skinTempPN[v].p);
  }
  else
  {
    int boneIndex0 = pVertexBones[v].vec[0];
    const Matrix44f* boneTM0 = &pBoneTMList[boneIndex0];
    int boneIndex1 = pVertexBones[v].vec[1];
    const Matrix44f* boneTM1 = &pBoneTMList[boneIndex1];
    int boneIndex2 = pVertexBones[v].vec[2];
    const Matrix44f* boneTM2 = &pBoneTMList[boneIndex2];
    int boneIndex3 = pVertexBones[v].vec[3];
    const Matrix44f* boneTM3 = &pBoneTMList[boneIndex3];
    clalcSkin4(boneTM0,boneTM1,boneTM2,boneTM3,&pVerticiesNormals[v].pn[0],&pVertexWeight[v],&skinTempPN[v].p);
  }

Results

I don’t have synthetic tests to measure performance. Instead I just measured performance in my game.

502ms(c++) vs 307ms(arm neon) on a ~10 sec interval for iPhone 4.
In this case the ARM NEON version takes about 39% less time than the C++ version.

Thanks.

Comments are closed.