Skip to content

Commit e985a49

Browse files
mbellew and revmischa authored
Fixrad (#469)
* Small rewrite of PerPixelMath to use one nested for loop instead of multiple nested for loops, hopefully giving better memory locality and maybe helping the C optimizer. Also, some presets are very sensitive to zoom/zoomexp, such as "Aderrasi - Contortion (Escher's Tunnel Mix).milk". Tracked down the discrepancy in rendering to the different scaling of the "rad" variable; see also "Idiot24-7 - Ascending to heaven 2.milk".
* Try to minimize calls to transcendental functions in PerPixelMath_c (harder to do in PerPixelMath_sse).
* Minimize calls to transcendental functions in PerPixelMath_sse.
* Update src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp
Co-authored-by: Mischa Spiegelmock <[email protected]>
1 parent 10faca9 commit e985a49

File tree

1 file changed

+174
-182
lines changed

1 file changed

+174
-182
lines changed

src/libprojectM/MilkdropPresetFactory/PresetFrameIO.cpp

+174-182
Original file line number | Diff line number | Diff line change
@@ -179,59 +179,52 @@ void PresetOutputs::Render(const BeatDetect &music, const PipelineContext &conte
179179
// N.B. The more optimization that can be done on this method, the better! This is called a lot and can probably be improved.
180180
void PresetOutputs::PerPixelMath_c(const PipelineContext &context)
181181
{
182-
for (int x = 0; x < gx; x++)
183-
{
184-
for (int y = 0; y < gy; y++)
185-
{
186-
const float fZoom2 = std::pow(this->zoom_mesh[x][y], std::pow(this->zoomexp_mesh[x][y],
187-
rad_mesh[x][y] * 2.0f - 1.0f));
188-
const float fZoom2Inv = 1.0f / fZoom2;
189-
this->x_mesh[x][y] = this->orig_x[x][y] * 0.5f * fZoom2Inv + 0.5f;
190-
this->x_mesh[x][y] = (this->x_mesh[x][y] - this->cx_mesh[x][y]) / this->sx_mesh[x][y] + this->cx_mesh[x][y];
191-
this->y_mesh[x][y] = this->orig_y[x][y] * 0.5f * fZoom2Inv + 0.5f;
192-
this->y_mesh[x][y] = (this->y_mesh[x][y] - this->cy_mesh[x][y]) / this->sy_mesh[x][y] + this->cy_mesh[x][y];
193-
}
194-
}
195-
196-
const float fWarpTime = context.time * this->fWarpAnimSpeed;
197-
const float fWarpScaleInv = 1.0f / this->fWarpScale;
198-
float f[4];
199-
f[0] = 11.68f + 4.0f * cosf(fWarpTime * 1.413f + 10);
200-
f[1] = 8.77f + 3.0f * cosf(fWarpTime * 1.113f + 7);
201-
f[2] = 10.54f + 3.0f * cosf(fWarpTime * 1.233f + 3);
202-
f[3] = 11.49f + 4.0f * cosf(fWarpTime * 0.933f + 5);
203-
204-
for (int x = 0; x < gx; x++)
182+
const float fWarpTime = context.time * this->fWarpAnimSpeed;
183+
const float fWarpScaleInv = 1.0f / this->fWarpScale;
184+
float f[4];
185+
f[0] = 11.68f + 4.0f * cosf(fWarpTime * 1.413f + 10);
186+
f[1] = 8.77f + 3.0f * cosf(fWarpTime * 1.113f + 7);
187+
f[2] = 10.54f + 3.0f * cosf(fWarpTime * 1.233f + 3);
188+
f[3] = 11.49f + 4.0f * cosf(fWarpTime * 0.933f + 5);
189+
190+
for (int x = 0; x < gx; x++)
205191
{
206192
for (int y = 0; y < gy; y++)
207193
{
208194
const float orig_x2 = this->orig_x[x][y];
209195
const float orig_y2 = this->orig_y[x][y];
210-
const float warp_mesh2 = this->warp_mesh[x][y] * 0.0035f;
211-
212-
this->x_mesh[x][y] +=
213-
(warp_mesh2 * sinf(fWarpTime * 0.333f + fWarpScaleInv * (orig_x2 * f[0] - orig_y2 * f[3]))) +
214-
(warp_mesh2 * cosf(fWarpTime * 0.753f - fWarpScaleInv * (orig_x2 * f[1] - orig_y2 * f[2])));
215196

216-
this->y_mesh[x][y] +=
217-
(warp_mesh2 * cosf(fWarpTime * 0.375f - fWarpScaleInv * (orig_x2 * f[2] + orig_y2 * f[1]))) +
218-
(warp_mesh2 * sinf(fWarpTime * 0.825f + fWarpScaleInv * (orig_x2 * f[0] + orig_y2 * f[3])));
219-
}
220-
}
221-
222-
for (int x = 0; x < gx; x++)
223-
{
224-
for (int y = 0; y < gy; y++)
225-
{
226-
const float u2 = this->x_mesh[x][y] - this->cx_mesh[x][y];
227-
const float v2 = this->y_mesh[x][y] - this->cy_mesh[x][y];
228-
229-
const float rot2 = this->rot_mesh[x][y];
230-
const float cos_rot = cosf(rot2);
231-
const float sin_rot = sinf(rot2);
232-
233-
this->x_mesh[x][y] = u2 * cos_rot - v2 * sin_rot + this->cx_mesh[x][y] - this->dx_mesh[x][y];
234-
this->y_mesh[x][y] = u2 * sin_rot + v2 * cos_rot + this->cy_mesh[x][y] - this->dy_mesh[x][y];
197+
// zoom and stretch
198+
const float fZoom2Inv = this->zoom_mesh[x][y] == 1.0 ? 1.0 :
199+
std::pow(this->zoom_mesh[x][y], -1*std::pow(this->zoomexp_mesh[x][y], rad_mesh[x][y] * 2.0f - 1.0f));
200+
float u = orig_x2 * 0.5f * fZoom2Inv + 0.5f;
201+
u = (u - this->cx_mesh[x][y]) / this->sx_mesh[x][y] + this->cx_mesh[x][y];
202+
float v = orig_y2 * 0.5f * fZoom2Inv + 0.5f;
203+
v = (v - this->cy_mesh[x][y]) / this->sy_mesh[x][y] + this->cy_mesh[x][y];
204+
205+
// warp
206+
if (this->warp_mesh[x][y] != 0.0)
207+
{
208+
const float warp_mesh2 = this->warp_mesh[x][y] * 0.0035f;
209+
u += warp_mesh2 * (sinf(fWarpTime * 0.333f + fWarpScaleInv * (orig_x2 * f[0] - orig_y2 * f[3])) +
210+
cosf(fWarpTime * 0.753f - fWarpScaleInv * (orig_x2 * f[1] - orig_y2 * f[2])));
211+
212+
v += warp_mesh2 * (cosf(fWarpTime * 0.375f - fWarpScaleInv * (orig_x2 * f[2] + orig_y2 * f[1])) +
213+
sinf(fWarpTime * 0.825f + fWarpScaleInv * (orig_x2 * f[0] + orig_y2 * f[3])));
214+
}
215+
216+
// rotate and translate
217+
if (rot != 0.0)
218+
{
219+
const float cos_rot = cosf(this->rot_mesh[x][y]);
220+
const float sin_rot = sinf(this->rot_mesh[x][y]);
221+
const float u2 = u - this->cx_mesh[x][y];
222+
const float v2 = v - this->cy_mesh[x][y];
223+
u = u2 * cos_rot - v2 * sin_rot + this->cx_mesh[x][y];
224+
v = u2 * sin_rot + v2 * cos_rot + this->cy_mesh[x][y];
225+
}
226+
this->x_mesh[x][y] = u - this->dx_mesh[x][y];
227+
this->y_mesh[x][y] = v - this->dy_mesh[x][y];
235228
}
236229
}
237230
}
@@ -291,157 +284,156 @@ inline __m128 _mm_cosf(__m128 x)
291284

292285
void PresetOutputs::PerPixelMath_sse(const PipelineContext &context)
293286
{
287+
const float fWarpTime = context.time * this->fWarpAnimSpeed;
288+
const float fWarpScaleInv = 1.0f / this->fWarpScale;
289+
const float f[4] =
290+
{
291+
11.68f + 4.0f * cosf(fWarpTime * 1.413f + 10),
292+
8.77f + 3.0f * cosf(fWarpTime * 1.113f + 7),
293+
10.54f + 3.0f * cosf(fWarpTime * 1.233f + 3),
294+
11.49f + 4.0f * cosf(fWarpTime * 0.933f + 5)
295+
};
296+
294297
for (int x = 0; x < gx; x++)
295298
{
296299
for (int y = 0; y < gy; y += 4)
297300
{
298-
// fZoom2 = std::pow(this->zoom_mesh[x][y], std::pow(this->zoomexp_mesh[x][y],
299-
// rad_mesh[x][y] * 2.0f - 1.0f));
300-
__m128 rad_mesh_scaled =
301-
_mm_sub_ps(
302-
_mm_mul_ps(
303-
_mm_load_ps(&this->rad_mesh[x][y]),
304-
_mm_set_ps1(2.0f)),
305-
_mm_set_ps1(1.0f));
306-
__m128 zoom_mesh2 = _mm_load_ps(&this->zoom_mesh[x][y]);
307-
__m128 zoomexp_mesh2 = _mm_load_ps(&this->zoomexp_mesh[x][y]);
308-
__m128 fZoom2 = _mm_pow(zoom_mesh2, _mm_pow(zoomexp_mesh2, rad_mesh_scaled));
309-
// fZoom2Inv = 1.0f / fZoom2;
310-
__m128 fZoomInv = _mm_rcp_ps(fZoom2);
311-
312-
// this->x_mesh[x][y] = this->orig_x[x][y] * 0.5f * fZoom2Inv + 0.5f;
313-
__m128 x_mesh2 =
301+
// const float orig_x2 = this->orig_x[x][y];
302+
// const float orig_y2 = this->orig_y[x][y];
303+
const __m128 orig_x2 = _mm_load_ps(&this->orig_x[x][y]);
304+
const __m128 orig_y2 = _mm_load_ps(&this->orig_y[x][y]);
305+
306+
bool zoomOne = this->zoom_mesh[x][y] == 1.0 && this->zoom_mesh[x][y+1] == 1.0 && this->zoom_mesh[x][y+2] == 1.0 && this->zoom_mesh[x][y+3] == 1.0;
307+
__m128 fZoom2Inv = _mm_set_ps1(1.0f);
308+
if (!zoomOne)
309+
{
310+
// fZoom2 = std::pow(this->zoom_mesh[x][y], std::pow(this->zoomexp_mesh[x][y],
311+
// rad_mesh[x][y] * 2.0f - 1.0f));
312+
const __m128 rad_mesh_scaled =
313+
_mm_sub_ps(
314+
_mm_mul_ps(
315+
_mm_load_ps(&this->rad_mesh[x][y]),
316+
_mm_set_ps1(2.0f)),
317+
_mm_set_ps1(1.0f));
318+
const __m128 zoom_mesh2 = _mm_load_ps(&this->zoom_mesh[x][y]);
319+
const __m128 zoomexp_mesh2 = _mm_load_ps(&this->zoomexp_mesh[x][y]);
320+
const __m128 fZoom2 = _mm_pow(zoom_mesh2, _mm_pow(zoomexp_mesh2, rad_mesh_scaled));
321+
// fZoom2Inv = 1.0f / fZoom2;
322+
fZoom2Inv = _mm_rcp_ps(fZoom2);
323+
}
324+
325+
// float u = orig_x2 * 0.5f * fZoom2Inv + 0.5f;
326+
__m128 u =
314327
_mm_add_ps(
315328
_mm_mul_ps(
316-
_mm_load_ps(&this->orig_x[x][y]),
317-
_mm_mul_ps(fZoomInv,_mm_set_ps1(0.5f))), // CONSIDER: common sub-expression
329+
orig_x2,
330+
_mm_mul_ps(fZoom2Inv,_mm_set_ps1(0.5f))), // CONSIDER: common sub-expression
318331
_mm_set_ps1(0.5f));
319-
// this->x_mesh[x][y] = (this->x_mesh[x][y] - this->cx_mesh[x][y]) / this->sx_mesh[x][y] + this->cx_mesh[x][y];
320-
__m128 cx_mesh2 = _mm_load_ps(&this->cx_mesh[x][y]);
321-
__m128 sx_mesh2 = _mm_load_ps(&this->sx_mesh[x][y]);
322-
_mm_store_ps(&this->x_mesh[x][y],
323-
_mm_add_ps(
332+
// u = (u - this->cx_mesh[x][y]) / this->sx_mesh[x][y] + this->cx_mesh[x][y];
333+
const __m128 cx_mesh2 = _mm_load_ps(&this->cx_mesh[x][y]);
334+
const __m128 sx_mesh2 = _mm_load_ps(&this->sx_mesh[x][y]);
335+
u = _mm_add_ps(
324336
_mm_div_ps(
325-
_mm_sub_ps(x_mesh2,cx_mesh2),
337+
_mm_sub_ps(u,cx_mesh2),
326338
sx_mesh2),
327339
cx_mesh2
328-
));
340+
);
329341

330-
// this->y_mesh[x][y] = this->orig_y[x][y] * 0.5f * fZoom2Inv + 0.5f;
331-
__m128 y_mesh2 =
342+
// float v = orig_y2 * 0.5f * fZoom2Inv + 0.5f;
343+
__m128 v =
332344
_mm_add_ps(
333345
_mm_mul_ps(
334-
_mm_load_ps(&this->orig_y[x][y]),
335-
_mm_mul_ps(fZoomInv,_mm_set_ps1(0.5f))),
346+
orig_y2,
347+
_mm_mul_ps(fZoom2Inv,_mm_set_ps1(0.5f))),
336348
_mm_set_ps1(0.5f));
337-
// this->y_mesh[x][y] = (this->y_mesh[x][y] - this->cy_mesh[x][y]) / this->sy_mesh[x][y] + this->cy_mesh[x][y];
338-
__m128 cy_mesh2 = _mm_load_ps(&this->cy_mesh[x][y]);
339-
__m128 sy_mesh2 = _mm_load_ps(&this->sy_mesh[x][y]);
340-
_mm_store_ps(&this->y_mesh[x][y],
341-
_mm_add_ps(
349+
// v = (v - this->cy_mesh[x][y]) / this->sy_mesh[x][y] + this->cy_mesh[x][y];
350+
const __m128 cy_mesh2 = _mm_load_ps(&this->cy_mesh[x][y]);
351+
const __m128 sy_mesh2 = _mm_load_ps(&this->sy_mesh[x][y]);
352+
v = _mm_add_ps(
342353
_mm_div_ps(
343-
_mm_sub_ps(y_mesh2,cy_mesh2),
354+
_mm_sub_ps(v,cy_mesh2),
344355
sy_mesh2),
345356
cy_mesh2
346-
));
347-
}
348-
}
349-
350-
const float fWarpTime = context.time * this->fWarpAnimSpeed;
351-
const float fWarpScaleInv = 1.0f / this->fWarpScale;
352-
const float f[4] =
353-
{
354-
11.68f + 4.0f * cosf(fWarpTime * 1.413f + 10),
355-
8.77f + 3.0f * cosf(fWarpTime * 1.113f + 7),
356-
10.54f + 3.0f * cosf(fWarpTime * 1.233f + 3),
357-
11.49f + 4.0f * cosf(fWarpTime * 0.933f + 5)
358-
};
359-
360-
for (int x = 0; x < gx; x++)
361-
{
362-
for (int y = 0; y < gy; y+=4)
363-
{
364-
//float orig_x = this->orig_x[x][y];
365-
//float orig_y = this->orig_y[x][y];
366-
//float warp_mesh = this->warp_mesh[x][y] * 0.0035f;
367-
const __m128 orig_x2 = _mm_load_ps(&this->orig_x[x][y]);
368-
const __m128 orig_y2 = _mm_load_ps(&this->orig_y[x][y]);
369-
const __m128 warp_mesh2 = _mm_mul_ps(_mm_load_ps(&this->warp_mesh[x][y]), _mm_set_ps1(0.0035f));
370-
371-
// this->x_mesh[x][y] +=
372-
// (warp_mesh * sinf(fWarpTime * 0.333f + fWarpScaleInv * (orig_x * f[0] - orig_y * f[3]))) +
373-
// (warp_mesh * cosf(fWarpTime * 0.753f - fWarpScaleInv * (orig_x * f[1] - orig_y * f[2])));
374-
_mm_store_ps(&this->x_mesh[x][y],
375-
_mm_add_ps(_mm_load_ps(&this->x_mesh[x][y]),
376-
_mm_add_ps(
377-
_mm_mul_ps(warp_mesh2, _mm_sinf(
378-
_mm_add_ps(
379-
_mm_set_ps1(fWarpTime*0.333f),
380-
_mm_mul_ps(_mm_set_ps1(fWarpScaleInv),
381-
_mm_sub_ps(
382-
_mm_mul_ps(orig_x2, _mm_set_ps1(f[0])),
383-
_mm_mul_ps(orig_y2, _mm_set_ps1(f[3]))
384-
))))),
385-
_mm_mul_ps(warp_mesh2, _mm_cosf(
386-
_mm_sub_ps(
387-
_mm_set_ps1(fWarpTime*0.753f),
388-
_mm_mul_ps(_mm_set_ps1(fWarpScaleInv),
389-
_mm_sub_ps(
390-
_mm_mul_ps(orig_x2, _mm_set_ps1(f[1])),
391-
_mm_mul_ps(orig_y2, _mm_set_ps1(f[2]))
392-
))))))));
393-
394-
// this->y_mesh[x][y] +=
395-
// (warp_mesh * cosf(fWarpTime * 0.375f - fWarpScaleInv * (orig_x * f[2] + orig_y * f[1]))) +
396-
// (warp_mesh * sinf(fWarpTime * 0.825f + fWarpScaleInv * (orig_x * f[0] + orig_y * f[3])));
397-
_mm_store_ps(&this->y_mesh[x][y],
398-
_mm_add_ps(_mm_load_ps(&this->y_mesh[x][y]),
399-
_mm_add_ps(
400-
_mm_mul_ps(warp_mesh2, _mm_cosf(
401-
_mm_sub_ps(
402-
_mm_set_ps1(fWarpTime*0.375f),
403-
_mm_mul_ps(_mm_set_ps1(fWarpScaleInv),
404-
_mm_add_ps(
405-
_mm_mul_ps(orig_x2, _mm_set_ps1(f[2])),
406-
_mm_mul_ps(orig_y2, _mm_set_ps1(f[1]))
407-
))))),
408-
_mm_mul_ps(warp_mesh2, _mm_sinf(
409-
_mm_add_ps(
410-
_mm_set_ps1(fWarpTime*0.825f),
411-
_mm_mul_ps(_mm_set_ps1(fWarpScaleInv),
412-
_mm_add_ps(
413-
_mm_mul_ps(orig_x2, _mm_set_ps1(f[0])),
414-
_mm_mul_ps(orig_y2, _mm_set_ps1(f[3]))
415-
))))))));
416-
}
417-
}
418-
for (int x = 0; x < gx; x++)
419-
{
420-
for (int y = 0; y < gy; y+=4)
421-
{
422-
// const float u2 = this->x_mesh[x][y] - this->cx_mesh[x][y];
423-
// const float v2 = this->y_mesh[x][y] - this->cy_mesh[x][y];
424-
const __m128 u2 = _mm_sub_ps(_mm_load_ps(&this->x_mesh[x][y]),_mm_load_ps(&this->cx_mesh[x][y]));
425-
const __m128 v2 = _mm_sub_ps(_mm_load_ps(&this->y_mesh[x][y]),_mm_load_ps(&this->cy_mesh[x][y]));
426-
427-
// const float rot = this->rot_mesh[x][y];
428-
// const float cos_rot = cosf(rot);
429-
// const float sin_rot = sinf(rot);
430-
__m128 sin_rot, cos_rot;
431-
_mm_sincosf(_mm_load_ps(&this->rot_mesh[x][y]), sin_rot, cos_rot);
432-
433-
// this->x_mesh[x][y] = u2 * cos_rot - v2 * sin_rot + this->cx_mesh[x][y] - this->dx_mesh[x][y];
434-
_mm_store_ps(&this->x_mesh[x][y],
435-
_mm_add_ps(
436-
_mm_sub_ps(_mm_mul_ps(u2, cos_rot), _mm_mul_ps(v2,sin_rot)),
437-
_mm_sub_ps(_mm_load_ps(&this->cx_mesh[x][y]), _mm_load_ps(&this->dx_mesh[x][y]))
438-
));
439-
// this->y_mesh[x][y] = u2 * sin_rot + v2 * cos_rot + this->cy_mesh[x][y] - this->dy_mesh[x][y];
440-
_mm_store_ps(&this->y_mesh[x][y],
441-
_mm_add_ps(
442-
_mm_add_ps(_mm_mul_ps(u2, sin_rot), _mm_mul_ps(v2,cos_rot)),
443-
_mm_sub_ps(_mm_load_ps(&this->cy_mesh[x][y]), _mm_load_ps(&this->dy_mesh[x][y]))
444-
));
357+
);
358+
359+
// warp
360+
bool warpZero = this->warp_mesh[x][y] == 0.0 && this->warp_mesh[x][y+1] == 0.00 && this->warp_mesh[x][y+2] == 0.00 && this->warp_mesh[x][y+3] == 0.00;
361+
if (!warpZero)
362+
{
363+
// const float warp_mesh2 = this->warp_mesh[x][y] * 0.0035f;
364+
const __m128 warp_mesh2 = _mm_mul_ps(_mm_load_ps(&this->warp_mesh[x][y]), _mm_set_ps1(0.0035f));
365+
366+
// u +=
367+
// (warp_mesh * sinf(fWarpTime * 0.333f + fWarpScaleInv * (orig_x2 * f[0] - orig_y2 * f[3]))) +
368+
// (warp_mesh * cosf(fWarpTime * 0.753f - fWarpScaleInv * (orig_x2 * f[1] - orig_y2 * f[2])));
369+
u = _mm_add_ps(u,
370+
_mm_add_ps(
371+
_mm_mul_ps(warp_mesh2, _mm_sinf(
372+
_mm_add_ps(
373+
_mm_set_ps1(fWarpTime*0.333f),
374+
_mm_mul_ps(_mm_set_ps1(fWarpScaleInv),
375+
_mm_sub_ps(
376+
_mm_mul_ps(orig_x2, _mm_set_ps1(f[0])),
377+
_mm_mul_ps(orig_y2, _mm_set_ps1(f[3]))
378+
))))),
379+
_mm_mul_ps(warp_mesh2, _mm_cosf(
380+
_mm_sub_ps(
381+
_mm_set_ps1(fWarpTime*0.753f),
382+
_mm_mul_ps(_mm_set_ps1(fWarpScaleInv),
383+
_mm_sub_ps(
384+
_mm_mul_ps(orig_x2, _mm_set_ps1(f[1])),
385+
_mm_mul_ps(orig_y2, _mm_set_ps1(f[2]))
386+
)))))));
387+
388+
// v +=
389+
// (warp_mesh * cosf(fWarpTime * 0.375f - fWarpScaleInv * (orig_x2 * f[2] + orig_y2 * f[1]))) +
390+
// (warp_mesh * sinf(fWarpTime * 0.825f + fWarpScaleInv * (orig_x2 * f[0] + orig_y2 * f[3])));
391+
v = _mm_add_ps(v,
392+
_mm_add_ps(
393+
_mm_mul_ps(warp_mesh2, _mm_cosf(
394+
_mm_sub_ps(
395+
_mm_set_ps1(fWarpTime*0.375f),
396+
_mm_mul_ps(_mm_set_ps1(fWarpScaleInv),
397+
_mm_add_ps(
398+
_mm_mul_ps(orig_x2, _mm_set_ps1(f[2])),
399+
_mm_mul_ps(orig_y2, _mm_set_ps1(f[1]))
400+
))))),
401+
_mm_mul_ps(warp_mesh2, _mm_sinf(
402+
_mm_add_ps(
403+
_mm_set_ps1(fWarpTime*0.825f),
404+
_mm_mul_ps(_mm_set_ps1(fWarpScaleInv),
405+
_mm_add_ps(
406+
_mm_mul_ps(orig_x2, _mm_set_ps1(f[0])),
407+
_mm_mul_ps(orig_y2, _mm_set_ps1(f[3]))
408+
)))))));
409+
}
410+
411+
bool rotZero = this->rot_mesh[x][y] == 0.0 && this->rot_mesh[x][y+1] == 0.00 && this->rot_mesh[x][y+2] == 0.00 && this->rot_mesh[x][y+3] == 0.00;
412+
if (!rotZero)
413+
{
414+
// const float u2 = u - this->cx_mesh[x][y];
415+
// const float v2 = v - this->cy_mesh[x][y];
416+
const __m128 u2 = _mm_sub_ps(u,_mm_load_ps(&this->cx_mesh[x][y]));
417+
const __m128 v2 = _mm_sub_ps(v,_mm_load_ps(&this->cy_mesh[x][y]));
418+
419+
// const float cos_rot = cosf(this->rot_mesh[x][y]);
420+
// const float sin_rot = sinf(this->rot_mesh[x][y]);
421+
__m128 sin_rot, cos_rot;
422+
_mm_sincosf(_mm_load_ps(&this->rot_mesh[x][y]), sin_rot, cos_rot);
423+
424+
// u = u2 * cos_rot - v2 * sin_rot + this->cx_mesh[x][y];
425+
u = _mm_add_ps(
426+
_mm_sub_ps(_mm_mul_ps(u2, cos_rot), _mm_mul_ps(v2,sin_rot)),
427+
_mm_load_ps(&this->cx_mesh[x][y]));
428+
// v = u2 * sin_rot + v2 * cos_rot + this->cy_mesh[x][y];
429+
v = _mm_add_ps(
430+
_mm_add_ps(_mm_mul_ps(u2, sin_rot), _mm_mul_ps(v2,cos_rot)),
431+
_mm_load_ps(&this->cy_mesh[x][y]));
432+
}
433+
// this->x_mesh[x][y] = u - this->dx_mesh[x][y];
434+
// this->y_mesh[x][y] = v - this->dy_mesh[x][y];
435+
_mm_store_ps(&this->x_mesh[x][y], _mm_sub_ps(u, _mm_load_ps(&this->dx_mesh[x][y])));
436+
_mm_store_ps(&this->y_mesh[x][y], _mm_sub_ps(v, _mm_load_ps(&this->dy_mesh[x][y])));
445437
}
446438
}
447439
}
@@ -494,7 +486,7 @@ void PresetOutputs::Initialize ( int _gx, int _gy )
494486
float origx = x / (float) (gx - 1);
495487
float origy = -((y / (float) (gy - 1)) - 1);
496488

497-
rad_mesh[x][y]=hypot ( ( origx-.5 ) *2, ( origy-.5 ) *2 ) * .7071067;
489+
rad_mesh[x][y]=hypot ( ( origx-.5 ) *2, ( origy-.5 ) *2 );
498490
orig_x[x][y] = (origx - .5) * 2;
499491
orig_y[x][y] = (origy - .5) * 2;
500492
}

0 commit comments

Comments
 (0)