More 24bit optimizations (was: a just another stupid newbie :)
Marcelo E. Magallon
mmagallo@debian.org
Sat, 3 Jan 2004 13:47:01 -0600
--c3bfwLpm8qysLVxt
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
> The attached patch adds more optimizations than the last two
> versions, and when the composite manager tilesize is set to 128x128,
> I can get 20fps playback of a 352x240 video with it (only a wm
> running tho, not an entire de - it does also have the effect of
making KDE useable, tho, which is a nice bonus - i'm typing this in
> it, in fact :)
Hmm... you might be interested in giving the following code a look.
It's an implementation of the over operator (in the Porter & Duff
sense) in MMX (for both x86 and amd64). There's also a blend operator
for x86. The amd64 version does 2 pixels per call (which means you have
to do something extra in case you have one extra pixel). Use the macros
like this:
void blend_p_rgba_int_mmx(pixel_t *a, pixel_t *b, const size_t N)
{
for(size_t i = 0; i < N; ++i, ++a, ++b)
{
OVER_8F(a, b, a);
}
OVER_8F_FINISH();
}
The integer (serial) version of that code composites 58 MP/s on an
Athlon XP 1600+. This code does 89 MP/s on the same machine. The
blend code is a bit faster (94 MP/s). N MP/s means you use N pixels
as input and produce N/2 pixels of output.
If you use a constant alpha factor, you might just load that into a
register and save yourself the whole trouble with unpacking and loading
the alpha value in a register for each pixel. There's room for some
optimization wrt instruction latency and the like.
This uses GCC extended assembly syntax, which means AT&T syntax.
HTH,
Marcelo
--c3bfwLpm8qysLVxt
Content-Type: text/x-chdr; charset=us-ascii
Content-Disposition: attachment; filename="blend_mmx.h"
/* Copyright (C) 2001-2003 Marcelo E. Magallon <marcelo.magallon@bigfoot.com> */
/* blend_mmx.h
*
* Over operator implementation in MMX, this does:
*
* r = a + (1-aalpha)*b/255
*
* where the second factor is computed using:
*
* x/255 = (x + x/256 + 128)/256
*
* Blend operator implementation in MMX, this computes:
*
* r = aalpha*a/255 + (1-aalpha)*b/255
*
* as:
*
* r = (a-b)*aalpha/255 + b
*/
#ifndef _BLEND_MMX_H_
#define _BLEND_MMX_H_
/*
 * OVER_8F(a, b, r): Porter & Duff "over" operator for ONE 32-bit pixel,
 * using MMX plus pshufw (MMX extensions / SSE integer ops):
 *
 *     *r = *a + (255 - alpha(*a)) * (*b) / 255    (per channel, saturated)
 *
 * The division by 255 uses the approximation from the file header:
 * x/255 = (x + x/256 + 128)/256.
 *
 * Alpha is taken as word 0 after unpacking, i.e. the pixel's LOW byte —
 * this assumes an alpha-in-low-byte pixel layout; verify against the
 * caller's pixel format.
 *
 * NOTE(review): the "=p"/"p" constraints pass raw addresses and the asm
 * declares neither a "memory" clobber nor the mm0-mm4 registers it
 * overwrites; this relies on the compiler not caching *a/*b/*r (or FP/MMX
 * state) across the statement. Consider "r" operands plus a "memory"
 * clobber — TODO confirm against GCC extended-asm documentation.
 *
 * Call OVER_8F_FINISH() once after the pixel loop to issue EMMS.
 */
#define OVER_8F(a,b,r) \
__asm__ volatile( \
"\n\t pxor %%mm2, %%mm2" /* mm2 = 0, zero source for unpacking */ \
\
"\n\t mov $128, %%eax" /* rounding bias, used below */ \
"\n\t movd %%eax, %%mm4" \
"\n\t pshufw $0, %%mm4, %%mm4" /* mm4 = 128 in all 4 words */ \
\
"\n\t movd (%1), %%mm0" /* copy pixel a to mm0 */ \
\
"\n\t movd (%2), %%mm3" /* copy pixel b to mm3 */ \
"\n\t punpcklbw %%mm2, %%mm3" /* 16-bit expand b */ \
\
"\n\t pcmpeqb %%mm1, %%mm1" /* fill mm1 with 1 bits (0xFF bytes) */ \
"\n\t pxor %%mm0, %%mm1" /* mm1 = ~a = 255 - a, per byte */ \
"\n\t punpcklbw %%mm2, %%mm1" /* 16-bit expand 255-a */ \
"\n\t pshufw $0, %%mm1, %%mm1" /* broadcast word 0 = 255-alpha (assumes alpha in low byte) */ \
\
"\n\t pmullw %%mm1, %%mm3" /* x = (255-aalpha)*b */ \
"\n\t paddusw %%mm4, %%mm3" /* x += 128 (rounding) */ \
"\n\t movq %%mm3, %%mm1" /* y = x */ \
"\n\t psrlw $8, %%mm1" /* y /= 256 */ \
"\n\t paddusw %%mm3, %%mm1" /* y = y + x */ \
"\n\t psrlw $8, %%mm1" /* y /= 256  => y = x/255 approx */ \
\
"\n\t packuswb %%mm1, %%mm1" /* pack result back to bytes */ \
\
"\n\t paddusb %%mm1, %%mm0" /* saturating add: a + (1-aalpha)*b */ \
"\n\t movd %%mm0, (%0)" /* store result pixel */ \
: "=p" (r) /* %0 */ \
: "p" (a), /* %1 */ \
"p" (b) /* %2 */ \
: "eax" \
)
/*
 * OVER_8F_FINISH(): clear the MMX state (EMMS) so subsequent x87
 * floating-point code works correctly.  Call ONCE after the pixel loop,
 * not per pixel — EMMS is expensive.
 */
#define OVER_8F_FINISH() \
__asm__ volatile("\n\t emms")
/* r = p*a/255 + q(1-a)/255 = (p-q)*a/255 + q */
#define BLEND_8F(a,b,r) \
__asm__ volatile( \
"\n\t movd (%1), %%mm0" /* copy a to mm0 */ \
"\n\t pxor %%mm3, %%mm3" \
"\n\t punpcklbw %%mm3, %%mm0" /* unpack a in mm0 */ \
\
"\n\t pshufw $0, %%mm0, %%mm1" /* 128x4 in mm1 */ \
\
"\n\t movd (%2), %%mm2" /* copy b to mm2 */ \
"\n\t punpcklbw %%mm3, %%mm2" /* unpack b in mm2 */ \
\
"\n\t psubw %%mm2, %%mm0" /* a = a - b */ \
\
"\n\t movq %%mm0, %%mm3" \
\
"\n\t pmullw %%mm1, %%mm0" /* (a - b)*aa in mm0 */ \
\
"\n\t psrlw $15, %%mm3" \
"\n\t psllw $8, %%mm3" /* mm3 = 256 if a - b < 0 */ \
"\n\t mov $128, %%eax" \
"\n\t movd %%eax, %%mm1" \
"\n\t pshufw $0, %%mm1, %%mm1" /* 128x4 in mm1 */ \
\
"\n\t psubw %%mm3, %%mm1" /* sign(a-b) * 128 in mm1 */ \
\
"\n\t paddw %%mm1, %%mm0" /* mm0 = (a-b)*aa +- 128 */ \
\
"\n\t movq %%mm0, %%mm1" /* mm0 = mm1 = x */ \
"\n\t psrlw $8, %%mm1" /* mm1 = x/256 */ \
"\n\t paddw %%mm1, %%mm0" /* mm0 = x + x/256 */ \
"\n\t psrlw $8, %%mm0" /* mm0 = (x + x/256)/256 */ \
\
"\n\t paddw %%mm2, %%mm0" /* mm0 = (a-b)*aa + b */ \
\
"\n\t psllw $8, %%mm0" \
"\n\t psrlw $8, %%mm0" /* mask upper byte in mm0 */ \
"\n\t packuswb %%mm0, %%mm0" /* pack result */ \
"\n\t movd %%mm0, (%0)" /* copy result to memory */ \
: "=p" (r) /* %0 */ \
: "p" (a), /* %1 */ \
"p" (b) /* %2 */ \
: "eax" \
)
/*
 * BLEND_8F_FINISH(): clear the MMX state (EMMS) so subsequent x87
 * floating-point code works correctly.  Call ONCE after the pixel loop.
 *
 * Bug fix: a stray "#endif" preceded this macro, closing the
 * _BLEND_MMX_H_ include guard early and leaving the guard's real
 * "#endif" at the end of the file unmatched, so the header failed to
 * preprocess.  The guard is now closed only once, at end of file.
 */
#define BLEND_8F_FINISH() \
__asm__ volatile("\n\t emms")
#if defined(__x86_64__)
#define HAS_BLEND_TWO_PIXELS_PER_CLOCK
/* No per-loop setup needed on amd64; kept for call-site symmetry. */
#define OVER_8F_INIT()
/*
 * OVER_8F_2PPC(a, b, r): same "over" operator as OVER_8F but processes
 * TWO 32-bit pixels per invocation using SSE2 xmm registers.  "rex64
 * movd" is a 64-bit move (i.e. movq), loading/storing both pixels at
 * once.  The caller must handle a trailing odd pixel separately.
 *
 * After punpcklbw each pixel occupies one 64-bit half of the register,
 * so pshuflw/pshufhw broadcast each pixel's own 255-alpha (word 0 of its
 * half) across that pixel's four channel words — assumes alpha in the
 * pixel's low byte; verify against the caller's pixel format.
 *
 * NOTE(review): same caveats as the 32-bit macros — "=p"/"p" constraints
 * with no "memory" clobber and undeclared xmm clobbers; TODO confirm.
 * EMMS in the _FINISH macro is harmless here but only strictly needed
 * for MMX (mm) registers, not xmm — presumably kept for symmetry.
 */
#define OVER_8F_2PPC(a,b,r) \
__asm__ volatile( \
"\n\t rex64 movd (%1), %%xmm0" /* x0 = aa' (two a pixels) */ \
"\n\t rex64 movd (%2), %%xmm1" /* x1 = bb' (two b pixels) */ \
\
"\n\t mov $0x00800080, %%rax" /* rounding bias, used later */ \
"\n\t movd %%rax, %%xmm2" \
"\n\t pshufd $0, %%xmm2, %%xmm2" /* x2 = 128 in all 8 words */ \
\
"\n\t pxor %%xmm3, %%xmm3" /* clear xmm3 */ \
"\n\t pxor %%xmm4, %%xmm4" /* clear xmm4 */ \
\
"\n\t punpcklbw %%xmm3, %%xmm0" /* 16-bit expand a pixels */ \
"\n\t punpcklbw %%xmm3, %%xmm1" /* 16-bit expand b pixels */ \
\
"\n\t pcmpeqd %%xmm3, %%xmm3" /* fill xmm3 with 1 bits */ \
"\n\t punpcklbw %%xmm4, %%xmm3" /* construct 16-bit 255 words */ \
"\n\t pxor %%xmm0, %%xmm3" /* xmm3 = 255 - a = 1 - alpha */ \
"\n\t pshufhw $0, %%xmm3, %%xmm3" /* broadcast each pixel's 255-alpha */ \
"\n\t pshuflw $0, %%xmm3, %%xmm3" /* across its own 4 words */ \
\
"\n\t pmullw %%xmm3, %%xmm1" /* x1 = (1-a)*b */ \
"\n\t paddusw %%xmm2, %%xmm1" /* x1 += 128 (rounding) */ \
"\n\t movdqa %%xmm1, %%xmm2" /* x2 = x1 */ \
"\n\t psrlw $8, %%xmm2" /* x2 /= 256 */ \
"\n\t paddusw %%xmm2, %%xmm1" /* x1 += x1/256 */ \
"\n\t psrlw $8, %%xmm1" /* x1 /= 256  => x1/255 approx */ \
"\n\t packuswb %%xmm1, %%xmm1" /* saturate & pack result */ \
\
"\n\t packuswb %%xmm0, %%xmm0" /* XXX: repack a (was expanded above) */ \
\
"\n\t paddusb %%xmm1, %%xmm0" /* x0 += x1, saturating */ \
"\n\t rex64 movd %%xmm0, (%0)" /* store both result pixels */ \
: "=p" (r) /* %0 */ \
: "p" (a), /* %1 */ \
"p" (b) /* %2 */ \
: "rax" \
)
/* Clear MMX state after the loop (see NOTE above re: xmm vs mm). */
#define OVER_8F_2PPC_FINISH() \
__asm__ volatile("\n\t emms")
#endif
#endif /* _BLEND_MMX_H_ */
--c3bfwLpm8qysLVxt--