More 24bit optimizations (was: a just another stupid newbie :)

Marcelo E. Magallon mmagallo@debian.org
Sat, 3 Jan 2004 13:47:01 -0600


 > The attached patch adds more optimizations than the last two
 > versions, and when the composite manager tile size is set to 128x128,
 > I can get 20 fps playback of a 352x240 video with it (only a WM
 > running, though, not an entire DE - it also has the effect of making
 > KDE usable, though, which is a nice bonus - I'm typing this in it, in
 > fact :)

 Hmm...  you might be interested in giving the following code a look.
 It's an implementation of the over operator (in the Porter & Duff
 sense) in MMX, for both x86 and amd64.  There's also a blend operator
 for x86.  The amd64 version does 2 pixels per call, which means you
 have to do something extra in case you have an odd pixel left over
 (see the sketch below the loop).  Use the macros like this:

    /* pixel_t is assumed to be a 32-bit RGBA pixel, e.g. uint32_t */
    void over_p_rgba_mmx(pixel_t *a, pixel_t *b, const size_t N)
    {
        for (size_t i = 0; i < N; ++i, ++a, ++b)
        {
            OVER_8F(a, b, a);   /* a = a over b, written back into a */
        }
        OVER_8F_FINISH();       /* emms: leave MMX state             */
    }
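
 On amd64 the two-pixel macro wants the same kind of loop, stepping two
 pixels at a time, with the single-pixel macro mopping up a trailing
 odd pixel.  A minimal sketch (the function name is made up for
 illustration, and pixel_t is again assumed to be 32 bits wide):

    void over_p_rgba_2ppc(pixel_t *a, pixel_t *b, const size_t N)
    {
        size_t i = 0;
        OVER_8F_INIT();
        for (; i + 2 <= N; i += 2, a += 2, b += 2)
        {
            OVER_8F_2PPC(a, b, a);   /* composite two pixels per call */
        }
        if (i < N)                   /* odd N: one pixel left over    */
        {
            OVER_8F(a, b, a);
        }
        OVER_8F_2PPC_FINISH();       /* emms                          */
    }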

 The integer (serial) version of that code composites 58 MP/s on an
 Athlon XP 1600+.  This code does 89 MP/s on the same machine.  The
 blend code is a bit faster (94 MP/s).  N MP/s means the code reads N
 pixels of input (N/2 from each source image) and produces N/2 pixels
 of output; at 89 MP/s that is about 44.5 million composited pixels,
 or roughly 500 frames of 352x240 video, per second.

 If you use a constant alpha factor, you can just load it into a
 register once and save yourself the trouble of unpacking and loading
 the alpha value for each pixel.  There's also room for some
 optimization with respect to instruction latency and the like.
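
 For illustration, the same hoisting in a serial C version would look
 like this (a sketch; blend_const_alpha is a made-up name, and div255()
 is the x/255 rounding trick described in the attached header):

    #include <stdint.h>
    #include <stddef.h>

    static inline uint32_t div255(uint32_t x)   /* round(x/255) */
    {
        x += 128;
        return (x + (x >> 8)) >> 8;
    }

    void blend_const_alpha(uint8_t *dst, const uint8_t *src,
                           size_t nbytes, uint32_t alpha) /* 0..255 */
    {
        const uint32_t inv = 255 - alpha;  /* hoisted out of the loop */
        for (size_t i = 0; i < nbytes; ++i)
            dst[i] = (uint8_t)div255(dst[i] * alpha + src[i] * inv);
    }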

 The attached code uses GCC's extended inline assembly, which means
 AT&T syntax.

 HTH,

 Marcelo

[attachment: blend_mmx.h]

/* Copyright (C) 2001-2003 Marcelo E. Magallon <marcelo.magallon@bigfoot.com> */

/* blend_mmx.h
 *
 * Over operator implementation in MMX, this computes (alpha values in
 * the 0..255 range):
 *
 *       r = a + (255 - aalpha)*b/255
 *
 * where the division by 255 is done using
 *
 *       x/255 = ((x + 128) + (x + 128)/256)/256
 *
 * which rounds correctly for x <= 255*255.
 *
 * Blend operator implementation in MMX, this computes:
 *
 *       r = (aalpha*a + (255 - aalpha)*b)/255
 *
 * in the equivalent form:
 *
 *       r = (a - b)*aalpha/255 + b
 */

#ifndef _BLEND_MMX_H_
#define _BLEND_MMX_H_
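
/*
 * For reference, a scalar sketch of what OVER_8F computes per byte
 * (added for illustration; these helpers are not part of the original
 * attachment).  The rounding is the same trick the asm uses.
 */
#include <stdint.h>

static inline uint32_t div255_ref(uint32_t x)  /* round(x/255), exact
                                                  for x <= 255*255    */
{
    x += 128;
    return (x + (x >> 8)) >> 8;
}

static inline uint8_t over_byte_ref(uint8_t a, uint8_t aalpha, uint8_t b)
{
    uint32_t r = a + div255_ref((255u - aalpha) * b);
    return (uint8_t)(r > 255u ? 255u : r);     /* saturate like paddusb */
}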

#define OVER_8F(a,b,r)                                                       \
    __asm__ volatile(                                                        \
            "\n\t pxor          %%mm2, %%mm2"   /* clear mm2              */ \
                                                                             \
            "\n\t mov            $128, %%eax"   /* need this later        */ \
            "\n\t movd          %%eax, %%mm4"                                \
            "\n\t pshufw    $0, %%mm4, %%mm4"   /* copy 128 to all words  */ \
                                                                             \
            "\n\t movd           (%1), %%mm0"   /* copy a to mm0          */ \
                                                                             \
            "\n\t movd           (%2), %%mm3"   /* copy b to mm3          */ \
            "\n\t punpcklbw     %%mm2, %%mm3"   /* 16-bit expand b        */ \
                                                                             \
            "\n\t pcmpeqb       %%mm1, %%mm1"   /* fill mm1 with 1's      */ \
            "\n\t pxor          %%mm0, %%mm1"   /* 1 - aalpha             */ \
            "\n\t punpcklbw     %%mm2, %%mm1"   /* 16-bit expand 1-aa     */ \
            "\n\t pshufw    $0, %%mm1, %%mm1"   /* copy 1-aa to all words */ \
                                                                             \
            "\n\t pmullw        %%mm1, %%mm3"   /* x = (1-aalpha)*b       */ \
            "\n\t paddusw       %%mm4, %%mm3"   /* x += 128               */ \
            "\n\t movq          %%mm3, %%mm1"   /* y = x                  */ \
            "\n\t psrlw            $8, %%mm1"   /* y /= 256               */ \
            "\n\t paddusw       %%mm3, %%mm1"   /* y = y + x              */ \
            "\n\t psrlw            $8, %%mm1"   /* y /= 256               */ \
                                                                             \
            "\n\t packuswb      %%mm1, %%mm1"   /* pack result            */ \
                                                                             \
            "\n\t paddusb       %%mm1, %%mm0"   /* add a and (1-aalpha)b  */ \
            "\n\t movd          %%mm0,  (%0)"   /* copy result to memory  */ \
            : "=p" (r)                          /* %0 */                     \
            : "p" (a),                          /* %1 */                     \
              "p" (b)                           /* %2 */                     \
            : "eax"                                                          \
            )

#define OVER_8F_FINISH()                                                     \
    __asm__ volatile("\n\t emms")

/* r = (aalpha*a + (255 - aalpha)*b)/255 = (a - b)*aalpha/255 + b */

#define BLEND_8F(a,b,r)                                                      \
    __asm__ volatile(                                                        \
            "\n\t movd           (%1), %%mm0"   /* copy a to mm0          */ \
            "\n\t pxor          %%mm3, %%mm3"                                \
            "\n\t punpcklbw     %%mm3, %%mm0"   /* unpack a in mm0        */ \
                                                                             \
            "\n\t pshufw    $0, %%mm0, %%mm1"   /* 128x4 in mm1           */ \
                                                                             \
            "\n\t movd           (%2), %%mm2"   /* copy b to mm2          */ \
            "\n\t punpcklbw     %%mm3, %%mm2"   /* unpack b in mm2        */ \
                                                                             \
            "\n\t psubw         %%mm2, %%mm0"   /* a = a - b              */ \
                                                                             \
            "\n\t movq          %%mm0, %%mm3"                                \
                                                                             \
            "\n\t pmullw        %%mm1, %%mm0"   /* (a - b)*aa in mm0      */ \
                                                                             \
            "\n\t psrlw           $15, %%mm3"                                \
            "\n\t psllw            $8, %%mm3"   /* mm3 = 256 if a - b < 0 */ \
            "\n\t mov            $128, %%eax"                                \
            "\n\t movd          %%eax, %%mm1"                                \
            "\n\t pshufw    $0, %%mm1, %%mm1"   /* 128x4 in mm1           */ \
                                                                             \
            "\n\t psubw         %%mm3, %%mm1"   /* sign(a-b) * 128 in mm1 */ \
                                                                             \
            "\n\t paddw         %%mm1, %%mm0"   /* mm0 = (a-b)*aa +- 128  */ \
                                                                             \
            "\n\t movq          %%mm0, %%mm1"   /* mm0 = mm1 = x          */ \
            "\n\t psrlw            $8, %%mm1"   /* mm1 = x/256            */ \
            "\n\t paddw         %%mm1, %%mm0"   /* mm0 = x + x/256        */ \
            "\n\t psrlw            $8, %%mm0"   /* mm0 = (x + x/256)/256  */ \
                                                                             \
            "\n\t paddw         %%mm2, %%mm0"   /* mm0 = (a-b)*aa + b     */ \
                                                                             \
            "\n\t psllw            $8, %%mm0"                                \
            "\n\t psrlw            $8, %%mm0"   /* mask upper byte in mm0 */ \
            "\n\t packuswb      %%mm0, %%mm0"   /* pack result            */ \
            "\n\t movd          %%mm0,  (%0)"   /* copy result to memory  */ \
            : "=p" (r)                          /* %0 */                     \
            : "p" (a),                          /* %1 */                     \
              "p" (b)                           /* %2 */                     \
            : "eax"                                                          \
            )

#define BLEND_8F_FINISH()                                                    \
    __asm__ volatile("\n\t emms")
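
/*
 * Scalar sketch of the blend arithmetic (added for illustration, not
 * part of the original attachment).  The asm evaluates
 * (a - b)*aalpha/255 + b with a +/-128 sign correction; the direct
 * weighted sum below is the same quantity:
 */
static inline uint8_t blend_byte_ref(uint8_t a, uint8_t b, uint8_t aalpha)
{
    uint32_t x = (uint32_t)a * aalpha + (uint32_t)b * (255u - aalpha);
    x += 128;                                  /* round(x/255) */
    return (uint8_t)((x + (x >> 8)) >> 8);
}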

#if defined(__x86_64__)
#define HAS_BLEND_TWO_PIXELS_PER_CLOCK
#define OVER_8F_INIT()

#define OVER_8F_2PPC(a,b,r)                                                  \
    __asm__ volatile(                                                        \
            "\n\t rex64 movd     (%1), %%xmm0"  /* x0 = aa'               */ \
            "\n\t rex64 movd     (%2), %%xmm1"  /* x1 = bb'               */ \
                                                                             \
            "\n\t mov      $0x00800080, %%rax"  /* need this later        */ \
            "\n\t movd          %%rax, %%xmm2"                               \
            "\n\t pshufd   $0, %%xmm2, %%xmm2"  /* x2 = 00 80 x 8         */ \
                                                                             \
            "\n\t pxor         %%xmm3, %%xmm3"  /* clear xmm3             */ \
            "\n\t pxor         %%xmm4, %%xmm4"  /* clear xmm4             */ \
                                                                             \
            "\n\t punpcklbw    %%xmm3, %%xmm0"  /* put a pixels in place  */ \
            "\n\t punpcklbw    %%xmm3, %%xmm1"  /* put b pixels in place  */ \
                                                                             \
            "\n\t pcmpeqd      %%xmm3, %%xmm3"  /* fill xmm3 with 1's     */ \
            "\n\t punpcklbw    %%xmm4, %%xmm3"  /* construct 16-bit 255   */ \
            "\n\t pxor         %%xmm0, %%xmm3"  /* xmm3 = 1 - alpha       */ \
            "\n\t pshufhw  $0, %%xmm3, %%xmm3"  /* put 1 - alpha on all   */ \
            "\n\t pshuflw  $0, %%xmm3, %%xmm3"  /* words                  */ \
                                                                             \
            "\n\t pmullw       %%xmm3, %%xmm1"  /* x1 = (1-a)*b           */ \
            "\n\t paddusw      %%xmm2, %%xmm1"  /* x1 += 128              */ \
            "\n\t movdqa       %%xmm1, %%xmm2"  /* x2 = x1                */ \
            "\n\t psrlw            $8, %%xmm2"  /* x2 /= 256              */ \
            "\n\t paddusw      %%xmm2, %%xmm1"  /* x1 += x1/256           */ \
            "\n\t psrlw            $8, %%xmm1"  /* x1 /= 256              */ \
            "\n\t packuswb     %%xmm1, %%xmm1"  /* saturate & pack result */ \
                                                                             \
            "\n\t packuswb     %%xmm0, %%xmm0"  /* XXX: pack a again      */ \
                                                                             \
            "\n\t paddusb      %%xmm1, %%xmm0"  /* x0 += x1               */ \
            "\n\t rex64 movd   %%xmm0,   (%0)"  /* copy result to memory  */ \
            : "=p" (r)                          /* %0 */                     \
            : "p" (a),                          /* %1 */                     \
              "p" (b)                           /* %2 */                     \
            : "rax"                                                          \
            )

#define OVER_8F_2PPC_FINISH()                                                \
    __asm__ volatile("\n\t emms")

#endif

#endif /* _BLEND_MMX_H_ */
