/*
 * Modified for QT style "Polymer"
 * Copyright 2004-2005, Adam Jakubek <mig21@static.int.pl>
 *
 * Copyright 2003, Sandro Giessl <ceebx@users.sourceforge.net>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License version 2 as published by the Free Software Foundation.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public License
 * along with this library; see the file COPYING.LIB.  If not, write to
 * the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <qcolor.h>
#include <qimage.h>
#include <qpainter.h>
#include <math.h>
#include <stdlib.h>
#include "kcpuinfo.h"
#include "misc.h"
#include "../config.h"

/**
 * Blends two colors channel by channel.
 *
 * Each of the red, green and blue channels of the result is
 * (bgColor * a + fgColor * (255 - a)) / 255, i.e. @p a is the weight given
 * to @p bgColor: a == 255 yields bgColor, a == 0 yields fgColor.
 *
 * @param bgColor color weighted by @p a
 * @param fgColor color weighted by 255 - @p a
 * @param a       blend weight; values outside 0..255 are clamped
 * @return the blended color
 */
QColor alphaBlendColors(const QColor &bgColor, const QColor &fgColor, const int a)
{
    const QRgb bg = bgColor.rgb();
    const QRgb fg = fgColor.rgb();

    int alpha = a;
    if (alpha > 255) alpha = 255;
    if (alpha < 0)   alpha = 0;
    const int inv_alpha = 255 - alpha;

    // Sum both weighted terms before dividing: dividing each term on its own
    // truncates twice, so blending a color with itself could lose one step.
    return QColor( qRgb((qRed(fg)   * inv_alpha + qRed(bg)   * alpha) / 255,
                        (qGreen(fg) * inv_alpha + qGreen(bg) * alpha) / 255,
                        (qBlue(fg)  * inv_alpha + qBlue(bg)  * alpha) / 255) );
}


// Gradient routines taken from KDE and modified slightly

/*
 *   Copyright (C) 1998, 1999, 2001, 2002 Daniel M. Duley <mosfet@kde.org>
 *   (C) 1998, 1999 Christian Tibirna <ctibirna@total.net>
 *   (C) 1998, 1999 Dirk A. Mueller <mueller@kde.org>
 *   (C) 1999 Geert Jansen <g.t.jansen@stud.tue.nl>
 *   (C) 2000 Josef Weidendorfer <weidendo@in.tum.de>
 *   (C) 2004 Zack Rusin <zack@kde.org>
 */

QImage imageGradient(const QSize &size, const QColor &ca,
	const QColor &cb, GradientType eff, int ncols);

QImage& imageDither(QImage &img, const QColor *palette, int size);

/**
 * Fills @p pixmap with a gradient running from @p ca to @p cb.
 *
 * Vertical and horizontal gradients on pixmaps deeper than 8 bits are drawn
 * directly, one line per row/column, using 16.16 fixed-point interpolation.
 * Every other combination is rendered through imageGradient() and converted
 * into the pixmap.
 *
 * @param pixmap pixmap to paint into; returned unchanged when it is empty
 * @param ca     start color
 * @param cb     end color
 * @param eff    gradient type
 * @param ncols  forwarded to imageGradient() for the image-based path
 * @return reference to @p pixmap
 */
QPixmap& pixmapGradient(QPixmap &pixmap, const QColor &ca,
	const QColor &cb, GradientType eff, int ncols)
{
    // Nothing to paint on an empty pixmap; returning early also keeps the
    // fixed-point step computation below from dividing by zero.
    if (pixmap.width() <= 0 || pixmap.height() <= 0)
        return pixmap;

    if(pixmap.depth() > 8 &&
       (eff == VerticalGradient || eff == HorizontalGradient)) {

        const int rca = ca.red();
        const int gca = ca.green();
        const int bca = ca.blue();

        const int rDiff = cb.red()   - rca;
        const int gDiff = cb.green() - gca;
        const int bDiff = cb.blue()  - bca;

        // Channel accumulators in 16.16 fixed point
        int rl = rca << 16;
        int gl = gca << 16;
        int bl = bca << 16;

        // Number of lines along the gradient axis and the per-line increment
        const int steps = (eff == VerticalGradient) ? pixmap.height() : pixmap.width();
        const int unit  = (1<<16) / steps;

        const int rcdelta = unit * rDiff;
        const int gcdelta = unit * gDiff;
        const int bcdelta = unit * bDiff;

        QPainter p(&pixmap);

        // these for-loops could be merged, but the if's in the inner loop
        // would make it slow
        switch(eff) {
        case VerticalGradient:
            for ( int y = 0; y < pixmap.height(); y++ ) {
                rl += rcdelta;
                gl += gcdelta;
                bl += bcdelta;

                p.setPen(QColor(rl>>16, gl>>16, bl>>16));
                p.drawLine(0, y, pixmap.width()-1, y);
            }
            break;
        case HorizontalGradient:
            for( int x = 0; x < pixmap.width(); x++) {
                rl += rcdelta;
                gl += gcdelta;
                bl += bcdelta;

                p.setPen(QColor(rl>>16, gl>>16, bl>>16));
                p.drawLine(x, 0, x, pixmap.height()-1);
            }
            break;
        default:
            ;
        }
    }
    else {
        QImage image = imageGradient(pixmap.size(), ca, cb, eff, ncols);
        pixmap.convertFromImage(image);
    }

    return pixmap;
}

/**
 * Renders a gradient between two colors into a new 32-bit image.
 *
 * Vertical and horizontal gradients are interpolated in 16.16 fixed point.
 * Diagonal, cross-diagonal, rectangle, pyramid, pipe-cross and elliptic
 * gradients are composed from per-column and per-row lookup tables.  Any
 * other effect value leaves the pixel data unwritten.
 *
 * @param size   dimensions of the image to create
 * @param ca     start color
 * @param cb     end color
 * @param eff    gradient type
 * @param ncols  number of palette entries to dither to when the default
 *               pixmap depth is below 15 bits; 0 disables dithering and
 *               values outside 2..256 are replaced by 3
 * @return the rendered image; it is returned without being written to when
 *         @p size is empty or invalid, or the image could not be allocated
 */
QImage imageGradient(const QSize &size, const QColor &ca,
	const QColor &cb, GradientType eff, int ncols)
{
    int rDiff, gDiff, bDiff;
    int rca, gca, bca, rcb, gcb, bcb;

    QImage image(size, 32);

    // Reject empty and invalid (negative) sizes as well as a failed image
    // allocation: the code below divides by the dimensions, sizes heap
    // tables from them and writes straight into the scanlines.
    if (size.width() <= 0 || size.height() <= 0 || image.isNull()) {
      return image;
    }

    int x, y;

    rDiff = (rcb = cb.red())   - (rca = ca.red());
    gDiff = (gcb = cb.green()) - (gca = ca.green());
    bDiff = (bcb = cb.blue())  - (bca = ca.blue());

    if( eff == VerticalGradient || eff == HorizontalGradient ){

        uint *p;
        uint rgb;

        // Channel accumulators in 16.16 fixed point
        int rl = rca << 16;
        int gl = gca << 16;
        int bl = bca << 16;

        if( eff == VerticalGradient ) {

            const int unit = (1<<16) / size.height();
            const int rcdelta = unit * rDiff;
            const int gcdelta = unit * gDiff;
            const int bcdelta = unit * bDiff;

            for ( y = 0; y < size.height(); y++ ) {
                p = (uint *) image.scanLine(y);

                rl += rcdelta;
                gl += gcdelta;
                bl += bcdelta;

                rgb = qRgb( (rl>>16), (gl>>16), (bl>>16) );

                // Every pixel of a row shares the same color
                for( x = 0; x < size.width(); x++ ) {
                    *p = rgb;
                    p++;
                }
            }

        }
        else {                  // must be HorizontalGradient

            unsigned int *o_src = (unsigned int *)image.scanLine(0);
            unsigned int *src = o_src;

            const int unit = (1<<16) / size.width();
            const int rcdelta = unit * rDiff;
            const int gcdelta = unit * gDiff;
            const int bcdelta = unit * bDiff;

            // Compute the first row once ...
            for( x = 0; x < size.width(); x++) {

                rl += rcdelta;
                gl += gcdelta;
                bl += bcdelta;

                *src++ = qRgb( (rl>>16), (gl>>16), (bl>>16));
            }

            // ... then replicate it into all remaining rows.
            // Believe it or not, manually copying in a for loop is faster
            // than calling memcpy for each scanline (on the order of ms...).
            // I think this is due to the function call overhead (mosfet).

            for (y = 1; y < size.height(); ++y) {

                p = (unsigned int *)image.scanLine(y);
                src = o_src;
                for(x=0; x < size.width(); ++x)
                    *p++ = *src++;
            }
        }
    }

    else {

        float rfd, gfd, bfd;
        float rd = rca, gd = gca, bd = bca;

        // Per-column (xtable) and per-row (ytable) contributions for the
        // red [0], green [1] and blue [2] channels.
        unsigned char *xtable[3];
        unsigned char *ytable[3];

        unsigned int w = size.width(), h = size.height();
        xtable[0] = new unsigned char[w];
        xtable[1] = new unsigned char[w];
        xtable[2] = new unsigned char[w];
        ytable[0] = new unsigned char[h];
        ytable[1] = new unsigned char[h];
        ytable[2] = new unsigned char[h];
        // The diagonal code spreads half of the color difference over each axis
        w*=2, h*=2;

        if ( eff == DiagonalGradient || eff == CrossDiagonalGradient) {
            // Diagonal dgradient code inspired by BlackBox (mosfet)
            // BlackBox dgradient is (C) Brad Hughes, <bhughes@tcac.net> and
            // Mike Cole <mike@mydot.com>.

            rfd = (float)rDiff/w;
            gfd = (float)gDiff/w;
            bfd = (float)bDiff/w;

            int dir;
            for (x = 0; x < size.width(); x++, rd+=rfd, gd+=gfd, bd+=bfd) {
                // The cross-diagonal variant mirrors the column table
                dir = eff == DiagonalGradient? x : size.width() - x - 1;
                xtable[0][dir] = (unsigned char) rd;
                xtable[1][dir] = (unsigned char) gd;
                xtable[2][dir] = (unsigned char) bd;
            }
            rfd = (float)rDiff/h;
            gfd = (float)gDiff/h;
            bfd = (float)bDiff/h;
            rd = gd = bd = 0;
            for (y = 0; y < size.height(); y++, rd+=rfd, gd+=gfd, bd+=bfd) {
                // The row offsets are negative when cb is darker than ca.
                // Converting a negative float directly to unsigned char is
                // undefined, so go through int: the int -> unsigned char
                // conversion wraps modulo 256, which is what the masked sum
                // in qRgb() below relies on.
                ytable[0][y] = (unsigned char) (int) rd;
                ytable[1][y] = (unsigned char) (int) gd;
                ytable[2][y] = (unsigned char) (int) bd;
            }

            for (y = 0; y < size.height(); y++) {
                unsigned int *scanline = (unsigned int *)image.scanLine(y);
                for (x = 0; x < size.width(); x++) {
                    scanline[x] = qRgb(xtable[0][x] + ytable[0][y],
                                       xtable[1][x] + ytable[1][y],
                                       xtable[2][x] + ytable[2][y]);
                }
            }
        }

        else if (eff == RectangleGradient ||
                 eff == PyramidGradient ||
                 eff == PipeCrossGradient ||
                 eff == EllipticGradient)
        {
            int rSign = rDiff>0? 1: -1;
            int gSign = gDiff>0? 1: -1;
            int bSign = bDiff>0? 1: -1;

            rfd = (float)rDiff / size.width();
            gfd = (float)gDiff / size.width();
            bfd = (float)bDiff / size.width();

            rd = (float)rDiff/2;
            gd = (float)gDiff/2;
            bd = (float)bDiff/2;

            // Absolute distance from the horizontal centre, scaled to half
            // of the color difference
            for (x = 0; x < size.width(); x++, rd-=rfd, gd-=gfd, bd-=bfd)
            {
                xtable[0][x] = (unsigned char) abs((int)rd);
                xtable[1][x] = (unsigned char) abs((int)gd);
                xtable[2][x] = (unsigned char) abs((int)bd);
            }

            rfd = (float)rDiff/size.height();
            gfd = (float)gDiff/size.height();
            bfd = (float)bDiff/size.height();

            rd = (float)rDiff/2;
            gd = (float)gDiff/2;
            bd = (float)bDiff/2;

            // Same for the vertical axis
            for (y = 0; y < size.height(); y++, rd-=rfd, gd-=gfd, bd-=bfd)
            {
                ytable[0][y] = (unsigned char) abs((int)rd);
                ytable[1][y] = (unsigned char) abs((int)gd);
                ytable[2][y] = (unsigned char) abs((int)bd);
            }

            // Only one quadrant is computed; each value is mirrored into
            // the other three.
            const int halfH = (size.height()+1)>>1;
            for (y = 0; y < halfH; y++) {
                unsigned int *sl1 = (unsigned int *)image.scanLine(y);
                unsigned int *sl2 = (unsigned int *)image.scanLine(QMAX(size.height()-y-1, y));

                const int halfW = (size.width()+1)>>1;
                int x2 = size.width()-1;

                for (x = 0; x < halfW; x++, x2--) {
                    unsigned int rgb = 0;
                    if (eff == PyramidGradient) {
                        rgb = qRgb(rcb-rSign*(xtable[0][x]+ytable[0][y]),
                                   gcb-gSign*(xtable[1][x]+ytable[1][y]),
                                   bcb-bSign*(xtable[2][x]+ytable[2][y]));
                    }
                    if (eff == RectangleGradient) {
                        rgb = qRgb(rcb - rSign *
                                   QMAX(xtable[0][x], ytable[0][y]) * 2,
                                   gcb - gSign *
                                   QMAX(xtable[1][x], ytable[1][y]) * 2,
                                   bcb - bSign *
                                   QMAX(xtable[2][x], ytable[2][y]) * 2);
                    }
                    if (eff == PipeCrossGradient) {
                        rgb = qRgb(rcb - rSign *
                                   QMIN(xtable[0][x], ytable[0][y]) * 2,
                                   gcb - gSign *
                                   QMIN(xtable[1][x], ytable[1][y]) * 2,
                                   bcb - bSign *
                                   QMIN(xtable[2][x], ytable[2][y]) * 2);
                    }
                    if (eff == EllipticGradient) {
                        rgb = qRgb(rcb - rSign *
                                   (int)sqrt((xtable[0][x]*xtable[0][x] +
                                              ytable[0][y]*ytable[0][y])*2.0),
                                   gcb - gSign *
                                   (int)sqrt((xtable[1][x]*xtable[1][x] +
                                              ytable[1][y]*ytable[1][y])*2.0),
                                   bcb - bSign *
                                   (int)sqrt((xtable[2][x]*xtable[2][x] +
                                              ytable[2][y]*ytable[2][y])*2.0));
                    }

                    sl1[x] = sl2[x] = rgb;
                    sl1[x2] = sl2[x2] = rgb;
                }
            }
        }

        delete [] xtable[0];
        delete [] xtable[1];
        delete [] xtable[2];
        delete [] ytable[0];
        delete [] ytable[1];
        delete [] ytable[2];
    }

    // dither if necessary
    if (ncols && (QPixmap::defaultDepth() < 15 )) {
        if ( ncols < 2 || ncols > 256 )
            ncols = 3;
        // Palette of ncols colors evenly spaced between ca and cb
        QColor *dPal = new QColor[ncols];
        for (int i=0; i<ncols; i++) {
            dPal[i].setRgb ( rca + rDiff * i / ( ncols - 1 ),
                             gca + gDiff * i / ( ncols - 1 ),
                             bca + bDiff * i / ( ncols - 1 ) );
        }
        imageDither(image, dPal, ncols);
        delete [] dPal;
    }

    return image;
}

/**
 * Plain aggregates holding packed 16-bit color components.
 *
 * The inline-assembly blend routines in this file brace-initialise these
 * and load them whole into a SIMD register (movq for the MMX paths,
 * movdqu for the SSE2 paths).
 * */
namespace {

// Four 16-bit words (64 bits): per-channel values for one pixel
struct KIE4Pack
{
    Q_UINT16 data[4];
};

// Eight 16-bit words (128 bits): per-channel values for two pixels
struct KIE8Pack
{
    Q_UINT16 data[8];
};

}

/**
 * Finds the palette entry closest to an RGB value.
 *
 * Distance is the squared Euclidean distance in RGB space; on ties the
 * entry with the lowest index wins.
 *
 * @param r,g,b   color to match
 * @param palette array of @p size colors
 * @param size    number of entries in @p palette
 * @return index of the nearest entry, or 0 when the palette is null or empty
 */
int nearestColor( int r, int g, int b, const QColor *palette, int size )
{
    // An empty palette has no entry 0 to read either.
    if (palette == 0 || size <= 0)
      return 0;

    int dr = palette[0].red() - r;
    int dg = palette[0].green() - g;
    int db = palette[0].blue() - b;

    int minDist =  dr*dr + dg*dg + db*db;
    int nearest = 0;

    for (int i = 1; i < size; i++ )
    {
        dr = palette[i].red() - r;
        dg = palette[i].green() - g;
        db = palette[i].blue() - b;

        const int dist = dr*dr + dg*dg + db*db;

        if ( dist < minDist )
        {
            minDist = dist;
            nearest = i;
        }
    }

    return nearest;
}

// adapted from kFSDither (C) 1997 Martin Jones (mjones@kde.org)
//
// Floyd-Steinberg dithering
// Ref: Bitmapped Graphics Programming in C++
//      Marv Luse, Addison-Wesley Publishing, 1993.
/**
 * Reduces a 32-bit image to an 8-bit indexed image using the given palette.
 *
 * The first and last pixel of each row are mapped to their nearest palette
 * color without diffusing the error; all pixels in between use
 * Floyd-Steinberg error diffusion.
 *
 * @param img     image to dither; replaced by the indexed result on success
 * @param palette array of @p size colors that becomes the color table
 * @param size    number of palette entries (1..256)
 * @return reference to @p img; it is left unchanged when it is empty, is not
 *         32 bits deep, the palette is null or has an unusable size, or the
 *         result image could not be allocated
 */
QImage& imageDither(QImage &img, const QColor *palette, int size)
{
    // The pixel loop reads 32-bit pixels, and an 8-bit result can index at
    // most 256 palette entries.
    if (img.width() == 0 || img.height() == 0 ||
        palette == 0 || size <= 0 || size > 256 || img.depth() != 32)
      return img;

    const int width  = img.width();
    const int height = img.height();

    QImage dImage( width, height, 8, size );
    if (dImage.isNull())
      return img;

    int i;

    dImage.setNumColors( size );
    for ( i = 0; i < size; i++ )
        dImage.setColor( i, palette[ i ].rgb() );

    // Each buffer holds two rows: [0, width) is the current row including
    // carried-over error, [width, 2*width) accumulates error for the next row.
    int *rerr1 = new int [ width * 2 ];
    int *gerr1 = new int [ width * 2 ];
    int *berr1 = new int [ width * 2 ];

    memset( rerr1, 0, sizeof( int ) * width * 2 );
    memset( gerr1, 0, sizeof( int ) * width * 2 );
    memset( berr1, 0, sizeof( int ) * width * 2 );

    int *rerr2 = rerr1 + width;
    int *gerr2 = gerr1 + width;
    int *berr2 = berr1 + width;

    for ( int j = 0; j < height; j++ )
    {
        uint *ip = (uint * )img.scanLine( j );
        uchar *dp = dImage.scanLine( j );

        // Current row = source pixel + error carried from the previous row
        for ( i = 0; i < width; i++ )
        {
            rerr1[i] = rerr2[i] + qRed( *ip );
            rerr2[i] = 0;
            gerr1[i] = gerr2[i] + qGreen( *ip );
            gerr2[i] = 0;
            berr1[i] = berr2[i] + qBlue( *ip );
            berr2[i] = 0;
            ip++;
        }

        // First pixel: nearest color only, no diffusion
        *dp++ = nearestColor( rerr1[0], gerr1[0], berr1[0], palette, size );

        for ( i = 1; i < width-1; i++ )
        {
            int indx = nearestColor( rerr1[i], gerr1[i], berr1[i], palette, size );
            *dp = indx;

            int rerr = rerr1[i];
            rerr -= palette[indx].red();
            int gerr = gerr1[i];
            gerr -= palette[indx].green();
            int berr = berr1[i];
            berr -= palette[indx].blue();

            // diffuse red error
            rerr1[ i+1 ] += ( rerr * 7 ) >> 4;
            rerr2[ i-1 ] += ( rerr * 3 ) >> 4;
            rerr2[  i  ] += ( rerr * 5 ) >> 4;
            rerr2[ i+1 ] += ( rerr ) >> 4;

            // diffuse green error
            gerr1[ i+1 ] += ( gerr * 7 ) >> 4;
            gerr2[ i-1 ] += ( gerr * 3 ) >> 4;
            gerr2[  i  ] += ( gerr * 5 ) >> 4;
            gerr2[ i+1 ] += ( gerr ) >> 4;

            // diffuse blue error
            berr1[ i+1 ] += ( berr * 7 ) >> 4;
            berr2[ i-1 ] += ( berr * 3 ) >> 4;
            berr2[  i  ] += ( berr * 5 ) >> 4;
            berr2[ i+1 ] += ( berr ) >> 4;

            dp++;
        }

        // Last pixel: nearest color only.  A one-pixel-wide row was already
        // completed by the first-pixel store above, so there is nothing
        // further to write for it.
        if ( width > 1 )
            *dp = nearestColor( rerr1[width-1], gerr1[width-1], berr1[width-1],
                                palette, size );
    }

    delete [] rerr1;
    delete [] gerr1;
    delete [] berr1;

    img = dImage;
    return img;
}

// TODO: add configure flags and cpu features detection
//#define USE_SSE2_INLINE_ASM
//#define USE_MMX_INLINE_ASM

// TODO: add conditional compilation for big-endian machines

/**
 * Blends a solid color into an image.
 *
 * Every red, green and blue channel value is moved towards the matching
 * channel of @p clr by the fraction @p opacity; the alpha channel is left
 * untouched by the portable path.  The image is converted to 32 bits first
 * if necessary.
 *
 * @param clr     color to blend in
 * @param dst     image to modify in place
 * @param opacity blend factor, 0.0 (image unchanged) to 1.0 (solid color)
 * @return reference to @p dst; it is returned unmodified when it is empty or
 *         @p opacity lies outside 0.0..1.0 (or is not a number)
 */
QImage& imageBlend(const QColor& clr, QImage& dst, float opacity)
{
    if (dst.width() <= 0 || dst.height() <= 0)
        return dst;

    // Written as a negated range test so that NaN is rejected as well.
    if (!(opacity >= 0.0 && opacity <= 1.0)) {
        return dst;
    }

    if (dst.depth() != 32)
        dst = dst.convertDepth(32);

    int pixels = dst.width() * dst.height();

#ifdef USE_SSE2_INLINE_ASM
    // NOTE(review): the Q_UINT32( data ) cast below would truncate a 64-bit
    // pointer, and the asm statements declare no clobbers; this path is only
    // compiled when USE_SSE2_INLINE_ASM is defined -- confirm before enabling.
    if ( KCPUInfo::haveExtension( KCPUInfo::IntelSSE2 ) && pixels > 16 ) {
        Q_UINT16 alpha = Q_UINT16( ( 1.0 - opacity ) * 256.0 );

        KIE8Pack packedalpha = { { alpha, alpha, alpha, 256,
                                   alpha, alpha, alpha, 256 } };

        Q_UINT16 red   = Q_UINT16( clr.red()   * 256 * opacity );
        Q_UINT16 green = Q_UINT16( clr.green() * 256 * opacity );
        Q_UINT16 blue  = Q_UINT16( clr.blue()  * 256 * opacity );

        KIE8Pack packedcolor = { { blue, green, red, 0,
                                   blue, green, red, 0 } };

        // Prepare the XMM5, XMM6 and XMM7 registers for unpacking and blending
        __asm__ __volatile__(
        "pxor        %%xmm7,  %%xmm7\n\t" // Zero out XMM7 for unpacking
        "movdqu        (%0),  %%xmm6\n\t" // Set up (1 - alpha) * 256 in XMM6
        "movdqu        (%1),  %%xmm5\n\t" // Set up color * alpha * 256 in XMM5
        : : "r"(&packedalpha), "r"(&packedcolor),
            "m"(packedcolor),  "m"(packedalpha) );

        Q_UINT32 *data = reinterpret_cast<Q_UINT32*>( dst.bits() );

        // Check how many pixels we need to process to achieve 16 byte alignment
        int offset = (16 - (Q_UINT32( data ) & 0x0f)) / 4;

        // The main loop processes 8 pixels / iteration
        int remainder = (pixels - offset) % 8;
        pixels -= remainder;

        // Alignment loop
        for ( int i = 0; i < offset; i++ ) {
            __asm__ __volatile__(
            "movd         (%0,%1,4),      %%xmm0\n\t"  // Load one pixel to XMM0
            "punpcklbw       %%xmm7,      %%xmm0\n\t"  // Unpack the pixel
            "pmullw          %%xmm6,      %%xmm0\n\t"  // Multiply the pixel with (1 - alpha) * 256
            "paddw           %%xmm5,      %%xmm0\n\t"  // Add color * alpha * 256 to the result
            "psrlw               $8,      %%xmm0\n\t"  // Divide by 256
            "packuswb        %%xmm1,      %%xmm0\n\t"  // Pack the pixel to a dword
            "movd            %%xmm0,   (%0,%1,4)\n\t"  // Write the pixel to the image
            : : "r"(data), "r"(i) );
        }

        // Main loop
        for ( int i = offset; i < pixels; i += 8 ) {
            __asm__ __volatile(
            // Load 8 pixels to XMM registers 0 - 3
            "movq         (%0,%1,4),      %%xmm0\n\t"  // Load pixels 1 and 2 to XMM0
            "movq        8(%0,%1,4),      %%xmm1\n\t"  // Load pixels 3 and 4 to XMM1
            "movq       16(%0,%1,4),      %%xmm2\n\t"  // Load pixels 5 and 6 to XMM2
            "movq       24(%0,%1,4),      %%xmm3\n\t"  // Load pixels 7 and 8 to XMM3

            // Prefetch the pixels for next iteration
            "prefetchnta 32(%0,%1,4)            \n\t"

            // Blend pixels 1 and 2
            "punpcklbw       %%xmm7,      %%xmm0\n\t"  // Unpack the pixels
            "pmullw          %%xmm6,      %%xmm0\n\t"  // Multiply the pixels with (1 - alpha) * 256
            "paddw           %%xmm5,      %%xmm0\n\t"  // Add color * alpha * 256 to the result
            "psrlw               $8,      %%xmm0\n\t"  // Divide by 256

            // Blend pixels 3 and 4
            "punpcklbw       %%xmm7,      %%xmm1\n\t"  // Unpack the pixels
            "pmullw          %%xmm6,      %%xmm1\n\t"  // Multiply the pixels with (1 - alpha) * 256
            "paddw           %%xmm5,      %%xmm1\n\t"  // Add color * alpha * 256 to the result
            "psrlw               $8,      %%xmm1\n\t"  // Divide by 256

            // Blend pixels 5 and 6
            "punpcklbw       %%xmm7,      %%xmm2\n\t"  // Unpack the pixels
            "pmullw          %%xmm6,      %%xmm2\n\t"  // Multiply the pixels with (1 - alpha) * 256
            "paddw           %%xmm5,      %%xmm2\n\t"  // Add color * alpha * 256 to the result
            "psrlw               $8,      %%xmm2\n\t"  // Divide by 256

            // Blend pixels 7 and 8
            "punpcklbw       %%xmm7,      %%xmm3\n\t"  // Unpack the pixels
            "pmullw          %%xmm6,      %%xmm3\n\t"  // Multiply the pixels with (1 - alpha) * 256
            "paddw           %%xmm5,      %%xmm3\n\t"  // Add color * alpha * 256 to the result
            "psrlw               $8,      %%xmm3\n\t"  // Divide by 256

            // Pack the pixels into 2 double quadwords
            "packuswb        %%xmm1,      %%xmm0\n\t"  // Pack pixels 1 - 4 to a double qword
            "packuswb        %%xmm3,      %%xmm2\n\t"  // Pack pixels 5 - 8 to a double qword

            // Write the pixels back to the image
            "movdqa          %%xmm0,   (%0,%1,4)\n\t"  // Store pixels 1 - 4
            "movdqa          %%xmm2, 16(%0,%1,4)\n\t"  // Store pixels 5 - 8
            : : "r"(data), "r"(i) );
        }

        // Cleanup loop
        for ( int i = pixels; i < pixels + remainder; i++ ) {
            __asm__ __volatile__(
            "movd         (%0,%1,4),      %%xmm0\n\t"  // Load one pixel to XMM0
            "punpcklbw       %%xmm7,      %%xmm0\n\t"  // Unpack the pixel
            "pmullw          %%xmm6,      %%xmm0\n\t"  // Multiply the pixel with (1 - alpha) * 256
            "paddw           %%xmm5,      %%xmm0\n\t"  // Add color * alpha * 256 to the result
            "psrlw               $8,      %%xmm0\n\t"  // Divide by 256
            "packuswb        %%xmm1,      %%xmm0\n\t"  // Pack the pixel to a dword
            "movd            %%xmm0,   (%0,%1,4)\n\t"  // Write the pixel to the image
            : : "r"(data), "r"(i) );
        }
    } else
#endif

#ifdef USE_MMX_INLINE_ASM
    if ( KCPUInfo::haveExtension( KCPUInfo::IntelMMX ) && pixels > 1 ) {
        Q_UINT16 alpha = Q_UINT16( ( 1.0 - opacity ) * 256.0 );
        KIE4Pack packedalpha = { { alpha, alpha, alpha, 256 } };

        Q_UINT16 red   = Q_UINT16( clr.red()   * 256 * opacity );
        Q_UINT16 green = Q_UINT16( clr.green() * 256 * opacity );
        Q_UINT16 blue  = Q_UINT16( clr.blue()  * 256 * opacity );

        KIE4Pack packedcolor = { { blue, green, red, 0 } };

        __asm__ __volatile__(
        "pxor        %%mm7,    %%mm7\n\t"       // Zero out MM7 for unpacking
        "movq         (%0),    %%mm6\n\t"       // Set up (1 - alpha) * 256 in MM6
        "movq         (%1),    %%mm5\n\t"       // Set up color * alpha * 256 in MM5
        : : "r"(&packedalpha), "r"(&packedcolor), "m"(packedcolor), "m"(packedalpha) );

        Q_UINT32 *data = reinterpret_cast<Q_UINT32*>( dst.bits() );

        // The main loop processes 4 pixels / iteration
        int remainder = pixels % 4;
        pixels -= remainder;

        // Main loop
        for ( int i = 0; i < pixels; i += 4 ) {
            __asm__ __volatile__(
            // Load 4 pixels to MM registers 0 - 3
            "movd         (%0,%1,4),      %%mm0\n\t"  // Load the 1st pixel to MM0
            "movd        4(%0,%1,4),      %%mm1\n\t"  // Load the 2nd pixel to MM1
            "movd        8(%0,%1,4),      %%mm2\n\t"  // Load the 3rd pixel to MM2
            "movd       12(%0,%1,4),      %%mm3\n\t"  // Load the 4th pixel to MM3

            // Blend the first pixel
            "punpcklbw        %%mm7,      %%mm0\n\t"  // Unpack the pixel
            "pmullw           %%mm6,      %%mm0\n\t"  // Multiply the pixel with (1 - alpha) * 256
            "paddw            %%mm5,      %%mm0\n\t"  // Add color * alpha * 256 to the result
            "psrlw               $8,      %%mm0\n\t"  // Divide by 256

            // Blend the second pixel
            "punpcklbw        %%mm7,      %%mm1\n\t"  // Unpack the pixel
            "pmullw           %%mm6,      %%mm1\n\t"  // Multiply the pixel with (1 - alpha) * 256
            "paddw            %%mm5,      %%mm1\n\t"  // Add color * alpha * 256 to the result
            "psrlw               $8,      %%mm1\n\t"  // Divide by 256

            // Blend the third pixel
            "punpcklbw        %%mm7,      %%mm2\n\t"  // Unpack the pixel
            "pmullw           %%mm6,      %%mm2\n\t"  // Multiply the pixel with (1 - alpha) * 256
            "paddw            %%mm5,      %%mm2\n\t"  // Add color * alpha * 256 to the result
            "psrlw               $8,      %%mm2\n\t"  // Divide by 256

            // Blend the fourth pixel
            "punpcklbw        %%mm7,      %%mm3\n\t"  // Unpack the pixel
            "pmullw           %%mm6,      %%mm3\n\t"  // Multiply the pixel with (1 - alpha) * 256
            "paddw            %%mm5,      %%mm3\n\t"  // Add color * alpha * 256 to the result
            "psrlw               $8,      %%mm3\n\t"  // Divide by 256

            // Pack the pixels into 2 quadwords
            "packuswb         %%mm1,      %%mm0\n\t"  // Pack pixels 1 and 2 to a qword
            "packuswb         %%mm3,      %%mm2\n\t"  // Pack pixels 3 and 4 to a qword

            // Write the pixels back to the image
            "movq             %%mm0,  (%0,%1,4)\n\t"  // Store pixels 1 and 2
            "movq             %%mm2, 8(%0,%1,4)\n\t"  // Store pixels 3 and 4
            : : "r"(data), "r"(i) );
        }

        // Cleanup loop
        for ( int i = pixels; i < pixels + remainder; i++ ) {
            __asm__ __volatile__(
            "movd         (%0,%1,4),      %%mm0\n\t"  // Load one pixel to MM0
            "punpcklbw        %%mm7,      %%mm0\n\t"  // Unpack the pixel
            "pmullw           %%mm6,      %%mm0\n\t"  // Multiply the pixel with (1 - alpha) * 256
            "paddw            %%mm5,      %%mm0\n\t"  // Add color * alpha * 256 to the result
            "psrlw               $8,      %%mm0\n\t"  // Divide by 256
            "packuswb         %%mm0,      %%mm0\n\t"  // Pack the pixel to a dword
            "movd             %%mm0,  (%0,%1,4)\n\t"  // Write the pixel to the image
            : : "r"(data), "r"(i) );
        }

        // Empty the MMX state
        __asm__ __volatile__("emms");
    } else
#endif // USE_MMX_INLINE_ASM

    {
        // Portable path: walk the raw bytes, three color channels per pixel.
        int rcol, gcol, bcol;
        clr.rgb(&rcol, &gcol, &bcol);

#ifdef WORDS_BIGENDIAN   // ARGB (skip alpha)
        unsigned char *data = (unsigned char *)dst.bits() + 1;
#else                    // BGRA
        unsigned char *data = (unsigned char *)dst.bits();
#endif

        // The per-channel delta is negative whenever the target color is
        // darker than the pixel.  Converting a negative float straight to
        // unsigned char is undefined (and saturates to 0 on some targets),
        // so the delta is truncated to a signed int and added to the channel
        // as an int; opacity within 0..1 keeps the sum inside 0..255.
        for (int i=0; i<pixels; i++)
        {
#ifdef WORDS_BIGENDIAN
            *data = (unsigned char)(*data + (int)((rcol - *data) * opacity));
            ++data;
            *data = (unsigned char)(*data + (int)((gcol - *data) * opacity));
            ++data;
            *data = (unsigned char)(*data + (int)((bcol - *data) * opacity));
            ++data;
#else
            *data = (unsigned char)(*data + (int)((bcol - *data) * opacity));
            ++data;
            *data = (unsigned char)(*data + (int)((gcol - *data) * opacity));
            ++data;
            *data = (unsigned char)(*data + (int)((rcol - *data) * opacity));
            ++data;
#endif
            data++; // skip alpha
        }
    }

    return dst;
}

QImage& imageBlend(QImage& src, QImage& dst, float opacity)
{
    if (src.width() <= 0 || src.height() <= 0)
        return dst;
    if (dst.width() <= 0 || dst.height() <= 0)
        return dst;

    if (src.width() != dst.width() || src.height() != dst.height()) {
        return dst;
    }

    if (opacity < 0.0 || opacity > 1.0) {
        return dst;
    }

    if (src.depth() != 32) src = src.convertDepth(32);
    if (dst.depth() != 32) dst = dst.convertDepth(32);

    int pixels = src.width() * src.height();

#ifdef USE_SSE2_INLINE_ASM
    if ( KCPUInfo::haveExtension( KCPUInfo::IntelSSE2 ) && pixels > 16 ) {
        Q_UINT16 alpha = Q_UINT16( opacity * 256.0 );
        KIE8Pack packedalpha = { { alpha, alpha, alpha, 0,
                                   alpha, alpha, alpha, 0 } };

        // Prepare the XMM6 and XMM7 registers for unpacking and blending
        __asm__ __volatile__(
        "pxor      %%xmm7, %%xmm7\n\t" // Zero out XMM7 for unpacking
        "movdqu      (%0), %%xmm6\n\t" // Set up alpha * 256 in XMM6
        : : "r"(&packedalpha), "m"(packedalpha) );

        Q_UINT32 *data1 = reinterpret_cast<Q_UINT32*>( src.bits() );
        Q_UINT32 *data2 = reinterpret_cast<Q_UINT32*>( dst.bits() );

        // Check how many pixels we need to process to achieve 16 byte alignment
        int offset = (16 - (Q_UINT32( data2 ) & 0x0f)) / 4;

        // The main loop processes 4 pixels / iteration
        int remainder = (pixels - offset) % 4;
        pixels -= remainder;

        // Alignment loop
        for ( int i = 0; i < offset; i++ ) {
            __asm__ __volatile__(
            "movd       (%1,%2,4),    %%xmm1\n\t"  // Load one dst pixel to XMM1
            "punpcklbw     %%xmm7,    %%xmm1\n\t"  // Unpack the pixel
            "movd       (%0,%2,4),    %%xmm0\n\t"  // Load one src pixel to XMM0
            "punpcklbw     %%xmm7,    %%xmm0\n\t"  // Unpack the pixel
            "psubw         %%xmm1,    %%xmm0\n\t"  // Subtract dst from src
            "pmullw        %%xmm6,    %%xmm0\n\t"  // Multiply the result with alpha * 256
            "psllw             $8,    %%xmm1\n\t"  // Multiply dst with 256
            "paddw         %%xmm1,    %%xmm0\n\t"  // Add dst to result
            "psrlw             $8,    %%xmm0\n\t"  // Divide by 256
            "packuswb      %%xmm1,    %%xmm0\n\t"  // Pack the pixel to a dword
            "movd          %%xmm0, (%1,%2,4)\n\t"  // Write the pixel to the image
            : : "r"(data1), "r"(data2), "r"(i) );
        }

        // Main loop
        for ( int i = offset; i < pixels; i += 4 ) {
            __asm__ __volatile__(
            // Load 4 src pixels to XMM0 and XMM2 and 4 dst pixels to XMM1 and XMM3
            "movq       (%0,%2,4),    %%xmm0\n\t"  // Load two src pixels to XMM0
            "movq       (%1,%2,4),    %%xmm1\n\t"  // Load two dst pixels to XMM1
            "movq      8(%0,%2,4),    %%xmm2\n\t"  // Load two src pixels to XMM2
            "movq      8(%1,%2,4),    %%xmm3\n\t"  // Load two dst pixels to XMM3

            // Prefetch the pixels for the iteration after the next one
            "prefetchnta 32(%0,%2,4)        \n\t"
            "prefetchnta 32(%1,%2,4)        \n\t"

            // Blend the first two pixels
            "punpcklbw     %%xmm7,    %%xmm1\n\t"  // Unpack the dst pixels
            "punpcklbw     %%xmm7,    %%xmm0\n\t"  // Unpack the src pixels
            "psubw         %%xmm1,    %%xmm0\n\t"  // Subtract dst from src
            "pmullw        %%xmm6,    %%xmm0\n\t"  // Multiply the result with alpha * 256
            "psllw             $8,    %%xmm1\n\t"  // Multiply dst with 256
            "paddw         %%xmm1,    %%xmm0\n\t"  // Add dst to the result
            "psrlw             $8,    %%xmm0\n\t"  // Divide by 256

            // Blend the next two pixels
            "punpcklbw     %%xmm7,    %%xmm3\n\t"  // Unpack the dst pixels
            "punpcklbw     %%xmm7,    %%xmm2\n\t"  // Unpack the src pixels
            "psubw         %%xmm3,    %%xmm2\n\t"  // Subtract dst from src
            "pmullw        %%xmm6,    %%xmm2\n\t"  // Multiply the result with alpha * 256
            "psllw             $8,    %%xmm3\n\t"  // Multiply dst with 256
            "paddw         %%xmm3,    %%xmm2\n\t"  // Add dst to the result
            "psrlw             $8,    %%xmm2\n\t"  // Divide by 256

            // Write the pixels back to the image
            "packuswb      %%xmm2,    %%xmm0\n\t"  // Pack the pixels to a double qword
            "movdqa        %%xmm0, (%1,%2,4)\n\t"  // Store the pixels
            : : "r"(data1), "r"(data2), "r"(i) );
        }

        // Cleanup loop
        for ( int i = pixels; i < pixels + remainder; i++ ) {
            __asm__ __volatile__(
            "movd       (%1,%2,4),    %%xmm1\n\t"  // Load one dst pixel to XMM1
            "punpcklbw     %%xmm7,    %%xmm1\n\t"  // Unpack the pixel
            "movd       (%0,%2,4),    %%xmm0\n\t"  // Load one src pixel to XMM0
            "punpcklbw     %%xmm7,    %%xmm0\n\t"  // Unpack the pixel
            "psubw         %%xmm1,    %%xmm0\n\t"  // Subtract dst from src
            "pmullw        %%xmm6,    %%xmm0\n\t"  // Multiply the result with alpha * 256
            "psllw             $8,    %%xmm1\n\t"  // Multiply dst with 256
            "paddw         %%xmm1,    %%xmm0\n\t"  // Add dst to result
            "psrlw             $8,    %%xmm0\n\t"  // Divide by 256
            "packuswb      %%xmm1,    %%xmm0\n\t"  // Pack the pixel to a dword
            "movd          %%xmm0, (%1,%2,4)\n\t"  // Write the pixel to the image
            : : "r"(data1), "r"(data2), "r"(i) );
        }
    } else
#endif // USE_SSE2_INLINE_ASM

#ifdef USE_MMX_INLINE_ASM
    if ( KCPUInfo::haveExtension( KCPUInfo::IntelMMX ) && pixels > 1 ) {
        Q_UINT16 alpha = Q_UINT16( opacity * 256.0 );
        KIE4Pack packedalpha = { { alpha, alpha, alpha, 0 } };

        // Prepare the MM6 and MM7 registers for blending and unpacking
        __asm__ __volatile__(
        "pxor       %%mm7,   %%mm7\n\t"      // Zero out MM7 for unpacking
        "movq        (%0),   %%mm6\n\t"      // Set up alpha * 256 in MM6
        : : "r"(&packedalpha), "m"(packedalpha) );

        Q_UINT32 *data1 = reinterpret_cast<Q_UINT32*>( src.bits() );
        Q_UINT32 *data2 = reinterpret_cast<Q_UINT32*>( dst.bits() );

        // The main loop processes 2 pixels / iteration
        int remainder = pixels % 2;
        pixels -= remainder;

        // Main loop
        for ( int i = 0; i < pixels; i += 2 ) {
            __asm__ __volatile__(
            // Load 2 src pixels to MM0 and MM2 and 2 dst pixels to MM1 and MM3
            "movd        (%0,%2,4),     %%mm0\n\t"  // Load the 1st src pixel to MM0
            "movd        (%1,%2,4),     %%mm1\n\t"  // Load the 1st dst pixel to MM1
            "movd       4(%0,%2,4),     %%mm2\n\t"  // Load the 2nd src pixel to MM2
            "movd       4(%1,%2,4),     %%mm3\n\t"  // Load the 2nd dst pixel to MM3

            // Blend the first pixel
            "punpcklbw       %%mm7,     %%mm0\n\t"  // Unpack the src pixel
            "punpcklbw       %%mm7,     %%mm1\n\t"  // Unpack the dst pixel
            "psubw           %%mm1,     %%mm0\n\t"  // Subtract dst from src
            "pmullw          %%mm6,     %%mm0\n\t"  // Multiply the result with alpha * 256
            "psllw              $8,     %%mm1\n\t"  // Multiply dst with 256
            "paddw           %%mm1,     %%mm0\n\t"  // Add dst to the result
            "psrlw              $8,     %%mm0\n\t"  // Divide by 256

            // Blend the second pixel
            "punpcklbw       %%mm7,     %%mm2\n\t"  // Unpack the src pixel
            "punpcklbw       %%mm7,     %%mm3\n\t"  // Unpack the dst pixel
            "psubw           %%mm3,     %%mm2\n\t"  // Subtract dst from src
            "pmullw          %%mm6,     %%mm2\n\t"  // Multiply the result with alpha * 256
            "psllw              $8,     %%mm3\n\t"  // Multiply dst with 256
            "paddw           %%mm3,     %%mm2\n\t"  // Add dst to the result
            "psrlw              $8,     %%mm2\n\t"  // Divide by 256

            // Write the pixels back to the image
            "packuswb        %%mm2,     %%mm0\n\t"  // Pack the pixels to a qword
            "movq            %%mm0, (%1,%2,4)\n\t"  // Store the pixels
            : : "r"(data1), "r"(data2), "r"(i) );
        }

        // Blend the remaining pixel (if there is one)
        if ( remainder ) {
             __asm__ __volatile__(
            "movd             (%0),     %%mm0\n\t"  // Load one src pixel to MM0
            "punpcklbw       %%mm7,     %%mm0\n\t"  // Unpack the src pixel
            "movd             (%1),     %%mm1\n\t"  // Load one dst pixel to MM1
            "punpcklbw       %%mm7,     %%mm1\n\t"  // Unpack the dst pixel
            "psubw           %%mm1,     %%mm0\n\t"  // Subtract dst from src
            "pmullw          %%mm6,     %%mm0\n\t"  // Multiply the result with alpha * 256
            "psllw              $8,     %%mm1\n\t"  // Multiply dst with 256
            "paddw           %%mm1,     %%mm0\n\t"  // Add dst to result
            "psrlw              $8,     %%mm0\n\t"  // Divide by 256
            "packuswb        %%mm0,     %%mm0\n\t"  // Pack the pixel to a dword
            "movd            %%mm0,      (%1)\n\t"  // Write the pixel to the image
            : : "r"(data1 + pixels), "r"(data2 + pixels) );
        }

        // Empty the MMX state
        __asm__ __volatile__("emms");
    } else
#endif // USE_MMX_INLINE_ASM

    {
#ifdef WORDS_BIGENDIAN   // ARGB (skip alpha)
        register unsigned char *data1 = (unsigned char *)dst.bits() + 1;
        register unsigned char *data2 = (unsigned char *)src.bits() + 1;
#else                    // BGRA
        register unsigned char *data1 = (unsigned char *)dst.bits();
        register unsigned char *data2 = (unsigned char *)src.bits();
#endif

        for (register int i=0; i<pixels; i++)
        {
#ifdef WORDS_BIGENDIAN
            *data1 += (unsigned char)((*(data2++) - *data1) * opacity);
            data1++;
			*data1 += (unsigned char)((*(data2++) - *data1) * opacity);
            data1++;
            *data1 += (unsigned char)((*(data2++) - *data1) * opacity);
            data1++;
#else
            *data1 += (unsigned char)((*(data2++) - *data1) * opacity);
            data1++;
            *data1 += (unsigned char)((*(data2++) - *data1) * opacity);
            data1++;
            *data1 += (unsigned char)((*(data2++) - *data1) * opacity);
            data1++;
#endif
            data1++; // skip alpha
            data2++;
        }
    }

    return dst;
}
