Magellan Linux

Annotation of /trunk/vnc/patches/9914_all_6.8.2-mmx-gcc4.patch

Parent Directory Parent Directory | Revision Log Revision Log


Revision 153 - (hide annotations) (download)
Tue May 8 20:52:56 2007 UTC (17 years, 1 month ago) by niro
File size: 52896 byte(s)
-import

1 niro 153 diff -ur xc-orig/programs/Xserver/fb/Imakefile xc/programs/Xserver/fb/Imakefile
2     --- xc-orig/programs/Xserver/fb/Imakefile 2005-02-11 04:00:50.004092510 -0500
3     +++ xc/programs/Xserver/fb/Imakefile 2005-02-11 04:01:32.059345739 -0500
4     @@ -3,13 +3,22 @@
5     XCOMM
6     XCOMM Id: Imakefile,v 1.1 1999/11/02 03:54:44 keithp Exp $
7    
8     -#if defined(i386Architecture) && defined(HasGcc34) && HasGcc34
9     +#if defined(HasGcc34) && HasGcc34
10     MMXOPTIONS= -mmmx -Winline --param inline-unit-growth=10000 \
11     - --param large-function-growth=10000 -DUSE_GCC34_MMX
12     + --param large-function-growth=10000 -DUSE_MMX
13     +SSEOPTIONS= $(MMXOPTIONS) -msse -DUSE_SSE
14    
15     +#if defined(i386Architecture)
16     SpecialCObjectRule(fbmmx,fbmmx.c,$(MMXOPTIONS))
17     +#elif defined(AMD64Architecture)
18     +SpecialCObjectRule(fbmmx,fbmmx.c,$(SSEOPTIONS))
19     +#endif
20     +
21     +#if defined(i386Architecture) || defined(AMD64Architecture)
22     SpecialCObjectRule(fbpict,fbpict.c,$(MMXOPTIONS))
23     SpecialCObjectRule(fbfill,fbfill.c,$(MMXOPTIONS))
24     +SpecialCObjectRule(fbcopy,fbcopy.c,$(MMXOPTIONS))
25     +#endif
26    
27     #endif
28    
29     diff -ur xc-orig/programs/Xserver/fb/fbcompose.c xc/programs/Xserver/fb/fbcompose.c
30     --- xc-orig/programs/Xserver/fb/fbcompose.c 2005-02-11 04:00:50.009092659 -0500
31     +++ xc/programs/Xserver/fb/fbcompose.c 2005-02-11 04:01:32.067345977 -0500
32     @@ -1,8 +1,8 @@
33     /*
34     - * $XdotOrg: xc/programs/Xserver/fb/fbcompose.c,v 1.3 2004/05/12 01:49:46 anholt Exp $
35     + * $XdotOrg: xc/programs/Xserver/fb/fbcompose.c,v 1.5 2005/01/13 20:49:21 sandmann Exp $
36     * $XFree86: xc/programs/Xserver/fb/fbcompose.c,v 1.17tsi Exp $
37     *
38     - * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
39     + * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
40     *
41     * Permission to use, copy, modify, distribute, and sell this software and its
42     * documentation for any purpose is hereby granted without fee, provided that
43     @@ -2693,7 +2693,6 @@
44     op->u.transform.y = y - op->u.transform.top_y;
45     }
46    
47     -
48     Bool
49     fbBuildCompositeOperand (PicturePtr pPict,
50     FbCompositeOperand op[4],
51     @@ -2710,7 +2709,6 @@
52    
53     op->u.transform.top_y = pPict->pDrawable->y;
54     op->u.transform.left_x = pPict->pDrawable->x;
55     -
56     op->u.transform.start_x = x - op->u.transform.left_x;
57     op->u.transform.x = op->u.transform.start_x;
58     op->u.transform.y = y - op->u.transform.top_y;
59     @@ -2822,6 +2820,21 @@
60     FbCombineFunc f;
61     int w;
62    
63     +#if 0
64     + ErrorF ("op: %d\n"
65     + "src format: %lx\n"
66     + "msk format %lx\n"
67     + "dst format %lx\n"
68     + "width: %d\n"
69     + "height %d\n",
70     + op,
71     + pSrc? pSrc->format : 0,
72     + pMask? pMask->format : 0,
73     + pDst? pDst->format : 0,
74     + width, height);
75     + ErrorF ("PICT_x8r8g8b8: %lx\n", PICT_x8r8g8b8);
76     +#endif
77     +
78     if (!fbBuildCompositeOperand (pSrc, src, xSrc, ySrc, TRUE, TRUE))
79     return;
80     if (!fbBuildCompositeOperand (pDst, dst, xDst, yDst, FALSE, TRUE))
81     diff -ur xc-orig/programs/Xserver/fb/fbcopy.c xc/programs/Xserver/fb/fbcopy.c
82     --- xc-orig/programs/Xserver/fb/fbcopy.c 2005-02-11 04:00:50.004092510 -0500
83     +++ xc/programs/Xserver/fb/fbcopy.c 2005-02-11 04:01:32.068346007 -0500
84     @@ -1,7 +1,7 @@
85     /*
86     * Id: fbcopy.c,v 1.1 1999/11/02 03:54:45 keithp Exp $
87     *
88     - * Copyright © 1998 Keith Packard
89     + * Copyright © 1998 Keith Packard
90     *
91     * Permission to use, copy, modify, distribute, and sell this software and its
92     * documentation for any purpose is hereby granted without fee, provided that
93     @@ -27,6 +27,7 @@
94     #ifdef IN_MODULE
95     #include "xf86_ansic.h"
96     #endif
97     +#include "fbmmx.h"
98    
99     void
100     fbCopyNtoN (DrawablePtr pSrcDrawable,
101     @@ -54,28 +55,51 @@
102    
103     fbGetDrawable (pSrcDrawable, src, srcStride, srcBpp, srcXoff, srcYoff);
104     fbGetDrawable (pDstDrawable, dst, dstStride, dstBpp, dstXoff, dstYoff);
105     -
106     +
107     while (nbox--)
108     {
109     +#ifdef USE_MMX
110     + if (!reverse && !upsidedown && fbHaveMMX())
111     + {
112     + if (!fbCopyAreammx (pSrcDrawable,
113     + pDstDrawable,
114     +
115     + (pbox->x1 + dx + srcXoff),
116     + (pbox->y1 + dy + srcYoff),
117     +
118     + (pbox->x1 + dstXoff),
119     + (pbox->y1 + dstYoff),
120     +
121     + (pbox->x2 - pbox->x1),
122     + (pbox->y2 - pbox->y1)))
123     + goto fallback;
124     + else
125     + goto next;
126     + }
127     + fallback:
128     +#endif
129     fbBlt (src + (pbox->y1 + dy + srcYoff) * srcStride,
130     srcStride,
131     (pbox->x1 + dx + srcXoff) * srcBpp,
132     -
133     +
134     dst + (pbox->y1 + dstYoff) * dstStride,
135     dstStride,
136     (pbox->x1 + dstXoff) * dstBpp,
137     -
138     +
139     (pbox->x2 - pbox->x1) * dstBpp,
140     (pbox->y2 - pbox->y1),
141     -
142     +
143     alu,
144     pm,
145     dstBpp,
146     -
147     +
148     reverse,
149     upsidedown);
150     +#ifdef USE_MMX
151     + next:
152     +#endif
153     pbox++;
154     - }
155     + }
156     }
157    
158     void
159     @@ -594,7 +618,7 @@
160     int yOut)
161     {
162     fbCopyProc copy;
163     -
164     +
165     #ifdef FB_24_32BIT
166     if (pSrcDrawable->bitsPerPixel != pDstDrawable->bitsPerPixel)
167     copy = fb24_32CopyMtoN;
168     diff -ur xc-orig/programs/Xserver/fb/fbfill.c xc/programs/Xserver/fb/fbfill.c
169     --- xc-orig/programs/Xserver/fb/fbfill.c 2005-02-11 04:00:50.006092570 -0500
170     +++ xc/programs/Xserver/fb/fbfill.c 2005-02-11 04:01:32.069346037 -0500
171     @@ -1,7 +1,7 @@
172     /*
173     * Id: fbfill.c,v 1.1 1999/11/02 03:54:45 keithp Exp $
174     *
175     - * Copyright © 1998 Keith Packard
176     + * Copyright © 1998 Keith Packard
177     *
178     * Permission to use, copy, modify, distribute, and sell this software and its
179     * documentation for any purpose is hereby granted without fee, provided that
180     @@ -44,7 +44,7 @@
181    
182     switch (pGC->fillStyle) {
183     case FillSolid:
184     -#ifdef USE_GCC34_MMX
185     +#ifdef USE_MMX
186     if (!pPriv->and && fbHaveMMX())
187     if (fbSolidFillmmx (pDrawable, x, y, width, height, pPriv->xor))
188     return;
189    
190     diff -ur xc-orig/programs/Xserver/fb/fbmmx.c xc/programs/Xserver/fb/fbmmx.c
191     --- xc-orig/programs/Xserver/fb/fbmmx.c 2005-02-11 04:00:50.006092570 -0500
192     +++ xc/programs/Xserver/fb/fbmmx.c 2005-02-11 04:01:32.072346126 -0500
193     @@ -1,5 +1,6 @@
194     /*
195     - * Copyright © 2004 Red Hat, Inc.
196     + * Copyright © 2004 Red Hat, Inc.
197     + * Copyright © 2004 Nicholas Miell
198     *
199     * Permission to use, copy, modify, distribute, and sell this software and its
200     * documentation for any purpose is hereby granted without fee, provided that
201     @@ -18,14 +19,23 @@
202     * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
203     * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
204     *
205     - * Author: Søren Sandmann (sandmann@redhat.com)
206     - *
207     + * Author: Søren Sandmann (sandmann@redhat.com)
208     + * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
209     + *
210     * Based on work by Owen Taylor
211     */
212    
213     +
214     +#ifdef USE_MMX
215     +
216     #include "fb.h"
217     +#include "fbmmx.h"
218     +
219     +#include <mmintrin.h>
220    
221     -#ifdef USE_GCC34_MMX
222     +#ifdef USE_SSE
223     +#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
224     +#endif
225    
226     #ifdef RENDER
227    
228     @@ -33,11 +43,6 @@
229     #include "mipict.h"
230     #include "fbpict.h"
231    
232     -typedef int Vector1x64 __attribute__ ((mode(DI)));
233     -typedef int Vector2x32 __attribute__ ((mode(V2SI)));
234     -typedef int Vector4x16 __attribute__ ((mode(V4HI)));
235     -typedef int Vector8x8 __attribute__ ((mode(V8QI)));
236     -
237     typedef unsigned long long ullong;
238    
239     #define noVERBOSE
240     @@ -50,7 +55,6 @@
241    
242     typedef struct
243     {
244     - ullong mmx_zero;
245     ullong mmx_4x00ff;
246     ullong mmx_4x0080;
247     ullong mmx_565_rgb;
248     @@ -70,7 +74,6 @@
249    
250     static const MMXData c =
251     {
252     - .mmx_zero = 0x0000000000000000ULL,
253     .mmx_4x00ff = 0x00ff00ff00ff00ffULL,
254     .mmx_4x0080 = 0x0080008000800080ULL,
255     .mmx_565_rgb = 0x000001f0003f001fULL,
256     @@ -88,121 +91,112 @@
257     .mmx_000000000000ffff = 0x000000000000ffffULL,
258     };
259    
260     -static __inline__ Vector1x64
261     -shift (Vector1x64 v, int s)
262     +#define MC(x) ((__m64) c.mmx_##x)
263     +
264     +static __inline__ __m64
265     +shift (__m64 v, int s)
266     {
267     if (s > 0)
268     - return __builtin_ia32_psllq (v, s);
269     + return _mm_slli_si64 (v, s);
270     else if (s < 0)
271     - return __builtin_ia32_psrlq (v, -s);
272     + return _mm_srli_si64 (v, -s);
273     else
274     return v;
275     }
276    
277     -static __inline__ Vector4x16
278     -negate (Vector4x16 mask)
279     +static __inline__ __m64
280     +negate (__m64 mask)
281     {
282     - return (Vector4x16)__builtin_ia32_pxor (
283     - (Vector1x64)mask,
284     - (Vector1x64)c.mmx_4x00ff);
285     + return _mm_xor_si64 (mask, MC(4x00ff));
286     }
287    
288     -static __inline__ Vector4x16
289     -pix_multiply (Vector4x16 a, Vector4x16 b)
290     +static __inline__ __m64
291     +pix_multiply (__m64 a, __m64 b)
292     {
293     - Vector4x16 res;
294     + __m64 res;
295    
296     - res = __builtin_ia32_pmullw (a, b);
297     - res = __builtin_ia32_paddw (res, (Vector4x16)c.mmx_4x0080);
298     - res = __builtin_ia32_psrlw (res, 8);
299     + res = _mm_mullo_pi16 (a, b);
300     + res = _mm_add_pi16 (res, MC(4x0080));
301     + res = _mm_srli_pi16 (res, 8);
302    
303     return res;
304     }
305    
306     -#if 0
307     +#ifdef USE_SSE
308     #define HAVE_PSHUFW
309     #endif
310    
311     #ifdef HAVE_PSHUFW
312    
313     -static __inline__ Vector4x16
314     -expand_alpha (Vector4x16 pixel)
315     +static __inline__ __m64
316     +expand_alpha (__m64 pixel)
317     {
318     - Vector4x16 result;
319     - __asm__ ("pshufw $0xFF, %1, %0\n\t" : "=y" (result) : "y" (pixel));
320     - return result;
321     + return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 3, 3, 3));
322     }
323    
324     -static __inline__ Vector4x16
325     -expand_alpha_rev (Vector4x16 pixel)
326     +static __inline__ __m64
327     +expand_alpha_rev (__m64 pixel)
328     {
329     - Vector4x16 result;
330     - __asm__ ("pshufw $0x00, %1, %0\n\t" : "=y" (result) : "y" (pixel));
331     - return result;
332     + return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(0, 0, 0, 0));
333     }
334    
335     -static __inline__ Vector4x16
336     -invert_colors (Vector4x16 pixel)
337     +static __inline__ __m64
338     +invert_colors (__m64 pixel)
339     {
340     - Vector4x16 result;
341     -
342     - /* 0xC6 = 11000110 */
343     - /* 3 0 1 2 */
344     -
345     - __asm__ ("pshufw $0xC6, %1, %0\n\t" : "=y" (result) : "y" (pixel));
346     -
347     - return result;
348     + return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 0, 1, 2));
349     }
350    
351     #else
352    
353     -static __inline__ Vector4x16
354     -expand_alpha (Vector4x16 pixel)
355     +static __inline__ __m64
356     +expand_alpha (__m64 pixel)
357     {
358     - Vector1x64 t1, t2;
359     -
360     - t1 = shift ((Vector1x64)pixel, -48);
361     + __m64 t1, t2;
362     +
363     + t1 = shift (pixel, -48);
364     t2 = shift (t1, 16);
365     - t1 = __builtin_ia32_por (t1, t2);
366     + t1 = _mm_or_si64 (t1, t2);
367     t2 = shift (t1, 32);
368     - t1 = __builtin_ia32_por (t1, t2);
369     -
370     - return (Vector4x16)t1;
371     + t1 = _mm_or_si64 (t1, t2);
372     +
373     + return t1;
374     }
375    
376     -static __inline__ Vector4x16
377     -expand_alpha_rev (Vector4x16 pixel)
378     +static __inline__ __m64
379     +expand_alpha_rev (__m64 pixel)
380     {
381     - Vector1x64 t1, t2;
382     -
383     - t1 = shift ((Vector1x64)pixel, 48);
384     + __m64 t1, t2;
385     +
386     + /* move alpha to low 16 bits and zero the rest */
387     + t1 = shift (pixel, 48);
388     t1 = shift (t1, -48);
389     +
390     t2 = shift (t1, 16);
391     - t1 = __builtin_ia32_por (t1, t2);
392     + t1 = _mm_or_si64 (t1, t2);
393     t2 = shift (t1, 32);
394     - t1 = __builtin_ia32_por (t1, t2);
395     -
396     - return (Vector4x16)t1;
397     + t1 = _mm_or_si64 (t1, t2);
398     +
399     + return t1;
400     }
401    
402     -static __inline__ Vector4x16
403     -invert_colors (Vector4x16 pixel)
404     +static __inline__ __m64
405     +invert_colors (__m64 pixel)
406     {
407     - Vector1x64 x, y, z;
408     -
409     - x = y = z = (Vector1x64)pixel;
410     -
411     - x = __builtin_ia32_pand (x, (Vector1x64)c.mmx_ffff0000ffff0000);
412     - y = __builtin_ia32_pand (y, (Vector1x64)c.mmx_000000000000ffff);
413     - z = __builtin_ia32_pand (z, (Vector1x64)c.mmx_0000ffff00000000);
414     -
415     + __m64 x, y, z;
416     +
417     + x = y = z = pixel;
418     +
419     + x = _mm_and_si64 (x, MC(ffff0000ffff0000));
420     + y = _mm_and_si64 (y, MC(000000000000ffff));
421     + z = _mm_and_si64 (z, MC(0000ffff00000000));
422     +
423     y = shift (y, 32);
424     z = shift (z, -32);
425     -
426     - x = __builtin_ia32_por (x, y);
427     - x = __builtin_ia32_por (x, z);
428     -
429     - return (Vector4x16)x;
430     +
431     + x = _mm_or_si64 (x, y);
432     + x = _mm_or_si64 (x, z);
433     +
434     + return x;
435     }
436    
437     #endif
438     @@ -210,147 +204,138 @@
439     /* Notes about writing mmx code
440     *
441     * give memory operands as the second operand. If you give it as the
442     - * first, gcc will first load it into a register, then use that register
443     + * first, gcc will first load it into a register, then use that
444     + * register
445     *
446     * ie. use
447     *
448     - * __builtin_pmullw (x, mmx_constant[8]);
449     + * _mm_mullo_pi16 (x, mmx_constant);
450     *
451     * not
452     *
453     - * __builtin_pmullw (mmx_constant[8], x);
454     + * _mm_mullo_pi16 (mmx_constant, x);
455     *
456     - * Also try to minimize dependencies. Ie. when you need a value, try to calculate
457     - * it from a value that was calculated as early as possible.
458     + * Also try to minimize dependencies. i.e. when you need a value, try
459     + * to calculate it from a value that was calculated as early as
460     + * possible.
461     */
462    
463     -static __inline__ Vector4x16
464     -over (Vector4x16 src, Vector4x16 srca, Vector4x16 dest)
465     +static __inline__ __m64
466     +over (__m64 src, __m64 srca, __m64 dest)
467     {
468     - return (Vector4x16)__builtin_ia32_paddusb ((Vector8x8)src, (Vector8x8)pix_multiply(dest, negate(srca)));
469     + return _mm_adds_pu8 (src, pix_multiply(dest, negate(srca)));
470     }
471    
472     -static __inline__ Vector4x16
473     -over_rev_non_pre (Vector4x16 src, Vector4x16 dest)
474     +static __inline__ __m64
475     +over_rev_non_pre (__m64 src, __m64 dest)
476     {
477     - Vector4x16 srca = expand_alpha (src);
478     - Vector4x16 srcfaaa = (Vector4x16)__builtin_ia32_por((Vector1x64)srca, (Vector1x64)c.mmx_full_alpha);
479     -
480     + __m64 srca = expand_alpha (src);
481     + __m64 srcfaaa = _mm_or_si64 (srca, MC(full_alpha));
482     +
483     return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest);
484     }
485    
486     -static __inline__ Vector4x16
487     -in (Vector4x16 src,
488     - Vector4x16 mask)
489     +static __inline__ __m64
490     +in (__m64 src,
491     + __m64 mask)
492     {
493     return pix_multiply (src, mask);
494     }
495    
496     -static __inline__ Vector4x16
497     -in_over (Vector4x16 src,
498     - Vector4x16 srca,
499     - Vector4x16 mask,
500     - Vector4x16 dest)
501     +static __inline__ __m64
502     +in_over (__m64 src,
503     + __m64 srca,
504     + __m64 mask,
505     + __m64 dest)
506     {
507     return over(in(src, mask), pix_multiply(srca, mask), dest);
508     }
509    
510     -static __inline__ Vector8x8
511     -cvt32to64 (CARD32 v)
512     -{
513     - ullong r = v;
514     - return (Vector8x8)r;
515     -}
516     -
517     -static __inline__ Vector4x16
518     +static __inline__ __m64
519     load8888 (CARD32 v)
520     {
521     - return (Vector4x16)__builtin_ia32_punpcklbw (cvt32to64 (v),
522     - (Vector8x8)c.mmx_zero);
523     + return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64());
524     }
525    
526     -static __inline__ Vector8x8
527     -pack8888 (Vector4x16 lo, Vector4x16 hi)
528     +static __inline__ __m64
529     +pack8888 (__m64 lo, __m64 hi)
530     {
531     - Vector8x8 r;
532     - r = __builtin_ia32_packuswb ((Vector4x16)lo, (Vector4x16)hi);
533     + __m64 r;
534     + r = _mm_packs_pu16 (lo, hi);
535     return r;
536     }
537    
538     -/* Expand 16 bits positioned at @pos (0-3) of a mmx register into 00RR00GG00BB
539     -
540     ---- Expanding 565 in the low word ---
541     -
542     -m = (m << (32 - 3)) | (m << (16 - 5)) | m;
543     -m = m & (01f0003f001f);
544     -m = m * (008404100840);
545     -m = m >> 8;
546     -
547     -Note the trick here - the top word is shifted by another nibble to avoid
548     -it bumping into the middle word
549     -*/
550     -static __inline__ Vector4x16
551     -expand565 (Vector4x16 pixel, int pos)
552     +/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
553     + *
554     + * 00RR00GG00BB
555     + *
556     + * --- Expanding 565 in the low word ---
557     + *
558     + * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
559     + * m = m & (01f0003f001f);
560     + * m = m * (008404100840);
561     + * m = m >> 8;
562     + *
563     + * Note the trick here - the top word is shifted by another nibble to
564     + * avoid it bumping into the middle word
565     + */
566     +static __inline__ __m64
567     +expand565 (__m64 pixel, int pos)
568     {
569     - Vector1x64 p = (Vector1x64)pixel;
570     + __m64 p = pixel;
571     + __m64 t1, t2;
572    
573     /* move pixel to low 16 bit and zero the rest */
574     p = shift (shift (p, (3 - pos) * 16), -48);
575    
576     - Vector1x64 t1 = shift (p, 36 - 11);
577     - Vector1x64 t2 = shift (p, 16 - 5);
578     + t1 = shift (p, 36 - 11);
579     + t2 = shift (p, 16 - 5);
580    
581     - p = __builtin_ia32_por (t1, p);
582     - p = __builtin_ia32_por (t2, p);
583     - p = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_rgb);
584     + p = _mm_or_si64 (t1, p);
585     + p = _mm_or_si64 (t2, p);
586     + p = _mm_and_si64 (p, MC(565_rgb));
587    
588     - pixel = __builtin_ia32_pmullw ((Vector4x16)p, (Vector4x16)c.mmx_565_unpack_multiplier);
589     - return __builtin_ia32_psrlw (pixel, 8);
590     + pixel = _mm_mullo_pi16 (p, MC(565_unpack_multiplier));
591     + return _mm_srli_pi16 (pixel, 8);
592     }
593    
594     -static __inline__ Vector4x16
595     -expand8888 (Vector4x16 in, int pos)
596     +static __inline__ __m64
597     +expand8888 (__m64 in, int pos)
598     {
599     if (pos == 0)
600     - return (Vector4x16)__builtin_ia32_punpcklbw ((Vector8x8)in, (Vector8x8)c.mmx_zero);
601     + return _mm_unpacklo_pi8 (in, _mm_setzero_si64());
602     else
603     - return (Vector4x16)__builtin_ia32_punpckhbw ((Vector8x8)in, (Vector8x8)c.mmx_zero);
604     + return _mm_unpackhi_pi8 (in, _mm_setzero_si64());
605     }
606    
607     -static __inline__ Vector4x16
608     -pack565 (Vector4x16 pixel, Vector4x16 target, int pos)
609     +static __inline__ __m64
610     +pack565 (__m64 pixel, __m64 target, int pos)
611     {
612     - Vector1x64 p = (Vector1x64)pixel;
613     - Vector1x64 t = (Vector1x64)target;
614     - Vector1x64 r, g, b;
615     + __m64 p = pixel;
616     + __m64 t = target;
617     + __m64 r, g, b;
618    
619     - r = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_r);
620     - g = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_g);
621     - b = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_b);
622     + r = _mm_and_si64 (p, MC(565_r));
623     + g = _mm_and_si64 (p, MC(565_g));
624     + b = _mm_and_si64 (p, MC(565_b));
625    
626     r = shift (r, - (32 - 8) + pos * 16);
627     g = shift (g, - (16 - 3) + pos * 16);
628     b = shift (b, - (0 + 3) + pos * 16);
629     -
630     +
631     if (pos == 0)
632     - t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_0);
633     + t = _mm_and_si64 (t, MC(mask_0));
634     else if (pos == 1)
635     - t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_1);
636     + t = _mm_and_si64 (t, MC(mask_1));
637     else if (pos == 2)
638     - t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_2);
639     + t = _mm_and_si64 (t, MC(mask_2));
640     else if (pos == 3)
641     - t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_3);
642     + t = _mm_and_si64 (t, MC(mask_3));
643    
644     - p = __builtin_ia32_por (r, t);
645     - p = __builtin_ia32_por (g, p);
646     + p = _mm_or_si64 (r, t);
647     + p = _mm_or_si64 (g, p);
648    
649     - return (Vector4x16)__builtin_ia32_por (b, p);
650     -}
651     -
652     -static __inline__ void
653     -emms (void)
654     -{
655     - __asm__ __volatile__ ("emms");
656     + return _mm_or_si64 (b, p);
657     }
658    
659     void
660     @@ -371,8 +356,8 @@
661     CARD32 *dstLine, *dst;
662     CARD16 w;
663     FbStride dstStride;
664     - Vector4x16 vsrc, vsrca;
665     -
666     + __m64 vsrc, vsrca;
667     +
668     CHECKPOINT();
669    
670     fbComposeGetSolid(pSrc, src, pDst->format);
671     @@ -384,51 +369,52 @@
672    
673     vsrc = load8888 (src);
674     vsrca = expand_alpha (vsrc);
675     -
676     +
677     while (height--)
678     {
679     dst = dstLine;
680     dstLine += dstStride;
681     w = width;
682     -
683     +
684     CHECKPOINT();
685    
686     while (w && (unsigned long)dst & 7)
687     {
688     - *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), (Vector4x16)c.mmx_zero);
689     + *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)),
690     + _mm_setzero_si64());
691    
692     w--;
693     dst++;
694     }
695     -
696     +
697     while (w >= 2)
698     {
699     - Vector4x16 vdest;
700     - Vector4x16 dest0, dest1;
701     -
702     - vdest = *(Vector4x16 *)dst;
703     + __m64 vdest;
704     + __m64 dest0, dest1;
705     +
706     + vdest = *(__m64 *)dst;
707    
708     dest0 = over(vsrc, vsrca, expand8888(vdest, 0));
709     dest1 = over(vsrc, vsrca, expand8888(vdest, 1));
710    
711     - *(Vector8x8 *)dst = (Vector8x8)pack8888(dest0, dest1);
712     + *(__m64 *)dst = pack8888(dest0, dest1);
713    
714     dst += 2;
715     w -= 2;
716     }
717     -
718     +
719     CHECKPOINT();
720    
721     while (w)
722     {
723     - *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), (Vector4x16)c.mmx_zero);
724     + *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), _mm_setzero_si64());
725    
726     w--;
727     dst++;
728     }
729     }
730    
731     - emms();
732     + _mm_empty();
733     }
734    
735     void
736     @@ -449,8 +435,8 @@
737     CARD16 *dstLine, *dst;
738     CARD16 w;
739     FbStride dstStride;
740     - Vector4x16 vsrc, vsrca;
741     -
742     + __m64 vsrc, vsrca;
743     +
744     CHECKPOINT();
745    
746     fbComposeGetSolid(pSrc, src, pDst->format);
747     @@ -462,49 +448,49 @@
748    
749     vsrc = load8888 (src);
750     vsrca = expand_alpha (vsrc);
751     -
752     +
753     while (height--)
754     {
755     dst = dstLine;
756     dstLine += dstStride;
757     w = width;
758     -
759     +
760     CHECKPOINT();
761    
762     while (w && (unsigned long)dst & 7)
763     {
764     ullong d = *dst;
765     - Vector4x16 vdest = expand565 ((Vector4x16)d, 0);
766     + __m64 vdest = expand565 ((__m64)d, 0);
767     vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
768     *dst = (ullong)vdest;
769    
770     w--;
771     dst++;
772     }
773     -
774     +
775     while (w >= 4)
776     {
777     - Vector4x16 vdest;
778     -
779     - vdest = *(Vector4x16 *)dst;
780     + __m64 vdest;
781     +
782     + vdest = *(__m64 *)dst;
783    
784     vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0);
785     vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1);
786     vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2);
787     vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3);
788    
789     - *(Vector8x8 *)dst = (Vector8x8)vdest;
790     + *(__m64 *)dst = vdest;
791    
792     dst += 4;
793     w -= 4;
794     }
795     -
796     +
797     CHECKPOINT();
798    
799     while (w)
800     {
801     ullong d = *dst;
802     - Vector4x16 vdest = expand565 ((Vector4x16)d, 0);
803     + __m64 vdest = expand565 ((__m64)d, 0);
804     vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
805     *dst = (ullong)vdest;
806    
807     @@ -513,7 +499,7 @@
808     }
809     }
810    
811     - emms();
812     + _mm_empty();
813     }
814    
815     void
816     @@ -534,8 +520,8 @@
817     CARD32 *dstLine;
818     CARD32 *maskLine;
819     FbStride dstStride, maskStride;
820     - Vector4x16 vsrc, vsrca;
821     -
822     + __m64 vsrc, vsrca;
823     +
824     CHECKPOINT();
825    
826     fbComposeGetSolid(pSrc, src, pDst->format);
827     @@ -562,9 +548,9 @@
828    
829     if (m)
830     {
831     - Vector4x16 vdest = load8888(*q);
832     + __m64 vdest = load8888(*q);
833     vdest = in_over(vsrc, vsrca, load8888(m), vdest);
834     - *q = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero);
835     + *q = (ullong)pack8888(vdest, _mm_setzero_si64());
836     }
837    
838     twidth--;
839     @@ -580,15 +566,15 @@
840    
841     if (m0 | m1)
842     {
843     - Vector4x16 dest0, dest1;
844     - Vector4x16 vdest = *(Vector4x16 *)q;
845     + __m64 dest0, dest1;
846     + __m64 vdest = *(__m64 *)q;
847    
848     dest0 = in_over(vsrc, vsrca, load8888(m0),
849     expand8888 (vdest, 0));
850     dest1 = in_over(vsrc, vsrca, load8888(m1),
851     expand8888 (vdest, 1));
852    
853     - *(Vector8x8 *)q = (Vector8x8)pack8888(dest0, dest1);
854     + *(__m64 *)q = pack8888(dest0, dest1);
855     }
856    
857     p += 2;
858     @@ -602,9 +588,9 @@
859    
860     if (m)
861     {
862     - Vector4x16 vdest = load8888(*q);
863     + __m64 vdest = load8888(*q);
864     vdest = in_over(vsrc, vsrca, load8888(m), vdest);
865     - *q = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero);
866     + *q = (ullong)pack8888(vdest, _mm_setzero_si64());
867     }
868    
869     twidth--;
870     @@ -616,7 +602,133 @@
871     maskLine += maskStride;
872     }
873    
874     - emms();
875     + _mm_empty();
876     +}
877     +
878     +void
879     +fbCompositeSrc_8888x8x8888mmx (CARD8 op,
880     + PicturePtr pSrc,
881     + PicturePtr pMask,
882     + PicturePtr pDst,
883     + INT16 xSrc,
884     + INT16 ySrc,
885     + INT16 xMask,
886     + INT16 yMask,
887     + INT16 xDst,
888     + INT16 yDst,
889     + CARD16 width,
890     + CARD16 height)
891     +{
892     + CARD32 *dstLine, *dst;
893     + CARD32 *srcLine, *src;
894     + CARD8 *maskLine;
895     + CARD32 mask;
896     + __m64 vmask;
897     + FbStride dstStride, srcStride, maskStride;
898     + CARD16 w;
899     + __m64 srca;
900     +
901     + CHECKPOINT();
902     +
903     + fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
904     + fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
905     + fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
906     +
907     + mask = *maskLine << 24 | *maskLine << 16 | *maskLine << 8 | *maskLine;
908     + vmask = load8888 (mask);
909     + srca = MC(4x00ff);
910     +
911     + while (height--)
912     + {
913     + dst = dstLine;
914     + dstLine += dstStride;
915     + src = srcLine;
916     + srcLine += srcStride;
917     + w = width;
918     +
919     + while (w && (unsigned long)dst & 7)
920     + {
921     + __m64 s = load8888 (*src);
922     + __m64 d = load8888 (*dst);
923     +
924     + *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64());
925     +
926     + w--;
927     + dst++;
928     + src++;
929     + }
930     +
931     + while (w >= 16)
932     + {
933     + __m64 vd0 = *(__m64 *)(dst + 0);
934     + __m64 vd1 = *(__m64 *)(dst + 2);
935     + __m64 vd2 = *(__m64 *)(dst + 4);
936     + __m64 vd3 = *(__m64 *)(dst + 6);
937     + __m64 vd4 = *(__m64 *)(dst + 8);
938     + __m64 vd5 = *(__m64 *)(dst + 10);
939     + __m64 vd6 = *(__m64 *)(dst + 12);
940     + __m64 vd7 = *(__m64 *)(dst + 14);
941     +
942     + __m64 vs0 = *(__m64 *)(src + 0);
943     + __m64 vs1 = *(__m64 *)(src + 2);
944     + __m64 vs2 = *(__m64 *)(src + 4);
945     + __m64 vs3 = *(__m64 *)(src + 6);
946     + __m64 vs4 = *(__m64 *)(src + 8);
947     + __m64 vs5 = *(__m64 *)(src + 10);
948     + __m64 vs6 = *(__m64 *)(src + 12);
949     + __m64 vs7 = *(__m64 *)(src + 14);
950     +
951     + vd0 = (__m64)pack8888 (
952     + in_over (expand8888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
953     + in_over (expand8888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
954     +
955     + vd1 = (__m64)pack8888 (
956     + in_over (expand8888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
957     + in_over (expand8888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
958     +
959     + vd2 = (__m64)pack8888 (
960     + in_over (expand8888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
961     + in_over (expand8888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
962     +
963     + vd3 = (__m64)pack8888 (
964     + in_over (expand8888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
965     + in_over (expand8888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
966     +
967     + vd4 = (__m64)pack8888 (
968     + in_over (expand8888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
969     + in_over (expand8888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
970     +
971     + vd5 = (__m64)pack8888 (
972     + in_over (expand8888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
973     + in_over (expand8888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
974     +
975     + vd6 = (__m64)pack8888 (
976     + in_over (expand8888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
977     + in_over (expand8888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
978     +
979     + vd7 = (__m64)pack8888 (
980     + in_over (expand8888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
981     + in_over (expand8888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
982     +
983     + w -= 16;
984     + dst += 16;
985     + src += 16;
986     + }
987     +
988     + while (w)
989     + {
990     + __m64 s = load8888 (*src);
991     + __m64 d = load8888 (*dst);
992     +
993     + *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64());
994     +
995     + w--;
996     + dst++;
997     + src++;
998     + }
999     + }
1000     +
1001     + _mm_empty();
1002     }
1003    
1004     void
1005     @@ -638,7 +750,7 @@
1006     CARD8 *maskLine, *mask;
1007     FbStride dstStride, maskStride;
1008     CARD16 w;
1009     - Vector4x16 vsrc, vsrca;
1010     + __m64 vsrc, vsrca;
1011     ullong srcsrc;
1012    
1013     CHECKPOINT();
1014     @@ -648,7 +760,7 @@
1015     srca = src >> 24;
1016     if (srca == 0)
1017     return;
1018     -
1019     +
1020     srcsrc = (unsigned long long)src << 32 | src;
1021    
1022     fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
1023     @@ -664,7 +776,7 @@
1024     mask = maskLine;
1025     maskLine += maskStride;
1026     w = width;
1027     -
1028     +
1029     CHECKPOINT();
1030    
1031     while (w && (unsigned long)dst & 7)
1032     @@ -673,15 +785,15 @@
1033    
1034     if (m)
1035     {
1036     - Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), load8888(*dst));
1037     - *dst = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero);
1038     + __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), load8888(*dst));
1039     + *dst = (ullong)pack8888(vdest, _mm_setzero_si64());
1040     }
1041    
1042     w--;
1043     mask++;
1044     dst++;
1045     }
1046     -
1047     +
1048     CHECKPOINT();
1049    
1050     while (w >= 2)
1051     @@ -689,29 +801,29 @@
1052     ullong m0, m1;
1053     m0 = *mask;
1054     m1 = *(mask + 1);
1055     -
1056     +
1057     if (srca == 0xff && (m0 & m1) == 0xff)
1058     {
1059     *(unsigned long long *)dst = srcsrc;
1060     }
1061     else if (m0 | m1)
1062     {
1063     - Vector4x16 vdest;
1064     - Vector4x16 dest0, dest1;
1065     -
1066     - vdest = *(Vector4x16 *)dst;
1067     + __m64 vdest;
1068     + __m64 dest0, dest1;
1069     +
1070     + vdest = *(__m64 *)dst;
1071    
1072     - dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m0), expand8888(vdest, 0));
1073     - dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m1), expand8888(vdest, 1));
1074     + dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m0), expand8888(vdest, 0));
1075     + dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m1), expand8888(vdest, 1));
1076    
1077     - *(Vector8x8 *)dst = (Vector8x8)pack8888(dest0, dest1);
1078     + *(__m64 *)dst = pack8888(dest0, dest1);
1079     }
1080    
1081     mask += 2;
1082     dst += 2;
1083     w -= 2;
1084     }
1085     -
1086     +
1087     CHECKPOINT();
1088    
1089     while (w)
1090     @@ -720,9 +832,9 @@
1091    
1092     if (m)
1093     {
1094     - Vector4x16 vdest = load8888(*dst);
1095     - vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), vdest);
1096     - *dst = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero);
1097     + __m64 vdest = load8888(*dst);
1098     + vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), vdest);
1099     + *dst = (ullong)pack8888(vdest, _mm_setzero_si64());
1100     }
1101    
1102     w--;
1103     @@ -731,7 +843,7 @@
1104     }
1105     }
1106    
1107     - emms();
1108     + _mm_empty();
1109     }
1110    
1111    
1112     @@ -754,7 +866,7 @@
1113     CARD8 *maskLine, *mask;
1114     FbStride dstStride, maskStride;
1115     CARD16 w;
1116     - Vector4x16 vsrc, vsrca;
1117     + __m64 vsrc, vsrca;
1118     unsigned long long srcsrcsrcsrc, src16;
1119    
1120     CHECKPOINT();
1121     @@ -770,9 +882,9 @@
1122    
1123     vsrc = load8888 (src);
1124     vsrca = expand_alpha (vsrc);
1125     -
1126     - src16 = (ullong)pack565(vsrc, (Vector4x16)c.mmx_zero, 0);
1127     -
1128     +
1129     + src16 = (ullong)pack565(vsrc, _mm_setzero_si64(), 0);
1130     +
1131     srcsrcsrcsrc = (ullong)src16 << 48 | (ullong)src16 << 32 |
1132     (ullong)src16 << 16 | (ullong)src16;
1133    
1134     @@ -783,7 +895,7 @@
1135     mask = maskLine;
1136     maskLine += maskStride;
1137     w = width;
1138     -
1139     +
1140     CHECKPOINT();
1141    
1142     while (w && (unsigned long)dst & 7)
1143     @@ -793,16 +905,16 @@
1144     if (m)
1145     {
1146     ullong d = *dst;
1147     - Vector4x16 vd = (Vector4x16)d;
1148     - Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), expand565(vd, 0));
1149     - *dst = (ullong)pack565(vdest, (Vector4x16)c.mmx_zero, 0);
1150     + __m64 vd = (__m64)d;
1151     + __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
1152     + *dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
1153     }
1154    
1155     w--;
1156     mask++;
1157     dst++;
1158     }
1159     -
1160     +
1161     CHECKPOINT();
1162    
1163     while (w >= 4)
1164     @@ -812,35 +924,35 @@
1165     m1 = *(mask + 1);
1166     m2 = *(mask + 2);
1167     m3 = *(mask + 3);
1168     -
1169     +
1170     if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
1171     {
1172     *(unsigned long long *)dst = srcsrcsrcsrc;
1173     }
1174     else if (m0 | m1 | m2 | m3)
1175     {
1176     - Vector4x16 vdest;
1177     - Vector4x16 vm0, vm1, vm2, vm3;
1178     -
1179     - vdest = *(Vector4x16 *)dst;
1180     -
1181     - vm0 = (Vector4x16)m0;
1182     + __m64 vdest;
1183     + __m64 vm0, vm1, vm2, vm3;
1184     +
1185     + vdest = *(__m64 *)dst;
1186     +
1187     + vm0 = (__m64)m0;
1188     vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0);
1189     - vm1 = (Vector4x16)m1;
1190     + vm1 = (__m64)m1;
1191     vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1);
1192     - vm2 = (Vector4x16)m2;
1193     + vm2 = (__m64)m2;
1194     vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2);
1195     - vm3 = (Vector4x16)m3;
1196     + vm3 = (__m64)m3;
1197     vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3);
1198    
1199     - *(Vector4x16 *)dst = vdest;
1200     + *(__m64 *)dst = vdest;
1201     }
1202    
1203     w -= 4;
1204     mask += 4;
1205     dst += 4;
1206     }
1207     -
1208     +
1209     CHECKPOINT();
1210    
1211     while (w)
1212     @@ -850,9 +962,9 @@
1213     if (m)
1214     {
1215     ullong d = *dst;
1216     - Vector4x16 vd = (Vector4x16)d;
1217     - Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), expand565(vd, 0));
1218     - *dst = (ullong)pack565(vdest, (Vector4x16)c.mmx_zero, 0);
1219     + __m64 vd = (__m64)d;
1220     + __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
1221     + *dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
1222     }
1223    
1224     w--;
1225     @@ -861,7 +973,7 @@
1226     }
1227     }
1228    
1229     - emms();
1230     + _mm_empty();
1231     }
1232    
1233     void
1234     @@ -887,9 +999,9 @@
1235    
1236     fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
1237     fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
1238     -
1239     +
1240     assert (pSrc->pDrawable == pMask->pDrawable);
1241     -
1242     +
1243     while (height--)
1244     {
1245     dst = dstLine;
1246     @@ -897,14 +1009,14 @@
1247     src = srcLine;
1248     srcLine += srcStride;
1249     w = width;
1250     -
1251     +
1252     CHECKPOINT();
1253    
1254     while (w && (unsigned long)dst & 7)
1255     {
1256     - Vector4x16 vsrc = load8888 (*src);
1257     + __m64 vsrc = load8888 (*src);
1258     ullong d = *dst;
1259     - Vector4x16 vdest = expand565 ((Vector4x16)d, 0);
1260     + __m64 vdest = expand565 ((__m64)d, 0);
1261    
1262     vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);
1263    
1264     @@ -914,19 +1026,19 @@
1265     dst++;
1266     src++;
1267     }
1268     -
1269     +
1270     CHECKPOINT();
1271    
1272     while (w >= 4)
1273     {
1274     CARD32 s0, s1, s2, s3;
1275     unsigned char a0, a1, a2, a3;
1276     -
1277     +
1278     s0 = *src;
1279     s1 = *(src + 1);
1280     s2 = *(src + 2);
1281     s3 = *(src + 3);
1282     -
1283     +
1284     a0 = (s0 >> 24);
1285     a1 = (s1 >> 24);
1286     a2 = (s2 >> 24);
1287     @@ -934,38 +1046,38 @@
1288    
1289     if ((a0 & a1 & a2 & a3) == 0xFF)
1290     {
1291     - Vector4x16 vdest;
1292     - vdest = pack565(invert_colors(load8888(s0)), (Vector4x16)c.mmx_zero, 0);
1293     + __m64 vdest;
1294     + vdest = pack565(invert_colors(load8888(s0)), _mm_setzero_si64(), 0);
1295     vdest = pack565(invert_colors(load8888(s1)), vdest, 1);
1296     vdest = pack565(invert_colors(load8888(s2)), vdest, 2);
1297     vdest = pack565(invert_colors(load8888(s3)), vdest, 3);
1298     -
1299     - *(Vector4x16 *)dst = vdest;
1300     +
1301     + *(__m64 *)dst = vdest;
1302     }
1303     else if (a0 | a1 | a2 | a3)
1304     {
1305     - Vector4x16 vdest = *(Vector4x16 *)dst;
1306     -
1307     + __m64 vdest = *(__m64 *)dst;
1308     +
1309     vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0);
1310     vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1);
1311     vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2);
1312     vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3);
1313     -
1314     - *(Vector4x16 *)dst = vdest;
1315     +
1316     + *(__m64 *)dst = vdest;
1317     }
1318    
1319     w -= 4;
1320     dst += 4;
1321     src += 4;
1322     }
1323     -
1324     +
1325     CHECKPOINT();
1326    
1327     while (w)
1328     {
1329     - Vector4x16 vsrc = load8888 (*src);
1330     + __m64 vsrc = load8888 (*src);
1331     ullong d = *dst;
1332     - Vector4x16 vdest = expand565 ((Vector4x16)d, 0);
1333     + __m64 vdest = expand565 ((__m64)d, 0);
1334    
1335     vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);
1336    
1337     @@ -976,11 +1088,11 @@
1338     src++;
1339     }
1340     }
1341     -
1342     - emms();
1343     +
1344     + _mm_empty();
1345     }
1346    
1347     -/* "888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */
1348     +/* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */
1349    
1350     void
1351     fbCompositeSrc_8888RevNPx8888mmx (CARD8 op,
1352     @@ -1005,9 +1117,9 @@
1353    
1354     fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
1355     fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
1356     -
1357     +
1358     assert (pSrc->pDrawable == pMask->pDrawable);
1359     -
1360     +
1361     while (height--)
1362     {
1363     dst = dstLine;
1364     @@ -1015,28 +1127,28 @@
1365     src = srcLine;
1366     srcLine += srcStride;
1367     w = width;
1368     -
1369     +
1370     while (w && (unsigned long)dst & 7)
1371     {
1372     - Vector4x16 s = load8888 (*src);
1373     - Vector4x16 d = load8888 (*dst);
1374     + __m64 s = load8888 (*src);
1375     + __m64 d = load8888 (*dst);
1376    
1377     - *dst = (ullong)pack8888 (over_rev_non_pre (s, d), (Vector4x16)c.mmx_zero);
1378     + *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64());
1379    
1380     w--;
1381     dst++;
1382     src++;
1383     }
1384     -
1385     +
1386     while (w >= 2)
1387     {
1388     ullong s0, s1;
1389     unsigned char a0, a1;
1390     - Vector4x16 d0, d1;
1391     -
1392     + __m64 d0, d1;
1393     +
1394     s0 = *src;
1395     s1 = *(src + 1);
1396     -
1397     +
1398     a0 = (s0 >> 24);
1399     a1 = (s1 >> 24);
1400    
1401     @@ -1044,17 +1156,17 @@
1402     {
1403     d0 = invert_colors(load8888(s0));
1404     d1 = invert_colors(load8888(s1));
1405     -
1406     - *(Vector8x8 *)dst = pack8888 (d0, d1);
1407     +
1408     + *(__m64 *)dst = pack8888 (d0, d1);
1409     }
1410     else if (a0 | a1)
1411     {
1412     - Vector4x16 vdest = *(Vector4x16 *)dst;
1413     -
1414     + __m64 vdest = *(__m64 *)dst;
1415     +
1416     d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0));
1417     d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1));
1418     -
1419     - *(Vector8x8 *)dst = pack8888 (d0, d1);
1420     +
1421     + *(__m64 *)dst = pack8888 (d0, d1);
1422     }
1423    
1424     w -= 2;
1425     @@ -1064,18 +1176,18 @@
1426    
1427     while (w)
1428     {
1429     - Vector4x16 s = load8888 (*src);
1430     - Vector4x16 d = load8888 (*dst);
1431     + __m64 s = load8888 (*src);
1432     + __m64 d = load8888 (*dst);
1433    
1434     - *dst = (ullong)pack8888 (over_rev_non_pre (s, d), (Vector4x16)c.mmx_zero);
1435     + *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64());
1436    
1437     w--;
1438     dst++;
1439     src++;
1440     }
1441     }
1442     -
1443     - emms();
1444     +
1445     + _mm_empty();
1446     }
1447    
1448     void
1449     @@ -1096,7 +1208,7 @@
1450     CARD16 *dstLine;
1451     CARD32 *maskLine;
1452     FbStride dstStride, maskStride;
1453     - Vector4x16 vsrc, vsrca;
1454     + __m64 vsrc, vsrca;
1455    
1456     CHECKPOINT();
1457    
1458     @@ -1125,7 +1237,7 @@
1459     if (m)
1460     {
1461     ullong d = *q;
1462     - Vector4x16 vdest = expand565 ((Vector4x16)d, 0);
1463     + __m64 vdest = expand565 ((__m64)d, 0);
1464     vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
1465     *q = (ullong)vdest;
1466     }
1467     @@ -1146,14 +1258,14 @@
1468    
1469     if ((m0 | m1 | m2 | m3))
1470     {
1471     - Vector4x16 vdest = *(Vector4x16 *)q;
1472     + __m64 vdest = *(__m64 *)q;
1473    
1474     vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0);
1475     vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1);
1476     vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2);
1477     vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3);
1478    
1479     - *(Vector4x16 *)q = vdest;
1480     + *(__m64 *)q = vdest;
1481     }
1482     twidth -= 4;
1483     p += 4;
1484     @@ -1168,7 +1280,7 @@
1485     if (m)
1486     {
1487     ullong d = *q;
1488     - Vector4x16 vdest = expand565((Vector4x16)d, 0);
1489     + __m64 vdest = expand565((__m64)d, 0);
1490     vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0);
1491     *q = (ullong)vdest;
1492     }
1493     @@ -1182,7 +1294,7 @@
1494     dstLine += dstStride;
1495     }
1496    
1497     - emms ();
1498     + _mm_empty ();
1499     }
1500    
1501     void
1502     @@ -1210,7 +1322,7 @@
1503    
1504     fbComposeGetStart (pSrc, xSrc, ySrc, CARD8, srcStride, srcLine, 1);
1505     fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 1);
1506     -
1507     +
1508     while (height--)
1509     {
1510     dst = dstLine;
1511     @@ -1218,7 +1330,7 @@
1512     src = srcLine;
1513     srcLine += srcStride;
1514     w = width;
1515     -
1516     +
1517     while (w && (unsigned long)dst & 7)
1518     {
1519     s = *src;
1520     @@ -1234,13 +1346,7 @@
1521    
1522     while (w >= 8)
1523     {
1524     - __asm__ __volatile__ (
1525     - "movq (%0), %%mm2\n\t"
1526     - "movq (%1), %%mm3\n\t"
1527     - "paddusb %%mm2, %%mm3\n\t"
1528     - "movq %%mm3, (%1)\n\t"
1529     - : /* no output */ : "r" (src), "r" (dst));
1530     -
1531     + *(__m64*)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst);
1532     dst += 8;
1533     src += 8;
1534     w -= 8;
1535     @@ -1259,8 +1365,8 @@
1536     w--;
1537     }
1538     }
1539     -
1540     - emms();
1541     +
1542     + _mm_empty();
1543     }
1544    
1545     void
1546     @@ -1297,13 +1403,8 @@
1547    
1548     while (w && (unsigned long)dst & 7)
1549     {
1550     - __asm__ __volatile__ (
1551     - "movd %0, %%mm2\n\t"
1552     - "movd %1, %%mm3\n\t"
1553     - "paddusb %%mm2, %%mm3\n\t"
1554     - "movd %%mm3, %1\n\t"
1555     - : /* no output */ : "m" (*src), "m" (*dst));
1556     -
1557     + *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src),
1558     + _mm_cvtsi32_si64(*dst)));
1559     dst++;
1560     src++;
1561     w--;
1562     @@ -1311,13 +1412,7 @@
1563    
1564     while (w >= 2)
1565     {
1566     - __asm__ __volatile__ (
1567     - "movq (%0), %%mm2\n\t"
1568     - "movq (%1), %%mm3\n\t"
1569     - "paddusb %%mm2, %%mm3\n\t"
1570     - "movq %%mm3, (%1)\n\t"
1571     - : /* no output */ : "r" (src), "r" (dst));
1572     -
1573     + *(ullong*)dst = (ullong) _mm_adds_pu8(*(__m64*)src, *(__m64*)dst);
1574     dst += 2;
1575     src += 2;
1576     w -= 2;
1577     @@ -1325,16 +1420,13 @@
1578    
1579     if (w)
1580     {
1581     - __asm__ __volatile__ (
1582     - "movd %0, %%mm2\n\t"
1583     - "movd %1, %%mm3\n\t"
1584     - "paddusb %%mm2, %%mm3\n\t"
1585     - "movd %%mm3, %1\n\t"
1586     - : /* no output */ : "m" (*src), "m" (*dst));
1587     + *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src),
1588     + _mm_cvtsi32_si64(*dst)));
1589     +
1590     }
1591     }
1592     -
1593     - emms();
1594     +
1595     + _mm_empty();
1596     }
1597    
1598     #define GetStart(drw,x,y,type,stride,line,bpp) {\
1599     @@ -1358,19 +1450,19 @@
1600     FbStride stride;
1601     int bpp;
1602     ullong fill;
1603     - Vector8x8 vfill;
1604     + __m64 vfill;
1605     CARD32 byte_width;
1606     CARD8 *byte_line;
1607     FbBits *bits;
1608     int xoff, yoff;
1609    
1610     CHECKPOINT();
1611     -
1612     +
1613     fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff);
1614     -
1615     +
1616     if (bpp == 16 && (xor >> 16 != (xor & 0xffff)))
1617     return FALSE;
1618     -
1619     +
1620     if (bpp != 16 && bpp != 32)
1621     return FALSE;
1622    
1623     @@ -1388,9 +1480,9 @@
1624     byte_width = 4 * width;
1625     stride *= 4;
1626     }
1627     -
1628     +
1629     fill = ((ullong)xor << 32) | xor;
1630     - vfill = (Vector8x8)fill;
1631     + vfill = (__m64)fill;
1632    
1633     while (height--)
1634     {
1635     @@ -1398,7 +1490,7 @@
1636     CARD8 *d = byte_line;
1637     byte_line += stride;
1638     w = byte_width;
1639     -
1640     +
1641     while (w >= 2 && ((unsigned long)d & 3))
1642     {
1643     *(CARD16 *)d = xor;
1644     @@ -1406,35 +1498,32 @@
1645     d += 2;
1646     }
1647    
1648     - while (w >= 4 && ((unsigned int)d & 7))
1649     + while (w >= 4 && ((unsigned long)d & 7))
1650     {
1651     *(CARD32 *)d = xor;
1652     -
1653     +
1654     w -= 4;
1655     d += 4;
1656     }
1657    
1658     while (w >= 64)
1659     {
1660     - __asm__ __volatile (
1661     - "movq %0, (%1)\n\t"
1662     - "movq %0, 8(%1)\n\t"
1663     - "movq %0, 16(%1)\n\t"
1664     - "movq %0, 24(%1)\n\t"
1665     - "movq %0, 32(%1)\n\t"
1666     - "movq %0, 40(%1)\n\t"
1667     - "movq %0, 48(%1)\n\t"
1668     - "movq %0, 56(%1)\n\t"
1669     - : /* no output */
1670     - : "y" (vfill), "r" (d)
1671     - : "memory");
1672     + *(__m64*) (d + 0) = vfill;
1673     + *(__m64*) (d + 8) = vfill;
1674     + *(__m64*) (d + 16) = vfill;
1675     + *(__m64*) (d + 24) = vfill;
1676     + *(__m64*) (d + 32) = vfill;
1677     + *(__m64*) (d + 40) = vfill;
1678     + *(__m64*) (d + 48) = vfill;
1679     + *(__m64*) (d + 56) = vfill;
1680     +
1681     w -= 64;
1682     d += 64;
1683     }
1684     while (w >= 4)
1685     {
1686     *(CARD32 *)d = xor;
1687     -
1688     +
1689     w -= 4;
1690     d += 4;
1691     }
1692     @@ -1446,16 +1535,160 @@
1693     }
1694     }
1695    
1696     - emms();
1697     + _mm_empty();
1698     + return TRUE;
1699     +}
1700     +
1701     +Bool
1702     +fbCopyAreammx (DrawablePtr pSrc,
1703     + DrawablePtr pDst,
1704     + int src_x,
1705     + int src_y,
1706     + int dst_x,
1707     + int dst_y,
1708     + int width,
1709     + int height)
1710     +{
1711     + FbBits * src_bits;
1712     + FbStride src_stride;
1713     + int src_bpp;
1714     + int src_xoff;
1715     + int src_yoff;
1716     +
1717     + FbBits * dst_bits;
1718     + FbStride dst_stride;
1719     + int dst_bpp;
1720     + int dst_xoff;
1721     + int dst_yoff;
1722     +
1723     + CARD8 * src_bytes;
1724     + CARD8 * dst_bytes;
1725     + int byte_width;
1726     +
1727     + fbGetDrawable(pSrc, src_bits, src_stride, src_bpp, src_xoff, src_yoff);
1728     + fbGetDrawable(pDst, dst_bits, dst_stride, dst_bpp, dst_xoff, dst_yoff);
1729     +
1730     + if (src_bpp != 16 && src_bpp != 32)
1731     + return FALSE;
1732     +
1733     + if (dst_bpp != 16 && dst_bpp != 32)
1734     + return FALSE;
1735     +
1736     + if (src_bpp != dst_bpp)
1737     + {
1738     + return FALSE;
1739     + }
1740     +
1741     + if (src_bpp == 16)
1742     + {
1743     + src_stride = src_stride * sizeof (FbBits) / 2;
1744     + dst_stride = dst_stride * sizeof (FbBits) / 2;
1745     + src_bytes = (CARD8 *)(((CARD16 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff));
1746     + dst_bytes = (CARD8 *)(((CARD16 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff));
1747     + byte_width = 2 * width;
1748     + src_stride *= 2;
1749     + dst_stride *= 2;
1750     + }
1751     + else
1752     + {
1753     + src_stride = src_stride * sizeof (FbBits) / 4;
1754     + dst_stride = dst_stride * sizeof (FbBits) / 4;
1755     + src_bytes = (CARD8 *)(((CARD32 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff));
1756     + dst_bytes = (CARD8 *)(((CARD32 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff));
1757     + byte_width = 4 * width;
1758     + src_stride *= 4;
1759     + dst_stride *= 4;
1760     + }
1761     +
1762     + while (height--)
1763     + {
1764     + int w;
1765     + CARD8 *s = src_bytes;
1766     + CARD8 *d = dst_bytes;
1767     + src_bytes += src_stride;
1768     + dst_bytes += dst_stride;
1769     + w = byte_width;
1770     +
1771     + while (w >= 2 && ((unsigned long)d & 3))
1772     + {
1773     + *(CARD16 *)d = *(CARD16 *)s;
1774     + w -= 2;
1775     + s += 2;
1776     + d += 2;
1777     + }
1778     +
1779     + while (w >= 4 && ((unsigned int)d & 7))
1780     + {
1781     + *(CARD32 *)d = *(CARD32 *)s;
1782     +
1783     + w -= 4;
1784     + s += 4;
1785     + d += 4;
1786     + }
1787     +
1788     + while (w >= 64)
1789     + {
1790     + *(__m64 *)(d + 0) = *(__m64 *)(s + 0);
1791     + *(__m64 *)(d + 8) = *(__m64 *)(s + 8);
1792     + *(__m64 *)(d + 16) = *(__m64 *)(s + 16);
1793     + *(__m64 *)(d + 24) = *(__m64 *)(s + 24);
1794     + *(__m64 *)(d + 32) = *(__m64 *)(s + 32);
1795     + *(__m64 *)(d + 40) = *(__m64 *)(s + 40);
1796     + *(__m64 *)(d + 48) = *(__m64 *)(s + 48);
1797     + *(__m64 *)(d + 56) = *(__m64 *)(s + 56);
1798     + w -= 64;
1799     + s += 64;
1800     + d += 64;
1801     + }
1802     + while (w >= 4)
1803     + {
1804     + *(CARD32 *)d = *(CARD32 *)s;
1805     +
1806     + w -= 4;
1807     + s += 4;
1808     + d += 4;
1809     + }
1810     + if (w >= 2)
1811     + {
1812     + *(CARD16 *)d = *(CARD16 *)s;
1813     + w -= 2;
1814     + s += 2;
1815     + d += 2;
1816     + }
1817     + }
1818     +
1819     + _mm_empty();
1820     return TRUE;
1821     }
1822    
1823     +void
1824     +fbCompositeCopyAreammx (CARD8 op,
1825     + PicturePtr pSrc,
1826     + PicturePtr pMask,
1827     + PicturePtr pDst,
1828     + INT16 xSrc,
1829     + INT16 ySrc,
1830     + INT16 xMask,
1831     + INT16 yMask,
1832     + INT16 xDst,
1833     + INT16 yDst,
1834     + CARD16 width,
1835     + CARD16 height)
1836     +{
1837     + fbCopyAreammx (pSrc->pDrawable,
1838     + pDst->pDrawable,
1839     + xSrc, ySrc,
1840     + xDst, yDst,
1841     + width, height);
1842     +}
1843     +
1844     +#ifndef __amd64__
1845     Bool
1846     fbHaveMMX (void)
1847     {
1848     static Bool initialized = FALSE;
1849     static Bool mmx_present;
1850     -
1851     +
1852     if (!initialized)
1853     {
1854     int tmp; /* static variables are accessed through %ebx,
1855     @@ -1466,7 +1699,7 @@
1856    
1857     __asm__ __volatile__ (
1858     /* Check if bit 21 in flags word is writeable */
1859     -
1860     +
1861     "pusha \n\t"
1862     "pushfl \n\t"
1863     "popl %%eax \n\t"
1864     @@ -1502,13 +1735,14 @@
1865     : /* no input */);
1866    
1867     initialized = TRUE;
1868     -
1869     +
1870     mmx_present = tmp;
1871     }
1872    
1873     return mmx_present;
1874     }
1875     +#endif /* __amd64__ */
1876    
1877    
1878     #endif /* RENDER */
1879     -#endif /* USE_GCC34_MMX */
1880     +#endif /* USE_MMX */
1881     diff -ur xc-orig/programs/Xserver/fb/fbmmx.h xc/programs/Xserver/fb/fbmmx.h
1882     --- xc-orig/programs/Xserver/fb/fbmmx.h 2005-02-11 04:00:50.006092570 -0500
1883     +++ xc/programs/Xserver/fb/fbmmx.h 2005-02-11 04:01:32.072346126 -0500
1884     @@ -1,5 +1,5 @@
1885     /*
1886     - * Copyright © 2004 Red Hat, Inc.
1887     + * Copyright © 2004 Red Hat, Inc.
1888     *
1889     * Permission to use, copy, modify, distribute, and sell this software and its
1890     * documentation for any purpose is hereby granted without fee, provided that
1891     @@ -18,17 +18,23 @@
1892     * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
1893     * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1894     *
1895     - * Author: Søren Sandmann (sandmann@redhat.com)
1896     + * Author: Søren Sandmann (sandmann@redhat.com)
1897     *
1898     * Based on work by Owen Taylor
1899     */
1900     -#ifdef USE_GCC34_MMX
1901     +#ifdef USE_MMX
1902     +
1903     +#ifndef __amd64__
1904     Bool fbHaveMMX(void);
1905     #else
1906     -#define fbHaveMMX FALSE
1907     +#define fbHaveMMX() TRUE
1908     +#endif
1909     +
1910     +#else
1911     +#define fbHaveMMX() FALSE
1912     #endif
1913    
1914     -#ifdef USE_GCC34_MMX
1915     +#ifdef USE_MMX
1916    
1917     void fbCompositeSolidMask_nx8888x0565Cmmx (CARD8 op,
1918     PicturePtr pSrc,
1919     @@ -150,6 +156,38 @@
1920     INT16 yDst,
1921     CARD16 width,
1922     CARD16 height);
1923     +void fbCompositeSrc_8888x8x8888mmx (CARD8 op,
1924     + PicturePtr pSrc,
1925     + PicturePtr pMask,
1926     + PicturePtr pDst,
1927     + INT16 xSrc,
1928     + INT16 ySrc,
1929     + INT16 xMask,
1930     + INT16 yMask,
1931     + INT16 xDst,
1932     + INT16 yDst,
1933     + CARD16 width,
1934     + CARD16 height);
1935     +Bool fbCopyAreammx (DrawablePtr pSrc,
1936     + DrawablePtr pDst,
1937     + int src_x,
1938     + int src_y,
1939     + int dst_x,
1940     + int dst_y,
1941     + int width,
1942     + int height);
1943     +void fbCompositeCopyAreammx (CARD8 op,
1944     + PicturePtr pSrc,
1945     + PicturePtr pMask,
1946     + PicturePtr pDst,
1947     + INT16 xSrc,
1948     + INT16 ySrc,
1949     + INT16 xMask,
1950     + INT16 yMask,
1951     + INT16 xDst,
1952     + INT16 yDst,
1953     + CARD16 width,
1954     + CARD16 height);
1955     Bool fbSolidFillmmx (DrawablePtr pDraw,
1956     int x,
1957     int y,
1958     @@ -157,4 +195,4 @@
1959     int height,
1960     FbBits xor);
1961    
1962     -#endif /* USE_GCC34_MMX */
1963     +#endif /* USE_MMX */
1964    
1965     diff -ur xc-orig/programs/Xserver/fb/fbpict.c xc/programs/Xserver/fb/fbpict.c
1966     --- xc-orig/programs/Xserver/fb/fbpict.c 2005-02-11 04:00:50.007092600 -0500
1967     +++ xc/programs/Xserver/fb/fbpict.c 2005-02-11 04:01:32.075346216 -0500
1968     @@ -1,7 +1,7 @@
1969     /*
1970     * $XFree86: xc/programs/Xserver/fb/fbpict.c,v 1.15 2002/09/26 02:56:48 keithp Exp $
1971     *
1972     - * Copyright © 2000 SuSE, Inc.
1973     + * Copyright © 2000 SuSE, Inc.
1974     *
1975     * Permission to use, copy, modify, distribute, and sell this software and its
1976     * documentation for any purpose is hereby granted without fee, provided that
1977     @@ -863,6 +863,15 @@
1978     if (!pSrc->transform && !(pMask && pMask->transform))
1979     if (!maskAlphaMap && !srcAlphaMap && !dstAlphaMap)
1980     switch (op) {
1981     + case PictOpSrc:
1982     +#ifdef USE_MMX
1983     + if (!pMask && pSrc->format == pDst->format &&
1984     + pSrc->pDrawable != pDst->pDrawable)
1985     + {
1986     + func = fbCompositeCopyAreammx;
1987     + }
1988     +#endif
1989     + break;
1990     case PictOpOver:
1991     if (pMask)
1992     {
1993     @@ -877,7 +886,7 @@
1994     switch (pDst->format) {
1995     case PICT_r5g6b5:
1996     case PICT_b5g6r5:
1997     -#ifdef USE_GCC34_MMX
1998     +#ifdef USE_MMX
1999     if (fbHaveMMX())
2000     func = fbCompositeSolidMask_nx8x0565mmx;
2001     else
2002     @@ -892,7 +901,7 @@
2003     case PICT_x8r8g8b8:
2004     case PICT_a8b8g8r8:
2005     case PICT_x8b8g8r8:
2006     -#ifdef USE_GCC34_MMX
2007     +#ifdef USE_MMX
2008     if (fbHaveMMX())
2009     func = fbCompositeSolidMask_nx8x8888mmx;
2010     else
2011     @@ -906,7 +915,7 @@
2012     switch (pDst->format) {
2013     case PICT_a8r8g8b8:
2014     case PICT_x8r8g8b8:
2015     -#ifdef USE_GCC34_MMX
2016     +#ifdef USE_MMX
2017     if (fbHaveMMX())
2018     func = fbCompositeSolidMask_nx8888x8888Cmmx;
2019     else
2020     @@ -914,7 +923,7 @@
2021     func = fbCompositeSolidMask_nx8888x8888C;
2022     break;
2023     case PICT_r5g6b5:
2024     -#ifdef USE_GCC34_MMX
2025     +#ifdef USE_MMX
2026     if (fbHaveMMX())
2027     func = fbCompositeSolidMask_nx8888x0565Cmmx;
2028     else
2029     @@ -929,7 +938,7 @@
2030     switch (pDst->format) {
2031     case PICT_a8b8g8r8:
2032     case PICT_x8b8g8r8:
2033     -#ifdef USE_GCC34_MMX
2034     +#ifdef USE_MMX
2035     if (fbHaveMMX())
2036     func = fbCompositeSolidMask_nx8888x8888Cmmx;
2037     else
2038     @@ -937,7 +946,7 @@
2039     func = fbCompositeSolidMask_nx8888x8888C;
2040     break;
2041     case PICT_b5g6r5:
2042     -#ifdef USE_GCC34_MMX
2043     +#ifdef USE_MMX
2044     if (fbHaveMMX())
2045     func = fbCompositeSolidMask_nx8888x0565Cmmx;
2046     else
2047     @@ -970,6 +979,7 @@
2048     xSrc == xMask && ySrc == yMask &&
2049     !pMask->componentAlpha)
2050     {
2051     + /* source == mask: non-premultiplied data */
2052     switch (pSrc->format) {
2053     case PICT_x8b8g8r8:
2054     switch (pMask->format) {
2055     @@ -978,13 +988,13 @@
2056     switch (pDst->format) {
2057     case PICT_a8r8g8b8:
2058     case PICT_x8r8g8b8:
2059     -#ifdef USE_GCC34_MMX
2060     +#ifdef USE_MMX
2061     if (fbHaveMMX())
2062     func = fbCompositeSrc_8888RevNPx8888mmx;
2063     #endif
2064     break;
2065     case PICT_r5g6b5:
2066     -#ifdef USE_GCC34_MMX
2067     +#ifdef USE_MMX
2068     if (fbHaveMMX())
2069     func = fbCompositeSrc_8888RevNPx0565mmx;
2070     #endif
2071     @@ -1000,13 +1010,13 @@
2072     switch (pDst->format) {
2073     case PICT_a8b8g8r8:
2074     case PICT_x8b8g8r8:
2075     -#ifdef USE_GCC34_MMX
2076     +#ifdef USE_MMX
2077     if (fbHaveMMX())
2078     func = fbCompositeSrc_8888RevNPx8888mmx;
2079     #endif
2080     break;
2081     case PICT_r5g6b5:
2082     -#ifdef USE_GCC34_MMX
2083     +#ifdef USE_MMX
2084     if (fbHaveMMX())
2085     func = fbCompositeSrc_8888RevNPx0565mmx;
2086     #endif
2087     @@ -1018,9 +1028,27 @@
2088     }
2089     break;
2090     }
2091     + else
2092     + {
2093     + /* non-repeating source, repeating mask => translucent window */
2094     + if (maskRepeat &&
2095     + pMask->pDrawable->width == 1 &&
2096     + pMask->pDrawable->height == 1)
2097     + {
2098     + if (pSrc->format == PICT_x8r8g8b8 &&
2099     + pDst->format == PICT_x8r8g8b8 &&
2100     + pMask->format == PICT_a8)
2101     + {
2102     +#ifdef USE_MMX
2103     + if (fbHaveMMX())
2104     + func = fbCompositeSrc_8888x8x8888mmx;
2105     +#endif
2106     + }
2107     + }
2108     + }
2109     }
2110     }
2111     - else
2112     + else /* no mask */
2113     {
2114     if (srcRepeat &&
2115     pSrc->pDrawable->width == 1 &&
2116     @@ -1032,7 +1060,7 @@
2117     switch (pDst->format) {
2118     case PICT_a8r8g8b8:
2119     case PICT_x8r8g8b8:
2120     -#ifdef USE_GCC34_MMX
2121     +#ifdef USE_MMX
2122     if (fbHaveMMX())
2123     {
2124     srcRepeat = FALSE;
2125     @@ -1041,7 +1069,7 @@
2126     #endif
2127     break;
2128     case PICT_r5g6b5:
2129     -#ifdef USE_GCC34_MMX
2130     +#ifdef USE_MMX
2131     if (fbHaveMMX())
2132     {
2133     srcRepeat = FALSE;
2134     @@ -1070,6 +1098,27 @@
2135     break;
2136     }
2137     break;
2138     + case PICT_x8r8g8b8:
2139     + switch (pDst->format) {
2140     + case PICT_a8r8g8b8:
2141     + case PICT_x8r8g8b8:
2142     +#ifdef USE_MMX
2143     + if (fbHaveMMX())
2144     + func = fbCompositeCopyAreammx;
2145     +#endif
2146     + break;
2147     + }
2148     + case PICT_x8b8g8r8:
2149     + switch (pDst->format) {
2150     + case PICT_a8b8g8r8:
2151     + case PICT_x8b8g8r8:
2152     +#ifdef USE_MMX
2153     + if (fbHaveMMX())
2154     + func = fbCompositeCopyAreammx;
2155     +#endif
2156     + break;
2157     + }
2158     + break;
2159     case PICT_a8b8g8r8:
2160     switch (pDst->format) {
2161     case PICT_a8b8g8r8:
2162     @@ -1109,7 +1158,7 @@
2163     case PICT_a8r8g8b8:
2164     switch (pDst->format) {
2165     case PICT_a8r8g8b8:
2166     -#ifdef USE_GCC34_MMX
2167     +#ifdef USE_MMX
2168     if (fbHaveMMX())
2169     func = fbCompositeSrcAdd_8888x8888mmx;
2170     else
2171     @@ -1121,7 +1170,7 @@
2172     case PICT_a8b8g8r8:
2173     switch (pDst->format) {
2174     case PICT_a8b8g8r8:
2175     -#ifdef USE_GCC34_MMX
2176     +#ifdef USE_MMX
2177     if (fbHaveMMX())
2178     func = fbCompositeSrcAdd_8888x8888mmx;
2179     else
2180     @@ -1133,7 +1182,7 @@
2181     case PICT_a8:
2182     switch (pDst->format) {
2183     case PICT_a8:
2184     -#ifdef USE_GCC34_MMX
2185     +#ifdef USE_MMX
2186     if (fbHaveMMX())
2187     func = fbCompositeSrcAdd_8000x8000mmx;
2188     else