Magellan Linux

Contents of /trunk/vnc/patches/9914_all_6.8.2-mmx-gcc4.patch



Revision 153
Tue May 8 20:52:56 2007 UTC by niro
File size: 52896 bytes
-import
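
Note: this patch ports the Xserver fb render fast paths from the GCC 3.4-only __builtin_ia32_* vector builtins (guarded by USE_GCC34_MMX) to the portable MMX/SSE intrinsics from <mmintrin.h> and <xmmintrin.h> (guarded by USE_MMX/USE_SSE), so the code also builds with GCC 4 and on AMD64. The sketch below is a minimal, self-contained illustration of the rewrite style and is not part of the patch; the name pix_multiply_sketch and the test values are illustrative. It shows the per-16-bit-lane (a * b + 0x80) >> 8 pattern that pix_multiply implements, which approximates a * b / 255:

    /* compile with: gcc -mmmx sketch.c */
    #include <mmintrin.h>
    #include <stdio.h>

    /* (a * b + 0x80) >> 8 in each 16-bit lane; the patch rewrites the
       __builtin_ia32_pmullw/paddw/psrlw version of this into intrinsics */
    static __m64
    pix_multiply_sketch (__m64 a, __m64 b)
    {
        __m64 res = _mm_mullo_pi16 (a, b);                /* was __builtin_ia32_pmullw */
        res = _mm_add_pi16 (res, _mm_set1_pi16 (0x0080)); /* was __builtin_ia32_paddw  */
        return _mm_srli_pi16 (res, 8);                    /* was __builtin_ia32_psrlw  */
    }

    int
    main (void)
    {
        __m64 a = _mm_set1_pi16 (0x00ff);   /* opaque alpha in all four lanes */
        __m64 b = _mm_set1_pi16 (0x0080);   /* 50% component value            */
        unsigned long long r =
            (unsigned long long) _mm_cvtm64_si64 (pix_multiply_sketch (a, b));
        _mm_empty ();   /* leave MMX state before FPU use, as the patch
                           does with _mm_empty() in place of raw emms    */
        printf ("%016llx\n", r);            /* prints 0080008000800080 */
        return 0;
    }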

diff -ur xc-orig/programs/Xserver/fb/Imakefile xc/programs/Xserver/fb/Imakefile
--- xc-orig/programs/Xserver/fb/Imakefile 2005-02-11 04:00:50.004092510 -0500
+++ xc/programs/Xserver/fb/Imakefile 2005-02-11 04:01:32.059345739 -0500
@@ -3,13 +3,22 @@
XCOMM
XCOMM Id: Imakefile,v 1.1 1999/11/02 03:54:44 keithp Exp $

-#if defined(i386Architecture) && defined(HasGcc34) && HasGcc34
+#if defined(HasGcc34) && HasGcc34
MMXOPTIONS= -mmmx -Winline --param inline-unit-growth=10000 \
- --param large-function-growth=10000 -DUSE_GCC34_MMX
+ --param large-function-growth=10000 -DUSE_MMX
+SSEOPTIONS= $(MMXOPTIONS) -msse -DUSE_SSE

+#if defined(i386Architecture)
SpecialCObjectRule(fbmmx,fbmmx.c,$(MMXOPTIONS))
+#elif defined(AMD64Architecture)
+SpecialCObjectRule(fbmmx,fbmmx.c,$(SSEOPTIONS))
+#endif
+
+#if defined(i386Architecture) || defined(AMD64Architecture)
SpecialCObjectRule(fbpict,fbpict.c,$(MMXOPTIONS))
SpecialCObjectRule(fbfill,fbfill.c,$(MMXOPTIONS))
+SpecialCObjectRule(fbcopy,fbcopy.c,$(MMXOPTIONS))
+#endif

#endif

diff -ur xc-orig/programs/Xserver/fb/fbcompose.c xc/programs/Xserver/fb/fbcompose.c
--- xc-orig/programs/Xserver/fb/fbcompose.c 2005-02-11 04:00:50.009092659 -0500
+++ xc/programs/Xserver/fb/fbcompose.c 2005-02-11 04:01:32.067345977 -0500
@@ -1,8 +1,8 @@
/*
- * $XdotOrg: xc/programs/Xserver/fb/fbcompose.c,v 1.3 2004/05/12 01:49:46 anholt Exp $
+ * $XdotOrg: xc/programs/Xserver/fb/fbcompose.c,v 1.5 2005/01/13 20:49:21 sandmann Exp $
* $XFree86: xc/programs/Xserver/fb/fbcompose.c,v 1.17tsi Exp $
*
- * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
@@ -2693,7 +2693,6 @@
op->u.transform.y = y - op->u.transform.top_y;
}

-
Bool
fbBuildCompositeOperand (PicturePtr pPict,
FbCompositeOperand op[4],
@@ -2710,7 +2709,6 @@

op->u.transform.top_y = pPict->pDrawable->y;
op->u.transform.left_x = pPict->pDrawable->x;
-
op->u.transform.start_x = x - op->u.transform.left_x;
op->u.transform.x = op->u.transform.start_x;
op->u.transform.y = y - op->u.transform.top_y;
@@ -2822,6 +2820,21 @@
FbCombineFunc f;
int w;

+#if 0
+ ErrorF ("op: %d\n"
+ "src format: %lx\n"
+ "msk format %lx\n"
+ "dst format %lx\n"
+ "width: %d\n"
+ "height %d\n",
+ op,
+ pSrc? pSrc->format : 0,
+ pMask? pMask->format : 0,
+ pDst? pDst->format : 0,
+ width, height);
+ ErrorF ("PICT_x8r8g8b8: %lx\n", PICT_x8r8g8b8);
+#endif
+
if (!fbBuildCompositeOperand (pSrc, src, xSrc, ySrc, TRUE, TRUE))
return;
if (!fbBuildCompositeOperand (pDst, dst, xDst, yDst, FALSE, TRUE))
diff -ur xc-orig/programs/Xserver/fb/fbcopy.c xc/programs/Xserver/fb/fbcopy.c
--- xc-orig/programs/Xserver/fb/fbcopy.c 2005-02-11 04:00:50.004092510 -0500
+++ xc/programs/Xserver/fb/fbcopy.c 2005-02-11 04:01:32.068346007 -0500
@@ -1,7 +1,7 @@
/*
* Id: fbcopy.c,v 1.1 1999/11/02 03:54:45 keithp Exp $
*
- * Copyright © 1998 Keith Packard
+ * Copyright © 1998 Keith Packard
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
@@ -27,6 +27,7 @@
#ifdef IN_MODULE
#include "xf86_ansic.h"
#endif
+#include "fbmmx.h"

void
fbCopyNtoN (DrawablePtr pSrcDrawable,
@@ -54,28 +55,51 @@

fbGetDrawable (pSrcDrawable, src, srcStride, srcBpp, srcXoff, srcYoff);
fbGetDrawable (pDstDrawable, dst, dstStride, dstBpp, dstXoff, dstYoff);
-
+
while (nbox--)
{
+#ifdef USE_MMX
+ if (!reverse && !upsidedown && fbHaveMMX())
+ {
+ if (!fbCopyAreammx (pSrcDrawable,
+ pDstDrawable,
+
+ (pbox->x1 + dx + srcXoff),
+ (pbox->y1 + dy + srcYoff),
+
+ (pbox->x1 + dstXoff),
+ (pbox->y1 + dstYoff),
+
+ (pbox->x2 - pbox->x1),
+ (pbox->y2 - pbox->y1)))
+ goto fallback;
+ else
+ goto next;
+ }
+ fallback:
+#endif
fbBlt (src + (pbox->y1 + dy + srcYoff) * srcStride,
srcStride,
(pbox->x1 + dx + srcXoff) * srcBpp,
-
+
dst + (pbox->y1 + dstYoff) * dstStride,
dstStride,
(pbox->x1 + dstXoff) * dstBpp,
-
+
(pbox->x2 - pbox->x1) * dstBpp,
(pbox->y2 - pbox->y1),
-
+
alu,
pm,
dstBpp,
-
+
reverse,
upsidedown);
+#ifdef USE_MMX
+ next:
+#endif
pbox++;
- }
+ }
}

void
@@ -594,7 +618,7 @@
int yOut)
{
fbCopyProc copy;
-
+
#ifdef FB_24_32BIT
if (pSrcDrawable->bitsPerPixel != pDstDrawable->bitsPerPixel)
copy = fb24_32CopyMtoN;
diff -ur xc-orig/programs/Xserver/fb/fbfill.c xc/programs/Xserver/fb/fbfill.c
--- xc-orig/programs/Xserver/fb/fbfill.c 2005-02-11 04:00:50.006092570 -0500
+++ xc/programs/Xserver/fb/fbfill.c 2005-02-11 04:01:32.069346037 -0500
@@ -1,7 +1,7 @@
/*
* Id: fbfill.c,v 1.1 1999/11/02 03:54:45 keithp Exp $
*
- * Copyright © 1998 Keith Packard
+ * Copyright © 1998 Keith Packard
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
@@ -44,7 +44,7 @@

switch (pGC->fillStyle) {
case FillSolid:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
if (!pPriv->and && fbHaveMMX())
if (fbSolidFillmmx (pDrawable, x, y, width, height, pPriv->xor))
return;

diff -ur xc-orig/programs/Xserver/fb/fbmmx.c xc/programs/Xserver/fb/fbmmx.c
--- xc-orig/programs/Xserver/fb/fbmmx.c 2005-02-11 04:00:50.006092570 -0500
+++ xc/programs/Xserver/fb/fbmmx.c 2005-02-11 04:01:32.072346126 -0500
@@ -1,5 +1,6 @@
/*
- * Copyright © 2004 Red Hat, Inc.
+ * Copyright © 2004 Red Hat, Inc.
+ * Copyright © 2004 Nicholas Miell
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
@@ -18,14 +19,23 @@
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
- * Author: Søren Sandmann (sandmann@redhat.com)
- *
+ * Author: Søren Sandmann (sandmann@redhat.com)
+ * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
+ *
* Based on work by Owen Taylor
*/

+
+#ifdef USE_MMX
+
#include "fb.h"
+#include "fbmmx.h"
+
+#include <mmintrin.h>

-#ifdef USE_GCC34_MMX
+#ifdef USE_SSE
+#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
+#endif

#ifdef RENDER

@@ -33,11 +43,6 @@
#include "mipict.h"
#include "fbpict.h"

-typedef int Vector1x64 __attribute__ ((mode(DI)));
-typedef int Vector2x32 __attribute__ ((mode(V2SI)));
-typedef int Vector4x16 __attribute__ ((mode(V4HI)));
-typedef int Vector8x8 __attribute__ ((mode(V8QI)));
-
typedef unsigned long long ullong;

#define noVERBOSE
@@ -50,7 +55,6 @@

typedef struct
{
- ullong mmx_zero;
ullong mmx_4x00ff;
ullong mmx_4x0080;
ullong mmx_565_rgb;
@@ -70,7 +74,6 @@

static const MMXData c =
{
- .mmx_zero = 0x0000000000000000ULL,
.mmx_4x00ff = 0x00ff00ff00ff00ffULL,
.mmx_4x0080 = 0x0080008000800080ULL,
.mmx_565_rgb = 0x000001f0003f001fULL,
@@ -88,121 +91,112 @@
.mmx_000000000000ffff = 0x000000000000ffffULL,
};

-static __inline__ Vector1x64
-shift (Vector1x64 v, int s)
+#define MC(x) ((__m64) c.mmx_##x)
+
+static __inline__ __m64
+shift (__m64 v, int s)
{
if (s > 0)
- return __builtin_ia32_psllq (v, s);
+ return _mm_slli_si64 (v, s);
else if (s < 0)
- return __builtin_ia32_psrlq (v, -s);
+ return _mm_srli_si64 (v, -s);
else
return v;
}

-static __inline__ Vector4x16
-negate (Vector4x16 mask)
+static __inline__ __m64
+negate (__m64 mask)
{
- return (Vector4x16)__builtin_ia32_pxor (
- (Vector1x64)mask,
- (Vector1x64)c.mmx_4x00ff);
+ return _mm_xor_si64 (mask, MC(4x00ff));
}

-static __inline__ Vector4x16
-pix_multiply (Vector4x16 a, Vector4x16 b)
+static __inline__ __m64
+pix_multiply (__m64 a, __m64 b)
{
- Vector4x16 res;
+ __m64 res;

- res = __builtin_ia32_pmullw (a, b);
- res = __builtin_ia32_paddw (res, (Vector4x16)c.mmx_4x0080);
- res = __builtin_ia32_psrlw (res, 8);
+ res = _mm_mullo_pi16 (a, b);
+ res = _mm_add_pi16 (res, MC(4x0080));
+ res = _mm_srli_pi16 (res, 8);

return res;
}

-#if 0
+#ifdef USE_SSE
#define HAVE_PSHUFW
#endif

#ifdef HAVE_PSHUFW

-static __inline__ Vector4x16
-expand_alpha (Vector4x16 pixel)
+static __inline__ __m64
+expand_alpha (__m64 pixel)
{
- Vector4x16 result;
- __asm__ ("pshufw $0xFF, %1, %0\n\t" : "=y" (result) : "y" (pixel));
- return result;
+ return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 3, 3, 3));
}

-static __inline__ Vector4x16
-expand_alpha_rev (Vector4x16 pixel)
+static __inline__ __m64
+expand_alpha_rev (__m64 pixel)
{
- Vector4x16 result;
- __asm__ ("pshufw $0x00, %1, %0\n\t" : "=y" (result) : "y" (pixel));
- return result;
+ return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(0, 0, 0, 0));
}

-static __inline__ Vector4x16
-invert_colors (Vector4x16 pixel)
+static __inline__ __m64
+invert_colors (__m64 pixel)
{
- Vector4x16 result;
-
- /* 0xC6 = 11000110 */
- /* 3 0 1 2 */
-
- __asm__ ("pshufw $0xC6, %1, %0\n\t" : "=y" (result) : "y" (pixel));
-
- return result;
+ return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 0, 1, 2));
}

#else

-static __inline__ Vector4x16
-expand_alpha (Vector4x16 pixel)
+static __inline__ __m64
+expand_alpha (__m64 pixel)
{
- Vector1x64 t1, t2;
-
- t1 = shift ((Vector1x64)pixel, -48);
+ __m64 t1, t2;
+
+ t1 = shift (pixel, -48);
t2 = shift (t1, 16);
- t1 = __builtin_ia32_por (t1, t2);
+ t1 = _mm_or_si64 (t1, t2);
t2 = shift (t1, 32);
- t1 = __builtin_ia32_por (t1, t2);
-
- return (Vector4x16)t1;
+ t1 = _mm_or_si64 (t1, t2);
+
+ return t1;
}

-static __inline__ Vector4x16
-expand_alpha_rev (Vector4x16 pixel)
+static __inline__ __m64
+expand_alpha_rev (__m64 pixel)
{
- Vector1x64 t1, t2;
-
- t1 = shift ((Vector1x64)pixel, 48);
+ __m64 t1, t2;
+
+ /* move alpha to low 16 bits and zero the rest */
+ t1 = shift (pixel, 48);
t1 = shift (t1, -48);
+
t2 = shift (t1, 16);
- t1 = __builtin_ia32_por (t1, t2);
+ t1 = _mm_or_si64 (t1, t2);
t2 = shift (t1, 32);
- t1 = __builtin_ia32_por (t1, t2);
-
- return (Vector4x16)t1;
+ t1 = _mm_or_si64 (t1, t2);
+
+ return t1;
}

-static __inline__ Vector4x16
-invert_colors (Vector4x16 pixel)
+static __inline__ __m64
+invert_colors (__m64 pixel)
{
- Vector1x64 x, y, z;
-
- x = y = z = (Vector1x64)pixel;
-
- x = __builtin_ia32_pand (x, (Vector1x64)c.mmx_ffff0000ffff0000);
- y = __builtin_ia32_pand (y, (Vector1x64)c.mmx_000000000000ffff);
- z = __builtin_ia32_pand (z, (Vector1x64)c.mmx_0000ffff00000000);
-
+ __m64 x, y, z;
+
+ x = y = z = pixel;
+
+ x = _mm_and_si64 (x, MC(ffff0000ffff0000));
+ y = _mm_and_si64 (y, MC(000000000000ffff));
+ z = _mm_and_si64 (z, MC(0000ffff00000000));
+
y = shift (y, 32);
z = shift (z, -32);
-
- x = __builtin_ia32_por (x, y);
- x = __builtin_ia32_por (x, z);
-
- return (Vector4x16)x;
+
+ x = _mm_or_si64 (x, y);
+ x = _mm_or_si64 (x, z);
+
+ return x;
}

#endif
@@ -210,147 +204,138 @@
/* Notes about writing mmx code
*
* give memory operands as the second operand. If you give it as the
- * first, gcc will first load it into a register, then use that register
+ * first, gcc will first load it into a register, then use that
+ * register
*
* ie. use
*
- * __builtin_pmullw (x, mmx_constant[8]);
+ * _mm_mullo_pi16 (x, mmx_constant);
*
* not
*
- * __builtin_pmullw (mmx_constant[8], x);
+ * _mm_mullo_pi16 (mmx_constant, x);
*
- * Also try to minimize dependencies. Ie. when you need a value, try to calculate
- * it from a value that was calculated as early as possible.
+ * Also try to minimize dependencies. i.e. when you need a value, try
+ * to calculate it from a value that was calculated as early as
+ * possible.
*/

-static __inline__ Vector4x16
-over (Vector4x16 src, Vector4x16 srca, Vector4x16 dest)
+static __inline__ __m64
+over (__m64 src, __m64 srca, __m64 dest)
{
- return (Vector4x16)__builtin_ia32_paddusb ((Vector8x8)src, (Vector8x8)pix_multiply(dest, negate(srca)));
+ return _mm_adds_pu8 (src, pix_multiply(dest, negate(srca)));
}

-static __inline__ Vector4x16
-over_rev_non_pre (Vector4x16 src, Vector4x16 dest)
+static __inline__ __m64
+over_rev_non_pre (__m64 src, __m64 dest)
{
- Vector4x16 srca = expand_alpha (src);
- Vector4x16 srcfaaa = (Vector4x16)__builtin_ia32_por((Vector1x64)srca, (Vector1x64)c.mmx_full_alpha);
-
+ __m64 srca = expand_alpha (src);
+ __m64 srcfaaa = _mm_or_si64 (srca, MC(full_alpha));
+
return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest);
}

-static __inline__ Vector4x16
-in (Vector4x16 src,
- Vector4x16 mask)
+static __inline__ __m64
+in (__m64 src,
+ __m64 mask)
{
return pix_multiply (src, mask);
}

-static __inline__ Vector4x16
-in_over (Vector4x16 src,
- Vector4x16 srca,
- Vector4x16 mask,
- Vector4x16 dest)
+static __inline__ __m64
+in_over (__m64 src,
+ __m64 srca,
+ __m64 mask,
+ __m64 dest)
{
return over(in(src, mask), pix_multiply(srca, mask), dest);
}

-static __inline__ Vector8x8
-cvt32to64 (CARD32 v)
-{
- ullong r = v;
- return (Vector8x8)r;
-}
-
-static __inline__ Vector4x16
+static __inline__ __m64
load8888 (CARD32 v)
{
- return (Vector4x16)__builtin_ia32_punpcklbw (cvt32to64 (v),
- (Vector8x8)c.mmx_zero);
+ return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64());
}

-static __inline__ Vector8x8
-pack8888 (Vector4x16 lo, Vector4x16 hi)
+static __inline__ __m64
+pack8888 (__m64 lo, __m64 hi)
{
- Vector8x8 r;
- r = __builtin_ia32_packuswb ((Vector4x16)lo, (Vector4x16)hi);
+ __m64 r;
+ r = _mm_packs_pu16 (lo, hi);
return r;
}

-/* Expand 16 bits positioned at @pos (0-3) of a mmx register into 00RR00GG00BB
-
---- Expanding 565 in the low word ---
-
-m = (m << (32 - 3)) | (m << (16 - 5)) | m;
-m = m & (01f0003f001f);
-m = m * (008404100840);
-m = m >> 8;
-
-Note the trick here - the top word is shifted by another nibble to avoid
-it bumping into the middle word
-*/
-static __inline__ Vector4x16
-expand565 (Vector4x16 pixel, int pos)
+/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
+ *
+ * 00RR00GG00BB
+ *
+ * --- Expanding 565 in the low word ---
+ *
+ * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
+ * m = m & (01f0003f001f);
+ * m = m * (008404100840);
+ * m = m >> 8;
+ *
+ * Note the trick here - the top word is shifted by another nibble to
+ * avoid it bumping into the middle word
+ */
+static __inline__ __m64
+expand565 (__m64 pixel, int pos)
{
- Vector1x64 p = (Vector1x64)pixel;
+ __m64 p = pixel;
+ __m64 t1, t2;

/* move pixel to low 16 bit and zero the rest */
p = shift (shift (p, (3 - pos) * 16), -48);

- Vector1x64 t1 = shift (p, 36 - 11);
- Vector1x64 t2 = shift (p, 16 - 5);
+ t1 = shift (p, 36 - 11);
+ t2 = shift (p, 16 - 5);

- p = __builtin_ia32_por (t1, p);
- p = __builtin_ia32_por (t2, p);
- p = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_rgb);
+ p = _mm_or_si64 (t1, p);
+ p = _mm_or_si64 (t2, p);
+ p = _mm_and_si64 (p, MC(565_rgb));

- pixel = __builtin_ia32_pmullw ((Vector4x16)p, (Vector4x16)c.mmx_565_unpack_multiplier);
- return __builtin_ia32_psrlw (pixel, 8);
+ pixel = _mm_mullo_pi16 (p, MC(565_unpack_multiplier));
+ return _mm_srli_pi16 (pixel, 8);
}

-static __inline__ Vector4x16
-expand8888 (Vector4x16 in, int pos)
+static __inline__ __m64
+expand8888 (__m64 in, int pos)
{
if (pos == 0)
- return (Vector4x16)__builtin_ia32_punpcklbw ((Vector8x8)in, (Vector8x8)c.mmx_zero);
+ return _mm_unpacklo_pi8 (in, _mm_setzero_si64());
else
- return (Vector4x16)__builtin_ia32_punpckhbw ((Vector8x8)in, (Vector8x8)c.mmx_zero);
+ return _mm_unpackhi_pi8 (in, _mm_setzero_si64());
}

-static __inline__ Vector4x16
-pack565 (Vector4x16 pixel, Vector4x16 target, int pos)
+static __inline__ __m64
+pack565 (__m64 pixel, __m64 target, int pos)
{
- Vector1x64 p = (Vector1x64)pixel;
- Vector1x64 t = (Vector1x64)target;
- Vector1x64 r, g, b;
+ __m64 p = pixel;
+ __m64 t = target;
+ __m64 r, g, b;

- r = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_r);
- g = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_g);
- b = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_b);
+ r = _mm_and_si64 (p, MC(565_r));
+ g = _mm_and_si64 (p, MC(565_g));
+ b = _mm_and_si64 (p, MC(565_b));

r = shift (r, - (32 - 8) + pos * 16);
g = shift (g, - (16 - 3) + pos * 16);
b = shift (b, - (0 + 3) + pos * 16);
-
+
if (pos == 0)
- t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_0);
+ t = _mm_and_si64 (t, MC(mask_0));
else if (pos == 1)
- t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_1);
+ t = _mm_and_si64 (t, MC(mask_1));
else if (pos == 2)
- t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_2);
+ t = _mm_and_si64 (t, MC(mask_2));
else if (pos == 3)
- t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_3);
+ t = _mm_and_si64 (t, MC(mask_3));

- p = __builtin_ia32_por (r, t);
- p = __builtin_ia32_por (g, p);
+ p = _mm_or_si64 (r, t);
+ p = _mm_or_si64 (g, p);

- return (Vector4x16)__builtin_ia32_por (b, p);
-}
-
-static __inline__ void
-emms (void)
-{
- __asm__ __volatile__ ("emms");
+ return _mm_or_si64 (b, p);
}

void
@@ -371,8 +356,8 @@
CARD32 *dstLine, *dst;
CARD16 w;
FbStride dstStride;
- Vector4x16 vsrc, vsrca;
-
+ __m64 vsrc, vsrca;
+
CHECKPOINT();

fbComposeGetSolid(pSrc, src, pDst->format);
@@ -384,51 +369,52 @@

vsrc = load8888 (src);
vsrca = expand_alpha (vsrc);
-
+
while (height--)
{
dst = dstLine;
dstLine += dstStride;
w = width;
-
+
CHECKPOINT();

while (w && (unsigned long)dst & 7)
{
- *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), (Vector4x16)c.mmx_zero);
+ *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)),
+ _mm_setzero_si64());

w--;
dst++;
}
-
+
while (w >= 2)
{
- Vector4x16 vdest;
- Vector4x16 dest0, dest1;
-
- vdest = *(Vector4x16 *)dst;
+ __m64 vdest;
+ __m64 dest0, dest1;
+
+ vdest = *(__m64 *)dst;

dest0 = over(vsrc, vsrca, expand8888(vdest, 0));
dest1 = over(vsrc, vsrca, expand8888(vdest, 1));

- *(Vector8x8 *)dst = (Vector8x8)pack8888(dest0, dest1);
+ *(__m64 *)dst = pack8888(dest0, dest1);

dst += 2;
w -= 2;
}
-
+
CHECKPOINT();

while (w)
{
- *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), (Vector4x16)c.mmx_zero);
+ *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), _mm_setzero_si64());

w--;
dst++;
}
}

- emms();
+ _mm_empty();
}

void
@@ -449,8 +435,8 @@
CARD16 *dstLine, *dst;
CARD16 w;
FbStride dstStride;
- Vector4x16 vsrc, vsrca;
-
+ __m64 vsrc, vsrca;
+
CHECKPOINT();

fbComposeGetSolid(pSrc, src, pDst->format);
@@ -462,49 +448,49 @@

vsrc = load8888 (src);
vsrca = expand_alpha (vsrc);
-
+
while (height--)
{
dst = dstLine;
dstLine += dstStride;
w = width;
-
+
CHECKPOINT();

while (w && (unsigned long)dst & 7)
{
ullong d = *dst;
- Vector4x16 vdest = expand565 ((Vector4x16)d, 0);
+ __m64 vdest = expand565 ((__m64)d, 0);
vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
*dst = (ullong)vdest;

w--;
dst++;
}
-
+
while (w >= 4)
{
- Vector4x16 vdest;
-
- vdest = *(Vector4x16 *)dst;
+ __m64 vdest;
+
+ vdest = *(__m64 *)dst;

vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0);
vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1);
vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2);
vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3);

- *(Vector8x8 *)dst = (Vector8x8)vdest;
+ *(__m64 *)dst = vdest;

dst += 4;
w -= 4;
}
-
+
CHECKPOINT();

while (w)
{
ullong d = *dst;
- Vector4x16 vdest = expand565 ((Vector4x16)d, 0);
+ __m64 vdest = expand565 ((__m64)d, 0);
vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
*dst = (ullong)vdest;

@@ -513,7 +499,7 @@
}
}

- emms();
+ _mm_empty();
}

void
@@ -534,8 +520,8 @@
CARD32 *dstLine;
CARD32 *maskLine;
FbStride dstStride, maskStride;
- Vector4x16 vsrc, vsrca;
-
+ __m64 vsrc, vsrca;
+
CHECKPOINT();

fbComposeGetSolid(pSrc, src, pDst->format);
@@ -562,9 +548,9 @@

if (m)
{
- Vector4x16 vdest = load8888(*q);
+ __m64 vdest = load8888(*q);
vdest = in_over(vsrc, vsrca, load8888(m), vdest);
- *q = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero);
+ *q = (ullong)pack8888(vdest, _mm_setzero_si64());
}

twidth--;
@@ -580,15 +566,15 @@

if (m0 | m1)
{
- Vector4x16 dest0, dest1;
- Vector4x16 vdest = *(Vector4x16 *)q;
+ __m64 dest0, dest1;
+ __m64 vdest = *(__m64 *)q;

dest0 = in_over(vsrc, vsrca, load8888(m0),
expand8888 (vdest, 0));
dest1 = in_over(vsrc, vsrca, load8888(m1),
expand8888 (vdest, 1));

- *(Vector8x8 *)q = (Vector8x8)pack8888(dest0, dest1);
+ *(__m64 *)q = pack8888(dest0, dest1);
}

p += 2;
@@ -602,9 +588,9 @@

if (m)
{
- Vector4x16 vdest = load8888(*q);
+ __m64 vdest = load8888(*q);
vdest = in_over(vsrc, vsrca, load8888(m), vdest);
- *q = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero);
+ *q = (ullong)pack8888(vdest, _mm_setzero_si64());
}

twidth--;
@@ -616,7 +602,133 @@
maskLine += maskStride;
}

- emms();
+ _mm_empty();
+}
+
+void
+fbCompositeSrc_8888x8x8888mmx (CARD8 op,
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height)
+{
+ CARD32 *dstLine, *dst;
+ CARD32 *srcLine, *src;
+ CARD8 *maskLine;
+ CARD32 mask;
+ __m64 vmask;
+ FbStride dstStride, srcStride, maskStride;
+ CARD16 w;
+ __m64 srca;
+
+ CHECKPOINT();
+
+ fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
+ fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
+ fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
+
+ mask = *maskLine << 24 | *maskLine << 16 | *maskLine << 8 | *maskLine;
+ vmask = load8888 (mask);
+ srca = MC(4x00ff);
+
+ while (height--)
+ {
+ dst = dstLine;
+ dstLine += dstStride;
+ src = srcLine;
+ srcLine += srcStride;
+ w = width;
+
+ while (w && (unsigned long)dst & 7)
+ {
+ __m64 s = load8888 (*src);
+ __m64 d = load8888 (*dst);
+
+ *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64());
+
+ w--;
+ dst++;
+ src++;
+ }
+
+ while (w >= 16)
+ {
+ __m64 vd0 = *(__m64 *)(dst + 0);
+ __m64 vd1 = *(__m64 *)(dst + 2);
+ __m64 vd2 = *(__m64 *)(dst + 4);
+ __m64 vd3 = *(__m64 *)(dst + 6);
+ __m64 vd4 = *(__m64 *)(dst + 8);
+ __m64 vd5 = *(__m64 *)(dst + 10);
+ __m64 vd6 = *(__m64 *)(dst + 12);
+ __m64 vd7 = *(__m64 *)(dst + 14);
+
+ __m64 vs0 = *(__m64 *)(src + 0);
+ __m64 vs1 = *(__m64 *)(src + 2);
+ __m64 vs2 = *(__m64 *)(src + 4);
+ __m64 vs3 = *(__m64 *)(src + 6);
+ __m64 vs4 = *(__m64 *)(src + 8);
+ __m64 vs5 = *(__m64 *)(src + 10);
+ __m64 vs6 = *(__m64 *)(src + 12);
+ __m64 vs7 = *(__m64 *)(src + 14);
+
+ vd0 = (__m64)pack8888 (
+ in_over (expand8888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
+ in_over (expand8888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
+
+ vd1 = (__m64)pack8888 (
+ in_over (expand8888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
+ in_over (expand8888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
+
+ vd2 = (__m64)pack8888 (
+ in_over (expand8888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
+ in_over (expand8888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
+
+ vd3 = (__m64)pack8888 (
+ in_over (expand8888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
+ in_over (expand8888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
+
+ vd4 = (__m64)pack8888 (
+ in_over (expand8888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
+ in_over (expand8888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
+
+ vd5 = (__m64)pack8888 (
+ in_over (expand8888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
+ in_over (expand8888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
+
+ vd6 = (__m64)pack8888 (
+ in_over (expand8888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
+ in_over (expand8888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
+
+ vd7 = (__m64)pack8888 (
+ in_over (expand8888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
+ in_over (expand8888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
+
+ w -= 16;
+ dst += 16;
+ src += 16;
+ }
+
+ while (w)
+ {
+ __m64 s = load8888 (*src);
+ __m64 d = load8888 (*dst);
+
+ *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64());
+
+ w--;
+ dst++;
+ src++;
+ }
+ }
+
+ _mm_empty();
}

void
@@ -638,7 +750,7 @@
CARD8 *maskLine, *mask;
FbStride dstStride, maskStride;
CARD16 w;
- Vector4x16 vsrc, vsrca;
+ __m64 vsrc, vsrca;
ullong srcsrc;

CHECKPOINT();
@@ -648,7 +760,7 @@
srca = src >> 24;
if (srca == 0)
return;
-
+
srcsrc = (unsigned long long)src << 32 | src;

fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
@@ -664,7 +776,7 @@
mask = maskLine;
maskLine += maskStride;
w = width;
-
+
CHECKPOINT();

while (w && (unsigned long)dst & 7)
@@ -673,15 +785,15 @@

if (m)
{
- Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), load8888(*dst));
- *dst = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero);
+ __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), load8888(*dst));
+ *dst = (ullong)pack8888(vdest, _mm_setzero_si64());
}

w--;
mask++;
dst++;
}
-
+
CHECKPOINT();

while (w >= 2)
@@ -689,29 +801,29 @@
ullong m0, m1;
m0 = *mask;
m1 = *(mask + 1);
-
+
if (srca == 0xff && (m0 & m1) == 0xff)
{
*(unsigned long long *)dst = srcsrc;
}
else if (m0 | m1)
{
- Vector4x16 vdest;
- Vector4x16 dest0, dest1;
-
- vdest = *(Vector4x16 *)dst;
+ __m64 vdest;
+ __m64 dest0, dest1;
+
+ vdest = *(__m64 *)dst;

- dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m0), expand8888(vdest, 0));
- dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m1), expand8888(vdest, 1));
+ dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m0), expand8888(vdest, 0));
+ dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m1), expand8888(vdest, 1));

- *(Vector8x8 *)dst = (Vector8x8)pack8888(dest0, dest1);
+ *(__m64 *)dst = pack8888(dest0, dest1);
}

mask += 2;
dst += 2;
w -= 2;
}
-
+
CHECKPOINT();

while (w)
@@ -720,9 +832,9 @@

if (m)
{
- Vector4x16 vdest = load8888(*dst);
- vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), vdest);
- *dst = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero);
+ __m64 vdest = load8888(*dst);
+ vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), vdest);
+ *dst = (ullong)pack8888(vdest, _mm_setzero_si64());
}

w--;
@@ -731,7 +843,7 @@
}
}

- emms();
+ _mm_empty();
}


@@ -754,7 +866,7 @@
CARD8 *maskLine, *mask;
FbStride dstStride, maskStride;
CARD16 w;
- Vector4x16 vsrc, vsrca;
+ __m64 vsrc, vsrca;
unsigned long long srcsrcsrcsrc, src16;

CHECKPOINT();
@@ -770,9 +882,9 @@

vsrc = load8888 (src);
vsrca = expand_alpha (vsrc);
-
- src16 = (ullong)pack565(vsrc, (Vector4x16)c.mmx_zero, 0);
-
+
+ src16 = (ullong)pack565(vsrc, _mm_setzero_si64(), 0);
+
srcsrcsrcsrc = (ullong)src16 << 48 | (ullong)src16 << 32 |
(ullong)src16 << 16 | (ullong)src16;

@@ -783,7 +895,7 @@
mask = maskLine;
maskLine += maskStride;
w = width;
-
+
CHECKPOINT();

while (w && (unsigned long)dst & 7)
@@ -793,16 +905,16 @@
if (m)
{
ullong d = *dst;
- Vector4x16 vd = (Vector4x16)d;
- Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), expand565(vd, 0));
- *dst = (ullong)pack565(vdest, (Vector4x16)c.mmx_zero, 0);
+ __m64 vd = (__m64)d;
+ __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
+ *dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
}

w--;
mask++;
dst++;
}
-
+
CHECKPOINT();

while (w >= 4)
@@ -812,35 +924,35 @@
m1 = *(mask + 1);
m2 = *(mask + 2);
m3 = *(mask + 3);
-
+
if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
{
*(unsigned long long *)dst = srcsrcsrcsrc;
}
else if (m0 | m1 | m2 | m3)
{
- Vector4x16 vdest;
- Vector4x16 vm0, vm1, vm2, vm3;
-
- vdest = *(Vector4x16 *)dst;
-
- vm0 = (Vector4x16)m0;
+ __m64 vdest;
+ __m64 vm0, vm1, vm2, vm3;
+
+ vdest = *(__m64 *)dst;
+
+ vm0 = (__m64)m0;
vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0);
- vm1 = (Vector4x16)m1;
+ vm1 = (__m64)m1;
vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1);
- vm2 = (Vector4x16)m2;
+ vm2 = (__m64)m2;
vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2);
- vm3 = (Vector4x16)m3;
+ vm3 = (__m64)m3;
vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3);

- *(Vector4x16 *)dst = vdest;
+ *(__m64 *)dst = vdest;
}

w -= 4;
mask += 4;
dst += 4;
}
-
+
CHECKPOINT();

while (w)
@@ -850,9 +962,9 @@
if (m)
{
ullong d = *dst;
- Vector4x16 vd = (Vector4x16)d;
- Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), expand565(vd, 0));
- *dst = (ullong)pack565(vdest, (Vector4x16)c.mmx_zero, 0);
+ __m64 vd = (__m64)d;
+ __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
+ *dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
}

w--;
@@ -861,7 +973,7 @@
}
}

- emms();
+ _mm_empty();
}

void
@@ -887,9 +999,9 @@

fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
-
+
assert (pSrc->pDrawable == pMask->pDrawable);
-
+
while (height--)
{
dst = dstLine;
@@ -897,14 +1009,14 @@
src = srcLine;
srcLine += srcStride;
w = width;
-
+
CHECKPOINT();

while (w && (unsigned long)dst & 7)
{
- Vector4x16 vsrc = load8888 (*src);
+ __m64 vsrc = load8888 (*src);
ullong d = *dst;
- Vector4x16 vdest = expand565 ((Vector4x16)d, 0);
+ __m64 vdest = expand565 ((__m64)d, 0);

vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);

@@ -914,19 +1026,19 @@
dst++;
src++;
}
-
+
CHECKPOINT();

while (w >= 4)
{
CARD32 s0, s1, s2, s3;
unsigned char a0, a1, a2, a3;
-
+
s0 = *src;
s1 = *(src + 1);
s2 = *(src + 2);
s3 = *(src + 3);
-
+
a0 = (s0 >> 24);
a1 = (s1 >> 24);
a2 = (s2 >> 24);
@@ -934,38 +1046,38 @@

if ((a0 & a1 & a2 & a3) == 0xFF)
{
- Vector4x16 vdest;
- vdest = pack565(invert_colors(load8888(s0)), (Vector4x16)c.mmx_zero, 0);
+ __m64 vdest;
+ vdest = pack565(invert_colors(load8888(s0)), _mm_setzero_si64(), 0);
vdest = pack565(invert_colors(load8888(s1)), vdest, 1);
vdest = pack565(invert_colors(load8888(s2)), vdest, 2);
vdest = pack565(invert_colors(load8888(s3)), vdest, 3);
-
- *(Vector4x16 *)dst = vdest;
+
+ *(__m64 *)dst = vdest;
}
else if (a0 | a1 | a2 | a3)
{
- Vector4x16 vdest = *(Vector4x16 *)dst;
-
+ __m64 vdest = *(__m64 *)dst;
+
vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0);
vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1);
vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2);
vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3);
-
- *(Vector4x16 *)dst = vdest;
+
+ *(__m64 *)dst = vdest;
}

w -= 4;
dst += 4;
src += 4;
}
-
+
CHECKPOINT();

while (w)
{
- Vector4x16 vsrc = load8888 (*src);
+ __m64 vsrc = load8888 (*src);
ullong d = *dst;
- Vector4x16 vdest = expand565 ((Vector4x16)d, 0);
+ __m64 vdest = expand565 ((__m64)d, 0);

vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);

@@ -976,11 +1088,11 @@
src++;
}
}
-
- emms();
+
+ _mm_empty();
}

-/* "888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */
+/* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */

void
fbCompositeSrc_8888RevNPx8888mmx (CARD8 op,
@@ -1005,9 +1117,9 @@

fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
-
+
assert (pSrc->pDrawable == pMask->pDrawable);
-
+
while (height--)
{
dst = dstLine;
@@ -1015,28 +1127,28 @@
src = srcLine;
srcLine += srcStride;
w = width;
-
+
while (w && (unsigned long)dst & 7)
{
- Vector4x16 s = load8888 (*src);
- Vector4x16 d = load8888 (*dst);
+ __m64 s = load8888 (*src);
+ __m64 d = load8888 (*dst);

- *dst = (ullong)pack8888 (over_rev_non_pre (s, d), (Vector4x16)c.mmx_zero);
+ *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64());

w--;
dst++;
src++;
}
-
+
while (w >= 2)
{
ullong s0, s1;
unsigned char a0, a1;
- Vector4x16 d0, d1;
-
+ __m64 d0, d1;
+
s0 = *src;
s1 = *(src + 1);
-
+
a0 = (s0 >> 24);
a1 = (s1 >> 24);

@@ -1044,17 +1156,17 @@
{
d0 = invert_colors(load8888(s0));
d1 = invert_colors(load8888(s1));
-
- *(Vector8x8 *)dst = pack8888 (d0, d1);
+
+ *(__m64 *)dst = pack8888 (d0, d1);
}
else if (a0 | a1)
{
- Vector4x16 vdest = *(Vector4x16 *)dst;
-
+ __m64 vdest = *(__m64 *)dst;
+
d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0));
d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1));
-
- *(Vector8x8 *)dst = pack8888 (d0, d1);
+
+ *(__m64 *)dst = pack8888 (d0, d1);
}

w -= 2;
@@ -1064,18 +1176,18 @@

while (w)
{
- Vector4x16 s = load8888 (*src);
- Vector4x16 d = load8888 (*dst);
+ __m64 s = load8888 (*src);
+ __m64 d = load8888 (*dst);

- *dst = (ullong)pack8888 (over_rev_non_pre (s, d), (Vector4x16)c.mmx_zero);
+ *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64());

w--;
dst++;
src++;
}
}
-
- emms();
+
+ _mm_empty();
}

void
@@ -1096,7 +1208,7 @@
CARD16 *dstLine;
CARD32 *maskLine;
FbStride dstStride, maskStride;
- Vector4x16 vsrc, vsrca;
+ __m64 vsrc, vsrca;

CHECKPOINT();

@@ -1125,7 +1237,7 @@
if (m)
{
ullong d = *q;
- Vector4x16 vdest = expand565 ((Vector4x16)d, 0);
+ __m64 vdest = expand565 ((__m64)d, 0);
vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
*q = (ullong)vdest;
}
@@ -1146,14 +1258,14 @@

if ((m0 | m1 | m2 | m3))
{
- Vector4x16 vdest = *(Vector4x16 *)q;
+ __m64 vdest = *(__m64 *)q;

vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0);
vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1);
vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2);
vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3);

- *(Vector4x16 *)q = vdest;
+ *(__m64 *)q = vdest;
}
twidth -= 4;
p += 4;
@@ -1168,7 +1280,7 @@
if (m)
{
ullong d = *q;
- Vector4x16 vdest = expand565((Vector4x16)d, 0);
+ __m64 vdest = expand565((__m64)d, 0);
vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0);
*q = (ullong)vdest;
}
@@ -1182,7 +1294,7 @@
dstLine += dstStride;
}

- emms ();
+ _mm_empty ();
}

void
@@ -1210,7 +1322,7 @@

fbComposeGetStart (pSrc, xSrc, ySrc, CARD8, srcStride, srcLine, 1);
fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 1);
-
+
while (height--)
{
dst = dstLine;
@@ -1218,7 +1330,7 @@
src = srcLine;
srcLine += srcStride;
w = width;
-
+
while (w && (unsigned long)dst & 7)
{
s = *src;
@@ -1234,13 +1346,7 @@

while (w >= 8)
{
- __asm__ __volatile__ (
- "movq (%0), %%mm2\n\t"
- "movq (%1), %%mm3\n\t"
- "paddusb %%mm2, %%mm3\n\t"
- "movq %%mm3, (%1)\n\t"
- : /* no output */ : "r" (src), "r" (dst));
-
+ *(__m64*)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst);
dst += 8;
src += 8;
w -= 8;
@@ -1259,8 +1365,8 @@
w--;
}
}
-
- emms();
+
+ _mm_empty();
}

void
@@ -1297,13 +1403,8 @@

while (w && (unsigned long)dst & 7)
{
- __asm__ __volatile__ (
- "movd %0, %%mm2\n\t"
- "movd %1, %%mm3\n\t"
- "paddusb %%mm2, %%mm3\n\t"
- "movd %%mm3, %1\n\t"
- : /* no output */ : "m" (*src), "m" (*dst));
-
+ *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src),
+ _mm_cvtsi32_si64(*dst)));
dst++;
src++;
w--;
@@ -1311,13 +1412,7 @@

while (w >= 2)
{
- __asm__ __volatile__ (
- "movq (%0), %%mm2\n\t"
- "movq (%1), %%mm3\n\t"
- "paddusb %%mm2, %%mm3\n\t"
- "movq %%mm3, (%1)\n\t"
- : /* no output */ : "r" (src), "r" (dst));
-
+ *(ullong*)dst = (ullong) _mm_adds_pu8(*(__m64*)src, *(__m64*)dst);
dst += 2;
src += 2;
w -= 2;
@@ -1325,16 +1420,13 @@

if (w)
{
- __asm__ __volatile__ (
- "movd %0, %%mm2\n\t"
- "movd %1, %%mm3\n\t"
- "paddusb %%mm2, %%mm3\n\t"
- "movd %%mm3, %1\n\t"
- : /* no output */ : "m" (*src), "m" (*dst));
+ *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src),
+ _mm_cvtsi32_si64(*dst)));
+
}
}
-
- emms();
+
+ _mm_empty();
}

#define GetStart(drw,x,y,type,stride,line,bpp) {\
@@ -1358,19 +1450,19 @@
FbStride stride;
int bpp;
ullong fill;
- Vector8x8 vfill;
+ __m64 vfill;
CARD32 byte_width;
CARD8 *byte_line;
FbBits *bits;
int xoff, yoff;

CHECKPOINT();
-
+
fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff);
-
+
if (bpp == 16 && (xor >> 16 != (xor & 0xffff)))
return FALSE;
-
+
if (bpp != 16 && bpp != 32)
return FALSE;

@@ -1388,9 +1480,9 @@
byte_width = 4 * width;
stride *= 4;
}
-
+
fill = ((ullong)xor << 32) | xor;
- vfill = (Vector8x8)fill;
+ vfill = (__m64)fill;

while (height--)
{
@@ -1398,7 +1490,7 @@
CARD8 *d = byte_line;
byte_line += stride;
w = byte_width;
-
+
while (w >= 2 && ((unsigned long)d & 3))
{
*(CARD16 *)d = xor;
@@ -1406,35 +1498,32 @@
d += 2;
}

- while (w >= 4 && ((unsigned int)d & 7))
+ while (w >= 4 && ((unsigned long)d & 7))
{
*(CARD32 *)d = xor;
-
+
w -= 4;
d += 4;
}

while (w >= 64)
{
- __asm__ __volatile (
- "movq %0, (%1)\n\t"
- "movq %0, 8(%1)\n\t"
- "movq %0, 16(%1)\n\t"
- "movq %0, 24(%1)\n\t"
- "movq %0, 32(%1)\n\t"
- "movq %0, 40(%1)\n\t"
- "movq %0, 48(%1)\n\t"
- "movq %0, 56(%1)\n\t"
- : /* no output */
- : "y" (vfill), "r" (d)
- : "memory");
+ *(__m64*) (d + 0) = vfill;
+ *(__m64*) (d + 8) = vfill;
+ *(__m64*) (d + 16) = vfill;
+ *(__m64*) (d + 24) = vfill;
+ *(__m64*) (d + 32) = vfill;
+ *(__m64*) (d + 40) = vfill;
+ *(__m64*) (d + 48) = vfill;
+ *(__m64*) (d + 56) = vfill;
+
w -= 64;
d += 64;
}
while (w >= 4)
{
*(CARD32 *)d = xor;
-
+
w -= 4;
d += 4;
}
@@ -1446,16 +1535,160 @@
}
}

- emms();
+ _mm_empty();
+ return TRUE;
+}
+
+Bool
+fbCopyAreammx (DrawablePtr pSrc,
+ DrawablePtr pDst,
+ int src_x,
+ int src_y,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height)
+{
+ FbBits * src_bits;
+ FbStride src_stride;
+ int src_bpp;
+ int src_xoff;
+ int src_yoff;
+
+ FbBits * dst_bits;
+ FbStride dst_stride;
+ int dst_bpp;
+ int dst_xoff;
+ int dst_yoff;
+
+ CARD8 * src_bytes;
+ CARD8 * dst_bytes;
+ int byte_width;
+
+ fbGetDrawable(pSrc, src_bits, src_stride, src_bpp, src_xoff, src_yoff);
+ fbGetDrawable(pDst, dst_bits, dst_stride, dst_bpp, dst_xoff, dst_yoff);
+
+ if (src_bpp != 16 && src_bpp != 32)
+ return FALSE;
+
+ if (dst_bpp != 16 && dst_bpp != 32)
+ return FALSE;
+
+ if (src_bpp != dst_bpp)
+ {
+ return FALSE;
+ }
+
+ if (src_bpp == 16)
+ {
+ src_stride = src_stride * sizeof (FbBits) / 2;
+ dst_stride = dst_stride * sizeof (FbBits) / 2;
+ src_bytes = (CARD8 *)(((CARD16 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff));
+ dst_bytes = (CARD8 *)(((CARD16 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff));
+ byte_width = 2 * width;
+ src_stride *= 2;
+ dst_stride *= 2;
+ }
+ else
+ {
+ src_stride = src_stride * sizeof (FbBits) / 4;
+ dst_stride = dst_stride * sizeof (FbBits) / 4;
+ src_bytes = (CARD8 *)(((CARD32 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff));
+ dst_bytes = (CARD8 *)(((CARD32 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff));
+ byte_width = 4 * width;
+ src_stride *= 4;
+ dst_stride *= 4;
+ }
+
+ while (height--)
+ {
+ int w;
+ CARD8 *s = src_bytes;
+ CARD8 *d = dst_bytes;
+ src_bytes += src_stride;
+ dst_bytes += dst_stride;
+ w = byte_width;
+
+ while (w >= 2 && ((unsigned long)d & 3))
+ {
+ *(CARD16 *)d = *(CARD16 *)s;
+ w -= 2;
+ s += 2;
+ d += 2;
+ }
+
+ while (w >= 4 && ((unsigned int)d & 7))
+ {
+ *(CARD32 *)d = *(CARD32 *)s;
+
+ w -= 4;
+ s += 4;
+ d += 4;
+ }
+
+ while (w >= 64)
+ {
+ *(__m64 *)(d + 0) = *(__m64 *)(s + 0);
+ *(__m64 *)(d + 8) = *(__m64 *)(s + 8);
+ *(__m64 *)(d + 16) = *(__m64 *)(s + 16);
+ *(__m64 *)(d + 24) = *(__m64 *)(s + 24);
+ *(__m64 *)(d + 32) = *(__m64 *)(s + 32);
+ *(__m64 *)(d + 40) = *(__m64 *)(s + 40);
+ *(__m64 *)(d + 48) = *(__m64 *)(s + 48);
+ *(__m64 *)(d + 56) = *(__m64 *)(s + 56);
+ w -= 64;
+ s += 64;
+ d += 64;
+ }
+ while (w >= 4)
+ {
+ *(CARD32 *)d = *(CARD32 *)s;
+
+ w -= 4;
+ s += 4;
+ d += 4;
+ }
+ if (w >= 2)
+ {
+ *(CARD16 *)d = *(CARD16 *)s;
+ w -= 2;
+ s += 2;
+ d += 2;
+ }
+ }
+
+ _mm_empty();
return TRUE;
}

+void
+fbCompositeCopyAreammx (CARD8 op,
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height)
+{
+ fbCopyAreammx (pSrc->pDrawable,
+ pDst->pDrawable,
+ xSrc, ySrc,
+ xDst, yDst,
+ width, height);
+}
+
+#ifndef __amd64__
Bool
fbHaveMMX (void)
{
static Bool initialized = FALSE;
static Bool mmx_present;
-
+
if (!initialized)
{
int tmp; /* static variables are accessed through %ebx,
@@ -1466,7 +1699,7 @@

__asm__ __volatile__ (
/* Check if bit 21 in flags word is writeable */
-
+
"pusha \n\t"
"pushfl \n\t"
"popl %%eax \n\t"
@@ -1502,13 +1735,14 @@
: /* no input */);

initialized = TRUE;
-
+
mmx_present = tmp;
}

return mmx_present;
}
+#endif /* __amd64__ */


#endif /* RENDER */
-#endif /* USE_GCC34_MMX */
+#endif /* USE_MMX */
diff -ur xc-orig/programs/Xserver/fb/fbmmx.h xc/programs/Xserver/fb/fbmmx.h
--- xc-orig/programs/Xserver/fb/fbmmx.h 2005-02-11 04:00:50.006092570 -0500
+++ xc/programs/Xserver/fb/fbmmx.h 2005-02-11 04:01:32.072346126 -0500
@@ -1,5 +1,5 @@
/*
- * Copyright © 2004 Red Hat, Inc.
+ * Copyright © 2004 Red Hat, Inc.
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
@@ -18,17 +18,23 @@
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
- * Author: Søren Sandmann (sandmann@redhat.com)
+ * Author: Søren Sandmann (sandmann@redhat.com)
*
* Based on work by Owen Taylor
*/
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
+
+#ifndef __amd64__
Bool fbHaveMMX(void);
#else
-#define fbHaveMMX FALSE
+#define fbHaveMMX() TRUE
+#endif
+
+#else
+#define fbHaveMMX() FALSE
#endif

-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX

void fbCompositeSolidMask_nx8888x0565Cmmx (CARD8 op,
PicturePtr pSrc,
@@ -150,6 +156,38 @@
INT16 yDst,
CARD16 width,
CARD16 height);
+void fbCompositeSrc_8888x8x8888mmx (CARD8 op,
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+Bool fbCopyAreammx (DrawablePtr pSrc,
+ DrawablePtr pDst,
+ int src_x,
+ int src_y,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height);
+void fbCompositeCopyAreammx (CARD8 op,
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
Bool fbSolidFillmmx (DrawablePtr pDraw,
int x,
int y,
@@ -157,4 +195,4 @@
int height,
FbBits xor);

-#endif /* USE_GCC34_MMX */
+#endif /* USE_MMX */

diff -ur xc-orig/programs/Xserver/fb/fbpict.c xc/programs/Xserver/fb/fbpict.c
--- xc-orig/programs/Xserver/fb/fbpict.c 2005-02-11 04:00:50.007092600 -0500
+++ xc/programs/Xserver/fb/fbpict.c 2005-02-11 04:01:32.075346216 -0500
@@ -1,7 +1,7 @@
/*
* $XFree86: xc/programs/Xserver/fb/fbpict.c,v 1.15 2002/09/26 02:56:48 keithp Exp $
*
- * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2000 SuSE, Inc.
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
@@ -863,6 +863,15 @@
if (!pSrc->transform && !(pMask && pMask->transform))
if (!maskAlphaMap && !srcAlphaMap && !dstAlphaMap)
switch (op) {
+ case PictOpSrc:
+#ifdef USE_MMX
+ if (!pMask && pSrc->format == pDst->format &&
+ pSrc->pDrawable != pDst->pDrawable)
+ {
+ func = fbCompositeCopyAreammx;
+ }
+#endif
+ break;
case PictOpOver:
if (pMask)
{
@@ -877,7 +886,7 @@
switch (pDst->format) {
case PICT_r5g6b5:
case PICT_b5g6r5:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
if (fbHaveMMX())
func = fbCompositeSolidMask_nx8x0565mmx;
else
@@ -892,7 +901,7 @@
case PICT_x8r8g8b8:
case PICT_a8b8g8r8:
case PICT_x8b8g8r8:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
if (fbHaveMMX())
func = fbCompositeSolidMask_nx8x8888mmx;
else
@@ -906,7 +915,7 @@
switch (pDst->format) {
case PICT_a8r8g8b8:
case PICT_x8r8g8b8:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
if (fbHaveMMX())
func = fbCompositeSolidMask_nx8888x8888Cmmx;
else
@@ -914,7 +923,7 @@
func = fbCompositeSolidMask_nx8888x8888C;
break;
case PICT_r5g6b5:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
if (fbHaveMMX())
func = fbCompositeSolidMask_nx8888x0565Cmmx;
else
@@ -929,7 +938,7 @@
switch (pDst->format) {
case PICT_a8b8g8r8:
case PICT_x8b8g8r8:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
if (fbHaveMMX())
func = fbCompositeSolidMask_nx8888x8888Cmmx;
else
@@ -937,7 +946,7 @@
func = fbCompositeSolidMask_nx8888x8888C;
break;
case PICT_b5g6r5:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
if (fbHaveMMX())
func = fbCompositeSolidMask_nx8888x0565Cmmx;
else
@@ -970,6 +979,7 @@
xSrc == xMask && ySrc == yMask &&
!pMask->componentAlpha)
{
+ /* source == mask: non-premultiplied data */
switch (pSrc->format) {
case PICT_x8b8g8r8:
switch (pMask->format) {
@@ -978,13 +988,13 @@
switch (pDst->format) {
case PICT_a8r8g8b8:
case PICT_x8r8g8b8:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
if (fbHaveMMX())
func = fbCompositeSrc_8888RevNPx8888mmx;
#endif
break;
case PICT_r5g6b5:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
if (fbHaveMMX())
func = fbCompositeSrc_8888RevNPx0565mmx;
#endif
@@ -1000,13 +1010,13 @@
switch (pDst->format) {
case PICT_a8b8g8r8:
case PICT_x8b8g8r8:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
if (fbHaveMMX())
func = fbCompositeSrc_8888RevNPx8888mmx;
#endif
break;
case PICT_r5g6b5:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
if (fbHaveMMX())
func = fbCompositeSrc_8888RevNPx0565mmx;
#endif
@@ -1018,9 +1028,27 @@
}
break;
}
+ else
+ {
+ /* non-repeating source, repeating mask => translucent window */
+ if (maskRepeat &&
+ pMask->pDrawable->width == 1 &&
+ pMask->pDrawable->height == 1)
+ {
+ if (pSrc->format == PICT_x8r8g8b8 &&
+ pDst->format == PICT_x8r8g8b8 &&
+ pMask->format == PICT_a8)
+ {
+#ifdef USE_MMX
+ if (fbHaveMMX())
+ func = fbCompositeSrc_8888x8x8888mmx;
+#endif
+ }
+ }
+ }
}
}
- else
+ else /* no mask */
{
if (srcRepeat &&
pSrc->pDrawable->width == 1 &&
@@ -1032,7 +1060,7 @@
switch (pDst->format) {
case PICT_a8r8g8b8:
case PICT_x8r8g8b8:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
if (fbHaveMMX())
{
srcRepeat = FALSE;
@@ -1041,7 +1069,7 @@
#endif
break;
case PICT_r5g6b5:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
if (fbHaveMMX())
{
srcRepeat = FALSE;
@@ -1070,6 +1098,27 @@
break;
}
break;
+ case PICT_x8r8g8b8:
+ switch (pDst->format) {
+ case PICT_a8r8g8b8:
+ case PICT_x8r8g8b8:
+#ifdef USE_MMX
+ if (fbHaveMMX())
+ func = fbCompositeCopyAreammx;
+#endif
+ break;
+ }
+ case PICT_x8b8g8r8:
+ switch (pDst->format) {
+ case PICT_a8b8g8r8:
+ case PICT_x8b8g8r8:
+#ifdef USE_MMX
+ if (fbHaveMMX())
+ func = fbCompositeCopyAreammx;
+#endif
+ break;
+ }
+ break;
case PICT_a8b8g8r8:
switch (pDst->format) {
case PICT_a8b8g8r8:
@@ -1109,7 +1158,7 @@
case PICT_a8r8g8b8:
switch (pDst->format) {
case PICT_a8r8g8b8:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
if (fbHaveMMX())
func = fbCompositeSrcAdd_8888x8888mmx;
else
@@ -1121,7 +1170,7 @@
case PICT_a8b8g8r8:
switch (pDst->format) {
case PICT_a8b8g8r8:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
if (fbHaveMMX())
func = fbCompositeSrcAdd_8888x8888mmx;
else
@@ -1133,7 +1182,7 @@
case PICT_a8:
switch (pDst->format) {
case PICT_a8:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
if (fbHaveMMX())
func = fbCompositeSrcAdd_8000x8000mmx;
else