Annotation of /trunk/xorg-old/patches-6.8.2-r10/9914_all_6.8.2-mmx-gcc4.patch
Parent Directory | Revision Log
Revision 167 -
(hide annotations)
(download)
Tue May 8 20:58:51 2007 UTC (17 years, 4 months ago) by niro
File size: 52896 byte(s)
-import
1 | niro | 167 | diff -ur xc-orig/programs/Xserver/fb/Imakefile xc/programs/Xserver/fb/Imakefile |
2 | --- xc-orig/programs/Xserver/fb/Imakefile 2005-02-11 04:00:50.004092510 -0500 | ||
3 | +++ xc/programs/Xserver/fb/Imakefile 2005-02-11 04:01:32.059345739 -0500 | ||
4 | @@ -3,13 +3,22 @@ | ||
5 | XCOMM | ||
6 | XCOMM Id: Imakefile,v 1.1 1999/11/02 03:54:44 keithp Exp $ | ||
7 | |||
8 | -#if defined(i386Architecture) && defined(HasGcc34) && HasGcc34 | ||
9 | +#if defined(HasGcc34) && HasGcc34 | ||
10 | MMXOPTIONS= -mmmx -Winline --param inline-unit-growth=10000 \ | ||
11 | - --param large-function-growth=10000 -DUSE_GCC34_MMX | ||
12 | + --param large-function-growth=10000 -DUSE_MMX | ||
13 | +SSEOPTIONS= $(MMXOPTIONS) -msse -DUSE_SSE | ||
14 | |||
15 | +#if defined(i386Architecture) | ||
16 | SpecialCObjectRule(fbmmx,fbmmx.c,$(MMXOPTIONS)) | ||
17 | +#elif defined(AMD64Architecture) | ||
18 | +SpecialCObjectRule(fbmmx,fbmmx.c,$(SSEOPTIONS)) | ||
19 | +#endif | ||
20 | + | ||
21 | +#if defined(i386Architecture) || defined(AMD64Architecture) | ||
22 | SpecialCObjectRule(fbpict,fbpict.c,$(MMXOPTIONS)) | ||
23 | SpecialCObjectRule(fbfill,fbfill.c,$(MMXOPTIONS)) | ||
24 | +SpecialCObjectRule(fbcopy,fbcopy.c,$(MMXOPTIONS)) | ||
25 | +#endif | ||
26 | |||
27 | #endif | ||
28 | |||
29 | diff -ur xc-orig/programs/Xserver/fb/fbcompose.c xc/programs/Xserver/fb/fbcompose.c | ||
30 | --- xc-orig/programs/Xserver/fb/fbcompose.c 2005-02-11 04:00:50.009092659 -0500 | ||
31 | +++ xc/programs/Xserver/fb/fbcompose.c 2005-02-11 04:01:32.067345977 -0500 | ||
32 | @@ -1,8 +1,8 @@ | ||
33 | /* | ||
34 | - * $XdotOrg: xc/programs/Xserver/fb/fbcompose.c,v 1.3 2004/05/12 01:49:46 anholt Exp $ | ||
35 | + * $XdotOrg: xc/programs/Xserver/fb/fbcompose.c,v 1.5 2005/01/13 20:49:21 sandmann Exp $ | ||
36 | * $XFree86: xc/programs/Xserver/fb/fbcompose.c,v 1.17tsi Exp $ | ||
37 | * | ||
38 | - * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. | ||
39 | + * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. | ||
40 | * | ||
41 | * Permission to use, copy, modify, distribute, and sell this software and its | ||
42 | * documentation for any purpose is hereby granted without fee, provided that | ||
43 | @@ -2693,7 +2693,6 @@ | ||
44 | op->u.transform.y = y - op->u.transform.top_y; | ||
45 | } | ||
46 | |||
47 | - | ||
48 | Bool | ||
49 | fbBuildCompositeOperand (PicturePtr pPict, | ||
50 | FbCompositeOperand op[4], | ||
51 | @@ -2710,7 +2709,6 @@ | ||
52 | |||
53 | op->u.transform.top_y = pPict->pDrawable->y; | ||
54 | op->u.transform.left_x = pPict->pDrawable->x; | ||
55 | - | ||
56 | op->u.transform.start_x = x - op->u.transform.left_x; | ||
57 | op->u.transform.x = op->u.transform.start_x; | ||
58 | op->u.transform.y = y - op->u.transform.top_y; | ||
59 | @@ -2822,6 +2820,21 @@ | ||
60 | FbCombineFunc f; | ||
61 | int w; | ||
62 | |||
63 | +#if 0 | ||
64 | + ErrorF ("op: %d\n" | ||
65 | + "src format: %lx\n" | ||
66 | + "msk format %lx\n" | ||
67 | + "dst format %lx\n" | ||
68 | + "width: %d\n" | ||
69 | + "height %d\n", | ||
70 | + op, | ||
71 | + pSrc? pSrc->format : 0, | ||
72 | + pMask? pMask->format : 0, | ||
73 | + pDst? pDst->format : 0, | ||
74 | + width, height); | ||
75 | + ErrorF ("PICT_x8r8g8b8: %lx\n", PICT_x8r8g8b8); | ||
76 | +#endif | ||
77 | + | ||
78 | if (!fbBuildCompositeOperand (pSrc, src, xSrc, ySrc, TRUE, TRUE)) | ||
79 | return; | ||
80 | if (!fbBuildCompositeOperand (pDst, dst, xDst, yDst, FALSE, TRUE)) | ||
81 | diff -ur xc-orig/programs/Xserver/fb/fbcopy.c xc/programs/Xserver/fb/fbcopy.c | ||
82 | --- xc-orig/programs/Xserver/fb/fbcopy.c 2005-02-11 04:00:50.004092510 -0500 | ||
83 | +++ xc/programs/Xserver/fb/fbcopy.c 2005-02-11 04:01:32.068346007 -0500 | ||
84 | @@ -1,7 +1,7 @@ | ||
85 | /* | ||
86 | * Id: fbcopy.c,v 1.1 1999/11/02 03:54:45 keithp Exp $ | ||
87 | * | ||
88 | - * Copyright © 1998 Keith Packard | ||
89 | + * Copyright © 1998 Keith Packard | ||
90 | * | ||
91 | * Permission to use, copy, modify, distribute, and sell this software and its | ||
92 | * documentation for any purpose is hereby granted without fee, provided that | ||
93 | @@ -27,6 +27,7 @@ | ||
94 | #ifdef IN_MODULE | ||
95 | #include "xf86_ansic.h" | ||
96 | #endif | ||
97 | +#include "fbmmx.h" | ||
98 | |||
99 | void | ||
100 | fbCopyNtoN (DrawablePtr pSrcDrawable, | ||
101 | @@ -54,28 +55,51 @@ | ||
102 | |||
103 | fbGetDrawable (pSrcDrawable, src, srcStride, srcBpp, srcXoff, srcYoff); | ||
104 | fbGetDrawable (pDstDrawable, dst, dstStride, dstBpp, dstXoff, dstYoff); | ||
105 | - | ||
106 | + | ||
107 | while (nbox--) | ||
108 | { | ||
109 | +#ifdef USE_MMX | ||
110 | + if (!reverse && !upsidedown && fbHaveMMX()) | ||
111 | + { | ||
112 | + if (!fbCopyAreammx (pSrcDrawable, | ||
113 | + pDstDrawable, | ||
114 | + | ||
115 | + (pbox->x1 + dx + srcXoff), | ||
116 | + (pbox->y1 + dy + srcYoff), | ||
117 | + | ||
118 | + (pbox->x1 + dstXoff), | ||
119 | + (pbox->y1 + dstYoff), | ||
120 | + | ||
121 | + (pbox->x2 - pbox->x1), | ||
122 | + (pbox->y2 - pbox->y1))) | ||
123 | + goto fallback; | ||
124 | + else | ||
125 | + goto next; | ||
126 | + } | ||
127 | + fallback: | ||
128 | +#endif | ||
129 | fbBlt (src + (pbox->y1 + dy + srcYoff) * srcStride, | ||
130 | srcStride, | ||
131 | (pbox->x1 + dx + srcXoff) * srcBpp, | ||
132 | - | ||
133 | + | ||
134 | dst + (pbox->y1 + dstYoff) * dstStride, | ||
135 | dstStride, | ||
136 | (pbox->x1 + dstXoff) * dstBpp, | ||
137 | - | ||
138 | + | ||
139 | (pbox->x2 - pbox->x1) * dstBpp, | ||
140 | (pbox->y2 - pbox->y1), | ||
141 | - | ||
142 | + | ||
143 | alu, | ||
144 | pm, | ||
145 | dstBpp, | ||
146 | - | ||
147 | + | ||
148 | reverse, | ||
149 | upsidedown); | ||
150 | +#ifdef USE_MMX | ||
151 | + next: | ||
152 | +#endif | ||
153 | pbox++; | ||
154 | - } | ||
155 | + } | ||
156 | } | ||
157 | |||
158 | void | ||
159 | @@ -594,7 +618,7 @@ | ||
160 | int yOut) | ||
161 | { | ||
162 | fbCopyProc copy; | ||
163 | - | ||
164 | + | ||
165 | #ifdef FB_24_32BIT | ||
166 | if (pSrcDrawable->bitsPerPixel != pDstDrawable->bitsPerPixel) | ||
167 | copy = fb24_32CopyMtoN; | ||
168 | diff -ur xc-orig/programs/Xserver/fb/fbfill.c xc/programs/Xserver/fb/fbfill.c | ||
169 | --- xc-orig/programs/Xserver/fb/fbfill.c 2005-02-11 04:00:50.006092570 -0500 | ||
170 | +++ xc/programs/Xserver/fb/fbfill.c 2005-02-11 04:01:32.069346037 -0500 | ||
171 | @@ -1,7 +1,7 @@ | ||
172 | /* | ||
173 | * Id: fbfill.c,v 1.1 1999/11/02 03:54:45 keithp Exp $ | ||
174 | * | ||
175 | - * Copyright © 1998 Keith Packard | ||
176 | + * Copyright © 1998 Keith Packard | ||
177 | * | ||
178 | * Permission to use, copy, modify, distribute, and sell this software and its | ||
179 | * documentation for any purpose is hereby granted without fee, provided that | ||
180 | @@ -44,7 +44,7 @@ | ||
181 | |||
182 | switch (pGC->fillStyle) { | ||
183 | case FillSolid: | ||
184 | -#ifdef USE_GCC34_MMX | ||
185 | +#ifdef USE_MMX | ||
186 | if (!pPriv->and && fbHaveMMX()) | ||
187 | if (fbSolidFillmmx (pDrawable, x, y, width, height, pPriv->xor)) | ||
188 | return; | ||
189 | |||
190 | diff -ur xc-orig/programs/Xserver/fb/fbmmx.c xc/programs/Xserver/fb/fbmmx.c | ||
191 | --- xc-orig/programs/Xserver/fb/fbmmx.c 2005-02-11 04:00:50.006092570 -0500 | ||
192 | +++ xc/programs/Xserver/fb/fbmmx.c 2005-02-11 04:01:32.072346126 -0500 | ||
193 | @@ -1,5 +1,6 @@ | ||
194 | /* | ||
195 | - * Copyright © 2004 Red Hat, Inc. | ||
196 | + * Copyright © 2004 Red Hat, Inc. | ||
197 | + * Copyright © 2004 Nicholas Miell | ||
198 | * | ||
199 | * Permission to use, copy, modify, distribute, and sell this software and its | ||
200 | * documentation for any purpose is hereby granted without fee, provided that | ||
201 | @@ -18,14 +19,23 @@ | ||
202 | * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN | ||
203 | * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
204 | * | ||
205 | - * Author: Søren Sandmann (sandmann@redhat.com) | ||
206 | - * | ||
207 | + * Author: Søren Sandmann (sandmann@redhat.com) | ||
208 | + * Minor Improvements: Nicholas Miell (nmiell@gmail.com) | ||
209 | + * | ||
210 | * Based on work by Owen Taylor | ||
211 | */ | ||
212 | |||
213 | + | ||
214 | +#ifdef USE_MMX | ||
215 | + | ||
216 | #include "fb.h" | ||
217 | +#include "fbmmx.h" | ||
218 | + | ||
219 | +#include <mmintrin.h> | ||
220 | |||
221 | -#ifdef USE_GCC34_MMX | ||
222 | +#ifdef USE_SSE | ||
223 | +#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */ | ||
224 | +#endif | ||
225 | |||
226 | #ifdef RENDER | ||
227 | |||
228 | @@ -33,11 +43,6 @@ | ||
229 | #include "mipict.h" | ||
230 | #include "fbpict.h" | ||
231 | |||
232 | -typedef int Vector1x64 __attribute__ ((mode(DI))); | ||
233 | -typedef int Vector2x32 __attribute__ ((mode(V2SI))); | ||
234 | -typedef int Vector4x16 __attribute__ ((mode(V4HI))); | ||
235 | -typedef int Vector8x8 __attribute__ ((mode(V8QI))); | ||
236 | - | ||
237 | typedef unsigned long long ullong; | ||
238 | |||
239 | #define noVERBOSE | ||
240 | @@ -50,7 +55,6 @@ | ||
241 | |||
242 | typedef struct | ||
243 | { | ||
244 | - ullong mmx_zero; | ||
245 | ullong mmx_4x00ff; | ||
246 | ullong mmx_4x0080; | ||
247 | ullong mmx_565_rgb; | ||
248 | @@ -70,7 +74,6 @@ | ||
249 | |||
250 | static const MMXData c = | ||
251 | { | ||
252 | - .mmx_zero = 0x0000000000000000ULL, | ||
253 | .mmx_4x00ff = 0x00ff00ff00ff00ffULL, | ||
254 | .mmx_4x0080 = 0x0080008000800080ULL, | ||
255 | .mmx_565_rgb = 0x000001f0003f001fULL, | ||
256 | @@ -88,121 +91,112 @@ | ||
257 | .mmx_000000000000ffff = 0x000000000000ffffULL, | ||
258 | }; | ||
259 | |||
260 | -static __inline__ Vector1x64 | ||
261 | -shift (Vector1x64 v, int s) | ||
262 | +#define MC(x) ((__m64) c.mmx_##x) | ||
263 | + | ||
264 | +static __inline__ __m64 | ||
265 | +shift (__m64 v, int s) | ||
266 | { | ||
267 | if (s > 0) | ||
268 | - return __builtin_ia32_psllq (v, s); | ||
269 | + return _mm_slli_si64 (v, s); | ||
270 | else if (s < 0) | ||
271 | - return __builtin_ia32_psrlq (v, -s); | ||
272 | + return _mm_srli_si64 (v, -s); | ||
273 | else | ||
274 | return v; | ||
275 | } | ||
276 | |||
277 | -static __inline__ Vector4x16 | ||
278 | -negate (Vector4x16 mask) | ||
279 | +static __inline__ __m64 | ||
280 | +negate (__m64 mask) | ||
281 | { | ||
282 | - return (Vector4x16)__builtin_ia32_pxor ( | ||
283 | - (Vector1x64)mask, | ||
284 | - (Vector1x64)c.mmx_4x00ff); | ||
285 | + return _mm_xor_si64 (mask, MC(4x00ff)); | ||
286 | } | ||
287 | |||
288 | -static __inline__ Vector4x16 | ||
289 | -pix_multiply (Vector4x16 a, Vector4x16 b) | ||
290 | +static __inline__ __m64 | ||
291 | +pix_multiply (__m64 a, __m64 b) | ||
292 | { | ||
293 | - Vector4x16 res; | ||
294 | + __m64 res; | ||
295 | |||
296 | - res = __builtin_ia32_pmullw (a, b); | ||
297 | - res = __builtin_ia32_paddw (res, (Vector4x16)c.mmx_4x0080); | ||
298 | - res = __builtin_ia32_psrlw (res, 8); | ||
299 | + res = _mm_mullo_pi16 (a, b); | ||
300 | + res = _mm_add_pi16 (res, MC(4x0080)); | ||
301 | + res = _mm_srli_pi16 (res, 8); | ||
302 | |||
303 | return res; | ||
304 | } | ||
305 | |||
306 | -#if 0 | ||
307 | +#ifdef USE_SSE | ||
308 | #define HAVE_PSHUFW | ||
309 | #endif | ||
310 | |||
311 | #ifdef HAVE_PSHUFW | ||
312 | |||
313 | -static __inline__ Vector4x16 | ||
314 | -expand_alpha (Vector4x16 pixel) | ||
315 | +static __inline__ __m64 | ||
316 | +expand_alpha (__m64 pixel) | ||
317 | { | ||
318 | - Vector4x16 result; | ||
319 | - __asm__ ("pshufw $0xFF, %1, %0\n\t" : "=y" (result) : "y" (pixel)); | ||
320 | - return result; | ||
321 | + return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 3, 3, 3)); | ||
322 | } | ||
323 | |||
324 | -static __inline__ Vector4x16 | ||
325 | -expand_alpha_rev (Vector4x16 pixel) | ||
326 | +static __inline__ __m64 | ||
327 | +expand_alpha_rev (__m64 pixel) | ||
328 | { | ||
329 | - Vector4x16 result; | ||
330 | - __asm__ ("pshufw $0x00, %1, %0\n\t" : "=y" (result) : "y" (pixel)); | ||
331 | - return result; | ||
332 | + return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(0, 0, 0, 0)); | ||
333 | } | ||
334 | |||
335 | -static __inline__ Vector4x16 | ||
336 | -invert_colors (Vector4x16 pixel) | ||
337 | +static __inline__ __m64 | ||
338 | +invert_colors (__m64 pixel) | ||
339 | { | ||
340 | - Vector4x16 result; | ||
341 | - | ||
342 | - /* 0xC6 = 11000110 */ | ||
343 | - /* 3 0 1 2 */ | ||
344 | - | ||
345 | - __asm__ ("pshufw $0xC6, %1, %0\n\t" : "=y" (result) : "y" (pixel)); | ||
346 | - | ||
347 | - return result; | ||
348 | + return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 0, 1, 2)); | ||
349 | } | ||
350 | |||
351 | #else | ||
352 | |||
353 | -static __inline__ Vector4x16 | ||
354 | -expand_alpha (Vector4x16 pixel) | ||
355 | +static __inline__ __m64 | ||
356 | +expand_alpha (__m64 pixel) | ||
357 | { | ||
358 | - Vector1x64 t1, t2; | ||
359 | - | ||
360 | - t1 = shift ((Vector1x64)pixel, -48); | ||
361 | + __m64 t1, t2; | ||
362 | + | ||
363 | + t1 = shift (pixel, -48); | ||
364 | t2 = shift (t1, 16); | ||
365 | - t1 = __builtin_ia32_por (t1, t2); | ||
366 | + t1 = _mm_or_si64 (t1, t2); | ||
367 | t2 = shift (t1, 32); | ||
368 | - t1 = __builtin_ia32_por (t1, t2); | ||
369 | - | ||
370 | - return (Vector4x16)t1; | ||
371 | + t1 = _mm_or_si64 (t1, t2); | ||
372 | + | ||
373 | + return t1; | ||
374 | } | ||
375 | |||
376 | -static __inline__ Vector4x16 | ||
377 | -expand_alpha_rev (Vector4x16 pixel) | ||
378 | +static __inline__ __m64 | ||
379 | +expand_alpha_rev (__m64 pixel) | ||
380 | { | ||
381 | - Vector1x64 t1, t2; | ||
382 | - | ||
383 | - t1 = shift ((Vector1x64)pixel, 48); | ||
384 | + __m64 t1, t2; | ||
385 | + | ||
386 | + /* move alpha to low 16 bits and zero the rest */ | ||
387 | + t1 = shift (pixel, 48); | ||
388 | t1 = shift (t1, -48); | ||
389 | + | ||
390 | t2 = shift (t1, 16); | ||
391 | - t1 = __builtin_ia32_por (t1, t2); | ||
392 | + t1 = _mm_or_si64 (t1, t2); | ||
393 | t2 = shift (t1, 32); | ||
394 | - t1 = __builtin_ia32_por (t1, t2); | ||
395 | - | ||
396 | - return (Vector4x16)t1; | ||
397 | + t1 = _mm_or_si64 (t1, t2); | ||
398 | + | ||
399 | + return t1; | ||
400 | } | ||
401 | |||
402 | -static __inline__ Vector4x16 | ||
403 | -invert_colors (Vector4x16 pixel) | ||
404 | +static __inline__ __m64 | ||
405 | +invert_colors (__m64 pixel) | ||
406 | { | ||
407 | - Vector1x64 x, y, z; | ||
408 | - | ||
409 | - x = y = z = (Vector1x64)pixel; | ||
410 | - | ||
411 | - x = __builtin_ia32_pand (x, (Vector1x64)c.mmx_ffff0000ffff0000); | ||
412 | - y = __builtin_ia32_pand (y, (Vector1x64)c.mmx_000000000000ffff); | ||
413 | - z = __builtin_ia32_pand (z, (Vector1x64)c.mmx_0000ffff00000000); | ||
414 | - | ||
415 | + __m64 x, y, z; | ||
416 | + | ||
417 | + x = y = z = pixel; | ||
418 | + | ||
419 | + x = _mm_and_si64 (x, MC(ffff0000ffff0000)); | ||
420 | + y = _mm_and_si64 (y, MC(000000000000ffff)); | ||
421 | + z = _mm_and_si64 (z, MC(0000ffff00000000)); | ||
422 | + | ||
423 | y = shift (y, 32); | ||
424 | z = shift (z, -32); | ||
425 | - | ||
426 | - x = __builtin_ia32_por (x, y); | ||
427 | - x = __builtin_ia32_por (x, z); | ||
428 | - | ||
429 | - return (Vector4x16)x; | ||
430 | + | ||
431 | + x = _mm_or_si64 (x, y); | ||
432 | + x = _mm_or_si64 (x, z); | ||
433 | + | ||
434 | + return x; | ||
435 | } | ||
436 | |||
437 | #endif | ||
438 | @@ -210,147 +204,138 @@ | ||
439 | /* Notes about writing mmx code | ||
440 | * | ||
441 | * give memory operands as the second operand. If you give it as the | ||
442 | - * first, gcc will first load it into a register, then use that register | ||
443 | + * first, gcc will first load it into a register, then use that | ||
444 | + * register | ||
445 | * | ||
446 | * ie. use | ||
447 | * | ||
448 | - * __builtin_pmullw (x, mmx_constant[8]); | ||
449 | + * _mm_mullo_pi16 (x, mmx_constant); | ||
450 | * | ||
451 | * not | ||
452 | * | ||
453 | - * __builtin_pmullw (mmx_constant[8], x); | ||
454 | + * _mm_mullo_pi16 (mmx_constant, x); | ||
455 | * | ||
456 | - * Also try to minimize dependencies. Ie. when you need a value, try to calculate | ||
457 | - * it from a value that was calculated as early as possible. | ||
458 | + * Also try to minimize dependencies. i.e. when you need a value, try | ||
459 | + * to calculate it from a value that was calculated as early as | ||
460 | + * possible. | ||
461 | */ | ||
462 | |||
463 | -static __inline__ Vector4x16 | ||
464 | -over (Vector4x16 src, Vector4x16 srca, Vector4x16 dest) | ||
465 | +static __inline__ __m64 | ||
466 | +over (__m64 src, __m64 srca, __m64 dest) | ||
467 | { | ||
468 | - return (Vector4x16)__builtin_ia32_paddusb ((Vector8x8)src, (Vector8x8)pix_multiply(dest, negate(srca))); | ||
469 | + return _mm_adds_pu8 (src, pix_multiply(dest, negate(srca))); | ||
470 | } | ||
471 | |||
472 | -static __inline__ Vector4x16 | ||
473 | -over_rev_non_pre (Vector4x16 src, Vector4x16 dest) | ||
474 | +static __inline__ __m64 | ||
475 | +over_rev_non_pre (__m64 src, __m64 dest) | ||
476 | { | ||
477 | - Vector4x16 srca = expand_alpha (src); | ||
478 | - Vector4x16 srcfaaa = (Vector4x16)__builtin_ia32_por((Vector1x64)srca, (Vector1x64)c.mmx_full_alpha); | ||
479 | - | ||
480 | + __m64 srca = expand_alpha (src); | ||
481 | + __m64 srcfaaa = _mm_or_si64 (srca, MC(full_alpha)); | ||
482 | + | ||
483 | return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest); | ||
484 | } | ||
485 | |||
486 | -static __inline__ Vector4x16 | ||
487 | -in (Vector4x16 src, | ||
488 | - Vector4x16 mask) | ||
489 | +static __inline__ __m64 | ||
490 | +in (__m64 src, | ||
491 | + __m64 mask) | ||
492 | { | ||
493 | return pix_multiply (src, mask); | ||
494 | } | ||
495 | |||
496 | -static __inline__ Vector4x16 | ||
497 | -in_over (Vector4x16 src, | ||
498 | - Vector4x16 srca, | ||
499 | - Vector4x16 mask, | ||
500 | - Vector4x16 dest) | ||
501 | +static __inline__ __m64 | ||
502 | +in_over (__m64 src, | ||
503 | + __m64 srca, | ||
504 | + __m64 mask, | ||
505 | + __m64 dest) | ||
506 | { | ||
507 | return over(in(src, mask), pix_multiply(srca, mask), dest); | ||
508 | } | ||
509 | |||
510 | -static __inline__ Vector8x8 | ||
511 | -cvt32to64 (CARD32 v) | ||
512 | -{ | ||
513 | - ullong r = v; | ||
514 | - return (Vector8x8)r; | ||
515 | -} | ||
516 | - | ||
517 | -static __inline__ Vector4x16 | ||
518 | +static __inline__ __m64 | ||
519 | load8888 (CARD32 v) | ||
520 | { | ||
521 | - return (Vector4x16)__builtin_ia32_punpcklbw (cvt32to64 (v), | ||
522 | - (Vector8x8)c.mmx_zero); | ||
523 | + return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64()); | ||
524 | } | ||
525 | |||
526 | -static __inline__ Vector8x8 | ||
527 | -pack8888 (Vector4x16 lo, Vector4x16 hi) | ||
528 | +static __inline__ __m64 | ||
529 | +pack8888 (__m64 lo, __m64 hi) | ||
530 | { | ||
531 | - Vector8x8 r; | ||
532 | - r = __builtin_ia32_packuswb ((Vector4x16)lo, (Vector4x16)hi); | ||
533 | + __m64 r; | ||
534 | + r = _mm_packs_pu16 (lo, hi); | ||
535 | return r; | ||
536 | } | ||
537 | |||
538 | -/* Expand 16 bits positioned at @pos (0-3) of a mmx register into 00RR00GG00BB | ||
539 | - | ||
540 | ---- Expanding 565 in the low word --- | ||
541 | - | ||
542 | -m = (m << (32 - 3)) | (m << (16 - 5)) | m; | ||
543 | -m = m & (01f0003f001f); | ||
544 | -m = m * (008404100840); | ||
545 | -m = m >> 8; | ||
546 | - | ||
547 | -Note the trick here - the top word is shifted by another nibble to avoid | ||
548 | -it bumping into the middle word | ||
549 | -*/ | ||
550 | -static __inline__ Vector4x16 | ||
551 | -expand565 (Vector4x16 pixel, int pos) | ||
552 | +/* Expand 16 bits positioned at @pos (0-3) of a mmx register into | ||
553 | + * | ||
554 | + * 00RR00GG00BB | ||
555 | + * | ||
556 | + * --- Expanding 565 in the low word --- | ||
557 | + * | ||
558 | + * m = (m << (32 - 3)) | (m << (16 - 5)) | m; | ||
559 | + * m = m & (01f0003f001f); | ||
560 | + * m = m * (008404100840); | ||
561 | + * m = m >> 8; | ||
562 | + * | ||
563 | + * Note the trick here - the top word is shifted by another nibble to | ||
564 | + * avoid it bumping into the middle word | ||
565 | + */ | ||
566 | +static __inline__ __m64 | ||
567 | +expand565 (__m64 pixel, int pos) | ||
568 | { | ||
569 | - Vector1x64 p = (Vector1x64)pixel; | ||
570 | + __m64 p = pixel; | ||
571 | + __m64 t1, t2; | ||
572 | |||
573 | /* move pixel to low 16 bit and zero the rest */ | ||
574 | p = shift (shift (p, (3 - pos) * 16), -48); | ||
575 | |||
576 | - Vector1x64 t1 = shift (p, 36 - 11); | ||
577 | - Vector1x64 t2 = shift (p, 16 - 5); | ||
578 | + t1 = shift (p, 36 - 11); | ||
579 | + t2 = shift (p, 16 - 5); | ||
580 | |||
581 | - p = __builtin_ia32_por (t1, p); | ||
582 | - p = __builtin_ia32_por (t2, p); | ||
583 | - p = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_rgb); | ||
584 | + p = _mm_or_si64 (t1, p); | ||
585 | + p = _mm_or_si64 (t2, p); | ||
586 | + p = _mm_and_si64 (p, MC(565_rgb)); | ||
587 | |||
588 | - pixel = __builtin_ia32_pmullw ((Vector4x16)p, (Vector4x16)c.mmx_565_unpack_multiplier); | ||
589 | - return __builtin_ia32_psrlw (pixel, 8); | ||
590 | + pixel = _mm_mullo_pi16 (p, MC(565_unpack_multiplier)); | ||
591 | + return _mm_srli_pi16 (pixel, 8); | ||
592 | } | ||
593 | |||
594 | -static __inline__ Vector4x16 | ||
595 | -expand8888 (Vector4x16 in, int pos) | ||
596 | +static __inline__ __m64 | ||
597 | +expand8888 (__m64 in, int pos) | ||
598 | { | ||
599 | if (pos == 0) | ||
600 | - return (Vector4x16)__builtin_ia32_punpcklbw ((Vector8x8)in, (Vector8x8)c.mmx_zero); | ||
601 | + return _mm_unpacklo_pi8 (in, _mm_setzero_si64()); | ||
602 | else | ||
603 | - return (Vector4x16)__builtin_ia32_punpckhbw ((Vector8x8)in, (Vector8x8)c.mmx_zero); | ||
604 | + return _mm_unpackhi_pi8 (in, _mm_setzero_si64()); | ||
605 | } | ||
606 | |||
607 | -static __inline__ Vector4x16 | ||
608 | -pack565 (Vector4x16 pixel, Vector4x16 target, int pos) | ||
609 | +static __inline__ __m64 | ||
610 | +pack565 (__m64 pixel, __m64 target, int pos) | ||
611 | { | ||
612 | - Vector1x64 p = (Vector1x64)pixel; | ||
613 | - Vector1x64 t = (Vector1x64)target; | ||
614 | - Vector1x64 r, g, b; | ||
615 | + __m64 p = pixel; | ||
616 | + __m64 t = target; | ||
617 | + __m64 r, g, b; | ||
618 | |||
619 | - r = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_r); | ||
620 | - g = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_g); | ||
621 | - b = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_b); | ||
622 | + r = _mm_and_si64 (p, MC(565_r)); | ||
623 | + g = _mm_and_si64 (p, MC(565_g)); | ||
624 | + b = _mm_and_si64 (p, MC(565_b)); | ||
625 | |||
626 | r = shift (r, - (32 - 8) + pos * 16); | ||
627 | g = shift (g, - (16 - 3) + pos * 16); | ||
628 | b = shift (b, - (0 + 3) + pos * 16); | ||
629 | - | ||
630 | + | ||
631 | if (pos == 0) | ||
632 | - t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_0); | ||
633 | + t = _mm_and_si64 (t, MC(mask_0)); | ||
634 | else if (pos == 1) | ||
635 | - t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_1); | ||
636 | + t = _mm_and_si64 (t, MC(mask_1)); | ||
637 | else if (pos == 2) | ||
638 | - t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_2); | ||
639 | + t = _mm_and_si64 (t, MC(mask_2)); | ||
640 | else if (pos == 3) | ||
641 | - t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_3); | ||
642 | + t = _mm_and_si64 (t, MC(mask_3)); | ||
643 | |||
644 | - p = __builtin_ia32_por (r, t); | ||
645 | - p = __builtin_ia32_por (g, p); | ||
646 | + p = _mm_or_si64 (r, t); | ||
647 | + p = _mm_or_si64 (g, p); | ||
648 | |||
649 | - return (Vector4x16)__builtin_ia32_por (b, p); | ||
650 | -} | ||
651 | - | ||
652 | -static __inline__ void | ||
653 | -emms (void) | ||
654 | -{ | ||
655 | - __asm__ __volatile__ ("emms"); | ||
656 | + return _mm_or_si64 (b, p); | ||
657 | } | ||
658 | |||
659 | void | ||
660 | @@ -371,8 +356,8 @@ | ||
661 | CARD32 *dstLine, *dst; | ||
662 | CARD16 w; | ||
663 | FbStride dstStride; | ||
664 | - Vector4x16 vsrc, vsrca; | ||
665 | - | ||
666 | + __m64 vsrc, vsrca; | ||
667 | + | ||
668 | CHECKPOINT(); | ||
669 | |||
670 | fbComposeGetSolid(pSrc, src, pDst->format); | ||
671 | @@ -384,51 +369,52 @@ | ||
672 | |||
673 | vsrc = load8888 (src); | ||
674 | vsrca = expand_alpha (vsrc); | ||
675 | - | ||
676 | + | ||
677 | while (height--) | ||
678 | { | ||
679 | dst = dstLine; | ||
680 | dstLine += dstStride; | ||
681 | w = width; | ||
682 | - | ||
683 | + | ||
684 | CHECKPOINT(); | ||
685 | |||
686 | while (w && (unsigned long)dst & 7) | ||
687 | { | ||
688 | - *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), (Vector4x16)c.mmx_zero); | ||
689 | + *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), | ||
690 | + _mm_setzero_si64()); | ||
691 | |||
692 | w--; | ||
693 | dst++; | ||
694 | } | ||
695 | - | ||
696 | + | ||
697 | while (w >= 2) | ||
698 | { | ||
699 | - Vector4x16 vdest; | ||
700 | - Vector4x16 dest0, dest1; | ||
701 | - | ||
702 | - vdest = *(Vector4x16 *)dst; | ||
703 | + __m64 vdest; | ||
704 | + __m64 dest0, dest1; | ||
705 | + | ||
706 | + vdest = *(__m64 *)dst; | ||
707 | |||
708 | dest0 = over(vsrc, vsrca, expand8888(vdest, 0)); | ||
709 | dest1 = over(vsrc, vsrca, expand8888(vdest, 1)); | ||
710 | |||
711 | - *(Vector8x8 *)dst = (Vector8x8)pack8888(dest0, dest1); | ||
712 | + *(__m64 *)dst = pack8888(dest0, dest1); | ||
713 | |||
714 | dst += 2; | ||
715 | w -= 2; | ||
716 | } | ||
717 | - | ||
718 | + | ||
719 | CHECKPOINT(); | ||
720 | |||
721 | while (w) | ||
722 | { | ||
723 | - *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), (Vector4x16)c.mmx_zero); | ||
724 | + *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), _mm_setzero_si64()); | ||
725 | |||
726 | w--; | ||
727 | dst++; | ||
728 | } | ||
729 | } | ||
730 | |||
731 | - emms(); | ||
732 | + _mm_empty(); | ||
733 | } | ||
734 | |||
735 | void | ||
736 | @@ -449,8 +435,8 @@ | ||
737 | CARD16 *dstLine, *dst; | ||
738 | CARD16 w; | ||
739 | FbStride dstStride; | ||
740 | - Vector4x16 vsrc, vsrca; | ||
741 | - | ||
742 | + __m64 vsrc, vsrca; | ||
743 | + | ||
744 | CHECKPOINT(); | ||
745 | |||
746 | fbComposeGetSolid(pSrc, src, pDst->format); | ||
747 | @@ -462,49 +448,49 @@ | ||
748 | |||
749 | vsrc = load8888 (src); | ||
750 | vsrca = expand_alpha (vsrc); | ||
751 | - | ||
752 | + | ||
753 | while (height--) | ||
754 | { | ||
755 | dst = dstLine; | ||
756 | dstLine += dstStride; | ||
757 | w = width; | ||
758 | - | ||
759 | + | ||
760 | CHECKPOINT(); | ||
761 | |||
762 | while (w && (unsigned long)dst & 7) | ||
763 | { | ||
764 | ullong d = *dst; | ||
765 | - Vector4x16 vdest = expand565 ((Vector4x16)d, 0); | ||
766 | + __m64 vdest = expand565 ((__m64)d, 0); | ||
767 | vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0); | ||
768 | *dst = (ullong)vdest; | ||
769 | |||
770 | w--; | ||
771 | dst++; | ||
772 | } | ||
773 | - | ||
774 | + | ||
775 | while (w >= 4) | ||
776 | { | ||
777 | - Vector4x16 vdest; | ||
778 | - | ||
779 | - vdest = *(Vector4x16 *)dst; | ||
780 | + __m64 vdest; | ||
781 | + | ||
782 | + vdest = *(__m64 *)dst; | ||
783 | |||
784 | vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0); | ||
785 | vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1); | ||
786 | vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2); | ||
787 | vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3); | ||
788 | |||
789 | - *(Vector8x8 *)dst = (Vector8x8)vdest; | ||
790 | + *(__m64 *)dst = vdest; | ||
791 | |||
792 | dst += 4; | ||
793 | w -= 4; | ||
794 | } | ||
795 | - | ||
796 | + | ||
797 | CHECKPOINT(); | ||
798 | |||
799 | while (w) | ||
800 | { | ||
801 | ullong d = *dst; | ||
802 | - Vector4x16 vdest = expand565 ((Vector4x16)d, 0); | ||
803 | + __m64 vdest = expand565 ((__m64)d, 0); | ||
804 | vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0); | ||
805 | *dst = (ullong)vdest; | ||
806 | |||
807 | @@ -513,7 +499,7 @@ | ||
808 | } | ||
809 | } | ||
810 | |||
811 | - emms(); | ||
812 | + _mm_empty(); | ||
813 | } | ||
814 | |||
815 | void | ||
816 | @@ -534,8 +520,8 @@ | ||
817 | CARD32 *dstLine; | ||
818 | CARD32 *maskLine; | ||
819 | FbStride dstStride, maskStride; | ||
820 | - Vector4x16 vsrc, vsrca; | ||
821 | - | ||
822 | + __m64 vsrc, vsrca; | ||
823 | + | ||
824 | CHECKPOINT(); | ||
825 | |||
826 | fbComposeGetSolid(pSrc, src, pDst->format); | ||
827 | @@ -562,9 +548,9 @@ | ||
828 | |||
829 | if (m) | ||
830 | { | ||
831 | - Vector4x16 vdest = load8888(*q); | ||
832 | + __m64 vdest = load8888(*q); | ||
833 | vdest = in_over(vsrc, vsrca, load8888(m), vdest); | ||
834 | - *q = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero); | ||
835 | + *q = (ullong)pack8888(vdest, _mm_setzero_si64()); | ||
836 | } | ||
837 | |||
838 | twidth--; | ||
839 | @@ -580,15 +566,15 @@ | ||
840 | |||
841 | if (m0 | m1) | ||
842 | { | ||
843 | - Vector4x16 dest0, dest1; | ||
844 | - Vector4x16 vdest = *(Vector4x16 *)q; | ||
845 | + __m64 dest0, dest1; | ||
846 | + __m64 vdest = *(__m64 *)q; | ||
847 | |||
848 | dest0 = in_over(vsrc, vsrca, load8888(m0), | ||
849 | expand8888 (vdest, 0)); | ||
850 | dest1 = in_over(vsrc, vsrca, load8888(m1), | ||
851 | expand8888 (vdest, 1)); | ||
852 | |||
853 | - *(Vector8x8 *)q = (Vector8x8)pack8888(dest0, dest1); | ||
854 | + *(__m64 *)q = pack8888(dest0, dest1); | ||
855 | } | ||
856 | |||
857 | p += 2; | ||
858 | @@ -602,9 +588,9 @@ | ||
859 | |||
860 | if (m) | ||
861 | { | ||
862 | - Vector4x16 vdest = load8888(*q); | ||
863 | + __m64 vdest = load8888(*q); | ||
864 | vdest = in_over(vsrc, vsrca, load8888(m), vdest); | ||
865 | - *q = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero); | ||
866 | + *q = (ullong)pack8888(vdest, _mm_setzero_si64()); | ||
867 | } | ||
868 | |||
869 | twidth--; | ||
870 | @@ -616,7 +602,133 @@ | ||
871 | maskLine += maskStride; | ||
872 | } | ||
873 | |||
874 | - emms(); | ||
875 | + _mm_empty(); | ||
876 | +} | ||
877 | + | ||
878 | +void | |
879 | +fbCompositeSrc_8888x8x8888mmx (CARD8 op, | |
880 | + PicturePtr pSrc, | |
881 | + PicturePtr pMask, | |
882 | + PicturePtr pDst, | |
883 | + INT16 xSrc, | |
884 | + INT16 ySrc, | |
885 | + INT16 xMask, | |
886 | + INT16 yMask, | |
887 | + INT16 xDst, | |
888 | + INT16 yDst, | |
889 | + CARD16 width, | |
890 | + CARD16 height) | |
891 | +{ /* Blends 32-bit src onto 32-bit dst with in_over; a single mask value, read once from the first mask byte and replicated to all four channels, is applied to every pixel. */ | |
892 | + CARD32 *dstLine, *dst; | |
893 | + CARD32 *srcLine, *src; | |
894 | + CARD8 *maskLine; | |
895 | + CARD32 mask; | |
896 | + __m64 vmask; | |
897 | + FbStride dstStride, srcStride, maskStride; | |
898 | + CARD16 w; | |
899 | + __m64 srca; | |
900 | + | |
901 | + CHECKPOINT(); | |
902 | + | |
903 | + fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); | |
904 | + fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); | |
905 | + fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1); | |
906 | + | |
907 | + mask = *maskLine << 24 | *maskLine << 16 | *maskLine << 8 | *maskLine; | |
908 | + vmask = load8888 (mask); | |
909 | + srca = MC(4x00ff); | |
910 | + | |
911 | + while (height--) | |
912 | + { | |
913 | + dst = dstLine; | |
914 | + dstLine += dstStride; | |
915 | + src = srcLine; | |
916 | + srcLine += srcStride; | |
917 | + w = width; | |
918 | + | |
919 | + while (w && (unsigned long)dst & 7) | |
920 | + { | |
921 | + __m64 s = load8888 (*src); | |
922 | + __m64 d = load8888 (*dst); | |
923 | + | |
924 | + *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64()); | |
925 | + | |
926 | + w--; | |
927 | + dst++; | |
928 | + src++; | |
929 | + } | |
930 | + | |
931 | + while (w >= 16) | |
932 | + { /* 16 pixels per pass: load 8 dst and 8 src quadwords, blend each pixel pair, store the result back to dst */ | |
933 | + __m64 vd0 = *(__m64 *)(dst + 0); | |
934 | + __m64 vd1 = *(__m64 *)(dst + 2); | |
935 | + __m64 vd2 = *(__m64 *)(dst + 4); | |
936 | + __m64 vd3 = *(__m64 *)(dst + 6); | |
937 | + __m64 vd4 = *(__m64 *)(dst + 8); | |
938 | + __m64 vd5 = *(__m64 *)(dst + 10); | |
939 | + __m64 vd6 = *(__m64 *)(dst + 12); | |
940 | + __m64 vd7 = *(__m64 *)(dst + 14); | |
941 | + | |
942 | + __m64 vs0 = *(__m64 *)(src + 0); | |
943 | + __m64 vs1 = *(__m64 *)(src + 2); | |
944 | + __m64 vs2 = *(__m64 *)(src + 4); | |
945 | + __m64 vs3 = *(__m64 *)(src + 6); | |
946 | + __m64 vs4 = *(__m64 *)(src + 8); | |
947 | + __m64 vs5 = *(__m64 *)(src + 10); | |
948 | + __m64 vs6 = *(__m64 *)(src + 12); | |
949 | + __m64 vs7 = *(__m64 *)(src + 14); | |
950 | + | |
951 | + *(__m64 *)(dst + 0) = (__m64)pack8888 ( | |
952 | + in_over (expand8888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)), | |
953 | + in_over (expand8888 (vs0, 1), srca, vmask, expand8888 (vd0, 1))); | |
954 | + | |
955 | + *(__m64 *)(dst + 2) = (__m64)pack8888 ( | |
956 | + in_over (expand8888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)), | |
957 | + in_over (expand8888 (vs1, 1), srca, vmask, expand8888 (vd1, 1))); | |
958 | + | |
959 | + *(__m64 *)(dst + 4) = (__m64)pack8888 ( | |
960 | + in_over (expand8888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)), | |
961 | + in_over (expand8888 (vs2, 1), srca, vmask, expand8888 (vd2, 1))); | |
962 | + | |
963 | + *(__m64 *)(dst + 6) = (__m64)pack8888 ( | |
964 | + in_over (expand8888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)), | |
965 | + in_over (expand8888 (vs3, 1), srca, vmask, expand8888 (vd3, 1))); | |
966 | + | |
967 | + *(__m64 *)(dst + 8) = (__m64)pack8888 ( | |
968 | + in_over (expand8888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)), | |
969 | + in_over (expand8888 (vs4, 1), srca, vmask, expand8888 (vd4, 1))); | |
970 | + | |
971 | + *(__m64 *)(dst + 10) = (__m64)pack8888 ( | |
972 | + in_over (expand8888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)), | |
973 | + in_over (expand8888 (vs5, 1), srca, vmask, expand8888 (vd5, 1))); | |
974 | + | |
975 | + *(__m64 *)(dst + 12) = (__m64)pack8888 ( | |
976 | + in_over (expand8888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)), | |
977 | + in_over (expand8888 (vs6, 1), srca, vmask, expand8888 (vd6, 1))); | |
978 | + | |
979 | + *(__m64 *)(dst + 14) = (__m64)pack8888 ( | |
980 | + in_over (expand8888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)), | |
981 | + in_over (expand8888 (vs7, 1), srca, vmask, expand8888 (vd7, 1))); | |
982 | + | |
983 | + w -= 16; | |
984 | + dst += 16; | |
985 | + src += 16; | |
986 | + } | |
987 | + | |
988 | + while (w) | |
989 | + { | |
990 | + __m64 s = load8888 (*src); | |
991 | + __m64 d = load8888 (*dst); | |
992 | + | |
993 | + *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64()); | |
994 | + | |
995 | + w--; | |
996 | + dst++; | |
997 | + src++; | |
998 | + } | |
999 | + } | |
1000 | + | |
1001 | + _mm_empty(); | |
1002 | } | |
1003 | |||
1004 | void | ||
1005 | @@ -638,7 +750,7 @@ | ||
1006 | CARD8 *maskLine, *mask; | ||
1007 | FbStride dstStride, maskStride; | ||
1008 | CARD16 w; | ||
1009 | - Vector4x16 vsrc, vsrca; | ||
1010 | + __m64 vsrc, vsrca; | ||
1011 | ullong srcsrc; | ||
1012 | |||
1013 | CHECKPOINT(); | ||
1014 | @@ -648,7 +760,7 @@ | ||
1015 | srca = src >> 24; | ||
1016 | if (srca == 0) | ||
1017 | return; | ||
1018 | - | ||
1019 | + | ||
1020 | srcsrc = (unsigned long long)src << 32 | src; | ||
1021 | |||
1022 | fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); | ||
1023 | @@ -664,7 +776,7 @@ | ||
1024 | mask = maskLine; | ||
1025 | maskLine += maskStride; | ||
1026 | w = width; | ||
1027 | - | ||
1028 | + | ||
1029 | CHECKPOINT(); | ||
1030 | |||
1031 | while (w && (unsigned long)dst & 7) | ||
1032 | @@ -673,15 +785,15 @@ | ||
1033 | |||
1034 | if (m) | ||
1035 | { | ||
1036 | - Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), load8888(*dst)); | ||
1037 | - *dst = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero); | ||
1038 | + __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), load8888(*dst)); | ||
1039 | + *dst = (ullong)pack8888(vdest, _mm_setzero_si64()); | ||
1040 | } | ||
1041 | |||
1042 | w--; | ||
1043 | mask++; | ||
1044 | dst++; | ||
1045 | } | ||
1046 | - | ||
1047 | + | ||
1048 | CHECKPOINT(); | ||
1049 | |||
1050 | while (w >= 2) | ||
1051 | @@ -689,29 +801,29 @@ | ||
1052 | ullong m0, m1; | ||
1053 | m0 = *mask; | ||
1054 | m1 = *(mask + 1); | ||
1055 | - | ||
1056 | + | ||
1057 | if (srca == 0xff && (m0 & m1) == 0xff) | ||
1058 | { | ||
1059 | *(unsigned long long *)dst = srcsrc; | ||
1060 | } | ||
1061 | else if (m0 | m1) | ||
1062 | { | ||
1063 | - Vector4x16 vdest; | ||
1064 | - Vector4x16 dest0, dest1; | ||
1065 | - | ||
1066 | - vdest = *(Vector4x16 *)dst; | ||
1067 | + __m64 vdest; | ||
1068 | + __m64 dest0, dest1; | ||
1069 | + | ||
1070 | + vdest = *(__m64 *)dst; | ||
1071 | |||
1072 | - dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m0), expand8888(vdest, 0)); | ||
1073 | - dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m1), expand8888(vdest, 1)); | ||
1074 | + dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m0), expand8888(vdest, 0)); | ||
1075 | + dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m1), expand8888(vdest, 1)); | ||
1076 | |||
1077 | - *(Vector8x8 *)dst = (Vector8x8)pack8888(dest0, dest1); | ||
1078 | + *(__m64 *)dst = pack8888(dest0, dest1); | ||
1079 | } | ||
1080 | |||
1081 | mask += 2; | ||
1082 | dst += 2; | ||
1083 | w -= 2; | ||
1084 | } | ||
1085 | - | ||
1086 | + | ||
1087 | CHECKPOINT(); | ||
1088 | |||
1089 | while (w) | ||
1090 | @@ -720,9 +832,9 @@ | ||
1091 | |||
1092 | if (m) | ||
1093 | { | ||
1094 | - Vector4x16 vdest = load8888(*dst); | ||
1095 | - vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), vdest); | ||
1096 | - *dst = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero); | ||
1097 | + __m64 vdest = load8888(*dst); | ||
1098 | + vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), vdest); | ||
1099 | + *dst = (ullong)pack8888(vdest, _mm_setzero_si64()); | ||
1100 | } | ||
1101 | |||
1102 | w--; | ||
1103 | @@ -731,7 +843,7 @@ | ||
1104 | } | ||
1105 | } | ||
1106 | |||
1107 | - emms(); | ||
1108 | + _mm_empty(); | ||
1109 | } | ||
1110 | |||
1111 | |||
1112 | @@ -754,7 +866,7 @@ | ||
1113 | CARD8 *maskLine, *mask; | ||
1114 | FbStride dstStride, maskStride; | ||
1115 | CARD16 w; | ||
1116 | - Vector4x16 vsrc, vsrca; | ||
1117 | + __m64 vsrc, vsrca; | ||
1118 | unsigned long long srcsrcsrcsrc, src16; | ||
1119 | |||
1120 | CHECKPOINT(); | ||
1121 | @@ -770,9 +882,9 @@ | ||
1122 | |||
1123 | vsrc = load8888 (src); | ||
1124 | vsrca = expand_alpha (vsrc); | ||
1125 | - | ||
1126 | - src16 = (ullong)pack565(vsrc, (Vector4x16)c.mmx_zero, 0); | ||
1127 | - | ||
1128 | + | ||
1129 | + src16 = (ullong)pack565(vsrc, _mm_setzero_si64(), 0); | ||
1130 | + | ||
1131 | srcsrcsrcsrc = (ullong)src16 << 48 | (ullong)src16 << 32 | | ||
1132 | (ullong)src16 << 16 | (ullong)src16; | ||
1133 | |||
1134 | @@ -783,7 +895,7 @@ | ||
1135 | mask = maskLine; | ||
1136 | maskLine += maskStride; | ||
1137 | w = width; | ||
1138 | - | ||
1139 | + | ||
1140 | CHECKPOINT(); | ||
1141 | |||
1142 | while (w && (unsigned long)dst & 7) | ||
1143 | @@ -793,16 +905,16 @@ | ||
1144 | if (m) | ||
1145 | { | ||
1146 | ullong d = *dst; | ||
1147 | - Vector4x16 vd = (Vector4x16)d; | ||
1148 | - Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), expand565(vd, 0)); | ||
1149 | - *dst = (ullong)pack565(vdest, (Vector4x16)c.mmx_zero, 0); | ||
1150 | + __m64 vd = (__m64)d; | ||
1151 | + __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0)); | ||
1152 | + *dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0); | ||
1153 | } | ||
1154 | |||
1155 | w--; | ||
1156 | mask++; | ||
1157 | dst++; | ||
1158 | } | ||
1159 | - | ||
1160 | + | ||
1161 | CHECKPOINT(); | ||
1162 | |||
1163 | while (w >= 4) | ||
1164 | @@ -812,35 +924,35 @@ | ||
1165 | m1 = *(mask + 1); | ||
1166 | m2 = *(mask + 2); | ||
1167 | m3 = *(mask + 3); | ||
1168 | - | ||
1169 | + | ||
1170 | if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff) | ||
1171 | { | ||
1172 | *(unsigned long long *)dst = srcsrcsrcsrc; | ||
1173 | } | ||
1174 | else if (m0 | m1 | m2 | m3) | ||
1175 | { | ||
1176 | - Vector4x16 vdest; | ||
1177 | - Vector4x16 vm0, vm1, vm2, vm3; | ||
1178 | - | ||
1179 | - vdest = *(Vector4x16 *)dst; | ||
1180 | - | ||
1181 | - vm0 = (Vector4x16)m0; | ||
1182 | + __m64 vdest; | ||
1183 | + __m64 vm0, vm1, vm2, vm3; | ||
1184 | + | ||
1185 | + vdest = *(__m64 *)dst; | ||
1186 | + | ||
1187 | + vm0 = (__m64)m0; | ||
1188 | vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0); | ||
1189 | - vm1 = (Vector4x16)m1; | ||
1190 | + vm1 = (__m64)m1; | ||
1191 | vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1); | ||
1192 | - vm2 = (Vector4x16)m2; | ||
1193 | + vm2 = (__m64)m2; | ||
1194 | vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2); | ||
1195 | - vm3 = (Vector4x16)m3; | ||
1196 | + vm3 = (__m64)m3; | ||
1197 | vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3); | ||
1198 | |||
1199 | - *(Vector4x16 *)dst = vdest; | ||
1200 | + *(__m64 *)dst = vdest; | ||
1201 | } | ||
1202 | |||
1203 | w -= 4; | ||
1204 | mask += 4; | ||
1205 | dst += 4; | ||
1206 | } | ||
1207 | - | ||
1208 | + | ||
1209 | CHECKPOINT(); | ||
1210 | |||
1211 | while (w) | ||
1212 | @@ -850,9 +962,9 @@ | ||
1213 | if (m) | ||
1214 | { | ||
1215 | ullong d = *dst; | ||
1216 | - Vector4x16 vd = (Vector4x16)d; | ||
1217 | - Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), expand565(vd, 0)); | ||
1218 | - *dst = (ullong)pack565(vdest, (Vector4x16)c.mmx_zero, 0); | ||
1219 | + __m64 vd = (__m64)d; | ||
1220 | + __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0)); | ||
1221 | + *dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0); | ||
1222 | } | ||
1223 | |||
1224 | w--; | ||
1225 | @@ -861,7 +973,7 @@ | ||
1226 | } | ||
1227 | } | ||
1228 | |||
1229 | - emms(); | ||
1230 | + _mm_empty(); | ||
1231 | } | ||
1232 | |||
1233 | void | ||
1234 | @@ -887,9 +999,9 @@ | ||
1235 | |||
1236 | fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1); | ||
1237 | fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); | ||
1238 | - | ||
1239 | + | ||
1240 | assert (pSrc->pDrawable == pMask->pDrawable); | ||
1241 | - | ||
1242 | + | ||
1243 | while (height--) | ||
1244 | { | ||
1245 | dst = dstLine; | ||
1246 | @@ -897,14 +1009,14 @@ | ||
1247 | src = srcLine; | ||
1248 | srcLine += srcStride; | ||
1249 | w = width; | ||
1250 | - | ||
1251 | + | ||
1252 | CHECKPOINT(); | ||
1253 | |||
1254 | while (w && (unsigned long)dst & 7) | ||
1255 | { | ||
1256 | - Vector4x16 vsrc = load8888 (*src); | ||
1257 | + __m64 vsrc = load8888 (*src); | ||
1258 | ullong d = *dst; | ||
1259 | - Vector4x16 vdest = expand565 ((Vector4x16)d, 0); | ||
1260 | + __m64 vdest = expand565 ((__m64)d, 0); | ||
1261 | |||
1262 | vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0); | ||
1263 | |||
1264 | @@ -914,19 +1026,19 @@ | ||
1265 | dst++; | ||
1266 | src++; | ||
1267 | } | ||
1268 | - | ||
1269 | + | ||
1270 | CHECKPOINT(); | ||
1271 | |||
1272 | while (w >= 4) | ||
1273 | { | ||
1274 | CARD32 s0, s1, s2, s3; | ||
1275 | unsigned char a0, a1, a2, a3; | ||
1276 | - | ||
1277 | + | ||
1278 | s0 = *src; | ||
1279 | s1 = *(src + 1); | ||
1280 | s2 = *(src + 2); | ||
1281 | s3 = *(src + 3); | ||
1282 | - | ||
1283 | + | ||
1284 | a0 = (s0 >> 24); | ||
1285 | a1 = (s1 >> 24); | ||
1286 | a2 = (s2 >> 24); | ||
1287 | @@ -934,38 +1046,38 @@ | ||
1288 | |||
1289 | if ((a0 & a1 & a2 & a3) == 0xFF) | ||
1290 | { | ||
1291 | - Vector4x16 vdest; | ||
1292 | - vdest = pack565(invert_colors(load8888(s0)), (Vector4x16)c.mmx_zero, 0); | ||
1293 | + __m64 vdest; | ||
1294 | + vdest = pack565(invert_colors(load8888(s0)), _mm_setzero_si64(), 0); | ||
1295 | vdest = pack565(invert_colors(load8888(s1)), vdest, 1); | ||
1296 | vdest = pack565(invert_colors(load8888(s2)), vdest, 2); | ||
1297 | vdest = pack565(invert_colors(load8888(s3)), vdest, 3); | ||
1298 | - | ||
1299 | - *(Vector4x16 *)dst = vdest; | ||
1300 | + | ||
1301 | + *(__m64 *)dst = vdest; | ||
1302 | } | ||
1303 | else if (a0 | a1 | a2 | a3) | ||
1304 | { | ||
1305 | - Vector4x16 vdest = *(Vector4x16 *)dst; | ||
1306 | - | ||
1307 | + __m64 vdest = *(__m64 *)dst; | ||
1308 | + | ||
1309 | vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0); | ||
1310 | vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1); | ||
1311 | vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2); | ||
1312 | vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3); | ||
1313 | - | ||
1314 | - *(Vector4x16 *)dst = vdest; | ||
1315 | + | ||
1316 | + *(__m64 *)dst = vdest; | ||
1317 | } | ||
1318 | |||
1319 | w -= 4; | ||
1320 | dst += 4; | ||
1321 | src += 4; | ||
1322 | } | ||
1323 | - | ||
1324 | + | ||
1325 | CHECKPOINT(); | ||
1326 | |||
1327 | while (w) | ||
1328 | { | ||
1329 | - Vector4x16 vsrc = load8888 (*src); | ||
1330 | + __m64 vsrc = load8888 (*src); | ||
1331 | ullong d = *dst; | ||
1332 | - Vector4x16 vdest = expand565 ((Vector4x16)d, 0); | ||
1333 | + __m64 vdest = expand565 ((__m64)d, 0); | ||
1334 | |||
1335 | vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0); | ||
1336 | |||
1337 | @@ -976,11 +1088,11 @@ | ||
1338 | src++; | ||
1339 | } | ||
1340 | } | ||
1341 | - | ||
1342 | - emms(); | ||
1343 | + | ||
1344 | + _mm_empty(); | ||
1345 | } | ||
1346 | |||
1347 | -/* "888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */ | ||
1348 | +/* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */ | ||
1349 | |||
1350 | void | ||
1351 | fbCompositeSrc_8888RevNPx8888mmx (CARD8 op, | ||
1352 | @@ -1005,9 +1117,9 @@ | ||
1353 | |||
1354 | fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); | ||
1355 | fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); | ||
1356 | - | ||
1357 | + | ||
1358 | assert (pSrc->pDrawable == pMask->pDrawable); | ||
1359 | - | ||
1360 | + | ||
1361 | while (height--) | ||
1362 | { | ||
1363 | dst = dstLine; | ||
1364 | @@ -1015,28 +1127,28 @@ | ||
1365 | src = srcLine; | ||
1366 | srcLine += srcStride; | ||
1367 | w = width; | ||
1368 | - | ||
1369 | + | ||
1370 | while (w && (unsigned long)dst & 7) | ||
1371 | { | ||
1372 | - Vector4x16 s = load8888 (*src); | ||
1373 | - Vector4x16 d = load8888 (*dst); | ||
1374 | + __m64 s = load8888 (*src); | ||
1375 | + __m64 d = load8888 (*dst); | ||
1376 | |||
1377 | - *dst = (ullong)pack8888 (over_rev_non_pre (s, d), (Vector4x16)c.mmx_zero); | ||
1378 | + *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64()); | ||
1379 | |||
1380 | w--; | ||
1381 | dst++; | ||
1382 | src++; | ||
1383 | } | ||
1384 | - | ||
1385 | + | ||
1386 | while (w >= 2) | ||
1387 | { | ||
1388 | ullong s0, s1; | ||
1389 | unsigned char a0, a1; | ||
1390 | - Vector4x16 d0, d1; | ||
1391 | - | ||
1392 | + __m64 d0, d1; | ||
1393 | + | ||
1394 | s0 = *src; | ||
1395 | s1 = *(src + 1); | ||
1396 | - | ||
1397 | + | ||
1398 | a0 = (s0 >> 24); | ||
1399 | a1 = (s1 >> 24); | ||
1400 | |||
1401 | @@ -1044,17 +1156,17 @@ | ||
1402 | { | ||
1403 | d0 = invert_colors(load8888(s0)); | ||
1404 | d1 = invert_colors(load8888(s1)); | ||
1405 | - | ||
1406 | - *(Vector8x8 *)dst = pack8888 (d0, d1); | ||
1407 | + | ||
1408 | + *(__m64 *)dst = pack8888 (d0, d1); | ||
1409 | } | ||
1410 | else if (a0 | a1) | ||
1411 | { | ||
1412 | - Vector4x16 vdest = *(Vector4x16 *)dst; | ||
1413 | - | ||
1414 | + __m64 vdest = *(__m64 *)dst; | ||
1415 | + | ||
1416 | d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0)); | ||
1417 | d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1)); | ||
1418 | - | ||
1419 | - *(Vector8x8 *)dst = pack8888 (d0, d1); | ||
1420 | + | ||
1421 | + *(__m64 *)dst = pack8888 (d0, d1); | ||
1422 | } | ||
1423 | |||
1424 | w -= 2; | ||
1425 | @@ -1064,18 +1176,18 @@ | ||
1426 | |||
1427 | while (w) | ||
1428 | { | ||
1429 | - Vector4x16 s = load8888 (*src); | ||
1430 | - Vector4x16 d = load8888 (*dst); | ||
1431 | + __m64 s = load8888 (*src); | ||
1432 | + __m64 d = load8888 (*dst); | ||
1433 | |||
1434 | - *dst = (ullong)pack8888 (over_rev_non_pre (s, d), (Vector4x16)c.mmx_zero); | ||
1435 | + *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64()); | ||
1436 | |||
1437 | w--; | ||
1438 | dst++; | ||
1439 | src++; | ||
1440 | } | ||
1441 | } | ||
1442 | - | ||
1443 | - emms(); | ||
1444 | + | ||
1445 | + _mm_empty(); | ||
1446 | } | ||
1447 | |||
1448 | void | ||
1449 | @@ -1096,7 +1208,7 @@ | ||
1450 | CARD16 *dstLine; | ||
1451 | CARD32 *maskLine; | ||
1452 | FbStride dstStride, maskStride; | ||
1453 | - Vector4x16 vsrc, vsrca; | ||
1454 | + __m64 vsrc, vsrca; | ||
1455 | |||
1456 | CHECKPOINT(); | ||
1457 | |||
1458 | @@ -1125,7 +1237,7 @@ | ||
1459 | if (m) | ||
1460 | { | ||
1461 | ullong d = *q; | ||
1462 | - Vector4x16 vdest = expand565 ((Vector4x16)d, 0); | ||
1463 | + __m64 vdest = expand565 ((__m64)d, 0); | ||
1464 | vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0); | ||
1465 | *q = (ullong)vdest; | ||
1466 | } | ||
1467 | @@ -1146,14 +1258,14 @@ | ||
1468 | |||
1469 | if ((m0 | m1 | m2 | m3)) | ||
1470 | { | ||
1471 | - Vector4x16 vdest = *(Vector4x16 *)q; | ||
1472 | + __m64 vdest = *(__m64 *)q; | ||
1473 | |||
1474 | vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0); | ||
1475 | vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1); | ||
1476 | vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2); | ||
1477 | vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3); | ||
1478 | |||
1479 | - *(Vector4x16 *)q = vdest; | ||
1480 | + *(__m64 *)q = vdest; | ||
1481 | } | ||
1482 | twidth -= 4; | ||
1483 | p += 4; | ||
1484 | @@ -1168,7 +1280,7 @@ | ||
1485 | if (m) | ||
1486 | { | ||
1487 | ullong d = *q; | ||
1488 | - Vector4x16 vdest = expand565((Vector4x16)d, 0); | ||
1489 | + __m64 vdest = expand565((__m64)d, 0); | ||
1490 | vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0); | ||
1491 | *q = (ullong)vdest; | ||
1492 | } | ||
1493 | @@ -1182,7 +1294,7 @@ | ||
1494 | dstLine += dstStride; | ||
1495 | } | ||
1496 | |||
1497 | - emms (); | ||
1498 | + _mm_empty (); | ||
1499 | } | ||
1500 | |||
1501 | void | ||
1502 | @@ -1210,7 +1322,7 @@ | ||
1503 | |||
1504 | fbComposeGetStart (pSrc, xSrc, ySrc, CARD8, srcStride, srcLine, 1); | ||
1505 | fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 1); | ||
1506 | - | ||
1507 | + | ||
1508 | while (height--) | ||
1509 | { | ||
1510 | dst = dstLine; | ||
1511 | @@ -1218,7 +1330,7 @@ | ||
1512 | src = srcLine; | ||
1513 | srcLine += srcStride; | ||
1514 | w = width; | ||
1515 | - | ||
1516 | + | ||
1517 | while (w && (unsigned long)dst & 7) | ||
1518 | { | ||
1519 | s = *src; | ||
1520 | @@ -1234,13 +1346,7 @@ | ||
1521 | |||
1522 | while (w >= 8) | ||
1523 | { | ||
1524 | - __asm__ __volatile__ ( | ||
1525 | - "movq (%0), %%mm2\n\t" | ||
1526 | - "movq (%1), %%mm3\n\t" | ||
1527 | - "paddusb %%mm2, %%mm3\n\t" | ||
1528 | - "movq %%mm3, (%1)\n\t" | ||
1529 | - : /* no output */ : "r" (src), "r" (dst)); | ||
1530 | - | ||
1531 | + *(__m64*)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst); | ||
1532 | dst += 8; | ||
1533 | src += 8; | ||
1534 | w -= 8; | ||
1535 | @@ -1259,8 +1365,8 @@ | ||
1536 | w--; | ||
1537 | } | ||
1538 | } | ||
1539 | - | ||
1540 | - emms(); | ||
1541 | + | ||
1542 | + _mm_empty(); | ||
1543 | } | ||
1544 | |||
1545 | void | ||
1546 | @@ -1297,13 +1403,8 @@ | ||
1547 | |||
1548 | while (w && (unsigned long)dst & 7) | ||
1549 | { | ||
1550 | - __asm__ __volatile__ ( | ||
1551 | - "movd %0, %%mm2\n\t" | ||
1552 | - "movd %1, %%mm3\n\t" | ||
1553 | - "paddusb %%mm2, %%mm3\n\t" | ||
1554 | - "movd %%mm3, %1\n\t" | ||
1555 | - : /* no output */ : "m" (*src), "m" (*dst)); | ||
1556 | - | ||
1557 | + *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src), | ||
1558 | + _mm_cvtsi32_si64(*dst))); | ||
1559 | dst++; | ||
1560 | src++; | ||
1561 | w--; | ||
1562 | @@ -1311,13 +1412,7 @@ | ||
1563 | |||
1564 | while (w >= 2) | ||
1565 | { | ||
1566 | - __asm__ __volatile__ ( | ||
1567 | - "movq (%0), %%mm2\n\t" | ||
1568 | - "movq (%1), %%mm3\n\t" | ||
1569 | - "paddusb %%mm2, %%mm3\n\t" | ||
1570 | - "movq %%mm3, (%1)\n\t" | ||
1571 | - : /* no output */ : "r" (src), "r" (dst)); | ||
1572 | - | ||
1573 | + *(ullong*)dst = (ullong) _mm_adds_pu8(*(__m64*)src, *(__m64*)dst); | ||
1574 | dst += 2; | ||
1575 | src += 2; | ||
1576 | w -= 2; | ||
1577 | @@ -1325,16 +1420,13 @@ | ||
1578 | |||
1579 | if (w) | ||
1580 | { | ||
1581 | - __asm__ __volatile__ ( | ||
1582 | - "movd %0, %%mm2\n\t" | ||
1583 | - "movd %1, %%mm3\n\t" | ||
1584 | - "paddusb %%mm2, %%mm3\n\t" | ||
1585 | - "movd %%mm3, %1\n\t" | ||
1586 | - : /* no output */ : "m" (*src), "m" (*dst)); | ||
1587 | + *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src), | ||
1588 | + _mm_cvtsi32_si64(*dst))); | ||
1589 | + | ||
1590 | } | ||
1591 | } | ||
1592 | - | ||
1593 | - emms(); | ||
1594 | + | ||
1595 | + _mm_empty(); | ||
1596 | } | ||
1597 | |||
1598 | #define GetStart(drw,x,y,type,stride,line,bpp) {\ | ||
1599 | @@ -1358,19 +1450,19 @@ | ||
1600 | FbStride stride; | ||
1601 | int bpp; | ||
1602 | ullong fill; | ||
1603 | - Vector8x8 vfill; | ||
1604 | + __m64 vfill; | ||
1605 | CARD32 byte_width; | ||
1606 | CARD8 *byte_line; | ||
1607 | FbBits *bits; | ||
1608 | int xoff, yoff; | ||
1609 | |||
1610 | CHECKPOINT(); | ||
1611 | - | ||
1612 | + | ||
1613 | fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff); | ||
1614 | - | ||
1615 | + | ||
1616 | if (bpp == 16 && (xor >> 16 != (xor & 0xffff))) | ||
1617 | return FALSE; | ||
1618 | - | ||
1619 | + | ||
1620 | if (bpp != 16 && bpp != 32) | ||
1621 | return FALSE; | ||
1622 | |||
1623 | @@ -1388,9 +1480,9 @@ | ||
1624 | byte_width = 4 * width; | ||
1625 | stride *= 4; | ||
1626 | } | ||
1627 | - | ||
1628 | + | ||
1629 | fill = ((ullong)xor << 32) | xor; | ||
1630 | - vfill = (Vector8x8)fill; | ||
1631 | + vfill = (__m64)fill; | ||
1632 | |||
1633 | while (height--) | ||
1634 | { | ||
1635 | @@ -1398,7 +1490,7 @@ | ||
1636 | CARD8 *d = byte_line; | ||
1637 | byte_line += stride; | ||
1638 | w = byte_width; | ||
1639 | - | ||
1640 | + | ||
1641 | while (w >= 2 && ((unsigned long)d & 3)) | ||
1642 | { | ||
1643 | *(CARD16 *)d = xor; | ||
1644 | @@ -1406,35 +1498,32 @@ | ||
1645 | d += 2; | ||
1646 | } | ||
1647 | |||
1648 | - while (w >= 4 && ((unsigned int)d & 7)) | ||
1649 | + while (w >= 4 && ((unsigned long)d & 7)) | ||
1650 | { | ||
1651 | *(CARD32 *)d = xor; | ||
1652 | - | ||
1653 | + | ||
1654 | w -= 4; | ||
1655 | d += 4; | ||
1656 | } | ||
1657 | |||
1658 | while (w >= 64) | ||
1659 | { | ||
1660 | - __asm__ __volatile ( | ||
1661 | - "movq %0, (%1)\n\t" | ||
1662 | - "movq %0, 8(%1)\n\t" | ||
1663 | - "movq %0, 16(%1)\n\t" | ||
1664 | - "movq %0, 24(%1)\n\t" | ||
1665 | - "movq %0, 32(%1)\n\t" | ||
1666 | - "movq %0, 40(%1)\n\t" | ||
1667 | - "movq %0, 48(%1)\n\t" | ||
1668 | - "movq %0, 56(%1)\n\t" | ||
1669 | - : /* no output */ | ||
1670 | - : "y" (vfill), "r" (d) | ||
1671 | - : "memory"); | ||
1672 | + *(__m64*) (d + 0) = vfill; | ||
1673 | + *(__m64*) (d + 8) = vfill; | ||
1674 | + *(__m64*) (d + 16) = vfill; | ||
1675 | + *(__m64*) (d + 24) = vfill; | ||
1676 | + *(__m64*) (d + 32) = vfill; | ||
1677 | + *(__m64*) (d + 40) = vfill; | ||
1678 | + *(__m64*) (d + 48) = vfill; | ||
1679 | + *(__m64*) (d + 56) = vfill; | ||
1680 | + | ||
1681 | w -= 64; | ||
1682 | d += 64; | ||
1683 | } | ||
1684 | while (w >= 4) | ||
1685 | { | ||
1686 | *(CARD32 *)d = xor; | ||
1687 | - | ||
1688 | + | ||
1689 | w -= 4; | ||
1690 | d += 4; | ||
1691 | } | ||
1692 | @@ -1446,16 +1535,160 @@ | ||
1693 | } | ||
1694 | } | ||
1695 | |||
1696 | - emms(); | ||
1697 | + _mm_empty(); | ||
1698 | + return TRUE; | ||
1699 | +} | ||
1700 | + | ||
1701 | +Bool | |
1702 | +fbCopyAreammx (DrawablePtr pSrc, | |
1703 | + DrawablePtr pDst, | |
1704 | + int src_x, | |
1705 | + int src_y, | |
1706 | + int dst_x, | |
1707 | + int dst_y, | |
1708 | + int width, | |
1709 | + int height) | |
1710 | +{ /* Copies a width x height rectangle between two drawables of equal bpp (16 or 32), using 64-bit moves for the aligned middle of each row; returns FALSE without copying for any other bpp combination. Rows and bytes are copied in ascending order with no overlap check. */ | |
1711 | + FbBits * src_bits; | |
1712 | + FbStride src_stride; | |
1713 | + int src_bpp; | |
1714 | + int src_xoff; | |
1715 | + int src_yoff; | |
1716 | + | |
1717 | + FbBits * dst_bits; | |
1718 | + FbStride dst_stride; | |
1719 | + int dst_bpp; | |
1720 | + int dst_xoff; | |
1721 | + int dst_yoff; | |
1722 | + | |
1723 | + CARD8 * src_bytes; | |
1724 | + CARD8 * dst_bytes; | |
1725 | + int byte_width; | |
1726 | + | |
1727 | + fbGetDrawable(pSrc, src_bits, src_stride, src_bpp, src_xoff, src_yoff); | |
1728 | + fbGetDrawable(pDst, dst_bits, dst_stride, dst_bpp, dst_xoff, dst_yoff); | |
1729 | + | |
1730 | + if (src_bpp != 16 && src_bpp != 32) | |
1731 | + return FALSE; | |
1732 | + | |
1733 | + if (dst_bpp != 16 && dst_bpp != 32) | |
1734 | + return FALSE; | |
1735 | + | |
1736 | + if (src_bpp != dst_bpp) | |
1737 | + { | |
1738 | + return FALSE; | |
1739 | + } | |
1740 | + | |
1741 | + if (src_bpp == 16) | |
1742 | + { | |
1743 | + src_stride = src_stride * sizeof (FbBits) / 2; | |
1744 | + dst_stride = dst_stride * sizeof (FbBits) / 2; | |
1745 | + src_bytes = (CARD8 *)(((CARD16 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff)); | |
1746 | + dst_bytes = (CARD8 *)(((CARD16 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff)); | |
1747 | + byte_width = 2 * width; | |
1748 | + src_stride *= 2; | |
1749 | + dst_stride *= 2; | |
1750 | + } | |
1751 | + else | |
1752 | + { | |
1753 | + src_stride = src_stride * sizeof (FbBits) / 4; | |
1754 | + dst_stride = dst_stride * sizeof (FbBits) / 4; | |
1755 | + src_bytes = (CARD8 *)(((CARD32 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff)); | |
1756 | + dst_bytes = (CARD8 *)(((CARD32 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff)); | |
1757 | + byte_width = 4 * width; | |
1758 | + src_stride *= 4; | |
1759 | + dst_stride *= 4; | |
1760 | + } | |
1761 | + | |
1762 | + while (height--) | |
1763 | + { | |
1764 | + int w; | |
1765 | + CARD8 *s = src_bytes; | |
1766 | + CARD8 *d = dst_bytes; | |
1767 | + src_bytes += src_stride; | |
1768 | + dst_bytes += dst_stride; | |
1769 | + w = byte_width; | |
1770 | + | |
1771 | + while (w >= 2 && ((unsigned long)d & 3)) | |
1772 | + { | |
1773 | + *(CARD16 *)d = *(CARD16 *)s; | |
1774 | + w -= 2; | |
1775 | + s += 2; | |
1776 | + d += 2; | |
1777 | + } | |
1778 | + | |
1779 | + while (w >= 4 && ((unsigned long)d & 7)) | |
1780 | + { | |
1781 | + *(CARD32 *)d = *(CARD32 *)s; | |
1782 | + | |
1783 | + w -= 4; | |
1784 | + s += 4; | |
1785 | + d += 4; | |
1786 | + } | |
1787 | + | |
1788 | + while (w >= 64) | |
1789 | + { | |
1790 | + *(__m64 *)(d + 0) = *(__m64 *)(s + 0); | |
1791 | + *(__m64 *)(d + 8) = *(__m64 *)(s + 8); | |
1792 | + *(__m64 *)(d + 16) = *(__m64 *)(s + 16); | |
1793 | + *(__m64 *)(d + 24) = *(__m64 *)(s + 24); | |
1794 | + *(__m64 *)(d + 32) = *(__m64 *)(s + 32); | |
1795 | + *(__m64 *)(d + 40) = *(__m64 *)(s + 40); | |
1796 | + *(__m64 *)(d + 48) = *(__m64 *)(s + 48); | |
1797 | + *(__m64 *)(d + 56) = *(__m64 *)(s + 56); | |
1798 | + w -= 64; | |
1799 | + s += 64; | |
1800 | + d += 64; | |
1801 | + } | |
1802 | + while (w >= 4) | |
1803 | + { | |
1804 | + *(CARD32 *)d = *(CARD32 *)s; | |
1805 | + | |
1806 | + w -= 4; | |
1807 | + s += 4; | |
1808 | + d += 4; | |
1809 | + } | |
1810 | + if (w >= 2) | |
1811 | + { | |
1812 | + *(CARD16 *)d = *(CARD16 *)s; | |
1813 | + w -= 2; | |
1814 | + s += 2; | |
1815 | + d += 2; | |
1816 | + } | |
1817 | + } | |
1818 | + | |
1819 | + _mm_empty(); | |
1820 | return TRUE; | |
1821 | } | |
1822 | |||
1823 | +void | |
1824 | +fbCompositeCopyAreammx (CARD8 op, | |
1825 | + PicturePtr pSrc, | |
1826 | + PicturePtr pMask, | |
1827 | + PicturePtr pDst, | |
1828 | + INT16 xSrc, | |
1829 | + INT16 ySrc, | |
1830 | + INT16 xMask, | |
1831 | + INT16 yMask, | |
1832 | + INT16 xDst, | |
1833 | + INT16 yDst, | |
1834 | + CARD16 width, | |
1835 | + CARD16 height) | |
1836 | +{ /* Composite-signature adapter around fbCopyAreammx: op, pMask, xMask and yMask are not used, and the Bool result of fbCopyAreammx is discarded. */ | |
1837 | + fbCopyAreammx (pSrc->pDrawable, | |
1838 | + pDst->pDrawable, | |
1839 | + xSrc, ySrc, | |
1840 | + xDst, yDst, | |
1841 | + width, height); | |
1842 | +} | |
1843 | + | ||
1844 | +#ifndef __amd64__ | ||
1845 | Bool | ||
1846 | fbHaveMMX (void) | ||
1847 | { | ||
1848 | static Bool initialized = FALSE; | ||
1849 | static Bool mmx_present; | ||
1850 | - | ||
1851 | + | ||
1852 | if (!initialized) | ||
1853 | { | ||
1854 | int tmp; /* static variables are accessed through %ebx, | ||
1855 | @@ -1466,7 +1699,7 @@ | ||
1856 | |||
1857 | __asm__ __volatile__ ( | ||
1858 | /* Check if bit 21 in flags word is writeable */ | ||
1859 | - | ||
1860 | + | ||
1861 | "pusha \n\t" | ||
1862 | "pushfl \n\t" | ||
1863 | "popl %%eax \n\t" | ||
1864 | @@ -1502,13 +1735,14 @@ | ||
1865 | : /* no input */); | ||
1866 | |||
1867 | initialized = TRUE; | ||
1868 | - | ||
1869 | + | ||
1870 | mmx_present = tmp; | ||
1871 | } | ||
1872 | |||
1873 | return mmx_present; | ||
1874 | } | ||
1875 | +#endif /* __amd64__ */ | ||
1876 | |||
1877 | |||
1878 | #endif /* RENDER */ | ||
1879 | -#endif /* USE_GCC34_MMX */ | ||
1880 | +#endif /* USE_MMX */ | ||
1881 | diff -ur xc-orig/programs/Xserver/fb/fbmmx.h xc/programs/Xserver/fb/fbmmx.h | ||
1882 | --- xc-orig/programs/Xserver/fb/fbmmx.h 2005-02-11 04:00:50.006092570 -0500 | ||
1883 | +++ xc/programs/Xserver/fb/fbmmx.h 2005-02-11 04:01:32.072346126 -0500 | ||
1884 | @@ -1,5 +1,5 @@ | ||
1885 | /* | ||
1886 | - * Copyright © 2004 Red Hat, Inc. | ||
1887 | + * Copyright © 2004 Red Hat, Inc. | ||
1888 | * | ||
1889 | * Permission to use, copy, modify, distribute, and sell this software and its | ||
1890 | * documentation for any purpose is hereby granted without fee, provided that | ||
1891 | @@ -18,17 +18,23 @@ | ||
1892 | * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN | ||
1893 | * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
1894 | * | ||
1895 | - * Author: Søren Sandmann (sandmann@redhat.com) | ||
1896 | + * Author: Søren Sandmann (sandmann@redhat.com) | ||
1897 | * | ||
1898 | * Based on work by Owen Taylor | ||
1899 | */ | ||
1900 | -#ifdef USE_GCC34_MMX | ||
1901 | +#ifdef USE_MMX | ||
1902 | + | ||
1903 | +#ifndef __amd64__ | ||
1904 | Bool fbHaveMMX(void); | ||
1905 | #else | ||
1906 | -#define fbHaveMMX FALSE | ||
1907 | +#define fbHaveMMX() TRUE | ||
1908 | +#endif | ||
1909 | + | ||
1910 | +#else | ||
1911 | +#define fbHaveMMX() FALSE | ||
1912 | #endif | ||
1913 | |||
1914 | -#ifdef USE_GCC34_MMX | ||
1915 | +#ifdef USE_MMX | ||
1916 | |||
1917 | void fbCompositeSolidMask_nx8888x0565Cmmx (CARD8 op, | ||
1918 | PicturePtr pSrc, | ||
1919 | @@ -150,6 +156,38 @@ | ||
1920 | INT16 yDst, | ||
1921 | CARD16 width, | ||
1922 | CARD16 height); | ||
1923 | +void fbCompositeSrc_8888x8x8888mmx (CARD8 op, | ||
1924 | + PicturePtr pSrc, | ||
1925 | + PicturePtr pMask, | ||
1926 | + PicturePtr pDst, | ||
1927 | + INT16 xSrc, | ||
1928 | + INT16 ySrc, | ||
1929 | + INT16 xMask, | ||
1930 | + INT16 yMask, | ||
1931 | + INT16 xDst, | ||
1932 | + INT16 yDst, | ||
1933 | + CARD16 width, | ||
1934 | + CARD16 height); | ||
1935 | +Bool fbCopyAreammx (DrawablePtr pSrc, | ||
1936 | + DrawablePtr pDst, | ||
1937 | + int src_x, | ||
1938 | + int src_y, | ||
1939 | + int dst_x, | ||
1940 | + int dst_y, | ||
1941 | + int width, | ||
1942 | + int height); | ||
1943 | +void fbCompositeCopyAreammx (CARD8 op, | ||
1944 | + PicturePtr pSrc, | ||
1945 | + PicturePtr pMask, | ||
1946 | + PicturePtr pDst, | ||
1947 | + INT16 xSrc, | ||
1948 | + INT16 ySrc, | ||
1949 | + INT16 xMask, | ||
1950 | + INT16 yMask, | ||
1951 | + INT16 xDst, | ||
1952 | + INT16 yDst, | ||
1953 | + CARD16 width, | ||
1954 | + CARD16 height); | ||
1955 | Bool fbSolidFillmmx (DrawablePtr pDraw, | ||
1956 | int x, | ||
1957 | int y, | ||
1958 | @@ -157,4 +195,4 @@ | ||
1959 | int height, | ||
1960 | FbBits xor); | ||
1961 | |||
1962 | -#endif /* USE_GCC34_MMX */ | ||
1963 | +#endif /* USE_MMX */ | ||
1964 | |||
1965 | diff -ur xc-orig/programs/Xserver/fb/fbpict.c xc/programs/Xserver/fb/fbpict.c | ||
1966 | --- xc-orig/programs/Xserver/fb/fbpict.c 2005-02-11 04:00:50.007092600 -0500 | ||
1967 | +++ xc/programs/Xserver/fb/fbpict.c 2005-02-11 04:01:32.075346216 -0500 | ||
1968 | @@ -1,7 +1,7 @@ | ||
1969 | /* | ||
1970 | * $XFree86: xc/programs/Xserver/fb/fbpict.c,v 1.15 2002/09/26 02:56:48 keithp Exp $ | ||
1971 | * | ||
1972 | - * Copyright © 2000 SuSE, Inc. | ||
1973 | + * Copyright © 2000 SuSE, Inc. | ||
1974 | * | ||
1975 | * Permission to use, copy, modify, distribute, and sell this software and its | ||
1976 | * documentation for any purpose is hereby granted without fee, provided that | ||
1977 | @@ -863,6 +863,15 @@ | ||
1978 | if (!pSrc->transform && !(pMask && pMask->transform)) | ||
1979 | if (!maskAlphaMap && !srcAlphaMap && !dstAlphaMap) | ||
1980 | switch (op) { | ||
1981 | + case PictOpSrc: /* MMX path: gate on fbHaveMMX() like every other *mmx selection below */ | ||
1982 | +#ifdef USE_MMX | ||
1983 | + if (fbHaveMMX() && !pMask && pSrc->format == pDst->format && | ||
1984 | + pSrc->pDrawable != pDst->pDrawable) | ||
1985 | + { | ||
1986 | + func = fbCompositeCopyAreammx; | ||
1987 | + } | ||
1988 | +#endif | ||
1989 | + break; | ||
1990 | case PictOpOver: | ||
1991 | if (pMask) | ||
1992 | { | ||
1993 | @@ -877,7 +886,7 @@ | ||
1994 | switch (pDst->format) { | ||
1995 | case PICT_r5g6b5: | ||
1996 | case PICT_b5g6r5: | ||
1997 | -#ifdef USE_GCC34_MMX | ||
1998 | +#ifdef USE_MMX | ||
1999 | if (fbHaveMMX()) | ||
2000 | func = fbCompositeSolidMask_nx8x0565mmx; | ||
2001 | else | ||
2002 | @@ -892,7 +901,7 @@ | ||
2003 | case PICT_x8r8g8b8: | ||
2004 | case PICT_a8b8g8r8: | ||
2005 | case PICT_x8b8g8r8: | ||
2006 | -#ifdef USE_GCC34_MMX | ||
2007 | +#ifdef USE_MMX | ||
2008 | if (fbHaveMMX()) | ||
2009 | func = fbCompositeSolidMask_nx8x8888mmx; | ||
2010 | else | ||
2011 | @@ -906,7 +915,7 @@ | ||
2012 | switch (pDst->format) { | ||
2013 | case PICT_a8r8g8b8: | ||
2014 | case PICT_x8r8g8b8: | ||
2015 | -#ifdef USE_GCC34_MMX | ||
2016 | +#ifdef USE_MMX | ||
2017 | if (fbHaveMMX()) | ||
2018 | func = fbCompositeSolidMask_nx8888x8888Cmmx; | ||
2019 | else | ||
2020 | @@ -914,7 +923,7 @@ | ||
2021 | func = fbCompositeSolidMask_nx8888x8888C; | ||
2022 | break; | ||
2023 | case PICT_r5g6b5: | ||
2024 | -#ifdef USE_GCC34_MMX | ||
2025 | +#ifdef USE_MMX | ||
2026 | if (fbHaveMMX()) | ||
2027 | func = fbCompositeSolidMask_nx8888x0565Cmmx; | ||
2028 | else | ||
2029 | @@ -929,7 +938,7 @@ | ||
2030 | switch (pDst->format) { | ||
2031 | case PICT_a8b8g8r8: | ||
2032 | case PICT_x8b8g8r8: | ||
2033 | -#ifdef USE_GCC34_MMX | ||
2034 | +#ifdef USE_MMX | ||
2035 | if (fbHaveMMX()) | ||
2036 | func = fbCompositeSolidMask_nx8888x8888Cmmx; | ||
2037 | else | ||
2038 | @@ -937,7 +946,7 @@ | ||
2039 | func = fbCompositeSolidMask_nx8888x8888C; | ||
2040 | break; | ||
2041 | case PICT_b5g6r5: | ||
2042 | -#ifdef USE_GCC34_MMX | ||
2043 | +#ifdef USE_MMX | ||
2044 | if (fbHaveMMX()) | ||
2045 | func = fbCompositeSolidMask_nx8888x0565Cmmx; | ||
2046 | else | ||
2047 | @@ -970,6 +979,7 @@ | ||
2048 | xSrc == xMask && ySrc == yMask && | ||
2049 | !pMask->componentAlpha) | ||
2050 | { | ||
2051 | + /* source == mask: non-premultiplied data */ | ||
2052 | switch (pSrc->format) { | ||
2053 | case PICT_x8b8g8r8: | ||
2054 | switch (pMask->format) { | ||
2055 | @@ -978,13 +988,13 @@ | ||
2056 | switch (pDst->format) { | ||
2057 | case PICT_a8r8g8b8: | ||
2058 | case PICT_x8r8g8b8: | ||
2059 | -#ifdef USE_GCC34_MMX | ||
2060 | +#ifdef USE_MMX | ||
2061 | if (fbHaveMMX()) | ||
2062 | func = fbCompositeSrc_8888RevNPx8888mmx; | ||
2063 | #endif | ||
2064 | break; | ||
2065 | case PICT_r5g6b5: | ||
2066 | -#ifdef USE_GCC34_MMX | ||
2067 | +#ifdef USE_MMX | ||
2068 | if (fbHaveMMX()) | ||
2069 | func = fbCompositeSrc_8888RevNPx0565mmx; | ||
2070 | #endif | ||
2071 | @@ -1000,13 +1010,13 @@ | ||
2072 | switch (pDst->format) { | ||
2073 | case PICT_a8b8g8r8: | ||
2074 | case PICT_x8b8g8r8: | ||
2075 | -#ifdef USE_GCC34_MMX | ||
2076 | +#ifdef USE_MMX | ||
2077 | if (fbHaveMMX()) | ||
2078 | func = fbCompositeSrc_8888RevNPx8888mmx; | ||
2079 | #endif | ||
2080 | break; | ||
2081 | case PICT_r5g6b5: | ||
2082 | -#ifdef USE_GCC34_MMX | ||
2083 | +#ifdef USE_MMX | ||
2084 | if (fbHaveMMX()) | ||
2085 | func = fbCompositeSrc_8888RevNPx0565mmx; | ||
2086 | #endif | ||
2087 | @@ -1018,9 +1028,27 @@ | ||
2088 | } | ||
2089 | break; | ||
2090 | } | ||
2091 | + else | ||
2092 | + { | ||
2093 | + /* non-repeating source, repeating mask => translucent window */ | ||
2094 | + if (maskRepeat && | ||
2095 | + pMask->pDrawable->width == 1 && | ||
2096 | + pMask->pDrawable->height == 1) | ||
2097 | + { | ||
2098 | + if (pSrc->format == PICT_x8r8g8b8 && | ||
2099 | + pDst->format == PICT_x8r8g8b8 && | ||
2100 | + pMask->format == PICT_a8) | ||
2101 | + { | ||
2102 | +#ifdef USE_MMX | ||
2103 | + if (fbHaveMMX()) | ||
2104 | + func = fbCompositeSrc_8888x8x8888mmx; | ||
2105 | +#endif | ||
2106 | + } | ||
2107 | + } | ||
2108 | + } | ||
2109 | } | ||
2110 | } | ||
2111 | - else | ||
2112 | + else /* no mask */ | ||
2113 | { | ||
2114 | if (srcRepeat && | ||
2115 | pSrc->pDrawable->width == 1 && | ||
2116 | @@ -1032,7 +1060,7 @@ | ||
2117 | switch (pDst->format) { | ||
2118 | case PICT_a8r8g8b8: | ||
2119 | case PICT_x8r8g8b8: | ||
2120 | -#ifdef USE_GCC34_MMX | ||
2121 | +#ifdef USE_MMX | ||
2122 | if (fbHaveMMX()) | ||
2123 | { | ||
2124 | srcRepeat = FALSE; | ||
2125 | @@ -1041,7 +1069,7 @@ | ||
2126 | #endif | ||
2127 | break; | ||
2128 | case PICT_r5g6b5: | ||
2129 | -#ifdef USE_GCC34_MMX | ||
2130 | +#ifdef USE_MMX | ||
2131 | if (fbHaveMMX()) | ||
2132 | { | ||
2133 | srcRepeat = FALSE; | ||
2134 | @@ -1070,6 +1098,28 @@ | ||
2135 | break; | ||
2136 | } | ||
2137 | break; | ||
2138 | + case PICT_x8r8g8b8: | ||
2139 | + switch (pDst->format) { | ||
2140 | + case PICT_a8r8g8b8: | ||
2141 | + case PICT_x8r8g8b8: | ||
2142 | +#ifdef USE_MMX | ||
2143 | + if (fbHaveMMX()) | ||
2144 | + func = fbCompositeCopyAreammx; | ||
2145 | +#endif | ||
2146 | + break; | ||
2147 | + } | ||
| + break; /* do not fall through into the x8b8g8r8 source case */ | ||
2148 | + case PICT_x8b8g8r8: | ||
2149 | + switch (pDst->format) { | ||
2150 | + case PICT_a8b8g8r8: | ||
2151 | + case PICT_x8b8g8r8: | ||
2152 | +#ifdef USE_MMX | ||
2153 | + if (fbHaveMMX()) | ||
2154 | + func = fbCompositeCopyAreammx; | ||
2155 | +#endif | ||
2156 | + break; | ||
2157 | + } | ||
2158 | + break; | ||
2159 | case PICT_a8b8g8r8: | ||
2160 | switch (pDst->format) { | ||
2161 | case PICT_a8b8g8r8: | ||
2162 | @@ -1109,7 +1158,7 @@ | ||
2163 | case PICT_a8r8g8b8: | ||
2164 | switch (pDst->format) { | ||
2165 | case PICT_a8r8g8b8: | ||
2166 | -#ifdef USE_GCC34_MMX | ||
2167 | +#ifdef USE_MMX | ||
2168 | if (fbHaveMMX()) | ||
2169 | func = fbCompositeSrcAdd_8888x8888mmx; | ||
2170 | else | ||
2171 | @@ -1121,7 +1170,7 @@ | ||
2172 | case PICT_a8b8g8r8: | ||
2173 | switch (pDst->format) { | ||
2174 | case PICT_a8b8g8r8: | ||
2175 | -#ifdef USE_GCC34_MMX | ||
2176 | +#ifdef USE_MMX | ||
2177 | if (fbHaveMMX()) | ||
2178 | func = fbCompositeSrcAdd_8888x8888mmx; | ||
2179 | else | ||
2180 | @@ -1133,7 +1182,7 @@ | ||
2181 | case PICT_a8: | ||
2182 | switch (pDst->format) { | ||
2183 | case PICT_a8: | ||
2184 | -#ifdef USE_GCC34_MMX | ||
2185 | +#ifdef USE_MMX | ||
2186 | if (fbHaveMMX()) | ||
2187 | func = fbCompositeSrcAdd_8000x8000mmx; | ||
2188 | else |