Contents of /trunk/xorg-old/patches-6.8.2-r10/9914_all_6.8.2-mmx-gcc4.patch
Parent Directory | Revision Log
Revision 167 -
(show annotations)
(download)
Tue May 8 20:58:51 2007 UTC (17 years, 4 months ago) by niro
File size: 52896 byte(s)
-import
1 | diff -ur xc-orig/programs/Xserver/fb/Imakefile xc/programs/Xserver/fb/Imakefile |
2 | --- xc-orig/programs/Xserver/fb/Imakefile 2005-02-11 04:00:50.004092510 -0500 |
3 | +++ xc/programs/Xserver/fb/Imakefile 2005-02-11 04:01:32.059345739 -0500 |
4 | @@ -3,13 +3,22 @@ |
5 | XCOMM |
6 | XCOMM Id: Imakefile,v 1.1 1999/11/02 03:54:44 keithp Exp $ |
7 | |
8 | -#if defined(i386Architecture) && defined(HasGcc34) && HasGcc34 |
9 | +#if defined(HasGcc34) && HasGcc34 |
10 | MMXOPTIONS= -mmmx -Winline --param inline-unit-growth=10000 \ |
11 | - --param large-function-growth=10000 -DUSE_GCC34_MMX |
12 | + --param large-function-growth=10000 -DUSE_MMX |
13 | +SSEOPTIONS= $(MMXOPTIONS) -msse -DUSE_SSE |
14 | |
15 | +#if defined(i386Architecture) |
16 | SpecialCObjectRule(fbmmx,fbmmx.c,$(MMXOPTIONS)) |
17 | +#elif defined(AMD64Architecture) |
18 | +SpecialCObjectRule(fbmmx,fbmmx.c,$(SSEOPTIONS)) |
19 | +#endif |
20 | + |
21 | +#if defined(i386Architecture) || defined(AMD64Architecture) |
22 | SpecialCObjectRule(fbpict,fbpict.c,$(MMXOPTIONS)) |
23 | SpecialCObjectRule(fbfill,fbfill.c,$(MMXOPTIONS)) |
24 | +SpecialCObjectRule(fbcopy,fbcopy.c,$(MMXOPTIONS)) |
25 | +#endif |
26 | |
27 | #endif |
28 | |
29 | diff -ur xc-orig/programs/Xserver/fb/fbcompose.c xc/programs/Xserver/fb/fbcompose.c |
30 | --- xc-orig/programs/Xserver/fb/fbcompose.c 2005-02-11 04:00:50.009092659 -0500 |
31 | +++ xc/programs/Xserver/fb/fbcompose.c 2005-02-11 04:01:32.067345977 -0500 |
32 | @@ -1,8 +1,8 @@ |
33 | /* |
34 | - * $XdotOrg: xc/programs/Xserver/fb/fbcompose.c,v 1.3 2004/05/12 01:49:46 anholt Exp $ |
35 | + * $XdotOrg: xc/programs/Xserver/fb/fbcompose.c,v 1.5 2005/01/13 20:49:21 sandmann Exp $ |
36 | * $XFree86: xc/programs/Xserver/fb/fbcompose.c,v 1.17tsi Exp $ |
37 | * |
38 | - * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. |
39 | + * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. |
40 | * |
41 | * Permission to use, copy, modify, distribute, and sell this software and its |
42 | * documentation for any purpose is hereby granted without fee, provided that |
43 | @@ -2693,7 +2693,6 @@ |
44 | op->u.transform.y = y - op->u.transform.top_y; |
45 | } |
46 | |
47 | - |
48 | Bool |
49 | fbBuildCompositeOperand (PicturePtr pPict, |
50 | FbCompositeOperand op[4], |
51 | @@ -2710,7 +2709,6 @@ |
52 | |
53 | op->u.transform.top_y = pPict->pDrawable->y; |
54 | op->u.transform.left_x = pPict->pDrawable->x; |
55 | - |
56 | op->u.transform.start_x = x - op->u.transform.left_x; |
57 | op->u.transform.x = op->u.transform.start_x; |
58 | op->u.transform.y = y - op->u.transform.top_y; |
59 | @@ -2822,6 +2820,21 @@ |
60 | FbCombineFunc f; |
61 | int w; |
62 | |
63 | +#if 0 |
64 | + ErrorF ("op: %d\n" |
65 | + "src format: %lx\n" |
66 | + "msk format %lx\n" |
67 | + "dst format %lx\n" |
68 | + "width: %d\n" |
69 | + "height %d\n", |
70 | + op, |
71 | + pSrc? pSrc->format : 0, |
72 | + pMask? pMask->format : 0, |
73 | + pDst? pDst->format : 0, |
74 | + width, height); |
75 | + ErrorF ("PICT_x8r8g8b8: %lx\n", PICT_x8r8g8b8); |
76 | +#endif |
77 | + |
78 | if (!fbBuildCompositeOperand (pSrc, src, xSrc, ySrc, TRUE, TRUE)) |
79 | return; |
80 | if (!fbBuildCompositeOperand (pDst, dst, xDst, yDst, FALSE, TRUE)) |
81 | diff -ur xc-orig/programs/Xserver/fb/fbcopy.c xc/programs/Xserver/fb/fbcopy.c |
82 | --- xc-orig/programs/Xserver/fb/fbcopy.c 2005-02-11 04:00:50.004092510 -0500 |
83 | +++ xc/programs/Xserver/fb/fbcopy.c 2005-02-11 04:01:32.068346007 -0500 |
84 | @@ -1,7 +1,7 @@ |
85 | /* |
86 | * Id: fbcopy.c,v 1.1 1999/11/02 03:54:45 keithp Exp $ |
87 | * |
88 | - * Copyright © 1998 Keith Packard |
89 | + * Copyright © 1998 Keith Packard |
90 | * |
91 | * Permission to use, copy, modify, distribute, and sell this software and its |
92 | * documentation for any purpose is hereby granted without fee, provided that |
93 | @@ -27,6 +27,7 @@ |
94 | #ifdef IN_MODULE |
95 | #include "xf86_ansic.h" |
96 | #endif |
97 | +#include "fbmmx.h" |
98 | |
99 | void |
100 | fbCopyNtoN (DrawablePtr pSrcDrawable, |
101 | @@ -54,28 +55,51 @@ |
102 | |
103 | fbGetDrawable (pSrcDrawable, src, srcStride, srcBpp, srcXoff, srcYoff); |
104 | fbGetDrawable (pDstDrawable, dst, dstStride, dstBpp, dstXoff, dstYoff); |
105 | - |
106 | + |
107 | while (nbox--) |
108 | { |
109 | +#ifdef USE_MMX |
110 | + if (!reverse && !upsidedown && fbHaveMMX()) |
111 | + { |
112 | + if (!fbCopyAreammx (pSrcDrawable, |
113 | + pDstDrawable, |
114 | + |
115 | + (pbox->x1 + dx + srcXoff), |
116 | + (pbox->y1 + dy + srcYoff), |
117 | + |
118 | + (pbox->x1 + dstXoff), |
119 | + (pbox->y1 + dstYoff), |
120 | + |
121 | + (pbox->x2 - pbox->x1), |
122 | + (pbox->y2 - pbox->y1))) |
123 | + goto fallback; |
124 | + else |
125 | + goto next; |
126 | + } |
127 | + fallback: |
128 | +#endif |
129 | fbBlt (src + (pbox->y1 + dy + srcYoff) * srcStride, |
130 | srcStride, |
131 | (pbox->x1 + dx + srcXoff) * srcBpp, |
132 | - |
133 | + |
134 | dst + (pbox->y1 + dstYoff) * dstStride, |
135 | dstStride, |
136 | (pbox->x1 + dstXoff) * dstBpp, |
137 | - |
138 | + |
139 | (pbox->x2 - pbox->x1) * dstBpp, |
140 | (pbox->y2 - pbox->y1), |
141 | - |
142 | + |
143 | alu, |
144 | pm, |
145 | dstBpp, |
146 | - |
147 | + |
148 | reverse, |
149 | upsidedown); |
150 | +#ifdef USE_MMX |
151 | + next: |
152 | +#endif |
153 | pbox++; |
154 | - } |
155 | + } |
156 | } |
157 | |
158 | void |
159 | @@ -594,7 +618,7 @@ |
160 | int yOut) |
161 | { |
162 | fbCopyProc copy; |
163 | - |
164 | + |
165 | #ifdef FB_24_32BIT |
166 | if (pSrcDrawable->bitsPerPixel != pDstDrawable->bitsPerPixel) |
167 | copy = fb24_32CopyMtoN; |
168 | diff -ur xc-orig/programs/Xserver/fb/fbfill.c xc/programs/Xserver/fb/fbfill.c |
169 | --- xc-orig/programs/Xserver/fb/fbfill.c 2005-02-11 04:00:50.006092570 -0500 |
170 | +++ xc/programs/Xserver/fb/fbfill.c 2005-02-11 04:01:32.069346037 -0500 |
171 | @@ -1,7 +1,7 @@ |
172 | /* |
173 | * Id: fbfill.c,v 1.1 1999/11/02 03:54:45 keithp Exp $ |
174 | * |
175 | - * Copyright © 1998 Keith Packard |
176 | + * Copyright © 1998 Keith Packard |
177 | * |
178 | * Permission to use, copy, modify, distribute, and sell this software and its |
179 | * documentation for any purpose is hereby granted without fee, provided that |
180 | @@ -44,7 +44,7 @@ |
181 | |
182 | switch (pGC->fillStyle) { |
183 | case FillSolid: |
184 | -#ifdef USE_GCC34_MMX |
185 | +#ifdef USE_MMX |
186 | if (!pPriv->and && fbHaveMMX()) |
187 | if (fbSolidFillmmx (pDrawable, x, y, width, height, pPriv->xor)) |
188 | return; |
189 | |
190 | diff -ur xc-orig/programs/Xserver/fb/fbmmx.c xc/programs/Xserver/fb/fbmmx.c |
191 | --- xc-orig/programs/Xserver/fb/fbmmx.c 2005-02-11 04:00:50.006092570 -0500 |
192 | +++ xc/programs/Xserver/fb/fbmmx.c 2005-02-11 04:01:32.072346126 -0500 |
193 | @@ -1,5 +1,6 @@ |
194 | /* |
195 | - * Copyright © 2004 Red Hat, Inc. |
196 | + * Copyright © 2004 Red Hat, Inc. |
197 | + * Copyright © 2004 Nicholas Miell |
198 | * |
199 | * Permission to use, copy, modify, distribute, and sell this software and its |
200 | * documentation for any purpose is hereby granted without fee, provided that |
201 | @@ -18,14 +19,23 @@ |
202 | * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
203 | * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
204 | * |
205 | - * Author: Søren Sandmann (sandmann@redhat.com) |
206 | - * |
207 | + * Author: Søren Sandmann (sandmann@redhat.com) |
208 | + * Minor Improvements: Nicholas Miell (nmiell@gmail.com) |
209 | + * |
210 | * Based on work by Owen Taylor |
211 | */ |
212 | |
213 | + |
214 | +#ifdef USE_MMX |
215 | + |
216 | #include "fb.h" |
217 | +#include "fbmmx.h" |
218 | + |
219 | +#include <mmintrin.h> |
220 | |
221 | -#ifdef USE_GCC34_MMX |
222 | +#ifdef USE_SSE |
223 | +#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */ |
224 | +#endif |
225 | |
226 | #ifdef RENDER |
227 | |
228 | @@ -33,11 +43,6 @@ |
229 | #include "mipict.h" |
230 | #include "fbpict.h" |
231 | |
232 | -typedef int Vector1x64 __attribute__ ((mode(DI))); |
233 | -typedef int Vector2x32 __attribute__ ((mode(V2SI))); |
234 | -typedef int Vector4x16 __attribute__ ((mode(V4HI))); |
235 | -typedef int Vector8x8 __attribute__ ((mode(V8QI))); |
236 | - |
237 | typedef unsigned long long ullong; |
238 | |
239 | #define noVERBOSE |
240 | @@ -50,7 +55,6 @@ |
241 | |
242 | typedef struct |
243 | { |
244 | - ullong mmx_zero; |
245 | ullong mmx_4x00ff; |
246 | ullong mmx_4x0080; |
247 | ullong mmx_565_rgb; |
248 | @@ -70,7 +74,6 @@ |
249 | |
250 | static const MMXData c = |
251 | { |
252 | - .mmx_zero = 0x0000000000000000ULL, |
253 | .mmx_4x00ff = 0x00ff00ff00ff00ffULL, |
254 | .mmx_4x0080 = 0x0080008000800080ULL, |
255 | .mmx_565_rgb = 0x000001f0003f001fULL, |
256 | @@ -88,121 +91,112 @@ |
257 | .mmx_000000000000ffff = 0x000000000000ffffULL, |
258 | }; |
259 | |
260 | -static __inline__ Vector1x64 |
261 | -shift (Vector1x64 v, int s) |
262 | +#define MC(x) ((__m64) c.mmx_##x) |
263 | + |
264 | +static __inline__ __m64 |
265 | +shift (__m64 v, int s) |
266 | { |
267 | if (s > 0) |
268 | - return __builtin_ia32_psllq (v, s); |
269 | + return _mm_slli_si64 (v, s); |
270 | else if (s < 0) |
271 | - return __builtin_ia32_psrlq (v, -s); |
272 | + return _mm_srli_si64 (v, -s); |
273 | else |
274 | return v; |
275 | } |
276 | |
277 | -static __inline__ Vector4x16 |
278 | -negate (Vector4x16 mask) |
279 | +static __inline__ __m64 |
280 | +negate (__m64 mask) |
281 | { |
282 | - return (Vector4x16)__builtin_ia32_pxor ( |
283 | - (Vector1x64)mask, |
284 | - (Vector1x64)c.mmx_4x00ff); |
285 | + return _mm_xor_si64 (mask, MC(4x00ff)); |
286 | } |
287 | |
288 | -static __inline__ Vector4x16 |
289 | -pix_multiply (Vector4x16 a, Vector4x16 b) |
290 | +static __inline__ __m64 |
291 | +pix_multiply (__m64 a, __m64 b) |
292 | { |
293 | - Vector4x16 res; |
294 | + __m64 res; |
295 | |
296 | - res = __builtin_ia32_pmullw (a, b); |
297 | - res = __builtin_ia32_paddw (res, (Vector4x16)c.mmx_4x0080); |
298 | - res = __builtin_ia32_psrlw (res, 8); |
299 | + res = _mm_mullo_pi16 (a, b); |
300 | + res = _mm_add_pi16 (res, MC(4x0080)); |
301 | + res = _mm_srli_pi16 (res, 8); |
302 | |
303 | return res; |
304 | } |
305 | |
306 | -#if 0 |
307 | +#ifdef USE_SSE |
308 | #define HAVE_PSHUFW |
309 | #endif |
310 | |
311 | #ifdef HAVE_PSHUFW |
312 | |
313 | -static __inline__ Vector4x16 |
314 | -expand_alpha (Vector4x16 pixel) |
315 | +static __inline__ __m64 |
316 | +expand_alpha (__m64 pixel) |
317 | { |
318 | - Vector4x16 result; |
319 | - __asm__ ("pshufw $0xFF, %1, %0\n\t" : "=y" (result) : "y" (pixel)); |
320 | - return result; |
321 | + return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 3, 3, 3)); |
322 | } |
323 | |
324 | -static __inline__ Vector4x16 |
325 | -expand_alpha_rev (Vector4x16 pixel) |
326 | +static __inline__ __m64 |
327 | +expand_alpha_rev (__m64 pixel) |
328 | { |
329 | - Vector4x16 result; |
330 | - __asm__ ("pshufw $0x00, %1, %0\n\t" : "=y" (result) : "y" (pixel)); |
331 | - return result; |
332 | + return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(0, 0, 0, 0)); |
333 | } |
334 | |
335 | -static __inline__ Vector4x16 |
336 | -invert_colors (Vector4x16 pixel) |
337 | +static __inline__ __m64 |
338 | +invert_colors (__m64 pixel) |
339 | { |
340 | - Vector4x16 result; |
341 | - |
342 | - /* 0xC6 = 11000110 */ |
343 | - /* 3 0 1 2 */ |
344 | - |
345 | - __asm__ ("pshufw $0xC6, %1, %0\n\t" : "=y" (result) : "y" (pixel)); |
346 | - |
347 | - return result; |
348 | + return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 0, 1, 2)); |
349 | } |
350 | |
351 | #else |
352 | |
353 | -static __inline__ Vector4x16 |
354 | -expand_alpha (Vector4x16 pixel) |
355 | +static __inline__ __m64 |
356 | +expand_alpha (__m64 pixel) |
357 | { |
358 | - Vector1x64 t1, t2; |
359 | - |
360 | - t1 = shift ((Vector1x64)pixel, -48); |
361 | + __m64 t1, t2; |
362 | + |
363 | + t1 = shift (pixel, -48); |
364 | t2 = shift (t1, 16); |
365 | - t1 = __builtin_ia32_por (t1, t2); |
366 | + t1 = _mm_or_si64 (t1, t2); |
367 | t2 = shift (t1, 32); |
368 | - t1 = __builtin_ia32_por (t1, t2); |
369 | - |
370 | - return (Vector4x16)t1; |
371 | + t1 = _mm_or_si64 (t1, t2); |
372 | + |
373 | + return t1; |
374 | } |
375 | |
376 | -static __inline__ Vector4x16 |
377 | -expand_alpha_rev (Vector4x16 pixel) |
378 | +static __inline__ __m64 |
379 | +expand_alpha_rev (__m64 pixel) |
380 | { |
381 | - Vector1x64 t1, t2; |
382 | - |
383 | - t1 = shift ((Vector1x64)pixel, 48); |
384 | + __m64 t1, t2; |
385 | + |
386 | + /* move alpha to low 16 bits and zero the rest */ |
387 | + t1 = shift (pixel, 48); |
388 | t1 = shift (t1, -48); |
389 | + |
390 | t2 = shift (t1, 16); |
391 | - t1 = __builtin_ia32_por (t1, t2); |
392 | + t1 = _mm_or_si64 (t1, t2); |
393 | t2 = shift (t1, 32); |
394 | - t1 = __builtin_ia32_por (t1, t2); |
395 | - |
396 | - return (Vector4x16)t1; |
397 | + t1 = _mm_or_si64 (t1, t2); |
398 | + |
399 | + return t1; |
400 | } |
401 | |
402 | -static __inline__ Vector4x16 |
403 | -invert_colors (Vector4x16 pixel) |
404 | +static __inline__ __m64 |
405 | +invert_colors (__m64 pixel) |
406 | { |
407 | - Vector1x64 x, y, z; |
408 | - |
409 | - x = y = z = (Vector1x64)pixel; |
410 | - |
411 | - x = __builtin_ia32_pand (x, (Vector1x64)c.mmx_ffff0000ffff0000); |
412 | - y = __builtin_ia32_pand (y, (Vector1x64)c.mmx_000000000000ffff); |
413 | - z = __builtin_ia32_pand (z, (Vector1x64)c.mmx_0000ffff00000000); |
414 | - |
415 | + __m64 x, y, z; |
416 | + |
417 | + x = y = z = pixel; |
418 | + |
419 | + x = _mm_and_si64 (x, MC(ffff0000ffff0000)); |
420 | + y = _mm_and_si64 (y, MC(000000000000ffff)); |
421 | + z = _mm_and_si64 (z, MC(0000ffff00000000)); |
422 | + |
423 | y = shift (y, 32); |
424 | z = shift (z, -32); |
425 | - |
426 | - x = __builtin_ia32_por (x, y); |
427 | - x = __builtin_ia32_por (x, z); |
428 | - |
429 | - return (Vector4x16)x; |
430 | + |
431 | + x = _mm_or_si64 (x, y); |
432 | + x = _mm_or_si64 (x, z); |
433 | + |
434 | + return x; |
435 | } |
436 | |
437 | #endif |
438 | @@ -210,147 +204,138 @@ |
439 | /* Notes about writing mmx code |
440 | * |
441 | * give memory operands as the second operand. If you give it as the |
442 | - * first, gcc will first load it into a register, then use that register |
443 | + * first, gcc will first load it into a register, then use that |
444 | + * register |
445 | * |
446 | * ie. use |
447 | * |
448 | - * __builtin_pmullw (x, mmx_constant[8]); |
449 | + * _mm_mullo_pi16 (x, mmx_constant); |
450 | * |
451 | * not |
452 | * |
453 | - * __builtin_pmullw (mmx_constant[8], x); |
454 | + * _mm_mullo_pi16 (mmx_constant, x); |
455 | * |
456 | - * Also try to minimize dependencies. Ie. when you need a value, try to calculate |
457 | - * it from a value that was calculated as early as possible. |
458 | + * Also try to minimize dependencies. i.e. when you need a value, try |
459 | + * to calculate it from a value that was calculated as early as |
460 | + * possible. |
461 | */ |
462 | |
463 | -static __inline__ Vector4x16 |
464 | -over (Vector4x16 src, Vector4x16 srca, Vector4x16 dest) |
465 | +static __inline__ __m64 |
466 | +over (__m64 src, __m64 srca, __m64 dest) |
467 | { |
468 | - return (Vector4x16)__builtin_ia32_paddusb ((Vector8x8)src, (Vector8x8)pix_multiply(dest, negate(srca))); |
469 | + return _mm_adds_pu8 (src, pix_multiply(dest, negate(srca))); |
470 | } |
471 | |
472 | -static __inline__ Vector4x16 |
473 | -over_rev_non_pre (Vector4x16 src, Vector4x16 dest) |
474 | +static __inline__ __m64 |
475 | +over_rev_non_pre (__m64 src, __m64 dest) |
476 | { |
477 | - Vector4x16 srca = expand_alpha (src); |
478 | - Vector4x16 srcfaaa = (Vector4x16)__builtin_ia32_por((Vector1x64)srca, (Vector1x64)c.mmx_full_alpha); |
479 | - |
480 | + __m64 srca = expand_alpha (src); |
481 | + __m64 srcfaaa = _mm_or_si64 (srca, MC(full_alpha)); |
482 | + |
483 | return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest); |
484 | } |
485 | |
486 | -static __inline__ Vector4x16 |
487 | -in (Vector4x16 src, |
488 | - Vector4x16 mask) |
489 | +static __inline__ __m64 |
490 | +in (__m64 src, |
491 | + __m64 mask) |
492 | { |
493 | return pix_multiply (src, mask); |
494 | } |
495 | |
496 | -static __inline__ Vector4x16 |
497 | -in_over (Vector4x16 src, |
498 | - Vector4x16 srca, |
499 | - Vector4x16 mask, |
500 | - Vector4x16 dest) |
501 | +static __inline__ __m64 |
502 | +in_over (__m64 src, |
503 | + __m64 srca, |
504 | + __m64 mask, |
505 | + __m64 dest) |
506 | { |
507 | return over(in(src, mask), pix_multiply(srca, mask), dest); |
508 | } |
509 | |
510 | -static __inline__ Vector8x8 |
511 | -cvt32to64 (CARD32 v) |
512 | -{ |
513 | - ullong r = v; |
514 | - return (Vector8x8)r; |
515 | -} |
516 | - |
517 | -static __inline__ Vector4x16 |
518 | +static __inline__ __m64 |
519 | load8888 (CARD32 v) |
520 | { |
521 | - return (Vector4x16)__builtin_ia32_punpcklbw (cvt32to64 (v), |
522 | - (Vector8x8)c.mmx_zero); |
523 | + return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64()); |
524 | } |
525 | |
526 | -static __inline__ Vector8x8 |
527 | -pack8888 (Vector4x16 lo, Vector4x16 hi) |
528 | +static __inline__ __m64 |
529 | +pack8888 (__m64 lo, __m64 hi) |
530 | { |
531 | - Vector8x8 r; |
532 | - r = __builtin_ia32_packuswb ((Vector4x16)lo, (Vector4x16)hi); |
533 | + __m64 r; |
534 | + r = _mm_packs_pu16 (lo, hi); |
535 | return r; |
536 | } |
537 | |
538 | -/* Expand 16 bits positioned at @pos (0-3) of a mmx register into 00RR00GG00BB |
539 | - |
540 | ---- Expanding 565 in the low word --- |
541 | - |
542 | -m = (m << (32 - 3)) | (m << (16 - 5)) | m; |
543 | -m = m & (01f0003f001f); |
544 | -m = m * (008404100840); |
545 | -m = m >> 8; |
546 | - |
547 | -Note the trick here - the top word is shifted by another nibble to avoid |
548 | -it bumping into the middle word |
549 | -*/ |
550 | -static __inline__ Vector4x16 |
551 | -expand565 (Vector4x16 pixel, int pos) |
552 | +/* Expand 16 bits positioned at @pos (0-3) of a mmx register into |
553 | + * |
554 | + * 00RR00GG00BB |
555 | + * |
556 | + * --- Expanding 565 in the low word --- |
557 | + * |
558 | + * m = (m << (32 - 3)) | (m << (16 - 5)) | m; |
559 | + * m = m & (01f0003f001f); |
560 | + * m = m * (008404100840); |
561 | + * m = m >> 8; |
562 | + * |
563 | + * Note the trick here - the top word is shifted by another nibble to |
564 | + * avoid it bumping into the middle word |
565 | + */ |
566 | +static __inline__ __m64 |
567 | +expand565 (__m64 pixel, int pos) |
568 | { |
569 | - Vector1x64 p = (Vector1x64)pixel; |
570 | + __m64 p = pixel; |
571 | + __m64 t1, t2; |
572 | |
573 | /* move pixel to low 16 bit and zero the rest */ |
574 | p = shift (shift (p, (3 - pos) * 16), -48); |
575 | |
576 | - Vector1x64 t1 = shift (p, 36 - 11); |
577 | - Vector1x64 t2 = shift (p, 16 - 5); |
578 | + t1 = shift (p, 36 - 11); |
579 | + t2 = shift (p, 16 - 5); |
580 | |
581 | - p = __builtin_ia32_por (t1, p); |
582 | - p = __builtin_ia32_por (t2, p); |
583 | - p = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_rgb); |
584 | + p = _mm_or_si64 (t1, p); |
585 | + p = _mm_or_si64 (t2, p); |
586 | + p = _mm_and_si64 (p, MC(565_rgb)); |
587 | |
588 | - pixel = __builtin_ia32_pmullw ((Vector4x16)p, (Vector4x16)c.mmx_565_unpack_multiplier); |
589 | - return __builtin_ia32_psrlw (pixel, 8); |
590 | + pixel = _mm_mullo_pi16 (p, MC(565_unpack_multiplier)); |
591 | + return _mm_srli_pi16 (pixel, 8); |
592 | } |
593 | |
594 | -static __inline__ Vector4x16 |
595 | -expand8888 (Vector4x16 in, int pos) |
596 | +static __inline__ __m64 |
597 | +expand8888 (__m64 in, int pos) |
598 | { |
599 | if (pos == 0) |
600 | - return (Vector4x16)__builtin_ia32_punpcklbw ((Vector8x8)in, (Vector8x8)c.mmx_zero); |
601 | + return _mm_unpacklo_pi8 (in, _mm_setzero_si64()); |
602 | else |
603 | - return (Vector4x16)__builtin_ia32_punpckhbw ((Vector8x8)in, (Vector8x8)c.mmx_zero); |
604 | + return _mm_unpackhi_pi8 (in, _mm_setzero_si64()); |
605 | } |
606 | |
607 | -static __inline__ Vector4x16 |
608 | -pack565 (Vector4x16 pixel, Vector4x16 target, int pos) |
609 | +static __inline__ __m64 |
610 | +pack565 (__m64 pixel, __m64 target, int pos) |
611 | { |
612 | - Vector1x64 p = (Vector1x64)pixel; |
613 | - Vector1x64 t = (Vector1x64)target; |
614 | - Vector1x64 r, g, b; |
615 | + __m64 p = pixel; |
616 | + __m64 t = target; |
617 | + __m64 r, g, b; |
618 | |
619 | - r = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_r); |
620 | - g = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_g); |
621 | - b = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_b); |
622 | + r = _mm_and_si64 (p, MC(565_r)); |
623 | + g = _mm_and_si64 (p, MC(565_g)); |
624 | + b = _mm_and_si64 (p, MC(565_b)); |
625 | |
626 | r = shift (r, - (32 - 8) + pos * 16); |
627 | g = shift (g, - (16 - 3) + pos * 16); |
628 | b = shift (b, - (0 + 3) + pos * 16); |
629 | - |
630 | + |
631 | if (pos == 0) |
632 | - t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_0); |
633 | + t = _mm_and_si64 (t, MC(mask_0)); |
634 | else if (pos == 1) |
635 | - t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_1); |
636 | + t = _mm_and_si64 (t, MC(mask_1)); |
637 | else if (pos == 2) |
638 | - t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_2); |
639 | + t = _mm_and_si64 (t, MC(mask_2)); |
640 | else if (pos == 3) |
641 | - t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_3); |
642 | + t = _mm_and_si64 (t, MC(mask_3)); |
643 | |
644 | - p = __builtin_ia32_por (r, t); |
645 | - p = __builtin_ia32_por (g, p); |
646 | + p = _mm_or_si64 (r, t); |
647 | + p = _mm_or_si64 (g, p); |
648 | |
649 | - return (Vector4x16)__builtin_ia32_por (b, p); |
650 | -} |
651 | - |
652 | -static __inline__ void |
653 | -emms (void) |
654 | -{ |
655 | - __asm__ __volatile__ ("emms"); |
656 | + return _mm_or_si64 (b, p); |
657 | } |
658 | |
659 | void |
660 | @@ -371,8 +356,8 @@ |
661 | CARD32 *dstLine, *dst; |
662 | CARD16 w; |
663 | FbStride dstStride; |
664 | - Vector4x16 vsrc, vsrca; |
665 | - |
666 | + __m64 vsrc, vsrca; |
667 | + |
668 | CHECKPOINT(); |
669 | |
670 | fbComposeGetSolid(pSrc, src, pDst->format); |
671 | @@ -384,51 +369,52 @@ |
672 | |
673 | vsrc = load8888 (src); |
674 | vsrca = expand_alpha (vsrc); |
675 | - |
676 | + |
677 | while (height--) |
678 | { |
679 | dst = dstLine; |
680 | dstLine += dstStride; |
681 | w = width; |
682 | - |
683 | + |
684 | CHECKPOINT(); |
685 | |
686 | while (w && (unsigned long)dst & 7) |
687 | { |
688 | - *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), (Vector4x16)c.mmx_zero); |
689 | + *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), |
690 | + _mm_setzero_si64()); |
691 | |
692 | w--; |
693 | dst++; |
694 | } |
695 | - |
696 | + |
697 | while (w >= 2) |
698 | { |
699 | - Vector4x16 vdest; |
700 | - Vector4x16 dest0, dest1; |
701 | - |
702 | - vdest = *(Vector4x16 *)dst; |
703 | + __m64 vdest; |
704 | + __m64 dest0, dest1; |
705 | + |
706 | + vdest = *(__m64 *)dst; |
707 | |
708 | dest0 = over(vsrc, vsrca, expand8888(vdest, 0)); |
709 | dest1 = over(vsrc, vsrca, expand8888(vdest, 1)); |
710 | |
711 | - *(Vector8x8 *)dst = (Vector8x8)pack8888(dest0, dest1); |
712 | + *(__m64 *)dst = pack8888(dest0, dest1); |
713 | |
714 | dst += 2; |
715 | w -= 2; |
716 | } |
717 | - |
718 | + |
719 | CHECKPOINT(); |
720 | |
721 | while (w) |
722 | { |
723 | - *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), (Vector4x16)c.mmx_zero); |
724 | + *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), _mm_setzero_si64()); |
725 | |
726 | w--; |
727 | dst++; |
728 | } |
729 | } |
730 | |
731 | - emms(); |
732 | + _mm_empty(); |
733 | } |
734 | |
735 | void |
736 | @@ -449,8 +435,8 @@ |
737 | CARD16 *dstLine, *dst; |
738 | CARD16 w; |
739 | FbStride dstStride; |
740 | - Vector4x16 vsrc, vsrca; |
741 | - |
742 | + __m64 vsrc, vsrca; |
743 | + |
744 | CHECKPOINT(); |
745 | |
746 | fbComposeGetSolid(pSrc, src, pDst->format); |
747 | @@ -462,49 +448,49 @@ |
748 | |
749 | vsrc = load8888 (src); |
750 | vsrca = expand_alpha (vsrc); |
751 | - |
752 | + |
753 | while (height--) |
754 | { |
755 | dst = dstLine; |
756 | dstLine += dstStride; |
757 | w = width; |
758 | - |
759 | + |
760 | CHECKPOINT(); |
761 | |
762 | while (w && (unsigned long)dst & 7) |
763 | { |
764 | ullong d = *dst; |
765 | - Vector4x16 vdest = expand565 ((Vector4x16)d, 0); |
766 | + __m64 vdest = expand565 ((__m64)d, 0); |
767 | vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0); |
768 | *dst = (ullong)vdest; |
769 | |
770 | w--; |
771 | dst++; |
772 | } |
773 | - |
774 | + |
775 | while (w >= 4) |
776 | { |
777 | - Vector4x16 vdest; |
778 | - |
779 | - vdest = *(Vector4x16 *)dst; |
780 | + __m64 vdest; |
781 | + |
782 | + vdest = *(__m64 *)dst; |
783 | |
784 | vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0); |
785 | vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1); |
786 | vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2); |
787 | vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3); |
788 | |
789 | - *(Vector8x8 *)dst = (Vector8x8)vdest; |
790 | + *(__m64 *)dst = vdest; |
791 | |
792 | dst += 4; |
793 | w -= 4; |
794 | } |
795 | - |
796 | + |
797 | CHECKPOINT(); |
798 | |
799 | while (w) |
800 | { |
801 | ullong d = *dst; |
802 | - Vector4x16 vdest = expand565 ((Vector4x16)d, 0); |
803 | + __m64 vdest = expand565 ((__m64)d, 0); |
804 | vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0); |
805 | *dst = (ullong)vdest; |
806 | |
807 | @@ -513,7 +499,7 @@ |
808 | } |
809 | } |
810 | |
811 | - emms(); |
812 | + _mm_empty(); |
813 | } |
814 | |
815 | void |
816 | @@ -534,8 +520,8 @@ |
817 | CARD32 *dstLine; |
818 | CARD32 *maskLine; |
819 | FbStride dstStride, maskStride; |
820 | - Vector4x16 vsrc, vsrca; |
821 | - |
822 | + __m64 vsrc, vsrca; |
823 | + |
824 | CHECKPOINT(); |
825 | |
826 | fbComposeGetSolid(pSrc, src, pDst->format); |
827 | @@ -562,9 +548,9 @@ |
828 | |
829 | if (m) |
830 | { |
831 | - Vector4x16 vdest = load8888(*q); |
832 | + __m64 vdest = load8888(*q); |
833 | vdest = in_over(vsrc, vsrca, load8888(m), vdest); |
834 | - *q = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero); |
835 | + *q = (ullong)pack8888(vdest, _mm_setzero_si64()); |
836 | } |
837 | |
838 | twidth--; |
839 | @@ -580,15 +566,15 @@ |
840 | |
841 | if (m0 | m1) |
842 | { |
843 | - Vector4x16 dest0, dest1; |
844 | - Vector4x16 vdest = *(Vector4x16 *)q; |
845 | + __m64 dest0, dest1; |
846 | + __m64 vdest = *(__m64 *)q; |
847 | |
848 | dest0 = in_over(vsrc, vsrca, load8888(m0), |
849 | expand8888 (vdest, 0)); |
850 | dest1 = in_over(vsrc, vsrca, load8888(m1), |
851 | expand8888 (vdest, 1)); |
852 | |
853 | - *(Vector8x8 *)q = (Vector8x8)pack8888(dest0, dest1); |
854 | + *(__m64 *)q = pack8888(dest0, dest1); |
855 | } |
856 | |
857 | p += 2; |
858 | @@ -602,9 +588,9 @@ |
859 | |
860 | if (m) |
861 | { |
862 | - Vector4x16 vdest = load8888(*q); |
863 | + __m64 vdest = load8888(*q); |
864 | vdest = in_over(vsrc, vsrca, load8888(m), vdest); |
865 | - *q = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero); |
866 | + *q = (ullong)pack8888(vdest, _mm_setzero_si64()); |
867 | } |
868 | |
869 | twidth--; |
870 | @@ -616,7 +602,133 @@ |
871 | maskLine += maskStride; |
872 | } |
873 | |
874 | - emms(); |
875 | + _mm_empty(); |
876 | +} |
877 | + |
878 | +void |
879 | +fbCompositeSrc_8888x8x8888mmx (CARD8 op, |
880 | + PicturePtr pSrc, |
881 | + PicturePtr pMask, |
882 | + PicturePtr pDst, |
883 | + INT16 xSrc, |
884 | + INT16 ySrc, |
885 | + INT16 xMask, |
886 | + INT16 yMask, |
887 | + INT16 xDst, |
888 | + INT16 yDst, |
889 | + CARD16 width, |
890 | + CARD16 height) |
891 | +{ |
892 | + CARD32 *dstLine, *dst; |
893 | + CARD32 *srcLine, *src; |
894 | + CARD8 *maskLine; |
895 | + CARD32 mask; |
896 | + __m64 vmask; |
897 | + FbStride dstStride, srcStride, maskStride; |
898 | + CARD16 w; |
899 | + __m64 srca; |
900 | + |
901 | + CHECKPOINT(); |
902 | + |
903 | + fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); |
904 | + fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); |
905 | + fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1); |
906 | + |
907 | + mask = *maskLine << 24 | *maskLine << 16 | *maskLine << 8 | *maskLine; |
908 | + vmask = load8888 (mask); |
909 | + srca = MC(4x00ff); |
910 | + |
911 | + while (height--) |
912 | + { |
913 | + dst = dstLine; |
914 | + dstLine += dstStride; |
915 | + src = srcLine; |
916 | + srcLine += srcStride; |
917 | + w = width; |
918 | + |
919 | + while (w && (unsigned long)dst & 7) |
920 | + { |
921 | + __m64 s = load8888 (*src); |
922 | + __m64 d = load8888 (*dst); |
923 | + |
924 | + *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64()); |
925 | + |
926 | + w--; |
927 | + dst++; |
928 | + src++; |
929 | + } |
930 | + |
931 | + while (w >= 16) |
932 | + { |
933 | + __m64 vd0 = *(__m64 *)(dst + 0); |
934 | + __m64 vd1 = *(__m64 *)(dst + 2); |
935 | + __m64 vd2 = *(__m64 *)(dst + 4); |
936 | + __m64 vd3 = *(__m64 *)(dst + 6); |
937 | + __m64 vd4 = *(__m64 *)(dst + 8); |
938 | + __m64 vd5 = *(__m64 *)(dst + 10); |
939 | + __m64 vd6 = *(__m64 *)(dst + 12); |
940 | + __m64 vd7 = *(__m64 *)(dst + 14); |
941 | + |
942 | + __m64 vs0 = *(__m64 *)(src + 0); |
943 | + __m64 vs1 = *(__m64 *)(src + 2); |
944 | + __m64 vs2 = *(__m64 *)(src + 4); |
945 | + __m64 vs3 = *(__m64 *)(src + 6); |
946 | + __m64 vs4 = *(__m64 *)(src + 8); |
947 | + __m64 vs5 = *(__m64 *)(src + 10); |
948 | + __m64 vs6 = *(__m64 *)(src + 12); |
949 | + __m64 vs7 = *(__m64 *)(src + 14); |
950 | + |
951 | + vd0 = (__m64)pack8888 ( |
952 | + in_over (expand8888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)), |
953 | + in_over (expand8888 (vs0, 1), srca, vmask, expand8888 (vd0, 1))); |
954 | + |
955 | + vd1 = (__m64)pack8888 ( |
956 | + in_over (expand8888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)), |
957 | + in_over (expand8888 (vs1, 1), srca, vmask, expand8888 (vd1, 1))); |
958 | + |
959 | + vd2 = (__m64)pack8888 ( |
960 | + in_over (expand8888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)), |
961 | + in_over (expand8888 (vs2, 1), srca, vmask, expand8888 (vd2, 1))); |
962 | + |
963 | + vd3 = (__m64)pack8888 ( |
964 | + in_over (expand8888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)), |
965 | + in_over (expand8888 (vs3, 1), srca, vmask, expand8888 (vd3, 1))); |
966 | + |
967 | + vd4 = (__m64)pack8888 ( |
968 | + in_over (expand8888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)), |
969 | + in_over (expand8888 (vs4, 1), srca, vmask, expand8888 (vd4, 1))); |
970 | + |
971 | + vd5 = (__m64)pack8888 ( |
972 | + in_over (expand8888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)), |
973 | + in_over (expand8888 (vs5, 1), srca, vmask, expand8888 (vd5, 1))); |
974 | + |
975 | + vd6 = (__m64)pack8888 ( |
976 | + in_over (expand8888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)), |
977 | + in_over (expand8888 (vs6, 1), srca, vmask, expand8888 (vd6, 1))); |
978 | + |
979 | + vd7 = (__m64)pack8888 ( |
980 | + in_over (expand8888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)), |
981 | + in_over (expand8888 (vs7, 1), srca, vmask, expand8888 (vd7, 1))); |
982 | + |
983 | + w -= 16; |
984 | + dst += 16; |
985 | + src += 16; |
986 | + } |
987 | + |
988 | + while (w) |
989 | + { |
990 | + __m64 s = load8888 (*src); |
991 | + __m64 d = load8888 (*dst); |
992 | + |
993 | + *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64()); |
994 | + |
995 | + w--; |
996 | + dst++; |
997 | + src++; |
998 | + } |
999 | + } |
1000 | + |
1001 | + _mm_empty(); |
1002 | } |
1003 | |
1004 | void |
1005 | @@ -638,7 +750,7 @@ |
1006 | CARD8 *maskLine, *mask; |
1007 | FbStride dstStride, maskStride; |
1008 | CARD16 w; |
1009 | - Vector4x16 vsrc, vsrca; |
1010 | + __m64 vsrc, vsrca; |
1011 | ullong srcsrc; |
1012 | |
1013 | CHECKPOINT(); |
1014 | @@ -648,7 +760,7 @@ |
1015 | srca = src >> 24; |
1016 | if (srca == 0) |
1017 | return; |
1018 | - |
1019 | + |
1020 | srcsrc = (unsigned long long)src << 32 | src; |
1021 | |
1022 | fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); |
1023 | @@ -664,7 +776,7 @@ |
1024 | mask = maskLine; |
1025 | maskLine += maskStride; |
1026 | w = width; |
1027 | - |
1028 | + |
1029 | CHECKPOINT(); |
1030 | |
1031 | while (w && (unsigned long)dst & 7) |
1032 | @@ -673,15 +785,15 @@ |
1033 | |
1034 | if (m) |
1035 | { |
1036 | - Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), load8888(*dst)); |
1037 | - *dst = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero); |
1038 | + __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), load8888(*dst)); |
1039 | + *dst = (ullong)pack8888(vdest, _mm_setzero_si64()); |
1040 | } |
1041 | |
1042 | w--; |
1043 | mask++; |
1044 | dst++; |
1045 | } |
1046 | - |
1047 | + |
1048 | CHECKPOINT(); |
1049 | |
1050 | while (w >= 2) |
1051 | @@ -689,29 +801,29 @@ |
1052 | ullong m0, m1; |
1053 | m0 = *mask; |
1054 | m1 = *(mask + 1); |
1055 | - |
1056 | + |
1057 | if (srca == 0xff && (m0 & m1) == 0xff) |
1058 | { |
1059 | *(unsigned long long *)dst = srcsrc; |
1060 | } |
1061 | else if (m0 | m1) |
1062 | { |
1063 | - Vector4x16 vdest; |
1064 | - Vector4x16 dest0, dest1; |
1065 | - |
1066 | - vdest = *(Vector4x16 *)dst; |
1067 | + __m64 vdest; |
1068 | + __m64 dest0, dest1; |
1069 | + |
1070 | + vdest = *(__m64 *)dst; |
1071 | |
1072 | - dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m0), expand8888(vdest, 0)); |
1073 | - dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m1), expand8888(vdest, 1)); |
1074 | + dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m0), expand8888(vdest, 0)); |
1075 | + dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m1), expand8888(vdest, 1)); |
1076 | |
1077 | - *(Vector8x8 *)dst = (Vector8x8)pack8888(dest0, dest1); |
1078 | + *(__m64 *)dst = pack8888(dest0, dest1); |
1079 | } |
1080 | |
1081 | mask += 2; |
1082 | dst += 2; |
1083 | w -= 2; |
1084 | } |
1085 | - |
1086 | + |
1087 | CHECKPOINT(); |
1088 | |
1089 | while (w) |
1090 | @@ -720,9 +832,9 @@ |
1091 | |
1092 | if (m) |
1093 | { |
1094 | - Vector4x16 vdest = load8888(*dst); |
1095 | - vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), vdest); |
1096 | - *dst = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero); |
1097 | + __m64 vdest = load8888(*dst); |
1098 | + vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), vdest); |
1099 | + *dst = (ullong)pack8888(vdest, _mm_setzero_si64()); |
1100 | } |
1101 | |
1102 | w--; |
1103 | @@ -731,7 +843,7 @@ |
1104 | } |
1105 | } |
1106 | |
1107 | - emms(); |
1108 | + _mm_empty(); |
1109 | } |
1110 | |
1111 | |
1112 | @@ -754,7 +866,7 @@ |
1113 | CARD8 *maskLine, *mask; |
1114 | FbStride dstStride, maskStride; |
1115 | CARD16 w; |
1116 | - Vector4x16 vsrc, vsrca; |
1117 | + __m64 vsrc, vsrca; |
1118 | unsigned long long srcsrcsrcsrc, src16; |
1119 | |
1120 | CHECKPOINT(); |
1121 | @@ -770,9 +882,9 @@ |
1122 | |
1123 | vsrc = load8888 (src); |
1124 | vsrca = expand_alpha (vsrc); |
1125 | - |
1126 | - src16 = (ullong)pack565(vsrc, (Vector4x16)c.mmx_zero, 0); |
1127 | - |
1128 | + |
1129 | + src16 = (ullong)pack565(vsrc, _mm_setzero_si64(), 0); |
1130 | + |
1131 | srcsrcsrcsrc = (ullong)src16 << 48 | (ullong)src16 << 32 | |
1132 | (ullong)src16 << 16 | (ullong)src16; |
1133 | |
1134 | @@ -783,7 +895,7 @@ |
1135 | mask = maskLine; |
1136 | maskLine += maskStride; |
1137 | w = width; |
1138 | - |
1139 | + |
1140 | CHECKPOINT(); |
1141 | |
1142 | while (w && (unsigned long)dst & 7) |
1143 | @@ -793,16 +905,16 @@ |
1144 | if (m) |
1145 | { |
1146 | ullong d = *dst; |
1147 | - Vector4x16 vd = (Vector4x16)d; |
1148 | - Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), expand565(vd, 0)); |
1149 | - *dst = (ullong)pack565(vdest, (Vector4x16)c.mmx_zero, 0); |
1150 | + __m64 vd = (__m64)d; |
1151 | + __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0)); |
1152 | + *dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0); |
1153 | } |
1154 | |
1155 | w--; |
1156 | mask++; |
1157 | dst++; |
1158 | } |
1159 | - |
1160 | + |
1161 | CHECKPOINT(); |
1162 | |
1163 | while (w >= 4) |
1164 | @@ -812,35 +924,35 @@ |
1165 | m1 = *(mask + 1); |
1166 | m2 = *(mask + 2); |
1167 | m3 = *(mask + 3); |
1168 | - |
1169 | + |
1170 | if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff) |
1171 | { |
1172 | *(unsigned long long *)dst = srcsrcsrcsrc; |
1173 | } |
1174 | else if (m0 | m1 | m2 | m3) |
1175 | { |
1176 | - Vector4x16 vdest; |
1177 | - Vector4x16 vm0, vm1, vm2, vm3; |
1178 | - |
1179 | - vdest = *(Vector4x16 *)dst; |
1180 | - |
1181 | - vm0 = (Vector4x16)m0; |
1182 | + __m64 vdest; |
1183 | + __m64 vm0, vm1, vm2, vm3; |
1184 | + |
1185 | + vdest = *(__m64 *)dst; |
1186 | + |
1187 | + vm0 = (__m64)m0; |
1188 | vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0); |
1189 | - vm1 = (Vector4x16)m1; |
1190 | + vm1 = (__m64)m1; |
1191 | vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1); |
1192 | - vm2 = (Vector4x16)m2; |
1193 | + vm2 = (__m64)m2; |
1194 | vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2); |
1195 | - vm3 = (Vector4x16)m3; |
1196 | + vm3 = (__m64)m3; |
1197 | vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3); |
1198 | |
1199 | - *(Vector4x16 *)dst = vdest; |
1200 | + *(__m64 *)dst = vdest; |
1201 | } |
1202 | |
1203 | w -= 4; |
1204 | mask += 4; |
1205 | dst += 4; |
1206 | } |
1207 | - |
1208 | + |
1209 | CHECKPOINT(); |
1210 | |
1211 | while (w) |
1212 | @@ -850,9 +962,9 @@ |
1213 | if (m) |
1214 | { |
1215 | ullong d = *dst; |
1216 | - Vector4x16 vd = (Vector4x16)d; |
1217 | - Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), expand565(vd, 0)); |
1218 | - *dst = (ullong)pack565(vdest, (Vector4x16)c.mmx_zero, 0); |
1219 | + __m64 vd = (__m64)d; |
1220 | + __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0)); |
1221 | + *dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0); |
1222 | } |
1223 | |
1224 | w--; |
1225 | @@ -861,7 +973,7 @@ |
1226 | } |
1227 | } |
1228 | |
1229 | - emms(); |
1230 | + _mm_empty(); |
1231 | } |
1232 | |
1233 | void |
1234 | @@ -887,9 +999,9 @@ |
1235 | |
1236 | fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1); |
1237 | fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); |
1238 | - |
1239 | + |
1240 | assert (pSrc->pDrawable == pMask->pDrawable); |
1241 | - |
1242 | + |
1243 | while (height--) |
1244 | { |
1245 | dst = dstLine; |
1246 | @@ -897,14 +1009,14 @@ |
1247 | src = srcLine; |
1248 | srcLine += srcStride; |
1249 | w = width; |
1250 | - |
1251 | + |
1252 | CHECKPOINT(); |
1253 | |
1254 | while (w && (unsigned long)dst & 7) |
1255 | { |
1256 | - Vector4x16 vsrc = load8888 (*src); |
1257 | + __m64 vsrc = load8888 (*src); |
1258 | ullong d = *dst; |
1259 | - Vector4x16 vdest = expand565 ((Vector4x16)d, 0); |
1260 | + __m64 vdest = expand565 ((__m64)d, 0); |
1261 | |
1262 | vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0); |
1263 | |
1264 | @@ -914,19 +1026,19 @@ |
1265 | dst++; |
1266 | src++; |
1267 | } |
1268 | - |
1269 | + |
1270 | CHECKPOINT(); |
1271 | |
1272 | while (w >= 4) |
1273 | { |
1274 | CARD32 s0, s1, s2, s3; |
1275 | unsigned char a0, a1, a2, a3; |
1276 | - |
1277 | + |
1278 | s0 = *src; |
1279 | s1 = *(src + 1); |
1280 | s2 = *(src + 2); |
1281 | s3 = *(src + 3); |
1282 | - |
1283 | + |
1284 | a0 = (s0 >> 24); |
1285 | a1 = (s1 >> 24); |
1286 | a2 = (s2 >> 24); |
1287 | @@ -934,38 +1046,38 @@ |
1288 | |
1289 | if ((a0 & a1 & a2 & a3) == 0xFF) |
1290 | { |
1291 | - Vector4x16 vdest; |
1292 | - vdest = pack565(invert_colors(load8888(s0)), (Vector4x16)c.mmx_zero, 0); |
1293 | + __m64 vdest; |
1294 | + vdest = pack565(invert_colors(load8888(s0)), _mm_setzero_si64(), 0); |
1295 | vdest = pack565(invert_colors(load8888(s1)), vdest, 1); |
1296 | vdest = pack565(invert_colors(load8888(s2)), vdest, 2); |
1297 | vdest = pack565(invert_colors(load8888(s3)), vdest, 3); |
1298 | - |
1299 | - *(Vector4x16 *)dst = vdest; |
1300 | + |
1301 | + *(__m64 *)dst = vdest; |
1302 | } |
1303 | else if (a0 | a1 | a2 | a3) |
1304 | { |
1305 | - Vector4x16 vdest = *(Vector4x16 *)dst; |
1306 | - |
1307 | + __m64 vdest = *(__m64 *)dst; |
1308 | + |
1309 | vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0); |
1310 | vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1); |
1311 | vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2); |
1312 | vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3); |
1313 | - |
1314 | - *(Vector4x16 *)dst = vdest; |
1315 | + |
1316 | + *(__m64 *)dst = vdest; |
1317 | } |
1318 | |
1319 | w -= 4; |
1320 | dst += 4; |
1321 | src += 4; |
1322 | } |
1323 | - |
1324 | + |
1325 | CHECKPOINT(); |
1326 | |
1327 | while (w) |
1328 | { |
1329 | - Vector4x16 vsrc = load8888 (*src); |
1330 | + __m64 vsrc = load8888 (*src); |
1331 | ullong d = *dst; |
1332 | - Vector4x16 vdest = expand565 ((Vector4x16)d, 0); |
1333 | + __m64 vdest = expand565 ((__m64)d, 0); |
1334 | |
1335 | vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0); |
1336 | |
1337 | @@ -976,11 +1088,11 @@ |
1338 | src++; |
1339 | } |
1340 | } |
1341 | - |
1342 | - emms(); |
1343 | + |
1344 | + _mm_empty(); |
1345 | } |
1346 | |
1347 | -/* "888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */ |
1348 | +/* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */ |
1349 | |
1350 | void |
1351 | fbCompositeSrc_8888RevNPx8888mmx (CARD8 op, |
1352 | @@ -1005,9 +1117,9 @@ |
1353 | |
1354 | fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); |
1355 | fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); |
1356 | - |
1357 | + |
1358 | assert (pSrc->pDrawable == pMask->pDrawable); |
1359 | - |
1360 | + |
1361 | while (height--) |
1362 | { |
1363 | dst = dstLine; |
1364 | @@ -1015,28 +1127,28 @@ |
1365 | src = srcLine; |
1366 | srcLine += srcStride; |
1367 | w = width; |
1368 | - |
1369 | + |
1370 | while (w && (unsigned long)dst & 7) |
1371 | { |
1372 | - Vector4x16 s = load8888 (*src); |
1373 | - Vector4x16 d = load8888 (*dst); |
1374 | + __m64 s = load8888 (*src); |
1375 | + __m64 d = load8888 (*dst); |
1376 | |
1377 | - *dst = (ullong)pack8888 (over_rev_non_pre (s, d), (Vector4x16)c.mmx_zero); |
1378 | + *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64()); |
1379 | |
1380 | w--; |
1381 | dst++; |
1382 | src++; |
1383 | } |
1384 | - |
1385 | + |
1386 | while (w >= 2) |
1387 | { |
1388 | ullong s0, s1; |
1389 | unsigned char a0, a1; |
1390 | - Vector4x16 d0, d1; |
1391 | - |
1392 | + __m64 d0, d1; |
1393 | + |
1394 | s0 = *src; |
1395 | s1 = *(src + 1); |
1396 | - |
1397 | + |
1398 | a0 = (s0 >> 24); |
1399 | a1 = (s1 >> 24); |
1400 | |
1401 | @@ -1044,17 +1156,17 @@ |
1402 | { |
1403 | d0 = invert_colors(load8888(s0)); |
1404 | d1 = invert_colors(load8888(s1)); |
1405 | - |
1406 | - *(Vector8x8 *)dst = pack8888 (d0, d1); |
1407 | + |
1408 | + *(__m64 *)dst = pack8888 (d0, d1); |
1409 | } |
1410 | else if (a0 | a1) |
1411 | { |
1412 | - Vector4x16 vdest = *(Vector4x16 *)dst; |
1413 | - |
1414 | + __m64 vdest = *(__m64 *)dst; |
1415 | + |
1416 | d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0)); |
1417 | d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1)); |
1418 | - |
1419 | - *(Vector8x8 *)dst = pack8888 (d0, d1); |
1420 | + |
1421 | + *(__m64 *)dst = pack8888 (d0, d1); |
1422 | } |
1423 | |
1424 | w -= 2; |
1425 | @@ -1064,18 +1176,18 @@ |
1426 | |
1427 | while (w) |
1428 | { |
1429 | - Vector4x16 s = load8888 (*src); |
1430 | - Vector4x16 d = load8888 (*dst); |
1431 | + __m64 s = load8888 (*src); |
1432 | + __m64 d = load8888 (*dst); |
1433 | |
1434 | - *dst = (ullong)pack8888 (over_rev_non_pre (s, d), (Vector4x16)c.mmx_zero); |
1435 | + *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64()); |
1436 | |
1437 | w--; |
1438 | dst++; |
1439 | src++; |
1440 | } |
1441 | } |
1442 | - |
1443 | - emms(); |
1444 | + |
1445 | + _mm_empty(); |
1446 | } |
1447 | |
1448 | void |
1449 | @@ -1096,7 +1208,7 @@ |
1450 | CARD16 *dstLine; |
1451 | CARD32 *maskLine; |
1452 | FbStride dstStride, maskStride; |
1453 | - Vector4x16 vsrc, vsrca; |
1454 | + __m64 vsrc, vsrca; |
1455 | |
1456 | CHECKPOINT(); |
1457 | |
1458 | @@ -1125,7 +1237,7 @@ |
1459 | if (m) |
1460 | { |
1461 | ullong d = *q; |
1462 | - Vector4x16 vdest = expand565 ((Vector4x16)d, 0); |
1463 | + __m64 vdest = expand565 ((__m64)d, 0); |
1464 | vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0); |
1465 | *q = (ullong)vdest; |
1466 | } |
1467 | @@ -1146,14 +1258,14 @@ |
1468 | |
1469 | if ((m0 | m1 | m2 | m3)) |
1470 | { |
1471 | - Vector4x16 vdest = *(Vector4x16 *)q; |
1472 | + __m64 vdest = *(__m64 *)q; |
1473 | |
1474 | vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0); |
1475 | vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1); |
1476 | vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2); |
1477 | vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3); |
1478 | |
1479 | - *(Vector4x16 *)q = vdest; |
1480 | + *(__m64 *)q = vdest; |
1481 | } |
1482 | twidth -= 4; |
1483 | p += 4; |
1484 | @@ -1168,7 +1280,7 @@ |
1485 | if (m) |
1486 | { |
1487 | ullong d = *q; |
1488 | - Vector4x16 vdest = expand565((Vector4x16)d, 0); |
1489 | + __m64 vdest = expand565((__m64)d, 0); |
1490 | vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0); |
1491 | *q = (ullong)vdest; |
1492 | } |
1493 | @@ -1182,7 +1294,7 @@ |
1494 | dstLine += dstStride; |
1495 | } |
1496 | |
1497 | - emms (); |
1498 | + _mm_empty (); |
1499 | } |
1500 | |
1501 | void |
1502 | @@ -1210,7 +1322,7 @@ |
1503 | |
1504 | fbComposeGetStart (pSrc, xSrc, ySrc, CARD8, srcStride, srcLine, 1); |
1505 | fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 1); |
1506 | - |
1507 | + |
1508 | while (height--) |
1509 | { |
1510 | dst = dstLine; |
1511 | @@ -1218,7 +1330,7 @@ |
1512 | src = srcLine; |
1513 | srcLine += srcStride; |
1514 | w = width; |
1515 | - |
1516 | + |
1517 | while (w && (unsigned long)dst & 7) |
1518 | { |
1519 | s = *src; |
1520 | @@ -1234,13 +1346,7 @@ |
1521 | |
1522 | while (w >= 8) |
1523 | { |
1524 | - __asm__ __volatile__ ( |
1525 | - "movq (%0), %%mm2\n\t" |
1526 | - "movq (%1), %%mm3\n\t" |
1527 | - "paddusb %%mm2, %%mm3\n\t" |
1528 | - "movq %%mm3, (%1)\n\t" |
1529 | - : /* no output */ : "r" (src), "r" (dst)); |
1530 | - |
1531 | + *(__m64*)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst); |
1532 | dst += 8; |
1533 | src += 8; |
1534 | w -= 8; |
1535 | @@ -1259,8 +1365,8 @@ |
1536 | w--; |
1537 | } |
1538 | } |
1539 | - |
1540 | - emms(); |
1541 | + |
1542 | + _mm_empty(); |
1543 | } |
1544 | |
1545 | void |
1546 | @@ -1297,13 +1403,8 @@ |
1547 | |
1548 | while (w && (unsigned long)dst & 7) |
1549 | { |
1550 | - __asm__ __volatile__ ( |
1551 | - "movd %0, %%mm2\n\t" |
1552 | - "movd %1, %%mm3\n\t" |
1553 | - "paddusb %%mm2, %%mm3\n\t" |
1554 | - "movd %%mm3, %1\n\t" |
1555 | - : /* no output */ : "m" (*src), "m" (*dst)); |
1556 | - |
1557 | + *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src), |
1558 | + _mm_cvtsi32_si64(*dst))); |
1559 | dst++; |
1560 | src++; |
1561 | w--; |
1562 | @@ -1311,13 +1412,7 @@ |
1563 | |
1564 | while (w >= 2) |
1565 | { |
1566 | - __asm__ __volatile__ ( |
1567 | - "movq (%0), %%mm2\n\t" |
1568 | - "movq (%1), %%mm3\n\t" |
1569 | - "paddusb %%mm2, %%mm3\n\t" |
1570 | - "movq %%mm3, (%1)\n\t" |
1571 | - : /* no output */ : "r" (src), "r" (dst)); |
1572 | - |
1573 | + *(ullong*)dst = (ullong) _mm_adds_pu8(*(__m64*)src, *(__m64*)dst); |
1574 | dst += 2; |
1575 | src += 2; |
1576 | w -= 2; |
1577 | @@ -1325,16 +1420,13 @@ |
1578 | |
1579 | if (w) |
1580 | { |
1581 | - __asm__ __volatile__ ( |
1582 | - "movd %0, %%mm2\n\t" |
1583 | - "movd %1, %%mm3\n\t" |
1584 | - "paddusb %%mm2, %%mm3\n\t" |
1585 | - "movd %%mm3, %1\n\t" |
1586 | - : /* no output */ : "m" (*src), "m" (*dst)); |
1587 | + *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src), |
1588 | + _mm_cvtsi32_si64(*dst))); |
1589 | + |
1590 | } |
1591 | } |
1592 | - |
1593 | - emms(); |
1594 | + |
1595 | + _mm_empty(); |
1596 | } |
1597 | |
1598 | #define GetStart(drw,x,y,type,stride,line,bpp) {\ |
1599 | @@ -1358,19 +1450,19 @@ |
1600 | FbStride stride; |
1601 | int bpp; |
1602 | ullong fill; |
1603 | - Vector8x8 vfill; |
1604 | + __m64 vfill; |
1605 | CARD32 byte_width; |
1606 | CARD8 *byte_line; |
1607 | FbBits *bits; |
1608 | int xoff, yoff; |
1609 | |
1610 | CHECKPOINT(); |
1611 | - |
1612 | + |
1613 | fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff); |
1614 | - |
1615 | + |
1616 | if (bpp == 16 && (xor >> 16 != (xor & 0xffff))) |
1617 | return FALSE; |
1618 | - |
1619 | + |
1620 | if (bpp != 16 && bpp != 32) |
1621 | return FALSE; |
1622 | |
1623 | @@ -1388,9 +1480,9 @@ |
1624 | byte_width = 4 * width; |
1625 | stride *= 4; |
1626 | } |
1627 | - |
1628 | + |
1629 | fill = ((ullong)xor << 32) | xor; |
1630 | - vfill = (Vector8x8)fill; |
1631 | + vfill = (__m64)fill; |
1632 | |
1633 | while (height--) |
1634 | { |
1635 | @@ -1398,7 +1490,7 @@ |
1636 | CARD8 *d = byte_line; |
1637 | byte_line += stride; |
1638 | w = byte_width; |
1639 | - |
1640 | + |
1641 | while (w >= 2 && ((unsigned long)d & 3)) |
1642 | { |
1643 | *(CARD16 *)d = xor; |
1644 | @@ -1406,35 +1498,32 @@ |
1645 | d += 2; |
1646 | } |
1647 | |
1648 | - while (w >= 4 && ((unsigned int)d & 7)) |
1649 | + while (w >= 4 && ((unsigned long)d & 7)) |
1650 | { |
1651 | *(CARD32 *)d = xor; |
1652 | - |
1653 | + |
1654 | w -= 4; |
1655 | d += 4; |
1656 | } |
1657 | |
1658 | while (w >= 64) |
1659 | { |
1660 | - __asm__ __volatile ( |
1661 | - "movq %0, (%1)\n\t" |
1662 | - "movq %0, 8(%1)\n\t" |
1663 | - "movq %0, 16(%1)\n\t" |
1664 | - "movq %0, 24(%1)\n\t" |
1665 | - "movq %0, 32(%1)\n\t" |
1666 | - "movq %0, 40(%1)\n\t" |
1667 | - "movq %0, 48(%1)\n\t" |
1668 | - "movq %0, 56(%1)\n\t" |
1669 | - : /* no output */ |
1670 | - : "y" (vfill), "r" (d) |
1671 | - : "memory"); |
1672 | + *(__m64*) (d + 0) = vfill; |
1673 | + *(__m64*) (d + 8) = vfill; |
1674 | + *(__m64*) (d + 16) = vfill; |
1675 | + *(__m64*) (d + 24) = vfill; |
1676 | + *(__m64*) (d + 32) = vfill; |
1677 | + *(__m64*) (d + 40) = vfill; |
1678 | + *(__m64*) (d + 48) = vfill; |
1679 | + *(__m64*) (d + 56) = vfill; |
1680 | + |
1681 | w -= 64; |
1682 | d += 64; |
1683 | } |
1684 | while (w >= 4) |
1685 | { |
1686 | *(CARD32 *)d = xor; |
1687 | - |
1688 | + |
1689 | w -= 4; |
1690 | d += 4; |
1691 | } |
1692 | @@ -1446,16 +1535,160 @@ |
1693 | } |
1694 | } |
1695 | |
1696 | - emms(); |
1697 | + _mm_empty(); |
1698 | + return TRUE; |
1699 | +} |
1700 | + |
1701 | +Bool |
1702 | +fbCopyAreammx (DrawablePtr pSrc, |
1703 | + DrawablePtr pDst, |
1704 | + int src_x, |
1705 | + int src_y, |
1706 | + int dst_x, |
1707 | + int dst_y, |
1708 | + int width, |
1709 | + int height) |
1710 | +{ |
1711 | + FbBits * src_bits; |
1712 | + FbStride src_stride; |
1713 | + int src_bpp; |
1714 | + int src_xoff; |
1715 | + int src_yoff; |
1716 | + |
1717 | + FbBits * dst_bits; |
1718 | + FbStride dst_stride; |
1719 | + int dst_bpp; |
1720 | + int dst_xoff; |
1721 | + int dst_yoff; |
1722 | + |
1723 | + CARD8 * src_bytes; |
1724 | + CARD8 * dst_bytes; |
1725 | + int byte_width; |
1726 | + |
1727 | + fbGetDrawable(pSrc, src_bits, src_stride, src_bpp, src_xoff, src_yoff); |
1728 | + fbGetDrawable(pDst, dst_bits, dst_stride, dst_bpp, dst_xoff, dst_yoff); |
1729 | + |
1730 | + if (src_bpp != 16 && src_bpp != 32) |
1731 | + return FALSE; |
1732 | + |
1733 | + if (dst_bpp != 16 && dst_bpp != 32) |
1734 | + return FALSE; |
1735 | + |
1736 | + if (src_bpp != dst_bpp) |
1737 | + { |
1738 | + return FALSE; |
1739 | + } |
1740 | + |
1741 | + if (src_bpp == 16) |
1742 | + { |
1743 | + src_stride = src_stride * sizeof (FbBits) / 2; |
1744 | + dst_stride = dst_stride * sizeof (FbBits) / 2; |
1745 | + src_bytes = (CARD8 *)(((CARD16 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff)); |
1746 | + dst_bytes = (CARD8 *)(((CARD16 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff)); |
1747 | + byte_width = 2 * width; |
1748 | + src_stride *= 2; |
1749 | + dst_stride *= 2; |
1750 | + } |
1751 | + else |
1752 | + { |
1753 | + src_stride = src_stride * sizeof (FbBits) / 4; |
1754 | + dst_stride = dst_stride * sizeof (FbBits) / 4; |
1755 | + src_bytes = (CARD8 *)(((CARD32 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff)); |
1756 | + dst_bytes = (CARD8 *)(((CARD32 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff)); |
1757 | + byte_width = 4 * width; |
1758 | + src_stride *= 4; |
1759 | + dst_stride *= 4; |
1760 | + } |
1761 | + |
1762 | + while (height--) |
1763 | + { |
1764 | + int w; |
1765 | + CARD8 *s = src_bytes; |
1766 | + CARD8 *d = dst_bytes; |
1767 | + src_bytes += src_stride; |
1768 | + dst_bytes += dst_stride; |
1769 | + w = byte_width; |
1770 | + |
1771 | + while (w >= 2 && ((unsigned long)d & 3)) |
1772 | + { |
1773 | + *(CARD16 *)d = *(CARD16 *)s; |
1774 | + w -= 2; |
1775 | + s += 2; |
1776 | + d += 2; |
1777 | + } |
1778 | + |
1779 | + while (w >= 4 && ((unsigned int)d & 7)) |
1780 | + { |
1781 | + *(CARD32 *)d = *(CARD32 *)s; |
1782 | + |
1783 | + w -= 4; |
1784 | + s += 4; |
1785 | + d += 4; |
1786 | + } |
1787 | + |
1788 | + while (w >= 64) |
1789 | + { |
1790 | + *(__m64 *)(d + 0) = *(__m64 *)(s + 0); |
1791 | + *(__m64 *)(d + 8) = *(__m64 *)(s + 8); |
1792 | + *(__m64 *)(d + 16) = *(__m64 *)(s + 16); |
1793 | + *(__m64 *)(d + 24) = *(__m64 *)(s + 24); |
1794 | + *(__m64 *)(d + 32) = *(__m64 *)(s + 32); |
1795 | + *(__m64 *)(d + 40) = *(__m64 *)(s + 40); |
1796 | + *(__m64 *)(d + 48) = *(__m64 *)(s + 48); |
1797 | + *(__m64 *)(d + 56) = *(__m64 *)(s + 56); |
1798 | + w -= 64; |
1799 | + s += 64; |
1800 | + d += 64; |
1801 | + } |
1802 | + while (w >= 4) |
1803 | + { |
1804 | + *(CARD32 *)d = *(CARD32 *)s; |
1805 | + |
1806 | + w -= 4; |
1807 | + s += 4; |
1808 | + d += 4; |
1809 | + } |
1810 | + if (w >= 2) |
1811 | + { |
1812 | + *(CARD16 *)d = *(CARD16 *)s; |
1813 | + w -= 2; |
1814 | + s += 2; |
1815 | + d += 2; |
1816 | + } |
1817 | + } |
1818 | + |
1819 | + _mm_empty(); |
1820 | return TRUE; |
1821 | } |
1822 | |
1823 | +void |
1824 | +fbCompositeCopyAreammx (CARD8 op, |
1825 | + PicturePtr pSrc, |
1826 | + PicturePtr pMask, |
1827 | + PicturePtr pDst, |
1828 | + INT16 xSrc, |
1829 | + INT16 ySrc, |
1830 | + INT16 xMask, |
1831 | + INT16 yMask, |
1832 | + INT16 xDst, |
1833 | + INT16 yDst, |
1834 | + CARD16 width, |
1835 | + CARD16 height) |
1836 | +{ |
1837 | + fbCopyAreammx (pSrc->pDrawable, |
1838 | + pDst->pDrawable, |
1839 | + xSrc, ySrc, |
1840 | + xDst, yDst, |
1841 | + width, height); |
1842 | +} |
1843 | + |
1844 | +#ifndef __amd64__ |
1845 | Bool |
1846 | fbHaveMMX (void) |
1847 | { |
1848 | static Bool initialized = FALSE; |
1849 | static Bool mmx_present; |
1850 | - |
1851 | + |
1852 | if (!initialized) |
1853 | { |
1854 | int tmp; /* static variables are accessed through %ebx, |
1855 | @@ -1466,7 +1699,7 @@ |
1856 | |
1857 | __asm__ __volatile__ ( |
1858 | /* Check if bit 21 in flags word is writeable */ |
1859 | - |
1860 | + |
1861 | "pusha \n\t" |
1862 | "pushfl \n\t" |
1863 | "popl %%eax \n\t" |
1864 | @@ -1502,13 +1735,14 @@ |
1865 | : /* no input */); |
1866 | |
1867 | initialized = TRUE; |
1868 | - |
1869 | + |
1870 | mmx_present = tmp; |
1871 | } |
1872 | |
1873 | return mmx_present; |
1874 | } |
1875 | +#endif /* __amd64__ */ |
1876 | |
1877 | |
1878 | #endif /* RENDER */ |
1879 | -#endif /* USE_GCC34_MMX */ |
1880 | +#endif /* USE_MMX */ |
1881 | diff -ur xc-orig/programs/Xserver/fb/fbmmx.h xc/programs/Xserver/fb/fbmmx.h |
1882 | --- xc-orig/programs/Xserver/fb/fbmmx.h 2005-02-11 04:00:50.006092570 -0500 |
1883 | +++ xc/programs/Xserver/fb/fbmmx.h 2005-02-11 04:01:32.072346126 -0500 |
1884 | @@ -1,5 +1,5 @@ |
1885 | /* |
1886 | - * Copyright © 2004 Red Hat, Inc. |
1887 | + * Copyright © 2004 Red Hat, Inc. |
1888 | * |
1889 | * Permission to use, copy, modify, distribute, and sell this software and its |
1890 | * documentation for any purpose is hereby granted without fee, provided that |
1891 | @@ -18,17 +18,23 @@ |
1892 | * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
1893 | * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
1894 | * |
1895 | - * Author: Søren Sandmann (sandmann@redhat.com) |
1896 | + * Author: Søren Sandmann (sandmann@redhat.com) |
1897 | * |
1898 | * Based on work by Owen Taylor |
1899 | */ |
1900 | -#ifdef USE_GCC34_MMX |
1901 | +#ifdef USE_MMX |
1902 | + |
1903 | +#ifndef __amd64__ |
1904 | Bool fbHaveMMX(void); |
1905 | #else |
1906 | -#define fbHaveMMX FALSE |
1907 | +#define fbHaveMMX() TRUE |
1908 | +#endif |
1909 | + |
1910 | +#else |
1911 | +#define fbHaveMMX() FALSE |
1912 | #endif |
1913 | |
1914 | -#ifdef USE_GCC34_MMX |
1915 | +#ifdef USE_MMX |
1916 | |
1917 | void fbCompositeSolidMask_nx8888x0565Cmmx (CARD8 op, |
1918 | PicturePtr pSrc, |
1919 | @@ -150,6 +156,38 @@ |
1920 | INT16 yDst, |
1921 | CARD16 width, |
1922 | CARD16 height); |
1923 | +void fbCompositeSrc_8888x8x8888mmx (CARD8 op, |
1924 | + PicturePtr pSrc, |
1925 | + PicturePtr pMask, |
1926 | + PicturePtr pDst, |
1927 | + INT16 xSrc, |
1928 | + INT16 ySrc, |
1929 | + INT16 xMask, |
1930 | + INT16 yMask, |
1931 | + INT16 xDst, |
1932 | + INT16 yDst, |
1933 | + CARD16 width, |
1934 | + CARD16 height); |
1935 | +Bool fbCopyAreammx (DrawablePtr pSrc, |
1936 | + DrawablePtr pDst, |
1937 | + int src_x, |
1938 | + int src_y, |
1939 | + int dst_x, |
1940 | + int dst_y, |
1941 | + int width, |
1942 | + int height); |
1943 | +void fbCompositeCopyAreammx (CARD8 op, |
1944 | + PicturePtr pSrc, |
1945 | + PicturePtr pMask, |
1946 | + PicturePtr pDst, |
1947 | + INT16 xSrc, |
1948 | + INT16 ySrc, |
1949 | + INT16 xMask, |
1950 | + INT16 yMask, |
1951 | + INT16 xDst, |
1952 | + INT16 yDst, |
1953 | + CARD16 width, |
1954 | + CARD16 height); |
1955 | Bool fbSolidFillmmx (DrawablePtr pDraw, |
1956 | int x, |
1957 | int y, |
1958 | @@ -157,4 +195,4 @@ |
1959 | int height, |
1960 | FbBits xor); |
1961 | |
1962 | -#endif /* USE_GCC34_MMX */ |
1963 | +#endif /* USE_MMX */ |
1964 | |
1965 | diff -ur xc-orig/programs/Xserver/fb/fbpict.c xc/programs/Xserver/fb/fbpict.c |
1966 | --- xc-orig/programs/Xserver/fb/fbpict.c 2005-02-11 04:00:50.007092600 -0500 |
1967 | +++ xc/programs/Xserver/fb/fbpict.c 2005-02-11 04:01:32.075346216 -0500 |
1968 | @@ -1,7 +1,7 @@ |
1969 | /* |
1970 | * $XFree86: xc/programs/Xserver/fb/fbpict.c,v 1.15 2002/09/26 02:56:48 keithp Exp $ |
1971 | * |
1972 | - * Copyright © 2000 SuSE, Inc. |
1973 | + * Copyright © 2000 SuSE, Inc. |
1974 | * |
1975 | * Permission to use, copy, modify, distribute, and sell this software and its |
1976 | * documentation for any purpose is hereby granted without fee, provided that |
1977 | @@ -863,6 +863,15 @@ |
1978 | if (!pSrc->transform && !(pMask && pMask->transform)) |
1979 | if (!maskAlphaMap && !srcAlphaMap && !dstAlphaMap) |
1980 | switch (op) { |
1981 | + case PictOpSrc: |
1982 | +#ifdef USE_MMX |
1983 | + if (!pMask && pSrc->format == pDst->format && |
1984 | + pSrc->pDrawable != pDst->pDrawable) |
1985 | + { |
1986 | + func = fbCompositeCopyAreammx; |
1987 | + } |
1988 | +#endif |
1989 | + break; |
1990 | case PictOpOver: |
1991 | if (pMask) |
1992 | { |
1993 | @@ -877,7 +886,7 @@ |
1994 | switch (pDst->format) { |
1995 | case PICT_r5g6b5: |
1996 | case PICT_b5g6r5: |
1997 | -#ifdef USE_GCC34_MMX |
1998 | +#ifdef USE_MMX |
1999 | if (fbHaveMMX()) |
2000 | func = fbCompositeSolidMask_nx8x0565mmx; |
2001 | else |
2002 | @@ -892,7 +901,7 @@ |
2003 | case PICT_x8r8g8b8: |
2004 | case PICT_a8b8g8r8: |
2005 | case PICT_x8b8g8r8: |
2006 | -#ifdef USE_GCC34_MMX |
2007 | +#ifdef USE_MMX |
2008 | if (fbHaveMMX()) |
2009 | func = fbCompositeSolidMask_nx8x8888mmx; |
2010 | else |
2011 | @@ -906,7 +915,7 @@ |
2012 | switch (pDst->format) { |
2013 | case PICT_a8r8g8b8: |
2014 | case PICT_x8r8g8b8: |
2015 | -#ifdef USE_GCC34_MMX |
2016 | +#ifdef USE_MMX |
2017 | if (fbHaveMMX()) |
2018 | func = fbCompositeSolidMask_nx8888x8888Cmmx; |
2019 | else |
2020 | @@ -914,7 +923,7 @@ |
2021 | func = fbCompositeSolidMask_nx8888x8888C; |
2022 | break; |
2023 | case PICT_r5g6b5: |
2024 | -#ifdef USE_GCC34_MMX |
2025 | +#ifdef USE_MMX |
2026 | if (fbHaveMMX()) |
2027 | func = fbCompositeSolidMask_nx8888x0565Cmmx; |
2028 | else |
2029 | @@ -929,7 +938,7 @@ |
2030 | switch (pDst->format) { |
2031 | case PICT_a8b8g8r8: |
2032 | case PICT_x8b8g8r8: |
2033 | -#ifdef USE_GCC34_MMX |
2034 | +#ifdef USE_MMX |
2035 | if (fbHaveMMX()) |
2036 | func = fbCompositeSolidMask_nx8888x8888Cmmx; |
2037 | else |
2038 | @@ -937,7 +946,7 @@ |
2039 | func = fbCompositeSolidMask_nx8888x8888C; |
2040 | break; |
2041 | case PICT_b5g6r5: |
2042 | -#ifdef USE_GCC34_MMX |
2043 | +#ifdef USE_MMX |
2044 | if (fbHaveMMX()) |
2045 | func = fbCompositeSolidMask_nx8888x0565Cmmx; |
2046 | else |
2047 | @@ -970,6 +979,7 @@ |
2048 | xSrc == xMask && ySrc == yMask && |
2049 | !pMask->componentAlpha) |
2050 | { |
2051 | + /* source == mask: non-premultiplied data */ |
2052 | switch (pSrc->format) { |
2053 | case PICT_x8b8g8r8: |
2054 | switch (pMask->format) { |
2055 | @@ -978,13 +988,13 @@ |
2056 | switch (pDst->format) { |
2057 | case PICT_a8r8g8b8: |
2058 | case PICT_x8r8g8b8: |
2059 | -#ifdef USE_GCC34_MMX |
2060 | +#ifdef USE_MMX |
2061 | if (fbHaveMMX()) |
2062 | func = fbCompositeSrc_8888RevNPx8888mmx; |
2063 | #endif |
2064 | break; |
2065 | case PICT_r5g6b5: |
2066 | -#ifdef USE_GCC34_MMX |
2067 | +#ifdef USE_MMX |
2068 | if (fbHaveMMX()) |
2069 | func = fbCompositeSrc_8888RevNPx0565mmx; |
2070 | #endif |
2071 | @@ -1000,13 +1010,13 @@ |
2072 | switch (pDst->format) { |
2073 | case PICT_a8b8g8r8: |
2074 | case PICT_x8b8g8r8: |
2075 | -#ifdef USE_GCC34_MMX |
2076 | +#ifdef USE_MMX |
2077 | if (fbHaveMMX()) |
2078 | func = fbCompositeSrc_8888RevNPx8888mmx; |
2079 | #endif |
2080 | break; |
2081 | case PICT_r5g6b5: |
2082 | -#ifdef USE_GCC34_MMX |
2083 | +#ifdef USE_MMX |
2084 | if (fbHaveMMX()) |
2085 | func = fbCompositeSrc_8888RevNPx0565mmx; |
2086 | #endif |
2087 | @@ -1018,9 +1028,27 @@ |
2088 | } |
2089 | break; |
2090 | } |
2091 | + else |
2092 | + { |
2093 | + /* non-repeating source, repeating mask => translucent window */ |
2094 | + if (maskRepeat && |
2095 | + pMask->pDrawable->width == 1 && |
2096 | + pMask->pDrawable->height == 1) |
2097 | + { |
2098 | + if (pSrc->format == PICT_x8r8g8b8 && |
2099 | + pDst->format == PICT_x8r8g8b8 && |
2100 | + pMask->format == PICT_a8) |
2101 | + { |
2102 | +#ifdef USE_MMX |
2103 | + if (fbHaveMMX()) |
2104 | + func = fbCompositeSrc_8888x8x8888mmx; |
2105 | +#endif |
2106 | + } |
2107 | + } |
2108 | + } |
2109 | } |
2110 | } |
2111 | - else |
2112 | + else /* no mask */ |
2113 | { |
2114 | if (srcRepeat && |
2115 | pSrc->pDrawable->width == 1 && |
2116 | @@ -1032,7 +1060,7 @@ |
2117 | switch (pDst->format) { |
2118 | case PICT_a8r8g8b8: |
2119 | case PICT_x8r8g8b8: |
2120 | -#ifdef USE_GCC34_MMX |
2121 | +#ifdef USE_MMX |
2122 | if (fbHaveMMX()) |
2123 | { |
2124 | srcRepeat = FALSE; |
2125 | @@ -1041,7 +1069,7 @@ |
2126 | #endif |
2127 | break; |
2128 | case PICT_r5g6b5: |
2129 | -#ifdef USE_GCC34_MMX |
2130 | +#ifdef USE_MMX |
2131 | if (fbHaveMMX()) |
2132 | { |
2133 | srcRepeat = FALSE; |
2134 | @@ -1070,6 +1098,27 @@ |
2135 | break; |
2136 | } |
2137 | break; |
2138 | + case PICT_x8r8g8b8: |
2139 | + switch (pDst->format) { |
2140 | + case PICT_a8r8g8b8: |
2141 | + case PICT_x8r8g8b8: |
2142 | +#ifdef USE_MMX |
2143 | + if (fbHaveMMX()) |
2144 | + func = fbCompositeCopyAreammx; |
2145 | +#endif |
2146 | + break; |
2147 | + } |
2148 | + case PICT_x8b8g8r8: |
2149 | + switch (pDst->format) { |
2150 | + case PICT_a8b8g8r8: |
2151 | + case PICT_x8b8g8r8: |
2152 | +#ifdef USE_MMX |
2153 | + if (fbHaveMMX()) |
2154 | + func = fbCompositeCopyAreammx; |
2155 | +#endif |
2156 | + break; |
2157 | + } |
2158 | + break; |
2159 | case PICT_a8b8g8r8: |
2160 | switch (pDst->format) { |
2161 | case PICT_a8b8g8r8: |
2162 | @@ -1109,7 +1158,7 @@ |
2163 | case PICT_a8r8g8b8: |
2164 | switch (pDst->format) { |
2165 | case PICT_a8r8g8b8: |
2166 | -#ifdef USE_GCC34_MMX |
2167 | +#ifdef USE_MMX |
2168 | if (fbHaveMMX()) |
2169 | func = fbCompositeSrcAdd_8888x8888mmx; |
2170 | else |
2171 | @@ -1121,7 +1170,7 @@ |
2172 | case PICT_a8b8g8r8: |
2173 | switch (pDst->format) { |
2174 | case PICT_a8b8g8r8: |
2175 | -#ifdef USE_GCC34_MMX |
2176 | +#ifdef USE_MMX |
2177 | if (fbHaveMMX()) |
2178 | func = fbCompositeSrcAdd_8888x8888mmx; |
2179 | else |
2180 | @@ -1133,7 +1182,7 @@ |
2181 | case PICT_a8: |
2182 | switch (pDst->format) { |
2183 | case PICT_a8: |
2184 | -#ifdef USE_GCC34_MMX |
2185 | +#ifdef USE_MMX |
2186 | if (fbHaveMMX()) |
2187 | func = fbCompositeSrcAdd_8000x8000mmx; |
2188 | else |