Annotation of /trunk/libfame/patches/libfame-0.9.1-pic.patch
Parent Directory | Revision Log
Revision 1115 -
(hide annotations)
(download)
Sun Aug 22 18:37:54 2010 UTC (14 years, 1 month ago) by niro
File size: 19202 byte(s)
-added
1 | niro | 1115 | diff -Nurp libfame-0.9.1/src/dct_mmx.h libfame-0.9.1-pic/src/dct_mmx.h |
2 | --- libfame-0.9.1/src/dct_mmx.h 2002-04-14 12:22:05.000000000 +0100 | ||
3 | +++ libfame-0.9.1-pic/src/dct_mmx.h 2005-04-24 00:48:52.000000000 +0100 | ||
4 | @@ -22,6 +22,9 @@ | ||
5 | |||
6 | #define precision | ||
7 | |||
8 | +extern FAME_ALIGNED short const _mmx_1[]; | ||
9 | +extern FAME_ALIGNED short const _mmx_cos[]; | ||
10 | + | ||
11 | static void inline dct_aan_pass(dct_t *cache) | ||
12 | { | ||
13 | // register unsigned short const *mmx_cos = _mmx_cos; | ||
14 | @@ -66,42 +69,42 @@ static void inline dct_aan_pass(dct_t *c | ||
15 | #ifdef precision | ||
16 | "psllw $0x01, %%mm5\n" /* precision(va0) += 1 bit */ | ||
17 | #endif | ||
18 | - "paddw " ASMSYM "_mmx_1, %%mm4\n" /* + 1 */ | ||
19 | + "paddw (%2), %%mm4\n" /* + 1 */ | ||
20 | // "pmulhw 16(%1), %%mm5\n" /* (v14+v16)*COS6 -> mm5 (va0) */ | ||
21 | - "pmulhw " ASMSYM "_mmx_cos+16, %%mm5\n" /* (v14+v16)*COS6 -> mm5 (va0) */ | ||
22 | + "pmulhw 16(%3), %%mm5\n" /* (v14+v16)*COS6 -> mm5 (va0) */ | ||
23 | "" /* STEP 4 */ | ||
24 | #ifdef precision | ||
25 | "psllw $0x02, %%mm6\n" /* precision(v22) += 1 bit */ | ||
26 | #else | ||
27 | "psllw $0x01, %%mm6\n" /* */ | ||
28 | #endif | ||
29 | - "paddw " ASMSYM "_mmx_1, %%mm4\n" /* + 1 */ | ||
30 | + "paddw (%2), %%mm4\n" /* + 1 */ | ||
31 | // "pmulhw 8(%1), %%mm6\n" /* 2*v22*COS4/2 -> mm6 (v32)*/ | ||
32 | - "pmulhw " ASMSYM "_mmx_cos+8, %%mm6\n" /* 2*v22*COS4/2 -> mm6 (v32)*/ | ||
33 | + "pmulhw 8(%3), %%mm6\n" /* 2*v22*COS4/2 -> mm6 (v32)*/ | ||
34 | #ifdef precision | ||
35 | "psllw $0x02, %%mm2\n" /* precision(v15) += 1 bit */ | ||
36 | #else | ||
37 | "psllw $0x01, %%mm2\n" /* */ | ||
38 | #endif | ||
39 | - "paddw " ASMSYM "_mmx_1, %%mm4\n" /* + 1 */ | ||
40 | + "paddw (%2), %%mm4\n" /* + 1 */ | ||
41 | // "pmulhw 8(%1), %%mm2\n" /* 2*v15*COS4/2 -> mm2 (v35) */ | ||
42 | - "pmulhw " ASMSYM "_mmx_cos+8, %%mm2\n" /* 2*v15*COS4/2 -> mm2 (v35) */ | ||
43 | + "pmulhw 8(%3), %%mm2\n" /* 2*v15*COS4/2 -> mm2 (v35) */ | ||
44 | #ifdef precision | ||
45 | "psllw $0x02, %%mm4\n" /* precision(v14) += 1 bit */ | ||
46 | #else | ||
47 | "psllw $0x01, %%mm4\n" /* */ | ||
48 | #endif | ||
49 | - "paddw " ASMSYM "_mmx_1, %%mm4\n" /* + 1 */ | ||
50 | + "paddw (%2), %%mm4\n" /* + 1 */ | ||
51 | // "pmulhw 0(%1), %%mm4\n" /* 2 * v14 * -COS2/2 -> mm4 */ | ||
52 | - "pmulhw " ASMSYM "_mmx_cos, %%mm4\n" /* 2 * v14 * -COS2/2 -> mm4 */ | ||
53 | + "pmulhw (%3), %%mm4\n" /* 2 * v14 * -COS2/2 -> mm4 */ | ||
54 | "psubsw %%mm5, %%mm4\n" /* v14*-COS2 - va0 -> mm4 (v34) */ | ||
55 | #ifdef precision | ||
56 | "psllw $0x01, %%mm1\n" /* precision(v16) += 1 bit */ | ||
57 | #endif | ||
58 | "psubsw %%mm1, %%mm5\n" /* va0 - v16 -> mm5 */ | ||
59 | - "paddw " ASMSYM "_mmx_1, %%mm4\n" /* + 1 */ | ||
60 | + "paddw (%2), %%mm4\n" /* + 1 */ | ||
61 | // "pmulhw 24(%1), %%mm1\n" /* v16 * (COS8 - 1) -> mm1 */ | ||
62 | - "pmulhw " ASMSYM "_mmx_cos+24, %%mm1\n" /* v16 * (COS8 - 1) -> mm1 */ | ||
63 | + "pmulhw 24(%3), %%mm1\n" /* v16 * (COS8 - 1) -> mm1 */ | ||
64 | "psubsw %%mm5, %%mm1\n" /* v16 * COS8 - va0 -> mm1 (v36)*/ | ||
65 | "" /* STEP 5 */ | ||
66 | "movq 0x70(%0), %%mm0\n" /* retrieve v07 -> mm0 */ | ||
67 | @@ -138,8 +141,8 @@ static void inline dct_aan_pass(dct_t *c | ||
68 | "movq %%mm0, 0x30(%0)\n" /* store line 3 */ | ||
69 | "movq %%mm4, 0x50(%0)\n" /* store line 5 */ | ||
70 | "movq %%mm2, 0x70(%0)\n" /* store line 7 */ | ||
71 | - : "=r"(cache)/*, "=r"(mmx_cos)*/ | ||
72 | - : "0"(cache)/*, "1"(mmx_cos)*/ | ||
73 | + : "=r"(cache) | ||
74 | + : "0"(cache), "r"(_mmx_1), "r"(_mmx_cos) | ||
75 | : "memory"); | ||
76 | } | ||
77 | |||
78 | diff -Nurp libfame-0.9.1/src/dequantize_mmx.h libfame-0.9.1-pic/src/dequantize_mmx.h | ||
79 | --- libfame-0.9.1/src/dequantize_mmx.h 2002-04-23 22:40:56.000000000 +0100 | ||
80 | +++ libfame-0.9.1-pic/src/dequantize_mmx.h 2005-04-24 00:44:26.000000000 +0100 | ||
81 | @@ -27,8 +27,8 @@ | ||
82 | "pmullw 0x" #x "8(%3), %%mm5\n" /* premultiply for iDCT */ \ | ||
83 | "psrlw $0x0b, %%mm4\n" /* keep 5 bits */ \ | ||
84 | "psrlw $0x0b, %%mm5\n" /* keep 5 bits */ \ | ||
85 | - "paddw " ASMSYM "_mmx_1, %%mm4\n" /* + 1 */ \ | ||
86 | - "paddw " ASMSYM "_mmx_1, %%mm5\n" /* + 1 */ \ | ||
87 | + "paddw (%8), %%mm4\n" /* + 1 */ \ | ||
88 | + "paddw (%8), %%mm5\n" /* + 1 */ \ | ||
89 | "psrlw $0x01, %%mm4\n" /* keep 4 bits rounded */ \ | ||
90 | "psrlw $0x01, %%mm5\n" /* keep 4 bits rounded */ \ | ||
91 | "psllw $0x04, %%mm0\n" /* multiply by 16 for iDCT */ \ | ||
92 | @@ -107,7 +107,7 @@ static void inline dequantize_intra_glob | ||
93 | DEQUANTIZE_GLOBAL_MISMATCH_CONTROL() | ||
94 | DEQUANTIZE_PRESCALE_STEP(7) | ||
95 | : "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix) | ||
96 | - : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix) | ||
97 | + : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix), "r"(_mmx_1) | ||
98 | : "memory"); | ||
99 | |||
100 | asm volatile("movd %%mm6, %0\n" /* export mismatch */ | ||
101 | @@ -160,8 +160,8 @@ static void inline dequantize_intra_loca | ||
102 | "pcmpeqw %%mm7, %%mm3\n" /* invert sign */ \ | ||
103 | "paddw %%mm2, %%mm0\n" /* sub 1 if >0 */ \ | ||
104 | "paddw %%mm3, %%mm1\n" /* sub 1 if >0 */ \ | ||
105 | - "por " ASMSYM "_mmx_1, %%mm0\n" /* or 1 */ \ | ||
106 | - "por " ASMSYM "_mmx_1, %%mm1\n" /* or 1 */ \ | ||
107 | + "por (%8), %%mm0\n" /* or 1 */ \ | ||
108 | + "por (%8), %%mm1\n" /* or 1 */ \ | ||
109 | "pand %%mm4, %%mm0\n" /* [0-3]=0 if [0-3] was zero */ \ | ||
110 | "pand %%mm5, %%mm1\n" /* [4-7]=0 if [4-7] was zero */ | ||
111 | |||
112 | @@ -184,7 +184,7 @@ static void inline dequantize_intra_loca | ||
113 | DEQUANTIZE_INTRA_LOCAL_STEP(7) | ||
114 | DEQUANTIZE_PRESCALE_STEP(7) | ||
115 | : "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix) | ||
116 | - : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix) | ||
117 | + : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix), "r"(_mmx_1) | ||
118 | : "memory"); | ||
119 | } | ||
120 | |||
121 | @@ -256,7 +256,7 @@ static void inline dequantize_inter_glob | ||
122 | /* resetting the accumulator when the block is coded intra */ | ||
123 | DEQUANTIZE_PRESCALE_STEP(7) | ||
124 | : "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix) | ||
125 | - : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix) | ||
126 | + : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix), "r"(_mmx_1) | ||
127 | : "memory"); | ||
128 | |||
129 | asm volatile("movd %%mm6, %0\n" /* export mismatch */ | ||
130 | @@ -324,8 +324,8 @@ static void inline dequantize_inter_loca | ||
131 | "pcmpeqw %%mm7, %%mm3\n" /* invert sign */ \ | ||
132 | "paddw %%mm2, %%mm0\n" /* sub 1 if >0 */ \ | ||
133 | "paddw %%mm3, %%mm1\n" /* sub 1 if >0 */ \ | ||
134 | - "por " ASMSYM "_mmx_1, %%mm0\n" /* or 1 */ \ | ||
135 | - "por " ASMSYM "_mmx_1, %%mm1\n" /* or 1 */ \ | ||
136 | + "por (%8), %%mm0\n" /* or 1 */ \ | ||
137 | + "por (%8), %%mm1\n" /* or 1 */ \ | ||
138 | "pand %%mm4, %%mm0\n" /* [0-3]=0 if [0-3] was zero */ \ | ||
139 | "pand %%mm5, %%mm1\n" /* [4-7]=0 if [4-7] was zero */ | ||
140 | |||
141 | @@ -348,6 +348,6 @@ static void inline dequantize_inter_loca | ||
142 | DEQUANTIZE_INTER_LOCAL_STEP(7) | ||
143 | DEQUANTIZE_PRESCALE_STEP(7) | ||
144 | : "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix) | ||
145 | - : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix) | ||
146 | + : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix), "r"(_mmx_1) | ||
147 | : "memory"); | ||
148 | } | ||
149 | diff -Nurp libfame-0.9.1/src/fame_syntax_mpeg1.c libfame-0.9.1-pic/src/fame_syntax_mpeg1.c | ||
150 | --- libfame-0.9.1/src/fame_syntax_mpeg1.c 2002-10-05 13:44:47.000000000 +0100 | ||
151 | +++ libfame-0.9.1-pic/src/fame_syntax_mpeg1.c 2005-04-24 00:19:09.000000000 +0100 | ||
152 | @@ -469,89 +469,6 @@ static void mpeg1_block_intra(fame_synta | ||
153 | fast_bitbuffer_write(data, shift, table[v+255].code, table[v+255].length); | ||
154 | |||
155 | /* encode AC coefficients */ | ||
156 | -#if defined(HAS_BSWAP) | ||
157 | - { | ||
158 | - unsigned long dummy1, dummy2; | ||
159 | - | ||
160 | - /* Note: | ||
161 | - movsx mpeg1_table_clip+4096(, %%eax ,2), %%eax | ||
162 | - has been replaced by | ||
163 | - movw mpeg1_table_clip+4096(, %%eax ,2), %%ax | ||
164 | - movsx %%ax, %%eax | ||
165 | - because the first instruction failed on a PIII!! (wrong sign extension) | ||
166 | - whereas it worked well on my P75 :) | ||
167 | - */ | ||
168 | - /* Ok, a bit of explanations for a couple of tricks: | ||
169 | - The DC value of block is already coded and stored in v so we can use it to store something. | ||
170 | - We add one index to the zigzag table so that after coding block[63] we go to index 0. There | ||
171 | - we need to escape the zero counting loop (1), what we ensure by putting a non-zero value in | ||
172 | - the DC coefficient. Then we can test for index == 0 to exit. | ||
173 | - Now this non-zero value is a bit special :) | ||
174 | - In order to have one more 'half' register, we store sp value (16 less significant bit of the | ||
175 | - 32 bit register esp) *plus one* in the DC coefficient. Since the stack is aligned at an | ||
176 | - address multiple of 4 bytes (at least), we are sure that sp != 0xffff and thus sp+1 will | ||
177 | - never be zero. We then retrieve sp at the end for it is needed by 'pop' instructions. | ||
178 | - */ | ||
179 | - /* TODO : echange the role of edx and esp */ | ||
180 | - __asm__ __volatile__ ("pushl %%ebx\n" /* save ebx */ | ||
181 | - "pushl %%ebp\n" /* save stack pointer */ | ||
182 | - "inc %%sp\n" /* make sure sp != 0 */ | ||
183 | - "movw %%sp, (%%edx)\n" /* store sp+1 in DC ;) */ | ||
184 | - "movl %%esi, %%ebp\n" /* ebp = vlc_table */ | ||
185 | - "xorl %%eax, %%eax\n" /* eax = 0 */ | ||
186 | - "movl $" ASMSYM "mpeg1_zigzag_table+1, %%esi\n" /*esi = zigzag*/ | ||
187 | - "lea 1(%%esi), %%ebx\n" /* ebx = zigzag_table+1*/ | ||
188 | - "neg %%ebx\n" /* ebx = -(esi+1) */ | ||
189 | - ".p2align 4,,7\n" /* align for jump */ | ||
190 | - "0: xorw %%sp, %%sp\n" /* sp = 0 */ | ||
191 | - "1: movb (%%esi), %%al\n" /* eax = index in block*/ | ||
192 | - "incl %%esi\n" /* (faster than lodsb) */ | ||
193 | - "addw (%%edx, %%eax, 2), %%sp\n" /* sp = unzig */ | ||
194 | - "jz 1b\n" /* coeff == 0 then loop*/ | ||
195 | - "orl %%eax, %%eax\n" /* index == 0 then quit*/ | ||
196 | - "jz 2f\n" /* (faster than jcxz) */ | ||
197 | - "movsx %%sp, %%eax\n" /* extend sign */ | ||
198 | - "movw " ASMSYM "mpeg1_table_clip_data+4096(, %%eax ,2), %%ax\n" /*clip*/ | ||
199 | - "movsx %%ax, %%eax\n" /* extend sign */ | ||
200 | - "addl %%esi, %%ebx\n" /* ebx = run */ | ||
201 | - "shll $7, %%eax\n" /* eax *= 128(indexing)*/ | ||
202 | - "lea (%%eax, %%ebx, 2), %%eax\n" /*eax = 2 * offset*/ | ||
203 | - "lea (%%ebp, %%eax, 4), %%ebx\n" /* ebx = &vlc */ | ||
204 | - "movl (%%ebx), %%eax\n" /* eax = code */ | ||
205 | - "addl 4(%%ebx), %%ecx\n" /* ecx = shift+=length */ | ||
206 | - "xorl %%ebx, %%ebx\n" /* ebx = 0 */ | ||
207 | - "shrd %%cl, %%eax, %%ebx\n" /* adjust code to fit */ | ||
208 | - "shr %%cl, %%eax\n" /* adjust code to fit */ | ||
209 | - "bswap %%eax\n" /* reverse byte order of code */ | ||
210 | - "bswap %%ebx\n" /* reverse byte order of code */ | ||
211 | - "or %%eax, (%%edi)\n" /* put first 32 bits */ | ||
212 | - "movl %%ecx, %%eax\n" /* eax = shift + length*/ | ||
213 | - "shrl $5, %%eax\n" /* get dword increment */ | ||
214 | - "andl $31, %%ecx\n" /* mask shift */ | ||
215 | - "lea (%%edi, %%eax, 4), %%edi\n"/* data+=(ecx>32)*/ | ||
216 | - "orl %%ebx, (%%edi)\n" /* put last 32 bits */ | ||
217 | - "xorl %%eax, %%eax\n" /* eax = 0 */ | ||
218 | - "lea 1(%%esi), %%ebx\n" /* ebx = esi + 1 (last)*/ | ||
219 | - "neg %%ebx\n" /* ebx = -(esi + 1) */ | ||
220 | - "jmp 0b\n" /* loop */ | ||
221 | - "2:\n" | ||
222 | - "movw (%%edx), %%sp\n" /* retrieve sp+1 */ | ||
223 | - "dec %%sp\n" /* restore esp */ | ||
224 | - "popl %%ebp\n" /* reload stack pointer*/ | ||
225 | - "popl %%ebx\n" /* reload ebx */ | ||
226 | - : "=c"(shift), | ||
227 | - "=a"(dummy1), | ||
228 | - "=d"(block), | ||
229 | - "=D"(data), | ||
230 | - "=S"(dummy2) | ||
231 | - : "d"(block), | ||
232 | - "c"(shift), | ||
233 | - "D"(data), | ||
234 | - "S"(syntax_mpeg1->vlc_table) | ||
235 | - : "memory"); | ||
236 | - block[0] = v; /* restore DC value */ | ||
237 | - } | ||
238 | -#else | ||
239 | { | ||
240 | short i; | ||
241 | unsigned long last; | ||
242 | @@ -573,7 +490,6 @@ static void mpeg1_block_intra(fame_synta | ||
243 | } | ||
244 | } | ||
245 | } | ||
246 | -#endif /* HAS_BSWAP */ | ||
247 | |||
248 | /* mark end of block */ | ||
249 | fast_bitbuffer_write(data, shift, 2, 2); | ||
250 | diff -Nurp libfame-0.9.1/src/half_mmx.h libfame-0.9.1-pic/src/half_mmx.h | ||
251 | --- libfame-0.9.1/src/half_mmx.h 2002-04-30 19:04:02.000000000 +0100 | ||
252 | +++ libfame-0.9.1-pic/src/half_mmx.h 2005-04-24 00:44:49.000000000 +0100 | ||
253 | @@ -68,8 +68,8 @@ static void inline mmx_interpolate(unsig | ||
254 | "paddw %%mm5, %%mm6\n" /* mm6 = ref00+ref10+ref11+1-r 4-7*/ | ||
255 | "psrlw $1, %%mm4\n" /* divide by 2 */ | ||
256 | "psrlw $1, %%mm5\n" /* divide by 2 */ | ||
257 | - "paddw " ASMSYM "_mmx_one, %%mm3\n" /* add 1 */ | ||
258 | - "paddw " ASMSYM "_mmx_one, %%mm6\n" /* add 1 */ | ||
259 | + "paddw (%8), %%mm3\n" /* add 1 */ | ||
260 | + "paddw (%8), %%mm6\n" /* add 1 */ | ||
261 | "packuswb %%mm5, %%mm4\n" /* pack to byte and saturate */ | ||
262 | "movq 1(%3), %%mm1\n" /* mm1 = [ref+1] */ | ||
263 | "movq %%mm1, %%mm2\n" /* mm2 = mm1 */ | ||
264 | @@ -87,7 +87,7 @@ static void inline mmx_interpolate(unsig | ||
265 | "movl 12(%0), %3\n" /* %3 = ref[3] */ | ||
266 | "movq %%mm3, (%3)\n" /* store in frame */ | ||
267 | : "=r"(ref), "=r"(pitch), "=r"(rc), "=r"(dummy) | ||
268 | - : "0"(ref), "1"(pitch), "2"(rc), "3"(dummy) | ||
269 | + : "0"(ref), "1"(pitch), "2"(rc), "3"(dummy), "r"(_mmx_one) | ||
270 | : "memory"); | ||
271 | } | ||
272 | |||
273 | diff -Nurp libfame-0.9.1/src/half_sse.h libfame-0.9.1-pic/src/half_sse.h | ||
274 | --- libfame-0.9.1/src/half_sse.h 2002-01-27 02:24:56.000000000 +0000 | ||
275 | +++ libfame-0.9.1-pic/src/half_sse.h 2005-04-24 00:57:08.000000000 +0100 | ||
276 | @@ -71,8 +71,8 @@ static void inline mmx_interpolate_signe | ||
277 | "paddw %%mm5, %%mm6\n" /* mm6 = ref00+ref10+ref11+1-r 4-7*/ | ||
278 | "psrlw $1, %%mm4\n" /* divide by 2 */ | ||
279 | "psrlw $1, %%mm5\n" /* divide by 2 */ | ||
280 | - "paddw " ASMSYM "_mmx_one, %%mm3\n" /* add 1 */ | ||
281 | - "paddw " ASMSYM "_mmx_one, %%mm6\n" /* add 1 */ | ||
282 | + "paddw (%8), %%mm3\n" /* add 1 */ | ||
283 | + "paddw (%8), %%mm6\n" /* add 1 */ | ||
284 | "packuswb %%mm5, %%mm4\n" /* pack to byte and saturate */ | ||
285 | "movq 1(%3), %%mm1\n" /* mm1 = [ref+1] */ | ||
286 | "movq %%mm1, %%mm2\n" /* mm2 = mm1 */ | ||
287 | @@ -90,7 +90,7 @@ static void inline mmx_interpolate_signe | ||
288 | "movl 12(%0), %3\n" /* %3 = ref[3] */ | ||
289 | "movq %%mm3, (%3)\n" /* store in frame */ | ||
290 | : "=r"(ref), "=r"(pitch), "=r"(rc), "=r"(dummy) | ||
291 | - : "0"(ref), "1"(pitch), "2"(rc), "3"(dummy) | ||
292 | + : "0"(ref), "1"(pitch), "2"(rc), "3"(dummy), "r"(_mmx_one) | ||
293 | : "memory"); | ||
294 | } | ||
295 | |||
296 | diff -Nurp libfame-0.9.1/src/idct_mmx.h libfame-0.9.1-pic/src/idct_mmx.h | ||
297 | --- libfame-0.9.1/src/idct_mmx.h 2002-04-14 12:22:05.000000000 +0100 | ||
298 | +++ libfame-0.9.1-pic/src/idct_mmx.h 2005-04-24 00:51:00.000000000 +0100 | ||
299 | @@ -18,6 +18,10 @@ | ||
300 | */ | ||
301 | /*************************** MMX accelerated iDCT ****************************/ | ||
302 | |||
303 | +extern FAME_ALIGNED short const _mmx_1[]; | ||
304 | +extern FAME_ALIGNED short const _mmx_cos[]; | ||
305 | +extern FAME_ALIGNED short const _mmx_icos[]; | ||
306 | + | ||
307 | static void inline idct_aan_pass(dct_t * block) | ||
308 | { | ||
309 | // register unsigned short const *mmx_icos = _mmx_icos; | ||
310 | @@ -65,9 +69,9 @@ static void inline idct_aan_pass(dct_t * | ||
311 | block[row*8+6] = v45; - v71, v11, v44, v65, v24 - | ||
312 | */ | ||
313 | "psllw $0x02, %%mm5\n" /* adjust v22 for multiply */ | ||
314 | - "paddw " ASMSYM "_mmx_1, %%mm5\n" /* + 1 for rounding */ | ||
315 | + "paddw (%2), %%mm5\n" /* + 1 for rounding */ | ||
316 | // "pmulhw 8(%1), %%mm5\n" /* 4*v15*ICOS4/4 -> mm5 (v23) */ | ||
317 | - "pmulhw " ASMSYM "_mmx_icos+8, %%mm5\n" /* 4*v15*ICOS4/4 -> mm5 (v23)*/ | ||
318 | + "pmulhw 8(%3), %%mm5\n" /* 4*v15*ICOS4/4 -> mm5 (v23)*/ | ||
319 | "psubsw %%mm4, %%mm5\n" /* v23 - v62 -> mm5 (v24) */ | ||
320 | "movq %%mm3, %%mm6\n" /* v44 -> mm6 */ | ||
321 | "paddsw %%mm5, %%mm6\n" /* v44 + v24 -> mm6 (v45) */ | ||
322 | @@ -125,25 +129,25 @@ static void inline idct_aan_pass(dct_t * | ||
323 | block[row*8+4] += v55; - - | ||
324 | */ | ||
325 | "psllw $0x02, %%mm0\n" /* adjust v12 for multiply */ | ||
326 | - "paddw " ASMSYM "_mmx_1, %%mm0\n" /* + 1 for rounding */ | ||
327 | + "paddw (%2), %%mm0\n" /* + 1 for rounding */ | ||
328 | // "pmulhw 8(%1), %%mm0\n" /* 4*v12*ICOS4/4 -> mm0 (v13) */ | ||
329 | - "pmulhw " ASMSYM "_mmx_icos+8, %%mm0\n" /* 4*v12*ICOS4/4 -> mm0 (v13) */ | ||
330 | + "pmulhw 8(%3), %%mm0\n" /* 4*v12*ICOS4/4 -> mm0 (v13) */ | ||
331 | "movq %%mm2, %%mm6\n" /* v51 -> mm6 */ | ||
332 | "psubsw %%mm1, %%mm6\n" /* v51 - v71 -> mm6 (va2) */ | ||
333 | "psllw $0x03, %%mm2\n" /* adjust v51 for multiply */ | ||
334 | - "paddw " ASMSYM "_mmx_1, %%mm2\n" /* + 1 for rounding */ | ||
335 | + "paddw (%2), %%mm2\n" /* + 1 for rounding */ | ||
336 | /* should add another one here but it seems to look better without */ | ||
337 | // "pmulhw 16(%1), %%mm2\n" /* 8*v51*ICOS6/8 -> mm2 (v53) */ | ||
338 | - "pmulhw " ASMSYM "_mmx_icos+16, %%mm2\n" /* 8*v51*ICOS6/8 -> mm2 (v53) */ | ||
339 | + "pmulhw 16(%3), %%mm2\n" /* 8*v51*ICOS6/8 -> mm2 (v53) */ | ||
340 | "psllw $0x02, %%mm1\n" /* adjust v71 for multiply */ | ||
341 | - "paddw " ASMSYM "_mmx_1, %%mm1\n" /* + 1 for rounding */ | ||
342 | + "paddw (%2), %%mm1\n" /* + 1 for rounding */ | ||
343 | /* should add another one here but it seems to look better without */ | ||
344 | // "pmulhw 0(%1), %%mm1\n" /* 4*v71*ICOS2/4 -> mm1 (v73) */ | ||
345 | - "pmulhw " ASMSYM "_mmx_icos, %%mm1\n" /* 4*v71*ICOS2/4 -> mm1 (v73) */ | ||
346 | + "pmulhw (%3), %%mm1\n" /* 4*v71*ICOS2/4 -> mm1 (v73) */ | ||
347 | "psllw $0x01, %%mm6\n" /* adjust va2 for multiply */ | ||
348 | - "paddw " ASMSYM "_mmx_1, %%mm6\n" /* + 1 for rounding */ | ||
349 | + "paddw (%2), %%mm6\n" /* + 1 for rounding */ | ||
350 | // "pmulhw 24(%1), %%mm6\n" /* 2*v12*ICOS8/2 -> mm6 (va3) */ | ||
351 | - "pmulhw " ASMSYM "_mmx_icos+24, %%mm6\n" /* 2*v12*ICOS8/2 -> mm6 (va3) */ | ||
352 | + "pmulhw 24(%3), %%mm6\n" /* 2*v12*ICOS8/2 -> mm6 (va3) */ | ||
353 | "psubsw %%mm6, %%mm2\n" /* v53 - va3 -> mm2 (v54) */ | ||
354 | "psubsw %%mm6, %%mm1\n" /* v73 - va3 -> mm1 (v74) */ | ||
355 | "psubsw %%mm3, %%mm1\n" /* v74 - v32 -> mm3 (v75) */ | ||
356 | @@ -167,8 +171,8 @@ static void inline idct_aan_pass(dct_t * | ||
357 | "paddsw %%mm0, %%mm7\n" /* v65 + v55 -> mm7 */ | ||
358 | "movq %%mm6, 0x30(%0)\n" /* mm6 -> line 3 */ | ||
359 | "movq %%mm7, 0x40(%0)\n" /* mm7 -> line 4 */ | ||
360 | - : "=r"(block)/*, "=r"(mmx_icos)*/ | ||
361 | - : "0"(block)/*, "1"(mmx_icos)*/ | ||
362 | + : "=r"(block) | ||
363 | + : "0"(block), "r"(_mmx_1), "r"(_mmx_icos) | ||
364 | : "memory"); | ||
365 | } | ||
366 |