Root/
1 | #ifndef _VIDEO_ATAFB_UTILS_H |
2 | #define _VIDEO_ATAFB_UTILS_H |
3 | |
4 | /* ================================================================= */ |
5 | /* Utility Assembler Functions */ |
6 | /* ================================================================= */ |
7 | |
8 | /* ====================================================================== */ |
9 | |
10 | /* Those of a delicate disposition might like to skip the next couple of |
11 | * pages. |
12 | * |
13 | * These functions are drop in replacements for memmove and |
14 | * memset(_, 0, _). However their five instances add at least a kilobyte |
15 | * to the object file. You have been warned. |
16 | * |
17 | * Not a great fan of assembler for the sake of it, but I think |
18 | * that these routines are at least 10 times faster than their C |
19 | * equivalents for large blits, and that's important to the lowest level of |
20 | * a graphics driver. Question is whether some scheme with the blitter |
21 | * would be faster. I suspect not for simple text system - not much |
22 | * asynchrony. |
23 | * |
24 | * Code is very simple, just gruesome expansion. Basic strategy is to |
25 | * increase data moved/cleared at each step to 16 bytes to reduce |
26 | * instruction per data move overhead. movem might be faster still. |
27 | * For more than 15 bytes, we try to align the write direction on a |
28 | * longword boundary to get maximum speed. This is even more gruesome. |
29 | * Unaligned read/write used requires 68020+ - think this is a problem? |
30 | * |
31 | * Sorry! |
32 | */ |
33 | |
34 | |
35 | /* ++roman: I've optimized Robert's original versions in some minor |
36 | * aspects, e.g. moveq instead of movel, let gcc choose the registers, |
37 | * use movem in some places... |
38 | * For other modes than 1 plane, lots of more such assembler functions |
39 | * were needed (e.g. the ones using movep or expanding color values). |
40 | */ |
41 | |
42 | /* ++andreas: more optimizations: |
43 | subl #65536,d0 replaced by clrw d0; subql #1,d0 for dbcc |
44 | addal is faster than addaw |
45 | movep is rather expensive compared to ordinary move's |
46 | some functions rewritten in C for clarity, no speed loss */ |
47 | |
/*
 * fb_memclear_small - zero count bytes at s with m68k inline assembly.
 *
 * Returns 0 in every case; when count is 0 nothing is written.
 * The fill runs backwards: the first asm statement starts at s + count and
 * only uses predecrement stores, so the end of the area is written first.
 *
 * NOTE(review): neither asm statement lists a memory clobber although both
 * store through %0, while fast_memmove in this file does list one. Confirm
 * that no caller depends on the compiler seeing these stores.
 */
48 | static inline void *fb_memclear_small(void *s, size_t count) |
49 | { |
50 | if (!count) |
51 | return 0; |
52 | |
/*
 * Odd remainder: the four low bits of count are shifted out one at a time
 * and each set bit stores 1, 2, 4 or 8 bytes of the zero held in %2.
 * After the four shifts count holds the number of whole 16-byte chunks.
 */
53 | asm volatile ("\n" |
54 | " lsr.l #1,%1 ; jcc 1f ; move.b %2,-(%0)\n" |
55 | "1: lsr.l #1,%1 ; jcc 1f ; move.w %2,-(%0)\n" |
56 | "1: lsr.l #1,%1 ; jcc 1f ; move.l %2,-(%0)\n" |
57 | "1: lsr.l #1,%1 ; jcc 1f ; move.l %2,-(%0) ; move.l %2,-(%0)\n" |
58 | "1:" |
59 | : "=a" (s), "=d" (count) |
60 | : "d" (0), "0" ((char *)s + count), "1" (count)); |
/*
 * Whole chunks: skipped (jcs) when no chunk is left, otherwise d4-d6 are
 * loaded with the fill value and each movem.l stores four registers,
 * i.e. 16 bytes, per loop iteration.
 * NOTE(review): dbra counts in the low word of %1 only and, unlike the loop
 * in fb_memclear, there is no fixup for the upper word afterwards, so this
 * presumably handles at most 65536 chunks - confirm.
 */
61 | asm volatile ("\n" |
62 | " subq.l #1,%1\n" |
63 | " jcs 3f\n" |
64 | " move.l %2,%%d4; move.l %2,%%d5; move.l %2,%%d6\n" |
65 | "2: movem.l %2/%%d4/%%d5/%%d6,-(%0)\n" |
66 | " dbra %1,2b\n" |
67 | "3:" |
68 | : "=a" (s), "=d" (count) |
69 | : "d" (0), "0" (s), "1" (count) |
70 | : "d4", "d5", "d6" |
71 | ); |
72 | |
73 | return 0; |
74 | } |
75 | |
76 | |
/*
 * fb_memclear - zero count bytes starting at s, walking forward.
 *
 * Returns 0 in every case; when count is 0 nothing is written.
 * Counts below 16 are handled by a short bit-driven sequence, larger counts
 * by a sequence of leading byte/word clears, a 16-bytes-per-iteration
 * longword loop and trailing word/byte clears.
 *
 * NOTE(review): neither asm statement lists a memory clobber although both
 * store through %0, while fast_memmove in this file does list one.
 */
77 | static inline void *fb_memclear(void *s, size_t count) |
78 | { |
79 | if (!count) |
80 | return 0; |
81 | |
82 | if (count < 16) { |
/*
 * Each of the four low bits of count selects a clear of 1, 2, 4 or 8 bytes.
 */
83 | asm volatile ("\n" |
84 | " lsr.l #1,%1 ; jcc 1f ; clr.b (%0)+\n" |
85 | "1: lsr.l #1,%1 ; jcc 1f ; clr.w (%0)+\n" |
86 | "1: lsr.l #1,%1 ; jcc 1f ; clr.l (%0)+\n" |
87 | "1: lsr.l #1,%1 ; jcc 1f ; clr.l (%0)+ ; clr.l (%0)+\n" |
88 | "1:" |
89 | : "=a" (s), "=d" (count) |
90 | : "0" (s), "1" (count)); |
91 | } else { |
92 | long tmp; |
/*
 * Up to label 2: the two low bits of the scratch copy in %2 decide whether
 * a leading byte and/or word is cleared, with count reduced to match.
 * NOTE(review): the scratch copy is taken from the count (%1); fb_memmove
 * takes it from the destination pointer (%0) instead. Confirm whether the
 * pointer was meant here as well.
 * NOTE(review): the subq.w adjustments only touch the low word of %1, so a
 * count whose low 16 bits equal 1 (for example 0x10001) looks like it loses
 * the borrow and clears too much - confirm.
 *
 * Label 2: the low word of the remaining count is parked in %2, then count
 * becomes a longword count; its two low bits clear one and two longwords.
 * Labels 4-5: what is left is a count of 16-byte chunks; dbra loops on the
 * low word and the clr.w / subq.l / jcc tail continues while the upper word
 * is not exhausted.
 * Labels 6-7: the parked low word is restored and its bits 1 and 0 select a
 * trailing word and byte.
 */
93 | asm volatile ("\n" |
94 | " move.l %1,%2\n" |
95 | " lsr.l #1,%2 ; jcc 1f ; clr.b (%0)+ ; subq.w #1,%1\n" |
96 | " lsr.l #1,%2 ; jcs 2f\n" /* %0 increased=>bit 2 switched*/ |
97 | " clr.w (%0)+ ; subq.w #2,%1 ; jra 2f\n" |
98 | "1: lsr.l #1,%2 ; jcc 2f\n" |
99 | " clr.w (%0)+ ; subq.w #2,%1\n" |
100 | "2: move.w %1,%2; lsr.l #2,%1 ; jeq 6f\n" |
101 | " lsr.l #1,%1 ; jcc 3f ; clr.l (%0)+\n" |
102 | "3: lsr.l #1,%1 ; jcc 4f ; clr.l (%0)+ ; clr.l (%0)+\n" |
103 | "4: subq.l #1,%1 ; jcs 6f\n" |
104 | "5: clr.l (%0)+; clr.l (%0)+ ; clr.l (%0)+ ; clr.l (%0)+\n" |
105 | " dbra %1,5b ; clr.w %1; subq.l #1,%1; jcc 5b\n" |
106 | "6: move.w %2,%1; btst #1,%1 ; jeq 7f ; clr.w (%0)+\n" |
107 | "7: btst #0,%1 ; jeq 8f ; clr.b (%0)+\n" |
108 | "8:" |
109 | : "=a" (s), "=d" (count), "=d" (tmp) |
110 | : "0" (s), "1" (count)); |
111 | } |
112 | |
113 | return 0; |
114 | } |
115 | |
116 | |
/*
 * fb_memset255 - set count bytes at s to 0xff with m68k inline assembly.
 *
 * Returns 0 in every case; when count is 0 nothing is written.
 * Same structure as fb_memclear_small, but the fill register %2 holds -1
 * (all bits set) instead of 0. The fill runs backwards from s + count using
 * predecrement stores.
 *
 * NOTE(review): neither asm statement lists a memory clobber although both
 * store through %0, while fast_memmove in this file does list one.
 */
117 | static inline void *fb_memset255(void *s, size_t count) |
118 | { |
119 | if (!count) |
120 | return 0; |
121 | |
/*
 * Odd remainder: each of the four low bits of count stores 1, 2, 4 or 8
 * bytes of the fill value; afterwards count is the number of 16-byte chunks.
 */
122 | asm volatile ("\n" |
123 | " lsr.l #1,%1 ; jcc 1f ; move.b %2,-(%0)\n" |
124 | "1: lsr.l #1,%1 ; jcc 1f ; move.w %2,-(%0)\n" |
125 | "1: lsr.l #1,%1 ; jcc 1f ; move.l %2,-(%0)\n" |
126 | "1: lsr.l #1,%1 ; jcc 1f ; move.l %2,-(%0) ; move.l %2,-(%0)\n" |
127 | "1:" |
128 | : "=a" (s), "=d" (count) |
129 | : "d" (-1), "0" ((char *)s+count), "1" (count)); |
/*
 * Whole chunks: skipped when none is left; otherwise four registers holding
 * the fill value are stored per movem.l, 16 bytes per iteration.
 * NOTE(review): dbra counts in the low word of %1 only and no upper-word
 * fixup follows, so this presumably handles at most 65536 chunks - confirm.
 */
130 | asm volatile ("\n" |
131 | " subq.l #1,%1 ; jcs 3f\n" |
132 | " move.l %2,%%d4; move.l %2,%%d5; move.l %2,%%d6\n" |
133 | "2: movem.l %2/%%d4/%%d5/%%d6,-(%0)\n" |
134 | " dbra %1,2b\n" |
135 | "3:" |
136 | : "=a" (s), "=d" (count) |
137 | : "d" (-1), "0" (s), "1" (count) |
138 | : "d4", "d5", "d6"); |
139 | |
140 | return 0; |
141 | } |
142 | |
143 | |
/*
 * fb_memmove - copy count bytes from s to d, choosing the copy direction
 * from the pointer order.
 *
 * When d < s the copy runs forward with postincrement addressing; otherwise
 * it starts at d + count and s + count and runs backward with predecrement
 * addressing. Each direction has a short variant for count < 16 and a long
 * variant that first consumes a leading byte/word selected by the two low
 * bits of the destination address, then copies 16 bytes per loop iteration,
 * then a trailing word/byte.
 *
 * Returns 0 in every case. There is no early exit for count == 0; in that
 * case the short variant runs with all four bit tests falling through.
 *
 * NOTE(review): none of the asm statements lists a memory clobber although
 * they store through %0, while fast_memmove in this file does list one.
 * NOTE(review): the subqw adjustments in the long variants only touch the
 * low word of %2; a count whose low 16 bits are smaller than the amount
 * subtracted (for example 0x10000 with an odd destination) looks like it
 * loses the borrow - confirm.
 * NOTE(review): %- and %+ are presumably the m68k GCC operand punctuation
 * for a push to and a pop from the stack, used to park the low word of the
 * count across the longword loop - confirm against the compiler in use.
 */
144 | static inline void *fb_memmove(void *d, const void *s, size_t count) |
145 | { |
146 | if (d < s) { |
147 | if (count < 16) { |
/* Forward, short: each low bit of count copies 1, 2, 4 or 8 bytes. */
148 | asm volatile ("\n" |
149 | " lsr.l #1,%2 ; jcc 1f ; move.b (%1)+,(%0)+\n" |
150 | "1: lsr.l #1,%2 ; jcc 1f ; move.w (%1)+,(%0)+\n" |
151 | "1: lsr.l #1,%2 ; jcc 1f ; move.l (%1)+,(%0)+\n" |
152 | "1: lsr.l #1,%2 ; jcc 1f ; move.l (%1)+,(%0)+ ; move.l (%1)+,(%0)+\n" |
153 | "1:" |
154 | : "=a" (d), "=a" (s), "=d" (count) |
155 | : "0" (d), "1" (s), "2" (count)); |
156 | } else { |
157 | long tmp; |
/*
 * Forward, long: %3 (tmp) receives the destination address and its two low
 * bits select the leading byte/word copies. From label 2 on, count is
 * turned into a longword count, then into a count of 16-byte chunks for the
 * loop at label 5; labels 6-7 copy the trailing word/byte.
 */
158 | asm volatile ("\n" |
159 | " move.l %0,%3\n" |
160 | " lsr.l #1,%3 ; jcc 1f ; move.b (%1)+,(%0)+ ; subqw #1,%2\n" |
161 | " lsr.l #1,%3 ; jcs 2f\n" /* %0 increased=>bit 2 switched*/ |
162 | " move.w (%1)+,(%0)+ ; subqw #2,%2 ; jra 2f\n" |
163 | "1: lsr.l #1,%3 ; jcc 2f\n" |
164 | " move.w (%1)+,(%0)+ ; subqw #2,%2\n" |
165 | "2: move.w %2,%-; lsr.l #2,%2 ; jeq 6f\n" |
166 | " lsr.l #1,%2 ; jcc 3f ; move.l (%1)+,(%0)+\n" |
167 | "3: lsr.l #1,%2 ; jcc 4f ; move.l (%1)+,(%0)+ ; move.l (%1)+,(%0)+\n" |
168 | "4: subq.l #1,%2 ; jcs 6f\n" |
169 | "5: move.l (%1)+,(%0)+; move.l (%1)+,(%0)+\n" |
170 | " move.l (%1)+,(%0)+; move.l (%1)+,(%0)+\n" |
171 | " dbra %2,5b ; clr.w %2; subq.l #1,%2; jcc 5b\n" |
172 | "6: move.w %+,%2; btst #1,%2 ; jeq 7f ; move.w (%1)+,(%0)+\n" |
173 | "7: btst #0,%2 ; jeq 8f ; move.b (%1)+,(%0)+\n" |
174 | "8:" |
175 | : "=a" (d), "=a" (s), "=d" (count), "=d" (tmp) |
176 | : "0" (d), "1" (s), "2" (count)); |
177 | } |
178 | } else { |
179 | if (count < 16) { |
/* Backward, short: same bit-driven scheme, starting one past the end. */
180 | asm volatile ("\n" |
181 | " lsr.l #1,%2 ; jcc 1f ; move.b -(%1),-(%0)\n" |
182 | "1: lsr.l #1,%2 ; jcc 1f ; move.w -(%1),-(%0)\n" |
183 | "1: lsr.l #1,%2 ; jcc 1f ; move.l -(%1),-(%0)\n" |
184 | "1: lsr.l #1,%2 ; jcc 1f ; move.l -(%1),-(%0) ; move.l -(%1),-(%0)\n" |
185 | "1:" |
186 | : "=a" (d), "=a" (s), "=d" (count) |
187 | : "0" ((char *) d + count), "1" ((char *) s + count), "2" (count)); |
188 | } else { |
189 | long tmp; |
190 | |
/*
 * Backward, long: mirror image of the forward long variant; %3 is seeded
 * from the end-of-destination address passed in as %0.
 */
191 | asm volatile ("\n" |
192 | " move.l %0,%3\n" |
193 | " lsr.l #1,%3 ; jcc 1f ; move.b -(%1),-(%0) ; subqw #1,%2\n" |
194 | " lsr.l #1,%3 ; jcs 2f\n" /* %0 increased=>bit 2 switched*/ |
195 | " move.w -(%1),-(%0) ; subqw #2,%2 ; jra 2f\n" |
196 | "1: lsr.l #1,%3 ; jcc 2f\n" |
197 | " move.w -(%1),-(%0) ; subqw #2,%2\n" |
198 | "2: move.w %2,%-; lsr.l #2,%2 ; jeq 6f\n" |
199 | " lsr.l #1,%2 ; jcc 3f ; move.l -(%1),-(%0)\n" |
200 | "3: lsr.l #1,%2 ; jcc 4f ; move.l -(%1),-(%0) ; move.l -(%1),-(%0)\n" |
201 | "4: subq.l #1,%2 ; jcs 6f\n" |
202 | "5: move.l -(%1),-(%0); move.l -(%1),-(%0)\n" |
203 | " move.l -(%1),-(%0); move.l -(%1),-(%0)\n" |
204 | " dbra %2,5b ; clr.w %2; subq.l #1,%2; jcc 5b\n" |
205 | "6: move.w %+,%2; btst #1,%2 ; jeq 7f ; move.w -(%1),-(%0)\n" |
206 | "7: btst #0,%2 ; jeq 8f ; move.b -(%1),-(%0)\n" |
207 | "8:" |
208 | : "=a" (d), "=a" (s), "=d" (count), "=d" (tmp) |
209 | : "0" ((char *) d + count), "1" ((char *) s + count), "2" (count)); |
210 | } |
211 | } |
212 | |
213 | return 0; |
214 | } |
215 | |
216 | |
217 | /* ++andreas: Simple and fast version of memmove, assumes size is |
218 | divisible by 16, suitable for moving the whole screen bitplane */ |
/*
 * fast_memmove - copy size bytes from src to dst in 16-byte steps.
 *
 * Returns immediately when size is 0. When dst < src the copy runs forward,
 * otherwise it starts at src + size and dst + size and runs backward.
 * Each iteration moves four longwords through d0/d1/a0/a1 with movem.l;
 * those registers and memory are listed as clobbers.
 *
 * The loop counter passed to the asm is size / 16 - 1. dbra counts on its
 * low word and the clr.w / subq.l / jcc tail keeps looping while the upper
 * word is not exhausted.
 * NOTE(review): for size in 1..15 the expression size / 16 - 1 wraps around,
 * so callers presumably must honour the divisible-by-16 assumption stated
 * above - confirm.
 */
219 | static inline void fast_memmove(char *dst, const char *src, size_t size) |
220 | { |
221 | if (!size) |
222 | return; |
223 | if (dst < src) |
/* Forward: load with postincrement, store at dst, then advance dst by 16. */
224 | asm volatile ("\n" |
225 | "1: movem.l (%0)+,%%d0/%%d1/%%a0/%%a1\n" |
226 | " movem.l %%d0/%%d1/%%a0/%%a1,%1@\n" |
227 | " addq.l #8,%1; addq.l #8,%1\n" |
228 | " dbra %2,1b\n" |
229 | " clr.w %2; subq.l #1,%2\n" |
230 | " jcc 1b" |
231 | : "=a" (src), "=a" (dst), "=d" (size) |
232 | : "0" (src), "1" (dst), "2" (size / 16 - 1) |
233 | : "d0", "d1", "a0", "a1", "memory"); |
234 | else |
/* Backward: step src back by 16, load, then store with predecrement. */
235 | asm volatile ("\n" |
236 | "1: subq.l #8,%0; subq.l #8,%0\n" |
237 | " movem.l %0@,%%d0/%%d1/%%a0/%%a1\n" |
238 | " movem.l %%d0/%%d1/%%a0/%%a1,-(%1)\n" |
239 | " dbra %2,1b\n" |
240 | " clr.w %2; subq.l #1,%2\n" |
241 | " jcc 1b" |
242 | : "=a" (src), "=a" (dst), "=d" (size) |
243 | : "0" (src + size), "1" (dst + size), "2" (size / 16 - 1) |
244 | : "d0", "d1", "a0", "a1", "memory"); |
245 | } |
246 | |
247 | #ifdef BPL |
248 | |
249 | /* |
250 | * Lookup table used to expand a color of up to 8 bits into two longs |
251 | * for movel operations: the index is a 4-bit value and bit k of the |
251 | * index (counted from the least significant bit) sets byte k of the |
251 | * entry (counted from the least significant byte) to 0xff. |
252 | */ |
253 | static const u32 four2long[] = { |
254 | 0x00000000, 0x000000ff, 0x0000ff00, 0x0000ffff, |
255 | 0x00ff0000, 0x00ff00ff, 0x00ffff00, 0x00ffffff, |
256 | 0xff000000, 0xff0000ff, 0xff00ff00, 0xff00ffff, |
257 | 0xffff0000, 0xffff00ff, 0xffffff00, 0xffffffff, |
258 | }; |
259 | |
260 | static inline void expand8_col2mask(u8 c, u32 m[]) |
261 | { |
262 | m[0] = four2long[c & 15]; |
263 | #if BPL > 4 |
264 | m[1] = four2long[c >> 4]; |
265 | #endif |
266 | } |
267 | |
268 | static inline void expand8_2col2mask(u8 fg, u8 bg, u32 fgm[], u32 bgm[]) |
269 | { |
270 | fgm[0] = four2long[fg & 15] ^ (bgm[0] = four2long[bg & 15]); |
271 | #if BPL > 4 |
272 | fgm[1] = four2long[fg >> 4] ^ (bgm[1] = four2long[bg >> 4]); |
273 | #endif |
274 | } |
275 | |
276 | /* |
277 | * set an 8bit value to a color |
278 | */ |
279 | static inline void fill8_col(u8 *dst, u32 m[]) |
280 | { |
281 | u32 tmp = m[0]; |
282 | dst[0] = tmp; |
283 | dst[2] = (tmp >>= 8); |
284 | #if BPL > 2 |
285 | dst[4] = (tmp >>= 8); |
286 | dst[6] = tmp >> 8; |
287 | #endif |
288 | #if BPL > 4 |
289 | tmp = m[1]; |
290 | dst[8] = tmp; |
291 | dst[10] = (tmp >>= 8); |
292 | dst[12] = (tmp >>= 8); |
293 | dst[14] = tmp >> 8; |
294 | #endif |
295 | } |
296 | |
297 | /* |
298 | * set an 8bit value according to foreground/background color |
299 | */ |
300 | static inline void fill8_2col(u8 *dst, u8 fg, u8 bg, u32 mask) |
301 | { |
302 | u32 fgm[2], bgm[2], tmp; |
303 | |
304 | expand8_2col2mask(fg, bg, fgm, bgm); |
305 | |
306 | mask |= mask << 8; |
307 | #if BPL > 2 |
308 | mask |= mask << 16; |
309 | #endif |
310 | tmp = (mask & fgm[0]) ^ bgm[0]; |
311 | dst[0] = tmp; |
312 | dst[2] = (tmp >>= 8); |
313 | #if BPL > 2 |
314 | dst[4] = (tmp >>= 8); |
315 | dst[6] = tmp >> 8; |
316 | #endif |
317 | #if BPL > 4 |
318 | tmp = (mask & fgm[1]) ^ bgm[1]; |
319 | dst[8] = tmp; |
320 | dst[10] = (tmp >>= 8); |
321 | dst[12] = (tmp >>= 8); |
322 | dst[14] = tmp >> 8; |
323 | #endif |
324 | } |
325 | |
/*
 * Lookup table indexed by a 2-bit value: bit 0 of the index fills the upper
 * 16 bits of the entry with ones, bit 1 fills the lower 16 bits.
 */
326 | static const u32 two2word[] = { |
327 | 0x00000000, 0xffff0000, 0x0000ffff, 0xffffffff |
328 | }; |
329 | |
330 | static inline void expand16_col2mask(u8 c, u32 m[]) |
331 | { |
332 | m[0] = two2word[c & 3]; |
333 | #if BPL > 2 |
334 | m[1] = two2word[(c >> 2) & 3]; |
335 | #endif |
336 | #if BPL > 4 |
337 | m[2] = two2word[(c >> 4) & 3]; |
338 | m[3] = two2word[c >> 6]; |
339 | #endif |
340 | } |
341 | |
342 | static inline void expand16_2col2mask(u8 fg, u8 bg, u32 fgm[], u32 bgm[]) |
343 | { |
344 | bgm[0] = two2word[bg & 3]; |
345 | fgm[0] = two2word[fg & 3] ^ bgm[0]; |
346 | #if BPL > 2 |
347 | bgm[1] = two2word[(bg >> 2) & 3]; |
348 | fgm[1] = two2word[(fg >> 2) & 3] ^ bgm[1]; |
349 | #endif |
350 | #if BPL > 4 |
351 | bgm[2] = two2word[(bg >> 4) & 3]; |
352 | fgm[2] = two2word[(fg >> 4) & 3] ^ bgm[2]; |
353 | bgm[3] = two2word[bg >> 6]; |
354 | fgm[3] = two2word[fg >> 6] ^ bgm[3]; |
355 | #endif |
356 | } |
357 | |
358 | static inline u32 *fill16_col(u32 *dst, int rows, u32 m[]) |
359 | { |
360 | while (rows) { |
361 | *dst++ = m[0]; |
362 | #if BPL > 2 |
363 | *dst++ = m[1]; |
364 | #endif |
365 | #if BPL > 4 |
366 | *dst++ = m[2]; |
367 | *dst++ = m[3]; |
368 | #endif |
369 | rows--; |
370 | } |
371 | return dst; |
372 | } |
373 | |
374 | static inline void memmove32_col(void *dst, void *src, u32 mask, u32 h, u32 bytes) |
375 | { |
376 | u32 *s, *d, v; |
377 | |
378 | s = src; |
379 | d = dst; |
380 | do { |
381 | v = (*s++ & mask) | (*d & ~mask); |
382 | *d++ = v; |
383 | #if BPL > 2 |
384 | v = (*s++ & mask) | (*d & ~mask); |
385 | *d++ = v; |
386 | #endif |
387 | #if BPL > 4 |
388 | v = (*s++ & mask) | (*d & ~mask); |
389 | *d++ = v; |
390 | v = (*s++ & mask) | (*d & ~mask); |
391 | *d++ = v; |
392 | #endif |
393 | d = (u32 *)((u8 *)d + bytes); |
394 | s = (u32 *)((u8 *)s + bytes); |
395 | } while (--h); |
396 | } |
397 | |
398 | #endif |
399 | |
400 | #endif /* _VIDEO_ATAFB_UTILS_H */ |
401 |
Branches:
ben-wpan
ben-wpan-stefan
javiroman/ks7010
jz-2.6.34
jz-2.6.34-rc5
jz-2.6.34-rc6
jz-2.6.34-rc7
jz-2.6.35
jz-2.6.36
jz-2.6.37
jz-2.6.38
jz-2.6.39
jz-3.0
jz-3.1
jz-3.11
jz-3.12
jz-3.13
jz-3.15
jz-3.16
jz-3.18-dt
jz-3.2
jz-3.3
jz-3.4
jz-3.5
jz-3.6
jz-3.6-rc2-pwm
jz-3.9
jz-3.9-clk
jz-3.9-rc8
jz47xx
jz47xx-2.6.38
master
Tags:
od-2011-09-04
od-2011-09-18
v2.6.34-rc5
v2.6.34-rc6
v2.6.34-rc7
v3.9