/* U1memcpy.S: UltraSPARC-I/II/IIi/IIe optimized memcpy.
 *
 * Copyright (C) 1997, 2004 David S. Miller (davem@redhat.com)
 * Copyright (C) 1996, 1997, 1998, 1999 Jakub Jelinek (jj@ultra.linux.cz)
 */

7 | #ifdef __KERNEL__ |
8 | #include <asm/visasm.h> |
9 | #include <asm/asi.h> |
10 | #define GLOBAL_SPARE g7 |
11 | #else |
12 | #define GLOBAL_SPARE g5 |
13 | #define ASI_BLK_P 0xf0 |
14 | #define FPRS_FEF 0x04 |
15 | #ifdef MEMCPY_DEBUG |
16 | #define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \ |
17 | clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0; |
18 | #define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs |
19 | #else |
20 | #define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs |
21 | #define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs |
22 | #endif |
23 | #endif |
24 | |
25 | #ifndef EX_LD |
26 | #define EX_LD(x) x |
27 | #endif |
28 | |
29 | #ifndef EX_ST |
30 | #define EX_ST(x) x |
31 | #endif |
32 | |
33 | #ifndef EX_RETVAL |
34 | #define EX_RETVAL(x) x |
35 | #endif |
36 | |
37 | #ifndef LOAD |
38 | #define LOAD(type,addr,dest) type [addr], dest |
39 | #endif |
40 | |
41 | #ifndef LOAD_BLK |
42 | #define LOAD_BLK(addr,dest) ldda [addr] ASI_BLK_P, dest |
43 | #endif |
44 | |
45 | #ifndef STORE |
46 | #define STORE(type,src,addr) type src, [addr] |
47 | #endif |
48 | |
49 | #ifndef STORE_BLK |
50 | #define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_P |
51 | #endif |
52 | |
53 | #ifndef FUNC_NAME |
54 | #define FUNC_NAME memcpy |
55 | #endif |
56 | |
57 | #ifndef PREAMBLE |
58 | #define PREAMBLE |
59 | #endif |
60 | |
61 | #ifndef XCC |
62 | #define XCC xcc |
63 | #endif |
64 | |
65 | #define FREG_FROB(f1, f2, f3, f4, f5, f6, f7, f8, f9) \ |
66 | faligndata %f1, %f2, %f48; \ |
67 | faligndata %f2, %f3, %f50; \ |
68 | faligndata %f3, %f4, %f52; \ |
69 | faligndata %f4, %f5, %f54; \ |
70 | faligndata %f5, %f6, %f56; \ |
71 | faligndata %f6, %f7, %f58; \ |
72 | faligndata %f7, %f8, %f60; \ |
73 | faligndata %f8, %f9, %f62; |
74 | |
75 | #define MAIN_LOOP_CHUNK(src, dest, fdest, fsrc, len, jmptgt) \ |
76 | EX_LD(LOAD_BLK(%src, %fdest)); \ |
77 | EX_ST(STORE_BLK(%fsrc, %dest)); \ |
78 | add %src, 0x40, %src; \ |
79 | subcc %len, 0x40, %len; \ |
80 | be,pn %xcc, jmptgt; \ |
81 | add %dest, 0x40, %dest; \ |
82 | |
83 | #define LOOP_CHUNK1(src, dest, len, branch_dest) \ |
84 | MAIN_LOOP_CHUNK(src, dest, f0, f48, len, branch_dest) |
85 | #define LOOP_CHUNK2(src, dest, len, branch_dest) \ |
86 | MAIN_LOOP_CHUNK(src, dest, f16, f48, len, branch_dest) |
87 | #define LOOP_CHUNK3(src, dest, len, branch_dest) \ |
88 | MAIN_LOOP_CHUNK(src, dest, f32, f48, len, branch_dest) |
89 | |
90 | #define DO_SYNC membar #Sync; |
91 | #define STORE_SYNC(dest, fsrc) \ |
92 | EX_ST(STORE_BLK(%fsrc, %dest)); \ |
93 | add %dest, 0x40, %dest; \ |
94 | DO_SYNC |
95 | |
96 | #define STORE_JUMP(dest, fsrc, target) \ |
97 | EX_ST(STORE_BLK(%fsrc, %dest)); \ |
98 | add %dest, 0x40, %dest; \ |
99 | ba,pt %xcc, target; \ |
100 | nop; |
101 | |
102 | #define FINISH_VISCHUNK(dest, f0, f1, left) \ |
103 | subcc %left, 8, %left;\ |
104 | bl,pn %xcc, 95f; \ |
105 | faligndata %f0, %f1, %f48; \ |
106 | EX_ST(STORE(std, %f48, %dest)); \ |
107 | add %dest, 8, %dest; |
108 | |
109 | #define UNEVEN_VISCHUNK_LAST(dest, f0, f1, left) \ |
110 | subcc %left, 8, %left; \ |
111 | bl,pn %xcc, 95f; \ |
112 | fsrc1 %f0, %f1; |
113 | |
114 | #define UNEVEN_VISCHUNK(dest, f0, f1, left) \ |
115 | UNEVEN_VISCHUNK_LAST(dest, f0, f1, left) \ |
116 | ba,a,pt %xcc, 93f; |
117 | |
118 | .register %g2,#scratch |
119 | .register %g3,#scratch |
120 | |
121 | .text |
122 | .align 64 |
123 | |
124 | .globl FUNC_NAME |
125 | .type FUNC_NAME,#function |
126 | FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ |
127 | srlx %o2, 31, %g2 |
128 | cmp %g2, 0 |
129 | tne %xcc, 5 |
130 | PREAMBLE |
131 | mov %o0, %o4 |
132 | cmp %o2, 0 |
133 | be,pn %XCC, 85f |
134 | or %o0, %o1, %o3 |
135 | cmp %o2, 16 |
136 | blu,a,pn %XCC, 80f |
137 | or %o3, %o2, %o3 |
138 | |
139 | cmp %o2, (5 * 64) |
140 | blu,pt %XCC, 70f |
141 | andcc %o3, 0x7, %g0 |
142 | |
143 | /* Clobbers o5/g1/g2/g3/g7/icc/xcc. */ |
144 | VISEntry |
145 | |
146 | /* Is 'dst' already aligned on an 64-byte boundary? */ |
147 | andcc %o0, 0x3f, %g2 |
148 | be,pt %XCC, 2f |
149 | |
150 | /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number |
151 | * of bytes to copy to make 'dst' 64-byte aligned. We pre- |
152 | * subtract this from 'len'. |
153 | */ |
154 | sub %o0, %o1, %GLOBAL_SPARE |
155 | sub %g2, 0x40, %g2 |
156 | sub %g0, %g2, %g2 |
157 | sub %o2, %g2, %o2 |
158 | andcc %g2, 0x7, %g1 |
159 | be,pt %icc, 2f |
160 | and %g2, 0x38, %g2 |
161 | |
162 | 1: subcc %g1, 0x1, %g1 |
163 | EX_LD(LOAD(ldub, %o1 + 0x00, %o3)) |
164 | EX_ST(STORE(stb, %o3, %o1 + %GLOBAL_SPARE)) |
165 | bgu,pt %XCC, 1b |
166 | add %o1, 0x1, %o1 |
167 | |
168 | add %o1, %GLOBAL_SPARE, %o0 |
169 | |
170 | 2: cmp %g2, 0x0 |
171 | and %o1, 0x7, %g1 |
172 | be,pt %icc, 3f |
173 | alignaddr %o1, %g0, %o1 |
174 | |
175 | EX_LD(LOAD(ldd, %o1, %f4)) |
176 | 1: EX_LD(LOAD(ldd, %o1 + 0x8, %f6)) |
177 | add %o1, 0x8, %o1 |
178 | subcc %g2, 0x8, %g2 |
179 | faligndata %f4, %f6, %f0 |
180 | EX_ST(STORE(std, %f0, %o0)) |
181 | be,pn %icc, 3f |
182 | add %o0, 0x8, %o0 |
183 | |
184 | EX_LD(LOAD(ldd, %o1 + 0x8, %f4)) |
185 | add %o1, 0x8, %o1 |
186 | subcc %g2, 0x8, %g2 |
187 | faligndata %f6, %f4, %f0 |
188 | EX_ST(STORE(std, %f0, %o0)) |
189 | bne,pt %icc, 1b |
190 | add %o0, 0x8, %o0 |
191 | |
192 | /* Destination is 64-byte aligned. */ |
193 | 3: |
194 | membar #LoadStore | #StoreStore | #StoreLoad |
195 | |
196 | subcc %o2, 0x40, %GLOBAL_SPARE |
197 | add %o1, %g1, %g1 |
198 | andncc %GLOBAL_SPARE, (0x40 - 1), %GLOBAL_SPARE |
199 | srl %g1, 3, %g2 |
200 | sub %o2, %GLOBAL_SPARE, %g3 |
201 | andn %o1, (0x40 - 1), %o1 |
202 | and %g2, 7, %g2 |
203 | andncc %g3, 0x7, %g3 |
204 | fmovd %f0, %f2 |
205 | sub %g3, 0x8, %g3 |
206 | sub %o2, %GLOBAL_SPARE, %o2 |
207 | |
208 | add %g1, %GLOBAL_SPARE, %g1 |
209 | subcc %o2, %g3, %o2 |
210 | |
211 | EX_LD(LOAD_BLK(%o1, %f0)) |
212 | add %o1, 0x40, %o1 |
213 | add %g1, %g3, %g1 |
214 | EX_LD(LOAD_BLK(%o1, %f16)) |
215 | add %o1, 0x40, %o1 |
216 | sub %GLOBAL_SPARE, 0x80, %GLOBAL_SPARE |
217 | EX_LD(LOAD_BLK(%o1, %f32)) |
218 | add %o1, 0x40, %o1 |
219 | |
220 | /* There are 8 instances of the unrolled loop, |
221 | * one for each possible alignment of the |
222 | * source buffer. Each loop instance is 452 |
223 | * bytes. |
224 | */ |
225 | sll %g2, 3, %o3 |
226 | sub %o3, %g2, %o3 |
227 | sllx %o3, 4, %o3 |
228 | add %o3, %g2, %o3 |
229 | sllx %o3, 2, %g2 |
230 | 1: rd %pc, %o3 |
231 | add %o3, %lo(1f - 1b), %o3 |
232 | jmpl %o3 + %g2, %g0 |
233 | nop |
234 | |
235 | .align 64 |
236 | 1: FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16) |
237 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) |
238 | FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32) |
239 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) |
240 | FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0) |
241 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) |
242 | ba,pt %xcc, 1b+4 |
243 | faligndata %f0, %f2, %f48 |
244 | 1: FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32) |
245 | STORE_SYNC(o0, f48) |
246 | FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0) |
247 | STORE_JUMP(o0, f48, 40f) |
248 | 2: FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0) |
249 | STORE_SYNC(o0, f48) |
250 | FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16) |
251 | STORE_JUMP(o0, f48, 48f) |
252 | 3: FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16) |
253 | STORE_SYNC(o0, f48) |
254 | FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32) |
255 | STORE_JUMP(o0, f48, 56f) |
256 | |
257 | 1: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18) |
258 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) |
259 | FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34) |
260 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) |
261 | FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2) |
262 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) |
263 | ba,pt %xcc, 1b+4 |
264 | faligndata %f2, %f4, %f48 |
265 | 1: FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34) |
266 | STORE_SYNC(o0, f48) |
267 | FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2) |
268 | STORE_JUMP(o0, f48, 41f) |
269 | 2: FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2) |
270 | STORE_SYNC(o0, f48) |
271 | FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18) |
272 | STORE_JUMP(o0, f48, 49f) |
273 | 3: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18) |
274 | STORE_SYNC(o0, f48) |
275 | FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34) |
276 | STORE_JUMP(o0, f48, 57f) |
277 | |
278 | 1: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20) |
279 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) |
280 | FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36) |
281 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) |
282 | FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4) |
283 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) |
284 | ba,pt %xcc, 1b+4 |
285 | faligndata %f4, %f6, %f48 |
286 | 1: FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36) |
287 | STORE_SYNC(o0, f48) |
288 | FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4) |
289 | STORE_JUMP(o0, f48, 42f) |
290 | 2: FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4) |
291 | STORE_SYNC(o0, f48) |
292 | FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20) |
293 | STORE_JUMP(o0, f48, 50f) |
294 | 3: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20) |
295 | STORE_SYNC(o0, f48) |
296 | FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36) |
297 | STORE_JUMP(o0, f48, 58f) |
298 | |
299 | 1: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22) |
300 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) |
301 | FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38) |
302 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) |
303 | FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6) |
304 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) |
305 | ba,pt %xcc, 1b+4 |
306 | faligndata %f6, %f8, %f48 |
307 | 1: FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38) |
308 | STORE_SYNC(o0, f48) |
309 | FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6) |
310 | STORE_JUMP(o0, f48, 43f) |
311 | 2: FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6) |
312 | STORE_SYNC(o0, f48) |
313 | FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22) |
314 | STORE_JUMP(o0, f48, 51f) |
315 | 3: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22) |
316 | STORE_SYNC(o0, f48) |
317 | FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38) |
318 | STORE_JUMP(o0, f48, 59f) |
319 | |
320 | 1: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24) |
321 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) |
322 | FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40) |
323 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) |
324 | FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8) |
325 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) |
326 | ba,pt %xcc, 1b+4 |
327 | faligndata %f8, %f10, %f48 |
328 | 1: FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40) |
329 | STORE_SYNC(o0, f48) |
330 | FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8) |
331 | STORE_JUMP(o0, f48, 44f) |
332 | 2: FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8) |
333 | STORE_SYNC(o0, f48) |
334 | FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24) |
335 | STORE_JUMP(o0, f48, 52f) |
336 | 3: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24) |
337 | STORE_SYNC(o0, f48) |
338 | FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40) |
339 | STORE_JUMP(o0, f48, 60f) |
340 | |
341 | 1: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26) |
342 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) |
343 | FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42) |
344 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) |
345 | FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10) |
346 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) |
347 | ba,pt %xcc, 1b+4 |
348 | faligndata %f10, %f12, %f48 |
349 | 1: FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42) |
350 | STORE_SYNC(o0, f48) |
351 | FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10) |
352 | STORE_JUMP(o0, f48, 45f) |
353 | 2: FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10) |
354 | STORE_SYNC(o0, f48) |
355 | FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26) |
356 | STORE_JUMP(o0, f48, 53f) |
357 | 3: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26) |
358 | STORE_SYNC(o0, f48) |
359 | FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42) |
360 | STORE_JUMP(o0, f48, 61f) |
361 | |
362 | 1: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28) |
363 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) |
364 | FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44) |
365 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) |
366 | FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12) |
367 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) |
368 | ba,pt %xcc, 1b+4 |
369 | faligndata %f12, %f14, %f48 |
370 | 1: FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44) |
371 | STORE_SYNC(o0, f48) |
372 | FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12) |
373 | STORE_JUMP(o0, f48, 46f) |
374 | 2: FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12) |
375 | STORE_SYNC(o0, f48) |
376 | FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28) |
377 | STORE_JUMP(o0, f48, 54f) |
378 | 3: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28) |
379 | STORE_SYNC(o0, f48) |
380 | FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44) |
381 | STORE_JUMP(o0, f48, 62f) |
382 | |
383 | 1: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30) |
384 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) |
385 | FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46) |
386 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) |
387 | FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14) |
388 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) |
389 | ba,pt %xcc, 1b+4 |
390 | faligndata %f14, %f16, %f48 |
391 | 1: FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46) |
392 | STORE_SYNC(o0, f48) |
393 | FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14) |
394 | STORE_JUMP(o0, f48, 47f) |
395 | 2: FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14) |
396 | STORE_SYNC(o0, f48) |
397 | FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30) |
398 | STORE_JUMP(o0, f48, 55f) |
399 | 3: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30) |
400 | STORE_SYNC(o0, f48) |
401 | FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46) |
402 | STORE_JUMP(o0, f48, 63f) |
403 | |
404 | 40: FINISH_VISCHUNK(o0, f0, f2, g3) |
405 | 41: FINISH_VISCHUNK(o0, f2, f4, g3) |
406 | 42: FINISH_VISCHUNK(o0, f4, f6, g3) |
407 | 43: FINISH_VISCHUNK(o0, f6, f8, g3) |
408 | 44: FINISH_VISCHUNK(o0, f8, f10, g3) |
409 | 45: FINISH_VISCHUNK(o0, f10, f12, g3) |
410 | 46: FINISH_VISCHUNK(o0, f12, f14, g3) |
411 | 47: UNEVEN_VISCHUNK(o0, f14, f0, g3) |
412 | 48: FINISH_VISCHUNK(o0, f16, f18, g3) |
413 | 49: FINISH_VISCHUNK(o0, f18, f20, g3) |
414 | 50: FINISH_VISCHUNK(o0, f20, f22, g3) |
415 | 51: FINISH_VISCHUNK(o0, f22, f24, g3) |
416 | 52: FINISH_VISCHUNK(o0, f24, f26, g3) |
417 | 53: FINISH_VISCHUNK(o0, f26, f28, g3) |
418 | 54: FINISH_VISCHUNK(o0, f28, f30, g3) |
419 | 55: UNEVEN_VISCHUNK(o0, f30, f0, g3) |
420 | 56: FINISH_VISCHUNK(o0, f32, f34, g3) |
421 | 57: FINISH_VISCHUNK(o0, f34, f36, g3) |
422 | 58: FINISH_VISCHUNK(o0, f36, f38, g3) |
423 | 59: FINISH_VISCHUNK(o0, f38, f40, g3) |
424 | 60: FINISH_VISCHUNK(o0, f40, f42, g3) |
425 | 61: FINISH_VISCHUNK(o0, f42, f44, g3) |
426 | 62: FINISH_VISCHUNK(o0, f44, f46, g3) |
427 | 63: UNEVEN_VISCHUNK_LAST(o0, f46, f0, g3) |
428 | |
429 | 93: EX_LD(LOAD(ldd, %o1, %f2)) |
430 | add %o1, 8, %o1 |
431 | subcc %g3, 8, %g3 |
432 | faligndata %f0, %f2, %f8 |
433 | EX_ST(STORE(std, %f8, %o0)) |
434 | bl,pn %xcc, 95f |
435 | add %o0, 8, %o0 |
436 | EX_LD(LOAD(ldd, %o1, %f0)) |
437 | add %o1, 8, %o1 |
438 | subcc %g3, 8, %g3 |
439 | faligndata %f2, %f0, %f8 |
440 | EX_ST(STORE(std, %f8, %o0)) |
441 | bge,pt %xcc, 93b |
442 | add %o0, 8, %o0 |
443 | |
444 | 95: brz,pt %o2, 2f |
445 | mov %g1, %o1 |
446 | |
447 | 1: EX_LD(LOAD(ldub, %o1, %o3)) |
448 | add %o1, 1, %o1 |
449 | subcc %o2, 1, %o2 |
450 | EX_ST(STORE(stb, %o3, %o0)) |
451 | bne,pt %xcc, 1b |
452 | add %o0, 1, %o0 |
453 | |
454 | 2: membar #StoreLoad | #StoreStore |
455 | VISExit |
456 | retl |
457 | mov EX_RETVAL(%o4), %o0 |
458 | |
459 | .align 64 |
460 | 70: /* 16 < len <= (5 * 64) */ |
461 | bne,pn %XCC, 75f |
462 | sub %o0, %o1, %o3 |
463 | |
464 | 72: andn %o2, 0xf, %GLOBAL_SPARE |
465 | and %o2, 0xf, %o2 |
466 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %o5)) |
467 | EX_LD(LOAD(ldx, %o1 + 0x08, %g1)) |
468 | subcc %GLOBAL_SPARE, 0x10, %GLOBAL_SPARE |
469 | EX_ST(STORE(stx, %o5, %o1 + %o3)) |
470 | add %o1, 0x8, %o1 |
471 | EX_ST(STORE(stx, %g1, %o1 + %o3)) |
472 | bgu,pt %XCC, 1b |
473 | add %o1, 0x8, %o1 |
474 | 73: andcc %o2, 0x8, %g0 |
475 | be,pt %XCC, 1f |
476 | nop |
477 | EX_LD(LOAD(ldx, %o1, %o5)) |
478 | sub %o2, 0x8, %o2 |
479 | EX_ST(STORE(stx, %o5, %o1 + %o3)) |
480 | add %o1, 0x8, %o1 |
481 | 1: andcc %o2, 0x4, %g0 |
482 | be,pt %XCC, 1f |
483 | nop |
484 | EX_LD(LOAD(lduw, %o1, %o5)) |
485 | sub %o2, 0x4, %o2 |
486 | EX_ST(STORE(stw, %o5, %o1 + %o3)) |
487 | add %o1, 0x4, %o1 |
488 | 1: cmp %o2, 0 |
489 | be,pt %XCC, 85f |
490 | nop |
491 | ba,pt %xcc, 90f |
492 | nop |
493 | |
494 | 75: andcc %o0, 0x7, %g1 |
495 | sub %g1, 0x8, %g1 |
496 | be,pn %icc, 2f |
497 | sub %g0, %g1, %g1 |
498 | sub %o2, %g1, %o2 |
499 | |
500 | 1: EX_LD(LOAD(ldub, %o1, %o5)) |
501 | subcc %g1, 1, %g1 |
502 | EX_ST(STORE(stb, %o5, %o1 + %o3)) |
503 | bgu,pt %icc, 1b |
504 | add %o1, 1, %o1 |
505 | |
506 | 2: add %o1, %o3, %o0 |
507 | andcc %o1, 0x7, %g1 |
508 | bne,pt %icc, 8f |
509 | sll %g1, 3, %g1 |
510 | |
511 | cmp %o2, 16 |
512 | bgeu,pt %icc, 72b |
513 | nop |
514 | ba,a,pt %xcc, 73b |
515 | |
516 | 8: mov 64, %o3 |
517 | andn %o1, 0x7, %o1 |
518 | EX_LD(LOAD(ldx, %o1, %g2)) |
519 | sub %o3, %g1, %o3 |
520 | andn %o2, 0x7, %GLOBAL_SPARE |
521 | sllx %g2, %g1, %g2 |
522 | 1: EX_LD(LOAD(ldx, %o1 + 0x8, %g3)) |
523 | subcc %GLOBAL_SPARE, 0x8, %GLOBAL_SPARE |
524 | add %o1, 0x8, %o1 |
525 | srlx %g3, %o3, %o5 |
526 | or %o5, %g2, %o5 |
527 | EX_ST(STORE(stx, %o5, %o0)) |
528 | add %o0, 0x8, %o0 |
529 | bgu,pt %icc, 1b |
530 | sllx %g3, %g1, %g2 |
531 | |
532 | srl %g1, 3, %g1 |
533 | andcc %o2, 0x7, %o2 |
534 | be,pn %icc, 85f |
535 | add %o1, %g1, %o1 |
536 | ba,pt %xcc, 90f |
537 | sub %o0, %o1, %o3 |
538 | |
539 | .align 64 |
540 | 80: /* 0 < len <= 16 */ |
541 | andcc %o3, 0x3, %g0 |
542 | bne,pn %XCC, 90f |
543 | sub %o0, %o1, %o3 |
544 | |
545 | 1: EX_LD(LOAD(lduw, %o1, %g1)) |
546 | subcc %o2, 4, %o2 |
547 | EX_ST(STORE(stw, %g1, %o1 + %o3)) |
548 | bgu,pt %XCC, 1b |
549 | add %o1, 4, %o1 |
550 | |
551 | 85: retl |
552 | mov EX_RETVAL(%o4), %o0 |
553 | |
554 | .align 32 |
555 | 90: EX_LD(LOAD(ldub, %o1, %g1)) |
556 | subcc %o2, 1, %o2 |
557 | EX_ST(STORE(stb, %g1, %o1 + %o3)) |
558 | bgu,pt %XCC, 90b |
559 | add %o1, 1, %o1 |
560 | retl |
561 | mov EX_RETVAL(%o4), %o0 |
562 | |
563 | .size FUNC_NAME, .-FUNC_NAME |
564 |