Root/
1 | /* |
2 | * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu> |
3 | * Copyright (C) 2008-2009 PetaLogix |
4 | * Copyright (C) 2008 Jim Law - Iris LP All rights reserved. |
5 | * |
6 | * This file is subject to the terms and conditions of the GNU General |
7 | * Public License. See the file COPYING in the main directory of this |
8 | * archive for more details. |
9 | * |
10 | * Written by Jim Law <jlaw@irispower.com> |
11 | * |
12 | * intended to replace: |
13 | * memcpy in memcpy.c and |
14 | * memmove in memmove.c |
15 | * ... in arch/microblaze/lib |
16 | * |
17 | * |
18 | * assly_fastcopy.S |
19 | * |
20 | * Attempt at quicker memcpy and memmove for MicroBlaze |
21 | * Input : Operand1 in Reg r5 - destination address |
22 | * Operand2 in Reg r6 - source address |
23 | * Operand3 in Reg r7 - number of bytes to transfer |
24 | * Output: Result in Reg r3 - starting destination address |
25 | * |
26 | * |
27 | * Explanation: |
28 | * Perform (possibly unaligned) copy of a block of memory |
29 | * between mem locations with size of xfer spec'd in bytes |
30 | */ |
31 | |
32 | #include <linux/linkage.h> |
33 | .text |
34 | .globl memcpy |
35 | .type  memcpy, @function |
36 | .ent memcpy |
37 | |
/*
 * void *memcpy(void *d, const void *s, size_t c)  -- ascending copy
 * In:   r5 = d (destination), r6 = s (source), r7 = c (byte count)
 * Out:  r3 = original destination address
 * Uses: r4 (count/scratch), r8 (aligned src), r9-r12 (temps), r11 (carry word)
 * Return via rtsd r15, 8 (standard MicroBlaze call/return).
 * NOTE(review): per the comments below, "cmpu rd, ra, rb" yields rb - ra with
 * the MSB usable as an unsigned less-than flag, so "blti" after cmpu branches
 * when the count is below the threshold.
 * Strategy: byte-copy until d is word aligned, then 32-byte blocks (aligned
 * word moves, or shift-and-merge when s and d alignment differ), then
 * remaining words, then trailing bytes.
 */
38 | memcpy: |
39 | fast_memcpy_ascending: |
40 | /* move d to return register as value of function */ |
41 | addi r3, r5, 0 |
42 | |
43 | addi r4, r0, 4 /* n = 4 */ |
44 | cmpu r4, r4, r7 /* n = c - n (unsigned) */ |
45 | blti r4, a_xfer_end /* if n < 0, less than one word to transfer */ |
46 | |
47 | /* transfer first 0~3 bytes to get aligned dest address */ |
48 | andi r4, r5, 3 /* n = d & 3 */ |
49 | /* if zero, destination already aligned */ |
50 | beqi r4, a_dalign_done |
51 | /* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */ |
52 | rsubi r4, r4, 4 |
53 | rsub r7, r4, r7 /* c = c - n adjust c */ |
54 | |
55 | a_xfer_first_loop: |
56 | /* if no bytes left to transfer, transfer the bulk */ |
57 | beqi r4, a_dalign_done |
58 | lbui r11, r6, 0 /* h = *s */ |
59 | sbi r11, r5, 0 /* *d = h */ |
60 | addi r6, r6, 1 /* s++ */ |
61 | addi r5, r5, 1 /* d++ */ |
62 | brid a_xfer_first_loop /* loop */ |
63 | addi r4, r4, -1 /* n-- (IN DELAY SLOT) */ |
64 | |
65 | a_dalign_done: |
66 | addi r4, r0, 32 /* n = 32 */ |
67 | cmpu r4, r4, r7 /* n = c - n (unsigned) */ |
68 | /* if n < 0, less than one block to transfer */ |
69 | blti r4, a_block_done |
70 | |
71 | a_block_xfer: |
72 | andi r4, r7, 0xffffffe0 /* n = c & ~31 */ |
73 | rsub r7, r4, r7 /* c = c - n */ |
74 | |
75 | andi r9, r6, 3 /* t1 = s & 3 */ |
76 | /* if temp != 0, unaligned transfers needed */ |
77 | bnei r9, a_block_unaligned |
78 | |
79 | a_block_aligned: |
80 | lwi r9, r6, 0 /* t1 = *(s + 0) */ |
81 | lwi r10, r6, 4 /* t2 = *(s + 4) */ |
82 | lwi r11, r6, 8 /* t3 = *(s + 8) */ |
83 | lwi r12, r6, 12 /* t4 = *(s + 12) */ |
84 | swi r9, r5, 0 /* *(d + 0) = t1 */ |
85 | swi r10, r5, 4 /* *(d + 4) = t2 */ |
86 | swi r11, r5, 8 /* *(d + 8) = t3 */ |
87 | swi r12, r5, 12 /* *(d + 12) = t4 */ |
88 | lwi r9, r6, 16 /* t1 = *(s + 16) */ |
89 | lwi r10, r6, 20 /* t2 = *(s + 20) */ |
90 | lwi r11, r6, 24 /* t3 = *(s + 24) */ |
91 | lwi r12, r6, 28 /* t4 = *(s + 28) */ |
92 | swi r9, r5, 16 /* *(d + 16) = t1 */ |
93 | swi r10, r5, 20 /* *(d + 20) = t2 */ |
94 | swi r11, r5, 24 /* *(d + 24) = t3 */ |
95 | swi r12, r5, 28 /* *(d + 28) = t4 */ |
96 | addi r6, r6, 32 /* s = s + 32 */ |
97 | addi r4, r4, -32 /* n = n - 32 */ |
98 | bneid r4, a_block_aligned /* while (n) loop */ |
99 | addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */ |
100 | bri a_block_done |
101 | |
/*
 * Source is not word aligned: read from the word-aligned address "as",
 * keep the already-seen bytes in r11 ("h"), and merge each new word in
 * with shift/or.  One loop body per possible byte offset (1, 2, 3).
 */
102 | a_block_unaligned: |
103 | andi r8, r6, 0xfffffffc /* as = s & ~3 */ |
104 | add r6, r6, r4 /* s = s + n */ |
105 | lwi r11, r8, 0 /* h = *(as + 0) */ |
106 | |
107 | addi r9, r9, -1 |
108 | beqi r9, a_block_u1 /* t1 was 1 => 1 byte offset */ |
109 | addi r9, r9, -1 |
110 | beqi r9, a_block_u2 /* t1 was 2 => 2 byte offset */ |
111 | |
112 | a_block_u3: |
113 | bslli r11, r11, 24 /* h = h << 24 */ |
114 | a_bu3_loop: |
115 | lwi r12, r8, 4 /* v = *(as + 4) */ |
116 | bsrli r9, r12, 8 /* t1 = v >> 8 */ |
117 | or r9, r11, r9 /* t1 = h | t1 */ |
118 | swi r9, r5, 0 /* *(d + 0) = t1 */ |
119 | bslli r11, r12, 24 /* h = v << 24 */ |
120 | lwi r12, r8, 8 /* v = *(as + 8) */ |
121 | bsrli r9, r12, 8 /* t1 = v >> 8 */ |
122 | or r9, r11, r9 /* t1 = h | t1 */ |
123 | swi r9, r5, 4 /* *(d + 4) = t1 */ |
124 | bslli r11, r12, 24 /* h = v << 24 */ |
125 | lwi r12, r8, 12 /* v = *(as + 12) */ |
126 | bsrli r9, r12, 8 /* t1 = v >> 8 */ |
127 | or r9, r11, r9 /* t1 = h | t1 */ |
128 | swi r9, r5, 8 /* *(d + 8) = t1 */ |
129 | bslli r11, r12, 24 /* h = v << 24 */ |
130 | lwi r12, r8, 16 /* v = *(as + 16) */ |
131 | bsrli r9, r12, 8 /* t1 = v >> 8 */ |
132 | or r9, r11, r9 /* t1 = h | t1 */ |
133 | swi r9, r5, 12 /* *(d + 12) = t1 */ |
134 | bslli r11, r12, 24 /* h = v << 24 */ |
135 | lwi r12, r8, 20 /* v = *(as + 20) */ |
136 | bsrli r9, r12, 8 /* t1 = v >> 8 */ |
137 | or r9, r11, r9 /* t1 = h | t1 */ |
138 | swi r9, r5, 16 /* *(d + 16) = t1 */ |
139 | bslli r11, r12, 24 /* h = v << 24 */ |
140 | lwi r12, r8, 24 /* v = *(as + 24) */ |
141 | bsrli r9, r12, 8 /* t1 = v >> 8 */ |
142 | or r9, r11, r9 /* t1 = h | t1 */ |
143 | swi r9, r5, 20 /* *(d + 20) = t1 */ |
144 | bslli r11, r12, 24 /* h = v << 24 */ |
145 | lwi r12, r8, 28 /* v = *(as + 28) */ |
146 | bsrli r9, r12, 8 /* t1 = v >> 8 */ |
147 | or r9, r11, r9 /* t1 = h | t1 */ |
148 | swi r9, r5, 24 /* *(d + 24) = t1 */ |
149 | bslli r11, r12, 24 /* h = v << 24 */ |
150 | lwi r12, r8, 32 /* v = *(as + 32) */ |
151 | bsrli r9, r12, 8 /* t1 = v >> 8 */ |
152 | or r9, r11, r9 /* t1 = h | t1 */ |
153 | swi r9, r5, 28 /* *(d + 28) = t1 */ |
154 | bslli r11, r12, 24 /* h = v << 24 */ |
155 | addi r8, r8, 32 /* as = as + 32 */ |
156 | addi r4, r4, -32 /* n = n - 32 */ |
157 | bneid r4, a_bu3_loop /* while (n) loop */ |
158 | addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */ |
159 | bri a_block_done |
160 | |
161 | a_block_u1: |
162 | bslli r11, r11, 8 /* h = h << 8 */ |
163 | a_bu1_loop: |
164 | lwi r12, r8, 4 /* v = *(as + 4) */ |
165 | bsrli r9, r12, 24 /* t1 = v >> 24 */ |
166 | or r9, r11, r9 /* t1 = h | t1 */ |
167 | swi r9, r5, 0 /* *(d + 0) = t1 */ |
168 | bslli r11, r12, 8 /* h = v << 8 */ |
169 | lwi r12, r8, 8 /* v = *(as + 8) */ |
170 | bsrli r9, r12, 24 /* t1 = v >> 24 */ |
171 | or r9, r11, r9 /* t1 = h | t1 */ |
172 | swi r9, r5, 4 /* *(d + 4) = t1 */ |
173 | bslli r11, r12, 8 /* h = v << 8 */ |
174 | lwi r12, r8, 12 /* v = *(as + 12) */ |
175 | bsrli r9, r12, 24 /* t1 = v >> 24 */ |
176 | or r9, r11, r9 /* t1 = h | t1 */ |
177 | swi r9, r5, 8 /* *(d + 8) = t1 */ |
178 | bslli r11, r12, 8 /* h = v << 8 */ |
179 | lwi r12, r8, 16 /* v = *(as + 16) */ |
180 | bsrli r9, r12, 24 /* t1 = v >> 24 */ |
181 | or r9, r11, r9 /* t1 = h | t1 */ |
182 | swi r9, r5, 12 /* *(d + 12) = t1 */ |
183 | bslli r11, r12, 8 /* h = v << 8 */ |
184 | lwi r12, r8, 20 /* v = *(as + 20) */ |
185 | bsrli r9, r12, 24 /* t1 = v >> 24 */ |
186 | or r9, r11, r9 /* t1 = h | t1 */ |
187 | swi r9, r5, 16 /* *(d + 16) = t1 */ |
188 | bslli r11, r12, 8 /* h = v << 8 */ |
189 | lwi r12, r8, 24 /* v = *(as + 24) */ |
190 | bsrli r9, r12, 24 /* t1 = v >> 24 */ |
191 | or r9, r11, r9 /* t1 = h | t1 */ |
192 | swi r9, r5, 20 /* *(d + 20) = t1 */ |
193 | bslli r11, r12, 8 /* h = v << 8 */ |
194 | lwi r12, r8, 28 /* v = *(as + 28) */ |
195 | bsrli r9, r12, 24 /* t1 = v >> 24 */ |
196 | or r9, r11, r9 /* t1 = h | t1 */ |
197 | swi r9, r5, 24 /* *(d + 24) = t1 */ |
198 | bslli r11, r12, 8 /* h = v << 8 */ |
199 | lwi r12, r8, 32 /* v = *(as + 32) */ |
200 | bsrli r9, r12, 24 /* t1 = v >> 24 */ |
201 | or r9, r11, r9 /* t1 = h | t1 */ |
202 | swi r9, r5, 28 /* *(d + 28) = t1 */ |
203 | bslli r11, r12, 8 /* h = v << 8 */ |
204 | addi r8, r8, 32 /* as = as + 32 */ |
205 | addi r4, r4, -32 /* n = n - 32 */ |
206 | bneid r4, a_bu1_loop /* while (n) loop */ |
207 | addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */ |
208 | bri a_block_done |
209 | |
210 | a_block_u2: |
211 | bslli r11, r11, 16 /* h = h << 16 */ |
212 | a_bu2_loop: |
213 | lwi r12, r8, 4 /* v = *(as + 4) */ |
214 | bsrli r9, r12, 16 /* t1 = v >> 16 */ |
215 | or r9, r11, r9 /* t1 = h | t1 */ |
216 | swi r9, r5, 0 /* *(d + 0) = t1 */ |
217 | bslli r11, r12, 16 /* h = v << 16 */ |
218 | lwi r12, r8, 8 /* v = *(as + 8) */ |
219 | bsrli r9, r12, 16 /* t1 = v >> 16 */ |
220 | or r9, r11, r9 /* t1 = h | t1 */ |
221 | swi r9, r5, 4 /* *(d + 4) = t1 */ |
222 | bslli r11, r12, 16 /* h = v << 16 */ |
223 | lwi r12, r8, 12 /* v = *(as + 12) */ |
224 | bsrli r9, r12, 16 /* t1 = v >> 16 */ |
225 | or r9, r11, r9 /* t1 = h | t1 */ |
226 | swi r9, r5, 8 /* *(d + 8) = t1 */ |
227 | bslli r11, r12, 16 /* h = v << 16 */ |
228 | lwi r12, r8, 16 /* v = *(as + 16) */ |
229 | bsrli r9, r12, 16 /* t1 = v >> 16 */ |
230 | or r9, r11, r9 /* t1 = h | t1 */ |
231 | swi r9, r5, 12 /* *(d + 12) = t1 */ |
232 | bslli r11, r12, 16 /* h = v << 16 */ |
233 | lwi r12, r8, 20 /* v = *(as + 20) */ |
234 | bsrli r9, r12, 16 /* t1 = v >> 16 */ |
235 | or r9, r11, r9 /* t1 = h | t1 */ |
236 | swi r9, r5, 16 /* *(d + 16) = t1 */ |
237 | bslli r11, r12, 16 /* h = v << 16 */ |
238 | lwi r12, r8, 24 /* v = *(as + 24) */ |
239 | bsrli r9, r12, 16 /* t1 = v >> 16 */ |
240 | or r9, r11, r9 /* t1 = h | t1 */ |
241 | swi r9, r5, 20 /* *(d + 20) = t1 */ |
242 | bslli r11, r12, 16 /* h = v << 16 */ |
243 | lwi r12, r8, 28 /* v = *(as + 28) */ |
244 | bsrli r9, r12, 16 /* t1 = v >> 16 */ |
245 | or r9, r11, r9 /* t1 = h | t1 */ |
246 | swi r9, r5, 24 /* *(d + 24) = t1 */ |
247 | bslli r11, r12, 16 /* h = v << 16 */ |
248 | lwi r12, r8, 32 /* v = *(as + 32) */ |
249 | bsrli r9, r12, 16 /* t1 = v >> 16 */ |
250 | or r9, r11, r9 /* t1 = h | t1 */ |
251 | swi r9, r5, 28 /* *(d + 28) = t1 */ |
252 | bslli r11, r12, 16 /* h = v << 16 */ |
253 | addi r8, r8, 32 /* as = as + 32 */ |
254 | addi r4, r4, -32 /* n = n - 32 */ |
255 | bneid r4, a_bu2_loop /* while (n) loop */ |
256 | addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */ |
257 | |
258 | a_block_done: |
259 | addi r4, r0, 4 /* n = 4 */ |
260 | cmpu r4, r4, r7 /* n = c - n (unsigned) */ |
261 | blti r4, a_xfer_end /* if n < 0, less than one word to transfer */ |
262 | |
263 | a_word_xfer: |
264 | andi r4, r7, 0xfffffffc /* n = c & ~3 */ |
265 | addi r10, r0, 0 /* offset = 0 */ |
266 | |
267 | andi r9, r6, 3 /* t1 = s & 3 */ |
268 | /* if temp != 0, unaligned transfers needed */ |
269 | bnei r9, a_word_unaligned |
270 | |
271 | a_word_aligned: |
272 | lw r9, r6, r10 /* t1 = *(s+offset) */ |
273 | sw r9, r5, r10 /* *(d+offset) = t1 */ |
274 | addi r4, r4,-4 /* n-- */ |
275 | bneid r4, a_word_aligned /* loop */ |
276 | addi r10, r10, 4 /* offset++ (IN DELAY SLOT) */ |
277 | |
278 | bri a_word_done |
279 | |
280 | a_word_unaligned: |
281 | andi r8, r6, 0xfffffffc /* as = s & ~3 */ |
282 | lwi r11, r8, 0 /* h = *(as + 0) */ |
283 | addi r8, r8, 4 /* as = as + 4 */ |
284 | |
285 | addi r9, r9, -1 |
286 | beqi r9, a_word_u1 /* t1 was 1 => 1 byte offset */ |
287 | addi r9, r9, -1 |
288 | beqi r9, a_word_u2 /* t1 was 2 => 2 byte offset */ |
289 | |
290 | a_word_u3: |
291 | bslli r11, r11, 24 /* h = h << 24 */ |
292 | a_wu3_loop: |
293 | lw r12, r8, r10 /* v = *(as + offset) */ |
294 | bsrli r9, r12, 8 /* t1 = v >> 8 */ |
295 | or r9, r11, r9 /* t1 = h | t1 */ |
296 | sw r9, r5, r10 /* *(d + offset) = t1 */ |
297 | bslli r11, r12, 24 /* h = v << 24 */ |
298 | addi r4, r4,-4 /* n = n - 4 */ |
299 | bneid r4, a_wu3_loop /* while (n) loop */ |
300 | addi r10, r10, 4 /* offset = offset + 4 (IN DELAY SLOT) */ |
301 | |
302 | bri a_word_done |
303 | |
304 | a_word_u1: |
305 | bslli r11, r11, 8 /* h = h << 8 */ |
306 | a_wu1_loop: |
307 | lw r12, r8, r10 /* v = *(as + offset) */ |
308 | bsrli r9, r12, 24 /* t1 = v >> 24 */ |
309 | or r9, r11, r9 /* t1 = h | t1 */ |
310 | sw r9, r5, r10 /* *(d + offset) = t1 */ |
311 | bslli r11, r12, 8 /* h = v << 8 */ |
312 | addi r4, r4,-4 /* n = n - 4 */ |
313 | bneid r4, a_wu1_loop /* while (n) loop */ |
314 | addi r10, r10, 4 /* offset = offset + 4 (IN DELAY SLOT) */ |
315 | |
316 | bri a_word_done |
317 | |
318 | a_word_u2: |
319 | bslli r11, r11, 16 /* h = h << 16 */ |
320 | a_wu2_loop: |
321 | lw r12, r8, r10 /* v = *(as + offset) */ |
322 | bsrli r9, r12, 16 /* t1 = v >> 16 */ |
323 | or r9, r11, r9 /* t1 = h | t1 */ |
324 | sw r9, r5, r10 /* *(d + offset) = t1 */ |
325 | bslli r11, r12, 16 /* h = v << 16 */ |
326 | addi r4, r4,-4 /* n = n - 4 */ |
327 | bneid r4, a_wu2_loop /* while (n) loop */ |
328 | addi r10, r10, 4 /* offset = offset + 4 (IN DELAY SLOT) */ |
329 | |
330 | a_word_done: |
331 | add r5, r5, r10 /* d = d + offset */ |
332 | add r6, r6, r10 /* s = s + offset */ |
333 | rsub r7, r10, r7 /* c = c - offset */ |
334 | |
/* Copy the trailing 0..3 bytes one at a time. */
335 | a_xfer_end: |
336 | a_xfer_end_loop: |
337 | beqi r7, a_done /* while (c) */ |
338 | lbui r9, r6, 0 /* t1 = *s */ |
339 | addi r6, r6, 1 /* s++ */ |
340 | sbi r9, r5, 0 /* *d = t1 */ |
341 | addi r7, r7, -1 /* c-- */ |
342 | brid a_xfer_end_loop /* loop */ |
343 | addi r5, r5, 1 /* d++ (IN DELAY SLOT) */ |
344 | |
345 | a_done: |
346 | rtsd r15, 8 |
347 | nop |
348 | |
349 | .size memcpy, . - memcpy |
350 | .end memcpy |
351 | /*----------------------------------------------------------------------------*/ |
352 | .globl memmove |
353 | .type  memmove, @function |
354 | .ent memmove |
355 | |
/*
 * void *memmove(void *d, const void *s, size_t c)
 * In:   r5 = d (destination), r6 = s (source), r7 = c (byte count)
 * Out:  r3 = original destination address
 * Dispatch: computes s - d (see "n = s - d" below); when that is >= 0
 * an ascending copy cannot overwrite unread source bytes, so it jumps
 * to memcpy's fast_memcpy_ascending entry.  Otherwise it copies
 * descending (d and s are first advanced past the end by c), which is
 * the mirror image of the ascending path: align d, 32-byte blocks
 * (aligned or shift-and-merge), remaining words, trailing bytes.
 */
356 | memmove: |
357 | cmpu r4, r5, r6 /* n = s - d */ |
358 | bgei r4,fast_memcpy_ascending |
359 | |
360 | fast_memcpy_descending: |
361 | /* move d to return register as value of function */ |
362 | addi r3, r5, 0 |
363 | |
364 | add r5, r5, r7 /* d = d + c */ |
365 | add r6, r6, r7 /* s = s + c */ |
366 | |
367 | addi r4, r0, 4 /* n = 4 */ |
368 | cmpu r4, r4, r7 /* n = c - n (unsigned) */ |
369 | blti r4,d_xfer_end /* if n < 0, less than one word to transfer */ |
370 | |
371 | /* transfer first 0~3 bytes to get aligned dest address */ |
372 | andi r4, r5, 3 /* n = d & 3 */ |
373 | /* if zero, destination already aligned */ |
374 | beqi r4,d_dalign_done |
375 | rsub r7, r4, r7 /* c = c - n adjust c */ |
376 | |
377 | d_xfer_first_loop: |
378 | /* if no bytes left to transfer, transfer the bulk */ |
379 | beqi r4,d_dalign_done |
380 | addi r6, r6, -1 /* s-- */ |
381 | addi r5, r5, -1 /* d-- */ |
382 | lbui r11, r6, 0 /* h = *s */ |
383 | sbi r11, r5, 0 /* *d = h */ |
384 | brid d_xfer_first_loop /* loop */ |
385 | addi r4, r4, -1 /* n-- (IN DELAY SLOT) */ |
386 | |
387 | d_dalign_done: |
388 | addi r4, r0, 32 /* n = 32 */ |
389 | cmpu r4, r4, r7 /* n = c - n (unsigned) */ |
390 | /* if n < 0, less than one block to transfer */ |
391 | blti r4, d_block_done |
392 | |
393 | d_block_xfer: |
394 | andi r4, r7, 0xffffffe0 /* n = c & ~31 */ |
395 | rsub r7, r4, r7 /* c = c - n */ |
396 | |
397 | andi r9, r6, 3 /* t1 = s & 3 */ |
398 | /* if temp != 0, unaligned transfers needed */ |
399 | bnei r9, d_block_unaligned |
400 | |
401 | d_block_aligned: |
402 | addi r6, r6, -32 /* s = s - 32 */ |
403 | addi r5, r5, -32 /* d = d - 32 */ |
404 | lwi r9, r6, 28 /* t1 = *(s + 28) */ |
405 | lwi r10, r6, 24 /* t2 = *(s + 24) */ |
406 | lwi r11, r6, 20 /* t3 = *(s + 20) */ |
407 | lwi r12, r6, 16 /* t4 = *(s + 16) */ |
408 | swi r9, r5, 28 /* *(d + 28) = t1 */ |
409 | swi r10, r5, 24 /* *(d + 24) = t2 */ |
410 | swi r11, r5, 20 /* *(d + 20) = t3 */ |
411 | swi r12, r5, 16 /* *(d + 16) = t4 */ |
412 | lwi r9, r6, 12 /* t1 = *(s + 12) */ |
413 | lwi r10, r6, 8 /* t2 = *(s + 8) */ |
414 | lwi r11, r6, 4 /* t3 = *(s + 4) */ |
415 | lwi r12, r6, 0 /* t4 = *(s + 0) */ |
416 | swi r9, r5, 12 /* *(d + 12) = t1 */ |
417 | swi r10, r5, 8 /* *(d + 8) = t2 */ |
418 | swi r11, r5, 4 /* *(d + 4) = t3 */ |
419 | addi r4, r4, -32 /* n = n - 32 */ |
420 | bneid r4, d_block_aligned /* while (n) loop */ |
421 | swi r12, r5, 0 /* *(d + 0) = t4 (IN DELAY SLOT) */ |
422 | bri d_block_done |
423 | |
/*
 * Source is not word aligned: read downward from the word-aligned
 * address "as", keep the already-seen bytes in r11 ("h"), and merge
 * each new word in with shift/or.  One loop per byte offset (1, 2, 3).
 */
424 | d_block_unaligned: |
425 | andi r8, r6, 0xfffffffc /* as = s & ~3 */ |
426 | rsub r6, r4, r6 /* s = s - n */ |
427 | lwi r11, r8, 0 /* h = *(as + 0) */ |
428 | |
429 | addi r9, r9, -1 |
430 | beqi r9,d_block_u1 /* t1 was 1 => 1 byte offset */ |
431 | addi r9, r9, -1 |
432 | beqi r9,d_block_u2 /* t1 was 2 => 2 byte offset */ |
433 | |
434 | d_block_u3: |
435 | bsrli r11, r11, 8 /* h = h >> 8 */ |
436 | d_bu3_loop: |
437 | addi r8, r8, -32 /* as = as - 32 */ |
438 | addi r5, r5, -32 /* d = d - 32 */ |
439 | lwi r12, r8, 28 /* v = *(as + 28) */ |
440 | bslli r9, r12, 24 /* t1 = v << 24 */ |
441 | or r9, r11, r9 /* t1 = h | t1 */ |
442 | swi r9, r5, 28 /* *(d + 28) = t1 */ |
443 | bsrli r11, r12, 8 /* h = v >> 8 */ |
444 | lwi r12, r8, 24 /* v = *(as + 24) */ |
445 | bslli r9, r12, 24 /* t1 = v << 24 */ |
446 | or r9, r11, r9 /* t1 = h | t1 */ |
447 | swi r9, r5, 24 /* *(d + 24) = t1 */ |
448 | bsrli r11, r12, 8 /* h = v >> 8 */ |
449 | lwi r12, r8, 20 /* v = *(as + 20) */ |
450 | bslli r9, r12, 24 /* t1 = v << 24 */ |
451 | or r9, r11, r9 /* t1 = h | t1 */ |
452 | swi r9, r5, 20 /* *(d + 20) = t1 */ |
453 | bsrli r11, r12, 8 /* h = v >> 8 */ |
454 | lwi r12, r8, 16 /* v = *(as + 16) */ |
455 | bslli r9, r12, 24 /* t1 = v << 24 */ |
456 | or r9, r11, r9 /* t1 = h | t1 */ |
457 | swi r9, r5, 16 /* *(d + 16) = t1 */ |
458 | bsrli r11, r12, 8 /* h = v >> 8 */ |
459 | lwi r12, r8, 12 /* v = *(as + 12) */ |
460 | bslli r9, r12, 24 /* t1 = v << 24 */ |
461 | or r9, r11, r9 /* t1 = h | t1 */ |
462 | swi r9, r5, 12 /* *(d + 12) = t1 */ |
463 | bsrli r11, r12, 8 /* h = v >> 8 */ |
464 | lwi r12, r8, 8 /* v = *(as + 8) */ |
465 | bslli r9, r12, 24 /* t1 = v << 24 */ |
466 | or r9, r11, r9 /* t1 = h | t1 */ |
467 | swi r9, r5, 8 /* *(d + 8) = t1 */ |
468 | bsrli r11, r12, 8 /* h = v >> 8 */ |
469 | lwi r12, r8, 4 /* v = *(as + 4) */ |
470 | bslli r9, r12, 24 /* t1 = v << 24 */ |
471 | or r9, r11, r9 /* t1 = h | t1 */ |
472 | swi r9, r5, 4 /* *(d + 4) = t1 */ |
473 | bsrli r11, r12, 8 /* h = v >> 8 */ |
474 | lwi r12, r8, 0 /* v = *(as + 0) */ |
475 | bslli r9, r12, 24 /* t1 = v << 24 */ |
476 | or r9, r11, r9 /* t1 = h | t1 */ |
477 | swi r9, r5, 0 /* *(d + 0) = t1 */ |
478 | addi r4, r4, -32 /* n = n - 32 */ |
479 | bneid r4, d_bu3_loop /* while (n) loop */ |
480 | bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */ |
481 | bri d_block_done |
482 | |
483 | d_block_u1: |
484 | bsrli r11, r11, 24 /* h = h >> 24 */ |
485 | d_bu1_loop: |
486 | addi r8, r8, -32 /* as = as - 32 */ |
487 | addi r5, r5, -32 /* d = d - 32 */ |
488 | lwi r12, r8, 28 /* v = *(as + 28) */ |
489 | bslli r9, r12, 8 /* t1 = v << 8 */ |
490 | or r9, r11, r9 /* t1 = h | t1 */ |
491 | swi r9, r5, 28 /* *(d + 28) = t1 */ |
492 | bsrli r11, r12, 24 /* h = v >> 24 */ |
493 | lwi r12, r8, 24 /* v = *(as + 24) */ |
494 | bslli r9, r12, 8 /* t1 = v << 8 */ |
495 | or r9, r11, r9 /* t1 = h | t1 */ |
496 | swi r9, r5, 24 /* *(d + 24) = t1 */ |
497 | bsrli r11, r12, 24 /* h = v >> 24 */ |
498 | lwi r12, r8, 20 /* v = *(as + 20) */ |
499 | bslli r9, r12, 8 /* t1 = v << 8 */ |
500 | or r9, r11, r9 /* t1 = h | t1 */ |
501 | swi r9, r5, 20 /* *(d + 20) = t1 */ |
502 | bsrli r11, r12, 24 /* h = v >> 24 */ |
503 | lwi r12, r8, 16 /* v = *(as + 16) */ |
504 | bslli r9, r12, 8 /* t1 = v << 8 */ |
505 | or r9, r11, r9 /* t1 = h | t1 */ |
506 | swi r9, r5, 16 /* *(d + 16) = t1 */ |
507 | bsrli r11, r12, 24 /* h = v >> 24 */ |
508 | lwi r12, r8, 12 /* v = *(as + 12) */ |
509 | bslli r9, r12, 8 /* t1 = v << 8 */ |
510 | or r9, r11, r9 /* t1 = h | t1 */ |
511 | swi r9, r5, 12 /* *(d + 12) = t1 */ |
512 | bsrli r11, r12, 24 /* h = v >> 24 */ |
513 | lwi r12, r8, 8 /* v = *(as + 8) */ |
514 | bslli r9, r12, 8 /* t1 = v << 8 */ |
515 | or r9, r11, r9 /* t1 = h | t1 */ |
516 | swi r9, r5, 8 /* *(d + 8) = t1 */ |
517 | bsrli r11, r12, 24 /* h = v >> 24 */ |
518 | lwi r12, r8, 4 /* v = *(as + 4) */ |
519 | bslli r9, r12, 8 /* t1 = v << 8 */ |
520 | or r9, r11, r9 /* t1 = h | t1 */ |
521 | swi r9, r5, 4 /* *(d + 4) = t1 */ |
522 | bsrli r11, r12, 24 /* h = v >> 24 */ |
523 | lwi r12, r8, 0 /* v = *(as + 0) */ |
524 | bslli r9, r12, 8 /* t1 = v << 8 */ |
525 | or r9, r11, r9 /* t1 = h | t1 */ |
526 | swi r9, r5, 0 /* *(d + 0) = t1 */ |
527 | addi r4, r4, -32 /* n = n - 32 */ |
528 | bneid r4, d_bu1_loop /* while (n) loop */ |
529 | bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */ |
530 | bri d_block_done |
531 | |
532 | d_block_u2: |
533 | bsrli r11, r11, 16 /* h = h >> 16 */ |
534 | d_bu2_loop: |
535 | addi r8, r8, -32 /* as = as - 32 */ |
536 | addi r5, r5, -32 /* d = d - 32 */ |
537 | lwi r12, r8, 28 /* v = *(as + 28) */ |
538 | bslli r9, r12, 16 /* t1 = v << 16 */ |
539 | or r9, r11, r9 /* t1 = h | t1 */ |
540 | swi r9, r5, 28 /* *(d + 28) = t1 */ |
541 | bsrli r11, r12, 16 /* h = v >> 16 */ |
542 | lwi r12, r8, 24 /* v = *(as + 24) */ |
543 | bslli r9, r12, 16 /* t1 = v << 16 */ |
544 | or r9, r11, r9 /* t1 = h | t1 */ |
545 | swi r9, r5, 24 /* *(d + 24) = t1 */ |
546 | bsrli r11, r12, 16 /* h = v >> 16 */ |
547 | lwi r12, r8, 20 /* v = *(as + 20) */ |
548 | bslli r9, r12, 16 /* t1 = v << 16 */ |
549 | or r9, r11, r9 /* t1 = h | t1 */ |
550 | swi r9, r5, 20 /* *(d + 20) = t1 */ |
551 | bsrli r11, r12, 16 /* h = v >> 16 */ |
552 | lwi r12, r8, 16 /* v = *(as + 16) */ |
553 | bslli r9, r12, 16 /* t1 = v << 16 */ |
554 | or r9, r11, r9 /* t1 = h | t1 */ |
555 | swi r9, r5, 16 /* *(d + 16) = t1 */ |
556 | bsrli r11, r12, 16 /* h = v >> 16 */ |
557 | lwi r12, r8, 12 /* v = *(as + 12) */ |
558 | bslli r9, r12, 16 /* t1 = v << 16 */ |
559 | or r9, r11, r9 /* t1 = h | t1 */ |
560 | swi r9, r5, 12 /* *(d + 12) = t1 */ |
561 | bsrli r11, r12, 16 /* h = v >> 16 */ |
562 | lwi r12, r8, 8 /* v = *(as + 8) */ |
563 | bslli r9, r12, 16 /* t1 = v << 16 */ |
564 | or r9, r11, r9 /* t1 = h | t1 */ |
565 | swi r9, r5, 8 /* *(d + 8) = t1 */ |
566 | bsrli r11, r12, 16 /* h = v >> 16 */ |
567 | lwi r12, r8, 4 /* v = *(as + 4) */ |
568 | bslli r9, r12, 16 /* t1 = v << 16 */ |
569 | or r9, r11, r9 /* t1 = h | t1 */ |
570 | swi r9, r5, 4 /* *(d + 4) = t1 */ |
571 | bsrli r11, r12, 16 /* h = v >> 16 */ |
572 | lwi r12, r8, 0 /* v = *(as + 0) */ |
573 | bslli r9, r12, 16 /* t1 = v << 16 */ |
574 | or r9, r11, r9 /* t1 = h | t1 */ |
575 | swi r9, r5, 0 /* *(d + 0) = t1 */ |
576 | addi r4, r4, -32 /* n = n - 32 */ |
577 | bneid r4, d_bu2_loop /* while (n) loop */ |
578 | bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */ |
579 | |
580 | d_block_done: |
581 | addi r4, r0, 4 /* n = 4 */ |
582 | cmpu r4, r4, r7 /* n = c - n (unsigned) */ |
583 | blti r4,d_xfer_end /* if n < 0, less than one word to transfer */ |
584 | |
585 | d_word_xfer: |
586 | andi r4, r7, 0xfffffffc /* n = c & ~3 */ |
587 | rsub r5, r4, r5 /* d = d - n */ |
588 | rsub r6, r4, r6 /* s = s - n */ |
589 | rsub r7, r4, r7 /* c = c - n */ |
590 | |
591 | andi r9, r6, 3 /* t1 = s & 3 */ |
592 | /* if temp != 0, unaligned transfers needed */ |
593 | bnei r9, d_word_unaligned |
594 | |
595 | d_word_aligned: |
596 | addi r4, r4,-4 /* n-- */ |
597 | lw r9, r6, r4 /* t1 = *(s+n) */ |
598 | bneid r4, d_word_aligned /* loop */ |
599 | sw r9, r5, r4 /* *(d+n) = t1 (IN DELAY SLOT) */ |
600 | |
601 | bri d_word_done |
602 | |
603 | d_word_unaligned: |
604 | andi r8, r6, 0xfffffffc /* as = s & ~3 */ |
605 | lw r11, r8, r4 /* h = *(as + n) */ |
606 | |
607 | addi r9, r9, -1 |
608 | beqi r9,d_word_u1 /* t1 was 1 => 1 byte offset */ |
609 | addi r9, r9, -1 |
610 | beqi r9,d_word_u2 /* t1 was 2 => 2 byte offset */ |
611 | |
612 | d_word_u3: |
613 | bsrli r11, r11, 8 /* h = h >> 8 */ |
614 | d_wu3_loop: |
615 | addi r4, r4,-4 /* n = n - 4 */ |
616 | lw r12, r8, r4 /* v = *(as + n) */ |
617 | bslli r9, r12, 24 /* t1 = v << 24 */ |
618 | or r9, r11, r9 /* t1 = h | t1 */ |
619 | sw r9, r5, r4 /* *(d + n) = t1 */ |
620 | bneid r4, d_wu3_loop /* while (n) loop */ |
621 | bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */ |
622 | |
623 | bri d_word_done |
624 | |
625 | d_word_u1: |
626 | bsrli r11, r11, 24 /* h = h >> 24 */ |
627 | d_wu1_loop: |
628 | addi r4, r4,-4 /* n = n - 4 */ |
629 | lw r12, r8, r4 /* v = *(as + n) */ |
630 | bslli r9, r12, 8 /* t1 = v << 8 */ |
631 | or r9, r11, r9 /* t1 = h | t1 */ |
632 | sw r9, r5, r4 /* *(d + n) = t1 */ |
633 | bneid r4, d_wu1_loop /* while (n) loop */ |
634 | bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */ |
635 | |
636 | bri d_word_done |
637 | |
638 | d_word_u2: |
639 | bsrli r11, r11, 16 /* h = h >> 16 */ |
640 | d_wu2_loop: |
641 | addi r4, r4,-4 /* n = n - 4 */ |
642 | lw r12, r8, r4 /* v = *(as + n) */ |
643 | bslli r9, r12, 16 /* t1 = v << 16 */ |
644 | or r9, r11, r9 /* t1 = h | t1 */ |
645 | sw r9, r5, r4 /* *(d + n) = t1 */ |
646 | bneid r4, d_wu2_loop /* while (n) loop */ |
647 | bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */ |
648 | |
649 | d_word_done: |
650 | |
/* Copy the remaining 0..3 bytes one at a time, descending.
 * NOTE(review): this branches to a_done (memcpy's epilogue) rather than
 * d_done below, which is therefore unreachable -- both are identical
 * rtsd/nop sequences, so behavior is unaffected. */
651 | d_xfer_end: |
652 | d_xfer_end_loop: |
653 | beqi r7, a_done /* while (c) */ |
654 | addi r6, r6, -1 /* s-- */ |
655 | lbui r9, r6, 0 /* t1 = *s */ |
656 | addi r5, r5, -1 /* d-- */ |
657 | sbi r9, r5, 0 /* *d = t1 */ |
658 | brid d_xfer_end_loop /* loop */ |
659 | addi r7, r7, -1 /* c-- (IN DELAY SLOT) */ |
660 | |
661 | d_done: |
662 | rtsd r15, 8 |
663 | nop |
664 | |
665 | .size memmove, . - memmove |
666 | .end memmove |
667 |
Branches:
ben-wpan
ben-wpan-stefan
javiroman/ks7010
jz-2.6.34
jz-2.6.34-rc5
jz-2.6.34-rc6
jz-2.6.34-rc7
jz-2.6.35
jz-2.6.36
jz-2.6.37
jz-2.6.38
jz-2.6.39
jz-3.0
jz-3.1
jz-3.11
jz-3.12
jz-3.13
jz-3.15
jz-3.16
jz-3.18-dt
jz-3.2
jz-3.3
jz-3.4
jz-3.5
jz-3.6
jz-3.6-rc2-pwm
jz-3.9
jz-3.9-clk
jz-3.9-rc8
jz47xx
jz47xx-2.6.38
master
Tags:
od-2011-09-04
od-2011-09-18
v2.6.34-rc5
v2.6.34-rc6
v2.6.34-rc7
v3.9