/* arch/microblaze/lib/fastcopy.S */

/*
 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
 * Copyright (C) 2008-2009 PetaLogix
 * Copyright (C) 2008 Jim Law - Iris LP All rights reserved.
 *
 * This file is subject to the terms and conditions of the GNU General
 * Public License. See the file COPYING in the main directory of this
 * archive for more details.
 *
 * Written by Jim Law <jlaw@irispower.com>
 *
 * intended to replace:
 *	memcpy in memcpy.c and
 *	memmove in memmove.c
 *	... in arch/microblaze/lib
 *
 *
 * assly_fastcopy.S
 *
 * Attempt at quicker memcpy and memmove for MicroBlaze
 *	Input : Operand1 in Reg r5 - destination address
 *		Operand2 in Reg r6 - source address
 *		Operand3 in Reg r7 - number of bytes to transfer
 *	Output: Result in Reg r3 - starting destination address
 *
 *
 * Explanation:
 *	Perform (possibly unaligned) copy of a block of memory
 *	between mem locations with size of xfer spec'd in bytes
 */

#include <linux/linkage.h>
    .text
    .globl memcpy
    .type memcpy, @function
    .ent memcpy

/*
 * void *memcpy(void *d, const void *s, size_t c)
 *
 * In:   r5 = d (destination), r6 = s (source), r7 = c (byte count)
 * Out:  r3 = starting destination address (function return value)
 * Uses: r4 = n (work counter), r8 = as (word-aligned source pointer),
 *       r9-r12 = t1-t4 scratch; r11 doubles as h, the carried partial
 *       word in the unaligned-source loops; r10 = offset in word loops
 *
 * Strategy: byte-copy 0-3 bytes until the destination is word aligned,
 * then move 32-byte blocks, then whole words, then trailing bytes.
 * fast_memcpy_ascending is also the non-overlapping (s >= d) path of
 * memmove below, so the label must stay visible to it.
 */
memcpy:
fast_memcpy_ascending:
    /* move d to return register as value of function */
    addi r3, r5, 0

    addi r4, r0, 4 /* n = 4 */
    cmpu r4, r4, r7 /* n = c - n (unsigned) */
    blti r4, a_xfer_end /* if n < 0, less than one word to transfer */

    /* transfer first 0~3 bytes to get aligned dest address */
    andi r4, r5, 3 /* n = d & 3 */
    /* if zero, destination already aligned */
    beqi r4, a_dalign_done
    /* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
    rsubi r4, r4, 4
    rsub r7, r4, r7 /* c = c - n adjust c */

a_xfer_first_loop:
    /* if no bytes left to transfer, transfer the bulk */
    beqi r4, a_dalign_done
    lbui r11, r6, 0 /* h = *s */
    sbi r11, r5, 0 /* *d = h */
    addi r6, r6, 1 /* s++ */
    addi r5, r5, 1 /* d++ */
    brid a_xfer_first_loop /* loop */
    addi r4, r4, -1 /* n-- (IN DELAY SLOT) */

a_dalign_done:
    addi r4, r0, 32 /* n = 32 */
    cmpu r4, r4, r7 /* n = c - n (unsigned) */
    /* if n < 0, less than one block to transfer */
    blti r4, a_block_done

a_block_xfer:
    andi r4, r7, 0xffffffe0 /* n = c & ~31 */
    rsub r7, r4, r7 /* c = c - n */

    andi r9, r6, 3 /* t1 = s & 3 */
    /* if temp != 0, unaligned transfers needed */
    bnei r9, a_block_unaligned

a_block_aligned:
    /* both pointers word aligned: move 32 bytes per iteration, four
     * loads then four stores to keep the pipeline busy */
    lwi r9, r6, 0 /* t1 = *(s + 0) */
    lwi r10, r6, 4 /* t2 = *(s + 4) */
    lwi r11, r6, 8 /* t3 = *(s + 8) */
    lwi r12, r6, 12 /* t4 = *(s + 12) */
    swi r9, r5, 0 /* *(d + 0) = t1 */
    swi r10, r5, 4 /* *(d + 4) = t2 */
    swi r11, r5, 8 /* *(d + 8) = t3 */
    swi r12, r5, 12 /* *(d + 12) = t4 */
    lwi r9, r6, 16 /* t1 = *(s + 16) */
    lwi r10, r6, 20 /* t2 = *(s + 20) */
    lwi r11, r6, 24 /* t3 = *(s + 24) */
    lwi r12, r6, 28 /* t4 = *(s + 28) */
    swi r9, r5, 16 /* *(d + 16) = t1 */
    swi r10, r5, 20 /* *(d + 20) = t2 */
    swi r11, r5, 24 /* *(d + 24) = t3 */
    swi r12, r5, 28 /* *(d + 28) = t4 */
    addi r6, r6, 32 /* s = s + 32 */
    addi r4, r4, -32 /* n = n - 32 */
    bneid r4, a_block_aligned /* while (n) loop */
    addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
    bri a_block_done

a_block_unaligned:
    /* source is 1-3 bytes off word alignment: read aligned words from
     * as and stitch each output word from the carried high part h and
     * the next word's low part */
    andi r8, r6, 0xfffffffc /* as = s & ~3 */
    add r6, r6, r4 /* s = s + n */
    lwi r11, r8, 0 /* h = *(as + 0) */

    addi r9, r9, -1
    beqi r9, a_block_u1 /* t1 was 1 => 1 byte offset */
    addi r9, r9, -1
    beqi r9, a_block_u2 /* t1 was 2 => 2 byte offset */

a_block_u3:
    bslli r11, r11, 24 /* h = h << 24 */
a_bu3_loop:
    lwi r12, r8, 4 /* v = *(as + 4) */
    bsrli r9, r12, 8 /* t1 = v >> 8 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 0 /* *(d + 0) = t1 */
    bslli r11, r12, 24 /* h = v << 24 */
    lwi r12, r8, 8 /* v = *(as + 8) */
    bsrli r9, r12, 8 /* t1 = v >> 8 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 4 /* *(d + 4) = t1 */
    bslli r11, r12, 24 /* h = v << 24 */
    lwi r12, r8, 12 /* v = *(as + 12) */
    bsrli r9, r12, 8 /* t1 = v >> 8 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 8 /* *(d + 8) = t1 */
    bslli r11, r12, 24 /* h = v << 24 */
    lwi r12, r8, 16 /* v = *(as + 16) */
    bsrli r9, r12, 8 /* t1 = v >> 8 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 12 /* *(d + 12) = t1 */
    bslli r11, r12, 24 /* h = v << 24 */
    lwi r12, r8, 20 /* v = *(as + 20) */
    bsrli r9, r12, 8 /* t1 = v >> 8 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 16 /* *(d + 16) = t1 */
    bslli r11, r12, 24 /* h = v << 24 */
    lwi r12, r8, 24 /* v = *(as + 24) */
    bsrli r9, r12, 8 /* t1 = v >> 8 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 20 /* *(d + 20) = t1 */
    bslli r11, r12, 24 /* h = v << 24 */
    lwi r12, r8, 28 /* v = *(as + 28) */
    bsrli r9, r12, 8 /* t1 = v >> 8 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 24 /* *(d + 24) = t1 */
    bslli r11, r12, 24 /* h = v << 24 */
    lwi r12, r8, 32 /* v = *(as + 32) */
    bsrli r9, r12, 8 /* t1 = v >> 8 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 28 /* *(d + 28) = t1 */
    bslli r11, r12, 24 /* h = v << 24 */
    addi r8, r8, 32 /* as = as + 32 */
    addi r4, r4, -32 /* n = n - 32 */
    bneid r4, a_bu3_loop /* while (n) loop */
    addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
    bri a_block_done

a_block_u1:
    bslli r11, r11, 8 /* h = h << 8 */
a_bu1_loop:
    lwi r12, r8, 4 /* v = *(as + 4) */
    bsrli r9, r12, 24 /* t1 = v >> 24 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 0 /* *(d + 0) = t1 */
    bslli r11, r12, 8 /* h = v << 8 */
    lwi r12, r8, 8 /* v = *(as + 8) */
    bsrli r9, r12, 24 /* t1 = v >> 24 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 4 /* *(d + 4) = t1 */
    bslli r11, r12, 8 /* h = v << 8 */
    lwi r12, r8, 12 /* v = *(as + 12) */
    bsrli r9, r12, 24 /* t1 = v >> 24 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 8 /* *(d + 8) = t1 */
    bslli r11, r12, 8 /* h = v << 8 */
    lwi r12, r8, 16 /* v = *(as + 16) */
    bsrli r9, r12, 24 /* t1 = v >> 24 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 12 /* *(d + 12) = t1 */
    bslli r11, r12, 8 /* h = v << 8 */
    lwi r12, r8, 20 /* v = *(as + 20) */
    bsrli r9, r12, 24 /* t1 = v >> 24 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 16 /* *(d + 16) = t1 */
    bslli r11, r12, 8 /* h = v << 8 */
    lwi r12, r8, 24 /* v = *(as + 24) */
    bsrli r9, r12, 24 /* t1 = v >> 24 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 20 /* *(d + 20) = t1 */
    bslli r11, r12, 8 /* h = v << 8 */
    lwi r12, r8, 28 /* v = *(as + 28) */
    bsrli r9, r12, 24 /* t1 = v >> 24 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 24 /* *(d + 24) = t1 */
    bslli r11, r12, 8 /* h = v << 8 */
    lwi r12, r8, 32 /* v = *(as + 32) */
    bsrli r9, r12, 24 /* t1 = v >> 24 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 28 /* *(d + 28) = t1 */
    bslli r11, r12, 8 /* h = v << 8 */
    addi r8, r8, 32 /* as = as + 32 */
    addi r4, r4, -32 /* n = n - 32 */
    bneid r4, a_bu1_loop /* while (n) loop */
    addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
    bri a_block_done

a_block_u2:
    bslli r11, r11, 16 /* h = h << 16 */
a_bu2_loop:
    lwi r12, r8, 4 /* v = *(as + 4) */
    bsrli r9, r12, 16 /* t1 = v >> 16 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 0 /* *(d + 0) = t1 */
    bslli r11, r12, 16 /* h = v << 16 */
    lwi r12, r8, 8 /* v = *(as + 8) */
    bsrli r9, r12, 16 /* t1 = v >> 16 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 4 /* *(d + 4) = t1 */
    bslli r11, r12, 16 /* h = v << 16 */
    lwi r12, r8, 12 /* v = *(as + 12) */
    bsrli r9, r12, 16 /* t1 = v >> 16 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 8 /* *(d + 8) = t1 */
    bslli r11, r12, 16 /* h = v << 16 */
    lwi r12, r8, 16 /* v = *(as + 16) */
    bsrli r9, r12, 16 /* t1 = v >> 16 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 12 /* *(d + 12) = t1 */
    bslli r11, r12, 16 /* h = v << 16 */
    lwi r12, r8, 20 /* v = *(as + 20) */
    bsrli r9, r12, 16 /* t1 = v >> 16 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 16 /* *(d + 16) = t1 */
    bslli r11, r12, 16 /* h = v << 16 */
    lwi r12, r8, 24 /* v = *(as + 24) */
    bsrli r9, r12, 16 /* t1 = v >> 16 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 20 /* *(d + 20) = t1 */
    bslli r11, r12, 16 /* h = v << 16 */
    lwi r12, r8, 28 /* v = *(as + 28) */
    bsrli r9, r12, 16 /* t1 = v >> 16 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 24 /* *(d + 24) = t1 */
    bslli r11, r12, 16 /* h = v << 16 */
    lwi r12, r8, 32 /* v = *(as + 32) */
    bsrli r9, r12, 16 /* t1 = v >> 16 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 28 /* *(d + 28) = t1 */
    bslli r11, r12, 16 /* h = v << 16 */
    addi r8, r8, 32 /* as = as + 32 */
    addi r4, r4, -32 /* n = n - 32 */
    bneid r4, a_bu2_loop /* while (n) loop */
    addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */
    /* fall through to a_block_done */

a_block_done:
    addi r4, r0, 4 /* n = 4 */
    cmpu r4, r4, r7 /* n = c - n (unsigned) */
    blti r4, a_xfer_end /* if n < 0, less than one word to transfer */

a_word_xfer:
    andi r4, r7, 0xfffffffc /* n = c & ~3 */
    addi r10, r0, 0 /* offset = 0 */

    andi r9, r6, 3 /* t1 = s & 3 */
    /* if temp != 0, unaligned transfers needed */
    bnei r9, a_word_unaligned

a_word_aligned:
    lw r9, r6, r10 /* t1 = *(s+offset) */
    sw r9, r5, r10 /* *(d+offset) = t1 */
    addi r4, r4,-4 /* n-- */
    bneid r4, a_word_aligned /* loop */
    addi r10, r10, 4 /* offset++ (IN DELAY SLOT) */

    bri a_word_done

a_word_unaligned:
    andi r8, r6, 0xfffffffc /* as = s & ~3 */
    lwi r11, r8, 0 /* h = *(as + 0) */
    addi r8, r8, 4 /* as = as + 4 */

    addi r9, r9, -1
    beqi r9, a_word_u1 /* t1 was 1 => 1 byte offset */
    addi r9, r9, -1
    beqi r9, a_word_u2 /* t1 was 2 => 2 byte offset */

a_word_u3:
    bslli r11, r11, 24 /* h = h << 24 */
a_wu3_loop:
    lw r12, r8, r10 /* v = *(as + offset) */
    bsrli r9, r12, 8 /* t1 = v >> 8 */
    or r9, r11, r9 /* t1 = h | t1 */
    sw r9, r5, r10 /* *(d + offset) = t1 */
    bslli r11, r12, 24 /* h = v << 24 */
    addi r4, r4,-4 /* n = n - 4 */
    bneid r4, a_wu3_loop /* while (n) loop */
    addi r10, r10, 4 /* offset = offset + 4 (IN DELAY SLOT) */

    bri a_word_done

a_word_u1:
    bslli r11, r11, 8 /* h = h << 8 */
a_wu1_loop:
    lw r12, r8, r10 /* v = *(as + offset) */
    bsrli r9, r12, 24 /* t1 = v >> 24 */
    or r9, r11, r9 /* t1 = h | t1 */
    sw r9, r5, r10 /* *(d + offset) = t1 */
    bslli r11, r12, 8 /* h = v << 8 */
    addi r4, r4,-4 /* n = n - 4 */
    bneid r4, a_wu1_loop /* while (n) loop */
    addi r10, r10, 4 /* offset = offset + 4 (IN DELAY SLOT) */

    bri a_word_done

a_word_u2:
    bslli r11, r11, 16 /* h = h << 16 */
a_wu2_loop:
    lw r12, r8, r10 /* v = *(as + offset) */
    bsrli r9, r12, 16 /* t1 = v >> 16 */
    or r9, r11, r9 /* t1 = h | t1 */
    sw r9, r5, r10 /* *(d + offset) = t1 */
    bslli r11, r12, 16 /* h = v << 16 */
    addi r4, r4,-4 /* n = n - 4 */
    bneid r4, a_wu2_loop /* while (n) loop */
    addi r10, r10, 4 /* offset = offset + 4 (IN DELAY SLOT) */
    /* fall through to a_word_done */

a_word_done:
    add r5, r5, r10 /* d = d + offset */
    add r6, r6, r10 /* s = s + offset */
    rsub r7, r10, r7 /* c = c - offset */

a_xfer_end:
a_xfer_end_loop:
    /* copy any remaining 0-3 tail bytes one at a time */
    beqi r7, a_done /* while (c) */
    lbui r9, r6, 0 /* t1 = *s */
    addi r6, r6, 1 /* s++ */
    sbi r9, r5, 0 /* *d = t1 */
    addi r7, r7, -1 /* c-- */
    brid a_xfer_end_loop /* loop */
    addi r5, r5, 1 /* d++ (IN DELAY SLOT) */

a_done:
    /* NOTE: also used as the epilogue of memmove's descending path */
    rtsd r15, 8
    nop

.size memcpy, . - memcpy
.end memcpy
/*----------------------------------------------------------------------------*/
    .globl memmove
    .type memmove, @function
    .ent memmove

/*
 * void *memmove(void *d, const void *s, size_t c)
 *
 * In:   r5 = d (destination), r6 = s (source), r7 = c (byte count)
 * Out:  r3 = starting destination address
 *
 * If s >= d the regions can safely be copied ascending, so reuse
 * fast_memcpy_ascending (memcpy above).  Otherwise copy descending
 * from the end so an overlapping source is read before it is
 * overwritten.  Register roles mirror memcpy's.
 */
memmove:
    cmpu r4, r5, r6 /* n = s - d */
    bgei r4,fast_memcpy_ascending

fast_memcpy_descending:
    /* move d to return register as value of function */
    addi r3, r5, 0

    add r5, r5, r7 /* d = d + c */
    add r6, r6, r7 /* s = s + c */

    addi r4, r0, 4 /* n = 4 */
    cmpu r4, r4, r7 /* n = c - n (unsigned) */
    blti r4,d_xfer_end /* if n < 0, less than one word to transfer */

    /* transfer first 0~3 bytes to get aligned dest address */
    andi r4, r5, 3 /* n = d & 3 */
    /* if zero, destination already aligned */
    beqi r4,d_dalign_done
    rsub r7, r4, r7 /* c = c - n adjust c */

d_xfer_first_loop:
    /* if no bytes left to transfer, transfer the bulk */
    beqi r4,d_dalign_done
    addi r6, r6, -1 /* s-- */
    addi r5, r5, -1 /* d-- */
    lbui r11, r6, 0 /* h = *s */
    sbi r11, r5, 0 /* *d = h */
    brid d_xfer_first_loop /* loop */
    addi r4, r4, -1 /* n-- (IN DELAY SLOT) */

d_dalign_done:
    addi r4, r0, 32 /* n = 32 */
    cmpu r4, r4, r7 /* n = c - n (unsigned) */
    /* if n < 0, less than one block to transfer */
    blti r4, d_block_done

d_block_xfer:
    andi r4, r7, 0xffffffe0 /* n = c & ~31 */
    rsub r7, r4, r7 /* c = c - n */

    andi r9, r6, 3 /* t1 = s & 3 */
    /* if temp != 0, unaligned transfers needed */
    bnei r9, d_block_unaligned

d_block_aligned:
    /* both pointers word aligned: move 32 bytes per iteration,
     * descending; four loads then four stores */
    addi r6, r6, -32 /* s = s - 32 */
    addi r5, r5, -32 /* d = d - 32 */
    lwi r9, r6, 28 /* t1 = *(s + 28) */
    lwi r10, r6, 24 /* t2 = *(s + 24) */
    lwi r11, r6, 20 /* t3 = *(s + 20) */
    lwi r12, r6, 16 /* t4 = *(s + 16) */
    swi r9, r5, 28 /* *(d + 28) = t1 */
    swi r10, r5, 24 /* *(d + 24) = t2 */
    swi r11, r5, 20 /* *(d + 20) = t3 */
    swi r12, r5, 16 /* *(d + 16) = t4 */
    lwi r9, r6, 12 /* t1 = *(s + 12) */
    lwi r10, r6, 8 /* t2 = *(s + 8) */
    lwi r11, r6, 4 /* t3 = *(s + 4) */
    lwi r12, r6, 0 /* t4 = *(s + 0) */
    swi r9, r5, 12 /* *(d + 12) = t1 */
    swi r10, r5, 8 /* *(d + 8) = t2 */
    swi r11, r5, 4 /* *(d + 4) = t3 */
    addi r4, r4, -32 /* n = n - 32 */
    bneid r4, d_block_aligned /* while (n) loop */
    swi r12, r5, 0 /* *(d + 0) = t4 (IN DELAY SLOT) */
    bri d_block_done

d_block_unaligned:
    /* source is 1-3 bytes off word alignment: read aligned words from
     * as (descending) and stitch each output word from the carried low
     * part h and the next word's high part */
    andi r8, r6, 0xfffffffc /* as = s & ~3 */
    rsub r6, r4, r6 /* s = s - n */
    lwi r11, r8, 0 /* h = *(as + 0) */

    addi r9, r9, -1
    beqi r9,d_block_u1 /* t1 was 1 => 1 byte offset */
    addi r9, r9, -1
    beqi r9,d_block_u2 /* t1 was 2 => 2 byte offset */

d_block_u3:
    bsrli r11, r11, 8 /* h = h >> 8 */
d_bu3_loop:
    addi r8, r8, -32 /* as = as - 32 */
    addi r5, r5, -32 /* d = d - 32 */
    lwi r12, r8, 28 /* v = *(as + 28) */
    bslli r9, r12, 24 /* t1 = v << 24 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 28 /* *(d + 28) = t1 */
    bsrli r11, r12, 8 /* h = v >> 8 */
    lwi r12, r8, 24 /* v = *(as + 24) */
    bslli r9, r12, 24 /* t1 = v << 24 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 24 /* *(d + 24) = t1 */
    bsrli r11, r12, 8 /* h = v >> 8 */
    lwi r12, r8, 20 /* v = *(as + 20) */
    bslli r9, r12, 24 /* t1 = v << 24 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 20 /* *(d + 20) = t1 */
    bsrli r11, r12, 8 /* h = v >> 8 */
    lwi r12, r8, 16 /* v = *(as + 16) */
    bslli r9, r12, 24 /* t1 = v << 24 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 16 /* *(d + 16) = t1 */
    bsrli r11, r12, 8 /* h = v >> 8 */
    lwi r12, r8, 12 /* v = *(as + 12) */
    bslli r9, r12, 24 /* t1 = v << 24 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 12 /* *(d + 12) = t1 */
    bsrli r11, r12, 8 /* h = v >> 8 */
    lwi r12, r8, 8 /* v = *(as + 8) */
    bslli r9, r12, 24 /* t1 = v << 24 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 8 /* *(d + 8) = t1 */
    bsrli r11, r12, 8 /* h = v >> 8 */
    lwi r12, r8, 4 /* v = *(as + 4) */
    bslli r9, r12, 24 /* t1 = v << 24 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 4 /* *(d + 4) = t1 */
    bsrli r11, r12, 8 /* h = v >> 8 */
    lwi r12, r8, 0 /* v = *(as + 0) */
    bslli r9, r12, 24 /* t1 = v << 24 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 0 /* *(d + 0) = t1 */
    addi r4, r4, -32 /* n = n - 32 */
    bneid r4, d_bu3_loop /* while (n) loop */
    bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */
    bri d_block_done

d_block_u1:
    bsrli r11, r11, 24 /* h = h >> 24 */
d_bu1_loop:
    addi r8, r8, -32 /* as = as - 32 */
    addi r5, r5, -32 /* d = d - 32 */
    lwi r12, r8, 28 /* v = *(as + 28) */
    bslli r9, r12, 8 /* t1 = v << 8 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 28 /* *(d + 28) = t1 */
    bsrli r11, r12, 24 /* h = v >> 24 */
    lwi r12, r8, 24 /* v = *(as + 24) */
    bslli r9, r12, 8 /* t1 = v << 8 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 24 /* *(d + 24) = t1 */
    bsrli r11, r12, 24 /* h = v >> 24 */
    lwi r12, r8, 20 /* v = *(as + 20) */
    bslli r9, r12, 8 /* t1 = v << 8 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 20 /* *(d + 20) = t1 */
    bsrli r11, r12, 24 /* h = v >> 24 */
    lwi r12, r8, 16 /* v = *(as + 16) */
    bslli r9, r12, 8 /* t1 = v << 8 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 16 /* *(d + 16) = t1 */
    bsrli r11, r12, 24 /* h = v >> 24 */
    lwi r12, r8, 12 /* v = *(as + 12) */
    bslli r9, r12, 8 /* t1 = v << 8 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 12 /* *(d + 12) = t1 */
    bsrli r11, r12, 24 /* h = v >> 24 */
    lwi r12, r8, 8 /* v = *(as + 8) */
    bslli r9, r12, 8 /* t1 = v << 8 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 8 /* *(d + 8) = t1 */
    bsrli r11, r12, 24 /* h = v >> 24 */
    lwi r12, r8, 4 /* v = *(as + 4) */
    bslli r9, r12, 8 /* t1 = v << 8 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 4 /* *(d + 4) = t1 */
    bsrli r11, r12, 24 /* h = v >> 24 */
    lwi r12, r8, 0 /* v = *(as + 0) */
    bslli r9, r12, 8 /* t1 = v << 8 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 0 /* *(d + 0) = t1 */
    addi r4, r4, -32 /* n = n - 32 */
    bneid r4, d_bu1_loop /* while (n) loop */
    bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */
    bri d_block_done

d_block_u2:
    bsrli r11, r11, 16 /* h = h >> 16 */
d_bu2_loop:
    addi r8, r8, -32 /* as = as - 32 */
    addi r5, r5, -32 /* d = d - 32 */
    lwi r12, r8, 28 /* v = *(as + 28) */
    bslli r9, r12, 16 /* t1 = v << 16 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 28 /* *(d + 28) = t1 */
    bsrli r11, r12, 16 /* h = v >> 16 */
    lwi r12, r8, 24 /* v = *(as + 24) */
    bslli r9, r12, 16 /* t1 = v << 16 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 24 /* *(d + 24) = t1 */
    bsrli r11, r12, 16 /* h = v >> 16 */
    lwi r12, r8, 20 /* v = *(as + 20) */
    bslli r9, r12, 16 /* t1 = v << 16 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 20 /* *(d + 20) = t1 */
    bsrli r11, r12, 16 /* h = v >> 16 */
    lwi r12, r8, 16 /* v = *(as + 16) */
    bslli r9, r12, 16 /* t1 = v << 16 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 16 /* *(d + 16) = t1 */
    bsrli r11, r12, 16 /* h = v >> 16 */
    lwi r12, r8, 12 /* v = *(as + 12) */
    bslli r9, r12, 16 /* t1 = v << 16 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 12 /* *(d + 12) = t1 */
    bsrli r11, r12, 16 /* h = v >> 16 */
    lwi r12, r8, 8 /* v = *(as + 8) */
    bslli r9, r12, 16 /* t1 = v << 16 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 8 /* *(d + 8) = t1 */
    bsrli r11, r12, 16 /* h = v >> 16 */
    lwi r12, r8, 4 /* v = *(as + 4) */
    bslli r9, r12, 16 /* t1 = v << 16 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 4 /* *(d + 4) = t1 */
    bsrli r11, r12, 16 /* h = v >> 16 */
    lwi r12, r8, 0 /* v = *(as + 0) */
    bslli r9, r12, 16 /* t1 = v << 16 */
    or r9, r11, r9 /* t1 = h | t1 */
    swi r9, r5, 0 /* *(d + 0) = t1 */
    addi r4, r4, -32 /* n = n - 32 */
    bneid r4, d_bu2_loop /* while (n) loop */
    bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */
    /* fall through to d_block_done */

d_block_done:
    addi r4, r0, 4 /* n = 4 */
    cmpu r4, r4, r7 /* n = c - n (unsigned) */
    blti r4,d_xfer_end /* if n < 0, less than one word to transfer */

d_word_xfer:
    andi r4, r7, 0xfffffffc /* n = c & ~3 */
    rsub r5, r4, r5 /* d = d - n */
    rsub r6, r4, r6 /* s = s - n */
    rsub r7, r4, r7 /* c = c - n */

    andi r9, r6, 3 /* t1 = s & 3 */
    /* if temp != 0, unaligned transfers needed */
    bnei r9, d_word_unaligned

d_word_aligned:
    addi r4, r4,-4 /* n-- */
    lw r9, r6, r4 /* t1 = *(s+n) */
    bneid r4, d_word_aligned /* loop */
    sw r9, r5, r4 /* *(d+n) = t1 (IN DELAY SLOT) */

    bri d_word_done

d_word_unaligned:
    andi r8, r6, 0xfffffffc /* as = s & ~3 */
    lw r11, r8, r4 /* h = *(as + n) */

    addi r9, r9, -1
    beqi r9,d_word_u1 /* t1 was 1 => 1 byte offset */
    addi r9, r9, -1
    beqi r9,d_word_u2 /* t1 was 2 => 2 byte offset */

d_word_u3:
    bsrli r11, r11, 8 /* h = h >> 8 */
d_wu3_loop:
    addi r4, r4,-4 /* n = n - 4 */
    lw r12, r8, r4 /* v = *(as + n) */
    bslli r9, r12, 24 /* t1 = v << 24 */
    or r9, r11, r9 /* t1 = h | t1 */
    sw r9, r5, r4 /* *(d + n) = t1 */
    bneid r4, d_wu3_loop /* while (n) loop */
    bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */

    bri d_word_done

d_word_u1:
    bsrli r11, r11, 24 /* h = h >> 24 */
d_wu1_loop:
    addi r4, r4,-4 /* n = n - 4 */
    lw r12, r8, r4 /* v = *(as + n) */
    bslli r9, r12, 8 /* t1 = v << 8 */
    or r9, r11, r9 /* t1 = h | t1 */
    sw r9, r5, r4 /* *(d + n) = t1 */
    bneid r4, d_wu1_loop /* while (n) loop */
    bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */

    bri d_word_done

d_word_u2:
    bsrli r11, r11, 16 /* h = h >> 16 */
d_wu2_loop:
    addi r4, r4,-4 /* n = n - 4 */
    lw r12, r8, r4 /* v = *(as + n) */
    bslli r9, r12, 16 /* t1 = v << 16 */
    or r9, r11, r9 /* t1 = h | t1 */
    sw r9, r5, r4 /* *(d + n) = t1 */
    bneid r4, d_wu2_loop /* while (n) loop */
    bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */
    /* fall through to d_word_done */

d_word_done:

d_xfer_end:
d_xfer_end_loop:
    /* copy any remaining 0-3 tail bytes one at a time; a_done is
     * memcpy's rtsd epilogue, reused here (d_done below is identical
     * and therefore never reached) */
    beqi r7, a_done /* while (c) */
    addi r6, r6, -1 /* s-- */
    lbui r9, r6, 0 /* t1 = *s */
    addi r5, r5, -1 /* d-- */
    sbi r9, r5, 0 /* *d = t1 */
    brid d_xfer_end_loop /* loop */
    addi r7, r7, -1 /* c-- (IN DELAY SLOT) */

d_done:
    rtsd r15, 8
    nop

.size memmove, . - memmove
.end memmove