Root/arch/sh/lib64/memcpy.S

Source at commit 0de2b2b3be81048189a32f7a3d3ba0ba9ec817b6 created 11 years 11 months ago.
By Maarten ter Huurne, MIPS: JZ4740: Fixed value for round robin constant.
1/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
2/* Modified by SuperH, Inc. September 2003 */
3!
4! Fast SH memcpy
5!
6! by Toshiyasu Morita (tm@netcom.com)
7! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
8! SH5 code Copyright 2002 SuperH Ltd.
9!
10! Entry: ARG0: destination pointer
11! ARG1: source pointer
12! ARG2: byte count
13!
14! Exit: RESULT: destination pointer
15! any other registers in the range r0-r7: trashed
16!
17! Notes: Usually one wants to do small reads and write a longword, but
18! unfortunately it is difficult in some cases to concatenate bytes
19! into a longword on the SH, so this does a longword read and small
20! writes.
21!
22! This implementation makes two assumptions about how it is called:
23!
24! 1.: If the byte count is nonzero, the address of the last byte to be
25! copied is unsigned greater than the address of the first byte to
26! be copied. This could be easily swapped for a signed comparison,
27! but the algorithm used needs some comparison.
28!
29! 2.: When there are two or three bytes in the last word of an 11-or-more
30! bytes memory chunk to be copied, the rest of the word can be read
31! without side effects.
32! This could be easily changed by increasing the minimum size of
33! a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
34! however, this would cost a few extra cycles on average.
35! For SHmedia, the assumption is that any quadword can be read in its
36! entirety if at least one byte is included in the copy.
37!
38
39    .section .text..SHmedia32,"ax"
40    .globl memcpy
41    .type memcpy, @function
42    .align 5
43
/*
 * memcpy - SHmedia entry point and size dispatcher.
 *
 * Registers as this code uses them:
 *   r2   destination pointer (only used as a base address, never written)
 *   r3   source pointer (only used as a base address)
 *   r4   byte count
 *   r18  return address, moved into tr1 by ptabs and returned to by blink
 *
 * Counts of 25 or more branch to Large.  Smaller counts jump into a table
 * of 32-byte code slots starting at L1, ordered 0, 1, 2-3, 4-7, 8-15 and
 * 16-24 bytes.
 *
 * NOTE(review): the header comment lists r0-r7 as the trashed registers,
 * but the code in this file also writes r8, r9, r19-r25, r27 and r36 -
 * confirm that this matches the calling convention in use.
 */
44memcpy:
45
/*
 * Paired ldlo/ldhi (and stlo/sthi) accesses for a datum at P+O that may be
 * misaligned: the .q forms span 8 bytes (O .. O+7), the .l forms span
 * 4 bytes (O .. O+3).  Users of the load macros OR D0 and D1 together to
 * obtain the value.  STUAQ and STUAL are defined but not referenced in the
 * code visible here; the stores are written out as separate stlo/sthi
 * instructions instead.
 */
46#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
47#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
48#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
49#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
50
/*
 * Read the first source byte into r63; nothing here consumes the result
 * (presumably a cache touch of the source - TODO confirm).
 */
51    ld.b r3,0,r63
/* Bulk path: taken when r4 >= 25 (unsigned compare). */
52    pta/l Large,tr0
53    movi 25,r0
54    bgeu/u r4,r0,tr0
/*
 * Small path: r0 = (L1 - L0) + (63 - nsb(r4)) * 32 + 1.  Going by the
 * L1-L0 term, ptrel at L0 treats r0 as an offset from L0, so the branch
 * lands on slot number (63 - nsb(r4)) of the table at L1.  The slot order
 * (0, 1, 2-3, 4-7, ...) implies nsb yields 63 for a count of 0 and one
 * less for each additional significant bit of the count.
 * NOTE(review): the "+ 1" presumably marks the target as SHmedia code and
 * the "& 0xffff" keeps the constant inside the movi immediate - confirm
 * against the SH-5 manual.
 */
55    nsb r4,r0
56    shlli r0,5,r0
57    movi (L1-L0+63*32 + 1) & 0xffff,r1
58    sub r1, r0, r0
/*
 * While the branch target is being prepared, set up what the slots use:
 *   r5  = r2 + r4, one past the last destination byte
 *   r6  = r3 + r4, one past the last source byte
 *   tr1 = return address taken from r18
 */
59L0: ptrel r0,tr0
60    add r2,r4,r5
61    ptabs r18,tr1
62    add r3,r4,r6
63    blink tr0,r63
64    
65/* Rearranged to make cut2 safe */
/*
 * Small-copy code.  On arrival from the dispatcher:
 *   r2 = destination, r3 = source,
 *   r5 = destination + count, r6 = source + count,
 *   tr1 = return address.
 *
 * The table proper starts at L1.  The dispatcher selects a slot with a
 * shift by 5, so every slot is 32 bytes, which here is exactly eight
 * instructions (LDUAL/LDUAQ expand to two each).  Do not add, remove or
 * move instructions without re-checking that spacing.  L2_3 and L8_15 are
 * continuations of other slots that occupy the unused tail of the short
 * 0-byte and 1-byte slots; L4_7 is a continuation placed ahead of the
 * table.
 */
66    .balign 8
/*
 * Continuation of the 4..7 byte slot: r0 holds the first four source
 * bytes, r6/r7 the two halves of the last four.  Finish the head store,
 * then write the tail at destination + count - 4.  Head and tail overlap
 * whenever the count is below 8.
 */
67L4_7: /* 4..7 byte memcpy cntd. */
68    stlo.l r2, 0, r0
69    or r6, r7, r6
70    sthi.l r5, -1, r6
71    stlo.l r5, -4, r6
72    blink tr1,r63
73
74    .balign 8
/* Slot 0: nothing to copy, return.  The four nops after the blink pad the
   slot so that, together with L2_3, it fills eight instructions. */
75L1: /* 0 byte memcpy */
76    nop
77    blink tr1,r63
78    nop
79    nop
80    nop
81    nop
82
/* Continuation of the 2-or-3 byte slot: r6 holds the last source byte;
   store it at destination + count - 1 and return. */
83L2_3: /* 2 or 3 byte memcpy cntd. */
84    st.b r5,-1,r6
85    blink tr1,r63
86
/* Slot 1: copy the single byte and return. */
87    /* 1 byte memcpy */
88    ld.b r3,0,r0
89    st.b r2,0,r0
90    blink tr1,r63
91
/*
 * Continuation of the 8..15 byte slot: r0 holds the first eight source
 * bytes, r6/r7 the two halves of the last eight.  Finish the head store,
 * then write the tail at destination + count - 8.
 */
92L8_15: /* 8..15 byte memcpy cntd. */
93    stlo.q r2, 0, r0
94    or r6, r7, r6
95    sthi.q r5, -1, r6
96    stlo.q r5, -8, r6
97    blink tr1,r63
98    
/*
 * Slot 2: copy bytes 0 and 1 directly, load the last source byte into r6
 * and let L2_3 store it.  For a count of 2 the last byte is byte 1, so it
 * is simply written twice.  The ld.b into r63 reads the first destination
 * byte and the result is not used.
 */
99    /* 2 or 3 byte memcpy */
100    ld.b r3,0,r0
101    ld.b r2,0,r63
102    ld.b r3,1,r1
103    st.b r2,0,r0
104    pta/l L2_3,tr0
105    ld.b r6,-1,r6
106    st.b r2,1,r1
107    blink tr0, r63
108
/*
 * Slot 3: load the first four source bytes (r0|r1) and the two halves of
 * the last four (r7 via ldlo, r6 via ldhi), start the head store, then
 * continue at L4_7.
 */
109    /* 4 .. 7 byte memcpy */
110    LDUAL (r3, 0, r0, r1)
111    pta L4_7, tr0
112    ldlo.l r6, -4, r7
113    or r0, r1, r0
114    sthi.l r2, 3, r0
115    ldhi.l r6, -1, r6
116    blink tr0, r63
117
/* Slot 4: same scheme as slot 3 with 8-byte accesses; continues at L8_15. */
118    /* 8 .. 15 byte memcpy */
119    LDUAQ (r3, 0, r0, r1)
120    pta L8_15, tr0
121    ldlo.q r6, -8, r7
122    or r0, r1, r0
123    sthi.q r2, 7, r0
124    ldhi.q r6, -1, r6
125    blink tr0, r63
126
/*
 * Slot 5 (last slot, so it may exceed eight instructions): copy the first
 * sixteen bytes as two quadwords (r0 and r8), then the last eight source
 * bytes to destination + count - 8.  The tail overlaps the head for counts
 * below 24.
 */
127    /* 16 .. 24 byte memcpy */
128    LDUAQ (r3, 0, r0, r1)
129    LDUAQ (r3, 8, r8, r9)
130    or r0, r1, r0
131    sthi.q r2, 7, r0
132    or r8, r9, r8
133    sthi.q r2, 15, r8
134    ldlo.q r6, -8, r7
135    ldhi.q r6, -1, r6
136    stlo.q r2, 8, r8
137    stlo.q r2, 0, r0
138    or r6, r7, r6
139    sthi.q r5, -1, r6
140    stlo.q r5, -8, r6
141    blink tr1,r63
142
143Large:
/*
 * Bulk path, entered from the dispatcher when the count in r4 is at least
 * 25.  r2 = destination, r3 = source, r18 = return address.
 *
 * Set-up performed below:
 *   r7  = r3 | -8            = (r3 & 7) - 8, a value in -8 .. -1
 *   r22 = r2 - r7            = r2 + 8 - (r3 & 7), the destination cursor
 *   r6  = r3 - r2            source-minus-destination offset
 *   r5  = r2 + r4 - 16       destination end minus 16
 * Hence r22 + r6 = r3 + 8 - (r3 & 7) is a multiple of 8, and because r22
 * only ever advances by 8 or 32, every "ldx.q r22, r6, ..." reads the
 * source at an 8-byte aligned address while the stores to r22 use
 * stlo/sthi pairs.
 *
 * Invariant for both loops: r0 holds the source quadword that belongs at
 * destination address r22 and has not been stored yet.
 *
 * The ld.b into r63 reads the first destination byte; the result is not
 * used (presumably a cache touch - TODO confirm).
 */
144    ld.b r2, 0, r63
145    pta/l Loop_ua, tr1
146    ori r3, -8, r7
147    sub r2, r7, r22
148    sub r3, r2, r6
149    add r2, r4, r5
/*
 * Head: fetch the leading part of the first source quadword with ldlo.q
 * and write it to the first eight destination bytes.
 * NOTE(review): only the ldlo half is loaded, so destination bytes from
 * r22 onward in this store appear to rely on being rewritten by the
 * stores that follow - confirm.
 */
150    ldlo.q r3, 0, r0
151    addi r5, -16, r5
152    movi 64+8, r27 // could subtract r7 from that.
153    stlo.q r2, 0, r0
154    sthi.q r2, 7, r0
/* Prime the pipeline: r0 = first aligned source quadword, for address r22. */
155    ldx.q r22, r6, r0
/* Counts below 72 (72 > r4) skip the 32-byte loop and go to Loop_ua. */
156    bgtu/l r27, r4, tr1
157
/*
 * Constants for Loop_line:
 *   r27 = r5 - 48 = destination end - 64, the loop bound
 *   r36 = r6 + 64            offset of the source 64 bytes ahead
 *   r19, r20, r21 = r6 - 24, r6 - 16, r6 - 8
 *                            offsets of the 2nd to 4th quadword of a chunk
 */
158    addi r5, -48, r27
159    pta/l Loop_line, tr0
160    addi r6, 64, r36
161    addi r6, -24, r19
162    addi r6, -16, r20
163    addi r6, -8, r21
164
/*
 * Copy 32 bytes per iteration.  The pending quadword in r0 goes to
 * r22 - 32, three freshly loaded quadwords (r23, r24, r25) go to r22 - 24,
 * r22 - 16 and r22 - 8, and r0 is reloaded for the new r22.  Loops while
 * r27 >= r22.
 *
 * The first ldx.q discards its result into r63 and addresses the source
 * 64 bytes ahead; alloco addresses the destination at r22 + 32.
 * NOTE(review): presumably a source prefetch and a destination cache-line
 * allocation respectively - confirm against the SH-5 manual.
 */
165Loop_line:
166    ldx.q r22, r36, r63
167    alloco r22, 32
168    addi r22, 32, r22
169    ldx.q r22, r19, r23
170    sthi.q r22, -25, r0
171    ldx.q r22, r20, r24
172    ldx.q r22, r21, r25
173    stlo.q r22, -32, r0
174    ldx.q r22, r6, r0
175    sthi.q r22, -17, r23
176    sthi.q r22, -9, r24
177    sthi.q r22, -1, r25
178    stlo.q r22, -24, r23
179    stlo.q r22, -16, r24
180    stlo.q r22, -8, r25
181    bgeu r27, r22, tr0
182
/*
 * Copy 8 bytes per iteration: advance r22, store the pending r0 at the
 * previous r22, reload r0 for the new r22.  tr1 still points at Loop_ua
 * here; loops while r5 (destination end - 16) > r22.
 */
183Loop_ua:
184    addi r22, 8, r22
185    sthi.q r22, -1, r0
186    stlo.q r22, -8, r0
187    ldx.q r22, r6, r0
188    bgtu/l r5, r22, tr1
189
/*
 * Tail: store the pending quadword at r22, then copy the last eight source
 * bytes (loaded as an ldlo/ldhi pair ending at source + count) to
 * r5 + 8 .. r5 + 15, i.e. the last eight destination bytes.  tr1 is
 * re-pointed at the return address in r18 before the final blink.
 */
190    add r3, r4, r7
191    ldlo.q r7, -8, r1
192    sthi.q r22, 7, r0
193    ldhi.q r7, -1, r7
194    ptabs r18,tr1
195    stlo.q r22, 0, r0
196    or r1, r7, r1
197    sthi.q r5, 15, r1
198    stlo.q r5, 8, r1
199    blink tr1, r63
200
201    .size memcpy,.-memcpy
202

Archive Download this file



interactive