/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 */

#include <arch/chip.h>


/*
 * This file shares the implementation of the userspace memcpy and
 * the kernel's memcpy, copy_to_user and copy_from_user.
 */

#include <linux/linkage.h>

/* On TILE64, we wrap these functions via arch/tile/lib/memcpy_tile64.c */
#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
#define memcpy __memcpy_asm
#define __copy_to_user_inatomic __copy_to_user_inatomic_asm
#define __copy_from_user_inatomic __copy_from_user_inatomic_asm
#define __copy_from_user_zeroing __copy_from_user_zeroing_asm
#endif

#define IS_MEMCPY 0
#define IS_COPY_FROM_USER 1
#define IS_COPY_FROM_USER_ZEROING 2
#define IS_COPY_TO_USER -1
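
/* These values let the fixup code at the end of the file dispatch on
 * r29 cheaply: zero means plain memcpy, a negative value means a copy
 * to user space, and of the two copy-from-user modes only
 * IS_COPY_FROM_USER has its low bit set, which is what the
 * zero-the-remainder test in the fixup code relies on.
 */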

	.section .text.memcpy_common, "ax"
	.align 64

/* Use this to preface each bundle that can cause an exception so
 * the kernel can clean up properly. The special cleanup code should
 * not use these, since it knows what it is doing.
 */
#define EX \
	.pushsection __ex_table, "a"; \
	.word 9f, memcpy_common_fixup; \
	.popsection; \
	9
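
/* Each "EX:" prefix below expands this macro: it emits an __ex_table
 * entry pairing the address of the bundle that follows (the trailing
 * "9" becomes a local "9:" label on that bundle) with
 * memcpy_common_fixup, so a fault in that bundle is redirected to the
 * fixup code at the end of this file.
 */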


/* __copy_from_user_inatomic takes the kernel target address in r0,
 * the user source in r1, and the bytes to copy in r2.
 * It returns the number of uncopiable bytes (hopefully zero) in r0.
 */
ENTRY(__copy_from_user_inatomic)
	.type __copy_from_user_inatomic, @function
	FEEDBACK_ENTER_EXPLICIT(__copy_from_user_inatomic, \
		.text.memcpy_common, \
		.Lend_memcpy_common - __copy_from_user_inatomic)
	{ movei r29, IS_COPY_FROM_USER; j memcpy_common }
	.size __copy_from_user_inatomic, . - __copy_from_user_inatomic

/* __copy_from_user_zeroing is like __copy_from_user_inatomic, but
 * any uncopiable bytes are zeroed in the target.
 */
ENTRY(__copy_from_user_zeroing)
	.type __copy_from_user_zeroing, @function
	FEEDBACK_REENTER(__copy_from_user_inatomic)
	{ movei r29, IS_COPY_FROM_USER_ZEROING; j memcpy_common }
	.size __copy_from_user_zeroing, . - __copy_from_user_zeroing

/* __copy_to_user_inatomic takes the user target address in r0,
 * the kernel source in r1, and the bytes to copy in r2.
 * It returns the number of uncopiable bytes (hopefully zero) in r0.
 */
ENTRY(__copy_to_user_inatomic)
	.type __copy_to_user_inatomic, @function
	FEEDBACK_REENTER(__copy_from_user_inatomic)
	{ movei r29, IS_COPY_TO_USER; j memcpy_common }
	.size __copy_to_user_inatomic, . - __copy_to_user_inatomic
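
/* All three user-copy entry points above share the register convention
 * described in the comments: dest in r0, source in r1, byte count in
 * r2, and the number of bytes NOT copied returned in r0. A rough C
 * sketch of how a caller might check the result (illustrative only,
 * not code from this file; names are hypothetical):
 *
 *	left = __copy_from_user_inatomic(kbuf, ubuf, len);
 *	if (left != 0)
 *		ret = -EFAULT;	// only len - left bytes arrived
 */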

ENTRY(memcpy)
	.type memcpy, @function
	FEEDBACK_REENTER(__copy_from_user_inatomic)
	{ movei r29, IS_MEMCPY }
	.size memcpy, . - memcpy
	/* Fall through */

	.type memcpy_common, @function
memcpy_common:
	/* On entry, r29 holds one of the IS_* macro values from above. */


	/* r0 is the dest, r1 is the source, r2 is the size. */

	/* Save aside original dest so we can return it at the end. */
	{ sw sp, lr; move r23, r0; or r4, r0, r1 }

	/* Check for an empty size. */
	{ bz r2, .Ldone; andi r4, r4, 3 }

	/* Save aside original values in case of a fault. */
	{ move r24, r1; move r25, r2 }
	move r27, lr

	/* Check for an unaligned source or dest. */
	{ bnz r4, .Lcopy_unaligned_maybe_many; addli r4, r2, -256 }

.Lcheck_aligned_copy_size:
	/* If we are copying < 256 bytes, branch to simple case. */
	{ blzt r4, .Lcopy_8_check; slti_u r8, r2, 8 }

	/* Copying >= 256 bytes, so jump to complex prefetching loop. */
	{ andi r6, r1, 63; j .Lcopy_many }

/*
 *
 * Aligned 4 byte at a time copy loop
 *
 */

.Lcopy_8_loop:
	/* Copy two words at a time to hide load latency. */
EX:	{ lw r3, r1; addi r1, r1, 4; slti_u r8, r2, 16 }
EX:	{ lw r4, r1; addi r1, r1, 4 }
EX:	{ sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
EX:	{ sw r0, r4; addi r0, r0, 4; addi r2, r2, -4 }
.Lcopy_8_check:
	{ bzt r8, .Lcopy_8_loop; slti_u r4, r2, 4 }

	/* Copy odd leftover word, if any. */
	{ bnzt r4, .Lcheck_odd_stragglers }
EX:	{ lw r3, r1; addi r1, r1, 4 }
EX:	{ sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }

.Lcheck_odd_stragglers:
	{ bnz r2, .Lcopy_unaligned_few }

.Ldone:
	/* For memcpy return original dest address, else zero. */
	{ mz r0, r29, r23; jrp lr }


/*
 *
 * Prefetching multiple cache line copy handler (for large transfers).
 *
 */

	/* Copy words until r1 is cache-line-aligned. */
.Lalign_loop:
EX:	{ lw r3, r1; addi r1, r1, 4 }
	{ andi r6, r1, 63 }
EX:	{ sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
.Lcopy_many:
	{ bnzt r6, .Lalign_loop; addi r9, r0, 63 }

	{ addi r3, r1, 60; andi r9, r9, -64 }

#if CHIP_HAS_WH64()
	/* No need to prefetch dst, we'll just do the wh64
	 * right before we copy a line.
	 */
#endif

EX:	{ lw r5, r3; addi r3, r3, 64; movei r4, 1 }
	/* Intentionally stall for a few cycles to leave L2 cache alone. */
	{ bnzt zero, .; move r27, lr }
EX:	{ lw r6, r3; addi r3, r3, 64 }
	/* Intentionally stall for a few cycles to leave L2 cache alone. */
	{ bnzt zero, . }
EX:	{ lw r7, r3; addi r3, r3, 64 }
#if !CHIP_HAS_WH64()
	/* Prefetch the dest */
	/* Intentionally stall for a few cycles to leave L2 cache alone. */
	{ bnzt zero, . }
	/* Use a real load to cause a TLB miss if necessary. We aren't using
	 * r28, so this should be fine.
	 */
EX:	{ lw r28, r9; addi r9, r9, 64 }
	/* Intentionally stall for a few cycles to leave L2 cache alone. */
	{ bnzt zero, . }
	{ prefetch r9; addi r9, r9, 64 }
	/* Intentionally stall for a few cycles to leave L2 cache alone. */
	{ bnzt zero, . }
	{ prefetch r9; addi r9, r9, 64 }
#endif
	/* Intentionally stall for a few cycles to leave L2 cache alone. */
	{ bz zero, .Lbig_loop2 }

	/* On entry to this loop:
	 * - r0 points to the start of dst line 0
	 * - r1 points to start of src line 0
	 * - r2 >= (256 - 60), only the first time the loop trips.
	 * - r3 contains r1 + 128 + 60    [pointer to end of source line 2]
	 *   This is our prefetch address. When we get near the end
	 *   rather than prefetching off the end this is changed to point
	 *   to some "safe" recently loaded address.
	 * - r5 contains *(r1 + 60)       [i.e. last word of source line 0]
	 * - r6 contains *(r1 + 64 + 60)  [i.e. last word of source line 1]
	 * - r9 contains ((r0 + 63) & -64)
	 *   [start of next dst cache line.]
	 */

.Lbig_loop:
	{ jal .Lcopy_line2; add r15, r1, r2 }

.Lbig_loop2:
	/* Copy line 0, first stalling until r5 is ready. */
EX:	{ move r12, r5; lw r16, r1 }
	{ bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
	/* Prefetch several lines ahead. */
EX:	{ lw r5, r3; addi r3, r3, 64 }
	{ jal .Lcopy_line }

	/* Copy line 1, first stalling until r6 is ready. */
EX:	{ move r12, r6; lw r16, r1 }
	{ bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
	/* Prefetch several lines ahead. */
EX:	{ lw r6, r3; addi r3, r3, 64 }
	{ jal .Lcopy_line }

	/* Copy line 2, first stalling until r7 is ready. */
EX:	{ move r12, r7; lw r16, r1 }
	{ bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
	/* Prefetch several lines ahead. */
EX:	{ lw r7, r3; addi r3, r3, 64 }
	/* Use up a caches-busy cycle by jumping back to the top of the
	 * loop. Might as well get it out of the way now.
	 */
	{ j .Lbig_loop }


	/* On entry:
	 * - r0 points to the destination line.
	 * - r1 points to the source line.
	 * - r3 is the next prefetch address.
	 * - r9 holds the last address used for wh64.
	 * - r12 = WORD_15
	 * - r16 = WORD_0.
	 * - r17 == r1 + 16.
	 * - r27 holds saved lr to restore.
	 *
	 * On exit:
	 * - r0 is incremented by 64.
	 * - r1 is incremented by 64, unless that would point to a word
	 *   beyond the end of the source array, in which case it is redirected
	 *   to point to an arbitrary word already in the cache.
	 * - r2 is decremented by 64.
	 * - r3 is unchanged, unless it points to a word beyond the
	 *   end of the source array, in which case it is redirected
	 *   to point to an arbitrary word already in the cache.
	 *   Redirecting is OK since if we are that close to the end
	 *   of the array we will not come back to this subroutine
	 *   and use the contents of the prefetched address.
	 * - r4 is nonzero iff r2 >= 64.
	 * - r9 is incremented by 64, unless it points beyond the
	 *   end of the last full destination cache line, in which
	 *   case it is redirected to a "safe address" that can be
	 *   clobbered (sp - 64)
	 * - lr contains the value in r27.
	 */

	/* r26 unused */

.Lcopy_line:
	/* TODO: when r3 goes past the end, we would like to redirect it
	 * to prefetch the last partial cache line (if any) just once, for the
	 * benefit of the final cleanup loop. But we don't want to
	 * prefetch that line more than once, or subsequent prefetches
	 * will go into the RTF. But then .Lbig_loop should unconditionally
	 * branch to top of loop to execute final prefetch, and its
	 * nop should become a conditional branch.
	 */

	/* We need two non-memory cycles here to cover the resources
	 * used by the loads initiated by the caller.
	 */
	{ add r15, r1, r2 }
.Lcopy_line2:
	{ slt_u r13, r3, r15; addi r17, r1, 16 }

	/* NOTE: this will stall for one cycle as L1 is busy. */

	/* Fill second L1D line. */
EX:	{ lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */

#if CHIP_HAS_WH64()
	/* Prepare destination line for writing. */
EX:	{ wh64 r9; addi r9, r9, 64 }
#else
	/* Prefetch dest line */
	{ prefetch r9; addi r9, r9, 64 }
#endif
	/* Load seven words that are L1D hits to cover wh64 L2 usage. */

	/* Load the three remaining words from the last L1D line, which
	 * we know has already filled the L1D.
	 */
EX:	{ lw r4, r1; addi r1, r1, 4; addi r20, r1, 16 } /* r4 = WORD_12 */
EX:	{ lw r8, r1; addi r1, r1, 4; slt_u r13, r20, r15 }/* r8 = WORD_13 */
EX:	{ lw r11, r1; addi r1, r1, -52; mvz r20, r13, r1 } /* r11 = WORD_14 */

	/* Load the three remaining words from the first L1D line, first
	 * stalling until it has filled by "looking at" r16.
	 */
EX:	{ lw r13, r1; addi r1, r1, 4; move zero, r16 } /* r13 = WORD_1 */
EX:	{ lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_2 */
EX:	{ lw r15, r1; addi r1, r1, 8; addi r10, r0, 60 } /* r15 = WORD_3 */

	/* Load second word from the second L1D line, first
	 * stalling until it has filled by "looking at" r17.
	 */
EX:	{ lw r19, r1; addi r1, r1, 4; move zero, r17 } /* r19 = WORD_5 */

	/* Store last word to the destination line, potentially dirtying it
	 * for the first time, which keeps the L2 busy for two cycles.
	 */
EX:	{ sw r10, r12 } /* store(WORD_15) */

	/* Use two L1D hits to cover the sw L2 access above. */
EX:	{ lw r10, r1; addi r1, r1, 4 } /* r10 = WORD_6 */
EX:	{ lw r12, r1; addi r1, r1, 4 } /* r12 = WORD_7 */

	/* Fill third L1D line. */
EX:	{ lw r18, r1; addi r1, r1, 4 } /* r18 = WORD_8 */

	/* Store first L1D line. */
EX:	{ sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */
EX:	{ sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */
EX:	{ sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */
#if CHIP_HAS_WH64()
EX:	{ sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */
#else
	/* Back up the r9 to a cache line we are already storing to
	 * if it gets past the end of the dest vector. Strictly speaking,
	 * we don't need to back up to the start of a cache line, but it's free
	 * and tidy, so why not?
	 */
EX:	{ sw r0, r15; addi r0, r0, 4; andi r13, r0, -64 } /* store(WORD_3) */
#endif
	/* Store second L1D line. */
EX:	{ sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */
EX:	{ sw r0, r19; addi r0, r0, 4 } /* store(WORD_5) */
EX:	{ sw r0, r10; addi r0, r0, 4 } /* store(WORD_6) */
EX:	{ sw r0, r12; addi r0, r0, 4 } /* store(WORD_7) */

EX:	{ lw r13, r1; addi r1, r1, 4; move zero, r18 } /* r13 = WORD_9 */
EX:	{ lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_10 */
EX:	{ lw r15, r1; move r1, r20 } /* r15 = WORD_11 */

	/* Store third L1D line. */
EX:	{ sw r0, r18; addi r0, r0, 4 } /* store(WORD_8) */
EX:	{ sw r0, r13; addi r0, r0, 4 } /* store(WORD_9) */
EX:	{ sw r0, r14; addi r0, r0, 4 } /* store(WORD_10) */
EX:	{ sw r0, r15; addi r0, r0, 4 } /* store(WORD_11) */

	/* Store rest of fourth L1D line. */
EX:	{ sw r0, r4; addi r0, r0, 4 } /* store(WORD_12) */
	{
EX:	sw r0, r8 /* store(WORD_13) */
	addi r0, r0, 4
	/* Will r2 be > 64 after we subtract 64 below? */
	shri r4, r2, 7
	}
	{
EX:	sw r0, r11 /* store(WORD_14) */
	addi r0, r0, 8
	/* Record 64 bytes successfully copied. */
	addi r2, r2, -64
	}

	{ jrp lr; move lr, r27 }

	/* Convey to the backtrace library that the stack frame is size
	 * zero, and the real return address is on the stack rather than
	 * in 'lr'.
	 */
	{ info 8 }

	.align 64
.Lcopy_unaligned_maybe_many:
	/* Skip the setup overhead if we aren't copying many bytes. */
	{ slti_u r8, r2, 20; sub r4, zero, r0 }
	{ bnzt r8, .Lcopy_unaligned_few; andi r4, r4, 3 }
	{ bz r4, .Ldest_is_word_aligned; add r18, r1, r2 }

/*
 *
 * unaligned 4 byte at a time copy handler.
 *
 */

	/* Copy single bytes until r0 == 0 mod 4, so we can store words. */
.Lalign_dest_loop:
EX:	{ lb_u r3, r1; addi r1, r1, 1; addi r4, r4, -1 }
EX:	{ sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
	{ bnzt r4, .Lalign_dest_loop; andi r3, r1, 3 }

	/* If source and dest are now *both* aligned, do an aligned copy. */
	{ bz r3, .Lcheck_aligned_copy_size; addli r4, r2, -256 }

.Ldest_is_word_aligned:

#if CHIP_HAS_DWORD_ALIGN()
EX:	{ andi r8, r0, 63; lwadd_na r6, r1, 4}
	{ slti_u r9, r2, 64; bz r8, .Ldest_is_L2_line_aligned }

	/* This copies unaligned words until either there are fewer
	 * than 4 bytes left to copy, or until the destination pointer
	 * is cache-aligned, whichever comes first.
	 *
	 * On entry:
	 * - r0 is the next store address.
	 * - r1 points 4 bytes past the load address corresponding to r0.
	 * - r2 >= 4
	 * - r6 is the next aligned word loaded.
	 */
.Lcopy_unaligned_src_words:
EX:	{ lwadd_na r7, r1, 4; slti_u r8, r2, 4 + 4 }
	/* stall */
	{ dword_align r6, r7, r1; slti_u r9, r2, 64 + 4 }
EX:	{ swadd r0, r6, 4; addi r2, r2, -4 }
	{ bnz r8, .Lcleanup_unaligned_words; andi r8, r0, 63 }
	{ bnzt r8, .Lcopy_unaligned_src_words; move r6, r7 }

	/* On entry:
	 * - r0 is the next store address.
	 * - r1 points 4 bytes past the load address corresponding to r0.
	 * - r2 >= 4 (# of bytes left to store).
	 * - r6 is the next aligned src word value.
	 * - r9 = (r2 < 64U).
	 * - r18 points one byte past the end of source memory.
	 */
.Ldest_is_L2_line_aligned:

	{
	/* Not a full cache line remains. */
	bnz r9, .Lcleanup_unaligned_words
	move r7, r6
	}

	/* r2 >= 64 */

	/* Kick off two prefetches, but don't go past the end. */
	{ addi r3, r1, 63 - 4; addi r8, r1, 64 + 63 - 4 }
	{ prefetch r3; move r3, r8; slt_u r8, r8, r18 }
	{ mvz r3, r8, r1; addi r8, r3, 64 }
	{ prefetch r3; move r3, r8; slt_u r8, r8, r18 }
	{ mvz r3, r8, r1; movei r17, 0 }

.Lcopy_unaligned_line:
	/* Prefetch another line. */
	{ prefetch r3; addi r15, r1, 60; addi r3, r3, 64 }
	/* Fire off a load of the last word we are about to copy. */
EX:	{ lw_na r15, r15; slt_u r8, r3, r18 }

EX:	{ mvz r3, r8, r1; wh64 r0 }

	/* This loop runs twice.
	 *
	 * On entry:
	 * - r17 is even before the first iteration, and odd before
	 *   the second. It is incremented inside the loop. Encountering
	 *   an even value at the end of the loop makes it stop.
	 */
.Lcopy_half_an_unaligned_line:
EX:	{
	/* Stall until the last byte is ready. In the steady state this
	 * guarantees all words to load below will be in the L2 cache, which
	 * avoids shunting the loads to the RTF.
	 */
	move zero, r15
	lwadd_na r7, r1, 16
	}
EX:	{ lwadd_na r11, r1, 12 }
EX:	{ lwadd_na r14, r1, -24 }
EX:	{ lwadd_na r8, r1, 4 }
EX:	{ lwadd_na r9, r1, 4 }
EX:	{
	lwadd_na r10, r1, 8
	/* r16 = (r2 < 64), after we subtract 32 from r2 below. */
	slti_u r16, r2, 64 + 32
	}
EX:	{ lwadd_na r12, r1, 4; addi r17, r17, 1 }
EX:	{ lwadd_na r13, r1, 8; dword_align r6, r7, r1 }
EX:	{ swadd r0, r6, 4; dword_align r7, r8, r1 }
EX:	{ swadd r0, r7, 4; dword_align r8, r9, r1 }
EX:	{ swadd r0, r8, 4; dword_align r9, r10, r1 }
EX:	{ swadd r0, r9, 4; dword_align r10, r11, r1 }
EX:	{ swadd r0, r10, 4; dword_align r11, r12, r1 }
EX:	{ swadd r0, r11, 4; dword_align r12, r13, r1 }
EX:	{ swadd r0, r12, 4; dword_align r13, r14, r1 }
EX:	{ swadd r0, r13, 4; addi r2, r2, -32 }
	{ move r6, r14; bbst r17, .Lcopy_half_an_unaligned_line }

	{ bzt r16, .Lcopy_unaligned_line; move r7, r6 }

	/* On entry:
	 * - r0 is the next store address.
	 * - r1 points 4 bytes past the load address corresponding to r0.
	 * - r2 >= 0 (# of bytes left to store).
	 * - r7 is the next aligned src word value.
	 */
.Lcleanup_unaligned_words:
	/* Handle any trailing bytes. */
	{ bz r2, .Lcopy_unaligned_done; slti_u r8, r2, 4 }
	{ bzt r8, .Lcopy_unaligned_src_words; move r6, r7 }

	/* Move r1 back to the point where it corresponds to r0. */
	{ addi r1, r1, -4 }

#else /* !CHIP_HAS_DWORD_ALIGN() */

	/* Compute right/left shift counts and load initial source words. */
	{ andi r5, r1, -4; andi r3, r1, 3 }
EX:	{ lw r6, r5; addi r5, r5, 4; shli r3, r3, 3 }
EX:	{ lw r7, r5; addi r5, r5, 4; sub r4, zero, r3 }

	/* Load and store one word at a time, using shifts and ORs
	 * to correct for the misaligned src.
	 */
.Lcopy_unaligned_src_loop:
	{ shr r6, r6, r3; shl r8, r7, r4 }
EX:	{ lw r7, r5; or r8, r8, r6; move r6, r7 }
EX:	{ sw r0, r8; addi r0, r0, 4; addi r2, r2, -4 }
	{ addi r5, r5, 4; slti_u r8, r2, 8 }
	{ bzt r8, .Lcopy_unaligned_src_loop; addi r1, r1, 4 }

	{ bz r2, .Lcopy_unaligned_done }
#endif /* !CHIP_HAS_DWORD_ALIGN() */

	/* Fall through */

/*
 *
 * 1 byte at a time copy handler.
 *
 */

.Lcopy_unaligned_few:
EX:	{ lb_u r3, r1; addi r1, r1, 1 }
EX:	{ sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
	{ bnzt r2, .Lcopy_unaligned_few }

.Lcopy_unaligned_done:

	/* For memcpy return original dest address, else zero. */
	{ mz r0, r29, r23; jrp lr }

.Lend_memcpy_common:
	.size memcpy_common, .Lend_memcpy_common - memcpy_common

	.section .fixup,"ax"
memcpy_common_fixup:
	.type memcpy_common_fixup, @function

	/* Skip any bytes we already successfully copied.
	 * r2 (num remaining) is correct, but r0 (dst) and r1 (src)
	 * may not be quite right because of unrolling and prefetching.
	 * So we need to recompute their values as the address just
	 * after the last byte we are sure was successfully loaded and
	 * then stored.
	 */
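	/* For example, if the original count in r25 was 100 and r2 says
	 * 40 bytes remain, then 60 bytes were definitely copied, so the
	 * loops below restart at r23 + 60 and r24 + 60.
	 */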

	/* Determine how many bytes we successfully copied. */
	{ sub r3, r25, r2 }

	/* Add this to the original r0 and r1 to get their new values. */
	{ add r0, r23, r3; add r1, r24, r3 }

	{ bzt r29, memcpy_fixup_loop }
	{ blzt r29, copy_to_user_fixup_loop }

copy_from_user_fixup_loop:
	/* Try copying the rest one byte at a time, expecting a load fault. */
.Lcfu:	{ lb_u r3, r1; addi r1, r1, 1 }
	{ sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
	{ bnzt r2, copy_from_user_fixup_loop }

.Lcopy_from_user_fixup_zero_remainder:
	{ bbs r29, 2f } /* low bit set means IS_COPY_FROM_USER */
	/* byte-at-a-time loop faulted, so zero the rest. */
	{ move r3, r2; bz r2, 2f /* should be impossible, but handle it. */ }
1:	{ sb r0, zero; addi r0, r0, 1; addi r3, r3, -1 }
	{ bnzt r3, 1b }
2:	move lr, r27
	{ move r0, r2; jrp lr }

copy_to_user_fixup_loop:
	/* Try copying the rest one byte at a time, expecting a store fault. */
	{ lb_u r3, r1; addi r1, r1, 1 }
.Lctu:	{ sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
	{ bnzt r2, copy_to_user_fixup_loop }
.Lcopy_to_user_fixup_done:
	move lr, r27
	{ move r0, r2; jrp lr }

memcpy_fixup_loop:
	/* Try copying the rest one byte at a time. We expect a disastrous
	 * fault to happen since we are in fixup code, but let it happen.
	 */
	{ lb_u r3, r1; addi r1, r1, 1 }
	{ sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
	{ bnzt r2, memcpy_fixup_loop }
	/* This should be unreachable, we should have faulted again.
	 * But be paranoid and handle it in case some interrupt changed
	 * the TLB or something.
	 */
	move lr, r27
	{ move r0, r23; jrp lr }

	.size memcpy_common_fixup, . - memcpy_common_fixup

	.section __ex_table,"a"
	.word .Lcfu, .Lcopy_from_user_fixup_zero_remainder
	.word .Lctu, .Lcopy_to_user_fixup_done
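
/* These two extra entries cover the byte-at-a-time fixup loops
 * themselves: a further load fault at .Lcfu goes to the zero-remainder
 * handler (which zeroes the rest of the destination only for the
 * _zeroing variant), and a further store fault at .Lctu simply returns
 * the count of bytes still uncopied.
 */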