diff options
Diffstat (limited to 'kernel/arch/arm64/lib')
-rw-r--r-- | kernel/arch/arm64/lib/Makefile | 13 | ||||
-rw-r--r-- | kernel/arch/arm64/lib/atomic_ll_sc.c | 3 | ||||
-rw-r--r-- | kernel/arch/arm64/lib/bitops.S | 45 | ||||
-rw-r--r-- | kernel/arch/arm64/lib/clear_user.S | 8 | ||||
-rw-r--r-- | kernel/arch/arm64/lib/copy_from_user.S | 81 | ||||
-rw-r--r-- | kernel/arch/arm64/lib/copy_in_user.S | 70 | ||||
-rw-r--r-- | kernel/arch/arm64/lib/copy_template.S | 193 | ||||
-rw-r--r-- | kernel/arch/arm64/lib/copy_to_user.S | 70 | ||||
-rw-r--r-- | kernel/arch/arm64/lib/memchr.S | 2 | ||||
-rw-r--r-- | kernel/arch/arm64/lib/memcmp.S | 2 | ||||
-rw-r--r-- | kernel/arch/arm64/lib/memcpy.S | 184 | ||||
-rw-r--r-- | kernel/arch/arm64/lib/memmove.S | 9 | ||||
-rw-r--r-- | kernel/arch/arm64/lib/memset.S | 5 | ||||
-rw-r--r-- | kernel/arch/arm64/lib/strcmp.S | 2 | ||||
-rw-r--r-- | kernel/arch/arm64/lib/strlen.S | 2 | ||||
-rw-r--r-- | kernel/arch/arm64/lib/strncmp.S | 2 |
16 files changed, 432 insertions, 259 deletions
diff --git a/kernel/arch/arm64/lib/Makefile b/kernel/arch/arm64/lib/Makefile index d98d3e398..1a811ecf7 100644 --- a/kernel/arch/arm64/lib/Makefile +++ b/kernel/arch/arm64/lib/Makefile @@ -3,3 +3,16 @@ lib-y := bitops.o clear_user.o delay.o copy_from_user.o \ clear_page.o memchr.o memcpy.o memmove.o memset.o \ memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \ strchr.o strrchr.o + +# Tell the compiler to treat all general purpose registers as +# callee-saved, which allows for efficient runtime patching of the bl +# instruction in the caller with an atomic instruction when supported by +# the CPU. Result and argument registers are handled correctly, based on +# the function prototype. +lib-$(CONFIG_ARM64_LSE_ATOMICS) += atomic_ll_sc.o +CFLAGS_atomic_ll_sc.o := -fcall-used-x0 -ffixed-x1 -ffixed-x2 \ + -ffixed-x3 -ffixed-x4 -ffixed-x5 -ffixed-x6 \ + -ffixed-x7 -fcall-saved-x8 -fcall-saved-x9 \ + -fcall-saved-x10 -fcall-saved-x11 -fcall-saved-x12 \ + -fcall-saved-x13 -fcall-saved-x14 -fcall-saved-x15 \ + -fcall-saved-x16 -fcall-saved-x17 -fcall-saved-x18 diff --git a/kernel/arch/arm64/lib/atomic_ll_sc.c b/kernel/arch/arm64/lib/atomic_ll_sc.c new file mode 100644 index 000000000..b0c538b0d --- /dev/null +++ b/kernel/arch/arm64/lib/atomic_ll_sc.c @@ -0,0 +1,3 @@ +#include <asm/atomic.h> +#define __ARM64_IN_ATOMIC_IMPL +#include <asm/atomic_ll_sc.h> diff --git a/kernel/arch/arm64/lib/bitops.S b/kernel/arch/arm64/lib/bitops.S index 7dac371cc..43ac736ba 100644 --- a/kernel/arch/arm64/lib/bitops.S +++ b/kernel/arch/arm64/lib/bitops.S @@ -18,52 +18,59 @@ #include <linux/linkage.h> #include <asm/assembler.h> +#include <asm/lse.h> /* * x0: bits 5:0 bit offset * bits 31:6 word offset * x1: address */ - .macro bitop, name, instr + .macro bitop, name, llsc, lse ENTRY( \name ) and w3, w0, #63 // Get bit offset eor w0, w0, w3 // Clear low bits mov x2, #1 add x1, x1, x0, lsr #3 // Get word offset +alt_lse " prfm pstl1strm, [x1]", "nop" lsl x3, x2, x3 // Create mask -1: ldxr x2, [x1] - \instr x2, x2, x3 - stxr w0, x2, [x1] - cbnz w0, 1b + +alt_lse "1: ldxr x2, [x1]", "\lse x3, [x1]" +alt_lse " \llsc x2, x2, x3", "nop" +alt_lse " stxr w0, x2, [x1]", "nop" +alt_lse " cbnz w0, 1b", "nop" + ret ENDPROC(\name ) .endm - .macro testop, name, instr + .macro testop, name, llsc, lse ENTRY( \name ) and w3, w0, #63 // Get bit offset eor w0, w0, w3 // Clear low bits mov x2, #1 add x1, x1, x0, lsr #3 // Get word offset +alt_lse " prfm pstl1strm, [x1]", "nop" lsl x4, x2, x3 // Create mask -1: ldxr x2, [x1] - lsr x0, x2, x3 // Save old value of bit - \instr x2, x2, x4 // toggle bit - stlxr w5, x2, [x1] - cbnz w5, 1b - dmb ish + +alt_lse "1: ldxr x2, [x1]", "\lse x4, x2, [x1]" + lsr x0, x2, x3 +alt_lse " \llsc x2, x2, x4", "nop" +alt_lse " stlxr w5, x2, [x1]", "nop" +alt_lse " cbnz w5, 1b", "nop" +alt_lse " dmb ish", "nop" + and x0, x0, #1 -3: ret + ret ENDPROC(\name ) .endm /* * Atomic bit operations. */ - bitop change_bit, eor - bitop clear_bit, bic - bitop set_bit, orr + bitop change_bit, eor, steor + bitop clear_bit, bic, stclr + bitop set_bit, orr, stset - testop test_and_change_bit, eor - testop test_and_clear_bit, bic - testop test_and_set_bit, orr + testop test_and_change_bit, eor, ldeoral + testop test_and_clear_bit, bic, ldclral + testop test_and_set_bit, orr, ldsetal diff --git a/kernel/arch/arm64/lib/clear_user.S b/kernel/arch/arm64/lib/clear_user.S index c17967fdf..a9723c71c 100644 --- a/kernel/arch/arm64/lib/clear_user.S +++ b/kernel/arch/arm64/lib/clear_user.S @@ -16,7 +16,11 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #include <linux/linkage.h> + +#include <asm/alternative.h> #include <asm/assembler.h> +#include <asm/cpufeature.h> +#include <asm/sysreg.h> .text @@ -29,6 +33,8 @@ * Alignment fixed up by hardware. */ ENTRY(__clear_user) +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ + CONFIG_ARM64_PAN) mov x2, x1 // save the size for fixup return subs x1, x1, #8 b.mi 2f @@ -48,6 +54,8 @@ USER(9f, strh wzr, [x0], #2 ) b.mi 5f USER(9f, strb wzr, [x0] ) 5: mov x0, #0 +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ + CONFIG_ARM64_PAN) ret ENDPROC(__clear_user) diff --git a/kernel/arch/arm64/lib/copy_from_user.S b/kernel/arch/arm64/lib/copy_from_user.S index 5e27add9d..4699cd74f 100644 --- a/kernel/arch/arm64/lib/copy_from_user.S +++ b/kernel/arch/arm64/lib/copy_from_user.S @@ -15,7 +15,12 @@ */ #include <linux/linkage.h> + +#include <asm/alternative.h> #include <asm/assembler.h> +#include <asm/cache.h> +#include <asm/cpufeature.h> +#include <asm/sysreg.h> /* * Copy from user space to a kernel buffer (alignment handled by the hardware) @@ -27,40 +32,58 @@ * Returns: * x0 - bytes not copied */ + + .macro ldrb1 ptr, regB, val + USER(9998f, ldrb \ptr, [\regB], \val) + .endm + + .macro strb1 ptr, regB, val + strb \ptr, [\regB], \val + .endm + + .macro ldrh1 ptr, regB, val + USER(9998f, ldrh \ptr, [\regB], \val) + .endm + + .macro strh1 ptr, regB, val + strh \ptr, [\regB], \val + .endm + + .macro ldr1 ptr, regB, val + USER(9998f, ldr \ptr, [\regB], \val) + .endm + + .macro str1 ptr, regB, val + str \ptr, [\regB], \val + .endm + + .macro ldp1 ptr, regB, regC, val + USER(9998f, ldp \ptr, \regB, [\regC], \val) + .endm + + .macro stp1 ptr, regB, regC, val + stp \ptr, \regB, [\regC], \val + .endm + +end .req x5 ENTRY(__copy_from_user) - add x4, x1, x2 // upper user buffer boundary - subs x2, x2, #8 - b.mi 2f -1: -USER(9f, ldr x3, [x1], #8 ) - subs x2, x2, #8 - str x3, [x0], #8 - b.pl 1b -2: adds x2, x2, #4 - b.mi 3f -USER(9f, ldr w3, [x1], #4 ) - sub x2, x2, #4 - str w3, [x0], #4 -3: adds x2, x2, #2 - b.mi 4f -USER(9f, ldrh w3, [x1], #2 ) - sub x2, x2, #2 - strh w3, [x0], #2 -4: adds x2, x2, #1 - b.mi 5f -USER(9f, ldrb w3, [x1] ) - strb w3, [x0] -5: mov x0, #0 +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ + CONFIG_ARM64_PAN) + add end, x0, x2 +#include "copy_template.S" +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ + CONFIG_ARM64_PAN) + mov x0, #0 // Nothing to copy ret ENDPROC(__copy_from_user) .section .fixup,"ax" .align 2 -9: sub x2, x4, x1 - mov x3, x2 -10: strb wzr, [x0], #1 // zero remaining buffer space - subs x3, x3, #1 - b.ne 10b - mov x0, x2 // bytes not copied +9998: + sub x0, end, dst +9999: + strb wzr, [dst], #1 // zero remaining buffer space + cmp dst, end + b.lo 9999b ret .previous diff --git a/kernel/arch/arm64/lib/copy_in_user.S b/kernel/arch/arm64/lib/copy_in_user.S index 84b6c9bb9..81c8fc93c 100644 --- a/kernel/arch/arm64/lib/copy_in_user.S +++ b/kernel/arch/arm64/lib/copy_in_user.S @@ -17,7 +17,12 @@ */ #include <linux/linkage.h> + +#include <asm/alternative.h> #include <asm/assembler.h> +#include <asm/cache.h> +#include <asm/cpufeature.h> +#include <asm/sysreg.h> /* * Copy from user space to user space (alignment handled by the hardware) @@ -29,35 +34,52 @@ * Returns: * x0 - bytes not copied */ + .macro ldrb1 ptr, regB, val + USER(9998f, ldrb \ptr, [\regB], \val) + .endm + + .macro strb1 ptr, regB, val + USER(9998f, strb \ptr, [\regB], \val) + .endm + + .macro ldrh1 ptr, regB, val + USER(9998f, ldrh \ptr, [\regB], \val) + .endm + + .macro strh1 ptr, regB, val + USER(9998f, strh \ptr, [\regB], \val) + .endm + + .macro ldr1 ptr, regB, val + USER(9998f, ldr \ptr, [\regB], \val) + .endm + + .macro str1 ptr, regB, val + USER(9998f, str \ptr, [\regB], \val) + .endm + + .macro ldp1 ptr, regB, regC, val + USER(9998f, ldp \ptr, \regB, [\regC], \val) + .endm + + .macro stp1 ptr, regB, regC, val + USER(9998f, stp \ptr, \regB, [\regC], \val) + .endm + +end .req x5 ENTRY(__copy_in_user) - add x4, x0, x2 // upper user buffer boundary - subs x2, x2, #8 - b.mi 2f -1: -USER(9f, ldr x3, [x1], #8 ) - subs x2, x2, #8 -USER(9f, str x3, [x0], #8 ) - b.pl 1b -2: adds x2, x2, #4 - b.mi 3f -USER(9f, ldr w3, [x1], #4 ) - sub x2, x2, #4 -USER(9f, str w3, [x0], #4 ) -3: adds x2, x2, #2 - b.mi 4f -USER(9f, ldrh w3, [x1], #2 ) - sub x2, x2, #2 -USER(9f, strh w3, [x0], #2 ) -4: adds x2, x2, #1 - b.mi 5f -USER(9f, ldrb w3, [x1] ) -USER(9f, strb w3, [x0] ) -5: mov x0, #0 +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ + CONFIG_ARM64_PAN) + add end, x0, x2 +#include "copy_template.S" +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ + CONFIG_ARM64_PAN) + mov x0, #0 ret ENDPROC(__copy_in_user) .section .fixup,"ax" .align 2 -9: sub x0, x4, x0 // bytes not copied +9998: sub x0, end, dst // bytes not copied ret .previous diff --git a/kernel/arch/arm64/lib/copy_template.S b/kernel/arch/arm64/lib/copy_template.S new file mode 100644 index 000000000..410fbdb81 --- /dev/null +++ b/kernel/arch/arm64/lib/copy_template.S @@ -0,0 +1,193 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + + +/* + * Copy a buffer from src to dest (alignment handled by the hardware) + * + * Parameters: + * x0 - dest + * x1 - src + * x2 - n + * Returns: + * x0 - dest + */ +dstin .req x0 +src .req x1 +count .req x2 +tmp1 .req x3 +tmp1w .req w3 +tmp2 .req x4 +tmp2w .req w4 +dst .req x6 + +A_l .req x7 +A_h .req x8 +B_l .req x9 +B_h .req x10 +C_l .req x11 +C_h .req x12 +D_l .req x13 +D_h .req x14 + + mov dst, dstin + cmp count, #16 + /*When memory length is less than 16, the accessed are not aligned.*/ + b.lo .Ltiny15 + + neg tmp2, src + ands tmp2, tmp2, #15/* Bytes to reach alignment. */ + b.eq .LSrcAligned + sub count, count, tmp2 + /* + * Copy the leading memory data from src to dst in an increasing + * address order.By this way,the risk of overwritting the source + * memory data is eliminated when the distance between src and + * dst is less than 16. The memory accesses here are alignment. + */ + tbz tmp2, #0, 1f + ldrb1 tmp1w, src, #1 + strb1 tmp1w, dst, #1 +1: + tbz tmp2, #1, 2f + ldrh1 tmp1w, src, #2 + strh1 tmp1w, dst, #2 +2: + tbz tmp2, #2, 3f + ldr1 tmp1w, src, #4 + str1 tmp1w, dst, #4 +3: + tbz tmp2, #3, .LSrcAligned + ldr1 tmp1, src, #8 + str1 tmp1, dst, #8 + +.LSrcAligned: + cmp count, #64 + b.ge .Lcpy_over64 + /* + * Deal with small copies quickly by dropping straight into the + * exit block. + */ +.Ltail63: + /* + * Copy up to 48 bytes of data. At this point we only need the + * bottom 6 bits of count to be accurate. + */ + ands tmp1, count, #0x30 + b.eq .Ltiny15 + cmp tmp1w, #0x20 + b.eq 1f + b.lt 2f + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 +1: + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 +2: + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 +.Ltiny15: + /* + * Prefer to break one ldp/stp into several load/store to access + * memory in an increasing address order,rather than to load/store 16 + * bytes from (src-16) to (dst-16) and to backward the src to aligned + * address,which way is used in original cortex memcpy. If keeping + * the original memcpy process here, memmove need to satisfy the + * precondition that src address is at least 16 bytes bigger than dst + * address,otherwise some source data will be overwritten when memove + * call memcpy directly. To make memmove simpler and decouple the + * memcpy's dependency on memmove, withdrew the original process. + */ + tbz count, #3, 1f + ldr1 tmp1, src, #8 + str1 tmp1, dst, #8 +1: + tbz count, #2, 2f + ldr1 tmp1w, src, #4 + str1 tmp1w, dst, #4 +2: + tbz count, #1, 3f + ldrh1 tmp1w, src, #2 + strh1 tmp1w, dst, #2 +3: + tbz count, #0, .Lexitfunc + ldrb1 tmp1w, src, #1 + strb1 tmp1w, dst, #1 + + b .Lexitfunc + +.Lcpy_over64: + subs count, count, #128 + b.ge .Lcpy_body_large + /* + * Less than 128 bytes to copy, so handle 64 here and then jump + * to the tail. + */ + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 + ldp1 B_l, B_h, src, #16 + ldp1 C_l, C_h, src, #16 + stp1 B_l, B_h, dst, #16 + stp1 C_l, C_h, dst, #16 + ldp1 D_l, D_h, src, #16 + stp1 D_l, D_h, dst, #16 + + tst count, #0x3f + b.ne .Ltail63 + b .Lexitfunc + + /* + * Critical loop. Start at a new cache line boundary. Assuming + * 64 bytes per line this ensures the entire loop is in one line. + */ + .p2align L1_CACHE_SHIFT +.Lcpy_body_large: + /* pre-get 64 bytes data. */ + ldp1 A_l, A_h, src, #16 + ldp1 B_l, B_h, src, #16 + ldp1 C_l, C_h, src, #16 + ldp1 D_l, D_h, src, #16 +1: + /* + * interlace the load of next 64 bytes data block with store of the last + * loaded 64 bytes data. + */ + stp1 A_l, A_h, dst, #16 + ldp1 A_l, A_h, src, #16 + stp1 B_l, B_h, dst, #16 + ldp1 B_l, B_h, src, #16 + stp1 C_l, C_h, dst, #16 + ldp1 C_l, C_h, src, #16 + stp1 D_l, D_h, dst, #16 + ldp1 D_l, D_h, src, #16 + subs count, count, #64 + b.ge 1b + stp1 A_l, A_h, dst, #16 + stp1 B_l, B_h, dst, #16 + stp1 C_l, C_h, dst, #16 + stp1 D_l, D_h, dst, #16 + + tst count, #0x3f + b.ne .Ltail63 +.Lexitfunc: diff --git a/kernel/arch/arm64/lib/copy_to_user.S b/kernel/arch/arm64/lib/copy_to_user.S index a0aeeb9b7..7512bbbc0 100644 --- a/kernel/arch/arm64/lib/copy_to_user.S +++ b/kernel/arch/arm64/lib/copy_to_user.S @@ -15,7 +15,12 @@ */ #include <linux/linkage.h> + +#include <asm/alternative.h> #include <asm/assembler.h> +#include <asm/cache.h> +#include <asm/cpufeature.h> +#include <asm/sysreg.h> /* * Copy to user space from a kernel buffer (alignment handled by the hardware) @@ -27,35 +32,52 @@ * Returns: * x0 - bytes not copied */ + .macro ldrb1 ptr, regB, val + ldrb \ptr, [\regB], \val + .endm + + .macro strb1 ptr, regB, val + USER(9998f, strb \ptr, [\regB], \val) + .endm + + .macro ldrh1 ptr, regB, val + ldrh \ptr, [\regB], \val + .endm + + .macro strh1 ptr, regB, val + USER(9998f, strh \ptr, [\regB], \val) + .endm + + .macro ldr1 ptr, regB, val + ldr \ptr, [\regB], \val + .endm + + .macro str1 ptr, regB, val + USER(9998f, str \ptr, [\regB], \val) + .endm + + .macro ldp1 ptr, regB, regC, val + ldp \ptr, \regB, [\regC], \val + .endm + + .macro stp1 ptr, regB, regC, val + USER(9998f, stp \ptr, \regB, [\regC], \val) + .endm + +end .req x5 ENTRY(__copy_to_user) - add x4, x0, x2 // upper user buffer boundary - subs x2, x2, #8 - b.mi 2f -1: - ldr x3, [x1], #8 - subs x2, x2, #8 -USER(9f, str x3, [x0], #8 ) - b.pl 1b -2: adds x2, x2, #4 - b.mi 3f - ldr w3, [x1], #4 - sub x2, x2, #4 -USER(9f, str w3, [x0], #4 ) -3: adds x2, x2, #2 - b.mi 4f - ldrh w3, [x1], #2 - sub x2, x2, #2 -USER(9f, strh w3, [x0], #2 ) -4: adds x2, x2, #1 - b.mi 5f - ldrb w3, [x1] -USER(9f, strb w3, [x0] ) -5: mov x0, #0 +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ + CONFIG_ARM64_PAN) + add end, x0, x2 +#include "copy_template.S" +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ + CONFIG_ARM64_PAN) + mov x0, #0 ret ENDPROC(__copy_to_user) .section .fixup,"ax" .align 2 -9: sub x0, x4, x0 // bytes not copied +9998: sub x0, end, dst // bytes not copied ret .previous diff --git a/kernel/arch/arm64/lib/memchr.S b/kernel/arch/arm64/lib/memchr.S index 8636b7549..4444c1d25 100644 --- a/kernel/arch/arm64/lib/memchr.S +++ b/kernel/arch/arm64/lib/memchr.S @@ -41,4 +41,4 @@ ENTRY(memchr) ret 2: mov x0, #0 ret -ENDPROC(memchr) +ENDPIPROC(memchr) diff --git a/kernel/arch/arm64/lib/memcmp.S b/kernel/arch/arm64/lib/memcmp.S index 6ea0776ba..ffbdec003 100644 --- a/kernel/arch/arm64/lib/memcmp.S +++ b/kernel/arch/arm64/lib/memcmp.S @@ -255,4 +255,4 @@ CPU_LE( rev data2, data2 ) .Lret0: mov result, #0 ret -ENDPROC(memcmp) +ENDPIPROC(memcmp) diff --git a/kernel/arch/arm64/lib/memcpy.S b/kernel/arch/arm64/lib/memcpy.S index 8a9a96d3d..676139377 100644 --- a/kernel/arch/arm64/lib/memcpy.S +++ b/kernel/arch/arm64/lib/memcpy.S @@ -36,166 +36,42 @@ * Returns: * x0 - dest */ -dstin .req x0 -src .req x1 -count .req x2 -tmp1 .req x3 -tmp1w .req w3 -tmp2 .req x4 -tmp2w .req w4 -tmp3 .req x5 -tmp3w .req w5 -dst .req x6 + .macro ldrb1 ptr, regB, val + ldrb \ptr, [\regB], \val + .endm -A_l .req x7 -A_h .req x8 -B_l .req x9 -B_h .req x10 -C_l .req x11 -C_h .req x12 -D_l .req x13 -D_h .req x14 + .macro strb1 ptr, regB, val + strb \ptr, [\regB], \val + .endm -ENTRY(memcpy) - mov dst, dstin - cmp count, #16 - /*When memory length is less than 16, the accessed are not aligned.*/ - b.lo .Ltiny15 + .macro ldrh1 ptr, regB, val + ldrh \ptr, [\regB], \val + .endm - neg tmp2, src - ands tmp2, tmp2, #15/* Bytes to reach alignment. */ - b.eq .LSrcAligned - sub count, count, tmp2 - /* - * Copy the leading memory data from src to dst in an increasing - * address order.By this way,the risk of overwritting the source - * memory data is eliminated when the distance between src and - * dst is less than 16. The memory accesses here are alignment. - */ - tbz tmp2, #0, 1f - ldrb tmp1w, [src], #1 - strb tmp1w, [dst], #1 -1: - tbz tmp2, #1, 2f - ldrh tmp1w, [src], #2 - strh tmp1w, [dst], #2 -2: - tbz tmp2, #2, 3f - ldr tmp1w, [src], #4 - str tmp1w, [dst], #4 -3: - tbz tmp2, #3, .LSrcAligned - ldr tmp1, [src],#8 - str tmp1, [dst],#8 + .macro strh1 ptr, regB, val + strh \ptr, [\regB], \val + .endm -.LSrcAligned: - cmp count, #64 - b.ge .Lcpy_over64 - /* - * Deal with small copies quickly by dropping straight into the - * exit block. - */ -.Ltail63: - /* - * Copy up to 48 bytes of data. At this point we only need the - * bottom 6 bits of count to be accurate. - */ - ands tmp1, count, #0x30 - b.eq .Ltiny15 - cmp tmp1w, #0x20 - b.eq 1f - b.lt 2f - ldp A_l, A_h, [src], #16 - stp A_l, A_h, [dst], #16 -1: - ldp A_l, A_h, [src], #16 - stp A_l, A_h, [dst], #16 -2: - ldp A_l, A_h, [src], #16 - stp A_l, A_h, [dst], #16 -.Ltiny15: - /* - * Prefer to break one ldp/stp into several load/store to access - * memory in an increasing address order,rather than to load/store 16 - * bytes from (src-16) to (dst-16) and to backward the src to aligned - * address,which way is used in original cortex memcpy. If keeping - * the original memcpy process here, memmove need to satisfy the - * precondition that src address is at least 16 bytes bigger than dst - * address,otherwise some source data will be overwritten when memove - * call memcpy directly. To make memmove simpler and decouple the - * memcpy's dependency on memmove, withdrew the original process. - */ - tbz count, #3, 1f - ldr tmp1, [src], #8 - str tmp1, [dst], #8 -1: - tbz count, #2, 2f - ldr tmp1w, [src], #4 - str tmp1w, [dst], #4 -2: - tbz count, #1, 3f - ldrh tmp1w, [src], #2 - strh tmp1w, [dst], #2 -3: - tbz count, #0, .Lexitfunc - ldrb tmp1w, [src] - strb tmp1w, [dst] + .macro ldr1 ptr, regB, val + ldr \ptr, [\regB], \val + .endm -.Lexitfunc: - ret + .macro str1 ptr, regB, val + str \ptr, [\regB], \val + .endm -.Lcpy_over64: - subs count, count, #128 - b.ge .Lcpy_body_large - /* - * Less than 128 bytes to copy, so handle 64 here and then jump - * to the tail. - */ - ldp A_l, A_h, [src],#16 - stp A_l, A_h, [dst],#16 - ldp B_l, B_h, [src],#16 - ldp C_l, C_h, [src],#16 - stp B_l, B_h, [dst],#16 - stp C_l, C_h, [dst],#16 - ldp D_l, D_h, [src],#16 - stp D_l, D_h, [dst],#16 + .macro ldp1 ptr, regB, regC, val + ldp \ptr, \regB, [\regC], \val + .endm - tst count, #0x3f - b.ne .Ltail63 - ret + .macro stp1 ptr, regB, regC, val + stp \ptr, \regB, [\regC], \val + .endm - /* - * Critical loop. Start at a new cache line boundary. Assuming - * 64 bytes per line this ensures the entire loop is in one line. - */ - .p2align L1_CACHE_SHIFT -.Lcpy_body_large: - /* pre-get 64 bytes data. */ - ldp A_l, A_h, [src],#16 - ldp B_l, B_h, [src],#16 - ldp C_l, C_h, [src],#16 - ldp D_l, D_h, [src],#16 -1: - /* - * interlace the load of next 64 bytes data block with store of the last - * loaded 64 bytes data. - */ - stp A_l, A_h, [dst],#16 - ldp A_l, A_h, [src],#16 - stp B_l, B_h, [dst],#16 - ldp B_l, B_h, [src],#16 - stp C_l, C_h, [dst],#16 - ldp C_l, C_h, [src],#16 - stp D_l, D_h, [dst],#16 - ldp D_l, D_h, [src],#16 - subs count, count, #64 - b.ge 1b - stp A_l, A_h, [dst],#16 - stp B_l, B_h, [dst],#16 - stp C_l, C_h, [dst],#16 - stp D_l, D_h, [dst],#16 - - tst count, #0x3f - b.ne .Ltail63 + .weak memcpy +ENTRY(__memcpy) +ENTRY(memcpy) +#include "copy_template.S" ret -ENDPROC(memcpy) +ENDPIPROC(memcpy) +ENDPROC(__memcpy) diff --git a/kernel/arch/arm64/lib/memmove.S b/kernel/arch/arm64/lib/memmove.S index 57b19ea2d..a5a445901 100644 --- a/kernel/arch/arm64/lib/memmove.S +++ b/kernel/arch/arm64/lib/memmove.S @@ -57,12 +57,14 @@ C_h .req x12 D_l .req x13 D_h .req x14 + .weak memmove +ENTRY(__memmove) ENTRY(memmove) cmp dstin, src - b.lo memcpy + b.lo __memcpy add tmp1, src, count cmp dstin, tmp1 - b.hs memcpy /* No overlap. */ + b.hs __memcpy /* No overlap. */ add dst, dstin, count add src, src, count @@ -194,4 +196,5 @@ ENTRY(memmove) tst count, #0x3f b.ne .Ltail63 ret -ENDPROC(memmove) +ENDPIPROC(memmove) +ENDPROC(__memmove) diff --git a/kernel/arch/arm64/lib/memset.S b/kernel/arch/arm64/lib/memset.S index 7c72dfd36..f2670a9f2 100644 --- a/kernel/arch/arm64/lib/memset.S +++ b/kernel/arch/arm64/lib/memset.S @@ -54,6 +54,8 @@ dst .req x8 tmp3w .req w9 tmp3 .req x9 + .weak memset +ENTRY(__memset) ENTRY(memset) mov dst, dstin /* Preserve return value. */ and A_lw, val, #255 @@ -213,4 +215,5 @@ ENTRY(memset) ands count, count, zva_bits_x b.ne .Ltail_maybe_long ret -ENDPROC(memset) +ENDPIPROC(memset) +ENDPROC(__memset) diff --git a/kernel/arch/arm64/lib/strcmp.S b/kernel/arch/arm64/lib/strcmp.S index 42f828b06..471fe6176 100644 --- a/kernel/arch/arm64/lib/strcmp.S +++ b/kernel/arch/arm64/lib/strcmp.S @@ -231,4 +231,4 @@ CPU_BE( orr syndrome, diff, has_nul ) lsr data1, data1, #56 sub result, data1, data2, lsr #56 ret -ENDPROC(strcmp) +ENDPIPROC(strcmp) diff --git a/kernel/arch/arm64/lib/strlen.S b/kernel/arch/arm64/lib/strlen.S index 987b68b9c..55ccc8e24 100644 --- a/kernel/arch/arm64/lib/strlen.S +++ b/kernel/arch/arm64/lib/strlen.S @@ -123,4 +123,4 @@ CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ csinv data1, data1, xzr, le csel data2, data2, data2a, le b .Lrealigned -ENDPROC(strlen) +ENDPIPROC(strlen) diff --git a/kernel/arch/arm64/lib/strncmp.S b/kernel/arch/arm64/lib/strncmp.S index 0224cf5a5..e26704476 100644 --- a/kernel/arch/arm64/lib/strncmp.S +++ b/kernel/arch/arm64/lib/strncmp.S @@ -307,4 +307,4 @@ CPU_BE( orr syndrome, diff, has_nul ) .Lret0: mov result, #0 ret -ENDPROC(strncmp) +ENDPIPROC(strncmp) |