Diffstat (limited to 'kernel/arch/powerpc/lib')
-rw-r--r--  kernel/arch/powerpc/lib/Makefile       |   2
-rw-r--r--  kernel/arch/powerpc/lib/checksum_32.S  |  16
-rw-r--r--  kernel/arch/powerpc/lib/checksum_64.S  |  21
-rw-r--r--  kernel/arch/powerpc/lib/copy_32.S      | 120
-rw-r--r--  kernel/arch/powerpc/lib/vmx-helper.c   |  11
5 files changed, 126 insertions(+), 44 deletions(-)
diff --git a/kernel/arch/powerpc/lib/Makefile b/kernel/arch/powerpc/lib/Makefile
index 7902802a1..a47e14277 100644
--- a/kernel/arch/powerpc/lib/Makefile
+++ b/kernel/arch/powerpc/lib/Makefile
@@ -33,6 +33,6 @@ obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o
obj-$(CONFIG_FTR_FIXUP_SELFTEST) += feature-fixups-test.o
obj-$(CONFIG_ALTIVEC) += xor_vmx.o
-CFLAGS_xor_vmx.o += -maltivec -mabi=altivec
+CFLAGS_xor_vmx.o += -maltivec $(call cc-option,-mabi=altivec)
obj-$(CONFIG_PPC64) += $(obj64-y)
diff --git a/kernel/arch/powerpc/lib/checksum_32.S b/kernel/arch/powerpc/lib/checksum_32.S
index 7874e8a80..6d67e057f 100644
--- a/kernel/arch/powerpc/lib/checksum_32.S
+++ b/kernel/arch/powerpc/lib/checksum_32.S
@@ -41,22 +41,6 @@ _GLOBAL(ip_fast_csum)
blr
/*
- * Compute checksum of TCP or UDP pseudo-header:
- * csum_tcpudp_magic(saddr, daddr, len, proto, sum)
- */
-_GLOBAL(csum_tcpudp_magic)
- rlwimi r5,r6,16,0,15 /* put proto in upper half of len */
- addc r0,r3,r4 /* add 4 32-bit words together */
- adde r0,r0,r5
- adde r0,r0,r7
- addze r0,r0 /* add in final carry */
- rlwinm r3,r0,16,0,31 /* fold two halves together */
- add r3,r0,r3
- not r3,r3
- srwi r3,r3,16
- blr
-
-/*
* computes the checksum of a memory block at buff, length len,
* and adds in "sum" (32-bit)
*
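The routine removed above is the hand-coded 32-bit csum_tcpudp_magic, i.e. the ones'-complement checksum of the TCP/UDP pseudo-header (source address, destination address, protocol, length, plus a running sum). A rough C restatement of what it computes is shown below; the name, fixed-width types and host-order arithmetic are simplifications for illustration, not the generic kernel replacement.

#include <stdint.h>

/* Illustrative only: accumulate the pseudo-header in 64 bits so carries
 * are kept, then fold down to a 16-bit ones'-complement checksum. */
static uint16_t csum_tcpudp_magic_sketch(uint32_t saddr, uint32_t daddr,
                                         uint16_t len, uint8_t proto,
                                         uint32_t sum)
{
    uint64_t s = sum;

    s += saddr;
    s += daddr;
    s += ((uint32_t)proto << 16) + len;   /* proto in upper half of len */

    s = (s & 0xffffffffu) + (s >> 32);    /* fold 64 -> 32 */
    s = (s & 0xffffffffu) + (s >> 32);
    s = (s & 0xffffu) + (s >> 16);        /* fold 32 -> 16 */
    s = (s & 0xffffu) + (s >> 16);

    return (uint16_t)~s;
}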
diff --git a/kernel/arch/powerpc/lib/checksum_64.S b/kernel/arch/powerpc/lib/checksum_64.S
index 57a072065..f3ef35436 100644
--- a/kernel/arch/powerpc/lib/checksum_64.S
+++ b/kernel/arch/powerpc/lib/checksum_64.S
@@ -45,27 +45,6 @@ _GLOBAL(ip_fast_csum)
blr
/*
- * Compute checksum of TCP or UDP pseudo-header:
- * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
- * No real gain trying to do this specially for 64 bit, but
- * the 32 bit addition may spill into the upper bits of
- * the doubleword so we still must fold it down from 64.
- */
-_GLOBAL(csum_tcpudp_magic)
- rlwimi r5,r6,16,0,15 /* put proto in upper half of len */
- addc r0,r3,r4 /* add 4 32-bit words together */
- adde r0,r0,r5
- adde r0,r0,r7
- rldicl r4,r0,32,0 /* fold 64 bit value */
- add r0,r4,r0
- srdi r0,r0,32
- rlwinm r3,r0,16,0,31 /* fold two halves together */
- add r3,r0,r3
- not r3,r3
- srwi r3,r3,16
- blr
-
-/*
* Computes the checksum of a memory block at buff, length len,
* and adds in "sum" (32-bit).
*
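The comment in the removed 64-bit variant explains the one extra step it needs: the four 32-bit additions are done in a 64-bit register, so carries can spill into the upper word and the accumulator must be folded from 64 bits before the usual 32-to-16-bit fold. A minimal C sketch of that fold step, assuming a plain 64-bit accumulator:

#include <stdint.h>

/* Illustrative fold of a 64-bit ones'-complement accumulator down to 32 bits,
 * mirroring the rldicl/add/srdi sequence in the removed routine. */
static uint32_t csum_fold_from64_sketch(uint64_t sum)
{
    sum = (sum & 0xffffffffu) + (sum >> 32);  /* add upper word into lower */
    sum = (sum & 0xffffffffu) + (sum >> 32);  /* absorb a possible new carry */
    return (uint32_t)sum;
}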
diff --git a/kernel/arch/powerpc/lib/copy_32.S b/kernel/arch/powerpc/lib/copy_32.S
index 6813f80d1..c44df2dbe 100644
--- a/kernel/arch/powerpc/lib/copy_32.S
+++ b/kernel/arch/powerpc/lib/copy_32.S
@@ -69,9 +69,19 @@ CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)
+/*
+ * Use dcbz on the complete cache lines in the destination
+ * to set them to zero. This requires that the destination
+ * area is cacheable. -- paulus
+ *
+ * During early init, cache might not be active yet, so dcbz cannot be used.
+ * We therefore skip the optimised block that uses dcbz. This jump is
+ * replaced by a nop once cache is active. This is done in machine_init()
+ */
_GLOBAL(memset)
rlwimi r4,r4,8,16,23
rlwimi r4,r4,16,0,15
+
addi r6,r3,-4
cmplwi 0,r5,4
blt 7f
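The comment above says the branch at memset_nocache_branch is replaced by a nop from machine_init() once the cache is active. That patching call is not part of this diff; the sketch below only illustrates what it could look like using the kernel's patch_instruction() and PPC_INST_NOP helpers, and enable_cached_memset is a made-up name.

#include <linux/init.h>
#include <asm/code-patching.h>   /* patch_instruction() */
#include <asm/ppc-opcode.h>      /* PPC_INST_NOP */

extern unsigned int memset_nocache_branch;   /* label defined in copy_32.S */

/* Illustrative only: overwrite the "b 2f" with a nop so the dcbz path runs. */
static void __init enable_cached_memset(void)
{
    patch_instruction(&memset_nocache_branch, PPC_INST_NOP);
}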
@@ -80,7 +90,31 @@ _GLOBAL(memset)
andi. r0,r6,3
add r5,r0,r5
subf r6,r0,r6
- srwi r0,r5,2
+ cmplwi 0,r4,0
+ bne 2f /* Use normal procedure if r4 is not zero */
+_GLOBAL(memset_nocache_branch)
+ b 2f /* Skip optimised block until cache is enabled */
+
+ clrlwi r7,r6,32-LG_CACHELINE_BYTES
+ add r8,r7,r5
+ srwi r9,r8,LG_CACHELINE_BYTES
+ addic. r9,r9,-1 /* total number of complete cachelines */
+ ble 2f
+ xori r0,r7,CACHELINE_MASK & ~3
+ srwi. r0,r0,2
+ beq 3f
+ mtctr r0
+4: stwu r4,4(r6)
+ bdnz 4b
+3: mtctr r9
+ li r7,4
+10: dcbz r7,r6
+ addi r6,r6,CACHELINE_BYTES
+ bdnz 10b
+ clrlwi r5,r8,32-LG_CACHELINE_BYTES
+ addi r5,r5,4
+
+2: srwi r0,r5,2
mtctr r0
bdz 6f
1: stwu r4,4(r6)
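The fast path added to memset above only applies to a zero fill (it is skipped when r4 is non-zero): it stores word by word until the destination reaches a cache-line boundary, clears each complete cache line with dcbz, and leaves the remainder to the existing word/byte tail at label 2. The same overall shape in C (byte stores used for simplicity), assuming a 32-byte L1 line and a cacheable destination; this is a sketch, not the kernel routine.

#include <stddef.h>
#include <stdint.h>

#define CACHELINE_BYTES 32u   /* assumption; the assembly uses L1_CACHE_BYTES */

/* dcbz zeroes one whole cache line; the target must be cacheable. */
static inline void dcbz(void *p)
{
    __asm__ volatile("dcbz 0, %0" : : "r"(p) : "memory");
}

/* Illustrative zero-fill with cache-line clearing, not the kernel memset. */
static void zero_with_dcbz(void *dst, size_t len)
{
    uint8_t *p = dst;

    /* head: byte stores up to the first cache-line boundary */
    while (len && ((uintptr_t)p & (CACHELINE_BYTES - 1))) {
        *p++ = 0;
        len--;
    }
    /* body: clear complete cache lines with dcbz */
    while (len >= CACHELINE_BYTES) {
        dcbz(p);
        p += CACHELINE_BYTES;
        len -= CACHELINE_BYTES;
    }
    /* tail */
    while (len--)
        *p++ = 0;
}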
@@ -94,12 +128,96 @@ _GLOBAL(memset)
bdnz 8b
blr
+/*
+ * This version uses dcbz on the complete cache lines in the
+ * destination area to reduce memory traffic. This requires that
+ * the destination area is cacheable.
+ * We only use this version if the source and dest don't overlap.
+ * -- paulus.
+ *
+ * During early init, cache might not be active yet, so dcbz cannot be used.
+ * We therefore jump to generic_memcpy which doesn't use dcbz. This jump is
+ * replaced by a nop once cache is active. This is done in machine_init()
+ */
_GLOBAL(memmove)
cmplw 0,r3,r4
bgt backwards_memcpy
/* fall through */
_GLOBAL(memcpy)
+ b generic_memcpy
+ add r7,r3,r5 /* test if the src & dst overlap */
+ add r8,r4,r5
+ cmplw 0,r4,r7
+ cmplw 1,r3,r8
+ crand 0,0,4 /* cr0.lt &= cr1.lt */
+ blt generic_memcpy /* if regions overlap */
+
+ addi r4,r4,-4
+ addi r6,r3,-4
+ neg r0,r3
+ andi. r0,r0,CACHELINE_MASK /* # bytes to start of cache line */
+ beq 58f
+
+ cmplw 0,r5,r0 /* is this more than total to do? */
+ blt 63f /* if not much to do */
+ andi. r8,r0,3 /* get it word-aligned first */
+ subf r5,r0,r5
+ mtctr r8
+ beq+ 61f
+70: lbz r9,4(r4) /* do some bytes */
+ addi r4,r4,1
+ addi r6,r6,1
+ stb r9,3(r6)
+ bdnz 70b
+61: srwi. r0,r0,2
+ mtctr r0
+ beq 58f
+72: lwzu r9,4(r4) /* do some words */
+ stwu r9,4(r6)
+ bdnz 72b
+
+58: srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
+ clrlwi r5,r5,32-LG_CACHELINE_BYTES
+ li r11,4
+ mtctr r0
+ beq 63f
+53:
+ dcbz r11,r6
+ COPY_16_BYTES
+#if L1_CACHE_BYTES >= 32
+ COPY_16_BYTES
+#if L1_CACHE_BYTES >= 64
+ COPY_16_BYTES
+ COPY_16_BYTES
+#if L1_CACHE_BYTES >= 128
+ COPY_16_BYTES
+ COPY_16_BYTES
+ COPY_16_BYTES
+ COPY_16_BYTES
+#endif
+#endif
+#endif
+ bdnz 53b
+
+63: srwi. r0,r5,2
+ mtctr r0
+ beq 64f
+30: lwzu r0,4(r4)
+ stwu r0,4(r6)
+ bdnz 30b
+
+64: andi. r0,r5,3
+ mtctr r0
+ beq+ 65f
+ addi r4,r4,3
+ addi r6,r6,3
+40: lbzu r0,1(r4)
+ stbu r0,1(r6)
+ bdnz 40b
+65: blr
+
+_GLOBAL(generic_memcpy)
srwi. r7,r5,3
addi r6,r3,-4
addi r4,r4,-4
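The new memcpy above opens with a branch to generic_memcpy (which the comment says is turned into a nop by machine_init() once the cache is up) and then falls back to generic_memcpy whenever the source and destination ranges overlap, since the dcbz loop would clobber source bytes that have not been copied yet. The overlap test is restated in C below; both functions are illustrative stand-ins, not the kernel routines.

#include <stddef.h>
#include <stdint.h>

/* Plain byte copy, standing in for the assembly generic_memcpy. */
static void *generic_memcpy_sketch(void *dst, const void *src, size_t len)
{
    uint8_t *d = dst;
    const uint8_t *s = src;

    while (len--)
        *d++ = *s++;
    return dst;
}

/* Mirrors the cmplw/cmplw/crand test in the diff:
 * overlap iff src < dst + len and dst < src + len. */
static int regions_overlap(const void *dst, const void *src, size_t len)
{
    uintptr_t d = (uintptr_t)dst, s = (uintptr_t)src;

    return s < d + len && d < s + len;
}

In the assembly, the two unsigned compares (cmplw) feed crand, and a single blt then sends overlapping copies to generic_memcpy before the dcbz loop is ever reached.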
diff --git a/kernel/arch/powerpc/lib/vmx-helper.c b/kernel/arch/powerpc/lib/vmx-helper.c
index 3cf529cee..ac93a3bd2 100644
--- a/kernel/arch/powerpc/lib/vmx-helper.c
+++ b/kernel/arch/powerpc/lib/vmx-helper.c
@@ -27,11 +27,11 @@ int enter_vmx_usercopy(void)
if (in_interrupt())
return 0;
- /* This acts as preempt_disable() as well and will make
- * enable_kernel_altivec(). We need to disable page faults
- * as they can call schedule and thus make us lose the VMX
- * context. So on page faults, we just fail which will cause
- * a fallback to the normal non-vmx copy.
+ preempt_disable();
+ /*
+ * We need to disable page faults as they can call schedule and
+ * thus make us lose the VMX context. So on page faults, we just
+ * fail which will cause a fallback to the normal non-vmx copy.
*/
pagefault_disable();
@@ -47,6 +47,7 @@ int enter_vmx_usercopy(void)
int exit_vmx_usercopy(void)
{
pagefault_enable();
+ preempt_enable();
return 0;
}
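The removed comment relied on pagefault_disable() also acting as preempt_disable(); the change makes the preemption handling explicit, and exit_vmx_usercopy() now re-enables page faults and then preemption. A hedged sketch of how a caller pairs the two helpers is below; vmx_copy_sketch and plain_copy are hypothetical, only the two prototypes come from vmx-helper.c.

/* Prototypes as exported by arch/powerpc/lib/vmx-helper.c. */
int enter_vmx_usercopy(void);
int exit_vmx_usercopy(void);

/* Hypothetical caller showing the pairing; the real callers are the
 * assembly usercopy routines, not C code like this. */
static unsigned long vmx_copy_sketch(void *dst, const void *src, unsigned long len,
                                     unsigned long (*plain_copy)(void *, const void *,
                                                                 unsigned long))
{
    unsigned long ret;

    if (!enter_vmx_usercopy())          /* e.g. in interrupt: no VMX */
        return plain_copy(dst, src, len);

    /* preemption and page faults are disabled here; an Altivec copy
     * would run, and any faulting access simply fails */
    ret = plain_copy(dst, src, len);    /* stand-in for the VMX copy body */

    exit_vmx_usercopy();                /* pagefault_enable(), then preempt_enable() */
    return ret;
}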