diff options
Diffstat (limited to 'kernel/arch/powerpc/lib')
-rw-r--r-- | kernel/arch/powerpc/lib/Makefile | 2 | ||||
-rw-r--r-- | kernel/arch/powerpc/lib/checksum_32.S | 16 | ||||
-rw-r--r-- | kernel/arch/powerpc/lib/checksum_64.S | 21 | ||||
-rw-r--r-- | kernel/arch/powerpc/lib/copy_32.S | 120 | ||||
-rw-r--r-- | kernel/arch/powerpc/lib/vmx-helper.c | 11 |
5 files changed, 126 insertions, 44 deletions
diff --git a/kernel/arch/powerpc/lib/Makefile b/kernel/arch/powerpc/lib/Makefile index 7902802a1..a47e14277 100644 --- a/kernel/arch/powerpc/lib/Makefile +++ b/kernel/arch/powerpc/lib/Makefile @@ -33,6 +33,6 @@ obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o obj-$(CONFIG_FTR_FIXUP_SELFTEST) += feature-fixups-test.o obj-$(CONFIG_ALTIVEC) += xor_vmx.o -CFLAGS_xor_vmx.o += -maltivec -mabi=altivec +CFLAGS_xor_vmx.o += -maltivec $(call cc-option,-mabi=altivec) obj-$(CONFIG_PPC64) += $(obj64-y) diff --git a/kernel/arch/powerpc/lib/checksum_32.S b/kernel/arch/powerpc/lib/checksum_32.S index 7874e8a80..6d67e057f 100644 --- a/kernel/arch/powerpc/lib/checksum_32.S +++ b/kernel/arch/powerpc/lib/checksum_32.S @@ -41,22 +41,6 @@ _GLOBAL(ip_fast_csum) blr /* - * Compute checksum of TCP or UDP pseudo-header: - * csum_tcpudp_magic(saddr, daddr, len, proto, sum) - */ -_GLOBAL(csum_tcpudp_magic) - rlwimi r5,r6,16,0,15 /* put proto in upper half of len */ - addc r0,r3,r4 /* add 4 32-bit words together */ - adde r0,r0,r5 - adde r0,r0,r7 - addze r0,r0 /* add in final carry */ - rlwinm r3,r0,16,0,31 /* fold two halves together */ - add r3,r0,r3 - not r3,r3 - srwi r3,r3,16 - blr - -/* * computes the checksum of a memory block at buff, length len, * and adds in "sum" (32-bit) * diff --git a/kernel/arch/powerpc/lib/checksum_64.S b/kernel/arch/powerpc/lib/checksum_64.S index 57a072065..f3ef35436 100644 --- a/kernel/arch/powerpc/lib/checksum_64.S +++ b/kernel/arch/powerpc/lib/checksum_64.S @@ -45,27 +45,6 @@ _GLOBAL(ip_fast_csum) blr /* - * Compute checksum of TCP or UDP pseudo-header: - * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum) - * No real gain trying to do this specially for 64 bit, but - * the 32 bit addition may spill into the upper bits of - * the doubleword so we still must fold it down from 64. - */ -_GLOBAL(csum_tcpudp_magic) - rlwimi r5,r6,16,0,15 /* put proto in upper half of len */ - addc r0,r3,r4 /* add 4 32-bit words together */ - adde r0,r0,r5 - adde r0,r0,r7 - rldicl r4,r0,32,0 /* fold 64 bit value */ - add r0,r4,r0 - srdi r0,r0,32 - rlwinm r3,r0,16,0,31 /* fold two halves together */ - add r3,r0,r3 - not r3,r3 - srwi r3,r3,16 - blr - -/* * Computes the checksum of a memory block at buff, length len, * and adds in "sum" (32-bit). * diff --git a/kernel/arch/powerpc/lib/copy_32.S b/kernel/arch/powerpc/lib/copy_32.S index 6813f80d1..c44df2dbe 100644 --- a/kernel/arch/powerpc/lib/copy_32.S +++ b/kernel/arch/powerpc/lib/copy_32.S @@ -69,9 +69,19 @@ CACHELINE_BYTES = L1_CACHE_BYTES LG_CACHELINE_BYTES = L1_CACHE_SHIFT CACHELINE_MASK = (L1_CACHE_BYTES-1) +/* + * Use dcbz on the complete cache lines in the destination + * to set them to zero. This requires that the destination + * area is cacheable. -- paulus + * + * During early init, cache might not be active yet, so dcbz cannot be used. + * We therefore skip the optimised bloc that uses dcbz. This jump is + * replaced by a nop once cache is active. This is done in machine_init() + */ _GLOBAL(memset) rlwimi r4,r4,8,16,23 rlwimi r4,r4,16,0,15 + addi r6,r3,-4 cmplwi 0,r5,4 blt 7f @@ -80,7 +90,31 @@ _GLOBAL(memset) andi. r0,r6,3 add r5,r0,r5 subf r6,r0,r6 - srwi r0,r5,2 + cmplwi 0,r4,0 + bne 2f /* Use normal procedure if r4 is not zero */ +_GLOBAL(memset_nocache_branch) + b 2f /* Skip optimised bloc until cache is enabled */ + + clrlwi r7,r6,32-LG_CACHELINE_BYTES + add r8,r7,r5 + srwi r9,r8,LG_CACHELINE_BYTES + addic. r9,r9,-1 /* total number of complete cachelines */ + ble 2f + xori r0,r7,CACHELINE_MASK & ~3 + srwi. r0,r0,2 + beq 3f + mtctr r0 +4: stwu r4,4(r6) + bdnz 4b +3: mtctr r9 + li r7,4 +10: dcbz r7,r6 + addi r6,r6,CACHELINE_BYTES + bdnz 10b + clrlwi r5,r8,32-LG_CACHELINE_BYTES + addi r5,r5,4 + +2: srwi r0,r5,2 mtctr r0 bdz 6f 1: stwu r4,4(r6) @@ -94,12 +128,96 @@ _GLOBAL(memset) bdnz 8b blr +/* + * This version uses dcbz on the complete cache lines in the + * destination area to reduce memory traffic. This requires that + * the destination area is cacheable. + * We only use this version if the source and dest don't overlap. + * -- paulus. + * + * During early init, cache might not be active yet, so dcbz cannot be used. + * We therefore jump to generic_memcpy which doesn't use dcbz. This jump is + * replaced by a nop once cache is active. This is done in machine_init() + */ _GLOBAL(memmove) cmplw 0,r3,r4 bgt backwards_memcpy /* fall through */ _GLOBAL(memcpy) + b generic_memcpy + add r7,r3,r5 /* test if the src & dst overlap */ + add r8,r4,r5 + cmplw 0,r4,r7 + cmplw 1,r3,r8 + crand 0,0,4 /* cr0.lt &= cr1.lt */ + blt generic_memcpy /* if regions overlap */ + + addi r4,r4,-4 + addi r6,r3,-4 + neg r0,r3 + andi. r0,r0,CACHELINE_MASK /* # bytes to start of cache line */ + beq 58f + + cmplw 0,r5,r0 /* is this more than total to do? */ + blt 63f /* if not much to do */ + andi. r8,r0,3 /* get it word-aligned first */ + subf r5,r0,r5 + mtctr r8 + beq+ 61f +70: lbz r9,4(r4) /* do some bytes */ + addi r4,r4,1 + addi r6,r6,1 + stb r9,3(r6) + bdnz 70b +61: srwi. r0,r0,2 + mtctr r0 + beq 58f +72: lwzu r9,4(r4) /* do some words */ + stwu r9,4(r6) + bdnz 72b + +58: srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */ + clrlwi r5,r5,32-LG_CACHELINE_BYTES + li r11,4 + mtctr r0 + beq 63f +53: + dcbz r11,r6 + COPY_16_BYTES +#if L1_CACHE_BYTES >= 32 + COPY_16_BYTES +#if L1_CACHE_BYTES >= 64 + COPY_16_BYTES + COPY_16_BYTES +#if L1_CACHE_BYTES >= 128 + COPY_16_BYTES + COPY_16_BYTES + COPY_16_BYTES + COPY_16_BYTES +#endif +#endif +#endif + bdnz 53b + +63: srwi. r0,r5,2 + mtctr r0 + beq 64f +30: lwzu r0,4(r4) + stwu r0,4(r6) + bdnz 30b + +64: andi. r0,r5,3 + mtctr r0 + beq+ 65f + addi r4,r4,3 + addi r6,r6,3 +40: lbzu r0,1(r4) + stbu r0,1(r6) + bdnz 40b +65: blr + +_GLOBAL(generic_memcpy) srwi. r7,r5,3 addi r6,r3,-4 addi r4,r4,-4 diff --git a/kernel/arch/powerpc/lib/vmx-helper.c b/kernel/arch/powerpc/lib/vmx-helper.c index 3cf529cee..ac93a3bd2 100644 --- a/kernel/arch/powerpc/lib/vmx-helper.c +++ b/kernel/arch/powerpc/lib/vmx-helper.c @@ -27,11 +27,11 @@ int enter_vmx_usercopy(void) if (in_interrupt()) return 0; - /* This acts as preempt_disable() as well and will make - * enable_kernel_altivec(). We need to disable page faults - * as they can call schedule and thus make us lose the VMX - * context. So on page faults, we just fail which will cause - * a fallback to the normal non-vmx copy. + preempt_disable(); + /* + * We need to disable page faults as they can call schedule and + * thus make us lose the VMX context. So on page faults, we just + * fail which will cause a fallback to the normal non-vmx copy. */ pagefault_disable(); @@ -47,6 +47,7 @@ int enter_vmx_usercopy(void) int exit_vmx_usercopy(void) { pagefault_enable(); + preempt_enable(); return 0; } |