9 files changed, 110 insertions, 101 deletions
diff --git a/kernel/arch/arm64/crypto/aes-ce-ccm-core.S b/kernel/arch/arm64/crypto/aes-ce-ccm-core.S
index a2a7fbcac..3363560c7 100644
--- a/kernel/arch/arm64/crypto/aes-ce-ccm-core.S
+++ b/kernel/arch/arm64/crypto/aes-ce-ccm-core.S
@@ -9,6 +9,7 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/assembler.h>
 
 	.text
 	.arch	armv8-a+crypto
@@ -19,7 +20,7 @@
 	 */
 ENTRY(ce_aes_ccm_auth_data)
 	ldr	w8, [x3]			/* leftover from prev round? */
-	ld1	{v0.2d}, [x0]			/* load mac */
+	ld1	{v0.16b}, [x0]			/* load mac */
 	cbz	w8, 1f
 	sub	w8, w8, #16
 	eor	v1.16b, v1.16b, v1.16b
@@ -31,7 +32,7 @@ ENTRY(ce_aes_ccm_auth_data)
 	beq	8f				/* out of input? */
 	cbnz	w8, 0b
 	eor	v0.16b, v0.16b, v1.16b
-1:	ld1	{v3.2d}, [x4]			/* load first round key */
+1:	ld1	{v3.16b}, [x4]			/* load first round key */
 	prfm	pldl1strm, [x1]
 	cmp	w5, #12				/* which key size? */
 	add	x6, x4, #16
@@ -41,17 +42,17 @@ ENTRY(ce_aes_ccm_auth_data)
 	mov	v5.16b, v3.16b
 	b	4f
 2:	mov	v4.16b, v3.16b
-	ld1	{v5.2d}, [x6], #16		/* load 2nd round key */
+	ld1	{v5.16b}, [x6], #16		/* load 2nd round key */
 3:	aese	v0.16b, v4.16b
 	aesmc	v0.16b, v0.16b
-4:	ld1	{v3.2d}, [x6], #16		/* load next round key */
+4:	ld1	{v3.16b}, [x6], #16		/* load next round key */
 	aese	v0.16b, v5.16b
 	aesmc	v0.16b, v0.16b
-5:	ld1	{v4.2d}, [x6], #16		/* load next round key */
+5:	ld1	{v4.16b}, [x6], #16		/* load next round key */
 	subs	w7, w7, #3
 	aese	v0.16b, v3.16b
 	aesmc	v0.16b, v0.16b
-	ld1	{v5.2d}, [x6], #16		/* load next round key */
+	ld1	{v5.16b}, [x6], #16		/* load next round key */
 	bpl	3b
 	aese	v0.16b, v4.16b
 	subs	w2, w2, #16			/* last data? */
@@ -60,7 +61,7 @@ ENTRY(ce_aes_ccm_auth_data)
 	ld1	{v1.16b}, [x1], #16		/* load next input block */
 	eor	v0.16b, v0.16b, v1.16b		/* xor with mac */
 	bne	1b
-6:	st1	{v0.2d}, [x0]			/* store mac */
+6:	st1	{v0.16b}, [x0]			/* store mac */
 	beq	10f
 	adds	w2, w2, #16
 	beq	10f
@@ -79,7 +80,7 @@ ENTRY(ce_aes_ccm_auth_data)
 	adds	w7, w7, #1
 	bne	9b
 	eor	v0.16b, v0.16b, v1.16b
-	st1	{v0.2d}, [x0]
+	st1	{v0.16b}, [x0]
 10:	str	w8, [x3]
 	ret
 ENDPROC(ce_aes_ccm_auth_data)
@@ -89,27 +90,27 @@ ENDPROC(ce_aes_ccm_auth_data)
 	 * 			 u32 rounds);
 	 */
 ENTRY(ce_aes_ccm_final)
-	ld1	{v3.2d}, [x2], #16		/* load first round key */
-	ld1	{v0.2d}, [x0]			/* load mac */
+	ld1	{v3.16b}, [x2], #16		/* load first round key */
+	ld1	{v0.16b}, [x0]			/* load mac */
 	cmp	w3, #12				/* which key size? */
 	sub	w3, w3, #2			/* modified # of rounds */
-	ld1	{v1.2d}, [x1]			/* load 1st ctriv */
+	ld1	{v1.16b}, [x1]			/* load 1st ctriv */
 	bmi	0f
 	bne	3f
 	mov	v5.16b, v3.16b
 	b	2f
 0:	mov	v4.16b, v3.16b
-1:	ld1	{v5.2d}, [x2], #16		/* load next round key */
+1:	ld1	{v5.16b}, [x2], #16		/* load next round key */
 	aese	v0.16b, v4.16b
 	aesmc	v0.16b, v0.16b
 	aese	v1.16b, v4.16b
 	aesmc	v1.16b, v1.16b
-2:	ld1	{v3.2d}, [x2], #16		/* load next round key */
+2:	ld1	{v3.16b}, [x2], #16		/* load next round key */
 	aese	v0.16b, v5.16b
 	aesmc	v0.16b, v0.16b
 	aese	v1.16b, v5.16b
 	aesmc	v1.16b, v1.16b
-3:	ld1	{v4.2d}, [x2], #16		/* load next round key */
+3:	ld1	{v4.16b}, [x2], #16		/* load next round key */
 	subs	w3, w3, #3
 	aese	v0.16b, v3.16b
 	aesmc	v0.16b, v0.16b
@@ -120,47 +121,47 @@ ENTRY(ce_aes_ccm_final)
 	aese	v1.16b, v4.16b
 	/* final round key cancels out */
 	eor	v0.16b, v0.16b, v1.16b		/* en-/decrypt the mac */
-	st1	{v0.2d}, [x0]			/* store result */
+	st1	{v0.16b}, [x0]			/* store result */
 	ret
 ENDPROC(ce_aes_ccm_final)
 
 	.macro	aes_ccm_do_crypt,enc
 	ldr	x8, [x6, #8]			/* load lower ctr */
-	ld1	{v0.2d}, [x5]			/* load mac */
-	rev	x8, x8				/* keep swabbed ctr in reg */
+	ld1	{v0.16b}, [x5]			/* load mac */
+CPU_LE(	rev	x8, x8			)	/* keep swabbed ctr in reg */
 0:	/* outer loop */
-	ld1	{v1.1d}, [x6]			/* load upper ctr */
+	ld1	{v1.8b}, [x6]			/* load upper ctr */
 	prfm	pldl1strm, [x1]
 	add	x8, x8, #1
 	rev	x9, x8
 	cmp	w4, #12				/* which key size? */
 	sub	w7, w4, #2			/* get modified # of rounds */
 	ins	v1.d[1], x9			/* no carry in lower ctr */
-	ld1	{v3.2d}, [x3]			/* load first round key */
+	ld1	{v3.16b}, [x3]			/* load first round key */
 	add	x10, x3, #16
 	bmi	1f
 	bne	4f
 	mov	v5.16b, v3.16b
 	b	3f
 1:	mov	v4.16b, v3.16b
-	ld1	{v5.2d}, [x10], #16		/* load 2nd round key */
+	ld1	{v5.16b}, [x10], #16		/* load 2nd round key */
 2:	/* inner loop: 3 rounds, 2x interleaved */
 	aese	v0.16b, v4.16b
 	aesmc	v0.16b, v0.16b
 	aese	v1.16b, v4.16b
 	aesmc	v1.16b, v1.16b
-3:	ld1	{v3.2d}, [x10], #16		/* load next round key */
+3:	ld1	{v3.16b}, [x10], #16		/* load next round key */
 	aese	v0.16b, v5.16b
 	aesmc	v0.16b, v0.16b
 	aese	v1.16b, v5.16b
 	aesmc	v1.16b, v1.16b
-4:	ld1	{v4.2d}, [x10], #16		/* load next round key */
+4:	ld1	{v4.16b}, [x10], #16		/* load next round key */
 	subs	w7, w7, #3
 	aese	v0.16b, v3.16b
 	aesmc	v0.16b, v0.16b
 	aese	v1.16b, v3.16b
 	aesmc	v1.16b, v1.16b
-	ld1	{v5.2d}, [x10], #16		/* load next round key */
+	ld1	{v5.16b}, [x10], #16		/* load next round key */
 	bpl	2b
 	aese	v0.16b, v4.16b
 	aese	v1.16b, v4.16b
@@ -177,14 +178,14 @@ ENDPROC(ce_aes_ccm_final)
 	eor	v0.16b, v0.16b, v2.16b		/* xor mac with pt ^ rk[last] */
 	st1	{v1.16b}, [x0], #16		/* write output block */
 	bne	0b
-	rev	x8, x8
-	st1	{v0.2d}, [x5]			/* store mac */
+CPU_LE(	rev	x8, x8			)
+	st1	{v0.16b}, [x5]			/* store mac */
 	str	x8, [x6, #8]			/* store lsb end of ctr (BE) */
 5:	ret
 
 6:	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
 	eor	v1.16b, v1.16b, v5.16b		/* final round enc */
-	st1	{v0.2d}, [x5]			/* store mac */
+	st1	{v0.16b}, [x5]			/* store mac */
 	add	w2, w2, #16			/* process partial tail block */
 7:	ldrb	w9, [x1], #1			/* get 1 byte of input */
 	umov	w6, v1.b[0]			/* get top crypted ctr byte */
diff --git a/kernel/arch/arm64/crypto/aes-ce-cipher.c b/kernel/arch/arm64/crypto/aes-ce-cipher.c
index f7bd9bf0b..50d9fe11d 100644
--- a/kernel/arch/arm64/crypto/aes-ce-cipher.c
+++ b/kernel/arch/arm64/crypto/aes-ce-cipher.c
@@ -47,24 +47,24 @@ static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
 	kernel_neon_begin_partial(4);
 
 	__asm__("	ld1	{v0.16b}, %[in]			;"
-		"	ld1	{v1.2d}, [%[key]], #16		;"
+		"	ld1	{v1.16b}, [%[key]], #16		;"
 		"	cmp	%w[rounds], #10			;"
 		"	bmi	0f				;"
 		"	bne	3f				;"
 		"	mov	v3.16b, v1.16b			;"
 		"	b	2f				;"
 		"0:	mov	v2.16b, v1.16b			;"
-		"	ld1	{v3.2d}, [%[key]], #16		;"
+		"	ld1	{v3.16b}, [%[key]], #16		;"
 		"1:	aese	v0.16b, v2.16b			;"
 		"	aesmc	v0.16b, v0.16b			;"
-		"2:	ld1	{v1.2d}, [%[key]], #16		;"
+		"2:	ld1	{v1.16b}, [%[key]], #16		;"
 		"	aese	v0.16b, v3.16b			;"
 		"	aesmc	v0.16b, v0.16b			;"
-		"3:	ld1	{v2.2d}, [%[key]], #16		;"
+		"3:	ld1	{v2.16b}, [%[key]], #16		;"
 		"	subs	%w[rounds], %w[rounds], #3	;"
 		"	aese	v0.16b, v1.16b			;"
 		"	aesmc	v0.16b, v0.16b			;"
-		"	ld1	{v3.2d}, [%[key]], #16		;"
+		"	ld1	{v3.16b}, [%[key]], #16		;"
 		"	bpl	1b				;"
 		"	aese	v0.16b, v2.16b			;"
 		"	eor	v0.16b, v0.16b, v3.16b		;"
@@ -92,24 +92,24 @@ static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
 	kernel_neon_begin_partial(4);
 
 	__asm__("	ld1	{v0.16b}, %[in]			;"
-		"	ld1	{v1.2d}, [%[key]], #16		;"
+		"	ld1	{v1.16b}, [%[key]], #16		;"
 		"	cmp	%w[rounds], #10			;"
 		"	bmi	0f				;"
 		"	bne	3f				;"
 		"	mov	v3.16b, v1.16b			;"
 		"	b	2f				;"
 		"0:	mov	v2.16b, v1.16b			;"
-		"	ld1	{v3.2d}, [%[key]], #16		;"
+		"	ld1	{v3.16b}, [%[key]], #16		;"
 		"1:	aesd	v0.16b, v2.16b			;"
 		"	aesimc	v0.16b, v0.16b			;"
-		"2:	ld1	{v1.2d}, [%[key]], #16		;"
+		"2:	ld1	{v1.16b}, [%[key]], #16		;"
 		"	aesd	v0.16b, v3.16b			;"
 		"	aesimc	v0.16b, v0.16b			;"
-		"3:	ld1	{v2.2d}, [%[key]], #16		;"
+		"3:	ld1	{v2.16b}, [%[key]], #16		;"
 		"	subs	%w[rounds], %w[rounds], #3	;"
 		"	aesd	v0.16b, v1.16b			;"
 		"	aesimc	v0.16b, v0.16b			;"
-		"	ld1	{v3.2d}, [%[key]], #16		;"
+		"	ld1	{v3.16b}, [%[key]], #16		;"
 		"	bpl	1b				;"
 		"	aesd	v0.16b, v2.16b			;"
 		"	eor	v0.16b, v0.16b, v3.16b		;"
@@ -173,7 +173,12 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
 		u32 *rki = ctx->key_enc + (i * kwords);
 		u32 *rko = rki + kwords;
 
+#ifndef CONFIG_CPU_BIG_ENDIAN
 		rko[0] = ror32(aes_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0];
+#else
+		rko[0] = rol32(aes_sub(rki[kwords - 1]), 8) ^ (rcon[i] << 24) ^
+			 rki[0];
+#endif
 		rko[1] = rko[0] ^ rki[1];
 		rko[2] = rko[1] ^ rki[2];
 		rko[3] = rko[2] ^ rki[3];
diff --git a/kernel/arch/arm64/crypto/aes-ce.S b/kernel/arch/arm64/crypto/aes-ce.S
index 78f3cfe92..b46093d56 100644
--- a/kernel/arch/arm64/crypto/aes-ce.S
+++ b/kernel/arch/arm64/crypto/aes-ce.S
@@ -10,6 +10,7 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/assembler.h>
 
 #define AES_ENTRY(func)		ENTRY(ce_ ## func)
 #define AES_ENDPROC(func)	ENDPROC(ce_ ## func)
diff --git a/kernel/arch/arm64/crypto/aes-glue.c b/kernel/arch/arm64/crypto/aes-glue.c
index 05d9e16c0..6a51dfccf 100644
--- a/kernel/arch/arm64/crypto/aes-glue.c
+++ b/kernel/arch/arm64/crypto/aes-glue.c
@@ -211,7 +211,7 @@ static int ctr_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		err = blkcipher_walk_done(desc, &walk,
 					  walk.nbytes % AES_BLOCK_SIZE);
 	}
-	if (nbytes) {
+	if (walk.nbytes % AES_BLOCK_SIZE) {
 		u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
 		u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
 		u8 __aligned(8) tail[AES_BLOCK_SIZE];
diff --git a/kernel/arch/arm64/crypto/aes-modes.S b/kernel/arch/arm64/crypto/aes-modes.S
index f6e372c52..838dad5c2 100644
--- a/kernel/arch/arm64/crypto/aes-modes.S
+++ b/kernel/arch/arm64/crypto/aes-modes.S
@@ -193,15 +193,16 @@ AES_ENTRY(aes_cbc_encrypt)
 	cbz		w6, .Lcbcencloop
 
 	ld1		{v0.16b}, [x5]			/* get iv */
-	enc_prepare	w3, x2, x5
+	enc_prepare	w3, x2, x6
 
 .Lcbcencloop:
 	ld1		{v1.16b}, [x1], #16		/* get next pt block */
 	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with iv */
-	encrypt_block	v0, w3, x2, x5, w6
+	encrypt_block	v0, w3, x2, x6, w7
 	st1		{v0.16b}, [x0], #16
 	subs		w4, w4, #1
 	bne		.Lcbcencloop
+	st1		{v0.16b}, [x5]			/* return iv */
 	ret
 AES_ENDPROC(aes_cbc_encrypt)
 
@@ -211,7 +212,7 @@ AES_ENTRY(aes_cbc_decrypt)
 	cbz		w6, .LcbcdecloopNx
 
 	ld1		{v7.16b}, [x5]			/* get iv */
-	dec_prepare	w3, x2, x5
+	dec_prepare	w3, x2, x6
 
 .LcbcdecloopNx:
 #if INTERLEAVE >= 2
@@ -248,7 +249,7 @@ AES_ENTRY(aes_cbc_decrypt)
 .Lcbcdecloop:
 	ld1		{v1.16b}, [x1], #16		/* get next ct block */
 	mov		v0.16b, v1.16b			/* ...and copy to v0 */
-	decrypt_block	v0, w3, x2, x5, w6
+	decrypt_block	v0, w3, x2, x6, w7
 	eor		v0.16b, v0.16b, v7.16b		/* xor with iv => pt */
 	mov		v7.16b, v1.16b			/* ct is next iv */
 	st1		{v0.16b}, [x0], #16
@@ -256,6 +257,7 @@ AES_ENTRY(aes_cbc_decrypt)
 	bne		.Lcbcdecloop
 .Lcbcdecout:
 	FRAME_POP
+	st1		{v7.16b}, [x5]			/* return iv */
 	ret
 AES_ENDPROC(aes_cbc_decrypt)
 
@@ -267,24 +269,15 @@ AES_ENDPROC(aes_cbc_decrypt)
 
 AES_ENTRY(aes_ctr_encrypt)
 	FRAME_PUSH
-	cbnz		w6, .Lctrfirst		/* 1st time around? */
-	umov		x5, v4.d[1]		/* keep swabbed ctr in reg */
-	rev		x5, x5
-#if INTERLEAVE >= 2
-	cmn		w5, w4			/* 32 bit overflow? */
-	bcs		.Lctrinc
-	add		x5, x5, #1		/* increment BE ctr */
-	b		.LctrincNx
-#else
-	b		.Lctrinc
-#endif
-.Lctrfirst:
+	cbz		w6, .Lctrnotfirst	/* 1st time around? */
 	enc_prepare	w3, x2, x6
 	ld1		{v4.16b}, [x5]
-	umov		x5, v4.d[1]		/* keep swabbed ctr in reg */
-	rev		x5, x5
+
+.Lctrnotfirst:
+	umov		x8, v4.d[1]		/* keep swabbed ctr in reg */
+	rev		x8, x8
 #if INTERLEAVE >= 2
-	cmn		w5, w4			/* 32 bit overflow? */
+	cmn		w8, w4			/* 32 bit overflow? */
 	bcs		.Lctrloop
 .LctrloopNx:
 	subs		w4, w4, #INTERLEAVE
@@ -292,11 +285,11 @@ AES_ENTRY(aes_ctr_encrypt)
 #if INTERLEAVE == 2
 	mov		v0.8b, v4.8b
 	mov		v1.8b, v4.8b
-	rev		x7, x5
-	add		x5, x5, #1
+	rev		x7, x8
+	add		x8, x8, #1
 	ins		v0.d[1], x7
-	rev		x7, x5
-	add		x5, x5, #1
+	rev		x7, x8
+	add		x8, x8, #1
 	ins		v1.d[1], x7
 	ld1		{v2.16b-v3.16b}, [x1], #32	/* get 2 input blocks */
 	do_encrypt_block2x
@@ -305,7 +298,7 @@ AES_ENTRY(aes_ctr_encrypt)
 	st1		{v0.16b-v1.16b}, [x0], #32
 #else
 	ldr		q8, =0x30000000200000001	/* addends 1,2,3[,0] */
-	dup		v7.4s, w5
+	dup		v7.4s, w8
 	mov		v0.16b, v4.16b
 	add		v7.4s, v7.4s, v8.4s
 	mov		v1.16b, v4.16b
@@ -323,18 +316,12 @@ AES_ENTRY(aes_ctr_encrypt)
 	eor		v2.16b, v7.16b, v2.16b
 	eor		v3.16b, v5.16b, v3.16b
 	st1		{v0.16b-v3.16b}, [x0], #64
-	add		x5, x5, #INTERLEAVE
+	add		x8, x8, #INTERLEAVE
 #endif
-	cbz		w4, .LctroutNx
-.LctrincNx:
-	rev		x7, x5
+	rev		x7, x8
 	ins		v4.d[1], x7
+	cbz		w4, .Lctrout
 	b		.LctrloopNx
-.LctroutNx:
-	sub		x5, x5, #1
-	rev		x7, x5
-	ins		v4.d[1], x7
-	b		.Lctrout
 .Lctr1x:
 	adds		w4, w4, #INTERLEAVE
 	beq		.Lctrout
@@ -342,30 +329,39 @@ AES_ENTRY(aes_ctr_encrypt)
 .Lctrloop:
 	mov		v0.16b, v4.16b
 	encrypt_block	v0, w3, x2, x6, w7
+
+	adds		x8, x8, #1		/* increment BE ctr */
+	rev		x7, x8
+	ins		v4.d[1], x7
+	bcs		.Lctrcarry		/* overflow? */
+
+.Lctrcarrydone:
 	subs		w4, w4, #1
 	bmi		.Lctrhalfblock		/* blocks < 0 means 1/2 block */
 	ld1		{v3.16b}, [x1], #16
 	eor		v3.16b, v0.16b, v3.16b
 	st1		{v3.16b}, [x0], #16
-	beq		.Lctrout
-.Lctrinc:
-	adds		x5, x5, #1		/* increment BE ctr */
-	rev		x7, x5
-	ins		v4.d[1], x7
-	bcc		.Lctrloop		/* no overflow? */
-	umov		x7, v4.d[0]		/* load upper word of ctr  */
-	rev		x7, x7			/* ... to handle the carry */
-	add		x7, x7, #1
-	rev		x7, x7
-	ins		v4.d[0], x7
-	b		.Lctrloop
+	bne		.Lctrloop
+
+.Lctrout:
+	st1		{v4.16b}, [x5]		/* return next CTR value */
+	FRAME_POP
+	ret
+
 .Lctrhalfblock:
 	ld1		{v3.8b}, [x1]
 	eor		v3.8b, v0.8b, v3.8b
 	st1		{v3.8b}, [x0]
-.Lctrout:
 	FRAME_POP
 	ret
+
+.Lctrcarry:
+	umov		x7, v4.d[0]		/* load upper word of ctr  */
+	rev		x7, x7			/* ... to handle the carry */
+	add		x7, x7, #1
+	rev		x7, x7
+	ins		v4.d[0], x7
+	b		.Lctrcarrydone
 AES_ENDPROC(aes_ctr_encrypt)
 	.ltorg
 
@@ -386,7 +382,8 @@ AES_ENDPROC(aes_ctr_encrypt)
 	.endm
 
 .Lxts_mul_x:
-	.word		1, 0, 0x87, 0
+CPU_LE(	.quad		1, 0x87		)
+CPU_BE(	.quad		0x87, 1		)
 
 AES_ENTRY(aes_xts_encrypt)
 	FRAME_PUSH
diff --git a/kernel/arch/arm64/crypto/aes-neon.S b/kernel/arch/arm64/crypto/aes-neon.S
index b93170e1c..85f07ead7 100644
--- a/kernel/arch/arm64/crypto/aes-neon.S
+++ b/kernel/arch/arm64/crypto/aes-neon.S
@@ -9,6 +9,7 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/assembler.h>
 
 #define AES_ENTRY(func)		ENTRY(neon_ ## func)
 #define AES_ENDPROC(func)	ENDPROC(neon_ ## func)
@@ -83,13 +84,13 @@
 	.endm
 
 	.macro		do_block, enc, in, rounds, rk, rkp, i
-	ld1		{v15.16b}, [\rk]
+	ld1		{v15.4s}, [\rk]
 	add		\rkp, \rk, #16
 	mov		\i, \rounds
 1111:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
 	tbl		\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */
 	sub_bytes	\in
-	ld1		{v15.16b}, [\rkp], #16
+	ld1		{v15.4s}, [\rkp], #16
 	subs		\i, \i, #1
 	beq		2222f
 	.if		\enc == 1
@@ -229,7 +230,7 @@
 	.endm
 
 	.macro		do_block_2x, enc, in0, in1 rounds, rk, rkp, i
-	ld1		{v15.16b}, [\rk]
+	ld1		{v15.4s}, [\rk]
 	add		\rkp, \rk, #16
 	mov		\i, \rounds
 1111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
@@ -237,7 +238,7 @@
 	sub_bytes_2x	\in0, \in1
 	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
 	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
-	ld1		{v15.16b}, [\rkp], #16
+	ld1		{v15.4s}, [\rkp], #16
 	subs		\i, \i, #1
 	beq		2222f
 	.if		\enc == 1
@@ -254,7 +255,7 @@
 	.endm
 
 	.macro		do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
-	ld1		{v15.16b}, [\rk]
+	ld1		{v15.4s}, [\rk]
 	add		\rkp, \rk, #16
 	mov		\i, \rounds
 1111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
@@ -266,7 +267,7 @@
 	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
 	tbl		\in2\().16b, {\in2\().16b}, v13.16b	/* ShiftRows */
 	tbl		\in3\().16b, {\in3\().16b}, v13.16b	/* ShiftRows */
-	ld1		{v15.16b}, [\rkp], #16
+	ld1		{v15.4s}, [\rkp], #16
 	subs		\i, \i, #1
 	beq		2222f
 	.if		\enc == 1
@@ -306,12 +307,16 @@
 	.text
 	.align		4
 .LForward_ShiftRows:
-	.byte		0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3
-	.byte		0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb
+CPU_LE(	.byte		0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3	)
+CPU_LE(	.byte		0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb	)
+CPU_BE(	.byte		0xb, 0x6, 0x1, 0xc, 0x7, 0x2, 0xd, 0x8	)
+CPU_BE(	.byte		0x3, 0xe, 0x9, 0x4, 0xf, 0xa, 0x5, 0x0	)
 
 .LReverse_ShiftRows:
-	.byte		0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb
-	.byte		0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3
+CPU_LE(	.byte		0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb	)
+CPU_LE(	.byte		0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3	)
+CPU_BE(	.byte		0x3, 0x6, 0x9, 0xc, 0xf, 0x2, 0x5, 0x8	)
+CPU_BE(	.byte		0xb, 0xe, 0x1, 0x4, 0x7, 0xa, 0xd, 0x0	)
 
 .LForward_Sbox:
 	.byte		0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
diff --git a/kernel/arch/arm64/crypto/ghash-ce-core.S b/kernel/arch/arm64/crypto/ghash-ce-core.S
index dc4570158..f0bb9f0b5 100644
--- a/kernel/arch/arm64/crypto/ghash-ce-core.S
+++ b/kernel/arch/arm64/crypto/ghash-ce-core.S
@@ -29,8 +29,8 @@
 	 *			   struct ghash_key const *k, const char *head)
 	 */
 ENTRY(pmull_ghash_update)
-	ld1		{SHASH.16b}, [x3]
-	ld1		{XL.16b}, [x1]
+	ld1		{SHASH.2d}, [x3]
+	ld1		{XL.2d}, [x1]
 	movi		MASK.16b, #0xe1
 	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
 	shl		MASK.2d, MASK.2d, #57
@@ -74,6 +74,6 @@ CPU_LE(	rev64		T1.16b, T1.16b	)
 
 	cbnz		w0, 0b
 
-	st1		{XL.16b}, [x1]
+	st1		{XL.2d}, [x1]
 	ret
 ENDPROC(pmull_ghash_update)
diff --git a/kernel/arch/arm64/crypto/sha1-ce-core.S b/kernel/arch/arm64/crypto/sha1-ce-core.S
index 033aae6d7..c98e7e849 100644
--- a/kernel/arch/arm64/crypto/sha1-ce-core.S
+++ b/kernel/arch/arm64/crypto/sha1-ce-core.S
@@ -78,7 +78,7 @@ ENTRY(sha1_ce_transform)
 	ld1r		{k3.4s}, [x6]
 
 	/* load state */
-	ldr		dga, [x0]
+	ld1		{dgav.4s}, [x0]
 	ldr		dgb, [x0, #16]
 
 	/* load sha1_ce_state::finalize */
@@ -144,7 +144,7 @@ CPU_LE(	rev32		v11.16b, v11.16b	)
 	b		1b
 
 	/* store new state */
-3:	str		dga, [x0]
+3:	st1		{dgav.4s}, [x0]
 	str		dgb, [x0, #16]
 	ret
 ENDPROC(sha1_ce_transform)
diff --git a/kernel/arch/arm64/crypto/sha2-ce-core.S b/kernel/arch/arm64/crypto/sha2-ce-core.S
index 5df9d9d47..01cfee066 100644
--- a/kernel/arch/arm64/crypto/sha2-ce-core.S
+++ b/kernel/arch/arm64/crypto/sha2-ce-core.S
@@ -85,7 +85,7 @@ ENTRY(sha2_ce_transform)
 	ld1		{v12.4s-v15.4s}, [x8]
 
 	/* load state */
-	ldp		dga, dgb, [x0]
+	ld1		{dgav.4s, dgbv.4s}, [x0]
 
 	/* load sha256_ce_state::finalize */
 	ldr		w4, [x0, #:lo12:sha256_ce_offsetof_finalize]
@@ -148,6 +148,6 @@ CPU_LE(	rev32		v19.16b, v19.16b	)
 	b		1b
 
 	/* store new state */
-3:	stp		dga, dgb, [x0]
+3:	st1		{dgav.4s, dgbv.4s}, [x0]
 	ret
 ENDPROC(sha2_ce_transform)