From e09b41010ba33a20a87472ee821fa407a5b8da36 Mon Sep 17 00:00:00 2001 From: José Pekkarinen Date: Mon, 11 Apr 2016 10:41:07 +0300 Subject: These changes are the raw update to linux-4.4.6-rt14. Kernel sources are taken from kernel.org, and rt patch from the rt wiki download page. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During the rebasing, the following patch collided: Force tick interrupt and get rid of softirq magic(I70131fb85). Collisions have been removed because its logic was found on the source already. Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769 Signed-off-by: José Pekkarinen --- kernel/arch/x86/crypto/Makefile | 14 + kernel/arch/x86/crypto/aesni-intel_glue.c | 428 ++++++--------- kernel/arch/x86/crypto/camellia_aesni_avx2_glue.c | 11 +- kernel/arch/x86/crypto/camellia_aesni_avx_glue.c | 11 +- kernel/arch/x86/crypto/cast5_avx_glue.c | 16 +- kernel/arch/x86/crypto/cast6_avx_glue.c | 16 +- kernel/arch/x86/crypto/chacha20-avx2-x86_64.S | 443 +++++++++++++++ kernel/arch/x86/crypto/chacha20-ssse3-x86_64.S | 627 ++++++++++++++++++++++ kernel/arch/x86/crypto/chacha20_glue.c | 150 ++++++ kernel/arch/x86/crypto/crc32-pclmul_glue.c | 2 +- kernel/arch/x86/crypto/crc32c-intel_glue.c | 3 +- kernel/arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 2 +- kernel/arch/x86/crypto/crct10dif-pclmul_glue.c | 2 +- kernel/arch/x86/crypto/fpu.c | 4 +- kernel/arch/x86/crypto/ghash-clmulni-intel_glue.c | 2 +- kernel/arch/x86/crypto/poly1305-avx2-x86_64.S | 386 +++++++++++++ kernel/arch/x86/crypto/poly1305-sse2-x86_64.S | 582 ++++++++++++++++++++ kernel/arch/x86/crypto/poly1305_glue.c | 207 +++++++ kernel/arch/x86/crypto/serpent_avx2_glue.c | 12 +- kernel/arch/x86/crypto/serpent_avx_glue.c | 16 +- kernel/arch/x86/crypto/sha-mb/sha1_mb.c | 8 +- kernel/arch/x86/crypto/sha1_ni_asm.S | 302 +++++++++++ kernel/arch/x86/crypto/sha1_ssse3_glue.c | 324 ++++++++--- kernel/arch/x86/crypto/sha256_ni_asm.S | 353 ++++++++++++ kernel/arch/x86/crypto/sha256_ssse3_glue.c | 339 ++++++++++-- kernel/arch/x86/crypto/sha512_ssse3_glue.c | 263 +++++++-- kernel/arch/x86/crypto/twofish_avx_glue.c | 16 +- 27 files changed, 4012 insertions(+), 527 deletions(-) create mode 100644 kernel/arch/x86/crypto/chacha20-avx2-x86_64.S create mode 100644 kernel/arch/x86/crypto/chacha20-ssse3-x86_64.S create mode 100644 kernel/arch/x86/crypto/chacha20_glue.c create mode 100644 kernel/arch/x86/crypto/poly1305-avx2-x86_64.S create mode 100644 kernel/arch/x86/crypto/poly1305-sse2-x86_64.S create mode 100644 kernel/arch/x86/crypto/poly1305_glue.c create mode 100644 kernel/arch/x86/crypto/sha1_ni_asm.S create mode 100644 kernel/arch/x86/crypto/sha256_ni_asm.S (limited to 'kernel/arch/x86/crypto') diff --git a/kernel/arch/x86/crypto/Makefile b/kernel/arch/x86/crypto/Makefile index 5a4a089e8..b9b912a44 100644 --- a/kernel/arch/x86/crypto/Makefile +++ b/kernel/arch/x86/crypto/Makefile @@ -5,6 +5,8 @@ avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no) avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ $(comma)4)$(comma)%ymm2,yes,no) +sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no) +sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no) obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o @@ -20,6 +22,7 @@ obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o +obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o @@ -30,6 +33,7 @@ obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o +obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o # These modules require assembler to support AVX. ifeq ($(avx_supported),yes) @@ -60,6 +64,7 @@ blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o +chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o ifeq ($(avx_supported),yes) @@ -75,6 +80,7 @@ endif ifeq ($(avx2_supported),yes) camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o + chacha20-x86_64-y += chacha20-avx2-x86_64.o serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o endif @@ -82,12 +88,20 @@ aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o +poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o ifeq ($(avx2_supported),yes) sha1-ssse3-y += sha1_avx2_x86_64_asm.o +poly1305-x86_64-y += poly1305-avx2-x86_64.o +endif +ifeq ($(sha1_ni_supported),yes) +sha1-ssse3-y += sha1_ni_asm.o endif crc32c-intel-y := crc32c-intel_glue.o crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o +ifeq ($(sha256_ni_supported),yes) +sha256-ssse3-y += sha256_ni_asm.o +endif sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o diff --git a/kernel/arch/x86/crypto/aesni-intel_glue.c b/kernel/arch/x86/crypto/aesni-intel_glue.c index 3fd3b1634..c6d5458ee 100644 --- a/kernel/arch/x86/crypto/aesni-intel_glue.c +++ b/kernel/arch/x86/crypto/aesni-intel_glue.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include @@ -44,15 +44,19 @@ #endif +#define AESNI_ALIGN 16 +#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE - 1)) +#define RFC4106_HASH_SUBKEY_SIZE 16 + /* This data is stored at the end of the crypto_tfm struct. * It's a type of per "session" data storage location. * This needs to be 16 byte aligned. */ struct aesni_rfc4106_gcm_ctx { - u8 hash_subkey[16]; - struct crypto_aes_ctx aes_key_expanded; + u8 hash_subkey[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + struct crypto_aes_ctx aes_key_expanded + __attribute__ ((__aligned__(AESNI_ALIGN))); u8 nonce[4]; - struct cryptd_aead *cryptd_tfm; }; struct aesni_gcm_set_hash_subkey_result { @@ -66,10 +70,6 @@ struct aesni_hash_subkey_req_data { struct scatterlist sg; }; -#define AESNI_ALIGN (16) -#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1)) -#define RFC4106_HASH_SUBKEY_SIZE 16 - struct aesni_lrw_ctx { struct lrw_table_ctx lrw_table; u8 raw_aes_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1]; @@ -283,10 +283,11 @@ static void (*aesni_gcm_dec_tfm)(void *ctx, u8 *out, static inline struct aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) { - return - (struct aesni_rfc4106_gcm_ctx *) - PTR_ALIGN((u8 *) - crypto_tfm_ctx(crypto_aead_tfm(tfm)), AESNI_ALIGN); + unsigned long align = AESNI_ALIGN; + + if (align <= crypto_tfm_ctx_alignment()) + align = 1; + return PTR_ALIGN(crypto_aead_ctx(tfm), align); } #endif @@ -792,36 +793,27 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, #endif #ifdef CONFIG_X86_64 -static int rfc4106_init(struct crypto_tfm *tfm) +static int rfc4106_init(struct crypto_aead *aead) { struct cryptd_aead *cryptd_tfm; - struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *) - PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN); - struct crypto_aead *cryptd_child; - struct aesni_rfc4106_gcm_ctx *child_ctx; + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", CRYPTO_ALG_INTERNAL, CRYPTO_ALG_INTERNAL); if (IS_ERR(cryptd_tfm)) return PTR_ERR(cryptd_tfm); - cryptd_child = cryptd_aead_child(cryptd_tfm); - child_ctx = aesni_rfc4106_gcm_ctx_get(cryptd_child); - memcpy(child_ctx, ctx, sizeof(*ctx)); - ctx->cryptd_tfm = cryptd_tfm; - tfm->crt_aead.reqsize = sizeof(struct aead_request) - + crypto_aead_reqsize(&cryptd_tfm->base); + *ctx = cryptd_tfm; + crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); return 0; } -static void rfc4106_exit(struct crypto_tfm *tfm) +static void rfc4106_exit(struct crypto_aead *aead) { - struct aesni_rfc4106_gcm_ctx *ctx = - (struct aesni_rfc4106_gcm_ctx *) - PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN); - if (!IS_ERR(ctx->cryptd_tfm)) - cryptd_free_aead(ctx->cryptd_tfm); - return; + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + + cryptd_free_aead(*ctx); } static void @@ -847,8 +839,6 @@ rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) if (IS_ERR(ctr_tfm)) return PTR_ERR(ctr_tfm); - crypto_ablkcipher_clear_flags(ctr_tfm, ~0); - ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len); if (ret) goto out_free_ablkcipher; @@ -897,73 +887,29 @@ out_free_ablkcipher: static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key, unsigned int key_len) { - int ret = 0; - struct crypto_tfm *tfm = crypto_aead_tfm(aead); struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead); - u8 *new_key_align, *new_key_mem = NULL; if (key_len < 4) { - crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } /*Account for 4 byte nonce at the end.*/ key_len -= 4; - if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 && - key_len != AES_KEYSIZE_256) { - crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce)); - /*This must be on a 16 byte boundary!*/ - if ((unsigned long)(&(ctx->aes_key_expanded.key_enc[0])) % AESNI_ALIGN) - return -EINVAL; - - if ((unsigned long)key % AESNI_ALIGN) { - /*key is not aligned: use an auxuliar aligned pointer*/ - new_key_mem = kmalloc(key_len+AESNI_ALIGN, GFP_KERNEL); - if (!new_key_mem) - return -ENOMEM; - - new_key_align = PTR_ALIGN(new_key_mem, AESNI_ALIGN); - memcpy(new_key_align, key, key_len); - key = new_key_align; - } - if (!irq_fpu_usable()) - ret = crypto_aes_expand_key(&(ctx->aes_key_expanded), - key, key_len); - else { - kernel_fpu_begin(); - ret = aesni_set_key(&(ctx->aes_key_expanded), key, key_len); - kernel_fpu_end(); - } - /*This must be on a 16 byte boundary!*/ - if ((unsigned long)(&(ctx->hash_subkey[0])) % AESNI_ALIGN) { - ret = -EINVAL; - goto exit; - } - ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); -exit: - kfree(new_key_mem); - return ret; + return aes_set_key_common(crypto_aead_tfm(aead), + &ctx->aes_key_expanded, key, key_len) ?: + rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); } static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, unsigned int key_len) { - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); - struct crypto_aead *child = cryptd_aead_child(ctx->cryptd_tfm); - struct aesni_rfc4106_gcm_ctx *c_ctx = aesni_rfc4106_gcm_ctx_get(child); - struct cryptd_aead *cryptd_tfm = ctx->cryptd_tfm; - int ret; + struct cryptd_aead **ctx = crypto_aead_ctx(parent); + struct cryptd_aead *cryptd_tfm = *ctx; - ret = crypto_aead_setkey(child, key, key_len); - if (!ret) { - memcpy(ctx, c_ctx, sizeof(*ctx)); - ctx->cryptd_tfm = cryptd_tfm; - } - return ret; + return crypto_aead_setkey(&cryptd_tfm->base, key, key_len); } static int common_rfc4106_set_authsize(struct crypto_aead *aead, @@ -977,7 +923,7 @@ static int common_rfc4106_set_authsize(struct crypto_aead *aead, default: return -EINVAL; } - crypto_aead_crt(aead)->authsize = authsize; + return 0; } @@ -986,44 +932,31 @@ static int common_rfc4106_set_authsize(struct crypto_aead *aead, static int rfc4106_set_authsize(struct crypto_aead *parent, unsigned int authsize) { - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); - struct crypto_aead *child = cryptd_aead_child(ctx->cryptd_tfm); - int ret; + struct cryptd_aead **ctx = crypto_aead_ctx(parent); + struct cryptd_aead *cryptd_tfm = *ctx; - ret = crypto_aead_setauthsize(child, authsize); - if (!ret) - crypto_aead_crt(parent)->authsize = authsize; - return ret; + return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); } -static int __driver_rfc4106_encrypt(struct aead_request *req) +static int helper_rfc4106_encrypt(struct aead_request *req) { u8 one_entry_in_sg = 0; u8 *src, *dst, *assoc; __be32 counter = cpu_to_be32(1); struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - u32 key_len = ctx->aes_key_expanded.key_length; void *aes_ctx = &(ctx->aes_key_expanded); unsigned long auth_tag_len = crypto_aead_authsize(tfm); - u8 iv_tab[16+AESNI_ALIGN]; - u8* iv = (u8 *) PTR_ALIGN((u8 *)iv_tab, AESNI_ALIGN); + u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); struct scatter_walk src_sg_walk; - struct scatter_walk assoc_sg_walk; struct scatter_walk dst_sg_walk; unsigned int i; /* Assuming we are supporting rfc4106 64-bit extended */ /* sequence numbers We need to have the AAD length equal */ - /* to 8 or 12 bytes */ - if (unlikely(req->assoclen != 8 && req->assoclen != 12)) + /* to 16 or 20 bytes */ + if (unlikely(req->assoclen != 16 && req->assoclen != 20)) return -EINVAL; - if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16)) - return -EINVAL; - if (unlikely(key_len != AES_KEYSIZE_128 && - key_len != AES_KEYSIZE_192 && - key_len != AES_KEYSIZE_256)) - return -EINVAL; /* IV below built */ for (i = 0; i < 4; i++) @@ -1032,55 +965,57 @@ static int __driver_rfc4106_encrypt(struct aead_request *req) *(iv+4+i) = req->iv[i]; *((__be32 *)(iv+12)) = counter; - if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) { + if (sg_is_last(req->src) && + req->src->offset + req->src->length <= PAGE_SIZE && + sg_is_last(req->dst) && + req->dst->offset + req->dst->length <= PAGE_SIZE) { one_entry_in_sg = 1; scatterwalk_start(&src_sg_walk, req->src); - scatterwalk_start(&assoc_sg_walk, req->assoc); - src = scatterwalk_map(&src_sg_walk); - assoc = scatterwalk_map(&assoc_sg_walk); + assoc = scatterwalk_map(&src_sg_walk); + src = assoc + req->assoclen; dst = src; if (unlikely(req->src != req->dst)) { scatterwalk_start(&dst_sg_walk, req->dst); - dst = scatterwalk_map(&dst_sg_walk); + dst = scatterwalk_map(&dst_sg_walk) + req->assoclen; } - } else { /* Allocate memory for src, dst, assoc */ - src = kmalloc(req->cryptlen + auth_tag_len + req->assoclen, + assoc = kmalloc(req->cryptlen + auth_tag_len + req->assoclen, GFP_ATOMIC); - if (unlikely(!src)) + if (unlikely(!assoc)) return -ENOMEM; - assoc = (src + req->cryptlen + auth_tag_len); - scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0); - scatterwalk_map_and_copy(assoc, req->assoc, 0, - req->assoclen, 0); + scatterwalk_map_and_copy(assoc, req->src, 0, + req->assoclen + req->cryptlen, 0); + src = assoc + req->assoclen; dst = src; } - aesni_gcm_enc_tfm(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv, - ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst - + ((unsigned long)req->cryptlen), auth_tag_len); + kernel_fpu_begin(); + aesni_gcm_enc_tfm(aes_ctx, dst, src, req->cryptlen, iv, + ctx->hash_subkey, assoc, req->assoclen - 8, + dst + req->cryptlen, auth_tag_len); + kernel_fpu_end(); /* The authTag (aka the Integrity Check Value) needs to be written * back to the packet. */ if (one_entry_in_sg) { if (unlikely(req->src != req->dst)) { - scatterwalk_unmap(dst); - scatterwalk_done(&dst_sg_walk, 0, 0); + scatterwalk_unmap(dst - req->assoclen); + scatterwalk_advance(&dst_sg_walk, req->dst->length); + scatterwalk_done(&dst_sg_walk, 1, 0); } - scatterwalk_unmap(src); scatterwalk_unmap(assoc); - scatterwalk_done(&src_sg_walk, 0, 0); - scatterwalk_done(&assoc_sg_walk, 0, 0); + scatterwalk_advance(&src_sg_walk, req->src->length); + scatterwalk_done(&src_sg_walk, req->src == req->dst, 0); } else { - scatterwalk_map_and_copy(dst, req->dst, 0, - req->cryptlen + auth_tag_len, 1); - kfree(src); + scatterwalk_map_and_copy(dst, req->dst, req->assoclen, + req->cryptlen + auth_tag_len, 1); + kfree(assoc); } return 0; } -static int __driver_rfc4106_decrypt(struct aead_request *req) +static int helper_rfc4106_decrypt(struct aead_request *req) { u8 one_entry_in_sg = 0; u8 *src, *dst, *assoc; @@ -1089,30 +1024,20 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) int retval = 0; struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - u32 key_len = ctx->aes_key_expanded.key_length; void *aes_ctx = &(ctx->aes_key_expanded); unsigned long auth_tag_len = crypto_aead_authsize(tfm); - u8 iv_and_authTag[32+AESNI_ALIGN]; - u8 *iv = (u8 *) PTR_ALIGN((u8 *)iv_and_authTag, AESNI_ALIGN); - u8 *authTag = iv + 16; + u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + u8 authTag[16]; struct scatter_walk src_sg_walk; - struct scatter_walk assoc_sg_walk; struct scatter_walk dst_sg_walk; unsigned int i; - if (unlikely((req->cryptlen < auth_tag_len) || - (req->assoclen != 8 && req->assoclen != 12))) + if (unlikely(req->assoclen != 16 && req->assoclen != 20)) return -EINVAL; - if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16)) - return -EINVAL; - if (unlikely(key_len != AES_KEYSIZE_128 && - key_len != AES_KEYSIZE_192 && - key_len != AES_KEYSIZE_256)) - return -EINVAL; /* Assuming we are supporting rfc4106 64-bit extended */ /* sequence numbers We need to have the AAD length */ - /* equal to 8 or 12 bytes */ + /* equal to 16 or 20 bytes */ tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len); /* IV below built */ @@ -1122,33 +1047,36 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) *(iv+4+i) = req->iv[i]; *((__be32 *)(iv+12)) = counter; - if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) { + if (sg_is_last(req->src) && + req->src->offset + req->src->length <= PAGE_SIZE && + sg_is_last(req->dst) && + req->dst->offset + req->dst->length <= PAGE_SIZE) { one_entry_in_sg = 1; scatterwalk_start(&src_sg_walk, req->src); - scatterwalk_start(&assoc_sg_walk, req->assoc); - src = scatterwalk_map(&src_sg_walk); - assoc = scatterwalk_map(&assoc_sg_walk); + assoc = scatterwalk_map(&src_sg_walk); + src = assoc + req->assoclen; dst = src; if (unlikely(req->src != req->dst)) { scatterwalk_start(&dst_sg_walk, req->dst); - dst = scatterwalk_map(&dst_sg_walk); + dst = scatterwalk_map(&dst_sg_walk) + req->assoclen; } } else { /* Allocate memory for src, dst, assoc */ - src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC); - if (!src) + assoc = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC); + if (!assoc) return -ENOMEM; - assoc = (src + req->cryptlen); - scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0); - scatterwalk_map_and_copy(assoc, req->assoc, 0, - req->assoclen, 0); + scatterwalk_map_and_copy(assoc, req->src, 0, + req->assoclen + req->cryptlen, 0); + src = assoc + req->assoclen; dst = src; } + kernel_fpu_begin(); aesni_gcm_dec_tfm(aes_ctx, dst, src, tempCipherLen, iv, - ctx->hash_subkey, assoc, (unsigned long)req->assoclen, - authTag, auth_tag_len); + ctx->hash_subkey, assoc, req->assoclen - 8, + authTag, auth_tag_len); + kernel_fpu_end(); /* Compare generated tag with passed in tag. */ retval = crypto_memneq(src + tempCipherLen, authTag, auth_tag_len) ? @@ -1156,90 +1084,45 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) if (one_entry_in_sg) { if (unlikely(req->src != req->dst)) { - scatterwalk_unmap(dst); - scatterwalk_done(&dst_sg_walk, 0, 0); + scatterwalk_unmap(dst - req->assoclen); + scatterwalk_advance(&dst_sg_walk, req->dst->length); + scatterwalk_done(&dst_sg_walk, 1, 0); } - scatterwalk_unmap(src); scatterwalk_unmap(assoc); - scatterwalk_done(&src_sg_walk, 0, 0); - scatterwalk_done(&assoc_sg_walk, 0, 0); + scatterwalk_advance(&src_sg_walk, req->src->length); + scatterwalk_done(&src_sg_walk, req->src == req->dst, 0); } else { - scatterwalk_map_and_copy(dst, req->dst, 0, tempCipherLen, 1); - kfree(src); + scatterwalk_map_and_copy(dst, req->dst, req->assoclen, + tempCipherLen, 1); + kfree(assoc); } return retval; } static int rfc4106_encrypt(struct aead_request *req) { - int ret; struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + struct cryptd_aead **ctx = crypto_aead_ctx(tfm); + struct cryptd_aead *cryptd_tfm = *ctx; - if (!irq_fpu_usable()) { - struct aead_request *cryptd_req = - (struct aead_request *) aead_request_ctx(req); + aead_request_set_tfm(req, irq_fpu_usable() ? + cryptd_aead_child(cryptd_tfm) : + &cryptd_tfm->base); - memcpy(cryptd_req, req, sizeof(*req)); - aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - ret = crypto_aead_encrypt(cryptd_req); - } else { - kernel_fpu_begin(); - ret = __driver_rfc4106_encrypt(req); - kernel_fpu_end(); - } - return ret; + return crypto_aead_encrypt(req); } static int rfc4106_decrypt(struct aead_request *req) { - int ret; struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - - if (!irq_fpu_usable()) { - struct aead_request *cryptd_req = - (struct aead_request *) aead_request_ctx(req); - - memcpy(cryptd_req, req, sizeof(*req)); - aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - ret = crypto_aead_decrypt(cryptd_req); - } else { - kernel_fpu_begin(); - ret = __driver_rfc4106_decrypt(req); - kernel_fpu_end(); - } - return ret; -} - -static int helper_rfc4106_encrypt(struct aead_request *req) -{ - int ret; - - if (unlikely(!irq_fpu_usable())) { - WARN_ONCE(1, "__gcm-aes-aesni alg used in invalid context"); - ret = -EINVAL; - } else { - kernel_fpu_begin(); - ret = __driver_rfc4106_encrypt(req); - kernel_fpu_end(); - } - return ret; -} + struct cryptd_aead **ctx = crypto_aead_ctx(tfm); + struct cryptd_aead *cryptd_tfm = *ctx; -static int helper_rfc4106_decrypt(struct aead_request *req) -{ - int ret; + aead_request_set_tfm(req, irq_fpu_usable() ? + cryptd_aead_child(cryptd_tfm) : + &cryptd_tfm->base); - if (unlikely(!irq_fpu_usable())) { - WARN_ONCE(1, "__gcm-aes-aesni alg used in invalid context"); - ret = -EINVAL; - } else { - kernel_fpu_begin(); - ret = __driver_rfc4106_decrypt(req); - kernel_fpu_end(); - } - return ret; + return crypto_aead_decrypt(req); } #endif @@ -1412,51 +1295,6 @@ static struct crypto_alg aesni_algs[] = { { .geniv = "chainiv", }, }, -}, { - .cra_name = "__gcm-aes-aesni", - .cra_driver_name = "__driver-gcm-aes-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + - AESNI_ALIGN, - .cra_alignmask = 0, - .cra_type = &crypto_aead_type, - .cra_module = THIS_MODULE, - .cra_u = { - .aead = { - .setkey = common_rfc4106_set_key, - .setauthsize = common_rfc4106_set_authsize, - .encrypt = helper_rfc4106_encrypt, - .decrypt = helper_rfc4106_decrypt, - .ivsize = 8, - .maxauthsize = 16, - }, - }, -}, { - .cra_name = "rfc4106(gcm(aes))", - .cra_driver_name = "rfc4106-gcm-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + - AESNI_ALIGN, - .cra_alignmask = 0, - .cra_type = &crypto_nivaead_type, - .cra_module = THIS_MODULE, - .cra_init = rfc4106_init, - .cra_exit = rfc4106_exit, - .cra_u = { - .aead = { - .setkey = rfc4106_set_key, - .setauthsize = rfc4106_set_authsize, - .encrypt = rfc4106_encrypt, - .decrypt = rfc4106_decrypt, - .geniv = "seqiv", - .ivsize = 8, - .maxauthsize = 16, - }, - }, #endif #if IS_ENABLED(CONFIG_CRYPTO_PCBC) }, { @@ -1571,6 +1409,46 @@ static struct crypto_alg aesni_algs[] = { { }, } }; +#ifdef CONFIG_X86_64 +static struct aead_alg aesni_aead_algs[] = { { + .setkey = common_rfc4106_set_key, + .setauthsize = common_rfc4106_set_authsize, + .encrypt = helper_rfc4106_encrypt, + .decrypt = helper_rfc4106_decrypt, + .ivsize = 8, + .maxauthsize = 16, + .base = { + .cra_name = "__gcm-aes-aesni", + .cra_driver_name = "__driver-gcm-aes-aesni", + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx), + .cra_alignmask = AESNI_ALIGN - 1, + .cra_module = THIS_MODULE, + }, +}, { + .init = rfc4106_init, + .exit = rfc4106_exit, + .setkey = rfc4106_set_key, + .setauthsize = rfc4106_set_authsize, + .encrypt = rfc4106_encrypt, + .decrypt = rfc4106_decrypt, + .ivsize = 8, + .maxauthsize = 16, + .base = { + .cra_name = "rfc4106(gcm(aes))", + .cra_driver_name = "rfc4106-gcm-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct cryptd_aead *), + .cra_module = THIS_MODULE, + }, +} }; +#else +static struct aead_alg aesni_aead_algs[0]; +#endif + static const struct x86_cpu_id aesni_cpu_id[] = { X86_FEATURE_MATCH(X86_FEATURE_AES), @@ -1618,17 +1496,33 @@ static int __init aesni_init(void) if (err) return err; - return crypto_register_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); + err = crypto_register_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); + if (err) + goto fpu_exit; + + err = crypto_register_aeads(aesni_aead_algs, + ARRAY_SIZE(aesni_aead_algs)); + if (err) + goto unregister_algs; + + return err; + +unregister_algs: + crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); +fpu_exit: + crypto_fpu_exit(); + return err; } static void __exit aesni_exit(void) { + crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs)); crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); crypto_fpu_exit(); } -module_init(aesni_init); +late_initcall(aesni_init); module_exit(aesni_exit); MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, Intel AES-NI instructions optimized"); diff --git a/kernel/arch/x86/crypto/camellia_aesni_avx2_glue.c b/kernel/arch/x86/crypto/camellia_aesni_avx2_glue.c index baf0ac21a..d84456924 100644 --- a/kernel/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/kernel/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -19,8 +19,7 @@ #include #include #include -#include -#include +#include #include #include @@ -561,16 +560,16 @@ static struct crypto_alg cmll_algs[10] = { { static int __init camellia_aesni_init(void) { - u64 xcr0; + const char *feature_name; if (!cpu_has_avx2 || !cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { pr_info("AVX2 or AES-NI instructions are not detected.\n"); return -ENODEV; } - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX2 detected but unusable.\n"); + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/kernel/arch/x86/crypto/camellia_aesni_avx_glue.c b/kernel/arch/x86/crypto/camellia_aesni_avx_glue.c index 78818a1e7..93d8f2957 100644 --- a/kernel/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/kernel/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -19,8 +19,7 @@ #include #include #include -#include -#include +#include #include #include @@ -553,16 +552,16 @@ static struct crypto_alg cmll_algs[10] = { { static int __init camellia_aesni_init(void) { - u64 xcr0; + const char *feature_name; if (!cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { pr_info("AVX or AES-NI instructions are not detected.\n"); return -ENODEV; } - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/kernel/arch/x86/crypto/cast5_avx_glue.c b/kernel/arch/x86/crypto/cast5_avx_glue.c index f799ec36b..d7699130e 100644 --- a/kernel/arch/x86/crypto/cast5_avx_glue.c +++ b/kernel/arch/x86/crypto/cast5_avx_glue.c @@ -31,8 +31,7 @@ #include #include #include -#include -#include +#include #include #define CAST5_PARALLEL_BLOCKS 16 @@ -465,16 +464,11 @@ static struct crypto_alg cast5_algs[6] = { { static int __init cast5_init(void) { - u64 xcr0; + const char *feature_name; - if (!cpu_has_avx || !cpu_has_osxsave) { - pr_info("AVX instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/kernel/arch/x86/crypto/cast6_avx_glue.c b/kernel/arch/x86/crypto/cast6_avx_glue.c index f448810ca..fca459578 100644 --- a/kernel/arch/x86/crypto/cast6_avx_glue.c +++ b/kernel/arch/x86/crypto/cast6_avx_glue.c @@ -36,8 +36,7 @@ #include #include #include -#include -#include +#include #include #define CAST6_PARALLEL_BLOCKS 8 @@ -590,16 +589,11 @@ static struct crypto_alg cast6_algs[10] = { { static int __init cast6_init(void) { - u64 xcr0; + const char *feature_name; - if (!cpu_has_avx || !cpu_has_osxsave) { - pr_info("AVX instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/kernel/arch/x86/crypto/chacha20-avx2-x86_64.S b/kernel/arch/x86/crypto/chacha20-avx2-x86_64.S new file mode 100644 index 000000000..16694e625 --- /dev/null +++ b/kernel/arch/x86/crypto/chacha20-avx2-x86_64.S @@ -0,0 +1,443 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include + +.data +.align 32 + +ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 + .octa 0x0e0d0c0f0a09080b0605040702010003 +ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 + .octa 0x0d0c0f0e09080b0a0504070601000302 +CTRINC: .octa 0x00000003000000020000000100000000 + .octa 0x00000007000000060000000500000004 + +.text + +ENTRY(chacha20_8block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: 8 data blocks output, o + # %rdx: 8 data blocks input, i + + # This function encrypts eight consecutive ChaCha20 blocks by loading + # the state matrix in AVX registers eight times. As we need some + # scratch registers, we save the first four registers on the stack. The + # algorithm performs each operation on the corresponding word of each + # state matrix, hence requires no word shuffling. For final XORing step + # we transpose the matrix by interleaving 32-, 64- and then 128-bit + # words, which allows us to do XOR in AVX registers. 8/16-bit word + # rotation is done with the slightly better performing byte shuffling, + # 7/12-bit word rotation uses traditional shift+OR. + + vzeroupper + # 4 * 32 byte stack, 32-byte aligned + mov %rsp, %r8 + and $~31, %rsp + sub $0x80, %rsp + + # x0..15[0-7] = s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpbroadcastd 0x04(%rdi),%ymm1 + vpbroadcastd 0x08(%rdi),%ymm2 + vpbroadcastd 0x0c(%rdi),%ymm3 + vpbroadcastd 0x10(%rdi),%ymm4 + vpbroadcastd 0x14(%rdi),%ymm5 + vpbroadcastd 0x18(%rdi),%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm7 + vpbroadcastd 0x20(%rdi),%ymm8 + vpbroadcastd 0x24(%rdi),%ymm9 + vpbroadcastd 0x28(%rdi),%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm11 + vpbroadcastd 0x30(%rdi),%ymm12 + vpbroadcastd 0x34(%rdi),%ymm13 + vpbroadcastd 0x38(%rdi),%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm15 + # x0..3 on stack + vmovdqa %ymm0,0x00(%rsp) + vmovdqa %ymm1,0x20(%rsp) + vmovdqa %ymm2,0x40(%rsp) + vmovdqa %ymm3,0x60(%rsp) + + vmovdqa CTRINC(%rip),%ymm1 + vmovdqa ROT8(%rip),%ymm2 + vmovdqa ROT16(%rip),%ymm3 + + # x12 += counter values 0-3 + vpaddd %ymm1,%ymm12,%ymm12 + + mov $10,%ecx + +.Ldoubleround8: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + vpaddd 0x00(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm3,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + vpaddd 0x20(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm3,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + vpaddd 0x40(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm3,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + vpaddd 0x60(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm3,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $12,%ymm4,%ymm0 + vpsrld $20,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $12,%ymm5,%ymm0 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $12,%ymm6,%ymm0 + vpsrld $20,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + vpaddd %ymm15,%ymm11,%ymm11 + vpxor %ymm11,%ymm7,%ymm7 + vpslld $12,%ymm7,%ymm0 + vpsrld $20,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + + # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + vpaddd 0x00(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm2,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + vpaddd 0x20(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm2,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + vpaddd 0x40(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm2,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + vpaddd 0x60(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm2,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm0 + vpsrld $25,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm0 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm0 + vpsrld $25,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + vpaddd %ymm15,%ymm11,%ymm11 + vpxor %ymm11,%ymm7,%ymm7 + vpslld $7,%ymm7,%ymm0 + vpsrld $25,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + vpaddd 0x00(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm3,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0 + vpaddd 0x20(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm3,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + vpaddd 0x40(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm3,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + vpaddd 0x60(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm3,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 12) + vpaddd %ymm15,%ymm10,%ymm10 + vpxor %ymm10,%ymm5,%ymm5 + vpslld $12,%ymm5,%ymm0 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + vpaddd %ymm12,%ymm11,%ymm11 + vpxor %ymm11,%ymm6,%ymm6 + vpslld $12,%ymm6,%ymm0 + vpsrld $20,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + vpaddd %ymm13,%ymm8,%ymm8 + vpxor %ymm8,%ymm7,%ymm7 + vpslld $12,%ymm7,%ymm0 + vpsrld $20,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + vpaddd %ymm14,%ymm9,%ymm9 + vpxor %ymm9,%ymm4,%ymm4 + vpslld $12,%ymm4,%ymm0 + vpsrld $20,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + vpaddd 0x00(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm2,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + vpaddd 0x20(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm2,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + vpaddd 0x40(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm2,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + vpaddd 0x60(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm2,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + vpaddd %ymm15,%ymm10,%ymm10 + vpxor %ymm10,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm0 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + vpaddd %ymm12,%ymm11,%ymm11 + vpxor %ymm11,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm0 + vpsrld $25,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + vpaddd %ymm13,%ymm8,%ymm8 + vpxor %ymm8,%ymm7,%ymm7 + vpslld $7,%ymm7,%ymm0 + vpsrld $25,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + vpaddd %ymm14,%ymm9,%ymm9 + vpxor %ymm9,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm0 + vpsrld $25,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + + dec %ecx + jnz .Ldoubleround8 + + # x0..15[0-3] += s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpaddd 0x00(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpbroadcastd 0x04(%rdi),%ymm0 + vpaddd 0x20(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpbroadcastd 0x08(%rdi),%ymm0 + vpaddd 0x40(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpbroadcastd 0x0c(%rdi),%ymm0 + vpaddd 0x60(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpbroadcastd 0x10(%rdi),%ymm0 + vpaddd %ymm0,%ymm4,%ymm4 + vpbroadcastd 0x14(%rdi),%ymm0 + vpaddd %ymm0,%ymm5,%ymm5 + vpbroadcastd 0x18(%rdi),%ymm0 + vpaddd %ymm0,%ymm6,%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm0 + vpaddd %ymm0,%ymm7,%ymm7 + vpbroadcastd 0x20(%rdi),%ymm0 + vpaddd %ymm0,%ymm8,%ymm8 + vpbroadcastd 0x24(%rdi),%ymm0 + vpaddd %ymm0,%ymm9,%ymm9 + vpbroadcastd 0x28(%rdi),%ymm0 + vpaddd %ymm0,%ymm10,%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm0 + vpaddd %ymm0,%ymm11,%ymm11 + vpbroadcastd 0x30(%rdi),%ymm0 + vpaddd %ymm0,%ymm12,%ymm12 + vpbroadcastd 0x34(%rdi),%ymm0 + vpaddd %ymm0,%ymm13,%ymm13 + vpbroadcastd 0x38(%rdi),%ymm0 + vpaddd %ymm0,%ymm14,%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm0 + vpaddd %ymm0,%ymm15,%ymm15 + + # x12 += counter values 0-3 + vpaddd %ymm1,%ymm12,%ymm12 + + # interleave 32-bit words in state n, n+1 + vmovdqa 0x00(%rsp),%ymm0 + vmovdqa 0x20(%rsp),%ymm1 + vpunpckldq %ymm1,%ymm0,%ymm2 + vpunpckhdq %ymm1,%ymm0,%ymm1 + vmovdqa %ymm2,0x00(%rsp) + vmovdqa %ymm1,0x20(%rsp) + vmovdqa 0x40(%rsp),%ymm0 + vmovdqa 0x60(%rsp),%ymm1 + vpunpckldq %ymm1,%ymm0,%ymm2 + vpunpckhdq %ymm1,%ymm0,%ymm1 + vmovdqa %ymm2,0x40(%rsp) + vmovdqa %ymm1,0x60(%rsp) + vmovdqa %ymm4,%ymm0 + vpunpckldq %ymm5,%ymm0,%ymm4 + vpunpckhdq %ymm5,%ymm0,%ymm5 + vmovdqa %ymm6,%ymm0 + vpunpckldq %ymm7,%ymm0,%ymm6 + vpunpckhdq %ymm7,%ymm0,%ymm7 + vmovdqa %ymm8,%ymm0 + vpunpckldq %ymm9,%ymm0,%ymm8 + vpunpckhdq %ymm9,%ymm0,%ymm9 + vmovdqa %ymm10,%ymm0 + vpunpckldq %ymm11,%ymm0,%ymm10 + vpunpckhdq %ymm11,%ymm0,%ymm11 + vmovdqa %ymm12,%ymm0 + vpunpckldq %ymm13,%ymm0,%ymm12 + vpunpckhdq %ymm13,%ymm0,%ymm13 + vmovdqa %ymm14,%ymm0 + vpunpckldq %ymm15,%ymm0,%ymm14 + vpunpckhdq %ymm15,%ymm0,%ymm15 + + # interleave 64-bit words in state n, n+2 + vmovdqa 0x00(%rsp),%ymm0 + vmovdqa 0x40(%rsp),%ymm2 + vpunpcklqdq %ymm2,%ymm0,%ymm1 + vpunpckhqdq %ymm2,%ymm0,%ymm2 + vmovdqa %ymm1,0x00(%rsp) + vmovdqa %ymm2,0x40(%rsp) + vmovdqa 0x20(%rsp),%ymm0 + vmovdqa 0x60(%rsp),%ymm2 + vpunpcklqdq %ymm2,%ymm0,%ymm1 + vpunpckhqdq %ymm2,%ymm0,%ymm2 + vmovdqa %ymm1,0x20(%rsp) + vmovdqa %ymm2,0x60(%rsp) + vmovdqa %ymm4,%ymm0 + vpunpcklqdq %ymm6,%ymm0,%ymm4 + vpunpckhqdq %ymm6,%ymm0,%ymm6 + vmovdqa %ymm5,%ymm0 + vpunpcklqdq %ymm7,%ymm0,%ymm5 + vpunpckhqdq %ymm7,%ymm0,%ymm7 + vmovdqa %ymm8,%ymm0 + vpunpcklqdq %ymm10,%ymm0,%ymm8 + vpunpckhqdq %ymm10,%ymm0,%ymm10 + vmovdqa %ymm9,%ymm0 + vpunpcklqdq %ymm11,%ymm0,%ymm9 + vpunpckhqdq %ymm11,%ymm0,%ymm11 + vmovdqa %ymm12,%ymm0 + vpunpcklqdq %ymm14,%ymm0,%ymm12 + vpunpckhqdq %ymm14,%ymm0,%ymm14 + vmovdqa %ymm13,%ymm0 + vpunpcklqdq %ymm15,%ymm0,%ymm13 + vpunpckhqdq %ymm15,%ymm0,%ymm15 + + # interleave 128-bit words in state n, n+4 + vmovdqa 0x00(%rsp),%ymm0 + vperm2i128 $0x20,%ymm4,%ymm0,%ymm1 + vperm2i128 $0x31,%ymm4,%ymm0,%ymm4 + vmovdqa %ymm1,0x00(%rsp) + vmovdqa 0x20(%rsp),%ymm0 + vperm2i128 $0x20,%ymm5,%ymm0,%ymm1 + vperm2i128 $0x31,%ymm5,%ymm0,%ymm5 + vmovdqa %ymm1,0x20(%rsp) + vmovdqa 0x40(%rsp),%ymm0 + vperm2i128 $0x20,%ymm6,%ymm0,%ymm1 + vperm2i128 $0x31,%ymm6,%ymm0,%ymm6 + vmovdqa %ymm1,0x40(%rsp) + vmovdqa 0x60(%rsp),%ymm0 + vperm2i128 $0x20,%ymm7,%ymm0,%ymm1 + vperm2i128 $0x31,%ymm7,%ymm0,%ymm7 + vmovdqa %ymm1,0x60(%rsp) + vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 + vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 + vmovdqa %ymm0,%ymm8 + vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 + vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 + vmovdqa %ymm0,%ymm9 + vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 + vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 + vmovdqa %ymm0,%ymm10 + vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 + vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 + vmovdqa %ymm0,%ymm11 + + # xor with corresponding input, write to output + vmovdqa 0x00(%rsp),%ymm0 + vpxor 0x0000(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0000(%rsi) + vmovdqa 0x20(%rsp),%ymm0 + vpxor 0x0080(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0080(%rsi) + vmovdqa 0x40(%rsp),%ymm0 + vpxor 0x0040(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0040(%rsi) + vmovdqa 0x60(%rsp),%ymm0 + vpxor 0x00c0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00c0(%rsi) + vpxor 0x0100(%rdx),%ymm4,%ymm4 + vmovdqu %ymm4,0x0100(%rsi) + vpxor 0x0180(%rdx),%ymm5,%ymm5 + vmovdqu %ymm5,0x00180(%rsi) + vpxor 0x0140(%rdx),%ymm6,%ymm6 + vmovdqu %ymm6,0x0140(%rsi) + vpxor 0x01c0(%rdx),%ymm7,%ymm7 + vmovdqu %ymm7,0x01c0(%rsi) + vpxor 0x0020(%rdx),%ymm8,%ymm8 + vmovdqu %ymm8,0x0020(%rsi) + vpxor 0x00a0(%rdx),%ymm9,%ymm9 + vmovdqu %ymm9,0x00a0(%rsi) + vpxor 0x0060(%rdx),%ymm10,%ymm10 + vmovdqu %ymm10,0x0060(%rsi) + vpxor 0x00e0(%rdx),%ymm11,%ymm11 + vmovdqu %ymm11,0x00e0(%rsi) + vpxor 0x0120(%rdx),%ymm12,%ymm12 + vmovdqu %ymm12,0x0120(%rsi) + vpxor 0x01a0(%rdx),%ymm13,%ymm13 + vmovdqu %ymm13,0x01a0(%rsi) + vpxor 0x0160(%rdx),%ymm14,%ymm14 + vmovdqu %ymm14,0x0160(%rsi) + vpxor 0x01e0(%rdx),%ymm15,%ymm15 + vmovdqu %ymm15,0x01e0(%rsi) + + vzeroupper + mov %r8,%rsp + ret +ENDPROC(chacha20_8block_xor_avx2) diff --git a/kernel/arch/x86/crypto/chacha20-ssse3-x86_64.S b/kernel/arch/x86/crypto/chacha20-ssse3-x86_64.S new file mode 100644 index 000000000..3a33124e9 --- /dev/null +++ b/kernel/arch/x86/crypto/chacha20-ssse3-x86_64.S @@ -0,0 +1,627 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include + +.data +.align 16 + +ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 +ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 +CTRINC: .octa 0x00000003000000020000000100000000 + +.text + +ENTRY(chacha20_block_xor_ssse3) + # %rdi: Input state matrix, s + # %rsi: 1 data block output, o + # %rdx: 1 data block input, i + + # This function encrypts one ChaCha20 block by loading the state matrix + # in four SSE registers. It performs matrix operation on four words in + # parallel, but requireds shuffling to rearrange the words after each + # round. 8/16-bit word rotation is done with the slightly better + # performing SSSE3 byte shuffling, 7/12-bit word rotation uses + # traditional shift+OR. + + # x0..3 = s0..3 + movdqa 0x00(%rdi),%xmm0 + movdqa 0x10(%rdi),%xmm1 + movdqa 0x20(%rdi),%xmm2 + movdqa 0x30(%rdi),%xmm3 + movdqa %xmm0,%xmm8 + movdqa %xmm1,%xmm9 + movdqa %xmm2,%xmm10 + movdqa %xmm3,%xmm11 + + movdqa ROT8(%rip),%xmm4 + movdqa ROT16(%rip),%xmm5 + + mov $10,%ecx + +.Ldoubleround: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm5,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm6 + pslld $12,%xmm6 + psrld $20,%xmm1 + por %xmm6,%xmm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm4,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm7 + pslld $7,%xmm7 + psrld $25,%xmm1 + por %xmm7,%xmm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + pshufd $0x39,%xmm1,%xmm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + pshufd $0x4e,%xmm2,%xmm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + pshufd $0x93,%xmm3,%xmm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm5,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm6 + pslld $12,%xmm6 + psrld $20,%xmm1 + por %xmm6,%xmm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm4,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm7 + pslld $7,%xmm7 + psrld $25,%xmm1 + por %xmm7,%xmm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + pshufd $0x93,%xmm1,%xmm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + pshufd $0x4e,%xmm2,%xmm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + pshufd $0x39,%xmm3,%xmm3 + + dec %ecx + jnz .Ldoubleround + + # o0 = i0 ^ (x0 + s0) + movdqu 0x00(%rdx),%xmm4 + paddd %xmm8,%xmm0 + pxor %xmm4,%xmm0 + movdqu %xmm0,0x00(%rsi) + # o1 = i1 ^ (x1 + s1) + movdqu 0x10(%rdx),%xmm5 + paddd %xmm9,%xmm1 + pxor %xmm5,%xmm1 + movdqu %xmm1,0x10(%rsi) + # o2 = i2 ^ (x2 + s2) + movdqu 0x20(%rdx),%xmm6 + paddd %xmm10,%xmm2 + pxor %xmm6,%xmm2 + movdqu %xmm2,0x20(%rsi) + # o3 = i3 ^ (x3 + s3) + movdqu 0x30(%rdx),%xmm7 + paddd %xmm11,%xmm3 + pxor %xmm7,%xmm3 + movdqu %xmm3,0x30(%rsi) + + ret +ENDPROC(chacha20_block_xor_ssse3) + +ENTRY(chacha20_4block_xor_ssse3) + # %rdi: Input state matrix, s + # %rsi: 4 data blocks output, o + # %rdx: 4 data blocks input, i + + # This function encrypts four consecutive ChaCha20 blocks by loading the + # the state matrix in SSE registers four times. As we need some scratch + # registers, we save the first four registers on the stack. The + # algorithm performs each operation on the corresponding word of each + # state matrix, hence requires no word shuffling. For final XORing step + # we transpose the matrix by interleaving 32- and then 64-bit words, + # which allows us to do XOR in SSE registers. 8/16-bit word rotation is + # done with the slightly better performing SSSE3 byte shuffling, + # 7/12-bit word rotation uses traditional shift+OR. + + mov %rsp,%r11 + sub $0x80,%rsp + and $~63,%rsp + + # x0..15[0-3] = s0..3[0..3] + movq 0x00(%rdi),%xmm1 + pshufd $0x00,%xmm1,%xmm0 + pshufd $0x55,%xmm1,%xmm1 + movq 0x08(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + movq 0x10(%rdi),%xmm5 + pshufd $0x00,%xmm5,%xmm4 + pshufd $0x55,%xmm5,%xmm5 + movq 0x18(%rdi),%xmm7 + pshufd $0x00,%xmm7,%xmm6 + pshufd $0x55,%xmm7,%xmm7 + movq 0x20(%rdi),%xmm9 + pshufd $0x00,%xmm9,%xmm8 + pshufd $0x55,%xmm9,%xmm9 + movq 0x28(%rdi),%xmm11 + pshufd $0x00,%xmm11,%xmm10 + pshufd $0x55,%xmm11,%xmm11 + movq 0x30(%rdi),%xmm13 + pshufd $0x00,%xmm13,%xmm12 + pshufd $0x55,%xmm13,%xmm13 + movq 0x38(%rdi),%xmm15 + pshufd $0x00,%xmm15,%xmm14 + pshufd $0x55,%xmm15,%xmm15 + # x0..3 on stack + movdqa %xmm0,0x00(%rsp) + movdqa %xmm1,0x10(%rsp) + movdqa %xmm2,0x20(%rsp) + movdqa %xmm3,0x30(%rsp) + + movdqa CTRINC(%rip),%xmm1 + movdqa ROT8(%rip),%xmm2 + movdqa ROT16(%rip),%xmm3 + + # x12 += counter values 0-3 + paddd %xmm1,%xmm12 + + mov $10,%ecx + +.Ldoubleround4: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm3,%xmm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm3,%xmm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm3,%xmm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm3,%xmm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm4 + por %xmm0,%xmm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm5 + por %xmm0,%xmm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm6 + por %xmm0,%xmm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm7 + por %xmm0,%xmm7 + + # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm2,%xmm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm2,%xmm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm2,%xmm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm2,%xmm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm4 + por %xmm0,%xmm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm5 + por %xmm0,%xmm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm6 + por %xmm0,%xmm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm7 + por %xmm0,%xmm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm3,%xmm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm3,%xmm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm3,%xmm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm3,%xmm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 12) + paddd %xmm15,%xmm10 + pxor %xmm10,%xmm5 + movdqa %xmm5,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm5 + por %xmm0,%xmm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + paddd %xmm12,%xmm11 + pxor %xmm11,%xmm6 + movdqa %xmm6,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm6 + por %xmm0,%xmm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + paddd %xmm13,%xmm8 + pxor %xmm8,%xmm7 + movdqa %xmm7,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm7 + por %xmm0,%xmm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + paddd %xmm14,%xmm9 + pxor %xmm9,%xmm4 + movdqa %xmm4,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm4 + por %xmm0,%xmm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm2,%xmm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm2,%xmm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm2,%xmm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm2,%xmm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + paddd %xmm15,%xmm10 + pxor %xmm10,%xmm5 + movdqa %xmm5,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm5 + por %xmm0,%xmm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + paddd %xmm12,%xmm11 + pxor %xmm11,%xmm6 + movdqa %xmm6,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm6 + por %xmm0,%xmm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + paddd %xmm13,%xmm8 + pxor %xmm8,%xmm7 + movdqa %xmm7,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm7 + por %xmm0,%xmm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + paddd %xmm14,%xmm9 + pxor %xmm9,%xmm4 + movdqa %xmm4,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm4 + por %xmm0,%xmm4 + + dec %ecx + jnz .Ldoubleround4 + + # x0[0-3] += s0[0] + # x1[0-3] += s0[1] + movq 0x00(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd 0x00(%rsp),%xmm2 + movdqa %xmm2,0x00(%rsp) + paddd 0x10(%rsp),%xmm3 + movdqa %xmm3,0x10(%rsp) + # x2[0-3] += s0[2] + # x3[0-3] += s0[3] + movq 0x08(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd 0x20(%rsp),%xmm2 + movdqa %xmm2,0x20(%rsp) + paddd 0x30(%rsp),%xmm3 + movdqa %xmm3,0x30(%rsp) + + # x4[0-3] += s1[0] + # x5[0-3] += s1[1] + movq 0x10(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + # x6[0-3] += s1[2] + # x7[0-3] += s1[3] + movq 0x18(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + + # x8[0-3] += s2[0] + # x9[0-3] += s2[1] + movq 0x20(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm8 + paddd %xmm3,%xmm9 + # x10[0-3] += s2[2] + # x11[0-3] += s2[3] + movq 0x28(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm10 + paddd %xmm3,%xmm11 + + # x12[0-3] += s3[0] + # x13[0-3] += s3[1] + movq 0x30(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm12 + paddd %xmm3,%xmm13 + # x14[0-3] += s3[2] + # x15[0-3] += s3[3] + movq 0x38(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm14 + paddd %xmm3,%xmm15 + + # x12 += counter values 0-3 + paddd %xmm1,%xmm12 + + # interleave 32-bit words in state n, n+1 + movdqa 0x00(%rsp),%xmm0 + movdqa 0x10(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpckldq %xmm1,%xmm2 + punpckhdq %xmm1,%xmm0 + movdqa %xmm2,0x00(%rsp) + movdqa %xmm0,0x10(%rsp) + movdqa 0x20(%rsp),%xmm0 + movdqa 0x30(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpckldq %xmm1,%xmm2 + punpckhdq %xmm1,%xmm0 + movdqa %xmm2,0x20(%rsp) + movdqa %xmm0,0x30(%rsp) + movdqa %xmm4,%xmm0 + punpckldq %xmm5,%xmm4 + punpckhdq %xmm5,%xmm0 + movdqa %xmm0,%xmm5 + movdqa %xmm6,%xmm0 + punpckldq %xmm7,%xmm6 + punpckhdq %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + movdqa %xmm8,%xmm0 + punpckldq %xmm9,%xmm8 + punpckhdq %xmm9,%xmm0 + movdqa %xmm0,%xmm9 + movdqa %xmm10,%xmm0 + punpckldq %xmm11,%xmm10 + punpckhdq %xmm11,%xmm0 + movdqa %xmm0,%xmm11 + movdqa %xmm12,%xmm0 + punpckldq %xmm13,%xmm12 + punpckhdq %xmm13,%xmm0 + movdqa %xmm0,%xmm13 + movdqa %xmm14,%xmm0 + punpckldq %xmm15,%xmm14 + punpckhdq %xmm15,%xmm0 + movdqa %xmm0,%xmm15 + + # interleave 64-bit words in state n, n+2 + movdqa 0x00(%rsp),%xmm0 + movdqa 0x20(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpcklqdq %xmm1,%xmm2 + punpckhqdq %xmm1,%xmm0 + movdqa %xmm2,0x00(%rsp) + movdqa %xmm0,0x20(%rsp) + movdqa 0x10(%rsp),%xmm0 + movdqa 0x30(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpcklqdq %xmm1,%xmm2 + punpckhqdq %xmm1,%xmm0 + movdqa %xmm2,0x10(%rsp) + movdqa %xmm0,0x30(%rsp) + movdqa %xmm4,%xmm0 + punpcklqdq %xmm6,%xmm4 + punpckhqdq %xmm6,%xmm0 + movdqa %xmm0,%xmm6 + movdqa %xmm5,%xmm0 + punpcklqdq %xmm7,%xmm5 + punpckhqdq %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + movdqa %xmm8,%xmm0 + punpcklqdq %xmm10,%xmm8 + punpckhqdq %xmm10,%xmm0 + movdqa %xmm0,%xmm10 + movdqa %xmm9,%xmm0 + punpcklqdq %xmm11,%xmm9 + punpckhqdq %xmm11,%xmm0 + movdqa %xmm0,%xmm11 + movdqa %xmm12,%xmm0 + punpcklqdq %xmm14,%xmm12 + punpckhqdq %xmm14,%xmm0 + movdqa %xmm0,%xmm14 + movdqa %xmm13,%xmm0 + punpcklqdq %xmm15,%xmm13 + punpckhqdq %xmm15,%xmm0 + movdqa %xmm0,%xmm15 + + # xor with corresponding input, write to output + movdqa 0x00(%rsp),%xmm0 + movdqu 0x00(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x00(%rsi) + movdqa 0x10(%rsp),%xmm0 + movdqu 0x80(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x80(%rsi) + movdqa 0x20(%rsp),%xmm0 + movdqu 0x40(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x40(%rsi) + movdqa 0x30(%rsp),%xmm0 + movdqu 0xc0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xc0(%rsi) + movdqu 0x10(%rdx),%xmm1 + pxor %xmm1,%xmm4 + movdqu %xmm4,0x10(%rsi) + movdqu 0x90(%rdx),%xmm1 + pxor %xmm1,%xmm5 + movdqu %xmm5,0x90(%rsi) + movdqu 0x50(%rdx),%xmm1 + pxor %xmm1,%xmm6 + movdqu %xmm6,0x50(%rsi) + movdqu 0xd0(%rdx),%xmm1 + pxor %xmm1,%xmm7 + movdqu %xmm7,0xd0(%rsi) + movdqu 0x20(%rdx),%xmm1 + pxor %xmm1,%xmm8 + movdqu %xmm8,0x20(%rsi) + movdqu 0xa0(%rdx),%xmm1 + pxor %xmm1,%xmm9 + movdqu %xmm9,0xa0(%rsi) + movdqu 0x60(%rdx),%xmm1 + pxor %xmm1,%xmm10 + movdqu %xmm10,0x60(%rsi) + movdqu 0xe0(%rdx),%xmm1 + pxor %xmm1,%xmm11 + movdqu %xmm11,0xe0(%rsi) + movdqu 0x30(%rdx),%xmm1 + pxor %xmm1,%xmm12 + movdqu %xmm12,0x30(%rsi) + movdqu 0xb0(%rdx),%xmm1 + pxor %xmm1,%xmm13 + movdqu %xmm13,0xb0(%rsi) + movdqu 0x70(%rdx),%xmm1 + pxor %xmm1,%xmm14 + movdqu %xmm14,0x70(%rsi) + movdqu 0xf0(%rdx),%xmm1 + pxor %xmm1,%xmm15 + movdqu %xmm15,0xf0(%rsi) + + mov %r11,%rsp + ret +ENDPROC(chacha20_4block_xor_ssse3) diff --git a/kernel/arch/x86/crypto/chacha20_glue.c b/kernel/arch/x86/crypto/chacha20_glue.c new file mode 100644 index 000000000..722bacea0 --- /dev/null +++ b/kernel/arch/x86/crypto/chacha20_glue.c @@ -0,0 +1,150 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define CHACHA20_STATE_ALIGN 16 + +asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src); +asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src); +#ifdef CONFIG_AS_AVX2 +asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src); +static bool chacha20_use_avx2; +#endif + +static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, + unsigned int bytes) +{ + u8 buf[CHACHA20_BLOCK_SIZE]; + +#ifdef CONFIG_AS_AVX2 + if (chacha20_use_avx2) { + while (bytes >= CHACHA20_BLOCK_SIZE * 8) { + chacha20_8block_xor_avx2(state, dst, src); + bytes -= CHACHA20_BLOCK_SIZE * 8; + src += CHACHA20_BLOCK_SIZE * 8; + dst += CHACHA20_BLOCK_SIZE * 8; + state[12] += 8; + } + } +#endif + while (bytes >= CHACHA20_BLOCK_SIZE * 4) { + chacha20_4block_xor_ssse3(state, dst, src); + bytes -= CHACHA20_BLOCK_SIZE * 4; + src += CHACHA20_BLOCK_SIZE * 4; + dst += CHACHA20_BLOCK_SIZE * 4; + state[12] += 4; + } + while (bytes >= CHACHA20_BLOCK_SIZE) { + chacha20_block_xor_ssse3(state, dst, src); + bytes -= CHACHA20_BLOCK_SIZE; + src += CHACHA20_BLOCK_SIZE; + dst += CHACHA20_BLOCK_SIZE; + state[12]++; + } + if (bytes) { + memcpy(buf, src, bytes); + chacha20_block_xor_ssse3(state, buf, buf); + memcpy(dst, buf, bytes); + } +} + +static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + u32 *state, state_buf[16 + (CHACHA20_STATE_ALIGN / sizeof(u32)) - 1]; + struct blkcipher_walk walk; + int err; + + if (!may_use_simd()) + return crypto_chacha20_crypt(desc, dst, src, nbytes); + + state = (u32 *)roundup((uintptr_t)state_buf, CHACHA20_STATE_ALIGN); + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE); + + crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv); + + kernel_fpu_begin(); + + while (walk.nbytes >= CHACHA20_BLOCK_SIZE) { + chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, + rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE)); + err = blkcipher_walk_done(desc, &walk, + walk.nbytes % CHACHA20_BLOCK_SIZE); + } + + if (walk.nbytes) { + chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, + walk.nbytes); + err = blkcipher_walk_done(desc, &walk, 0); + } + + kernel_fpu_end(); + + return err; +} + +static struct crypto_alg alg = { + .cra_name = "chacha20", + .cra_driver_name = "chacha20-simd", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_type = &crypto_blkcipher_type, + .cra_ctxsize = sizeof(struct chacha20_ctx), + .cra_alignmask = sizeof(u32) - 1, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CHACHA20_KEY_SIZE, + .max_keysize = CHACHA20_KEY_SIZE, + .ivsize = CHACHA20_IV_SIZE, + .geniv = "seqiv", + .setkey = crypto_chacha20_setkey, + .encrypt = chacha20_simd, + .decrypt = chacha20_simd, + }, + }, +}; + +static int __init chacha20_simd_mod_init(void) +{ + if (!cpu_has_ssse3) + return -ENODEV; + +#ifdef CONFIG_AS_AVX2 + chacha20_use_avx2 = cpu_has_avx && cpu_has_avx2 && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); +#endif + return crypto_register_alg(&alg); +} + +static void __exit chacha20_simd_mod_fini(void) +{ + crypto_unregister_alg(&alg); +} + +module_init(chacha20_simd_mod_init); +module_exit(chacha20_simd_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Martin Willi "); +MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated"); +MODULE_ALIAS_CRYPTO("chacha20"); +MODULE_ALIAS_CRYPTO("chacha20-simd"); diff --git a/kernel/arch/x86/crypto/crc32-pclmul_glue.c b/kernel/arch/x86/crypto/crc32-pclmul_glue.c index 1937fc1d8..07d2c6c86 100644 --- a/kernel/arch/x86/crypto/crc32-pclmul_glue.c +++ b/kernel/arch/x86/crypto/crc32-pclmul_glue.c @@ -35,7 +35,7 @@ #include #include -#include +#include #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 diff --git a/kernel/arch/x86/crypto/crc32c-intel_glue.c b/kernel/arch/x86/crypto/crc32c-intel_glue.c index 28640c3d6..81a595d75 100644 --- a/kernel/arch/x86/crypto/crc32c-intel_glue.c +++ b/kernel/arch/x86/crypto/crc32c-intel_glue.c @@ -32,8 +32,7 @@ #include #include -#include -#include +#include #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 diff --git a/kernel/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/kernel/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index 225be06ed..4fe27e074 100644 --- a/kernel/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/kernel/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -330,7 +330,7 @@ ENDPROC(crc_pcl) ## PCLMULQDQ tables ## Table is 128 entries x 2 words (8 bytes) each ################################################################ -.section .rotata, "a", %progbits +.section .rodata, "a", %progbits .align 8 K_table: .long 0x493c7d27, 0x00000001 diff --git a/kernel/arch/x86/crypto/crct10dif-pclmul_glue.c b/kernel/arch/x86/crypto/crct10dif-pclmul_glue.c index b6c67bf30..a3fcfc97a 100644 --- a/kernel/arch/x86/crypto/crct10dif-pclmul_glue.c +++ b/kernel/arch/x86/crypto/crct10dif-pclmul_glue.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include diff --git a/kernel/arch/x86/crypto/fpu.c b/kernel/arch/x86/crypto/fpu.c index f368ba261..e7d679e2a 100644 --- a/kernel/arch/x86/crypto/fpu.c +++ b/kernel/arch/x86/crypto/fpu.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include struct crypto_fpu_ctx { struct crypto_blkcipher *child; @@ -156,7 +156,7 @@ int __init crypto_fpu_init(void) return crypto_register_template(&crypto_fpu_tmpl); } -void __exit crypto_fpu_exit(void) +void crypto_fpu_exit(void) { crypto_unregister_template(&crypto_fpu_tmpl); } diff --git a/kernel/arch/x86/crypto/ghash-clmulni-intel_glue.c b/kernel/arch/x86/crypto/ghash-clmulni-intel_glue.c index daf8d2b9a..440df0c7a 100644 --- a/kernel/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/kernel/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #define GHASH_BLOCK_SIZE 16 diff --git a/kernel/arch/x86/crypto/poly1305-avx2-x86_64.S b/kernel/arch/x86/crypto/poly1305-avx2-x86_64.S new file mode 100644 index 000000000..eff2f414e --- /dev/null +++ b/kernel/arch/x86/crypto/poly1305-avx2-x86_64.S @@ -0,0 +1,386 @@ +/* + * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include + +.data +.align 32 + +ANMASK: .octa 0x0000000003ffffff0000000003ffffff + .octa 0x0000000003ffffff0000000003ffffff +ORMASK: .octa 0x00000000010000000000000001000000 + .octa 0x00000000010000000000000001000000 + +.text + +#define h0 0x00(%rdi) +#define h1 0x04(%rdi) +#define h2 0x08(%rdi) +#define h3 0x0c(%rdi) +#define h4 0x10(%rdi) +#define r0 0x00(%rdx) +#define r1 0x04(%rdx) +#define r2 0x08(%rdx) +#define r3 0x0c(%rdx) +#define r4 0x10(%rdx) +#define u0 0x00(%r8) +#define u1 0x04(%r8) +#define u2 0x08(%r8) +#define u3 0x0c(%r8) +#define u4 0x10(%r8) +#define w0 0x14(%r8) +#define w1 0x18(%r8) +#define w2 0x1c(%r8) +#define w3 0x20(%r8) +#define w4 0x24(%r8) +#define y0 0x28(%r8) +#define y1 0x2c(%r8) +#define y2 0x30(%r8) +#define y3 0x34(%r8) +#define y4 0x38(%r8) +#define m %rsi +#define hc0 %ymm0 +#define hc1 %ymm1 +#define hc2 %ymm2 +#define hc3 %ymm3 +#define hc4 %ymm4 +#define hc0x %xmm0 +#define hc1x %xmm1 +#define hc2x %xmm2 +#define hc3x %xmm3 +#define hc4x %xmm4 +#define t1 %ymm5 +#define t2 %ymm6 +#define t1x %xmm5 +#define t2x %xmm6 +#define ruwy0 %ymm7 +#define ruwy1 %ymm8 +#define ruwy2 %ymm9 +#define ruwy3 %ymm10 +#define ruwy4 %ymm11 +#define ruwy0x %xmm7 +#define ruwy1x %xmm8 +#define ruwy2x %xmm9 +#define ruwy3x %xmm10 +#define ruwy4x %xmm11 +#define svxz1 %ymm12 +#define svxz2 %ymm13 +#define svxz3 %ymm14 +#define svxz4 %ymm15 +#define d0 %r9 +#define d1 %r10 +#define d2 %r11 +#define d3 %r12 +#define d4 %r13 + +ENTRY(poly1305_4block_avx2) + # %rdi: Accumulator h[5] + # %rsi: 64 byte input block m + # %rdx: Poly1305 key r[5] + # %rcx: Quadblock count + # %r8: Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5], + + # This four-block variant uses loop unrolled block processing. It + # requires 4 Poly1305 keys: r, r^2, r^3 and r^4: + # h = (h + m) * r => h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r + + vzeroupper + push %rbx + push %r12 + push %r13 + + # combine r0,u0,w0,y0 + vmovd y0,ruwy0x + vmovd w0,t1x + vpunpcklqdq t1,ruwy0,ruwy0 + vmovd u0,t1x + vmovd r0,t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,ruwy0,ruwy0 + + # combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5 + vmovd y1,ruwy1x + vmovd w1,t1x + vpunpcklqdq t1,ruwy1,ruwy1 + vmovd u1,t1x + vmovd r1,t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,ruwy1,ruwy1 + vpslld $2,ruwy1,svxz1 + vpaddd ruwy1,svxz1,svxz1 + + # combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5 + vmovd y2,ruwy2x + vmovd w2,t1x + vpunpcklqdq t1,ruwy2,ruwy2 + vmovd u2,t1x + vmovd r2,t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,ruwy2,ruwy2 + vpslld $2,ruwy2,svxz2 + vpaddd ruwy2,svxz2,svxz2 + + # combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5 + vmovd y3,ruwy3x + vmovd w3,t1x + vpunpcklqdq t1,ruwy3,ruwy3 + vmovd u3,t1x + vmovd r3,t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,ruwy3,ruwy3 + vpslld $2,ruwy3,svxz3 + vpaddd ruwy3,svxz3,svxz3 + + # combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5 + vmovd y4,ruwy4x + vmovd w4,t1x + vpunpcklqdq t1,ruwy4,ruwy4 + vmovd u4,t1x + vmovd r4,t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,ruwy4,ruwy4 + vpslld $2,ruwy4,svxz4 + vpaddd ruwy4,svxz4,svxz4 + +.Ldoblock4: + # hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff, + # m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0] + vmovd 0x00(m),hc0x + vmovd 0x10(m),t1x + vpunpcklqdq t1,hc0,hc0 + vmovd 0x20(m),t1x + vmovd 0x30(m),t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,hc0,hc0 + vpand ANMASK(%rip),hc0,hc0 + vmovd h0,t1x + vpaddd t1,hc0,hc0 + # hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff, + # (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1] + vmovd 0x03(m),hc1x + vmovd 0x13(m),t1x + vpunpcklqdq t1,hc1,hc1 + vmovd 0x23(m),t1x + vmovd 0x33(m),t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,hc1,hc1 + vpsrld $2,hc1,hc1 + vpand ANMASK(%rip),hc1,hc1 + vmovd h1,t1x + vpaddd t1,hc1,hc1 + # hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff, + # (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2] + vmovd 0x06(m),hc2x + vmovd 0x16(m),t1x + vpunpcklqdq t1,hc2,hc2 + vmovd 0x26(m),t1x + vmovd 0x36(m),t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,hc2,hc2 + vpsrld $4,hc2,hc2 + vpand ANMASK(%rip),hc2,hc2 + vmovd h2,t1x + vpaddd t1,hc2,hc2 + # hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff, + # (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3] + vmovd 0x09(m),hc3x + vmovd 0x19(m),t1x + vpunpcklqdq t1,hc3,hc3 + vmovd 0x29(m),t1x + vmovd 0x39(m),t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,hc3,hc3 + vpsrld $6,hc3,hc3 + vpand ANMASK(%rip),hc3,hc3 + vmovd h3,t1x + vpaddd t1,hc3,hc3 + # hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24), + # (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4] + vmovd 0x0c(m),hc4x + vmovd 0x1c(m),t1x + vpunpcklqdq t1,hc4,hc4 + vmovd 0x2c(m),t1x + vmovd 0x3c(m),t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,hc4,hc4 + vpsrld $8,hc4,hc4 + vpor ORMASK(%rip),hc4,hc4 + vmovd h4,t1x + vpaddd t1,hc4,hc4 + + # t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ] + vpmuludq hc0,ruwy0,t1 + # t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ] + vpmuludq hc1,svxz4,t2 + vpaddq t2,t1,t1 + # t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ] + vpmuludq hc2,svxz3,t2 + vpaddq t2,t1,t1 + # t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ] + vpmuludq hc3,svxz2,t2 + vpaddq t2,t1,t1 + # t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ] + vpmuludq hc4,svxz1,t2 + vpaddq t2,t1,t1 + # d0 = t1[0] + t1[1] + t[2] + t[3] + vpermq $0xee,t1,t2 + vpaddq t2,t1,t1 + vpsrldq $8,t1,t2 + vpaddq t2,t1,t1 + vmovq t1x,d0 + + # t1 = [ hc0[3] * r1, hc0[2] * u1,hc0[1] * w1, hc0[0] * y1 ] + vpmuludq hc0,ruwy1,t1 + # t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ] + vpmuludq hc1,ruwy0,t2 + vpaddq t2,t1,t1 + # t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ] + vpmuludq hc2,svxz4,t2 + vpaddq t2,t1,t1 + # t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ] + vpmuludq hc3,svxz3,t2 + vpaddq t2,t1,t1 + # t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ] + vpmuludq hc4,svxz2,t2 + vpaddq t2,t1,t1 + # d1 = t1[0] + t1[1] + t1[3] + t1[4] + vpermq $0xee,t1,t2 + vpaddq t2,t1,t1 + vpsrldq $8,t1,t2 + vpaddq t2,t1,t1 + vmovq t1x,d1 + + # t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ] + vpmuludq hc0,ruwy2,t1 + # t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ] + vpmuludq hc1,ruwy1,t2 + vpaddq t2,t1,t1 + # t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ] + vpmuludq hc2,ruwy0,t2 + vpaddq t2,t1,t1 + # t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ] + vpmuludq hc3,svxz4,t2 + vpaddq t2,t1,t1 + # t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ] + vpmuludq hc4,svxz3,t2 + vpaddq t2,t1,t1 + # d2 = t1[0] + t1[1] + t1[2] + t1[3] + vpermq $0xee,t1,t2 + vpaddq t2,t1,t1 + vpsrldq $8,t1,t2 + vpaddq t2,t1,t1 + vmovq t1x,d2 + + # t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ] + vpmuludq hc0,ruwy3,t1 + # t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ] + vpmuludq hc1,ruwy2,t2 + vpaddq t2,t1,t1 + # t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ] + vpmuludq hc2,ruwy1,t2 + vpaddq t2,t1,t1 + # t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ] + vpmuludq hc3,ruwy0,t2 + vpaddq t2,t1,t1 + # t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ] + vpmuludq hc4,svxz4,t2 + vpaddq t2,t1,t1 + # d3 = t1[0] + t1[1] + t1[2] + t1[3] + vpermq $0xee,t1,t2 + vpaddq t2,t1,t1 + vpsrldq $8,t1,t2 + vpaddq t2,t1,t1 + vmovq t1x,d3 + + # t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ] + vpmuludq hc0,ruwy4,t1 + # t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ] + vpmuludq hc1,ruwy3,t2 + vpaddq t2,t1,t1 + # t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ] + vpmuludq hc2,ruwy2,t2 + vpaddq t2,t1,t1 + # t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ] + vpmuludq hc3,ruwy1,t2 + vpaddq t2,t1,t1 + # t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ] + vpmuludq hc4,ruwy0,t2 + vpaddq t2,t1,t1 + # d4 = t1[0] + t1[1] + t1[2] + t1[3] + vpermq $0xee,t1,t2 + vpaddq t2,t1,t1 + vpsrldq $8,t1,t2 + vpaddq t2,t1,t1 + vmovq t1x,d4 + + # d1 += d0 >> 26 + mov d0,%rax + shr $26,%rax + add %rax,d1 + # h0 = d0 & 0x3ffffff + mov d0,%rbx + and $0x3ffffff,%ebx + + # d2 += d1 >> 26 + mov d1,%rax + shr $26,%rax + add %rax,d2 + # h1 = d1 & 0x3ffffff + mov d1,%rax + and $0x3ffffff,%eax + mov %eax,h1 + + # d3 += d2 >> 26 + mov d2,%rax + shr $26,%rax + add %rax,d3 + # h2 = d2 & 0x3ffffff + mov d2,%rax + and $0x3ffffff,%eax + mov %eax,h2 + + # d4 += d3 >> 26 + mov d3,%rax + shr $26,%rax + add %rax,d4 + # h3 = d3 & 0x3ffffff + mov d3,%rax + and $0x3ffffff,%eax + mov %eax,h3 + + # h0 += (d4 >> 26) * 5 + mov d4,%rax + shr $26,%rax + lea (%eax,%eax,4),%eax + add %eax,%ebx + # h4 = d4 & 0x3ffffff + mov d4,%rax + and $0x3ffffff,%eax + mov %eax,h4 + + # h1 += h0 >> 26 + mov %ebx,%eax + shr $26,%eax + add %eax,h1 + # h0 = h0 & 0x3ffffff + andl $0x3ffffff,%ebx + mov %ebx,h0 + + add $0x40,m + dec %rcx + jnz .Ldoblock4 + + vzeroupper + pop %r13 + pop %r12 + pop %rbx + ret +ENDPROC(poly1305_4block_avx2) diff --git a/kernel/arch/x86/crypto/poly1305-sse2-x86_64.S b/kernel/arch/x86/crypto/poly1305-sse2-x86_64.S new file mode 100644 index 000000000..338c74805 --- /dev/null +++ b/kernel/arch/x86/crypto/poly1305-sse2-x86_64.S @@ -0,0 +1,582 @@ +/* + * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include + +.data +.align 16 + +ANMASK: .octa 0x0000000003ffffff0000000003ffffff +ORMASK: .octa 0x00000000010000000000000001000000 + +.text + +#define h0 0x00(%rdi) +#define h1 0x04(%rdi) +#define h2 0x08(%rdi) +#define h3 0x0c(%rdi) +#define h4 0x10(%rdi) +#define r0 0x00(%rdx) +#define r1 0x04(%rdx) +#define r2 0x08(%rdx) +#define r3 0x0c(%rdx) +#define r4 0x10(%rdx) +#define s1 0x00(%rsp) +#define s2 0x04(%rsp) +#define s3 0x08(%rsp) +#define s4 0x0c(%rsp) +#define m %rsi +#define h01 %xmm0 +#define h23 %xmm1 +#define h44 %xmm2 +#define t1 %xmm3 +#define t2 %xmm4 +#define t3 %xmm5 +#define t4 %xmm6 +#define mask %xmm7 +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 +#define d4 %r12 + +ENTRY(poly1305_block_sse2) + # %rdi: Accumulator h[5] + # %rsi: 16 byte input block m + # %rdx: Poly1305 key r[5] + # %rcx: Block count + + # This single block variant tries to improve performance by doing two + # multiplications in parallel using SSE instructions. There is quite + # some quardword packing involved, hence the speedup is marginal. + + push %rbx + push %r12 + sub $0x10,%rsp + + # s1..s4 = r1..r4 * 5 + mov r1,%eax + lea (%eax,%eax,4),%eax + mov %eax,s1 + mov r2,%eax + lea (%eax,%eax,4),%eax + mov %eax,s2 + mov r3,%eax + lea (%eax,%eax,4),%eax + mov %eax,s3 + mov r4,%eax + lea (%eax,%eax,4),%eax + mov %eax,s4 + + movdqa ANMASK(%rip),mask + +.Ldoblock: + # h01 = [0, h1, 0, h0] + # h23 = [0, h3, 0, h2] + # h44 = [0, h4, 0, h4] + movd h0,h01 + movd h1,t1 + movd h2,h23 + movd h3,t2 + movd h4,h44 + punpcklqdq t1,h01 + punpcklqdq t2,h23 + punpcklqdq h44,h44 + + # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ] + movd 0x00(m),t1 + movd 0x03(m),t2 + psrld $2,t2 + punpcklqdq t2,t1 + pand mask,t1 + paddd t1,h01 + # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ] + movd 0x06(m),t1 + movd 0x09(m),t2 + psrld $4,t1 + psrld $6,t2 + punpcklqdq t2,t1 + pand mask,t1 + paddd t1,h23 + # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ] + mov 0x0c(m),%eax + shr $8,%eax + or $0x01000000,%eax + movd %eax,t1 + pshufd $0xc4,t1,t1 + paddd t1,h44 + + # t1[0] = h0 * r0 + h2 * s3 + # t1[1] = h1 * s4 + h3 * s2 + movd r0,t1 + movd s4,t2 + punpcklqdq t2,t1 + pmuludq h01,t1 + movd s3,t2 + movd s2,t3 + punpcklqdq t3,t2 + pmuludq h23,t2 + paddq t2,t1 + # t2[0] = h0 * r1 + h2 * s4 + # t2[1] = h1 * r0 + h3 * s3 + movd r1,t2 + movd r0,t3 + punpcklqdq t3,t2 + pmuludq h01,t2 + movd s4,t3 + movd s3,t4 + punpcklqdq t4,t3 + pmuludq h23,t3 + paddq t3,t2 + # t3[0] = h4 * s1 + # t3[1] = h4 * s2 + movd s1,t3 + movd s2,t4 + punpcklqdq t4,t3 + pmuludq h44,t3 + # d0 = t1[0] + t1[1] + t3[0] + # d1 = t2[0] + t2[1] + t3[1] + movdqa t1,t4 + punpcklqdq t2,t4 + punpckhqdq t2,t1 + paddq t4,t1 + paddq t3,t1 + movq t1,d0 + psrldq $8,t1 + movq t1,d1 + + # t1[0] = h0 * r2 + h2 * r0 + # t1[1] = h1 * r1 + h3 * s4 + movd r2,t1 + movd r1,t2 + punpcklqdq t2,t1 + pmuludq h01,t1 + movd r0,t2 + movd s4,t3 + punpcklqdq t3,t2 + pmuludq h23,t2 + paddq t2,t1 + # t2[0] = h0 * r3 + h2 * r1 + # t2[1] = h1 * r2 + h3 * r0 + movd r3,t2 + movd r2,t3 + punpcklqdq t3,t2 + pmuludq h01,t2 + movd r1,t3 + movd r0,t4 + punpcklqdq t4,t3 + pmuludq h23,t3 + paddq t3,t2 + # t3[0] = h4 * s3 + # t3[1] = h4 * s4 + movd s3,t3 + movd s4,t4 + punpcklqdq t4,t3 + pmuludq h44,t3 + # d2 = t1[0] + t1[1] + t3[0] + # d3 = t2[0] + t2[1] + t3[1] + movdqa t1,t4 + punpcklqdq t2,t4 + punpckhqdq t2,t1 + paddq t4,t1 + paddq t3,t1 + movq t1,d2 + psrldq $8,t1 + movq t1,d3 + + # t1[0] = h0 * r4 + h2 * r2 + # t1[1] = h1 * r3 + h3 * r1 + movd r4,t1 + movd r3,t2 + punpcklqdq t2,t1 + pmuludq h01,t1 + movd r2,t2 + movd r1,t3 + punpcklqdq t3,t2 + pmuludq h23,t2 + paddq t2,t1 + # t3[0] = h4 * r0 + movd r0,t3 + pmuludq h44,t3 + # d4 = t1[0] + t1[1] + t3[0] + movdqa t1,t4 + psrldq $8,t4 + paddq t4,t1 + paddq t3,t1 + movq t1,d4 + + # d1 += d0 >> 26 + mov d0,%rax + shr $26,%rax + add %rax,d1 + # h0 = d0 & 0x3ffffff + mov d0,%rbx + and $0x3ffffff,%ebx + + # d2 += d1 >> 26 + mov d1,%rax + shr $26,%rax + add %rax,d2 + # h1 = d1 & 0x3ffffff + mov d1,%rax + and $0x3ffffff,%eax + mov %eax,h1 + + # d3 += d2 >> 26 + mov d2,%rax + shr $26,%rax + add %rax,d3 + # h2 = d2 & 0x3ffffff + mov d2,%rax + and $0x3ffffff,%eax + mov %eax,h2 + + # d4 += d3 >> 26 + mov d3,%rax + shr $26,%rax + add %rax,d4 + # h3 = d3 & 0x3ffffff + mov d3,%rax + and $0x3ffffff,%eax + mov %eax,h3 + + # h0 += (d4 >> 26) * 5 + mov d4,%rax + shr $26,%rax + lea (%eax,%eax,4),%eax + add %eax,%ebx + # h4 = d4 & 0x3ffffff + mov d4,%rax + and $0x3ffffff,%eax + mov %eax,h4 + + # h1 += h0 >> 26 + mov %ebx,%eax + shr $26,%eax + add %eax,h1 + # h0 = h0 & 0x3ffffff + andl $0x3ffffff,%ebx + mov %ebx,h0 + + add $0x10,m + dec %rcx + jnz .Ldoblock + + add $0x10,%rsp + pop %r12 + pop %rbx + ret +ENDPROC(poly1305_block_sse2) + + +#define u0 0x00(%r8) +#define u1 0x04(%r8) +#define u2 0x08(%r8) +#define u3 0x0c(%r8) +#define u4 0x10(%r8) +#define hc0 %xmm0 +#define hc1 %xmm1 +#define hc2 %xmm2 +#define hc3 %xmm5 +#define hc4 %xmm6 +#define ru0 %xmm7 +#define ru1 %xmm8 +#define ru2 %xmm9 +#define ru3 %xmm10 +#define ru4 %xmm11 +#define sv1 %xmm12 +#define sv2 %xmm13 +#define sv3 %xmm14 +#define sv4 %xmm15 +#undef d0 +#define d0 %r13 + +ENTRY(poly1305_2block_sse2) + # %rdi: Accumulator h[5] + # %rsi: 16 byte input block m + # %rdx: Poly1305 key r[5] + # %rcx: Doubleblock count + # %r8: Poly1305 derived key r^2 u[5] + + # This two-block variant further improves performance by using loop + # unrolled block processing. This is more straight forward and does + # less byte shuffling, but requires a second Poly1305 key r^2: + # h = (h + m) * r => h = (h + m1) * r^2 + m2 * r + + push %rbx + push %r12 + push %r13 + + # combine r0,u0 + movd u0,ru0 + movd r0,t1 + punpcklqdq t1,ru0 + + # combine r1,u1 and s1=r1*5,v1=u1*5 + movd u1,ru1 + movd r1,t1 + punpcklqdq t1,ru1 + movdqa ru1,sv1 + pslld $2,sv1 + paddd ru1,sv1 + + # combine r2,u2 and s2=r2*5,v2=u2*5 + movd u2,ru2 + movd r2,t1 + punpcklqdq t1,ru2 + movdqa ru2,sv2 + pslld $2,sv2 + paddd ru2,sv2 + + # combine r3,u3 and s3=r3*5,v3=u3*5 + movd u3,ru3 + movd r3,t1 + punpcklqdq t1,ru3 + movdqa ru3,sv3 + pslld $2,sv3 + paddd ru3,sv3 + + # combine r4,u4 and s4=r4*5,v4=u4*5 + movd u4,ru4 + movd r4,t1 + punpcklqdq t1,ru4 + movdqa ru4,sv4 + pslld $2,sv4 + paddd ru4,sv4 + +.Ldoblock2: + # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ] + movd 0x00(m),hc0 + movd 0x10(m),t1 + punpcklqdq t1,hc0 + pand ANMASK(%rip),hc0 + movd h0,t1 + paddd t1,hc0 + # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ] + movd 0x03(m),hc1 + movd 0x13(m),t1 + punpcklqdq t1,hc1 + psrld $2,hc1 + pand ANMASK(%rip),hc1 + movd h1,t1 + paddd t1,hc1 + # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ] + movd 0x06(m),hc2 + movd 0x16(m),t1 + punpcklqdq t1,hc2 + psrld $4,hc2 + pand ANMASK(%rip),hc2 + movd h2,t1 + paddd t1,hc2 + # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ] + movd 0x09(m),hc3 + movd 0x19(m),t1 + punpcklqdq t1,hc3 + psrld $6,hc3 + pand ANMASK(%rip),hc3 + movd h3,t1 + paddd t1,hc3 + # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ] + movd 0x0c(m),hc4 + movd 0x1c(m),t1 + punpcklqdq t1,hc4 + psrld $8,hc4 + por ORMASK(%rip),hc4 + movd h4,t1 + paddd t1,hc4 + + # t1 = [ hc0[1] * r0, hc0[0] * u0 ] + movdqa ru0,t1 + pmuludq hc0,t1 + # t1 += [ hc1[1] * s4, hc1[0] * v4 ] + movdqa sv4,t2 + pmuludq hc1,t2 + paddq t2,t1 + # t1 += [ hc2[1] * s3, hc2[0] * v3 ] + movdqa sv3,t2 + pmuludq hc2,t2 + paddq t2,t1 + # t1 += [ hc3[1] * s2, hc3[0] * v2 ] + movdqa sv2,t2 + pmuludq hc3,t2 + paddq t2,t1 + # t1 += [ hc4[1] * s1, hc4[0] * v1 ] + movdqa sv1,t2 + pmuludq hc4,t2 + paddq t2,t1 + # d0 = t1[0] + t1[1] + movdqa t1,t2 + psrldq $8,t2 + paddq t2,t1 + movq t1,d0 + + # t1 = [ hc0[1] * r1, hc0[0] * u1 ] + movdqa ru1,t1 + pmuludq hc0,t1 + # t1 += [ hc1[1] * r0, hc1[0] * u0 ] + movdqa ru0,t2 + pmuludq hc1,t2 + paddq t2,t1 + # t1 += [ hc2[1] * s4, hc2[0] * v4 ] + movdqa sv4,t2 + pmuludq hc2,t2 + paddq t2,t1 + # t1 += [ hc3[1] * s3, hc3[0] * v3 ] + movdqa sv3,t2 + pmuludq hc3,t2 + paddq t2,t1 + # t1 += [ hc4[1] * s2, hc4[0] * v2 ] + movdqa sv2,t2 + pmuludq hc4,t2 + paddq t2,t1 + # d1 = t1[0] + t1[1] + movdqa t1,t2 + psrldq $8,t2 + paddq t2,t1 + movq t1,d1 + + # t1 = [ hc0[1] * r2, hc0[0] * u2 ] + movdqa ru2,t1 + pmuludq hc0,t1 + # t1 += [ hc1[1] * r1, hc1[0] * u1 ] + movdqa ru1,t2 + pmuludq hc1,t2 + paddq t2,t1 + # t1 += [ hc2[1] * r0, hc2[0] * u0 ] + movdqa ru0,t2 + pmuludq hc2,t2 + paddq t2,t1 + # t1 += [ hc3[1] * s4, hc3[0] * v4 ] + movdqa sv4,t2 + pmuludq hc3,t2 + paddq t2,t1 + # t1 += [ hc4[1] * s3, hc4[0] * v3 ] + movdqa sv3,t2 + pmuludq hc4,t2 + paddq t2,t1 + # d2 = t1[0] + t1[1] + movdqa t1,t2 + psrldq $8,t2 + paddq t2,t1 + movq t1,d2 + + # t1 = [ hc0[1] * r3, hc0[0] * u3 ] + movdqa ru3,t1 + pmuludq hc0,t1 + # t1 += [ hc1[1] * r2, hc1[0] * u2 ] + movdqa ru2,t2 + pmuludq hc1,t2 + paddq t2,t1 + # t1 += [ hc2[1] * r1, hc2[0] * u1 ] + movdqa ru1,t2 + pmuludq hc2,t2 + paddq t2,t1 + # t1 += [ hc3[1] * r0, hc3[0] * u0 ] + movdqa ru0,t2 + pmuludq hc3,t2 + paddq t2,t1 + # t1 += [ hc4[1] * s4, hc4[0] * v4 ] + movdqa sv4,t2 + pmuludq hc4,t2 + paddq t2,t1 + # d3 = t1[0] + t1[1] + movdqa t1,t2 + psrldq $8,t2 + paddq t2,t1 + movq t1,d3 + + # t1 = [ hc0[1] * r4, hc0[0] * u4 ] + movdqa ru4,t1 + pmuludq hc0,t1 + # t1 += [ hc1[1] * r3, hc1[0] * u3 ] + movdqa ru3,t2 + pmuludq hc1,t2 + paddq t2,t1 + # t1 += [ hc2[1] * r2, hc2[0] * u2 ] + movdqa ru2,t2 + pmuludq hc2,t2 + paddq t2,t1 + # t1 += [ hc3[1] * r1, hc3[0] * u1 ] + movdqa ru1,t2 + pmuludq hc3,t2 + paddq t2,t1 + # t1 += [ hc4[1] * r0, hc4[0] * u0 ] + movdqa ru0,t2 + pmuludq hc4,t2 + paddq t2,t1 + # d4 = t1[0] + t1[1] + movdqa t1,t2 + psrldq $8,t2 + paddq t2,t1 + movq t1,d4 + + # d1 += d0 >> 26 + mov d0,%rax + shr $26,%rax + add %rax,d1 + # h0 = d0 & 0x3ffffff + mov d0,%rbx + and $0x3ffffff,%ebx + + # d2 += d1 >> 26 + mov d1,%rax + shr $26,%rax + add %rax,d2 + # h1 = d1 & 0x3ffffff + mov d1,%rax + and $0x3ffffff,%eax + mov %eax,h1 + + # d3 += d2 >> 26 + mov d2,%rax + shr $26,%rax + add %rax,d3 + # h2 = d2 & 0x3ffffff + mov d2,%rax + and $0x3ffffff,%eax + mov %eax,h2 + + # d4 += d3 >> 26 + mov d3,%rax + shr $26,%rax + add %rax,d4 + # h3 = d3 & 0x3ffffff + mov d3,%rax + and $0x3ffffff,%eax + mov %eax,h3 + + # h0 += (d4 >> 26) * 5 + mov d4,%rax + shr $26,%rax + lea (%eax,%eax,4),%eax + add %eax,%ebx + # h4 = d4 & 0x3ffffff + mov d4,%rax + and $0x3ffffff,%eax + mov %eax,h4 + + # h1 += h0 >> 26 + mov %ebx,%eax + shr $26,%eax + add %eax,h1 + # h0 = h0 & 0x3ffffff + andl $0x3ffffff,%ebx + mov %ebx,h0 + + add $0x20,m + dec %rcx + jnz .Ldoblock2 + + pop %r13 + pop %r12 + pop %rbx + ret +ENDPROC(poly1305_2block_sse2) diff --git a/kernel/arch/x86/crypto/poly1305_glue.c b/kernel/arch/x86/crypto/poly1305_glue.c new file mode 100644 index 000000000..4264a3d59 --- /dev/null +++ b/kernel/arch/x86/crypto/poly1305_glue.c @@ -0,0 +1,207 @@ +/* + * Poly1305 authenticator algorithm, RFC7539, SIMD glue code + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct poly1305_simd_desc_ctx { + struct poly1305_desc_ctx base; + /* derived key u set? */ + bool uset; +#ifdef CONFIG_AS_AVX2 + /* derived keys r^3, r^4 set? */ + bool wset; +#endif + /* derived Poly1305 key r^2 */ + u32 u[5]; + /* ... silently appended r^3 and r^4 when using AVX2 */ +}; + +asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src, + const u32 *r, unsigned int blocks); +asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r, + unsigned int blocks, const u32 *u); +#ifdef CONFIG_AS_AVX2 +asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r, + unsigned int blocks, const u32 *u); +static bool poly1305_use_avx2; +#endif + +static int poly1305_simd_init(struct shash_desc *desc) +{ + struct poly1305_simd_desc_ctx *sctx = shash_desc_ctx(desc); + + sctx->uset = false; +#ifdef CONFIG_AS_AVX2 + sctx->wset = false; +#endif + + return crypto_poly1305_init(desc); +} + +static void poly1305_simd_mult(u32 *a, const u32 *b) +{ + u8 m[POLY1305_BLOCK_SIZE]; + + memset(m, 0, sizeof(m)); + /* The poly1305 block function adds a hi-bit to the accumulator which + * we don't need for key multiplication; compensate for it. */ + a[4] -= 1 << 24; + poly1305_block_sse2(a, m, b, 1); +} + +static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx, + const u8 *src, unsigned int srclen) +{ + struct poly1305_simd_desc_ctx *sctx; + unsigned int blocks, datalen; + + BUILD_BUG_ON(offsetof(struct poly1305_simd_desc_ctx, base)); + sctx = container_of(dctx, struct poly1305_simd_desc_ctx, base); + + if (unlikely(!dctx->sset)) { + datalen = crypto_poly1305_setdesckey(dctx, src, srclen); + src += srclen - datalen; + srclen = datalen; + } + +#ifdef CONFIG_AS_AVX2 + if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) { + if (unlikely(!sctx->wset)) { + if (!sctx->uset) { + memcpy(sctx->u, dctx->r, sizeof(sctx->u)); + poly1305_simd_mult(sctx->u, dctx->r); + sctx->uset = true; + } + memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u)); + poly1305_simd_mult(sctx->u + 5, dctx->r); + memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u)); + poly1305_simd_mult(sctx->u + 10, dctx->r); + sctx->wset = true; + } + blocks = srclen / (POLY1305_BLOCK_SIZE * 4); + poly1305_4block_avx2(dctx->h, src, dctx->r, blocks, sctx->u); + src += POLY1305_BLOCK_SIZE * 4 * blocks; + srclen -= POLY1305_BLOCK_SIZE * 4 * blocks; + } +#endif + if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) { + if (unlikely(!sctx->uset)) { + memcpy(sctx->u, dctx->r, sizeof(sctx->u)); + poly1305_simd_mult(sctx->u, dctx->r); + sctx->uset = true; + } + blocks = srclen / (POLY1305_BLOCK_SIZE * 2); + poly1305_2block_sse2(dctx->h, src, dctx->r, blocks, sctx->u); + src += POLY1305_BLOCK_SIZE * 2 * blocks; + srclen -= POLY1305_BLOCK_SIZE * 2 * blocks; + } + if (srclen >= POLY1305_BLOCK_SIZE) { + poly1305_block_sse2(dctx->h, src, dctx->r, 1); + srclen -= POLY1305_BLOCK_SIZE; + } + return srclen; +} + +static int poly1305_simd_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + unsigned int bytes; + + /* kernel_fpu_begin/end is costly, use fallback for small updates */ + if (srclen <= 288 || !may_use_simd()) + return crypto_poly1305_update(desc, src, srclen); + + kernel_fpu_begin(); + + if (unlikely(dctx->buflen)) { + bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); + memcpy(dctx->buf + dctx->buflen, src, bytes); + src += bytes; + srclen -= bytes; + dctx->buflen += bytes; + + if (dctx->buflen == POLY1305_BLOCK_SIZE) { + poly1305_simd_blocks(dctx, dctx->buf, + POLY1305_BLOCK_SIZE); + dctx->buflen = 0; + } + } + + if (likely(srclen >= POLY1305_BLOCK_SIZE)) { + bytes = poly1305_simd_blocks(dctx, src, srclen); + src += srclen - bytes; + srclen = bytes; + } + + kernel_fpu_end(); + + if (unlikely(srclen)) { + dctx->buflen = srclen; + memcpy(dctx->buf, src, srclen); + } + + return 0; +} + +static struct shash_alg alg = { + .digestsize = POLY1305_DIGEST_SIZE, + .init = poly1305_simd_init, + .update = poly1305_simd_update, + .final = crypto_poly1305_final, + .setkey = crypto_poly1305_setkey, + .descsize = sizeof(struct poly1305_simd_desc_ctx), + .base = { + .cra_name = "poly1305", + .cra_driver_name = "poly1305-simd", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_alignmask = sizeof(u32) - 1, + .cra_blocksize = POLY1305_BLOCK_SIZE, + .cra_module = THIS_MODULE, + }, +}; + +static int __init poly1305_simd_mod_init(void) +{ + if (!cpu_has_xmm2) + return -ENODEV; + +#ifdef CONFIG_AS_AVX2 + poly1305_use_avx2 = cpu_has_avx && cpu_has_avx2 && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); + alg.descsize = sizeof(struct poly1305_simd_desc_ctx); + if (poly1305_use_avx2) + alg.descsize += 10 * sizeof(u32); +#endif + return crypto_register_shash(&alg); +} + +static void __exit poly1305_simd_mod_exit(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(poly1305_simd_mod_init); +module_exit(poly1305_simd_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Martin Willi "); +MODULE_DESCRIPTION("Poly1305 authenticator"); +MODULE_ALIAS_CRYPTO("poly1305"); +MODULE_ALIAS_CRYPTO("poly1305-simd"); diff --git a/kernel/arch/x86/crypto/serpent_avx2_glue.c b/kernel/arch/x86/crypto/serpent_avx2_glue.c index 2f63dc89e..6d198342e 100644 --- a/kernel/arch/x86/crypto/serpent_avx2_glue.c +++ b/kernel/arch/x86/crypto/serpent_avx2_glue.c @@ -20,8 +20,7 @@ #include #include #include -#include -#include +#include #include #include @@ -537,16 +536,15 @@ static struct crypto_alg srp_algs[10] = { { static int __init init(void) { - u64 xcr0; + const char *feature_name; if (!cpu_has_avx2 || !cpu_has_osxsave) { pr_info("AVX2 instructions are not detected.\n"); return -ENODEV; } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/kernel/arch/x86/crypto/serpent_avx_glue.c b/kernel/arch/x86/crypto/serpent_avx_glue.c index c8d478af8..5dc37026c 100644 --- a/kernel/arch/x86/crypto/serpent_avx_glue.c +++ b/kernel/arch/x86/crypto/serpent_avx_glue.c @@ -36,8 +36,7 @@ #include #include #include -#include -#include +#include #include #include @@ -596,16 +595,11 @@ static struct crypto_alg serpent_algs[10] = { { static int __init serpent_init(void) { - u64 xcr0; + const char *feature_name; - if (!cpu_has_avx || !cpu_has_osxsave) { - printk(KERN_INFO "AVX instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - printk(KERN_INFO "AVX detected but unusable.\n"); + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/kernel/arch/x86/crypto/sha-mb/sha1_mb.c b/kernel/arch/x86/crypto/sha-mb/sha1_mb.c index e510b1c5d..a841e9765 100644 --- a/kernel/arch/x86/crypto/sha-mb/sha1_mb.c +++ b/kernel/arch/x86/crypto/sha-mb/sha1_mb.c @@ -65,11 +65,8 @@ #include #include #include -#include -#include -#include #include -#include +#include #include "sha_mb_ctx.h" #define FLUSH_INTERVAL 1000 /* in usec */ @@ -885,7 +882,8 @@ static int __init sha1_mb_mod_init(void) INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher); cpu_state->cpu = cpu; cpu_state->alg_state = &sha1_mb_alg_state; - cpu_state->mgr = (struct sha1_ctx_mgr *) kzalloc(sizeof(struct sha1_ctx_mgr), GFP_KERNEL); + cpu_state->mgr = kzalloc(sizeof(struct sha1_ctx_mgr), + GFP_KERNEL); if (!cpu_state->mgr) goto err2; sha1_ctx_mgr_init(cpu_state->mgr); diff --git a/kernel/arch/x86/crypto/sha1_ni_asm.S b/kernel/arch/x86/crypto/sha1_ni_asm.S new file mode 100644 index 000000000..874a651b9 --- /dev/null +++ b/kernel/arch/x86/crypto/sha1_ni_asm.S @@ -0,0 +1,302 @@ +/* + * Intel SHA Extensions optimized implementation of a SHA-1 update function + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Sean Gulley + * Tim Chen + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include + +#define DIGEST_PTR %rdi /* 1st arg */ +#define DATA_PTR %rsi /* 2nd arg */ +#define NUM_BLKS %rdx /* 3rd arg */ + +#define RSPSAVE %rax + +/* gcc conversion */ +#define FRAME_SIZE 32 /* space for 2x16 bytes */ + +#define ABCD %xmm0 +#define E0 %xmm1 /* Need two E's b/c they ping pong */ +#define E1 %xmm2 +#define MSG0 %xmm3 +#define MSG1 %xmm4 +#define MSG2 %xmm5 +#define MSG3 %xmm6 +#define SHUF_MASK %xmm7 + + +/* + * Intel SHA Extensions optimized implementation of a SHA-1 update function + * + * The function takes a pointer to the current hash values, a pointer to the + * input data, and a number of 64 byte blocks to process. Once all blocks have + * been processed, the digest pointer is updated with the resulting hash value. + * The function only processes complete blocks, there is no functionality to + * store partial blocks. All message padding and hash value initialization must + * be done outside the update function. + * + * The indented lines in the loop are instructions related to rounds processing. + * The non-indented lines are instructions related to the message schedule. + * + * void sha1_ni_transform(uint32_t *digest, const void *data, + uint32_t numBlocks) + * digest : pointer to digest + * data: pointer to input data + * numBlocks: Number of blocks to process + */ +.text +.align 32 +ENTRY(sha1_ni_transform) + mov %rsp, RSPSAVE + sub $FRAME_SIZE, %rsp + and $~0xF, %rsp + + shl $6, NUM_BLKS /* convert to bytes */ + jz .Ldone_hash + add DATA_PTR, NUM_BLKS /* pointer to end of data */ + + /* load initial hash values */ + pinsrd $3, 1*16(DIGEST_PTR), E0 + movdqu 0*16(DIGEST_PTR), ABCD + pand UPPER_WORD_MASK(%rip), E0 + pshufd $0x1B, ABCD, ABCD + + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK + +.Lloop0: + /* Save hash values for addition after rounds */ + movdqa E0, (0*16)(%rsp) + movdqa ABCD, (1*16)(%rsp) + + /* Rounds 0-3 */ + movdqu 0*16(DATA_PTR), MSG0 + pshufb SHUF_MASK, MSG0 + paddd MSG0, E0 + movdqa ABCD, E1 + sha1rnds4 $0, E0, ABCD + + /* Rounds 4-7 */ + movdqu 1*16(DATA_PTR), MSG1 + pshufb SHUF_MASK, MSG1 + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG1, MSG0 + + /* Rounds 8-11 */ + movdqu 2*16(DATA_PTR), MSG2 + pshufb SHUF_MASK, MSG2 + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG2, MSG1 + pxor MSG2, MSG0 + + /* Rounds 12-15 */ + movdqu 3*16(DATA_PTR), MSG3 + pshufb SHUF_MASK, MSG3 + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG3, MSG2 + pxor MSG3, MSG1 + + /* Rounds 16-19 */ + sha1nexte MSG0, E0 + movdqa ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG0, MSG3 + pxor MSG0, MSG2 + + /* Rounds 20-23 */ + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + pxor MSG1, MSG3 + + /* Rounds 24-27 */ + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG2, MSG1 + pxor MSG2, MSG0 + + /* Rounds 28-31 */ + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG3, MSG2 + pxor MSG3, MSG1 + + /* Rounds 32-35 */ + sha1nexte MSG0, E0 + movdqa ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG0, MSG3 + pxor MSG0, MSG2 + + /* Rounds 36-39 */ + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + pxor MSG1, MSG3 + + /* Rounds 40-43 */ + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + pxor MSG2, MSG0 + + /* Rounds 44-47 */ + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG3, MSG2 + pxor MSG3, MSG1 + + /* Rounds 48-51 */ + sha1nexte MSG0, E0 + movdqa ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG0, MSG3 + pxor MSG0, MSG2 + + /* Rounds 52-55 */ + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG1, MSG0 + pxor MSG1, MSG3 + + /* Rounds 56-59 */ + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + pxor MSG2, MSG0 + + /* Rounds 60-63 */ + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $3, E1, ABCD + sha1msg1 MSG3, MSG2 + pxor MSG3, MSG1 + + /* Rounds 64-67 */ + sha1nexte MSG0, E0 + movdqa ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $3, E0, ABCD + sha1msg1 MSG0, MSG3 + pxor MSG0, MSG2 + + /* Rounds 68-71 */ + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $3, E1, ABCD + pxor MSG1, MSG3 + + /* Rounds 72-75 */ + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $3, E0, ABCD + + /* Rounds 76-79 */ + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1rnds4 $3, E1, ABCD + + /* Add current hash values with previously saved */ + sha1nexte (0*16)(%rsp), E0 + paddd (1*16)(%rsp), ABCD + + /* Increment data pointer and loop if more to process */ + add $64, DATA_PTR + cmp NUM_BLKS, DATA_PTR + jne .Lloop0 + + /* Write hash values back in the correct order */ + pshufd $0x1B, ABCD, ABCD + movdqu ABCD, 0*16(DIGEST_PTR) + pextrd $3, E0, 1*16(DIGEST_PTR) + +.Ldone_hash: + mov RSPSAVE, %rsp + + ret +ENDPROC(sha1_ni_transform) + +.data + +.align 64 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x000102030405060708090a0b0c0d0e0f +UPPER_WORD_MASK: + .octa 0xFFFFFFFF000000000000000000000000 diff --git a/kernel/arch/x86/crypto/sha1_ssse3_glue.c b/kernel/arch/x86/crypto/sha1_ssse3_glue.c index 33d1b9dc1..dd14616b7 100644 --- a/kernel/arch/x86/crypto/sha1_ssse3_glue.c +++ b/kernel/arch/x86/crypto/sha1_ssse3_glue.c @@ -29,28 +29,13 @@ #include #include #include -#include -#include -#include +#include +typedef void (sha1_transform_fn)(u32 *digest, const char *data, + unsigned int rounds); -asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, - unsigned int rounds); -#ifdef CONFIG_AS_AVX -asmlinkage void sha1_transform_avx(u32 *digest, const char *data, - unsigned int rounds); -#endif -#ifdef CONFIG_AS_AVX2 -#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ - -asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, - unsigned int rounds); -#endif - -static void (*sha1_transform_asm)(u32 *, const char *, unsigned int); - -static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) +static int sha1_update(struct shash_desc *desc, const u8 *data, + unsigned int len, sha1_transform_fn *sha1_xform) { struct sha1_state *sctx = shash_desc_ctx(desc); @@ -63,14 +48,14 @@ static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, kernel_fpu_begin(); sha1_base_do_update(desc, data, len, - (sha1_block_fn *)sha1_transform_asm); + (sha1_block_fn *)sha1_xform); kernel_fpu_end(); return 0; } -static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) +static int sha1_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out, sha1_transform_fn *sha1_xform) { if (!irq_fpu_usable()) return crypto_sha1_finup(desc, data, len, out); @@ -78,32 +63,37 @@ static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, kernel_fpu_begin(); if (len) sha1_base_do_update(desc, data, len, - (sha1_block_fn *)sha1_transform_asm); - sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_transform_asm); + (sha1_block_fn *)sha1_xform); + sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_xform); kernel_fpu_end(); return sha1_base_finish(desc, out); } -/* Add padding and return the message digest. */ -static int sha1_ssse3_final(struct shash_desc *desc, u8 *out) +asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, + unsigned int rounds); + +static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, + unsigned int len) { - return sha1_ssse3_finup(desc, NULL, 0, out); + return sha1_update(desc, data, len, + (sha1_transform_fn *) sha1_transform_ssse3); } -#ifdef CONFIG_AS_AVX2 -static void sha1_apply_transform_avx2(u32 *digest, const char *data, - unsigned int rounds) +static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) { - /* Select the optimal transform based on data block size */ - if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE) - sha1_transform_avx2(digest, data, rounds); - else - sha1_transform_avx(digest, data, rounds); + return sha1_finup(desc, data, len, out, + (sha1_transform_fn *) sha1_transform_ssse3); +} + +/* Add padding and return the message digest. */ +static int sha1_ssse3_final(struct shash_desc *desc, u8 *out) +{ + return sha1_ssse3_finup(desc, NULL, 0, out); } -#endif -static struct shash_alg alg = { +static struct shash_alg sha1_ssse3_alg = { .digestsize = SHA1_DIGEST_SIZE, .init = sha1_base_init, .update = sha1_ssse3_update, @@ -112,7 +102,7 @@ static struct shash_alg alg = { .descsize = sizeof(struct sha1_state), .base = { .cra_name = "sha1", - .cra_driver_name= "sha1-ssse3", + .cra_driver_name = "sha1-ssse3", .cra_priority = 150, .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA1_BLOCK_SIZE, @@ -120,73 +110,261 @@ static struct shash_alg alg = { } }; +static int register_sha1_ssse3(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + return crypto_register_shash(&sha1_ssse3_alg); + return 0; +} + +static void unregister_sha1_ssse3(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + crypto_unregister_shash(&sha1_ssse3_alg); +} + #ifdef CONFIG_AS_AVX -static bool __init avx_usable(void) +asmlinkage void sha1_transform_avx(u32 *digest, const char *data, + unsigned int rounds); + +static int sha1_avx_update(struct shash_desc *desc, const u8 *data, + unsigned int len) { - u64 xcr0; + return sha1_update(desc, data, len, + (sha1_transform_fn *) sha1_transform_avx); +} - if (!cpu_has_avx || !cpu_has_osxsave) - return false; +static int sha1_avx_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha1_finup(desc, data, len, out, + (sha1_transform_fn *) sha1_transform_avx); +} - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); +static int sha1_avx_final(struct shash_desc *desc, u8 *out) +{ + return sha1_avx_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha1_avx_alg = { + .digestsize = SHA1_DIGEST_SIZE, + .init = sha1_base_init, + .update = sha1_avx_update, + .final = sha1_avx_final, + .finup = sha1_avx_finup, + .descsize = sizeof(struct sha1_state), + .base = { + .cra_name = "sha1", + .cra_driver_name = "sha1-avx", + .cra_priority = 160, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; +static bool avx_usable(void) +{ + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { + if (cpu_has_avx) + pr_info("AVX detected but unusable.\n"); return false; } return true; } -#ifdef CONFIG_AS_AVX2 -static bool __init avx2_usable(void) +static int register_sha1_avx(void) { - if (avx_usable() && cpu_has_avx2 && boot_cpu_has(X86_FEATURE_BMI1) && - boot_cpu_has(X86_FEATURE_BMI2)) + if (avx_usable()) + return crypto_register_shash(&sha1_avx_alg); + return 0; +} + +static void unregister_sha1_avx(void) +{ + if (avx_usable()) + crypto_unregister_shash(&sha1_avx_alg); +} + +#else /* CONFIG_AS_AVX */ +static inline int register_sha1_avx(void) { return 0; } +static inline void unregister_sha1_avx(void) { } +#endif /* CONFIG_AS_AVX */ + + +#if defined(CONFIG_AS_AVX2) && (CONFIG_AS_AVX) +#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ + +asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, + unsigned int rounds); + +static bool avx2_usable(void) +{ + if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) + && boot_cpu_has(X86_FEATURE_BMI1) + && boot_cpu_has(X86_FEATURE_BMI2)) return true; return false; } + +static void sha1_apply_transform_avx2(u32 *digest, const char *data, + unsigned int rounds) +{ + /* Select the optimal transform based on data block size */ + if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE) + sha1_transform_avx2(digest, data, rounds); + else + sha1_transform_avx(digest, data, rounds); +} + +static int sha1_avx2_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha1_update(desc, data, len, + (sha1_transform_fn *) sha1_apply_transform_avx2); +} + +static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha1_finup(desc, data, len, out, + (sha1_transform_fn *) sha1_apply_transform_avx2); +} + +static int sha1_avx2_final(struct shash_desc *desc, u8 *out) +{ + return sha1_avx2_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha1_avx2_alg = { + .digestsize = SHA1_DIGEST_SIZE, + .init = sha1_base_init, + .update = sha1_avx2_update, + .final = sha1_avx2_final, + .finup = sha1_avx2_finup, + .descsize = sizeof(struct sha1_state), + .base = { + .cra_name = "sha1", + .cra_driver_name = "sha1-avx2", + .cra_priority = 170, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static int register_sha1_avx2(void) +{ + if (avx2_usable()) + return crypto_register_shash(&sha1_avx2_alg); + return 0; +} + +static void unregister_sha1_avx2(void) +{ + if (avx2_usable()) + crypto_unregister_shash(&sha1_avx2_alg); +} + +#else +static inline int register_sha1_avx2(void) { return 0; } +static inline void unregister_sha1_avx2(void) { } #endif + +#ifdef CONFIG_AS_SHA1_NI +asmlinkage void sha1_ni_transform(u32 *digest, const char *data, + unsigned int rounds); + +static int sha1_ni_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha1_update(desc, data, len, + (sha1_transform_fn *) sha1_ni_transform); +} + +static int sha1_ni_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha1_finup(desc, data, len, out, + (sha1_transform_fn *) sha1_ni_transform); +} + +static int sha1_ni_final(struct shash_desc *desc, u8 *out) +{ + return sha1_ni_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha1_ni_alg = { + .digestsize = SHA1_DIGEST_SIZE, + .init = sha1_base_init, + .update = sha1_ni_update, + .final = sha1_ni_final, + .finup = sha1_ni_finup, + .descsize = sizeof(struct sha1_state), + .base = { + .cra_name = "sha1", + .cra_driver_name = "sha1-ni", + .cra_priority = 250, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static int register_sha1_ni(void) +{ + if (boot_cpu_has(X86_FEATURE_SHA_NI)) + return crypto_register_shash(&sha1_ni_alg); + return 0; +} + +static void unregister_sha1_ni(void) +{ + if (boot_cpu_has(X86_FEATURE_SHA_NI)) + crypto_unregister_shash(&sha1_ni_alg); +} + +#else +static inline int register_sha1_ni(void) { return 0; } +static inline void unregister_sha1_ni(void) { } #endif static int __init sha1_ssse3_mod_init(void) { - char *algo_name; + if (register_sha1_ssse3()) + goto fail; - /* test for SSSE3 first */ - if (cpu_has_ssse3) { - sha1_transform_asm = sha1_transform_ssse3; - algo_name = "SSSE3"; + if (register_sha1_avx()) { + unregister_sha1_ssse3(); + goto fail; } -#ifdef CONFIG_AS_AVX - /* allow AVX to override SSSE3, it's a little faster */ - if (avx_usable()) { - sha1_transform_asm = sha1_transform_avx; - algo_name = "AVX"; -#ifdef CONFIG_AS_AVX2 - /* allow AVX2 to override AVX, it's a little faster */ - if (avx2_usable()) { - sha1_transform_asm = sha1_apply_transform_avx2; - algo_name = "AVX2"; - } -#endif + if (register_sha1_avx2()) { + unregister_sha1_avx(); + unregister_sha1_ssse3(); + goto fail; } -#endif - if (sha1_transform_asm) { - pr_info("Using %s optimized SHA-1 implementation\n", algo_name); - return crypto_register_shash(&alg); + if (register_sha1_ni()) { + unregister_sha1_avx2(); + unregister_sha1_avx(); + unregister_sha1_ssse3(); + goto fail; } - pr_info("Neither AVX nor AVX2 nor SSSE3 is available/usable.\n"); + return 0; +fail: return -ENODEV; } static void __exit sha1_ssse3_mod_fini(void) { - crypto_unregister_shash(&alg); + unregister_sha1_ni(); + unregister_sha1_avx2(); + unregister_sha1_avx(); + unregister_sha1_ssse3(); } module_init(sha1_ssse3_mod_init); diff --git a/kernel/arch/x86/crypto/sha256_ni_asm.S b/kernel/arch/x86/crypto/sha256_ni_asm.S new file mode 100644 index 000000000..748cdf21a --- /dev/null +++ b/kernel/arch/x86/crypto/sha256_ni_asm.S @@ -0,0 +1,353 @@ +/* + * Intel SHA Extensions optimized implementation of a SHA-256 update function + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Sean Gulley + * Tim Chen + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include + +#define DIGEST_PTR %rdi /* 1st arg */ +#define DATA_PTR %rsi /* 2nd arg */ +#define NUM_BLKS %rdx /* 3rd arg */ + +#define SHA256CONSTANTS %rax + +#define MSG %xmm0 +#define STATE0 %xmm1 +#define STATE1 %xmm2 +#define MSGTMP0 %xmm3 +#define MSGTMP1 %xmm4 +#define MSGTMP2 %xmm5 +#define MSGTMP3 %xmm6 +#define MSGTMP4 %xmm7 + +#define SHUF_MASK %xmm8 + +#define ABEF_SAVE %xmm9 +#define CDGH_SAVE %xmm10 + +/* + * Intel SHA Extensions optimized implementation of a SHA-256 update function + * + * The function takes a pointer to the current hash values, a pointer to the + * input data, and a number of 64 byte blocks to process. Once all blocks have + * been processed, the digest pointer is updated with the resulting hash value. + * The function only processes complete blocks, there is no functionality to + * store partial blocks. All message padding and hash value initialization must + * be done outside the update function. + * + * The indented lines in the loop are instructions related to rounds processing. + * The non-indented lines are instructions related to the message schedule. + * + * void sha256_ni_transform(uint32_t *digest, const void *data, + uint32_t numBlocks); + * digest : pointer to digest + * data: pointer to input data + * numBlocks: Number of blocks to process + */ + +.text +.align 32 +ENTRY(sha256_ni_transform) + + shl $6, NUM_BLKS /* convert to bytes */ + jz .Ldone_hash + add DATA_PTR, NUM_BLKS /* pointer to end of data */ + + /* + * load initial hash values + * Need to reorder these appropriately + * DCBA, HGFE -> ABEF, CDGH + */ + movdqu 0*16(DIGEST_PTR), STATE0 + movdqu 1*16(DIGEST_PTR), STATE1 + + pshufd $0xB1, STATE0, STATE0 /* CDAB */ + pshufd $0x1B, STATE1, STATE1 /* EFGH */ + movdqa STATE0, MSGTMP4 + palignr $8, STATE1, STATE0 /* ABEF */ + pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ + + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK + lea K256(%rip), SHA256CONSTANTS + +.Lloop0: + /* Save hash values for addition after rounds */ + movdqa STATE0, ABEF_SAVE + movdqa STATE1, CDGH_SAVE + + /* Rounds 0-3 */ + movdqu 0*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + movdqa MSG, MSGTMP0 + paddd 0*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 4-7 */ + movdqu 1*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + movdqa MSG, MSGTMP1 + paddd 1*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 8-11 */ + movdqu 2*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + movdqa MSG, MSGTMP2 + paddd 2*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 12-15 */ + movdqu 3*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + movdqa MSG, MSGTMP3 + paddd 3*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 16-19 */ + movdqa MSGTMP0, MSG + paddd 4*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 20-23 */ + movdqa MSGTMP1, MSG + paddd 5*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 24-27 */ + movdqa MSGTMP2, MSG + paddd 6*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 28-31 */ + movdqa MSGTMP3, MSG + paddd 7*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 32-35 */ + movdqa MSGTMP0, MSG + paddd 8*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 36-39 */ + movdqa MSGTMP1, MSG + paddd 9*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 40-43 */ + movdqa MSGTMP2, MSG + paddd 10*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 44-47 */ + movdqa MSGTMP3, MSG + paddd 11*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 48-51 */ + movdqa MSGTMP0, MSG + paddd 12*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 52-55 */ + movdqa MSGTMP1, MSG + paddd 13*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 56-59 */ + movdqa MSGTMP2, MSG + paddd 14*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 60-63 */ + movdqa MSGTMP3, MSG + paddd 15*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Add current hash values with previously saved */ + paddd ABEF_SAVE, STATE0 + paddd CDGH_SAVE, STATE1 + + /* Increment data pointer and loop if more to process */ + add $64, DATA_PTR + cmp NUM_BLKS, DATA_PTR + jne .Lloop0 + + /* Write hash values back in the correct order */ + pshufd $0x1B, STATE0, STATE0 /* FEBA */ + pshufd $0xB1, STATE1, STATE1 /* DCHG */ + movdqa STATE0, MSGTMP4 + pblendw $0xF0, STATE1, STATE0 /* DCBA */ + palignr $8, MSGTMP4, STATE1 /* HGFE */ + + movdqu STATE0, 0*16(DIGEST_PTR) + movdqu STATE1, 1*16(DIGEST_PTR) + +.Ldone_hash: + + ret +ENDPROC(sha256_ni_transform) + +.data +.align 64 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 diff --git a/kernel/arch/x86/crypto/sha256_ssse3_glue.c b/kernel/arch/x86/crypto/sha256_ssse3_glue.c index ccc338881..5f4d6086d 100644 --- a/kernel/arch/x86/crypto/sha256_ssse3_glue.c +++ b/kernel/arch/x86/crypto/sha256_ssse3_glue.c @@ -37,26 +37,15 @@ #include #include #include -#include -#include -#include +#include #include asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data, u64 rounds); -#ifdef CONFIG_AS_AVX -asmlinkage void sha256_transform_avx(u32 *digest, const char *data, - u64 rounds); -#endif -#ifdef CONFIG_AS_AVX2 -asmlinkage void sha256_transform_rorx(u32 *digest, const char *data, - u64 rounds); -#endif - -static void (*sha256_transform_asm)(u32 *, const char *, u64); +typedef void (sha256_transform_fn)(u32 *digest, const char *data, u64 rounds); -static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) +static int sha256_update(struct shash_desc *desc, const u8 *data, + unsigned int len, sha256_transform_fn *sha256_xform) { struct sha256_state *sctx = shash_desc_ctx(desc); @@ -69,14 +58,14 @@ static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, kernel_fpu_begin(); sha256_base_do_update(desc, data, len, - (sha256_block_fn *)sha256_transform_asm); + (sha256_block_fn *)sha256_xform); kernel_fpu_end(); return 0; } -static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) +static int sha256_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out, sha256_transform_fn *sha256_xform) { if (!irq_fpu_usable()) return crypto_sha256_finup(desc, data, len, out); @@ -84,20 +73,32 @@ static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data, kernel_fpu_begin(); if (len) sha256_base_do_update(desc, data, len, - (sha256_block_fn *)sha256_transform_asm); - sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_transform_asm); + (sha256_block_fn *)sha256_xform); + sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_xform); kernel_fpu_end(); return sha256_base_finish(desc, out); } +static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha256_update(desc, data, len, sha256_transform_ssse3); +} + +static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha256_finup(desc, data, len, out, sha256_transform_ssse3); +} + /* Add padding and return the message digest. */ static int sha256_ssse3_final(struct shash_desc *desc, u8 *out) { return sha256_ssse3_finup(desc, NULL, 0, out); } -static struct shash_alg algs[] = { { +static struct shash_alg sha256_ssse3_algs[] = { { .digestsize = SHA256_DIGEST_SIZE, .init = sha256_base_init, .update = sha256_ssse3_update, @@ -129,64 +130,294 @@ static struct shash_alg algs[] = { { } } }; +static int register_sha256_ssse3(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + return crypto_register_shashes(sha256_ssse3_algs, + ARRAY_SIZE(sha256_ssse3_algs)); + return 0; +} + +static void unregister_sha256_ssse3(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + crypto_unregister_shashes(sha256_ssse3_algs, + ARRAY_SIZE(sha256_ssse3_algs)); +} + #ifdef CONFIG_AS_AVX -static bool __init avx_usable(void) +asmlinkage void sha256_transform_avx(u32 *digest, const char *data, + u64 rounds); + +static int sha256_avx_update(struct shash_desc *desc, const u8 *data, + unsigned int len) { - u64 xcr0; + return sha256_update(desc, data, len, sha256_transform_avx); +} - if (!cpu_has_avx || !cpu_has_osxsave) - return false; +static int sha256_avx_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha256_finup(desc, data, len, out, sha256_transform_avx); +} - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); +static int sha256_avx_final(struct shash_desc *desc, u8 *out) +{ + return sha256_avx_finup(desc, NULL, 0, out); +} +static struct shash_alg sha256_avx_algs[] = { { + .digestsize = SHA256_DIGEST_SIZE, + .init = sha256_base_init, + .update = sha256_avx_update, + .final = sha256_avx_final, + .finup = sha256_avx_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha256", + .cra_driver_name = "sha256-avx", + .cra_priority = 160, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}, { + .digestsize = SHA224_DIGEST_SIZE, + .init = sha224_base_init, + .update = sha256_avx_update, + .final = sha256_avx_final, + .finup = sha256_avx_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha224", + .cra_driver_name = "sha224-avx", + .cra_priority = 160, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA224_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; + +static bool avx_usable(void) +{ + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { + if (cpu_has_avx) + pr_info("AVX detected but unusable.\n"); return false; } return true; } -#endif -static int __init sha256_ssse3_mod_init(void) +static int register_sha256_avx(void) { - /* test for SSSE3 first */ - if (cpu_has_ssse3) - sha256_transform_asm = sha256_transform_ssse3; + if (avx_usable()) + return crypto_register_shashes(sha256_avx_algs, + ARRAY_SIZE(sha256_avx_algs)); + return 0; +} -#ifdef CONFIG_AS_AVX - /* allow AVX to override SSSE3, it's a little faster */ - if (avx_usable()) { -#ifdef CONFIG_AS_AVX2 - if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI2)) - sha256_transform_asm = sha256_transform_rorx; - else +static void unregister_sha256_avx(void) +{ + if (avx_usable()) + crypto_unregister_shashes(sha256_avx_algs, + ARRAY_SIZE(sha256_avx_algs)); +} + +#else +static inline int register_sha256_avx(void) { return 0; } +static inline void unregister_sha256_avx(void) { } #endif - sha256_transform_asm = sha256_transform_avx; + +#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX) +asmlinkage void sha256_transform_rorx(u32 *digest, const char *data, + u64 rounds); + +static int sha256_avx2_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha256_update(desc, data, len, sha256_transform_rorx); +} + +static int sha256_avx2_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha256_finup(desc, data, len, out, sha256_transform_rorx); +} + +static int sha256_avx2_final(struct shash_desc *desc, u8 *out) +{ + return sha256_avx2_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha256_avx2_algs[] = { { + .digestsize = SHA256_DIGEST_SIZE, + .init = sha256_base_init, + .update = sha256_avx2_update, + .final = sha256_avx2_final, + .finup = sha256_avx2_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha256", + .cra_driver_name = "sha256-avx2", + .cra_priority = 170, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, } -#endif +}, { + .digestsize = SHA224_DIGEST_SIZE, + .init = sha224_base_init, + .update = sha256_avx2_update, + .final = sha256_avx2_final, + .finup = sha256_avx2_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha224", + .cra_driver_name = "sha224-avx2", + .cra_priority = 170, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA224_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; - if (sha256_transform_asm) { -#ifdef CONFIG_AS_AVX - if (sha256_transform_asm == sha256_transform_avx) - pr_info("Using AVX optimized SHA-256 implementation\n"); -#ifdef CONFIG_AS_AVX2 - else if (sha256_transform_asm == sha256_transform_rorx) - pr_info("Using AVX2 optimized SHA-256 implementation\n"); +static bool avx2_usable(void) +{ + if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_BMI2)) + return true; + + return false; +} + +static int register_sha256_avx2(void) +{ + if (avx2_usable()) + return crypto_register_shashes(sha256_avx2_algs, + ARRAY_SIZE(sha256_avx2_algs)); + return 0; +} + +static void unregister_sha256_avx2(void) +{ + if (avx2_usable()) + crypto_unregister_shashes(sha256_avx2_algs, + ARRAY_SIZE(sha256_avx2_algs)); +} + +#else +static inline int register_sha256_avx2(void) { return 0; } +static inline void unregister_sha256_avx2(void) { } #endif - else + +#ifdef CONFIG_AS_SHA256_NI +asmlinkage void sha256_ni_transform(u32 *digest, const char *data, + u64 rounds); /*unsigned int rounds);*/ + +static int sha256_ni_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha256_update(desc, data, len, sha256_ni_transform); +} + +static int sha256_ni_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha256_finup(desc, data, len, out, sha256_ni_transform); +} + +static int sha256_ni_final(struct shash_desc *desc, u8 *out) +{ + return sha256_ni_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha256_ni_algs[] = { { + .digestsize = SHA256_DIGEST_SIZE, + .init = sha256_base_init, + .update = sha256_ni_update, + .final = sha256_ni_final, + .finup = sha256_ni_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha256", + .cra_driver_name = "sha256-ni", + .cra_priority = 250, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}, { + .digestsize = SHA224_DIGEST_SIZE, + .init = sha224_base_init, + .update = sha256_ni_update, + .final = sha256_ni_final, + .finup = sha256_ni_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha224", + .cra_driver_name = "sha224-ni", + .cra_priority = 250, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA224_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; + +static int register_sha256_ni(void) +{ + if (boot_cpu_has(X86_FEATURE_SHA_NI)) + return crypto_register_shashes(sha256_ni_algs, + ARRAY_SIZE(sha256_ni_algs)); + return 0; +} + +static void unregister_sha256_ni(void) +{ + if (boot_cpu_has(X86_FEATURE_SHA_NI)) + crypto_unregister_shashes(sha256_ni_algs, + ARRAY_SIZE(sha256_ni_algs)); +} + +#else +static inline int register_sha256_ni(void) { return 0; } +static inline void unregister_sha256_ni(void) { } #endif - pr_info("Using SSSE3 optimized SHA-256 implementation\n"); - return crypto_register_shashes(algs, ARRAY_SIZE(algs)); + +static int __init sha256_ssse3_mod_init(void) +{ + if (register_sha256_ssse3()) + goto fail; + + if (register_sha256_avx()) { + unregister_sha256_ssse3(); + goto fail; } - pr_info("Neither AVX nor SSSE3 is available/usable.\n"); + if (register_sha256_avx2()) { + unregister_sha256_avx(); + unregister_sha256_ssse3(); + goto fail; + } + + if (register_sha256_ni()) { + unregister_sha256_avx2(); + unregister_sha256_avx(); + unregister_sha256_ssse3(); + goto fail; + } + + return 0; +fail: return -ENODEV; } static void __exit sha256_ssse3_mod_fini(void) { - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); + unregister_sha256_ni(); + unregister_sha256_avx2(); + unregister_sha256_avx(); + unregister_sha256_ssse3(); } module_init(sha256_ssse3_mod_init); diff --git a/kernel/arch/x86/crypto/sha512_ssse3_glue.c b/kernel/arch/x86/crypto/sha512_ssse3_glue.c index d9fa4c1e0..34e5083d6 100644 --- a/kernel/arch/x86/crypto/sha512_ssse3_glue.c +++ b/kernel/arch/x86/crypto/sha512_ssse3_glue.c @@ -35,27 +35,17 @@ #include #include #include -#include -#include -#include +#include #include asmlinkage void sha512_transform_ssse3(u64 *digest, const char *data, u64 rounds); -#ifdef CONFIG_AS_AVX -asmlinkage void sha512_transform_avx(u64 *digest, const char *data, - u64 rounds); -#endif -#ifdef CONFIG_AS_AVX2 -asmlinkage void sha512_transform_rorx(u64 *digest, const char *data, - u64 rounds); -#endif -static void (*sha512_transform_asm)(u64 *, const char *, u64); +typedef void (sha512_transform_fn)(u64 *digest, const char *data, u64 rounds); -static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) +static int sha512_update(struct shash_desc *desc, const u8 *data, + unsigned int len, sha512_transform_fn *sha512_xform) { struct sha512_state *sctx = shash_desc_ctx(desc); @@ -68,14 +58,14 @@ static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, kernel_fpu_begin(); sha512_base_do_update(desc, data, len, - (sha512_block_fn *)sha512_transform_asm); + (sha512_block_fn *)sha512_xform); kernel_fpu_end(); return 0; } -static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) +static int sha512_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out, sha512_transform_fn *sha512_xform) { if (!irq_fpu_usable()) return crypto_sha512_finup(desc, data, len, out); @@ -83,20 +73,32 @@ static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data, kernel_fpu_begin(); if (len) sha512_base_do_update(desc, data, len, - (sha512_block_fn *)sha512_transform_asm); - sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_transform_asm); + (sha512_block_fn *)sha512_xform); + sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_xform); kernel_fpu_end(); return sha512_base_finish(desc, out); } +static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha512_update(desc, data, len, sha512_transform_ssse3); +} + +static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha512_finup(desc, data, len, out, sha512_transform_ssse3); +} + /* Add padding and return the message digest. */ static int sha512_ssse3_final(struct shash_desc *desc, u8 *out) { return sha512_ssse3_finup(desc, NULL, 0, out); } -static struct shash_alg algs[] = { { +static struct shash_alg sha512_ssse3_algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = sha512_ssse3_update, @@ -128,64 +130,213 @@ static struct shash_alg algs[] = { { } } }; -#ifdef CONFIG_AS_AVX -static bool __init avx_usable(void) +static int register_sha512_ssse3(void) { - u64 xcr0; - - if (!cpu_has_avx || !cpu_has_osxsave) - return false; + if (boot_cpu_has(X86_FEATURE_SSSE3)) + return crypto_register_shashes(sha512_ssse3_algs, + ARRAY_SIZE(sha512_ssse3_algs)); + return 0; +} - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); +static void unregister_sha512_ssse3(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + crypto_unregister_shashes(sha512_ssse3_algs, + ARRAY_SIZE(sha512_ssse3_algs)); +} +#ifdef CONFIG_AS_AVX +asmlinkage void sha512_transform_avx(u64 *digest, const char *data, + u64 rounds); +static bool avx_usable(void) +{ + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { + if (cpu_has_avx) + pr_info("AVX detected but unusable.\n"); return false; } return true; } -#endif -static int __init sha512_ssse3_mod_init(void) +static int sha512_avx_update(struct shash_desc *desc, const u8 *data, + unsigned int len) { - /* test for SSSE3 first */ - if (cpu_has_ssse3) - sha512_transform_asm = sha512_transform_ssse3; + return sha512_update(desc, data, len, sha512_transform_avx); +} -#ifdef CONFIG_AS_AVX - /* allow AVX to override SSSE3, it's a little faster */ - if (avx_usable()) { -#ifdef CONFIG_AS_AVX2 - if (boot_cpu_has(X86_FEATURE_AVX2)) - sha512_transform_asm = sha512_transform_rorx; - else -#endif - sha512_transform_asm = sha512_transform_avx; +static int sha512_avx_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha512_finup(desc, data, len, out, sha512_transform_avx); +} + +/* Add padding and return the message digest. */ +static int sha512_avx_final(struct shash_desc *desc, u8 *out) +{ + return sha512_avx_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha512_avx_algs[] = { { + .digestsize = SHA512_DIGEST_SIZE, + .init = sha512_base_init, + .update = sha512_avx_update, + .final = sha512_avx_final, + .finup = sha512_avx_finup, + .descsize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha512", + .cra_driver_name = "sha512-avx", + .cra_priority = 160, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA512_BLOCK_SIZE, + .cra_module = THIS_MODULE, } -#endif +}, { + .digestsize = SHA384_DIGEST_SIZE, + .init = sha384_base_init, + .update = sha512_avx_update, + .final = sha512_avx_final, + .finup = sha512_avx_finup, + .descsize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha384", + .cra_driver_name = "sha384-avx", + .cra_priority = 160, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA384_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; - if (sha512_transform_asm) { -#ifdef CONFIG_AS_AVX - if (sha512_transform_asm == sha512_transform_avx) - pr_info("Using AVX optimized SHA-512 implementation\n"); -#ifdef CONFIG_AS_AVX2 - else if (sha512_transform_asm == sha512_transform_rorx) - pr_info("Using AVX2 optimized SHA-512 implementation\n"); +static int register_sha512_avx(void) +{ + if (avx_usable()) + return crypto_register_shashes(sha512_avx_algs, + ARRAY_SIZE(sha512_avx_algs)); + return 0; +} + +static void unregister_sha512_avx(void) +{ + if (avx_usable()) + crypto_unregister_shashes(sha512_avx_algs, + ARRAY_SIZE(sha512_avx_algs)); +} +#else +static inline int register_sha512_avx(void) { return 0; } +static inline void unregister_sha512_avx(void) { } #endif - else + +#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX) +asmlinkage void sha512_transform_rorx(u64 *digest, const char *data, + u64 rounds); + +static int sha512_avx2_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha512_update(desc, data, len, sha512_transform_rorx); +} + +static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha512_finup(desc, data, len, out, sha512_transform_rorx); +} + +/* Add padding and return the message digest. */ +static int sha512_avx2_final(struct shash_desc *desc, u8 *out) +{ + return sha512_avx2_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha512_avx2_algs[] = { { + .digestsize = SHA512_DIGEST_SIZE, + .init = sha512_base_init, + .update = sha512_avx2_update, + .final = sha512_avx2_final, + .finup = sha512_avx2_finup, + .descsize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha512", + .cra_driver_name = "sha512-avx2", + .cra_priority = 170, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA512_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}, { + .digestsize = SHA384_DIGEST_SIZE, + .init = sha384_base_init, + .update = sha512_avx2_update, + .final = sha512_avx2_final, + .finup = sha512_avx2_finup, + .descsize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha384", + .cra_driver_name = "sha384-avx2", + .cra_priority = 170, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA384_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; + +static bool avx2_usable(void) +{ + if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_BMI2)) + return true; + + return false; +} + +static int register_sha512_avx2(void) +{ + if (avx2_usable()) + return crypto_register_shashes(sha512_avx2_algs, + ARRAY_SIZE(sha512_avx2_algs)); + return 0; +} + +static void unregister_sha512_avx2(void) +{ + if (avx2_usable()) + crypto_unregister_shashes(sha512_avx2_algs, + ARRAY_SIZE(sha512_avx2_algs)); +} +#else +static inline int register_sha512_avx2(void) { return 0; } +static inline void unregister_sha512_avx2(void) { } #endif - pr_info("Using SSSE3 optimized SHA-512 implementation\n"); - return crypto_register_shashes(algs, ARRAY_SIZE(algs)); + +static int __init sha512_ssse3_mod_init(void) +{ + + if (register_sha512_ssse3()) + goto fail; + + if (register_sha512_avx()) { + unregister_sha512_ssse3(); + goto fail; } - pr_info("Neither AVX nor SSSE3 is available/usable.\n"); + if (register_sha512_avx2()) { + unregister_sha512_avx(); + unregister_sha512_ssse3(); + goto fail; + } + + return 0; +fail: return -ENODEV; } static void __exit sha512_ssse3_mod_fini(void) { - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); + unregister_sha512_avx2(); + unregister_sha512_avx(); + unregister_sha512_ssse3(); } module_init(sha512_ssse3_mod_init); diff --git a/kernel/arch/x86/crypto/twofish_avx_glue.c b/kernel/arch/x86/crypto/twofish_avx_glue.c index b5e2d5651..b7a3904b9 100644 --- a/kernel/arch/x86/crypto/twofish_avx_glue.c +++ b/kernel/arch/x86/crypto/twofish_avx_glue.c @@ -36,9 +36,7 @@ #include #include #include -#include -#include -#include +#include #include #include #include @@ -558,16 +556,10 @@ static struct crypto_alg twofish_algs[10] = { { static int __init twofish_init(void) { - u64 xcr0; + const char *feature_name; - if (!cpu_has_avx || !cpu_has_osxsave) { - printk(KERN_INFO "AVX instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - printk(KERN_INFO "AVX detected but unusable.\n"); + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } -- cgit 1.2.3-korg