/* Copyright (C) 2007-2013 Open Information Security Foundation
 *
 * You can copy, redistribute or modify this Program under the terms of
 * the GNU General Public License version 2 as published by the Free
 * Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 */

/**
 * \file
 *
 * \author Victor Julien
 *
 * Memcmp implementations for SSE3, SSE4.1, SSE4.2 and TILE-Gx SIMD.
 *
 * Both SCMemcmp and SCMemcmpLowercase return 0 on an exact match,
 * 1 on a failed match.
 */

#ifndef __UTIL_MEMCMP_H__
#define __UTIL_MEMCMP_H__

#include "util-optimize.h"

/** \brief compare two patterns, converting the 2nd to lowercase
 *  \warning *ONLY* the 2nd pattern is converted to lowercase
 */
static inline int SCMemcmpLowercase(const void *, const void *, size_t);

void MemcmpRegisterTests(void);

static inline int MemcmpLowercase(const void *s1, const void *s2, size_t n)
{
    ssize_t i;

    /* check backwards because we already tested the first
     * 2 to 4 chars. This way we are more likely to detect
     * a miss and thus speed up a little... */
    for (i = n - 1; i >= 0; i--) {
        if (((uint8_t *)s1)[i] != u8_tolower(*(((uint8_t *)s2)+i)))
            return 1;
    }

    return 0;
}

#if defined(__SSE4_2__)

#include <nmmintrin.h>

static inline int SCMemcmp(const void *s1, const void *s2, size_t n)
{
    __m128i b1, b2;
    int r;
    /* counter for how far we already matched in the buffer */
    size_t m = 0;

    do {
        /* apparently we can't just read 16 bytes even though
         * it almost always works fine :) */
        if (likely(n - m < 16)) {
            return memcmp(s1, s2, n - m) ? 1 : 0;
        }

        /* load the buffers into the 128bit vars */
        b1 = _mm_loadu_si128((const __m128i *) s1);
        b2 = _mm_loadu_si128((const __m128i *) s2);

        /* do the actual compare */
        m += (r = _mm_cmpestri(b1, n - m, b2, 16,
                    _SIDD_CMP_EQUAL_EACH | _SIDD_MASKED_NEGATIVE_POLARITY));

        s1 += 16;
        s2 += 16;
    } while (r == 16);

    return ((m == n) ? 0 : 1);
}
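/* A short illustration of the _mm_cmpestri() call above: with
 * _SIDD_CMP_EQUAL_EACH | _SIDD_MASKED_NEGATIVE_POLARITY it compares the 16
 * byte pairs for equality and returns the index of the first pair that
 * differs, or 16 when the whole block matches. That is why the loop keeps
 * running while r == 16, and why m can only equal n on a full match.
 *
 * Usage sketch of the return convention (example values, not taken from a
 * real caller):
 *
 *     SCMemcmp("abc", "abc", 3);   -> 0, exact match
 *     SCMemcmp("abc", "abd", 3);   -> 1, mismatch on the third byte
 */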
/* Range of values of uppercase characters. We only use the first 2 bytes. */
static char scmemcmp_uppercase[16] __attribute__((aligned(16))) = {
    'A', 'Z', 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, };

/** \brief compare two buffers in a case insensitive way
 *  \param s1 buffer already in lowercase
 *  \param s2 buffer with mixed upper and lowercase
 */
static inline int SCMemcmpLowercase(const void *s1, const void *s2, size_t n)
{
    __m128i b1, b2, mask;
    int r;
    /* counter for how far we already matched in the buffer */
    size_t m = 0;
    __m128i ucase = _mm_load_si128((const __m128i *) scmemcmp_uppercase);
    __m128i nulls = _mm_setzero_si128();
    __m128i uplow = _mm_set1_epi8(0x20);

    do {
        /* apparently we can't just read 16 bytes even though
         * it almost always works fine :) */
        if (likely(n - m < 16)) {
            return MemcmpLowercase(s1, s2, n - m);
        }

        b1 = _mm_loadu_si128((const __m128i *) s1);
        b2 = _mm_loadu_si128((const __m128i *) s2);
        size_t len = n - m;

        /* The first step is creating a mask that is FF for all uppercase
         * characters, 00 for all others */
        mask = _mm_cmpestrm(ucase, 2, b2, len, _SIDD_CMP_RANGES | _SIDD_UNIT_MASK);
        /* Next we use that mask to create a new one: 0x20 for the uppercase
         * chars, 0x00 for all others. */
        mask = _mm_blendv_epi8(nulls, uplow, mask);
        /* finally, merge the mask and the buffer converting the
         * uppercase to lowercase */
        b2 = _mm_add_epi8(b2, mask);

        /* search using our converted buffer */
        m += (r = _mm_cmpestri(b1, len, b2, 16,
                    _SIDD_CMP_EQUAL_EACH | _SIDD_MASKED_NEGATIVE_POLARITY));

        s1 += 16;
        s2 += 16;
    } while (r == 16);

    return ((m == n) ? 0 : 1);
}

#elif defined(__SSE4_1__)

#include <smmintrin.h>

#define SCMEMCMP_BYTES 16

static inline int SCMemcmp(const void *s1, const void *s2, size_t len)
{
    size_t offset = 0;
    __m128i b1, b2, c;

    do {
        /* apparently we can't just read 16 bytes even though
         * it almost always works fine :) */
        if (likely(len - offset < 16)) {
            return memcmp(s1, s2, len - offset) ? 1 : 0;
        }

        /* do unaligned loads using _mm_loadu_si128. On my Core2 E6600 using
         * _mm_lddqu_si128 was about 2% slower even though it's supposed to
         * be faster. */
        b1 = _mm_loadu_si128((const __m128i *) s1);
        b2 = _mm_loadu_si128((const __m128i *) s2);
        c = _mm_cmpeq_epi8(b1, b2);

        int diff = len - offset;
        if (diff < 16) {
            int rmask = ~(0xFFFFFFFF << diff);
            if ((_mm_movemask_epi8(c) & rmask) != rmask) {
                return 1;
            }
        } else {
            if (_mm_movemask_epi8(c) != 0x0000FFFF) {
                return 1;
            }
        }

        offset += SCMEMCMP_BYTES;
        s1 += SCMEMCMP_BYTES;
        s2 += SCMEMCMP_BYTES;
    } while (len > offset);

    return 0;
}

#define UPPER_LOW   0x40 /* "A" - 1 */
#define UPPER_HIGH  0x5B /* "Z" + 1 */

static inline int SCMemcmpLowercase(const void *s1, const void *s2, size_t len)
{
    size_t offset = 0;
    __m128i b1, b2, mask1, mask2, upper1, upper2, nulls, uplow;

    /* setup registers for upper to lower conversion */
    upper1 = _mm_set1_epi8(UPPER_LOW);
    upper2 = _mm_set1_epi8(UPPER_HIGH);
    nulls = _mm_setzero_si128();
    uplow = _mm_set1_epi8(0x20);

    do {
        /* apparently we can't just read 16 bytes even though
         * it almost always works fine :) */
        if (likely(len - offset < 16)) {
            return MemcmpLowercase(s1, s2, len - offset);
        }

        /* unaligned loading of the bytes to compare */
        b1 = _mm_loadu_si128((const __m128i *) s1);
        b2 = _mm_loadu_si128((const __m128i *) s2);

        /* mark all chars bigger than upper1 */
        mask1 = _mm_cmpgt_epi8(b2, upper1);
        /* mark all chars lower than upper2 */
        mask2 = _mm_cmplt_epi8(b2, upper2);
        /* merge the two, leaving only those that are true in both */
        mask1 = _mm_cmpeq_epi8(mask1, mask2);
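        /* Worked example for a single byte position (illustrative): 'G' is
         * 0x47, which is > 0x40 and < 0x5B, so both compares give 0xFF and
         * the merged mask1 byte is 0xFF. 'g' is 0x67: the < 0x5B test fails,
         * so the merged byte is 0x00. Bytes >= 0x80 are negative for these
         * signed compares and also end up as 0x00, so non-ASCII bytes are
         * left untouched. */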
        /* Next we use that mask to create a new one: 0x20 for the uppercase
         * chars, 0x00 for all others. */
        mask1 = _mm_blendv_epi8(nulls, uplow, mask1);
        /* add to b2, converting uppercase to lowercase */
        b2 = _mm_add_epi8(b2, mask1);

        /* now all is lowercase, let's do the actual compare (reuse mask1 reg) */
        mask1 = _mm_cmpeq_epi8(b1, b2);

        int diff = len - offset;
        if (diff < 16) {
            int rmask = ~(0xFFFFFFFF << diff);
            if ((_mm_movemask_epi8(mask1) & rmask) != rmask) {
                return 1;
            }
        } else {
            if (_mm_movemask_epi8(mask1) != 0x0000FFFF) {
                return 1;
            }
        }

        offset += SCMEMCMP_BYTES;
        s1 += SCMEMCMP_BYTES;
        s2 += SCMEMCMP_BYTES;
    } while (len > offset);

    return 0;
}

#elif defined(__SSE3__)

#include <pmmintrin.h> /* for SSE3 */

#define SCMEMCMP_BYTES 16

static inline int SCMemcmp(const void *s1, const void *s2, size_t len)
{
    size_t offset = 0;
    __m128i b1, b2, c;

    do {
        /* apparently we can't just read 16 bytes even though
         * it almost always works fine :) */
        if (likely(len - offset < 16)) {
            return memcmp(s1, s2, len - offset) ? 1 : 0;
        }

        /* do unaligned loads using _mm_loadu_si128. On my Core2 E6600 using
         * _mm_lddqu_si128 was about 2% slower even though it's supposed to
         * be faster. */
        b1 = _mm_loadu_si128((const __m128i *) s1);
        b2 = _mm_loadu_si128((const __m128i *) s2);
        c = _mm_cmpeq_epi8(b1, b2);

        int diff = len - offset;
        if (diff < 16) {
            int rmask = ~(0xFFFFFFFF << diff);
            if ((_mm_movemask_epi8(c) & rmask) != rmask) {
                return 1;
            }
        } else {
            if (_mm_movemask_epi8(c) != 0x0000FFFF) {
                return 1;
            }
        }

        offset += SCMEMCMP_BYTES;
        s1 += SCMEMCMP_BYTES;
        s2 += SCMEMCMP_BYTES;
    } while (len > offset);

    return 0;
}

#define UPPER_LOW   0x40 /* "A" - 1 */
#define UPPER_HIGH  0x5B /* "Z" + 1 */
#define UPPER_DELTA 0xDF /* 0xFF - 0x20 */

static inline int SCMemcmpLowercase(const void *s1, const void *s2, size_t len)
{
    size_t offset = 0;
    __m128i b1, b2, mask1, mask2, upper1, upper2, delta;

    /* setup registers for upper to lower conversion */
    upper1 = _mm_set1_epi8(UPPER_LOW);
    upper2 = _mm_set1_epi8(UPPER_HIGH);
    delta = _mm_set1_epi8(UPPER_DELTA);

    do {
        /* apparently we can't just read 16 bytes even though
         * it almost always works fine :) */
        if (likely(len - offset < 16)) {
            return MemcmpLowercase(s1, s2, len - offset);
        }

        /* unaligned loading of the bytes to compare */
        b1 = _mm_loadu_si128((const __m128i *) s1);
        b2 = _mm_loadu_si128((const __m128i *) s2);

        /* mark all chars bigger than upper1 */
        mask1 = _mm_cmpgt_epi8(b2, upper1);
        /* mark all chars lower than upper2 */
        mask2 = _mm_cmplt_epi8(b2, upper2);
        /* merge the two, leaving only those that are true in both */
        mask1 = _mm_cmpeq_epi8(mask1, mask2);
        /* sub delta leaves 0x20 only for the uppercase positions, the rest
         * is 0x00 due to the saturation (reuse mask1 reg) */
        mask1 = _mm_subs_epu8(mask1, delta);
        /* add to b2, converting uppercase to lowercase */
        b2 = _mm_add_epi8(b2, mask1);

        /* now all is lowercase, let's do the actual compare (reuse mask1 reg) */
        mask1 = _mm_cmpeq_epi8(b1, b2);

        int diff = len - offset;
        if (diff < 16) {
            int rmask = ~(0xFFFFFFFF << diff);
            if ((_mm_movemask_epi8(mask1) & rmask) != rmask) {
                return 1;
            }
        } else {
            if (_mm_movemask_epi8(mask1) != 0x0000FFFF) {
                return 1;
            }
        }

        offset += SCMEMCMP_BYTES;
        s1 += SCMEMCMP_BYTES;
        s2 += SCMEMCMP_BYTES;
    } while (len > offset);

    return 0;
}

#elif defined(__tile__)

#include <ctype.h> /* for tolower */

/* Compare two sequences of bytes; len must be non-zero. */
static inline int SCMemcmpNZ(const void *s1, const void *s2, size_t len)
{
    uint64_t b1, w1, aligned1;
    uint64_t b2, w2, aligned2;
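    /* Rough idea of the unaligned access trick used below (as I read the
     * TILE-Gx intrinsics): __insn_ldna ignores the low bits of the pointer
     * and loads the aligned 8-byte word containing it, so it cannot fault on
     * an unaligned address. __insn_dblalign then combines two consecutive
     * aligned words, shifted by the pointer's byte offset, to reconstruct
     * the 8 unaligned bytes we actually want to compare. */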
    /* Load the aligned words containing the beginning of each string.
     * These loads don't trigger unaligned events. */
    w1 = __insn_ldna(s1);
    w2 = __insn_ldna(s2);

    /* Can't just read the next 8 bytes because it might go past the end
     * of a page. */
    while (len > 8) {
        /* Here, the buffer extends into the next word by at least one
         * byte, so it is safe to read the next word. Do an aligned
         * load of the next word, then use the two words to create an
         * aligned word from each string. */
        b1 = __insn_ldna(s1 + 8);
        b2 = __insn_ldna(s2 + 8);
        aligned1 = __insn_dblalign(w1, b1, s1);
        aligned2 = __insn_dblalign(w2, b2, s2);

        if (aligned1 != aligned2)
            return 1;

        /* Move forward one word (8 bytes) */
        w1 = b1;
        w2 = b2;
        len -= 8;
        s1 += 8;
        s2 += 8;
    }

    /* Process the last (up to 8) bytes one at a time. */
    do {
        if (*(char*)s1 != *(char*)s2)
            return 1;
        s1++;
        s2++;
        len--;
    } while (len);

    return 0;
}

/* Compare two sequences of bytes. */
static inline int SCMemcmp(const void *s1, const void *s2, size_t len)
{
    if (len == 0)
        return 0;

    return SCMemcmpNZ(s1, s2, len);
}

/** \brief Convert 8 characters to lower case using SIMD.
 *  \param cc Word containing the 8 bytes.
 *  \return Word containing the 8 bytes, each converted to lowercase.
 */
static inline uint64_t vec_tolower(uint64_t cc)
{
    /* For uppercase letters, add 32 to convert to lower case. */
    uint64_t less_than_eq_Z = __insn_v1cmpltui(cc, 'Z' + 1);
    uint64_t less_than_A = __insn_v1cmpltui(cc, 'A');
    uint64_t is_upper = __insn_v1cmpne(less_than_eq_Z, less_than_A);
    return __insn_v1add(cc, __insn_v1shli(is_upper, 5));
}

/** \brief compare two buffers in a case insensitive way
 *  \param s1 buffer already in lowercase
 *  \param s2 buffer with mixed upper and lowercase
 */
static inline int SCMemcmpLowercase(const void *s1, const void *s2, size_t len)
{
    uint64_t b1, w1, aligned1;
    uint64_t b2, w2, aligned2;

    if (len == 0)
        return 0;

    /* TODO Check for already aligned cases. To optimize. */

    /* Load the words containing the beginning of each string.
     * These loads don't trigger unaligned events. */
    w1 = __insn_ldna(s1);
    w2 = __insn_ldna(s2);

    /* Can't just read the next 8 bytes because it might go past the end
     * of a page. */
    while (len > 8) {
        /* Here, the buffer extends into the next word by at least one
         * byte, so it is safe to read the next word. Do aligned
         * loads of the next word, then use the two words to create an
         * aligned word from each string. */
        b1 = __insn_ldna(s1 + 8);
        b2 = __insn_ldna(s2 + 8);
        aligned1 = __insn_dblalign(w1, b1, s1);
        aligned2 = vec_tolower(__insn_dblalign(w2, b2, s2));

        if (aligned1 != aligned2)
            return 1;

        /* Move forward one word (8 bytes) */
        w1 = b1;
        w2 = b2;
        len -= 8;
        s1 += 8;
        s2 += 8;
    }

    do {
        if (*(char*)s1 != tolower(*(char*)s2))
            return 1;
        s1++;
        s2++;
        len--;
    } while (len);

    return 0;
}

#else

/* No SIMD support, fall back to plain memcmp and a home grown lowercase one */

/* wrapper around memcmp to match the retvals of the SIMD implementations */
#define SCMemcmp(a,b,c) ({ \
    memcmp((a), (b), (c)) ? 1 : 0; \
})

static inline int SCMemcmpLowercase(const void *s1, const void *s2, size_t len)
{
    return MemcmpLowercase(s1, s2, len);
}

#endif /* SIMD */

#endif /* __UTIL_MEMCMP_H__ */
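/* Usage sketch (illustrative, hypothetical helper): whichever implementation
 * gets compiled in, callers rely only on the 0 = match / 1 = no match
 * contract, and SCMemcmpLowercase expects its first argument to be lowercase
 * already:
 *
 *     static inline int MatchNocase(const uint8_t *pat_lc,
 *                                   const uint8_t *buf, size_t len)
 *     {
 *         // pat_lc must already be lowercase, buf may be mixed case
 *         return SCMemcmpLowercase(pat_lc, buf, len) == 0;
 *     }
 */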