summaryrefslogtreecommitdiffstats
path: root/kernel/lib/xz/xz_dec_bcj.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/lib/xz/xz_dec_bcj.c')
-rw-r--r--kernel/lib/xz/xz_dec_bcj.c574
1 files changed, 574 insertions, 0 deletions
diff --git a/kernel/lib/xz/xz_dec_bcj.c b/kernel/lib/xz/xz_dec_bcj.c
new file mode 100644
index 000000000..a768e6d28
--- /dev/null
+++ b/kernel/lib/xz/xz_dec_bcj.c
@@ -0,0 +1,574 @@
+/*
+ * Branch/Call/Jump (BCJ) filter decoders
+ *
+ * Authors: Lasse Collin <lasse.collin@tukaani.org>
+ * Igor Pavlov <http://7-zip.org/>
+ *
+ * This file has been put into the public domain.
+ * You can do whatever you want with this file.
+ */
+
+#include "xz_private.h"
+
+/*
+ * The rest of the file is inside this ifdef. It makes things a little more
+ * convenient when building without support for any BCJ filters.
+ */
+#ifdef XZ_DEC_BCJ
+
+struct xz_dec_bcj {
+ /* Type of the BCJ filter being used */
+ enum {
+ BCJ_X86 = 4, /* x86 or x86-64 */
+ BCJ_POWERPC = 5, /* Big endian only */
+ BCJ_IA64 = 6, /* Big or little endian */
+ BCJ_ARM = 7, /* Little endian only */
+ BCJ_ARMTHUMB = 8, /* Little endian only */
+ BCJ_SPARC = 9 /* Big or little endian */
+ } type;
+
+ /*
+ * Return value of the next filter in the chain. We need to preserve
+ * this information across calls, because we must not call the next
+ * filter anymore once it has returned XZ_STREAM_END.
+ */
+ enum xz_ret ret;
+
+ /* True if we are operating in single-call mode. */
+ bool single_call;
+
+ /*
+ * Absolute position relative to the beginning of the uncompressed
+ * data (in a single .xz Block). We care only about the lowest 32
+ * bits so this doesn't need to be uint64_t even with big files.
+ */
+ uint32_t pos;
+
+ /* x86 filter state */
+ uint32_t x86_prev_mask;
+
+ /* Temporary space to hold the variables from struct xz_buf */
+ uint8_t *out;
+ size_t out_pos;
+ size_t out_size;
+
+ struct {
+ /* Amount of already filtered data in the beginning of buf */
+ size_t filtered;
+
+ /* Total amount of data currently stored in buf */
+ size_t size;
+
+ /*
+ * Buffer to hold a mix of filtered and unfiltered data. This
+ * needs to be big enough to hold Alignment + 2 * Look-ahead:
+ *
+ * Type Alignment Look-ahead
+ * x86 1 4
+ * PowerPC 4 0
+ * IA-64 16 0
+ * ARM 4 0
+ * ARM-Thumb 2 2
+ * SPARC 4 0
+ */
+ uint8_t buf[16];
+ } temp;
+};
+
+#ifdef XZ_DEC_X86
+/*
+ * This is used to test the most significant byte of a memory address
+ * in an x86 instruction.
+ */
+static inline int bcj_x86_test_msbyte(uint8_t b)
+{
+ return b == 0x00 || b == 0xFF;
+}
+
+static size_t bcj_x86(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
+{
+ static const bool mask_to_allowed_status[8]
+ = { true, true, true, false, true, false, false, false };
+
+ static const uint8_t mask_to_bit_num[8] = { 0, 1, 2, 2, 3, 3, 3, 3 };
+
+ size_t i;
+ size_t prev_pos = (size_t)-1;
+ uint32_t prev_mask = s->x86_prev_mask;
+ uint32_t src;
+ uint32_t dest;
+ uint32_t j;
+ uint8_t b;
+
+ if (size <= 4)
+ return 0;
+
+ size -= 4;
+ for (i = 0; i < size; ++i) {
+ if ((buf[i] & 0xFE) != 0xE8)
+ continue;
+
+ prev_pos = i - prev_pos;
+ if (prev_pos > 3) {
+ prev_mask = 0;
+ } else {
+ prev_mask = (prev_mask << (prev_pos - 1)) & 7;
+ if (prev_mask != 0) {
+ b = buf[i + 4 - mask_to_bit_num[prev_mask]];
+ if (!mask_to_allowed_status[prev_mask]
+ || bcj_x86_test_msbyte(b)) {
+ prev_pos = i;
+ prev_mask = (prev_mask << 1) | 1;
+ continue;
+ }
+ }
+ }
+
+ prev_pos = i;
+
+ if (bcj_x86_test_msbyte(buf[i + 4])) {
+ src = get_unaligned_le32(buf + i + 1);
+ while (true) {
+ dest = src - (s->pos + (uint32_t)i + 5);
+ if (prev_mask == 0)
+ break;
+
+ j = mask_to_bit_num[prev_mask] * 8;
+ b = (uint8_t)(dest >> (24 - j));
+ if (!bcj_x86_test_msbyte(b))
+ break;
+
+ src = dest ^ (((uint32_t)1 << (32 - j)) - 1);
+ }
+
+ dest &= 0x01FFFFFF;
+ dest |= (uint32_t)0 - (dest & 0x01000000);
+ put_unaligned_le32(dest, buf + i + 1);
+ i += 4;
+ } else {
+ prev_mask = (prev_mask << 1) | 1;
+ }
+ }
+
+ prev_pos = i - prev_pos;
+ s->x86_prev_mask = prev_pos > 3 ? 0 : prev_mask << (prev_pos - 1);
+ return i;
+}
+#endif
+
+#ifdef XZ_DEC_POWERPC
+static size_t bcj_powerpc(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
+{
+ size_t i;
+ uint32_t instr;
+
+ for (i = 0; i + 4 <= size; i += 4) {
+ instr = get_unaligned_be32(buf + i);
+ if ((instr & 0xFC000003) == 0x48000001) {
+ instr &= 0x03FFFFFC;
+ instr -= s->pos + (uint32_t)i;
+ instr &= 0x03FFFFFC;
+ instr |= 0x48000001;
+ put_unaligned_be32(instr, buf + i);
+ }
+ }
+
+ return i;
+}
+#endif
+
+#ifdef XZ_DEC_IA64
+static size_t bcj_ia64(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
+{
+ static const uint8_t branch_table[32] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 4, 4, 6, 6, 0, 0, 7, 7,
+ 4, 4, 0, 0, 4, 4, 0, 0
+ };
+
+ /*
+ * The local variables take a little bit stack space, but it's less
+ * than what LZMA2 decoder takes, so it doesn't make sense to reduce
+ * stack usage here without doing that for the LZMA2 decoder too.
+ */
+
+ /* Loop counters */
+ size_t i;
+ size_t j;
+
+ /* Instruction slot (0, 1, or 2) in the 128-bit instruction word */
+ uint32_t slot;
+
+ /* Bitwise offset of the instruction indicated by slot */
+ uint32_t bit_pos;
+
+ /* bit_pos split into byte and bit parts */
+ uint32_t byte_pos;
+ uint32_t bit_res;
+
+ /* Address part of an instruction */
+ uint32_t addr;
+
+ /* Mask used to detect which instructions to convert */
+ uint32_t mask;
+
+ /* 41-bit instruction stored somewhere in the lowest 48 bits */
+ uint64_t instr;
+
+ /* Instruction normalized with bit_res for easier manipulation */
+ uint64_t norm;
+
+ for (i = 0; i + 16 <= size; i += 16) {
+ mask = branch_table[buf[i] & 0x1F];
+ for (slot = 0, bit_pos = 5; slot < 3; ++slot, bit_pos += 41) {
+ if (((mask >> slot) & 1) == 0)
+ continue;
+
+ byte_pos = bit_pos >> 3;
+ bit_res = bit_pos & 7;
+ instr = 0;
+ for (j = 0; j < 6; ++j)
+ instr |= (uint64_t)(buf[i + j + byte_pos])
+ << (8 * j);
+
+ norm = instr >> bit_res;
+
+ if (((norm >> 37) & 0x0F) == 0x05
+ && ((norm >> 9) & 0x07) == 0) {
+ addr = (norm >> 13) & 0x0FFFFF;
+ addr |= ((uint32_t)(norm >> 36) & 1) << 20;
+ addr <<= 4;
+ addr -= s->pos + (uint32_t)i;
+ addr >>= 4;
+
+ norm &= ~((uint64_t)0x8FFFFF << 13);
+ norm |= (uint64_t)(addr & 0x0FFFFF) << 13;
+ norm |= (uint64_t)(addr & 0x100000)
+ << (36 - 20);
+
+ instr &= (1 << bit_res) - 1;
+ instr |= norm << bit_res;
+
+ for (j = 0; j < 6; j++)
+ buf[i + j + byte_pos]
+ = (uint8_t)(instr >> (8 * j));
+ }
+ }
+ }
+
+ return i;
+}
+#endif
+
+#ifdef XZ_DEC_ARM
+static size_t bcj_arm(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
+{
+ size_t i;
+ uint32_t addr;
+
+ for (i = 0; i + 4 <= size; i += 4) {
+ if (buf[i + 3] == 0xEB) {
+ addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8)
+ | ((uint32_t)buf[i + 2] << 16);
+ addr <<= 2;
+ addr -= s->pos + (uint32_t)i + 8;
+ addr >>= 2;
+ buf[i] = (uint8_t)addr;
+ buf[i + 1] = (uint8_t)(addr >> 8);
+ buf[i + 2] = (uint8_t)(addr >> 16);
+ }
+ }
+
+ return i;
+}
+#endif
+
+#ifdef XZ_DEC_ARMTHUMB
+static size_t bcj_armthumb(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
+{
+ size_t i;
+ uint32_t addr;
+
+ for (i = 0; i + 4 <= size; i += 2) {
+ if ((buf[i + 1] & 0xF8) == 0xF0
+ && (buf[i + 3] & 0xF8) == 0xF8) {
+ addr = (((uint32_t)buf[i + 1] & 0x07) << 19)
+ | ((uint32_t)buf[i] << 11)
+ | (((uint32_t)buf[i + 3] & 0x07) << 8)
+ | (uint32_t)buf[i + 2];
+ addr <<= 1;
+ addr -= s->pos + (uint32_t)i + 4;
+ addr >>= 1;
+ buf[i + 1] = (uint8_t)(0xF0 | ((addr >> 19) & 0x07));
+ buf[i] = (uint8_t)(addr >> 11);
+ buf[i + 3] = (uint8_t)(0xF8 | ((addr >> 8) & 0x07));
+ buf[i + 2] = (uint8_t)addr;
+ i += 2;
+ }
+ }
+
+ return i;
+}
+#endif
+
+#ifdef XZ_DEC_SPARC
+static size_t bcj_sparc(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
+{
+ size_t i;
+ uint32_t instr;
+
+ for (i = 0; i + 4 <= size; i += 4) {
+ instr = get_unaligned_be32(buf + i);
+ if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) {
+ instr <<= 2;
+ instr -= s->pos + (uint32_t)i;
+ instr >>= 2;
+ instr = ((uint32_t)0x40000000 - (instr & 0x400000))
+ | 0x40000000 | (instr & 0x3FFFFF);
+ put_unaligned_be32(instr, buf + i);
+ }
+ }
+
+ return i;
+}
+#endif
+
+/*
+ * Apply the selected BCJ filter. Update *pos and s->pos to match the amount
+ * of data that got filtered.
+ *
+ * NOTE: This is implemented as a switch statement to avoid using function
+ * pointers, which could be problematic in the kernel boot code, which must
+ * avoid pointers to static data (at least on x86).
+ */
+static void bcj_apply(struct xz_dec_bcj *s,
+ uint8_t *buf, size_t *pos, size_t size)
+{
+ size_t filtered;
+
+ buf += *pos;
+ size -= *pos;
+
+ switch (s->type) {
+#ifdef XZ_DEC_X86
+ case BCJ_X86:
+ filtered = bcj_x86(s, buf, size);
+ break;
+#endif
+#ifdef XZ_DEC_POWERPC
+ case BCJ_POWERPC:
+ filtered = bcj_powerpc(s, buf, size);
+ break;
+#endif
+#ifdef XZ_DEC_IA64
+ case BCJ_IA64:
+ filtered = bcj_ia64(s, buf, size);
+ break;
+#endif
+#ifdef XZ_DEC_ARM
+ case BCJ_ARM:
+ filtered = bcj_arm(s, buf, size);
+ break;
+#endif
+#ifdef XZ_DEC_ARMTHUMB
+ case BCJ_ARMTHUMB:
+ filtered = bcj_armthumb(s, buf, size);
+ break;
+#endif
+#ifdef XZ_DEC_SPARC
+ case BCJ_SPARC:
+ filtered = bcj_sparc(s, buf, size);
+ break;
+#endif
+ default:
+ /* Never reached but silence compiler warnings. */
+ filtered = 0;
+ break;
+ }
+
+ *pos += filtered;
+ s->pos += filtered;
+}
+
+/*
+ * Flush pending filtered data from temp to the output buffer.
+ * Move the remaining mixture of possibly filtered and unfiltered
+ * data to the beginning of temp.
+ */
+static void bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b)
+{
+ size_t copy_size;
+
+ copy_size = min_t(size_t, s->temp.filtered, b->out_size - b->out_pos);
+ memcpy(b->out + b->out_pos, s->temp.buf, copy_size);
+ b->out_pos += copy_size;
+
+ s->temp.filtered -= copy_size;
+ s->temp.size -= copy_size;
+ memmove(s->temp.buf, s->temp.buf + copy_size, s->temp.size);
+}
+
+/*
+ * The BCJ filter functions are primitive in sense that they process the
+ * data in chunks of 1-16 bytes. To hide this issue, this function does
+ * some buffering.
+ */
+XZ_EXTERN enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s,
+ struct xz_dec_lzma2 *lzma2,
+ struct xz_buf *b)
+{
+ size_t out_start;
+
+ /*
+ * Flush pending already filtered data to the output buffer. Return
+ * immediatelly if we couldn't flush everything, or if the next
+ * filter in the chain had already returned XZ_STREAM_END.
+ */
+ if (s->temp.filtered > 0) {
+ bcj_flush(s, b);
+ if (s->temp.filtered > 0)
+ return XZ_OK;
+
+ if (s->ret == XZ_STREAM_END)
+ return XZ_STREAM_END;
+ }
+
+ /*
+ * If we have more output space than what is currently pending in
+ * temp, copy the unfiltered data from temp to the output buffer
+ * and try to fill the output buffer by decoding more data from the
+ * next filter in the chain. Apply the BCJ filter on the new data
+ * in the output buffer. If everything cannot be filtered, copy it
+ * to temp and rewind the output buffer position accordingly.
+ *
+ * This needs to be always run when temp.size == 0 to handle a special
+ * case where the output buffer is full and the next filter has no
+ * more output coming but hasn't returned XZ_STREAM_END yet.
+ */
+ if (s->temp.size < b->out_size - b->out_pos || s->temp.size == 0) {
+ out_start = b->out_pos;
+ memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size);
+ b->out_pos += s->temp.size;
+
+ s->ret = xz_dec_lzma2_run(lzma2, b);
+ if (s->ret != XZ_STREAM_END
+ && (s->ret != XZ_OK || s->single_call))
+ return s->ret;
+
+ bcj_apply(s, b->out, &out_start, b->out_pos);
+
+ /*
+ * As an exception, if the next filter returned XZ_STREAM_END,
+ * we can do that too, since the last few bytes that remain
+ * unfiltered are meant to remain unfiltered.
+ */
+ if (s->ret == XZ_STREAM_END)
+ return XZ_STREAM_END;
+
+ s->temp.size = b->out_pos - out_start;
+ b->out_pos -= s->temp.size;
+ memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size);
+
+ /*
+ * If there wasn't enough input to the next filter to fill
+ * the output buffer with unfiltered data, there's no point
+ * to try decoding more data to temp.
+ */
+ if (b->out_pos + s->temp.size < b->out_size)
+ return XZ_OK;
+ }
+
+ /*
+ * We have unfiltered data in temp. If the output buffer isn't full
+ * yet, try to fill the temp buffer by decoding more data from the
+ * next filter. Apply the BCJ filter on temp. Then we hopefully can
+ * fill the actual output buffer by copying filtered data from temp.
+ * A mix of filtered and unfiltered data may be left in temp; it will
+ * be taken care on the next call to this function.
+ */
+ if (b->out_pos < b->out_size) {
+ /* Make b->out{,_pos,_size} temporarily point to s->temp. */
+ s->out = b->out;
+ s->out_pos = b->out_pos;
+ s->out_size = b->out_size;
+ b->out = s->temp.buf;
+ b->out_pos = s->temp.size;
+ b->out_size = sizeof(s->temp.buf);
+
+ s->ret = xz_dec_lzma2_run(lzma2, b);
+
+ s->temp.size = b->out_pos;
+ b->out = s->out;
+ b->out_pos = s->out_pos;
+ b->out_size = s->out_size;
+
+ if (s->ret != XZ_OK && s->ret != XZ_STREAM_END)
+ return s->ret;
+
+ bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size);
+
+ /*
+ * If the next filter returned XZ_STREAM_END, we mark that
+ * everything is filtered, since the last unfiltered bytes
+ * of the stream are meant to be left as is.
+ */
+ if (s->ret == XZ_STREAM_END)
+ s->temp.filtered = s->temp.size;
+
+ bcj_flush(s, b);
+ if (s->temp.filtered > 0)
+ return XZ_OK;
+ }
+
+ return s->ret;
+}
+
+XZ_EXTERN struct xz_dec_bcj *xz_dec_bcj_create(bool single_call)
+{
+ struct xz_dec_bcj *s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (s != NULL)
+ s->single_call = single_call;
+
+ return s;
+}
+
+XZ_EXTERN enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id)
+{
+ switch (id) {
+#ifdef XZ_DEC_X86
+ case BCJ_X86:
+#endif
+#ifdef XZ_DEC_POWERPC
+ case BCJ_POWERPC:
+#endif
+#ifdef XZ_DEC_IA64
+ case BCJ_IA64:
+#endif
+#ifdef XZ_DEC_ARM
+ case BCJ_ARM:
+#endif
+#ifdef XZ_DEC_ARMTHUMB
+ case BCJ_ARMTHUMB:
+#endif
+#ifdef XZ_DEC_SPARC
+ case BCJ_SPARC:
+#endif
+ break;
+
+ default:
+ /* Unsupported Filter ID */
+ return XZ_OPTIONS_ERROR;
+ }
+
+ s->type = id;
+ s->ret = XZ_OK;
+ s->pos = 0;
+ s->x86_prev_mask = 0;
+ s->temp.filtered = 0;
+ s->temp.size = 0;
+
+ return XZ_OK;
+}
+
+#endif