diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h
index cbb85e527eb7f239b7a65fde96c0a126fb62747b..e2697e5b96f048137a81c4a15a3b2d025965aaba 100644
--- a/lib/compress/zstd_compress_internal.h
+++ b/lib/compress/zstd_compress_internal.h
@@ -787,29 +787,66 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match,
 *  Hashes
 ***************************************/
 static const U32 prime3bytes = 506832829U;
+static size_t ZSTD_hash3_opt(U32 u, U32 h) { assert(h <= 32); return (size_t) (((U64)(((u << (32-24)) * prime3bytes) >> (32-h)) << 32) + (U32)((u << (32-24)) * prime3bytes)) ; }
+MEM_STATIC size_t ZSTD_hash3Ptr_opt(const void* ptr, U32 h) { return ZSTD_hash3_opt(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */
+
 static U32 ZSTD_hash3(U32 u, U32 h) { assert(h <= 32); return ((u << (32-24)) * prime3bytes) >> (32-h) ; }
 MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */
 
 static const U32 prime4bytes = 2654435761U;
+static size_t ZSTD_hash4_opt(U32 u, U32 h) { assert(h <= 32); return (size_t)((((U64)((u * prime4bytes) >> (32-h)) << 32)) + (U32)(u * prime4bytes)) ; }
+static size_t ZSTD_hash4Ptr_opt(const void* ptr, U32 h) { return ZSTD_hash4_opt(MEM_readLE32(ptr), h); }
+
 static U32 ZSTD_hash4(U32 u, U32 h) { assert(h <= 32); return (u * prime4bytes) >> (32-h) ; }
 static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h); }
 
 static const U64 prime5bytes = 889523592379ULL;
+static size_t ZSTD_hash5_opt(U64 u, U32 h) { assert(h <= 64); return (size_t)(((((u << (64-40)) * prime5bytes) >> (64-h)) << 32) + (U32)((u << (64-40)) * prime5bytes)) ; }
+static size_t ZSTD_hash5Ptr_opt(const void* p, U32 h) { return ZSTD_hash5_opt(MEM_readLE64(p), h); }
+
 static size_t ZSTD_hash5(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; }
 static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); }
 
 static const U64 prime6bytes = 227718039650203ULL;
+static size_t ZSTD_hash6_opt(U64 u, U32 h) { assert(h <= 64); return (size_t)(((((u << (64-48)) * prime6bytes) >> (64-h)) << 32) + (U32)((u << (64-48)) * prime6bytes)) ; }
+static size_t ZSTD_hash6Ptr_opt(const void* p, U32 h) { return ZSTD_hash6_opt(MEM_readLE64(p), h); }
+
 static size_t ZSTD_hash6(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; }
 static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); }
 
 static const U64 prime7bytes = 58295818150454627ULL;
+static size_t ZSTD_hash7_opt(U64 u, U32 h) { assert(h <= 64); return (size_t)(((((u << (64-56)) * prime7bytes) >> (64-h)) << 32) + (U32)((u << (64-56)) * prime7bytes)) ; }
+static size_t ZSTD_hash7Ptr_opt(const void* p, U32 h) { return ZSTD_hash7_opt(MEM_readLE64(p), h); }
+
 static size_t ZSTD_hash7(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; }
 static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); }
 
 static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
+static size_t ZSTD_hash8_opt(U64 u, U32 h) { assert(h <= 64); return (size_t)(((((u) * prime8bytes) >> (64-h)) << 32) + (U32)((u) * prime8bytes)) ; }
+static size_t ZSTD_hash8Ptr_opt(const void* p, U32 h) { return ZSTD_hash8_opt(MEM_readLE64(p), h); }
+
 static size_t ZSTD_hash8(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u) * prime8bytes) >> (64-h)) ; }
 static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }
 
+MEM_STATIC FORCE_INLINE_ATTR
+size_t ZSTD_hashPtr_opt(const void* p, U32 hBits, U32 mls)
+{
+    /* Although some of these hashes do support hBits up to 64, some do not.
+     * To be on the safe side, always avoid hBits > 32. */
+    assert(hBits <= 32);
+
+    switch(mls)
+    {
+    default:
+    case 3: return ZSTD_hash3Ptr_opt(p, hBits);
+    case 4: return ZSTD_hash4Ptr_opt(p, hBits);
+    case 5: return ZSTD_hash5Ptr_opt(p, hBits);
+    case 6: return ZSTD_hash6Ptr_opt(p, hBits);
+    case 7: return ZSTD_hash7Ptr_opt(p, hBits);
+    case 8: return ZSTD_hash8Ptr_opt(p, hBits);
+    }
+}
+
 MEM_STATIC FORCE_INLINE_ATTR
 size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
 {
diff --git a/lib/compress/zstd_double_fast.c b/lib/compress/zstd_double_fast.c
index 0ad88ffc7bdad16b26b13637b81f1f78c16fcea6..e5536061bb589a640acff4c48c72b7bb8d823f06 100644
--- a/lib/compress/zstd_double_fast.c
+++ b/lib/compress/zstd_double_fast.c
@@ -161,18 +161,20 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
             goto _cleanup;
         }
 
-        hl0 = ZSTD_hashPtr(ip, hBitsL, 8);
-        idxl0 = hashLong[hl0];
+
+        hl0 = ZSTD_hashPtr_opt(ip, hBitsL, 8);
+        idxl0 = hashLong[hl0 >> 32] - (U32)hl0;
         matchl0 = base + idxl0;
 
         /* Inner Loop: one iteration per search / position */
         do {
-            const size_t hs0 = ZSTD_hashPtr(ip, hBitsS, mls);
-            const U32 idxs0 = hashSmall[hs0];
+            const size_t hs0 = ZSTD_hashPtr_opt(ip, hBitsS, mls);
+            const U32 idxs0 = hashSmall[hs0 >> 32] - (U32)hs0 ;
             curr = (U32)(ip-base);
             matchs0 = base + idxs0;
 
-            hashLong[hl0] = hashSmall[hs0] = curr;   /* update hash tables */
+            hashLong[hl0 >> 32] = curr + (U32)hl0;
+            hashSmall[hs0 >> 32] = curr + (U32)hs0;   /* update hash tables */
 
             /* check noDict repcode */
             if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) {
@@ -182,34 +184,41 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
                 goto _match_stored;
             }
 
-            hl1 = ZSTD_hashPtr(ip1, hBitsL, 8);
-
-            if (idxl0 > prefixLowestIndex) {
-                /* check prefix long match */
-                if (MEM_read64(matchl0) == MEM_read64(ip)) {
-                    mLength = ZSTD_count(ip+8, matchl0+8, iend) + 8;
-                    offset = (U32)(ip-matchl0);
-                    while (((ip>anchor) & (matchl0>prefixLowest)) && (ip[-1] == matchl0[-1])) { ip--; matchl0--; mLength++; } /* catch up */
-                    goto _match_found;
+
+            hl1 = ZSTD_hashPtr_opt(ip1, hBitsL, 8);
+
+            if (prefixLowest <= matchl0 && matchl0 < ilimit) {
+                if (idxl0 > prefixLowestIndex) {
+                    /* check prefix long match */
+                    if (MEM_read64(matchl0) == MEM_read64(ip)) {
+                        mLength = ZSTD_count(ip+8, matchl0+8, iend) + 8;
+                        offset = (U32)(ip-matchl0);
+                        while (((ip>anchor) & (matchl0>prefixLowest)) && (ip[-1] == matchl0[-1])) { ip--; matchl0--; mLength++; } /* catch up */
+                        goto _match_found;
+                    }
                 }
             }
 
-            idxl1 = hashLong[hl1];
+            idxl1 = hashLong[hl1 >> 32] - (U32)hl1;
             matchl1 = base + idxl1;
 
-            if (idxs0 > prefixLowestIndex) {
-                /* check prefix short match */
-                if (MEM_read32(matchs0) == MEM_read32(ip)) {
-                    goto _search_next_long;
+            if (prefixLowest <= matchs0 && matchs0 < ilimit) {
+                if (idxs0 > prefixLowestIndex && ip > matchs0) {
+                    /* check prefix short match */
+                    if (MEM_read32(matchs0) == MEM_read32(ip)) {
+                        goto _search_next_long;
+                    }
                 }
             }
-
+
+
             if (ip1 >= nextStep) {
                 PREFETCH_L1(ip1 + 64);
                 PREFETCH_L1(ip1 + 128);
                 step++;
                 nextStep += kStepIncr;
             }
+
             ip = ip1;
             ip1 += step;
@@ -229,23 +238,26 @@ _cleanup:
     /* save reps for next block */
     rep[0] = offset_1 ? offset_1 : offsetSaved1;
     rep[1] = offset_2 ? offset_2 : offsetSaved2;
+
     /* Return the last literals size */
     return (size_t)(iend - anchor);
 
 _search_next_long:
-    /* check prefix long +1 match */
-    if (idxl1 > prefixLowestIndex) {
-        if (MEM_read64(matchl1) == MEM_read64(ip1)) {
-            ip = ip1;
-            mLength = ZSTD_count(ip+8, matchl1+8, iend) + 8;
-            offset = (U32)(ip-matchl1);
-            while (((ip>anchor) & (matchl1>prefixLowest)) && (ip[-1] == matchl1[-1])) { ip--; matchl1--; mLength++; } /* catch up */
-            goto _match_found;
+
+    if (prefixLowest <= (matchl1) && (matchl1) < ilimit){
+        if (idxl1 > prefixLowestIndex) {
+            if (MEM_read64(matchl1) == MEM_read64(ip1)) {
+                ip = ip1;
+                mLength = ZSTD_count(ip+8, matchl1+8, iend) + 8;
+                offset = (U32)(ip-matchl1);
+                while (((ip>anchor) & (matchl1>prefixLowest)) && (ip[-1] == matchl1[-1])) { ip--; matchl1--; mLength++; } /* catch up */
+                goto _match_found;
+            }
         }
     }
-
+
     /* if no long +1 match, explore the short match we found */
     mLength = ZSTD_count(ip+4, matchs0+4, iend) + 4;
     offset = (U32)(ip - matchs0);
@@ -265,7 +277,7 @@ _match_found: /* requires ip, offset, mLength */
      * more predictable test. The minmatch even if we take a short match is
      * 4 bytes, so as long as step, the distance between ip and ip1
      * (initially) is less than 4, we know ip1 < new ip. */
-        hashLong[hl1] = (U32)(ip1 - base);
+        hashLong[hl1 >> 32] = (U32)(ip1 - base) + (U32)hl1;
     }
 
     ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
@@ -279,21 +291,21 @@ _match_stored:
         /* Complementary insertion */
         /* done after iLimit test, as candidates could be > iend-8 */
         {   U32 const indexToInsert = curr+2;
-            hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert;
-            hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base);
-            hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert;
-            hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base);
+            hashLong[ZSTD_hashPtr_opt(base+indexToInsert, hBitsL, 8) >> 32] = indexToInsert + (U32)ZSTD_hashPtr_opt(base+indexToInsert, hBitsL, 8) ;
+            hashLong[ZSTD_hashPtr_opt(ip-2, hBitsL, 8) >> 32] = (U32)(ip-2-base) + (U32)ZSTD_hashPtr_opt(ip-2, hBitsL, 8);
+            hashSmall[ZSTD_hashPtr_opt(base+indexToInsert, hBitsS, mls) >> 32] = indexToInsert + (U32)ZSTD_hashPtr_opt(base+indexToInsert, hBitsS, mls);
+            hashSmall[ZSTD_hashPtr_opt(ip-1, hBitsS, mls) >> 32] = (U32)(ip-1-base) + (U32)ZSTD_hashPtr_opt(ip-1, hBitsS, mls);
         }
 
         /* check immediate repcode */
         while ( (ip <= ilimit)
-             && ( (offset_2>0)
+             && ( (offset_2>0)
                 & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) {
             /* store sequence */
             size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
             U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff;   /* swap offset_2 <=> offset_1 */
-            hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base);
-            hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base);
+            hashSmall[ZSTD_hashPtr_opt(ip, hBitsS, mls) >> 32] = (U32)(ip-base) + (U32)ZSTD_hashPtr_opt(ip, hBitsS, mls);
+            hashLong[ZSTD_hashPtr_opt(ip, hBitsL, 8) >> 32] = (U32)(ip-base) + (U32)ZSTD_hashPtr_opt(ip, hBitsL, 8);
             ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength);
             ip += rLength;
             anchor = ip;
diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c
index 5f2c6a2edad5225423fa2699b6b728d9782e2b8d..965879f7eb5077607edc47cf83df0275c25e87a2 100644
--- a/lib/compress/zstd_fast.c
+++ b/lib/compress/zstd_fast.c
@@ -210,10 +210,10 @@ _start: /* Requires: ip0 */
         goto _cleanup;
     }
 
-    hash0 = ZSTD_hashPtr(ip0, hlog, mls);
-    hash1 = ZSTD_hashPtr(ip1, hlog, mls);
+    hash0 = ZSTD_hashPtr_opt(ip0, hlog, mls);
+    hash1 = ZSTD_hashPtr_opt(ip1, hlog, mls);
 
-    idx = hashTable[hash0];
+    idx = hashTable[hash0 >> 32] - (U32)hash0;
 
     do {
         /* load repcode match for ip[2]*/
@@ -221,7 +221,7 @@ _start: /* Requires: ip0 */
 
         /* write back hash table entry */
         current0 = (U32)(ip0 - base);
-        hashTable[hash0] = current0;
+        hashTable[hash0 >> 32] = current0 + (U32)hash0;
 
         /* check repcode at ip[2] */
         if ((MEM_read32(ip2) == rval) & (rep_offset1 > 0)) {
@@ -236,36 +236,36 @@ _start: /* Requires: ip0 */
             /* First write next hash table entry; we've already calculated it.
              * This write is known to be safe because the ip1 is before the
              * repcode (ip2). */
-            hashTable[hash1] = (U32)(ip1 - base);
+            hashTable[hash1 >> 32] = (U32)(ip1 - base) + (U32)hash1;
 
             goto _match;
         }
 
         /* load match for ip[0] */
-        if (idx >= prefixStartIndex) {
+        if (idx >= prefixStartIndex && idx < endIndex) {
             mval = MEM_read32(base + idx);
         } else {
             mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */
         }
 
         /* check match at ip[0] */
-        if (MEM_read32(ip0) == mval) {
+        if (MEM_read32(ip0) == mval && (base + idx) < ip0) {
             /* found a match! */
 
             /* First write next hash table entry; we've already calculated it.
              * This write is known to be safe because the ip1 == ip0 + 1, so
              * we know we will resume searching after ip1 */
-            hashTable[hash1] = (U32)(ip1 - base);
+            hashTable[hash1 >> 32] = (U32)(ip1 - base) + (U32)hash1;
 
             goto _offset;
         }
 
         /* lookup ip[1] */
-        idx = hashTable[hash1];
+        idx = hashTable[hash1 >> 32] - (U32)hash1;
 
         /* hash ip[2] */
         hash0 = hash1;
-        hash1 = ZSTD_hashPtr(ip2, hlog, mls);
+        hash1 = ZSTD_hashPtr_opt(ip2, hlog, mls);
 
         /* advance to next positions */
         ip0 = ip1;
@@ -274,17 +274,17 @@ _start: /* Requires: ip0 */
 
         /* write back hash table entry */
         current0 = (U32)(ip0 - base);
-        hashTable[hash0] = current0;
+        hashTable[hash0 >> 32] = current0 + (U32)hash0;
 
         /* load match for ip[0] */
-        if (idx >= prefixStartIndex) {
+        if (idx >= prefixStartIndex && idx < endIndex) {
             mval = MEM_read32(base + idx);
         } else {
             mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */
         }
 
         /* check match at ip[0] */
-        if (MEM_read32(ip0) == mval) {
+        if (MEM_read32(ip0) == mval && (base + idx) < ip0) {
            /* found a match! */
 
            /* first write next hash table entry; we've already calculated it */
@@ -298,18 +298,18 @@ _start: /* Requires: ip0 */
                 * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely
                 * write this position.
                 */
-                hashTable[hash1] = (U32)(ip1 - base);
+                hashTable[hash1 >> 32] = (U32)(ip1 - base) + (U32)hash1;
            }
 
            goto _offset;
        }
 
        /* lookup ip[1] */
-       idx = hashTable[hash1];
+       idx = hashTable[hash1 >> 32] - (U32)hash1 ;
 
        /* hash ip[2] */
        hash0 = hash1;
-       hash1 = ZSTD_hashPtr(ip2, hlog, mls);
+       hash1 = ZSTD_hashPtr_opt(ip2, hlog, mls);
 
        /* advance to next positions */
        ip0 = ip1;
@@ -357,17 +357,21 @@ _offset: /* Requires: ip0, idx */
 
     /* Compute the offset code. */
     match0 = base + idx;
+    rep_offset2 = rep_offset1;
     rep_offset1 = (U32)(ip0-match0);
     offcode = OFFSET_TO_OFFBASE(rep_offset1);
     mLength = 4;
 
-    /* Count the backwards match length. */
     while (((ip0>anchor) & (match0>prefixStart)) && (ip0[-1] == match0[-1])) { ip0--; match0--; mLength++; }
+
+
+
+
 
 _match: /* Requires: ip0, match0, offcode */
@@ -383,15 +387,15 @@
     if (ip0 <= ilimit) {
         /* Fill Table */
         assert(base+current0+2 > istart);  /* check base overflow */
-        hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2;  /* here because current+2 could be > iend-8 */
-        hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
+        hashTable[ZSTD_hashPtr_opt(base+current0+2, hlog, mls) >> 32] = current0+2+(U32)ZSTD_hashPtr_opt(base+current0+2, hlog, mls);  /* here because current+2 could be > iend-8 */
+        hashTable[ZSTD_hashPtr_opt(ip0-2, hlog, mls) >> 32] = (U32)(ip0-2-base)+(U32)ZSTD_hashPtr_opt(ip0-2, hlog, mls);
 
         if (rep_offset2 > 0) { /* rep_offset2==0 means rep_offset2 is invalidated */
             while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - rep_offset2)) ) {
                 /* store sequence */
                 size_t const rLength = ZSTD_count(ip0+4, ip0+4-rep_offset2, iend) + 4;
                 { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */
-                hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
+                hashTable[ZSTD_hashPtr_opt(ip0, hlog, mls) >> 32] = (U32)(ip0-base) + (U32)ZSTD_hashPtr_opt(ip0, hlog, mls);
                 ip0 += rLength;
                 ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength);
                 anchor = ip0;
@@ -875,6 +879,7 @@ _cleanup:
 _offset: /* Requires: ip0, idx, idxBase */
 
     /* Compute the offset code. */
+
     {   U32 const offset = current0 - idx;
         const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart;
         matchEnd = idx < prefixStartIndex ? dictEnd : iend;
diff --git a/lib/decompress/huf_decompress.c b/lib/decompress/huf_decompress.c
index c2d1f633a49d0a94e27d656dbcf46fa13dd79d52..53ca59aa04781b1be461850398f7b914993ece81 100644
--- a/lib/decompress/huf_decompress.c
+++ b/lib/decompress/huf_decompress.c
@@ -11,6 +11,7 @@
  * in the COPYING file in the root directory of this source tree).
  * You may select, at your option, one of the above-listed licenses.
 ****************************************************************** */
+#define OPTIMIZE_LIT_HUF_DECODE 1
 
 /* **************************************************************
 *  Dependencies
@@ -843,9 +844,110 @@ HUF_decompress4X1_usingDTable_internal_fast(
 
 HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
 
+#if OPTIMIZE_LIT_HUF_DECODE
+
+// allowed range x = 1~((1<<64)-1)
+inline static int8_t trailbit_u64 (uint64_t val) {
+    return (int8_t)__builtin_ctzll(val);
+}
+
+// allowed range x = 1~511
+inline static int8_t highbit_u9 (uint16_t x) {
+    return 31 - __builtin_clz((uint32_t)x);
+}
+
+inline void MEM_COPY16B (uint8_t *p_dst, const uint8_t *p_src) {
+    vst1q_u8(p_dst, vld1q_u8(p_src));   // 16-byte copy via ARM NEON intrinsics (requires <arm_neon.h>)
+}
+
+inline static void HUF_decompress4X1_usingDTable_ver2 (U8 *dst, size_t dst_size, const U8 *src, size_t src_size, const HUF_DTable* huf_dtable) {
+    U8 backup [16];
+    MEM_COPY16B(backup, (dst+dst_size));   // save the 16 bytes starting at (dst+dst_size), so the wildcopy below cannot clobber them
+
+    typedef struct { U8 n_bit; U8 symb; } HUF_item;
+
+    const HUF_item * dtable = (const HUF_item*)(huf_dtable + 1);
+    const U8 table_sft = (64 - HUF_getDTableDesc(huf_dtable).tableLog) & 0x3F;
+
+    #define HUF4X1_RELD(k)   { int8_t c=trailbit_u64(d[k]); ip[k]-=(c>>3); d[k]=(1|(*(U64*)ip[k])); d[k]<<=(c&7); }
+    #define HUF4X1_DECJ(k,j) { HUF_item item=dtable[(d[k]>>table_sft)]; d[k]<<=item.n_bit; op[k][j]=item.symb; }
+    #define HUF4X1_DECK(k)   { HUF_item item=dtable[(d[k]>>table_sft)]; d[k]<<=item.n_bit; *(op[k]++)=item.symb; }
+
+    const U32 u = ((dst_size+3) >> 2);
+    const U32 y = u / 5;
+    const U32 z = u - y*5;
+    U8* op1 = (U8*)dst;
+    U8* op2 = op1 + u;
+    U8* op3 = op2 + u;
+    U8* op4 = op3 + u;
+    const U8 *ip1 = ((U8*)src) + 6 + ((const U16*)src)[0] - 8;
+    const U8 *ip2 = ip1 + ((const U16*)src)[1];
+    const U8 *ip3 = ip2 + ((const U16*)src)[2];
+    const U8 *ip4 = ((U8*)src) + src_size - 8;
+
+    const U8 * ip[] = {ip1, ip2, ip3, ip4};
+    U8 * op[] = {op1, op2, op3, op4};
+    U64 d[4];
+
+    for (int k=0; k<4; k++) {
+        d[k] = (1|(*(U64*)ip[k]));
+        d[k] <<= (8 - highbit_u9(ip[k][7]));
+    }
+
+    for (U32 i=0; i
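
Note on the compression-side changes above: the new *_opt hash helpers return a 64-bit value whose upper 32 bits are the hash-table bucket index and whose lower 32 bits are a tag (the low half of the same multiplicative hash). Entries are stored as table[h >> 32] = position + (U32)h and read back as position = table[h >> 32] - (U32)h, so a slot last written for a different key decodes to a scrambled position rather than a plausible one; this is why the patch also adds range checks such as idx < endIndex, base + idx < ip0, and prefixLowest <= match && match < ilimit before a candidate is dereferenced. The following standalone sketch only illustrates that packing scheme under stated assumptions; it is not taken from the patch, and TABLE_LOG, hash8_packed, store_pos and load_pos are invented names.

/* toy_packed_hash.c - standalone illustration, NOT part of the patch above. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TABLE_LOG  10
#define TABLE_SIZE (1u << TABLE_LOG)

static const uint64_t prime8bytes = 0xCF1BBCDCB7A56463ULL;

/* Mirrors the shape of ZSTD_hash8_opt: upper 32 bits = bucket index,
 * lower 32 bits = tag (low half of the same multiplicative hash). */
static uint64_t hash8_packed(uint64_t bytes, uint32_t hBits)
{
    uint64_t const prod = bytes * prime8bytes;
    return ((prod >> (64 - hBits)) << 32) + (uint32_t)prod;
}

/* Store the position offset by the tag; a load only recovers the position
 * when the reader computed the same tag, i.e. hashed the same input bytes. */
static void store_pos(uint32_t* table, uint64_t h, uint32_t pos)
{
    table[h >> 32] = pos + (uint32_t)h;
}

static uint32_t load_pos(const uint32_t* table, uint64_t h)
{
    return table[h >> 32] - (uint32_t)h;
}

int main(void)
{
    uint32_t table[TABLE_SIZE];
    memset(table, 0, sizeof(table));

    uint64_t const seen = 0x0123456789ABCDEFULL;        /* 8 input bytes seen at position 42 */
    uint64_t const h    = hash8_packed(seen, TABLE_LOG);

    store_pos(table, h, 42);
    printf("same key      -> %u\n", (unsigned)load_pos(table, h));   /* prints 42 */

    /* A different key landing in the same bucket carries a different tag,
     * so the decoded position is scrambled; callers must keep validating
     * it against the window bounds, as the patch does. */
    uint64_t const other = hash8_packed(seen ^ 0xFFULL, TABLE_LOG);
    if ((other >> 32) == (h >> 32))
        printf("colliding key -> %u (needs range check)\n", (unsigned)load_pos(table, other));
    return 0;
}

Because the tag is folded into the stored value instead of kept in a second array, the table keeps its original size and a lookup still touches a single entry; the cost is that a decoded index is only meaningful after it has been range-checked.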