diff --git a/README.md b/README.md
index 0b829c4..2e1b503 100644
--- a/README.md
+++ b/README.md
@@ -7,14 +7,15 @@ where encoding and/or decoding speed is the primary concern.
 MinLZ is designed to operate *faster than IO* for both compression and decompression
 and be a viable "always on" option even if some content already is compressed.
 
-If slow compression is acceptable, MinLZ can be configured to produce high compression ratio,
-but retain high decompression speed.
+If slow compression is acceptable, MinLZ can be configured to produce a high compression ratio,
+while retaining high decompression speed.
 
 * Best in class compression
 * Block or Streaming interfaces
 * Very fast decompression, even as pure Go
 * AMD64 encoder+decoder assembly
-* Adjustable Compression (3 levels)
+* ARM64 decoder assembly
+* Adjustable Compression (4 levels)
 * Concurrent stream Compression
 * Concurrent stream Decompression
 * Skip forward in compressed stream via independent blocks
@@ -32,6 +33,11 @@ This package implements the MinLZ specification v1.0 in Go.
 For format specification see the included [SPEC.md](SPEC.md).
 
 # Changelog
+* [v1.1.0](https://github.com/minio/minlz/releases/tag/v1.1.0)
+  * Added SuperFast compression mode. See LevelSuperFast below.
+  * Added ARM64 decompression assembly.
+  * Added `-follow` to the mz command to allow reading files while they are being written.
+  * Improved default compression.
* [v1.0.0](https://github.com/minio/minlz/releases/tag/v1.0.0)
   * [Initial Release Blog Post](https://blog.min.io/minlz-compression-algorithm/).
 
@@ -49,8 +55,9 @@ Blocks are mainly useful for small data sizes.
 Streams are a collection of independent blocks, which each have checksums and EOF checks,
 which ensures against corruption and truncation.
 
-3 compression levels are provided:
+4 compression levels are provided:
 
+* Level -1, "SuperFast": Provides the fastest compression, but at a significantly reduced compression ratio.
 * Level 1, "Fastest": Provides the fastest compression with reasonable compression.
 * Level 2, "Balanced": Provides a good balance between compression and speed. ~50% the speed of the fastest level.
 * Level 3, "Smallest": Provides the smallest output possible. Not tuned for speed.
@@ -157,6 +164,24 @@ Setting level 0 will disable compression and write the data as an uncompressed s
 
 The default level is `LevelBalanced`.
 
+Typical speeds: `LevelFastest` is about 2x the speed of `LevelBalanced`,
+and `LevelSmallest` is at least an order of magnitude slower.
+
+### LevelSuperFast
+
+Additionally, `LevelSuperFast` is provided. This compression mode is aimed purely at reducing
+the slowdown when compressing hard-to-compress data - in practice, data with mostly short matches.
+
+The compression ratio can suffer greatly in this mode, but on such content compression can be faster.
+If you have a very high-throughput (> 1GB/core/s), time-sensitive use case,
+this level can be used to ensure compression doesn't slow down too much on this kind of content.
+
+In these cases the speed difference can be *up to* 2x over `LevelFastest`, but it comes with a
+much worse compression ratio.
+More commonly, however, it will be around 15% faster, with a similar drop in the compression ratio.
+
+The difference in decompression speed will typically be similar.
+
 #### Writer Block Size
 
 The `WriterBlockSize` allows to set the maximum size of each block on the stream encoder.
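For a quick feel of how the new level plugs into the block interface, here is a minimal sketch. It assumes the `minlz.Encode`/`minlz.Decode` block-API signatures and the `minlz.LevelSuperFast` constant referenced in the changelog above, so treat it as illustrative rather than authoritative:

```go
package main

import (
	"bytes"
	"fmt"

	"github.com/minio/minlz"
)

func main() {
	src := bytes.Repeat([]byte("hard-to-compress payload, mostly short matches. "), 100)

	// Assumed block API: Encode(dst, src, level) - passing a nil dst
	// lets the library allocate one of sufficient size.
	// LevelSuperFast (level -1) trades compression ratio for speed.
	compressed, err := minlz.Encode(nil, src, minlz.LevelSuperFast)
	if err != nil {
		panic(err)
	}

	decompressed, err := minlz.Decode(nil, compressed)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%d -> %d bytes, roundtrip ok: %v\n",
		len(src), len(compressed), bytes.Equal(src, decompressed))
}
```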
@@ -344,20 +369,22 @@ Click below to see some sample benchmarks compared to Snappy and LZ4: ### Protobuf Sample -| Compressor | Size | Comp MB/s | Decomp MB/s | Reduction % | -|--------------|--------|----------:|-------------|-------------| -| MinLZ 1 | 17,613 | 27,837 | 116,762 | 85.15% | -| MinLZ 1 (Go) | 17,479 | 22,036 | 61,652 | 85.26% | -| MinLZ 2 | 16,345 | 12,797 | 103,100 | 86.22% | -| MinLZ 2 (Go) | 16,345 | 9,732 | 52,964 | 86.22% | -| MinLZ 3 | 14,766 | 210 | 126,385 | 87.55% | -| MinLZ 3 (Go) | 14,766 | | 68,411 | 87.55% | -| Snappy | 23,335 | 24,052 | 61,002 | 80.32% | -| Snappy (Go) | 23,335 | 10,055 | 35,699 | 80.32% | -| LZ4 0 | 18,766 | 12,649 | 137,553 | 84.18% | -| LZ4 0 (Go) | 18,766 | | 64,092 | 84.18% | -| LZ4 9 | 15,844 | 12,649 | 139,801 | 86.64% | -| LZ4 9 (Go) | 15,844 | | 66,904 | 86.64% | +| Compressor | Size | Comp MB/s | Decomp MB/s | Reduction % | +|---------------|---------|----------:|-------------|-------------| +| MinLZ 1 | 17,613 | 27,837 | 116,762 | 85.15% | +| MinLZ 1 (Go) | 17,479 | 22,036 | 61,652 | 85.26% | +| MinLZ 2 | 16,345 | 12,797 | 103,100 | 86.22% | +| MinLZ 2 (Go) | 16,345 | 9,732 | 52,964 | 86.22% | +| MinLZ 3 | 14,766 | 210 | 126,385 | 87.55% | +| MinLZ 3 (Go) | 14,766 | | 68,411 | 87.55% | +| Snappy | 23,335 | 24,052 | 61,002 | 80.32% | +| Snappy (Go) | 23,335 | 10,055 | 35,699 | 80.32% | +| LZ4 0 | 18,766 | 12,649 | 137,553 | 84.18% | +| LZ4 0 (Go) | 18,766 | | 64,092 | 84.18% | +| LZ4 9 | 15,844 | 12,649 | 139,801 | 86.64% | +| LZ4 9 (Go) | 15,844 | | 66,904 | 86.64% | +| MinLZ -1 | 19,889 | ------ | ------- | 83.23% | +| MinLZ -1 (Go) | 19,218 | ------ | ------- | 83.74% | ![Compression vs Size](img/pb-block.png) @@ -369,20 +396,22 @@ Source file: https://github.com/google/snappy/blob/main/testdata/geo.protodata
Click To See Data + Charts (102,400 bytes input) -| Compressor | Size | Comp MB/s | Decomp MB/s | Reduction % | -|--------------|--------|----------:|-------------|-------------| -| MinLZ 1 | 20,184 | 17,558 | 82,292 | 80.29% | -| MinLZ 1 (Go) | 19,849 | 15,035 | 32,327 | 80.62% | -| MinLZ 2 | 17,831 | 9,260 | 58,432 | 82.59% | -| MinLZ 2 (Go) | 17,831 | 7,524 | 25,728 | 82.59% | -| MinLZ 3 | 16,025 | 180 | 80,445 | 84.35% | -| MinLZ 3 (Go) | 16,025 | | 33,382 | 84.35% | -| Snappy | 22,843 | 17,469 | 44,765 | 77.69% | -| Snappy (Go) | 22,843 | 8,161 | 21,082 | 77.69% | -| LZ4 0 | 21,216 | 9,452 | 101,490 | 79.28% | -| LZ4 0 (Go) | 21,216 | | 40,674 | 79.28% | -| LZ4 9 | 17,139 | 1,407 | 95,706 | 83.26% | -| LZ4 9 (Go) | 17,139 | | 39,709 | 83.26% | +| Compressor | Size | Comp MB/s | Decomp MB/s | Reduction % | +|---------------|--------|----------:|-------------|-------------| +| MinLZ 1 | 20,184 | 17,558 | 82,292 | 80.29% | +| MinLZ 1 (Go) | 19,849 | 15,035 | 32,327 | 80.62% | +| MinLZ 2 | 17,831 | 9,260 | 58,432 | 82.59% | +| MinLZ 2 (Go) | 17,831 | 7,524 | 25,728 | 82.59% | +| MinLZ 3 | 16,025 | 180 | 80,445 | 84.35% | +| MinLZ 3 (Go) | 16,025 | | 33,382 | 84.35% | +| Snappy | 22,843 | 17,469 | 44,765 | 77.69% | +| Snappy (Go) | 22,843 | 8,161 | 21,082 | 77.69% | +| LZ4 0 | 21,216 | 9,452 | 101,490 | 79.28% | +| LZ4 0 (Go) | 21,216 | | 40,674 | 79.28% | +| LZ4 9 | 17,139 | 1,407 | 95,706 | 83.26% | +| LZ4 9 (Go) | 17,139 | | 39,709 | 83.26% | +| MinLZ -1 | 23,487 | ------ | ------- | 77.06% | +| MinLZ -1 (Go) | 22,911 | ------ | ------- | 77.63% | ![Compression vs Size](img/html-block.png) @@ -666,6 +695,7 @@ File names beginning with 'http://' and 'https://' will be downloaded and compre Only http response code 200 is accepted. Options: + -0 Perform no compression -1 Compress faster, but with a minor compression loss -2 Default compression speed (default true) -3 Compress more, but a lot slower @@ -695,6 +725,8 @@ Options: Do not overwrite output files -verify Verify files, but do not write output + -xfast + Compress fastest, with a major compression loss Example: @@ -731,10 +763,12 @@ Options: -c Write all output to stdout. Multiple input files will be concatenated -cpu int Maximum number of threads to use (default 32) + -follow + Follow file like tail -f, reopening when EOF is reached -help Display help -limit string - Return at most this much data. Examples: 92, 64K, 256K, 1M, 4M + Return at most this much data. Examples: 92, 64K, 256K, 1M, 4M -o string Write output to another file. Single input file only -offset string diff --git a/_generate/gen.go b/_generate/gen.go index 52d306b..81ebf1c 100644 --- a/_generate/gen.go +++ b/_generate/gen.go @@ -31,8 +31,12 @@ import ( "github.com/mmcloughlin/avo/reg" ) -// insert extra checks here and there. -const debug = false +const ( + // insert extra checks here and there. + debug = false + // matchOffsetCMOV is true if we should use CMOV to check match offsets. + matchOffsetCMOV = true +) func main() { flag.Parse() @@ -47,8 +51,10 @@ func main() { avx2: false, outputMargin: 17, inputMargin: 17, - } - // 16 bits has too big of a speed impact. + skipOutput: false} + + // 16 bit hash table has too big of a speed impact. 
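Aside: the table indices used throughout these generated encoders come from a multiplicative hash. Below is a minimal scalar sketch of the 6-byte variant, for illustration only; `prime6bytes` matches the `$0x0000cf1bbcdcbf9b` immediate visible in the generated assembly further down, while `hash6` itself is not an exported helper of this package:

```go
package main

import "fmt"

// prime6bytes matches the $0x0000cf1bbcdcbf9b constant seen in the
// generated assembly's search loops.
const prime6bytes = 227718039650203

// hash6 hashes the low 6 bytes of u into a tableBits-wide index:
// shift the 6 bytes to the top of the register (<<16), multiply by a
// large prime, and keep the top tableBits bits. This mirrors the
// SHLQ $0x10 / IMULQ / SHRQ $0x31 sequence (49 = 64 - 15 table bits)
// in encodeBlockAsm.
func hash6(u uint64, tableBits uint8) uint32 {
	return uint32(((u << (64 - 6*8)) * prime6bytes) >> (64 - tableBits))
}

func main() {
	fmt.Printf("%#x\n", hash6(0x0000cf1bbcdcbf9b, 15))
}
```

Keeping only the top `tableBits` bits after the multiply is what lets the generator trade hash-table size against speed per maximum block size, as the calls above do.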
+ o.fastOpts = fastOpts{match8: false, fuselits: true, checkRepeats: false, checkBack: true, skipOne: false, incLoop: 4, minSizeLog: 5} o.genEncodeBlockAsm("encodeBlockAsm", 15, 6, 6, 8<<20) o.genEncodeBlockAsm("encodeBlockAsm2MB", 15, 6, 6, 2<<20) o.genEncodeBlockAsm("encodeBlockAsm512K", 14, 6, 6, 512<<10) @@ -57,6 +63,16 @@ func main() { o.genEncodeBlockAsm("encodeBlockAsm4K", 10, 5, 4, 4<<10) o.genEncodeBlockAsm("encodeBlockAsm1K", 9, 4, 4, 1<<10) + o.fastOpts = fastOpts{match8: true, fuselits: false, checkRepeats: true, checkBack: false, skipOne: false, incLoop: 4, minSizeLog: 3} + const fastHashBytes = 8 + o.genEncodeBlockAsm("encodeFastBlockAsm", 14, 5, fastHashBytes, 8<<20) + o.genEncodeBlockAsm("encodeFastBlockAsm2MB", 13, 5, fastHashBytes, 2<<20) + o.genEncodeBlockAsm("encodeFastBlockAsm512K", 13, 5, fastHashBytes, 512<<10) + o.genEncodeBlockAsm("encodeFastBlockAsm64K", 12, 4, fastHashBytes, 64<<10) + o.genEncodeBlockAsm("encodeFastBlockAsm16K", 11, 4, fastHashBytes, 16<<10) + o.genEncodeBlockAsm("encodeFastBlockAsm4K", 10, 4, fastHashBytes, 4<<10) + o.genEncodeBlockAsm("encodeFastBlockAsm1K", 9, 3, fastHashBytes, 1<<10) + o.maxSkip = 100 // Blocks can be long, limit max skipping. o.genEncodeBetterBlockAsm("encodeBetterBlockAsm", 17, 14, 8, 7, 8<<20) o.genEncodeBetterBlockAsm("encodeBetterBlockAsm2MB", 17, 14, 7, 7, 2<<20) @@ -210,6 +226,29 @@ type options struct { inputMargin int maxSkip int ignoreMargins bool + fastOpts +} + +type fastOpts struct { + // Do 8 byte minimum match + match8 bool + + // Do fused literals when emitting + fuselits bool + + // Check for repeats + checkRepeats bool + + // Extend matches backwards + checkBack bool + + // Skip checking s+1 when looking for a match. + skipOne bool + + // Increment loop by this many bytes when match fails. + incLoop int + + minSizeLog int } func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, maxLen int) { @@ -310,6 +349,12 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m JNZ(LabelRef("zero_loop_" + name)) } + match8 := o.match8 + fuselits := o.fuselits + checkRepeats := o.checkRepeats + checkBack := o.checkBack + skipOne := o.skipOne + { // nextEmit is offset n src where the next emitLiteral should start from. MOVL(U32(0), nextEmitL) @@ -329,8 +374,8 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m MOVL(tmp3.As32(), sLimitL) - // dstLimit := (len(src) - outputMargin ) - len(src)>>5 - SHRQ(U8(5), tmp) + // dstLimit := (len(src) - outputMargin ) - len(src)>>minSizeLog + SHRQ(U8(o.minSizeLog), tmp) SUBL(tmp.As32(), tmp2.As32()) // tmp2 = tmp2 - tmp assert(func(ok LabelRef) { @@ -402,7 +447,7 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m MOVL(s, tmp.As32()) // tmp = s SUBL(nextEmitL, tmp.As32()) // tmp = s - nextEmit SHRL(U8(skipLog), tmp.As32()) // tmp = (s - nextEmit) >> skipLog - LEAL(Mem{Base: s, Disp: 4, Index: tmp, Scale: 1}, nextS) + LEAL(Mem{Base: s, Disp: o.incLoop, Index: tmp, Scale: 1}, nextS) } else { panic("maxskip not implemented") } @@ -425,12 +470,20 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m ifok() return } - skip := fmt.Sprintf("offset_ok_%d_%s", ccCounter, name) - ccCounter++ - CMPL(cand.As32(), minPos.As32()) - JLE(LabelRef(skip)) - ifok() - Label(skip) + if matchOffsetCMOV { + // Use CMOV over JLE to avoid a jump. + // Intel seems to favor this. 
+ CMPL(cand.As32(), minPos.As32()) + CMOVLLE(minPos.As32(), cand.As32()) + ifok() + } else { + skip := fmt.Sprintf("offset_ok_%d_%s", ccCounter, name) + ccCounter++ + CMPL(cand.As32(), minPos.As32()) + JLE(LabelRef(skip)) + ifok() + Label(skip) + } } assert(func(ok LabelRef) { // Check if s is valid (we should have jumped above if not) @@ -447,23 +500,31 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m { hash0, hash1 := GP64(), GP64() MOVQ(cv, hash0) - MOVQ(cv, hash1) - SHRQ(U8(8), hash1) hasher.hash(hash0) - hasher.hash(hash1) + if !skipOne { + if hashBytes > 7 { + MOVQ(Mem{Base: src, Index: s, Disp: 1, Scale: 1}, hash1) + } else { + MOVQ(cv, hash1) + SHRQ(U8(8), hash1) + } + hasher.hash(hash1) + assert(func(ok LabelRef) { + CMPQ(hash1, U32(tableSize)) + JB(ok) + }) + } table.LoadIdx(hash0, candidate) assert(func(ok LabelRef) { CMPQ(hash0, U32(tableSize)) JB(ok) }) - assert(func(ok LabelRef) { - CMPQ(hash1, U32(tableSize)) - JB(ok) - }) - table.LoadIdx(hash1, candidate2) table.SaveIdx(s, hash0) - table.SaveIdx(s, hash1) + if !skipOne { + table.LoadIdx(hash1, candidate2) + table.SaveIdx(s, hash1) + } } // Can be moved up if registers are available. @@ -471,8 +532,12 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m { // hash2 := hash6(cv>>16, tableBits) // hasher = hash6(tableBits) - MOVQ(cv, hash2) - SHRQ(U8(16), hash2) + if hashBytes > 6 { + MOVQ(Mem{Base: src, Index: s, Disp: 2, Scale: 1}, hash2) + } else { + MOVQ(cv, hash2) + SHRQ(U8(16), hash2) + } hasher.hash(hash2) assert(func(ok LabelRef) { CMPQ(hash2, U32(tableSize)) @@ -481,7 +546,7 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m } // En/disable repeat matching. - if true { + if checkRepeats { // Check repeat at offset checkRep const checkRep = 1 { @@ -508,7 +573,7 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m MOVL(nextEmitL, nextEmit) // Extend back - if true { + if checkBack { i := GP32() MOVL(base, i) SUBL(repeatL, i) @@ -571,6 +636,7 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m length := o.matchLen("repeat_extend_"+name, forwardStart, backStart, srcLeft, nil, LabelRef("repeat_extend_forward_end_"+name)) forwardStart, backStart, srcLeft = nil, nil, nil + PCALIGN(16) Label("repeat_extend_forward_end_" + name) // s+= length ADDL(length.As32(), s) @@ -613,26 +679,38 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m CMPL(s, cand) JA(ok) }) - assertCand(candidate2, func(cand reg.Register, ok LabelRef) { - tmp := GP64() - MOVQ(lenSrcQ, tmp) - CMPL(tmp.As32(), cand) - JA(ok) - }) - assertCand(candidate2, func(cand reg.Register, ok LabelRef) { - CMPL(s, cand) - // Candidate2 is at s+1, so s is ok. - JAE(ok) - }) + if !skipOne { + assertCand(candidate2, func(cand reg.Register, ok LabelRef) { + tmp := GP64() + MOVQ(lenSrcQ, tmp) + CMPL(tmp.As32(), cand) + JA(ok) + }) + assertCand(candidate2, func(cand reg.Register, ok LabelRef) { + CMPL(s, cand) + // Candidate2 is at s+1, so s is ok. 
+ JAE(ok) + }) + } checkCandidate(candidate, func() { - CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32()) // <<-- Hot + if match8 { + CMPQ(Mem{Base: src, Index: candidate, Scale: 1}, cv.As64()) // <<-- Hot + } else { + CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32()) // <<-- Hot + } JEQ(LabelRef("candidate_match_" + name)) }) tmp := GP32() // cv >>= 8 - SHRQ(U8(8), cv) + if !skipOne { + if hashBytes > 7 { + MOVQ(Mem{Base: src, Index: s, Disp: 1, Scale: 1}, cv) + } else { + SHRQ(U8(8), cv) + } + } // candidate = int(table[hash2]) - load early. table.LoadIdx(hash2, candidate) @@ -653,20 +731,38 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m LEAL(Mem{Base: s, Disp: 2}, tmp) //if uint32(cv>>8) == load32(src, candidate2) - checkCandidate(candidate2, func() { - CMPL(Mem{Base: src, Index: candidate2, Scale: 1}, cv.As32()) - JEQ(LabelRef("candidate2_match_" + name)) - }) + if !skipOne { + checkCandidate(candidate2, func() { + if match8 { + CMPQ(Mem{Base: src, Index: candidate2, Scale: 1}, cv.As64()) + } else { + CMPL(Mem{Base: src, Index: candidate2, Scale: 1}, cv.As32()) + } + JEQ(LabelRef("candidate2_match_" + name)) + }) + } // table[hash2] = uint32(s + 2) table.SaveIdx(tmp, hash2) // cv >>= 8 (>> 16 total) - SHRQ(U8(8), cv) + if hashBytes > 6 { + MOVQ(Mem{Base: src, Index: s, Disp: 2, Scale: 1}, cv) + } else { + if skipOne { + SHRQ(U8(16), cv) + } else { + SHRQ(U8(8), cv) + } + } // if uint32(cv>>16) == load32(src, candidate) checkCandidate(candidate, func() { - CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32()) + if match8 { + CMPQ(Mem{Base: src, Index: candidate, Scale: 1}, cv.As64()) + } else { + CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32()) + } JEQ(LabelRef("candidate3_match_" + name)) }) @@ -693,7 +789,7 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m Label("candidate_match_" + name) // We have a match at 's' with src offset in "candidate" that matches at least 4 bytes. // Extend backwards - if true { + if checkBack { ne := GP32() MOVL(nextEmitL, ne) TESTL(candidate, candidate) @@ -734,9 +830,15 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m MOVL(repeatVal, repeatL) } // s+=4, candidate+=4 - ADDL(U8(4), s) - ADDL(U8(4), candidate) - // Extend the 4-byte match as long as possible and emit copy. + if match8 { + ADDL(U8(8), s) + ADDL(U8(8), candidate) + } else { + ADDL(U8(4), s) + ADDL(U8(4), candidate) + } + + // Extend the 4/8-byte match as long as possible and emit copy. { assert(func(ok LabelRef) { // s must be > candidate cannot be equal. @@ -761,6 +863,7 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m nil, LabelRef("match_nolit_end_"+name), ) + PCALIGN(16) Label("match_nolit_end_" + name) assert(func(ok LabelRef) { // Should never exceed max block size... @@ -771,13 +874,16 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m // s += length (length is destroyed, use it now) ADDL(length.As32(), s) - ADDL(U8(4), length.As32()) // length += 4 + if match8 { + ADDL(U8(8), length.As32()) // length += 8 + } else { + ADDL(U8(4), length.As32()) // length += 4 + } // Load offset from repeat value. 
 	offset := GP64()
 	MOVL(repeatL, offset.As32())
 
 	// Emit lits
-	const fuselits = true
 	{
 		litLen, nextEmit := GP64(), GP64()
 		MOVL(nextEmitL, nextEmit.As32())
@@ -829,6 +935,7 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m
 			o.emitLiteral("match_emit_"+name, litLen, nil, dst, litSrc, LabelRef("match_nolits_copy_"+name), true)
 		}
 
+		PCALIGN(16)
 		Label("match_nolits_copy_" + name)
 		o.emitCopy("match_nolit_"+name, length, offset, nil, dst, LabelRef("match_nolit_emitcopy_end_"+name))
 		Label("match_nolit_emitcopy_end_" + name)
@@ -862,7 +969,11 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m
 			hasher := hashN(o, hashBytes, tableBits)
 			hash0, hash1 := GP64(), GP64()
 			MOVQ(cv, hash0) // src[s-2]
-			SHRQ(U8(16), cv)
+			if hashBytes > 6 {
+				MOVQ(Mem{Base: src, Index: s, Disp: 0, Scale: 1}, cv)
+			} else {
+				SHRQ(U8(16), cv)
+			}
 			MOVQ(cv, hash1) // src[s]
 			hasher.hash(hash0)
 			hasher.hash(hash1)
@@ -893,7 +1004,11 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m
 			JMP(LabelRef("search_loop_" + name))
 			Label("match_nolit_len_ok" + name)
 		}
-		CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
+		if match8 {
+			CMPQ(Mem{Base: src, Index: candidate, Scale: 1}, cv.As64())
+		} else {
+			CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
+		}
 		JNE(LabelRef("search_loop_" + name)) // << -- Hot
 		// Prepare for emit
 		// Update repeat
@@ -906,8 +1021,13 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m
 		}
 		// s+=4, candidate+=4
 		checkDst(0, nil)
-		ADDL(U8(3), s)
-		ADDL(U8(4), candidate)
+		if match8 {
+			ADDL(U8(7), s)
+			ADDL(U8(8), candidate)
+		} else {
+			ADDL(U8(3), s)
+			ADDL(U8(4), candidate)
+		}
 		{
 			// Extend the 4-byte match as long as possible and emit copy.
 			assertCand(candidate, func(cand reg.Register, ok LabelRef) {
@@ -933,6 +1053,7 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m
 				length,
 				LabelRef("match_nolit2_end_"+name),
 			)
+			PCALIGN(16)
 			Label("match_nolit2_end_" + name)
 			assert(func(ok LabelRef) {
 				// Should never exceed max block size...
@@ -942,8 +1063,12 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m
 
 			// s += length (length is destroyed, use it now)
 			ADDL(length.As32(), s)
-			ADDL(U8(4), length.As32()) // length += 4
-			MOVL(s, nextEmitL)         // nextEmit = s
+			if match8 {
+				ADDL(U8(8), length.As32()) // length += 8
+			} else {
+				ADDL(U8(4), length.As32()) // length += 4
+			}
+			MOVL(s, nextEmitL) // nextEmit = s
 		}
 		// Load offset from repeat value.
 		MOVL(repeatL, offset.As32())
@@ -1167,6 +1292,7 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, sk
 	Load(Param("src").Base(), src)
 
 	// Load cv
+	PCALIGN(16)
 	Label("search_loop_" + name)
 	reloadTables("tmp", &sTab, &lTab)
 	candidate := GP32()
@@ -1261,12 +1387,20 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, sk
 			ifok()
 			return
 		}
-		skip := fmt.Sprintf("offset_ok_%d_%s", ccCounter, name)
-		ccCounter++
-		CMPL(cand.As32(), minPos.As32())
-		JLE(LabelRef(skip))
-		ifok()
-		Label(skip)
+		if matchOffsetCMOV {
+			// Use CMOV over JLE to avoid a jump.
+			// Intel seems to favor this.
+			CMPL(cand.As32(), minPos.As32())
+			CMOVLLE(minPos.As32(), cand.As32())
+			ifok()
+		} else {
+			skip := fmt.Sprintf("offset_ok_%d_%s", ccCounter, name)
+			ccCounter++
+			CMPL(cand.As32(), minPos.As32())
+			JLE(LabelRef(skip))
+			ifok()
+			Label(skip)
+		}
 	}
 	longVal := GP64()
 	shortVal := GP64()
@@ -1390,6 +1524,7 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, sk
 		length := o.matchLen("repeat_extend_"+name, forwardStart, backStart, srcLeft, nil, LabelRef("repeat_extend_forward_end_"+name))
 		forwardStart, backStart, srcLeft = nil, nil, nil
 
+		PCALIGN(16)
 		Label("repeat_extend_forward_end_" + name)
 		// s+= length
 		ADDL(length.As32(), s)
@@ -1419,6 +1554,7 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, sk
 		}
 		JMP(LabelRef("search_loop_" + name))
 	}
+	PCALIGN(16)
 	Label("no_repeat_found_" + name)
 	{
 		// Check candidates are ok. All must be < s and < len(src)
@@ -1484,6 +1620,7 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, sk
 		}
 	}
 
+	PCALIGN(16)
 	Label("candidate_match_" + name)
 	// We have a match at 's' with src offset in "candidate" that matches at least 4 bytes.
 	// Extend backwards
@@ -1566,6 +1703,7 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, sk
 		nil,
 		LabelRef("match_nolit_end_"+name),
 	)
+	PCALIGN(16)
 	Label("match_nolit_end_" + name)
 	assert(func(ok LabelRef) {
 		CMPL(length.As32(), U32(math.MaxInt32))
@@ -1870,11 +2008,11 @@ func hashN(o options, hashBytes, tablebits int) hashGen {
 		o: o,
 	}
 	if o.bmi2 {
-		if hashBytes < 8 && hashBytes != 4 {
-			MOVQ(U32(hashBytes*8), h.mulreg)
-		} else {
-			MOVQ(U32(tablebits), h.mulreg)
+		if hashBytes < 8 {
+			h.clear = GP64()
+			MOVQ(U8(hashBytes*8), h.clear)
 		}
+		MOVQ(U8(tablebits), h.mulreg)
 		return h
 	}
 	primebytes := uint64(0)
@@ -1901,32 +2039,16 @@ func hashN(o options, hashBytes, tablebits int) hashGen {
 // hash uses multiply to get hash of the value.
 func (h hashGen) hash(val reg.GPVirtual) {
 	if h.o.bmi2 {
-		// Broken somehow...
 		if h.bytes < 8 {
-			if h.bytes == 4 {
-				MOVL(val.As32(), val.As32())
-			} else {
-				//SHLQ(U8(64-8*h.bytes), val)
-				BZHIQ(val, h.mulreg, val)
-			}
+			BZHIQ(val, h.clear, val)
 		}
 		CRC32Q(val, val)
-		tmp := h.mulreg
-		if h.bytes < 8 && h.bytes != 4 {
-			tmp = GP64()
-			MOVQ(U32(h.tablebits), tmp)
-		}
-		//SHRQ(U8(32-h.tablebits), val)
-		BZHIQ(val, tmp, val)
-		return
+		BZHIQ(val, h.mulreg, val)
+		return
 	}
 	// Move value to top of register.
 	if h.bytes < 8 {
-		if h.bytes == 4 {
-			MOVL(val.As32(), val.As32())
-		} else {
-			SHLQ(U8(64-8*h.bytes), val)
-		}
+		SHLQ(U8(64-8*h.bytes), val)
 	}
 	// 329 AMD64          :IMUL r64, r64                   L:   0.86ns=  3.0c  T:   0.29ns=  1.00c
 	// 2020 BMI2           :MULX r64, r64, r64              L:   1.14ns=  4.0c  T:   0.29ns=  1.00c
@@ -2777,6 +2898,7 @@ func (o options) genMemMoveShort(name string, dst, src, length reg.GPVirtual, en
 	}
 
 	if minMove <= 16 {
+		PCALIGN(16)
 		Label(name + "move_8through16")
 		if margin < 16 {
 			MOVQ(Mem{Base: src}, AX)
@@ -2825,9 +2947,9 @@ func (o options) genMemMoveShort(name string, dst, src, length reg.GPVirtual, en
 // func genMemMoveLong(to, from unsafe.Pointer, n uintptr)
 // src and dst may not overlap.
 // length must be >= 64 bytes. Is preserved.
-// Non AVX uses 2 GP register, 16 SSE2 registers.
-// AVX uses 4 GP registers 16 AVX/SSE registers.
+// Uses 2 GP register, 16 SSE2 registers.
 // All passed registers are preserved.
+// Attempts to align dst writes.
func (o options) genMemMoveLong(name string, dst, src, length reg.GPVirtual, end LabelRef) { if o.skipOutput { JMP(end) @@ -2844,8 +2966,6 @@ func (o options) genMemMoveLong(name string, dst, src, length reg.GPVirtual, end // Store start and end for sse_tail Label(name + "forward_sse") X0, X1, X2, X3, X4, X5 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM() - // X6, X7 := XMM(), XMM() - //X8, X9, X10, X11 := XMM(), XMM(), XMM(), XMM() MOVOU(Mem{Base: src}, X0) MOVOU(Mem{Base: src, Disp: 16}, X1) @@ -2874,6 +2994,7 @@ func (o options) genMemMoveLong(name string, dst, src, length reg.GPVirtual, end dstPos := GP64() LEAQ(Mem{Disp: -32, Base: dst, Scale: 1, Index: srcOff}, dstPos) + PCALIGN(16) Label(name + "big_loop_back") MOVOU(Mem{Disp: 0, Base: srcPos}, X4) @@ -2909,8 +3030,6 @@ func (o options) genMemMoveLong(name string, dst, src, length reg.GPVirtual, end // genMemMoveLong64 copies regions of at least 64 bytes. // src and dst may not overlap by less than 64 bytes. // length must be >= 64 bytes. Is preserved. -// Non AVX uses 2 GP register, 16 SSE2 registers. -// AVX uses 4 GP registers 16 AVX/SSE registers. // All passed registers are preserved. func (o options) genMemMoveLong64(name string, dst, src, length reg.GPVirtual, end LabelRef) { if o.skipOutput { @@ -2928,37 +3047,47 @@ func (o options) genMemMoveLong64(name string, dst, src, length reg.GPVirtual, e // We do purely unaligned copied. // Modern processors doesn't seems to care, // and one of the addresses will most often be unaligned anyway. - X0, X1 := XMM(), XMM() + X0, X1, X2, X3 := XMM(), XMM(), XMM(), XMM() // forward (only) bigLoops := GP64() MOVQ(length, bigLoops) - SHRQ(U8(5), bigLoops) // bigLoops = length / 32 + SHRQ(U8(6), bigLoops) // bigLoops = length / 64 srcPos, dstPos, remain := GP64(), GP64(), GP64() MOVQ(src, srcPos) MOVQ(dst, dstPos) MOVQ(length, remain) + PCALIGN(16) Label(name + "big_loop_back") MOVOU(Mem{Disp: 0, Base: srcPos}, X0) MOVOU(Mem{Disp: 16, Base: srcPos}, X1) + MOVOU(Mem{Disp: 32, Base: srcPos}, X2) + MOVOU(Mem{Disp: 48, Base: srcPos}, X3) MOVOU(X0, Mem{Disp: 0, Base: dstPos}) MOVOU(X1, Mem{Disp: 16, Base: dstPos}) - ADDQ(U8(32), dstPos) - ADDQ(U8(32), srcPos) - SUBQ(U8(32), remain) + MOVOU(X2, Mem{Disp: 32, Base: dstPos}) + MOVOU(X3, Mem{Disp: 48, Base: dstPos}) + ADDQ(U8(64), dstPos) + ADDQ(U8(64), srcPos) + SUBQ(U8(64), remain) + JZ(end) DECQ(bigLoops) JNZ(LabelRef(name + "big_loop_back")) TESTQ(remain, remain) JZ(end) - // We have 1 -> 31 remaining, but we can write in earlier part. - MOVOU(Mem{Base: srcPos, Disp: -32, Index: remain, Scale: 1}, X0) - MOVOU(Mem{Base: srcPos, Disp: -16, Index: remain, Scale: 1}, X1) - MOVOU(X0, Mem{Base: dstPos, Disp: -32, Index: remain, Scale: 1}) - MOVOU(X1, Mem{Base: dstPos, Disp: -16, Index: remain, Scale: 1}) + // We have 1 -> 63 remaining, but we can overwrite in the earlier part. 
+ MOVOU(Mem{Base: srcPos, Disp: -64, Index: remain, Scale: 1}, X0) + MOVOU(Mem{Base: srcPos, Disp: -48, Index: remain, Scale: 1}, X1) + MOVOU(Mem{Base: srcPos, Disp: -32, Index: remain, Scale: 1}, X2) + MOVOU(Mem{Base: srcPos, Disp: -16, Index: remain, Scale: 1}, X3) + MOVOU(X0, Mem{Base: dstPos, Disp: -64, Index: remain, Scale: 1}) + MOVOU(X1, Mem{Base: dstPos, Disp: -48, Index: remain, Scale: 1}) + MOVOU(X2, Mem{Base: dstPos, Disp: -32, Index: remain, Scale: 1}) + MOVOU(X3, Mem{Base: dstPos, Disp: -16, Index: remain, Scale: 1}) JMP(end) return @@ -3008,6 +3137,7 @@ func (o options) matchLen(name string, a, b, len, dst reg.GPVirtual, end LabelRe Label("avx2_continue_" + name) JMP(LabelRef("matchlen_loop_16_entry_" + name)) + PCALIGN(16) Label("matchlen_loopback_16_" + name) tmp2 := GP64() MOVQ(Mem{Base: a, Index: matched, Scale: 1}, tmp) @@ -3025,6 +3155,7 @@ func (o options) matchLen(name string, a, b, len, dst reg.GPVirtual, end LabelRe JAE(LabelRef("matchlen_loopback_16_" + name)) JMP(LabelRef("matchlen_match8_" + name)) + PCALIGN(16) Label("matchlen_bsf_16" + name) // Not all match. TZCNTQ(tmp2, tmp2) @@ -3043,13 +3174,14 @@ func (o options) matchLen(name string, a, b, len, dst reg.GPVirtual, end LabelRe LEAL(Mem{Base: len, Disp: -8}, len.As32()) LEAL(Mem{Base: matched, Disp: 8}, matched) JMP(LabelRef("matchlen_match4_" + name)) - Label("matchlen_bsf_8_" + name) + PCALIGN(16) + Label("matchlen_bsf_8_" + name) // Not all match. TZCNTQ(tmp, tmp) // tmp is the number of bits that matched. SARQ(U8(3), tmp) - LEAL(Mem{Base: matched, Index: tmp, Scale: 1}, matched) + ADDL(tmp.As32(), matched) JMP(end) // Less than 8 bytes left. @@ -3115,7 +3247,7 @@ func (o options) matchLenAVX2(name string, a, b, len reg.GPVirtual, cont, end La JNE(LabelRef(name + "cal_prefix")) ADDQ(U8(32), a) ADDQ(U8(32), b) - ADDL(U8(32), dst) + ADDL(U8(32), dst.As32()) SUBQ(U8(32), len) JZ(end) JMP(LabelRef(name + "loop")) @@ -3125,7 +3257,7 @@ func (o options) matchLenAVX2(name string, a, b, len reg.GPVirtual, cont, end La { NOTQ(equalMaskBits) TZCNTQ(equalMaskBits, equalMaskBits) - ADDL(equalMaskBits.As32(), dst) + ADDL(equalMaskBits.As32(), dst.As32()) } JMP(end) return @@ -3555,6 +3687,10 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr } // LOOP + if !prefetch { + // Triggers https://github.com/golang/go/issues/74648 + //PCALIGN(16) + } Label(name + "_loop") CMPQ(src, srcLimit) JAE(LabelRef(name + "_end_copy")) @@ -3562,7 +3698,10 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr MOVQ(tag, value) SHRQ(U8(2), value) - Label(name + "_loop_nofetch") + if prefetch { + PCALIGN(16) + Label(name + "_loop_nofetch") + } // Check destination CMPQ(dst, dstLimit) JAE(LabelRef(name + "_end_copy")) @@ -3573,6 +3712,7 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr JNZ(LabelRef(name + "_copy")) // TAG 00 Literals length := GP64() + PCALIGN(16) Label(name + "_lits") { MOVL(value.As32(), length.As32()) @@ -3585,6 +3725,7 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr JMP(LabelRef(name + "_lit_3")) // Must be 31 // 1 - > 29 literals + PCALIGN(16) Label(name + "_lit_0") { INCQ(src) @@ -3709,6 +3850,7 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr JMP(LabelRef(name + "_copy_3")) // TAG 1 - Copy 1 + PCALIGN(16) Label(name + "_copy_1") { if o.inputMargin < 2 { @@ -3771,6 +3913,7 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, 
sr } // TAG 2 - Copy 2 + PCALIGN(16) Label(name + "_copy_2") { // length = int(src[s-3]) >> 2 @@ -3862,6 +4005,7 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr } } // TAG 3 - Copy 2/3 fused + PCALIGN(16) Label(name + "_copy_3") { if o.inputMargin < 4 { @@ -4048,6 +4192,7 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr } // Length always < 64 copySrc := GP64() + PCALIGN(16) Label(name + "_copy_exec_short") { CMPL(offset.As32(), dstPos.As32()) @@ -4067,6 +4212,7 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr o.outputMargin -= 4 // 64 offset, 64 length + PCALIGN(16) Label(name + "_copy_exec_long_long") { MOVQ(dst, copySrc) @@ -4080,8 +4226,10 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr } o.genMemMoveLong64(name+"_copy_long_long", dst, copySrc, length, LabelRef(name+"_copy_done")) } + // length 4 -> 64, no overlap // Very hot (16 byte copy mainly) + PCALIGN(16) Label(name + "_copy_short_no_ol") { // Create source pointer with offset @@ -4097,6 +4245,7 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr o.genMemMoveShort(name+"_copy_short_no_ol", dst, copySrc, length, LabelRef(name+"_copy_done"), 4) } // Offset anything, length anything + PCALIGN(16) Label(name + "_copy_exec") { CMPL(offset.As32(), dstPos.As32()) @@ -4112,7 +4261,7 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr CMPL(offset.As32(), length.As32()) JB(LabelRef(name + "_copy_overlap")) CMPL(length.As32(), U8(64)) - JA(LabelRef(name + "_copy_long")) + JAE(LabelRef(name + "_copy_long")) Label(name + "_copy_short") { @@ -4273,3 +4422,10 @@ func (o options) genDecodeLoop(name string, dstEnd, srcEnd reg.Register, dst, sr Label(name + "_end_copy") Label(name + "_end_done") } + +func PCALIGN(n int) { + Instruction(&ir.Instruction{ + Opcode: "PCALIGN", + Operands: []Op{Imm(uint64(n))}, + }) +} diff --git a/asm_amd64.go b/asm_amd64.go index 605b14d..9b991ed 100644 --- a/asm_amd64.go +++ b/asm_amd64.go @@ -53,6 +53,55 @@ func encodeBlockAsm4K(dst []byte, src []byte, tmp *[2048]byte) int //go:noescape func encodeBlockAsm1K(dst []byte, src []byte, tmp *[1024]byte) int +// encodeFastBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 8388608 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeFastBlockAsm(dst []byte, src []byte, tmp *[65536]byte) int + +// encodeFastBlockAsm2MB encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 2097152 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeFastBlockAsm2MB(dst []byte, src []byte, tmp *[32768]byte) int + +// encodeFastBlockAsm512K encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 524288 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeFastBlockAsm512K(dst []byte, src []byte, tmp *[32768]byte) int + +// encodeFastBlockAsm64K encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 65536 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. 
+// +//go:noescape +func encodeFastBlockAsm64K(dst []byte, src []byte, tmp *[8192]byte) int + +// encodeFastBlockAsm16K encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 16384 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeFastBlockAsm16K(dst []byte, src []byte, tmp *[4096]byte) int + +// encodeFastBlockAsm4K encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 4096 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeFastBlockAsm4K(dst []byte, src []byte, tmp *[2048]byte) int + +// encodeFastBlockAsm1K encodes a non-empty src to a guaranteed-large-enough dst. +// Maximum input 1024 bytes. +// It assumes that the varint-encoded length of the decompressed bytes has already been written. +// +//go:noescape +func encodeFastBlockAsm1K(dst []byte, src []byte, tmp *[1024]byte) int + // encodeBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst. // Maximum input 8388608 bytes. // It assumes that the varint-encoded length of the decompressed bytes has already been written. diff --git a/asm_amd64.s b/asm_amd64.s index c18e6c4..d6b54ce 100644 --- a/asm_amd64.s +++ b/asm_amd64.s @@ -39,363 +39,53 @@ zero_loop_encodeBlockAsm: MOVQ src_base+24(FP), BX search_loop_encodeBlockAsm: - MOVL DX, SI - SUBL 12(SP), SI - SHRL $0x06, SI - LEAL 4(DX)(SI*1), SI - CMPL SI, 8(SP) - JAE emit_remainder_encodeBlockAsm - MOVQ (BX)(DX*1), DI - LEAL -2162685(DX), R8 - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R10 - MOVQ DI, R11 - MOVQ DI, R12 - SHRQ $0x08, R12 - SHLQ $0x10, R11 - IMULQ R10, R11 - SHRQ $0x31, R11 - SHLQ $0x10, R12 - IMULQ R10, R12 - SHRQ $0x31, R12 - MOVL (AX)(R11*4), SI - MOVL (AX)(R12*4), R9 - MOVL DX, (AX)(R11*4) - MOVL DX, (AX)(R12*4) - MOVQ DI, R11 - SHRQ $0x10, R11 - SHLQ $0x10, R11 - IMULQ R10, R11 - SHRQ $0x31, R11 - MOVL DX, R10 - SUBL 16(SP), R10 - MOVL 1(BX)(R10*1), R12 - MOVQ DI, R10 - SHRQ $0x08, R10 - CMPL R10, R12 - JNE no_repeat_found_encodeBlockAsm - LEAL 1(DX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeBlockAsm - -repeat_extend_back_loop_encodeBlockAsm: - CMPL DI, SI - JBE repeat_extend_back_end_encodeBlockAsm - MOVB -1(BX)(R8*1), R9 - MOVB -1(BX)(DI*1), R10 - CMPB R9, R10 - JNE repeat_extend_back_end_encodeBlockAsm - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeBlockAsm - -repeat_extend_back_end_encodeBlockAsm: - MOVL DI, SI - MOVL 12(SP), R8 - SUBL R8, SI - LEAQ 4(CX)(SI*1), R9 - CMPQ R9, (SP) - JB dst_size_check_ok_1 - MOVQ $0x00000000, ret+56(FP) - RET - -dst_size_check_ok_1: - LEAQ (BX)(R8*1), R8 - - // emitLiteral - LEAL -1(SI), R9 - CMPL R9, $0x1d - JB one_byte_repeat_emit_lits_encodeBlockAsm - SUBL $0x1d, R9 - CMPL R9, $0x00000100 - JB two_bytes_repeat_emit_lits_encodeBlockAsm - CMPL R9, $0x00010000 - JB three_bytes_repeat_emit_lits_encodeBlockAsm - MOVL R9, R10 - SHRL $0x10, R10 - MOVB $0xf8, (CX) - MOVW R9, 1(CX) - MOVB R10, 3(CX) - ADDQ $0x04, CX - ADDL $0x1d, R9 - JMP memmove_long_repeat_emit_lits_encodeBlockAsm - -three_bytes_repeat_emit_lits_encodeBlockAsm: - MOVB $0xf0, (CX) - MOVW R9, 1(CX) - ADDQ $0x03, CX - ADDL $0x1d, R9 - JMP memmove_long_repeat_emit_lits_encodeBlockAsm - -two_bytes_repeat_emit_lits_encodeBlockAsm: - MOVB $0xe8, (CX) - MOVB R9, 1(CX) - ADDL $0x1d, R9 - ADDQ $0x02, CX - CMPL R9, $0x40 - JB memmove_midrepeat_emit_lits_encodeBlockAsm - JMP 
memmove_long_repeat_emit_lits_encodeBlockAsm - -one_byte_repeat_emit_lits_encodeBlockAsm: - SHLB $0x03, R9 - MOVB R9, (CX) - ADDQ $0x01, CX - LEAQ (CX)(SI*1), R9 - - // genMemMoveShort - // margin: 16, min move: 1 - CMPQ SI, $0x10 - JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_8through16 - CMPQ SI, $0x20 - JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_8through16: - MOVOU (R8), X0 - MOVOU X0, (CX) - JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm - -emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(SI*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(SI*1) - JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm - -emit_lit_memmove_repeat_emit_lits_encodeBlockAsm_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(SI*1) - MOVOU X3, -16(CX)(SI*1) - -memmove_end_copy_repeat_emit_lits_encodeBlockAsm: - MOVQ R9, CX - JMP repeat_emit_lits_end_encodeBlockAsm - -memmove_midrepeat_emit_lits_encodeBlockAsm: - LEAQ (CX)(SI*1), R9 - - // genMemMoveShort - // margin: 15, min move: 30 - CMPQ SI, $0x20 - JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm_memmove_move_33through64 - -emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(SI*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(SI*1) - JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm - -emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(SI*1) - MOVOU X3, -16(CX)(SI*1) - -memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm: - MOVQ R9, CX - JMP repeat_emit_lits_end_encodeBlockAsm - -memmove_long_repeat_emit_lits_encodeBlockAsm: - LEAQ (CX)(SI*1), R9 - - // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVQ SI, R11 - SHRQ $0x05, R11 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R8)(R12*1), R10 - LEAQ -32(CX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsmlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R8)(R12*1), X4 - MOVOU -16(R8)(R12*1), X5 - MOVOA X4, -32(CX)(R12*1) - MOVOA X5, -16(CX)(R12*1) - ADDQ $0x20, R12 - CMPQ SI, R12 - JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(SI*1) - MOVOU X3, -16(CX)(SI*1) - MOVQ R9, CX - -repeat_emit_lits_end_encodeBlockAsm: - ADDL $0x05, DX - MOVL DX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL DX, R8 - LEAQ (BX)(DX*1), R9 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R11, R11 - JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm - 
-matchlen_loopback_16_repeat_extend_encodeBlockAsm: - MOVQ (R9)(R11*1), R10 - MOVQ 8(R9)(R11*1), R12 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm - XORQ 8(SI)(R11*1), R12 - JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm - LEAL -16(R8), R8 - LEAL 16(R11), R11 - -matchlen_loop_16_entry_repeat_extend_encodeBlockAsm: - CMPL R8, $0x10 - JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm - JMP matchlen_match8_repeat_extend_encodeBlockAsm - -matchlen_bsf_16repeat_extend_encodeBlockAsm: - TZCNTQ R12, R12 - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm - -matchlen_match8_repeat_extend_encodeBlockAsm: - CMPL R8, $0x08 - JB matchlen_match4_repeat_extend_encodeBlockAsm - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm - LEAL -8(R8), R8 - LEAL 8(R11), R11 - JMP matchlen_match4_repeat_extend_encodeBlockAsm - -matchlen_bsf_8_repeat_extend_encodeBlockAsm: - TZCNTQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm - -matchlen_match4_repeat_extend_encodeBlockAsm: - CMPL R8, $0x04 - JB matchlen_match2_repeat_extend_encodeBlockAsm - MOVL (R9)(R11*1), R10 - CMPL (SI)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeBlockAsm - LEAL -4(R8), R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeBlockAsm: - CMPL R8, $0x01 - JE matchlen_match1_repeat_extend_encodeBlockAsm - JB repeat_extend_forward_end_encodeBlockAsm - MOVW (R9)(R11*1), R10 - CMPW (SI)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeBlockAsm - LEAL 2(R11), R11 - SUBL $0x02, R8 - JZ repeat_extend_forward_end_encodeBlockAsm - -matchlen_match1_repeat_extend_encodeBlockAsm: - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeBlockAsm - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeBlockAsm: - ADDL R11, DX - MOVL DX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitRepeat - LEAL -1(SI), DI - CMPL SI, $0x1d - JBE repeat_one_match_repeat_encodeBlockAsm - LEAL -30(SI), DI - CMPL SI, $0x0000011e - JB repeat_two_match_repeat_encodeBlockAsm - CMPL SI, $0x0001001e - JB repeat_three_match_repeat_encodeBlockAsm - MOVB $0xfc, (CX) - MOVL DI, 1(CX) - ADDQ $0x04, CX - JMP repeat_end_emit_encodeBlockAsm - -repeat_three_match_repeat_encodeBlockAsm: - MOVB $0xf4, (CX) - MOVW DI, 1(CX) - ADDQ $0x03, CX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_match_repeat_encodeBlockAsm: - MOVB $0xec, (CX) - MOVB DI, 1(CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm - -repeat_one_match_repeat_encodeBlockAsm: - XORL DI, DI - LEAL -4(DI)(SI*8), DI - MOVB DI, (CX) - ADDQ $0x01, CX - -repeat_end_emit_encodeBlockAsm: - MOVL DX, 12(SP) - JMP search_loop_encodeBlockAsm - -no_repeat_found_encodeBlockAsm: - CMPL SI, R8 - JLE offset_ok_0_encodeBlockAsm - CMPL (BX)(SI*1), DI - JEQ candidate_match_encodeBlockAsm - -offset_ok_0_encodeBlockAsm: - SHRQ $0x08, DI - MOVL (AX)(R11*4), SI - LEAL 2(DX), R10 - CMPL R9, R8 - JLE offset_ok_1_encodeBlockAsm - CMPL (BX)(R9*1), DI - JEQ candidate2_match_encodeBlockAsm - -offset_ok_1_encodeBlockAsm: - MOVL R10, (AX)(R11*4) - SHRQ $0x08, DI - CMPL SI, R8 - JLE offset_ok_2_encodeBlockAsm - CMPL (BX)(SI*1), DI - JEQ candidate3_match_encodeBlockAsm - -offset_ok_2_encodeBlockAsm: - MOVL 20(SP), DX - JMP search_loop_encodeBlockAsm + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x06, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeBlockAsm + MOVQ (BX)(DX*1), DI + LEAL -2162685(DX), R8 + MOVL SI, 20(SP) + MOVQ 
$0x0000cf1bbcdcbf9b, R10 + MOVQ DI, R9 + SHLQ $0x10, R9 + IMULQ R10, R9 + SHRQ $0x31, R9 + MOVQ DI, R11 + SHRQ $0x08, R11 + SHLQ $0x10, R11 + IMULQ R10, R11 + SHRQ $0x31, R11 + MOVL (AX)(R9*4), SI + MOVL DX, (AX)(R9*4) + MOVL (AX)(R11*4), R9 + MOVL DX, (AX)(R11*4) + MOVQ DI, R11 + SHRQ $0x10, R11 + SHLQ $0x10, R11 + IMULQ R10, R11 + SHRQ $0x31, R11 + CMPL SI, R8 + CMOVLLE R8, SI + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm + SHRQ $0x08, DI + MOVL (AX)(R11*4), SI + LEAL 2(DX), R10 + CMPL R9, R8 + CMOVLLE R8, R9 + CMPL (BX)(R9*1), DI + JEQ candidate2_match_encodeBlockAsm + MOVL R10, (AX)(R11*4) + SHRQ $0x08, DI + CMPL SI, R8 + CMOVLLE R8, SI + CMPL (BX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm + MOVL 20(SP), DX + JMP search_loop_encodeBlockAsm candidate3_match_encodeBlockAsm: ADDL $0x02, DX @@ -425,11 +115,11 @@ match_extend_back_loop_encodeBlockAsm: match_extend_back_end_encodeBlockAsm: CMPQ CX, (SP) - JB dst_size_check_ok_2 + JB dst_size_check_ok_1 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_2: +dst_size_check_ok_1: MOVL DX, R8 MOVL DX, DI SUBL SI, DI @@ -444,6 +134,7 @@ dst_size_check_ok_2: // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm + PCALIGN $0x10 matchlen_loopback_16_match_nolit_encodeBlockAsm: MOVQ (R9)(R11*1), R10 @@ -459,6 +150,7 @@ matchlen_loop_16_entry_match_nolit_encodeBlockAsm: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit_encodeBlockAsm JMP matchlen_match8_match_nolit_encodeBlockAsm + PCALIGN $0x10 matchlen_bsf_16match_nolit_encodeBlockAsm: TZCNTQ R12, R12 @@ -475,11 +167,12 @@ matchlen_match8_match_nolit_encodeBlockAsm: LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit_encodeBlockAsm + PCALIGN $0x10 matchlen_bsf_8_match_nolit_encodeBlockAsm: TZCNTQ R10, R10 SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 + ADDL R10, R11 JMP match_nolit_end_encodeBlockAsm matchlen_match4_match_nolit_encodeBlockAsm: @@ -507,6 +200,8 @@ matchlen_match1_match_nolit_encodeBlockAsm: CMPB (SI)(R11*1), R10 JNE match_nolit_end_encodeBlockAsm LEAL 1(R11), R11 + JMP match_nolit_end_encodeBlockAsm + PCALIGN $0x10 match_nolit_end_encodeBlockAsm: ADDL R11, DX @@ -625,11 +320,11 @@ repeat_one_match_emit_repeat_copy2_encodeBlockAsm: match_emit_lits_copy_encodeBlockAsm: LEAQ 4(CX)(R8*1), R9 CMPQ R9, (SP) - JB dst_size_check_ok_3 + JB dst_size_check_ok_2 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_3: +dst_size_check_ok_2: // emitLiteral LEAL -1(R8), R9 CMPL R9, $0x1d @@ -677,6 +372,7 @@ one_byte_match_emit_encodeBlockAsm: CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16: MOVOU (DI), X0 @@ -738,20 +434,21 @@ memmove_long_match_emit_encodeBlockAsm: LEAQ (CX)(R8*1), R9 // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R8*1), X2 - MOVOU -16(DI)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(DI)(R13*1), R10 - LEAQ -32(CX)(R13*1), R14 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(DI)(R13*1), R10 + 
LEAQ -32(CX)(R13*1), R14 + PCALIGN $0x10 emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back: MOVOU (R10), X4 @@ -777,6 +474,8 @@ emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32: MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm + PCALIGN $0x10 match_nolits_copy_encodeBlockAsm: // emitCopy @@ -960,11 +659,11 @@ match_nolit_len_okencodeBlockAsm: SUBL SI, DI MOVL DI, 16(SP) CMPQ CX, (SP) - JB dst_size_check_ok_4 + JB dst_size_check_ok_3 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_4: +dst_size_check_ok_3: ADDL $0x03, DX ADDL $0x04, SI MOVQ src_len+32(FP), DI @@ -975,6 +674,7 @@ dst_size_check_ok_4: // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm + PCALIGN $0x10 matchlen_loopback_16_match_nolit2_encodeBlockAsm: MOVQ (R8)(R11*1), R9 @@ -990,6 +690,7 @@ matchlen_loop_16_entry_match_nolit2_encodeBlockAsm: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm JMP matchlen_match8_match_nolit2_encodeBlockAsm + PCALIGN $0x10 matchlen_bsf_16match_nolit2_encodeBlockAsm: TZCNTQ R10, R10 @@ -1006,11 +707,12 @@ matchlen_match8_match_nolit2_encodeBlockAsm: LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit2_encodeBlockAsm + PCALIGN $0x10 matchlen_bsf_8_match_nolit2_encodeBlockAsm: TZCNTQ R9, R9 SARQ $0x03, R9 - LEAL (R11)(R9*1), R11 + ADDL R9, R11 JMP match_nolit2_end_encodeBlockAsm matchlen_match4_match_nolit2_encodeBlockAsm: @@ -1038,6 +740,8 @@ matchlen_match1_match_nolit2_encodeBlockAsm: CMPB (SI)(R11*1), R9 JNE match_nolit2_end_encodeBlockAsm LEAL 1(R11), R11 + JMP match_nolit2_end_encodeBlockAsm + PCALIGN $0x10 match_nolit2_end_encodeBlockAsm: ADDL R11, DX @@ -1054,11 +758,11 @@ emit_remainder_encodeBlockAsm: LEAQ (BX)(DX*1), DX LEAQ 4(CX)(AX*1), BX CMPQ BX, (SP) - JB dst_size_check_ok_5 + JB dst_size_check_ok_4 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_5: +dst_size_check_ok_4: // emitLiteral LEAL -1(AX), BX CMPL BX, $0x1d @@ -1132,6 +836,7 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through8: MOVL SI, (CX) MOVL DX, -4(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16: MOVQ (DX), SI @@ -1195,20 +900,21 @@ memmove_long_emit_remainder_encodeBlockAsm: LEAQ (CX)(AX*1), BX // genMemMoveLong - MOVOU (DX), X0 - MOVOU 16(DX), X1 - MOVOU -32(DX)(AX*1), X2 - MOVOU -16(DX)(AX*1), X3 - MOVQ AX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(DX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVQ AX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(DX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + PCALIGN $0x10 emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back: MOVOU (SI), X4 @@ -1285,341 +991,37 @@ search_loop_encodeBlockAsm2MB: MOVQ (BX)(DX*1), DI MOVL SI, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R8 + SHLQ $0x10, R8 + IMULQ R9, R8 + SHRQ $0x31, R8 MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 + SHRQ $0x08, R10 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x31, R10 - SHLQ $0x10, R11 - IMULQ R9, R11 - SHRQ $0x31, R11 - MOVL (AX)(R10*4), SI - MOVL 
(AX)(R11*4), R8 + MOVL (AX)(R8*4), SI + MOVL DX, (AX)(R8*4) + MOVL (AX)(R10*4), R8 MOVL DX, (AX)(R10*4) - MOVL DX, (AX)(R11*4) MOVQ DI, R10 SHRQ $0x10, R10 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x31, R10 - MOVL DX, R9 - SUBL 16(SP), R9 - MOVL 1(BX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeBlockAsm2MB - LEAL 1(DX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeBlockAsm2MB - -repeat_extend_back_loop_encodeBlockAsm2MB: - CMPL DI, SI - JBE repeat_extend_back_end_encodeBlockAsm2MB - MOVB -1(BX)(R8*1), R9 - MOVB -1(BX)(DI*1), R10 - CMPB R9, R10 - JNE repeat_extend_back_end_encodeBlockAsm2MB - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeBlockAsm2MB - -repeat_extend_back_end_encodeBlockAsm2MB: - MOVL DI, SI - MOVL 12(SP), R8 - SUBL R8, SI - LEAQ 4(CX)(SI*1), R9 - CMPQ R9, (SP) - JB dst_size_check_ok_1 - MOVQ $0x00000000, ret+56(FP) - RET - -dst_size_check_ok_1: - LEAQ (BX)(R8*1), R8 - - // emitLiteral - LEAL -1(SI), R9 - CMPL R9, $0x1d - JB one_byte_repeat_emit_lits_encodeBlockAsm2MB - SUBL $0x1d, R9 - CMPL R9, $0x00000100 - JB two_bytes_repeat_emit_lits_encodeBlockAsm2MB - CMPL R9, $0x00010000 - JB three_bytes_repeat_emit_lits_encodeBlockAsm2MB - MOVL R9, R10 - SHRL $0x10, R10 - MOVB $0xf8, (CX) - MOVW R9, 1(CX) - MOVB R10, 3(CX) - ADDQ $0x04, CX - ADDL $0x1d, R9 - JMP memmove_long_repeat_emit_lits_encodeBlockAsm2MB - -three_bytes_repeat_emit_lits_encodeBlockAsm2MB: - MOVB $0xf0, (CX) - MOVW R9, 1(CX) - ADDQ $0x03, CX - ADDL $0x1d, R9 - JMP memmove_long_repeat_emit_lits_encodeBlockAsm2MB - -two_bytes_repeat_emit_lits_encodeBlockAsm2MB: - MOVB $0xe8, (CX) - MOVB R9, 1(CX) - ADDL $0x1d, R9 - ADDQ $0x02, CX - CMPL R9, $0x40 - JB memmove_midrepeat_emit_lits_encodeBlockAsm2MB - JMP memmove_long_repeat_emit_lits_encodeBlockAsm2MB - -one_byte_repeat_emit_lits_encodeBlockAsm2MB: - SHLB $0x03, R9 - MOVB R9, (CX) - ADDQ $0x01, CX - LEAQ (CX)(SI*1), R9 - - // genMemMoveShort - // margin: 16, min move: 1 - CMPQ SI, $0x10 - JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_8through16 - CMPQ SI, $0x20 - JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_8through16: - MOVOU (R8), X0 - MOVOU X0, (CX) - JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm2MB - -emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(SI*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(SI*1) - JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm2MB - -emit_lit_memmove_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(SI*1) - MOVOU X3, -16(CX)(SI*1) - -memmove_end_copy_repeat_emit_lits_encodeBlockAsm2MB: - MOVQ R9, CX - JMP repeat_emit_lits_end_encodeBlockAsm2MB - -memmove_midrepeat_emit_lits_encodeBlockAsm2MB: - LEAQ (CX)(SI*1), R9 - - // genMemMoveShort - // margin: 15, min move: 30 - CMPQ SI, $0x20 - JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_17through32 - JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_33through64 - -emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(SI*1), X1 - MOVOU X0, (CX) - MOVOU X1, 
-16(CX)(SI*1) - JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm2MB - -emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm2MB_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(SI*1) - MOVOU X3, -16(CX)(SI*1) - -memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm2MB: - MOVQ R9, CX - JMP repeat_emit_lits_end_encodeBlockAsm2MB - -memmove_long_repeat_emit_lits_encodeBlockAsm2MB: - LEAQ (CX)(SI*1), R9 - - // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVQ SI, R11 - SHRQ $0x05, R11 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm2MBlarge_forward_sse_loop_32 - LEAQ -32(R8)(R12*1), R10 - LEAQ -32(CX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm2MBlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm2MBlarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm2MBlarge_forward_sse_loop_32: - MOVOU -32(R8)(R12*1), X4 - MOVOU -16(R8)(R12*1), X5 - MOVOA X4, -32(CX)(R12*1) - MOVOA X5, -16(CX)(R12*1) - ADDQ $0x20, R12 - CMPQ SI, R12 - JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm2MBlarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(SI*1) - MOVOU X3, -16(CX)(SI*1) - MOVQ R9, CX - -repeat_emit_lits_end_encodeBlockAsm2MB: - ADDL $0x05, DX - MOVL DX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL DX, R8 - LEAQ (BX)(DX*1), R9 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R11, R11 - JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm2MB - -matchlen_loopback_16_repeat_extend_encodeBlockAsm2MB: - MOVQ (R9)(R11*1), R10 - MOVQ 8(R9)(R11*1), R12 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm2MB - XORQ 8(SI)(R11*1), R12 - JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm2MB - LEAL -16(R8), R8 - LEAL 16(R11), R11 - -matchlen_loop_16_entry_repeat_extend_encodeBlockAsm2MB: - CMPL R8, $0x10 - JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm2MB - JMP matchlen_match8_repeat_extend_encodeBlockAsm2MB - -matchlen_bsf_16repeat_extend_encodeBlockAsm2MB: - TZCNTQ R12, R12 - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm2MB - -matchlen_match8_repeat_extend_encodeBlockAsm2MB: - CMPL R8, $0x08 - JB matchlen_match4_repeat_extend_encodeBlockAsm2MB - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm2MB - LEAL -8(R8), R8 - LEAL 8(R11), R11 - JMP matchlen_match4_repeat_extend_encodeBlockAsm2MB - -matchlen_bsf_8_repeat_extend_encodeBlockAsm2MB: - TZCNTQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm2MB - -matchlen_match4_repeat_extend_encodeBlockAsm2MB: - CMPL R8, $0x04 - JB matchlen_match2_repeat_extend_encodeBlockAsm2MB - MOVL (R9)(R11*1), R10 - CMPL (SI)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeBlockAsm2MB - LEAL -4(R8), R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeBlockAsm2MB: - CMPL R8, $0x01 - JE matchlen_match1_repeat_extend_encodeBlockAsm2MB - JB repeat_extend_forward_end_encodeBlockAsm2MB - MOVW (R9)(R11*1), R10 - CMPW (SI)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeBlockAsm2MB 
- LEAL 2(R11), R11 - SUBL $0x02, R8 - JZ repeat_extend_forward_end_encodeBlockAsm2MB - -matchlen_match1_repeat_extend_encodeBlockAsm2MB: - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeBlockAsm2MB - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeBlockAsm2MB: - ADDL R11, DX - MOVL DX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitRepeat - LEAL -1(SI), DI - CMPL SI, $0x1d - JBE repeat_one_match_repeat_encodeBlockAsm2MB - LEAL -30(SI), DI - CMPL SI, $0x0000011e - JB repeat_two_match_repeat_encodeBlockAsm2MB - CMPL SI, $0x0001001e - JB repeat_three_match_repeat_encodeBlockAsm2MB - MOVB $0xfc, (CX) - MOVL DI, 1(CX) - ADDQ $0x04, CX - JMP repeat_end_emit_encodeBlockAsm2MB - -repeat_three_match_repeat_encodeBlockAsm2MB: - MOVB $0xf4, (CX) - MOVW DI, 1(CX) - ADDQ $0x03, CX - JMP repeat_end_emit_encodeBlockAsm2MB - -repeat_two_match_repeat_encodeBlockAsm2MB: - MOVB $0xec, (CX) - MOVB DI, 1(CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm2MB - -repeat_one_match_repeat_encodeBlockAsm2MB: - XORL DI, DI - LEAL -4(DI)(SI*8), DI - MOVB DI, (CX) - ADDQ $0x01, CX - -repeat_end_emit_encodeBlockAsm2MB: - MOVL DX, 12(SP) - JMP search_loop_encodeBlockAsm2MB - -no_repeat_found_encodeBlockAsm2MB: - CMPL (BX)(SI*1), DI - JEQ candidate_match_encodeBlockAsm2MB - SHRQ $0x08, DI - MOVL (AX)(R10*4), SI - LEAL 2(DX), R9 - CMPL (BX)(R8*1), DI - JEQ candidate2_match_encodeBlockAsm2MB - MOVL R9, (AX)(R10*4) - SHRQ $0x08, DI - CMPL (BX)(SI*1), DI - JEQ candidate3_match_encodeBlockAsm2MB - MOVL 20(SP), DX - JMP search_loop_encodeBlockAsm2MB + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm2MB + SHRQ $0x08, DI + MOVL (AX)(R10*4), SI + LEAL 2(DX), R9 + CMPL (BX)(R8*1), DI + JEQ candidate2_match_encodeBlockAsm2MB + MOVL R9, (AX)(R10*4) + SHRQ $0x08, DI + CMPL (BX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm2MB + MOVL 20(SP), DX + JMP search_loop_encodeBlockAsm2MB candidate3_match_encodeBlockAsm2MB: ADDL $0x02, DX @@ -1649,11 +1051,11 @@ match_extend_back_loop_encodeBlockAsm2MB: match_extend_back_end_encodeBlockAsm2MB: CMPQ CX, (SP) - JB dst_size_check_ok_2 + JB dst_size_check_ok_1 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_2: +dst_size_check_ok_1: MOVL DX, R8 MOVL DX, DI SUBL SI, DI @@ -1668,6 +1070,7 @@ dst_size_check_ok_2: // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm2MB + PCALIGN $0x10 matchlen_loopback_16_match_nolit_encodeBlockAsm2MB: MOVQ (R9)(R11*1), R10 @@ -1683,6 +1086,7 @@ matchlen_loop_16_entry_match_nolit_encodeBlockAsm2MB: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit_encodeBlockAsm2MB JMP matchlen_match8_match_nolit_encodeBlockAsm2MB + PCALIGN $0x10 matchlen_bsf_16match_nolit_encodeBlockAsm2MB: TZCNTQ R12, R12 @@ -1699,11 +1103,12 @@ matchlen_match8_match_nolit_encodeBlockAsm2MB: LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit_encodeBlockAsm2MB + PCALIGN $0x10 matchlen_bsf_8_match_nolit_encodeBlockAsm2MB: TZCNTQ R10, R10 SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 + ADDL R10, R11 JMP match_nolit_end_encodeBlockAsm2MB matchlen_match4_match_nolit_encodeBlockAsm2MB: @@ -1731,6 +1136,8 @@ matchlen_match1_match_nolit_encodeBlockAsm2MB: CMPB (SI)(R11*1), R10 JNE match_nolit_end_encodeBlockAsm2MB LEAL 1(R11), R11 + JMP match_nolit_end_encodeBlockAsm2MB + PCALIGN $0x10 match_nolit_end_encodeBlockAsm2MB: ADDL R11, DX @@ -1849,11 +1256,11 @@ repeat_one_match_emit_repeat_copy2_encodeBlockAsm2MB: match_emit_lits_copy_encodeBlockAsm2MB: LEAQ 4(CX)(R8*1), R9 CMPQ R9, (SP) - JB dst_size_check_ok_3 + JB 
dst_size_check_ok_2 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_3: +dst_size_check_ok_2: // emitLiteral LEAL -1(R8), R9 CMPL R9, $0x1d @@ -1901,6 +1308,7 @@ one_byte_match_emit_encodeBlockAsm2MB: CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm2MB_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm2MB_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBlockAsm2MB_memmove_move_8through16: MOVOU (DI), X0 @@ -1962,20 +1370,21 @@ memmove_long_match_emit_encodeBlockAsm2MB: LEAQ (CX)(R8*1), R9 // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R8*1), X2 - MOVOU -16(DI)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm2MBlarge_forward_sse_loop_32 - LEAQ -32(DI)(R13*1), R10 - LEAQ -32(CX)(R13*1), R14 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm2MBlarge_forward_sse_loop_32 + LEAQ -32(DI)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + PCALIGN $0x10 emit_lit_memmove_long_match_emit_encodeBlockAsm2MBlarge_big_loop_back: MOVOU (R10), X4 @@ -2001,6 +1410,8 @@ emit_lit_memmove_long_match_emit_encodeBlockAsm2MBlarge_forward_sse_loop_32: MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm2MB + PCALIGN $0x10 match_nolits_copy_encodeBlockAsm2MB: // emitCopy @@ -2178,11 +1589,11 @@ match_nolit_dst_ok_encodeBlockAsm2MB: SUBL SI, DI MOVL DI, 16(SP) CMPQ CX, (SP) - JB dst_size_check_ok_4 + JB dst_size_check_ok_3 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_4: +dst_size_check_ok_3: ADDL $0x03, DX ADDL $0x04, SI MOVQ src_len+32(FP), DI @@ -2193,6 +1604,7 @@ dst_size_check_ok_4: // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm2MB + PCALIGN $0x10 matchlen_loopback_16_match_nolit2_encodeBlockAsm2MB: MOVQ (R8)(R11*1), R9 @@ -2208,6 +1620,7 @@ matchlen_loop_16_entry_match_nolit2_encodeBlockAsm2MB: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm2MB JMP matchlen_match8_match_nolit2_encodeBlockAsm2MB + PCALIGN $0x10 matchlen_bsf_16match_nolit2_encodeBlockAsm2MB: TZCNTQ R10, R10 @@ -2224,11 +1637,12 @@ matchlen_match8_match_nolit2_encodeBlockAsm2MB: LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit2_encodeBlockAsm2MB + PCALIGN $0x10 matchlen_bsf_8_match_nolit2_encodeBlockAsm2MB: TZCNTQ R9, R9 SARQ $0x03, R9 - LEAL (R11)(R9*1), R11 + ADDL R9, R11 JMP match_nolit2_end_encodeBlockAsm2MB matchlen_match4_match_nolit2_encodeBlockAsm2MB: @@ -2256,6 +1670,8 @@ matchlen_match1_match_nolit2_encodeBlockAsm2MB: CMPB (SI)(R11*1), R9 JNE match_nolit2_end_encodeBlockAsm2MB LEAL 1(R11), R11 + JMP match_nolit2_end_encodeBlockAsm2MB + PCALIGN $0x10 match_nolit2_end_encodeBlockAsm2MB: ADDL R11, DX @@ -2272,11 +1688,11 @@ emit_remainder_encodeBlockAsm2MB: LEAQ (BX)(DX*1), DX LEAQ 4(CX)(AX*1), BX CMPQ BX, (SP) - JB dst_size_check_ok_5 + JB dst_size_check_ok_4 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_5: +dst_size_check_ok_4: // emitLiteral LEAL -1(AX), BX CMPL BX, $0x1d @@ -2350,6 +1766,7 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_4through8: MOVL SI, (CX) MOVL DX, -4(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm2MB + PCALIGN $0x10 
emit_lit_memmove_emit_remainder_encodeBlockAsm2MB_memmove_move_8through16: MOVQ (DX), SI @@ -2413,20 +1830,21 @@ memmove_long_emit_remainder_encodeBlockAsm2MB: LEAQ (CX)(AX*1), BX // genMemMoveLong - MOVOU (DX), X0 - MOVOU 16(DX), X1 - MOVOU -32(DX)(AX*1), X2 - MOVOU -16(DX)(AX*1), X3 - MOVQ AX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm2MBlarge_forward_sse_loop_32 - LEAQ -32(DX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVQ AX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm2MBlarge_forward_sse_loop_32 + LEAQ -32(DX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + PCALIGN $0x10 emit_lit_memmove_long_emit_remainder_encodeBlockAsm2MBlarge_big_loop_back: MOVOU (SI), X4 @@ -2503,341 +1921,37 @@ search_loop_encodeBlockAsm512K: MOVQ (BX)(DX*1), DI MOVL SI, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R8 + SHLQ $0x10, R8 + IMULQ R9, R8 + SHRQ $0x32, R8 MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 + SHRQ $0x08, R10 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x32, R10 - SHLQ $0x10, R11 - IMULQ R9, R11 - SHRQ $0x32, R11 - MOVL (AX)(R10*4), SI - MOVL (AX)(R11*4), R8 + MOVL (AX)(R8*4), SI + MOVL DX, (AX)(R8*4) + MOVL (AX)(R10*4), R8 MOVL DX, (AX)(R10*4) - MOVL DX, (AX)(R11*4) MOVQ DI, R10 SHRQ $0x10, R10 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x32, R10 - MOVL DX, R9 - SUBL 16(SP), R9 - MOVL 1(BX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeBlockAsm512K - LEAL 1(DX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeBlockAsm512K - -repeat_extend_back_loop_encodeBlockAsm512K: - CMPL DI, SI - JBE repeat_extend_back_end_encodeBlockAsm512K - MOVB -1(BX)(R8*1), R9 - MOVB -1(BX)(DI*1), R10 - CMPB R9, R10 - JNE repeat_extend_back_end_encodeBlockAsm512K - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeBlockAsm512K - -repeat_extend_back_end_encodeBlockAsm512K: - MOVL DI, SI - MOVL 12(SP), R8 - SUBL R8, SI - LEAQ 4(CX)(SI*1), R9 - CMPQ R9, (SP) - JB dst_size_check_ok_1 - MOVQ $0x00000000, ret+56(FP) - RET - -dst_size_check_ok_1: - LEAQ (BX)(R8*1), R8 - - // emitLiteral - LEAL -1(SI), R9 - CMPL R9, $0x1d - JB one_byte_repeat_emit_lits_encodeBlockAsm512K - SUBL $0x1d, R9 - CMPL R9, $0x00000100 - JB two_bytes_repeat_emit_lits_encodeBlockAsm512K - CMPL R9, $0x00010000 - JB three_bytes_repeat_emit_lits_encodeBlockAsm512K - MOVL R9, R10 - SHRL $0x10, R10 - MOVB $0xf8, (CX) - MOVW R9, 1(CX) - MOVB R10, 3(CX) - ADDQ $0x04, CX - ADDL $0x1d, R9 - JMP memmove_long_repeat_emit_lits_encodeBlockAsm512K - -three_bytes_repeat_emit_lits_encodeBlockAsm512K: - MOVB $0xf0, (CX) - MOVW R9, 1(CX) - ADDQ $0x03, CX - ADDL $0x1d, R9 - JMP memmove_long_repeat_emit_lits_encodeBlockAsm512K - -two_bytes_repeat_emit_lits_encodeBlockAsm512K: - MOVB $0xe8, (CX) - MOVB R9, 1(CX) - ADDL $0x1d, R9 - ADDQ $0x02, CX - CMPL R9, $0x40 - JB memmove_midrepeat_emit_lits_encodeBlockAsm512K - JMP memmove_long_repeat_emit_lits_encodeBlockAsm512K - -one_byte_repeat_emit_lits_encodeBlockAsm512K: - SHLB $0x03, R9 - MOVB R9, (CX) - ADDQ $0x01, CX - LEAQ (CX)(SI*1), R9 - - // genMemMoveShort - // margin: 16, min move: 1 - CMPQ SI, $0x10 - JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_8through16 - CMPQ SI, $0x20 - JBE 
emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_8through16: - MOVOU (R8), X0 - MOVOU X0, (CX) - JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm512K - -emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(SI*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(SI*1) - JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm512K - -emit_lit_memmove_repeat_emit_lits_encodeBlockAsm512K_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(SI*1) - MOVOU X3, -16(CX)(SI*1) - -memmove_end_copy_repeat_emit_lits_encodeBlockAsm512K: - MOVQ R9, CX - JMP repeat_emit_lits_end_encodeBlockAsm512K - -memmove_midrepeat_emit_lits_encodeBlockAsm512K: - LEAQ (CX)(SI*1), R9 - - // genMemMoveShort - // margin: 15, min move: 30 - CMPQ SI, $0x20 - JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm512K_memmove_move_17through32 - JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm512K_memmove_move_33through64 - -emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm512K_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(SI*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(SI*1) - JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm512K - -emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm512K_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(SI*1) - MOVOU X3, -16(CX)(SI*1) - -memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm512K: - MOVQ R9, CX - JMP repeat_emit_lits_end_encodeBlockAsm512K - -memmove_long_repeat_emit_lits_encodeBlockAsm512K: - LEAQ (CX)(SI*1), R9 - - // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVQ SI, R11 - SHRQ $0x05, R11 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm512Klarge_forward_sse_loop_32 - LEAQ -32(R8)(R12*1), R10 - LEAQ -32(CX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm512Klarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm512Klarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm512Klarge_forward_sse_loop_32: - MOVOU -32(R8)(R12*1), X4 - MOVOU -16(R8)(R12*1), X5 - MOVOA X4, -32(CX)(R12*1) - MOVOA X5, -16(CX)(R12*1) - ADDQ $0x20, R12 - CMPQ SI, R12 - JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm512Klarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(SI*1) - MOVOU X3, -16(CX)(SI*1) - MOVQ R9, CX - -repeat_emit_lits_end_encodeBlockAsm512K: - ADDL $0x05, DX - MOVL DX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL DX, R8 - LEAQ (BX)(DX*1), R9 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R11, R11 - JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm512K - -matchlen_loopback_16_repeat_extend_encodeBlockAsm512K: - MOVQ (R9)(R11*1), R10 - MOVQ 8(R9)(R11*1), R12 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm512K - XORQ 8(SI)(R11*1), R12 - JNZ 
matchlen_bsf_16repeat_extend_encodeBlockAsm512K - LEAL -16(R8), R8 - LEAL 16(R11), R11 - -matchlen_loop_16_entry_repeat_extend_encodeBlockAsm512K: - CMPL R8, $0x10 - JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm512K - JMP matchlen_match8_repeat_extend_encodeBlockAsm512K - -matchlen_bsf_16repeat_extend_encodeBlockAsm512K: - TZCNTQ R12, R12 - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm512K - -matchlen_match8_repeat_extend_encodeBlockAsm512K: - CMPL R8, $0x08 - JB matchlen_match4_repeat_extend_encodeBlockAsm512K - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm512K - LEAL -8(R8), R8 - LEAL 8(R11), R11 - JMP matchlen_match4_repeat_extend_encodeBlockAsm512K - -matchlen_bsf_8_repeat_extend_encodeBlockAsm512K: - TZCNTQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm512K - -matchlen_match4_repeat_extend_encodeBlockAsm512K: - CMPL R8, $0x04 - JB matchlen_match2_repeat_extend_encodeBlockAsm512K - MOVL (R9)(R11*1), R10 - CMPL (SI)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeBlockAsm512K - LEAL -4(R8), R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeBlockAsm512K: - CMPL R8, $0x01 - JE matchlen_match1_repeat_extend_encodeBlockAsm512K - JB repeat_extend_forward_end_encodeBlockAsm512K - MOVW (R9)(R11*1), R10 - CMPW (SI)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeBlockAsm512K - LEAL 2(R11), R11 - SUBL $0x02, R8 - JZ repeat_extend_forward_end_encodeBlockAsm512K - -matchlen_match1_repeat_extend_encodeBlockAsm512K: - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeBlockAsm512K - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeBlockAsm512K: - ADDL R11, DX - MOVL DX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitRepeat - LEAL -1(SI), DI - CMPL SI, $0x1d - JBE repeat_one_match_repeat_encodeBlockAsm512K - LEAL -30(SI), DI - CMPL SI, $0x0000011e - JB repeat_two_match_repeat_encodeBlockAsm512K - CMPL SI, $0x0001001e - JB repeat_three_match_repeat_encodeBlockAsm512K - MOVB $0xfc, (CX) - MOVL DI, 1(CX) - ADDQ $0x04, CX - JMP repeat_end_emit_encodeBlockAsm512K - -repeat_three_match_repeat_encodeBlockAsm512K: - MOVB $0xf4, (CX) - MOVW DI, 1(CX) - ADDQ $0x03, CX - JMP repeat_end_emit_encodeBlockAsm512K - -repeat_two_match_repeat_encodeBlockAsm512K: - MOVB $0xec, (CX) - MOVB DI, 1(CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm512K - -repeat_one_match_repeat_encodeBlockAsm512K: - XORL DI, DI - LEAL -4(DI)(SI*8), DI - MOVB DI, (CX) - ADDQ $0x01, CX - -repeat_end_emit_encodeBlockAsm512K: - MOVL DX, 12(SP) - JMP search_loop_encodeBlockAsm512K - -no_repeat_found_encodeBlockAsm512K: - CMPL (BX)(SI*1), DI - JEQ candidate_match_encodeBlockAsm512K - SHRQ $0x08, DI - MOVL (AX)(R10*4), SI - LEAL 2(DX), R9 - CMPL (BX)(R8*1), DI - JEQ candidate2_match_encodeBlockAsm512K - MOVL R9, (AX)(R10*4) - SHRQ $0x08, DI - CMPL (BX)(SI*1), DI - JEQ candidate3_match_encodeBlockAsm512K - MOVL 20(SP), DX - JMP search_loop_encodeBlockAsm512K + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm512K + SHRQ $0x08, DI + MOVL (AX)(R10*4), SI + LEAL 2(DX), R9 + CMPL (BX)(R8*1), DI + JEQ candidate2_match_encodeBlockAsm512K + MOVL R9, (AX)(R10*4) + SHRQ $0x08, DI + CMPL (BX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm512K + MOVL 20(SP), DX + JMP search_loop_encodeBlockAsm512K candidate3_match_encodeBlockAsm512K: ADDL $0x02, DX @@ -2867,11 +1981,11 @@ match_extend_back_loop_encodeBlockAsm512K: 
match_extend_back_end_encodeBlockAsm512K: CMPQ CX, (SP) - JB dst_size_check_ok_2 + JB dst_size_check_ok_1 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_2: +dst_size_check_ok_1: MOVL DX, R8 MOVL DX, DI SUBL SI, DI @@ -2886,6 +2000,7 @@ dst_size_check_ok_2: // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm512K + PCALIGN $0x10 matchlen_loopback_16_match_nolit_encodeBlockAsm512K: MOVQ (R9)(R11*1), R10 @@ -2901,6 +2016,7 @@ matchlen_loop_16_entry_match_nolit_encodeBlockAsm512K: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit_encodeBlockAsm512K JMP matchlen_match8_match_nolit_encodeBlockAsm512K + PCALIGN $0x10 matchlen_bsf_16match_nolit_encodeBlockAsm512K: TZCNTQ R12, R12 @@ -2917,11 +2033,12 @@ matchlen_match8_match_nolit_encodeBlockAsm512K: LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit_encodeBlockAsm512K + PCALIGN $0x10 matchlen_bsf_8_match_nolit_encodeBlockAsm512K: TZCNTQ R10, R10 SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 + ADDL R10, R11 JMP match_nolit_end_encodeBlockAsm512K matchlen_match4_match_nolit_encodeBlockAsm512K: @@ -2949,6 +2066,8 @@ matchlen_match1_match_nolit_encodeBlockAsm512K: CMPB (SI)(R11*1), R10 JNE match_nolit_end_encodeBlockAsm512K LEAL 1(R11), R11 + JMP match_nolit_end_encodeBlockAsm512K + PCALIGN $0x10 match_nolit_end_encodeBlockAsm512K: ADDL R11, DX @@ -3067,11 +2186,11 @@ repeat_one_match_emit_repeat_copy2_encodeBlockAsm512K: match_emit_lits_copy_encodeBlockAsm512K: LEAQ 4(CX)(R8*1), R9 CMPQ R9, (SP) - JB dst_size_check_ok_3 + JB dst_size_check_ok_2 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_3: +dst_size_check_ok_2: // emitLiteral LEAL -1(R8), R9 CMPL R9, $0x1d @@ -3119,6 +2238,7 @@ one_byte_match_emit_encodeBlockAsm512K: CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm512K_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm512K_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBlockAsm512K_memmove_move_8through16: MOVOU (DI), X0 @@ -3180,20 +2300,21 @@ memmove_long_match_emit_encodeBlockAsm512K: LEAQ (CX)(R8*1), R9 // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R8*1), X2 - MOVOU -16(DI)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm512Klarge_forward_sse_loop_32 - LEAQ -32(DI)(R13*1), R10 - LEAQ -32(CX)(R13*1), R14 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm512Klarge_forward_sse_loop_32 + LEAQ -32(DI)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + PCALIGN $0x10 emit_lit_memmove_long_match_emit_encodeBlockAsm512Klarge_big_loop_back: MOVOU (R10), X4 @@ -3219,6 +2340,8 @@ emit_lit_memmove_long_match_emit_encodeBlockAsm512Klarge_forward_sse_loop_32: MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm512K + PCALIGN $0x10 match_nolits_copy_encodeBlockAsm512K: // emitCopy @@ -3396,11 +2519,11 @@ match_nolit_dst_ok_encodeBlockAsm512K: SUBL SI, DI MOVL DI, 16(SP) CMPQ CX, (SP) - JB dst_size_check_ok_4 + JB dst_size_check_ok_3 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_4: +dst_size_check_ok_3: ADDL $0x03, DX ADDL $0x04, SI MOVQ src_len+32(FP), DI @@ -3411,6 +2534,7 @@ dst_size_check_ok_4: // matchLen XORL R11, R11 JMP 
matchlen_loop_16_entry_match_nolit2_encodeBlockAsm512K + PCALIGN $0x10 matchlen_loopback_16_match_nolit2_encodeBlockAsm512K: MOVQ (R8)(R11*1), R9 @@ -3426,6 +2550,7 @@ matchlen_loop_16_entry_match_nolit2_encodeBlockAsm512K: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm512K JMP matchlen_match8_match_nolit2_encodeBlockAsm512K + PCALIGN $0x10 matchlen_bsf_16match_nolit2_encodeBlockAsm512K: TZCNTQ R10, R10 @@ -3442,11 +2567,12 @@ matchlen_match8_match_nolit2_encodeBlockAsm512K: LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit2_encodeBlockAsm512K + PCALIGN $0x10 matchlen_bsf_8_match_nolit2_encodeBlockAsm512K: TZCNTQ R9, R9 SARQ $0x03, R9 - LEAL (R11)(R9*1), R11 + ADDL R9, R11 JMP match_nolit2_end_encodeBlockAsm512K matchlen_match4_match_nolit2_encodeBlockAsm512K: @@ -3474,6 +2600,8 @@ matchlen_match1_match_nolit2_encodeBlockAsm512K: CMPB (SI)(R11*1), R9 JNE match_nolit2_end_encodeBlockAsm512K LEAL 1(R11), R11 + JMP match_nolit2_end_encodeBlockAsm512K + PCALIGN $0x10 match_nolit2_end_encodeBlockAsm512K: ADDL R11, DX @@ -3490,11 +2618,11 @@ emit_remainder_encodeBlockAsm512K: LEAQ (BX)(DX*1), DX LEAQ 4(CX)(AX*1), BX CMPQ BX, (SP) - JB dst_size_check_ok_5 + JB dst_size_check_ok_4 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_5: +dst_size_check_ok_4: // emitLiteral LEAL -1(AX), BX CMPL BX, $0x1d @@ -3568,6 +2696,7 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_4through8: MOVL SI, (CX) MOVL DX, -4(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm512K + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBlockAsm512K_memmove_move_8through16: MOVQ (DX), SI @@ -3631,20 +2760,21 @@ memmove_long_emit_remainder_encodeBlockAsm512K: LEAQ (CX)(AX*1), BX // genMemMoveLong - MOVOU (DX), X0 - MOVOU 16(DX), X1 - MOVOU -32(DX)(AX*1), X2 - MOVOU -16(DX)(AX*1), X3 - MOVQ AX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm512Klarge_forward_sse_loop_32 - LEAQ -32(DX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVQ AX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm512Klarge_forward_sse_loop_32 + LEAQ -32(DX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + PCALIGN $0x10 emit_lit_memmove_long_emit_remainder_encodeBlockAsm512Klarge_big_loop_back: MOVOU (SI), X4 @@ -3721,327 +2851,24 @@ search_loop_encodeBlockAsm64K: MOVQ (BX)(DX*1), DI MOVL SI, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R9 + MOVQ DI, R8 + SHLQ $0x10, R8 + IMULQ R9, R8 + SHRQ $0x33, R8 MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 + SHRQ $0x08, R10 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x33, R10 - SHLQ $0x10, R11 - IMULQ R9, R11 - SHRQ $0x33, R11 - MOVWLZX (AX)(R10*2), SI - MOVWLZX (AX)(R11*2), R8 + MOVWLZX (AX)(R8*2), SI + MOVW DX, (AX)(R8*2) + MOVWLZX (AX)(R10*2), R8 MOVW DX, (AX)(R10*2) - MOVW DX, (AX)(R11*2) MOVQ DI, R10 SHRQ $0x10, R10 SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x33, R10 - MOVL DX, R9 - SUBL 16(SP), R9 - MOVL 1(BX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeBlockAsm64K - LEAL 1(DX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeBlockAsm64K - -repeat_extend_back_loop_encodeBlockAsm64K: - CMPL DI, SI - JBE repeat_extend_back_end_encodeBlockAsm64K - MOVB -1(BX)(R8*1), R9 - MOVB -1(BX)(DI*1), R10 - CMPB R9, 
R10 - JNE repeat_extend_back_end_encodeBlockAsm64K - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeBlockAsm64K - -repeat_extend_back_end_encodeBlockAsm64K: - MOVL DI, SI - MOVL 12(SP), R8 - SUBL R8, SI - LEAQ 4(CX)(SI*1), R9 - CMPQ R9, (SP) - JB dst_size_check_ok_1 - MOVQ $0x00000000, ret+56(FP) - RET - -dst_size_check_ok_1: - LEAQ (BX)(R8*1), R8 - - // emitLiteral - LEAL -1(SI), R9 - CMPL R9, $0x1d - JB one_byte_repeat_emit_lits_encodeBlockAsm64K - SUBL $0x1d, R9 - CMPL R9, $0x00000100 - JB two_bytes_repeat_emit_lits_encodeBlockAsm64K - JB three_bytes_repeat_emit_lits_encodeBlockAsm64K - MOVL R9, R10 - SHRL $0x10, R10 - MOVB $0xf8, (CX) - MOVW R9, 1(CX) - MOVB R10, 3(CX) - ADDQ $0x04, CX - ADDL $0x1d, R9 - JMP memmove_long_repeat_emit_lits_encodeBlockAsm64K - -three_bytes_repeat_emit_lits_encodeBlockAsm64K: - MOVB $0xf0, (CX) - MOVW R9, 1(CX) - ADDQ $0x03, CX - ADDL $0x1d, R9 - JMP memmove_long_repeat_emit_lits_encodeBlockAsm64K - -two_bytes_repeat_emit_lits_encodeBlockAsm64K: - MOVB $0xe8, (CX) - MOVB R9, 1(CX) - ADDL $0x1d, R9 - ADDQ $0x02, CX - CMPL R9, $0x40 - JB memmove_midrepeat_emit_lits_encodeBlockAsm64K - JMP memmove_long_repeat_emit_lits_encodeBlockAsm64K - -one_byte_repeat_emit_lits_encodeBlockAsm64K: - SHLB $0x03, R9 - MOVB R9, (CX) - ADDQ $0x01, CX - LEAQ (CX)(SI*1), R9 - - // genMemMoveShort - // margin: 16, min move: 1 - CMPQ SI, $0x10 - JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_8through16 - CMPQ SI, $0x20 - JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_8through16: - MOVOU (R8), X0 - MOVOU X0, (CX) - JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm64K - -emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(SI*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(SI*1) - JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm64K - -emit_lit_memmove_repeat_emit_lits_encodeBlockAsm64K_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(SI*1) - MOVOU X3, -16(CX)(SI*1) - -memmove_end_copy_repeat_emit_lits_encodeBlockAsm64K: - MOVQ R9, CX - JMP repeat_emit_lits_end_encodeBlockAsm64K - -memmove_midrepeat_emit_lits_encodeBlockAsm64K: - LEAQ (CX)(SI*1), R9 - - // genMemMoveShort - // margin: 15, min move: 30 - CMPQ SI, $0x20 - JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm64K_memmove_move_17through32 - JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm64K_memmove_move_33through64 - -emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm64K_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(SI*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(SI*1) - JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm64K - -emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm64K_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(SI*1) - MOVOU X3, -16(CX)(SI*1) - -memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm64K: - MOVQ R9, CX - JMP repeat_emit_lits_end_encodeBlockAsm64K - -memmove_long_repeat_emit_lits_encodeBlockAsm64K: - LEAQ (CX)(SI*1), R9 - - // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVQ SI, R11 - 
SHRQ $0x05, R11 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm64Klarge_forward_sse_loop_32 - LEAQ -32(R8)(R12*1), R10 - LEAQ -32(CX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm64Klarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm64Klarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm64Klarge_forward_sse_loop_32: - MOVOU -32(R8)(R12*1), X4 - MOVOU -16(R8)(R12*1), X5 - MOVOA X4, -32(CX)(R12*1) - MOVOA X5, -16(CX)(R12*1) - ADDQ $0x20, R12 - CMPQ SI, R12 - JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm64Klarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(SI*1) - MOVOU X3, -16(CX)(SI*1) - MOVQ R9, CX - -repeat_emit_lits_end_encodeBlockAsm64K: - ADDL $0x05, DX - MOVL DX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL DX, R8 - LEAQ (BX)(DX*1), R9 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R11, R11 - JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm64K - -matchlen_loopback_16_repeat_extend_encodeBlockAsm64K: - MOVQ (R9)(R11*1), R10 - MOVQ 8(R9)(R11*1), R12 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm64K - XORQ 8(SI)(R11*1), R12 - JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm64K - LEAL -16(R8), R8 - LEAL 16(R11), R11 - -matchlen_loop_16_entry_repeat_extend_encodeBlockAsm64K: - CMPL R8, $0x10 - JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm64K - JMP matchlen_match8_repeat_extend_encodeBlockAsm64K - -matchlen_bsf_16repeat_extend_encodeBlockAsm64K: - TZCNTQ R12, R12 - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm64K - -matchlen_match8_repeat_extend_encodeBlockAsm64K: - CMPL R8, $0x08 - JB matchlen_match4_repeat_extend_encodeBlockAsm64K - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm64K - LEAL -8(R8), R8 - LEAL 8(R11), R11 - JMP matchlen_match4_repeat_extend_encodeBlockAsm64K - -matchlen_bsf_8_repeat_extend_encodeBlockAsm64K: - TZCNTQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm64K - -matchlen_match4_repeat_extend_encodeBlockAsm64K: - CMPL R8, $0x04 - JB matchlen_match2_repeat_extend_encodeBlockAsm64K - MOVL (R9)(R11*1), R10 - CMPL (SI)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeBlockAsm64K - LEAL -4(R8), R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeBlockAsm64K: - CMPL R8, $0x01 - JE matchlen_match1_repeat_extend_encodeBlockAsm64K - JB repeat_extend_forward_end_encodeBlockAsm64K - MOVW (R9)(R11*1), R10 - CMPW (SI)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeBlockAsm64K - LEAL 2(R11), R11 - SUBL $0x02, R8 - JZ repeat_extend_forward_end_encodeBlockAsm64K - -matchlen_match1_repeat_extend_encodeBlockAsm64K: - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeBlockAsm64K - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeBlockAsm64K: - ADDL R11, DX - MOVL DX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitRepeat - LEAL -1(SI), DI - CMPL SI, $0x1d - JBE repeat_one_match_repeat_encodeBlockAsm64K - LEAL -30(SI), DI - CMPL SI, $0x0000011e - JB repeat_two_match_repeat_encodeBlockAsm64K - CMPL SI, $0x0001001e - JB repeat_three_match_repeat_encodeBlockAsm64K - MOVB $0xfc, (CX) - MOVL DI, 
1(CX) - ADDQ $0x04, CX - JMP repeat_end_emit_encodeBlockAsm64K - -repeat_three_match_repeat_encodeBlockAsm64K: - MOVB $0xf4, (CX) - MOVW DI, 1(CX) - ADDQ $0x03, CX - JMP repeat_end_emit_encodeBlockAsm64K - -repeat_two_match_repeat_encodeBlockAsm64K: - MOVB $0xec, (CX) - MOVB DI, 1(CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm64K - -repeat_one_match_repeat_encodeBlockAsm64K: - XORL DI, DI - LEAL -4(DI)(SI*8), DI - MOVB DI, (CX) - ADDQ $0x01, CX - -repeat_end_emit_encodeBlockAsm64K: - MOVL DX, 12(SP) - JMP search_loop_encodeBlockAsm64K - -no_repeat_found_encodeBlockAsm64K: CMPL (BX)(SI*1), DI JEQ candidate_match_encodeBlockAsm64K SHRQ $0x08, DI @@ -4084,11 +2911,11 @@ match_extend_back_loop_encodeBlockAsm64K: match_extend_back_end_encodeBlockAsm64K: CMPQ CX, (SP) - JB dst_size_check_ok_2 + JB dst_size_check_ok_1 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_2: +dst_size_check_ok_1: MOVL DX, R8 MOVL DX, DI SUBL SI, DI @@ -4103,6 +2930,7 @@ dst_size_check_ok_2: // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm64K + PCALIGN $0x10 matchlen_loopback_16_match_nolit_encodeBlockAsm64K: MOVQ (R9)(R11*1), R10 @@ -4118,6 +2946,7 @@ matchlen_loop_16_entry_match_nolit_encodeBlockAsm64K: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit_encodeBlockAsm64K JMP matchlen_match8_match_nolit_encodeBlockAsm64K + PCALIGN $0x10 matchlen_bsf_16match_nolit_encodeBlockAsm64K: TZCNTQ R12, R12 @@ -4134,11 +2963,12 @@ matchlen_match8_match_nolit_encodeBlockAsm64K: LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit_encodeBlockAsm64K + PCALIGN $0x10 matchlen_bsf_8_match_nolit_encodeBlockAsm64K: TZCNTQ R10, R10 SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 + ADDL R10, R11 JMP match_nolit_end_encodeBlockAsm64K matchlen_match4_match_nolit_encodeBlockAsm64K: @@ -4166,6 +2996,8 @@ matchlen_match1_match_nolit_encodeBlockAsm64K: CMPB (SI)(R11*1), R10 JNE match_nolit_end_encodeBlockAsm64K LEAL 1(R11), R11 + JMP match_nolit_end_encodeBlockAsm64K + PCALIGN $0x10 match_nolit_end_encodeBlockAsm64K: ADDL R11, DX @@ -4238,11 +3070,11 @@ repeat_one_match_emit_repeat_copy2_encodeBlockAsm64K: match_emit_lits_copy_encodeBlockAsm64K: LEAQ 4(CX)(R8*1), R9 CMPQ R9, (SP) - JB dst_size_check_ok_3 + JB dst_size_check_ok_2 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_3: +dst_size_check_ok_2: // emitLiteral LEAL -1(R8), R9 CMPL R9, $0x1d @@ -4289,6 +3121,7 @@ one_byte_match_emit_encodeBlockAsm64K: CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm64K_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBlockAsm64K_memmove_move_8through16: MOVOU (DI), X0 @@ -4350,20 +3183,21 @@ memmove_long_match_emit_encodeBlockAsm64K: LEAQ (CX)(R8*1), R9 // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R8*1), X2 - MOVOU -16(DI)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm64Klarge_forward_sse_loop_32 - LEAQ -32(DI)(R13*1), R10 - LEAQ -32(CX)(R13*1), R14 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(DI)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + PCALIGN $0x10 
emit_lit_memmove_long_match_emit_encodeBlockAsm64Klarge_big_loop_back: MOVOU (R10), X4 @@ -4389,6 +3223,8 @@ emit_lit_memmove_long_match_emit_encodeBlockAsm64Klarge_forward_sse_loop_32: MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm64K + PCALIGN $0x10 match_nolits_copy_encodeBlockAsm64K: // emitCopy @@ -4523,11 +3359,11 @@ match_nolit_dst_ok_encodeBlockAsm64K: SUBL SI, DI MOVL DI, 16(SP) CMPQ CX, (SP) - JB dst_size_check_ok_4 + JB dst_size_check_ok_3 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_4: +dst_size_check_ok_3: ADDL $0x03, DX ADDL $0x04, SI MOVQ src_len+32(FP), DI @@ -4538,6 +3374,7 @@ dst_size_check_ok_4: // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm64K + PCALIGN $0x10 matchlen_loopback_16_match_nolit2_encodeBlockAsm64K: MOVQ (R8)(R11*1), R9 @@ -4553,6 +3390,7 @@ matchlen_loop_16_entry_match_nolit2_encodeBlockAsm64K: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm64K JMP matchlen_match8_match_nolit2_encodeBlockAsm64K + PCALIGN $0x10 matchlen_bsf_16match_nolit2_encodeBlockAsm64K: TZCNTQ R10, R10 @@ -4569,11 +3407,12 @@ matchlen_match8_match_nolit2_encodeBlockAsm64K: LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit2_encodeBlockAsm64K + PCALIGN $0x10 matchlen_bsf_8_match_nolit2_encodeBlockAsm64K: TZCNTQ R9, R9 SARQ $0x03, R9 - LEAL (R11)(R9*1), R11 + ADDL R9, R11 JMP match_nolit2_end_encodeBlockAsm64K matchlen_match4_match_nolit2_encodeBlockAsm64K: @@ -4601,6 +3440,8 @@ matchlen_match1_match_nolit2_encodeBlockAsm64K: CMPB (SI)(R11*1), R9 JNE match_nolit2_end_encodeBlockAsm64K LEAL 1(R11), R11 + JMP match_nolit2_end_encodeBlockAsm64K + PCALIGN $0x10 match_nolit2_end_encodeBlockAsm64K: ADDL R11, DX @@ -4617,11 +3458,11 @@ emit_remainder_encodeBlockAsm64K: LEAQ (BX)(DX*1), DX LEAQ 4(CX)(AX*1), BX CMPQ BX, (SP) - JB dst_size_check_ok_5 + JB dst_size_check_ok_4 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_5: +dst_size_check_ok_4: // emitLiteral LEAL -1(AX), BX CMPL BX, $0x1d @@ -4694,6 +3535,7 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_4through8: MOVL SI, (CX) MOVL DX, -4(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm64K + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBlockAsm64K_memmove_move_8through16: MOVQ (DX), SI @@ -4757,20 +3599,21 @@ memmove_long_emit_remainder_encodeBlockAsm64K: LEAQ (CX)(AX*1), BX // genMemMoveLong - MOVOU (DX), X0 - MOVOU 16(DX), X1 - MOVOU -32(DX)(AX*1), X2 - MOVOU -16(DX)(AX*1), X3 - MOVQ AX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm64Klarge_forward_sse_loop_32 - LEAQ -32(DX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVQ AX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(DX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + PCALIGN $0x10 emit_lit_memmove_long_emit_remainder_encodeBlockAsm64Klarge_big_loop_back: MOVOU (SI), X4 @@ -4847,319 +3690,24 @@ search_loop_encodeBlockAsm16K: MOVQ (BX)(DX*1), DI MOVL SI, 20(SP) MOVQ $0x000000cf1bbcdcbb, R9 + MOVQ DI, R8 + SHLQ $0x18, R8 + IMULQ R9, R8 + SHRQ $0x34, R8 MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 + SHRQ $0x08, R10 SHLQ $0x18, R10 IMULQ R9, R10 SHRQ $0x34, R10 - SHLQ $0x18, R11 - IMULQ R9, 
R11 - SHRQ $0x34, R11 - MOVWLZX (AX)(R10*2), SI - MOVWLZX (AX)(R11*2), R8 + MOVWLZX (AX)(R8*2), SI + MOVW DX, (AX)(R8*2) + MOVWLZX (AX)(R10*2), R8 MOVW DX, (AX)(R10*2) - MOVW DX, (AX)(R11*2) MOVQ DI, R10 SHRQ $0x10, R10 SHLQ $0x18, R10 IMULQ R9, R10 SHRQ $0x34, R10 - MOVL DX, R9 - SUBL 16(SP), R9 - MOVL 1(BX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeBlockAsm16K - LEAL 1(DX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeBlockAsm16K - -repeat_extend_back_loop_encodeBlockAsm16K: - CMPL DI, SI - JBE repeat_extend_back_end_encodeBlockAsm16K - MOVB -1(BX)(R8*1), R9 - MOVB -1(BX)(DI*1), R10 - CMPB R9, R10 - JNE repeat_extend_back_end_encodeBlockAsm16K - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeBlockAsm16K - -repeat_extend_back_end_encodeBlockAsm16K: - MOVL DI, SI - MOVL 12(SP), R8 - SUBL R8, SI - LEAQ 3(CX)(SI*1), R9 - CMPQ R9, (SP) - JB dst_size_check_ok_1 - MOVQ $0x00000000, ret+56(FP) - RET - -dst_size_check_ok_1: - LEAQ (BX)(R8*1), R8 - - // emitLiteral - LEAL -1(SI), R9 - CMPL R9, $0x1d - JB one_byte_repeat_emit_lits_encodeBlockAsm16K - SUBL $0x1d, R9 - CMPL R9, $0x00000100 - JB two_bytes_repeat_emit_lits_encodeBlockAsm16K - JB three_bytes_repeat_emit_lits_encodeBlockAsm16K - -three_bytes_repeat_emit_lits_encodeBlockAsm16K: - MOVB $0xf0, (CX) - MOVW R9, 1(CX) - ADDQ $0x03, CX - ADDL $0x1d, R9 - JMP memmove_long_repeat_emit_lits_encodeBlockAsm16K - -two_bytes_repeat_emit_lits_encodeBlockAsm16K: - MOVB $0xe8, (CX) - MOVB R9, 1(CX) - ADDL $0x1d, R9 - ADDQ $0x02, CX - CMPL R9, $0x40 - JB memmove_midrepeat_emit_lits_encodeBlockAsm16K - JMP memmove_long_repeat_emit_lits_encodeBlockAsm16K - -one_byte_repeat_emit_lits_encodeBlockAsm16K: - SHLB $0x03, R9 - MOVB R9, (CX) - ADDQ $0x01, CX - LEAQ (CX)(SI*1), R9 - - // genMemMoveShort - // margin: 16, min move: 1 - CMPQ SI, $0x10 - JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_8through16 - CMPQ SI, $0x20 - JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_8through16: - MOVOU (R8), X0 - MOVOU X0, (CX) - JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm16K - -emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(SI*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(SI*1) - JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm16K - -emit_lit_memmove_repeat_emit_lits_encodeBlockAsm16K_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(SI*1) - MOVOU X3, -16(CX)(SI*1) - -memmove_end_copy_repeat_emit_lits_encodeBlockAsm16K: - MOVQ R9, CX - JMP repeat_emit_lits_end_encodeBlockAsm16K - -memmove_midrepeat_emit_lits_encodeBlockAsm16K: - LEAQ (CX)(SI*1), R9 - - // genMemMoveShort - // margin: 15, min move: 30 - CMPQ SI, $0x20 - JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm16K_memmove_move_17through32 - JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm16K_memmove_move_33through64 - -emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm16K_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(SI*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(SI*1) - JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm16K - 
-emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm16K_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(SI*1) - MOVOU X3, -16(CX)(SI*1) - -memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm16K: - MOVQ R9, CX - JMP repeat_emit_lits_end_encodeBlockAsm16K - -memmove_long_repeat_emit_lits_encodeBlockAsm16K: - LEAQ (CX)(SI*1), R9 - - // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVQ SI, R11 - SHRQ $0x05, R11 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm16Klarge_forward_sse_loop_32 - LEAQ -32(R8)(R12*1), R10 - LEAQ -32(CX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm16Klarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm16Klarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm16Klarge_forward_sse_loop_32: - MOVOU -32(R8)(R12*1), X4 - MOVOU -16(R8)(R12*1), X5 - MOVOA X4, -32(CX)(R12*1) - MOVOA X5, -16(CX)(R12*1) - ADDQ $0x20, R12 - CMPQ SI, R12 - JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm16Klarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(SI*1) - MOVOU X3, -16(CX)(SI*1) - MOVQ R9, CX - -repeat_emit_lits_end_encodeBlockAsm16K: - ADDL $0x05, DX - MOVL DX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL DX, R8 - LEAQ (BX)(DX*1), R9 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R11, R11 - JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm16K - -matchlen_loopback_16_repeat_extend_encodeBlockAsm16K: - MOVQ (R9)(R11*1), R10 - MOVQ 8(R9)(R11*1), R12 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm16K - XORQ 8(SI)(R11*1), R12 - JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm16K - LEAL -16(R8), R8 - LEAL 16(R11), R11 - -matchlen_loop_16_entry_repeat_extend_encodeBlockAsm16K: - CMPL R8, $0x10 - JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm16K - JMP matchlen_match8_repeat_extend_encodeBlockAsm16K - -matchlen_bsf_16repeat_extend_encodeBlockAsm16K: - TZCNTQ R12, R12 - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm16K - -matchlen_match8_repeat_extend_encodeBlockAsm16K: - CMPL R8, $0x08 - JB matchlen_match4_repeat_extend_encodeBlockAsm16K - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm16K - LEAL -8(R8), R8 - LEAL 8(R11), R11 - JMP matchlen_match4_repeat_extend_encodeBlockAsm16K - -matchlen_bsf_8_repeat_extend_encodeBlockAsm16K: - TZCNTQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm16K - -matchlen_match4_repeat_extend_encodeBlockAsm16K: - CMPL R8, $0x04 - JB matchlen_match2_repeat_extend_encodeBlockAsm16K - MOVL (R9)(R11*1), R10 - CMPL (SI)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeBlockAsm16K - LEAL -4(R8), R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeBlockAsm16K: - CMPL R8, $0x01 - JE matchlen_match1_repeat_extend_encodeBlockAsm16K - JB repeat_extend_forward_end_encodeBlockAsm16K - MOVW (R9)(R11*1), R10 - CMPW (SI)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeBlockAsm16K - LEAL 2(R11), R11 - SUBL $0x02, R8 - JZ 
repeat_extend_forward_end_encodeBlockAsm16K - -matchlen_match1_repeat_extend_encodeBlockAsm16K: - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeBlockAsm16K - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeBlockAsm16K: - ADDL R11, DX - MOVL DX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitRepeat - LEAL -1(SI), DI - CMPL SI, $0x1d - JBE repeat_one_match_repeat_encodeBlockAsm16K - LEAL -30(SI), DI - CMPL SI, $0x0000011e - JB repeat_two_match_repeat_encodeBlockAsm16K - CMPL SI, $0x0001001e - JB repeat_three_match_repeat_encodeBlockAsm16K - MOVB $0xfc, (CX) - MOVL DI, 1(CX) - ADDQ $0x04, CX - JMP repeat_end_emit_encodeBlockAsm16K - -repeat_three_match_repeat_encodeBlockAsm16K: - MOVB $0xf4, (CX) - MOVW DI, 1(CX) - ADDQ $0x03, CX - JMP repeat_end_emit_encodeBlockAsm16K - -repeat_two_match_repeat_encodeBlockAsm16K: - MOVB $0xec, (CX) - MOVB DI, 1(CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm16K - -repeat_one_match_repeat_encodeBlockAsm16K: - XORL DI, DI - LEAL -4(DI)(SI*8), DI - MOVB DI, (CX) - ADDQ $0x01, CX - -repeat_end_emit_encodeBlockAsm16K: - MOVL DX, 12(SP) - JMP search_loop_encodeBlockAsm16K - -no_repeat_found_encodeBlockAsm16K: CMPL (BX)(SI*1), DI JEQ candidate_match_encodeBlockAsm16K SHRQ $0x08, DI @@ -5202,11 +3750,11 @@ match_extend_back_loop_encodeBlockAsm16K: match_extend_back_end_encodeBlockAsm16K: CMPQ CX, (SP) - JB dst_size_check_ok_2 + JB dst_size_check_ok_1 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_2: +dst_size_check_ok_1: MOVL DX, R8 MOVL DX, DI SUBL SI, DI @@ -5221,6 +3769,7 @@ dst_size_check_ok_2: // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm16K + PCALIGN $0x10 matchlen_loopback_16_match_nolit_encodeBlockAsm16K: MOVQ (R9)(R11*1), R10 @@ -5236,6 +3785,7 @@ matchlen_loop_16_entry_match_nolit_encodeBlockAsm16K: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit_encodeBlockAsm16K JMP matchlen_match8_match_nolit_encodeBlockAsm16K + PCALIGN $0x10 matchlen_bsf_16match_nolit_encodeBlockAsm16K: TZCNTQ R12, R12 @@ -5252,11 +3802,12 @@ matchlen_match8_match_nolit_encodeBlockAsm16K: LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit_encodeBlockAsm16K + PCALIGN $0x10 matchlen_bsf_8_match_nolit_encodeBlockAsm16K: TZCNTQ R10, R10 SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 + ADDL R10, R11 JMP match_nolit_end_encodeBlockAsm16K matchlen_match4_match_nolit_encodeBlockAsm16K: @@ -5284,6 +3835,8 @@ matchlen_match1_match_nolit_encodeBlockAsm16K: CMPB (SI)(R11*1), R10 JNE match_nolit_end_encodeBlockAsm16K LEAL 1(R11), R11 + JMP match_nolit_end_encodeBlockAsm16K + PCALIGN $0x10 match_nolit_end_encodeBlockAsm16K: ADDL R11, DX @@ -5356,11 +3909,11 @@ repeat_one_match_emit_repeat_copy2_encodeBlockAsm16K: match_emit_lits_copy_encodeBlockAsm16K: LEAQ 3(CX)(R8*1), R9 CMPQ R9, (SP) - JB dst_size_check_ok_3 + JB dst_size_check_ok_2 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_3: +dst_size_check_ok_2: // emitLiteral LEAL -1(R8), R9 CMPL R9, $0x1d @@ -5399,6 +3952,7 @@ one_byte_match_emit_encodeBlockAsm16K: CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm16K_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm16K_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBlockAsm16K_memmove_move_8through16: MOVOU (DI), X0 @@ -5460,20 +4014,21 @@ memmove_long_match_emit_encodeBlockAsm16K: LEAQ (CX)(R8*1), R9 // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R8*1), X2 - MOVOU -16(DI)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 
- MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm16Klarge_forward_sse_loop_32 - LEAQ -32(DI)(R13*1), R10 - LEAQ -32(CX)(R13*1), R14 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm16Klarge_forward_sse_loop_32 + LEAQ -32(DI)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + PCALIGN $0x10 emit_lit_memmove_long_match_emit_encodeBlockAsm16Klarge_big_loop_back: MOVOU (R10), X4 @@ -5499,6 +4054,8 @@ emit_lit_memmove_long_match_emit_encodeBlockAsm16Klarge_forward_sse_loop_32: MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm16K + PCALIGN $0x10 match_nolits_copy_encodeBlockAsm16K: // emitCopy @@ -5633,11 +4190,11 @@ match_nolit_dst_ok_encodeBlockAsm16K: SUBL SI, DI MOVL DI, 16(SP) CMPQ CX, (SP) - JB dst_size_check_ok_4 + JB dst_size_check_ok_3 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_4: +dst_size_check_ok_3: ADDL $0x03, DX ADDL $0x04, SI MOVQ src_len+32(FP), DI @@ -5648,6 +4205,7 @@ dst_size_check_ok_4: // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm16K + PCALIGN $0x10 matchlen_loopback_16_match_nolit2_encodeBlockAsm16K: MOVQ (R8)(R11*1), R9 @@ -5663,6 +4221,7 @@ matchlen_loop_16_entry_match_nolit2_encodeBlockAsm16K: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm16K JMP matchlen_match8_match_nolit2_encodeBlockAsm16K + PCALIGN $0x10 matchlen_bsf_16match_nolit2_encodeBlockAsm16K: TZCNTQ R10, R10 @@ -5679,11 +4238,12 @@ matchlen_match8_match_nolit2_encodeBlockAsm16K: LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit2_encodeBlockAsm16K + PCALIGN $0x10 matchlen_bsf_8_match_nolit2_encodeBlockAsm16K: TZCNTQ R9, R9 SARQ $0x03, R9 - LEAL (R11)(R9*1), R11 + ADDL R9, R11 JMP match_nolit2_end_encodeBlockAsm16K matchlen_match4_match_nolit2_encodeBlockAsm16K: @@ -5711,6 +4271,8 @@ matchlen_match1_match_nolit2_encodeBlockAsm16K: CMPB (SI)(R11*1), R9 JNE match_nolit2_end_encodeBlockAsm16K LEAL 1(R11), R11 + JMP match_nolit2_end_encodeBlockAsm16K + PCALIGN $0x10 match_nolit2_end_encodeBlockAsm16K: ADDL R11, DX @@ -5727,11 +4289,11 @@ emit_remainder_encodeBlockAsm16K: LEAQ (BX)(DX*1), DX LEAQ 3(CX)(AX*1), BX CMPQ BX, (SP) - JB dst_size_check_ok_5 + JB dst_size_check_ok_4 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_5: +dst_size_check_ok_4: // emitLiteral LEAL -1(AX), BX CMPL BX, $0x1d @@ -5796,6 +4358,7 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_4through8: MOVL SI, (CX) MOVL DX, -4(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm16K + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBlockAsm16K_memmove_move_8through16: MOVQ (DX), SI @@ -5859,20 +4422,21 @@ memmove_long_emit_remainder_encodeBlockAsm16K: LEAQ (CX)(AX*1), BX // genMemMoveLong - MOVOU (DX), X0 - MOVOU 16(DX), X1 - MOVOU -32(DX)(AX*1), X2 - MOVOU -16(DX)(AX*1), X3 - MOVQ AX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm16Klarge_forward_sse_loop_32 - LEAQ -32(DX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVQ AX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + 
SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm16Klarge_forward_sse_loop_32 + LEAQ -32(DX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + PCALIGN $0x10 emit_lit_memmove_long_emit_remainder_encodeBlockAsm16Klarge_big_loop_back: MOVOU (SI), X4 @@ -5949,363 +4513,71 @@ search_loop_encodeBlockAsm4K: MOVQ (BX)(DX*1), DI MOVL SI, 20(SP) MOVQ $0x9e3779b1, R9 + MOVQ DI, R8 + SHLQ $0x20, R8 + IMULQ R9, R8 + SHRQ $0x36, R8 MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 + SHRQ $0x08, R10 + SHLQ $0x20, R10 IMULQ R9, R10 SHRQ $0x36, R10 - IMULQ R9, R11 - SHRQ $0x36, R11 - MOVWLZX (AX)(R10*2), SI - MOVWLZX (AX)(R11*2), R8 + MOVWLZX (AX)(R8*2), SI + MOVW DX, (AX)(R8*2) + MOVWLZX (AX)(R10*2), R8 MOVW DX, (AX)(R10*2) - MOVW DX, (AX)(R11*2) MOVQ DI, R10 SHRQ $0x10, R10 + SHLQ $0x20, R10 IMULQ R9, R10 SHRQ $0x36, R10 - MOVL DX, R9 - SUBL 16(SP), R9 - MOVL 1(BX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeBlockAsm4K - LEAL 1(DX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeBlockAsm4K - -repeat_extend_back_loop_encodeBlockAsm4K: - CMPL DI, SI - JBE repeat_extend_back_end_encodeBlockAsm4K - MOVB -1(BX)(R8*1), R9 - MOVB -1(BX)(DI*1), R10 - CMPB R9, R10 - JNE repeat_extend_back_end_encodeBlockAsm4K - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeBlockAsm4K - -repeat_extend_back_end_encodeBlockAsm4K: - MOVL DI, SI - MOVL 12(SP), R8 - SUBL R8, SI - LEAQ 3(CX)(SI*1), R9 - CMPQ R9, (SP) + CMPL (BX)(SI*1), DI + JEQ candidate_match_encodeBlockAsm4K + SHRQ $0x08, DI + MOVWLZX (AX)(R10*2), SI + LEAL 2(DX), R9 + CMPL (BX)(R8*1), DI + JEQ candidate2_match_encodeBlockAsm4K + MOVW R9, (AX)(R10*2) + SHRQ $0x08, DI + CMPL (BX)(SI*1), DI + JEQ candidate3_match_encodeBlockAsm4K + MOVL 20(SP), DX + JMP search_loop_encodeBlockAsm4K + +candidate3_match_encodeBlockAsm4K: + ADDL $0x02, DX + JMP candidate_match_encodeBlockAsm4K + +candidate2_match_encodeBlockAsm4K: + MOVW R9, (AX)(R10*2) + INCL DX + MOVL R8, SI + +candidate_match_encodeBlockAsm4K: + MOVL 12(SP), DI + TESTL SI, SI + JZ match_extend_back_end_encodeBlockAsm4K + +match_extend_back_loop_encodeBlockAsm4K: + CMPL DX, DI + JBE match_extend_back_end_encodeBlockAsm4K + MOVB -1(BX)(SI*1), R8 + MOVB -1(BX)(DX*1), R9 + CMPB R8, R9 + JNE match_extend_back_end_encodeBlockAsm4K + LEAL -1(DX), DX + DECL SI + JZ match_extend_back_end_encodeBlockAsm4K + JMP match_extend_back_loop_encodeBlockAsm4K + +match_extend_back_end_encodeBlockAsm4K: + CMPQ CX, (SP) JB dst_size_check_ok_1 MOVQ $0x00000000, ret+56(FP) RET dst_size_check_ok_1: - LEAQ (BX)(R8*1), R8 - - // emitLiteral - LEAL -1(SI), R9 - CMPL R9, $0x1d - JB one_byte_repeat_emit_lits_encodeBlockAsm4K - SUBL $0x1d, R9 - CMPL R9, $0x00000100 - JB two_bytes_repeat_emit_lits_encodeBlockAsm4K - JB three_bytes_repeat_emit_lits_encodeBlockAsm4K - -three_bytes_repeat_emit_lits_encodeBlockAsm4K: - MOVB $0xf0, (CX) - MOVW R9, 1(CX) - ADDQ $0x03, CX - ADDL $0x1d, R9 - JMP memmove_long_repeat_emit_lits_encodeBlockAsm4K - -two_bytes_repeat_emit_lits_encodeBlockAsm4K: - MOVB $0xe8, (CX) - MOVB R9, 1(CX) - ADDL $0x1d, R9 - ADDQ $0x02, CX - CMPL R9, $0x40 - JB memmove_midrepeat_emit_lits_encodeBlockAsm4K - JMP memmove_long_repeat_emit_lits_encodeBlockAsm4K - -one_byte_repeat_emit_lits_encodeBlockAsm4K: - SHLB $0x03, R9 - MOVB R9, (CX) - ADDQ $0x01, CX - LEAQ (CX)(SI*1), R9 - - // genMemMoveShort - // margin: 16, min move: 1 - CMPQ SI, $0x10 - JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_8through16 - CMPQ SI, 
$0x20 - JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_8through16: - MOVOU (R8), X0 - MOVOU X0, (CX) - JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm4K - -emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(SI*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(SI*1) - JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm4K - -emit_lit_memmove_repeat_emit_lits_encodeBlockAsm4K_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(SI*1) - MOVOU X3, -16(CX)(SI*1) - -memmove_end_copy_repeat_emit_lits_encodeBlockAsm4K: - MOVQ R9, CX - JMP repeat_emit_lits_end_encodeBlockAsm4K - -memmove_midrepeat_emit_lits_encodeBlockAsm4K: - LEAQ (CX)(SI*1), R9 - - // genMemMoveShort - // margin: 15, min move: 30 - CMPQ SI, $0x20 - JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm4K_memmove_move_17through32 - JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm4K_memmove_move_33through64 - -emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm4K_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(SI*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(SI*1) - JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm4K - -emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm4K_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(SI*1) - MOVOU X3, -16(CX)(SI*1) - -memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm4K: - MOVQ R9, CX - JMP repeat_emit_lits_end_encodeBlockAsm4K - -memmove_long_repeat_emit_lits_encodeBlockAsm4K: - LEAQ (CX)(SI*1), R9 - - // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVQ SI, R11 - SHRQ $0x05, R11 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm4Klarge_forward_sse_loop_32 - LEAQ -32(R8)(R12*1), R10 - LEAQ -32(CX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm4Klarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm4Klarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm4Klarge_forward_sse_loop_32: - MOVOU -32(R8)(R12*1), X4 - MOVOU -16(R8)(R12*1), X5 - MOVOA X4, -32(CX)(R12*1) - MOVOA X5, -16(CX)(R12*1) - ADDQ $0x20, R12 - CMPQ SI, R12 - JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm4Klarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(SI*1) - MOVOU X3, -16(CX)(SI*1) - MOVQ R9, CX - -repeat_emit_lits_end_encodeBlockAsm4K: - ADDL $0x05, DX - MOVL DX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL DX, R8 - LEAQ (BX)(DX*1), R9 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R11, R11 - JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm4K - -matchlen_loopback_16_repeat_extend_encodeBlockAsm4K: - MOVQ (R9)(R11*1), R10 - MOVQ 8(R9)(R11*1), R12 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4K - XORQ 8(SI)(R11*1), R12 - JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm4K - LEAL 
-16(R8), R8 - LEAL 16(R11), R11 - -matchlen_loop_16_entry_repeat_extend_encodeBlockAsm4K: - CMPL R8, $0x10 - JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm4K - JMP matchlen_match8_repeat_extend_encodeBlockAsm4K - -matchlen_bsf_16repeat_extend_encodeBlockAsm4K: - TZCNTQ R12, R12 - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm4K - -matchlen_match8_repeat_extend_encodeBlockAsm4K: - CMPL R8, $0x08 - JB matchlen_match4_repeat_extend_encodeBlockAsm4K - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4K - LEAL -8(R8), R8 - LEAL 8(R11), R11 - JMP matchlen_match4_repeat_extend_encodeBlockAsm4K - -matchlen_bsf_8_repeat_extend_encodeBlockAsm4K: - TZCNTQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm4K - -matchlen_match4_repeat_extend_encodeBlockAsm4K: - CMPL R8, $0x04 - JB matchlen_match2_repeat_extend_encodeBlockAsm4K - MOVL (R9)(R11*1), R10 - CMPL (SI)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeBlockAsm4K - LEAL -4(R8), R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeBlockAsm4K: - CMPL R8, $0x01 - JE matchlen_match1_repeat_extend_encodeBlockAsm4K - JB repeat_extend_forward_end_encodeBlockAsm4K - MOVW (R9)(R11*1), R10 - CMPW (SI)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeBlockAsm4K - LEAL 2(R11), R11 - SUBL $0x02, R8 - JZ repeat_extend_forward_end_encodeBlockAsm4K - -matchlen_match1_repeat_extend_encodeBlockAsm4K: - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeBlockAsm4K - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeBlockAsm4K: - ADDL R11, DX - MOVL DX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitRepeat - LEAL -1(SI), DI - CMPL SI, $0x1d - JBE repeat_one_match_repeat_encodeBlockAsm4K - LEAL -30(SI), DI - CMPL SI, $0x0000011e - JB repeat_two_match_repeat_encodeBlockAsm4K - CMPL SI, $0x0001001e - JB repeat_three_match_repeat_encodeBlockAsm4K - MOVB $0xfc, (CX) - MOVL DI, 1(CX) - ADDQ $0x04, CX - JMP repeat_end_emit_encodeBlockAsm4K - -repeat_three_match_repeat_encodeBlockAsm4K: - MOVB $0xf4, (CX) - MOVW DI, 1(CX) - ADDQ $0x03, CX - JMP repeat_end_emit_encodeBlockAsm4K - -repeat_two_match_repeat_encodeBlockAsm4K: - MOVB $0xec, (CX) - MOVB DI, 1(CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm4K - -repeat_one_match_repeat_encodeBlockAsm4K: - XORL DI, DI - LEAL -4(DI)(SI*8), DI - MOVB DI, (CX) - ADDQ $0x01, CX - -repeat_end_emit_encodeBlockAsm4K: - MOVL DX, 12(SP) - JMP search_loop_encodeBlockAsm4K - -no_repeat_found_encodeBlockAsm4K: - CMPL (BX)(SI*1), DI - JEQ candidate_match_encodeBlockAsm4K - SHRQ $0x08, DI - MOVWLZX (AX)(R10*2), SI - LEAL 2(DX), R9 - CMPL (BX)(R8*1), DI - JEQ candidate2_match_encodeBlockAsm4K - MOVW R9, (AX)(R10*2) - SHRQ $0x08, DI - CMPL (BX)(SI*1), DI - JEQ candidate3_match_encodeBlockAsm4K - MOVL 20(SP), DX - JMP search_loop_encodeBlockAsm4K - -candidate3_match_encodeBlockAsm4K: - ADDL $0x02, DX - JMP candidate_match_encodeBlockAsm4K - -candidate2_match_encodeBlockAsm4K: - MOVW R9, (AX)(R10*2) - INCL DX - MOVL R8, SI - -candidate_match_encodeBlockAsm4K: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBlockAsm4K - -match_extend_back_loop_encodeBlockAsm4K: - CMPL DX, DI - JBE match_extend_back_end_encodeBlockAsm4K - MOVB -1(BX)(SI*1), R8 - MOVB -1(BX)(DX*1), R9 - CMPB R8, R9 - JNE match_extend_back_end_encodeBlockAsm4K - LEAL -1(DX), DX - DECL SI - JZ match_extend_back_end_encodeBlockAsm4K - JMP 
match_extend_back_loop_encodeBlockAsm4K - -match_extend_back_end_encodeBlockAsm4K: - CMPQ CX, (SP) - JB dst_size_check_ok_2 - MOVQ $0x00000000, ret+56(FP) - RET - -dst_size_check_ok_2: MOVL DX, R8 MOVL DX, DI SUBL SI, DI @@ -6320,6 +4592,7 @@ dst_size_check_ok_2: // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm4K + PCALIGN $0x10 matchlen_loopback_16_match_nolit_encodeBlockAsm4K: MOVQ (R9)(R11*1), R10 @@ -6335,6 +4608,7 @@ matchlen_loop_16_entry_match_nolit_encodeBlockAsm4K: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit_encodeBlockAsm4K JMP matchlen_match8_match_nolit_encodeBlockAsm4K + PCALIGN $0x10 matchlen_bsf_16match_nolit_encodeBlockAsm4K: TZCNTQ R12, R12 @@ -6351,11 +4625,12 @@ matchlen_match8_match_nolit_encodeBlockAsm4K: LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit_encodeBlockAsm4K + PCALIGN $0x10 matchlen_bsf_8_match_nolit_encodeBlockAsm4K: TZCNTQ R10, R10 SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 + ADDL R10, R11 JMP match_nolit_end_encodeBlockAsm4K matchlen_match4_match_nolit_encodeBlockAsm4K: @@ -6383,6 +4658,8 @@ matchlen_match1_match_nolit_encodeBlockAsm4K: CMPB (SI)(R11*1), R10 JNE match_nolit_end_encodeBlockAsm4K LEAL 1(R11), R11 + JMP match_nolit_end_encodeBlockAsm4K + PCALIGN $0x10 match_nolit_end_encodeBlockAsm4K: ADDL R11, DX @@ -6455,11 +4732,11 @@ repeat_one_match_emit_repeat_copy2_encodeBlockAsm4K: match_emit_lits_copy_encodeBlockAsm4K: LEAQ 3(CX)(R8*1), R9 CMPQ R9, (SP) - JB dst_size_check_ok_3 + JB dst_size_check_ok_2 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_3: +dst_size_check_ok_2: // emitLiteral LEAL -1(R8), R9 CMPL R9, $0x1d @@ -6498,6 +4775,7 @@ one_byte_match_emit_encodeBlockAsm4K: CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm4K_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm4K_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBlockAsm4K_memmove_move_8through16: MOVOU (DI), X0 @@ -6559,20 +4837,21 @@ memmove_long_match_emit_encodeBlockAsm4K: LEAQ (CX)(R8*1), R9 // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R8*1), X2 - MOVOU -16(DI)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm4Klarge_forward_sse_loop_32 - LEAQ -32(DI)(R13*1), R10 - LEAQ -32(CX)(R13*1), R14 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm4Klarge_forward_sse_loop_32 + LEAQ -32(DI)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + PCALIGN $0x10 emit_lit_memmove_long_match_emit_encodeBlockAsm4Klarge_big_loop_back: MOVOU (R10), X4 @@ -6598,6 +4877,8 @@ emit_lit_memmove_long_match_emit_encodeBlockAsm4Klarge_forward_sse_loop_32: MOVOU X2, -32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm4K + PCALIGN $0x10 match_nolits_copy_encodeBlockAsm4K: // emitCopy @@ -6714,8 +4995,10 @@ match_nolit_dst_ok_encodeBlockAsm4K: MOVQ DI, R8 SHRQ $0x10, DI MOVQ DI, R9 + SHLQ $0x20, R8 IMULQ SI, R8 SHRQ $0x36, R8 + SHLQ $0x20, R9 IMULQ SI, R9 SHRQ $0x36, R9 LEAL -2(DX), R10 @@ -6730,11 +5013,11 @@ match_nolit_dst_ok_encodeBlockAsm4K: SUBL SI, DI MOVL DI, 16(SP) CMPQ CX, (SP) - JB dst_size_check_ok_4 + JB dst_size_check_ok_3 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_4: 
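The `PCALIGN $0x10` directives added throughout this hunk align the `matchlen_*` loop heads; the loops themselves find the first differing byte eight bytes at a time by XORing two 8-byte loads and counting trailing zero bits. A minimal Go sketch of that technique (a hypothetical `matchLen` helper for illustration, not the package's actual source):

```go
package main

import (
	"encoding/binary"
	"math/bits"
)

// matchLen counts the leading bytes a and b share (assumes len(a) <= len(b)).
// XORing 8-byte words and dividing TrailingZeros64 by 8 mirrors the
// XORQ + TZCNTQ + SARQ $0x03 sequence in the assembly above.
func matchLen(a, b []byte) (n int) {
	for len(a)-n >= 8 {
		diff := binary.LittleEndian.Uint64(a[n:]) ^ binary.LittleEndian.Uint64(b[n:])
		if diff != 0 {
			return n + bits.TrailingZeros64(diff)>>3
		}
		n += 8
	}
	for n < len(a) && a[n] == b[n] {
		n++
	}
	return n
}
```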
+dst_size_check_ok_3: ADDL $0x03, DX ADDL $0x04, SI MOVQ src_len+32(FP), DI @@ -6745,6 +5028,7 @@ dst_size_check_ok_4: // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm4K + PCALIGN $0x10 matchlen_loopback_16_match_nolit2_encodeBlockAsm4K: MOVQ (R8)(R11*1), R9 @@ -6760,6 +5044,7 @@ matchlen_loop_16_entry_match_nolit2_encodeBlockAsm4K: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm4K JMP matchlen_match8_match_nolit2_encodeBlockAsm4K + PCALIGN $0x10 matchlen_bsf_16match_nolit2_encodeBlockAsm4K: TZCNTQ R10, R10 @@ -6776,11 +5061,12 @@ matchlen_match8_match_nolit2_encodeBlockAsm4K: LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit2_encodeBlockAsm4K + PCALIGN $0x10 matchlen_bsf_8_match_nolit2_encodeBlockAsm4K: TZCNTQ R9, R9 SARQ $0x03, R9 - LEAL (R11)(R9*1), R11 + ADDL R9, R11 JMP match_nolit2_end_encodeBlockAsm4K matchlen_match4_match_nolit2_encodeBlockAsm4K: @@ -6808,6 +5094,8 @@ matchlen_match1_match_nolit2_encodeBlockAsm4K: CMPB (SI)(R11*1), R9 JNE match_nolit2_end_encodeBlockAsm4K LEAL 1(R11), R11 + JMP match_nolit2_end_encodeBlockAsm4K + PCALIGN $0x10 match_nolit2_end_encodeBlockAsm4K: ADDL R11, DX @@ -6824,11 +5112,11 @@ emit_remainder_encodeBlockAsm4K: LEAQ (BX)(DX*1), DX LEAQ 3(CX)(AX*1), BX CMPQ BX, (SP) - JB dst_size_check_ok_5 + JB dst_size_check_ok_4 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_5: +dst_size_check_ok_4: // emitLiteral LEAL -1(AX), BX CMPL BX, $0x1d @@ -6893,6 +5181,7 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_4through8: MOVL SI, (CX) MOVL DX, -4(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm4K + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBlockAsm4K_memmove_move_8through16: MOVQ (DX), SI @@ -6956,20 +5245,21 @@ memmove_long_emit_remainder_encodeBlockAsm4K: LEAQ (CX)(AX*1), BX // genMemMoveLong - MOVOU (DX), X0 - MOVOU 16(DX), X1 - MOVOU -32(DX)(AX*1), X2 - MOVOU -16(DX)(AX*1), X3 - MOVQ AX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4Klarge_forward_sse_loop_32 - LEAQ -32(DX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVQ AX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4Klarge_forward_sse_loop_32 + LEAQ -32(DX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + PCALIGN $0x10 emit_lit_memmove_long_emit_remainder_encodeBlockAsm4Klarge_big_loop_back: MOVOU (SI), X4 @@ -7046,316 +5336,24 @@ search_loop_encodeBlockAsm1K: MOVQ (BX)(DX*1), DI MOVL SI, 20(SP) MOVQ $0x9e3779b1, R9 + MOVQ DI, R8 + SHLQ $0x20, R8 + IMULQ R9, R8 + SHRQ $0x37, R8 MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 + SHRQ $0x08, R10 + SHLQ $0x20, R10 IMULQ R9, R10 SHRQ $0x37, R10 - IMULQ R9, R11 - SHRQ $0x37, R11 - MOVWLZX (AX)(R10*2), SI - MOVWLZX (AX)(R11*2), R8 + MOVWLZX (AX)(R8*2), SI + MOVW DX, (AX)(R8*2) + MOVWLZX (AX)(R10*2), R8 MOVW DX, (AX)(R10*2) - MOVW DX, (AX)(R11*2) MOVQ DI, R10 SHRQ $0x10, R10 + SHLQ $0x20, R10 IMULQ R9, R10 SHRQ $0x37, R10 - MOVL DX, R9 - SUBL 16(SP), R9 - MOVL 1(BX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeBlockAsm1K - LEAL 1(DX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeBlockAsm1K - -repeat_extend_back_loop_encodeBlockAsm1K: - CMPL DI, SI - JBE 
repeat_extend_back_end_encodeBlockAsm1K - MOVB -1(BX)(R8*1), R9 - MOVB -1(BX)(DI*1), R10 - CMPB R9, R10 - JNE repeat_extend_back_end_encodeBlockAsm1K - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeBlockAsm1K - -repeat_extend_back_end_encodeBlockAsm1K: - MOVL DI, SI - MOVL 12(SP), R8 - SUBL R8, SI - LEAQ 3(CX)(SI*1), R9 - CMPQ R9, (SP) - JB dst_size_check_ok_1 - MOVQ $0x00000000, ret+56(FP) - RET - -dst_size_check_ok_1: - LEAQ (BX)(R8*1), R8 - - // emitLiteral - LEAL -1(SI), R9 - CMPL R9, $0x1d - JB one_byte_repeat_emit_lits_encodeBlockAsm1K - SUBL $0x1d, R9 - CMPL R9, $0x00000100 - JB two_bytes_repeat_emit_lits_encodeBlockAsm1K - JB three_bytes_repeat_emit_lits_encodeBlockAsm1K - -three_bytes_repeat_emit_lits_encodeBlockAsm1K: - MOVB $0xf0, (CX) - MOVW R9, 1(CX) - ADDQ $0x03, CX - ADDL $0x1d, R9 - JMP memmove_long_repeat_emit_lits_encodeBlockAsm1K - -two_bytes_repeat_emit_lits_encodeBlockAsm1K: - MOVB $0xe8, (CX) - MOVB R9, 1(CX) - ADDL $0x1d, R9 - ADDQ $0x02, CX - CMPL R9, $0x40 - JB memmove_midrepeat_emit_lits_encodeBlockAsm1K - JMP memmove_long_repeat_emit_lits_encodeBlockAsm1K - -one_byte_repeat_emit_lits_encodeBlockAsm1K: - SHLB $0x03, R9 - MOVB R9, (CX) - ADDQ $0x01, CX - LEAQ (CX)(SI*1), R9 - - // genMemMoveShort - // margin: 16, min move: 1 - CMPQ SI, $0x10 - JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_8through16 - CMPQ SI, $0x20 - JBE emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_8through16: - MOVOU (R8), X0 - MOVOU X0, (CX) - JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm1K - -emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(SI*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(SI*1) - JMP memmove_end_copy_repeat_emit_lits_encodeBlockAsm1K - -emit_lit_memmove_repeat_emit_lits_encodeBlockAsm1K_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(SI*1) - MOVOU X3, -16(CX)(SI*1) - -memmove_end_copy_repeat_emit_lits_encodeBlockAsm1K: - MOVQ R9, CX - JMP repeat_emit_lits_end_encodeBlockAsm1K - -memmove_midrepeat_emit_lits_encodeBlockAsm1K: - LEAQ (CX)(SI*1), R9 - - // genMemMoveShort - // margin: 15, min move: 30 - CMPQ SI, $0x20 - JBE emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm1K_memmove_move_17through32 - JMP emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm1K_memmove_move_33through64 - -emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm1K_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(SI*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(SI*1) - JMP memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm1K - -emit_lit_memmove_mid_repeat_emit_lits_encodeBlockAsm1K_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(SI*1) - MOVOU X3, -16(CX)(SI*1) - -memmove_mid_end_copy_repeat_emit_lits_encodeBlockAsm1K: - MOVQ R9, CX - JMP repeat_emit_lits_end_encodeBlockAsm1K - -memmove_long_repeat_emit_lits_encodeBlockAsm1K: - LEAQ (CX)(SI*1), R9 - - // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVQ SI, R11 - SHRQ $0x05, R11 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 
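The rewritten search loops in the hunks above hash four input bytes with the 32-bit golden-ratio constant `0x9e3779b1`; the added `SHLQ $0x20` before the `IMULQ` makes the 64-bit multiply behave exactly like a 32-bit multiplicative hash that keeps only the top bits. A hedged Go equivalent (illustrative sketch, not the generator's source):

```go
// hashBlock mimics the SHLQ $0x20 + IMULQ $0x9e3779b1 + SHRQ sequence:
// multiplying (u << 32) and shifting right by 0x36 (or 0x37) equals a
// plain 32-bit multiply keeping the top bits -- 10 bits of index for
// the 4K variant, 9 bits for the 1K variant.
func hashBlock(u uint32, tableBits uint) uint32 {
	const prime = 0x9e3779b1
	return (u * prime) >> (32 - tableBits)
}
```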
- JA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm1Klarge_forward_sse_loop_32 - LEAQ -32(R8)(R12*1), R10 - LEAQ -32(CX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm1Klarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm1Klarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm1Klarge_forward_sse_loop_32: - MOVOU -32(R8)(R12*1), X4 - MOVOU -16(R8)(R12*1), X5 - MOVOA X4, -32(CX)(R12*1) - MOVOA X5, -16(CX)(R12*1) - ADDQ $0x20, R12 - CMPQ SI, R12 - JAE emit_lit_memmove_long_repeat_emit_lits_encodeBlockAsm1Klarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(SI*1) - MOVOU X3, -16(CX)(SI*1) - MOVQ R9, CX - -repeat_emit_lits_end_encodeBlockAsm1K: - ADDL $0x05, DX - MOVL DX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL DX, R8 - LEAQ (BX)(DX*1), R9 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R11, R11 - JMP matchlen_loop_16_entry_repeat_extend_encodeBlockAsm1K - -matchlen_loopback_16_repeat_extend_encodeBlockAsm1K: - MOVQ (R9)(R11*1), R10 - MOVQ 8(R9)(R11*1), R12 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm1K - XORQ 8(SI)(R11*1), R12 - JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm1K - LEAL -16(R8), R8 - LEAL 16(R11), R11 - -matchlen_loop_16_entry_repeat_extend_encodeBlockAsm1K: - CMPL R8, $0x10 - JAE matchlen_loopback_16_repeat_extend_encodeBlockAsm1K - JMP matchlen_match8_repeat_extend_encodeBlockAsm1K - -matchlen_bsf_16repeat_extend_encodeBlockAsm1K: - TZCNTQ R12, R12 - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm1K - -matchlen_match8_repeat_extend_encodeBlockAsm1K: - CMPL R8, $0x08 - JB matchlen_match4_repeat_extend_encodeBlockAsm1K - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm1K - LEAL -8(R8), R8 - LEAL 8(R11), R11 - JMP matchlen_match4_repeat_extend_encodeBlockAsm1K - -matchlen_bsf_8_repeat_extend_encodeBlockAsm1K: - TZCNTQ R10, R10 - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm1K - -matchlen_match4_repeat_extend_encodeBlockAsm1K: - CMPL R8, $0x04 - JB matchlen_match2_repeat_extend_encodeBlockAsm1K - MOVL (R9)(R11*1), R10 - CMPL (SI)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeBlockAsm1K - LEAL -4(R8), R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeBlockAsm1K: - CMPL R8, $0x01 - JE matchlen_match1_repeat_extend_encodeBlockAsm1K - JB repeat_extend_forward_end_encodeBlockAsm1K - MOVW (R9)(R11*1), R10 - CMPW (SI)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeBlockAsm1K - LEAL 2(R11), R11 - SUBL $0x02, R8 - JZ repeat_extend_forward_end_encodeBlockAsm1K - -matchlen_match1_repeat_extend_encodeBlockAsm1K: - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeBlockAsm1K - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeBlockAsm1K: - ADDL R11, DX - MOVL DX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitRepeat - LEAL -1(SI), DI - CMPL SI, $0x1d - JBE repeat_one_match_repeat_encodeBlockAsm1K - LEAL -30(SI), DI - CMPL SI, $0x0000011e - JB repeat_two_match_repeat_encodeBlockAsm1K - CMPL SI, $0x0001001e - JB repeat_three_match_repeat_encodeBlockAsm1K - MOVB $0xfc, (CX) - MOVL DI, 1(CX) - ADDQ $0x04, CX - JMP repeat_end_emit_encodeBlockAsm1K - -repeat_three_match_repeat_encodeBlockAsm1K: - MOVB $0xf4, (CX) - MOVW DI, 
1(CX) - ADDQ $0x03, CX - JMP repeat_end_emit_encodeBlockAsm1K - -repeat_two_match_repeat_encodeBlockAsm1K: - MOVB $0xec, (CX) - MOVB DI, 1(CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm1K - -repeat_one_match_repeat_encodeBlockAsm1K: - XORL DI, DI - LEAL -4(DI)(SI*8), DI - MOVB DI, (CX) - ADDQ $0x01, CX - -repeat_end_emit_encodeBlockAsm1K: - MOVL DX, 12(SP) - JMP search_loop_encodeBlockAsm1K - -no_repeat_found_encodeBlockAsm1K: CMPL (BX)(SI*1), DI JEQ candidate_match_encodeBlockAsm1K SHRQ $0x08, DI @@ -7398,11 +5396,11 @@ match_extend_back_loop_encodeBlockAsm1K: match_extend_back_end_encodeBlockAsm1K: CMPQ CX, (SP) - JB dst_size_check_ok_2 + JB dst_size_check_ok_1 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_2: +dst_size_check_ok_1: MOVL DX, R8 MOVL DX, DI SUBL SI, DI @@ -7417,6 +5415,7 @@ dst_size_check_ok_2: // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit_encodeBlockAsm1K + PCALIGN $0x10 matchlen_loopback_16_match_nolit_encodeBlockAsm1K: MOVQ (R9)(R11*1), R10 @@ -7432,6 +5431,7 @@ matchlen_loop_16_entry_match_nolit_encodeBlockAsm1K: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit_encodeBlockAsm1K JMP matchlen_match8_match_nolit_encodeBlockAsm1K + PCALIGN $0x10 matchlen_bsf_16match_nolit_encodeBlockAsm1K: TZCNTQ R12, R12 @@ -7448,11 +5448,12 @@ matchlen_match8_match_nolit_encodeBlockAsm1K: LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit_encodeBlockAsm1K + PCALIGN $0x10 matchlen_bsf_8_match_nolit_encodeBlockAsm1K: TZCNTQ R10, R10 SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 + ADDL R10, R11 JMP match_nolit_end_encodeBlockAsm1K matchlen_match4_match_nolit_encodeBlockAsm1K: @@ -7480,6 +5481,8 @@ matchlen_match1_match_nolit_encodeBlockAsm1K: CMPB (SI)(R11*1), R10 JNE match_nolit_end_encodeBlockAsm1K LEAL 1(R11), R11 + JMP match_nolit_end_encodeBlockAsm1K + PCALIGN $0x10 match_nolit_end_encodeBlockAsm1K: ADDL R11, DX @@ -7552,11 +5555,11 @@ repeat_one_match_emit_repeat_copy2_encodeBlockAsm1K: match_emit_lits_copy_encodeBlockAsm1K: LEAQ 3(CX)(R8*1), R9 CMPQ R9, (SP) - JB dst_size_check_ok_3 + JB dst_size_check_ok_2 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_3: +dst_size_check_ok_2: // emitLiteral LEAL -1(R8), R9 CMPL R9, $0x1d @@ -7595,6 +5598,7 @@ one_byte_match_emit_encodeBlockAsm1K: CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm1K_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm1K_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBlockAsm1K_memmove_move_8through16: MOVOU (DI), X0 @@ -7656,20 +5660,21 @@ memmove_long_match_emit_encodeBlockAsm1K: LEAQ (CX)(R8*1), R9 // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R8*1), X2 - MOVOU -16(DI)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm1Klarge_forward_sse_loop_32 - LEAQ -32(DI)(R13*1), R10 - LEAQ -32(CX)(R13*1), R14 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBlockAsm1Klarge_forward_sse_loop_32 + LEAQ -32(DI)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + PCALIGN $0x10 emit_lit_memmove_long_match_emit_encodeBlockAsm1Klarge_big_loop_back: MOVOU (R10), X4 @@ -7695,6 +5700,8 @@ emit_lit_memmove_long_match_emit_encodeBlockAsm1Klarge_forward_sse_loop_32: MOVOU X2, 
-32(CX)(R8*1) MOVOU X3, -16(CX)(R8*1) MOVQ R9, CX + JMP match_nolits_copy_encodeBlockAsm1K + PCALIGN $0x10 match_nolits_copy_encodeBlockAsm1K: // emitCopy @@ -7811,8 +5818,10 @@ match_nolit_dst_ok_encodeBlockAsm1K: MOVQ DI, R8 SHRQ $0x10, DI MOVQ DI, R9 + SHLQ $0x20, R8 IMULQ SI, R8 SHRQ $0x37, R8 + SHLQ $0x20, R9 IMULQ SI, R9 SHRQ $0x37, R9 LEAL -2(DX), R10 @@ -7827,11 +5836,11 @@ match_nolit_dst_ok_encodeBlockAsm1K: SUBL SI, DI MOVL DI, 16(SP) CMPQ CX, (SP) - JB dst_size_check_ok_4 + JB dst_size_check_ok_3 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_4: +dst_size_check_ok_3: ADDL $0x03, DX ADDL $0x04, SI MOVQ src_len+32(FP), DI @@ -7842,6 +5851,7 @@ dst_size_check_ok_4: // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit2_encodeBlockAsm1K + PCALIGN $0x10 matchlen_loopback_16_match_nolit2_encodeBlockAsm1K: MOVQ (R8)(R11*1), R9 @@ -7857,6 +5867,7 @@ matchlen_loop_16_entry_match_nolit2_encodeBlockAsm1K: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit2_encodeBlockAsm1K JMP matchlen_match8_match_nolit2_encodeBlockAsm1K + PCALIGN $0x10 matchlen_bsf_16match_nolit2_encodeBlockAsm1K: TZCNTQ R10, R10 @@ -7873,11 +5884,12 @@ matchlen_match8_match_nolit2_encodeBlockAsm1K: LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit2_encodeBlockAsm1K + PCALIGN $0x10 matchlen_bsf_8_match_nolit2_encodeBlockAsm1K: TZCNTQ R9, R9 SARQ $0x03, R9 - LEAL (R11)(R9*1), R11 + ADDL R9, R11 JMP match_nolit2_end_encodeBlockAsm1K matchlen_match4_match_nolit2_encodeBlockAsm1K: @@ -7905,6 +5917,8 @@ matchlen_match1_match_nolit2_encodeBlockAsm1K: CMPB (SI)(R11*1), R9 JNE match_nolit2_end_encodeBlockAsm1K LEAL 1(R11), R11 + JMP match_nolit2_end_encodeBlockAsm1K + PCALIGN $0x10 match_nolit2_end_encodeBlockAsm1K: ADDL R11, DX @@ -7921,11 +5935,11 @@ emit_remainder_encodeBlockAsm1K: LEAQ (BX)(DX*1), DX LEAQ 3(CX)(AX*1), BX CMPQ BX, (SP) - JB dst_size_check_ok_5 + JB dst_size_check_ok_4 MOVQ $0x00000000, ret+56(FP) RET -dst_size_check_ok_5: +dst_size_check_ok_4: // emitLiteral LEAL -1(AX), BX CMPL BX, $0x1d @@ -7990,6 +6004,7 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_4through8: MOVL SI, (CX) MOVL DX, -4(CX)(AX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm1K + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBlockAsm1K_memmove_move_8through16: MOVQ (DX), SI @@ -8053,20 +6068,21 @@ memmove_long_emit_remainder_encodeBlockAsm1K: LEAQ (CX)(AX*1), BX // genMemMoveLong - MOVOU (DX), X0 - MOVOU 16(DX), X1 - MOVOU -32(DX)(AX*1), X2 - MOVOU -16(DX)(AX*1), X3 - MOVQ AX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm1Klarge_forward_sse_loop_32 - LEAQ -32(DX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVQ AX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm1Klarge_forward_sse_loop_32 + LEAQ -32(DX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + PCALIGN $0x10 emit_lit_memmove_long_emit_remainder_encodeBlockAsm1Klarge_big_loop_back: MOVOU (SI), X4 @@ -8099,1156 +6115,8588 @@ emit_remainder_end_encodeBlockAsm1K: MOVQ CX, ret+56(FP) RET -// func encodeBetterBlockAsm(dst []byte, src []byte, tmp *[589824]byte) int +// func encodeFastBlockAsm(dst []byte, src []byte, tmp *[65536]byte) int // Requires: BMI, CMOV, SSE2 -TEXT ·encodeBetterBlockAsm(SB), $24-64 +TEXT ·encodeFastBlockAsm(SB), 
$24-64 MOVQ tmp+48(FP), AX MOVQ dst_base+0(FP), CX - MOVQ $0x00001200, DX + MOVQ $0x00000200, DX + MOVQ AX, BX PXOR X0, X0 -zero_loop_encodeBetterBlockAsm: - MOVOU X0, (AX) - MOVOU X0, 16(AX) - MOVOU X0, 32(AX) - MOVOU X0, 48(AX) - MOVOU X0, 64(AX) - MOVOU X0, 80(AX) - MOVOU X0, 96(AX) - MOVOU X0, 112(AX) - ADDQ $0x80, AX +zero_loop_encodeFastBlockAsm: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX DECQ DX - JNZ zero_loop_encodeBetterBlockAsm + JNZ zero_loop_encodeFastBlockAsm MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), AX - LEAQ -17(AX), DX - LEAQ -17(AX), DI - MOVL DI, 8(SP) - SHRQ $0x05, AX - SUBL AX, DX - LEAQ (CX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, AX - MOVL AX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBetterBlockAsm: - MOVQ tmp+48(FP), DI - MOVL AX, R8 - SUBL 12(SP), R8 - SHRL $0x08, R8 - CMPL R8, $0x63 - JBE check_maxskip_ok_encodeBetterBlockAsm - LEAL 100(AX), R8 - JMP check_maxskip_cont_encodeBetterBlockAsm - -check_maxskip_ok_encodeBetterBlockAsm: - LEAL 1(AX)(R8*1), R8 - -check_maxskip_cont_encodeBetterBlockAsm: - CMPL R8, 8(SP) - JAE emit_remainder_encodeBetterBlockAsm - MOVQ (DX)(AX*1), R9 - MOVL R8, 20(SP) - MOVQ $0x00cf1bbcdcbfa563, R11 - MOVQ $0x9e3779b1, R8 - MOVQ R9, R12 - MOVQ R9, R13 - SHLQ $0x08, R12 - IMULQ R11, R12 - SHRQ $0x2f, R12 - IMULQ R8, R13 - SHRQ $0x32, R13 - MOVL (DI)(R12*4), R8 - MOVL 524288(DI)(R13*4), R10 - MOVL AX, (DI)(R12*4) - MOVL AX, 524288(DI)(R13*4) - LEAL -2162685(AX), R12 - CMPL R8, R12 - JLE offset_ok_0_encodeBetterBlockAsm - MOVQ (DX)(R8*1), BX - CMPQ BX, R9 - JEQ candidate_match_encodeBetterBlockAsm - -offset_ok_0_encodeBetterBlockAsm: - CMPL R10, R12 - JLE offset_ok_1_encodeBetterBlockAsm - MOVQ (DX)(R10*1), SI - CMPQ SI, R9 - -offset_ok_1_encodeBetterBlockAsm: - MOVL AX, R13 - SUBL 16(SP), R13 - MOVQ (DX)(R13*1), R13 - MOVQ $0x000000ffffffff00, R14 - XORQ R9, R13 - TESTQ R14, R13 - JNE no_repeat_found_encodeBetterBlockAsm - LEAL 1(AX), DI - MOVL 12(SP), R8 - MOVL DI, R9 - SUBL 16(SP), R9 - JZ repeat_extend_back_end_encodeBetterBlockAsm + MOVQ src_len+32(FP), DX + LEAQ -17(DX), BX + LEAQ -17(DX), SI + MOVL SI, 8(SP) + SHRQ $0x03, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX -repeat_extend_back_loop_encodeBetterBlockAsm: - CMPL DI, R8 +search_loop_encodeFastBlockAsm: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeFastBlockAsm + MOVQ (BX)(DX*1), DI + LEAL -2162685(DX), R8 + MOVL SI, 20(SP) + MOVQ $0xcf1bbcdcb7a56463, R10 + MOVQ DI, R9 + IMULQ R10, R9 + SHRQ $0x32, R9 + MOVQ 1(BX)(DX*1), R11 + IMULQ R10, R11 + SHRQ $0x32, R11 + MOVL (AX)(R9*4), SI + MOVL DX, (AX)(R9*4) + MOVL (AX)(R11*4), R9 + MOVL DX, (AX)(R11*4) + MOVQ 2(BX)(DX*1), R11 + IMULQ R10, R11 + SHRQ $0x32, R11 + MOVL DX, R10 + SUBL 16(SP), R10 + MOVL 1(BX)(R10*1), R12 + MOVQ DI, R10 + SHRQ $0x08, R10 + CMPL R10, R12 + JNE no_repeat_found_encodeFastBlockAsm + LEAL 1(DX), DI + MOVL 12(SP), SI + MOVL DI, SI + MOVL 12(SP), R8 + SUBL R8, SI + LEAQ 4(CX)(SI*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_1 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_1: + LEAQ (BX)(R8*1), R8 + + // emitLiteral + LEAL -1(SI), R9 + CMPL R9, $0x1d + JB one_byte_repeat_emit_lits_encodeFastBlockAsm + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_repeat_emit_lits_encodeFastBlockAsm + CMPL R9, 
$0x00010000 + JB three_bytes_repeat_emit_lits_encodeFastBlockAsm + MOVL R9, R10 + SHRL $0x10, R10 + MOVB $0xf8, (CX) + MOVW R9, 1(CX) + MOVB R10, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, R9 + JMP memmove_long_repeat_emit_lits_encodeFastBlockAsm + +three_bytes_repeat_emit_lits_encodeFastBlockAsm: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_repeat_emit_lits_encodeFastBlockAsm + +two_bytes_repeat_emit_lits_encodeFastBlockAsm: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midrepeat_emit_lits_encodeFastBlockAsm + JMP memmove_long_repeat_emit_lits_encodeFastBlockAsm + +one_byte_repeat_emit_lits_encodeFastBlockAsm: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ SI, $0x10 + JBE emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm_memmove_move_8through16 + CMPQ SI, $0x20 + JBE emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm_memmove_move_33through64 + PCALIGN $0x10 + +emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm_memmove_move_8through16: + MOVOU (R8), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_repeat_emit_lits_encodeFastBlockAsm + +emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_end_copy_repeat_emit_lits_encodeFastBlockAsm + +emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_end_copy_repeat_emit_lits_encodeFastBlockAsm: + MOVQ R9, CX + JMP repeat_emit_lits_end_encodeFastBlockAsm + +memmove_midrepeat_emit_lits_encodeFastBlockAsm: + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ SI, $0x20 + JBE emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm_memmove_move_33through64 + +emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_mid_end_copy_repeat_emit_lits_encodeFastBlockAsm + +emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_mid_end_copy_repeat_emit_lits_encodeFastBlockAsm: + MOVQ R9, CX + JMP repeat_emit_lits_end_encodeFastBlockAsm + +memmove_long_repeat_emit_lits_encodeFastBlockAsm: + LEAQ (CX)(SI*1), R9 + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + PCALIGN $0x10 + +emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsmlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA 
emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ SI, R12 + JAE emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + MOVQ R9, CX + +repeat_emit_lits_end_encodeFastBlockAsm: + ADDL $0x05, DX + MOVL DX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_repeat_extend_encodeFastBlockAsm + PCALIGN $0x10 + +matchlen_loopback_16_repeat_extend_encodeFastBlockAsm: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeFastBlockAsm + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeFastBlockAsm + LEAL -16(R8), R8 + LEAL 16(R11), R11 + +matchlen_loop_16_entry_repeat_extend_encodeFastBlockAsm: + CMPL R8, $0x10 + JAE matchlen_loopback_16_repeat_extend_encodeFastBlockAsm + JMP matchlen_match8_repeat_extend_encodeFastBlockAsm + PCALIGN $0x10 + +matchlen_bsf_16repeat_extend_encodeFastBlockAsm: + TZCNTQ R12, R12 + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeFastBlockAsm + +matchlen_match8_repeat_extend_encodeFastBlockAsm: + CMPL R8, $0x08 + JB matchlen_match4_repeat_extend_encodeFastBlockAsm + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeFastBlockAsm + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeFastBlockAsm + PCALIGN $0x10 + +matchlen_bsf_8_repeat_extend_encodeFastBlockAsm: + TZCNTQ R10, R10 + SARQ $0x03, R10 + ADDL R10, R11 + JMP repeat_extend_forward_end_encodeFastBlockAsm + +matchlen_match4_repeat_extend_encodeFastBlockAsm: + CMPL R8, $0x04 + JB matchlen_match2_repeat_extend_encodeFastBlockAsm + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeFastBlockAsm + LEAL -4(R8), R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeFastBlockAsm: + CMPL R8, $0x01 + JE matchlen_match1_repeat_extend_encodeFastBlockAsm + JB repeat_extend_forward_end_encodeFastBlockAsm + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeFastBlockAsm + LEAL 2(R11), R11 + SUBL $0x02, R8 + JZ repeat_extend_forward_end_encodeFastBlockAsm + +matchlen_match1_repeat_extend_encodeFastBlockAsm: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeFastBlockAsm + LEAL 1(R11), R11 + JMP repeat_extend_forward_end_encodeFastBlockAsm + PCALIGN $0x10 + +repeat_extend_forward_end_encodeFastBlockAsm: + ADDL R11, DX + MOVL DX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitRepeat + LEAL -1(SI), DI + CMPL SI, $0x1d + JBE repeat_one_match_repeat_encodeFastBlockAsm + LEAL -30(SI), DI + CMPL SI, $0x0000011e + JB repeat_two_match_repeat_encodeFastBlockAsm + CMPL SI, $0x0001001e + JB repeat_three_match_repeat_encodeFastBlockAsm + MOVB $0xfc, (CX) + MOVL DI, 1(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeFastBlockAsm + +repeat_three_match_repeat_encodeFastBlockAsm: + MOVB $0xf4, (CX) + MOVW DI, 1(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeFastBlockAsm + +repeat_two_match_repeat_encodeFastBlockAsm: + MOVB $0xec, (CX) + MOVB DI, 1(CX) + ADDQ $0x02, CX + JMP 
repeat_end_emit_encodeFastBlockAsm + +repeat_one_match_repeat_encodeFastBlockAsm: + XORL DI, DI + LEAL -4(DI)(SI*8), DI + MOVB DI, (CX) + ADDQ $0x01, CX + +repeat_end_emit_encodeFastBlockAsm: + MOVL DX, 12(SP) + JMP search_loop_encodeFastBlockAsm + +no_repeat_found_encodeFastBlockAsm: + CMPL SI, R8 + CMOVLLE R8, SI + CMPQ (BX)(SI*1), DI + JEQ candidate_match_encodeFastBlockAsm + MOVQ 1(BX)(DX*1), DI + MOVL (AX)(R11*4), SI + LEAL 2(DX), R10 + CMPL R9, R8 + CMOVLLE R8, R9 + CMPQ (BX)(R9*1), DI + JEQ candidate2_match_encodeFastBlockAsm + MOVL R10, (AX)(R11*4) + MOVQ 2(BX)(DX*1), DI + CMPL SI, R8 + CMOVLLE R8, SI + CMPQ (BX)(SI*1), DI + JEQ candidate3_match_encodeFastBlockAsm + MOVL 20(SP), DX + JMP search_loop_encodeFastBlockAsm + +candidate3_match_encodeFastBlockAsm: + ADDL $0x02, DX + JMP candidate_match_encodeFastBlockAsm + +candidate2_match_encodeFastBlockAsm: + MOVL R10, (AX)(R11*4) + INCL DX + MOVL R9, SI + +candidate_match_encodeFastBlockAsm: + CMPQ CX, (SP) + JB dst_size_check_ok_2 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_2: + MOVL DX, R8 + MOVL DX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x08, DX + ADDL $0x08, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit_encodeFastBlockAsm + PCALIGN $0x10 + +matchlen_loopback_16_match_nolit_encodeFastBlockAsm: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeFastBlockAsm + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeFastBlockAsm + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit_encodeFastBlockAsm: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit_encodeFastBlockAsm + JMP matchlen_match8_match_nolit_encodeFastBlockAsm + PCALIGN $0x10 + +matchlen_bsf_16match_nolit_encodeFastBlockAsm: + TZCNTQ R12, R12 + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeFastBlockAsm + +matchlen_match8_match_nolit_encodeFastBlockAsm: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeFastBlockAsm + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeFastBlockAsm + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeFastBlockAsm + PCALIGN $0x10 + +matchlen_bsf_8_match_nolit_encodeFastBlockAsm: + TZCNTQ R10, R10 + SARQ $0x03, R10 + ADDL R10, R11 + JMP match_nolit_end_encodeFastBlockAsm + +matchlen_match4_match_nolit_encodeFastBlockAsm: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeFastBlockAsm + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeFastBlockAsm + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeFastBlockAsm: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeFastBlockAsm + JB match_nolit_end_encodeFastBlockAsm + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeFastBlockAsm + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeFastBlockAsm + +matchlen_match1_match_nolit_encodeFastBlockAsm: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE match_nolit_end_encodeFastBlockAsm + LEAL 1(R11), R11 + JMP match_nolit_end_encodeFastBlockAsm + PCALIGN $0x10 + +match_nolit_end_encodeFastBlockAsm: + ADDL R11, DX + ADDL $0x08, R11 + MOVL 16(SP), SI + MOVL 12(SP), DI + MOVL DX, 12(SP) + SUBL DI, R8 + JZ match_nolits_copy_encodeFastBlockAsm + LEAQ (BX)(DI*1), DI + LEAQ 4(CX)(R8*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_3 + 
MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_3: + // emitLiteral + LEAL -1(R8), R9 + CMPL R9, $0x1d + JB one_byte_match_emit_encodeFastBlockAsm + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_match_emit_encodeFastBlockAsm + CMPL R9, $0x00010000 + JB three_bytes_match_emit_encodeFastBlockAsm + MOVL R9, R10 + SHRL $0x10, R10 + MOVB $0xf8, (CX) + MOVW R9, 1(CX) + MOVB R10, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeFastBlockAsm + +three_bytes_match_emit_encodeFastBlockAsm: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeFastBlockAsm + +two_bytes_match_emit_encodeFastBlockAsm: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midmatch_emit_encodeFastBlockAsm + JMP memmove_long_match_emit_encodeFastBlockAsm + +one_byte_match_emit_encodeFastBlockAsm: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(R8*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeFastBlockAsm_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeFastBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeFastBlockAsm_memmove_move_33through64 + PCALIGN $0x10 + +emit_lit_memmove_match_emit_encodeFastBlockAsm_memmove_move_8through16: + MOVOU (DI), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_match_emit_encodeFastBlockAsm + +emit_lit_memmove_match_emit_encodeFastBlockAsm_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_end_copy_match_emit_encodeFastBlockAsm + +emit_lit_memmove_match_emit_encodeFastBlockAsm_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_end_copy_match_emit_encodeFastBlockAsm: + MOVQ R9, CX + JMP match_nolits_copy_encodeFastBlockAsm + +memmove_midmatch_emit_encodeFastBlockAsm: + LEAQ (CX)(R8*1), R9 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ R8, $0x20 + JBE emit_lit_memmove_mid_match_emit_encodeFastBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_mid_match_emit_encodeFastBlockAsm_memmove_move_33through64 + +emit_lit_memmove_mid_match_emit_encodeFastBlockAsm_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_mid_end_copy_match_emit_encodeFastBlockAsm + +emit_lit_memmove_mid_match_emit_encodeFastBlockAsm_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_mid_end_copy_match_emit_encodeFastBlockAsm: + MOVQ R9, CX + JMP match_nolits_copy_encodeFastBlockAsm + +memmove_long_match_emit_encodeFastBlockAsm: + LEAQ (CX)(R8*1), R9 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeFastBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(DI)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + PCALIGN $0x10 + +emit_lit_memmove_long_match_emit_encodeFastBlockAsmlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) 
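Both the repeat path earlier in this function and the copy1-plus-repeat fallback below emit the same variable-length repeat code: a packed one-byte form for lengths up to 29, then `0xec`/`0xf4`/`0xfc` escapes with one, two, or three extra length bytes. A sketch of that encoding, inferred from the constants in the assembly (`$0x1d`, `$0x11e`, `$0x1001e`) and not the package's actual emitter:

```go
// emitRepeat writes a repeat code for length to dst and returns the
// number of bytes written. Thresholds follow the assembly: 29,
// 30+256, and 30+65536.
func emitRepeat(dst []byte, length int) int {
	v := length - 30
	switch {
	case length <= 29:
		dst[0] = byte(length*8 - 4) // one-byte form: LEAL -4(DI)(SI*8)
		return 1
	case length < 30+256:
		dst[0] = 0xec
		dst[1] = byte(v)
		return 2
	case length < 30+65536:
		dst[0] = 0xf4
		dst[1], dst[2] = byte(v), byte(v>>8) // 16-bit little-endian length
		return 3
	default:
		dst[0] = 0xfc // 24-bit little-endian length follows
		dst[1], dst[2], dst[3] = byte(v), byte(v>>8), byte(v>>16)
		return 4
	}
}
```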
+ ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_encodeFastBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeFastBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(DI)(R13*1), X4 + MOVOU -16(DI)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_encodeFastBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + MOVQ R9, CX + JMP match_nolits_copy_encodeFastBlockAsm + PCALIGN $0x10 + +match_nolits_copy_encodeFastBlockAsm: + // emitCopy + CMPL SI, $0x0001003f + JBE two_byte_offset_match_nolit_encodeFastBlockAsm + + // emitCopy3 + LEAL -4(R11), R11 + LEAL -65536(SI), SI + SHLL $0x0b, SI + ADDL $0x07, SI + CMPL R11, $0x3c + JBE emit_copy3_0_match_nolit_encodeFastBlockAsm_emit3 + LEAL -60(R11), DI + CMPL R11, $0x0000013c + JB emit_copy3_1_match_nolit_encodeFastBlockAsm_emit3 + CMPL R11, $0x0001003c + JB emit_copy3_2_match_nolit_encodeFastBlockAsm_emit3 + ADDL $0x000007e0, SI + MOVL SI, (CX) + MOVL DI, 4(CX) + ADDQ $0x07, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm + +emit_copy3_2_match_nolit_encodeFastBlockAsm_emit3: + ADDL $0x000007c0, SI + MOVL SI, (CX) + MOVW DI, 4(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm + +emit_copy3_1_match_nolit_encodeFastBlockAsm_emit3: + ADDL $0x000007a0, SI + MOVL SI, (CX) + MOVB DI, 4(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm + +emit_copy3_0_match_nolit_encodeFastBlockAsm_emit3: + SHLL $0x05, R11 + ORL R11, SI + MOVL SI, (CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm + +two_byte_offset_match_nolit_encodeFastBlockAsm: + CMPL SI, $0x00000400 + JA two_byte_match_nolit_encodeFastBlockAsm + CMPL R11, $0x00000013 + JAE emit_one_longer_match_nolit_encodeFastBlockAsm + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL -15(SI)(R11*4), SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm + +emit_one_longer_match_nolit_encodeFastBlockAsm: + CMPL R11, $0x00000112 + JAE emit_copy1_repeat_match_nolit_encodeFastBlockAsm + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL 61(SI), SI + MOVW SI, (CX) + LEAL -18(R11), SI + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm + +emit_copy1_repeat_match_nolit_encodeFastBlockAsm: + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL 57(SI), SI + MOVW SI, (CX) + ADDQ $0x02, CX + SUBL $0x12, R11 + + // emitRepeat + LEAL -1(R11), SI + CMPL R11, $0x1d + JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm + LEAL -30(R11), SI + CMPL R11, $0x0000011e + JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm + CMPL R11, $0x0001001e + JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm + +repeat_three_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm + +repeat_two_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm + +repeat_one_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm: + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm + +two_byte_match_nolit_encodeFastBlockAsm: + // emitCopy2 + 
LEAL -64(SI), SI + LEAL -4(R11), R11 + MOVW SI, 1(CX) + CMPL R11, $0x3c + JBE emit_copy2_0_match_nolit_encodeFastBlockAsm_emit2 + LEAL -60(R11), SI + CMPL R11, $0x0000013c + JB emit_copy2_1_match_nolit_encodeFastBlockAsm_emit2 + CMPL R11, $0x0001003c + JB emit_copy2_2_match_nolit_encodeFastBlockAsm_emit2 + MOVB $0xfe, (CX) + MOVL SI, 3(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm + +emit_copy2_2_match_nolit_encodeFastBlockAsm_emit2: + MOVB $0xfa, (CX) + MOVW SI, 3(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm + +emit_copy2_1_match_nolit_encodeFastBlockAsm_emit2: + MOVB $0xf6, (CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm + +emit_copy2_0_match_nolit_encodeFastBlockAsm_emit2: + MOVL $0x00000002, SI + LEAL (SI)(R11*4), SI + MOVB SI, (CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeFastBlockAsm: + CMPL DX, 8(SP) + JAE emit_remainder_encodeFastBlockAsm + MOVQ -2(BX)(DX*1), DI + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeFastBlockAsm + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeFastBlockAsm: + MOVQ $0xcf1bbcdcb7a56463, SI + MOVQ DI, R8 + MOVQ (BX)(DX*1), DI + MOVQ DI, R9 + IMULQ SI, R8 + SHRQ $0x32, R8 + IMULQ SI, R9 + SHRQ $0x32, R9 + LEAL -2(DX), R10 + MOVL (AX)(R9*4), SI + MOVL R10, (AX)(R8*4) + MOVL DX, (AX)(R9*4) + MOVL DX, R8 + INCL DX + LEAL -2162687(R8), R9 + CMPL SI, R9 + JA match_nolit_len_okencodeFastBlockAsm + JMP search_loop_encodeFastBlockAsm + +match_nolit_len_okencodeFastBlockAsm: + CMPQ (BX)(SI*1), DI + JNE search_loop_encodeFastBlockAsm + MOVL R8, DI + SUBL SI, DI + MOVL DI, 16(SP) + CMPQ CX, (SP) + JB dst_size_check_ok_4 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_4: + ADDL $0x07, DX + ADDL $0x08, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R8 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit2_encodeFastBlockAsm + PCALIGN $0x10 + +matchlen_loopback_16_match_nolit2_encodeFastBlockAsm: + MOVQ (R8)(R11*1), R9 + MOVQ 8(R8)(R11*1), R10 + XORQ (SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeFastBlockAsm + XORQ 8(SI)(R11*1), R10 + JNZ matchlen_bsf_16match_nolit2_encodeFastBlockAsm + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit2_encodeFastBlockAsm: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit2_encodeFastBlockAsm + JMP matchlen_match8_match_nolit2_encodeFastBlockAsm + PCALIGN $0x10 + +matchlen_bsf_16match_nolit2_encodeFastBlockAsm: + TZCNTQ R10, R10 + SARQ $0x03, R10 + LEAL 8(R11)(R10*1), R11 + JMP match_nolit2_end_encodeFastBlockAsm + +matchlen_match8_match_nolit2_encodeFastBlockAsm: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit2_encodeFastBlockAsm + MOVQ (R8)(R11*1), R9 + XORQ (SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeFastBlockAsm + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit2_encodeFastBlockAsm + PCALIGN $0x10 + +matchlen_bsf_8_match_nolit2_encodeFastBlockAsm: + TZCNTQ R9, R9 + SARQ $0x03, R9 + ADDL R9, R11 + JMP match_nolit2_end_encodeFastBlockAsm + +matchlen_match4_match_nolit2_encodeFastBlockAsm: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit2_encodeFastBlockAsm + MOVL (R8)(R11*1), R9 + CMPL (SI)(R11*1), R9 + JNE matchlen_match2_match_nolit2_encodeFastBlockAsm + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit2_encodeFastBlockAsm: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit2_encodeFastBlockAsm + JB match_nolit2_end_encodeFastBlockAsm + MOVW (R8)(R11*1), R9 + CMPW 
(SI)(R11*1), R9 + JNE matchlen_match1_match_nolit2_encodeFastBlockAsm + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit2_end_encodeFastBlockAsm + +matchlen_match1_match_nolit2_encodeFastBlockAsm: + MOVB (R8)(R11*1), R9 + CMPB (SI)(R11*1), R9 + JNE match_nolit2_end_encodeFastBlockAsm + LEAL 1(R11), R11 + JMP match_nolit2_end_encodeFastBlockAsm + PCALIGN $0x10 + +match_nolit2_end_encodeFastBlockAsm: + ADDL R11, DX + ADDL $0x08, R11 + MOVL DX, 12(SP) + MOVL 16(SP), SI + JMP match_nolits_copy_encodeFastBlockAsm + +emit_remainder_encodeFastBlockAsm: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + SUBL DX, AX + JZ emit_remainder_end_encodeFastBlockAsm + LEAQ (BX)(DX*1), DX + LEAQ 4(CX)(AX*1), BX + CMPQ BX, (SP) + JB dst_size_check_ok_5 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_5: + // emitLiteral + LEAL -1(AX), BX + CMPL BX, $0x1d + JB one_byte_emit_remainder_encodeFastBlockAsm + SUBL $0x1d, BX + CMPL BX, $0x00000100 + JB two_bytes_emit_remainder_encodeFastBlockAsm + CMPL BX, $0x00010000 + JB three_bytes_emit_remainder_encodeFastBlockAsm + MOVL BX, SI + SHRL $0x10, SI + MOVB $0xf8, (CX) + MOVW BX, 1(CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, BX + JMP memmove_long_emit_remainder_encodeFastBlockAsm + +three_bytes_emit_remainder_encodeFastBlockAsm: + MOVB $0xf0, (CX) + MOVW BX, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, BX + JMP memmove_long_emit_remainder_encodeFastBlockAsm + +two_bytes_emit_remainder_encodeFastBlockAsm: + MOVB $0xe8, (CX) + MOVB BL, 1(CX) + ADDL $0x1d, BX + ADDQ $0x02, CX + CMPL BX, $0x40 + JB memmove_midemit_remainder_encodeFastBlockAsm + JMP memmove_long_emit_remainder_encodeFastBlockAsm + +one_byte_emit_remainder_encodeFastBlockAsm: + SHLB $0x03, BL + MOVB BL, (CX) + ADDQ $0x01, CX + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 1 + CMPQ AX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeFastBlockAsm_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeFastBlockAsm_memmove_move_3 + CMPQ AX, $0x08 + JBE emit_lit_memmove_emit_remainder_encodeFastBlockAsm_memmove_move_4through8 + CMPQ AX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeFastBlockAsm_memmove_move_8through16 + CMPQ AX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeFastBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeFastBlockAsm_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm_memmove_move_1or2: + MOVB (DX), SI + MOVB -1(DX)(AX*1), DL + MOVB SI, (CX) + MOVB DL, -1(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm_memmove_move_3: + MOVW (DX), SI + MOVB 2(DX), DL + MOVW SI, (CX) + MOVB DL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm_memmove_move_4through8: + MOVL (DX), SI + MOVL -4(DX)(AX*1), DX + MOVL SI, (CX) + MOVL DX, -4(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm + PCALIGN $0x10 + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm_memmove_move_8through16: + MOVQ (DX), SI + MOVQ -8(DX)(AX*1), DX + MOVQ SI, (CX) + MOVQ DX, -8(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU 
-32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + +memmove_end_copy_emit_remainder_encodeFastBlockAsm: + MOVQ BX, CX + JMP emit_remainder_end_encodeFastBlockAsm + +memmove_midemit_remainder_encodeFastBlockAsm: + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 30 + CMPQ AX, $0x20 + JBE emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm_memmove_move_17through32 + JMP emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm_memmove_move_33through64 + +emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_mid_end_copy_emit_remainder_encodeFastBlockAsm + +emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + +memmove_mid_end_copy_emit_remainder_encodeFastBlockAsm: + MOVQ BX, CX + JMP emit_remainder_end_encodeFastBlockAsm + +memmove_long_emit_remainder_encodeFastBlockAsm: + LEAQ (CX)(AX*1), BX + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVQ AX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeFastBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(DX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + PCALIGN $0x10 + +emit_lit_memmove_long_emit_remainder_encodeFastBlockAsmlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeFastBlockAsmlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeFastBlockAsmlarge_forward_sse_loop_32: + MOVOU -32(DX)(R8*1), X4 + MOVOU -16(DX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ AX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeFastBlockAsmlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + MOVQ BX, CX + +emit_remainder_end_encodeFastBlockAsm: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeFastBlockAsm2MB(dst []byte, src []byte, tmp *[32768]byte) int +// Requires: BMI, SSE2 +TEXT ·encodeFastBlockAsm2MB(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000100, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeFastBlockAsm2MB: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeFastBlockAsm2MB + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -17(DX), BX + LEAQ -17(DX), SI + MOVL SI, 8(SP) + SHRQ $0x03, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeFastBlockAsm2MB: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeFastBlockAsm2MB + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0xcf1bbcdcb7a56463, R9 + MOVQ DI, R8 + IMULQ R9, R8 + SHRQ $0x33, R8 + MOVQ 1(BX)(DX*1), R10 + IMULQ R9, R10 + SHRQ $0x33, R10 + MOVL (AX)(R8*4), SI + MOVL DX, 
(AX)(R8*4) + MOVL (AX)(R10*4), R8 + MOVL DX, (AX)(R10*4) + MOVQ 2(BX)(DX*1), R10 + IMULQ R9, R10 + SHRQ $0x33, R10 + MOVL DX, R9 + SUBL 16(SP), R9 + MOVL 1(BX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeFastBlockAsm2MB + LEAL 1(DX), DI + MOVL 12(SP), SI + MOVL DI, SI + MOVL 12(SP), R8 + SUBL R8, SI + LEAQ 4(CX)(SI*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_1 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_1: + LEAQ (BX)(R8*1), R8 + + // emitLiteral + LEAL -1(SI), R9 + CMPL R9, $0x1d + JB one_byte_repeat_emit_lits_encodeFastBlockAsm2MB + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_repeat_emit_lits_encodeFastBlockAsm2MB + CMPL R9, $0x00010000 + JB three_bytes_repeat_emit_lits_encodeFastBlockAsm2MB + MOVL R9, R10 + SHRL $0x10, R10 + MOVB $0xf8, (CX) + MOVW R9, 1(CX) + MOVB R10, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, R9 + JMP memmove_long_repeat_emit_lits_encodeFastBlockAsm2MB + +three_bytes_repeat_emit_lits_encodeFastBlockAsm2MB: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_repeat_emit_lits_encodeFastBlockAsm2MB + +two_bytes_repeat_emit_lits_encodeFastBlockAsm2MB: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midrepeat_emit_lits_encodeFastBlockAsm2MB + JMP memmove_long_repeat_emit_lits_encodeFastBlockAsm2MB + +one_byte_repeat_emit_lits_encodeFastBlockAsm2MB: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ SI, $0x10 + JBE emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm2MB_memmove_move_8through16 + CMPQ SI, $0x20 + JBE emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm2MB_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm2MB_memmove_move_33through64 + PCALIGN $0x10 + +emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm2MB_memmove_move_8through16: + MOVOU (R8), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_repeat_emit_lits_encodeFastBlockAsm2MB + +emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm2MB_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_end_copy_repeat_emit_lits_encodeFastBlockAsm2MB + +emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm2MB_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_end_copy_repeat_emit_lits_encodeFastBlockAsm2MB: + MOVQ R9, CX + JMP repeat_emit_lits_end_encodeFastBlockAsm2MB + +memmove_midrepeat_emit_lits_encodeFastBlockAsm2MB: + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ SI, $0x20 + JBE emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm2MB_memmove_move_17through32 + JMP emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm2MB_memmove_move_33through64 + +emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm2MB_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_mid_end_copy_repeat_emit_lits_encodeFastBlockAsm2MB + +emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm2MB_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + 
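The two fast variants differ mainly in table size: `encodeFastBlockAsm` shifts by `$0x32` (64-14) into its `*[65536]byte` table, while `encodeFastBlockAsm2MB` above shifts by `$0x33` (64-13) into `*[32768]byte`; both multiply the full 8-byte load by `0xcf1bbcdcb7a56463`. As a hedged sketch (hypothetical helper, not the generated source):

```go
// hashFast sketches the 8-byte hashing in the fast variants: a full
// 64-bit multiply keeping the top tableBits bits. Each table entry is
// a 4-byte position, so 14 bits -> 64 KiB and 13 bits -> 32 KiB.
func hashFast(u uint64, tableBits uint) uint32 {
	const prime = 0xcf1bbcdcb7a56463
	return uint32((u * prime) >> (64 - tableBits))
}
```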
+memmove_mid_end_copy_repeat_emit_lits_encodeFastBlockAsm2MB: + MOVQ R9, CX + JMP repeat_emit_lits_end_encodeFastBlockAsm2MB + +memmove_long_repeat_emit_lits_encodeFastBlockAsm2MB: + LEAQ (CX)(SI*1), R9 + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm2MBlarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + PCALIGN $0x10 + +emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm2MBlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm2MBlarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm2MBlarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ SI, R12 + JAE emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm2MBlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + MOVQ R9, CX + +repeat_emit_lits_end_encodeFastBlockAsm2MB: + ADDL $0x05, DX + MOVL DX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_repeat_extend_encodeFastBlockAsm2MB + PCALIGN $0x10 + +matchlen_loopback_16_repeat_extend_encodeFastBlockAsm2MB: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeFastBlockAsm2MB + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeFastBlockAsm2MB + LEAL -16(R8), R8 + LEAL 16(R11), R11 + +matchlen_loop_16_entry_repeat_extend_encodeFastBlockAsm2MB: + CMPL R8, $0x10 + JAE matchlen_loopback_16_repeat_extend_encodeFastBlockAsm2MB + JMP matchlen_match8_repeat_extend_encodeFastBlockAsm2MB + PCALIGN $0x10 + +matchlen_bsf_16repeat_extend_encodeFastBlockAsm2MB: + TZCNTQ R12, R12 + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeFastBlockAsm2MB + +matchlen_match8_repeat_extend_encodeFastBlockAsm2MB: + CMPL R8, $0x08 + JB matchlen_match4_repeat_extend_encodeFastBlockAsm2MB + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeFastBlockAsm2MB + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeFastBlockAsm2MB + PCALIGN $0x10 + +matchlen_bsf_8_repeat_extend_encodeFastBlockAsm2MB: + TZCNTQ R10, R10 + SARQ $0x03, R10 + ADDL R10, R11 + JMP repeat_extend_forward_end_encodeFastBlockAsm2MB + +matchlen_match4_repeat_extend_encodeFastBlockAsm2MB: + CMPL R8, $0x04 + JB matchlen_match2_repeat_extend_encodeFastBlockAsm2MB + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeFastBlockAsm2MB + LEAL -4(R8), R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeFastBlockAsm2MB: + CMPL R8, $0x01 + JE matchlen_match1_repeat_extend_encodeFastBlockAsm2MB + JB repeat_extend_forward_end_encodeFastBlockAsm2MB + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeFastBlockAsm2MB + LEAL 2(R11), R11 + SUBL $0x02, R8 + JZ repeat_extend_forward_end_encodeFastBlockAsm2MB + +matchlen_match1_repeat_extend_encodeFastBlockAsm2MB: + MOVB 
(R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeFastBlockAsm2MB + LEAL 1(R11), R11 + JMP repeat_extend_forward_end_encodeFastBlockAsm2MB + PCALIGN $0x10 + +repeat_extend_forward_end_encodeFastBlockAsm2MB: + ADDL R11, DX + MOVL DX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitRepeat + LEAL -1(SI), DI + CMPL SI, $0x1d + JBE repeat_one_match_repeat_encodeFastBlockAsm2MB + LEAL -30(SI), DI + CMPL SI, $0x0000011e + JB repeat_two_match_repeat_encodeFastBlockAsm2MB + CMPL SI, $0x0001001e + JB repeat_three_match_repeat_encodeFastBlockAsm2MB + MOVB $0xfc, (CX) + MOVL DI, 1(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeFastBlockAsm2MB + +repeat_three_match_repeat_encodeFastBlockAsm2MB: + MOVB $0xf4, (CX) + MOVW DI, 1(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeFastBlockAsm2MB + +repeat_two_match_repeat_encodeFastBlockAsm2MB: + MOVB $0xec, (CX) + MOVB DI, 1(CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeFastBlockAsm2MB + +repeat_one_match_repeat_encodeFastBlockAsm2MB: + XORL DI, DI + LEAL -4(DI)(SI*8), DI + MOVB DI, (CX) + ADDQ $0x01, CX + +repeat_end_emit_encodeFastBlockAsm2MB: + MOVL DX, 12(SP) + JMP search_loop_encodeFastBlockAsm2MB + +no_repeat_found_encodeFastBlockAsm2MB: + CMPQ (BX)(SI*1), DI + JEQ candidate_match_encodeFastBlockAsm2MB + MOVQ 1(BX)(DX*1), DI + MOVL (AX)(R10*4), SI + LEAL 2(DX), R9 + CMPQ (BX)(R8*1), DI + JEQ candidate2_match_encodeFastBlockAsm2MB + MOVL R9, (AX)(R10*4) + MOVQ 2(BX)(DX*1), DI + CMPQ (BX)(SI*1), DI + JEQ candidate3_match_encodeFastBlockAsm2MB + MOVL 20(SP), DX + JMP search_loop_encodeFastBlockAsm2MB + +candidate3_match_encodeFastBlockAsm2MB: + ADDL $0x02, DX + JMP candidate_match_encodeFastBlockAsm2MB + +candidate2_match_encodeFastBlockAsm2MB: + MOVL R9, (AX)(R10*4) + INCL DX + MOVL R8, SI + +candidate_match_encodeFastBlockAsm2MB: + CMPQ CX, (SP) + JB dst_size_check_ok_2 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_2: + MOVL DX, R8 + MOVL DX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x08, DX + ADDL $0x08, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit_encodeFastBlockAsm2MB + PCALIGN $0x10 + +matchlen_loopback_16_match_nolit_encodeFastBlockAsm2MB: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeFastBlockAsm2MB + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeFastBlockAsm2MB + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit_encodeFastBlockAsm2MB: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit_encodeFastBlockAsm2MB + JMP matchlen_match8_match_nolit_encodeFastBlockAsm2MB + PCALIGN $0x10 + +matchlen_bsf_16match_nolit_encodeFastBlockAsm2MB: + TZCNTQ R12, R12 + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeFastBlockAsm2MB + +matchlen_match8_match_nolit_encodeFastBlockAsm2MB: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeFastBlockAsm2MB + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeFastBlockAsm2MB + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeFastBlockAsm2MB + PCALIGN $0x10 + +matchlen_bsf_8_match_nolit_encodeFastBlockAsm2MB: + TZCNTQ R10, R10 + SARQ $0x03, R10 + ADDL R10, R11 + JMP match_nolit_end_encodeFastBlockAsm2MB + +matchlen_match4_match_nolit_encodeFastBlockAsm2MB: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeFastBlockAsm2MB + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), 
R10 + JNE matchlen_match2_match_nolit_encodeFastBlockAsm2MB + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeFastBlockAsm2MB: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeFastBlockAsm2MB + JB match_nolit_end_encodeFastBlockAsm2MB + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeFastBlockAsm2MB + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeFastBlockAsm2MB + +matchlen_match1_match_nolit_encodeFastBlockAsm2MB: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE match_nolit_end_encodeFastBlockAsm2MB + LEAL 1(R11), R11 + JMP match_nolit_end_encodeFastBlockAsm2MB + PCALIGN $0x10 + +match_nolit_end_encodeFastBlockAsm2MB: + ADDL R11, DX + ADDL $0x08, R11 + MOVL 16(SP), SI + MOVL 12(SP), DI + MOVL DX, 12(SP) + SUBL DI, R8 + JZ match_nolits_copy_encodeFastBlockAsm2MB + LEAQ (BX)(DI*1), DI + LEAQ 4(CX)(R8*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_3 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_3: + // emitLiteral + LEAL -1(R8), R9 + CMPL R9, $0x1d + JB one_byte_match_emit_encodeFastBlockAsm2MB + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_match_emit_encodeFastBlockAsm2MB + CMPL R9, $0x00010000 + JB three_bytes_match_emit_encodeFastBlockAsm2MB + MOVL R9, R10 + SHRL $0x10, R10 + MOVB $0xf8, (CX) + MOVW R9, 1(CX) + MOVB R10, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeFastBlockAsm2MB + +three_bytes_match_emit_encodeFastBlockAsm2MB: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeFastBlockAsm2MB + +two_bytes_match_emit_encodeFastBlockAsm2MB: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midmatch_emit_encodeFastBlockAsm2MB + JMP memmove_long_match_emit_encodeFastBlockAsm2MB + +one_byte_match_emit_encodeFastBlockAsm2MB: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(R8*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeFastBlockAsm2MB_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeFastBlockAsm2MB_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeFastBlockAsm2MB_memmove_move_33through64 + PCALIGN $0x10 + +emit_lit_memmove_match_emit_encodeFastBlockAsm2MB_memmove_move_8through16: + MOVOU (DI), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_match_emit_encodeFastBlockAsm2MB + +emit_lit_memmove_match_emit_encodeFastBlockAsm2MB_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_end_copy_match_emit_encodeFastBlockAsm2MB + +emit_lit_memmove_match_emit_encodeFastBlockAsm2MB_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_end_copy_match_emit_encodeFastBlockAsm2MB: + MOVQ R9, CX + JMP match_nolits_copy_encodeFastBlockAsm2MB + +memmove_midmatch_emit_encodeFastBlockAsm2MB: + LEAQ (CX)(R8*1), R9 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ R8, $0x20 + JBE emit_lit_memmove_mid_match_emit_encodeFastBlockAsm2MB_memmove_move_17through32 + JMP emit_lit_memmove_mid_match_emit_encodeFastBlockAsm2MB_memmove_move_33through64 + +emit_lit_memmove_mid_match_emit_encodeFastBlockAsm2MB_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP 
memmove_mid_end_copy_match_emit_encodeFastBlockAsm2MB + +emit_lit_memmove_mid_match_emit_encodeFastBlockAsm2MB_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_mid_end_copy_match_emit_encodeFastBlockAsm2MB: + MOVQ R9, CX + JMP match_nolits_copy_encodeFastBlockAsm2MB + +memmove_long_match_emit_encodeFastBlockAsm2MB: + LEAQ (CX)(R8*1), R9 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeFastBlockAsm2MBlarge_forward_sse_loop_32 + LEAQ -32(DI)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + PCALIGN $0x10 + +emit_lit_memmove_long_match_emit_encodeFastBlockAsm2MBlarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_encodeFastBlockAsm2MBlarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeFastBlockAsm2MBlarge_forward_sse_loop_32: + MOVOU -32(DI)(R13*1), X4 + MOVOU -16(DI)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_encodeFastBlockAsm2MBlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + MOVQ R9, CX + JMP match_nolits_copy_encodeFastBlockAsm2MB + PCALIGN $0x10 + +match_nolits_copy_encodeFastBlockAsm2MB: + // emitCopy + CMPL SI, $0x0001003f + JBE two_byte_offset_match_nolit_encodeFastBlockAsm2MB + + // emitCopy3 + LEAL -4(R11), R11 + LEAL -65536(SI), SI + SHLL $0x0b, SI + ADDL $0x07, SI + CMPL R11, $0x3c + JBE emit_copy3_0_match_nolit_encodeFastBlockAsm2MB_emit3 + LEAL -60(R11), DI + CMPL R11, $0x0000013c + JB emit_copy3_1_match_nolit_encodeFastBlockAsm2MB_emit3 + CMPL R11, $0x0001003c + JB emit_copy3_2_match_nolit_encodeFastBlockAsm2MB_emit3 + ADDL $0x000007e0, SI + MOVL SI, (CX) + MOVL DI, 4(CX) + ADDQ $0x07, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm2MB + +emit_copy3_2_match_nolit_encodeFastBlockAsm2MB_emit3: + ADDL $0x000007c0, SI + MOVL SI, (CX) + MOVW DI, 4(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm2MB + +emit_copy3_1_match_nolit_encodeFastBlockAsm2MB_emit3: + ADDL $0x000007a0, SI + MOVL SI, (CX) + MOVB DI, 4(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm2MB + +emit_copy3_0_match_nolit_encodeFastBlockAsm2MB_emit3: + SHLL $0x05, R11 + ORL R11, SI + MOVL SI, (CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm2MB + +two_byte_offset_match_nolit_encodeFastBlockAsm2MB: + CMPL SI, $0x00000400 + JA two_byte_match_nolit_encodeFastBlockAsm2MB + CMPL R11, $0x00000013 + JAE emit_one_longer_match_nolit_encodeFastBlockAsm2MB + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL -15(SI)(R11*4), SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm2MB + +emit_one_longer_match_nolit_encodeFastBlockAsm2MB: + CMPL R11, $0x00000112 + JAE emit_copy1_repeat_match_nolit_encodeFastBlockAsm2MB + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL 61(SI), SI + MOVW SI, (CX) + LEAL -18(R11), SI + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm2MB + +emit_copy1_repeat_match_nolit_encodeFastBlockAsm2MB: + LEAL 
-1(SI), SI + SHLL $0x06, SI + LEAL 57(SI), SI + MOVW SI, (CX) + ADDQ $0x02, CX + SUBL $0x12, R11 + + // emitRepeat + LEAL -1(R11), SI + CMPL R11, $0x1d + JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm2MB + LEAL -30(R11), SI + CMPL R11, $0x0000011e + JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm2MB + CMPL R11, $0x0001001e + JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm2MB + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm2MB + +repeat_three_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm2MB: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm2MB + +repeat_two_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm2MB: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm2MB + +repeat_one_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm2MB: + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm2MB + +two_byte_match_nolit_encodeFastBlockAsm2MB: + // emitCopy2 + LEAL -64(SI), SI + LEAL -4(R11), R11 + MOVW SI, 1(CX) + CMPL R11, $0x3c + JBE emit_copy2_0_match_nolit_encodeFastBlockAsm2MB_emit2 + LEAL -60(R11), SI + CMPL R11, $0x0000013c + JB emit_copy2_1_match_nolit_encodeFastBlockAsm2MB_emit2 + CMPL R11, $0x0001003c + JB emit_copy2_2_match_nolit_encodeFastBlockAsm2MB_emit2 + MOVB $0xfe, (CX) + MOVL SI, 3(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm2MB + +emit_copy2_2_match_nolit_encodeFastBlockAsm2MB_emit2: + MOVB $0xfa, (CX) + MOVW SI, 3(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm2MB + +emit_copy2_1_match_nolit_encodeFastBlockAsm2MB_emit2: + MOVB $0xf6, (CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm2MB + +emit_copy2_0_match_nolit_encodeFastBlockAsm2MB_emit2: + MOVL $0x00000002, SI + LEAL (SI)(R11*4), SI + MOVB SI, (CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeFastBlockAsm2MB: + CMPL DX, 8(SP) + JAE emit_remainder_encodeFastBlockAsm2MB + MOVQ -2(BX)(DX*1), DI + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeFastBlockAsm2MB + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeFastBlockAsm2MB: + MOVQ $0xcf1bbcdcb7a56463, SI + MOVQ DI, R8 + MOVQ (BX)(DX*1), DI + MOVQ DI, R9 + IMULQ SI, R8 + SHRQ $0x33, R8 + IMULQ SI, R9 + SHRQ $0x33, R9 + LEAL -2(DX), R10 + MOVL (AX)(R9*4), SI + MOVL R10, (AX)(R8*4) + MOVL DX, (AX)(R9*4) + MOVL DX, R8 + INCL DX + CMPQ (BX)(SI*1), DI + JNE search_loop_encodeFastBlockAsm2MB + MOVL R8, DI + SUBL SI, DI + MOVL DI, 16(SP) + CMPQ CX, (SP) + JB dst_size_check_ok_4 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_4: + ADDL $0x07, DX + ADDL $0x08, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R8 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit2_encodeFastBlockAsm2MB + PCALIGN $0x10 + +matchlen_loopback_16_match_nolit2_encodeFastBlockAsm2MB: + MOVQ (R8)(R11*1), R9 + MOVQ 8(R8)(R11*1), R10 + XORQ (SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeFastBlockAsm2MB + XORQ 8(SI)(R11*1), R10 + JNZ matchlen_bsf_16match_nolit2_encodeFastBlockAsm2MB + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit2_encodeFastBlockAsm2MB: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit2_encodeFastBlockAsm2MB + JMP matchlen_match8_match_nolit2_encodeFastBlockAsm2MB + PCALIGN $0x10 + 
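+	// matchLen mismatch handling: TZCNT on the XOR of the two words finds the first differing bit,
+	// and SARQ $3 converts that bit position into the number of matching bytes.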
+matchlen_bsf_16match_nolit2_encodeFastBlockAsm2MB: + TZCNTQ R10, R10 + SARQ $0x03, R10 + LEAL 8(R11)(R10*1), R11 + JMP match_nolit2_end_encodeFastBlockAsm2MB + +matchlen_match8_match_nolit2_encodeFastBlockAsm2MB: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit2_encodeFastBlockAsm2MB + MOVQ (R8)(R11*1), R9 + XORQ (SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeFastBlockAsm2MB + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit2_encodeFastBlockAsm2MB + PCALIGN $0x10 + +matchlen_bsf_8_match_nolit2_encodeFastBlockAsm2MB: + TZCNTQ R9, R9 + SARQ $0x03, R9 + ADDL R9, R11 + JMP match_nolit2_end_encodeFastBlockAsm2MB + +matchlen_match4_match_nolit2_encodeFastBlockAsm2MB: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit2_encodeFastBlockAsm2MB + MOVL (R8)(R11*1), R9 + CMPL (SI)(R11*1), R9 + JNE matchlen_match2_match_nolit2_encodeFastBlockAsm2MB + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit2_encodeFastBlockAsm2MB: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit2_encodeFastBlockAsm2MB + JB match_nolit2_end_encodeFastBlockAsm2MB + MOVW (R8)(R11*1), R9 + CMPW (SI)(R11*1), R9 + JNE matchlen_match1_match_nolit2_encodeFastBlockAsm2MB + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit2_end_encodeFastBlockAsm2MB + +matchlen_match1_match_nolit2_encodeFastBlockAsm2MB: + MOVB (R8)(R11*1), R9 + CMPB (SI)(R11*1), R9 + JNE match_nolit2_end_encodeFastBlockAsm2MB + LEAL 1(R11), R11 + JMP match_nolit2_end_encodeFastBlockAsm2MB + PCALIGN $0x10 + +match_nolit2_end_encodeFastBlockAsm2MB: + ADDL R11, DX + ADDL $0x08, R11 + MOVL DX, 12(SP) + MOVL 16(SP), SI + JMP match_nolits_copy_encodeFastBlockAsm2MB + +emit_remainder_encodeFastBlockAsm2MB: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + SUBL DX, AX + JZ emit_remainder_end_encodeFastBlockAsm2MB + LEAQ (BX)(DX*1), DX + LEAQ 4(CX)(AX*1), BX + CMPQ BX, (SP) + JB dst_size_check_ok_5 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_5: + // emitLiteral + LEAL -1(AX), BX + CMPL BX, $0x1d + JB one_byte_emit_remainder_encodeFastBlockAsm2MB + SUBL $0x1d, BX + CMPL BX, $0x00000100 + JB two_bytes_emit_remainder_encodeFastBlockAsm2MB + CMPL BX, $0x00010000 + JB three_bytes_emit_remainder_encodeFastBlockAsm2MB + MOVL BX, SI + SHRL $0x10, SI + MOVB $0xf8, (CX) + MOVW BX, 1(CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, BX + JMP memmove_long_emit_remainder_encodeFastBlockAsm2MB + +three_bytes_emit_remainder_encodeFastBlockAsm2MB: + MOVB $0xf0, (CX) + MOVW BX, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, BX + JMP memmove_long_emit_remainder_encodeFastBlockAsm2MB + +two_bytes_emit_remainder_encodeFastBlockAsm2MB: + MOVB $0xe8, (CX) + MOVB BL, 1(CX) + ADDL $0x1d, BX + ADDQ $0x02, CX + CMPL BX, $0x40 + JB memmove_midemit_remainder_encodeFastBlockAsm2MB + JMP memmove_long_emit_remainder_encodeFastBlockAsm2MB + +one_byte_emit_remainder_encodeFastBlockAsm2MB: + SHLB $0x03, BL + MOVB BL, (CX) + ADDQ $0x01, CX + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 1 + CMPQ AX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeFastBlockAsm2MB_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeFastBlockAsm2MB_memmove_move_3 + CMPQ AX, $0x08 + JBE emit_lit_memmove_emit_remainder_encodeFastBlockAsm2MB_memmove_move_4through8 + CMPQ AX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeFastBlockAsm2MB_memmove_move_8through16 + CMPQ AX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeFastBlockAsm2MB_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeFastBlockAsm2MB_memmove_move_33through64 + 
+emit_lit_memmove_emit_remainder_encodeFastBlockAsm2MB_memmove_move_1or2: + MOVB (DX), SI + MOVB -1(DX)(AX*1), DL + MOVB SI, (CX) + MOVB DL, -1(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm2MB + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm2MB_memmove_move_3: + MOVW (DX), SI + MOVB 2(DX), DL + MOVW SI, (CX) + MOVB DL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm2MB + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm2MB_memmove_move_4through8: + MOVL (DX), SI + MOVL -4(DX)(AX*1), DX + MOVL SI, (CX) + MOVL DX, -4(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm2MB + PCALIGN $0x10 + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm2MB_memmove_move_8through16: + MOVQ (DX), SI + MOVQ -8(DX)(AX*1), DX + MOVQ SI, (CX) + MOVQ DX, -8(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm2MB + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm2MB_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm2MB + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm2MB_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + +memmove_end_copy_emit_remainder_encodeFastBlockAsm2MB: + MOVQ BX, CX + JMP emit_remainder_end_encodeFastBlockAsm2MB + +memmove_midemit_remainder_encodeFastBlockAsm2MB: + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 30 + CMPQ AX, $0x20 + JBE emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm2MB_memmove_move_17through32 + JMP emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm2MB_memmove_move_33through64 + +emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm2MB_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_mid_end_copy_emit_remainder_encodeFastBlockAsm2MB + +emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm2MB_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + +memmove_mid_end_copy_emit_remainder_encodeFastBlockAsm2MB: + MOVQ BX, CX + JMP emit_remainder_end_encodeFastBlockAsm2MB + +memmove_long_emit_remainder_encodeFastBlockAsm2MB: + LEAQ (CX)(AX*1), BX + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVQ AX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm2MBlarge_forward_sse_loop_32 + LEAQ -32(DX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + PCALIGN $0x10 + +emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm2MBlarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm2MBlarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm2MBlarge_forward_sse_loop_32: + MOVOU -32(DX)(R8*1), X4 + MOVOU -16(DX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ AX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm2MBlarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + 
MOVOU X3, -16(CX)(AX*1) + MOVQ BX, CX + +emit_remainder_end_encodeFastBlockAsm2MB: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeFastBlockAsm512K(dst []byte, src []byte, tmp *[32768]byte) int +// Requires: BMI, SSE2 +TEXT ·encodeFastBlockAsm512K(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000100, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeFastBlockAsm512K: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeFastBlockAsm512K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -17(DX), BX + LEAQ -17(DX), SI + MOVL SI, 8(SP) + SHRQ $0x03, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeFastBlockAsm512K: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x05, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeFastBlockAsm512K + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0xcf1bbcdcb7a56463, R9 + MOVQ DI, R8 + IMULQ R9, R8 + SHRQ $0x33, R8 + MOVQ 1(BX)(DX*1), R10 + IMULQ R9, R10 + SHRQ $0x33, R10 + MOVL (AX)(R8*4), SI + MOVL DX, (AX)(R8*4) + MOVL (AX)(R10*4), R8 + MOVL DX, (AX)(R10*4) + MOVQ 2(BX)(DX*1), R10 + IMULQ R9, R10 + SHRQ $0x33, R10 + MOVL DX, R9 + SUBL 16(SP), R9 + MOVL 1(BX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeFastBlockAsm512K + LEAL 1(DX), DI + MOVL 12(SP), SI + MOVL DI, SI + MOVL 12(SP), R8 + SUBL R8, SI + LEAQ 4(CX)(SI*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_1 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_1: + LEAQ (BX)(R8*1), R8 + + // emitLiteral + LEAL -1(SI), R9 + CMPL R9, $0x1d + JB one_byte_repeat_emit_lits_encodeFastBlockAsm512K + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_repeat_emit_lits_encodeFastBlockAsm512K + CMPL R9, $0x00010000 + JB three_bytes_repeat_emit_lits_encodeFastBlockAsm512K + MOVL R9, R10 + SHRL $0x10, R10 + MOVB $0xf8, (CX) + MOVW R9, 1(CX) + MOVB R10, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, R9 + JMP memmove_long_repeat_emit_lits_encodeFastBlockAsm512K + +three_bytes_repeat_emit_lits_encodeFastBlockAsm512K: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_repeat_emit_lits_encodeFastBlockAsm512K + +two_bytes_repeat_emit_lits_encodeFastBlockAsm512K: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midrepeat_emit_lits_encodeFastBlockAsm512K + JMP memmove_long_repeat_emit_lits_encodeFastBlockAsm512K + +one_byte_repeat_emit_lits_encodeFastBlockAsm512K: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ SI, $0x10 + JBE emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm512K_memmove_move_8through16 + CMPQ SI, $0x20 + JBE emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm512K_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm512K_memmove_move_33through64 + PCALIGN $0x10 + +emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm512K_memmove_move_8through16: + MOVOU (R8), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_repeat_emit_lits_encodeFastBlockAsm512K + +emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm512K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP 
memmove_end_copy_repeat_emit_lits_encodeFastBlockAsm512K + +emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm512K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_end_copy_repeat_emit_lits_encodeFastBlockAsm512K: + MOVQ R9, CX + JMP repeat_emit_lits_end_encodeFastBlockAsm512K + +memmove_midrepeat_emit_lits_encodeFastBlockAsm512K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ SI, $0x20 + JBE emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm512K_memmove_move_17through32 + JMP emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm512K_memmove_move_33through64 + +emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm512K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_mid_end_copy_repeat_emit_lits_encodeFastBlockAsm512K + +emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm512K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_mid_end_copy_repeat_emit_lits_encodeFastBlockAsm512K: + MOVQ R9, CX + JMP repeat_emit_lits_end_encodeFastBlockAsm512K + +memmove_long_repeat_emit_lits_encodeFastBlockAsm512K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm512Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + PCALIGN $0x10 + +emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm512Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm512Klarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm512Klarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ SI, R12 + JAE emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm512Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + MOVQ R9, CX + +repeat_emit_lits_end_encodeFastBlockAsm512K: + ADDL $0x05, DX + MOVL DX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_repeat_extend_encodeFastBlockAsm512K + PCALIGN $0x10 + +matchlen_loopback_16_repeat_extend_encodeFastBlockAsm512K: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeFastBlockAsm512K + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeFastBlockAsm512K + LEAL -16(R8), R8 + LEAL 16(R11), R11 + +matchlen_loop_16_entry_repeat_extend_encodeFastBlockAsm512K: + CMPL R8, $0x10 + JAE matchlen_loopback_16_repeat_extend_encodeFastBlockAsm512K + JMP matchlen_match8_repeat_extend_encodeFastBlockAsm512K + PCALIGN $0x10 + +matchlen_bsf_16repeat_extend_encodeFastBlockAsm512K: + TZCNTQ R12, R12 + SARQ $0x03, R12 + 
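+	// fold in the 8 matching bytes from the first word plus the bytes matched in the second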
LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeFastBlockAsm512K + +matchlen_match8_repeat_extend_encodeFastBlockAsm512K: + CMPL R8, $0x08 + JB matchlen_match4_repeat_extend_encodeFastBlockAsm512K + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeFastBlockAsm512K + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeFastBlockAsm512K + PCALIGN $0x10 + +matchlen_bsf_8_repeat_extend_encodeFastBlockAsm512K: + TZCNTQ R10, R10 + SARQ $0x03, R10 + ADDL R10, R11 + JMP repeat_extend_forward_end_encodeFastBlockAsm512K + +matchlen_match4_repeat_extend_encodeFastBlockAsm512K: + CMPL R8, $0x04 + JB matchlen_match2_repeat_extend_encodeFastBlockAsm512K + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeFastBlockAsm512K + LEAL -4(R8), R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeFastBlockAsm512K: + CMPL R8, $0x01 + JE matchlen_match1_repeat_extend_encodeFastBlockAsm512K + JB repeat_extend_forward_end_encodeFastBlockAsm512K + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeFastBlockAsm512K + LEAL 2(R11), R11 + SUBL $0x02, R8 + JZ repeat_extend_forward_end_encodeFastBlockAsm512K + +matchlen_match1_repeat_extend_encodeFastBlockAsm512K: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeFastBlockAsm512K + LEAL 1(R11), R11 + JMP repeat_extend_forward_end_encodeFastBlockAsm512K + PCALIGN $0x10 + +repeat_extend_forward_end_encodeFastBlockAsm512K: + ADDL R11, DX + MOVL DX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitRepeat + LEAL -1(SI), DI + CMPL SI, $0x1d + JBE repeat_one_match_repeat_encodeFastBlockAsm512K + LEAL -30(SI), DI + CMPL SI, $0x0000011e + JB repeat_two_match_repeat_encodeFastBlockAsm512K + CMPL SI, $0x0001001e + JB repeat_three_match_repeat_encodeFastBlockAsm512K + MOVB $0xfc, (CX) + MOVL DI, 1(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeFastBlockAsm512K + +repeat_three_match_repeat_encodeFastBlockAsm512K: + MOVB $0xf4, (CX) + MOVW DI, 1(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeFastBlockAsm512K + +repeat_two_match_repeat_encodeFastBlockAsm512K: + MOVB $0xec, (CX) + MOVB DI, 1(CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeFastBlockAsm512K + +repeat_one_match_repeat_encodeFastBlockAsm512K: + XORL DI, DI + LEAL -4(DI)(SI*8), DI + MOVB DI, (CX) + ADDQ $0x01, CX + +repeat_end_emit_encodeFastBlockAsm512K: + MOVL DX, 12(SP) + JMP search_loop_encodeFastBlockAsm512K + +no_repeat_found_encodeFastBlockAsm512K: + CMPQ (BX)(SI*1), DI + JEQ candidate_match_encodeFastBlockAsm512K + MOVQ 1(BX)(DX*1), DI + MOVL (AX)(R10*4), SI + LEAL 2(DX), R9 + CMPQ (BX)(R8*1), DI + JEQ candidate2_match_encodeFastBlockAsm512K + MOVL R9, (AX)(R10*4) + MOVQ 2(BX)(DX*1), DI + CMPQ (BX)(SI*1), DI + JEQ candidate3_match_encodeFastBlockAsm512K + MOVL 20(SP), DX + JMP search_loop_encodeFastBlockAsm512K + +candidate3_match_encodeFastBlockAsm512K: + ADDL $0x02, DX + JMP candidate_match_encodeFastBlockAsm512K + +candidate2_match_encodeFastBlockAsm512K: + MOVL R9, (AX)(R10*4) + INCL DX + MOVL R8, SI + +candidate_match_encodeFastBlockAsm512K: + CMPQ CX, (SP) + JB dst_size_check_ok_2 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_2: + MOVL DX, R8 + MOVL DX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x08, DX + ADDL $0x08, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit_encodeFastBlockAsm512K + PCALIGN 
$0x10 + +matchlen_loopback_16_match_nolit_encodeFastBlockAsm512K: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeFastBlockAsm512K + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeFastBlockAsm512K + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit_encodeFastBlockAsm512K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit_encodeFastBlockAsm512K + JMP matchlen_match8_match_nolit_encodeFastBlockAsm512K + PCALIGN $0x10 + +matchlen_bsf_16match_nolit_encodeFastBlockAsm512K: + TZCNTQ R12, R12 + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeFastBlockAsm512K + +matchlen_match8_match_nolit_encodeFastBlockAsm512K: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeFastBlockAsm512K + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeFastBlockAsm512K + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeFastBlockAsm512K + PCALIGN $0x10 + +matchlen_bsf_8_match_nolit_encodeFastBlockAsm512K: + TZCNTQ R10, R10 + SARQ $0x03, R10 + ADDL R10, R11 + JMP match_nolit_end_encodeFastBlockAsm512K + +matchlen_match4_match_nolit_encodeFastBlockAsm512K: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeFastBlockAsm512K + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeFastBlockAsm512K + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeFastBlockAsm512K: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeFastBlockAsm512K + JB match_nolit_end_encodeFastBlockAsm512K + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeFastBlockAsm512K + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeFastBlockAsm512K + +matchlen_match1_match_nolit_encodeFastBlockAsm512K: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE match_nolit_end_encodeFastBlockAsm512K + LEAL 1(R11), R11 + JMP match_nolit_end_encodeFastBlockAsm512K + PCALIGN $0x10 + +match_nolit_end_encodeFastBlockAsm512K: + ADDL R11, DX + ADDL $0x08, R11 + MOVL 16(SP), SI + MOVL 12(SP), DI + MOVL DX, 12(SP) + SUBL DI, R8 + JZ match_nolits_copy_encodeFastBlockAsm512K + LEAQ (BX)(DI*1), DI + LEAQ 4(CX)(R8*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_3 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_3: + // emitLiteral + LEAL -1(R8), R9 + CMPL R9, $0x1d + JB one_byte_match_emit_encodeFastBlockAsm512K + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_match_emit_encodeFastBlockAsm512K + CMPL R9, $0x00010000 + JB three_bytes_match_emit_encodeFastBlockAsm512K + MOVL R9, R10 + SHRL $0x10, R10 + MOVB $0xf8, (CX) + MOVW R9, 1(CX) + MOVB R10, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeFastBlockAsm512K + +three_bytes_match_emit_encodeFastBlockAsm512K: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeFastBlockAsm512K + +two_bytes_match_emit_encodeFastBlockAsm512K: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midmatch_emit_encodeFastBlockAsm512K + JMP memmove_long_match_emit_encodeFastBlockAsm512K + +one_byte_match_emit_encodeFastBlockAsm512K: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(R8*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeFastBlockAsm512K_memmove_move_8through16 + CMPQ R8, $0x20 + JBE 
emit_lit_memmove_match_emit_encodeFastBlockAsm512K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeFastBlockAsm512K_memmove_move_33through64 + PCALIGN $0x10 + +emit_lit_memmove_match_emit_encodeFastBlockAsm512K_memmove_move_8through16: + MOVOU (DI), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_match_emit_encodeFastBlockAsm512K + +emit_lit_memmove_match_emit_encodeFastBlockAsm512K_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_end_copy_match_emit_encodeFastBlockAsm512K + +emit_lit_memmove_match_emit_encodeFastBlockAsm512K_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_end_copy_match_emit_encodeFastBlockAsm512K: + MOVQ R9, CX + JMP match_nolits_copy_encodeFastBlockAsm512K + +memmove_midmatch_emit_encodeFastBlockAsm512K: + LEAQ (CX)(R8*1), R9 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ R8, $0x20 + JBE emit_lit_memmove_mid_match_emit_encodeFastBlockAsm512K_memmove_move_17through32 + JMP emit_lit_memmove_mid_match_emit_encodeFastBlockAsm512K_memmove_move_33through64 + +emit_lit_memmove_mid_match_emit_encodeFastBlockAsm512K_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_mid_end_copy_match_emit_encodeFastBlockAsm512K + +emit_lit_memmove_mid_match_emit_encodeFastBlockAsm512K_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_mid_end_copy_match_emit_encodeFastBlockAsm512K: + MOVQ R9, CX + JMP match_nolits_copy_encodeFastBlockAsm512K + +memmove_long_match_emit_encodeFastBlockAsm512K: + LEAQ (CX)(R8*1), R9 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeFastBlockAsm512Klarge_forward_sse_loop_32 + LEAQ -32(DI)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + PCALIGN $0x10 + +emit_lit_memmove_long_match_emit_encodeFastBlockAsm512Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_encodeFastBlockAsm512Klarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeFastBlockAsm512Klarge_forward_sse_loop_32: + MOVOU -32(DI)(R13*1), X4 + MOVOU -16(DI)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_encodeFastBlockAsm512Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + MOVQ R9, CX + JMP match_nolits_copy_encodeFastBlockAsm512K + PCALIGN $0x10 + +match_nolits_copy_encodeFastBlockAsm512K: + // emitCopy + CMPL SI, $0x0001003f + JBE two_byte_offset_match_nolit_encodeFastBlockAsm512K + + // emitCopy3 + LEAL -4(R11), R11 + LEAL -65536(SI), SI + SHLL $0x0b, SI + ADDL $0x07, SI + CMPL R11, $0x3c + JBE emit_copy3_0_match_nolit_encodeFastBlockAsm512K_emit3 + LEAL -60(R11), DI + CMPL R11, $0x0000013c + JB emit_copy3_1_match_nolit_encodeFastBlockAsm512K_emit3 + CMPL R11, $0x0001003c + JB 
emit_copy3_2_match_nolit_encodeFastBlockAsm512K_emit3 + ADDL $0x000007e0, SI + MOVL SI, (CX) + MOVL DI, 4(CX) + ADDQ $0x07, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm512K + +emit_copy3_2_match_nolit_encodeFastBlockAsm512K_emit3: + ADDL $0x000007c0, SI + MOVL SI, (CX) + MOVW DI, 4(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm512K + +emit_copy3_1_match_nolit_encodeFastBlockAsm512K_emit3: + ADDL $0x000007a0, SI + MOVL SI, (CX) + MOVB DI, 4(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm512K + +emit_copy3_0_match_nolit_encodeFastBlockAsm512K_emit3: + SHLL $0x05, R11 + ORL R11, SI + MOVL SI, (CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm512K + +two_byte_offset_match_nolit_encodeFastBlockAsm512K: + CMPL SI, $0x00000400 + JA two_byte_match_nolit_encodeFastBlockAsm512K + CMPL R11, $0x00000013 + JAE emit_one_longer_match_nolit_encodeFastBlockAsm512K + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL -15(SI)(R11*4), SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm512K + +emit_one_longer_match_nolit_encodeFastBlockAsm512K: + CMPL R11, $0x00000112 + JAE emit_copy1_repeat_match_nolit_encodeFastBlockAsm512K + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL 61(SI), SI + MOVW SI, (CX) + LEAL -18(R11), SI + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm512K + +emit_copy1_repeat_match_nolit_encodeFastBlockAsm512K: + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL 57(SI), SI + MOVW SI, (CX) + ADDQ $0x02, CX + SUBL $0x12, R11 + + // emitRepeat + LEAL -1(R11), SI + CMPL R11, $0x1d + JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm512K + LEAL -30(R11), SI + CMPL R11, $0x0000011e + JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm512K + CMPL R11, $0x0001001e + JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm512K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm512K + +repeat_three_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm512K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm512K + +repeat_two_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm512K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm512K + +repeat_one_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm512K: + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm512K + +two_byte_match_nolit_encodeFastBlockAsm512K: + // emitCopy2 + LEAL -64(SI), SI + LEAL -4(R11), R11 + MOVW SI, 1(CX) + CMPL R11, $0x3c + JBE emit_copy2_0_match_nolit_encodeFastBlockAsm512K_emit2 + LEAL -60(R11), SI + CMPL R11, $0x0000013c + JB emit_copy2_1_match_nolit_encodeFastBlockAsm512K_emit2 + CMPL R11, $0x0001003c + JB emit_copy2_2_match_nolit_encodeFastBlockAsm512K_emit2 + MOVB $0xfe, (CX) + MOVL SI, 3(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm512K + +emit_copy2_2_match_nolit_encodeFastBlockAsm512K_emit2: + MOVB $0xfa, (CX) + MOVW SI, 3(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm512K + +emit_copy2_1_match_nolit_encodeFastBlockAsm512K_emit2: + MOVB $0xf6, (CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm512K + +emit_copy2_0_match_nolit_encodeFastBlockAsm512K_emit2: + MOVL $0x00000002, SI + LEAL (SI)(R11*4), SI + MOVB SI, (CX) + ADDQ $0x03, CX + 
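+	// After emitting a copy, the encoder re-hashes at the match end and immediately tries to
+	// chain another match before falling back to the search loop.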
+match_nolit_emitcopy_end_encodeFastBlockAsm512K: + CMPL DX, 8(SP) + JAE emit_remainder_encodeFastBlockAsm512K + MOVQ -2(BX)(DX*1), DI + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeFastBlockAsm512K + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeFastBlockAsm512K: + MOVQ $0xcf1bbcdcb7a56463, SI + MOVQ DI, R8 + MOVQ (BX)(DX*1), DI + MOVQ DI, R9 + IMULQ SI, R8 + SHRQ $0x33, R8 + IMULQ SI, R9 + SHRQ $0x33, R9 + LEAL -2(DX), R10 + MOVL (AX)(R9*4), SI + MOVL R10, (AX)(R8*4) + MOVL DX, (AX)(R9*4) + MOVL DX, R8 + INCL DX + CMPQ (BX)(SI*1), DI + JNE search_loop_encodeFastBlockAsm512K + MOVL R8, DI + SUBL SI, DI + MOVL DI, 16(SP) + CMPQ CX, (SP) + JB dst_size_check_ok_4 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_4: + ADDL $0x07, DX + ADDL $0x08, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R8 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit2_encodeFastBlockAsm512K + PCALIGN $0x10 + +matchlen_loopback_16_match_nolit2_encodeFastBlockAsm512K: + MOVQ (R8)(R11*1), R9 + MOVQ 8(R8)(R11*1), R10 + XORQ (SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeFastBlockAsm512K + XORQ 8(SI)(R11*1), R10 + JNZ matchlen_bsf_16match_nolit2_encodeFastBlockAsm512K + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit2_encodeFastBlockAsm512K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit2_encodeFastBlockAsm512K + JMP matchlen_match8_match_nolit2_encodeFastBlockAsm512K + PCALIGN $0x10 + +matchlen_bsf_16match_nolit2_encodeFastBlockAsm512K: + TZCNTQ R10, R10 + SARQ $0x03, R10 + LEAL 8(R11)(R10*1), R11 + JMP match_nolit2_end_encodeFastBlockAsm512K + +matchlen_match8_match_nolit2_encodeFastBlockAsm512K: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit2_encodeFastBlockAsm512K + MOVQ (R8)(R11*1), R9 + XORQ (SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeFastBlockAsm512K + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit2_encodeFastBlockAsm512K + PCALIGN $0x10 + +matchlen_bsf_8_match_nolit2_encodeFastBlockAsm512K: + TZCNTQ R9, R9 + SARQ $0x03, R9 + ADDL R9, R11 + JMP match_nolit2_end_encodeFastBlockAsm512K + +matchlen_match4_match_nolit2_encodeFastBlockAsm512K: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit2_encodeFastBlockAsm512K + MOVL (R8)(R11*1), R9 + CMPL (SI)(R11*1), R9 + JNE matchlen_match2_match_nolit2_encodeFastBlockAsm512K + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit2_encodeFastBlockAsm512K: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit2_encodeFastBlockAsm512K + JB match_nolit2_end_encodeFastBlockAsm512K + MOVW (R8)(R11*1), R9 + CMPW (SI)(R11*1), R9 + JNE matchlen_match1_match_nolit2_encodeFastBlockAsm512K + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit2_end_encodeFastBlockAsm512K + +matchlen_match1_match_nolit2_encodeFastBlockAsm512K: + MOVB (R8)(R11*1), R9 + CMPB (SI)(R11*1), R9 + JNE match_nolit2_end_encodeFastBlockAsm512K + LEAL 1(R11), R11 + JMP match_nolit2_end_encodeFastBlockAsm512K + PCALIGN $0x10 + +match_nolit2_end_encodeFastBlockAsm512K: + ADDL R11, DX + ADDL $0x08, R11 + MOVL DX, 12(SP) + MOVL 16(SP), SI + JMP match_nolits_copy_encodeFastBlockAsm512K + +emit_remainder_encodeFastBlockAsm512K: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + SUBL DX, AX + JZ emit_remainder_end_encodeFastBlockAsm512K + LEAQ (BX)(DX*1), DX + LEAQ 4(CX)(AX*1), BX + CMPQ BX, (SP) + JB dst_size_check_ok_5 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_5: + // emitLiteral + LEAL -1(AX), BX + CMPL BX, $0x1d + JB 
one_byte_emit_remainder_encodeFastBlockAsm512K + SUBL $0x1d, BX + CMPL BX, $0x00000100 + JB two_bytes_emit_remainder_encodeFastBlockAsm512K + CMPL BX, $0x00010000 + JB three_bytes_emit_remainder_encodeFastBlockAsm512K + MOVL BX, SI + SHRL $0x10, SI + MOVB $0xf8, (CX) + MOVW BX, 1(CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + ADDL $0x1d, BX + JMP memmove_long_emit_remainder_encodeFastBlockAsm512K + +three_bytes_emit_remainder_encodeFastBlockAsm512K: + MOVB $0xf0, (CX) + MOVW BX, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, BX + JMP memmove_long_emit_remainder_encodeFastBlockAsm512K + +two_bytes_emit_remainder_encodeFastBlockAsm512K: + MOVB $0xe8, (CX) + MOVB BL, 1(CX) + ADDL $0x1d, BX + ADDQ $0x02, CX + CMPL BX, $0x40 + JB memmove_midemit_remainder_encodeFastBlockAsm512K + JMP memmove_long_emit_remainder_encodeFastBlockAsm512K + +one_byte_emit_remainder_encodeFastBlockAsm512K: + SHLB $0x03, BL + MOVB BL, (CX) + ADDQ $0x01, CX + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 1 + CMPQ AX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeFastBlockAsm512K_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeFastBlockAsm512K_memmove_move_3 + CMPQ AX, $0x08 + JBE emit_lit_memmove_emit_remainder_encodeFastBlockAsm512K_memmove_move_4through8 + CMPQ AX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeFastBlockAsm512K_memmove_move_8through16 + CMPQ AX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeFastBlockAsm512K_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeFastBlockAsm512K_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm512K_memmove_move_1or2: + MOVB (DX), SI + MOVB -1(DX)(AX*1), DL + MOVB SI, (CX) + MOVB DL, -1(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm512K + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm512K_memmove_move_3: + MOVW (DX), SI + MOVB 2(DX), DL + MOVW SI, (CX) + MOVB DL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm512K + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm512K_memmove_move_4through8: + MOVL (DX), SI + MOVL -4(DX)(AX*1), DX + MOVL SI, (CX) + MOVL DX, -4(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm512K + PCALIGN $0x10 + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm512K_memmove_move_8through16: + MOVQ (DX), SI + MOVQ -8(DX)(AX*1), DX + MOVQ SI, (CX) + MOVQ DX, -8(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm512K + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm512K_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm512K + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm512K_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + +memmove_end_copy_emit_remainder_encodeFastBlockAsm512K: + MOVQ BX, CX + JMP emit_remainder_end_encodeFastBlockAsm512K + +memmove_midemit_remainder_encodeFastBlockAsm512K: + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 30 + CMPQ AX, $0x20 + JBE emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm512K_memmove_move_17through32 + JMP emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm512K_memmove_move_33through64 + +emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm512K_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + 
JMP memmove_mid_end_copy_emit_remainder_encodeFastBlockAsm512K + +emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm512K_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + +memmove_mid_end_copy_emit_remainder_encodeFastBlockAsm512K: + MOVQ BX, CX + JMP emit_remainder_end_encodeFastBlockAsm512K + +memmove_long_emit_remainder_encodeFastBlockAsm512K: + LEAQ (CX)(AX*1), BX + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVQ AX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm512Klarge_forward_sse_loop_32 + LEAQ -32(DX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + PCALIGN $0x10 + +emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm512Klarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm512Klarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm512Klarge_forward_sse_loop_32: + MOVOU -32(DX)(R8*1), X4 + MOVOU -16(DX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ AX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm512Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + MOVQ BX, CX + +emit_remainder_end_encodeFastBlockAsm512K: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeFastBlockAsm64K(dst []byte, src []byte, tmp *[8192]byte) int +// Requires: BMI, SSE2 +TEXT ·encodeFastBlockAsm64K(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000040, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeFastBlockAsm64K: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeFastBlockAsm64K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -17(DX), BX + LEAQ -17(DX), SI + MOVL SI, 8(SP) + SHRQ $0x03, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeFastBlockAsm64K: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x04, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeFastBlockAsm64K + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0xcf1bbcdcb7a56463, R9 + MOVQ DI, R8 + IMULQ R9, R8 + SHRQ $0x34, R8 + MOVQ 1(BX)(DX*1), R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + MOVWLZX (AX)(R8*2), SI + MOVW DX, (AX)(R8*2) + MOVWLZX (AX)(R10*2), R8 + MOVW DX, (AX)(R10*2) + MOVQ 2(BX)(DX*1), R10 + IMULQ R9, R10 + SHRQ $0x34, R10 + MOVL DX, R9 + SUBL 16(SP), R9 + MOVL 1(BX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeFastBlockAsm64K + LEAL 1(DX), DI + MOVL 12(SP), SI + MOVL DI, SI + MOVL 12(SP), R8 + SUBL R8, SI + LEAQ 4(CX)(SI*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_1 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_1: + LEAQ (BX)(R8*1), R8 + + // emitLiteral + LEAL -1(SI), R9 + CMPL R9, $0x1d + JB one_byte_repeat_emit_lits_encodeFastBlockAsm64K + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_repeat_emit_lits_encodeFastBlockAsm64K 
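+	CMPL R9, $0x00010000 // lengths that fit in 16 bits take the three-byte literal header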
+	CMPL R9, $0x00010000
+	JB three_bytes_repeat_emit_lits_encodeFastBlockAsm64K
+	MOVL R9, R10
+	SHRL $0x10, R10
+	MOVB $0xf8, (CX)
+	MOVW R9, 1(CX)
+	MOVB R10, 3(CX)
+	ADDQ $0x04, CX
+	ADDL $0x1d, R9
+	JMP memmove_long_repeat_emit_lits_encodeFastBlockAsm64K
+
+three_bytes_repeat_emit_lits_encodeFastBlockAsm64K:
+	MOVB $0xf0, (CX)
+	MOVW R9, 1(CX)
+	ADDQ $0x03, CX
+	ADDL $0x1d, R9
+	JMP memmove_long_repeat_emit_lits_encodeFastBlockAsm64K
+
+two_bytes_repeat_emit_lits_encodeFastBlockAsm64K:
+	MOVB $0xe8, (CX)
+	MOVB R9, 1(CX)
+	ADDL $0x1d, R9
+	ADDQ $0x02, CX
+	CMPL R9, $0x40
+	JB memmove_midrepeat_emit_lits_encodeFastBlockAsm64K
+	JMP memmove_long_repeat_emit_lits_encodeFastBlockAsm64K
+
+one_byte_repeat_emit_lits_encodeFastBlockAsm64K:
+	SHLB $0x03, R9
+	MOVB R9, (CX)
+	ADDQ $0x01, CX
+	LEAQ (CX)(SI*1), R9
+
+	// genMemMoveShort
+	// margin: 16, min move: 1
+	CMPQ SI, $0x10
+	JBE emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm64K_memmove_move_8through16
+	CMPQ SI, $0x20
+	JBE emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm64K_memmove_move_17through32
+	JMP emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm64K_memmove_move_33through64
+	PCALIGN $0x10
+
+emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm64K_memmove_move_8through16:
+	MOVOU (R8), X0
+	MOVOU X0, (CX)
+	JMP memmove_end_copy_repeat_emit_lits_encodeFastBlockAsm64K
+
+emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm64K_memmove_move_17through32:
+	MOVOU (R8), X0
+	MOVOU -16(R8)(SI*1), X1
+	MOVOU X0, (CX)
+	MOVOU X1, -16(CX)(SI*1)
+	JMP memmove_end_copy_repeat_emit_lits_encodeFastBlockAsm64K
+
+emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm64K_memmove_move_33through64:
+	MOVOU (R8), X0
+	MOVOU 16(R8), X1
+	MOVOU -32(R8)(SI*1), X2
+	MOVOU -16(R8)(SI*1), X3
+	MOVOU X0, (CX)
+	MOVOU X1, 16(CX)
+	MOVOU X2, -32(CX)(SI*1)
+	MOVOU X3, -16(CX)(SI*1)
+
+memmove_end_copy_repeat_emit_lits_encodeFastBlockAsm64K:
+	MOVQ R9, CX
+	JMP repeat_emit_lits_end_encodeFastBlockAsm64K
+
+memmove_midrepeat_emit_lits_encodeFastBlockAsm64K:
+	LEAQ (CX)(SI*1), R9
+
+	// genMemMoveShort
+	// margin: 15, min move: 30
+	CMPQ SI, $0x20
+	JBE emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm64K_memmove_move_17through32
+	JMP emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm64K_memmove_move_33through64
+
+emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm64K_memmove_move_17through32:
+	MOVOU (R8), X0
+	MOVOU -16(R8)(SI*1), X1
+	MOVOU X0, (CX)
+	MOVOU X1, -16(CX)(SI*1)
+	JMP memmove_mid_end_copy_repeat_emit_lits_encodeFastBlockAsm64K
+
+emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm64K_memmove_move_33through64:
+	MOVOU (R8), X0
+	MOVOU 16(R8), X1
+	MOVOU -32(R8)(SI*1), X2
+	MOVOU -16(R8)(SI*1), X3
+	MOVOU X0, (CX)
+	MOVOU X1, 16(CX)
+	MOVOU X2, -32(CX)(SI*1)
+	MOVOU X3, -16(CX)(SI*1)
+
+memmove_mid_end_copy_repeat_emit_lits_encodeFastBlockAsm64K:
+	MOVQ R9, CX
+	JMP repeat_emit_lits_end_encodeFastBlockAsm64K
+
+memmove_long_repeat_emit_lits_encodeFastBlockAsm64K:
+	LEAQ (CX)(SI*1), R9
+
+	// genMemMoveLong
+	MOVOU (R8), X0
+	MOVOU 16(R8), X1
+	MOVOU -32(R8)(SI*1), X2
+	MOVOU -16(R8)(SI*1), X3
+	MOVQ SI, R11
+	SHRQ $0x05, R11
+	MOVQ CX, R10
+	ANDL $0x0000001f, R10
+	MOVQ $0x00000040, R12
+	SUBQ R10, R12
+	DECQ R11
+	JA emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm64Klarge_forward_sse_loop_32
+	LEAQ -32(R8)(R12*1), R10
+	LEAQ -32(CX)(R12*1), R13
+	PCALIGN $0x10
+
+emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm64Klarge_big_loop_back:
+	MOVOU (R10), X4
+	MOVOU 16(R10), X5
+	MOVOA X4, (R13)
+	MOVOA X5, 16(R13)
+	ADDQ $0x20, R13
+	ADDQ $0x20, R10
+	ADDQ $0x20, R12
+	DECQ R11
+	JNA emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm64Klarge_big_loop_back
+
+emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm64Klarge_forward_sse_loop_32:
+	MOVOU -32(R8)(R12*1), X4
+	MOVOU -16(R8)(R12*1), X5
+	MOVOA X4, -32(CX)(R12*1)
+	MOVOA X5, -16(CX)(R12*1)
+	ADDQ $0x20, R12
+	CMPQ SI, R12
+	JAE emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm64Klarge_forward_sse_loop_32
+	MOVOU X0, (CX)
+	MOVOU X1, 16(CX)
+	MOVOU X2, -32(CX)(SI*1)
+	MOVOU X3, -16(CX)(SI*1)
+	MOVQ R9, CX
+
+repeat_emit_lits_end_encodeFastBlockAsm64K:
+	ADDL $0x05, DX
+	MOVL DX, SI
+	SUBL 16(SP), SI
+	MOVQ src_len+32(FP), R8
+	SUBL DX, R8
+	LEAQ (BX)(DX*1), R9
+	LEAQ (BX)(SI*1), SI
+
+	// matchLen
+	XORL R11, R11
+	JMP matchlen_loop_16_entry_repeat_extend_encodeFastBlockAsm64K
+	PCALIGN $0x10
+
+matchlen_loopback_16_repeat_extend_encodeFastBlockAsm64K:
+	MOVQ (R9)(R11*1), R10
+	MOVQ 8(R9)(R11*1), R12
+	XORQ (SI)(R11*1), R10
+	JNZ matchlen_bsf_8_repeat_extend_encodeFastBlockAsm64K
+	XORQ 8(SI)(R11*1), R12
+	JNZ matchlen_bsf_16repeat_extend_encodeFastBlockAsm64K
+	LEAL -16(R8), R8
+	LEAL 16(R11), R11
+
+matchlen_loop_16_entry_repeat_extend_encodeFastBlockAsm64K:
+	CMPL R8, $0x10
+	JAE matchlen_loopback_16_repeat_extend_encodeFastBlockAsm64K
+	JMP matchlen_match8_repeat_extend_encodeFastBlockAsm64K
+	PCALIGN $0x10
+
+matchlen_bsf_16repeat_extend_encodeFastBlockAsm64K:
+	TZCNTQ R12, R12
+	SARQ $0x03, R12
+	LEAL 8(R11)(R12*1), R11
+	JMP repeat_extend_forward_end_encodeFastBlockAsm64K
+
+matchlen_match8_repeat_extend_encodeFastBlockAsm64K:
+	CMPL R8, $0x08
+	JB matchlen_match4_repeat_extend_encodeFastBlockAsm64K
+	MOVQ (R9)(R11*1), R10
+	XORQ (SI)(R11*1), R10
+	JNZ matchlen_bsf_8_repeat_extend_encodeFastBlockAsm64K
+	LEAL -8(R8), R8
+	LEAL 8(R11), R11
+	JMP matchlen_match4_repeat_extend_encodeFastBlockAsm64K
+	PCALIGN $0x10
+
+matchlen_bsf_8_repeat_extend_encodeFastBlockAsm64K:
+	TZCNTQ R10, R10
+	SARQ $0x03, R10
+	ADDL R10, R11
+	JMP repeat_extend_forward_end_encodeFastBlockAsm64K
+
+matchlen_match4_repeat_extend_encodeFastBlockAsm64K:
+	CMPL R8, $0x04
+	JB matchlen_match2_repeat_extend_encodeFastBlockAsm64K
+	MOVL (R9)(R11*1), R10
+	CMPL (SI)(R11*1), R10
+	JNE matchlen_match2_repeat_extend_encodeFastBlockAsm64K
+	LEAL -4(R8), R8
+	LEAL 4(R11), R11
+
+matchlen_match2_repeat_extend_encodeFastBlockAsm64K:
+	CMPL R8, $0x01
+	JE matchlen_match1_repeat_extend_encodeFastBlockAsm64K
+	JB repeat_extend_forward_end_encodeFastBlockAsm64K
+	MOVW (R9)(R11*1), R10
+	CMPW (SI)(R11*1), R10
+	JNE matchlen_match1_repeat_extend_encodeFastBlockAsm64K
+	LEAL 2(R11), R11
+	SUBL $0x02, R8
+	JZ repeat_extend_forward_end_encodeFastBlockAsm64K
+
+matchlen_match1_repeat_extend_encodeFastBlockAsm64K:
+	MOVB (R9)(R11*1), R10
+	CMPB (SI)(R11*1), R10
+	JNE repeat_extend_forward_end_encodeFastBlockAsm64K
+	LEAL 1(R11), R11
+	JMP repeat_extend_forward_end_encodeFastBlockAsm64K
+	PCALIGN $0x10
+
+repeat_extend_forward_end_encodeFastBlockAsm64K:
+	ADDL R11, DX
+	MOVL DX, SI
+	SUBL DI, SI
+	MOVL 16(SP), DI
+
+	// emitRepeat
+	LEAL -1(SI), DI
+	CMPL SI, $0x1d
+	JBE repeat_one_match_repeat_encodeFastBlockAsm64K
+	LEAL -30(SI), DI
+	CMPL SI, $0x0000011e
+	JB repeat_two_match_repeat_encodeFastBlockAsm64K
+	CMPL SI, $0x0001001e
+	JB repeat_three_match_repeat_encodeFastBlockAsm64K
+	MOVB $0xfc, (CX)
+	MOVL DI, 1(CX)
+	ADDQ $0x04, CX
+	JMP repeat_end_emit_encodeFastBlockAsm64K
+
+repeat_three_match_repeat_encodeFastBlockAsm64K:
+	MOVB $0xf4, (CX)
+	MOVW DI, 1(CX)
+	ADDQ $0x03, CX
+	JMP repeat_end_emit_encodeFastBlockAsm64K
+
+repeat_two_match_repeat_encodeFastBlockAsm64K:
+	MOVB $0xec, (CX)
+	MOVB DI, 1(CX)
+	ADDQ $0x02, CX
+	JMP repeat_end_emit_encodeFastBlockAsm64K
+
+repeat_one_match_repeat_encodeFastBlockAsm64K:
+	XORL DI, DI
+	LEAL -4(DI)(SI*8), DI
+	MOVB DI, (CX)
+	ADDQ $0x01, CX
+
+repeat_end_emit_encodeFastBlockAsm64K:
+	MOVL DX, 12(SP)
+	JMP search_loop_encodeFastBlockAsm64K
+
+no_repeat_found_encodeFastBlockAsm64K:
+	CMPQ (BX)(SI*1), DI
+	JEQ candidate_match_encodeFastBlockAsm64K
+	MOVQ 1(BX)(DX*1), DI
+	MOVWLZX (AX)(R10*2), SI
+	LEAL 2(DX), R9
+	CMPQ (BX)(R8*1), DI
+	JEQ candidate2_match_encodeFastBlockAsm64K
+	MOVW R9, (AX)(R10*2)
+	MOVQ 2(BX)(DX*1), DI
+	CMPQ (BX)(SI*1), DI
+	JEQ candidate3_match_encodeFastBlockAsm64K
+	MOVL 20(SP), DX
+	JMP search_loop_encodeFastBlockAsm64K
+
+candidate3_match_encodeFastBlockAsm64K:
+	ADDL $0x02, DX
+	JMP candidate_match_encodeFastBlockAsm64K
+
+candidate2_match_encodeFastBlockAsm64K:
+	MOVW R9, (AX)(R10*2)
+	INCL DX
+	MOVL R8, SI
+
+candidate_match_encodeFastBlockAsm64K:
+	CMPQ CX, (SP)
+	JB dst_size_check_ok_2
+	MOVQ $0x00000000, ret+56(FP)
+	RET
+
+dst_size_check_ok_2:
+	MOVL DX, R8
+	MOVL DX, DI
+	SUBL SI, DI
+	MOVL DI, 16(SP)
+	ADDL $0x08, DX
+	ADDL $0x08, SI
+	MOVQ src_len+32(FP), DI
+	SUBL DX, DI
+	LEAQ (BX)(DX*1), R9
+	LEAQ (BX)(SI*1), SI
+
+	// matchLen
+	XORL R11, R11
+	JMP matchlen_loop_16_entry_match_nolit_encodeFastBlockAsm64K
+	PCALIGN $0x10
+
+matchlen_loopback_16_match_nolit_encodeFastBlockAsm64K:
+	MOVQ (R9)(R11*1), R10
+	MOVQ 8(R9)(R11*1), R12
+	XORQ (SI)(R11*1), R10
+	JNZ matchlen_bsf_8_match_nolit_encodeFastBlockAsm64K
+	XORQ 8(SI)(R11*1), R12
+	JNZ matchlen_bsf_16match_nolit_encodeFastBlockAsm64K
+	LEAL -16(DI), DI
+	LEAL 16(R11), R11
+
+matchlen_loop_16_entry_match_nolit_encodeFastBlockAsm64K:
+	CMPL DI, $0x10
+	JAE matchlen_loopback_16_match_nolit_encodeFastBlockAsm64K
+	JMP matchlen_match8_match_nolit_encodeFastBlockAsm64K
+	PCALIGN $0x10
+
+matchlen_bsf_16match_nolit_encodeFastBlockAsm64K:
+	TZCNTQ R12, R12
+	SARQ $0x03, R12
+	LEAL 8(R11)(R12*1), R11
+	JMP match_nolit_end_encodeFastBlockAsm64K
+
+matchlen_match8_match_nolit_encodeFastBlockAsm64K:
+	CMPL DI, $0x08
+	JB matchlen_match4_match_nolit_encodeFastBlockAsm64K
+	MOVQ (R9)(R11*1), R10
+	XORQ (SI)(R11*1), R10
+	JNZ matchlen_bsf_8_match_nolit_encodeFastBlockAsm64K
+	LEAL -8(DI), DI
+	LEAL 8(R11), R11
+	JMP matchlen_match4_match_nolit_encodeFastBlockAsm64K
+	PCALIGN $0x10
+
+matchlen_bsf_8_match_nolit_encodeFastBlockAsm64K:
+	TZCNTQ R10, R10
+	SARQ $0x03, R10
+	ADDL R10, R11
+	JMP match_nolit_end_encodeFastBlockAsm64K
+
+matchlen_match4_match_nolit_encodeFastBlockAsm64K:
+	CMPL DI, $0x04
+	JB matchlen_match2_match_nolit_encodeFastBlockAsm64K
+	MOVL (R9)(R11*1), R10
+	CMPL (SI)(R11*1), R10
+	JNE matchlen_match2_match_nolit_encodeFastBlockAsm64K
+	LEAL -4(DI), DI
+	LEAL 4(R11), R11
+
+matchlen_match2_match_nolit_encodeFastBlockAsm64K:
+	CMPL DI, $0x01
+	JE matchlen_match1_match_nolit_encodeFastBlockAsm64K
+	JB match_nolit_end_encodeFastBlockAsm64K
+	MOVW (R9)(R11*1), R10
+	CMPW (SI)(R11*1), R10
+	JNE matchlen_match1_match_nolit_encodeFastBlockAsm64K
+	LEAL 2(R11), R11
+	SUBL $0x02, DI
+	JZ match_nolit_end_encodeFastBlockAsm64K
+
+matchlen_match1_match_nolit_encodeFastBlockAsm64K:
+	MOVB (R9)(R11*1), R10
+	CMPB (SI)(R11*1), R10
+	JNE match_nolit_end_encodeFastBlockAsm64K
+	LEAL 1(R11), R11
+	JMP match_nolit_end_encodeFastBlockAsm64K
+	PCALIGN $0x10
+
+match_nolit_end_encodeFastBlockAsm64K:
+	ADDL R11, DX
+	ADDL $0x08, R11
+	MOVL 16(SP), SI
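+	// Literals pending since nextEmit (12(SP)) are flushed before the copy is emitted.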
+	MOVL 12(SP), DI
+	MOVL DX, 12(SP)
+	SUBL DI, R8
+	JZ match_nolits_copy_encodeFastBlockAsm64K
+	LEAQ (BX)(DI*1), DI
+	LEAQ 4(CX)(R8*1), R9
+	CMPQ R9, (SP)
+	JB dst_size_check_ok_3
+	MOVQ $0x00000000, ret+56(FP)
+	RET
+
+dst_size_check_ok_3:
+	// emitLiteral
+	LEAL -1(R8), R9
+	CMPL R9, $0x1d
+	JB one_byte_match_emit_encodeFastBlockAsm64K
+	SUBL $0x1d, R9
+	CMPL R9, $0x00000100
+	JB two_bytes_match_emit_encodeFastBlockAsm64K
+	CMPL R9, $0x00010000
+	JB three_bytes_match_emit_encodeFastBlockAsm64K
+	MOVL R9, R10
+	SHRL $0x10, R10
+	MOVB $0xf8, (CX)
+	MOVW R9, 1(CX)
+	MOVB R10, 3(CX)
+	ADDQ $0x04, CX
+	ADDL $0x1d, R9
+	JMP memmove_long_match_emit_encodeFastBlockAsm64K
+
+three_bytes_match_emit_encodeFastBlockAsm64K:
+	MOVB $0xf0, (CX)
+	MOVW R9, 1(CX)
+	ADDQ $0x03, CX
+	ADDL $0x1d, R9
+	JMP memmove_long_match_emit_encodeFastBlockAsm64K
+
+two_bytes_match_emit_encodeFastBlockAsm64K:
+	MOVB $0xe8, (CX)
+	MOVB R9, 1(CX)
+	ADDL $0x1d, R9
+	ADDQ $0x02, CX
+	CMPL R9, $0x40
+	JB memmove_midmatch_emit_encodeFastBlockAsm64K
+	JMP memmove_long_match_emit_encodeFastBlockAsm64K
+
+one_byte_match_emit_encodeFastBlockAsm64K:
+	SHLB $0x03, R9
+	MOVB R9, (CX)
+	ADDQ $0x01, CX
+	LEAQ (CX)(R8*1), R9
+
+	// genMemMoveShort
+	// margin: 16, min move: 1
+	CMPQ R8, $0x10
+	JBE emit_lit_memmove_match_emit_encodeFastBlockAsm64K_memmove_move_8through16
+	CMPQ R8, $0x20
+	JBE emit_lit_memmove_match_emit_encodeFastBlockAsm64K_memmove_move_17through32
+	JMP emit_lit_memmove_match_emit_encodeFastBlockAsm64K_memmove_move_33through64
+	PCALIGN $0x10
+
+emit_lit_memmove_match_emit_encodeFastBlockAsm64K_memmove_move_8through16:
+	MOVOU (DI), X0
+	MOVOU X0, (CX)
+	JMP memmove_end_copy_match_emit_encodeFastBlockAsm64K
+
+emit_lit_memmove_match_emit_encodeFastBlockAsm64K_memmove_move_17through32:
+	MOVOU (DI), X0
+	MOVOU -16(DI)(R8*1), X1
+	MOVOU X0, (CX)
+	MOVOU X1, -16(CX)(R8*1)
+	JMP memmove_end_copy_match_emit_encodeFastBlockAsm64K
+
+emit_lit_memmove_match_emit_encodeFastBlockAsm64K_memmove_move_33through64:
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R8*1), X2
+	MOVOU -16(DI)(R8*1), X3
+	MOVOU X0, (CX)
+	MOVOU X1, 16(CX)
+	MOVOU X2, -32(CX)(R8*1)
+	MOVOU X3, -16(CX)(R8*1)
+
+memmove_end_copy_match_emit_encodeFastBlockAsm64K:
+	MOVQ R9, CX
+	JMP match_nolits_copy_encodeFastBlockAsm64K
+
+memmove_midmatch_emit_encodeFastBlockAsm64K:
+	LEAQ (CX)(R8*1), R9
+
+	// genMemMoveShort
+	// margin: 15, min move: 30
+	CMPQ R8, $0x20
+	JBE emit_lit_memmove_mid_match_emit_encodeFastBlockAsm64K_memmove_move_17through32
+	JMP emit_lit_memmove_mid_match_emit_encodeFastBlockAsm64K_memmove_move_33through64
+
+emit_lit_memmove_mid_match_emit_encodeFastBlockAsm64K_memmove_move_17through32:
+	MOVOU (DI), X0
+	MOVOU -16(DI)(R8*1), X1
+	MOVOU X0, (CX)
+	MOVOU X1, -16(CX)(R8*1)
+	JMP memmove_mid_end_copy_match_emit_encodeFastBlockAsm64K
+
+emit_lit_memmove_mid_match_emit_encodeFastBlockAsm64K_memmove_move_33through64:
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R8*1), X2
+	MOVOU -16(DI)(R8*1), X3
+	MOVOU X0, (CX)
+	MOVOU X1, 16(CX)
+	MOVOU X2, -32(CX)(R8*1)
+	MOVOU X3, -16(CX)(R8*1)
+
+memmove_mid_end_copy_match_emit_encodeFastBlockAsm64K:
+	MOVQ R9, CX
+	JMP match_nolits_copy_encodeFastBlockAsm64K
+
+memmove_long_match_emit_encodeFastBlockAsm64K:
+	LEAQ (CX)(R8*1), R9
+
+	// genMemMoveLong
+	MOVOU (DI), X0
+	MOVOU 16(DI), X1
+	MOVOU -32(DI)(R8*1), X2
+	MOVOU -16(DI)(R8*1), X3
+	MOVQ R8, R12
+	SHRQ $0x05, R12
+	MOVQ CX, R10
+	ANDL $0x0000001f, R10
+	MOVQ $0x00000040, R13
+	SUBQ R10, R13
+	DECQ R12
+	JA emit_lit_memmove_long_match_emit_encodeFastBlockAsm64Klarge_forward_sse_loop_32
+	LEAQ -32(DI)(R13*1), R10
+	LEAQ -32(CX)(R13*1), R14
+	PCALIGN $0x10
+
+emit_lit_memmove_long_match_emit_encodeFastBlockAsm64Klarge_big_loop_back:
+	MOVOU (R10), X4
+	MOVOU 16(R10), X5
+	MOVOA X4, (R14)
+	MOVOA X5, 16(R14)
+	ADDQ $0x20, R14
+	ADDQ $0x20, R10
+	ADDQ $0x20, R13
+	DECQ R12
+	JNA emit_lit_memmove_long_match_emit_encodeFastBlockAsm64Klarge_big_loop_back
+
+emit_lit_memmove_long_match_emit_encodeFastBlockAsm64Klarge_forward_sse_loop_32:
+	MOVOU -32(DI)(R13*1), X4
+	MOVOU -16(DI)(R13*1), X5
+	MOVOA X4, -32(CX)(R13*1)
+	MOVOA X5, -16(CX)(R13*1)
+	ADDQ $0x20, R13
+	CMPQ R8, R13
+	JAE emit_lit_memmove_long_match_emit_encodeFastBlockAsm64Klarge_forward_sse_loop_32
+	MOVOU X0, (CX)
+	MOVOU X1, 16(CX)
+	MOVOU X2, -32(CX)(R8*1)
+	MOVOU X3, -16(CX)(R8*1)
+	MOVQ R9, CX
+	JMP match_nolits_copy_encodeFastBlockAsm64K
+	PCALIGN $0x10
+
+match_nolits_copy_encodeFastBlockAsm64K:
+	// emitCopy
+	CMPL SI, $0x00000400
+	JA two_byte_match_nolit_encodeFastBlockAsm64K
+	CMPL R11, $0x00000013
+	JAE emit_one_longer_match_nolit_encodeFastBlockAsm64K
+	LEAL -1(SI), SI
+	SHLL $0x06, SI
+	LEAL -15(SI)(R11*4), SI
+	MOVW SI, (CX)
+	ADDQ $0x02, CX
+	JMP match_nolit_emitcopy_end_encodeFastBlockAsm64K
+
+emit_one_longer_match_nolit_encodeFastBlockAsm64K:
+	CMPL R11, $0x00000112
+	JAE emit_copy1_repeat_match_nolit_encodeFastBlockAsm64K
+	LEAL -1(SI), SI
+	SHLL $0x06, SI
+	LEAL 61(SI), SI
+	MOVW SI, (CX)
+	LEAL -18(R11), SI
+	MOVB SI, 2(CX)
+	ADDQ $0x03, CX
+	JMP match_nolit_emitcopy_end_encodeFastBlockAsm64K
+
+emit_copy1_repeat_match_nolit_encodeFastBlockAsm64K:
+	LEAL -1(SI), SI
+	SHLL $0x06, SI
+	LEAL 57(SI), SI
+	MOVW SI, (CX)
+	ADDQ $0x02, CX
+	SUBL $0x12, R11
+
+	// emitRepeat
+	LEAL -1(R11), SI
+	CMPL R11, $0x1d
+	JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm64K
+	LEAL -30(R11), SI
+	CMPL R11, $0x0000011e
+	JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm64K
+	CMPL R11, $0x0001001e
+	JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm64K
+	MOVB $0xfc, (CX)
+	MOVL SI, 1(CX)
+	ADDQ $0x04, CX
+	JMP match_nolit_emitcopy_end_encodeFastBlockAsm64K
+
+repeat_three_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm64K:
+	MOVB $0xf4, (CX)
+	MOVW SI, 1(CX)
+	ADDQ $0x03, CX
+	JMP match_nolit_emitcopy_end_encodeFastBlockAsm64K
+
+repeat_two_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm64K:
+	MOVB $0xec, (CX)
+	MOVB SI, 1(CX)
+	ADDQ $0x02, CX
+	JMP match_nolit_emitcopy_end_encodeFastBlockAsm64K
+
+repeat_one_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm64K:
+	XORL SI, SI
+	LEAL -4(SI)(R11*8), SI
+	MOVB SI, (CX)
+	ADDQ $0x01, CX
+	JMP match_nolit_emitcopy_end_encodeFastBlockAsm64K
+
+two_byte_match_nolit_encodeFastBlockAsm64K:
+	// emitCopy2
+	LEAL -64(SI), SI
+	LEAL -4(R11), R11
+	MOVW SI, 1(CX)
+	CMPL R11, $0x3c
+	JBE emit_copy2_0_match_nolit_encodeFastBlockAsm64K_emit2
+	LEAL -60(R11), SI
+	CMPL R11, $0x0000013c
+	JB emit_copy2_1_match_nolit_encodeFastBlockAsm64K_emit2
+	CMPL R11, $0x0001003c
+	JB emit_copy2_2_match_nolit_encodeFastBlockAsm64K_emit2
+	MOVB $0xfe, (CX)
+	MOVL SI, 3(CX)
+	ADDQ $0x06, CX
+	JMP match_nolit_emitcopy_end_encodeFastBlockAsm64K
+
+emit_copy2_2_match_nolit_encodeFastBlockAsm64K_emit2:
+	MOVB $0xfa, (CX)
+	MOVW SI, 3(CX)
+	ADDQ $0x05, CX
+	JMP match_nolit_emitcopy_end_encodeFastBlockAsm64K
+
+emit_copy2_1_match_nolit_encodeFastBlockAsm64K_emit2:
+	MOVB $0xf6, (CX)
+	MOVB SI, 3(CX)
+	ADDQ $0x04, CX
+	JMP match_nolit_emitcopy_end_encodeFastBlockAsm64K
+
+emit_copy2_0_match_nolit_encodeFastBlockAsm64K_emit2:
+	MOVL $0x00000002, SI
+	LEAL (SI)(R11*4), SI
+	MOVB SI, (CX)
+	ADDQ $0x03, CX
+
+match_nolit_emitcopy_end_encodeFastBlockAsm64K:
+	CMPL DX, 8(SP)
+	JAE emit_remainder_encodeFastBlockAsm64K
+	MOVQ -2(BX)(DX*1), DI
+	CMPQ CX, (SP)
+	JB match_nolit_dst_ok_encodeFastBlockAsm64K
+	MOVQ $0x00000000, ret+56(FP)
+	RET
+
+match_nolit_dst_ok_encodeFastBlockAsm64K:
+	MOVQ $0xcf1bbcdcb7a56463, SI
+	MOVQ DI, R8
+	MOVQ (BX)(DX*1), DI
+	MOVQ DI, R9
+	IMULQ SI, R8
+	SHRQ $0x34, R8
+	IMULQ SI, R9
+	SHRQ $0x34, R9
+	LEAL -2(DX), R10
+	MOVWLZX (AX)(R9*2), SI
+	MOVW R10, (AX)(R8*2)
+	MOVW DX, (AX)(R9*2)
+	MOVL DX, R8
+	INCL DX
+	CMPQ (BX)(SI*1), DI
+	JNE search_loop_encodeFastBlockAsm64K
+	MOVL R8, DI
+	SUBL SI, DI
+	MOVL DI, 16(SP)
+	CMPQ CX, (SP)
+	JB dst_size_check_ok_4
+	MOVQ $0x00000000, ret+56(FP)
+	RET
+
+dst_size_check_ok_4:
+	ADDL $0x07, DX
+	ADDL $0x08, SI
+	MOVQ src_len+32(FP), DI
+	SUBL DX, DI
+	LEAQ (BX)(DX*1), R8
+	LEAQ (BX)(SI*1), SI
+
+	// matchLen
+	XORL R11, R11
+	JMP matchlen_loop_16_entry_match_nolit2_encodeFastBlockAsm64K
+	PCALIGN $0x10
+
+matchlen_loopback_16_match_nolit2_encodeFastBlockAsm64K:
+	MOVQ (R8)(R11*1), R9
+	MOVQ 8(R8)(R11*1), R10
+	XORQ (SI)(R11*1), R9
+	JNZ matchlen_bsf_8_match_nolit2_encodeFastBlockAsm64K
+	XORQ 8(SI)(R11*1), R10
+	JNZ matchlen_bsf_16match_nolit2_encodeFastBlockAsm64K
+	LEAL -16(DI), DI
+	LEAL 16(R11), R11
+
+matchlen_loop_16_entry_match_nolit2_encodeFastBlockAsm64K:
+	CMPL DI, $0x10
+	JAE matchlen_loopback_16_match_nolit2_encodeFastBlockAsm64K
+	JMP matchlen_match8_match_nolit2_encodeFastBlockAsm64K
+	PCALIGN $0x10
+
+matchlen_bsf_16match_nolit2_encodeFastBlockAsm64K:
+	TZCNTQ R10, R10
+	SARQ $0x03, R10
+	LEAL 8(R11)(R10*1), R11
+	JMP match_nolit2_end_encodeFastBlockAsm64K
+
+matchlen_match8_match_nolit2_encodeFastBlockAsm64K:
+	CMPL DI, $0x08
+	JB matchlen_match4_match_nolit2_encodeFastBlockAsm64K
+	MOVQ (R8)(R11*1), R9
+	XORQ (SI)(R11*1), R9
+	JNZ matchlen_bsf_8_match_nolit2_encodeFastBlockAsm64K
+	LEAL -8(DI), DI
+	LEAL 8(R11), R11
+	JMP matchlen_match4_match_nolit2_encodeFastBlockAsm64K
+	PCALIGN $0x10
+
+matchlen_bsf_8_match_nolit2_encodeFastBlockAsm64K:
+	TZCNTQ R9, R9
+	SARQ $0x03, R9
+	ADDL R9, R11
+	JMP match_nolit2_end_encodeFastBlockAsm64K
+
+matchlen_match4_match_nolit2_encodeFastBlockAsm64K:
+	CMPL DI, $0x04
+	JB matchlen_match2_match_nolit2_encodeFastBlockAsm64K
+	MOVL (R8)(R11*1), R9
+	CMPL (SI)(R11*1), R9
+	JNE matchlen_match2_match_nolit2_encodeFastBlockAsm64K
+	LEAL -4(DI), DI
+	LEAL 4(R11), R11
+
+matchlen_match2_match_nolit2_encodeFastBlockAsm64K:
+	CMPL DI, $0x01
+	JE matchlen_match1_match_nolit2_encodeFastBlockAsm64K
+	JB match_nolit2_end_encodeFastBlockAsm64K
+	MOVW (R8)(R11*1), R9
+	CMPW (SI)(R11*1), R9
+	JNE matchlen_match1_match_nolit2_encodeFastBlockAsm64K
+	LEAL 2(R11), R11
+	SUBL $0x02, DI
+	JZ match_nolit2_end_encodeFastBlockAsm64K
+
+matchlen_match1_match_nolit2_encodeFastBlockAsm64K:
+	MOVB (R8)(R11*1), R9
+	CMPB (SI)(R11*1), R9
+	JNE match_nolit2_end_encodeFastBlockAsm64K
+	LEAL 1(R11), R11
+	JMP match_nolit2_end_encodeFastBlockAsm64K
+	PCALIGN $0x10
+
+match_nolit2_end_encodeFastBlockAsm64K:
+	ADDL R11, DX
+	ADDL $0x08, R11
+	MOVL DX, 12(SP)
+	MOVL 16(SP), SI
+	JMP match_nolits_copy_encodeFastBlockAsm64K
+
+emit_remainder_encodeFastBlockAsm64K:
+	MOVQ src_len+32(FP), AX
+	MOVL 12(SP), DX
+	SUBL DX, AX
+	JZ emit_remainder_end_encodeFastBlockAsm64K
+	LEAQ (BX)(DX*1), DX
+	LEAQ 4(CX)(AX*1), BX
+	CMPQ BX, (SP)
+	JB dst_size_check_ok_5
+	MOVQ $0x00000000, ret+56(FP)
+	RET
+
+dst_size_check_ok_5:
+	// emitLiteral
+	LEAL -1(AX), BX
+	CMPL BX, $0x1d
+	JB one_byte_emit_remainder_encodeFastBlockAsm64K
+	SUBL $0x1d, BX
+	CMPL BX, $0x00000100
+	JB two_bytes_emit_remainder_encodeFastBlockAsm64K
+	CMPL BX, $0x00010000
+	JB three_bytes_emit_remainder_encodeFastBlockAsm64K
+	MOVL BX, SI
+	SHRL $0x10, SI
+	MOVB $0xf8, (CX)
+	MOVW BX, 1(CX)
+	MOVB SI, 3(CX)
+	ADDQ $0x04, CX
+	ADDL $0x1d, BX
+	JMP memmove_long_emit_remainder_encodeFastBlockAsm64K
+
+three_bytes_emit_remainder_encodeFastBlockAsm64K:
+	MOVB $0xf0, (CX)
+	MOVW BX, 1(CX)
+	ADDQ $0x03, CX
+	ADDL $0x1d, BX
+	JMP memmove_long_emit_remainder_encodeFastBlockAsm64K
+
+two_bytes_emit_remainder_encodeFastBlockAsm64K:
+	MOVB $0xe8, (CX)
+	MOVB BL, 1(CX)
+	ADDL $0x1d, BX
+	ADDQ $0x02, CX
+	CMPL BX, $0x40
+	JB memmove_midemit_remainder_encodeFastBlockAsm64K
+	JMP memmove_long_emit_remainder_encodeFastBlockAsm64K
+
+one_byte_emit_remainder_encodeFastBlockAsm64K:
+	SHLB $0x03, BL
+	MOVB BL, (CX)
+	ADDQ $0x01, CX
+	LEAQ (CX)(AX*1), BX
+
+	// genMemMoveShort
+	// margin: 0, min move: 1
+	CMPQ AX, $0x03
+	JB emit_lit_memmove_emit_remainder_encodeFastBlockAsm64K_memmove_move_1or2
+	JE emit_lit_memmove_emit_remainder_encodeFastBlockAsm64K_memmove_move_3
+	CMPQ AX, $0x08
+	JBE emit_lit_memmove_emit_remainder_encodeFastBlockAsm64K_memmove_move_4through8
+	CMPQ AX, $0x10
+	JBE emit_lit_memmove_emit_remainder_encodeFastBlockAsm64K_memmove_move_8through16
+	CMPQ AX, $0x20
+	JBE emit_lit_memmove_emit_remainder_encodeFastBlockAsm64K_memmove_move_17through32
+	JMP emit_lit_memmove_emit_remainder_encodeFastBlockAsm64K_memmove_move_33through64
+
+emit_lit_memmove_emit_remainder_encodeFastBlockAsm64K_memmove_move_1or2:
+	MOVB (DX), SI
+	MOVB -1(DX)(AX*1), DL
+	MOVB SI, (CX)
+	MOVB DL, -1(CX)(AX*1)
+	JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeFastBlockAsm64K_memmove_move_3:
+	MOVW (DX), SI
+	MOVB 2(DX), DL
+	MOVW SI, (CX)
+	MOVB DL, 2(CX)
+	JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeFastBlockAsm64K_memmove_move_4through8:
+	MOVL (DX), SI
+	MOVL -4(DX)(AX*1), DX
+	MOVL SI, (CX)
+	MOVL DX, -4(CX)(AX*1)
+	JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm64K
+	PCALIGN $0x10
+
+emit_lit_memmove_emit_remainder_encodeFastBlockAsm64K_memmove_move_8through16:
+	MOVQ (DX), SI
+	MOVQ -8(DX)(AX*1), DX
+	MOVQ SI, (CX)
+	MOVQ DX, -8(CX)(AX*1)
+	JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeFastBlockAsm64K_memmove_move_17through32:
+	MOVOU (DX), X0
+	MOVOU -16(DX)(AX*1), X1
+	MOVOU X0, (CX)
+	MOVOU X1, -16(CX)(AX*1)
+	JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm64K
+
+emit_lit_memmove_emit_remainder_encodeFastBlockAsm64K_memmove_move_33through64:
+	MOVOU (DX), X0
+	MOVOU 16(DX), X1
+	MOVOU -32(DX)(AX*1), X2
+	MOVOU -16(DX)(AX*1), X3
+	MOVOU X0, (CX)
+	MOVOU X1, 16(CX)
+	MOVOU X2, -32(CX)(AX*1)
+	MOVOU X3, -16(CX)(AX*1)
+
+memmove_end_copy_emit_remainder_encodeFastBlockAsm64K:
+	MOVQ BX, CX
+	JMP emit_remainder_end_encodeFastBlockAsm64K
+
+memmove_midemit_remainder_encodeFastBlockAsm64K:
+	LEAQ (CX)(AX*1), BX
+
+	// genMemMoveShort
+	// margin: 0, min move: 30
+	CMPQ AX, $0x20
+	JBE emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm64K_memmove_move_17through32
+	JMP emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm64K_memmove_move_33through64
+
+emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm64K_memmove_move_17through32:
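+	// 17-32 bytes: two overlapping 16-byte loads/stores cover the whole range.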
MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_mid_end_copy_emit_remainder_encodeFastBlockAsm64K + +emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm64K_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + +memmove_mid_end_copy_emit_remainder_encodeFastBlockAsm64K: + MOVQ BX, CX + JMP emit_remainder_end_encodeFastBlockAsm64K + +memmove_long_emit_remainder_encodeFastBlockAsm64K: + LEAQ (CX)(AX*1), BX + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVQ AX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(DX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + PCALIGN $0x10 + +emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm64Klarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm64Klarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm64Klarge_forward_sse_loop_32: + MOVOU -32(DX)(R8*1), X4 + MOVOU -16(DX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ AX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm64Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + MOVQ BX, CX + +emit_remainder_end_encodeFastBlockAsm64K: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeFastBlockAsm16K(dst []byte, src []byte, tmp *[4096]byte) int +// Requires: BMI, SSE2 +TEXT ·encodeFastBlockAsm16K(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000020, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeFastBlockAsm16K: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeFastBlockAsm16K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -17(DX), BX + LEAQ -17(DX), SI + MOVL SI, 8(SP) + SHRQ $0x03, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeFastBlockAsm16K: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x04, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeFastBlockAsm16K + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0xcf1bbcdcb7a56463, R9 + MOVQ DI, R8 + IMULQ R9, R8 + SHRQ $0x35, R8 + MOVQ 1(BX)(DX*1), R10 + IMULQ R9, R10 + SHRQ $0x35, R10 + MOVWLZX (AX)(R8*2), SI + MOVW DX, (AX)(R8*2) + MOVWLZX (AX)(R10*2), R8 + MOVW DX, (AX)(R10*2) + MOVQ 2(BX)(DX*1), R10 + IMULQ R9, R10 + SHRQ $0x35, R10 + MOVL DX, R9 + SUBL 16(SP), R9 + MOVL 1(BX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeFastBlockAsm16K + LEAL 1(DX), DI + MOVL 12(SP), SI + MOVL DI, SI + MOVL 12(SP), R8 + SUBL R8, SI + LEAQ 3(CX)(SI*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_1 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_1: + LEAQ (BX)(R8*1), R8 + + // emitLiteral + LEAL -1(SI), R9 + CMPL R9, $0x1d + JB one_byte_repeat_emit_lits_encodeFastBlockAsm16K + SUBL $0x1d, R9 + 
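+	// Literal runs longer than 29 bytes store (length-30) in 1-3 extra bytes.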
CMPL R9, $0x00000100 + JB two_bytes_repeat_emit_lits_encodeFastBlockAsm16K + JB three_bytes_repeat_emit_lits_encodeFastBlockAsm16K + +three_bytes_repeat_emit_lits_encodeFastBlockAsm16K: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_repeat_emit_lits_encodeFastBlockAsm16K + +two_bytes_repeat_emit_lits_encodeFastBlockAsm16K: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midrepeat_emit_lits_encodeFastBlockAsm16K + JMP memmove_long_repeat_emit_lits_encodeFastBlockAsm16K + +one_byte_repeat_emit_lits_encodeFastBlockAsm16K: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ SI, $0x10 + JBE emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm16K_memmove_move_8through16 + CMPQ SI, $0x20 + JBE emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm16K_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm16K_memmove_move_33through64 + PCALIGN $0x10 + +emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm16K_memmove_move_8through16: + MOVOU (R8), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_repeat_emit_lits_encodeFastBlockAsm16K + +emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm16K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_end_copy_repeat_emit_lits_encodeFastBlockAsm16K + +emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm16K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_end_copy_repeat_emit_lits_encodeFastBlockAsm16K: + MOVQ R9, CX + JMP repeat_emit_lits_end_encodeFastBlockAsm16K + +memmove_midrepeat_emit_lits_encodeFastBlockAsm16K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ SI, $0x20 + JBE emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm16K_memmove_move_17through32 + JMP emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm16K_memmove_move_33through64 + +emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm16K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_mid_end_copy_repeat_emit_lits_encodeFastBlockAsm16K + +emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm16K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_mid_end_copy_repeat_emit_lits_encodeFastBlockAsm16K: + MOVQ R9, CX + JMP repeat_emit_lits_end_encodeFastBlockAsm16K + +memmove_long_repeat_emit_lits_encodeFastBlockAsm16K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm16Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + PCALIGN $0x10 + +emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm16Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA 
emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm16Klarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm16Klarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ SI, R12 + JAE emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm16Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + MOVQ R9, CX + +repeat_emit_lits_end_encodeFastBlockAsm16K: + ADDL $0x05, DX + MOVL DX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_repeat_extend_encodeFastBlockAsm16K + PCALIGN $0x10 + +matchlen_loopback_16_repeat_extend_encodeFastBlockAsm16K: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeFastBlockAsm16K + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeFastBlockAsm16K + LEAL -16(R8), R8 + LEAL 16(R11), R11 + +matchlen_loop_16_entry_repeat_extend_encodeFastBlockAsm16K: + CMPL R8, $0x10 + JAE matchlen_loopback_16_repeat_extend_encodeFastBlockAsm16K + JMP matchlen_match8_repeat_extend_encodeFastBlockAsm16K + PCALIGN $0x10 + +matchlen_bsf_16repeat_extend_encodeFastBlockAsm16K: + TZCNTQ R12, R12 + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeFastBlockAsm16K + +matchlen_match8_repeat_extend_encodeFastBlockAsm16K: + CMPL R8, $0x08 + JB matchlen_match4_repeat_extend_encodeFastBlockAsm16K + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeFastBlockAsm16K + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeFastBlockAsm16K + PCALIGN $0x10 + +matchlen_bsf_8_repeat_extend_encodeFastBlockAsm16K: + TZCNTQ R10, R10 + SARQ $0x03, R10 + ADDL R10, R11 + JMP repeat_extend_forward_end_encodeFastBlockAsm16K + +matchlen_match4_repeat_extend_encodeFastBlockAsm16K: + CMPL R8, $0x04 + JB matchlen_match2_repeat_extend_encodeFastBlockAsm16K + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeFastBlockAsm16K + LEAL -4(R8), R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeFastBlockAsm16K: + CMPL R8, $0x01 + JE matchlen_match1_repeat_extend_encodeFastBlockAsm16K + JB repeat_extend_forward_end_encodeFastBlockAsm16K + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeFastBlockAsm16K + LEAL 2(R11), R11 + SUBL $0x02, R8 + JZ repeat_extend_forward_end_encodeFastBlockAsm16K + +matchlen_match1_repeat_extend_encodeFastBlockAsm16K: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeFastBlockAsm16K + LEAL 1(R11), R11 + JMP repeat_extend_forward_end_encodeFastBlockAsm16K + PCALIGN $0x10 + +repeat_extend_forward_end_encodeFastBlockAsm16K: + ADDL R11, DX + MOVL DX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitRepeat + LEAL -1(SI), DI + CMPL SI, $0x1d + JBE repeat_one_match_repeat_encodeFastBlockAsm16K + LEAL -30(SI), DI + CMPL SI, $0x0000011e + JB repeat_two_match_repeat_encodeFastBlockAsm16K + CMPL SI, $0x0001001e + JB repeat_three_match_repeat_encodeFastBlockAsm16K + MOVB $0xfc, (CX) + MOVL DI, 1(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeFastBlockAsm16K + +repeat_three_match_repeat_encodeFastBlockAsm16K: + MOVB $0xf4, (CX) + MOVW DI, 1(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeFastBlockAsm16K + 
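+	// Repeat with one extra length byte: tag 0xec, then (length-30).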
+repeat_two_match_repeat_encodeFastBlockAsm16K: + MOVB $0xec, (CX) + MOVB DI, 1(CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeFastBlockAsm16K + +repeat_one_match_repeat_encodeFastBlockAsm16K: + XORL DI, DI + LEAL -4(DI)(SI*8), DI + MOVB DI, (CX) + ADDQ $0x01, CX + +repeat_end_emit_encodeFastBlockAsm16K: + MOVL DX, 12(SP) + JMP search_loop_encodeFastBlockAsm16K + +no_repeat_found_encodeFastBlockAsm16K: + CMPQ (BX)(SI*1), DI + JEQ candidate_match_encodeFastBlockAsm16K + MOVQ 1(BX)(DX*1), DI + MOVWLZX (AX)(R10*2), SI + LEAL 2(DX), R9 + CMPQ (BX)(R8*1), DI + JEQ candidate2_match_encodeFastBlockAsm16K + MOVW R9, (AX)(R10*2) + MOVQ 2(BX)(DX*1), DI + CMPQ (BX)(SI*1), DI + JEQ candidate3_match_encodeFastBlockAsm16K + MOVL 20(SP), DX + JMP search_loop_encodeFastBlockAsm16K + +candidate3_match_encodeFastBlockAsm16K: + ADDL $0x02, DX + JMP candidate_match_encodeFastBlockAsm16K + +candidate2_match_encodeFastBlockAsm16K: + MOVW R9, (AX)(R10*2) + INCL DX + MOVL R8, SI + +candidate_match_encodeFastBlockAsm16K: + CMPQ CX, (SP) + JB dst_size_check_ok_2 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_2: + MOVL DX, R8 + MOVL DX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x08, DX + ADDL $0x08, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit_encodeFastBlockAsm16K + PCALIGN $0x10 + +matchlen_loopback_16_match_nolit_encodeFastBlockAsm16K: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeFastBlockAsm16K + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeFastBlockAsm16K + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit_encodeFastBlockAsm16K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit_encodeFastBlockAsm16K + JMP matchlen_match8_match_nolit_encodeFastBlockAsm16K + PCALIGN $0x10 + +matchlen_bsf_16match_nolit_encodeFastBlockAsm16K: + TZCNTQ R12, R12 + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeFastBlockAsm16K + +matchlen_match8_match_nolit_encodeFastBlockAsm16K: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeFastBlockAsm16K + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeFastBlockAsm16K + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeFastBlockAsm16K + PCALIGN $0x10 + +matchlen_bsf_8_match_nolit_encodeFastBlockAsm16K: + TZCNTQ R10, R10 + SARQ $0x03, R10 + ADDL R10, R11 + JMP match_nolit_end_encodeFastBlockAsm16K + +matchlen_match4_match_nolit_encodeFastBlockAsm16K: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeFastBlockAsm16K + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeFastBlockAsm16K + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeFastBlockAsm16K: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeFastBlockAsm16K + JB match_nolit_end_encodeFastBlockAsm16K + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeFastBlockAsm16K + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeFastBlockAsm16K + +matchlen_match1_match_nolit_encodeFastBlockAsm16K: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE match_nolit_end_encodeFastBlockAsm16K + LEAL 1(R11), R11 + JMP match_nolit_end_encodeFastBlockAsm16K + PCALIGN $0x10 + +match_nolit_end_encodeFastBlockAsm16K: + ADDL R11, DX + ADDL $0x08, R11 + MOVL 16(SP), SI + MOVL 12(SP), DI + MOVL DX, 12(SP) + SUBL DI, R8 
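+	// R8 now holds the pending literal count; zero skips straight to the copy emit.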
+ JZ match_nolits_copy_encodeFastBlockAsm16K + LEAQ (BX)(DI*1), DI + LEAQ 3(CX)(R8*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_3 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_3: + // emitLiteral + LEAL -1(R8), R9 + CMPL R9, $0x1d + JB one_byte_match_emit_encodeFastBlockAsm16K + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_match_emit_encodeFastBlockAsm16K + JB three_bytes_match_emit_encodeFastBlockAsm16K + +three_bytes_match_emit_encodeFastBlockAsm16K: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeFastBlockAsm16K + +two_bytes_match_emit_encodeFastBlockAsm16K: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midmatch_emit_encodeFastBlockAsm16K + JMP memmove_long_match_emit_encodeFastBlockAsm16K + +one_byte_match_emit_encodeFastBlockAsm16K: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(R8*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeFastBlockAsm16K_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeFastBlockAsm16K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeFastBlockAsm16K_memmove_move_33through64 + PCALIGN $0x10 + +emit_lit_memmove_match_emit_encodeFastBlockAsm16K_memmove_move_8through16: + MOVOU (DI), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_match_emit_encodeFastBlockAsm16K + +emit_lit_memmove_match_emit_encodeFastBlockAsm16K_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_end_copy_match_emit_encodeFastBlockAsm16K + +emit_lit_memmove_match_emit_encodeFastBlockAsm16K_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_end_copy_match_emit_encodeFastBlockAsm16K: + MOVQ R9, CX + JMP match_nolits_copy_encodeFastBlockAsm16K + +memmove_midmatch_emit_encodeFastBlockAsm16K: + LEAQ (CX)(R8*1), R9 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ R8, $0x20 + JBE emit_lit_memmove_mid_match_emit_encodeFastBlockAsm16K_memmove_move_17through32 + JMP emit_lit_memmove_mid_match_emit_encodeFastBlockAsm16K_memmove_move_33through64 + +emit_lit_memmove_mid_match_emit_encodeFastBlockAsm16K_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_mid_end_copy_match_emit_encodeFastBlockAsm16K + +emit_lit_memmove_mid_match_emit_encodeFastBlockAsm16K_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_mid_end_copy_match_emit_encodeFastBlockAsm16K: + MOVQ R9, CX + JMP match_nolits_copy_encodeFastBlockAsm16K + +memmove_long_match_emit_encodeFastBlockAsm16K: + LEAQ (CX)(R8*1), R9 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeFastBlockAsm16Klarge_forward_sse_loop_32 + LEAQ -32(DI)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + PCALIGN $0x10 + +emit_lit_memmove_long_match_emit_encodeFastBlockAsm16Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA 
X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_encodeFastBlockAsm16Klarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeFastBlockAsm16Klarge_forward_sse_loop_32: + MOVOU -32(DI)(R13*1), X4 + MOVOU -16(DI)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_encodeFastBlockAsm16Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + MOVQ R9, CX + JMP match_nolits_copy_encodeFastBlockAsm16K + PCALIGN $0x10 + +match_nolits_copy_encodeFastBlockAsm16K: + // emitCopy + CMPL SI, $0x00000400 + JA two_byte_match_nolit_encodeFastBlockAsm16K + CMPL R11, $0x00000013 + JAE emit_one_longer_match_nolit_encodeFastBlockAsm16K + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL -15(SI)(R11*4), SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm16K + +emit_one_longer_match_nolit_encodeFastBlockAsm16K: + CMPL R11, $0x00000112 + JAE emit_copy1_repeat_match_nolit_encodeFastBlockAsm16K + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL 61(SI), SI + MOVW SI, (CX) + LEAL -18(R11), SI + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm16K + +emit_copy1_repeat_match_nolit_encodeFastBlockAsm16K: + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL 57(SI), SI + MOVW SI, (CX) + ADDQ $0x02, CX + SUBL $0x12, R11 + + // emitRepeat + LEAL -1(R11), SI + CMPL R11, $0x1d + JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm16K + LEAL -30(R11), SI + CMPL R11, $0x0000011e + JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm16K + CMPL R11, $0x0001001e + JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm16K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm16K + +repeat_three_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm16K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm16K + +repeat_two_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm16K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm16K + +repeat_one_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm16K: + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm16K + +two_byte_match_nolit_encodeFastBlockAsm16K: + // emitCopy2 + LEAL -64(SI), SI + LEAL -4(R11), R11 + MOVW SI, 1(CX) + CMPL R11, $0x3c + JBE emit_copy2_0_match_nolit_encodeFastBlockAsm16K_emit2 + LEAL -60(R11), SI + CMPL R11, $0x0000013c + JB emit_copy2_1_match_nolit_encodeFastBlockAsm16K_emit2 + CMPL R11, $0x0001003c + JB emit_copy2_2_match_nolit_encodeFastBlockAsm16K_emit2 + MOVB $0xfe, (CX) + MOVL SI, 3(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm16K + +emit_copy2_2_match_nolit_encodeFastBlockAsm16K_emit2: + MOVB $0xfa, (CX) + MOVW SI, 3(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm16K + +emit_copy2_1_match_nolit_encodeFastBlockAsm16K_emit2: + MOVB $0xf6, (CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm16K + +emit_copy2_0_match_nolit_encodeFastBlockAsm16K_emit2: + MOVL $0x00000002, SI + LEAL (SI)(R11*4), SI + MOVB SI, (CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeFastBlockAsm16K: + CMPL DX, 8(SP) + JAE emit_remainder_encodeFastBlockAsm16K + MOVQ -2(BX)(DX*1), DI 
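+	// Seed the next hash probes from two bytes before the current position.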
+ CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeFastBlockAsm16K + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeFastBlockAsm16K: + MOVQ $0xcf1bbcdcb7a56463, SI + MOVQ DI, R8 + MOVQ (BX)(DX*1), DI + MOVQ DI, R9 + IMULQ SI, R8 + SHRQ $0x35, R8 + IMULQ SI, R9 + SHRQ $0x35, R9 + LEAL -2(DX), R10 + MOVWLZX (AX)(R9*2), SI + MOVW R10, (AX)(R8*2) + MOVW DX, (AX)(R9*2) + MOVL DX, R8 + INCL DX + CMPQ (BX)(SI*1), DI + JNE search_loop_encodeFastBlockAsm16K + MOVL R8, DI + SUBL SI, DI + MOVL DI, 16(SP) + CMPQ CX, (SP) + JB dst_size_check_ok_4 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_4: + ADDL $0x07, DX + ADDL $0x08, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R8 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit2_encodeFastBlockAsm16K + PCALIGN $0x10 + +matchlen_loopback_16_match_nolit2_encodeFastBlockAsm16K: + MOVQ (R8)(R11*1), R9 + MOVQ 8(R8)(R11*1), R10 + XORQ (SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeFastBlockAsm16K + XORQ 8(SI)(R11*1), R10 + JNZ matchlen_bsf_16match_nolit2_encodeFastBlockAsm16K + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit2_encodeFastBlockAsm16K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit2_encodeFastBlockAsm16K + JMP matchlen_match8_match_nolit2_encodeFastBlockAsm16K + PCALIGN $0x10 + +matchlen_bsf_16match_nolit2_encodeFastBlockAsm16K: + TZCNTQ R10, R10 + SARQ $0x03, R10 + LEAL 8(R11)(R10*1), R11 + JMP match_nolit2_end_encodeFastBlockAsm16K + +matchlen_match8_match_nolit2_encodeFastBlockAsm16K: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit2_encodeFastBlockAsm16K + MOVQ (R8)(R11*1), R9 + XORQ (SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeFastBlockAsm16K + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit2_encodeFastBlockAsm16K + PCALIGN $0x10 + +matchlen_bsf_8_match_nolit2_encodeFastBlockAsm16K: + TZCNTQ R9, R9 + SARQ $0x03, R9 + ADDL R9, R11 + JMP match_nolit2_end_encodeFastBlockAsm16K + +matchlen_match4_match_nolit2_encodeFastBlockAsm16K: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit2_encodeFastBlockAsm16K + MOVL (R8)(R11*1), R9 + CMPL (SI)(R11*1), R9 + JNE matchlen_match2_match_nolit2_encodeFastBlockAsm16K + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit2_encodeFastBlockAsm16K: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit2_encodeFastBlockAsm16K + JB match_nolit2_end_encodeFastBlockAsm16K + MOVW (R8)(R11*1), R9 + CMPW (SI)(R11*1), R9 + JNE matchlen_match1_match_nolit2_encodeFastBlockAsm16K + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit2_end_encodeFastBlockAsm16K + +matchlen_match1_match_nolit2_encodeFastBlockAsm16K: + MOVB (R8)(R11*1), R9 + CMPB (SI)(R11*1), R9 + JNE match_nolit2_end_encodeFastBlockAsm16K + LEAL 1(R11), R11 + JMP match_nolit2_end_encodeFastBlockAsm16K + PCALIGN $0x10 + +match_nolit2_end_encodeFastBlockAsm16K: + ADDL R11, DX + ADDL $0x08, R11 + MOVL DX, 12(SP) + MOVL 16(SP), SI + JMP match_nolits_copy_encodeFastBlockAsm16K + +emit_remainder_encodeFastBlockAsm16K: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + SUBL DX, AX + JZ emit_remainder_end_encodeFastBlockAsm16K + LEAQ (BX)(DX*1), DX + LEAQ 3(CX)(AX*1), BX + CMPQ BX, (SP) + JB dst_size_check_ok_5 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_5: + // emitLiteral + LEAL -1(AX), BX + CMPL BX, $0x1d + JB one_byte_emit_remainder_encodeFastBlockAsm16K + SUBL $0x1d, BX + CMPL BX, $0x00000100 + JB two_bytes_emit_remainder_encodeFastBlockAsm16K + JB 
three_bytes_emit_remainder_encodeFastBlockAsm16K + +three_bytes_emit_remainder_encodeFastBlockAsm16K: + MOVB $0xf0, (CX) + MOVW BX, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, BX + JMP memmove_long_emit_remainder_encodeFastBlockAsm16K + +two_bytes_emit_remainder_encodeFastBlockAsm16K: + MOVB $0xe8, (CX) + MOVB BL, 1(CX) + ADDL $0x1d, BX + ADDQ $0x02, CX + CMPL BX, $0x40 + JB memmove_midemit_remainder_encodeFastBlockAsm16K + JMP memmove_long_emit_remainder_encodeFastBlockAsm16K + +one_byte_emit_remainder_encodeFastBlockAsm16K: + SHLB $0x03, BL + MOVB BL, (CX) + ADDQ $0x01, CX + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 1 + CMPQ AX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeFastBlockAsm16K_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeFastBlockAsm16K_memmove_move_3 + CMPQ AX, $0x08 + JBE emit_lit_memmove_emit_remainder_encodeFastBlockAsm16K_memmove_move_4through8 + CMPQ AX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeFastBlockAsm16K_memmove_move_8through16 + CMPQ AX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeFastBlockAsm16K_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeFastBlockAsm16K_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm16K_memmove_move_1or2: + MOVB (DX), SI + MOVB -1(DX)(AX*1), DL + MOVB SI, (CX) + MOVB DL, -1(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm16K + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm16K_memmove_move_3: + MOVW (DX), SI + MOVB 2(DX), DL + MOVW SI, (CX) + MOVB DL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm16K + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm16K_memmove_move_4through8: + MOVL (DX), SI + MOVL -4(DX)(AX*1), DX + MOVL SI, (CX) + MOVL DX, -4(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm16K + PCALIGN $0x10 + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm16K_memmove_move_8through16: + MOVQ (DX), SI + MOVQ -8(DX)(AX*1), DX + MOVQ SI, (CX) + MOVQ DX, -8(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm16K + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm16K_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm16K + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm16K_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + +memmove_end_copy_emit_remainder_encodeFastBlockAsm16K: + MOVQ BX, CX + JMP emit_remainder_end_encodeFastBlockAsm16K + +memmove_midemit_remainder_encodeFastBlockAsm16K: + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 30 + CMPQ AX, $0x20 + JBE emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm16K_memmove_move_17through32 + JMP emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm16K_memmove_move_33through64 + +emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm16K_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_mid_end_copy_emit_remainder_encodeFastBlockAsm16K + +emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm16K_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + 
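+	// Mid-size literal copy done; fall through to advance the output pointer.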
+memmove_mid_end_copy_emit_remainder_encodeFastBlockAsm16K: + MOVQ BX, CX + JMP emit_remainder_end_encodeFastBlockAsm16K + +memmove_long_emit_remainder_encodeFastBlockAsm16K: + LEAQ (CX)(AX*1), BX + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVQ AX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm16Klarge_forward_sse_loop_32 + LEAQ -32(DX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + PCALIGN $0x10 + +emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm16Klarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm16Klarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm16Klarge_forward_sse_loop_32: + MOVOU -32(DX)(R8*1), X4 + MOVOU -16(DX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ AX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm16Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + MOVQ BX, CX + +emit_remainder_end_encodeFastBlockAsm16K: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeFastBlockAsm4K(dst []byte, src []byte, tmp *[2048]byte) int +// Requires: BMI, SSE2 +TEXT ·encodeFastBlockAsm4K(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000010, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeFastBlockAsm4K: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeFastBlockAsm4K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -17(DX), BX + LEAQ -17(DX), SI + MOVL SI, 8(SP) + SHRQ $0x03, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeFastBlockAsm4K: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x04, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeFastBlockAsm4K + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0xcf1bbcdcb7a56463, R9 + MOVQ DI, R8 + IMULQ R9, R8 + SHRQ $0x36, R8 + MOVQ 1(BX)(DX*1), R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + MOVWLZX (AX)(R8*2), SI + MOVW DX, (AX)(R8*2) + MOVWLZX (AX)(R10*2), R8 + MOVW DX, (AX)(R10*2) + MOVQ 2(BX)(DX*1), R10 + IMULQ R9, R10 + SHRQ $0x36, R10 + MOVL DX, R9 + SUBL 16(SP), R9 + MOVL 1(BX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeFastBlockAsm4K + LEAL 1(DX), DI + MOVL 12(SP), SI + MOVL DI, SI + MOVL 12(SP), R8 + SUBL R8, SI + LEAQ 3(CX)(SI*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_1 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_1: + LEAQ (BX)(R8*1), R8 + + // emitLiteral + LEAL -1(SI), R9 + CMPL R9, $0x1d + JB one_byte_repeat_emit_lits_encodeFastBlockAsm4K + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_repeat_emit_lits_encodeFastBlockAsm4K + JB three_bytes_repeat_emit_lits_encodeFastBlockAsm4K + +three_bytes_repeat_emit_lits_encodeFastBlockAsm4K: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_repeat_emit_lits_encodeFastBlockAsm4K + +two_bytes_repeat_emit_lits_encodeFastBlockAsm4K: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + 
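+	// Literal lengths up to 64 take the mid-size copy; longer runs use the 32-byte loop.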
ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midrepeat_emit_lits_encodeFastBlockAsm4K + JMP memmove_long_repeat_emit_lits_encodeFastBlockAsm4K + +one_byte_repeat_emit_lits_encodeFastBlockAsm4K: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ SI, $0x10 + JBE emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm4K_memmove_move_8through16 + CMPQ SI, $0x20 + JBE emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm4K_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm4K_memmove_move_33through64 + PCALIGN $0x10 + +emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm4K_memmove_move_8through16: + MOVOU (R8), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_repeat_emit_lits_encodeFastBlockAsm4K + +emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm4K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_end_copy_repeat_emit_lits_encodeFastBlockAsm4K + +emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm4K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_end_copy_repeat_emit_lits_encodeFastBlockAsm4K: + MOVQ R9, CX + JMP repeat_emit_lits_end_encodeFastBlockAsm4K + +memmove_midrepeat_emit_lits_encodeFastBlockAsm4K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ SI, $0x20 + JBE emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm4K_memmove_move_17through32 + JMP emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm4K_memmove_move_33through64 + +emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm4K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_mid_end_copy_repeat_emit_lits_encodeFastBlockAsm4K + +emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm4K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_mid_end_copy_repeat_emit_lits_encodeFastBlockAsm4K: + MOVQ R9, CX + JMP repeat_emit_lits_end_encodeFastBlockAsm4K + +memmove_long_repeat_emit_lits_encodeFastBlockAsm4K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm4Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + PCALIGN $0x10 + +emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm4Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm4Klarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm4Klarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ SI, R12 + JAE emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm4Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, 
-16(CX)(SI*1) + MOVQ R9, CX + +repeat_emit_lits_end_encodeFastBlockAsm4K: + ADDL $0x05, DX + MOVL DX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_repeat_extend_encodeFastBlockAsm4K + PCALIGN $0x10 + +matchlen_loopback_16_repeat_extend_encodeFastBlockAsm4K: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeFastBlockAsm4K + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeFastBlockAsm4K + LEAL -16(R8), R8 + LEAL 16(R11), R11 + +matchlen_loop_16_entry_repeat_extend_encodeFastBlockAsm4K: + CMPL R8, $0x10 + JAE matchlen_loopback_16_repeat_extend_encodeFastBlockAsm4K + JMP matchlen_match8_repeat_extend_encodeFastBlockAsm4K + PCALIGN $0x10 + +matchlen_bsf_16repeat_extend_encodeFastBlockAsm4K: + TZCNTQ R12, R12 + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeFastBlockAsm4K + +matchlen_match8_repeat_extend_encodeFastBlockAsm4K: + CMPL R8, $0x08 + JB matchlen_match4_repeat_extend_encodeFastBlockAsm4K + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeFastBlockAsm4K + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeFastBlockAsm4K + PCALIGN $0x10 + +matchlen_bsf_8_repeat_extend_encodeFastBlockAsm4K: + TZCNTQ R10, R10 + SARQ $0x03, R10 + ADDL R10, R11 + JMP repeat_extend_forward_end_encodeFastBlockAsm4K + +matchlen_match4_repeat_extend_encodeFastBlockAsm4K: + CMPL R8, $0x04 + JB matchlen_match2_repeat_extend_encodeFastBlockAsm4K + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeFastBlockAsm4K + LEAL -4(R8), R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeFastBlockAsm4K: + CMPL R8, $0x01 + JE matchlen_match1_repeat_extend_encodeFastBlockAsm4K + JB repeat_extend_forward_end_encodeFastBlockAsm4K + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeFastBlockAsm4K + LEAL 2(R11), R11 + SUBL $0x02, R8 + JZ repeat_extend_forward_end_encodeFastBlockAsm4K + +matchlen_match1_repeat_extend_encodeFastBlockAsm4K: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeFastBlockAsm4K + LEAL 1(R11), R11 + JMP repeat_extend_forward_end_encodeFastBlockAsm4K + PCALIGN $0x10 + +repeat_extend_forward_end_encodeFastBlockAsm4K: + ADDL R11, DX + MOVL DX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitRepeat + LEAL -1(SI), DI + CMPL SI, $0x1d + JBE repeat_one_match_repeat_encodeFastBlockAsm4K + LEAL -30(SI), DI + CMPL SI, $0x0000011e + JB repeat_two_match_repeat_encodeFastBlockAsm4K + CMPL SI, $0x0001001e + JB repeat_three_match_repeat_encodeFastBlockAsm4K + MOVB $0xfc, (CX) + MOVL DI, 1(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeFastBlockAsm4K + +repeat_three_match_repeat_encodeFastBlockAsm4K: + MOVB $0xf4, (CX) + MOVW DI, 1(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeFastBlockAsm4K + +repeat_two_match_repeat_encodeFastBlockAsm4K: + MOVB $0xec, (CX) + MOVB DI, 1(CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeFastBlockAsm4K + +repeat_one_match_repeat_encodeFastBlockAsm4K: + XORL DI, DI + LEAL -4(DI)(SI*8), DI + MOVB DI, (CX) + ADDQ $0x01, CX + +repeat_end_emit_encodeFastBlockAsm4K: + MOVL DX, 12(SP) + JMP search_loop_encodeFastBlockAsm4K + +no_repeat_found_encodeFastBlockAsm4K: + CMPQ (BX)(SI*1), DI + JEQ candidate_match_encodeFastBlockAsm4K + MOVQ 1(BX)(DX*1), DI + MOVWLZX (AX)(R10*2), SI + LEAL 
2(DX), R9 + CMPQ (BX)(R8*1), DI + JEQ candidate2_match_encodeFastBlockAsm4K + MOVW R9, (AX)(R10*2) + MOVQ 2(BX)(DX*1), DI + CMPQ (BX)(SI*1), DI + JEQ candidate3_match_encodeFastBlockAsm4K + MOVL 20(SP), DX + JMP search_loop_encodeFastBlockAsm4K + +candidate3_match_encodeFastBlockAsm4K: + ADDL $0x02, DX + JMP candidate_match_encodeFastBlockAsm4K + +candidate2_match_encodeFastBlockAsm4K: + MOVW R9, (AX)(R10*2) + INCL DX + MOVL R8, SI + +candidate_match_encodeFastBlockAsm4K: + CMPQ CX, (SP) + JB dst_size_check_ok_2 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_2: + MOVL DX, R8 + MOVL DX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x08, DX + ADDL $0x08, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit_encodeFastBlockAsm4K + PCALIGN $0x10 + +matchlen_loopback_16_match_nolit_encodeFastBlockAsm4K: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeFastBlockAsm4K + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeFastBlockAsm4K + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit_encodeFastBlockAsm4K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit_encodeFastBlockAsm4K + JMP matchlen_match8_match_nolit_encodeFastBlockAsm4K + PCALIGN $0x10 + +matchlen_bsf_16match_nolit_encodeFastBlockAsm4K: + TZCNTQ R12, R12 + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeFastBlockAsm4K + +matchlen_match8_match_nolit_encodeFastBlockAsm4K: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeFastBlockAsm4K + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeFastBlockAsm4K + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeFastBlockAsm4K + PCALIGN $0x10 + +matchlen_bsf_8_match_nolit_encodeFastBlockAsm4K: + TZCNTQ R10, R10 + SARQ $0x03, R10 + ADDL R10, R11 + JMP match_nolit_end_encodeFastBlockAsm4K + +matchlen_match4_match_nolit_encodeFastBlockAsm4K: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeFastBlockAsm4K + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeFastBlockAsm4K + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeFastBlockAsm4K: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeFastBlockAsm4K + JB match_nolit_end_encodeFastBlockAsm4K + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeFastBlockAsm4K + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeFastBlockAsm4K + +matchlen_match1_match_nolit_encodeFastBlockAsm4K: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE match_nolit_end_encodeFastBlockAsm4K + LEAL 1(R11), R11 + JMP match_nolit_end_encodeFastBlockAsm4K + PCALIGN $0x10 + +match_nolit_end_encodeFastBlockAsm4K: + ADDL R11, DX + ADDL $0x08, R11 + MOVL 16(SP), SI + MOVL 12(SP), DI + MOVL DX, 12(SP) + SUBL DI, R8 + JZ match_nolits_copy_encodeFastBlockAsm4K + LEAQ (BX)(DI*1), DI + LEAQ 3(CX)(R8*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_3 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_3: + // emitLiteral + LEAL -1(R8), R9 + CMPL R9, $0x1d + JB one_byte_match_emit_encodeFastBlockAsm4K + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_match_emit_encodeFastBlockAsm4K + JB three_bytes_match_emit_encodeFastBlockAsm4K + +three_bytes_match_emit_encodeFastBlockAsm4K: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP 
memmove_long_match_emit_encodeFastBlockAsm4K + +two_bytes_match_emit_encodeFastBlockAsm4K: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midmatch_emit_encodeFastBlockAsm4K + JMP memmove_long_match_emit_encodeFastBlockAsm4K + +one_byte_match_emit_encodeFastBlockAsm4K: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(R8*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeFastBlockAsm4K_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeFastBlockAsm4K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeFastBlockAsm4K_memmove_move_33through64 + PCALIGN $0x10 + +emit_lit_memmove_match_emit_encodeFastBlockAsm4K_memmove_move_8through16: + MOVOU (DI), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_match_emit_encodeFastBlockAsm4K + +emit_lit_memmove_match_emit_encodeFastBlockAsm4K_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_end_copy_match_emit_encodeFastBlockAsm4K + +emit_lit_memmove_match_emit_encodeFastBlockAsm4K_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_end_copy_match_emit_encodeFastBlockAsm4K: + MOVQ R9, CX + JMP match_nolits_copy_encodeFastBlockAsm4K + +memmove_midmatch_emit_encodeFastBlockAsm4K: + LEAQ (CX)(R8*1), R9 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ R8, $0x20 + JBE emit_lit_memmove_mid_match_emit_encodeFastBlockAsm4K_memmove_move_17through32 + JMP emit_lit_memmove_mid_match_emit_encodeFastBlockAsm4K_memmove_move_33through64 + +emit_lit_memmove_mid_match_emit_encodeFastBlockAsm4K_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_mid_end_copy_match_emit_encodeFastBlockAsm4K + +emit_lit_memmove_mid_match_emit_encodeFastBlockAsm4K_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_mid_end_copy_match_emit_encodeFastBlockAsm4K: + MOVQ R9, CX + JMP match_nolits_copy_encodeFastBlockAsm4K + +memmove_long_match_emit_encodeFastBlockAsm4K: + LEAQ (CX)(R8*1), R9 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeFastBlockAsm4Klarge_forward_sse_loop_32 + LEAQ -32(DI)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + PCALIGN $0x10 + +emit_lit_memmove_long_match_emit_encodeFastBlockAsm4Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_encodeFastBlockAsm4Klarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeFastBlockAsm4Klarge_forward_sse_loop_32: + MOVOU -32(DI)(R13*1), X4 + MOVOU -16(DI)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_encodeFastBlockAsm4Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + MOVQ 
R9, CX + JMP match_nolits_copy_encodeFastBlockAsm4K + PCALIGN $0x10 + +match_nolits_copy_encodeFastBlockAsm4K: + // emitCopy + CMPL SI, $0x00000400 + JA two_byte_match_nolit_encodeFastBlockAsm4K + CMPL R11, $0x00000013 + JAE emit_one_longer_match_nolit_encodeFastBlockAsm4K + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL -15(SI)(R11*4), SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm4K + +emit_one_longer_match_nolit_encodeFastBlockAsm4K: + CMPL R11, $0x00000112 + JAE emit_copy1_repeat_match_nolit_encodeFastBlockAsm4K + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL 61(SI), SI + MOVW SI, (CX) + LEAL -18(R11), SI + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm4K + +emit_copy1_repeat_match_nolit_encodeFastBlockAsm4K: + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL 57(SI), SI + MOVW SI, (CX) + ADDQ $0x02, CX + SUBL $0x12, R11 + + // emitRepeat + LEAL -1(R11), SI + CMPL R11, $0x1d + JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm4K + LEAL -30(R11), SI + CMPL R11, $0x0000011e + JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm4K + CMPL R11, $0x0001001e + JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm4K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm4K + +repeat_three_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm4K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm4K + +repeat_two_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm4K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm4K + +repeat_one_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm4K: + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm4K + +two_byte_match_nolit_encodeFastBlockAsm4K: + // emitCopy2 + LEAL -64(SI), SI + LEAL -4(R11), R11 + MOVW SI, 1(CX) + CMPL R11, $0x3c + JBE emit_copy2_0_match_nolit_encodeFastBlockAsm4K_emit2 + LEAL -60(R11), SI + CMPL R11, $0x0000013c + JB emit_copy2_1_match_nolit_encodeFastBlockAsm4K_emit2 + CMPL R11, $0x0001003c + JB emit_copy2_2_match_nolit_encodeFastBlockAsm4K_emit2 + MOVB $0xfe, (CX) + MOVL SI, 3(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm4K + +emit_copy2_2_match_nolit_encodeFastBlockAsm4K_emit2: + MOVB $0xfa, (CX) + MOVW SI, 3(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm4K + +emit_copy2_1_match_nolit_encodeFastBlockAsm4K_emit2: + MOVB $0xf6, (CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm4K + +emit_copy2_0_match_nolit_encodeFastBlockAsm4K_emit2: + MOVL $0x00000002, SI + LEAL (SI)(R11*4), SI + MOVB SI, (CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeFastBlockAsm4K: + CMPL DX, 8(SP) + JAE emit_remainder_encodeFastBlockAsm4K + MOVQ -2(BX)(DX*1), DI + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeFastBlockAsm4K + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeFastBlockAsm4K: + MOVQ $0xcf1bbcdcb7a56463, SI + MOVQ DI, R8 + MOVQ (BX)(DX*1), DI + MOVQ DI, R9 + IMULQ SI, R8 + SHRQ $0x36, R8 + IMULQ SI, R9 + SHRQ $0x36, R9 + LEAL -2(DX), R10 + MOVWLZX (AX)(R9*2), SI + MOVW R10, (AX)(R8*2) + MOVW DX, (AX)(R9*2) + MOVL DX, R8 + INCL DX + CMPQ (BX)(SI*1), DI + JNE search_loop_encodeFastBlockAsm4K + MOVL R8, DI + SUBL SI, DI + MOVL DI, 16(SP) + CMPQ CX, (SP) + JB dst_size_check_ok_4 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_4: + 
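+ // Note: the candidate check above verified an 8-byte match; DX was already
+ // incremented once, so advancing DX by 7 and SI by 8 skips the verified
+ // bytes before the remaining match length is measured.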
ADDL $0x07, DX + ADDL $0x08, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R8 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit2_encodeFastBlockAsm4K + PCALIGN $0x10 + +matchlen_loopback_16_match_nolit2_encodeFastBlockAsm4K: + MOVQ (R8)(R11*1), R9 + MOVQ 8(R8)(R11*1), R10 + XORQ (SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeFastBlockAsm4K + XORQ 8(SI)(R11*1), R10 + JNZ matchlen_bsf_16match_nolit2_encodeFastBlockAsm4K + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit2_encodeFastBlockAsm4K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit2_encodeFastBlockAsm4K + JMP matchlen_match8_match_nolit2_encodeFastBlockAsm4K + PCALIGN $0x10 + +matchlen_bsf_16match_nolit2_encodeFastBlockAsm4K: + TZCNTQ R10, R10 + SARQ $0x03, R10 + LEAL 8(R11)(R10*1), R11 + JMP match_nolit2_end_encodeFastBlockAsm4K + +matchlen_match8_match_nolit2_encodeFastBlockAsm4K: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit2_encodeFastBlockAsm4K + MOVQ (R8)(R11*1), R9 + XORQ (SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeFastBlockAsm4K + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit2_encodeFastBlockAsm4K + PCALIGN $0x10 + +matchlen_bsf_8_match_nolit2_encodeFastBlockAsm4K: + TZCNTQ R9, R9 + SARQ $0x03, R9 + ADDL R9, R11 + JMP match_nolit2_end_encodeFastBlockAsm4K + +matchlen_match4_match_nolit2_encodeFastBlockAsm4K: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit2_encodeFastBlockAsm4K + MOVL (R8)(R11*1), R9 + CMPL (SI)(R11*1), R9 + JNE matchlen_match2_match_nolit2_encodeFastBlockAsm4K + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit2_encodeFastBlockAsm4K: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit2_encodeFastBlockAsm4K + JB match_nolit2_end_encodeFastBlockAsm4K + MOVW (R8)(R11*1), R9 + CMPW (SI)(R11*1), R9 + JNE matchlen_match1_match_nolit2_encodeFastBlockAsm4K + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit2_end_encodeFastBlockAsm4K + +matchlen_match1_match_nolit2_encodeFastBlockAsm4K: + MOVB (R8)(R11*1), R9 + CMPB (SI)(R11*1), R9 + JNE match_nolit2_end_encodeFastBlockAsm4K + LEAL 1(R11), R11 + JMP match_nolit2_end_encodeFastBlockAsm4K + PCALIGN $0x10 + +match_nolit2_end_encodeFastBlockAsm4K: + ADDL R11, DX + ADDL $0x08, R11 + MOVL DX, 12(SP) + MOVL 16(SP), SI + JMP match_nolits_copy_encodeFastBlockAsm4K + +emit_remainder_encodeFastBlockAsm4K: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + SUBL DX, AX + JZ emit_remainder_end_encodeFastBlockAsm4K + LEAQ (BX)(DX*1), DX + LEAQ 3(CX)(AX*1), BX + CMPQ BX, (SP) + JB dst_size_check_ok_5 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_5: + // emitLiteral + LEAL -1(AX), BX + CMPL BX, $0x1d + JB one_byte_emit_remainder_encodeFastBlockAsm4K + SUBL $0x1d, BX + CMPL BX, $0x00000100 + JB two_bytes_emit_remainder_encodeFastBlockAsm4K + JB three_bytes_emit_remainder_encodeFastBlockAsm4K + +three_bytes_emit_remainder_encodeFastBlockAsm4K: + MOVB $0xf0, (CX) + MOVW BX, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, BX + JMP memmove_long_emit_remainder_encodeFastBlockAsm4K + +two_bytes_emit_remainder_encodeFastBlockAsm4K: + MOVB $0xe8, (CX) + MOVB BL, 1(CX) + ADDL $0x1d, BX + ADDQ $0x02, CX + CMPL BX, $0x40 + JB memmove_midemit_remainder_encodeFastBlockAsm4K + JMP memmove_long_emit_remainder_encodeFastBlockAsm4K + +one_byte_emit_remainder_encodeFastBlockAsm4K: + SHLB $0x03, BL + MOVB BL, (CX) + ADDQ $0x01, CX + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 1 + CMPQ AX, $0x03 + JB 
emit_lit_memmove_emit_remainder_encodeFastBlockAsm4K_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeFastBlockAsm4K_memmove_move_3 + CMPQ AX, $0x08 + JBE emit_lit_memmove_emit_remainder_encodeFastBlockAsm4K_memmove_move_4through8 + CMPQ AX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeFastBlockAsm4K_memmove_move_8through16 + CMPQ AX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeFastBlockAsm4K_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeFastBlockAsm4K_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm4K_memmove_move_1or2: + MOVB (DX), SI + MOVB -1(DX)(AX*1), DL + MOVB SI, (CX) + MOVB DL, -1(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm4K + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm4K_memmove_move_3: + MOVW (DX), SI + MOVB 2(DX), DL + MOVW SI, (CX) + MOVB DL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm4K + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm4K_memmove_move_4through8: + MOVL (DX), SI + MOVL -4(DX)(AX*1), DX + MOVL SI, (CX) + MOVL DX, -4(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm4K + PCALIGN $0x10 + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm4K_memmove_move_8through16: + MOVQ (DX), SI + MOVQ -8(DX)(AX*1), DX + MOVQ SI, (CX) + MOVQ DX, -8(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm4K + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm4K_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm4K + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm4K_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + +memmove_end_copy_emit_remainder_encodeFastBlockAsm4K: + MOVQ BX, CX + JMP emit_remainder_end_encodeFastBlockAsm4K + +memmove_midemit_remainder_encodeFastBlockAsm4K: + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 30 + CMPQ AX, $0x20 + JBE emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm4K_memmove_move_17through32 + JMP emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm4K_memmove_move_33through64 + +emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm4K_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_mid_end_copy_emit_remainder_encodeFastBlockAsm4K + +emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm4K_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + +memmove_mid_end_copy_emit_remainder_encodeFastBlockAsm4K: + MOVQ BX, CX + JMP emit_remainder_end_encodeFastBlockAsm4K + +memmove_long_emit_remainder_encodeFastBlockAsm4K: + LEAQ (CX)(AX*1), BX + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVQ AX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm4Klarge_forward_sse_loop_32 + LEAQ -32(DX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + PCALIGN $0x10 + +emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm4Klarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, 
R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm4Klarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm4Klarge_forward_sse_loop_32: + MOVOU -32(DX)(R8*1), X4 + MOVOU -16(DX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ AX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm4Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + MOVQ BX, CX + +emit_remainder_end_encodeFastBlockAsm4K: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeFastBlockAsm1K(dst []byte, src []byte, tmp *[1024]byte) int +// Requires: BMI, SSE2 +TEXT ·encodeFastBlockAsm1K(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00000008, DX + MOVQ AX, BX + PXOR X0, X0 + +zero_loop_encodeFastBlockAsm1K: + MOVOU X0, (BX) + MOVOU X0, 16(BX) + MOVOU X0, 32(BX) + MOVOU X0, 48(BX) + MOVOU X0, 64(BX) + MOVOU X0, 80(BX) + MOVOU X0, 96(BX) + MOVOU X0, 112(BX) + ADDQ $0x80, BX + DECQ DX + JNZ zero_loop_encodeFastBlockAsm1K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), DX + LEAQ -17(DX), BX + LEAQ -17(DX), SI + MOVL SI, 8(SP) + SHRQ $0x03, DX + SUBL DX, BX + LEAQ (CX)(BX*1), BX + MOVQ BX, (SP) + MOVL $0x00000001, DX + MOVL DX, 16(SP) + MOVQ src_base+24(FP), BX + +search_loop_encodeFastBlockAsm1K: + MOVL DX, SI + SUBL 12(SP), SI + SHRL $0x03, SI + LEAL 4(DX)(SI*1), SI + CMPL SI, 8(SP) + JAE emit_remainder_encodeFastBlockAsm1K + MOVQ (BX)(DX*1), DI + MOVL SI, 20(SP) + MOVQ $0xcf1bbcdcb7a56463, R9 + MOVQ DI, R8 + IMULQ R9, R8 + SHRQ $0x37, R8 + MOVQ 1(BX)(DX*1), R10 + IMULQ R9, R10 + SHRQ $0x37, R10 + MOVWLZX (AX)(R8*2), SI + MOVW DX, (AX)(R8*2) + MOVWLZX (AX)(R10*2), R8 + MOVW DX, (AX)(R10*2) + MOVQ 2(BX)(DX*1), R10 + IMULQ R9, R10 + SHRQ $0x37, R10 + MOVL DX, R9 + SUBL 16(SP), R9 + MOVL 1(BX)(R9*1), R11 + MOVQ DI, R9 + SHRQ $0x08, R9 + CMPL R9, R11 + JNE no_repeat_found_encodeFastBlockAsm1K + LEAL 1(DX), DI + MOVL 12(SP), SI + MOVL DI, SI + MOVL 12(SP), R8 + SUBL R8, SI + LEAQ 3(CX)(SI*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_1 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_1: + LEAQ (BX)(R8*1), R8 + + // emitLiteral + LEAL -1(SI), R9 + CMPL R9, $0x1d + JB one_byte_repeat_emit_lits_encodeFastBlockAsm1K + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_repeat_emit_lits_encodeFastBlockAsm1K + JB three_bytes_repeat_emit_lits_encodeFastBlockAsm1K + +three_bytes_repeat_emit_lits_encodeFastBlockAsm1K: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_repeat_emit_lits_encodeFastBlockAsm1K + +two_bytes_repeat_emit_lits_encodeFastBlockAsm1K: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midrepeat_emit_lits_encodeFastBlockAsm1K + JMP memmove_long_repeat_emit_lits_encodeFastBlockAsm1K + +one_byte_repeat_emit_lits_encodeFastBlockAsm1K: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ SI, $0x10 + JBE emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm1K_memmove_move_8through16 + CMPQ SI, $0x20 + JBE emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm1K_memmove_move_17through32 + JMP emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm1K_memmove_move_33through64 + PCALIGN $0x10 + +emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm1K_memmove_move_8through16: + MOVOU (R8), X0 + MOVOU X0, (CX) + JMP 
memmove_end_copy_repeat_emit_lits_encodeFastBlockAsm1K + +emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm1K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_end_copy_repeat_emit_lits_encodeFastBlockAsm1K + +emit_lit_memmove_repeat_emit_lits_encodeFastBlockAsm1K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_end_copy_repeat_emit_lits_encodeFastBlockAsm1K: + MOVQ R9, CX + JMP repeat_emit_lits_end_encodeFastBlockAsm1K + +memmove_midrepeat_emit_lits_encodeFastBlockAsm1K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ SI, $0x20 + JBE emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm1K_memmove_move_17through32 + JMP emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm1K_memmove_move_33through64 + +emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm1K_memmove_move_17through32: + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(SI*1) + JMP memmove_mid_end_copy_repeat_emit_lits_encodeFastBlockAsm1K + +emit_lit_memmove_mid_repeat_emit_lits_encodeFastBlockAsm1K_memmove_move_33through64: + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + +memmove_mid_end_copy_repeat_emit_lits_encodeFastBlockAsm1K: + MOVQ R9, CX + JMP repeat_emit_lits_end_encodeFastBlockAsm1K + +memmove_long_repeat_emit_lits_encodeFastBlockAsm1K: + LEAQ (CX)(SI*1), R9 + + // genMemMoveLong + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R11 + SHRQ $0x05, R11 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm1Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R10 + LEAQ -32(CX)(R12*1), R13 + PCALIGN $0x10 + +emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm1Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R10 + ADDQ $0x20, R12 + DECQ R11 + JNA emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm1Klarge_big_loop_back + +emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm1Klarge_forward_sse_loop_32: + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ SI, R12 + JAE emit_lit_memmove_long_repeat_emit_lits_encodeFastBlockAsm1Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + MOVQ R9, CX + +repeat_emit_lits_end_encodeFastBlockAsm1K: + ADDL $0x05, DX + MOVL DX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), R8 + SUBL DX, R8 + LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_repeat_extend_encodeFastBlockAsm1K + PCALIGN $0x10 + +matchlen_loopback_16_repeat_extend_encodeFastBlockAsm1K: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeFastBlockAsm1K + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeFastBlockAsm1K + LEAL -16(R8), R8 + LEAL 16(R11), R11 + +matchlen_loop_16_entry_repeat_extend_encodeFastBlockAsm1K: + CMPL R8, $0x10 + JAE 
matchlen_loopback_16_repeat_extend_encodeFastBlockAsm1K + JMP matchlen_match8_repeat_extend_encodeFastBlockAsm1K + PCALIGN $0x10 + +matchlen_bsf_16repeat_extend_encodeFastBlockAsm1K: + TZCNTQ R12, R12 + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeFastBlockAsm1K + +matchlen_match8_repeat_extend_encodeFastBlockAsm1K: + CMPL R8, $0x08 + JB matchlen_match4_repeat_extend_encodeFastBlockAsm1K + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeFastBlockAsm1K + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeFastBlockAsm1K + PCALIGN $0x10 + +matchlen_bsf_8_repeat_extend_encodeFastBlockAsm1K: + TZCNTQ R10, R10 + SARQ $0x03, R10 + ADDL R10, R11 + JMP repeat_extend_forward_end_encodeFastBlockAsm1K + +matchlen_match4_repeat_extend_encodeFastBlockAsm1K: + CMPL R8, $0x04 + JB matchlen_match2_repeat_extend_encodeFastBlockAsm1K + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_repeat_extend_encodeFastBlockAsm1K + LEAL -4(R8), R8 + LEAL 4(R11), R11 + +matchlen_match2_repeat_extend_encodeFastBlockAsm1K: + CMPL R8, $0x01 + JE matchlen_match1_repeat_extend_encodeFastBlockAsm1K + JB repeat_extend_forward_end_encodeFastBlockAsm1K + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_repeat_extend_encodeFastBlockAsm1K + LEAL 2(R11), R11 + SUBL $0x02, R8 + JZ repeat_extend_forward_end_encodeFastBlockAsm1K + +matchlen_match1_repeat_extend_encodeFastBlockAsm1K: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE repeat_extend_forward_end_encodeFastBlockAsm1K + LEAL 1(R11), R11 + JMP repeat_extend_forward_end_encodeFastBlockAsm1K + PCALIGN $0x10 + +repeat_extend_forward_end_encodeFastBlockAsm1K: + ADDL R11, DX + MOVL DX, SI + SUBL DI, SI + MOVL 16(SP), DI + + // emitRepeat + LEAL -1(SI), DI + CMPL SI, $0x1d + JBE repeat_one_match_repeat_encodeFastBlockAsm1K + LEAL -30(SI), DI + CMPL SI, $0x0000011e + JB repeat_two_match_repeat_encodeFastBlockAsm1K + CMPL SI, $0x0001001e + JB repeat_three_match_repeat_encodeFastBlockAsm1K + MOVB $0xfc, (CX) + MOVL DI, 1(CX) + ADDQ $0x04, CX + JMP repeat_end_emit_encodeFastBlockAsm1K + +repeat_three_match_repeat_encodeFastBlockAsm1K: + MOVB $0xf4, (CX) + MOVW DI, 1(CX) + ADDQ $0x03, CX + JMP repeat_end_emit_encodeFastBlockAsm1K + +repeat_two_match_repeat_encodeFastBlockAsm1K: + MOVB $0xec, (CX) + MOVB DI, 1(CX) + ADDQ $0x02, CX + JMP repeat_end_emit_encodeFastBlockAsm1K + +repeat_one_match_repeat_encodeFastBlockAsm1K: + XORL DI, DI + LEAL -4(DI)(SI*8), DI + MOVB DI, (CX) + ADDQ $0x01, CX + +repeat_end_emit_encodeFastBlockAsm1K: + MOVL DX, 12(SP) + JMP search_loop_encodeFastBlockAsm1K + +no_repeat_found_encodeFastBlockAsm1K: + CMPQ (BX)(SI*1), DI + JEQ candidate_match_encodeFastBlockAsm1K + MOVQ 1(BX)(DX*1), DI + MOVWLZX (AX)(R10*2), SI + LEAL 2(DX), R9 + CMPQ (BX)(R8*1), DI + JEQ candidate2_match_encodeFastBlockAsm1K + MOVW R9, (AX)(R10*2) + MOVQ 2(BX)(DX*1), DI + CMPQ (BX)(SI*1), DI + JEQ candidate3_match_encodeFastBlockAsm1K + MOVL 20(SP), DX + JMP search_loop_encodeFastBlockAsm1K + +candidate3_match_encodeFastBlockAsm1K: + ADDL $0x02, DX + JMP candidate_match_encodeFastBlockAsm1K + +candidate2_match_encodeFastBlockAsm1K: + MOVW R9, (AX)(R10*2) + INCL DX + MOVL R8, SI + +candidate_match_encodeFastBlockAsm1K: + CMPQ CX, (SP) + JB dst_size_check_ok_2 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_2: + MOVL DX, R8 + MOVL DX, DI + SUBL SI, DI + MOVL DI, 16(SP) + ADDL $0x08, DX + ADDL $0x08, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + 
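+ // DI now holds the bytes remaining in src; the two LEAQs below point at
+ // the current and candidate positions for the matchLen scan.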
LEAQ (BX)(DX*1), R9 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit_encodeFastBlockAsm1K + PCALIGN $0x10 + +matchlen_loopback_16_match_nolit_encodeFastBlockAsm1K: + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeFastBlockAsm1K + XORQ 8(SI)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeFastBlockAsm1K + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit_encodeFastBlockAsm1K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit_encodeFastBlockAsm1K + JMP matchlen_match8_match_nolit_encodeFastBlockAsm1K + PCALIGN $0x10 + +matchlen_bsf_16match_nolit_encodeFastBlockAsm1K: + TZCNTQ R12, R12 + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeFastBlockAsm1K + +matchlen_match8_match_nolit_encodeFastBlockAsm1K: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit_encodeFastBlockAsm1K + MOVQ (R9)(R11*1), R10 + XORQ (SI)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeFastBlockAsm1K + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeFastBlockAsm1K + PCALIGN $0x10 + +matchlen_bsf_8_match_nolit_encodeFastBlockAsm1K: + TZCNTQ R10, R10 + SARQ $0x03, R10 + ADDL R10, R11 + JMP match_nolit_end_encodeFastBlockAsm1K + +matchlen_match4_match_nolit_encodeFastBlockAsm1K: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit_encodeFastBlockAsm1K + MOVL (R9)(R11*1), R10 + CMPL (SI)(R11*1), R10 + JNE matchlen_match2_match_nolit_encodeFastBlockAsm1K + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit_encodeFastBlockAsm1K: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit_encodeFastBlockAsm1K + JB match_nolit_end_encodeFastBlockAsm1K + MOVW (R9)(R11*1), R10 + CMPW (SI)(R11*1), R10 + JNE matchlen_match1_match_nolit_encodeFastBlockAsm1K + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit_end_encodeFastBlockAsm1K + +matchlen_match1_match_nolit_encodeFastBlockAsm1K: + MOVB (R9)(R11*1), R10 + CMPB (SI)(R11*1), R10 + JNE match_nolit_end_encodeFastBlockAsm1K + LEAL 1(R11), R11 + JMP match_nolit_end_encodeFastBlockAsm1K + PCALIGN $0x10 + +match_nolit_end_encodeFastBlockAsm1K: + ADDL R11, DX + ADDL $0x08, R11 + MOVL 16(SP), SI + MOVL 12(SP), DI + MOVL DX, 12(SP) + SUBL DI, R8 + JZ match_nolits_copy_encodeFastBlockAsm1K + LEAQ (BX)(DI*1), DI + LEAQ 3(CX)(R8*1), R9 + CMPQ R9, (SP) + JB dst_size_check_ok_3 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_3: + // emitLiteral + LEAL -1(R8), R9 + CMPL R9, $0x1d + JB one_byte_match_emit_encodeFastBlockAsm1K + SUBL $0x1d, R9 + CMPL R9, $0x00000100 + JB two_bytes_match_emit_encodeFastBlockAsm1K + JB three_bytes_match_emit_encodeFastBlockAsm1K + +three_bytes_match_emit_encodeFastBlockAsm1K: + MOVB $0xf0, (CX) + MOVW R9, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, R9 + JMP memmove_long_match_emit_encodeFastBlockAsm1K + +two_bytes_match_emit_encodeFastBlockAsm1K: + MOVB $0xe8, (CX) + MOVB R9, 1(CX) + ADDL $0x1d, R9 + ADDQ $0x02, CX + CMPL R9, $0x40 + JB memmove_midmatch_emit_encodeFastBlockAsm1K + JMP memmove_long_match_emit_encodeFastBlockAsm1K + +one_byte_match_emit_encodeFastBlockAsm1K: + SHLB $0x03, R9 + MOVB R9, (CX) + ADDQ $0x01, CX + LEAQ (CX)(R8*1), R9 + + // genMemMoveShort + // margin: 16, min move: 1 + CMPQ R8, $0x10 + JBE emit_lit_memmove_match_emit_encodeFastBlockAsm1K_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_match_emit_encodeFastBlockAsm1K_memmove_move_17through32 + JMP emit_lit_memmove_match_emit_encodeFastBlockAsm1K_memmove_move_33through64 + PCALIGN 
$0x10 + +emit_lit_memmove_match_emit_encodeFastBlockAsm1K_memmove_move_8through16: + MOVOU (DI), X0 + MOVOU X0, (CX) + JMP memmove_end_copy_match_emit_encodeFastBlockAsm1K + +emit_lit_memmove_match_emit_encodeFastBlockAsm1K_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_end_copy_match_emit_encodeFastBlockAsm1K + +emit_lit_memmove_match_emit_encodeFastBlockAsm1K_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_end_copy_match_emit_encodeFastBlockAsm1K: + MOVQ R9, CX + JMP match_nolits_copy_encodeFastBlockAsm1K + +memmove_midmatch_emit_encodeFastBlockAsm1K: + LEAQ (CX)(R8*1), R9 + + // genMemMoveShort + // margin: 15, min move: 30 + CMPQ R8, $0x20 + JBE emit_lit_memmove_mid_match_emit_encodeFastBlockAsm1K_memmove_move_17through32 + JMP emit_lit_memmove_mid_match_emit_encodeFastBlockAsm1K_memmove_move_33through64 + +emit_lit_memmove_mid_match_emit_encodeFastBlockAsm1K_memmove_move_17through32: + MOVOU (DI), X0 + MOVOU -16(DI)(R8*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(R8*1) + JMP memmove_mid_end_copy_match_emit_encodeFastBlockAsm1K + +emit_lit_memmove_mid_match_emit_encodeFastBlockAsm1K_memmove_move_33through64: + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + +memmove_mid_end_copy_match_emit_encodeFastBlockAsm1K: + MOVQ R9, CX + JMP match_nolits_copy_encodeFastBlockAsm1K + +memmove_long_match_emit_encodeFastBlockAsm1K: + LEAQ (CX)(R8*1), R9 + + // genMemMoveLong + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R8*1), X2 + MOVOU -16(DI)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeFastBlockAsm1Klarge_forward_sse_loop_32 + LEAQ -32(DI)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + PCALIGN $0x10 + +emit_lit_memmove_long_match_emit_encodeFastBlockAsm1Klarge_big_loop_back: + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_match_emit_encodeFastBlockAsm1Klarge_big_loop_back + +emit_lit_memmove_long_match_emit_encodeFastBlockAsm1Klarge_forward_sse_loop_32: + MOVOU -32(DI)(R13*1), X4 + MOVOU -16(DI)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_match_emit_encodeFastBlockAsm1Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(R8*1) + MOVOU X3, -16(CX)(R8*1) + MOVQ R9, CX + JMP match_nolits_copy_encodeFastBlockAsm1K + PCALIGN $0x10 + +match_nolits_copy_encodeFastBlockAsm1K: + // emitCopy + CMPL SI, $0x00000400 + JA two_byte_match_nolit_encodeFastBlockAsm1K + CMPL R11, $0x00000013 + JAE emit_one_longer_match_nolit_encodeFastBlockAsm1K + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL -15(SI)(R11*4), SI + MOVW SI, (CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm1K + +emit_one_longer_match_nolit_encodeFastBlockAsm1K: + CMPL R11, $0x00000112 + JAE emit_copy1_repeat_match_nolit_encodeFastBlockAsm1K + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL 61(SI), SI + MOVW SI, (CX) + LEAL -18(R11), SI + MOVB SI, 2(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm1K + 
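+ // Matches of length >= 0x112 are split: a maximal copy1 covering 18 bytes
+ // is emitted first, and the remainder is encoded as a repeat below.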
+emit_copy1_repeat_match_nolit_encodeFastBlockAsm1K: + LEAL -1(SI), SI + SHLL $0x06, SI + LEAL 57(SI), SI + MOVW SI, (CX) + ADDQ $0x02, CX + SUBL $0x12, R11 + + // emitRepeat + LEAL -1(R11), SI + CMPL R11, $0x1d + JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm1K + LEAL -30(R11), SI + CMPL R11, $0x0000011e + JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm1K + CMPL R11, $0x0001001e + JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm1K + MOVB $0xfc, (CX) + MOVL SI, 1(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm1K + +repeat_three_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm1K: + MOVB $0xf4, (CX) + MOVW SI, 1(CX) + ADDQ $0x03, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm1K + +repeat_two_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm1K: + MOVB $0xec, (CX) + MOVB SI, 1(CX) + ADDQ $0x02, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm1K + +repeat_one_emit_copy1_do_repeat_match_nolit_encodeFastBlockAsm1K: + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) + ADDQ $0x01, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm1K + +two_byte_match_nolit_encodeFastBlockAsm1K: + // emitCopy2 + LEAL -64(SI), SI + LEAL -4(R11), R11 + MOVW SI, 1(CX) + CMPL R11, $0x3c + JBE emit_copy2_0_match_nolit_encodeFastBlockAsm1K_emit2 + LEAL -60(R11), SI + CMPL R11, $0x0000013c + JB emit_copy2_1_match_nolit_encodeFastBlockAsm1K_emit2 + CMPL R11, $0x0001003c + JB emit_copy2_2_match_nolit_encodeFastBlockAsm1K_emit2 + MOVB $0xfe, (CX) + MOVL SI, 3(CX) + ADDQ $0x06, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm1K + +emit_copy2_2_match_nolit_encodeFastBlockAsm1K_emit2: + MOVB $0xfa, (CX) + MOVW SI, 3(CX) + ADDQ $0x05, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm1K + +emit_copy2_1_match_nolit_encodeFastBlockAsm1K_emit2: + MOVB $0xf6, (CX) + MOVB SI, 3(CX) + ADDQ $0x04, CX + JMP match_nolit_emitcopy_end_encodeFastBlockAsm1K + +emit_copy2_0_match_nolit_encodeFastBlockAsm1K_emit2: + MOVL $0x00000002, SI + LEAL (SI)(R11*4), SI + MOVB SI, (CX) + ADDQ $0x03, CX + +match_nolit_emitcopy_end_encodeFastBlockAsm1K: + CMPL DX, 8(SP) + JAE emit_remainder_encodeFastBlockAsm1K + MOVQ -2(BX)(DX*1), DI + CMPQ CX, (SP) + JB match_nolit_dst_ok_encodeFastBlockAsm1K + MOVQ $0x00000000, ret+56(FP) + RET + +match_nolit_dst_ok_encodeFastBlockAsm1K: + MOVQ $0xcf1bbcdcb7a56463, SI + MOVQ DI, R8 + MOVQ (BX)(DX*1), DI + MOVQ DI, R9 + IMULQ SI, R8 + SHRQ $0x37, R8 + IMULQ SI, R9 + SHRQ $0x37, R9 + LEAL -2(DX), R10 + MOVWLZX (AX)(R9*2), SI + MOVW R10, (AX)(R8*2) + MOVW DX, (AX)(R9*2) + MOVL DX, R8 + INCL DX + CMPQ (BX)(SI*1), DI + JNE search_loop_encodeFastBlockAsm1K + MOVL R8, DI + SUBL SI, DI + MOVL DI, 16(SP) + CMPQ CX, (SP) + JB dst_size_check_ok_4 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_4: + ADDL $0x07, DX + ADDL $0x08, SI + MOVQ src_len+32(FP), DI + SUBL DX, DI + LEAQ (BX)(DX*1), R8 + LEAQ (BX)(SI*1), SI + + // matchLen + XORL R11, R11 + JMP matchlen_loop_16_entry_match_nolit2_encodeFastBlockAsm1K + PCALIGN $0x10 + +matchlen_loopback_16_match_nolit2_encodeFastBlockAsm1K: + MOVQ (R8)(R11*1), R9 + MOVQ 8(R8)(R11*1), R10 + XORQ (SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeFastBlockAsm1K + XORQ 8(SI)(R11*1), R10 + JNZ matchlen_bsf_16match_nolit2_encodeFastBlockAsm1K + LEAL -16(DI), DI + LEAL 16(R11), R11 + +matchlen_loop_16_entry_match_nolit2_encodeFastBlockAsm1K: + CMPL DI, $0x10 + JAE matchlen_loopback_16_match_nolit2_encodeFastBlockAsm1K + JMP matchlen_match8_match_nolit2_encodeFastBlockAsm1K + 
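+ // A nonzero XOR marks the first mismatching byte: TZCNT finds its bit
+ // index and SARQ $0x03 converts that into the count of matching bytes.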
PCALIGN $0x10 + +matchlen_bsf_16match_nolit2_encodeFastBlockAsm1K: + TZCNTQ R10, R10 + SARQ $0x03, R10 + LEAL 8(R11)(R10*1), R11 + JMP match_nolit2_end_encodeFastBlockAsm1K + +matchlen_match8_match_nolit2_encodeFastBlockAsm1K: + CMPL DI, $0x08 + JB matchlen_match4_match_nolit2_encodeFastBlockAsm1K + MOVQ (R8)(R11*1), R9 + XORQ (SI)(R11*1), R9 + JNZ matchlen_bsf_8_match_nolit2_encodeFastBlockAsm1K + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit2_encodeFastBlockAsm1K + PCALIGN $0x10 + +matchlen_bsf_8_match_nolit2_encodeFastBlockAsm1K: + TZCNTQ R9, R9 + SARQ $0x03, R9 + ADDL R9, R11 + JMP match_nolit2_end_encodeFastBlockAsm1K + +matchlen_match4_match_nolit2_encodeFastBlockAsm1K: + CMPL DI, $0x04 + JB matchlen_match2_match_nolit2_encodeFastBlockAsm1K + MOVL (R8)(R11*1), R9 + CMPL (SI)(R11*1), R9 + JNE matchlen_match2_match_nolit2_encodeFastBlockAsm1K + LEAL -4(DI), DI + LEAL 4(R11), R11 + +matchlen_match2_match_nolit2_encodeFastBlockAsm1K: + CMPL DI, $0x01 + JE matchlen_match1_match_nolit2_encodeFastBlockAsm1K + JB match_nolit2_end_encodeFastBlockAsm1K + MOVW (R8)(R11*1), R9 + CMPW (SI)(R11*1), R9 + JNE matchlen_match1_match_nolit2_encodeFastBlockAsm1K + LEAL 2(R11), R11 + SUBL $0x02, DI + JZ match_nolit2_end_encodeFastBlockAsm1K + +matchlen_match1_match_nolit2_encodeFastBlockAsm1K: + MOVB (R8)(R11*1), R9 + CMPB (SI)(R11*1), R9 + JNE match_nolit2_end_encodeFastBlockAsm1K + LEAL 1(R11), R11 + JMP match_nolit2_end_encodeFastBlockAsm1K + PCALIGN $0x10 + +match_nolit2_end_encodeFastBlockAsm1K: + ADDL R11, DX + ADDL $0x08, R11 + MOVL DX, 12(SP) + MOVL 16(SP), SI + JMP match_nolits_copy_encodeFastBlockAsm1K + +emit_remainder_encodeFastBlockAsm1K: + MOVQ src_len+32(FP), AX + MOVL 12(SP), DX + SUBL DX, AX + JZ emit_remainder_end_encodeFastBlockAsm1K + LEAQ (BX)(DX*1), DX + LEAQ 3(CX)(AX*1), BX + CMPQ BX, (SP) + JB dst_size_check_ok_5 + MOVQ $0x00000000, ret+56(FP) + RET + +dst_size_check_ok_5: + // emitLiteral + LEAL -1(AX), BX + CMPL BX, $0x1d + JB one_byte_emit_remainder_encodeFastBlockAsm1K + SUBL $0x1d, BX + CMPL BX, $0x00000100 + JB two_bytes_emit_remainder_encodeFastBlockAsm1K + JB three_bytes_emit_remainder_encodeFastBlockAsm1K + +three_bytes_emit_remainder_encodeFastBlockAsm1K: + MOVB $0xf0, (CX) + MOVW BX, 1(CX) + ADDQ $0x03, CX + ADDL $0x1d, BX + JMP memmove_long_emit_remainder_encodeFastBlockAsm1K + +two_bytes_emit_remainder_encodeFastBlockAsm1K: + MOVB $0xe8, (CX) + MOVB BL, 1(CX) + ADDL $0x1d, BX + ADDQ $0x02, CX + CMPL BX, $0x40 + JB memmove_midemit_remainder_encodeFastBlockAsm1K + JMP memmove_long_emit_remainder_encodeFastBlockAsm1K + +one_byte_emit_remainder_encodeFastBlockAsm1K: + SHLB $0x03, BL + MOVB BL, (CX) + ADDQ $0x01, CX + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 1 + CMPQ AX, $0x03 + JB emit_lit_memmove_emit_remainder_encodeFastBlockAsm1K_memmove_move_1or2 + JE emit_lit_memmove_emit_remainder_encodeFastBlockAsm1K_memmove_move_3 + CMPQ AX, $0x08 + JBE emit_lit_memmove_emit_remainder_encodeFastBlockAsm1K_memmove_move_4through8 + CMPQ AX, $0x10 + JBE emit_lit_memmove_emit_remainder_encodeFastBlockAsm1K_memmove_move_8through16 + CMPQ AX, $0x20 + JBE emit_lit_memmove_emit_remainder_encodeFastBlockAsm1K_memmove_move_17through32 + JMP emit_lit_memmove_emit_remainder_encodeFastBlockAsm1K_memmove_move_33through64 + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm1K_memmove_move_1or2: + MOVB (DX), SI + MOVB -1(DX)(AX*1), DL + MOVB SI, (CX) + MOVB DL, -1(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm1K + 
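+ // Moves of 4 bytes and up are handled with overlapping head/tail loads
+ // sized to the copy (e.g. two 4-byte loads cover any 4..8 byte move),
+ // avoiding a byte loop.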
+emit_lit_memmove_emit_remainder_encodeFastBlockAsm1K_memmove_move_3: + MOVW (DX), SI + MOVB 2(DX), DL + MOVW SI, (CX) + MOVB DL, 2(CX) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm1K + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm1K_memmove_move_4through8: + MOVL (DX), SI + MOVL -4(DX)(AX*1), DX + MOVL SI, (CX) + MOVL DX, -4(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm1K + PCALIGN $0x10 + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm1K_memmove_move_8through16: + MOVQ (DX), SI + MOVQ -8(DX)(AX*1), DX + MOVQ SI, (CX) + MOVQ DX, -8(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm1K + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm1K_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_end_copy_emit_remainder_encodeFastBlockAsm1K + +emit_lit_memmove_emit_remainder_encodeFastBlockAsm1K_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + +memmove_end_copy_emit_remainder_encodeFastBlockAsm1K: + MOVQ BX, CX + JMP emit_remainder_end_encodeFastBlockAsm1K + +memmove_midemit_remainder_encodeFastBlockAsm1K: + LEAQ (CX)(AX*1), BX + + // genMemMoveShort + // margin: 0, min move: 30 + CMPQ AX, $0x20 + JBE emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm1K_memmove_move_17through32 + JMP emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm1K_memmove_move_33through64 + +emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm1K_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(AX*1), X1 + MOVOU X0, (CX) + MOVOU X1, -16(CX)(AX*1) + JMP memmove_mid_end_copy_emit_remainder_encodeFastBlockAsm1K + +emit_lit_memmove_mid_emit_remainder_encodeFastBlockAsm1K_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + +memmove_mid_end_copy_emit_remainder_encodeFastBlockAsm1K: + MOVQ BX, CX + JMP emit_remainder_end_encodeFastBlockAsm1K + +memmove_long_emit_remainder_encodeFastBlockAsm1K: + LEAQ (CX)(AX*1), BX + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(AX*1), X2 + MOVOU -16(DX)(AX*1), X3 + MOVQ AX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm1Klarge_forward_sse_loop_32 + LEAQ -32(DX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + PCALIGN $0x10 + +emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm1Klarge_big_loop_back: + MOVOU (SI), X4 + MOVOU 16(SI), X5 + MOVOA X4, (R9) + MOVOA X5, 16(R9) + ADDQ $0x20, R9 + ADDQ $0x20, SI + ADDQ $0x20, R8 + DECQ DI + JNA emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm1Klarge_big_loop_back + +emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm1Klarge_forward_sse_loop_32: + MOVOU -32(DX)(R8*1), X4 + MOVOU -16(DX)(R8*1), X5 + MOVOA X4, -32(CX)(R8*1) + MOVOA X5, -16(CX)(R8*1) + ADDQ $0x20, R8 + CMPQ AX, R8 + JAE emit_lit_memmove_long_emit_remainder_encodeFastBlockAsm1Klarge_forward_sse_loop_32 + MOVOU X0, (CX) + MOVOU X1, 16(CX) + MOVOU X2, -32(CX)(AX*1) + MOVOU X3, -16(CX)(AX*1) + MOVQ BX, CX + +emit_remainder_end_encodeFastBlockAsm1K: + MOVQ dst_base+0(FP), AX + SUBQ AX, CX + MOVQ CX, ret+56(FP) + RET + +// func encodeBetterBlockAsm(dst []byte, src []byte, tmp *[589824]byte) int +// 
Requires: BMI, CMOV, SSE2 +TEXT ·encodeBetterBlockAsm(SB), $24-64 + MOVQ tmp+48(FP), AX + MOVQ dst_base+0(FP), CX + MOVQ $0x00001200, DX + PXOR X0, X0 + +zero_loop_encodeBetterBlockAsm: + MOVOU X0, (AX) + MOVOU X0, 16(AX) + MOVOU X0, 32(AX) + MOVOU X0, 48(AX) + MOVOU X0, 64(AX) + MOVOU X0, 80(AX) + MOVOU X0, 96(AX) + MOVOU X0, 112(AX) + ADDQ $0x80, AX + DECQ DX + JNZ zero_loop_encodeBetterBlockAsm + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), AX + LEAQ -17(AX), DX + LEAQ -17(AX), BX + MOVL BX, 8(SP) + SHRQ $0x05, AX + SUBL AX, DX + LEAQ (CX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, AX + MOVL AX, 16(SP) + MOVQ src_base+24(FP), DX + PCALIGN $0x10 + +search_loop_encodeBetterBlockAsm: + MOVQ tmp+48(FP), BX + MOVL AX, SI + SUBL 12(SP), SI + SHRL $0x08, SI + CMPL SI, $0x63 + JBE check_maxskip_ok_encodeBetterBlockAsm + LEAL 100(AX), SI + JMP check_maxskip_cont_encodeBetterBlockAsm + +check_maxskip_ok_encodeBetterBlockAsm: + LEAL 1(AX)(SI*1), SI + +check_maxskip_cont_encodeBetterBlockAsm: + CMPL SI, 8(SP) + JAE emit_remainder_encodeBetterBlockAsm + MOVQ (DX)(AX*1), DI + MOVL SI, 20(SP) + MOVQ $0x00cf1bbcdcbfa563, R9 + MOVQ $0x9e3779b1, SI + MOVQ DI, R10 + MOVQ DI, R11 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x2f, R10 + SHLQ $0x20, R11 + IMULQ SI, R11 + SHRQ $0x32, R11 + MOVL (BX)(R10*4), SI + MOVL 524288(BX)(R11*4), R8 + MOVL AX, (BX)(R10*4) + MOVL AX, 524288(BX)(R11*4) + LEAL -2162685(AX), R10 + CMPL SI, R10 + CMOVLLE R10, SI + MOVQ (DX)(SI*1), R11 + CMPQ R11, DI + JEQ candidate_match_encodeBetterBlockAsm + CMPL R8, R10 + CMOVLLE R10, R8 + MOVQ (DX)(R8*1), R12 + CMPQ R12, DI + MOVL AX, R13 + SUBL 16(SP), R13 + MOVQ (DX)(R13*1), R13 + MOVQ $0x000000ffffffff00, R14 + XORQ DI, R13 + TESTQ R14, R13 + JNE no_repeat_found_encodeBetterBlockAsm + LEAL 1(AX), BX + MOVL 12(SP), SI + MOVL BX, DI + SUBL 16(SP), DI + JZ repeat_extend_back_end_encodeBetterBlockAsm + +repeat_extend_back_loop_encodeBetterBlockAsm: + CMPL BX, SI JBE repeat_extend_back_end_encodeBetterBlockAsm - MOVB -1(DX)(R9*1), R10 - MOVB -1(DX)(DI*1), R11 - CMPB R10, R11 + MOVB -1(DX)(DI*1), R8 + MOVB -1(DX)(BX*1), R9 + CMPB R8, R9 JNE repeat_extend_back_end_encodeBetterBlockAsm - LEAL -1(DI), DI - DECL R9 + LEAL -1(BX), BX + DECL DI JNZ repeat_extend_back_loop_encodeBetterBlockAsm repeat_extend_back_end_encodeBetterBlockAsm: - MOVL DI, R8 - SUBL 12(SP), R8 - LEAQ 4(CX)(R8*1), R8 - CMPQ R8, (SP) + MOVL BX, SI + SUBL 12(SP), SI + LEAQ 4(CX)(SI*1), SI + CMPQ SI, (SP) JB repeat_dst_size_check_encodeBetterBlockAsm MOVQ $0x00000000, ret+56(FP) RET repeat_dst_size_check_encodeBetterBlockAsm: // emitLiteralsDstP - MOVL 12(SP), R8 - CMPL R8, DI + MOVL 12(SP), SI + CMPL SI, BX JEQ emit_literal_done_repeat_emit_encodeBetterBlockAsm - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(R8*1), R10 - SUBL R8, R9 + MOVL BX, DI + MOVL BX, 12(SP) + LEAQ (DX)(SI*1), R8 + SUBL SI, DI // emitLiteral - LEAL -1(R9), R8 - CMPL R8, $0x1d + LEAL -1(DI), SI + CMPL SI, $0x1d JB one_byte_repeat_emit_encodeBetterBlockAsm - SUBL $0x1d, R8 - CMPL R8, $0x00000100 + SUBL $0x1d, SI + CMPL SI, $0x00000100 JB two_bytes_repeat_emit_encodeBetterBlockAsm - CMPL R8, $0x00010000 + CMPL SI, $0x00010000 JB three_bytes_repeat_emit_encodeBetterBlockAsm - MOVL R8, R11 - SHRL $0x10, R11 + MOVL SI, R9 + SHRL $0x10, R9 MOVB $0xf8, (CX) - MOVW R8, 1(CX) - MOVB R11, 3(CX) + MOVW SI, 1(CX) + MOVB R9, 3(CX) ADDQ $0x04, CX - ADDL $0x1d, R8 + ADDL $0x1d, SI JMP memmove_long_repeat_emit_encodeBetterBlockAsm three_bytes_repeat_emit_encodeBetterBlockAsm: MOVB $0xf0, (CX) - MOVW R8, 1(CX) + MOVW SI, 
1(CX) ADDQ $0x03, CX - ADDL $0x1d, R8 + ADDL $0x1d, SI JMP memmove_long_repeat_emit_encodeBetterBlockAsm two_bytes_repeat_emit_encodeBetterBlockAsm: MOVB $0xe8, (CX) - MOVB R8, 1(CX) - ADDL $0x1d, R8 + MOVB SI, 1(CX) + ADDL $0x1d, SI ADDQ $0x02, CX - CMPL R8, $0x40 + CMPL SI, $0x40 JB memmove_midrepeat_emit_encodeBetterBlockAsm JMP memmove_long_repeat_emit_encodeBetterBlockAsm one_byte_repeat_emit_encodeBetterBlockAsm: - SHLB $0x03, R8 - MOVB R8, (CX) + SHLB $0x03, SI + MOVB SI, (CX) ADDQ $0x01, CX - LEAQ (CX)(R9*1), R8 + LEAQ (CX)(DI*1), SI // genMemMoveShort // margin: 16, min move: 1 - CMPQ R9, $0x10 + CMPQ DI, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16 - CMPQ R9, $0x20 + CMPQ DI, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_8through16: - MOVOU (R10), X0 + MOVOU (R8), X0 MOVOU X0, (CX) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) + MOVOU X1, -16(CX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm emit_lit_memmove_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) memmove_end_copy_repeat_emit_encodeBetterBlockAsm: - MOVQ R8, CX + MOVQ SI, CX JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm memmove_midrepeat_emit_encodeBetterBlockAsm: - LEAQ (CX)(R9*1), R8 + LEAQ (CX)(DI*1), SI // genMemMoveShort // margin: 15, min move: 30 - CMPQ R9, $0x20 + CMPQ DI, $0x20 JBE emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64 emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) + MOVOU X1, -16(CX)(DI*1) JMP memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm emit_lit_memmove_mid_repeat_emit_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) memmove_mid_end_copy_repeat_emit_encodeBetterBlockAsm: - MOVQ R8, CX + MOVQ SI, CX JMP emit_literal_done_repeat_emit_encodeBetterBlockAsm memmove_long_repeat_emit_encodeBetterBlockAsm: - LEAQ (CX)(R9*1), R8 + LEAQ (CX)(DI*1), SI // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R12 - SHRQ $0x05, R12 - MOVQ CX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R13 - SUBQ R11, R13 - DECQ R12 - JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R10)(R13*1), R11 - LEAQ -32(CX)(R13*1), R14 + MOVOU (R8), X0 + 
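+ // genMemMoveLong: the first and last 32 bytes are loaded up front and
+ // stored last; the main loop copies 32-byte blocks with MOVOA stores
+ // aligned to the destination.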
MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R8)(R11*1), R9 + LEAQ -32(CX)(R11*1), R12 + PCALIGN $0x10 emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R12) + MOVOA X5, 16(R12) + ADDQ $0x20, R12 + ADDQ $0x20, R9 ADDQ $0x20, R11 - ADDQ $0x20, R13 - DECQ R12 + DECQ R10 JNA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R10)(R13*1), X4 - MOVOU -16(R10)(R13*1), X5 - MOVOA X4, -32(CX)(R13*1) - MOVOA X5, -16(CX)(R13*1) - ADDQ $0x20, R13 - CMPQ R9, R13 + MOVOU -32(R8)(R11*1), X4 + MOVOU -16(R8)(R11*1), X5 + MOVOA X4, -32(CX)(R11*1) + MOVOA X5, -16(CX)(R11*1) + ADDQ $0x20, R11 + CMPQ DI, R11 JAE emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ R8, CX + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + MOVQ SI, CX emit_literal_done_repeat_emit_encodeBetterBlockAsm: ADDL $0x05, AX - MOVL AX, R8 - SUBL 16(SP), R8 - MOVQ src_len+32(FP), R9 - SUBL AX, R9 - LEAQ (DX)(AX*1), R10 - LEAQ (DX)(R8*1), R8 + MOVL AX, SI + SUBL 16(SP), SI + MOVQ src_len+32(FP), DI + SUBL AX, DI + LEAQ (DX)(AX*1), R8 + LEAQ (DX)(SI*1), SI // matchLen - XORL R12, R12 + XORL R10, R10 JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm + PCALIGN $0x10 matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm: - MOVQ (R10)(R12*1), R11 - MOVQ 8(R10)(R12*1), R13 - XORQ (R8)(R12*1), R11 + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm - XORQ 8(R8)(R12*1), R13 + XORQ 8(SI)(R10*1), R11 JNZ matchlen_bsf_16repeat_extend_encodeBetterBlockAsm - LEAL -16(R9), R9 - LEAL 16(R12), R12 + LEAL -16(DI), DI + LEAL 16(R10), R10 matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm: - CMPL R9, $0x10 + CMPL DI, $0x10 JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm + PCALIGN $0x10 matchlen_bsf_16repeat_extend_encodeBetterBlockAsm: - TZCNTQ R13, R13 - SARQ $0x03, R13 - LEAL 8(R12)(R13*1), R12 + TZCNTQ R11, R11 + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 JMP repeat_extend_forward_end_encodeBetterBlockAsm matchlen_match8_repeat_extend_encodeBetterBlockAsm: - CMPL R9, $0x08 + CMPL DI, $0x08 JB matchlen_match4_repeat_extend_encodeBetterBlockAsm - MOVQ (R10)(R12*1), R11 - XORQ (R8)(R12*1), R11 + MOVQ (R8)(R10*1), R9 + XORQ (SI)(R10*1), R9 JNZ matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm - LEAL -8(R9), R9 - LEAL 8(R12), R12 + LEAL -8(DI), DI + LEAL 8(R10), R10 JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm + PCALIGN $0x10 matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm: - TZCNTQ R11, R11 - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 + TZCNTQ R9, R9 + SARQ $0x03, R9 + ADDL R9, R10 JMP repeat_extend_forward_end_encodeBetterBlockAsm matchlen_match4_repeat_extend_encodeBetterBlockAsm: - CMPL R9, $0x04 + CMPL DI, $0x04 JB matchlen_match2_repeat_extend_encodeBetterBlockAsm - MOVL (R10)(R12*1), R11 - CMPL (R8)(R12*1), R11 + MOVL (R8)(R10*1), R9 + CMPL 
(SI)(R10*1), R9 JNE matchlen_match2_repeat_extend_encodeBetterBlockAsm - LEAL -4(R9), R9 - LEAL 4(R12), R12 + LEAL -4(DI), DI + LEAL 4(R10), R10 matchlen_match2_repeat_extend_encodeBetterBlockAsm: - CMPL R9, $0x01 + CMPL DI, $0x01 JE matchlen_match1_repeat_extend_encodeBetterBlockAsm JB repeat_extend_forward_end_encodeBetterBlockAsm - MOVW (R10)(R12*1), R11 - CMPW (R8)(R12*1), R11 + MOVW (R8)(R10*1), R9 + CMPW (SI)(R10*1), R9 JNE matchlen_match1_repeat_extend_encodeBetterBlockAsm - LEAL 2(R12), R12 - SUBL $0x02, R9 + LEAL 2(R10), R10 + SUBL $0x02, DI JZ repeat_extend_forward_end_encodeBetterBlockAsm matchlen_match1_repeat_extend_encodeBetterBlockAsm: - MOVB (R10)(R12*1), R11 - CMPB (R8)(R12*1), R11 + MOVB (R8)(R10*1), R9 + CMPB (SI)(R10*1), R9 JNE repeat_extend_forward_end_encodeBetterBlockAsm - LEAL 1(R12), R12 + LEAL 1(R10), R10 + JMP repeat_extend_forward_end_encodeBetterBlockAsm + PCALIGN $0x10 repeat_extend_forward_end_encodeBetterBlockAsm: - ADDL R12, AX - MOVL AX, R8 - SUBL DI, R8 - MOVL 16(SP), DI + ADDL R10, AX + MOVL AX, SI + SUBL BX, SI + MOVL 16(SP), BX // emitRepeat - LEAL -1(R8), DI - CMPL R8, $0x1d + LEAL -1(SI), BX + CMPL SI, $0x1d JBE repeat_one_match_repeat_encodeBetterBlockAsm - LEAL -30(R8), DI - CMPL R8, $0x0000011e + LEAL -30(SI), BX + CMPL SI, $0x0000011e JB repeat_two_match_repeat_encodeBetterBlockAsm - CMPL R8, $0x0001001e + CMPL SI, $0x0001001e JB repeat_three_match_repeat_encodeBetterBlockAsm MOVB $0xfc, (CX) - MOVL DI, 1(CX) + MOVL BX, 1(CX) ADDQ $0x04, CX JMP repeat_end_emit_encodeBetterBlockAsm repeat_three_match_repeat_encodeBetterBlockAsm: MOVB $0xf4, (CX) - MOVW DI, 1(CX) + MOVW BX, 1(CX) ADDQ $0x03, CX JMP repeat_end_emit_encodeBetterBlockAsm repeat_two_match_repeat_encodeBetterBlockAsm: MOVB $0xec, (CX) - MOVB DI, 1(CX) + MOVB BL, 1(CX) ADDQ $0x02, CX JMP repeat_end_emit_encodeBetterBlockAsm repeat_one_match_repeat_encodeBetterBlockAsm: - XORL DI, DI - LEAL -4(DI)(R8*8), DI - MOVB DI, (CX) + XORL BX, BX + LEAL -4(BX)(SI*8), BX + MOVB BL, (CX) ADDQ $0x01, CX repeat_end_emit_encodeBetterBlockAsm: MOVL AX, 12(SP) JMP search_loop_encodeBetterBlockAsm + PCALIGN $0x10 no_repeat_found_encodeBetterBlockAsm: - CMPL R8, R12 - JLE offset_ok_2_encodeBetterBlockAsm - CMPL BX, R9 - JEQ candidate_match_encodeBetterBlockAsm - -offset_ok_2_encodeBetterBlockAsm: - CMPL R10, R12 - JLE offset_ok_3_encodeBetterBlockAsm - CMPL SI, R9 - JEQ candidateS_match_encodeBetterBlockAsm - -offset_ok_3_encodeBetterBlockAsm: - MOVL 20(SP), AX - JMP search_loop_encodeBetterBlockAsm + CMPL SI, R10 + CMOVLLE R10, SI + CMPL R11, DI + JEQ candidate_match_encodeBetterBlockAsm + CMPL R8, R10 + CMOVLLE R10, R8 + CMPL R12, DI + JEQ candidateS_match_encodeBetterBlockAsm + MOVL 20(SP), AX + JMP search_loop_encodeBetterBlockAsm candidateS_match_encodeBetterBlockAsm: - SHRQ $0x08, R9 - MOVQ R9, R13 - SHLQ $0x08, R13 - IMULQ R11, R13 - SHRQ $0x2f, R13 - MOVL (DI)(R13*4), R8 - INCL AX - MOVL AX, (DI)(R13*4) - CMPL R8, R12 - JLE offset_ok_4_encodeBetterBlockAsm - CMPL (DX)(R8*1), R9 - JEQ candidate_match_encodeBetterBlockAsm - -offset_ok_4_encodeBetterBlockAsm: - DECL AX - MOVL R10, R8 + SHRQ $0x08, DI + MOVQ DI, R11 + SHLQ $0x08, R11 + IMULQ R9, R11 + SHRQ $0x2f, R11 + MOVL (BX)(R11*4), SI + INCL AX + MOVL AX, (BX)(R11*4) + CMPL SI, R10 + CMOVLLE R10, SI + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm + DECL AX + MOVL R8, SI + PCALIGN $0x10 candidate_match_encodeBetterBlockAsm: - MOVL 12(SP), DI - TESTL R8, R8 + MOVL 12(SP), BX + TESTL SI, SI JZ 
match_extend_back_end_encodeBetterBlockAsm match_extend_back_loop_encodeBetterBlockAsm: - CMPL AX, DI + CMPL AX, BX JBE match_extend_back_end_encodeBetterBlockAsm - MOVB -1(DX)(R8*1), R9 - MOVB -1(DX)(AX*1), R10 - CMPB R9, R10 + MOVB -1(DX)(SI*1), DI + MOVB -1(DX)(AX*1), R8 + CMPB DI, R8 JNE match_extend_back_end_encodeBetterBlockAsm LEAL -1(AX), AX - DECL R8 + DECL SI JZ match_extend_back_end_encodeBetterBlockAsm JMP match_extend_back_loop_encodeBetterBlockAsm match_extend_back_end_encodeBetterBlockAsm: - MOVL AX, DI - SUBL 12(SP), DI - LEAQ 4(CX)(DI*1), DI - CMPQ DI, (SP) + MOVL AX, BX + SUBL 12(SP), BX + LEAQ 4(CX)(BX*1), BX + CMPQ BX, (SP) JB match_dst_size_check_encodeBetterBlockAsm MOVQ $0x00000000, ret+56(FP) RET match_dst_size_check_encodeBetterBlockAsm: - MOVL AX, DI + MOVL AX, BX ADDL $0x04, AX - ADDL $0x04, R8 - MOVQ src_len+32(FP), R9 - SUBL AX, R9 - LEAQ (DX)(AX*1), R10 - LEAQ (DX)(R8*1), R11 + ADDL $0x04, SI + MOVQ src_len+32(FP), DI + SUBL AX, DI + LEAQ (DX)(AX*1), R8 + LEAQ (DX)(SI*1), R9 // matchLen - XORL R13, R13 + XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm + PCALIGN $0x10 matchlen_loopback_16_match_nolit_encodeBetterBlockAsm: - MOVQ (R10)(R13*1), R12 - MOVQ 8(R10)(R13*1), R14 - XORQ (R11)(R13*1), R12 + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm - XORQ 8(R11)(R13*1), R14 + XORQ 8(R9)(R11*1), R12 JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm - LEAL -16(R9), R9 - LEAL 16(R13), R13 + LEAL -16(DI), DI + LEAL 16(R11), R11 matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm: - CMPL R9, $0x10 + CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm JMP matchlen_match8_match_nolit_encodeBetterBlockAsm + PCALIGN $0x10 matchlen_bsf_16match_nolit_encodeBetterBlockAsm: - TZCNTQ R14, R14 - SARQ $0x03, R14 - LEAL 8(R13)(R14*1), R13 + TZCNTQ R12, R12 + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 JMP match_nolit_end_encodeBetterBlockAsm matchlen_match8_match_nolit_encodeBetterBlockAsm: - CMPL R9, $0x08 + CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeBetterBlockAsm - MOVQ (R10)(R13*1), R12 - XORQ (R11)(R13*1), R12 + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm - LEAL -8(R9), R9 - LEAL 8(R13), R13 + LEAL -8(DI), DI + LEAL 8(R11), R11 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm + PCALIGN $0x10 matchlen_bsf_8_match_nolit_encodeBetterBlockAsm: - TZCNTQ R12, R12 - SARQ $0x03, R12 - LEAL (R13)(R12*1), R13 + TZCNTQ R10, R10 + SARQ $0x03, R10 + ADDL R10, R11 JMP match_nolit_end_encodeBetterBlockAsm matchlen_match4_match_nolit_encodeBetterBlockAsm: - CMPL R9, $0x04 + CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeBetterBlockAsm - MOVL (R10)(R13*1), R12 - CMPL (R11)(R13*1), R12 + MOVL (R8)(R11*1), R10 + CMPL (R9)(R11*1), R10 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm - LEAL -4(R9), R9 - LEAL 4(R13), R13 + LEAL -4(DI), DI + LEAL 4(R11), R11 matchlen_match2_match_nolit_encodeBetterBlockAsm: - CMPL R9, $0x01 + CMPL DI, $0x01 JE matchlen_match1_match_nolit_encodeBetterBlockAsm JB match_nolit_end_encodeBetterBlockAsm - MOVW (R10)(R13*1), R12 - CMPW (R11)(R13*1), R12 + MOVW (R8)(R11*1), R10 + CMPW (R9)(R11*1), R10 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm - LEAL 2(R13), R13 - SUBL $0x02, R9 + LEAL 2(R11), R11 + SUBL $0x02, DI JZ match_nolit_end_encodeBetterBlockAsm matchlen_match1_match_nolit_encodeBetterBlockAsm: - MOVB (R10)(R13*1), R12 - CMPB (R11)(R13*1), R12 + MOVB 
(R8)(R11*1), R10 + CMPB (R9)(R11*1), R10 JNE match_nolit_end_encodeBetterBlockAsm - LEAL 1(R13), R13 + LEAL 1(R11), R11 + JMP match_nolit_end_encodeBetterBlockAsm + PCALIGN $0x10 match_nolit_end_encodeBetterBlockAsm: - MOVL AX, R9 - SUBL R8, R9 - CMPL R13, $0x01 + MOVL AX, DI + SUBL SI, DI + CMPL R11, $0x01 JA match_length_ok_encodeBetterBlockAsm - CMPL R9, $0x0001003f + CMPL DI, $0x0001003f JBE match_length_ok_encodeBetterBlockAsm MOVL 20(SP), AX INCL AX JMP search_loop_encodeBetterBlockAsm match_length_ok_encodeBetterBlockAsm: - MOVL R9, 16(SP) + MOVL DI, 16(SP) // Check if we can combine lit+copy - MOVLQZX 12(SP), R10 - MOVL DI, R8 - SUBL R10, R8 + MOVLQZX 12(SP), R8 + MOVL BX, SI + SUBL R8, SI JZ match_emit_nolits_encodeBetterBlockAsm - CMPL R9, $0x00000040 + CMPL DI, $0x00000040 JL match_emit_lits_encodeBetterBlockAsm - CMPL R9, $0x0001003f + CMPL DI, $0x0001003f JA match_emit_copy3_encodeBetterBlockAsm - CMPL R8, $0x04 + CMPL SI, $0x04 JA match_emit_lits_encodeBetterBlockAsm - MOVL (DX)(R10*1), R10 - ADDL R13, AX - ADDL $0x04, R13 + MOVL (DX)(R8*1), R8 + ADDL R11, AX + ADDL $0x04, R11 MOVL AX, 12(SP) // emitCopy2WithLits - XORQ R11, R11 - SUBL $0x40, R9 - LEAL -11(R13), R12 - LEAL -4(R13), R13 - MOVW R9, 1(CX) - CMPL R13, $0x07 - CMOVLGE R12, R11 - MOVQ $0x00000007, R9 - CMOVLLT R13, R9 - LEAL -1(R8)(R9*4), R9 - MOVL $0x00000003, R12 - LEAL (R12)(R9*8), R9 - MOVB R9, (CX) + XORQ R9, R9 + SUBL $0x40, DI + LEAL -11(R11), R10 + LEAL -4(R11), R11 + MOVW DI, 1(CX) + CMPL R11, $0x07 + CMOVLGE R10, R9 + MOVQ $0x00000007, DI + CMOVLLT R11, DI + LEAL -1(SI)(DI*4), DI + MOVL $0x00000003, R10 + LEAL (R10)(DI*8), DI + MOVB DI, (CX) ADDQ $0x03, CX - MOVL R10, (CX) - ADDQ R8, CX - TESTL R11, R11 + MOVL R8, (CX) + ADDQ SI, CX + TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeBetterBlockAsm // emitRepeat - LEAL -1(R11), R8 - CMPL R11, $0x1d + LEAL -1(R9), SI + CMPL R9, $0x1d JBE repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm - LEAL -30(R11), R8 - CMPL R11, $0x0000011e + LEAL -30(R9), SI + CMPL R9, $0x0000011e JB repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm - CMPL R11, $0x0001001e + CMPL R9, $0x0001001e JB repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm MOVB $0xfc, (CX) - MOVL R8, 1(CX) + MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_three_match_emit_repeat_copy2_encodeBetterBlockAsm: MOVB $0xf4, (CX) - MOVW R8, 1(CX) + MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_match_emit_repeat_copy2_encodeBetterBlockAsm: MOVB $0xec, (CX) - MOVB R8, 1(CX) + MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_one_match_emit_repeat_copy2_encodeBetterBlockAsm: - XORL R8, R8 - LEAL -4(R8)(R11*8), R8 - MOVB R8, (CX) + XORL SI, SI + LEAL -4(SI)(R9*8), SI + MOVB SI, (CX) ADDQ $0x01, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm match_emit_copy3_encodeBetterBlockAsm: - CMPL R8, $0x03 + CMPL SI, $0x03 JA match_emit_lits_encodeBetterBlockAsm - MOVLQZX 12(SP), R10 - MOVL (DX)(R10*1), R10 - ADDL R13, AX - ADDL $0x04, R13 + MOVLQZX 12(SP), R8 + MOVL (DX)(R8*1), R8 + ADDL R11, AX + ADDL $0x04, R11 MOVL AX, 12(SP) // emitCopy3 - LEAL -4(R13), R13 - LEAL -65536(R9), R9 - SHLL $0x0b, R9 - LEAL 7(R9)(R8*8), R9 - CMPL R13, $0x3c + LEAL -4(R11), R11 + LEAL -65536(DI), DI + SHLL $0x0b, DI + LEAL 7(DI)(SI*8), DI + CMPL R11, $0x3c JBE emit_copy3_0_match_emit_lits_encodeBetterBlockAsm - LEAL -60(R13), R11 - CMPL R13, $0x0000013c + LEAL -60(R11), R9 + CMPL R11, $0x0000013c JB 
emit_copy3_1_match_emit_lits_encodeBetterBlockAsm - CMPL R13, $0x0001003c + CMPL R11, $0x0001003c JB emit_copy3_2_match_emit_lits_encodeBetterBlockAsm - ADDL $0x000007e0, R9 - MOVL R9, (CX) - MOVL R11, 4(CX) + ADDL $0x000007e0, DI + MOVL DI, (CX) + MOVL R9, 4(CX) ADDQ $0x07, CX JMP match_emit_copy_litsencodeBetterBlockAsm emit_copy3_2_match_emit_lits_encodeBetterBlockAsm: - ADDL $0x000007c0, R9 - MOVL R9, (CX) - MOVW R11, 4(CX) + ADDL $0x000007c0, DI + MOVL DI, (CX) + MOVW R9, 4(CX) ADDQ $0x06, CX JMP match_emit_copy_litsencodeBetterBlockAsm emit_copy3_1_match_emit_lits_encodeBetterBlockAsm: - ADDL $0x000007a0, R9 - MOVL R9, (CX) - MOVB R11, 4(CX) + ADDL $0x000007a0, DI + MOVL DI, (CX) + MOVB R9, 4(CX) ADDQ $0x05, CX JMP match_emit_copy_litsencodeBetterBlockAsm emit_copy3_0_match_emit_lits_encodeBetterBlockAsm: - SHLL $0x05, R13 - ORL R13, R9 - MOVL R9, (CX) + SHLL $0x05, R11 + ORL R11, DI + MOVL DI, (CX) ADDQ $0x04, CX match_emit_copy_litsencodeBetterBlockAsm: - MOVL R10, (CX) - ADDQ R8, CX + MOVL R8, (CX) + ADDQ SI, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm match_emit_lits_encodeBetterBlockAsm: - LEAQ (DX)(R10*1), R10 + LEAQ (DX)(R8*1), R8 // emitLiteral - LEAL -1(R8), R11 - CMPL R11, $0x1d + LEAL -1(SI), R9 + CMPL R9, $0x1d JB one_byte_match_emit_encodeBetterBlockAsm - SUBL $0x1d, R11 - CMPL R11, $0x00000100 + SUBL $0x1d, R9 + CMPL R9, $0x00000100 JB two_bytes_match_emit_encodeBetterBlockAsm - CMPL R11, $0x00010000 + CMPL R9, $0x00010000 JB three_bytes_match_emit_encodeBetterBlockAsm - MOVL R11, R12 - SHRL $0x10, R12 + MOVL R9, R10 + SHRL $0x10, R10 MOVB $0xf8, (CX) - MOVW R11, 1(CX) - MOVB R12, 3(CX) + MOVW R9, 1(CX) + MOVB R10, 3(CX) ADDQ $0x04, CX - ADDL $0x1d, R11 + ADDL $0x1d, R9 JMP memmove_long_match_emit_encodeBetterBlockAsm three_bytes_match_emit_encodeBetterBlockAsm: MOVB $0xf0, (CX) - MOVW R11, 1(CX) + MOVW R9, 1(CX) ADDQ $0x03, CX - ADDL $0x1d, R11 + ADDL $0x1d, R9 JMP memmove_long_match_emit_encodeBetterBlockAsm two_bytes_match_emit_encodeBetterBlockAsm: MOVB $0xe8, (CX) - MOVB R11, 1(CX) - ADDL $0x1d, R11 + MOVB R9, 1(CX) + ADDL $0x1d, R9 ADDQ $0x02, CX - CMPL R11, $0x40 + CMPL R9, $0x40 JB memmove_midmatch_emit_encodeBetterBlockAsm JMP memmove_long_match_emit_encodeBetterBlockAsm one_byte_match_emit_encodeBetterBlockAsm: - SHLB $0x03, R11 - MOVB R11, (CX) + SHLB $0x03, R9 + MOVB R9, (CX) ADDQ $0x01, CX - LEAQ (CX)(R8*1), R11 + LEAQ (CX)(SI*1), R9 // genMemMoveShort // margin: 16, min move: 1 - CMPQ R8, $0x10 + CMPQ SI, $0x10 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16 - CMPQ R8, $0x20 + CMPQ SI, $0x20 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16: - MOVOU (R10), X0 + MOVOU (R8), X0 MOVOU X0, (CX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R8*1), X1 + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 MOVOU X0, (CX) - MOVOU X1, -16(CX)(R8*1) + MOVOU X1, -16(CX)(SI*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R8*1), X2 - MOVOU -16(R10)(R8*1), X3 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R8*1) 
- MOVOU X3, -16(CX)(R8*1) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) memmove_end_copy_match_emit_encodeBetterBlockAsm: - MOVQ R11, CX + MOVQ R9, CX JMP match_emit_nolits_encodeBetterBlockAsm memmove_midmatch_emit_encodeBetterBlockAsm: - LEAQ (CX)(R8*1), R11 + LEAQ (CX)(SI*1), R9 // genMemMoveShort // margin: 15, min move: 30 - CMPQ R8, $0x20 + CMPQ SI, $0x20 JBE emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm_memmove_move_33through64 emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R8*1), X1 + MOVOU (R8), X0 + MOVOU -16(R8)(SI*1), X1 MOVOU X0, (CX) - MOVOU X1, -16(CX)(R8*1) + MOVOU X1, -16(CX)(SI*1) JMP memmove_mid_end_copy_match_emit_encodeBetterBlockAsm emit_lit_memmove_mid_match_emit_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R8*1), X2 - MOVOU -16(R10)(R8*1), X3 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R8*1) - MOVOU X3, -16(CX)(R8*1) + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) memmove_mid_end_copy_match_emit_encodeBetterBlockAsm: - MOVQ R11, CX + MOVQ R9, CX JMP match_emit_nolits_encodeBetterBlockAsm memmove_long_match_emit_encodeBetterBlockAsm: - LEAQ (CX)(R8*1), R11 + LEAQ (CX)(SI*1), R9 // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R8*1), X2 - MOVOU -16(R10)(R8*1), X3 - MOVQ R8, R14 - SHRQ $0x05, R14 - MOVQ CX, R12 - ANDL $0x0000001f, R12 - MOVQ $0x00000040, R15 - SUBQ R12, R15 - DECQ R14 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R10)(R15*1), R12 - LEAQ -32(CX)(R15*1), BP + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R8)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + PCALIGN $0x10 emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back: - MOVOU (R12), X4 - MOVOU 16(R12), X5 - MOVOA X4, (BP) - MOVOA X5, 16(BP) - ADDQ $0x20, BP - ADDQ $0x20, R12 - ADDQ $0x20, R15 - DECQ R14 + MOVOU (R10), X4 + MOVOU 16(R10), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R10 + ADDQ $0x20, R13 + DECQ R12 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R10)(R15*1), X4 - MOVOU -16(R10)(R15*1), X5 - MOVOA X4, -32(CX)(R15*1) - MOVOA X5, -16(CX)(R15*1) - ADDQ $0x20, R15 - CMPQ R8, R15 + MOVOU -32(R8)(R13*1), X4 + MOVOU -16(R8)(R13*1), X5 + MOVOA X4, -32(CX)(R13*1) + MOVOA X5, -16(CX)(R13*1) + ADDQ $0x20, R13 + CMPQ SI, R13 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R8*1) - MOVOU X3, -16(CX)(R8*1) - MOVQ R11, CX + MOVOU X2, -32(CX)(SI*1) + MOVOU X3, -16(CX)(SI*1) + MOVQ R9, CX match_emit_nolits_encodeBetterBlockAsm: - ADDL R13, AX - ADDL $0x04, R13 + ADDL R11, AX + ADDL $0x04, R11 MOVL AX, 12(SP) // emitCopy - CMPL R9, $0x0001003f + CMPL DI, $0x0001003f JBE two_byte_offset_match_nolit_encodeBetterBlockAsm // emitCopy3 - LEAL -4(R13), R13 - LEAL -65536(R9), R8 - SHLL $0x0b, R8 - ADDL $0x07, R8 - CMPL R13, $0x3c 
+ LEAL -4(R11), R11 + LEAL -65536(DI), SI + SHLL $0x0b, SI + ADDL $0x07, SI + CMPL R11, $0x3c JBE emit_copy3_0_match_nolit_encodeBetterBlockAsm_emit3 - LEAL -60(R13), R9 - CMPL R13, $0x0000013c + LEAL -60(R11), DI + CMPL R11, $0x0000013c JB emit_copy3_1_match_nolit_encodeBetterBlockAsm_emit3 - CMPL R13, $0x0001003c + CMPL R11, $0x0001003c JB emit_copy3_2_match_nolit_encodeBetterBlockAsm_emit3 - ADDL $0x000007e0, R8 - MOVL R8, (CX) - MOVL R9, 4(CX) + ADDL $0x000007e0, SI + MOVL SI, (CX) + MOVL DI, 4(CX) ADDQ $0x07, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_copy3_2_match_nolit_encodeBetterBlockAsm_emit3: - ADDL $0x000007c0, R8 - MOVL R8, (CX) - MOVW R9, 4(CX) + ADDL $0x000007c0, SI + MOVL SI, (CX) + MOVW DI, 4(CX) ADDQ $0x06, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_copy3_1_match_nolit_encodeBetterBlockAsm_emit3: - ADDL $0x000007a0, R8 - MOVL R8, (CX) - MOVB R9, 4(CX) + ADDL $0x000007a0, SI + MOVL SI, (CX) + MOVB DI, 4(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_copy3_0_match_nolit_encodeBetterBlockAsm_emit3: - SHLL $0x05, R13 - ORL R13, R8 - MOVL R8, (CX) + SHLL $0x05, R11 + ORL R11, SI + MOVL SI, (CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm two_byte_offset_match_nolit_encodeBetterBlockAsm: - CMPL R9, $0x00000400 + CMPL DI, $0x00000400 JA two_byte_match_nolit_encodeBetterBlockAsm - CMPL R13, $0x00000013 + CMPL R11, $0x00000013 JAE emit_one_longer_match_nolit_encodeBetterBlockAsm - LEAL -1(R9), R8 - SHLL $0x06, R8 - LEAL -15(R8)(R13*4), R8 - MOVW R8, (CX) + LEAL -1(DI), SI + SHLL $0x06, SI + LEAL -15(SI)(R11*4), SI + MOVW SI, (CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_one_longer_match_nolit_encodeBetterBlockAsm: - CMPL R13, $0x00000112 + CMPL R11, $0x00000112 JAE emit_copy1_repeat_match_nolit_encodeBetterBlockAsm - LEAL -1(R9), R8 - SHLL $0x06, R8 - LEAL 61(R8), R8 - MOVW R8, (CX) - LEAL -18(R13), R8 - MOVB R8, 2(CX) + LEAL -1(DI), SI + SHLL $0x06, SI + LEAL 61(SI), SI + MOVW SI, (CX) + LEAL -18(R11), SI + MOVB SI, 2(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_copy1_repeat_match_nolit_encodeBetterBlockAsm: - LEAL -1(R9), R8 - SHLL $0x06, R8 - LEAL 57(R8), R8 - MOVW R8, (CX) + LEAL -1(DI), SI + SHLL $0x06, SI + LEAL 57(SI), SI + MOVW SI, (CX) ADDQ $0x02, CX - SUBL $0x12, R13 + SUBL $0x12, R11 // emitRepeat - LEAL -1(R13), R8 - CMPL R13, $0x1d + LEAL -1(R11), SI + CMPL R11, $0x1d JBE repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm - LEAL -30(R13), R8 - CMPL R13, $0x0000011e + LEAL -30(R11), SI + CMPL R11, $0x0000011e JB repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm - CMPL R13, $0x0001001e + CMPL R11, $0x0001001e JB repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm MOVB $0xfc, (CX) - MOVL R8, 1(CX) + MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_three_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm: MOVB $0xf4, (CX) - MOVW R8, 1(CX) + MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm: MOVB $0xec, (CX) - MOVB R8, 1(CX) + MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_one_emit_copy1_do_repeat_match_nolit_encodeBetterBlockAsm: - XORL R8, R8 - LEAL -4(R8)(R13*8), R8 - MOVB R8, (CX) + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) ADDQ $0x01, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm 
two_byte_match_nolit_encodeBetterBlockAsm: // emitCopy2 - LEAL -64(R9), R9 - LEAL -4(R13), R13 - MOVW R9, 1(CX) - CMPL R13, $0x3c + LEAL -64(DI), DI + LEAL -4(R11), R11 + MOVW DI, 1(CX) + CMPL R11, $0x3c JBE emit_copy2_0_match_nolit_encodeBetterBlockAsm_emit2 - LEAL -60(R13), R8 - CMPL R13, $0x0000013c + LEAL -60(R11), SI + CMPL R11, $0x0000013c JB emit_copy2_1_match_nolit_encodeBetterBlockAsm_emit2 - CMPL R13, $0x0001003c + CMPL R11, $0x0001003c JB emit_copy2_2_match_nolit_encodeBetterBlockAsm_emit2 MOVB $0xfe, (CX) - MOVL R8, 3(CX) + MOVL SI, 3(CX) ADDQ $0x06, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_copy2_2_match_nolit_encodeBetterBlockAsm_emit2: MOVB $0xfa, (CX) - MOVW R8, 3(CX) + MOVW SI, 3(CX) ADDQ $0x05, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_copy2_1_match_nolit_encodeBetterBlockAsm_emit2: MOVB $0xf6, (CX) - MOVB R8, 3(CX) + MOVB SI, 3(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_copy2_0_match_nolit_encodeBetterBlockAsm_emit2: - MOVL $0x00000002, R8 - LEAL (R8)(R13*4), R8 - MOVB R8, (CX) + MOVL $0x00000002, SI + LEAL (SI)(R11*4), SI + MOVB SI, (CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm // emitLiteralsDstP - MOVL 12(SP), R8 - CMPL R8, DI + MOVL 12(SP), SI + CMPL SI, BX JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (DX)(R8*1), R10 - SUBL R8, R9 + MOVL BX, DI + MOVL BX, 12(SP) + LEAQ (DX)(SI*1), R8 + SUBL SI, DI // emitLiteral - LEAL -1(R9), R8 - CMPL R8, $0x1d + LEAL -1(DI), SI + CMPL SI, $0x1d JB one_byte_match_emit_repeat_encodeBetterBlockAsm - SUBL $0x1d, R8 - CMPL R8, $0x00000100 + SUBL $0x1d, SI + CMPL SI, $0x00000100 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm - CMPL R8, $0x00010000 + CMPL SI, $0x00010000 JB three_bytes_match_emit_repeat_encodeBetterBlockAsm - MOVL R8, R11 - SHRL $0x10, R11 + MOVL SI, R9 + SHRL $0x10, R9 MOVB $0xf8, (CX) - MOVW R8, 1(CX) - MOVB R11, 3(CX) + MOVW SI, 1(CX) + MOVB R9, 3(CX) ADDQ $0x04, CX - ADDL $0x1d, R8 + ADDL $0x1d, SI JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm three_bytes_match_emit_repeat_encodeBetterBlockAsm: MOVB $0xf0, (CX) - MOVW R8, 1(CX) + MOVW SI, 1(CX) ADDQ $0x03, CX - ADDL $0x1d, R8 + ADDL $0x1d, SI JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm two_bytes_match_emit_repeat_encodeBetterBlockAsm: MOVB $0xe8, (CX) - MOVB R8, 1(CX) - ADDL $0x1d, R8 + MOVB SI, 1(CX) + ADDL $0x1d, SI ADDQ $0x02, CX - CMPL R8, $0x40 + CMPL SI, $0x40 JB memmove_midmatch_emit_repeat_encodeBetterBlockAsm JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm one_byte_match_emit_repeat_encodeBetterBlockAsm: - SHLB $0x03, R8 - MOVB R8, (CX) + SHLB $0x03, SI + MOVB SI, (CX) ADDQ $0x01, CX - LEAQ (CX)(R9*1), R8 + LEAQ (CX)(DI*1), SI // genMemMoveShort // margin: 16, min move: 1 - CMPQ R9, $0x10 + CMPQ DI, $0x10 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16 - CMPQ R9, $0x20 + CMPQ DI, $0x20 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16: - MOVOU (R10), X0 + MOVOU (R8), X0 MOVOU X0, (CX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 MOVOU X0, (CX) - MOVOU X1, 
-16(CX)(R9*1) + MOVOU X1, -16(CX)(DI*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm: - MOVQ R8, CX + MOVQ SI, CX JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm memmove_midmatch_emit_repeat_encodeBetterBlockAsm: - LEAQ (CX)(R9*1), R8 + LEAQ (CX)(DI*1), SI // genMemMoveShort // margin: 15, min move: 30 - CMPQ R9, $0x20 + CMPQ DI, $0x20 JBE emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64 emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) + MOVOU X1, -16(CX)(DI*1) JMP memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm emit_lit_memmove_mid_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (CX) MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) memmove_mid_end_copy_match_emit_repeat_encodeBetterBlockAsm: - MOVQ R8, CX + MOVQ SI, CX JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm memmove_long_match_emit_repeat_encodeBetterBlockAsm: - LEAQ (CX)(R9*1), R8 + LEAQ (CX)(DI*1), SI // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R12 - SHRQ $0x05, R12 - MOVQ CX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R12 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(CX)(R14*1), R15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R12 + SUBQ R9, R12 + DECQ R10 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R9 + LEAQ -32(CX)(R12*1), R13 + PCALIGN $0x10 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R12 + MOVOU (R9), X4 + MOVOU 16(R9), X5 + MOVOA X4, (R13) + MOVOA X5, 16(R13) + ADDQ $0x20, R13 + ADDQ $0x20, R9 + ADDQ $0x20, R12 + DECQ R10 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(CX)(R14*1) - MOVOA X5, -16(CX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 + MOVOU -32(R8)(R12*1), X4 + MOVOU -16(R8)(R12*1), X5 + MOVOA X4, -32(CX)(R12*1) + MOVOA X5, -16(CX)(R12*1) + ADDQ $0x20, R12 + CMPQ DI, R12 JAE 
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (CX) MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ R8, CX + MOVOU X2, -32(CX)(DI*1) + MOVOU X3, -16(CX)(DI*1) + MOVQ SI, CX emit_literal_done_match_emit_repeat_encodeBetterBlockAsm: - ADDL R13, AX - ADDL $0x04, R13 + ADDL R11, AX + ADDL $0x04, R11 MOVL AX, 12(SP) // emitRepeat - LEAL -1(R13), R8 - CMPL R13, $0x1d + LEAL -1(R11), SI + CMPL R11, $0x1d JBE repeat_one_match_nolit_repeat_encodeBetterBlockAsm - LEAL -30(R13), R8 - CMPL R13, $0x0000011e + LEAL -30(R11), SI + CMPL R11, $0x0000011e JB repeat_two_match_nolit_repeat_encodeBetterBlockAsm - CMPL R13, $0x0001001e + CMPL R11, $0x0001001e JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm MOVB $0xfc, (CX) - MOVL R8, 1(CX) + MOVL SI, 1(CX) ADDQ $0x04, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_three_match_nolit_repeat_encodeBetterBlockAsm: MOVB $0xf4, (CX) - MOVW R8, 1(CX) + MOVW SI, 1(CX) ADDQ $0x03, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_match_nolit_repeat_encodeBetterBlockAsm: MOVB $0xec, (CX) - MOVB R8, 1(CX) + MOVB SI, 1(CX) ADDQ $0x02, CX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_one_match_nolit_repeat_encodeBetterBlockAsm: - XORL R8, R8 - LEAL -4(R8)(R13*8), R8 - MOVB R8, (CX) + XORL SI, SI + LEAL -4(SI)(R11*8), SI + MOVB SI, (CX) ADDQ $0x01, CX match_nolit_emitcopy_end_encodeBetterBlockAsm: @@ -9260,51 +14708,53 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm: RET match_nolit_dst_ok_encodeBetterBlockAsm: - MOVQ tmp+48(FP), R8 - MOVQ $0x00cf1bbcdcbfa563, R9 - MOVQ $0x9e3779b1, R10 - LEAQ 1(DI), DI - LEAQ -2(AX), R11 - MOVQ (DX)(DI*1), R12 - MOVQ 1(DX)(DI*1), R13 - MOVQ (DX)(R11*1), R14 - MOVQ 1(DX)(R11*1), R15 + MOVQ tmp+48(FP), SI + MOVQ $0x00cf1bbcdcbfa563, DI + MOVQ $0x9e3779b1, R8 + LEAQ 1(BX), BX + LEAQ -2(AX), R9 + MOVQ (DX)(BX*1), R10 + MOVQ 1(DX)(BX*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 + SHLQ $0x08, R10 + IMULQ DI, R10 + SHRQ $0x2f, R10 + SHLQ $0x20, R11 + IMULQ R8, R11 + SHRQ $0x32, R11 SHLQ $0x08, R12 - IMULQ R9, R12 + IMULQ DI, R12 SHRQ $0x2f, R12 - IMULQ R10, R13 + SHLQ $0x20, R13 + IMULQ R8, R13 SHRQ $0x32, R13 - SHLQ $0x08, R14 - IMULQ R9, R14 - SHRQ $0x2f, R14 - IMULQ R10, R15 - SHRQ $0x32, R15 - LEAQ 1(DI), R10 - LEAQ 1(R11), BP - MOVL DI, (R8)(R12*4) - MOVL R11, (R8)(R14*4) - LEAQ 1(R11)(DI*1), R12 - SHRQ $0x01, R12 - ADDQ $0x01, DI - SUBQ $0x01, R11 - MOVL R10, 524288(R8)(R13*4) - MOVL BP, 524288(R8)(R15*4) + LEAQ 1(BX), R8 + LEAQ 1(R9), R14 + MOVL BX, (SI)(R10*4) + MOVL R9, (SI)(R12*4) + LEAQ 1(R9)(BX*1), R10 + SHRQ $0x01, R10 + ADDQ $0x01, BX + SUBQ $0x01, R9 + MOVL R8, 524288(SI)(R11*4) + MOVL R14, 524288(SI)(R13*4) index_loop_encodeBetterBlockAsm: - CMPQ R12, R11 + CMPQ R10, R9 JAE search_loop_encodeBetterBlockAsm - MOVQ (DX)(DI*1), R10 - MOVQ (DX)(R12*1), R13 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x2f, R10 - SHLQ $0x08, R13 - IMULQ R9, R13 - SHRQ $0x2f, R13 - MOVL DI, (R8)(R10*4) - MOVL R11, (R8)(R13*4) - ADDQ $0x02, DI - ADDQ $0x02, R12 + MOVQ (DX)(BX*1), R8 + MOVQ (DX)(R10*1), R11 + SHLQ $0x08, R8 + IMULQ DI, R8 + SHRQ $0x2f, R8 + SHLQ $0x08, R11 + IMULQ DI, R11 + SHRQ $0x2f, R11 + MOVL BX, (SI)(R8*4) + MOVL R9, (SI)(R11*4) + ADDQ $0x02, BX + ADDQ $0x02, R10 JMP index_loop_encodeBetterBlockAsm emit_remainder_encodeBetterBlockAsm: @@ -9402,6 +14852,7 @@ emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through8: MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP 
memmove_end_copy_emit_remainder_encodeBetterBlockAsm + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16: MOVQ (AX), SI @@ -9467,20 +14918,21 @@ memmove_long_emit_remainder_encodeBetterBlockAsm: MOVL SI, BX // genMemMoveLong - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + PCALIGN $0x10 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back: MOVOU (SI), X4 @@ -9522,29 +14974,30 @@ TEXT ·encodeBetterBlockAsm2MB(SB), $24-64 PXOR X0, X0 zero_loop_encodeBetterBlockAsm2MB: - MOVOU X0, (AX) - MOVOU X0, 16(AX) - MOVOU X0, 32(AX) - MOVOU X0, 48(AX) - MOVOU X0, 64(AX) - MOVOU X0, 80(AX) - MOVOU X0, 96(AX) - MOVOU X0, 112(AX) - ADDQ $0x80, AX - DECQ DX - JNZ zero_loop_encodeBetterBlockAsm2MB - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), AX - LEAQ -17(AX), DX - LEAQ -17(AX), BX - MOVL BX, 8(SP) - SHRQ $0x05, AX - SUBL AX, DX - LEAQ (CX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, AX - MOVL AX, 16(SP) - MOVQ src_base+24(FP), DX + MOVOU X0, (AX) + MOVOU X0, 16(AX) + MOVOU X0, 32(AX) + MOVOU X0, 48(AX) + MOVOU X0, 64(AX) + MOVOU X0, 80(AX) + MOVOU X0, 96(AX) + MOVOU X0, 112(AX) + ADDQ $0x80, AX + DECQ DX + JNZ zero_loop_encodeBetterBlockAsm2MB + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), AX + LEAQ -17(AX), DX + LEAQ -17(AX), BX + MOVL BX, 8(SP) + SHRQ $0x05, AX + SUBL AX, DX + LEAQ (CX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, AX + MOVL AX, 16(SP) + MOVQ src_base+24(FP), DX + PCALIGN $0x10 search_loop_encodeBetterBlockAsm2MB: MOVQ tmp+48(FP), BX @@ -9571,6 +15024,7 @@ check_maxskip_cont_encodeBetterBlockAsm2MB: SHLQ $0x08, R10 IMULQ R9, R10 SHRQ $0x2f, R10 + SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x32, R11 MOVL (BX)(R10*4), SI @@ -9672,6 +15126,7 @@ one_byte_repeat_emit_encodeBetterBlockAsm2MB: CMPQ DI, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_repeat_emit_encodeBetterBlockAsm2MB_memmove_move_8through16: MOVOU (R8), X0 @@ -9733,20 +15188,21 @@ memmove_long_repeat_emit_encodeBetterBlockAsm2MB: LEAQ (CX)(DI*1), SI // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x05, R10 - MOVQ CX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32 - LEAQ -32(R8)(R11*1), R9 - LEAQ -32(CX)(R11*1), R12 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32 + LEAQ -32(R8)(R11*1), R9 + LEAQ -32(CX)(R11*1), R12 + PCALIGN $0x10 
emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm2MBlarge_big_loop_back: MOVOU (R9), X4 @@ -9785,6 +15241,7 @@ emit_literal_done_repeat_emit_encodeBetterBlockAsm2MB: // matchLen XORL R10, R10 JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm2MB + PCALIGN $0x10 matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm2MB: MOVQ (R8)(R10*1), R9 @@ -9800,6 +15257,7 @@ matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm2MB: CMPL DI, $0x10 JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm2MB JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm2MB + PCALIGN $0x10 matchlen_bsf_16repeat_extend_encodeBetterBlockAsm2MB: TZCNTQ R11, R11 @@ -9816,11 +15274,12 @@ matchlen_match8_repeat_extend_encodeBetterBlockAsm2MB: LEAL -8(DI), DI LEAL 8(R10), R10 JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm2MB + PCALIGN $0x10 matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm2MB: TZCNTQ R9, R9 SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 + ADDL R9, R10 JMP repeat_extend_forward_end_encodeBetterBlockAsm2MB matchlen_match4_repeat_extend_encodeBetterBlockAsm2MB: @@ -9848,6 +15307,8 @@ matchlen_match1_repeat_extend_encodeBetterBlockAsm2MB: CMPB (SI)(R10*1), R9 JNE repeat_extend_forward_end_encodeBetterBlockAsm2MB LEAL 1(R10), R10 + JMP repeat_extend_forward_end_encodeBetterBlockAsm2MB + PCALIGN $0x10 repeat_extend_forward_end_encodeBetterBlockAsm2MB: ADDL R10, AX @@ -9890,6 +15351,7 @@ repeat_one_match_repeat_encodeBetterBlockAsm2MB: repeat_end_emit_encodeBetterBlockAsm2MB: MOVL AX, 12(SP) JMP search_loop_encodeBetterBlockAsm2MB + PCALIGN $0x10 no_repeat_found_encodeBetterBlockAsm2MB: CMPL R10, DI @@ -9900,18 +15362,19 @@ no_repeat_found_encodeBetterBlockAsm2MB: JMP search_loop_encodeBetterBlockAsm2MB candidateS_match_encodeBetterBlockAsm2MB: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x2f, R10 - MOVL (BX)(R10*4), SI - INCL AX - MOVL AX, (BX)(R10*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm2MB - DECL AX - MOVL R8, SI + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x2f, R10 + MOVL (BX)(R10*4), SI + INCL AX + MOVL AX, (BX)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm2MB + DECL AX + MOVL R8, SI + PCALIGN $0x10 candidate_match_encodeBetterBlockAsm2MB: MOVL 12(SP), BX @@ -9951,6 +15414,7 @@ match_dst_size_check_encodeBetterBlockAsm2MB: // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm2MB + PCALIGN $0x10 matchlen_loopback_16_match_nolit_encodeBetterBlockAsm2MB: MOVQ (R8)(R11*1), R10 @@ -9966,6 +15430,7 @@ matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm2MB: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm2MB JMP matchlen_match8_match_nolit_encodeBetterBlockAsm2MB + PCALIGN $0x10 matchlen_bsf_16match_nolit_encodeBetterBlockAsm2MB: TZCNTQ R12, R12 @@ -9982,11 +15447,12 @@ matchlen_match8_match_nolit_encodeBetterBlockAsm2MB: LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm2MB + PCALIGN $0x10 matchlen_bsf_8_match_nolit_encodeBetterBlockAsm2MB: TZCNTQ R10, R10 SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 + ADDL R10, R11 JMP match_nolit_end_encodeBetterBlockAsm2MB matchlen_match4_match_nolit_encodeBetterBlockAsm2MB: @@ -10014,6 +15480,8 @@ matchlen_match1_match_nolit_encodeBetterBlockAsm2MB: CMPB (R9)(R11*1), R10 JNE match_nolit_end_encodeBetterBlockAsm2MB LEAL 1(R11), R11 + JMP match_nolit_end_encodeBetterBlockAsm2MB + PCALIGN $0x10 match_nolit_end_encodeBetterBlockAsm2MB: MOVL AX, DI @@ 
-10200,6 +15668,7 @@ one_byte_match_emit_encodeBetterBlockAsm2MB: CMPQ SI, $0x20 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm2MB_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm2MB_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBetterBlockAsm2MB_memmove_move_8through16: MOVOU (R8), X0 @@ -10261,20 +15730,21 @@ memmove_long_match_emit_encodeBetterBlockAsm2MB: LEAQ (CX)(SI*1), R9 // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVQ SI, R12 - SHRQ $0x05, R12 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32 - LEAQ -32(R8)(R13*1), R10 - LEAQ -32(CX)(R13*1), R14 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32 + LEAQ -32(R8)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + PCALIGN $0x10 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm2MBlarge_big_loop_back: MOVOU (R10), X4 @@ -10506,6 +15976,7 @@ one_byte_match_emit_repeat_encodeBetterBlockAsm2MB: CMPQ DI, $0x20 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_17through32 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_33through64 + PCALIGN $0x10 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm2MB_memmove_move_8through16: MOVOU (R8), X0 @@ -10567,20 +16038,21 @@ memmove_long_match_emit_repeat_encodeBetterBlockAsm2MB: LEAQ (CX)(DI*1), SI // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x05, R10 - MOVQ CX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R12 - SUBQ R9, R12 - DECQ R10 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32 - LEAQ -32(R8)(R12*1), R9 - LEAQ -32(CX)(R12*1), R13 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R12 + SUBQ R9, R12 + DECQ R10 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R9 + LEAQ -32(CX)(R12*1), R13 + PCALIGN $0x10 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm2MBlarge_big_loop_back: MOVOU (R9), X4 @@ -10665,11 +16137,13 @@ match_nolit_dst_ok_encodeBetterBlockAsm2MB: SHLQ $0x08, R10 IMULQ DI, R10 SHRQ $0x2f, R10 + SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x32, R11 SHLQ $0x08, R12 IMULQ DI, R12 SHRQ $0x2f, R12 + SHLQ $0x20, R13 IMULQ R8, R13 SHRQ $0x32, R13 LEAQ 1(BX), R8 @@ -10795,6 +16269,7 @@ emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_4through8: MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm2MB + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm2MB_memmove_move_8through16: MOVQ (AX), SI @@ -10860,20 +16335,21 @@ memmove_long_emit_remainder_encodeBetterBlockAsm2MB: MOVL SI, BX // genMemMoveLong - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA 
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm2MBlarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + PCALIGN $0x10 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm2MBlarge_big_loop_back: MOVOU (SI), X4 @@ -10915,29 +16391,30 @@ TEXT ·encodeBetterBlockAsm512K(SB), $24-64 PXOR X0, X0 zero_loop_encodeBetterBlockAsm512K: - MOVOU X0, (AX) - MOVOU X0, 16(AX) - MOVOU X0, 32(AX) - MOVOU X0, 48(AX) - MOVOU X0, 64(AX) - MOVOU X0, 80(AX) - MOVOU X0, 96(AX) - MOVOU X0, 112(AX) - ADDQ $0x80, AX - DECQ DX - JNZ zero_loop_encodeBetterBlockAsm512K - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), AX - LEAQ -11(AX), DX - LEAQ -8(AX), BX - MOVL BX, 8(SP) - SHRQ $0x05, AX - SUBL AX, DX - LEAQ (CX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, AX - MOVL AX, 16(SP) - MOVQ src_base+24(FP), DX + MOVOU X0, (AX) + MOVOU X0, 16(AX) + MOVOU X0, 32(AX) + MOVOU X0, 48(AX) + MOVOU X0, 64(AX) + MOVOU X0, 80(AX) + MOVOU X0, 96(AX) + MOVOU X0, 112(AX) + ADDQ $0x80, AX + DECQ DX + JNZ zero_loop_encodeBetterBlockAsm512K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), AX + LEAQ -11(AX), DX + LEAQ -8(AX), BX + MOVL BX, 8(SP) + SHRQ $0x05, AX + SUBL AX, DX + LEAQ (CX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, AX + MOVL AX, 16(SP) + MOVQ src_base+24(FP), DX + PCALIGN $0x10 search_loop_encodeBetterBlockAsm512K: MOVQ tmp+48(FP), BX @@ -10964,6 +16441,7 @@ check_maxskip_cont_encodeBetterBlockAsm512K: SHLQ $0x08, R10 IMULQ R9, R10 SHRQ $0x30, R10 + SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x33, R11 MOVL (BX)(R10*4), SI @@ -11072,6 +16550,7 @@ emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm512K + PCALIGN $0x10 emit_lit_memmove_repeat_emit_encodeBetterBlockAsm512K_memmove_move_8through16: MOVQ (R8), R9 @@ -11135,20 +16614,21 @@ memmove_long_repeat_emit_encodeBetterBlockAsm512K: LEAQ (CX)(DI*1), SI // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x05, R10 - MOVQ CX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm512Klarge_forward_sse_loop_32 - LEAQ -32(R8)(R11*1), R9 - LEAQ -32(CX)(R11*1), R12 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm512Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R11*1), R9 + LEAQ -32(CX)(R11*1), R12 + PCALIGN $0x10 emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm512Klarge_big_loop_back: MOVOU (R9), X4 @@ -11187,6 +16667,7 @@ emit_literal_done_repeat_emit_encodeBetterBlockAsm512K: // matchLen XORL R10, R10 JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm512K + PCALIGN $0x10 matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm512K: MOVQ (R8)(R10*1), R9 @@ -11202,6 +16683,7 @@ matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm512K: CMPL DI, $0x10 JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm512K JMP 
matchlen_match8_repeat_extend_encodeBetterBlockAsm512K + PCALIGN $0x10 matchlen_bsf_16repeat_extend_encodeBetterBlockAsm512K: TZCNTQ R11, R11 @@ -11218,11 +16700,12 @@ matchlen_match8_repeat_extend_encodeBetterBlockAsm512K: LEAL -8(DI), DI LEAL 8(R10), R10 JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm512K + PCALIGN $0x10 matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm512K: TZCNTQ R9, R9 SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 + ADDL R9, R10 JMP repeat_extend_forward_end_encodeBetterBlockAsm512K matchlen_match4_repeat_extend_encodeBetterBlockAsm512K: @@ -11250,6 +16733,8 @@ matchlen_match1_repeat_extend_encodeBetterBlockAsm512K: CMPB (SI)(R10*1), R9 JNE repeat_extend_forward_end_encodeBetterBlockAsm512K LEAL 1(R10), R10 + JMP repeat_extend_forward_end_encodeBetterBlockAsm512K + PCALIGN $0x10 repeat_extend_forward_end_encodeBetterBlockAsm512K: ADDL R10, AX @@ -11292,6 +16777,7 @@ repeat_one_match_repeat_encodeBetterBlockAsm512K: repeat_end_emit_encodeBetterBlockAsm512K: MOVL AX, 12(SP) JMP search_loop_encodeBetterBlockAsm512K + PCALIGN $0x10 no_repeat_found_encodeBetterBlockAsm512K: CMPL R10, DI @@ -11302,18 +16788,19 @@ no_repeat_found_encodeBetterBlockAsm512K: JMP search_loop_encodeBetterBlockAsm512K candidateS_match_encodeBetterBlockAsm512K: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x30, R10 - MOVL (BX)(R10*4), SI - INCL AX - MOVL AX, (BX)(R10*4) - CMPL (DX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm512K - DECL AX - MOVL R8, SI + SHRQ $0x08, DI + MOVQ DI, R10 + SHLQ $0x08, R10 + IMULQ R9, R10 + SHRQ $0x30, R10 + MOVL (BX)(R10*4), SI + INCL AX + MOVL AX, (BX)(R10*4) + CMPL (DX)(SI*1), DI + JEQ candidate_match_encodeBetterBlockAsm512K + DECL AX + MOVL R8, SI + PCALIGN $0x10 candidate_match_encodeBetterBlockAsm512K: MOVL 12(SP), BX @@ -11353,6 +16840,7 @@ match_dst_size_check_encodeBetterBlockAsm512K: // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm512K + PCALIGN $0x10 matchlen_loopback_16_match_nolit_encodeBetterBlockAsm512K: MOVQ (R8)(R11*1), R10 @@ -11368,6 +16856,7 @@ matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm512K: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm512K JMP matchlen_match8_match_nolit_encodeBetterBlockAsm512K + PCALIGN $0x10 matchlen_bsf_16match_nolit_encodeBetterBlockAsm512K: TZCNTQ R12, R12 @@ -11384,11 +16873,12 @@ matchlen_match8_match_nolit_encodeBetterBlockAsm512K: LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm512K + PCALIGN $0x10 matchlen_bsf_8_match_nolit_encodeBetterBlockAsm512K: TZCNTQ R10, R10 SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 + ADDL R10, R11 JMP match_nolit_end_encodeBetterBlockAsm512K matchlen_match4_match_nolit_encodeBetterBlockAsm512K: @@ -11416,6 +16906,8 @@ matchlen_match1_match_nolit_encodeBetterBlockAsm512K: CMPB (R9)(R11*1), R10 JNE match_nolit_end_encodeBetterBlockAsm512K LEAL 1(R11), R11 + JMP match_nolit_end_encodeBetterBlockAsm512K + PCALIGN $0x10 match_nolit_end_encodeBetterBlockAsm512K: MOVL AX, DI @@ -11609,6 +17101,7 @@ emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_8: MOVQ (R8), R10 MOVQ R10, (CX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm512K + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBetterBlockAsm512K_memmove_move_8through16: MOVQ (R8), R10 @@ -11672,20 +17165,21 @@ memmove_long_match_emit_encodeBetterBlockAsm512K: LEAQ (CX)(SI*1), R9 // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), 
X3 - MOVQ SI, R12 - SHRQ $0x05, R12 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm512Klarge_forward_sse_loop_32 - LEAQ -32(R8)(R13*1), R10 - LEAQ -32(CX)(R13*1), R14 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm512Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + PCALIGN $0x10 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm512Klarge_big_loop_back: MOVOU (R10), X4 @@ -11924,6 +17418,7 @@ emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm512K + PCALIGN $0x10 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm512K_memmove_move_8through16: MOVQ (R8), R9 @@ -11987,20 +17482,21 @@ memmove_long_match_emit_repeat_encodeBetterBlockAsm512K: LEAQ (CX)(DI*1), SI // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x05, R10 - MOVQ CX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R12 - SUBQ R9, R12 - DECQ R10 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm512Klarge_forward_sse_loop_32 - LEAQ -32(R8)(R12*1), R9 - LEAQ -32(CX)(R12*1), R13 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R12 + SUBQ R9, R12 + DECQ R10 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm512Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R9 + LEAQ -32(CX)(R12*1), R13 + PCALIGN $0x10 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm512Klarge_big_loop_back: MOVOU (R9), X4 @@ -12085,11 +17581,13 @@ match_nolit_dst_ok_encodeBetterBlockAsm512K: SHLQ $0x08, R10 IMULQ DI, R10 SHRQ $0x30, R10 + SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x33, R11 SHLQ $0x08, R12 IMULQ DI, R12 SHRQ $0x30, R12 + SHLQ $0x20, R13 IMULQ R8, R13 SHRQ $0x33, R13 LEAQ 1(BX), R8 @@ -12215,6 +17713,7 @@ emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_4through8: MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm512K + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm512K_memmove_move_8through16: MOVQ (AX), SI @@ -12280,20 +17779,21 @@ memmove_long_emit_remainder_encodeBetterBlockAsm512K: MOVL SI, BX // genMemMoveLong - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm512Klarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm512Klarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + PCALIGN $0x10 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm512Klarge_big_loop_back: MOVOU (SI), X4 @@ -12335,29 +17835,30 @@ TEXT ·encodeBetterBlockAsm64K(SB), 
$24-64 PXOR X0, X0 zero_loop_encodeBetterBlockAsm64K: - MOVOU X0, (AX) - MOVOU X0, 16(AX) - MOVOU X0, 32(AX) - MOVOU X0, 48(AX) - MOVOU X0, 64(AX) - MOVOU X0, 80(AX) - MOVOU X0, 96(AX) - MOVOU X0, 112(AX) - ADDQ $0x80, AX - DECQ DX - JNZ zero_loop_encodeBetterBlockAsm64K - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), AX - LEAQ -11(AX), DX - LEAQ -8(AX), BX - MOVL BX, 8(SP) - SHRQ $0x05, AX - SUBL AX, DX - LEAQ (CX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, AX - MOVL AX, 16(SP) - MOVQ src_base+24(FP), DX + MOVOU X0, (AX) + MOVOU X0, 16(AX) + MOVOU X0, 32(AX) + MOVOU X0, 48(AX) + MOVOU X0, 64(AX) + MOVOU X0, 80(AX) + MOVOU X0, 96(AX) + MOVOU X0, 112(AX) + ADDQ $0x80, AX + DECQ DX + JNZ zero_loop_encodeBetterBlockAsm64K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), AX + LEAQ -11(AX), DX + LEAQ -8(AX), BX + MOVL BX, 8(SP) + SHRQ $0x05, AX + SUBL AX, DX + LEAQ (CX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, AX + MOVL AX, 16(SP) + MOVQ src_base+24(FP), DX + PCALIGN $0x10 search_loop_encodeBetterBlockAsm64K: MOVQ tmp+48(FP), BX @@ -12376,6 +17877,7 @@ search_loop_encodeBetterBlockAsm64K: SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x31, R10 + SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x34, R11 MOVWLZX (BX)(R10*2), SI @@ -12483,6 +17985,7 @@ emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm64K + PCALIGN $0x10 emit_lit_memmove_repeat_emit_encodeBetterBlockAsm64K_memmove_move_8through16: MOVQ (R8), R9 @@ -12546,20 +18049,21 @@ memmove_long_repeat_emit_encodeBetterBlockAsm64K: LEAQ (CX)(DI*1), SI // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x05, R10 - MOVQ CX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm64Klarge_forward_sse_loop_32 - LEAQ -32(R8)(R11*1), R9 - LEAQ -32(CX)(R11*1), R12 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R11*1), R9 + LEAQ -32(CX)(R11*1), R12 + PCALIGN $0x10 emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm64Klarge_big_loop_back: MOVOU (R9), X4 @@ -12598,6 +18102,7 @@ emit_literal_done_repeat_emit_encodeBetterBlockAsm64K: // matchLen XORL R10, R10 JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm64K + PCALIGN $0x10 matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm64K: MOVQ (R8)(R10*1), R9 @@ -12613,6 +18118,7 @@ matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm64K: CMPL DI, $0x10 JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm64K JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm64K + PCALIGN $0x10 matchlen_bsf_16repeat_extend_encodeBetterBlockAsm64K: TZCNTQ R11, R11 @@ -12629,11 +18135,12 @@ matchlen_match8_repeat_extend_encodeBetterBlockAsm64K: LEAL -8(DI), DI LEAL 8(R10), R10 JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm64K + PCALIGN $0x10 matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm64K: TZCNTQ R9, R9 SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 + ADDL R9, R10 JMP repeat_extend_forward_end_encodeBetterBlockAsm64K matchlen_match4_repeat_extend_encodeBetterBlockAsm64K: @@ -12661,6 +18168,8 @@ matchlen_match1_repeat_extend_encodeBetterBlockAsm64K: CMPB (SI)(R10*1), R9 JNE 
repeat_extend_forward_end_encodeBetterBlockAsm64K LEAL 1(R10), R10 + JMP repeat_extend_forward_end_encodeBetterBlockAsm64K + PCALIGN $0x10 repeat_extend_forward_end_encodeBetterBlockAsm64K: ADDL R10, AX @@ -12703,6 +18212,7 @@ repeat_one_match_repeat_encodeBetterBlockAsm64K: repeat_end_emit_encodeBetterBlockAsm64K: MOVL AX, 12(SP) JMP search_loop_encodeBetterBlockAsm64K + PCALIGN $0x10 no_repeat_found_encodeBetterBlockAsm64K: CMPL R10, DI @@ -12725,6 +18235,7 @@ candidateS_match_encodeBetterBlockAsm64K: JEQ candidate_match_encodeBetterBlockAsm64K DECL AX MOVL R8, SI + PCALIGN $0x10 candidate_match_encodeBetterBlockAsm64K: MOVL 12(SP), BX @@ -12764,6 +18275,7 @@ match_dst_size_check_encodeBetterBlockAsm64K: // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm64K + PCALIGN $0x10 matchlen_loopback_16_match_nolit_encodeBetterBlockAsm64K: MOVQ (R8)(R11*1), R10 @@ -12779,6 +18291,7 @@ matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm64K: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm64K JMP matchlen_match8_match_nolit_encodeBetterBlockAsm64K + PCALIGN $0x10 matchlen_bsf_16match_nolit_encodeBetterBlockAsm64K: TZCNTQ R12, R12 @@ -12795,11 +18308,12 @@ matchlen_match8_match_nolit_encodeBetterBlockAsm64K: LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm64K + PCALIGN $0x10 matchlen_bsf_8_match_nolit_encodeBetterBlockAsm64K: TZCNTQ R10, R10 SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 + ADDL R10, R11 JMP match_nolit_end_encodeBetterBlockAsm64K matchlen_match4_match_nolit_encodeBetterBlockAsm64K: @@ -12827,6 +18341,8 @@ matchlen_match1_match_nolit_encodeBetterBlockAsm64K: CMPB (R9)(R11*1), R10 JNE match_nolit_end_encodeBetterBlockAsm64K LEAL 1(R11), R11 + JMP match_nolit_end_encodeBetterBlockAsm64K + PCALIGN $0x10 match_nolit_end_encodeBetterBlockAsm64K: MOVL AX, DI @@ -12956,6 +18472,7 @@ emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_8: MOVQ (R8), R10 MOVQ R10, (CX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm64K + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBetterBlockAsm64K_memmove_move_8through16: MOVQ (R8), R10 @@ -13019,20 +18536,21 @@ memmove_long_match_emit_encodeBetterBlockAsm64K: LEAQ (CX)(SI*1), R9 // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVQ SI, R12 - SHRQ $0x05, R12 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm64Klarge_forward_sse_loop_32 - LEAQ -32(R8)(R13*1), R10 - LEAQ -32(CX)(R13*1), R14 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + PCALIGN $0x10 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm64Klarge_big_loop_back: MOVOU (R10), X4 @@ -13227,6 +18745,7 @@ emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm64K + PCALIGN $0x10 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm64K_memmove_move_8through16: MOVQ (R8), R9 @@ -13290,20 +18809,21 @@ memmove_long_match_emit_repeat_encodeBetterBlockAsm64K: LEAQ (CX)(DI*1), SI // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 
- MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x05, R10 - MOVQ CX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R12 - SUBQ R9, R12 - DECQ R10 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm64Klarge_forward_sse_loop_32 - LEAQ -32(R8)(R12*1), R9 - LEAQ -32(CX)(R12*1), R13 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R12 + SUBQ R9, R12 + DECQ R10 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R9 + LEAQ -32(CX)(R12*1), R13 + PCALIGN $0x10 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm64Klarge_big_loop_back: MOVOU (R9), X4 @@ -13388,11 +18908,13 @@ match_nolit_dst_ok_encodeBetterBlockAsm64K: SHLQ $0x10, R10 IMULQ DI, R10 SHRQ $0x31, R10 + SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x34, R11 SHLQ $0x10, R12 IMULQ DI, R12 SHRQ $0x31, R12 + SHLQ $0x20, R13 IMULQ R8, R13 SHRQ $0x34, R13 LEAQ 1(BX), R8 @@ -13517,6 +19039,7 @@ emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_4through8: MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm64K + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm64K_memmove_move_8through16: MOVQ (AX), SI @@ -13582,20 +19105,21 @@ memmove_long_emit_remainder_encodeBetterBlockAsm64K: MOVL SI, BX // genMemMoveLong - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm64Klarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm64Klarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + PCALIGN $0x10 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm64Klarge_big_loop_back: MOVOU (SI), X4 @@ -13637,29 +19161,30 @@ TEXT ·encodeBetterBlockAsm16K(SB), $24-64 PXOR X0, X0 zero_loop_encodeBetterBlockAsm16K: - MOVOU X0, (AX) - MOVOU X0, 16(AX) - MOVOU X0, 32(AX) - MOVOU X0, 48(AX) - MOVOU X0, 64(AX) - MOVOU X0, 80(AX) - MOVOU X0, 96(AX) - MOVOU X0, 112(AX) - ADDQ $0x80, AX - DECQ DX - JNZ zero_loop_encodeBetterBlockAsm16K - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), AX - LEAQ -11(AX), DX - LEAQ -8(AX), BX - MOVL BX, 8(SP) - SHRQ $0x05, AX - SUBL AX, DX - LEAQ (CX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, AX - MOVL AX, 16(SP) - MOVQ src_base+24(FP), DX + MOVOU X0, (AX) + MOVOU X0, 16(AX) + MOVOU X0, 32(AX) + MOVOU X0, 48(AX) + MOVOU X0, 64(AX) + MOVOU X0, 80(AX) + MOVOU X0, 96(AX) + MOVOU X0, 112(AX) + ADDQ $0x80, AX + DECQ DX + JNZ zero_loop_encodeBetterBlockAsm16K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), AX + LEAQ -11(AX), DX + LEAQ -8(AX), BX + MOVL BX, 8(SP) + SHRQ $0x05, AX + SUBL AX, DX + LEAQ (CX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, AX + MOVL AX, 16(SP) + MOVQ src_base+24(FP), DX + PCALIGN $0x10 search_loop_encodeBetterBlockAsm16K: MOVQ tmp+48(FP), BX @@ -13678,6 +19203,7 @@ search_loop_encodeBetterBlockAsm16K: SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x32, R10 + SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x35, R11 
MOVWLZX (BX)(R10*2), SI @@ -13777,6 +19303,7 @@ emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm16K + PCALIGN $0x10 emit_lit_memmove_repeat_emit_encodeBetterBlockAsm16K_memmove_move_8through16: MOVQ (R8), R9 @@ -13840,20 +19367,21 @@ memmove_long_repeat_emit_encodeBetterBlockAsm16K: LEAQ (CX)(DI*1), SI // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x05, R10 - MOVQ CX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm16Klarge_forward_sse_loop_32 - LEAQ -32(R8)(R11*1), R9 - LEAQ -32(CX)(R11*1), R12 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm16Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R11*1), R9 + LEAQ -32(CX)(R11*1), R12 + PCALIGN $0x10 emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm16Klarge_big_loop_back: MOVOU (R9), X4 @@ -13892,6 +19420,7 @@ emit_literal_done_repeat_emit_encodeBetterBlockAsm16K: // matchLen XORL R10, R10 JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm16K + PCALIGN $0x10 matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm16K: MOVQ (R8)(R10*1), R9 @@ -13907,6 +19436,7 @@ matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm16K: CMPL DI, $0x10 JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm16K JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm16K + PCALIGN $0x10 matchlen_bsf_16repeat_extend_encodeBetterBlockAsm16K: TZCNTQ R11, R11 @@ -13923,11 +19453,12 @@ matchlen_match8_repeat_extend_encodeBetterBlockAsm16K: LEAL -8(DI), DI LEAL 8(R10), R10 JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm16K + PCALIGN $0x10 matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm16K: TZCNTQ R9, R9 SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 + ADDL R9, R10 JMP repeat_extend_forward_end_encodeBetterBlockAsm16K matchlen_match4_repeat_extend_encodeBetterBlockAsm16K: @@ -13955,6 +19486,8 @@ matchlen_match1_repeat_extend_encodeBetterBlockAsm16K: CMPB (SI)(R10*1), R9 JNE repeat_extend_forward_end_encodeBetterBlockAsm16K LEAL 1(R10), R10 + JMP repeat_extend_forward_end_encodeBetterBlockAsm16K + PCALIGN $0x10 repeat_extend_forward_end_encodeBetterBlockAsm16K: ADDL R10, AX @@ -13997,6 +19530,7 @@ repeat_one_match_repeat_encodeBetterBlockAsm16K: repeat_end_emit_encodeBetterBlockAsm16K: MOVL AX, 12(SP) JMP search_loop_encodeBetterBlockAsm16K + PCALIGN $0x10 no_repeat_found_encodeBetterBlockAsm16K: CMPL R10, DI @@ -14019,6 +19553,7 @@ candidateS_match_encodeBetterBlockAsm16K: JEQ candidate_match_encodeBetterBlockAsm16K DECL AX MOVL R8, SI + PCALIGN $0x10 candidate_match_encodeBetterBlockAsm16K: MOVL 12(SP), BX @@ -14058,6 +19593,7 @@ match_dst_size_check_encodeBetterBlockAsm16K: // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm16K + PCALIGN $0x10 matchlen_loopback_16_match_nolit_encodeBetterBlockAsm16K: MOVQ (R8)(R11*1), R10 @@ -14073,6 +19609,7 @@ matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm16K: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm16K JMP matchlen_match8_match_nolit_encodeBetterBlockAsm16K + PCALIGN $0x10 matchlen_bsf_16match_nolit_encodeBetterBlockAsm16K: TZCNTQ R12, R12 @@ -14089,11 +19626,12 
@@ matchlen_match8_match_nolit_encodeBetterBlockAsm16K: LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm16K + PCALIGN $0x10 matchlen_bsf_8_match_nolit_encodeBetterBlockAsm16K: TZCNTQ R10, R10 SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 + ADDL R10, R11 JMP match_nolit_end_encodeBetterBlockAsm16K matchlen_match4_match_nolit_encodeBetterBlockAsm16K: @@ -14121,6 +19659,8 @@ matchlen_match1_match_nolit_encodeBetterBlockAsm16K: CMPB (R9)(R11*1), R10 JNE match_nolit_end_encodeBetterBlockAsm16K LEAL 1(R11), R11 + JMP match_nolit_end_encodeBetterBlockAsm16K + PCALIGN $0x10 match_nolit_end_encodeBetterBlockAsm16K: MOVL AX, DI @@ -14242,6 +19782,7 @@ emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_8: MOVQ (R8), R10 MOVQ R10, (CX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm16K + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBetterBlockAsm16K_memmove_move_8through16: MOVQ (R8), R10 @@ -14305,20 +19846,21 @@ memmove_long_match_emit_encodeBetterBlockAsm16K: LEAQ (CX)(SI*1), R9 // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVQ SI, R12 - SHRQ $0x05, R12 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm16Klarge_forward_sse_loop_32 - LEAQ -32(R8)(R13*1), R10 - LEAQ -32(CX)(R13*1), R14 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm16Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + PCALIGN $0x10 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm16Klarge_big_loop_back: MOVOU (R10), X4 @@ -14505,6 +20047,7 @@ emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm16K + PCALIGN $0x10 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm16K_memmove_move_8through16: MOVQ (R8), R9 @@ -14568,20 +20111,21 @@ memmove_long_match_emit_repeat_encodeBetterBlockAsm16K: LEAQ (CX)(DI*1), SI // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x05, R10 - MOVQ CX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R12 - SUBQ R9, R12 - DECQ R10 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm16Klarge_forward_sse_loop_32 - LEAQ -32(R8)(R12*1), R9 - LEAQ -32(CX)(R12*1), R13 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R12 + SUBQ R9, R12 + DECQ R10 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm16Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R9 + LEAQ -32(CX)(R12*1), R13 + PCALIGN $0x10 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm16Klarge_big_loop_back: MOVOU (R9), X4 @@ -14666,11 +20210,13 @@ match_nolit_dst_ok_encodeBetterBlockAsm16K: SHLQ $0x10, R10 IMULQ DI, R10 SHRQ $0x32, R10 + SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x35, R11 SHLQ $0x10, R12 IMULQ DI, R12 SHRQ $0x32, R12 + SHLQ $0x20, R13 IMULQ R8, R13 SHRQ $0x35, R13 LEAQ 1(BX), R8 @@ -14787,6 +20333,7 @@ emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_4through8: MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP 
memmove_end_copy_emit_remainder_encodeBetterBlockAsm16K + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm16K_memmove_move_8through16: MOVQ (AX), SI @@ -14852,20 +20399,21 @@ memmove_long_emit_remainder_encodeBetterBlockAsm16K: MOVL SI, BX // genMemMoveLong - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm16Klarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm16Klarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + PCALIGN $0x10 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm16Klarge_big_loop_back: MOVOU (SI), X4 @@ -14907,29 +20455,30 @@ TEXT ·encodeBetterBlockAsm4K(SB), $24-64 PXOR X0, X0 zero_loop_encodeBetterBlockAsm4K: - MOVOU X0, (AX) - MOVOU X0, 16(AX) - MOVOU X0, 32(AX) - MOVOU X0, 48(AX) - MOVOU X0, 64(AX) - MOVOU X0, 80(AX) - MOVOU X0, 96(AX) - MOVOU X0, 112(AX) - ADDQ $0x80, AX - DECQ DX - JNZ zero_loop_encodeBetterBlockAsm4K - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), AX - LEAQ -11(AX), DX - LEAQ -8(AX), BX - MOVL BX, 8(SP) - SHRQ $0x05, AX - SUBL AX, DX - LEAQ (CX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, AX - MOVL AX, 16(SP) - MOVQ src_base+24(FP), DX + MOVOU X0, (AX) + MOVOU X0, 16(AX) + MOVOU X0, 32(AX) + MOVOU X0, 48(AX) + MOVOU X0, 64(AX) + MOVOU X0, 80(AX) + MOVOU X0, 96(AX) + MOVOU X0, 112(AX) + ADDQ $0x80, AX + DECQ DX + JNZ zero_loop_encodeBetterBlockAsm4K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), AX + LEAQ -11(AX), DX + LEAQ -8(AX), BX + MOVL BX, 8(SP) + SHRQ $0x05, AX + SUBL AX, DX + LEAQ (CX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, AX + MOVL AX, 16(SP) + MOVQ src_base+24(FP), DX + PCALIGN $0x10 search_loop_encodeBetterBlockAsm4K: MOVQ tmp+48(FP), BX @@ -14948,6 +20497,7 @@ search_loop_encodeBetterBlockAsm4K: SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x34, R10 + SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x36, R11 MOVWLZX (BX)(R10*2), SI @@ -15047,6 +20597,7 @@ emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm4K + PCALIGN $0x10 emit_lit_memmove_repeat_emit_encodeBetterBlockAsm4K_memmove_move_8through16: MOVQ (R8), R9 @@ -15110,20 +20661,21 @@ memmove_long_repeat_emit_encodeBetterBlockAsm4K: LEAQ (CX)(DI*1), SI // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x05, R10 - MOVQ CX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4Klarge_forward_sse_loop_32 - LEAQ -32(R8)(R11*1), R9 - LEAQ -32(CX)(R11*1), R12 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R11*1), R9 + LEAQ -32(CX)(R11*1), R12 + PCALIGN $0x10 emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm4Klarge_big_loop_back: MOVOU 
(R9), X4 @@ -15162,6 +20714,7 @@ emit_literal_done_repeat_emit_encodeBetterBlockAsm4K: // matchLen XORL R10, R10 JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm4K + PCALIGN $0x10 matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm4K: MOVQ (R8)(R10*1), R9 @@ -15177,6 +20730,7 @@ matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm4K: CMPL DI, $0x10 JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm4K JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm4K + PCALIGN $0x10 matchlen_bsf_16repeat_extend_encodeBetterBlockAsm4K: TZCNTQ R11, R11 @@ -15193,11 +20747,12 @@ matchlen_match8_repeat_extend_encodeBetterBlockAsm4K: LEAL -8(DI), DI LEAL 8(R10), R10 JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm4K + PCALIGN $0x10 matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm4K: TZCNTQ R9, R9 SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 + ADDL R9, R10 JMP repeat_extend_forward_end_encodeBetterBlockAsm4K matchlen_match4_repeat_extend_encodeBetterBlockAsm4K: @@ -15225,6 +20780,8 @@ matchlen_match1_repeat_extend_encodeBetterBlockAsm4K: CMPB (SI)(R10*1), R9 JNE repeat_extend_forward_end_encodeBetterBlockAsm4K LEAL 1(R10), R10 + JMP repeat_extend_forward_end_encodeBetterBlockAsm4K + PCALIGN $0x10 repeat_extend_forward_end_encodeBetterBlockAsm4K: ADDL R10, AX @@ -15267,6 +20824,7 @@ repeat_one_match_repeat_encodeBetterBlockAsm4K: repeat_end_emit_encodeBetterBlockAsm4K: MOVL AX, 12(SP) JMP search_loop_encodeBetterBlockAsm4K + PCALIGN $0x10 no_repeat_found_encodeBetterBlockAsm4K: CMPL R10, DI @@ -15289,6 +20847,7 @@ candidateS_match_encodeBetterBlockAsm4K: JEQ candidate_match_encodeBetterBlockAsm4K DECL AX MOVL R8, SI + PCALIGN $0x10 candidate_match_encodeBetterBlockAsm4K: MOVL 12(SP), BX @@ -15328,6 +20887,7 @@ match_dst_size_check_encodeBetterBlockAsm4K: // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm4K + PCALIGN $0x10 matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4K: MOVQ (R8)(R11*1), R10 @@ -15343,6 +20903,7 @@ matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm4K: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4K JMP matchlen_match8_match_nolit_encodeBetterBlockAsm4K + PCALIGN $0x10 matchlen_bsf_16match_nolit_encodeBetterBlockAsm4K: TZCNTQ R12, R12 @@ -15359,11 +20920,12 @@ matchlen_match8_match_nolit_encodeBetterBlockAsm4K: LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm4K + PCALIGN $0x10 matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4K: TZCNTQ R10, R10 SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 + ADDL R10, R11 JMP match_nolit_end_encodeBetterBlockAsm4K matchlen_match4_match_nolit_encodeBetterBlockAsm4K: @@ -15391,6 +20953,8 @@ matchlen_match1_match_nolit_encodeBetterBlockAsm4K: CMPB (R9)(R11*1), R10 JNE match_nolit_end_encodeBetterBlockAsm4K LEAL 1(R11), R11 + JMP match_nolit_end_encodeBetterBlockAsm4K + PCALIGN $0x10 match_nolit_end_encodeBetterBlockAsm4K: MOVL AX, DI @@ -15512,6 +21076,7 @@ emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_8: MOVQ (R8), R10 MOVQ R10, (CX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4K + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBetterBlockAsm4K_memmove_move_8through16: MOVQ (R8), R10 @@ -15575,20 +21140,21 @@ memmove_long_match_emit_encodeBetterBlockAsm4K: LEAQ (CX)(SI*1), R9 // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVQ SI, R12 - SHRQ $0x05, R12 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ 
R12 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4Klarge_forward_sse_loop_32 - LEAQ -32(R8)(R13*1), R10 - LEAQ -32(CX)(R13*1), R14 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + PCALIGN $0x10 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4Klarge_big_loop_back: MOVOU (R10), X4 @@ -15775,6 +21341,7 @@ emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4K + PCALIGN $0x10 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4K_memmove_move_8through16: MOVQ (R8), R9 @@ -15838,20 +21405,21 @@ memmove_long_match_emit_repeat_encodeBetterBlockAsm4K: LEAQ (CX)(DI*1), SI // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x05, R10 - MOVQ CX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R12 - SUBQ R9, R12 - DECQ R10 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4Klarge_forward_sse_loop_32 - LEAQ -32(R8)(R12*1), R9 - LEAQ -32(CX)(R12*1), R13 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R12 + SUBQ R9, R12 + DECQ R10 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R9 + LEAQ -32(CX)(R12*1), R13 + PCALIGN $0x10 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4Klarge_big_loop_back: MOVOU (R9), X4 @@ -15936,11 +21504,13 @@ match_nolit_dst_ok_encodeBetterBlockAsm4K: SHLQ $0x10, R10 IMULQ DI, R10 SHRQ $0x34, R10 + SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x36, R11 SHLQ $0x10, R12 IMULQ DI, R12 SHRQ $0x34, R12 + SHLQ $0x20, R13 IMULQ R8, R13 SHRQ $0x36, R13 LEAQ 1(BX), R8 @@ -16057,6 +21627,7 @@ emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_4through8: MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4K + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4K_memmove_move_8through16: MOVQ (AX), SI @@ -16122,20 +21693,21 @@ memmove_long_emit_remainder_encodeBetterBlockAsm4K: MOVL SI, BX // genMemMoveLong - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4Klarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4Klarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + PCALIGN $0x10 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4Klarge_big_loop_back: MOVOU (SI), X4 @@ -16177,29 +21749,30 @@ TEXT ·encodeBetterBlockAsm1K(SB), $24-64 PXOR X0, X0 zero_loop_encodeBetterBlockAsm1K: - MOVOU X0, (AX) - MOVOU X0, 16(AX) - MOVOU X0, 32(AX) - MOVOU X0, 48(AX) - MOVOU X0, 64(AX) - MOVOU X0, 80(AX) 
- MOVOU X0, 96(AX) - MOVOU X0, 112(AX) - ADDQ $0x80, AX - DECQ DX - JNZ zero_loop_encodeBetterBlockAsm1K - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), AX - LEAQ -11(AX), DX - LEAQ -8(AX), BX - MOVL BX, 8(SP) - SHRQ $0x05, AX - SUBL AX, DX - LEAQ (CX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, AX - MOVL AX, 16(SP) - MOVQ src_base+24(FP), DX + MOVOU X0, (AX) + MOVOU X0, 16(AX) + MOVOU X0, 32(AX) + MOVOU X0, 48(AX) + MOVOU X0, 64(AX) + MOVOU X0, 80(AX) + MOVOU X0, 96(AX) + MOVOU X0, 112(AX) + ADDQ $0x80, AX + DECQ DX + JNZ zero_loop_encodeBetterBlockAsm1K + MOVL $0x00000000, 12(SP) + MOVQ src_len+32(FP), AX + LEAQ -11(AX), DX + LEAQ -8(AX), BX + MOVL BX, 8(SP) + SHRQ $0x05, AX + SUBL AX, DX + LEAQ (CX)(DX*1), DX + MOVQ DX, (SP) + MOVL $0x00000001, AX + MOVL AX, 16(SP) + MOVQ src_base+24(FP), DX + PCALIGN $0x10 search_loop_encodeBetterBlockAsm1K: MOVQ tmp+48(FP), BX @@ -16218,6 +21791,7 @@ search_loop_encodeBetterBlockAsm1K: SHLQ $0x10, R10 IMULQ R9, R10 SHRQ $0x35, R10 + SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x38, R11 MOVWLZX (BX)(R10*2), SI @@ -16317,6 +21891,7 @@ emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_repeat_emit_encodeBetterBlockAsm1K + PCALIGN $0x10 emit_lit_memmove_repeat_emit_encodeBetterBlockAsm1K_memmove_move_8through16: MOVQ (R8), R9 @@ -16380,20 +21955,21 @@ memmove_long_repeat_emit_encodeBetterBlockAsm1K: LEAQ (CX)(DI*1), SI // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x05, R10 - MOVQ CX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm1Klarge_forward_sse_loop_32 - LEAQ -32(R8)(R11*1), R9 - LEAQ -32(CX)(R11*1), R12 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R11 + SUBQ R9, R11 + DECQ R10 + JA emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm1Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R11*1), R9 + LEAQ -32(CX)(R11*1), R12 + PCALIGN $0x10 emit_lit_memmove_long_repeat_emit_encodeBetterBlockAsm1Klarge_big_loop_back: MOVOU (R9), X4 @@ -16432,6 +22008,7 @@ emit_literal_done_repeat_emit_encodeBetterBlockAsm1K: // matchLen XORL R10, R10 JMP matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm1K + PCALIGN $0x10 matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm1K: MOVQ (R8)(R10*1), R9 @@ -16447,6 +22024,7 @@ matchlen_loop_16_entry_repeat_extend_encodeBetterBlockAsm1K: CMPL DI, $0x10 JAE matchlen_loopback_16_repeat_extend_encodeBetterBlockAsm1K JMP matchlen_match8_repeat_extend_encodeBetterBlockAsm1K + PCALIGN $0x10 matchlen_bsf_16repeat_extend_encodeBetterBlockAsm1K: TZCNTQ R11, R11 @@ -16463,11 +22041,12 @@ matchlen_match8_repeat_extend_encodeBetterBlockAsm1K: LEAL -8(DI), DI LEAL 8(R10), R10 JMP matchlen_match4_repeat_extend_encodeBetterBlockAsm1K + PCALIGN $0x10 matchlen_bsf_8_repeat_extend_encodeBetterBlockAsm1K: TZCNTQ R9, R9 SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 + ADDL R9, R10 JMP repeat_extend_forward_end_encodeBetterBlockAsm1K matchlen_match4_repeat_extend_encodeBetterBlockAsm1K: @@ -16495,6 +22074,8 @@ matchlen_match1_repeat_extend_encodeBetterBlockAsm1K: CMPB (SI)(R10*1), R9 JNE repeat_extend_forward_end_encodeBetterBlockAsm1K LEAL 1(R10), R10 + JMP repeat_extend_forward_end_encodeBetterBlockAsm1K + PCALIGN $0x10 repeat_extend_forward_end_encodeBetterBlockAsm1K: ADDL R10, AX @@ 
-16537,6 +22118,7 @@ repeat_one_match_repeat_encodeBetterBlockAsm1K: repeat_end_emit_encodeBetterBlockAsm1K: MOVL AX, 12(SP) JMP search_loop_encodeBetterBlockAsm1K + PCALIGN $0x10 no_repeat_found_encodeBetterBlockAsm1K: CMPL R10, DI @@ -16559,6 +22141,7 @@ candidateS_match_encodeBetterBlockAsm1K: JEQ candidate_match_encodeBetterBlockAsm1K DECL AX MOVL R8, SI + PCALIGN $0x10 candidate_match_encodeBetterBlockAsm1K: MOVL 12(SP), BX @@ -16598,6 +22181,7 @@ match_dst_size_check_encodeBetterBlockAsm1K: // matchLen XORL R11, R11 JMP matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm1K + PCALIGN $0x10 matchlen_loopback_16_match_nolit_encodeBetterBlockAsm1K: MOVQ (R8)(R11*1), R10 @@ -16613,6 +22197,7 @@ matchlen_loop_16_entry_match_nolit_encodeBetterBlockAsm1K: CMPL DI, $0x10 JAE matchlen_loopback_16_match_nolit_encodeBetterBlockAsm1K JMP matchlen_match8_match_nolit_encodeBetterBlockAsm1K + PCALIGN $0x10 matchlen_bsf_16match_nolit_encodeBetterBlockAsm1K: TZCNTQ R12, R12 @@ -16629,11 +22214,12 @@ matchlen_match8_match_nolit_encodeBetterBlockAsm1K: LEAL -8(DI), DI LEAL 8(R11), R11 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm1K + PCALIGN $0x10 matchlen_bsf_8_match_nolit_encodeBetterBlockAsm1K: TZCNTQ R10, R10 SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 + ADDL R10, R11 JMP match_nolit_end_encodeBetterBlockAsm1K matchlen_match4_match_nolit_encodeBetterBlockAsm1K: @@ -16661,6 +22247,8 @@ matchlen_match1_match_nolit_encodeBetterBlockAsm1K: CMPB (R9)(R11*1), R10 JNE match_nolit_end_encodeBetterBlockAsm1K LEAL 1(R11), R11 + JMP match_nolit_end_encodeBetterBlockAsm1K + PCALIGN $0x10 match_nolit_end_encodeBetterBlockAsm1K: MOVL AX, DI @@ -16782,6 +22370,7 @@ emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_8: MOVQ (R8), R10 MOVQ R10, (CX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm1K + PCALIGN $0x10 emit_lit_memmove_match_emit_encodeBetterBlockAsm1K_memmove_move_8through16: MOVQ (R8), R10 @@ -16845,20 +22434,21 @@ memmove_long_match_emit_encodeBetterBlockAsm1K: LEAQ (CX)(SI*1), R9 // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(SI*1), X2 - MOVOU -16(R8)(SI*1), X3 - MOVQ SI, R12 - SHRQ $0x05, R12 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm1Klarge_forward_sse_loop_32 - LEAQ -32(R8)(R13*1), R10 - LEAQ -32(CX)(R13*1), R14 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(SI*1), X2 + MOVOU -16(R8)(SI*1), X3 + MOVQ SI, R12 + SHRQ $0x05, R12 + MOVQ CX, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R12 + JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm1Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R13*1), R10 + LEAQ -32(CX)(R13*1), R14 + PCALIGN $0x10 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm1Klarge_big_loop_back: MOVOU (R10), X4 @@ -17045,6 +22635,7 @@ emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (CX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm1K + PCALIGN $0x10 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm1K_memmove_move_8through16: MOVQ (R8), R9 @@ -17108,20 +22699,21 @@ memmove_long_match_emit_repeat_encodeBetterBlockAsm1K: LEAQ (CX)(DI*1), SI // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x05, R10 - MOVQ CX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R12 - SUBQ R9, R12 - DECQ R10 - JA 
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm1Klarge_forward_sse_loop_32 - LEAQ -32(R8)(R12*1), R9 - LEAQ -32(CX)(R12*1), R13 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 + MOVQ DI, R10 + SHRQ $0x05, R10 + MOVQ CX, R9 + ANDL $0x0000001f, R9 + MOVQ $0x00000040, R12 + SUBQ R9, R12 + DECQ R10 + JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm1Klarge_forward_sse_loop_32 + LEAQ -32(R8)(R12*1), R9 + LEAQ -32(CX)(R12*1), R13 + PCALIGN $0x10 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm1Klarge_big_loop_back: MOVOU (R9), X4 @@ -17206,11 +22798,13 @@ match_nolit_dst_ok_encodeBetterBlockAsm1K: SHLQ $0x10, R10 IMULQ DI, R10 SHRQ $0x35, R10 + SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x38, R11 SHLQ $0x10, R12 IMULQ DI, R12 SHRQ $0x35, R12 + SHLQ $0x20, R13 IMULQ R8, R13 SHRQ $0x38, R13 LEAQ 1(BX), R8 @@ -17327,6 +22921,7 @@ emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_4through8: MOVL SI, (CX) MOVL AX, -4(CX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm1K + PCALIGN $0x10 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm1K_memmove_move_8through16: MOVQ (AX), SI @@ -17392,20 +22987,21 @@ memmove_long_emit_remainder_encodeBetterBlockAsm1K: MOVL SI, BX // genMemMoveLong - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm1Klarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU -32(AX)(BX*1), X2 + MOVOU -16(AX)(BX*1), X3 + MOVQ BX, DI + SHRQ $0x05, DI + MOVQ CX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm1Klarge_forward_sse_loop_32 + LEAQ -32(AX)(R8*1), SI + LEAQ -32(CX)(R8*1), R9 + PCALIGN $0x10 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm1Klarge_big_loop_back: MOVOU (SI), X4 @@ -17524,6 +23120,7 @@ emit_lit_memmove_standalone_memmove_move_4through8: MOVL SI, (AX) MOVL CX, -4(AX)(DX*1) JMP emit_literal_end_standalone + PCALIGN $0x10 emit_lit_memmove_standalone_memmove_move_8through16: MOVQ (CX), SI @@ -17579,20 +23176,21 @@ emit_lit_memmove_mid_standalone_memmove_move_33through64: memmove_long_standalone: // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(DX*1), X2 - MOVOU -16(CX)(DX*1), X3 - MOVQ DX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 + MOVOU (CX), X0 + MOVOU 16(CX), X1 + MOVOU -32(CX)(DX*1), X2 + MOVOU -16(CX)(DX*1), X3 + MOVQ DX, DI + SHRQ $0x05, DI + MOVQ AX, SI + ANDL $0x0000001f, SI + MOVQ $0x00000040, R8 + SUBQ SI, R8 + DECQ DI + JA emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 + LEAQ -32(CX)(R8*1), SI + LEAQ -32(AX)(R8*1), R9 + PCALIGN $0x10 emit_lit_memmove_long_standalonelarge_big_loop_back: MOVOU (SI), X4 @@ -18020,6 +23618,7 @@ TEXT ·matchLen(SB), NOSPLIT, $0-56 // matchLen XORL SI, SI JMP matchlen_loop_16_entry_standalone + PCALIGN $0x10 matchlen_loopback_16_standalone: MOVQ (AX)(SI*1), BX @@ -18035,6 +23634,7 @@ matchlen_loop_16_entry_standalone: CMPL DX, $0x10 JAE matchlen_loopback_16_standalone JMP matchlen_match8_standalone + PCALIGN $0x10 matchlen_bsf_16standalone: TZCNTQ DI, DI @@ 
-18051,11 +23651,12 @@ matchlen_match8_standalone: LEAL -8(DX), DX LEAL 8(SI), SI JMP matchlen_match4_standalone + PCALIGN $0x10 matchlen_bsf_8_standalone: TZCNTQ BX, BX SARQ $0x03, BX - LEAL (SI)(BX*1), SI + ADDL BX, SI JMP gen_match_len_end matchlen_match4_standalone: @@ -18215,6 +23816,7 @@ emit_lit_memmove_lz4_mz_memmove_move_4through8: MOVL R13, (AX) MOVL R14, -4(AX)(R12*1) JMP memmove_end_copy_lz4_mz + PCALIGN $0x10 emit_lit_memmove_lz4_mz_memmove_move_8through16: MOVQ (DX), R13 @@ -18280,20 +23882,21 @@ memmove_long_lz4_mz: MOVL R8, R12 // genMemMoveLong - MOVOU (DX), X0 - MOVOU 16(DX), X1 - MOVOU -32(DX)(R12*1), X2 - MOVOU -16(DX)(R12*1), X3 - MOVQ R12, R14 - SHRQ $0x05, R14 - MOVQ AX, R13 - ANDL $0x0000001f, R13 - MOVQ $0x00000040, R15 - SUBQ R13, R15 - DECQ R14 - JA emit_lit_memmove_long_lz4_mzlarge_forward_sse_loop_32 - LEAQ -32(DX)(R15*1), R13 - LEAQ -32(AX)(R15*1), BP + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R12*1), X2 + MOVOU -16(DX)(R12*1), X3 + MOVQ R12, R14 + SHRQ $0x05, R14 + MOVQ AX, R13 + ANDL $0x0000001f, R13 + MOVQ $0x00000040, R15 + SUBQ R13, R15 + DECQ R14 + JA emit_lit_memmove_long_lz4_mzlarge_forward_sse_loop_32 + LEAQ -32(DX)(R15*1), R13 + LEAQ -32(AX)(R15*1), BP + PCALIGN $0x10 emit_lit_memmove_long_lz4_mzlarge_big_loop_back: MOVOU (R13), X4 @@ -18528,6 +24131,7 @@ emit_lit_memmove_lz4_mz_emitcopy_memmove_move_4through8: MOVL R12, (AX) MOVL DI, -4(AX)(R10*1) JMP memmove_end_copy_lz4_mz_emitcopy + PCALIGN $0x10 emit_lit_memmove_lz4_mz_emitcopy_memmove_move_8through16: MOVQ (DI), R12 @@ -18591,20 +24195,21 @@ memmove_long_lz4_mz_emitcopy: LEAQ (AX)(R10*1), R11 // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R10*1), X2 - MOVOU -16(DI)(R10*1), X3 - MOVQ R10, R13 - SHRQ $0x05, R13 - MOVQ AX, R12 - ANDL $0x0000001f, R12 - MOVQ $0x00000040, R14 - SUBQ R12, R14 - DECQ R13 - JA emit_lit_memmove_long_lz4_mz_emitcopylarge_forward_sse_loop_32 - LEAQ -32(DI)(R14*1), R12 - LEAQ -32(AX)(R14*1), R15 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(R10*1), X2 + MOVOU -16(DI)(R10*1), X3 + MOVQ R10, R13 + SHRQ $0x05, R13 + MOVQ AX, R12 + ANDL $0x0000001f, R12 + MOVQ $0x00000040, R14 + SUBQ R12, R14 + DECQ R13 + JA emit_lit_memmove_long_lz4_mz_emitcopylarge_forward_sse_loop_32 + LEAQ -32(DI)(R14*1), R12 + LEAQ -32(AX)(R14*1), R15 + PCALIGN $0x10 emit_lit_memmove_long_lz4_mz_emitcopylarge_big_loop_back: MOVOU (R12), X4 @@ -18912,6 +24517,7 @@ emit_lit_memmove_lz4_mz_emit_final_memmove_move_4through8: MOVL BX, (AX) MOVL DI, -4(AX)(DX*1) JMP memmove_end_copy_lz4_mz_emit_final + PCALIGN $0x10 emit_lit_memmove_lz4_mz_emit_final_memmove_move_8through16: MOVQ (DI), BX @@ -18977,20 +24583,21 @@ memmove_long_lz4_mz_emit_final: MOVL R10, DX // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(DX*1), X2 - MOVOU -16(DI)(DX*1), X3 - MOVQ DX, R8 - SHRQ $0x05, R8 - MOVQ AX, BX - ANDL $0x0000001f, BX - MOVQ $0x00000040, R9 - SUBQ BX, R9 - DECQ R8 - JA emit_lit_memmove_long_lz4_mz_emit_finallarge_forward_sse_loop_32 - LEAQ -32(DI)(R9*1), BX - LEAQ -32(AX)(R9*1), R10 + MOVOU (DI), X0 + MOVOU 16(DI), X1 + MOVOU -32(DI)(DX*1), X2 + MOVOU -16(DI)(DX*1), X3 + MOVQ DX, R8 + SHRQ $0x05, R8 + MOVQ AX, BX + ANDL $0x0000001f, BX + MOVQ $0x00000040, R9 + SUBQ BX, R9 + DECQ R8 + JA emit_lit_memmove_long_lz4_mz_emit_finallarge_forward_sse_loop_32 + LEAQ -32(DI)(R9*1), BX + LEAQ -32(AX)(R9*1), R10 + PCALIGN $0x10 emit_lit_memmove_long_lz4_mz_emit_finallarge_big_loop_back: MOVOU (BX), X4 @@ -19056,12 +24663,14 @@ TEXT ·decodeBlockAsm(SB), $8-56 MOVBQZX (R8), 
R10 MOVQ R10, R11 SHRQ $0x02, R11 + PCALIGN $0x10 decodeBlockAsm_fast_loop_nofetch: - CMPQ SI, BX - JAE decodeBlockAsm_fast_end_copy - ANDQ $0x03, R10 - JNZ decodeBlockAsm_fast_copy + CMPQ SI, BX + JAE decodeBlockAsm_fast_end_copy + ANDQ $0x03, R10 + JNZ decodeBlockAsm_fast_copy + PCALIGN $0x10 decodeBlockAsm_fast_lits: MOVL R11, R12 @@ -19072,6 +24681,7 @@ decodeBlockAsm_fast_lits: CMPL R12, $0x1e JEQ decodeBlockAsm_fast_lit_2 JMP decodeBlockAsm_fast_lit_3 + PCALIGN $0x10 decodeBlockAsm_fast_lit_0: INCQ R8 @@ -19092,6 +24702,7 @@ decodeBlockAsm_fast_lit_0: CMPQ R12, $0x20 JBE decodeBlockAsm_fast_lit_0_copy_memmove_move_17through32 JMP decodeBlockAsm_fast_lit_0_copy_memmove_move_33through64 + PCALIGN $0x10 decodeBlockAsm_fast_lit_0_copy_memmove_move_8through16: MOVOU (R8), X0 @@ -19145,20 +24756,21 @@ decodeBlockAsm_fast_litcopy_long: JBE decodeBlockAsm_fast_litcopy_short_reduced // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(R12*1), X2 - MOVOU -16(R8)(R12*1), X3 - MOVQ R12, R11 - SHRQ $0x05, R11 - MOVQ SI, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R11 - JA decodeBlockAsm_fast_litcopy_longlarge_forward_sse_loop_32 - LEAQ -32(R8)(R13*1), R10 - LEAQ -32(SI)(R13*1), R14 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(R12*1), X2 + MOVOU -16(R8)(R12*1), X3 + MOVQ R12, R11 + SHRQ $0x05, R11 + MOVQ SI, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R13 + SUBQ R10, R13 + DECQ R11 + JA decodeBlockAsm_fast_litcopy_longlarge_forward_sse_loop_32 + LEAQ -32(R8)(R13*1), R10 + LEAQ -32(SI)(R13*1), R14 + PCALIGN $0x10 decodeBlockAsm_fast_litcopy_longlarge_big_loop_back: MOVOU (R10), X4 @@ -19229,6 +24841,7 @@ decodeBlockAsm_fast_copy: JB decodeBlockAsm_fast_copy_1 JEQ decodeBlockAsm_fast_copy_2 JMP decodeBlockAsm_fast_copy_3 + PCALIGN $0x10 decodeBlockAsm_fast_copy_1: MOVWQZX R13, R9 @@ -19246,6 +24859,7 @@ decodeBlockAsm_fast_copy_1: CMOVLEQ R11, R12 CMOVQEQ R10, R8 JMP decodeBlockAsm_fast_copy_exec + PCALIGN $0x10 decodeBlockAsm_fast_copy_2: MOVQ R11, R12 @@ -19287,6 +24901,7 @@ decodeBlockAsm_fast_copy_2_0_extra: LEAL 4(R12), R12 ADDQ $0x40, R9 JMP decodeBlockAsm_fast_copy_short_no_ol + PCALIGN $0x10 decodeBlockAsm_fast_copy_3: MOVL R13, R9 @@ -19356,6 +24971,7 @@ decodeBlockAsm_fast_copy_fused_long: ADDQ R10, SI ADDQ R10, DI JMP decodeBlockAsm_fast_copy_exec_long_long + PCALIGN $0x10 decodeBlockAsm_fast_copy_exec_short: CMPL R9, DI @@ -19371,6 +24987,7 @@ decodeBlockAsm_fast_copy_exec_short: CMPL R9, R12 JB decodeBlockAsm_fast_copy_overlap JMP decodeBlockAsm_fast_copy_short + PCALIGN $0x10 decodeBlockAsm_fast_copy_exec_long_long: MOVQ SI, R11 @@ -19385,28 +25002,39 @@ decodeBlockAsm_fast_copy_exec_long_long: MOVBQZX (R8), R10 // genMemMoveLong - MOVQ R12, R13 - SHRQ $0x05, R13 - MOVQ SI, R14 - MOVQ R12, R15 + MOVQ R12, R13 + SHRQ $0x06, R13 + MOVQ SI, R14 + MOVQ R12, R15 + PCALIGN $0x10 decodeBlockAsm_fast_copy_long_longlarge_big_loop_back: MOVOU (R11), X0 MOVOU 16(R11), X1 + MOVOU 32(R11), X2 + MOVOU 48(R11), X3 MOVOU X0, (R14) MOVOU X1, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R11 - SUBQ $0x20, R15 + MOVOU X2, 32(R14) + MOVOU X3, 48(R14) + ADDQ $0x40, R14 + ADDQ $0x40, R11 + SUBQ $0x40, R15 + JZ decodeBlockAsm_fast_copy_done DECQ R13 JNZ decodeBlockAsm_fast_copy_long_longlarge_big_loop_back TESTQ R15, R15 JZ decodeBlockAsm_fast_copy_done - MOVOU -32(R11)(R15*1), X0 - MOVOU -16(R11)(R15*1), X1 - MOVOU X0, -32(R14)(R15*1) - MOVOU X1, -16(R14)(R15*1) + MOVOU -64(R11)(R15*1), X0 + MOVOU -48(R11)(R15*1), X1 + MOVOU -32(R11)(R15*1), X2 + MOVOU 
-16(R11)(R15*1), X3 + MOVOU X0, -64(R14)(R15*1) + MOVOU X1, -48(R14)(R15*1) + MOVOU X2, -32(R14)(R15*1) + MOVOU X3, -16(R14)(R15*1) JMP decodeBlockAsm_fast_copy_done + PCALIGN $0x10 decodeBlockAsm_fast_copy_short_no_ol: MOVQ SI, R11 @@ -19427,6 +25055,7 @@ decodeBlockAsm_fast_copy_short_no_ol: CMPQ R12, $0x20 JBE decodeBlockAsm_fast_copy_short_no_ol_memmove_move_17through32 JMP decodeBlockAsm_fast_copy_short_no_ol_memmove_move_33through64 + PCALIGN $0x10 decodeBlockAsm_fast_copy_short_no_ol_memmove_move_8through16: MOVOU (R11), X0 @@ -19450,6 +25079,7 @@ decodeBlockAsm_fast_copy_short_no_ol_memmove_move_33through64: MOVOU X2, -32(SI)(R12*1) MOVOU X3, -16(SI)(R12*1) JMP decodeBlockAsm_fast_copy_done + PCALIGN $0x10 decodeBlockAsm_fast_copy_exec: CMPL R9, DI @@ -19465,7 +25095,7 @@ decodeBlockAsm_fast_copy_exec: CMPL R9, R12 JB decodeBlockAsm_fast_copy_overlap CMPL R12, $0x40 - JA decodeBlockAsm_fast_copy_long + JAE decodeBlockAsm_fast_copy_long decodeBlockAsm_fast_copy_short: // genMemMoveShort @@ -19475,6 +25105,7 @@ decodeBlockAsm_fast_copy_short: CMPQ R12, $0x20 JBE decodeBlockAsm_fast_copy_short_memmove_move_17through32 JMP decodeBlockAsm_fast_copy_short_memmove_move_33through64 + PCALIGN $0x10 decodeBlockAsm_fast_copy_short_memmove_move_8through16: MOVOU (R11), X0 @@ -19501,20 +25132,21 @@ decodeBlockAsm_fast_copy_short_memmove_move_33through64: decodeBlockAsm_fast_copy_long: // genMemMoveLong - MOVOU (R11), X0 - MOVOU 16(R11), X1 - MOVOU -32(R11)(R12*1), X2 - MOVOU -16(R11)(R12*1), X3 - MOVQ R12, R14 - SHRQ $0x05, R14 - MOVQ SI, R13 - ANDL $0x0000001f, R13 - MOVQ $0x00000040, R15 - SUBQ R13, R15 - DECQ R14 - JA decodeBlockAsm_fast_copy_longlarge_forward_sse_loop_32 - LEAQ -32(R11)(R15*1), R13 - LEAQ -32(SI)(R15*1), BP + MOVOU (R11), X0 + MOVOU 16(R11), X1 + MOVOU -32(R11)(R12*1), X2 + MOVOU -16(R11)(R12*1), X3 + MOVQ R12, R14 + SHRQ $0x05, R14 + MOVQ SI, R13 + ANDL $0x0000001f, R13 + MOVQ $0x00000040, R15 + SUBQ R13, R15 + DECQ R14 + JA decodeBlockAsm_fast_copy_longlarge_forward_sse_loop_32 + LEAQ -32(R11)(R15*1), R13 + LEAQ -32(SI)(R15*1), BP + PCALIGN $0x10 decodeBlockAsm_fast_copy_longlarge_big_loop_back: MOVOU (R13), X4 @@ -19642,6 +25274,7 @@ decodeBlockAsm_remain_loop: JAE decodeBlockAsm_remain_end_copy ANDQ $0x03, DX JNZ decodeBlockAsm_remain_copy + PCALIGN $0x10 decodeBlockAsm_remain_lits: MOVL BX, DX @@ -19652,6 +25285,7 @@ decodeBlockAsm_remain_lits: CMPL DX, $0x1e JEQ decodeBlockAsm_remain_lit_2 JMP decodeBlockAsm_remain_lit_3 + PCALIGN $0x10 decodeBlockAsm_remain_lit_0: INCQ R8 @@ -19698,6 +25332,7 @@ decodeBlockAsm_remain_lit_0_copy_memmove_move_4through8: MOVL BX, (SI) MOVL R10, -4(SI)(DX*1) JMP decodeBlockAsm_remain_litcopy_done + PCALIGN $0x10 decodeBlockAsm_remain_lit_0_copy_memmove_move_8through16: MOVQ (R8), BX @@ -19759,20 +25394,21 @@ decodeBlockAsm_remain_litcopy_long: JBE decodeBlockAsm_remain_litcopy_short_reduced // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DX*1), X2 - MOVOU -16(R8)(DX*1), X3 - MOVQ DX, R10 - SHRQ $0x05, R10 - MOVQ SI, BX - ANDL $0x0000001f, BX - MOVQ $0x00000040, R11 - SUBQ BX, R11 - DECQ R10 - JA decodeBlockAsm_remain_litcopy_longlarge_forward_sse_loop_32 - LEAQ -32(R8)(R11*1), BX - LEAQ -32(SI)(R11*1), R12 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DX*1), X2 + MOVOU -16(R8)(DX*1), X3 + MOVQ DX, R10 + SHRQ $0x05, R10 + MOVQ SI, BX + ANDL $0x0000001f, BX + MOVQ $0x00000040, R11 + SUBQ BX, R11 + DECQ R10 + JA decodeBlockAsm_remain_litcopy_longlarge_forward_sse_loop_32 + LEAQ -32(R8)(R11*1), BX + LEAQ 
-32(SI)(R11*1), R12 + PCALIGN $0x10 decodeBlockAsm_remain_litcopy_longlarge_big_loop_back: MOVOU (BX), X4 @@ -19842,6 +25478,7 @@ decodeBlockAsm_remain_copy: JB decodeBlockAsm_remain_copy_1 JEQ decodeBlockAsm_remain_copy_2 JMP decodeBlockAsm_remain_copy_3 + PCALIGN $0x10 decodeBlockAsm_remain_copy_1: ADDQ $0x02, R8 @@ -19864,6 +25501,7 @@ decodeBlockAsm_remain_copy_1: decodeBlockAsm_remain_copy_1_short: LEAL 4(DX), DX JMP decodeBlockAsm_remain_copy_exec_short + PCALIGN $0x10 decodeBlockAsm_remain_copy_2: MOVQ BX, DX @@ -19910,6 +25548,7 @@ decodeBlockAsm_remain_copy_2_0_extra: LEAL 4(DX), DX ADDQ $0x40, R9 JMP decodeBlockAsm_remain_copy_short_no_ol + PCALIGN $0x10 decodeBlockAsm_remain_copy_3: ADDQ $0x04, R8 @@ -20071,6 +25710,7 @@ decodeBlockAsm_remain_copy3_fused_lits_done: ADDQ R10, SI ADDQ R10, DI JMP decodeBlockAsm_remain_copy_exec_long_long + PCALIGN $0x10 decodeBlockAsm_remain_copy_exec_short: CMPL R9, DI @@ -20083,6 +25723,7 @@ decodeBlockAsm_remain_copy_exec_short: CMPL R9, DX JB decodeBlockAsm_remain_copy_overlap JMP decodeBlockAsm_remain_copy_short + PCALIGN $0x10 decodeBlockAsm_remain_copy_exec_long_long: MOVQ SI, BX @@ -20094,28 +25735,39 @@ decodeBlockAsm_remain_copy_exec_long_long: JA corrupt // genMemMoveLong - MOVQ DX, R10 - SHRQ $0x05, R10 - MOVQ SI, R11 - MOVQ DX, R12 + MOVQ DX, R10 + SHRQ $0x06, R10 + MOVQ SI, R11 + MOVQ DX, R12 + PCALIGN $0x10 decodeBlockAsm_remain_copy_long_longlarge_big_loop_back: MOVOU (BX), X0 MOVOU 16(BX), X1 + MOVOU 32(BX), X2 + MOVOU 48(BX), X3 MOVOU X0, (R11) MOVOU X1, 16(R11) - ADDQ $0x20, R11 - ADDQ $0x20, BX - SUBQ $0x20, R12 + MOVOU X2, 32(R11) + MOVOU X3, 48(R11) + ADDQ $0x40, R11 + ADDQ $0x40, BX + SUBQ $0x40, R12 + JZ decodeBlockAsm_remain_copy_done DECQ R10 JNZ decodeBlockAsm_remain_copy_long_longlarge_big_loop_back TESTQ R12, R12 JZ decodeBlockAsm_remain_copy_done - MOVOU -32(BX)(R12*1), X0 - MOVOU -16(BX)(R12*1), X1 - MOVOU X0, -32(R11)(R12*1) - MOVOU X1, -16(R11)(R12*1) + MOVOU -64(BX)(R12*1), X0 + MOVOU -48(BX)(R12*1), X1 + MOVOU -32(BX)(R12*1), X2 + MOVOU -16(BX)(R12*1), X3 + MOVOU X0, -64(R11)(R12*1) + MOVOU X1, -48(R11)(R12*1) + MOVOU X2, -32(R11)(R12*1) + MOVOU X3, -16(R11)(R12*1) JMP decodeBlockAsm_remain_copy_done + PCALIGN $0x10 decodeBlockAsm_remain_copy_short_no_ol: MOVQ SI, BX @@ -20142,6 +25794,7 @@ decodeBlockAsm_remain_copy_short_no_ol_memmove_move_4through8: MOVL R10, (SI) MOVL BX, -4(SI)(DX*1) JMP decodeBlockAsm_remain_copy_done + PCALIGN $0x10 decodeBlockAsm_remain_copy_short_no_ol_memmove_move_8through16: MOVQ (BX), R10 @@ -20167,6 +25820,7 @@ decodeBlockAsm_remain_copy_short_no_ol_memmove_move_33through64: MOVOU X2, -32(SI)(DX*1) MOVOU X3, -16(SI)(DX*1) JMP decodeBlockAsm_remain_copy_done + PCALIGN $0x10 decodeBlockAsm_remain_copy_exec: CMPL R9, DI @@ -20179,7 +25833,7 @@ decodeBlockAsm_remain_copy_exec: CMPL R9, DX JB decodeBlockAsm_remain_copy_overlap CMPL DX, $0x40 - JA decodeBlockAsm_remain_copy_long + JAE decodeBlockAsm_remain_copy_long decodeBlockAsm_remain_copy_short: // genMemMoveShort @@ -20215,6 +25869,7 @@ decodeBlockAsm_remain_copy_short_memmove_move_4through8: MOVL R10, (SI) MOVL BX, -4(SI)(DX*1) JMP decodeBlockAsm_remain_copy_done + PCALIGN $0x10 decodeBlockAsm_remain_copy_short_memmove_move_8through16: MOVQ (BX), R10 @@ -20243,20 +25898,21 @@ decodeBlockAsm_remain_copy_short_memmove_move_33through64: decodeBlockAsm_remain_copy_long: // genMemMoveLong - MOVOU (BX), X0 - MOVOU 16(BX), X1 - MOVOU -32(BX)(DX*1), X2 - MOVOU -16(BX)(DX*1), X3 - MOVQ DX, R11 - SHRQ $0x05, R11 - MOVQ SI, R10 - ANDL 
$0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA decodeBlockAsm_remain_copy_longlarge_forward_sse_loop_32 - LEAQ -32(BX)(R12*1), R10 - LEAQ -32(SI)(R12*1), R13 + MOVOU (BX), X0 + MOVOU 16(BX), X1 + MOVOU -32(BX)(DX*1), X2 + MOVOU -16(BX)(DX*1), X3 + MOVQ DX, R11 + SHRQ $0x05, R11 + MOVQ SI, R10 + ANDL $0x0000001f, R10 + MOVQ $0x00000040, R12 + SUBQ R10, R12 + DECQ R11 + JA decodeBlockAsm_remain_copy_longlarge_forward_sse_loop_32 + LEAQ -32(BX)(R12*1), R10 + LEAQ -32(SI)(R12*1), R13 + PCALIGN $0x10 decodeBlockAsm_remain_copy_longlarge_big_loop_back: MOVOU (R10), X4 diff --git a/asm_none.go b/asm_none.go index 0943444..4a4054a 100644 --- a/asm_none.go +++ b/asm_none.go @@ -23,6 +23,24 @@ import ( const hasAsm = false +// encodeBlockFast encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) +func encodeBlockFast(dst, src []byte) (d int) { + if len(src) < minNonLiteralBlockSize { + return 0 + } + if len(src) <= 65536 { + // Only very marginally faster... + return encodeFastBlockGo64K(dst, src) + } + return encodeFastBlockGo(dst, src) +} + // encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It // assumes that the varint-encoded length of the decompressed bytes has already // been written. @@ -34,6 +52,9 @@ func encodeBlock(dst, src []byte) (d int) { if len(src) < minNonLiteralBlockSize { return 0 } + if len(src) <= 65536 { + return encodeBlockGo64K(dst, src) + } return encodeBlockGo(dst, src) } @@ -45,6 +66,12 @@ // // len(dst) >= MaxEncodedLen(len(src)) func encodeBlockBetter(dst, src []byte) (d int) { + if len(src) < minNonLiteralBlockSize { + return 0 + } + if len(src) <= 64<<10 { + return encodeBlockBetterGo64K(dst, src) + } return encodeBlockBetterGo(dst, src) } diff --git a/cmd/mz/compress.go b/cmd/mz/compress.go index 2c3ee49..7064153 100644 --- a/cmd/mz/compress.go +++ b/cmd/mz/compress.go @@ -39,6 +39,8 @@ func mainCompress(args []string) { var ( fs = flag.NewFlagSet("compress", flag.ExitOnError) + nocomp = fs.Bool("0", false, "Perform no compression") + superFast = fs.Bool("xfast", false, "Compress fastest, with a major compression loss") faster = fs.Bool("1", false, "Compress faster, but with a minor compression loss") _ = fs.Bool("2", true, "Default compression speed") slower = fs.Bool("3", false, "Compress more, but a lot slower") @@ -98,6 +100,12 @@ Options:`) if *slower { level = minlz.LevelSmallest } + if *superFast { + level = minlz.LevelSuperFast + } + if *nocomp { + level = minlz.LevelUncompressed + } opts := []minlz.WriterOption{minlz.WriterBlockSize(int(sz)), minlz.WriterConcurrency(*cpu), minlz.WriterPadding(int(pad)), minlz.WriterLevel(level), minlz.WriterAddIndex(*index)} wr := minlz.NewWriter(nil, opts...) diff --git a/encode.go b/encode.go index d2e5a60..b7bf6bd 100644 --- a/encode.go +++ b/encode.go @@ -23,6 +23,14 @@ import ( ) const ( + // LevelSuperFast is the fastest compression level. + // This will take significant shortcuts and usually provide much worse compression. + // Use only if LevelFastest is confirmed to be too slow. + LevelSuperFast = -1 + + // LevelUncompressed will bypass compression. + LevelUncompressed = 0 + // LevelFastest is the fastest compression level.
LevelFastest = 1 @@ -83,6 +91,10 @@ func Encode(dst, src []byte, level int) ([]byte, error) { var n int switch level { + case LevelUncompressed: + return encodeUncompressed(dst[:0], src), nil + case LevelSuperFast: + n = encodeBlockFast(dst[d:], src) case LevelFastest: n = encodeBlock(dst[d:], src) case LevelBalanced: @@ -92,9 +104,17 @@ default: return nil, ErrInvalidLevel } - if n > 0 { if debugValidateBlocks { + if n+d > len(dst) { + x := crc32.ChecksumIEEE(src) + name := fmt.Sprintf("errs/block-%08x", x) + os.WriteFile(name+"src.bin", src, 0644) + os.WriteFile(name+"dst.mzb", dst, 0644) + + panic(fmt.Sprintf("level %d encoded block too large: %d > %d", level, n+d, len(dst))) + } + block := dst[d : d+n] dst := make([]byte, len(src), len(src)) ret := minLZDecode(dst, block) @@ -165,6 +185,8 @@ func TryEncode(dst, src []byte, level int) []byte { var n int switch level { + case LevelSuperFast: + n = encodeBlockFast(dst[d:], src) case LevelFastest: n = encodeBlock(dst[d:], src) case LevelBalanced: diff --git a/encode_amd64.go b/encode_amd64.go index 951e3d1..37d1164 100644 --- a/encode_amd64.go +++ b/encode_amd64.go @@ -24,6 +24,88 @@ import ( const hasAsm = true +var encFastPools [7]sync.Pool + +// encodeBlockFast encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockFast(dst, src []byte) (d int) { + race.ReadSlice(src) + race.WriteSlice(dst) + + switch { + case len(src) > 2<<20: + const sz, pool = 65536, 5 + tmp, ok := encFastPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encFastPools[pool].Put(tmp) + return encodeFastBlockAsm(dst, src, tmp) + case len(src) > 512<<10: + const sz, pool = 32768, 0 + tmp, ok := encFastPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encFastPools[pool].Put(tmp) + return encodeFastBlockAsm2MB(dst, src, tmp) + case len(src) > 64<<10: + const sz, pool = 32768, 0 + tmp, ok := encFastPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encFastPools[pool].Put(tmp) + return encodeFastBlockAsm512K(dst, src, tmp) + case len(src) > 16<<10: + const sz, pool = 8192, 1 + tmp, ok := encFastPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encFastPools[pool].Put(tmp) + return encodeFastBlockAsm64K(dst, src, tmp) + case len(src) > 4<<10: + const sz, pool = 4096, 2 + tmp, ok := encFastPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encFastPools[pool].Put(tmp) + return encodeFastBlockAsm16K(dst, src, tmp) + case len(src) > 1<<10: + const sz, pool = 2048, 3 + tmp, ok := encFastPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encFastPools[pool].Put(tmp) + return encodeFastBlockAsm4K(dst, src, tmp) + case len(src) > 32: + const sz, pool = 1024, 4 + tmp, ok := encFastPools[pool].Get().(*[sz]byte) + if !ok { + tmp = &[sz]byte{} + } + race.WriteSlice(tmp[:]) + defer encFastPools[pool].Put(tmp) + return encodeFastBlockAsm1K(dst, src, tmp) + } + return 0 +} + var encPools [7]sync.Pool // encodeBlock encodes a non-empty src to a guaranteed-large-enough dst.
It diff --git a/encode_l0.go b/encode_l0.go new file mode 100644 index 0000000..41f940f --- /dev/null +++ b/encode_l0.go @@ -0,0 +1,522 @@ +// Copyright 2025 MinIO Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package minlz + +import ( + "bytes" + "encoding/hex" + "fmt" + "math/bits" +) + +// encodeFastBlockGo encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeFastBlockGo(dst, src []byte) (d int) { + // Initialize the hash table. + const ( + tableBits = 13 + maxTableSize = 1 << tableBits + skipLog = 5 + + debug = debugEncode + ) + + // Having values inside the table is ~the same speed as looking up + // - maybe slightly faster on bigger blocks. + // We go for the smaller stack allocation for now. + var table [maxTableSize]uint32 + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>3 - 6 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + if debugEncode { + fmt.Println("encodeBlockGo: Starting encode") + } + for { + candidate := 0 + for { + // Next src position to check + nextS := s + (s-nextEmit)>>skipLog + 5 + if nextS > sLimit { + goto emitRemainder + } + minSrcPos := s - maxCopy3Offset + hash0 := hash8(cv, tableBits) + cv1 := load64(src, s+1) + hash1 := hash8(cv1, tableBits) + candidate = int(table[hash0]) + candidate2 := int(table[hash1]) + table[hash0] = uint32(s) + table[hash1] = uint32(s + 1) + cv2 := load64(src, s+2) + hash2 := hash8(cv2, tableBits) + + // Check repeat at offset checkRep. + // Speed impact is very small. + const checkRep = 1 + if uint32(cv1) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + // Bail if we exceed the maximum size. + if d+(base-nextEmit) > dstLimit { + return 0 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + if debugEncode { + fmt.Println(nextEmit, "(lits) length:", base-nextEmit, "d-after:", d) + } + + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + if debug { + // Validate match. 
+ if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + d += emitRepeat(dst[d:], s-base) + if debugEncode { + fmt.Println(base, "(repeat) length:", s-base, "offset:", repeat, "d-after:", d) + } + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + cv = load64(src, s) + continue + } + + if candidate >= minSrcPos && cv == load64(src, candidate) { + break + } + candidate = int(table[hash2]) + if candidate2 >= minSrcPos && cv1 == load64(src, candidate2) { + table[hash2] = uint32(s + 2) + candidate = candidate2 + s++ + break + } + table[hash2] = uint32(s + 2) + if candidate >= minSrcPos && cv2 == load64(src, candidate) { + s += 2 + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards. + // The top bytes will be rechecked to get the full match. + for false && candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { + candidate-- + s-- + } + + // An 8-byte match has been found. We'll later see if more than 8 bytes match. + base := s + repeat = base - candidate + + // Extend the 8-byte match as long as possible. + s += 8 + candidate += 8 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + length := s - base + if nextEmit != base { + // No significant speedup when disabled. + if base-nextEmit > maxCopy3Lits || repeat < minCopy2Offset { + // Bail if we exceed the maximum size. + // We will not exceed dstLimit with the other encodings. + if d+(s-nextEmit) > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + d += emitCopy(dst[d:], repeat, length) + } else if repeat <= maxCopy2Offset { + d += emitCopyLits2(dst[d:], src[nextEmit:base], repeat, length) + } else { + d += emitCopyLits3(dst[d:], src[nextEmit:base], repeat, length) + } + } else { + d += emitCopy(dst[d:], repeat, length) + } + if debugEncode { + fmt.Println(base, "(copy) length:", s-base, "offset:", repeat, "d-after:", d) + } + if debug { + // Validate match. + if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic(fmt.Sprintf("mismatch: source: %v != target: %v", hex.EncodeToString(a), hex.EncodeToString(b))) + } + } + + for { + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + x := load64(src, s-2) + + if d > dstLimit { + // Bail if we don't have space for more.
+ return 0 + } + // Check for an immediate match, otherwise start search at s+1 + m2Hash := hash8(x, tableBits) + x = load64(src, s) + currHash := hash8(x, tableBits) + candidate = int(table[currHash]) + table[m2Hash] = uint32(s - 2) + table[currHash] = uint32(s) + if debug && s == candidate { + panic("s == candidate") + } + if s-candidate > maxCopy3Offset || x != load64(src, candidate) { + cv = load64(src, s+1) + s++ + break + } + + repeat = s - candidate + base = s + s += 8 + candidate += 8 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + d += emitCopy(dst[d:], repeat, s-base) + if debugEncode { + fmt.Println(base, "(copy) length:", s-base, "offset:", repeat, "d-after:", d) + } + } + } + +emitRemainder: + if nextEmit < len(src) { + if debugEncode { + fmt.Println(nextEmit, "emit remainder", len(src)-nextEmit, "d:", d) + } + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + if debugEncode { + fmt.Println("emit remainder", d+len(src)-nextEmit, " exceeds limit", dstLimit) + } + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +func encodeFastBlockGo64K(dst, src []byte) (d int) { + // Initialize the hash table. + const ( + tableBits = 12 + maxTableSize = 1 << tableBits + skipLog = 4 + debug = debugEncode + ) + // Having values inside the table is ~the same speed as looking up + // - maybe slightly faster on bigger blocks. + // We go for the smaller stack allocation for now. + var table [maxTableSize]uint16 + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>4 - 32 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + if debugEncode { + fmt.Println("encodeBlockGo: Starting encode") + } + for { + candidate := 0 + for { + // Next src position to check + nextS := s + (s-nextEmit)>>skipLog + 4 + if nextS > sLimit { + goto emitRemainder + } + hash0 := hash8(cv, tableBits) + cv1 := load64(src, s+1) + hash1 := hash8(cv1, tableBits) + candidate = int(table[hash0]) + candidate2 := int(table[hash1]) + table[hash0] = uint16(s) + table[hash1] = uint16(s + 1) + cv2 := load64(src, s+2) + hash2 := hash8(cv2, tableBits) + + // Check repeat at offset checkRep. + // Speed impact is very small. + const checkRep = 1 + if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + // Bail if we exceed the maximum size. 
+				if d+(base-nextEmit) > dstLimit {
+					return 0
+				}
+
+				d += emitLiteral(dst[d:], src[nextEmit:base])
+				if debugEncode {
+					fmt.Println(nextEmit, "(lits) length:", base-nextEmit, "d-after:", d)
+				}
+
+				// Extend forward
+				candidate := s - repeat + 4 + checkRep
+				s += 4 + checkRep
+				for s <= sLimit {
+					if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
+						s += bits.TrailingZeros64(diff) >> 3
+						break
+					}
+					s += 8
+					candidate += 8
+				}
+				if debug {
+					// Validate match.
+					if s <= candidate {
+						panic("s <= candidate")
+					}
+					a := src[base:s]
+					b := src[base-repeat : base-repeat+(s-base)]
+					if !bytes.Equal(a, b) {
+						panic("mismatch")
+					}
+				}
+				d += emitRepeat(dst[d:], s-base)
+				if debugEncode {
+					fmt.Println(base, "(repeat) length:", s-base, "offset:", repeat, "d-after:", d)
+				}
+				nextEmit = s
+				if s >= sLimit {
+					goto emitRemainder
+				}
+
+				cv = load64(src, s)
+				continue
+			}
+
+			if cv == load64(src, candidate) {
+				break
+			}
+			candidate = int(table[hash2])
+			if cv1 == load64(src, candidate2) {
+				table[hash2] = uint16(s + 2)
+				candidate = candidate2
+				s++
+				break
+			}
+			table[hash2] = uint16(s + 2)
+			if cv2 == load64(src, candidate) {
+				s += 2
+				break
+			}
+
+			cv = load64(src, nextS)
+			s = nextS
+		}
+
+		// Extend backwards. Disabled via the false && guard:
+		// the top bytes will be rechecked to get the full match anyway.
+		for false && candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] {
+			candidate--
+			s--
+		}
+
+		// An 8-byte match has been found. We'll later see if more than 8 bytes match.
+		base := s
+		repeat = base - candidate
+
+		// Extend the 8-byte match as long as possible.
+		s += 8
+		candidate += 8
+		for s <= len(src)-8 {
+			if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
+				s += bits.TrailingZeros64(diff) >> 3
+				break
+			}
+			s += 8
+			candidate += 8
+		}
+		length := s - base
+		if nextEmit != base {
+			if base-nextEmit > maxCopy2Lits || repeat < minCopy2Offset {
+				// Bail if we exceed the maximum size.
+				// We will not exceed dstLimit with the other encodings.
+				if d+(s-nextEmit) > dstLimit {
+					return 0
+				}
+				d += emitLiteral(dst[d:], src[nextEmit:base])
+				d += emitCopy(dst[d:], repeat, length)
+			} else {
+				d += emitCopyLits2(dst[d:], src[nextEmit:base], repeat, length)
+			}
+		} else {
+			d += emitCopy(dst[d:], repeat, length)
+		}
+		if debugEncode {
+			fmt.Println(base, "(copy) length:", s-base, "offset:", repeat, "d-after:", d)
+		}
+		if debug {
+			// Validate match.
+			if s <= candidate {
+				panic("s <= candidate")
+			}
+			a := src[base:s]
+			b := src[base-repeat : base-repeat+(s-base)]
+			if !bytes.Equal(a, b) {
+				panic(fmt.Sprintf("mismatch: source: %v != target: %v", hex.EncodeToString(a), hex.EncodeToString(b)))
+			}
+		}
+
+		for {
+			nextEmit = s
+			if s >= sLimit {
+				goto emitRemainder
+			}
+			x := load64(src, s-2)
+
+			if d > dstLimit {
+				// Do we have space for more, if not bail.
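+				// Same bail-out as in encodeFastBlockGo above: give up rather
+				// than emit output beyond dstLimit.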
+				return 0
+			}
+			// Check for an immediate match, otherwise start search at s+1
+			m2Hash := hash8(x, tableBits)
+			x = load64(src, s)
+			currHash := hash8(x, tableBits)
+			candidate = int(table[currHash])
+			table[m2Hash] = uint16(s - 2)
+			table[currHash] = uint16(s)
+			if debug && s == candidate {
+				panic("s == candidate")
+			}
+			if x != load64(src, candidate) {
+				cv = load64(src, s+1)
+				s++
+				break
+			}
+
+			repeat = s - candidate
+			base = s
+			s += 8
+			candidate += 8
+			for s <= len(src)-8 {
+				if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
+					s += bits.TrailingZeros64(diff) >> 3
+					break
+				}
+				s += 8
+				candidate += 8
+			}
+			d += emitCopy(dst[d:], repeat, s-base)
+			if debugEncode {
+				fmt.Println(base, "(copy) length:", s-base, "offset:", repeat, "d-after:", d)
+			}
+		}
+	}
+
+emitRemainder:
+	if nextEmit < len(src) {
+		if debugEncode {
+			fmt.Println(nextEmit, "emit remainder", len(src)-nextEmit, "d:", d)
+		}
+		// Bail if we exceed the maximum size.
+		if d+len(src)-nextEmit > dstLimit {
+			if debugEncode {
+				fmt.Println("emit remainder", d+len(src)-nextEmit, "exceeds limit", dstLimit)
+			}
+			return 0
+		}
+		d += emitLiteral(dst[d:], src[nextEmit:])
+	}
+	return d
+}
diff --git a/encode_l1.go b/encode_l1.go
index 268b2e9..faf6b84 100644
--- a/encode_l1.go
+++ b/encode_l1.go
@@ -45,9 +45,7 @@ func encodeBlockGo(dst, src []byte) (d int) {
 		debug = debugEncode
 	)
 
-	if len(src) <= 65536 {
-		return encodeBlockGo64K(dst, src)
-	}
+
 	// Having values inside the table is ~the same speed as looking up
 	// - maybe slightly faster on bigger blocks.
 	// We go for the smaller stack allocation for now.
diff --git a/encode_l2.go b/encode_l2.go
index 6c7bd66..484ae89 100644
--- a/encode_l2.go
+++ b/encode_l2.go
@@ -66,9 +66,7 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
 	if len(src) < minNonLiteralBlockSize {
 		return 0
 	}
-	if len(src) <= 64<<10 {
-		return encodeBlockBetterGo64K(dst, src)
-	}
+
 	// Initialize the hash tables.
 	const (
 		// Long hash matches.
diff --git a/fuzz_test.go b/fuzz_test.go
index a3544a3..256d304 100644
--- a/fuzz_test.go
+++ b/fuzz_test.go
@@ -71,7 +71,7 @@ func FuzzEncodingBlocks(f *testing.F) {
 		}(decDst[len(decDst)-4:])
 		decDst = decDst[:len(data):len(data)]
 		const levelReference = LevelSmallest + 1
-		for l := LevelFastest; l <= levelReference; l++ {
+		for l := LevelSuperFast; l <= levelReference; l++ {
 			for i := range decDst {
 				decDst[i] = 0xfe
 			}
@@ -134,6 +134,7 @@ func FuzzDecode(f *testing.F) {
 	fuzz.ReturnFromZip(f, "testdata/fuzz/block-corpus-raw.zip", fuzz.TypeRaw, addCompressed)
 	fuzz.ReturnFromZip(f, "testdata/fuzz/block-corpus-enc.zip", fuzz.TypeGoFuzz, addCompressed)
 	fuzz.AddFromZip(f, "testdata/dec-block-regressions.zip", fuzz.TypeRaw, false)
+	fuzz.AddFromZip(f, "testdata/fuzz/block-corpus-dec.zip", fuzz.TypeGoFuzz, false)
 
 	dec := NewReader(nil, ReaderIgnoreCRC())
 	f.Fuzz(func(t *testing.T, data []byte) {
@@ -143,17 +144,19 @@ func FuzzDecode(f *testing.F) {
 		if bytes.HasPrefix(data, []byte(magicChunk)) {
 			dec.Reset(bytes.NewReader(data))
 			_, err := io.Copy(io.Discard, dec)
-			if true {
-				dec.Reset(bytes.NewReader(data))
-				_, cErr := dec.DecodeConcurrent(io.Discard, 2)
-				if (err == nil) != (cErr == nil) {
-					t.Error("error mismatch", err, cErr)
-				}
+			dec.Reset(bytes.NewReader(data))
+			_, cErr := dec.DecodeConcurrent(io.Discard, 2)
+			if (err == nil) != (cErr == nil) {
+				t.Error("error mismatch", err, cErr)
 			}
 			return
 		}
 		dCopy := append([]byte{}, data...)
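+		// dCopy is a pristine copy of the input, used below to verify that
+		// Decode does not mutate the data it is given.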
-		dlen, err := DecodedLen(data)
+		isMz, dlen, err := IsMinLZ(data)
+		if !isMz && dlen > MaxBlockSize {
+			// Don't do s2/snappy fallback if big.
+			return
+		}
 		base, baseErr := Decode(nil, data)
 		if !bytes.Equal(data, dCopy) {
 			t.Fatal("data was changed")
@@ -163,9 +166,6 @@ func FuzzDecode(f *testing.F) {
 		dataCapped = append(dataCapped, data...)
 		dataCapped = append(dataCapped, bytes.Repeat([]byte{0xff, 0xff, 0xff, 0xff}, 1024/4)...)
 		dataCapped = dataCapped[:len(data):len(data)]
-		if dlen > MaxBlockSize {
-			dlen = MaxBlockSize
-		}
 		dst2 := bytes.Repeat([]byte{0xfe}, dlen+1024)
 		got, err := Decode(dst2[:dlen:dlen], dataCapped[:len(data)])
 		if !bytes.Equal(dataCapped[:len(data)], dCopy) {
@@ -242,7 +242,7 @@ func FuzzStreamEncode(f *testing.F) {
 	fuzz.AddFromZip(f, "testdata/fuzz/block-corpus-enc.zip", fuzz.TypeGoFuzz, false)
 
 	var encoders []*Writer
-	for l := LevelFastest; l <= LevelSmallest; l++ {
+	for l := LevelSuperFast; l <= LevelSmallest; l++ {
 		encoders = append(encoders, NewWriter(nil, WriterLevel(l), WriterConcurrency(1), WriterBlockSize(128<<10)))
 		if !testing.Short() && l == LevelFastest {
 			// Try some combinations...
diff --git a/minlz_test.go b/minlz_test.go
index 1ca6ca8..af36e91 100644
--- a/minlz_test.go
+++ b/minlz_test.go
@@ -91,10 +91,24 @@ func encodeGo(dst, src []byte, level int) []byte {
 
 	var n int
 	switch level {
+	case LevelSuperFast:
+		if len(src) <= 64<<10 {
+			n = encodeFastBlockGo64K(dst[d:], src)
+		} else {
+			n = encodeFastBlockGo(dst[d:], src)
+		}
 	case LevelFastest:
-		n = encodeBlockGo(dst[d:], src)
+		if len(src) <= 64<<10 {
+			n = encodeBlockGo64K(dst[d:], src)
+		} else {
+			n = encodeBlockGo(dst[d:], src)
+		}
 	case LevelBalanced:
-		n = encodeBlockBetterGo(dst[d:], src)
+		if len(src) <= 64<<10 {
+			n = encodeBlockBetterGo64K(dst[d:], src)
+		} else {
+			n = encodeBlockBetterGo(dst[d:], src)
+		}
 	case LevelSmallest:
 		n = encodeBlockBest(dst[d:], src, nil)
 	default:
diff --git a/testdata/fuzz/block-corpus-dec-raw.zip b/testdata/fuzz/block-corpus-dec-raw.zip
deleted file mode 100644
index 1bd530e..0000000
Binary files a/testdata/fuzz/block-corpus-dec-raw.zip and /dev/null differ
diff --git a/testdata/fuzz/block-corpus-dec.zip b/testdata/fuzz/block-corpus-dec.zip
new file mode 100644
index 0000000..8e3f786
Binary files /dev/null and b/testdata/fuzz/block-corpus-dec.zip differ
diff --git a/testdata/fuzz/block-corpus-enc.zip b/testdata/fuzz/block-corpus-enc.zip
index c746a30..31403ca 100644
Binary files a/testdata/fuzz/block-corpus-enc.zip and b/testdata/fuzz/block-corpus-enc.zip differ
diff --git a/testdata/fuzz/block-corpus-raw.zip b/testdata/fuzz/block-corpus-raw.zip
index c18dd54..80c4d35 100644
Binary files a/testdata/fuzz/block-corpus-raw.zip and b/testdata/fuzz/block-corpus-raw.zip differ
diff --git a/writer.go b/writer.go
index 4432996..f1667ff 100644
--- a/writer.go
+++ b/writer.go
@@ -85,7 +85,7 @@ type Writer struct {
 	flushOnWrite bool
 	appendIndex  bool
 	genIndex     bool
-	level        uint8
+	level        int8
 }
 
 type result struct {
@@ -437,6 +437,8 @@ func (w *Writer) encodeBlock(obuf, uncompressed []byte) int {
 
 	var n int
 	switch w.level {
+	case LevelSuperFast:
+		n = encodeBlockFast(obuf, uncompressed)
 	case LevelFastest:
 		n = encodeBlock(obuf, uncompressed)
 	case LevelBalanced:
@@ -925,10 +927,10 @@ func WriterAddIndex(b bool) WriterOption {
 
 // WriterLevel will set the compression level.
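+// Valid levels range from LevelSuperFast to LevelSmallest;
+// values outside that range return ErrInvalidLevel.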
 func WriterLevel(n int) WriterOption {
 	return func(w *Writer) error {
-		if n < 0 || n > LevelSmallest {
+		if n < LevelSuperFast || n > LevelSmallest {
 			return ErrInvalidLevel
 		}
-		w.level = uint8(n)
+		w.level = int8(n)
 		return nil
 	}
 }