// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.

//go:build amd64 && !appengine && !noasm && gc

// decompress4x_main_loop_amd64 decodes two symbols from each of four bit
// readers per loop iteration and stores them as one 16-bit write per stream.
//
// Offsets read from ctx (field names are inferred from the inline comments
// below; confirm against the Go definition of decompress4xContext):
//   0(ctx)   pointer to four consecutive 48-byte bit reader records
//   8(ctx)   byte: shift count used to peek the top bits of a reader value
//   16(ctx)  output start pointer
//   24(ctx)  distance in bytes between the four output streams (dstEvery)
//   32(ctx)  pointer to the table of 16-bit entries
//   48(ctx)  output limit compared against the write pointer
// Offset written: 40(ctx) receives 4 * (bytes written to the first stream).
//
// Offsets used inside each bit reader record: 0 = input base pointer,
// 24 = off, 32 = value, 40 = bitsRead (byte).
//
// Register roles: DI = peek shift, BX = write pointer, SI = limit,
// R8 = dstEvery, R9 = table, R10 = bit readers, DL = exit flag,
// R11 = current reader value, R12 = current reader bitsRead,
// AX = refill offset scratch, then AL/AH = the two decoded bytes.
//
// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
TEXT ·decompress4x_main_loop_amd64(SB), $0-8
	// Preload values
	MOVQ ctx+0(FP), AX
	MOVBQZX 8(AX), DI
	MOVQ 16(AX), BX
	MOVQ 48(AX), SI
	MOVQ 24(AX), R8
	MOVQ 32(AX), R9
	MOVQ (AX), R10

	// Main loop
main_loop:
	// DL = 1 when the write pointer has reached the limit (signed compare).
	// The iteration below still runs; DL is only tested at the bottom.
	XORL DX, DX
	CMPQ BX, SI
	SETGE DL

	// br0.fillFast32()
	// The refill is skipped while bitsRead <= 32.
	MOVQ 32(R10), R11
	MOVBQZX 40(R10), R12
	CMPQ R12, $0x20
	JBE skip_fill0
	MOVQ 24(R10), AX
	SUBQ $0x20, R12
	SUBQ $0x04, AX
	MOVQ (R10), R13

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (AX)(R13*1), R13
	MOVQ R12, CX
	SHLQ CL, R13
	MOVQ AX, 24(R10)
	ORQ R13, R11

	// exhausted += (br0.off < 4)
	// CMPQ sets the carry flag when off < 4 (unsigned); ADCB adds it to DL.
	CMPQ AX, $0x04
	ADCB $+0, DL

skip_fill0:
	// val0 := br0.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v0 := table[val0&mask]
	MOVW (R9)(R13*2), CX

	// br0.advance(uint8(v0.entry))
	// CH (high byte of the entry) is the output byte, CL (low byte) the bit count.
	MOVB CH, AL
	SHLQ CL, R11
	ADDB CL, R12

	// val1 := br0.peekTopBits(peekBits)
	MOVQ DI, CX
	MOVQ R11, R13
	SHRQ CL, R13

	// v1 := table[val1&mask]
	MOVW (R9)(R13*2), CX

	// br0.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R11
	ADDB CL, R12

	// these two writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	MOVW AX, (BX)

	// update the bitreader structure
	MOVQ R11, 32(R10)
	MOVB R12, 40(R10)

	// br1.fillFast32()
	MOVQ 80(R10), R11
	MOVBQZX 88(R10), R12
	CMPQ R12, $0x20
	JBE skip_fill1
	MOVQ 72(R10), AX
	SUBQ $0x20, R12
	SUBQ $0x04, AX
	MOVQ 48(R10), R13

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (AX)(R13*1), R13
	MOVQ R12, CX
	SHLQ CL, R13
	MOVQ AX, 72(R10)
	ORQ R13, R11

	// exhausted += (br1.off < 4)
	CMPQ AX, $0x04
	ADCB $+0, DL

skip_fill1:
	// val0 := br1.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v0 := table[val0&mask]
	MOVW (R9)(R13*2), CX

	// br1.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R11
	ADDB CL, R12

	// val1 := br1.peekTopBits(peekBits)
	MOVQ DI, CX
	MOVQ R11, R13
	SHRQ CL, R13

	// v1 := table[val1&mask]
	MOVW (R9)(R13*2), CX

	// br1.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R11
	ADDB CL, R12

	// these two writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	MOVW AX, (BX)(R8*1)

	// update the bitreader structure
	MOVQ R11, 80(R10)
	MOVB R12, 88(R10)

	// br2.fillFast32()
	MOVQ 128(R10), R11
	MOVBQZX 136(R10), R12
	CMPQ R12, $0x20
	JBE skip_fill2
	MOVQ 120(R10), AX
	SUBQ $0x20, R12
	SUBQ $0x04, AX
	MOVQ 96(R10), R13

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (AX)(R13*1), R13
	MOVQ R12, CX
	SHLQ CL, R13
	MOVQ AX, 120(R10)
	ORQ R13, R11

	// exhausted += (br2.off < 4)
	CMPQ AX, $0x04
	ADCB $+0, DL

skip_fill2:
	// val0 := br2.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v0 := table[val0&mask]
	MOVW (R9)(R13*2), CX

	// br2.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R11
	ADDB CL, R12

	// val1 := br2.peekTopBits(peekBits)
	MOVQ DI, CX
	MOVQ R11, R13
	SHRQ CL, R13

	// v1 := table[val1&mask]
	MOVW (R9)(R13*2), CX

	// br2.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R11
	ADDB CL, R12

	// these two writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	MOVW AX, (BX)(R8*2)

	// update the bitreader structure
	MOVQ R11, 128(R10)
	MOVB R12, 136(R10)

	// br3.fillFast32()
	MOVQ 176(R10), R11
	MOVBQZX 184(R10), R12
	CMPQ R12, $0x20
	JBE skip_fill3
	MOVQ 168(R10), AX
	SUBQ $0x20, R12
	SUBQ $0x04, AX
	MOVQ 144(R10), R13

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (AX)(R13*1), R13
	MOVQ R12, CX
	SHLQ CL, R13
	MOVQ AX, 168(R10)
	ORQ R13, R11

	// exhausted += (br3.off < 4)
	CMPQ AX, $0x04
	ADCB $+0, DL

skip_fill3:
	// val0 := br3.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v0 := table[val0&mask]
	MOVW (R9)(R13*2), CX

	// br3.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R11
	ADDB CL, R12

	// val1 := br3.peekTopBits(peekBits)
	MOVQ DI, CX
	MOVQ R11, R13
	SHRQ CL, R13

	// v1 := table[val1&mask]
	MOVW (R9)(R13*2), CX

	// br3.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R11
	ADDB CL, R12

	// these two writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	// CX = 3 * dstEvery, computed with LEAQ because the addressing mode has
	// no *3 scale.
	LEAQ (R8)(R8*2), CX
	MOVW AX, (BX)(CX*1)

	// update the bitreader structure
	MOVQ R11, 176(R10)
	MOVB R12, 184(R10)

	// Advance the write pointer by the two bytes stored per stream and loop
	// again unless DL was set above.
	ADDQ $0x02, BX
	TESTB DL, DL
	JZ main_loop

	// Store 4 * (write pointer - output start) at 40(ctx).
	MOVQ ctx+0(FP), AX
	SUBQ 16(AX), BX
	SHLQ $0x02, BX
	MOVQ BX, 40(AX)
	RET

// decompress4x_8b_main_loop_amd64 decodes four symbols from each of four bit
// readers per loop iteration and stores them as one 32-bit write per stream.
// Unlike decompress4x_main_loop_amd64 it refills each reader only once per
// four symbols.
//
// Offsets read from ctx (field names are inferred from the inline comments
// below; confirm against the Go definition of decompress4xContext):
//   0(ctx)   pointer to four consecutive 48-byte bit reader records
//   8(ctx)   byte: shift count used to peek the top bits of a reader value
//   16(ctx)  output start pointer
//   24(ctx)  distance in bytes between the four output streams (dstEvery)
//   32(ctx)  pointer to the table of 16-bit entries
//   48(ctx)  output limit compared against the write pointer
// Offset written: 40(ctx) receives 4 * (bytes written to the first stream).
//
// Offsets used inside each bit reader record: 0 = input base pointer,
// 24 = off, 32 = value, 40 = bitsRead (byte).
//
// Register roles: DI = peek shift, BX = write pointer, SI = limit,
// R8 = dstEvery, R9 = table, R10 = bit readers, DL = exit flag,
// R11 = current reader value, R12 = current reader bitsRead,
// R13/R14 = scratch, AX = the four decoded bytes.
//
// Byte packing: v0 and v1 go to AL and AH, BSWAPL moves them to the two top
// bytes of AX, v2 and v3 go to AH and AL, and a second BSWAPL leaves
// v0, v1, v2, v3 in ascending memory order for the 32-bit store.
//
// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
	// Preload values
	MOVQ ctx+0(FP), CX
	MOVBQZX 8(CX), DI
	MOVQ 16(CX), BX
	MOVQ 48(CX), SI
	MOVQ 24(CX), R8
	MOVQ 32(CX), R9
	MOVQ (CX), R10

	// Main loop
main_loop:
	// DL = 1 when the write pointer has reached the limit (signed compare).
	// The iteration below still runs; DL is only tested at the bottom.
	XORL DX, DX
	CMPQ BX, SI
	SETGE DL

	// br0.fillFast32()
	// The refill is skipped while bitsRead <= 32.
	MOVQ 32(R10), R11
	MOVBQZX 40(R10), R12
	CMPQ R12, $0x20
	JBE skip_fill0
	MOVQ 24(R10), R13
	SUBQ $0x20, R12
	SUBQ $0x04, R13
	MOVQ (R10), R14

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (R13)(R14*1), R14
	MOVQ R12, CX
	SHLQ CL, R14
	MOVQ R13, 24(R10)
	ORQ R14, R11

	// exhausted += (br0.off < 4)
	// CMPQ sets the carry flag when off < 4 (unsigned); ADCB adds it to DL.
	CMPQ R13, $0x04
	ADCB $+0, DL

skip_fill0:
	// val0 := br0.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v0 := table[val0&mask]
	MOVW (R9)(R13*2), CX

	// br0.advance(uint8(v0.entry))
	// CH (high byte of the entry) is the output byte, CL (low byte) the bit count.
	MOVB CH, AL
	SHLQ CL, R11
	ADDB CL, R12

	// val1 := br0.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v1 := table[val1&mask]
	MOVW (R9)(R13*2), CX

	// br0.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R11
	ADDB CL, R12
	BSWAPL AX

	// val2 := br0.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v2 := table[val2&mask]
	MOVW (R9)(R13*2), CX

	// br0.advance(uint8(v2.entry))
	MOVB CH, AH
	SHLQ CL, R11
	ADDB CL, R12

	// val3 := br0.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v3 := table[val3&mask]
	MOVW (R9)(R13*2), CX

	// br0.advance(uint8(v3.entry))
	MOVB CH, AL
	SHLQ CL, R11
	ADDB CL, R12
	BSWAPL AX

	// these four writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
	MOVL AX, (BX)

	// update the bitreader structure
	MOVQ R11, 32(R10)
	MOVB R12, 40(R10)

	// br1.fillFast32()
	MOVQ 80(R10), R11
	MOVBQZX 88(R10), R12
	CMPQ R12, $0x20
	JBE skip_fill1
	MOVQ 72(R10), R13
	SUBQ $0x20, R12
	SUBQ $0x04, R13
	MOVQ 48(R10), R14

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (R13)(R14*1), R14
	MOVQ R12, CX
	SHLQ CL, R14
	MOVQ R13, 72(R10)
	ORQ R14, R11

	// exhausted += (br1.off < 4)
	CMPQ R13, $0x04
	ADCB $+0, DL

skip_fill1:
	// val0 := br1.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v0 := table[val0&mask]
	MOVW (R9)(R13*2), CX

	// br1.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R11
	ADDB CL, R12

	// val1 := br1.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v1 := table[val1&mask]
	MOVW (R9)(R13*2), CX

	// br1.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R11
	ADDB CL, R12
	BSWAPL AX

	// val2 := br1.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v2 := table[val2&mask]
	MOVW (R9)(R13*2), CX

	// br1.advance(uint8(v2.entry))
	MOVB CH, AH
	SHLQ CL, R11
	ADDB CL, R12

	// val3 := br1.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v3 := table[val3&mask]
	MOVW (R9)(R13*2), CX

	// br1.advance(uint8(v3.entry))
	MOVB CH, AL
	SHLQ CL, R11
	ADDB CL, R12
	BSWAPL AX

	// these four writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
	MOVL AX, (BX)(R8*1)

	// update the bitreader structure
	MOVQ R11, 80(R10)
	MOVB R12, 88(R10)

	// br2.fillFast32()
	MOVQ 128(R10), R11
	MOVBQZX 136(R10), R12
	CMPQ R12, $0x20
	JBE skip_fill2
	MOVQ 120(R10), R13
	SUBQ $0x20, R12
	SUBQ $0x04, R13
	MOVQ 96(R10), R14

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (R13)(R14*1), R14
	MOVQ R12, CX
	SHLQ CL, R14
	MOVQ R13, 120(R10)
	ORQ R14, R11

	// exhausted += (br2.off < 4)
	CMPQ R13, $0x04
	ADCB $+0, DL

skip_fill2:
	// val0 := br2.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v0 := table[val0&mask]
	MOVW (R9)(R13*2), CX

	// br2.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R11
	ADDB CL, R12

	// val1 := br2.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v1 := table[val1&mask]
	MOVW (R9)(R13*2), CX

	// br2.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R11
	ADDB CL, R12
	BSWAPL AX

	// val2 := br2.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v2 := table[val2&mask]
	MOVW (R9)(R13*2), CX

	// br2.advance(uint8(v2.entry))
	MOVB CH, AH
	SHLQ CL, R11
	ADDB CL, R12

	// val3 := br2.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v3 := table[val3&mask]
	MOVW (R9)(R13*2), CX

	// br2.advance(uint8(v3.entry))
	MOVB CH, AL
	SHLQ CL, R11
	ADDB CL, R12
	BSWAPL AX

	// these four writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
	MOVL AX, (BX)(R8*2)

	// update the bitreader structure
	MOVQ R11, 128(R10)
	MOVB R12, 136(R10)

	// br3.fillFast32()
	MOVQ 176(R10), R11
	MOVBQZX 184(R10), R12
	CMPQ R12, $0x20
	JBE skip_fill3
	MOVQ 168(R10), R13
	SUBQ $0x20, R12
	SUBQ $0x04, R13
	MOVQ 144(R10), R14

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (R13)(R14*1), R14
	MOVQ R12, CX
	SHLQ CL, R14
	MOVQ R13, 168(R10)
	ORQ R14, R11

	// exhausted += (br3.off < 4)
	CMPQ R13, $0x04
	ADCB $+0, DL

skip_fill3:
	// val0 := br3.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v0 := table[val0&mask]
	MOVW (R9)(R13*2), CX

	// br3.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R11
	ADDB CL, R12

	// val1 := br3.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v1 := table[val1&mask]
	MOVW (R9)(R13*2), CX

	// br3.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R11
	ADDB CL, R12
	BSWAPL AX

	// val2 := br3.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v2 := table[val2&mask]
	MOVW (R9)(R13*2), CX

	// br3.advance(uint8(v2.entry))
	MOVB CH, AH
	SHLQ CL, R11
	ADDB CL, R12

	// val3 := br3.peekTopBits(peekBits)
	MOVQ R11, R13
	MOVQ DI, CX
	SHRQ CL, R13

	// v3 := table[val3&mask]
	MOVW (R9)(R13*2), CX

	// br3.advance(uint8(v3.entry))
	MOVB CH, AL
	SHLQ CL, R11
	ADDB CL, R12
	BSWAPL AX

	// these four writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
	// CX = 3 * dstEvery, computed with LEAQ because the addressing mode has
	// no *3 scale.
	LEAQ (R8)(R8*2), CX
	MOVL AX, (BX)(CX*1)

	// update the bitreader structure
	MOVQ R11, 176(R10)
	MOVB R12, 184(R10)

	// Advance the write pointer by the four bytes stored per stream and loop
	// again unless DL was set above.
	ADDQ $0x04, BX
	TESTB DL, DL
	JZ main_loop

	// Store 4 * (write pointer - output start) at 40(ctx).
	MOVQ ctx+0(FP), AX
	SUBQ 16(AX), BX
	SHLQ $0x02, BX
	MOVQ BX, 40(AX)
	RET

// decompress1x_main_loop_amd64 decodes four symbols per loop iteration from a
// single bit reader, refilling the reader before each pair of symbols, and
// stores them as one 32-bit write.
//
// Offsets read from ctx (field names are not visible here; confirm against
// the Go definition of decompress1xContext):
//   0(ctx)   pointer to the bit reader record
//   8(ctx)   byte: shift count used to peek the top bits of the reader value
//   16(ctx)  output start pointer
//   24(ctx)  output length in bytes
//   32(ctx)  pointer to the table of 16-bit entries
// Offset written: 40(ctx) receives the number of bytes written, or -1 when
// the output bound check fails.
//
// Offsets used inside the bit reader record: 0 = input base pointer,
// 24 = input offset, 32 = bit container, 40 = bit count (byte). The last
// three are written back on the normal exit path only.
//
// Register roles: DX = write pointer, BX = output end, SI = table,
// DI = peek shift, R8 = input base, R9 = input offset, R10 = bit container,
// R11 = bit count, AX = the four decoded bytes.
//
// func decompress1x_main_loop_amd64(ctx *decompress1xContext)
TEXT ·decompress1x_main_loop_amd64(SB), $0-8
	MOVQ ctx+0(FP), CX
	MOVQ 16(CX), DX
	MOVQ 24(CX), BX

	// An output length below 4 (unsigned) is reported as an error up front.
	CMPQ BX, $0x04
	JB error_max_decoded_size_exceeded

	// BX = output start + output length.
	LEAQ (DX)(BX*1), BX
	MOVQ (CX), SI
	MOVQ (SI), R8
	MOVQ 24(SI), R9
	MOVQ 32(SI), R10
	MOVBQZX 40(SI), R11
	MOVQ 32(CX), SI
	MOVBQZX 8(CX), DI
	JMP loop_condition

main_loop:
	// Check if we have room for 4 bytes in the output buffer
	// Takes the error path when DX+4 >= BX (signed compare).
	LEAQ 4(DX), CX
	CMPQ CX, BX
	JGE error_max_decoded_size_exceeded

	// Decode 4 values
	// First refill: taken when the bit count is >= 32. It steps the input
	// offset back by 4 and ORs the 32 bits loaded there into the container,
	// shifted left by the reduced bit count.
	CMPQ R11, $0x20
	JL bitReader_fillFast_1_end
	SUBQ $0x20, R11
	SUBQ $0x04, R9
	MOVL (R8)(R9*1), R12
	MOVQ R11, CX
	SHLQ CL, R12
	ORQ R12, R10

bitReader_fillFast_1_end:
	// Symbol 0 -> AL. The table index is the container shifted right by DI;
	// CH of the entry is the output byte, CL the number of bits consumed.
	MOVQ DI, CX
	MOVQ R10, R12
	SHRQ CL, R12
	MOVW (SI)(R12*2), CX
	MOVB CH, AL
	MOVBQZX CL, CX
	ADDQ CX, R11
	SHLQ CL, R10

	// Symbol 1 -> AH.
	MOVQ DI, CX
	MOVQ R10, R12
	SHRQ CL, R12
	MOVW (SI)(R12*2), CX
	MOVB CH, AH
	MOVBQZX CL, CX
	ADDQ CX, R11
	SHLQ CL, R10

	// Move symbols 0 and 1 to the two top bytes of AX.
	BSWAPL AX

	// Second refill, same as the first.
	CMPQ R11, $0x20
	JL bitReader_fillFast_2_end
	SUBQ $0x20, R11
	SUBQ $0x04, R9
	MOVL (R8)(R9*1), R12
	MOVQ R11, CX
	SHLQ CL, R12
	ORQ R12, R10

bitReader_fillFast_2_end:
	// Symbol 2 -> AH.
	MOVQ DI, CX
	MOVQ R10, R12
	SHRQ CL, R12
	MOVW (SI)(R12*2), CX
	MOVB CH, AH
	MOVBQZX CL, CX
	ADDQ CX, R11
	SHLQ CL, R10

	// Symbol 3 -> AL.
	MOVQ DI, CX
	MOVQ R10, R12
	SHRQ CL, R12
	MOVW (SI)(R12*2), CX
	MOVB CH, AL
	MOVBQZX CL, CX
	ADDQ CX, R11
	SHLQ CL, R10

	// Second swap leaves symbols 0..3 in ascending memory order.
	BSWAPL AX

	// Store the decoded values
	MOVL AX, (DX)
	ADDQ $0x04, DX

loop_condition:
	// Iterate while the input offset is >= 8 (signed). Each iteration lowers
	// it by at most 8 (two refills of 4), so it stays non-negative.
	CMPQ R9, $0x08
	JGE main_loop

	// Update ctx structure
	// 40(ctx) = bytes written; then write the reader state back.
	MOVQ ctx+0(FP), AX
	SUBQ 16(AX), DX
	MOVQ DX, 40(AX)
	MOVQ (AX), AX
	MOVQ R9, 24(AX)
	MOVQ R10, 32(AX)
	MOVB R11, 40(AX)
	RET

	// Report error
	// Stores -1 at 40(ctx); the bit reader record is not written back.
error_max_decoded_size_exceeded:
	MOVQ ctx+0(FP), AX
	MOVQ $-1, CX
	MOVQ CX, 40(AX)
	RET

// decompress1x_main_loop_bmi2 is the BMI2 variant of
// decompress1x_main_loop_amd64. It performs the same steps but uses the
// three-operand SHLXQ/SHRXQ shifts, which take the shift count from any
// register, so CX does not have to be reloaded with the peek shift before
// each table lookup.
//
// Offsets read from ctx (field names are not visible here; confirm against
// the Go definition of decompress1xContext):
//   0(ctx)   pointer to the bit reader record
//   8(ctx)   byte: shift count used to peek the top bits of the reader value
//   16(ctx)  output start pointer
//   24(ctx)  output length in bytes
//   32(ctx)  pointer to the table of 16-bit entries
// Offset written: 40(ctx) receives the number of bytes written, or -1 when
// the output bound check fails.
//
// Offsets used inside the bit reader record: 0 = input base pointer,
// 24 = input offset, 32 = bit container, 40 = bit count (byte). The last
// three are written back on the normal exit path only.
//
// Register roles: DX = write pointer, BX = output end, SI = table,
// DI = peek shift, R8 = input base, R9 = input offset, R10 = bit container,
// R11 = bit count, CX = scratch, AX = the four decoded bytes.
//
// func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
// Requires: BMI2
TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
	MOVQ ctx+0(FP), CX
	MOVQ 16(CX), DX
	MOVQ 24(CX), BX

	// An output length below 4 (unsigned) is reported as an error up front.
	CMPQ BX, $0x04
	JB error_max_decoded_size_exceeded

	// BX = output start + output length.
	LEAQ (DX)(BX*1), BX
	MOVQ (CX), SI
	MOVQ (SI), R8
	MOVQ 24(SI), R9
	MOVQ 32(SI), R10
	MOVBQZX 40(SI), R11
	MOVQ 32(CX), SI
	MOVBQZX 8(CX), DI
	JMP loop_condition

main_loop:
	// Check if we have room for 4 bytes in the output buffer
	// Takes the error path when DX+4 >= BX (signed compare).
	LEAQ 4(DX), CX
	CMPQ CX, BX
	JGE error_max_decoded_size_exceeded

	// Decode 4 values
	// First refill: taken when the bit count is >= 32. It steps the input
	// offset back by 4 and ORs the 32 bits loaded there into the container,
	// shifted left by the reduced bit count.
	CMPQ R11, $0x20
	JL bitReader_fillFast_1_end
	SUBQ $0x20, R11
	SUBQ $0x04, R9
	MOVL (R8)(R9*1), CX
	SHLXQ R11, CX, CX
	ORQ CX, R10

bitReader_fillFast_1_end:
	// Symbol 0 -> AL. The table index is the container shifted right by DI;
	// CH of the entry is the output byte, CL the number of bits consumed.
	SHRXQ DI, R10, CX
	MOVW (SI)(CX*2), CX
	MOVB CH, AL
	MOVBQZX CL, CX
	ADDQ CX, R11
	SHLXQ CX, R10, R10

	// Symbol 1 -> AH.
	SHRXQ DI, R10, CX
	MOVW (SI)(CX*2), CX
	MOVB CH, AH
	MOVBQZX CL, CX
	ADDQ CX, R11
	SHLXQ CX, R10, R10

	// Move symbols 0 and 1 to the two top bytes of AX.
	BSWAPL AX

	// Second refill, same as the first.
	CMPQ R11, $0x20
	JL bitReader_fillFast_2_end
	SUBQ $0x20, R11
	SUBQ $0x04, R9
	MOVL (R8)(R9*1), CX
	SHLXQ R11, CX, CX
	ORQ CX, R10

bitReader_fillFast_2_end:
	// Symbol 2 -> AH.
	SHRXQ DI, R10, CX
	MOVW (SI)(CX*2), CX
	MOVB CH, AH
	MOVBQZX CL, CX
	ADDQ CX, R11
	SHLXQ CX, R10, R10

	// Symbol 3 -> AL.
	SHRXQ DI, R10, CX
	MOVW (SI)(CX*2), CX
	MOVB CH, AL
	MOVBQZX CL, CX
	ADDQ CX, R11
	SHLXQ CX, R10, R10

	// Second swap leaves symbols 0..3 in ascending memory order.
	BSWAPL AX

	// Store the decoded values
	MOVL AX, (DX)
	ADDQ $0x04, DX

loop_condition:
	// Iterate while the input offset is >= 8 (signed). Each iteration lowers
	// it by at most 8 (two refills of 4), so it stays non-negative.
	CMPQ R9, $0x08
	JGE main_loop

	// Update ctx structure
	// 40(ctx) = bytes written; then write the reader state back.
	MOVQ ctx+0(FP), AX
	SUBQ 16(AX), DX
	MOVQ DX, 40(AX)
	MOVQ (AX), AX
	MOVQ R9, 24(AX)
	MOVQ R10, 32(AX)
	MOVB R11, 40(AX)
	RET

	// Report error
	// Stores -1 at 40(ctx); the bit reader record is not written back.
error_max_decoded_size_exceeded:
	MOVQ ctx+0(FP), AX
	MOVQ $-1, CX
	MOVQ CX, 40(AX)
	RET