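// go-countlines/countlines_amd64.s
// Listing metadata: 86 lines, 1.1 KiB, language tag "ArmAsm".
// Note: despite that tag, this is Go (Plan 9 syntax) assembly for amd64.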

// +build amd64,!gccgo,!appengine
#include "textflag.h"
// countNewlinesASM counts the bytes equal to '\n' (0x0A) in the len bytes
// starting at src and stores the count in ret.
//
// Argument layout as used below (the Go declaration is not in this file;
// presumably `func countNewlinesASM(src *byte, len int) (ret int)` -- TODO
// confirm against the Go stub):
//   src+0(FP)   pointer to the first byte
//   len+8(FP)   number of bytes to scan
//   ret+16(FP)  result
//
// The buffer is consumed from its end towards its start: BX always holds the
// number of leading bytes that have not been examined yet, and every load
// addresses the last 64/16/1 bytes of that unexamined prefix.
//
// Register roles:
//   SI  = src                      BX = bytes still to examine
//   AX  = matches found by the byte-wise tail
//   X0  = all-zero constant        X1 = 16 copies of '\n'
//   X3  = two 64-bit partial sums of matches found by the vector loops
//
// Uses VEX-encoded (AVX) instructions plus PSIGNB/PINSRQ/PEXTRQ; no CPU
// feature check is performed here.
TEXT ·countNewlinesASM(SB),NOSPLIT,$0-24
	MOVQ src+0(FP), SI
	MOVQ len+8(FP), BX
	XORQ AX, AX
	XORQ DX, DX
	XORPD X0, X0
	XORPD X2, X2
	XORPD X3, X3
	MOVQ $0x0A0A0A0A0A0A0A0A, R10 // eight '\n' bytes
	PINSRQ $1, R10, X1
	PINSRQ $0, R10, X1            // X1 = sixteen '\n' bytes

	// Nothing to scan for an empty (or negative, if len is signed) length.
	// Without this guard the tail loop would read src[-1] and then keep
	// decrementing BX far past zero.
	TESTQ BX, BX
	JLE done

	CMPQ BX, $16
	JB tail
	CMPQ BX, $64
	JB loop

bigloop:
	// Invariant: BX >= 64. Examine the last 64 unexamined bytes.
	VMOVDQU -16(SI)(BX*1), X11
	VMOVDQU -32(SI)(BX*1), X10
	VMOVDQU -48(SI)(BX*1), X9
	VMOVDQU -64(SI)(BX*1), X8
	// Each matching byte becomes 0xFF (-1), every other byte 0x00.
	VPCMPEQB X11, X1, X11
	VPCMPEQB X10, X1, X10
	VPCMPEQB X9, X1, X9
	VPCMPEQB X8, X1, X8
	// Add the four masks: every byte lane now holds 0..-4.
	VPADDB X0, X11, X2
	VPADDB X2, X10, X2
	VPADDB X2, X9, X2
	VPADDB X2, X8, X2
	PSIGNB X2, X2                 // negate the negative lanes: 0..4 per byte
	VPSADBW X2, X0, X2            // sum each group of 8 bytes into a 64-bit lane
	VPADDQ X2, X3, X3
	SUBQ $64, BX
	JZ done
	CMPQ BX, $64
	JAE bigloop
	// Fewer than 64 bytes remain. The 16-byte loop needs at least 16 of
	// them, otherwise its load would start before src.
	CMPQ BX, $16
	JB tail

loop:
	// Invariant: BX >= 16. Examine the last 16 unexamined bytes.
	VMOVDQU -16(SI)(BX*1), X11
	VPCMPEQB X11, X1, X11
	VPADDB X0, X11, X2
	PSIGNB X2, X2                 // 0xFF -> 1 for each matching byte
	VPSADBW X2, X0, X2
	VPADDQ X2, X3, X3
	SUBQ $16, BX
	JZ done
	CMPQ BX, $16
	JAE loop

tail:
	// Invariant: 1 <= BX <= 15. Examine one byte at a time.
	MOVB -1(SI)(BX*1), DX
	CMPB DX, $0x0A
	JNZ skip
	INCQ AX
skip:
	SUBQ $1, BX
	JNZ tail

done:
	// Total = scalar count + both 64-bit halves of the vector accumulator.
	PEXTRQ $0, X3, CX
	PEXTRQ $1, X3, DX
	ADDQ CX, AX
	ADDQ DX, AX
	MOVQ AX, ret+16(FP)
	RET