86 lines
1.1 KiB
ArmAsm
86 lines
1.1 KiB
ArmAsm
// +build amd64,!gccgo,!appengine
|
|
|
|
#include "textflag.h"
|
|
|
|
// countNewlinesASM counts the bytes equal to '\n' (0x0A) in a buffer.
//
// Frame layout, as read/written below (the Go declaration is not in this
// file; presumably func countNewlinesASM(src *byte, len uint64) (ret uint64),
// TODO confirm against the Go stub):
//   src+0(FP)   address of the first byte
//   len+8(FP)   number of bytes to scan
//   ret+16(FP)  number of newline bytes found
//
// The buffer is consumed from its end towards its start: BX always holds the
// number of bytes not yet scanned, so every load ends at address SI+BX.
//
// Register roles:
//   SI  buffer base                 BX  bytes remaining
//   AX  scalar count (tail loop)    DX  scratch byte / high lane at ret
//   CX  low lane at ret             R10 0x0A repeated 8 times
//   X0  all zeroes                  X1  0x0A repeated 16 times
//   X2  per-iteration scratch       X3  two 64-bit running totals
//   X8-X11  loaded chunks / compare masks
//
// ISA: uses VEX-encoded 128-bit instructions (AVX), PSIGNB (SSSE3) and
// PINSRQ/PEXTRQ (SSE4.1). No CPU feature check is performed here.
TEXT ·countNewlinesASM(SB),NOSPLIT,$0
	MOVQ src+0(FP), SI
	MOVQ len+8(FP), BX

	XORQ AX, AX
	XORQ DX, DX
	XORPD X0, X0
	XORPD X2, X2
	XORPD X3, X3

	// Empty buffer: nothing to scan. Without this check the tail loop would
	// read src[-1] and then spin with BX wrapped around to 2^64-1.
	TESTQ BX, BX
	JZ ret

	MOVQ $0x0A0A0A0A0A0A0A0A, R10 // prep '\n'
	PINSRQ $1, R10, X1
	PINSRQ $0, R10, X1

	CMPQ BX, $16
	JB tail

	CMPQ BX, $64
	JB loop

bigloop:
	// Invariant: BX >= 64. Load the last 64 unscanned bytes as 4 x 16.
	VMOVDQU -16(SI)(BX*1), X11
	VMOVDQU -32(SI)(BX*1), X10
	VMOVDQU -48(SI)(BX*1), X9
	VMOVDQU -64(SI)(BX*1), X8

	// Each byte becomes 0xFF (-1) where it equals '\n', 0x00 otherwise.
	VPCMPEQB X11, X1, X11
	VPCMPEQB X10, X1, X10
	VPCMPEQB X9, X1, X9
	VPCMPEQB X8, X1, X8

	// Sum the four masks byte-wise: every byte of X2 ends up in -4..0.
	VPADDB X0, X11, X2
	VPADDB X2, X10, X2
	VPADDB X2, X9, X2
	VPADDB X2, X8, X2
	// Negate the negative bytes so each byte holds a count in 0..4.
	PSIGNB X2, X2
	// Sum of absolute differences against zero adds up the eight bytes of
	// each half into that half's 64-bit lane.
	VPSADBW X2, X0, X2
	VPADDQ X2, X3, X3

	SUBQ $64, BX
	JZ ret

	CMPQ BX, $64
	JAE bigloop

	// Fewer than 64 bytes remain. The 16-byte loop may only be entered with
	// at least 16 bytes left; otherwise its load would start before src.
	CMPQ BX, $16
	JB tail

loop:
	// Invariant: BX >= 16. Same scheme as bigloop, one 16-byte chunk.
	VMOVDQU -16(SI)(BX*1), X11

	VPCMPEQB X11, X1, X11
	VPADDB X0, X11, X2
	PSIGNB X2, X2
	VPSADBW X2, X0, X2
	VPADDQ X2, X3, X3

	SUBQ $16, BX
	JZ ret

	CMPQ BX, $16
	JAE loop

tail:
	// Invariant: 1 <= BX <= 15. Scan the remaining bytes one at a time.
	MOVB -1(SI)(BX*1), DX
	CMPB DX, $0x0A
	JNZ next

	INCQ AX
next:

	SUBQ $1, BX
	JNZ tail

ret:
	// Total = scalar count + both 64-bit lanes of the vector accumulator.
	PEXTRQ $0, X3, CX
	PEXTRQ $1, X3, DX
	ADDQ CX, AX
	ADDQ DX, AX
	MOVQ AX, ret+16(FP)
	RET
|