Use error returns and interface type for parsing
Fixes issue #72.

This change simplifies interactions with parsing N-Quads and makes reading datasets more robust. Changes made while here also improve performance:

benchmark          old ns/op    new ns/op    delta
BenchmarkParser    1058         667          -36.96%

We still use string concatenation, which I'm not wildly happy about, but I think this can be left for a later change.

Initial changes towards idiomatic error handling have been made. More significant changes are needed, but these have subtle design implications and need to be thought about more.

30kmoviesdata.nt.gz has been altered to properly escape double quotes. This was done mechanically and with manual curation to pick up stragglers.
This commit is contained in:
parent
abdd649c82
commit
0e0e382d2b
11 changed files with 260 additions and 226 deletions
185
nquads/nquads.go
185
nquads/nquads.go
|
|
@ -16,109 +16,112 @@ package nquads
|
|||
|
||||
import (
|
||||
"bufio"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"strings"
|
||||
|
||||
"github.com/barakmich/glog"
|
||||
|
||||
"github.com/google/cayley/graph"
|
||||
)
|
||||
|
||||
func isWhitespace(s uint8) bool {
|
||||
return (s == '\t' || s == '\r' || s == ' ')
|
||||
}
|
||||
func Parse(str string) *graph.Triple {
|
||||
var (
|
||||
ErrAbsentSubject = errors.New("absent subject")
|
||||
ErrAbsentPredicate = errors.New("absent predicate")
|
||||
ErrAbsentObject = errors.New("absent object")
|
||||
ErrUnterminated = errors.New("unterminated quad")
|
||||
)
|
||||
|
||||
func Parse(str string) (*graph.Triple, error) {
|
||||
// Skip leading whitespace.
|
||||
str = skipWhitespace(str)
|
||||
str = trimSpace(str)
|
||||
// Check for a comment
|
||||
if str != "" && str[0] == '#' {
|
||||
return nil
|
||||
return nil, nil
|
||||
}
|
||||
sub, remainder := getTripleComponent(str)
|
||||
if sub == nil {
|
||||
return nil
|
||||
if sub == "" {
|
||||
return nil, ErrAbsentSubject
|
||||
}
|
||||
str = skipWhitespace(remainder)
|
||||
str = trimSpace(remainder)
|
||||
pred, remainder := getTripleComponent(str)
|
||||
if pred == nil {
|
||||
return nil
|
||||
if pred == "" {
|
||||
return nil, ErrAbsentPredicate
|
||||
}
|
||||
str = skipWhitespace(remainder)
|
||||
str = trimSpace(remainder)
|
||||
obj, remainder := getTripleComponent(str)
|
||||
if obj == nil {
|
||||
return nil
|
||||
if obj == "" {
|
||||
return nil, ErrAbsentObject
|
||||
}
|
||||
str = skipWhitespace(remainder)
|
||||
prov_ptr, remainder := getTripleComponent(str)
|
||||
var prov string
|
||||
if prov_ptr == nil {
|
||||
prov = ""
|
||||
} else {
|
||||
prov = *prov_ptr
|
||||
}
|
||||
str = skipWhitespace(remainder)
|
||||
str = trimSpace(remainder)
|
||||
prov, remainder := getTripleComponent(str)
|
||||
str = trimSpace(remainder)
|
||||
if str != "" && str[0] == '.' {
|
||||
return &graph.Triple{*sub, *pred, *obj, prov}
|
||||
return &graph.Triple{sub, pred, obj, prov}, nil
|
||||
}
|
||||
return nil
|
||||
return nil, ErrUnterminated
|
||||
}
|
||||
|
||||
func skipWhitespace(str string) string {
|
||||
func isSpace(s uint8) bool {
|
||||
return s == ' ' || s == '\t' || s == '\r'
|
||||
}
|
||||
|
||||
func trimSpace(str string) string {
|
||||
i := 0
|
||||
for i < len(str) && isWhitespace(str[i]) {
|
||||
for i < len(str) && isSpace(str[i]) {
|
||||
i += 1
|
||||
}
|
||||
return str[i:]
|
||||
}
|
||||
|
||||
func getTripleComponent(str string) (*string, string) {
|
||||
func getTripleComponent(str string) (head, tail string) {
|
||||
if len(str) == 0 {
|
||||
return nil, str
|
||||
return "", str
|
||||
}
|
||||
if str[0] == '<' {
|
||||
return getUriPart(str[1:])
|
||||
} else if str[0] == '"' {
|
||||
return getQuotedPart(str[1:])
|
||||
} else if str[0] == '.' {
|
||||
return nil, str
|
||||
return "", str
|
||||
} else {
|
||||
// Technically not part of the spec. But we do it anyway for convenience.
|
||||
return getUnquotedPart(str)
|
||||
}
|
||||
}
|
||||
|
||||
func getUriPart(str string) (*string, string) {
|
||||
func getUriPart(str string) (head, tail string) {
|
||||
i := 0
|
||||
for i < len(str) && str[i] != '>' {
|
||||
i += 1
|
||||
}
|
||||
if i == len(str) {
|
||||
return nil, str
|
||||
return "", str
|
||||
}
|
||||
part := str[0:i]
|
||||
return &part, str[i+1:]
|
||||
head = str[0:i]
|
||||
return head, str[i+1:]
|
||||
}
|
||||
|
||||
func getQuotedPart(str string) (*string, string) {
|
||||
i := 0
|
||||
start := 0
|
||||
out := ""
|
||||
func getQuotedPart(str string) (head, tail string) {
|
||||
var (
|
||||
i int
|
||||
start int
|
||||
)
|
||||
for i < len(str) && str[i] != '"' {
|
||||
if str[i] == '\\' {
|
||||
out += str[start:i]
|
||||
head += str[start:i]
|
||||
switch str[i+1] {
|
||||
case '\\':
|
||||
out += "\\"
|
||||
head += "\\"
|
||||
case 'r':
|
||||
out += "\r"
|
||||
head += "\r"
|
||||
case 'n':
|
||||
out += "\n"
|
||||
head += "\n"
|
||||
case 't':
|
||||
out += "\t"
|
||||
head += "\t"
|
||||
case '"':
|
||||
out += "\""
|
||||
head += "\""
|
||||
default:
|
||||
return nil, str
|
||||
return "", str
|
||||
}
|
||||
i += 2
|
||||
start = i
|
||||
|
|
@ -127,70 +130,74 @@ func getQuotedPart(str string) (*string, string) {
|
|||
i += 1
|
||||
}
|
||||
if i == len(str) {
|
||||
return nil, str
|
||||
return "", str
|
||||
}
|
||||
out += str[start:i]
|
||||
head += str[start:i]
|
||||
i += 1
|
||||
var remainder string
|
||||
if strings.HasPrefix(str[i:], "^^<") {
|
||||
switch {
|
||||
case strings.HasPrefix(str[i:], "^^<"):
|
||||
// Ignore type, for now
|
||||
_, remainder = getUriPart(str[i+3:])
|
||||
} else if strings.HasPrefix(str[i:], "@") {
|
||||
_, remainder = getUnquotedPart(str[i+1:])
|
||||
} else {
|
||||
remainder = str[i:]
|
||||
_, tail = getUriPart(str[i+3:])
|
||||
case str[i] == '@':
|
||||
_, tail = getUnquotedPart(str[i+1:])
|
||||
default:
|
||||
tail = str[i:]
|
||||
}
|
||||
|
||||
return &out, remainder
|
||||
return head, tail
|
||||
}
|
||||
|
||||
func getUnquotedPart(str string) (*string, string) {
|
||||
i := 0
|
||||
initStr := str
|
||||
out := ""
|
||||
start := 0
|
||||
for i < len(str) && !isWhitespace(str[i]) {
|
||||
func getUnquotedPart(str string) (head, tail string) {
|
||||
var (
|
||||
i int
|
||||
initStr = str
|
||||
start int
|
||||
)
|
||||
for i < len(str) && !isSpace(str[i]) {
|
||||
if str[i] == '"' {
|
||||
part, remainder := getQuotedPart(str[i+1:])
|
||||
if part == nil {
|
||||
if part == "" {
|
||||
return part, initStr
|
||||
}
|
||||
out += str[start:i]
|
||||
head += str[start:i]
|
||||
str = remainder
|
||||
i = 0
|
||||
start = 0
|
||||
out += *part
|
||||
head += part
|
||||
}
|
||||
i += 1
|
||||
}
|
||||
out += str[start:i]
|
||||
return &out, str[i:]
|
||||
head += str[start:i]
|
||||
return head, str[i:]
|
||||
}
|
||||
|
||||
func ReadNQuadsFromReader(c chan *graph.Triple, reader io.Reader) {
|
||||
bf := bufio.NewReader(reader)
|
||||
type Decoder struct {
|
||||
r *bufio.Reader
|
||||
line []byte
|
||||
}
|
||||
|
||||
nTriples := 0
|
||||
line := ""
|
||||
func NewDecoder(r io.Reader) *Decoder {
|
||||
return &Decoder{r: bufio.NewReader(r)}
|
||||
}
|
||||
|
||||
func (dec *Decoder) Unmarshal() (*graph.Triple, error) {
|
||||
dec.line = dec.line[:0]
|
||||
for {
|
||||
l, pre, err := bf.ReadLine()
|
||||
if err == io.EOF {
|
||||
l, pre, err := dec.r.ReadLine()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
dec.line = append(dec.line, l...)
|
||||
if !pre {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
glog.Fatalln("Something bad happened while reading file " + err.Error())
|
||||
}
|
||||
line += string(l)
|
||||
if pre {
|
||||
continue
|
||||
}
|
||||
triple := Parse(line)
|
||||
line = ""
|
||||
if triple != nil {
|
||||
nTriples++
|
||||
c <- triple
|
||||
}
|
||||
}
|
||||
glog.Infoln("Read", nTriples, "triples")
|
||||
close(c)
|
||||
triple, err := Parse(string(dec.line))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse %q: %v", dec.line, err)
|
||||
}
|
||||
if triple == nil {
|
||||
return dec.Unmarshal()
|
||||
}
|
||||
return triple, nil
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue