From ab3f59d21fed1ade9e70e83b385b94c0d0ebf725 Mon Sep 17 00:00:00 2001 From: Barak Michener Date: Fri, 7 Aug 2015 14:35:24 -0400 Subject: [PATCH] Add hash-based indexes --- graph/sql/quadstore.go | 71 ++++++++++++++++++++++++++++++-------- graph/sql/sql_iterator.go | 2 +- graph/sql/sql_link_iterator.go | 44 ++++++++++++----------- graph/sql/sql_node_intersection.go | 13 ++++--- graph/sql/sql_node_iterator.go | 9 +++-- 5 files changed, 95 insertions(+), 44 deletions(-) diff --git a/graph/sql/quadstore.go b/graph/sql/quadstore.go index 3181f2b..4c708cb 100644 --- a/graph/sql/quadstore.go +++ b/graph/sql/quadstore.go @@ -1,8 +1,12 @@ package sql import ( + "crypto/sha1" "database/sql" + "encoding/hex" "fmt" + "hash" + "sync" "github.com/lib/pq" @@ -18,6 +22,13 @@ func init() { graph.RegisterQuadStore(QuadStoreType, true, newQuadStore, createSQLTables, nil) } +var ( + hashPool = sync.Pool{ + New: func() interface{} { return sha1.New() }, + } + hashSize = sha1.Size +) + type QuadStore struct { db *sql.DB sqlFlavor string @@ -55,7 +66,11 @@ func createSQLTables(addr string, options graph.Options) error { horizon BIGSERIAL PRIMARY KEY, id BIGINT, ts timestamp, - UNIQUE(subject, predicate, object, label) + subject_hash TEXT NOT NULL, + predicate_hash TEXT NOT NULL, + object_hash TEXT NOT NULL, + label_hash TEXT, + UNIQUE(subject_hash, predicate_hash, object_hash, label_hash) );`) if err != nil { glog.Errorf("Cannot create quad table: %v", quadTable) @@ -73,17 +88,11 @@ func createSQLTables(addr string, options graph.Options) error { CREATE INDEX pos_index ON quads USING brin(predicate) WITH (pages_per_range = 32); CREATE INDEX osp_index ON quads USING brin(object) WITH (pages_per_range = 32); `) - } else if idxStrat == "prefix" { - index, err = tx.Exec(fmt.Sprintf(` - CREATE INDEX spo_index ON quads (substr(subject, 0, 8)) WITH (FILLFACTOR = %d); - CREATE INDEX pos_index ON quads (substr(predicate, 0, 8)) WITH (FILLFACTOR = %d); - CREATE INDEX osp_index ON quads (substr(object, 0, 8)) WITH (FILLFACTOR = %d); - `, factor, factor, factor)) } else { index, err = tx.Exec(fmt.Sprintf(` - CREATE INDEX spo_index ON quads (subject, predicate, object) WITH (FILLFACTOR = %d); - CREATE INDEX pos_index ON quads (predicate, object, subject) WITH (FILLFACTOR = %d); - CREATE INDEX osp_index ON quads (object, subject, predicate) WITH (FILLFACTOR = %d); + CREATE INDEX spo_index ON quads (subject_hash) WITH (FILLFACTOR = %d); + CREATE INDEX pos_index ON quads (predicate_hash) WITH (FILLFACTOR = %d); + CREATE INDEX osp_index ON quads (object_hash) WITH (FILLFACTOR = %d); `, factor, factor, factor)) } if err != nil { @@ -107,13 +116,34 @@ func newQuadStore(addr string, options graph.Options) (graph.QuadStore, error) { return &qs, nil } +func hashOf(s string) string { + h := hashPool.Get().(hash.Hash) + h.Reset() + defer hashPool.Put(h) + key := make([]byte, 0, hashSize) + h.Write([]byte(s)) + key = h.Sum(key) + return hex.EncodeToString(key) +} + func (qs *QuadStore) copyFrom(tx *sql.Tx, in []graph.Delta) error { - stmt, err := tx.Prepare(pq.CopyIn("quads", "subject", "predicate", "object", "label", "id", "ts")) + stmt, err := tx.Prepare(pq.CopyIn("quads", "subject", "predicate", "object", "label", "id", "ts", "subject_hash", "predicate_hash", "object_hash", "label_hash")) if err != nil { return err } for _, d := range in { - _, err := stmt.Exec(d.Quad.Subject, d.Quad.Predicate, d.Quad.Object, d.Quad.Label, d.ID.Int(), d.Timestamp) + _, err := stmt.Exec( + d.Quad.Subject, + d.Quad.Predicate, + d.Quad.Object, + d.Quad.Label, + d.ID.Int(), + d.Timestamp, + hashOf(d.Quad.Subject), + hashOf(d.Quad.Predicate), + hashOf(d.Quad.Object), + hashOf(d.Quad.Label), + ) if err != nil { glog.Errorf("couldn't prepare COPY statement: %v", err) return err @@ -137,7 +167,7 @@ func (qs *QuadStore) buildTxPostgres(tx *sql.Tx, in []graph.Delta) error { return qs.copyFrom(tx, in) } - insert, err := tx.Prepare(`INSERT INTO quads(subject, predicate, object, label, id, ts) VALUES ($1, $2, $3, $4, $5, $6)`) + insert, err := tx.Prepare(`INSERT INTO quads(subject, predicate, object, label, id, ts, subject_hash, predicate_hash, object_hash, label_hash) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)`) if err != nil { glog.Errorf("Cannot prepare insert statement: %v", err) return err @@ -145,7 +175,18 @@ func (qs *QuadStore) buildTxPostgres(tx *sql.Tx, in []graph.Delta) error { for _, d := range in { switch d.Action { case graph.Add: - _, err := insert.Exec(d.Quad.Subject, d.Quad.Predicate, d.Quad.Object, d.Quad.Label, d.ID.Int(), d.Timestamp) + _, err := insert.Exec( + d.Quad.Subject, + d.Quad.Predicate, + d.Quad.Object, + d.Quad.Label, + d.ID.Int(), + d.Timestamp, + hashOf(d.Quad.Subject), + hashOf(d.Quad.Predicate), + hashOf(d.Quad.Object), + hashOf(d.Quad.Label), + ) if err != nil { glog.Errorf("couldn't prepare INSERT statement: %v", err) return err @@ -271,7 +312,7 @@ func (qs *QuadStore) sizeForIterator(isAll bool, dir quad.Direction, val string) var size int64 glog.V(4).Infoln("sql: getting size for select %s, %s", dir.String(), val) err = qs.db.QueryRow( - fmt.Sprintf("SELECT count(*) FROM quads WHERE %s = $1;", dir.String()), val).Scan(&size) + fmt.Sprintf("SELECT count(*) FROM quads WHERE %s_hash = $1;", dir.String()), hashOf(val)).Scan(&size) if err != nil { glog.Errorln("Error getting size from SQL database: %v", err) return 0 diff --git a/graph/sql/sql_iterator.go b/graph/sql/sql_iterator.go index 74ca0c2..3f2d758 100644 --- a/graph/sql/sql_iterator.go +++ b/graph/sql/sql_iterator.go @@ -290,7 +290,7 @@ func (it *SQLIterator) makeCursor(next bool, value graph.Value) error { } var q string var values []string - q, values = it.sql.buildSQL(next, value) + q, values = it.sql.buildSQL(next, value, false) q = convertToPostgres(q, values) ivalues := make([]interface{}, 0, len(values)) for _, v := range values { diff --git a/graph/sql/sql_link_iterator.go b/graph/sql/sql_link_iterator.go index 8e7a805..fc6d02e 100644 --- a/graph/sql/sql_link_iterator.go +++ b/graph/sql/sql_link_iterator.go @@ -50,11 +50,11 @@ type tagDir struct { func (t tagDir) String() string { if t.dir == quad.Any { if t.justLocal { - return fmt.Sprintf("%s.__execd as %s", t.table, t.tag) + return fmt.Sprintf("%s.__execd as %s, %s.__execd_hash as %s_hash", t.table, t.tag, t.table, t.tag) } - return fmt.Sprintf("%s.%s as %s", t.table, t.tag, t.tag) + return fmt.Sprintf("%s.%s as %s, %s.%s_hash as %s_hash", t.table, t.tag, t.tag, t.table, t.tag, t.tag) } - return fmt.Sprintf("%s.%s as %s", t.table, t.dir, t.tag) + return fmt.Sprintf("%s.%s as %s, %s.%s_hash as %s_hash", t.table, t.dir, t.tag, t.table, t.dir, t.tag) } type tableDef struct { @@ -71,7 +71,7 @@ type sqlItDir struct { type sqlIterator interface { sqlClone() sqlIterator - buildSQL(next bool, val graph.Value) (string, []string) + buildSQL(next bool, val graph.Value, hash bool) (string, []string) getTables() []tableDef getTags() []tagDir buildWhere() (string, []string) @@ -219,8 +219,8 @@ func (l *SQLLinkIterator) buildWhere() (string, []string) { var q []string var vals []string for _, c := range l.constraints { - q = append(q, fmt.Sprintf("%s.%s = ?", l.tableName, c.dir)) - vals = append(vals, c.vals[0]) + q = append(q, fmt.Sprintf("%s.%s_hash = ?", l.tableName, c.dir)) + vals = append(vals, hashOf(c.vals[0])) } for _, i := range l.nodeIts { t := i.it.tableID() @@ -228,7 +228,7 @@ func (l *SQLLinkIterator) buildWhere() (string, []string) { if t.dir == quad.Any { dir = t.tag } - q = append(q, fmt.Sprintf("%s.%s = %s.%s", l.tableName, i.dir, t.table, dir)) + q = append(q, fmt.Sprintf("%s.%s_hash = %s.%s_hash", l.tableName, i.dir, t.table, dir)) } for _, i := range l.nodeIts { s, v := i.it.buildWhere() @@ -246,13 +246,17 @@ func (l *SQLLinkIterator) tableID() tagDir { } } -func (l *SQLLinkIterator) buildSQL(next bool, val graph.Value) (string, []string) { +func (l *SQLLinkIterator) buildSQL(next bool, val graph.Value, hash bool) (string, []string) { query := "SELECT DISTINCT " + hashs := "" + if hash { + hashs = "_hash" + } t := []string{ - fmt.Sprintf("%s.subject", l.tableName), - fmt.Sprintf("%s.predicate", l.tableName), - fmt.Sprintf("%s.object", l.tableName), - fmt.Sprintf("%s.label", l.tableName), + fmt.Sprintf("%s.subject%s", l.tableName, hashs), + fmt.Sprintf("%s.predicate%s", l.tableName, hashs), + fmt.Sprintf("%s.object%s", l.tableName, hashs), + fmt.Sprintf("%s.label%s", l.tableName, hashs), } for _, v := range l.getTags() { t = append(t, v.String()) @@ -276,16 +280,16 @@ func (l *SQLLinkIterator) buildSQL(next bool, val graph.Value) (string, []string constraint += " AND " } t = []string{ - fmt.Sprintf("%s.subject = ?", l.tableName), - fmt.Sprintf("%s.predicate = ?", l.tableName), - fmt.Sprintf("%s.object = ?", l.tableName), - fmt.Sprintf("%s.label = ?", l.tableName), + fmt.Sprintf("%s.subject_hash = ?", l.tableName), + fmt.Sprintf("%s.predicate_hash = ?", l.tableName), + fmt.Sprintf("%s.object_hash = ?", l.tableName), + fmt.Sprintf("%s.label_hash = ?", l.tableName), } constraint += strings.Join(t, " AND ") - values = append(values, v.Subject) - values = append(values, v.Predicate) - values = append(values, v.Object) - values = append(values, v.Label) + values = append(values, hashOf(v.Subject)) + values = append(values, hashOf(v.Predicate)) + values = append(values, hashOf(v.Object)) + values = append(values, hashOf(v.Label)) } query += constraint query += ";" diff --git a/graph/sql/sql_node_intersection.go b/graph/sql/sql_node_intersection.go index d010d66..807df72 100644 --- a/graph/sql/sql_node_intersection.go +++ b/graph/sql/sql_node_intersection.go @@ -69,6 +69,9 @@ func (n *SQLNodeIntersection) Describe() string { func (n *SQLNodeIntersection) buildResult(result []string, cols []string) map[string]string { m := make(map[string]string) for i, c := range cols { + if strings.HasSuffix(c, "_hash") { + continue + } if c == "__execd" { n.result = result[i] } @@ -100,7 +103,7 @@ func (n *SQLNodeIntersection) buildSubqueries() []tableDef { for i, it := range n.nodeIts { var td tableDef var table string - table, td.values = it.buildSQL(true, nil) + table, td.values = it.buildSQL(true, nil, true) td.table = fmt.Sprintf("\n(%s)", table[:len(table)-1]) td.name = n.nodetables[i] out = append(out, td) @@ -150,13 +153,13 @@ func (n *SQLNodeIntersection) buildWhere() (string, []string) { var q []string var vals []string for _, tb := range n.nodetables[1:] { - q = append(q, fmt.Sprintf("%s.__execd = %s.__execd", n.nodetables[0], tb)) + q = append(q, fmt.Sprintf("%s.__execd_hash = %s.__execd_hash", n.nodetables[0], tb)) } query := strings.Join(q, " AND ") return query, vals } -func (n *SQLNodeIntersection) buildSQL(next bool, val graph.Value) (string, []string) { +func (n *SQLNodeIntersection) buildSQL(next bool, val graph.Value, _ bool) (string, []string) { topData := n.tableID() tags := []tagDir{topData} tags = append(tags, n.getTags()...) @@ -184,8 +187,8 @@ func (n *SQLNodeIntersection) buildSQL(next bool, val graph.Value) (string, []st if constraint != "" { constraint += " AND " } - constraint += fmt.Sprintf("%s.%s = ?", topData.table, topData.dir) - values = append(values, v) + constraint += fmt.Sprintf("%s.%s_hash = ?", topData.table, topData.dir) + values = append(values, hashOf(v)) } query += constraint query += ";" diff --git a/graph/sql/sql_node_iterator.go b/graph/sql/sql_node_iterator.go index 3d9ec21..55eb211 100644 --- a/graph/sql/sql_node_iterator.go +++ b/graph/sql/sql_node_iterator.go @@ -89,6 +89,9 @@ func (n *SQLNodeIterator) Describe() string { func (n *SQLNodeIterator) buildResult(result []string, cols []string) map[string]string { m := make(map[string]string) for i, c := range cols { + if strings.HasSuffix(c, "_hash") { + continue + } if c == "__execd" { n.result = result[i] } @@ -157,7 +160,7 @@ func (n *SQLNodeIterator) buildWhere() (string, []string) { return query, vals } -func (n *SQLNodeIterator) buildSQL(next bool, val graph.Value) (string, []string) { +func (n *SQLNodeIterator) buildSQL(next bool, val graph.Value, _ bool) (string, []string) { topData := n.tableID() tags := []tagDir{topData} tags = append(tags, n.getTags()...) @@ -185,8 +188,8 @@ func (n *SQLNodeIterator) buildSQL(next bool, val graph.Value) (string, []string if constraint != "" { constraint += " AND " } - constraint += fmt.Sprintf("%s.%s = ?", topData.table, topData.dir) - values = append(values, v) + constraint += fmt.Sprintf("%s.%s_hash = ?", topData.table, topData.dir) + values = append(values, hashOf(v)) } query += constraint query += ";"