From 09244ddd38bbfad7bebd3ca35db7b54975a9a84c Mon Sep 17 00:00:00 2001 From: Barak Michener Date: Wed, 6 Aug 2014 02:55:36 -0400 Subject: [PATCH 1/5] materialize implementation and and optimization --- graph/iterator.go | 15 ++ graph/iterator/and_iterator_optimize.go | 17 +++ graph/iterator/materialize_iterator.go | 235 ++++++++++++++++++++++++++++++++ 3 files changed, 267 insertions(+) create mode 100644 graph/iterator/materialize_iterator.go diff --git a/graph/iterator.go b/graph/iterator.go index 972f334..e7c2ad8 100644 --- a/graph/iterator.go +++ b/graph/iterator.go @@ -153,6 +153,19 @@ func Next(it Iterator) (Value, bool) { return nil, false } +// Height is a convienence function to measure the height of an iterator tree. +func Height(it Iterator) int { + subs := it.SubIterators() + maxDepth := 0 + for _, sub := range subs { + h := Height(sub) + if h > maxDepth { + maxDepth = h + } + } + return maxDepth + 1 +} + // FixedIterator wraps iterators that are modifiable by addition of fixed value sets. type FixedIterator interface { Iterator @@ -180,6 +193,7 @@ const ( Fixed Not Optional + Materialize ) var ( @@ -200,6 +214,7 @@ var ( "fixed", "not", "optional", + "materialize", } ) diff --git a/graph/iterator/and_iterator_optimize.go b/graph/iterator/and_iterator_optimize.go index 92b6b41..f0adfad 100644 --- a/graph/iterator/and_iterator_optimize.go +++ b/graph/iterator/and_iterator_optimize.go @@ -70,6 +70,8 @@ func (it *And) Optimize() (graph.Iterator, bool) { // now a permutation of itself, but the contents are unchanged. its = optimizeOrder(its) + its = materializeIts(its) + // Okay! At this point we have an optimized order. // The easiest thing to do at this point is merely to create a new And iterator @@ -293,6 +295,21 @@ func hasOneUsefulIterator(its []graph.Iterator) graph.Iterator { return nil } +func materializeIts(its []graph.Iterator) []graph.Iterator { + var out []graph.Iterator + for _, it := range its { + stats := it.Stats() + if stats.Size*stats.NextCost < stats.ContainsCost { + if graph.Height(it) > 10 { + out = append(out, NewMaterialize(it)) + continue + } + } + out = append(out, it) + } + return out +} + // and.Stats() lives here in and-iterator-optimize.go because it may // in the future return different statistics based on how it is optimized. // For now, however, it's pretty static. diff --git a/graph/iterator/materialize_iterator.go b/graph/iterator/materialize_iterator.go new file mode 100644 index 0000000..4180a0b --- /dev/null +++ b/graph/iterator/materialize_iterator.go @@ -0,0 +1,235 @@ +// Copyright 2014 The Cayley Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package iterator + +// A simple iterator that, when first called Contains() or Next() upon, materializes the whole subiterator, stores it locally, and responds. Essentially a cache. + +import ( + "fmt" + "strings" + + "github.com/google/cayley/graph" +) + +var abortMaterializeAt = 1000 + +type result struct { + id graph.Value + tags map[string]graph.Value +} + +type Materialize struct { + uid uint64 + tags graph.Tagger + containsMap map[graph.Value]int + values []result + lastIndex int + subIt graph.Iterator + hasRun bool + aborted bool +} + +func NewMaterialize(sub graph.Iterator) *Materialize { + return &Materialize{ + uid: NextUID(), + containsMap: make(map[graph.Value]int), + subIt: sub, + } +} + +func (it *Materialize) UID() uint64 { + return it.uid +} + +func (it *Materialize) Reset() { + it.subIt.Reset() + it.lastIndex = 0 +} + +func (it *Materialize) Close() { + it.subIt.Close() + it.containsMap = nil + it.values = nil + it.hasRun = false +} + +func (it *Materialize) Tagger() *graph.Tagger { + return &it.tags +} + +func (it *Materialize) TagResults(dst map[string]graph.Value) { + if !it.hasRun { + return + } + for _, tag := range it.tags.Tags() { + dst[tag] = it.Result() + } + + for tag, value := range it.values[it.lastIndex].tags { + dst[tag] = value + } +} + +func (it *Materialize) Clone() graph.Iterator { + out := NewMaterialize(it.subIt.Clone()) + out.tags.CopyFrom(it) + return out +} + +// Print some information about the iterator. +func (it *Materialize) DebugString(indent int) string { + return fmt.Sprintf("%s(%s tags: %s Size: %d\n%s)", + strings.Repeat(" ", indent), + it.Type(), + it.tags.Tags(), + len(it.values), + it.subIt.DebugString(indent+4), + ) +} + +// Register this iterator as a Materialize iterator. +func (it *Materialize) Type() graph.Type { return graph.Materialize } + +// DEPRECATED +func (it *Materialize) ResultTree() *graph.ResultTree { + tree := graph.NewResultTree(it.Result()) + tree.AddSubtree(it.subIt.ResultTree()) + return tree +} + +func (it *Materialize) Result() graph.Value { + if it.lastIndex+1 > len(it.values) { + return nil + } + return it.values[it.lastIndex].id +} + +func (it *Materialize) SubIterators() []graph.Iterator { + return []graph.Iterator{it.subIt} +} + +func (it *Materialize) Optimize() (graph.Iterator, bool) { + newSub, changed := it.subIt.Optimize() + if changed { + it.subIt = newSub + if it.subIt.Type() == graph.Null { + return it.subIt, true + } + } + return it, false +} + +// Size is the number of values stored, if we've got them all. +// Otherwise, guess based on the size of the subiterator. +func (it *Materialize) Size() (int64, bool) { + if it.hasRun { + return int64(len(it.values)), true + } + return it.subIt.Size() +} + +// The entire point of Materialize is to amortize the cost by +// putting it all up front. +func (it *Materialize) Stats() graph.IteratorStats { + overhead := int64(2) + size, _ := it.Size() + subitStats := it.subIt.Stats() + return graph.IteratorStats{ + ContainsCost: overhead * subitStats.NextCost, + NextCost: overhead * subitStats.NextCost, + Size: size, + } +} + +func (it *Materialize) Next() (graph.Value, bool) { + if !it.hasRun { + it.materializeSet() + } + if it.aborted { + return graph.Next(it.subIt) + } + + lastVal := it.Result() + for it.lastIndex < len(it.values) { + it.lastIndex++ + if it.Result() != lastVal { + return it.Result(), true + } + } + return nil, false +} + +func (it *Materialize) Contains(v graph.Value) bool { + if !it.hasRun { + it.materializeSet() + } + if it.aborted { + return it.subIt.Contains(v) + } + if i, ok := it.containsMap[v]; ok { + it.lastIndex = i + return true + } + return false +} + +func (it *Materialize) NextResult() bool { + if !it.hasRun { + it.materializeSet() + } + if it.aborted { + return it.subIt.NextResult() + } + + i := it.lastIndex + 1 + if i == len(it.values) { + return false + } + if it.Result() == it.values[i].id { + it.lastIndex = i + return true + } + return false +} + +func (it *Materialize) materializeSet() { + i := 0 + for { + val, ok := graph.Next(it.subIt) + if !ok { + break + } + i += 1 + if i > abortMaterializeAt { + it.aborted = true + break + } + tags := make(map[string]graph.Value) + it.subIt.TagResults(tags) + it.containsMap[val] = len(it.values) + it.values = append(it.values, result{id: val, tags: tags}) + for it.subIt.NextResult() == true { + tags := make(map[string]graph.Value) + it.subIt.TagResults(tags) + it.values = append(it.values, result{id: val, tags: tags}) + } + } + if it.aborted { + it.values = nil + it.containsMap = nil + it.subIt.Reset() + } + it.hasRun = true +} From 24f57df85932f0a57186030418ac7df45bcbc9a0 Mon Sep 17 00:00:00 2001 From: Barak Michener Date: Wed, 6 Aug 2014 03:20:33 -0400 Subject: [PATCH 2/5] fix overshoot and optimize better --- cayley_test.go | 2 +- graph/iterator.go | 7 +++++-- graph/iterator/and_iterator_optimize.go | 2 +- graph/iterator/materialize_iterator.go | 20 ++++++++++++++------ 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/cayley_test.go b/cayley_test.go index b94389a..e941071 100644 --- a/cayley_test.go +++ b/cayley_test.go @@ -348,7 +348,7 @@ func TestQueries(t *testing.T) { // TODO(kortschak) Be more rigorous in this result validation. if len(got) != len(test.expect) { - t.Errorf("Unexpected number of results, got:%d expect:%d.", len(got), len(test.expect)) + t.Errorf("Unexpected number of results, got:%d expect:%d on %s.", len(got), len(test.expect), test.message) } } } diff --git a/graph/iterator.go b/graph/iterator.go index e7c2ad8..3880b80 100644 --- a/graph/iterator.go +++ b/graph/iterator.go @@ -154,11 +154,14 @@ func Next(it Iterator) (Value, bool) { } // Height is a convienence function to measure the height of an iterator tree. -func Height(it Iterator) int { +func Height(it Iterator, until Type) int { + if it.Type() == until { + return 1 + } subs := it.SubIterators() maxDepth := 0 for _, sub := range subs { - h := Height(sub) + h := Height(sub, until) if h > maxDepth { maxDepth = h } diff --git a/graph/iterator/and_iterator_optimize.go b/graph/iterator/and_iterator_optimize.go index f0adfad..774904d 100644 --- a/graph/iterator/and_iterator_optimize.go +++ b/graph/iterator/and_iterator_optimize.go @@ -300,7 +300,7 @@ func materializeIts(its []graph.Iterator) []graph.Iterator { for _, it := range its { stats := it.Stats() if stats.Size*stats.NextCost < stats.ContainsCost { - if graph.Height(it) > 10 { + if graph.Height(it, graph.Materialize) > 10 { out = append(out, NewMaterialize(it)) continue } diff --git a/graph/iterator/materialize_iterator.go b/graph/iterator/materialize_iterator.go index 4180a0b..0336080 100644 --- a/graph/iterator/materialize_iterator.go +++ b/graph/iterator/materialize_iterator.go @@ -73,10 +73,16 @@ func (it *Materialize) TagResults(dst map[string]graph.Value) { if !it.hasRun { return } + if it.aborted { + it.subIt.TagResults(dst) + return + } + if it.lastIndex > len(it.values) { + return + } for _, tag := range it.tags.Tags() { dst[tag] = it.Result() } - for tag, value := range it.values[it.lastIndex].tags { dst[tag] = value } @@ -154,6 +160,7 @@ func (it *Materialize) Stats() graph.IteratorStats { } func (it *Materialize) Next() (graph.Value, bool) { + graph.NextLogIn(it) if !it.hasRun { it.materializeSet() } @@ -164,14 +171,15 @@ func (it *Materialize) Next() (graph.Value, bool) { lastVal := it.Result() for it.lastIndex < len(it.values) { it.lastIndex++ - if it.Result() != lastVal { - return it.Result(), true + if it.Result() != lastVal && it.Result() != nil { + return graph.NextLogOut(it, it.Result(), true) } } - return nil, false + return graph.NextLogOut(it, nil, false) } func (it *Materialize) Contains(v graph.Value) bool { + graph.ContainsLogIn(it, v) if !it.hasRun { it.materializeSet() } @@ -180,9 +188,9 @@ func (it *Materialize) Contains(v graph.Value) bool { } if i, ok := it.containsMap[v]; ok { it.lastIndex = i - return true + return graph.ContainsLogOut(it, v, true) } - return false + return graph.ContainsLogOut(it, v, false) } func (it *Materialize) NextResult() bool { From d10239483672aae49f345483139d02f27f9d887d Mon Sep 17 00:00:00 2001 From: Barak Michener Date: Wed, 6 Aug 2014 03:44:01 -0400 Subject: [PATCH 3/5] bounds errors --- graph/iterator/materialize_iterator.go | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/graph/iterator/materialize_iterator.go b/graph/iterator/materialize_iterator.go index 0336080..8434bb5 100644 --- a/graph/iterator/materialize_iterator.go +++ b/graph/iterator/materialize_iterator.go @@ -20,6 +20,8 @@ import ( "fmt" "strings" + "github.com/barakmich/glog" + "github.com/google/cayley/graph" ) @@ -35,7 +37,7 @@ type Materialize struct { tags graph.Tagger containsMap map[graph.Value]int values []result - lastIndex int + index int subIt graph.Iterator hasRun bool aborted bool @@ -46,6 +48,7 @@ func NewMaterialize(sub graph.Iterator) *Materialize { uid: NextUID(), containsMap: make(map[graph.Value]int), subIt: sub, + index: -1, } } @@ -55,7 +58,7 @@ func (it *Materialize) UID() uint64 { func (it *Materialize) Reset() { it.subIt.Reset() - it.lastIndex = 0 + it.index = -1 } func (it *Materialize) Close() { @@ -77,13 +80,13 @@ func (it *Materialize) TagResults(dst map[string]graph.Value) { it.subIt.TagResults(dst) return } - if it.lastIndex > len(it.values) { + if it.Result() == nil { return } for _, tag := range it.tags.Tags() { dst[tag] = it.Result() } - for tag, value := range it.values[it.lastIndex].tags { + for tag, value := range it.values[it.index].tags { dst[tag] = value } } @@ -116,10 +119,16 @@ func (it *Materialize) ResultTree() *graph.ResultTree { } func (it *Materialize) Result() graph.Value { - if it.lastIndex+1 > len(it.values) { + if len(it.values) == 0 { return nil } - return it.values[it.lastIndex].id + if it.index == -1 { + return nil + } + if it.index >= len(it.values) { + return nil + } + return it.values[it.index].id } func (it *Materialize) SubIterators() []graph.Iterator { @@ -169,8 +178,8 @@ func (it *Materialize) Next() (graph.Value, bool) { } lastVal := it.Result() - for it.lastIndex < len(it.values) { - it.lastIndex++ + for it.index < len(it.values) { + it.index++ if it.Result() != lastVal && it.Result() != nil { return graph.NextLogOut(it, it.Result(), true) } @@ -187,7 +196,7 @@ func (it *Materialize) Contains(v graph.Value) bool { return it.subIt.Contains(v) } if i, ok := it.containsMap[v]; ok { - it.lastIndex = i + it.index = i return graph.ContainsLogOut(it, v, true) } return graph.ContainsLogOut(it, v, false) @@ -201,12 +210,12 @@ func (it *Materialize) NextResult() bool { return it.subIt.NextResult() } - i := it.lastIndex + 1 + i := it.index + 1 if i == len(it.values) { return false } if it.Result() == it.values[i].id { - it.lastIndex = i + it.index = i return true } return false @@ -239,5 +248,6 @@ func (it *Materialize) materializeSet() { it.containsMap = nil it.subIt.Reset() } + glog.Infof("Materialization List %d: %#v", it.values) it.hasRun = true } From 76efc2fcb7cb5d1936e0a89b102e01d43dbab95c Mon Sep 17 00:00:00 2001 From: Barak Michener Date: Wed, 6 Aug 2014 04:07:30 -0400 Subject: [PATCH 4/5] redo data structure for sensibility --- graph/iterator/materialize_iterator.go | 42 ++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/graph/iterator/materialize_iterator.go b/graph/iterator/materialize_iterator.go index 8434bb5..2e41969 100644 --- a/graph/iterator/materialize_iterator.go +++ b/graph/iterator/materialize_iterator.go @@ -36,8 +36,9 @@ type Materialize struct { uid uint64 tags graph.Tagger containsMap map[graph.Value]int - values []result + values [][]result index int + subindex int subIt graph.Iterator hasRun bool aborted bool @@ -86,7 +87,7 @@ func (it *Materialize) TagResults(dst map[string]graph.Value) { for _, tag := range it.tags.Tags() { dst[tag] = it.Result() } - for tag, value := range it.values[it.index].tags { + for tag, value := range it.values[it.index][it.subindex].tags { dst[tag] = value } } @@ -128,7 +129,7 @@ func (it *Materialize) Result() graph.Value { if it.index >= len(it.values) { return nil } - return it.values[it.index].id + return it.values[it.index][it.subindex].id } func (it *Materialize) SubIterators() []graph.Iterator { @@ -177,14 +178,12 @@ func (it *Materialize) Next() (graph.Value, bool) { return graph.Next(it.subIt) } - lastVal := it.Result() - for it.index < len(it.values) { - it.index++ - if it.Result() != lastVal && it.Result() != nil { - return graph.NextLogOut(it, it.Result(), true) - } + it.index++ + it.subindex = 0 + if it.index >= len(it.values) { + return graph.NextLogOut(it, nil, false) } - return graph.NextLogOut(it, nil, false) + return graph.NextLogOut(it, it.Result(), true) } func (it *Materialize) Contains(v graph.Value) bool { @@ -197,6 +196,7 @@ func (it *Materialize) Contains(v graph.Value) bool { } if i, ok := it.containsMap[v]; ok { it.index = i + it.subindex = 0 return graph.ContainsLogOut(it, v, true) } return graph.ContainsLogOut(it, v, false) @@ -210,15 +210,13 @@ func (it *Materialize) NextResult() bool { return it.subIt.NextResult() } - i := it.index + 1 - if i == len(it.values) { + it.subindex++ + if it.subindex >= len(it.values[it.index]) { + // Don't go off the end of the world + it.subindex-- return false } - if it.Result() == it.values[i].id { - it.index = i - return true - } - return false + return true } func (it *Materialize) materializeSet() { @@ -233,14 +231,18 @@ func (it *Materialize) materializeSet() { it.aborted = true break } + if _, ok := it.containsMap[val]; !ok { + it.containsMap[val] = len(it.values) + it.values = append(it.values, nil) + } + index := it.containsMap[val] tags := make(map[string]graph.Value) it.subIt.TagResults(tags) - it.containsMap[val] = len(it.values) - it.values = append(it.values, result{id: val, tags: tags}) + it.values[index] = append(it.values[index], result{id: val, tags: tags}) for it.subIt.NextResult() == true { tags := make(map[string]graph.Value) it.subIt.TagResults(tags) - it.values = append(it.values, result{id: val, tags: tags}) + it.values[index] = append(it.values[index], result{id: val, tags: tags}) } } if it.aborted { From 2bec255b52cbf113269a1f71849f673296261b5b Mon Sep 17 00:00:00 2001 From: Barak Michener Date: Wed, 6 Aug 2014 14:37:37 -0400 Subject: [PATCH 5/5] Copy refs and comment on Value --- graph/iterator/materialize_iterator.go | 6 ++++++ graph/triplestore.go | 9 ++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/graph/iterator/materialize_iterator.go b/graph/iterator/materialize_iterator.go index 2e41969..1b448b3 100644 --- a/graph/iterator/materialize_iterator.go +++ b/graph/iterator/materialize_iterator.go @@ -95,6 +95,12 @@ func (it *Materialize) TagResults(dst map[string]graph.Value) { func (it *Materialize) Clone() graph.Iterator { out := NewMaterialize(it.subIt.Clone()) out.tags.CopyFrom(it) + if it.hasRun { + out.hasRun = true + out.aborted = it.aborted + out.values = it.values + out.containsMap = it.containsMap + } return out } diff --git a/graph/triplestore.go b/graph/triplestore.go index c099d47..bfdf62f 100644 --- a/graph/triplestore.go +++ b/graph/triplestore.go @@ -28,14 +28,17 @@ import ( "github.com/google/cayley/quad" ) -// Defines an opaque "triple store value" type. However the backend wishes to -// implement it, a Value is merely a token to a triple or a node that the backing -// store itself understands, and the base iterators pass around. +// Value defines an opaque "triple store value" type. However the backend wishes +// to implement it, a Value is merely a token to a triple or a node that the +// backing store itself understands, and the base iterators pass around. // // For example, in a very traditional, graphd-style graph, these are int64s // (guids of the primitives). In a very direct sort of graph, these could be // pointers to structs, or merely triples, or whatever works best for the // backing store. +// +// These must be comparable, ie, not arrays or maps, as they may be used as keys +// for maps. type Value interface{} type TripleStore interface {