cayley/graph/and_iterator_optimize.go

// Copyright 2014 The Cayley Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package graph

// Perhaps the most tricky file in this entire module. Really a method on the
// AndIterator, but important enough to deserve its own file.
//
// Calling Optimize() on an And iterator, like any iterator, requires that we
// preserve the underlying meaning. However, the And has many choices, namely,
// which one of its subiterators will be the branch that does the Next()ing,
// and which ordering of the remaining iterators is the most efficient. In
// short, this is where a lot of the query optimization happens, and there are
// many wins to be had here, as well as many bad bugs. The worst class of bug
// changes the meaning of the query. The second worst class makes things really
// slow.
//
// The good news is this: If Optimize() is never called (turned off, perhaps) we can
// be sure the results are as good as the query language called for.
//
// In short, tread lightly.

import (
	"container/list"
)

// Optimize rewrites the AndIterator by picking the most efficient way to
// Next() and Check() its subiterators. For SQL fans, this is equivalent to
// JOIN.
//
// It returns the replacement iterator and a "changed" flag. The replacement is
// either a single iterator standing in for the whole And (whatever
// optimizeReplacement picked) or a newly built AndIterator holding the
// optimized, reordered subiterators. Both return paths report true.
func (it *AndIterator) Optimize() (Iterator, bool) {
	// First, let's get the list of iterators, in order (first one is Next()ed,
	// the rest are Check()ed)
	oldItList := it.GetSubIterators()

	// And call Optimize() on our subtree, visiting each subiterator in the
	// order we found them. itList holds, position for position, either the
	// replacement a subiterator returned or a Clone() of the subiterator when
	// it reported no change.
	itList := optimizeSubIterators(oldItList)

	// Close the originals (replaced ones ought to close themselves, but Close()
	// is idempotent, so this just protects against any machinations).
	// NOTE(review): this relies on itList containing replacements/clones
	// rather than the original iterators themselves -- confirm that no
	// Optimize() implementation returns its receiver with changed == true.
	closeIteratorList(oldItList, nil)

	// If we can find only one subiterator which is equivalent to this whole and,
	// we can replace the And...
	out := it.optimizeReplacement(itList)
	if out != nil {
		// ...Move the tags (ours and our subiterators') to the replacement...
		moveTagsTo(out, it)
		// ...Close everyone except `out`, our replacement...
		closeIteratorList(itList, out)
		// ...And return it.
		return out, true
	}

	// And now, without changing any of the iterators, we reorder them. itList
	// is now a permutation of itself, but the contents are unchanged.
	itList = optimizeOrder(itList)

	// Okay! At this point we have an optimized order.

	// The easiest thing to do at this point is merely to create a new And iterator
	// and replace ourselves with our (reordered, optimized) clone.
	newAnd := NewAndIterator()

	// Add the subiterators in order.
	for e := itList.Front(); e != nil; e = e.Next() {
		newAnd.AddSubIterator(e.Value.(Iterator))
	}

	// Move the tags hanging on us (like any good replacement).
	newAnd.CopyTagsFrom(it)

	// Build the new And's separate check list, ordered by ascending Check()
	// cost.
	newAnd.optimizeCheck()

	// And close ourselves but not our subiterators -- some may still be alive in
	// the new And (they were unchanged upon calling Optimize() on them, at the
	// start). cleanUp() is defined outside this file; presumably it releases
	// only this iterator's own state.
	it.cleanUp()
	return newAnd, true
}

// closeIteratorList calls Close() on every iterator stored in l, skipping the
// single iterator that compares equal to except. Passing nil for except closes
// the whole list. Every element of l must hold an Iterator.
func closeIteratorList(l *list.List, except Iterator) {
	for elem := l.Front(); elem != nil; elem = elem.Next() {
		if sub := elem.Value.(Iterator); sub != except {
			sub.Close()
		}
	}
}

// optimizeReplacement looks for a single iterator that can stand in for the
// whole AndIterator, given its (already optimized) subiterators in itList.
// It returns nil when no such replacement exists.
func (*AndIterator) optimizeReplacement(itList *list.List) Iterator {
	switch itList.Len() {
	case 0:
		// An And created with no subiterators is as good as Null.
		return &NullIterator{}
	case 1:
		// With a single subiterator there is only one choice.
		return itList.Front().Value.(Iterator)
	}

	// A Null subiterator, post-optimization, means the intersection has no
	// results, so the whole And collapses to Null as well.
	if hasAnyNullIterators(itList) {
		return &NullIterator{}
	}

	// Otherwise the only candidate is a lone useful iterator; the helper
	// reports nil when there is not exactly one.
	return hasOneUsefulIterator(itList)
}

// optimizeOrder takes a list of iterators and returns a new list with the same
// contents in a (hopefully) cheaper order: the iterator chosen to be Next()ed
// first, then the remaining Nextable iterators in their original relative
// order, then the non-Nextable ones in their original relative order.
//
// If l contains no Nextable iterator at all, the result holds just the
// non-Nextable iterators; no nil placeholder is inserted at the front.
func optimizeOrder(l *list.List) *list.List {
	out := list.New()
	// bad contains iterators that can't be (efficiently) nexted, such as
	// "optional" or "not". Separate them out and tack them on at the end.
	bad := list.New()

	var bestIt Iterator
	bestCost := int64(1 << 62)

	// Find the iterator with the projected "best" total cost.
	// Total cost is defined as the Next()ed iterator's cost to Next() out
	// all of its contents, and to Check() each of those against every other
	// Nextable iterator. (Non-Nextable iterators are left out of the
	// projection.)
	for e := l.Front(); e != nil; e = e.Next() {
		it := e.Value.(Iterator)
		if !it.Nextable() {
			bad.PushBack(it)
			continue
		}
		rootStats := it.GetStats()
		projectedCost := rootStats.NextCost
		for f := l.Front(); f != nil; f = f.Next() {
			if f == e {
				continue
			}
			other := f.Value.(Iterator)
			if !other.Nextable() {
				continue
			}
			projectedCost += other.GetStats().CheckCost
		}
		projectedCost *= rootStats.Size
		// Always accept the first Nextable candidate, so that an enormous
		// projected cost can never leave us without a choice.
		if bestIt == nil || projectedCost < bestCost {
			bestIt = it
			bestCost = projectedCost
		}
	}

	// TODO(barakmich): Optimization of order need not stop here. Picking a smart
	// Check() order based on probability of getting a false Check() first is
	// useful (fail faster).

	// Put the best iterator (the one we wish to Next()) at the front. bestIt
	// is nil only when nothing in l is Nextable; pushing nil would hand callers
	// a list entry that fails the Iterator type assertion.
	if bestIt != nil {
		out.PushBack(bestIt)
	}
	// ...And push every other Nextable iterator after it...
	for e := l.Front(); e != nil; e = e.Next() {
		thisIt := e.Value.(Iterator)
		if !thisIt.Nextable() {
			continue
		}
		if thisIt != bestIt {
			out.PushBack(thisIt)
		}
	}
	// ...And finally, the difficult children on the end.
	out.PushBackList(bad)
	return out
}

// optimizeCheck builds an alternate check list for the AndIterator: the same
// subiterators returned by GetSubIterators(), ordered by ascending Check()
// cost. Iterators with equal cost keep their original relative order. The
// result is stored in it.checkList.
func (it *AndIterator) optimizeCheck() {
	// NOTE(review): elements are removed from this list below, so this assumes
	// GetSubIterators() hands back a list we are free to consume -- confirm.
	remaining := it.GetSubIterators()
	ordered := list.New()

	// Selection sort: find the iterator with the lowest Check() cost, append
	// it to the output, remove it from the candidates, repeat.
	for remaining.Len() != 0 {
		var best *list.Element
		var bestCost int64
		for e := remaining.Front(); e != nil; e = e.Next() {
			cost := e.Value.(Iterator).GetStats().CheckCost
			// Seed with the first candidate rather than a sentinel cost, so
			// that best is never nil no matter how large the costs are.
			if best == nil || cost < bestCost {
				best, bestCost = e, cost
			}
		}
		ordered.PushBack(best.Value)
		remaining.Remove(best)
	}

	it.checkList = ordered
}

// getSubTags returns the set of tags carried by every subiterator of the
// AndIterator plus the AndIterator's own tags, as a map whose keys are the
// tag names (each mapped to true).
//
// It exists for the case where the And is replaced by a single iterator: the
// tags on the iterators that go away must still be collected so they can be
// carried over to the replacement.
func (it *AndIterator) getSubTags() map[string]bool {
	tags := make(map[string]bool)
	// Tags from each subiterator first...
	for e := it.GetSubIterators().Front(); e != nil; e = e.Next() {
		for _, subTag := range e.Value.(Iterator).Tags() {
			tags[subTag] = true
		}
	}
	// ...then the tags hanging on the And itself.
	for _, ownTag := range it.Tags() {
		tags[ownTag] = true
	}
	return tags
}

// moveTagsTo adds to dst every tag found on src or on any of src's
// subiterators (as reported by src.getSubTags()), except the tags dst already
// carries. The order in which the tags are added is unspecified, since it
// follows map iteration order.
func moveTagsTo(dst Iterator, src *AndIterator) {
	pending := src.getSubTags()
	// Anything dst already has must not be added a second time.
	for _, existing := range dst.Tags() {
		if pending[existing] {
			delete(pending, existing)
		}
	}
	for tag := range pending {
		dst.AddTag(tag)
	}
}

// optimizeSubIterators calls Optimize() on every iterator in l and returns a
// new list of the same length and order. Each position holds the replacement
// returned by Optimize() when it reported a change, or a Clone() of the
// original iterator when it did not. The input list itself is not modified.
func optimizeSubIterators(l *list.List) *list.List {
	optimized := list.New()
	for elem := l.Front(); elem != nil; elem = elem.Next() {
		sub := elem.Value.(Iterator)
		replacement, changed := sub.Optimize()
		if !changed {
			// Unchanged iterators are cloned rather than shared with l.
			replacement = sub.Clone()
		}
		optimized.PushBack(replacement)
	}
	return optimized
}

// hasAnyNullIterators reports whether at least one iterator in l identifies
// itself, via Type(), as "null".
func hasAnyNullIterators(l *list.List) bool {
	for elem := l.Front(); elem != nil; elem = elem.Next() {
		if elem.Value.(Iterator).Type() == "null" {
			return true
		}
	}
	return false
}

// hasOneUsefulIterator returns the single "useful" iterator in l, or nil if
// there is not exactly one.
//
// Two kinds of iterator are not useful in an intersection: "null", which
// returns nothing, and "all", which returns everything. When the And is
// intersecting one real iterator with a bunch of those, the real one is the
// only iterator that matters. An "optional" iterator anywhere in the list
// disables this shortcut entirely: it is not useful either, but it cannot be
// optimized away.
func hasOneUsefulIterator(l *list.List) Iterator {
	var candidate Iterator
	useful := 0
	for elem := l.Front(); elem != nil; elem = elem.Next() {
		cur := elem.Value.(Iterator)
		kind := cur.Type()
		if kind == "optional" {
			return nil
		}
		if kind == "null" || kind == "all" {
			continue
		}
		candidate = cur
		useful++
	}

	if useful != 1 {
		return nil
	}
	return candidate
}

// GetStats returns the projected statistics for the AndIterator, derived from
// its subiterators:
//
//   - NextCost is the primary iterator's NextCost plus the CheckCost of every
//     internal iterator (each value Next()ed is Check()ed against the rest).
//   - CheckCost is the sum of the CheckCost of the primary and every internal
//     iterator.
//   - Size is the smallest Size among the primary and internal iterators.
//
// An AndIterator with no primary iterator reports zero-valued stats.
//
// It lives in this file because it may in the future return different
// statistics based on how the And is optimized. For now, however, it's pretty
// static.
func (it *AndIterator) GetStats() *IteratorStats {
	// Without a primary iterator there is nothing to delegate to; calling
	// GetStats() on it would panic. NOTE(review): this assumes primaryIt stays
	// nil until the first subiterator is added -- confirm against
	// AddSubIterator.
	if it.primaryIt == nil {
		return &IteratorStats{}
	}

	primaryStats := it.primaryIt.GetStats()
	checkCost := primaryStats.CheckCost
	nextCost := primaryStats.NextCost
	size := primaryStats.Size
	for _, sub := range it.internalIterators {
		stats := sub.GetStats()
		nextCost += stats.CheckCost
		checkCost += stats.CheckCost
		// An intersection can be no larger than its smallest member.
		if size > stats.Size {
			size = stats.Size
		}
	}
	return &IteratorStats{
		CheckCost: checkCost,
		NextCost:  nextCost,
		Size:      size,
	}
}