Add transparent input decompression

This supports gzip and bzip2 input, detected by the magic number at the start of the file.

Trailing-whitespace differences in the documentation are due to an
opinionated editor.
This commit is contained in:
kortschak 2014-07-19 12:49:55 +09:30
parent abdd649c82
commit 9bf09a5db5
7 changed files with 100 additions and 72 deletions

View file

@ -72,14 +72,13 @@ cayley> graph.Vertex("dani").Out("follows").All()
For somewhat more interesting data, a sample of 30k movies from Freebase comes in the checkout.
```
gzip -cd 30kmoviedata.nt.gz > 30kmovies.nt
./cayley repl --dbpath=30kmovies.nt
./cayley repl --dbpath=30kmoviedata.nt.gz
```
To run the web frontend, replace the "repl" command with "http"
```
./cayley http --dbpath=30kmovies.nt
./cayley http --dbpath=30kmoviedata.nt.gz
```
And visit port 64210 on your machine, commonly [http://localhost:64210](http://localhost:64210)

View file

@ -15,6 +15,10 @@
package db
import (
"bytes"
"compress/bzip2"
"compress/gzip"
"io"
"os"
"github.com/barakmich/glog"
@ -54,7 +58,38 @@ func ReadTriplesFromFile(c chan *graph.Triple, tripleFile string) {
}
}()
nquads.ReadNQuadsFromReader(c, f)
r, err := decompressor(f)
if err != nil {
glog.Fatalln(err)
}
nquads.ReadNQuadsFromReader(c, r)
}
// Magic numbers that identify the supported compressed formats. They are
// compared against the first bytes of the input.
const (
	gzipMagic  = "\x1f\x8b" // first two bytes of a gzip stream
	b2zipMagic = "BZh"      // first three bytes of a bzip2 stream
)

// readAtReader is the input required by decompressor: ReadAt is used to
// inspect the leading bytes, and Read is used to consume the data.
type readAtReader interface {
	io.Reader
	io.ReaderAt
}

// decompressor inspects the first bytes of r and returns a reader that
// yields the decompressed content when r starts with a gzip or bzip2 magic
// number. For any other input, including input too short to hold a magic
// number, r itself is returned unchanged.
//
// A non-nil error is returned when the leading bytes cannot be read for a
// reason other than reaching the end of the input, or when gzip.NewReader
// rejects the stream.
func decompressor(r readAtReader) (io.Reader, error) {
	var buf [3]byte
	n, err := r.ReadAt(buf[:], 0)
	// ReadAt reports io.EOF whenever fewer than len(buf) bytes are available
	// (and is allowed to report it alongside a full read at end of input).
	// That is not a failure here: short or empty input is simply not
	// compressed, so only other errors are propagated.
	if err != nil && err != io.EOF {
		return nil, err
	}
	magic := buf[:n]

	switch {
	case bytes.HasPrefix(magic, []byte(gzipMagic)):
		return gzip.NewReader(r)
	case bytes.HasPrefix(magic, []byte(b2zipMagic)):
		return bzip2.NewReader(r), nil
	default:
		return r, nil
	}
}
func LoadTriplesInto(tChan chan *graph.Triple, ts graph.TripleStore, loadSize int) {

View file

@ -25,22 +25,16 @@ You can repeat the `--db` and `--dbpath` flags from here forward instead of the
### Load Data Into A Graph
Let's extract the sample data, a couple hundred thousand movie triples, that comes in the checkout:
First we load the data.
```bash
zcat 30kmoviedatauniq.n3.gz > 30k.n3
```
Then, we can load the data.
```bash
./cayley load --config=cayley.cfg.overview --triples=30k.n3
./cayley load --config=cayley.cfg.overview --triples=30kmoviedata.nt.gz
```
And wait. It will load. If you'd like to watch it load, you can run
```bash
./cayley load --config=cayley.cfg.overview --triples=30k.n3 --alsologtostderr
./cayley load --config=cayley.cfg.overview --triples=30kmoviedata.nt.gz --alsologtostderr
```
And watch the log output go by.