Add some better balancing, reconnection

This commit is contained in:
Barak Michener 2020-12-04 22:40:50 -08:00
parent 715f79c9c0
commit 324ba0d160
6 changed files with 77 additions and 29 deletions

View file

@ -1,6 +1,7 @@
from ray import ray from ray import ray
import sys
ray.connect("localhost:50050") ray.connect(sys.argv[1])
@ray.remote @ray.remote
def plus2(x): def plus2(x):
@ -14,4 +15,17 @@ def fact(x):
return 1 return 1
return x * ray.get(fact.remote(x - 1)) return x * ray.get(fact.remote(x - 1))
print(ray.get(fact.remote(20))) #print(ray.get(fact.remote(20)))
@ray.remote
def sleeper(x):
    """Pause for one second, then return ``x`` multiplied by two."""
    import time

    time.sleep(1)
    doubled = x * 2
    return doubled
# Submit 20 sleeper calls first, then fetch each result in submission order.
holder = [sleeper.remote(i) for i in range(20)]
print([ray.get(ref) for ref in holder])

View file

@ -10,6 +10,7 @@ from ray import ray
from ray.common import ClientObjectRef from ray.common import ClientObjectRef
class Worker: class Worker:
def __init__(self, conn_str): def __init__(self, conn_str):
self.channel = grpc.insecure_channel(conn_str) self.channel = grpc.insecure_channel(conn_str)
@ -37,9 +38,9 @@ class Worker:
continue continue
args = self.decode_args(task) args = self.decode_args(task)
func = self.get(task.payload_id) func = self.get(task.payload_id)
#self.pool.submit(self.run_and_return, func, args, work.ticket) self.pool.submit(self.run_and_return, func, args, work.ticket)
t = threading.Thread(target=self.run_and_return, args=(func, args, work.ticket)) #t = threading.Thread(target=self.run_and_return, args=(func, args, work.ticket))
t.start() #t.start()
def run_and_return(self, func, args, ticket): def run_and_return(self, func, args, ticket):
@ -50,6 +51,7 @@ class Worker:
complete_data = out_data, complete_data = out_data,
finished_ticket = ticket, finished_ticket = ticket,
)) ))
print("Finished Work")
# def get(self, id_bytes): # def get(self, id_bytes):
# data = self.server.GetObject(ray_client_pb2.GetRequest( # data = self.server.GetObject(ray_client_pb2.GetRequest(

View file

@ -59,6 +59,7 @@ func (r *Raylet) Workstream(conn WorkstreamConnection) error {
workChan: make(chan *ray_rpc.Work), workChan: make(chan *ray_rpc.Work),
clientConn: conn, clientConn: conn,
pool: r.Workers, pool: r.Workers,
max: 3,
} }
r.Workers.Register(worker) r.Workers.Register(worker)
err := worker.Run() err := worker.Run()

View file

@ -17,25 +17,40 @@ languagePluginLoader.then(() => {
wsprotocol = "wss:" wsprotocol = "wss:"
} }
var wspath = wsprotocol + "//" + window.location.host + "/api/ws" var wspath = wsprotocol + "//" + window.location.host + "/api/ws"
// Delay, in milliseconds, before a closed socket is reopened. Shared by the
// timer and the log message so the two cannot drift apart.
var RECONNECT_DELAY_MS = 500

// connect opens a WebSocket to wspath and wires up its handlers:
//  - onopen: updates the status line and sends the initial JSON hello message.
//  - onmessage: shows a random work term, hands the payload to pyodide's
//    exec_work, and sends the result back on the same socket.
//  - onclose: updates the status line and schedules another connect() call.
//  - onerror: logs the event and closes the socket, which in turn fires
//    onclose and therefore the reconnect.
function connect() {
  var c = new WebSocket(wspath)

  c.onopen = function() {
    $("#status").text("Status: connected!")
    c.send(JSON.stringify({
      status: 2,
      error_msg: "WebsocketWorker"
    }))
  }

  c.onmessage = function(msg) {
    var workText = workTerms[Math.floor(Math.random() * workTerms.length)];
    $("#output").append("<p>" + workText + "...</p>")
    pyodide.globals.torun = msg.data
    pyodide.runPythonAsync("exec_work(torun)").then((res) => {
      $("#output").append("<p>Did work! 👏</p>")
      c.send(res)
    })
  };

  c.onclose = function(e) {
    $("#status").text("Status: disconnected. reconnecting...")
    // The message reports the real delay (it previously claimed 1 second
    // while the timer below fired after 500 ms).
    console.log('Socket is closed. Reconnect will be attempted in ' + RECONNECT_DELAY_MS + ' ms.', e.reason);
    setTimeout(connect, RECONNECT_DELAY_MS);
  };

  c.onerror = function(err) {
    // A WebSocket error event is a plain Event without a `message` field,
    // so log the event object itself.
    console.error('Socket encountered error: ', err, 'Closing socket');
    c.close();
  };
};
connect();
}) })

View file

@ -20,10 +20,12 @@ type SimpleWorker struct {
workChan chan *ray_rpc.Work workChan chan *ray_rpc.Work
clientConn WorkstreamConnection clientConn WorkstreamConnection
pool WorkerPool pool WorkerPool
max int
curr int
} }
func (s *SimpleWorker) Schedulable() bool { func (s *SimpleWorker) Schedulable() bool {
return true return s.curr < s.max
} }
func (s *SimpleWorker) AssignWork(work *ray_rpc.Work) error { func (s *SimpleWorker) AssignWork(work *ray_rpc.Work) error {
@ -48,6 +50,7 @@ func (w *SimpleWorker) Run() error {
go func() { go func() {
for work := range w.workChan { for work := range w.workChan {
zap.S().Debug("Sending work") zap.S().Debug("Sending work")
w.curr++
err = w.clientConn.Send(work) err = w.clientConn.Send(work)
if err != nil { if err != nil {
zap.S().Error("Error sending:", err) zap.S().Error("Error sending:", err)
@ -61,6 +64,7 @@ func (w *SimpleWorker) Run() error {
zap.S().Error("Error on channel:", err) zap.S().Error("Error on channel:", err)
return err return err
} }
w.curr--
err = w.pool.Finish(result) err = w.pool.Finish(result)
if err != nil { if err != nil {
zap.S().Error("Error finishing:", err) zap.S().Error("Error finishing:", err)

View file

@ -21,6 +21,7 @@ type SimpleRRWorkerPool struct {
workers []Worker workers []Worker
store ObjectStore store ObjectStore
offset int offset int
pending []chan bool
} }
func NewRoundRobinWorkerPool(obj ObjectStore) *SimpleRRWorkerPool { func NewRoundRobinWorkerPool(obj ObjectStore) *SimpleRRWorkerPool {
@ -57,16 +58,27 @@ func (wp *SimpleRRWorkerPool) Schedule(work *ray_rpc.Work) error {
wp.offset = 0 wp.offset = 0
} }
if wp.offset == origOffset && !done { if wp.offset == origOffset && !done {
return errors.New("No workers schedulable") c := make(chan bool)
wp.pending = append(wp.pending, c)
wp.Unlock()
<-c
wp.Lock()
} }
} }
return nil return nil
} }
func (wp *SimpleRRWorkerPool) Finish(status *ray_rpc.WorkStatus) error { func (wp *SimpleRRWorkerPool) Finish(status *ray_rpc.WorkStatus) error {
wp.Lock()
defer wp.Unlock()
if status.Status != ray_rpc.COMPLETE { if status.Status != ray_rpc.COMPLETE {
panic("todo: Only call Finish on successfully completed work") panic("todo: Only call Finish on successfully completed work")
} }
if len(wp.pending) != 0 {
c := wp.pending[0]
wp.pending = wp.pending[1:]
close(c)
}
id := deserializeObjectID(status.FinishedTicket.ReturnId) id := deserializeObjectID(status.FinishedTicket.ReturnId)
return wp.store.PutObject(&Object{id, status.CompleteData}) return wp.store.PutObject(&Object{id, status.CompleteData})
} }