examples/flink: implement task delegation roles in terms of abstract

templates
This commit is contained in:
Sam Caldwell 2019-04-02 16:18:57 -04:00
parent e16db164df
commit 7815fad415
1 changed files with 123 additions and 88 deletions

View File

@ -5,7 +5,8 @@
set-count set-count
set-empty? set-empty?
set-first set-first
set-remove)) set-remove
set-add))
(require (only-in racket/list (require (only-in racket/list
partition partition
empty? empty?
@ -38,14 +39,14 @@ of each TaskManager (TM) is contingent on the presence of a job manager.
#| #|
In turn, TaskManagers advertise their presence with (task-manager ID slots), In turn, TaskManagers advertise their presence with (task-manager ID slots),
where ID is a unique id, and slots is a natural number. The number of slots where ID is a unique id, and slots is a natural number. The number of slots
dictates how many tasks the TM can take on. To reduce contention, we the JM dictates how many tasks the TM can take on. To reduce contention, the JM
should only assign a task to a TM if the TM actually has the resources to should only assign a task to a TM if the TM actually has the resources to
perform a task. perform a task.
|# |#
(assertion-struct task-manager (id slots)) (assertion-struct task-manager (id slots))
;; an ID is a symbol or a natural number. ;; an ID is a symbol or a natural number.
;; Any -> Bool ;; Any -> Bool
;; recognize ids ;; recognize IDs
(define (id? x) (define (id? x)
(or (symbol? x) (exact-nonnegative-integer? x))) (or (symbol? x) (exact-nonnegative-integer? x)))
#| #|
@ -127,6 +128,50 @@ The JobManager then performs the job and, when finished, asserts (job-finished I
(define (log fmt . args) (define (log fmt . args)
(displayln (apply format fmt args))) (displayln (apply format fmt args)))
;; ---------------------------------------------------------------------------------------------------
;; Generic Implementation of Task Delegation Protocol
;; a TaskFun is a
;; (Task ID (TaskResults -> Void) ((U ACCEPTED OVERLOAD RUNNING) -> Void) -> Void)
;; ID (-> Bool) TaskFun -> TaskPerformer
;; doesn't really account for long-running tasks
;; gonna need some effect polymorphism to type uses of this
(define (task-performer my-id can-accept? perform-task)
(react
(during (task-assignment my-id $job-id $task)
(field [status #f])
(assert (task-state my-id job-id (task-id task) (status)))
(cond
[(can-accept?)
(status RUNNING)
(define (on-complete results)
(status (finished results)))
(perform-task task job-id on-complete status)]
[else
(status OVERLOAD)]))))
;; Task
;; ID
;; ID
;; (-> Void)
;; (TaskResults -> Void)
;; -> TaskAssigner
(define (task-assigner tsk job-id performer
on-overload!
on-complete!)
(react
(assert (task-assignment performer job-id tsk))
(on (asserted (task-state performer job-id (task-id tsk) $status))
(match status
[(or (== ACCEPTED)
(== RUNNING))
(void)]
[(== OVERLOAD)
(stop-current-facet (on-overload!))]
[(finished results)
(stop-current-facet (on-complete! results))]))))
;; --------------------------------------------------------------------------------------------------- ;; ---------------------------------------------------------------------------------------------------
;; TaskRunner ;; TaskRunner
@ -138,25 +183,24 @@ The JobManager then performs the job and, when finished, asserts (job-finished I
(assert (task-runner id (status))) (assert (task-runner id (status)))
(begin/dataflow (begin/dataflow
(log "task-runner ~v state is: ~a" id (status))) (log "task-runner ~v state is: ~a" id (status)))
(during (task-assignment id $job-id (task $tid $desc)) ;; Task (TaskStateDesc -> Void) -> Void
(field [execution-state (if (idle?) RUNNING OVERLOAD)] (define (perform-task tsk job-id on-complete! update-status!)
[word-count (hash)]) (unless (idle?)
(assert (task-state id job-id tid (execution-state))) (error "tried to perform a task when not idle"))
;; we have to avoid asking a non-idle runner to do anything ;; since we currently finish everything in one turn, these changes to status aren't
(when (idle?) ;; actually visible.
(on-stop (status IDLE)) (status RUNNING)
(on-start (match-define (task tid desc) tsk)
(status (executing tid)) (match desc
;; since we currently finish everything in one turn, allow other actors to observe the changes in our [(map-work data)
;; task-runner state by flushing pending actions. (define wc (count-new-words (hash) (string->words data)))
(flush!) (on-complete! wc)]
(match desc [(reduce-work left right)
[(map-work data) (define wc (hash-union left right #:combine +))
(word-count (count-new-words (word-count) (string->words data))) (on-complete! wc)])
(execution-state (finished (word-count)))] (status IDLE))
[(reduce-work left right) (on-start
(word-count (hash-union left right #:combine +)) (task-performer id idle? perform-task))))
(execution-state (finished (word-count)))]))))))
;; (Hash String Nat) String -> (Hash String Nat) ;; (Hash String Nat) String -> (Hash String Nat)
(define (word-count-increment h word) (define (word-count-increment h word)
@ -206,40 +250,25 @@ The JobManager then performs the job and, when finished, asserts (job-finished I
#:on-remove (log "TM learns that task-runner ~a is NOT IDLE" id)) #:on-remove (log "TM learns that task-runner ~a is NOT IDLE" id))
(assert (task-manager id (set-count (idle-runners)))) (assert (task-manager id (set-count (idle-runners))))
(field [busy-runners (list)]) (field [busy-runners (list)])
(during (task-assignment id $job-id $t) (define (can-accept?)
(match-define (task task-id desc) t) (not (set-empty? (idle-runners))))
#;(on-start (log "TM receives task ~a" task-id)) (define (perform-task tsk job-id on-complete! update-status!)
(log "TM receives task ~a" task-id) (match-define (task task-id desc) tsk)
(on-stop (log "TM finished with task ~a" task-id)) (define runner (set-first (idle-runners)))
(field [status ACCEPTED]) ;; n.b. modifying a query set is questionable
;; TODO - could delegate this assertion, in the non-overloaded case, to TaskRunner ;; but if we wait for the IDLE assertion to be retracted, we might assign multiple tasks to the same runner.
;; (also removing the first id from task-state) ;; Could use the busy-runners field to avoid that
(assert (task-state id job-id task-id (status))) (idle-runners (set-remove (idle-runners) runner))
(cond (log "TM assigns task ~a to runner ~a" task-id runner)
[(set-empty? (idle-runners)) ;; TODO - since we're both adding and removing from this set I'm not sure TRs
(log "TM can't run ~a right now" task-id) ;; need to be making assertions about their idleness
(status OVERLOAD)] (on-stop (idle-runners (set-add (idle-runners) runner)))
[else (on-start
(define runner (set-first (idle-runners))) (task-assigner tsk job-id runner
;; n.b. modifying a query set is questionable (lambda () (update-status! OVERLOAD))
;; but if we wait for the IDLE assertion to be retracted, we might assign multiple tasks to the same runner. (lambda (results) (on-complete! results)))))
;; Could use the busy-runners field to avoid that (on-start
(idle-runners (set-remove (idle-runners) runner)) (task-performer id can-accept? perform-task)))))
(log "TM assigns task ~a to runner ~a" task-id runner)
(assert (task-assignment runner job-id t))
(status RUNNING)
(on (asserted (task-state runner job-id task-id $state))
(match state
[(or (== ACCEPTED)
(== RUNNING))
;; nothing to do
(void)]
[(== OVERLOAD)
(log "TM overloaded TR with task ~a" task-id)
(status OVERLOAD)]
[(finished results)
(log "TM receives the results of task ~a" task-id)
(status state)]))])))))
;; --------------------------------------------------------------------------------------------------- ;; ---------------------------------------------------------------------------------------------------
;; JobManager ;; JobManager
@ -297,43 +326,49 @@ The JobManager then performs the job and, when finished, asserts (job-finished I
;; Requires (task-ready? t) ;; Requires (task-ready? t)
(define (perform-task t k) (define (perform-task t k)
(react (react
(define task-facet (current-facet-id))
(on-start (tasks-in-progress (add1 (tasks-in-progress)))) (on-start (tasks-in-progress (add1 (tasks-in-progress))))
(on-stop (tasks-in-progress (sub1 (tasks-in-progress)))) (on-stop (tasks-in-progress (sub1 (tasks-in-progress))))
(match-define (task this-id desc) t) (match-define (task this-id desc) t)
(log "JM begins on task ~a" this-id) (log "JM begins on task ~a" this-id)
(field [task-mngr #f]) (define (select-a-task-manager)
(begin/dataflow (react
;; n.b. cyclic data-flow dependency (begin/dataflow
(unless (task-mngr) (define mngr
(define mngr (for/first ([(id slots) (in-hash (task-managers))]
(for/first ([(id slots) (in-hash (task-managers))] #:when (positive? (- slots (hash-ref (requests-in-flight) id 0))))
#:when (positive? (- slots (hash-ref (requests-in-flight) id 0)))) id))
id)) (when mngr
(when mngr (take-slot! mngr)
(take-slot! mngr) (stop-current-facet (assign-task mngr))))))
(react (stop-when (asserted (task-state mngr job-id this-id _))
(received-answer! mngr))) ;; ID -> ...
(task-mngr mngr)))) (define (assign-task mngr)
;; TODO - should respond if task manager dies (react
(assert #:when (task-mngr) (define this-facet (current-facet-id))
(task-assignment (task-mngr) job-id t)) (on (retracted (task-manager mngr _))
(on #:when (task-mngr) ;; our task manager has crashed
(asserted (task-state (task-mngr) job-id this-id $state)) (stop-current-facet (select-a-task-manager)))
(match state (on-start
[(or (== ACCEPTED) ;; N.B. when this line was here, and not after `(when mngr ...)` above,
(== RUNNING)) ;; things didn't work. I think that due to script scheduling, all ready
;; nothing to do ;; tasks were being assigned to the manager
(void)] #;(take-slot! mngr)
[(== OVERLOAD) (react (stop-when (asserted (task-state mngr job-id this-id _))
;; need to find a new task manager (received-answer! mngr)))
;; don't think we need a release-slot! here, because if we've heard back from a task manager, (task-assigner t job-id mngr
;; they should have told us a different slot count since we tried to give them work (lambda ()
(log "JM overloaded manager ~a with task ~a" (task-mngr) this-id) ;; need to find a new task manager
(task-mngr #f)] ;; don't think we need a release-slot! here, because if we've heard back from a task manager,
[(finished results) ;; they should have told us a different slot count since we tried to give them work
(log "JM receives the results of task ~a" this-id) (log "JM overloaded manager ~a with task ~a" mngr this-id)
(stop-current-facet (k this-id results))])))) (stop-facet this-facet (select-a-task-manager)))
(lambda (results)
(log "JM receives the results of task ~a" this-id)
(stop-facet task-facet (k this-id results)))))))
(on-start (select-a-task-manager))))
;; ID Data -> Void ;; ID Data -> Void
;; Update any dependent tasks with the results of the given task, moving ;; Update any dependent tasks with the results of the given task, moving