Skip to content

Commit

Permalink
Add log event for worker-ttl-timed-out (#8800)
Browse files Browse the repository at this point in the history
  • Loading branch information
hendrikmakait authored Jul 25, 2024
1 parent 7cf5a36 commit cd82d04
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 0 deletions.
8 changes: 8 additions & 0 deletions distributed/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -8498,6 +8498,14 @@ async def check_worker_ttl(self) -> None:
)

if to_restart:
self.log_event(
"scheduler",
{
"action": "worker-ttl-timed-out",
"workers": to_restart.copy(),
"ttl": ttl,
},
)
await self.restart_workers(
to_restart,
wait_for_workers=False,
Expand Down
16 changes: 16 additions & 0 deletions distributed/tests/test_failed_workers.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,13 +456,29 @@ async def test_worker_time_to_live(c, s, a, b):
# Note that this value is ignored because is less than 10x heartbeat_interval
assert s.worker_ttl == 0.5
assert set(s.workers) == {a.address, b.address}
assert all(
event["action"] != "worker-ttl-timed-out"
for _, event in s.get_events("scheduler")
)

a.periodic_callbacks["heartbeat"].stop()

start = time()
while set(s.workers) == {a.address, b.address}:
await asyncio.sleep(0.01)
assert set(s.workers) == {b.address}
events = [
event
for _, event in s.get_events("scheduler")
if event["action"] == "worker-ttl-timed-out"
]
assert len(events) == 1
# This event includes the actual TTL that we applied, i.e, 10 * heartbeat.
assert events[0] == {
"action": "worker-ttl-timed-out",
"workers": [a.address],
"ttl": 5.0,
}

# Worker removal is triggered after 10 * heartbeat
# This is 10 * 0.5s at the moment of writing.
Expand Down

0 comments on commit cd82d04

Please sign in to comment.