Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Fix overflows in /messages backfill calculation #13936

Merged
merged 14 commits into from
Sep 30, 2022
1 change: 1 addition & 0 deletions changelog.d/13936.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Exponentially back off from backfilling the same event over and over.
65 changes: 40 additions & 25 deletions synapse/storage/databases/main/event_federation.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,20 @@

logger = logging.getLogger(__name__)

BACKFILL_EVENT_BACKOFF_UPPER_BOUND_SECONDS: int = int(
datetime.timedelta(days=7).total_seconds()
)
BACKFILL_EVENT_EXPONENTIAL_BACKOFF_MAXIMUM_DOUBLING_STEPS = 8
clokep marked this conversation as resolved.
Show resolved Hide resolved
BACKFILL_EVENT_EXPONENTIAL_BACKOFF_STEP_SECONDS: int = int(
datetime.timedelta(hours=1).total_seconds()
)
DMRobertson marked this conversation as resolved.
Show resolved Hide resolved
# The longest backoff period is then:
# _LONGEST_BACKOFF_PERIOD_SECONDS = (
# (1 << BACKFILL_EVENT_EXPONENTIAL_BACKOFF_MAXIMUM_DOUBLING_STEPS)
# * BACKFILL_EVENT_EXPONENTIAL_BACKOFF_STEP_SECONDS,
# )
# which is 2 ** 8 hours = 256 hours ≈ 10.7 days. Expressed in milliseconds
# (as it is passed to the queries below), this number needs to fit in a
# 32 bit signed int, or else Postgres will error.
# assert 1000 * _LONGEST_BACKOFF_PERIOD_SECONDS < ((2 ** 31) - 1)
DMRobertson marked this conversation as resolved.
Show resolved Hide resolved
# (We could use a bigint, but bigint overflows still cause errors. This
# at least avoids CASTing in the queries below.)


# All the info we need while iterating the DAG while backfilling
Expand Down Expand Up @@ -749,7 +757,15 @@ def get_backfill_points_in_room_txn(
# persisted in our database yet (meaning we don't know their depth
# specifically). So we need to look for the approximate depth from
# the events connected to the current backwards extremeties.
sql = """

if isinstance(self.database_engine, PostgresEngine):
least_function = "least"
elif isinstance(self.database_engine, Sqlite3Engine):
least_function = "min"
else:
raise RuntimeError("Unknown database engine")
DMRobertson marked this conversation as resolved.
Show resolved Hide resolved

sql = f"""
SELECT backward_extrem.event_id, event.depth FROM events AS event
/**
* Get the edge connections from the event_edges table
Expand Down Expand Up @@ -795,7 +811,10 @@ def get_backfill_points_in_room_txn(
*/
AND (
failed_backfill_attempt_info.event_id IS NULL
OR ? /* current_time */ >= failed_backfill_attempt_info.last_attempt_ts + /*least*/%s((1 << failed_backfill_attempt_info.num_attempts) * ? /* step */, ? /* upper bound */)
OR ? /* current_time */ >= failed_backfill_attempt_info.last_attempt_ts + (
(1 << {least_function}(failed_backfill_attempt_info.num_attempts, ? /* max doubling steps */))
* ? /* step */
)
)
/**
* Sort from highest to the lowest depth. Then tie-break on
Expand All @@ -805,21 +824,14 @@ def get_backfill_points_in_room_txn(
ORDER BY event.depth DESC, backward_extrem.event_id DESC
"""

if isinstance(self.database_engine, PostgresEngine):
least_function = "least"
elif isinstance(self.database_engine, Sqlite3Engine):
least_function = "min"
else:
raise RuntimeError("Unknown database engine")

txn.execute(
sql % (least_function,),
sql,
(
room_id,
False,
self._clock.time_msec(),
BACKFILL_EVENT_EXPONENTIAL_BACKOFF_MAXIMUM_DOUBLING_STEPS,
1000 * BACKFILL_EVENT_EXPONENTIAL_BACKOFF_STEP_SECONDS,
1000 * BACKFILL_EVENT_BACKOFF_UPPER_BOUND_SECONDS,
),
)

Expand Down Expand Up @@ -852,7 +864,14 @@ async def get_insertion_event_backward_extremities_in_room(
def get_insertion_event_backward_extremities_in_room_txn(
txn: LoggingTransaction, room_id: str
) -> List[Tuple[str, int]]:
sql = """
if isinstance(self.database_engine, PostgresEngine):
least_function = "least"
elif isinstance(self.database_engine, Sqlite3Engine):
least_function = "min"
else:
raise RuntimeError("Unknown database engine")

sql = f"""
SELECT
insertion_event_extremity.event_id, event.depth
/* We only want insertion events that are also marked as backwards extremities */
Expand Down Expand Up @@ -880,7 +899,10 @@ def get_insertion_event_backward_extremities_in_room_txn(
*/
AND (
failed_backfill_attempt_info.event_id IS NULL
OR ? /* current_time */ >= failed_backfill_attempt_info.last_attempt_ts + /*least*/%s((1 << failed_backfill_attempt_info.num_attempts) * ? /* step */, ? /* upper bound */)
OR ? /* current_time */ >= failed_backfill_attempt_info.last_attempt_ts + (
(1 << {least_function}(failed_backfill_attempt_info.num_attempts, ? /* max doubling steps */))
* ? /* step */
)
)
/**
* Sort from highest to the lowest depth. Then tie-break on
Expand All @@ -890,20 +912,13 @@ def get_insertion_event_backward_extremities_in_room_txn(
ORDER BY event.depth DESC, insertion_event_extremity.event_id DESC
"""

if isinstance(self.database_engine, PostgresEngine):
least_function = "least"
elif isinstance(self.database_engine, Sqlite3Engine):
least_function = "min"
else:
raise RuntimeError("Unknown database engine")

txn.execute(
sql % (least_function,),
sql,
(
room_id,
self._clock.time_msec(),
BACKFILL_EVENT_EXPONENTIAL_BACKOFF_MAXIMUM_DOUBLING_STEPS,
1000 * BACKFILL_EVENT_EXPONENTIAL_BACKOFF_STEP_SECONDS,
DMRobertson marked this conversation as resolved.
Show resolved Hide resolved
1000 * BACKFILL_EVENT_BACKOFF_UPPER_BOUND_SECONDS,
),
)
return cast(List[Tuple[str, int]], txn.fetchall())
Expand Down
40 changes: 40 additions & 0 deletions tests/storage/test_event_federation.py
Original file line number Diff line number Diff line change
Expand Up @@ -1049,6 +1049,46 @@ def test_get_insertion_event_backward_extremities_in_room_attempted_event_retry_
backfill_event_ids, ["insertion_eventB", "insertion_eventA"]
)

def test_repeated_backfill_failures_are_retried_appropriately(self) -> None:
    """
    A test that reproduces #13929 (Postgres only).

    Backing off repeatedly must keep working even once the unclamped backoff
    formula would demand more milliseconds than fit in a 32 bit signed int.
    """
    room_id = self._setup_room_for_insertion_backfill_tests().room_id

    # Record ten consecutive failed attempts to pull event A.
    for _attempt in range(10):
        self.get_success(
            self.store.record_event_failed_pull_attempt(
                room_id, "insertion_eventA", "fake cause"
            )
        )

    # If the backoff periods grew without limit, attempt N would back off
    # for 1 << N hours: 2 hours after the first failure, 4 after the second,
    # ..., 1024 hours after the tenth. Advance a comfortably-round 1100
    # hours so any reasonable backoff period has expired.
    self.reactor.advance(datetime.timedelta(hours=1100).total_seconds())

    # 1024 hours in milliseconds is 1024 * 3600000, which exceeds the largest
    # 32 bit signed integer. The bug being reproduced is that this overflow
    # made Postgres error, preventing us from fetching any set of backwards
    # extremities to retry fetching.
    backfill_points = self.get_success(
        self.store.get_insertion_event_backward_extremities_in_room(room_id)
    )

    # Both events should be offered: A because its backoff period has
    # expired, and B because it has never been attempted.
    actual_event_ids = [point[0] for point in backfill_points]
    self.assertListEqual(
        actual_event_ids, ["insertion_eventB", "insertion_eventA"]
    )


@attr.s
class FakeEvent:
Expand Down