From a123c92d883ea75c49509bd28e84777e3f056ed2 Mon Sep 17 00:00:00 2001 From: Kevin Falcone Date: Wed, 20 Jun 2018 16:59:57 -0400 Subject: [PATCH 1/2] Add backoff to retry AWS calls The rest of these are ok upgrades --- requirements.txt | 13 +++++++------ requirements/base.in | 1 + requirements/dev.txt | 19 ++++++++++--------- requirements/test.txt | 19 ++++++++++--------- requirements/travis.txt | 6 +++--- 5 files changed, 31 insertions(+), 27 deletions(-) diff --git a/requirements.txt b/requirements.txt index e00778aa..bd127818 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,8 +4,9 @@ # # make upgrade # -boto3==1.7.32 -botocore==1.10.32 # via boto3, s3transfer +backoff==1.5.0 +boto3==1.7.41 +botocore==1.10.41 # via boto3, s3transfer certifi==2018.4.16 # via requests chardet==3.0.4 # via requests django-storages==1.6.6 @@ -14,19 +15,19 @@ docutils==0.14 # via botocore edx-django-release-util==0.3.1 futures==3.2.0 # via isort, s3transfer gunicorn==0.16.1 -idna==2.6 # via requests +idna==2.7 # via requests isort==4.3.4 jmespath==0.9.3 # via boto3, botocore mysql-python==1.2.5 -newrelic==3.2.1.93 +newrelic==3.2.2.94 path.py==11.0.1 python-dateutil==2.7.3 # via botocore python-memcached==1.48 python-termstyle==0.1.10 pytz==2018.4 pyyaml==3.12 # via edx-django-release-util -requests==2.18.4 +requests==2.19.1 s3transfer==0.1.13 # via boto3 six==1.11.0 # via edx-django-release-util, python-dateutil -urllib3==1.22 # via requests +urllib3==1.23 # via requests wsgiref==0.1.2 diff --git a/requirements/base.in b/requirements/base.in index 24b645cc..9b4b7aea 100644 --- a/requirements/base.in +++ b/requirements/base.in @@ -1,6 +1,7 @@ # Core requirements for using this service MySQL-python==1.2.5 +backoff boto3 django>=1.11,<1.12 django-storages diff --git a/requirements/dev.txt b/requirements/dev.txt index 2a191414..11c04122 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -6,8 +6,9 @@ # atomicwrites==1.1.5 attrs==18.1.0 -boto3==1.7.32 
-botocore==1.10.32 +backoff==1.5.0 +boto3==1.7.41 +botocore==1.10.41 certifi==2018.4.16 chardet==3.0.4 click==6.7 @@ -21,32 +22,32 @@ first==2.0.1 funcsigs==1.0.2 futures==3.2.0 gunicorn==0.16.1 -idna==2.6 +idna==2.7 isort==4.3.4 jmespath==0.9.3 mock==2.0.0 more-itertools==4.2.0 mysql-python==1.2.5 -newrelic==3.2.1.93 +newrelic==3.2.2.94 path.py==11.0.1 -pbr==4.0.3 +pbr==4.0.4 pip-tools==2.0.2 pluggy==0.6.0 py==1.5.3 pycodestyle==2.4.0 pytest-cov==2.5.1 -pytest-django==3.2.1 -pytest==3.6.1 +pytest-django==3.3.0 +pytest==3.6.2 python-dateutil==2.7.3 python-memcached==1.48 python-termstyle==0.1.10 pytz==2018.4 pyyaml==3.12 -requests==2.18.4 +requests==2.19.1 s3transfer==0.1.13 six==1.11.0 tox-battery==0.5.1 tox==3.0.0 -urllib3==1.22 +urllib3==1.23 virtualenv==16.0.0 wsgiref==0.1.2 diff --git a/requirements/test.txt b/requirements/test.txt index 1a243ffc..ddd61bd2 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -6,8 +6,9 @@ # atomicwrites==1.1.5 # via pytest attrs==18.1.0 # via pytest -boto3==1.7.32 -botocore==1.10.32 +backoff==1.5.0 +boto3==1.7.41 +botocore==1.10.41 certifi==2018.4.16 chardet==3.0.4 coverage==4.5.1 # via pytest-cov @@ -17,27 +18,27 @@ edx-django-release-util==0.3.1 funcsigs==1.0.2 # via mock, pytest futures==3.2.0 gunicorn==0.16.1 -idna==2.6 +idna==2.7 isort==4.3.4 jmespath==0.9.3 mock==2.0.0 more-itertools==4.2.0 # via pytest mysql-python==1.2.5 -newrelic==3.2.1.93 +newrelic==3.2.2.94 path.py==11.0.1 -pbr==4.0.3 # via mock +pbr==4.0.4 # via mock pluggy==0.6.0 # via pytest py==1.5.3 # via pytest pytest-cov==2.5.1 -pytest-django==3.2.1 -pytest==3.6.1 +pytest-django==3.3.0 +pytest==3.6.2 python-dateutil==2.7.3 python-memcached==1.48 python-termstyle==0.1.10 pytz==2018.4 pyyaml==3.12 -requests==2.18.4 +requests==2.19.1 s3transfer==0.1.13 six==1.11.0 -urllib3==1.22 +urllib3==1.23 wsgiref==0.1.2 diff --git a/requirements/travis.txt b/requirements/travis.txt index ca736caa..dee7d99d 100644 --- a/requirements/travis.txt +++ 
b/requirements/travis.txt @@ -8,12 +8,12 @@ certifi==2018.4.16 # via requests chardet==3.0.4 # via requests codecov==2.0.15 coverage==4.5.1 # via codecov -idna==2.6 # via requests +idna==2.7 # via requests pluggy==0.6.0 # via tox py==1.5.3 # via tox -requests==2.18.4 # via codecov +requests==2.19.1 # via codecov six==1.11.0 # via tox tox-battery==0.5.1 tox==3.0.0 -urllib3==1.22 # via requests +urllib3==1.23 # via requests virtualenv==16.0.0 # via tox From 0c2b33e22134fe798547c7c7cb7275730ad759a8 Mon Sep 17 00:00:00 2001 From: Kevin Falcone Date: Wed, 20 Jun 2018 17:02:48 -0400 Subject: [PATCH 2/2] Add a cloudwatch argument to count_queued_submissions This steals liberally from check-celery-queues.py --- .../commands/count_queued_submissions.py | 104 ++++++++++++++++++ .../tests/test_count_queued_submissions.py | 30 +++++ xqueue/settings.py | 16 +++ 3 files changed, 150 insertions(+) diff --git a/queue/management/commands/count_queued_submissions.py b/queue/management/commands/count_queued_submissions.py index 59c0c929..48d53ebf 100644 --- a/queue/management/commands/count_queued_submissions.py +++ b/queue/management/commands/count_queued_submissions.py @@ -4,8 +4,12 @@ from __future__ import unicode_literals +from itertools import izip_longest from queue.models import Submission +import backoff +import boto3 +import botocore from django.conf import settings from django.core.management.base import BaseCommand, CommandError from django.db.models import Count @@ -27,6 +31,11 @@ def add_arguments(self, parser): action='store_true', help='Submit New Relic custom metrics' ) + parser.add_argument( + '--cloudwatch', + action='store_true', + help='Submit CloudWatch custom metrics' + ) def handle(self, *args, **options): """ @@ -48,6 +57,10 @@ def handle(self, *args, **options): if use_newrelic: self.send_nr_metrics(queue_counts) + use_cloudwatch = options.get('cloudwatch') + if use_cloudwatch: + self.send_cloudwatch_metrics(queue_counts) + def pretty_print_queues(self, 
queue_counts): """ Send a tabulated log output of the queues and the counts to the console @@ -73,3 +86,94 @@ def send_nr_metrics(self, queue_counts): 'Custom/XQueueLength/{}[submissions]'.format(queue['queue_name']), queue['queue_count'], application=nr_app) + + def send_cloudwatch_metrics(self, queue_counts): + """ + Send custom metrics to AWS CloudWatch + """ + cloudwatch = CwBotoWrapper() + cloudwatch_configuration = settings.CLOUDWATCH_QUEUE_COUNT_METRICS + metric_name = 'queue_length' + dimension = 'queue' + environment = cloudwatch_configuration['environment'] + deployment = cloudwatch_configuration['deployment'] + namespace = "xqueue/{}-{}".format(environment, + deployment) + + # iterate 10 at a time through the list of queues to stay under AWS limits. + for queues in grouper(queue_counts, 10): + # grouper pads the last group with Nones and we want to skip those + queues = [q for q in queues if q is not None] + metric_data = [] + for queue in queues: + metric_data.append({ + 'MetricName': metric_name, + 'Dimensions': [{ + "Name": dimension, + "Value": queue['queue_name'] + }], + 'Value': queue['queue_count'] + }) + + if len(metric_data) > 0: + cloudwatch.put_metric_data(Namespace=namespace, MetricData=metric_data) + + for queue in queues: + dimensions = [{'Name': dimension, 'Value': queue['queue_name']}] + threshold = cloudwatch_configuration['default_threshold'] + if queue['queue_name'] in cloudwatch_configuration['thresholds']: + threshold = cloudwatch_configuration['thresholds'][queue['queue_name']] + # Period is in seconds - the maximum has to be over the threshold for an hour (6 periods of 600s) + period = 600 + evaluation_periods = 6 + comparison_operator = "GreaterThanThreshold" + treat_missing_data = "notBreaching" + statistic = "Maximum" + actions = [cloudwatch_configuration['sns_arn']] + alarm_name = "{}-{} {} queue length over threshold".format(environment, + deployment, + queue['queue_name']) + + print('Creating or updating alarm "{}"'.format(alarm_name)) + 
cloudwatch.put_metric_alarm(AlarmName=alarm_name, + AlarmDescription=alarm_name, + Namespace=namespace, + MetricName=metric_name, + Dimensions=dimensions, + Period=period, + EvaluationPeriods=evaluation_periods, + TreatMissingData=treat_missing_data, + Threshold=threshold, + ComparisonOperator=comparison_operator, + Statistic=statistic, + InsufficientDataActions=actions, + OKActions=actions, + AlarmActions=actions) + + +class CwBotoWrapper(object): + max_tries = 5 + + def __init__(self): + self.client = boto3.client('cloudwatch') + + @backoff.on_exception(backoff.expo, + (botocore.exceptions.ClientError), + max_tries=max_tries) + def put_metric_data(self, *args, **kwargs): + return self.client.put_metric_data(*args, **kwargs) + + @backoff.on_exception(backoff.expo, + (botocore.exceptions.ClientError), + max_tries=max_tries) + def put_metric_alarm(self, *args, **kwargs): + return self.client.put_metric_alarm(*args, **kwargs) + + +# Stolen right from the itertools recipes +# https://docs.python.org/3/library/itertools.html#itertools-recipes +def grouper(iterable, n, fillvalue=None): + "Collect data into fixed-length chunks or blocks" + # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx" + args = [iter(iterable)] * n + return izip_longest(*args, fillvalue=fillvalue) diff --git a/queue/management/commands/tests/test_count_queued_submissions.py b/queue/management/commands/tests/test_count_queued_submissions.py index 0e318261..b9287bf0 100644 --- a/queue/management/commands/tests/test_count_queued_submissions.py +++ b/queue/management/commands/tests/test_count_queued_submissions.py @@ -53,3 +53,33 @@ def test_push_to_new_relic(self, mock_newrelic_agent): mock_newrelic_agent.record_custom_metric.call_count) mock_newrelic_agent.record_custom_metric.has_calls(expected_nr_calls, any_order=True) + + @patch('boto3.client') + def test_push_to_cloudwatch(self, mock_boto3): + self._create_submission(queue_name="test-pull") + self._create_submission(queue_name="test2") + 
self._create_submission(queue_name="test2") + call_command('count_queued_submissions', '--cloudwatch', stdout=self.stdout) + self.assertRegexpMatches(self.stdout.getvalue(), r'test2\s*2\s*\ntest-pull\s*1') + + metric_alarm_kwargs = [] + for call in mock_boto3.mock_calls: + name, args, kwargs = call + if 'put_metric_name' in name: + self.assertEquals(len(kwargs['Metricdata']), 2) + self.assertEquals(kwargs, + {'Namespace': u'xqueue/dev-stack', + 'MetricData': [ + {u'Dimensions': [{u'Name': u'queue', u'Value': u'test2'}], + u'Value': 2, + u'MetricName': u'queue_length' + }, + {u'Dimensions': [{u'Name': u'queue', u'Value': u'test-pull'}], + u'Value': 1, + u'MetricName': u'queue_length'}]}) + if 'put_metric_alarm' in name: + metric_alarm_kwargs.append(kwargs) + + self.assertEquals(len(metric_alarm_kwargs), 2) + self.assertEquals(metric_alarm_kwargs[0]['AlarmName'], u'dev-stack test2 queue length over threshold') + self.assertEquals(metric_alarm_kwargs[1]['AlarmName'], u'dev-stack test-pull queue length over threshold') diff --git a/xqueue/settings.py b/xqueue/settings.py index 29c2243f..d705531d 100644 --- a/xqueue/settings.py +++ b/xqueue/settings.py @@ -148,3 +148,19 @@ # This is the list of users managed by update_users USERS = None + +# If you use count_queued_submissions to submit data to AWS CloudWatch you'll need to +# provide some information for how to construct the metrics and alarms. +# It will store metrics in a namespace of xqueue/environment-deployment and create an alarm +# for each queue that uses the default_threshold. If you want a different threshold +# for a given queue, thresholds has a dictionary of "queue name" : "custom limit". +# All alarms share the sns_arn. +CLOUDWATCH_QUEUE_COUNT_METRICS = { + 'environment': 'dev', + 'deployment': 'stack', + 'sns_arn': 'arn:aws:sns:::', + 'default_threshold': 50, + 'thresholds': { + 'test-pull': 100 + } +}