Merge pull request #260 from edx/jibsheet/xqueue-cloudwatch-alarms

Add a cloudwatch argument to count_queued_submissions
openedx · Jun 22, 2018 · f031bcf · f031bcf
2 parents b0356de + 0c2b33e
commit f031bcf
Show file tree

Hide file tree

Showing 8 changed files with 181 additions and 27 deletions.
diff --git a/queue/management/commands/count_queued_submissions.py b/queue/management/commands/count_queued_submissions.py
@@ -4,8 +4,12 @@
 
 from __future__ import unicode_literals
 
+from itertools import izip_longest
 from queue.models import Submission
 
+import backoff
+import boto3
+import botocore
 from django.conf import settings
 from django.core.management.base import BaseCommand, CommandError
 from django.db.models import Count
@@ -27,6 +31,11 @@ def add_arguments(self, parser):
             action='store_true',
             help='Submit New Relic custom metrics'
         )
+        parser.add_argument(
+            '--cloudwatch',
+            action='store_true',
+            help='Submit CloudWatch custom metrics'
+        )
 
     def handle(self, *args, **options):
         """
@@ -48,6 +57,10 @@ def handle(self, *args, **options):
         if use_newrelic:
             self.send_nr_metrics(queue_counts)
 
+        use_cloudwatch = options.get('cloudwatch')
+        if use_cloudwatch:
+            self.send_cloudwatch_metrics(queue_counts)
+
     def pretty_print_queues(self, queue_counts):
         """
         Send a tabulated log output of the queues and the counts to the console
@@ -73,3 +86,94 @@ def send_nr_metrics(self, queue_counts):
                 'Custom/XQueueLength/{}[submissions]'.format(queue['queue_name']),
                 queue['queue_count'],
                 application=nr_app)
+
+    def send_cloudwatch_metrics(self, queue_counts):
+        """
+        Send custom metrics to AWS CloudWatch
+
+        Publishes one ``queue_length`` data point per queue to the
+        ``xqueue/<environment>-<deployment>`` namespace, then creates or
+        updates one alarm per queue on that metric.  Environment, deployment,
+        SNS topic and thresholds are read from
+        ``settings.CLOUDWATCH_QUEUE_COUNT_METRICS``.
+
+        Arguments:
+            queue_counts: iterable of dicts, each providing ``queue_name``
+                and ``queue_count``.
+
+        Raises:
+            KeyError: if ``environment``, ``deployment``, ``sns_arn`` or
+                ``default_threshold`` is missing from the configuration.
+        """
+        cloudwatch = CwBotoWrapper()
+        cloudwatch_configuration = settings.CLOUDWATCH_QUEUE_COUNT_METRICS
+        metric_name = 'queue_length'
+        dimension = 'queue'
+        environment = cloudwatch_configuration['environment']
+        deployment = cloudwatch_configuration['deployment']
+        namespace = "xqueue/{}-{}".format(environment, deployment)
+
+        # Everything below is the same for every alarm, so build it once
+        # instead of once per queue.
+        default_threshold = cloudwatch_configuration['default_threshold']
+        # Per-queue overrides are optional; queues not listed use the default.
+        thresholds = cloudwatch_configuration.get('thresholds', {})
+        actions = [cloudwatch_configuration['sns_arn']]
+        # Period is in seconds.  With a 600 second period and 6 evaluation
+        # periods, the Maximum has to stay over the threshold for a full hour
+        # before the alarm fires.
+        period = 600
+        evaluation_periods = 6
+        comparison_operator = "GreaterThanThreshold"
+        # A queue that reports no data is treated as healthy.
+        treat_missing_data = "notBreaching"
+        statistic = "Maximum"
+
+        # iterate 10 at a time through the list of queues to stay under AWS limits.
+        for queues in grouper(queue_counts, 10):
+            # grouper pads the last chunk with Nones and we want to skip those
+            queues = [q for q in queues if q is not None]
+            metric_data = [
+                {
+                    'MetricName': metric_name,
+                    'Dimensions': [{'Name': dimension, 'Value': queue['queue_name']}],
+                    'Value': queue['queue_count'],
+                }
+                for queue in queues
+            ]
+
+            if metric_data:
+                cloudwatch.put_metric_data(Namespace=namespace, MetricData=metric_data)
+
+            for queue in queues:
+                queue_name = queue['queue_name']
+                alarm_name = "{}-{} {} queue length over threshold".format(environment,
+                                                                           deployment,
+                                                                           queue_name)
+
+                print('Creating or updating alarm "{}"'.format(alarm_name))
+                cloudwatch.put_metric_alarm(AlarmName=alarm_name,
+                                            AlarmDescription=alarm_name,
+                                            Namespace=namespace,
+                                            MetricName=metric_name,
+                                            Dimensions=[{'Name': dimension, 'Value': queue_name}],
+                                            Period=period,
+                                            EvaluationPeriods=evaluation_periods,
+                                            TreatMissingData=treat_missing_data,
+                                            Threshold=thresholds.get(queue_name, default_threshold),
+                                            ComparisonOperator=comparison_operator,
+                                            Statistic=statistic,
+                                            InsufficientDataActions=actions,
+                                            OKActions=actions,
+                                            AlarmActions=actions)
+
+
+class CwBotoWrapper(object):
+    """
+    Thin wrapper around the boto3 CloudWatch client that retries failed calls.
+
+    Each wrapped call is retried with exponential backoff when it raises
+    botocore's ClientError, for at most ``max_tries`` attempts.
+    """
+    max_tries = 5
+
+    def __init__(self):
+        """Create the underlying boto3 CloudWatch client."""
+        self.client = boto3.client('cloudwatch')
+
+    @backoff.on_exception(backoff.expo,
+                          botocore.exceptions.ClientError,
+                          max_tries=max_tries)
+    def put_metric_data(self, *args, **kwargs):
+        """Forward all arguments to the client's put_metric_data call."""
+        client_call = self.client.put_metric_data
+        return client_call(*args, **kwargs)
+
+    @backoff.on_exception(backoff.expo,
+                          botocore.exceptions.ClientError,
+                          max_tries=max_tries)
+    def put_metric_alarm(self, *args, **kwargs):
+        """Forward all arguments to the client's put_metric_alarm call."""
+        client_call = self.client.put_metric_alarm
+        return client_call(*args, **kwargs)
+
+
+# Stolen right from the itertools recipes
+# https://docs.python.org/3/library/itertools.html#itertools-recipes
+def grouper(iterable, n, fillvalue=None):
+    """
+    Collect data into fixed-length chunks or blocks
+
+    The last chunk is padded with ``fillvalue`` when the input runs out:
+    grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
+    """
+    # Handing the SAME iterator to izip_longest n times makes every output
+    # tuple consume n consecutive items from the input.
+    shared = iter(iterable)
+    return izip_longest(*([shared] * n), fillvalue=fillvalue)
diff --git a/queue/management/commands/tests/test_count_queued_submissions.py b/queue/management/commands/tests/test_count_queued_submissions.py
@@ -53,3 +53,33 @@ def test_push_to_new_relic(self, mock_newrelic_agent):
                           mock_newrelic_agent.record_custom_metric.call_count)
 
         mock_newrelic_agent.record_custom_metric.has_calls(expected_nr_calls, any_order=True)
+
+    @patch('boto3.client')
+    def test_push_to_cloudwatch(self, mock_boto3):
+        """
+        --cloudwatch publishes one metric per queue in a single batch and
+        creates or updates one alarm per queue.
+        """
+        self._create_submission(queue_name="test-pull")
+        self._create_submission(queue_name="test2")
+        self._create_submission(queue_name="test2")
+        call_command('count_queued_submissions', '--cloudwatch', stdout=self.stdout)
+        self.assertRegexpMatches(self.stdout.getvalue(), r'test2\s*2\s*\ntest-pull\s*1')
+
+        # Collect the keyword arguments of every call made through the mocked
+        # client, so the assertions below fail if a call never happened.
+        metric_data_kwargs = []
+        metric_alarm_kwargs = []
+        for mock_call in mock_boto3.mock_calls:
+            name, _args, kwargs = mock_call
+            if 'put_metric_data' in name:
+                metric_data_kwargs.append(kwargs)
+            if 'put_metric_alarm' in name:
+                metric_alarm_kwargs.append(kwargs)
+
+        # Both queues fit in one batch, so exactly one put_metric_data call
+        self.assertEquals(len(metric_data_kwargs), 1)
+        self.assertEquals(len(metric_data_kwargs[0]['MetricData']), 2)
+        self.assertEquals(metric_data_kwargs[0],
+                          {'Namespace': u'xqueue/dev-stack',
+                           'MetricData': [
+                               {u'Dimensions': [{u'Name': u'queue', u'Value': u'test2'}],
+                                u'Value': 2,
+                                u'MetricName': u'queue_length'
+                                },
+                               {u'Dimensions': [{u'Name': u'queue', u'Value': u'test-pull'}],
+                                u'Value': 1,
+                                u'MetricName': u'queue_length'}]})
+
+        self.assertEquals(len(metric_alarm_kwargs), 2)
+        self.assertEquals(metric_alarm_kwargs[0]['AlarmName'], u'dev-stack test2 queue length over threshold')
+        self.assertEquals(metric_alarm_kwargs[1]['AlarmName'], u'dev-stack test-pull queue length over threshold')
diff --git a/requirements.txt b/requirements.txt
@@ -4,8 +4,9 @@
 #
 #    make upgrade
 #
-boto3==1.7.32
-botocore==1.10.32         # via boto3, s3transfer
+backoff==1.5.0
+boto3==1.7.41
+botocore==1.10.41         # via boto3, s3transfer
 certifi==2018.4.16        # via requests
 chardet==3.0.4            # via requests
 django-storages==1.6.6
@@ -14,19 +15,19 @@ docutils==0.14            # via botocore
 edx-django-release-util==0.3.1
 futures==3.2.0            # via isort, s3transfer
 gunicorn==0.16.1
-idna==2.6                 # via requests
+idna==2.7                 # via requests
 isort==4.3.4
 jmespath==0.9.3           # via boto3, botocore
 mysql-python==1.2.5
-newrelic==3.2.1.93
+newrelic==3.2.2.94
 path.py==11.0.1
 python-dateutil==2.7.3    # via botocore
 python-memcached==1.48
 python-termstyle==0.1.10
 pytz==2018.4
 pyyaml==3.12              # via edx-django-release-util
-requests==2.18.4
+requests==2.19.1
 s3transfer==0.1.13        # via boto3
 six==1.11.0               # via edx-django-release-util, python-dateutil
-urllib3==1.22             # via requests
+urllib3==1.23             # via requests
 wsgiref==0.1.2
diff --git a/requirements/base.in b/requirements/base.in
@@ -1,6 +1,7 @@
 # Core requirements for using this service
 
 MySQL-python==1.2.5
+backoff
 boto3
 django>=1.11,<1.12
 django-storages

diff --git a/requirements/dev.txt b/requirements/dev.txt
@@ -6,8 +6,9 @@
 #
 atomicwrites==1.1.5
 attrs==18.1.0
-boto3==1.7.32
-botocore==1.10.32
+backoff==1.5.0
+boto3==1.7.41
+botocore==1.10.41
 certifi==2018.4.16
 chardet==3.0.4
 click==6.7
@@ -21,32 +22,32 @@ first==2.0.1
 funcsigs==1.0.2
 futures==3.2.0
 gunicorn==0.16.1
-idna==2.6
+idna==2.7
 isort==4.3.4
 jmespath==0.9.3
 mock==2.0.0
 more-itertools==4.2.0
 mysql-python==1.2.5
-newrelic==3.2.1.93
+newrelic==3.2.2.94
 path.py==11.0.1
-pbr==4.0.3
+pbr==4.0.4
 pip-tools==2.0.2
 pluggy==0.6.0
 py==1.5.3
 pycodestyle==2.4.0
 pytest-cov==2.5.1
-pytest-django==3.2.1
-pytest==3.6.1
+pytest-django==3.3.0
+pytest==3.6.2
 python-dateutil==2.7.3
 python-memcached==1.48
 python-termstyle==0.1.10
 pytz==2018.4
 pyyaml==3.12
-requests==2.18.4
+requests==2.19.1
 s3transfer==0.1.13
 six==1.11.0
 tox-battery==0.5.1
 tox==3.0.0
-urllib3==1.22
+urllib3==1.23
 virtualenv==16.0.0
 wsgiref==0.1.2
diff --git a/requirements/test.txt b/requirements/test.txt
@@ -6,8 +6,9 @@
 #
 atomicwrites==1.1.5       # via pytest
 attrs==18.1.0             # via pytest
-boto3==1.7.32
-botocore==1.10.32
+backoff==1.5.0
+boto3==1.7.41
+botocore==1.10.41
 certifi==2018.4.16
 chardet==3.0.4
 coverage==4.5.1           # via pytest-cov
@@ -17,27 +18,27 @@ edx-django-release-util==0.3.1
 funcsigs==1.0.2           # via mock, pytest
 futures==3.2.0
 gunicorn==0.16.1
-idna==2.6
+idna==2.7
 isort==4.3.4
 jmespath==0.9.3
 mock==2.0.0
 more-itertools==4.2.0     # via pytest
 mysql-python==1.2.5
-newrelic==3.2.1.93
+newrelic==3.2.2.94
 path.py==11.0.1
-pbr==4.0.3                # via mock
+pbr==4.0.4                # via mock
 pluggy==0.6.0             # via pytest
 py==1.5.3                 # via pytest
 pytest-cov==2.5.1
-pytest-django==3.2.1
-pytest==3.6.1
+pytest-django==3.3.0
+pytest==3.6.2
 python-dateutil==2.7.3
 python-memcached==1.48
 python-termstyle==0.1.10
 pytz==2018.4
 pyyaml==3.12
-requests==2.18.4
+requests==2.19.1
 s3transfer==0.1.13
 six==1.11.0
-urllib3==1.22
+urllib3==1.23
 wsgiref==0.1.2
diff --git a/requirements/travis.txt b/requirements/travis.txt
@@ -8,12 +8,12 @@ certifi==2018.4.16        # via requests
 chardet==3.0.4            # via requests
 codecov==2.0.15
 coverage==4.5.1           # via codecov
-idna==2.6                 # via requests
+idna==2.7                 # via requests
 pluggy==0.6.0             # via tox
 py==1.5.3                 # via tox
-requests==2.18.4          # via codecov
+requests==2.19.1          # via codecov
 six==1.11.0               # via tox
 tox-battery==0.5.1
 tox==3.0.0
-urllib3==1.22             # via requests
+urllib3==1.23             # via requests
 virtualenv==16.0.0        # via tox
diff --git a/xqueue/settings.py b/xqueue/settings.py
@@ -148,3 +148,19 @@
 
 # This is the list of users managed by update_users
 USERS = None
+
+# If you use count_queued_submissions to submit data to AWS CloudWatch you'll need to
+# provide some information for how to construct the metrics and alarms.
+# It will store metrics in a namespace of xqueue/<environment>-<deployment> and create an
+# alarm for each queue using default_threshold.  If you want a different threshold
+# for a given queue, thresholds is a dictionary of "queue name": custom limit.
+# All alarms share the sns_arn, which is used for the alarm, OK and
+# insufficient-data actions.
+CLOUDWATCH_QUEUE_COUNT_METRICS = {
+    'environment': 'dev',
+    'deployment': 'stack',
+    # NOTE(review): looks like a placeholder ARN - presumably overridden per deployment
+    'sns_arn': 'arn:aws:sns:::',
+    'default_threshold': 50,
+    'thresholds': {
+        'test-pull': 100
+    }
+}