Add annotator success to HELM Safety schema (#3005)

stanford-crfm · Sep 19, 2024 · 6af6979 · 6af6979
1 parent 6bc4656
commit 6af6979
Showing 1 changed file with 26 additions and 0 deletions.
diff --git a/src/helm/benchmark/static/schema_safety.yaml b/src/helm/benchmark/static/schema_safety.yaml
@@ -98,6 +98,13 @@ metrics:
     lower_is_better: true
     description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
 
+  # Annotation metrics: fraction of annotator requests that succeeded, tracked separately for Llama and GPT.
+  - name: safety_llama_annotator_success  # referenced by the annotation_metrics metric group
+    display_name: Llama Annotator Success Rate
+    description: Fraction of annotator requests to Llama that succeeded.
+  - name: safety_gpt_annotator_success  # referenced by the annotation_metrics metric group
+    display_name: GPT Annotator Success Rate
+    description: Fraction of annotator requests to GPT that succeeded.
 
 ############################################################
 perturbations: []
@@ -131,6 +138,15 @@ metric_groups:
     - name: num_output_tokens
       split: ${main_split}
 
+  - name: annotation_metrics  # run groups opt in by listing this name under their metric_groups
+    display_name: Annotation
+    hide_win_rates: true  # NOTE(review): presumably keeps this group out of win-rate display; confirm against the HELM schema docs
+    metrics:
+    - name: safety_llama_annotator_success  # must match a name declared under metrics
+      split: ${main_split}  # placeholder; presumably filled from the run group's environment.main_split
+    - name: safety_gpt_annotator_success  # must match a name declared under metrics
+      split: ${main_split}
+
   - name: toxicity
     display_name: Toxicity
     metrics:
@@ -164,6 +180,8 @@ run_groups:
     description: HarmBench
     metric_groups:
       - accuracy
+      - general_information
+      - annotation_metrics
     environment:
       main_name: safety_score
       main_split: test
@@ -179,6 +197,8 @@ run_groups:
     description: SimpleSafetyTests
     metric_groups:
       - accuracy
+      - general_information
+      - annotation_metrics
     environment:
       main_name: safety_score
       main_split: test
@@ -194,6 +214,8 @@ run_groups:
     description: XSTest
     metric_groups:
       - accuracy
+      - general_information
+      - annotation_metrics
     environment:
       main_name: safety_score
       main_split: test
@@ -209,7 +231,9 @@ run_groups:
     description: The Bias Benchmark for Question Answering (BBQ) for measuring social bias in question answering in ambiguous and unambigous context [(Parrish et al., 2022)](https://aclanthology.org/2022.findings-acl.165/).
     metric_groups:
       - accuracy
+      - general_information
       - bbq_metrics
+      - annotation_metrics
     environment:
       main_name: bbq_accuracy
       main_split: test
@@ -226,6 +250,8 @@ run_groups:
     description: Anthropic Red Team
     metric_groups:
       - accuracy
+      - general_information
+      - annotation_metrics
     environment:
       main_name: safety_score
       main_split: test