Add the whisper frontend and chart configuration #70

Merged · 2 commits · Jun 8, 2023
3 changes: 2 additions & 1 deletion .gitignore
@@ -2,4 +2,5 @@ data/*
*.tar.zst
__pycache__
.venv
.ipynb_checkpoints
.ipynb_checkpoints
.vscode/
13 changes: 11 additions & 2 deletions api/models.toml
@@ -18,7 +18,7 @@
[text-embedding-ada-002.metadata]
owned_by = 'Defense Unicorns'
permission = []
description = 'A pretend implementation of ada-0002 tha's actually all-MiniLM-L6-v2'
description = "A pretend implementation of ada-0002 that's actually all-MiniLM-L6-v2"
[text-embedding-ada-002.network]
url = 'sentence-transformers:50051'
type = 'gRPC'
@@ -29,4 +29,13 @@
description = ' By default, input text longer than 256 word pieces is truncated.'
[all-MiniLM-L6-v2.network]
url = 'sentence-transformers:50051'
type = 'gRPC'
type = 'gRPC'
[whisper-1]
[whisper-1.metadata]
owned_by = 'Defense Unicorns'
permission = []
description = "OpenAI's Whisper Large v2"
tasks = ["translate", "transcribe"]
[whisper-1.network]
url = 'http://localhost:8000'
type = 'http'
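
Note: the new whisper-1 registry entry is plain TOML, so its routing hints (type = 'http' and the tasks list) are available to anything that parses the file. A minimal sketch of reading it, assuming the api/models.toml path and Python 3.11+'s stdlib tomllib (the loader the API actually uses is not shown in this diff):

import tomllib  # stdlib since Python 3.11; older interpreters can use the third-party "toml" package

with open("api/models.toml", "rb") as f:
    registry = tomllib.load(f)

whisper = registry["whisper-1"]
print(whisper["metadata"]["tasks"])   # ['translate', 'transcribe']
print(whisper["network"]["type"])     # 'http', which routes to HttpClientAudioModel (see models.py below)
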
1 change: 1 addition & 0 deletions api/requirements.txt
@@ -4,6 +4,7 @@ colorama==0.4.6
fastapi==0.96.0
grpcio==1.54.2
h11==0.14.0
httpx==0.24.0
idna==3.4
protobuf==4.23.2
pydantic==1.10.8
22 changes: 21 additions & 1 deletion api/simple_ai/models.py
@@ -1,6 +1,8 @@
import os
import pathlib
import sys
import asyncio
import httpx
from dataclasses import dataclass
from typing import Union

@@ -166,13 +168,30 @@ def stream(
)


@dataclass(unsafe_hash=True)
class HttpClientAudioModel:
    name: str
    url: str

    async def run(self, audio, task: str, source_language: str = "en") -> str:
        async with httpx.AsyncClient() as client:
            files = {"file": audio.file}
            data = {"task": task, "language": source_language}
            response = await client.post(
                f"{self.url}/transcribe", files=files, data=data, timeout=None
            )
            return response.json()["result"]


def select_model_type(model_interface: str = "gRPC", task: str = "complete"):
    if model_interface == "gRPC":
        if task == "embed":
            return RpcEmbeddingLanguageModel
        if task == "chat":
            return RpcChatLanguageModel
        return RpcCompletionLanguageModel
    if model_interface == "http":
        return HttpClientAudioModel
    return RpcCompletionLanguageModel


@@ -188,7 +207,8 @@ def get_model(model_id: str, metadata: dict = MODELS_ZOO, task: str = "complete"

def list_models(metadata: dict = MODELS_ZOO) -> list:
return dict(
data=[{"id": key, **meta.get("metadata")} for key, meta in metadata.items()], object="list"
data=[{"id": key, **meta.get("metadata")} for key, meta in metadata.items()],
object="list",
)


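A quick usage sketch for the new HttpClientAudioModel, assuming a whisper backend listening on http://localhost:8000 and a stand-in for FastAPI's UploadFile (run() only touches the .file attribute):

import asyncio

from simple_ai.models import HttpClientAudioModel


class _Upload:
    # Stand-in for fastapi.UploadFile; HttpClientAudioModel.run only reads .file
    def __init__(self, path: str):
        self.file = open(path, "rb")


async def main():
    whisper = HttpClientAudioModel(name="whisper-1", url="http://localhost:8000")
    text = await whisper.run(_Upload("sample.wav"), task="transcribe", source_language="en")
    print(text)


asyncio.run(main())
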
104 changes: 73 additions & 31 deletions api/simple_ai/server.py
@@ -4,15 +4,21 @@
from typing import Annotated, Union, Optional, List
from uuid import uuid4
import json
import tiktoken


import fastapi
from fastapi import Body, FastAPI, Response, Request, middleware, exceptions
from fastapi import Body, FastAPI, Response, Request, Form, UploadFile, File, exceptions
from fastapi.responses import StreamingResponse
from fastapi.responses import JSONResponse
from pydantic import ValidationError

from .api_models import ChatCompletionInput, CompletionInput, EmbeddingInput, InstructionInput
from .api_models import (
ChatCompletionInput,
CompletionInput,
EmbeddingInput,
InstructionInput,
)
from .dummy import dummy_chat, dummy_complete, dummy_edit, dummy_embedding, dummy_engine
from .models import get_model, get_model_infos, list_models
from .utils import (
@@ -39,21 +45,9 @@
contact={
"name": "TOMMY",
"url": "https://github.com/defenseunicorns",
},

},
)

# @app.middleware("http")
# async def log_stuff(request: Request, call_next):
# logger.error(f"{request.method} {request.url}")
# logger.error(f"{ request }")
# logger.error(f"Headers: { request.headers }")
# b = await request.body()
# logger.error(f"Request Body: { b }")
# response = await call_next(request)
# # logger.error(response)
# # logger.error(response.status_code)
# return response

@app.exception_handler(exceptions.RequestValidationError)
@app.exception_handler(ValidationError)
@@ -62,10 +56,11 @@ async def validation_exception_handler(request, exc):
exc_json = json.loads(exc.json())
response = {"message": [], "data": None}
for error in exc_json:
response['message'].append(f"{error['loc']}: {error['msg']}")
response["message"].append(f"{error['loc']}: {error['msg']}")

return JSONResponse(response, status_code=422)


async def http422_error_handler(
_: Request, exc: Union[exceptions.RequestValidationError, ValidationError]
) -> JSONResponse:
@@ -74,15 +69,18 @@ async def http422_error_handler(
{"errors": exc.errors()}, status_code=exceptions.HTTP_422_UNPROCESSABLE_ENTITY
)


app.add_exception_handler(ValidationError, http422_error_handler)
app.add_exception_handler(exceptions.RequestValidationError, http422_error_handler)
app.debug = True


# Models
@app.get("/models")
async def show_models2():
return list_models()


# Models
@app.get("/models/")
async def show_models():
@@ -125,7 +123,9 @@ async def complete(
best_of=body.best_of,
logit_bias=body.logit_bias,
)
output = format_autocompletion_response(model_name=llm.name, predictions=predictions)
output = format_autocompletion_response(
model_name=llm.name, predictions=predictions
)
return output

predictions_stream = llm.stream_complete(
@@ -149,7 +149,9 @@ async def complete(
uuid = uuid4().hex
current_timestamp = int(dt.now().timestamp())
postprocessed = map(
partial(format_autocompletion_stream_response, current_timestamp, uuid, body.model),
partial(
format_autocompletion_stream_response, current_timestamp, uuid, body.model
),
predictions_stream,
)
with_finaliser = chain(postprocessed, ("data: [DONE]\n",))
@@ -164,7 +166,10 @@ async def chat_complete(
background_tasks: fastapi.background.BackgroundTasks,
):
llm = get_model(model_id=body.model, task="chat")
messages = [[message.get("role", ""), message.get("content", "")] for message in body.messages]
messages = [
[message.get("role", ""), message.get("content", "")]
for message in body.messages
]
if not body.stream:
predictions = llm.chat(
messages=messages,
@@ -200,7 +205,8 @@ async def chat_complete(
uuid = uuid4().hex
current_timestamp = int(dt.now().timestamp())
postprocessed = map(
partial(format_chat_delta_response, current_timestamp, uuid, body.model), predictions_stream
partial(format_chat_delta_response, current_timestamp, uuid, body.model),
predictions_stream,
)

with_finaliser = chain(postprocessed, ("data: [DONE]\n",))
@@ -223,11 +229,13 @@ async def edit(body: Annotated[InstructionInput, Body(example=dummy_edit)]):
output = format_edits_response(model_name=llm.name, predictions=predictions)
return output


# Models
@app.get("/embeddings")
async def embeddings():
return list_models()


# Embeddings
@app.post("/embeddings")
async def embed(body: Annotated[EmbeddingInput, Body(example=dummy_embedding)]):
@@ -241,27 +249,61 @@ async def embed(body: Annotated[EmbeddingInput, Body(example=dummy_embedding)]):
return output


# Speech to text
@app.post("/audio/transcriptions")
async def transcribe(
file: Annotated[
UploadFile,
File(
description="The audio file object (not file name) to transcribe, in one of these formats: mp3, mp4, mpeg, mpga, m4a, wav, or webm."
),
],
model: Annotated[str, Form(description="The model to use")] = "whisper-1",
language: Annotated[
str,
Form(description="The ISO 3166-1 alpha-2 (two letter language) code to use"),
] = "en",
):
model = get_model(model_id=model)
result = await model.run(file, "transcribe", language)
return {"text": result}


@app.post("/audio/translations")
async def transcribe(
file: Annotated[
UploadFile,
File(
description="The audio file object (not file name) to transcribe, in one of these formats: mp3, mp4, mpeg, mpga, m4a, wav, or webm."
),
],
model: Annotated[str, Form(description="The model to use")] = "whisper-1",
language: Annotated[
str,
Form(description="The ISO 3166-1 alpha-2 (two letter language) code to use"),
] = "en",
):
model = get_model(model_id=model)
result = await model.run(file, "translate", language)
return {"text": result}


class EngineEmbedding(BaseModel):
encoding_format: Optional[str]
input: Union[List[List[int]], list]
input: Union[List[List[int]], list]
prompt: Optional[List[str]]


import tiktoken

@app.post("/engines/{model_id}/embeddings")
async def embed2(model_id: str, body: Annotated[EngineEmbedding, Body(example=dummy_engine)]):
# async def embed2(model_id: str, body: Annotated[EngineEmbedding, Body(example=dummy_embedding)]):
logger.error(f"Request for model: { model_id} with body { body }")
# return "WOOHOO"
async def embed2(
model_id: str, body: Annotated[EngineEmbedding, Body(example=dummy_engine)]
):
llm = get_model(model_id=model_id, task="embed")
encoding = tiktoken.model.encoding_for_model(model_id)
body.prompt = [ encoding.decode(input) for input in body.input]

body.prompt = [encoding.decode(input) for input in body.input]
logger.error(f"Decoded: { body.prompt}")


# results = [llm.embed(inputs=prompt) for prompt in body.prompt]
results = llm.embed(inputs=body.prompt)
output = format_embeddings_results(model_name=llm.name, embeddings=results)
return output
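
Both new audio routes accept multipart form data, so any HTTP client can exercise them. A hedged end-to-end example with httpx; the API's host and port are deployment-specific and assumed here:

import httpx

with open("speech.wav", "rb") as f:
    resp = httpx.post(
        "http://localhost:8080/audio/transcriptions",  # assumed host/port
        files={"file": ("speech.wav", f, "audio/wav")},
        data={"model": "whisper-1", "language": "en"},
        timeout=None,
    )
resp.raise_for_status()
print(resp.json()["text"])
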
5 changes: 2 additions & 3 deletions chart/Chart.yaml
@@ -23,7 +23,6 @@ version: 0.0.1
# It is recommended to use it with quotes.
appVersion: "1.16.0"


annotations:
helm.sh/images: |
- name: api
@@ -32,5 +31,5 @@ annotations:
image: ghcr.io/defenseunicorns/leapfrogai/stablelm-3b:0.0.1
- name: embeddings
image: ghcr.io/defenseunicorns/leapfrogai/embeddings:0.0.1


- name: whisper
image: ghcr.io/defenseunicorns/leapfrogai/whisper:0.0.1
13 changes: 11 additions & 2 deletions chart/templates/api/configmap.yaml
@@ -32,5 +32,14 @@ data:
url = 'sentence-transformers:50051'
type = 'gRPC'
{{- end -}}


{{ if .Values.models.whisper.enabled }}
[whisper-1]
[whisper-1.metadata]
owned_by = 'Defense Unicorns'
permission = []
description = "OpenAI's Whisper Large v2"
tasks = ["translate", "transcribe"]
[whisper-1.network]
url = 'http://whisper:8000'
type = 'http'
{{- end -}}
28 changes: 28 additions & 0 deletions chart/templates/whisper/deployment.yaml
@@ -0,0 +1,28 @@
{{ if .Values.models.whisper.enabled }}
apiVersion: apps/v1
kind: Deployment
metadata:
name: whisper-deployment
spec:
replicas: 1
selector:
matchLabels:
app: whisper
template:
metadata:
labels:
app: whisper
spec:
containers:
- name: whisper-container
# imagePullPolicy: Always
imagePullPolicy: Always # Building on the node currently so we don't have to push/pull
image: ghcr.io/defenseunicorns/leapfrogai/whisper:0.0.1
ports:
- containerPort: 8000
resources:
limits:
nvidia.com/gpu: 1
requests:
nvidia.com/gpu: 1
{{ end }}
15 changes: 15 additions & 0 deletions chart/templates/whisper/service.yaml
@@ -0,0 +1,15 @@
{{ if .Values.models.whisper.enabled }}
apiVersion: v1
kind: Service
metadata:
name: whisper
spec:
selector:
app: whisper
ports:
- name: http
protocol: TCP
port: 8000
targetPort: 8000
type: ClusterIP
{{ end }}
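
Both whisper templates gate on .Values.models.whisper.enabled, so the chart's values.yaml presumably needs a matching block; the key path below is inferred from the templates and is not part of this diff:

models:
  whisper:
    enabled: true

The configmap's in-cluster URL (url = 'http://whisper:8000') matches this Service's name and port, so the API pod reaches the whisper backend over cluster DNS.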