Add the whisper frontend and chart configuration #70

Merged · 2 commits · Jun 8, 2023
3 changes: 2 additions & 1 deletion .gitignore
@@ -2,4 +2,5 @@ data/*
*.tar.zst
__pycache__
.venv
.ipynb_checkpoints
.ipynb_checkpoints
.vscode/
13 changes: 11 additions & 2 deletions api/models.toml
@@ -18,7 +18,7 @@
[text-embedding-ada-002.metadata]
owned_by = 'Defense Unicorns'
permission = []
description = 'A pretend implementation of ada-0002 tha's actually all-MiniLM-L6-v2'
description = "A pretend implementation of ada-0002 that's actually all-MiniLM-L6-v2"
[text-embedding-ada-002.network]
url = 'sentence-transformers:50051'
type = 'gRPC'
@@ -29,4 +29,13 @@
description = ' By default, input text longer than 256 word pieces is truncated.'
[all-MiniLM-L6-v2.network]
url = 'sentence-transformers:50051'
type = 'gRPC'
type = 'gRPC'
[whisper-1]
[whisper-1.metadata]
owned_by = 'Defense Unicorns'
permission = []
description = "OpenAI's Whisper Large v2"
tasks = ["translate", "transcribe"]
[whisper-1.network]
url = 'http://localhost:8000'
type = 'http'
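
Note: the new whisper-1 registry entry is plain TOML, so its routing hints (type = 'http' and the tasks list) are available to anything that parses the file. A minimal sketch of reading it, assuming the api/models.toml path and Python 3.11+'s stdlib tomllib (the loader the API actually uses is not shown in this diff):

import tomllib  # stdlib since Python 3.11; older interpreters can use the third-party "toml" package

with open("api/models.toml", "rb") as f:
    registry = tomllib.load(f)

whisper = registry["whisper-1"]
print(whisper["metadata"]["tasks"])   # ['translate', 'transcribe']
print(whisper["network"]["type"])     # 'http', which routes to HttpClientAudioModel (see models.py below)
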
1 change: 1 addition & 0 deletions api/requirements.txt
@@ -4,6 +4,7 @@ colorama==0.4.6
fastapi==0.96.0
grpcio==1.54.2
h11==0.14.0
httpx==0.24.0
idna==3.4
protobuf==4.23.2
pydantic==1.10.8
22 changes: 21 additions & 1 deletion api/simple_ai/models.py
@@ -1,6 +1,8 @@
import os
import pathlib
import sys
import asyncio
import httpx
from dataclasses import dataclass
from typing import Union

@@ -166,13 +168,30 @@ def stream(
)


@dataclass(unsafe_hash=True)
class HttpClientAudioModel:
    name: str
    url: str

    async def run(self, audio, task: str, source_language: str = "en") -> str:
        async with httpx.AsyncClient() as client:
            files = {"file": audio.file}
            data = {"task": task, "language": source_language}
            response = await client.post(
                f"{self.url}/transcribe", files=files, data=data, timeout=None
            )
            return response.json()["result"]


def select_model_type(model_interface: str = "gRPC", task: str = "complete"):
    if model_interface == "gRPC":
        if task == "embed":
            return RpcEmbeddingLanguageModel
        if task == "chat":
            return RpcChatLanguageModel
        return RpcCompletionLanguageModel
    if model_interface == "http":
        return HttpClientAudioModel
    return RpcCompletionLanguageModel


@@ -188,7 +207,8 @@ def get_model(model_id: str, metadata: dict = MODELS_ZOO, task: str = "complete"

def list_models(metadata: dict = MODELS_ZOO) -> list:
return dict(
data=[{"id": key, **meta.get("metadata")} for key, meta in metadata.items()], object="list"
data=[{"id": key, **meta.get("metadata")} for key, meta in metadata.items()],
object="list",
)


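A quick usage sketch for the new HttpClientAudioModel, assuming a whisper backend listening on http://localhost:8000 and a stand-in for FastAPI's UploadFile (run() only touches the .file attribute):

import asyncio

from simple_ai.models import HttpClientAudioModel


class _Upload:
    # Stand-in for fastapi.UploadFile; HttpClientAudioModel.run only reads .file
    def __init__(self, path: str):
        self.file = open(path, "rb")


async def main():
    whisper = HttpClientAudioModel(name="whisper-1", url="http://localhost:8000")
    text = await whisper.run(_Upload("sample.wav"), task="transcribe", source_language="en")
    print(text)


asyncio.run(main())
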
104 changes: 73 additions & 31 deletions api/simple_ai/server.py
@@ -4,15 +4,21 @@
from typing import Annotated, Union, Optional, List
from uuid import uuid4
import json
import tiktoken


import fastapi
from fastapi import Body, FastAPI, Response, Request, middleware, exceptions
from fastapi import Body, FastAPI, Response, Request, Form, UploadFile, File, exceptions
from fastapi.responses import StreamingResponse
from fastapi.responses import JSONResponse
from pydantic import ValidationError

from .api_models import ChatCompletionInput, CompletionInput, EmbeddingInput, InstructionInput
from .api_models import (
ChatCompletionInput,
CompletionInput,
EmbeddingInput,
InstructionInput,
)
from .dummy import dummy_chat, dummy_complete, dummy_edit, dummy_embedding, dummy_engine
from .models import get_model, get_model_infos, list_models
from .utils import (
@@ -39,21 +45,9 @@
contact={
"name": "TOMMY",
"url": "https://github.com/defenseunicorns",
},

},
)

# @app.middleware("http")
# async def log_stuff(request: Request, call_next):
# logger.error(f"{request.method} {request.url}")
# logger.error(f"{ request }")
# logger.error(f"Headers: { request.headers }")
# b = await request.body()
# logger.error(f"Request Body: { b }")
# response = await call_next(request)
# # logger.error(response)
# # logger.error(response.status_code)
# return response

@app.exception_handler(exceptions.RequestValidationError)
@app.exception_handler(ValidationError)
@@ -62,10 +56,11 @@ async def validation_exception_handler(request, exc):
exc_json = json.loads(exc.json())
response = {"message": [], "data": None}
for error in exc_json:
response['message'].append(f"{error['loc']}: {error['msg']}")
response["message"].append(f"{error['loc']}: {error['msg']}")

return JSONResponse(response, status_code=422)


async def http422_error_handler(
_: Request, exc: Union[exceptions.RequestValidationError, ValidationError]
) -> JSONResponse:
@@ -74,15 +69,18 @@ async def http422_error_handler(
{"errors": exc.errors()}, status_code=exceptions.HTTP_422_UNPROCESSABLE_ENTITY
)


app.add_exception_handler(ValidationError, http422_error_handler)
app.add_exception_handler(exceptions.RequestValidationError, http422_error_handler)
app.debug = True


# Models
@app.get("/models")
async def show_models2():
return list_models()


# Models
@app.get("/models/")
async def show_models():
@@ -125,7 +123,9 @@ async def complete(
best_of=body.best_of,
logit_bias=body.logit_bias,
)
output = format_autocompletion_response(model_name=llm.name, predictions=predictions)
output = format_autocompletion_response(
model_name=llm.name, predictions=predictions
)
return output

predictions_stream = llm.stream_complete(
@@ -149,7 +149,9 @@ async def complete(
uuid = uuid4().hex
current_timestamp = int(dt.now().timestamp())
postprocessed = map(
partial(format_autocompletion_stream_response, current_timestamp, uuid, body.model),
partial(
format_autocompletion_stream_response, current_timestamp, uuid, body.model
),
predictions_stream,
)
with_finaliser = chain(postprocessed, ("data: [DONE]\n",))
@@ -164,7 +166,10 @@ async def chat_complete(
background_tasks: fastapi.background.BackgroundTasks,
):
llm = get_model(model_id=body.model, task="chat")
messages = [[message.get("role", ""), message.get("content", "")] for message in body.messages]
messages = [
[message.get("role", ""), message.get("content", "")]
for message in body.messages
]
if not body.stream:
predictions = llm.chat(
messages=messages,
@@ -200,7 +205,8 @@ async def chat_complete(
uuid = uuid4().hex
current_timestamp = int(dt.now().timestamp())
postprocessed = map(
partial(format_chat_delta_response, current_timestamp, uuid, body.model), predictions_stream
partial(format_chat_delta_response, current_timestamp, uuid, body.model),
predictions_stream,
)

with_finaliser = chain(postprocessed, ("data: [DONE]\n",))
@@ -223,11 +229,13 @@ async def edit(body: Annotated[InstructionInput, Body(example=dummy_edit)]):
output = format_edits_response(model_name=llm.name, predictions=predictions)
return output


# Models
@app.get("/embeddings")
async def embeddings():
return list_models()


# Embeddings
@app.post("/embeddings")
async def embed(body: Annotated[EmbeddingInput, Body(example=dummy_embedding)]):
@@ -241,27 +249,61 @@ async def embed(body: Annotated[EmbeddingInput, Body(example=dummy_embedding)]):
return output


# Speech to text
@app.post("/audio/transcriptions")
async def transcribe(
file: Annotated[
UploadFile,
File(
description="The audio file object (not file name) to transcribe, in one of these formats: mp3, mp4, mpeg, mpga, m4a, wav, or webm."
),
],
model: Annotated[str, Form(description="The model to use")] = "whisper-1",
language: Annotated[
str,
Form(description="The ISO 3166-1 alpha-2 (two letter language) code to use"),
] = "en",
):
model = get_model(model_id=model)
result = await model.run(file, "transcribe", language)
return {"text": result}


@app.post("/audio/translations")
async def transcribe(
file: Annotated[
UploadFile,
File(
description="The audio file object (not file name) to transcribe, in one of these formats: mp3, mp4, mpeg, mpga, m4a, wav, or webm."
),
],
model: Annotated[str, Form(description="The model to use")] = "whisper-1",
language: Annotated[
str,
Form(description="The ISO 3166-1 alpha-2 (two letter language) code to use"),
] = "en",
):
model = get_model(model_id=model)
result = await model.run(file, "translate", language)
return {"text": result}


class EngineEmbedding(BaseModel):
encoding_format: Optional[str]
input: Union[List[List[int]], list]
input: Union[List[List[int]], list]
prompt: Optional[List[str]]


import tiktoken

@app.post("/engines/{model_id}/embeddings")
async def embed2(model_id: str, body: Annotated[EngineEmbedding, Body(example=dummy_engine)]):
# async def embed2(model_id: str, body: Annotated[EngineEmbedding, Body(example=dummy_embedding)]):
logger.error(f"Request for model: { model_id} with body { body }")
# return "WOOHOO"
async def embed2(
model_id: str, body: Annotated[EngineEmbedding, Body(example=dummy_engine)]
):
llm = get_model(model_id=model_id, task="embed")
encoding = tiktoken.model.encoding_for_model(model_id)
body.prompt = [ encoding.decode(input) for input in body.input]

body.prompt = [encoding.decode(input) for input in body.input]
logger.error(f"Decoded: { body.prompt}")


# results = [llm.embed(inputs=prompt) for prompt in body.prompt]
results = llm.embed(inputs=body.prompt)
output = format_embeddings_results(model_name=llm.name, embeddings=results)
return output
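
Both new audio routes accept multipart form data, so any HTTP client can exercise them. A hedged end-to-end example with httpx; the API's host and port are deployment-specific and assumed here:

import httpx

with open("speech.wav", "rb") as f:
    resp = httpx.post(
        "http://localhost:8080/audio/transcriptions",  # assumed host/port
        files={"file": ("speech.wav", f, "audio/wav")},
        data={"model": "whisper-1", "language": "en"},
        timeout=None,
    )
resp.raise_for_status()
print(resp.json()["text"])
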
5 changes: 2 additions & 3 deletions chart/Chart.yaml
@@ -23,7 +23,6 @@ version: 0.0.1
# It is recommended to use it with quotes.
appVersion: "1.16.0"


annotations:
helm.sh/images: |
- name: api
@@ -32,5 +31,5 @@ annotations:
image: ghcr.io/defenseunicorns/leapfrogai/stablelm-3b:0.0.1
- name: embeddings
image: ghcr.io/defenseunicorns/leapfrogai/embeddings:0.0.1


- name: whisper
image: ghcr.io/defenseunicorns/leapfrogai/whisper:0.0.1
13 changes: 11 additions & 2 deletions chart/templates/api/configmap.yaml
@@ -32,5 +32,14 @@ data:
url = 'sentence-transformers:50051'
type = 'gRPC'
{{- end -}}


{{ if .Values.models.whisper.enabled }}
[whisper-1]
[whisper-1.metadata]
owned_by = 'Defense Unicorns'
permission = []
description = "OpenAI's Whisper Large v2"
tasks = ["translate", "transcribe"]
[whisper-1.network]
url = 'http://whisper:8000'
type = 'http'
{{- end -}}
28 changes: 28 additions & 0 deletions chart/templates/whisper/deployment.yaml
@@ -0,0 +1,28 @@
{{ if .Values.models.whisper.enabled }}
apiVersion: apps/v1
kind: Deployment
metadata:
name: whisper-deployment
spec:
replicas: 1
selector:
matchLabels:
app: whisper
template:
metadata:
labels:
app: whisper
spec:
containers:
- name: whisper-container
# imagePullPolicy: Always
imagePullPolicy: Always # Building on the node currently so we don't have to push/pull
image: ghcr.io/defenseunicorns/leapfrogai/whisper:0.0.1
ports:
- containerPort: 8000
resources:
limits:
nvidia.com/gpu: 1
requests:
nvidia.com/gpu: 1
{{ end }}
15 changes: 15 additions & 0 deletions chart/templates/whisper/service.yaml
@@ -0,0 +1,15 @@
{{ if .Values.models.whisper.enabled }}
apiVersion: v1
kind: Service
metadata:
name: whisper
spec:
selector:
app: whisper
ports:
- name: http
protocol: TCP
port: 8000
targetPort: 8000
type: ClusterIP
{{ end }}
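
Both whisper templates gate on .Values.models.whisper.enabled, so the chart's values.yaml presumably needs a matching block; the key path below is inferred from the templates and is not part of this diff:

models:
  whisper:
    enabled: true

The configmap's in-cluster URL (url = 'http://whisper:8000') matches this Service's name and port, so the API pod reaches the whisper backend over cluster DNS.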