Skip to content

Commit

Permalink
Merge pull request #87 from ks6088ts-labs/feature/issue-86_add-ai-spe…
Browse files Browse the repository at this point in the history
…ech-feature

add AI Speech transcription features
  • Loading branch information
ks6088ts authored Jun 9, 2024
2 parents c1c1ce5 + 3334aea commit 53dd99f
Show file tree
Hide file tree
Showing 8 changed files with 189 additions and 0 deletions.
2 changes: 2 additions & 0 deletions azure_ai_speech.env.sample
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
AZURE_AI_SPEECH_ENDPOINT="https://<your-speech-service-name>.cognitiveservices.azure.com"
AZURE_AI_SPEECH_API_KEY="<your-api-key>"
2 changes: 2 additions & 0 deletions backend/fastapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from fastapi.openapi.utils import get_openapi

from backend.routers import azure_ai_document_intelligence as azure_ai_document_intelligence_router
from backend.routers import azure_ai_speech as azure_ai_speech_router
from backend.routers import azure_ai_vision as azure_ai_vision_router
from backend.routers import azure_cosmos_db as azure_cosmos_db_router
from backend.routers import azure_event_grid as azure_event_grid_router
Expand All @@ -21,6 +22,7 @@
azure_storage_blob_router.router,
azure_storage_queue_router.router,
azure_cosmos_db_router.router,
azure_ai_speech_router.router,
]:
app.include_router(router)

Expand Down
69 changes: 69 additions & 0 deletions backend/internals/azure_ai_speech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from logging import getLogger
from urllib.parse import urljoin

import requests

from backend.settings.azure_ai_speech import Settings

logger = getLogger(__name__)


class Client:
    """Minimal client for the Azure AI Speech batch transcription REST API.

    Requests are sent synchronously with ``requests`` and authenticated with
    the subscription key held in ``settings``.
    """

    # API version segment shared by every endpoint this client calls.
    API_PATH = "speechtotext/v3.2-preview.2/"
    # FIXME: remove the hardcoded default model
    DEFAULT_BASE_MODEL_ID = "e418c4a9-9937-4db7-b2c9-8afbff72d950"
    # Upper bound (seconds) for each HTTP call; without it ``requests`` may wait forever.
    REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self, settings: Settings) -> None:
        """Store the settings that provide the endpoint and API key."""
        self.settings = settings

    def _build_url(self, path: str) -> str:
        """Return the absolute URL of ``path`` under the versioned API root.

        The whole relative path is joined in a single step: ``urljoin`` replaces
        the last segment of a base that has no trailing slash, so joining
        ``.../models/base`` with an ID would silently drop ``base``.
        """
        return urljoin(
            self.settings.azure_ai_speech_endpoint,
            f"{self.API_PATH}{path}",
        )

    def create_transcription(
        self,
        content_url: str,
        locale: str,
        model_id: str = DEFAULT_BASE_MODEL_ID,
    ) -> str:
        """Create a batch transcription job and return its ID.

        Args:
            content_url: URL of the content to transcribe (sent as the only
                entry of ``contentUrls``).
            locale: Locale of the content, e.g. ``"ja-JP"``.
            model_id: ID of the base model to use; defaults to the model that
                was previously hardcoded.

        Returns:
            The last path segment of the ``self`` URL in the service response.

        Raises:
            requests.HTTPError: If the service answers with an error status.
        """
        response = requests.post(
            url=self._build_url("transcriptions"),
            headers={
                "Ocp-Apim-Subscription-Key": self.settings.azure_ai_speech_api_key,
                "Content-Type": "application/json",
            },
            json={
                "contentUrls": [
                    content_url,
                ],
                "locale": locale,
                "displayName": "My Transcription",
                "model": {
                    "self": self._build_url(f"models/base/{model_id}"),
                },
                "properties": {
                    "diarizationEnabled": False,
                    "displayFormWordLevelTimestampsEnabled": False,
                    "wordLevelTimestampsEnabled": False,
                    "profanityFilterMode": "Masked",
                    "punctuationMode": "DictatedAndAutomatic",
                    "timeToLive": "PT24H",  # https://learn.microsoft.com/en-us/azure/ai-services/speech-service/batch-transcription-create?pivots=rest-api
                },
            },
            timeout=self.REQUEST_TIMEOUT_SECONDS,
        )
        # Fail with the HTTP error itself instead of a KeyError on the missing "self" key.
        response.raise_for_status()
        return response.json()["self"].split("/")[-1]

    def get_transcription(
        self,
        transcription_id: str,
    ) -> dict:
        """Fetch a transcription and return the decoded JSON response body.

        The body is returned as-is whatever the HTTP status, so error payloads
        from the service are passed through to the caller.

        Args:
            transcription_id: ID returned by ``create_transcription``.
        """
        return requests.get(
            url=self._build_url(f"transcriptions/{transcription_id}"),
            headers={
                "Ocp-Apim-Subscription-Key": self.settings.azure_ai_speech_api_key,
            },
            timeout=self.REQUEST_TIMEOUT_SECONDS,
        ).json()
48 changes: 48 additions & 0 deletions backend/routers/azure_ai_speech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from logging import getLogger

from fastapi import APIRouter, status
from fastapi.concurrency import run_in_threadpool
from fastapi.responses import JSONResponse

from backend.internals.azure_ai_speech import Client
from backend.schemas import azure_ai_speech as azure_ai_speech_schemas
from backend.settings.azure_ai_speech import Settings

logger = getLogger(__name__)

# Single Azure AI Speech client shared by every route; built at import time.
client = Client(settings=Settings())

# All routes declared on this router are served under /azure_ai_speech.
router = APIRouter(
    prefix="/azure_ai_speech",
    tags=["azure_ai_speech"],
    responses={404: {"description": "Not found"}},
)


@router.post(
    "/transcriptions",
    response_model=azure_ai_speech_schemas.CreateTranscriptionResponse,
    status_code=200,
)
async def create_transcription(request: azure_ai_speech_schemas.CreateTranscriptionRequest):
    """Create a transcription job and return its ID.

    Args:
        request: Body carrying the content URL and the locale to transcribe.

    Returns:
        A ``CreateTranscriptionResponse`` holding the new transcription ID.
    """
    # The client issues a blocking HTTP request; run it in the threadpool so
    # this coroutine does not stall the event loop while waiting on it.
    transcription_id = await run_in_threadpool(
        client.create_transcription,
        content_url=request.content_url,
        locale=request.locale,
    )
    return azure_ai_speech_schemas.CreateTranscriptionResponse(
        transcription_id=transcription_id,
    )


@router.get(
    "/transcriptions/{transcription_id}",
    status_code=200,
)
async def get_transcription(transcription_id: str):
    """Return the transcription identified by ``transcription_id``.

    Args:
        transcription_id: ID previously returned by the create endpoint.

    Returns:
        A ``JSONResponse`` with status 200 whose content is whatever the
        client's ``get_transcription`` call returned.
    """
    # The client issues a blocking HTTP request; run it in the threadpool so
    # this coroutine does not stall the event loop while waiting on it.
    transcription = await run_in_threadpool(
        client.get_transcription,
        transcription_id=transcription_id,
    )
    return JSONResponse(
        status_code=status.HTTP_200_OK,
        content=transcription,
    )
14 changes: 14 additions & 0 deletions backend/schemas/azure_ai_speech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from logging import getLogger

from pydantic import BaseModel

logger = getLogger(__name__)


class CreateTranscriptionRequest(BaseModel):
    """Request body of the create-transcription endpoint."""

    # URL of the content to transcribe; the default is a placeholder Blob Storage URL.
    content_url: str = "https://<blob_account_name>.blob.core.windows.net/<blob_container_name>/<blob_name>"
    # Locale passed along with the content URL; defaults to "ja-JP".
    locale: str = "ja-JP"


class CreateTranscriptionResponse(BaseModel):
    """Response body of the create-transcription endpoint."""

    # Identifier of the created transcription (required, no default).
    transcription_id: str
11 changes: 11 additions & 0 deletions backend/settings/azure_ai_speech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    """Connection settings for Azure AI Speech.

    The defaults below are placeholders; ``model_config`` names
    ``azure_ai_speech.env`` (UTF-8) as the env file to read values from.
    """

    # Base URL of the Azure AI Speech resource.
    azure_ai_speech_endpoint: str = "https://<name>.cognitiveservices.azure.com"
    # Subscription key for the resource.
    azure_ai_speech_api_key: str = "<api-key>"

    model_config = SettingsConfigDict(
        env_file="azure_ai_speech.env",
        env_file_encoding="utf-8",
    )
11 changes: 11 additions & 0 deletions docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,17 @@

- [Azure AI Document Intelligence client library for Python](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/documentintelligence/azure-ai-documentintelligence/README.md)

### Azure AI Speech Service

- [Whisper model via Azure AI Speech or via Azure OpenAI Service?](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/whisper-overview#whisper-model-via-azure-ai-speech-or-via-azure-openai-service)
- [What is batch transcription?](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/batch-transcription)
- [Locate audio files for batch transcription > Assign resource access role](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/batch-transcription-audio-data?tabs=portal#assign-resource-access-role)
- [Use a Whisper model](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/batch-transcription-create?pivots=rest-api#use-a-whisper-model)
- [Get batch transcription results](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/batch-transcription-get?pivots=rest-api)
- [How to use the Speech Services Batch Transcription API from Python](https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/master/samples/batch/python/README.md)
- [SpeechToText API specification](https://github.com/Azure/azure-rest-api-specs/blob/main/specification/cognitiveservices/data-plane/Speech/SpeechToText/preview/v3.2-preview.2/speechtotext.json)
- [Azure-Samples/cognitive-services-speech-sdk/samples/batch/python/python-client/main.py](https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/82b4212f0905349d6607bd6f210d0e52305d5d6c/samples/batch/python/python-client/main.py)

### Azure Cosmos DB

- [Manage Azure Cosmos DB for NoSQL resources with Bicep > Azure Cosmos DB account with autoscale throughput](https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/manage-with-bicep#azure-cosmos-db-account-with-autoscale-throughput)
Expand Down
32 changes: 32 additions & 0 deletions tests/test_smoke_azure_ai_speech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from logging import getLogger

import pytest

from tests.utilities import RUN_TEST, client

logger = getLogger(__name__)


@pytest.mark.skipif(RUN_TEST, reason="need to launch the backend server first")
def test_azure_ai_speech_create_transcription():
    """Smoke test: POSTing a transcription request answers with HTTP 200."""
    # NOTE(review): the test is skipped when RUN_TEST is truthy - confirm that is the intended polarity.
    route_template = "/azure_ai_speech/{0}"
    payload = {
        "content_url": "https://<blob_account_name>.blob.core.windows.net/<blob_container_name>/<blob_name>",
        "locale": "ja-JP",
    }
    endpoint = route_template.format("transcriptions")
    response = client.post(url=endpoint, json=payload)
    assert response.status_code == 200
    logger.info(f"response: {response.json()}")


@pytest.mark.skipif(RUN_TEST, reason="need to launch the backend server first")
def test_azure_ai_speech_get_transcription():
    """Smoke test: fetching a transcription by ID answers with HTTP 200."""
    # NOTE(review): the test is skipped when RUN_TEST is truthy - confirm that is the intended polarity.
    route_template = "/azure_ai_speech/{0}"
    # Placeholder ID; replace with a real transcription ID when running against a live backend.
    transcription_id = "<transcription_id>"
    endpoint = route_template.format(f"transcriptions/{transcription_id}")
    response = client.get(url=endpoint)
    assert response.status_code == 200
    logger.info(f"response: {response.json()}")

0 comments on commit 53dd99f

Please sign in to comment.