Merge pull request #87 from ks6088ts-labs/feature/issue-86_add-ai-speech-feature

add AI Speech transcription features
ks6088ts-labs · Jun 9, 2024 · 53dd99f · 53dd99f
2 parents c1c1ce5 + 3334aea
commit 53dd99f
Show file tree

Hide file tree

Showing 8 changed files with 189 additions and 0 deletions.
diff --git a/azure_ai_speech.env.sample b/azure_ai_speech.env.sample
@@ -0,0 +1,2 @@
+AZURE_AI_SPEECH_ENDPOINT="https://<your-ai-speech-name>.cognitiveservices.azure.com"
+AZURE_AI_SPEECH_API_KEY="<your-api-key>"
diff --git a/backend/fastapi.py b/backend/fastapi.py
@@ -2,6 +2,7 @@
 from fastapi.openapi.utils import get_openapi
 
 from backend.routers import azure_ai_document_intelligence as azure_ai_document_intelligence_router
+from backend.routers import azure_ai_speech as azure_ai_speech_router
 from backend.routers import azure_ai_vision as azure_ai_vision_router
 from backend.routers import azure_cosmos_db as azure_cosmos_db_router
 from backend.routers import azure_event_grid as azure_event_grid_router
@@ -21,6 +22,7 @@
     azure_storage_blob_router.router,
     azure_storage_queue_router.router,
     azure_cosmos_db_router.router,
+    azure_ai_speech_router.router,
 ]:
     app.include_router(router)
 

diff --git a/backend/internals/azure_ai_speech.py b/backend/internals/azure_ai_speech.py
@@ -0,0 +1,69 @@
+from logging import getLogger
+from urllib.parse import urljoin
+
+import requests
+
+from backend.settings.azure_ai_speech import Settings
+
+logger = getLogger(__name__)
+
+
+class Client:
+    """Minimal REST client for the Azure AI Speech batch transcription API.
+
+    Talks to the ``speechtotext/v3.2-preview.2`` endpoints with ``requests`` and
+    authenticates every call with the subscription key taken from ``settings``.
+    """
+
+    # Versioned API root, relative to the resource endpoint. The trailing slash
+    # matters: ``urljoin`` replaces the last path segment of a base without one.
+    _API_ROOT = "speechtotext/v3.2-preview.2/"
+    # FIXME: remove the hardcoded model
+    _BASE_MODEL_ID = "e418c4a9-9937-4db7-b2c9-8afbff72d950"
+    # (connect, read) timeout in seconds; ``requests`` waits forever without one.
+    _REQUEST_TIMEOUT = (10, 60)
+
+    def __init__(self, settings: Settings) -> None:
+        self.settings = settings
+
+    def _url(self, path: str) -> str:
+        """Return the absolute URL of ``path`` below the versioned API root."""
+        return urljoin(
+            self.settings.azure_ai_speech_endpoint,
+            urljoin(self._API_ROOT, path),
+        )
+
+    def create_transcription(
+        self,
+        content_url: str,
+        locale: str,
+    ) -> str:
+        """Submit a transcription job and return its ID.
+
+        Args:
+            content_url: URL sent to the service as the single ``contentUrls`` entry.
+            locale: Value sent as the ``locale`` of the transcription.
+
+        Returns:
+            The last path segment of the ``self`` URL in the service response.
+
+        Raises:
+            requests.HTTPError: If the service answers with a 4xx/5xx status.
+        """
+        response = requests.post(
+            url=self._url("transcriptions"),
+            headers={
+                "Ocp-Apim-Subscription-Key": self.settings.azure_ai_speech_api_key,
+                "Content-Type": "application/json",
+            },
+            json={
+                "contentUrls": [
+                    content_url,
+                ],
+                "locale": locale,
+                "displayName": "My Transcription",
+                "model": {
+                    # The path must keep the "base/" segment. Joining the ID onto
+                    # ".../models/base" (no trailing slash) would drop "base" and
+                    # point at ".../models/<id>" instead.
+                    "self": self._url(f"models/base/{self._BASE_MODEL_ID}"),
+                },
+                "properties": {
+                    "diarizationEnabled": False,
+                    "displayFormWordLevelTimestampsEnabled": False,
+                    "wordLevelTimestampsEnabled": False,
+                    "profanityFilterMode": "Masked",
+                    "punctuationMode": "DictatedAndAutomatic",
+                    "timeToLive": "PT24H",  # https://learn.microsoft.com/en-us/azure/ai-services/speech-service/batch-transcription-create?pivots=rest-api
+                },
+            },
+            timeout=self._REQUEST_TIMEOUT,
+        )
+        # Fail with the HTTP error rather than a KeyError on the missing "self" key.
+        response.raise_for_status()
+        return response.json()["self"].rsplit("/", 1)[-1]
+
+    def get_transcription(
+        self,
+        transcription_id: str,
+    ) -> dict:
+        """Fetch a transcription resource and return the decoded JSON body.
+
+        The HTTP status is not checked, so an error payload from the service is
+        returned to the caller unchanged.
+
+        Args:
+            transcription_id: ID appended to the ``transcriptions/`` path.
+        """
+        response = requests.get(
+            url=self._url(f"transcriptions/{transcription_id}"),
+            headers={
+                "Ocp-Apim-Subscription-Key": self.settings.azure_ai_speech_api_key,
+            },
+            timeout=self._REQUEST_TIMEOUT,
+        )
+        return response.json()
diff --git a/backend/routers/azure_ai_speech.py b/backend/routers/azure_ai_speech.py
@@ -0,0 +1,48 @@
+from logging import getLogger
+
+from fastapi import APIRouter, status
+from fastapi.responses import JSONResponse
+
+from backend.internals.azure_ai_speech import Client
+from backend.schemas import azure_ai_speech as azure_ai_speech_schemas
+from backend.settings.azure_ai_speech import Settings
+
+logger = getLogger(__name__)
+
+# Module-wide client; Settings() is instantiated once, when this module is imported.
+client = Client(settings=Settings())
+
+# Every route declared below is served under the /azure_ai_speech prefix.
+router = APIRouter(
+    tags=["azure_ai_speech"],
+    prefix="/azure_ai_speech",
+    responses={404: {"description": "Not found"}},
+)
+
+
+@router.post(
+    "/transcriptions",
+    response_model=azure_ai_speech_schemas.CreateTranscriptionResponse,
+    status_code=200,
+)
+def create_transcription(request: azure_ai_speech_schemas.CreateTranscriptionRequest):
+    # Create a transcription from the request body and return its ID.
+    #
+    # Plain `def` on purpose: the client performs blocking `requests` calls, and
+    # FastAPI runs non-async handlers in its threadpool instead of on the event
+    # loop, so a slow upstream call no longer stalls every other request.
+    # (Comments instead of a docstring, so the OpenAPI description is unchanged.)
+    transcription_id = client.create_transcription(
+        content_url=request.content_url,
+        locale=request.locale,
+    )
+    return azure_ai_speech_schemas.CreateTranscriptionResponse(
+        transcription_id=transcription_id,
+    )
+
+
+@router.get(
+    "/transcriptions/{transcription_id}",
+    status_code=200,
+)
+def get_transcription(transcription_id: str):
+    # Return the client's JSON payload for the given transcription ID with HTTP 200.
+    #
+    # Plain `def` on purpose: the client performs a blocking `requests` call, and
+    # FastAPI runs non-async handlers in its threadpool instead of on the event loop.
+    # (Comments instead of a docstring, so the OpenAPI description is unchanged.)
+    return JSONResponse(
+        status_code=status.HTTP_200_OK,
+        content=client.get_transcription(
+            transcription_id=transcription_id,
+        ),
+    )
diff --git a/backend/schemas/azure_ai_speech.py b/backend/schemas/azure_ai_speech.py
@@ -0,0 +1,14 @@
+from logging import getLogger
+
+from pydantic import BaseModel
+
+logger = getLogger(__name__)
+
+
+# Request body for creating a transcription.
+# (Documented with comments: a class docstring would change the generated schema description.)
+class CreateTranscriptionRequest(BaseModel):
+    # URL of the content to transcribe; the default is a placeholder pattern, not a usable URL.
+    content_url: str = "https://<blob_account_name>.blob.core.windows.net/<blob_container_name>/<blob_name>"
+    # Locale passed along with the content URL; defaults to "ja-JP".
+    locale: str = "ja-JP"
+
+
+# Response body carrying the ID of a created transcription.
+# (Documented with comments: a class docstring would change the generated schema description.)
+class CreateTranscriptionResponse(BaseModel):
+    # Required: there is no default value.
+    transcription_id: str
diff --git a/backend/settings/azure_ai_speech.py b/backend/settings/azure_ai_speech.py
@@ -0,0 +1,11 @@
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+# Settings for the Azure AI Speech client.
+class Settings(BaseSettings):
+    # Both defaults are placeholders, not working values.
+    azure_ai_speech_endpoint: str = "https://<name>.cognitiveservices.azure.com"
+    azure_ai_speech_api_key: str = "<api-key>"
+
+    # Values are read from the UTF-8 encoded file "azure_ai_speech.env".
+    model_config = SettingsConfigDict(
+        env_file="azure_ai_speech.env",
+        env_file_encoding="utf-8",
+    )
diff --git a/docs/README.md b/docs/README.md
@@ -77,6 +77,17 @@
 
 - [Azure AI Document Intelligence client library for Python](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/documentintelligence/azure-ai-documentintelligence/README.md)
 
+### Azure AI Speech Service
+
+- [Whisper model via Azure AI Speech or via Azure OpenAI Service?](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/whisper-overview#whisper-model-via-azure-ai-speech-or-via-azure-openai-service)
+- [What is batch transcription?](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/batch-transcription)
+- [Locate audio files for batch transcription > Assign resource access role](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/batch-transcription-audio-data?tabs=portal#assign-resource-access-role)
+- [Use a Whisper model](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/batch-transcription-create?pivots=rest-api#use-a-whisper-model)
+- [Get batch transcription results](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/batch-transcription-get?pivots=rest-api)
+- [How to use the Speech Services Batch Transcription API from Python](https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/master/samples/batch/python/README.md)
+- [SpeechToText API specification](https://github.com/Azure/azure-rest-api-specs/blob/main/specification/cognitiveservices/data-plane/Speech/SpeechToText/preview/v3.2-preview.2/speechtotext.json)
+- [Azure-Samples/cognitive-services-speech-sdk/samples/batch/python/python-client/main.py](https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/82b4212f0905349d6607bd6f210d0e52305d5d6c/samples/batch/python/python-client/main.py)
+
 ### Azure Cosmos DB
 
 - [Manage Azure Cosmos DB for NoSQL resources with Bicep > Azure Cosmos DB account with autoscale throughput](https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/manage-with-bicep#azure-cosmos-db-account-with-autoscale-throughput)

diff --git a/tests/test_smoke_azure_ai_speech.py b/tests/test_smoke_azure_ai_speech.py
@@ -0,0 +1,32 @@
+from logging import getLogger
+
+import pytest
+
+from tests.utilities import RUN_TEST, client
+
+logger = getLogger(__name__)
+
+
+@pytest.mark.skipif(RUN_TEST, reason="need to launch the backend server first")
+def test_azure_ai_speech_create_transcription():
+    """Smoke test: POST a transcription request and expect HTTP 200."""
+    # NOTE(review): skipif skips the test when RUN_TEST is truthy; confirm the
+    # intended polarity against tests.utilities.
+    payload = {
+        "content_url": "https://<blob_account_name>.blob.core.windows.net/<blob_container_name>/<blob_name>",
+        "locale": "ja-JP",
+    }
+    endpoint = "/azure_ai_speech/{0}".format("transcriptions")
+    resp = client.post(url=endpoint, json=payload)
+    assert resp.status_code == 200
+    logger.info(f"response: {resp.json()}")
+
+
+@pytest.mark.skipif(RUN_TEST, reason="need to launch the backend server first")
+def test_azure_ai_speech_get_transcription():
+    """Smoke test: GET a transcription by a placeholder ID and expect HTTP 200."""
+    # NOTE(review): skipif skips the test when RUN_TEST is truthy; confirm the
+    # intended polarity against tests.utilities.
+    transcription_id = "<transcription_id>"
+    endpoint = "/azure_ai_speech/{0}".format(f"transcriptions/{transcription_id}")
+    resp = client.get(url=endpoint)
+    assert resp.status_code == 200
+    logger.info(f"response: {resp.json()}")