add AI Speech features

ks6088ts-labs · Jun 9, 2024 · 3334aea · 3334aea
2 parents 1ce48e8 + 1222837
commit 3334aea
Show file tree

Hide file tree

Showing 10 changed files with 211 additions and 16 deletions.
diff --git a/azure_ai_speech.env.sample b/azure_ai_speech.env.sample
@@ -0,0 +1,2 @@
+AZURE_AI_SPEECH_ENDPOINT="https://<your-speech-service-name>.cognitiveservices.azure.com"
+AZURE_AI_SPEECH_API_KEY="<your-api-key>"
diff --git a/backend/fastapi.py b/backend/fastapi.py
@@ -2,6 +2,7 @@
 from fastapi.openapi.utils import get_openapi
 
 from backend.routers import azure_ai_document_intelligence as azure_ai_document_intelligence_router
+from backend.routers import azure_ai_speech as azure_ai_speech_router
 from backend.routers import azure_ai_vision as azure_ai_vision_router
 from backend.routers import azure_cosmos_db as azure_cosmos_db_router
 from backend.routers import azure_event_grid as azure_event_grid_router
@@ -21,6 +22,7 @@
     azure_storage_blob_router.router,
     azure_storage_queue_router.router,
     azure_cosmos_db_router.router,
+    azure_ai_speech_router.router,
 ]:
     app.include_router(router)
 

diff --git a/backend/internals/azure_ai_speech.py b/backend/internals/azure_ai_speech.py
@@ -0,0 +1,69 @@
+from logging import getLogger
+from urllib.parse import urljoin
+
+import requests
+
+from backend.settings.azure_ai_speech import Settings
+
+logger = getLogger(__name__)
+
+
+class Client:
+    """Thin REST client for the Azure AI Speech batch transcription API (v3.2-preview.2)."""
+
+    def __init__(self, settings: Settings) -> None:
+        """Store the settings that supply the service endpoint and API key."""
+        self.settings = settings
+
+    def create_transcription(
+        self,
+        content_url: str,
+        locale: str,
+    ) -> str:
+        """Submit a batch transcription job and return its ID.
+
+        Args:
+            content_url: URL sent as the single entry of ``contentUrls``.
+            locale: Locale sent with the request, e.g. ``"ja-JP"``.
+
+        Returns:
+            The last path segment of the ``self`` URL in the service response.
+
+        Raises:
+            requests.HTTPError: If the service answers with a 4xx/5xx status.
+        """
+        response = requests.post(
+            url=urljoin(
+                self.settings.azure_ai_speech_endpoint,
+                urljoin("speechtotext/v3.2-preview.2/", "transcriptions"),
+            ),
+            headers={
+                "Ocp-Apim-Subscription-Key": self.settings.azure_ai_speech_api_key,
+                "Content-Type": "application/json",
+            },
+            json={
+                "contentUrls": [
+                    content_url,
+                ],
+                "locale": locale,
+                "displayName": "My Transcription",
+                "model": {
+                    # FIXME: remove the hardcoded model
+                    # The trailing slash after "base" matters: urljoin replaces the last
+                    # path segment of a base URL that does not end in "/", which would
+                    # drop "base" and yield ".../models/<id>".
+                    "self": urljoin(
+                        urljoin(
+                            self.settings.azure_ai_speech_endpoint,
+                            "speechtotext/v3.2-preview.2/models/base/",
+                        ),
+                        "e418c4a9-9937-4db7-b2c9-8afbff72d950",
+                    ),
+                },
+                "properties": {
+                    "diarizationEnabled": False,
+                    "displayFormWordLevelTimestampsEnabled": False,
+                    "wordLevelTimestampsEnabled": False,
+                    "profanityFilterMode": "Masked",
+                    "punctuationMode": "DictatedAndAutomatic",
+                    "timeToLive": "PT24H",  # https://learn.microsoft.com/en-us/azure/ai-services/speech-service/batch-transcription-create?pivots=rest-api
+                },
+            },
+        )
+        # Surface service errors directly instead of failing below with a KeyError on "self".
+        response.raise_for_status()
+        return response.json()["self"].split("/")[-1]
+
+    def get_transcription(
+        self,
+        transcription_id: str,
+    ) -> dict:
+        """Fetch a transcription resource and return the decoded JSON body.
+
+        Args:
+            transcription_id: ID placed in the ``transcriptions/{id}`` path.
+
+        Returns:
+            The parsed JSON response; the HTTP status is not checked here.
+        """
+        return requests.get(
+            url=urljoin(
+                self.settings.azure_ai_speech_endpoint,
+                urljoin("speechtotext/v3.2-preview.2/", f"transcriptions/{transcription_id}"),
+            ),
+            headers={
+                "Ocp-Apim-Subscription-Key": self.settings.azure_ai_speech_api_key,
+            },
+        ).json()
diff --git a/backend/routers/azure_ai_speech.py b/backend/routers/azure_ai_speech.py
@@ -0,0 +1,48 @@
+from logging import getLogger
+
+from fastapi import APIRouter, status
+from fastapi.responses import JSONResponse
+
+from backend.internals.azure_ai_speech import Client
+from backend.schemas import azure_ai_speech as azure_ai_speech_schemas
+from backend.settings.azure_ai_speech import Settings
+
+logger = getLogger(__name__)
+
+# Module-level client shared by every route below; Settings() is built at import time.
+client = Client(
+    settings=Settings(),
+)
+
+# All routes in this module are mounted under /azure_ai_speech and tagged accordingly.
+router = APIRouter(
+    prefix="/azure_ai_speech",
+    tags=["azure_ai_speech"],
+    responses={404: {"description": "Not found"}},
+)
+
+
+@router.post(
+    "/transcriptions",
+    response_model=azure_ai_speech_schemas.CreateTranscriptionResponse,
+    status_code=200,
+)
+def create_transcription(request: azure_ai_speech_schemas.CreateTranscriptionRequest):
+    """Create a transcription job for the given content URL and return its ID."""
+    # Plain `def` on purpose: the client does blocking `requests` I/O, and FastAPI
+    # runs non-async handlers in a threadpool instead of on the event loop.
+    transcription_id = client.create_transcription(
+        content_url=request.content_url,
+        locale=request.locale,
+    )
+    return azure_ai_speech_schemas.CreateTranscriptionResponse(
+        transcription_id=transcription_id,
+    )
+
+
+@router.get(
+    "/transcriptions/{transcription_id}",
+    status_code=200,
+)
+def get_transcription(transcription_id: str):
+    """Return the service's JSON description of the given transcription."""
+    # Plain `def` on purpose: the client does blocking `requests` I/O, and FastAPI
+    # runs non-async handlers in a threadpool instead of on the event loop.
+    return JSONResponse(
+        status_code=status.HTTP_200_OK,
+        content=client.get_transcription(
+            transcription_id=transcription_id,
+        ),
+    )
diff --git a/backend/schemas/azure_ai_speech.py b/backend/schemas/azure_ai_speech.py
@@ -0,0 +1,14 @@
+from logging import getLogger
+
+from pydantic import BaseModel
+
+logger = getLogger(__name__)
+
+
+# Request body for creating a transcription.
+class CreateTranscriptionRequest(BaseModel):
+    # URL of the content to transcribe; the default is a placeholder blob URL.
+    content_url: str = "https://<blob_account_name>.blob.core.windows.net/<blob_container_name>/<blob_name>"
+    # Locale passed through to the transcription request.
+    locale: str = "ja-JP"
+
+
+# Response body returned after a transcription has been created.
+class CreateTranscriptionResponse(BaseModel):
+    # Identifier to pass to the get-transcription route.
+    transcription_id: str
diff --git a/backend/settings/azure_ai_speech.py b/backend/settings/azure_ai_speech.py
@@ -0,0 +1,11 @@
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+# Azure AI Speech connection settings, loaded from azure_ai_speech.env when present.
+class Settings(BaseSettings):
+    # Base URL of the Speech resource; the default is a placeholder.
+    azure_ai_speech_endpoint: str = "https://<name>.cognitiveservices.azure.com"
+    # Subscription key sent with each request; the default is a placeholder.
+    azure_ai_speech_api_key: str = "<api-key>"
+
+    model_config = SettingsConfigDict(
+        env_file="azure_ai_speech.env",
+        env_file_encoding="utf-8",
+    )
diff --git a/docs/README.md b/docs/README.md
@@ -28,7 +28,7 @@
 - [FastAPI > Extending OpenAPI](https://fastapi.tiangolo.com/how-to/extending-openapi/)
 - [Get started with Azure Blob Storage and Python](https://learn.microsoft.com/en-us/azure/storage/blobs/storage-blob-python-get-started?tabs=sas-token)
 - [FastAPI で実装した様々なエンドポイントのテストを書く（フォームデータの送信、クッキーの確認、ファイルのアップロード等）](https://qiita.com/kurumaebi65/items/d5cda239ef601f4c36ef#%E3%83%95%E3%82%A1%E3%82%A4%E3%83%AB%E3%82%92%E3%82%A2%E3%83%83%E3%83%97%E3%83%AD%E3%83%BC%E3%83%89%E3%83%80%E3%82%A6%E3%83%B3%E3%83%AD%E3%83%BC%E3%83%89)
-- [LangChainのstreaming出力で苦労している人おる？](https://qiita.com/numekudi/items/4a9e7728ac10c3515ed1)
+- [LangChain の streaming 出力で苦労している人おる？](https://qiita.com/numekudi/items/4a9e7728ac10c3515ed1)
 - [Quickstart: Azure Queue Storage client library for Python](https://learn.microsoft.com/en-us/azure/storage/queues/storage-quickstart-queues-python?tabs=passwordless%2Croles-azure-portal%2Cenvironment-variable-windows%2Csign-in-azure-cli)
 - [Azure Event Grid client library for Python - version 4.19.0](https://learn.microsoft.com/en-us/python/api/overview/azure/eventgrid-readme?view=azure-python)
 - [Azure Event Grid Client Library Python Samples](https://learn.microsoft.com/en-us/samples/azure/azure-sdk-for-python/eventgrid-samples/)
@@ -39,7 +39,8 @@
 - [Streamlit API cheat sheet](https://docs.streamlit.io/develop/quick-reference/cheat-sheet)
 - [Streamlit > Display progress and status](https://docs.streamlit.io/develop/api-reference/status)
 - [streamlit-audiorecorder](https://github.com/theevann/streamlit-audiorecorder)
-- [Build a basic LLM chat app](https://docs.streamlit.io/develop/tutorials/llms/build-conversational-apps)
+- [Streamlit > Build a basic LLM chat app](https://docs.streamlit.io/develop/tutorials/llms/build-conversational-apps)
+- [OpenAI の新しい API を Streamlit で使ってみた](https://qiita.com/papasim824/items/5a3bee4cc3915d5ae177)
 - [aiohttp > Installing all speedups in one command](https://docs.aiohttp.org/en/stable/#installing-all-speedups-in-one-command)
 - [Python & aiohttp: How to upload files to a remote server](https://www.slingacademy.com/article/python-aiohttp-how-to-upload-files-to-a-remote-server/)
 
@@ -76,6 +77,17 @@
 
 - [Azure AI Document Intelligence client library for Python](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/documentintelligence/azure-ai-documentintelligence/README.md)
 
+### Azure AI Speech Service
+
+- [Whisper model via Azure AI Speech or via Azure OpenAI Service?](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/whisper-overview#whisper-model-via-azure-ai-speech-or-via-azure-openai-service)
+- [What is batch transcription?](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/batch-transcription)
+- [Locate audio files for batch transcription > Assign resource access role](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/batch-transcription-audio-data?tabs=portal#assign-resource-access-role)
+- [Use a Whisper model](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/batch-transcription-create?pivots=rest-api#use-a-whisper-model)
+- [Get batch transcription results](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/batch-transcription-get?pivots=rest-api)
+- [How to use the Speech Services Batch Transcription API from Python](https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/master/samples/batch/python/README.md)
+- [SpeechToText API specification](https://github.com/Azure/azure-rest-api-specs/blob/main/specification/cognitiveservices/data-plane/Speech/SpeechToText/preview/v3.2-preview.2/speechtotext.json)
+- [Azure-Samples/cognitive-services-speech-sdk/samples/batch/python/python-client/main.py](https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/82b4212f0905349d6607bd6f210d0e52305d5d6c/samples/batch/python/python-client/main.py)
+
 ### Azure Cosmos DB
 
 - [Manage Azure Cosmos DB for NoSQL resources with Bicep > Azure Cosmos DB account with autoscale throughput](https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/manage-with-bicep#azure-cosmos-db-account-with-autoscale-throughput)

diff --git a/frontend/solutions/chat.py b/frontend/solutions/chat.py
@@ -36,11 +36,19 @@ def start(
         with st.chat_message("user"):
             st.markdown(prompt)
 
-        with st.chat_message("assistant"):
-            stream = client.chat.completions.create(
-                model=getenv("AZURE_OPENAI_GPT_MODEL"),
-                messages=[{"role": m["role"], "content": m["content"]} for m in st.session_state.messages],
-                stream=True,
-            )
-            response = st.write_stream(stream)
-        st.session_state.messages.append({"role": "assistant", "content": response})
+        response = client.chat.completions.create(
+            model=getenv("AZURE_OPENAI_GPT_MODEL"),
+            messages=[{"role": m["role"], "content": m["content"]} for m in st.session_state.messages],
+            stream=True,
+        )
+        with st.chat_message("assistant", avatar="assistant"):
+            placeholder = st.empty()
+            assistant_text = ""
+            for chunk in response:
+                if len(chunk.choices) <= 0:
+                    continue
+                content = chunk.choices[0].delta.content
+                if content:
+                    assistant_text += content
+                    placeholder.write(assistant_text)
+            st.session_state.messages.append({"role": "assistant", "content": assistant_text})
diff --git a/frontend/solutions/sandbox.py b/frontend/solutions/sandbox.py
@@ -29,7 +29,6 @@ async def chat_completions_post(
     response = await client.azure_openai.chat_completions.post(
         ChatCompletionRequest(
             content=prompt,
-            stream=False,
         ),
     )
     return response.content
@@ -42,14 +41,12 @@ def start(
     logger.setLevel(log_level)
     logger.debug(f"set log level to {log_level}")
 
-    st.write("Misc solution")
-
-    # GET
+    st.write("Get OpenAPI spec")
     if st.button("GET"):
         logger.info("Fetching data from backend...")
         try:
             with st.spinner("Calling API..."):
-                response = asyncio.run(http_get(url=urljoin(base=backend_url, url="")))
+                response = asyncio.run(http_get(url=urljoin(base=urljoin(backend_url, "openapi.json"), url="")))
             st.write(response)
             logger.info("Data fetched successfully.")
         except Exception as e:
@@ -58,7 +55,7 @@ def start(
 
     st.write("---")
 
-    # POST
+    st.write("Call Azure OpenAI API")
     prompt = st.text_input(
         label="Prompt",
         value="Hello",

diff --git a/tests/test_smoke_azure_ai_speech.py b/tests/test_smoke_azure_ai_speech.py
@@ -0,0 +1,32 @@
+from logging import getLogger
+
+import pytest
+
+from tests.utilities import RUN_TEST, client
+
+logger = getLogger(__name__)
+
+
+@pytest.mark.skipif(RUN_TEST, reason="need to launch the backend server first")
+def test_azure_ai_speech_create_transcription():
+    """Smoke test: POST a transcription request and expect HTTP 200."""
+    endpoint = "/azure_ai_speech/{0}".format("transcriptions")
+    payload = {
+        "content_url": "https://<blob_account_name>.blob.core.windows.net/<blob_container_name>/<blob_name>",
+        "locale": "ja-JP",
+    }
+    response = client.post(url=endpoint, json=payload)
+    assert response.status_code == 200
+    logger.info(f"response: {response.json()}")
+
+
+@pytest.mark.skipif(RUN_TEST, reason="need to launch the backend server first")
+def test_azure_ai_speech_get_transcription():
+    """Smoke test: GET a transcription by ID and expect HTTP 200."""
+    transcription_id = "<transcription_id>"
+    endpoint = "/azure_ai_speech/{0}".format(f"transcriptions/{transcription_id}")
+    response = client.get(url=endpoint)
+    assert response.status_code == 200
+    logger.info(f"response: {response.json()}")