diff --git a/MultimodalQnA/Dockerfile b/MultimodalQnA/Dockerfile index f0048692e6..fd45b14bc1 100644 --- a/MultimodalQnA/Dockerfile +++ b/MultimodalQnA/Dockerfile @@ -20,7 +20,8 @@ WORKDIR $HOME FROM base AS git RUN apt-get update && apt-get install -y --no-install-recommends git -RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git +# RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git +RUN git clone --single-branch --branch="mmqna-phase3" https://github.com/mhbuehler/GenAIComps.git # Stage 3: common layer shared by services using GenAIComps FROM base AS comps-base diff --git a/MultimodalQnA/README.md b/MultimodalQnA/README.md index bda42ee285..125d9fba07 100644 --- a/MultimodalQnA/README.md +++ b/MultimodalQnA/README.md @@ -2,7 +2,7 @@ Suppose you possess a set of videos, images, audio files, PDFs, or some combination thereof and wish to perform question-answering to extract insights from these documents. To respond to your questions, the system needs to comprehend a mix of textual, visual, and audio facts drawn from the document contents. The MultimodalQnA framework offers an optimal solution for this purpose. -`MultimodalQnA` addresses your questions by dynamically fetching the most pertinent multimodal information (e.g. images, transcripts, and captions) from your collection of video, image, audio, and PDF files. For this purpose, MultimodalQnA utilizes [BridgeTower model](https://huggingface.co/BridgeTower/bridgetower-large-itm-mlm-gaudi), a multimodal encoding transformer model which merges visual and textual data into a unified semantic space. During the ingestion phase, the BridgeTower model embeds both visual cues and auditory facts as texts, and those embeddings are then stored in a vector database. 
When it comes to answering a question, the MultimodalQnA will fetch its most relevant multimodal content from the vector store and feed it into a downstream Large Vision-Language Model (LVM) as input context to generate a response for the user. +`MultimodalQnA` addresses your questions by dynamically fetching the most pertinent multimodal information (e.g. images, transcripts, and captions) from your collection of video, image, audio, and PDF files. For this purpose, MultimodalQnA utilizes [BridgeTower model](https://huggingface.co/BridgeTower/bridgetower-large-itm-mlm-gaudi), a multimodal encoding transformer model which merges visual and textual data into a unified semantic space. During the ingestion phase, the BridgeTower model embeds both visual cues and auditory facts as texts, and those embeddings are then stored in a vector database. When it comes to answering a question, the MultimodalQnA will fetch its most relevant multimodal content from the vector store and feed it into a downstream Large Vision-Language Model (LVM) as input context to generate a response for the user, which can be text or audio. The MultimodalQnA architecture shows below: @@ -41,12 +41,14 @@ flowchart LR UI([UI server
]):::orchid end + ASR{{Whisper service
}} TEI_EM{{Embedding service
}} VDB{{Vector DB

}} R_RET{{Retriever service
}} DP([Data Preparation
]):::blue LVM_gen{{LVM Service
}} GW([MultimodalQnA GateWay
]):::orange + TTS{{SpeechT5 service
}} %% Data Preparation flow %% Ingest data flow @@ -74,25 +76,42 @@ flowchart LR R_RET <-.->VDB DP <-.->VDB + %% Audio speech recognition used for translating audio queries to text + GW <-.-> ASR + %% Generate spoken responses with text-to-speech using the SpeechT5 model + GW <-.-> TTS ``` This MultimodalQnA use case performs Multimodal-RAG using LangChain, Redis VectorDB and Text Generation Inference on [Intel Gaudi2](https://www.intel.com/content/www/us/en/products/details/processors/ai-accelerators/gaudi-overview.html) and [Intel Xeon Scalable Processors](https://www.intel.com/content/www/us/en/products/details/processors/xeon.html), and we invite contributions from other hardware vendors to expand the example. +The [Whisper Service](https://github.com/opea-project/GenAIComps/blob/main/comps/asr/src/README.md) +is used by MultimodalQnA for converting audio queries to text. If a spoken response is requested, the +[SpeechT5 Service](https://github.com/opea-project/GenAIComps/blob/main/comps/tts/src/README.md) translates the text +response from the LVM to a speech audio file. + The Intel Gaudi2 accelerator supports both training and inference for deep learning models in particular for LLMs. Visit [Habana AI products](https://habana.ai/products) for more details. In the below, we provide a table that describes for each microservice component in the MultimodalQnA architecture, the default configuration of the open source project, hardware, port, and endpoint.
-Gaudi default compose.yaml - -| MicroService | Open Source Project | HW | Port | Endpoint | -| ------------ | --------------------- | ----- | ---- | ----------------------------------------------------------- | -| Embedding | Langchain | Xeon | 6000 | /v1/embeddings | -| Retriever | Langchain, Redis | Xeon | 7000 | /v1/multimodal_retrieval | -| LVM | Langchain, TGI | Gaudi | 9399 | /v1/lvm | -| Dataprep | Redis, Langchain, TGI | Gaudi | 6007 | /v1/generate_transcripts, /v1/generate_captions, /v1/ingest | +Gaudi and Xeon default compose.yaml settings + +| MicroService | Open Source Project | HW | Port | Endpoint | +| ------------ | ----------------------- | ----- | ---- | ----------------------------------------------------------- | +| Dataprep | Redis, Langchain, TGI | Xeon | 6007 | /v1/generate_transcripts, /v1/generate_captions, /v1/ingest | +| Embedding | Langchain | Xeon | 6000 | /v1/embeddings | +| LVM | Langchain, Transformers | Xeon | 9399 | /v1/lvm | +| Retriever | Langchain, Redis | Xeon | 7000 | /v1/retrieval | +| SpeechT5 | Transformers | Xeon | 7055 | /v1/tts | +| Whisper | Transformers | Xeon | 7066 | /v1/asr | +| Dataprep | Redis, Langchain, TGI | Gaudi | 6007 | /v1/generate_transcripts, /v1/generate_captions, /v1/ingest | +| Embedding | Langchain | Gaudi | 6000 | /v1/embeddings | +| LVM | Langchain, TGI | Gaudi | 9399 | /v1/lvm | +| Retriever | Langchain, Redis | Gaudi | 7000 | /v1/retrieval | +| SpeechT5 | Transformers | Gaudi | 7055 | /v1/tts | +| Whisper | Transformers | Gaudi | 7066 | /v1/asr |
@@ -104,8 +123,12 @@ By default, the embedding and LVM models are set to a default value as listed be | --------- | ----- | ----------------------------------------- | | embedding | Xeon | BridgeTower/bridgetower-large-itm-mlm-itc | | LVM | Xeon | llava-hf/llava-1.5-7b-hf | +| SpeechT5 | Xeon | microsoft/speecht5_tts | +| Whisper | Xeon | openai/whisper-small | | embedding | Gaudi | BridgeTower/bridgetower-large-itm-mlm-itc | | LVM | Gaudi | llava-hf/llava-v1.6-vicuna-13b-hf | +| SpeechT5 | Gaudi | microsoft/speecht5_tts | +| Whisper | Gaudi | openai/whisper-small | You can choose other LVM models, such as `llava-hf/llava-1.5-7b-hf ` and `llava-hf/llava-1.5-13b-hf`, as needed. @@ -113,9 +136,28 @@ You can choose other LVM models, such as `llava-hf/llava-1.5-7b-hf ` and `llava- The MultimodalQnA service can be effortlessly deployed on either Intel Gaudi2 or Intel XEON Scalable Processors. -Currently we support deploying MultimodalQnA services with docker compose. +Currently we support deploying MultimodalQnA services with docker compose. The [`docker_compose`](docker_compose) +directory has folders which include `compose.yaml` files for different hardware types: -### Setup Environment Variable +``` +📂 docker_compose +├── 📂 amd +│   └── 📂 gpu +│   └── 📂 rocm +│   ├── 📄 compose.yaml +│   └── ... +└── 📂 intel + ├── 📂 cpu + │   └── 📂 xeon + │   ├── 📄 compose.yaml + │   └── ... + └── 📂 hpu + └── 📂 gaudi + ├── 📄 compose.yaml + └── ... 
+``` + +### Setup Environment Variables To set up environment variables for deploying MultimodalQnA services, follow these steps: @@ -124,8 +166,10 @@ To set up environment variables for deploying MultimodalQnA services, follow the ```bash # Example: export host_ip=$(hostname -I | awk '{print $1}') export host_ip="External_Public_IP" + + # Append the host_ip to the no_proxy list to allow container communication # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1" - export no_proxy="Your_No_Proxy" + export no_proxy="${no_proxy},${host_ip}" ``` 2. If you are in a proxy environment, also set the proxy-related environment variables: @@ -137,36 +181,41 @@ To set up environment variables for deploying MultimodalQnA services, follow the 3. Set up other environment variables: - > Notice that you can only choose **one** command below to set up envs according to your hardware. Other that the port numbers may be set incorrectly. + > Choose **one** command below to set env vars according to your hardware. Otherwise, the port numbers may be set incorrectly. ```bash # on Gaudi - source ./docker_compose/intel/hpu/gaudi/set_env.sh + cd docker_compose/intel/hpu/gaudi + source ./set_env.sh + # on Xeon - source ./docker_compose/intel/cpu/xeon/set_env.sh + cd docker_compose/intel/cpu/xeon + source ./set_env.sh ``` ### Deploy MultimodalQnA on Gaudi -Refer to the [Gaudi Guide](./docker_compose/intel/hpu/gaudi/README.md) to build docker images from source. +Refer to the [Gaudi Guide](./docker_compose/intel/hpu/gaudi/README.md) if you would like to build docker images from +source, otherwise images will be pulled from Docker Hub. Find the corresponding [compose.yaml](./docker_compose/intel/hpu/gaudi/compose.yaml). 
```bash -cd GenAIExamples/MultimodalQnA/docker_compose/intel/hpu/gaudi/ +# While still in the docker_compose/intel/hpu/gaudi directory, use docker compose to bring up the services docker compose -f compose.yaml up -d ``` -> Notice: Currently only the **Habana Driver 1.17.x** is supported for Gaudi. +> Notice: Currently only the **Habana Driver 1.18.x** is supported for Gaudi. ### Deploy MultimodalQnA on Xeon -Refer to the [Xeon Guide](./docker_compose/intel/cpu/xeon/README.md) for more instructions on building docker images from source. +Refer to the [Xeon Guide](./docker_compose/intel/cpu/xeon/README.md) if you would like to build docker images from +source, otherwise images will be pulled from Docker Hub. Find the corresponding [compose.yaml](./docker_compose/intel/cpu/xeon/compose.yaml). ```bash -cd GenAIExamples/MultimodalQnA/docker_compose/intel/cpu/xeon/ +# While still in the docker_compose/intel/cpu/xeon directory, use docker compose to bring up the services docker compose -f compose.yaml up -d ``` diff --git a/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md b/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md index f49b9815f1..4e3a031da9 100644 --- a/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md +++ b/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md @@ -178,7 +178,7 @@ curl http://${host_ip}:$MM_EMBEDDING_PORT_MICROSERVICE/v1/embeddings \ ```bash export your_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(512)]; print(embedding)") -curl http://${host_ip}:7000/v1/multimodal_retrieval \ +curl http://${host_ip}:7000/v1/retrieval \ -X POST \ -H "Content-Type: application/json" \ -d "{\"text\":\"test\",\"embedding\":${your_embedding}}" diff --git a/MultimodalQnA/docker_compose/amd/gpu/rocm/compose.yaml b/MultimodalQnA/docker_compose/amd/gpu/rocm/compose.yaml index af4855bb59..e743d111e7 100644 --- a/MultimodalQnA/docker_compose/amd/gpu/rocm/compose.yaml +++ 
b/MultimodalQnA/docker_compose/amd/gpu/rocm/compose.yaml @@ -175,6 +175,8 @@ services: - DATAPREP_INGEST_SERVICE_ENDPOINT=${DATAPREP_INGEST_SERVICE_ENDPOINT} - DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT=${DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT} - DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT=${DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT} + - DATAPREP_GET_FILE_ENDPOINT=${DATAPREP_GET_FILE_ENDPOINT} + - DATAPREP_DELETE_FILE_ENDPOINT=${DATAPREP_DELETE_FILE_ENDPOINT} ipc: host restart: always diff --git a/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md b/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md index 7e4fa6894a..eeefddeb2d 100644 --- a/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md +++ b/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md @@ -44,6 +44,10 @@ whisper === port 7066 - Open to 0.0.0.0/0 +speecht5-service +=== +port 7055 - Open to 0.0.0.0/0 + dataprep-multimodal-redis === Port 6007 - Open to 0.0.0.0/0 @@ -63,7 +67,7 @@ Since the `compose.yaml` will consume some environment variables, you need to se **Export the value of the public IP address of your Xeon server to the `host_ip` environment variable** -> Change the External_Public_IP below with the actual IPV4 value +> Change the External_Public_IP below with the actual IPV4 value when setting the `host_ip` value (do not use localhost). 
``` export host_ip="External_Public_IP" @@ -72,13 +76,10 @@ export host_ip="External_Public_IP" **Append the value of the public IP address to the no_proxy list** ```bash -export your_no_proxy=${your_no_proxy},"External_Public_IP" +export no_proxy=${no_proxy},${host_ip} ``` ```bash -export no_proxy=${your_no_proxy} -export http_proxy=${your_http_proxy} -export https_proxy=${your_http_proxy} export MM_EMBEDDING_SERVICE_HOST_IP=${host_ip} export MM_RETRIEVER_SERVICE_HOST_IP=${host_ip} export LVM_SERVICE_HOST_IP=${host_ip} @@ -86,6 +87,8 @@ export MEGA_SERVICE_HOST_IP=${host_ip} export WHISPER_PORT=7066 export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_PORT}/v1/asr" export WHISPER_MODEL="base" +export TTS_PORT=7055 +export TTS_ENDPOINT="http://${host_ip}:${TTS_PORT}/v1/tts" export MAX_IMAGES=1 export REDIS_DB_PORT=6379 export REDIS_INSIGHTS_PORT=8001 @@ -111,10 +114,9 @@ export LVM_ENDPOINT="http://${host_ip}:$LLAVA_SERVER_PORT" export MEGA_SERVICE_PORT=8888 export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:$MEGA_SERVICE_PORT/v1/multimodalqna" export UI_PORT=5173 +export UI_TIMEOUT=200 ``` -Note: Please replace with `host_ip` with you external IP address, do not use localhost. - > Note: The `MAX_IMAGES` environment variable is used to specify the maximum number of images that will be sent from the LVM service to the LLaVA server. > If an image list longer than `MAX_IMAGES` is sent to the LVM server, a shortened image list will be sent to the LLaVA service. If the image list > needs to be shortened, the most recent images (the ones at the end of the list) are prioritized to send to the LLaVA service. Some LLaVA models have not @@ -172,7 +174,13 @@ Build whisper server image docker build --no-cache -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile . ``` -### 6. Build MegaService Docker Image +### 6. 
Build TTS Image + +```bash +docker build --no-cache -t opea/speecht5:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/integrations/dependency/speecht5/Dockerfile . +``` + +### 7. Build MegaService Docker Image To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the [multimodalqna.py](../../../../multimodalqna.py) Python script. Build MegaService Docker image via below command: @@ -183,7 +191,7 @@ docker build --no-cache -t opea/multimodalqna:latest --build-arg https_proxy=$ht cd ../.. ``` -### 7. Build UI Docker Image +### 8. Build UI Docker Image Build frontend Docker image via below command: @@ -200,11 +208,12 @@ Then run the command `docker images`, you will have the following 11 Docker Imag 3. `opea/lvm-llava:latest` 4. `opea/retriever:latest` 5. `opea/whisper:latest` -6. `opea/redis-vector-db` -7. `opea/embedding:latest` -8. `opea/embedding-multimodal-bridgetower:latest` -9. `opea/multimodalqna:latest` -10. `opea/multimodalqna-ui:latest` +6. `opea/speecht5:latest` +7. `opea/redis-vector-db` +8. `opea/embedding:latest` +9. `opea/embedding-multimodal-bridgetower:latest` +10. `opea/multimodalqna:latest` +11. `opea/multimodalqna-ui:latest` ## 🚀 Start Microservices @@ -264,7 +273,7 @@ curl http://${host_ip}:$MM_EMBEDDING_PORT_MICROSERVICE/v1/embeddings \ ```bash export your_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(512)]; print(embedding)") -curl http://${host_ip}:${REDIS_RETRIEVER_PORT}/v1/multimodal_retrieval \ +curl http://${host_ip}:${REDIS_RETRIEVER_PORT}/v1/retrieval \ -X POST \ -H "Content-Type: application/json" \ -d "{\"text\":\"test\",\"embedding\":${your_embedding}}" @@ -279,7 +288,16 @@ curl ${WHISPER_SERVER_ENDPOINT} \ -d '{"audio" : "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' ``` -5. lvm-llava +5. 
tts + +```bash +curl ${TTS_ENDPOINT} \ + -X POST \ + -d '{"text": "Who are you?"}' \ + -H 'Content-Type: application/json' +``` + +6. lvm-llava ```bash curl http://${host_ip}:${LLAVA_SERVER_PORT}/generate \ @@ -288,7 +306,7 @@ curl http://${host_ip}:${LLAVA_SERVER_PORT}/generate \ -d '{"prompt":"Describe the image please.", "img_b64_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC"}' ``` -6. lvm +7. lvm ```bash curl http://${host_ip}:${LVM_PORT}/v1/lvm \ @@ -313,9 +331,9 @@ curl http://${host_ip}:${LVM_PORT}/v1/lvm \ -d '{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [], "chat_template":"The caption of the image is: '\''{context}'\''. {question}"}' ``` -7. dataprep-multimodal-redis +8. dataprep-multimodal-redis -Download a sample video, image, pdf, and audio file and create a caption +Download a sample video (.mp4), image (.png, .gif, .jpg), pdf, and audio file (.wav, .mp3) and create a caption ```bash export video_fn="WeAreGoingOnBullrun.mp4" @@ -334,7 +352,7 @@ export audio_fn="AudioSample.wav" wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav -O ${audio_fn} ``` -Test dataprep microservice with generating transcript. This command updates a knowledge base by uploading a local video .mp4 and an audio .wav file. +Test dataprep microservice with generating transcript. This command updates a knowledge base by uploading a local video .mp4 and an audio .wav or .mp3 file. ```bash curl --silent --write-out "HTTPSTATUS:%{http_code}" \ @@ -354,7 +372,7 @@ curl --silent --write-out "HTTPSTATUS:%{http_code}" \ -X POST -F "files=@./${image_fn}" ``` -Now, test the microservice with posting a custom caption along with an image and a PDF containing images and text. +Now, test the microservice with posting a custom caption along with an image and a PDF containing images and text. 
The image caption can be provided as a text (`.txt`) or as spoken audio (`.wav` or `.mp3`). ```bash curl --silent --write-out "HTTPSTATUS:%{http_code}" \ @@ -393,7 +411,7 @@ curl -X POST \ ${DATAPREP_DELETE_FILE_ENDPOINT} ``` -8. MegaService +9. MegaService Test the MegaService with a text query: @@ -428,8 +446,10 @@ curl http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna \ -d '{"messages": [{"role": "user", "content": [{"type": "audio", "audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}]}]}' ``` +Test the MegaService with a back and forth conversation between the user and assistant including a text to speech response from the assistant using `"modalities": ["text", "audio"]`: + ```bash curl http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna \ -H "Content-Type: application/json" \ - -d '{"messages": [{"role": "user", "content": [{"type": "text", "text": "hello, "}, {"type": "image_url", "image_url": {"url": "https://www.ilankelman.org/stopsigns/australia.jpg"}}]}, {"role": "assistant", "content": "opea project! "}, {"role": "user", "content": "chao, "}], "max_tokens": 10}' + -d '{"messages": [{"role": "user", "content": [{"type": "text", "text": "hello, "}, {"type": "image_url", "image_url": {"url": "https://www.ilankelman.org/stopsigns/australia.jpg"}}]}, {"role": "assistant", "content": "opea project! 
"}, {"role": "user", "content": "chao, "}], "max_tokens": 10, "modalities": ["text", "audio"]}' ``` diff --git a/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml b/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml index 31f543c755..0bc7321500 100644 --- a/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml +++ b/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml @@ -13,6 +13,19 @@ services: http_proxy: ${http_proxy} https_proxy: ${https_proxy} restart: unless-stopped + speecht5-service: + image: ${REGISTRY:-opea}/speecht5:${TAG:-latest} + container_name: speecht5-service + ports: + - "${TTS_PORT}:7055" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + TTS_PORT: ${TTS_PORT} + TTS_ENDPOINT: ${TTS_ENDPOINT} + restart: unless-stopped redis-vector-db: image: redis/redis-stack:7.2.0-v9 container_name: redis-vector-db @@ -152,6 +165,8 @@ services: LVM_MODEL_ID: ${LVM_MODEL_ID} WHISPER_PORT: ${WHISPER_PORT} WHISPER_SERVER_ENDPOINT: ${WHISPER_SERVER_ENDPOINT} + TTS_PORT: ${TTS_PORT} + TTS_ENDPOINT: ${TTS_ENDPOINT} ipc: host restart: always multimodalqna-ui: @@ -169,8 +184,11 @@ services: - DATAPREP_INGEST_SERVICE_ENDPOINT=${DATAPREP_INGEST_SERVICE_ENDPOINT} - DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT=${DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT} - DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT=${DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT} + - DATAPREP_GET_FILE_ENDPOINT=${DATAPREP_GET_FILE_ENDPOINT} + - DATAPREP_DELETE_FILE_ENDPOINT=${DATAPREP_DELETE_FILE_ENDPOINT} - MEGA_SERVICE_PORT:=${MEGA_SERVICE_PORT} - UI_PORT=${UI_PORT} + - UI_TIMEOUT=${UI_TIMEOUT} - DATAPREP_MMR_PORT=${DATAPREP_MMR_PORT} ipc: host restart: always diff --git a/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh b/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh index 057f90990c..0cd1267460 100755 --- a/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh +++ b/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh @@ 
-8,15 +8,14 @@ popd > /dev/null export host_ip=$(hostname -I | awk '{print $1}') -export no_proxy=${your_no_proxy} -export http_proxy=${your_http_proxy} -export https_proxy=${your_http_proxy} - export MM_EMBEDDING_SERVICE_HOST_IP=${host_ip} export MM_RETRIEVER_SERVICE_HOST_IP=${host_ip} export LVM_SERVICE_HOST_IP=${host_ip} export MEGA_SERVICE_HOST_IP=${host_ip} +export TTS_PORT=7055 +export TTS_ENDPOINT="http://${host_ip}:${TTS_PORT}/v1/tts" + export WHISPER_PORT=7066 export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_PORT}/v1/asr" export WHISPER_MODEL="base" @@ -52,3 +51,4 @@ export MEGA_SERVICE_PORT=8888 export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna" export UI_PORT=5173 +export UI_TIMEOUT=200 diff --git a/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md b/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md index 2379fc3d4d..b81c372e20 100644 --- a/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md +++ b/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md @@ -8,7 +8,7 @@ Since the `compose.yaml` will consume some environment variables, you need to se **Export the value of the public IP address of your Gaudi server to the `host_ip` environment variable** -> Change the External_Public_IP below with the actual IPV4 value +> Change the External_Public_IP below with the actual IPV4 value when setting the `host_ip` value (do not use localhost). 
``` export host_ip="External_Public_IP" @@ -17,13 +17,10 @@ export host_ip="External_Public_IP" **Append the value of the public IP address to the no_proxy list** ```bash -export your_no_proxy=${your_no_proxy},"External_Public_IP" +export no_proxy=${no_proxy},${host_ip} ``` ```bash -export no_proxy=${your_no_proxy} -export http_proxy=${your_http_proxy} -export https_proxy=${your_http_proxy} export MM_EMBEDDING_SERVICE_HOST_IP=${host_ip} export MM_RETRIEVER_SERVICE_HOST_IP=${host_ip} export LVM_SERVICE_HOST_IP=${host_ip} @@ -57,10 +54,9 @@ export LVM_ENDPOINT="http://${host_ip}:${LLAVA_SERVER_PORT}" export MEGA_SERVICE_PORT=8888 export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna" export UI_PORT=5173 +export UI_TIMEOUT=200 ``` -Note: Please replace with `host_ip` with you external IP address, do not use localhost. - > Note: The `MAX_IMAGES` environment variable is used to specify the maximum number of images that will be sent from the LVM service to the LLaVA server. > If an image list longer than `MAX_IMAGES` is sent to the LVM server, a shortened image list will be sent to the LLaVA service. If the image list > needs to be shortened, the most recent images (the ones at the end of the list) are prioritized to send to the LLaVA service. Some LLaVA models have not @@ -120,7 +116,15 @@ Build whisper server image docker build --no-cache -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile . ``` -### 6. Build MegaService Docker Image +### 6. Build TTS Server Image + +Build TTS server image + +```bash +docker build --no-cache -t opea/speecht5:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/integrations/dependency/speecht5/Dockerfile . +``` + +### 7. 
Build MegaService Docker Image To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the [multimodalqna.py](../../../../multimodalqna.py) Python script. Build MegaService Docker image via below command: @@ -130,7 +134,7 @@ cd GenAIExamples/MultimodalQnA docker build --no-cache -t opea/multimodalqna:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile . ``` -### 6. Build UI Docker Image +### 8. Build UI Docker Image Build frontend Docker image via below command: @@ -146,11 +150,12 @@ Then run the command `docker images`, you will have the following 11 Docker Imag 3. `ghcr.io/huggingface/tgi-gaudi:2.0.6` 4. `opea/retriever:latest` 5. `opea/whisper:latest` -6. `opea/redis-vector-db` -7. `opea/embedding:latest` -8. `opea/embedding-multimodal-bridgetower:latest` -9. `opea/multimodalqna:latest` -10. `opea/multimodalqna-ui:latest` +6. `opea/speecht5:latest` +7. `opea/redis-vector-db` +8. `opea/embedding:latest` +9. `opea/embedding-multimodal-bridgetower:latest` +10. `opea/multimodalqna:latest` +11. `opea/multimodalqna-ui:latest` ## 🚀 Start Microservices @@ -210,7 +215,7 @@ curl http://${host_ip}:$MM_EMBEDDING_PORT_MICROSERVICE/v1/embeddings \ ```bash export your_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(512)]; print(embedding)") -curl http://${host_ip}:7000/v1/multimodal_retrieval \ +curl http://${host_ip}:${REDIS_RETRIEVER_PORT}/v1/retrieval \ -X POST \ -H "Content-Type: application/json" \ -d "{\"text\":\"test\",\"embedding\":${your_embedding}}" @@ -234,7 +239,16 @@ curl http://${host_ip}:${LLAVA_SERVER_PORT}/generate \ -H 'Content-Type: application/json' ``` -6. lvm +6. tts + +```bash +curl ${TTS_ENDPOINT} \ + -X POST \ + -d '{"text": "Who are you?"}' \ + -H 'Content-Type: application/json' +``` + +7. 
lvm ```bash curl http://${host_ip}:${LVM_PORT}/v1/lvm \ @@ -259,9 +273,9 @@ curl http://${host_ip}:${LVM_PORT}/v1/lvm \ -d '{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [], "chat_template":"The caption of the image is: '\''{context}'\''. {question}"}' ``` -7. Multimodal Dataprep Microservice +8. Multimodal Dataprep Microservice -Download a sample video, image, PDF, and audio file and create a caption +Download a sample video (.mp4), image (.png, .gif, .jpg), pdf, and audio file (.wav, .mp3) and create a caption ```bash export video_fn="WeAreGoingOnBullrun.mp4" @@ -280,7 +294,7 @@ export audio_fn="AudioSample.wav" wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav -O ${audio_fn} ``` -Test dataprep microservice with generating transcript. This command updates a knowledge base by uploading a local video .mp4 and an audio .wav file. +Test dataprep microservice with generating transcript. This command updates a knowledge base by uploading a local video .mp4 and an audio .wav or .mp3 file. ```bash curl --silent --write-out "HTTPSTATUS:%{http_code}" \ @@ -300,7 +314,7 @@ curl --silent --write-out "HTTPSTATUS:%{http_code}" \ -X POST -F "files=@./${image_fn}" ``` -Now, test the microservice with posting a custom caption along with an image and a PDF containing images and text. +Now, test the microservice with posting a custom caption along with an image and a PDF containing images and text. The image caption can be provided as a text (`.txt`) or as spoken audio (`.wav` or `.mp3`). ```bash curl --silent --write-out "HTTPSTATUS:%{http_code}" \ @@ -339,7 +353,7 @@ curl -X POST \ ${DATAPREP_DELETE_FILE_ENDPOINT} ``` -8. MegaService +9. 
MegaService Test the MegaService with a text query: @@ -366,10 +380,10 @@ curl http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna \ -d '{"messages": [{"role": "user", "content": [{"type": "text", "text": "Green bananas in a tree"}, {"type": "image_url", "image_url": {"url": "http://images.cocodataset.org/test-stuff2017/000000004248.jpg"}}]}]}' ``` -Test the MegaService with a back and forth conversation between the user and assistant: +Test the MegaService with a back and forth conversation between the user and assistant including a text to speech response from the assistant using `"modalities": ["text", "audio"]`: ```bash curl http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna \ -H "Content-Type: application/json" \ - -d '{"messages": [{"role": "user", "content": [{"type": "text", "text": "hello, "}, {"type": "image_url", "image_url": {"url": "https://www.ilankelman.org/stopsigns/australia.jpg"}}]}, {"role": "assistant", "content": "opea project! "}, {"role": "user", "content": "chao, "}], "max_tokens": 10}' + -d '{"messages": [{"role": "user", "content": [{"type": "text", "text": "hello, "}, {"type": "image_url", "image_url": {"url": "https://www.ilankelman.org/stopsigns/australia.jpg"}}]}, {"role": "assistant", "content": "opea project! 
"}, {"role": "user", "content": "chao, "}], "max_tokens": 10, "modalities": ["text", "audio"]}' ``` diff --git a/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml index 26b5610f5e..822d3e2896 100644 --- a/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml @@ -21,6 +21,19 @@ services: WHISPER_PORT: ${WHISPER_PORT} WHISPER_SERVER_ENDPOINT: ${WHISPER_SERVER_ENDPOINT} restart: unless-stopped + speecht5-service: + image: ${REGISTRY:-opea}/speecht5:${TAG:-latest} + container_name: speecht5-service + ports: + - "${TTS_PORT}:7055" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + TTS_PORT: ${TTS_PORT} + TTS_ENDPOINT: ${TTS_ENDPOINT} + restart: unless-stopped dataprep-multimodal-redis: image: ${REGISTRY:-opea}/dataprep:${TAG:-latest} container_name: dataprep-multimodal-redis @@ -182,6 +195,8 @@ services: LVM_MODEL_ID: ${LVM_MODEL_ID} WHISPER_PORT: ${WHISPER_PORT} WHISPER_SERVER_ENDPOINT: ${WHISPER_SERVER_ENDPOINT} + TTS_PORT: ${TTS_PORT} + TTS_ENDPOINT: ${TTS_ENDPOINT} ipc: host restart: always multimodalqna-ui: @@ -199,8 +214,11 @@ services: - DATAPREP_INGEST_SERVICE_ENDPOINT=${DATAPREP_INGEST_SERVICE_ENDPOINT} - DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT=${DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT} - DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT=${DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT} + - DATAPREP_GET_FILE_ENDPOINT=${DATAPREP_GET_FILE_ENDPOINT} + - DATAPREP_DELETE_FILE_ENDPOINT=${DATAPREP_DELETE_FILE_ENDPOINT} - MEGA_SERVICE_PORT:=${MEGA_SERVICE_PORT} - UI_PORT=${UI_PORT} + - UI_TIMEOUT=${UI_TIMEOUT} - DATAPREP_MMR_PORT=${DATAPREP_MMR_PORT} ipc: host restart: always diff --git a/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh b/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh index cc35d58d08..ab89b14596 100755 --- 
a/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh +++ b/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh @@ -13,16 +13,15 @@ export MM_RETRIEVER_SERVICE_HOST_IP=${host_ip} export LVM_SERVICE_HOST_IP=${host_ip} export MEGA_SERVICE_HOST_IP=${host_ip} -export no_proxy=${your_no_proxy} -export http_proxy=${your_http_proxy} -export https_proxy=${your_http_proxy} - export REDIS_DB_PORT=6379 export REDIS_INSIGHTS_PORT=8001 export REDIS_URL="redis://${host_ip}:${REDIS_DB_PORT}" export REDIS_HOST=${host_ip} export INDEX_NAME="mm-rag-redis" +export TTS_PORT=7055 +export TTS_ENDPOINT="http://${host_ip}:${TTS_PORT}/v1/tts" + export WHISPER_MODEL="base" export WHISPER_PORT=7066 export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_PORT}/v1/asr" @@ -54,3 +53,4 @@ export MEGA_SERVICE_PORT=8888 export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna" export UI_PORT=5173 +export UI_TIMEOUT=200 diff --git a/MultimodalQnA/docker_image_build/build.yaml b/MultimodalQnA/docker_image_build/build.yaml index 1fc599c3e5..cb7d0ebf93 100644 --- a/MultimodalQnA/docker_image_build/build.yaml +++ b/MultimodalQnA/docker_image_build/build.yaml @@ -65,3 +65,15 @@ services: dockerfile: comps/asr/src/integrations/dependency/whisper/Dockerfile extends: multimodalqna image: ${REGISTRY:-opea}/whisper:${TAG:-latest} + speecht5: + build: + context: GenAIComps + dockerfile: comps/tts/src/integrations/dependency/speecht5/Dockerfile + extends: multimodalqna + image: ${REGISTRY:-opea}/speecht5:${TAG:-latest} + tts: + build: + context: GenAIComps + dockerfile: comps/tts/src/Dockerfile + extends: multimodalqna + image: ${REGISTRY:-opea}/tts:${TAG:-latest} diff --git a/MultimodalQnA/multimodalqna.py b/MultimodalQnA/multimodalqna.py index 0e3f87d190..e89c32aeab 100644 --- a/MultimodalQnA/multimodalqna.py +++ b/MultimodalQnA/multimodalqna.py @@ -28,7 +28,9 @@ LVM_SERVICE_HOST_IP = os.getenv("LVM_SERVICE_HOST_IP", "0.0.0.0") LVM_SERVICE_PORT = 
int(os.getenv("LVM_PORT", 9399)) WHISPER_PORT = int(os.getenv("WHISPER_PORT", 7066)) -WHISPER_SERVER_ENDPOINT = os.getenv("WHISPER_SERVER_ENDPOINT", "http://0.0.0.0:$WHISPER_PORT/v1/asr") +WHISPER_SERVER_ENDPOINT = os.getenv("WHISPER_SERVER_ENDPOINT", f"http://0.0.0.0:{WHISPER_PORT}/v1/asr") +TTS_PORT = int(os.getenv("TTS_PORT", 7055)) +TTS_ENDPOINT = os.getenv("TTS_ENDPOINT", f"http://0.0.0.0:{TTS_PORT}/v1/tts") def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs): @@ -252,6 +254,22 @@ def convert_audio_to_text(self, audio): response = response.json() return response["asr_result"] + def convert_text_to_audio(self, text): + if isinstance(text, dict): + input_dict = {"text": text["text"]} + else: + input_dict = {"text": text} + + response = requests.post(TTS_ENDPOINT, data=json.dumps(input_dict)) + + if response.status_code != 200: + return JSONResponse( + status_code=503, content={"message": "Unable to convert text to audio. {}".format(response.text)} + ) + + response = response.json() + return response["tts_result"] + async def handle_request(self, request: Request): """MultimodalQnA accepts input queries as text, images, and/or audio. 
@@ -271,6 +289,7 @@ async def handle_request(self, request: Request): print("[ MultimodalQnAService ] stream=True not used, this has not support stream yet!") stream_opt = False chat_request = ChatCompletionRequest.model_validate(data) + modalities = chat_request.modalities num_messages = len(data["messages"]) if isinstance(data["messages"], list) else 1 messages = self._handle_message(chat_request.messages) decoded_audio_input = "" @@ -333,8 +352,12 @@ async def handle_request(self, request: Request): return response last_node = runtime_graph.all_leaves()[-1] + tts_audio = None if "text" in result_dict[last_node].keys(): response = result_dict[last_node]["text"] + # Toggle for TTS + if "audio" in modalities: + tts_audio = {"data": self.convert_text_to_audio(response)} else: # text is not in response message # something wrong, for example due to empty retrieval results @@ -359,7 +382,7 @@ async def handle_request(self, request: Request): choices.append( ChatCompletionResponseChoice( index=0, - message=ChatMessage(role="assistant", content=response), + message=ChatMessage(role="assistant", content=response, audio=tts_audio), finish_reason="stop", metadata=metadata, ) diff --git a/MultimodalQnA/tests/test_compose_on_gaudi.sh b/MultimodalQnA/tests/test_compose_on_gaudi.sh index ccb4f1894d..e3a854a07e 100644 --- a/MultimodalQnA/tests/test_compose_on_gaudi.sh +++ b/MultimodalQnA/tests/test_compose_on_gaudi.sh @@ -14,9 +14,10 @@ WORKPATH=$(dirname "$PWD") LOG_PATH="$WORKPATH/tests" ip_address=$(hostname -I | awk '{print $1}') -export image_fn="apple.png" +export image_fn="sample.png" export video_fn="WeAreGoingOnBullrun.mp4" -export caption_fn="apple.txt" +export audio_fn="sample.mp3" # audio_fn and caption_fn are used as captions for image_fn, so they all need the same base name +export caption_fn="sample.txt" export pdf_fn="nke-10k-2023.pdf" function check_service_ready() { @@ -59,7 +60,7 @@ function build_docker_images() { git clone --depth 1 --branch ${opea_branch} 
https://github.com/opea-project/GenAIComps.git echo "Build all the images with --no-cache, check docker_image_build.log for details..." - service_list="multimodalqna multimodalqna-ui embedding-multimodal-bridgetower-gaudi embedding retriever lvm dataprep whisper" + service_list="multimodalqna multimodalqna-ui embedding-multimodal-bridgetower-gaudi embedding retriever speecht5 lvm dataprep whisper" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 @@ -82,6 +83,8 @@ function setup_env() { export MAX_IMAGES=1 export WHISPER_MODEL="base" export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_PORT}/v1/asr" + export TTS_PORT=7055 + export TTS_ENDPOINT="http://${host_ip}:${TTS_PORT}/v1/tts" export DATAPREP_MMR_PORT=6007 export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest" export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts" @@ -116,6 +119,7 @@ function prepare_data() { cd $LOG_PATH echo "Downloading image and video" wget https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true -O ${image_fn} + wget https://github.com/intel/intel-extension-for-transformers/raw/refs/tags/v1.5/intel_extension_for_transformers/neural_chat/ui/customized/talkingbot/src/lib/components/talkbot/assets/mid-age-man.mp3 -O ${audio_fn} wget http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/WeAreGoingOnBullrun.mp4 -O ${video_fn} wget https://raw.githubusercontent.com/opea-project/GenAIComps/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf -O ${pdf_fn} echo "Writing caption file" @@ -133,20 +137,23 @@ function validate_service() { if [[ $SERVICE_NAME == *"dataprep-multimodal-redis-transcript"* ]]; then cd $LOG_PATH - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${video_fn}" -H 'Content-Type: 
multipart/form-data' "$URL") + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${video_fn}" -F "files=@./${audio_fn}" -H 'Content-Type: multipart/form-data' "$URL") elif [[ $SERVICE_NAME == *"dataprep-multimodal-redis-caption"* ]]; then cd $LOG_PATH HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${image_fn}" -H 'Content-Type: multipart/form-data' "$URL") + elif [[ $SERVICE_NAME == *"dataprep-multimodal-redis-ingest-image-audio"* ]]; then + cd $LOG_PATH + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${image_fn}" -F "files=@./${audio_fn}" -H 'Content-Type: multipart/form-data' "$URL") elif [[ $SERVICE_NAME == *"dataprep-multimodal-redis-ingest"* ]]; then cd $LOG_PATH - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${image_fn}" -F "files=@./apple.txt" -H 'Content-Type: multipart/form-data' "$URL") + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${image_fn}" -F "files=@./${caption_fn}" -H 'Content-Type: multipart/form-data' "$URL") elif [[ $SERVICE_NAME == *"dataprep-multimodal-redis-pdf"* ]]; then cd $LOG_PATH HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${pdf_fn}" -H 'Content-Type: multipart/form-data' "$URL") elif [[ $SERVICE_NAME == *"dataprep_get"* ]]; then HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' "$URL") elif [[ $SERVICE_NAME == *"dataprep_del"* ]]; then - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "apple.txt"}' -H 'Content-Type: application/json' "$URL") + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "${caption_fn}"}' -H 'Content-Type: application/json' "$URL") else HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST 
-d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") fi @@ -218,13 +225,20 @@ function validate_microservices() { "dataprep-multimodal-redis-transcript" \ "dataprep-multimodal-redis" - echo "Validating Data Prep with Image & Caption Ingestion" + echo "Validating Data Prep with Image & Text Caption Ingestion" validate_service \ "${DATAPREP_INGEST_SERVICE_ENDPOINT}" \ "Data preparation succeeded" \ "dataprep-multimodal-redis-ingest" \ "dataprep-multimodal-redis" + echo "Validating Data Prep with Image & Audio Caption Ingestion" + validate_service \ + "${DATAPREP_INGEST_SERVICE_ENDPOINT}" \ + "Data preparation succeeded" \ + "dataprep-multimodal-redis-ingest-image-audio" \ + "dataprep-multimodal-redis" + echo "Validating Data Prep with PDF" validate_service \ "${DATAPREP_INGEST_SERVICE_ENDPOINT}" \ @@ -246,6 +260,14 @@ function validate_microservices() { "dataprep_get" \ "dataprep-multimodal-redis" + echo "Validating Text to speech service" + validate_service \ + "${TTS_ENDPOINT}" \ + '"tts_result":' \ + "speecht5-service" \ + "speecht5-service" \ + '{"text": "Who are you?"}' + sleep 1m # multimodal retrieval microservice @@ -303,10 +325,18 @@ function validate_megaservice() { echo "Validating megaservice with first query" validate_service \ "http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna" \ - '"time_of_frame_ms":' \ + 'red' \ "multimodalqna" \ "multimodalqna-backend-server" \ - '{"messages": "What is the revenue of Nike in 2023?"}' + '{"messages": "Find an apple. What color is it?"}' + + echo "Validating megaservice with audio response" + validate_service \ + "http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna" \ + '"audio":{"data"' \ + "multimodalqna" \ + "multimodalqna-backend-server" \ + '{"messages": "Find an apple. 
What color is it?", "modalities": ["text", "audio"]}' echo "Validating megaservice with first audio query" validate_service \ @@ -344,7 +374,7 @@ function validate_megaservice() { function validate_delete { echo "Validating data prep delete files" - export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/delete" + export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/delete" validate_service \ "${DATAPREP_DELETE_FILE_ENDPOINT}" \ '{"status":true}' \ @@ -357,6 +387,7 @@ function delete_data() { echo "Deleting image, video, and caption" rm -rf ${image_fn} rm -rf ${video_fn} + rm -rf ${audio_fn} rm -rf ${caption_fn} rm -rf ${pdf_fn} } diff --git a/MultimodalQnA/tests/test_compose_on_rocm.sh b/MultimodalQnA/tests/test_compose_on_rocm.sh index 9ba5c68c90..9ba132418e 100644 --- a/MultimodalQnA/tests/test_compose_on_rocm.sh +++ b/MultimodalQnA/tests/test_compose_on_rocm.sh @@ -251,10 +251,10 @@ function validate_megaservice() { echo "Validate megaservice with first query" validate_service \ "http://${host_ip}:8888/v1/multimodalqna" \ - '"time_of_frame_ms":' \ + 'red' \ "multimodalqna" \ "multimodalqna-backend-server" \ - '{"messages": "What is the revenue of Nike in 2023?"}' + '{"messages": "Find an apple. 
What color is it?"}' echo "Validate megaservice with first audio query" validate_service \ diff --git a/MultimodalQnA/tests/test_compose_on_xeon.sh b/MultimodalQnA/tests/test_compose_on_xeon.sh index b5d254b58c..9094faef3d 100644 --- a/MultimodalQnA/tests/test_compose_on_xeon.sh +++ b/MultimodalQnA/tests/test_compose_on_xeon.sh @@ -14,9 +14,10 @@ WORKPATH=$(dirname "$PWD") LOG_PATH="$WORKPATH/tests" ip_address=$(hostname -I | awk '{print $1}') -export image_fn="apple.png" +export image_fn="sample.png" export video_fn="WeAreGoingOnBullrun.mp4" -export caption_fn="apple.txt" +export audio_fn="sample.mp3" # audio_fn and caption_fn are used as captions for image_fn, so they all need the same base name +export caption_fn="sample.txt" export pdf_fn="nke-10k-2023.pdf" function check_service_ready() { @@ -59,7 +60,7 @@ function build_docker_images() { git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
- service_list="multimodalqna multimodalqna-ui embedding-multimodal-bridgetower embedding retriever lvm-llava lvm dataprep whisper" + service_list="multimodalqna multimodalqna-ui embedding-multimodal-bridgetower embedding retriever speecht5 lvm-llava lvm dataprep whisper" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log docker images && sleep 1s } @@ -74,6 +75,8 @@ function setup_env() { export MAX_IMAGES=1 export WHISPER_MODEL="base" export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_PORT}/v1/asr" + export TTS_PORT=7055 + export TTS_ENDPOINT="http://${host_ip}:${TTS_PORT}/v1/tts" export REDIS_DB_PORT=6379 export REDIS_INSIGHTS_PORT=8001 export REDIS_URL="redis://${host_ip}:${REDIS_DB_PORT}" @@ -113,6 +116,7 @@ function prepare_data() { cd $LOG_PATH echo "Downloading image and video" wget https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true -O ${image_fn} + wget https://github.com/intel/intel-extension-for-transformers/raw/refs/tags/v1.5/intel_extension_for_transformers/neural_chat/ui/customized/talkingbot/src/lib/components/talkbot/assets/mid-age-man.mp3 -O ${audio_fn} wget http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/WeAreGoingOnBullrun.mp4 -O ${video_fn} wget https://raw.githubusercontent.com/opea-project/GenAIComps/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf -O ${pdf_fn} echo "Writing caption file" @@ -130,20 +134,23 @@ function validate_service() { if [[ $SERVICE_NAME == *"dataprep-multimodal-redis-transcript"* ]]; then cd $LOG_PATH - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${video_fn}" -H 'Content-Type: multipart/form-data' "$URL") + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${video_fn}" -F "files=@./${audio_fn}" -H 'Content-Type: multipart/form-data' "$URL") elif [[ $SERVICE_NAME == *"dataprep-multimodal-redis-caption"* ]]; then cd $LOG_PATH 
HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${image_fn}" -H 'Content-Type: multipart/form-data' "$URL") + elif [[ $SERVICE_NAME == *"dataprep-multimodal-redis-ingest-image-audio"* ]]; then + cd $LOG_PATH + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${image_fn}" -F "files=@./${audio_fn}" -H 'Content-Type: multipart/form-data' "$URL") elif [[ $SERVICE_NAME == *"dataprep-multimodal-redis-ingest"* ]]; then cd $LOG_PATH - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${image_fn}" -F "files=@./apple.txt" -H 'Content-Type: multipart/form-data' "$URL") + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${image_fn}" -F "files=@./${caption_fn}" -H 'Content-Type: multipart/form-data' "$URL") elif [[ $SERVICE_NAME == *"dataprep-multimodal-redis-pdf"* ]]; then cd $LOG_PATH HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${pdf_fn}" -H 'Content-Type: multipart/form-data' "$URL") elif [[ $SERVICE_NAME == *"dataprep_get"* ]]; then HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' "$URL") elif [[ $SERVICE_NAME == *"dataprep_del"* ]]; then - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "apple.txt"}' -H 'Content-Type: application/json' "$URL") + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "${caption_fn}"}' -H 'Content-Type: application/json' "$URL") else HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") fi @@ -215,13 +222,20 @@ function validate_microservices() { "dataprep-multimodal-redis-transcript" \ "dataprep-multimodal-redis" - echo "Validating Data Prep with Image & Caption Ingestion" + echo "Validating Data Prep with Image 
& Text Caption Ingestion" validate_service \ "${DATAPREP_INGEST_SERVICE_ENDPOINT}" \ "Data preparation succeeded" \ "dataprep-multimodal-redis-ingest" \ "dataprep-multimodal-redis" + echo "Validating Data Prep with Image & Audio Caption Ingestion" + validate_service \ + "${DATAPREP_INGEST_SERVICE_ENDPOINT}" \ + "Data preparation succeeded" \ + "dataprep-multimodal-redis-ingest-image-audio" \ + "dataprep-multimodal-redis" + echo "Validating Data Prep with PDF" validate_service \ "${DATAPREP_INGEST_SERVICE_ENDPOINT}" \ @@ -292,6 +306,15 @@ function validate_microservices() { "dataprep-multimodal-redis-caption" \ "dataprep-multimodal-redis" + echo "Validating Text to speech service" + validate_service \ + "${TTS_ENDPOINT}" \ + '"tts_result":' \ + "speecht5-service" \ + "speecht5-service" \ + '{"text": "Who are you?"}' + + sleep 3m } @@ -300,10 +323,18 @@ function validate_megaservice() { echo "Validating megaservice with first query" validate_service \ "http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna" \ - '"time_of_frame_ms":' \ + 'red' \ + "multimodalqna" \ + "multimodalqna-backend-server" \ + '{"messages": "Find an apple. What color is it?"}' + + echo "Validating megaservice with audio response" + validate_service \ + "http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna" \ + '"audio":{"data"' \ "multimodalqna" \ "multimodalqna-backend-server" \ - '{"messages": "What is the revenue of Nike in 2023?"}' + '{"messages": "Find an apple. 
What color is it?", "modalities": ["text", "audio"]}' echo "Validating megaservice with first audio query" validate_service \ @@ -319,8 +350,7 @@ function validate_megaservice() { '"time_of_frame_ms":' \ "multimodalqna" \ "multimodalqna-backend-server" \ - '{"messages": [{"role": "user", "content": [{"type": "text", "text": "Find a similar image"}, {"type": "image_url", "image_url": {"url": "https://www.ilankelman.org/stopsigns/australia.jpg"}}]}]}' - + '{"messages": [{"role": "user", "content": [{"type": "text", "text": "hello, "}, {"type": "image_url", "image_url": {"url": "https://www.ilankelman.org/stopsigns/australia.jpg"}}]}], "max_tokens": 10, "modalities": ["text", "audio"]}' echo "Validating megaservice with follow-up query" validate_service \ "http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna" \ @@ -340,7 +370,7 @@ function validate_megaservice() { function validate_delete { echo "Validating data prep delete files" - export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/delete" + export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/delete" validate_service \ "${DATAPREP_DELETE_FILE_ENDPOINT}" \ '{"status":true}' \ @@ -353,6 +383,7 @@ function delete_data() { echo "Deleting image, video, and caption" rm -rf ${image_fn} rm -rf ${video_fn} + rm -rf ${audio_fn} rm -rf ${pdf_fn} rm -rf ${caption_fn} } diff --git a/MultimodalQnA/ui/gradio/conversation.py b/MultimodalQnA/ui/gradio/conversation.py index 678f7872c2..42622f9ed1 100644 --- a/MultimodalQnA/ui/gradio/conversation.py +++ b/MultimodalQnA/ui/gradio/conversation.py @@ -3,10 +3,10 @@ import dataclasses from enum import Enum, auto -from typing import Dict, List +from pathlib import Path +from typing import Any, Dict, List, Literal -from PIL import Image -from utils import convert_audio_to_base64, get_b64_frame_from_timestamp +from utils import GRADIO_AUDIO_FORMATS, GRADIO_IMAGE_FORMATS, convert_audio_to_base64, get_b64_frame_from_timestamp class 
SeparatorStyle(Enum): @@ -21,8 +21,7 @@ class Conversation: system: str roles: List[str] - messages: List[List[str]] - image_query_files: Dict[int, str] + chatbot_history: List[Dict[str, Any]] offset: int sep_style: SeparatorStyle = SeparatorStyle.SINGLE sep: str = "\n" @@ -42,66 +41,50 @@ def _template_caption(self): out = f"The caption associated with the image is '{self.caption}'. " return out - def get_prompt(self): - messages = self.messages - if len(messages) > 1 and messages[1][1] is None: - # Need to do RAG. If the query is text, prompt is the query only - if self.audio_query_file: - ret = [{"role": "user", "content": [{"type": "audio", "audio": self.get_b64_audio_query()}]}] - elif 0 in self.image_query_files: - b64_image = get_b64_frame_from_timestamp(self.image_query_files[0], 0) - ret = [ - { - "role": "user", - "content": [ - {"type": "text", "text": messages[0][1]}, - {"type": "image_url", "image_url": {"url": b64_image}}, - ], - } - ] - else: - ret = messages[0][1] - else: - # No need to do RAG. 
Thus, prompt of chatcompletion format - conv_dict = [] - if self.sep_style == SeparatorStyle.SINGLE: - for i, (role, message) in enumerate(messages): - if message: - dic = {"role": role} - content = [{"type": "text", "text": message}] - # There might be audio - if self.audio_query_file: - content.append({"type": "audio", "audio": self.get_b64_audio_query()}) - # There might be a returned item from the first query - if i == 0 and self.time_of_frame_ms and self.video_file: - base64_frame = ( - self.base64_frame - if self.base64_frame - else get_b64_frame_from_timestamp(self.video_file, self.time_of_frame_ms) - ) - if base64_frame is None: - base64_frame = "" - # Include the original caption for the returned image/video - if self.caption and content[0]["type"] == "text": - content[0]["text"] = content[0]["text"] + " " + self._template_caption() - content.append({"type": "image_url", "image_url": {"url": base64_frame}}) - # There might be a query image - if i in self.image_query_files: - content.append( - { - "type": "image_url", - "image_url": {"url": get_b64_frame_from_timestamp(self.image_query_files[i], 0)}, - } - ) - dic["content"] = content - conv_dict.append(dic) - else: - raise ValueError(f"Invalid style: {self.sep_style}") - ret = conv_dict - return ret - - def append_message(self, role, message): - self.messages.append([role, message]) + def get_prompt(self, is_very_first_query): + conv_dict = [{"role": "user", "content": []}] + caption_flag = True + is_image_query = False + + for record in self.chatbot_history: + role = record["role"] + content = record["content"] + + if role == "user": + # Check if last entry of conv_dict has role user + if conv_dict[-1]["role"] != "user": + conv_dict.append({"role": "user", "content": []}) + elif role == "assistant": + caption_flag = False + # Check if last entry of conv_dict has role assistant + if conv_dict[-1]["role"] != "assistant": + conv_dict.append({"role": "assistant", "content": []}) + + # Add content to the last 
conv_dict record. The single space has only effect on first image-only + # query for the similarity search results to get expected response. + if isinstance(content, str): + if caption_flag: + content += " " + self._template_caption() + conv_dict[-1]["content"].append({"type": "text", "text": content}) + + if isinstance(content, dict) and "path" in content: + if Path(content["path"]).suffix in GRADIO_IMAGE_FORMATS: + is_image_query = True + conv_dict[-1]["content"].append( + {"type": "image_url", "image_url": {"url": get_b64_frame_from_timestamp(content["path"], 0)}} + ) + if Path(content["path"]).suffix in GRADIO_AUDIO_FORMATS: + conv_dict[-1]["content"].append( + {"type": "audio", "audio": convert_audio_to_base64(content["path"])} + ) + + # include the image from the assistant's response given the user's is not a image query + if not is_image_query and caption_flag and self.image: + conv_dict[-1]["content"].append( + {"type": "image_url", "image_url": {"url": get_b64_frame_from_timestamp(self.image, 0)}} + ) + + return conv_dict def get_b64_image(self): b64_img = None @@ -118,68 +101,13 @@ def get_b64_audio_query(self): return b64_audio def to_gradio_chatbot(self): - ret = [] - for i, (role, msg) in enumerate(self.messages[self.offset :]): - if i % 2 == 0: - if type(msg) is tuple: - import base64 - from io import BytesIO - - msg, image, image_process_mode = msg - max_hw, min_hw = max(image.size), min(image.size) - aspect_ratio = max_hw / min_hw - max_len, min_len = 800, 400 - shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw)) - longest_edge = int(shortest_edge * aspect_ratio) - W, H = image.size - if H > W: - H, W = longest_edge, shortest_edge - else: - H, W = shortest_edge, longest_edge - image = image.resize((W, H)) - buffered = BytesIO() - image.save(buffered, format="JPEG") - img_b64_str = base64.b64encode(buffered.getvalue()).decode() - img_str = f'user upload image' - msg = img_str + msg.replace("", "").strip() - ret.append([msg, None]) - 
elif i in self.image_query_files: - import base64 - from io import BytesIO - - image = Image.open(self.image_query_files[i]) - max_hw, min_hw = max(image.size), min(image.size) - aspect_ratio = max_hw / min_hw - max_len, min_len = 800, 400 - shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw)) - longest_edge = int(shortest_edge * aspect_ratio) - W, H = image.size - if H > W: - H, W = longest_edge, shortest_edge - else: - H, W = shortest_edge, longest_edge - image = image.resize((W, H)) - buffered = BytesIO() - if image.format not in ["JPEG", "JPG"]: - image = image.convert("RGB") - image.save(buffered, format="JPEG") - img_b64_str = base64.b64encode(buffered.getvalue()).decode() - img_str = f'user upload image' - msg = img_str + msg.replace("", "").strip() - ret.append([msg, None]) - - else: - ret.append([msg, None]) - else: - ret[-1][-1] = msg - return ret + return self.chatbot_history def copy(self): return Conversation( system=self.system, roles=self.roles, - messages=[[x, y] for x, y in self.messages], - image_query_files=self.image_query_files, + chatbot_history=self.chatbot_history, offset=self.offset, sep_style=self.sep_style, sep=self.sep, @@ -192,7 +120,7 @@ def dict(self): return { "system": self.system, "roles": self.roles, - "messages": self.messages, + "chatbot_history": self.chatbot_history, "offset": self.offset, "sep": self.sep, "time_of_frame_ms": self.time_of_frame_ms, @@ -209,8 +137,7 @@ def dict(self): multimodalqna_conv = Conversation( system="", roles=("user", "assistant"), - messages=(), - image_query_files={}, + chatbot_history=[], offset=0, sep_style=SeparatorStyle.SINGLE, sep="\n", diff --git a/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py b/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py index 7919ce5910..7bc54d2a0c 100644 --- a/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py +++ b/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import argparse +import glob 
import os import shutil import time @@ -14,11 +15,25 @@ from fastapi import FastAPI from fastapi.staticfiles import StaticFiles from gradio_pdf import PDF -from utils import build_logger, make_temp_image, server_error_msg, split_video +from utils import ( + GRADIO_AUDIO_FORMATS, + GRADIO_IMAGE_FORMATS, + TMP_DIR, + build_logger, + convert_base64_to_audio, + make_temp_image, + server_error_msg, + split_video, +) + +IMAGE_FORMATS = [".png", ".gif", ".jpg", ".jpeg"] +AUDIO_FORMATS = [".wav", ".mp3"] logger = build_logger("gradio_web_server", "gradio_web_server.log") logflag = os.getenv("LOGFLAG", False) +ui_timeout = int(os.getenv("UI_TIMEOUT", 200)) + headers = {"Content-Type": "application/json"} css = """ @@ -54,85 +69,86 @@ def clear_history(state, request: gr.Request): if state.pdf and os.path.exists(state.pdf): os.remove(state.pdf) state = multimodalqna_conv.copy() - video = gr.Video(height=512, width=512, elem_id="video", visible=True, label="Media") - image = gr.Image(height=512, width=512, elem_id="image", visible=False, label="Media") - pdf = PDF(height=512, elem_id="pdf", interactive=False, visible=False, label="Media") - return (state, state.to_gradio_chatbot(), {"text": "", "files": []}, None, video, image, pdf) + (disable_btn,) * 1 + state.chatbot_history = [] + for file in glob.glob(os.path.join(TMP_DIR, "*.wav")): + os.remove(file) # This removes all chatbot assistant's voice response files + video = gr.Video(value=None, elem_id="video", visible=True, label="Media") + image = gr.Image(value=None, elem_id="image", visible=False, label="Media") + pdf = PDF(value=None, elem_id="pdf", interactive=False, visible=False, label="Media") + return (state, state.to_gradio_chatbot(), None, video, image, pdf) + (disable_btn,) * 1 -def add_text(state, textbox, audio, request: gr.Request): - text = textbox["text"] - logger.info(f"add_text. ip: {request.client.host}. 
len: {len(text)}") - if audio: - state.audio_query_file = audio - state.append_message(state.roles[0], "--input placeholder--") - state.append_message(state.roles[1], None) - state.skip_next = False - return (state, state.to_gradio_chatbot(), None, None) + (disable_btn,) * 1 - # If it is a image query - elif textbox["files"]: - image_file = textbox["files"][0] - state.image_query_files[len(state.messages)] = image_file - state.append_message(state.roles[0], text) - state.append_message(state.roles[1], None) - state.skip_next = False - return (state, state.to_gradio_chatbot(), None, None) + (disable_btn,) * 1 - elif len(text) <= 0: +def add_text(state, multimodal_textbox, request: gr.Request): + text = multimodal_textbox["text"] + files = multimodal_textbox["files"] + + image_file, audio_file = None, None + + text = text.strip() + + if not text and not files: state.skip_next = True - return (state, state.to_gradio_chatbot(), None, None) + (no_change_btn,) * 1 + return (state, state.to_gradio_chatbot(), None) + (no_change_btn,) * 1 text = text[:2000] # Hard cut-off - state.append_message(state.roles[0], text) - state.append_message(state.roles[1], None) state.skip_next = False - return (state, state.to_gradio_chatbot(), None, None) + (disable_btn,) * 1 + if files: + if Path(files[0]).suffix in GRADIO_IMAGE_FORMATS: + image_file = files[0] + if Path(files[0]).suffix in GRADIO_AUDIO_FORMATS or len(files) > 1: + audio_file = files[-1] # Guaranteed that last file would be recorded audio + + # Add to chatbot history + if image_file: + state.image_query_file = image_file + state.chatbot_history.append({"role": state.roles[0], "content": {"path": image_file}}) + if audio_file: + state.audio_query_file = audio_file + state.chatbot_history.append({"role": state.roles[0], "content": {"path": audio_file}}) + + state.chatbot_history.append({"role": state.roles[0], "content": text}) + + logger.info(f"add_text. ip: {request.client.host}. 
len: {len(text)}") + + return (state, state.to_gradio_chatbot(), gr.MultimodalTextbox(value=None)) + (disable_btn,) * 1 -def http_bot(state, request: gr.Request): +def http_bot(state, audio_response_toggler, request: gr.Request): global gateway_addr logger.info(f"http_bot. ip: {request.client.host}") url = gateway_addr - is_very_first_query = False - is_audio_query = state.audio_query_file is not None + if state.skip_next: # This generate call is skipped due to invalid inputs yield (state, state.to_gradio_chatbot(), None, None, None) + (no_change_btn,) * 1 return - if len(state.messages) == state.offset + 2: - # First round of conversation - is_very_first_query = True - new_state = multimodalqna_conv.copy() - new_state.append_message(new_state.roles[0], state.messages[-2][1]) - new_state.append_message(new_state.roles[1], None) - new_state.audio_query_file = state.audio_query_file - new_state.image_query_files = state.image_query_files - state = new_state + is_very_first_query = all(True if h["role"] == "user" else False for h in state.chatbot_history) # Construct prompt - prompt = state.get_prompt() + prompt = state.get_prompt(is_very_first_query) + + modalities = ["text", "audio"] if audio_response_toggler else ["text"] # Make requests - pload = { - "messages": prompt, - } + pload = {"messages": prompt, "modalities": modalities} + + state.chatbot_history.append({"role": state.roles[1], "content": "▌"}) + + yield (state, state.to_gradio_chatbot(), state.split_video, state.image, state.pdf) + (disable_btn,) * 1 if logflag: logger.info(f"==== request ====\n{pload}") logger.info(f"==== url request ====\n{gateway_addr}") - state.messages[-1][-1] = "▌" - - yield (state, state.to_gradio_chatbot(), state.split_video, state.image, state.pdf) + (disable_btn,) * 1 - try: response = requests.post( url, headers=headers, json=pload, - timeout=100, + timeout=ui_timeout, ) logger.info(response.status_code) if logflag: @@ -143,9 +159,15 @@ def http_bot(state, request: 
gr.Request): choice = response["choices"][-1] metadata = choice["metadata"] message = choice["message"]["content"] + audio_response = None + if audio_response_toggler: + if choice["message"]["audio"]: + audio_response = choice["message"]["audio"]["data"] + if ( is_very_first_query and not state.video_file + and metadata and "source_video" in metadata and not state.time_of_frame_ms and "time_of_frame_ms" in metadata @@ -164,7 +186,7 @@ def http_bot(state, request: gr.Request): print(f"video {state.video_file} does not exist in UI host!") splited_video_path = None state.split_video = splited_video_path - elif file_ext in [".jpg", ".jpeg", ".png", ".gif"]: + elif file_ext in IMAGE_FORMATS: try: output_image_path = make_temp_image(state.video_file, file_ext) except: @@ -178,29 +200,37 @@ def http_bot(state, request: gr.Request): print(f"pdf {state.video_file} does not exist in UI host!") output_pdf_path = None state.pdf = output_pdf_path - else: raise requests.exceptions.RequestException + except requests.exceptions.RequestException as e: - state.messages[-1][-1] = server_error_msg + if logflag: + logger.info(f"Request Exception occurred:\n{str(e)}") + + gr.Error("Request exception occurred. 
See logs for details.") + yield (state, state.to_gradio_chatbot(), None, None, None) + (enable_btn,) return - state.messages[-1][-1] = message - - if is_audio_query: - state.messages[-2][-1] = metadata.get("audio", "--transcribed audio not available--") - state.audio_query_file = None + if audio_response: + state.chatbot_history[-1]["content"] = {"path": convert_base64_to_audio(audio_response)} + else: + state.chatbot_history[-1]["content"] = message yield ( state, state.to_gradio_chatbot(), gr.Video(state.split_video, visible=state.split_video is not None), gr.Image(state.image, visible=state.image is not None), - PDF(state.pdf, visible=state.pdf is not None, interactive=False, starting_page=int(state.time_of_frame_ms)), + PDF( + state.pdf, + visible=state.pdf is not None, + interactive=False, + starting_page=int(state.time_of_frame_ms) if state.time_of_frame_ms else 0, + ), ) + (enable_btn,) * 1 - logger.info(f"{state.messages[-1][-1]}") + logger.info(f"{state.chatbot_history[-1]['content']}") return @@ -314,8 +344,10 @@ def ingest_gen_caption(filepath, filetype, request: gr.Request): return -def ingest_with_text(filepath, text, request: gr.Request): +def ingest_with_caption(filepath, text_caption, audio_caption, request: gr.Request): yield (gr.Textbox(visible=True, value="Please wait for your uploaded image to be ingested into the database...")) + + # Process the image verified_filepath = os.path.normpath(filepath) if not verified_filepath.startswith(tmp_upload_folder): print("Found malicious image file name!") @@ -329,19 +361,29 @@ def ingest_with_text(filepath, text, request: gr.Request): basename = os.path.basename(verified_filepath) dest = os.path.join(static_dir, basename) shutil.copy(verified_filepath, dest) - text_basename = "{}.txt".format(os.path.splitext(basename)[0]) - text_dest = os.path.join(static_dir, text_basename) - with open(text_dest, "w") as file: - file.write(text) + + # Process the caption (can be text or audio) + is_audio_caption = 
audio_caption is not None + if is_audio_caption: + verified_audio_path = os.path.normpath(audio_caption) + caption_basename = "{}{}".format(os.path.splitext(basename)[0], os.path.splitext(verified_audio_path)[-1]) + caption_file = audio_caption + else: + caption_basename = "{}.txt".format(os.path.splitext(basename)[0]) + caption_file = os.path.join(static_dir, caption_basename) + with open(caption_file, "w") as file: + file.write(text_caption) + print("Done copying uploaded files to static folder!") headers = { # 'Content-Type': 'multipart/form-data' } - files = [("files", (basename, open(dest, "rb"))), ("files", (text_basename, open(text_dest, "rb")))] + files = [("files", (basename, open(dest, "rb"))), ("files", (caption_basename, open(caption_file, "rb")))] try: response = requests.post(dataprep_ingest_addr, headers=headers, files=files) finally: - os.remove(text_dest) + if not is_audio_caption: + os.remove(caption_file) logger.info(response.status_code) if response.status_code == 200: response = response.json() @@ -427,8 +469,44 @@ def hide_text(request: gr.Request): return gr.Textbox(visible=False) -def clear_text(request: gr.Request): - return None +def hide_text_pdf(pdf, text, request: gr.Request): + if pdf is not None: + return text + else: + return gr.Textbox(visible=False) + + +def clear_captions(request: gr.Request): + return None, None + + +def get_files(): + try: + response = requests.post(dataprep_get_file_addr, headers=headers) + logger.info(response.status_code) + files = response.json() + if files: + html_content = "" + yield (gr.HTML(html_content, visible=True, max_height=200)) + return + else: + yield (gr.HTML("Vector store is empty.", visible=True)) + return + except Exception as e: + logger.info(f"Error getting files from vector store: {str(e)}") + + +def delete_files(): + import json + + data = {"file_path": "all"} + try: + response = requests.post(dataprep_delete_file_addr, headers=headers, data=json.dumps(data)) + 
logger.info(response.status_code) + yield (gr.update(value="Deleted all files!")) + return + except Exception as e: + logger.info(f"Error deleting files from vector store: {str(e)}") with gr.Blocks() as upload_video: @@ -472,13 +550,48 @@ def select_upload_type(choice, request: gr.Request): with gr.Blocks() as upload_image: gr.Markdown("# Ingest Images Using Generated or Custom Captions") - gr.Markdown("Use this interface to ingest an image and generate a caption for it") + gr.Markdown( + "Use this interface to ingest an image and generate a caption for it. If uploading a caption, populate it before the image." + ) + + text_caption_label = "Text Caption" + audio_caption_label = "Voice Audio Caption ({}, or microphone)".format(", ".join(AUDIO_FORMATS)) def select_upload_type(choice, request: gr.Request): if choice == "gen_caption": - return gr.Image(sources="upload", visible=True), gr.Image(sources="upload", visible=False) + return ( + gr.Image(sources="upload", visible=True), + gr.Image(sources="upload", visible=False), + gr.Textbox(visible=False, interactive=True, label=text_caption_label), + gr.Audio(visible=False, type="filepath", label=audio_caption_label), + ) + elif choice == "custom_caption": + return ( + gr.Image(sources="upload", visible=False), + gr.Image(sources="upload", visible=True), + gr.Textbox(visible=True, interactive=True, label=text_caption_label), + gr.Audio(visible=False, type="filepath", label=audio_caption_label), + ) else: - return gr.Image(sources="upload", visible=False), gr.Image(sources="upload", visible=True) + return ( + gr.Image(sources="upload", visible=False), + gr.Image(sources="upload", visible=True), + gr.Textbox(visible=False, interactive=True, label=text_caption_label), + gr.Audio(visible=True, type="filepath", label=audio_caption_label), + ) + + def verify_audio_caption_type(file, request: gr.Request): + audio_type = os.path.splitext(file)[-1] + if audio_type not in AUDIO_FORMATS: + return ( + None, + gr.Textbox(visible=True, 
value="The audio file format must be {}".format(" or ".join(AUDIO_FORMATS))), + ) + else: + return ( + gr.Audio(value=file, visible=True, type="filepath", label=audio_caption_label), + gr.Textbox(visible=False, value=None), + ) with gr.Row(): with gr.Column(scale=6): @@ -486,22 +599,34 @@ def select_upload_type(choice, request: gr.Request): image_upload_text = gr.Image(type="filepath", sources="upload", elem_id="image_upload_cap", visible=False) with gr.Column(scale=3): text_options_radio = gr.Radio( - [("Generate caption", "gen_caption"), ("Custom caption or label", "custom_caption")], - label="Text Options", - info="How should text be ingested?", + [ + ("Auto-generate a caption", "gen_caption"), + ("Upload a text caption (populate before image)", "custom_caption"), + ("Upload an audio caption (populate before image)", "custom_audio_caption"), + ], + label="Caption Options", + info="How should captions be ingested?", value="gen_caption", ) - custom_caption = gr.Textbox(visible=True, interactive=True, label="Custom Caption or Label") + custom_caption = gr.Textbox(visible=False, interactive=True, label=text_caption_label) + custom_caption_audio = gr.Audio(visible=False, type="filepath", label=audio_caption_label) text_upload_result = gr.Textbox(visible=False, interactive=False, label="Upload Status") + custom_caption_audio.input( + verify_audio_caption_type, [custom_caption_audio], [custom_caption_audio, text_upload_result] + ) image_upload_cap.upload( ingest_gen_caption, [image_upload_cap, gr.Textbox(value="image", visible=False)], [text_upload_result] ) image_upload_cap.clear(hide_text, [], [text_upload_result]) - image_upload_text.upload(ingest_with_text, [image_upload_text, custom_caption], [text_upload_result]).then( - clear_text, [], [custom_caption] - ) + image_upload_text.upload( + ingest_with_caption, [image_upload_text, custom_caption, custom_caption_audio], [text_upload_result] + ).then(clear_captions, [], [custom_caption, custom_caption_audio]) 
image_upload_text.clear(hide_text, [], [text_upload_result]) - text_options_radio.change(select_upload_type, [text_options_radio], [image_upload_cap, image_upload_text]) + text_options_radio.change( + select_upload_type, + [text_options_radio], + [image_upload_cap, image_upload_text, custom_caption, custom_caption_audio], + ) with gr.Blocks() as upload_audio: gr.Markdown("# Ingest Audio Using Generated Transcripts") @@ -527,34 +652,29 @@ def select_upload_type(choice, request: gr.Request): pdf_upload = PDF(label="PDF File") with gr.Column(scale=3): pdf_upload_result = gr.Textbox(visible=False, interactive=False, label="Upload Status") + pdf_upload.change(hide_text_pdf, [pdf_upload, pdf_upload_result], [pdf_upload_result]) pdf_upload.upload(ingest_pdf, [pdf_upload], [pdf_upload_result]) with gr.Blocks() as qna: state = gr.State(multimodalqna_conv.copy()) - with gr.Row(): + with gr.Row(equal_height=True): with gr.Column(scale=2): - video = gr.Video(height=512, width=512, elem_id="video", visible=True, label="Media") - image = gr.Image(height=512, width=512, elem_id="image", visible=False, label="Media") - pdf = PDF(height=512, elem_id="pdf", interactive=False, visible=False, label="Media") + video = gr.Video(elem_id="video", visible=True, label="Media") + image = gr.Image(elem_id="image", visible=False, label="Media") + pdf = PDF(elem_id="pdf", interactive=False, visible=False, label="Media") with gr.Column(scale=9): - chatbot = gr.Chatbot(elem_id="chatbot", label="MultimodalQnA Chatbot", height=390) - with gr.Row(): + chatbot = gr.Chatbot(elem_id="chatbot", label="MultimodalQnA Chatbot", type="messages") + with gr.Row(equal_height=True): with gr.Column(scale=8): - with gr.Tabs(): - with gr.TabItem("Text & Image Query"): - textbox = gr.MultimodalTextbox( - show_label=False, container=True, submit_btn=False, file_types=["image"] - ) - with gr.TabItem("Audio Query"): - audio = gr.Audio( - type="filepath", - sources=["microphone", "upload"], - show_label=False, - 
container=False, - ) - with gr.Column(scale=1, min_width=100): + multimodal_textbox = gr.MultimodalTextbox( + show_label=False, + file_types=GRADIO_IMAGE_FORMATS + GRADIO_AUDIO_FORMATS, + sources=["microphone", "upload"], + placeholder="Text, Image & Audio Query", + ) + with gr.Column(scale=1, min_width=150): with gr.Row(): - submit_btn = gr.Button(value="Send", variant="primary", interactive=True) + audio_response_toggler = gr.Checkbox(label="Audio Responses", container=False) with gr.Row(elem_id="buttons") as button_row: clear_btn = gr.Button(value="🗑️ Clear", interactive=False) @@ -563,20 +683,27 @@ def select_upload_type(choice, request: gr.Request): [ state, ], - [state, chatbot, textbox, audio, video, image, pdf, clear_btn], + [state, chatbot, multimodal_textbox, video, image, pdf, clear_btn], ) - submit_btn.click( - add_text, - [state, textbox, audio], - [state, chatbot, textbox, audio, clear_btn], - ).then( - http_bot, - [ - state, - ], - [state, chatbot, video, image, pdf, clear_btn], + multimodal_textbox.submit( + add_text, [state, multimodal_textbox], [state, chatbot, multimodal_textbox, clear_btn] + ).then(http_bot, [state, audio_response_toggler], [state, chatbot, video, image, pdf, clear_btn]).then( + lambda: gr.MultimodalTextbox(interactive=True), None, [multimodal_textbox] ) + +with gr.Blocks() as vector_store: + gr.Markdown("# Uploaded Files") + + with gr.Row(): + with gr.Column(scale=6): + files = gr.HTML(visible=False) + with gr.Column(scale=3): + refresh_btn = gr.Button(value="↻ Refresh", interactive=True, variant="primary") + delete_btn = gr.Button(value="🗑️ Delete", interactive=True, variant="stop") + refresh_btn.click(get_files, None, [files]) + delete_btn.click(delete_files, None, [files]) + with gr.Blocks(css=css) as demo: gr.Markdown("# MultimodalQnA") with gr.Tabs(): @@ -590,6 +717,8 @@ def select_upload_type(choice, request: gr.Request): upload_audio.render() with gr.TabItem("Upload PDF"): upload_pdf.render() + with gr.TabItem("Vector 
Store"): + vector_store.render() demo.queue() app = gr.mount_gradio_app(app, demo, path="/") @@ -618,6 +747,12 @@ def select_upload_type(choice, request: gr.Request): dataprep_gen_caption_endpoint = os.getenv( "DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT", f"http://localhost:{DATAPREP_MMR_PORT}/v1/generate_captions" ) + dataprep_get_file_endpoint = os.getenv( + "DATAPREP_GET_FILE_ENDPOINT", f"http://localhost:{DATAPREP_MMR_PORT}/v1/dataprep/get" + ) + dataprep_delete_file_endpoint = os.getenv( + "DATAPREP_DELETE_FILE_ENDPOINT", f"http://localhost:{DATAPREP_MMR_PORT}/v1/dataprep/delete" + ) args = parser.parse_args() logger.info(f"args: {args}") global gateway_addr @@ -628,5 +763,9 @@ def select_upload_type(choice, request: gr.Request): dataprep_gen_transcript_addr = dataprep_gen_transcript_endpoint global dataprep_gen_caption_addr dataprep_gen_caption_addr = dataprep_gen_caption_endpoint + global dataprep_get_file_addr + dataprep_get_file_addr = dataprep_get_file_endpoint + global dataprep_delete_file_addr + dataprep_delete_file_addr = dataprep_delete_file_endpoint uvicorn.run(app, host=args.host, port=args.port) diff --git a/MultimodalQnA/ui/gradio/requirements.txt b/MultimodalQnA/ui/gradio/requirements.txt index 12081ed73d..80fc5a0dcc 100644 --- a/MultimodalQnA/ui/gradio/requirements.txt +++ b/MultimodalQnA/ui/gradio/requirements.txt @@ -1,5 +1,5 @@ -gradio==5.11.0 -gradio_pdf==0.0.19 +gradio==5.17.1 +gradio_pdf==0.0.20 moviepy==1.0.3 numpy==1.26.4 opencv-python==4.10.0.82 diff --git a/MultimodalQnA/ui/gradio/utils.py b/MultimodalQnA/ui/gradio/utils.py index c22d102a5a..a0ce9d6b7f 100644 --- a/MultimodalQnA/ui/gradio/utils.py +++ b/MultimodalQnA/ui/gradio/utils.py @@ -7,16 +7,24 @@ import os import shutil import sys +import tempfile from pathlib import Path import cv2 from moviepy.video.io.VideoFileClip import VideoFileClip LOGDIR = "." +TMP_DIR = "/tmp" server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. 
PLEASE REGENERATE OR REFRESH THIS PAGE.**" moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." +GRADIO_IMAGE_FORMATS = [".jpeg", ".png", ".jpg", ".gif"] +GRADIO_AUDIO_FORMATS = [ + ".wav", + ".mp3", +] + handler = None save_log = False @@ -186,3 +194,16 @@ def convert_audio_to_base64(audio_path): """Convert .wav file to base64 string.""" encoded_string = base64.b64encode(open(audio_path, "rb").read()) return encoded_string.decode("utf-8") + + +def convert_base64_to_audio(b64_str): + """Decodes the base64 encoded audio data and returns a saved filepath.""" + + audio_data = base64.b64decode(b64_str) + + # Create a temporary file + with tempfile.NamedTemporaryFile(dir=TMP_DIR, delete=False, suffix=".wav") as temp_audio: + temp_audio.write(audio_data) + temp_audio_path = temp_audio.name # Store the path + + return temp_audio_path