diff --git a/MultimodalQnA/Dockerfile b/MultimodalQnA/Dockerfile
index f0048692e6..fd45b14bc1 100644
--- a/MultimodalQnA/Dockerfile
+++ b/MultimodalQnA/Dockerfile
@@ -20,7 +20,8 @@ WORKDIR $HOME
 FROM base AS git
 
 RUN apt-get update && apt-get install -y --no-install-recommends git
-RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git
+# NOTE(review): appears to be a temporary pin to a development fork/branch; confirm and restore before merge: git clone --depth 1 https://github.com/opea-project/GenAIComps.git
+RUN git clone --depth 1 --single-branch --branch="mmqna-phase3" https://github.com/mhbuehler/GenAIComps.git
 
 # Stage 3: common layer shared by services using GenAIComps
 FROM base AS comps-base
diff --git a/MultimodalQnA/README.md b/MultimodalQnA/README.md
index bda42ee285..125d9fba07 100644
--- a/MultimodalQnA/README.md
+++ b/MultimodalQnA/README.md
@@ -2,7 +2,7 @@
 
 Suppose you possess a set of videos, images, audio files, PDFs, or some combination thereof and wish to perform question-answering to extract insights from these documents. To respond to your questions, the system needs to comprehend a mix of textual, visual, and audio facts drawn from the document contents. The MultimodalQnA framework offers an optimal solution for this purpose.
 
-`MultimodalQnA` addresses your questions by dynamically fetching the most pertinent multimodal information (e.g. images, transcripts, and captions) from your collection of video, image, audio, and PDF files. For this purpose, MultimodalQnA utilizes [BridgeTower model](https://huggingface.co/BridgeTower/bridgetower-large-itm-mlm-gaudi), a multimodal encoding transformer model which merges visual and textual data into a unified semantic space. During the ingestion phase, the BridgeTower model embeds both visual cues and auditory facts as texts, and those embeddings are then stored in a vector database. When it comes to answering a question, the MultimodalQnA will fetch its most relevant multimodal content from the vector store and feed it into a downstream Large Vision-Language Model (LVM) as input context to generate a response for the user.
+`MultimodalQnA` addresses your questions by dynamically fetching the most pertinent multimodal information (e.g. images, transcripts, and captions) from your collection of video, image, audio, and PDF files. For this purpose, MultimodalQnA utilizes [BridgeTower model](https://huggingface.co/BridgeTower/bridgetower-large-itm-mlm-gaudi), a multimodal encoding transformer model which merges visual and textual data into a unified semantic space. During the ingestion phase, the BridgeTower model embeds both visual cues and auditory facts as texts, and those embeddings are then stored in a vector database. When it comes to answering a question, the MultimodalQnA will fetch its most relevant multimodal content from the vector store and feed it into a downstream Large Vision-Language Model (LVM) as input context to generate a response for the user, which can be text or audio.
 
 The MultimodalQnA architecture shows below:
 
@@ -41,12 +41,14 @@ flowchart LR
         UI([UI server<br>]):::orchid
     end
 
+    ASR{{Whisper service <br>}}
     TEI_EM{{Embedding service <br>}}
     VDB{{Vector DB<br><br>}}
     R_RET{{Retriever service <br>}}
     DP([Data Preparation<br>]):::blue
     LVM_gen{{LVM Service <br>}}
     GW([MultimodalQnA GateWay<br>]):::orange
+    TTS{{SpeechT5 service <br>}}
 
     %% Data Preparation flow
     %% Ingest data flow
@@ -74,25 +76,42 @@ flowchart LR
     R_RET <-.->VDB
     DP <-.->VDB
 
+    %% Automatic speech recognition (ASR) used for transcribing audio queries to text
+    GW <-.-> ASR
 
+    %% Generate spoken responses with text-to-speech using the SpeechT5 model
+    GW <-.-> TTS
 
 ```
 
 This MultimodalQnA use case performs Multimodal-RAG using LangChain, Redis VectorDB and Text Generation Inference on [Intel Gaudi2](https://www.intel.com/content/www/us/en/products/details/processors/ai-accelerators/gaudi-overview.html) and [Intel Xeon Scalable Processors](https://www.intel.com/content/www/us/en/products/details/processors/xeon.html), and we invite contributions from other hardware vendors to expand the example.
 
+The [Whisper Service](https://github.com/opea-project/GenAIComps/blob/main/comps/asr/src/README.md)
+is used by MultimodalQnA for converting audio queries to text. If a spoken response is requested, the
+[SpeechT5 Service](https://github.com/opea-project/GenAIComps/blob/main/comps/tts/src/README.md) translates the text
+response from the LVM to a speech audio file.
+
 The Intel Gaudi2 accelerator supports both training and inference for deep learning models in particular for LLMs. Visit [Habana AI products](https://habana.ai/products) for more details.
 
 In the below, we provide a table that describes for each microservice component in the MultimodalQnA architecture, the default configuration of the open source project, hardware, port, and endpoint.
 
 <details>
-<summary><b>Gaudi default compose.yaml</b></summary>
-
-| MicroService | Open Source Project   | HW    | Port | Endpoint                                                    |
-| ------------ | --------------------- | ----- | ---- | ----------------------------------------------------------- |
-| Embedding    | Langchain             | Xeon  | 6000 | /v1/embeddings                                              |
-| Retriever    | Langchain, Redis      | Xeon  | 7000 | /v1/multimodal_retrieval                                    |
-| LVM          | Langchain, TGI        | Gaudi | 9399 | /v1/lvm                                                     |
-| Dataprep     | Redis, Langchain, TGI | Gaudi | 6007 | /v1/generate_transcripts, /v1/generate_captions, /v1/ingest |
+<summary><b>Gaudi and Xeon default compose.yaml settings</b></summary>
+
+| MicroService | Open Source Project     | HW    | Port | Endpoint                                                    |
+| ------------ | ----------------------- | ----- | ---- | ----------------------------------------------------------- |
+| Dataprep     | Redis, Langchain, TGI   | Xeon  | 6007 | /v1/generate_transcripts, /v1/generate_captions, /v1/ingest |
+| Embedding    | Langchain               | Xeon  | 6000 | /v1/embeddings                                              |
+| LVM          | Langchain, Transformers | Xeon  | 9399 | /v1/lvm                                                     |
+| Retriever    | Langchain, Redis        | Xeon  | 7000 | /v1/retrieval                                               |
+| SpeechT5     | Transformers            | Xeon  | 7055 | /v1/tts                                                     |
+| Whisper      | Transformers            | Xeon  | 7066 | /v1/asr                                                     |
+| Dataprep     | Redis, Langchain, TGI   | Gaudi | 6007 | /v1/generate_transcripts, /v1/generate_captions, /v1/ingest |
+| Embedding    | Langchain               | Gaudi | 6000 | /v1/embeddings                                              |
+| LVM          | Langchain, TGI          | Gaudi | 9399 | /v1/lvm                                                     |
+| Retriever    | Langchain, Redis        | Gaudi | 7000 | /v1/retrieval                                               |
+| SpeechT5     | Transformers            | Gaudi | 7055 | /v1/tts                                                     |
+| Whisper      | Transformers            | Gaudi | 7066 | /v1/asr                                                     |
 
 </details>
 
@@ -104,8 +123,12 @@ By default, the embedding and LVM models are set to a default value as listed be
 | --------- | ----- | ----------------------------------------- |
 | embedding | Xeon  | BridgeTower/bridgetower-large-itm-mlm-itc |
 | LVM       | Xeon  | llava-hf/llava-1.5-7b-hf                  |
+| SpeechT5  | Xeon  | microsoft/speecht5_tts                    |
+| Whisper   | Xeon  | openai/whisper-small                      |
 | embedding | Gaudi | BridgeTower/bridgetower-large-itm-mlm-itc |
 | LVM       | Gaudi | llava-hf/llava-v1.6-vicuna-13b-hf         |
+| SpeechT5  | Gaudi | microsoft/speecht5_tts                    |
+| Whisper   | Gaudi | openai/whisper-small                      |
 
 You can choose other LVM models, such as `llava-hf/llava-1.5-7b-hf ` and `llava-hf/llava-1.5-13b-hf`, as needed.
 
@@ -113,9 +136,28 @@ You can choose other LVM models, such as `llava-hf/llava-1.5-7b-hf ` and `llava-
 
 The MultimodalQnA service can be effortlessly deployed on either Intel Gaudi2 or Intel XEON Scalable Processors.
 
-Currently we support deploying MultimodalQnA services with docker compose.
+Currently we support deploying MultimodalQnA services with docker compose. The [`docker_compose`](docker_compose)
+directory has folders which include `compose.yaml` files for different hardware types:
 
-### Setup Environment Variable
+```
+📂 docker_compose
+├── 📂 amd
+│   └── 📂 gpu
+│       └── 📂 rocm
+│           ├── 📄 compose.yaml
+│           └── ...
+└── 📂 intel
+    ├── 📂 cpu
+    │   └── 📂 xeon
+    │       ├── 📄 compose.yaml
+    │       └── ...
+    └── 📂 hpu
+        └── 📂 gaudi
+            ├── 📄 compose.yaml
+            └── ...
+```
+
+### Setup Environment Variables
 
 To set up environment variables for deploying MultimodalQnA services, follow these steps:
 
@@ -124,8 +166,10 @@ To set up environment variables for deploying MultimodalQnA services, follow the
    ```bash
    # Example: export host_ip=$(hostname -I | awk '{print $1}')
    export host_ip="External_Public_IP"
+
+   # Append the host_ip to the no_proxy list to allow container communication
    # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
-   export no_proxy="Your_No_Proxy"
+   export no_proxy="${no_proxy},${host_ip}"
    ```
 
 2. If you are in a proxy environment, also set the proxy-related environment variables:
@@ -137,36 +181,41 @@ To set up environment variables for deploying MultimodalQnA services, follow the
 
 3. Set up other environment variables:
 
-   > Notice that you can only choose **one** command below to set up envs according to your hardware. Other that the port numbers may be set incorrectly.
+   > Choose **one** command below to set env vars according to your hardware. Otherwise, the port numbers may be set incorrectly.
 
    ```bash
    # on Gaudi
-   source ./docker_compose/intel/hpu/gaudi/set_env.sh
+   cd docker_compose/intel/hpu/gaudi
+   source ./set_env.sh
+
    # on Xeon
-   source ./docker_compose/intel/cpu/xeon/set_env.sh
+   cd docker_compose/intel/cpu/xeon
+   source ./set_env.sh
    ```
 
 ### Deploy MultimodalQnA on Gaudi
 
-Refer to the [Gaudi Guide](./docker_compose/intel/hpu/gaudi/README.md) to build docker images from source.
+Refer to the [Gaudi Guide](./docker_compose/intel/hpu/gaudi/README.md) if you would like to build docker images from
+source; otherwise, images will be pulled from Docker Hub.
 
 Find the corresponding [compose.yaml](./docker_compose/intel/hpu/gaudi/compose.yaml).
 
 ```bash
-cd GenAIExamples/MultimodalQnA/docker_compose/intel/hpu/gaudi/
+# While still in the docker_compose/intel/hpu/gaudi directory, use docker compose to bring up the services
 docker compose -f compose.yaml up -d
 ```
 
-> Notice: Currently only the **Habana Driver 1.17.x** is supported for Gaudi.
+> Notice: Currently only the **Habana Driver 1.18.x** is supported for Gaudi.
 
 ### Deploy MultimodalQnA on Xeon
 
-Refer to the [Xeon Guide](./docker_compose/intel/cpu/xeon/README.md) for more instructions on building docker images from source.
+Refer to the [Xeon Guide](./docker_compose/intel/cpu/xeon/README.md) if you would like to build docker images from
+source; otherwise, images will be pulled from Docker Hub.
 
 Find the corresponding [compose.yaml](./docker_compose/intel/cpu/xeon/compose.yaml).
 
 ```bash
-cd GenAIExamples/MultimodalQnA/docker_compose/intel/cpu/xeon/
+# While still in the docker_compose/intel/cpu/xeon directory, use docker compose to bring up the services
 docker compose -f compose.yaml up -d
 ```
 
diff --git a/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md b/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md
index f49b9815f1..4e3a031da9 100644
--- a/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md
+++ b/MultimodalQnA/docker_compose/amd/gpu/rocm/README.md
@@ -178,7 +178,7 @@ curl http://${host_ip}:$MM_EMBEDDING_PORT_MICROSERVICE/v1/embeddings \
 
 ```bash
 export your_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(512)]; print(embedding)")
-curl http://${host_ip}:7000/v1/multimodal_retrieval \
+curl http://${host_ip}:7000/v1/retrieval \
     -X POST \
     -H "Content-Type: application/json" \
     -d "{\"text\":\"test\",\"embedding\":${your_embedding}}"
diff --git a/MultimodalQnA/docker_compose/amd/gpu/rocm/compose.yaml b/MultimodalQnA/docker_compose/amd/gpu/rocm/compose.yaml
index af4855bb59..e743d111e7 100644
--- a/MultimodalQnA/docker_compose/amd/gpu/rocm/compose.yaml
+++ b/MultimodalQnA/docker_compose/amd/gpu/rocm/compose.yaml
@@ -175,6 +175,8 @@ services:
       - DATAPREP_INGEST_SERVICE_ENDPOINT=${DATAPREP_INGEST_SERVICE_ENDPOINT}
       - DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT=${DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT}
       - DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT=${DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT}
+      - DATAPREP_GET_FILE_ENDPOINT=${DATAPREP_GET_FILE_ENDPOINT}
+      - DATAPREP_DELETE_FILE_ENDPOINT=${DATAPREP_DELETE_FILE_ENDPOINT}
     ipc: host
     restart: always
 
diff --git a/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md b/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md
index 7e4fa6894a..eeefddeb2d 100644
--- a/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md
+++ b/MultimodalQnA/docker_compose/intel/cpu/xeon/README.md
@@ -44,6 +44,10 @@ whisper
 ===
 port 7066 - Open to 0.0.0.0/0
 
+speecht5-service
+===
+port 7055 - Open to 0.0.0.0/0
+
 dataprep-multimodal-redis
 ===
 Port 6007 - Open to 0.0.0.0/0
@@ -63,7 +67,7 @@ Since the `compose.yaml` will consume some environment variables, you need to se
 
 **Export the value of the public IP address of your Xeon server to the `host_ip` environment variable**
 
-> Change the External_Public_IP below with the actual IPV4 value
+> Replace the External_Public_IP below with the actual IPv4 value when setting the `host_ip` value (do not use localhost).
 
 ```
 export host_ip="External_Public_IP"
@@ -72,13 +76,10 @@ export host_ip="External_Public_IP"
 **Append the value of the public IP address to the no_proxy list**
 
 ```bash
-export your_no_proxy=${your_no_proxy},"External_Public_IP"
+export no_proxy=${no_proxy},${host_ip}
 ```
 
 ```bash
-export no_proxy=${your_no_proxy}
-export http_proxy=${your_http_proxy}
-export https_proxy=${your_http_proxy}
 export MM_EMBEDDING_SERVICE_HOST_IP=${host_ip}
 export MM_RETRIEVER_SERVICE_HOST_IP=${host_ip}
 export LVM_SERVICE_HOST_IP=${host_ip}
@@ -86,6 +87,8 @@ export MEGA_SERVICE_HOST_IP=${host_ip}
 export WHISPER_PORT=7066
 export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_PORT}/v1/asr"
 export WHISPER_MODEL="base"
+export TTS_PORT=7055
+export TTS_ENDPOINT="http://${host_ip}:${TTS_PORT}/v1/tts"
 export MAX_IMAGES=1
 export REDIS_DB_PORT=6379
 export REDIS_INSIGHTS_PORT=8001
@@ -111,10 +114,9 @@ export LVM_ENDPOINT="http://${host_ip}:$LLAVA_SERVER_PORT"
 export MEGA_SERVICE_PORT=8888
 export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:$MEGA_SERVICE_PORT/v1/multimodalqna"
 export UI_PORT=5173
+export UI_TIMEOUT=200
 ```
 
-Note: Please replace with `host_ip` with you external IP address, do not use localhost.
-
 > Note: The `MAX_IMAGES` environment variable is used to specify the maximum number of images that will be sent from the LVM service to the LLaVA server.
 > If an image list longer than `MAX_IMAGES` is sent to the LVM server, a shortened image list will be sent to the LLaVA service. If the image list
 > needs to be shortened, the most recent images (the ones at the end of the list) are prioritized to send to the LLaVA service. Some LLaVA models have not
@@ -172,7 +174,13 @@ Build whisper server image
 docker build --no-cache -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile .
 ```
 
-### 6. Build MegaService Docker Image
+### 6. Build TTS Image
+
+```bash
+docker build --no-cache -t opea/speecht5:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/integrations/dependency/speecht5/Dockerfile .
+```
+
+### 7. Build MegaService Docker Image
 
 To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the [multimodalqna.py](../../../../multimodalqna.py) Python script. Build MegaService Docker image via below command:
 
@@ -183,7 +191,7 @@ docker build --no-cache -t opea/multimodalqna:latest --build-arg https_proxy=$ht
 cd ../..
 ```
 
-### 7. Build UI Docker Image
+### 8. Build UI Docker Image
 
 Build frontend Docker image via below command:
 
@@ -200,11 +208,12 @@ Then run the command `docker images`, you will have the following 11 Docker Imag
 3. `opea/lvm-llava:latest`
 4. `opea/retriever:latest`
 5. `opea/whisper:latest`
-6. `opea/redis-vector-db`
-7. `opea/embedding:latest`
-8. `opea/embedding-multimodal-bridgetower:latest`
-9. `opea/multimodalqna:latest`
-10. `opea/multimodalqna-ui:latest`
+6. `opea/speecht5:latest`
+7. `opea/redis-vector-db`
+8. `opea/embedding:latest`
+9. `opea/embedding-multimodal-bridgetower:latest`
+10. `opea/multimodalqna:latest`
+11. `opea/multimodalqna-ui:latest`
 
 ## 🚀 Start Microservices
 
@@ -264,7 +273,7 @@ curl http://${host_ip}:$MM_EMBEDDING_PORT_MICROSERVICE/v1/embeddings \
 
 ```bash
 export your_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(512)]; print(embedding)")
-curl http://${host_ip}:${REDIS_RETRIEVER_PORT}/v1/multimodal_retrieval \
+curl http://${host_ip}:${REDIS_RETRIEVER_PORT}/v1/retrieval \
     -X POST \
     -H "Content-Type: application/json" \
     -d "{\"text\":\"test\",\"embedding\":${your_embedding}}"
@@ -279,7 +288,16 @@ curl ${WHISPER_SERVER_ENDPOINT} \
     -d '{"audio" : "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}'
 ```
 
-5. lvm-llava
+5. tts
+
+```bash
+curl ${TTS_ENDPOINT} \
+  -X POST \
+  -d '{"text": "Who are you?"}' \
+  -H 'Content-Type: application/json'
+```
+
+6. lvm-llava
 
 ```bash
 curl http://${host_ip}:${LLAVA_SERVER_PORT}/generate \
@@ -288,7 +306,7 @@ curl http://${host_ip}:${LLAVA_SERVER_PORT}/generate \
      -d '{"prompt":"Describe the image please.", "img_b64_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC"}'
 ```
 
-6. lvm
+7. lvm
 
 ```bash
 curl http://${host_ip}:${LVM_PORT}/v1/lvm \
@@ -313,9 +331,9 @@ curl http://${host_ip}:${LVM_PORT}/v1/lvm \
     -d '{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [], "chat_template":"The caption of the image is: '\''{context}'\''. {question}"}'
 ```
 
-7. dataprep-multimodal-redis
+8. dataprep-multimodal-redis
 
-Download a sample video, image, pdf, and audio file and create a caption
+Download a sample video (.mp4), image (.png, .gif, .jpg), PDF, and audio file (.wav, .mp3) and create a caption
 
 ```bash
 export video_fn="WeAreGoingOnBullrun.mp4"
@@ -334,7 +352,7 @@ export audio_fn="AudioSample.wav"
 wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav -O ${audio_fn}
 ```
 
-Test dataprep microservice with generating transcript. This command updates a knowledge base by uploading a local video .mp4 and an audio .wav file.
+Test dataprep microservice with generating transcript. This command updates a knowledge base by uploading a local video .mp4 and an audio .wav or .mp3 file.
 
 ```bash
 curl --silent --write-out "HTTPSTATUS:%{http_code}" \
@@ -354,7 +372,7 @@ curl --silent --write-out "HTTPSTATUS:%{http_code}" \
     -X POST -F "files=@./${image_fn}"
 ```
 
-Now, test the microservice with posting a custom caption along with an image and a PDF containing images and text.
+Now, test the microservice with posting a custom caption along with an image and a PDF containing images and text. The image caption can be provided as text (`.txt`) or as spoken audio (`.wav` or `.mp3`).
 
 ```bash
 curl --silent --write-out "HTTPSTATUS:%{http_code}" \
@@ -393,7 +411,7 @@ curl -X POST \
     ${DATAPREP_DELETE_FILE_ENDPOINT}
 ```
 
-8. MegaService
+9. MegaService
 
 Test the MegaService with a text query:
 
@@ -428,8 +446,10 @@ curl http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna  \
     -d '{"messages": [{"role": "user", "content": [{"type": "audio", "audio": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}]}]}'
 ```
 
+Test the MegaService with a back and forth conversation between the user and assistant including a text to speech response from the assistant using `"modalities": ["text", "audio"]`:
+
 ```bash
 curl http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna \
     -H "Content-Type: application/json" \
-    -d '{"messages": [{"role": "user", "content": [{"type": "text", "text": "hello, "}, {"type": "image_url", "image_url": {"url": "https://www.ilankelman.org/stopsigns/australia.jpg"}}]}, {"role": "assistant", "content": "opea project! "}, {"role": "user", "content": "chao, "}], "max_tokens": 10}'
+    -d '{"messages": [{"role": "user", "content": [{"type": "text", "text": "hello, "}, {"type": "image_url", "image_url": {"url": "https://www.ilankelman.org/stopsigns/australia.jpg"}}]}, {"role": "assistant", "content": "opea project! "}, {"role": "user", "content": "chao, "}], "max_tokens": 10, "modalities": ["text", "audio"]}'
 ```
diff --git a/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml b/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml
index 31f543c755..0bc7321500 100644
--- a/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/MultimodalQnA/docker_compose/intel/cpu/xeon/compose.yaml
@@ -13,6 +13,19 @@ services:
       http_proxy: ${http_proxy}
       https_proxy: ${https_proxy}
     restart: unless-stopped
+  speecht5-service:
+    image: ${REGISTRY:-opea}/speecht5:${TAG:-latest}
+    container_name: speecht5-service
+    ports:
+      - "${TTS_PORT}:7055"
+    ipc: host
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      TTS_PORT: ${TTS_PORT}
+      TTS_ENDPOINT: ${TTS_ENDPOINT}
+    restart: unless-stopped
   redis-vector-db:
     image: redis/redis-stack:7.2.0-v9
     container_name: redis-vector-db
@@ -152,6 +165,8 @@ services:
       LVM_MODEL_ID: ${LVM_MODEL_ID}
       WHISPER_PORT: ${WHISPER_PORT}
       WHISPER_SERVER_ENDPOINT: ${WHISPER_SERVER_ENDPOINT}
+      TTS_PORT: ${TTS_PORT}
+      TTS_ENDPOINT: ${TTS_ENDPOINT}
     ipc: host
     restart: always
   multimodalqna-ui:
@@ -169,8 +184,11 @@ services:
       - DATAPREP_INGEST_SERVICE_ENDPOINT=${DATAPREP_INGEST_SERVICE_ENDPOINT}
       - DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT=${DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT}
       - DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT=${DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT}
+      - DATAPREP_GET_FILE_ENDPOINT=${DATAPREP_GET_FILE_ENDPOINT}
+      - DATAPREP_DELETE_FILE_ENDPOINT=${DATAPREP_DELETE_FILE_ENDPOINT}
       - MEGA_SERVICE_PORT:=${MEGA_SERVICE_PORT}
       - UI_PORT=${UI_PORT}
+      - UI_TIMEOUT=${UI_TIMEOUT}
       - DATAPREP_MMR_PORT=${DATAPREP_MMR_PORT}
     ipc: host
     restart: always
diff --git a/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh b/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh
index 057f90990c..0cd1267460 100755
--- a/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh
+++ b/MultimodalQnA/docker_compose/intel/cpu/xeon/set_env.sh
@@ -8,15 +8,14 @@ popd > /dev/null
 
 export host_ip=$(hostname -I | awk '{print $1}')
 
-export no_proxy=${your_no_proxy}
-export http_proxy=${your_http_proxy}
-export https_proxy=${your_http_proxy}
-
 export MM_EMBEDDING_SERVICE_HOST_IP=${host_ip}
 export MM_RETRIEVER_SERVICE_HOST_IP=${host_ip}
 export LVM_SERVICE_HOST_IP=${host_ip}
 export MEGA_SERVICE_HOST_IP=${host_ip}
 
+export TTS_PORT=7055
+export TTS_ENDPOINT="http://${host_ip}:${TTS_PORT}/v1/tts"
+
 export WHISPER_PORT=7066
 export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_PORT}/v1/asr"
 export WHISPER_MODEL="base"
@@ -52,3 +51,4 @@ export MEGA_SERVICE_PORT=8888
 export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna"
 
 export UI_PORT=5173
+export UI_TIMEOUT=200
diff --git a/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md b/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md
index 2379fc3d4d..b81c372e20 100644
--- a/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md
+++ b/MultimodalQnA/docker_compose/intel/hpu/gaudi/README.md
@@ -8,7 +8,7 @@ Since the `compose.yaml` will consume some environment variables, you need to se
 
 **Export the value of the public IP address of your Gaudi server to the `host_ip` environment variable**
 
-> Change the External_Public_IP below with the actual IPV4 value
+> Replace the External_Public_IP below with the actual IPv4 value when setting the `host_ip` value (do not use localhost).
 
 ```
 export host_ip="External_Public_IP"
@@ -17,13 +17,10 @@ export host_ip="External_Public_IP"
 **Append the value of the public IP address to the no_proxy list**
 
 ```bash
-export your_no_proxy=${your_no_proxy},"External_Public_IP"
+export no_proxy=${no_proxy},${host_ip}
 ```
 
 ```bash
-export no_proxy=${your_no_proxy}
-export http_proxy=${your_http_proxy}
-export https_proxy=${your_http_proxy}
 export MM_EMBEDDING_SERVICE_HOST_IP=${host_ip}
 export MM_RETRIEVER_SERVICE_HOST_IP=${host_ip}
 export LVM_SERVICE_HOST_IP=${host_ip}
@@ -57,10 +54,9 @@ export LVM_ENDPOINT="http://${host_ip}:${LLAVA_SERVER_PORT}"
 export MEGA_SERVICE_PORT=8888
 export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna"
 export UI_PORT=5173
+export UI_TIMEOUT=200
 ```
 
-Note: Please replace with `host_ip` with you external IP address, do not use localhost.
-
 > Note: The `MAX_IMAGES` environment variable is used to specify the maximum number of images that will be sent from the LVM service to the LLaVA server.
 > If an image list longer than `MAX_IMAGES` is sent to the LVM server, a shortened image list will be sent to the LLaVA service. If the image list
 > needs to be shortened, the most recent images (the ones at the end of the list) are prioritized to send to the LLaVA service. Some LLaVA models have not
@@ -120,7 +116,15 @@ Build whisper server image
 docker build --no-cache -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/src/integrations/dependency/whisper/Dockerfile .
 ```
 
-### 6. Build MegaService Docker Image
+### 6. Build TTS Server Image
+
+Build TTS server image
+
+```bash
+docker build --no-cache -t opea/speecht5:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/src/integrations/dependency/speecht5/Dockerfile .
+```
+
+### 7. Build MegaService Docker Image
 
 To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the [multimodalqna.py](../../../../multimodalqna.py) Python script. Build MegaService Docker image via below command:
 
@@ -130,7 +134,7 @@ cd GenAIExamples/MultimodalQnA
 docker build --no-cache -t opea/multimodalqna:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
 ```
 
-### 6. Build UI Docker Image
+### 8. Build UI Docker Image
 
 Build frontend Docker image via below command:
 
@@ -146,11 +150,12 @@ Then run the command `docker images`, you will have the following 11 Docker Imag
 3. `ghcr.io/huggingface/tgi-gaudi:2.0.6`
 4. `opea/retriever:latest`
 5. `opea/whisper:latest`
-6. `opea/redis-vector-db`
-7. `opea/embedding:latest`
-8. `opea/embedding-multimodal-bridgetower:latest`
-9. `opea/multimodalqna:latest`
-10. `opea/multimodalqna-ui:latest`
+6. `opea/speecht5:latest`
+7. `opea/redis-vector-db`
+8. `opea/embedding:latest`
+9. `opea/embedding-multimodal-bridgetower:latest`
+10. `opea/multimodalqna:latest`
+11. `opea/multimodalqna-ui:latest`
 
 ## 🚀 Start Microservices
 
@@ -210,7 +215,7 @@ curl http://${host_ip}:$MM_EMBEDDING_PORT_MICROSERVICE/v1/embeddings \
 
 ```bash
 export your_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(512)]; print(embedding)")
-curl http://${host_ip}:7000/v1/multimodal_retrieval \
+curl http://${host_ip}:${REDIS_RETRIEVER_PORT}/v1/retrieval \
     -X POST \
     -H "Content-Type: application/json" \
     -d "{\"text\":\"test\",\"embedding\":${your_embedding}}"
@@ -234,7 +239,16 @@ curl http://${host_ip}:${LLAVA_SERVER_PORT}/generate \
     -H 'Content-Type: application/json'
 ```
 
-6. lvm
+6. tts
+
+```bash
+curl ${TTS_ENDPOINT} \
+  -X POST \
+  -d '{"text": "Who are you?"}' \
+  -H 'Content-Type: application/json'
+```
+
+7. lvm
 
 ```bash
 curl http://${host_ip}:${LVM_PORT}/v1/lvm \
@@ -259,9 +273,9 @@ curl http://${host_ip}:${LVM_PORT}/v1/lvm \
     -d '{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [], "chat_template":"The caption of the image is: '\''{context}'\''. {question}"}'
 ```
 
-7. Multimodal Dataprep Microservice
+8. Multimodal Dataprep Microservice
 
-Download a sample video, image, PDF, and audio file and create a caption
+Download a sample video (.mp4), image (.png, .gif, .jpg), PDF, and audio file (.wav, .mp3) and create a caption
 
 ```bash
 export video_fn="WeAreGoingOnBullrun.mp4"
@@ -280,7 +294,7 @@ export audio_fn="AudioSample.wav"
 wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav -O ${audio_fn}
 ```
 
-Test dataprep microservice with generating transcript. This command updates a knowledge base by uploading a local video .mp4 and an audio .wav file.
+Test dataprep microservice with generating transcript. This command updates a knowledge base by uploading a local video .mp4 and an audio .wav or .mp3 file.
 
 ```bash
 curl --silent --write-out "HTTPSTATUS:%{http_code}" \
@@ -300,7 +314,7 @@ curl --silent --write-out "HTTPSTATUS:%{http_code}" \
     -X POST -F "files=@./${image_fn}"
 ```
 
-Now, test the microservice with posting a custom caption along with an image and a PDF containing images and text.
+Now, test the microservice with posting a custom caption along with an image and a PDF containing images and text. The image caption can be provided as text (`.txt`) or as spoken audio (`.wav` or `.mp3`).
 
 ```bash
 curl --silent --write-out "HTTPSTATUS:%{http_code}" \
@@ -339,7 +353,7 @@ curl -X POST \
     ${DATAPREP_DELETE_FILE_ENDPOINT}
 ```
 
-8. MegaService
+9. MegaService
 
 Test the MegaService with a text query:
 
@@ -366,10 +380,10 @@ curl http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna \
     -d  '{"messages": [{"role": "user", "content": [{"type": "text", "text": "Green bananas in a tree"}, {"type": "image_url", "image_url": {"url": "http://images.cocodataset.org/test-stuff2017/000000004248.jpg"}}]}]}'
 ```
 
-Test the MegaService with a back and forth conversation between the user and assistant:
+Test the MegaService with a back and forth conversation between the user and assistant including a text to speech response from the assistant using `"modalities": ["text", "audio"]`:
 
 ```bash
 curl http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna \
 	-H "Content-Type: application/json" \
-	-d '{"messages": [{"role": "user", "content": [{"type": "text", "text": "hello, "}, {"type": "image_url", "image_url": {"url": "https://www.ilankelman.org/stopsigns/australia.jpg"}}]}, {"role": "assistant", "content": "opea project! "}, {"role": "user", "content": "chao, "}], "max_tokens": 10}'
+	-d '{"messages": [{"role": "user", "content": [{"type": "text", "text": "hello, "}, {"type": "image_url", "image_url": {"url": "https://www.ilankelman.org/stopsigns/australia.jpg"}}]}, {"role": "assistant", "content": "opea project! "}, {"role": "user", "content": "chao, "}], "max_tokens": 10, "modalities": ["text", "audio"]}'
 ```
diff --git a/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml
index 26b5610f5e..822d3e2896 100644
--- a/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/MultimodalQnA/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -21,6 +21,19 @@ services:
       WHISPER_PORT: ${WHISPER_PORT}
       WHISPER_SERVER_ENDPOINT: ${WHISPER_SERVER_ENDPOINT}
     restart: unless-stopped
+  speecht5-service:
+    image: ${REGISTRY:-opea}/speecht5:${TAG:-latest}
+    container_name: speecht5-service
+    ports:
+      - "${TTS_PORT}:7055"
+    ipc: host
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      TTS_PORT: ${TTS_PORT}
+      TTS_ENDPOINT: ${TTS_ENDPOINT}
+    restart: unless-stopped
   dataprep-multimodal-redis:
     image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
     container_name: dataprep-multimodal-redis
@@ -182,6 +195,8 @@ services:
       LVM_MODEL_ID: ${LVM_MODEL_ID}
       WHISPER_PORT: ${WHISPER_PORT}
       WHISPER_SERVER_ENDPOINT: ${WHISPER_SERVER_ENDPOINT}
+      TTS_PORT: ${TTS_PORT}
+      TTS_ENDPOINT: ${TTS_ENDPOINT}
     ipc: host
     restart: always
   multimodalqna-ui:
@@ -199,8 +214,11 @@ services:
       - DATAPREP_INGEST_SERVICE_ENDPOINT=${DATAPREP_INGEST_SERVICE_ENDPOINT}
       - DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT=${DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT}
       - DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT=${DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT}
+      - DATAPREP_GET_FILE_ENDPOINT=${DATAPREP_GET_FILE_ENDPOINT}
+      - DATAPREP_DELETE_FILE_ENDPOINT=${DATAPREP_DELETE_FILE_ENDPOINT}
       - MEGA_SERVICE_PORT:=${MEGA_SERVICE_PORT}
       - UI_PORT=${UI_PORT}
+      - UI_TIMEOUT=${UI_TIMEOUT}
       - DATAPREP_MMR_PORT=${DATAPREP_MMR_PORT}
     ipc: host
     restart: always
diff --git a/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh b/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh
index cc35d58d08..ab89b14596 100755
--- a/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh
+++ b/MultimodalQnA/docker_compose/intel/hpu/gaudi/set_env.sh
@@ -13,16 +13,15 @@ export MM_RETRIEVER_SERVICE_HOST_IP=${host_ip}
 export LVM_SERVICE_HOST_IP=${host_ip}
 export MEGA_SERVICE_HOST_IP=${host_ip}
 
-export no_proxy=${your_no_proxy}
-export http_proxy=${your_http_proxy}
-export https_proxy=${your_http_proxy}
-
 export REDIS_DB_PORT=6379
 export REDIS_INSIGHTS_PORT=8001
 export REDIS_URL="redis://${host_ip}:${REDIS_DB_PORT}"
 export REDIS_HOST=${host_ip}
 export INDEX_NAME="mm-rag-redis"
 
+export TTS_PORT=7055
+export TTS_ENDPOINT="http://${host_ip}:${TTS_PORT}/v1/tts"
+
 export WHISPER_MODEL="base"
 export WHISPER_PORT=7066
 export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_PORT}/v1/asr"
@@ -54,3 +53,4 @@ export MEGA_SERVICE_PORT=8888
 export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna"
 
 export UI_PORT=5173
+export UI_TIMEOUT=200
diff --git a/MultimodalQnA/docker_image_build/build.yaml b/MultimodalQnA/docker_image_build/build.yaml
index 1fc599c3e5..cb7d0ebf93 100644
--- a/MultimodalQnA/docker_image_build/build.yaml
+++ b/MultimodalQnA/docker_image_build/build.yaml
@@ -65,3 +65,15 @@ services:
       dockerfile: comps/asr/src/integrations/dependency/whisper/Dockerfile
     extends: multimodalqna
     image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
+  speecht5:
+    build:
+      context: GenAIComps
+      dockerfile: comps/tts/src/integrations/dependency/speecht5/Dockerfile
+    extends: multimodalqna
+    image: ${REGISTRY:-opea}/speecht5:${TAG:-latest}
+  tts:
+    build:
+      context: GenAIComps
+      dockerfile: comps/tts/src/Dockerfile
+    extends: multimodalqna
+    image: ${REGISTRY:-opea}/tts:${TAG:-latest}
diff --git a/MultimodalQnA/multimodalqna.py b/MultimodalQnA/multimodalqna.py
index 0e3f87d190..e89c32aeab 100644
--- a/MultimodalQnA/multimodalqna.py
+++ b/MultimodalQnA/multimodalqna.py
@@ -28,7 +28,9 @@
 LVM_SERVICE_HOST_IP = os.getenv("LVM_SERVICE_HOST_IP", "0.0.0.0")
 LVM_SERVICE_PORT = int(os.getenv("LVM_PORT", 9399))
 WHISPER_PORT = int(os.getenv("WHISPER_PORT", 7066))
-WHISPER_SERVER_ENDPOINT = os.getenv("WHISPER_SERVER_ENDPOINT", "http://0.0.0.0:$WHISPER_PORT/v1/asr")
+WHISPER_SERVER_ENDPOINT = os.getenv("WHISPER_SERVER_ENDPOINT", f"http://0.0.0.0:{WHISPER_PORT}/v1/asr")
+TTS_PORT = int(os.getenv("TTS_PORT", 7055))
+TTS_ENDPOINT = os.getenv("TTS_ENDPOINT", f"http://0.0.0.0:{TTS_PORT}/v1/tts")
 
 
 def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
@@ -252,6 +254,22 @@ def convert_audio_to_text(self, audio):
         response = response.json()
         return response["asr_result"]
 
+    def convert_text_to_audio(self, text):
+        """POST text to TTS_ENDPOINT; return its "tts_result", or a 503 JSONResponse on a non-200 reply."""
+        # `text` is either a plain string or a dict carrying the string under "text".
+        payload = {"text": text["text"] if isinstance(text, dict) else text}
+
+        tts_reply = requests.post(TTS_ENDPOINT, data=json.dumps(payload))
+
+        if tts_reply.status_code != 200:
+            # NOTE(review): handle_request wraps the return in {"data": ...}; confirm it copes with a JSONResponse.
+            return JSONResponse(
+                status_code=503, content={"message": "Unable to convert text to audio. {}".format(tts_reply.text)}
+            )
+
+        # The speech payload is read from the "tts_result" field of the JSON body.
+        return tts_reply.json()["tts_result"]
+
     async def handle_request(self, request: Request):
         """MultimodalQnA accepts input queries as text, images, and/or audio.
 
@@ -271,6 +289,7 @@ async def handle_request(self, request: Request):
             print("[ MultimodalQnAService ] stream=True not used, this has not support stream yet!")
             stream_opt = False
         chat_request = ChatCompletionRequest.model_validate(data)
+        modalities = chat_request.modalities
         num_messages = len(data["messages"]) if isinstance(data["messages"], list) else 1
         messages = self._handle_message(chat_request.messages)
         decoded_audio_input = ""
@@ -333,8 +352,12 @@ async def handle_request(self, request: Request):
                 return response
         last_node = runtime_graph.all_leaves()[-1]
 
+        tts_audio = None
         if "text" in result_dict[last_node].keys():
             response = result_dict[last_node]["text"]
+            # Toggle for TTS
+            if "audio" in modalities:
+                tts_audio = {"data": self.convert_text_to_audio(response)}
         else:
             # text is not in response message
             # something wrong, for example due to empty retrieval results
@@ -359,7 +382,7 @@ async def handle_request(self, request: Request):
         choices.append(
             ChatCompletionResponseChoice(
                 index=0,
-                message=ChatMessage(role="assistant", content=response),
+                message=ChatMessage(role="assistant", content=response, audio=tts_audio),
                 finish_reason="stop",
                 metadata=metadata,
             )
diff --git a/MultimodalQnA/tests/test_compose_on_gaudi.sh b/MultimodalQnA/tests/test_compose_on_gaudi.sh
index ccb4f1894d..e3a854a07e 100644
--- a/MultimodalQnA/tests/test_compose_on_gaudi.sh
+++ b/MultimodalQnA/tests/test_compose_on_gaudi.sh
@@ -14,9 +14,10 @@ WORKPATH=$(dirname "$PWD")
 LOG_PATH="$WORKPATH/tests"
 ip_address=$(hostname -I | awk '{print $1}')
 
-export image_fn="apple.png"
+export image_fn="sample.png"
 export video_fn="WeAreGoingOnBullrun.mp4"
-export caption_fn="apple.txt"
+export audio_fn="sample.mp3"  # audio_fn and caption_fn are used as captions for image_fn, so they all need the same base name
+export caption_fn="sample.txt"
 export pdf_fn="nke-10k-2023.pdf"
 
 function check_service_ready() {
@@ -59,7 +60,7 @@ function build_docker_images() {
     git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
 
     echo "Build all the images with --no-cache, check docker_image_build.log for details..."
-    service_list="multimodalqna multimodalqna-ui embedding-multimodal-bridgetower-gaudi embedding retriever lvm dataprep whisper"
+    service_list="multimodalqna multimodalqna-ui embedding-multimodal-bridgetower-gaudi embedding retriever speecht5 lvm dataprep whisper"
     docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
 
     docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
@@ -82,6 +83,8 @@ function setup_env() {
     export MAX_IMAGES=1
     export WHISPER_MODEL="base"
     export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_PORT}/v1/asr"
+    export TTS_PORT=7055
+    export TTS_ENDPOINT="http://${host_ip}:${TTS_PORT}/v1/tts"
     export DATAPREP_MMR_PORT=6007
     export DATAPREP_INGEST_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/ingest"
     export DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/generate_transcripts"
@@ -116,6 +119,7 @@ function prepare_data() {
     cd $LOG_PATH
     echo "Downloading image and video"
     wget https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true -O ${image_fn}
+    wget https://github.com/intel/intel-extension-for-transformers/raw/refs/tags/v1.5/intel_extension_for_transformers/neural_chat/ui/customized/talkingbot/src/lib/components/talkbot/assets/mid-age-man.mp3 -O ${audio_fn}
     wget http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/WeAreGoingOnBullrun.mp4 -O ${video_fn}
     wget https://raw.githubusercontent.com/opea-project/GenAIComps/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf -O ${pdf_fn}
     echo "Writing caption file"
@@ -133,20 +137,23 @@ function validate_service() {
 
     if [[ $SERVICE_NAME == *"dataprep-multimodal-redis-transcript"* ]]; then
         cd $LOG_PATH
-        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${video_fn}" -H 'Content-Type: multipart/form-data' "$URL")
+        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${video_fn}" -F "files=@./${audio_fn}" -H 'Content-Type: multipart/form-data' "$URL")
     elif [[ $SERVICE_NAME == *"dataprep-multimodal-redis-caption"* ]]; then
          cd $LOG_PATH
          HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${image_fn}" -H 'Content-Type: multipart/form-data' "$URL")
+    elif [[ $SERVICE_NAME == *"dataprep-multimodal-redis-ingest-image-audio"* ]]; then
+        cd $LOG_PATH
+        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${image_fn}" -F "files=@./${audio_fn}" -H 'Content-Type: multipart/form-data' "$URL")
     elif [[ $SERVICE_NAME == *"dataprep-multimodal-redis-ingest"* ]]; then
         cd $LOG_PATH
-        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${image_fn}" -F "files=@./apple.txt" -H 'Content-Type: multipart/form-data' "$URL")
+        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${image_fn}" -F "files=@./${caption_fn}" -H 'Content-Type: multipart/form-data' "$URL")
     elif [[ $SERVICE_NAME == *"dataprep-multimodal-redis-pdf"* ]]; then
         cd $LOG_PATH
         HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${pdf_fn}" -H 'Content-Type: multipart/form-data' "$URL")
     elif [[ $SERVICE_NAME == *"dataprep_get"* ]]; then
         HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' "$URL")
     elif [[ $SERVICE_NAME == *"dataprep_del"* ]]; then
-        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "apple.txt"}' -H 'Content-Type: application/json' "$URL")
+        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "{\"file_path\": \"${caption_fn}\"}" -H 'Content-Type: application/json' "$URL")
     else
         HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
     fi
@@ -218,13 +225,20 @@ function validate_microservices() {
         "dataprep-multimodal-redis-transcript" \
         "dataprep-multimodal-redis"
 
-    echo "Validating Data Prep with Image & Caption Ingestion"
+    echo "Validating Data Prep with Image & Text Caption Ingestion"
     validate_service \
         "${DATAPREP_INGEST_SERVICE_ENDPOINT}" \
         "Data preparation succeeded" \
         "dataprep-multimodal-redis-ingest" \
         "dataprep-multimodal-redis"
 
+    echo "Validating Data Prep with Image & Audio Caption Ingestion"
+    validate_service \
+        "${DATAPREP_INGEST_SERVICE_ENDPOINT}" \
+        "Data preparation succeeded" \
+        "dataprep-multimodal-redis-ingest-image-audio" \
+        "dataprep-multimodal-redis"
+
     echo "Validating Data Prep with PDF"
     validate_service \
         "${DATAPREP_INGEST_SERVICE_ENDPOINT}" \
@@ -246,6 +260,14 @@ function validate_microservices() {
         "dataprep_get" \
         "dataprep-multimodal-redis"
 
+    echo "Validating Text to speech service"
+    validate_service \
+        "${TTS_ENDPOINT}" \
+        '"tts_result":' \
+        "speecht5-service" \
+        "speecht5-service" \
+        '{"text": "Who are you?"}'
+
     sleep 1m
 
     # multimodal retrieval microservice
@@ -303,10 +325,18 @@ function validate_megaservice() {
     echo "Validating megaservice with first query"
     validate_service \
         "http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna" \
-        '"time_of_frame_ms":' \
+        'red' \
         "multimodalqna" \
         "multimodalqna-backend-server" \
-        '{"messages": "What is the revenue of Nike in 2023?"}'
+        '{"messages": "Find an apple. What color is it?"}'
+
+    echo "Validating megaservice with audio response"
+    validate_service \
+        "http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna" \
+        '"audio":{"data"' \
+        "multimodalqna" \
+        "multimodalqna-backend-server" \
+        '{"messages": "Find an apple. What color is it?", "modalities": ["text", "audio"]}'
 
     echo "Validating megaservice with first audio query"
     validate_service \
@@ -344,7 +374,7 @@ function validate_megaservice() {
 
 function validate_delete {
     echo "Validating data prep delete files"
-    export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/delete"
+    export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/delete"
     validate_service \
         "${DATAPREP_DELETE_FILE_ENDPOINT}" \
         '{"status":true}' \
@@ -357,6 +387,7 @@ function delete_data() {
     echo "Deleting image, video, and caption"
     rm -rf ${image_fn}
     rm -rf ${video_fn}
+    rm -rf ${audio_fn}
     rm -rf ${caption_fn}
     rm -rf ${pdf_fn}
 }
diff --git a/MultimodalQnA/tests/test_compose_on_rocm.sh b/MultimodalQnA/tests/test_compose_on_rocm.sh
index 9ba5c68c90..9ba132418e 100644
--- a/MultimodalQnA/tests/test_compose_on_rocm.sh
+++ b/MultimodalQnA/tests/test_compose_on_rocm.sh
@@ -251,10 +251,10 @@ function validate_megaservice() {
     echo "Validate megaservice with first query"
     validate_service \
         "http://${host_ip}:8888/v1/multimodalqna" \
-        '"time_of_frame_ms":' \
+        'red' \
         "multimodalqna" \
         "multimodalqna-backend-server" \
-        '{"messages": "What is the revenue of Nike in 2023?"}'
+        '{"messages": "Find an apple. What color is it?"}'
 
     echo "Validate megaservice with first audio query"
     validate_service \
diff --git a/MultimodalQnA/tests/test_compose_on_xeon.sh b/MultimodalQnA/tests/test_compose_on_xeon.sh
index b5d254b58c..9094faef3d 100644
--- a/MultimodalQnA/tests/test_compose_on_xeon.sh
+++ b/MultimodalQnA/tests/test_compose_on_xeon.sh
@@ -14,9 +14,10 @@ WORKPATH=$(dirname "$PWD")
 LOG_PATH="$WORKPATH/tests"
 ip_address=$(hostname -I | awk '{print $1}')
 
-export image_fn="apple.png"
+export image_fn="sample.png"
 export video_fn="WeAreGoingOnBullrun.mp4"
-export caption_fn="apple.txt"
+export audio_fn="sample.mp3"  # audio_fn and caption_fn are used as captions for image_fn, so they all need the same base name
+export caption_fn="sample.txt"
 export pdf_fn="nke-10k-2023.pdf"
 
 function check_service_ready() {
@@ -59,7 +60,7 @@ function build_docker_images() {
     git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
 
     echo "Build all the images with --no-cache, check docker_image_build.log for details..."
-    service_list="multimodalqna multimodalqna-ui embedding-multimodal-bridgetower embedding retriever lvm-llava lvm dataprep whisper"
+    service_list="multimodalqna multimodalqna-ui embedding-multimodal-bridgetower embedding retriever speecht5 lvm-llava lvm dataprep whisper"
     docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
     docker images && sleep 1s
 }
@@ -74,6 +75,8 @@ function setup_env() {
     export MAX_IMAGES=1
     export WHISPER_MODEL="base"
     export WHISPER_SERVER_ENDPOINT="http://${host_ip}:${WHISPER_PORT}/v1/asr"
+    export TTS_PORT=7055
+    export TTS_ENDPOINT="http://${host_ip}:${TTS_PORT}/v1/tts"
     export REDIS_DB_PORT=6379
     export REDIS_INSIGHTS_PORT=8001
     export REDIS_URL="redis://${host_ip}:${REDIS_DB_PORT}"
@@ -113,6 +116,7 @@ function prepare_data() {
     cd $LOG_PATH
     echo "Downloading image and video"
     wget https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true -O ${image_fn}
+    wget https://github.com/intel/intel-extension-for-transformers/raw/refs/tags/v1.5/intel_extension_for_transformers/neural_chat/ui/customized/talkingbot/src/lib/components/talkbot/assets/mid-age-man.mp3 -O ${audio_fn}
     wget http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/WeAreGoingOnBullrun.mp4 -O ${video_fn}
     wget https://raw.githubusercontent.com/opea-project/GenAIComps/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf -O ${pdf_fn}
     echo "Writing caption file"
@@ -130,20 +134,23 @@ function validate_service() {
 
     if [[ $SERVICE_NAME == *"dataprep-multimodal-redis-transcript"* ]]; then
         cd $LOG_PATH
-        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${video_fn}" -H 'Content-Type: multipart/form-data' "$URL")
+        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${video_fn}" -F "files=@./${audio_fn}" -H 'Content-Type: multipart/form-data' "$URL")
     elif [[ $SERVICE_NAME == *"dataprep-multimodal-redis-caption"* ]]; then
         cd $LOG_PATH
         HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${image_fn}" -H 'Content-Type: multipart/form-data' "$URL")
+    elif [[ $SERVICE_NAME == *"dataprep-multimodal-redis-ingest-image-audio"* ]]; then
+        cd $LOG_PATH
+        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${image_fn}" -F "files=@./${audio_fn}" -H 'Content-Type: multipart/form-data' "$URL")
     elif [[ $SERVICE_NAME == *"dataprep-multimodal-redis-ingest"* ]]; then
         cd $LOG_PATH
-        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${image_fn}" -F "files=@./apple.txt" -H 'Content-Type: multipart/form-data' "$URL")
+        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${image_fn}" -F "files=@./${caption_fn}" -H 'Content-Type: multipart/form-data' "$URL")
     elif [[ $SERVICE_NAME == *"dataprep-multimodal-redis-pdf"* ]]; then
         cd $LOG_PATH
         HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@./${pdf_fn}" -H 'Content-Type: multipart/form-data' "$URL")
     elif [[ $SERVICE_NAME == *"dataprep_get"* ]]; then
         HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' "$URL")
     elif [[ $SERVICE_NAME == *"dataprep_del"* ]]; then
-        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "apple.txt"}' -H 'Content-Type: application/json' "$URL")
+        HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "{\"file_path\": \"${caption_fn}\"}" -H 'Content-Type: application/json' "$URL")
     else
         HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
     fi
@@ -215,13 +222,20 @@ function validate_microservices() {
         "dataprep-multimodal-redis-transcript" \
         "dataprep-multimodal-redis"
 
-    echo "Validating Data Prep with Image & Caption Ingestion"
+    echo "Validating Data Prep with Image & Text Caption Ingestion"
     validate_service \
         "${DATAPREP_INGEST_SERVICE_ENDPOINT}" \
         "Data preparation succeeded" \
         "dataprep-multimodal-redis-ingest" \
         "dataprep-multimodal-redis"
 
+    echo "Validating Data Prep with Image & Audio Caption Ingestion"
+    validate_service \
+        "${DATAPREP_INGEST_SERVICE_ENDPOINT}" \
+        "Data preparation succeeded" \
+        "dataprep-multimodal-redis-ingest-image-audio" \
+        "dataprep-multimodal-redis"
+
     echo "Validating Data Prep with PDF"
     validate_service \
         "${DATAPREP_INGEST_SERVICE_ENDPOINT}" \
@@ -292,6 +306,15 @@ function validate_microservices() {
         "dataprep-multimodal-redis-caption" \
         "dataprep-multimodal-redis"
 
+    echo "Validating Text to speech service"
+    validate_service \
+        "${TTS_ENDPOINT}" \
+        '"tts_result":' \
+        "speecht5-service" \
+        "speecht5-service" \
+        '{"text": "Who are you?"}'
+
+
     sleep 3m
 }
 
@@ -300,10 +323,18 @@ function validate_megaservice() {
     echo "Validating megaservice with first query"
     validate_service \
         "http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna" \
-        '"time_of_frame_ms":' \
+        'red' \
+        "multimodalqna" \
+        "multimodalqna-backend-server" \
+        '{"messages": "Find an apple. What color is it?"}'
+
+    echo "Validating megaservice with audio response"
+    validate_service \
+        "http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna" \
+        '"audio":{"data"' \
         "multimodalqna" \
         "multimodalqna-backend-server" \
-        '{"messages": "What is the revenue of Nike in 2023?"}'
+        '{"messages": "Find an apple. What color is it?", "modalities": ["text", "audio"]}'
 
     echo "Validating megaservice with first audio query"
     validate_service \
@@ -319,8 +350,7 @@ function validate_megaservice() {
         '"time_of_frame_ms":' \
         "multimodalqna" \
         "multimodalqna-backend-server" \
-        '{"messages": [{"role": "user", "content": [{"type": "text", "text": "Find a similar image"}, {"type": "image_url", "image_url": {"url": "https://www.ilankelman.org/stopsigns/australia.jpg"}}]}]}'
-
+        '{"messages": [{"role": "user", "content": [{"type": "text", "text": "hello, "}, {"type": "image_url", "image_url": {"url": "https://www.ilankelman.org/stopsigns/australia.jpg"}}]}], "max_tokens": 10, "modalities": ["text", "audio"]}'
     echo "Validating megaservice with follow-up query"
     validate_service \
         "http://${host_ip}:${MEGA_SERVICE_PORT}/v1/multimodalqna" \
@@ -340,7 +370,7 @@ function validate_megaservice() {
 
 function validate_delete {
     echo "Validating data prep delete files"
-    export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/delete"
+    export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:${DATAPREP_MMR_PORT}/v1/dataprep/delete"
     validate_service \
         "${DATAPREP_DELETE_FILE_ENDPOINT}" \
         '{"status":true}' \
@@ -353,6 +383,7 @@ function delete_data() {
     echo "Deleting image, video, and caption"
     rm -rf ${image_fn}
     rm -rf ${video_fn}
+    rm -rf ${audio_fn}
     rm -rf ${pdf_fn}
     rm -rf ${caption_fn}
 }
diff --git a/MultimodalQnA/ui/gradio/conversation.py b/MultimodalQnA/ui/gradio/conversation.py
index 678f7872c2..42622f9ed1 100644
--- a/MultimodalQnA/ui/gradio/conversation.py
+++ b/MultimodalQnA/ui/gradio/conversation.py
@@ -3,10 +3,10 @@
 
 import dataclasses
 from enum import Enum, auto
-from typing import Dict, List
+from pathlib import Path
+from typing import Any, Dict, List, Literal
 
-from PIL import Image
-from utils import convert_audio_to_base64, get_b64_frame_from_timestamp
+from utils import GRADIO_AUDIO_FORMATS, GRADIO_IMAGE_FORMATS, convert_audio_to_base64, get_b64_frame_from_timestamp
 
 
 class SeparatorStyle(Enum):
@@ -21,8 +21,7 @@ class Conversation:
 
     system: str
     roles: List[str]
-    messages: List[List[str]]
-    image_query_files: Dict[int, str]
+    chatbot_history: List[Dict[str, Any]]
     offset: int
     sep_style: SeparatorStyle = SeparatorStyle.SINGLE
     sep: str = "\n"
@@ -42,66 +41,50 @@ def _template_caption(self):
             out = f"The caption associated with the image is '{self.caption}'. "
         return out
 
-    def get_prompt(self):
-        messages = self.messages
-        if len(messages) > 1 and messages[1][1] is None:
-            # Need to do RAG. If the query is text, prompt is the query only
-            if self.audio_query_file:
-                ret = [{"role": "user", "content": [{"type": "audio", "audio": self.get_b64_audio_query()}]}]
-            elif 0 in self.image_query_files:
-                b64_image = get_b64_frame_from_timestamp(self.image_query_files[0], 0)
-                ret = [
-                    {
-                        "role": "user",
-                        "content": [
-                            {"type": "text", "text": messages[0][1]},
-                            {"type": "image_url", "image_url": {"url": b64_image}},
-                        ],
-                    }
-                ]
-            else:
-                ret = messages[0][1]
-        else:
-            # No need to do RAG. Thus, prompt of chatcompletion format
-            conv_dict = []
-            if self.sep_style == SeparatorStyle.SINGLE:
-                for i, (role, message) in enumerate(messages):
-                    if message:
-                        dic = {"role": role}
-                        content = [{"type": "text", "text": message}]
-                        # There might be audio
-                        if self.audio_query_file:
-                            content.append({"type": "audio", "audio": self.get_b64_audio_query()})
-                        # There might be a returned item from the first query
-                        if i == 0 and self.time_of_frame_ms and self.video_file:
-                            base64_frame = (
-                                self.base64_frame
-                                if self.base64_frame
-                                else get_b64_frame_from_timestamp(self.video_file, self.time_of_frame_ms)
-                            )
-                            if base64_frame is None:
-                                base64_frame = ""
-                            # Include the original caption for the returned image/video
-                            if self.caption and content[0]["type"] == "text":
-                                content[0]["text"] = content[0]["text"] + " " + self._template_caption()
-                            content.append({"type": "image_url", "image_url": {"url": base64_frame}})
-                        # There might be a query image
-                        if i in self.image_query_files:
-                            content.append(
-                                {
-                                    "type": "image_url",
-                                    "image_url": {"url": get_b64_frame_from_timestamp(self.image_query_files[i], 0)},
-                                }
-                            )
-                        dic["content"] = content
-                        conv_dict.append(dic)
-            else:
-                raise ValueError(f"Invalid style: {self.sep_style}")
-            ret = conv_dict
-        return ret
-
-    def append_message(self, role, message):
-        self.messages.append([role, message])
+    def get_prompt(self, is_very_first_query):
+        """Build the chat-completion message list from ``chatbot_history``.
+
+        Consecutive records with the same role are folded into one message.
+        ``is_very_first_query`` is accepted but never read in this body.
+        Returns a list of {"role", "content"} dicts; it always starts with a user entry.
+        """
+        conv_dict = [{"role": "user", "content": []}]
+        caption_flag = True  # cleared by the first assistant record
+        is_image_query = False
+
+        for record in self.chatbot_history:
+            role, payload = record["role"], record["content"]
+
+            if role == "assistant":
+                caption_flag = False
+            # Start a new message when a user/assistant record follows a message of a different role.
+            if role in ("user", "assistant") and conv_dict[-1]["role"] != role:
+                conv_dict.append({"role": role, "content": []})
+            parts = conv_dict[-1]["content"]
+
+            if isinstance(payload, str):
+                # Until an assistant record appears, text gets " " plus the caption template appended.
+                text = payload + " " + self._template_caption() if caption_flag else payload
+                parts.append({"type": "text", "text": text})
+            elif isinstance(payload, dict) and "path" in payload:
+                # File records are routed by the extension of their "path".
+                path = payload["path"]
+                suffix = Path(path).suffix
+                if suffix in GRADIO_IMAGE_FORMATS:
+                    is_image_query = True
+                    parts.append(
+                        {"type": "image_url", "image_url": {"url": get_b64_frame_from_timestamp(path, 0)}}
+                    )
+                if suffix in GRADIO_AUDIO_FORMATS:
+                    parts.append({"type": "audio", "audio": convert_audio_to_base64(path)})
+
+            # Attach self.image while no image record and no assistant record has been seen.
+            if not is_image_query and caption_flag and self.image:
+                parts.append(
+                    {"type": "image_url", "image_url": {"url": get_b64_frame_from_timestamp(self.image, 0)}}
+                )
+
+        return conv_dict
 
     def get_b64_image(self):
         b64_img = None
@@ -118,68 +101,13 @@ def get_b64_audio_query(self):
         return b64_audio
 
     def to_gradio_chatbot(self):
-        ret = []
-        for i, (role, msg) in enumerate(self.messages[self.offset :]):
-            if i % 2 == 0:
-                if type(msg) is tuple:
-                    import base64
-                    from io import BytesIO
-
-                    msg, image, image_process_mode = msg
-                    max_hw, min_hw = max(image.size), min(image.size)
-                    aspect_ratio = max_hw / min_hw
-                    max_len, min_len = 800, 400
-                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
-                    longest_edge = int(shortest_edge * aspect_ratio)
-                    W, H = image.size
-                    if H > W:
-                        H, W = longest_edge, shortest_edge
-                    else:
-                        H, W = shortest_edge, longest_edge
-                    image = image.resize((W, H))
-                    buffered = BytesIO()
-                    image.save(buffered, format="JPEG")
-                    img_b64_str = base64.b64encode(buffered.getvalue()).decode()
-                    img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
-                    msg = img_str + msg.replace("<image>", "").strip()
-                    ret.append([msg, None])
-                elif i in self.image_query_files:
-                    import base64
-                    from io import BytesIO
-
-                    image = Image.open(self.image_query_files[i])
-                    max_hw, min_hw = max(image.size), min(image.size)
-                    aspect_ratio = max_hw / min_hw
-                    max_len, min_len = 800, 400
-                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
-                    longest_edge = int(shortest_edge * aspect_ratio)
-                    W, H = image.size
-                    if H > W:
-                        H, W = longest_edge, shortest_edge
-                    else:
-                        H, W = shortest_edge, longest_edge
-                    image = image.resize((W, H))
-                    buffered = BytesIO()
-                    if image.format not in ["JPEG", "JPG"]:
-                        image = image.convert("RGB")
-                    image.save(buffered, format="JPEG")
-                    img_b64_str = base64.b64encode(buffered.getvalue()).decode()
-                    img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
-                    msg = img_str + msg.replace("<image>", "").strip()
-                    ret.append([msg, None])
-
-                else:
-                    ret.append([msg, None])
-            else:
-                ret[-1][-1] = msg
-        return ret
+        return self.chatbot_history
 
     def copy(self):
         return Conversation(
             system=self.system,
             roles=self.roles,
-            messages=[[x, y] for x, y in self.messages],
-            image_query_files=self.image_query_files,
+            chatbot_history=self.chatbot_history,
             offset=self.offset,
             sep_style=self.sep_style,
             sep=self.sep,
@@ -192,7 +120,7 @@ def dict(self):
         return {
             "system": self.system,
             "roles": self.roles,
-            "messages": self.messages,
+            "chatbot_history": self.chatbot_history,
             "offset": self.offset,
             "sep": self.sep,
             "time_of_frame_ms": self.time_of_frame_ms,
@@ -209,8 +137,7 @@ def dict(self):
 multimodalqna_conv = Conversation(
     system="",
     roles=("user", "assistant"),
-    messages=(),
-    image_query_files={},
+    chatbot_history=[],
     offset=0,
     sep_style=SeparatorStyle.SINGLE,
     sep="\n",
diff --git a/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py b/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py
index 7919ce5910..7bc54d2a0c 100644
--- a/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py
+++ b/MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import argparse
+import glob
 import os
 import shutil
 import time
@@ -14,11 +15,25 @@
 from fastapi import FastAPI
 from fastapi.staticfiles import StaticFiles
 from gradio_pdf import PDF
-from utils import build_logger, make_temp_image, server_error_msg, split_video
+from utils import (
+    GRADIO_AUDIO_FORMATS,
+    GRADIO_IMAGE_FORMATS,
+    TMP_DIR,
+    build_logger,
+    convert_base64_to_audio,
+    make_temp_image,
+    server_error_msg,
+    split_video,
+)
+
+IMAGE_FORMATS = [".png", ".gif", ".jpg", ".jpeg"]
+AUDIO_FORMATS = [".wav", ".mp3"]
 
 logger = build_logger("gradio_web_server", "gradio_web_server.log")
 logflag = os.getenv("LOGFLAG", False)
 
+ui_timeout = int(os.getenv("UI_TIMEOUT", 200))
+
 headers = {"Content-Type": "application/json"}
 
 css = """
@@ -54,85 +69,86 @@ def clear_history(state, request: gr.Request):
     if state.pdf and os.path.exists(state.pdf):
         os.remove(state.pdf)
     state = multimodalqna_conv.copy()
-    video = gr.Video(height=512, width=512, elem_id="video", visible=True, label="Media")
-    image = gr.Image(height=512, width=512, elem_id="image", visible=False, label="Media")
-    pdf = PDF(height=512, elem_id="pdf", interactive=False, visible=False, label="Media")
-    return (state, state.to_gradio_chatbot(), {"text": "", "files": []}, None, video, image, pdf) + (disable_btn,) * 1
+    state.chatbot_history = []
+    for file in glob.glob(os.path.join(TMP_DIR, "*.wav")):
+        os.remove(file)  # This removes all chatbot assistant's voice response files
+    video = gr.Video(value=None, elem_id="video", visible=True, label="Media")
+    image = gr.Image(value=None, elem_id="image", visible=False, label="Media")
+    pdf = PDF(value=None, elem_id="pdf", interactive=False, visible=False, label="Media")
+    return (state, state.to_gradio_chatbot(), None, video, image, pdf) + (disable_btn,) * 1
 
 
-def add_text(state, textbox, audio, request: gr.Request):
-    text = textbox["text"]
-    logger.info(f"add_text. ip: {request.client.host}. len: {len(text)}")
-    if audio:
-        state.audio_query_file = audio
-        state.append_message(state.roles[0], "--input placeholder--")
-        state.append_message(state.roles[1], None)
-        state.skip_next = False
-        return (state, state.to_gradio_chatbot(), None, None) + (disable_btn,) * 1
-    # If it is a image query
-    elif textbox["files"]:
-        image_file = textbox["files"][0]
-        state.image_query_files[len(state.messages)] = image_file
-        state.append_message(state.roles[0], text)
-        state.append_message(state.roles[1], None)
-        state.skip_next = False
-        return (state, state.to_gradio_chatbot(), None, None) + (disable_btn,) * 1
-    elif len(text) <= 0:
+def add_text(state, multimodal_textbox, request: gr.Request):
+    text = multimodal_textbox["text"]
+    files = multimodal_textbox["files"]
+
+    image_file, audio_file = None, None
+
+    text = text.strip()  # whitespace-only text counts as empty
+    # Nothing to send: flag the turn so http_bot (which checks state.skip_next) skips the request.
+    if not text and not files:
         state.skip_next = True
-        return (state, state.to_gradio_chatbot(), None, None) + (no_change_btn,) * 1
+        return (state, state.to_gradio_chatbot(), None) + (no_change_btn,) * 1
 
     text = text[:2000]  # Hard cut-off
 
-    state.append_message(state.roles[0], text)
-    state.append_message(state.roles[1], None)
     state.skip_next = False
 
-    return (state, state.to_gradio_chatbot(), None, None) + (disable_btn,) * 1
+    if files:
+        if Path(files[0]).suffix in GRADIO_IMAGE_FORMATS:  # suffix comparison is case-sensitive
+            image_file = files[0]
+        if Path(files[0]).suffix in GRADIO_AUDIO_FORMATS or len(files) > 1:
+            audio_file = files[-1]  # Guaranteed that last file would be recorded audio
+
+    # Add to chatbot history
+    if image_file:
+        state.image_query_file = image_file
+        state.chatbot_history.append({"role": state.roles[0], "content": {"path": image_file}})
+    if audio_file:
+        state.audio_query_file = audio_file
+        state.chatbot_history.append({"role": state.roles[0], "content": {"path": audio_file}})
+    # The text entry is always appended (possibly as an empty string), after any file entries.
+    state.chatbot_history.append({"role": state.roles[0], "content": text})
+
+    logger.info(f"add_text. ip: {request.client.host}. len: {len(text)}")
+
+    return (state, state.to_gradio_chatbot(), gr.MultimodalTextbox(value=None)) + (disable_btn,) * 1
 
 
-def http_bot(state, request: gr.Request):
+def http_bot(state, audio_response_toggler, request: gr.Request):
     global gateway_addr
     logger.info(f"http_bot. ip: {request.client.host}")
     url = gateway_addr
-    is_very_first_query = False
-    is_audio_query = state.audio_query_file is not None
+
     if state.skip_next:
         # This generate call is skipped due to invalid inputs
         yield (state, state.to_gradio_chatbot(), None, None, None) + (no_change_btn,) * 1
         return
 
-    if len(state.messages) == state.offset + 2:
-        # First round of conversation
-        is_very_first_query = True
-        new_state = multimodalqna_conv.copy()
-        new_state.append_message(new_state.roles[0], state.messages[-2][1])
-        new_state.append_message(new_state.roles[1], None)
-        new_state.audio_query_file = state.audio_query_file
-        new_state.image_query_files = state.image_query_files
-        state = new_state
+    is_very_first_query = all(True if h["role"] == "user" else False for h in state.chatbot_history)
 
     # Construct prompt
-    prompt = state.get_prompt()
+    prompt = state.get_prompt(is_very_first_query)
+
+    modalities = ["text", "audio"] if audio_response_toggler else ["text"]
 
     # Make requests
-    pload = {
-        "messages": prompt,
-    }
+    pload = {"messages": prompt, "modalities": modalities}
+
+    state.chatbot_history.append({"role": state.roles[1], "content": "▌"})
+
+    yield (state, state.to_gradio_chatbot(), state.split_video, state.image, state.pdf) + (disable_btn,) * 1
 
     if logflag:
         logger.info(f"==== request ====\n{pload}")
     logger.info(f"==== url request ====\n{gateway_addr}")
 
-    state.messages[-1][-1] = "▌"
-
-    yield (state, state.to_gradio_chatbot(), state.split_video, state.image, state.pdf) + (disable_btn,) * 1
-
     try:
         response = requests.post(
             url,
             headers=headers,
             json=pload,
-            timeout=100,
+            timeout=ui_timeout,
         )
         logger.info(response.status_code)
         if logflag:
@@ -143,9 +159,15 @@ def http_bot(state, request: gr.Request):
             choice = response["choices"][-1]
             metadata = choice["metadata"]
             message = choice["message"]["content"]
+            audio_response = None
+            if audio_response_toggler:
+                if choice["message"]["audio"]:
+                    audio_response = choice["message"]["audio"]["data"]
+
             if (
                 is_very_first_query
                 and not state.video_file
+                and metadata
                 and "source_video" in metadata
                 and not state.time_of_frame_ms
                 and "time_of_frame_ms" in metadata
@@ -164,7 +186,7 @@ def http_bot(state, request: gr.Request):
                         print(f"video {state.video_file} does not exist in UI host!")
                         splited_video_path = None
                     state.split_video = splited_video_path
-                elif file_ext in [".jpg", ".jpeg", ".png", ".gif"]:
+                elif file_ext in IMAGE_FORMATS:
                     try:
                         output_image_path = make_temp_image(state.video_file, file_ext)
                     except:
@@ -178,29 +200,37 @@ def http_bot(state, request: gr.Request):
                         print(f"pdf {state.video_file} does not exist in UI host!")
                         output_pdf_path = None
                     state.pdf = output_pdf_path
-
         else:
             raise requests.exceptions.RequestException
+
     except requests.exceptions.RequestException as e:
-        state.messages[-1][-1] = server_error_msg
+        if logflag:
+            logger.info(f"Request Exception occurred:\n{str(e)}")
+        # gr.Error is an exception and shows nothing unless raised; gr.Warning shows a toast and lets us keep yielding.
+        gr.Warning("Request exception occurred. See logs for details.")
+        state.chatbot_history[-1]["content"] = server_error_msg  # replace the "▌" placeholder appended before the request
         yield (state, state.to_gradio_chatbot(), None, None, None) + (enable_btn,)
         return
 
-    state.messages[-1][-1] = message
-
-    if is_audio_query:
-        state.messages[-2][-1] = metadata.get("audio", "--transcribed audio not available--")
-        state.audio_query_file = None
+    if audio_response:
+        state.chatbot_history[-1]["content"] = {"path": convert_base64_to_audio(audio_response)}
+    else:
+        state.chatbot_history[-1]["content"] = message
 
     yield (
         state,
         state.to_gradio_chatbot(),
         gr.Video(state.split_video, visible=state.split_video is not None),
         gr.Image(state.image, visible=state.image is not None),
-        PDF(state.pdf, visible=state.pdf is not None, interactive=False, starting_page=int(state.time_of_frame_ms)),
+        PDF(
+            state.pdf,
+            visible=state.pdf is not None,
+            interactive=False,
+            starting_page=int(state.time_of_frame_ms) if state.time_of_frame_ms else 0,
+        ),
     ) + (enable_btn,) * 1
 
-    logger.info(f"{state.messages[-1][-1]}")
+    logger.info(f"{state.chatbot_history[-1]['content']}")
     return
 
 
@@ -314,8 +344,10 @@ def ingest_gen_caption(filepath, filetype, request: gr.Request):
     return
 
 
-def ingest_with_text(filepath, text, request: gr.Request):
+def ingest_with_caption(filepath, text_caption, audio_caption, request: gr.Request):
     yield (gr.Textbox(visible=True, value="Please wait for your uploaded image to be ingested into the database..."))
+
+    # Process the image
     verified_filepath = os.path.normpath(filepath)
     if not verified_filepath.startswith(tmp_upload_folder):
         print("Found malicious image file name!")
@@ -329,19 +361,29 @@ def ingest_with_text(filepath, text, request: gr.Request):
     basename = os.path.basename(verified_filepath)
     dest = os.path.join(static_dir, basename)
     shutil.copy(verified_filepath, dest)
-    text_basename = "{}.txt".format(os.path.splitext(basename)[0])
-    text_dest = os.path.join(static_dir, text_basename)
-    with open(text_dest, "w") as file:
-        file.write(text)
+
+    # Process the caption (can be text or audio)
+    is_audio_caption = audio_caption is not None
+    if is_audio_caption:
+        verified_audio_path = os.path.normpath(audio_caption)
+        caption_basename = "{}{}".format(os.path.splitext(basename)[0], os.path.splitext(verified_audio_path)[-1])
+        caption_file = audio_caption
+    else:
+        caption_basename = "{}.txt".format(os.path.splitext(basename)[0])
+        caption_file = os.path.join(static_dir, caption_basename)
+        with open(caption_file, "w") as file:
+            file.write(text_caption)
+
     print("Done copying uploaded files to static folder!")
     headers = {
         # 'Content-Type': 'multipart/form-data'
     }
-    files = [("files", (basename, open(dest, "rb"))), ("files", (text_basename, open(text_dest, "rb")))]
+    files = [("files", (basename, open(dest, "rb"))), ("files", (caption_basename, open(caption_file, "rb")))]
     try:
         response = requests.post(dataprep_ingest_addr, headers=headers, files=files)
     finally:
-        os.remove(text_dest)
+        if not is_audio_caption:
+            os.remove(caption_file)
     logger.info(response.status_code)
     if response.status_code == 200:
         response = response.json()
@@ -427,8 +469,44 @@ def hide_text(request: gr.Request):
     return gr.Textbox(visible=False)
 
 
-def clear_text(request: gr.Request):
-    return None
+def hide_text_pdf(pdf, text, request: gr.Request):
+    """Pass the status text through while a PDF value is present; return a hidden Textbox once it is None."""
+    if pdf is None:
+        return gr.Textbox(visible=False)
+    return text
+
+
+def clear_captions(request: gr.Request):
+    return None, None  # one None per wired output: the text caption box and the audio caption input
+
+
+def get_files():
+    """Yield a gr.HTML listing the files returned by the dataprep get-file endpoint, or a status/error message."""
+    import html  # local import: file names are escaped before being embedded in markup
+    try:
+        response = requests.post(dataprep_get_file_addr, headers=headers)
+        logger.info(response.status_code)
+        response.raise_for_status()  # an error reply is not a file list
+        files = response.json()
+        items = "".join(f"<li>{html.escape(str(item))}</li>" for item in files or [])
+        empty = gr.HTML("Vector store is empty.", visible=True)
+        yield gr.HTML(f"<ul>{items}</ul>", visible=True, max_height=200) if items else empty
+    except Exception as e:
+        logger.info(f"Error getting files from vector store: {str(e)}")
+        yield gr.HTML("Error getting files from vector store. See logs for details.", visible=True)
+
+
+def delete_files():
+    """Ask the dataprep delete-file endpoint to remove all files, then yield an update reporting the outcome."""
+    import json
+    try:
+        response = requests.post(dataprep_delete_file_addr, headers=headers, data=json.dumps({"file_path": "all"}))
+        logger.info(response.status_code)
+        response.raise_for_status()  # do not report success on an error reply
+        yield gr.update(value="Deleted all files!", visible=True)  # the target HTML component starts hidden
+    except Exception as e:
+        logger.info(f"Error deleting files from vector store: {str(e)}")
+        yield gr.update(value="Error deleting files from vector store. See logs for details.", visible=True)
 
 
 with gr.Blocks() as upload_video:
@@ -472,13 +550,48 @@ def select_upload_type(choice, request: gr.Request):
 
 with gr.Blocks() as upload_image:
     gr.Markdown("# Ingest Images Using Generated or Custom Captions")
-    gr.Markdown("Use this interface to ingest an image and generate a caption for it")
+    gr.Markdown(
+        "Use this interface to ingest an image and generate a caption for it. If uploading a caption, populate it before the image."
+    )
+
+    text_caption_label = "Text Caption"
+    audio_caption_label = "Voice Audio Caption ({}, or microphone)".format(", ".join(AUDIO_FORMATS))
 
     def select_upload_type(choice, request: gr.Request):
         if choice == "gen_caption":
-            return gr.Image(sources="upload", visible=True), gr.Image(sources="upload", visible=False)
+            return (  # outputs, in order: image_upload_cap, image_upload_text, custom_caption, custom_caption_audio
+                gr.Image(sources="upload", visible=True),
+                gr.Image(sources="upload", visible=False),
+                gr.Textbox(visible=False, interactive=True, label=text_caption_label),
+                gr.Audio(visible=False, type="filepath", label=audio_caption_label),
+            )
+        elif choice == "custom_caption":  # text caption: show the second image input and the caption textbox
+            return (
+                gr.Image(sources="upload", visible=False),
+                gr.Image(sources="upload", visible=True),
+                gr.Textbox(visible=True, interactive=True, label=text_caption_label),
+                gr.Audio(visible=False, type="filepath", label=audio_caption_label),
+            )
         else:
-            return gr.Image(sources="upload", visible=False), gr.Image(sources="upload", visible=True)
+            return (  # any other choice (the radio's remaining value is "custom_audio_caption"): show the audio input
+                gr.Image(sources="upload", visible=False),
+                gr.Image(sources="upload", visible=True),
+                gr.Textbox(visible=False, interactive=True, label=text_caption_label),
+                gr.Audio(visible=True, type="filepath", label=audio_caption_label),
+            )
+
+    def verify_audio_caption_type(file, request: gr.Request):
+        """Keep the audio caption if its extension is in AUDIO_FORMATS; otherwise clear it and show the allowed formats."""
+        _, extension = os.path.splitext(file)
+        if extension in AUDIO_FORMATS:
+            # Supported: re-emit the file and hide the status box.
+            accepted_audio = gr.Audio(value=file, visible=True, type="filepath", label=audio_caption_label)
+            return accepted_audio, gr.Textbox(visible=False, value=None)
+
+        # Unsupported: drop the audio value and tell the user which formats are allowed.
+        allowed = " or ".join(AUDIO_FORMATS)
+        rejection = gr.Textbox(visible=True, value="The audio file format must be {}".format(allowed))
+        return None, rejection
 
     with gr.Row():
         with gr.Column(scale=6):
@@ -486,22 +599,34 @@ def select_upload_type(choice, request: gr.Request):
             image_upload_text = gr.Image(type="filepath", sources="upload", elem_id="image_upload_cap", visible=False)
         with gr.Column(scale=3):
             text_options_radio = gr.Radio(
-                [("Generate caption", "gen_caption"), ("Custom caption or label", "custom_caption")],
-                label="Text Options",
-                info="How should text be ingested?",
+                [
+                    ("Auto-generate a caption", "gen_caption"),
+                    ("Upload a text caption (populate before image)", "custom_caption"),
+                    ("Upload an audio caption (populate before image)", "custom_audio_caption"),
+                ],
+                label="Caption Options",
+                info="How should captions be ingested?",
                 value="gen_caption",
             )
-            custom_caption = gr.Textbox(visible=True, interactive=True, label="Custom Caption or Label")
+            custom_caption = gr.Textbox(visible=False, interactive=True, label=text_caption_label)
+            custom_caption_audio = gr.Audio(visible=False, type="filepath", label=audio_caption_label)
             text_upload_result = gr.Textbox(visible=False, interactive=False, label="Upload Status")
+        custom_caption_audio.input(
+            verify_audio_caption_type, [custom_caption_audio], [custom_caption_audio, text_upload_result]
+        )
         image_upload_cap.upload(
             ingest_gen_caption, [image_upload_cap, gr.Textbox(value="image", visible=False)], [text_upload_result]
         )
         image_upload_cap.clear(hide_text, [], [text_upload_result])
-        image_upload_text.upload(ingest_with_text, [image_upload_text, custom_caption], [text_upload_result]).then(
-            clear_text, [], [custom_caption]
-        )
+        image_upload_text.upload(
+            ingest_with_caption, [image_upload_text, custom_caption, custom_caption_audio], [text_upload_result]
+        ).then(clear_captions, [], [custom_caption, custom_caption_audio])
         image_upload_text.clear(hide_text, [], [text_upload_result])
-        text_options_radio.change(select_upload_type, [text_options_radio], [image_upload_cap, image_upload_text])
+        text_options_radio.change(
+            select_upload_type,
+            [text_options_radio],
+            [image_upload_cap, image_upload_text, custom_caption, custom_caption_audio],
+        )
 
 with gr.Blocks() as upload_audio:
     gr.Markdown("# Ingest Audio Using Generated Transcripts")
@@ -527,34 +652,29 @@ def select_upload_type(choice, request: gr.Request):
             pdf_upload = PDF(label="PDF File")
         with gr.Column(scale=3):
             pdf_upload_result = gr.Textbox(visible=False, interactive=False, label="Upload Status")
+        pdf_upload.change(hide_text_pdf, [pdf_upload, pdf_upload_result], [pdf_upload_result])
         pdf_upload.upload(ingest_pdf, [pdf_upload], [pdf_upload_result])
 
 with gr.Blocks() as qna:
     state = gr.State(multimodalqna_conv.copy())
-    with gr.Row():
+    with gr.Row(equal_height=True):
         with gr.Column(scale=2):
-            video = gr.Video(height=512, width=512, elem_id="video", visible=True, label="Media")
-            image = gr.Image(height=512, width=512, elem_id="image", visible=False, label="Media")
-            pdf = PDF(height=512, elem_id="pdf", interactive=False, visible=False, label="Media")
+            video = gr.Video(elem_id="video", visible=True, label="Media")
+            image = gr.Image(elem_id="image", visible=False, label="Media")
+            pdf = PDF(elem_id="pdf", interactive=False, visible=False, label="Media")
         with gr.Column(scale=9):
-            chatbot = gr.Chatbot(elem_id="chatbot", label="MultimodalQnA Chatbot", height=390)
-            with gr.Row():
+            chatbot = gr.Chatbot(elem_id="chatbot", label="MultimodalQnA Chatbot", type="messages")
+            with gr.Row(equal_height=True):
                 with gr.Column(scale=8):
-                    with gr.Tabs():
-                        with gr.TabItem("Text & Image Query"):
-                            textbox = gr.MultimodalTextbox(
-                                show_label=False, container=True, submit_btn=False, file_types=["image"]
-                            )
-                        with gr.TabItem("Audio Query"):
-                            audio = gr.Audio(
-                                type="filepath",
-                                sources=["microphone", "upload"],
-                                show_label=False,
-                                container=False,
-                            )
-                with gr.Column(scale=1, min_width=100):
+                    multimodal_textbox = gr.MultimodalTextbox(
+                        show_label=False,
+                        file_types=GRADIO_IMAGE_FORMATS + GRADIO_AUDIO_FORMATS,
+                        sources=["microphone", "upload"],
+                        placeholder="Text, Image & Audio Query",
+                    )
+                with gr.Column(scale=1, min_width=150):
                     with gr.Row():
-                        submit_btn = gr.Button(value="Send", variant="primary", interactive=True)
+                        audio_response_toggler = gr.Checkbox(label="Audio Responses", container=False)
                     with gr.Row(elem_id="buttons") as button_row:
                         clear_btn = gr.Button(value="🗑️  Clear", interactive=False)
 
@@ -563,20 +683,27 @@ def select_upload_type(choice, request: gr.Request):
         [
             state,
         ],
-        [state, chatbot, textbox, audio, video, image, pdf, clear_btn],
+        [state, chatbot, multimodal_textbox, video, image, pdf, clear_btn],
     )
 
-    submit_btn.click(
-        add_text,
-        [state, textbox, audio],
-        [state, chatbot, textbox, audio, clear_btn],
-    ).then(
-        http_bot,
-        [
-            state,
-        ],
-        [state, chatbot, video, image, pdf, clear_btn],
+    multimodal_textbox.submit(
+        add_text, [state, multimodal_textbox], [state, chatbot, multimodal_textbox, clear_btn]
+    ).then(http_bot, [state, audio_response_toggler], [state, chatbot, video, image, pdf, clear_btn]).then(
+        lambda: gr.MultimodalTextbox(interactive=True), None, [multimodal_textbox]
     )
+
+with gr.Blocks() as vector_store:
+    gr.Markdown("# Uploaded Files")
+
+    with gr.Row():
+        with gr.Column(scale=6):
+            files = gr.HTML(visible=False)
+        with gr.Column(scale=3):
+            refresh_btn = gr.Button(value="↻ Refresh", interactive=True, variant="primary")
+            delete_btn = gr.Button(value="🗑️ Delete", interactive=True, variant="stop")
+        refresh_btn.click(get_files, None, [files])
+        delete_btn.click(delete_files, None, [files])
+
 with gr.Blocks(css=css) as demo:
     gr.Markdown("# MultimodalQnA")
     with gr.Tabs():
@@ -590,6 +717,8 @@ def select_upload_type(choice, request: gr.Request):
             upload_audio.render()
         with gr.TabItem("Upload PDF"):
             upload_pdf.render()
+        with gr.TabItem("Vector Store"):
+            vector_store.render()
 
 demo.queue()
 app = gr.mount_gradio_app(app, demo, path="/")
@@ -618,6 +747,12 @@ def select_upload_type(choice, request: gr.Request):
     dataprep_gen_caption_endpoint = os.getenv(
         "DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT", f"http://localhost:{DATAPREP_MMR_PORT}/v1/generate_captions"
     )
+    dataprep_get_file_endpoint = os.getenv(
+        "DATAPREP_GET_FILE_ENDPOINT", f"http://localhost:{DATAPREP_MMR_PORT}/v1/dataprep/get"
+    )
+    dataprep_delete_file_endpoint = os.getenv(
+        "DATAPREP_DELETE_FILE_ENDPOINT", f"http://localhost:{DATAPREP_MMR_PORT}/v1/dataprep/delete"
+    )
     args = parser.parse_args()
     logger.info(f"args: {args}")
     global gateway_addr
@@ -628,5 +763,9 @@ def select_upload_type(choice, request: gr.Request):
     dataprep_gen_transcript_addr = dataprep_gen_transcript_endpoint
     global dataprep_gen_caption_addr
     dataprep_gen_caption_addr = dataprep_gen_caption_endpoint
+    global dataprep_get_file_addr
+    dataprep_get_file_addr = dataprep_get_file_endpoint
+    global dataprep_delete_file_addr
+    dataprep_delete_file_addr = dataprep_delete_file_endpoint
 
     uvicorn.run(app, host=args.host, port=args.port)
diff --git a/MultimodalQnA/ui/gradio/requirements.txt b/MultimodalQnA/ui/gradio/requirements.txt
index 12081ed73d..80fc5a0dcc 100644
--- a/MultimodalQnA/ui/gradio/requirements.txt
+++ b/MultimodalQnA/ui/gradio/requirements.txt
@@ -1,5 +1,5 @@
-gradio==5.11.0
-gradio_pdf==0.0.19
+gradio==5.17.1
+gradio_pdf==0.0.20
 moviepy==1.0.3
 numpy==1.26.4
 opencv-python==4.10.0.82
diff --git a/MultimodalQnA/ui/gradio/utils.py b/MultimodalQnA/ui/gradio/utils.py
index c22d102a5a..a0ce9d6b7f 100644
--- a/MultimodalQnA/ui/gradio/utils.py
+++ b/MultimodalQnA/ui/gradio/utils.py
@@ -7,16 +7,24 @@
 import os
 import shutil
 import sys
+import tempfile
 from pathlib import Path
 
 import cv2
 from moviepy.video.io.VideoFileClip import VideoFileClip
 
 LOGDIR = "."
+TMP_DIR = "/tmp"
 
 server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
 moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN."
 
+GRADIO_IMAGE_FORMATS = [".jpeg", ".png", ".jpg", ".gif"]
+GRADIO_AUDIO_FORMATS = [
+    ".wav",
+    ".mp3",
+]
+
 handler = None
 save_log = False
 
@@ -186,3 +194,16 @@ def convert_audio_to_base64(audio_path):
     """Convert .wav file to base64 string."""
     encoded_string = base64.b64encode(open(audio_path, "rb").read())
     return encoded_string.decode("utf-8")
+
+
+def convert_base64_to_audio(b64_str):
+    """Decode base64 audio data and persist it as a ``.wav`` file under TMP_DIR.
+
+    The file is created with ``delete=False``, so it is not removed when closed
+    here. Returns the path of the written file.
+    """
+    decoded_bytes = base64.b64decode(b64_str)
+    wav_file = tempfile.NamedTemporaryFile(dir=TMP_DIR, delete=False, suffix=".wav")
+    with wav_file:
+        wav_file.write(decoded_bytes)
+    return wav_file.name