Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
11f797e
first commit for tts addition
okhleif-10 Feb 5, 2025
05ddb11
added TTS linkage to backend
okhleif-10 Feb 7, 2025
ee62b73
removed unused import
okhleif-10 Feb 7, 2025
0f4e77d
added necessary env vars
okhleif-10 Feb 10, 2025
fc99972
Merge remote-tracking branch 'origin/mmqna-phase3' into omar/tts-mmqna
okhleif-10 Feb 10, 2025
e500c10
reworked temp tts toggle logic
okhleif-10 Feb 11, 2025
aafee33
added modalities as a toggle
okhleif-10 Feb 11, 2025
e686ec3
removed print statement
okhleif-10 Feb 12, 2025
e4ae51d
removed gaudi from tts
okhleif-10 Feb 12, 2025
0818fff
Merge remote-tracking branch 'origin/mmqna-phase3' into omar/tts-mmqna
okhleif-10 Feb 12, 2025
a1c7adb
doc updates and code refactor
okhleif-10 Feb 13, 2025
0c056a4
Merge remote-tracking branch 'origin/mmqna-phase3' into omar/tts-mmqna
okhleif-10 Feb 13, 2025
632a60b
added tts test to megaservice tests
okhleif-10 Feb 13, 2025
08ab760
remove log files
okhleif-10 Feb 13, 2025
220096e
addressed recent review comments
okhleif-10 Feb 13, 2025
186f7a8
Merge branch 'mmqna-phase3' into hramayan/tts-mmqna-ui
HarshaRamayanam Feb 20, 2025
2137998
Added Logic for audio responses & refactored code to align with new g…
HarshaRamayanam Mar 4, 2025
a575dd3
Merge branch 'mmqna-phase3' into hramayan/tts-mmqna-ui
HarshaRamayanam Mar 4, 2025
59fb709
Minor bug fixes and UI changes
HarshaRamayanam Mar 5, 2025
4013a0d
UI layout update & handling empty text with spaces
HarshaRamayanam Mar 5, 2025
cd4c645
Updates on review comments
HarshaRamayanam Mar 5, 2025
a2cf4dd
Update on review comments
HarshaRamayanam Mar 5, 2025
b5a0e27
Merge branch 'mmqna-phase3' into hramayan/tts-mmqna-ui
HarshaRamayanam Mar 6, 2025
58734e9
Update MultimodalQnA/ui/gradio/multimodalqna_ui_gradio.py
HarshaRamayanam Mar 7, 2025
1e09283
Some updates to review comments. More to come after testing
HarshaRamayanam Mar 7, 2025
2c4ead5
Restrict file media types to known/working formats
HarshaRamayanam Mar 7, 2025
1ce67e2
Remove extra whitespace
HarshaRamayanam Mar 7, 2025
e9f0cd0
Fix test_compose_on_gaudi.sh script's diff not syncing with phase3
HarshaRamayanam Mar 7, 2025
5ad1c18
Changes per review comments
HarshaRamayanam Mar 10, 2025
3a34ec2
Added single space to the pload
HarshaRamayanam Mar 13, 2025
5b47407
Added logic to flush chatbot assistant's voice response .wav
HarshaRamayanam Mar 13, 2025
9189732
Merge branch 'mmqna-phase3' into hramayan/tts-mmqna-ui
HarshaRamayanam Mar 13, 2025
dea974b
Merge branch 'mmqna-phase3' into hramayan/tts-mmqna-ui
HarshaRamayanam Mar 17, 2025
d2a2bc4
Fixed issue where assistant's image is not sent
HarshaRamayanam Mar 18, 2025
c1843f7
Merge branch 'mmqna-phase3' into hramayan/tts-mmqna-ui
HarshaRamayanam Mar 18, 2025
4ed2117
Revert build yaml
HarshaRamayanam Mar 18, 2025
b4ba36c
Clear diff
HarshaRamayanam Mar 18, 2025
bc43cc1
changes per review
HarshaRamayanam Mar 18, 2025
9aad174
small change
HarshaRamayanam Mar 18, 2025
abf0200
Update Dockerfile
HarshaRamayanam Mar 18, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
174 changes: 48 additions & 126 deletions MultimodalQnA/ui/gradio/conversation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
# SPDX-License-Identifier: Apache-2.0

import dataclasses

from enum import Enum, auto
from typing import Dict, List
from pathlib import Path
from typing import Dict, List, Any, Literal

from PIL import Image
from utils import convert_audio_to_base64, get_b64_frame_from_timestamp
from utils import convert_audio_to_base64, get_b64_frame_from_timestamp, GRADIO_IMAGE_FORMATS, GRADIO_AUDIO_FORMATS


class SeparatorStyle(Enum):
Expand All @@ -21,8 +22,7 @@ class Conversation:

system: str
roles: List[str]
messages: List[List[str]]
image_query_files: Dict[int, str]
chatbot_history: List[Dict[str, Any]]
offset: int
sep_style: SeparatorStyle = SeparatorStyle.SINGLE
sep: str = "\n"
Expand All @@ -42,66 +42,44 @@ def _template_caption(self):
out = f"The caption associated with the image is '{self.caption}'. "
return out

def get_prompt(self):
messages = self.messages
if len(messages) > 1 and messages[1][1] is None:
# Need to do RAG. If the query is text, prompt is the query only
if self.audio_query_file:
ret = [{"role": "user", "content": [{"type": "audio", "audio": self.get_b64_audio_query()}]}]
elif 0 in self.image_query_files:
b64_image = get_b64_frame_from_timestamp(self.image_query_files[0], 0)
ret = [
{
"role": "user",
"content": [
{"type": "text", "text": messages[0][1]},
{"type": "image_url", "image_url": {"url": b64_image}},
],
}
]
else:
ret = messages[0][1]
else:
# No need to do RAG. Thus, prompt of chatcompletion format
conv_dict = []
if self.sep_style == SeparatorStyle.SINGLE:
for i, (role, message) in enumerate(messages):
if message:
dic = {"role": role}
content = [{"type": "text", "text": message}]
# There might be audio
if self.audio_query_file:
content.append({"type": "audio", "audio": self.get_b64_audio_query()})
# There might be a returned item from the first query
if i == 0 and self.time_of_frame_ms and self.video_file:
base64_frame = (
self.base64_frame
if self.base64_frame
else get_b64_frame_from_timestamp(self.video_file, self.time_of_frame_ms)
)
if base64_frame is None:
base64_frame = ""
# Include the original caption for the returned image/video
if self.caption and content[0]["type"] == "text":
content[0]["text"] = content[0]["text"] + " " + self._template_caption()
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This call to self._template_caption() is no longer happening, and it was an important fix that gives follow-up queries access to the original caption. Test this with the following steps: (1) upload an image with a caption that specifies the name of someone in the image, (2) query for the image based on the scene description, don't use the person's name, (3) after the image and response are returned, ask for the person's name in a follow-up query. It should give you the correct name.

Copy link
Copy Markdown
Collaborator Author

@HarshaRamayanam HarshaRamayanam Mar 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for catching that. Fixed it here

content.append({"type": "image_url", "image_url": {"url": base64_frame}})
# There might be a query image
if i in self.image_query_files:
content.append(
{
"type": "image_url",
"image_url": {"url": get_b64_frame_from_timestamp(self.image_query_files[i], 0)},
}
)
dic["content"] = content
conv_dict.append(dic)
else:
raise ValueError(f"Invalid style: {self.sep_style}")
ret = conv_dict
return ret

def append_message(self, role, message):
self.messages.append([role, message])
# Build the chat-completion style message list for the current conversation.
#
# Walks self.chatbot_history (records exposing 'role' and 'content' keys) and
# returns a list of {'role': ..., 'content': [...]} dicts in which consecutive
# records from the same role are folded into a single entry. The list always
# starts with a 'user' entry, which stays empty when the history is empty.
#
# NOTE(review): `is_very_first_query` is accepted but never read in the body
# shown here -- confirm whether callers still need to pass it.
# NOTE(review): leading indentation is not visible in this view, so the nesting
# of the statements below (in particular whether the final `self.image` check
# runs once per record or once after the loop) must be confirmed against the
# real file before relying on these comments.
def get_prompt(self, is_very_first_query):
conv_dict = [{'role': 'user', 'content': []}]
# True until the first 'assistant' record is seen; gates both the caption
# suffix on text content and the `self.image` attachment further down.
caption_flag = True
# Set once any record carries a file whose suffix is in GRADIO_IMAGE_FORMATS;
# it is never reset afterwards.
is_image_query = False

for record in self.chatbot_history:
role = record['role']
content = record['content']

# Open a new entry only when the role changes, so back-to-back records from
# the same role share one 'content' list.
if role == 'user':
# Check if last entry of conv_dict has role user
if conv_dict[-1]['role'] != 'user':
conv_dict.append({'role': 'user', 'content': []})
elif role == 'assistant':
caption_flag = False
# Check if last entry of conv_dict has role assistant
if conv_dict[-1]['role'] != 'assistant':
conv_dict.append({'role': 'assistant', 'content': []})

# Add content to the last conv_dict record. The single space has only effect on first image-only
# query for the similarity search results to get expected response.
if isinstance(content, str):
# Text seen before any assistant record gets the caption template appended
# (separated by one space); `content` is rebound locally, the history record
# itself is not modified.
if caption_flag:
content += " " + self._template_caption()
conv_dict[-1]['content'].append({'type': 'text', 'text': content})

# File-style content: a dict with a 'path' key, dispatched on the file suffix.
# Path.suffix includes the leading dot and is compared as-is (case-sensitive).
# NOTE(review): presumably GRADIO_IMAGE_FORMATS / GRADIO_AUDIO_FORMATS hold
# dotted, lower-case extensions -- verify in utils.
if isinstance(content, dict) and 'path' in content:
if Path(content['path']).suffix in GRADIO_IMAGE_FORMATS:
is_image_query = True
# NOTE(review): the second argument 0 is presumably the timestamp of the frame
# to extract -- confirm against get_b64_frame_from_timestamp in utils.
conv_dict[-1]['content'].append({'type': 'image_url', 'image_url': {'url': get_b64_frame_from_timestamp(content['path'], 0)}})
if Path(content['path']).suffix in GRADIO_AUDIO_FORMATS:
conv_dict[-1]['content'].append({'type': 'audio', 'audio': convert_audio_to_base64(content['path'])})

# Include the image from the assistant's response when the user's query is not an image query.
# Only fires while caption_flag is still True, i.e. before any assistant record
# has been processed, and only when self.image is truthy.
if not is_image_query and caption_flag and self.image:
conv_dict[-1]['content'].append({'type': 'image_url', 'image_url': {'url': get_b64_frame_from_timestamp(self.image, 0)}})

return conv_dict

def get_b64_image(self):
b64_img = None
Expand All @@ -118,68 +96,13 @@ def get_b64_audio_query(self):
return b64_audio

def to_gradio_chatbot(self):
ret = []
for i, (role, msg) in enumerate(self.messages[self.offset :]):
if i % 2 == 0:
if type(msg) is tuple:
import base64
from io import BytesIO

msg, image, image_process_mode = msg
max_hw, min_hw = max(image.size), min(image.size)
aspect_ratio = max_hw / min_hw
max_len, min_len = 800, 400
shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
longest_edge = int(shortest_edge * aspect_ratio)
W, H = image.size
if H > W:
H, W = longest_edge, shortest_edge
else:
H, W = shortest_edge, longest_edge
image = image.resize((W, H))
buffered = BytesIO()
image.save(buffered, format="JPEG")
img_b64_str = base64.b64encode(buffered.getvalue()).decode()
img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
msg = img_str + msg.replace("<image>", "").strip()
ret.append([msg, None])
elif i in self.image_query_files:
import base64
from io import BytesIO

image = Image.open(self.image_query_files[i])
max_hw, min_hw = max(image.size), min(image.size)
aspect_ratio = max_hw / min_hw
max_len, min_len = 800, 400
shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
longest_edge = int(shortest_edge * aspect_ratio)
W, H = image.size
if H > W:
H, W = longest_edge, shortest_edge
else:
H, W = shortest_edge, longest_edge
image = image.resize((W, H))
buffered = BytesIO()
if image.format not in ["JPEG", "JPG"]:
image = image.convert("RGB")
image.save(buffered, format="JPEG")
img_b64_str = base64.b64encode(buffered.getvalue()).decode()
img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
msg = img_str + msg.replace("<image>", "").strip()
ret.append([msg, None])

else:
ret.append([msg, None])
else:
ret[-1][-1] = msg
return ret

return self.chatbot_history

def copy(self):
return Conversation(
system=self.system,
roles=self.roles,
messages=[[x, y] for x, y in self.messages],
image_query_files=self.image_query_files,
chatbot_history=self.chatbot_history,
offset=self.offset,
sep_style=self.sep_style,
sep=self.sep,
Expand All @@ -192,7 +115,7 @@ def dict(self):
return {
"system": self.system,
"roles": self.roles,
"messages": self.messages,
"chatbot_history": self.chatbot_history,
"offset": self.offset,
"sep": self.sep,
"time_of_frame_ms": self.time_of_frame_ms,
Expand All @@ -209,8 +132,7 @@ def dict(self):
multimodalqna_conv = Conversation(
system="",
roles=("user", "assistant"),
messages=(),
image_query_files={},
chatbot_history=[],
offset=0,
sep_style=SeparatorStyle.SINGLE,
sep="\n",
Expand Down
Loading