mhbuehler · HarshaRamayanam · Mar 18, 2025 · Feb 5, 2025 · Feb 7, 2025 · Feb 7, 2025
diff --git a/MultimodalQnA/ui/gradio/conversation.py b/MultimodalQnA/ui/gradio/conversation.py
@@ -2,11 +2,12 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import dataclasses
+
 from enum import Enum, auto
-from typing import Dict, List
+from pathlib import Path
+from typing import Dict, List, Any, Literal
 
-from PIL import Image
-from utils import convert_audio_to_base64, get_b64_frame_from_timestamp
+from utils import convert_audio_to_base64, get_b64_frame_from_timestamp, GRADIO_IMAGE_FORMATS, GRADIO_AUDIO_FORMATS
 
 
 class SeparatorStyle(Enum):
@@ -21,8 +22,7 @@ class Conversation:
 
     system: str
     roles: List[str]
-    messages: List[List[str]]
-    image_query_files: Dict[int, str]
+    chatbot_history: List[Dict[str, Any]]
     offset: int
     sep_style: SeparatorStyle = SeparatorStyle.SINGLE
     sep: str = "\n"
@@ -42,66 +42,44 @@ def _template_caption(self):
             out = f"The caption associated with the image is '{self.caption}'. "
         return out
 
-    def get_prompt(self):
-        messages = self.messages
-        if len(messages) > 1 and messages[1][1] is None:
-            # Need to do RAG. If the query is text, prompt is the query only
-            if self.audio_query_file:
-                ret = [{"role": "user", "content": [{"type": "audio", "audio": self.get_b64_audio_query()}]}]
-            elif 0 in self.image_query_files:
-                b64_image = get_b64_frame_from_timestamp(self.image_query_files[0], 0)
-                ret = [
-                    {
-                        "role": "user",
-                        "content": [
-                            {"type": "text", "text": messages[0][1]},
-                            {"type": "image_url", "image_url": {"url": b64_image}},
-                        ],
-                    }
-                ]
-            else:
-                ret = messages[0][1]
-        else:
-            # No need to do RAG. Thus, prompt of chatcompletion format
-            conv_dict = []
-            if self.sep_style == SeparatorStyle.SINGLE:
-                for i, (role, message) in enumerate(messages):
-                    if message:
-                        dic = {"role": role}
-                        content = [{"type": "text", "text": message}]
-                        # There might be audio
-                        if self.audio_query_file:
-                            content.append({"type": "audio", "audio": self.get_b64_audio_query()})
-                        # There might be a returned item from the first query
-                        if i == 0 and self.time_of_frame_ms and self.video_file:
-                            base64_frame = (
-                                self.base64_frame
-                                if self.base64_frame
-                                else get_b64_frame_from_timestamp(self.video_file, self.time_of_frame_ms)
-                            )
-                            if base64_frame is None:
-                                base64_frame = ""
-                            # Include the original caption for the returned image/video
-                            if self.caption and content[0]["type"] == "text":
-                                content[0]["text"] = content[0]["text"] + " " + self._template_caption()
-                            content.append({"type": "image_url", "image_url": {"url": base64_frame}})
-                        # There might be a query image
-                        if i in self.image_query_files:
-                            content.append(
-                                {
-                                    "type": "image_url",
-                                    "image_url": {"url": get_b64_frame_from_timestamp(self.image_query_files[i], 0)},
-                                }
-                            )
-                        dic["content"] = content
-                        conv_dict.append(dic)
-            else:
-                raise ValueError(f"Invalid style: {self.sep_style}")
-            ret = conv_dict
-        return ret
-
-    def append_message(self, role, message):
-        self.messages.append([role, message])
+    def get_prompt(self, is_very_first_query):
+        conv_dict = [{'role': 'user', 'content': []}]
+        caption_flag = True
+        is_image_query = False
+
+        for record in self.chatbot_history:
+            role = record['role']
+            content = record['content']
+
+            if role == 'user':
+                # Check if last entry of conv_dict has role user
+                if conv_dict[-1]['role'] != 'user':
+                    conv_dict.append({'role': 'user', 'content': []})
+            elif role == 'assistant':
+                caption_flag = False
+                # Check if last entry of conv_dict has role assistant
+                if conv_dict[-1]['role'] != 'assistant':
+                    conv_dict.append({'role': 'assistant', 'content': []})
+
+            # Add content to the last conv_dict record. The single space has only effect on first image-only
+            # query for the similarity search results to get expected response.
+            if isinstance(content, str):
+                if caption_flag:
+                    content += " " + self._template_caption()
+                conv_dict[-1]['content'].append({'type': 'text', 'text': content})
+
+            if isinstance(content, dict) and 'path' in content:
+                if Path(content['path']).suffix in GRADIO_IMAGE_FORMATS:
+                    is_image_query = True
+                    conv_dict[-1]['content'].append({'type': 'image_url', 'image_url': {'url': get_b64_frame_from_timestamp(content['path'], 0)}})
+                if Path(content['path']).suffix in GRADIO_AUDIO_FORMATS:
+                    conv_dict[-1]['content'].append({'type': 'audio', 'audio': convert_audio_to_base64(content['path'])})
+
+            # include the image from the assistant's response given the user's is not a image query
+            if not is_image_query and caption_flag and self.image:
+                conv_dict[-1]['content'].append({'type': 'image_url', 'image_url': {'url': get_b64_frame_from_timestamp(self.image, 0)}})
+
+        return conv_dict        
 
     def get_b64_image(self):
         b64_img = None
@@ -118,68 +96,13 @@ def get_b64_audio_query(self):
         return b64_audio
 
     def to_gradio_chatbot(self):
-        ret = []
-        for i, (role, msg) in enumerate(self.messages[self.offset :]):
-            if i % 2 == 0:
-                if type(msg) is tuple:
-                    import base64
-                    from io import BytesIO
-
-                    msg, image, image_process_mode = msg
-                    max_hw, min_hw = max(image.size), min(image.size)
-                    aspect_ratio = max_hw / min_hw
-                    max_len, min_len = 800, 400
-                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
-                    longest_edge = int(shortest_edge * aspect_ratio)
-                    W, H = image.size
-                    if H > W:
-                        H, W = longest_edge, shortest_edge
-                    else:
-                        H, W = shortest_edge, longest_edge
-                    image = image.resize((W, H))
-                    buffered = BytesIO()
-                    image.save(buffered, format="JPEG")
-                    img_b64_str = base64.b64encode(buffered.getvalue()).decode()
-                    img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
-                    msg = img_str + msg.replace("<image>", "").strip()
-                    ret.append([msg, None])
-                elif i in self.image_query_files:
-                    import base64
-                    from io import BytesIO
-
-                    image = Image.open(self.image_query_files[i])
-                    max_hw, min_hw = max(image.size), min(image.size)
-                    aspect_ratio = max_hw / min_hw
-                    max_len, min_len = 800, 400
-                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
-                    longest_edge = int(shortest_edge * aspect_ratio)
-                    W, H = image.size
-                    if H > W:
-                        H, W = longest_edge, shortest_edge
-                    else:
-                        H, W = shortest_edge, longest_edge
-                    image = image.resize((W, H))
-                    buffered = BytesIO()
-                    if image.format not in ["JPEG", "JPG"]:
-                        image = image.convert("RGB")
-                    image.save(buffered, format="JPEG")
-                    img_b64_str = base64.b64encode(buffered.getvalue()).decode()
-                    img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
-                    msg = img_str + msg.replace("<image>", "").strip()
-                    ret.append([msg, None])
-
-                else:
-                    ret.append([msg, None])
-            else:
-                ret[-1][-1] = msg
-        return ret
-
+        return self.chatbot_history
+
     def copy(self):
         return Conversation(
             system=self.system,
             roles=self.roles,
-            messages=[[x, y] for x, y in self.messages],
-            image_query_files=self.image_query_files,
+            chatbot_history=self.chatbot_history,
             offset=self.offset,
             sep_style=self.sep_style,
             sep=self.sep,
@@ -192,7 +115,7 @@ def dict(self):
         return {
             "system": self.system,
             "roles": self.roles,
-            "messages": self.messages,
+            "chatbot_history": self.chatbot_history,
             "offset": self.offset,
             "sep": self.sep,
             "time_of_frame_ms": self.time_of_frame_ms,
@@ -209,8 +132,7 @@ def dict(self):
 multimodalqna_conv = Conversation(
     system="",
     roles=("user", "assistant"),
-    messages=(),
-    image_query_files={},
+    chatbot_history=[],
     offset=0,
     sep_style=SeparatorStyle.SINGLE,
     sep="\n",