From 5aceeb7d5ed537abfaa50552a8f3439aa5e8a03f Mon Sep 17 00:00:00 2001 From: claw Date: Wed, 20 May 2026 11:23:29 +0000 Subject: [PATCH 01/35] feat: Add transformers for videos and bounding boxes --- aperturedb/transformers/__init__.py | 21 +++++++++ .../transformers/bounding_box_properties.py | 36 ++++++++++++++++ aperturedb/transformers/common_properties.py | 39 +++++++++++++++-- aperturedb/transformers/image_properties.py | 6 +-- aperturedb/transformers/transformer.py | 14 +++++- aperturedb/transformers/video_properties.py | 43 +++++++++++++++++++ 6 files changed, 151 insertions(+), 8 deletions(-) create mode 100644 aperturedb/transformers/bounding_box_properties.py create mode 100644 aperturedb/transformers/video_properties.py diff --git a/aperturedb/transformers/__init__.py b/aperturedb/transformers/__init__.py index e69de29b..fe2d5727 100644 --- a/aperturedb/transformers/__init__.py +++ b/aperturedb/transformers/__init__.py @@ -0,0 +1,21 @@ +from .transformer import Transformer +from .common_properties import CommonProperties +from .image_properties import ImageProperties +from .video_properties import VideoProperties +from .bounding_box_properties import BoundingBoxProperties +from .facenet_pytorch_embeddings import FacenetPyTorchEmbeddings +from .clip_pytorch_embeddings import CLIPPyTorchEmbeddings +from .facenet import Facenet +from .clip import CLIP + +__all__ = [ + "Transformer", + "CommonProperties", + "ImageProperties", + "VideoProperties", + "BoundingBoxProperties", + "FacenetPyTorchEmbeddings", + "CLIPPyTorchEmbeddings", + "Facenet", + "CLIP", +] \ No newline at end of file diff --git a/aperturedb/transformers/bounding_box_properties.py b/aperturedb/transformers/bounding_box_properties.py new file mode 100644 index 00000000..376cb7a4 --- /dev/null +++ b/aperturedb/transformers/bounding_box_properties.py @@ -0,0 +1,36 @@ +from aperturedb.transformers.transformer import Transformer +from aperturedb.Subscriptable import Subscriptable +import logging + 
+logger = logging.getLogger(__name__) + +class BoundingBoxProperties(Transformer): + """ + This computes some bounding box properties and adds them to the metadata. + """ + + def __init__(self, data: Subscriptable, **kwargs) -> None: + super().__init__(data, **kwargs) + self.annotation_source = kwargs.get("annotation_source", "coco") + self.annotation_mode = kwargs.get("annotation_mode", "auto") + + def getitem(self, subscript): + x = self.data[subscript] + try: + for ic in getattr(self, "_add_bounding_box_index", []): + src_properties = x[0][ic]["AddBoundingBox"].setdefault("properties", {}) + if self.annotation_source: + src_properties["annotation_source"] = self.annotation_source + if self.annotation_mode: + src_properties["annotation_mode"] = self.annotation_mode + + for ic in getattr(self, "_add_polygon_index", []): + src_properties = x[0][ic]["AddPolygon"].setdefault("properties", {}) + if self.annotation_source: + src_properties["annotation_source"] = self.annotation_source + if self.annotation_mode: + src_properties["annotation_mode"] = self.annotation_mode + except Exception as e: + logger.exception(e.with_traceback(None), stack_info=True) + + return x diff --git a/aperturedb/transformers/common_properties.py b/aperturedb/transformers/common_properties.py index 5f5f47bd..1ce57742 100644 --- a/aperturedb/transformers/common_properties.py +++ b/aperturedb/transformers/common_properties.py @@ -29,9 +29,42 @@ def __init__(self, data: Subscriptable, **kwargs) -> None: def getitem(self, subscript): x = self.data[subscript] try: - # x is a transaction that has an add_image command and a blob + # Apply properties to AddImage commands for ic in self._add_image_index: - src_properties = x[0][ic]["AddImage"]["properties"] + src_properties = x[0][ic]["AddImage"].setdefault("properties", {}) + # Set the static properties, if explicitly set + if self.adb_data_source: + src_properties["adb_data_source"] = self.adb_data_source + if self.adb_timestamp: + 
src_properties["adb_timestamp"] = self.adb_timestamp + if self.adb_main_object: + src_properties["adb_main_object"] = self.adb_main_object + + # Apply properties to AddVideo commands + for ic in getattr(self, "_add_video_index", []): + src_properties = x[0][ic]["AddVideo"].setdefault("properties", {}) + # Set the static properties, if explicitly set + if self.adb_data_source: + src_properties["adb_data_source"] = self.adb_data_source + if self.adb_timestamp: + src_properties["adb_timestamp"] = self.adb_timestamp + if self.adb_main_object: + src_properties["adb_main_object"] = self.adb_main_object + + # Apply properties to AddBoundingBox commands + for ic in getattr(self, "_add_bounding_box_index", []): + src_properties = x[0][ic]["AddBoundingBox"].setdefault("properties", {}) + # Set the static properties, if explicitly set + if self.adb_data_source: + src_properties["adb_data_source"] = self.adb_data_source + if self.adb_timestamp: + src_properties["adb_timestamp"] = self.adb_timestamp + if self.adb_main_object: + src_properties["adb_main_object"] = self.adb_main_object + + # Apply properties to AddPolygon commands + for ic in getattr(self, "_add_polygon_index", []): + src_properties = x[0][ic]["AddPolygon"].setdefault("properties", {}) # Set the static properties, if explicitly set if self.adb_data_source: src_properties["adb_data_source"] = self.adb_data_source @@ -40,6 +73,6 @@ def getitem(self, subscript): if self.adb_main_object: src_properties["adb_main_object"] = self.adb_main_object except Exception as e: - logger.exception(e.with_traceback(), stack_info=True) + logger.exception(e.with_traceback(None), stack_info=True) return x diff --git a/aperturedb/transformers/image_properties.py b/aperturedb/transformers/image_properties.py index f680cec4..105698e1 100644 --- a/aperturedb/transformers/image_properties.py +++ b/aperturedb/transformers/image_properties.py @@ -26,9 +26,9 @@ def getitem(self, subscript): x = self.data[subscript] try: # x is a transaction 
that has an add_image command and a blob - for ic in self._add_image_index: - blob_index = self._add_image_index.index(ic) - src_properties = x[0][ic]["AddImage"]["properties"] + for ic in getattr(self, "_add_image_index", []): + blob_index = self._blob_index.index(ic) + src_properties = x[0][ic]["AddImage"].setdefault("properties", {}) # Compute the dynamic properties and apply them to metadata src_properties["adb_image_size"] = len(x[1][blob_index]) src_properties["adb_image_sha256"] = hashlib.sha256( diff --git a/aperturedb/transformers/transformer.py b/aperturedb/transformers/transformer.py index 5370364f..64082bc9 100644 --- a/aperturedb/transformers/transformer.py +++ b/aperturedb/transformers/transformer.py @@ -58,6 +58,9 @@ def __init__(self, data: Subscriptable, client=None, **kwargs) -> None: self._blobs = len(x[1]) self._blob_index = [] self._add_image_index = [] + self._add_video_index = [] + self._add_bounding_box_index = [] + self._add_polygon_index = [] self._client = client bc = 0 @@ -65,9 +68,16 @@ def __init__(self, data: Subscriptable, client=None, **kwargs) -> None: command = list(c.keys())[0] if command in ["AddImage", "AddDescriptor", "AddVideo", "AddBlob"]: self._blob_index.append(i) - if command == "AddImage": - self._add_image_index.append(i) bc += 1 + if command == "AddImage": + self._add_image_index.append(i) + elif command == "AddVideo": + self._add_video_index.append(i) + elif command == "AddBoundingBox": + self._add_bounding_box_index.append(i) + elif command == "AddPolygon": + self._add_polygon_index.append(i) + logger.info(f"Found {bc} blobs in the data") logger.info( f"Found {len(self._add_image_index)} AddImage commands in the data") diff --git a/aperturedb/transformers/video_properties.py b/aperturedb/transformers/video_properties.py new file mode 100644 index 00000000..cc69a0ec --- /dev/null +++ b/aperturedb/transformers/video_properties.py @@ -0,0 +1,43 @@ +from aperturedb.transformers.transformer import Transformer +from 
aperturedb.Subscriptable import Subscriptable + +import logging +import uuid +import hashlib + +logger = logging.getLogger(__name__) + + +class VideoProperties(Transformer): + """ + This computes some video properties and adds them to the metadata. + """ + + def __init__(self, data: Subscriptable, **kwargs) -> None: + super().__init__(data, **kwargs) + utils = self.get_utils() + + if "adb_data_source" not in utils.get_indexed_props("_Video"): + utils.create_entity_index("_Video", "adb_data_source") + + def getitem(self, subscript): + x = self.data[subscript] + try: + # x is a transaction that has an add_video command and a blob + for ic in getattr(self, "_add_video_index", []): + blob_index = self._blob_index.index(ic) + src_properties = x[0][ic]["AddVideo"].setdefault("properties", {}) + # Compute the dynamic properties and apply them to metadata + src_properties["adb_video_size"] = len(x[1][blob_index]) + src_properties["adb_video_sha256"] = hashlib.sha256( + x[1][blob_index]).hexdigest() + + src_properties["adb_video_id"] = str( + src_properties["id"] if "id" in src_properties else uuid.uuid4().hex) + + except Exception as e: + # Importantly, do not raise an exception here, since it will kill ingestion. + # Create a log message instead, for post-mortem analysis. 
+ logger.exception(e.with_traceback(None), stack_info=True) + + return x From 7c2d8ced76ed8213d2ac9b22142a20a02be82308 Mon Sep 17 00:00:00 2001 From: claw Date: Wed, 20 May 2026 11:49:24 +0000 Subject: [PATCH 02/35] Fix pre-commit failures and remove optional dependencies from transformers __init__.py --- aperturedb/transformers/__init__.py | 10 +--------- aperturedb/transformers/bounding_box_properties.py | 9 ++++++--- aperturedb/transformers/common_properties.py | 12 ++++++++---- aperturedb/transformers/image_properties.py | 3 ++- aperturedb/transformers/video_properties.py | 3 ++- 5 files changed, 19 insertions(+), 18 deletions(-) diff --git a/aperturedb/transformers/__init__.py b/aperturedb/transformers/__init__.py index fe2d5727..a9d1884f 100644 --- a/aperturedb/transformers/__init__.py +++ b/aperturedb/transformers/__init__.py @@ -3,10 +3,6 @@ from .image_properties import ImageProperties from .video_properties import VideoProperties from .bounding_box_properties import BoundingBoxProperties -from .facenet_pytorch_embeddings import FacenetPyTorchEmbeddings -from .clip_pytorch_embeddings import CLIPPyTorchEmbeddings -from .facenet import Facenet -from .clip import CLIP __all__ = [ "Transformer", @@ -14,8 +10,4 @@ "ImageProperties", "VideoProperties", "BoundingBoxProperties", - "FacenetPyTorchEmbeddings", - "CLIPPyTorchEmbeddings", - "Facenet", - "CLIP", -] \ No newline at end of file +] diff --git a/aperturedb/transformers/bounding_box_properties.py b/aperturedb/transformers/bounding_box_properties.py index 376cb7a4..eb659fbd 100644 --- a/aperturedb/transformers/bounding_box_properties.py +++ b/aperturedb/transformers/bounding_box_properties.py @@ -4,6 +4,7 @@ logger = logging.getLogger(__name__) + class BoundingBoxProperties(Transformer): """ This computes some bounding box properties and adds them to the metadata. 
@@ -18,14 +19,16 @@ def getitem(self, subscript): x = self.data[subscript] try: for ic in getattr(self, "_add_bounding_box_index", []): - src_properties = x[0][ic]["AddBoundingBox"].setdefault("properties", {}) + src_properties = x[0][ic]["AddBoundingBox"].setdefault( + "properties", {}) if self.annotation_source: src_properties["annotation_source"] = self.annotation_source if self.annotation_mode: src_properties["annotation_mode"] = self.annotation_mode - + for ic in getattr(self, "_add_polygon_index", []): - src_properties = x[0][ic]["AddPolygon"].setdefault("properties", {}) + src_properties = x[0][ic]["AddPolygon"].setdefault( + "properties", {}) if self.annotation_source: src_properties["annotation_source"] = self.annotation_source if self.annotation_mode: diff --git a/aperturedb/transformers/common_properties.py b/aperturedb/transformers/common_properties.py index 1ce57742..158f4ee3 100644 --- a/aperturedb/transformers/common_properties.py +++ b/aperturedb/transformers/common_properties.py @@ -31,7 +31,8 @@ def getitem(self, subscript): try: # Apply properties to AddImage commands for ic in self._add_image_index: - src_properties = x[0][ic]["AddImage"].setdefault("properties", {}) + src_properties = x[0][ic]["AddImage"].setdefault( + "properties", {}) # Set the static properties, if explicitly set if self.adb_data_source: src_properties["adb_data_source"] = self.adb_data_source @@ -42,7 +43,8 @@ def getitem(self, subscript): # Apply properties to AddVideo commands for ic in getattr(self, "_add_video_index", []): - src_properties = x[0][ic]["AddVideo"].setdefault("properties", {}) + src_properties = x[0][ic]["AddVideo"].setdefault( + "properties", {}) # Set the static properties, if explicitly set if self.adb_data_source: src_properties["adb_data_source"] = self.adb_data_source @@ -53,7 +55,8 @@ def getitem(self, subscript): # Apply properties to AddBoundingBox commands for ic in getattr(self, "_add_bounding_box_index", []): - src_properties = 
x[0][ic]["AddBoundingBox"].setdefault("properties", {}) + src_properties = x[0][ic]["AddBoundingBox"].setdefault( + "properties", {}) # Set the static properties, if explicitly set if self.adb_data_source: src_properties["adb_data_source"] = self.adb_data_source @@ -64,7 +67,8 @@ def getitem(self, subscript): # Apply properties to AddPolygon commands for ic in getattr(self, "_add_polygon_index", []): - src_properties = x[0][ic]["AddPolygon"].setdefault("properties", {}) + src_properties = x[0][ic]["AddPolygon"].setdefault( + "properties", {}) # Set the static properties, if explicitly set if self.adb_data_source: src_properties["adb_data_source"] = self.adb_data_source diff --git a/aperturedb/transformers/image_properties.py b/aperturedb/transformers/image_properties.py index 105698e1..7da1dd0f 100644 --- a/aperturedb/transformers/image_properties.py +++ b/aperturedb/transformers/image_properties.py @@ -28,7 +28,8 @@ def getitem(self, subscript): # x is a transaction that has an add_image command and a blob for ic in getattr(self, "_add_image_index", []): blob_index = self._blob_index.index(ic) - src_properties = x[0][ic]["AddImage"].setdefault("properties", {}) + src_properties = x[0][ic]["AddImage"].setdefault( + "properties", {}) # Compute the dynamic properties and apply them to metadata src_properties["adb_image_size"] = len(x[1][blob_index]) src_properties["adb_image_sha256"] = hashlib.sha256( diff --git a/aperturedb/transformers/video_properties.py b/aperturedb/transformers/video_properties.py index cc69a0ec..102d796e 100644 --- a/aperturedb/transformers/video_properties.py +++ b/aperturedb/transformers/video_properties.py @@ -26,7 +26,8 @@ def getitem(self, subscript): # x is a transaction that has an add_video command and a blob for ic in getattr(self, "_add_video_index", []): blob_index = self._blob_index.index(ic) - src_properties = x[0][ic]["AddVideo"].setdefault("properties", {}) + src_properties = x[0][ic]["AddVideo"].setdefault( + "properties", {}) # 
Compute the dynamic properties and apply them to metadata src_properties["adb_video_size"] = len(x[1][blob_index]) src_properties["adb_video_sha256"] = hashlib.sha256( From 627805876dea6311ad58499c94bf6c9708d608db Mon Sep 17 00:00:00 2001 From: ad-claw000 Date: Thu, 21 May 2026 01:53:47 +0000 Subject: [PATCH 03/35] Fix: Address transformer PR review comments --- .../transformers/bounding_box_properties.py | 2 +- aperturedb/transformers/common_properties.py | 63 +++++-------------- aperturedb/transformers/image_properties.py | 4 +- aperturedb/transformers/video_properties.py | 4 +- 4 files changed, 24 insertions(+), 49 deletions(-) diff --git a/aperturedb/transformers/bounding_box_properties.py b/aperturedb/transformers/bounding_box_properties.py index eb659fbd..29b77427 100644 --- a/aperturedb/transformers/bounding_box_properties.py +++ b/aperturedb/transformers/bounding_box_properties.py @@ -7,7 +7,7 @@ class BoundingBoxProperties(Transformer): """ - This computes some bounding box properties and adds them to the metadata. + This computes bounding box and polygon properties and adds them to the metadata. 
""" def __init__(self, data: Subscriptable, **kwargs) -> None: diff --git a/aperturedb/transformers/common_properties.py b/aperturedb/transformers/common_properties.py index 158f4ee3..ca5c514b 100644 --- a/aperturedb/transformers/common_properties.py +++ b/aperturedb/transformers/common_properties.py @@ -29,53 +29,24 @@ def __init__(self, data: Subscriptable, **kwargs) -> None: def getitem(self, subscript): x = self.data[subscript] try: - # Apply properties to AddImage commands - for ic in self._add_image_index: - src_properties = x[0][ic]["AddImage"].setdefault( - "properties", {}) - # Set the static properties, if explicitly set - if self.adb_data_source: - src_properties["adb_data_source"] = self.adb_data_source - if self.adb_timestamp: - src_properties["adb_timestamp"] = self.adb_timestamp - if self.adb_main_object: - src_properties["adb_main_object"] = self.adb_main_object + commands = [ + ("AddImage", getattr(self, "_add_image_index", [])), + ("AddVideo", getattr(self, "_add_video_index", [])), + ("AddBoundingBox", getattr(self, "_add_bounding_box_index", [])), + ("AddPolygon", getattr(self, "_add_polygon_index", [])), + ] + + for cmd_name, indices in commands: + for ic in indices: + src_properties = x[0][ic][cmd_name].setdefault( + "properties", {}) + if self.adb_data_source: + src_properties["adb_data_source"] = self.adb_data_source + if self.adb_timestamp: + src_properties["adb_timestamp"] = self.adb_timestamp + if self.adb_main_object: + src_properties["adb_main_object"] = self.adb_main_object - # Apply properties to AddVideo commands - for ic in getattr(self, "_add_video_index", []): - src_properties = x[0][ic]["AddVideo"].setdefault( - "properties", {}) - # Set the static properties, if explicitly set - if self.adb_data_source: - src_properties["adb_data_source"] = self.adb_data_source - if self.adb_timestamp: - src_properties["adb_timestamp"] = self.adb_timestamp - if self.adb_main_object: - src_properties["adb_main_object"] = self.adb_main_object - - 
# Apply properties to AddBoundingBox commands - for ic in getattr(self, "_add_bounding_box_index", []): - src_properties = x[0][ic]["AddBoundingBox"].setdefault( - "properties", {}) - # Set the static properties, if explicitly set - if self.adb_data_source: - src_properties["adb_data_source"] = self.adb_data_source - if self.adb_timestamp: - src_properties["adb_timestamp"] = self.adb_timestamp - if self.adb_main_object: - src_properties["adb_main_object"] = self.adb_main_object - - # Apply properties to AddPolygon commands - for ic in getattr(self, "_add_polygon_index", []): - src_properties = x[0][ic]["AddPolygon"].setdefault( - "properties", {}) - # Set the static properties, if explicitly set - if self.adb_data_source: - src_properties["adb_data_source"] = self.adb_data_source - if self.adb_timestamp: - src_properties["adb_timestamp"] = self.adb_timestamp - if self.adb_main_object: - src_properties["adb_main_object"] = self.adb_main_object except Exception as e: logger.exception(e.with_traceback(None), stack_info=True) diff --git a/aperturedb/transformers/image_properties.py b/aperturedb/transformers/image_properties.py index 7da1dd0f..a2ace941 100644 --- a/aperturedb/transformers/image_properties.py +++ b/aperturedb/transformers/image_properties.py @@ -22,12 +22,14 @@ def __init__(self, data: Subscriptable, **kwargs) -> None: if "adb_data_source" not in utils.get_indexed_props("_Image"): utils.create_entity_index("_Image", "adb_data_source") + self._blob_index_map = {ic: i for i, ic in enumerate(self._blob_index)} + def getitem(self, subscript): x = self.data[subscript] try: # x is a transaction that has an add_image command and a blob for ic in getattr(self, "_add_image_index", []): - blob_index = self._blob_index.index(ic) + blob_index = self._blob_index_map[ic] src_properties = x[0][ic]["AddImage"].setdefault( "properties", {}) # Compute the dynamic properties and apply them to metadata diff --git a/aperturedb/transformers/video_properties.py 
b/aperturedb/transformers/video_properties.py index 102d796e..9547e423 100644 --- a/aperturedb/transformers/video_properties.py +++ b/aperturedb/transformers/video_properties.py @@ -20,12 +20,14 @@ def __init__(self, data: Subscriptable, **kwargs) -> None: if "adb_data_source" not in utils.get_indexed_props("_Video"): utils.create_entity_index("_Video", "adb_data_source") + self._blob_index_map = {ic: i for i, ic in enumerate(self._blob_index)} + def getitem(self, subscript): x = self.data[subscript] try: # x is a transaction that has an add_video command and a blob for ic in getattr(self, "_add_video_index", []): - blob_index = self._blob_index.index(ic) + blob_index = self._blob_index_map[ic] src_properties = x[0][ic]["AddVideo"].setdefault( "properties", {}) # Compute the dynamic properties and apply them to metadata From d0d7eb79709a866df4fc883e9ce5d94ece9967e8 Mon Sep 17 00:00:00 2001 From: claw Date: Thu, 21 May 2026 02:51:50 +0000 Subject: [PATCH 04/35] fix(transformers): evaluate annotation counts dynamically per transaction Removes cached add_bounding_box_index and add_polygon_index from the first transaction in Transformer, as these counts frequently vary per-item (e.g. COCO bounding boxes). CommonProperties and BoundingBoxProperties now iterate over the current transaction's commands directly, avoiding IndexError when subsequent items have fewer annotations, and ensuring proper property application when they have more. Added a test case to explicitly check behavior with variable annotation counts.
--- .../transformers/bounding_box_properties.py | 23 +++---- aperturedb/transformers/common_properties.py | 13 +++- aperturedb/transformers/transformer.py | 6 -- test/test_Transformers.py | 68 +++++++++++++++++++ 4 files changed, 87 insertions(+), 23 deletions(-) create mode 100644 test/test_Transformers.py diff --git a/aperturedb/transformers/bounding_box_properties.py b/aperturedb/transformers/bounding_box_properties.py index 29b77427..a3d560af 100644 --- a/aperturedb/transformers/bounding_box_properties.py +++ b/aperturedb/transformers/bounding_box_properties.py @@ -18,21 +18,14 @@ def __init__(self, data: Subscriptable, **kwargs) -> None: def getitem(self, subscript): x = self.data[subscript] try: - for ic in getattr(self, "_add_bounding_box_index", []): - src_properties = x[0][ic]["AddBoundingBox"].setdefault( - "properties", {}) - if self.annotation_source: - src_properties["annotation_source"] = self.annotation_source - if self.annotation_mode: - src_properties["annotation_mode"] = self.annotation_mode - - for ic in getattr(self, "_add_polygon_index", []): - src_properties = x[0][ic]["AddPolygon"].setdefault( - "properties", {}) - if self.annotation_source: - src_properties["annotation_source"] = self.annotation_source - if self.annotation_mode: - src_properties["annotation_mode"] = self.annotation_mode + for cmd_dict in x[0]: + cmd_name = list(cmd_dict.keys())[0] + if cmd_name in ["AddBoundingBox", "AddPolygon"]: + src_properties = cmd_dict[cmd_name].setdefault("properties", {}) + if self.annotation_source: + src_properties["annotation_source"] = self.annotation_source + if self.annotation_mode: + src_properties["annotation_mode"] = self.annotation_mode except Exception as e: logger.exception(e.with_traceback(None), stack_info=True) diff --git a/aperturedb/transformers/common_properties.py b/aperturedb/transformers/common_properties.py index ca5c514b..a9a629ad 100644 --- a/aperturedb/transformers/common_properties.py +++ 
b/aperturedb/transformers/common_properties.py @@ -32,8 +32,6 @@ def getitem(self, subscript): commands = [ ("AddImage", getattr(self, "_add_image_index", [])), ("AddVideo", getattr(self, "_add_video_index", [])), - ("AddBoundingBox", getattr(self, "_add_bounding_box_index", [])), - ("AddPolygon", getattr(self, "_add_polygon_index", [])), ] for cmd_name, indices in commands: @@ -47,6 +45,17 @@ def getitem(self, subscript): if self.adb_main_object: src_properties["adb_main_object"] = self.adb_main_object + for cmd_dict in x[0]: + cmd_name = list(cmd_dict.keys())[0] + if cmd_name in ["AddBoundingBox", "AddPolygon"]: + src_properties = cmd_dict[cmd_name].setdefault("properties", {}) + if self.adb_data_source: + src_properties["adb_data_source"] = self.adb_data_source + if self.adb_timestamp: + src_properties["adb_timestamp"] = self.adb_timestamp + if self.adb_main_object: + src_properties["adb_main_object"] = self.adb_main_object + except Exception as e: logger.exception(e.with_traceback(None), stack_info=True) diff --git a/aperturedb/transformers/transformer.py b/aperturedb/transformers/transformer.py index 64082bc9..d64536e3 100644 --- a/aperturedb/transformers/transformer.py +++ b/aperturedb/transformers/transformer.py @@ -59,8 +59,6 @@ def __init__(self, data: Subscriptable, client=None, **kwargs) -> None: self._blob_index = [] self._add_image_index = [] self._add_video_index = [] - self._add_bounding_box_index = [] - self._add_polygon_index = [] self._client = client bc = 0 @@ -73,10 +71,6 @@ def __init__(self, data: Subscriptable, client=None, **kwargs) -> None: self._add_image_index.append(i) elif command == "AddVideo": self._add_video_index.append(i) - elif command == "AddBoundingBox": - self._add_bounding_box_index.append(i) - elif command == "AddPolygon": - self._add_polygon_index.append(i) logger.info(f"Found {bc} blobs in the data") logger.info( diff --git a/test/test_Transformers.py b/test/test_Transformers.py new file mode 100644 index 00000000..6596467a 
--- /dev/null +++ b/test/test_Transformers.py @@ -0,0 +1,68 @@ +import pytest +from aperturedb.transformers.common_properties import CommonProperties +from aperturedb.transformers.bounding_box_properties import BoundingBoxProperties + +class DummyData: + def __init__(self, data): + self._data = data + def __getitem__(self, i): + return self._data[i] + def __len__(self): + return len(self._data) + +def test_variable_annotation_counts(): + # Item 0: 1 BBox + # Item 1: 0 BBoxes + # Item 2: 2 BBoxes, 1 Polygon + # Item 3: 0 BBoxes, 2 Polygons + + data = [ + # Item 0 + ([ + {"AddImage": {}}, + {"AddBoundingBox": {}} + ], []), + # Item 1 + ([ + {"AddImage": {}} + ], []), + # Item 2 + ([ + {"AddImage": {}}, + {"AddBoundingBox": {}}, + {"AddBoundingBox": {}}, + {"AddPolygon": {}} + ], []), + # Item 3 + ([ + {"AddImage": {}}, + {"AddPolygon": {}}, + {"AddPolygon": {}} + ], []) + ] + + dummy_data = DummyData(data) + + # Test CommonProperties + cp = CommonProperties(dummy_data, adb_data_source="test_source") + + # Process all items + for i in range(len(data)): + res = cp[i] + for cmd in res[0]: + cmd_name = list(cmd.keys())[0] + if cmd_name in ["AddImage", "AddBoundingBox", "AddPolygon"]: + assert cmd[cmd_name]["properties"]["adb_data_source"] == "test_source" + + # Test BoundingBoxProperties + bbp = BoundingBoxProperties(dummy_data, annotation_source="test_anno", annotation_mode="auto") + for i in range(len(data)): + res = bbp[i] + for cmd in res[0]: + cmd_name = list(cmd.keys())[0] + if cmd_name in ["AddBoundingBox", "AddPolygon"]: + assert cmd[cmd_name]["properties"]["annotation_source"] == "test_anno" + assert cmd[cmd_name]["properties"]["annotation_mode"] == "auto" + elif cmd_name == "AddImage": + assert "properties" not in cmd[cmd_name] or "annotation_source" not in cmd[cmd_name]["properties"] + From 6e8010b33f0ddd0ef855a1da5951e2e49f72eac5 Mon Sep 17 00:00:00 2001 From: claw Date: Thu, 21 May 2026 04:18:29 +0000 Subject: [PATCH 05/35] Fix pre-commit issues --- 
.../transformers/bounding_box_properties.py | 3 ++- aperturedb/transformers/common_properties.py | 3 ++- test/test_Transformers.py | 16 ++++++++++------ 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/aperturedb/transformers/bounding_box_properties.py b/aperturedb/transformers/bounding_box_properties.py index a3d560af..201024a5 100644 --- a/aperturedb/transformers/bounding_box_properties.py +++ b/aperturedb/transformers/bounding_box_properties.py @@ -21,7 +21,8 @@ def getitem(self, subscript): for cmd_dict in x[0]: cmd_name = list(cmd_dict.keys())[0] if cmd_name in ["AddBoundingBox", "AddPolygon"]: - src_properties = cmd_dict[cmd_name].setdefault("properties", {}) + src_properties = cmd_dict[cmd_name].setdefault( + "properties", {}) if self.annotation_source: src_properties["annotation_source"] = self.annotation_source if self.annotation_mode: diff --git a/aperturedb/transformers/common_properties.py b/aperturedb/transformers/common_properties.py index a9a629ad..aaae2a03 100644 --- a/aperturedb/transformers/common_properties.py +++ b/aperturedb/transformers/common_properties.py @@ -48,7 +48,8 @@ def getitem(self, subscript): for cmd_dict in x[0]: cmd_name = list(cmd_dict.keys())[0] if cmd_name in ["AddBoundingBox", "AddPolygon"]: - src_properties = cmd_dict[cmd_name].setdefault("properties", {}) + src_properties = cmd_dict[cmd_name].setdefault( + "properties", {}) if self.adb_data_source: src_properties["adb_data_source"] = self.adb_data_source if self.adb_timestamp: diff --git a/test/test_Transformers.py b/test/test_Transformers.py index 6596467a..80585e68 100644 --- a/test/test_Transformers.py +++ b/test/test_Transformers.py @@ -2,20 +2,24 @@ from aperturedb.transformers.common_properties import CommonProperties from aperturedb.transformers.bounding_box_properties import BoundingBoxProperties + class DummyData: def __init__(self, data): self._data = data + def __getitem__(self, i): return self._data[i] + def __len__(self): return len(self._data) + 
def test_variable_annotation_counts(): # Item 0: 1 BBox # Item 1: 0 BBoxes # Item 2: 2 BBoxes, 1 Polygon # Item 3: 0 BBoxes, 2 Polygons - + data = [ # Item 0 ([ @@ -40,12 +44,12 @@ def test_variable_annotation_counts(): {"AddPolygon": {}} ], []) ] - + dummy_data = DummyData(data) - + # Test CommonProperties cp = CommonProperties(dummy_data, adb_data_source="test_source") - + # Process all items for i in range(len(data)): res = cp[i] @@ -55,7 +59,8 @@ def test_variable_annotation_counts(): assert cmd[cmd_name]["properties"]["adb_data_source"] == "test_source" # Test BoundingBoxProperties - bbp = BoundingBoxProperties(dummy_data, annotation_source="test_anno", annotation_mode="auto") + bbp = BoundingBoxProperties( + dummy_data, annotation_source="test_anno", annotation_mode="auto") for i in range(len(data)): res = bbp[i] for cmd in res[0]: @@ -65,4 +70,3 @@ def test_variable_annotation_counts(): assert cmd[cmd_name]["properties"]["annotation_mode"] == "auto" elif cmd_name == "AddImage": assert "properties" not in cmd[cmd_name] or "annotation_source" not in cmd[cmd_name]["properties"] - From ab6958dcf5d6f49231f63165983b31ed7c260206 Mon Sep 17 00:00:00 2001 From: claw Date: Thu, 21 May 2026 05:49:55 +0000 Subject: [PATCH 06/35] fix(transformers): evaluate image and video counts dynamically per transaction --- aperturedb/transformers/image_properties.py | 36 ++++++++++++--------- aperturedb/transformers/video_properties.py | 28 +++++++++------- 2 files changed, 36 insertions(+), 28 deletions(-) diff --git a/aperturedb/transformers/image_properties.py b/aperturedb/transformers/image_properties.py index a2ace941..c0995e1e 100644 --- a/aperturedb/transformers/image_properties.py +++ b/aperturedb/transformers/image_properties.py @@ -27,22 +27,26 @@ def __init__(self, data: Subscriptable, **kwargs) -> None: def getitem(self, subscript): x = self.data[subscript] try: - # x is a transaction that has an add_image command and a blob - for ic in getattr(self, "_add_image_index", 
[]): - blob_index = self._blob_index_map[ic] - src_properties = x[0][ic]["AddImage"].setdefault( - "properties", {}) - # Compute the dynamic properties and apply them to metadata - src_properties["adb_image_size"] = len(x[1][blob_index]) - src_properties["adb_image_sha256"] = hashlib.sha256( - x[1][blob_index]).hexdigest() - - # Compute the image dimensions. - pil_image = Image.open(io.BytesIO(x[1][blob_index])) - src_properties["adb_image_width"] = pil_image.width - src_properties["adb_image_height"] = pil_image.height - src_properties["adb_image_id"] = str( - src_properties["id"] if "id" in src_properties else uuid.uuid4().hex) + blob_index = 0 + for cmd_dict in x[0]: + cmd_name = list(cmd_dict.keys())[0] + if cmd_name == "AddImage": + src_properties = cmd_dict["AddImage"].setdefault( + "properties", {}) + # Compute the dynamic properties and apply them to metadata + src_properties["adb_image_size"] = len(x[1][blob_index]) + src_properties["adb_image_sha256"] = hashlib.sha256( + x[1][blob_index]).hexdigest() + + # Compute the image dimensions. + pil_image = Image.open(io.BytesIO(x[1][blob_index])) + src_properties["adb_image_width"] = pil_image.width + src_properties["adb_image_height"] = pil_image.height + src_properties["adb_image_id"] = str( + src_properties["id"] if "id" in src_properties else uuid.uuid4().hex) + + if cmd_name in ["AddImage", "AddDescriptor", "AddVideo", "AddBlob"]: + blob_index += 1 except Exception as e: # Importantly, do not raise an exception here, since it will kill ingestion. 
diff --git a/aperturedb/transformers/video_properties.py b/aperturedb/transformers/video_properties.py index 9547e423..ead608dc 100644 --- a/aperturedb/transformers/video_properties.py +++ b/aperturedb/transformers/video_properties.py @@ -25,18 +25,22 @@ def __init__(self, data: Subscriptable, **kwargs) -> None: def getitem(self, subscript): x = self.data[subscript] try: - # x is a transaction that has an add_video command and a blob - for ic in getattr(self, "_add_video_index", []): - blob_index = self._blob_index_map[ic] - src_properties = x[0][ic]["AddVideo"].setdefault( - "properties", {}) - # Compute the dynamic properties and apply them to metadata - src_properties["adb_video_size"] = len(x[1][blob_index]) - src_properties["adb_video_sha256"] = hashlib.sha256( - x[1][blob_index]).hexdigest() - - src_properties["adb_video_id"] = str( - src_properties["id"] if "id" in src_properties else uuid.uuid4().hex) + blob_index = 0 + for cmd_dict in x[0]: + cmd_name = list(cmd_dict.keys())[0] + if cmd_name == "AddVideo": + src_properties = cmd_dict["AddVideo"].setdefault( + "properties", {}) + # Compute the dynamic properties and apply them to metadata + src_properties["adb_video_size"] = len(x[1][blob_index]) + src_properties["adb_video_sha256"] = hashlib.sha256( + x[1][blob_index]).hexdigest() + + src_properties["adb_video_id"] = str( + src_properties["id"] if "id" in src_properties else uuid.uuid4().hex) + + if cmd_name in ["AddImage", "AddDescriptor", "AddVideo", "AddBlob"]: + blob_index += 1 except Exception as e: # Importantly, do not raise an exception here, since it will kill ingestion. 
From cfc08ad03ef6bfbf64d8b1438bdeb1ce24a51649 Mon Sep 17 00:00:00 2001 From: ad-claw000 Date: Thu, 21 May 2026 10:31:02 +0000 Subject: [PATCH 07/35] Fix: Address review comments on transformers - Remove unused pytest import in test_Transformers.py - Remove caching of AddImage and AddVideo indices in Transformer.__init__ to handle variable per-item counts - Update CommonProperties and ImageProperties to scan for AddVideo dynamically instead of relying on cached indices - Remove unused _blob_index_map from ImageProperties and VideoProperties - Add unit test coverage for VideoProperties to ensure dynamic properties apply correctly --- aperturedb/transformers/common_properties.py | 18 +---- aperturedb/transformers/image_properties.py | 2 - aperturedb/transformers/transformer.py | 8 -- aperturedb/transformers/video_properties.py | 2 - test/test_Transformers.py | 78 +++++++++++--------- 5 files changed, 45 insertions(+), 63 deletions(-) diff --git a/aperturedb/transformers/common_properties.py b/aperturedb/transformers/common_properties.py index aaae2a03..aed94a3d 100644 --- a/aperturedb/transformers/common_properties.py +++ b/aperturedb/transformers/common_properties.py @@ -29,25 +29,9 @@ def __init__(self, data: Subscriptable, **kwargs) -> None: def getitem(self, subscript): x = self.data[subscript] try: - commands = [ - ("AddImage", getattr(self, "_add_image_index", [])), - ("AddVideo", getattr(self, "_add_video_index", [])), - ] - - for cmd_name, indices in commands: - for ic in indices: - src_properties = x[0][ic][cmd_name].setdefault( - "properties", {}) - if self.adb_data_source: - src_properties["adb_data_source"] = self.adb_data_source - if self.adb_timestamp: - src_properties["adb_timestamp"] = self.adb_timestamp - if self.adb_main_object: - src_properties["adb_main_object"] = self.adb_main_object - for cmd_dict in x[0]: cmd_name = list(cmd_dict.keys())[0] - if cmd_name in ["AddBoundingBox", "AddPolygon"]: + if cmd_name in ["AddImage", "AddVideo", 
"AddBoundingBox", "AddPolygon"]: src_properties = cmd_dict[cmd_name].setdefault( "properties", {}) if self.adb_data_source: diff --git a/aperturedb/transformers/image_properties.py b/aperturedb/transformers/image_properties.py index c0995e1e..57a1042f 100644 --- a/aperturedb/transformers/image_properties.py +++ b/aperturedb/transformers/image_properties.py @@ -22,8 +22,6 @@ def __init__(self, data: Subscriptable, **kwargs) -> None: if "adb_data_source" not in utils.get_indexed_props("_Image"): utils.create_entity_index("_Image", "adb_data_source") - self._blob_index_map = {ic: i for i, ic in enumerate(self._blob_index)} - def getitem(self, subscript): x = self.data[subscript] try: diff --git a/aperturedb/transformers/transformer.py b/aperturedb/transformers/transformer.py index d64536e3..682ad9e3 100644 --- a/aperturedb/transformers/transformer.py +++ b/aperturedb/transformers/transformer.py @@ -57,8 +57,6 @@ def __init__(self, data: Subscriptable, client=None, **kwargs) -> None: self._queries = len(x[0]) self._blobs = len(x[1]) self._blob_index = [] - self._add_image_index = [] - self._add_video_index = [] self._client = client bc = 0 @@ -67,14 +65,8 @@ def __init__(self, data: Subscriptable, client=None, **kwargs) -> None: if command in ["AddImage", "AddDescriptor", "AddVideo", "AddBlob"]: self._blob_index.append(i) bc += 1 - if command == "AddImage": - self._add_image_index.append(i) - elif command == "AddVideo": - self._add_video_index.append(i) logger.info(f"Found {bc} blobs in the data") - logger.info( - f"Found {len(self._add_image_index)} AddImage commands in the data") self.ncalls = 0 self.cumulative_time = 0 diff --git a/aperturedb/transformers/video_properties.py b/aperturedb/transformers/video_properties.py index ead608dc..655a954b 100644 --- a/aperturedb/transformers/video_properties.py +++ b/aperturedb/transformers/video_properties.py @@ -20,8 +20,6 @@ def __init__(self, data: Subscriptable, **kwargs) -> None: if "adb_data_source" not in 
utils.get_indexed_props("_Video"): utils.create_entity_index("_Video", "adb_data_source") - self._blob_index_map = {ic: i for i, ic in enumerate(self._blob_index)} - def getitem(self, subscript): x = self.data[subscript] try: diff --git a/test/test_Transformers.py b/test/test_Transformers.py index 80585e68..9606abae 100644 --- a/test/test_Transformers.py +++ b/test/test_Transformers.py @@ -1,7 +1,8 @@ -import pytest +from unittest.mock import patch from aperturedb.transformers.common_properties import CommonProperties from aperturedb.transformers.bounding_box_properties import BoundingBoxProperties - +from aperturedb.transformers.video_properties import VideoProperties +import hashlib class DummyData: def __init__(self, data): @@ -15,42 +16,15 @@ def __len__(self): def test_variable_annotation_counts(): - # Item 0: 1 BBox - # Item 1: 0 BBoxes - # Item 2: 2 BBoxes, 1 Polygon - # Item 3: 0 BBoxes, 2 Polygons - data = [ - # Item 0 - ([ - {"AddImage": {}}, - {"AddBoundingBox": {}} - ], []), - # Item 1 - ([ - {"AddImage": {}} - ], []), - # Item 2 - ([ - {"AddImage": {}}, - {"AddBoundingBox": {}}, - {"AddBoundingBox": {}}, - {"AddPolygon": {}} - ], []), - # Item 3 - ([ - {"AddImage": {}}, - {"AddPolygon": {}}, - {"AddPolygon": {}} - ], []) + ([{"AddImage": {}}, {"AddBoundingBox": {}}], []), + ([{"AddImage": {}}], []), + ([{"AddImage": {}}, {"AddBoundingBox": {}}, {"AddBoundingBox": {}}, {"AddPolygon": {}}], []), + ([{"AddImage": {}}, {"AddPolygon": {}}, {"AddPolygon": {}}], []) ] - dummy_data = DummyData(data) - # Test CommonProperties cp = CommonProperties(dummy_data, adb_data_source="test_source") - - # Process all items for i in range(len(data)): res = cp[i] for cmd in res[0]: @@ -58,7 +32,6 @@ def test_variable_annotation_counts(): if cmd_name in ["AddImage", "AddBoundingBox", "AddPolygon"]: assert cmd[cmd_name]["properties"]["adb_data_source"] == "test_source" - # Test BoundingBoxProperties bbp = BoundingBoxProperties( dummy_data, annotation_source="test_anno", 
annotation_mode="auto") for i in range(len(data)): @@ -70,3 +43,40 @@ def test_variable_annotation_counts(): assert cmd[cmd_name]["properties"]["annotation_mode"] == "auto" elif cmd_name == "AddImage": assert "properties" not in cmd[cmd_name] or "annotation_source" not in cmd[cmd_name]["properties"] + + +@patch('aperturedb.transformers.transformer.Transformer.get_utils') +def test_video_properties(mock_get_utils): + mock_utils = mock_get_utils.return_value + mock_utils.get_indexed_props.return_value = [] + + dummy_video_data = b"fake_video_blob_content" + data = [ + ([ + {"AddVideo": {}}, + {"AddBoundingBox": {}} + ], [dummy_video_data]), + ([ + {"AddBoundingBox": {}} + ], []), + ([ + {"AddImage": {}}, + {"AddVideo": {}} + ], [b"image_blob", dummy_video_data]), + ] + + dummy_data = DummyData(data) + vp = VideoProperties(dummy_data) + + for i in range(len(data)): + res = vp[i] + blob_index = 0 + for cmd in res[0]: + cmd_name = list(cmd.keys())[0] + if cmd_name == "AddVideo": + props = cmd["AddVideo"]["properties"] + assert props["adb_video_size"] == len(dummy_video_data) + assert props["adb_video_sha256"] == hashlib.sha256(dummy_video_data).hexdigest() + assert "adb_video_id" in props + if cmd_name in ["AddImage", "AddVideo", "AddBlob", "AddDescriptor"]: + blob_index += 1 From 5c3f44294044894bcfad9d686d0a9fe870b90dd5 Mon Sep 17 00:00:00 2001 From: ad-claw000 Date: Thu, 21 May 2026 10:41:55 +0000 Subject: [PATCH 08/35] fix(transformers): remove unused _add_image_index and compute dynamically in remaining embeddings --- .../transformers/clip_pytorch_embeddings.py | 67 +++++++++++------- .../facenet_pytorch_embeddings.py | 68 ++++++++++++------- 2 files changed, 86 insertions(+), 49 deletions(-) diff --git a/aperturedb/transformers/clip_pytorch_embeddings.py b/aperturedb/transformers/clip_pytorch_embeddings.py index 894de19f..6fd48f78 100644 --- a/aperturedb/transformers/clip_pytorch_embeddings.py +++ b/aperturedb/transformers/clip_pytorch_embeddings.py @@ -21,8 +21,15 
@@ def __init__(self, data: Subscriptable, **kwargs) -> None: super().__init__(data, **kwargs) # Let's sample some data to figure out the descriptorset we need. - if len(self._add_image_index) > 0: - sample = generate_embedding(self.data[0][1][0]) + sample_blob = None + for i, c in enumerate(self.data[0][0]): + if list(c.keys())[0] == "AddImage": + blob_idx = self._blob_index.index(i) + sample_blob = self.data[0][1][blob_idx] + break + + if sample_blob is not None: + sample = generate_embedding(sample_blob) utils = self.get_utils() utils.add_descriptorset( self.search_set_name, dim=len(sample) // 4, metric=["CS"]) @@ -30,27 +37,39 @@ def __init__(self, data: Subscriptable, **kwargs) -> None: def getitem(self, subscript): x = self.data[subscript] - for ic in self._add_image_index: - serialized = generate_embedding(x[1][ic]) - # If the image already has an image_sha256, we use it. - image_sha256 = x[0][ic]["AddImage"].get("properties", {}).get( - "adb_image_sha256", None) - if not image_sha256: - image_sha256 = hashlib.sha256(x[1][ic]).hexdigest() - x[1].append(serialized) - x[0].append( - { - "AddDescriptor": { - "set": self.search_set_name, - "properties": { - "image_sha256": image_sha256, - }, - "if_not_found": { - "image_sha256": ["==", image_sha256], - }, - "connect": { - "ref": x[0][ic]["AddImage"]["_ref"] + blob_index = 0 + new_descriptors = [] + new_blobs = [] + + for i, cmd_dict in enumerate(x[0]): + cmd_name = list(cmd_dict.keys())[0] + if cmd_name == "AddImage": + blob = x[1][blob_index] + serialized = generate_embedding(blob) + # If the image already has an image_sha256, we use it. 
+ image_sha256 = cmd_dict["AddImage"].get("properties", {}).get( + "adb_image_sha256", None) + if not image_sha256: + image_sha256 = hashlib.sha256(blob).hexdigest() + new_blobs.append(serialized) + new_descriptors.append( + { + "AddDescriptor": { + "set": self.search_set_name, + "properties": { + "image_sha256": image_sha256, + }, + "if_not_found": { + "image_sha256": ["==", image_sha256], + }, + "connect": { + "ref": cmd_dict["AddImage"]["_ref"] + } } - } - }) + }) + if cmd_name in ["AddImage", "AddDescriptor", "AddVideo", "AddBlob"]: + blob_index += 1 + + x[0].extend(new_descriptors) + x[1].extend(new_blobs) return x diff --git a/aperturedb/transformers/facenet_pytorch_embeddings.py b/aperturedb/transformers/facenet_pytorch_embeddings.py index 47fe15b8..415abc6f 100644 --- a/aperturedb/transformers/facenet_pytorch_embeddings.py +++ b/aperturedb/transformers/facenet_pytorch_embeddings.py @@ -23,8 +23,15 @@ def __init__(self, data: Subscriptable, **kwargs) -> None: super().__init__(data, **kwargs) # Let's sample some data to figure out the descriptorset we need. - if len(self._add_image_index) > 0: - sample = self._get_embedding_from_blob(self.data[0][1][0]) + sample_blob = None + for i, c in enumerate(self.data[0][0]): + if list(c.keys())[0] == "AddImage": + blob_idx = self._blob_index.index(i) + sample_blob = self.data[0][1][blob_idx] + break + + if sample_blob is not None: + sample = self._get_embedding_from_blob(sample_blob) utils = self.get_utils() utils.add_descriptorset(self.search_set_name, dim=len(sample) // 4) @@ -39,29 +46,40 @@ def getitem(self, subscript): self.ncalls += 1 x = self.data[subscript] - for ic in self._add_image_index: - serialized = self._get_embedding_from_blob( - x[1][self._add_image_index.index(ic)]) - # If the image already has an image_sha256, we use it. 
- image_sha256 = x[0][ic]["AddImage"].get("properties", {}).get( - "adb_image_sha256", None) - if not image_sha256: - image_sha256 = hashlib.sha256(x[1][ic]).hexdigest() - x[1].append(serialized) - x[0].append( - { - "AddDescriptor": { - "set": self.search_set_name, - "properties": { - "image_sha256": image_sha256, - }, - "if_not_found": { - "image_sha256": ["==", image_sha256], - }, - "connect": { - "ref": x[0][ic]["AddImage"]["_ref"] + blob_index = 0 + new_descriptors = [] + new_blobs = [] + + for i, cmd_dict in enumerate(x[0]): + cmd_name = list(cmd_dict.keys())[0] + if cmd_name == "AddImage": + blob = x[1][blob_index] + serialized = self._get_embedding_from_blob(blob) + # If the image already has an image_sha256, we use it. + image_sha256 = cmd_dict["AddImage"].get("properties", {}).get( + "adb_image_sha256", None) + if not image_sha256: + image_sha256 = hashlib.sha256(blob).hexdigest() + new_blobs.append(serialized) + new_descriptors.append( + { + "AddDescriptor": { + "set": self.search_set_name, + "properties": { + "image_sha256": image_sha256, + }, + "if_not_found": { + "image_sha256": ["==", image_sha256], + }, + "connect": { + "ref": cmd_dict["AddImage"]["_ref"] + } } - } - }) + }) + if cmd_name in ["AddImage", "AddDescriptor", "AddVideo", "AddBlob"]: + blob_index += 1 + + x[0].extend(new_descriptors) + x[1].extend(new_blobs) self.cumulative_time += time.time() - start return x From 30721e5c402220d5ea2b3a7d454b6e415f8e37fe Mon Sep 17 00:00:00 2001 From: Claw Date: Thu, 21 May 2026 12:21:29 +0000 Subject: [PATCH 09/35] fix: apply autopep8 formatting --- test/test_Transformers.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/test/test_Transformers.py b/test/test_Transformers.py index 9606abae..9c8a5074 100644 --- a/test/test_Transformers.py +++ b/test/test_Transformers.py @@ -4,6 +4,7 @@ from aperturedb.transformers.video_properties import VideoProperties import hashlib + class DummyData: def __init__(self, data): self._data 
= data @@ -19,7 +20,8 @@ def test_variable_annotation_counts(): data = [ ([{"AddImage": {}}, {"AddBoundingBox": {}}], []), ([{"AddImage": {}}], []), - ([{"AddImage": {}}, {"AddBoundingBox": {}}, {"AddBoundingBox": {}}, {"AddPolygon": {}}], []), + ([{"AddImage": {}}, {"AddBoundingBox": {}}, { + "AddBoundingBox": {}}, {"AddPolygon": {}}], []), ([{"AddImage": {}}, {"AddPolygon": {}}, {"AddPolygon": {}}], []) ] dummy_data = DummyData(data) @@ -49,7 +51,7 @@ def test_variable_annotation_counts(): def test_video_properties(mock_get_utils): mock_utils = mock_get_utils.return_value mock_utils.get_indexed_props.return_value = [] - + dummy_video_data = b"fake_video_blob_content" data = [ ([ @@ -64,10 +66,10 @@ def test_video_properties(mock_get_utils): {"AddVideo": {}} ], [b"image_blob", dummy_video_data]), ] - + dummy_data = DummyData(data) vp = VideoProperties(dummy_data) - + for i in range(len(data)): res = vp[i] blob_index = 0 @@ -76,7 +78,8 @@ def test_video_properties(mock_get_utils): if cmd_name == "AddVideo": props = cmd["AddVideo"]["properties"] assert props["adb_video_size"] == len(dummy_video_data) - assert props["adb_video_sha256"] == hashlib.sha256(dummy_video_data).hexdigest() + assert props["adb_video_sha256"] == hashlib.sha256( + dummy_video_data).hexdigest() assert "adb_video_id" in props if cmd_name in ["AddImage", "AddVideo", "AddBlob", "AddDescriptor"]: blob_index += 1 From d28a4fd4eed51f0818cce495338de38530debc92 Mon Sep 17 00:00:00 2001 From: ad-claw000 Date: Thu, 21 May 2026 15:46:55 +0000 Subject: [PATCH 10/35] Fix remaining unused variable and unused index review comments --- aperturedb/transformers/clip_pytorch_embeddings.py | 2 +- aperturedb/transformers/facenet_pytorch_embeddings.py | 2 +- test/test_Transformers.py | 3 --- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/aperturedb/transformers/clip_pytorch_embeddings.py b/aperturedb/transformers/clip_pytorch_embeddings.py index 6fd48f78..075c9bdf 100644 --- 
a/aperturedb/transformers/clip_pytorch_embeddings.py +++ b/aperturedb/transformers/clip_pytorch_embeddings.py @@ -41,7 +41,7 @@ def getitem(self, subscript): new_descriptors = [] new_blobs = [] - for i, cmd_dict in enumerate(x[0]): + for cmd_dict in x[0]: cmd_name = list(cmd_dict.keys())[0] if cmd_name == "AddImage": blob = x[1][blob_index] diff --git a/aperturedb/transformers/facenet_pytorch_embeddings.py b/aperturedb/transformers/facenet_pytorch_embeddings.py index 415abc6f..254e3372 100644 --- a/aperturedb/transformers/facenet_pytorch_embeddings.py +++ b/aperturedb/transformers/facenet_pytorch_embeddings.py @@ -50,7 +50,7 @@ def getitem(self, subscript): new_descriptors = [] new_blobs = [] - for i, cmd_dict in enumerate(x[0]): + for cmd_dict in x[0]: cmd_name = list(cmd_dict.keys())[0] if cmd_name == "AddImage": blob = x[1][blob_index] diff --git a/test/test_Transformers.py b/test/test_Transformers.py index 9c8a5074..dbc4ebdf 100644 --- a/test/test_Transformers.py +++ b/test/test_Transformers.py @@ -72,7 +72,6 @@ def test_video_properties(mock_get_utils): for i in range(len(data)): res = vp[i] - blob_index = 0 for cmd in res[0]: cmd_name = list(cmd.keys())[0] if cmd_name == "AddVideo": @@ -81,5 +80,3 @@ def test_video_properties(mock_get_utils): assert props["adb_video_sha256"] == hashlib.sha256( dummy_video_data).hexdigest() assert "adb_video_id" in props - if cmd_name in ["AddImage", "AddVideo", "AddBlob", "AddDescriptor"]: - blob_index += 1 From 1e2f6a73e327a1202dab42f8fb7f9102fb18a763 Mon Sep 17 00:00:00 2001 From: claw Date: Sat, 23 May 2026 04:50:22 +0000 Subject: [PATCH 11/35] fix: lazily initialize descriptor sets on first AddImage in getitem --- .../transformers/clip_pytorch_embeddings.py | 8 +++++++ .../facenet_pytorch_embeddings.py | 21 +++++++------------ 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/aperturedb/transformers/clip_pytorch_embeddings.py b/aperturedb/transformers/clip_pytorch_embeddings.py index 075c9bdf..6ae254dd 
100644 --- a/aperturedb/transformers/clip_pytorch_embeddings.py +++ b/aperturedb/transformers/clip_pytorch_embeddings.py @@ -45,6 +45,14 @@ def getitem(self, subscript): cmd_name = list(cmd_dict.keys())[0] if cmd_name == "AddImage": blob = x[1][blob_index] + + if not getattr(self, "_descriptorset_initialized", False): + sample = generate_embedding(blob) + utils = self.get_utils() + utils.add_descriptorset( + self.search_set_name, dim=len(sample) // 4, metric=["CS"]) + self._descriptorset_initialized = True + serialized = generate_embedding(blob) # If the image already has an image_sha256, we use it. image_sha256 = cmd_dict["AddImage"].get("properties", {}).get( diff --git a/aperturedb/transformers/facenet_pytorch_embeddings.py b/aperturedb/transformers/facenet_pytorch_embeddings.py index 254e3372..44f54adb 100644 --- a/aperturedb/transformers/facenet_pytorch_embeddings.py +++ b/aperturedb/transformers/facenet_pytorch_embeddings.py @@ -21,19 +21,7 @@ def __init__(self, data: Subscriptable, **kwargs) -> None: self.search_set_name = kwargs.pop( "search_set_name", "facenet_pytorch_embeddings") super().__init__(data, **kwargs) - - # Let's sample some data to figure out the descriptorset we need. 
- sample_blob = None - for i, c in enumerate(self.data[0][0]): - if list(c.keys())[0] == "AddImage": - blob_idx = self._blob_index.index(i) - sample_blob = self.data[0][1][blob_idx] - break - - if sample_blob is not None: - sample = self._get_embedding_from_blob(sample_blob) - utils = self.get_utils() - utils.add_descriptorset(self.search_set_name, dim=len(sample) // 4) + self._descriptorset_initialized = False def _get_embedding_from_blob(self, image_blob: bytes): pil_image = Image.open(io.BytesIO(image_blob)) @@ -54,6 +42,13 @@ def getitem(self, subscript): cmd_name = list(cmd_dict.keys())[0] if cmd_name == "AddImage": blob = x[1][blob_index] + + if not getattr(self, "_descriptorset_initialized", False): + sample = self._get_embedding_from_blob(blob) + utils = self.get_utils() + utils.add_descriptorset(self.search_set_name, dim=len(sample) // 4) + self._descriptorset_initialized = True + serialized = self._get_embedding_from_blob(blob) # If the image already has an image_sha256, we use it. image_sha256 = cmd_dict["AddImage"].get("properties", {}).get( From cf75c406cdae28d715d02cb66892e4dbd7df3a65 Mon Sep 17 00:00:00 2001 From: claw Date: Sat, 23 May 2026 08:41:42 +0000 Subject: [PATCH 12/35] fix: lazily initialize descriptor sets on first AddImage in getitem for CLIP --- .../transformers/clip_pytorch_embeddings.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/aperturedb/transformers/clip_pytorch_embeddings.py b/aperturedb/transformers/clip_pytorch_embeddings.py index 6ae254dd..1771253b 100644 --- a/aperturedb/transformers/clip_pytorch_embeddings.py +++ b/aperturedb/transformers/clip_pytorch_embeddings.py @@ -19,20 +19,7 @@ def __init__(self, data: Subscriptable, **kwargs) -> None: self.search_set_name = kwargs.pop( "search_set_name", descriptor_set) super().__init__(data, **kwargs) - - # Let's sample some data to figure out the descriptorset we need. 
- sample_blob = None - for i, c in enumerate(self.data[0][0]): - if list(c.keys())[0] == "AddImage": - blob_idx = self._blob_index.index(i) - sample_blob = self.data[0][1][blob_idx] - break - - if sample_blob is not None: - sample = generate_embedding(sample_blob) - utils = self.get_utils() - utils.add_descriptorset( - self.search_set_name, dim=len(sample) // 4, metric=["CS"]) + self._descriptorset_initialized = False def getitem(self, subscript): x = self.data[subscript] From 8d37ab872985b4be0768c6c37fd1af09c0f0adb5 Mon Sep 17 00:00:00 2001 From: claw Date: Sat, 23 May 2026 16:36:47 +0000 Subject: [PATCH 13/35] fix(transformers): avoid duplicate embedding generation for the first item --- aperturedb/transformers/clip_pytorch_embeddings.py | 6 +++--- aperturedb/transformers/facenet_pytorch_embeddings.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/aperturedb/transformers/clip_pytorch_embeddings.py b/aperturedb/transformers/clip_pytorch_embeddings.py index 1771253b..67f085b5 100644 --- a/aperturedb/transformers/clip_pytorch_embeddings.py +++ b/aperturedb/transformers/clip_pytorch_embeddings.py @@ -33,14 +33,14 @@ def getitem(self, subscript): if cmd_name == "AddImage": blob = x[1][blob_index] + serialized = generate_embedding(blob) + if not getattr(self, "_descriptorset_initialized", False): - sample = generate_embedding(blob) utils = self.get_utils() utils.add_descriptorset( - self.search_set_name, dim=len(sample) // 4, metric=["CS"]) + self.search_set_name, dim=len(serialized) // 4, metric=["CS"]) self._descriptorset_initialized = True - serialized = generate_embedding(blob) # If the image already has an image_sha256, we use it. 
image_sha256 = cmd_dict["AddImage"].get("properties", {}).get( "adb_image_sha256", None) diff --git a/aperturedb/transformers/facenet_pytorch_embeddings.py b/aperturedb/transformers/facenet_pytorch_embeddings.py index 44f54adb..28a08481 100644 --- a/aperturedb/transformers/facenet_pytorch_embeddings.py +++ b/aperturedb/transformers/facenet_pytorch_embeddings.py @@ -43,13 +43,13 @@ def getitem(self, subscript): if cmd_name == "AddImage": blob = x[1][blob_index] + serialized = self._get_embedding_from_blob(blob) + if not getattr(self, "_descriptorset_initialized", False): - sample = self._get_embedding_from_blob(blob) utils = self.get_utils() - utils.add_descriptorset(self.search_set_name, dim=len(sample) // 4) + utils.add_descriptorset(self.search_set_name, dim=len(serialized) // 4) self._descriptorset_initialized = True - serialized = self._get_embedding_from_blob(blob) # If the image already has an image_sha256, we use it. image_sha256 = cmd_dict["AddImage"].get("properties", {}).get( "adb_image_sha256", None) From 02b8fa2469f72db3d1ab1ac87e41811ad9e6a818 Mon Sep 17 00:00:00 2001 From: claw Date: Sat, 23 May 2026 20:04:03 +0000 Subject: [PATCH 14/35] style: fix autopep8 formatting in facenet_pytorch_embeddings.py --- aperturedb/transformers/facenet_pytorch_embeddings.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/aperturedb/transformers/facenet_pytorch_embeddings.py b/aperturedb/transformers/facenet_pytorch_embeddings.py index 28a08481..74c9f452 100644 --- a/aperturedb/transformers/facenet_pytorch_embeddings.py +++ b/aperturedb/transformers/facenet_pytorch_embeddings.py @@ -47,7 +47,8 @@ def getitem(self, subscript): if not getattr(self, "_descriptorset_initialized", False): utils = self.get_utils() - utils.add_descriptorset(self.search_set_name, dim=len(serialized) // 4) + utils.add_descriptorset( + self.search_set_name, dim=len(serialized) // 4) self._descriptorset_initialized = True # If the image already has an image_sha256, we use it. 
From f68a0798cef239402af3f98f9901db0c23480e18 Mon Sep 17 00:00:00 2001 From: claw Date: Sun, 24 May 2026 01:37:22 +0000 Subject: [PATCH 15/35] fix(transformers): restore _add_image_index for backward compat and add Video test for CommonProperties --- aperturedb/transformers/transformer.py | 4 ++++ test/test_Transformers.py | 7 ++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/aperturedb/transformers/transformer.py b/aperturedb/transformers/transformer.py index 682ad9e3..6d76d9eb 100644 --- a/aperturedb/transformers/transformer.py +++ b/aperturedb/transformers/transformer.py @@ -57,6 +57,7 @@ def __init__(self, data: Subscriptable, client=None, **kwargs) -> None: self._queries = len(x[0]) self._blobs = len(x[1]) self._blob_index = [] + self._add_image_index = [] self._client = client bc = 0 @@ -65,6 +66,9 @@ def __init__(self, data: Subscriptable, client=None, **kwargs) -> None: if command in ["AddImage", "AddDescriptor", "AddVideo", "AddBlob"]: self._blob_index.append(i) bc += 1 + # Kept for backward compatibility + if command == "AddImage": + self._add_image_index.append(i) logger.info(f"Found {bc} blobs in the data") diff --git a/test/test_Transformers.py b/test/test_Transformers.py index dbc4ebdf..b1e0a032 100644 --- a/test/test_Transformers.py +++ b/test/test_Transformers.py @@ -22,7 +22,8 @@ def test_variable_annotation_counts(): ([{"AddImage": {}}], []), ([{"AddImage": {}}, {"AddBoundingBox": {}}, { "AddBoundingBox": {}}, {"AddPolygon": {}}], []), - ([{"AddImage": {}}, {"AddPolygon": {}}, {"AddPolygon": {}}], []) + ([{"AddImage": {}}, {"AddPolygon": {}}, {"AddPolygon": {}}], []), + ([{"AddVideo": {}}, {"AddBoundingBox": {}}], []) ] dummy_data = DummyData(data) @@ -31,7 +32,7 @@ def test_variable_annotation_counts(): res = cp[i] for cmd in res[0]: cmd_name = list(cmd.keys())[0] - if cmd_name in ["AddImage", "AddBoundingBox", "AddPolygon"]: + if cmd_name in ["AddImage", "AddBoundingBox", "AddPolygon", "AddVideo"]: assert 
cmd[cmd_name]["properties"]["adb_data_source"] == "test_source" bbp = BoundingBoxProperties( @@ -43,7 +44,7 @@ def test_variable_annotation_counts(): if cmd_name in ["AddBoundingBox", "AddPolygon"]: assert cmd[cmd_name]["properties"]["annotation_source"] == "test_anno" assert cmd[cmd_name]["properties"]["annotation_mode"] == "auto" - elif cmd_name == "AddImage": + elif cmd_name in ["AddImage", "AddVideo"]: assert "properties" not in cmd[cmd_name] or "annotation_source" not in cmd[cmd_name]["properties"] From f8457633d6382ff48c680ba3bc03ad3e2f01cfd8 Mon Sep 17 00:00:00 2001 From: claw Date: Sun, 24 May 2026 20:04:11 +0000 Subject: [PATCH 16/35] test: add coverage for ImageProperties, clip, and facenet transformers --- test/test_Transformers.py | 118 +++++++++++++++++++++++++++++++++++++- 1 file changed, 117 insertions(+), 1 deletion(-) diff --git a/test/test_Transformers.py b/test/test_Transformers.py index b1e0a032..c661443c 100644 --- a/test/test_Transformers.py +++ b/test/test_Transformers.py @@ -1,7 +1,10 @@ -from unittest.mock import patch +import sys +import pytest +from unittest.mock import patch, MagicMock from aperturedb.transformers.common_properties import CommonProperties from aperturedb.transformers.bounding_box_properties import BoundingBoxProperties from aperturedb.transformers.video_properties import VideoProperties +from aperturedb.transformers.image_properties import ImageProperties import hashlib @@ -81,3 +84,116 @@ def test_video_properties(mock_get_utils): assert props["adb_video_sha256"] == hashlib.sha256( dummy_video_data).hexdigest() assert "adb_video_id" in props + + +@patch('aperturedb.transformers.transformer.Transformer.get_utils') +@patch('aperturedb.transformers.image_properties.Image.open') +def test_image_properties(mock_image_open, mock_get_utils): + mock_utils = mock_get_utils.return_value + mock_utils.get_indexed_props.return_value = [] + + mock_pil = MagicMock() + mock_pil.width = 800 + mock_pil.height = 600 + 
mock_image_open.return_value = mock_pil + + dummy_image_data = b"fake_image_blob_content" + data = [ + ([ + {"AddImage": {"_ref": 1}}, + {"AddVideo": {}} + ], [dummy_image_data, b"video_blob"]), + ([ + {"AddBoundingBox": {}} + ], []), + ([ + {"AddVideo": {}}, + {"AddImage": {"_ref": 2}} + ], [b"video_blob", dummy_image_data]), + ] + + dummy_data = DummyData(data) + ip = ImageProperties(dummy_data) + + for i in range(len(data)): + res = ip[i] + for cmd in res[0]: + cmd_name = list(cmd.keys())[0] + if cmd_name == "AddImage": + props = cmd["AddImage"]["properties"] + assert props["adb_image_size"] == len(dummy_image_data) + assert props["adb_image_sha256"] == hashlib.sha256( + dummy_image_data).hexdigest() + assert props["adb_image_width"] == 800 + assert props["adb_image_height"] == 600 + assert "adb_image_id" in props + + +@patch('aperturedb.transformers.transformer.Transformer.get_utils') +def test_clip_pytorch_embeddings(mock_get_utils): + # Mock the internal generate_embedding dynamically + try: + from aperturedb.transformers.clip_pytorch_embeddings import CLIPPyTorchEmbeddings + except (ImportError, SystemExit): + pytest.skip("Missing deps for CLIP") + + with patch('aperturedb.transformers.clip_pytorch_embeddings.generate_embedding') as mock_generate_embedding: + mock_generate_embedding.return_value = [0.1, 0.2, 0.3, 0.4] + mock_utils = mock_get_utils.return_value + + dummy_image_data = b"fake_image_blob_content" + data = [ + ([ + {"AddImage": {"_ref": 1}}, + {"AddVideo": {}} + ], [dummy_image_data, b"video_blob"]) + ] + + dummy_data = DummyData(data) + clip = CLIPPyTorchEmbeddings(dummy_data) + + res = clip[0] + assert mock_utils.add_descriptorset.called + assert len(res[0]) == 3 # AddImage, AddVideo, AddDescriptor + + desc_cmd = [c for c in res[0] if "AddDescriptor" in c] + assert len(desc_cmd) == 1 + assert desc_cmd[0]["AddDescriptor"]["connect"]["ref"] == 1 + + # 2 blobs originally + 1 generated embedding blob + assert len(res[1]) == 3 + assert res[1][-1] == 
[0.1, 0.2, 0.3, 0.4] + + +@patch('aperturedb.transformers.transformer.Transformer.get_utils') +def test_facenet_pytorch_embeddings(mock_get_utils): + try: + from aperturedb.transformers.facenet_pytorch_embeddings import FacenetPyTorchEmbeddings + except (ImportError, SystemExit): + pytest.skip("Missing deps for Facenet") + + with patch('aperturedb.transformers.facenet_pytorch_embeddings.FacenetPyTorchEmbeddings._get_embedding_from_blob') as mock_get_embedding: + mock_get_embedding.return_value = [0.5, 0.6, 0.7, 0.8] + mock_utils = mock_get_utils.return_value + + dummy_image_data = b"fake_image_blob_content" + data = [ + ([ + {"AddVideo": {}}, + {"AddImage": {"_ref": 2}} + ], [b"video_blob", dummy_image_data]) + ] + + dummy_data = DummyData(data) + facenet = FacenetPyTorchEmbeddings(dummy_data) + + res = facenet[0] + assert mock_utils.add_descriptorset.called + assert len(res[0]) == 3 # AddVideo, AddImage, AddDescriptor + + desc_cmd = [c for c in res[0] if "AddDescriptor" in c] + assert len(desc_cmd) == 1 + assert desc_cmd[0]["AddDescriptor"]["connect"]["ref"] == 2 + + assert len(res[1]) == 3 + assert res[1][-1] == [0.5, 0.6, 0.7, 0.8] From f5b565ea2b70f5b6c06833ba9b87d555405f2d47 Mon Sep 17 00:00:00 2001 From: claw Date: Sun, 24 May 2026 21:57:09 +0000 Subject: [PATCH 17/35] test: remove unused sys import in test_Transformers.py --- test/test_Transformers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_Transformers.py b/test/test_Transformers.py index c661443c..62ae79d8 100644 --- a/test/test_Transformers.py +++ b/test/test_Transformers.py @@ -1,4 +1,3 @@ -import sys import pytest from unittest.mock import patch, MagicMock from aperturedb.transformers.common_properties import CommonProperties From b59526f6d0577eb0e7d12e57d0cd899b859c27e7 Mon Sep 17 00:00:00 2001 From: claw Date: Mon, 25 May 2026 02:19:29 +0000 Subject: [PATCH 18/35] fix: set _descriptorset_initialized only on success or if exists Only mark descriptorset as initialized when 
add_descriptorset succeeds or if the descriptorset is already present in the list of descriptorsets, to avoid silently swallowing initialization failures. --- aperturedb/transformers/clip_pytorch_embeddings.py | 5 +++-- aperturedb/transformers/facenet_pytorch_embeddings.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/aperturedb/transformers/clip_pytorch_embeddings.py b/aperturedb/transformers/clip_pytorch_embeddings.py index 67f085b5..330ca6d7 100644 --- a/aperturedb/transformers/clip_pytorch_embeddings.py +++ b/aperturedb/transformers/clip_pytorch_embeddings.py @@ -37,9 +37,10 @@ def getitem(self, subscript): if not getattr(self, "_descriptorset_initialized", False): utils = self.get_utils() - utils.add_descriptorset( + success = utils.add_descriptorset( self.search_set_name, dim=len(serialized) // 4, metric=["CS"]) - self._descriptorset_initialized = True + if success or self.search_set_name in utils.get_descriptorset_list(): + self._descriptorset_initialized = True # If the image already has an image_sha256, we use it. image_sha256 = cmd_dict["AddImage"].get("properties", {}).get( diff --git a/aperturedb/transformers/facenet_pytorch_embeddings.py b/aperturedb/transformers/facenet_pytorch_embeddings.py index 74c9f452..7e64f652 100644 --- a/aperturedb/transformers/facenet_pytorch_embeddings.py +++ b/aperturedb/transformers/facenet_pytorch_embeddings.py @@ -47,9 +47,10 @@ def getitem(self, subscript): if not getattr(self, "_descriptorset_initialized", False): utils = self.get_utils() - utils.add_descriptorset( + success = utils.add_descriptorset( self.search_set_name, dim=len(serialized) // 4) - self._descriptorset_initialized = True + if success or self.search_set_name in utils.get_descriptorset_list(): + self._descriptorset_initialized = True # If the image already has an image_sha256, we use it. 
image_sha256 = cmd_dict["AddImage"].get("properties", {}).get( From f20278305edcbb27f7adbd737bf899cc9b831fec Mon Sep 17 00:00:00 2001 From: claw Date: Mon, 25 May 2026 05:53:28 +0000 Subject: [PATCH 19/35] test: add more unit tests for new transformer features --- test/test_Transformers.py | 53 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/test/test_Transformers.py b/test/test_Transformers.py index 62ae79d8..d2cd1f22 100644 --- a/test/test_Transformers.py +++ b/test/test_Transformers.py @@ -4,6 +4,7 @@ from aperturedb.transformers.bounding_box_properties import BoundingBoxProperties from aperturedb.transformers.video_properties import VideoProperties from aperturedb.transformers.image_properties import ImageProperties +from aperturedb.transformers.transformer import Transformer import hashlib @@ -19,7 +20,7 @@ def __len__(self): def test_variable_annotation_counts(): - data = [ + data_orig = [ ([{"AddImage": {}}, {"AddBoundingBox": {}}], []), ([{"AddImage": {}}], []), ([{"AddImage": {}}, {"AddBoundingBox": {}}, { @@ -27,19 +28,28 @@ def test_variable_annotation_counts(): ([{"AddImage": {}}, {"AddPolygon": {}}, {"AddPolygon": {}}], []), ([{"AddVideo": {}}, {"AddBoundingBox": {}}], []) ] + import copy + + data = copy.deepcopy(data_orig) dummy_data = DummyData(data) - cp = CommonProperties(dummy_data, adb_data_source="test_source") + cp = CommonProperties( + dummy_data, adb_data_source="test_source", adb_timestamp="2026-05-24", adb_main_object="test_object" + ) for i in range(len(data)): res = cp[i] for cmd in res[0]: cmd_name = list(cmd.keys())[0] if cmd_name in ["AddImage", "AddBoundingBox", "AddPolygon", "AddVideo"]: assert cmd[cmd_name]["properties"]["adb_data_source"] == "test_source" + assert cmd[cmd_name]["properties"]["adb_timestamp"] == "2026-05-24" + assert cmd[cmd_name]["properties"]["adb_main_object"] == "test_object" + data_bbp = copy.deepcopy(data_orig) + dummy_data_bbp = DummyData(data_bbp) bbp = 
BoundingBoxProperties( - dummy_data, annotation_source="test_anno", annotation_mode="auto") - for i in range(len(data)): + dummy_data_bbp, annotation_source="test_anno", annotation_mode="auto") + for i in range(len(data_bbp)): res = bbp[i] for cmd in res[0]: cmd_name = list(cmd.keys())[0] @@ -49,6 +59,19 @@ def test_variable_annotation_counts(): elif cmd_name in ["AddImage", "AddVideo"]: assert "properties" not in cmd[cmd_name] or "annotation_source" not in cmd[cmd_name]["properties"] + # Test empty or missing annotations + data_empty = copy.deepcopy(data_orig) + dummy_data_empty = DummyData(data_empty) + bbp_empty = BoundingBoxProperties( + dummy_data_empty, annotation_source=None, annotation_mode=None) + for i in range(len(data_empty)): + res = bbp_empty[i] + for cmd in res[0]: + cmd_name = list(cmd.keys())[0] + if cmd_name in ["AddBoundingBox", "AddPolygon"]: + assert "properties" not in cmd[cmd_name] or "annotation_source" not in cmd[cmd_name].get( + "properties", {}) + @patch('aperturedb.transformers.transformer.Transformer.get_utils') def test_video_properties(mock_get_utils): @@ -73,6 +96,11 @@ def test_video_properties(mock_get_utils): dummy_data = DummyData(data) vp = VideoProperties(dummy_data) + # Verify index creation + mock_utils.get_indexed_props.assert_called_with("_Video") + mock_utils.create_entity_index.assert_called_with( + "_Video", "adb_data_source") + for i in range(len(data)): res = vp[i] for cmd in res[0]: @@ -196,3 +224,20 @@ def test_facenet_pytorch_embeddings(mock_get_utils): assert len(res[1]) == 3 assert res[1][-1] == [0.5, 0.6, 0.7, 0.8] + + +def test_base_transformer(): + data = [ + ([{"AddImage": {}}], [b"dummy"]) + ] + dummy_data = DummyData(data) + transformer = Transformer(dummy_data) + + assert len(transformer) == 1 + assert transformer._queries == 1 + assert transformer._blobs == 1 + assert transformer._blob_index == [0] + + # getitem is abstract + with pytest.raises(NotImplementedError): + _ = transformer[0] From 
3bb502ad47c657c03baec53f31e2a2d55580db86 Mon Sep 17 00:00:00 2001 From: claw Date: Mon, 25 May 2026 06:21:18 +0000 Subject: [PATCH 20/35] fix: avoid mutating traceback in logger.exception --- aperturedb/transformers/bounding_box_properties.py | 2 +- aperturedb/transformers/common_properties.py | 2 +- aperturedb/transformers/image_properties.py | 2 +- aperturedb/transformers/video_properties.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/aperturedb/transformers/bounding_box_properties.py b/aperturedb/transformers/bounding_box_properties.py index 201024a5..60e2a25d 100644 --- a/aperturedb/transformers/bounding_box_properties.py +++ b/aperturedb/transformers/bounding_box_properties.py @@ -28,6 +28,6 @@ def getitem(self, subscript): if self.annotation_mode: src_properties["annotation_mode"] = self.annotation_mode except Exception as e: - logger.exception(e.with_traceback(None), stack_info=True) + logger.exception("Error applying bounding box properties", stack_info=True) return x diff --git a/aperturedb/transformers/common_properties.py b/aperturedb/transformers/common_properties.py index aed94a3d..c39e75bf 100644 --- a/aperturedb/transformers/common_properties.py +++ b/aperturedb/transformers/common_properties.py @@ -42,6 +42,6 @@ def getitem(self, subscript): src_properties["adb_main_object"] = self.adb_main_object except Exception as e: - logger.exception(e.with_traceback(None), stack_info=True) + logger.exception("Error applying common properties", stack_info=True) return x diff --git a/aperturedb/transformers/image_properties.py b/aperturedb/transformers/image_properties.py index 57a1042f..d69c6b0c 100644 --- a/aperturedb/transformers/image_properties.py +++ b/aperturedb/transformers/image_properties.py @@ -49,6 +49,6 @@ def getitem(self, subscript): except Exception as e: # Importantly, do not raise an exception here, since it will kill ingestion. # Create a log message instead, for post-mortem analysis. 
- logger.exception(e.with_traceback(None), stack_info=True) + logger.exception("Error applying image properties", stack_info=True) return x diff --git a/aperturedb/transformers/video_properties.py b/aperturedb/transformers/video_properties.py index 655a954b..64803414 100644 --- a/aperturedb/transformers/video_properties.py +++ b/aperturedb/transformers/video_properties.py @@ -43,6 +43,6 @@ def getitem(self, subscript): except Exception as e: # Importantly, do not raise an exception here, since it will kill ingestion. # Create a log message instead, for post-mortem analysis. - logger.exception(e.with_traceback(None), stack_info=True) + logger.exception("Error applying video properties", stack_info=True) return x From cd2cd1663d2cf3a55f5dfeee8c32f0049c313fd6 Mon Sep 17 00:00:00 2001 From: claw Date: Mon, 25 May 2026 06:44:54 +0000 Subject: [PATCH 21/35] fix: run pre-commit autopep8 to resolve CI failure --- aperturedb/transformers/bounding_box_properties.py | 3 ++- aperturedb/transformers/common_properties.py | 3 ++- aperturedb/transformers/image_properties.py | 3 ++- aperturedb/transformers/video_properties.py | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/aperturedb/transformers/bounding_box_properties.py b/aperturedb/transformers/bounding_box_properties.py index 60e2a25d..bf0fdf1a 100644 --- a/aperturedb/transformers/bounding_box_properties.py +++ b/aperturedb/transformers/bounding_box_properties.py @@ -28,6 +28,7 @@ def getitem(self, subscript): if self.annotation_mode: src_properties["annotation_mode"] = self.annotation_mode except Exception as e: - logger.exception("Error applying bounding box properties", stack_info=True) + logger.exception( + "Error applying bounding box properties", stack_info=True) return x diff --git a/aperturedb/transformers/common_properties.py b/aperturedb/transformers/common_properties.py index c39e75bf..e734dd14 100644 --- a/aperturedb/transformers/common_properties.py +++ 
b/aperturedb/transformers/common_properties.py @@ -42,6 +42,7 @@ def getitem(self, subscript): src_properties["adb_main_object"] = self.adb_main_object except Exception as e: - logger.exception("Error applying common properties", stack_info=True) + logger.exception( + "Error applying common properties", stack_info=True) return x diff --git a/aperturedb/transformers/image_properties.py b/aperturedb/transformers/image_properties.py index d69c6b0c..d594b209 100644 --- a/aperturedb/transformers/image_properties.py +++ b/aperturedb/transformers/image_properties.py @@ -49,6 +49,7 @@ def getitem(self, subscript): except Exception as e: # Importantly, do not raise an exception here, since it will kill ingestion. # Create a log message instead, for post-mortem analysis. - logger.exception("Error applying image properties", stack_info=True) + logger.exception( + "Error applying image properties", stack_info=True) return x diff --git a/aperturedb/transformers/video_properties.py b/aperturedb/transformers/video_properties.py index 64803414..00ee4281 100644 --- a/aperturedb/transformers/video_properties.py +++ b/aperturedb/transformers/video_properties.py @@ -43,6 +43,7 @@ def getitem(self, subscript): except Exception as e: # Importantly, do not raise an exception here, since it will kill ingestion. # Create a log message instead, for post-mortem analysis. 
- logger.exception("Error applying video properties", stack_info=True) + logger.exception( + "Error applying video properties", stack_info=True) return x From 00cb81d859fba7d6f00ac62ba2ee98a871eaff00 Mon Sep 17 00:00:00 2001 From: claw Date: Mon, 25 May 2026 07:10:19 +0000 Subject: [PATCH 22/35] test: mock clip.generate_embedding to return float32 bytes --- test/test_Transformers.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/test/test_Transformers.py b/test/test_Transformers.py index d2cd1f22..134cc823 100644 --- a/test/test_Transformers.py +++ b/test/test_Transformers.py @@ -6,6 +6,7 @@ from aperturedb.transformers.image_properties import ImageProperties from aperturedb.transformers.transformer import Transformer import hashlib +import struct class DummyData: @@ -165,7 +166,8 @@ def test_clip_pytorch_embeddings(mock_get_utils): pytest.skip("Missing deps for CLIP") with patch('aperturedb.transformers.clip_pytorch_embeddings.generate_embedding') as mock_generate_embedding: - mock_generate_embedding.return_value = [0.1, 0.2, 0.3, 0.4] + dummy_embedding = struct.pack('<4f', 0.1, 0.2, 0.3, 0.4) + mock_generate_embedding.return_value = dummy_embedding mock_utils = mock_get_utils.return_value dummy_image_data = b"fake_image_blob_content" @@ -189,7 +191,7 @@ def test_clip_pytorch_embeddings(mock_get_utils): # 2 blobs originally + 1 generated embedding blob assert len(res[1]) == 3 - assert res[1][-1] == [0.1, 0.2, 0.3, 0.4] + assert res[1][-1] == dummy_embedding @patch('aperturedb.transformers.transformer.Transformer.get_utils') @@ -200,7 +202,8 @@ def test_facenet_pytorch_embeddings(mock_get_utils): pytest.skip("Missing deps for Facenet") with patch('aperturedb.transformers.facenet_pytorch_embeddings.FacenetPyTorchEmbeddings._get_embedding_from_blob') as mock_get_embedding: - mock_get_embedding.return_value = [0.5, 0.6, 0.7, 0.8] + dummy_embedding = struct.pack('<4f', 0.5, 0.6, 0.7, 0.8) + mock_get_embedding.return_value = 
dummy_embedding mock_utils = mock_get_utils.return_value dummy_image_data = b"fake_image_blob_content" @@ -223,7 +226,7 @@ def test_facenet_pytorch_embeddings(mock_get_utils): assert desc_cmd[0]["AddDescriptor"]["connect"]["ref"] == 2 assert len(res[1]) == 3 - assert res[1][-1] == [0.5, 0.6, 0.7, 0.8] + assert res[1][-1] == dummy_embedding def test_base_transformer(): From d95e29287dea06d4d0b91ee02602aa411ddd4711 Mon Sep 17 00:00:00 2001 From: claw Date: Mon, 25 May 2026 17:23:36 +0000 Subject: [PATCH 23/35] test: add placeholder blobs for image/video commands in test data --- test/test_Transformers.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/test_Transformers.py b/test/test_Transformers.py index 134cc823..e56e2c4d 100644 --- a/test/test_Transformers.py +++ b/test/test_Transformers.py @@ -22,12 +22,12 @@ def __len__(self): def test_variable_annotation_counts(): data_orig = [ - ([{"AddImage": {}}, {"AddBoundingBox": {}}], []), - ([{"AddImage": {}}], []), + ([{"AddImage": {}}, {"AddBoundingBox": {}}], [b"dummy_image"]), + ([{"AddImage": {}}], [b"dummy_image"]), ([{"AddImage": {}}, {"AddBoundingBox": {}}, { - "AddBoundingBox": {}}, {"AddPolygon": {}}], []), - ([{"AddImage": {}}, {"AddPolygon": {}}, {"AddPolygon": {}}], []), - ([{"AddVideo": {}}, {"AddBoundingBox": {}}], []) + "AddBoundingBox": {}}, {"AddPolygon": {}}], [b"dummy_image"]), + ([{"AddImage": {}}, {"AddPolygon": {}}, {"AddPolygon": {}}], [b"dummy_image"]), + ([{"AddVideo": {}}, {"AddBoundingBox": {}}], [b"dummy_video"]) ] import copy From 340b007b9089e429d58f8be677a946867b203c8c Mon Sep 17 00:00:00 2001 From: claw Date: Mon, 25 May 2026 17:47:09 +0000 Subject: [PATCH 24/35] fix: address pre-commit formatting issues in test_Transformers.py --- test/test_Transformers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_Transformers.py b/test/test_Transformers.py index e56e2c4d..fd42b7dd 100644 --- a/test/test_Transformers.py +++ 
b/test/test_Transformers.py @@ -26,7 +26,8 @@ def test_variable_annotation_counts(): ([{"AddImage": {}}], [b"dummy_image"]), ([{"AddImage": {}}, {"AddBoundingBox": {}}, { "AddBoundingBox": {}}, {"AddPolygon": {}}], [b"dummy_image"]), - ([{"AddImage": {}}, {"AddPolygon": {}}, {"AddPolygon": {}}], [b"dummy_image"]), + ([{"AddImage": {}}, {"AddPolygon": {}}, + {"AddPolygon": {}}], [b"dummy_image"]), ([{"AddVideo": {}}, {"AddBoundingBox": {}}], [b"dummy_video"]) ] import copy From 339e5931989a677742b45592431485b2faf4b9b2 Mon Sep 17 00:00:00 2001 From: claw Date: Mon, 25 May 2026 22:04:48 +0000 Subject: [PATCH 25/35] test: add tests for descriptor initialization retries --- test/test_Transformers.py | 68 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/test/test_Transformers.py b/test/test_Transformers.py index fd42b7dd..d09091e3 100644 --- a/test/test_Transformers.py +++ b/test/test_Transformers.py @@ -245,3 +245,71 @@ def test_base_transformer(): # getitem is abstract with pytest.raises(NotImplementedError): _ = transformer[0] + + +@patch('aperturedb.transformers.transformer.Transformer.get_utils') +def test_clip_descriptorset_initialization_retry(mock_get_utils): + try: + from aperturedb.transformers.clip_pytorch_embeddings import CLIPPyTorchEmbeddings + except (ImportError, SystemExit): + pytest.skip("Missing deps for CLIP") + + with patch('aperturedb.transformers.clip_pytorch_embeddings.generate_embedding') as mock_generate_embedding: + dummy_embedding = struct.pack('<4f', 0.1, 0.2, 0.3, 0.4) + mock_generate_embedding.return_value = dummy_embedding + + mock_utils = mock_get_utils.return_value + # Fail the first time, succeed the second time + mock_utils.add_descriptorset.side_effect = [False, True] + mock_utils.get_descriptorset_list.return_value = [] + + data = [ + ([{"AddImage": {"_ref": 1}}], [b"image1"]), + ([{"AddImage": {"_ref": 2}}], [b"image2"]) + ] + + dummy_data = DummyData(data) + clip = 
CLIPPyTorchEmbeddings(dummy_data) + + # First item: creation fails, should not be initialized + res1 = clip[0] + assert mock_utils.add_descriptorset.call_count == 1 + assert not clip._descriptorset_initialized + + # Second item: creation succeeds, should be initialized + res2 = clip[1] + assert mock_utils.add_descriptorset.call_count == 2 + assert clip._descriptorset_initialized + + +@patch('aperturedb.transformers.transformer.Transformer.get_utils') +def test_facenet_descriptorset_initialization_retry(mock_get_utils): + try: + from aperturedb.transformers.facenet_pytorch_embeddings import FacenetPyTorchEmbeddings + except (ImportError, SystemExit): + pytest.skip("Missing deps for Facenet") + + with patch('aperturedb.transformers.facenet_pytorch_embeddings.FacenetPyTorchEmbeddings._get_embedding_from_blob') as mock_get_embedding: + dummy_embedding = struct.pack('<4f', 0.1, 0.2, 0.3, 0.4) + mock_get_embedding.return_value = dummy_embedding + + mock_utils = mock_get_utils.return_value + # Fail both add and get first time, then succeed get the second time + mock_utils.add_descriptorset.return_value = False + mock_utils.get_descriptorset_list.side_effect = [[], ["facenet"]] + + data = [ + ([{"AddImage": {"_ref": 1}}], [b"image1"]), + ([{"AddImage": {"_ref": 2}}], [b"image2"]) + ] + + dummy_data = DummyData(data) + facenet = FacenetPyTorchEmbeddings(dummy_data) + + res1 = facenet[0] + assert mock_utils.add_descriptorset.call_count == 1 + assert not facenet._descriptorset_initialized + + res2 = facenet[1] + assert mock_utils.add_descriptorset.call_count == 2 + assert facenet._descriptorset_initialized From c7f526600bb5ea9638a15a89f88a346e31832f90 Mon Sep 17 00:00:00 2001 From: claw Date: Mon, 25 May 2026 22:27:14 +0000 Subject: [PATCH 26/35] test: update mocked descriptor set name to match default in FacenetPyTorchEmbeddings --- test/test_Transformers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_Transformers.py 
b/test/test_Transformers.py index d09091e3..857d666e 100644 --- a/test/test_Transformers.py +++ b/test/test_Transformers.py @@ -296,7 +296,8 @@ def test_facenet_descriptorset_initialization_retry(mock_get_utils): mock_utils = mock_get_utils.return_value # Fail both add and get first time, then succeed get the second time mock_utils.add_descriptorset.return_value = False - mock_utils.get_descriptorset_list.side_effect = [[], ["facenet"]] + mock_utils.get_descriptorset_list.side_effect = [ + [], ["facenet_pytorch_embeddings"]] data = [ ([{"AddImage": {"_ref": 1}}], [b"image1"]), From 878992738f355248f5e0a406d16e304bf0b08544 Mon Sep 17 00:00:00 2001 From: claw Date: Tue, 26 May 2026 20:33:21 +0000 Subject: [PATCH 27/35] fix: make common_properties and bounding_box_properties true no-ops when all parameters are unset Addresses review comments to avoid mutating command payloads with empty properties dicts when no annotations/common properties are provided. --- aperturedb/transformers/bounding_box_properties.py | 3 +++ aperturedb/transformers/common_properties.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/aperturedb/transformers/bounding_box_properties.py b/aperturedb/transformers/bounding_box_properties.py index bf0fdf1a..3c523bf6 100644 --- a/aperturedb/transformers/bounding_box_properties.py +++ b/aperturedb/transformers/bounding_box_properties.py @@ -16,6 +16,9 @@ def __init__(self, data: Subscriptable, **kwargs) -> None: self.annotation_mode = kwargs.get("annotation_mode", "auto") def getitem(self, subscript): + if not (self.annotation_source or self.annotation_mode): + return self.data[subscript] + x = self.data[subscript] try: for cmd_dict in x[0]: diff --git a/aperturedb/transformers/common_properties.py b/aperturedb/transformers/common_properties.py index e734dd14..42886adf 100644 --- a/aperturedb/transformers/common_properties.py +++ b/aperturedb/transformers/common_properties.py @@ -27,6 +27,9 @@ def __init__(self, data: Subscriptable, **kwargs) -> 
None: self.adb_main_object = kwargs.get("adb_main_object", None) def getitem(self, subscript): + if not (self.adb_data_source or self.adb_timestamp or self.adb_main_object): + return self.data[subscript] + x = self.data[subscript] try: for cmd_dict in x[0]: From 28c859224ed1f4c83d75483b0a394e78c58833b7 Mon Sep 17 00:00:00 2001 From: claw Date: Wed, 27 May 2026 00:33:16 +0000 Subject: [PATCH 28/35] fix(ci): fix EACCES permission denied by making teardown chmod robust --- test/run_test_container.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/run_test_container.sh b/test/run_test_container.sh index 6ea9b0e0..07ae1f9a 100755 --- a/test/run_test_container.sh +++ b/test/run_test_container.sh @@ -48,6 +48,7 @@ IP_REGEX='[0-9]\{1,3\}\.[0-9]\{1,3\}\.[0-9]\{1,3\}\.[0-9]\{1,3\}' function teardown() { echo "Tearing down containers and networks..." + $(get_sudo) chmod -R 777 "$(pwd)/aperturedb" "$(pwd)/"*_ca 2>/dev/null || true if [ "$TEST_PROTOCOL" == "http" ] || [ "$TEST_PROTOCOL" == "both" ]; then RUNNER_NAME="${RUNNER_NAME}_http" docker compose -f docker-compose.yml down --remove-orphans || true docker network rm "${RUNNER_NAME}_http_host_default" || true @@ -67,7 +68,7 @@ TESTING_LOG_PATH="/aperturedb/test/server_logs" RUNNER_INFO_PATH="$(pwd)/aperturedb/logs/runner_state" $(get_sudo) mkdir -p "$RUNNER_INFO_PATH" -$(get_sudo) chmod -R 777 "$LOG_PATH" || true +$(get_sudo) chmod -R 777 "$(pwd)/aperturedb" "$(pwd)/"*_ca 2>/dev/null || true # Check if TEST_PROTOCOL is set, otherwise default to both TEST_PROTOCOL=${TEST_PROTOCOL:-"both"} From b5df355cb81307ae037b490f80c66f29c72cb0c4 Mon Sep 17 00:00:00 2001 From: claw Date: Wed, 27 May 2026 05:56:19 +0000 Subject: [PATCH 29/35] fix: address review comments on clip/facenet embeddings and run_test_container.sh permissions --- .../transformers/clip_pytorch_embeddings.py | 39 ++++++++++--------- .../facenet_pytorch_embeddings.py | 39 ++++++++++--------- test/run_test_container.sh | 4 +- 3 files changed, 
42 insertions(+), 40 deletions(-) diff --git a/aperturedb/transformers/clip_pytorch_embeddings.py b/aperturedb/transformers/clip_pytorch_embeddings.py index 330ca6d7..d4e3c0e7 100644 --- a/aperturedb/transformers/clip_pytorch_embeddings.py +++ b/aperturedb/transformers/clip_pytorch_embeddings.py @@ -43,26 +43,27 @@ def getitem(self, subscript): self._descriptorset_initialized = True # If the image already has an image_sha256, we use it. - image_sha256 = cmd_dict["AddImage"].get("properties", {}).get( - "adb_image_sha256", None) - if not image_sha256: - image_sha256 = hashlib.sha256(blob).hexdigest() - new_blobs.append(serialized) - new_descriptors.append( - { - "AddDescriptor": { - "set": self.search_set_name, - "properties": { - "image_sha256": image_sha256, - }, - "if_not_found": { - "image_sha256": ["==", image_sha256], - }, - "connect": { - "ref": cmd_dict["AddImage"]["_ref"] + if getattr(self, "_descriptorset_initialized", False): + image_sha256 = cmd_dict["AddImage"].get("properties", {}).get( + "adb_image_sha256", None) + if not image_sha256: + image_sha256 = hashlib.sha256(blob).hexdigest() + new_blobs.append(serialized) + new_descriptors.append( + { + "AddDescriptor": { + "set": self.search_set_name, + "properties": { + "image_sha256": image_sha256, + }, + "if_not_found": { + "image_sha256": ["==", image_sha256], + }, + "connect": { + "ref": cmd_dict["AddImage"]["_ref"] + } } - } - }) + }) if cmd_name in ["AddImage", "AddDescriptor", "AddVideo", "AddBlob"]: blob_index += 1 diff --git a/aperturedb/transformers/facenet_pytorch_embeddings.py b/aperturedb/transformers/facenet_pytorch_embeddings.py index 7e64f652..024df46b 100644 --- a/aperturedb/transformers/facenet_pytorch_embeddings.py +++ b/aperturedb/transformers/facenet_pytorch_embeddings.py @@ -53,26 +53,27 @@ def getitem(self, subscript): self._descriptorset_initialized = True # If the image already has an image_sha256, we use it. 
- image_sha256 = cmd_dict["AddImage"].get("properties", {}).get( - "adb_image_sha256", None) - if not image_sha256: - image_sha256 = hashlib.sha256(blob).hexdigest() - new_blobs.append(serialized) - new_descriptors.append( - { - "AddDescriptor": { - "set": self.search_set_name, - "properties": { - "image_sha256": image_sha256, - }, - "if_not_found": { - "image_sha256": ["==", image_sha256], - }, - "connect": { - "ref": cmd_dict["AddImage"]["_ref"] + if getattr(self, "_descriptorset_initialized", False): + image_sha256 = cmd_dict["AddImage"].get("properties", {}).get( + "adb_image_sha256", None) + if not image_sha256: + image_sha256 = hashlib.sha256(blob).hexdigest() + new_blobs.append(serialized) + new_descriptors.append( + { + "AddDescriptor": { + "set": self.search_set_name, + "properties": { + "image_sha256": image_sha256, + }, + "if_not_found": { + "image_sha256": ["==", image_sha256], + }, + "connect": { + "ref": cmd_dict["AddImage"]["_ref"] + } } - } - }) + }) if cmd_name in ["AddImage", "AddDescriptor", "AddVideo", "AddBlob"]: blob_index += 1 diff --git a/test/run_test_container.sh b/test/run_test_container.sh index 07ae1f9a..128d05f7 100755 --- a/test/run_test_container.sh +++ b/test/run_test_container.sh @@ -48,7 +48,7 @@ IP_REGEX='[0-9]\{1,3\}\.[0-9]\{1,3\}\.[0-9]\{1,3\}\.[0-9]\{1,3\}' function teardown() { echo "Tearing down containers and networks..." 
- $(get_sudo) chmod -R 777 "$(pwd)/aperturedb" "$(pwd)/"*_ca 2>/dev/null || true + $(get_sudo) chmod -R a+rwX "$(pwd)/aperturedb/logs" "$(pwd)/"*_ca 2>/dev/null || true if [ "$TEST_PROTOCOL" == "http" ] || [ "$TEST_PROTOCOL" == "both" ]; then RUNNER_NAME="${RUNNER_NAME}_http" docker compose -f docker-compose.yml down --remove-orphans || true docker network rm "${RUNNER_NAME}_http_host_default" || true @@ -68,7 +68,7 @@ TESTING_LOG_PATH="/aperturedb/test/server_logs" RUNNER_INFO_PATH="$(pwd)/aperturedb/logs/runner_state" $(get_sudo) mkdir -p "$RUNNER_INFO_PATH" -$(get_sudo) chmod -R 777 "$(pwd)/aperturedb" "$(pwd)/"*_ca 2>/dev/null || true +$(get_sudo) chmod -R a+rwX "$(pwd)/aperturedb/logs" "$(pwd)/"*_ca 2>/dev/null || true # Check if TEST_PROTOCOL is set, otherwise default to both TEST_PROTOCOL=${TEST_PROTOCOL:-"both"} From a5df6fdf8828f3d18c197cd396b454baf8fc007b Mon Sep 17 00:00:00 2001 From: claw Date: Wed, 27 May 2026 15:09:28 +0000 Subject: [PATCH 30/35] test: assert AddDescriptor is omitted on initialization failure --- test/test_Transformers.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_Transformers.py b/test/test_Transformers.py index 857d666e..e9c2a234 100644 --- a/test/test_Transformers.py +++ b/test/test_Transformers.py @@ -275,11 +275,13 @@ def test_clip_descriptorset_initialization_retry(mock_get_utils): res1 = clip[0] assert mock_utils.add_descriptorset.call_count == 1 assert not clip._descriptorset_initialized + assert not any("AddDescriptor" in c for c in res1[0]) # Second item: creation succeeds, should be initialized res2 = clip[1] assert mock_utils.add_descriptorset.call_count == 2 assert clip._descriptorset_initialized + assert any("AddDescriptor" in c for c in res2[0]) @patch('aperturedb.transformers.transformer.Transformer.get_utils') @@ -310,7 +312,9 @@ def test_facenet_descriptorset_initialization_retry(mock_get_utils): res1 = facenet[0] assert mock_utils.add_descriptorset.call_count == 1 assert not 
facenet._descriptorset_initialized + assert not any("AddDescriptor" in c for c in res1[0]) res2 = facenet[1] assert mock_utils.add_descriptorset.call_count == 2 assert facenet._descriptorset_initialized + assert any("AddDescriptor" in c for c in res2[0]) From 38b625bc33c88771748060645c877ea6708d2d22 Mon Sep 17 00:00:00 2001 From: claw Date: Thu, 28 May 2026 09:54:06 +0000 Subject: [PATCH 31/35] test: add tests for early return and exception handling --- test/test_Transformers.py | 41 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/test/test_Transformers.py b/test/test_Transformers.py index e9c2a234..2aa75db5 100644 --- a/test/test_Transformers.py +++ b/test/test_Transformers.py @@ -318,3 +318,44 @@ def test_facenet_descriptorset_initialization_retry(mock_get_utils): assert mock_utils.add_descriptorset.call_count == 2 assert facenet._descriptorset_initialized assert any("AddDescriptor" in c for c in res2[0]) + + +def test_common_properties_early_return(): + data = [([{"AddImage": {}}], [b"dummy"])] + dummy_data = DummyData(data) + cp = CommonProperties(dummy_data, adb_data_source=None, + adb_timestamp=None, adb_main_object=None) + res = cp[0] + assert "properties" not in res[0][0]["AddImage"] + + +def test_bounding_box_properties_early_return(): + data = [([{"AddBoundingBox": {}}], [])] + dummy_data = DummyData(data) + bbp = BoundingBoxProperties( + dummy_data, annotation_source=None, annotation_mode=None) + res = bbp[0] + assert "properties" not in res[0][0]["AddBoundingBox"] + + +@patch('aperturedb.transformers.transformer.Transformer.get_utils') +@patch('aperturedb.transformers.video_properties.hashlib.sha256') +def test_video_properties_exception_handling(mock_sha256, mock_get_utils): + mock_utils = mock_get_utils.return_value + mock_utils.get_indexed_props.return_value = [] + + mock_sha256.side_effect = Exception("Test Exception") + + dummy_video_data = b"fake_video_blob_content" + data = [ + ([ + {"AddVideo": {}} + ], 
[dummy_video_data]) + ] + + dummy_data = DummyData(data) + vp = VideoProperties(dummy_data) + + res = vp[0] + + assert "adb_video_sha256" not in res[0][0]["AddVideo"]["properties"] From 76589713cb943243da19d67f0e2217a5058e9d4c Mon Sep 17 00:00:00 2001 From: claw Date: Fri, 29 May 2026 07:06:14 +0000 Subject: [PATCH 32/35] test: add exception handling and explicit id tests for transformers Addresses review feedback to add more testing and ensure new features are properly tested. --- test/test_Transformers.py | 64 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 2 deletions(-) diff --git a/test/test_Transformers.py b/test/test_Transformers.py index 2aa75db5..c6d90988 100644 --- a/test/test_Transformers.py +++ b/test/test_Transformers.py @@ -83,7 +83,7 @@ def test_video_properties(mock_get_utils): dummy_video_data = b"fake_video_blob_content" data = [ ([ - {"AddVideo": {}}, + {"AddVideo": {"properties": {"id": "test_id"}}}, {"AddBoundingBox": {}} ], [dummy_video_data]), ([ @@ -113,6 +113,8 @@ def test_video_properties(mock_get_utils): assert props["adb_video_sha256"] == hashlib.sha256( dummy_video_data).hexdigest() assert "adb_video_id" in props + if "id" in props and props["id"] == "test_id": + assert props["adb_video_id"] == "test_id" @patch('aperturedb.transformers.transformer.Transformer.get_utils') @@ -129,7 +131,7 @@ def test_image_properties(mock_image_open, mock_get_utils): dummy_image_data = b"fake_image_blob_content" data = [ ([ - {"AddImage": {"_ref": 1}}, + {"AddImage": {"_ref": 1, "properties": {"id": "test_image_id"}}}, {"AddVideo": {}} ], [dummy_image_data, b"video_blob"]), ([ @@ -156,6 +158,8 @@ def test_image_properties(mock_image_open, mock_get_utils): assert props["adb_image_width"] == 800 assert props["adb_image_height"] == 600 assert "adb_image_id" in props + if "id" in props and props["id"] == "test_image_id": + assert props["adb_image_id"] == "test_image_id" 
@patch('aperturedb.transformers.transformer.Transformer.get_utils') @@ -359,3 +363,59 @@ def test_video_properties_exception_handling(mock_sha256, mock_get_utils): res = vp[0] assert "adb_video_sha256" not in res[0][0]["AddVideo"]["properties"] + + +@patch('aperturedb.transformers.bounding_box_properties.logger') +def test_bounding_box_properties_exception_handling(mock_logger): + # Pass a malformed command dictionary where "AddBoundingBox" is not a dict + data = [([{"AddBoundingBox": "invalid_type_not_dict"}], [])] + dummy_data = DummyData(data) + + bbp = BoundingBoxProperties( + dummy_data, annotation_source="test_anno", annotation_mode="auto") + + # This should raise an AttributeError when calling setdefault on a string + # But the exception should be caught and logged + res = bbp[0] + + assert mock_logger.exception.called + assert "Error applying bounding box properties" in mock_logger.exception.call_args[0][0] + # The original command should remain unchanged + assert res[0][0]["AddBoundingBox"] == "invalid_type_not_dict" + + +@patch('aperturedb.transformers.image_properties.logger') +@patch('aperturedb.transformers.transformer.Transformer.get_utils') +@patch('aperturedb.transformers.image_properties.Image.open') +def test_image_properties_exception_handling(mock_image_open, mock_get_utils, mock_logger): + mock_utils = mock_get_utils.return_value + mock_utils.get_indexed_props.return_value = [] + + # Make Image.open raise an exception + mock_image_open.side_effect = Exception("Image load error") + + dummy_image_data = b"fake_image_blob_content" + data = [([{"AddImage": {}}], [dummy_image_data])] + dummy_data = DummyData(data) + + ip = ImageProperties(dummy_data) + res = ip[0] + + assert mock_logger.exception.called + assert "Error applying image properties" in mock_logger.exception.call_args[0][0] + # The properties should not have the size or width/height + assert "adb_image_width" not in res[0][0]["AddImage"]["properties"] + + 
+@patch('aperturedb.transformers.common_properties.logger') +def test_common_properties_exception_handling(mock_logger): + # Pass a malformed command dictionary + data = [([{"AddImage": "invalid_type_not_dict"}], [b"dummy"])] + dummy_data = DummyData(data) + + cp = CommonProperties(dummy_data, adb_data_source="test_source") + res = cp[0] + + assert mock_logger.exception.called + assert "Error applying common properties" in mock_logger.exception.call_args[0][0] + assert res[0][0]["AddImage"] == "invalid_type_not_dict" From 30ba71e54dd4d23a486611e0018a05e777241598 Mon Sep 17 00:00:00 2001 From: claw Date: Fri, 29 May 2026 09:29:15 +0000 Subject: [PATCH 33/35] fix: address review comments on PR #720 Addresses review feedback from Copilot From 5379c928468deda56c557783e3bdde2da5225cc5 Mon Sep 17 00:00:00 2001 From: claw Date: Fri, 29 May 2026 12:46:21 +0000 Subject: [PATCH 34/35] test: update image properties exception handling test --- test/test_Transformers.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/test/test_Transformers.py b/test/test_Transformers.py index c6d90988..a5f08ae2 100644 --- a/test/test_Transformers.py +++ b/test/test_Transformers.py @@ -403,8 +403,15 @@ def test_image_properties_exception_handling(mock_image_open, mock_get_utils, mo assert mock_logger.exception.called assert "Error applying image properties" in mock_logger.exception.call_args[0][0] - # The properties should not have the size or width/height - assert "adb_image_width" not in res[0][0]["AddImage"]["properties"] + + props = res[0][0]["AddImage"]["properties"] + # size and sha256 are computed before Image.open, so they should be present + assert "adb_image_size" in props + assert "adb_image_sha256" in props + # width, height, and id are computed after Image.open, so they should be missing + assert "adb_image_width" not in props + assert "adb_image_height" not in props + assert "adb_image_id" not in props 
@patch('aperturedb.transformers.common_properties.logger') From 52ea1e195df8084c050197fb7d1f040993548f8c Mon Sep 17 00:00:00 2001 From: claw Date: Fri, 29 May 2026 16:29:54 +0000 Subject: [PATCH 35/35] fix: address review comments on exception handling in transformers --- .../transformers/clip_pytorch_embeddings.py | 7 ++++-- .../facenet_pytorch_embeddings.py | 7 ++++-- aperturedb/transformers/image_properties.py | 22 +++++++++---------- aperturedb/transformers/video_properties.py | 22 +++++++++---------- 4 files changed, 32 insertions(+), 26 deletions(-) diff --git a/aperturedb/transformers/clip_pytorch_embeddings.py b/aperturedb/transformers/clip_pytorch_embeddings.py index d4e3c0e7..b7cec5d9 100644 --- a/aperturedb/transformers/clip_pytorch_embeddings.py +++ b/aperturedb/transformers/clip_pytorch_embeddings.py @@ -39,8 +39,11 @@ def getitem(self, subscript): utils = self.get_utils() success = utils.add_descriptorset( self.search_set_name, dim=len(serialized) // 4, metric=["CS"]) - if success or self.search_set_name in utils.get_descriptorset_list(): - self._descriptorset_initialized = True + try: + if success or self.search_set_name in utils.get_descriptorset_list(): + self._descriptorset_initialized = True + except Exception: + pass # If the image already has an image_sha256, we use it. 
if getattr(self, "_descriptorset_initialized", False): diff --git a/aperturedb/transformers/facenet_pytorch_embeddings.py b/aperturedb/transformers/facenet_pytorch_embeddings.py index 024df46b..8d82343f 100644 --- a/aperturedb/transformers/facenet_pytorch_embeddings.py +++ b/aperturedb/transformers/facenet_pytorch_embeddings.py @@ -49,8 +49,11 @@ def getitem(self, subscript): utils = self.get_utils() success = utils.add_descriptorset( self.search_set_name, dim=len(serialized) // 4) - if success or self.search_set_name in utils.get_descriptorset_list(): - self._descriptorset_initialized = True + try: + if success or self.search_set_name in utils.get_descriptorset_list(): + self._descriptorset_initialized = True + except Exception: + pass # If the image already has an image_sha256, we use it. if getattr(self, "_descriptorset_initialized", False): diff --git a/aperturedb/transformers/image_properties.py b/aperturedb/transformers/image_properties.py index d594b209..68f4282e 100644 --- a/aperturedb/transformers/image_properties.py +++ b/aperturedb/transformers/image_properties.py @@ -24,10 +24,10 @@ def __init__(self, data: Subscriptable, **kwargs) -> None: def getitem(self, subscript): x = self.data[subscript] - try: - blob_index = 0 - for cmd_dict in x[0]: - cmd_name = list(cmd_dict.keys())[0] + blob_index = 0 + for cmd_dict in x[0]: + cmd_name = list(cmd_dict.keys())[0] + try: if cmd_name == "AddImage": src_properties = cmd_dict["AddImage"].setdefault( "properties", {}) @@ -43,13 +43,13 @@ def getitem(self, subscript): src_properties["adb_image_id"] = str( src_properties["id"] if "id" in src_properties else uuid.uuid4().hex) - if cmd_name in ["AddImage", "AddDescriptor", "AddVideo", "AddBlob"]: - blob_index += 1 + except Exception as e: + # Importantly, do not raise an exception here, since it will kill ingestion. + # Create a log message instead, for post-mortem analysis. 
+ logger.exception( + "Error applying image properties", stack_info=True) - except Exception as e: - # Importantly, do not raise an exception here, since it will kill ingestion. - # Create a log message instead, for post-mortem analysis. - logger.exception( - "Error applying image properties", stack_info=True) + if cmd_name in ["AddImage", "AddDescriptor", "AddVideo", "AddBlob"]: + blob_index += 1 return x diff --git a/aperturedb/transformers/video_properties.py b/aperturedb/transformers/video_properties.py index 00ee4281..39690807 100644 --- a/aperturedb/transformers/video_properties.py +++ b/aperturedb/transformers/video_properties.py @@ -22,10 +22,10 @@ def __init__(self, data: Subscriptable, **kwargs) -> None: def getitem(self, subscript): x = self.data[subscript] - try: - blob_index = 0 - for cmd_dict in x[0]: - cmd_name = list(cmd_dict.keys())[0] + blob_index = 0 + for cmd_dict in x[0]: + cmd_name = list(cmd_dict.keys())[0] + try: if cmd_name == "AddVideo": src_properties = cmd_dict["AddVideo"].setdefault( "properties", {}) @@ -37,13 +37,13 @@ def getitem(self, subscript): src_properties["adb_video_id"] = str( src_properties["id"] if "id" in src_properties else uuid.uuid4().hex) - if cmd_name in ["AddImage", "AddDescriptor", "AddVideo", "AddBlob"]: - blob_index += 1 + except Exception as e: + # Importantly, do not raise an exception here, since it will kill ingestion. + # Create a log message instead, for post-mortem analysis. + logger.exception( + "Error applying video properties", stack_info=True) - except Exception as e: - # Importantly, do not raise an exception here, since it will kill ingestion. - # Create a log message instead, for post-mortem analysis. - logger.exception( - "Error applying video properties", stack_info=True) + if cmd_name in ["AddImage", "AddDescriptor", "AddVideo", "AddBlob"]: + blob_index += 1 return x