Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
5aceeb7
feat: Add transformers for videos and bounding boxes
ad-claw000 May 20, 2026
7c2d8ce
Fix pre-commit failures and remove optional dependencies from transfo…
ad-claw000 May 20, 2026
6278058
Fix: Address transformer PR review comments
ad-claw000 May 21, 2026
d0d7eb7
fix(transformers): evaluate annotation counts dynamically per transac…
ad-claw000 May 21, 2026
6e8010b
Fix pre-commit issues
ad-claw000 May 21, 2026
ab6958d
fix(transformers): evaluate image and video counts dynamically per tr…
ad-claw000 May 21, 2026
cfc08ad
Fix: Address review comments on transformers
ad-claw000 May 21, 2026
5c3f442
fix(transformers): remove unused _add_image_index and compute dynamic…
ad-claw000 May 21, 2026
30721e5
fix: apply autopep8 formatting
May 21, 2026
d28a4fd
Fix remaining unused variable and unused index review comments
ad-claw000 May 21, 2026
32f1400
Merge branch 'develop' into fix/issue-335
luisremis May 22, 2026
871c955
Merge remote-tracking branch 'origin/develop' into fix/issue-335
ad-claw000 May 23, 2026
1e2f6a7
fix: lazily initialize descriptor sets on first AddImage in getitem
ad-claw000 May 23, 2026
cf75c40
fix: lazily initialize descriptor sets on first AddImage in getitem f…
ad-claw000 May 23, 2026
8d37ab8
fix(transformers): avoid duplicate embedding generation for the first…
ad-claw000 May 23, 2026
02b8fa2
style: fix autopep8 formatting in facenet_pytorch_embeddings.py
ad-claw000 May 23, 2026
f68a079
fix(transformers): restore _add_image_index for backward compat and a…
ad-claw000 May 24, 2026
65e6c26
Merge branch 'develop' into fix/issue-335
luisremis May 24, 2026
1f30391
Merge remote-tracking branch 'origin/develop' into fix/issue-335
ad-claw000 May 24, 2026
f845763
test: add coverage for ImageProperties, clip, and facenet transformers
ad-claw000 May 24, 2026
f5b565e
test: remove unused sys import in test_Transformers.py
ad-claw000 May 24, 2026
b59526f
fix: set _descriptorset_initialized only on success or if exists
ad-claw000 May 25, 2026
f202783
test: add more unit tests for new transformer features
ad-claw000 May 25, 2026
3bb502a
fix: avoid mutating traceback in logger.exception
ad-claw000 May 25, 2026
cd2cd16
fix: run pre-commit autopep8 to resolve CI failure
ad-claw000 May 25, 2026
00cb81d
test: mock clip.generate_embedding to return float32 bytes
ad-claw000 May 25, 2026
d95e292
test: add placeholder blobs for image/video commands in test data
ad-claw000 May 25, 2026
340b007
fix: address pre-commit formatting issues in test_Transformers.py
ad-claw000 May 25, 2026
56ae22b
Merge remote-tracking branch 'origin/develop' into fix/issue-335
ad-claw000 May 25, 2026
339e593
test: add tests for descriptor initialization retries
ad-claw000 May 25, 2026
c7f5266
test: update mocked descriptor set name to match default in FacenetPy…
ad-claw000 May 25, 2026
0a49c19
Merge branch 'develop' into fix/issue-335
luisremis May 26, 2026
8789927
fix: make common_properties and bounding_box_properties true no-ops w…
ad-claw000 May 26, 2026
28c8592
fix(ci): fix EACCES permission denied by making teardown chmod robust
ad-claw000 May 27, 2026
b5df355
fix: address review comments on clip/facenet embeddings and run_test_…
ad-claw000 May 27, 2026
a5df6fd
test: assert AddDescriptor is omitted on initialization failure
ad-claw000 May 27, 2026
38b625b
test: add tests for early return and exception handling
ad-claw000 May 28, 2026
7658971
test: add exception handling and explicit id tests for transformers
ad-claw000 May 29, 2026
30ba71e
fix: address review comments on PR #720
ad-claw000 May 29, 2026
5379c92
test: update image properties exception handling test
ad-claw000 May 29, 2026
52ea1e1
fix: address review comments on exception handling in transformers
ad-claw000 May 29, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions aperturedb/transformers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from .transformer import Transformer
from .common_properties import CommonProperties
from .image_properties import ImageProperties
from .video_properties import VideoProperties
from .bounding_box_properties import BoundingBoxProperties

__all__ = [
"Transformer",
"CommonProperties",
"ImageProperties",
"VideoProperties",
"BoundingBoxProperties",
]
37 changes: 37 additions & 0 deletions aperturedb/transformers/bounding_box_properties.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from aperturedb.transformers.transformer import Transformer
from aperturedb.Subscriptable import Subscriptable
import logging

logger = logging.getLogger(__name__)


class BoundingBoxProperties(Transformer):
    """
    Tag bounding-box and polygon commands with annotation provenance.

    For every ``AddBoundingBox`` / ``AddPolygon`` command in a transaction,
    the configured ``annotation_source`` and ``annotation_mode`` values are
    written into that command's ``properties`` dict (created when absent).
    """

    def __init__(self, data: Subscriptable, **kwargs) -> None:
        """
        Args:
            data: The subscriptable whose transactions are transformed.
            **kwargs: Forwarded unchanged to ``Transformer.__init__``. Two keys
                are also read here:
                ``annotation_source`` (default ``"coco"``) and
                ``annotation_mode`` (default ``"auto"``). A falsy value
                disables the corresponding property.
        """
        super().__init__(data, **kwargs)
        self.annotation_source = kwargs.get("annotation_source", "coco")
        self.annotation_mode = kwargs.get("annotation_mode", "auto")

    def getitem(self, subscript):
        """
        Return the transaction at ``subscript`` with annotation tags applied.

        The transaction is mutated in place and returned. Errors are logged
        rather than raised, so a malformed transaction is returned as far as
        it could be processed.
        """
        x = self.data[subscript]

        # True no-op when neither property is configured.
        if not (self.annotation_source or self.annotation_mode):
            return x

        try:
            for cmd_dict in x[0]:
                # An empty command dict yields None and is skipped instead of
                # raising and aborting the tagging of the commands after it.
                cmd_name = next(iter(cmd_dict), None)
                if cmd_name not in ("AddBoundingBox", "AddPolygon"):
                    continue

                src_properties = cmd_dict[cmd_name].setdefault(
                    "properties", {})
                if self.annotation_source:
                    src_properties["annotation_source"] = self.annotation_source
                if self.annotation_mode:
                    src_properties["annotation_mode"] = self.annotation_mode
        except Exception:
            logger.exception(
                "Error applying bounding box properties", stack_info=True)

        return x
79 changes: 49 additions & 30 deletions aperturedb/transformers/clip_pytorch_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,38 +19,57 @@ def __init__(self, data: Subscriptable, **kwargs) -> None:
self.search_set_name = kwargs.pop(
"search_set_name", descriptor_set)
super().__init__(data, **kwargs)

# Let's sample some data to figure out the descriptorset we need.
if len(self._add_image_index) > 0:
sample = generate_embedding(self.data[0][1][0])
utils = self.get_utils()
utils.add_descriptorset(
self.search_set_name, dim=len(sample) // 4, metric=["CS"])
self._descriptorset_initialized = False

def getitem(self, subscript):
    """
    Return the transaction at ``subscript`` with one ``AddDescriptor``
    command (and its embedding blob) appended per ``AddImage`` command.

    The descriptor set named ``self.search_set_name`` is created lazily on
    the first ``AddImage`` seen. While its existence cannot be confirmed, no
    ``AddDescriptor`` is emitted and creation is retried on the next image.

    The transaction is mutated in place: new commands are appended to
    ``x[0]`` and new blobs to ``x[1]``, after all existing entries.
    """
    # Local import: the module's top-level imports are outside this view.
    import logging

    x = self.data[subscript]
    # Assumes x[1] holds one blob per blob-carrying command, in command order.
    blob_index = 0
    new_descriptors = []
    new_blobs = []

    for cmd_dict in x[0]:
        cmd_name = next(iter(cmd_dict), None)
        if cmd_name == "AddImage":
            blob = x[1][blob_index]

            serialized = generate_embedding(blob)

            if not getattr(self, "_descriptorset_initialized", False):
                utils = self.get_utils()
                # dim assumes 4 bytes per element (presumably float32 --
                # confirm against generate_embedding).
                success = utils.add_descriptorset(
                    self.search_set_name, dim=len(serialized) // 4, metric=["CS"])
                try:
                    if success or self.search_set_name in utils.get_descriptorset_list():
                        self._descriptorset_initialized = True
                except Exception:
                    # Stay uninitialized so the next image retries, but leave
                    # a trace instead of swallowing the failure silently.
                    logging.getLogger(__name__).warning(
                        "Could not verify descriptor set %r; will retry",
                        self.search_set_name, exc_info=True)

            if getattr(self, "_descriptorset_initialized", False):
                # If the image already has an image_sha256, we use it.
                image_sha256 = cmd_dict["AddImage"].get("properties", {}).get(
                    "adb_image_sha256", None)
                if not image_sha256:
                    image_sha256 = hashlib.sha256(blob).hexdigest()
                new_blobs.append(serialized)
                new_descriptors.append(
                    {
                        "AddDescriptor": {
                            "set": self.search_set_name,
                            "properties": {
                                "image_sha256": image_sha256,
                            },
                            "if_not_found": {
                                "image_sha256": ["==", image_sha256],
                            },
                            "connect": {
                                "ref": cmd_dict["AddImage"]["_ref"]
                            }
                        }
                    })
        if cmd_name in ("AddImage", "AddDescriptor", "AddVideo", "AddBlob"):
            blob_index += 1

    # Appended after the loop so blob positions stay aligned while iterating.
    x[0].extend(new_descriptors)
    x[1].extend(new_blobs)
    return x
28 changes: 17 additions & 11 deletions aperturedb/transformers/common_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,25 @@ def __init__(self, data: Subscriptable, **kwargs) -> None:
self.adb_main_object = kwargs.get("adb_main_object", None)

def getitem(self, subscript):
    """
    Return the transaction at ``subscript`` with the static common
    properties applied.

    Each of ``adb_data_source``, ``adb_timestamp`` and ``adb_main_object``
    that is set (truthy) on the transformer is written into the
    ``properties`` of every ``AddImage``, ``AddVideo``, ``AddBoundingBox``
    and ``AddPolygon`` command, creating ``properties`` when absent.

    The transaction is mutated in place. Errors are logged, not raised.
    """
    x = self.data[subscript]

    # True no-op when nothing is configured.
    if not (self.adb_data_source or self.adb_timestamp or self.adb_main_object):
        return x

    # Only the explicitly set values are applied.
    static_properties = {
        name: value
        for name, value in (
            ("adb_data_source", self.adb_data_source),
            ("adb_timestamp", self.adb_timestamp),
            ("adb_main_object", self.adb_main_object),
        )
        if value
    }

    try:
        for cmd_dict in x[0]:
            # An empty command dict yields None and is skipped instead of
            # raising and aborting the commands that follow it.
            cmd_name = next(iter(cmd_dict), None)
            if cmd_name in ("AddImage", "AddVideo", "AddBoundingBox", "AddPolygon"):
                cmd_dict[cmd_name].setdefault(
                    "properties", {}).update(static_properties)
    except Exception:
        logger.exception(
            "Error applying common properties", stack_info=True)

    return x
79 changes: 49 additions & 30 deletions aperturedb/transformers/facenet_pytorch_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,7 @@ def __init__(self, data: Subscriptable, **kwargs) -> None:
self.search_set_name = kwargs.pop(
"search_set_name", "facenet_pytorch_embeddings")
super().__init__(data, **kwargs)

# Let's sample some data to figure out the descriptorset we need.
if len(self._add_image_index) > 0:
sample = self._get_embedding_from_blob(self.data[0][1][0])
utils = self.get_utils()
utils.add_descriptorset(self.search_set_name, dim=len(sample) // 4)
self._descriptorset_initialized = False

def _get_embedding_from_blob(self, image_blob: bytes):
pil_image = Image.open(io.BytesIO(image_blob))
Expand All @@ -39,29 +34,53 @@ def getitem(self, subscript):
self.ncalls += 1
x = self.data[subscript]

for ic in self._add_image_index:
serialized = self._get_embedding_from_blob(
x[1][self._add_image_index.index(ic)])
# If the image already has an image_sha256, we use it.
image_sha256 = x[0][ic]["AddImage"].get("properties", {}).get(
"adb_image_sha256", None)
if not image_sha256:
image_sha256 = hashlib.sha256(x[1][ic]).hexdigest()
x[1].append(serialized)
x[0].append(
{
"AddDescriptor": {
"set": self.search_set_name,
"properties": {
"image_sha256": image_sha256,
},
"if_not_found": {
"image_sha256": ["==", image_sha256],
},
"connect": {
"ref": x[0][ic]["AddImage"]["_ref"]
}
}
})
blob_index = 0
new_descriptors = []
new_blobs = []

for cmd_dict in x[0]:
cmd_name = list(cmd_dict.keys())[0]
if cmd_name == "AddImage":
blob = x[1][blob_index]

serialized = self._get_embedding_from_blob(blob)

if not getattr(self, "_descriptorset_initialized", False):
utils = self.get_utils()
success = utils.add_descriptorset(
self.search_set_name, dim=len(serialized) // 4)
try:
if success or self.search_set_name in utils.get_descriptorset_list():
self._descriptorset_initialized = True
except Exception:
pass

Comment thread
ad-claw000 marked this conversation as resolved.
# If the image already has an image_sha256, we use it.
if getattr(self, "_descriptorset_initialized", False):
image_sha256 = cmd_dict["AddImage"].get("properties", {}).get(
"adb_image_sha256", None)
if not image_sha256:
image_sha256 = hashlib.sha256(blob).hexdigest()
new_blobs.append(serialized)
new_descriptors.append(
{
"AddDescriptor": {
"set": self.search_set_name,
"properties": {
"image_sha256": image_sha256,
},
"if_not_found": {
"image_sha256": ["==", image_sha256],
},
"connect": {
"ref": cmd_dict["AddImage"]["_ref"]
}
}
})
if cmd_name in ["AddImage", "AddDescriptor", "AddVideo", "AddBlob"]:
blob_index += 1

x[0].extend(new_descriptors)
x[1].extend(new_blobs)
self.cumulative_time += time.time() - start
return x
48 changes: 27 additions & 21 deletions aperturedb/transformers/image_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,26 +24,32 @@ def __init__(self, data: Subscriptable, **kwargs) -> None:

def getitem(self, subscript):
    """
    Return the transaction at ``subscript`` with computed image properties
    added to every ``AddImage`` command.

    Properties written: ``adb_image_size``, ``adb_image_sha256``,
    ``adb_image_width``, ``adb_image_height`` and ``adb_image_id`` (the
    existing ``id`` property as a string when present, otherwise a new
    UUID hex).

    The transaction is mutated in place. A failure on one command is
    logged and the remaining commands are still processed.
    """
    x = self.data[subscript]
    # Assumes x[1] holds one blob per blob-carrying command, in command order.
    blob_index = 0
    for cmd_dict in x[0]:
        # Resolved without raising so an empty command dict cannot kill
        # ingestion; None matches none of the command names below.
        cmd_name = next(iter(cmd_dict), None)
        try:
            if cmd_name == "AddImage":
                blob = x[1][blob_index]
                src_properties = cmd_dict["AddImage"].setdefault(
                    "properties", {})
                # Compute the dynamic properties and apply them to metadata
                src_properties["adb_image_size"] = len(blob)
                src_properties["adb_image_sha256"] = hashlib.sha256(
                    blob).hexdigest()

                # Compute the image dimensions.
                with Image.open(io.BytesIO(blob)) as pil_image:
                    src_properties["adb_image_width"] = pil_image.width
                    src_properties["adb_image_height"] = pil_image.height
                src_properties["adb_image_id"] = str(
                    src_properties["id"] if "id" in src_properties else uuid.uuid4().hex)
        except Exception:
            # Importantly, do not raise an exception here, since it will kill ingestion.
            # Create a log message instead, for post-mortem analysis.
            logger.exception(
                "Error applying image properties", stack_info=True)

        # Advance even after a failure so later commands stay aligned
        # with their blobs.
        if cmd_name in ("AddImage", "AddDescriptor", "AddVideo", "AddBlob"):
            blob_index += 1

    return x
8 changes: 4 additions & 4 deletions aperturedb/transformers/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,12 @@ def __init__(self, data: Subscriptable, client=None, **kwargs) -> None:
command = list(c.keys())[0]
if command in ["AddImage", "AddDescriptor", "AddVideo", "AddBlob"]:
self._blob_index.append(i)
if command == "AddImage":
self._add_image_index.append(i)
bc += 1
# Kept for backward compatibility
if command == "AddImage":
self._add_image_index.append(i)

logger.info(f"Found {bc} blobs in the data")
logger.info(
f"Found {len(self._add_image_index)} AddImage commands in the data")

self.ncalls = 0
self.cumulative_time = 0
Expand Down
49 changes: 49 additions & 0 deletions aperturedb/transformers/video_properties.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from aperturedb.transformers.transformer import Transformer
from aperturedb.Subscriptable import Subscriptable

import logging
import uuid
import hashlib

logger = logging.getLogger(__name__)


class VideoProperties(Transformer):
    """
    Compute per-video properties and add them to the ``AddVideo`` metadata.

    Properties written: ``adb_video_size``, ``adb_video_sha256`` and
    ``adb_video_id``.
    """

    def __init__(self, data: Subscriptable, **kwargs) -> None:
        """
        Args:
            data: The subscriptable whose transactions are transformed.
            **kwargs: Forwarded unchanged to ``Transformer.__init__``.

        Creates an entity index on ``_Video.adb_data_source`` when the
        property is not already reported as indexed.
        """
        super().__init__(data, **kwargs)
        utils = self.get_utils()

        if "adb_data_source" not in utils.get_indexed_props("_Video"):
            utils.create_entity_index("_Video", "adb_data_source")

    def getitem(self, subscript):
        """
        Return the transaction at ``subscript`` with computed video
        properties added to every ``AddVideo`` command.

        The transaction is mutated in place. A failure on one command is
        logged and the remaining commands are still processed.
        """
        x = self.data[subscript]
        # Assumes x[1] holds one blob per blob-carrying command, in order.
        blob_index = 0
        for cmd_dict in x[0]:
            # Resolved without raising so an empty command dict cannot kill
            # ingestion; None matches none of the command names below.
            cmd_name = next(iter(cmd_dict), None)
            try:
                if cmd_name == "AddVideo":
                    blob = x[1][blob_index]
                    src_properties = cmd_dict["AddVideo"].setdefault(
                        "properties", {})
                    # Compute the dynamic properties and apply them to metadata
                    src_properties["adb_video_size"] = len(blob)
                    src_properties["adb_video_sha256"] = hashlib.sha256(
                        blob).hexdigest()

                    # Reuse the caller-supplied id when there is one.
                    src_properties["adb_video_id"] = str(
                        src_properties["id"] if "id" in src_properties else uuid.uuid4().hex)
            except Exception:
                # Importantly, do not raise an exception here, since it will kill ingestion.
                # Create a log message instead, for post-mortem analysis.
                logger.exception(
                    "Error applying video properties", stack_info=True)

            # Advance even after a failure so later commands stay aligned
            # with their blobs.
            if cmd_name in ("AddImage", "AddDescriptor", "AddVideo", "AddBlob"):
                blob_index += 1

        return x
Loading
Loading