Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 24 additions & 11 deletions roboflow/util/folderparser.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import csv
import json
import os
import re
Expand Down Expand Up @@ -324,25 +325,37 @@ def _read_jsonl(path):


def _parseAnnotationCSV(filename):
    """Parse an annotation CSV file into a dict describing its rows.

    Two layouts are recognised:

    * A file whose basename is ``_classes.csv`` is treated as a multi-label
      classification table. The result is
      ``{"type": "multilabel_csv", "rows": [...], "headers": [...]}`` where each
      row is ``{"file_name": str, "labels": [header, ...]}`` and ``labels`` holds
      the header of every column whose cell is exactly ``"1"``.
    * Any other file yields ``{"headers": <raw header line>, "lines": [...]}``
      where each entry is ``{"file_name": str, "line": <raw line>}``. The raw
      text is kept verbatim so callers can reconstruct the CSV.

    An empty file yields ``{"headers": "", "lines": []}``.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A dict in one of the shapes described above.
    """

    def _fields(line):
        # Parse one physical line on its own so that every raw line stays paired
        # with its own parsed row. Feeding all lines to a single csv.reader would
        # merge lines when a quote is left open, shifting every later pairing.
        return next(csv.reader([line]), [])

    # Open in text mode so line endings are normalized to \n (matches legacy behaviour).
    # Annotation filenames are assumed never to contain embedded newlines, so each
    # physical line is one record.
    with open(filename) as f:
        raw_lines = f.readlines()

    if not raw_lines:
        return {"headers": "", "lines": []}

    header_line = raw_lines[0]
    headers = [h.strip() for h in _fields(header_line)]

    # Multi-label classification csv typically named _classes.csv
    if os.path.basename(filename) == "_classes.csv":
        parsed_lines = []
        for raw_line in raw_lines[1:]:
            parts = [p.strip() for p in _fields(raw_line)]
            if not parts:
                # Blank line: nothing to record.
                continue
            # Cells beyond the last header have no label name; ignore them instead
            # of failing with an IndexError on a ragged row.
            labels = [headers[i] for i, v in enumerate(parts[1:], start=1) if v == "1" and i < len(headers)]
            parsed_lines.append({"file_name": parts[0], "labels": labels})
        return {"type": "multilabel_csv", "rows": parsed_lines, "headers": headers}

    # For regular CSV, preserve raw lines so callers can reconstruct verbatim CSV text,
    # but use the csv module to extract file_name (handles quoted commas).
    lines = []
    for raw_line in raw_lines[1:]:
        row = _fields(raw_line)
        lines.append({"file_name": row[0].strip() if row else "", "line": raw_line})
    return {"headers": header_line, "lines": lines}


def _guessAnnotationFileFormat(parsed, extension):
Expand Down
29 changes: 29 additions & 0 deletions tests/util/test_folderparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,35 @@ def test_coco_root_annotation_matches_images_in_subdirs(self):
self.assertEqual(len(ann_data["annotations"]), 1, "Should have one annotation")
self.assertEqual(ann_data["annotations"][0]["bbox"], [10, 20, 100, 200])

def test_parse_csv_quoted_filename(self):
    """A quoted file name containing commas must come back intact from _parseAnnotationCSV."""
    csv_text = [
        "img_fName,class_label\n",
        '"image,with,commas.jpg",cat\n',
        "normal.jpg,dog\n",
    ]
    # delete=False keeps the file on disk after the handle closes so the parser can
    # reopen it by path; it is removed explicitly in the finally clause below.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False, newline="") as handle:
        for chunk in csv_text:
            handle.write(chunk)
        csv_path = handle.name
    try:
        result = folderparser._parseAnnotationCSV(csv_path)
        file_names = [entry["file_name"] for entry in result["lines"]]
        self.assertEqual(file_names[0], "image,with,commas.jpg")
        self.assertEqual(file_names[1], "normal.jpg")
    finally:
        os.unlink(csv_path)

def test_parse_multilabel_csv_quoted_filename(self):
    """Quoted file names in a _classes.csv table must map to the right labels."""
    table = [
        "filename,cat,dog\n",
        '"image,with,commas.jpg",1,0\n',
        "normal.jpg,0,1\n",
    ]
    with tempfile.TemporaryDirectory() as workdir:
        # The parser selects the multi-label branch by basename, so the file
        # has to be called exactly _classes.csv.
        csv_path = os.path.join(workdir, "_classes.csv")
        with open(csv_path, "w") as handle:
            for chunk in table:
                handle.write(chunk)
        result = folderparser._parseAnnotationCSV(csv_path)
        self.assertEqual(result["type"], "multilabel_csv")
        labels_by_file = dict((entry["file_name"], entry["labels"]) for entry in result["rows"])
        self.assertEqual(labels_by_file["image,with,commas.jpg"], ["cat"])
        self.assertEqual(labels_by_file["normal.jpg"], ["dog"])


def _assertJsonMatchesFile(actual, filename):
with open(filename) as file:
Expand Down