From aa5a7159cc19eba35f2cd017eac86295c9d114de Mon Sep 17 00:00:00 2001 From: Rohith Ramanathan Date: Sat, 20 Jun 2026 00:41:11 -0400 Subject: [PATCH 1/6] Add MemoryManager backend to the LongMemEval runner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New --backend memorymanager path that benchmarks the custom MemoryManager system instead of mem0, holding chunks/answerer/judge identical (memory system is the only variable). Integrates directly at the runner's call sites — no Mem0Client replacement, no search() seam. - benchmarks/common/mm_bridge.py: per-question Agent (isolated in-memory LongTermStore), OpenRouter gpt-4o/4o-mini backend + shared embedder singletons; mm_ingest / mm_surface_and_format (PREP) / mm_persist (PERSIST). - run.py: --backend memorymanager + --mm-* flags; ingest_question_mm; process_question_answerer surfaces via the agent's PREP and persists via PERSIST, answer still generated by the SAME harness answerer; single managed-window cutoff; mem0 paths untouched. Validated end-to-end through the bridge with real gpt-4o (construction, ingest, PREP, PERSIST). Embedder parity (text-embedding-3-small) is a follow-up needing a small MemoryManager-side change. Co-Authored-By: Claude Opus 4.8 --- benchmarks/common/mm_bridge.py | 136 +++++++++++++++++++++++++++++++++ benchmarks/longmemeval/run.py | 133 +++++++++++++++++++++++++++++--- 2 files changed, 258 insertions(+), 11 deletions(-) create mode 100644 benchmarks/common/mm_bridge.py diff --git a/benchmarks/common/mm_bridge.py b/benchmarks/common/mm_bridge.py new file mode 100644 index 0000000..2b33d8b --- /dev/null +++ b/benchmarks/common/mm_bridge.py @@ -0,0 +1,136 @@ +"""Bridge to the custom MemoryManager (MM) system for the LongMemEval runner. + +MM lives in a separate repo (default ~/MemoryManager, override with +MEMORYMANAGER_PATH). Unlike mem0's stateless extract→search pipeline, MM is a +stateful managed-context system: we ingest the conversation turn-by-turn, then +at answer time surface relevant memory into its context (PREP), let the SAME +harness answerer generate the answer from those surfaced blocks, and persist the +exchange (PERSIST). Only the surfaced memory differs from the mem0 path — the +answerer prompt + model and the judge stay identical, so the memory system is +the sole variable. + +Construction mirrors MM's own eval/agent_server.py wiring. The LLM backend +(OpenRouter, gpt-4o family) and the embedder are process-wide singletons built +once and shared across per-question agents (both are safe for concurrent +inference); each question gets its own Agent with an isolated in-memory +LongTermStore. +""" + +from __future__ import annotations + +import os +import sys +import threading + +_MM_PATH = os.environ.get("MEMORYMANAGER_PATH", os.path.expanduser("~/MemoryManager")) +if _MM_PATH not in sys.path: + sys.path.insert(0, _MM_PATH) + +# MM imports are resolved lazily inside _ensure()/make_mm_agent so that importing +# this module (e.g. for a syntax/import smoke test) does not require API keys or +# pull in heavy deps until an agent is actually built. + +_LOCK = threading.Lock() +_BACKEND = None +_EMBEDDER = None + + +def _ensure(embedding_model: str): + """Build (once) and return the shared (backend, embedder) singletons.""" + global _BACKEND, _EMBEDDER + with _LOCK: + if _BACKEND is None: + from llm import make_llm_backend # MM's llm package + + # OpenRouter chat-completions backend (gpt-4o / gpt-4o-mini). Reads + # OPENROUTER_API_KEY from env / MM's repo-root .env. + _BACKEND = make_llm_backend(backend="openrouter") + if _EMBEDDER is None: + from sentence_transformers import SentenceTransformer + + _EMBEDDER = SentenceTransformer(embedding_model) + return _BACKEND, _EMBEDDER + + +def make_mm_agent( + max_tokens: int, + model: str = "openai/gpt-4o", + util_model: str = "openai/gpt-4o-mini", + embedding_model: str = "all-MiniLM-L6-v2", + novelty_mode=None, + mode: str = "llm", +): + """Build a fresh, isolated MemoryManager Agent for one benchmark question.""" + from agent import Agent, MemoryMode + from controller import MemoryController + from ContextManager import ContextManager + from functions.llm_fns import make_compress_fn, make_merge_fn + from memory.longterm import LongTermStore + from memory.novelty import NoveltyMode + from memory.store import ContextStore + + backend, embedder = _ensure(embedding_model) + + store = ContextStore(max_tokens=max_tokens) + lt = LongTermStore("sqlite:///:memory:") # isolated per agent; StaticPool = thread-safe + cm = ContextManager(store, lt, embedding_model=embedder) + controller = MemoryController( + cm, + compress_fn=make_compress_fn(backend, util_model), + merge_fn=make_merge_fn(backend, cm, util_model), + ) + mm_mode = MemoryMode.LLM if str(mode).lower() == "llm" else MemoryMode.ALGORITHMIC + return Agent( + controller, + backend, + model=model, + mode=mm_mode, + novelty_mode=(novelty_mode or NoveltyMode.EMBEDDING), + novelty_model=util_model, + ) + + +def _join_chunk(chunk) -> str: + """Render one ingestion chunk (a user+assistant pair) as plain text.""" + if isinstance(chunk, str): + return chunk + parts = [] + for msg in chunk: + if isinstance(msg, dict): + parts.append(f"{msg.get('role', '')}: {msg.get('content', '')}") + else: + parts.append(str(msg)) + return "\n".join(parts) + + +def mm_ingest(agent, pairs) -> None: + """Ingest each conversation chunk into MM's memory lifecycle (no LLM reply).""" + for chunk in pairs: + text = _join_chunk(chunk) + if text.strip(): + agent.ingest(text) + + +def mm_surface_and_format(agent, question_text: str) -> list[dict]: + """PREP: surface relevant memory into context, then return the in-context + blocks shaped like the harness's search results ({memory, score, created_at}) + so they feed the unchanged get_answer_generation_prompt. + + created_at is None for now — threading session dates onto blocks is a tracked + follow-up in MemoryManager (disadvantages MM on temporal-reasoning questions). + """ + agent._llm_prep_phase(question_text) + blocks = agent._controller._cm._store.all_blocks() + return [ + { + "memory": b.content, + "score": float(getattr(b, "novelty_score", 0.0)), + "created_at": None, + } + for b in blocks + ] + + +def mm_persist(agent, question_text: str, answer: str) -> None: + """PERSIST: store the exchange and re-score novelty.""" + agent._llm_persist_phase(question_text, answer) diff --git a/benchmarks/longmemeval/run.py b/benchmarks/longmemeval/run.py index 396788d..8a1976f 100644 --- a/benchmarks/longmemeval/run.py +++ b/benchmarks/longmemeval/run.py @@ -35,6 +35,7 @@ import argparse import asyncio +import contextlib import json import os import random @@ -501,6 +502,38 @@ async def ingest_question( return total_failed == 0, user_id, total_processed +async def ingest_question_mm(question, args, run_id, logger, shutdown): + """Ingest a question's haystack into a fresh MemoryManager agent. + + Mirrors ingest_question's chunking (same chronological sessions, same + pair_turns, same empty-content skipping) but feeds the pairs to MM's + turn-by-turn ingest lifecycle instead of mem0.add. MM agents are ephemeral + in-memory, so there is no checkpoint/resume — we re-ingest each run. + + Returns: (success, agent, total_pairs) + """ + from benchmarks.common import mm_bridge + + question_id = question["question_id"] + sorted_sessions = sort_sessions_chronologically(question) + + all_pairs: list = [] + for _session_id, _date_str, session in sorted_sessions: + if not session: + continue + for messages in pair_turns(session): + if any(not msg.get("content", "").strip() for msg in messages): + continue + all_pairs.append(messages) + + agent = mm_bridge.make_mm_agent( + args.mm_max_tokens, model=args.mm_model, util_model=args.mm_util_model, + ) + await asyncio.to_thread(mm_bridge.mm_ingest, agent, all_pairs) + logger.info("MM ingested question %s: %d pairs", question_id, len(all_pairs)) + return True, agent, len(all_pairs) + + # =============================================================================== # SEARCH + ANSWER + JUDGE # =============================================================================== @@ -519,9 +552,16 @@ async def process_question_answerer( logger: Any, score_debug: bool = False, existing_search_results: list | None = None, + agent: Any = None, ) -> dict[str, Any]: """Process a question in answerer mode: search + generate answer + judge. + When ``agent`` is provided (MemoryManager backend), memory is surfaced via the + agent's PREP phase instead of mem0.search, the answer is generated by the SAME + harness answerer over those surfaced blocks, and the exchange is persisted via + the agent's PERSIST phase. cutoffs is a single value for MM (one managed + window), so the cutoff loop runs once. + Returns a result dict suitable for serialization. """ question_id = question["question_id"] @@ -535,8 +575,18 @@ async def process_question_answerer( parse_longmemeval_date_human(question_date) if question_date else "" ) - # --- Search --- - if existing_search_results is not None: + # --- Search / surface --- + if agent is not None: + # MemoryManager: PREP surfaces relevant memory into the managed context. + from benchmarks.common import mm_bridge + + start = time.monotonic() + formatted = await asyncio.to_thread( + mm_bridge.mm_surface_and_format, agent, question_text, + ) + search_latency = (time.monotonic() - start) * 1000 + query_debug = None + elif existing_search_results is not None: formatted = existing_search_results query_debug = None search_latency = 0.0 @@ -624,6 +674,14 @@ async def process_question_answerer( "reason": f"Generated answer: {generated_answer[:500]}", } + if agent is not None: + # PERSIST the exchange + re-score (single cutoff, so runs once). + from benchmarks.common import mm_bridge + + await asyncio.to_thread( + mm_bridge.mm_persist, agent, question_text, generated_answer, + ) + result["cutoff_results"] = cutoff_results return result @@ -1033,8 +1091,9 @@ def parse_args() -> argparse.Namespace: help="Requests per minute for LLM", ) parser.add_argument( - "--backend", default="oss", choices=["oss", "cloud"], - help="Mem0 backend: 'oss' for self-hosted server (default), 'cloud' for api.mem0.ai", + "--backend", default="oss", choices=["oss", "cloud", "memorymanager"], + help="Memory backend: 'oss'/'cloud' for Mem0, 'memorymanager' for the custom " + "MemoryManager system (answerer mode only).", ) parser.add_argument( "--mem0-host", default=None, @@ -1044,6 +1103,19 @@ def parse_args() -> argparse.Namespace: "--mem0-api-key", default=None, help="Mem0 API key (cloud mode only)", ) + # MemoryManager backend options (ignored for Mem0 backends). + parser.add_argument( + "--mm-max-tokens", type=int, default=8000, + help="MemoryManager context-window token budget (memorymanager backend).", + ) + parser.add_argument( + "--mm-model", default="openai/gpt-4o", + help="MemoryManager main model (memorymanager backend).", + ) + parser.add_argument( + "--mm-util-model", default="openai/gpt-4o-mini", + help="MemoryManager util model for compress/merge/novelty (memorymanager backend).", + ) return parser.parse_args() @@ -1231,12 +1303,22 @@ async def judge_one(question: dict) -> None: return backend = os.getenv("MEM0_BACKEND", args.backend) - mem0 = Mem0Client( - mode=backend, - host=args.mem0_host, - api_key=args.mem0_api_key if backend == "cloud" else None, - rpm=args.rpm, - ) + mm_mode = backend == "memorymanager" + if mm_mode: + if args.mode == "retrieval": + raise SystemExit("memorymanager backend supports --mode answerer only.") + # MM yields one managed context window, not top-k cutoffs. Collapse to a + # single cutoff (>= window size) so the slice returns all surfaced blocks + # and the metrics/display path is reused unchanged. + cutoffs = [args.top_k] + mem0 = None + else: + mem0 = Mem0Client( + mode=backend, + host=args.mem0_host, + api_key=args.mem0_api_key if backend == "cloud" else None, + rpm=args.rpm, + ) shutdown = GracefulShutdown() checkpoint = Checkpoint(output_dir) @@ -1256,7 +1338,7 @@ async def judge_one(question: dict) -> None: existing_ids = {e["question_id"] for e in all_evaluations} - async with mem0: + async with (contextlib.nullcontext() if mem0 is None else mem0): with shutdown: results_lock = asyncio.Lock() question_semaphore = asyncio.Semaphore(args.max_workers) @@ -1282,6 +1364,35 @@ async def process_single_question(question: dict): pbar.update(1) return + if mm_mode: + # MemoryManager path: fresh agent → ingest → PREP+answer+PERSIST. + success, agent, _pairs = await ingest_question_mm( + question, args, run_id, logger, shutdown, + ) + if shutdown.requested: + return + result = await process_question_answerer( + question=question, + user_id=f"longmemeval_{question_id}_{run_id}", + mem0=None, + answerer=answerer, + judge_llm=judge_llm, + cutoffs=cutoffs, + top_k=args.top_k, + user_profile=None, + predict_only=args.predict_only, + logger=logger, + score_debug=args.score_debug, + agent=agent, + ) + result_path = os.path.join(output_dir, f"{question_id}.json") + save_result_json(result_path, result) + async with results_lock: + all_evaluations.append(result) + existing_ids.add(question_id) + pbar.update(1) + return + # Check if we have predict-only results (search data already exists) existing_predict = predict_only_results.get(question_id) if existing_predict and existing_predict.get("retrieval"): From 673c4f71f9c17ca1453e0c4150278ac6e0334a9b Mon Sep 17 00:00:00 2001 From: Rohith Ramanathan Date: Mon, 22 Jun 2026 18:00:04 -0400 Subject: [PATCH 2/6] Default MemoryManager backend embedder to OpenRouter text-embedding-3-small Embedder parity with mem0 OSS (text-embedding-3-small), routed through OpenRouter so the whole stack uses one OpenRouter key (no OpenAI key): route the bridge's embedder through MM's make_embedder (was hardcoding SentenceTransformer), default to "openrouter:openai/text-embedding-3-small" (1536-d, matches LongTermStore's default embedding_dim), and add --mm-embedding-model (accepts openrouter:/openai: specs or a local sentence-transformers name). Verified: OpenRouter's embeddings endpoint returns a 1536-d vector for openai/text-embedding-3-small. Co-Authored-By: Claude Opus 4.8 --- benchmarks/common/mm_bridge.py | 9 ++++++--- benchmarks/longmemeval/run.py | 7 +++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/benchmarks/common/mm_bridge.py b/benchmarks/common/mm_bridge.py index 2b33d8b..fbdfe1e 100644 --- a/benchmarks/common/mm_bridge.py +++ b/benchmarks/common/mm_bridge.py @@ -46,9 +46,12 @@ def _ensure(embedding_model: str): # OPENROUTER_API_KEY from env / MM's repo-root .env. _BACKEND = make_llm_backend(backend="openrouter") if _EMBEDDER is None: - from sentence_transformers import SentenceTransformer + from memory.embeddings import make_embedder # MM's embedder factory - _EMBEDDER = SentenceTransformer(embedding_model) + # Resolves a spec to the right backend: "text-embedding-3-small" / + # "openai:" (OpenAI direct), "openrouter:" (via OpenRouter), or + # a sentence-transformers name (local). Matches mem0 OSS's embedder. + _EMBEDDER = make_embedder(embedding_model) return _BACKEND, _EMBEDDER @@ -56,7 +59,7 @@ def make_mm_agent( max_tokens: int, model: str = "openai/gpt-4o", util_model: str = "openai/gpt-4o-mini", - embedding_model: str = "all-MiniLM-L6-v2", + embedding_model: str = "openrouter:openai/text-embedding-3-small", novelty_mode=None, mode: str = "llm", ): diff --git a/benchmarks/longmemeval/run.py b/benchmarks/longmemeval/run.py index 8a1976f..c8ea025 100644 --- a/benchmarks/longmemeval/run.py +++ b/benchmarks/longmemeval/run.py @@ -528,6 +528,7 @@ async def ingest_question_mm(question, args, run_id, logger, shutdown): agent = mm_bridge.make_mm_agent( args.mm_max_tokens, model=args.mm_model, util_model=args.mm_util_model, + embedding_model=args.mm_embedding_model, ) await asyncio.to_thread(mm_bridge.mm_ingest, agent, all_pairs) logger.info("MM ingested question %s: %d pairs", question_id, len(all_pairs)) @@ -1116,6 +1117,12 @@ def parse_args() -> argparse.Namespace: "--mm-util-model", default="openai/gpt-4o-mini", help="MemoryManager util model for compress/merge/novelty (memorymanager backend).", ) + parser.add_argument( + "--mm-embedding-model", default="openrouter:openai/text-embedding-3-small", + help="MemoryManager embedder (memorymanager backend). Default routes via " + "OpenRouter (OPENROUTER_API_KEY). Also accepts 'openai:'/'text-embedding-3-small' " + "(OpenAI direct) or a sentence-transformers name for a local keyless embedder.", + ) return parser.parse_args() From be7290f297ee006ab9ad26854b36a7e4c51e0296 Mon Sep 17 00:00:00 2001 From: Rohith Ramanathan Date: Mon, 22 Jun 2026 18:43:30 -0400 Subject: [PATCH 3/6] Align MemoryManager bridge with native llm and algorithmic turn lifecycles. Use PREP+PERSIST ingest and query in llm mode and dual receive() in algorithmic mode so LongMemEval exercises the same memory paths as Agent, with the harness answerer substituting for REPLY. Co-authored-by: Cursor --- benchmarks/common/mm_bridge.py | 124 ++++++++++++++++++++++----------- benchmarks/longmemeval/run.py | 18 +++-- 2 files changed, 97 insertions(+), 45 deletions(-) diff --git a/benchmarks/common/mm_bridge.py b/benchmarks/common/mm_bridge.py index fbdfe1e..112ee6a 100644 --- a/benchmarks/common/mm_bridge.py +++ b/benchmarks/common/mm_bridge.py @@ -2,18 +2,25 @@ MM lives in a separate repo (default ~/MemoryManager, override with MEMORYMANAGER_PATH). Unlike mem0's stateless extract→search pipeline, MM is a -stateful managed-context system: we ingest the conversation turn-by-turn, then -at answer time surface relevant memory into its context (PREP), let the SAME -harness answerer generate the answer from those surfaced blocks, and persist the -exchange (PERSIST). Only the surfaced memory differs from the mem0 path — the -answerer prompt + model and the judge stay identical, so the memory system is -the sole variable. - -Construction mirrors MM's own eval/agent_server.py wiring. The LLM backend -(OpenRouter, gpt-4o family) and the embedder are process-wide singletons built -once and shared across per-question agents (both are safe for concurrent -inference); each question gets its own Agent with an isolated in-memory -LongTermStore. +stateful managed-context system. Each benchmark question gets a fresh Agent with +an isolated in-memory LongTermStore. + +Two memory modes mirror Agent's native turn structures (REPLY is always the +external harness answerer, not MM's model): + + llm mode — ingest: PREP(user) → PERSIST(user, assistant) per haystack pair + query: PREP(question) → harness answer → PERSIST(question, answer) + + algorithmic mode — ingest: receive(user) → receive(assistant) per pair + query: receive(question) → harness answer → receive(answer) + +Only the surfaced memory differs from the mem0 path — the answerer prompt, +model, and judge stay identical, so the memory system is the sole variable. + +LLM-mode ingest is expensive (~2 tool-loop calls per haystack pair). + +The LLM backend (OpenRouter, gpt-4o family) and embedder are process-wide +singletons built once and shared across per-question agents. """ from __future__ import annotations @@ -93,36 +100,35 @@ def make_mm_agent( ) -def _join_chunk(chunk) -> str: - """Render one ingestion chunk (a user+assistant pair) as plain text.""" +def _split_pair(chunk) -> tuple[str, str]: + """Extract user and assistant content from one haystack pair.""" if isinstance(chunk, str): - return chunk - parts = [] + return chunk, "" + user = assistant = "" for msg in chunk: - if isinstance(msg, dict): - parts.append(f"{msg.get('role', '')}: {msg.get('content', '')}") - else: - parts.append(str(msg)) - return "\n".join(parts) - - -def mm_ingest(agent, pairs) -> None: - """Ingest each conversation chunk into MM's memory lifecycle (no LLM reply).""" - for chunk in pairs: - text = _join_chunk(chunk) - if text.strip(): - agent.ingest(text) - + if not isinstance(msg, dict): + continue + role = msg.get("role", "") + content = msg.get("content", "") + if role == "user": + user = content + elif role == "assistant": + assistant = content + return user, assistant + + +def _algorithmic_receive(agent, content: str) -> None: + """One receive() pass — mirrors the memory half of _algorithmic_turn.""" + if not content.strip(): + return + embedding = agent._controller.embed(content) + agent._controller.receive( + content, embedding, agent._novelty_fn(content, embedding) + ) -def mm_surface_and_format(agent, question_text: str) -> list[dict]: - """PREP: surface relevant memory into context, then return the in-context - blocks shaped like the harness's search results ({memory, score, created_at}) - so they feed the unchanged get_answer_generation_prompt. - created_at is None for now — threading session dates onto blocks is a tracked - follow-up in MemoryManager (disadvantages MM on temporal-reasoning questions). - """ - agent._llm_prep_phase(question_text) +def _format_blocks(agent) -> list[dict]: + """Return in-context blocks shaped like mem0 search hits for the harness.""" blocks = agent._controller._cm._store.all_blocks() return [ { @@ -134,6 +140,44 @@ def mm_surface_and_format(agent, question_text: str) -> list[dict]: ] +def mm_ingest(agent, pairs) -> None: + """Ingest haystack pairs using the agent's native mode lifecycle (no REPLY).""" + from agent import MemoryMode + + for chunk in pairs: + user, assistant = _split_pair(chunk) + if not user.strip() and not assistant.strip(): + continue + if agent._mode == MemoryMode.LLM: + if user.strip(): + agent._llm_prep_phase(user) + if user.strip() or assistant.strip(): + agent._llm_persist_phase(user, assistant) + else: + _algorithmic_receive(agent, user) + _algorithmic_receive(agent, assistant) + + +def mm_surface_and_format(agent, question_text: str) -> list[dict]: + """Surface relevant memory into context, then return blocks for the harness. + + created_at is None for now — threading session dates onto blocks is a tracked + follow-up in MemoryManager (disadvantages MM on temporal-reasoning questions). + """ + from agent import MemoryMode + + if agent._mode == MemoryMode.LLM: + agent._llm_prep_phase(question_text) + else: + _algorithmic_receive(agent, question_text) + return _format_blocks(agent) + + def mm_persist(agent, question_text: str, answer: str) -> None: - """PERSIST: store the exchange and re-score novelty.""" - agent._llm_persist_phase(question_text, answer) + """Persist the Q+A exchange after the harness answer.""" + from agent import MemoryMode + + if agent._mode == MemoryMode.LLM: + agent._llm_persist_phase(question_text, answer) + else: + _algorithmic_receive(agent, answer) diff --git a/benchmarks/longmemeval/run.py b/benchmarks/longmemeval/run.py index c8ea025..5ee9bc6 100644 --- a/benchmarks/longmemeval/run.py +++ b/benchmarks/longmemeval/run.py @@ -528,7 +528,7 @@ async def ingest_question_mm(question, args, run_id, logger, shutdown): agent = mm_bridge.make_mm_agent( args.mm_max_tokens, model=args.mm_model, util_model=args.mm_util_model, - embedding_model=args.mm_embedding_model, + embedding_model=args.mm_embedding_model, mode=args.mm_memory_mode, ) await asyncio.to_thread(mm_bridge.mm_ingest, agent, all_pairs) logger.info("MM ingested question %s: %d pairs", question_id, len(all_pairs)) @@ -558,10 +558,11 @@ async def process_question_answerer( """Process a question in answerer mode: search + generate answer + judge. When ``agent`` is provided (MemoryManager backend), memory is surfaced via the - agent's PREP phase instead of mem0.search, the answer is generated by the SAME - harness answerer over those surfaced blocks, and the exchange is persisted via - the agent's PERSIST phase. cutoffs is a single value for MM (one managed - window), so the cutoff loop runs once. + agent's mode-native prep step (LLM PREP or algorithmic receive) instead of + mem0.search, the answer is generated by the SAME harness answerer over those + surfaced blocks, and the exchange is persisted via the agent's mode-native + persist step (LLM PERSIST or algorithmic receive). cutoffs is a single value + for MM (one managed window), so the cutoff loop runs once. Returns a result dict suitable for serialization. """ @@ -1123,6 +1124,12 @@ def parse_args() -> argparse.Namespace: "OpenRouter (OPENROUTER_API_KEY). Also accepts 'openai:'/'text-embedding-3-small' " "(OpenAI direct) or a sentence-transformers name for a local keyless embedder.", ) + parser.add_argument( + "--mm-memory-mode", default="llm", choices=["llm", "algorithmic"], + help="MemoryManager memory lifecycle (memorymanager backend). 'llm': PREP+PERSIST " + "ingest and query (~2 tool loops per haystack pair). 'algorithmic': dual " + "receive() ingest and query (no LLM memory tools).", + ) return parser.parse_args() @@ -1312,6 +1319,7 @@ async def judge_one(question: dict) -> None: backend = os.getenv("MEM0_BACKEND", args.backend) mm_mode = backend == "memorymanager" if mm_mode: + print(f" MM memory mode: {args.mm_memory_mode}") if args.mode == "retrieval": raise SystemExit("memorymanager backend supports --mode answerer only.") # MM yields one managed context window, not top-k cutoffs. Collapse to a From caef78be2af733302ec024036b149525bae29363 Mon Sep 17 00:00:00 2001 From: Rohith Ramanathan Date: Tue, 23 Jun 2026 00:40:40 -0400 Subject: [PATCH 4/6] docs: add CLAUDE.md covering the MemoryManager backend integration How-to for the MM-vs-mem0 LongMemEval head-to-head: the mm_bridge integration, all-OpenRouter routing (3 points + env), reusing MM's venv, mem0 OpenRouter config, run commands, and fairness caveats. Co-Authored-By: Claude Opus 4.8 --- CLAUDE.md | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 CLAUDE.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..ef987d6 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,106 @@ +# memory-benchmarks — CLAUDE.md + +Harness for memory-system benchmarks (LongMemEval, LoCoMo, BEAM). Upstream it +benchmarks **mem0**; this repo has been extended with a **MemoryManager (MM)** +backend so the two can be compared head-to-head on LongMemEval, varying only the +memory system. See the repo `README.md` for the original mem0 usage. + +## The comparison goal + +Fair LongMemEval head-to-head: **only the memory system differs.** Conversation +chunks, the answerer (`gpt-4o`), the judge (`gpt-4o-mini`), and the embedder +(`text-embedding-3-small`) are identical across both backends. mem0 OSS's defaults +(`gpt-4o-mini` extraction + `text-embedding-3-small`) already match, and MM is +configured to the same models. + +**Everything routes through OpenRouter on one key — no OpenAI key.** MM chat, MM +embeddings, the harness answerer/judge, and the mem0 server all hit OpenRouter's +OpenAI-compatible endpoint. + +## How the MM integration works + +MM lives in a **separate repo** (`~/MemoryManager`, override `MEMORYMANAGER_PATH`) +and is a stateful managed-context system, not a stateless extract→search store. So +the integration is **at the runner's call sites**, not a drop-in `Mem0Client`: + +- `benchmarks/common/mm_bridge.py` — builds a fresh, isolated `Agent` per question + (in-memory `LongTermStore`), and exposes `mm_ingest` / `mm_surface_and_format` + (PREP) / `mm_persist` (PERSIST). The **harness answerer** generates the answer + from MM's surfaced blocks (MM's own REPLY phase is bypassed — the fairness crux); + the **judge is unchanged**. +- `benchmarks/longmemeval/run.py` — `--backend memorymanager` branch at the + ingest/answer call sites; all mem0 paths untouched. MM yields one managed-context + window (no top-k cutoffs), so it reports a single cutoff. + +MM-side dependencies (the OpenRouter `LLMBackend` and the `memory/embeddings.py` +Embedder abstraction) are on MM's **`main`** branch — run against a `main` checkout. + +## All-OpenRouter routing (3 points + env) + +| Component | How it routes to OpenRouter | +|---|---| +| MM chat (gpt-4o / gpt-4o-mini) | `make_llm_backend(backend="openrouter")` (reads `OPENROUTER_API_KEY` from MM's `.env`) | +| MM embedder | `--mm-embedding-model openrouter:openai/text-embedding-3-small` (default) → `OpenAIEmbedder(base_url=OpenRouter)` | +| Harness answerer + judge | `LLMClient` (provider `openai`) honors `OPENAI_BASE_URL` + `OPENAI_API_KEY` env | +| mem0 OSS server | `mem0-config.yaml` (`openai_base_url: …openrouter…`) mounted via `docker-compose.yml`; key via `OPENAI_API_KEY` | + +`.env` (repo root, gitignored — **create it yourself**, the key comes from MM's `.env`): +``` +OPENAI_API_KEY= +OPENAI_BASE_URL=https://openrouter.ai/api/v1 +OPENROUTER_API_KEY= +``` +One-liner to create it from MM's `.env`: +``` +K=$(grep -E '^OPENROUTER_API_KEY=' ~/MemoryManager/.env | head -1 | cut -d= -f2-) && \ +printf 'OPENAI_API_KEY=%s\nOPENAI_BASE_URL=https://openrouter.ai/api/v1\nOPENROUTER_API_KEY=%s\n' "$K" "$K" > .env +``` + +## Environment + +No `.venv` is committed here; the default `python3` may be too new for some wheels. +**Reuse MM's venv** (Python 3.12, already has every dep + `aiolimiter` added): +`~/MemoryManager/.venv/bin/python`. mem0 is reached over raw HTTP, so no mem0 SDK +is needed. Ensure `~/MemoryManager` is on `main` (has `memory/embeddings.py`). + +## Running + +```bash +cd ~/memory-benchmarks +PY=~/MemoryManager/.venv/bin/python + +# mem0 OSS server (OpenRouter-backed via mem0-config.yaml), at localhost:8888 +docker compose up -d + +# MemoryManager backend +$PY -m benchmarks.longmemeval.run --backend memorymanager \ + --answerer-model openai/gpt-4o --judge-model openai/gpt-4o-mini --provider openai \ + --mm-max-tokens 8000 --mm-embedding-model openrouter:openai/text-embedding-3-small \ + --per-type 1 --max-workers 2 --project-name mm_smoke + +# mem0 OSS backend (same flags) +$PY -m benchmarks.longmemeval.run --backend oss \ + --answerer-model openai/gpt-4o --judge-model openai/gpt-4o-mini --provider openai \ + --per-type 1 --max-workers 4 --project-name mem0_smoke +``` + +MM fires several gpt-4o calls per question (PREP/PERSIST tool loops, and LLM-mode +ingest is ~2 calls/haystack-pair) → **expensive**. Scale up deliberately: +`--per-type 1` smoke → `--per-type 5` → `--all-questions` (500) only when intended. +Use a lower `--max-workers` for MM than for mem0. + +### Key MM flags (`memorymanager` backend) +- `--mm-max-tokens` (default 8000) — MM context-window budget; set near the token + size of the mem0 cutoff you compare against. +- `--mm-model` / `--mm-util-model` — default `openai/gpt-4o` / `openai/gpt-4o-mini`. +- `--mm-embedding-model` — default `openrouter:openai/text-embedding-3-small`; also + `openai:`/`text-embedding-3-small` (OpenAI direct) or a sentence-transformers + name (local, keyless). + +## Fairness notes / known caveats +- **Single managed window vs mem0 cutoffs:** MM has no top-k; it reports one cutoff. + Compare against the mem0 cutoff whose token footprint ≈ `--mm-max-tokens`. +- **Per-memory dates:** MM blocks carry `created_at: None`, so the answer prompt + can't date-group MM memories (mem0 does) — a temporal-reasoning disadvantage. + Tracked follow-up in MM (thread session timestamps onto blocks). +- **`--mode answerer` only** for the MM backend (retrieval mode is rejected). From 607f9a2be36689775fab6a314753212842ebbd8b Mon Sep 17 00:00:00 2001 From: Rohith Ramanathan Date: Tue, 23 Jun 2026 01:45:45 -0400 Subject: [PATCH 5/6] Wire per-pair source dates into the MM bridge (created_at parity) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thread each haystack pair's real-world session date through ingest: run.py builds a per-pair date list (parse_longmemeval_date -> UTC datetime) parallel to the pairs and passes it to mm_ingest, which stamps each pair's blocks via the scoped ContextManager.using_source_date(). _format_blocks now emits the block's source_date as an ISO created_at, so the answerer prompt date-orders/-groups MM memories the same way it does mem0's per-memory timestamps — closing the temporal-reasoning gap. dates is optional (un-dated ingest still works). Co-Authored-By: Claude Opus 4.8 --- benchmarks/common/mm_bridge.py | 69 ++++++++++++++++++++++------------ benchmarks/longmemeval/run.py | 8 +++- 2 files changed, 51 insertions(+), 26 deletions(-) diff --git a/benchmarks/common/mm_bridge.py b/benchmarks/common/mm_bridge.py index 112ee6a..d7fdf10 100644 --- a/benchmarks/common/mm_bridge.py +++ b/benchmarks/common/mm_bridge.py @@ -17,6 +17,10 @@ Only the surfaced memory differs from the mem0 path — the answerer prompt, model, and judge stay identical, so the memory system is the sole variable. +Each pair is ingested under its real-world session date (via the scoped +ContextManager.using_source_date), so surfaced blocks carry a created_at the +answerer can date-order — parity with mem0's per-memory timestamps. + LLM-mode ingest is expensive (~2 tool-loop calls per haystack pair). The LLM backend (OpenRouter, gpt-4o family) and embedder are process-wide @@ -128,41 +132,58 @@ def _algorithmic_receive(agent, content: str) -> None: def _format_blocks(agent) -> list[dict]: - """Return in-context blocks shaped like mem0 search hits for the harness.""" + """Return in-context blocks shaped like mem0 search hits for the harness. + + created_at is the block's source_date (the real-world date of the memory's + content) as an ISO string, so the answerer prompt can date-group/-order + memories just like it does for mem0's per-memory timestamps. + """ blocks = agent._controller._cm._store.all_blocks() - return [ - { - "memory": b.content, - "score": float(getattr(b, "novelty_score", 0.0)), - "created_at": None, - } - for b in blocks - ] - - -def mm_ingest(agent, pairs) -> None: - """Ingest haystack pairs using the agent's native mode lifecycle (no REPLY).""" + out: list[dict] = [] + for b in blocks: + sd = getattr(b, "source_date", None) + out.append( + { + "memory": b.content, + "score": float(getattr(b, "novelty_score", 0.0)), + "created_at": sd.isoformat() if sd else None, + } + ) + return out + + +def mm_ingest(agent, pairs, dates=None) -> None: + """Ingest haystack pairs using the agent's native mode lifecycle (no REPLY). + + ``dates`` is an optional list parallel to ``pairs`` of datetimes (the real-world + session date of each pair); when given, each pair's blocks are stamped with that + source_date via the scoped using_source_date() so the answerer can date-order + them. None entries (or dates=None) leave blocks undated. + """ from agent import MemoryMode - for chunk in pairs: + cm = agent._controller._cm + for i, chunk in enumerate(pairs): user, assistant = _split_pair(chunk) if not user.strip() and not assistant.strip(): continue - if agent._mode == MemoryMode.LLM: - if user.strip(): - agent._llm_prep_phase(user) - if user.strip() or assistant.strip(): - agent._llm_persist_phase(user, assistant) - else: - _algorithmic_receive(agent, user) - _algorithmic_receive(agent, assistant) + source_date = dates[i] if dates is not None else None + with cm.using_source_date(source_date): + if agent._mode == MemoryMode.LLM: + if user.strip(): + agent._llm_prep_phase(user) + if user.strip() or assistant.strip(): + agent._llm_persist_phase(user, assistant) + else: + _algorithmic_receive(agent, user) + _algorithmic_receive(agent, assistant) def mm_surface_and_format(agent, question_text: str) -> list[dict]: """Surface relevant memory into context, then return blocks for the harness. - created_at is None for now — threading session dates onto blocks is a tracked - follow-up in MemoryManager (disadvantages MM on temporal-reasoning questions). + Surfaced blocks carry the source_date stamped at ingest (see mm_ingest), which + _format_blocks emits as created_at for date-ordering in the answerer prompt. """ from agent import MemoryMode diff --git a/benchmarks/longmemeval/run.py b/benchmarks/longmemeval/run.py index 5ee9bc6..057ec29 100644 --- a/benchmarks/longmemeval/run.py +++ b/benchmarks/longmemeval/run.py @@ -518,19 +518,23 @@ async def ingest_question_mm(question, args, run_id, logger, shutdown): sorted_sessions = sort_sessions_chronologically(question) all_pairs: list = [] - for _session_id, _date_str, session in sorted_sessions: + all_dates: list = [] # real-world session date per pair (parallel to all_pairs) + for _session_id, date_str, session in sorted_sessions: if not session: continue + ts = parse_longmemeval_date(date_str) if date_str else None + session_date = datetime.fromtimestamp(ts, tz=timezone.utc) if ts else None for messages in pair_turns(session): if any(not msg.get("content", "").strip() for msg in messages): continue all_pairs.append(messages) + all_dates.append(session_date) agent = mm_bridge.make_mm_agent( args.mm_max_tokens, model=args.mm_model, util_model=args.mm_util_model, embedding_model=args.mm_embedding_model, mode=args.mm_memory_mode, ) - await asyncio.to_thread(mm_bridge.mm_ingest, agent, all_pairs) + await asyncio.to_thread(mm_bridge.mm_ingest, agent, all_pairs, all_dates) logger.info("MM ingested question %s: %d pairs", question_id, len(all_pairs)) return True, agent, len(all_pairs) From 765222f9fb3b073082ddfc46423af7c374bc1340 Mon Sep 17 00:00:00 2001 From: Rohith Ramanathan Date: Tue, 23 Jun 2026 03:13:35 -0400 Subject: [PATCH 6/6] Expose MemoryManager simulated decay clock via --mm-clock-seconds-per-turn. Thread clock_seconds_per_turn through mm_bridge into MemoryConfig so batch LongMemEval runs can differentiate recency during fast ingest. Co-authored-by: Cursor --- benchmarks/common/mm_bridge.py | 8 ++++++-- benchmarks/longmemeval/run.py | 9 +++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/benchmarks/common/mm_bridge.py b/benchmarks/common/mm_bridge.py index d7fdf10..3fbaba1 100644 --- a/benchmarks/common/mm_bridge.py +++ b/benchmarks/common/mm_bridge.py @@ -73,25 +73,29 @@ def make_mm_agent( embedding_model: str = "openrouter:openai/text-embedding-3-small", novelty_mode=None, mode: str = "llm", + clock_seconds_per_turn: float = 0.0, ): """Build a fresh, isolated MemoryManager Agent for one benchmark question.""" from agent import Agent, MemoryMode from controller import MemoryController from ContextManager import ContextManager from functions.llm_fns import make_compress_fn, make_merge_fn + from memory.config import MemoryConfig from memory.longterm import LongTermStore from memory.novelty import NoveltyMode from memory.store import ContextStore backend, embedder = _ensure(embedding_model) - store = ContextStore(max_tokens=max_tokens) + cfg = MemoryConfig(clock_seconds_per_turn=clock_seconds_per_turn) + store = ContextStore(max_tokens=max_tokens, config=cfg) lt = LongTermStore("sqlite:///:memory:") # isolated per agent; StaticPool = thread-safe - cm = ContextManager(store, lt, embedding_model=embedder) + cm = ContextManager(store, lt, embedding_model=embedder, config=cfg) controller = MemoryController( cm, compress_fn=make_compress_fn(backend, util_model), merge_fn=make_merge_fn(backend, cm, util_model), + config=cfg, ) mm_mode = MemoryMode.LLM if str(mode).lower() == "llm" else MemoryMode.ALGORITHMIC return Agent( diff --git a/benchmarks/longmemeval/run.py b/benchmarks/longmemeval/run.py index 057ec29..63522ab 100644 --- a/benchmarks/longmemeval/run.py +++ b/benchmarks/longmemeval/run.py @@ -533,6 +533,7 @@ async def ingest_question_mm(question, args, run_id, logger, shutdown): agent = mm_bridge.make_mm_agent( args.mm_max_tokens, model=args.mm_model, util_model=args.mm_util_model, embedding_model=args.mm_embedding_model, mode=args.mm_memory_mode, + clock_seconds_per_turn=args.mm_clock_seconds_per_turn, ) await asyncio.to_thread(mm_bridge.mm_ingest, agent, all_pairs, all_dates) logger.info("MM ingested question %s: %d pairs", question_id, len(all_pairs)) @@ -1134,6 +1135,13 @@ def parse_args() -> argparse.Namespace: "ingest and query (~2 tool loops per haystack pair). 'algorithmic': dual " "receive() ingest and query (no LLM memory tools).", ) + parser.add_argument( + "--mm-clock-seconds-per-turn", type=float, default=0.0, + help="MemoryManager simulated decay clock (memorymanager backend). 0 => " + "wall-clock (default). >0 => advance logical time by this many seconds " + "per turn (receive() or LLM persist); 600 gives meaningful recency decay " + "during fast batch ingest.", + ) return parser.parse_args() @@ -1324,6 +1332,7 @@ async def judge_one(question: dict) -> None: mm_mode = backend == "memorymanager" if mm_mode: print(f" MM memory mode: {args.mm_memory_mode}") + print(f" MM clock seconds/turn: {args.mm_clock_seconds_per_turn}") if args.mode == "retrieval": raise SystemExit("memorymanager backend supports --mode answerer only.") # MM yields one managed context window, not top-k cutoffs. Collapse to a