From aa5a7159cc19eba35f2cd017eac86295c9d114de Mon Sep 17 00:00:00 2001
From: Rohith Ramanathan <rohith@Rohiths-MacBook-Pro.local>
Date: Sat, 20 Jun 2026 00:41:11 -0400
Subject: [PATCH 1/6] Add MemoryManager backend to the LongMemEval runner
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New --backend memorymanager path that benchmarks the custom MemoryManager
system instead of mem0, holding chunks/answerer/judge identical (memory
system is the only variable). Integrates directly at the runner's call
sites — no Mem0Client replacement, no search() seam.

- benchmarks/common/mm_bridge.py: per-question Agent (isolated in-memory
  LongTermStore), OpenRouter gpt-4o/4o-mini backend + shared embedder
  singletons; mm_ingest / mm_surface_and_format (PREP) / mm_persist (PERSIST).
- run.py: --backend memorymanager + --mm-* flags; ingest_question_mm;
  process_question_answerer surfaces via the agent's PREP and persists via
  PERSIST, answer still generated by the SAME harness answerer; single
  managed-window cutoff; mem0 paths untouched.

Validated end-to-end through the bridge with real gpt-4o (construction,
ingest, PREP, PERSIST). Embedder parity (text-embedding-3-small) is a
follow-up needing a small MemoryManager-side change.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 benchmarks/common/mm_bridge.py | 136 +++++++++++++++++++++++++++++++++
 benchmarks/longmemeval/run.py  | 133 +++++++++++++++++++++++++++++---
 2 files changed, 258 insertions(+), 11 deletions(-)
 create mode 100644 benchmarks/common/mm_bridge.py

diff --git a/benchmarks/common/mm_bridge.py b/benchmarks/common/mm_bridge.py
new file mode 100644
index 0000000..2b33d8b
--- /dev/null
+++ b/benchmarks/common/mm_bridge.py
@@ -0,0 +1,136 @@
+"""Bridge to the custom MemoryManager (MM) system for the LongMemEval runner.
+
+MM lives in a separate repo (default ~/MemoryManager, override with
+MEMORYMANAGER_PATH). Unlike mem0's stateless extract→search pipeline, MM is a
+stateful managed-context system: we ingest the conversation turn-by-turn, then
+at answer time surface relevant memory into its context (PREP), let the SAME
+harness answerer generate the answer from those surfaced blocks, and persist the
+exchange (PERSIST). Only the surfaced memory differs from the mem0 path — the
+answerer prompt + model and the judge stay identical, so the memory system is
+the sole variable.
+
+Construction mirrors MM's own eval/agent_server.py wiring. The LLM backend
+(OpenRouter, gpt-4o family) and the embedder are process-wide singletons built
+once and shared across per-question agents (both are safe for concurrent
+inference); each question gets its own Agent with an isolated in-memory
+LongTermStore.
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+import threading
+
+_MM_PATH = os.environ.get("MEMORYMANAGER_PATH", os.path.expanduser("~/MemoryManager"))
+if _MM_PATH not in sys.path:
+    sys.path.insert(0, _MM_PATH)
+
+# MM imports are resolved lazily inside _ensure()/make_mm_agent so that importing
+# this module (e.g. for a syntax/import smoke test) does not require API keys or
+# pull in heavy deps until an agent is actually built.
+
+_LOCK = threading.Lock()
+_BACKEND = None
+_EMBEDDER = None
+
+
+def _ensure(embedding_model: str):
+    """Build (once) and return the shared (backend, embedder) singletons."""
+    global _BACKEND, _EMBEDDER
+    with _LOCK:
+        if _BACKEND is None:
+            from llm import make_llm_backend  # MM's llm package
+
+            # OpenRouter chat-completions backend (gpt-4o / gpt-4o-mini). Reads
+            # OPENROUTER_API_KEY from env / MM's repo-root .env.
+            _BACKEND = make_llm_backend(backend="openrouter")
+        if _EMBEDDER is None:
+            from sentence_transformers import SentenceTransformer
+
+            _EMBEDDER = SentenceTransformer(embedding_model)
+    return _BACKEND, _EMBEDDER
+
+
+def make_mm_agent(
+    max_tokens: int,
+    model: str = "openai/gpt-4o",
+    util_model: str = "openai/gpt-4o-mini",
+    embedding_model: str = "all-MiniLM-L6-v2",
+    novelty_mode=None,
+    mode: str = "llm",
+):
+    """Build a fresh, isolated MemoryManager Agent for one benchmark question."""
+    from agent import Agent, MemoryMode
+    from controller import MemoryController
+    from ContextManager import ContextManager
+    from functions.llm_fns import make_compress_fn, make_merge_fn
+    from memory.longterm import LongTermStore
+    from memory.novelty import NoveltyMode
+    from memory.store import ContextStore
+
+    backend, embedder = _ensure(embedding_model)
+
+    store = ContextStore(max_tokens=max_tokens)
+    lt = LongTermStore("sqlite:///:memory:")  # isolated per agent; StaticPool = thread-safe
+    cm = ContextManager(store, lt, embedding_model=embedder)
+    controller = MemoryController(
+        cm,
+        compress_fn=make_compress_fn(backend, util_model),
+        merge_fn=make_merge_fn(backend, cm, util_model),
+    )
+    mm_mode = MemoryMode.LLM if str(mode).lower() == "llm" else MemoryMode.ALGORITHMIC
+    return Agent(
+        controller,
+        backend,
+        model=model,
+        mode=mm_mode,
+        novelty_mode=(novelty_mode or NoveltyMode.EMBEDDING),
+        novelty_model=util_model,
+    )
+
+
+def _join_chunk(chunk) -> str:
+    """Render one ingestion chunk (a user+assistant pair) as plain text."""
+    if isinstance(chunk, str):
+        return chunk
+    parts = []
+    for msg in chunk:
+        if isinstance(msg, dict):
+            parts.append(f"{msg.get('role', '')}: {msg.get('content', '')}")
+        else:
+            parts.append(str(msg))
+    return "\n".join(parts)
+
+
+def mm_ingest(agent, pairs) -> None:
+    """Ingest each conversation chunk into MM's memory lifecycle (no LLM reply)."""
+    for chunk in pairs:
+        text = _join_chunk(chunk)
+        if text.strip():
+            agent.ingest(text)
+
+
+def mm_surface_and_format(agent, question_text: str) -> list[dict]:
+    """PREP: surface relevant memory into context, then return the in-context
+    blocks shaped like the harness's search results ({memory, score, created_at})
+    so they feed the unchanged get_answer_generation_prompt.
+
+    created_at is None for now — threading session dates onto blocks is a tracked
+    follow-up in MemoryManager (disadvantages MM on temporal-reasoning questions).
+    """
+    agent._llm_prep_phase(question_text)
+    blocks = agent._controller._cm._store.all_blocks()
+    return [
+        {
+            "memory": b.content,
+            "score": float(getattr(b, "novelty_score", 0.0)),
+            "created_at": None,
+        }
+        for b in blocks
+    ]
+
+
+def mm_persist(agent, question_text: str, answer: str) -> None:
+    """PERSIST: store the exchange and re-score novelty."""
+    agent._llm_persist_phase(question_text, answer)
diff --git a/benchmarks/longmemeval/run.py b/benchmarks/longmemeval/run.py
index 396788d..8a1976f 100644
--- a/benchmarks/longmemeval/run.py
+++ b/benchmarks/longmemeval/run.py
@@ -35,6 +35,7 @@
 
 import argparse
 import asyncio
+import contextlib
 import json
 import os
 import random
@@ -501,6 +502,38 @@ async def ingest_question(
     return total_failed == 0, user_id, total_processed
 
 
+async def ingest_question_mm(question, args, run_id, logger, shutdown):
+    """Ingest a question's haystack into a fresh MemoryManager agent.
+
+    Mirrors ingest_question's chunking (same chronological sessions, same
+    pair_turns, same empty-content skipping) but feeds the pairs to MM's
+    turn-by-turn ingest lifecycle instead of mem0.add. MM agents are ephemeral
+    in-memory, so there is no checkpoint/resume — we re-ingest each run.
+
+    Returns: (success, agent, total_pairs)
+    """
+    from benchmarks.common import mm_bridge
+
+    question_id = question["question_id"]
+    sorted_sessions = sort_sessions_chronologically(question)
+
+    all_pairs: list = []
+    for _session_id, _date_str, session in sorted_sessions:
+        if not session:
+            continue
+        for messages in pair_turns(session):
+            if any(not msg.get("content", "").strip() for msg in messages):
+                continue
+            all_pairs.append(messages)
+
+    agent = mm_bridge.make_mm_agent(
+        args.mm_max_tokens, model=args.mm_model, util_model=args.mm_util_model,
+    )
+    await asyncio.to_thread(mm_bridge.mm_ingest, agent, all_pairs)
+    logger.info("MM ingested question %s: %d pairs", question_id, len(all_pairs))
+    return True, agent, len(all_pairs)
+
+
 # ===============================================================================
 # SEARCH + ANSWER + JUDGE
 # ===============================================================================
@@ -519,9 +552,16 @@ async def process_question_answerer(
     logger: Any,
     score_debug: bool = False,
     existing_search_results: list | None = None,
+    agent: Any = None,
 ) -> dict[str, Any]:
     """Process a question in answerer mode: search + generate answer + judge.
 
+    When ``agent`` is provided (MemoryManager backend), memory is surfaced via the
+    agent's PREP phase instead of mem0.search, the answer is generated by the SAME
+    harness answerer over those surfaced blocks, and the exchange is persisted via
+    the agent's PERSIST phase. cutoffs is a single value for MM (one managed
+    window), so the cutoff loop runs once.
+
     Returns a result dict suitable for serialization.
     """
     question_id = question["question_id"]
@@ -535,8 +575,18 @@ async def process_question_answerer(
         parse_longmemeval_date_human(question_date) if question_date else ""
     )
 
-    # --- Search ---
-    if existing_search_results is not None:
+    # --- Search / surface ---
+    if agent is not None:
+        # MemoryManager: PREP surfaces relevant memory into the managed context.
+        from benchmarks.common import mm_bridge
+
+        start = time.monotonic()
+        formatted = await asyncio.to_thread(
+            mm_bridge.mm_surface_and_format, agent, question_text,
+        )
+        search_latency = (time.monotonic() - start) * 1000
+        query_debug = None
+    elif existing_search_results is not None:
         formatted = existing_search_results
         query_debug = None
         search_latency = 0.0
@@ -624,6 +674,14 @@ async def process_question_answerer(
             "reason": f"Generated answer: {generated_answer[:500]}",
         }
 
+        if agent is not None:
+            # PERSIST the exchange + re-score (single cutoff, so runs once).
+            from benchmarks.common import mm_bridge
+
+            await asyncio.to_thread(
+                mm_bridge.mm_persist, agent, question_text, generated_answer,
+            )
+
     result["cutoff_results"] = cutoff_results
     return result
 
@@ -1033,8 +1091,9 @@ def parse_args() -> argparse.Namespace:
         help="Requests per minute for LLM",
     )
     parser.add_argument(
-        "--backend", default="oss", choices=["oss", "cloud"],
-        help="Mem0 backend: 'oss' for self-hosted server (default), 'cloud' for api.mem0.ai",
+        "--backend", default="oss", choices=["oss", "cloud", "memorymanager"],
+        help="Memory backend: 'oss'/'cloud' for Mem0, 'memorymanager' for the custom "
+             "MemoryManager system (answerer mode only).",
     )
     parser.add_argument(
         "--mem0-host", default=None,
@@ -1044,6 +1103,19 @@ def parse_args() -> argparse.Namespace:
         "--mem0-api-key", default=None,
         help="Mem0 API key (cloud mode only)",
     )
+    # MemoryManager backend options (ignored for Mem0 backends).
+    parser.add_argument(
+        "--mm-max-tokens", type=int, default=8000,
+        help="MemoryManager context-window token budget (memorymanager backend).",
+    )
+    parser.add_argument(
+        "--mm-model", default="openai/gpt-4o",
+        help="MemoryManager main model (memorymanager backend).",
+    )
+    parser.add_argument(
+        "--mm-util-model", default="openai/gpt-4o-mini",
+        help="MemoryManager util model for compress/merge/novelty (memorymanager backend).",
+    )
     return parser.parse_args()
 
 
@@ -1231,12 +1303,22 @@ async def judge_one(question: dict) -> None:
         return
 
     backend = os.getenv("MEM0_BACKEND", args.backend)
-    mem0 = Mem0Client(
-        mode=backend,
-        host=args.mem0_host,
-        api_key=args.mem0_api_key if backend == "cloud" else None,
-        rpm=args.rpm,
-    )
+    mm_mode = backend == "memorymanager"
+    if mm_mode:
+        if args.mode == "retrieval":
+            raise SystemExit("memorymanager backend supports --mode answerer only.")
+        # MM yields one managed context window, not top-k cutoffs. Collapse to a
+        # single cutoff (>= window size) so the slice returns all surfaced blocks
+        # and the metrics/display path is reused unchanged.
+        cutoffs = [args.top_k]
+        mem0 = None
+    else:
+        mem0 = Mem0Client(
+            mode=backend,
+            host=args.mem0_host,
+            api_key=args.mem0_api_key if backend == "cloud" else None,
+            rpm=args.rpm,
+        )
     shutdown = GracefulShutdown()
     checkpoint = Checkpoint(output_dir)
 
@@ -1256,7 +1338,7 @@ async def judge_one(question: dict) -> None:
 
     existing_ids = {e["question_id"] for e in all_evaluations}
 
-    async with mem0:
+    async with (contextlib.nullcontext() if mem0 is None else mem0):
         with shutdown:
             results_lock = asyncio.Lock()
             question_semaphore = asyncio.Semaphore(args.max_workers)
@@ -1282,6 +1364,35 @@ async def process_single_question(question: dict):
                             pbar.update(1)
                             return
 
+                    if mm_mode:
+                        # MemoryManager path: fresh agent → ingest → PREP+answer+PERSIST.
+                        success, agent, _pairs = await ingest_question_mm(
+                            question, args, run_id, logger, shutdown,
+                        )
+                        if shutdown.requested:
+                            return
+                        result = await process_question_answerer(
+                            question=question,
+                            user_id=f"longmemeval_{question_id}_{run_id}",
+                            mem0=None,
+                            answerer=answerer,
+                            judge_llm=judge_llm,
+                            cutoffs=cutoffs,
+                            top_k=args.top_k,
+                            user_profile=None,
+                            predict_only=args.predict_only,
+                            logger=logger,
+                            score_debug=args.score_debug,
+                            agent=agent,
+                        )
+                        result_path = os.path.join(output_dir, f"{question_id}.json")
+                        save_result_json(result_path, result)
+                        async with results_lock:
+                            all_evaluations.append(result)
+                            existing_ids.add(question_id)
+                        pbar.update(1)
+                        return
+
                     # Check if we have predict-only results (search data already exists)
                     existing_predict = predict_only_results.get(question_id)
                     if existing_predict and existing_predict.get("retrieval"):

From 673c4f71f9c17ca1453e0c4150278ac6e0334a9b Mon Sep 17 00:00:00 2001
From: Rohith Ramanathan <rohith@Rohiths-MacBook-Pro.local>
Date: Mon, 22 Jun 2026 18:00:04 -0400
Subject: [PATCH 2/6] Default MemoryManager backend embedder to OpenRouter
 text-embedding-3-small

Give the MemoryManager backend embedder parity with mem0 OSS
(text-embedding-3-small), routed through OpenRouter so the whole stack uses a
single OpenRouter key (no OpenAI key needed):

- Route the bridge's embedder through MM's make_embedder instead of
  hardcoding SentenceTransformer.
- Default to "openrouter:openai/text-embedding-3-small" (1536-d, matches
  LongTermStore's default embedding_dim).
- Add --mm-embedding-model (accepts openrouter:/openai: specs or a local
  sentence-transformers name).

Verified: OpenRouter's embeddings endpoint returns a 1536-d vector for
openai/text-embedding-3-small.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 benchmarks/common/mm_bridge.py | 9 ++++++---
 benchmarks/longmemeval/run.py  | 7 +++++++
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/benchmarks/common/mm_bridge.py b/benchmarks/common/mm_bridge.py
index 2b33d8b..fbdfe1e 100644
--- a/benchmarks/common/mm_bridge.py
+++ b/benchmarks/common/mm_bridge.py
@@ -46,9 +46,12 @@ def _ensure(embedding_model: str):
             # OPENROUTER_API_KEY from env / MM's repo-root .env.
             _BACKEND = make_llm_backend(backend="openrouter")
         if _EMBEDDER is None:
-            from sentence_transformers import SentenceTransformer
+            from memory.embeddings import make_embedder  # MM's embedder factory
 
-            _EMBEDDER = SentenceTransformer(embedding_model)
+            # Resolves a spec to the right backend: "text-embedding-3-small" /
+            # "openai:<m>" (OpenAI direct), "openrouter:<m>" (via OpenRouter), or
+            # a sentence-transformers name (local). Matches mem0 OSS's embedder.
+            _EMBEDDER = make_embedder(embedding_model)
     return _BACKEND, _EMBEDDER
 
 
@@ -56,7 +59,7 @@ def make_mm_agent(
     max_tokens: int,
     model: str = "openai/gpt-4o",
     util_model: str = "openai/gpt-4o-mini",
-    embedding_model: str = "all-MiniLM-L6-v2",
+    embedding_model: str = "openrouter:openai/text-embedding-3-small",
     novelty_mode=None,
     mode: str = "llm",
 ):
diff --git a/benchmarks/longmemeval/run.py b/benchmarks/longmemeval/run.py
index 8a1976f..c8ea025 100644
--- a/benchmarks/longmemeval/run.py
+++ b/benchmarks/longmemeval/run.py
@@ -528,6 +528,7 @@ async def ingest_question_mm(question, args, run_id, logger, shutdown):
 
     agent = mm_bridge.make_mm_agent(
         args.mm_max_tokens, model=args.mm_model, util_model=args.mm_util_model,
+        embedding_model=args.mm_embedding_model,
     )
     await asyncio.to_thread(mm_bridge.mm_ingest, agent, all_pairs)
     logger.info("MM ingested question %s: %d pairs", question_id, len(all_pairs))
@@ -1116,6 +1117,12 @@ def parse_args() -> argparse.Namespace:
         "--mm-util-model", default="openai/gpt-4o-mini",
         help="MemoryManager util model for compress/merge/novelty (memorymanager backend).",
     )
+    parser.add_argument(
+        "--mm-embedding-model", default="openrouter:openai/text-embedding-3-small",
+        help="MemoryManager embedder (memorymanager backend). Default routes via "
+             "OpenRouter (OPENROUTER_API_KEY). Also accepts 'openai:<m>'/'text-embedding-3-small' "
+             "(OpenAI direct) or a sentence-transformers name for a local keyless embedder.",
+    )
     return parser.parse_args()
 
 

From be7290f297ee006ab9ad26854b36a7e4c51e0296 Mon Sep 17 00:00:00 2001
From: Rohith Ramanathan <rohith@Rohiths-MacBook-Pro.local>
Date: Mon, 22 Jun 2026 18:43:30 -0400
Subject: [PATCH 3/6] Align MemoryManager bridge with native llm and
 algorithmic turn lifecycles.

In llm mode, run PREP followed by PERSIST for both haystack ingest and the
benchmark query; in algorithmic mode, use a pair of receive() calls for each.
LongMemEval then exercises the same memory paths as Agent's native turns, with
the harness answerer substituting for REPLY.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 benchmarks/common/mm_bridge.py | 124 ++++++++++++++++++++++-----------
 benchmarks/longmemeval/run.py  |  18 +++--
 2 files changed, 97 insertions(+), 45 deletions(-)

diff --git a/benchmarks/common/mm_bridge.py b/benchmarks/common/mm_bridge.py
index fbdfe1e..112ee6a 100644
--- a/benchmarks/common/mm_bridge.py
+++ b/benchmarks/common/mm_bridge.py
@@ -2,18 +2,25 @@
 
 MM lives in a separate repo (default ~/MemoryManager, override with
 MEMORYMANAGER_PATH). Unlike mem0's stateless extract→search pipeline, MM is a
-stateful managed-context system: we ingest the conversation turn-by-turn, then
-at answer time surface relevant memory into its context (PREP), let the SAME
-harness answerer generate the answer from those surfaced blocks, and persist the
-exchange (PERSIST). Only the surfaced memory differs from the mem0 path — the
-answerer prompt + model and the judge stay identical, so the memory system is
-the sole variable.
-
-Construction mirrors MM's own eval/agent_server.py wiring. The LLM backend
-(OpenRouter, gpt-4o family) and the embedder are process-wide singletons built
-once and shared across per-question agents (both are safe for concurrent
-inference); each question gets its own Agent with an isolated in-memory
-LongTermStore.
+stateful managed-context system. Each benchmark question gets a fresh Agent with
+an isolated in-memory LongTermStore.
+
+Two memory modes mirror Agent's native turn structures (REPLY is always the
+external harness answerer, not MM's model):
+
+  llm mode — ingest:  PREP(user) → PERSIST(user, assistant) per haystack pair
+             query:   PREP(question) → harness answer → PERSIST(question, answer)
+
+  algorithmic mode — ingest:  receive(user) → receive(assistant) per pair
+                     query:   receive(question) → harness answer → receive(answer)
+
+Only the surfaced memory differs from the mem0 path — the answerer prompt,
+model, and judge stay identical, so the memory system is the sole variable.
+
+LLM-mode ingest is expensive (~2 tool-loop calls per haystack pair).
+
+The LLM backend (OpenRouter, gpt-4o family) and embedder are process-wide
+singletons built once and shared across per-question agents.
 """
 
 from __future__ import annotations
@@ -93,36 +100,35 @@ def make_mm_agent(
     )
 
 
-def _join_chunk(chunk) -> str:
-    """Render one ingestion chunk (a user+assistant pair) as plain text."""
+def _split_pair(chunk) -> tuple[str, str]:
+    """Extract user and assistant content from one haystack pair."""
     if isinstance(chunk, str):
-        return chunk
-    parts = []
+        return chunk, ""
+    user = assistant = ""
     for msg in chunk:
-        if isinstance(msg, dict):
-            parts.append(f"{msg.get('role', '')}: {msg.get('content', '')}")
-        else:
-            parts.append(str(msg))
-    return "\n".join(parts)
-
-
-def mm_ingest(agent, pairs) -> None:
-    """Ingest each conversation chunk into MM's memory lifecycle (no LLM reply)."""
-    for chunk in pairs:
-        text = _join_chunk(chunk)
-        if text.strip():
-            agent.ingest(text)
-
+        if not isinstance(msg, dict):
+            continue
+        role = msg.get("role", "")
+        content = msg.get("content", "")
+        if role == "user":
+            user = content
+        elif role == "assistant":
+            assistant = content
+    return user, assistant
+
+
+def _algorithmic_receive(agent, content: str) -> None:
+    """One receive() pass — mirrors the memory half of _algorithmic_turn."""
+    if not content.strip():
+        return
+    embedding = agent._controller.embed(content)
+    agent._controller.receive(
+        content, embedding, agent._novelty_fn(content, embedding)
+    )
 
-def mm_surface_and_format(agent, question_text: str) -> list[dict]:
-    """PREP: surface relevant memory into context, then return the in-context
-    blocks shaped like the harness's search results ({memory, score, created_at})
-    so they feed the unchanged get_answer_generation_prompt.
 
-    created_at is None for now — threading session dates onto blocks is a tracked
-    follow-up in MemoryManager (disadvantages MM on temporal-reasoning questions).
-    """
-    agent._llm_prep_phase(question_text)
+def _format_blocks(agent) -> list[dict]:
+    """Return in-context blocks shaped like mem0 search hits for the harness."""
     blocks = agent._controller._cm._store.all_blocks()
     return [
         {
@@ -134,6 +140,44 @@ def mm_surface_and_format(agent, question_text: str) -> list[dict]:
     ]
 
 
+def mm_ingest(agent, pairs) -> None:
+    """Ingest haystack pairs using the agent's native mode lifecycle (no REPLY)."""
+    from agent import MemoryMode
+
+    for chunk in pairs:
+        user, assistant = _split_pair(chunk)
+        if not user.strip() and not assistant.strip():
+            continue
+        if agent._mode == MemoryMode.LLM:
+            if user.strip():
+                agent._llm_prep_phase(user)
+            if user.strip() or assistant.strip():
+                agent._llm_persist_phase(user, assistant)
+        else:
+            _algorithmic_receive(agent, user)
+            _algorithmic_receive(agent, assistant)
+
+
+def mm_surface_and_format(agent, question_text: str) -> list[dict]:
+    """Surface relevant memory into context, then return blocks for the harness.
+
+    created_at is None for now — threading session dates onto blocks is a tracked
+    follow-up in MemoryManager (disadvantages MM on temporal-reasoning questions).
+    """
+    from agent import MemoryMode
+
+    if agent._mode == MemoryMode.LLM:
+        agent._llm_prep_phase(question_text)
+    else:
+        _algorithmic_receive(agent, question_text)
+    return _format_blocks(agent)
+
+
 def mm_persist(agent, question_text: str, answer: str) -> None:
-    """PERSIST: store the exchange and re-score novelty."""
-    agent._llm_persist_phase(question_text, answer)
+    """Persist the Q+A exchange after the harness answer."""
+    from agent import MemoryMode
+
+    if agent._mode == MemoryMode.LLM:
+        agent._llm_persist_phase(question_text, answer)
+    else:
+        _algorithmic_receive(agent, answer)
diff --git a/benchmarks/longmemeval/run.py b/benchmarks/longmemeval/run.py
index c8ea025..5ee9bc6 100644
--- a/benchmarks/longmemeval/run.py
+++ b/benchmarks/longmemeval/run.py
@@ -528,7 +528,7 @@ async def ingest_question_mm(question, args, run_id, logger, shutdown):
 
     agent = mm_bridge.make_mm_agent(
         args.mm_max_tokens, model=args.mm_model, util_model=args.mm_util_model,
-        embedding_model=args.mm_embedding_model,
+        embedding_model=args.mm_embedding_model, mode=args.mm_memory_mode,
     )
     await asyncio.to_thread(mm_bridge.mm_ingest, agent, all_pairs)
     logger.info("MM ingested question %s: %d pairs", question_id, len(all_pairs))
@@ -558,10 +558,11 @@ async def process_question_answerer(
     """Process a question in answerer mode: search + generate answer + judge.
 
     When ``agent`` is provided (MemoryManager backend), memory is surfaced via the
-    agent's PREP phase instead of mem0.search, the answer is generated by the SAME
-    harness answerer over those surfaced blocks, and the exchange is persisted via
-    the agent's PERSIST phase. cutoffs is a single value for MM (one managed
-    window), so the cutoff loop runs once.
+    agent's mode-native prep step (LLM PREP or algorithmic receive) instead of
+    mem0.search, the answer is generated by the SAME harness answerer over those
+    surfaced blocks, and the exchange is persisted via the agent's mode-native
+    persist step (LLM PERSIST or algorithmic receive). cutoffs is a single value
+    for MM (one managed window), so the cutoff loop runs once.
 
     Returns a result dict suitable for serialization.
     """
@@ -1123,6 +1124,12 @@ def parse_args() -> argparse.Namespace:
              "OpenRouter (OPENROUTER_API_KEY). Also accepts 'openai:<m>'/'text-embedding-3-small' "
              "(OpenAI direct) or a sentence-transformers name for a local keyless embedder.",
     )
+    parser.add_argument(
+        "--mm-memory-mode", default="llm", choices=["llm", "algorithmic"],
+        help="MemoryManager memory lifecycle (memorymanager backend). 'llm': PREP+PERSIST "
+             "ingest and query (~2 tool loops per haystack pair). 'algorithmic': dual "
+             "receive() ingest and query (no LLM memory tools).",
+    )
     return parser.parse_args()
 
 
@@ -1312,6 +1319,7 @@ async def judge_one(question: dict) -> None:
     backend = os.getenv("MEM0_BACKEND", args.backend)
     mm_mode = backend == "memorymanager"
     if mm_mode:
+        print(f"  MM memory mode: {args.mm_memory_mode}")
         if args.mode == "retrieval":
             raise SystemExit("memorymanager backend supports --mode answerer only.")
         # MM yields one managed context window, not top-k cutoffs. Collapse to a

From caef78be2af733302ec024036b149525bae29363 Mon Sep 17 00:00:00 2001
From: Rohith Ramanathan <rohith@Rohiths-MacBook-Pro.local>
Date: Tue, 23 Jun 2026 00:40:40 -0400
Subject: [PATCH 4/6] docs: add CLAUDE.md covering the MemoryManager backend
 integration

How-to for the MM-vs-mem0 LongMemEval head-to-head: the mm_bridge integration,
all-OpenRouter routing (3 points + env), reusing MM's venv, mem0 OpenRouter
config, run commands, and fairness caveats.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 CLAUDE.md | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 CLAUDE.md

diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..ef987d6
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,106 @@
+# memory-benchmarks — CLAUDE.md
+
+Harness for memory-system benchmarks (LongMemEval, LoCoMo, BEAM). Upstream it
+benchmarks **mem0**; this repo has been extended with a **MemoryManager (MM)**
+backend so the two can be compared head-to-head on LongMemEval, varying only the
+memory system. See the repo `README.md` for the original mem0 usage.
+
+## The comparison goal
+
+Fair LongMemEval head-to-head: **only the memory system differs.** Conversation
+chunks, the answerer (`gpt-4o`), the judge (`gpt-4o-mini`), and the embedder
+(`text-embedding-3-small`) are identical across both backends. mem0 OSS's defaults
+(`gpt-4o-mini` extraction + `text-embedding-3-small`) already match, and MM is
+configured to the same models.
+
+**Everything routes through OpenRouter on one key — no OpenAI key.** MM chat, MM
+embeddings, the harness answerer/judge, and the mem0 server all hit OpenRouter's
+OpenAI-compatible endpoint.
+
+## How the MM integration works
+
+MM lives in a **separate repo** (`~/MemoryManager`, override `MEMORYMANAGER_PATH`)
+and is a stateful managed-context system, not a stateless extract→search store. So
+the integration is **at the runner's call sites**, not a drop-in `Mem0Client`:
+
+- `benchmarks/common/mm_bridge.py` — builds a fresh, isolated `Agent` per question
+  (in-memory `LongTermStore`), and exposes `mm_ingest` / `mm_surface_and_format`
+  (PREP) / `mm_persist` (PERSIST). The **harness answerer** generates the answer
+  from MM's surfaced blocks (MM's own REPLY phase is bypassed — the fairness crux);
+  the **judge is unchanged**.
+- `benchmarks/longmemeval/run.py` — `--backend memorymanager` branch at the
+  ingest/answer call sites; all mem0 paths untouched. MM yields one managed-context
+  window (no top-k cutoffs), so it reports a single cutoff.
+
+MM-side dependencies (the OpenRouter `LLMBackend` and the `memory/embeddings.py`
+Embedder abstraction) are on MM's **`main`** branch — run against a `main` checkout.
+
+## All-OpenRouter routing (4 components + env)
+
+| Component | How it routes to OpenRouter |
+|---|---|
+| MM chat (gpt-4o / gpt-4o-mini) | `make_llm_backend(backend="openrouter")` (reads `OPENROUTER_API_KEY` from MM's `.env`) |
+| MM embedder | `--mm-embedding-model openrouter:openai/text-embedding-3-small` (default) → `OpenAIEmbedder(base_url=OpenRouter)` |
+| Harness answerer + judge | `LLMClient` (provider `openai`) honors `OPENAI_BASE_URL` + `OPENAI_API_KEY` env |
+| mem0 OSS server | `mem0-config.yaml` (`openai_base_url: …openrouter…`) mounted via `docker-compose.yml`; key via `OPENAI_API_KEY` |
+
+`.env` (repo root, gitignored — **create it yourself**, the key comes from MM's `.env`):
+```
+OPENAI_API_KEY=<your OpenRouter key>
+OPENAI_BASE_URL=https://openrouter.ai/api/v1
+OPENROUTER_API_KEY=<your OpenRouter key>
+```
+One-liner to create it from MM's `.env`:
+```
+K=$(grep -E '^OPENROUTER_API_KEY=' ~/MemoryManager/.env | head -1 | cut -d= -f2-) && \
+printf 'OPENAI_API_KEY=%s\nOPENAI_BASE_URL=https://openrouter.ai/api/v1\nOPENROUTER_API_KEY=%s\n' "$K" "$K" > .env
+```
+
+## Environment
+
+No `.venv` is committed here; the default `python3` may be too new for some wheels.
+**Reuse MM's venv** (Python 3.12; already has every dependency, including `aiolimiter`):
+`~/MemoryManager/.venv/bin/python`. mem0 is reached over raw HTTP, so no mem0 SDK
+is needed. Ensure `~/MemoryManager` is on `main` (has `memory/embeddings.py`).
+
+## Running
+
+```bash
+cd ~/memory-benchmarks
+PY=~/MemoryManager/.venv/bin/python
+
+# mem0 OSS server (OpenRouter-backed via mem0-config.yaml), at localhost:8888
+docker compose up -d
+
+# MemoryManager backend
+$PY -m benchmarks.longmemeval.run --backend memorymanager \
+  --answerer-model openai/gpt-4o --judge-model openai/gpt-4o-mini --provider openai \
+  --mm-max-tokens 8000 --mm-embedding-model openrouter:openai/text-embedding-3-small \
+  --per-type 1 --max-workers 2 --project-name mm_smoke
+
+# mem0 OSS backend (same flags)
+$PY -m benchmarks.longmemeval.run --backend oss \
+  --answerer-model openai/gpt-4o --judge-model openai/gpt-4o-mini --provider openai \
+  --per-type 1 --max-workers 4 --project-name mem0_smoke
+```
+
+MM fires many gpt-4o calls per question (PREP/PERSIST tool loops at query time, plus
+~2 calls per haystack pair during LLM-mode ingest) → **expensive**. Scale up deliberately:
+`--per-type 1` smoke → `--per-type 5` → `--all-questions` (500) only when intended.
+Use a lower `--max-workers` for MM than for mem0.
+
+### Key MM flags (`memorymanager` backend)
+- `--mm-max-tokens` (default 8000) — MM context-window budget; set near the token
+  size of the mem0 cutoff you compare against.
+- `--mm-model` / `--mm-util-model` — default `openai/gpt-4o` / `openai/gpt-4o-mini`.
+- `--mm-embedding-model` — default `openrouter:openai/text-embedding-3-small`; also
+  `openai:<m>`/`text-embedding-3-small` (OpenAI direct) or a sentence-transformers
+  name (local, keyless).
+
+## Fairness notes / known caveats
+- **Single managed window vs mem0 cutoffs:** MM has no top-k; it reports one cutoff.
+  Compare against the mem0 cutoff whose token footprint ≈ `--mm-max-tokens`.
+- **Per-memory dates:** MM blocks carry `created_at: None`, so the answer prompt
+  can't date-group MM memories (mem0 does) — a temporal-reasoning disadvantage.
+  Tracked follow-up in MM (thread session timestamps onto blocks).
+- **`--mode answerer` only** for the MM backend (retrieval mode is rejected).

From 607f9a2be36689775fab6a314753212842ebbd8b Mon Sep 17 00:00:00 2001
From: Rohith Ramanathan <rohith@Rohiths-MacBook-Pro.local>
Date: Tue, 23 Jun 2026 01:45:45 -0400
Subject: [PATCH 5/6] Wire per-pair source dates into the MM bridge (created_at
 parity)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Thread each haystack pair's real-world session date through ingest: run.py
builds a per-pair date list (parse_longmemeval_date -> UTC datetime) parallel to
the pairs and passes it to mm_ingest, which stamps each pair's blocks via the
scoped ContextManager.using_source_date(). _format_blocks now emits the block's
source_date as an ISO created_at, so the answerer prompt date-orders/-groups MM
memories the same way it does mem0's per-memory timestamps — closing the
temporal-reasoning gap. dates is optional (un-dated ingest still works).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 benchmarks/common/mm_bridge.py | 69 ++++++++++++++++++++++------------
 benchmarks/longmemeval/run.py  |  8 +++-
 2 files changed, 51 insertions(+), 26 deletions(-)

diff --git a/benchmarks/common/mm_bridge.py b/benchmarks/common/mm_bridge.py
index 112ee6a..d7fdf10 100644
--- a/benchmarks/common/mm_bridge.py
+++ b/benchmarks/common/mm_bridge.py
@@ -17,6 +17,10 @@
 Only the surfaced memory differs from the mem0 path — the answerer prompt,
 model, and judge stay identical, so the memory system is the sole variable.
 
+Each pair is ingested under its real-world session date (via the scoped
+ContextManager.using_source_date), so surfaced blocks carry a created_at the
+answerer can date-order — parity with mem0's per-memory timestamps.
+
 LLM-mode ingest is expensive (~2 tool-loop calls per haystack pair).
 
 The LLM backend (OpenRouter, gpt-4o family) and embedder are process-wide
@@ -128,41 +132,58 @@ def _algorithmic_receive(agent, content: str) -> None:
 
 
 def _format_blocks(agent) -> list[dict]:
-    """Return in-context blocks shaped like mem0 search hits for the harness."""
+    """Return in-context blocks shaped like mem0 search hits for the harness.
+
+    created_at is the block's source_date (the real-world date of the memory's
+    content) as an ISO string, so the answerer prompt can date-group/-order
+    memories just like it does for mem0's per-memory timestamps.
+    """
     blocks = agent._controller._cm._store.all_blocks()
-    return [
-        {
-            "memory": b.content,
-            "score": float(getattr(b, "novelty_score", 0.0)),
-            "created_at": None,
-        }
-        for b in blocks
-    ]
-
-
-def mm_ingest(agent, pairs) -> None:
-    """Ingest haystack pairs using the agent's native mode lifecycle (no REPLY)."""
+    out: list[dict] = []
+    for b in blocks:
+        sd = getattr(b, "source_date", None)
+        out.append(
+            {
+                "memory": b.content,
+                "score": float(getattr(b, "novelty_score", 0.0)),
+                "created_at": sd.isoformat() if sd else None,
+            }
+        )
+    return out
+
+
+def mm_ingest(agent, pairs, dates=None) -> None:
+    """Ingest haystack pairs using the agent's native mode lifecycle (no REPLY).
+
+    ``dates`` is an optional list parallel to ``pairs`` of datetimes (the real-world
+    session date of each pair); when given, each pair's blocks are stamped with that
+    source_date via the scoped using_source_date() so the answerer can date-order
+    them. None entries (or dates=None) leave blocks undated.
+    """
     from agent import MemoryMode
 
-    for chunk in pairs:
+    cm = agent._controller._cm
+    for i, chunk in enumerate(pairs):
         user, assistant = _split_pair(chunk)
         if not user.strip() and not assistant.strip():
             continue
-        if agent._mode == MemoryMode.LLM:
-            if user.strip():
-                agent._llm_prep_phase(user)
-            if user.strip() or assistant.strip():
-                agent._llm_persist_phase(user, assistant)
-        else:
-            _algorithmic_receive(agent, user)
-            _algorithmic_receive(agent, assistant)
+        source_date = dates[i] if dates is not None else None
+        with cm.using_source_date(source_date):
+            if agent._mode == MemoryMode.LLM:
+                if user.strip():
+                    agent._llm_prep_phase(user)
+                if user.strip() or assistant.strip():
+                    agent._llm_persist_phase(user, assistant)
+            else:
+                _algorithmic_receive(agent, user)
+                _algorithmic_receive(agent, assistant)
 
 
 def mm_surface_and_format(agent, question_text: str) -> list[dict]:
     """Surface relevant memory into context, then return blocks for the harness.
 
-    created_at is None for now — threading session dates onto blocks is a tracked
-    follow-up in MemoryManager (disadvantages MM on temporal-reasoning questions).
+    Surfaced blocks carry the source_date stamped at ingest (see mm_ingest), which
+    _format_blocks emits as created_at for date-ordering in the answerer prompt.
     """
     from agent import MemoryMode
 
diff --git a/benchmarks/longmemeval/run.py b/benchmarks/longmemeval/run.py
index 5ee9bc6..057ec29 100644
--- a/benchmarks/longmemeval/run.py
+++ b/benchmarks/longmemeval/run.py
@@ -518,19 +518,23 @@ async def ingest_question_mm(question, args, run_id, logger, shutdown):
     sorted_sessions = sort_sessions_chronologically(question)
 
     all_pairs: list = []
-    for _session_id, _date_str, session in sorted_sessions:
+    all_dates: list = []  # real-world session date per pair (parallel to all_pairs)
+    for _session_id, date_str, session in sorted_sessions:
         if not session:
             continue
+        ts = parse_longmemeval_date(date_str) if date_str else None
+        session_date = datetime.fromtimestamp(ts, tz=timezone.utc) if ts else None
         for messages in pair_turns(session):
             if any(not msg.get("content", "").strip() for msg in messages):
                 continue
             all_pairs.append(messages)
+            all_dates.append(session_date)
 
     agent = mm_bridge.make_mm_agent(
         args.mm_max_tokens, model=args.mm_model, util_model=args.mm_util_model,
         embedding_model=args.mm_embedding_model, mode=args.mm_memory_mode,
     )
-    await asyncio.to_thread(mm_bridge.mm_ingest, agent, all_pairs)
+    await asyncio.to_thread(mm_bridge.mm_ingest, agent, all_pairs, all_dates)
     logger.info("MM ingested question %s: %d pairs", question_id, len(all_pairs))
     return True, agent, len(all_pairs)
 

From 765222f9fb3b073082ddfc46423af7c374bc1340 Mon Sep 17 00:00:00 2001
From: Rohith Ramanathan <rohith@Rohiths-MacBook-Pro.local>
Date: Tue, 23 Jun 2026 03:13:35 -0400
Subject: [PATCH 6/6] Expose MemoryManager simulated decay clock via
 --mm-clock-seconds-per-turn.

Thread clock_seconds_per_turn through mm_bridge into MemoryConfig so batch LongMemEval runs can differentiate recency during fast ingest.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 benchmarks/common/mm_bridge.py | 8 ++++++--
 benchmarks/longmemeval/run.py  | 9 +++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/benchmarks/common/mm_bridge.py b/benchmarks/common/mm_bridge.py
index d7fdf10..3fbaba1 100644
--- a/benchmarks/common/mm_bridge.py
+++ b/benchmarks/common/mm_bridge.py
@@ -73,25 +73,29 @@ def make_mm_agent(
     embedding_model: str = "openrouter:openai/text-embedding-3-small",
     novelty_mode=None,
     mode: str = "llm",
+    clock_seconds_per_turn: float = 0.0,
 ):
     """Build a fresh, isolated MemoryManager Agent for one benchmark question."""
     from agent import Agent, MemoryMode
     from controller import MemoryController
     from ContextManager import ContextManager
     from functions.llm_fns import make_compress_fn, make_merge_fn
+    from memory.config import MemoryConfig
     from memory.longterm import LongTermStore
     from memory.novelty import NoveltyMode
     from memory.store import ContextStore
 
     backend, embedder = _ensure(embedding_model)
 
-    store = ContextStore(max_tokens=max_tokens)
+    cfg = MemoryConfig(clock_seconds_per_turn=clock_seconds_per_turn)
+    store = ContextStore(max_tokens=max_tokens, config=cfg)
     lt = LongTermStore("sqlite:///:memory:")  # isolated per agent; StaticPool = thread-safe
-    cm = ContextManager(store, lt, embedding_model=embedder)
+    cm = ContextManager(store, lt, embedding_model=embedder, config=cfg)
     controller = MemoryController(
         cm,
         compress_fn=make_compress_fn(backend, util_model),
         merge_fn=make_merge_fn(backend, cm, util_model),
+        config=cfg,
     )
     mm_mode = MemoryMode.LLM if str(mode).lower() == "llm" else MemoryMode.ALGORITHMIC
     return Agent(
diff --git a/benchmarks/longmemeval/run.py b/benchmarks/longmemeval/run.py
index 057ec29..63522ab 100644
--- a/benchmarks/longmemeval/run.py
+++ b/benchmarks/longmemeval/run.py
@@ -533,6 +533,7 @@ async def ingest_question_mm(question, args, run_id, logger, shutdown):
     agent = mm_bridge.make_mm_agent(
         args.mm_max_tokens, model=args.mm_model, util_model=args.mm_util_model,
         embedding_model=args.mm_embedding_model, mode=args.mm_memory_mode,
+        clock_seconds_per_turn=args.mm_clock_seconds_per_turn,
     )
     await asyncio.to_thread(mm_bridge.mm_ingest, agent, all_pairs, all_dates)
     logger.info("MM ingested question %s: %d pairs", question_id, len(all_pairs))
@@ -1134,6 +1135,13 @@ def parse_args() -> argparse.Namespace:
              "ingest and query (~2 tool loops per haystack pair). 'algorithmic': dual "
              "receive() ingest and query (no LLM memory tools).",
     )
+    parser.add_argument(
+        "--mm-clock-seconds-per-turn", type=float, default=0.0,
+        help="MemoryManager simulated decay clock (memorymanager backend). 0 => "
+             "wall-clock (default). >0 => advance logical time by this many seconds "
+             "per turn (receive() or LLM persist); 600 gives meaningful recency decay "
+             "during fast batch ingest.",
+    )
     return parser.parse_args()
 
 
@@ -1324,6 +1332,7 @@ async def judge_one(question: dict) -> None:
     mm_mode = backend == "memorymanager"
     if mm_mode:
         print(f"  MM memory mode: {args.mm_memory_mode}")
+        print(f"  MM clock seconds/turn: {args.mm_clock_seconds_per_turn}")
         if args.mode == "retrieval":
             raise SystemExit("memorymanager backend supports --mode answerer only.")
         # MM yields one managed context window, not top-k cutoffs. Collapse to a