diff --git a/openml/_api/resources/base/resources.py b/openml/_api/resources/base/resources.py
index ede0e1034..db91ad529 100644
--- a/openml/_api/resources/base/resources.py
+++ b/openml/_api/resources/base/resources.py
@@ -1,9 +1,15 @@
 from __future__ import annotations
 
+from abc import abstractmethod
+from typing import TYPE_CHECKING, Any
+
 from openml.enums import ResourceType
 
 from .base import ResourceAPI
 
+if TYPE_CHECKING:
+    from openml.evaluations import OpenMLEvaluation
+
 
 class DatasetAPI(ResourceAPI):
     """Abstract API interface for dataset resources."""
@@ -34,6 +40,23 @@ class EvaluationAPI(ResourceAPI):
 
     resource_type: ResourceType = ResourceType.EVALUATION
 
+    @abstractmethod
+    def list(  # noqa: PLR0913
+        self,
+        limit: int,  # maximum number of evaluations to return
+        offset: int,  # number of evaluations to skip before the first result
+        *,
+        function: str,  # evaluation measure name, e.g. "predictive_accuracy"
+        tasks: list | None = None,  # optional task ID filter
+        setups: list | None = None,  # optional setup ID filter
+        flows: list | None = None,  # optional flow ID filter
+        runs: list | None = None,  # optional run ID filter
+        uploaders: list | None = None,  # optional uploader ID filter
+        study: int | None = None,  # optional study ID filter
+        sort_order: str | None = None,  # "asc" or "desc"
+        **kwargs: Any,  # extra single-valued filter operators
+    ) -> list[OpenMLEvaluation]: ...  # implemented by the versioned API classes
+
 
 class FlowAPI(ResourceAPI):
     """Abstract API interface for flow resources."""
diff --git a/openml/_api/resources/evaluation.py b/openml/_api/resources/evaluation.py
index fe7e360a6..8a8ecf5d0 100644
--- a/openml/_api/resources/evaluation.py
+++ b/openml/_api/resources/evaluation.py
@@ -1,11 +1,279 @@
 from __future__ import annotations
 
+import builtins
+import json
+from typing import Any
+
+import xmltodict
+
+from openml.evaluations import OpenMLEvaluation
+
 from .base import EvaluationAPI, ResourceV1API, ResourceV2API
 
 
 class EvaluationV1API(ResourceV1API, EvaluationAPI):
-    """Version 1 API implementation for evaluation resources."""
+    """V1 API implementation for evaluations.
+    Fetches evaluations from the v1 XML API endpoint.
+    """
+
+    def list(  # noqa: PLR0913
+        self,
+        limit: int,
+        offset: int,
+        *,
+        function: str,
+        tasks: builtins.list | None = None,
+        setups: builtins.list | None = None,
+        flows: builtins.list | None = None,
+        runs: builtins.list | None = None,
+        uploaders: builtins.list | None = None,
+        study: int | None = None,
+        sort_order: str | None = None,
+        **kwargs: Any,
+    ) -> builtins.list[OpenMLEvaluation]:
+        """Retrieve evaluations from the OpenML v1 XML API.
+
+        Builds the query path with ``_build_url``, fetches it through the HTTP
+        client, and hands the XML body to ``_parse_list_xml``, which converts each
+        entry into an ``OpenMLEvaluation`` and resolves uploader IDs to usernames.
+
+        Parameters
+        ----------
+        limit : int
+            the number of evaluations to return
+        offset : int
+            the number of evaluations to skip, starting from the first
+        function : str
+            the evaluation function. e.g., predictive_accuracy
+        tasks : list of int or str, optional
+            the list of task IDs
+        setups : list of int or str, optional
+            the list of setup IDs
+        flows : list of int or str, optional
+            the list of flow IDs
+        runs : list of int or str, optional
+            the list of run IDs
+        uploaders : list of int or str, optional
+            the list of uploader IDs
+        study : int, optional
+            the study ID, sent as the ``study`` filter
+        sort_order : str, optional
+            order of sorting evaluations, ascending ("asc") or descending ("desc")
+        **kwargs : Any
+            additional single-valued filters appended to the query path;
+            legal filter operators: tag, per_fold
+
+        Returns
+        -------
+        list of OpenMLEvaluation
+            one object per evaluation entry in the response
+
+        Raises
+        ------
+        ValueError
+            if the response XML does not contain "oml:evaluations"
+
+        Notes
+        -----
+        This method performs two API calls:
+        1. Fetches the evaluation data for the constructed query
+        2. Fetches user information for all uploaders in the evaluation data
+
+        The user information is used to map uploader IDs to usernames.
+        """
+        api_call = self._build_url(
+            limit,
+            offset,
+            function=function,
+            tasks=tasks,
+            setups=setups,
+            flows=flows,
+            runs=runs,
+            uploaders=uploaders,
+            study=study,
+            sort_order=sort_order,
+            **kwargs,
+        )
+
+        eval_response = self._http.get(api_call)  # first request: the evaluation listing
+        xml_content = eval_response.text
+
+        return self._parse_list_xml(xml_content)  # issues the second request (user lookup)
+
+    def _build_url(  # noqa: PLR0913, C901
+        self,
+        limit: int,
+        offset: int,
+        *,
+        function: str,
+        tasks: builtins.list | None = None,
+        setups: builtins.list | None = None,
+        flows: builtins.list | None = None,
+        runs: builtins.list | None = None,
+        uploaders: builtins.list | None = None,
+        study: int | None = None,
+        sort_order: str | None = None,
+        **kwargs: Any,
+    ) -> str:
+        """
+        Construct the relative v1 API path for an evaluation listing query.
+
+        The path starts with ``evaluation/list/function/<function>`` and gains one
+        ``/<filter>/<value>`` segment for every filter that is not ``None``.
+        List-valued filters are cast to ``int`` and joined with commas.
+
+        Parameters
+        ----------
+        limit : int
+            the number of evaluations to return
+        offset : int
+            the number of evaluations to skip, starting from the first
+        function : str
+            the evaluation function. e.g., predictive_accuracy
+        tasks : list of int or str, optional
+            the list of task IDs
+        setups : list of int or str, optional
+            the list of setup IDs
+        flows : list of int or str, optional
+            the list of flow IDs
+        runs : list of int or str, optional
+            the list of run IDs
+        uploaders : list of int or str, optional
+            the list of uploader IDs
+        study : int, optional
+            the study ID, sent as the ``study`` filter
+        sort_order : str, optional
+            order of sorting evaluations, ascending ("asc") or descending ("desc")
+        **kwargs : Any
+            additional single-valued filters, each emitted as ``/<name>/<value>``
+            directly after limit/offset; entries whose value is ``None`` are
+            skipped. Legal filter operators: tag, per_fold
+
+        Returns
+        -------
+        str
+            A relative API path suitable for an OpenML HTTP request.
+        """
+        api_call = f"evaluation/list/function/{function}"
+        if limit is not None:
+            api_call += f"/limit/{limit}"
+        if offset is not None:
+            api_call += f"/offset/{offset}"
+        if kwargs is not None:  # always true: **kwargs is a dict, never None
+            for operator, value in kwargs.items():
+                if value is not None:
+                    api_call += f"/{operator}/{value}"
+        if tasks is not None:  # ID lists: each entry goes through int(), then comma-joined
+            api_call += f"/task/{','.join([str(int(i)) for i in tasks])}"
+        if setups is not None:
+            api_call += f"/setup/{','.join([str(int(i)) for i in setups])}"
+        if flows is not None:
+            api_call += f"/flow/{','.join([str(int(i)) for i in flows])}"
+        if runs is not None:
+            api_call += f"/run/{','.join([str(int(i)) for i in runs])}"
+        if uploaders is not None:
+            api_call += f"/uploader/{','.join([str(int(i)) for i in uploaders])}"
+        if study is not None:
+            api_call += f"/study/{study}"
+        if sort_order is not None:
+            api_call += f"/sort_order/{sort_order}"
+
+        return api_call
+
+    def _parse_list_xml(self, xml_content: str) -> builtins.list[OpenMLEvaluation]:
+        """Parse an evaluation-list XML response into ``OpenMLEvaluation`` objects."""
+        evals_dict: dict[str, Any] = xmltodict.parse(xml_content, force_list=("oml:evaluation",))
+        # Minimal sanity check: the parsed document must have "oml:evaluations" at the top.
+        if "oml:evaluations" not in evals_dict:
+            raise ValueError(
+                f'Error in return XML, does not contain "oml:evaluations": {evals_dict!s}',
+            )
+        # NOTE(review): force_list should already make this a list; asserts are skipped under -O.
+        assert isinstance(evals_dict["oml:evaluations"]["oml:evaluation"], list), (
+            "Expected 'oml:evaluation' to be a list, but got "
+            f"{type(evals_dict['oml:evaluations']['oml:evaluation']).__name__}. "
+        )
+
+        uploader_ids = list(  # unique uploader IDs, still strings as parsed from the XML
+            {eval_["oml:uploader"] for eval_ in evals_dict["oml:evaluations"]["oml:evaluation"]},
+        )
+        user_dict = self._get_users(uploader_ids)  # second request: user ID -> username
+
+        evals = []
+        for eval_ in evals_dict["oml:evaluations"]["oml:evaluation"]:
+            run_id = int(eval_["oml:run_id"])
+            value = float(eval_["oml:value"]) if "oml:value" in eval_ else None
+            values = json.loads(eval_["oml:values"]) if eval_.get("oml:values", None) else None
+            array_data = eval_.get("oml:array_data")  # passed through unparsed
+
+            evals.append(
+                OpenMLEvaluation(
+                    run_id=run_id,
+                    task_id=int(eval_["oml:task_id"]),
+                    setup_id=int(eval_["oml:setup_id"]),
+                    flow_id=int(eval_["oml:flow_id"]),
+                    flow_name=eval_["oml:flow_name"],
+                    data_id=int(eval_["oml:data_id"]),
+                    data_name=eval_["oml:data_name"],
+                    function=eval_["oml:function"],
+                    upload_time=eval_["oml:upload_time"],
+                    uploader=int(eval_["oml:uploader"]),
+                    uploader_name=user_dict[eval_["oml:uploader"]],  # keyed by the string ID
+                    value=value,
+                    values=values,
+                    array_data=array_data,
+                )
+            )
+
+        return evals
+
+    def _get_users(self, uploader_ids: builtins.list[str]) -> dict[str, str]:
+        """Retrieve usernames for a list of OpenML user IDs.
+
+        Parameters
+        ----------
+        uploader_ids : list[str]
+            OpenML user IDs; an empty list yields an empty mapping without a request.
+
+        Returns
+        -------
+        dict
+            A mapping from user ID (str) to username (str).
+        """
+        if not uploader_ids:
+            # Avoid requesting the dangling path "user/list/user_id/".
+            return {}
+        api_users = "user/list/user_id/" + ",".join(uploader_ids)
+        xml_content_user = self._http.get(api_users).text
+        users = xmltodict.parse(xml_content_user, force_list=("oml:user",))
+        return {user["oml:id"]: user["oml:username"] for user in users["oml:users"]["oml:user"]}
 
 
 class EvaluationV2API(ResourceV2API, EvaluationAPI):
-    """Version 2 API implementation for evaluation resources."""
+    """V2 API implementation for evaluations.
+    Fetches evaluations from the v2 json API endpoint.
+    """
+
+    def list(  # noqa: PLR0913
+        self,
+        limit: int,  # noqa: ARG002
+        offset: int,  # noqa: ARG002
+        *,
+        function: str,  # noqa: ARG002
+        tasks: builtins.list | None = None,  # noqa: ARG002
+        setups: builtins.list | None = None,  # noqa: ARG002
+        flows: builtins.list | None = None,  # noqa: ARG002
+        runs: builtins.list | None = None,  # noqa: ARG002
+        uploaders: builtins.list | None = None,  # noqa: ARG002
+        study: int | None = None,  # noqa: ARG002
+        sort_order: str | None = None,  # noqa: ARG002
+        **kwargs: Any,  # noqa: ARG002
+    ) -> builtins.list[OpenMLEvaluation]:
+        """Placeholder for listing evaluations through the OpenML v2 JSON API.
+
+        Notes
+        -----
+        Not implemented yet: all arguments are ignored (they exist only to match
+        the v1 signature) and the call is forwarded to ``self._not_supported``.
+        """
+        self._not_supported(method="list")  # presumably raises OpenMLNotSupportedError - confirm
diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py
index 61c95a480..72d22a605 100644
--- a/openml/evaluations/functions.py
+++ b/openml/evaluations/functions.py
@@ -2,10 +2,9 @@
 # ruff: noqa: PLR0913
 from __future__ import annotations
 
-import json
 from functools import partial
 from itertools import chain
-from typing import Any, Literal
+from typing import TYPE_CHECKING, Literal
 from typing_extensions import overload
 
 import numpy as np
@@ -15,7 +14,9 @@
 import openml
 import openml._api_calls
 import openml.utils
-from openml.evaluations import OpenMLEvaluation
+
+if TYPE_CHECKING:
+    from openml.evaluations import OpenMLEvaluation
 
 
 @overload
@@ -120,7 +121,7 @@ def list_evaluations(
         per_fold_str = str(per_fold).lower()
 
     listing_call = partial(
-        _list_evaluations,
+        openml._backend.evaluation.list,
         function=function,
         tasks=tasks,
         setups=setups,
@@ -142,138 +143,6 @@ def list_evaluations(
     return {e.run_id: e for e in flattened}
 
 
-def _list_evaluations(  # noqa: C901
-    limit: int,
-    offset: int,
-    *,
-    function: str,
-    tasks: list | None = None,
-    setups: list | None = None,
-    flows: list | None = None,
-    runs: list | None = None,
-    uploaders: list | None = None,
-    study: int | None = None,
-    sort_order: str | None = None,
-    **kwargs: Any,
-) -> list[OpenMLEvaluation]:
-    """
-    Perform API call ``/evaluation/function{function}/{filters}``
-
-    Parameters
-    ----------
-    The arguments that are lists are separated from the single value
-    ones which are put into the kwargs.
-
-    limit : int
-        the number of evaluations to return
-    offset : int
-        the number of evaluations to skip, starting from the first
-    function : str
-        the evaluation function. e.g., predictive_accuracy
-
-    tasks : list[int,str], optional
-        the list of task IDs
-    setups: list[int,str], optional
-        the list of setup IDs
-    flows : list[int,str], optional
-        the list of flow IDs
-    runs :list[int,str], optional
-        the list of run IDs
-    uploaders : list[int,str], optional
-        the list of uploader IDs
-
-    study : int, optional
-
-    kwargs: dict, optional
-        Legal filter operators: tag, per_fold
-
-    sort_order : str, optional
-        order of sorting evaluations, ascending ("asc") or descending ("desc")
-
-    Returns
-    -------
-    list of OpenMLEvaluation objects
-    """
-    api_call = f"evaluation/list/function/{function}"
-    if limit is not None:
-        api_call += f"/limit/{limit}"
-    if offset is not None:
-        api_call += f"/offset/{offset}"
-    if kwargs is not None:
-        for operator, value in kwargs.items():
-            if value is not None:
-                api_call += f"/{operator}/{value}"
-    if tasks is not None:
-        api_call += f"/task/{','.join([str(int(i)) for i in tasks])}"
-    if setups is not None:
-        api_call += f"/setup/{','.join([str(int(i)) for i in setups])}"
-    if flows is not None:
-        api_call += f"/flow/{','.join([str(int(i)) for i in flows])}"
-    if runs is not None:
-        api_call += f"/run/{','.join([str(int(i)) for i in runs])}"
-    if uploaders is not None:
-        api_call += f"/uploader/{','.join([str(int(i)) for i in uploaders])}"
-    if study is not None:
-        api_call += f"/study/{study}"
-    if sort_order is not None:
-        api_call += f"/sort_order/{sort_order}"
-
-    return __list_evaluations(api_call)
-
-
-def __list_evaluations(api_call: str) -> list[OpenMLEvaluation]:
-    """Helper function to parse API calls which are lists of runs"""
-    xml_string = openml._api_calls._perform_api_call(api_call, "get")
-    evals_dict = xmltodict.parse(xml_string, force_list=("oml:evaluation",))
-    # Minimalistic check if the XML is useful
-    if "oml:evaluations" not in evals_dict:
-        raise ValueError(
-            f'Error in return XML, does not contain "oml:evaluations": {evals_dict!s}',
-        )
-
-    assert isinstance(evals_dict["oml:evaluations"]["oml:evaluation"], list), (
-        "Expected 'oml:evaluation' to be a list, but got"
-        f"{type(evals_dict['oml:evaluations']['oml:evaluation']).__name__}. "
-    )
-
-    uploader_ids = list(
-        {eval_["oml:uploader"] for eval_ in evals_dict["oml:evaluations"]["oml:evaluation"]},
-    )
-    api_users = "user/list/user_id/" + ",".join(uploader_ids)
-    xml_string_user = openml._api_calls._perform_api_call(api_users, "get")
-
-    users = xmltodict.parse(xml_string_user, force_list=("oml:user",))
-    user_dict = {user["oml:id"]: user["oml:username"] for user in users["oml:users"]["oml:user"]}
-
-    evals = []
-    for eval_ in evals_dict["oml:evaluations"]["oml:evaluation"]:
-        run_id = int(eval_["oml:run_id"])
-        value = float(eval_["oml:value"]) if "oml:value" in eval_ else None
-        values = json.loads(eval_["oml:values"]) if eval_.get("oml:values", None) else None
-        array_data = eval_.get("oml:array_data")
-
-        evals.append(
-            OpenMLEvaluation(
-                run_id=run_id,
-                task_id=int(eval_["oml:task_id"]),
-                setup_id=int(eval_["oml:setup_id"]),
-                flow_id=int(eval_["oml:flow_id"]),
-                flow_name=eval_["oml:flow_name"],
-                data_id=int(eval_["oml:data_id"]),
-                data_name=eval_["oml:data_name"],
-                function=eval_["oml:function"],
-                upload_time=eval_["oml:upload_time"],
-                uploader=int(eval_["oml:uploader"]),
-                uploader_name=user_dict[eval_["oml:uploader"]],
-                value=value,
-                values=values,
-                array_data=array_data,
-            )
-        )
-
-    return evals
-
-
 def list_evaluation_measures() -> list[str]:
     """Return list of evaluation measures available.
 
diff --git a/tests/test_api/test_evaluation.py b/tests/test_api/test_evaluation.py
new file mode 100644
index 000000000..14b655b2a
--- /dev/null
+++ b/tests/test_api/test_evaluation.py
@@ -0,0 +1,39 @@
+# License: BSD 3-Clause  
+from __future__ import annotations  
+  
+import pytest    
+from openml._api import EvaluationV1API, EvaluationV2API
+from openml.evaluations import OpenMLEvaluation
+from openml.exceptions import OpenMLNotSupportedError  
+
+
+@pytest.fixture  # v1 API object built from fixtures defined outside this file
+def evaluation_v1(http_client_v1, minio_client) -> EvaluationV1API:
+    return EvaluationV1API(http=http_client_v1, minio=minio_client)
+
+@pytest.fixture  # v2 API object built from fixtures defined outside this file
+def evaluation_v2(http_client_v2, minio_client) -> EvaluationV2API:
+    return EvaluationV2API(http=http_client_v2, minio=minio_client)
+
+
+@pytest.mark.test_server()
+def test_v1_list(evaluation_v1):
+    evaluations = evaluation_v1.list(
+        function="predictive_accuracy",
+        limit=10,
+        offset=0,
+    )
+
+    assert isinstance(evaluations, list)
+    assert len(evaluations) == 10  # assumes the test server holds >= 10 such evaluations
+    assert all(isinstance(e, OpenMLEvaluation) for e in evaluations)
+  
+    
+@pytest.mark.test_server()
+def test_v2_list(evaluation_v2):
+    with pytest.raises(OpenMLNotSupportedError):
+        evaluation_v2.list(
+            function="predictive_accuracy",
+            limit=10,
+            offset=0,
+        )