diff --git a/.config.schema.yaml b/.config.schema.yaml index 34011a8..877f967 100644 --- a/.config.schema.yaml +++ b/.config.schema.yaml @@ -1,6 +1,19 @@ $schema: "https://json-schema.org/draft/2020-12/schema" type: object properties: + features: + type: object + properties: + postprocessing: + type: object + properties: + enabled: + type: boolean + user_group: + type: string + enum: ["all", "logged_in"] + required: ["enabled"] + required: ["postprocessing"] messages: type: object additionalProperties: @@ -38,4 +51,4 @@ properties: - required: ["event"] - required: ["after_messages"] required: ["message", "trigger"] -required: ["messages"] +required: ["features", "messages"] diff --git a/bin/chat-chainlit.py b/bin/chat-chainlit.py index 40f1012..7fb8a78 100644 --- a/bin/chat-chainlit.py +++ b/bin/chat-chainlit.py @@ -11,7 +11,7 @@ from conversational_chain.graph import RAGGraphWithMemory from retreival_chain import create_retrieval_chain -from util.chainlit_helpers import static_messages +from util.chainlit_helpers import is_feature_enabled, static_messages from util.config_yml import Config, TriggerEvent from util.embedding_environment import EmbeddingEnvironment from util.logging import logging @@ -87,11 +87,23 @@ async def main(message: cl.Message) -> None: stream_final_answer=True, force_stream_final_answer=True, # we're not using prefix tokens ) + enable_postprocess: bool = is_feature_enabled(config, "postprocessing") result: dict[str, Any] = await llm_graph.ainvoke( message.content, callbacks=[cb], thread_id=thread_id, + enable_postprocess=enable_postprocess, ) - if len(result["additional_text"]) > 0: - await cl.Message(content=result["additional_text"]).send() + if ( + enable_postprocess + and cb.final_stream + and len(result["additional_content"]["search_results"]) > 0 + ): + sent_message: cl.Message = cb.final_stream + search_results_element = cl.CustomElement( + name="SearchResults", + props={"results": result["additional_content"]["search_results"]}, + ) + sent_message.elements = [search_results_element] # type: ignore + await sent_message.update() await static_messages(config, after_messages=message_count) diff --git a/bin/chat-fastapi.py b/bin/chat-fastapi.py index 287c8ec..d9537de 100644 --- a/bin/chat-fastapi.py +++ b/bin/chat-fastapi.py @@ -71,7 +71,7 @@ async def verify_captcha_middleware(request: Request, call_next): response = await call_next(request) return response - host = request.headers.get('referer') + host = request.headers.get("referer") if host and host.startswith("http:"): url = request.url.replace(scheme="https") return RedirectResponse(url=str(url)) diff --git a/config_default.yml b/config_default.yml index 5f14007..6d27f8b 100644 --- a/config_default.yml +++ b/config_default.yml @@ -1,5 +1,10 @@ # yaml-language-server: $schema=./.config.schema.yaml +features: + postprocessing: # external web search feature + enabled: true + user_group: all + messages: welcome: diff --git a/docker-compose.yml b/docker-compose.yml index 544a6f9..cc1ba72 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -18,6 +18,7 @@ services: - CHAINLIT_AUTH_SECRET=${CHAINLIT_AUTH_SECRET} - CHAINLIT_URI=${CHAINLIT_URI} - CHAINLIT_URL=${CHAINLIT_URL} + - TAVILY_API_KEY=${TAVILY_API_KEY} ports: - "8000:8000" depends_on: @@ -40,6 +41,8 @@ services: - CLOUDFLARE_SECRET_KEY=${CLOUDFLARE_SECRET_KEY} - CLOUDFLARE_SITE_KEY=${CLOUDFLARE_SITE_KEY} - CHAINLIT_URI=${CHAINLIT_URI_NO_LOGIN} + - CHAINLIT_URL=${CHAINLIT_URL} + - TAVILY_API_KEY=${TAVILY_API_KEY} ports: - "8001:8000" depends_on: diff --git a/poetry.lock b/poetry.lock index d136639..488b912 100644 --- a/poetry.lock +++ b/poetry.lock @@ -165,6 +165,17 @@ doc = ["Sphinx (>=7.4,<8.0)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "trustme", "truststore (>=0.9.1)", "uvloop (>=0.21)"] trio = ["trio (>=0.26.1)"] +[[package]] +name = "appdirs" +version = "1.4.4" +description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +optional = false +python-versions = "*" +files = [ + {file = "appdirs-1.4.4-py2.py3-none-any.whl", hash = "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128"}, + {file = "appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41"}, +] + [[package]] name = "asgiref" version = "3.8.1" @@ -748,6 +759,48 @@ files = [ marshmallow = ">=3.18.0,<4.0.0" typing-inspect = ">=0.4.0,<1" +[[package]] +name = "datasets" +version = "3.2.0" +description = "HuggingFace community-driven open-source library of datasets" +optional = false +python-versions = ">=3.9.0" +files = [ + {file = "datasets-3.2.0-py3-none-any.whl", hash = "sha256:f3d2ba2698b7284a4518019658596a6a8bc79f31e51516524249d6c59cf0fe2a"}, + {file = "datasets-3.2.0.tar.gz", hash = "sha256:9a6e1a356052866b5dbdd9c9eedb000bf3fc43d986e3584d9b028f4976937229"}, +] + +[package.dependencies] +aiohttp = "*" +dill = ">=0.3.0,<0.3.9" +filelock = "*" +fsspec = {version = ">=2023.1.0,<=2024.9.0", extras = ["http"]} +huggingface-hub = ">=0.23.0" +multiprocess = "<0.70.17" +numpy = ">=1.17" +packaging = "*" +pandas = "*" +pyarrow = ">=15.0.0" +pyyaml = ">=5.1" +requests = ">=2.32.2" +tqdm = ">=4.66.3" +xxhash = "*" + +[package.extras] +audio = ["librosa", "soundfile (>=0.12.1)", "soxr (>=0.4.0)"] +benchmarks = ["tensorflow (==2.12.0)", "torch (==2.0.1)", "transformers (==4.30.1)"] +dev = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (>=7.17.12,<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch", "torch (>=2.0.0)", "torchdata", "transformers", "transformers (>=4.42.0)", "zstandard"] +docs = ["s3fs", "tensorflow (>=2.6.0)", "torch", "transformers"] +jax = ["jax (>=0.3.14)", "jaxlib (>=0.3.14)"] +quality = ["ruff (>=0.3.0)"] +s3 = ["s3fs"] +tensorflow = ["tensorflow (>=2.6.0)"] +tensorflow-gpu = ["tensorflow (>=2.6.0)"] +tests = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (>=7.17.12,<8.0.0)", "faiss-cpu (>=1.8.0.post1)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tensorflow (>=2.16.0)", "tensorflow (>=2.6.0)", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +tests-numpy2 = ["Pillow (>=9.4.0)", "absl-py", "decorator", "decord (==0.6.0)", "elasticsearch (>=7.17.12,<8.0.0)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "lz4", "moto[server]", "polars[timezone] (>=0.20.0)", "protobuf (<4.0.0)", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "soxr (>=0.4.0)", "sqlalchemy", "tiktoken", "torch (>=2.0.0)", "torchdata", "transformers (>=4.42.0)", "zstandard"] +torch = ["torch"] +vision = ["Pillow (>=9.4.0)"] + [[package]] name = "deprecated" version = "1.2.15" @@ -765,6 +818,32 @@ wrapt = ">=1.10,<2" [package.extras] dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "jinja2 (>=3.0.3,<3.1.0)", "setuptools", "sphinx (<2)", "tox"] +[[package]] +name = "dill" +version = "0.3.8" +description = "serialize all of Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "dill-0.3.8-py3-none-any.whl", hash = "sha256:c36ca9ffb54365bdd2f8eb3eff7d2a21237f8452b57ace88b1ac615b7e815bd7"}, + {file = "dill-0.3.8.tar.gz", hash = "sha256:3ebe3c479ad625c4553aca177444d89b486b1d84982eeacded644afc0cf797ca"}, +] + +[package.extras] +graph = ["objgraph (>=1.7.2)"] +profile = ["gprof2dot (>=2022.7.29)"] + +[[package]] +name = "diskcache" +version = "5.6.3" +description = "Disk Cache -- Disk and file backed persistent cache." +optional = false +python-versions = ">=3" +files = [ + {file = "diskcache-5.6.3-py3-none-any.whl", hash = "sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19"}, + {file = "diskcache-5.6.3.tar.gz", hash = "sha256:2c3a3fa2743d8535d832ec61c2054a1641f41775aa7c556758a109941e33e4fc"}, +] + [[package]] name = "distro" version = "1.9.0" @@ -959,15 +1038,18 @@ files = [ [[package]] name = "fsspec" -version = "2024.12.0" +version = "2024.9.0" description = "File-system specification" optional = false python-versions = ">=3.8" files = [ - {file = "fsspec-2024.12.0-py3-none-any.whl", hash = "sha256:b520aed47ad9804237ff878b504267a3b0b441e97508bd6d2d8774e3db85cee2"}, - {file = "fsspec-2024.12.0.tar.gz", hash = "sha256:670700c977ed2fb51e0d9f9253177ed20cbde4a3e5c0283cc5385b5870c8533f"}, + {file = "fsspec-2024.9.0-py3-none-any.whl", hash = "sha256:a0947d552d8a6efa72cc2c730b12c41d043509156966cca4fb157b0f2a0c574b"}, + {file = "fsspec-2024.9.0.tar.gz", hash = "sha256:4b0afb90c2f21832df142f292649035d80b421f60a9e1c027802e5a0da2b04e8"}, ] +[package.dependencies] +aiohttp = {version = "<4.0.0a0 || >4.0.0a0,<4.0.0a1 || >4.0.0a1", optional = true, markers = "extra == \"http\""} + [package.extras] abfs = ["adlfs"] adl = ["adlfs"] @@ -2340,6 +2422,30 @@ files = [ {file = "multidict-6.1.0.tar.gz", hash = "sha256:22ae2ebf9b0c69d206c003e2f6a914ea33f0a932d4aa16f236afc049d9958f4a"}, ] +[[package]] +name = "multiprocess" +version = "0.70.16" +description = "better multiprocessing and multithreading in Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "multiprocess-0.70.16-pp310-pypy310_pp73-macosx_10_13_x86_64.whl", hash = "sha256:476887be10e2f59ff183c006af746cb6f1fd0eadcfd4ef49e605cbe2659920ee"}, + {file = "multiprocess-0.70.16-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:d951bed82c8f73929ac82c61f01a7b5ce8f3e5ef40f5b52553b4f547ce2b08ec"}, + {file = "multiprocess-0.70.16-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:37b55f71c07e2d741374998c043b9520b626a8dddc8b3129222ca4f1a06ef67a"}, + {file = "multiprocess-0.70.16-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:ba8c31889abf4511c7308a8c52bb4a30b9d590e7f58523302ba00237702ca054"}, + {file = "multiprocess-0.70.16-pp39-pypy39_pp73-macosx_10_13_x86_64.whl", hash = "sha256:0dfd078c306e08d46d7a8d06fb120313d87aa43af60d66da43ffff40b44d2f41"}, + {file = "multiprocess-0.70.16-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e7b9d0f307cd9bd50851afaac0dba2cb6c44449efff697df7c7645f7d3f2be3a"}, + {file = "multiprocess-0.70.16-py310-none-any.whl", hash = "sha256:c4a9944c67bd49f823687463660a2d6daae94c289adff97e0f9d696ba6371d02"}, + {file = "multiprocess-0.70.16-py311-none-any.whl", hash = "sha256:af4cabb0dac72abfb1e794fa7855c325fd2b55a10a44628a3c1ad3311c04127a"}, + {file = "multiprocess-0.70.16-py312-none-any.whl", hash = "sha256:fc0544c531920dde3b00c29863377f87e1632601092ea2daca74e4beb40faa2e"}, + {file = "multiprocess-0.70.16-py38-none-any.whl", hash = "sha256:a71d82033454891091a226dfc319d0cfa8019a4e888ef9ca910372a446de4435"}, + {file = "multiprocess-0.70.16-py39-none-any.whl", hash = "sha256:a0bafd3ae1b732eac64be2e72038231c1ba97724b60b09400d68f229fcc2fbf3"}, + {file = "multiprocess-0.70.16.tar.gz", hash = "sha256:161af703d4652a0e1410be6abccecde4a7ddffd19341be0a7011b94aeb171ac1"}, +] + +[package.dependencies] +dill = ">=0.3.8" + [[package]] name = "mypy" version = "1.14.1" @@ -3512,6 +3618,60 @@ files = [ [package.dependencies] typing-extensions = ">=4.6" +[[package]] +name = "pyarrow" +version = "19.0.0" +description = "Python library for Apache Arrow" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pyarrow-19.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:c318eda14f6627966997a7d8c374a87d084a94e4e38e9abbe97395c215830e0c"}, + {file = "pyarrow-19.0.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:62ef8360ff256e960f57ce0299090fb86423afed5e46f18f1225f960e05aae3d"}, + {file = "pyarrow-19.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2795064647add0f16563e57e3d294dbfc067b723f0fd82ecd80af56dad15f503"}, + {file = "pyarrow-19.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a218670b26fb1bc74796458d97bcab072765f9b524f95b2fccad70158feb8b17"}, + {file = "pyarrow-19.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:66732e39eaa2247996a6b04c8aa33e3503d351831424cdf8d2e9a0582ac54b34"}, + {file = "pyarrow-19.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:e675a3ad4732b92d72e4d24009707e923cab76b0d088e5054914f11a797ebe44"}, + {file = "pyarrow-19.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:f094742275586cdd6b1a03655ccff3b24b2610c3af76f810356c4c71d24a2a6c"}, + {file = "pyarrow-19.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:8e3a839bf36ec03b4315dc924d36dcde5444a50066f1c10f8290293c0427b46a"}, + {file = "pyarrow-19.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:ce42275097512d9e4e4a39aade58ef2b3798a93aa3026566b7892177c266f735"}, + {file = "pyarrow-19.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9348a0137568c45601b031a8d118275069435f151cbb77e6a08a27e8125f59d4"}, + {file = "pyarrow-19.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a0144a712d990d60f7f42b7a31f0acaccf4c1e43e957f7b1ad58150d6f639c1"}, + {file = "pyarrow-19.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:2a1a109dfda558eb011e5f6385837daffd920d54ca00669f7a11132d0b1e6042"}, + {file = "pyarrow-19.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:be686bf625aa7b9bada18defb3a3ea3981c1099697239788ff111d87f04cd263"}, + {file = "pyarrow-19.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:239ca66d9a05844bdf5af128861af525e14df3c9591bcc05bac25918e650d3a2"}, + {file = "pyarrow-19.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:a7bbe7109ab6198688b7079cbad5a8c22de4d47c4880d8e4847520a83b0d1b68"}, + {file = "pyarrow-19.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:4624c89d6f777c580e8732c27bb8e77fd1433b89707f17c04af7635dd9638351"}, + {file = "pyarrow-19.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b6d3ce4288793350dc2d08d1e184fd70631ea22a4ff9ea5c4ff182130249d9b"}, + {file = "pyarrow-19.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:450a7d27e840e4d9a384b5c77199d489b401529e75a3b7a3799d4cd7957f2f9c"}, + {file = "pyarrow-19.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:a08e2a8a039a3f72afb67a6668180f09fddaa38fe0d21f13212b4aba4b5d2451"}, + {file = "pyarrow-19.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:f43f5aef2a13d4d56adadae5720d1fed4c1356c993eda8b59dace4b5983843c1"}, + {file = "pyarrow-19.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:2f672f5364b2d7829ef7c94be199bb88bf5661dd485e21d2d37de12ccb78a136"}, + {file = "pyarrow-19.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:cf3bf0ce511b833f7bc5f5bb3127ba731e97222023a444b7359f3a22e2a3b463"}, + {file = "pyarrow-19.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:4d8b0c0de0a73df1f1bf439af1b60f273d719d70648e898bc077547649bb8352"}, + {file = "pyarrow-19.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a92aff08e23d281c69835e4a47b80569242a504095ef6a6223c1f6bb8883431d"}, + {file = "pyarrow-19.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3b78eff5968a1889a0f3bc81ca57e1e19b75f664d9c61a42a604bf9d8402aae"}, + {file = "pyarrow-19.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:b34d3bde38eba66190b215bae441646330f8e9da05c29e4b5dd3e41bde701098"}, + {file = "pyarrow-19.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:5418d4d0fab3a0ed497bad21d17a7973aad336d66ad4932a3f5f7480d4ca0c04"}, + {file = "pyarrow-19.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:e82c3d5e44e969c217827b780ed8faf7ac4c53f934ae9238872e749fa531f7c9"}, + {file = "pyarrow-19.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:f208c3b58a6df3b239e0bb130e13bc7487ed14f39a9ff357b6415e3f6339b560"}, + {file = "pyarrow-19.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:c751c1c93955b7a84c06794df46f1cec93e18610dcd5ab7d08e89a81df70a849"}, + {file = "pyarrow-19.0.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b903afaa5df66d50fc38672ad095806443b05f202c792694f3a604ead7c6ea6e"}, + {file = "pyarrow-19.0.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a22a4bc0937856263df8b94f2f2781b33dd7f876f787ed746608e06902d691a5"}, + {file = "pyarrow-19.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:5e8a28b918e2e878c918f6d89137386c06fe577cd08d73a6be8dafb317dc2d73"}, + {file = "pyarrow-19.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:29cd86c8001a94f768f79440bf83fee23963af5e7bc68ce3a7e5f120e17edf89"}, + {file = "pyarrow-19.0.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:c0423393e4a07ff6fea08feb44153302dd261d0551cc3b538ea7a5dc853af43a"}, + {file = "pyarrow-19.0.0-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:718947fb6d82409013a74b176bf93e0f49ef952d8a2ecd068fecd192a97885b7"}, + {file = "pyarrow-19.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c1c162c4660e0978411a4761f91113dde8da3433683efa473501254563dcbe8"}, + {file = "pyarrow-19.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c73268cf557e688efb60f1ccbc7376f7e18cd8e2acae9e663e98b194c40c1a2d"}, + {file = "pyarrow-19.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:edfe6d3916e915ada9acc4e48f6dafca7efdbad2e6283db6fd9385a1b23055f1"}, + {file = "pyarrow-19.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:da410b70a7ab8eb524112f037a7a35da7128b33d484f7671a264a4c224ac131d"}, + {file = "pyarrow-19.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:597360ffc71fc8cceea1aec1fb60cb510571a744fffc87db33d551d5de919bec"}, + {file = "pyarrow-19.0.0.tar.gz", hash = "sha256:8d47c691765cf497aaeed4954d226568563f1b3b74ff61139f2d77876717084b"}, +] + +[package.extras] +test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] + [[package]] name = "pyasn1" version = "0.5.1" @@ -3937,6 +4097,37 @@ files = [ {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"}, ] +[[package]] +name = "ragas" +version = "0.2.11" +description = "" +optional = false +python-versions = ">=3.9" +files = [ + {file = "ragas-0.2.11-py3-none-any.whl", hash = "sha256:d79c4805f0551fd98eea0d15de761694cea7617fcb51df33acbb15b14925d3a1"}, + {file = "ragas-0.2.11.tar.gz", hash = "sha256:70e4a3b7c08abf8459c34ad4884a72ae794f9933a00ac561d314d65ea508d352"}, +] + +[package.dependencies] +appdirs = "*" +datasets = "*" +diskcache = ">=5.6.3" +langchain = "*" +langchain-community = "*" +langchain-core = "*" +langchain_openai = "*" +nest-asyncio = "*" +numpy = "*" +openai = ">1" +pydantic = ">=2" +tiktoken = "*" + +[package.extras] +all = ["datacompy", "llama_index", "nltk", "pandas", "rapidfuzz", "rouge_score", "sentence-transformers", "transformers"] +dev = ["black[jupyter]", "datacompy", "fastembed", "graphene", "isort", "llama_index", "nltk", "notebook", "pandas", "pyright", "rapidfuzz", "rich", "rouge_score", "ruff", "sentence-transformers", "sphinx-autobuild", "transformers"] +docs = ["mkdocs (>=1.6.1)", "mkdocs-autorefs", "mkdocs-gen-files", "mkdocs-git-committers-plugin-2", "mkdocs-git-revision-date-localized-plugin", "mkdocs-glightbox", "mkdocs-literate-nav", "mkdocs-material", "mkdocs-material[imaging]", "mkdocs-section-index", "mkdocstrings[python]"] +test = ["llama_index", "nbmake", "pytest", "pytest-asyncio", "pytest-xdist[psutil]"] + [[package]] name = "rank-bm25" version = "0.2.2" @@ -4551,6 +4742,22 @@ files = [ {file = "syncer-2.0.3.tar.gz", hash = "sha256:4340eb54b54368724a78c5c0763824470201804fe9180129daf3635cb500550f"}, ] +[[package]] +name = "tavily-python" +version = "0.5.0" +description = "Python wrapper for the Tavily API" +optional = false +python-versions = ">=3.6" +files = [ + {file = "tavily_python-0.5.0-py3-none-any.whl", hash = "sha256:e874f6a04a56cdda80a505fe0b4f5d61d25372bd52a83e6773926fb297dcaa29"}, + {file = "tavily_python-0.5.0.tar.gz", hash = "sha256:2c60b88203b630e1b37fc711913a1090ced6719b3f21089f25ec06e9e1602822"}, +] + +[package.dependencies] +httpx = "*" +requests = "*" +tiktoken = ">=0.5.1" + [[package]] name = "tenacity" version = "9.0.0" @@ -5407,6 +5614,138 @@ files = [ [package.dependencies] h11 = ">=0.9.0,<1" +[[package]] +name = "xxhash" +version = "3.5.0" +description = "Python binding for xxHash" +optional = false +python-versions = ">=3.7" +files = [ + {file = "xxhash-3.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ece616532c499ee9afbb83078b1b952beffef121d989841f7f4b3dc5ac0fd212"}, + {file = "xxhash-3.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3171f693dbc2cef6477054a665dc255d996646b4023fe56cb4db80e26f4cc520"}, + {file = "xxhash-3.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c5d3e570ef46adaf93fc81b44aca6002b5a4d8ca11bd0580c07eac537f36680"}, + {file = "xxhash-3.5.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7cb29a034301e2982df8b1fe6328a84f4b676106a13e9135a0d7e0c3e9f806da"}, + {file = "xxhash-3.5.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5d0d307d27099bb0cbeea7260eb39ed4fdb99c5542e21e94bb6fd29e49c57a23"}, + {file = "xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0342aafd421795d740e514bc9858ebddfc705a75a8c5046ac56d85fe97bf196"}, + {file = "xxhash-3.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3dbbd9892c5ebffeca1ed620cf0ade13eb55a0d8c84e0751a6653adc6ac40d0c"}, + {file = "xxhash-3.5.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4cc2d67fdb4d057730c75a64c5923abfa17775ae234a71b0200346bfb0a7f482"}, + {file = "xxhash-3.5.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:ec28adb204b759306a3d64358a5e5c07d7b1dd0ccbce04aa76cb9377b7b70296"}, + {file = "xxhash-3.5.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:1328f6d8cca2b86acb14104e381225a3d7b42c92c4b86ceae814e5c400dbb415"}, + {file = "xxhash-3.5.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:8d47ebd9f5d9607fd039c1fbf4994e3b071ea23eff42f4ecef246ab2b7334198"}, + {file = "xxhash-3.5.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b96d559e0fcddd3343c510a0fe2b127fbff16bf346dd76280b82292567523442"}, + {file = "xxhash-3.5.0-cp310-cp310-win32.whl", hash = "sha256:61c722ed8d49ac9bc26c7071eeaa1f6ff24053d553146d5df031802deffd03da"}, + {file = "xxhash-3.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:9bed5144c6923cc902cd14bb8963f2d5e034def4486ab0bbe1f58f03f042f9a9"}, + {file = "xxhash-3.5.0-cp310-cp310-win_arm64.whl", hash = "sha256:893074d651cf25c1cc14e3bea4fceefd67f2921b1bb8e40fcfeba56820de80c6"}, + {file = "xxhash-3.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:02c2e816896dc6f85922ced60097bcf6f008dedfc5073dcba32f9c8dd786f3c1"}, + {file = "xxhash-3.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6027dcd885e21581e46d3c7f682cfb2b870942feeed58a21c29583512c3f09f8"}, + {file = "xxhash-3.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1308fa542bbdbf2fa85e9e66b1077eea3a88bef38ee8a06270b4298a7a62a166"}, + {file = "xxhash-3.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c28b2fdcee797e1c1961cd3bcd3d545cab22ad202c846235197935e1df2f8ef7"}, + {file = "xxhash-3.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:924361811732ddad75ff23e90efd9ccfda4f664132feecb90895bade6a1b4623"}, + {file = "xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89997aa1c4b6a5b1e5b588979d1da048a3c6f15e55c11d117a56b75c84531f5a"}, + {file = "xxhash-3.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:685c4f4e8c59837de103344eb1c8a3851f670309eb5c361f746805c5471b8c88"}, + {file = "xxhash-3.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:dbd2ecfbfee70bc1a4acb7461fa6af7748ec2ab08ac0fa298f281c51518f982c"}, + {file = "xxhash-3.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:25b5a51dc3dfb20a10833c8eee25903fd2e14059e9afcd329c9da20609a307b2"}, + {file = "xxhash-3.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a8fb786fb754ef6ff8c120cb96629fb518f8eb5a61a16aac3a979a9dbd40a084"}, + {file = "xxhash-3.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:a905ad00ad1e1c34fe4e9d7c1d949ab09c6fa90c919860c1534ff479f40fd12d"}, + {file = "xxhash-3.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:963be41bcd49f53af6d795f65c0da9b4cc518c0dd9c47145c98f61cb464f4839"}, + {file = "xxhash-3.5.0-cp311-cp311-win32.whl", hash = "sha256:109b436096d0a2dd039c355fa3414160ec4d843dfecc64a14077332a00aeb7da"}, + {file = "xxhash-3.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:b702f806693201ad6c0a05ddbbe4c8f359626d0b3305f766077d51388a6bac58"}, + {file = "xxhash-3.5.0-cp311-cp311-win_arm64.whl", hash = "sha256:c4dcb4120d0cc3cc448624147dba64e9021b278c63e34a38789b688fd0da9bf3"}, + {file = "xxhash-3.5.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:14470ace8bd3b5d51318782cd94e6f94431974f16cb3b8dc15d52f3b69df8e00"}, + {file = "xxhash-3.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:59aa1203de1cb96dbeab595ded0ad0c0056bb2245ae11fac11c0ceea861382b9"}, + {file = "xxhash-3.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:08424f6648526076e28fae6ea2806c0a7d504b9ef05ae61d196d571e5c879c84"}, + {file = "xxhash-3.5.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:61a1ff00674879725b194695e17f23d3248998b843eb5e933007ca743310f793"}, + {file = "xxhash-3.5.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f2f2c61bee5844d41c3eb015ac652a0229e901074951ae48581d58bfb2ba01be"}, + {file = "xxhash-3.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d32a592cac88d18cc09a89172e1c32d7f2a6e516c3dfde1b9adb90ab5df54a6"}, + {file = "xxhash-3.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:70dabf941dede727cca579e8c205e61121afc9b28516752fd65724be1355cc90"}, + {file = "xxhash-3.5.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e5d0ddaca65ecca9c10dcf01730165fd858533d0be84c75c327487c37a906a27"}, + {file = "xxhash-3.5.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3e5b5e16c5a480fe5f59f56c30abdeba09ffd75da8d13f6b9b6fd224d0b4d0a2"}, + {file = "xxhash-3.5.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:149b7914451eb154b3dfaa721315117ea1dac2cc55a01bfbd4df7c68c5dd683d"}, + {file = "xxhash-3.5.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:eade977f5c96c677035ff39c56ac74d851b1cca7d607ab3d8f23c6b859379cab"}, + {file = "xxhash-3.5.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fa9f547bd98f5553d03160967866a71056a60960be00356a15ecc44efb40ba8e"}, + {file = "xxhash-3.5.0-cp312-cp312-win32.whl", hash = "sha256:f7b58d1fd3551b8c80a971199543379be1cee3d0d409e1f6d8b01c1a2eebf1f8"}, + {file = "xxhash-3.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:fa0cafd3a2af231b4e113fba24a65d7922af91aeb23774a8b78228e6cd785e3e"}, + {file = "xxhash-3.5.0-cp312-cp312-win_arm64.whl", hash = "sha256:586886c7e89cb9828bcd8a5686b12e161368e0064d040e225e72607b43858ba2"}, + {file = "xxhash-3.5.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:37889a0d13b0b7d739cfc128b1c902f04e32de17b33d74b637ad42f1c55101f6"}, + {file = "xxhash-3.5.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:97a662338797c660178e682f3bc180277b9569a59abfb5925e8620fba00b9fc5"}, + {file = "xxhash-3.5.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f85e0108d51092bdda90672476c7d909c04ada6923c14ff9d913c4f7dc8a3bc"}, + {file = "xxhash-3.5.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cd2fd827b0ba763ac919440042302315c564fdb797294d86e8cdd4578e3bc7f3"}, + {file = "xxhash-3.5.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:82085c2abec437abebf457c1d12fccb30cc8b3774a0814872511f0f0562c768c"}, + {file = "xxhash-3.5.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07fda5de378626e502b42b311b049848c2ef38784d0d67b6f30bb5008642f8eb"}, + {file = "xxhash-3.5.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c279f0d2b34ef15f922b77966640ade58b4ccdfef1c4d94b20f2a364617a493f"}, + {file = "xxhash-3.5.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:89e66ceed67b213dec5a773e2f7a9e8c58f64daeb38c7859d8815d2c89f39ad7"}, + {file = "xxhash-3.5.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bcd51708a633410737111e998ceb3b45d3dbc98c0931f743d9bb0a209033a326"}, + {file = "xxhash-3.5.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3ff2c0a34eae7df88c868be53a8dd56fbdf592109e21d4bfa092a27b0bf4a7bf"}, + {file = "xxhash-3.5.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:4e28503dccc7d32e0b9817aa0cbfc1f45f563b2c995b7a66c4c8a0d232e840c7"}, + {file = "xxhash-3.5.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a6c50017518329ed65a9e4829154626f008916d36295b6a3ba336e2458824c8c"}, + {file = "xxhash-3.5.0-cp313-cp313-win32.whl", hash = "sha256:53a068fe70301ec30d868ece566ac90d873e3bb059cf83c32e76012c889b8637"}, + {file = "xxhash-3.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:80babcc30e7a1a484eab952d76a4f4673ff601f54d5142c26826502740e70b43"}, + {file = "xxhash-3.5.0-cp313-cp313-win_arm64.whl", hash = "sha256:4811336f1ce11cac89dcbd18f3a25c527c16311709a89313c3acaf771def2d4b"}, + {file = "xxhash-3.5.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:6e5f70f6dca1d3b09bccb7daf4e087075ff776e3da9ac870f86ca316736bb4aa"}, + {file = "xxhash-3.5.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e76e83efc7b443052dd1e585a76201e40b3411fe3da7af4fe434ec51b2f163b"}, + {file = "xxhash-3.5.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:33eac61d0796ca0591f94548dcfe37bb193671e0c9bcf065789b5792f2eda644"}, + {file = "xxhash-3.5.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ec70a89be933ea49222fafc3999987d7899fc676f688dd12252509434636622"}, + {file = "xxhash-3.5.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd86b8e7f703ec6ff4f351cfdb9f428955859537125904aa8c963604f2e9d3e7"}, + {file = "xxhash-3.5.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0adfbd36003d9f86c8c97110039f7539b379f28656a04097e7434d3eaf9aa131"}, + {file = "xxhash-3.5.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:63107013578c8a730419adc05608756c3fa640bdc6abe806c3123a49fb829f43"}, + {file = "xxhash-3.5.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:683b94dbd1ca67557850b86423318a2e323511648f9f3f7b1840408a02b9a48c"}, + {file = "xxhash-3.5.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:5d2a01dcce81789cf4b12d478b5464632204f4c834dc2d064902ee27d2d1f0ee"}, + {file = "xxhash-3.5.0-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:a9d360a792cbcce2fe7b66b8d51274ec297c53cbc423401480e53b26161a290d"}, + {file = "xxhash-3.5.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:f0b48edbebea1b7421a9c687c304f7b44d0677c46498a046079d445454504737"}, + {file = "xxhash-3.5.0-cp37-cp37m-win32.whl", hash = "sha256:7ccb800c9418e438b44b060a32adeb8393764da7441eb52aa2aa195448935306"}, + {file = "xxhash-3.5.0-cp37-cp37m-win_amd64.whl", hash = "sha256:c3bc7bf8cb8806f8d1c9bf149c18708cb1c406520097d6b0a73977460ea03602"}, + {file = "xxhash-3.5.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:74752ecaa544657d88b1d1c94ae68031e364a4d47005a90288f3bab3da3c970f"}, + {file = "xxhash-3.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:dee1316133c9b463aa81aca676bc506d3f80d8f65aeb0bba2b78d0b30c51d7bd"}, + {file = "xxhash-3.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:602d339548d35a8579c6b013339fb34aee2df9b4e105f985443d2860e4d7ffaa"}, + {file = "xxhash-3.5.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:695735deeddfb35da1677dbc16a083445360e37ff46d8ac5c6fcd64917ff9ade"}, + {file = "xxhash-3.5.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1030a39ba01b0c519b1a82f80e8802630d16ab95dc3f2b2386a0b5c8ed5cbb10"}, + {file = "xxhash-3.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5bc08f33c4966f4eb6590d6ff3ceae76151ad744576b5fc6c4ba8edd459fdec"}, + {file = "xxhash-3.5.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:160e0c19ee500482ddfb5d5570a0415f565d8ae2b3fd69c5dcfce8a58107b1c3"}, + {file = "xxhash-3.5.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:f1abffa122452481a61c3551ab3c89d72238e279e517705b8b03847b1d93d738"}, + {file = "xxhash-3.5.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:d5e9db7ef3ecbfc0b4733579cea45713a76852b002cf605420b12ef3ef1ec148"}, + {file = "xxhash-3.5.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:23241ff6423378a731d84864bf923a41649dc67b144debd1077f02e6249a0d54"}, + {file = "xxhash-3.5.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:82b833d5563fefd6fceafb1aed2f3f3ebe19f84760fdd289f8b926731c2e6e91"}, + {file = "xxhash-3.5.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0a80ad0ffd78bef9509eee27b4a29e56f5414b87fb01a888353e3d5bda7038bd"}, + {file = "xxhash-3.5.0-cp38-cp38-win32.whl", hash = "sha256:50ac2184ffb1b999e11e27c7e3e70cc1139047e7ebc1aa95ed12f4269abe98d4"}, + {file = "xxhash-3.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:392f52ebbb932db566973693de48f15ce787cabd15cf6334e855ed22ea0be5b3"}, + {file = "xxhash-3.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bfc8cdd7f33d57f0468b0614ae634cc38ab9202c6957a60e31d285a71ebe0301"}, + {file = "xxhash-3.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e0c48b6300cd0b0106bf49169c3e0536408dfbeb1ccb53180068a18b03c662ab"}, + {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fe1a92cfbaa0a1253e339ccec42dbe6db262615e52df591b68726ab10338003f"}, + {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:33513d6cc3ed3b559134fb307aae9bdd94d7e7c02907b37896a6c45ff9ce51bd"}, + {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eefc37f6138f522e771ac6db71a6d4838ec7933939676f3753eafd7d3f4c40bc"}, + {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a606c8070ada8aa2a88e181773fa1ef17ba65ce5dd168b9d08038e2a61b33754"}, + {file = "xxhash-3.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:42eca420c8fa072cc1dd62597635d140e78e384a79bb4944f825fbef8bfeeef6"}, + {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:604253b2143e13218ff1ef0b59ce67f18b8bd1c4205d2ffda22b09b426386898"}, + {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:6e93a5ad22f434d7876665444a97e713a8f60b5b1a3521e8df11b98309bff833"}, + {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:7a46e1d6d2817ba8024de44c4fd79913a90e5f7265434cef97026215b7d30df6"}, + {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:30eb2efe6503c379b7ab99c81ba4a779748e3830241f032ab46bd182bf5873af"}, + {file = "xxhash-3.5.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c8aa771ff2c13dd9cda8166d685d7333d389fae30a4d2bb39d63ab5775de8606"}, + {file = "xxhash-3.5.0-cp39-cp39-win32.whl", hash = "sha256:5ed9ebc46f24cf91034544b26b131241b699edbfc99ec5e7f8f3d02d6eb7fba4"}, + {file = "xxhash-3.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:220f3f896c6b8d0316f63f16c077d52c412619e475f9372333474ee15133a558"}, + {file = "xxhash-3.5.0-cp39-cp39-win_arm64.whl", hash = "sha256:a7b1d8315d9b5e9f89eb2933b73afae6ec9597a258d52190944437158b49d38e"}, + {file = "xxhash-3.5.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:2014c5b3ff15e64feecb6b713af12093f75b7926049e26a580e94dcad3c73d8c"}, + {file = "xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fab81ef75003eda96239a23eda4e4543cedc22e34c373edcaf744e721a163986"}, + {file = "xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e2febf914ace002132aa09169cc572e0d8959d0f305f93d5828c4836f9bc5a6"}, + {file = "xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5d3a10609c51da2a1c0ea0293fc3968ca0a18bd73838455b5bca3069d7f8e32b"}, + {file = "xxhash-3.5.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5a74f23335b9689b66eb6dbe2a931a88fcd7a4c2cc4b1cb0edba8ce381c7a1da"}, + {file = "xxhash-3.5.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:2b4154c00eb22e4d543f472cfca430e7962a0f1d0f3778334f2e08a7ba59363c"}, + {file = "xxhash-3.5.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d30bbc1644f726b825b3278764240f449d75f1a8bdda892e641d4a688b1494ae"}, + {file = "xxhash-3.5.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fa0b72f2423e2aa53077e54a61c28e181d23effeaafd73fcb9c494e60930c8e"}, + {file = "xxhash-3.5.0-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:13de2b76c1835399b2e419a296d5b38dc4855385d9e96916299170085ef72f57"}, + {file = "xxhash-3.5.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:0691bfcc4f9c656bcb96cc5db94b4d75980b9d5589f2e59de790091028580837"}, + {file = "xxhash-3.5.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:297595fe6138d4da2c8ce9e72a04d73e58725bb60f3a19048bc96ab2ff31c692"}, + {file = "xxhash-3.5.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc1276d369452040cbb943300dc8abeedab14245ea44056a2943183822513a18"}, + {file = "xxhash-3.5.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2061188a1ba352fc699c82bff722f4baacb4b4b8b2f0c745d2001e56d0dfb514"}, + {file = "xxhash-3.5.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:38c384c434021e4f62b8d9ba0bc9467e14d394893077e2c66d826243025e1f81"}, + {file = "xxhash-3.5.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e6a4dd644d72ab316b580a1c120b375890e4c52ec392d4aef3c63361ec4d77d1"}, + {file = "xxhash-3.5.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:531af8845aaadcadf951b7e0c1345c6b9c68a990eeb74ff9acd8501a0ad6a1c9"}, + {file = "xxhash-3.5.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ce379bcaa9fcc00f19affa7773084dd09f5b59947b3fb47a1ceb0179f91aaa1"}, + {file = "xxhash-3.5.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd1b2281d01723f076df3c8188f43f2472248a6b63118b036e641243656b1b0f"}, + {file = "xxhash-3.5.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9c770750cc80e8694492244bca7251385188bc5597b6a39d98a9f30e8da984e0"}, + {file = "xxhash-3.5.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:b150b8467852e1bd844387459aa6fbe11d7f38b56e901f9f3b3e6aba0d660240"}, + {file = "xxhash-3.5.0.tar.gz", hash = "sha256:84f2caddf951c9cbf8dc2e22a89d4ccf5d86391ac6418fe81e3c67d0cf60b45f"}, +] + [[package]] name = "yarl" version = "1.18.3" @@ -5525,4 +5864,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.12, <4" -content-hash = "f7c5f29a2afd8c40fdaa6cb6b4f0672fdb70c9773e5eb21806db918eee9a670d" +content-hash = "5b1f865d119b14bd9b18d319d3e20422de1fed9eb3cacbef5e92dbb3594cdf3e" diff --git a/public/elements/SearchResults.jsx b/public/elements/SearchResults.jsx new file mode 100644 index 0000000..1106c59 --- /dev/null +++ b/public/elements/SearchResults.jsx @@ -0,0 +1,51 @@ +const getDomainFromUrl = (url) => { + const { hostname } = new URL(url); + return hostname; +}; + +const SearchResults = () => { + return ( +
+
+

+ Here are some external resources you may find helpful: +

+
+
+ {props.results.map((result) => ( + +
+
+
+
+ {result.title} +
+
+
+ {getDomainFromUrl(result.url)} +
+
+
+ {result.content.substring(0, 300)} +
+
+
+ ))} +
+
+ ) +}; + +export default SearchResults; diff --git a/pyproject.toml b/pyproject.toml index ddcfc25..2c1f087 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ rank-bm25 = "^0.2.2" psycopg = {extras = ["binary"], version = "^3.2.3"} pydantic = "^2.10.5" pyyaml = "^6.0.2" +tavily-python = "^0.5.0" [tool.poetry.group.dev.dependencies] ruff = "^0.7.1" @@ -54,6 +55,8 @@ isort = "^5.13.2" pandas-stubs = "^2.2.3.241009" types-requests = "^2.32.0.20241016" types-pyyaml = "^6.0.12.20241230" +datasets = "^3.2.0" +ragas = "^0.2.11" [[tool.poetry.source]] name = "PyPI" diff --git a/src/conversational_chain/graph.py b/src/conversational_chain/graph.py index 898155f..9128e2a 100644 --- a/src/conversational_chain/graph.py +++ b/src/conversational_chain/graph.py @@ -17,6 +17,8 @@ from psycopg_pool import AsyncConnectionPool from conversational_chain.chain import create_rag_chain +from external_search.state import WebSearchResult +from external_search.workflow import create_search_workflow from util.logging import logging LANGGRAPH_DB_URI = f"postgresql://{os.getenv('POSTGRES_USER')}:{os.getenv('POSTGRES_PASSWORD')}@postgres:5432/{os.getenv('POSTGRES_LANGGRAPH_DB')}?sslmode=disable" @@ -30,21 +32,25 @@ logging.warning("POSTGRES_LANGGRAPH_DB undefined; falling back to MemorySaver.") -class ChatResponse(TypedDict): - chat_history: Annotated[Sequence[BaseMessage], add_messages] - context: list[Document] - answer: str # primary LLM response that is streamed to the user +class AdditionalContent(TypedDict): + search_results: list[WebSearchResult] -class ChatState(ChatResponse): +class ChatState(TypedDict): input: str - additional_text: str # additional text to send after graph completes + chat_history: Annotated[Sequence[BaseMessage], add_messages] + context: list[Document] + answer: str # primary LLM response that is streamed to the user + additional_content: ( + AdditionalContent # additional content to send after graph completes + ) class RAGGraphWithMemory: def __init__(self, retriever: BaseRetriever, llm: BaseChatModel) -> None: # Set up runnables self.rag_chain: Runnable = create_rag_chain(llm, retriever) + self.search_workflow: CompiledStateGraph = create_search_workflow(llm) # Create graph state_graph: StateGraph = StateGraph(ChatState) @@ -91,35 +97,48 @@ async def close_pool(self) -> None: async def call_model( self, state: ChatState, config: RunnableConfig - ) -> ChatResponse: - response = await self.rag_chain.ainvoke(state, config) + ) -> dict[str, Any]: + result = await self.rag_chain.ainvoke(state, config) return { "chat_history": [ HumanMessage(state["input"]), - AIMessage(response["answer"]), + AIMessage(result["answer"]), ], - "context": response["context"], - "answer": response["answer"], + "context": result["context"], + "answer": result["answer"], } async def postprocess( - self, state: ChatResponse, config: RunnableConfig - ) -> dict[str, str]: - # TODO: add completeness checking flow here + self, state: ChatState, config: RunnableConfig + ) -> dict[str, dict[str, list[WebSearchResult]]]: + search_results: list[WebSearchResult] = [] + if config["configurable"]["enable_postprocess"]: + result: dict[str, Any] = await self.search_workflow.ainvoke( + {"question": state["input"], "generation": state["answer"]}, + ) + search_results = result["search_results"] return { - "additional_text": "", + "additional_content": {"search_results": search_results}, } async def ainvoke( - self, user_input: str, callbacks: Callbacks, thread_id: str + self, + user_input: str, + *, + callbacks: Callbacks, + thread_id: str, + enable_postprocess: bool = True, ) -> dict[str, Any]: if self.graph is None: self.graph = await self.initialize() - response: dict[str, Any] = await self.graph.ainvoke( + result: dict[str, Any] = await self.graph.ainvoke( {"input": user_input}, config=RunnableConfig( callbacks=callbacks, - configurable={"thread_id": thread_id}, + configurable={ + "thread_id": thread_id, + "enable_postprocess": enable_postprocess, + }, ), ) - return response + return result diff --git a/src/evaluation/evaluator.py b/src/evaluation/evaluator.py index 5ea245b..9f96b23 100644 --- a/src/evaluation/evaluator.py +++ b/src/evaluation/evaluator.py @@ -1,106 +1,173 @@ import argparse -import json +import os +import pandas as pd from datasets import Dataset -from langchain_openai import ChatOpenAI +from langchain.retrievers import EnsembleRetriever +from langchain.retrievers.merger_retriever import MergerRetriever +from langchain.retrievers.self_query.base import SelfQueryRetriever +from langchain_chroma.vectorstores import Chroma +from langchain_community.document_loaders.csv_loader import CSVLoader +from langchain_community.retrievers import BM25Retriever +from langchain_openai import ChatOpenAI, OpenAIEmbeddings from ragas import evaluate +from ragas.metrics import answer_relevancy, context_utilization, faithfulness -from src.retrieval_chain import initialize_retrieval_chain +from conversational_chain.chain import create_rag_chain +from reactome.metadata_info import descriptions_info, field_info def parse_arguments(): """Parse command line arguments for the script.""" parser = argparse.ArgumentParser( - description="Load a test set and evaluate answers generated by a language model." + description="Load a directory of testsets and evaluate answers generated by a language model." ) parser.add_argument( - "--testset_path", type=str, required=True, help="Path to the testset JSON file" + "--testset_dir", + type=str, + required=True, + help="Path to the directory containing testset Excel (.xlsx) files", ) parser.add_argument( "--model", type=str, - default="gpt-3.5-turbo-0125", + default="gpt-4o-mini", help="Language model to use for evaluation", ) - parser.add_argument( - "--metrics", - type=str, - nargs="+", - help="List of evaluation metrics (e.g., precision, recall)", - ) - parser.add_argument( - "--replicates", - type=int, - default=1, - help="Number of replicates for answer generation", - ) return parser.parse_args() def load_dataset(testset_path): - """Load the dataset from a JSON file.""" + """Load the dataset from an Excel (.xlsx) file.""" try: - with open(testset_path, "r") as file: - data = json.load(file) - return data + df = pd.read_excel(testset_path) + return df.to_dict( + orient="records" + ) # Convert DataFrame to a list of dictionaries except FileNotFoundError: raise FileNotFoundError(f"The file {testset_path} does not exist.") - except json.JSONDecodeError: - raise ValueError("Invalid JSON file format.") + except ValueError as e: + raise ValueError(f"Error reading the Excel file: {e}") -def main(): - args = parse_arguments() +def initialize_rag_chain(embeddings_directory, model_name): + """Initialize the RAG chain system.""" + llm = ChatOpenAI(temperature=0.0, verbose=True, model=model_name) + retriever_list = [] + + loader = CSVLoader( + "/Users/hmohammadi/Desktop/react_to_me_github/reactome_chatbot/embeddings/openai/text-embedding-3-large/reactome/summation_csv/summations.csv" + ) + data = loader.load() + bm25_retriever = BM25Retriever.from_documents(data) + bm25_retriever.k = 15 + + # Set up vectorstore SelfQuery retriever + embedding = OpenAIEmbeddings(model="text-embedding-3-large") + vectordb = Chroma( + persist_directory=embeddings_directory, + embedding_function=embedding, + ) + + selfq_retriever = SelfQueryRetriever.from_llm( + llm=llm, + vectorstore=vectordb, + document_contents=descriptions_info["summations"], + metadata_field_info=field_info["summations"], + search_kwargs={"k": 15}, + ) + rrf_retriever = EnsembleRetriever( + retrievers=[bm25_retriever, selfq_retriever], weights=[0.2, 0.8] + ) + retriever_list.append(rrf_retriever) + + reactome_retriever = MergerRetriever(retrievers=retriever_list) + + qa = create_rag_chain( + retriever=reactome_retriever, + llm=llm, + ) + return qa - # Load testset from JSON file - testset = load_dataset(args.testset_path) +def process_testset( + testset_path, qa_system, embeddings_directory, response_dir, eval_dir, model_name +): + """Process a single testset file.""" + args = parse_arguments() + + testset = load_dataset(testset_path) questions = [item["question"] for item in testset] ground_truths = [item["ground_truth"] for item in testset] - embeddings_directory = "embeddings" - - qa_system = initialize_retrieval_chain(embeddings_directory, True) + answers = [] + contexts = [] + + for question in questions: + response = qa_system.get_context(question) + answers.append(response["answer"]) + contexts.append([context.page_content for context in response["context"]]) + + # Save responses to an Excel file + data = { + "question": questions, + "answer": answers, + "contexts": contexts, + "ground_truth": ground_truths, + } + df_ans = pd.DataFrame(data) + response_filename = os.path.join( + response_dir, + f"{os.path.splitext(os.path.basename(testset_path))[0]}_{args.model}_responses.xlsx", + ) + df_ans.to_excel(response_filename, index=False) + print(f"Responses saved to {response_filename}") + + # Evaluate the dataset + dataset = Dataset.from_dict(data) + result = evaluate( + llm=ChatOpenAI(temperature=0.0, verbose=True, model="gpt-4o"), + dataset=dataset, + metrics=[answer_relevancy, context_utilization, faithfulness], + ) - # This will hold the final results for all replicates - all_replicates_results = [] + # Save evaluation results to an Excel file + evaluation_filename = os.path.join( + eval_dir, + f"{os.path.splitext(os.path.basename(testset_path))[0]}_{args.model}_evaluation.xlsx", + ) + df_eval = result.to_pandas() + df_eval.to_excel(evaluation_filename, index=False) + print(f"Evaluation results saved to {evaluation_filename}") - for _ in range(args.replicates): - answers = [] - contexts = [] - for question in questions: - response = qa_system.invoke(question) - answers.append(response["answer"]) - contexts.append( - [context.page_content for context in response["source_documents"]] +def main(): + args = parse_arguments() + model_name = args.model + response_dir = os.path.join(args.testset_dir, "response") + eval_dir = os.path.join(args.testset_dir, "evals") + os.makedirs(response_dir, exist_ok=True) + os.makedirs(eval_dir, exist_ok=True) + + # Initialize RAG Chain + embeddings_directory = "/Users/hmohammadi/Desktop/react_to_me_github/reactome_chatbot/embeddings/openai/text-embedding-3-large/reactome/Release90/summations" + qa_system = initialize_rag_chain(embeddings_directory, model_name) + + # Iterate over all .xlsx files in the directory + for filename in os.listdir(args.testset_dir): + print(f"Found file: {filename}") + if filename.endswith(".xlsx"): + testset_path = os.path.join(args.testset_dir, filename) + print(f"Processing testset: {testset_path}") + process_testset( + testset_path, + qa_system, + embeddings_directory, + response_dir, + eval_dir, + model_name, ) - data = { - "question": questions, - "answer": answers, - "contexts": contexts, - "ground_truth": ground_truths, - } - - # Convert dict to a dataset object - dataset = Dataset.from_dict(data) - all_replicates_results.append(dataset) - - # Evaluation can be done outside the loop for each dataset or after aggregating results, depending on your needs - for dataset in all_replicates_results: - metrics_to_use = [ - eval(metric) for metric in args.metrics - ] # Assuming metrics are safely evaluable - result = evaluate( - llm=ChatOpenAI(temperature=0.0, verbose=True, model=args.model), - dataset=dataset, - metrics=metrics_to_use, - ) - - df = result.to_pandas() - print(df) - if __name__ == "__main__": main() diff --git a/src/evaluation/test_generator.py b/src/evaluation/test_generator.py index d0ac11e..a1c5945 100644 --- a/src/evaluation/test_generator.py +++ b/src/evaluation/test_generator.py @@ -1,7 +1,7 @@ import argparse -import json import os +import pandas as pd from dotenv import load_dotenv from langchain_community.document_loaders import DirectoryLoader from langchain_openai import ChatOpenAI, OpenAIEmbeddings @@ -49,10 +49,20 @@ def parse_arguments(): def save_testset(testset, filename): output_dir = "testsets" os.makedirs(output_dir, exist_ok=True) - output_path = os.path.join(output_dir, f"{filename}_testset.json") - with open(output_path, "w") as f: - json.dump(testset, f, indent=4) - print(f"Testset saved to {output_path}") + output_path = os.path.join(output_dir, f"{filename}_testset.xlsx") + + # Convert testset to DataFrame for filtering + df = pd.DataFrame(testset) + + # Filter out rows where "ground_truth" is "The answer to given question is not present in context" + filtered_df = df[ + df["ground_truth"] != "The answer to given question is not present in context" + ] + + # Save the filtered DataFrame to an Excel file + filtered_df.to_excel(output_path, index=False) + + print(f"Filtered testset saved to {output_path}") def main(): diff --git a/src/external_search/completeness_grader.py b/src/external_search/completeness_grader.py new file mode 100644 index 0000000..8528b27 --- /dev/null +++ b/src/external_search/completeness_grader.py @@ -0,0 +1,40 @@ +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_core.prompts import ChatPromptTemplate +from langchain_core.runnables import Runnable +from pydantic import BaseModel, Field + +completeness_grader_message = """ +You are an expert grader with extensive knowledge in molecular biology and experience as a Reactome curator. +Your task is to evaluate whether a response generated by an LLM is complete, meaning it fully addresses the user’s query with all necessary details, background information, and context. + +Additionally, assess whether the question is appropriate and directly related to biology or molecular biology. + +Based on this evaluation, determine whether an external search should be conducted. + +Provide a binary output as either: + +Yes: The response is incomplete, missing key details, or lacking sufficient context, AND the question is appropriate and directly related to biology or molecular biology, therefore external search should be conducted. +No: Either the response is complete (fully answers the query, provides enough background, and leaves no essential details missing), OR the question is inappropriate, harmful, or not related to biology or molecular biology, therefore no external search should be conducted. +Ensure your evaluation is based solely on the information requested in the query, the adequacy of the response, and the appropriateness of the question. +""" + +completeness_prompt = ChatPromptTemplate.from_messages( + [ + ("system", completeness_grader_message), + ("human", "User question: \n\n {question} \n\n LLM generation: {generation}"), + ] +) + + +class GradeCompleteness(BaseModel): + external_search: str = Field( + description="Answer is complete and provides all necessary background, 'Yes' or 'No'" + ) + + +class CompletenessGrader: + def __init__(self, llm: BaseChatModel): + structured_completeness_grader: Runnable = llm.with_structured_output( + GradeCompleteness + ) + self.runnable: Runnable = completeness_prompt | structured_completeness_grader diff --git a/src/external_search/state.py b/src/external_search/state.py new file mode 100644 index 0000000..71c0884 --- /dev/null +++ b/src/external_search/state.py @@ -0,0 +1,14 @@ +from typing import TypedDict + + +class WebSearchResult(TypedDict): + title: str + url: str + content: str + + +class GraphState(TypedDict): + question: str # User question + generation: str # LLM generated reponse to the user question + external_search: str # "Yes" or "No" to search for external resources + search_results: list[WebSearchResult] # Results from searching the web diff --git a/src/external_search/tavily_wrapper.py b/src/external_search/tavily_wrapper.py new file mode 100644 index 0000000..016d80f --- /dev/null +++ b/src/external_search/tavily_wrapper.py @@ -0,0 +1,79 @@ +import time +from threading import Lock +from typing import Any, Literal + +from tavily import AsyncTavilyClient, MissingAPIKeyError + +from external_search.state import GraphState, WebSearchResult +from util.logging import logging + + +class TavilyWrapper: + def __init__( + self, + *, + api_key: str | None = None, + search_depth: Literal["basic", "advanced"] = "advanced", + max_results: int = 5, + rate_limit: int = 100, # requests per minute + ): + self.tavily_client: AsyncTavilyClient | None = None + self.search_depth = search_depth + self.max_results = max_results + + self.rate_interval: float = 60 / rate_limit # seconds between requests + self.last_request_time: float = time.monotonic() + self.lock = Lock() + + try: + self.tavily_client = AsyncTavilyClient(api_key) + except MissingAPIKeyError: + logging.warning( + "No Tavily API key was provided (TAVILY_API_KEY) - " + "external search feature is disabled." + ) + + async def search(self, query: str) -> list[WebSearchResult]: + if self.tavily_client is None: + return [] + + with self.lock: + now: float = time.monotonic() + if now - self.last_request_time < self.rate_interval: + return [] + self.last_request_time = now + + try: + response: dict[str, Any] = await self.tavily_client.search( + query=query, + search_depth=self.search_depth, + max_results=self.max_results, + ) + except Exception: + logging.warning("Tavily Search raised an Exception:", exc_info=True) + return [] + + results: list[dict[str, Any]] = response.get("results", []) + return [ + WebSearchResult( + title=result["title"], + url=result["url"], + content=result.get("content", ""), + ) + for result in results + if all(key in result for key in ["title", "url"]) + ] + + async def ainvoke(self, state: GraphState) -> dict[str, list[WebSearchResult]]: + query: str = state["question"] + search_results: list[WebSearchResult] = await self.search(query) + return {"search_results": search_results} + + @staticmethod + def format_results(web_search_results: list[WebSearchResult]) -> str: + if len(web_search_results) == 0: + return "" + formatted = "Here are some external resources you may find helpful:" + for result in web_search_results: + formatted += f"\n- [{result['title']}]({result['url']})" + return formatted diff --git a/src/external_search/workflow.py b/src/external_search/workflow.py new file mode 100644 index 0000000..8bebc28 --- /dev/null +++ b/src/external_search/workflow.py @@ -0,0 +1,44 @@ +from langchain_core.language_models.chat_models import BaseChatModel +from langgraph.graph import StateGraph +from langgraph.graph.state import CompiledStateGraph + +from external_search.completeness_grader import CompletenessGrader +from external_search.state import GraphState +from external_search.tavily_wrapper import TavilyWrapper + + +def decide_next_steps(state: GraphState) -> str: + if state["external_search"] == "Yes": + return "perform_web_search" + else: + return "no_search" + + +def no_search(_) -> dict[str, list]: + return {"search_results": []} + + +def create_search_workflow( + llm: BaseChatModel, max_results: int = 3 +) -> CompiledStateGraph: + completeness_grader = CompletenessGrader(llm) + tavily_wrapper = TavilyWrapper(max_results=max_results) + + workflow = StateGraph(GraphState) + + # Add nodes + workflow.add_node("assess_completeness", completeness_grader.runnable) + workflow.add_node("perform_web_search", tavily_wrapper.ainvoke) + workflow.add_node("no_search", no_search) + + # Add edges + workflow.set_entry_point("assess_completeness") + workflow.add_conditional_edges( + "assess_completeness", + decide_next_steps, + {"perform_web_search": "perform_web_search", "no_search": "no_search"}, + ) + + workflow.set_finish_point("perform_web_search") + workflow.set_finish_point("no_search") + return workflow.compile() diff --git a/src/retreival_chain.py b/src/retreival_chain.py index 8b25411..1bb7ab6 100644 --- a/src/retreival_chain.py +++ b/src/retreival_chain.py @@ -2,6 +2,7 @@ from pathlib import Path from typing import Callable, Optional +import chromadb.config from langchain.callbacks.base import BaseCallbackHandler from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler from langchain.retrievers import EnsembleRetriever @@ -20,6 +21,8 @@ from conversational_chain.graph import RAGGraphWithMemory from reactome.metadata_info import descriptions_info, field_info +chroma_settings = chromadb.config.Settings(anonymized_telemetry=False) + def list_chroma_subdirectories(directory: Path) -> list[str]: subdirectories = list( @@ -99,6 +102,7 @@ def create_retrieval_chain( vectordb = Chroma( persist_directory=str(embeddings_directory / subdirectory), embedding_function=embedding, + client_settings=chroma_settings, ) selfq_retriever = SelfQueryRetriever.from_llm( diff --git a/src/system_prompt/reactome_prompt.py b/src/system_prompt/reactome_prompt.py index 5edca2c..b3f3908 100644 --- a/src/system_prompt/reactome_prompt.py +++ b/src/system_prompt/reactome_prompt.py @@ -19,22 +19,25 @@ # Answer generation prompt qa_system_prompt = """ -You are an expert in molecular biology with access to information from the Reactome Knowledgebase. Your primary responsibility is to answer the user's questions as comprehensively and accurately as possible based on the context provided to you from the Reactome Knowledgebase. +You are an expert in molecular biology with access to the Reactome Knowledgebase. +Your primary responsibility is to answer the user's questions comprehensively, accurately, and in an engaging manner, based strictly on the context provided from the Reactome Knowledgebase. Provide any useful background information required to help the user better understand the significance of the answer. Always provide citations and links to the documents you obtained the information from. When providing answers, please adhere to the following guidelines: -1. Answer the question **only** based on the provided context. Do **not** use any external infromation. +1. Provide answers **strictly based on the given context from the Reactome Knowledgebase**. Do **not** use or infer information from any external sources. 2. If the answer cannot be derived from the context provided, do **not** answer the question; instead explain that the information is not currently available in Reactome. -2. Answer the question comprehensively and accurately, providing useful background information based **only** on the context. -3. keep track of **all** the sources that are directly used to derive the final answer, ensuring **every** piece of information provided in the final answer is cited. -4. Create Citations for the sources used to generate the final asnwer according to the following: +3. Answer the question comprehensively and accurately, providing useful background information based **only** on the context. +4. keep track of **all** the sources that are directly used to derive the final answer, ensuring **every** piece of information in your response is **explicitly cited**. +5. Create Citations for the sources used to generate the final asnwer according to the following: - For Reactome always format citations in the following format: *Source_Name*, where *Source_Name* is the name of the retrieved document. Examples: - Apoptosis - Cell Cycle -5. Always provide the citations you created in the format requested, in point-form at the end of the response paragraph, ensuring every piece of information provided in the final answer is cited. +6. Always provide the citations you created in the format requested, in point-form at the end of the response paragraph, ensuring **every piece of information** provided in the final answer is cited. +7. Write in a conversational and engaging tone suitable for a chatbot. +8. Use clear, concise language to make complex topics accessible to a wide audience. """ qa_prompt = ChatPromptTemplate.from_messages( diff --git a/src/util/chainlit_helpers.py b/src/util/chainlit_helpers.py index 4afc8ef..7975230 100644 --- a/src/util/chainlit_helpers.py +++ b/src/util/chainlit_helpers.py @@ -11,6 +11,13 @@ def get_user_id() -> str | None: return user.identifier if user else None +def is_feature_enabled(config: Config | None, feature_id: str) -> bool: + if not config: + return True + user_id: str | None = get_user_id() + return config.get_feature(feature_id, user_id) + + async def send_messages(messages: Iterable[str]): for message in messages: await cl.Message(content=message).send() diff --git a/src/util/config_yml/__init__.py b/src/util/config_yml/__init__.py new file mode 100644 index 0000000..98b5829 --- /dev/null +++ b/src/util/config_yml/__init__.py @@ -0,0 +1,63 @@ +from datetime import datetime +from pathlib import Path +from typing import Self + +import yaml +from pydantic import BaseModel, ValidationError + +from util.config_yml.features import Feature, Features +from util.config_yml.messages import Message, TriggerEvent +from util.logging import logging + +CONFIG_YML = Path("config.yml") +CONFIG_DEFAULT_YML = Path("config_default.yml") + + +class Config(BaseModel): + features: Features + messages: dict[str, Message] + + def get_feature( + self, + feature_id: str, + user_id: str | None = None, + ) -> bool: + if feature_id in self.features.model_fields: + feature: Feature = getattr(self.features, feature_id) + return feature.enabled and feature.matches_user_group(user_id) + else: + return True + + def get_messages( + self, + user_id: str | None = None, + event: TriggerEvent | None = None, + after_messages: int | None = None, + last_messages: dict[str, datetime] = {}, + ) -> dict[str, str]: + return { + message_id: message.message + for message_id, message in self.messages.items() + if ( + message.enabled + and message.match_recipient(user_id) + and message.trigger.match_trigger( + event, after_messages, last_messages.get(message_id, None) + ) + ) + } + + @classmethod + def from_yaml(cls, config_yml: Path = CONFIG_YML) -> Self | None: + if not config_yml.exists(): + logging.warning( + f"Config file not found: {config_yml} ; falling back to {CONFIG_DEFAULT_YML}" + ) + config_yml = CONFIG_DEFAULT_YML + with open(config_yml) as f: + yaml_data: dict = yaml.safe_load(f) + try: + return cls(**yaml_data) + except ValidationError as e: + logging.warning(e) + return None diff --git a/src/util/config_yml/features.py b/src/util/config_yml/features.py new file mode 100644 index 0000000..eb6c760 --- /dev/null +++ b/src/util/config_yml/features.py @@ -0,0 +1,23 @@ +from enum import StrEnum, auto + +from pydantic import BaseModel + + +class UserGroup(StrEnum): + all = auto() + logged_in = auto() + + +class Feature(BaseModel): + enabled: bool + user_group: UserGroup | None = None + + def matches_user_group(self, user_id: str | None) -> bool: + if self.user_group == UserGroup.logged_in: + return user_id is not None + else: + return True + + +class Features(BaseModel): + postprocessing: Feature diff --git a/src/util/config_yml.py b/src/util/config_yml/messages.py similarity index 63% rename from src/util/config_yml.py rename to src/util/config_yml/messages.py index f43739b..725a904 100644 --- a/src/util/config_yml.py +++ b/src/util/config_yml/messages.py @@ -2,17 +2,11 @@ from datetime import datetime, timedelta from enum import StrEnum, auto from fnmatch import fnmatch -from pathlib import Path -from typing import Self -import yaml -from pydantic import BaseModel, ValidationError +from pydantic import BaseModel from util.logging import logging -CONFIG_YML = Path("config.yml") -CONFIG_DEFAULT_YML = Path("config_default.yml") - interval_units = { "s": "seconds", "m": "minutes", @@ -95,41 +89,3 @@ def match_recipient(self, user_id: str | None) -> bool: if fnmatch(user_id, entry): return True return False - - -class Config(BaseModel): - messages: dict[str, Message] - - def get_messages( - self, - user_id: str | None = None, - event: TriggerEvent | None = None, - after_messages: int | None = None, - last_messages: dict[str, datetime] = {}, - ) -> dict[str, str]: - return { - message_id: message.message - for message_id, message in self.messages.items() - if ( - message.enabled - and message.match_recipient(user_id) - and message.trigger.match_trigger( - event, after_messages, last_messages.get(message_id, None) - ) - ) - } - - @classmethod - def from_yaml(cls, config_yml: Path = CONFIG_YML) -> Self | None: - if not config_yml.exists(): - logging.warning( - f"Config file not found: {config_yml} ; falling back to {CONFIG_DEFAULT_YML}" - ) - config_yml = CONFIG_DEFAULT_YML - with open(config_yml) as f: - yaml_data: dict = yaml.safe_load(f) - try: - return cls(**yaml_data) - except ValidationError as e: - logging.warning(e) - return None