From a093897e45aeeb7dc2f9dda985a2bf06cd22fb7b Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Mon, 3 Feb 2025 10:14:26 +0800 Subject: [PATCH 1/2] Add support for deepseek models on Gaudi Signed-off-by: lvliang-intel --- ChatQnA/docker_compose/intel/hpu/gaudi/README.md | 2 +- ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml | 3 ++- ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml | 3 ++- ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml | 3 ++- .../docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml | 3 ++- ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh | 1 + 6 files changed, 10 insertions(+), 5 deletions(-) diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/README.md b/ChatQnA/docker_compose/intel/hpu/gaudi/README.md index 6421704687..b0e9fe19b1 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/README.md +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/README.md @@ -10,7 +10,7 @@ Quick Start: 2. Run Docker Compose. 3. Consume the ChatQnA Service. -Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models). +Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models). We now support running the latest DeepSeek models, including [deepseek-ai/DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) and [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) on Gaudi accelerators. To run `deepseek-ai/DeepSeek-R1-Distill-Llama-70B`, update the `LLM_MODEL_ID` and configure `NUM_CARDS` to 8 in the [set_env.sh](./set_env.sh) script. To run `deepseek-ai/DeepSeek-R1-Distill-Qwen-32B`, update the `LLM_MODEL_ID` and configure `NUM_CARDS` to 4 in the [set_env.sh](./set_env.sh) script. ## Quick Start: 1.Setup Environment Variable diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml index 6cf9ce67ef..b75312824e 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml @@ -92,6 +92,7 @@ services: HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none LLM_MODEL_ID: ${LLM_MODEL_ID} + NUM_CARDS: ${NUM_CARDS} VLLM_TORCH_PROFILER_DIR: "/mnt" healthcheck: test: ["CMD-SHELL", "curl -f http://$host_ip:8007/health || exit 1"] @@ -102,7 +103,7 @@ services: cap_add: - SYS_NICE ipc: host - command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 + command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 chatqna-gaudi-backend-server: image: ${REGISTRY:-opea}/chatqna:${TAG:-latest} container_name: chatqna-gaudi-backend-server diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml index 6118f74a37..d5b56e424c 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml @@ -133,12 +133,13 @@ services: HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none LLM_MODEL_ID: ${LLM_MODEL_ID} + NUM_CARDS: ${NUM_CARDS} VLLM_TORCH_PROFILER_DIR: "/mnt" runtime: habana cap_add: - SYS_NICE ipc: host - command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 + command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 chatqna-gaudi-backend-server: image: ${REGISTRY:-opea}/chatqna-guardrails:${TAG:-latest} container_name: chatqna-gaudi-guardrails-server diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml index fd09988a91..8f860996c0 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml @@ -101,11 +101,12 @@ services: LIMIT_HPU_GRAPH: true USE_FLASH_ATTENTION: true FLASH_ATTENTION_RECOMPUTE: true + NUM_CARDS: ${NUM_CARDS} runtime: habana cap_add: - SYS_NICE ipc: host - command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT + command: --model-id ${LLM_MODEL_ID} --num-shard ${NUM_CARDS} --max-input-length 2048 --max-total-tokens 4096 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT jaeger: image: jaegertracing/all-in-one:latest container_name: jaeger diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml index 9afde50e46..2d872c4d30 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml @@ -73,12 +73,13 @@ services: HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none LLM_MODEL_ID: ${LLM_MODEL_ID} + NUM_CARDS: ${NUM_CARDS} VLLM_TORCH_PROFILER_DIR: "/mnt" runtime: habana cap_add: - SYS_NICE ipc: host - command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 + command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 chatqna-gaudi-backend-server: image: ${REGISTRY:-opea}/chatqna-without-rerank:${TAG:-latest} container_name: chatqna-gaudi-backend-server diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh b/ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh index 4e06c3b28e..27339c478f 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh @@ -11,6 +11,7 @@ export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export RERANK_MODEL_ID="BAAI/bge-reranker-base" export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" export INDEX_NAME="rag-redis" +export NUM_CARDS=1 # Set it as a non-null string, such as true, if you want to enable logging facility, # otherwise, keep it as "" to disable it. export LOGFLAG="" From ff87b29967d48f5178c958d432dd4035f4c4eff9 Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Mon, 3 Feb 2025 11:04:07 +0800 Subject: [PATCH 2/2] update scripts Signed-off-by: lvliang-intel --- ChatQnA/tests/test_compose_guardrails_on_gaudi.sh | 1 + ChatQnA/tests/test_compose_on_gaudi.sh | 1 + ChatQnA/tests/test_compose_tgi_on_gaudi.sh | 1 + ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh | 1 + 4 files changed, 4 insertions(+) diff --git a/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh b/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh index 3d352b25d7..5b4489f519 100644 --- a/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh @@ -47,6 +47,7 @@ function start_services() { export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export RERANK_MODEL_ID="BAAI/bge-reranker-base" export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" + export NUM_CARDS=1 export INDEX_NAME="rag-redis" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export GURADRAILS_MODEL_ID="meta-llama/Meta-Llama-Guard-2-8B" diff --git a/ChatQnA/tests/test_compose_on_gaudi.sh b/ChatQnA/tests/test_compose_on_gaudi.sh index 75b2a68e5d..67b7845902 100644 --- a/ChatQnA/tests/test_compose_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_on_gaudi.sh @@ -45,6 +45,7 @@ function start_services() { export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export RERANK_MODEL_ID="BAAI/bge-reranker-base" export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" + export NUM_CARDS=1 export INDEX_NAME="rag-redis" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export host_ip=${ip_address} diff --git a/ChatQnA/tests/test_compose_tgi_on_gaudi.sh b/ChatQnA/tests/test_compose_tgi_on_gaudi.sh index 0b075d04a4..c4285c2195 100644 --- a/ChatQnA/tests/test_compose_tgi_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_tgi_on_gaudi.sh @@ -46,6 +46,7 @@ function start_services() { export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export RERANK_MODEL_ID="BAAI/bge-reranker-base" export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" + export NUM_CARDS=1 export INDEX_NAME="rag-redis" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+') diff --git a/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh b/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh index 848a4bad9a..41f4482f2d 100644 --- a/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh @@ -45,6 +45,7 @@ function start_services() { cd $WORKPATH/docker_compose/intel/hpu/gaudi export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" + export NUM_CARDS=1 export INDEX_NAME="rag-redis" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}