diff --git a/Translation/benchmark/performance/README.md b/Translation/benchmark/performance/README.md new file mode 100644 index 0000000000..fa43991dea --- /dev/null +++ b/Translation/benchmark/performance/README.md @@ -0,0 +1,77 @@ +# Translation Benchmarking + +This folder contains a collection of scripts to enable inference benchmarking by leveraging a comprehensive benchmarking tool, [GenAIEval](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/README.md), that enables throughput analysis to assess inference performance. + +By following this guide, you can run benchmarks on your deployment and share the results with the OPEA community. + +## Purpose + +We aim to run these benchmarks and share them with the OPEA community for three primary reasons: + +- To offer insights on inference throughput in real-world scenarios, helping you choose the best service or deployment for your needs. +- To establish a baseline for validating optimization solutions across different implementations, providing clear guidance on which methods are most effective for your use case. +- To inspire the community to build upon our benchmarks, allowing us to better quantify new solutions in conjunction with current leading LLMs, serving frameworks, etc. + +## Metrics + +The benchmark will report the following metrics: + +- Number of Concurrent Requests +- End-to-End Latency: P50, P90, P99 (in milliseconds) +- End-to-End First Token Latency: P50, P90, P99 (in milliseconds) +- Average Next Token Latency (in milliseconds) +- Average Token Latency (in milliseconds) +- Requests Per Second (RPS) +- Output Tokens Per Second +- Input Tokens Per Second + +Results will be displayed in the terminal and saved as a CSV file named `1_testspec.yaml`. + +## Getting Started + +We recommend using Kubernetes to deploy the Translation service, as it offers benefits such as load balancing and improved scalability. 
However, you can also deploy the service using Docker if that better suits your needs. + +### Prerequisites + +- Install Kubernetes by following [this guide](https://github.com/opea-project/docs/blob/main/guide/installation/k8s_install/k8s_install_kubespray.md). + +- Every node has direct internet access. +- Set up kubectl on the master node with access to the Kubernetes cluster. +- Install Python 3.8+ on the master node for running GenAIEval. +- Ensure all nodes have a local /mnt/models folder, which will be mounted by the pods. +- Ensure that the container's ulimit can meet the number of requests. + +```bash +# The way to modify the containerd ulimit: +sudo systemctl edit containerd +# Add two lines: +[Service] +LimitNOFILE=65536:1048576 + +sudo systemctl daemon-reload; sudo systemctl restart containerd +``` + +### Test Steps + +Please deploy the Translation service before benchmarking. + +#### Run Benchmark Test + +Before the benchmark, we can configure the number of test queries and the test output directory by: + +```bash +export USER_QUERIES="[1, 1, 1, 1]" +export TEST_OUTPUT_DIR="/tmp/benchmark_output" +``` + +And then run the benchmark by: + +```bash +bash benchmark.sh -n <node_number> +``` + +The argument `-n` refers to the number of test nodes. + +#### Data collection + +All test results are written to the folder `/tmp/benchmark_output`, as configured by the environment variable `TEST_OUTPUT_DIR` in the previous step. 
diff --git a/Translation/benchmark/performance/benchmark.sh b/Translation/benchmark/performance/benchmark.sh new file mode 100644 index 0000000000..6eac50baf8 --- /dev/null +++ b/Translation/benchmark/performance/benchmark.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +deployment_type="k8s" +node_number=1 +service_port=7777 +query_per_node=128 + +benchmark_tool_path="$(pwd)/GenAIEval" + +usage() { + echo "Usage: $0 [-d deployment_type] [-n node_number] [-i service_ip] [-p service_port]" + echo " -d deployment_type deployment type, select between k8s and docker (default: ${deployment_type})" + echo " -n node_number Test node number, required only for k8s deployment_type, (default: ${node_number})" + echo " -i service_ip service ip, required only for docker deployment_type" + echo " -p service_port service port, required only for docker deployment_type, (default: ${service_port})" + exit 1 +} + +while getopts ":d:n:i:p:" opt; do + case ${opt} in + d ) + deployment_type=$OPTARG + ;; + n ) + node_number=$OPTARG + ;; + i ) + service_ip=$OPTARG + ;; + p ) + service_port=$OPTARG + ;; + \? ) + echo "Invalid option: -$OPTARG" 1>&2 + usage + ;; + : ) + echo "Invalid option: -$OPTARG requires an argument" 1>&2 + usage + ;; + esac +done + +if [[ "$deployment_type" == "docker" && -z "$service_ip" ]]; then + echo "Error: service_ip is required for docker deployment_type" 1>&2 + usage +fi + +if [[ "$deployment_type" == "k8s" && ( -n "$service_ip" || -n "$service_port" ) ]]; then + echo "Warning: service_ip and service_port are ignored for k8s deployment_type" 1>&2 +fi + +function main() { + if [[ ! -d ${benchmark_tool_path} ]]; then + echo "Benchmark tool not found, setting up..." 
+ setup_env + fi + run_benchmark +} + +function setup_env() { + git clone https://github.com/opea-project/GenAIEval.git + pushd ${benchmark_tool_path} + python3 -m venv stress_venv + source stress_venv/bin/activate + pip install -r requirements.txt + popd +} + +function run_benchmark() { + source ${benchmark_tool_path}/stress_venv/bin/activate + export DEPLOYMENT_TYPE=${deployment_type} + export SERVICE_IP=${service_ip:-"None"} + export SERVICE_PORT=${service_port:-"None"} + if [[ -z $USER_QUERIES ]]; then + user_query=$((query_per_node*node_number)) + export USER_QUERIES="[${user_query}, ${user_query}, ${user_query}, ${user_query}]" + echo "USER_QUERIES not configured, setting to: ${USER_QUERIES}." + fi + export WARMUP=$(echo $USER_QUERIES | sed -e 's/[][]//g' -e 's/,.*//') + if [[ -z $WARMUP ]]; then export WARMUP=0; fi + if [[ -z $TEST_OUTPUT_DIR ]]; then + if [[ $DEPLOYMENT_TYPE == "k8s" ]]; then + export TEST_OUTPUT_DIR="${benchmark_tool_path}/evals/benchmark/benchmark_output/node_${node_number}" + else + export TEST_OUTPUT_DIR="${benchmark_tool_path}/evals/benchmark/benchmark_output/docker" + fi + echo "TEST_OUTPUT_DIR not configured, setting to: ${TEST_OUTPUT_DIR}." 
+ fi + + envsubst < ./benchmark.yaml > ${benchmark_tool_path}/evals/benchmark/benchmark.yaml + cd ${benchmark_tool_path}/evals/benchmark + python benchmark.py +} + +main diff --git a/Translation/benchmark/performance/benchmark.yaml b/Translation/benchmark/performance/benchmark.yaml new file mode 100644 index 0000000000..a8bff2f3d3 --- /dev/null +++ b/Translation/benchmark/performance/benchmark.yaml @@ -0,0 +1,47 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +test_suite_config: # Overall configuration settings for the test suite + examples: ["codetrans"] # The specific test cases being tested, e.g., chatqna, codegen, codetrans, faqgen, audioqna, visualqna + deployment_type: "k8s" # Default is "k8s", can also be "docker" + service_ip: None # Leave as None for k8s, specify for Docker + service_port: None # Leave as None for k8s, specify for Docker + warm_ups: 0 # Number of test requests for warm-up + run_time: 60m # The max total run time for the test suite + seed: # The seed for all RNGs + user_queries: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] # Number of test requests at each concurrency level + query_timeout: 120 # Number of seconds to wait for a simulated user to complete any executing task before exiting. 120 sec by default. 
+ random_prompt: false # Use random prompts if true, fixed prompts if false + collect_service_metric: false # Collect service metrics if true, do not collect service metrics if false + data_visualization: false # Generate data visualization if true, do not generate data visualization if false + llm_model: "HuggingFaceH4/mistral-7b-grok" # The LLM model used for the test + test_output_dir: "/home/sdp/benchmark_output" # The directory to store the test output + load_shape: # Tenant concurrency pattern + name: constant # poisson or constant(locust default load shape) + params: # Loadshape-specific parameters + constant: # Constant load shape specific parameters, activate only if load_shape.name is constant + concurrent_level: 4 # If user_queries is specified, concurrent_level is target number of requests per user. If not, it is the number of simulated users + # arrival_rate: 1.0 # Request arrival rate. If set, concurrent_level will be overridden, constant load will be generated based on arrival-rate + poisson: # Poisson load shape specific parameters, activate only if load_shape.name is poisson + arrival_rate: 1.0 # Request arrival rate + namespace: "" # Fill the user-defined namespace. Otherwise, it will be default. 
+ +test_cases: + codetrans: + llm: + run_test: true + service_name: "llm-svc" # Replace with your service name + parameters: + model_name: "HuggingFaceH4/mistral-7b-grok" + max_new_tokens: 128 + temperature: 0.01 + top_k: 10 + top_p: 0.95 + repetition_penalty: 1.03 + stream: true + llmserve: + run_test: true + service_name: "codetrans-llm-svc" # Replace with your service name + e2e: + run_test: true + service_name: "codetrans-backend-server-svc" # Replace with your service name diff --git a/Translation/docker_compose/amd/gpu/rocm/README.md b/Translation/docker_compose/amd/gpu/rocm/README.md index 38b0176349..ae5e5794ee 100644 --- a/Translation/docker_compose/amd/gpu/rocm/README.md +++ b/Translation/docker_compose/amd/gpu/rocm/README.md @@ -79,7 +79,9 @@ cd GenAIExamples/Translation/docker_compose/amd/gpu/rocm ### Set environments -In the file "GenAIExamples/Translation/docker_compose/amd/gpu/rocm/set_env.sh " it is necessary to set the required values. Parameter assignments are specified in the comments for each variable setting command +In the file "GenAIExamples/Translation/docker_compose/amd/gpu/rocm/set_env.sh " it is necessary to set the required values. Parameter assignments are specified in the comments for each variable setting command. + +if you need to start a Translation service for code (instead of texts), change the `TRANSLATION_LLM_MODEL_ID` in `set_env.sh` to "Qwen/Qwen2.5-Coder-7B-Instruct". ```bash chmod +x set_env.sh @@ -97,32 +99,54 @@ docker compose up -d ## Validate TGI service ```bash +# text translation curl http://${TRANSLATION_HOST_IP}:${TRANSLATIONS_TGI_SERVICE_PORT}/generate \ -X POST \ -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \ -H 'Content-Type: application/json' +# code translation +curl http://${TRANSLATION_HOST_IP}:${TRANSLATIONS_TGI_SERVICE_PORT}/generate \ + -X POST \ + -d '{"inputs":" ### System: Please translate the following Golang codes into Python codes. 
### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:","parameters":{"max_new_tokens":17, "do_sample": true}}' \ + -H 'Content-Type: application/json' ``` ## Validate LLM service ```bash +# text translation curl http://${TRANSLATION_HOST_IP}:9000/v1/chat/completions \ -X POST \ -d '{"query":"Translate this from Chinese to English:\nChinese: 我爱机器翻译。\nEnglish:"}' \ -H 'Content-Type: application/json' +# code translation +curl http://${TRANSLATION_HOST_IP}:9000/v1/chat/completions \ + -X POST \ + -d '{"query":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:"}' \ + -H 'Content-Type: application/json' ``` ## Validate MegaService ```bash +# text translation curl http://${TRANSLATION_HOST_IP}:${TRANSLATION_BACKEND_SERVICE_PORT}/v1/translation -H "Content-Type: application/json" -d '{ - "language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' + "language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。","translate_type":"text"}' +# code translation +curl http://${TRANSLATION_HOST_IP}:${TRANSLATION_BACKEND_SERVICE_PORT}/v1/translation \ + -H "Content-Type: application/json" \ + -d '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}","translate_type":"code"}' ``` ## Validate Nginx service ```bash +# text translation curl http://${TRANSLATION_HOST_IP}:${TRANSLATION_NGINX_PORT}/v1/translation \ -H "Content-Type: application/json" \ - -d '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' + -d '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。","translate_type":"text"}' +# code 
translation +curl http://${TRANSLATION_HOST_IP}:${TRANSLATION_NGINX_PORT}/v1/translation \ + -H "Content-Type: application/json" \ + -d '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}","translate_type":"code"}' ``` diff --git a/Translation/docker_compose/amd/gpu/rocm/compose.yaml b/Translation/docker_compose/amd/gpu/rocm/compose.yaml index 2ee2a9c2cb..41625fc8b3 100644 --- a/Translation/docker_compose/amd/gpu/rocm/compose.yaml +++ b/Translation/docker_compose/amd/gpu/rocm/compose.yaml @@ -47,8 +47,9 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - LLM_ENDPOINT: ${TRANSLATION_TGI_LLM_ENDPOINT} + LLM_ENDPOINT: "http://translation-tgi-service" LLM_MODEL_ID: ${TRANSLATION_LLM_MODEL_ID} + LLM_COMPONENT_NAME: ${TRANSLATION_LLM_COMPONENT_NAME} HUGGINGFACEHUB_API_TOKEN: ${TRANSLATION_HUGGINGFACEHUB_API_TOKEN} HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 @@ -66,7 +67,7 @@ services: - https_proxy=${https_proxy} - http_proxy=${http_proxy} - MEGA_SERVICE_HOST_IP=${TRANSLATION_MEGA_SERVICE_HOST_IP} - - LLM_SERVICE_HOST_IP=${TRANSLATION_LLM_SERVICE_HOST_IP} + - LLM_SERVICE_HOST_IP="translation-llm" ipc: host restart: always translation-ui-server: diff --git a/Translation/docker_compose/intel/cpu/xeon/README.md b/Translation/docker_compose/intel/cpu/xeon/README.md index 4a41cb5385..3b81ac4e35 100644 --- a/Translation/docker_compose/intel/cpu/xeon/README.md +++ b/Translation/docker_compose/intel/cpu/xeon/README.md @@ -82,6 +82,37 @@ By default, the LLM model is set to a default value as listed below: Change the `LLM_MODEL_ID` below for your needs. +For users in China who are unable to download models directly from Huggingface, you can use [ModelScope](https://www.modelscope.cn/models) or a Huggingface mirror to download models. The vLLM/TGI can load the models either online or offline as described below: + +1. 
Online + + ```bash + export HF_TOKEN=${your_hf_token} + export HF_ENDPOINT="https://hf-mirror.com" + model_name="haoranxu/ALMA-13B" + # Start vLLM LLM Service + docker run -p 8008:80 -v ./data:/root/.cache/huggingface/hub --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80 + # Start TGI LLM Service + docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id $model_name + ``` + +2. Offline + + - Search your model name in ModelScope. For example, check [this page](https://www.modelscope.cn/models/rubraAI/Mistral-7B-Instruct-v0.3/files) for model `haoranxu/ALMA-13B`. + + - Click on `Download this model` button, and choose one way to download the model to your local path `/path/to/model`. + + - Run the following command to start the LLM service. + + ```bash + export HF_TOKEN=${your_hf_token} + export model_path="/path/to/model" + # Start vLLM LLM Service + docker run -p 8008:80 -v $model_path:/root/.cache/huggingface/hub --name vllm-service --shm-size 128g opea/vllm:latest --model /root/.cache/huggingface/hub --host 0.0.0.0 --port 80 + # Start TGI LLM Service + docker run -p 8008:80 -v $model_path:/data --name tgi-service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id /data + ``` + ### Setup Environment Variables 1. Set the required environment variables: @@ -105,6 +136,8 @@ Change the `LLM_MODEL_ID` below for your needs. 3. Set up other environment variables: + If you want to start a code translation service (instead of text translation), change the `LLM_MODEL_ID` to "mistralai/Mistral-7B-Instruct-v0.3" in `set_env.sh`. + ```bash cd ../../../ source set_env.sh @@ -113,7 +146,19 @@ Change the `LLM_MODEL_ID` below for your needs. 
### Start Microservice Docker Containers ```bash -docker compose up -d +cd GenAIExamples/Translation/docker_compose/intel/cpu/xeon +``` + +If use vLLM as the LLM serving backend. + +```bash +docker compose -f compose.yaml up -d +``` + +If use TGI as the LLM serving backend. + +```bash +docker compose -f compose_tgi.yaml up -d ``` > Note: The docker images will be automatically downloaded from `docker hub`: @@ -127,43 +172,97 @@ docker pull opea/nginx:latest ### Validate Microservices -1. TGI Service +1. LLM backend Service + + In the first startup, this service will take more time to download, load and warm up the model. After it's finished, the service will be ready. + + Try the command below to check whether the LLM serving is ready. ```bash + # vLLM service + docker logs translation-xeon-vllm-service 2>&1 | grep complete + # If the service is ready, you will get the response like below. + INFO: Application startup complete. + ``` + + ```bash + # TGI service + docker logs translation-xeon-tgi-service | grep Connected + # If the service is ready, you will get the response like below. + 2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected + ``` + + Then try the `cURL` command below to validate services. + + ```bash + # either vLLM or TGI service + # text translation curl http://${host_ip}:8008/generate \ -X POST \ -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \ -H 'Content-Type: application/json' + # code translation + curl http://${host_ip}:8008/v1/chat/completions \ + -X POST \ + -d '{"inputs":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:","parameters":{"max_new_tokens":17, "do_sample": true}}' \ + -H 'Content-Type: application/json' ``` 2. 
LLM Microservice ```bash + # text translation curl http://${host_ip}:9000/v1/chat/completions \ -X POST \ -d '{"query":"Translate this from Chinese to English:\nChinese: 我爱机器翻译。\nEnglish:"}' \ -H 'Content-Type: application/json' + # code translation + curl http://${host_ip}:9000/v1/chat/completions\ + -X POST \ + -d '{"query":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:"}' \ + -H 'Content-Type: application/json' ``` 3. MegaService ```bash + # text translation curl http://${host_ip}:8888/v1/translation -H "Content-Type: application/json" -d '{ - "language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' + "language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。","translate_type":"text"}' + # code translation + curl http://${host_ip}:8888/v1/translation \ + -H "Content-Type: application/json" \ + -d '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}","translate_type":"code"}' ``` 4. Nginx Service ```bash + # text translation curl http://${host_ip}:${NGINX_PORT}/v1/translation \ -H "Content-Type: application/json" \ - -d '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' + -d '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。","translate_type":"text"}' + # code translation + curl http://${host_ip}:${NGINX_PORT}/v1/translation \ + -H "Content-Type: application/json" \ + -d '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}","translate_type":"code"}' ``` Following the validation of all aforementioned microservices, we are now prepared to construct a mega-service. 
## 🚀 Launch the UI +### Launch with origin port + Open this URL `http://{host_ip}:5173` in your browser to access the frontend. + ![project-screenshot](../../../../assets/img/trans_ui_init.png) ![project-screenshot](../../../../assets/img/trans_ui_select.png) + +### Launch with Nginx + +If you want to launch the UI using Nginx, open this URL: `http://{host_ip}:{NGINX_PORT}` in your browser to access the frontend. + +![image](https://github.com/intel-ai-tce/GenAIExamples/assets/21761437/71214938-819c-4979-89cb-c03d937cd7b5) + +![image](https://github.com/intel-ai-tce/GenAIExamples/assets/21761437/be543e96-ddcd-4ee0-9f2c-4e99fee77e37) diff --git a/Translation/docker_compose/intel/cpu/xeon/compose.yaml b/Translation/docker_compose/intel/cpu/xeon/compose.yaml index 4b77d84484..6f6ce0ef77 100644 --- a/Translation/docker_compose/intel/cpu/xeon/compose.yaml +++ b/Translation/docker_compose/intel/cpu/xeon/compose.yaml @@ -2,33 +2,32 @@ # SPDX-License-Identifier: Apache-2.0 services: - tgi-service: - image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu - container_name: tgi-service + vllm-service: + image: ${REGISTRY:-opea}/vllm:${TAG:-latest} + container_name: translation-xeon-vllm-service ports: - "8008:80" + volumes: + - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub" + shm_size: 1g environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - host_ip: ${host_ip} + LLM_MODEL_ID: ${LLM_MODEL_ID} + VLLM_TORCH_PROFILER_DIR: "/mnt" healthcheck: test: ["CMD-SHELL", "curl -f http://$host_ip:8008/health || exit 1"] interval: 10s timeout: 10s retries: 100 - volumes: - - "${MODEL_CACHE:-./data}:/data" - shm_size: 1g - command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0 + command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 llm: image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest} - container_name: llm-textgen-server + 
container_name: translation-xeon-llm-server depends_on: - tgi-service: + vllm-service: condition: service_healthy ports: - "9000:9000" @@ -37,8 +36,9 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} + LLM_ENDPOINT: ${LLM_ENDPOINT} LLM_MODEL_ID: ${LLM_MODEL_ID} + LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME} HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 @@ -47,10 +47,10 @@ services: image: ${REGISTRY:-opea}/translation:${TAG:-latest} container_name: translation-xeon-backend-server depends_on: - - tgi-service + - vllm-service - llm ports: - - "8888:8888" + - "${BACKEND_SERVICE_PORT:-8888}:8888" environment: - no_proxy=${no_proxy} - https_proxy=${https_proxy} @@ -65,7 +65,7 @@ services: depends_on: - translation-xeon-backend-server ports: - - "5173:5173" + - "${FRONTEND_SERVICE_PORT:-5173}:5173" environment: - no_proxy=${no_proxy} - https_proxy=${https_proxy} diff --git a/Translation/docker_compose/intel/cpu/xeon/compose_tgi.yaml b/Translation/docker_compose/intel/cpu/xeon/compose_tgi.yaml new file mode 100644 index 0000000000..05470691ed --- /dev/null +++ b/Translation/docker_compose/intel/cpu/xeon/compose_tgi.yaml @@ -0,0 +1,98 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + tgi-service: + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + container_name: translation-xeon-tgi-service + ports: + - "8008:80" + volumes: + - "${MODEL_CACHE}:/data" + shm_size: 1g + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HF_HUB_DISABLE_PROGRESS_BARS: 1 + HF_HUB_ENABLE_HF_TRANSFER: 0 + host_ip: ${host_ip} + healthcheck: + test: ["CMD-SHELL", "curl -f http://$host_ip:8008/health || exit 1"] + interval: 10s + timeout: 10s + retries: 100 + command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0 
+ llm: + image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest} + container_name: translation-xeon-llm-server + depends_on: + tgi-service: + condition: service_healthy + ports: + - "9000:9000" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + LLM_ENDPOINT: ${LLM_ENDPOINT} + LLM_MODEL_ID: ${LLM_MODEL_ID} + LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HF_HUB_DISABLE_PROGRESS_BARS: 1 + HF_HUB_ENABLE_HF_TRANSFER: 0 + restart: unless-stopped + translation-xeon-backend-server: + image: ${REGISTRY:-opea}/translation:${TAG:-latest} + container_name: translation-xeon-backend-server + depends_on: + - tgi-service + - llm + ports: + - "${BACKEND_SERVICE_PORT:-8888}:8888" + environment: + - no_proxy=${no_proxy} + - https_proxy=${https_proxy} + - http_proxy=${http_proxy} + - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP} + - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP} + ipc: host + restart: always + translation-xeon-ui-server: + image: ${REGISTRY:-opea}/translation-ui:${TAG:-latest} + container_name: translation-xeon-ui-server + depends_on: + - translation-xeon-backend-server + ports: + - "${FRONTEND_SERVICE_PORT:-5173}:5173" + environment: + - no_proxy=${no_proxy} + - https_proxy=${https_proxy} + - http_proxy=${http_proxy} + - BASE_URL=${BACKEND_SERVICE_ENDPOINT} + ipc: host + restart: always + translation-xeon-nginx-server: + image: ${REGISTRY:-opea}/nginx:${TAG:-latest} + container_name: translation-xeon-nginx-server + depends_on: + - translation-xeon-backend-server + - translation-xeon-ui-server + ports: + - "${NGINX_PORT:-80}:80" + environment: + - no_proxy=${no_proxy} + - https_proxy=${https_proxy} + - http_proxy=${http_proxy} + - FRONTEND_SERVICE_IP=${FRONTEND_SERVICE_IP} + - FRONTEND_SERVICE_PORT=${FRONTEND_SERVICE_PORT} + - BACKEND_SERVICE_NAME=${BACKEND_SERVICE_NAME} + - BACKEND_SERVICE_IP=${BACKEND_SERVICE_IP} + - BACKEND_SERVICE_PORT=${BACKEND_SERVICE_PORT} + 
ipc: host + restart: always +networks: + default: + driver: bridge diff --git a/Translation/docker_compose/intel/hpu/gaudi/README.md b/Translation/docker_compose/intel/hpu/gaudi/README.md index 31ed7da040..3a49a7ab4e 100644 --- a/Translation/docker_compose/intel/hpu/gaudi/README.md +++ b/Translation/docker_compose/intel/hpu/gaudi/README.md @@ -74,6 +74,37 @@ By default, the LLM model is set to a default value as listed below: Change the `LLM_MODEL_ID` below for your needs. +For users in China who are unable to download models directly from Huggingface, you can use [ModelScope](https://www.modelscope.cn/models) or a Huggingface mirror to download models. The vLLM/TGI can load the models either online or offline as described below: + +1. Online + + ```bash + export HF_TOKEN=${your_hf_token} + export HF_ENDPOINT="https://hf-mirror.com" + model_name="haoranxu/ALMA-13B" + # Start vLLM LLM Service + docker run -p 8008:80 -v ./data:/root/.cache/huggingface/hub --name vllm-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 128g opea/vllm:latest --model $model_name --host 0.0.0.0 --port 80 + # Start TGI LLM Service + docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id $model_name + ``` + +2. Offline + + - Search your model name in ModelScope. For example, check [this page](https://www.modelscope.cn/models/rubraAI/Mistral-7B-Instruct-v0.3/files) for model `haoranxu/ALMA-13B`. + + - Click on `Download this model` button, and choose one way to download the model to your local path `/path/to/model`. + + - Run the following command to start the LLM service. 
+ + ```bash + export HF_TOKEN=${your_hf_token} + export model_path="/path/to/model" + # Start vLLM LLM Service + docker run -p 8008:80 -v $model_path:/root/.cache/huggingface/hub --name vllm-service --shm-size 128g opea/vllm:latest --model /root/.cache/huggingface/hub --host 0.0.0.0 --port 80 + # Start TGI LLM Service + docker run -p 8008:80 -v $model_path:/data --name tgi-service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id /data + ``` + ### Setup Environment Variables 1. Set the required environment variables: @@ -105,7 +136,19 @@ Change the `LLM_MODEL_ID` below for your needs. ### Start Microservice Docker Containers ```bash -docker compose up -d +cd GenAIExamples/Translation/docker_compose/intel/hpu/gaudi +``` + +If you use vLLM as the LLM serving backend: + +```bash +docker compose -f compose.yaml up -d +``` + +If you use TGI as the LLM serving backend: + +```bash +docker compose -f compose_tgi.yaml up -d ``` > Note: The docker images will be automatically downloaded from `docker hub`: @@ -119,43 +162,97 @@ docker pull opea/nginx:latest ### Validate Microservices -1. TGI Service +1. LLM backend Service + + In the first startup, this service will take more time to download, load and warm up the model. After it's finished, the service will be ready. + + Try the command below to check whether the LLM serving is ready. + + ```bash + # vLLM service + docker logs translation-gaudi-vllm-service 2>&1 | grep complete + # If the service is ready, you will get the response like below. + INFO: Application startup complete. + ``` + + ```bash + # TGI service + docker logs translation-gaudi-tgi-service | grep Connected + # If the service is ready, you will get the response like below. + 2024-09-03T02:47:53.402023Z INFO text_generation_router::server: router/src/server.rs:2311: Connected + ``` + + Then try the `cURL` command below to validate services. 
+ + ```bash + # either vLLM or TGI service + # text translation curl http://${host_ip}:8008/generate \ -X POST \ - -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \ + -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \ + -H 'Content-Type: application/json' + # code translation + curl http://${host_ip}:8008/v1/chat/completions \ + -X POST \ + -d '{"inputs":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:","parameters":{"max_new_tokens":17, "do_sample": true}}' \ -H 'Content-Type: application/json' ``` 2. LLM Microservice ```bash + # text translation curl http://${host_ip}:9000/v1/chat/completions \ -X POST \ -d '{"query":"Translate this from Chinese to English:\nChinese: 我爱机器翻译。\nEnglish:"}' \ -H 'Content-Type: application/json' + # code translation + curl http://${host_ip}:9000/v1/chat/completions\ + -X POST \ + -d '{"query":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:"}' \ + -H 'Content-Type: application/json' ``` 3. 
MegaService ```bash + # text translation curl http://${host_ip}:8888/v1/translation -H "Content-Type: application/json" -d '{ - "language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' + "language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。","translate_type":"text"}' + # code translation + curl http://${host_ip}:8888/v1/translation \ + -H "Content-Type: application/json" \ + -d '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}","translate_type":"code"}' ``` 4. Nginx Service ```bash + # text translation curl http://${host_ip}:${NGINX_PORT}/v1/translation \ -H "Content-Type: application/json" \ - -d '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' + -d '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。","translate_type":"text"}' + # code translation + curl http://${host_ip}:${NGINX_PORT}/v1/translation \ + -H "Content-Type: application/json" \ + -d '{"language_from": "Golang","language_to": "Python","source_code": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}","translate_type":"code"}' ``` Following the validation of all aforementioned microservices, we are now prepared to construct a mega-service. ## 🚀 Launch the UI +### Launch with origin port + Open this URL `http://{host_ip}:5173` in your browser to access the frontend. + ![project-screenshot](../../../../assets/img/trans_ui_init.png) ![project-screenshot](../../../../assets/img/trans_ui_select.png) + +### Launch with Nginx + +If you want to launch the UI using Nginx, open this URL: `http://{host_ip}:{NGINX_PORT}` in your browser to access the frontend. 
+ +![image](https://github.com/intel-ai-tce/GenAIExamples/assets/21761437/71214938-819c-4979-89cb-c03d937cd7b5) + +![image](https://github.com/intel-ai-tce/GenAIExamples/assets/21761437/be543e96-ddcd-4ee0-9f2c-4e99fee77e37) diff --git a/Translation/docker_compose/intel/hpu/gaudi/compose.yaml b/Translation/docker_compose/intel/hpu/gaudi/compose.yaml index 9516e60ce6..368731aa05 100644 --- a/Translation/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/Translation/docker_compose/intel/hpu/gaudi/compose.yaml @@ -2,24 +2,23 @@ # SPDX-License-Identifier: Apache-2.0 services: - tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.3.1 - container_name: tgi-gaudi-server + vllm-service: + image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest} + container_name: translation-gaudi-vllm-service ports: - "8008:80" + volumes: + - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub" environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 + HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none - ENABLE_HPU_GRAPH: true - LIMIT_HPU_GRAPH: true - USE_FLASH_ATTENTION: true - FLASH_ATTENTION_RECOMPUTE: true + LLM_MODEL_ID: ${LLM_MODEL_ID} + NUM_CARDS: ${NUM_CARDS} + VLLM_TORCH_PROFILER_DIR: "/mnt" healthcheck: test: ["CMD-SHELL", "curl -f http://$host_ip:8008/health || exit 1"] interval: 10s @@ -29,14 +28,12 @@ services: cap_add: - SYS_NICE ipc: host - volumes: - - "${MODEL_CACHE:-./data}:/data" - command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048 + command: --model $LLM_MODEL_ID --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size ${BLOCK_SIZE} --max-num-seqs ${MAX_NUM_SEQS} --max-seq_len-to-capture ${MAX_SEQ_LEN_TO_CAPTURE} llm: image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest} - container_name: llm-textgen-gaudi-server + 
container_name: translation-gaudi-llm-server depends_on: - tgi-service: + vllm-service: condition: service_healthy ports: - "9000:9000" @@ -45,8 +42,9 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} + LLM_ENDPOINT: ${LLM_ENDPOINT} LLM_MODEL_ID: ${LLM_MODEL_ID} + LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME} HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 @@ -55,10 +53,10 @@ services: image: ${REGISTRY:-opea}/translation:${TAG:-latest} container_name: translation-gaudi-backend-server depends_on: - - tgi-service + - vllm-service - llm ports: - - "8888:8888" + - "${BACKEND_SERVICE_PORT:-8888}:8888" environment: - no_proxy=${no_proxy} - https_proxy=${https_proxy} @@ -73,7 +71,7 @@ services: depends_on: - translation-gaudi-backend-server ports: - - "5173:5173" + - "${FRONTEND_SERVICE_PORT:-5173}:5173" environment: - no_proxy=${no_proxy} - https_proxy=${https_proxy} diff --git a/Translation/docker_compose/intel/hpu/gaudi/compose_tgi.yaml b/Translation/docker_compose/intel/hpu/gaudi/compose_tgi.yaml new file mode 100644 index 0000000000..65d8992c5e --- /dev/null +++ b/Translation/docker_compose/intel/hpu/gaudi/compose_tgi.yaml @@ -0,0 +1,107 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + tgi-service: + image: ghcr.io/huggingface/tgi-gaudi:2.3.1 + container_name: translation-gaudi-tgi-service + ports: + - "8008:80" + volumes: + - "${MODEL_CACHE}:/data" + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HF_HUB_DISABLE_PROGRESS_BARS: 1 + HF_HUB_ENABLE_HF_TRANSFER: 0 + HABANA_VISIBLE_DEVICES: all + OMPI_MCA_btl_vader_single_copy_mechanism: none + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true + healthcheck: + test: ["CMD-SHELL", 
"curl -f http://$host_ip:8008/health || exit 1"] + interval: 10s + timeout: 10s + retries: 100 + runtime: habana + cap_add: + - SYS_NICE + ipc: host + command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048 + llm: + image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest} + container_name: translation-gaudi-llm-server + depends_on: + tgi-service: + condition: service_healthy + ports: + - "9000:9000" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + LLM_ENDPOINT: ${LLM_ENDPOINT} + LLM_MODEL_ID: ${LLM_MODEL_ID} + LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HF_HUB_DISABLE_PROGRESS_BARS: 1 + HF_HUB_ENABLE_HF_TRANSFER: 0 + restart: unless-stopped + translation-gaudi-backend-server: + image: ${REGISTRY:-opea}/translation:${TAG:-latest} + container_name: translation-gaudi-backend-server + depends_on: + - tgi-service + - llm + ports: + - "${BACKEND_SERVICE_PORT:-8888}:8888" + environment: + - no_proxy=${no_proxy} + - https_proxy=${https_proxy} + - http_proxy=${http_proxy} + - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP} + - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP} + ipc: host + restart: always + translation-gaudi-ui-server: + image: ${REGISTRY:-opea}/translation-ui:${TAG:-latest} + container_name: translation-gaudi-ui-server + depends_on: + - translation-gaudi-backend-server + ports: + - "${FRONTEND_SERVICE_PORT:-5173}:5173" + environment: + - no_proxy=${no_proxy} + - https_proxy=${https_proxy} + - http_proxy=${http_proxy} + - BASE_URL=${BACKEND_SERVICE_ENDPOINT} + ipc: host + restart: always + translation-gaudi-nginx-server: + image: ${REGISTRY:-opea}/nginx:${TAG:-latest} + container_name: translation-gaudi-nginx-server + depends_on: + - translation-gaudi-backend-server + - translation-gaudi-ui-server + ports: + - "${NGINX_PORT:-80}:80" + environment: + - no_proxy=${no_proxy} + - https_proxy=${https_proxy} + - 
http_proxy=${http_proxy} + - FRONTEND_SERVICE_IP=${FRONTEND_SERVICE_IP} + - FRONTEND_SERVICE_PORT=${FRONTEND_SERVICE_PORT} + - BACKEND_SERVICE_NAME=${BACKEND_SERVICE_NAME} + - BACKEND_SERVICE_IP=${BACKEND_SERVICE_IP} + - BACKEND_SERVICE_PORT=${BACKEND_SERVICE_PORT} + ipc: host + restart: always + +networks: + default: + driver: bridge diff --git a/Translation/docker_compose/set_env.sh b/Translation/docker_compose/set_env.sh index aa4b428f6e..03fff33fa9 100644 --- a/Translation/docker_compose/set_env.sh +++ b/Translation/docker_compose/set_env.sh @@ -8,14 +8,15 @@ popd > /dev/null export LLM_MODEL_ID="haoranxu/ALMA-13B" -export TGI_LLM_ENDPOINT="http://${host_ip}:8008" +export LLM_ENDPOINT="http://${host_ip}:8008" +export LLM_COMPONENT_NAME="OpeaTextGenService" export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token} +export BACKEND_SERVICE_PORT=8888 export MEGA_SERVICE_HOST_IP=${host_ip} export LLM_SERVICE_HOST_IP=${host_ip} -export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/translation" -export NGINX_PORT=80 export FRONTEND_SERVICE_IP=${host_ip} export FRONTEND_SERVICE_PORT=5173 export BACKEND_SERVICE_NAME=translation export BACKEND_SERVICE_IP=${host_ip} -export BACKEND_SERVICE_PORT=8888 +export NGINX_PORT=80 +export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:${BACKEND_SERVICE_PORT}/v1/translation" diff --git a/Translation/docker_image_build/build.yaml b/Translation/docker_image_build/build.yaml index 1dad29cb7c..f06c143e92 100644 --- a/Translation/docker_image_build/build.yaml +++ b/Translation/docker_image_build/build.yaml @@ -23,6 +23,18 @@ services: dockerfile: comps/llms/src/text-generation/Dockerfile extends: translation image: ${REGISTRY:-opea}/llm-textgen:${TAG:-latest} + vllm: + build: + context: vllm + dockerfile: Dockerfile.cpu + extends: translation + image: ${REGISTRY:-opea}/vllm:${TAG:-latest} + vllm-gaudi: + build: + context: vllm-fork + dockerfile: Dockerfile.hpu + extends: translation + image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest} 
nginx: build: context: GenAIComps diff --git a/Translation/kubernetes/helm/README.md b/Translation/kubernetes/helm/README.md new file mode 100644 index 0000000000..dedc2520eb --- /dev/null +++ b/Translation/kubernetes/helm/README.md @@ -0,0 +1,18 @@ +# Deploy Translation on Kubernetes cluster + +- You should have Helm (version >= 3.15) installed. Refer to the [Helm Installation Guide](https://helm.sh/docs/intro/install/) for more information. +- For more deploy options, refer to [helm charts README](https://github.com/opea-project/GenAIInfra/tree/main/helm-charts#readme). + +## Deploy on Xeon + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install codetrans oci://ghcr.io/opea-project/charts/codetrans --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f cpu-values.yaml +``` + +## Deploy on Gaudi + +``` +export HFTOKEN="insert-your-huggingface-token-here" +helm install codetrans oci://ghcr.io/opea-project/charts/codetrans --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f gaudi-values.yaml +``` diff --git a/Translation/kubernetes/helm/cpu-values.yaml b/Translation/kubernetes/helm/cpu-values.yaml new file mode 100644 index 0000000000..313f050754 --- /dev/null +++ b/Translation/kubernetes/helm/cpu-values.yaml @@ -0,0 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +tgi: + LLM_MODEL_ID: mistralai/Mistral-7B-Instruct-v0.3 diff --git a/Translation/kubernetes/helm/gaudi-values.yaml b/Translation/kubernetes/helm/gaudi-values.yaml new file mode 100644 index 0000000000..89ed259285 --- /dev/null +++ b/Translation/kubernetes/helm/gaudi-values.yaml @@ -0,0 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +tgi: + accelDevice: "gaudi" + image: + repository: ghcr.io/huggingface/tgi-gaudi + tag: "2.3.1" + resources: + limits: + habana.ai/gaudi: 1 + MAX_INPUT_LENGTH: "1024" + MAX_TOTAL_TOKENS: "2048" + CUDA_GRAPHS: "" + OMPI_MCA_btl_vader_single_copy_mechanism: "none" + 
ENABLE_HPU_GRAPH: "true" + LIMIT_HPU_GRAPH: "true" + USE_FLASH_ATTENTION: "true" + FLASH_ATTENTION_RECOMPUTE: "true" + livenessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + startupProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + failureThreshold: 120 diff --git a/Translation/tests/test_compose_on_gaudi.sh b/Translation/tests/test_compose_on_gaudi.sh index 63167b6e74..1145e0f545 100644 --- a/Translation/tests/test_compose_on_gaudi.sh +++ b/Translation/tests/test_compose_on_gaudi.sh @@ -30,46 +30,47 @@ function build_docker_images() { cd $WORKPATH/docker_image_build git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git + git clone https://github.com/HabanaAI/vllm-fork.git && cd vllm-fork + VLLM_VER=$(git describe --tags "$(git rev-list --tags --max-count=1)") + git checkout ${VLLM_VER} &> /dev/null && cd ../ echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
- service_list="translation translation-ui llm-textgen nginx" + service_list="translation translation-ui llm-textgen vllm-gaudi nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.3.1 docker images && sleep 1s } function start_services() { cd $WORKPATH/docker_compose/intel/hpu/gaudi - export LLM_MODEL_ID="haoranxu/ALMA-13B" - export TGI_LLM_ENDPOINT="http://${ip_address}:8008" + export http_proxy=${http_proxy} + export https_proxy=${https_proxy} + export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3" + export LLM_ENDPOINT="http://${ip_address}:8008" + export LLM_COMPONENT_NAME="OpeaTextGenService" + export NUM_CARDS=1 + export BLOCK_SIZE=128 + export MAX_NUM_SEQS=256 + export MAX_SEQ_LEN_TO_CAPTURE=2048 export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export MEGA_SERVICE_HOST_IP=${ip_address} export LLM_SERVICE_HOST_IP=${ip_address} - export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:8888/v1/translation" - export NGINX_PORT=80 export FRONTEND_SERVICE_IP=${ip_address} export FRONTEND_SERVICE_PORT=5173 export BACKEND_SERVICE_NAME=translation export BACKEND_SERVICE_IP=${ip_address} export BACKEND_SERVICE_PORT=8888 + export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:${BACKEND_SERVICE_PORT}/v1/translation" + export NGINX_PORT=80 export host_ip=${ip_address} sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env # Start Docker Containers - docker compose up -d > ${LOG_PATH}/start_services_with_compose.log - - n=0 - until [[ "$n" -ge 100 ]]; do - docker logs tgi-gaudi-server > ${LOG_PATH}/tgi_service_start.log - if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then - break - fi - sleep 5s - n=$((n+1)) - done + docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log + + sleep 5s } function validate_services() { @@ -79,63 +80,65 @@ function validate_services() { local DOCKER_NAME="$4" local INPUT_DATA="$5" - 
local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") + HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') + RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') - local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then - echo "[ $SERVICE_NAME ] Content is as expected." - else - echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - else + # check response status + if [ "$HTTP_STATUS" -ne "200" ]; then echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log exit 1 + else + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + fi + # check response body + if [[ "$RESPONSE_BODY" != *"$EXPECTED_RESULT"* ]]; then + echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + exit 1 + else + echo "[ $SERVICE_NAME ] Content is as expected." fi - sleep 1s + + sleep 5s } function validate_microservices() { - # Check if the microservices are running correctly. 
- - # tgi gaudi service - validate_services \ - "${ip_address}:8008/generate" \ - "generated_text" \ - "tgi-gaudi" \ - "tgi-gaudi-server" \ - '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' - # llm microservice validate_services \ "${ip_address}:9000/v1/chat/completions" \ "data: " \ "llm" \ - "llm-textgen-gaudi-server" \ + "translation-gaudi-llm-server" \ '{"query":"Translate this from Chinese to English:\nChinese: 我爱机器翻译。\nEnglish:"}' } function validate_megaservice() { - # Curl the Mega Service + # test the megaservice for code translation validate_services \ - "${ip_address}:8888/v1/translation" \ - "translation" \ - "mega-translation" \ - "translation-gaudi-backend-server" \ - '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' + "${ip_address}:${BACKEND_SERVICE_PORT}/v1/translation" \ + "print" \ + "mega-translation" \ + "translation-gaudi-backend-server" \ + '{"language_from": "Golang","language_to": "Python","source_data": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}","translate_type":"code"}' - # test the megeservice via nginx + # test the megaservice for text translation validate_services \ - "${ip_address}:80/v1/translation" \ + "${ip_address}:${BACKEND_SERVICE_PORT}/v1/translation" \ "translation" \ + "mega-translation" \ + "translation-gaudi-backend-server" \ + '{"language_from": "Chinese","language_to": "English","source_data": "我爱机器翻译。","translate_type":"text"}' + + # test the megeservice via nginx + validate_services \ + "${ip_address}:${NGINX_PORT}/v1/translation" \ + "print" \ "mega-translation-nginx" \ "translation-gaudi-nginx-server" \ - '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' + '{"language_from": "Golang","language_to": "Python","source_data": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}","translate_type":"code"}' + } function validate_frontend() { 
@@ -168,7 +171,7 @@ function validate_frontend() { function stop_docker() { cd $WORKPATH/docker_compose/intel/hpu/gaudi - docker compose stop && docker compose rm -f + docker compose -f compose.yaml stop && docker compose rm -f } function main() { @@ -180,7 +183,7 @@ function main() { validate_microservices validate_megaservice -# validate_frontend + validate_frontend stop_docker echo y | docker system prune diff --git a/Translation/tests/test_compose_on_rocm.sh b/Translation/tests/test_compose_on_rocm.sh index 44bfc52cd1..53d19140b1 100644 --- a/Translation/tests/test_compose_on_rocm.sh +++ b/Translation/tests/test_compose_on_rocm.sh @@ -41,9 +41,13 @@ function build_docker_images() { function start_services() { cd $WORKPATH/docker_compose/amd/gpu/rocm/ + export http_proxy=${http_proxy} + export https_proxy=${http_proxy} + export TRANSLATION_TGI_SERVICE_PORT=8008 export TRANSLATION_HOST_IP=${ip_address} export TRANSLATION_LLM_MODEL_ID="haoranxu/ALMA-13B" - export TRANSLATION_TGI_LLM_ENDPOINT="http://${TRANSLATION_HOST_IP}:8008" + export TRANSLATION_TGI_LLM_ENDPOINT="http://${TRANSLATION_HOST_IP}:${TRANSLATION_TGI_SERVICE_PORT}" + export TRANSLATION_LLM_COMPONENT_NAME="OpeaTextGenService" export TRANSLATION_HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export TRANSLATION_MEGA_SERVICE_HOST_IP=${TRANSLATION_HOST_IP} export TRANSLATION_LLM_SERVICE_HOST_IP=${TRANSLATION_HOST_IP} @@ -80,25 +84,28 @@ function validate_services() { local DOCKER_NAME="$4" local INPUT_DATA="$5" - local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." 
+ HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") + HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') + RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') - local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then - echo "[ $SERVICE_NAME ] Content is as expected." - else - echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - else + # check response status + if [ "$HTTP_STATUS" -ne "200" ]; then echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log exit 1 + else + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + fi + # check response body + if [[ "$RESPONSE_BODY" != *"$EXPECTED_RESULT"* ]]; then + echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + exit 1 + else + echo "[ $SERVICE_NAME ] Content is as expected." 
fi - sleep 1s + + sleep 5s } function validate_microservices() { @@ -106,7 +113,7 @@ function validate_microservices() { # tgi for llm service validate_services \ - "${TRANSLATION_HOST_IP}:8008/generate" \ + "${TRANSLATION_HOST_IP}:${TRANSLATION_TGI_SERVICE_PORT}/generate" \ "generated_text" \ "translation-tgi-service" \ "translation-tgi-service" \ @@ -122,13 +129,21 @@ function validate_microservices() { } function validate_megaservice() { - # Curl the Mega Service + # test the megaservice for text translation + validate_services \ + "${TRANSLATION_HOST_IP}:${TRANSLATION_BACKEND_SERVICE_PORT}/v1/translation" \ + "translation" \ + "translation-backend-server" \ + "translation-backend-server" \ + '{"language_from": "Chinese","language_to": "English","source_data": "我爱机器翻译。","translate_type":"text"}' + + # test the megaservice for code translation validate_services \ - "${TRANSLATION_HOST_IP}:8888/v1/translation" \ - "translation" \ - "translation-backend-server" \ - "translation-backend-server" \ - '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' + "${TRANSLATION_HOST_IP}:${TRANSLATION_BACKEND_SERVICE_PORT}/v1/translation" \ + "print" \ + "translation-backend-server" \ + "translation-backend-server" \ + '{"language_from": "Golang","language_to": "Python","source_data": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}","translate_type":"code"}' # test the megeservice via nginx validate_services \ @@ -136,7 +151,7 @@ function validate_megaservice() { "translation" \ "translation-nginx-server" \ "translation-nginx-server" \ - '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' + '{"language_from": "Chinese","language_to": "English","source_data": "我爱机器翻译。","translate_type":"text"}' } function validate_frontend() { diff --git a/Translation/tests/test_compose_on_xeon.sh b/Translation/tests/test_compose_on_xeon.sh index 9e2ac58cb7..74e761f137 100644 --- 
a/Translation/tests/test_compose_on_xeon.sh +++ b/Translation/tests/test_compose_on_xeon.sh @@ -30,9 +30,14 @@ function build_docker_images() { cd $WORKPATH/docker_image_build git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git + git clone https://github.com/vllm-project/vllm.git && cd vllm + VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )" + echo "Check out vLLM tag ${VLLM_VER}" + git checkout ${VLLM_VER} &> /dev/null + cd ../ echo "Build all the images with --no-cache, check docker_image_build.log for details..." - service_list="translation translation-ui llm-textgen nginx" + service_list="translation translation-ui llm-textgen vllm nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu @@ -42,35 +47,29 @@ function build_docker_images() { function start_services() { cd $WORKPATH/docker_compose/intel/cpu/xeon/ - export LLM_MODEL_ID="haoranxu/ALMA-13B" - export TGI_LLM_ENDPOINT="http://${ip_address}:8008" + export http_proxy=${http_proxy} + export https_proxy=${https_proxy} + export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3" + export LLM_ENDPOINT="http://${ip_address}:8008" + export LLM_COMPONENT_NAME="OpeaTextGenService" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export MEGA_SERVICE_HOST_IP=${ip_address} export LLM_SERVICE_HOST_IP=${ip_address} - export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:8888/v1/translation" - export NGINX_PORT=80 export FRONTEND_SERVICE_IP=${ip_address} export FRONTEND_SERVICE_PORT=5173 export BACKEND_SERVICE_NAME=translation export BACKEND_SERVICE_IP=${ip_address} export BACKEND_SERVICE_PORT=8888 + export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:${BACKEND_SERVICE_PORT}/v1/translation" + export NGINX_PORT=80 export host_ip=${ip_address} sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env # Start Docker 
Containers - docker compose up -d > ${LOG_PATH}/start_services_with_compose.log - - n=0 - # wait long for llm model download - until [[ "$n" -ge 500 ]]; do - docker logs tgi-service > ${LOG_PATH}/tgi_service_start.log - if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then - break - fi - sleep 10s - n=$((n+1)) - done + docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log + + sleep 5s } function validate_services() { @@ -80,63 +79,64 @@ function validate_services() { local DOCKER_NAME="$4" local INPUT_DATA="$5" - local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") + HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') + RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') - local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then - echo "[ $SERVICE_NAME ] Content is as expected." - else - echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - else + # check response status + if [ "$HTTP_STATUS" -ne "200" ]; then echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log exit 1 + else + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." 
+ fi + # check response body + if [[ "$RESPONSE_BODY" != *"$EXPECTED_RESULT"* ]]; then + echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + exit 1 + else + echo "[ $SERVICE_NAME ] Content is as expected." fi - sleep 1s + + sleep 5s } function validate_microservices() { - # Check if the microservices are running correctly. - - # tgi for llm service - validate_services \ - "${ip_address}:8008/generate" \ - "generated_text" \ - "tgi" \ - "tgi-service" \ - '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' - # llm microservice validate_services \ "${ip_address}:9000/v1/chat/completions" \ "data: " \ "llm" \ - "llm-textgen-server" \ - '{"query":"Translate this from Chinese to English:\nChinese: 我爱机器翻译。\nEnglish:"}' + "translation-xeon-llm-server" \ + '{"query":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:"}' } function validate_megaservice() { - # Curl the Mega Service + # test the megaservice for code translation validate_services \ - "${ip_address}:8888/v1/translation" \ - "translation" \ - "mega-translation" \ - "translation-xeon-backend-server" \ - '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' + "${ip_address}:${BACKEND_SERVICE_PORT}/v1/translation" \ + "print" \ + "mega-translation" \ + "translation-xeon-backend-server" \ + '{"language_from": "Golang","language_to": "Python","source_data": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}","translate_type":"code"}' - # test the megeservice via nginx + # test the megaservice for text translation validate_services \ - "${ip_address}:80/v1/translation" \ + "${ip_address}:${BACKEND_SERVICE_PORT}/v1/translation" \ "translation" \ + "mega-translation" \ + "translation-xeon-backend-server" 
\ + '{"language_from": "Chinese","language_to": "English","source_data": "我爱机器翻译。","translate_type":"text"}' + + # test the megeservice via nginx + validate_services \ + "${ip_address}:${NGINX_PORT}/v1/translation" \ + "print" \ "mega-translation-nginx" \ "translation-xeon-nginx-server" \ - '{"language_from": "Chinese","language_to": "English","source_language": "我爱机器翻译。"}' + '{"language_from": "Golang","language_to": "Python","source_data": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}","translate_type":"code"}' } function validate_frontend() { @@ -169,7 +169,7 @@ function validate_frontend() { function stop_docker() { cd $WORKPATH/docker_compose/intel/cpu/xeon/ - docker compose stop && docker compose rm -f + docker compose -f compose.yaml stop && docker compose rm -f } function main() { diff --git a/Translation/tests/test_compose_tgi_on_gaudi.sh b/Translation/tests/test_compose_tgi_on_gaudi.sh new file mode 100644 index 0000000000..f900b433de --- /dev/null +++ b/Translation/tests/test_compose_tgi_on_gaudi.sh @@ -0,0 +1,205 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -xe +IMAGE_REPO=${IMAGE_REPO:-"opea"} +IMAGE_TAG=${IMAGE_TAG:-"latest"} +echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}" +echo "TAG=IMAGE_TAG=${IMAGE_TAG}" +export REGISTRY=${IMAGE_REPO} +export TAG=${IMAGE_TAG} +export MODEL_CACHE=${model_cache:-"./data"} + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') + +function build_docker_images() { + opea_branch=${opea_branch:-"main"} + # If the opea_branch isn't main, replace the git clone branch in Dockerfile. + if [[ "${opea_branch}" != "main" ]]; then + cd $WORKPATH + OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git" + NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git" + find . 
-type f -name "Dockerfile*" | while read -r file; do
+            echo "Processing file: $file"
+            sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
+        done
+    fi
+
+    cd $WORKPATH/docker_image_build
+    git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
+
+    echo "Build all the images with --no-cache, check docker_image_build.log for details..."
+    service_list="translation translation-ui llm-textgen nginx"
+    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
+
+    docker pull ghcr.io/huggingface/tgi-gaudi:2.3.1
+    docker images && sleep 1s
+}
+
+# Export the deployment settings, start compose_tgi.yaml in the background and
+# poll the TGI container log for "Connected" (up to 100 checks, 5s apart).
+function start_services() {
+    cd $WORKPATH/docker_compose/intel/hpu/gaudi
+
+    export http_proxy=${http_proxy}
+    export https_proxy=${https_proxy}
+    export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
+    export LLM_ENDPOINT="http://${ip_address}:8008"
+    export LLM_COMPONENT_NAME="OpeaTextGenService"
+    export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+    export MEGA_SERVICE_HOST_IP=${ip_address}
+    export LLM_SERVICE_HOST_IP=${ip_address}
+    export FRONTEND_SERVICE_IP=${ip_address}
+    export FRONTEND_SERVICE_PORT=5173
+    export BACKEND_SERVICE_NAME=translation
+    export BACKEND_SERVICE_IP=${ip_address}
+    export BACKEND_SERVICE_PORT=8888
+    export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:${BACKEND_SERVICE_PORT}/v1/translation"
+    export NGINX_PORT=80
+    export host_ip=${ip_address}
+
+    sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
+
+    # Start Docker Containers
+    docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
+
+    n=0
+    until [[ "$n" -ge 100 ]]; do
+        docker logs translation-gaudi-tgi-service > ${LOG_PATH}/tgi_service_start.log
+        if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then
+            break
+        fi
+        sleep 5s
+        n=$((n+1))
+    done
+    if [[ "$n" -ge 100 ]]; then
+        echo "[ tgi-gaudi ] 'Connected' not found in the log after $n checks; continuing."
+    fi
+}
+
+# POST INPUT_DATA as JSON to URL and check the reply.
+#   $1 URL              endpoint to call
+#   $2 EXPECTED_RESULT  substring that must appear in the response body
+#   $3 SERVICE_NAME     label used in messages and as the log file name
+#   $4 DOCKER_NAME      container whose logs are appended to that log file
+#   $5 INPUT_DATA       request body
+# Exits 1 when the status is not 200 or the substring is missing.
+function validate_services() {
+    local URL="$1"
+    local EXPECTED_RESULT="$2"
+    local SERVICE_NAME="$3"
+    local DOCKER_NAME="$4"
+    local INPUT_DATA="$5"
+
+    HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
+    # Quoted so the body is not word-split or glob-expanded before it is checked.
+    HTTP_STATUS=$(echo "$HTTP_RESPONSE" | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
+    RESPONSE_BODY=$(echo "$HTTP_RESPONSE" | sed -e 's/HTTPSTATUS\:.*//g')
+
+    docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+
+    # check response status
+    if [ "$HTTP_STATUS" -ne "200" ]; then
+        echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+        exit 1
+    else
+        echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+    fi
+    # check response body
+    if [[ "$RESPONSE_BODY" != *"$EXPECTED_RESULT"* ]]; then
+        echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
+        exit 1
+    else
+        echo "[ $SERVICE_NAME ] Content is as expected."
+    fi
+
+    sleep 5s
+}
+
+function validate_microservices() {
+    # Check if the microservices are running correctly.
+
+    # tgi gaudi service
+    validate_services \
+        "${ip_address}:8008/generate" \
+        "generated_text" \
+        "tgi-gaudi" \
+        "translation-gaudi-tgi-service" \
+        '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
+
+    # llm microservice
+    validate_services \
+        "${ip_address}:9000/v1/chat/completions" \
+        "data: " \
+        "llm" \
+        "translation-gaudi-llm-server" \
+        '{"query":"Translate this from Chinese to English:\nChinese: 我爱机器翻译。\nEnglish:"}'
+}
+
+# Exercise /v1/translation on the backend port (code and text) and through nginx.
+function validate_megaservice() {
+    # test the megaservice for code translation
+    validate_services \
+        "${ip_address}:${BACKEND_SERVICE_PORT}/v1/translation" \
+        "print" \
+        "mega-translation" \
+        "translation-gaudi-backend-server" \
+        '{"language_from": "Golang","language_to": "Python","source_data": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}","translate_type":"code"}'
+
+    # test the megaservice for text translation
+    validate_services \
+        "${ip_address}:${BACKEND_SERVICE_PORT}/v1/translation" \
+        "translation" \
+        "mega-translation" \
+        "translation-gaudi-backend-server" \
+        '{"language_from": "Chinese","language_to": "English","source_data": "我爱机器翻译。","translate_type":"text"}'
+
+    # test the megaservice via nginx
+    validate_services \
+        "${ip_address}:${NGINX_PORT}/v1/translation" \
+        "print" \
+        "mega-translation-nginx" \
+        "translation-gaudi-nginx-server" \
+        '{"language_from": "Golang","language_to": "Python","source_data": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}","translate_type":"code"}'
+
+}
+
+# Run the Playwright tests in ui/svelte inside the OPEA_e2e conda environment.
+# Exits with the Playwright status when the tests fail.
+function validate_frontend() {
+    cd $WORKPATH/ui/svelte
+    local conda_env_name="OPEA_e2e"
+    export PATH=${HOME}/miniforge3/bin/:$PATH
+    if conda info --envs | grep -q "$conda_env_name"; then
+        echo "$conda_env_name exist!"
+    else
+        conda create -n ${conda_env_name} python=3.12 -y
+    fi
+    source activate ${conda_env_name}
+
+    sed -i "s/localhost/$ip_address/g" playwright.config.ts
+
+    conda install -c conda-forge nodejs=22.6.0 -y
+    npm install && npm ci && npx playwright install --with-deps
+    node -v && npm -v && pip list
+
+    exit_status=0
+    npx playwright test || exit_status=$?
+
+    if [ $exit_status -ne 0 ]; then
+        echo "[TEST INFO]: ---------frontend test failed---------"
+        exit $exit_status
+    else
+        echo "[TEST INFO]: ---------frontend test passed---------"
+    fi
+}
+
+# Stop and remove the containers of the compose_tgi.yaml deployment.
+function stop_docker() {
+    cd $WORKPATH/docker_compose/intel/hpu/gaudi
+    docker compose -f compose_tgi.yaml stop && docker compose -f compose_tgi.yaml rm -f
+}
+
+# Clean up, build images when IMAGE_REPO is "opea", start the services, run all
+# validations, then clean up again.
+function main() {
+
+    stop_docker
+
+    if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
+    start_services
+
+    validate_microservices
+    validate_megaservice
+    validate_frontend
+
+    stop_docker
+    echo y | docker system prune
+
+}
+
+main
diff --git a/Translation/tests/test_compose_tgi_on_xeon.sh b/Translation/tests/test_compose_tgi_on_xeon.sh
new file mode 100644
index 0000000000..0e67af9bba
--- /dev/null
+++ b/Translation/tests/test_compose_tgi_on_xeon.sh
@@ -0,0 +1,226 @@
+#!/bin/bash
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+set -xe
+IMAGE_REPO=${IMAGE_REPO:-"opea"}
+IMAGE_TAG=${IMAGE_TAG:-"latest"}
+echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
+echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
+export REGISTRY=${IMAGE_REPO}
+export TAG=${IMAGE_TAG}
+export MODEL_CACHE=${model_cache:-"./data"}
+
+WORKPATH=$(dirname "$PWD")
+LOG_PATH="$WORKPATH/tests"
+ip_address=$(hostname -I | awk '{print $1}')
+
+# Build the service images from a fresh GenAIComps checkout. When opea_branch is
+# not "main", the Dockerfiles are first patched to clone that branch.
+function build_docker_images() {
+    opea_branch=${opea_branch:-"main"}
+    # If the opea_branch isn't main, replace the git clone branch in Dockerfile.
+    if [[ "${opea_branch}" != "main" ]]; then
+        cd $WORKPATH
+        OLD_STRING="RUN git clone --depth 1 https://github.com/opea-project/GenAIComps.git"
+        NEW_STRING="RUN git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git"
+        find . -type f -name "Dockerfile*" | while read -r file; do
+            echo "Processing file: $file"
+            sed -i "s|$OLD_STRING|$NEW_STRING|g" "$file"
+        done
+    fi
+
+    cd $WORKPATH/docker_image_build
+    git clone --depth 1 --branch ${opea_branch} https://github.com/opea-project/GenAIComps.git
+
+    echo "Build all the images with --no-cache, check docker_image_build.log for details..."
+    service_list="translation translation-ui llm-textgen nginx"
+    docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
+
+    docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
+    docker images && sleep 1s
+}
+
+# Export the deployment settings, start compose_tgi.yaml in the background and
+# poll the TGI container log for "Connected" (up to 500 checks, 5s apart).
+function start_services() {
+    cd $WORKPATH/docker_compose/intel/cpu/xeon/
+
+    export http_proxy=${http_proxy}
+    export https_proxy=${https_proxy}
+    export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
+    export LLM_ENDPOINT="http://${ip_address}:8008"
+    export LLM_COMPONENT_NAME="OpeaTextGenService"
+    export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+    export MEGA_SERVICE_HOST_IP=${ip_address}
+    export LLM_SERVICE_HOST_IP=${ip_address}
+    export FRONTEND_SERVICE_IP=${ip_address}
+    export FRONTEND_SERVICE_PORT=5173
+    export BACKEND_SERVICE_NAME=translation
+    export BACKEND_SERVICE_IP=${ip_address}
+    export BACKEND_SERVICE_PORT=8888
+    export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:${BACKEND_SERVICE_PORT}/v1/translation"
+    export NGINX_PORT=80
+    export host_ip=${ip_address}
+
+    sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
+
+    # Start Docker Containers
+    docker compose -f compose_tgi.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
+
+    n=0
+    # wait long for llm model download
+    until [[ "$n" -ge 500 ]]; do
+        docker logs translation-xeon-tgi-service > ${LOG_PATH}/tgi_service_start.log
+        if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then
+            break
+        fi
+        sleep 5s
+        n=$((n+1))
+    done
+    if [[ "$n" -ge 500 ]]; then
+        echo "[ tgi ] 'Connected' not found in the log after $n checks; continuing."
+    fi
+}
+
+# POST INPUT_DATA as JSON to URL and check the reply.
+#   $1 URL              endpoint to call
+#   $2 EXPECTED_RESULT  substring that must appear in the response body
+#   $3 SERVICE_NAME     label used in messages and as the log file name
+#   $4 DOCKER_NAME      container whose logs are appended to that log file
+#   $5 INPUT_DATA       request body
+# Exits 1 when the status is not 200 or the substring is missing.
+function validate_services() {
+    local URL="$1"
+    local EXPECTED_RESULT="$2"
+    local SERVICE_NAME="$3"
+    local DOCKER_NAME="$4"
+    local INPUT_DATA="$5"
+
+    HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
+    # Quoted so the body is not word-split or glob-expanded before it is checked.
+    HTTP_STATUS=$(echo "$HTTP_RESPONSE" | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
+    RESPONSE_BODY=$(echo "$HTTP_RESPONSE" | sed -e 's/HTTPSTATUS\:.*//g')
+
+    docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
+
+    # check response status
+    if [ "$HTTP_STATUS" -ne "200" ]; then
+        echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+        exit 1
+    else
+        echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+    fi
+    # check response body
+    if [[ "$RESPONSE_BODY" != *"$EXPECTED_RESULT"* ]]; then
+        echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
+        exit 1
+    else
+        echo "[ $SERVICE_NAME ] Content is as expected."
+    fi
+
+    sleep 5s
+}
+
+function validate_microservices() {
+    # Check if the microservices are running correctly.
+
+    # tgi for llm service
+    validate_services \
+        "${ip_address}:8008/generate" \
+        "generated_text" \
+        "tgi" \
+        "translation-xeon-tgi-service" \
+        '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}'
+
+    # llm microservice
+    validate_services \
+        "${ip_address}:9000/v1/chat/completions" \
+        "data: " \
+        "llm" \
+        "translation-xeon-llm-server" \
+        '{"query":"Translate this from Chinese to English:\nChinese: 我爱机器翻译。\nEnglish:"}'
+}
+
+# Exercise /v1/translation on the backend port (code and text) and through nginx.
+function validate_megaservice() {
+    # test the megaservice for code translation
+    validate_services \
+        "${ip_address}:${BACKEND_SERVICE_PORT}/v1/translation" \
+        "print" \
+        "mega-translation" \
+        "translation-xeon-backend-server" \
+        '{"language_from": "Golang","language_to": "Python","source_data": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}","translate_type":"code"}'
+
+    # test the megaservice for text translation
+    validate_services \
+        "${ip_address}:${BACKEND_SERVICE_PORT}/v1/translation" \
+        "translation" \
+        "mega-translation" \
+        "translation-xeon-backend-server" \
+        '{"language_from": "Chinese","language_to": "English","source_data": "我爱机器翻译。","translate_type":"text"}'
+
+    # test the megaservice via nginx
+    validate_services \
+        "${ip_address}:${NGINX_PORT}/v1/translation" \
+        "print" \
+        "mega-translation-nginx" \
+        "translation-xeon-nginx-server" \
+        '{"language_from": "Golang","language_to": "Python","source_data": "package main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n}","translate_type":"code"}'
+}
+
+# Run the Playwright tests in ui/svelte inside the OPEA_e2e conda environment.
+# Exits with the Playwright status when the tests fail.
+function validate_frontend() {
+    cd $WORKPATH/ui/svelte
+    local conda_env_name="OPEA_e2e"
+    export PATH=${HOME}/miniforge3/bin/:$PATH
+    if conda info --envs | grep -q "$conda_env_name"; then
+        echo "$conda_env_name exist!"
+    else
+        conda create -n ${conda_env_name} python=3.12 -y
+    fi
+    source activate ${conda_env_name}
+
+    sed -i "s/localhost/$ip_address/g" playwright.config.ts
+
+    conda install -c conda-forge nodejs=22.6.0 -y
+    npm install && npm ci && npx playwright install --with-deps
+    node -v && npm -v && pip list
+
+    exit_status=0
+    npx playwright test || exit_status=$?
+
+    if [ $exit_status -ne 0 ]; then
+        echo "[TEST INFO]: ---------frontend test failed---------"
+        exit $exit_status
+    else
+        echo "[TEST INFO]: ---------frontend test passed---------"
+    fi
+}
+
+# Stop and remove the containers of the compose_tgi.yaml deployment.
+function stop_docker() {
+    cd $WORKPATH/docker_compose/intel/cpu/xeon/
+    docker compose -f compose_tgi.yaml stop && docker compose -f compose_tgi.yaml rm -f
+}
+
+# Clean up, build images when IMAGE_REPO is "opea", start the services, run all
+# validations, then clean up again.
+function main() {
+
+    stop_docker
+
+    if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
+    start_services
+
+    validate_microservices
+    validate_megaservice
+    validate_frontend
+
+    stop_docker
+    echo y | docker system prune
+
+}
+
+main
diff --git a/Translation/tests/test_gmc_on_gaudi.sh b/Translation/tests/test_gmc_on_gaudi.sh
index 45c47c04ee..7e7f335943 100755
--- a/Translation/tests/test_gmc_on_gaudi.sh
+++ b/Translation/tests/test_gmc_on_gaudi.sh
@@ -37,7 +37,7 @@ function validate_translation() {
     kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -X POST -d '{"query":"Translate this from Chinese to English:\nChinese: 我爱机器翻译。\nEnglish:"}' -H 'Content-Type: application/json' > $LOG_PATH/gmc_translation.log
     exit_code=$?
     if [ $exit_code -ne 0 ]; then
-        echo "chatqna failed, please check the logs in ${LOG_PATH}!"
+        echo "translation failed, please check the logs in ${LOG_PATH}!"
exit 1 fi diff --git a/Translation/tests/test_gmc_on_xeon.sh b/Translation/tests/test_gmc_on_xeon.sh index 56caa54f40..9769079d83 100755 --- a/Translation/tests/test_gmc_on_xeon.sh +++ b/Translation/tests/test_gmc_on_xeon.sh @@ -37,7 +37,7 @@ function validate_translation() { kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -X POST -d '{"query":"Translate this from Chinese to English:\nChinese: 我爱机器翻译。\nEnglish:"}' -H 'Content-Type: application/json' > $LOG_PATH/gmc_translation.log exit_code=$? if [ $exit_code -ne 0 ]; then - echo "chatqna failed, please check the logs in ${LOG_PATH}!" + echo "translation failed, please check the logs in ${LOG_PATH}!" exit 1 fi diff --git a/Translation/translation.py b/Translation/translation.py index 8a5d8aad6a..ed496fdc83 100644 --- a/Translation/translation.py +++ b/Translation/translation.py @@ -1,18 +1,6 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import asyncio +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import os from comps import MegaServiceEndpoint, MicroService, ServiceOrchestrator, ServiceRoleType, ServiceType @@ -53,18 +41,34 @@ async def handle_request(self, request: Request): data = await request.json() language_from = data["language_from"] language_to = data["language_to"] - source_language = data["source_language"] - prompt_template = """ - Translate this from {language_from} to {language_to}: + source_data = data["source_data"] + translate_type = data["translate_type"] + if translate_type == "code": + prompt_template = """ + ### System: Please translate the following {language_from} codes into {language_to} codes. - {language_from}: - {source_language} + ### Original codes: + '''{language_from} - {language_to}: - """ - prompt = prompt_template.format( - language_from=language_from, language_to=language_to, source_language=source_language - ) + {source_data} + + ''' + + ### Translated codes: + """ + elif translate_type == "text": + prompt_template = """ + Translate this from {language_from} to {language_to}: + + {language_from}: + {source_data} + + {language_to}: + """ + else: + raise ValueError("Invalid translate_type") + + prompt = prompt_template.format(language_from=language_from, language_to=language_to, source_data=source_data) result_dict, runtime_graph = await self.megaservice.schedule(initial_inputs={"query": prompt}) for node, response in result_dict.items(): # Here it suppose the last microservice in the megaservice is LLM. 
diff --git a/Translation/ui/svelte/src/lib/shared/Network.ts b/Translation/ui/svelte/src/lib/shared/Network.ts index 489550818a..cee93d0f1f 100644 --- a/Translation/ui/svelte/src/lib/shared/Network.ts +++ b/Translation/ui/svelte/src/lib/shared/Network.ts @@ -23,7 +23,8 @@ export async function fetchLanguageResponse(input: string, transform: string, tr payload = { language_from: transform, language_to: transTo, - source_language: input, + source_data: input, + translate_type: "text", }; url = `${BASE_URL}`;