From a23e1b710727d5349750e6bdb01efe2fae12ca9e Mon Sep 17 00:00:00 2001
From: Samhita Alla
Date: Tue, 30 Jul 2024 11:03:00 +0530
Subject: [PATCH] nim docs (#1709)

* nim docs
Signed-off-by: Samhita Alla

* add docs
Signed-off-by: Samhita Alla

* add to index
Signed-off-by: Samhita Alla

* update the example
Signed-off-by: Samhita Alla

* add a note
Signed-off-by: Samhita Alla

* bump version
Signed-off-by: Samhita Alla

* add secret group
Signed-off-by: Samhita Alla
---------
Signed-off-by: Samhita Alla
---
 docs/index.md | 1 +
 docs/integrations.md | 2 +
 examples/nim_plugin/Dockerfile | 23 ++++
 examples/nim_plugin/README.md | 40 +++++++
 examples/nim_plugin/nim_plugin/__init__.py | 0
 .../nim_plugin/serve_nim_container.py | 113 ++++++++++++++++++
 examples/nim_plugin/requirements.in | 1 +
 7 files changed, 180 insertions(+)
 create mode 100644 examples/nim_plugin/Dockerfile
 create mode 100644 examples/nim_plugin/README.md
 create mode 100644 examples/nim_plugin/nim_plugin/__init__.py
 create mode 100644 examples/nim_plugin/nim_plugin/serve_nim_container.py
 create mode 100644 examples/nim_plugin/requirements.in

diff --git a/docs/index.md b/docs/index.md
index 5cf0650ea..a67bfb480 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -119,6 +119,7 @@ auto_examples/mlflow_plugin/index
auto_examples/mmcloud_agent/index
auto_examples/modin_plugin/index
auto_examples/kfmpi_plugin/index
+auto_examples/nim_plugin/index
auto_examples/onnx_plugin/index
auto_examples/openai_batch_agent/index
auto_examples/papermill_plugin/index
diff --git a/docs/integrations.md b/docs/integrations.md
index 60d68ed1d..88d97ba15 100644
--- a/docs/integrations.md
+++ b/docs/integrations.md
@@ -102,6 +102,8 @@ orchestrated by Flyte itself, within its provisioned Kubernetes clusters.
  - Run Databricks jobs in your workflows with the Databricks agent.
* - {doc}`Memory Machine Cloud `
  - Execute tasks using the MemVerge Memory Machine Cloud agent.
+* - {doc}`NIM `
+  - Serve optimized model containers with NIM.
* - {doc}`OpenAI Batch `
  - Submit requests for asynchronous batch processing on OpenAI.
* - {doc}`SageMaker Inference `
diff --git a/examples/nim_plugin/Dockerfile b/examples/nim_plugin/Dockerfile
new file mode 100644
index 000000000..0c46be23a
--- /dev/null
+++ b/examples/nim_plugin/Dockerfile
@@ -0,0 +1,23 @@
########################
# NOTE: For CI/CD only #
########################
FROM python:3.11-slim-buster
LABEL org.opencontainers.image.source=https://github.com/flyteorg/flytesnacks

WORKDIR /root
ENV VENV /opt/venv
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
ENV PYTHONPATH /root

# Install Python dependencies
COPY requirements.in /root
RUN pip install -r /root/requirements.in

# Copy the actual code
COPY . /root/

# This tag is supplied by the build script and will be used to determine the version
# when registering tasks, workflows, and launch plans
ARG tag
ENV FLYTE_INTERNAL_IMAGE $tag
diff --git a/examples/nim_plugin/README.md b/examples/nim_plugin/README.md
new file mode 100644
index 000000000..506c9eab9
--- /dev/null
+++ b/examples/nim_plugin/README.md
@@ -0,0 +1,40 @@
(nim_plugin)=

# NIM

```{eval-rst}
.. tags:: Inference, NVIDIA
```

Serve optimized model containers with NIM in a Flyte task.

[NVIDIA NIM](https://www.nvidia.com/en-in/ai/), part of NVIDIA AI Enterprise, provides a streamlined path
for developing AI-powered enterprise applications and deploying AI models in production.
It includes an out-of-the-box optimization suite, enabling AI model deployment across any cloud,
data center, or workstation. Because NIM can be self-hosted, you retain greater control over cost and data privacy,
and gain more visibility into behind-the-scenes operations.

With NIM, you can invoke the model's endpoint as if it were hosted locally, minimizing network overhead.

## Installation

To use the NIM plugin, run the following command:

```
pip install flytekitplugins-inference
```

## Example usage

For a usage example, see {doc}`NIM example usage `.

```{note}
NIM can only be run in a Flyte cluster, not locally, as it must be deployed as a sidecar service in a Kubernetes pod.
```

```{toctree}
:maxdepth: -1
:hidden:

serve_nim_container
```
diff --git a/examples/nim_plugin/nim_plugin/__init__.py b/examples/nim_plugin/nim_plugin/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/nim_plugin/nim_plugin/serve_nim_container.py b/examples/nim_plugin/nim_plugin/serve_nim_container.py
new file mode 100644
index 000000000..7cf401205
--- /dev/null
+++ b/examples/nim_plugin/nim_plugin/serve_nim_container.py
@@ -0,0 +1,113 @@
# %% [markdown]
# (serve_nim_container)=
#
# # Serve Generative AI Models with NIM
#
# This guide demonstrates how to serve a Llama 3 8B model with NIM from within a Flyte task.
#
# First, instantiate NIM by importing it from the `flytekitplugins.inference` package and specifying the image name along with the necessary secrets.
# The `ngc_image_secret` is required to pull the image from NGC, the `ngc_secret_key` is used to pull models
# from NGC once the container is up and running, and `secrets_prefix` is the environment variable prefix used to access {ref}`secrets `.
#
# Below is a simple task that serves a Llama NIM container:
# %%
from flytekit import ImageSpec, Resources, Secret, task
from flytekit.extras.accelerators import A10G
from flytekitplugins.inference import NIM, NIMSecrets
from openai import OpenAI

image = ImageSpec(
    name="nim",
    registry="ghcr.io/flyteorg",
    packages=["flytekitplugins-inference"],
)

nim_instance = NIM(
    image="nvcr.io/nim/meta/llama3-8b-instruct:1.0.0",
    secrets=NIMSecrets(
        ngc_image_secret="nvcrio-cred",
        ngc_secret_key="ngc-api-key",
        ngc_secret_group="ngc",
        secrets_prefix="_FSEC_",
    ),
)


@task(
    container_image=image,
    pod_template=nim_instance.pod_template,
    accelerator=A10G,
    secret_requests=[
        Secret(
            group="ngc", key="ngc-api-key", mount_requirement=Secret.MountType.ENV_VAR
        )  # must be mounted as an env var
    ],
    requests=Resources(gpu="0"),  # the GPU is consumed by the NIM sidecar, not the task container
)
def model_serving() -> str:
    client = OpenAI(base_url=f"{nim_instance.base_url}/v1", api_key="nim")  # api key required but ignored

    completion = client.chat.completions.create(
        model="meta/llama3-8b-instruct",
        messages=[
            {
                "role": "user",
                "content": "Write a limerick about the wonders of GPU computing.",
            }
        ],
        temperature=0.5,
        top_p=1,
        max_tokens=1024,
    )

    return completion.choices[0].message.content


# %% [markdown]
# :::{important}
# Replace `ghcr.io/flyteorg` with a container registry to which you can publish.
# To upload the image to the local registry in the demo cluster, set the registry to `localhost:30000`.
# :::
#
# The `model_serving` task starts a sidecar service that serves the model, making it accessible on localhost via the `base_url` property.
# Both the completions and chat completions endpoints can be used.
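#
# As a sketch (not part of the original example), the same `client` object inside the `model_serving` task
# could also call the plain completions endpoint; the prompt and `max_tokens` value below are illustrative:
#
# ```python
# response = client.completions.create(
#     model="meta/llama3-8b-instruct",
#     prompt="Write a haiku about GPU computing.",
#     max_tokens=128,
# )
# print(response.choices[0].text)
# ```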
+# +# You need to mount the secret as an environment variable, as it must be accessed by the `NGC_API_KEY` environment variable within the NIM container. +# +# By default, the NIM instantiation sets `cpu`, `gpu`, and `mem` to `1`, `1`, and `20Gi`, respectively. You can modify these settings as needed. +# +# To serve a fine-tuned Llama model, specify the HuggingFace repo ID in `hf_repo_ids` as `[]` and the +# LoRa adapter memory as `lora_adapter_mem`. Set the `NIM_PEFT_SOURCE` environment variable by +# including `env={"NIM_PEFT_SOURCE": "..."}` in the task decorator. +# +# Here is an example initialization for a fine-tuned Llama model: +# %% +nim_instance = NIM( + image="nvcr.io/nim/meta/llama3-8b-instruct:1.0.0", + secrets=NIMSecrets( + ngc_image_secret="nvcrio-cred", + ngc_secret_key="ngc-api-key", + ngc_secret_group="ngc", + secrets_prefix="_FSEC_", + hf_token_key="hf-key", + hf_token_group="hf", + ), + hf_repo_ids=[""], + lora_adapter_mem="500Mi", + env={"NIM_PEFT_SOURCE": "/home/nvs/loras"}, +) + +# %% [markdown] +# :::{note} +# Native directory and NGC support for LoRa adapters coming soon. +# ::: +# +# NIM containers can be integrated into different stages of your AI workflow, including data pre-processing, +# model inference, and post-processing. Flyte also allows serving multiple NIM containers simultaneously, +# each with different configurations on various instances. +# +# This integration enables you to self-host and serve optimized AI models on your own infrastructure, +# ensuring full control over costs and data security. By eliminating dependence on third-party APIs for AI model access, +# you gain not only enhanced control but also potentially lower expenses compared to traditional API services. +# +# For more detailed information, refer to the [NIM documentation by NVIDIA](https://docs.nvidia.com/nim/large-language-models/latest/introduction.html). diff --git a/examples/nim_plugin/requirements.in b/examples/nim_plugin/requirements.in new file mode 100644 index 000000000..c36343873 --- /dev/null +++ b/examples/nim_plugin/requirements.in @@ -0,0 +1 @@ +flytekitplugins-inference>=1.13.1a5