forked from GoogleCloudPlatform/kubernetes-engine-samples
Rendered samples and updated tags for `Best practices for optimizing large language model inference on GPUs` documentation (GoogleCloudPlatform#1413)

* added tgi bitsandbytes renderings
* added vllm quantization renderings
* fixed incorrectly named files and tags
* added missing license comments
* added files and tags for tensor parallelism
* added rendering and tags for token config
* updated region tags
* fixed mismatched region tag
Commit 974050a (1 parent: 5ce63c9)
Showing 14 changed files with 675 additions and 11 deletions.
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: apps/v1
kind: Deployment
metadata:
  name: tgi-gemma-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gemma-server
  template:
    metadata:
      labels:
        ai.gke.io/inference-server: text-generation-inference
        ai.gke.io/model: gemma-7b-bitsandbytes-nf4
        app: gemma-server
        examples.ai.gke.io/source: user-guide
    spec:
      containers:
      - args:
        - --model-id=$(MODEL_ID)
        - --num-shard=2
        - --quantize=bitsandbytes-nf4  # load weights in 4-bit NF4 precision via bitsandbytes
        env:
        - name: MODEL_ID
          value: google/gemma-7b
        - name: PORT
          value: "8000"
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              key: hf_api_token
              name: hf-secret
        image: ghcr.io/huggingface/text-generation-inference:2.0.2
        name: inference-server
        resources:
          limits:
            cpu: "10"
            ephemeral-storage: 40Gi
            memory: 25Gi
            nvidia.com/gpu: 2
          requests:
            cpu: "2"
            ephemeral-storage: 40Gi
            memory: 25Gi
            nvidia.com/gpu: 2
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
      volumes:
      - emptyDir:
          medium: Memory
        name: dshm
---
apiVersion: v1
kind: Service
metadata:
  name: llm-service
spec:
  ports:
  - port: 8000
    protocol: TCP
    targetPort: 8000
  selector:
    app: gemma-server
  type: ClusterIP
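Once a Deployment/Service pair like the one above is applied, the model can be exercised through the `llm-service` ClusterIP Service. Below is a minimal client sketch, assuming the Service has been port-forwarded locally (for example, `kubectl port-forward service/llm-service 8000:8000`) and using TGI's `/generate` endpoint; the prompt and generation parameters are illustrative and not part of the sample.

import requests

# Assumes `kubectl port-forward service/llm-service 8000:8000` is running,
# so the TGI server is reachable on localhost:8000.
payload = {
    "inputs": "Why is the sky blue?",       # illustrative prompt
    "parameters": {"max_new_tokens": 128},  # illustrative generation settings
}
response = requests.post("http://localhost:8000/generate", json=payload, timeout=120)
response.raise_for_status()
print(response.json()["generated_text"])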
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: apps/v1
kind: Deployment
metadata:
  name: tgi-gemma-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gemma-server
  template:
    metadata:
      labels:
        ai.gke.io/inference-server: text-generation-inference
        ai.gke.io/model: gemma-7b-bitsandbytes
        app: gemma-server
        examples.ai.gke.io/source: user-guide
    spec:
      containers:
      - args:
        - --model-id=$(MODEL_ID)
        - --num-shard=2
        - --quantize=bitsandbytes  # load weights in 8-bit precision via bitsandbytes
        env:
        - name: MODEL_ID
          value: google/gemma-7b
        - name: PORT
          value: "8000"
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              key: hf_api_token
              name: hf-secret
        image: ghcr.io/huggingface/text-generation-inference:2.0.2
        name: inference-server
        resources:
          limits:
            cpu: "10"
            ephemeral-storage: 40Gi
            memory: 25Gi
            nvidia.com/gpu: 2
          requests:
            cpu: "2"
            ephemeral-storage: 40Gi
            memory: 25Gi
            nvidia.com/gpu: 2
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
      volumes:
      - emptyDir:
          medium: Memory
        name: dshm
---
apiVersion: v1
kind: Service
metadata:
  name: llm-service
spec:
  ports:
  - port: 8000
    protocol: TCP
    targetPort: 8000
  selector:
    app: gemma-server
  type: ClusterIP
ai-ml/llm-serving-gemma/tgi/tgi-7b-it-tensorparallelism.yaml (82 additions, 0 deletions)
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: apps/v1
kind: Deployment
metadata:
  name: tgi-gemma-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gemma-server
  template:
    metadata:
      labels:
        app: gemma-server
        ai.gke.io/model: gemma-7b-it
        ai.gke.io/inference-server: text-generation-inference
        examples.ai.gke.io/source: user-guide
    spec:
      containers:
      - name: inference-server
        image: ghcr.io/huggingface/text-generation-inference:2.0.2
        resources:
          requests:
            cpu: "2"
            memory: "25Gi"
            ephemeral-storage: "40Gi"
            nvidia.com/gpu: 2
          limits:
            cpu: "10"
            memory: "25Gi"
            ephemeral-storage: "40Gi"
            nvidia.com/gpu: 2
        # [START gke_ai_ml_llm_serving_gemma_tgi_7b_it_tensorparallelism]
        args:
        - --model-id=$(MODEL_ID)
        - --num-shard=2  # tensor parallelism: shard the model across the 2 requested GPUs
        # [END gke_ai_ml_llm_serving_gemma_tgi_7b_it_tensorparallelism]
        env:
        - name: MODEL_ID
          value: google/gemma-7b-it
        - name: PORT
          value: "8000"
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-secret
              key: hf_api_token
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
      volumes:
      - name: dshm
        emptyDir:
          medium: Memory
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
---
apiVersion: v1
kind: Service
metadata:
  name: llm-service
spec:
  selector:
    app: gemma-server
  type: ClusterIP
  ports:
  - protocol: TCP
    port: 8000
    targetPort: 8000
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: apps/v1
kind: Deployment
metadata:
  name: tgi-gemma-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gemma-server
  template:
    metadata:
      labels:
        ai.gke.io/inference-server: text-generation-inference
        ai.gke.io/model: gemma-7b-token
        app: gemma-server
        examples.ai.gke.io/source: user-guide
    spec:
      containers:
      - args:
        - --model-id=$(MODEL_ID)
        - --num-shard=1
        - --max-total-tokens=3072  # upper bound on prompt + generated tokens per request
        - --max-batch-prefill-tokens=512  # cap on tokens processed in a single prefill batch
        - --max-input-length=512  # maximum prompt length, in tokens
        env:
        - name: MODEL_ID
          value: google/gemma-7b
        - name: PORT
          value: "8000"
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              key: hf_api_token
              name: hf-secret
        image: ghcr.io/huggingface/text-generation-inference:2.0.2
        name: inference-server
        resources:
          limits:
            cpu: "2"
            ephemeral-storage: 20Gi
            memory: 7Gi
            nvidia.com/gpu: 1
          requests:
            cpu: "2"
            ephemeral-storage: 20Gi
            memory: 7Gi
            nvidia.com/gpu: 1
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
      volumes:
      - emptyDir:
          medium: Memory
        name: dshm
---
apiVersion: v1
kind: Service
metadata:
  name: llm-service
spec:
  ports:
  - port: 8000
    protocol: TCP
    targetPort: 8000
  selector:
    app: gemma-server
  type: ClusterIP
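The three token flags above define the per-request token budget. A small arithmetic sketch of how they relate (plain Python, with values copied from the manifest; the variable names are just placeholders mirroring the flags):

# Mirror of the flag values set in the manifest above (illustrative only).
max_input_length = 512          # longest allowed prompt, in tokens
max_batch_prefill_tokens = 512  # tokens processed per prefill batch
max_total_tokens = 3072         # prompt + generated tokens per request

# A maximum-length prompt must fit within one prefill batch and still leave
# room for generation.
assert max_input_length <= max_batch_prefill_tokens
assert max_input_length < max_total_tokens

# Tokens left for generation when the prompt is at its maximum length.
print(max_total_tokens - max_input_length)  # 2560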