Skip to content

Commit

Permalink
Rendered samples and updated tags for "Best practices for optimizing large language model inference on GPUs" documentation (GoogleCloudPlatform#1413)

* added tgi bitsandbytes renderings

* added vllm quantization renderings

* fixed incorrectly named files and tags

* added missing license comments

* added files and tags for tensor parallelism

* added rendering and tags for token config

* updated region tags

* fixed mismatched region tag
  • Loading branch information
brandonroyal authored Aug 19, 2024
1 parent 5ce63c9 commit 974050a
Show file tree
Hide file tree
Showing 14 changed files with 675 additions and 11 deletions.
81 changes: 81 additions & 0 deletions ai-ml/llm-serving-gemma/tgi/tgi-7b-bitsandbytes-nf4.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Serves Gemma 7B with Hugging Face Text Generation Inference (TGI),
# quantized at load time with bitsandbytes NF4, sharded across 2 L4 GPUs.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: tgi-gemma-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gemma-server
  template:
    metadata:
      labels:
        ai.gke.io/inference-server: text-generation-inference
        ai.gke.io/model: gemma-7b-bitsandbytes-nf4
        app: gemma-server
        examples.ai.gke.io/source: user-guide
    spec:
      containers:
      - args:
        - --model-id=$(MODEL_ID)
        # Shard the model across both requested GPUs (tensor parallelism).
        - --num-shard=2
        # 4-bit NF4 quantization via bitsandbytes to reduce GPU memory use.
        - --quantize=bitsandbytes-nf4
        env:
        - name: MODEL_ID
          value: google/gemma-7b
        - name: PORT
          value: "8000"
        # Gated model: token is read from the pre-created `hf-secret` Secret.
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              key: hf_api_token
              name: hf-secret
        image: ghcr.io/huggingface/text-generation-inference:2.0.2
        name: inference-server
        resources:
          limits:
            cpu: "10"
            ephemeral-storage: 40Gi
            memory: 25Gi
            nvidia.com/gpu: 2
          requests:
            cpu: "2"
            ephemeral-storage: 40Gi
            memory: 25Gi
            nvidia.com/gpu: 2
        volumeMounts:
        # Memory-backed /dev/shm for inter-shard communication.
        - mountPath: /dev/shm
          name: dshm
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
      volumes:
      - emptyDir:
          medium: Memory
        name: dshm
---
# ClusterIP Service exposing the TGI server inside the cluster on port 8000.
apiVersion: v1
kind: Service
metadata:
  name: llm-service
spec:
  ports:
  - port: 8000
    protocol: TCP
    targetPort: 8000
  selector:
    app: gemma-server
  type: ClusterIP
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@ spec:
spec:
containers:
- name: inference-server
# [START gke_ai_ml_llm_serving_gemma_tgi_7b_bitsandbytes_nf4]
args:
- --model-id=$(MODEL_ID)
- --num-shard=2
- --quantize=bitsandbytes-nf4

# [END gke_ai_ml_llm_serving_gemma_tgi_7b_bitsandbytes_nf4]
81 changes: 81 additions & 0 deletions ai-ml/llm-serving-gemma/tgi/tgi-7b-bitsandbytes.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Serves Gemma 7B with Hugging Face Text Generation Inference (TGI),
# quantized at load time with 8-bit bitsandbytes, sharded across 2 L4 GPUs.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: tgi-gemma-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gemma-server
  template:
    metadata:
      labels:
        ai.gke.io/inference-server: text-generation-inference
        ai.gke.io/model: gemma-7b-bitsandbytes
        app: gemma-server
        examples.ai.gke.io/source: user-guide
    spec:
      containers:
      - args:
        - --model-id=$(MODEL_ID)
        # Shard the model across both requested GPUs (tensor parallelism).
        - --num-shard=2
        # 8-bit quantization via bitsandbytes to reduce GPU memory use.
        - --quantize=bitsandbytes
        env:
        - name: MODEL_ID
          value: google/gemma-7b
        - name: PORT
          value: "8000"
        # Gated model: token is read from the pre-created `hf-secret` Secret.
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              key: hf_api_token
              name: hf-secret
        image: ghcr.io/huggingface/text-generation-inference:2.0.2
        name: inference-server
        resources:
          limits:
            cpu: "10"
            ephemeral-storage: 40Gi
            memory: 25Gi
            nvidia.com/gpu: 2
          requests:
            cpu: "2"
            ephemeral-storage: 40Gi
            memory: 25Gi
            nvidia.com/gpu: 2
        volumeMounts:
        # Memory-backed /dev/shm for inter-shard communication.
        - mountPath: /dev/shm
          name: dshm
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
      volumes:
      - emptyDir:
          medium: Memory
        name: dshm
---
# ClusterIP Service exposing the TGI server inside the cluster on port 8000.
apiVersion: v1
kind: Service
metadata:
  name: llm-service
spec:
  ports:
  - port: 8000
    protocol: TCP
    targetPort: 8000
  selector:
    app: gemma-server
  type: ClusterIP
3 changes: 2 additions & 1 deletion ai-ml/llm-serving-gemma/tgi/tgi-7b-bitsandbytes/patch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@ spec:
spec:
containers:
- name: inference-server
# [START gke_ai_ml_llm_serving_gemma_tgi_7b_bitsandbytes]
args:
- --model-id=$(MODEL_ID)
- --num-shard=2
- --quantize=bitsandbytes

# [END gke_ai_ml_llm_serving_gemma_tgi_7b_bitsandbytes]
82 changes: 82 additions & 0 deletions ai-ml/llm-serving-gemma/tgi/tgi-7b-it-tensorparallelism.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Serves Gemma 7B (instruction-tuned) with Hugging Face Text Generation
# Inference (TGI), tensor-parallel across 2 L4 GPUs (no quantization).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: tgi-gemma-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gemma-server
  template:
    metadata:
      labels:
        app: gemma-server
        ai.gke.io/model: gemma-7b-it
        ai.gke.io/inference-server: text-generation-inference
        examples.ai.gke.io/source: user-guide
    spec:
      containers:
      - name: inference-server
        image: ghcr.io/huggingface/text-generation-inference:2.0.2
        resources:
          requests:
            cpu: "2"
            memory: "25Gi"
            ephemeral-storage: "40Gi"
            nvidia.com/gpu: 2
          limits:
            cpu: "10"
            memory: "25Gi"
            ephemeral-storage: "40Gi"
            nvidia.com/gpu: 2
        # Region tags mark the doc snippet; keep them adjacent to the args.
        # [START gke_ai_ml_llm_serving_gemma_tgi_7b_it_tensorparallelism]
        args:
        - --model-id=$(MODEL_ID)
        # Shard the model across both requested GPUs (tensor parallelism).
        - --num-shard=2
        # [END gke_ai_ml_llm_serving_gemma_tgi_7b_it_tensorparallelism]
        env:
        - name: MODEL_ID
          value: google/gemma-7b-it
        - name: PORT
          value: "8000"
        # Gated model: token is read from the pre-created `hf-secret` Secret.
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-secret
              key: hf_api_token
        volumeMounts:
        # Memory-backed /dev/shm for inter-shard communication.
        - mountPath: /dev/shm
          name: dshm
      volumes:
      - name: dshm
        emptyDir:
          medium: Memory
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
---
# ClusterIP Service exposing the TGI server inside the cluster on port 8000.
apiVersion: v1
kind: Service
metadata:
  name: llm-service
spec:
  selector:
    app: gemma-server
  type: ClusterIP
  ports:
  - protocol: TCP
    port: 8000
    targetPort: 8000
83 changes: 83 additions & 0 deletions ai-ml/llm-serving-gemma/tgi/tgi-7b-token.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Serves Gemma 7B with Hugging Face Text Generation Inference (TGI) on a
# single L4 GPU, with token limits tuned to fit the reduced memory budget.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: tgi-gemma-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gemma-server
  template:
    metadata:
      labels:
        ai.gke.io/inference-server: text-generation-inference
        ai.gke.io/model: gemma-7b-token
        app: gemma-server
        examples.ai.gke.io/source: user-guide
    spec:
      containers:
      - args:
        - --model-id=$(MODEL_ID)
        # Single GPU: no sharding.
        - --num-shard=1
        # Cap per-request token budgets so batches fit on one L4.
        - --max-total-tokens=3072
        - --max-batch-prefill-tokens=512
        - --max-input-length=512
        env:
        - name: MODEL_ID
          value: google/gemma-7b
        - name: PORT
          value: "8000"
        # Gated model: token is read from the pre-created `hf-secret` Secret.
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              key: hf_api_token
              name: hf-secret
        image: ghcr.io/huggingface/text-generation-inference:2.0.2
        name: inference-server
        resources:
          limits:
            cpu: "2"
            ephemeral-storage: 20Gi
            memory: 7Gi
            nvidia.com/gpu: 1
          requests:
            cpu: "2"
            ephemeral-storage: 20Gi
            memory: 7Gi
            nvidia.com/gpu: 1
        volumeMounts:
        # Memory-backed /dev/shm used by TGI for shared memory.
        - mountPath: /dev/shm
          name: dshm
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
      volumes:
      - emptyDir:
          medium: Memory
        name: dshm
---
# ClusterIP Service exposing the TGI server inside the cluster on port 8000.
apiVersion: v1
kind: Service
metadata:
  name: llm-service
spec:
  ports:
  - port: 8000
    protocol: TCP
    targetPort: 8000
  selector:
    app: gemma-server
  type: ClusterIP
3 changes: 2 additions & 1 deletion ai-ml/llm-serving-gemma/tgi/tgi-7b-token/patch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ spec:
spec:
containers:
- name: inference-server
# [START gke_ai_ml_llm_serving_gemma_tgi_7b_token]
args:
- --model-id=$(MODEL_ID)
- --num-shard=1
Expand All @@ -33,4 +34,4 @@ spec:
env:
- name: MODEL_ID
value: google/gemma-7b

# [END gke_ai_ml_llm_serving_gemma_tgi_7b_token]
Loading

0 comments on commit 974050a

Please sign in to comment.