Skip to content

Commit

Permalink
Rendered samples and updated tags for "Best practices for optimizing large language model inference on GPUs" documentation (GoogleCloudPlatform#1413)

* added tgi bitsandbytes renderings

* added vllm quantization renderings

* fixed incorrectly named files and tags

* added missing license comments

* added files and tags for tensor parallelism

* added rendering and tags for token config

* updated region tags

* fixed mismatched region tag
  • Loading branch information
brandonroyal authored Aug 19, 2024
1 parent 5ce63c9 commit 974050a
Show file tree
Hide file tree
Showing 14 changed files with 675 additions and 11 deletions.
81 changes: 81 additions & 0 deletions ai-ml/llm-serving-gemma/tgi/tgi-7b-bitsandbytes-nf4.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Serves Gemma 7B with Hugging Face Text Generation Inference (TGI),
# quantized at load time with bitsandbytes NF4, sharded across 2 L4 GPUs.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: tgi-gemma-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gemma-server
  template:
    metadata:
      labels:
        ai.gke.io/inference-server: text-generation-inference
        ai.gke.io/model: gemma-7b-bitsandbytes-nf4
        app: gemma-server
        examples.ai.gke.io/source: user-guide
    spec:
      containers:
      - args:
        - --model-id=$(MODEL_ID)
        # Shard the model across both requested GPUs (tensor parallelism).
        - --num-shard=2
        # 4-bit NF4 quantization via bitsandbytes to reduce GPU memory use.
        - --quantize=bitsandbytes-nf4
        env:
        - name: MODEL_ID
          value: google/gemma-7b
        - name: PORT
          value: "8000"
        # Gated model: token is read from the pre-created `hf-secret` Secret.
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              key: hf_api_token
              name: hf-secret
        image: ghcr.io/huggingface/text-generation-inference:2.0.2
        name: inference-server
        resources:
          limits:
            cpu: "10"
            ephemeral-storage: 40Gi
            memory: 25Gi
            nvidia.com/gpu: 2
          requests:
            cpu: "2"
            ephemeral-storage: 40Gi
            memory: 25Gi
            nvidia.com/gpu: 2
        volumeMounts:
        # Memory-backed /dev/shm for inter-shard communication.
        - mountPath: /dev/shm
          name: dshm
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
      volumes:
      - emptyDir:
          medium: Memory
        name: dshm
---
# ClusterIP Service exposing the TGI server inside the cluster on port 8000.
apiVersion: v1
kind: Service
metadata:
  name: llm-service
spec:
  ports:
  - port: 8000
    protocol: TCP
    targetPort: 8000
  selector:
    app: gemma-server
  type: ClusterIP
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@ spec:
spec:
containers:
- name: inference-server
# [START gke_ai_ml_llm_serving_gemma_tgi_7b_bitsandbytes_nf4]
args:
- --model-id=$(MODEL_ID)
- --num-shard=2
- --quantize=bitsandbytes-nf4

# [END gke_ai_ml_llm_serving_gemma_tgi_7b_bitsandbytes_nf4]
81 changes: 81 additions & 0 deletions ai-ml/llm-serving-gemma/tgi/tgi-7b-bitsandbytes.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Serves Gemma 7B with Hugging Face Text Generation Inference (TGI),
# quantized at load time with 8-bit bitsandbytes, sharded across 2 L4 GPUs.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: tgi-gemma-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gemma-server
  template:
    metadata:
      labels:
        ai.gke.io/inference-server: text-generation-inference
        ai.gke.io/model: gemma-7b-bitsandbytes
        app: gemma-server
        examples.ai.gke.io/source: user-guide
    spec:
      containers:
      - args:
        - --model-id=$(MODEL_ID)
        # Shard the model across both requested GPUs (tensor parallelism).
        - --num-shard=2
        # 8-bit quantization via bitsandbytes to reduce GPU memory use.
        - --quantize=bitsandbytes
        env:
        - name: MODEL_ID
          value: google/gemma-7b
        - name: PORT
          value: "8000"
        # Gated model: token is read from the pre-created `hf-secret` Secret.
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              key: hf_api_token
              name: hf-secret
        image: ghcr.io/huggingface/text-generation-inference:2.0.2
        name: inference-server
        resources:
          limits:
            cpu: "10"
            ephemeral-storage: 40Gi
            memory: 25Gi
            nvidia.com/gpu: 2
          requests:
            cpu: "2"
            ephemeral-storage: 40Gi
            memory: 25Gi
            nvidia.com/gpu: 2
        volumeMounts:
        # Memory-backed /dev/shm for inter-shard communication.
        - mountPath: /dev/shm
          name: dshm
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
      volumes:
      - emptyDir:
          medium: Memory
        name: dshm
---
# ClusterIP Service exposing the TGI server inside the cluster on port 8000.
apiVersion: v1
kind: Service
metadata:
  name: llm-service
spec:
  ports:
  - port: 8000
    protocol: TCP
    targetPort: 8000
  selector:
    app: gemma-server
  type: ClusterIP
3 changes: 2 additions & 1 deletion ai-ml/llm-serving-gemma/tgi/tgi-7b-bitsandbytes/patch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@ spec:
spec:
containers:
- name: inference-server
# [START gke_ai_ml_llm_serving_gemma_tgi_7b_bitsandbytes]
args:
- --model-id=$(MODEL_ID)
- --num-shard=2
- --quantize=bitsandbytes

# [END gke_ai_ml_llm_serving_gemma_tgi_7b_bitsandbytes]
82 changes: 82 additions & 0 deletions ai-ml/llm-serving-gemma/tgi/tgi-7b-it-tensorparallelism.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Serves Gemma 7B (instruction-tuned) with Hugging Face Text Generation
# Inference (TGI), tensor-parallel across 2 L4 GPUs (no quantization).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: tgi-gemma-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gemma-server
  template:
    metadata:
      labels:
        app: gemma-server
        ai.gke.io/model: gemma-7b-it
        ai.gke.io/inference-server: text-generation-inference
        examples.ai.gke.io/source: user-guide
    spec:
      containers:
      - name: inference-server
        image: ghcr.io/huggingface/text-generation-inference:2.0.2
        resources:
          requests:
            cpu: "2"
            memory: "25Gi"
            ephemeral-storage: "40Gi"
            nvidia.com/gpu: 2
          limits:
            cpu: "10"
            memory: "25Gi"
            ephemeral-storage: "40Gi"
            nvidia.com/gpu: 2
        # Region tags mark the doc snippet; keep them adjacent to the args.
        # [START gke_ai_ml_llm_serving_gemma_tgi_7b_it_tensorparallelism]
        args:
        - --model-id=$(MODEL_ID)
        # Shard the model across both requested GPUs (tensor parallelism).
        - --num-shard=2
        # [END gke_ai_ml_llm_serving_gemma_tgi_7b_it_tensorparallelism]
        env:
        - name: MODEL_ID
          value: google/gemma-7b-it
        - name: PORT
          value: "8000"
        # Gated model: token is read from the pre-created `hf-secret` Secret.
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-secret
              key: hf_api_token
        volumeMounts:
        # Memory-backed /dev/shm for inter-shard communication.
        - mountPath: /dev/shm
          name: dshm
      volumes:
      - name: dshm
        emptyDir:
          medium: Memory
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
---
# ClusterIP Service exposing the TGI server inside the cluster on port 8000.
apiVersion: v1
kind: Service
metadata:
  name: llm-service
spec:
  selector:
    app: gemma-server
  type: ClusterIP
  ports:
  - protocol: TCP
    port: 8000
    targetPort: 8000
83 changes: 83 additions & 0 deletions ai-ml/llm-serving-gemma/tgi/tgi-7b-token.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Serves Gemma 7B with Hugging Face Text Generation Inference (TGI) on a
# single L4 GPU, with token limits tuned to fit the reduced memory budget.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: tgi-gemma-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gemma-server
  template:
    metadata:
      labels:
        ai.gke.io/inference-server: text-generation-inference
        ai.gke.io/model: gemma-7b-token
        app: gemma-server
        examples.ai.gke.io/source: user-guide
    spec:
      containers:
      - args:
        - --model-id=$(MODEL_ID)
        # Single GPU: no sharding.
        - --num-shard=1
        # Cap per-request token budgets so batches fit on one L4.
        - --max-total-tokens=3072
        - --max-batch-prefill-tokens=512
        - --max-input-length=512
        env:
        - name: MODEL_ID
          value: google/gemma-7b
        - name: PORT
          value: "8000"
        # Gated model: token is read from the pre-created `hf-secret` Secret.
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              key: hf_api_token
              name: hf-secret
        image: ghcr.io/huggingface/text-generation-inference:2.0.2
        name: inference-server
        resources:
          limits:
            cpu: "2"
            ephemeral-storage: 20Gi
            memory: 7Gi
            nvidia.com/gpu: 1
          requests:
            cpu: "2"
            ephemeral-storage: 20Gi
            memory: 7Gi
            nvidia.com/gpu: 1
        volumeMounts:
        # Memory-backed /dev/shm used by TGI for shared memory.
        - mountPath: /dev/shm
          name: dshm
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
      volumes:
      - emptyDir:
          medium: Memory
        name: dshm
---
# ClusterIP Service exposing the TGI server inside the cluster on port 8000.
apiVersion: v1
kind: Service
metadata:
  name: llm-service
spec:
  ports:
  - port: 8000
    protocol: TCP
    targetPort: 8000
  selector:
    app: gemma-server
  type: ClusterIP
3 changes: 2 additions & 1 deletion ai-ml/llm-serving-gemma/tgi/tgi-7b-token/patch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ spec:
spec:
containers:
- name: inference-server
# [START gke_ai_ml_llm_serving_gemma_tgi_7b_token]
args:
- --model-id=$(MODEL_ID)
- --num-shard=1
Expand All @@ -33,4 +34,4 @@ spec:
env:
- name: MODEL_ID
value: google/gemma-7b

# [END gke_ai_ml_llm_serving_gemma_tgi_7b_token]
Loading

0 comments on commit 974050a

Please sign in to comment.