vllm-project · brian-dellabetta · Feb 17, 2025 · Feb 17, 2025 · Feb 17, 2025 · Feb 20, 2025
diff --git a/src/llmcompressor/transformers/tracing/__init__.py b/src/llmcompressor/transformers/tracing/__init__.py
@@ -8,16 +8,18 @@
     Qwen2VLForConditionalGeneration as TraceableQwen2VLForConditionalGeneration,
 )
 from .idefics3 import (
-    Idefics3ForConditionalGeneration as TraceableIdefics3ForConditionalGeneration
+    Idefics3ForConditionalGeneration as TraceableIdefics3ForConditionalGeneration,
 )
 from .whisper import (
-    WhisperForConditionalGeneration as TraceableWhisperForConditionalGeneration
+    WhisperForConditionalGeneration as TraceableWhisperForConditionalGeneration,
 )
 from .qwen2_5_vl import (
     Qwen2_5_VLForConditionalGeneration as TraceableQwen2_5_VLForConditionalGeneration
 )
+from .debug import get_model_class
 
 __all__ = [
+    "get_model_class",
     "TraceableLlavaForConditionalGeneration",
     "TraceableMllamaForConditionalGeneration",
     "TraceableQwen2VLForConditionalGeneration",

diff --git a/src/llmcompressor/transformers/tracing/debug.py b/src/llmcompressor/transformers/tracing/debug.py
@@ -12,6 +12,10 @@
 from llmcompressor.transformers import TextGenerationDataset
 from llmcompressor.args import DatasetArguments
 
+__all__ = [
+    "get_model_class",
+]
+
 
 def parse_args():
     parser = argparse.ArgumentParser(description="Trace a model into subgraphs")

diff --git a/tests/e2e/e2e_utils.py b/tests/e2e/e2e_utils.py
@@ -1,23 +1,27 @@
+import torch
 from datasets import load_dataset
 from loguru import logger
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoProcessor
 
 from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier
 from llmcompressor.transformers import oneshot
+from llmcompressor.transformers.tracing import get_model_class
 from tests.test_timer.timer_utils import log_time
-from tests.testing_utils import preprocess_tokenize_dataset
+from tests.testing_utils import process_dataset
 
 
 @log_time
-def _load_model_and_tokenizer(
+def _load_model_and_processor(
     model: str,
+    model_class: str,
     device: str,
 ):
-    loaded_model = AutoModelForCausalLM.from_pretrained(
+    pretrained_model_class = get_model_class(model_class)
+    loaded_model = pretrained_model_class.from_pretrained(
         model, device_map=device, torch_dtype="auto"
     )
-    tokenizer = AutoTokenizer.from_pretrained(model)
-    return loaded_model, tokenizer
+    processor = AutoProcessor.from_pretrained(model)
+    return loaded_model, processor
 
 
 @log_time
@@ -30,6 +34,7 @@ def _run_oneshot(device: str, **oneshot_kwargs):
 
 def run_oneshot_for_e2e_testing(
     model: str,
+    model_class: str,
     device: str,
     num_calibration_samples: int,
     max_seq_length: int,
@@ -43,16 +48,27 @@ def run_oneshot_for_e2e_testing(
     # Load model.
     oneshot_kwargs = {}
 
-    loaded_model, tokenizer = _load_model_and_tokenizer(model=model, device=device)
+    loaded_model, processor = _load_model_and_processor(
+        model=model, model_class=model_class, device=device
+    )
 
     if dataset_id:
         ds = load_dataset(dataset_id, name=dataset_config, split=dataset_split)
         ds = ds.shuffle(seed=42).select(range(num_calibration_samples))
-        ds = preprocess_tokenize_dataset(ds, tokenizer, max_seq_length)
+        ds = process_dataset(ds, processor, max_seq_length)
         oneshot_kwargs["dataset"] = ds
         oneshot_kwargs["max_seq_length"] = max_seq_length
         oneshot_kwargs["num_calibration_samples"] = num_calibration_samples
 
+        # Define a data collator for multimodal inputs.
+        if "flickr30k" in dataset_id:
+
+            def data_collator(batch):
+                assert len(batch) == 1
+                return {key: torch.tensor(value) for key, value in batch[0].items()}
+
+            oneshot_kwargs["data_collator"] = data_collator
+
     oneshot_kwargs["model"] = loaded_model
     if recipe:
         oneshot_kwargs["recipe"] = recipe
@@ -72,4 +88,4 @@ def run_oneshot_for_e2e_testing(
     logger.info("ONESHOT KWARGS", oneshot_kwargs)
     _run_oneshot(device=device, **oneshot_kwargs)
 
-    return oneshot_kwargs["model"], tokenizer
+    return oneshot_kwargs["model"], processor
diff --git a/tests/e2e/vLLM/lm_eval_configs/fp8_dynamic_per_token.yaml b/tests/e2e/vLLM/lm_eval_configs/fp8_dynamic_per_token.yaml
diff --git a/tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml b/tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml
@@ -3,7 +3,7 @@ quant_stage:
     SmoothQuantModifier:
       smoothing_strength: 0.8
     GPTQModifier:
-      ignore: [lm_head]
+      ignore: ["lm_head", "re:vision_tower.*", "re:multi_modal_projector.*", "re:visual.*", "re:vision_model.*"]
       config_groups:
         group_0:
           weights: {num_bits: 8, type: int, symmetric: true, strategy: channel}

diff --git a/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml b/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml
@@ -1,7 +1,7 @@
 quant_stage:
   quant_modifiers:
     GPTQModifier:
-      ignore: ["lm_head"]
+      ignore: ["lm_head", "re:vision_tower.*", "re:multi_modal_projector.*", "re:visual.*", "re:vision_model.*"]
       config_groups:
         group_0:
           weights:

diff --git a/tests/e2e/vLLM/test_vllm.py b/tests/e2e/vLLM/test_vllm.py
@@ -71,6 +71,7 @@ def set_up(self):
             pytest.skip("Skipping test; cadence mismatch")
 
         self.model = eval_config["model"]
+        self.model_class = eval_config.get("model_class", "AutoModelForCausalLM")
         self.scheme = eval_config.get("scheme")
         self.dataset_id = eval_config.get("dataset_id")
         self.dataset_config = eval_config.get("dataset_config")
@@ -104,6 +105,7 @@ def test_vllm(self):
             self.save_dir = self.model.split("/")[1] + f"-{self.scheme}"
         oneshot_model, tokenizer = run_oneshot_for_e2e_testing(
             model=self.model,
+            model_class=self.model_class,
             device=self.device,
             num_calibration_samples=self.num_calibration_samples,
             max_seq_length=self.max_seq_length,

diff --git a/tests/integration/__init__.py → tests/lmeval/__init__.py b/tests/integration/__init__.py → tests/lmeval/__init__.py
diff --git a/tests/lmeval/configs/fp8_dynamic_per_token.yaml b/tests/lmeval/configs/fp8_dynamic_per_token.yaml
@@ -0,0 +1,7 @@
+cadence: "weekly"
+model: meta-llama/Meta-Llama-3-8B-Instruct
+scheme: FP8_DYNAMIC
+lmeval:
+  metrics:
+    exact_match,flexible-extract: 0.75
+    exact_match,strict-match: 0.75
diff --git a/...m_eval_configs/fp8_static_per_tensor.yaml → ...lmeval/configs/fp8_static_per_tensor.yaml b/...m_eval_configs/fp8_static_per_tensor.yaml → ...lmeval/configs/fp8_static_per_tensor.yaml
@@ -1,10 +1,9 @@
 cadence: "weekly"
 model: meta-llama/Meta-Llama-3-8B-Instruct
 scheme: FP8
-num_fewshot: 5
-limit: 1000
-task: "gsm8k"
 dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
-exact_match,flexible-extract: 0.75
-exact_match,strict-match: 0.75
+lmeval:
+  metrics:
+    exact_match,flexible-extract: 0.75
+    exact_match,strict-match: 0.75
diff --git a/..._configs/int8_w8a8_dynamic_per_token.yaml → .../configs/int8_w8a8_dynamic_per_token.yaml b/..._configs/int8_w8a8_dynamic_per_token.yaml → .../configs/int8_w8a8_dynamic_per_token.yaml
@@ -2,10 +2,9 @@ cadence: "weekly"
 model: meta-llama/Meta-Llama-3-8B-Instruct
 scheme: INT8_dyn_per_token
 recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml
-num_fewshot: 5
-limit: 1000
-task: "gsm8k"
 dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
-exact_match,flexible-extract: 0.77
-exact_match,strict-match: 0.76
+lmeval:
+  metrics:
+    exact_match,flexible-extract: 0.77
+    exact_match,strict-match: 0.76
diff --git a/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml
@@ -0,0 +1,16 @@
+cadence: weekly
+model: Qwen/Qwen2-VL-2B-Instruct
+model_class: TraceableQwen2VLForConditionalGeneration
+scheme: FP8_DYNAMIC
+lmeval:
+  model: "hf-multimodal"
+  model_args:
+    dtype: bfloat16
+    add_bos_token: True
+    convert_img_format: True
+  task: mmmu_val_economics
+  num_fewshot: 0
+  limit: 1000
+  batch_size: 8
+  metrics:
+    acc,none: 0.333
diff --git a/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml
@@ -0,0 +1,19 @@
+cadence: "weekly"
+model: llava-hf/llava-1.5-7b-hf
+model_class: TraceableLlavaForConditionalGeneration
+scheme: INT8_dyn_per_token
+recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml
+dataset_id: lmms-lab/flickr30k
+dataset_split: "test[:512]"
+lmeval:
+  model: "hf-multimodal"
+  model_args:
+    dtype: bfloat16
+    add_bos_token: True
+    convert_img_format: True
+  task: mmmu_val_economics
+  num_fewshot: 0
+  limit: 1000
+  metrics:
+    acc,none: 0.233
+  batch_size: 8
diff --git a/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml b/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml
@@ -0,0 +1,19 @@
+cadence: "weekly"
+model: Qwen/Qwen2-VL-2B-Instruct
+model_class: TraceableQwen2VLForConditionalGeneration
+recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml
+dataset_id: lmms-lab/flickr30k
+dataset_split: "test[:512]"
+scheme: W4A16_actorder_group
+lmeval:
+  model: "hf-multimodal"
+  model_args:
+    dtype: bfloat16
+    add_bos_token: True
+    convert_img_format: True
+  task: mmmu_val_economics
+  num_fewshot: 0
+  limit: 1000
+  metrics:
+    acc,none: 0.4
+  batch_size: 4
diff --git a/...m_eval_configs/w4a16_actorder_weight.yaml → ...lmeval/configs/w4a16_actorder_weight.yaml b/...m_eval_configs/w4a16_actorder_weight.yaml → ...lmeval/configs/w4a16_actorder_weight.yaml
@@ -1,11 +1,10 @@
 cadence: "weekly"
 model: meta-llama/Meta-Llama-3-8B-Instruct
+scheme: W4A16_actorder_group
 recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml
-num_fewshot: 5
-limit: 1000
-task: "gsm8k"
 dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
-exact_match,flexible-extract: 0.72
-exact_match,strict-match: 0.72
-scheme: W4A16_actorder_group
+lmeval:
+  metrics:
+    exact_match,flexible-extract: 0.72
+    exact_match,strict-match: 0.72
diff --git a/.../lm_eval_configs/w4a16_grouped_quant.yaml → ...s/lmeval/configs/w4a16_grouped_quant.yaml b/.../lm_eval_configs/w4a16_grouped_quant.yaml → ...s/lmeval/configs/w4a16_grouped_quant.yaml
@@ -1,11 +1,10 @@
 cadence: "weekly"
 model: meta-llama/Meta-Llama-3-8B-Instruct
-num_fewshot: 5
-limit: 1000
-task: "gsm8k"
-exact_match,flexible-extract: 0.72
-exact_match,strict-match: 0.72
 scheme: W4A16
 dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
-quant_type: "GPTQ"
+quant_type: "GPTQ"
+lmeval:
+  metrics:
+    exact_match,flexible-extract: 0.72
+    exact_match,strict-match: 0.72
diff --git a/tests/e2e/vLLM/test_lmeval.py → tests/lmeval/test_lmeval.py b/tests/e2e/vLLM/test_lmeval.py → tests/lmeval/test_lmeval.py
@@ -6,11 +6,23 @@
 import pytest
 import yaml
 from loguru import logger
+from pydantic import BaseModel
 
 from llmcompressor.core import active_session
 from tests.e2e.e2e_utils import run_oneshot_for_e2e_testing
 from tests.examples.utils import requires_gpu_count
 
+
+class LmEvalConfig(BaseModel):  # lm_eval settings from a config's `lmeval` key
+    model: str = "hf"  # passed as `model` to lm_eval.simple_evaluate
+    model_args: dict = {"add_bos_token": True, "dtype": "bfloat16"}
+    task: str = "gsm8k"  # single task to run; also keys the results dict
+    num_fewshot: int = 5
+    limit: int = 1000
+    metrics: dict  # required: metric name -> expected value (rtol=0.05)
+    batch_size: int = 100
+
+
 try:
     import lm_eval
 
@@ -51,18 +63,15 @@ def set_up(self):
             pytest.skip("Skipping test; cadence mismatch")
 
         self.model = eval_config["model"]
+        self.model_class = eval_config.get("model_class", "AutoModelForCausalLM")
+        self.lmeval = LmEvalConfig(**eval_config.get("lmeval", {}))
         self.scheme = eval_config.get("scheme")
         self.dataset_id = eval_config.get("dataset_id")
         self.dataset_config = eval_config.get("dataset_config")
         self.dataset_split = eval_config.get("dataset_split")
         self.recipe = eval_config.get("recipe")
         self.quant_type = eval_config.get("quant_type")
         self.save_dir = eval_config.get("save_dir")
-        self.task = eval_config.get("task")
-        self.num_fewshot = eval_config.get("num_fewshot")
-        self.limit = eval_config.get("limit")
-        self.exact_flex = eval_config.get("exact_match,flexible-extract")
-        self.exact_strict = eval_config.get("exact_match,strict-match")
 
         logger.info("========== RUNNING ==============")
         logger.info(self.scheme)
@@ -76,8 +85,9 @@ def test_lm_eval(self):
         self.set_up()
         if not self.save_dir:
             self.save_dir = self.model.split("/")[1] + f"-{self.scheme}"
-        oneshot_model, tokenizer = run_oneshot_for_e2e_testing(
+        oneshot_model, processor = run_oneshot_for_e2e_testing(
             model=self.model,
+            model_class=self.model_class,
             device=self.device,
             num_calibration_samples=self.num_calibration_samples,
             max_seq_length=self.max_seq_length,
@@ -91,7 +101,7 @@ def test_lm_eval(self):
 
         logger.info("================= SAVING TO DISK ======================")
         oneshot_model.save_pretrained(self.save_dir)
-        tokenizer.save_pretrained(self.save_dir)
+        processor.save_pretrained(self.save_dir)
         recipe_path = os.path.join(self.save_dir, "recipe.yaml")
 
         # Use the session to fetch the recipe;
@@ -104,26 +114,26 @@ def test_lm_eval(self):
 
         logger.info("================= Running LM Eval ======================")
 
-        model_args = f"pretrained={self.save_dir},add_bos_token=True"
+        model_args = {"pretrained": self.save_dir}
+        model_args.update(self.lmeval.model_args)
         results = lm_eval.simple_evaluate(
-            model="hf",
+            model=self.lmeval.model,
             model_args=model_args,
-            tasks=[self.task],
-            num_fewshot=self.num_fewshot,
-            limit=self.limit,
+            tasks=[self.lmeval.task],
+            num_fewshot=self.lmeval.num_fewshot,
+            limit=self.lmeval.limit,
             device="cuda:0",
-            batch_size=100,
+            batch_size=self.lmeval.batch_size,
         )
 
-        metrics = results["results"][self.task]
-        exact_match_strict = metrics.get("exact_match,strict-match")
-        exact_match_flex = metrics.get("exact_match,flexible-extract")
-        logger.info("Exact Match, Strict")
-        logger.info(exact_match_strict)
-        logger.info("Exact Match, Flex")
-        logger.info(exact_match_flex)
-        assert numpy.isclose(exact_match_strict, self.exact_strict, rtol=0.05)
-        assert numpy.isclose(exact_match_flex, self.exact_flex, rtol=0.05)
+        metrics = results["results"][self.lmeval.task]
+        for metric, expected_val in self.lmeval.metrics.items():
+            actual_val = metrics[metric]  # KeyError if lm_eval omitted the metric
+            logger.info(
+                f"Comparing {metric}: Expected {expected_val}, Got {actual_val}"
+            )
+            # isclose scales rtol by its 2nd argument; keep that the expected value
+            assert numpy.isclose(actual_val, expected_val, rtol=0.05)
         self.tear_down()
 
     def tear_down(self):