<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="description"
content="">
<meta name="keywords" content="Video Language Model, Video Instruction Tuning">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>QLIP: Text-Aligned Visual Tokenization Unifies Auto-Regressive Multimodal Understanding and Generation</title>
<script async src="https://www.googletagmanager.com/gtag/js?id=G-PYVRSFMDRL"></script>
<script>
window.dataLayer = window.dataLayer || [];
function gtag() {
dataLayer.push(arguments);
}
gtag('js', new Date());
gtag('config', 'G-PYVRSFMDRL');
</script>
<link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
rel="stylesheet">
<link rel="stylesheet" href="./static/css/bulma.min.css">
<link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
<link rel="stylesheet" href="./static/css/bulma-slider.min.css">
<link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
<link rel="stylesheet"
href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<link rel="stylesheet" href="./static/css/index.css">
<link rel="icon" href="./static/images/favicon.svg">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script defer src="./static/js/fontawesome.all.min.js"></script>
<script src="./static/js/bulma-carousel.min.js"></script>
<script src="./static/js/bulma-slider.min.js"></script>
<script src="./static/js/index.js"></script>
</head>
<body>
<section class="hero">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column has-text-centered">
<h1 class="title is-1 publication-title">QLIP: Text-Aligned Visual Tokenization Unifies Auto-Regressive Multimodal Understanding and Generation</h1>
<h1 class="title is-4 publication-conference"></h1>
<div class="is-size-5 publication-authors">
<span class="author-block">
<a href="http://zhaoyue-zephyrus.github.io" target="_blank">Yue Zhao</a><sup>1,*</sup>,
</span>
<span class="author-block">
<a href="https://xuefuzhao.github.io/" target="_blank">Fuzhao Xue</a><sup>2,†</sup>,
</span>
<span class="author-block">
<a href="https://reedscot.github.io/" target="_blank">Scott Reed</a><sup>2</sup>,
</span>
<span class="author-block">
<a href="https://jimfan.me/" target="_blank">Linxi "Jim" Fan</a><sup>2</sup>,
</span>
<span class="author-block">
<a href="https://yukezhu.me/" target="_blank">Yuke Zhu</a><sup>1,2</sup>,
</span>
<span class="author-block">
<a href="https://jankautz.com/" target="_blank">Jan Kautz</a><sup>2</sup>,
</span>
<span class="author-block">
<a href="https://chrisding.github.io/" target="_blank">Zhiding Yu</a><sup>2</sup>,
</span>
<span class="author-block">
<a href="http://philkr.net" target="_blank">Philipp Krähenbühl</a><sup>1</sup>,
</span>
<span class="author-block">
<a href="https://ai.stanford.edu/~dahuang/" target="_blank">De-An Huang</a><sup>2</sup>
</span>
</div>
<div class="is-size-5 publication-authors">
<span class="author-block"><sup>1</sup>UT Austin</span>
<span class="author-block"><sup>2</sup>NVIDIA</span>
</div>
<div class="is-size-7 publication-authors">
<span class="author-block"><sup>*</sup>The work was done during an internship at NVIDIA Research.</span>
<span class="author-block"><sup>†</sup>Now at Google DeepMind.</span>
</div>
<div class="column has-text-centered">
<div class="publication-links">
<span class="link-block">
<a href="https://arxiv.org/abs/2502.05178"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="ai ai-arxiv"></i>
</span>
<span>arXiv</span>
</a>
</span>
<!-- Code Link. -->
<span class="link-block">
<a href="https://github.com/NVlabs/QLIP/tree/main"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>Code</span>
</a>
</span>
<!-- HF Link. -->
<span class="link-block">
<a href="https://huggingface.co/collections/nvidia/qlip-67a478054fce07a7be99d5cd"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
🤗
</span>
<span>HF Models</span>
</a>
</span>
</div>
</div>
</div>
</div>
</div>
</div>
</section>
<section class="section">
<div class="container is-max-desktop">
<!-- Abstract. -->
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Abstract</h2>
<div class="content has-text-justified">
<p>
We introduce Quantized Language-Image Pretraining <b>(QLIP)</b>,
a visual tokenization method that combines state-of-the-art reconstruction quality with state-of-the-art zero-shot image understanding.
QLIP trains a binary-spherical-quantization-based autoencoder with reconstruction and language-image alignment objectives.
We are the first to show that the two objectives do not need to be at odds.
We balance the two loss terms dynamically during training and show that a two-stage training pipeline effectively mixes the large-batch requirements of image-language pre-training with the memory bottleneck imposed by the reconstruction objective.
We validate the effectiveness of QLIP for multimodal understanding and text-conditioned image generation with a single model.
Specifically, QLIP serves as a drop-in replacement for the visual encoder for LLaVA and the image tokenizer for LlamaGen with comparable or even better performance.
Finally, we demonstrate that QLIP enables a unified mixed-modality auto-regressive model for understanding and generation.
</p>
</div>
</div>
</div>
<!--/ Abstract. -->
</div>
</section>
<section class="section">
<div class="container is-max-desktop">
<!-- Overview. -->
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Overview</h2>
<div class="content has-text-justified">
<p>
State-of-the-art visual tokenizers excel at either understanding (high zero-shot accuracy, e.g. SigLIP) or reconstruction (low reconstruction FID, e.g. RQ-VAE), but not both.
QLIP performs well on both understanding and reconstruction with only a marginal drop on each, opening up an opportunity for unified multi-modal understanding and generation.
</p>
<div class="content has-text-justified">
<img src="./static/images//overview.png" class="center" alt="method overview">
</div>
</div>
</div>
</div>
<!--/ Overview. -->
</div>
</section>
<section class="section">
<div class="container is-max-desktop">
<div class="columns is-centered">
<!-- Method Motivation. -->
<div class="column">
<div class="content">
<h2 class="title is-5">Observation 1: Memory comsumption</h2>
<div class="content has-text-justified">
<img src="./static/images/memory.png" width="60%" class="center" alt="memory">
</div>
<p>
Reconstruction losses account for a large share of the memory cost and prohibit large-batch training.
Moreover, better reconstruction is not always beneficial for representation learning.
</p>
</div>
</div>
<div class="column">
<div class="content">
<h2 class="title is-5">Observation 2: Loss imbalance</h2>
<div class="content has-text-justified">
<img src="./static/images/loss_weight.png" width="60%" class="center" alt="loss weight">
</div>
<p>
The gradient magnitudes of the contrastive image-text alignment and pixel reconstruction objectives differ by up to two orders of magnitude,
leading to very different convergence rates for the two terms.
</p>
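<p>
To make the imbalance concrete, the sketch below measures the gradient norm that each objective induces on a shared encoder. It is a minimal PyTorch-style illustration, not the released QLIP code; the encoder and the two losses are toy stand-ins.
</p>
<pre><code># Minimal PyTorch sketch (illustrative only, not the released QLIP code):
# compare the gradient magnitude each objective induces on a shared encoder.
import torch
import torch.nn as nn

def grad_norm(loss, params):
    # L2 norm of the gradient of `loss` w.r.t. `params`, leaving .grad untouched.
    grads = torch.autograd.grad(loss, params, retain_graph=True, allow_unused=True)
    flat = [g.reshape(-1) for g in grads if g is not None]
    return torch.cat(flat).norm().item()

# Toy stand-ins for the shared visual encoder and the two objectives.
encoder = nn.Linear(16, 8)
z = encoder(torch.randn(4, 16))
loss_recon = (z ** 2).mean()   # stand-in for the pixel reconstruction loss
loss_align = z.mean()          # stand-in for the contrastive alignment loss
params = list(encoder.parameters())
print(grad_norm(loss_recon, params), grad_norm(loss_align, params))
</code></pre>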
</div>
</div>
<!--/ Method Motivation. -->
</div>
<!-- Method Overview. -->
<div class="column">
<div class="content">
<h2 class="title is-3">Solution: Training QLIP in two stages</h2>
<div class="content has-text-justified">
<img src="./static/images/training.png" width="60%" class="center" alt="training">
</div>
<p>
Stage (1): we optimize a weighted sum of reconstruction loss, quantization loss, and contrastive loss <i>without</i> the perceptual and adversarial loss.
The loss weights are determined from the measured gradient magnitude of each loss term.
</p>
<p>
Stage (2): we improve the reconstruction quality and restore higher-frequency details by fine-tuning the quantization bottleneck and the visual decoder.
We drop the text encoder and freeze the visual encoder to prevent degradation when the batch-size restriction is relaxed.
</p>
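<p>
For concreteness, the sketch below outlines this two-stage recipe. It is a minimal PyTorch-style illustration under assumed names (<code>stage1_loss</code>, <code>prepare_stage2</code>, and their arguments), not QLIP's released implementation.
</p>
<pre><code># Minimal PyTorch sketch of the two-stage recipe (illustrative only; the
# function and module names are placeholders, not QLIP's released API).
import torch

def stage1_loss(loss_recon, loss_quant, loss_align, w_recon, w_quant, w_align):
    # Stage 1: weighted sum of reconstruction, quantization, and contrastive
    # losses (no perceptual/adversarial terms); the weights offset the measured
    # gradient-magnitude gap, e.g. a smaller weight on the dominant term.
    return w_recon * loss_recon + w_quant * loss_quant + w_align * loss_align

def prepare_stage2(visual_encoder, quantizer, visual_decoder):
    # Stage 2: drop the text encoder, freeze the visual encoder, and fine-tune
    # only the quantization bottleneck and the visual decoder.
    for p in visual_encoder.parameters():
        p.requires_grad_(False)
    trainable = list(quantizer.parameters()) + list(visual_decoder.parameters())
    return torch.optim.AdamW(trainable)
</code></pre>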
<div class="content has-text-justified">
<img src="./static/images/recon.png" width="60%" class="center" alt="reconstruction">
</div>
</div>
</div>
<!--/ Method Overview. -->
</div>
</section>
<section class="section">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column">
<div class="content">
<h2 class="title is-3">Evaluating QLIP</h2>
<p>
We conduct a linear probing evaluation to compare all visual encoder methods.
The methods include: <br>
(1) reconstruction-only tokenizers, such as VQ-VAE and <a href="https://arxiv.org/abs/2406.07548">BSQ-ViT</a>, <br>
(2) language-quantized tokenizers, such as <a href="https://arxiv.org/abs/2302.00902">LQAE</a>, and <br>
(3) CLIP-style vision encoders (without a decoder), such as <a href="https://arxiv.org/abs/2303.15389">EVA-CLIP</a>. <br>
We see a significant improvement in linear probing classification accuracy over reconstruction-only tokenizers and language-quantized tokenizers.
In addition, QLIP comes very close to EVA-CLIP.
</p>
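<p>
For reference, the sketch below shows a generic linear-probing protocol: freeze the encoder, extract features once, and fit a linear classifier on top. It is a minimal illustration with hypothetical placeholders (<code>encode_image</code> and the data splits), not the exact evaluation script.
</p>
<pre><code># Minimal sketch of a generic linear-probing protocol (illustrative only;
# `encode_image` is a hypothetical frozen-encoder feature extractor).
import numpy as np
from sklearn.linear_model import LogisticRegression

def linear_probe(encode_image, train_imgs, train_labels, val_imgs, val_labels):
    # Extract features with the frozen encoder, fit a linear classifier on top,
    # and report validation accuracy as the probing score.
    z_train = np.stack([encode_image(x) for x in train_imgs])
    z_val = np.stack([encode_image(x) for x in val_imgs])
    clf = LogisticRegression(max_iter=1000).fit(z_train, train_labels)
    return clf.score(z_val, val_labels)
</code></pre>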
<div class="content has-text-centered">
<img src="./static/images/linear_probe.png" alt="qlip linear probe">
</div>
</div>
</div>
</div>
</div>
</section>
<section class="section">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column">
<div class="content">
<h2 class="title is-3">QLIP For Text-to-Image Generation</h2>
<p>
We show images generated by LlamaGen with its original VQGAN tokenizer (<b>left</b>) and with our QLIP (<b>right</b>) side by side, with the shared caption at the bottom.
Images generated with QLIP follow the captions more faithfully, depicting aspects that the VQGAN baseline misses.
</p>
<div class="content has-text-centered">
<img src="./static/images/llamagen.png" width="100%" alt="llamagen">
</div>
</div>
</div>
</div>
</div>
</section>
<section class="section">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column">
<div class="content">
<h2 class="title is-3">QLIP Enables Unified Multimodal Modeling</h2>
<p>
With QLIP as the underlying visual tokenizer,
we show the performance of a unified multimodal model that handles text-only,
image-to-text, and text-to-image tasks within a <b>single</b> model.
</p>
<div class="content has-text-centered">
<img src="./static/images/um3.png" width="70%" alt="um3">
</div>
</div>
</div>
</div>
</div>
</section>
<section class="section" id="BibTeX">
<div class="container is-max-desktop content">
<h2 class="title">BibTeX</h2>
<pre><code>@article{zhao2025qlip,
author = {Zhao, Yue and Xue, Fuzhao and Reed, Scott and Fan, Linxi and Zhu, Yuke and Kautz, Jan and Yu, Zhiding and Krähenbühl, Philipp and Huang, De-An},
title = {QLIP: Text-Aligned Visual Tokenization Unifies Auto-Regressive Multimodal Understanding and Generation},
journal = {arXiv preprint arXiv:2502.05178},
year = {2025},
}</code></pre>
</div>
</section>
<footer class="footer">
<div class="container">
<div class="content has-text-centered">
<a class="icon-link"
href="./static/contents/qlip_paper.pdf">
<i class="fas fa-file-pdf"></i>
</a>
<a class="icon-link" href="https://github.com/NVlabs/QLIP" class="external-link" disabled>
<i class="fab fa-github"></i>
</a>
</div>
<div class="columns is-centered">
<div class="column is-8">
<div class="content">
<p>
The website template is borrowed from <a href="https://github.com/nerfies/nerfies.github.io" target="_blank">Nerfies</a>.
</p>
</div>
</div>
</div>
</div>
</footer>
</body>
</html>