-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.html
737 lines (716 loc) · 45.6 KB
/
index.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>
VividDreamer
</title>
<link rel="icon" href="assets/favicon.ico">
<!-- Bootstrap 4.4.1 and its dependencies (jQuery slim, Popper), plus the MathJax 2.7.5 loader.
     The package versions in the jsDelivr URLs must match the SRI integrity hashes below;
     a mismatch makes the browser refuse to load the asset. -->
<script src="https://code.jquery.com/jquery-3.4.1.slim.min.js" integrity="sha384-J6qa4849blE2+poT4WnyKhv5vZF5SrPo0iEjwBvKU7imGFAV0wwj1yYfoRSJoZ+n" crossorigin="anonymous"></script>
<script src="https://cdn.jsdelivr.net/npm/popper.js@1.16.0/dist/umd/popper.min.js" integrity="sha384-Q6E9RHvbIyZFJoft+2mJbHaEWldlvI9IOYy5n3zV9zzTtmI3UksdQRVvoxMfooAo" crossorigin="anonymous"></script>
<!-- MathJax is loaded asynchronously; it picks up the text/x-mathjax-config block further down in <head>. -->
<script async src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js?config=TeX-MML-AM_CHTML"></script>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@4.4.1/dist/css/bootstrap.min.css" integrity="sha384-Vkoo8x4CGsO3+Hhxv8T/Q5PaXtkKtu6ug5TOeNV6gBiFeWPGFN9MuhOf23Q9Ifjh" crossorigin="anonymous">
<script src="https://cdn.jsdelivr.net/npm/bootstrap@4.4.1/dist/js/bootstrap.min.js" integrity="sha384-wfSDF2E50Y2D1uUdj0O3uMBJnjuUD4Ih7YwaYd1iqfktj0Uod8GCExl3Og8ifwB6" crossorigin="anonymous"></script>
<!-- icon -->
<script src="https://kit.fontawesome.com/87dc3e863a.js" crossorigin="anonymous"></script>
<!-- font -->
<link href="https://fonts.googleapis.com/css?family=Open+Sans" rel="stylesheet" type="text/css">
<script type="text/x-mathjax-config">
// MathJax v2 configuration: the object below is handed to MathJax.Hub.Config,
// after which a Typeset call is queued on the hub.
// NOTE(review): the loader URL in <head> requests config=TeX-MML-AM_CHTML while
// `jax` below lists "output/HTML-CSS" — confirm which output renderer is intended.
MathJax.Hub.Config({
showProcessingMessages: false,
messageStyle: "none", // do not display status messages
extensions: ["tex2jax.js"],
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
// Delimiter pairs for inline math and for display math.
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
// Tag names passed as skipTags, and a class-name pattern passed as ignoreClass.
skipTags: ['script', 'noscript', 'style', 'textarea', 'pre','code', 'a', 'annotation', 'annotation-xml'],
ignoreClass: 'crayon-.*' // 'crayon-'
},
'HTML-CSS': {
showMathMenu: false
}
});
// Queue a Typeset action on MathJax.Hub.
MathJax.Hub.Queue(["Typeset",MathJax.Hub]);
</script>
<style>
/* Page-wide defaults: Open Sans text on a white, non-repeating,
   viewport-fixed background anchored at the top-left corner. */
body {
font-family:'Open Sans', sans-serif;
background: rgb(255, 255, 255) no-repeat fixed top left;
}
</style>
</head>
<body>
<!-- cover -->
<section>
<div class="jumbotron text-center mt-0" style="padding-bottom: 0px;">
<div class="container-fluid" style="margin-top: -3em;">
<div class="row">
<div class="col">
<!-- paper title -->
<h2 style="font-size:40px;">
<strong>VividDreamer</strong>: Towards High-Fidelity and Efficient Text-to-3D Generation
</h2>
<!-- publication -->
<!-- <h3 style="color:#6e6e6e;"> ICCV 2023 </h3>
<hr> -->
<!-- authors -->
<!-- Author list. Superscripts refer to the numbered affiliations listed below.
     The wrapper is a <div> because headings may not be nested inside a <span>.
     Authors without a homepage are plain text rather than links with an empty href. -->
<div>
<h4>
<a href="https://NarcissusEx.github.io" target="_blank" rel="noopener" style="margin-right:30px"><sup>1</sup>Zixuan Chen</a>
<span style="margin-right:30px"><sup>1</sup>Ruijie Su</span>
<span style="margin-right:30px"><sup>1</sup>Jiahao Zhu</span>
<a href="https://wanggcong.github.io" target="_blank" rel="noopener" style="margin-right:30px"><sup>2</sup>Guangcong Wang</a>
<a href="https://zjjconan.github.io" target="_blank" rel="noopener" style="margin-right:30px"><sup>1</sup>Lingxiao Yang</a>
<a href="https://cse.sysu.edu.cn/content/2498" target="_blank" rel="noopener" style="margin-right:30px"><sup>1</sup>Jian-Huang Lai</a>
</h4>
<h4>
<span style="margin-right:30px"><sup>3</sup>Shisong Wu</span>
<a href="https://cse.sysu.edu.cn/content/2478" target="_blank" rel="noopener" style="margin-right:30px"><sup>1</sup>Xiaohua Xie</a>
</h4>
</div>
<div style="display: flex;justify-content: center;align-items: center;">
<div style="margin: 0 3%;">
<h4>
<sup>1</sup>Sun Yat-Sen University
</h4>
</div>
<div style="margin: 0 3%;">
<h4>
<sup>2</sup>Great Bay University
</h4>
</div>
</div>
<h4>
<sup>3</sup>China Southern Power Grid Artificial Intelligence Technology Co., Ltd.
</h4>
<!-- links -->
<!-- Paper / code links. Each icon is decorative (empty alt) because the
     text label underneath already names the link target. -->
<div style="display: flex;justify-content: center;align-items: center;">
<div style="margin: 0 5%;">
<a href="https://arxiv.org/abs/2406.14964" target="_blank" rel="noopener">
<img src="assets/paper.png" alt="" style="height: 60px;margin-bottom: 5px;">
<h6><strong>Paper</strong></h6>
</a>
</div>
<div style="margin: 0 5%;">
<a href="https://github.com/NarcissusEx/VividDreamer" target="_blank" rel="noopener">
<img src="assets/github.png" alt="" style="height: 60px;margin-bottom: 5px;">
<h6><strong>Code</strong></h6>
</a>
</div>
</div>
</div>
</div>
</div>
</div>
</section>
<!-- abstract -->
<section>
<div class="container">
<div class="row">
<div class="col-12 text-center">
<!-- <h3>Abstract</h3> -->
<div class="row justify-content-center" style="align-items:center; display:flex;margin-top: -1.5em;">
<div>
<img src="assets/Examples.png" alt="VividDreamer" class="img-responsive" width="100%"/>
</div>
<div>
<h4><b>Text-to-3D Examples</b></h4>
</div>
</div>
<hr style="margin-top: 0px;">
<!-- <br> -->
<div class="row justify-content-center" style="align-items:center; display:flex; margin-top: -1em;">
<div>
<video autoplay loop playsinline muted style="width: 100%;">
<source data-src="assets/process.mp4" type="video/mp4" src="assets/process.mp4">
</video>
<!-- <video width="100%" autoplay="autoplay" loop="loop" controls>
<source src="" type="video/mp4">
</video> -->
</div>
<!-- Prompt captions for the three clips in the training-process video above.
     Cells are wrapped in an explicit <tr>; <td> may not be a direct child of <tbody>. -->
<table style="table-layout: fixed;" width="100%">
<tbody>
<tr>
<td width="33%">
<p style="font-size: larger;"><i>"<span style="color:#EA700D"><b>Iron Man</b></span>"</i></p>
</td>
<td width="33%">
<p style="font-size: larger;"><i>"A <span style="color:#EA700D"><b>bichon frise</b></span> <span style="color:#52B19A"><b>wearing academic regalia</b></span>"</i></p>
</td>
<td width="33%">
<p style="font-size: larger;"><i>"A <span style="color:#EA700D"><b>plush dragon toy</b></span>"</i></p>
</td>
</tr>
</tbody>
</table>
<div>
<h4><b>Training Process</b></h4>
</div>
</div>
</div>
<!-- <p class="text-justify">
<b>Examples of text-to-3D asset creations with our framework (a).</b> In this paper, we present <b>VividDreamer</b>, an efficient text-to-3D generation framework that can distill semantically-consistent textures and high-fidelity structures from pretrained 2D diffusion models using a novel <b>Pose-dependent Consistency Distillation Sampling</b> objective in a <b>coarse-to-fine optimization</b> manner, allowing to yield high-fidelity 3D objects (rows 1 and 2) and 3D avatars (row 3) based on the given text prompts. Specifically, <b>VividDreamer</b> achieves high training efficiency, which can create ready-to-use 3D assets within 10 minutes, while producing photorealistic 3D objects within 30 minutes <b>(b)</b>.
</p> -->
</div>
</div>
</section>
<section>
<div class="container">
<div class="row">
<div class="col-12 text-left">
<h3>Abstract</h3>
</div>
<p class="text-justify">
Text-to-3D generation aims to create 3D assets from text-to-image diffusion models. However, existing methods face an inherent bottleneck in generation quality because the widely-used objectives such as Score Distillation Sampling (SDS) inappropriately omit U-Net Jacobians for swift generation, leading to significant bias compared to the “true” gradient obtained by full denoising sampling. This bias brings inconsistent updating direction, resulting in implausible 3D generation (<i>e.g.,</i> color deviation, Janus problem, and semantically inconsistent details). In this work, we propose <b>Pose-dependent Consistency Distillation Sampling</b> (PCDS), a novel yet efficient objective for diffusion-based 3D generation tasks. Specifically, <b>PCDS</b> builds the <b>pose-dependent consistency function</b> within diffusion trajectories, making it possible to approximate true gradients through minimal sampling steps (1~3). Compared to SDS, <b>PCDS</b> can acquire a more accurate updating direction with the same sampling time (1 sampling step), while enabling few-step (2~3) sampling to trade compute for higher generation quality. For efficient generation, we propose a coarse-to-fine optimization strategy, which first utilizes 1-step <b>PCDS</b> to create the basic structure of 3D objects, and then gradually increases <b>PCDS</b> steps to generate fine-grained details. Extensive experiments demonstrate that our approach outperforms the state-of-the-art in generation quality and training efficiency, conspicuously alleviating the implausible 3D generation issues caused by the deviated updating direction. Specifically, <b>VividDreamer</b> can create ready-to-use 3D assets within 10 minutes, while producing photorealistic 3D objects within 30 minutes. Moreover, it can be simply applied to many 3D generative applications to yield impressive 3D assets, such as <b>3D portrait and avatar generation</b> and <b>text-to-3D editing</b>.
</p>
</div>
</div>
</section>
<br>
<!-- <section>
<div class="container">
<div class="row">
<div class="col-12 text-center">
<h3>Preliminary</h3>
<div class="row justify-content-center" style="align-items:center; display:flex;">
<div>
<img src="assets/preliminary.png" alt="cunerf" class="img-responsive" width="100%"/>
</div>
</div>
<br>
</div>
<p class="text-justify" style="margin-top:-30px">
Supervised MISR methods <b>(a)</b> need to collect considerable LR-HR pairs for training, while zero-shot MISR <b>(b)</b> and our <b>CuNeRF</b> <b>(c)</b> only train the model on each test volume itself.
However, given a test volume, ZSMISR methods can only upsample medical images at a specific scale (one-for-one), while our <b>CuNeRF</b> can handle arbitrary upsampling scales (one-for-all).
</p>
</div>
</div>
</div>
</section>
<br> -->
<!-- <section>
<div class="container">
<div class="row">
<div class="col-12 text-left">
<h3>Motivation</h3>
<div class="row justify-content-center" style="align-items:center; display:flex;">
<div>
<img src="assets/difference.png" alt="VividDreamer" class="img-responsive" width="100%"/>
</div>
</div>
<br>
</div>
<p class="text-justify" style="margin-top:-20px">
<b>Examples of different objectives. </b>Visually, the acquisition of ''true'' gradient <b>(a)</b> is a time-consuming work, requiring the full denoising sampling in each iteration. To skip such a lengthy process, Score Distillation Sampling (SDS) <b>(b)</b> directly maps the noise to data (<i>i.e.,</i> <b>pseudoGTs</b>) using 1-step DDPM sampling, but SDS struggles to acquire accurate gradients due to the intrinsic randomness brought by DDPM. On the contrary, our <b>PCDS</b> builds the <b>pose-dependent consistency function</b> $f_\phi$ from any timestep $t$ to the origin 0 within diffusion trajectories, allowing to generate accurate <b>pseudoGTs</b> and acquire precise gradients via minimal sampling steps (1~3).
</p>
</div>
</div>
</section>
<br> -->
<!-- <section>
<div class="container">
<div class="row">
<div class="col-12 text-center">
<h3>Framework</h3>
<div class="row justify-content-center" style="align-items:center; display:flex;">
<div>
<img src="assets/framework.png" alt="cunerf" class="img-responsive" width="100%"/>
</div>
</div>
<br>
</div>
<p class="text-justify">
The overall framework of our <b>CuNeRF</b>.
To synthesize a pixel (<font color="red">red circle</font>) with the spatial position $\mathbf{x}_t=(x, y, z)$, <b>(a)</b> <b>CuNeRF</b> first uniformly samples $N$ points as a point set $\{\hat{\mathbf{x}}_i\}^N_{i=1}$ within the cube space (<font color="orchid">purple cube</font>) centered by $\mathbf{x}_t$.
Then, <b>CuNeRF</b> obtains the coarse estimation (<font color="blue">blue cube</font>) by feeding the sampling points into an MLP $F_{\Theta}$ to produce the set of corresponding pixel intensity $\{c_i\}_{i=1}^N$ and volume density $\{\sigma_i\}_{i=1}^N$.
<b>(b)</b> Subsequently, assuming $\sigma$ of each sampling point is only related to the distance with the cube center $\mathbf{x}_t$, <b>CuNeRF</b> computes the coarse output of the target pixel via volume integral.
<b>(c)</b> Finally, <b>CuNeRF</b> resamples the points under the probability density function (PDF) of coarse estimation to acquire the fine estimation (<font color="orange">orange cube</font>) of the cube.
The fine output is generated by the same procedures as <b>(b)</b>.
Since these two rendering functions are differentiable, <b>CuNeRF</b> can be optimized by minimizing the rendering loss $\mathcal{L}_{A}$.
The fine output is the final rendering result of the target spatial position $\mathbf{x}_t$.
</p>
</div>
</div>
</section> -->
<!-- <br> -->
<!-- comparison results -->
<section>
<div class="container">
<div class="row">
<div class="text-left">
<!-- <hr style="margin-top:0px"> -->
<h3>Visual Comparisons</h3>
</div>
<div class="text-center">
<table style="table-layout: fixed;" width="100%">
<tbody>
<tr>
<td width="15%"></td>
<td width="21%">
<p style="font-size: larger;margin-bottom: 0em;">Stable DreamFusion</p><p style="font-size: large;">(~1h)</p>
</td>
<!-- <td width="17%">
<p style="font-size: larger;margin-bottom: 0em;">DreamGaussian</p><p style="font-size: large;">(~3mins)</p>
</td> -->
<td width="21%">
<p style="font-size: larger;margin-bottom: 0em;">GaussianDreamer</p><p style="font-size: large;">(~9mins)</p>
</td>
<td width="21%">
<p style="font-size: larger;margin-bottom: 0em;">LucidDreamer</p><p style="font-size: large;">(~45mins)</p>
</td>
<td width="21%">
<p style="font-size: larger;margin-bottom: 0em;"><b>VividDreamer (Ours)</b></p><p style="font-size: large;">(~30mins)</p>
</td>
</tr>
</tbody>
</table>
</div>
<div class="text-center">
<table>
<tbody>
<tr>
<td width="15%">
<!-- <i>"A DSLR photo of a <span style="color:#EA700D"><b>chow chow puppy</b></span>"</i> -->
<p style="font-size: larger;"><i>"A DSLR photo of a <span style="color:#EA700D"><b>chow chow puppy</b></span>"</i></p>
</td>
<td width="85%">
<video autoplay loop playsinline muted style="width: 100%;">
<source type="video/mp4" src="assets/comp/a_DSLR_photo_of_a_chow_chow_puppy.mp4">
</video>
</td>
</tr>
<tr>
<td width="15%">
<!-- <i>"A DSLR photo of a <span style="color:#EA700D"><b>chow chow puppy</b></span>"</i> -->
<p style="font-size: larger;"><i>"A zoomed out DSLR photo of a <span style="color:#EA700D"><b>wizard raccoon</b></span> <span style="color:#52B19A"><b>casting a spell</b></span>"</i></p>
</td>
<td width="85%">
<video autoplay loop playsinline muted style="width: 100%;">
<source type="video/mp4" src="assets/comp/a_zoomed_out_DSLR_photo_of_a_wizard_raccoon_casting_a_spell.mp4">
</video>
</td>
</tr>
<tr>
<td width="15%">
<!-- <i>"A DSLR photo of a <span style="color:#EA700D"><b>chow chow puppy</b></span>"</i> -->
<p style="font-size: larger;"><i>"A DSLR photo of a <span style="color:#EA700D"><b>baby dragon</b></span> <span style="color:#52B19A"><b>drinking boba</b></span>"</i></p>
</td>
<td width="85%">
<video autoplay loop playsinline muted style="width: 100%;">
<source type="video/mp4" src="assets/comp/a_DSLR_photo_of_a_baby_dragon_drinking_boba.mp4">
</video>
</td>
</tr>
<tr>
<td width="15%">
<!-- <i>"A DSLR photo of a <span style="color:#EA700D"><b>chow chow puppy</b></span>"</i> -->
<p style="font-size: larger;"><i>"A DSLR photo of <span style="color:#EA700D"><b>a tray of Sushi</b></span> <span style="color:#52B19A"><b>containing pugs</b></span>"</i></p>
</td>
<td width="85%">
<video autoplay loop playsinline muted style="width: 100%;">
<source type="video/mp4" src="assets/comp/a_DSLR_photo_of_a_tray_of_Sushi_containing_pugs.mp4">
</video>
</td>
</tr>
<tr>
<td width="15%">
<p style="font-size: larger;"><i>"A <span style="color:#EA700D"><b>goat</b></span> <span style="color:#52B19A"><b>drinking beer</b></span>"</i></p>
</td>
<td width="85%">
<video autoplay loop playsinline muted style="width: 100%;">
<source type="video/mp4" src="assets/comp/a_goat_drinking_beer.mp4">
</video>
</td>
</tr>
<tr>
<td width="15%">
<p style="font-size: larger;"><i>"A DSLR photo of a <span style="color:#EA700D"><b>terracotta bunny</b></span>"</i></p>
</td>
<td width="85%">
<video autoplay loop playsinline muted style="width: 100%;">
<source type="video/mp4" src="assets/comp/a_DSLR_photo_of_a_terracotta_bunny.mp4">
</video>
</td>
</tr>
<!-- <tr>
<td width="15%">
<p style="font-size: larger;"><i>"A <span style="color:#EA700D"><b>capybara</b></span> <span style="color:#52B19A"><b>wearing a top hat</b></span>, <span style="color:#52B19A"><b>low poly</b></span>"</i></p>
</td>
<td width="85%">
<video autoplay loop playsinline muted style="width: 100%;">
<source type="video/mp4" src="assets/comp/a_capybara_wearing_a_top_hat_low_poly.mp4">
</video>
</td>
</tr> -->
<!-- <tr>
<td width="15%">
</td>
<td width="75%">
</td>
</tr> -->
</tbody>
</table>
<hr>
</div>
<br>
<div class="text-left">
<h3>Visual Comparisons (~10 minutes)</h3>
</div>
<div class="text-center">
<table style="table-layout: fixed;" width="100%">
<tbody>
<tr>
<td width="25%"></td>
<td width="20%">
<p style="font-size: larger;margin-bottom: 0em;">GaussianDreamer (SDS)</p><p style="font-size: large;">(~9mins)</p>
</td>
<td width="20%">
<p style="font-size: larger;margin-bottom: 0em;">LucidDreamer (ISM)</p><p style="font-size: large;">(~10mins)</p>
</td>
<td width="20%">
<p style="font-size: larger;margin-bottom: 0em;"><b>VividDreamer (Ours)</b></p><p style="font-size: large;">(~10mins)</p>
</td>
<td width="15%"></td>
</tr>
</tbody>
</table>
</div>
<div class="text-center">
<table>
<tbody>
<tr>
<td width="5%"></td>
<td width="20%">
<p style="font-size: larger;"><i>"A <span style="color:#EA700D"><b>pig</b></span> <span style="color:#52B19A"><b>wearing a backpack</b></span>"</i></p>
</td>
<td width="60%">
<video autoplay loop playsinline muted style="width: 100%;">
<source type="video/mp4" src="assets/comp/a_pig_wearing_a_backpack.mp4">
</video>
</td>
<td width="15%"></td>
</tr>
<tr>
<td width="5%"></td>
<td width="20%">
<p style="font-size: larger;"><i>"A DSLR photo of a <span style="color:#EA700D"><b>corgi puppy</b></span>"</i></p>
</td>
<td width="60%">
<video autoplay loop playsinline muted style="width: 100%;">
<source type="video/mp4" src="assets/comp/a_DSLR_photo_of_a_corgi_puppy.mp4">
</video>
</td>
<td width="15%"></td>
</tr>
<tr>
<td width="5%"></td>
<td width="20%">
<p style="font-size: larger;"><i>"A <span style="color:#EA700D"><b>yellow schoolbus</b></span>"</i></p>
</td>
<td width="60%">
<video autoplay loop playsinline muted style="width: 100%;">
<source type="video/mp4" src="assets/comp/a_yellow_schoolbus.mp4">
</video>
</td>
<td width="15%"></td>
</tr>
</tbody>
</table>
</div>
</div>
<hr>
</div>
</section>
<section>
<div class="container">
<div class="row">
<div class="col-12 text-left">
<h3>More Generated Results</h3>
</div>
<div class="text-center">
<video autoplay loop playsinline muted style="width: 100%;">
<source type="video/mp4" src="assets/visual/0.mp4">
</video>
<table style="table-layout: fixed;" width="100%">
<tbody>
<tr>
<td width="25%">
<p style="font-size: larger;margin-top:-2.5em"><i>"A DSLR photo of a <span style="color:#EA700D"><b>piglet</b></span> <span style="color:#52B19A"><b>sitting in a teacup</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;margin-top:-2.5em"><i>"A <span style="color:#EA700D"><b>red panda</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;margin-top:-2.5em"><i>"A DSLR photo of a <span style="color:#EA700D"><b>shiba inu</b></span> <span style="color:#52B19A"><b>wearing golf clothes and hat</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;margin-top:-2.5em"><i>"A DSLR photo of a <span style="color:#EA700D"><b>cocker spaniel</b></span> <span style="color:#52B19A"><b>wearing a crown</b></span>"</i></p>
</td>
</tr>
</tbody>
</table>
<video autoplay loop playsinline muted style="width: 100%;">
<source type="video/mp4" src="assets/visual/1.mp4">
</video>
<table style="table-layout: fixed;" width="100%">
<tbody>
<tr>
<td width="25%">
<p style="font-size: larger;margin-top:-2.5em"><i>"A zoomed out DSLR photo of a <span style="color:#EA700D"><b>corgi</b></span> <span style="color:#52B19A"><b>wearing a top hat</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;margin-top:-2.5em"><i>"A DSLR photo of a <span style="color:#EA700D"><b>squirrel</b></span> <span style="color:#52B19A"><b>dressed like a clown</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;margin-top:-2.5em"><i>"A zoomed out DSLR photo of a <span style="color:#EA700D"><b>kingfisher bird</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;margin-top:-2.5em"><i>"A DSLR photo of a <span style="color:#EA700D"><b>mandarin duck</b></span> <span style="color:#52B19A"><b>swimming in a pond</b></span>"</i></p>
</td>
</tr>
</tbody>
</table>
<video autoplay loop playsinline muted style="width: 100%;">
<source type="video/mp4" src="assets/visual/2.mp4">
</video>
<table style="table-layout: fixed;" width="100%">
<tbody>
<tr>
<td width="25%">
<p style="font-size: larger;margin-top:-2.5em"><i>"A <span style="color:#EA700D"><b>plush toy</b></span> of a <span style="color:#52B19A"><b>corgi nurse</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;margin-top:-2.5em"><i>"A <span style="color:#EA700D"><b>plush dragon toy</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;margin-top:-2.5em"><i>"A DSLR photo of a <span style="color:#EA700D"><b>hippo</b></span> <span style="color:#52B19A"><b>wearing a sweater</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;margin-top:-2.5em"><i>"A zoomed out DSLR photo of a <span style="color:#EA700D"><b>lion's mane jellyfish</b></span>"</i></p>
</td>
</tr>
</tbody>
</table>
<video autoplay loop playsinline muted style="width: 100%;">
<source type="video/mp4" src="assets/visual/3.mp4">
</video>
<table style="table-layout: fixed;" width="100%">
<tbody>
<tr>
<td width="25%">
<p style="font-size: larger;margin-top:-2.5em"><i>"A DSLR photo of a <span style="color:#EA700D"><b>robot dinosaur</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;margin-top:-2.5em"><i>"A DSLR photo of a <span style="color:#52B19A"><b>shiny silver</b></span> <span style="color:#EA700D"><b>robot cat</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;margin-top:-2.5em"><i>"A DSLR photo of a <span style="color:#EA700D"><b>hippo</b></span> <span style="color:#52B19A"><b>made out of chocolate</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;margin-top:-2.5em"><i>"A DSLR photo of an <span style="color:#EA700D"><b>origami hippo</b></span> <span style="color:#52B19A"><b>in a river</b></span>"</i></p>
</td>
</tr>
</tbody>
</table>
<video autoplay loop playsinline muted style="width: 100%;">
<source type="video/mp4" src="assets/visual/4.mp4">
</video>
<table style="table-layout: fixed;" width="100%">
<tbody>
<tr>
<td width="25%">
<p style="font-size: larger;margin-top:-3.5em"><i>"An <span style="color:#EA700D"><b>airplane</b></span> <span style="color:#52B19A"><b>made out of wood</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;margin-top:-3.5em"><i>"A <span style="color:#EA700D"><b>Panther De Ville car</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;margin-top:-3.5em"><i>"A DSLR photo of a <span style="color:#EA700D"><b>steam engine train</b></span>, <span style="color:#52B19A"><b>high resolution</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;margin-top:-3.5em"><i>"A DSLR photo of an <span style="color:#EA700D"><b>amigurumi motorcycle</b></span>"</i></p>
</td>
</tr>
</tbody>
</table>
<video autoplay loop playsinline muted style="width: 100%;">
<source type="video/mp4" src="assets/visual/5.mp4">
</video>
<table style="table-layout: fixed;" width="100%">
<tbody>
<tr>
<td width="25%">
<p style="font-size: larger;margin-top:-1em"><i>"A delicious <span style="color:#EA700D"><b>chocolate brownie dessert</b></span> with <span style="color:#52B19A"><b>ice cream</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;margin-top:-1em"><i>"A DSLR photo of an <span style="color:#EA700D"><b>ice cream sundae</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;margin-top:-1em"><i>"A DSLR photo of <span style="color:#EA700D"><b>spaghetti and meatballs</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;margin-top:-1em"><i>"<span style="color:#EA700D"><b>Tower Bridge</b></span> <span style="color:#52B19A"><b>made out of gingerbread and candy</b></span>"</i></p>
</td>
</tr>
</tbody>
</table>
<video autoplay loop playsinline muted style="width: 100%;">
<source type="video/mp4" src="assets/visual/6.mp4">
</video>
<table style="table-layout: fixed;" width="100%">
<tbody>
<tr>
<td width="25%">
<p style="font-size: larger;margin-top:-1em"><i>"A <span style="color:#EA700D"><b>20-sided die</b></span> <span style="color:#52B19A"><b>made out of glass</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;margin-top:-1em"><i>"A DSLR photo of a <span style="color:#EA700D"><b>football helmet</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;margin-top:-1em"><i>"A DSLR photo of the <span style="color:#EA700D"><b>leaning tower of Pisa</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;margin-top:-1em"><i>"An <span style="color:#EA700D"><b>erupting volcano</b></span>"</i></p>
</td>
</tr>
</tbody>
</table>
</div>
</div>
<hr>
</div>
</section>
<section>
<div class="container">
<div class="row">
<div class="col-12 text-left">
<h3>Applications</h3>
</div>
<div class="col-12 text-center">
<h4>3D Portrait Generation</h4>
<video autoplay loop playsinline muted style="width: 100%;">
<source type="video/mp4" src="assets/avatar/0.mp4">
</video>
<table style="table-layout: fixed;" width="100%">
<tbody>
<tr>
<td width="25%">
<p style="font-size: larger;"><i>"A <span style="color:#EA700D"><b>boy</b></span> <span style="color:#52B19A"><b>with facial painting</b></span>, head, HDR, photorealistic, 8K"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;"><i>"<span style="color:#EA700D"><b>Barack Obama</b></span>, head, HDR, photorealistic, 8K"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;"><i>"Portrait of <span style="color:#EA700D"><b>young norwegian woman</b></span>, <span style="color:#52B19A"><b>steampunk, long hair</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;"><i>"<span style="color:#EA700D"><b>Robert Pattinson</b></span>, head, HDR, photorealistic, 8K"</i></p>
</td>
</tr>
</tbody>
</table>
<hr>
<h4>3D Avatar Generation</h4>
<video autoplay loop playsinline muted style="width: 100%;">
<source type="video/mp4" src="assets/avatar/1.mp4">
</video>
<table style="table-layout: fixed;" width="100%">
<tbody>
<tr>
<td width="25%">
<p style="font-size: larger;"><i>"<span style="color:#EA700D"><b>Iron man</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;"><i>"<span style="color:#EA700D"><b>Ant man</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;"><i>"<span style="color:#EA700D"><b>Bat man</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;"><i>"<span style="color:#EA700D"><b>Groot</b></span>"</i></p>
</td>
</tr>
</tbody>
</table>
<video autoplay loop playsinline muted style="width: 100%;">
<source type="video/mp4" src="assets/avatar/2.mp4">
</video>
<table style="table-layout: fixed;" width="100%">
<tbody>
<tr>
<td width="25%">
<p style="font-size: larger;"><i>"<span style="color:#EA700D"><b>Sun Wukong</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;"><i>"<span style="color:#EA700D"><b>Captain Marvel</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;"><i>"A <span style="color:#EA700D"><b>young man</b></span> <span style="color:#52B19A"><b>wearing a turtleneck</b></span>"</i></p>
</td>
<td width="25%">
<p style="font-size: larger;"><i>"A <span style="color:#EA700D"><b>Mediterranean</b></span> <span style="color:#52B19A"><b>with beard wearing white linen shirt</b></span>"</i></p>
</td>
</tr>
</tbody>
</table>
<hr>
<h4>Text-to-3D Editing</h4>
<!-- Text-to-3D editing demo: result video, then the prompt strip image, then a rule.
     A plain <div> replaces the former row-less <table>, which may not contain
     <video>, <img> or <hr> directly. The negative bottom margin is carried over
     from that table to keep the spacing before the next section. -->
<div style="margin-bottom: -4em;">
<video autoplay loop playsinline muted style="width: 100%;">
<source type="video/mp4" src="assets/editing/0.mp4">
</video>
<img src="assets/editing/edit.png" alt="Text-to-3D editing examples" width="100%" style="margin-top: -3.5em; margin-bottom: -1em;">
<hr style="margin-top:0px">
</div>
</div>
</div>
</div>
</section>
<br>
<br>
<br>
<!-- citing -->
<div class="container">
<div class="row ">
<div class="col-12">
<h3>Citation</h3>
<pre style="background-color: #e9eeef;padding: 1.25em 1.5em">
<code>@article{chen2024vividdreamer,
title={VividDreamer: Towards High-Fidelity and Efficient Text-to-3D Generation},
author={Chen, Zixuan and Su, Ruijie and Zhu, Jiahao and Yang, Lingxiao and Lai, Jian-Huang and Xie, Xiaohua},
journal={arXiv preprint arXiv:2406.14964},
year={2024}
}</code>
</pre>
</div>
</div>
</div>
<br>
<!-- journal={arXiv preprint arXiv:2305.18766}, -->
<div class="container">
<div class="row ">
<div class="col-12">
<h3>Acknowledgements</h3>
<hr style="margin-top:0px">
</div>
<p class="text-justify">
This project is supported by the Natural Science Foundation of China (No. 62072482), and is also supported by the Project of Guangdong Provincial Key Laboratory of Information Security Technology (Grant No. 2023B1212060026).<br>
We also thank <a href="https://lioryariv.github.io/" target="_blank">Lior Yariv</a> for the <a href="https://lioryariv.github.io/idr/" target="_blank">website template</a>.
</p>
</div>
</div>
<!-- Visitor-map badge that links to the ClustrMaps page for this site.
     The image carries alt text so the link has an accessible name, and the
     "&" in the query string is escaped as required inside an attribute value. -->
<section>
<div class="container">
<div class="row">
<div class="col">
<div class="row justify-content-center" style="text-align:center; padding:0 0 30px 0">
<a href="https://clustrmaps.com/site/1c39c" title="Visit tracker"><img src="https://www.clustrmaps.com/map_v2.png?d=B9FjzYCSBROZZ7ALYpj-UbKTWzReXLKIazwG1wWsr8E&amp;cl=ffffff" alt="ClustrMaps visitor map"></a>
</div>
</div>
</div>
</div>
</section>
</body>
</html>