@comment{
  CLIP and vision-language pre-training references, keys bb228700 to bb228799.
  Each record carries a BIBSOURCE field holding the visionbib.com anchor it was taken from.
  BOOKTITLE values are bare string macros (CVPR24, ICCV23, WACV24, OpenSUN3D, ...);
  their definitions are not part of this section and are presumably declared
  earlier in the file -- verify before compiling this section on its own.
  Page ranges use the double-hyphen form so styles typeset an en dash.
}

@inproceedings{bb228700,
  AUTHOR    = "Maniparambil, M. and Akshulakov, R. and Djilali, Y.A.D. and Seddik, M.E.A. and Narayan, S. and Mangalam, K. and O'Connor, N.E.",
  TITLE     = "Do Vision and Language Encoders Represent the World Similarly?",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "14334--14343",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223702"
}

@inproceedings{bb228701,
  AUTHOR    = "Pan, C. and Yaman, B. and Velipasalar, S. and Ren, L.",
  TITLE     = "CLIP-BEVFormer: Enhancing Multi-View Image-Based BEV Detector with Ground Truth Flow",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "15216--15225",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223703"
}

@inproceedings{bb228702,
  AUTHOR    = "Yang, C.G. and An, Z. and Huang, L. and Bi, J.Y. and Yu, X. and Yang, H. and Diao, B. and Xu, Y.J.",
  TITLE     = "CLIP-KD: An Empirical Study of CLIP Model Distillation",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "15952--15962",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223704"
}

@inproceedings{bb228703,
  AUTHOR    = "Vasu, P.K.A. and Pouransari, H. and Faghri, F. and Vemulapalli, R. and Tuzel, O.",
  TITLE     = "MobileCLIP: Fast Image-Text Models through Multi-Modal Reinforced Training",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "15963--15974",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223705"
}

@inproceedings{bb228704,
  AUTHOR    = "Fan, L. and Zhou, J.X. and Xing, X.Y. and Wu, Y.",
  TITLE     = "Active Open-Vocabulary Recognition: Let Intelligent Moving Mitigate CLIP Limitations",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "16394--16403",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223706"
}

@inproceedings{bb228705,
  AUTHOR    = "Stevens, S. and Wu, J. and Thompson, M.J. and Campolongo, E.G. and Song, C.H. and Carlyn, D.E. and Dong, L. and Dahdul, W.M. and Stewart, C. and Berger Wolf, T. and Chao, W.L. and Su, Y.",
  TITLE     = "BioCLIP: A Vision Foundation Model for the Tree of Life",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "19412--19424",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223707"
}

@inproceedings{bb228706,
  AUTHOR    = "Tang, Y.W. and Lin, Z. and Wang, Q.L. and Zhu, P.F. and Hu, Q.H.",
  TITLE     = "AMU-Tuning: Effective Logit Bias for CLIP-based Few-shot Learning",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "23323--23333",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223708"
}

@inproceedings{bb228707,
  AUTHOR    = "Huang, Y.S. and Shakeri, F. and Dolz, J. and Boudiaf, M. and Bahig, H. and Ben Ayed, I.",
  TITLE     = "LP++: A Surprisingly Strong Linear Probe for Few-Shot CLIP",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "23773--23782",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223709"
}

@inproceedings{bb228708,
  AUTHOR    = "Bai, J. and Gao, K. and Min, S.B. and Xia, S.T. and Li, Z.F. and Liu, W.",
  TITLE     = "BadCLIP: Trigger-Aware Prompt Learning for Backdoor Attacks on CLIP",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "24239--24250",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223710"
}

@inproceedings{bb228709,
  AUTHOR    = "Liang, S.Y. and Zhu, M.L. and Liu, A. and Wu, B.Y. and Cao, X.C. and Chang, E.C.",
  TITLE     = "BadCLIP: Dual-Embedding Guided Backdoor Attack on Multimodal Contrastive Learning",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "24645--24654",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223711"
}

@inproceedings{bb228710,
  AUTHOR    = "Cheng, J. and Liang, D. and Tan, S.",
  TITLE     = "Transfer CLIP for Generalizable Image Denoising",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "25974--25984",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223712"
}

@inproceedings{bb228711,
  AUTHOR    = "Ma, J.W. and Huang, P.Y. and Xie, S. and Li, S.W. and Zettlemoyer, L. and Chang, S.F. and Yih, W.T. and Xu, H.",
  TITLE     = "MoDE: CLIP Data Experts via Clustering",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "26344--26353",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223713"
}

@comment{ bb228712: last author surname was written in all capitals; normalised to match the other names. }
@inproceedings{bb228712,
  AUTHOR    = "Li, X. and Zhang, W. and Liu, Y.N. and Hu, Z.H. and Zhang, B. and Hu, X.L.",
  TITLE     = "Language-Driven Anchors for Zero-Shot Adversarial Robustness",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "24686--24695",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223714"
}

@inproceedings{bb228713,
  AUTHOR    = "Massiceti, D. and Longden, C. and Slowik, A. and Wills, S. and Grayson, M. and Morrison, C.",
  TITLE     = "Explaining CLIP's Performance Disparities on Data from Blind/Low Vision Users",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "12172--12182",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223715"
}

@inproceedings{bb228714,
  AUTHOR    = "Wu, S. and Tan, H. and Tian, Z. and Chen, Y. and Qi, X.J. and Jia, J.Y.",
  TITLE     = "SaCo Loss: Sample-Wise Affinity Consistency for Vision-Language Pre-Training",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "27348--27359",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223716"
}

@comment{ bb228715: NOTE(review) last author read "Surr, Z.A." and was corrected to "Sun, Z.A." -- confirm against the published author list. }
@inproceedings{bb228715,
  AUTHOR    = "Lin, H. and Bai, H. and Liu, Z. and Hou, L. and Sun, M. and Song, L.Q. and Wei, Y. and Sun, Z.A.",
  TITLE     = "MoPE-CLIP: Structured Pruning for Efficient Vision-Language Models with Module-Wise Pruning Error Metric",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "27360--27370",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223717"
}

@inproceedings{bb228716,
  AUTHOR    = "Gao, Y.P. and Wang, Z. and Zheng, W.S. and Xie, C. and Zhou, Y.",
  TITLE     = "Sculpting Holistic 3D Representation in Contrastive Language-Image-3D Pre-Training",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "22998--23008",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223718"
}

@inproceedings{bb228717,
  AUTHOR    = "Shen, S. and Zhu, Z. and Fan, L.Q. and Zhang, H. and Wu, X.X.",
  TITLE     = "DiffCLIP: Leveraging Stable Diffusion for Language Grounded 3D Classification",
  BOOKTITLE = WACV24,
  YEAR      = "2024",
  PAGES     = "3584--3593",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223719"
}

@inproceedings{bb228718,
  AUTHOR    = "Wan, B. and Tuytelaars, T.",
  TITLE     = "Exploiting CLIP for Zero-shot HOI Detection Requires Knowledge Distillation at Multiple Levels",
  BOOKTITLE = WACV24,
  YEAR      = "2024",
  PAGES     = "1794--1804",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223720"
}

@inproceedings{bb228719,
  AUTHOR    = "Mei, J. and Piergiovanni, A.J. and Hwang, J.N. and Li, W.",
  TITLE     = "SLVP: Self-Supervised Language-Video Pre-Training for Referring Video Object Segmentation",
  BOOKTITLE = Pretrain24,
  YEAR      = "2024",
  PAGES     = "507--517",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223721"
}

@inproceedings{bb228720,
  AUTHOR    = "Nicolas, J. and Chiaroni, F. and Ziko, I. and Ahmad, O. and Desrosiers, C. and Dolz, J.",
  TITLE     = "MoP-CLIP: A Mixture of Prompt-Tuned CLIP Models for Domain Incremental Learning",
  BOOKTITLE = WACV24,
  YEAR      = "2024",
  PAGES     = "1751--1761",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223722"
}

@inproceedings{bb228721,
  AUTHOR    = "Hess, G. and Tonderski, A. and Petersson, C. and Astrom, K. and Svensson, L.",
  TITLE     = "LidarCLIP or: How I Learned to Talk to Point Clouds",
  BOOKTITLE = WACV24,
  YEAR      = "2024",
  PAGES     = "7423--7432",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223723"
}

@inproceedings{bb228722,
  AUTHOR    = "Theisen, W. and Scheirer, W.",
  TITLE     = "C-CLIP: Contrastive Image-Text Encoders to Close the Descriptive-Commentative Gap",
  BOOKTITLE = WACV24,
  YEAR      = "2024",
  PAGES     = "7226--7235",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223724"
}

@inproceedings{bb228723,
  AUTHOR    = "Gondal, M.W. and Gast, J. and Ruiz, I.A. and Droste, R. and Macri, T. and Kumar, S. and Staudigl, L.",
  TITLE     = "Domain Aligned CLIP for Few-shot Classification",
  BOOKTITLE = WACV24,
  YEAR      = "2024",
  PAGES     = "5709--5718",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223725"
}

@inproceedings{bb228724,
  AUTHOR    = "Phan, T. and Vo, K. and Le, D. and Doretto, G. and Adjeroh, D. and Le, N.",
  TITLE     = "ZEETAD: Adapting Pretrained Vision-Language Model for Zero-Shot End-to-End Temporal Action Detection",
  BOOKTITLE = WACV24,
  YEAR      = "2024",
  PAGES     = "7031--7040",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223726"
}

@inproceedings{bb228725,
  AUTHOR    = "Lai, Z.F. and Bai, H.P. and Zhang, H.T. and Du, X.Z. and Shan, J.L. and Yang, Y.F. and Chuah, C.N. and Cao, M.",
  TITLE     = "Empowering Unsupervised Domain Adaptation with Large-scale Pre-trained Vision-Language Models",
  BOOKTITLE = WACV24,
  YEAR      = "2024",
  PAGES     = "2679--2689",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223727"
}

@inproceedings{bb228726,
  AUTHOR    = "Gupta, D. and Kharbanda, S. and Zhou, J.W. and Li, W. and Pfister, H. and Wei, D.L.",
  TITLE     = "CLIPTrans: Transferring Visual Knowledge with Pre-trained Models for Multimodal Machine Translation",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "2863--2874",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223728"
}

@inproceedings{bb228727,
  AUTHOR    = "Fang, H. and Yang, Z.F. and Wei, Y.H. and Zang, X.H. and Ban, C. and Feng, Z. and He, Z.J. and Li, Y.X. and Sun, H.",
  TITLE     = "Alignment and Generation Adapter for Efficient Video-Text Understanding",
  BOOKTITLE = CLVL23,
  YEAR      = "2023",
  PAGES     = "2783--2789",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223729"
}

@inproceedings{bb228728,
  AUTHOR    = "Zhu, B. and Niu, Y. and Han, Y.C. and Wu, Y. and Zhang, H.W.",
  TITLE     = "Prompt-aligned Gradient for Prompt Tuning",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "15613--15623",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223730"
}

@inproceedings{bb228729,
  AUTHOR    = "Yuan, H.J. and Zhang, S.W. and Wang, X. and Albanie, S. and Pan, Y. and Feng, T. and Jiang, J.W. and Ni, D. and Zhang, Y. and Zhao, D.L.",
  TITLE     = "RLIPv2: Fast Scaling of Relational Language-Image Pre-training",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "21592--21604",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223731"
}

@inproceedings{bb228730,
  AUTHOR    = "Wang, Z. and Yu, X. and Rao, Y.M. and Zhou, J. and Lu, J.W.",
  TITLE     = "Take-A-Photo: 3D-to-2D Generative Pre-training of Point Cloud Models",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "5617--5627",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223732"
}

@inproceedings{bb228731,
  AUTHOR    = "Li, M. and Wu, J. and Wang, X. and Chen, C. and Qin, J. and Xiao, X.F. and Wang, R. and Zheng, M. and Pan, X.",
  TITLE     = "AlignDet: Aligning Pre-training and Fine-tuning in Object Detection",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "6843--6853",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223733"
}

@inproceedings{bb228732,
  AUTHOR    = "Lee, S. and Chung, H.J. and Park, M.Y. and Park, J. and Ryu, W.S. and Ye, J.C.",
  TITLE     = "Improving 3D Imaging with Pre-Trained Perpendicular 2D Diffusion Models",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "10676--10686",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223734"
}

@inproceedings{bb228733,
  AUTHOR    = "Ye, Q.H. and Xu, G.H. and Yan, M. and Xu, H.Y. and Qian, Q. and Zhang, J. and Huang, F.",
  TITLE     = "HiTeA: Hierarchical Temporal-Aware Video-Language Pre-training",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "15359--15370",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223735"
}

@inproceedings{bb228734,
  AUTHOR    = "Wu, C.Y. and Zhang, X.M. and Zhang, Y. and Wang, Y.F. and Xie, W.",
  TITLE     = "MedKLIP: Medical Knowledge Enhanced Language-Image Pre-Training for X-ray Diagnosis",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "21315--21326",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223736"
}

@inproceedings{bb228735,
  AUTHOR    = "Yang, Q.S. and Li, W.Y. and Li, B. and Yuan, Y.X.",
  TITLE     = "MRM: Masked Relation Modeling for Medical Image Pre-Training with Genetics",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "21395--21405",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223737"
}

@inproceedings{bb228736,
  AUTHOR    = "Ma, W.X. and Li, S. and Zhang, J. and Liu, C.H. and Kang, J.X. and Wang, Y.L. and Huang, G.",
  TITLE     = "Borrowing Knowledge From Pre-trained Language Model: A New Data-efficient Visual Learning Paradigm",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "18740--18751",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223738"
}

@inproceedings{bb228737,
  AUTHOR    = "Ganugula, P. and Kumar, Y.S.S.S.S. and Reddy, N.K.S. and Chellingi, P. and Thakur, A. and Kasera, N. and Anand, C.S.",
  TITLE     = "MOSAIC: Multi-Object Segmented Arbitrary Stylization Using CLIP",
  BOOKTITLE = NIVT23,
  YEAR      = "2023",
  PAGES     = "892--903",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223739"
}

@inproceedings{bb228738,
  AUTHOR    = "Luo, Z.Y. and Zhao, P. and Xu, C. and Geng, X. and Shen, T. and Tao, C.Y. and Ma, J. and Lin, Q.W. and Jiang, D.X.",
  TITLE     = "LexLIP: Lexicon-Bottlenecked Language-Image Pre-Training for Large-Scale Image-Text Sparse Retrieval",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "11172--11183",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223740"
}

@inproceedings{bb228739,
  AUTHOR    = "Zhai, X.H. and Mustafa, B. and Kolesnikov, A. and Beyer, L.",
  TITLE     = "Sigmoid Loss for Language Image Pre-Training",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "11941--11952",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223741"
}

@inproceedings{bb228740,
  AUTHOR    = "Shtedritski, A. and Rupprecht, C. and Vedaldi, A.",
  TITLE     = "What does CLIP know about a red circle? Visual prompt engineering for VLMs",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "11953--11963",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223742"
}

@inproceedings{bb228741,
  AUTHOR    = "Zhu, Z.Y. and Ma, X.J. and Chen, Y.X. and Deng, Z.D. and Huang, S.Y. and Li, Q.",
  TITLE     = "3D-VisTA: Pre-trained Transformer for 3D Vision and Text Alignment",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "2899--2909",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223743"
}

@inproceedings{bb228742,
  AUTHOR    = "Yang, K.C. and Deng, J.K. and An, X. and Li, J.W. and Feng, Z. and Guo, J. and Yang, J. and Liu, T.L.",
  TITLE     = "ALIP: Adaptive Language-Image Pre-training with Synthetic Caption",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "2910--2919",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223744"
}

@inproceedings{bb228743,
  AUTHOR    = "Yang, Y.F. and Huang, W.Q. and Wei, Y.X. and Peng, H. and Jiang, X.Y. and Jiang, H.Q. and Wei, F. and Wang, Y. and Hu, H. and Qiu, L. and Yang, Y.Q.",
  TITLE     = "Attentive Mask CLIP",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "2759--2769",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223745"
}

@inproceedings{bb228744,
  AUTHOR    = "Vinker, Y. and Alaluf, Y. and Cohen Or, D. and Shamir, A.",
  TITLE     = "CLIPascene: Scene Sketching with Different Types and Levels of Abstraction",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "4123--4133",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223746"
}

@inproceedings{bb228745,
  AUTHOR    = "Wei, Y.X. and Hu, H. and Xie, Z. and Liu, Z. and Zhang, Z. and Cao, Y. and Bao, J.M. and Chen, D. and Guo, B.",
  TITLE     = "Improving CLIP Fine-tuning Performance",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "5416--5426",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223747"
}

@inproceedings{bb228746,
  AUTHOR    = "Maniparambil, M. and Vorster, C. and Molloy, D. and Murphy, N. and McGuinness, K. and O'Connor, N.E.",
  TITLE     = "Enhancing CLIP with GPT-4: Harnessing Visual Descriptions as Prompts",
  BOOKTITLE = MMFM23,
  YEAR      = "2023",
  PAGES     = "262--271",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223748"
}

@inproceedings{bb228747,
  AUTHOR    = "Zheng, X. and Huang, X.S. and Mei, G.F. and Hou, Y.N. and Lyu, Z.Y. and Dai, B. and Ouyang, W.L. and Gong, Y.S.",
  TITLE     = "Point Cloud Pre-Training with Diffusion Models",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "22935--22945",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223749"
}

@inproceedings{bb228748,
  AUTHOR    = "Huang, T.Y. and Dong, B. and Yang, Y.H. and Huang, X.S. and Lau, R.W.H. and Ouyang, W.L. and Zuo, W.M.",
  TITLE     = "CLIP2Point: Transfer CLIP to Point Cloud Classification with Image-Depth Pre-Training",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "22100--22110",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223750"
}

@inproceedings{bb228749,
  AUTHOR    = "Wu, K. and Peng, H.W. and Zhou, Z.H. and Xiao, B. and Liu, M.C. and Yuan, L. and Xuan, H. and Valenzuela, M. and Chen, X.S. and Wang, X.G. and Chao, H.Y. and Hu, H.",
  TITLE     = "TinyCLIP: CLIP Distillation via Affinity Mimicking and Weight Inheritance",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "21913--21923",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223751"
}

@inproceedings{bb228750,
  AUTHOR    = "Deng, X. and Shi, H. and Huang, R. and Li, C.L. and Xu, H. and Han, J.H. and Kwok, J. and Zhao, S. and Zhang, W. and Liang, X.D.",
  TITLE     = "GrowCLIP: Data-aware Automatic Model Growing for Large-scale Contrastive Language-Image Pre-training",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "22121--22132",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223752"
}

@inproceedings{bb228751,
  AUTHOR    = "Ranasinghe, K. and McKinzie, B. and Ravi, S. and Yang, Y.F. and Toshev, A. and Shlens, J.",
  TITLE     = "Perceptual Grouping in Contrastive Vision-Language Models",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "5548--5561",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223753"
}

@inproceedings{bb228752,
  AUTHOR    = "Shao, B. and Liu, J.Z. and Pei, R. and Xu, S. and Dai, P. and Lu, J.W. and Li, W. and Yan, Y.",
  TITLE     = "HiVLP: Hierarchical Interactive Video-Language Pre-Training",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "13710--13720",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223754"
}

@inproceedings{bb228753,
  AUTHOR    = "Ali, M. and Khan, S.",
  TITLE     = "CLIP-Decoder: ZeroShot Multilabel Classification using Multimodal CLIP Aligned Representations",
  BOOKTITLE = VLAR23,
  YEAR      = "2023",
  PAGES     = "4677--4681",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223755"
}

@inproceedings{bb228754,
  AUTHOR    = "Singha, M. and Pal, H. and Jha, A. and Banerjee, B.",
  TITLE     = "AD-CLIP: Adapting Domains in Prompt Space Using CLIP",
  BOOKTITLE = OutDistri23,
  YEAR      = "2023",
  PAGES     = "4357--4366",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223756"
}

@comment{
  bb228755 to bb228757: NOTE(review) these three records had no YEAR field.
  YEAR 2023 was added on the assumption that they belong to the OpenSUN3D
  workshop held with ICCV 2023 -- confirm against the proceedings.
}
@inproceedings{bb228755,
  AUTHOR    = "Zhang, J. and Dong, R. and Ma, K.",
  TITLE     = "CLIP-FO3D: Learning Free Open-world 3D Scene Representations from 2D Dense CLIP",
  BOOKTITLE = OpenSUN3D,
  YEAR      = "2023",
  PAGES     = "2040--2051",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223757"
}

@comment{
  bb228756: NOTE(review) PAGES read 2031-2049, which overlaps bb228755 (2040-2051)
  in the same proceedings. Corrected to 2031--2039 so that the three OpenSUN3D
  papers are contiguous (2020-2030, 2031-2039, 2040-2051) -- confirm.
}
@inproceedings{bb228756,
  AUTHOR    = "Auty, D. and Mikolajczyk, K.",
  TITLE     = "Learning to Prompt CLIP for Monocular Depth Estimation: Exploring the Limits of Human Language",
  BOOKTITLE = OpenSUN3D,
  YEAR      = "2023",
  PAGES     = "2031--2039",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223758"
}

@inproceedings{bb228757,
  AUTHOR    = "Hegde, D. and Valanarasu, J.M.J. and Patel, V.M.",
  TITLE     = "CLIP goes 3D: Leveraging Prompt Tuning for Language Grounded 3D Recognition",
  BOOKTITLE = OpenSUN3D,
  YEAR      = "2023",
  PAGES     = "2020--2030",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223759"
}

@inproceedings{bb228758,
  AUTHOR    = "Xu, X. and Xiong, T.Y. and Ding, Z. and Tu, Z.W.",
  TITLE     = "MasQCLIP for Open-Vocabulary Universal Image Segmentation",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "887--898",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223760"
}

@inproceedings{bb228759,
  AUTHOR    = "Wang, H.L. and Li, Y. and Yao, H. and Li, X.M.",
  TITLE     = "CLIPN for Zero-Shot OOD Detection: Teaching CLIP to Say No",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "1802--1812",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223761"
}

@inproceedings{bb228760,
  AUTHOR    = "Zhu, X.Y. and Zhang, R.R. and He, B. and Zhou, A. and Wang, D. and Zhao, B. and Gao, P.",
  TITLE     = "Not All Features Matter: Enhancing Few-shot CLIP with Adaptive Prior Refinement",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "2605--2615",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223762"
}

@inproceedings{bb228761,
  AUTHOR    = "Paiss, R. and Ephrat, A. and Tov, O. and Zada, S. and Mosseri, I. and Irani, M. and Dekel, T.",
  TITLE     = "Teaching CLIP to Count to Ten",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "3147--3157",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223763"
}

@inproceedings{bb228762,
  AUTHOR    = "Zhu, X.Y. and Zhang, R.R. and He, B. and Guo, Z.Y. and Zeng, Z. and Qin, Z. and Zhang, S.H. and Gao, P.",
  TITLE     = "PointCLIP V2: Prompting CLIP and GPT for Powerful 3D Open-world Learning",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "2639--2650",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223764"
}

@inproceedings{bb228763,
  AUTHOR    = "Yuan, M. and Lv, N.N. and Xie, Y.F. and Lu, F.X. and Zhan, K.",
  TITLE     = "CLIP-FG: Selecting Discriminative Image Patches by Contrastive Language-Image Pre-Training for Fine-Grained Image Classification",
  BOOKTITLE = ICIP23,
  YEAR      = "2023",
  PAGES     = "560--564",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223765"
}

@inproceedings{bb228764,
  AUTHOR    = "Zeng, Z.Y. and Ge, Y.Y. and Liu, X.H. and Chen, B. and Luo, P. and Xia, S.T. and Ge, Y.X.",
  TITLE     = "Learning Transferable Spatiotemporal Representations from Natural Script Knowledge",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "23079--23089",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223766"
}

@inproceedings{bb228765,
  AUTHOR    = "Wang, J.P. and Ge, Y.X. and Yan, R. and Ge, Y.Y. and Lin, K.Q. and Tsutsui, S. and Lin, X.D. and Cai, G. and Wu, J.P. and Shan, Y. and Qie, X. and Shou, M.Z.",
  TITLE     = "All in One: Exploring Unified Video-Language Pre-Training",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "6598--6608",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223767"
}

@inproceedings{bb228766,
  AUTHOR    = "Ramrakhya, R. and Batra, D. and Wijmans, E. and Das, A.",
  TITLE     = "PIRLNav: Pretraining with Imitation and RL Finetuning for OBJECTNAV",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "17896--17906",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223768"
}

@inproceedings{bb228767,
  AUTHOR    = "Lin, X.D. and Tiwari, S. and Huang, S.Y. and Li, M. and Shou, M.Z. and Ji, H. and Chang, S.F.",
  TITLE     = "Towards Fast Adaptation of Pretrained Contrastive Models for Multi-channel Video-Language Retrieval",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "14846--14855",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223769"
}

@inproceedings{bb228768,
  AUTHOR    = "Wang, H.C. and Du, X.D. and Li, J.H. and Yeh, R.A. and Shakhnarovich, G.",
  TITLE     = "Score Jacobian Chaining: Lifting Pretrained 2D Diffusion Models for 3D Generation",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "12619--12629",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223770"
}

@comment{ bb228769: the leading "XIII:" in PAGES is the proceedings part number, kept as given in the source. }
@inproceedings{bb228769,
  AUTHOR    = "Luo, Y.X. and Ji, J.Y. and Chen, X.F. and Zhang, Y.X. and Ren, T. and Luo, G.",
  TITLE     = "APL: Anchor-based Prompt Learning for One-stage Weakly Supervised Referring Expression Comprehension",
  BOOKTITLE = ECCV24,
  YEAR      = "2024",
  PAGES     = "XIII: 198--215",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223771"
}

@comment{
  bb228770: NOTE(review) PAGES held the placeholder 01-10 (a ten-page count, not a
  proceedings range). Replaced with 2681--2690 -- confirm against the CVPR 2023
  proceedings before relying on it.
}
@inproceedings{bb228770,
  AUTHOR    = "Jin, L. and Luo, G. and Zhou, Y. and Sun, X.S. and Jiang, G.N. and Shu, A. and Ji, R.R.",
  TITLE     = "RefCLIP: A Universal Teacher for Weakly Supervised Referring Expression Comprehension",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "2681--2690",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223772"
}

@inproceedings{bb228771,
  AUTHOR    = "Saito, K. and Sohn, K. and Zhang, X. and Li, C.L. and Lee, C.Y. and Saenko, K. and Pfister, T.",
  TITLE     = "Prefix Conditioning Unifies Language and Label Supervision",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "2861--2870",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223773"
}

@inproceedings{bb228772,
  AUTHOR    = "Park, J. and Han, B.H.",
  TITLE     = "Multi-Modal Representation Learning with Text-Driven Soft Masks",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "2798--2807",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223774"
}

@inproceedings{bb228773,
  AUTHOR    = "Jin, Z. and Hayat, M. and Yang, Y.W. and Guo, Y.L. and Lei, Y.J.",
  TITLE     = "Context-aware Alignment and Mutual Masking for 3D-Language Pre-training",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "10984--10994",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223775"
}

@inproceedings{bb228774,
  AUTHOR    = "Guo, Z.X. and Dong, B. and Ji, Z.L. and Bai, J.F. and Guo, Y.W. and Zuo, W.M.",
  TITLE     = "Texts as Images in Prompt Tuning for Multi-Label Image Recognition",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "2808--2817",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223776"
}

@inproceedings{bb228775,
  AUTHOR    = "Cherti, M. and Beaumont, R. and Wightman, R. and Wortsman, M. and Ilharco, G. and Gordon, C. and Schuhmann, C. and Schmidt, L. and Jitsev, J.",
  TITLE     = "Reproducible Scaling Laws for Contrastive Language-Image Learning",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "2818--2829",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223777"
}

@inproceedings{bb228776,
  AUTHOR    = "Lei, J. and Li, L.J. and Zhou, L. and Gan, Z. and Berg, T.L. and Bansal, M. and Liu, J.J.",
  TITLE     = "Less is More: CLIPBERT for Video-and-Language Learning via Sparse Sampling",
  BOOKTITLE = CVPR21,
  YEAR      = "2021",
  PAGES     = "7327--7337",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223778"
}

@inproceedings{bb228777,
  AUTHOR    = "Zhou, J.H. and Dong, L. and Gan, Z. and Wang, L.J. and Wei, F.",
  TITLE     = "Non-Contrastive Learning Meets Language-Image Pre-Training",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "11028--11038",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223779"
}

@inproceedings{bb228778,
  AUTHOR    = "Hu, Z. and Iscen, A. and Sun, C. and Wang, Z. and Chang, K.W. and Sun, Y.Z. and Schmid, C. and Ross, D.A. and Fathi, A.",
  TITLE     = "Reveal: Retrieval-Augmented Visual-Language Pre-Training with Multi-Source Multimodal Knowledge Memory",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "23369--23379",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223780"
}

@inproceedings{bb228779,
  AUTHOR    = "Li, Y.H. and Fan, H.Q. and Hu, R.H. and Feichtenhofer, C. and He, K.M.",
  TITLE     = "Scaling Language-Image Pre-Training via Masking",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "23390--23400",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223781"
}

@inproceedings{bb228780,
  AUTHOR    = "Jin, P. and Huang, J. and Xiong, P.F. and Tian, S.X. and Liu, C. and Ji, X.Y. and Yuan, L. and Chen, J.",
  TITLE     = "Video-Text as Game Players: Hierarchical Banzhaf Interaction for Cross-Modal Representation Learning",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "2472--2482",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223782"
}

@inproceedings{bb228781,
  AUTHOR    = "Ye, S.Q. and Xie, Y.J. and Chen, D.D. and Xu, Y. and Yuan, L. and Zhu, C.G. and Liao, J.",
  TITLE     = "Improving Commonsense in Vision-Language Models via Knowledge Graph Riddles",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "2634--2645",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223783"
}

@inproceedings{bb228782,
  AUTHOR    = "Li, H. and Zhu, J.G. and Jiang, X.H. and Zhu, X.Z. and Li, H.S. and Yuan, C. and Wang, X.H. and Qiao, Y. and Wang, X.G. and Wang, W.H. and Dai, J.F.",
  TITLE     = "Uni-Perceiver v2: A Generalist Model for Large-Scale Vision and Vision-Language Tasks",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "2691--2700",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223784"
}

@inproceedings{bb228783,
  AUTHOR    = "Wu, W.H. and Wang, X.H. and Luo, H.P. and Wang, J.D. and Yang, Y. and Ouyang, W.L.",
  TITLE     = "Bidirectional Cross-Modal Knowledge Exploration for Video Recognition with Pre-trained Vision-Language Models",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "6620--6630",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223785"
}

@inproceedings{bb228784,
  AUTHOR    = "Seth, A. and Hemani, M. and Agarwal, C.",
  TITLE     = "DeAR: Debiasing Vision-Language Models with Additive Residuals",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "6820--6829",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223786"
}

@inproceedings{bb228785,
  AUTHOR    = "Radenovic, F. and Dubey, A. and Kadian, A. and Mihaylov, T. and Vandenhende, S. and Patel, Y. and Wen, Y. and Ramanathan, V. and Mahajan, D.",
  TITLE     = "Filtering, Distillation, and Hard Negatives for Vision-Language Pre-Training",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "6967--6977",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223787"
}

@inproceedings{bb228786,
  AUTHOR    = "Yu, T. and Lu, Z. and Jin, X. and Chen, Z.B. and Wang, X.C.",
  TITLE     = "Task Residual for Tuning Vision-Language Models",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "10899--10909",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223788"
}

@comment{ bb228787: a stray at-sign and space preceding the word CREPE were removed from TITLE. }
@inproceedings{bb228787,
  AUTHOR    = "Ma, Z.X. and Hong, J. and Gul, M.O. and Gandhi, M. and Gao, I. and Krishna, R.",
  TITLE     = "CREPE: Can Vision-Language Foundation Models Reason Compositionally?",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "10910--10921",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223789"
}

@inproceedings{bb228788,
  AUTHOR    = "Yin, D. and Gao, F. and Thattai, G. and Johnston, M. and Chang, K.W.",
  TITLE     = "GIVL: Improving Geographical Inclusivity of Vision-Language Models with Pre-Training Methods",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "10951--10961",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223790"
}

@inproceedings{bb228789,
  AUTHOR    = "Gao, C. and Peng, X.Y. and Yan, M. and Wang, H. and Yang, L.R. and Ren, H.B. and Li, H.S. and Liu, S.",
  TITLE     = "Adaptive Zone-aware Hierarchical Planner for Vision-Language Navigation",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "14911--14920",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223791"
}

@inproceedings{bb228790,
  AUTHOR    = "Yeh, C.H. and Russell, B. and Sivic, J. and Heilbron, F.C. and Jenni, S.",
  TITLE     = "Meta-Personalizing Vision-Language Models to Find Named Instances in Video",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "19123--19132",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223792"
}

@inproceedings{bb228791,
  AUTHOR    = "Gou, Y.H. and Ko, T. and Yang, H. and Kwok, J. and Zhang, Y. and Wang, M.X.",
  TITLE     = "Leveraging per Image-Token Consistency for Vision-Language Pre-Training",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "19155--19164",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223793"
}

@inproceedings{bb228792,
  AUTHOR    = "Wang, S.J. and Chang, J.L. and Li, H.J. and Wang, Z.H. and Ouyang, W.L. and Tian, Q.",
  TITLE     = "Open-Set Fine-Grained Retrieval via Prompting Vision-Language Evaluator",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "19381--19391",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223794"
}

@inproceedings{bb228793,
  AUTHOR    = "Cheng, F. and Wang, X.Z. and Lei, J. and Crandall, D. and Bansal, M. and Bertasius, G.",
  TITLE     = "VindLU: A Recipe for Effective Video-and-Language Pretraining",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "10739--10750",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223795"
}

@inproceedings{bb228794,
  AUTHOR    = "Zhou, H.L. and Martin Martin, R. and Kapadia, M. and Savarese, S. and Niebles, J.C.",
  TITLE     = "Procedure-Aware Pretraining for Instructional Video Understanding",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "10727--10738",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223796"
}

@inproceedings{bb228795,
  AUTHOR    = "Yang, A. and Nagrani, A. and Seo, P.H. and Miech, A. and Pont Tuset, J. and Laptev, I. and Sivic, J. and Schmid, C.",
  TITLE     = "Vid2Seq: Large-Scale Pretraining of a Visual Language Model for Dense Video Captioning",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "10714--10726",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223797"
}

@inproceedings{bb228796,
  AUTHOR    = "Ji, Y. and Tu, R.C. and Jiang, J. and Kong, W.J. and Cai, C. and Zhao, W.Z. and Wang, H.F. and Yang, Y. and Liu, W.",
  TITLE     = "Seeing What You Miss: Vision-Language Pre-training with Semantic Completion Learning",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "6789--6798",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223798"
}

@inproceedings{bb228797,
  AUTHOR    = "Alper, M. and Fiman, M. and Averbuch Elor, H.",
  TITLE     = "Is BERT Blind? Exploring the Effect of Vision-and-Language Pretraining on Visual Language Understanding",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "6778--6788",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223799"
}

@inproceedings{bb228798,
  AUTHOR    = "Liu, M.Y. and Jiang, J. and Zhu, C. and Yin, X.C.",
  TITLE     = "VLPD: Context-Aware Pedestrian Detection via Vision-Language Semantic Self-Supervision",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "6662--6671",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223800"
}

@inproceedings{bb228799,
  AUTHOR    = "Wei, Y.X. and Cao, Y. and Zhang, Z. and Peng, H. and Yao, Z.L. and Xie, Z. and Hu, H. and Guo, B.",
  TITLE     = "iCLIP: Bridging Image Classification and Contrastive Language-Image Pre-training for Visual Recognition",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "2776--2786",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803clip3.html#TT223801"
}