% Entries bb235600 to bb235699: papers on large language / vision-language
% models (general, autonomous driving, visual question answering).
% Each BIBSOURCE field records the visionbib.com page the entry was taken from.
% Bare BOOKTITLE / JOURNAL values (CVPR24, ECCV24, PAMI, SPLetters, ...) are
% string macros that are not defined in this section; presumably they are
% declared elsewhere in the database -- confirm before using this section
% on its own.
% For ECCV entries the PAGES value is written as "volume: first--last".

@inproceedings{bb235600,
  AUTHOR    = "Mitra, C. and Huang, B. and Darrell, T.J. and Herzig, R.",
  TITLE     = "Compositional Chain-of-Thought Prompting for Large Multimodal Models",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "14420--14431",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230575"
}

@inproceedings{bb235601,
  AUTHOR    = "Liu, C. and Yin, K. and Cao, H.Y. and Jiang, X.H. and Li, X. and Liu, Y. and Jiang, D.Q. and Sun, X. and Xu, L.",
  TITLE     = "HRVDA: High-Resolution Visual Document Assistant",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "15534--15545",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230576"
}

@inproceedings{bb235602,
  AUTHOR    = "Luo, C. and Shen, Y.F. and Zhu, Z.Q. and Zheng, Q. and Yu, Z. and Yao, C.",
  TITLE     = "LayoutLLM: Layout Instruction Tuning with Large Language Models for Document Understanding",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "15630--15640",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230577"
}

@inproceedings{bb235603,
  AUTHOR    = "Yang, Y. and Sun, F.Y. and Weihs, L. and Vanderbilt, E. and Herrasti, A. and Han, W. and Wu, J.J. and Haber, N. and Krishna, R. and Liu, L.J. and Callison Burch, C. and Yatskar, M. and Kembhavi, A. and Clark, C.",
  TITLE     = "Holodeck: Language Guided Generation of 3D Embodied AI Environments",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "16277--16287",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230578"
}

@inproceedings{bb235604,
  AUTHOR    = "Qin, Y.R. and Zhou, E. and Liu, Q. and Yin, Z.F. and Sheng, L. and Zhang, R.M. and Qiao, Y. and Shao, J.",
  TITLE     = "MP5: A Multi-modal Open-ended Embodied System in Minecraft via Active Perception",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "16307--16316",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230579"
}

@inproceedings{bb235605,
  AUTHOR    = "Zhang, S. and Yu, X.Y. and Song, X.H. and Wang, X.H. and Jiang, S.Q.",
  TITLE     = "Imagine Before Go: Self-Supervised Generative Map for Object Goal Navigation",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "16414--16425",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230580"
}

@inproceedings{bb235606,
  AUTHOR    = "Li, H. and Yang, X. and Wang, Z.K. and Zhu, X.Z. and Zhou, J. and Qiao, Y. and Wang, X.G. and Li, H.S. and Lu, L.W. and Dai, J.F.",
  TITLE     = "Auto MC-Reward: Automated Dense Reward Design with Large Language Models for Minecraft",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "16426--16435",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230581"
}

@inproceedings{bb235607,
  AUTHOR    = "Liu, M.X. and Hayes, T.L. and Ricci, E. and Csurka, G. and Volpi, R.",
  TITLE     = "SHiNe: Semantic Hierarchy Nexus for Open-Vocabulary Object Detection",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "16634--16644",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230582"
}

@inproceedings{bb235608,
  AUTHOR    = "Kim, J. and Cho, E. and Kim, S. and Kim, H.W.J.",
  TITLE     = "Retrieval-Augmented Open-Vocabulary Object Detection",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "17427--17436",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230583"
}

@inproceedings{bb235609,
  AUTHOR    = "Saha, O. and van Horn, G. and Maji, S.",
  TITLE     = "Improved Zero-Shot Classification by Adapting VLMs with Text Descriptions",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "17542--17552",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230584"
}

@inproceedings{bb235610,
  AUTHOR    = "Toubal, I.E. and Avinash, A. and Alldrin, N.G. and Dlabal, J. and Zhou, W. and Luo, E. and Stretcu, O. and Xiong, H. and Lu, C.T. and Zhou, H. and Krishna, R. and Fuxman, A. and Duerig, T.",
  TITLE     = "Modeling Collaborator: Enabling Subjective Vision Classification with Minimal Human Effort via LLM Tool-Use",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "17553--17563",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230585"
}

@inproceedings{bb235611,
  AUTHOR    = "Li, X.Q. and Xu, J.Y. and Zhang, M.X. and Liu, J.M. and Shen, Y. and Ponomarenko, I. and Xu, J.H. and Heng, L. and Huang, S.Y. and Zhang, S.H. and Dong, H.",
  TITLE     = "Object-Centric Prompt-Driven Vision-Language-Action Model for Robotic Manipulation",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "27638--27648",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230586"
}

@inproceedings{bb235612,
  AUTHOR    = "Li, X.Q. and Zhang, M.X. and Geng, Y.R. and Geng, H.R. and Long, Y.X. and Shen, Y. and Zhang, R.R. and Liu, J.M. and Dong, H.",
  TITLE     = "ManipLLM: Embodied Multimodal Large Language Model for Object-Centric Robotic Manipulation",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "18061--18070",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230587"
}

@inproceedings{bb235613,
  AUTHOR    = "Han, T. and Bain, M. and Nagrani, A. and Varol, G. and Xie, W. and Zisserman, A.",
  TITLE     = "AutoAD III: The Prequel: Back to the Pixels",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "18164--18174",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230588"
}

@inproceedings{bb235614,
  AUTHOR    = "Qu, H.X. and Cai, Y.J. and Liu, J.",
  TITLE     = "LLMs are Good Action Recognizers",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "18395--18406",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230589"
}

@inproceedings{bb235615,
  AUTHOR    = "Chen, J. and Lv, Z.Y. and Wu, S.W. and Lin, K.Q. and Song, C. and Gao, D.F. and Liu, J.W. and Gao, Z.T. and Mao, D.X. and Shou, M.Z.",
  TITLE     = "VideoLLM-online: Online Video Large Language Model for Streaming Video",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "18407--18418",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230590"
}

@inproceedings{bb235616,
  AUTHOR    = "Zhu, A. and Ke, Q.H. and Gong, M.M. and Bailey, J.",
  TITLE     = "Part-Aware Unified Representation of Language and Skeleton for Zero-Shot Action Recognition",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "18761--18770",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230591"
}

@inproceedings{bb235617,
  AUTHOR    = "Chen, T.J. and Yu, H.S. and Yang, Z.G. and Li, Z.C. and Sun, W. and Chen, C.",
  TITLE     = "OST: Refining Text Knowledge with Optimal Spatio-Temporal Descriptor for General Video Recognition",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "18888--18898",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230592"
}

@inproceedings{bb235618,
  AUTHOR    = "Zhao, Q.H. and Dai, Y. and Li, H. and Hu, W. and Zhang, F. and Liu, J.",
  TITLE     = "LTGC: Long-Tail Recognition via Leveraging LLMs-Driven Generated Content",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "19510--19520",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230593"
}

@inproceedings{bb235619,
  AUTHOR    = "Siddiqui, Y. and Alliegro, A. and Artemov, A. and Tommasi, T. and Sirigatti, D. and Rosov, V. and Dai, A. and Nie{\ss}ner, M.",
  TITLE     = "MeshGPT: Generating Triangle Meshes with Decoder-Only Transformers",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "19615--19625",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230594"
}

@inproceedings{bb235620,
  AUTHOR    = "Li, Z. and Gao, Z.Y. and Tan, C. and Ren, B. and Yang, L.T. and Li, S.Z.",
  TITLE     = "General Point Model Pretraining with Autoencoding and Autoregressive",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "20954--20964",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230595"
}

@inproceedings{bb235621,
  AUTHOR    = "Li, K.C. and Wang, Y. and He, Y. and Li, Y.Z. and Wang, Y. and Liu, Y. and Wang, Z. and Xu, J. and Chen, G. and Lou, P. and Wang, L.M. and Qiao, Y.",
  TITLE     = "MVBench: A Comprehensive Multi-modal Video Understanding Benchmark",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "22195--22206",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230596"
}

@inproceedings{bb235622,
  AUTHOR    = "Taesiri, M.R. and Feng, T.J. and Bezemer, C.P. and Nguyen, A.",
  TITLE     = "GlitchBench: Can Large Multimodal Models Detect Video Game Glitches?",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "22444--22455",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230597"
}

@inproceedings{bb235623,
  AUTHOR    = "Zhang, R. and Zhang, Y.Z. and Chen, J. and Zhou, Y.F. and Gu, J.X. and Chen, C. and Sun, T.",
  TITLE     = "TRINS: Towards Multimodal Language Models that Can Read",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "22584--22594",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230598"
}

@inproceedings{bb235624,
  AUTHOR    = "Dunlap, L. and Zhang, Y.H. and Wang, X.H. and Zhong, R.Q. and Darrell, T.J. and Steinhardt, J. and Gonzalez, J.E. and Yeung Levy, S.",
  TITLE     = "Describing Differences in Image Sets with Natural Language",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "24199--24208",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230599"
}

@inproceedings{bb235625,
  AUTHOR    = "Ishmam, A.M. and Thomas, C.",
  TITLE     = "Semantic Shield: Defending Vision-Language Models Against Backdooring and Poisoning via Fine-Grained Knowledge Alignment",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "24820--24830",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230600"
}

@inproceedings{bb235626,
  AUTHOR    = "Yang, Y.J. and Zhou, T.Y. and Li, K. and Tao, D.P. and Li, L. and Shen, L. and He, X.D. and Jiang, J. and Shi, Y.H.",
  TITLE     = "Embodied Multi-Modal Agent trained by an LLM from a Parallel TextWorld",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "26265--26275",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230601"
}

@inproceedings{bb235627,
  AUTHOR    = "Hong, Y. and Zheng, Z. and Chen, P.H. and Wang, Y.F. and Li, J. and Gan, C.",
  TITLE     = "MultiPLY: A Multisensory Object-Centric Embodied Large Language Model in 3D World",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "26396--26406",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230602"
}

@inproceedings{bb235628,
  AUTHOR    = "Zhang, Y. and Dong, Y.P. and Zhang, S.Y. and Min, T.Z. and Su, H. and Zhu, J.",
  TITLE     = "Exploring the Transferability of Visual Prompting for Multimodal Large Language Models",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "26552--26562",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230603"
}

@inproceedings{bb235629,
  AUTHOR    = "Han, J.M. and Gong, K.X. and Zhang, Y.Y. and Wang, J.Q. and Zhang, K. and Lin, D. and Qiao, Y. and Gao, P. and Yue, X.Y.",
  TITLE     = "OneLLM: One Framework to Align All Modalities with Language",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "26574--26585",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230604"
}

@inproceedings{bb235630,
  AUTHOR    = "Xie, H.X. and Peng, C.J. and Tseng, Y.W. and Chen, H.J. and Hsu, C.F. and Shuai, H.H. and Cheng, W.H.",
  TITLE     = "EmoVIT: Revolutionizing Emotion Insights with Visual Instruction Tuning",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "26586--26595",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230605"
}

@inproceedings{bb235631,
  AUTHOR    = "Wang, X.Y. and Zhuang, B. and Wu, Q.",
  TITLE     = "ModaVerse: Efficiently Transforming Modalities with LLMs",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "26596--26606",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230606"
}

@inproceedings{bb235632,
  AUTHOR    = "Lin, J. and Yin, H.X. and Ping, W. and Molchanov, P. and Shoeybi, M. and Han, S.",
  TITLE     = "VILA: On Pre-training for Visual Language Models",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "26679--26689",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230607"
}

@inproceedings{bb235633,
  AUTHOR    = "Lyu, Y.H. and Zheng, X. and Zhou, J.Z. and Wang, L.",
  TITLE     = "UniBind: LLM-Augmented Unified and Balanced Representation Space to Bind Them All",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "26742--26752",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230608"
}

@inproceedings{bb235634,
  AUTHOR    = "Liang, T. and Huang, J. and Kong, M. and Chen, L. and Zhu, Q.",
  TITLE     = "Querying as Prompt: Parameter-Efficient Learning for Multimodal Language Model",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "26845--26855",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230609"
}

@inproceedings{bb235635,
  AUTHOR    = "Zhu, L. and Wei, F. and Lu, Y.",
  TITLE     = "Beyond Text: Frozen Large Language Models in Visual Signal Comprehension",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "27037--27047",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230610"
}

@inproceedings{bb235636,
  AUTHOR    = "Pi, R.J. and Yao, L.W. and Gao, J.H. and Zhang, J.P. and Zhang, T.",
  TITLE     = "PerceptionGPT: Effectively Fusing Visual Perception Into LLM",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "27114--27123",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230611"
}

@inproceedings{bb235637,
  AUTHOR    = "Tai, Y. and Fan, W.C. and Zhang, Z. and Liu, Z.W.",
  TITLE     = "Link-Context Learning for Multimodal LLMs",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "27166--27175",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230612"
}

@inproceedings{bb235638,
  AUTHOR    = "Tang, Z. and Yang, Z. and Khademi, M. and Liu, Y. and Zhu, C.G. and Bansal, M.",
  TITLE     = "CoDi-2: In-Context, Interleaved, and Interactive Any-to-Any Generation",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "27415--27424",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230613"
}

@inproceedings{bb235639,
  AUTHOR    = "Jain, J. and Yang, J.W. and Shi, H.",
  TITLE     = "VCoder: Versatile Vision Encoders for Multimodal Large Language Models",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "27992--28002",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230614"
}

@inproceedings{bb235640,
  AUTHOR    = "Yuan, Y.Q. and Li, W. and Liu, J. and Tang, D.Q. and Luo, X.J. and Qin, C. and Zhang, L. and Zhu, J.",
  TITLE     = "Osprey: Pixel Understanding with Visual Instruction Tuning",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "28202--28211",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230615"
}

@inproceedings{bb235641,
  AUTHOR    = "Zheng, Z.H. and Wei, J. and Hu, X.F. and Zhu, H.D. and Nevatia, R.",
  TITLE     = "Large Language Models are Good Prompt Learners for Low-Shot Image Classification",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "28453--28462",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230616"
}

@inproceedings{bb235642,
  AUTHOR    = "He, H.Y. and Pan, Z.Z. and Liu, J. and Cai, J.F. and Zhuang, B.",
  TITLE     = "Efficient Stitchable Task Adaptation",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "28555--28565",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230617"
}

@inproceedings{bb235643,
  AUTHOR    = "Tian, X.Y. and Zou, S. and Yang, Z.Y. and Zhang, J.",
  TITLE     = "ArGue: Attribute-Guided Prompt Tuning for Vision-Language Models",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "28578--28587",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230618"
}

@inproceedings{bb235644,
  AUTHOR    = "Barbany, O. and Huang, M. and Zhu, X.L. and Dhua, A.",
  TITLE     = "Leveraging Large Language Models for Multimodal Search",
  BOOKTITLE = FGVC24,
  YEAR      = "2024",
  PAGES     = "1201--1210",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230619"
}

@inproceedings{bb235645,
  AUTHOR    = "Lv, J.X. and Huang, Y. and Yan, M. and Huang, J.C. and Liu, J.Z. and Liu, Y.F. and Wen, Y.F. and Chen, X.X. and Chen, S.F.",
  TITLE     = "GPT4Motion: Scripting Physical Motions in Text-to-Video Generation via Blender-Oriented GPT Planning",
  BOOKTITLE = PBDL24,
  YEAR      = "2024",
  PAGES     = "1430--1440",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230620"
}

@inproceedings{bb235646,
  AUTHOR    = "Baldassini, F.B. and Shukor, M. and Cord, M. and Soulier, L. and Piwowarski, B.",
  TITLE     = "What Makes Multimodal In-Context Learning Work?",
  BOOKTITLE = Prompting24,
  YEAR      = "2024",
  PAGES     = "1539--1550",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230621"
}

@inproceedings{bb235647,
  AUTHOR    = "Wang, J.C. and Ke, L.",
  TITLE     = "LLM-Seg: Bridging Image Segmentation and Large Language Model Reasoning",
  BOOKTITLE = WhatNext24,
  YEAR      = "2024",
  PAGES     = "1765--1774",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230622"
}

@inproceedings{bb235648,
  AUTHOR    = "Hakim, Z.I.A. and Sarker, N.H. and Singh, R.P. and Paul, B. and Dabouei, A. and Xu, M.",
  TITLE     = "Leveraging Generative Language Models for Weakly Supervised Sentence Component Analysis in Video-Language Joint Learning",
  BOOKTITLE = MULA24,
  YEAR      = "2024",
  PAGES     = "1975--1985",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230623"
}

@inproceedings{bb235649,
  AUTHOR    = "Deria, A. and Kumar, K. and Chakraborty, S. and Mahapatra, D. and Roy, S.",
  TITLE     = "InVERGe: Intelligent Visual Encoder for Bridging Modalities in Report Generation",
  BOOKTITLE = MULA24,
  YEAR      = "2024",
  PAGES     = "2028--2038",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230624"
}

@inproceedings{bb235650,
  AUTHOR    = "Ma, F.P. and Zhou, Y.Z. and Zhang, Y.Y. and Wu, S.Y. and Zhang, Z. and He, Z.L. and Rao, F.Y. and Sun, X.Y.",
  TITLE     = "Task Navigator: Decomposing Complex Tasks for Multimodal Large Language Models",
  BOOKTITLE = Reasoning24,
  YEAR      = "2024",
  PAGES     = "2248--2257",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230625"
}

@inproceedings{bb235651,
  AUTHOR    = "Arefeen, M.A. and Debnath, B. and Uddin, M.Y.S. and Chakradhar, S.",
  TITLE     = "ViTA: An Efficient Video-to-Text Algorithm using VLM for RAG-based Video Analysis System",
  BOOKTITLE = Reasoning24,
  YEAR      = "2024",
  PAGES     = "2266--2274",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230626"
}

@inproceedings{bb235652,
  AUTHOR    = "Chen, Y.W. and Chu, S.Y.",
  TITLE     = "Large Language Models in Wargaming: Methodology, Application, and Robustness",
  BOOKTITLE = AML24,
  YEAR      = "2024",
  PAGES     = "2894--2903",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230627"
}

@inproceedings{bb235653,
  AUTHOR    = "Lai, Z.X. and Wu, J. and Chen, S. and Zhou, Y.C. and Hovakimyan, N.",
  TITLE     = "Residual-based Language Models are Free Boosters for Biomedical Imaging Tasks",
  BOOKTITLE = DEF-AI-MIA24,
  YEAR      = "2024",
  PAGES     = "5086--5096",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230628"
}

@inproceedings{bb235654,
  AUTHOR    = "Fang, X. and Wang, W.G. and Lv, X.X. and Yan, J.",
  TITLE     = "PCQA: A Strong Baseline for AIGC Quality Assessment Based on Prompt Condition",
  BOOKTITLE = NTIRE24,
  YEAR      = "2024",
  PAGES     = "6167--6176",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230629"
}

@inproceedings{bb235655,
  AUTHOR    = "Ye, Z. and Liu, J.X. and Cao, J.J. and Chen, Z.Y. and Xuan, Z.W. and Zhou, M.Y. and Liu, Q. and Qi, G.J.",
  TITLE     = "OpenStory: A Large-Scale Open-Domain Dataset for Subject-Driven Visual Storytelling",
  BOOKTITLE = VDU24,
  YEAR      = "2024",
  PAGES     = "7953--7962",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230630"
}

@inproceedings{bb235656,
  AUTHOR    = "Chen, X.Y. and Liu, J. and Wang, Y. and Wang, P.P. and Brand, M. and Wang, G.H. and Koike Akino, T.",
  TITLE     = "SuperLoRA: Parameter-Efficient Unified Adaptation for Large Vision Models",
  BOOKTITLE = ECV24,
  YEAR      = "2024",
  PAGES     = "8050--8055",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230631"
}

@inproceedings{bb235657,
  AUTHOR    = "Wei, C. and Liu, C.X. and Qiao, S.Y. and Zhang, Z.S. and Yuille, A.L. and Yu, J.H.",
  TITLE     = "De-Diffusion Makes Text a Strong Cross-Modal Interface",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13492--13503",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230632"
}

@inproceedings{bb235658,
  AUTHOR    = "Chen, B. and Xu, Z. and Kirmani, S. and Ichter, B. and Sadigh, D. and Guibas, L.J. and Xia, F.",
  TITLE     = "SpatialVLM: Endowing Vision-Language Models with Spatial Reasoning Capabilities",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "14455--14465",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230633"
}

@inproceedings{bb235659,
  AUTHOR    = "Dorkenwald, M. and Barazani, N. and Snoek, C.G.M. and Asano, Y.M.",
  TITLE     = "PIN: Positional Insert Unlocks Object Localisation Abilities in VLMs",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13548--13558",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230634"
}

@inproceedings{bb235660,
  AUTHOR    = "Cha, J. and Kang, W. and Mun, J. and Roh, B.",
  TITLE     = "Honeybee: Locality-Enhanced Projector for Multimodal LLM",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13817--13827",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230635"
}

@inproceedings{bb235661,
  AUTHOR    = "Sun, Z.Y. and Fang, Y. and Wu, T. and Zhang, P. and Zang, Y.H. and Kong, S. and Xiong, Y.J. and Lin, D. and Wang, J.Q.",
  TITLE     = "Alpha-CLIP: A CLIP Model Focusing on Wherever you Want",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13019--13029",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230636"
}

@inproceedings{bb235662,
  AUTHOR    = "Parashar, S. and Lin, Z.Q. and Liu, T. and Dong, X.J. and Li, Y. and Ramanan, D. and Caverlee, J. and Kong, S.",
  TITLE     = "The Neglected Tails in Vision-Language Models",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "12988--12997",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230637"
}

@inproceedings{bb235663,
  AUTHOR    = "Luo, Y. and Shi, M. and Khan, M.O. and Afzal, M.M. and Huang, H. and Yuan, S. and Tian, Y. and Song, L. and Kouhana, A. and Elze, T. and Fang, Y. and Wang, M.Y.",
  TITLE     = "FairCLIP: Harnessing Fairness in Vision-Language Learning",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "12289--12301",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230638"
}

@inproceedings{bb235664,
  AUTHOR    = "Zara, G. and Conti, A. and Roy, S. and Lathuiliere, S. and Rota, P. and Ricci, E.",
  TITLE     = "The Unreasonable Effectiveness of Large Language-Vision Models for Source-free Video Domain Adaptation",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "10273--10283",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230639"
}

@inproceedings{bb235665,
  AUTHOR    = "Zhao, H.B. and Ni, B.L. and Fan, J.S. and Wang, Y.X. and Chen, Y.T. and Meng, G.F. and Zhang, Z.X.",
  TITLE     = "Continual Forgetting for Pre-Trained Vision Models",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "28631--28642",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230640"
}

@inproceedings{bb235666,
  AUTHOR    = "Zhan, X.Y. and Yang, L.X. and Zhao, Y.F. and Mao, K. and Xu, H.L. and Lin, Z. and Li, K.L. and Lu, C.",
  TITLE     = "OakInk2: A Dataset of Bimanual Hands-Object Manipulation in Complex Task Completion",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "445--456",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230641"
}

@inproceedings{bb235667,
  AUTHOR    = "Li, Y.C. and Zhao, N. and Xiao, J.B. and Feng, C. and Wang, X. and Chua, T.S.",
  TITLE     = "LASO: Language-Guided Affordance Segmentation on 3D Object",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "14251--14260",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230642"
}

@inproceedings{bb235668,
  AUTHOR    = "Rotstein, N. and Bensaid, D. and Brody, S. and Ganz, R. and Kimmel, R.",
  TITLE     = "FuseCap: Leveraging Large Language Models for Enriched Fused Image Captions",
  BOOKTITLE = WACV24,
  YEAR      = "2024",
  PAGES     = "5677--5688",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230643"
}

@article{bb235669,
  AUTHOR    = "Lin, B.Q. and Nie, Y. and Wei, Z.M. and Chen, J.Q. and Ma, S. and Han, J.H. and Xu, H. and Chang, X.J. and Liang, X.D.",
  TITLE     = "NavCoT: Boosting LLM-Based Vision-and-Language Navigation via Learning Disentangled Reasoning",
  JOURNAL   = PAMI,
  VOLUME    = "47",
  YEAR      = "2025",
  NUMBER    = "7",
  MONTH     = jul,
  PAGES     = "5945--5957",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT230644"
}

@article{bb235670,
  AUTHOR    = "Ding, X.P. and Han, J.H. and Xu, H. and Zhang, W. and Li, X.M.",
  TITLE     = "HiLM-D: Enhancing MLLMs with Multi-scale High-Resolution Details for Autonomous Driving",
  JOURNAL   = IJCV,
  VOLUME    = "133",
  YEAR      = "2025",
  NUMBER    = "8",
  MONTH     = aug,
  PAGES     = "5379--5395",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT230645"
}

@inproceedings{bb235671,
  AUTHOR    = "Ding, X.P. and Han, J.H. and Xu, H. and Liang, X.D. and Zhang, W. and Li, X.M.",
  TITLE     = "Holistic Autonomous Driving Understanding by Bird's-Eye-View Injected Multi-Modal Large Models",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "13668--13677",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT230646"
}

@article{bb235672,
  AUTHOR    = "Liu, T.Q. and Qin, Y.J. and Zhang, S.H. and Tao, X.M.",
  TITLE     = "Empowering Corner Case Detection in Autonomous Vehicles With Multimodal Large Language Models",
  JOURNAL   = SPLetters,
  VOLUME    = "32",
  YEAR      = "2025",
  PAGES     = "51--55",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT230647"
}

@article{bb235673,
  AUTHOR    = "Wu, M.Y. and Yu, F.R. and Liu, P.X.P. and He, Y.",
  TITLE     = "Facilitating Autonomous Driving Tasks With Large Language Models",
  JOURNAL   = IEEE_Int_Sys,
  VOLUME    = "40",
  YEAR      = "2025",
  NUMBER    = "1",
  MONTH     = jan,
  PAGES     = "45--52",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT230648"
}

@article{bb235674,
  AUTHOR    = "Cao, J.H. and Liu, S. and Wu, C.F. and Li, Y. and Du, S.",
  TITLE     = "ATHENA - Autonomous Vehicle Trajectory Planning Considered Human Action Awareness",
  JOURNAL   = SPLetters,
  VOLUME    = "32",
  YEAR      = "2025",
  PAGES     = "1845--1849",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT230649"
}

@inproceedings{bb235675,
  AUTHOR    = "Chen, K. and Li, Y.Z. and Zhang, W.H. and Liu, Y.X. and Li, P.X. and Gao, R. and Hong, L.Q. and Tian, M. and Zhao, X.H. and Li, Z.G. and Yeung, D.Y. and Lu, H.C. and Jia, X.",
  TITLE     = "Automated Evaluation of Large Vision-Language Models on Self-Driving Corner Cases",
  BOOKTITLE = WACV25,
  YEAR      = "2025",
  PAGES     = "7817--7826",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT230650"
}

@inproceedings{bb235676,
  AUTHOR    = "Renz, K. and Chen, L. and Arani, E. and Sinavski, O.",
  TITLE     = "SimLingo: Vision-Only Closed-Loop Autonomous Driving with Language-Action Alignment",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "11993--12003",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT230651"
}

@inproceedings{bb235677,
  AUTHOR    = "Zhang, Z.Y. and Li, X.F. and Xu, Z.H. and Peng, W.J. and Zhou, Z.J. and Shi, M.J. and Huang, S.P.",
  TITLE     = "MPDrive: Improving Spatial Understanding with Marker-Based Prompt Learning for Autonomous Driving",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "12089--12099",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT230652"
}

@inproceedings{bb235678,
  AUTHOR    = "Xu, Z.H. and Bai, Y. and Zhang, Y.J. and Li, Z.L. and Xia, F. and Wong, K.Y.K. and Wang, J.Q. and Zhao, H.S.",
  TITLE     = "DriveGPT4-V2: Harnessing Large Language Model Capabilities for Enhanced Closed-Loop Autonomous Driving",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "17261--17270",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT230653"
}

@inproceedings{bb235679,
  AUTHOR    = "Hegde, D. and Yasarla, R. and Cai, H. and Han, S.Z. and Bhattacharyya, A. and Mahajan, S. and Liu, L.T. and Garrepalli, R. and Patel, V.M. and Porikli, F.M.",
  TITLE     = "Distilling Multi-Modal Large Language Models for Autonomous Driving",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "27575--27585",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT230654"
}

@inproceedings{bb235680,
  AUTHOR    = "Chen, Y. and Ding, Z.H. and Wang, Z.Q. and Wang, Y. and Zhang, L.J. and Liu, S.",
  TITLE     = "Asynchronous Large Language Model Enhanced Planner for Autonomous Driving",
  BOOKTITLE = ECCV24,
  YEAR      = "2024",
  PAGES     = "XXXVI: 22--38",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT230655"
}

@inproceedings{bb235681,
  AUTHOR    = "Li, B. and Wang, Y. and Mao, J. and Ivanovic, B. and Veer, S. and Leung, K. and Pavone, M.",
  TITLE     = "Driving Everywhere with Large Language Model Policy Adaptation",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "14948--14957",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT230656"
}

@inproceedings{bb235682,
  AUTHOR    = "Wei, Y.X. and Wang, Z. and Lu, Y.F. and Xu, C.X. and Liu, C.X. and Zhao, H. and Chen, S. and Wang, Y.F.",
  TITLE     = "Editable Scene Simulation for Autonomous Driving via Collaborative LLM-Agents",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "15077--15087",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT230657"
}

@inproceedings{bb235683,
  AUTHOR    = "Shao, H. and Hu, Y.X. and Wang, L. and Song, G.L. and Waslander, S.L. and Liu, Y. and Li, H.S.",
  TITLE     = "LMDrive: Closed-Loop End-to-End Driving with Large Language Models",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "15120--15130",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT230658"
}

@inproceedings{bb235684,
  AUTHOR    = "Ma, Y.S. and Cui, C. and Cao, X. and Ye, W.Q. and Liu, P.R. and Lu, J. and Abdelraouf, A. and Gupta, R. and Han, K.T. and Bera, A. and Rehg, J.M. and Wang, Z.",
  TITLE     = "LaMPilot: An Open Benchmark Dataset for Autonomous Driving with Language Model Programs",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "15141--15151",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT230659"
}

@inproceedings{bb235685,
  AUTHOR    = "Zhang, J.W. and Xu, C. and Li, B.",
  TITLE     = "ChatScene: Knowledge-Enabled Safety-Critical Scenario Generation for Autonomous Vehicles",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "15459--15469",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT230660"
}

@inproceedings{bb235686,
  AUTHOR    = "Sirnam, S. and Yang, J. and Neiman, T. and Rizve, M.N. and Tran, S. and Yao, B. and Chilimbi, T. and Shah, M.",
  TITLE     = "X-former: Unifying Contrastive and Reconstruction Learning for MLLMs",
  BOOKTITLE = ECCV24,
  YEAR      = "2024",
  PAGES     = "VI: 146--162",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT230661"
}

@inproceedings{bb235687,
  AUTHOR    = "Qiao, Y.Y. and Liu, Q.Y. and Liu, J.J. and Liu, J. and Wu, Q.",
  TITLE     = "LLM as Copilot for Coarse-grained Vision-and-language Navigation",
  BOOKTITLE = ECCV24,
  YEAR      = "2024",
  PAGES     = "V: 459--476",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT230662"
}

@inproceedings{bb235688,
  AUTHOR    = "Zhang, J.Y. and Huang, Z.M. and Ray, A. and Ohn Bar, E.",
  TITLE     = "Feedback-Guided Autonomous Driving",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "15000--15011",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT230663"
}

@inproceedings{bb235689,
  AUTHOR    = "Yang, Y. and Zhang, Q.W. and Li, C. and Marta, D.S. and Batool, N. and Folkesson, J.",
  TITLE     = "Human-Centric Autonomous Systems With LLMs for User Command Reasoning",
  BOOKTITLE = LLVMCrive24,
  YEAR      = "2024",
  PAGES     = "988--994",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT230664"
}

@inproceedings{bb235690,
  AUTHOR    = "Cui, C. and Ma, Y.S. and Cao, X. and Ye, W.Q. and Zhou, Y. and Liang, K. and Chen, J. and Lu, J. and Yang, Z. and Liao, K.D. and Gao, T. and Li, E. and Tang, K. and Cao, Z.P. and Zhou, T. and Liu, A. and Yan, X.R. and Mei, S.Q. and Cao, J.G. and Wang, Z. and Zheng, C.",
  TITLE     = "A Survey on Multimodal Large Language Models for Autonomous Driving",
  BOOKTITLE = LLVMCrive24,
  YEAR      = "2024",
  PAGES     = "958--979",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT230665"
}

@inproceedings{bb235691,
  AUTHOR    = "Fu, D.C. and Li, X. and Wen, L.C. and Dou, M. and Cai, P.L. and Shi, B. and Qiao, Y.",
  TITLE     = "Drive Like a Human: Rethinking Autonomous Driving with Large Language Models",
  BOOKTITLE = LLVMCrive24,
  YEAR      = "2024",
  PAGES     = "910--919",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT230666"
}

@article{bb235692,
  AUTHOR    = "Wang, J. and Zhu, M. and Li, Y. and Li, H.L. and Yang, L.Z. and Woo, W.L.",
  TITLE     = "Detect2Interact: Localizing Object Key Field in Visual Question Answering with LLMs",
  JOURNAL   = IEEE_Int_Sys,
  VOLUME    = "39",
  YEAR      = "2024",
  NUMBER    = "3",
  MONTH     = may,
  PAGES     = "35--44",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT230667"
}

@article{bb235693,
  AUTHOR    = "Hu, Z.J. and Yang, P. and Jiang, Y.S. and Bai, Z.J.",
  TITLE     = "Prompting large language model with context and pre-answer for knowledge-based VQA",
  JOURNAL   = PR,
  VOLUME    = "151",
  YEAR      = "2024",
  PAGES     = "110399",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT230668"
}

@article{bb235694,
  AUTHOR        = "Kuang, J.Y. and Shen, Y. and Xie, J. and Luo, H. and Xu, Z. and Li, R.H. and Li, Y.H. and Cheng, X.F. and Lin, X. and Han, Y.",
  TITLE         = "Natural Language Understanding and Inference with MLLM in Visual Question Answering: A Survey",
  JOURNAL       = Surveys,
  VOLUME        = "57",
  YEAR          = "2025",
  NUMBER        = "8",
  MONTH         = mar,
  INTERNAL-NOTE = "PAGES held the placeholder xx-yy and was dropped so it is not printed; add the real pages or article number once known",
  BIBSOURCE     = "http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT230669"
}

@article{bb235695,
  AUTHOR    = "Xiong, H.M. and Zhuge, Y.Z. and Zhu, J. and Zhang, L. and Lu, H.C.",
  TITLE     = "3UR-LLM: An End-to-End Multimodal Large Language Model for 3D Scene Understanding",
  JOURNAL   = MultMed,
  VOLUME    = "27",
  YEAR      = "2025",
  PAGES     = "2899--2911",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT230670"
}

@article{bb235696,
  AUTHOR    = "Yu, Z. and Ouyang, X.C. and Shao, Z.W. and Wang, M. and Yu, J.",
  TITLE     = "Prophet: Prompting Large Language Models With Complementary Answer Heuristics for Knowledge-Based Visual Question Answering",
  JOURNAL   = PAMI,
  VOLUME    = "47",
  YEAR      = "2025",
  NUMBER    = "8",
  MONTH     = aug,
  PAGES     = "6797--6808",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT230671"
}

@inproceedings{bb235697,
  AUTHOR    = "Shao, Z.W. and Yu, Z. and Wang, M. and Yu, J.",
  TITLE     = "Prompting Large Language Models with Answer Heuristics for Knowledge-Based Visual Question Answering",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "14974--14983",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT230672"
}

@article{bb235698,
  AUTHOR    = "Xu, Z. and Li, Q. and Nie, W.Z. and Wang, W.J. and Liu, A.",
  TITLE     = "Structure Causal Models and LLMs Integration in Medical Visual Question Answering",
  JOURNAL   = MedImg,
  VOLUME    = "44",
  YEAR      = "2025",
  NUMBER    = "8",
  MONTH     = aug,
  PAGES     = "3476--3489",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT230673"
}

@inproceedings{bb235699,
  AUTHOR    = "Fang, W.L. and Wu, Q. and Chen, J. and Xue, Y.",
  TITLE     = "Notes-guided MLLM Reasoning: Enhancing MLLM with Knowledge and Visual Notes for Visual Question Answering",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "19597--19607",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT230674"
}