@inproceedings{bb240900,
        AUTHOR = "Liu, S.L. and Cheng, H. and Liu, H.T. and Zhang, H. and Li, F. and Ren, T. and Zou, X. and Yang, J.W. and Su, H. and Zhu, J. and Zhang, L. and Gao, J.F. and Li, C.Y.",
        TITLE = "{LLaVA-Plus}: Learning to Use Tools for Creating Multimodal Agents",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XLVII: 126--142",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235817"}

@inproceedings{bb240901,
        AUTHOR = "Cai, R. and Song, Z. and Guan, D. and Chen, Z.H. and Li, Y.H. and Luo, X. and Yi, C.Y. and Kot, A.C.",
        TITLE = "{BenchLMM}: Benchmarking Cross-Style Visual Capability of Large Multimodal Models",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "L: 340--358",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235818"}

@inproceedings{bb240902,
        AUTHOR = "Yu, E. and Zhao, L. and Wei, Y. and Yang, J.R. and Wu, D.M. and Kong, L.Y. and Wang, T. and Ge, Z. and Zhang, X.Y. and Tao, W.B.",
        TITLE = "Merlin: Empowering Multimodal {LLMs} with Foresight Minds",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "IV: 425--443",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235819"}

@inproceedings{bb240903,
        AUTHOR = "Song, K.P. and Zhu, Y.Z. and Liu, B.C. and Yan, Q. and Elgammal, A. and Yang, X.",
        TITLE = "{MOMA}: Multimodal {LLM} Adapter for Fast Personalized Image Generation",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XL: 117--132",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235820"}

@inproceedings{bb240904,
        AUTHOR = "Gou, Y.H. and Chen, K. and Liu, Z. and Hong, L.Q. and Xu, H. and Li, Z.G. and Yeung, D.Y. and Kwok, J.T. and Zhang, Y.",
        TITLE = "Eyes Closed, Safety on: Protecting Multimodal {LLMs} via Image-to-text Transformation",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XVII: 388--404",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235821"}

@inproceedings{bb240905,
        AUTHOR = "Wang, D.S. and Cui, J. and Li, M. and Lin, W. and Chen, B. and Zhang, H.W.",
        TITLE = "Instruction Tuning-free Visual Token Complement for Multimodal {LLMs}",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "LXXXI: 446--462",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235822"}

@inproceedings{bb240906,
        AUTHOR = "McKinzie, B. and Gan, Z. and Fauconnier, J.P. and Dodge, S. and Zhang, B. and Dufter, P. and Shah, D. and Du, X.Z. and Peng, F. and Belyi, A. and Zhang, H.T. and Singh, K. and Kang, D. and He, H.Y. and Schwarzer, M. and Gunter, T. and Kong, X. and Zhang, A. and Wang, J.Y. and Wang, C. and Du, N. and Lei, T. and Wiseman, S. and Lee, M. and Wang, Z. and Pang, R. and Grasch, P. and Toshev, A. and Yang, Y.F.",
        TITLE = "{MM1}: Methods, Analysis and Insights from Multimodal {LLM} Pre-training",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XXIX: 304--323",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235823"}

@inproceedings{bb240907,
        AUTHOR = "Wang, Y. and Liu, X.G. and Li, Y. and Chen, M. and Xiao, C.W.",
        TITLE = "Adashield: Safeguarding Multimodal Large Language Models from Structure-based Attack via Adaptive Shield Prompting",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XX: 77--94",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235824"}

@inproceedings{bb240908,
        AUTHOR = "Zhao, H.H. and Zhou, P. and Shou, M.Z.",
        TITLE = "Genixer: Empowering Multimodal Large Language Model as a Powerful Data Generator",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XXIII: 129--147",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235825"}

@inproceedings{bb240909,
        AUTHOR = "Fu, X.Y. and Hu, Y.S. and Li, B.Z. and Feng, Y. and Wang, H.Y. and Lin, X.D. and Roth, D. and Smith, N.A. and Ma, W.C. and Krishna, R.",
        TITLE = "Blink: Multimodal Large Language Models Can See but Not Perceive",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XXIII: 148--166",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235826"}

@inproceedings{bb240910,
        AUTHOR = "Zhang, Z.K. and Li, Y.T. and Huang, H.F. and Lin, M.X. and Yi, L.",
        TITLE = "Freemotion: Mocap-free Human Motion Synthesis with Multimodal Large Language Models",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XXIII: 403--421",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235827"}

@inproceedings{bb240911,
        AUTHOR = "Pi, R.J. and Han, T.Y. and Xiong, W. and Zhang, J.P. and Liu, R.T. and Pan, R. and Zhang, T.",
        TITLE = "Strengthening Multimodal Large Language Model with Bootstrapped Preference Optimization",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XXXIII: 382--398",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235828"}

@inproceedings{bb240912,
        AUTHOR = "Xia, B. and Wang, S.Y. and Tao, Y. and Wang, Y.T. and Jia, J.Y.",
        TITLE = "Llmga: Multimodal Large Language Model Based Generation Assistant",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XXXVIII: 389--406",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235829"}

@inproceedings{bb240913,
        AUTHOR = "Wu, T. and Ma, K. and Liang, J. and Yang, Y. and Zhang, L.",
        TITLE = "A Comprehensive Study of Multimodal Large Language Models for Image Quality Assessment",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "LXXIV: 143--160",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235830"}

@inproceedings{bb240914,
        AUTHOR = "Xu, J. and Lo, S.Y. and Safaei, B. and Patel, V.M. and Dwivedi, I.",
        TITLE = "Towards Zero-Shot Anomaly Detection and Reasoning with Multimodal Large Language Models",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "20370--20382",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235831"}

@inproceedings{bb240915,
        AUTHOR = "Yang, Y.C. and Lee, K. and Dariush, B. and Cao, Y. and Lo, S.Y.",
        TITLE = "Follow the Rules: Reasoning for Video Anomaly Detection with Large Language Models",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "LXXXI: 304--322",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235832"}

@inproceedings{bb240916,
        AUTHOR = "Zheng, S. and Zhou, B. and Feng, Y.C. and Wang, Y. and Lu, Z.Q.",
        TITLE = "Unicode: Learning a Unified Codebook for Multimodal Large Language Models",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "VIII: 426--443",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235833"}

@inproceedings{bb240917,
        AUTHOR = "Ren, Z.W. and Huang, Z.C. and Wei, Y.C. and Zhao, Y. and Fu, D.M. and Feng, J.S. and Jin, X.J.",
        TITLE = "{PixelLM}: Pixel Reasoning with Large Multimodal Model",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26364--26373",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235834"}

@inproceedings{bb240918,
        AUTHOR = "Yue, X. and Ni, Y.S. and Zheng, T.Y. and Zhang, K. and Liu, R. and Zhang, G. and Stevens, S. and Jiang, D. and Ren, W.M. and Sun, Y.X. and Wei, C. and Yu, B.T. and Yuan, R.B. and Sun, R.L. and Yin, M. and Zheng, B. and Yang, Z.Z. and Liu, Y. and Huang, W.H. and Sun, H. and Su, Y. and Chen, W.",
        TITLE = "{MMMU}: A Massive Multi-Discipline Multimodal Understanding and Reasoning Benchmark for Expert {AGI}",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "9556--9567",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235835"}

@inproceedings{bb240919,
        AUTHOR = "Xia, Z.F. and Han, D.C. and Han, Y.Z. and Pan, X. and Song, S. and Huang, G.",
        TITLE = "{GSVA}: Generalized Segmentation via Multimodal Large Language Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "3858--3869",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235836"}

@inproceedings{bb240920,
        AUTHOR = "Du, Y.Y. and Wang, X.C. and Chen, C. and Ye, J. and Wang, Y. and Li, P. and Yan, M. and Zhang, J. and Huang, F. and Sui, Z.F. and Sun, M. and Liu, Y.",
        TITLE = "{AdaMMS}: Model Merging for Heterogeneous Multimodal Large Language Models with Unsupervised Coefficient Optimization",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "9413--9422",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235837"}

@inproceedings{bb240921,
        AUTHOR = "Ye, Q.H. and Xu, H.Y. and Ye, J. and Yan, M. and Hu, A. and Liu, H. and Qian, Q. and Zhang, J. and Huang, F.",
        TITLE = "{mPLUG-Owl2}: Revolutionizing Multi-modal Large Language Model with Modality Collaboration",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13040--13051",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235838"}

@inproceedings{bb240922,
        AUTHOR = "Qi, P. and Yan, Z. and Hsu, W. and Lee, M.L.",
        TITLE = "Sniffer: Multimodal Large Language Model for Explainable Out-of-Context Misinformation Detection",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13052--13062",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235839"}

@inproceedings{bb240923,
        AUTHOR = "Mitra, C. and Huang, B. and Darrell, T.J. and Herzig, R.",
        TITLE = "Compositional Chain-of-Thought Prompting for Large Multimodal Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14420--14431",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235840"}

@inproceedings{bb240924,
        AUTHOR = "Li, X.Q. and Xu, J.Y. and Zhang, M.X. and Liu, J.M. and Shen, Y. and Ponomarenko, I. and Xu, J.H. and Heng, L. and Huang, S.Y. and Zhang, S.H. and Dong, H.",
        TITLE = "Object-Centric Prompt-Driven Vision-Language-Action Model for Robotic Manipulation",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "27638--27648",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235841"}

@inproceedings{bb240925,
        AUTHOR = "Li, X.Q. and Zhang, M.X. and Geng, Y.R. and Geng, H.R. and Long, Y.X. and Shen, Y. and Zhang, R.R. and Liu, J.M. and Dong, H.",
        TITLE = "{ManipLLM}: Embodied Multimodal Large Language Model for Object-Centric Robotic Manipulation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "18061--18070",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235842"}

@inproceedings{bb240926,
        AUTHOR = "Taesiri, M.R. and Feng, T.J. and Bezemer, C.P. and Nguyen, A.",
        TITLE = "{GlitchBench}: Can Large Multimodal Models Detect Video Game Glitches?",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "22444--22455",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235843"}

@inproceedings{bb240927,
        AUTHOR = "Zhang, R. and Zhang, Y.Z. and Chen, J. and Zhou, Y.F. and Gu, J.X. and Chen, C. and Sun, T.",
        TITLE = "{TRINS}: Towards Multimodal Language Models that Can Read",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "22584--22594",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235844"}

@inproceedings{bb240928,
        AUTHOR = "Zhang, Y. and Dong, Y.P. and Zhang, S.Y. and Min, T.Z. and Su, H. and Zhu, J.",
        TITLE = "Exploring the Transferability of Visual Prompting for Multimodal Large Language Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26552--26562",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235845"}

@inproceedings{bb240929,
        AUTHOR = "Liang, T. and Huang, J. and Kong, M. and Chen, L. and Zhu, Q.",
        TITLE = "Querying as Prompt: Parameter-Efficient Learning for Multimodal Language Model",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26845--26855",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235846"}

@inproceedings{bb240930,
        AUTHOR = "Pi, R.J. and Yao, L.W. and Gao, J.H. and Zhang, J.P. and Zhang, T.",
        TITLE = "{PerceptionGPT}: Effectively Fusing Visual Perception Into {LLM}",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "27114--27123",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235847"}

@inproceedings{bb240931,
        AUTHOR = "Tai, Y. and Fan, W.C. and Zhang, Z. and Liu, Z.W.",
        TITLE = "Link-Context Learning for Multimodal {LLMs}",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "27166--27175",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235848"}

@inproceedings{bb240932,
        AUTHOR = "Jain, J. and Yang, J.W. and Shi, H.",
        TITLE = "{VCoder}: Versatile Vision Encoders for Multimodal Large Language Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "27992--28002",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235849"}

@inproceedings{bb240933,
        AUTHOR = "Barbany, O. and Huang, M. and Zhu, X.L. and Dhua, A.",
        TITLE = "Leveraging Large Language Models for Multimodal Search",
        BOOKTITLE = FGVC24,
        YEAR = "2024",
        PAGES = "1201--1210",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235850"}

@inproceedings{bb240934,
        AUTHOR = "Baldassini, F.B. and Shukor, M. and Cord, M. and Soulier, L. and Piwowarski, B.",
        TITLE = "What Makes Multimodal In-Context Learning Work?",
        BOOKTITLE = Prompting24,
        YEAR = "2024",
        PAGES = "1539--1550",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235851"}

@inproceedings{bb240935,
        AUTHOR = "Ma, F.P. and Zhou, Y.Z. and Zhang, Y.Y. and Wu, S.Y. and Zhang, Z. and He, Z.L. and Rao, F.Y. and Sun, X.Y.",
        TITLE = "Task Navigator: Decomposing Complex Tasks for Multimodal Large Language Models",
        BOOKTITLE = Reasoning24,
        YEAR = "2024",
        PAGES = "2248--2257",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235852"}

@inproceedings{bb240936,
        AUTHOR = "Cha, J. and Kang, W. and Mun, J. and Roh, B.",
        TITLE = "Honeybee: Locality-Enhanced Projector for Multimodal {LLM}",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13817--13827",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235853"}

@inproceedings{bb240937,
        AUTHOR = "Lai, C.G. and Song, S.L. and Yan, S. and Hu, G.",
        TITLE = "Improving Vision and Language Concepts Understanding with Multimodal Counterfactual Samples",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "LXIX: 174--191",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235854"}

@inproceedings{bb240938,
        AUTHOR = "Cao, J.J. and Ye, P. and Li, S.Z. and Yu, C. and Tang, Y.S. and Lu, J.W. and Chen, T.",
        TITLE = "{MADTP}: Multimodal Alignment-Guided Dynamic Token Pruning for Accelerating Vision-Language Transformer",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "15710--15719",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235855"}

@inproceedings{bb240939,
        AUTHOR = "Sahin, U. and Li, H. and Khan, Q. and Cremers, D. and Tresp, V.",
        TITLE = "Enhancing Multimodal Compositional Reasoning of Visual Language Models with Generative Negative Mining",
        BOOKTITLE = WACV24,
        YEAR = "2024",
        PAGES = "5551--5561",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235856"}

@inproceedings{bb240940,
        AUTHOR = "Hu, Z.Z. and Zhu, X.L. and Tran, S. and Vidal, R. and Dhua, A.",
        TITLE = "{ProVLA}: Compositional Image Search with Progressive Vision-Language Alignment and Multimodal Fusion",
        BOOKTITLE = CLVL23,
        YEAR = "2023",
        PAGES = "2764--2769",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT235857"}

@article{bb240941,
        AUTHOR = "Lin, B.Q. and Nie, Y.S. and Wei, Z.M. and Chen, J.Q. and Ma, S.K. and Han, J.H. and Xu, H. and Chang, X.J. and Liang, X.D.",
        TITLE = "{NavCoT}: Boosting {LLM}-Based Vision-and-Language Navigation via Learning Disentangled Reasoning",
        JOURNAL = PAMI,
        VOLUME = "47",
        YEAR = "2025",
        NUMBER = "7",
        MONTH = jul,
        PAGES = "5945--5957",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT235858"}

@article{bb240942,
        AUTHOR = "Ding, X.P. and Han, J.H. and Xu, H. and Zhang, W. and Li, X.M.",
        TITLE = "{HiLM-D}: Enhancing {MLLMs} with Multi-scale High-Resolution Details for Autonomous Driving",
        JOURNAL = IJCV,
        VOLUME = "133",
        YEAR = "2025",
        NUMBER = "8",
        MONTH = aug,
        PAGES = "5379--5395",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT235859"}

@inproceedings{bb240943,
        AUTHOR = "Ding, X.P. and Han, J.H. and Xu, H. and Liang, X.D. and Zhang, W. and Li, X.M.",
        TITLE = "Holistic Autonomous Driving Understanding by Bird's-Eye-View Injected Multi-Modal Large Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13668--13677",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT235860"}

@article{bb240944,
        AUTHOR = "Liu, T.Q. and Qin, Y.J. and Zhang, S.H. and Tao, X.M.",
        TITLE = "Empowering Corner Case Detection in Autonomous Vehicles With Multimodal Large Language Models",
        JOURNAL = SPLetters,
        VOLUME = "32",
        YEAR = "2025",
        PAGES = "51--55",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT235861"}

@article{bb240945,
        AUTHOR = "Wu, M.Y. and Yu, F.R. and Liu, P.X.P. and He, Y.",
        TITLE = "Facilitating Autonomous Driving Tasks With Large Language Models",
        JOURNAL = IEEE_Int_Sys,
        VOLUME = "40",
        YEAR = "2025",
        NUMBER = "1",
        MONTH = jan,
        PAGES = "45--52",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT235862"}

@article{bb240946,
        AUTHOR = "Cao, J.H. and Liu, S. and Wu, C.F. and Li, Y. and Du, S.",
        TITLE = "{ATHENA} - Autonomous Vehicle Trajectory Planning Considered Human Action Awareness",
        JOURNAL = SPLetters,
        VOLUME = "32",
        YEAR = "2025",
        PAGES = "1845--1849",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT235863"}

@inproceedings{bb240947,
        AUTHOR = "Chahe, A. and Zhou, L.F.",
        TITLE = "{ReasonDrive}: Efficient Visual Question Answering for Autonomous Vehicles with Reasoning-Enhanced Small Vision-Language Models",
        BOOKTITLE = DistillDrive24,
        YEAR = "2024",
        PAGES = "3870--3879",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT235864"}

@inproceedings{bb240948,
        AUTHOR = "Chen, K. and Li, Y.Z. and Zhang, W.H. and Liu, Y.X. and Li, P.X. and Gao, R. and Hong, L.Q. and Tian, M. and Zhao, X.H. and Li, Z.G. and Yeung, D.Y. and Lu, H.C. and Jia, X.",
        TITLE = "Automated Evaluation of Large Vision-Language Models on Self-Driving Corner Cases",
        BOOKTITLE = WACV25,
        YEAR = "2025",
        PAGES = "7817--7826",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT235865"}

@inproceedings{bb240949,
        AUTHOR = "Renz, K. and Chen, L. and Arani, E. and Sinavski, O.",
        TITLE = "{SimLingo}: Vision-Only Closed-Loop Autonomous Driving with Language-Action Alignment",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "11993--12003",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT235866"}

@inproceedings{bb240950,
        AUTHOR = "Zhang, Z.Y. and Li, X.F. and Xu, Z.H. and Peng, W.J. and Zhou, Z.J. and Shi, M.J. and Huang, S.P.",
        TITLE = "{MPDrive}: Improving Spatial Understanding with Marker-Based Prompt Learning for Autonomous Driving",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "12089--12099",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT235867"}

@inproceedings{bb240951,
        AUTHOR = "Xu, Z.H. and Bai, Y. and Zhang, Y.J. and Li, Z.L. and Xia, F. and Wong, K.Y.K. and Wang, J.Q. and Zhao, H.S.",
        TITLE = "{DriveGPT4-V2}: Harnessing Large Language Model Capabilities for Enhanced Closed-Loop Autonomous Driving",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "17261--17270",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT235868"}

@inproceedings{bb240952,
        AUTHOR = "Hegde, D. and Yasarla, R. and Cai, H. and Han, S.Z. and Bhattacharyya, A. and Mahajan, S. and Liu, L.T. and Garrepalli, R. and Patel, V.M. and Porikli, F.M.",
        TITLE = "Distilling Multi-Modal Large Language Models for Autonomous Driving",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "27575--27585",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT235869"}

@inproceedings{bb240953,
        AUTHOR = "Chen, Y. and Ding, Z.H. and Wang, Z.Q. and Wang, Y. and Zhang, L.J. and Liu, S.",
        TITLE = "Asynchronous Large Language Model Enhanced Planner for Autonomous Driving",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XXXVI: 22--38",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT235870"}

@inproceedings{bb240954,
        AUTHOR = "Li, B. and Wang, Y. and Mao, J. and Ivanovic, B. and Veer, S. and Leung, K. and Pavone, M.",
        TITLE = "Driving Everywhere with Large Language Model Policy Adaptation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14948--14957",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT235871"}

@inproceedings{bb240955,
        AUTHOR = "Wei, Y.X. and Wang, Z. and Lu, Y.F. and Xu, C.X. and Liu, C.X. and Zhao, H. and Chen, S. and Wang, Y.F.",
        TITLE = "Editable Scene Simulation for Autonomous Driving via Collaborative {LLM}-Agents",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "15077--15087",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT235872"}

@inproceedings{bb240956,
        AUTHOR = "Shao, H. and Hu, Y.X. and Wang, L. and Song, G.L. and Waslander, S.L. and Liu, Y. and Li, H.S.",
        TITLE = "{LMDrive}: Closed-Loop End-to-End Driving with Large Language Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "15120--15130",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT235873"}

@inproceedings{bb240957,
        AUTHOR = "Ma, Y.S. and Cui, C. and Cao, X. and Ye, W.Q. and Liu, P.R. and Lu, J. and Abdelraouf, A. and Gupta, R. and Han, K.T. and Bera, A. and Rehg, J.M. and Wang, Z.",
        TITLE = "{LaMPilot}: An Open Benchmark Dataset for Autonomous Driving with Language Model Programs",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "15141--15151",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT235874"}

@inproceedings{bb240958,
        AUTHOR = "Zhang, J.W. and Xu, C. and Li, B.",
        TITLE = "{ChatScene}: Knowledge-Enabled Safety-Critical Scenario Generation for Autonomous Vehicles",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "15459--15469",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT235875"}

@inproceedings{bb240959,
        AUTHOR = "Sirnam, S. and Yang, J. and Neiman, T. and Rizve, M.N. and Tran, S. and Yao, B. and Chilimbi, T. and Shah, M.",
        TITLE = "X-former: Unifying Contrastive and Reconstruction Learning for {MLLMs}",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "VI: 146--162",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT235876"}

@inproceedings{bb240960,
        AUTHOR = "Qiao, Y.Y. and Liu, Q.Y. and Liu, J.J. and Liu, J. and Wu, Q.",
        TITLE = "{LLM} as Copilot for Coarse-grained Vision-and-language Navigation",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "V: 459--476",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT235877"}

@inproceedings{bb240961,
        AUTHOR = "Zhang, J.Y. and Huang, Z.M. and Ray, A. and Ohn Bar, E.",
        TITLE = "Feedback-Guided Autonomous Driving",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "15000--15011",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT235878"}

@inproceedings{bb240962,
        AUTHOR = "Yang, Y. and Zhang, Q.W. and Li, C. and Marta, D.S. and Batool, N. and Folkesson, J.",
        TITLE = "Human-Centric Autonomous Systems With {LLMs} for User Command Reasoning",
        BOOKTITLE = LLVMCrive24,
        YEAR = "2024",
        PAGES = "988--994",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT235879"}

@inproceedings{bb240963,
        AUTHOR = "Cui, C. and Ma, Y.S. and Cao, X. and Ye, W.Q. and Zhou, Y. and Liang, K. and Chen, J. and Lu, J. and Yang, Z. and Liao, K.D. and Gao, T. and Li, E. and Tang, K. and Cao, Z.P. and Zhou, T. and Liu, A. and Yan, X.R. and Mei, S.Q. and Cao, J.G. and Wang, Z. and Zheng, C.",
        TITLE = "A Survey on Multimodal Large Language Models for Autonomous Driving",
        BOOKTITLE = LLVMCrive24,
        YEAR = "2024",
        PAGES = "958--979",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT235880"}

@inproceedings{bb240964,
        AUTHOR = "Fu, D.C. and Li, X. and Wen, L.C. and Dou, M. and Cai, P.L. and Shi, B. and Qiao, Y.",
        TITLE = "Drive Like a Human: Rethinking Autonomous Driving with Large Language Models",
        BOOKTITLE = LLVMCrive24,
        YEAR = "2024",
        PAGES = "910--919",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmdr5.html#TT235881"}

@article{bb240965,
        AUTHOR = "Wang, J. and Zhu, M. and Li, Y. and Li, H.L. and Yang, L.Z. and Woo, W.L.",
        TITLE = "{Detect2Interact}: Localizing Object Key Field in Visual Question Answering with {LLMs}",
        JOURNAL = IEEE_Int_Sys,
        VOLUME = "39",
        YEAR = "2024",
        NUMBER = "3",
        MONTH = may,
        PAGES = "35--44",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235882"}

@article{bb240966,
        AUTHOR = "Hu, Z.J. and Yang, P. and Jiang, Y.S. and Bai, Z.J.",
        TITLE = "Prompting large language model with context and pre-answer for knowledge-based {VQA}",
        JOURNAL = PR,
        VOLUME = "151",
        YEAR = "2024",
        PAGES = "110399",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235883"}

@article{bb240967,
        AUTHOR = "Kuang, J.Y. and Shen, Y. and Xie, J. and Luo, H. and Xu, Z. and Li, R.H. and Li, Y.H. and Cheng, X.F. and Lin, X. and Han, Y.",
        TITLE = "Natural Language Understanding and Inference with {MLLM} in Visual Question Answering: A Survey",
        JOURNAL = Surveys,
        VOLUME = "57",
        YEAR = "2025",
        NUMBER = "8",
        MONTH = mar,
        PAGES = "xx-yy",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235884"}

@article{bb240968,
        AUTHOR = "Xiong, H.M. and Zhuge, Y.Z. and Zhu, J. and Zhang, L. and Lu, H.C.",
        TITLE = "{3UR-LLM}: An End-to-End Multimodal Large Language Model for {3D} Scene Understanding",
        JOURNAL = MultMed,
        VOLUME = "27",
        YEAR = "2025",
        PAGES = "2899--2911",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235885"}

@article{bb240969,
        AUTHOR = "Yu, Z. and Ouyang, X.C. and Shao, Z.W. and Wang, M. and Yu, J.",
        TITLE = "Prophet: Prompting Large Language Models With Complementary Answer Heuristics for Knowledge-Based Visual Question Answering",
        JOURNAL = PAMI,
        VOLUME = "47",
        YEAR = "2025",
        NUMBER = "8",
        MONTH = aug,
        PAGES = "6797--6808",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235886"}

@inproceedings{bb240970,
        AUTHOR = "Shao, Z.W. and Yu, Z. and Wang, M. and Yu, J.",
        TITLE = "Prompting Large Language Models with Answer Heuristics for Knowledge-Based Visual Question Answering",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "14974--14983",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235887"}

@article{bb240971,
        AUTHOR = "Xu, Z. and Li, Q. and Nie, W.Z. and Wang, W.J. and Liu, A.",
        TITLE = "Structure Causal Models and {LLMs} Integration in Medical Visual Question Answering",
        JOURNAL = MedImg,
        VOLUME = "44",
        YEAR = "2025",
        NUMBER = "8",
        MONTH = aug,
        PAGES = "3476--3489",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235888"}

@article{bb240972,
        AUTHOR = "Jegham, N. and Abdelatti, M. and Hendawi, A.",
        TITLE = "Visual reasoning consistency and robustness analysis of multimodal {LLMs}",
        JOURNAL = PR,
        VOLUME = "172",
        YEAR = "2026",
        PAGES = "112765",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235889"}

@inproceedings{bb240973,
        AUTHOR = "Lu, J. and Srivastava, S. and Chen, J.Y. and Shrestha, R. and Acharya, M. and Kafle, K. and Kanan, C.",
        TITLE = "Revisiting Multi-Modal {LLM} Evaluation",
        BOOKTITLE = "AIBench25",
        YEAR = "2025",
        PAGES = "555--564",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235890"}

@inproceedings{bb240974,
        AUTHOR = "Quan, K.A.C. and Nguyen, Q.N. and Luu, D.T.",
        TITLE = "Toward Automation in Text-Based Video Retrieval with {LLM} Assistance",
        BOOKTITLE = IntVidSea25,
        YEAR = "2025",
        PAGES = "3699--3707",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235891"}

@inproceedings{bb240975,
        AUTHOR = "Kim, Y. and Jung, J.",
        TITLE = "{KOFFVQA}: An Objectively Evaluated Free-Form {VQA} Benchmark for Large Vision-Language Models in the {Korean} Language",
        BOOKTITLE = "AIBench25",
        YEAR = "2025",
        PAGES = "575--585",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235892"}

@inproceedings{bb240976,
        AUTHOR = "Fang, W.L. and Wu, Q. and Chen, J. and Xue, Y.",
        TITLE = "Notes-guided {MLLM} Reasoning: Enhancing {MLLM} with Knowledge and Visual Notes for Visual Question Answering",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "19597--19607",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235893"}

% The method name CL-MoE is brace-protected so sentence-casing styles keep its mixed capitalisation.
@inproceedings{bb240977,
  author    = {Huai, T.Y. and Zhou, J. and Wu, X.J. and Chen, Q. and Bai, Q.C. and Zhou, Z. and He, L.},
  title     = {{CL-MoE}: Enhancing Multimodal Large Language Model with Dual Momentum Mixture-of-Experts for Continual Visual Question Answering},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {19608-19617},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235894},
}

% LSceneLLM and 3D are brace-protected so sentence-casing styles keep their capitals.
@inproceedings{bb240978,
  author    = {Zhi, H.Y. and Chen, P.H. and Li, J. and Ma, S. and Sun, X.Y. and Xiang, T.H. and Lei, Y.J. and Tan, M.K. and Gan, C.},
  title     = {{LSceneLLM}: Enhancing Large {3D} Scene Understanding Using Adaptive Visual Preferences},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {3761-3771},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235895},
}

% The acronym LLMs is brace-protected so sentence-casing styles keep its capitals.
@inproceedings{bb240979,
  author    = {Cocchi, F. and Moratelli, N. and Cornia, M. and Baraldi, L. and Cucchiara, R.},
  title     = {Augmenting Multimodal {LLMs} with Self-Reflective Tokens for Knowledge-based Visual Question Answering},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {9199-9209},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235896},
}

% The acronym LLM is brace-protected so sentence-casing styles keep its capitals.
@inproceedings{bb240980,
  author    = {Yang, Z. and Tao, Z. and Chen, Q. and Li, L. and Qi, Y.K. and van den Hengel, A.J. and Huang, Q.M.},
  title     = {Separation of powers: On segregating knowledge from observation in {LLM}-enabled knowledge-based visual question answering},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {24753-24762},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235897},
}

% LLMs' and SVG are brace-protected so sentence-casing styles keep their capitals.
@inproceedings{bb240981,
  author    = {Cai, M. and Huang, Z.Y. and Li, Y.H. and Ojha, U. and Wang, H.H. and Lee, Y.J.},
  title     = {An Investigation on {LLMs'} Visual Understanding Ability Using {SVG} for Image-Text Bridging},
  booktitle = WACV25,
  year      = {2025},
  pages     = {5377-5386},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235898},
}

% The ampersand is written as \& because a bare one is a LaTeX alignment character and aborts typesetting.
% The acronym QA is brace-protected so sentence-casing styles keep its capitals.
@inproceedings{bb240982,
  author    = {Amoroso, R. and Zhang, G. and Koner, R. and Baraldi, L. and Cucchiara, R. and Tresp, V.},
  title     = {Perceive. Query \& Reason: Enhancing Video {QA} with Question-Guided Temporal Queries},
  booktitle = WACV25,
  year      = {2025},
  pages     = {8853-8862},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235899},
}

@inproceedings{bb240983,
  author    = {Weng, W.X. and Zhang, R. and Meng, X.J. and Zhu, J. and Liu, Q. and Yuan, C.},
  title     = {Unsupervised Domain Adaptive Visual Question Answering in the Era of Multi-Modal Large Language Models},
  booktitle = WACV25,
  year      = {2025},
  pages     = {6248-6258},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235900},
}

% The model name SQ-LLAVA is brace-protected so sentence-casing styles keep its capitals.
@inproceedings{bb240984,
  author    = {Sun, G.H. and Qin, C. and Wang, J.M. and Chen, Z.Y. and Xu, R. and Tao, Z.Q.},
  title     = {{SQ-LLAVA}: Self-questioning for Large Vision-language Assistant},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {IX: 156-172},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235901},
}

% The model name CAT is brace-protected so sentence-casing styles keep its capitals.
@inproceedings{bb240985,
  author    = {Ye, Q. and Yu, Z.T. and Shao, R. and Xie, X.Y. and Torr, P.H.S. and Cao, X.C.},
  title     = {{CAT}: Enhancing Multimodal Large Language Model to Answer Questions in Dynamic Audio-visual Scenarios},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {X: 146-164},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235902},
}

% LLMs and VQA are brace-protected so sentence-casing styles keep their capitals.
@inproceedings{bb240986,
  author    = {Li, Z. and Jasani, B. and Tang, P. and Ghadar, S.},
  title     = {Synthesize Step-by-Step: Tools, Templates and {LLMs} as Data Generators for Reasoning-Based Chart {VQA}},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13613-13623},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235903},
}

@inproceedings{bb240987,
  author    = {Ozdemir, O. and Akagunduz, E.},
  title     = {Enhancing Visual Question Answering through Question-Driven Image Captions as Prompts},
  booktitle = Prompting24,
  year      = {2024},
  pages     = {1562-1571},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235904},
}

% The acronym LLMs is brace-protected so sentence-casing styles keep its capitals.
@inproceedings{bb240988,
  author    = {Ranasinghe, K. and Shukla, S.N. and Poursaeed, O. and Ryoo, M.S. and Lin, T.Y.},
  title     = {Learning to Localize Objects Improves Spatial Reasoning in Visual-{LLMs}},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {12977-12987},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235905},
}

% GRAM and VQA are brace-protected so sentence-casing styles keep their capitals.
@inproceedings{bb240989,
  author    = {Blau, T. and Fogel, S. and Ronen, R. and Golts, A. and Tsiper, S. and Avraham, E.B. and Aberdam, A. and Ganz, R. and Litman, R.},
  title     = {{GRAM}: Global Reasoning for Multi-Page {VQA}},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {15598-15607},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235906},
}

@inproceedings{bb240990,
  author    = {Li, L. and Peng, J.W. and Chen, H. and Gao, C.Y. and Yang, X.},
  title     = {How to Configure Good In-Context Sequence for Visual Question Answering},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {26700-26710},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235907},
}

@inproceedings{bb240991,
  author    = {Agrawal, A. and Lezcano, C.M.S. and Heredia Marin, I.B. and Sethi, P.S.},
  title     = {Listen Then See: Video Alignment with Speaker Attention},
  booktitle = MULA24,
  year      = {2024},
  pages     = {2018-2027},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235908},
}

% The acronym LLM is brace-protected so sentence-casing styles keep its capitals.
@inproceedings{bb240992,
  author    = {Tan, R. and Sun, X. and Hu, P. and Wang, J.H. and Deilamsalehy, H. and Plummer, B.A. and Russell, B. and Saenko, K.},
  title     = {Koala: Key Frame-Conditioned Long Video-{LLM}},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13581-13591},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235909},
}

@inproceedings{bb240993,
  author    = {Ganz, R. and Kittenplon, Y. and Aberdam, A. and Avraham, E.B. and Nuriel, O. and Mazor, S. and Litman, R.},
  title     = {Question Aware Vision Transformer for Multimodal Reasoning},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13861-13871},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235910},
}

% The name VideoCon is brace-protected so sentence-casing styles keep its interior capital.
@inproceedings{bb240994,
  author    = {Bansal, H. and Bitton, Y. and Szpektor, I. and Chang, K.W. and Grover, A.},
  title     = {{VideoCon}: Robust Video-Language Alignment via Contrast Captions},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13927-13937},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235911},
}

% The method name CoG-DQA is brace-protected so sentence-casing styles keep its mixed capitalisation.
@inproceedings{bb240995,
  author    = {Wang, S.W. and Zhang, L.L. and Zhu, L.J. and Qin, T. and Yap, K.H. and Zhang, X.Y. and Liu, J.},
  title     = {{CoG-DQA}: Chain-of-Guiding Learning with Large Language Models for Diagram Question Answering},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13969-13979},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235912},
}

@inproceedings{bb240996,
  author    = {Khan, Z. and BG, V.K. and Schulter, S. and Fu, Y. and Chandraker, M.},
  title     = {Self-Training Large Language Models for Improved Visual Program Synthesis With Visual Reinforcement},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {14344-14353},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235913},
}

@inproceedings{bb240997,
  author    = {Liao, Z. and Li, J.T. and Niu, L. and Zhang, L.Q.},
  title     = {Align and Aggregate: Compositional Reasoning with Video Alignment and Answer Aggregation for Video Question-Answering},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13395-13404},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235914},
}

@inproceedings{bb240998,
  author    = {Pan, J.T. and Lin, Z. and Ge, Y.Y. and Zhu, X.T. and Zhang, R.R. and Wang, Y. and Qiao, Y. and Li, H.S.},
  title     = {Retrieving-to-Answer: Zero-Shot Video Question Answering with Frozen Large Language Models},
  booktitle = MMFM23,
  year      = {2023},
  pages     = {272-283},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235915},
}

@inproceedings{bb240999,
  author    = {Guo, J.X. and Li, J. and Li, D.X. and Tiong, A.M.H. and Li, B.Y. and Tao, D.C. and Hoi, S.},
  title     = {From Images to Textual Prompts: Zero-shot Visual Question Answering with Frozen Large Language Models},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {10867-10877},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqallm5.html#TT235916},
}

Last update: Feb 26, 2026 at 10:58:24