% Entries bb235400 to bb235499: vision-and-language and large-language-model papers.
% BIBSOURCE records the visionbib.com page each entry was taken from.
% JOURNAL and BOOKTITLE values are unquoted string macros (CVPR25, PAMI, ...);
% their definitions are not part of this span and must be loaded with it.
% MONTH uses the standard three-letter macros; page ranges use "--".

@inproceedings{bb235400,
  AUTHOR = "Stefanini, M. and Cornia, M. and Baraldi, L. and Cucchiara, R.",
  TITLE = "A Novel Attention-based Aggregation Function to Combine Vision and Language",
  BOOKTITLE = ICPR21,
  YEAR = "2021",
  PAGES = "1212--1219",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT230375"}

@inproceedings{bb235401,
  AUTHOR = "Zheng, W. B. and Yan, L. and Gou, C. and Wang, F. Y.",
  TITLE = "Webly Supervised Knowledge Embedding Model for Visual Reasoning",
  BOOKTITLE = CVPR20,
  YEAR = "2020",
  PAGES = "12442--12451",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT230376"}

@inproceedings{bb235402,
  AUTHOR = "Nguyen, D. K. and Okatani, T.",
  TITLE = "Multi-Task Learning of Hierarchical Vision-Language Representation",
  BOOKTITLE = CVPR19,
  YEAR = "2019",
  PAGES = "10484--10493",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT230377"}

@inproceedings{bb235403,
  AUTHOR = "Gupta, T. and Shih, K. J. and Singh, S. and Hoiem, D.",
  TITLE = "Aligned Image-Word Representations Improve Inductive Transfer Across Vision-Language Tasks",
  BOOKTITLE = ICCV17,
  YEAR = "2017",
  PAGES = "4223--4232",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT230378"}

@article{bb235404,
  AUTHOR = "Zhao, Z. and Wang, S. and Gu, J. and Zhu, Y. and Mei, L. and Zhuang, Z. X. and Cui, Z. M. and Wang, Q. and Shen, D. G.",
  TITLE = "ChatCAD+: Toward a Universal and Reliable Interactive CAD Using LLMs",
  JOURNAL = MedImg,
  VOLUME = "43",
  YEAR = "2024",
  NUMBER = "11",
  MONTH = nov,
  PAGES = "3755--3766",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230379"}

@article{bb235405,
  AUTHOR = "Luo, H. and Zeng, Y. J. and Yang, L. and Chen, K. and Shen, Z. X. and Lv, F.",
  TITLE = "VLAI: Exploration and Exploitation based on Visual-Language Aligned Information for Robotic Object Goal Navigation",
  JOURNAL = IVC,
  VOLUME = "151",
  YEAR = "2024",
  PAGES = "105259",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230380"}

@article{bb235406,
  AUTHOR = "Mansourian, A. and Oucheikh, R.",
  TITLE = "ChatGeoAI: Enabling Geospatial Analysis for Public through Natural Language, with Large Language Models",
  JOURNAL = IJGI,
  VOLUME = "13",
  YEAR = "2024",
  NUMBER = "10",
  PAGES = "348",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230381"}

@article{bb235407,
  AUTHOR = "Li, D. and Zhao, Y. and Wang, Z. F. and Jung, C. and Zhang, Z.",
  TITLE = "Large Language Model-Driven Structured Output: A Comprehensive Benchmark and Spatial Data Generation Framework",
  JOURNAL = IJGI,
  VOLUME = "13",
  YEAR = "2024",
  NUMBER = "11",
  PAGES = "405",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230382"}

@article{bb235408,
  AUTHOR = "Li, Y. X. and Hu, B. T. and Chen, X. Y. and Ma, L. and Xu, Y. and Zhang, M.",
  TITLE = "LMEye: An Interactive Perception Network for Large Language Models",
  JOURNAL = MultMed,
  VOLUME = "26",
  YEAR = "2024",
  PAGES = "10952--10964",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230383"}

@article{bb235409,
  AUTHOR = "Shao, R. and Zhang, Z. Y. and Tao, C. and Zhang, Y. S. and Peng, C. L. and Li, H. F.",
  TITLE = "Homogeneous tokenizer matters: Homogeneous visual tokenizer for remote sensing image understanding",
  JOURNAL = PandRS,
  VOLUME = "218",
  YEAR = "2024",
  PAGES = "294--310",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230384"}

@article{bb235410,
  AUTHOR = "Wang, Z. H. and Luo, T. and Liu, C. and Liu, W. C. and Goh, R. S. M. and Wong, W. F.",
  TITLE = "Enabling Energy-Efficient Deployment of Large Language Models on Memristor Crossbar: A Synergy of Large and Small",
  JOURNAL = PAMI,
  VOLUME = "47",
  YEAR = "2025",
  NUMBER = "2",
  MONTH = feb,
  PAGES = "916--933",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230385"}

@article{bb235411,
  AUTHOR = "Wang, Z. and Cai, S. F. and Liu, A. and Jin, Y. G. and Hou, J. and Zhang, B. and Lin, H. and He, Z. F. and Zheng, Z. L. and Yang, Y. D. and Ma, X. J. and Liang, Y.",
  TITLE = "JARVIS-1: Open-World Multi-Task Agents With Memory-Augmented Multimodal Language Models",
  JOURNAL = PAMI,
  VOLUME = "47",
  YEAR = "2025",
  NUMBER = "3",
  MONTH = mar,
  PAGES = "1894--1907",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230386"}

@article{bb235412,
  AUTHOR = "Zhan, Y. and Xiong, Z. and Yuan, Y.",
  TITLE = "SkyEyeGPT: Unifying remote sensing vision-language tasks via instruction tuning with large language model",
  JOURNAL = PandRS,
  VOLUME = "221",
  YEAR = "2025",
  PAGES = "64--77",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230387"}

@article{bb235413,
  AUTHOR = "Zhu, Y. and Wen, Z. Y. and Li, X. and Shi, X. F. and Wu, X. and Dong, H. and Chen, J. M.",
  TITLE = "ChatNav: Leveraging LLM to Zero-Shot Semantic Reasoning in Object Navigation",
  JOURNAL = CirSysVideo,
  VOLUME = "35",
  YEAR = "2025",
  NUMBER = "3",
  MONTH = mar,
  PAGES = "2369--2381",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230388"}

@article{bb235414,
  AUTHOR = "Li, Y. X. and Jiang, S. Y. and Hu, B. T. and Wang, L. Y. and Zhong, W. Q. and Luo, W. H. and Ma, L. and Zhang, M.",
  TITLE = "Uni-MoE: Scaling Unified Multimodal LLMs With Mixture of Experts",
  JOURNAL = PAMI,
  VOLUME = "47",
  YEAR = "2025",
  NUMBER = "5",
  MONTH = may,
  PAGES = "3424--3439",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230389"}

@article{bb235415,
  AUTHOR = "Huang, Z. Z. and Zhong, S. S. and Zhou, P. and Gao, S. and Zitnik, M. and Lin, L.",
  TITLE = "A Causality-Aware Paradigm for Evaluating Creativity of Multimodal Large Language Models",
  JOURNAL = PAMI,
  VOLUME = "47",
  YEAR = "2025",
  NUMBER = "5",
  MONTH = may,
  PAGES = "3830--3846",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230390"}

@article{bb235416,
  AUTHOR = "Marasco, E. and Bourlai, T.",
  TITLE = "Enhancing trust in Large Language Models for streamlined decision-making in military operations",
  JOURNAL = IVC,
  VOLUME = "158",
  YEAR = "2025",
  PAGES = "105489",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230391"}

@article{bb235417,
  AUTHOR = "Qiao, D. and Ao, X. and Liu, Y. and Chen, X. T. and Song, F. Y. and Qin, Z. and Jin, W. Q.",
  TITLE = "Tri-AFLLM: Resource-Efficient Adaptive Asynchronous Accelerated Federated LLMs",
  JOURNAL = CirSysVideo,
  VOLUME = "35",
  YEAR = "2025",
  NUMBER = "5",
  MONTH = may,
  PAGES = "4198--4211",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230392"}

@article{bb235418,
  AUTHOR = "Villani, F. and Maljkovic, I. and Lazzaro, D. and Sotgiu, A. and Cina, A. E. and Roli, F.",
  TITLE = "Robust image classification with multi-modal large language models",
  JOURNAL = PRL,
  VOLUME = "194",
  YEAR = "2025",
  PAGES = "1--7",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230393"}

@article{bb235419,
  AUTHOR = "Wang, Q. W. and Li, C. H. and Liu, Y. and Zhu, Q. B. and Song, J. and Shen, T.",
  TITLE = "An Adaptive Framework Embedded With LLM for Knowledge Graph Construction",
  JOURNAL = MultMed,
  VOLUME = "27",
  YEAR = "2025",
  PAGES = "2912--2923",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230394"}

@article{bb235420,
  AUTHOR = "Shao, Z. W. and Yu, Z. and Yu, J. and Ouyang, X. C. and Zheng, L. and Gai, Z. B. and Wang, M. Y. and Kuang, Z. Z. and Ding, J. J.",
  TITLE = "Imp: Highly Capable Large Multimodal Models for Mobile Devices",
  JOURNAL = MultMed,
  VOLUME = "27",
  YEAR = "2025",
  PAGES = "2961--2974",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230395"}

@article{bb235421,
  AUTHOR = "Zhang, Y. X. and Liu, C. B. and Liu, Y. Z. and Gao, Y. F. and Lu, Z. Y. and Xie, H. T. and Zhang, Y. D.",
  TITLE = "Leveraging Concise Concepts With Probabilistic Modeling for Interpretable Visual Recognition",
  JOURNAL = MultMed,
  VOLUME = "27",
  YEAR = "2025",
  PAGES = "3117--3131",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230396"}

@article{bb235422,
  AUTHOR = "Ge, J. and Zhang, X. and Zheng, Y. and Guo, K. and Liang, J.",
  TITLE = "RSTeller: Scaling up visual language modeling in remote sensing with rich linguistic semantics from openly available data and large language models",
  JOURNAL = PandRS,
  VOLUME = "226",
  YEAR = "2025",
  PAGES = "146--163",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230397"}

@article{bb235423,
  AUTHOR = "Li, Z. S. and Muhtar, D. and Gu, F. and He, Y. L. X. and Zhang, X. L. and Xiao, P. F. and He, G. and Zhu, X. X.",
  TITLE = "LHRS-Bot-Nova: Improved multimodal large language model for remote sensing vision-language interpretation",
  JOURNAL = PandRS,
  VOLUME = "227",
  YEAR = "2025",
  PAGES = "539--550",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230398"}

@inproceedings{bb235424,
  AUTHOR = "Muhtar, D. and Li, Z. S. and Gu, F. and Zhang, X. L. and Xiao, P. F.",
  TITLE = "Lhrs-bot: Empowering Remote Sensing with Vgi-enhanced Large Multimodal Language Model",
  BOOKTITLE = ECCV24,
  YEAR = "2024",
  PAGES = "LXXIV: 440--457",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230399"}

@article{bb235425,
  AUTHOR = "Chen, L. F. and Hu, P. and Pan, Z. L. and Liu, Q. and Zhang, S. H. and Liu, Z.",
  TITLE = "Large Language Models Can Achieve Explainable and Training-Free One-Shot HRRP ATR",
  JOURNAL = SPLetters,
  VOLUME = "32",
  YEAR = "2025",
  PAGES = "3395--3399",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230400"}

@article{bb235426,
  AUTHOR = "Li, X. and Zheng, Y. and Chen, H. T. and Chen, X. L. and Liang, Y. X. and Lai, C. H. and Li, B. and Xue, X. Y.",
  TITLE = "Instruction-guided fusion of multi-layer visual features in Large Vision-Language Models",
  JOURNAL = PR,
  VOLUME = "170",
  YEAR = "2026",
  PAGES = "111932",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230401"}

@article{bb235427,
  AUTHOR = "Yang, S. Y. and Yu, W. J. and Yang, W. J. and Liu, X. W. and Tan, H. B. and Lan, L. and Xiao, N.",
  TITLE = "WildVideo: Benchmarking LMMs for Understanding Video-Language Interaction",
  JOURNAL = PAMI,
  VOLUME = "47",
  YEAR = "2025",
  NUMBER = "10",
  MONTH = oct,
  PAGES = "9330--9344",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230402"}

@inproceedings{bb235428,
  AUTHOR = "Mei, G. F. and Lin, W. and Riz, L. and Wu, Y. J. and Poiesi, F. and Wang, Y. M.",
  TITLE = "PerLA: Perceptive 3D language assistant",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "14369--14379",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230403"}

@inproceedings{bb235429,
  AUTHOR = "Hong, W. and Cheng, Y. and Yang, Z. and Luo, Z. Y. and Wu, H. N. and Li, D. X. and Ma, J. and Kankanhalli, M. and Li, J.",
  TITLE = "VideoAutoArena: An Automated Arena for Evaluating Large Multimodal Models in Video Analysis through User Simulation",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "8461--8474",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230404"}

@inproceedings{bb235430,
  AUTHOR = "Han, Y. D. and Guo, Q. and Pan, L. Y. and Liu, L. and Guan, Y. and Yang, M.",
  TITLE = "DynFocus: Dynamic Cooperative Network Empowers LLMs with Video Understanding",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "8512--8522",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230405"}

@inproceedings{bb235431,
  AUTHOR = "Liu, Y. and Liang, Z. Y. and Wang, Y. Z. and Wu, X. F. and Tang, F. L. and He, M. and Li, J. and Liu, Z. and Yang, H. and Lim, S. and Zhao, B.",
  TITLE = "Unveiling the Ignorance of MLLMs: Seeing Clearly, Answering Incorrectly",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "9087--9097",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230406"}

@inproceedings{bb235432,
  AUTHOR = "Wang, Z. T. and Hu, S. M. and Zhao, S. Y. and Lin, X. W. and Juefei Xu, F. and Li, Z. and Han, L. and Subramanyam, H. and Chen, L. and Chen, J. and Jiang, N. and Lyu, L. and Ma, S. Q. and Metaxas, D. N. and Jain, A.",
  TITLE = "MLLM-as-a-Judge for Image Safety without Human Labeling",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "14657--14666",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230407"}

@inproceedings{bb235433,
  AUTHOR = "Tian, J. and Zhang, J. R. and Liu, S. and Xu, L. and Huang, Z. X. and Huang, G.",
  TITLE = "DTOS: Dynamic Time Object Sensing with Large Multimodal Model",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "13810--13820",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230408"}

@inproceedings{bb235434,
  AUTHOR = "Li, M. and Zhong, J. and Chen, T. and Lai, Y. X. and Psounis, K.",
  TITLE = "EEE-Bench: A Comprehensive Multimodal Electrical And Electronics Engineering Benchmark",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "13337--13349",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230409"}

@inproceedings{bb235435,
  AUTHOR = "Liu, Z. H. and Xie, C. W. and Li, P. and Zhao, L. M. and Tang, L. X. and Zheng, Y. and Liu, C. B. and Xie, H. T.",
  TITLE = "Hybrid-Level Instruction Injection for Video Token Compression in Multi-modal Large Language Models",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "8568--8578",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230410"}

@inproceedings{bb235436,
  AUTHOR = "Ma, Y. Y. and Liu, X. C. and Chen, X. K. and Liu, W. and Wu, C. Y. and Wu, Z. Y. and Pan, Z. Z. and Xie, Z. and Zhang, H. and Yu, X. K. and Zhao, L. and Wang, Y. S. and Liu, J. Y. and Ruan, C.",
  TITLE = "JanusFlow: Harmonizing Autoregression and Rectified Flow for Unified Multimodal Understanding and Generation",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "7739--7751",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230411"}

@inproceedings{bb235437,
  AUTHOR = "Zhu, M. and Tian, Y. Z. and Chen, H. and Zhou, C. and Guo, Q. and Liu, Y. and Yang, M. and Shen, C. H.",
  TITLE = "SegAgent: Exploring Pixel Understanding Capabilities in MLLMs by Imitating Human Annotator Trajectories",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "3686--3696",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230412"}

@inproceedings{bb235438,
  AUTHOR = "Zhu, L. and Chen, T. R. and Xu, Q. X. and Liu, X. and Ji, D. and Wu, H. Y. and Soh, D. W. and Liu, J.",
  TITLE = "POPEN: Preference-Based Optimization and Ensemble for LVLM-Based Reasoning Segmentation",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "30231--30240",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230413"}

@inproceedings{bb235439,
  AUTHOR = "Niu, J. and Li, Y. F. and Miao, Z. Y. and Ge, C. J. and Zhou, Y. H. and He, Q. H. and Dong, X. Y. and Duan, H. D. and Ding, S. and Qian, R. and Zhang, P. and Zang, Y. H. and Cao, Y. H. and He, C. H. and Wang, J. Q.",
  TITLE = "OVO-Bench: How Far is Your Video-LLMs from Real-World Online Video Understanding?",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "18902--18913",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230414"}

@inproceedings{bb235440,
  AUTHOR = "Farina, M. and Mancini, M. and Iacca, G. and Ricci, E.",
  TITLE = "Rethinking Few-Shot Adaptation of Vision-Language Models in Two Stages",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "29989--29998",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230415"}

@inproceedings{bb235441,
  AUTHOR = "Xue, X. Y. and Lu, Z. and Huang, D. and Wang, Z. D. and Ouyang, W. L. and Bai, L.",
  TITLE = "ComfyBench: Benchmarking LLM-based Agents in ComfyUI for Autonomously Designing Collaborative AI Systems",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "24614--24624",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230416"}

@inproceedings{bb235442,
  AUTHOR = "Zhao, Z. and Huo, Y. Q. and Yue, T. T. and Guo, L. T. and Lu, H. Y. and Wang, B. N. and Chen, W. P. and Liu, J.",
  TITLE = "Efficient Motion-Aware Video MLLM",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "24159--24168",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230417"}

@inproceedings{bb235443,
  AUTHOR = "Wu, R. H. and Su, W. and Liao, J.",
  TITLE = "Chat2SVG: Vector Graphics Generation with Large Language Models and Image Diffusion Models",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "23690--23700",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230418"}

@inproceedings{bb235444,
  AUTHOR = "Zhang, Z. and Yadav, S. and Han, F. Z. and Shutova, E.",
  TITLE = "Cross-modal Information Flow in Multimodal Large Language Models",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "19781--19791",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230419"}

@inproceedings{bb235445,
  AUTHOR = "Yang, S. and Chen, Y. and Tian, Z. and Wang, C. Y. and Li, J. Y. and Yu, B. and Jia, J. Y.",
  TITLE = "VisionZip: Longer is Better but Not Necessary in Vision Language Models",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "19792--19802",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230420"}

@inproceedings{bb235446,
  AUTHOR = "Xie, J. Y. and Yang, J. T. and Luo, Z. and Cao, Y. and Gao, Q. and Zhang, M. Y. and Hu, W. P.",
  TITLE = "AdaDARE-y: Balancing Stability and Plasticity in Multi-modal LLMs through Efficient Adaptation",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "19758--19768",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230421"}

@inproceedings{bb235447,
  AUTHOR = "Fang, Y. and Jin, B. and Shen, J. C. and Ding, S. and Tan, Q. and Han, J. W.",
  TITLE = "GraphGPT-o: Synergistic Multimodal Comprehension and Generation on Graphs",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "19467--19476",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230422"}

@inproceedings{bb235448,
  AUTHOR = "Tao, K. and Qin, C. and You, H. X. and Sui, Y. and Wang, H.",
  TITLE = "DyCoke: Dynamic Compression of Tokens for Fast Video Large Language Models",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "18992--19001",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230423"}

@inproceedings{bb235449,
  AUTHOR = "Hao, H. R. and Han, J. M. and Li, C. S. and Li, Y. F. and Yue, X. Y.",
  TITLE = "RAP: Retrieval-Augmented Personalization for Multimodal Large Language Models",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "14538--14548",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230424"}

@inproceedings{bb235450,
  AUTHOR = "Tao, C. X. and Su, S. Q. and Zhu, X. Z. and Zhang, C. Y. and Chen, Z. and Liu, J. and Wang, W. H. and Lu, L. W. and Huang, G. and Qiao, Y. and Dai, J. F.",
  TITLE = "HoVLE: Unleashing the Power of Monolithic Vision-Language Models with Holistic Vision-Language Embedding",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "14559--14569",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230425"}

@inproceedings{bb235451,
  AUTHOR = "Tong, B. and Lai, B. and Zhou, Y. and Luo, G. and Shen, Y. H. and Li, K. and Sun, X. S. and Ji, R. R.",
  TITLE = "FlashSloth: Lightning Multimodal Large Language Models via Embedded Visual Compression",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "14570--14581",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230426"}

@inproceedings{bb235452,
  AUTHOR = "Lin, Y. Z. and Li, Y. S. and Chen, D. D. and Xu, W. J. and Clark, R. and Torr, P.",
  TITLE = "Olympus: A Universal Task Router for Computer Vision Tasks",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "14235--14246",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230427"}

@inproceedings{bb235453,
  AUTHOR = "Szot, A. and Mazoure, B. and Attia, O. and Timofeev, A. and Agrawal, H. and Hjelm, D. and Gan, Z. and Kira, Z. and Toshev, A.",
  TITLE = "From Multimodal LLMs to Generalist Embodied Agents: Methods and Lessons",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "10644--10655",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230428"}

@inproceedings{bb235454,
  AUTHOR = "Yin, H. and Si, G. Z. and Wang, Z.",
  TITLE = "Lifting the Veil on Visual Information Flow in MLLMs: Unlocking Pathways to Faster Inference",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "9382--9391",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230429"}

@inproceedings{bb235455,
  AUTHOR = "Gholami, M. and Akbari, M. and Cannons, K. and Zhang, Y.",
  TITLE = "CASP: Compression of Large Multimodal Models Based on Attention Sparsity",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "9372--9381",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230430"}

@inproceedings{bb235456,
  AUTHOR = "Jia, H. R. and Jiang, C. and Xu, H. Y. and Ye, W. and Dong, M. F. and Yan, M. and Zhang, J. and Huang, F. and Zhang, S. K.",
  TITLE = "SymDPO: Boosting In-Context Learning of Large Multimodal Models with Symbol Demonstration Direct Preference Optimization",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "9361--9371",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230431"}

@inproceedings{bb235457,
  AUTHOR = "Alvar, S. R. and Singh, G. and Akbari, M. and Zhang, Y.",
  TITLE = "DivPrune: Diversity-based Visual Token Pruning for Large Multimodal Models",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "9392--9401",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230432"}

@inproceedings{bb235458,
  AUTHOR = "Yang, L. R. and Shen, D. and Cai, C. X. and Chen, K. B. and Yang, F. and Gao, T. T. and Zhang, D. and Li, X.",
  TITLE = "Libra-Merging: Importance-Redundancy and Pruning-Merging Trade-Off for Acceleration Plug-In in Large Vision-Language Model",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "9402--9412",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230433"}

@inproceedings{bb235459,
  AUTHOR = "Zhang, Z. F. and Tang, H. Z. and Sheng, J. W. and Zhang, Z. Y. and Ren, Y. M. and Li, Z. Y. and Yin, D. W. and Ma, D. and Liu, T. W.",
  TITLE = "Debiasing Multimodal Large Language Models via Noise-Aware Preference Optimization",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "9423--9433",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230434"}

@inproceedings{bb235460,
  AUTHOR = "Liang, Y. and Wang, Z. W. and Xu, X. W. and Zhou, J. and Lu, J. W.",
  TITLE = "EfficientLLaVA: Generalizable Auto-Pruning for Large Vision-language Models",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "9445--9454",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230435"}

@inproceedings{bb235461,
  AUTHOR = "Jiao, Q. and Chen, D. and Huang, Y. L. and Ding, B. L. and Li, Y. and Shen, Y.",
  TITLE = "Img-Diff: Contrastive Data Synthesis for Multimodal Large Language Models",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "9296--9307",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230436"}

@inproceedings{bb235462,
  AUTHOR = "Heo, M. and Chen, M. H. and Huang, D. A. and Liu, S. and Radhakrishnan, S. and Kim, S. J. and Wang, Y. C. A. F. and Hachiuma, R.",
  TITLE = "Omni-RGPT: Unifying Image and Video Region-level Understanding via Token Marks",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "3919--3930",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230437"}

@inproceedings{bb235463,
  AUTHOR = "Ouali, Y. and Bulat, A. and Xenos, A. and Zaganidis, A. and Metaxas, I. M. and Martinez, B. and Tzimiropoulos, G.",
  TITLE = "VladVA: Discriminative Fine-tuning of LVLMs",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "4101--4111",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230438"}

@inproceedings{bb235464,
  AUTHOR = "Ye, X. and Gan, Y. and Ge, Y. X. and Zhang, X. P. and Tang, Y. S.",
  TITLE = "ATP-LLaVA: Adaptive Token Pruning for Large Vision Language Models",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "24972--24982",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230439"}

@inproceedings{bb235465,
  AUTHOR = "Schnaus, D. and Araslanov, N. and Cremers, D.",
  TITLE = "It's a (Blind) Match! Towards Vision-Language Correspondence without Parallel Data",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "24983--24992",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230440"}

@inproceedings{bb235466,
  AUTHOR = "Luo, G. and Yang, X. and Dou, W. H. and Wang, Z. K. and Liu, J. W. and Dai, J. F. and Qiao, Y. and Zhu, X. Z.",
  TITLE = "Mono-InternVL: Pushing the Boundaries of Monolithic Multimodal Large Language Models with Endogenous Visual Pre-training",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "24960--24971",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230441"}

@inproceedings{bb235467,
  AUTHOR = "Zhao, Y. Q. and Yin, Y. Y. and Li, L. and Lin, M. and Huang, V. S. J. and Chen, S. W. and Chen, W. P. and Yin, B. and Zhou, Z. and Zhang, W. T.",
  TITLE = "Beyond Sight: Towards Cognitive Alignment in LVLM via Enriched Visual Knowledge",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "24950--24959",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230442"}

@inproceedings{bb235468,
  AUTHOR = "Qi, D. and Zhao, H. and Shi, J. and Jenni, S. and Fan, Y. F. and Dernoncourt, F. and Cohen, S. and Li, S.",
  TITLE = "The Photographer's Eye: Teaching Multimodal Large Language Models to See and Critique like Photographers",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "24807--24816",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230443"}

@inproceedings{bb235469,
  AUTHOR = "Liu, S. and Li, J. N. and Zhao, G. H. and Zhang, Y. J. and Meng, X. and Yu, F. R. and Ji, X. Y. and Li, M.",
  TITLE = "EventGPT: Event Stream Understanding with Multimodal Large Language Models",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "29139--29149",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230444"}

@inproceedings{bb235470,
  AUTHOR = "Zhao, S. Y. and Wang, Z. and Juefei Xu, F. and Xia, X. and Liu, M. and Wang, X. F. and Liang, M. and Zhang, N. and Metaxas, D. N. and Yu, L. C.",
  TITLE = "Accelerating Multimodal Large Language Models by Searching Optimal Vision Token Reduction",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "29869--29879",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230445"}

@inproceedings{bb235471,
  AUTHOR = "Ye, X. and Gan, Y. and Huang, X. and Ge, Y. X. and Tang, Y. S.",
  TITLE = "VoCo-LLaMA: Towards Vision Compression with Large Language Models",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "29836--29846",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230446"}

@inproceedings{bb235472,
  AUTHOR = "Yan, Z. and Li, Z. L. and He, Y. and Wang, C. T. and Li, K. and Li, X. H. and Zeng, X. Y. and Wang, Z. and Wang, Y. and Qiao, Y. and Wang, L. M. and Wang, Y.",
  TITLE = "Task Preference Optimization: Improving Multimodal Large Language Models with Vision Task Alignment",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "29880--29892",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230447"}

@inproceedings{bb235473,
  AUTHOR = "Chen, C. and Zhai, Y. P. and Zhao, Y. F. and Gao, J. Y. and Ding, B. L. and Li, J.",
  TITLE = "Provoking Multi-modal Few-Shot LVLM via Exploration-Exploitation In-Context Learning",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "3826--3835",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230448"}

@inproceedings{bb235474,
  AUTHOR = "Zhang, Y. T. and Lu, H. and Hu, Q. Y. and Wang, Y. and Yuan, K. and Liu, X. and Wu, K.",
  TITLE = "Period-LLM: Extending the Periodic Capability of Multimodal Large Language Model",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "29237--29247",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230449"}

@inproceedings{bb235475,
  AUTHOR = "Hu, Y. and Song, Z. K. and Feng, N. and Luo, Y. and Yu, J. Q. and Chen, Y. P. P. and Yang, W.",
  TITLE = "SF2T: Self-supervised Fragment Finetuning of Video-LLMs for Fine-Grained Understanding",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "29108--29117",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230450"}

@inproceedings{bb235476,
  AUTHOR = "Chen, J. and Zeng, Z. Y. and Lin, Y. Q. and Li, W. and Ma, Z. and Shou, M. Z.",
  TITLE = "Live: Learning Video LLM with Streaming Speech Transcription at Scale",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "29083--29095",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230451"}

@inproceedings{bb235477,
  AUTHOR = "Wang, Z. W. and Chen, W. Z. and Yang, L. and Zhou, S. and Zhao, S. and Zhan, H. and Jin, J. C. and Li, L. C. and Shao, Z. and Bu, J. J.",
  TITLE = "MP-GUI: Modality Perception with MLLMs for GUI Understanding",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "29711--29721",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230452"}

@inproceedings{bb235478,
  AUTHOR = "Vayani, A. and Dissanayake, D. and Watawana, H. and Ahsan, N. and Sasikumar, N. and Thawakar, O.
            and Ademtew, H. B. and Hmaiti, Y. and Kumar, A. and Kuckreja, K. and Maslych, M. and Ghallabi, W. A.
            and Mihaylov, M. and Qin, C. and Shaker, A. M. and Zhang, M. and Ihsani, M. K. and Esplana, A.
            and Gokani, M. and Mirkin, S. and Singh, H. and Srivastava, A. and Hamerlik, E. and Izzati, F. A.
            and Maani, F. A. and Cavada, S. and Chim, J. and Gupta, R. and Manjunath, S. and Zhumakhanova, K.
            and Rabevohitra, F. H. and Amirudin, A. and Ridzuan, M. and Kareem, D. and More, K. and Li, K.
            and Shakya, P. and Saad, M. and Ghasemaghaei, A. and Djanibekov, A. and Azizov, D. and Jankovic, B.
            and Bhatia, N. and Cabrera, A. and Obando Ceron, J. and Otieno, O. and Farestam, F. and Rabbani, M.
            and Baliah, S. and Sanjeev, S. and Shtanchaev, A. and Fatima, M. and Nguyen, T. and Kareem, A.
            and Aremu, T. and Xavier, N. and Bhatkal, A. and Toyin, H. and Chadha, A. and Cholakkal, H.
            and Anwer, R. M. and Felsberg, M. and Laaksonen, J. and Solorio, T. and Choudhury, M. and Laptev, I.
            and Shah, M. and Khan, S. and Khan, F. S.",
  TITLE = "All Languages Matter: Evaluating LMMs on Culturally Diverse 100 Languages",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "19565--19575",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230453"}

@inproceedings{bb235479,
  AUTHOR = "Cao, A. and Wei, X. and Ma, Z. H.",
  TITLE = "FLAME: Frozen Large Language Models Enable Data-Efficient Language-Image Pre-training",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "4080--4090",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230454"}

@inproceedings{bb235480,
  AUTHOR = "Bi, J. and Guo, J. J. and Tang, Y. L. and Wen, L. G. B. and Liu, Z. and Wang, B. J. and Xu, C. L.",
  TITLE = "Unveiling Visual Perception in Language Models: An Attention Head Analysis Approach",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "4135--4144",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230455"}

@inproceedings{bb235481,
  AUTHOR = "Li, S. and Hu, Y. C. and Ning, X. F. and Liu, X. H. and Hong, K. and Jia, X. T. and Li, X. and Yan, Y. Q. and Ran, P. and Dai, G. H. and Yan, S. and Yang, H. Z. and Wang, Y.",
  TITLE = "MBQ: Modality-Balanced Quantization for Large Vision-Language Models",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "4167--4177",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230456"}

@inproceedings{bb235482,
  AUTHOR = "Lin, J. and Chen, H. R. and Fan, Y. and Fan, Y. Q. and Jin, X. and Su, H. and Fu, J. and Shen, X. Y.",
  TITLE = "Multi-Layer Visual Feature Fusion in Multimodal LLMs: Methods, Analysis, and Best Practices",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "4156--4166",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230457"}

@inproceedings{bb235483,
  AUTHOR = "Zhao, Q. Q. and Lu, Y. and Kim, M. J. and Fu, Z. and Zhang, Z. Y. and Wu, Y. and Li, Z. and Ma, Q. L. and Han, S. and Finn, C. and Handa, A. and Lin, T. Y. and Wetzstein, G. and Liu, M. Y. and Xiang, D. L.",
  TITLE = "CoT-VLA: Visual Chain-of-Thought Reasoning for Vision-Language-Action Models",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "1702--1713",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230458"}

@inproceedings{bb235484,
  AUTHOR = "Liu, Z. and Li, Y. Q. and Nguyen, K. D. and Zhong, Y. and Li, Y.",
  TITLE = "PAVE: Patching and Adapting Video Large Language Models",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "3306--3317",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230459"}

@inproceedings{bb235485,
  AUTHOR = "Lu, X. D. and Chen, Y. H. and Chen, C. and Tan, H. and Chen, B. and Xie, Y. and Hu, R. and Tan, G. X. and Wu, R. S. and Hu, Y. and Zeng, Y. and Wu, L. and Bian, L. Y. and Wang, Z. X. and Liu, L. and Yang, Y. Z. and Xiao, H. and Zhou, A. and Wen, Y. F. and Chen, X. X. and Ren, S. and Li, H. S.",
  TITLE = "BlueLM-V-3B: Algorithm and System Co-Design for Multimodal Large Language Models on Mobile Devices",
  BOOKTITLE = CVPR25,
  YEAR = "2025",
  PAGES = "4145--4155",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230460"}

@inproceedings{bb235486,
  AUTHOR = "Malakouti, S. and Aghazadeh, A. and Khandelwal, A. and Kovashka, A.",
  TITLE = "Benchmarking VLMs' Reasoning About Persuasive Atypical Images",
  BOOKTITLE = WACV25,
  YEAR = "2025",
  PAGES = "4788--4798",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230461"}

@inproceedings{bb235487,
  AUTHOR = "Lee, H. and Seo, G. and Choi, W. and Jung, G. and Song, K. and Jung, J. Y.",
  TITLE = "Enhancing Visual Classification Using Comparative Descriptors",
  BOOKTITLE = WACV25,
  YEAR = "2025",
  PAGES = "5274--5283",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230462"}

@inproceedings{bb235488,
  AUTHOR = "Ee, Y. K. and Zhang, H. and Matyasko, A. and Fernando, B.",
  TITLE = "Deduce and Select Evidences with Language Models for Training-Free Video Goal Inference",
  BOOKTITLE = WACV25,
  YEAR = "2025",
  PAGES = "5937--5947",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230463"}

@inproceedings{bb235489,
  AUTHOR = "Fu, R. and Liu, J. Y. and Chen, X. and Nie, Y. X. and Xiong, W. H.",
  TITLE = "Scene-LLM: Extending Language Model for 3D Visual Reasoning",
  BOOKTITLE = WACV25,
  YEAR = "2025",
  PAGES = "2195--2206",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230464"}

@inproceedings{bb235490,
  AUTHOR = "Awais, M. and Alharthi, A. H. S. A. and Kumar, A. and Cholakkal, H. and Anwer, R. M.",
  TITLE = "AgroGPT: Efficient Agricultural Vision-Language Model with Expert Tuning",
  BOOKTITLE = WACV25,
  YEAR = "2025",
  PAGES = "5687--5696",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230465"}

@inproceedings{bb235491,
  AUTHOR = "Chen, S. and Han, Z. and He, B. and Liu, J. Z. and Buckley, M. and Qin, Y. and Torr, P. and Tresp, V. and Gu, J. D.",
  TITLE = "Can Multimodal Large Language Models Truly Perform Multimodal In-Context Learning?",
  BOOKTITLE = WACV25,
  YEAR = "2025",
  PAGES = "6000--6010",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230466"}

@inproceedings{bb235492,
  AUTHOR = "Kruzhkov, E. and Behnke, S.",
  TITLE = "LiLMaps: Learnable Implicit Language Maps",
  BOOKTITLE = WACV25,
  YEAR = "2025",
  PAGES = "7711--7720",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230467"}

@inproceedings{bb235493,
  AUTHOR = "Singh, C. K. and Kumar, D. and Sanap, V. and Sinha, R.",
  TITLE = "LLM-RSPF: Large Language Model-Based Robotic System Planning Framework for Domain Specific Use-cases",
  BOOKTITLE = WACV25,
  YEAR = "2025",
  PAGES = "7277--7286",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230468"}

@inproceedings{bb235494,
  AUTHOR = "Wang, C. Y. and Luo, W. X. and Dong, S. and Xuan, X. H. and Li, Z. X. and Ma, L. and Gao, S. H.",
  TITLE = "MLLM-Tool: A Multimodal Large Language Model for Tool Agent Learning",
  BOOKTITLE = WACV25,
  YEAR = "2025",
  PAGES = "6678--6687",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230469"}

@inproceedings{bb235495,
  AUTHOR = "Sun, L. and Ahuja, C. and Chen, P. and D'Zmura, M. and Batmanghelich, K. and Bontrager, P.",
  TITLE = "Multi-Modal Large Language Models are Effective Vision Learners",
  BOOKTITLE = WACV25,
  YEAR = "2025",
  PAGES = "8617--8626",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230470"}

@inproceedings{bb235496,
  AUTHOR = "Tateno, M. and Yagi, T. and Furuta, R. and Sato, Y.",
  TITLE = "Learning Multiple Object States from Actions via Large Language Models",
  BOOKTITLE = WACV25,
  YEAR = "2025",
  PAGES = "9555--9565",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230471"}

@inproceedings{bb235497,
  AUTHOR = "Bahadir, C. D. and Akar, G. B. and Sabuncu, M. R.",
  TITLE = "LLM-Generated Rewrite and Context Modulation for Enhanced Vision Language Models in Digital Pathology",
  BOOKTITLE = WACV25,
  YEAR = "2025",
  PAGES = "327--336",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230472"}

@inproceedings{bb235498,
  AUTHOR = "Chu, X. X. and Su, J. L. and Zhang, B. and Shen, C. H.",
  TITLE = "VisionLLaMA: A Unified LLaMA Backbone for Vision Tasks",
  BOOKTITLE = ECCV24,
  YEAR = "2024",
  PAGES = "LXVI: 1--18",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230473"}

@inproceedings{bb235499,
  AUTHOR = "Long, F. C. and Qiu, Z. F. and Yao, T. and Mei, T.",
  TITLE = "VideoStudio: Generating Consistent-content and Multi-scene Videos",
  BOOKTITLE = ECCV24,
  YEAR = "2024",
  PAGES = "LX: 468--485",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230474"}