@inproceedings{bb235900,
  author    = {Shi, B.F. and Wu, Z.Y. and Mao, M.L. and Wang, X. and Darrell, T.J.},
  title     = {When Do We Not Need Larger Vision Models?},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {VIII: 444-462},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230869},
}
@inproceedings{bb235901,
  author    = {Yu, Q.H. and Shen, X.H. and Chen, L.C.},
  title     = {Towards Open-ended Visual Recognition with Large Language Models},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {XIV: 359-376},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230870},
}
@inproceedings{bb235902,
  author    = {Huang, K. and Zou, H. and Xi, Y. and Wang, B.C. and Xie, Z. and Yu, L.},
  title     = {IVTP: Instruction-guided Visual Token Pruning for Large Vision-language
Models},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {XVII: 214-230},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230871},
}
@inproceedings{bb235903,
  author    = {Liu, H.T. and Li, C.Y. and Li, Y.H. and Lee, Y.J.},
  title     = {Improved Baselines with Visual Instruction Tuning},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {26286-26296},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230872},
}
@inproceedings{bb235904,
  author    = {Schiappa, M. and Abdullah, R. and Azad, S. and Claypoole, J. and Cogswell, M. and Divakaran, A. and Rawat, Y.},
  title     = {Probing Conceptual Understanding of Large Visual-Language Models},
  booktitle = WhatNext24,
  year      = {2024},
  pages     = {1797-1807},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230873},
}
@inproceedings{bb235905,
AUTHOR = "Yue, T.T. and Cheng, J. and Guo, L.T. and Dai, X.Y. and Zhao, Z. and He, X.J. and Xiong, G. and Lv, Y.S. and Liu, J.",
TITLE = "SC-Tune: Unleashing Self-Consistent Referential Comprehension in
Large Vision Language Models",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "13073-13083",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230874"}
@inproceedings{bb235906,
  author    = {Wu, T.H. and Lian, L. and Gonzalez, J.E. and Li, B. and Darrell, T.J.},
  title     = {Self-Correcting LLM-Controlled Diffusion Models},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {6327-6336},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230875},
}
@inproceedings{bb235907,
  author    = {Zheng, D. and Huang, S. and Zhao, L. and Zhong, Y. and Wang, L.W.},
  title     = {Towards Learning a Generalist Model for Embodied Navigation},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13624-13634},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230876},
}
@inproceedings{bb235908,
  author    = {Singh, S. and Fore, M. and Stamoulis, D.},
  title     = {GeoLLM-Engine: A Realistic Environment for Building Geospatial
Copilots},
  booktitle = EarthVision24,
  year      = {2024},
  pages     = {585-594},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230877},
}
@inproceedings{bb235909,
  author    = {Zhang, Y.C. and Qian, S.J. and Peng, B. and Liu, S. and Jia, J.Y.},
  title     = {Prompt Highlighter: Interactive Control for Multi-Modal LLMs},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13215-13224},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230878},
}
@inproceedings{bb235910,
  author    = {Wang, D.K. and Xuan, S.Y. and Zhang, S.L.},
  title     = {LocLLM: Exploiting Generalizable Human Keypoint Localization via
Large Language Model},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {614-623},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230879},
}
@inproceedings{bb235911,
  author    = {Liu, H.C. and Zhan, X.H. and Huang, S.L. and Mu, T.J. and Shan, Y.},
  title     = {Programmable Motion Generation for Open-Set Motion Control Tasks},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {1399-1408},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230880},
}
@inproceedings{bb235912,
  author    = {Zhao, L. and Yang, Y. and Zhang, K. and Shao, W.Q. and Zhang, Y.X. and Qiao, Y. and Luo, P. and Ji, R.R.},
  title     = {DiffAgent: Fast and Accurate Text-to-Image API Selection with Large
Language Model},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {6390-6399},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230881},
}
@inproceedings{bb235913,
  author    = {Yao, J. and Liu, Y.J. and Dong, Z. and Guo, M.F. and Hu, H. and Keutzer, K. and Du, L. and Zhou, D. and Zhang, S.H.},
  title     = {PromptCoT: Align Prompt Distribution via Adapted Chain-of-Thought},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {7027-7037},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230882},
}
@inproceedings{bb235914,
  author    = {Cai, Z.P. and Mueller, M. and Birkl, R. and Wofk, D. and Tseng, S.Y. and Cheng, J. and Stan, G.B.M. and Lai, V. and Paulitsch, M.},
  title     = {L-MAGIC: Language Model Assisted Generation of Images with Coherence},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {7049-7058},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230883},
}
@inproceedings{bb235915,
  author    = {Li, Y. and Liu, X. and Kag, A. and Hu, J. and Idelbayev, Y. and Sagar, D. and Wang, Y.Z. and Tulyakov, S. and Ren, J.},
  title     = {TextCraftor: Your Text Encoder can be Image Quality Controller},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {7985-7995},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230884},
}
@inproceedings{bb235916,
  author    = {Argaw, D.M. and Yoon, S.H. and Heilbron, F.C. and Deilamsalehy, H. and Bui, T. and Wang, Z.W. and Dernoncourt, F. and Chung, J.S.},
  title     = {Scaling Up Video Summarization Pretraining with Large Language Models},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {8332-8341},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230885},
}
@inproceedings{bb235917,
  author    = {Lai, X. and Tian, Z. and Chen, Y.K. and Li, Y.W. and Yuan, Y.H. and Liu, S. and Jia, J.Y.},
  title     = {LISA: Reasoning Segmentation via Large Language Model},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {9579-9589},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230886},
}
@inproceedings{bb235918,
  author    = {Shang, C.M. and Zhou, S. and Zhang, H.Y. and Ni, X.Z. and Yang, Y. and Wang, Y.W.},
  title     = {Incremental Residual Concept Bottleneck Models},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {11030-11040},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230887},
}
@inproceedings{bb235919,
  author    = {Xie, Y.T. and Chen, Q. and Wang, S. and To, M.S. and Lee, I. and Khoo, E.W. and Hendy, K. and Koh, D. and Xia, Y. and Wu, Q.},
  title     = {PairAug: What Can Augmented Image-Text Pairs Do for Radiology?},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {11652-11661},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230888},
}
@inproceedings{bb235920,
  author    = {Dong, Z.K. and Liu, X.L. and Chen, B. and Polak, P. and Zhang, P.},
  title     = {MuseChat: A Conversational Music Recommendation System for Videos},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {12775-12785},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230889},
}
@inproceedings{bb235921,
AUTHOR = "Li, F. and Jiang, Q. and Zhang, H. and Ren, T. and Liu, S.L. and Zou, X. and Xu, H.Z. and Li, H.Y. and Yang, J.W. and Li, C.Y. and Zhang, L. and Gao, J.F.",
TITLE = "Visual In-Context Prompting",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "12861-12871",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230890"}
@inproceedings{bb235922,
  author    = {Sachdeva, R. and Zisserman, A.},
  title     = {The Manga Whisperer: Automatically Generating Transcriptions for
Comics},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {12967-12976},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230891},
}
@inproceedings{bb235923,
  author    = {Zhong, S.S. and Huang, Z.Z. and Gao, S. and Wen, W. and Lin, L. and Zitnik, M. and Zhou, P.},
  title     = {Let's Think Outside the Box: Exploring Leap-of-Thought in Large
Language Models with Creative Humor Generation},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13246-13257},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230892},
}
@inproceedings{bb235924,
  author    = {Gao, Z. and Du, Y.T. and Zhang, X.T. and Ma, X.J. and Han, W.J. and Zhu, S.C. and Li, Q.},
  title     = {CLOVA: A Closed-LOop Visual Assistant with Tool Usage and Update},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13258-13268},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230893},
}
@inproceedings{bb235925,
  author    = {Buettner, K. and Malakouti, S. and Li, X.L. and Kovashka, A.},
  title     = {Incorporating Geo-Diverse Knowledge into Prompting for Increased
Geographical Robustness in Object Recognition},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13515-13524},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230894},
}
@inproceedings{bb235926,
  author    = {Liu, R. and Li, C. and Ge, Y.X. and Li, T.H. and Shan, Y. and Li, G.},
  title     = {BT-Adapter: Video Conversation is Feasible Without Video Instruction
Tuning},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13658-13667},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230895},
}
@inproceedings{bb235927,
AUTHOR = "Li, J.X. and Vo, D.M. and Sugimoto, A. and Nakayama, H.",
TITLE = "EVCap: Retrieval-Augmented Image Captioning with External Visual-Name
Memory for Open-World Comprehension",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "13733-13742",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230896"}
@inproceedings{bb235928,
  author    = {Song, L. and Chen, Y.K. and Yang, S. and Ding, X.H. and Ge, Y.X. and Chen, Y.C. and Shan, Y.},
  title     = {Low-Rank Approximation for Sparse Attention in Multi-Modal LLMs},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13763-13773},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230897},
}
@inproceedings{bb235929,
  author    = {Guo, Q. and de Mello, S. and Yin, H.X. and Byeon, W. and Cheung, K.C. and Yu, Y.Z. and Luo, P. and Liu, S.},
  title     = {RegionGPT: Towards Region Understanding Vision Language Model},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13796-13806},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230898},
}
@inproceedings{bb235930,
  author    = {Yu, T.Y. and Yao, Y. and Zhang, H.Y. and He, T. and Han, Y.F. and Cui, G. and Hu, J.Y. and Liu, Z.Y. and Zheng, H.T. and Sun, M.},
  title     = {RLHF-V: Towards Trustworthy MLLMs via Behavior Alignment from
Fine-Grained Correctional Human Feedback},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13807-13816},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230899},
}
@inproceedings{bb235931,
  author    = {Xuan, S.Y. and Guo, Q. and Yang, M. and Zhang, S.L.},
  title     = {Pink: Unveiling the Power of Referential Comprehension for
Multi-modal LLMs},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13838-13848},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230900},
}
@inproceedings{bb235932,
  author    = {Yu, Q. and Sun, Q. and Zhang, X.S. and Cui, Y.F. and Zhang, F. and Cao, Y. and Wang, X.L. and Liu, J.J.},
  title     = {CapsFusion: Rethinking Image-Text Data at Scale},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {14022-14032},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230901},
}
@inproceedings{bb235933,
  author    = {Yao, J.W. and Qian, Q. and Hu, J.},
  title     = {Multi-Modal Proxy Learning Towards Personalized Visual Multiple
Clustering},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {14066-14075},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230902},
}
@inproceedings{bb235934,
  author    = {Zou, B. and Yang, C. and Qiao, Y. and Quan, C.B. and Zhao, Y.J.},
  title     = {LLaMA-Excitor: General Instruction Tuning via Indirect Feature
Interaction},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {14089-14099},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230903},
}
@inproceedings{bb235935,
  author    = {Hong, W. and Wang, W.H. and Lv, Q.S. and Xu, J.Z. and Yu, W. and Ji, J.H. and Wang, Y. and Wang, Z. and Dong, Y.X. and Ding, M. and Tang, J.},
  title     = {CogAgent: A Visual Language Model for GUI Agents},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {14281-14290},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230904},
}
@inproceedings{bb235936,
  author    = {Luo, C. and Shen, Y.F. and Zhu, Z.Q. and Zheng, Q. and Yu, Z. and Yao, C.},
  title     = {LayoutLLM: Layout Instruction Tuning with Large Language Models for
Document Understanding},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {15630-15640},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230905},
}
@inproceedings{bb235937,
AUTHOR = "Yang, Y. and Sun, F.Y. and Weihs, L. and Vanderbilt, E. and Herrasti, A. and Han, W. and Wu, J.J. and Haber, N. and Krishna, R. and Liu, L.J. and Callison-Burch, C. and Yatskar, M. and Kembhavi, A. and Clark, C.",
TITLE = "Holodeck: Language Guided Generation of 3D Embodied AI Environments",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "16277-16287",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230906"}
@inproceedings{bb235938,
  author    = {Qin, Y.R. and Zhou, E. and Liu, Q. and Yin, Z.F. and Sheng, L. and Zhang, R.M. and Qiao, Y. and Shao, J.},
  title     = {MP5: A Multi-modal Open-ended Embodied System in Minecraft via Active
Perception},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {16307-16316},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230907},
}
@inproceedings{bb235939,
  author    = {Zhang, S. and Yu, X.Y. and Song, X.H. and Wang, X.H. and Jiang, S.Q.},
  title     = {Imagine Before Go: Self-Supervised Generative Map for Object Goal
Navigation},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {16414-16425},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230908},
}
@inproceedings{bb235940,
  author    = {Li, H. and Yang, X. and Wang, Z.K. and Zhu, X.Z. and Zhou, J. and Qiao, Y. and Wang, X.G. and Li, H.S. and Lu, L.W. and Dai, J.F.},
  title     = {Auto MC-Reward: Automated Dense Reward Design with Large Language
Models for Minecraft},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {16426-16435},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230909},
}
@inproceedings{bb235941,
  author    = {Liu, M.X. and Hayes, T.L. and Ricci, E. and Csurka, G. and Volpi, R.},
  title     = {SHiNe: Semantic Hierarchy Nexus for Open-Vocabulary Object Detection},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {16634-16644},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230910},
}
@inproceedings{bb235942,
  author    = {Kim, J. and Cho, E. and Kim, S. and Kim, H.W.J.},
  title     = {Retrieval-Augmented Open-Vocabulary Object Detection},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {17427-17436},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230911},
}
@inproceedings{bb235943,
  author    = {Saha, O. and van Horn, G. and Maji, S.},
  title     = {Improved Zero-Shot Classification by Adapting VLMs with Text
Descriptions},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {17542-17552},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230912},
}
@inproceedings{bb235944,
  author    = {Toubal, I.E. and Avinash, A. and Alldrin, N.G. and Dlabal, J. and Zhou, W. and Luo, E. and Stretcu, O. and Xiong, H. and Lu, C.T. and Zhou, H. and Krishna, R. and Fuxman, A. and Duerig, T.},
  title     = {Modeling Collaborator: Enabling Subjective Vision Classification with
Minimal Human Effort via LLM Tool-Use},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {17553-17563},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230913},
}
@inproceedings{bb235945,
  author    = {Han, T. and Bain, M. and Nagrani, A. and Varol, G. and Xie, W. and Zisserman, A.},
  title     = {AutoAD III: The Prequel: Back to the Pixels},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {18164-18174},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230914},
}
@inproceedings{bb235946,
  author    = {Qu, H.X. and Cai, Y.J. and Liu, J.},
  title     = {LLMs are Good Action Recognizers},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {18395-18406},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230915},
}
@inproceedings{bb235947,
  author    = {Chen, J. and Lv, Z.Y. and Wu, S.W. and Lin, K.Q. and Song, C. and Gao, D.F. and Liu, J.W. and Gao, Z.T. and Mao, D.X. and Shou, M.Z.},
  title     = {VideoLLM-online: Online Video Large Language Model for Streaming
Video},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {18407-18418},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230916},
}
@inproceedings{bb235948,
  author    = {Zhu, A. and Ke, Q.H. and Gong, M.M. and Bailey, J.},
  title     = {Part-Aware Unified Representation of Language and Skeleton for
Zero-Shot Action Recognition},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {18761-18770},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230917},
}
@inproceedings{bb235949,
  author    = {Chen, T.J. and Yu, H.S. and Yang, Z.G. and Li, Z.C. and Sun, W. and Chen, C.},
  title     = {OST: Refining Text Knowledge with Optimal Spatio-Temporal Descriptor
for General Video Recognition},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {18888-18898},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230918},
}
@inproceedings{bb235950,
  author    = {Zhao, Q.H. and Dai, Y. and Li, H. and Hu, W. and Zhang, F. and Liu, J.},
  title     = {LTGC: Long-Tail Recognition via Leveraging LLMs-Driven Generated
Content},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {19510-19520},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230919},
}
@inproceedings{bb235951,
AUTHOR = "Siddiqui, Y. and Alliegro, A. and Artemov, A. and Tommasi, T. and Sirigatti, D. and Rosov, V. and Dai, A. and Nie{\ss}ner, M.",
TITLE = "MeshGPT: Generating Triangle Meshes with Decoder-Only Transformers",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "19615-19625",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230920"}
@inproceedings{bb235952,
  author    = {Li, Z. and Gao, Z.Y. and Tan, C. and Ren, B. and Yang, L.T. and Li, S.Z.},
  title     = {General Point Model Pretraining with Autoencoding and Autoregressive},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {20954-20964},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230921},
}
@inproceedings{bb235953,
AUTHOR = "Li, K.C. and Wang, Y. and He, Y. and Li, Y.Z. and Wang, Y. and Liu, Y. and Wang, Z. and Xu, J. and Chen, G. and Luo, P. and Wang, L.M. and Qiao, Y.",
TITLE = "MVBench: A Comprehensive Multi-modal Video Understanding Benchmark",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "22195-22206",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230922"}
@inproceedings{bb235954,
AUTHOR = "Dunlap, L. and Zhang, Y.H. and Wang, X.H. and Zhong, R.Q. and Darrell, T.J. and Steinhardt, J. and Gonzalez, J.E. and Yeung-Levy, S.",
TITLE = "Describing Differences in Image Sets with Natural Language",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "24199-24208",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230923"}
@inproceedings{bb235955,
  author    = {Ishmam, A.M. and Thomas, C.},
  title     = {Semantic Shield: Defending Vision-Language Models Against Backdooring
and Poisoning via Fine-Grained Knowledge Alignment},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {24820-24830},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230924},
}
@inproceedings{bb235956,
  author    = {Yang, Y.J. and Zhou, T.Y. and Li, K. and Tao, D.P. and Li, L. and Shen, L. and He, X.D. and Jiang, J. and Shi, Y.H.},
  title     = {Embodied Multi-Modal Agent trained by an LLM from a Parallel
TextWorld},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {26265-26275},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230925},
}
@inproceedings{bb235957,
  author    = {Hong, Y. and Zheng, Z. and Chen, P.H. and Wang, Y.F. and Li, J. and Gan, C.},
  title     = {MultiPLY: A Multisensory Object-Centric Embodied Large Language Model
in 3D World},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {26396-26406},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230926},
}
@inproceedings{bb235958,
  author    = {Han, J.M. and Gong, K.X. and Zhang, Y.Y. and Wang, J.Q. and Zhang, K. and Lin, D. and Qiao, Y. and Gao, P. and Yue, X.Y.},
  title     = {OneLLM: One Framework to Align All Modalities with Language},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {26574-26585},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230927},
}
@inproceedings{bb235959,
  author    = {Xie, H.X. and Peng, C.J. and Tseng, Y.W. and Chen, H.J. and Hsu, C.F. and Shuai, H.H. and Cheng, W.H.},
  title     = {EmoVIT: Revolutionizing Emotion Insights with Visual Instruction
Tuning},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {26586-26595},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230928},
}
@inproceedings{bb235960,
  author    = {Wang, X.Y. and Zhuang, B. and Wu, Q.},
  title     = {ModaVerse: Efficiently Transforming Modalities with LLMs},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {26596-26606},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230929},
}
@inproceedings{bb235961,
  author    = {Lin, J. and Yin, H.X. and Ping, W. and Molchanov, P. and Shoeybi, M. and Han, S.},
  title     = {VILA: On Pre-training for Visual Language Models},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {26679-26689},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230930},
}
@inproceedings{bb235962,
  author    = {Lyu, Y.H. and Zheng, X. and Zhou, J.Z. and Wang, L.},
  title     = {UniBind: LLM-Augmented Unified and Balanced Representation Space to
Bind Them All},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {26742-26752},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230931},
}
@inproceedings{bb235963,
  author    = {Zhu, L. and Wei, F. and Lu, Y.},
  title     = {Beyond Text: Frozen Large Language Models in Visual Signal
Comprehension},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {27037-27047},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230932},
}
@inproceedings{bb235964,
  author    = {Tang, Z. and Yang, Z. and Khademi, M. and Liu, Y. and Zhu, C.G. and Bansal, M.},
  title     = {CoDi-2: In-Context, Interleaved, and Interactive Any-to-Any
Generation},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {27415-27424},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230933},
}
@inproceedings{bb235965,
  author    = {Yuan, Y.Q. and Li, W. and Liu, J. and Tang, D.Q. and Luo, X.J. and Qin, C. and Zhang, L. and Zhu, J.},
  title     = {Osprey: Pixel Understanding with Visual Instruction Tuning},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {28202-28211},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230934},
}
@inproceedings{bb235966,
  author    = {Zheng, Z.H. and Wei, J. and Hu, X.F. and Zhu, H.D. and Nevatia, R.},
  title     = {Large Language Models are Good Prompt Learners for Low-Shot Image
Classification},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {28453-28462},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230935},
}
@inproceedings{bb235967,
  author    = {He, H.Y. and Pan, Z.Z. and Liu, J. and Cai, J.F. and Zhuang, B.},
  title     = {Efficient Stitchable Task Adaptation},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {28555-28565},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230936},
}
@inproceedings{bb235968,
  author    = {Tian, X.Y. and Zou, S. and Yang, Z.Y. and Zhang, J.},
  title     = {ArGue: Attribute-Guided Prompt Tuning for Vision-Language Models},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {28578-28587},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230937},
}
@inproceedings{bb235969,
  author    = {Lv, J.X. and Huang, Y. and Yan, M. and Huang, J.C. and Liu, J.Z. and Liu, Y.F. and Wen, Y.F. and Chen, X.X. and Chen, S.F.},
  title     = {GPT4Motion: Scripting Physical Motions in Text-to-Video Generation
via Blender-Oriented GPT Planning},
  booktitle = PBDL24,
  year      = {2024},
  pages     = {1430-1440},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230938},
}
@inproceedings{bb235970,
  author    = {Wang, J.C. and Ke, L.},
  title     = {LLM-Seg: Bridging Image Segmentation and Large Language Model
Reasoning},
  booktitle = WhatNext24,
  year      = {2024},
  pages     = {1765-1774},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230939},
}
@inproceedings{bb235971,
  author    = {Hakim, Z.I.A. and Sarker, N.H. and Singh, R.P. and Paul, B. and Dabouei, A. and Xu, M.},
  title     = {Leveraging Generative Language Models for Weakly Supervised Sentence
Component Analysis in Video-Language Joint Learning},
  booktitle = MULA24,
  year      = {2024},
  pages     = {1975-1985},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230940},
}
@inproceedings{bb235972,
  author    = {Deria, A. and Kumar, K. and Chakraborty, S. and Mahapatra, D. and Roy, S.},
  title     = {InVERGe: Intelligent Visual Encoder for Bridging Modalities in Report
Generation},
  booktitle = MULA24,
  year      = {2024},
  pages     = {2028-2038},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230941},
}
@inproceedings{bb235973,
  author    = {Arefeen, M.A. and Debnath, B. and Uddin, M.Y.S. and Chakradhar, S.},
  title     = {ViTA: An Efficient Video-to-Text Algorithm using VLM for RAG-based
Video Analysis System},
  booktitle = Reasoning24,
  year      = {2024},
  pages     = {2266-2274},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230942},
}
@inproceedings{bb235974,
  author    = {Chen, Y.W. and Chu, S.Y.},
  title     = {Large Language Models in Wargaming: Methodology, Application, and
Robustness},
  booktitle = AML24,
  year      = {2024},
  pages     = {2894-2903},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230943},
}
@inproceedings{bb235975,
  author    = {Lai, Z.X. and Wu, J. and Chen, S. and Zhou, Y.C. and Hovakimyan, N.},
  title     = {Residual-based Language Models are Free Boosters for Biomedical
Imaging Tasks},
  booktitle = DEF-AI-MIA24,
  year      = {2024},
  pages     = {5086-5096},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230944},
}
@inproceedings{bb235976,
  author    = {Fang, X. and Wang, W.G. and Lv, X.X. and Yan, J.},
  title     = {PCQA: A Strong Baseline for AIGC Quality Assessment Based on Prompt
Condition},
  booktitle = NTIRE24,
  year      = {2024},
  pages     = {6167-6176},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230945},
}
@inproceedings{bb235977,
  author    = {Ye, Z. and Liu, J.X. and Cao, J.J. and Chen, Z.Y. and Xuan, Z.W. and Zhou, M.Y. and Liu, Q. and Qi, G.J.},
  title     = {OpenStory: A Large-Scale Open-Domain Dataset for Subject-Driven
Visual Storytelling},
  booktitle = VDU24,
  year      = {2024},
  pages     = {7953-7962},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230946},
}
@inproceedings{bb235978,
AUTHOR = "Chen, X.Y. and Liu, J. and Wang, Y. and Wang, P.P. and Brand, M. and Wang, G.H. and Koike-Akino, T.",
TITLE = "SuperLoRA: Parameter-Efficient Unified Adaptation for Large Vision
Models",
BOOKTITLE = ECV24,
YEAR = "2024",
PAGES = "8050-8055",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230947"}
@inproceedings{bb235979,
  author    = {Wei, C. and Liu, C.X. and Qiao, S.Y. and Zhang, Z.S. and Yuille, A.L. and Yu, J.H.},
  title     = {De-Diffusion Makes Text a Strong Cross-Modal Interface},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13492-13503},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230948},
}
@inproceedings{bb235980,
  author    = {Chen, B. and Xu, Z. and Kirmani, S. and Ichter, B. and Sadigh, D. and Guibas, L.J. and Xia, F.},
  title     = {SpatialVLM: Endowing Vision-Language Models with Spatial Reasoning
Capabilities},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {14455-14465},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230949},
}
@inproceedings{bb235981,
  author    = {Dorkenwald, M. and Barazani, N. and Snoek, C.G.M. and Asano, Y.M.},
  title     = {PIN: Positional Insert Unlocks Object Localisation Abilities in VLMs},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13548-13558},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230950},
}
@inproceedings{bb235982,
  author    = {Sun, Z.Y. and Fang, Y. and Wu, T. and Zhang, P. and Zang, Y.H. and Kong, S. and Xiong, Y.J. and Lin, D. and Wang, J.Q.},
  title     = {Alpha-CLIP: A CLIP Model Focusing on Wherever you Want},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13019-13029},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230951},
}
@inproceedings{bb235983,
  author    = {Parashar, S. and Lin, Z.Q. and Liu, T. and Dong, X.J. and Li, Y. and Ramanan, D. and Caverlee, J. and Kong, S.},
  title     = {The Neglected Tails in Vision-Language Models},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {12988-12997},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230952},
}
@inproceedings{bb235984,
  author    = {Luo, Y. and Shi, M. and Khan, M.O. and Afzal, M.M. and Huang, H. and Yuan, S. and Tian, Y. and Song, L. and Kouhana, A. and Elze, T. and Fang, Y. and Wang, M.Y.},
  title     = {FairCLIP: Harnessing Fairness in Vision-Language Learning},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {12289-12301},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230953},
}
@inproceedings{bb235985,
AUTHOR = "Zara, G. and Conti, A. and Roy, S. and Lathuili{\`e}re, S. and Rota, P. and Ricci, E.",
TITLE = "The Unreasonable Effectiveness of Large Language-Vision Models for
Source-free Video Domain Adaptation",
BOOKTITLE = ICCV23,
YEAR = "2023",
PAGES = "10273-10283",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT230954"}
@inproceedings{bb235986,
  author    = {Zhao, H.B. and Ni, B.L. and Fan, J.S. and Wang, Y.X. and Chen, Y.T. and Meng, G.F. and Zhang, Z.X.},
  title     = {Continual Forgetting for Pre-Trained Vision Models},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {28631-28642},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230955},
}
@inproceedings{bb235987,
  author    = {Zhan, X.Y. and Yang, L.X. and Zhao, Y.F. and Mao, K. and Xu, H.L. and Lin, Z. and Li, K.L. and Lu, C.},
  title     = {OakInk2: A Dataset of Bimanual Hands-Object Manipulation in Complex
Task Completion},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {445-456},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230956},
}
@inproceedings{bb235988,
  author    = {Li, Y.C. and Zhao, N. and Xiao, J.B. and Feng, C. and Wang, X. and Chua, T.S.},
  title     = {LASO: Language-Guided Affordance Segmentation on 3D Object},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {14251-14260},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230957},
}
@inproceedings{bb235989,
  author    = {Rotstein, N. and Bensaid, D. and Brody, S. and Ganz, R. and Kimmel, R.},
  title     = {FuseCap: Leveraging Large Language Models for Enriched Fused Image
Captions},
  booktitle = WACV24,
  year      = {2024},
  pages     = {5677-5688},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llm4.html#TT230958},
}
@article{bb235990,
AUTHOR = "Wang, Z. and Cai, S.F. and Liu, A. and Jin, Y.G. and Hou, J. and Zhang, B. and Lin, H. and He, Z.F. and Zheng, Z.L. and Yang, Y.D. and Ma, X.J. and Liang, Y.",
TITLE = "JARVIS-1: Open-World Multi-Task Agents With Memory-Augmented
Multimodal Language Models",
JOURNAL = PAMI,
VOLUME = "47",
YEAR = "2025",
NUMBER = "3",
MONTH = mar,
PAGES = "1894-1907",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT230959"}
@article{bb235991,
AUTHOR = "Li, Y.X. and Jiang, S.Y. and Hu, B.T. and Wang, L.Y. and Zhong, W.Q. and Luo, W.H. and Ma, L. and Zhang, M.",
TITLE = "Uni-MoE: Scaling Unified Multimodal LLMs With Mixture of Experts",
JOURNAL = PAMI,
VOLUME = "47",
YEAR = "2025",
NUMBER = "5",
MONTH = may,
PAGES = "3424-3439",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT230960"}
@article{bb235992,
AUTHOR = "Huang, Z.Z. and Zhong, S.S. and Zhou, P. and Gao, S. and Zitnik, M. and Lin, L.",
TITLE = "A Causality-Aware Paradigm for Evaluating Creativity of Multimodal
Large Language Models",
JOURNAL = PAMI,
VOLUME = "47",
YEAR = "2025",
NUMBER = "5",
MONTH = may,
PAGES = "3830-3846",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT230961"}
@article{bb235993,
AUTHOR = "Villani, F. and Maljkovic, I. and Lazzaro, D. and Sotgiu, A. and Cin{\`a}, A.E. and Roli, F.",
TITLE = "Robust image classification with multi-modal large language models",
JOURNAL = PRL,
VOLUME = "194",
YEAR = "2025",
PAGES = "1-7",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT230962"}
@article{bb235994,
  author    = {Shao, Z.W. and Yu, Z. and Yu, J. and Ouyang, X.C. and Zheng, L. and Gai, Z.B. and Wang, M.Y. and Kuang, Z.Z. and Ding, J.J.},
  title     = {Imp: Highly Capable Large Multimodal Models for Mobile Devices},
  journal   = MultMed,
  volume    = {27},
  year      = {2025},
  pages     = {2961-2974},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT230963},
}
@article{bb235995,
  author    = {Ge, J. and Zhang, X. and Zheng, Y. and Guo, K. and Liang, J.},
  title     = {RSTeller: Scaling up visual language modeling in remote sensing with
rich linguistic semantics from openly available data and large
language models},
  journal   = PandRS,
  volume    = {226},
  year      = {2025},
  pages     = {146-163},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT230964},
}
@article{bb235996,
  author    = {Li, Z.S. and Muhtar, D. and Gu, F. and He, Y.L.X. and Zhang, X.L. and Xiao, P.F. and He, G. and Zhu, X.X.},
  title     = {LHRS-Bot-Nova: Improved multimodal large language model for remote
sensing vision-language interpretation},
  journal   = PandRS,
  volume    = {227},
  year      = {2025},
  pages     = {539-550},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT230965},
}
@inproceedings{bb235997,
AUTHOR = "Muhtar, D. and Li, Z.S. and Gu, F. and Zhang, X.L. and Xiao, P.F.",
TITLE = "LHRS-Bot: Empowering Remote Sensing with VGI-Enhanced Large Multimodal
Language Model",
BOOKTITLE = ECCV24,
YEAR = "2024",
PAGES = "LXXIV: 440-457",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT230966"}
@article{bb235998,
  author    = {Li, X. and Zheng, Y. and Chen, H.T. and Chen, X.L. and Liang, Y.X. and Lai, C.H. and Li, B. and Xue, X.Y.},
  title     = {Instruction-guided fusion of multi-layer visual features in Large
Vision-Language Models},
  journal   = PR,
  volume    = {170},
  year      = {2026},
  pages     = {111932},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT230967},
}
@article{bb235999,
  author    = {Zhang, W.Y. and Wu, L. and Zhang, Z.Q. and Yu, T. and Ma, C. and Jin, X. and Yang, X.K. and Zeng, W.J.},
  title     = {Unleash the Power of Vision-Language Models by Visual Attention
Prompt and Multimodal Interaction},
  journal   = MultMed,
  volume    = {27},
  year      = {2025},
  pages     = {2399-2411},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmllm5.html#TT230968},
}
Last update: Nov 2, 2025 at 14:03:07