@inproceedings{bb228100,
        AUTHOR = "Zhang, J. W. and Xu, C. and Li, B.",
        TITLE = "{ChatScene}: Knowledge-Enabled Safety-Critical Scenario Generation for
Autonomous Vehicles",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "15459--15469",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223102"}

@inproceedings{bb228101,
        AUTHOR = "Liu, C. and Yin, K. and Cao, H. Y. and Jiang, X. H. and Li, X. and Liu, Y. and Jiang, D. Q. and Sun, X. and Xu, L.",
        TITLE = "{HRVDA}: High-Resolution Visual Document Assistant",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "15534--15545",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223103"}

@inproceedings{bb228102,
        AUTHOR = "Blau, T. and Fogel, S. and Ronen, R. and Golts, A. and Tsiper, S. and Avraham, E. B. and Aberdam, A. and Ganz, R. and Litman, R.",
        TITLE = "{GRAM}: Global Reasoning for Multi-Page {VQA}",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "15598--15607",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223104"}

@inproceedings{bb228103,
        AUTHOR = "Luo, C. and Shen, Y. F. and Zhu, Z. Q. and Zheng, Q. and Yu, Z. and Yao, C.",
        TITLE = "{LayoutLLM}: Layout Instruction Tuning with Large Language Models for
Document Understanding",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "15630--15640",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223105"}

@inproceedings{bb228104,
        AUTHOR = "Yang, Y. and Sun, F. Y. and Weihs, L. and Vanderbilt, E. and Herrasti, A. and Han, W. and Wu, J. J. and Haber, N. and Krishna, R. and Liu, L. J. and Callison-Burch, C. and Yatskar, M. and Kembhavi, A. and Clark, C.",
        TITLE = "Holodeck: Language Guided Generation of {3D} Embodied {AI} Environments",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "16277--16287",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223106"}

@inproceedings{bb228105,
        AUTHOR = "Qin, Y. and Zhou, E. and Liu, Q. and Yin, Z. F. and Sheng, L. and Zhang, R. M. and Qiao, Y. and Shao, J.",
        TITLE = "{MP5}: A Multi-modal Open-ended Embodied System in {Minecraft} via Active
Perception",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "16307--16316",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223107"}

@inproceedings{bb228106,
        AUTHOR = "Zhang, S. and Yu, X. Y. and Song, X. H. and Wang, X. H. and Jiang, S. Q.",
        TITLE = "Imagine Before Go: Self-Supervised Generative Map for Object Goal
Navigation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "16414--16425",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223108"}

@inproceedings{bb228107,
        AUTHOR = "Li, H. and Yang, X. and Wang, Z. K. and Zhu, X. Z. and Zhou, J. and Qiao, Y. and Wang, X. G. and Li, H. S. and Lu, L. W. and Dai, J. F.",
        TITLE = "Auto {MC-Reward}: Automated Dense Reward Design with Large Language
Models for {Minecraft}",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "16426--16435",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223109"}

@inproceedings{bb228108,
        AUTHOR = "Liu, M. X. and Hayes, T. L. and Ricci, E. and Csurka, G. and Volpi, R.",
        TITLE = "{SHiNe}: Semantic Hierarchy Nexus for Open-Vocabulary Object Detection",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "16634--16644",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223110"}

@inproceedings{bb228109,
        AUTHOR = "Lei, T. and Yin, S. F. and Liu, Y.",
        TITLE = "Exploring the Potential of Large Foundation Models for
Open-Vocabulary {HOI} Detection",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "16657--16667",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223111"}

@inproceedings{bb228110,
        AUTHOR = "Kim, J. and Cho, E. and Kim, S. and Kim, H. W. J.",
        TITLE = "Retrieval-Augmented Open-Vocabulary Object Detection",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "17427--17436",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223112"}

@inproceedings{bb228111,
        AUTHOR = "Saha, O. and van Horn, G. and Maji, S.",
        TITLE = "Improved Zero-Shot Classification by Adapting {VLMs} with Text
Descriptions",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "17542--17552",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223113"}

@inproceedings{bb228112,
        AUTHOR = "Toubal, I. E. and Avinash, A. and Alldrin, N. G. and Dlabal, J. and Zhou, W. and Luo, E. and Stretcu, O. and Xiong, H. and Lu, C. T. and Zhou, H. and Krishna, R. and Fuxman, A. and Duerig, T.",
        TITLE = "Modeling Collaborator: Enabling Subjective Vision Classification with
Minimal Human Effort via {LLM} Tool-Use",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "17553--17563",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223114"}

@inproceedings{bb228113,
        AUTHOR = "Li, X. Q. and Zhang, M. X. and Geng, Y. and Geng, H. R. and Long, Y. X. and Shen, Y. and Zhang, R. R. and Liu, J. and Dong, H.",
        TITLE = "{ManipLLM}: Embodied Multimodal Large Language Model for Object-Centric
Robotic Manipulation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "18061--18070",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223115"}

@inproceedings{bb228114,
        AUTHOR = "Han, T. and Bain, M. and Nagrani, A. and Varol, G. and Xie, W. and Zisserman, A.",
        TITLE = "{AutoAD III}: The Prequel: Back to the Pixels",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "18164--18174",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223116"}

@inproceedings{bb228115,
        AUTHOR = "Song, E. and Chai, W. H. and Wang, G. and Zhang, Y. C. and Zhou, H. Y. and Wu, F. and Chi, H. Z. and Guo, X. and Ye, T. and Zhang, Y. T. and Lu, Y. and Hwang, J. N. and Wang, G.",
        TITLE = "{MovieChat}: From Dense Token to Sparse Memory for Long Video
Understanding",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "18221--18232",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223117"}

@inproceedings{bb228116,
        AUTHOR = "Qu, H. X. and Cai, Y. J. and Liu, J.",
        TITLE = "{LLMs} are Good Action Recognizers",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "18395--18406",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223118"}

@inproceedings{bb228117,
        AUTHOR = "Chen, J. and Lv, Z. Y. and Wu, S. W. and Lin, K. Q. and Song, C. and Gao, D. F. and Liu, J. W. and Gao, Z. T. and Mao, D. X. and Shou, M. Z.",
        TITLE = "{VideoLLM-online}: Online Video Large Language Model for Streaming
Video",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "18407--18418",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223119"}

@inproceedings{bb228118,
        AUTHOR = "Zhu, A. and Ke, Q. H. and Gong, M. M. and Bailey, J.",
        TITLE = "Part-Aware Unified Representation of Language and Skeleton for
Zero-Shot Action Recognition",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "18761--18770",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223120"}

@inproceedings{bb228119,
        AUTHOR = "Chen, T. J. and Yu, H. S. and Yang, Z. G. and Li, Z. C. and Sun, W. and Chen, C.",
        TITLE = "{OST}: Refining Text Knowledge with Optimal Spatio-Temporal Descriptor
for General Video Recognition",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "18888--18898",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223121"}

@inproceedings{bb228120,
        AUTHOR = "Zhao, Q. H. and Dai, Y. and Li, H. and Hu, W. and Zhang, F. and Liu, J.",
        TITLE = "{LTGC}: Long-Tail Recognition via Leveraging {LLMs}-Driven Generated
Content",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "19510--19520",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223122"}

@inproceedings{bb228121,
        AUTHOR = "Siddiqui, Y. and Alliegro, A. and Artemov, A. and Tommasi, T. and Sirigatti, D. and Rosov, V. and Dai, A. and Nie{\ss}ner, M.",
        TITLE = "{MeshGPT}: Generating Triangle Meshes with Decoder-Only Transformers",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "19615--19625",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223123"}

@inproceedings{bb228122,
        AUTHOR = "Yuan, Z. H. and Ren, J. and Feng, C. M. and Zhao, H. S. and Cui, S. G. and Li, Z.",
        TITLE = "Visual Programming for Zero-Shot Open-Vocabulary {3D} Visual Grounding",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "20623--20633",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223124"}

@inproceedings{bb228123,
        AUTHOR = "Li, Z. and Gao, Z. Y. and Tan, C. and Ren, B. and Yang, L. T. and Li, S. Z.",
        TITLE = "General Point Model Pretraining with Autoencoding and Autoregressive",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "20954--20964",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223125"}

@inproceedings{bb228124,
        AUTHOR = "Li, K. C. and Wang, Y. and He, Y. and Li, Y. Z. and Wang, Y. and Liu, Y. and Wang, Z. and Xu, J. and Chen, G. and Lou, P. and Wang, L. M. and Qiao, Y.",
        TITLE = "{MVBench}: A Comprehensive Multi-modal Video Understanding Benchmark",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "22195--22206",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223126"}

@inproceedings{bb228125,
        AUTHOR = "Taesiri, M. R. and Feng, T. J. and Bezemer, C. P. and Nguyen, A.",
        TITLE = "{GlitchBench}: Can Large Multimodal Models Detect Video Game Glitches?",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "22444--22455",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223127"}

@inproceedings{bb228126,
        AUTHOR = "Zhang, R. and Zhang, Y. Z. and Chen, J. and Zhou, Y. F. and Gu, J. X. and Chen, C. and Sun, T.",
        TITLE = "{TRINS}: Towards Multimodal Language Models that Can Read",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "22584--22594",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223128"}

@inproceedings{bb228127,
        AUTHOR = "Zhang, H. J. and Su, Y. Y. and Xu, X. and Jia, K.",
        TITLE = "Improving the Generalization of Segmentation Foundation Model under
Distribution Shift via Weakly Supervised Adaptation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "23385--23395",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223129"}

@inproceedings{bb228128,
        AUTHOR = "Dunlap, L. and Zhang, Y. H. and Wang, X. H. and Zhong, R. Q. and Darrell, T. J. and Steinhardt, J. and Gonzalez, J. E. and Yeung-Levy, S.",
        TITLE = "Describing Differences in Image Sets with Natural Language",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "24199--24208",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223130"}

@inproceedings{bb228129,
        AUTHOR = "Ishmam, A. M. and Thomas, C.",
        TITLE = "Semantic Shield: Defending Vision-Language Models Against Backdooring
and Poisoning via Fine-Grained Knowledge Alignment",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "24820--24830",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223131"}

@inproceedings{bb228130,
        AUTHOR = "Wu, H. N. and Zhang, Z. C. and Zhang, E. and Chen, C. F. and Liao, L. and Wang, A. and Xu, K. X. and Li, C. Y. and Hou, J. W. and Zhai, G. T. and Xue, G. and Sun, W. X. and Yan, Q. and Lin, W. S.",
        TITLE = "{Q-Instruct}: Improving Low-Level Visual Abilities for Multi-Modality
Foundation Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "25490--25500",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223132"}

@inproceedings{bb228131,
        AUTHOR = "Yang, Y. J. and Zhou, T. Y. and Li, K. and Tao, D. P. and Li, L. and Shen, L. and He, X. D. and Jiang, J. and Shi, Y. H.",
        TITLE = "Embodied Multi-Modal Agent trained by an {LLM} from a Parallel
{TextWorld}",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26265--26275",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223133"}

@inproceedings{bb228132,
        AUTHOR = "Hong, Y. and Zheng, Z. and Chen, P. H. and Wang, Y. F. and Li, J. and Gan, C.",
        TITLE = "{MultiPLY}: A Multisensory Object-Centric Embodied Large Language Model
in {3D} World",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26396--26406",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223134"}

@inproceedings{bb228133,
        AUTHOR = "Chen, G. and Shen, L. and Shao, R. and Deng, X. and Nie, L. Q.",
        TITLE = "{LION}: Empowering Multimodal Large Language Model with Dual-Level
Visual Knowledge",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26530--26540",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223135"}

@inproceedings{bb228134,
        AUTHOR = "Zhang, Y. and Dong, Y. P. and Zhang, S. Y. and Min, T. Z. and Su, H. and Zhu, J.",
        TITLE = "Exploring the Transferability of Visual Prompting for Multimodal
Large Language Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26552--26562",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223136"}

@inproceedings{bb228135,
        AUTHOR = "Han, J. and Gong, K. X. and Zhang, Y. Y. and Wang, J. Q. and Zhang, K. and Lin, D. and Qiao, Y. and Gao, P. and Yue, X. Y.",
        TITLE = "{OneLLM}: One Framework to Align All Modalities with Language",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26574--26585",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223137"}

@inproceedings{bb228136,
        AUTHOR = "Xie, H. X. and Peng, C. J. and Tseng, Y. W. and Chen, H. J. and Hsu, C. F. and Shuai, H. H. and Cheng, W. H.",
        TITLE = "{EmoVIT}: Revolutionizing Emotion Insights with Visual Instruction
Tuning",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26586--26595",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223138"}

@inproceedings{bb228137,
        AUTHOR = "Wang, X. Y. and Zhuang, B. and Wu, Q.",
        TITLE = "{ModaVerse}: Efficiently Transforming Modalities with {LLMs}",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26596--26606",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223139"}

@inproceedings{bb228138,
        AUTHOR = "Lin, J. and Yin, H. X. and Ping, W. and Molchanov, P. and Shoeybi, M. and Han, S.",
        TITLE = "{VILA}: On Pre-training for Visual Language Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26679--26689",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223140"}

@inproceedings{bb228139,
        AUTHOR = "Li, L. and Peng, J. W. and Chen, H. and Gao, C. Y. and Yang, X.",
        TITLE = "How to Configure Good In-Context Sequence for Visual Question
Answering",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26700--26710",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223141"}

@inproceedings{bb228140,
        AUTHOR = "Lyu, Y. H. and Zheng, X. and Zhou, J. Z. and Wang, L.",
        TITLE = "{UniBind}: {LLM}-Augmented Unified and Balanced Representation Space to
Bind Them All",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26742--26752",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223142"}

@inproceedings{bb228141,
        AUTHOR = "Liang, T. and Huang, J. and Kong, M. and Chen, L. and Zhu, Q.",
        TITLE = "Querying as Prompt: Parameter-Efficient Learning for Multimodal
Language Model",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26845--26855",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223143"}

@inproceedings{bb228142,
        AUTHOR = "Jiang, C. Y. and Xu, H. Y. and Dong, M. and Chen, J. X. and Ye, W. and Yan, M. and Ye, Q. and Zhang, J. and Huang, F. and Zhang, S. K.",
        TITLE = "Hallucination Augmented Contrastive Learning for Multimodal Large
Language Model",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "27026--27036",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223144"}

@inproceedings{bb228143,
        AUTHOR = "Zhu, L. and Wei, F. and Lu, Y.",
        TITLE = "Beyond Text: Frozen Large Language Models in Visual Signal
Comprehension",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "27037--27047",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223145"}

@inproceedings{bb228144,
        AUTHOR = "Pi, R. J. and Yao, L. W. and Gao, J. and Zhang, J. P. and Zhang, T.",
        TITLE = "{PerceptionGPT}: Effectively Fusing Visual Perception Into {LLM}",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "27114--27123",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223146"}

@inproceedings{bb228145,
        AUTHOR = "Tai, Y. and Fan, W. C. and Zhang, Z. and Liu, Z. W.",
        TITLE = "Link-Context Learning for Multimodal {LLMs}",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "27166--27175",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223147"}

@inproceedings{bb228146,
        AUTHOR = "Tang, Z. and Yang, Z. and Khademi, M. and Liu, Y. and Zhu, C. G. and Bansal, M.",
        TITLE = "{CoDi-2}: In-Context, Interleaved, and Interactive Any-to-Any
Generation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "27415--27424",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223148"}

@inproceedings{bb228147,
        AUTHOR = "Jain, J. and Yang, J. W. and Shi, H.",
        TITLE = "{VCoder}: Versatile Vision Encoders for Multimodal Large Language
Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "27992--28002",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223149"}

@inproceedings{bb228148,
        AUTHOR = "Yuan, Y. Q. and Li, W. and Liu, J. and Tang, D. Q. and Luo, X. J. and Qin, C. and Zhang, L. and Zhu, J.",
        TITLE = "Osprey: Pixel Understanding with Visual Instruction Tuning",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "28202--28211",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223150"}

@inproceedings{bb228149,
        AUTHOR = "Zhai, A. J. and Shen, Y. and Chen, E. Y. and Wang, G. X. and Wang, X. L. and Wang, S. and Guan, K. Y. and Wang, S.",
        TITLE = "Physical Property Understanding from Language-Embedded Feature Fields",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "28296--28305",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223151"}

@inproceedings{bb228150,
        AUTHOR = "Zheng, Z. H. and Wei, J. and Hu, X. F. and Zhu, H. D. and Nevatia, R.",
        TITLE = "Large Language Models are Good Prompt Learners for Low-Shot Image
Classification",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "28453--28462",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223152"}

@inproceedings{bb228151,
        AUTHOR = "He, H. Y. and Pan, Z. Z. and Liu, J. and Cai, J. F. and Zhuang, B.",
        TITLE = "Efficient Stitchable Task Adaptation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "28555--28565",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223153"}

@inproceedings{bb228152,
        AUTHOR = "Tian, X. Y. and Zou, S. and Yang, Z. Y. and Zhang, J.",
        TITLE = "{ArGue}: Attribute-Guided Prompt Tuning for Vision-Language Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "28578--28587",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223154"}

@inproceedings{bb228153,
        AUTHOR = "Han, G. X. and Lim, S. N.",
        TITLE = "Few-Shot Object Detection with Foundation Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "28608--28618",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223155"}

@inproceedings{bb228154,
        AUTHOR = "Roberts, J. and Luddecke, T. and Sheikh, R. and Han, K. and Albanie, S.",
        TITLE = "Charting New Territories: Exploring the Geographic and Geospatial
Capabilities of Multimodal {LLMs}",
        BOOKTITLE = EarthVision24,
        YEAR = "2024",
        PAGES = "554--563",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223156"}

@inproceedings{bb228155,
        AUTHOR = "Barbany, O. and Huang, M. and Zhu, X. L. and Dhua, A.",
        TITLE = "Leveraging Large Language Models for Multimodal Search",
        BOOKTITLE = FGVC24,
        YEAR = "2024",
        PAGES = "1201--1210",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223157"}

@inproceedings{bb228156,
        AUTHOR = "Lv, J. X. and Huang, Y. and Yan, M. and Huang, J. C. and Liu, J. Z. and Liu, Y. F. and Wen, Y. F. and Chen, X. X. and Chen, S. F.",
        TITLE = "{GPT4Motion}: Scripting Physical Motions in Text-to-Video Generation
via {Blender}-Oriented {GPT} Planning",
        BOOKTITLE = PBDL24,
        YEAR = "2024",
        PAGES = "1430--1440",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223158"}

@inproceedings{bb228157,
        AUTHOR = "Baldassini, F. B. and Shukor, M. and Cord, M. and Soulier, L. and Piwowarski, B.",
        TITLE = "What Makes Multimodal In-Context Learning Work?",
        BOOKTITLE = Prompting24,
        YEAR = "2024",
        PAGES = "1539--1550",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223159"}

@inproceedings{bb228158,
        AUTHOR = "Wang, J. C. and Ke, L.",
        TITLE = "{LLM-Seg}: Bridging Image Segmentation and Large Language Model
Reasoning",
        BOOKTITLE = WhatNext24,
        YEAR = "2024",
        PAGES = "1765--1774",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223160"}

@inproceedings{bb228159,
        AUTHOR = "Qu, M. X. and Chen, X. D. and Liu, W. and Li, A. and Zhao, Y.",
        TITLE = "{ChatVTG}: Video Temporal Grounding via Chat with Video Dialogue Large
Language Models",
        BOOKTITLE = PVUW24,
        YEAR = "2024",
        PAGES = "1847--1856",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223161"}

@inproceedings{bb228160,
        AUTHOR = "Hakim, Z. I. A. and Sarker, N. H. and Singh, R. P. and Paul, B. and Dabouei, A. and Xu, M.",
        TITLE = "Leveraging Generative Language Models for Weakly Supervised Sentence
Component Analysis in Video-Language Joint Learning",
        BOOKTITLE = MULA24,
        YEAR = "2024",
        PAGES = "1975--1985",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223162"}

@inproceedings{bb228161,
        AUTHOR = "Deria, A. and Kumar, K. and Chakraborty, S. and Mahapatra, D. and Roy, S.",
        TITLE = "{InVERGe}: Intelligent Visual Encoder for Bridging Modalities in Report
Generation",
        BOOKTITLE = MULA24,
        YEAR = "2024",
        PAGES = "2028--2038",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223163"}

@inproceedings{bb228162,
        AUTHOR = "Ma, F. P. and Zhou, Y. Z. and Zhang, Y. Y. and Wu, S. Y. and Zhang, Z. and He, Z. L. and Rao, F. Y. and Sun, X. Y.",
        TITLE = "Task Navigator: Decomposing Complex Tasks for Multimodal Large
Language Models",
        BOOKTITLE = Reasoning24,
        YEAR = "2024",
        PAGES = "2248--2257",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223164"}

@inproceedings{bb228163,
        AUTHOR = "Arefeen, M. A. and Debnath, B. and Uddin, M. Y. S. and Chakradhar, S.",
        TITLE = "{ViTA}: An Efficient Video-to-Text Algorithm using {VLM} for {RAG}-based
Video Analysis System",
        BOOKTITLE = Reasoning24,
        YEAR = "2024",
        PAGES = "2266--2274",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223165"}

@inproceedings{bb228164,
        AUTHOR = "Chen, Y. W. and Chu, S. Y.",
        TITLE = "Large Language Models in Wargaming: Methodology, Application, and
Robustness",
        BOOKTITLE = AML24,
        YEAR = "2024",
        PAGES = "2894--2903",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223166"}

@inproceedings{bb228165,
        AUTHOR = "Lai, Z. X. and Wu, J. and Chen, S. and Zhou, Y. C. and Hovakimyan, N.",
        TITLE = "Residual-based Language Models are Free Boosters for Biomedical
Imaging Tasks",
        BOOKTITLE = DEF-AI-MIA24,
        YEAR = "2024",
        PAGES = "5086--5096",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223167"}

@inproceedings{bb228166,
        AUTHOR = "Verma, A. A. and Saeidi, A. and Hegde, S. and Therala, A. and Bardoliya, F. D. and Machavarapu, N. and Ravindhiran, S. A. K. and Malyala, S. and Chatterjee, A. and Yang, Y. Z. and Baral, C.",
        TITLE = "Evaluating Multimodal Large Language Models across Distribution
Shifts and Augmentations",
        BOOKTITLE = GenerativeFM24,
        YEAR = "2024",
        PAGES = "5314--5324",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223168"}

@inproceedings{bb228167,
        AUTHOR = "Fang, X. and Wang, W. G. and Lv, X. X. and Yan, J.",
        TITLE = "{PCQA}: A Strong Baseline for {AIGC} Quality Assessment Based on Prompt
Condition",
        BOOKTITLE = NTIRE24,
        YEAR = "2024",
        PAGES = "6167--6176",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223169"}

@inproceedings{bb228168,
        AUTHOR = "Ye, Z. and Liu, J. X. and Cao, J. J. and Chen, Z. Y. and Xuan, Z. W. and Zhou, M. Y. and Liu, Q. and Qi, G. J.",
        TITLE = "{OpenStory}: A Large-Scale Open-Domain Dataset for Subject-Driven
Visual Storytelling",
        BOOKTITLE = VDU24,
        YEAR = "2024",
        PAGES = "7953--7962",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223170"}

@inproceedings{bb228169,
        AUTHOR = "Chen, X. Y. and Liu, J. and Wang, Y. and Wang, P. P. and Brand, M. and Wang, G. H. and Koike-Akino, T.",
        TITLE = "{SuperLoRA}: Parameter-Efficient Unified Adaptation for Large Vision
Models",
        BOOKTITLE = ECV24,
        YEAR = "2024",
        PAGES = "8050--8055",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223171"}

@inproceedings{bb228170,
        AUTHOR = "Chen, Z. and Wu, J. N. and Wang, W. H. and Su, W. J. and Chen, G. and Xing, S. and Zhong, M. and Zhang, Q. L. and Zhu, X. Z. and Lu, L. W. and Li, B. and Luo, P. and Lu, T. and Qiao, Y. and Dai, J. F.",
        TITLE = "{InternVL}: Scaling up Vision Foundation Models and Aligning for
Generic Visual-Linguistic Tasks",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "24185--24198",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223172"}

@inproceedings{bb228171,
        AUTHOR = "Zhang, J. Y. and Huang, Z. M. and Ray, A. and Ohn-Bar, E.",
        TITLE = "Feedback-Guided Autonomous Driving",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "15000--15011",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223173"}

@inproceedings{bb228172,
        AUTHOR = "Wei, C. and Liu, C. X. and Qiao, S. Y. and Zhang, Z. S. and Yuille, A. L. and Yu, J.",
        TITLE = "{De-Diffusion} Makes Text a Strong Cross-Modal Interface",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13492--13503",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223174"}

@inproceedings{bb228173,
        AUTHOR = "Chen, Y. and Sikka, K. and Cogswell, M. and Ji, H. and Divakaran, A.",
        TITLE = "{DRESS}: Instructing Large Vision-Language Models to Align and
Interact with Humans via Natural Language Feedback",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14239--14250",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223175"}

@inproceedings{bb228174,
        AUTHOR = "Chen, B. and Xu, Z. and Kirmani, S. and Ichter, B. and Sadigh, D. and Guibas, L. J. and Xia, F.",
        TITLE = "{SpatialVLM}: Endowing Vision-Language Models with Spatial Reasoning
Capabilities",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14455--14465",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223176"}

@inproceedings{bb228175,
        AUTHOR = "Dorkenwald, M. and Barazani, N. and Snoek, C. G. M. and Asano, Y. M.",
        TITLE = "{PIN}: Positional Insert Unlocks Object Localisation Abilities in {VLMs}",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13548--13558",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223177"}

@inproceedings{bb228176,
        AUTHOR = "Cha, J. and Kang, W. and Mun, J. and Roh, B.",
        TITLE = "Honeybee: Locality-Enhanced Projector for Multimodal {LLM}",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13817--13827",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223178"}

@inproceedings{bb228177,
        AUTHOR = "Huang, Q. D. and Dong, X. Y. and Zhang, P. and Wang, B. and He, C. H. and Wang, J. Q. and Lin, D. and Zhang, W. M. and Yu, N. H.",
        TITLE = "{OPERA}: Alleviating Hallucination in Multi-Modal Large Language Models
via Over-Trust Penalty and Retrospection-Allocation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13418--13427",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223179"}

@inproceedings{bb228178,
        AUTHOR = "Zhang, Y. and Ma, Z. Q. and Gao, X. F. and Shakiah, S. and Gao, Q. and Chai, J.",
        TITLE = "Groundhog: Grounding Large Language Models to Holistic Segmentation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14227--14238",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223180"}

@inproceedings{bb228179,
        AUTHOR = "Sun, Z. Y. and Fang, Y. and Wu, T. and Zhang, P. and Zang, Y. H. and Kong, S. and Xiong, Y. J. and Lin, D. and Wang, J. Q.",
        TITLE = "{Alpha-CLIP}: A {CLIP} Model Focusing on Wherever you Want",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13019--13029",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223181"}

@inproceedings{bb228180,
        AUTHOR = "Parashar, S. and Lin, Z. Q. and Liu, T. and Dong, X. J. and Li, Y. and Ramanan, D. and Caverlee, J. and Kong, S.",
        TITLE = "The Neglected Tails in Vision-Language Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "12988--12997",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223182"}

@inproceedings{bb228181,
        AUTHOR = "Yu, Q. F. and Li, J. C. and Wei, L. H. and Pang, L. and Ye, W. T. and Qin, B. S. and Tang, S. L. and Tian, Q. and Zhuang, Y. T.",
        TITLE = "{HalluciDoctor}: Mitigating Hallucinatory Toxicity in Visual Instruction Data",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "12944--12953",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223183"}

@inproceedings{bb228182,
        AUTHOR = "Luo, Y. and Shi, M. and Khan, M. O. and Afzal, M. M. and Huang, H. and Yuan, S. and Tian, Y. and Song, L. and Kouhana, A. and Elze, T. and Fang, Y. and Wang, M. Y.",
        TITLE = "{FairCLIP}: Harnessing Fairness in Vision-Language Learning",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "12289--12301",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223184"}

@inproceedings{bb228183,
        AUTHOR = "Zara, G. and Conti, A. and Roy, S. and Lathuili{\`e}re, S. and Rota, P. and Ricci, E.",
        TITLE = "The Unreasonable Effectiveness of Large Language-Vision Models for Source-free Video Domain Adaptation",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "10273--10283",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223185"}

@inproceedings{bb228184,
        AUTHOR = "Liao, Z. and Li, J. T. and Niu, L. and Zhang, L. Q.",
        TITLE = "Align and Aggregate: Compositional Reasoning with Video Alignment and Answer Aggregation for Video Question-Answering",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13395--13404",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223186"}

@inproceedings{bb228185,
        AUTHOR = "Zhao, H. B. and Ni, B. and Fan, J. S. and Wang, Y. X. and Chen, Y. T. and Meng, G. F. and Zhang, Z. X.",
        TITLE = "Continual Forgetting for Pre-Trained Vision Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "28631--28642",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223187"}

@inproceedings{bb228186,
        AUTHOR = "Kim, K. and Yoon, K. and Jeon, J. and In, Y. and Moon, J. and Kim, D. H. and Park, C.",
        TITLE = "{LLM4SGG}: Large Language Models for Weakly Supervised Scene Graph Generation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "28306--28316",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223188"}

@inproceedings{bb228187,
        AUTHOR = "Zhan, X. Y. and Yang, L. X. and Zhao, Y. F. and Mao, K. and Xu, H. L. and Lin, Z. and Li, K. L. and Lu, C.",
        TITLE = "{OakInk2}: A Dataset of Bimanual Hands-Object Manipulation in Complex Task Completion",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "445--456",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223189"}

@inproceedings{bb228188,
        AUTHOR = "Li, Y. C. and Zhao, N. and Xiao, J. B. and Feng, C. and Wang, X. and Chua, T. S.",
        TITLE = "{LASO}: Language-Guided Affordance Segmentation on {3D} Object",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14251--14260",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223190"}

@inproceedings{bb228189,
        AUTHOR = "Rotstein, N. and Bensaid, D. and Brody, S. and Ganz, R. and Kimmel, R.",
        TITLE = "{FuseCap}: Leveraging Large Language Models for Enriched Fused Image Captions",
        BOOKTITLE = WACV24,
        YEAR = "2024",
        PAGES = "5677--5688",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223191"}

@inproceedings{bb228190,
        AUTHOR = "Pan, J. T. and Lin, Z. and Ge, Y. Y. and Zhu, X. T. and Zhang, R. R. and Wang, Y. and Qiao, Y. and Li, H. S.",
        TITLE = "Retrieving-to-Answer: Zero-Shot Video Question Answering with Frozen Large Language Models",
        BOOKTITLE = MMFM23,
        YEAR = "2023",
        PAGES = "272--283",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223192"}

@inproceedings{bb228191,
        AUTHOR = "Guo, J. X. and Li, J. and Li, D. X. and Tiong, A. M. H. and Li, B. Y. and Tao, D. C. and Hoi, S.",
        TITLE = "From Images to Textual Prompts: Zero-shot Visual Question Answering with Frozen Large Language Models",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "10867--10877",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llm4.html#TT223193"}

@article{bb228192,
        AUTHOR = "Zhou, N. and Fan, J. P.",
        TITLE = "Automatic image-text alignment for large-scale web image indexing and retrieval",
        JOURNAL = PR,
        VOLUME = "48",
        YEAR = "2015",
        NUMBER = "1",
        PAGES = "205--219",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT223194"}

@article{bb228193,
        AUTHOR = "Huang, F. R. and Zhang, X. M. and Zhao, Z. H. and Li, Z. J.",
        TITLE = "Bi-Directional Spatial-Semantic Attention Networks for Image-Text Matching",
        JOURNAL = IP,
        VOLUME = "28",
        YEAR = "2019",
        NUMBER = "4",
        MONTH = apr,
        PAGES = "2008--2020",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT223195"}

@article{bb228194,
        AUTHOR = "Otto, C. and Springstein, M. and Anand, A. and Ewerth, R.",
        TITLE = "Characterization and classification of semantic image-text relations",
        JOURNAL = MultInfoRetr,
        VOLUME = "9",
        YEAR = "2020",
        NUMBER = "1",
        MONTH = mar,
        PAGES = "31--45",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT223196"}

@article{bb228195,
  author    = {Niu, K. and Huang, Y. and Wang, L.},
  title     = {Re-ranking image-text matching by adaptive metric fusion},
  journal   = PR,
  volume    = {104},
  year      = {2020},
  pages     = {107351},
  bibsource = {http://www.visionbib.com/bibliography/applicat803imt4.html#TT223197}
}

@article{bb228196,
        AUTHOR = "Wen, K. Y. and Gu, X. D. and Cheng, Q. R.",
        TITLE = "Learning Dual Semantic Relations With Graph Attention for Image-Text Matching",
        JOURNAL = CirSysVideo,
        VOLUME = "31",
        YEAR = "2021",
        NUMBER = "7",
        MONTH = jul,
        PAGES = "2866--2879",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT223198"}

@article{bb228197,
        AUTHOR = "Yang, S. and Li, Q. and Li, W. H. and Li, X. and Liu, A. A.",
        TITLE = "Dual-Level Representation Enhancement on Characteristic and Context for Image-Text Retrieval",
        JOURNAL = CirSysVideo,
        VOLUME = "32",
        YEAR = "2022",
        NUMBER = "11",
        MONTH = nov,
        PAGES = "8037--8050",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT223199"}

@article{bb228198,
        AUTHOR = "Jing, Y. and Wang, W. and Wang, L. and Tan, T. N.",
        TITLE = "Learning Aligned Image-Text Representations Using Graph Attentive Relational Network",
        JOURNAL = IP,
        VOLUME = "30",
        YEAR = "2021",
        PAGES = "1840--1852",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT223200"}

@inproceedings{bb228199,
        AUTHOR = "Zhao, F. and Huang, Y. Z. and Wang, L. and Tan, T. N.",
        TITLE = "Deep Semantic Ranking Based Hashing for Multi-Label Image Retrieval",
        BOOKTITLE = CVPR15,
        YEAR = "2015",
        PAGES = "1556--1564",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803imt4.html#TT223201"}

Last update: Mar 17, 2025 at 20:02:03