@inproceedings{bb243100,
  author    = {Chen, Y.Y. and Xu, D.X. and Huang, Y. and Zhan, S.K. and Wang, H. and Chen, D.X. and Wang, X.P. and Qiu, M.K. and Li, H.},
  title     = {MIMO: A medical vision language model with visual referring multimodal input and pixel grounding multimodal output},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {24732-24741},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT238014},
}
@inproceedings{bb243101,
  author    = {Huang, H.F. and Chen, X. and Chen, Y.L. and Li, H. and Han, X. and Wang, Z. and Wang, T. and Pang, J.M. and Zhao, Z.},
  title     = {RoboGround: Robotic Manipulation with Grounded Vision-Language Priors},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {22540-22550},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT238015},
}
@inproceedings{bb243102,
  author    = {Man, Y.Z. and Huang, D.A. and Liu, G.L. and Sheng, S.W. and Liu, S.L. and Gui, L.Y. and Kautz, J. and Wang, Y.X. and Yu, Z.},
  title     = {Argus: Vision-Centric Reasoning with Grounded Chain-of-Thought},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {14268-14280},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT238016},
}
@inproceedings{bb243103,
  author    = {Yin, H. and Ren, Y.Q. and Yan, K. and Ding, S.H. and Hao, Y.T.},
  title     = {ROD-MLLM: Towards More Reliable Object Detection in Multimodal Large Language Models},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {14358-14368},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT238017},
}
@inproceedings{bb243104,
  author    = {Liao, Y.H. and Mahmood, R. and Fidler, S. and Acuna, D.},
  title     = {Can Large Vision-Language Models Correct Semantic Grounding Errors By Themselves?},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {14667-14678},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT238018},
}
@inproceedings{bb243105,
  author    = {Yuan, Z.H. and Peng, Y. and Ren, J. and Liao, Y.H. and Han, Y. and Feng, C.M. and Zhao, H.S. and Li, G.B. and Cui, S.G. and Li, Z.},
  title     = {Empowering Large Language Models with 3D Situation Awareness},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {19435-19445},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT238019},
}
@inproceedings{bb243106,
  author    = {Kang, S. and Kim, J. and Kim, J. and Hwang, S.J.},
  title     = {Your Large Vision-Language Model Only Needs A Few Attention Heads For Visual Grounding},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {9339-9350},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT238020},
}
@inproceedings{bb243107,
  author    = {Liu, Q.Y. and Zhang, S.Q. and Qiao, Y.Y. and Zhu, J.Y. and Li, X. and Guo, L.T. and Wang, Q. and He, X.J. and Wu, Q. and Liu, J.},
  title     = {GroundingMate: Aiding Object Grounding for Goal-Oriented Vision-and-Language Navigation},
  booktitle = WACV25,
  year      = {2025},
  pages     = {1775-1784},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT238021},
}
@inproceedings{bb243108,
  author    = {Yan, S. and Bai, M. and Chen, W.F. and Zhou, X. and Huang, Q.X. and Li, L.E.},
  title     = {Vigor: Improving Visual Grounding of Large Vision Language Models with Fine-grained Reward Modeling},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {LXI: 37-53},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT238022},
}
@inproceedings{bb243109,
  author    = {Chowdhury, S. and Nag, S. and Dasgupta, S. and Chen, J. and Elhoseiny, M. and Gao, R.H. and Manocha, D.},
  title     = {Meerkat: Audio-visual Large Language Model for Grounding in Space and Time},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {LXIV: 52-70},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT238023},
}
@inproceedings{bb243110,
  author    = {Kuckreja, K. and Danish, M.S. and Naseer, M. and Das, A. and Khan, S. and Khan, F.S.},
  title     = {GeoChat: Grounded Large Vision-Language Model for Remote Sensing},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {27831-27840},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT238024},
}
@inproceedings{bb243111,
  author    = {Song, C.H. and Sadler, B.M. and Wu, J. and Chao, W.L. and Washington, C. and Su, Y.},
  title     = {LLM-Planner: Few-Shot Grounded Planning for Embodied Agents with Large Language Models},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {2986-2997},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT238025},
}
@inproceedings{bb243112,
  author    = {You, K. and Zhang, H.T. and Schoop, E. and Weers, F. and Swearngin, A. and Nichols, J. and Yang, Y.F. and Gan, Z.},
  title     = {FERRET-UI: Grounded Mobile UI Understanding with Multimodal LLMs},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {LXIV: 240-255},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT238026},
}
@inproceedings{bb243113,
  author    = {Tong, S.B. and Liu, Z. and Zhai, Y.X. and Ma, Y. and LeCun, Y. and Xie, S.},
  title     = {Eyes Wide Shut? Exploring the Visual Shortcomings of Multimodal LLMs},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {9568-9578},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT238027},
}
@inproceedings{bb243114,
  author    = {Xu, J.R. and Zhou, X.Y. and Yan, S. and Gu, X. and Arnab, A. and Sun, C. and Wang, X.L. and Schmid, C.},
  title     = {Pixel Aligned Language Models},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13030-13039},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT238028},
}
@inproceedings{bb243115,
  author    = {Wu, P.H. and Xie, S.},
  title     = {V*: Guided Visual Search as a Core Mechanism in Multimodal LLMs},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13084-13094},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT238029},
}
@inproceedings{bb243116,
  author    = {He, R. and Cascante Bonilla, P. and Yang, Z.Y. and Berg, A.C. and Ordonez, V.},
  title     = {Improved Visual Grounding through Self-Consistent Explanations},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13095-13105},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT238030},
}
@inproceedings{bb243117,
  author    = {Feng, C. and Hsu, J. and Liu, W.Y. and Wu, J.J.},
  title     = {Naturally Supervised 3D Visual Grounding with Language-Regularized Concept Learners},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13269-13278},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT238031},
}
@inproceedings{bb243118,
  author    = {He, J.W. and Wang, Y.F. and Wang, L.J. and Lu, H.C. and He, J.Y. and Lan, J.P. and Luo, B. and Xie, X.},
  title     = {Multi-Modal Instruction Tuned LLMs with Fine-Grained Visual Perception},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13980-13990},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT238032},
}
@inproceedings{bb243119,
  author    = {Yuan, Z.H. and Ren, J. and Feng, C.M. and Zhao, H.S. and Cui, S.G. and Li, Z.},
  title     = {Visual Programming for Zero-Shot Open-Vocabulary 3D Visual Grounding},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {20623-20633},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT238033},
}
@inproceedings{bb243120,
  author    = {Chen, G. and Shen, L. and Shao, R. and Deng, X. and Nie, L.Q.},
  title     = {LION: Empowering Multimodal Large Language Model with Dual-Level Visual Knowledge},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {26530-26540},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT238034},
}
@inproceedings{bb243121,
  author    = {Qu, M.X. and Chen, X.D. and Liu, W. and Li, A. and Zhao, Y.},
  title     = {ChatVTG: Video Temporal Grounding via Chat with Video Dialogue Large Language Models},
  booktitle = PVUW24,
  year      = {2024},
  pages     = {1847-1856},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT238035},
}
@inproceedings{bb243122,
AUTHOR = "Zhang, Y. and Ma, Z.Q. and Gao, X.F. and Shakiah, S. and Gao, Q. and Chai, J.",
TITLE = "Groundhog: Grounding Large Language Models to Holistic Segmentation",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "14227-14238",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT238036"}
@inproceedings{bb243123,
  author    = {Kim, K. and Yoon, K. and Jeon, J. and In, Y. and Moon, J. and Kim, D.H. and Park, C.},
  title     = {LLM4SGG: Large Language Models for Weakly Supervised Scene Graph Generation},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {28306-28316},
  bibsource = {http://www.visionbib.com/bibliography/applicat803llmgr4.html#TT238037},
}
@article{bb243124,
  author    = {Liang, J.W. and Jiang, L. and Cao, L.L. and Kalantidis, Y. and Li, L.J. and Hauptmann, A.G.},
  title     = {Focal Visual-Text Attention for Memex Question Answering},
  journal   = PAMI,
  year      = {2019},
  month     = {August},
  volume    = {41},
  number    = {8},
  pages     = {1893-1908},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT238039},
}
@inproceedings{bb243125,
  author    = {Liang, J.W. and Jiang, L. and Cao, L.L. and Li, L.J. and Hauptmann, A.G.},
  title     = {Focal Visual-Text Attention for Visual Question Answering},
  booktitle = CVPR18,
  year      = {2018},
  pages     = {6135-6143},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT238040},
}
@article{bb243126,
  author    = {Riquelme, F. and de Goyeneche, A. and Zhang, Y.D. and Niebles, J.C. and Soto, A.},
  title     = {Explaining VQA predictions using visual grounding and a knowledge base},
  journal   = IVC,
  year      = {2020},
  volume    = {101},
  pages     = {103968},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT238041},
}
@article{bb243127,
  author    = {Plummer, B.A. and Shih, K.J. and Li, Y.C. and Xu, K. and Lazebnik, S. and Sclaroff, S. and Saenko, K.},
  title     = {Revisiting Image-Language Networks for Open-Ended Phrase Detection},
  journal   = PAMI,
  year      = {2022},
  month     = {April},
  volume    = {44},
  number    = {4},
  pages     = {2155-2167},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT238042},
}
@inproceedings{bb243128,
  author    = {Burns, A. and Tan, R. and Saenko, K. and Sclaroff, S. and Plummer, B.A.},
  title     = {Language Features Matter: Effective Language Representations for Vision-Language Tasks},
  booktitle = ICCV19,
  year      = {2019},
  pages     = {7473-7482},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT238043},
}
@inproceedings{bb243129,
  author    = {Arbelle, A. and Doveh, S. and Alfassy, A. and Shtok, J. and Lev, G. and Schwartz, E. and Kuehne, H. and Levi, H.B. and Sattigeri, P. and Panda, R. and Chen, C.F. and Bronstein, A.M. and Saenko, K. and Ullman, S. and Giryes, R. and Feris, R.S. and Karlinsky, L.},
  title     = {Detector-Free Weakly Supervised Grounding by Separation},
  booktitle = ICCV21,
  year      = {2021},
  pages     = {1781-1792},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT238044},
}
@inproceedings{bb243130,
  author    = {Whitehead, S. and Wu, H. and Ji, H. and Feris, R.S. and Saenko, K.},
  title     = {Separating Skills and Concepts for Novel Visual Question Answering},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {5628-5637},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT238045},
}
@article{bb243131,
  author    = {Zhao, L.C. and Cai, D.G. and Zhang, J. and Sheng, L. and Xu, D. and Zheng, R. and Zhao, Y.J. and Wang, L.P. and Fan, X.},
  title     = {Toward Explainable 3D Grounded Visual Question Answering: A New Benchmark and Strong Baseline},
  journal   = CirSysVideo,
  year      = {2023},
  month     = {June},
  volume    = {33},
  number    = {6},
  pages     = {2935-2949},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT238046},
}
@article{bb243132,
  author    = {Zhu, L.J. and Peng, L. and Zhou, W.N. and Yang, J.L.},
  title     = {Dual-decoder transformer network for answer grounding in visual question answering},
  journal   = PRL,
  year      = {2023},
  volume    = {171},
  pages     = {53-60},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT238047},
}
@article{bb243133,
  author    = {Li, Y.C. and Wang, X. and Xiao, J.B. and Ji, W. and Chua, T.S.},
  title     = {Transformer-Empowered Invariant Grounding for Video Question Answering},
  journal   = PAMI,
  year      = {2025},
  month     = {November},
  volume    = {47},
  number    = {11},
  pages     = {9510-9522},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT238048},
}
@inproceedings{bb243134,
  author    = {Li, Y.C. and Wang, X. and Xiao, J.B. and Ji, W. and Chua, T.S.},
  title     = {Invariant Grounding for Video Question Answering},
  booktitle = CVPR22,
  year      = {2022},
  pages     = {2918-2927},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT238049},
}
@inproceedings{bb243135,
  author    = {Huang, J.Y. and Jia, B.X. and Wang, Y. and Zhu, Z.Y. and Linghu, X.K. and Li, Q. and Zhu, S.C. and Huang, S.Y.},
  title     = {Unveiling the Mist over 3D Vision-Language Understanding: Object-centric Evaluation with Chain-of-Analysis},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {24570-24581},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT238050},
}
@inproceedings{bb243136,
  author    = {Chen, K. and Wu, X.Q.},
  title     = {VTQA: Visual Text Question Answering via Entity Alignment and Cross-Media Reasoning},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {27208-27217},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT238051},
}
@inproceedings{bb243137,
  author    = {Di, S.Z. and Xie, W.},
  title     = {Grounded Question-Answering in Long Egocentric Videos},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {12934-12943},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT238052},
}
@inproceedings{bb243138,
  author    = {Chen, C.Y. and Anjum, S. and Gurari, D.},
  title     = {VQA Therapy: Exploring Answer Differences by Visually Grounding Answers},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {15269-15279},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT238053},
}
@inproceedings{bb243139,
  author    = {Le, T.M. and Le, V. and Gupta, S.I. and Venkatesh, S. and Tran, T.},
  title     = {Guiding Visual Question Answering with Attention Priors},
  booktitle = WACV23,
  year      = {2023},
  pages     = {4370-4379},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT238054},
}
@inproceedings{bb243140,
  author    = {Khan, A.U. and Kuehne, H. and Gan, C. and da Vitoria Lobo, N. and Shah, M.},
  title     = {Weakly Supervised Grounding for VQA in Vision-Language Transformers},
  booktitle = ECCV22,
  year      = {2022},
  pages     = {XXXV:652-670},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT238055},
}
@inproceedings{bb243141,
  author    = {Gupta, K. and Gautam, D. and Mamidi, R.},
  title     = {cViL: Cross-Lingual Training of Vision-Language Models using Knowledge Distillation},
  booktitle = {ICPR22},
  year      = {2022},
  pages     = {1734-1741},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT238056},
}
@inproceedings{bb243142,
  author    = {Lu, X.P. and Fan, Z. and Wang, Y. and Oh, J. and Rose, C.P.},
  title     = {Localize, Group, and Select: Boosting Text-VQA by Scene Text Modeling},
  booktitle = XSAnim21,
  year      = {2021},
  pages     = {2631-2639},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT238057},
}
@inproceedings{bb243143,
  author    = {Khan, A.U. and Kuehne, H. and Duarte, K. and Gan, C. and Lobo, N. and Shah, M.},
  title     = {Found a Reason for me? Weakly-supervised Grounded Visual Question Answering using Capsules},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {8461-8470},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT238058},
}
@inproceedings{bb243144,
AUTHOR = "Selvaraju, R.R. and Tendulkar, P. and Parikh, D. and Horvitz, E. and Ribeiro, M.T. and Nushi, B. and Kamar, E.",
TITLE = "SQuINTing at VQA Models: Introspecting VQA Models With Sub-Questions",
BOOKTITLE = CVPR20,
YEAR = "2020",
PAGES = "10000-10008",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT238059"}
@inproceedings{bb243145,
  author    = {Gouthaman, K.V. and Mittal, A.},
  title     = {Reducing Language Biases in Visual Question Answering with Visually-grounded Question Encoder},
  booktitle = ECCV20,
  year      = {2020},
  pages     = {XIII:18-34},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT238060},
}
@inproceedings{bb243146,
  author    = {Tan, H.L. and Leong, M.C. and Xu, Q. and Li, L. and Fang, F. and Cheng, Y. and Gauthier, N. and Sun, Y. and Lim, J.H.},
  title     = {Task-Oriented Multi-Modal Question Answering For Collaborative Applications},
  booktitle = ICIP20,
  year      = {2020},
  pages     = {1426-1430},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT238061},
}
@inproceedings{bb243147,
  author    = {Selvaraju, R.R. and Lee, S. and Shen, Y. and Jin, H. and Ghosh, S. and Heck, L. and Batra, D. and Parikh, D.},
  title     = {Taking a HINT: Leveraging Explanations to Make Vision and Language Models More Grounded},
  booktitle = ICCV19,
  year      = {2019},
  pages     = {2591-2600},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT238062},
}
@inproceedings{bb243148,
  author    = {Zhang, Y. and Niebles, J.C. and Soto, A.},
  title     = {Interpretable Visual Question Answering by Visual Grounding From Attention Supervision Mining},
  booktitle = WACV19,
  year      = {2019},
  pages     = {349-357},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgrqa3.html#TT238063},
}
@article{bb243149,
  author    = {Li, X. and Jiang, S.},
  title     = {Bundled Object Context for Referring Expressions},
  journal   = MultMed,
  year      = {2018},
  month     = {October},
  volume    = {20},
  number    = {10},
  pages     = {2749-2760},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238064},
}
@article{bb243150,
  author    = {Wang, J.M. and Cui, E. and Liu, K.L. and Sun, Y.K. and Liang, J.Y. and Yuan, C.M. and Duan, X.J. and Jin, G.H. and Chung, T.S.},
  title     = {Referring expression comprehension model with matching detection and linguistic feedback},
  journal   = IET-CV,
  year      = {2020},
  month     = {December},
  volume    = {14},
  number    = {8},
  pages     = {625-633},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238065},
}
@article{bb243151,
  author    = {Qiao, Y.Y. and Deng, C.R. and Wu, Q.},
  title     = {Referring Expression Comprehension: A Survey of Methods and Datasets},
  journal   = MultMed,
  year      = {2021},
  volume    = {23},
  pages     = {4426-4440},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238066},
}
@article{bb243152,
  author    = {Niu, Y.L. and Zhang, H.W. and Lu, Z.W. and Chang, S.F.},
  title     = {Variational Context: Exploiting Visual and Textual Context for Grounding Referring Expressions},
  journal   = PAMI,
  year      = {2021},
  month     = {January},
  volume    = {43},
  number    = {1},
  pages     = {347-359},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238067},
}
@article{bb243153,
  author    = {Yang, S. and Li, G.B. and Yu, Y.Z.},
  title     = {Relationship-Embedded Representation Learning for Grounding Referring Expressions},
  journal   = PAMI,
  year      = {2021},
  month     = {August},
  volume    = {43},
  number    = {8},
  pages     = {2765-2779},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238068},
}
@inproceedings{bb243154,
  author    = {Yang, S. and Li, G.B. and Yu, Y.Z.},
  title     = {Cross-Modal Relationship Inference for Grounding Referring Expressions},
  booktitle = CVPR19,
  year      = {2019},
  pages     = {4140-4149},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238069},
}
@article{bb243155,
  author    = {Sun, M.J. and Xiao, J. and Lim, E.G. and Liu, S. and Goulermas, J.Y.},
  title     = {Discriminative Triad Matching and Reconstruction for Weakly Referring Expression Grounding},
  journal   = PAMI,
  year      = {2021},
  month     = {November},
  volume    = {43},
  number    = {11},
  pages     = {4189-4195},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238070},
}
@article{bb243156,
  author    = {Lin, L. and Yan, P.X. and Xu, X.Q. and Yang, S. and Zeng, K. and Li, G.B.},
  title     = {Structured Attention Network for Referring Image Segmentation},
  journal   = MultMed,
  year      = {2022},
  volume    = {24},
  pages     = {1922-1932},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238071},
}
@article{bb243157,
  author    = {Yang, X. and Wang, H. and Xie, D. and Deng, C. and Tao, D.C.},
  title     = {Object-Agnostic Transformers for Video Referring Segmentation},
  journal   = IP,
  year      = {2022},
  volume    = {31},
  pages     = {2839-2849},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238072},
}
@article{bb243158,
  author    = {Wang, X. and Xie, D. and Zheng, Y.S.},
  title     = {Referring expression grounding by multi-context reasoning},
  journal   = PRL,
  year      = {2022},
  volume    = {160},
  pages     = {66-72},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238073},
}
@article{bb243159,
  author    = {Shen, H.T. and Chen, C. and Wang, P. and Gao, L.L. and Wang, M. and Song, J.K.},
  title     = {Continual Referring Expression Comprehension via Dual Modular Memorization},
  journal   = IP,
  year      = {2022},
  volume    = {31},
  pages     = {6694-6706},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238074},
}
@article{bb243160,
  author    = {Chen, Y.W. and Tsai, Y.H. and Yang, M.H.},
  title     = {Understanding Synonymous Referring Expressions via Contrastive Features},
  journal   = IJCV,
  year      = {2022},
  month     = {October},
  volume    = {130},
  number    = {10},
  pages     = {2501-2516},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238075},
}
@article{bb243161,
  author    = {Suo, W. and Sun, M.Y. and Wang, P. and Zhang, Y.N. and Wu, Q.},
  title     = {Rethinking and Improving Feature Pyramids for One-Stage Referring Expression Comprehension},
  journal   = IP,
  year      = {2023},
  volume    = {32},
  pages     = {854-864},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238076},
}
@article{bb243162,
  author    = {Liu, X.J. and Li, L. and Wang, S.H. and Zha, Z.J. and Li, Z.C. and Tian, Q. and Huang, Q.M.},
  title     = {Entity-Enhanced Adaptive Reconstruction Network for Weakly Supervised Referring Expression Grounding},
  journal   = PAMI,
  year      = {2023},
  month     = {March},
  volume    = {45},
  number    = {3},
  pages     = {3003-3018},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238077},
}
@inproceedings{bb243163,
  author    = {Liu, X.J. and Li, L. and Wang, S.H. and Zha, Z.J. and Meng, D.C. and Huang, Q.M.},
  title     = {Adaptive Reconstruction Network for Weakly Supervised Referring Expression Grounding},
  booktitle = ICCV19,
  year      = {2019},
  pages     = {2611-2620},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238078},
}
@article{bb243164,
  author    = {Feng, G. and Zhang, L. and Sun, J.Y. and Hu, Z.W. and Lu, H.C.},
  title     = {Referring Segmentation via Encoder-Fused Cross-Modal Attention Network},
  journal   = PAMI,
  year      = {2023},
  month     = {June},
  volume    = {45},
  number    = {6},
  pages     = {7654-7667},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238079},
}
@inproceedings{bb243165,
  author    = {Feng, G. and Hu, Z.W. and Zhang, L. and Lu, H.C.},
  title     = {Encoder Fusion Network with Co-Attention Embedding for Referring Image Segmentation},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {15501-15510},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238080},
}
@article{bb243166,
  author    = {Liu, D.Z. and Zhou, P. and Xu, Z. and Wang, H.Z. and Li, R.X.},
  title     = {Few-Shot Temporal Sentence Grounding via Memory-Guided Semantic Learning},
  journal   = CirSysVideo,
  year      = {2023},
  month     = {May},
  volume    = {33},
  number    = {5},
  pages     = {2491-2505},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238081},
}
@article{bb243167,
  author    = {Sun, M.J. and Xiao, J. and Lim, E.G. and Zhao, Y.},
  title     = {Cycle-Free Weakly Referring Expression Grounding With Self-Paced Learning},
  journal   = MultMed,
  year      = {2023},
  volume    = {25},
  pages     = {1611-1621},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238082},
}
@article{bb243168,
  author    = {Sun, M.Y. and Suo, W. and Wang, P. and Zhang, Y.N. and Wu, Q.},
  title     = {A Proposal-Free One-Stage Framework for Referring Expression Comprehension and Generation via Dense Cross-Attention},
  journal   = MultMed,
  year      = {2023},
  volume    = {25},
  pages     = {2446-2458},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238083},
}
@article{bb243169,
  author    = {Sun, Y.F. and Zhang, Y. and Jiang, H. and Hu, Y.L. and Yin, B.C.},
  title     = {Multi-level attention for referring expression comprehension},
  journal   = PRL,
  year      = {2023},
  volume    = {172},
  pages     = {252-258},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238084},
}
@article{bb243170,
  author    = {Wang, R. and Tang, Z. and Zhou, Q.L. and Liu, X.Q. and Hui, T.R. and Tan, Q. and Liu, S.},
  title     = {Unified Transformer with Isomorphic Branches for Natural Language Tracking},
  journal   = CirSysVideo,
  year      = {2023},
  month     = {September},
  volume    = {33},
  number    = {9},
  pages     = {4529-4541},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238085},
}
@article{bb243171,
  author    = {Li, H. and Sun, M.J. and Xiao, J. and Lim, E.G. and Zhao, Y.},
  title     = {Fully and Weakly Supervised Referring Expression Segmentation With End-to-End Learning},
  journal   = CirSysVideo,
  year      = {2023},
  month     = {October},
  volume    = {33},
  number    = {10},
  pages     = {5999-6012},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238086},
}
@article{bb243172,
  author    = {Liu, C. and Jiang, X.D. and Ding, H.H.},
  title     = {Instance-Specific Feature Propagation for Referring Segmentation},
  journal   = MultMed,
  year      = {2023},
  volume    = {25},
  pages     = {3657-3667},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238087},
}
@article{bb243173,
  author    = {Song, Y.Z. and Chen, Y.S. and Shuai, H.H.},
  title     = {Decoupling-Cooperative Framework for Referring Expression Comprehension},
  journal   = SPLetters,
  year      = {2023},
  volume    = {30},
  pages     = {1542-1546},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238088},
}
@article{bb243174,
  author    = {Hua, G.G. and Liao, M. and Tian, S. and Zhang, Y.H. and Zou, W.B.},
  title     = {Multiple Relational Learning Network for Joint Referring Expression Comprehension and Segmentation},
  journal   = MultMed,
  year      = {2023},
  volume    = {25},
  pages     = {8805-8816},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238089},
}
@article{bb243175,
  author    = {Wang, W.B. and Pagnucco, M. and Xu, C.P. and Song, Y.},
  title     = {InterREC: An Interpretable Method for Referring Expression Comprehension},
  journal   = MultMed,
  year      = {2023},
  volume    = {25},
  pages     = {9330-9342},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238090},
}
@article{bb243176,
  author    = {Ke, J.C. and Wang, J. and Chen, J.C. and Jhuo, I.H. and Lin, C.W. and Lin, Y.Y.},
  title     = {CLIPREC: Graph-Based Domain Adaptive Network for Zero-Shot Referring Expression Comprehension},
  journal   = MultMed,
  year      = {2024},
  volume    = {26},
  pages     = {2480-2492},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238091},
}
@article{bb243177,
  author    = {Ke, J.C. and Wang, J. and Wong, W.K. and Toomey, A. and Wen, J.},
  title     = {Graph-Based Group Division Network for Referring Expression Comprehension},
  journal   = CirSysVideo,
  year      = {2025},
  month     = {June},
  volume    = {35},
  number    = {6},
  pages     = {6170-6183},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238092},
}
@article{bb243178,
  author    = {Li, X.C. and Fan, B.Y. and Zhang, R.Z. and Zhao, K. and Guo, Z.H. and Zhao, Y.Q. and Li, R.},
  title     = {Inexactly Matched Referring Expression Comprehension With Rationale},
  journal   = MultMed,
  year      = {2024},
  volume    = {26},
  pages     = {3937-3950},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238093},
}
@article{bb243179,
  author    = {Luo, G. and Zhou, Y.Y. and Sun, J. and Sun, X.S. and Ji, R.R.},
  title     = {A Survivor in the Era of Large-Scale Pretraining: An Empirical Study of One-Stage Referring Expression Comprehension},
  journal   = MultMed,
  year      = {2024},
  volume    = {26},
  pages     = {3689-3700},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238094},
}
@article{bb243180,
  author    = {Miao, P.H. and Su, W. and Wang, G.A. and Li, X.W. and Xi, L.},
  title     = {Self-Paced Multi-Grained Cross-Modal Interaction Modeling for Referring Expression Comprehension},
  journal   = IP,
  year      = {2024},
  volume    = {33},
  pages     = {1497-1507},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238095},
}
@article{bb243181,
  author    = {Liu, Z.T. and Xu, T.Y. and Song, X.N. and Wu, X.J.},
  title     = {Unified Referring Expression Generation for Bounding Boxes and Segmentations},
  journal   = SPLetters,
  year      = {2024},
  volume    = {31},
  pages     = {636-640},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238096},
}
@article{bb243182,
  author    = {Zhang, Y.J. and Li, Q.Z. and Pan, Y. and Zhao, X.G. and Tan, M.},
  title     = {Multi-Stage Image-Language Cross-Generative Fusion Network for Video-Based Referring Expression Comprehension},
  journal   = IP,
  year      = {2024},
  volume    = {33},
  pages     = {3256-3270},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238097},
}
@article{bb243183,
  author    = {Lu, M.C. and Li, R.F. and Feng, F.X. and Ma, Z.Y. and Wang, X.J.},
  title     = {LGR-NET: Language Guided Reasoning Network for Referring Expression Comprehension},
  journal   = CirSysVideo,
  year      = {2024},
  month     = {August},
  volume    = {34},
  number    = {8},
  pages     = {7771-7784},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238098},
}
@article{bb243184,
  author    = {Yao, H.B. and Wang, L.P. and Cai, C.T. and Wang, W. and Zhang, Z. and Shang, X.B.},
  title     = {Language conditioned multi-scale visual attention networks for visual grounding},
  journal   = IVC,
  year      = {2024},
  volume    = {150},
  pages     = {105242},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238099},
}
@article{bb243185,
  author    = {Ji, Z. and Wu, J. and Wang, Y.D. and Yang, A.P. and Han, J.G.},
  title     = {Progressive Semantic Reconstruction Network for Weakly Supervised Referring Expression Grounding},
  journal   = CirSysVideo,
  year      = {2024},
  month     = {December},
  volume    = {34},
  number    = {12},
  pages     = {13058-13070},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238100},
}
@article{bb243186,
  author    = {Qiu, H.Q. and Wang, L.X. and Zhao, T. and Meng, F.M. and Wu, Q.B. and Li, H.L.},
  title     = {MCCE-REC: MLLM-Driven Cross-Modal Contrastive Entropy Model for Zero-Shot Referring Expression Comprehension},
  journal   = CirSysVideo,
  year      = {2025},
  month     = {January},
  volume    = {35},
  number    = {1},
  pages     = {754-768},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238101},
}
@article{bb243187,
  author    = {Ke, J.C. and Zhang, Q. and Wang, J. and Ding, H.Q. and Zhang, P.F. and Wen, J.},
  title     = {Graph-based referring expression comprehension with expression-guided selective filtering and noun-oriented reasoning},
  journal   = PR,
  year      = {2025},
  volume    = {161},
  pages     = {111222},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238102},
}
@article{bb243188,
  author    = {Ke, J.C. and Wang, D. and Chen, J.C. and Jhuo, I.H. and Lin, C.W. and Lin, Y.Y.},
  title     = {Make Graph-Based Referring Expression Comprehension Great Again Through Expression-Guided Dynamic Gating and Regression},
  journal   = MultMed,
  year      = {2025},
  volume    = {27},
  pages     = {1950-1961},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238103},
}
@article{bb243189,
  author    = {Huang, S.J. and Li, F. and Zhang, H. and Liu, S.L. and Zhang, L. and Wang, L.W.},
  title     = {A Mutual Supervision Framework for Referring Expression Segmentation and Generation},
  journal   = IJCV,
  year      = {2025},
  month     = {June},
  volume    = {133},
  number    = {6},
  pages     = {3597-3612},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238104},
}
@article{bb243190,
  author    = {Ke, X. and Xu, P.R. and Guo, W.Z.},
  title     = {Language-Image Consistency Augmentation and Distillation Network for visual grounding},
  journal   = PR,
  year      = {2025},
  volume    = {166},
  pages     = {111663},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238105},
}
@article{bb243191,
  author    = {Yang, X.Z. and Liu, J.Z. and Wang, P. and Wang, G.Q. and Yang, Y. and Shen, H.T.},
  title     = {New Dataset and Methods for Fine-Grained Compositional Referring Expression Comprehension via Specialist-MLLM Collaboration},
  journal   = PAMI,
  year      = {2025},
  month     = {October},
  volume    = {47},
  number    = {10},
  pages     = {8598-8612},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238106},
}
@article{bb243192,
  author    = {Guo, H. and Fan, W. and Wei, B. and Zhu, J.F. and Tian, J. and Yi, C.Z. and Jiang, F.},
  title     = {AD-DINO: Attention-Dynamic DINO for Distance-Aware Embodied Reference Understanding},
  journal   = CirSysVideo,
  year      = {2025},
  month     = {October},
  volume    = {35},
  number    = {10},
  pages     = {10238-10249},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238107},
}
@article{bb243193,
  author    = {Ke, J.C. and Wen, J. and Wang, H.T. and Cheng, W.H. and Wang, J.},
  title     = {Multi-Perspective Cross-Modal Object Encoding for Referring Expression Comprehension},
  journal   = IP,
  year      = {2025},
  volume    = {34},
  pages     = {6911-6924},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238108},
}
@article{bb243194,
  author    = {Li, J. and Wen, Z. and Zhang, Y. and Wang, W.X. and Cai, Y.X. and Zhang, T.X. and He, X.J. and Liu, J.},
  title     = {Generalized referring expression segmentation driven by instance-oriented queries},
  journal   = PR,
  year      = {2026},
  volume    = {172},
  pages     = {112524},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238109},
}
@article{bb243195,
  author    = {Liu, X.Y. and Liu, T. and Huang, S. and Xin, Y. and Hu, Y. and Qin, L. and Wang, D.L. and Wu, Y.Y. and Chen, H.G.},
  title     = {M2IST: Multi-Modal Interactive Side-Tuning for Efficient Referring Expression Comprehension},
  journal   = CirSysVideo,
  year      = {2026},
  month     = {February},
  volume    = {36},
  number    = {2},
  pages     = {1341-1354},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238110},
}
@article{bb243196,
  author    = {Li, R.F. and Lu, M.C. and Lin, P.Y. and Yu, Z.H. and Ma, Z.Y.},
  title     = {Improving Scene Knowledge Referring Expression Comprehension With Large Language Models},
  journal   = MultMedMag,
  year      = {2026},
  month     = {January},
  volume    = {33},
  number    = {1},
  pages     = {72-80},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238111},
}
@inproceedings{bb243197,
  author    = {Chen, J. and Wei, F.Y. and Zhao, J.J. and Song, S. and Wu, B.H. and Peng, Z.X. and Chan, S.H.G. and Zhang, H.Y.},
  title     = {Revisiting Referring Expression Comprehension Evaluation in the Era of Large Multimodal Models},
  booktitle = {AIBench25},
  year      = {2025},
  pages     = {513-524},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238112},
}
@inproceedings{bb243198,
  author    = {Wang, Z.C. and Pan, Z.Y. and Peng, Z. and Cheng, J. and Xiao, L.W. and Jiang, W. and Cao, Z.G.},
  title     = {Exploring Contextual Attribute Density in Referring Expression Counting},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {19587-19596},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238113},
}
@inproceedings{bb243199,
  author    = {Chen, X. and Luo, Y.X. and Luo, G. and Ji, J.Y. and Ding, H.H. and Zhou, Y.},
  title     = {DViN: Dynamic Visual Routing Network for Weakly Supervised Referring Expression Comprehension},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {14347-14357},
  bibsource = {http://www.visionbib.com/bibliography/applicat803refex3.html#TT238114},
}
Last update: May 24, 2026 at 14:46:09