@article{bb240600,
AUTHOR = "Zhang, Q. S. and Wu, Y. N. and Zhang, H. and Zhu, S. C.",
TITLE = "Mining deep {And-Or} object structures via cost-sensitive
question-answer-based active annotations",
JOURNAL = CVIU,
VOLUME = "176-177",
YEAR = "2018",
PAGES = "33--44",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235516"}
@article{bb240601,
AUTHOR = "Zhang, Q. S. and Ren, J. and Huang, G. and Cao, R. M. and Wu, Y. N. and Zhu, S. C.",
TITLE = "Mining Interpretable {AOG} Representations From Convolutional Networks
via Active Question Answering",
JOURNAL = PAMI,
VOLUME = "43",
YEAR = "2021",
NUMBER = "11",
MONTH = nov,
PAGES = "3949--3963",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235517"}
@inproceedings{bb240602,
AUTHOR = "Zhang, Q. S. and Cao, R. M. and Wu, Y. N. and Zhu, S. C.",
TITLE = "Mining Object Parts from {CNNs} via Active Question-Answering",
BOOKTITLE = CVPR17,
YEAR = "2017",
PAGES = "3890--3899",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235518"}
@inproceedings{bb240603,
AUTHOR = "Zhang, Q. S. and Wu, Y. N. and Zhu, S. C.",
TITLE = "Mining {And-Or} Graphs for Graph Matching and Object Discovery",
BOOKTITLE = ICCV15,
YEAR = "2015",
PAGES = "55--63",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235519"}
@article{bb240604,
AUTHOR = "Cao, Q. X. and Liang, X. D. and Li, B. L. and Lin, L.",
TITLE = "Interpretable Visual Question Answering by Reasoning on Dependency
Trees",
JOURNAL = PAMI,
VOLUME = "43",
YEAR = "2021",
NUMBER = "3",
MONTH = mar,
PAGES = "887--901",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235520"}
@inproceedings{bb240605,
AUTHOR = "Cao, Q. X. and Liang, X. D. and Li, B. L. and Li, G. and Lin, L.",
TITLE = "Visual Question Reasoning on General Dependency Tree",
BOOKTITLE = CVPR18,
YEAR = "2018",
PAGES = "7249--7257",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235521"}
@article{bb240606,
AUTHOR = "Zhong, H. S. and Chen, J. Y. and Shen, C. and Zhang, H. W. and Huang, J. Q. and Hua, X. S.",
TITLE = "Self-Adaptive Neural Module Transformer for Visual Question Answering",
JOURNAL = MultMed,
VOLUME = "23",
YEAR = "2021",
PAGES = "1264--1273",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235522"}
@article{bb240607,
AUTHOR = "Zheng, W. F. and Yin, L. R. and Chen, X. B. and Ma, Z. Y. and Liu, S. and Yang, B.",
TITLE = "Knowledge base graph embedding module design for Visual question
answering model",
JOURNAL = PR,
VOLUME = "120",
YEAR = "2021",
PAGES = "108153",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235523"}
@article{bb240608,
AUTHOR = "Sharma, H. and Jalal, A. S.",
TITLE = "Visual question answering model based on graph neural network and
contextual attention",
JOURNAL = IVC,
VOLUME = "110",
YEAR = "2021",
PAGES = "104165",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235524"}
@article{bb240609,
AUTHOR = "Song, L. Y. and Li, J. and Liu, J. and Yang, Y. and Shang, X. Q. and Sun, M. X.",
TITLE = "Answering Knowledge-Based Visual Questions via the Exploration of
Question Purpose",
JOURNAL = PR,
VOLUME = "133",
YEAR = "2023",
PAGES = "109015",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235525"}
@article{bb240610,
AUTHOR = {MeshuWelde, T. and Liao, L.},
TITLE = {Counting-based visual question answering with serial cascaded attention deep learning},
JOURNAL = PR,
VOLUME = {144},
YEAR = {2023},
PAGES = {109850},
BIBSOURCE = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235526}}
@article{bb240611,
AUTHOR = "Liu, Y. and Li, G. B. and Lin, L.",
TITLE = "Cross-Modal Causal Relational Reasoning for Event-Level Visual
Question Answering",
JOURNAL = PAMI,
VOLUME = "45",
YEAR = "2023",
NUMBER = "10",
MONTH = oct,
PAGES = "11624--11641",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235527"}
@inproceedings{bb240612,
AUTHOR = "Cao, Q. X. and Wan, W. T. and Wang, K. and Liang, X. D. and Lin, L.",
TITLE = "Linguistically Routing Capsule Network for Out-of-distribution Visual
Question Answering",
BOOKTITLE = ICCV21,
YEAR = "2021",
PAGES = "1594--1603",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235528"}
@article{bb240613,
AUTHOR = "Yang, S. W. and Xiao, L. and Wu, X. J. and Xu, J. J. and Wang, L. L. and He, L.",
TITLE = "Simple contrastive learning in a self-supervised manner for robust
visual question answering",
JOURNAL = CVIU,
VOLUME = "241",
YEAR = "2024",
PAGES = "103976",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235529"}
@article{bb240614,
AUTHOR = "Wu, Y. L. and Pan, X. and Li, J. H. and Dou, S. and Wang, X. X.",
TITLE = "Interpretable answer retrieval based on heterogeneous network
embedding",
JOURNAL = PRL,
VOLUME = "182",
YEAR = "2024",
PAGES = "9--16",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235530"}
@article{bb240615,
AUTHOR = "Luo, L. and Lai, H. J. and Pan, Y. and Yin, J.",
TITLE = "Efficient Multimodal Selection for Retrieval in Knowledge-Based
Visual Question Answering",
JOURNAL = CirSysVideo,
VOLUME = "35",
YEAR = "2025",
NUMBER = "6",
MONTH = jun,
PAGES = "5195--5207",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235531"}
@inproceedings{bb240616,
AUTHOR = "Zhang, Y. and Chen, H. and Frikha, A. and Krompass, D. and Zhang, G. and Gu, J. D. and Tresp, V.",
TITLE = "{CL-Cross} {VQA}: A Continual Learning Benchmark for Cross-Domain Visual
Question Answering",
BOOKTITLE = WACV25,
YEAR = "2025",
PAGES = "6269--6278",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235532"}
@inproceedings{bb240617,
AUTHOR = "Marcu, A. M. and Chen, L. and Hunermann, J. and Karnsund, A. and Hanotte, B. and Chidananda, P. and Nair, S. and Badrinarayanan, V. and Kendall, A. and Shotton, J. and Arani, E. and Sinavski, O.",
TITLE = "{LingoQA}: Visual Question Answering for Autonomous Driving",
BOOKTITLE = ECCV24,
YEAR = "2024",
VOLUME = "LXXVII",
PAGES = "252--269",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235533"}
@inproceedings{bb240618,
AUTHOR = "Sima, C. H. and Renz, K. and Chitta, K. and Chen, L. and Zhang, H. and Xie, C. G. and Bei{\ss}wenger, J. and Luo, P. and Geiger, A. and Li, H. Y.",
TITLE = "{DriveLM}: Driving with Graph Visual Question Answering",
BOOKTITLE = ECCV24,
YEAR = "2024",
VOLUME = "LII",
PAGES = "256--274",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235534"}
@inproceedings{bb240619,
AUTHOR = "Feng, C. and Danier, D. and Zhang, F. and Bull, D.",
TITLE = "{RankDVQA}: Deep {VQA} based on Ranking-inspired Hybrid Training",
BOOKTITLE = WACV24,
YEAR = "2024",
PAGES = "1637--1647",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235535"}
@inproceedings{bb240620,
AUTHOR = "Ishay, A. and Yang, Z. and Lee, J. and Kang, I. and Lim, D. J.",
TITLE = "Think before You Simulate: Symbolic Reasoning to Orchestrate Neural
Computation for Counterfactual Question Answering",
BOOKTITLE = WACV24,
YEAR = "2024",
PAGES = "6684--6693",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235536"}
@inproceedings{bb240621,
AUTHOR = "Wang, Y. and Yasunaga, M. and Ren, H. Y. and Wada, S. and Leskovec, J.",
TITLE = "{VQA-GNN}: Reasoning with Multimodal Knowledge via Graph Neural
Networks for Visual Question Answering",
BOOKTITLE = ICCV23,
YEAR = "2023",
PAGES = "21525--21535",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235537"}
@inproceedings{bb240622,
AUTHOR = "Souza, B. and Aasan, M. and Pedrini, H. and Rivera, A. R.",
TITLE = "{SelfGraphVQA}: A Self-Supervised Graph Neural Network for Scene-based
Question Answering",
BOOKTITLE = VLAR23,
YEAR = "2023",
PAGES = "4642--4647",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235538"}
% NOTE(review): BOOKTITLE below is a quoted literal, whereas sibling entries
% reference venue string macros (e.g. ICPR21). Left unchanged because no ICPR22
% macro definition is visible here -- confirm and switch to the macro if it exists.
@inproceedings{bb240623,
AUTHOR = "Haisa, G. and Altenbek, G.",
TITLE = "Question Classification Based on Weak Supervision and Interrogative
Pronouns Attention Mechanism",
BOOKTITLE = "ICPR22",
YEAR = "2022",
PAGES = "2273--2278",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235539"}
@inproceedings{bb240624,
AUTHOR = "Nguyen, B. X. and Do, T. and Tran, H. and Tjiputra, E. and Tran, Q. D. and Nguyen, A.",
TITLE = "Coarse-to-Fine Reasoning for Visual Question Answering",
BOOKTITLE = MULA22,
YEAR = "2022",
PAGES = "4557--4565",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235540"}
@inproceedings{bb240625,
AUTHOR = "Liang, Y. Y. and Wang, X. and Duan, X. G. and Zhu, W. W.",
TITLE = "Multi-modal Contextual Graph Neural Network for Text Visual Question
Answering",
BOOKTITLE = ICPR21,
YEAR = "2021",
PAGES = "3491--3498",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235541"}
@inproceedings{bb240626,
AUTHOR = "Patro, B. N. and Kurmi, V. K. and Kumar, S. and Namboodiri, V. P.",
TITLE = "Deep {Bayesian} Network for Visual Question Generation",
BOOKTITLE = WACV20,
YEAR = "2020",
PAGES = "1555--1565",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235542"}
@inproceedings{bb240627,
AUTHOR = "Singh, A. K. and Mishra, A. and Shekhar, S. and Chakraborty, A.",
TITLE = "From Strings to Things: Knowledge-Enabled {VQA} Model That Can Read and
Reason",
BOOKTITLE = ICCV19,
YEAR = "2019",
PAGES = "4601--4611",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235543"}
@inproceedings{bb240628,
AUTHOR = "Wilf, A. and Ma, M. Q. and Liang, P. P. and Zadeh, A. and Morency, L. P.",
TITLE = "Face-to-Face Contrastive Learning for Social Intelligence
Question-Answering",
BOOKTITLE = FG23,
YEAR = "2023",
PAGES = "1--7",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235544"}
@inproceedings{bb240629,
AUTHOR = "Zadeh, A. and Chan, M. and Liang, P. P. and Tong, E. and Morency, L. P.",
TITLE = "{Social-IQ}: A Question Answering Benchmark for Artificial Social
Intelligence",
BOOKTITLE = CVPR19,
YEAR = "2019",
PAGES = "8799--8809",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235545"}
@inproceedings{bb240630,
AUTHOR = "Ma, C. and Shen, C. and Dick, A. and Wu, Q. and Wang, P. and van den Hengel, A. J. and Reid, I. D.",
TITLE = "Visual Question Answering with Memory-Augmented Networks",
BOOKTITLE = CVPR18,
YEAR = "2018",
PAGES = "6975--6984",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235546"}
@inproceedings{bb240631,
AUTHOR = "Shin, A. and Ushiku, Y. and Harada, T.",
TITLE = "Customized Image Narrative Generation via Interactive Visual Question
Generation and Answering",
BOOKTITLE = CVPR18,
YEAR = "2018",
PAGES = "8925--8933",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235547"}
@inproceedings{bb240632,
AUTHOR = "Teney, D. and Anderson, P. and He, X. and van den Hengel, A. J.",
TITLE = "Tips and Tricks for Visual Question Answering:
Learnings from the 2017 Challenge",
BOOKTITLE = CVPR18,
YEAR = "2018",
PAGES = "4223--4232",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235548"}
@inproceedings{bb240633,
AUTHOR = "Bai, Y. L. and Fu, J. L. and Zhao, T. J. and Mei, T.",
TITLE = "Deep Attention Neural Tensor Network for Visual Question Answering",
BOOKTITLE = ECCV18,
YEAR = "2018",
VOLUME = "XII",
PAGES = "21--37",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235549"}
@inproceedings{bb240634,
AUTHOR = "Sinha, A. and Ayush, K.",
TITLE = "Towards Mathematical Reasoning: A Multimodal Deep Learning Approach",
BOOKTITLE = ICIP18,
YEAR = "2018",
PAGES = "4028--4032",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235550"}
@inproceedings{bb240635,
AUTHOR = "Rosso Mateus, A. and Gonzalez, F. A. and {Montes y Gomez}, M.",
TITLE = "A Two-Step Neural Network Approach to Passage Retrieval for Open Domain
Question Answering",
BOOKTITLE = CIARP17,
YEAR = "2017",
PAGES = "566--574",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235551"}
@inproceedings{bb240636,
AUTHOR = "Zhu, C. and Zhao, Y. and Huang, S. and Tu, K. and Ma, Y.",
TITLE = "Structured Attentions for Visual Question Answering",
BOOKTITLE = ICCV17,
YEAR = "2017",
PAGES = "1300--1309",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235552"}
@inproceedings{bb240637,
AUTHOR = "Hu, R. and Andreas, J. and Rohrbach, M. and Darrell, T. J. and Saenko, K.",
TITLE = "Learning to Reason:
End-to-End Module Networks for Visual Question Answering",
BOOKTITLE = ICCV17,
YEAR = "2017",
PAGES = "804--813",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235553"}
@inproceedings{bb240638,
AUTHOR = "Peris, A. and Casacuberta, F.",
TITLE = "Interactive-Predictive Neural Multimodal Systems",
BOOKTITLE = IbPRIA19,
YEAR = "2019",
VOLUME = "I",
PAGES = "16--28",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235554"}
@inproceedings{bb240639,
AUTHOR = "Bolanos, M. and Peris, A. and Casacuberta, F. and Radeva, P.",
TITLE = "{VIBIKNet}: Visual Bidirectional Kernelized Network for Visual Question
Answering",
BOOKTITLE = IbPRIA17,
YEAR = "2017",
PAGES = "372--380",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235555"}
@inproceedings{bb240640,
AUTHOR = "Kafle, K. and Kanan, C.",
TITLE = "An Analysis of Visual Question Answering Algorithms",
BOOKTITLE = ICCV17,
YEAR = "2017",
PAGES = "1983--1991",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235556"}
@inproceedings{bb240641,
AUTHOR = "Kafle, K. and Kanan, C.",
TITLE = "Answer-Type Prediction for Visual Question Answering",
BOOKTITLE = CVPR16,
YEAR = "2016",
PAGES = "4976--4984",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235557"}
@inproceedings{bb240642,
AUTHOR = "Wang, P. and Wu, Q. and Shen, C. and van den Hengel, A. J.",
TITLE = "The {VQA-Machine}: Learning How to Use Existing Vision Algorithms to
Answer New Questions",
BOOKTITLE = CVPR17,
YEAR = "2017",
PAGES = "3909--3918",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235558"}
@inproceedings{bb240643,
AUTHOR = "Yu, D. and Fu, J. and Mei, T. and Rui, Y.",
TITLE = "Multi-level Attention Networks for Visual Question Answering",
BOOKTITLE = CVPR17,
YEAR = "2017",
PAGES = "4187--4195",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235559"}
@inproceedings{bb240644,
AUTHOR = "Ramakrishnan, S. K. and Pal, A. and Sharma, G. and Mittal, A.",
TITLE = "An Empirical Evaluation of Visual Question Answering for Novel
Objects",
BOOKTITLE = CVPR17,
YEAR = "2017",
PAGES = "7312--7321",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqann4.html#TT235560"}
@article{bb240645,
AUTHOR = "Gouthaman, K. V. and Nambiar, A. and Srinivas, K. S. and Mittal, A.",
TITLE = "Linguistically-aware attention for reducing the semantic gap in
vision-language tasks",
JOURNAL = PR,
VOLUME = "112",
YEAR = "2021",
PAGES = "107812",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235561"}
@article{bb240646,
AUTHOR = "Zhou, K. Y. and Yang, J. K. and Loy, C. C. and Liu, Z. W.",
TITLE = "Learning to Prompt for Vision-Language Models",
JOURNAL = IJCV,
VOLUME = "130",
YEAR = "2022",
NUMBER = "9",
MONTH = sep,
PAGES = "2337--2348",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235562"}
@inproceedings{bb240647,
AUTHOR = "Zhou, K. Y. and Yang, J. K. and Loy, C. C. and Liu, Z. W.",
TITLE = "Conditional Prompt Learning for Vision-Language Models",
BOOKTITLE = CVPR22,
YEAR = "2022",
PAGES = "16795--16804",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235563"}
@article{bb240648,
AUTHOR = "Ma, C. C. and Liu, Y. and Deng, J. K. and Xie, L. X. and Dong, W. M. and Xu, C. S.",
TITLE = "Understanding and Mitigating Overfitting in Prompt Tuning for
Vision-Language Models",
JOURNAL = CirSysVideo,
VOLUME = "33",
YEAR = "2023",
NUMBER = "9",
MONTH = sep,
PAGES = "4616--4629",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235564"}
@article{bb240649,
AUTHOR = "Chen, C. Q. and Han, D. Z. and Chang, C. C.",
TITLE = "{MPCCT}: Multimodal vision-language learning paradigm with
context-based compact Transformer",
JOURNAL = PR,
VOLUME = "147",
YEAR = "2024",
PAGES = "110084",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235565"}
@article{bb240650,
AUTHOR = "Yu, Z. T. and Zhao, J. and Guo, C. L. and Yang, Y.",
TITLE = "{StableNet}: Distinguishing the hard samples to overcome language
priors in visual question answering",
JOURNAL = IET-CV,
VOLUME = "18",
YEAR = "2024",
NUMBER = "2",
PAGES = "315--327",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235566"}
@article{bb240651,
AUTHOR = "Bazi, Y. and Bashmal, L. and Rahhal, M. M. A. and Ricci, R. and Melgani, F.",
TITLE = "{RS-LLaVA}: A Large Vision-Language Model for Joint Captioning and
Question Answering in Remote Sensing Imagery",
JOURNAL = RS,
VOLUME = "16",
YEAR = "2024",
NUMBER = "9",
PAGES = "1477",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235567"}
@article{bb240652,
AUTHOR = "Tan, Y. T. and Chen, Y. Y. and Wang, J. Q.",
TITLE = "{DSTA}: Reinforcing Vision-Language Understanding for Scene-Text {VQA}
With Dual-Stream Training Approach",
JOURNAL = SPLetters,
VOLUME = "32",
YEAR = "2025",
PAGES = "6--10",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235568"}
@article{bb240653,
AUTHOR = "Alsabbagh, A. R. and Mansour, T. and Al Kharabsheh, M. and Ebdah, A. S. and Al Emaryeen, R. and Al Nahhas, S. and Mahafza, W. and Al Kadi, O.",
TITLE = "{MiniMedGPT}: Efficient Large Vision-Language Model for medical Visual
Question Answering",
JOURNAL = PRL,
VOLUME = "189",
YEAR = "2025",
PAGES = "8--16",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235569"}
@article{bb240654,
AUTHOR = "Wang, X. and Wu, J. L. and Lin, Z. and Zhang, F. Z. and Zhang, D. and Nie, L. Q.",
TITLE = "Video {DataFlywheel}: Resolving the Impossible Data Trinity in
Video-Language Understanding",
JOURNAL = PAMI,
VOLUME = "47",
YEAR = "2025",
NUMBER = "4",
MONTH = apr,
PAGES = "2912--2923",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235570"}
@article{bb240655,
AUTHOR = "Shen, R. and Inoue, N. and Guan, D. and Cai, R. and Kot, A. C. and Shinoda, K.",
TITLE = "{ContextualCoder}: Adaptive In-Context Prompting for Programmatic
Visual Question Answering",
JOURNAL = MultMed,
VOLUME = "27",
YEAR = "2025",
PAGES = "4936--4949",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235571"}
@inproceedings{bb240656,
AUTHOR = "Shen, R. and Inoue, N. and Shinoda, K.",
TITLE = "Pyramid Coder: Hierarchical Code Generator for Compositional Visual
Question Answering",
BOOKTITLE = ICIP24,
YEAR = "2024",
PAGES = "430--436",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235572"}
@article{bb240657,
AUTHOR = "Zhang, Y. J. and Liu, H. L. and Kim, Y. and Hong, S.",
TITLE = "{CAT-TPT}: Class-Agnostic Text-based Test-time Prompt Tuning for
Vision-Language Models",
JOURNAL = IJCV,
VOLUME = "133",
YEAR = "2025",
NUMBER = "10",
MONTH = oct,
PAGES = "6930--6952",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235573"}
@article{bb240658,
AUTHOR = "Liu, F. Y. and Hu, Z. J. and Yang, P. and Liu, X. Y.",
TITLE = "Iterative Caption Generation with Heuristic Guidance for enhancing
knowledge-based visual question answering",
JOURNAL = CVIU,
VOLUME = "261",
YEAR = "2025",
PAGES = "104515",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235574"}
@article{bb240659,
AUTHOR = "Jiang, P. C. and Ibrayim, M. and Shen, S.",
TITLE = "{TCaEx}: Targeted Caption as External Knowledge for knowledge-based
visual question answering",
JOURNAL = IVC,
VOLUME = "163",
YEAR = "2025",
PAGES = "105772",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235575"}
@inproceedings{bb240660,
AUTHOR = "Panagopoulou, A. and Zhou, H. L. and Savarese, S. and Xiong, C. M. and Callison Burch, C. and Yatskar, M. and Niebles, J. C.",
TITLE = "{ViUniT}: Visual Unit Tests for More Robust Visual Programming",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "24646--24656",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235576"}
@inproceedings{bb240661,
AUTHOR = "Wang, W. Z. and Duan, C. and Peng, Z. H. and Liu, Y. X. and Zhou, B.",
TITLE = "Embodied Scene Understanding for Vision Language Models via {MetaVQA}",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "22453--22464",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235577"}
@inproceedings{bb240662,
AUTHOR = "Tian, X. Y. and Zou, S. and Yang, Z. Y. and Zhang, J.",
TITLE = "Identifying and Mitigating Position Bias of Multi-image
Vision-Language Models",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "10599--10609",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235578"}
@inproceedings{bb240663,
AUTHOR = "Sheng, L. J. and Liang, J. and Wang, Z. and He, R.",
TITLE = "{R-TPT}: Improving Adversarial Robustness of Vision-Language Models
through Test-Time Prompt Tuning",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "29958--29967",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235579"}
@inproceedings{bb240664,
AUTHOR = "Das, D. and Talon, D. and Mancini, M. and Wang, Y. M. and Ricci, E.",
TITLE = "One {VLM} to Keep it Learning: Generation and Balancing for Data-free
Continual Visual Question Answering",
BOOKTITLE = WACV25,
YEAR = "2025",
PAGES = "5635--5645",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235580"}
@inproceedings{bb240665,
AUTHOR = "Ishmam, M. F. and Tashdeed, I. and Saadat, T. A. and Ashmafee, M. H. and Kamal, A. R. M. and Hossain, M. A.",
TITLE = "Visual Robustness Benchmark for Visual Question Answering ({VQA})",
BOOKTITLE = WACV25,
YEAR = "2025",
PAGES = "6623--6633",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235581"}
@inproceedings{bb240666,
AUTHOR = "Chen, X. and Djolonga, J. and Padlewski, P. and Mustafa, B. and Changpinyo, S. and Wu, J. L. and Ruiz, C. R. and Goodman, S. and Wang, X. and Tay, Y. and Shakeri, S. and Dehghani, M. and Salz, D. and Lucic, M. and Tschannen, M. and Nagrani, A. and Hu, H. and Joshi, M. and Pang, B. and Montgomery, C. and Pietrzyk, P. and Ritter, M. and Piergiovanni, A. and Minderer, M. and Pavetic, F. and Waters, A. and Li, G. and Alabdulmohsin, I. and Beyer, L. and Amelot, J. and Lee, K. and Steiner, A. P. and Li, Y. and Keysers, D. and Arnab, A. and Xu, Y. Z. and Rong, K. and Kolesnikov, A. and Seyedhosseini, M. and Angelova, A. and Zhai, X. H. and Houlsby, N. and Soricut, R.",
TITLE = "On Scaling Up a Multilingual Vision and Language Model",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "14432--14444",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235582"}
@inproceedings{bb240667,
AUTHOR = "Li, R. J. and Wu, Y. and He, X. M.",
TITLE = "Learning by Correction: Efficient Tuning Task for Zero-Shot
Generative Vision-Language Reasoning",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "13428--13437",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235583"}
@inproceedings{bb240668,
AUTHOR = "Khan, Z. and Fu, Y.",
TITLE = "Consistency and Uncertainty: Identifying Unreliable Responses From
Black-Box Vision-Language Models for Selective Visual Question
Answering",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "10854--10863",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235584"}
@inproceedings{bb240669,
AUTHOR = "Gu, T. C. and Yang, K. C. and Liu, D. and Cai, W. D.",
TITLE = "{LaPA}: Latent Prompt Assist Model for Medical Visual Question
Answering",
BOOKTITLE = DEF-AI-MIA24,
YEAR = "2024",
PAGES = "4971--4980",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235585"}
@inproceedings{bb240670,
AUTHOR = "Feinglass, J. and Yang, Y. Z.",
TITLE = "Towards Addressing the Misalignment of Object Proposal Evaluation for
Vision-Language Tasks via Semantic Grounding",
BOOKTITLE = WACV24,
YEAR = "2024",
PAGES = "4385--4395",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235586"}
@inproceedings{bb240671,
AUTHOR = "Nadeem, A. and Hilton, A. and Dawes, R. and Thomas, G. and Mustafa, A.",
TITLE = "{CAD}: Contextual Multi-modal Alignment for Dynamic {AVQA}",
BOOKTITLE = WACV24,
YEAR = "2024",
PAGES = "7236--7248",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235587"}
@inproceedings{bb240672,
AUTHOR = "Wu, W. and Li, Q. and Zhong, W. L. and Huang, J. Z.",
TITLE = "{MIVC}: Multiple Instance Visual Component for Visual-Language Models",
BOOKTITLE = WACV24,
YEAR = "2024",
PAGES = "8102--8111",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235588"}
@inproceedings{bb240673,
AUTHOR = "Walmer, M. and Sikka, K. and Sur, I. and Shrivastava, A. and Jha, S.",
TITLE = "Dual-Key Multimodal Backdoors for Visual Question Answering",
BOOKTITLE = CVPR22,
YEAR = "2022",
PAGES = "15354--15364",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235589"}
@inproceedings{bb240674,
AUTHOR = "Ding, Y. and Yu, J. and Liu, B. and Hu, Y. and Cui, M. X. and Wu, Q.",
TITLE = "{MuKEA}: Multimodal Knowledge Extraction and Accumulation for
Knowledge-based Visual Question Answering",
BOOKTITLE = CVPR22,
YEAR = "2022",
PAGES = "5079--5088",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235590"}
@inproceedings{bb240675,
AUTHOR = "Gao, F. and Ping, Q. and Thattai, G. and Reganti, A. and Wu, Y. N. and Natarajan, P.",
TITLE = "Transform-Retrieve-Generate: Natural Language-Centric
Outside-Knowledge Visual Question Answering",
BOOKTITLE = CVPR22,
YEAR = "2022",
PAGES = "5057--5067",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235591"}
@inproceedings{bb240676,
AUTHOR = "Aflalo, E. and Du, M. and Tseng, S. Y. and Liu, Y. F. and Wu, C. and Duan, N. and Lal, V.",
TITLE = "{VL-InterpreT}: An Interactive Visualization Tool for Interpreting
Vision-Language Transformers",
BOOKTITLE = CVPR22,
YEAR = "2022",
PAGES = "21374--21383",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235592"}
@inproceedings{bb240677,
AUTHOR = "Jain, V. and Lodhavia, J.",
TITLE = "Automatic Question Tagging using k-Nearest Neighbors and Random
Forest",
BOOKTITLE = ISCV20,
YEAR = "2020",
PAGES = "1--4",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT235593"}
@article{bb240678,
AUTHOR = "Ye, Q. and Yu, Z. T. and Shao, R. and Cui, Y. W. and Kang, X. and Liu, X. and Torr, P. and Cao, X. C.",
TITLE = "{CAT+}: Investigating and Enhancing Audio-Visual Understanding in Large
Language Models",
JOURNAL = PAMI,
VOLUME = "47",
YEAR = "2025",
NUMBER = "10",
MONTH = oct,
PAGES = "8674--8690",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT235594"}
@article{bb240679,
AUTHOR = "Li, S. S. and Xu, X. and Meng, W. X. and Song, J. K. and Peng, C. and Shen, H. T.",
TITLE = "Mitigating Hallucinations in Large Vision-Language Models via
Reasoning Uncertainty-Guided Refinement",
JOURNAL = MultMed,
VOLUME = "27",
YEAR = "2025",
PAGES = "7380--7391",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT235595"}
@article{bb240680,
AUTHOR = "Fan, J. Q. and Wu, J. H. and Chu, H. Q. and Ge, Q. B. and Gao, B. Z.",
TITLE = "Hallucination Elimination and Text Annotation Framework for Large
Vision-Language Models in Traffic Scenarios",
JOURNAL = ITS,
VOLUME = "27",
YEAR = "2026",
NUMBER = "1",
MONTH = jan,
PAGES = "358--374",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT235596"}
@article{bb240681,
AUTHOR = "Tu, C. J. and Ye, P. and Zhou, D. Z. and Bai, L. and Yu, G. and Chen, T. and Ouyang, W. L.",
TITLE = "Attention Reallocation: Towards Zero-cost and Controllable
Hallucination Mitigation of {MLLMs}",
JOURNAL = IJCV,
VOLUME = "134",
YEAR = "2026",
NUMBER = "1",
MONTH = jan,
PAGES = "22",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT235597"}
@article{bb240682,
AUTHOR = "Betti, F. and Baraldi, L. and Baraldi, L. and Cucchiara, R. and Sebe, N.",
TITLE = "Hallucination Early Detection in Diffusion Models",
JOURNAL = IJCV,
VOLUME = "134",
YEAR = "2026",
NUMBER = "1",
MONTH = jan,
PAGES = "35",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT235598"}
@article{bb240683,
AUTHOR = "Sun, Y. and Min, X. K. and Zhang, Z. C. and Gao, Y. X. and Cao, Y. Q. and Zhai, G. T.",
TITLE = "Mitigating Low-Level Visual Hallucinations Requires Self-Awareness:
Database, Model, and Training Strategy",
JOURNAL = CirSysVideo,
VOLUME = "36",
YEAR = "2026",
NUMBER = "3",
MONTH = mar,
PAGES = "3382--3396",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT235599"}
@article{bb240684,
AUTHOR = "Wang, Y. and Zhou, J. C. and Liu, Q. and Hu, F. and Wang, G.",
TITLE = "Visual Evidence-Aware for Object Hallucinations Rectification in
{LLM-Based} Video Captioning",
JOURNAL = CirSysVideo,
VOLUME = "36",
YEAR = "2026",
NUMBER = "3",
MONTH = mar,
PAGES = "2842--2853",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT235600"}
@article{bb240685,
AUTHOR = "Bi, C. and Dang, T. T. and Wang, S. H. and Cao, F. and Huang, Q. M.",
TITLE = "Asking Questions to Alleviate Object Hallucination in Large
Vision-Language Models",
JOURNAL = CirSysVideo,
VOLUME = "36",
YEAR = "2026",
NUMBER = "3",
MONTH = mar,
PAGES = "3497--3512",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT235601"}
@article{bb240686,
AUTHOR = "Yue, F. and Zhang, Y. and Liu, Y. W. and Yu, Y.",
TITLE = "Sarah: Hallucination detection for large vision language models with
semantic information locator and purifier in uncertainty
quantification method",
JOURNAL = IVC,
VOLUME = "168",
YEAR = "2026",
PAGES = "105938",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT235602"}
@inproceedings{bb240687,
AUTHOR = "Zhu, L. and Ji, D. and Chen, T. R. and Xu, P. and Ye, J. P. and Liu, J.",
TITLE = "{IBD}: Alleviating Hallucinations in Large Vision-Language Models via
Image-Biased Decoding",
BOOKTITLE = TrustworthyOpen25,
YEAR = "2025",
PAGES = "1615--1624",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT235603"}
@inproceedings{bb240688,
AUTHOR = "Li, Z. X. and Wu, X. and Du, H. Y. and Liu, F. and Nghiem, H. and Shi, G. Y.",
TITLE = "A Survey of State of the Art Large Vision Language Models: Alignment,
Benchmark, Evaluations and Challenges",
BOOKTITLE = TrustworthyOpen25,
YEAR = "2025",
PAGES = "1578--1597",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT235604"}
@inproceedings{bb240689,
AUTHOR = "Tran, H. T. and Truong, T. D. and Luu, K.",
TITLE = "{BIMA}: Bijective Maximum Likelihood Learning Approach to Hallucination
Prediction and Mitigation in Large Vision-Language Models",
BOOKTITLE = Precognition25,
YEAR = "2025",
PAGES = "5302--5311",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT235605"}
@inproceedings{bb240690,
AUTHOR = "Yu, T. Y. and Zhang, H. and Li, Q. M. and Xu, Q. X. and Yao, Y. and Chen, D. and Lu, X. M. and Cui, G. and Dang, Y. K. and He, T. and Feng, X. C. and Song, J. and Zheng, B. and Liu, Z. Y. and Chua, T. S. and Sun, M. S.",
TITLE = "{RLAIF-V}: Open-Source {AI} Feedback Leads to Super {GPT-4V}
Trustworthiness",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "19985--19995",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT235606"}
@inproceedings{bb240691,
AUTHOR = "Liang, J. and Huang, W. K. and Wan, G. C. and Yang, Q. and Ye, M.",
TITLE = "{LoRASculpt}: Sculpting {LoRA} for Harmonizing General and Specialized
Knowledge in Multimodal Large Language Models",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "26170--26180",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT235607"}
@inproceedings{bb240692,
AUTHOR = "Cao, Y. and Xing, Y. and Zhang, J. and Lin, D. and Zhang, T. W. and Tsang, I. and Liu, Y. and Guo, Q.",
TITLE = "{SceneTAP}: Scene-Coherent Typographic Adversarial Planner against
Vision-Language Models in Real-World Environments",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "25050--25059",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT235608"}
@inproceedings{bb240693,
AUTHOR = "Wang, Y. B. and Guan, J. and Liang, J. and He, R.",
TITLE = "Do We Really Need Curated Malicious Data for Safety Alignment in
Multi-modal Large Language Models?",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "19879--19889",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT235609"}
% NOTE(review): the title below starts mid-phrase ("Matters: ..."); a leading
% word may have been dropped upstream -- verify against the CVPR25 proceedings.
@inproceedings{bb240694,
AUTHOR = "Peng, R. and He, H. Y. and Wei, Y. and Wen, Y. D. and Hu, D.",
TITLE = "Matters: Training-free Fine-grained Image Caption Enhancement via
Local Perception",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "3963--3973",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT235610"}
@inproceedings{bb240695,
AUTHOR = "Yang, Z. and Luo, X. and Han, D. Q. and Xu, Y. J. and Li, D. S.",
TITLE = "Mitigating Hallucinations in Large Vision-Language Models via {DPO}:
On-Policy Data Hold the Key",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "10610--10620",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT235611"}
@inproceedings{bb240696,
AUTHOR = "Bae, K. and Kim, J. and Lee, S. and Lee, S. and Lee, G. and Choi, J.",
TITLE = "{MASH-VLM}: Mitigating Action-Scene Hallucination in {Video-LLMs} through
Disentangled Spatial-Temporal Representations",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "13744--13753",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT235612"}
@inproceedings{bb240697,
AUTHOR = "Yin, H. and Si, G. Z. and Wang, Z.",
TITLE = "{ClearSight}: Visual Signal Enhancement for Object Hallucination
Mitigation in Multimodal Large Language Models",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "14625--14634",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT235613"}
@inproceedings{bb240698,
AUTHOR = "Yang, L. and Zheng, Z. W. and Chen, B. and Zhao, Z. Y. and Lin, C. H. and Shen, C.",
TITLE = "Nullu: Mitigating Object Hallucinations in Large Vision-Language
Models via {HalluSpace} Projection",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "14635--14645",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT235614"}
@inproceedings{bb240699,
AUTHOR = "Wu, Y. C. and Zhang, L. and Yao, H. and Du, J. L. and Yan, K. and Ding, S. H. and Wu, Y. S. and Li, X. Q.",
TITLE = "Antidote: A Unified Framework for Mitigating {LVLM} Hallucinations in
Counterfactual Presupposition and Object Perception",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "14646--14656",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT235615"}
Last update: Mar 22, 2026 at 13:43:55