@inproceedings{bb242700,
  author    = {Ganju, S. and Russakovsky, O. and Gupta, A.},
  title     = {What's in a Question: Using Visual Questions as a Form of Supervision},
  booktitle = CVPR17,
  year      = {2017},
  pages     = {6422--6431},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqa1.html#TT237611}
}
@inproceedings{bb242701,
  author    = {Xu, H. J. and Saenko, K.},
  title     = {Ask, Attend and Answer: Exploring Question-Guided Spatial Attention for Visual Question Answering},
  booktitle = ECCV16,
  year      = {2016},
  pages     = {VII: 451--466},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqa1.html#TT237612}
}
@inproceedings{bb242702,
  author    = {Jabri, A. and Joulin, A. and van der Maaten, L.},
  title     = {Revisiting Visual Question Answering Baselines},
  booktitle = ECCV16,
  year      = {2016},
  pages     = {VIII: 727--739},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqa1.html#TT237613}
}
@inproceedings{bb242703,
  author    = {Yang, Z. C. and He, X. D. and Gao, J. F. and Deng, L. and Smola, A.},
  title     = {Stacked Attention Networks for Image Question Answering},
  booktitle = CVPR16,
  year      = {2016},
  pages     = {21--29},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqa1.html#TT237614}
}
@inproceedings{bb242704,
  author    = {Sadeghi, F. and Divvala, S. K. and Farhadi, A.},
  title     = {VisKE: Visual knowledge extraction and question answering by visual verification of relation phrases},
  booktitle = CVPR15,
  year      = {2015},
  pages     = {1456--1464},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqa1.html#TT237615}
}
@inproceedings{bb242705,
  author    = {Liu, Y. and Liu, J. and Wang, D. and Cheng, J.},
  title     = {A robust multivariate reranking algorithm for Question Answering enrichment},
  booktitle = ICIP12,
  year      = {2012},
  pages     = {1917--1920},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqa1.html#TT237616}
}
@inproceedings{bb242706,
  author    = {Varekamp, C. and van de Walle, P. and de Putter, M.},
  title     = {Question interface for 3D picture creation on an autostereoscopic digital picture frame},
  booktitle = {3DTV09},
  year      = {2009},
  pages     = {1--4},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqa1.html#TT237617}
}
@article{bb242707,
  author    = {Osman, A. and Samek, W.},
  title     = {DRAU: Dual Recurrent Attention Units for Visual Question Answering},
  journal   = CVIU,
  volume    = {185},
  year      = {2019},
  pages     = {24--30},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237618}
}
@article{bb242708,
  author    = {Li, W. and Sun, J. H. and Liu, G. and Zhao, L. L. and Fang, X. Z.},
  title     = {Visual question answering with attention transfer and a cross-modal gating mechanism},
  journal   = PRL,
  volume    = {133},
  year      = {2020},
  pages     = {334--340},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237619}
}
@article{bb242709,
  author    = {Yu, J. and Zhu, Z. H. and Wang, Y. J. and Zhang, W. F. and Hu, Y. and Tan, J. L.},
  title     = {Cross-modal knowledge reasoning for knowledge-based visual question answering},
  journal   = PR,
  volume    = {108},
  year      = {2020},
  pages     = {107563},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237620}
}
@inproceedings{bb242710,
  author    = {Yang, Z. Q. and Qin, Z. C. and Yu, J. and Wan, T.},
  title     = {Prior Visual Relationship Reasoning For Visual Question Answering},
  booktitle = ICIP20,
  year      = {2020},
  pages     = {1411--1415},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237621}
}
@article{bb242711,
  author    = {Yu, J. and Zhang, W. F. and Lu, Y. H. and Qin, Z. C. and Hu, Y. and Tan, J. L. and Wu, Q.},
  title     = {Reasoning on the Relation: Enhancing Visual Representation for Visual Question Answering and Cross-Modal Retrieval},
  journal   = MultMed,
  volume    = {22},
  number    = {12},
  year      = {2020},
  month     = dec,
  pages     = {3196--3209},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237622}
}
@article{bb242712,
  author    = {Wu, Y. R. and Ma, Y. T. and Wan, S. H.},
  title     = {Multi-scale relation reasoning for multi-modal Visual Question Answering},
  journal   = SP:IC,
  volume    = {96},
  year      = {2021},
  pages     = {116319},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237623}
}
@inproceedings{bb242713,
  author    = {Ma, Y. T. and Lu, T. and Wu, Y. R.},
  title     = {Multi-scale Relational Reasoning with Regional Attention for Visual Question Answering},
  booktitle = ICPR21,
  year      = {2021},
  pages     = {5642--5649},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237624}
}
@article{bb242714,
  author    = {Hu, J. and Qian, S. S. and Fang, Q. and Xu, C. S.},
  title     = {Heterogeneous Community Question Answering via Social-Aware Multi-Modal Co-Attention Convolutional Matching},
  journal   = MultMed,
  volume    = {23},
  year      = {2021},
  pages     = {2321--2334},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237625}
}
@article{bb242715,
  author    = {Farazi, M. and Khan, S. and Barnes, N. M.},
  title     = {Accuracy vs. complexity: A trade-off in visual question answering models},
  journal   = PR,
  volume    = {120},
  year      = {2021},
  pages     = {108106},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237626}
}
@article{bb242716,
  author    = {Liu, F. and Liu, J. and Fang, Z. W. and Hong, R. C. and Lu, H. Q.},
  title     = {Visual Question Answering With Dense Inter- and Intra-Modality Interactions},
  journal   = MultMed,
  volume    = {23},
  year      = {2021},
  pages     = {3518--3529},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237627}
}
@article{bb242717,
  author    = {Wu, J. J. and Du, J. and Wang, F. and Yang, C. and Jiang, X. Z. and Hu, J. and Yin, B. and Zhang, J. S. and Dai, L. R.},
  title     = {A multimodal attention fusion network with a dynamic vocabulary for TextVQA},
  journal   = PR,
  volume    = {122},
  year      = {2022},
  pages     = {108214},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237628}
}
@article{bb242718,
  author    = {Peng, L. and Yang, Y. and Wang, Z. and Huang, Z. and Shen, H. T.},
  title     = {MRA-Net: Improving VQA Via Multi-Modal Relation Attention Network},
  journal   = PAMI,
  volume    = {44},
  number    = {1},
  year      = {2022},
  month     = jan,
  pages     = {318--329},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237629}
}
@article{bb242719,
  author    = {Shuang, K. and Guo, J. and Wang, Z. H.},
  title     = {Comprehensive-perception dynamic reasoning for visual question answering},
  journal   = PR,
  volume    = {131},
  year      = {2022},
  pages     = {108878},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237630}
}
@article{bb242720,
  author    = {Xie, J. Y. and Fang, W. H. and Cai, Y. and Huang, Q. B. and Li, Q.},
  title     = {Knowledge-Based Visual Question Generation},
  journal   = CirSysVideo,
  volume    = {32},
  number    = {11},
  year      = {2022},
  month     = nov,
  pages     = {7547--7558},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237631}
}
@article{bb242721,
  author    = {Gao, C. Y. and Zhu, Q. and Wang, P. and Li, H. and Liu, Y. L. and van den Hengel, A. J. and Wu, Q.},
  title     = {Structured Multimodal Attentions for TextVQA},
  journal   = PAMI,
  volume    = {44},
  number    = {12},
  year      = {2022},
  month     = dec,
  pages     = {9603--9614},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237632}
}
@article{bb242722,
  author    = {Xu, F. Z. and Lin, Q. and Liu, J. and Zhang, L. L. and Zhao, T. Z. and Chai, Q. and Pan, Y. and Huang, Y. and Wang, Q. Y.},
  title     = {MoCA: Incorporating domain pretraining and cross attention for textbook question answering},
  journal   = PR,
  volume    = {140},
  year      = {2023},
  pages     = {109588},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237633}
}
@article{bb242723,
  author    = {Mohamud, S. A. M. and Jalali, A. and Lee, M. H.},
  title     = {Encoder-decoder cycle for visual question answering based on perception-action cycle},
  journal   = PR,
  volume    = {144},
  year      = {2023},
  pages     = {109848},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237634}
}
@article{bb242724,
  author    = {Tito, R. and Karatzas, D. and Valveny, E.},
  title     = {Hierarchical multimodal transformers for Multipage DocVQA},
  journal   = PR,
  volume    = {144},
  year      = {2023},
  pages     = {109834},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237635}
}
@article{bb242725,
  author    = {Biswas, K. and Shivakumara, P. and Pal, U. and Liu, C. L. and Lu, Y.},
  title     = {VQAPT: A New visual question answering model for personality traits in social media images},
  journal   = PRL,
  volume    = {175},
  year      = {2023},
  pages     = {66--73},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237636}
}
@article{bb242726,
  author    = {Cho, J. W. and Argaw, D. M. and Oh, Y. and Kim, D. J. and Kweon, I. S.},
  title     = {Empirical study on using adapters for debiased Visual Question Answering},
  journal   = CVIU,
  volume    = {237},
  year      = {2023},
  pages     = {103842},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237637}
}
@inproceedings{bb242727,
  author    = {Cho, J. W. and Kim, D. J. and Choi, J. and Jung, Y. and Kweon, I. S.},
  title     = {Dealing with Missing Modalities in the Visual Question Answer-Difference Prediction Task through Knowledge Distillation},
  booktitle = MULA21,
  year      = {2021},
  pages     = {1592--1601},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237638}
}
@inproceedings{bb242728,
  author    = {Cho, J. W. and Kim, D. J. and Ryu, H. and Kweon, I. S.},
  title     = {Generative Bias for Robust Visual Question Answering},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {11681--11690},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237639}
}
@article{bb242729,
  author    = {Mashrur, A. and Luo, W. and Zaidi, N. A. and Robles Kelly, A.},
  title     = {Robust visual question answering via semantic cross modal augmentation},
  journal   = CVIU,
  volume    = {238},
  year      = {2024},
  pages     = {103862},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237640}
}
@article{bb242730,
  author    = {Yao, H. B. and Wang, L. P. and Cai, C. T. and Sun, Y. X. and Zhang, Z. and Luo, Y. K.},
  title     = {Multi-modal spatial relational attention networks for visual question answering},
  journal   = IVC,
  volume    = {140},
  year      = {2023},
  pages     = {104840},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237641}
}
@article{bb242731,
  author    = {Zheng, W. B. and Yan, L. and Wang, F. Y.},
  title     = {So Many Heads, So Many Wits: Multimodal Graph Reasoning for Text-Based Visual Question Answering},
  journal   = SMCS,
  volume    = {54},
  number    = {2},
  year      = {2024},
  month     = feb,
  pages     = {854--865},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237642}
}
@article{bb242732,
  author    = {Bi, Y. D. and Jiang, H. and Hu, Y. L. and Sun, Y. F. and Yin, B. C.},
  title     = {See and Learn More: Dense Caption-Aware Representation for Visual Question Answering},
  journal   = CirSysVideo,
  volume    = {34},
  number    = {2},
  year      = {2024},
  month     = feb,
  pages     = {1135--1146},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237643}
}
@article{bb242733,
  author    = {Jiang, J. J. and Liu, Z. Y. and Zheng, N. N.},
  title     = {Correlation Information Bottleneck: Towards Adapting Pretrained Multimodal Models for Robust Visual Question Answering},
  journal   = IJCV,
  volume    = {132},
  number    = {1},
  year      = {2024},
  month     = jan,
  pages     = {185--207},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237644}
}
@article{bb242734,
  author    = {Zhang, S. and Chen, Y. and Sun, Y. and Wang, F. and Shi, H. B. and Wang, H. R.},
  title     = {LOIS: Looking Out of Instance Semantics for Visual Question Answering},
  journal   = MultMed,
  volume    = {26},
  year      = {2024},
  pages     = {6202--6214},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237645}
}
@article{bb242735,
  author    = {Xie, J. Y. and Cai, Y. and Chen, J. L. and Xu, R. H. and Wang, J. X. and Li, Q.},
  title     = {Knowledge-Augmented Visual Question Answering With Natural Language Explanation},
  journal   = IP,
  volume    = {33},
  year      = {2024},
  pages     = {2652--2664},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237646}
}
@article{bb242736,
  author    = {Wang, J. J. and Ma, A. L. and Chen, Z. H. and Zheng, Z. and Wan, Y. T. and Zhang, L. P. and Zhong, Y. F.},
  title     = {EarthVQANet: Multi-task visual question answering for remote sensing image understanding},
  journal   = PandRS,
  volume    = {212},
  year      = {2024},
  pages     = {422--439},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237647}
}
@article{bb242737,
  author    = {Qian, S. and Liu, B. Q. and Sun, C. J. and Xu, Z. and Ma, L. and Wang, B.},
  title     = {CroMIC-QA: The Cross-Modal Information Complementation Based Question Answering},
  journal   = MultMed,
  volume    = {26},
  year      = {2024},
  pages     = {8348--8359},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237648}
}
@article{bb242738,
  author    = {Uehara, K. and Harada, T.},
  title     = {Learning by Asking Questions for Knowledge-Based Novel Object Recognition},
  journal   = IJCV,
  volume    = {132},
  number    = {6},
  year      = {2024},
  month     = jun,
  pages     = {2290--2309},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237649}
}
@inproceedings{bb242739,
  author    = {Uehara, K. and Harada, T.},
  title     = {K-VQG: Knowledge-aware Visual Question Generation for Common-sense Acquisition},
  booktitle = WACV23,
  year      = {2023},
  pages     = {4390--4398},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237650}
}
@inproceedings{bb242740,
  author    = {Uehara, K. and Duan, N. and Harada, T.},
  title     = {Learning to Ask Informative Sub-Questions for Visual Question Answering},
  booktitle = MULA22,
  year      = {2022},
  pages     = {4680--4689},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237651}
}
@inproceedings{bb242741,
  author    = {Li, Y. K. and Duan, N. and Zhou, B. L. and Chu, X. and Ouyang, W. L. and Wang, X. G. and Zhou, M.},
  title     = {Visual Question Generation as Dual Task of Visual Question Answering},
  booktitle = CVPR18,
  year      = {2018},
  pages     = {6116--6124},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237652}
}
@inproceedings{bb242742,
  author    = {Gao, P. and Li, H. S. and Li, S. and Lu, P. and Li, Y. K. and Hoi, S. C. H. and Wang, X. G.},
  title     = {Question-Guided Hybrid Convolution for Visual Question Answering},
  booktitle = ECCV18,
  year      = {2018},
  pages     = {I: 485--501},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237653}
}
@inproceedings{bb242743,
  author    = {Gao, P. and Jiang, Z. K. and You, H. X. and Lu, P. and Hoi, S. C. H. and Wang, X. G. and Li, H. S.},
  title     = {Dynamic Fusion With Intra- and Inter-Modality Attention Flow for Visual Question Answering},
  booktitle = CVPR19,
  year      = {2019},
  pages     = {6632--6641},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237654}
}
@article{bb242744,
  author    = {Vosoughi, A. and Deng, S. J. and Zhang, S. Y. and Tian, Y. P. and Xu, C. L. and Luo, J. B.},
  title     = {Cross Modality Bias in Visual Question Answering: A Causal View With Possible Worlds VQA},
  journal   = MultMed,
  volume    = {26},
  year      = {2024},
  pages     = {8609--8624},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237655}
}
@article{bb242745,
  author    = {Guo, Y. Y. and Jiao, F. K. and Shen, Z. Q. and Nie, L. Q. and Kankanhalli, M.},
  title     = {UNK-VQA: A Dataset and a Probe Into the Abstention Ability of Multi-Modal Large Models},
  journal   = PAMI,
  volume    = {46},
  number    = {12},
  year      = {2024},
  month     = dec,
  pages     = {10284--10296},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237656}
}
@article{bb242746,
  author    = {Chen, F. Y. and Tang, X. S. and Hao, K. R.},
  title     = {GEXMERT: Geometrically enhanced cross-modality encoder representations from transformers inspired by higher-order visual percepts},
  journal   = PR,
  volume    = {158},
  year      = {2025},
  pages     = {111047},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237657}
}
@article{bb242747,
  author    = {Zhang, B. and Li, J. X. and Shi, Y. C. and Han, Y. and Hu, Q. H.},
  title     = {VADS: Visuo-Adaptive DualStrike attack on visual question answer},
  journal   = CVIU,
  volume    = {249},
  year      = {2024},
  pages     = {104137},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237658}
}
@article{bb242748,
  author    = {Peng, D. and Li, Z. X.},
  title     = {Unbiased VQA via modal information interaction and question transformation},
  journal   = PR,
  volume    = {162},
  year      = {2025},
  pages     = {111394},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237659}
}
@article{bb242749,
  author    = {Fan, L. and Gong, X. and Zheng, C. Y. and Tan, X. L. and Li, J. and Ou, Y. F.},
  title     = {Cycle-VQA: A Cycle-Consistent Framework for Robust Medical Visual Question Answering},
  journal   = PR,
  volume    = {165},
  year      = {2025},
  pages     = {111609},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237660}
}
@article{bb242750,
  author    = {Lin, Q. and He, K. and Zhu, Y. F. and Xu, F. Z. and Cambria, E. and Feng, M. L.},
  title     = {Cross-Modal Knowledge Diffusion-Based Generation for Difference-Aware Medical VQA},
  journal   = IP,
  volume    = {34},
  year      = {2025},
  pages     = {2421--2434},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237661}
}
@article{bb242751,
  author    = {Kim, B. S. and Kim, J. and Lee, D. and Jang, B.},
  title     = {Visual Question Answering: A Survey of Methods, Datasets, Evaluation, and Challenges},
  journal   = Surveys,
  volume    = {57},
  number    = {10},
  year      = {2025},
  month     = may,
  internal-note = {NOTE(review): source carried placeholder pages "xx-yy"; field omitted until the real page range or article number is known},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237662}
}
@article{bb242752,
  author    = {Wen, Z. Q. and Tan, M. K. and Wang, Y. W. and Wu, Q. Y. and Wu, Q.},
  title     = {Enhanced Reasoning via Multimodal LLMs and Collaborative Inference},
  journal   = MultMed,
  volume    = {27},
  year      = {2025},
  pages     = {7166--7178},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237663}
}
@article{bb242753,
  author    = {Pei, B. Q. and Huang, Y. F. and Chen, G. and Xu, J. L. and Wang, Y. and Wang, L. M. and Lu, T. and Qiao, Y. and Wu, F.},
  title     = {Guiding Audio-Visual Question Answering with Collective Question Reasoning},
  journal   = IJCV,
  volume    = {133},
  number    = {10},
  year      = {2025},
  month     = oct,
  pages     = {6912--6929},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237664}
}
@article{bb242754,
  author    = {Gupta, C. and Gill, N. S. and Gulia, P. and Pau, G.},
  title     = {CODNet: Context-based object detection network for multimodal image captioning and virtual question answering},
  journal   = IVC,
  volume    = {163},
  year      = {2025},
  pages     = {105768},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237665}
}
@article{bb242755,
  author    = {Liu, H. L. and Chen, L. and Lu, X. C. and Wang, H. and Bai, L. and Wang, M. and Ren, P.},
  title     = {A visual-textual mutual guidance fusion network for remote sensing visual question answering},
  journal   = PR,
  volume    = {176},
  year      = {2026},
  pages     = {113258},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237666}
}
@article{bb242756,
  author    = {Xu, Q. X. and Zhou, L. and Zhong, X. and Zhang, F. F. and Tian, J. and Yu, X. H. and Huang, R.},
  title     = {ETV-Attack: Efficient text-driven visual-variable adversarial attacks on visual question answering with pre-trained language models},
  journal   = PR,
  volume    = {176},
  year      = {2026},
  pages     = {113202},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237667}
}
@article{bb242757,
  author    = {Wang, Z. Q. and Wan, W. T. and Lao, Q. Q. and Chen, R. M. and Lang, M. J. and Wang, X. and Gao, F. and Wang, K. and Lin, L.},
  title     = {Toward Top-Down Reasoning: An Explainable Multi-Agent Approach for Visual Question Answering},
  journal   = MultMed,
  volume    = {28},
  year      = {2026},
  pages     = {3081--3096},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237668}
}
@article{bb242758,
  author    = {Wu, K. X. and Li, X. D. and Li, X. L. and Zuo, K. Z. and Lv, Z. W.},
  title     = {AVQACL++: Toward a Robust Framework and Benchmark for Audio-Visual Question Answering Continual Learning},
  journal   = CirSysVideo,
  volume    = {36},
  number    = {6},
  year      = {2026},
  month     = jun,
  pages     = {8232--8245},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237669}
}
@inproceedings{bb242759,
  author    = {Wu, K. X. and Li, X. D. and Li, X. L. and Hu, C. F. and Wu, G. L.},
  title     = {AVQACL: A Novel Benchmark for Audio-Visual Question Answering Continual Learning},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {3252--3261},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237670}
}
@article{bb242760,
  author    = {Li, Y. X. and Yu, J. and Zeng, T. Y. and Zhou, Y. and Ma, L. Y.},
  title     = {GIGAS: Adversarial Attacks on Visual Question Answering With Multi-Modal Generative Models},
  journal   = CirSysVideo,
  volume    = {36},
  number    = {6},
  year      = {2026},
  month     = jun,
  pages     = {8385--8397},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237671}
}
@inproceedings{bb242761,
  author    = {Kim, H. Y. and Jung, I. and Suh, D. and Zhang, Y. and Lee, S. and Hong, S.},
  title     = {Question-Aware Gaussian Experts for Audio-Visual Question Answering},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {13681--13690},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237672}
}
@inproceedings{bb242762,
  author    = {Huang, C. Y. and Maneechotesuwan, B. and Chopra, S. and Kira, Z.},
  title     = {FRAMES-VQA: Benchmarking Fine-Tuning Robustness across Multi-Modal Shifts in Visual Question Answering},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {3909--3918},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237673}
}
@inproceedings{bb242763,
  author    = {Zhao, X. Y. and Bai, Z. W. and Zhou, M. L. and Ren, X. C. and Wang, Y. Q. and Wang, L. C.},
  title     = {Integrating Dynamic Routing with Reinforcement Learning and Multimodal Techniques for Visual Question Answering},
  booktitle = ICIVC24,
  year      = {2024},
  pages     = {295--301},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237674}
}
@inproceedings{bb242764,
  author    = {Park, K. R. and Lee, H. J. and Kim, J. U.},
  title     = {Learning Trimodal Relation for Audio-visual Question Answering with Missing Modality},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {XV: 42--59},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237675}
}
@inproceedings{bb242765,
  author    = {Mishra, A. and Agarwala, A. and Tiwari, U. and Rajendiran, V. N. and Miriyala, S. S.},
  title     = {Efficient Visual Question Answering on Embedded Devices: Cross-Modality Attention with Evolutionary Quantization},
  booktitle = ICIP24,
  year      = {2024},
  pages     = {2142--2148},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237676}
}
@inproceedings{bb242766,
  author    = {Jiang, X. and Wang, G. M. and Guo, J. H. and Li, J. C. and Zhang, W. Q. and Lu, R. X. and Tang, S. L.},
  title     = {DIEM: Decomposition-Integration Enhancing Multimodal Insights},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {27294--27303},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237677}
}
@inproceedings{bb242767,
  author    = {Reichman, B. and Heck, L.},
  title     = {Cross-Modal Dense Passage Retrieval for Outside Knowledge Visual Question Answering},
  booktitle = CLVL23,
  year      = {2023},
  pages     = {2829--2834},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237678}
}
@inproceedings{bb242768,
  author    = {Qian, Z. and Wang, X. and Duan, X. G. and Qin, P. and Li, Y. H. and Zhu, W. W.},
  title     = {Decouple Before Interact: Multi-Modal Prompt Learning for Continual Visual Question Answering},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {2941--2950},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237679}
}
@inproceedings{bb242769,
  author    = {Li, B. J. and Wang, J. and Zhao, M. and Zhou, S.},
  title     = {Two-stage Multimodality Fusion for High-performance Text-based Visual Question Answering},
  booktitle = ACCV22,
  year      = {2022},
  pages     = {IV: 658--674},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237680}
}
@inproceedings{bb242770,
  author    = {Chai, Z. and Wan, X. J. and Han, S. C. and Poon, J.},
  title     = {Visual Question Generation Under Multi-granularity Cross-Modal Interaction},
  booktitle = MMMod23,
  year      = {2023},
  pages     = {I: 255--266},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237681}
}
@inproceedings{bb242771,
  author    = {Wang, J. H. and Hu, M. H. and Song, Y. G. and Yang, X. S.},
  title     = {Health-Oriented Multimodal Food Question Answering},
  booktitle = MMMod23,
  year      = {2023},
  pages     = {I: 191--203},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237682}
}
@inproceedings{bb242772,
  author    = {Zhang, H. T. and Wu, W.},
  title     = {CAT: Re-Conv Attention in Transformer for Visual Question Answering},
  booktitle = {ICPR22},
  year      = {2022},
  pages     = {1471--1477},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237683}
}
@inproceedings{bb242773,
  author    = {Dancette, C. and Cadene, R. and Teney, D. and Cord, M.},
  title     = {Beyond Question-Based Biases: Assessing Multimodal Shortcut Learning in Visual Question Answering},
  booktitle = ICCV21,
  year      = {2021},
  pages     = {1554--1563},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237684}
}
@inproceedings{bb242774,
  author    = {Felix, R. and Repasky, B. and Hodge, S. and Zolfaghari, R. and Abbasnejad, E. and Sherrah, J.},
  title     = {Cross-Modal Visual Question Answering for Remote Sensing Data},
  booktitle = DICTA21,
  year      = {2021},
  pages     = {1--9},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237685}
}
@inproceedings{bb242775,
  author    = {Chen, H. Y. and Liu, R. F. and Peng, B.},
  title     = {Cross-modal Relational Reasoning Network for Visual Question Answering},
  booktitle = MAIR2-21,
  year      = {2021},
  pages     = {3939--3948},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237686}
}
@inproceedings{bb242776,
  author    = {Farazi, M. and Khan, S. and Barnes, N. M.},
  title     = {Question-Agnostic Attention for Visual Question Answering},
  booktitle = ICPR21,
  year      = {2021},
  pages     = {3542--3549},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237687}
}
@inproceedings{bb242777,
  author    = {Li, Y. and Lin, Y. and Zhao, H. H. and Wang, D. H.},
  title     = {Dual Path Multi-Modal High-Order Features for Textual Content based Visual Question Answering},
  booktitle = ICPR21,
  year      = {2021},
  pages     = {4324--4331},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237688}
}
@inproceedings{bb242778,
  author    = {Huang, H. T. and Han, T. and Han, W. and Yap, D. and Chiang, C. M.},
  title     = {Answer-checking in Context: A Multi-modal Fully Attention Network for Visual Question Answering},
  booktitle = ICPR21,
  year      = {2021},
  pages     = {1173--1180},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237689}
}
@inproceedings{bb242779,
  author    = {Kant, Y. and Batra, D. and Anderson, P. and Schwing, A. and Parikh, D. and Lu, J. and Agrawal, H.},
  title     = {Spatially Aware Multimodal Transformers for TextVQA},
  booktitle = ECCV20,
  year      = {2020},
  pages     = {IX: 715--732},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237690}
}
@inproceedings{bb242780,
  author    = {Hu, R. and Singh, A. and Darrell, T. J. and Rohrbach, M.},
  title     = {Iterative Answer Prediction With Pointer-Augmented Multimodal Transformers for TextVQA},
  booktitle = CVPR20,
  year      = {2020},
  pages     = {9989--9999},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237691}
}
@inproceedings{bb242781,
  author    = {Gao, P. and You, H. X. and Zhang, Z. P. and Wang, X. G. and Li, H. S.},
  title     = {Multi-Modality Latent Interaction Network for Visual Question Answering},
  booktitle = ICCV19,
  year      = {2019},
  pages     = {5824--5834},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237692}
}
@inproceedings{bb242782,
  author    = {Cadene, R. and Ben Younes, H. and Cord, M. and Thome, N.},
  title     = {MUREL: Multimodal Relational Reasoning for Visual Question Answering},
  booktitle = CVPR19,
  year      = {2019},
  pages     = {1989--1998},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237693}
}
@inproceedings{bb242783,
  author    = {Haurilet, M. and Al Halah, Z. and Stiefelhagen, R.},
  title     = {DynGraph: Visual Question Answering via Dynamic Scene Graphs},
  booktitle = GCPR19,
  year      = {2019},
  pages     = {428--441},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237694}
}
@inproceedings{bb242784,
  author    = {Haurilet, M. and Al Halah, Z. and Stiefelhagen, R.},
  title     = {MoQA: A Multi-modal Question Answering Architecture},
  booktitle = VL18,
  year      = {2018},
  pages     = {IV: 106--113},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237695}
}
@inproceedings{bb242785,
  author    = {Gu, J. X. and Cai, J. F. and Joty, S. and Niu, L. and Wang, G.},
  title     = {Look, Imagine and Match: Improving Textual-Visual Cross-Modal Retrieval with Generative Models},
  booktitle = CVPR18,
  year      = {2018},
  pages     = {7181--7189},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237696}
}
@inproceedings{bb242786,
  author    = {Sheng, S. R. and Venkitasubramanian, A. N. and Moens, M. F.},
  title     = {A Markov Network Based Passage Retrieval Method for Multimodal Question Answering in the Cultural Heritage Domain},
  booktitle = MMMod18,
  year      = {2018},
  pages     = {I: 3--15},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237697}
}
@inproceedings{bb242787,
  author    = {Yu, Z. and Yu, J. and Fan, J. and Tao, D.},
  title     = {Multi-modal Factorized Bilinear Pooling with Co-attention Learning for Visual Question Answering},
  booktitle = ICCV17,
  year      = {2017},
  pages     = {1839--1848},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237698}
}
@inproceedings{bb242788,
  author    = {Ben Younes, H. and Cadene, R. and Cord, M. and Thome, N.},
  title     = {MUTAN: Multimodal Tucker Fusion for Visual Question Answering},
  booktitle = ICCV17,
  year      = {2017},
  pages     = {2631--2639},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237699}
}
@inproceedings{bb242789,
  author    = {Kembhavi, A. and Seo, M. and Schwenk, D. and Choi, J. and Farhadi, A. and Hajishirzi, H.},
  title     = {Are You Smarter Than a Sixth Grader? Textbook Question Answering for Multimodal Machine Comprehension},
  booktitle = CVPR17,
  year      = {2017},
  pages     = {5376--5384},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT237700}
}
@article{bb242790,
  author    = {Das, A. and Agrawal, H. and Zitnick, L. and Parikh, D. and Batra, D.},
  title     = {Human Attention in Visual Question Answering: Do Humans and Deep Networks Look at the Same Regions?},
  journal   = CVIU,
  volume    = {163},
  number    = {1},
  year      = {2017},
  pages     = {90--100},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT237701}
}
@article{bb242791,
  author    = {Malinowski, M. and Rohrbach, M. and Fritz, M.},
  title     = {Ask Your Neurons: A Deep Learning Approach to Visual Question Answering},
  journal   = IJCV,
  volume    = {125},
  number    = {1-3},
  year      = {2017},
  month     = dec,
  pages     = {110--135},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT237702}
}
@inproceedings{bb242792,
  author    = {Malinowski, M. and Rohrbach, M. and Fritz, M.},
  title     = {Ask Your Neurons: A Neural-Based Approach to Answering Questions about Images},
  booktitle = ICCV15,
  year      = {2015},
  pages     = {1--9},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT237703}
}
@inproceedings{bb242793,
  author    = {Dancette, C. and Whitehead, S. and Maheshwary, R. and Vedantam, R. and Scherer, S. and Chen, X. L. and Cord, M. and Rohrbach, M.},
  title     = {Improving Selective Visual Question Answering by Learning from Your Peers},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {24049--24059},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT237704}
}
@article{bb242794,
  author    = {Huang, Y. Z. and Zhong, T.},
  title     = {Multitask learning for neural generative question answering},
  journal   = RealTimeIP,
  volume    = {14},
  number    = {1},
  year      = {2018},
  month     = jan,
  pages     = {1009--1017},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT237705}
}
@article{bb242795,
  author    = {Ruwa, N. and Mao, Q. and Song, H. P. and Jia, H. J. and Dong, M.},
  title     = {Triple attention network for sentimental visual question answering},
  journal   = CVIU,
  volume    = {189},
  year      = {2019},
  pages     = {102829},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT237706}
}
@article{bb242796,
  author    = {Bai, Z. W. and Li, Y. and Wozniak, M. and Zhou, M. L. and Li, D.},
  title     = {DecomVQANet: Decomposing visual question answering deep network via tensor decomposition and regression},
  journal   = PR,
  volume    = {110},
  year      = {2021},
  pages     = {107538},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT237707}
}
@article{bb242797,
  author    = {Zhang, Q. S. and Wu, Y. N. and Zhang, H. and Zhu, S. C.},
  title     = {Mining deep And-Or object structures via cost-sensitive question-answer-based active annotations},
  journal   = CVIU,
  volume    = {176-177},
  year      = {2018},
  pages     = {33--44},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT237708}
}
@article{bb242798,
  author    = {Zhang, Q. S. and Ren, J. and Huang, G. and Cao, R. M. and Wu, Y. N. and Zhu, S. C.},
  title     = {Mining Interpretable AOG Representations From Convolutional Networks via Active Question Answering},
  journal   = PAMI,
  volume    = {43},
  number    = {11},
  year      = {2021},
  month     = nov,
  pages     = {3949--3963},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT237709}
}
@inproceedings{bb242799,
  author    = {Zhang, Q. S. and Cao, R. M. and Wu, Y. N. and Zhu, S. C.},
  title     = {Mining Object Parts from CNNs via Active Question-Answering},
  booktitle = CVPR17,
  year      = {2017},
  pages     = {3890--3899},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT237710}
}
Last update: Jun 13, 2026 at 20:41:05