@inproceedings{bb233500,
  author    = {Kant, Y. and Batra, D. and Anderson, P. and Schwing, A. and Parikh, D. and Lu, J. and Agrawal, H.},
  title     = {Spatially Aware Multimodal Transformers for TextVQA},
  booktitle = ECCV20,
  pages     = {IX:715-732},
  year      = {2020},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT228476}
}

@inproceedings{bb233501,
  author    = {Hu, R. and Singh, A. and Darrell, T.J. and Rohrbach, M.},
  title     = {Iterative Answer Prediction With Pointer-Augmented Multimodal
Transformers for TextVQA},
  booktitle = CVPR20,
  pages     = {9989-9999},
  year      = {2020},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT228477}
}

@inproceedings{bb233502,
  author    = {Gao, P. and You, H.X. and Zhang, Z.P. and Wang, X.G. and Li, H.S.},
  title     = {Multi-Modality Latent Interaction Network for Visual Question
Answering},
  booktitle = ICCV19,
  pages     = {5824-5834},
  year      = {2019},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT228478}
}

@inproceedings{bb233503,
  author    = {Cadene, R. and Ben Younes, H. and Cord, M. and Thome, N.},
  title     = {MUREL: Multimodal Relational Reasoning for Visual Question Answering},
  booktitle = CVPR19,
  pages     = {1989-1998},
  year      = {2019},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT228479}
}

@inproceedings{bb233504,
  author    = {Haurilet, M. and Al Halah, Z. and Stiefelhagen, R.},
  title     = {DynGraph: Visual Question Answering via Dynamic Scene Graphs},
  booktitle = GCPR19,
  pages     = {428-441},
  year      = {2019},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT228480}
}

@inproceedings{bb233505,
  author    = {Haurilet, M. and Al Halah, Z. and Stiefelhagen, R.},
  title     = {MoQA: A Multi-modal Question Answering Architecture},
  booktitle = VL18,
  pages     = {IV:106-113},
  year      = {2018},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT228481}
}

@inproceedings{bb233506,
  author    = {Gu, J.X. and Cai, J.F. and Joty, S. and Niu, L. and Wang, G.},
  title     = {Look, Imagine and Match: Improving Textual-Visual Cross-Modal
Retrieval with Generative Models},
  booktitle = CVPR18,
  pages     = {7181-7189},
  year      = {2018},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT228482}
}

@inproceedings{bb233507,
  author    = {Sheng, S.R. and Venkitasubramanian, A.N. and Moens, M.F.},
  title     = {A Markov Network Based Passage Retrieval Method for Multimodal Question
Answering in the Cultural Heritage Domain},
  booktitle = MMMod18,
  pages     = {I:3-15},
  year      = {2018},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT228483}
}

@inproceedings{bb233508,
  author    = {Yu, Z. and Yu, J. and Fan, J. and Tao, D.},
  title     = {Multi-modal Factorized Bilinear Pooling with Co-attention Learning
for Visual Question Answering},
  booktitle = ICCV17,
  pages     = {1839-1848},
  year      = {2017},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT228484}
}

@inproceedings{bb233509,
  author    = {Ben Younes, H. and Cadene, R. and Cord, M. and Thome, N.},
  title     = {MUTAN: Multimodal Tucker Fusion for Visual Question Answering},
  booktitle = ICCV17,
  pages     = {2631-2639},
  year      = {2017},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT228485}
}

@inproceedings{bb233510,
  author    = {Kembhavi, A. and Seo, M. and Schwenk, D. and Choi, J. and Farhadi, A. and Hajishirzi, H.},
  title     = {Are You Smarter Than a Sixth Grader? Textbook Question Answering for
Multimodal Machine Comprehension},
  booktitle = CVPR17,
  pages     = {5376-5384},
  year      = {2017},
  bibsource = {http://www.visionbib.com/bibliography/applicat803mmovqa5.html#TT228486}
}

@article{bb233511,
  author    = {Das, A. and Agrawal, H. and Zitnick, L. and Parikh, D. and Batra, D.},
  title     = {Human Attention in Visual Question Answering:
Do Humans and Deep Networks Look at the Same Regions?},
  journal   = CVIU,
  volume    = {163},
  number    = {1},
  pages     = {90-100},
  year      = {2017},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228487}
}

@article{bb233512,
  author    = {Malinowski, M. and Rohrbach, M. and Fritz, M.},
  title     = {Ask Your Neurons: A Deep Learning Approach to Visual Question Answering},
  journal   = IJCV,
  volume    = {125},
  number    = {1-3},
  pages     = {110-135},
  month     = {December},
  year      = {2017},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228488}
}

@inproceedings{bb233513,
  author    = {Malinowski, M. and Rohrbach, M. and Fritz, M.},
  title     = {Ask Your Neurons:
A Neural-Based Approach to Answering Questions about Images},
  booktitle = ICCV15,
  pages     = {1-9},
  year      = {2015},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228489}
}

@inproceedings{bb233514,
  author    = {Dancette, C. and Whitehead, S. and Maheshwary, R. and Vedantam, R. and Scherer, S. and Chen, X.L. and Cord, M. and Rohrbach, M.},
  title     = {Improving Selective Visual Question Answering by Learning from Your
Peers},
  booktitle = CVPR23,
  pages     = {24049-24059},
  year      = {2023},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228490}
}

@article{bb233515,
  author    = {Huang, Y.Z. and Zhong, T.},
  title     = {Multitask learning for neural generative question answering},
  journal   = RealTimeIP,
  volume    = {14},
  number    = {1},
  pages     = {1009-1017},
  month     = {January},
  year      = {2018},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228491}
}

@article{bb233516,
  author    = {Ruwa, N. and Mao, Q. and Song, H.P. and Jia, H.J. and Dong, M.},
  title     = {Triple attention network for sentimental visual question answering},
  journal   = CVIU,
  volume    = {189},
  pages     = {102829},
  year      = {2019},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228492}
}

@article{bb233517,
  author    = {Bai, Z.W. and Li, Y. and Wozniak, M. and Zhou, M.L. and Li, D.},
  title     = {DecomVQANet: Decomposing visual question answering deep network via
tensor decomposition and regression},
  journal   = PR,
  volume    = {110},
  pages     = {107538},
  year      = {2021},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228493}
}

@article{bb233518,
  author    = {Zhang, Q.S. and Wu, Y.N. and Zhang, H. and Zhu, S.C.},
  title     = {Mining deep And-Or object structures via cost-sensitive
question-answer-based active annotations},
  journal   = CVIU,
  volume    = {176-177},
  pages     = {33-44},
  year      = {2018},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228494}
}

@article{bb233519,
  author    = {Zhang, Q.S. and Ren, J. and Huang, G. and Cao, R.M. and Wu, Y.N. and Zhu, S.C.},
  title     = {Mining Interpretable AOG Representations From Convolutional Networks
via Active Question Answering},
  journal   = PAMI,
  volume    = {43},
  number    = {11},
  pages     = {3949-3963},
  month     = {November},
  year      = {2021},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228495}
}

@inproceedings{bb233520,
  author    = {Zhang, Q.S. and Cao, R.M. and Wu, Y.N. and Zhu, S.C.},
  title     = {Mining Object Parts from CNNs via Active Question-Answering},
  booktitle = CVPR17,
  pages     = {3890-3899},
  year      = {2017},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228496}
}

@inproceedings{bb233521,
  author    = {Zhang, Q.S. and Wu, Y.N. and Zhu, S.C.},
  title     = {Mining And-Or Graphs for Graph Matching and Object Discovery},
  booktitle = ICCV15,
  pages     = {55-63},
  year      = {2015},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228497}
}

@article{bb233522,
  author    = {Cao, Q.X. and Liang, X.D. and Li, B.L. and Lin, L.},
  title     = {Interpretable Visual Question Answering by Reasoning on Dependency
Trees},
  journal   = PAMI,
  volume    = {43},
  number    = {3},
  pages     = {887-901},
  month     = {March},
  year      = {2021},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228498}
}

@inproceedings{bb233523,
  author    = {Cao, Q.X. and Liang, X.D. and Li, B.L. and Li, G. and Lin, L.},
  title     = {Visual Question Reasoning on General Dependency Tree},
  booktitle = CVPR18,
  pages     = {7249-7257},
  year      = {2018},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228499}
}

@article{bb233524,
  author    = {Zhong, H.S. and Chen, J.Y. and Shen, C. and Zhang, H.W. and Huang, J.Q. and Hua, X.S.},
  title     = {Self-Adaptive Neural Module Transformer for Visual Question Answering},
  journal   = MultMed,
  volume    = {23},
  pages     = {1264-1273},
  year      = {2021},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228500}
}

@article{bb233525,
  author    = {Zheng, W.F. and Yin, L.R. and Chen, X.B. and Ma, Z.Y. and Liu, S. and Yang, B.},
  title     = {Knowledge base graph embedding module design for Visual question
answering model},
  journal   = PR,
  volume    = {120},
  pages     = {108153},
  year      = {2021},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228501}
}

@article{bb233526,
  author    = {Sharma, H. and Jalal, A.S.},
  title     = {Visual question answering model based on graph neural network and
contextual attention},
  journal   = IVC,
  volume    = {110},
  pages     = {104165},
  year      = {2021},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228502}
}

@article{bb233527,
  author    = {Song, L.Y. and Li, J. and Liu, J. and Yang, Y. and Shang, X. and Sun, M.X.},
  title     = {Answering knowledge-based visual questions via the exploration of
Question Purpose},
  journal   = PR,
  volume    = {133},
  pages     = {109015},
  year      = {2023},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228503}
}

@article{bb233528,
  author    = {MeshuWelde, T. and Liao, L.},
  title     = {Counting-based visual question answering with serial cascaded
attention deep learning},
  journal   = PR,
  volume    = {144},
  pages     = {109850},
  year      = {2023},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228504}
}

@article{bb233529,
  author    = {Liu, Y. and Li, G.B. and Lin, L.},
  title     = {Cross-Modal Causal Relational Reasoning for Event-Level Visual
Question Answering},
  journal   = PAMI,
  volume    = {45},
  number    = {10},
  pages     = {11624-11641},
  month     = {October},
  year      = {2023},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228505}
}

@inproceedings{bb233530,
  author    = {Cao, Q.X. and Wan, W.T. and Wang, K. and Liang, X.D. and Lin, L.},
  title     = {Linguistically Routing Capsule Network for Out-of-distribution Visual
Question Answering},
  booktitle = ICCV21,
  pages     = {1594-1603},
  year      = {2021},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228506}
}

@article{bb233531,
  author    = {Yang, S.W. and Xiao, L. and Wu, X.J. and Xu, J.J. and Wang, L.L. and He, L.},
  title     = {Simple contrastive learning in a self-supervised manner for robust
visual question answering},
  journal   = CVIU,
  volume    = {241},
  pages     = {103976},
  year      = {2024},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228507}
}

@article{bb233532,
  author    = {Wu, Y.L. and Pan, X. and Li, J.H. and Dou, S. and Wang, X.X.},
  title     = {Interpretable answer retrieval based on heterogeneous network
embedding},
  journal   = PRL,
  volume    = {182},
  pages     = {9-16},
  year      = {2024},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228508}
}

@article{bb233533,
  author    = {Luo, L. and Lai, H.J. and Pan, Y. and Yin, J.},
  title     = {Efficient Multimodal Selection for Retrieval in Knowledge-Based
Visual Question Answering},
  journal   = CirSysVideo,
  volume    = {35},
  number    = {6},
  pages     = {5195-5207},
  month     = {June},
  year      = {2025},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228509}
}

@inproceedings{bb233534,
  author    = {Zhang, Y. and Chen, H. and Frikha, A. and Krompass, D. and Zhang, G. and Gu, J.D. and Tresp, V.},
  title     = {CL-Cross VQA: A Continual Learning Benchmark for Cross-Domain Visual
Question Answering},
  booktitle = WACV25,
  pages     = {6269-6278},
  year      = {2025},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228510}
}

@inproceedings{bb233535,
  author    = {Marcu, A.M. and Chen, L. and Hunermann, J. and Karnsund, A. and Hanotte, B. and Chidananda, P. and Nair, S. and Badrinarayanan, V. and Kendall, A. and Shotton, J. and Arani, E. and Sinavski, O.},
  title     = {LingoQA: Visual Question Answering for Autonomous Driving},
  booktitle = ECCV24,
  pages     = {LXXVII: 252-269},
  year      = {2024},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228511}
}

@inproceedings{bb233536,
  author    = {Sima, C.H. and Renz, K. and Chitta, K. and Chen, L. and Zhang, H. and Xie, C.G. and Bei{\ss}wenger, J. and Luo, P. and Geiger, A. and Li, H.Y.},
  title     = {DriveLM: Driving with Graph Visual Question Answering},
  booktitle = ECCV24,
  pages     = {LII: 256-274},
  year      = {2024},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228512}
}

@inproceedings{bb233537,
  author    = {Feng, C. and Danier, D. and Zhang, F. and Bull, D.},
  title     = {RankDVQA: Deep VQA based on Ranking-inspired Hybrid Training},
  booktitle = WACV24,
  pages     = {1637-1647},
  year      = {2024},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228513}
}

@inproceedings{bb233538,
  author    = {Ishay, A. and Yang, Z. and Lee, J. and Kang, I. and Lim, D.J.},
  title     = {Think before You Simulate: Symbolic Reasoning to Orchestrate Neural
Computation for Counterfactual Question Answering},
  booktitle = WACV24,
  pages     = {6684-6693},
  year      = {2024},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228514}
}

@inproceedings{bb233539,
  author    = {Wang, Y. and Yasunaga, M. and Ren, H.Y. and Wada, S. and Leskovec, J.},
  title     = {VQA-GNN: Reasoning with Multimodal Knowledge via Graph Neural
Networks for Visual Question Answering},
  booktitle = ICCV23,
  pages     = {21525-21535},
  year      = {2023},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228515}
}

@inproceedings{bb233540,
  author    = {Souza, B. and Aasan, M. and Pedrini, H. and Rivera, A.R.},
  title     = {SelfGraphVQA: A Self-Supervised Graph Neural Network for Scene-based
Question Answering},
  booktitle = VLAR23,
  pages     = {4642-4647},
  year      = {2023},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228516}
}

@inproceedings{bb233541,
  author    = {Haisa, G. and Altenbek, G.},
  title     = {Question Classification Based on Weak Supervision and Interrogative
Pronouns Attention Mechanism},
  booktitle = {ICPR22},
  pages     = {2273-2278},
  year      = {2022},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228517}
}

@inproceedings{bb233542,
  author    = {Nguyen, B.X. and Do, T. and Tran, H. and Tjiputra, E. and Tran, Q.D. and Nguyen, A.},
  title     = {Coarse-to-Fine Reasoning for Visual Question Answering},
  booktitle = MULA22,
  pages     = {4557-4565},
  year      = {2022},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228518}
}

@inproceedings{bb233543,
  author    = {Liang, Y.Y. and Wang, X. and Duan, X.G. and Zhu, W.W.},
  title     = {Multi-modal Contextual Graph Neural Network for Text Visual Question
Answering},
  booktitle = ICPR21,
  pages     = {3491-3498},
  year      = {2021},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228519}
}

@inproceedings{bb233544,
  author    = {Patro, B.N. and Kurmi, V.K. and Kumar, S. and Namboodiri, V.P.},
  title     = {Deep Bayesian Network for Visual Question Generation},
  booktitle = WACV20,
  pages     = {1555-1565},
  year      = {2020},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228520}
}

@inproceedings{bb233545,
  author    = {Singh, A.K. and Mishra, A. and Shekhar, S. and Chakraborty, A.},
  title     = {From Strings to Things: Knowledge-Enabled VQA Model That Can Read and
Reason},
  booktitle = ICCV19,
  pages     = {4601-4611},
  year      = {2019},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228521}
}

@inproceedings{bb233546,
  author    = {Wilf, A. and Ma, M.Q. and Liang, P.P. and Zadeh, A. and Morency, L.P.},
  title     = {Face-to-Face Contrastive Learning for Social Intelligence
Question-Answering},
  booktitle = FG23,
  pages     = {1-7},
  year      = {2023},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228522}
}

@inproceedings{bb233547,
  author    = {Zadeh, A. and Chan, M. and Liang, P.P. and Tong, E. and Morency, L.P.},
  title     = {Social-IQ: A Question Answering Benchmark for Artificial Social
Intelligence},
  booktitle = CVPR19,
  pages     = {8799-8809},
  year      = {2019},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228523}
}

@inproceedings{bb233548,
  author    = {Ma, C. and Shen, C. and Dick, A. and Wu, Q. and Wang, P. and van den Hengel, A.J. and Reid, I.D.},
  title     = {Visual Question Answering with Memory-Augmented Networks},
  booktitle = CVPR18,
  pages     = {6975-6984},
  year      = {2018},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228524}
}

@inproceedings{bb233549,
  author    = {Shin, A. and Ushiku, Y. and Harada, T.},
  title     = {Customized Image Narrative Generation via Interactive Visual Question
Generation and Answering},
  booktitle = CVPR18,
  pages     = {8925-8933},
  year      = {2018},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228525}
}

@inproceedings{bb233550,
  author    = {Teney, D. and Anderson, P. and He, X. and van den Hengel, A.J.},
  title     = {Tips and Tricks for Visual Question Answering:
Learnings from the 2017 Challenge},
  booktitle = CVPR18,
  pages     = {4223-4232},
  year      = {2018},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228526}
}

@inproceedings{bb233551,
  author    = {Bai, Y.L. and Fu, J.L. and Zhao, T.J. and Mei, T.},
  title     = {Deep Attention Neural Tensor Network for Visual Question Answering},
  booktitle = ECCV18,
  pages     = {XII: 21-37},
  year      = {2018},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228527}
}

@inproceedings{bb233552,
  author    = {Sinha, A. and Ayush, K.},
  title     = {Towards Mathematical Reasoning: A Multimodal Deep Learning Approach},
  booktitle = ICIP18,
  pages     = {4028-4032},
  year      = {2018},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228528}
}

@inproceedings{bb233553,
  author    = {Rosso Mateus, A. and Gonzalez, F.A. and {Montes y Gomez}, M.},
  title     = {A Two-Step Neural Network Approach to Passage Retrieval for Open Domain
Question Answering},
  booktitle = CIARP17,
  pages     = {566-574},
  year      = {2017},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228529}
}

@inproceedings{bb233554,
  author    = {Zhu, C. and Zhao, Y. and Huang, S. and Tu, K. and Ma, Y.},
  title     = {Structured Attentions for Visual Question Answering},
  booktitle = ICCV17,
  pages     = {1300-1309},
  year      = {2017},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228530}
}

@inproceedings{bb233555,
  author    = {Hu, R. and Andreas, J. and Rohrbach, M. and Darrell, T.J. and Saenko, K.},
  title     = {Learning to Reason:
End-to-End Module Networks for Visual Question Answering},
  booktitle = ICCV17,
  pages     = {804-813},
  year      = {2017},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228531}
}

@inproceedings{bb233556,
  author    = {Peris, A. and Casacuberta, F.},
  title     = {Interactive-Predictive Neural Multimodal Systems},
  booktitle = IbPRIA19,
  pages     = {I:16-28},
  year      = {2019},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228532}
}

@inproceedings{bb233557,
  author    = {Bolanos, M. and Peris, A. and Casacuberta, F. and Radeva, P.},
  title     = {VIBIKNet: Visual Bidirectional Kernelized Network for Visual Question
Answering},
  booktitle = IbPRIA17,
  pages     = {372-380},
  year      = {2017},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228533}
}

@inproceedings{bb233558,
  author    = {Kafle, K. and Kanan, C.},
  title     = {An Analysis of Visual Question Answering Algorithms},
  booktitle = ICCV17,
  pages     = {1983-1991},
  year      = {2017},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228534}
}

@inproceedings{bb233559,
  author    = {Kafle, K. and Kanan, C.},
  title     = {Answer-Type Prediction for Visual Question Answering},
  booktitle = CVPR16,
  pages     = {4976-4984},
  year      = {2016},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228535}
}

@inproceedings{bb233560,
  author    = {Wang, P. and Wu, Q. and Shen, C. and van den Hengel, A.J.},
  title     = {The VQA-Machine: Learning How to Use Existing Vision Algorithms to
Answer New Questions},
  booktitle = CVPR17,
  pages     = {3909-3918},
  year      = {2017},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228536}
}

@inproceedings{bb233561,
  author    = {Yu, D. and Fu, J. and Mei, T. and Rui, Y.},
  title     = {Multi-level Attention Networks for Visual Question Answering},
  booktitle = CVPR17,
  pages     = {4187-4195},
  year      = {2017},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228537}
}

@inproceedings{bb233562,
  author    = {Ramakrishnan, S.K. and Pal, A. and Sharma, G. and Mittal, A.},
  title     = {An Empirical Evaluation of Visual Question Answering for Novel
Objects},
  booktitle = CVPR17,
  pages     = {7312-7321},
  year      = {2017},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqann4.html#TT228538}
}

@article{bb233563,
  author    = {Tamaazousti, Y. and Le Borgne, H. and Popescu, A. and Gadeski, E. and Ginsca, A. and Hudelot, C.},
  title     = {Vision-language integration using constrained local semantic features},
  journal   = CVIU,
  volume    = {163},
  number    = {1},
  pages     = {41-57},
  year      = {2017},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228539}
}

@article{bb233564,
  author    = {Gouthaman, K.V. and Nambiar, A. and Srinivas, K.S. and Mittal, A.},
  title     = {Linguistically-aware attention for reducing the semantic gap in
vision-language tasks},
  journal   = PR,
  volume    = {112},
  pages     = {107812},
  year      = {2021},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228540}
}

@article{bb233565,
  author    = {Zhou, K.Y. and Yang, J.K. and Loy, C.C. and Liu, Z.W.},
  title     = {Learning to Prompt for Vision-Language Models},
  journal   = IJCV,
  volume    = {130},
  number    = {9},
  pages     = {2337-2348},
  month     = {September},
  year      = {2022},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228541}
}

@inproceedings{bb233566,
  author    = {Zhou, K.Y. and Yang, J.K. and Loy, C.C. and Liu, Z.W.},
  title     = {Conditional Prompt Learning for Vision-Language Models},
  booktitle = CVPR22,
  pages     = {16795-16804},
  year      = {2022},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228542}
}

@article{bb233567,
  author    = {Ma, C.C. and Liu, Y. and Deng, J.K. and Xie, L.X. and Dong, W.M. and Xu, C.S.},
  title     = {Understanding and Mitigating Overfitting in Prompt Tuning for
Vision-Language Models},
  journal   = CirSysVideo,
  volume    = {33},
  number    = {9},
  pages     = {4616-4629},
  month     = {September},
  year      = {2023},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228543}
}

@article{bb233568,
  author    = {Zhu, Y.Q. and Li, X.Y. and Zheng, M. and Yang, J.H. and Wang, Z.H. and Guo, X.Q. and Chai, Z.F. and Yuan, Y.C. and Jiang, S.Q.},
  title     = {Focus and Align: Learning Tube Tokens for Video-Language Pre-Training},
  journal   = MultMed,
  volume    = {25},
  pages     = {8036-8050},
  year      = {2023},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228544}
}

@article{bb233569,
  author    = {Chen, C.Q. and Han, D. and Chang, C.C.},
  title     = {MPCCT: Multimodal vision-language learning paradigm with
context-based compact Transformer},
  journal   = PR,
  volume    = {147},
  pages     = {110084},
  year      = {2024},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228545}
}

@article{bb233570,
  author    = {Wu, W.H. and Sun, Z. and Song, Y.X. and Wang, J.D. and Ouyang, W.L.},
  title     = {Transferring Vision-Language Models for Visual Recognition:
A Classifier Perspective},
  journal   = IJCV,
  volume    = {132},
  number    = {2},
  pages     = {392-409},
  month     = {February},
  year      = {2024},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228546}
}

@article{bb233571,
  author    = {Ming, Y.F. and Li, Y.X.},
  title     = {How Does Fine-Tuning Impact Out-of-Distribution Detection for
Vision-Language Models?},
  journal   = IJCV,
  volume    = {132},
  number    = {2},
  pages     = {596-609},
  month     = {February},
  year      = {2024},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228547}
}

@article{bb233572,
  author    = {Zhao, C.R. and Wang, Y. and Jiang, X.Y. and Shen, Y.F. and Song, K. and Li, D.S. and Miao, D.Q.},
  title     = {Learning Domain Invariant Prompt for Vision-Language Models},
  journal   = IP,
  volume    = {33},
  pages     = {1348-1360},
  year      = {2024},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228548}
}

@article{bb233573,
  author    = {Yang, X.F. and Liu, F. and Lin, G.S.},
  title     = {Neural Logic Vision Language Explainer},
  journal   = MultMed,
  volume    = {26},
  pages     = {3331-3340},
  year      = {2024},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228549}
}

@article{bb233574,
  author    = {Wang, Y.D. and Yu, Z.O. and Wang, J.D. and Heng, Q. and Chen, H. and Ye, W. and Xie, R. and Xie, X. and Zhang, S.K.},
  title     = {Exploring Vision-Language Models for Imbalanced Learning},
  journal   = IJCV,
  volume    = {132},
  number    = {1},
  pages     = {224-237},
  month     = {January},
  year      = {2024},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228550}
}

@article{bb233575,
  author    = {Yu, Z.T. and Zhao, J. and Guo, C.L. and Yang, Y.},
  title     = {StableNet: Distinguishing the hard samples to overcome language
priors in visual question answering},
  journal   = IET-CV,
  volume    = {18},
  number    = {2},
  pages     = {315-327},
  year      = {2024},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228551}
}

@article{bb233576,
  author    = {Zeng, Y. and Zhang, X. and Li, H. and Wang, J.W. and Zhang, J.P. and Zhou, W.},
  title     = {X2-VLM: All-in-One Pre-Trained Model for Vision-Language Tasks},
  journal   = PAMI,
  volume    = {46},
  number    = {5},
  pages     = {3156-3168},
  month     = {May},
  year      = {2024},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228552}
}

@article{bb233577,
  author    = {Zheng, Y.Z. and Zhong, B. and Liang, Q.H. and Li, G.R. and Ji, R.R. and Li, X.X.},
  title     = {Toward Unified Token Learning for Vision-Language Tracking},
  journal   = CirSysVideo,
  volume    = {34},
  number    = {4},
  pages     = {2125-2135},
  month     = {April},
  year      = {2024},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228553}
}

@article{bb233578,
  author    = {Ye, P. and Xiao, G. and Liu, J.},
  title     = {Multimodal Features Alignment for Vision-Language Object Tracking},
  journal   = RS,
  volume    = {16},
  number    = {7},
  pages     = {1168},
  year      = {2024},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228554}
}

@article{bb233579,
  author    = {Bazi, Y. and Bashmal, L. and Rahhal, M.M.A. and Ricci, R. and Melgani, F.},
  title     = {RS-LLaVA: A Large Vision-Language Model for Joint Captioning and
Question Answering in Remote Sensing Imagery},
  journal   = RS,
  volume    = {16},
  number    = {9},
  pages     = {1477},
  year      = {2024},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228555}
}

@article{bb233580,
  author    = {Kong, D. and Kong, K. and Kang, S.J.},
  title     = {Image clustering using generated text centroids},
  journal   = SP:IC,
  volume    = {125},
  pages     = {117128},
  year      = {2024},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228556}
}

@article{bb233581,
  author    = {Chen, X.Y. and Yang, J.H. and Chen, S. and Wang, L. and Jiang, M. and Zhao, Q.},
  title     = {Every Problem, Every Step, All in Focus: Learning to Solve
Vision-Language Problems With Integrated Attention},
  journal   = PAMI,
  volume    = {46},
  number    = {7},
  pages     = {4720-4735},
  month     = {July},
  year      = {2024},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228557}
}

@article{bb233582,
        AUTHOR = "Menon, S. and Chandratreya, I.P. and Vondrick, C.",
        TITLE = "Task Bias in Contrastive Vision-Language Models",
        JOURNAL = IJCV,
        VOLUME = "132",
        YEAR = "2024",
        NUMBER = "6",
        MONTH = jun,
        PAGES = "2026--2040",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228558"}

@article{bb233583,
        AUTHOR = "Zhang, J.Y. and Huang, J.X. and Jin, S. and Lu, S.J.",
        TITLE = "Vision-Language Models for Vision Tasks: A Survey",
        JOURNAL = PAMI,
        VOLUME = "46",
        YEAR = "2024",
        NUMBER = "8",
        MONTH = aug,
        PAGES = "5625--5644",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228559"}

@article{bb233584,
  author    = {Dong, M.P. and Li, F. and Li, Z.B. and Liu, X.},
  title     = {Cluster prototype earth mover's distance adapters and
               alignment-guided prompt learning for vision-language models},
  journal   = PR,
  volume    = {156},
  year      = {2024},
  pages     = {110861},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228560}
}

@article{bb233585,
        AUTHOR = "Liu, Y. and Pan, Y. and Yin, J.",
        TITLE = "Enhancing Multi-Label Deep Hashing for Image and Audio With Joint
                 Internal Global Loss Constraints and Large Vision-Language Model",
        JOURNAL = SPLetters,
        VOLUME = "31",
        YEAR = "2024",
        PAGES = "2550--2554",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228561"}

@article{bb233586,
        AUTHOR = "Zhan, C.L. and Zhang, Y.F. and Lin, Y. and Wang, G.A. and Wang, H.W.",
        TITLE = "{UniDCP}: Unifying Multiple Medical Vision-Language Tasks via Dynamic
                 Cross-Modal Learnable Prompts",
        JOURNAL = MultMed,
        VOLUME = "26",
        YEAR = "2024",
        PAGES = "9736--9748",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228562"}

@article{bb233587,
        AUTHOR = "Su, K. and Zhang, X.X. and Zhang, S.Y. and Zhu, J. and Zhang, B.",
        TITLE = "To Boost Zero-Shot Generalization for Embodied Reasoning With
                 Vision-Language Pre-Training",
        JOURNAL = IP,
        VOLUME = "33",
        YEAR = "2024",
        PAGES = "5370--5381",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228563"}

@article{bb233588,
        AUTHOR = "Xuan, S.Y. and Yang, M. and Zhang, S.L.",
        TITLE = "Adapting Vision-Language Models via Learning to Inject Knowledge",
        JOURNAL = IP,
        VOLUME = "33",
        YEAR = "2024",
        PAGES = "5798--5809",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228564"}

@article{bb233589,
        AUTHOR = "Zhou, W. and Zhou, Z.H.",
        TITLE = "Unsupervised Domain Adaption Harnessing Vision-Language Pre-Training",
        JOURNAL = CirSysVideo,
        VOLUME = "34",
        YEAR = "2024",
        NUMBER = "9",
        MONTH = sep,
        PAGES = "8201--8214",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228565"}

@article{bb233590,
        AUTHOR = "Guo, M.H. and Zhang, Y. and Mu, T.J. and Huang, S.X. and Hu, S.M.",
        TITLE = "Tuning Vision-Language Models With Multiple Prototypes Clustering",
        JOURNAL = PAMI,
        VOLUME = "46",
        YEAR = "2024",
        NUMBER = "12",
        MONTH = dec,
        PAGES = "11186--11199",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228566"}

@article{bb233591,
        AUTHOR = "Sun, B. and Wu, Z.C. and Zhang, H. and He, J.",
        TITLE = "{VTPL}: Visual and text prompt learning for visual-language models",
        JOURNAL = JVCIR,
        VOLUME = "104",
        YEAR = "2024",
        PAGES = "104280",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228567"}

@article{bb233592,
        AUTHOR = "Liu, L.C. and Wang, N.N. and Liu, D. and Yang, X. and Gao, X.B. and Liu, T.L.",
        TITLE = "Towards Specific Domain Prompt Learning via Improved Text Label
                 Optimization",
        JOURNAL = MultMed,
        VOLUME = "26",
        YEAR = "2024",
        PAGES = "10805--10815",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228568"}

@article{bb233593,
        AUTHOR = "Liu, X. and Wu, J. and Yang, W.F. and Zhou, X. and Zhang, T.Z.",
        TITLE = "Multi-Modal Attribute Prompting for Vision-Language Models",
        JOURNAL = CirSysVideo,
        VOLUME = "34",
        YEAR = "2024",
        NUMBER = "11",
        MONTH = nov,
        PAGES = "11579--11591",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228569"}

@article{bb233594,
  author    = {Jiang, H.J. and Zhang, J.K. and Huang, R. and Ge, C.J. and Ni, Z. and Song, S. and Huang, G.},
  title     = {Cross-modal adapter for vision-language retrieval},
  journal   = PR,
  volume    = {159},
  year      = {2025},
  pages     = {111144},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228570}
}

@article{bb233595,
        AUTHOR = "Tan, Y.T. and Chen, Y.Y. and Wang, J.Q.",
        TITLE = "{DSTA}: Reinforcing Vision-Language Understanding for Scene-Text {VQA}
                 With Dual-Stream Training Approach",
        JOURNAL = SPLetters,
        VOLUME = "32",
        YEAR = "2025",
        PAGES = "6--10",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228571"}

@article{bb233596,
        AUTHOR = "Yellinek, N. and Karlinsky, L. and Giryes, R.",
        TITLE = "{3VL}: Using Trees to Improve Vision-Language Models' Interpretability",
        JOURNAL = IP,
        VOLUME = "34",
        YEAR = "2025",
        PAGES = "495--509",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228572"}

@article{bb233597,
        AUTHOR = "Yang, L.F. and Li, X. and Wang, Y.Z. and Wang, X.L. and Yang, J.",
        TITLE = "Fine-Grained Visual Text Prompting",
        JOURNAL = PAMI,
        VOLUME = "47",
        YEAR = "2025",
        NUMBER = "3",
        MONTH = mar,
        PAGES = "1594--1609",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228573"}

@article{bb233598,
        AUTHOR = "Wang, F. and Han, Z.Y. and Liu, X. and Yin, Y.L. and Gao, X.",
        TITLE = "{CTPT}: Continual Test-time Prompt Tuning for vision-language models",
        JOURNAL = PR,
        VOLUME = "161",
        YEAR = "2025",
        PAGES = "111300",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228574"}

@article{bb233599,
        AUTHOR = "Liang, N. and Liu, Y.",
        TITLE = "{DPO}: Discrete Prompt Optimization for Vision-Language Models",
        JOURNAL = SPLetters,
        VOLUME = "32",
        YEAR = "2025",
        PAGES = "671--675",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT228575"}

Last update: Sep 10, 2025 at 12:00:25