@inproceedings{bb237200,
        AUTHOR = "Li, J.C. and Tang, S.L. and Zhu, L.C. and Shi, H. and Huang, X. and Wu, F. and Yang, Y. and Zhuang, Y.T.",
        TITLE = "Adaptive Hierarchical Graph Reasoning with Semantic Coherence for
Video-and-Language Inference",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "1847--1857",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT232129"}

@inproceedings{bb237201,
        AUTHOR = "Zhang, M.X. and Yang, Y. and Chen, X. and Ji, Y.L. and Xu, X. and Li, J.J. and Shen, H.T.",
        TITLE = "Multi-stage Aggregated Transformer Network for Temporal Language
Localization in Videos",
        BOOKTITLE = CVPR21,
        YEAR = "2021",
        PAGES = "12664--12673",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT232130"}

@inproceedings{bb237202,
        AUTHOR = "Kim, N. and Ha, S.J. and Kang, J.W.",
        TITLE = "Video Question Answering Using Language-Guided Deep Compressed-Domain
Video Feature",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "1688--1697",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT232131"}

@inproceedings{bb237203,
        AUTHOR = "Liu, F. and Liu, J. and Wang, W.N. and Lu, H.Q.",
        TITLE = "{HAIR}: Hierarchical Visual-Semantic Relational Reasoning for Video
Question Answering",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "1678--1687",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT232132"}

@inproceedings{bb237204,
        AUTHOR = "Gao, D.F. and Wang, R.P. and Bai, Z. and Chen, X.L.",
        TITLE = "{Env-QA}: A Video Question Answering Benchmark for Comprehensive
Understanding of Dynamic Environments",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "1655--1665",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT232133"}

% The degree sign is written as braced math so the entry stays ASCII-only for classic BibTeX.
@inproceedings{bb237205,
        AUTHOR = "Yun, H. and Yu, Y. and Yang, W. and Lee, K. and Kim, G.",
        TITLE = "{Pano-AVQA}: Grounded Audio-Visual Question Answering on 360{$^\circ$} Videos",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "2011--2021",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT232134"}

@inproceedings{bb237206,
        AUTHOR = "Xu, L. and Huang, H. and Liu, J.",
        TITLE = "{SUTD-TrafficQA}: A Question Answering Benchmark and an Efficient
Network for Video Reasoning over Traffic Events",
        BOOKTITLE = CVPR21,
        YEAR = "2021",
        PAGES = "9873--9883",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT232135"}

@inproceedings{bb237207,
        AUTHOR = "Park, J. and Lee, J.Y. and Sohn, K.H.",
        TITLE = "Bridge to Answer: Structure-aware Graph Interaction Network for Video
Question Answering",
        BOOKTITLE = CVPR21,
        YEAR = "2021",
        PAGES = "15521--15530",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT232136"}

@inproceedings{bb237208,
        AUTHOR = "Chen, X.W. and Liu, R. and Song, X.M. and Han, Y.H.",
        TITLE = "Locating Visual Explanations for Video Question Answering",
        BOOKTITLE = MMMod21,
        YEAR = "2021",
        PAGES = "I:290--302",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT232137"}

@inproceedings{bb237209,
        AUTHOR = "Garcia, N. and Nakashima, Y.",
        TITLE = "Knowledge-based Video Question Answering with Unsupervised Scene
Descriptions",
        BOOKTITLE = ECCV20,
        YEAR = "2020",
        PAGES = "XVIII:581--598",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT232138"}

@inproceedings{bb237210,
        AUTHOR = "Kim, J. and Ma, M. and Pham, T. and Kim, K. and Yoo, C.D.",
        TITLE = "Modality Shifting Attention Network for Multi-Modal Video Question
Answering",
        BOOKTITLE = CVPR20,
        YEAR = "2020",
        PAGES = "10103--10112",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT232139"}

@inproceedings{bb237211,
        AUTHOR = "Jiang, M. and Chen, S. and Yang, J. and Zhao, Q.",
        TITLE = "Fantastic Answers and Where to Find Them: Immersive Question-Directed
Visual Attention",
        BOOKTITLE = CVPR20,
        YEAR = "2020",
        PAGES = "2977--2986",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT232140"}

@inproceedings{bb237212,
        AUTHOR = "Yang, Z. and Garcia, N. and Chu, C. and Otani, M. and Nakashima, Y. and Takemura, H.",
        TITLE = "{BERT} Representations for Video Question Answering",
        BOOKTITLE = WACV20,
        YEAR = "2020",
        PAGES = "1545--1554",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT232141"}

@inproceedings{bb237213,
        AUTHOR = "Fan, C.Y. and Zhang, X.F. and Zhang, S. and Wang, W.S. and Zhang, C. and Huang, H.",
        TITLE = "Heterogeneous Memory Enhanced Multimodal Attention Model for Video
Question Answering",
        BOOKTITLE = CVPR19,
        YEAR = "2019",
        PAGES = "1999--2007",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT232142"}

@inproceedings{bb237214,
        AUTHOR = "Kim, J.Y. and Ma, M. and Kim, K. and Kim, S. and Yoo, C.D.",
        TITLE = "Progressive Attention Memory Network for Movie Story Question Answering",
        BOOKTITLE = CVPR19,
        YEAR = "2019",
        PAGES = "8329--8338",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT232143"}

@inproceedings{bb237215,
        AUTHOR = "Liu, C.N. and Chen, D.J. and Chen, H.T. and Liu, T.L.",
        TITLE = "{A2A}: Attention to Attention Reasoning for Movie Question Answering",
        BOOKTITLE = ACCV18,
        YEAR = "2018",
        PAGES = "VI:404--419",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT232144"}

@inproceedings{bb237216,
        AUTHOR = "Gao, J. and Ge, R. and Chen, K. and Nevatia, R.",
        TITLE = "Motion-Appearance Co-memory Networks for Video Question Answering",
        BOOKTITLE = CVPR18,
        YEAR = "2018",
        PAGES = "6576--6585",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT232145"}

@inproceedings{bb237217,
        AUTHOR = "Kim, K.M. and Choi, S.H. and Kim, J.H. and Zhang, B.T.",
        TITLE = "Multimodal Dual Attention Memory for Video Story Question Answering",
        BOOKTITLE = ECCV18,
        YEAR = "2018",
        PAGES = "XV: 698--713",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT232146"}

@inproceedings{bb237218,
        AUTHOR = "Yu, Y.J. and Kim, J.S. and Kim, G.",
        TITLE = "A Joint Sequence Fusion Model for Video Question Answering and
Retrieval",
        BOOKTITLE = ECCV18,
        YEAR = "2018",
        PAGES = "VII: 487--503",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT232147"}

@inproceedings{bb237219,
        AUTHOR = "Hasan Chowdhury, M.I. and Nguyen, K. and Sridharan, S. and Fookes, C.",
        TITLE = "Hierarchical Relational Attention for Video Question Answering",
        BOOKTITLE = ICIP18,
        YEAR = "2018",
        PAGES = "599--603",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT232148"}

@inproceedings{bb237220,
        AUTHOR = "Mun, J. and Seo, P.H. and Jung, I. and Han, B.H.",
        TITLE = "{MarioQA}: Answering Questions by Watching Gameplay Videos",
        BOOKTITLE = ICCV17,
        YEAR = "2017",
        PAGES = "2886--2894",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT232149"}

@inproceedings{bb237221,
        AUTHOR = "Yu, Y. and Ko, H. and Choi, J. and Kim, G.",
        TITLE = "End-to-End Concept Word Detection for Video Captioning, Retrieval,
and Question Answering",
        BOOKTITLE = CVPR17,
        YEAR = "2017",
        PAGES = "3261--3269",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT232150"}

@article{bb237222,
        AUTHOR = "Kafle, K. and Kanan, C.",
        TITLE = "Visual question answering:
Datasets, algorithms, and future challenges",
        JOURNAL = CVIU,
        VOLUME = "163",
        YEAR = "2017",
        NUMBER = "1",
        PAGES = "3--20",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232153"}

@article{bb237223,
        AUTHOR = "Wu, Q. and Teney, D. and Wang, P. and Shen, C.H. and Dick, A. and van den Hengel, A.J.",
        TITLE = "Visual question answering: A survey of methods and datasets",
        JOURNAL = CVIU,
        VOLUME = "163",
        YEAR = "2017",
        NUMBER = "1",
        PAGES = "21--40",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232154"}

@article{bb237224,
        AUTHOR = "Teney, D. and Wu, Q. and van den Hengel, A.J.",
        TITLE = "Visual Question Answering: A Tutorial",
        JOURNAL = SPMag,
        VOLUME = "34",
        YEAR = "2017",
        NUMBER = "6",
        MONTH = nov,
        PAGES = "63--75",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232155"}

@inproceedings{bb237225,
        AUTHOR = "Teney, D. and Liu, L. and van den Hengel, A.J.",
        TITLE = "Graph-Structured Representations for Visual Question Answering",
        BOOKTITLE = CVPR17,
        YEAR = "2017",
        PAGES = "3233--3241",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232156"}

@inproceedings{bb237226,
        AUTHOR = "Teney, D. and van den Hengel, A.J.",
        TITLE = "Visual Question Answering as a Meta Learning Task",
        BOOKTITLE = ECCV18,
        YEAR = "2018",
        PAGES = "XV: 229--245",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232157"}

@inproceedings{bb237227,
        AUTHOR = "Teney, D. and Abbasnejad, E. and van den Hengel, A.J.",
        TITLE = "Unshuffling Data for Improved Generalization in Visual Question
Answering",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "1397--1407",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232158"}

@article{bb237228,
        AUTHOR = "Wu, Q. and Shen, C.H. and Wang, P. and Dick, A. and van den Hengel, A.J.",
        TITLE = "Image Captioning and Visual Question Answering Based on Attributes
and External Knowledge",
        JOURNAL = PAMI,
        VOLUME = "40",
        YEAR = "2018",
        NUMBER = "6",
        MONTH = jun,
        PAGES = "1367--1381",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232159"}

@inproceedings{bb237229,
        AUTHOR = "Wu, Q. and Wang, P. and Shen, C.H. and Dick, A. and van den Hengel, A.J.",
        TITLE = "Ask Me Anything: Free-Form Visual Question Answering Based on
Knowledge from External Sources",
        BOOKTITLE = CVPR16,
        YEAR = "2016",
        PAGES = "4622--4630",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232160"}

@article{bb237230,
        AUTHOR = "Tommasi, T. and Mallya, A. and Plummer, B.A. and Lazebnik, S. and Berg, A.C. and Berg, T.L.",
        TITLE = "Combining Multiple Cues for {Visual Madlibs} Question Answering",
        JOURNAL = IJCV,
        VOLUME = "127",
        YEAR = "2019",
        NUMBER = "1",
        MONTH = jan,
        PAGES = "38--60",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232161"}

% NOTE(review): the upstream PAGES value was the placeholder xx-yy, which would print
% literally in the bibliography; the field is omitted until real page numbers are known.
@inproceedings{bb237231,
        AUTHOR = "Tommasi, T. and Mallya, A. and Plummer, B.A. and Lazebnik, S. and Berg, A.C. and Berg, T.L.",
        TITLE = "Solving {Visual Madlibs} with Multiple Cues",
        BOOKTITLE = BMVC16,
        YEAR = "2016",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232162"}

@inproceedings{bb237232,
        AUTHOR = "Yu, L.C. and Park, E. and Berg, A.C. and Berg, T.L.",
        TITLE = "{Visual Madlibs}:
Fill in the Blank Description Generation and Question Answering",
        BOOKTITLE = ICCV15,
        YEAR = "2015",
        PAGES = "2461--2469",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232163"}

@article{bb237233,
        AUTHOR = "Liu, F. and Xiang, T. and Hospedales, T.M. and Yang, W.K. and Sun, C.Y.",
        TITLE = "Inverse Visual Question Answering:
A New Benchmark and {VQA} Diagnosis Tool",
        JOURNAL = PAMI,
        VOLUME = "42",
        YEAR = "2020",
        NUMBER = "2",
        MONTH = feb,
        PAGES = "460--474",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232164"}

@inproceedings{bb237234,
        AUTHOR = "Liu, F. and Xiang, T. and Hospedales, T.M. and Yang, W.K. and Sun, C.Y.",
        TITLE = "{iVQA}: Inverse Visual Question Answering",
        BOOKTITLE = CVPR18,
        YEAR = "2018",
        PAGES = "8611--8619",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232165"}

% NOTE(review): the upstream PAGES value was the placeholder xx-yy, which would print
% literally in the bibliography; the field is omitted until the real article number is known.
@article{bb237235,
        AUTHOR = "Patil, C. and Patwardhan, M.",
        TITLE = "Visual Question Generation: The State of the Art",
        JOURNAL = Surveys,
        VOLUME = "53",
        YEAR = "2020",
        NUMBER = "3",
        MONTH = may,
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232166"}

@article{bb237236,
  author    = {He, F.J. and Wang, Y.X. and Miao, X.L. and Sun, X.},
  title     = {Interpretable visual reasoning: A survey},
  journal   = IVC,
  volume    = {112},
  pages     = {104194},
  year      = {2021},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232167},
}

@article{bb237237,
  author    = {Sharma, H. and Jalal, A.S.},
  title     = {A survey of methods, datasets and evaluation metrics for visual question answering},
  journal   = IVC,
  volume    = {116},
  pages     = {104327},
  year      = {2021},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232168},
}

@article{bb237238,
        AUTHOR = "Yang, L. and Jiang, H. and Song, Q. and Guo, J.",
        TITLE = "A Survey on Long-Tailed Visual Recognition",
        JOURNAL = IJCV,
        VOLUME = "130",
        YEAR = "2022",
        NUMBER = "7",
        MONTH = jul,
        PAGES = "1837--1872",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232169"}

@article{bb237239,
        AUTHOR = "Zhao, W.L. and Rao, Y.M. and Tang, Y.S. and Zhou, J. and Lu, J.W.",
        TITLE = "{VideoABC}: A Real-World Video Dataset for Abductive Visual Reasoning",
        JOURNAL = IP,
        VOLUME = "31",
        YEAR = "2022",
        PAGES = "6048--6061",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232170"}

@article{bb237240,
        AUTHOR = "Lahouti, F. and Kostina, V. and Hassibi, B.",
        TITLE = "How to Query an Oracle? Efficient Strategies to Label Data",
        JOURNAL = PAMI,
        VOLUME = "44",
        YEAR = "2022",
        NUMBER = "11",
        MONTH = nov,
        PAGES = "7597--7609",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232171"}

@article{bb237241,
        AUTHOR = "Ma, J. and Wang, P.H. and Kong, D.C. and Wang, Z.W. and Liu, J. and Pei, H.B. and Zhao, J.Z.",
        TITLE = "Robust Visual Question Answering: Datasets, Methods, and Future
Challenges",
        JOURNAL = PAMI,
        VOLUME = "46",
        YEAR = "2024",
        NUMBER = "8",
        MONTH = aug,
        PAGES = "5575--5594",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232172"}

@article{bb237242,
        AUTHOR = "Li, K. and Vosselman, G. and Yang, M.Y.",
        TITLE = "{HRVQA}: A Visual Question Answering benchmark for high-resolution
aerial images",
        JOURNAL = PandRS,
        VOLUME = "214",
        YEAR = "2024",
        PAGES = "65--81",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232173"}

@inproceedings{bb237243,
        AUTHOR = "Chen, C.Y. and Liu, M.C. and Codella, N. and Li, Y.S. and Yuan, L. and Gurari, D.",
        TITLE = "Fully Authentic Visual Question Answering Dataset from Online
Communities",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XLVIII: 252--269",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232174"}

@inproceedings{bb237244,
        AUTHOR = "Singh, M. and Patvardhan, C. and Lakshmi, C.V.",
        TITLE = "Does {ChatGPT} Spell the End of Automatic Question Generation Research?",
        BOOKTITLE = ICCVMI23,
        YEAR = "2023",
        PAGES = "1--6",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232175"}

@inproceedings{bb237245,
        AUTHOR = "Zhu, L. and Ning, R. and Li, J. and Xin, C.S. and Wu, H.Y.",
        TITLE = "Most and Least Retrievable Images in Visual-Language Query Systems",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVII:1--18",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232176"}

@inproceedings{bb237246,
        AUTHOR = "Salewski, L. and Emde, C. and Do, V. and Akata, Z. and Lukasiewicz, T.",
        TITLE = "{e-ViL}: A Dataset and Benchmark for Natural Language Explanations in
Vision-Language Tasks",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "1224--1234",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232177"}

@inproceedings{bb237247,
        AUTHOR = "Gupta, V. and Patro, B.N. and Parihar, H. and Namboodiri, V.P.",
        TITLE = "{VQuAD}: Video Question Answering Diagnostic Dataset",
        BOOKTITLE = Novelty22,
        YEAR = "2022",
        PAGES = "282--291",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232178"}

@inproceedings{bb237248,
        AUTHOR = "Nishimura, T. and Sakoda, K. and Hashimoto, A. and Ushiku, Y. and Tanaka, N. and Ono, F. and Kameko, H. and Mori, S.",
        TITLE = "Egocentric Biochemical Video-and-Language Dataset",
        BOOKTITLE = CLVL21,
        YEAR = "2021",
        PAGES = "3122--3126",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232179"}

@inproceedings{bb237249,
        AUTHOR = "Zhang, M. and Maidment, T. and Diab, A. and Kovashka, A. and Hwa, R.",
        TITLE = "Domain-robust {VQA} with diverse datasets and methods but no target
labels",
        BOOKTITLE = CVPR21,
        YEAR = "2021",
        PAGES = "7042--7052",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232180"}

@inproceedings{bb237250,
        AUTHOR = "Mathew, M. and Karatzas, D. and Jawahar, C.V.",
        TITLE = "{DocVQA}: A Dataset for {VQA} on Document Images",
        BOOKTITLE = WACV21,
        YEAR = "2021",
        PAGES = "2199--2208",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232181"}

@inproceedings{bb237251,
        AUTHOR = "Patel, D. and Parikh, R. and Shastri, Y.",
        TITLE = "Recent Advances in Video Question Answering:
A Review of Datasets and Methods",
        BOOKTITLE = VTIUR20,
        YEAR = "2020",
        PAGES = "339--356",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232182"}

@inproceedings{bb237252,
        AUTHOR = "Fan, C.",
        TITLE = "{EgoVQA}: An Egocentric Video Question Answering Benchmark Dataset",
        BOOKTITLE = EPIC19,
        YEAR = "2019",
        PAGES = "4359--4366",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232183"}

@inproceedings{bb237253,
        AUTHOR = "Hudson, D.A. and Manning, C.D.",
        TITLE = "{GQA}: A New Dataset for Real-World Visual Reasoning and Compositional
Question Answering",
        BOOKTITLE = CVPR19,
        YEAR = "2019",
        PAGES = "6693--6702",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232184"}

@inproceedings{bb237254,
        AUTHOR = "Yang, G.Y.R. and Ganichev, I. and Wang, X.J. and Shlens, J. and Sussillo, D.",
        TITLE = "A Dataset and Architecture for Visual Reasoning with a Working Memory",
        BOOKTITLE = ECCV18,
        YEAR = "2018",
        PAGES = "X: 729--745",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232185"}

@inproceedings{bb237255,
        AUTHOR = "Gan, C. and Li, Y. and Li, H. and Sun, C. and Gong, B.",
        TITLE = "{VQS}: Linking Segmentations to Questions and Answers for Supervised
Attention in {VQA} and Question-Focused Semantic Segmentation",
        BOOKTITLE = ICCV17,
        YEAR = "2017",
        PAGES = "1829--1838",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232186"}

@inproceedings{bb237256,
        AUTHOR = "Maharaj, T. and Ballas, N. and Rohrbach, A. and Courville, A. and Pal, C.",
        TITLE = "A Dataset and Exploration of Models for Understanding Video Data
through Fill-in-the-Blank Question-Answering",
        BOOKTITLE = CVPR17,
        YEAR = "2017",
        PAGES = "7359--7368",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT232187"}

@article{bb237257,
        AUTHOR = "Das, A. and Kottur, S. and Gupta, K. and Singh, A. and Yadav, D. and Lee, S. and Moura, J.M.F. and Parikh, D. and Batra, D.",
        TITLE = "Visual Dialog",
        JOURNAL = PAMI,
        VOLUME = "41",
        YEAR = "2019",
        NUMBER = "5",
        MONTH = may,
        PAGES = "1242--1256",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232188"}

@article{bb237258,
        AUTHOR = "Zhao, Z. and Zhang, Z. and Jiang, X.H. and Cai, D.",
        TITLE = "Multi-Turn Video Question Answering via Hierarchical Attention
Context Reinforced Networks",
        JOURNAL = IP,
        VOLUME = "28",
        YEAR = "2019",
        NUMBER = "8",
        MONTH = aug,
        PAGES = "3860--3872",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232189"}

@article{bb237259,
        AUTHOR = "Gu, M. and Zhao, Z. and Jin, W. and Cai, D. and Wu, F.",
        TITLE = "Video Dialog via Multi-Grained Convolutional Self-Attention Context
Multi-Modal Networks",
        JOURNAL = CirSysVideo,
        VOLUME = "30",
        YEAR = "2020",
        NUMBER = "12",
        MONTH = dec,
        PAGES = "4453--4466",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232190"}

@article{bb237260,
        AUTHOR = "Guo, D. and Wang, H. and Wang, S. and Wang, M.",
        TITLE = "Textual-Visual Reference-Aware Attention Network for Visual Dialog",
        JOURNAL = IP,
        VOLUME = "29",
        YEAR = "2020",
        PAGES = "6655--6666",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232191"}

@article{bb237261,
  author    = {Patro, B.N. and Anupriy and Namboodiri, V.P.},
  title     = {Probabilistic framework for solving visual dialog},
  journal   = PR,
  volume    = {110},
  pages     = {107586},
  year      = {2021},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232192},
}

@article{bb237262,
        AUTHOR = "Zhao, L. and Lyu, X.Y. and Song, J.K. and Gao, L.L.",
        TITLE = "{GuessWhich}? Visual dialog with attentive memory network",
        JOURNAL = PR,
        VOLUME = "114",
        YEAR = "2021",
        PAGES = "107823",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232193"}

@article{bb237263,
  author    = {Jiang, T.L. and Shao, H.L. and Tian, X. and Ji, Y. and Liu, C.P.},
  title     = {Aligning vision-language for graph inference in visual dialog},
  journal   = IVC,
  volume    = {116},
  pages     = {104316},
  year      = {2021},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232194},
}

@article{bb237264,
        AUTHOR = "Guo, D. and Wang, H. and Wang, M.",
        TITLE = "Context-Aware Graph Inference With Knowledge Distillation for Visual
Dialog",
        JOURNAL = PAMI,
        VOLUME = "44",
        YEAR = "2022",
        NUMBER = "10",
        MONTH = oct,
        PAGES = "6056--6073",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232195"}

@inproceedings{bb237265,
        AUTHOR = "Guo, D. and Wang, H. and Zhang, H.W. and Zha, Z.J. and Wang, M.",
        TITLE = "Iterative Context-Aware Graph Inference for Visual Dialog",
        BOOKTITLE = CVPR20,
        YEAR = "2020",
        PAGES = "10052--10061",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232196"}

@article{bb237266,
        AUTHOR = "Patro, B.N. and Anupriy and Namboodiri, V.P.",
        TITLE = "Explanation vs. attention: A two-player game to obtain attention for
{VQA} and visual dialog",
        JOURNAL = PR,
        VOLUME = "132",
        YEAR = "2022",
        PAGES = "108898",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232197"}

@article{bb237267,
        AUTHOR = "Zhu, Y. and Wu, Y. and Yang, Y. and Yan, Y.",
        TITLE = "Saying the Unseen: Video Descriptions via Dialog Agents",
        JOURNAL = PAMI,
        VOLUME = "44",
        YEAR = "2022",
        NUMBER = "10",
        MONTH = oct,
        PAGES = "7190--7204",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232198"}

@article{bb237268,
        AUTHOR = "Huang, Y. and Wang, Y.M. and Wang, L.",
        TITLE = "Efficient Image and Sentence Matching",
        JOURNAL = PAMI,
        VOLUME = "45",
        YEAR = "2023",
        NUMBER = "3",
        MONTH = mar,
        PAGES = "2970--2983",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232199"}

@article{bb237269,
        AUTHOR = "Zhao, L. and Li, J.L. and Gao, L.L. and Rao, Y. and Song, J.K. and Shen, H.T.",
        TITLE = "Heterogeneous Knowledge Network for Visual Dialog",
        JOURNAL = CirSysVideo,
        VOLUME = "33",
        YEAR = "2023",
        NUMBER = "2",
        MONTH = feb,
        PAGES = "861--871",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232200"}

@article{bb237270,
        AUTHOR = "Bucinca, Z. and Yemez, Y. and Erzin, E. and Sezgin, M.",
        TITLE = "{AffectON}: Incorporating Affect Into Dialog Generation",
        JOURNAL = AffCom,
        VOLUME = "14",
        YEAR = "2023",
        NUMBER = "1",
        MONTH = jan,
        PAGES = "823--835",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232201"}

% NOTE(review): title corrected from "a asyntactic" to "a syntactic"; confirm against the publisher record.
@article{bb237271,
        AUTHOR = "Yu, H. and Ko, Y.J.",
        TITLE = "Enriching the dialogue state tracking model with a syntactic
discourse graph",
        JOURNAL = PRL,
        VOLUME = "169",
        YEAR = "2023",
        PAGES = "81--86",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232202"}

@article{bb237272,
        AUTHOR = "Wu, Y.X. and Liao, L. and Zhang, G.Y. and Lei, W.Q. and Zhao, G.S. and Qian, X.M. and Chua, T.S.",
        TITLE = "State Graph Reasoning for Multimodal Conversational Recommendation",
        JOURNAL = MultMed,
        VOLUME = "25",
        YEAR = "2023",
        PAGES = "3113--3124",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232203"}

@article{bb237273,
        AUTHOR = "Firdaus, M. and Thangavelu, N. and Ekbal, A. and Bhattacharyya, P.",
        TITLE = "I Enjoy Writing and Playing, Do You?: A Personalized and Emotion
Grounded Dialogue Agent Using Generative Adversarial Network",
        JOURNAL = AffCom,
        VOLUME = "14",
        YEAR = "2023",
        NUMBER = "3",
        MONTH = jul,
        PAGES = "2127--2138",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232204"}

@article{bb237274,
  author    = {Zhang, Z. and Li, S. and Ji, Y. and Liu, C.P.},
  title     = {Infer unseen from seen: Relation regularized zero-shot visual dialog},
  journal   = JVCIR,
  volume    = {97},
  pages     = {103961},
  year      = {2023},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232205},
}

@article{bb237275,
        AUTHOR = "Qi, Q.S. and Zhang, A. and Liao, Y. and Sun, W.Y. and Wang, Y.L. and Li, X.B. and Liu, S.",
        TITLE = "Simultaneously Training and Compressing Vision-and-Language
Pre-Training Model",
        JOURNAL = MultMed,
        VOLUME = "25",
        YEAR = "2023",
        PAGES = "8194--8203",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232206"}

@article{bb237276,
        AUTHOR = "Liu, A.A. and Huang, C.X. and Xu, N. and Tian, H.S. and Liu, J. and Zhang, Y.D.",
        TITLE = "Counterfactual Visual Dialog: Robust Commonsense Knowledge Learning
From Unbiased Training",
        JOURNAL = MultMed,
        VOLUME = "26",
        YEAR = "2024",
        PAGES = "1639--1651",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232207"}

@article{bb237277,
        AUTHOR = "Ricci, R. and Bazi, Y. and Melgani, F.",
        TITLE = "Machine-to-Machine Visual Dialoguing with {ChatGPT} for Enriched
Textual Image Description",
        JOURNAL = RS,
        VOLUME = "16",
        YEAR = "2024",
        NUMBER = "3",
        PAGES = "441",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232208"}

% The ampersand in the title is escaped; a bare one is a LaTeX alignment character and breaks typesetting.
@article{bb237278,
        AUTHOR = "Bulat, A. and Tzimiropoulos, G.",
        TITLE = "Language-Aware Soft Prompting: Text-to-Text Optimization for Few- and
Zero-Shot Adaptation of {V\&L} Models",
        JOURNAL = IJCV,
        VOLUME = "132",
        YEAR = "2024",
        NUMBER = "4",
        MONTH = apr,
        PAGES = "1108--1125",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232209"}

@inproceedings{bb237279,
        AUTHOR = "Bulat, A. and Tzimiropoulos, G.",
        TITLE = "{LASP}: Text-to-Text Optimization for Language-Aware Soft Prompting of
Vision and Language Models",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "23232--23241",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232210"}

@article{bb237280,
        AUTHOR = "Wang, A.J.P. and Zhou, P. and Shou, M.Z. and Yan, S.C.",
        TITLE = "Enhancing Visual Grounding in Vision-Language Pre-Training With
Position-Guided Text Prompts",
        JOURNAL = PAMI,
        VOLUME = "46",
        YEAR = "2024",
        NUMBER = "5",
        MONTH = may,
        PAGES = "3406--3421",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232211"}

@inproceedings{bb237281,
        AUTHOR = "Wang, A.J.P. and Zhou, P. and Shou, M.Z. and Yan, S.C.",
        TITLE = "Position-Guided Text Prompt for Vision-Language Pre-Training",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "23242--23251",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232212"}

@article{bb237282,
        AUTHOR = "Du, S.S. and Wang, H. and Li, T.P. and Chen, C.W.",
        TITLE = "Hybrid Graph Reasoning With Dynamic Interaction for Visual Dialog",
        JOURNAL = MultMed,
        VOLUME = "26",
        YEAR = "2024",
        PAGES = "9095--9108",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232213"}

@article{bb237283,
  author    = {Sun, J.T. and Kou, J.Y. and Hou, W. and Bai, Y.},
  title     = {A multi-agent curiosity reward model for task-oriented dialogue
systems},
  journal   = PR,
  year      = {2025},
  volume    = {157},
  pages     = {110884},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232214},
}

@article{bb237284,
  author    = {Kane, B. and Giugno, C. and Schubert, L. and Haut, K. and Wohn, C. and Hoque, E.},
  title     = {Managing Emotional Dialogue for a Virtual Cancer Patient:
A Schema-Guided Approach},
  journal   = AffCom,
  year      = {2024},
  month     = {July},
  volume    = {15},
  number    = {3},
  pages     = {1041-1052},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232215},
}

@article{bb237285,
  author    = {Xie, J.Y. and Chen, J.L. and Liu, Z.H. and Cai, Y. and Huang, Q. and Li, Q.},
  title     = {Video Question Generation for Dynamic Changes},
  journal   = CirSysVideo,
  year      = {2024},
  month     = {September},
  volume    = {34},
  number    = {9},
  pages     = {8710-8721},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232216},
}

@article{bb237286,
  author    = {Liu, Y.T. and Li, L. and Tu, Y. and Zhang, B.C. and Zha, Z.J. and Huang, Q.M.},
  title     = {Dynamic Strategy Prompt Reasoning for Emotional Support Conversation},
  journal   = MultMed,
  year      = {2025},
  volume    = {27},
  pages     = {108-119},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232217},
}

@article{bb237287,
  author    = {Janssens, R. and Wolfert, P. and Demeester, T. and Belpaeme, T.},
  title     = {Integrating Visual Context Into Language Models for Situated Social
Conversation Starters},
  journal   = AffCom,
  year      = {2025},
  month     = {January},
  volume    = {16},
  number    = {1},
  pages     = {223-236},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232218},
}

% Title: the acronym LLMs is brace-protected so that styles which
% sentence-case titles do not render it as "llms".
@article{bb237288,
  author    = {Ju, X.C. and Zhang, D. and Li, J.H. and Li, S. and Zhou, G.D.},
  title     = {Enhanced Generative Framework With {LLMs} for Multimodal Emotion-Cause Pair Extraction in Conversations},
  journal   = MultMed,
  year      = {2025},
  volume    = {27},
  pages     = {4924-4935},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232219},
}

@article{bb237289,
  author    = {Liao, N. and Shi, B. and Zhang, X.P. and Cao, M. and Yan, J.C. and Tian, Q.},
  title     = {Rethinking visual prompt learning as masked visual token modeling},
  journal   = AI,
  year      = {2025},
  volume    = {348},
  pages     = {104417},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232220},
}

% Title: the acronym AI is brace-protected so that styles which
% sentence-case titles do not render it as "ai".
@inproceedings{bb237290,
  author    = {Nedunuri, U. and Hamelin, N. and Gupta, A.D. and Guha, D.},
  title     = {Exploring Emotional Engagement with Responsible {AI} Constructs: A Video-Based Cognitive Experiment},
  booktitle = ICIVC25,
  year      = {2025},
  pages     = {597-604},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232221},
}

% Title: the system name "V2 Dial" is brace-protected so that styles which
% sentence-case titles do not render it as "V2 dial".
@inproceedings{bb237291,
  author    = {Abdessaied, A. and Rohrbach, A. and Rohrbach, M. and Bulling, A.},
  title     = {{V2 Dial}: Unification of Video and Visual Dialog via Multimodal Experts},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {8637-8647},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232222},
}

% Title: "ChatHuman" (camel-case name) and "3D" are brace-protected so that
% styles which sentence-case titles do not render them as "Chathuman" and "3d".
@inproceedings{bb237292,
  author    = {Lin, J. and Feng, Y. and Liu, W. and Black, M.J.},
  title     = {{ChatHuman}: Chatting about {3D} Humans with Tools},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {8150-8161},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232223},
}

@inproceedings{bb237293,
  author    = {Bai, Y. and Ji, Y.C. and Cao, M. and Wang, J.Q. and Ye, M.},
  title     = {Chat-based Person Retrieval via Dialogue-Refined Cross-Modal
Alignment},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {3952-3962},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232224},
}

% Title: "VisionArena" (camel-case name), "230K" and the acronym "VLM" are
% brace-protected so that styles which sentence-case titles keep their
% capitals.
@inproceedings{bb237294,
  author    = {Chou, C. and Dunlap, L. and Mashita, K. and Mandal, K. and Darrell, T.J. and Stoica, I. and Gonzalez, J.E. and Chiang, W.L.},
  title     = {{VisionArena}: {230K} Real World User-{VLM} Conversations with Preference Labels},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {3877-3887},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232225},
}

% Title: "EarthDial" (camel-case name) and the proper noun "Earth" are
% brace-protected so that styles which sentence-case titles keep their
% capitals.
@inproceedings{bb237295,
  author    = {Soni, S. and Dudhane, A. and Debary, H. and Fiaz, M. and Munir, M.A. and Danish, M.S. and Fraccaro, P. and Watson, C.D. and Klein, L.J. and Khan, F.S. and Khan, S.},
  title     = {{EarthDial}: Turning Multi-sensory {Earth} Observations to Interactive Dialogues},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {14303-14313},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232226},
}

@inproceedings{bb237296,
  author    = {Wang, H.Y. and Wang, L. and Zhou, S.P. and Tian, J.Y. and Qin, Z. and Wang, Y.B. and Hua, G. and Tang, W.},
  title     = {Towards Precise Embodied Dialogue Localization via Causality Guided
Diffusion},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {13350-13360},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232227},
}

@inproceedings{bb237297,
  author    = {Jiang, B. and Chen, X. and Zhang, C. and Yin, F. and Li, Z.Y. and Yu, G. and Fan, J.Y.},
  title     = {Motionchain: Conversational Motion Controllers via Multimodal Prompts},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {XXVI: 54-74},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232228},
}

@inproceedings{bb237298,
  author    = {Haydarov, K. and Shen, X.Q. and Madasu, A. and Salem, M. and Li, L.J. and Elsayed, G. and Elhoseiny, M.},
  title     = {Affective Visual Dialog: A Large-scale Benchmark for Emotional
Reasoning Based on Visually Grounded Conversations},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {LXXV: 18-36},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232229},
}

@inproceedings{bb237299,
  author    = {Abdessaied, A. and Shi, L. and Bulling, A.},
  title     = {Multi-modal Video Dialog State Tracking in the Wild},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {LVII: 348-365},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vdi3.html#TT232230},
}

Last update: Dec 7, 2025 at 16:47:52