@article{bb241000,
        AUTHOR = "Yu, T. and Fu, K. and Zhang, J. and Huang, Q. M. and Yu, J.",
        TITLE = "Multi-Granularity Contrastive Cross-Modal Collaborative Generation
for End-to-End Long-Term Video Question Answering",
        JOURNAL = IP,
        VOLUME = "33",
        YEAR = "2024",
        PAGES = "3115--3129",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235915"}

@article{bb241001,
        AUTHOR = "Liu, J. and Wang, G. X. and Xie, J. L. and Zhou, F. Y. and Xu, H. J.",
        TITLE = "Video Question Answering with Semantic Disentanglement and Reasoning",
        JOURNAL = CirSysVideo,
        VOLUME = "34",
        YEAR = "2024",
        NUMBER = "5",
        MONTH = may,
        PAGES = "3663--3673",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235916"}

@article{bb241002,
        AUTHOR = "Nie, J. and Wang, X. and Hou, R. Z. and Li, G. H. and Chen, H. and Zhu, W. W.",
        TITLE = "Dynamic Spatio-Temporal Graph Reasoning for {VideoQA} With
Self-Supervised Event Recognition",
        JOURNAL = IP,
        VOLUME = "33",
        YEAR = "2024",
        PAGES = "4145--4158",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235917"}

@article{bb241003,
        AUTHOR = "Lee, S. and Kim, H. I. and Ro, Y. M.",
        TITLE = "Text-guided distillation learning to diversify video embeddings for
text-video retrieval",
        JOURNAL = PR,
        VOLUME = "156",
        YEAR = "2024",
        PAGES = "110754",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235918"}

@article{bb241004,
        AUTHOR = "Fei, H. and Wu, S. Q. and Zhang, M. and Zhang, M. and Chua, T. S. and Yan, S. C.",
        TITLE = "Enhancing Video-Language Representations With Structural
Spatio-Temporal Alignment",
        JOURNAL = PAMI,
        VOLUME = "46",
        YEAR = "2024",
        NUMBER = "12",
        MONTH = dec,
        PAGES = "7701--7719",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235919"}

@article{bb241005,
        AUTHOR = "Wang, R. M. and Luo, Y. M. and Zhang, F. and Liu, M. Y. and Luo, X. N.",
        TITLE = "{HSSHG}: Heuristic Semantics-Constrained Spatio-Temporal Heterogeneous
Graph for {VideoQA}",
        JOURNAL = MultMed,
        VOLUME = "26",
        YEAR = "2024",
        PAGES = "11176--11190",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235920"}

@article{bb241006,
        AUTHOR = "Jeshmol, P. J. and Kovoor, B. C.",
        TITLE = "Video Question Answering: A survey of the state-of-the-art",
        JOURNAL = JVCIR,
        VOLUME = "105",
        YEAR = "2024",
        PAGES = "104320",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235921"}

@article{bb241007,
        AUTHOR = "Wu, X. Z. and Wu, J. and Zhu, L. and Senhadji, L. and Shu, H. Z.",
        TITLE = "Collaborative Aware Bidirectional Semantic Reasoning for Video
Question Answering",
        JOURNAL = CirSysVideo,
        VOLUME = "35",
        YEAR = "2025",
        NUMBER = "3",
        MONTH = mar,
        PAGES = "2074--2086",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235922"}

@article{bb241008,
        AUTHOR = "Yang, A. and Miech, A. and Sivic, J. and Laptev, I. and Schmid, C.",
        TITLE = "Learning to Answer Visual Questions From Web Videos",
        JOURNAL = PAMI,
        VOLUME = "47",
        YEAR = "2025",
        NUMBER = "5",
        MONTH = may,
        PAGES = "3202--3218",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235923"}

@inproceedings{bb241009,
        AUTHOR = "Yang, A. and Miech, A. and Sivic, J. and Laptev, I. and Schmid, C.",
        TITLE = "Just Ask:
Learning to Answer Questions from Millions of Narrated Videos",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "1666--1677",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235924"}

@article{bb241010,
        AUTHOR = "Jiang, Y. Y. and Yin, J. Q.",
        TITLE = "{CLIP}-Powered {TASS}: Target-Aware Single-Stream Network for Audio-Visual
Question Answering",
        JOURNAL = IJCV,
        VOLUME = "133",
        YEAR = "2025",
        NUMBER = "5",
        MONTH = may,
        PAGES = "2581--2598",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235925"}

@article{bb241011,
        AUTHOR = "Wang, Y. Y. and Liu, M. and Song, X. M. and Nie, L. Q.",
        TITLE = "{TR-Adapter}: Parameter-Efficient Transfer Learning for Video Question
Answering",
        JOURNAL = MultMed,
        VOLUME = "27",
        YEAR = "2025",
        PAGES = "2232--2242",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235926"}

@article{bb241012,
        AUTHOR = "Xiao, J. B. and Huang, N. X. and Qin, H. Y. and Li, D. Y. and Li, Y. C. and Zhu, F. B. and Tao, Z. and Yu, J. X. and Lin, L. and Chua, T. S. and Yao, A.",
        TITLE = "{VideoQA} in the Era of {LLMs}: An Empirical Study",
        JOURNAL = IJCV,
        VOLUME = "133",
        YEAR = "2025",
        NUMBER = "7",
        MONTH = jul,
        PAGES = "3970--3993",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235927"}

@article{bb241013,
        AUTHOR = "Liang, T. M. and Li, L. H. and Hu, J. F. and Yu, X. Y. and Zheng, W. S. and Lai, J. H.",
        TITLE = "Rethinking Temporal Context in {Video-QA}:
A Comprehensive Study of Single-Frame Static Bias",
        JOURNAL = MultMed,
        VOLUME = "27",
        YEAR = "2025",
        PAGES = "5077--5091",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235928"}

@article{bb241014,
        AUTHOR = "Qin, Y. X. and Zhao, L. and Gao, L. and Zhang, H. N. and Zeng, P. P. and Shen, H. T.",
        TITLE = "Temporal-Guided Mixture-of-Experts for Zero-Shot Video Question
Answering",
        JOURNAL = CirSysVideo,
        VOLUME = "35",
        YEAR = "2025",
        NUMBER = "9",
        MONTH = sep,
        PAGES = "9003--9016",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235929"}

@article{bb241015,
        AUTHOR = "Wang, H. B. and Lai, C. H. and Ge, W. F.",
        TITLE = "Adapting Multimodal Large Language Models for Video Question
Answering by Capturing Question-Critical and Coherent Moments",
        JOURNAL = MultMed,
        VOLUME = "27",
        YEAR = "2025",
        PAGES = "8737--8747",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235930"}

@inproceedings{bb241016,
        AUTHOR = "Gia, B. T. and Le, K. and Do, T. and Mai, T. D. and Ngo, T. D. and Le, D. D. and Satoh, S.",
        TITLE = "{VRAG}: Retrieval-Augmented Video Question Answering for Long-Form
Videos",
        BOOKTITLE = IntVidSea25,
        YEAR = "2025",
        PAGES = "3689--3698",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235931"}

@article{bb241017,
        AUTHOR = "Song, E. and Chai, W. H. and Ye, T. and Hwang, J. N. and Li, X. and Wang, G. A.",
        TITLE = "{MovieChat+}: Question-Aware Sparse Memory for Long Video Question
Answering",
        JOURNAL = PAMI,
        VOLUME = "48",
        YEAR = "2026",
        NUMBER = "1",
        MONTH = jan,
        PAGES = "374--389",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235932"}

@article{bb241018,
        AUTHOR = "Zhou, S. and Xiao, J. B. and Yang, X. and Song, P. P. and Guo, D. and Yao, A. and Wang, M. and Chua, T. S.",
        TITLE = "Scene-Text Grounding for Text-Based Video Question Answering",
        JOURNAL = MultMed,
        VOLUME = "28",
        YEAR = "2026",
        PAGES = "1417--1430",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235933"}

@inproceedings{bb241019,
        AUTHOR = "Zhou, S. and Xiao, J. B. and Li, Q. Y. and Li, Y. C. and Yang, X. and Guo, D. and Wang, M. and Chua, T. S. and Yao, A.",
        TITLE = "{EgoTextVQA}: Towards Egocentric Scene-Text Aware Video Question
Answering",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "3363--3373",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235934"}

@article{bb241020,
        AUTHOR = "Gong, H. B. and Yan, C. G. and Zhang, J. and Sun, Y. Q. and Gao, Y. H. and Li, L.",
        TITLE = "Spatial-Temporal Clue Reasoning Chain for Long Video Question
Answering",
        JOURNAL = CirSysVideo,
        VOLUME = "36",
        YEAR = "2026",
        NUMBER = "3",
        MONTH = mar,
        PAGES = "2757--2770",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235935"}

% NOTE(review): acronym casing "LoGAT" follows the expansion given in the
% title itself (Local And Global Area Transcription); confirm against the paper.
@inproceedings{bb241021,
        AUTHOR = "Chowdhury, M. I. and Aukkapinyo, K. and Fujimura, H. and Woo, J. A. and Wasusatein, W. and Ghourabi, F.",
        TITLE = "{Grid-LoGAT}: Grid Based Local And Global Area Transcription For Video
Question Answering",
        BOOKTITLE = ICIP25,
        YEAR = "2025",
        PAGES = "1247--1252",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235936"}

@inproceedings{bb241022,
        AUTHOR = "Parikh, C. and Rawat, D. and Rakshitha, R. T. and Ghosh, T. and Sarvadevabhatla, R. K.",
        TITLE = "{RoadSocial}: A Diverse {VideoQA} Dataset and Benchmark for Road Event
Understanding from Social Video Narratives",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "19002--19011",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235937"}

@inproceedings{bb241023,
        AUTHOR = "Azad, S. and Vineet, V. and Rawat, Y. S.",
        TITLE = "{HierarQ}: Task-Aware Hierarchical {Q-Former} for Enhanced Video
Understanding",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "8545--8556",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235938"}

@inproceedings{bb241024,
        AUTHOR = "Chen, W. X. and Liu, Y. and Chen, B. L. and Su, J. and Zheng, Y. and Lin, L.",
        TITLE = "Cross-modal Causal Relation Alignment for Video Question Grounding",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "24087--24096",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235939"}

@inproceedings{bb241025,
        AUTHOR = "Yi, J. H. and Wasim, S. T. and Luo, Y. and Naseer, M. and Gall, J.",
        TITLE = "{Video-Panda}: Parameter-efficient Alignment for Encoder-free
Video-Language Models",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "24119--24128",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235940"}

@inproceedings{bb241026,
        AUTHOR = "Islam, M. M. and Nagarajan, T. and Wang, H. Y. and Bertasius, G. and Torresani, L.",
        TITLE = "{BIMBA}: Selective-Scan Compression for Long-Range Video Question
Answering",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "29096--29107",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235941"}

% NOTE(review): "DAM" is written as an acronym of the expansion given in the
% title (Dynamic Adapter Merging); confirm against the published title.
@inproceedings{bb241027,
        AUTHOR = "Cheng, F. and Wang, Z. Y. and Sung, Y. L. and Lin, Y. B. and Bansal, M. and Bertasius, G.",
        TITLE = "{DAM}: Dynamic Adapter Merging for Continual Video {QA} Learning",
        BOOKTITLE = WACV25,
        YEAR = "2025",
        PAGES = "6805--6817",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235942"}

% NOTE(review): benchmark name casing "AutoEval-Video" should be confirmed
% against the published title.
@inproceedings{bb241028,
        AUTHOR = "Chen, X. Y. and Lin, Y. and Zhang, Y. C. and Huang, W. R.",
        TITLE = "{AutoEval-Video}: An Automatic Benchmark for Assessing Large Vision
Language Models in Open-ended Video Question Answering",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "LXVIII: 179--195",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235943"}

% NOTE(review): model name casing "ViLA" (Video-Language Alignment) should be
% confirmed against the published title.
@inproceedings{bb241029,
        AUTHOR = "Wang, X. and Liang, J. and Wang, C. K. and Deng, K. and Lou, Y. and Lin, M. C. and Yang, S.",
        TITLE = "{ViLA}: Efficient Video-language Alignment for Video Question Answering",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "LXII: 186--204",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235944"}

@inproceedings{bb241030,
        AUTHOR = "Choudhury, R. and Niinuma, K. and Kitani, K. M. and Jeni, L. A.",
        TITLE = "Video Question Answering with Procedural Programs",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XXXVIII: 315--332",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235945"}

@inproceedings{bb241031,
        AUTHOR = "Xiao, J. B. and Yao, A. and Li, Y. C. and Chua, T. S.",
        TITLE = "Can {I} Trust Your Answer? Visually Grounded Video Question Answering",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13204--13214",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235946"}

@inproceedings{bb241032,
        AUTHOR = "Min, J. and Buch, S. and Nagrani, A. and Cho, M. and Schmid, C.",
        TITLE = "{MoReVQA}: Exploring Modular Reasoning Models for Video Question
Answering",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13235--13245",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235947"}

@inproceedings{bb241033,
        AUTHOR = "Zou, B. and Yang, C. and Qiao, Y. and Quan, C. B. and Zhao, Y. J.",
        TITLE = "Language-aware Visual Semantic Distillation for Video Question
Answering",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "27103--27113",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235948"}

@inproceedings{bb241034,
        AUTHOR = "Liang, T. M. and Tan, C. L. and Xia, B. H. and Zheng, W. S. and Hu, J. F.",
        TITLE = "Ranking Distillation for Open-Ended Video Question Answering with
Insufficient Labels",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13161--13170",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235949"}

@inproceedings{bb241035,
        AUTHOR = "Wu, J. M. and Shu, P. C. and Hong, H. Y. and Ma, L. and Zhu, Y. and Wang, L.",
        TITLE = "Pre-trained Bidirectional Dynamic Memory Network For Long Video
Question Answering",
        BOOKTITLE = Crowded24,
        YEAR = "2024",
        PAGES = "5550--5557",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235950"}

@inproceedings{bb241036,
        AUTHOR = "Inoue, Y. and Yada, Y. and Tanahashi, K. and Yamaguchi, Y.",
        TITLE = "{NuScenes-MQA}: Integrated Evaluation of Captions and {QA} for Autonomous
Driving Datasets using Markup Annotations",
        BOOKTITLE = LLVMCrive24,
        YEAR = "2024",
        PAGES = "930--938",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235951"}

@inproceedings{bb241037,
        AUTHOR = "Park, S. Y. and Lee, M. J. and Kang, J. H. and Choi, H. and Park, Y. and Cho, J. and Lee, A. and Kim, D. K.",
        TITLE = "{VLAAD}: Vision and Language Assistant for Autonomous Driving",
        BOOKTITLE = LLVMCrive24,
        YEAR = "2024",
        PAGES = "980--987",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235952"}

% NOTE(review): the title prefix is typeset as the Greek letter epsilon in math
% mode (the spelled-out form was misspelled); confirm against the published title.
@inproceedings{bb241038,
        AUTHOR = "Fang, J. Z. Y. and Zheng, S. and Sharma, V. and Piramuthu, R.",
        TITLE = "{$\epsilon$-ViLM}: Efficient Video-Language Model via Masked Video Modeling
with Semantic Vector-Quantized Tokenizer",
        BOOKTITLE = Pretrain24,
        YEAR = "2024",
        PAGES = "529--540",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235953"}

@inproceedings{bb241039,
        AUTHOR = "Zonneveld, A. and Gatt, A. and Calixto, I.",
        TITLE = "Video-and-Language ({VidL}) models and their cognitive relevance",
        BOOKTITLE = MMFM23,
        YEAR = "2023",
        PAGES = "325--338",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235954"}

@inproceedings{bb241040,
        AUTHOR = "Momeni, L. and Caron, M. and Nagrani, A. and Zisserman, A. and Schmid, C.",
        TITLE = "Verbs in Action: Improving verb understanding in video-language
models",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "15533--15545",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235955"}

@inproceedings{bb241041,
        AUTHOR = "Jin, P. and Li, H. and Cheng, Z. and Li, K. and Ji, X. Y. and Liu, C. and Yuan, L. and Chen, J.",
        TITLE = "{DiffusionRet}: Generative Text-Video Retrieval with Diffusion Model",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "2470--2481",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235956"}

@inproceedings{bb241042,
        AUTHOR = "Li, P. D. and Xie, C. W. and Zhao, L. M. and Xie, H. T. and Ge, J. N. and Zheng, Y. and Zhao, D. L. and Zhang, Y. D.",
        TITLE = "Progressive Spatio-Temporal Prototype Matching for Text-Video
Retrieval",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "4077--4087",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235957"}

@inproceedings{bb241043,
        AUTHOR = "Guan, P. Y. and Pei, R. J. and Shao, B. and Liu, J. Z. and Li, W. M. and Gu, J. X. and Xu, H. and Xu, S. C. and Yan, Y. and Lam, E. Y.",
        TITLE = "{PIDRo}: Parallel Isomeric Attention with Dynamic Routing for
Text-Video Retrieval",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "11130--11139",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235958"}

@inproceedings{bb241044,
        AUTHOR = "Deng, C. R. and Chen, Q. and Qin, P. and Chen, D. and Wu, Q.",
        TITLE = "Prompt Switch: Efficient {CLIP} Adaptation for Text-Video Retrieval",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "15602--15612",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235959"}

@inproceedings{bb241045,
        AUTHOR = "Pirhadi, M. J. and Mirzaei, M. and Eetemadi, S.",
        TITLE = "Just Ask Plus: Using Transcripts for {VideoQA}",
        BOOKTITLE = ASI23,
        YEAR = "2023",
        PAGES = "3074--3077",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235960"}

@inproceedings{bb241046,
        AUTHOR = "Ahmad, M. and Park, G. and Park, D. and Park, S.",
        TITLE = "{MMTF}: Multi-Modal Temporal Fusion for Commonsense Video Question
Answering",
        BOOKTITLE = VLAR23,
        YEAR = "2023",
        PAGES = "4659--4664",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235961"}

@inproceedings{bb241047,
        AUTHOR = "Engin, D. and Avrithis, Y.",
        TITLE = "Zero-Shot and Few-Shot Video Question Answering with Multi-Modal
Prompts",
        BOOKTITLE = CLVL23,
        YEAR = "2023",
        PAGES = "2797--2802",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235962"}

@inproceedings{bb241048,
        AUTHOR = "Nuthalapati, S. V. and Tunga, A.",
        TITLE = "Coarse to Fine Frame Selection for Online Open-ended Video Question
Answering",
        BOOKTITLE = MMFM23,
        YEAR = "2023",
        PAGES = "353--361",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235963"}

@inproceedings{bb241049,
        AUTHOR = "Li, Y. C. and Xiao, J. B. and Feng, C. and Wang, X. and Chua, T. S.",
        TITLE = "Discovering Spatio-Temporal Rationales for Video Question Answering",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "13823--13832",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235964"}

@inproceedings{bb241050,
        AUTHOR = "Ko, D. and Lee, J. S. and Choi, M. and Chu, J. W. and Park, J. and Kim, H. W. J.",
        TITLE = "Open-Vocabulary Video Question Answering: A New Benchmark for
Evaluating the Generalizability of Video Question Answering Models",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "3078--3089",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235965"}

@inproceedings{bb241051,
        AUTHOR = "Li, J. and Niu, L. and Zhang, L. Q.",
        TITLE = "Knowledge Proxy Intervention for Deconfounded Video Question
Answering",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "2770--2781",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235966"}

@inproceedings{bb241052,
        AUTHOR = "Chen, G. Y. and Liu, X. and Wang, G. and Zhang, K. and Torr, P. H. S. and Zhang, X. P. and Tang, Y. S.",
        TITLE = "Tem-adapter:
Adapting Image-Text Pretraining for Video Question Answer",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "13899--13909",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235967"}

@inproceedings{bb241053,
        AUTHOR = "Jahagirdar, S. and Mathew, M. and Karatzas, D. and Jawahar, C. V.",
        TITLE = "Understanding Video Scenes through Text:
Insights from Text-based Video Question Answering",
        BOOKTITLE = VLAR23,
        YEAR = "2023",
        PAGES = "4648--4652",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235968"}

@inproceedings{bb241054,
        AUTHOR = "Peng, M. and Liu, L. C. and Li, Z. H. and Shi, Y. and Zhou, X. D.",
        TITLE = "Multi-Semantic Alignment Co-Reasoning Network for Video Question
Answering",
        BOOKTITLE = ICIP23,
        YEAR = "2023",
        PAGES = "2090--2094",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235969"}

% NOTE(review): "CLIP" is written as the model acronym here (it read "Clip");
% confirm against the published title.
@inproceedings{bb241055,
        AUTHOR = "Ye, S. H. and Kong, W. K. and Yao, C. L. and Ren, J. F. and Jiang, X. D.",
        TITLE = "Video Question Answering Using {CLIP}-Guided Visual-Text Attention",
        BOOKTITLE = ICIP23,
        YEAR = "2023",
        PAGES = "81--85",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235970"}

@inproceedings{bb241056,
        AUTHOR = "Khan, Z. and Kumar, B. V. and Schulter, S. and Yu, X. and Fu, Y. and Chandraker, M.",
        TITLE = "Q: How to Specialize Large Vision-Language Models to Data-Scarce {VQA}
Tasks? {A}: Self-Train on Unlabeled Images!",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "15005--15015",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235971"}

@inproceedings{bb241057,
        AUTHOR = "Su, H. T. and Niu, Y. and Lin, X. D. and Hsu, W. H. and Chang, S. F.",
        TITLE = "Language Models are Causal Knowledge Extractors for Zero-shot Video
Question Answering",
        BOOKTITLE = L3D-IVU23,
        YEAR = "2023",
        PAGES = "4951--4960",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235972"}

@inproceedings{bb241058,
        AUTHOR = "Zang, C. Q. and Wang, H. Q. and Pei, M. T. and Liang, W.",
        TITLE = "Discovering the Real Association: Multimodal Causal Reasoning in
Video Question Answering",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "19027--19036",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235973"}

@inproceedings{bb241059,
        AUTHOR = "Gao, D. F. and Zhou, L. and Ji, L. and Zhu, L. C. and Yang, Y. and Shou, M. Z.",
        TITLE = "{MIST}: Multi-modal Iterative Spatial-Temporal Transformer for
Long-form Video Question Answering",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "14773--14783",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235974"}

@inproceedings{bb241060,
        AUTHOR = "Khan, A. U. and Kuehne, H. and Wu, B. and Chheu, K. and Bousselham, W. and Gan, C. and Lobo, N. and Shah, M.",
        TITLE = "Learning Situation Hyper-Graphs for Video Question Answering",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "14879--14889",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235975"}

@inproceedings{bb241061,
        AUTHOR = "Jahagirdar, S. and Mathew, M. and Karatzas, D. and Jawahar, C. V.",
        TITLE = "Watching the News: Towards {VideoQA} Models that can Read",
        BOOKTITLE = WACV23,
        YEAR = "2023",
        PAGES = "4430--4439",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235976"}

@inproceedings{bb241062,
        AUTHOR = "Zhang, M. and Hwa, R. and Kovashka, A.",
        TITLE = "How to Practice {VQA} on a Resource-limited Target Domain",
        BOOKTITLE = WACV23,
        YEAR = "2023",
        PAGES = "4440--4449",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235977"}

@inproceedings{bb241063,
        AUTHOR = "Lee, J. and Kang, W. and Kim, E. S.",
        TITLE = "Dense but Efficient {VideoQA} for Intricate Compositional Reasoning",
        BOOKTITLE = WACV23,
        YEAR = "2023",
        PAGES = "1114--1123",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235978"}

@inproceedings{bb241064,
        AUTHOR = "Shen, R. and Inoue, N. and Shinoda, K.",
        TITLE = "Text-Guided Object Detector for Multi-modal Video Question Answering",
        BOOKTITLE = WACV23,
        YEAR = "2023",
        PAGES = "1032--1042",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235979"}

@inproceedings{bb241065,
        AUTHOR = "Fang, S. and Wang, S. H. and Zhuo, J. and Han, X. Z. and Huang, Q. M.",
        TITLE = "Learning Linguistic Association Towards Efficient Text-Video Retrieval",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVI:254--270",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235980"}

@inproceedings{bb241066,
        AUTHOR = "Piergiovanni, A. J. and Morton, K. and Kuo, W. C. and Ryoo, M. S. and Angelova, A.",
        TITLE = "Video Question Answering with Iterative Video-Text Co-tokenization",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVI:76--94",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235981"}

% NOTE(review): first author's surname is entered with an umlaut, as a BibTeX
% special character; confirm the spelling against the published paper.
@inproceedings{bb241067,
        AUTHOR = "B{\"a}rmann, L. and Waibel, A.",
        TITLE = "Where did {I} leave my keys?: Episodic-Memory-Based Question Answering
on Egocentric Videos",
        BOOKTITLE = Ego4D-EPIC22,
        YEAR = "2022",
        PAGES = "1559--1567",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235982"}

@inproceedings{bb241068,
        AUTHOR = "Li, J. T. and Niu, L. and Zhang, L. Q.",
        TITLE = "From Representation to Reasoning: Towards both Evidence and
Commonsense Reasoning for Video Question-Answering",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "21241--21250",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235983"}

@inproceedings{bb241069,
        AUTHOR = "Datta, S. and Dharur, S. and Cartillier, V. and Desai, R. and Khanna, M. and Batra, D. and Parikh, D.",
        TITLE = "Episodic Memory Question Answering",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "19097--19106",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235984"}

% NOTE(review): the fourth author's compound surname is written hyphenated
% (Grunde-McLaughlin); confirm against the published paper.
@inproceedings{bb241070,
        AUTHOR = "Gandhi, M. and Gul, M. O. and Prakash, E. and Grunde-McLaughlin, M. and Krishna, R. and Agrawala, M.",
        TITLE = "Measuring Compositional Consistency for Video Question Answering",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "5036--5045",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235985"}

@inproceedings{bb241071,
        AUTHOR = "Gorti, S. K. and Vouitsis, N. and Ma, J. W. and Golestan, K. and Volkovs, M. and Garg, A. and Yu, G.",
        TITLE = "{X-Pool}: Cross-Modal Language-Video Attention for Text-Video Retrieval",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "4996--5005",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235986"}

@inproceedings{bb241072,
        AUTHOR = "Li, J. C. and Tang, S. L. and Zhu, L. C. and Shi, H. and Huang, X. W. and Wu, F. and Yang, Y. and Zhuang, Y. T.",
        TITLE = "Adaptive Hierarchical Graph Reasoning with Semantic Coherence for
Video-and-Language Inference",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "1847--1857",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235987"}

@inproceedings{bb241073,
        AUTHOR = "Zhang, M. X. and Yang, Y. and Chen, X. and Ji, Y. L. and Xu, X. and Li, J. J. and Shen, H. T.",
        TITLE = "Multi-stage Aggregated Transformer Network for Temporal Language
Localization in Videos",
        BOOKTITLE = CVPR21,
        YEAR = "2021",
        PAGES = "12664--12673",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235988"}

@inproceedings{bb241074,
        AUTHOR = "Kim, N. and Ha, S. J. and Kang, J. W.",
        TITLE = "Video Question Answering Using Language-Guided Deep Compressed-Domain
Video Feature",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "1688--1697",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235989"}

@inproceedings{bb241075,
        AUTHOR = "Liu, F. and Liu, J. and Wang, W. N. and Lu, H. Q.",
        TITLE = "{HAIR}: Hierarchical Visual-Semantic Relational Reasoning for Video
Question Answering",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "1678--1687",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235990"}

@inproceedings{bb241076,
        AUTHOR = "Gao, D. F. and Wang, R. P. and Bai, Z. and Chen, X. L.",
        TITLE = "{Env-QA}: A Video Question Answering Benchmark for Comprehensive
Understanding of Dynamic Environments",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "1655--1665",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235991"}

% The degree sign is written in math mode so the entry stays pure ASCII and
% processes under classic 8-bit BibTeX.
@inproceedings{bb241077,
        AUTHOR = "Yun, H. and Yu, Y. and Yang, W. and Lee, K. and Kim, G.",
        TITLE = "{Pano-AVQA}: Grounded Audio-Visual Question Answering on {$360^\circ$} Videos",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "2011--2021",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235992"}

@inproceedings{bb241078,
        AUTHOR = "Xu, L. and Huang, H. and Liu, J.",
        TITLE = "{SUTD-TrafficQA}: A Question Answering Benchmark and an Efficient
Network for Video Reasoning over Traffic Events",
        BOOKTITLE = CVPR21,
        YEAR = "2021",
        PAGES = "9873--9883",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235993"}

@inproceedings{bb241079,
        AUTHOR = "Park, J. and Lee, J. Y. and Sohn, K. H.",
        TITLE = "Bridge to Answer: Structure-aware Graph Interaction Network for Video
Question Answering",
        BOOKTITLE = CVPR21,
        YEAR = "2021",
        PAGES = "15521--15530",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235994"}

@inproceedings{bb241080,
        AUTHOR = "Chen, X. W. and Liu, R. and Song, X. M. and Han, Y. H.",
        TITLE = "Locating Visual Explanations for Video Question Answering",
        BOOKTITLE = MMMod21,
        YEAR = "2021",
        PAGES = "I:290--302",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235995"}

@inproceedings{bb241081,
  author    = {Garcia, N. and Nakashima, Y.},
  title     = {Knowledge-based Video Question Answering with Unsupervised Scene Descriptions},
  booktitle = ECCV20,
  year      = {2020},
  pages     = {XVIII:581-598},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235996}
}

@inproceedings{bb241082,
  author    = {Kim, J. and Ma, M. and Pham, T. and Kim, K. and Yoo, C.D.},
  title     = {Modality Shifting Attention Network for Multi-Modal Video Question Answering},
  booktitle = CVPR20,
  year      = {2020},
  pages     = {10103-10112},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235997}
}

@inproceedings{bb241083,
  author    = {Jiang, M. and Chen, S. and Yang, J. and Zhao, Q.},
  title     = {Fantastic Answers and Where to Find Them: Immersive Question-Directed Visual Attention},
  booktitle = CVPR20,
  year      = {2020},
  pages     = {2977-2986},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235998}
}

% Acronym in the title is brace-protected so sentence-casing styles keep its capitals.
@inproceedings{bb241084,
        AUTHOR = "Yang, Z. and Garcia, N. and Chu, C. and Otani, M. and Nakashima, Y. and Takemura, H.",
        TITLE = "{BERT} Representations for Video Question Answering",
        BOOKTITLE = WACV20,
        YEAR = "2020",
        PAGES = "1545-1554",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT235999"}

@inproceedings{bb241085,
  author    = {Fan, C.Y. and Zhang, X.F. and Zhang, S. and Wang, W.S. and Zhang, C. and Huang, H.},
  title     = {Heterogeneous Memory Enhanced Multimodal Attention Model for Video Question Answering},
  booktitle = CVPR19,
  year      = {2019},
  pages     = {1999-2007},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT236000}
}

@inproceedings{bb241086,
  author    = {Kim, J.Y. and Ma, M. and Kim, K. and Kim, S. and Yoo, C.D.},
  title     = {Progressive Attention Memory Network for Movie Story Question Answering},
  booktitle = CVPR19,
  year      = {2019},
  pages     = {8329-8338},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT236001}
}

% Method acronym in the title is brace-protected so sentence-casing styles keep its capitals.
@inproceedings{bb241087,
        AUTHOR = "Liu, C.N. and Chen, D.J. and Chen, H.T. and Liu, T.L.",
        TITLE = "{A2A}: Attention to Attention Reasoning for Movie Question Answering",
        BOOKTITLE = ACCV18,
        YEAR = "2018",
        PAGES = "VI:404-419",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT236002"}

@inproceedings{bb241088,
  author    = {Gao, J. and Ge, R. and Chen, K. and Nevatia, R.},
  title     = {Motion-Appearance Co-memory Networks for Video Question Answering},
  booktitle = CVPR18,
  year      = {2018},
  pages     = {6576-6585},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT236003}
}

@inproceedings{bb241089,
  author    = {Kim, K.M. and Choi, S.H. and Kim, J.H. and Zhang, B.T.},
  title     = {Multimodal Dual Attention Memory for Video Story Question Answering},
  booktitle = ECCV18,
  year      = {2018},
  pages     = {XV: 698-713},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT236004}
}

@inproceedings{bb241090,
  author    = {Yu, Y.J. and Kim, J.S. and Kim, G.},
  title     = {A Joint Sequence Fusion Model for Video Question Answering and Retrieval},
  booktitle = ECCV18,
  year      = {2018},
  pages     = {VII: 487-503},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT236005}
}

@inproceedings{bb241091,
  author    = {Hasan Chowdhury, M.I. and Nguyen, K. and Sridharan, S. and Fookes, C.},
  title     = {Hierarchical Relational Attention for Video Question Answering},
  booktitle = ICIP18,
  year      = {2018},
  pages     = {599-603},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT236006}
}

% Dataset name in the title is brace-protected so sentence-casing styles keep its capitals.
@inproceedings{bb241092,
        AUTHOR = "Mun, J. and Seo, P.H. and Jung, I. and Han, B.H.",
        TITLE = "{MarioQA}: Answering Questions by Watching Gameplay Videos",
        BOOKTITLE = ICCV17,
        YEAR = "2017",
        PAGES = "2886-2894",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT236007"}

@inproceedings{bb241093,
  author    = {Yu, Y. and Ko, H. and Choi, J. and Kim, G.},
  title     = {End-to-End Concept Word Detection for Video Captioning, Retrieval, and Question Answering},
  booktitle = CVPR17,
  year      = {2017},
  pages     = {3261-3269},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT236008}
}

@article{bb241094,
  author    = {Kafle, K. and Kanan, C.},
  title     = {Visual question answering: Datasets, algorithms, and future challenges},
  journal   = CVIU,
  volume    = {163},
  year      = {2017},
  number    = {1},
  pages     = {3-20},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT236011}
}

@article{bb241095,
  author    = {Wu, Q. and Teney, D. and Wang, P. and Shen, C.H. and Dick, A. and van den Hengel, A.J.},
  title     = {Visual question answering: A survey of methods and datasets},
  journal   = CVIU,
  volume    = {163},
  year      = {2017},
  number    = {1},
  pages     = {21-40},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT236012}
}

@article{bb241096,
  author    = {Teney, D. and Wu, Q. and van den Hengel, A.J.},
  title     = {Visual Question Answering: A Tutorial},
  journal   = SPMag,
  volume    = {34},
  year      = {2017},
  number    = {6},
  month     = {November},
  pages     = {63-75},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT236013}
}

@inproceedings{bb241097,
  author    = {Teney, D. and Liu, L. and van den Hengel, A.J.},
  title     = {Graph-Structured Representations for Visual Question Answering},
  booktitle = CVPR17,
  year      = {2017},
  pages     = {3233-3241},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT236014}
}

@inproceedings{bb241098,
  author    = {Teney, D. and van den Hengel, A.J.},
  title     = {Visual Question Answering as a Meta Learning Task},
  booktitle = ECCV18,
  year      = {2018},
  pages     = {XV: 229-245},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT236015}
}

@inproceedings{bb241099,
  author    = {Teney, D. and Abbasnejad, E. and van den Hengel, A.J.},
  title     = {Unshuffling Data for Improved Generalization in Visual Question Answering},
  booktitle = ICCV21,
  year      = {2021},
  pages     = {1397-1407},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vqds43.html#TT236016}
}

Last update: Apr 6, 2026 at 11:28:57