@article{bb213100,
  author    = {Xie, Z. and Wu, K.W. and Zhang, X.Y. and Yang, X.M. and Hou, J.K.},
  title     = {Learning continuous temporal embedding of videos using pattern theory},
  journal   = PRL,
  volume    = {146},
  year      = {2021},
  pages     = {222--229},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208176},
}

% Pages holds a single number here rather than a range.
@article{bb213101,
  author    = {Liu, Y. and Zhang, X.M. and Zhang, Q.Y. and Li, C.Z. and Huang, F. and Tang, X.H. and Li, Z.J.},
  title     = {Dual self-attention with co-attention networks for visual question answering},
  journal   = PR,
  volume    = {117},
  year      = {2021},
  pages     = {107956},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208177},
}

% Pages holds a single number here rather than a range.
@article{bb213102,
  author    = {Liu, Y. and Zhang, X.M. and Huang, F. and Shen, S.X. and Tian, P. and Li, L. and Li, Z.J.},
  title     = {Dynamic Self-Attention with Vision Synchronization Networks for Video Question Answering},
  journal   = PR,
  volume    = {132},
  year      = {2022},
  pages     = {108959},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208178},
}

@article{bb213103,
  author    = {Liu, Y. and Zhang, X.M. and Huang, F. and Zhang, B. and Li, Z.J.},
  title     = {Cross-Attentional Spatio-Temporal Semantic Graph Networks for Video Question Answering},
  journal   = IP,
  volume    = {31},
  year      = {2022},
  pages     = {1684--1696},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208179},
}

@article{bb213104,
  author    = {Jin, W. and Zhao, Z. and Cao, X.C. and Zhu, J.M. and He, X.Q. and Zhuang, Y.T.},
  title     = {Adaptive Spatio-Temporal Graph Enhanced Vision-Language Representation for Video {QA}},
  journal   = IP,
  volume    = {30},
  year      = {2021},
  pages     = {5477--5489},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208180},
}

% Pages holds a single number here rather than a range.
@article{bb213105,
  author    = {Gao, L. and Chen, T.M. and Li, X.P. and Zeng, P.P. and Zhao, L. and Li, Y.F.},
  title     = {Generalized pyramid co-attention with learnable aggregation net for video question answering},
  journal   = PR,
  volume    = {120},
  year      = {2021},
  pages     = {108145},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208181},
}

@article{bb213106,
  author    = {Le, T.M. and Le, V. and Venkatesh, S. and Tran, T.},
  title     = {Hierarchical Conditional Relation Networks for Multimodal Video Question Answering},
  journal   = IJCV,
  volume    = {129},
  year      = {2021},
  number    = {11},
  month     = nov,
  pages     = {3027--3050},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208182},
}

@inproceedings{bb213107,
  author    = {Le, T.M. and Le, V. and Venkatesh, S. and Tran, T.},
  title     = {Hierarchical Conditional Relation Networks for Video Question Answering},
  booktitle = CVPR20,
  year      = {2020},
  pages     = {9969--9978},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208183},
}

@article{bb213108,
  author    = {Su, H.T. and Chang, C.H. and Shen, P.W. and Wang, Y.S. and Chang, Y.L. and Chang, Y.C. and Cheng, P.J. and Hsu, W.H.},
  title     = {End-to-End Video Question-Answer Generation With Generator-Pretester Network},
  journal   = CirSysVideo,
  volume    = {31},
  year      = {2021},
  number    = {11},
  month     = nov,
  pages     = {4497--4507},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208184},
}

@article{bb213109,
  author    = {Gao, L.L. and Lei, Y. and Zeng, P.P. and Song, J.K. and Wang, M. and Shen, H.T.},
  title     = {Hierarchical Representation Network With Auxiliary Tasks for Video Captioning and Video Question Answering},
  journal   = IP,
  volume    = {31},
  year      = {2022},
  pages     = {202--215},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208185},
}

@article{bb213110,
  author    = {Zhang, J.P. and Shao, J. and Cao, R. and Gao, L.L. and Xu, X. and Shen, H.T.},
  title     = {Action-Centric Relation Transformer Network for Video Question Answering},
  journal   = CirSysVideo,
  volume    = {32},
  year      = {2022},
  number    = {1},
  month     = jan,
  pages     = {63--74},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208186},
}

@article{bb213111,
  author    = {Zhang, H. and Sun, A. and Jing, W. and Zhen, L.L. and Zhou, J.T.Y. and Goh, R.S.M.},
  title     = {Natural Language Video Localization: A Revisit in Span-Based Question Answering Framework},
  journal   = PAMI,
  volume    = {44},
  year      = {2022},
  number    = {8},
  month     = aug,
  pages     = {4252--4266},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208187},
}

@article{bb213112,
  author    = {Wang, J.Y. and Bao, B.K. and Xu, C.S.},
  title     = {{DualVGR}: A Dual-Visual Graph Reasoning Unit for Video Question Answering},
  journal   = MultMed,
  volume    = {24},
  year      = {2022},
  pages     = {3369--3380},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208188},
}

@article{bb213113,
  author    = {Zeng, P.P. and Zhang, H.N. and Gao, L. and Song, J.K. and Shen, H.T.},
  title     = {Video Question Answering With Prior Knowledge and Object-Sensitive Learning},
  journal   = IP,
  volume    = {31},
  year      = {2022},
  pages     = {5936--5948},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208189},
}

@article{bb213114,
  author    = {Gan, Z. and Li, L.J. and Li, C.Y. and Wang, L.J. and Liu, Z.C. and Gao, J.F.},
  title     = {Vision-Language Pre-Training: Basics, Recent Advances, and Future Trends},
  journal   = FTCGV,
  volume    = {14},
  year      = {2022},
  number    = {3-4},
  pages     = {163--352},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208190},
}

@article{bb213115,
  author    = {Zhang, F. and Wang, R. and Zhou, F. and Luo, Y.M.},
  title     = {{ERM}: Energy-Based Refined-Attention Mechanism for Video Question Answering},
  journal   = CirSysVideo,
  volume    = {33},
  year      = {2023},
  number    = {3},
  month     = mar,
  pages     = {1454--1467},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208191},
}

% NOTE(review): the source listed PAGES as the placeholder "xx-yy". It is
% omitted here so that no bogus page range is printed; add the real page
% range or article number once it has been confirmed.
@article{bb213116,
  author    = {Yang, J. and Jang, H. and Yu, K.},
  title     = {Analyzing Geographic Questions Using Embedding-based Topic Modeling},
  journal   = IJGI,
  volume    = {12},
  year      = {2023},
  number    = {2},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208192},
}

% NOTE(review): the roman numeral before the colon in pages looks like a
% proceedings part/volume number -- confirm before moving it to another field.
@inproceedings{bb213117,
  author    = {Zhao, S.W. and Liu, Y.Y. and Du, S. and Tian, Z.Q. and Qu, T. and Xu, L.H.},
  title     = {{CMFG}: Cross-model Fine-grained Feature Interaction for Text-video Retrieval},
  booktitle = MMMod23,
  year      = {2023},
  pages     = {II: 435--445},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208193},
}

@article{bb213118,
  author    = {Luo, H.N. and Lin, G.S. and Yao, Y.Z. and Liu, F.Y. and Liu, Z.C. and Tang, Z.M.},
  title     = {Depth and Video Segmentation Based Visual Attention for Embodied Question Answering},
  journal   = PAMI,
  volume    = {45},
  year      = {2023},
  number    = {6},
  month     = jun,
  pages     = {6807--6819},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208194},
}

@inproceedings{bb213119,
  author    = {Luo, H.N. and Lin, G.S. and Liu, Z.C. and Liu, F.Y. and Tang, Z.M. and Yao, Y.Z.},
  title     = {{SegEQA}: Video Segmentation Based Visual Attention for Embodied Question Answering},
  booktitle = ICCV19,
  year      = {2019},
  pages     = {9666--9675},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208195},
}

@article{bb213120,
  author    = {Zhang, X. and Zhang, F.F. and Xu, C.S.},
  title     = {Reducing Vision-Answer Biases for Multiple-Choice {VQA}},
  journal   = IP,
  volume    = {32},
  year      = {2023},
  pages     = {4621--4634},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208196},
}

@article{bb213121,
  author    = {Xiao, J.B. and Zhou, P. and Yao, A. and Li, Y.C. and Hong, R.C. and Yan, S.C. and Chua, T.S.},
  title     = {Contrastive Video Question Answering via Video Graph Transformer},
  journal   = PAMI,
  volume    = {45},
  year      = {2023},
  number    = {11},
  month     = nov,
  pages     = {13265--13280},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208197},
}

% NOTE(review): the roman numeral before the colon in pages looks like a
% proceedings part/volume number -- confirm before moving it to another field.
@inproceedings{bb213122,
  author    = {Xiao, J.B. and Zhou, P. and Chua, T.S. and Yan, S.C.},
  title     = {Video Graph Transformer for Video Question Answering},
  booktitle = ECCV22,
  year      = {2022},
  pages     = {XXXVI:39--58},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208198},
}

@article{bb213123,
  author    = {Shen, W.X. and Song, J.K. and Zhu, X.S. and Li, G.F. and Shen, H.T.},
  title     = {End-to-End Pre-Training With Hierarchical Matching and Momentum Contrast for Text-Video Retrieval},
  journal   = IP,
  volume    = {32},
  year      = {2023},
  pages     = {5017--5030},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208199},
}

@article{bb213124,
  author    = {Jiang, J.J. and Liu, Z. and Zheng, N.N.},
  title     = {{LiVLR}: A Lightweight Visual-Linguistic Reasoning Framework for Video Question Answering},
  journal   = MultMed,
  volume    = {25},
  year      = {2023},
  pages     = {5002--5013},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208200},
}

% Pages holds a single number here rather than a range.
@article{bb213125,
  author    = {Xu, F.F. and Zhu, Y. and Wang, C. and Cao, Y.Z. and Zhong, Z. and Li, X.M.},
  title     = {Spatio-Temporal Two-stage Fusion for video question answering},
  journal   = CVIU,
  volume    = {237},
  year      = {2023},
  pages     = {103821},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208201},
}

@article{bb213126,
  author    = {Wang, Y.Y. and Liu, M. and Wu, J.L. and Nie, L.Q.},
  title     = {Multi-Granularity Interaction and Integration Network for Video Question Answering},
  journal   = CirSysVideo,
  volume    = {33},
  year      = {2023},
  number    = {12},
  month     = dec,
  pages     = {7684--7695},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208202},
}

@article{bb213127,
  author    = {Bai, Z. and Wang, R.P. and Gao, D.F. and Chen, X.L.},
  title     = {Event Graph Guided Compositional Spatial-Temporal Reasoning for Video Question Answering},
  journal   = IP,
  volume    = {33},
  year      = {2024},
  pages     = {1109--1121},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208203},
}

@article{bb213128,
  author    = {Qian, T.W. and Cui, R. and Chen, J.J. and Peng, P. and Guo, X.W. and Jiang, Y.G.},
  title     = {Locate Before Answering: Answer Guided Question Localization for Video Question Answering},
  journal   = MultMed,
  volume    = {26},
  year      = {2024},
  pages     = {4554--4563},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208204},
}

@article{bb213129,
  author    = {Cheng, Y. and Fan, H. and Lin, D.Y. and Sun, Y. and Kankanhalli, M. and Lim, J.H.},
  title     = {Keyword-Aware Relative Spatio-Temporal Graph Networks for Video Question Answering},
  journal   = MultMed,
  volume    = {26},
  year      = {2024},
  pages     = {6131--6141},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208205},
}

% Pages holds a single number here rather than a range.
@article{bb213130,
  author    = {Jiang, Y.M. and Yan, T. and Yao, M.Z. and Wang, H. and Liu, W.Z.},
  title     = {Cascade transformers with dynamic attention for video question answering},
  journal   = CVIU,
  volume    = {242},
  year      = {2024},
  pages     = {103983},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208206},
}

@inproceedings{bb213131,
  author    = {Zonneveld, A. and Gatt, A. and Calixto, I.},
  title     = {Video-and-Language ({VidL}) models and their cognitive relevance},
  booktitle = MMFM23,
  year      = {2023},
  pages     = {325--338},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208207},
}

@inproceedings{bb213132,
  author    = {Momeni, L. and Caron, M. and Nagrani, A. and Zisserman, A. and Schmid, C.},
  title     = {Verbs in Action: Improving verb understanding in video-language models},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {15533--15545},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208208},
}

@inproceedings{bb213133,
  author    = {Jin, P. and Li, H. and Cheng, Z. and Li, K. and Ji, X.Y. and Liu, C. and Yuan, L. and Chen, J.},
  title     = {{DiffusionRet}: Generative Text-Video Retrieval with Diffusion Model},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {2470--2481},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208209},
}

@inproceedings{bb213134,
  author    = {Li, P.D. and Xie, C.W. and Zhao, L.M. and Xie, H.T. and Ge, J.N. and Zheng, Y. and Zhao, D.L. and Zhang, Y.D.},
  title     = {Progressive Spatio-Temporal Prototype Matching for Text-Video Retrieval},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {4077--4087},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208210},
}

@inproceedings{bb213135,
  author    = {Guan, P.Y. and Pei, R.J. and Shao, B. and Liu, J.Z. and Li, W. and Gu, J.X. and Xu, H. and Xu, S.C. and Yan, Y. and Lam, E.Y.},
  title     = {{PIDRo}: Parallel Isomeric Attention with Dynamic Routing for Text-Video Retrieval},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {11130--11139},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208211},
}

@inproceedings{bb213136,
  author    = {Deng, C.R. and Chen, Q. and Qin, P. and Chen, D. and Wu, Q.},
  title     = {Prompt Switch: Efficient {CLIP} Adaptation for Text-Video Retrieval},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {15602--15612},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208212},
}

@inproceedings{bb213137,
  author    = {Pirhadi, M.J. and Mirzaei, M. and Eetemadi, S.},
  title     = {{Just Ask Plus}: Using Transcripts for {VideoQA}},
  booktitle = ASI23,
  year      = {2023},
  pages     = {3074--3077},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208213},
}

@inproceedings{bb213138,
  author    = {Ahmad, M. and Park, G. and Park, D. and Park, S.},
  title     = {{MMTF}: Multi-Modal Temporal Fusion for Commonsense Video Question Answering},
  booktitle = VLAR23,
  year      = {2023},
  pages     = {4659--4664},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208214},
}

@inproceedings{bb213139,
  author    = {Engin, D. and Avrithis, Y.},
  title     = {Zero-Shot and Few-Shot Video Question Answering with Multi-Modal Prompts},
  booktitle = CLVL23,
  year      = {2023},
  pages     = {2797--2802},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208215},
}

@inproceedings{bb213140,
  author    = {Nuthalapati, S.V. and Tunga, A.},
  title     = {Coarse to Fine Frame Selection for Online Open-ended Video Question Answering},
  booktitle = MMFM23,
  year      = {2023},
  pages     = {353--361},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208216},
}

@inproceedings{bb213141,
  author    = {Li, Y.C. and Xiao, J.B. and Feng, C. and Wang, X. and Chua, T.S.},
  title     = {Discovering Spatio-Temporal Rationales for Video Question Answering},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {13823--13832},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208217},
}

@inproceedings{bb213142,
  author    = {Ko, D. and Lee, J.S. and Choi, M. and Chu, J.W. and Park, J. and Kim, H.W.J.},
  title     = {Open-Vocabulary Video Question Answering: A New Benchmark for Evaluating the Generalizability of Video Question Answering Models},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {3078--3089},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208218},
}

@inproceedings{bb213143,
  author    = {Li, J. and Niu, L. and Zhang, L.Q.},
  title     = {Knowledge Proxy Intervention for Deconfounded Video Question Answering},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {2770--2781},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208219},
}

@inproceedings{bb213144,
  author    = {Chen, G.Y. and Liu, X. and Wang, G. and Zhang, K. and Torr, P.H.S. and Zhang, X.P. and Tang, Y.S.},
  title     = {Tem-adapter: Adapting Image-Text Pretraining for Video Question Answer},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {13899--13909},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208220},
}

@inproceedings{bb213145,
  author    = {Pan, J.T. and Lin, Z. and Ge, Y.Y. and Zhu, X.T. and Zhang, R. and Wang, Y. and Qiao, Y. and Li, H.S.},
  title     = {Retrieving-to-Answer: Zero-Shot Video Question Answering with Frozen Large Language Models},
  booktitle = MMFM23,
  year      = {2023},
  pages     = {272--283},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208221},
}

@inproceedings{bb213146,
  author    = {Jahagirdar, S. and Mathew, M. and Karatzas, D. and Jawahar, C.V.},
  title     = {Understanding Video Scenes through Text: Insights from Text-based Video Question Answering},
  booktitle = VLAR23,
  year      = {2023},
  pages     = {4648--4652},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208222},
}

@inproceedings{bb213147,
  author    = {Peng, M. and Liu, L.C. and Li, Z.H. and Shi, Y. and Zhou, X.D.},
  title     = {Multi-Semantic Alignment Co-Reasoning Network for Video Question Answering},
  booktitle = ICIP23,
  year      = {2023},
  pages     = {2090--2094},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208223},
}

@inproceedings{bb213148,
  author    = {Ye, S.H. and Kong, W. and Yao, C. and Ren, J.F. and Jiang, X.D.},
  title     = {Video Question Answering Using Clip-Guided Visual-Text Attention},
  booktitle = ICIP23,
  year      = {2023},
  pages     = {81--85},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208224},
}

@inproceedings{bb213149,
  author    = {Khan, Z. and Kumar, B.V. and Schulter, S. and Yu, X. and Fu, Y. and Chandraker, M.},
  title     = {Q: How to Specialize Large Vision-Language Models to Data-Scarce {VQA} Tasks? {A}: Self-Train on Unlabeled Images!},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {15005--15015},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208225},
}

@inproceedings{bb213150,
  author    = {Su, H.T. and Niu, Y. and Lin, X.D. and Hsu, W.H. and Chang, S.F.},
  title     = {Language Models are Causal Knowledge Extractors for Zero-shot Video Question Answering},
  booktitle = L3D-IVU23,
  year      = {2023},
  pages     = {4951--4960},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208226},
}

@inproceedings{bb213151,
  author    = {Zang, C.Q. and Wang, H.Q. and Pei, M.T. and Liang, W.},
  title     = {Discovering the Real Association: Multimodal Causal Reasoning in Video Question Answering},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {19027--19036},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208227},
}

@inproceedings{bb213152,
  author    = {Gao, D.F. and Zhou, L. and Ji, L. and Zhu, L.C. and Yang, Y. and Shou, M.Z.},
  title     = {{MIST}: Multi-modal Iterative Spatial-Temporal Transformer for Long-form Video Question Answering},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {14773--14783},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208228},
}

@inproceedings{bb213153,
  author    = {Khan, A.U. and Kuehne, H. and Wu, B. and Chheu, K. and Bousselham, W. and Gan, C. and Lobo, N. and Shah, M.},
  title     = {Learning Situation Hyper-Graphs for Video Question Answering},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {14879--14889},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208229},
}

@inproceedings{bb213154,
  author    = {Jahagirdar, S. and Mathew, M. and Karatzas, D. and Jawahar, C.V.},
  title     = {Watching the News: Towards {VideoQA} Models that can Read},
  booktitle = WACV23,
  year      = {2023},
  pages     = {4430--4439},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208230},
}

@inproceedings{bb213155,
  author    = {Zhang, M. and Hwa, R. and Kovashka, A.},
  title     = {How to Practice {VQA} on a Resource-limited Target Domain},
  booktitle = WACV23,
  year      = {2023},
  pages     = {4440--4449},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208231},
}

@inproceedings{bb213156,
  author    = {Lee, J. and Kang, W. and Kim, E.S.},
  title     = {Dense but Efficient {VideoQA} for Intricate Compositional Reasoning},
  booktitle = WACV23,
  year      = {2023},
  pages     = {1114--1123},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208232},
}

@inproceedings{bb213157,
  author    = {Shen, R. and Inoue, N. and Shinoda, K.},
  title     = {Text-Guided Object Detector for Multi-modal Video Question Answering},
  booktitle = WACV23,
  year      = {2023},
  pages     = {1032--1042},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208233},
}

% NOTE(review): the roman numeral before the colon in pages looks like a
% proceedings part/volume number -- confirm before moving it to another field.
@inproceedings{bb213158,
  author    = {Fang, S. and Wang, S.H. and Zhuo, J. and Han, X.Z. and Huang, Q.M.},
  title     = {Learning Linguistic Association Towards Efficient Text-Video Retrieval},
  booktitle = ECCV22,
  year      = {2022},
  pages     = {XXXVI:254--270},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208234},
}

% NOTE(review): the roman numeral before the colon in pages looks like a
% proceedings part/volume number -- confirm before moving it to another field.
@inproceedings{bb213159,
  author    = {Piergiovanni, A.J. and Morton, K. and Kuo, W.C. and Ryoo, M.S. and Angelova, A.},
  title     = {Video Question Answering with Iterative Video-Text Co-tokenization},
  booktitle = ECCV22,
  year      = {2022},
  pages     = {XXXVI:76--94},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208235},
}

% The pronoun I is brace-protected so sentence-casing styles keep it uppercase.
@inproceedings{bb213160,
  author    = {Barmann, L. and Waibel, A.},
  title     = {Where did {I} leave my keys?: Episodic-Memory-Based Question Answering on Egocentric Videos},
  booktitle = Ego4D-EPIC22,
  year      = {2022},
  pages     = {1559--1567},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208236},
}

@inproceedings{bb213161,
  author    = {Li, J.T. and Niu, L. and Zhang, L.Q.},
  title     = {From Representation to Reasoning: Towards both Evidence and Commonsense Reasoning for Video Question-Answering},
  booktitle = CVPR22,
  year      = {2022},
  pages     = {21241--21250},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208237},
}

@inproceedings{bb213162,
  author    = {Datta, S. and Dharur, S. and Cartillier, V. and Desai, R. and Khanna, M. and Batra, D. and Parikh, D.},
  title     = {Episodic Memory Question Answering},
  booktitle = CVPR22,
  year      = {2022},
  pages     = {19097--19106},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208238},
}

@inproceedings{bb213163,
  author    = {Gandhi, M. and Gul, M.O. and Prakash, E. and Grunde McLaughlin, M. and Krishna, R. and Agrawala, M.},
  title     = {Measuring Compositional Consistency for Video Question Answering},
  booktitle = CVPR22,
  year      = {2022},
  pages     = {5036--5045},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208239},
}

@inproceedings{bb213164,
  author    = {Gorti, S.K. and Vouitsis, N. and Ma, J.W. and Golestan, K. and Volkovs, M. and Garg, A. and Yu, G.},
  title     = {{X-Pool}: Cross-Modal Language-Video Attention for Text-Video Retrieval},
  booktitle = CVPR22,
  year      = {2022},
  pages     = {4996--5005},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208240},
}

@inproceedings{bb213165,
  author    = {Li, J.C. and Tang, S.L. and Zhu, L.C. and Shi, H. and Huang, X. and Wu, F. and Yang, Y. and Zhuang, Y.T.},
  title     = {Adaptive Hierarchical Graph Reasoning with Semantic Coherence for Video-and-Language Inference},
  booktitle = ICCV21,
  year      = {2021},
  pages     = {1847--1857},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208241},
}

@inproceedings{bb213166,
  author    = {Zhang, M.X. and Yang, Y. and Chen, X. and Ji, Y.L. and Xu, X. and Li, J.J. and Shen, H.T.},
  title     = {Multi-stage Aggregated Transformer Network for Temporal Language Localization in Videos},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {12664--12673},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208242},
}

@inproceedings{bb213167,
  author    = {Kim, N. and Ha, S.J. and Kang, J.W.},
  title     = {Video Question Answering Using Language-Guided Deep Compressed-Domain Video Feature},
  booktitle = ICCV21,
  year      = {2021},
  pages     = {1688--1697},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208243},
}

@inproceedings{bb213168,
  author    = {Liu, F. and Liu, J. and Wang, W.N. and Lu, H.Q.},
  title     = {{HAIR}: Hierarchical Visual-Semantic Relational Reasoning for Video Question Answering},
  booktitle = ICCV21,
  year      = {2021},
  pages     = {1678--1687},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208244},
}

@inproceedings{bb213169,
  author    = {Yang, A. and Miech, A. and Sivic, J. and Laptev, I. and Schmid, C.},
  title     = {{Just Ask}: Learning to Answer Questions from Millions of Narrated Videos},
  booktitle = ICCV21,
  year      = {2021},
  pages     = {1666--1677},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208245},
}

@inproceedings{bb213170,
  author    = {Gao, D.F. and Wang, R.P. and Bai, Z. and Chen, X.L.},
  title     = {{Env-QA}: A Video Question Answering Benchmark for Comprehensive Understanding of Dynamic Environments},
  booktitle = ICCV21,
  year      = {2021},
  pages     = {1655--1665},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208246},
}

% The degree sign is written as a LaTeX macro so the entry stays pure ASCII
% for classic 8-bit BibTeX toolchains.
@inproceedings{bb213171,
  author    = {Yun, H. and Yu, Y. and Yang, W. and Lee, K. and Kim, G.},
  title     = {{Pano-AVQA}: Grounded Audio-Visual Question Answering on 360{\textdegree} Videos},
  booktitle = ICCV21,
  year      = {2021},
  pages     = {2011--2021},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208247},
}

@inproceedings{bb213172,
  author    = {Xu, L. and Huang, H. and Liu, J.},
  title     = {{SUTD-TrafficQA}: A Question Answering Benchmark and an Efficient Network for Video Reasoning over Traffic Events},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {9873--9883},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208248},
}

@inproceedings{bb213173,
  author    = {Park, J. and Lee, J.Y. and Sohn, K.H.},
  title     = {Bridge to Answer: Structure-aware Graph Interaction Network for Video Question Answering},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {15521--15530},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208249},
}

% NOTE(review): the roman numeral before the colon in pages looks like a
% proceedings part/volume number -- confirm before moving it to another field.
@inproceedings{bb213174,
  author    = {Chen, X.W. and Liu, R. and Song, X.M. and Han, Y.H.},
  title     = {Locating Visual Explanations for Video Question Answering},
  booktitle = MMMod21,
  year      = {2021},
  pages     = {I:290--302},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208250},
}

% NOTE(review): the roman numeral before the colon in pages looks like a
% proceedings part/volume number -- confirm before moving it to another field.
@inproceedings{bb213175,
  author    = {Garcia, N. and Nakashima, Y.},
  title     = {Knowledge-based Video Question Answering with Unsupervised Scene Descriptions},
  booktitle = ECCV20,
  year      = {2020},
  pages     = {XVIII:581--598},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208251},
}

@inproceedings{bb213176,
  author    = {Kim, J. and Ma, M. and Pham, T. and Kim, K. and Yoo, C.D.},
  title     = {Modality Shifting Attention Network for Multi-Modal Video Question Answering},
  booktitle = CVPR20,
  year      = {2020},
  pages     = {10103--10112},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208252},
}

@inproceedings{bb213177,
  author    = {Jiang, M. and Chen, S. and Yang, J. and Zhao, Q.},
  title     = {Fantastic Answers and Where to Find Them: Immersive Question-Directed Visual Attention},
  booktitle = CVPR20,
  year      = {2020},
  pages     = {2977--2986},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208253},
}

@inproceedings{bb213178,
  author    = {Yang, Z. and Garcia, N. and Chu, C. and Otani, M. and Nakashima, Y. and Takemura, H.},
  title     = {{BERT} Representations for Video Question Answering},
  booktitle = WACV20,
  year      = {2020},
  pages     = {1545--1554},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208254},
}

@inproceedings{bb213179,
  author    = {Fan, C.Y. and Zhang, X.F. and Zhang, S. and Wang, W.S. and Zhang, C. and Huang, H.},
  title     = {Heterogeneous Memory Enhanced Multimodal Attention Model for Video Question Answering},
  booktitle = CVPR19,
  year      = {2019},
  pages     = {1999--2007},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208255},
}

@inproceedings{bb213180,
  author    = {Kim, J.Y. and Ma, M. and Kim, K. and Kim, S. and Yoo, C.D.},
  title     = {Progressive Attention Memory Network for Movie Story Question Answering},
  booktitle = CVPR19,
  year      = {2019},
  pages     = {8329--8338},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208256},
}

% NOTE(review): the roman numeral before the colon in pages looks like a
% proceedings part/volume number -- confirm before moving it to another field.
@inproceedings{bb213181,
  author    = {Liu, C.N. and Chen, D.J. and Chen, H.T. and Liu, T.L.},
  title     = {{A2A}: Attention to Attention Reasoning for Movie Question Answering},
  booktitle = ACCV18,
  year      = {2018},
  pages     = {VI:404--419},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208257},
}

@inproceedings{bb213182,
        AUTHOR = "Gao, J. and Ge, R. and Chen, K. and Nevatia, R.",
        TITLE = "Motion-Appearance Co-memory Networks for Video Question Answering",
        BOOKTITLE = CVPR18,
        YEAR = "2018",
        PAGES = "6576--6585",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208258"}

@inproceedings{bb213183,
        AUTHOR = "Kim, K. M. and Choi, S. H. and Kim, J. H. and Zhang, B. T.",
        TITLE = "Multimodal Dual Attention Memory for Video Story Question Answering",
        BOOKTITLE = ECCV18,
        YEAR = "2018",
        PAGES = "XV: 698--713",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208259"}

@inproceedings{bb213184,
        AUTHOR = "Yu, Y. J. and Kim, J. S. and Kim, G.",
        TITLE = "A Joint Sequence Fusion Model for Video Question Answering and Retrieval",
        BOOKTITLE = ECCV18,
        YEAR = "2018",
        PAGES = "VII: 487--503",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208260"}

@inproceedings{bb213185,
        AUTHOR = "Hasan Chowdhury, M. I. and Nguyen, K. and Sridharan, S. and Fookes, C.",
        TITLE = "Hierarchical Relational Attention for Video Question Answering",
        BOOKTITLE = ICIP18,
        YEAR = "2018",
        PAGES = "599--603",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208261"}

@inproceedings{bb213186,
        AUTHOR = "Mun, J. and Seo, P. H. and Jung, I. and Han, B. H.",
        TITLE = "{MarioQA}: Answering Questions by Watching Gameplay Videos",
        BOOKTITLE = ICCV17,
        YEAR = "2017",
        PAGES = "2886--2894",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208262"}

@inproceedings{bb213187,
        AUTHOR = "Yu, Y. and Ko, H. and Choi, J. and Kim, G.",
        TITLE = "End-to-End Concept Word Detection for Video Captioning, Retrieval, and Question Answering",
        BOOKTITLE = CVPR17,
        YEAR = "2017",
        PAGES = "3261--3269",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208263"}

@article{bb213188,
        AUTHOR = "Kafle, K. and Kanan, C.",
        TITLE = "Visual question answering: Datasets, algorithms, and future challenges",
        JOURNAL = CVIU,
        VOLUME = "163",
        YEAR = "2017",
        NUMBER = "1",
        PAGES = "3--20",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208266"}

@article{bb213189,
        AUTHOR = "Wu, Q. and Teney, D. and Wang, P. and Shen, C. H. and Dick, A. and van den Hengel, A. J.",
        TITLE = "Visual question answering: A survey of methods and datasets",
        JOURNAL = CVIU,
        VOLUME = "163",
        YEAR = "2017",
        NUMBER = "1",
        PAGES = "21--40",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208267"}

@article{bb213190,
        AUTHOR = "Teney, D. and Wu, Q. and van den Hengel, A. J.",
        TITLE = "Visual Question Answering: A Tutorial",
        JOURNAL = SPMag,
        VOLUME = "34",
        YEAR = "2017",
        NUMBER = "6",
        MONTH = nov,
        PAGES = "63--75",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208268"}

@inproceedings{bb213191,
        AUTHOR = "Teney, D. and Liu, L. and van den Hengel, A. J.",
        TITLE = "Graph-Structured Representations for Visual Question Answering",
        BOOKTITLE = CVPR17,
        YEAR = "2017",
        PAGES = "3233--3241",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208269"}

@inproceedings{bb213192,
        AUTHOR = "Teney, D. and van den Hengel, A. J.",
        TITLE = "Visual Question Answering as a Meta Learning Task",
        BOOKTITLE = ECCV18,
        YEAR = "2018",
        PAGES = "XV: 229--245",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208270"}

@inproceedings{bb213193,
        AUTHOR = "Teney, D. and Abbasnejad, E. and van den Hengel, A. J.",
        TITLE = "Unshuffling Data for Improved Generalization in Visual Question Answering",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "1397--1407",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208271"}

@article{bb213194,
        AUTHOR = "Wu, Q. and Shen, C. H. and Wang, P. and Dick, A. and van den Hengel, A. J.",
        TITLE = "Image Captioning and Visual Question Answering Based on Attributes and External Knowledge",
        JOURNAL = PAMI,
        VOLUME = "40",
        YEAR = "2018",
        NUMBER = "6",
        MONTH = jun,
        PAGES = "1367--1381",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208272"}

@inproceedings{bb213195,
        AUTHOR = "Wu, Q. and Wang, P. and Shen, C. H. and Dick, A. and van den Hengel, A. J.",
        TITLE = "Ask Me Anything: Free-Form Visual Question Answering Based on Knowledge from External Sources",
        BOOKTITLE = CVPR16,
        YEAR = "2016",
        PAGES = "4622--4630",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208273"}

@article{bb213196,
        AUTHOR = "Tommasi, T. and Mallya, A. and Plummer, B. A. and Lazebnik, S. and Berg, A. C. and Berg, T. L.",
        TITLE = "Combining Multiple Cues for {Visual Madlibs} Question Answering",
        JOURNAL = IJCV,
        VOLUME = "127",
        YEAR = "2019",
        NUMBER = "1",
        MONTH = jan,
        PAGES = "38--60",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208274"}

@inproceedings{bb213197,
        AUTHOR = "Tommasi, T. and Mallya, A. and Plummer, B. A. and Lazebnik, S. and Berg, A. C. and Berg, T. L.",
        TITLE = "Solving {Visual Madlibs} with Multiple Cues",
        BOOKTITLE = BMVC16,
        YEAR = "2016",
        PAGESNOTE = "Page numbers not available; the source listed the placeholder xx-yy",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208275"}

@inproceedings{bb213198,
        AUTHOR = "Yu, L. C. and Park, E. and Berg, A. C. and Berg, T. L.",
        TITLE = "{Visual Madlibs}: Fill in the Blank Description Generation and Question Answering",
        BOOKTITLE = ICCV15,
        YEAR = "2015",
        PAGES = "2461--2469",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208276"}

@article{bb213199,
        AUTHOR = "Liu, F. and Xiang, T. and Hospedales, T. M. and Yang, W. K. and Sun, C. Y.",
        TITLE = "Inverse Visual Question Answering: A New Benchmark and {VQA} Diagnosis Tool",
        JOURNAL = PAMI,
        VOLUME = "42",
        YEAR = "2020",
        NUMBER = "2",
        MONTH = feb,
        PAGES = "460--474",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208277"}

Last update: Apr 18, 2024 at 11:38:49