% Bibliography records bb213100 through bb213199, one entry per paragraph and
% one field per line.
%
% JOURNAL and BOOKTITLE values (PRL, PR, IP, CVPR23, ...) are bare macro names.
% No matching string definitions appear in this section; presumably they are
% declared elsewhere in the bibliography -- TODO confirm before using this
% section on its own.
%
% BIBSOURCE is not a standard BibTeX field; it records the web page each
% record was taken from.

% ---- Records whose BIBSOURCE points at applicat803vidq2.html ----

@article{bb213100,
  AUTHOR = "Xie, Z. and Wu, K.W. and Zhang, X.Y. and Yang, X.M. and Hou, J.K.",
  TITLE = "Learning continuous temporal embedding of videos using pattern theory",
  JOURNAL = PRL,
  VOLUME = "146",
  YEAR = "2021",
  PAGES = "222-229",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208176"
}

@article{bb213101,
  AUTHOR = "Liu, Y. and Zhang, X.M. and Zhang, Q.Y. and Li, C.Z. and Huang, F. and Tang, X.H. and Li, Z.J.",
  TITLE = "Dual self-attention with co-attention networks for visual question answering",
  JOURNAL = PR,
  VOLUME = "117",
  YEAR = "2021",
  PAGES = "107956",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208177"
}

@article{bb213102,
  AUTHOR = "Liu, Y. and Zhang, X.M. and Huang, F. and Shen, S.X. and Tian, P. and Li, L. and Li, Z.J.",
  TITLE = "Dynamic Self-Attention with Vision Synchronization Networks for Video Question Answering",
  JOURNAL = PR,
  VOLUME = "132",
  YEAR = "2022",
  PAGES = "108959",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208178"
}

@article{bb213103,
  AUTHOR = "Liu, Y. and Zhang, X.M. and Huang, F. and Zhang, B. and Li, Z.J.",
  TITLE = "Cross-Attentional Spatio-Temporal Semantic Graph Networks for Video Question Answering",
  JOURNAL = IP,
  VOLUME = "31",
  YEAR = "2022",
  PAGES = "1684-1696",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208179"
}

@article{bb213104,
  AUTHOR = "Jin, W. and Zhao, Z. and Cao, X.C. and Zhu, J.M. and He, X.Q. and Zhuang, Y.T.",
  TITLE = "Adaptive Spatio-Temporal Graph Enhanced Vision-Language Representation for Video QA",
  JOURNAL = IP,
  VOLUME = "30",
  YEAR = "2021",
  PAGES = "5477-5489",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208180"
}

@article{bb213105,
  AUTHOR = "Gao, L. and Chen, T.M. and Li, X.P. and Zeng, P.P. and Zhao, L. and Li, Y.F.",
  TITLE = "Generalized pyramid co-attention with learnable aggregation net for video question answering",
  JOURNAL = PR,
  VOLUME = "120",
  YEAR = "2021",
  PAGES = "108145",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208181"
}

@article{bb213106,
  AUTHOR = "Le, T.M. and Le, V. and Venkatesh, S. and Tran, T.",
  TITLE = "Hierarchical Conditional Relation Networks for Multimodal Video Question Answering",
  JOURNAL = IJCV,
  VOLUME = "129",
  YEAR = "2021",
  NUMBER = "11",
  MONTH = "November",
  PAGES = "3027-3050",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208182"
}

@inproceedings{bb213107,
  AUTHOR = "Le, T.M. and Le, V. and Venkatesh, S. and Tran, T.",
  TITLE = "Hierarchical Conditional Relation Networks for Video Question Answering",
  BOOKTITLE = CVPR20,
  YEAR = "2020",
  PAGES = "9969-9978",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208183"
}

@article{bb213108,
  AUTHOR = "Su, H.T. and Chang, C.H. and Shen, P.W. and Wang, Y.S. and Chang, Y.L. and Chang, Y.C. and Cheng, P.J. and Hsu, W.H.",
  TITLE = "End-to-End Video Question-Answer Generation With Generator-Pretester Network",
  JOURNAL = CirSysVideo,
  VOLUME = "31",
  YEAR = "2021",
  NUMBER = "11",
  MONTH = "November",
  PAGES = "4497-4507",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208184"
}

@article{bb213109,
  AUTHOR = "Gao, L.L. and Lei, Y. and Zeng, P.P. and Song, J.K. and Wang, M. and Shen, H.T.",
  TITLE = "Hierarchical Representation Network With Auxiliary Tasks for Video Captioning and Video Question Answering",
  JOURNAL = IP,
  VOLUME = "31",
  YEAR = "2022",
  PAGES = "202-215",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208185"
}

@article{bb213110,
  AUTHOR = "Zhang, J.P. and Shao, J. and Cao, R. and Gao, L.L. and Xu, X. and Shen, H.T.",
  TITLE = "Action-Centric Relation Transformer Network for Video Question Answering",
  JOURNAL = CirSysVideo,
  VOLUME = "32",
  YEAR = "2022",
  NUMBER = "1",
  MONTH = "January",
  PAGES = "63-74",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208186"
}

@article{bb213111,
  AUTHOR = "Zhang, H. and Sun, A. and Jing, W. and Zhen, L.L. and Zhou, J.T.Y. and Goh, R.S.M.",
  TITLE = "Natural Language Video Localization: A Revisit in Span-Based Question Answering Framework",
  JOURNAL = PAMI,
  VOLUME = "44",
  YEAR = "2022",
  NUMBER = "8",
  MONTH = "August",
  PAGES = "4252-4266",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208187"
}

@article{bb213112,
  AUTHOR = "Wang, J.Y. and Bao, B.K. and Xu, C.S.",
  TITLE = "DualVGR: A Dual-Visual Graph Reasoning Unit for Video Question Answering",
  JOURNAL = MultMed,
  VOLUME = "24",
  YEAR = "2022",
  PAGES = "3369-3380",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208188"
}

@article{bb213113,
  AUTHOR = "Zeng, P.P. and Zhang, H.N. and Gao, L. and Song, J.K. and Shen, H.T.",
  TITLE = "Video Question Answering With Prior Knowledge and Object-Sensitive Learning",
  JOURNAL = IP,
  VOLUME = "31",
  YEAR = "2022",
  PAGES = "5936-5948",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208189"
}

@article{bb213114,
  AUTHOR = "Gan, Z. and Li, L.J. and Li, C.Y. and Wang, L.J. and Liu, Z.C. and Gao, J.F.",
  TITLE = "Vision-Language Pre-Training: Basics, Recent Advances, and Future Trends",
  JOURNAL = FTCGV,
  VOLUME = "14",
  YEAR = "2022",
  NUMBER = "3-4",
  PAGES = "163-352",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208190"
}

@article{bb213115,
  AUTHOR = "Zhang, F. and Wang, R. and Zhou, F. and Luo, Y.M.",
  TITLE = "ERM: Energy-Based Refined-Attention Mechanism for Video Question Answering",
  JOURNAL = CirSysVideo,
  VOLUME = "33",
  YEAR = "2023",
  NUMBER = "3",
  MONTH = "March",
  PAGES = "1454-1467",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208191"
}

% NOTE(review): PAGES "xx-yy" in the next record looks like a placeholder for
% unknown page numbers -- confirm and replace with the real range or article id.
@article{bb213116,
  AUTHOR = "Yang, J. and Jang, H. and Yu, K.",
  TITLE = "Analyzing Geographic Questions Using Embedding-based Topic Modeling",
  JOURNAL = IJGI,
  VOLUME = "12",
  YEAR = "2023",
  NUMBER = "2",
  PAGES = "xx-yy",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208192"
}

@inproceedings{bb213117,
  AUTHOR = "Zhao, S.W. and Liu, Y.Y. and Du, S. and Tian, Z.Q. and Qu, T. and Xu, L.H.",
  TITLE = "CMFG: Cross-model Fine-grained Feature Interaction for Text-video Retrieval",
  BOOKTITLE = MMMod23,
  YEAR = "2023",
  PAGES = "II: 435-445",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208193"
}

@article{bb213118,
  AUTHOR = "Luo, H.N. and Lin, G.S. and Yao, Y.Z. and Liu, F.Y. and Liu, Z.C. and Tang, Z.M.",
  TITLE = "Depth and Video Segmentation Based Visual Attention for Embodied Question Answering",
  JOURNAL = PAMI,
  VOLUME = "45",
  YEAR = "2023",
  NUMBER = "6",
  MONTH = "June",
  PAGES = "6807-6819",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208194"
}

@inproceedings{bb213119,
  AUTHOR = "Luo, H.N. and Lin, G.S. and Liu, Z.C. and Liu, F.Y. and Tang, Z.M. and Yao, Y.Z.",
  TITLE = "SegEQA: Video Segmentation Based Visual Attention for Embodied Question Answering",
  BOOKTITLE = ICCV19,
  YEAR = "2019",
  PAGES = "9666-9675",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208195"
}

@article{bb213120,
  AUTHOR = "Zhang, X. and Zhang, F.F. and Xu, C.S.",
  TITLE = "Reducing Vision-Answer Biases for Multiple-Choice VQA",
  JOURNAL = IP,
  VOLUME = "32",
  YEAR = "2023",
  PAGES = "4621-4634",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208196"
}

@article{bb213121,
  AUTHOR = "Xiao, J.B. and Zhou, P. and Yao, A. and Li, Y.C. and Hong, R.C. and Yan, S.C. and Chua, T.S.",
  TITLE = "Contrastive Video Question Answering via Video Graph Transformer",
  JOURNAL = PAMI,
  VOLUME = "45",
  YEAR = "2023",
  NUMBER = "11",
  MONTH = "November",
  PAGES = "13265-13280",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208197"
}

@inproceedings{bb213122,
  AUTHOR = "Xiao, J.B. and Zhou, P. and Chua, T.S. and Yan, S.C.",
  TITLE = "Video Graph Transformer for Video Question Answering",
  BOOKTITLE = ECCV22,
  YEAR = "2022",
  PAGES = "XXXVI:39-58",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208198"
}

@article{bb213123,
  AUTHOR = "Shen, W.X. and Song, J.K. and Zhu, X.S. and Li, G.F. and Shen, H.T.",
  TITLE = "End-to-End Pre-Training With Hierarchical Matching and Momentum Contrast for Text-Video Retrieval",
  JOURNAL = IP,
  VOLUME = "32",
  YEAR = "2023",
  PAGES = "5017-5030",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208199"
}

@article{bb213124,
  AUTHOR = "Jiang, J.J. and Liu, Z. and Zheng, N.N.",
  TITLE = "LiVLR: A Lightweight Visual-Linguistic Reasoning Framework for Video Question Answering",
  JOURNAL = MultMed,
  VOLUME = "25",
  YEAR = "2023",
  PAGES = "5002-5013",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208200"
}

@article{bb213125,
  AUTHOR = "Xu, F.F. and Zhu, Y. and Wang, C. and Cao, Y.Z. and Zhong, Z. and Li, X.M.",
  TITLE = "Spatio-Temporal Two-stage Fusion for video question answering",
  JOURNAL = CVIU,
  VOLUME = "237",
  YEAR = "2023",
  PAGES = "103821",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208201"
}

@article{bb213126,
  AUTHOR = "Wang, Y.Y. and Liu, M. and Wu, J.L. and Nie, L.Q.",
  TITLE = "Multi-Granularity Interaction and Integration Network for Video Question Answering",
  JOURNAL = CirSysVideo,
  VOLUME = "33",
  YEAR = "2023",
  NUMBER = "12",
  MONTH = "December",
  PAGES = "7684-7695",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208202"
}

@article{bb213127,
  AUTHOR = "Bai, Z. and Wang, R.P. and Gao, D.F. and Chen, X.L.",
  TITLE = "Event Graph Guided Compositional Spatial-Temporal Reasoning for Video Question Answering",
  JOURNAL = IP,
  VOLUME = "33",
  YEAR = "2024",
  PAGES = "1109-1121",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208203"
}

@article{bb213128,
  AUTHOR = "Qian, T.W. and Cui, R. and Chen, J.J. and Peng, P. and Guo, X.W. and Jiang, Y.G.",
  TITLE = "Locate Before Answering: Answer Guided Question Localization for Video Question Answering",
  JOURNAL = MultMed,
  VOLUME = "26",
  YEAR = "2024",
  PAGES = "4554-4563",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208204"
}

@article{bb213129,
  AUTHOR = "Cheng, Y. and Fan, H. and Lin, D.Y. and Sun, Y. and Kankanhalli, M. and Lim, J.H.",
  TITLE = "Keyword-Aware Relative Spatio-Temporal Graph Networks for Video Question Answering",
  JOURNAL = MultMed,
  VOLUME = "26",
  YEAR = "2024",
  PAGES = "6131-6141",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208205"
}

@article{bb213130,
  AUTHOR = "Jiang, Y.M. and Yan, T. and Yao, M.Z. and Wang, H. and Liu, W.Z.",
  TITLE = "Cascade transformers with dynamic attention for video question answering",
  JOURNAL = CVIU,
  VOLUME = "242",
  YEAR = "2024",
  PAGES = "103983",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208206"
}

@inproceedings{bb213131,
  AUTHOR = "Zonneveld, A. and Gatt, A. and Calixto, I.",
  TITLE = "Video-and-Language (VidL) models and their cognitive relevance",
  BOOKTITLE = MMFM23,
  YEAR = "2023",
  PAGES = "325-338",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208207"
}

@inproceedings{bb213132,
  AUTHOR = "Momeni, L. and Caron, M. and Nagrani, A. and Zisserman, A. and Schmid, C.",
  TITLE = "Verbs in Action: Improving verb understanding in video-language models",
  BOOKTITLE = ICCV23,
  YEAR = "2023",
  PAGES = "15533-15545",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208208"
}

@inproceedings{bb213133,
  AUTHOR = "Jin, P. and Li, H. and Cheng, Z. and Li, K. and Ji, X.Y. and Liu, C. and Yuan, L. and Chen, J.",
  TITLE = "DiffusionRet: Generative Text-Video Retrieval with Diffusion Model",
  BOOKTITLE = ICCV23,
  YEAR = "2023",
  PAGES = "2470-2481",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208209"
}

@inproceedings{bb213134,
  AUTHOR = "Li, P.D. and Xie, C.W. and Zhao, L.M. and Xie, H.T. and Ge, J.N. and Zheng, Y. and Zhao, D.L. and Zhang, Y.D.",
  TITLE = "Progressive Spatio-Temporal Prototype Matching for Text-Video Retrieval",
  BOOKTITLE = ICCV23,
  YEAR = "2023",
  PAGES = "4077-4087",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208210"
}

@inproceedings{bb213135,
  AUTHOR = "Guan, P.Y. and Pei, R.J. and Shao, B. and Liu, J.Z. and Li, W. and Gu, J.X. and Xu, H. and Xu, S.C. and Yan, Y. and Lam, E.Y.",
  TITLE = "PIDRo: Parallel Isomeric Attention with Dynamic Routing for Text-Video Retrieval",
  BOOKTITLE = ICCV23,
  YEAR = "2023",
  PAGES = "11130-11139",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208211"
}

@inproceedings{bb213136,
  AUTHOR = "Deng, C.R. and Chen, Q. and Qin, P. and Chen, D. and Wu, Q.",
  TITLE = "Prompt Switch: Efficient CLIP Adaptation for Text-Video Retrieval",
  BOOKTITLE = ICCV23,
  YEAR = "2023",
  PAGES = "15602-15612",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208212"
}

@inproceedings{bb213137,
  AUTHOR = "Pirhadi, M.J. and Mirzaei, M. and Eetemadi, S.",
  TITLE = "Just Ask Plus: Using Transcripts for VideoQA",
  BOOKTITLE = ASI23,
  YEAR = "2023",
  PAGES = "3074-3077",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208213"
}

@inproceedings{bb213138,
  AUTHOR = "Ahmad, M. and Park, G. and Park, D. and Park, S.",
  TITLE = "MMTF: Multi-Modal Temporal Fusion for Commonsense Video Question Answering",
  BOOKTITLE = VLAR23,
  YEAR = "2023",
  PAGES = "4659-4664",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208214"
}

@inproceedings{bb213139,
  AUTHOR = "Engin, D. and Avrithis, Y.",
  TITLE = "Zero-Shot and Few-Shot Video Question Answering with Multi-Modal Prompts",
  BOOKTITLE = CLVL23,
  YEAR = "2023",
  PAGES = "2797-2802",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208215"
}

@inproceedings{bb213140,
  AUTHOR = "Nuthalapati, S.V. and Tunga, A.",
  TITLE = "Coarse to Fine Frame Selection for Online Open-ended Video Question Answering",
  BOOKTITLE = MMFM23,
  YEAR = "2023",
  PAGES = "353-361",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208216"
}

@inproceedings{bb213141,
  AUTHOR = "Li, Y.C. and Xiao, J.B. and Feng, C. and Wang, X. and Chua, T.S.",
  TITLE = "Discovering Spatio-Temporal Rationales for Video Question Answering",
  BOOKTITLE = ICCV23,
  YEAR = "2023",
  PAGES = "13823-13832",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208217"
}

@inproceedings{bb213142,
  AUTHOR = "Ko, D. and Lee, J.S. and Choi, M. and Chu, J.W. and Park, J. and Kim, H.W.J.",
  TITLE = "Open-Vocabulary Video Question Answering: A New Benchmark for Evaluating the Generalizability of Video Question Answering Models",
  BOOKTITLE = ICCV23,
  YEAR = "2023",
  PAGES = "3078-3089",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208218"
}

@inproceedings{bb213143,
  AUTHOR = "Li, J. and Niu, L. and Zhang, L.Q.",
  TITLE = "Knowledge Proxy Intervention for Deconfounded Video Question Answering",
  BOOKTITLE = ICCV23,
  YEAR = "2023",
  PAGES = "2770-2781",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208219"
}

@inproceedings{bb213144,
  AUTHOR = "Chen, G.Y. and Liu, X. and Wang, G. and Zhang, K. and Torr, P.H.S. and Zhang, X.P. and Tang, Y.S.",
  TITLE = "Tem-adapter: Adapting Image-Text Pretraining for Video Question Answer",
  BOOKTITLE = ICCV23,
  YEAR = "2023",
  PAGES = "13899-13909",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208220"
}

@inproceedings{bb213145,
  AUTHOR = "Pan, J.T. and Lin, Z. and Ge, Y.Y. and Zhu, X.T. and Zhang, R. and Wang, Y. and Qiao, Y. and Li, H.S.",
  TITLE = "Retrieving-to-Answer: Zero-Shot Video Question Answering with Frozen Large Language Models",
  BOOKTITLE = MMFM23,
  YEAR = "2023",
  PAGES = "272-283",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208221"
}

@inproceedings{bb213146,
  AUTHOR = "Jahagirdar, S. and Mathew, M. and Karatzas, D. and Jawahar, C.V.",
  TITLE = "Understanding Video Scenes through Text: Insights from Text-based Video Question Answering",
  BOOKTITLE = VLAR23,
  YEAR = "2023",
  PAGES = "4648-4652",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208222"
}

@inproceedings{bb213147,
  AUTHOR = "Peng, M. and Liu, L.C. and Li, Z.H. and Shi, Y. and Zhou, X.D.",
  TITLE = "Multi-Semantic Alignment Co-Reasoning Network for Video Question Answering",
  BOOKTITLE = ICIP23,
  YEAR = "2023",
  PAGES = "2090-2094",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208223"
}

@inproceedings{bb213148,
  AUTHOR = "Ye, S.H. and Kong, W. and Yao, C. and Ren, J.F. and Jiang, X.D.",
  TITLE = "Video Question Answering Using Clip-Guided Visual-Text Attention",
  BOOKTITLE = ICIP23,
  YEAR = "2023",
  PAGES = "81-85",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208224"
}

@inproceedings{bb213149,
  AUTHOR = "Khan, Z. and Kumar, B.V. and Schulter, S. and Yu, X. and Fu, Y. and Chandraker, M.",
  TITLE = "Q: How to Specialize Large Vision-Language Models to Data-Scarce VQA Tasks? A: Self-Train on Unlabeled Images!",
  BOOKTITLE = CVPR23,
  YEAR = "2023",
  PAGES = "15005-15015",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208225"
}

@inproceedings{bb213150,
  AUTHOR = "Su, H.T. and Niu, Y. and Lin, X.D. and Hsu, W.H. and Chang, S.F.",
  TITLE = "Language Models are Causal Knowledge Extractors for Zero-shot Video Question Answering",
  BOOKTITLE = L3D-IVU23,
  YEAR = "2023",
  PAGES = "4951-4960",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208226"
}

@inproceedings{bb213151,
  AUTHOR = "Zang, C.Q. and Wang, H.Q. and Pei, M.T. and Liang, W.",
  TITLE = "Discovering the Real Association: Multimodal Causal Reasoning in Video Question Answering",
  BOOKTITLE = CVPR23,
  YEAR = "2023",
  PAGES = "19027-19036",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208227"
}

@inproceedings{bb213152,
  AUTHOR = "Gao, D.F. and Zhou, L. and Ji, L. and Zhu, L.C. and Yang, Y. and Shou, M.Z.",
  TITLE = "MIST: Multi-modal Iterative Spatial-Temporal Transformer for Long-form Video Question Answering",
  BOOKTITLE = CVPR23,
  YEAR = "2023",
  PAGES = "14773-14783",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208228"
}

@inproceedings{bb213153,
  AUTHOR = "Khan, A.U. and Kuehne, H. and Wu, B. and Chheu, K. and Bousselham, W. and Gan, C. and Lobo, N. and Shah, M.",
  TITLE = "Learning Situation Hyper-Graphs for Video Question Answering",
  BOOKTITLE = CVPR23,
  YEAR = "2023",
  PAGES = "14879-14889",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208229"
}

@inproceedings{bb213154,
  AUTHOR = "Jahagirdar, S. and Mathew, M. and Karatzas, D. and Jawahar, C.V.",
  TITLE = "Watching the News: Towards VideoQA Models that can Read",
  BOOKTITLE = WACV23,
  YEAR = "2023",
  PAGES = "4430-4439",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208230"
}

@inproceedings{bb213155,
  AUTHOR = "Zhang, M. and Hwa, R. and Kovashka, A.",
  TITLE = "How to Practice VQA on a Resource-limited Target Domain",
  BOOKTITLE = WACV23,
  YEAR = "2023",
  PAGES = "4440-4449",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208231"
}

@inproceedings{bb213156,
  AUTHOR = "Lee, J. and Kang, W. and Kim, E.S.",
  TITLE = "Dense but Efficient VideoQA for Intricate Compositional Reasoning",
  BOOKTITLE = WACV23,
  YEAR = "2023",
  PAGES = "1114-1123",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208232"
}

@inproceedings{bb213157,
  AUTHOR = "Shen, R. and Inoue, N. and Shinoda, K.",
  TITLE = "Text-Guided Object Detector for Multi-modal Video Question Answering",
  BOOKTITLE = WACV23,
  YEAR = "2023",
  PAGES = "1032-1042",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208233"
}

@inproceedings{bb213158,
  AUTHOR = "Fang, S. and Wang, S.H. and Zhuo, J. and Han, X.Z. and Huang, Q.M.",
  TITLE = "Learning Linguistic Association Towards Efficient Text-Video Retrieval",
  BOOKTITLE = ECCV22,
  YEAR = "2022",
  PAGES = "XXXVI:254-270",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208234"
}

@inproceedings{bb213159,
  AUTHOR = "Piergiovanni, A.J. and Morton, K. and Kuo, W.C. and Ryoo, M.S. and Angelova, A.",
  TITLE = "Video Question Answering with Iterative Video-Text Co-tokenization",
  BOOKTITLE = ECCV22,
  YEAR = "2022",
  PAGES = "XXXVI:76-94",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208235"
}

@inproceedings{bb213160,
  AUTHOR = "Barmann, L. and Waibel, A.",
  TITLE = "Where did I leave my keys?: Episodic-Memory-Based Question Answering on Egocentric Videos",
  BOOKTITLE = Ego4D-EPIC22,
  YEAR = "2022",
  PAGES = "1559-1567",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208236"
}

@inproceedings{bb213161,
  AUTHOR = "Li, J.T. and Niu, L. and Zhang, L.Q.",
  TITLE = "From Representation to Reasoning: Towards both Evidence and Commonsense Reasoning for Video Question-Answering",
  BOOKTITLE = CVPR22,
  YEAR = "2022",
  PAGES = "21241-21250",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208237"
}

@inproceedings{bb213162,
  AUTHOR = "Datta, S. and Dharur, S. and Cartillier, V. and Desai, R. and Khanna, M. and Batra, D. and Parikh, D.",
  TITLE = "Episodic Memory Question Answering",
  BOOKTITLE = CVPR22,
  YEAR = "2022",
  PAGES = "19097-19106",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208238"
}

@inproceedings{bb213163,
  AUTHOR = "Gandhi, M. and Gul, M.O. and Prakash, E. and Grunde McLaughlin, M. and Krishna, R. and Agrawala, M.",
  TITLE = "Measuring Compositional Consistency for Video Question Answering",
  BOOKTITLE = CVPR22,
  YEAR = "2022",
  PAGES = "5036-5045",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208239"
}

@inproceedings{bb213164,
  AUTHOR = "Gorti, S.K. and Vouitsis, N. and Ma, J.W. and Golestan, K. and Volkovs, M. and Garg, A. and Yu, G.",
  TITLE = "X-Pool: Cross-Modal Language-Video Attention for Text-Video Retrieval",
  BOOKTITLE = CVPR22,
  YEAR = "2022",
  PAGES = "4996-5005",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208240"
}

@inproceedings{bb213165,
  AUTHOR = "Li, J.C. and Tang, S.L. and Zhu, L.C. and Shi, H. and Huang, X. and Wu, F. and Yang, Y. and Zhuang, Y.T.",
  TITLE = "Adaptive Hierarchical Graph Reasoning with Semantic Coherence for Video-and-Language Inference",
  BOOKTITLE = ICCV21,
  YEAR = "2021",
  PAGES = "1847-1857",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208241"
}

@inproceedings{bb213166,
  AUTHOR = "Zhang, M.X. and Yang, Y. and Chen, X. and Ji, Y.L. and Xu, X. and Li, J.J. and Shen, H.T.",
  TITLE = "Multi-stage Aggregated Transformer Network for Temporal Language Localization in Videos",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "12664-12673",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208242"
}

@inproceedings{bb213167,
  AUTHOR = "Kim, N. and Ha, S.J. and Kang, J.W.",
  TITLE = "Video Question Answering Using Language-Guided Deep Compressed-Domain Video Feature",
  BOOKTITLE = ICCV21,
  YEAR = "2021",
  PAGES = "1688-1697",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208243"
}

@inproceedings{bb213168,
  AUTHOR = "Liu, F. and Liu, J. and Wang, W.N. and Lu, H.Q.",
  TITLE = "HAIR: Hierarchical Visual-Semantic Relational Reasoning for Video Question Answering",
  BOOKTITLE = ICCV21,
  YEAR = "2021",
  PAGES = "1678-1687",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208244"
}

@inproceedings{bb213169,
  AUTHOR = "Yang, A. and Miech, A. and Sivic, J. and Laptev, I. and Schmid, C.",
  TITLE = "Just Ask: Learning to Answer Questions from Millions of Narrated Videos",
  BOOKTITLE = ICCV21,
  YEAR = "2021",
  PAGES = "1666-1677",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208245"
}

@inproceedings{bb213170,
  AUTHOR = "Gao, D.F. and Wang, R.P. and Bai, Z. and Chen, X.L.",
  TITLE = "Env-QA: A Video Question Answering Benchmark for Comprehensive Understanding of Dynamic Environments",
  BOOKTITLE = ICCV21,
  YEAR = "2021",
  PAGES = "1655-1665",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208246"
}

% The degree sign in the next TITLE is written as the LaTeX command
% {\textdegree} so the value stays plain ASCII like the rest of this section.
@inproceedings{bb213171,
  AUTHOR = "Yun, H. and Yu, Y. and Yang, W. and Lee, K. and Kim, G.",
  TITLE = "Pano-AVQA: Grounded Audio-Visual Question Answering on 360{\textdegree} Videos",
  BOOKTITLE = ICCV21,
  YEAR = "2021",
  PAGES = "2011-2021",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208247"
}

@inproceedings{bb213172,
  AUTHOR = "Xu, L. and Huang, H. and Liu, J.",
  TITLE = "SUTD-TrafficQA: A Question Answering Benchmark and an Efficient Network for Video Reasoning over Traffic Events",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "9873-9883",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208248"
}

@inproceedings{bb213173,
  AUTHOR = "Park, J. and Lee, J.Y. and Sohn, K.H.",
  TITLE = "Bridge to Answer: Structure-aware Graph Interaction Network for Video Question Answering",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "15521-15530",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208249"
}

@inproceedings{bb213174,
  AUTHOR = "Chen, X.W. and Liu, R. and Song, X.M. and Han, Y.H.",
  TITLE = "Locating Visual Explanations for Video Question Answering",
  BOOKTITLE = MMMod21,
  YEAR = "2021",
  PAGES = "I:290-302",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208250"
}

@inproceedings{bb213175,
  AUTHOR = "Garcia, N. and Nakashima, Y.",
  TITLE = "Knowledge-based Video Question Answering with Unsupervised Scene Descriptions",
  BOOKTITLE = ECCV20,
  YEAR = "2020",
  PAGES = "XVIII:581-598",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208251"
}

@inproceedings{bb213176,
  AUTHOR = "Kim, J. and Ma, M. and Pham, T. and Kim, K. and Yoo, C.D.",
  TITLE = "Modality Shifting Attention Network for Multi-Modal Video Question Answering",
  BOOKTITLE = CVPR20,
  YEAR = "2020",
  PAGES = "10103-10112",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208252"
}

@inproceedings{bb213177,
  AUTHOR = "Jiang, M. and Chen, S. and Yang, J. and Zhao, Q.",
  TITLE = "Fantastic Answers and Where to Find Them: Immersive Question-Directed Visual Attention",
  BOOKTITLE = CVPR20,
  YEAR = "2020",
  PAGES = "2977-2986",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208253"
}

@inproceedings{bb213178,
  AUTHOR = "Yang, Z. and Garcia, N. and Chu, C. and Otani, M. and Nakashima, Y. and Takemura, H.",
  TITLE = "BERT Representations for Video Question Answering",
  BOOKTITLE = WACV20,
  YEAR = "2020",
  PAGES = "1545-1554",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208254"
}

@inproceedings{bb213179,
  AUTHOR = "Fan, C.Y. and Zhang, X.F. and Zhang, S. and Wang, W.S. and Zhang, C. and Huang, H.",
  TITLE = "Heterogeneous Memory Enhanced Multimodal Attention Model for Video Question Answering",
  BOOKTITLE = CVPR19,
  YEAR = "2019",
  PAGES = "1999-2007",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208255"
}

@inproceedings{bb213180,
  AUTHOR = "Kim, J.Y. and Ma, M. and Kim, K. and Kim, S. and Yoo, C.D.",
  TITLE = "Progressive Attention Memory Network for Movie Story Question Answering",
  BOOKTITLE = CVPR19,
  YEAR = "2019",
  PAGES = "8329-8338",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208256"
}

@inproceedings{bb213181,
  AUTHOR = "Liu, C.N. and Chen, D.J. and Chen, H.T. and Liu, T.L.",
  TITLE = "A2A: Attention to Attention Reasoning for Movie Question Answering",
  BOOKTITLE = ACCV18,
  YEAR = "2018",
  PAGES = "VI:404-419",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208257"
}

@inproceedings{bb213182,
  AUTHOR = "Gao, J. and Ge, R. and Chen, K. and Nevatia, R.",
  TITLE = "Motion-Appearance Co-memory Networks for Video Question Answering",
  BOOKTITLE = CVPR18,
  YEAR = "2018",
  PAGES = "6576-6585",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208258"
}

@inproceedings{bb213183,
  AUTHOR = "Kim, K.M. and Choi, S.H. and Kim, J.H. and Zhang, B.T.",
  TITLE = "Multimodal Dual Attention Memory for Video Story Question Answering",
  BOOKTITLE = ECCV18,
  YEAR = "2018",
  PAGES = "XV: 698-713",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208259"
}

@inproceedings{bb213184,
  AUTHOR = "Yu, Y.J. and Kim, J.S. and Kim, G.",
  TITLE = "A Joint Sequence Fusion Model for Video Question Answering and Retrieval",
  BOOKTITLE = ECCV18,
  YEAR = "2018",
  PAGES = "VII: 487-503",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208260"
}

@inproceedings{bb213185,
  AUTHOR = "Hasan Chowdhury, M.I. and Nguyen, K. and Sridharan, S. and Fookes, C.",
  TITLE = "Hierarchical Relational Attention for Video Question Answering",
  BOOKTITLE = ICIP18,
  YEAR = "2018",
  PAGES = "599-603",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208261"
}

@inproceedings{bb213186,
  AUTHOR = "Mun, J. and Seo, P.H. and Jung, I. and Han, B.H.",
  TITLE = "MarioQA: Answering Questions by Watching Gameplay Videos",
  BOOKTITLE = ICCV17,
  YEAR = "2017",
  PAGES = "2886-2894",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208262"
}

@inproceedings{bb213187,
  AUTHOR = "Yu, Y. and Ko, H. and Choi, J. and Kim, G.",
  TITLE = "End-to-End Concept Word Detection for Video Captioning, Retrieval, and Question Answering",
  BOOKTITLE = CVPR17,
  YEAR = "2017",
  PAGES = "3261-3269",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT208263"
}

% ---- Records whose BIBSOURCE points at applicat803vqds43.html ----

@article{bb213188,
  AUTHOR = "Kafle, K. and Kanan, C.",
  TITLE = "Visual question answering: Datasets, algorithms, and future challenges",
  JOURNAL = CVIU,
  VOLUME = "163",
  YEAR = "2017",
  NUMBER = "1",
  PAGES = "3-20",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208266"
}

@article{bb213189,
  AUTHOR = "Wu, Q. and Teney, D. and Wang, P. and Shen, C.H. and Dick, A. and van den Hengel, A.J.",
  TITLE = "Visual question answering: A survey of methods and datasets",
  JOURNAL = CVIU,
  VOLUME = "163",
  YEAR = "2017",
  NUMBER = "1",
  PAGES = "21-40",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208267"
}

@article{bb213190,
  AUTHOR = "Teney, D. and Wu, Q. and van den Hengel, A.J.",
  TITLE = "Visual Question Answering: A Tutorial",
  JOURNAL = SPMag,
  VOLUME = "34",
  YEAR = "2017",
  NUMBER = "6",
  MONTH = "November",
  PAGES = "63-75",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208268"
}

@inproceedings{bb213191,
  AUTHOR = "Teney, D. and Liu, L. and van den Hengel, A.J.",
  TITLE = "Graph-Structured Representations for Visual Question Answering",
  BOOKTITLE = CVPR17,
  YEAR = "2017",
  PAGES = "3233-3241",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208269"
}

@inproceedings{bb213192,
  AUTHOR = "Teney, D. and van den Hengel, A.J.",
  TITLE = "Visual Question Answering as a Meta Learning Task",
  BOOKTITLE = ECCV18,
  YEAR = "2018",
  PAGES = "XV: 229-245",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208270"
}

@inproceedings{bb213193,
  AUTHOR = "Teney, D. and Abbasnejad, E. and van den Hengel, A.J.",
  TITLE = "Unshuffling Data for Improved Generalization in Visual Question Answering",
  BOOKTITLE = ICCV21,
  YEAR = "2021",
  PAGES = "1397-1407",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208271"
}

@article{bb213194,
  AUTHOR = "Wu, Q. and Shen, C.H. and Wang, P. and Dick, A. and van den Hengel, A.J.",
  TITLE = "Image Captioning and Visual Question Answering Based on Attributes and External Knowledge",
  JOURNAL = PAMI,
  VOLUME = "40",
  YEAR = "2018",
  NUMBER = "6",
  MONTH = "June",
  PAGES = "1367-1381",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208272"
}

@inproceedings{bb213195,
  AUTHOR = "Wu, Q. and Wang, P. and Shen, C.H. and Dick, A. and van den Hengel, A.J.",
  TITLE = "Ask Me Anything: Free-Form Visual Question Answering Based on Knowledge from External Sources",
  BOOKTITLE = CVPR16,
  YEAR = "2016",
  PAGES = "4622-4630",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208273"
}

@article{bb213196,
  AUTHOR = "Tommasi, T. and Mallya, A. and Plummer, B.A. and Lazebnik, S. and Berg, A.C. and Berg, T.L.",
  TITLE = "Combining Multiple Cues for Visual Madlibs Question Answering",
  JOURNAL = IJCV,
  VOLUME = "127",
  YEAR = "2019",
  NUMBER = "1",
  MONTH = "January",
  PAGES = "38-60",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208274"
}

% NOTE(review): PAGES "xx-yy" in the next record looks like a placeholder for
% unknown page numbers -- confirm and replace with the real range.
@inproceedings{bb213197,
  AUTHOR = "Tommasi, T. and Mallya, A. and Plummer, B.A. and Lazebnik, S. and Berg, A.C. and Berg, T.L.",
  TITLE = "Solving Visual Madlibs with Multiple Cues",
  BOOKTITLE = BMVC16,
  YEAR = "2016",
  PAGES = "xx-yy",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208275"
}

@inproceedings{bb213198,
  AUTHOR = "Yu, L.C. and Park, E. and Berg, A.C. and Berg, T.L.",
  TITLE = "Visual Madlibs: Fill in the Blank Description Generation and Question Answering",
  BOOKTITLE = ICCV15,
  YEAR = "2015",
  PAGES = "2461-2469",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208276"
}

@article{bb213199,
  AUTHOR = "Liu, F. and Xiang, T. and Hospedales, T.M. and Yang, W.K. and Sun, C.Y.",
  TITLE = "Inverse Visual Question Answering: A New Benchmark and VQA Diagnosis Tool",
  JOURNAL = PAMI,
  VOLUME = "42",
  YEAR = "2020",
  NUMBER = "2",
  MONTH = "February",
  PAGES = "460-474",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vqds43.html#TT208277"
}