% Vision-language pre-training and video question answering references.
% Each entry keeps a BIBSOURCE field (ignored by standard styles) pointing at
% the visionbib.com page it was taken from.
% BOOKTITLE / JOURNAL values such as ICCV23, IP or CirSysVideo are bare string
% macros. NOTE(review): their definitions are not visible in this part of the
% file -- confirm they are declared before these entries are read.

@inproceedings{bb215800,
  AUTHOR    = "Wang, W. H. and Yang, Z. and Xu, B. and Li, J. and Sun, Y.",
  TITLE     = "{ViLTA}: Enhancing Vision-Language Pre-training through Textual Augmentation",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "3135--3146",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT210861"
}

@inproceedings{bb215801,
  AUTHOR    = "Wang, T. J. J. and Laaksonen, J. and Langer, T. and Arponen, H. and Bishop, T. E.",
  TITLE     = "Learning by Hallucinating: Vision-Language Pre-training with Weak Supervision",
  BOOKTITLE = WACV23,
  YEAR      = "2023",
  PAGES     = "1073--1083",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT210862"
}

@inproceedings{bb215802,
  AUTHOR    = "Boecking, B. and Usuyama, N. and Bannur, S. and Castro, D. C. and Schwaighofer, A. and Hyland, S. and Wetscherek, M. and Naumann, T. and Nori, A. and Alvarez Valle, J. and Poon, H. and Oktay, O.",
  TITLE     = "Making the Most of Text Semantics to Improve Biomedical Vision-Language Processing",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "XXXVI:1--21",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT210863"
}

@inproceedings{bb215803,
  AUTHOR    = "Cui, Q. and Zhou, B. and Guo, Y. and Yin, W. D. and Wu, H. and Yoshie, O. and Chen, Y.",
  TITLE     = "Contrastive Vision-Language Pre-training with Limited Resources",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "XXXVI:236--253",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT210864"
}

@inproceedings{bb215804,
  AUTHOR    = "Walmer, M. and Sikka, K. and Sur, I. and Shrivastava, A. and Jha, S.",
  TITLE     = "Dual-Key Multimodal Backdoors for Visual Question Answering",
  BOOKTITLE = CVPR22,
  YEAR      = "2022",
  PAGES     = "15354--15364",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT210865"
}

@inproceedings{bb215805,
  AUTHOR    = "Ding, Y. and Yu, J. and Liu, B. and Hu, Y. and Cui, M. X. and Wu, Q.",
  TITLE     = "{MuKEA}: Multimodal Knowledge Extraction and Accumulation for Knowledge-based Visual Question Answering",
  BOOKTITLE = CVPR22,
  YEAR      = "2022",
  PAGES     = "5079--5088",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT210866"
}

@inproceedings{bb215806,
  AUTHOR    = "Gao, F. and Ping, Q. and Thattai, G. and Reganti, A. and Wu, Y. N. and Natarajan, P.",
  TITLE     = "Transform-Retrieve-Generate: Natural Language-Centric Outside-Knowledge Visual Question Answering",
  BOOKTITLE = CVPR22,
  YEAR      = "2022",
  PAGES     = "5057--5067",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT210867"
}

@inproceedings{bb215807,
  AUTHOR    = "Aflalo, E. and Du, M. and Tseng, S. Y. and Liu, Y. F. and Wu, C. and Duan, N. and Lal, V.",
  TITLE     = "{VL-InterpreT}: An Interactive Visualization Tool for Interpreting Vision-Language Transformers",
  BOOKTITLE = CVPR22,
  YEAR      = "2022",
  PAGES     = "21374--21383",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT210868"
}

@inproceedings{bb215808,
  AUTHOR    = "Hu, X. W. and Gan, Z. and Wang, J. F. and Yang, Z. Y. and Liu, Z. C. and Lu, Y. and Wang, L. J.",
  TITLE     = "Scaling Up Vision-Language Pretraining for Image Captioning",
  BOOKTITLE = CVPR22,
  YEAR      = "2022",
  PAGES     = "17959--17968",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT210869"
}

@inproceedings{bb215809,
  AUTHOR    = "Zhang, P. C. and Li, X. J. and Hu, X. W. and Yang, J. W. and Zhang, L. and Wang, L. J. and Choi, Y. J. and Gao, J. F.",
  TITLE     = "{VinVL}: Revisiting Visual Representations in Vision-Language Models",
  BOOKTITLE = CVPR21,
  YEAR      = "2021",
  PAGES     = "5575--5584",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT210870"
}

@inproceedings{bb215810,
  AUTHOR    = "Li, Z. W. and Stengel Eskin, E. and Zhang, Y. X. and Xie, C. and Tran, Q. and van Durme, B. and Yuille, A. L.",
  TITLE     = "Calibrating Concepts and Operations: Towards Symbolic Reasoning on Real Images",
  BOOKTITLE = ICCV21,
  YEAR      = "2021",
  PAGES     = "14890--14899",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT210871"
}

@inproceedings{bb215811,
  AUTHOR    = "Yang, X. and Zhang, H. W. and Qi, G. J. and Cai, J. F.",
  TITLE     = "Causal Attention for Vision-Language Tasks",
  BOOKTITLE = CVPR21,
  YEAR      = "2021",
  PAGES     = "9842--9852",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT210872"
}

@inproceedings{bb215812,
  AUTHOR    = "Stefanini, M. and Cornia, M. and Baraldi, L. and Cucchiara, R.",
  TITLE     = "A Novel Attention-based Aggregation Function to Combine Vision and Language",
  BOOKTITLE = ICPR21,
  YEAR      = "2021",
  PAGES     = "1212--1219",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT210873"
}

@inproceedings{bb215813,
  AUTHOR    = "Jain, V. and Lodhavia, J.",
  TITLE     = "Automatic Question Tagging using k-Nearest Neighbors and Random Forest",
  BOOKTITLE = ISCV20,
  YEAR      = "2020",
  PAGES     = "1--4",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT210874"
}

@inproceedings{bb215814,
  AUTHOR    = "Zheng, W. B. and Yan, L. and Gou, C. and Wang, F. Y.",
  TITLE     = "Webly Supervised Knowledge Embedding Model for Visual Reasoning",
  BOOKTITLE = CVPR20,
  YEAR      = "2020",
  PAGES     = "12442--12451",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT210875"
}

@inproceedings{bb215815,
  AUTHOR    = "Nguyen, D. K. and Okatani, T.",
  TITLE     = "Multi-Task Learning of Hierarchical Vision-Language Representation",
  BOOKTITLE = CVPR19,
  YEAR      = "2019",
  PAGES     = "10484--10493",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT210876"
}

@inproceedings{bb215816,
  AUTHOR    = "Gupta, T. and Shih, K. J. and Singh, S. and Hoiem, D.",
  TITLE     = "Aligned Image-Word Representations Improve Inductive Transfer Across Vision-Language Tasks",
  BOOKTITLE = ICCV17,
  YEAR      = "2017",
  PAGES     = "4223--4232",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlm3.html#TT210877"
}

@article{bb215817,
  AUTHOR    = "Wu, Y. C. and Yang, J. C.",
  TITLE     = "A Robust Passage Retrieval Algorithm for Video Question Answering",
  JOURNAL   = CirSysVideo,
  VOLUME    = "18",
  YEAR      = "2008",
  NUMBER    = "10",
  MONTH     = oct,
  PAGES     = "1411--1421",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210878"
}

@inproceedings{bb215818,
  AUTHOR    = "Wu, Y. C. and Lee, Y. S. and Yang, J. C. and Yen, S. J.",
  TITLE     = "A New Passage Ranking Algorithm for Video Question Answering",
  BOOKTITLE = PSIVT06,
  YEAR      = "2006",
  PAGES     = "563--572",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210879"
}

@article{bb215819,
  AUTHOR    = "Li, G. D. and Li, H. J. and Ming, Z. Y. and Hong, R. C. and Tang, S. and Chua, T. S.",
  TITLE     = "Question Answering over Community-Contributed Web Videos",
  JOURNAL   = MultMedMag,
  VOLUME    = "17",
  YEAR      = "2010",
  NUMBER    = "4",
  MONTH     = oct,
  PAGES     = "46--57",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210880"
}

@inproceedings{bb215820,
  AUTHOR    = "Song, Y. C. and Li, H. J.",
  TITLE     = "Mash-Up Approach for Web Video Category Recommendation",
  BOOKTITLE = PSIVT10,
  YEAR      = "2010",
  PAGES     = "197--202",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210881"
}

@article{bb215821,
  AUTHOR    = "Guo, Z. Y. and Zhao, Z. and Jin, W. and Wei, Z. C. and Yang, M. and Wang, N. N. and Yuan, N. J.",
  TITLE     = "Multi-Turn Video Question Generation via Reinforced Multi-Choice Attention Network",
  JOURNAL   = CirSysVideo,
  VOLUME    = "31",
  YEAR      = "2021",
  NUMBER    = "5",
  PAGES     = "1697--1710",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210882"
}

@article{bb215822,
  AUTHOR    = "Xue, H. Y. and Chu, W. and Zhao, Z. and Cai, D.",
  TITLE     = "A Better Way to Attend: Attention With Trees for Video Question Answering",
  JOURNAL   = IP,
  VOLUME    = "27",
  YEAR      = "2018",
  NUMBER    = "11",
  MONTH     = nov,
  PAGES     = "5563--5574",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210883"
}

@article{bb215823,
  AUTHOR    = "Xue, H. Y. and Zhao, Z. and Cai, D.",
  TITLE     = "Unifying the Video and Question Attentions for Open-Ended Video Question Answering",
  JOURNAL   = IP,
  VOLUME    = "26",
  YEAR      = "2017",
  NUMBER    = "12",
  MONTH     = dec,
  PAGES     = "5656--5666",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210884"
}

@article{bb215824,
  AUTHOR    = "Zhao, Z. and Xiao, S. W. and Song, Z. and Lu, C. J. and Xiao, J. and Zhuang, Y. T.",
  TITLE     = "Open-Ended Video Question Answering via Multi-Modal Conditional Adversarial Networks",
  JOURNAL   = IP,
  VOLUME    = "29",
  YEAR      = "2020",
  PAGES     = "3859--3870",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210885"
}

@article{bb215825,
  AUTHOR    = "Zhao, Z. and Zhang, Z. and Xiao, S. W. and Xiao, Z. X. and Yan, X. H. and Yu, J. and Cai, D. and Wu, F.",
  TITLE     = "Long-Form Video Question Answering via Dynamic Hierarchical Reinforced Networks",
  JOURNAL   = IP,
  VOLUME    = "28",
  YEAR      = "2019",
  NUMBER    = "12",
  MONTH     = dec,
  PAGES     = "5939--5952",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210886"
}

@article{bb215826,
  AUTHOR    = "Yu, T. and Yu, J. and Yu, Z. and Huang, Q. M. and Tian, Q.",
  TITLE     = "Long-Term Video Question Answering via Multimodal Hierarchical Memory Attentive Networks",
  JOURNAL   = CirSysVideo,
  VOLUME    = "31",
  YEAR      = "2021",
  NUMBER    = "3",
  MONTH     = mar,
  PAGES     = "931--944",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210887"
}

@article{bb215827,
  AUTHOR    = "Jang, Y. and Song, Y. and Kim, C. D. and Yu, Y. and Kim, Y. and Kim, G.",
  TITLE     = "Video Question Answering with Spatio-Temporal Reasoning",
  JOURNAL   = IJCV,
  VOLUME    = "127",
  YEAR      = "2019",
  NUMBER    = "10",
  MONTH     = oct,
  PAGES     = "1385--1412",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210888"
}

@inproceedings{bb215828,
  AUTHOR    = "Jang, Y. and Song, Y. and Yu, Y. and Kim, Y. and Kim, G.",
  TITLE     = "{TGIF-QA}: Toward Spatio-Temporal Reasoning in Visual Question Answering",
  BOOKTITLE = CVPR17,
  YEAR      = "2017",
  PAGES     = "1359--1367",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210889"
}

@article{bb215829,
  AUTHOR    = "Yu, T. and Yu, J. and Yu, Z. and Tao, D.",
  TITLE     = "Compositional Attention Networks With Two-Stream Fusion for Video Question Answering",
  JOURNAL   = IP,
  VOLUME    = "29",
  YEAR      = "2020",
  PAGES     = "1204--1218",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210890"
}

@article{bb215830,
  AUTHOR    = "Wang, W. N. and Huang, Y. and Wang, L.",
  TITLE     = "Long video question answering: A Matching-guided Attention Model",
  JOURNAL   = PR,
  VOLUME    = "102",
  YEAR      = "2020",
  PAGES     = "107248",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210891"
}

@article{bb215831,
  AUTHOR    = "Zhang, W. and Tang, S. and Cao, Y. and Pu, S. and Wu, F. and Zhuang, Y.",
  TITLE     = "Frame Augmented Alternating Attention Network for Video Question Answering",
  JOURNAL   = MultMed,
  VOLUME    = "22",
  YEAR      = "2020",
  NUMBER    = "4",
  MONTH     = apr,
  PAGES     = "1032--1041",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210892"
}

@article{bb215832,
  AUTHOR    = "Chen, J. and Shao, J. and He, C.",
  TITLE     = "Movie fill in the blank by joint learning from video and text with adaptive temporal attention",
  JOURNAL   = PRL,
  VOLUME    = "132",
  YEAR      = "2020",
  PAGES     = "62--68",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210893"
}

@article{bb215833,
  AUTHOR    = "Wang, A. and Luu, A. T. and Foo, C. and Zhu, H. and Tay, Y. and Chandrasekhar, V.",
  TITLE     = "Holistic Multi-Modal Memory Network for Movie Question Answering",
  JOURNAL   = IP,
  VOLUME    = "29",
  YEAR      = "2020",
  NUMBER    = "1",
  PAGES     = "489--499",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210894"
}

@article{bb215834,
  AUTHOR    = "Yuan, Z. Q. and Sun, S. Y. and Duan, L. X. and Li, C. S. and Wu, X. and Xu, C. S.",
  TITLE     = "Adversarial Multimodal Network for Movie Story Question Answering",
  JOURNAL   = MultMed,
  VOLUME    = "23",
  YEAR      = "2021",
  PAGES     = "1744--1756",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210895"
}

@article{bb215835,
  AUTHOR    = "Gu, M. and Zhao, Z. and Jin, W. and Hong, R. and Wu, F.",
  TITLE     = "Graph-Based Multi-Interaction Network for Video Question Answering",
  JOURNAL   = IP,
  VOLUME    = "30",
  YEAR      = "2021",
  PAGES     = "2758--2770",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210896"
}

@article{bb215836,
  AUTHOR    = "Xie, Z. and Wu, K. W. and Zhang, X. Y. and Yang, X. M. and Hou, J. K.",
  TITLE     = "Learning continuous temporal embedding of videos using pattern theory",
  JOURNAL   = PRL,
  VOLUME    = "146",
  YEAR      = "2021",
  PAGES     = "222--229",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210897"
}

@article{bb215837,
  AUTHOR    = "Liu, Y. and Zhang, X. M. and Zhang, Q. Y. and Li, C. Z. and Huang, F. and Tang, X. H. and Li, Z. J.",
  TITLE     = "Dual self-attention with co-attention networks for visual question answering",
  JOURNAL   = PR,
  VOLUME    = "117",
  YEAR      = "2021",
  PAGES     = "107956",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210898"
}

@article{bb215838,
  AUTHOR    = "Liu, Y. and Zhang, X. M. and Huang, F. and Shen, S. X. and Tian, P. and Li, L. and Li, Z. J.",
  TITLE     = "Dynamic Self-Attention with Vision Synchronization Networks for Video Question Answering",
  JOURNAL   = PR,
  VOLUME    = "132",
  YEAR      = "2022",
  PAGES     = "108959",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210899"
}

@article{bb215839,
  AUTHOR    = "Liu, Y. and Zhang, X. M. and Huang, F. and Zhang, B. and Li, Z. J.",
  TITLE     = "Cross-Attentional Spatio-Temporal Semantic Graph Networks for Video Question Answering",
  JOURNAL   = IP,
  VOLUME    = "31",
  YEAR      = "2022",
  PAGES     = "1684--1696",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210900"
}

@article{bb215840,
  AUTHOR    = "Jin, W. and Zhao, Z. and Cao, X. C. and Zhu, J. M. and He, X. Q. and Zhuang, Y. T.",
  TITLE     = "Adaptive Spatio-Temporal Graph Enhanced Vision-Language Representation for Video {QA}",
  JOURNAL   = IP,
  VOLUME    = "30",
  YEAR      = "2021",
  PAGES     = "5477--5489",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210901"
}

@article{bb215841,
  AUTHOR    = "Gao, L. and Chen, T. M. and Li, X. P. and Zeng, P. P. and Zhao, L. and Li, Y. F.",
  TITLE     = "Generalized pyramid co-attention with learnable aggregation net for video question answering",
  JOURNAL   = PR,
  VOLUME    = "120",
  YEAR      = "2021",
  PAGES     = "108145",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210902"
}

@article{bb215842,
  AUTHOR    = "Le, T. M. and Le, V. and Venkatesh, S. and Tran, T.",
  TITLE     = "Hierarchical Conditional Relation Networks for Multimodal Video Question Answering",
  JOURNAL   = IJCV,
  VOLUME    = "129",
  YEAR      = "2021",
  NUMBER    = "11",
  MONTH     = nov,
  PAGES     = "3027--3050",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210903"
}

@inproceedings{bb215843,
  AUTHOR    = "Le, T. M. and Le, V. and Venkatesh, S. and Tran, T.",
  TITLE     = "Hierarchical Conditional Relation Networks for Video Question Answering",
  BOOKTITLE = CVPR20,
  YEAR      = "2020",
  PAGES     = "9969--9978",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210904"
}

@article{bb215844,
  AUTHOR    = "Su, H. T. and Chang, C. H. and Shen, P. W. and Wang, Y. S. and Chang, Y. L. and Chang, Y. C. and Cheng, P. J. and Hsu, W. H.",
  TITLE     = "End-to-End Video Question-Answer Generation With Generator-Pretester Network",
  JOURNAL   = CirSysVideo,
  VOLUME    = "31",
  YEAR      = "2021",
  NUMBER    = "11",
  MONTH     = nov,
  PAGES     = "4497--4507",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210905"
}

@article{bb215845,
  AUTHOR    = "Gao, L. L. and Lei, Y. and Zeng, P. P. and Song, J. K. and Wang, M. and Shen, H. T.",
  TITLE     = "Hierarchical Representation Network With Auxiliary Tasks for Video Captioning and Video Question Answering",
  JOURNAL   = IP,
  VOLUME    = "31",
  YEAR      = "2022",
  PAGES     = "202--215",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210906"
}

@article{bb215846,
  AUTHOR    = "Zhang, J. P. and Shao, J. and Cao, R. and Gao, L. L. and Xu, X. and Shen, H. T.",
  TITLE     = "Action-Centric Relation Transformer Network for Video Question Answering",
  JOURNAL   = CirSysVideo,
  VOLUME    = "32",
  YEAR      = "2022",
  NUMBER    = "1",
  MONTH     = jan,
  PAGES     = "63--74",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210907"
}

@article{bb215847,
  AUTHOR    = "Zhang, H. and Sun, A. and Jing, W. and Zhen, L. L. and Zhou, J. T. Y. and Goh, R. S. M.",
  TITLE     = "Natural Language Video Localization: A Revisit in Span-Based Question Answering Framework",
  JOURNAL   = PAMI,
  VOLUME    = "44",
  YEAR      = "2022",
  NUMBER    = "8",
  MONTH     = aug,
  PAGES     = "4252--4266",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210908"
}

@article{bb215848,
  AUTHOR    = "Wang, J. Y. and Bao, B. K. and Xu, C. S.",
  TITLE     = "{DualVGR}: A Dual-Visual Graph Reasoning Unit for Video Question Answering",
  JOURNAL   = MultMed,
  VOLUME    = "24",
  YEAR      = "2022",
  PAGES     = "3369--3380",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210909"
}

@article{bb215849,
  AUTHOR    = "Zeng, P. P. and Zhang, H. N. and Gao, L. and Song, J. K. and Shen, H. T.",
  TITLE     = "Video Question Answering With Prior Knowledge and Object-Sensitive Learning",
  JOURNAL   = IP,
  VOLUME    = "31",
  YEAR      = "2022",
  PAGES     = "5936--5948",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210910"
}

@article{bb215850,
  AUTHOR    = "Gan, Z. and Li, L. J. and Li, C. Y. and Wang, L. J. and Liu, Z. C. and Gao, J. F.",
  TITLE     = "Vision-Language Pre-Training: Basics, Recent Advances, and Future Trends",
  JOURNAL   = FTCGV,
  VOLUME    = "14",
  YEAR      = "2022",
  NUMBER    = "3--4",
  PAGES     = "163--352",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210911"
}

@article{bb215851,
  AUTHOR    = "Zhang, F. and Wang, R. and Zhou, F. and Luo, Y. M.",
  TITLE     = "{ERM}: Energy-Based Refined-Attention Mechanism for Video Question Answering",
  JOURNAL   = CirSysVideo,
  VOLUME    = "33",
  YEAR      = "2023",
  NUMBER    = "3",
  MONTH     = mar,
  PAGES     = "1454--1467",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210912"
}

% NOTE(review): bb215852 carried the placeholder page range "xx-yy" in the
% source data. It is omitted here so it is not printed literally; add the real
% article number or page range once it has been looked up.
@article{bb215852,
  AUTHOR    = "Yang, J. and Jang, H. and Yu, K.",
  TITLE     = "Analyzing Geographic Questions Using Embedding-based Topic Modeling",
  JOURNAL   = IJGI,
  VOLUME    = "12",
  YEAR      = "2023",
  NUMBER    = "2",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210913"
}

@inproceedings{bb215853,
  AUTHOR    = "Zhao, S. W. and Liu, Y. Y. and Du, S. and Tian, Z. Q. and Qu, T. and Xu, L. H.",
  TITLE     = "{CMFG}: Cross-model Fine-grained Feature Interaction for Text-video Retrieval",
  BOOKTITLE = MMMod23,
  YEAR      = "2023",
  PAGES     = "II: 435--445",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210914"
}

@article{bb215854,
  AUTHOR    = "Luo, H. N. and Lin, G. S. and Yao, Y. Z. and Liu, F. Y. and Liu, Z. C. and Tang, Z. M.",
  TITLE     = "Depth and Video Segmentation Based Visual Attention for Embodied Question Answering",
  JOURNAL   = PAMI,
  VOLUME    = "45",
  YEAR      = "2023",
  NUMBER    = "6",
  MONTH     = jun,
  PAGES     = "6807--6819",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210915"
}

@inproceedings{bb215855,
  AUTHOR    = "Luo, H. N. and Lin, G. S. and Liu, Z. C. and Liu, F. Y. and Tang, Z. M. and Yao, Y. Z.",
  TITLE     = "{SegEQA}: Video Segmentation Based Visual Attention for Embodied Question Answering",
  BOOKTITLE = ICCV19,
  YEAR      = "2019",
  PAGES     = "9666--9675",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210916"
}

@article{bb215856,
  AUTHOR    = "Zhang, X. and Zhang, F. F. and Xu, C. S.",
  TITLE     = "Reducing Vision-Answer Biases for Multiple-Choice {VQA}",
  JOURNAL   = IP,
  VOLUME    = "32",
  YEAR      = "2023",
  PAGES     = "4621--4634",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210917"
}

@article{bb215857,
  AUTHOR    = "Xiao, J. B. and Zhou, P. and Yao, A. and Li, Y. C. and Hong, R. C. and Yan, S. C. and Chua, T. S.",
  TITLE     = "Contrastive Video Question Answering via Video Graph Transformer",
  JOURNAL   = PAMI,
  VOLUME    = "45",
  YEAR      = "2023",
  NUMBER    = "11",
  MONTH     = nov,
  PAGES     = "13265--13280",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210918"
}

@inproceedings{bb215858,
  AUTHOR    = "Xiao, J. B. and Zhou, P. and Chua, T. S. and Yan, S. C.",
  TITLE     = "Video Graph Transformer for Video Question Answering",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "XXXVI:39--58",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210919"
}

@article{bb215859,
  AUTHOR    = "Shen, W. X. and Song, J. K. and Zhu, X. S. and Li, G. F. and Shen, H. T.",
  TITLE     = "End-to-End Pre-Training With Hierarchical Matching and Momentum Contrast for Text-Video Retrieval",
  JOURNAL   = IP,
  VOLUME    = "32",
  YEAR      = "2023",
  PAGES     = "5017--5030",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210920"
}

@article{bb215860,
  AUTHOR    = "Jiang, J. J. and Liu, Z. and Zheng, N. N.",
  TITLE     = "{LiVLR}: A Lightweight Visual-Linguistic Reasoning Framework for Video Question Answering",
  JOURNAL   = MultMed,
  VOLUME    = "25",
  YEAR      = "2023",
  PAGES     = "5002--5013",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210921"
}

@article{bb215861,
  AUTHOR    = "Xu, F. F. and Zhu, Y. and Wang, C. and Cao, Y. Z. and Zhong, Z. and Li, X. M.",
  TITLE     = "Spatio-Temporal Two-stage Fusion for video question answering",
  JOURNAL   = CVIU,
  VOLUME    = "237",
  YEAR      = "2023",
  PAGES     = "103821",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210922"
}

@article{bb215862,
  AUTHOR    = "Wang, Y. Y. and Liu, M. and Wu, J. L. and Nie, L. Q.",
  TITLE     = "Multi-Granularity Interaction and Integration Network for Video Question Answering",
  JOURNAL   = CirSysVideo,
  VOLUME    = "33",
  YEAR      = "2023",
  NUMBER    = "12",
  MONTH     = dec,
  PAGES     = "7684--7695",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210923"
}

@article{bb215863,
  AUTHOR    = "Bai, Z. and Wang, R. P. and Gao, D. F. and Chen, X. L.",
  TITLE     = "Event Graph Guided Compositional Spatial-Temporal Reasoning for Video Question Answering",
  JOURNAL   = IP,
  VOLUME    = "33",
  YEAR      = "2024",
  PAGES     = "1109--1121",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210924"
}

@article{bb215864,
  AUTHOR    = "Qian, T. W. and Cui, R. and Chen, J. J. and Peng, P. and Guo, X. W. and Jiang, Y. G.",
  TITLE     = "Locate Before Answering: Answer Guided Question Localization for Video Question Answering",
  JOURNAL   = MultMed,
  VOLUME    = "26",
  YEAR      = "2024",
  PAGES     = "4554--4563",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210925"
}

@article{bb215865,
  AUTHOR    = "Cheng, Y. and Fan, H. and Lin, D. Y. and Sun, Y. and Kankanhalli, M. and Lim, J. H.",
  TITLE     = "Keyword-Aware Relative Spatio-Temporal Graph Networks for Video Question Answering",
  JOURNAL   = MultMed,
  VOLUME    = "26",
  YEAR      = "2024",
  PAGES     = "6131--6141",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210926"
}

@article{bb215866,
  AUTHOR    = "Jiang, Y. M. and Yan, T. and Yao, M. Z. and Wang, H. and Liu, W. Z.",
  TITLE     = "Cascade transformers with dynamic attention for video question answering",
  JOURNAL   = CVIU,
  VOLUME    = "242",
  YEAR      = "2024",
  PAGES     = "103983",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210927"
}

@article{bb215867,
  AUTHOR    = "Yu, T. and Fu, K. and Zhang, J. and Huang, Q. M. and Yu, J.",
  TITLE     = "Multi-Granularity Contrastive Cross-Modal Collaborative Generation for End-to-End Long-Term Video Question Answering",
  JOURNAL   = IP,
  VOLUME    = "33",
  YEAR      = "2024",
  PAGES     = "3115--3129",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210928"
}

@article{bb215868,
  AUTHOR    = "Liu, J. and Wang, G. X. and Xie, J. L. and Zhou, F. Y. and Xu, H. J.",
  TITLE     = "Video Question Answering with Semantic Disentanglement and Reasoning",
  JOURNAL   = CirSysVideo,
  VOLUME    = "34",
  YEAR      = "2024",
  NUMBER    = "5",
  MONTH     = may,
  PAGES     = "3663--3673",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210929"
}

@article{bb215869,
  AUTHOR    = "Nie, J. and Wang, X. and Hou, R. and Li, G. H. and Chen, H. and Zhu, W. W.",
  TITLE     = "Dynamic Spatio-Temporal Graph Reasoning for {VideoQA} With Self-Supervised Event Recognition",
  JOURNAL   = IP,
  VOLUME    = "33",
  YEAR      = "2024",
  PAGES     = "4145--4158",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210930"
}

@inproceedings{bb215870,
  AUTHOR    = "Inoue, Y. and Yada, Y. and Tanahashi, K. and Yamaguchi, Y.",
  TITLE     = "{NuScenes-MQA}: Integrated Evaluation of Captions and {QA} for Autonomous Driving Datasets using Markup Annotations",
  BOOKTITLE = LLVMCrive24,
  YEAR      = "2024",
  PAGES     = "930--938",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210931"
}

@inproceedings{bb215871,
  AUTHOR    = "Park, S. Y. and Lee, M. J. and Kang, J. H. and Choi, H. and Park, Y. and Cho, J. and Lee, A. and Kim, D. K.",
  TITLE     = "{VLAAD}: Vision and Language Assistant for Autonomous Driving",
  BOOKTITLE = LLVMCrive24,
  YEAR      = "2024",
  PAGES     = "980--987",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210932"
}

@inproceedings{bb215872,
  AUTHOR    = "Fang, J. Z. Y. and Zheng, S. and Sharma, V. and Piramuthu, R.",
  TITLE     = "{epsilon-ViLM}: Efficient Video-Language Model via Masked Video Modeling with Semantic Vector-Quantized Tokenizer",
  BOOKTITLE = Pretrain24,
  YEAR      = "2024",
  PAGES     = "529--540",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210933"
}

@inproceedings{bb215873,
  AUTHOR    = "Zonneveld, A. and Gatt, A. and Calixto, I.",
  TITLE     = "Video-and-Language ({VidL}) models and their cognitive relevance",
  BOOKTITLE = MMFM23,
  YEAR      = "2023",
  PAGES     = "325--338",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210934"
}

@inproceedings{bb215874,
  AUTHOR    = "Momeni, L. and Caron, M. and Nagrani, A. and Zisserman, A. and Schmid, C.",
  TITLE     = "Verbs in Action: Improving verb understanding in video-language models",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "15533--15545",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210935"
}

@inproceedings{bb215875,
  AUTHOR    = "Jin, P. and Li, H. and Cheng, Z. and Li, K. and Ji, X. Y. and Liu, C. and Yuan, L. and Chen, J.",
  TITLE     = "{DiffusionRet}: Generative Text-Video Retrieval with Diffusion Model",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "2470--2481",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210936"
}

@inproceedings{bb215876,
  AUTHOR    = "Li, P. D. and Xie, C. W. and Zhao, L. M. and Xie, H. T. and Ge, J. N. and Zheng, Y. and Zhao, D. L. and Zhang, Y. D.",
  TITLE     = "Progressive Spatio-Temporal Prototype Matching for Text-Video Retrieval",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "4077--4087",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210937"
}

@inproceedings{bb215877,
  AUTHOR    = "Guan, P. Y. and Pei, R. J. and Shao, B. and Liu, J. Z. and Li, W. and Gu, J. X. and Xu, H. and Xu, S. C. and Yan, Y. and Lam, E. Y.",
  TITLE     = "{PIDRo}: Parallel Isomeric Attention with Dynamic Routing for Text-Video Retrieval",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "11130--11139",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210938"
}

@inproceedings{bb215878,
  AUTHOR    = "Deng, C. R. and Chen, Q. and Qin, P. and Chen, D. and Wu, Q.",
  TITLE     = "Prompt Switch: Efficient {CLIP} Adaptation for Text-Video Retrieval",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "15602--15612",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210939"
}

@inproceedings{bb215879,
  AUTHOR    = "Pirhadi, M. J. and Mirzaei, M. and Eetemadi, S.",
  TITLE     = "Just Ask Plus: Using Transcripts for {VideoQA}",
  BOOKTITLE = ASI23,
  YEAR      = "2023",
  PAGES     = "3074--3077",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210940"
}

@inproceedings{bb215880,
  AUTHOR    = "Ahmad, M. and Park, G. and Park, D. and Park, S.",
  TITLE     = "{MMTF}: Multi-Modal Temporal Fusion for Commonsense Video Question Answering",
  BOOKTITLE = VLAR23,
  YEAR      = "2023",
  PAGES     = "4659--4664",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210941"
}

@inproceedings{bb215881,
  AUTHOR    = "Engin, D. and Avrithis, Y.",
  TITLE     = "Zero-Shot and Few-Shot Video Question Answering with Multi-Modal Prompts",
  BOOKTITLE = CLVL23,
  YEAR      = "2023",
  PAGES     = "2797--2802",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210942"
}

@inproceedings{bb215882,
  AUTHOR    = "Nuthalapati, S. V. and Tunga, A.",
  TITLE     = "Coarse to Fine Frame Selection for Online Open-ended Video Question Answering",
  BOOKTITLE = MMFM23,
  YEAR      = "2023",
  PAGES     = "353--361",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210943"
}

@inproceedings{bb215883,
  AUTHOR    = "Li, Y. C. and Xiao, J. B. and Feng, C. and Wang, X. and Chua, T. S.",
  TITLE     = "Discovering Spatio-Temporal Rationales for Video Question Answering",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "13823--13832",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210944"
}

@inproceedings{bb215884,
  AUTHOR    = "Ko, D. and Lee, J. S. and Choi, M. and Chu, J. W. and Park, J. and Kim, H. W. J.",
  TITLE     = "Open-Vocabulary Video Question Answering: A New Benchmark for Evaluating the Generalizability of Video Question Answering Models",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "3078--3089",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210945"
}

@inproceedings{bb215885,
  AUTHOR    = "Li, J. and Niu, L. and Zhang, L. Q.",
  TITLE     = "Knowledge Proxy Intervention for Deconfounded Video Question Answering",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "2770--2781",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210946"
}

@inproceedings{bb215886,
  AUTHOR    = "Chen, G. Y. and Liu, X. and Wang, G. and Zhang, K. and Torr, P. H. S. and Zhang, X. P. and Tang, Y. S.",
  TITLE     = "Tem-adapter: Adapting Image-Text Pretraining for Video Question Answer",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "13899--13909",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210947"
}

@inproceedings{bb215887,
  AUTHOR    = "Pan, J. T. and Lin, Z. and Ge, Y. Y. and Zhu, X. T. and Zhang, R. and Wang, Y. and Qiao, Y. and Li, H. S.",
  TITLE     = "Retrieving-to-Answer: Zero-Shot Video Question Answering with Frozen Large Language Models",
  BOOKTITLE = MMFM23,
  YEAR      = "2023",
  PAGES     = "272--283",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210948"
}

@inproceedings{bb215888,
  AUTHOR    = "Jahagirdar, S. and Mathew, M. and Karatzas, D. and Jawahar, C. V.",
  TITLE     = "Understanding Video Scenes through Text: Insights from Text-based Video Question Answering",
  BOOKTITLE = VLAR23,
  YEAR      = "2023",
  PAGES     = "4648--4652",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210949"
}

@inproceedings{bb215889,
  AUTHOR    = "Peng, M. and Liu, L. C. and Li, Z. H. and Shi, Y. and Zhou, X. D.",
  TITLE     = "Multi-Semantic Alignment Co-Reasoning Network for Video Question Answering",
  BOOKTITLE = ICIP23,
  YEAR      = "2023",
  PAGES     = "2090--2094",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210950"
}

@inproceedings{bb215890,
  AUTHOR    = "Ye, S. H. and Kong, W. and Yao, C. and Ren, J. F. and Jiang, X. D.",
  TITLE     = "Video Question Answering Using Clip-Guided Visual-Text Attention",
  BOOKTITLE = ICIP23,
  YEAR      = "2023",
  PAGES     = "81--85",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210951"
}

@inproceedings{bb215891,
  AUTHOR    = "Khan, Z. and Kumar, B. V. and Schulter, S. and Yu, X. and Fu, Y. and Chandraker, M.",
  TITLE     = "Q: How to Specialize Large Vision-Language Models to Data-Scarce {VQA} Tasks? {A}: Self-Train on Unlabeled Images!",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "15005--15015",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210952"
}

@inproceedings{bb215892,
  AUTHOR    = "Su, H. T. and Niu, Y. and Lin, X. D. and Hsu, W. H. and Chang, S. F.",
  TITLE     = "Language Models are Causal Knowledge Extractors for Zero-shot Video Question Answering",
  BOOKTITLE = L3D-IVU23,
  YEAR      = "2023",
  PAGES     = "4951--4960",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210953"
}

@inproceedings{bb215893,
  AUTHOR    = "Zang, C. Q. and Wang, H. Q. and Pei, M. T. and Liang, W.",
  TITLE     = "Discovering the Real Association: Multimodal Causal Reasoning in Video Question Answering",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "19027--19036",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210954"
}

@inproceedings{bb215894,
  AUTHOR    = "Gao, D. F. and Zhou, L. and Ji, L. and Zhu, L. C. and Yang, Y. and Shou, M. Z.",
  TITLE     = "{MIST}: Multi-modal Iterative Spatial-Temporal Transformer for Long-form Video Question Answering",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "14773--14783",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210955"
}

@inproceedings{bb215895,
  AUTHOR    = "Khan, A. U. and Kuehne, H. and Wu, B. and Chheu, K. and Bousselham, W. and Gan, C. and Lobo, N. and Shah, M.",
  TITLE     = "Learning Situation Hyper-Graphs for Video Question Answering",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "14879--14889",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210956"
}

@inproceedings{bb215896,
  AUTHOR    = "Jahagirdar, S. and Mathew, M. and Karatzas, D. and Jawahar, C. V.",
  TITLE     = "Watching the News: Towards {VideoQA} Models that can Read",
  BOOKTITLE = WACV23,
  YEAR      = "2023",
  PAGES     = "4430--4439",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210957"
}

@inproceedings{bb215897,
  AUTHOR    = "Zhang, M. and Hwa, R. and Kovashka, A.",
  TITLE     = "How to Practice {VQA} on a Resource-limited Target Domain",
  BOOKTITLE = WACV23,
  YEAR      = "2023",
  PAGES     = "4440--4449",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210958"
}

@inproceedings{bb215898,
  AUTHOR    = "Lee, J. and Kang, W. and Kim, E. S.",
  TITLE     = "Dense but Efficient {VideoQA} for Intricate Compositional Reasoning",
  BOOKTITLE = WACV23,
  YEAR      = "2023",
  PAGES     = "1114--1123",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210959"
}

@inproceedings{bb215899,
  AUTHOR    = "Shen, R. and Inoue, N. and Shinoda, K.",
  TITLE     = "Text-Guided Object Detector for Multi-modal Video Question Answering",
  BOOKTITLE = WACV23,
  YEAR      = "2023",
  PAGES     = "1032--1042",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT210960"
}