@article{bb234800,
        AUTHOR = "Wang, X. and Wu, J.L. and Lin, Z. and Zhang, F.Z. and Zhang, D. and Nie, L.Q.",
        TITLE = "Video DataFlywheel: Resolving the Impossible Data Trinity in
Video-Language Understanding",
        JOURNAL = PAMI,
        VOLUME = "47",
        YEAR = "2025",
        NUMBER = "4",
        MONTH = apr,
        PAGES = "2912--2923",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT229774"}

@article{bb234801,
        AUTHOR = "Shen, R. and Inoue, N. and Guan, D. and Cai, R. and Kot, A.C. and Shinoda, K.",
        TITLE = "ContextualCoder: Adaptive In-Context Prompting for Programmatic
Visual Question Answering",
        JOURNAL = MultMed,
        VOLUME = "27",
        YEAR = "2025",
        PAGES = "4936--4949",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT229775"}

@inproceedings{bb234802,
        AUTHOR = "Shen, R. and Inoue, N. and Shinoda, K.",
        TITLE = "Pyramid Coder: Hierarchical Code Generator for Compositional Visual
Question Answering",
        BOOKTITLE = ICIP24,
        YEAR = "2024",
        PAGES = "430--436",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT229776"}

@inproceedings{bb234803,
        AUTHOR = "Panagopoulou, A. and Zhou, H.L. and Savarese, S. and Xiong, C.M. and Callison-Burch, C. and Yatskar, M. and Niebles, J.C.",
        TITLE = "ViUniT: Visual Unit Tests for More Robust Visual Programming",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "24646--24656",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT229777"}

@inproceedings{bb234804,
        AUTHOR = "Wang, W.Z. and Duan, C. and Peng, Z.H. and Liu, Y.X. and Zhou, B.",
        TITLE = "Embodied Scene Understanding for Vision Language Models via MetaVQA",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "22453--22464",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT229778"}

@inproceedings{bb234805,
        AUTHOR = "Tian, X.Y. and Zou, S. and Yang, Z.Y. and Zhang, J.",
        TITLE = "Identifying and Mitigating Position Bias of Multi-image
Vision-Language Models",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "10599--10609",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT229779"}

@inproceedings{bb234806,
        AUTHOR = "Sheng, L.J. and Liang, J. and Wang, Z. and He, R.",
        TITLE = "R-TPT: Improving Adversarial Robustness of Vision-Language Models
through Test-Time Prompt Tuning",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "29958--29967",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT229780"}

@inproceedings{bb234807,
        AUTHOR = "Das, D. and Talon, D. and Mancini, M. and Wang, Y.M. and Ricci, E.",
        TITLE = "One VLM to Keep it Learning: Generation and Balancing for Data-free
Continual Visual Question Answering",
        BOOKTITLE = WACV25,
        YEAR = "2025",
        PAGES = "5635--5645",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT229781"}

@inproceedings{bb234808,
        AUTHOR = "Ishmam, M.F. and Tashdeed, I. and Saadat, T.A. and Ashmafee, M.H. and Kamal, A.R.M. and Hossain, M.A.",
        TITLE = "Visual Robustness Benchmark for Visual Question Answering (VQA)",
        BOOKTITLE = WACV25,
        YEAR = "2025",
        PAGES = "6623--6633",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT229782"}

@inproceedings{bb234809,
        AUTHOR = "Chen, X. and Djolonga, J. and Padlewski, P. and Mustafa, B. and Changpinyo, S. and Wu, J.L. and Ruiz, C.R. and Goodman, S. and Wang, X. and Tay, Y. and Shakeri, S. and Dehghani, M. and Salz, D. and Lucic, M. and Tschannen, M. and Nagrani, A. and Hu, H. and Joshi, M. and Pang, B. and Montgomery, C. and Pietrzyk, P. and Ritter, M. and Piergiovanni, A. and Minderer, M. and Pavetic, F. and Waters, A. and Li, G. and Alabdulmohsin, I. and Beyer, L. and Amelot, J. and Lee, K. and Steiner, A.P. and Li, Y. and Keysers, D. and Arnab, A. and Xu, Y.Z. and Rong, K. and Kolesnikov, A. and Seyedhosseini, M. and Angelova, A. and Zhai, X.H. and Houlsby, N. and Soricut, R.",
        TITLE = "On Scaling Up a Multilingual Vision and Language Model",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14432--14444",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT229783"}

@inproceedings{bb234810,
        AUTHOR = "Li, R.J. and Wu, Y. and He, X.M.",
        TITLE = "Learning by Correction: Efficient Tuning Task for Zero-Shot
Generative Vision-Language Reasoning",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13428--13437",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT229784"}

@inproceedings{bb234811,
        AUTHOR = "Khan, Z. and Fu, Y.",
        TITLE = "Consistency and Uncertainty: Identifying Unreliable Responses From
Black-Box Vision-Language Models for Selective Visual Question
Answering",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "10854--10863",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT229785"}

@inproceedings{bb234812,
        AUTHOR = "Gu, T.C. and Yang, K.C. and Liu, D. and Cai, W.D.",
        TITLE = "LaPA: Latent Prompt Assist Model for Medical Visual Question
Answering",
        BOOKTITLE = DEF-AI-MIA24,
        YEAR = "2024",
        PAGES = "4971--4980",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT229786"}

@inproceedings{bb234813,
        AUTHOR = "Feinglass, J. and Yang, Y.Z.",
        TITLE = "Towards Addressing the Misalignment of Object Proposal Evaluation for
Vision-Language Tasks via Semantic Grounding",
        BOOKTITLE = WACV24,
        YEAR = "2024",
        PAGES = "4385--4395",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT229787"}

@inproceedings{bb234814,
        AUTHOR = "Nadeem, A. and Hilton, A. and Dawes, R. and Thomas, G. and Mustafa, A.",
        TITLE = "CAD: Contextual Multi-modal Alignment for Dynamic AVQA",
        BOOKTITLE = WACV24,
        YEAR = "2024",
        PAGES = "7236--7248",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT229788"}

@inproceedings{bb234815,
        AUTHOR = "Wu, W. and Li, Q. and Zhong, W.L. and Huang, J.Z.",
        TITLE = "MIVC: Multiple Instance Visual Component for Visual-Language Models",
        BOOKTITLE = WACV24,
        YEAR = "2024",
        PAGES = "8102--8111",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT229789"}

@inproceedings{bb234816,
        AUTHOR = "Walmer, M. and Sikka, K. and Sur, I. and Shrivastava, A. and Jha, S.",
        TITLE = "Dual-Key Multimodal Backdoors for Visual Question Answering",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "15354--15364",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT229790"}

@inproceedings{bb234817,
        AUTHOR = "Ding, Y. and Yu, J. and Liu, B. and Hu, Y. and Cui, M.X. and Wu, Q.",
        TITLE = "MuKEA: Multimodal Knowledge Extraction and Accumulation for
Knowledge-based Visual Question Answering",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "5079--5088",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT229791"}

@inproceedings{bb234818,
        AUTHOR = "Gao, F. and Ping, Q. and Thattai, G. and Reganti, A. and Wu, Y.N. and Natarajan, P.",
        TITLE = "Transform-Retrieve-Generate: Natural Language-Centric
Outside-Knowledge Visual Question Answering",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "5057--5067",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT229792"}

@inproceedings{bb234819,
        AUTHOR = "Aflalo, E. and Du, M. and Tseng, S.Y. and Liu, Y.F. and Wu, C. and Duan, N. and Lal, V.",
        TITLE = "VL-InterpreT: An Interactive Visualization Tool for Interpreting
Vision-Language Transformers",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "21374--21383",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT229793"}

@inproceedings{bb234820,
        AUTHOR = "Jain, V. and Lodhavia, J.",
        TITLE = "Automatic Question Tagging using k-Nearest Neighbors and Random
Forest",
        BOOKTITLE = ISCV20,
        YEAR = "2020",
        PAGES = "1--4",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vlmvqa4.html#TT229794"}

@article{bb234821,
        AUTHOR = "Ye, Q. and Yu, Z.T. and Shao, R. and Cui, Y.W. and Kang, X. and Liu, X. and Torr, P. and Cao, X.C.",
        TITLE = "CAT+: Investigating and Enhancing Audio-Visual Understanding in Large
Language Models",
        JOURNAL = PAMI,
        VOLUME = "47",
        YEAR = "2025",
        NUMBER = "10",
        MONTH = oct,
        PAGES = "8674--8690",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229795"}

@inproceedings{bb234822,
        AUTHOR = "Yu, T.Y. and Zhang, H. and Li, Q.M. and Xu, Q.X. and Yao, Y. and Chen, D. and Lu, X.M. and Cui, G. and Dang, Y.K. and He, T. and Feng, X.C. and Song, J. and Zheng, B. and Liu, Z.Y. and Chua, T.S. and Sun, M.S.",
        TITLE = "RLAIF-V: Open-Source AI Feedback Leads to Super GPT-4V
Trustworthiness",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "19985--19995",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229796"}

@inproceedings{bb234823,
        AUTHOR = "Liang, J. and Huang, W.K. and Wan, G.C. and Yang, Q. and Ye, M.",
        TITLE = "LoRASculpt: Sculpting LoRA for Harmonizing General and Specialized
Knowledge in Multimodal Large Language Models",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "26170--26180",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229797"}

@inproceedings{bb234824,
        AUTHOR = "Cao, Y. and Xing, Y. and Zhang, J. and Lin, D. and Zhang, T.W. and Tsang, I. and Liu, Y. and Guo, Q.",
        TITLE = "SceneTAP: Scene-Coherent Typographic Adversarial Planner against
Vision-Language Models in Real-World Environments",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "25050--25059",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229798"}

@inproceedings{bb234825,
        AUTHOR = "Wang, Y.B. and Guan, J. and Liang, J. and He, R.",
        TITLE = "Do We Really Need Curated Malicious Data for Safety Alignment in
Multi-modal Large Language Models?",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "19879--19889",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229799"}

@inproceedings{bb234826,
        AUTHOR = "Peng, R. and He, H.Y. and Wei, Y. and Wen, Y.D. and Hu, D.",
        TITLE = "Patch Matters: Training-free Fine-grained Image Caption Enhancement via
Local Perception",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "3963--3973",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229800"}

@inproceedings{bb234827,
        AUTHOR = "Yang, Z. and Luo, X. and Han, D.Q. and Xu, Y.J. and Li, D.S.",
        TITLE = "Mitigating Hallucinations in Large Vision-Language Models via DPO:
On-Policy Data Hold the Key",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "10610--10620",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229801"}

@inproceedings{bb234828,
        AUTHOR = "Bae, K. and Kim, J. and Lee, S. and Lee, S. and Lee, G. and Choi, J.",
        TITLE = "MASH-VLM: Mitigating Action-Scene Hallucination in Video-LLMs through
Disentangled Spatial-Temporal Representations",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "13744--13753",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229802"}

@inproceedings{bb234829,
        AUTHOR = "Yin, H. and Si, G.Z. and Wang, Z.",
        TITLE = "ClearSight: Visual Signal Enhancement for Object Hallucination
Mitigation in Multimodal Large Language Models",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "14625--14634",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229803"}

@inproceedings{bb234830,
        AUTHOR = "Yang, L. and Zheng, Z.W. and Chen, B. and Zhao, Z.Y. and Lin, C.H. and Shen, C.",
        TITLE = "Nullu: Mitigating Object Hallucinations in Large Vision-Language
Models via HalluSpace Projection",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "14635--14645",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229804"}

@inproceedings{bb234831,
        AUTHOR = "Wu, Y.C. and Zhang, L. and Yao, H. and Du, J.L. and Yan, K. and Ding, S.H. and Wu, Y.S. and Li, X.Q.",
        TITLE = "Antidote: A Unified Framework for Mitigating LVLM Hallucinations in
Counterfactual Presupposition and Object Perception",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "14646--14656",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229805"}

@inproceedings{bb234832,
        AUTHOR = "Tu, Y. and Hu, R. and Sang, J.",
        TITLE = "ODE: Open-Set Evaluation of Hallucinations in Multimodal Large
Language Models",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "19836--19845",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229806"}

@inproceedings{bb234833,
        AUTHOR = "Liu, J.Z. and Fu, Y.H. and Xie, R. and Xie, R. and Sun, X. and Lian, F.Z. and Kang, Z. and Li, X.R.",
        TITLE = "PhD: A ChatGPT-Prompted Visual hallucination Evaluation Dataset",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "19857--19866",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229807"}

@inproceedings{bb234834,
        AUTHOR = "Jiang, Z.Q. and Chen, J.K. and Zhu, B. and Luo, T.J. and Shen, Y.K. and Yang, X.",
        TITLE = "Devils in Middle Layers of Large Vision-Language Models:
Interpreting, Detecting and Mitigating Object Hallucinations via
Attention Lens",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "25004--25014",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229808"}

@inproceedings{bb234835,
        AUTHOR = "Park, E. and Kim, M. and Kim, G.",
        TITLE = "HalLoc: Token-level Localization of Hallucinations for Vision
Language Models",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "29893--29903",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229809"}

@inproceedings{bb234836,
        AUTHOR = "Suo, W. and Zhang, L.J. and Sun, M.Y. and Wu, L.Y.B. and Wang, P. and Zhang, Y.N.",
        TITLE = "Octopus: Alleviating Hallucination via Dynamic Contrastive Decoding",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "29904--29914",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229810"}

@inproceedings{bb234837,
        AUTHOR = "An, W.B. and Tian, F. and Leng, S. and Nie, J.H. and Lin, H. and Wang, Q.Y. and Chen, P. and Zhang, X.Q. and Lu, S.J.",
        TITLE = "Mitigating Object Hallucinations in Large Vision-Language Models with
Assembly of Global and Local Attention",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "29915--29926",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229811"}

@inproceedings{bb234838,
        AUTHOR = "Zhuang, X.W. and Zhu, Z.H. and Xie, Y.X. and Liang, L.M. and Zou, Y.X.",
        TITLE = "VASparse: Towards Efficient Visual Hallucination Mitigation via
Visual-Aware Token Sparsification",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "4189--4199",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229812"}

@inproceedings{bb234839,
        AUTHOR = "Basak, D. and Bhatt, S. and Kanduri, S. and Desarkar, M.S.",
        TITLE = "Aerial Mirage: Unmasking Hallucinations in Large Vision Language
Models",
        BOOKTITLE = WACV25,
        YEAR = "2025",
        PAGES = "5500--5508",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229813"}

@inproceedings{bb234840,
        AUTHOR = "Tang, F.L. and Liu, C.Z. and Xu, Z.X. and Hu, M. and Huang, Z. and Xue, H.C. and Chen, Z.Y. and Peng, Z.L. and Yang, Z.W. and Zhou, S.J. and Li, W.X. and Li, Y.L. and Song, W.X. and Su, S.Y. and Feng, W. and Su, J. and Lin, M. and Peng, Y.F. and Cheng, X.L. and Razzak, I. and Ge, Z.Y.",
        TITLE = "Seeing Far and Clearly: Mitigating Hallucinations in MLLMs with
Attention Causal Decoding",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "26147--26159",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229814"}

@inproceedings{bb234841,
        AUTHOR = "Yang, J.N. and Chen, X. and Madaan, N. and Iyengar, M. and Qian, S. and Fouhey, D.F. and Chai, J.",
        TITLE = "3D-GRAND: A Million-Scale Dataset for 3D-LLMs with Better Grounding
and Less Hallucination",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "29501--29512",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229815"}

@inproceedings{bb234842,
        AUTHOR = "Yoon, D. and Song, Y. and Park, W.",
        TITLE = "Stop learning it all to mitigate visual hallucination, Focus on the
hallucination target",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "4200--4208",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229816"}

@inproceedings{bb234843,
        AUTHOR = "Chen, J.Z. and Zhang, T.S. and Huang, S.Y. and Niu, Y.W. and Zhang, L.F. and Wen, L.J. and Hu, X.M.",
        TITLE = "ICT: Image-Object Cross-Level Trusted Intervention for Mitigating
Object Hallucination in Large Vision-Language Models",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "4209--4221",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229817"}

@inproceedings{bb234844,
        AUTHOR = "Huang, P.H. and Li, J.L. and Chen, C.P. and Chang, M.C. and Chen, W.C.",
        TITLE = "Who Brings the Frisbee: Probing Hidden Hallucination Factors in Large
Vision-Language Model via Causality Analysis",
        BOOKTITLE = WACV25,
        YEAR = "2025",
        PAGES = "6125--6135",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229818"}

@inproceedings{bb234845,
        AUTHOR = "Liu, S. and Zheng, K. and Chen, W.",
        TITLE = "Paying More Attention to Image: A Training-free Method for Alleviating
Hallucination in LVLMs",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "LXXXIII: 125--140",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229819"}

@inproceedings{bb234846,
        AUTHOR = "Zhang, J. and Wang, T. and Zhang, H.G. and Lu, P. and Zheng, F.",
        TITLE = "Reflective Instruction Tuning: Mitigating Hallucinations in Large
Vision-language Models",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "LXVIII: 196--213",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229820"}

@inproceedings{bb234847,
        AUTHOR = "Kaul, P. and Li, Z.Z. and Yang, H. and Dukler, Y. and Swaminathan, A. and Taylor, C.J. and Soatto, S.",
        TITLE = "THRONE: An Object-Based Hallucination Benchmark for the Free-Form
Generations of Large Vision-Language Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "27218--27228",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229821"}

@inproceedings{bb234848,
        AUTHOR = "Jiang, C.Y. and Xu, H.Y. and Dong, M.F. and Chen, J.X. and Ye, W. and Yan, M. and Ye, Q.H. and Zhang, J. and Huang, F. and Zhang, S.K.",
        TITLE = "Hallucination Augmented Contrastive Learning for Multimodal Large
Language Model",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "27026--27036",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229822"}

@inproceedings{bb234849,
        AUTHOR = "Huang, Q.D. and Dong, X.Y. and Zhang, P. and Wang, B. and He, C.H. and Wang, J.Q. and Lin, D. and Zhang, W.M. and Yu, N.H.",
        TITLE = "OPERA: Alleviating Hallucination in Multi-Modal Large Language Models
via Over-Trust Penalty and Retrospection-Allocation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13418--13427",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229823"}

@inproceedings{bb234850,
        AUTHOR = "Yu, Q.F. and Li, J.C. and Wei, L.H. and Pang, L. and Ye, W.T. and Qin, B.S. and Tang, S.L. and Tian, Q. and Zhuang, Y.T.",
        TITLE = "HalluciDoctor: Mitigating Hallucinatory Toxicity in Visual
Instruction Data",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "12944--12953",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229824"}

@inproceedings{bb234851,
        AUTHOR = "Favero, A. and Zancato, L. and Trager, M. and Choudhary, S. and Perera, P. and Achille, A. and Swaminathan, A. and Soatto, S.",
        TITLE = "Multi-Modal Hallucination Control by Visual Information Grounding",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14303--14312",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229825"}

@inproceedings{bb234852,
        AUTHOR = "Ouali, Y. and Bulat, A. and Martinez, B. and Tzimiropoulos, G.",
        TITLE = "CLIP-DPO: Vision-language Models as a Source of Preference for Fixing
Hallucinations in LVLMs",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "LXXVI: 395--413",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229826"}

@inproceedings{bb234853,
        AUTHOR = "Ye-Bin, M. and Hyeon-Woo, N. and Choi, W. and Oh, T.H.",
        TITLE = "BEAF: Observing Before-after Changes to Evaluate Hallucination in
Vision-language Models",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "XI: 232--248",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229827"}

@inproceedings{bb234854,
        AUTHOR = "Kim, M. and Kim, M. and Bae, J. and Choi, S. and Kim, S. and Chang, B.",
        TITLE = "Exploiting Semantic Reconstruction to Mitigate Hallucinations in
Vision-language Models",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "LXXXVI: 236--252",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229828"}

@inproceedings{bb234855,
        AUTHOR = "Guan, T.R. and Liu, F. and Wu, X. and Xian, R.Q. and Li, Z.X. and Liu, X.Y. and Wang, X. and Chen, L. and Huang, F. and Yacoob, Y. and Manocha, D. and Zhou, T.Y.",
        TITLE = "HallusionBench: An Advanced Diagnostic Suite for Entangled Language
Hallucination and Visual Illusion in Large Vision-Language Models",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "14375--14385",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229829"}

@inproceedings{bb234856,
        AUTHOR = "Leng, S. and Zhang, H. and Chen, G.Z. and Li, X. and Lu, S.J. and Miao, C.Y. and Bing, L.",
        TITLE = "Mitigating Object Hallucinations in Large Vision-Language Models
through Visual Contrastive Decoding",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "13872--13882",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229830"}

@inproceedings{bb234857,
        AUTHOR = "Wang, Z. and Bingham, G. and Yu, A.W. and Le, Q.V. and Luong, T. and Ghiasi, G.",
        TITLE = "HaloQuest: A Visual Hallucination Dataset for Advancing Multimodal
Reasoning",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "LXXVII: 288--304",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229831"}

@inproceedings{bb234858,
        AUTHOR = "Wang, T.J.J. and Laaksonen, J. and Langer, T. and Arponen, H. and Bishop, T.E.",
        TITLE = "Learning by Hallucinating:
Vision-Language Pre-training with Weak Supervision",
        BOOKTITLE = WACV23,
        YEAR = "2023",
        PAGES = "1073--1083",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803hallum4.html#TT229832"}

@article{bb234859,
        AUTHOR = "Wang, Y.Z. and Hu, W.B. and Dong, Y.P. and Liu, J. and Zhang, H.W. and Hong, R.C.",
        TITLE = "Align Is Not Enough: Multimodal Universal Jailbreak Attack Against
Multimodal Large Language Models",
        JOURNAL = CirSysVideo,
        VOLUME = "35",
        YEAR = "2025",
        NUMBER = "6",
        MONTH = jun,
        PAGES = "5475--5488",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT229833"}

@inproceedings{bb234860,
        AUTHOR = "Hossain, M.Z. and Imteaj, A.",
        TITLE = "SLADE: Shielding against Dual Exploits in Large Vision-Language
Models",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "24244--24254",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT229834"}

@inproceedings{bb234861,
        AUTHOR = "Jeong, J. and Bae, S. and Jung, Y. and Hwang, J. and Yang, E.",
        TITLE = "Playing the Fool: Jailbreaking LLMs and Multimodal LLMs with
Out-of-Distribution Strategy",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "29937--29946",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT229835"}

@inproceedings{bb234862,
        AUTHOR = "Yang, Z.P. and Fan, J. and Yan, A. and Gao, E. and Lin, X. and Li, T. and Mo, K. and Dong, C.",
        TITLE = "Distraction is All You Need for Multimodal Large Language Model
Jailbreaking",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "9467--9476",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT229836"}

@inproceedings{bb234863,
        AUTHOR = "Hao, S.Y. and Hooi, B. and Liu, J. and Chang, K.W. and Huang, Z. and Cai, Y.J.",
        TITLE = "Exploring Visual Vulnerabilities via Multi-Loss Adversarial Search
for Jailbreaking Vision-Language Models",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "19890--19899",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT229837"}

@inproceedings{bb234864,
        AUTHOR = "Wang, H. and Wang, G. and Zhang, H.",
        TITLE = "Steering Away from Harm: An Adaptive Approach to Defending Vision
Language Model Against Jailbreaks",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "29947--29957",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT229838"}

@inproceedings{bb234865,
        AUTHOR = "Ghosal, S.S. and Chakraborty, S. and Singh, V. and Guan, T.R. and Wang, M. and Beirami, A. and Huang, F. and Velasquez, A. and Manocha, D. and Bedi, A.S.",
        TITLE = "Immune: Improving Safety Against Jailbreaks in Multi-modal LLMs via
Inference-Time Alignment",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "25038--25049",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT229839"}

@inproceedings{bb234866,
        AUTHOR = "Xiang, Y.L. and Hong, Z.M. and Yao, L. and Wang, D.D. and Liu, T.L.",
        TITLE = "Jailbreaking the Non-Transferable Barrier via Test-Time Data
Disguising",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "30671--30681",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT229840"}

@inproceedings{bb234867,
        AUTHOR = "Chen, J.X. and Dong, J.H. and Xie, X.H.",
        TITLE = "Mind the Trojan Horse: Image Prompt Adapter Enabling Scalable and
Deceptive Jailbreaking",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "23785--23794",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT229841"}

@inproceedings{bb234868,
        AUTHOR = "Li, Y.F. and Guo, H. and Zhou, K. and Zhao, W.X. and Wen, J.R.",
        TITLE = "Images are Achilles' Heel of Alignment: Exploiting Visual
Vulnerabilities for Jailbreaking Multimodal Large Language Models",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "LXXIII: 174--189",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803jailb5.html#TT229842"}

@article{bb234869,
        AUTHOR = "Wu, Y.C. and Yang, J.C.",
        TITLE = "A Robust Passage Retrieval Algorithm for Video Question Answering",
        JOURNAL = CirSysVideo,
        VOLUME = "18",
        YEAR = "2008",
        NUMBER = "10",
        MONTH = oct,
        PAGES = "1411--1421",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229843"}

@inproceedings{bb234870,
        AUTHOR = "Wu, Y.C. and Lee, Y.S. and Yang, J.C. and Yen, S.J.",
        TITLE = "A New Passage Ranking Algorithm for Video Question Answering",
        BOOKTITLE = PSIVT06,
        YEAR = "2006",
        PAGES = "563--572",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229844"}

@article{bb234871,
        AUTHOR = "Li, G.D. and Li, H.J. and Ming, Z.Y. and Hong, R.C. and Tang, S. and Chua, T.S.",
        TITLE = "Question Answering over Community-Contributed Web Videos",
        JOURNAL = MultMedMag,
        VOLUME = "17",
        YEAR = "2010",
        NUMBER = "4",
        MONTH = oct,
        PAGES = "46--57",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229845"}

@inproceedings{bb234872,
        AUTHOR = "Song, Y.C. and Li, H.J.",
        TITLE = "Mash-Up Approach for Web Video Category Recommendation",
        BOOKTITLE = PSIVT10,
        YEAR = "2010",
        PAGES = "197--202",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229846"}

@article{bb234873,
        AUTHOR = "Guo, Z.Y. and Zhao, Z. and Jin, W. and Wei, Z.C. and Yang, M. and Wang, N.N. and Yuan, N.J.",
        TITLE = "Multi-Turn Video Question Generation via Reinforced Multi-Choice
Attention Network",
        JOURNAL = CirSysVideo,
        VOLUME = "31",
        YEAR = "2021",
        NUMBER = "5",
        MONTH = may,
        PAGES = "1697--1710",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229847"}

@article{bb234874,
        AUTHOR = "Xue, H.Y. and Chu, W. and Zhao, Z. and Cai, D.",
        TITLE = "A Better Way to Attend: Attention With Trees for Video Question
Answering",
        JOURNAL = IP,
        VOLUME = "27",
        YEAR = "2018",
        NUMBER = "11",
        MONTH = nov,
        PAGES = "5563--5574",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229848"}

@article{bb234875,
        AUTHOR = "Xue, H.Y. and Zhao, Z. and Cai, D.",
        TITLE = "Unifying the Video and Question Attentions for Open-Ended Video
Question Answering",
        JOURNAL = IP,
        VOLUME = "26",
        YEAR = "2017",
        NUMBER = "12",
        MONTH = dec,
        PAGES = "5656--5666",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229849"}

@article{bb234876,
  author    = {Zhao, Z. and Xiao, S.W. and Song, Z. and Lu, C.J. and Xiao, J. and Zhuang, Y.T.},
  title     = {Open-Ended Video Question Answering via Multi-Modal Conditional
Adversarial Networks},
  journal   = IP,
  volume    = {29},
  year      = {2020},
  pages     = {3859-3870},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229850}
}

@article{bb234877,
  author    = {Zhao, Z. and Zhang, Z. and Xiao, S.W. and Xiao, Z.X. and Yan, X.H. and Yu, J. and Cai, D. and Wu, F.},
  title     = {Long-Form Video Question Answering via Dynamic Hierarchical
Reinforced Networks},
  journal   = IP,
  volume    = {28},
  year      = {2019},
  number    = {12},
  month     = {December},
  pages     = {5939-5952},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229851}
}

@article{bb234878,
  author    = {Yu, T. and Yu, J. and Yu, Z. and Huang, Q.M. and Tian, Q.},
  title     = {Long-Term Video Question Answering via Multimodal Hierarchical Memory
Attentive Networks},
  journal   = CirSysVideo,
  volume    = {31},
  year      = {2021},
  number    = {3},
  month     = {March},
  pages     = {931-944},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229852}
}

@article{bb234879,
  author    = {Jang, Y. and Song, Y. and Kim, C.D. and Yu, Y. and Kim, Y. and Kim, G.},
  title     = {Video Question Answering with Spatio-Temporal Reasoning},
  journal   = IJCV,
  volume    = {127},
  year      = {2019},
  number    = {10},
  month     = {October},
  pages     = {1385-1412},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229853}
}

@inproceedings{bb234880,
  author    = {Jang, Y. and Song, Y. and Yu, Y. and Kim, Y. and Kim, G.},
  title     = {TGIF-QA:
Toward Spatio-Temporal Reasoning in Visual Question Answering},
  booktitle = CVPR17,
  year      = {2017},
  pages     = {1359-1367},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229854}
}

@article{bb234881,
        AUTHOR = "Yu, T. and Yu, J. and Yu, Z. and Tao, D.",
        TITLE = "Compositional Attention Networks With Two-Stream Fusion for Video
Question Answering",
        JOURNAL = IP,
        VOLUME = "29",
        YEAR = "2020",
        PAGES = "1204-1218",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229855"}

@article{bb234882,
  author    = {Wang, W.N. and Huang, Y. and Wang, L.},
  title     = {Long video question answering: A Matching-guided Attention Model},
  journal   = PR,
  volume    = {102},
  year      = {2020},
  pages     = {107248},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229856}
}

@article{bb234883,
  author    = {Zhang, W. and Tang, S. and Cao, Y. and Pu, S. and Wu, F. and Zhuang, Y.},
  title     = {Frame Augmented Alternating Attention Network for Video Question
Answering},
  journal   = MultMed,
  volume    = {22},
  year      = {2020},
  number    = {4},
  month     = {April},
  pages     = {1032-1041},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229857}
}

@article{bb234884,
  author    = {Chen, J. and Shao, J. and He, C.K.},
  title     = {Movie fill in the blank by joint learning from video and text with
adaptive temporal attention},
  journal   = PRL,
  volume    = {132},
  year      = {2020},
  pages     = {62-68},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229858}
}

@article{bb234885,
  author    = {Wang, A. and Luu, A.T. and Foo, C. and Zhu, H. and Tay, Y. and Chandrasekhar, V.},
  title     = {Holistic Multi-Modal Memory Network for Movie Question Answering},
  journal   = IP,
  volume    = {29},
  year      = {2020},
  number    = {1},
  pages     = {489-499},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229859}
}

@article{bb234886,
  author    = {Yuan, Z.Q. and Sun, S.Y. and Duan, L.X. and Li, C.S. and Wu, X. and Xu, C.S.},
  title     = {Adversarial Multimodal Network for Movie Story Question Answering},
  journal   = MultMed,
  volume    = {23},
  year      = {2021},
  pages     = {1744-1756},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229860}
}

@article{bb234887,
  author    = {Gu, M. and Zhao, Z. and Jin, W. and Hong, R. and Wu, F.},
  title     = {Graph-Based Multi-Interaction Network for Video Question Answering},
  journal   = IP,
  volume    = {30},
  year      = {2021},
  pages     = {2758-2770},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229861}
}

@article{bb234888,
  author    = {Xie, Z. and Wu, K.W. and Zhang, X.Y. and Yang, X.M. and Hou, J.K.},
  title     = {Learning continuous temporal embedding of videos using pattern theory},
  journal   = PRL,
  volume    = {146},
  year      = {2021},
  pages     = {222-229},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229862}
}

@article{bb234889,
  author    = {Liu, Y. and Zhang, X.M. and Zhang, Q.Y. and Li, C.Z. and Huang, F. and Tang, X.H. and Li, Z.J.},
  title     = {Dual self-attention with co-attention networks for visual question
answering},
  journal   = PR,
  volume    = {117},
  year      = {2021},
  pages     = {107956},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229863}
}

@article{bb234890,
  author    = {Liu, Y. and Zhang, X.M. and Huang, F. and Shen, S.X. and Tian, P. and Li, L. and Li, Z.J.},
  title     = {Dynamic Self-Attention with Vision Synchronization Networks for Video
Question Answering},
  journal   = PR,
  volume    = {132},
  year      = {2022},
  pages     = {108959},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229864}
}

@article{bb234891,
  author    = {Liu, Y. and Zhang, X.M. and Huang, F. and Zhang, B. and Li, Z.J.},
  title     = {Cross-Attentional Spatio-Temporal Semantic Graph Networks for Video
Question Answering},
  journal   = IP,
  volume    = {31},
  year      = {2022},
  pages     = {1684-1696},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229865}
}

@article{bb234892,
  author    = {Jin, W. and Zhao, Z. and Cao, X.C. and Zhu, J.M. and He, X.Q. and Zhuang, Y.T.},
  title     = {Adaptive Spatio-Temporal Graph Enhanced Vision-Language
Representation for Video QA},
  journal   = IP,
  volume    = {30},
  year      = {2021},
  pages     = {5477-5489},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229866}
}

@article{bb234893,
  author    = {Gao, L. and Chen, T.M. and Li, X.P. and Zeng, P.P. and Zhao, L. and Li, Y.F.},
  title     = {Generalized pyramid co-attention with learnable aggregation net for
video question answering},
  journal   = PR,
  volume    = {120},
  year      = {2021},
  pages     = {108145},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229867}
}

@article{bb234894,
  author    = {Le, T.M. and Le, V. and Venkatesh, S. and Tran, T.},
  title     = {Hierarchical Conditional Relation Networks for Multimodal Video
Question Answering},
  journal   = IJCV,
  volume    = {129},
  year      = {2021},
  number    = {11},
  month     = {November},
  pages     = {3027-3050},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229868}
}

@inproceedings{bb234895,
  author    = {Le, T.M. and Le, V. and Venkatesh, S. and Tran, T.},
  title     = {Hierarchical Conditional Relation Networks for Video Question
Answering},
  booktitle = CVPR20,
  year      = {2020},
  pages     = {9969-9978},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229869}
}

@article{bb234896,
  author    = {Su, H.T. and Chang, C.H. and Shen, P.W. and Wang, Y.S. and Chang, Y.L. and Chang, Y.C. and Cheng, P.J. and Hsu, W.H.},
  title     = {End-to-End Video Question-Answer Generation With Generator-Pretester
Network},
  journal   = CirSysVideo,
  volume    = {31},
  year      = {2021},
  number    = {11},
  month     = {November},
  pages     = {4497-4507},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229870}
}

@article{bb234897,
  author    = {Gao, L.L. and Lei, Y. and Zeng, P.P. and Song, J.K. and Wang, M. and Shen, H.T.},
  title     = {Hierarchical Representation Network With Auxiliary Tasks for Video
Captioning and Video Question Answering},
  journal   = IP,
  volume    = {31},
  year      = {2022},
  pages     = {202-215},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229871}
}

@article{bb234898,
  author    = {Zhang, J.P. and Shao, J. and Cao, R. and Gao, L.L. and Xu, X. and Shen, H.T.},
  title     = {Action-Centric Relation Transformer Network for Video Question
Answering},
  journal   = CirSysVideo,
  volume    = {32},
  year      = {2022},
  number    = {1},
  month     = {January},
  pages     = {63-74},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229872}
}

@article{bb234899,
  author    = {Zhang, H. and Sun, A. and Jing, W. and Zhen, L.L. and Zhou, J.T.Y. and Goh, R.S.M.},
  title     = {Natural Language Video Localization: A Revisit in Span-Based Question
Answering Framework},
  journal   = PAMI,
  volume    = {44},
  year      = {2022},
  number    = {8},
  month     = {August},
  pages     = {4252-4266},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vidq2.html#TT229873}
}

Last update: Oct 6, 2025 at 14:07:43