@article{bb243800,
AUTHOR = "Song, X. and Tian, W. and Zhu, Q. Q. and Zhang, X. L.",
TITLE = "{VideoMamba++}: Integrating state space model with dual attention for
enhanced video understanding",
JOURNAL = IVC,
VOLUME = "161",
YEAR = "2025",
PAGES = "105609",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238692"}
@article{bb243801,
AUTHOR = "Li, T. P. and Wang, H. and Li, Q. and Ni, Z.",
TITLE = "Vision-Language Relational Transformer for Video-to-Text Generation",
JOURNAL = MultMed,
VOLUME = "27",
YEAR = "2025",
PAGES = "4584--4596",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238693"}
@article{bb243802,
AUTHOR = "Korban, M. and Youngs, P. and Acton, S. T.",
TITLE = "Causal State Space Model for Video Understanding",
JOURNAL = SPLetters,
VOLUME = "32",
YEAR = "2025",
PAGES = "4314--4318",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238694"}
@inproceedings{bb243803,
AUTHOR = "Liu, Z. C. and Xu, K. L. and Su, B. and Zou, X. and Peng, Y. X. and Zhou, J. H.",
TITLE = "{STOP}: Integrated Spatial-Temporal Dynamic Prompting for Video
Understanding",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "13776--13786",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238695"}
@inproceedings{bb243804,
AUTHOR = "Wang, Q. H. and Shi, Y. K. and Ou, J. and Chen, R. and Lin, K. and Wang, J. H. and Jiang, B. and Yang, H. T. and Zheng, M. and Tao, X. and Yang, F. and Wan, P. F. and Zhang, D.",
TITLE = "{Koala-36M}: A Large-Scale Video Dataset Improving Consistency between
Fine-Grained Conditions and Video Content",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "8428--8437",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238696"}
@inproceedings{bb243805,
AUTHOR = "Ho, D. and Madden, S.",
TITLE = "{DejaVid}: Encoder-Agnostic Learned Temporal Matching for Video
Classification",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "24023--24032",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238697"}
@inproceedings{bb243806,
AUTHOR = "Manasyan, A. and Seitzer, M. and Radovic, F. and Martius, G. and Zadaianchuk, A.",
TITLE = "Temporally Consistent Object-Centric Learning by Contrasting Slots",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "5401--5411",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238698"}
@inproceedings{bb243807,
AUTHOR = "Bigverdi, M. and Luo, Z. and Hsieh, C. Y. and Shen, E. and Chen, D. P. and Shapiro, L. G. and Krishna, R.",
TITLE = "Perception Tokens Enhance Visual Reasoning in Multimodal Language
Models",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "3836--3845",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238699"}
@inproceedings{bb243808,
AUTHOR = "Ren, Z. W. and Wei, Y. C. and Guo, X. and Zhao, Y. and Kang, B. and Feng, J. S. and Jin, X. J.",
TITLE = "{VideoWorld}: Exploring Knowledge Learning from Unlabeled Videos",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "29029--29039",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238700"}
@inproceedings{bb243809,
AUTHOR = "Tang, Y. L. and Guo, J. J. and Hua, H. and Liang, S. and Feng, M. Q. and Li, X. Y. and Mao, R. and Huang, C. and Bi, J. and Zhang, Z. L. and Fazli, P. and Xu, C. L.",
TITLE = "{VidComposition}: Can {MLLMs} Analyze Compositions in Compiled Videos?",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "8490--8500",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238701"}
@inproceedings{bb243810,
AUTHOR = "Kim, K. and Park, G. and Lee, Y. and Yeo, W. and Hwang, S. J.",
TITLE = "{VideoICL}: Confidence-based Iterative In-context Learning for
Out-of-Distribution Video Understanding",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "3295--3305",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238702"}
@inproceedings{bb243811,
AUTHOR = "Huang, Z. P. and Li, X. H. and Li, J. Q. and Wang, J. and Zeng, X. Y. and Liang, C. and Wu, T. and Chen, X. and Li, L. and Wang, L. M.",
TITLE = "Online Video Understanding: {OVBench} and {VideoChat-Online}",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "3328--3338",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238703"}
@inproceedings{bb243812,
AUTHOR = "Liu, B. and Dong, Y. H. and Wang, Y. Q. and Ma, Z. X. and Tang, Y. S. and Tang, L. and Rao, Y. M. and Ma, W. C. and Krishna, R.",
TITLE = "Coarse Correspondences Boost Spatial-Temporal Reasoning in Multimodal
Language Model",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "3783--3792",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238704"}
@inproceedings{bb243813,
AUTHOR = "Zhao, Y. L. and Zhang, H. and Xie, L. and Hu, T. Y. and Gan, G. and Long, Y. and Hu, Z. Y. and Chen, W. Y. and Li, C. H. and Xu, Z. J. and Wang, C. Y. and Shangguan, Z. Y. and Liang, Z. W. and Liu, Y. X. and Zhao, C. and Cohan, A.",
TITLE = "{MMVU}: Measuring Expert-Level Multi-Discipline Video Understanding",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "8475--8489",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238705"}
@inproceedings{bb243814,
AUTHOR = "Lv, B. X. and Zha, Y. H. and Dai, T. and Yuerong, X. and Chen, K. and Xia, S. T.",
TITLE = "Adapting Pre-trained {3D} Models for Point Cloud Video Understanding
via Cross-frame Spatio-temporal Perception",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "12413--12422",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238706"}
@inproceedings{bb243815,
AUTHOR = "Ashutosh, K. and Nagarajan, T. and Pavlakos, G. and Kitani, K. and Grauman, K.",
TITLE = "{ExpertAF}: Expert Actionable Feedback from Video",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "13582--13594",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238707"}
@inproceedings{bb243816,
AUTHOR = "Yang, J. W. and Tan, R. and Wu, Q. H. and Zheng, R. J. and Peng, B. L. and Liang, Y. and Gu, Y. and Cai, M. and Ye, S. and Jang, J. and Deng, Y. Q. and Gao, J. F.",
TITLE = "Magma: A Foundation Model for Multimodal {AI} Agents",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "14203--14214",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238708"}
@inproceedings{bb243817,
AUTHOR = "Hu, K. and Gao, F. and Nie, X. H. and Zhou, P. and Tran, S. and Neiman, T. and Wang, L. Y. and Shah, M. and Hamid, R. and Yin, B. and Chilimbi, T.",
TITLE = "{M-LLM} Based Video Frame Selection for Efficient Video Understanding",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "13702--13712",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238709"}
@inproceedings{bb243818,
AUTHOR = "Jung, M. and Xiao, J. B. and Zhang, B. T. and Yao, A.",
TITLE = "On the Consistency of Video Large Language Models in Temporal
Comprehension",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "13713--13722",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238710"}
@inproceedings{bb243819,
AUTHOR = "Li, C. and Im, E. W. and Fazli, P.",
TITLE = "{VidHalluc}: Evaluating Temporal Hallucinations in Multimodal Large
Language Models for Video Understanding",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "13723--13733",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238711"}
@inproceedings{bb243820,
AUTHOR = "Liu, J. and Han, J. and Liu, L. and Aviles Rivero, A. I. and Jiang, C. and Liu, Z. and Wang, H. S.",
TITLE = "{Mamba4D}: Efficient {4D} Point Cloud Video Understanding with
Disentangled Spatial-Temporal State Space Models",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "17626--17636",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238712"}
@inproceedings{bb243821,
AUTHOR = "Das, S. and Mujavarsheik, B. and Lyngkhoi, R. E. Z. and Saha, S. and Maurya, A.",
TITLE = "Deciphering the Complaint Aspects: Towards an Aspect-Based Complaint
Identification Model with Video Complaint Dataset in Finance",
BOOKTITLE = WACV25,
YEAR = "2025",
PAGES = "7195--7204",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238713"}
@inproceedings{bb243822,
AUTHOR = "Liu, H. and Nakashima, Y. and Babaguchi, N.",
TITLE = "Paladin: Understanding Video Intentions in Political Advertisement
Videos",
BOOKTITLE = WACV25,
YEAR = "2025",
PAGES = "8239--8248",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238714"}
@inproceedings{bb243823,
AUTHOR = "Bae, K. and Ahn, G. and Kim, Y. and Choi, J.",
TITLE = "{DEVIAS}: Learning Disentangled Video Representations of Action and Scene",
BOOKTITLE = ECCV24,
YEAR = "2024",
PAGES = "LXVIII: 431--448",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238715"}
@inproceedings{bb243824,
AUTHOR = "Salehi, M. and Dorkenwald, M. and Thoker, F. M. and Gavves, E. and Snoek, C. G. M. and Asano, Y. M.",
TITLE = "Sigma: {Sinkhorn}-guided Masked Video Modeling",
BOOKTITLE = ECCV24,
YEAR = "2024",
PAGES = "XXIV: 293--312",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238716"}
@inproceedings{bb243825,
AUTHOR = "Xie, B. Z. and Zhang, S. C. and Zhou, Z. and Li, B. and Zhang, Y. H. and Hessel, J. and Yang, J. K. and Liu, Z. W.",
TITLE = "{FUNQA}: Towards Surprising Video Comprehension",
BOOKTITLE = ECCV24,
YEAR = "2024",
PAGES = "I: 39--57",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238717"}
@inproceedings{bb243826,
AUTHOR = "Choi, M. and Goel, H. and Omama, M. and Yang, Y. H. and Shah, S. and Chinchali, S.",
TITLE = "Towards Neuro-symbolic Video Understanding",
BOOKTITLE = ECCV24,
YEAR = "2024",
PAGES = "LXXVIII: 220--236",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238718"}
@inproceedings{bb243827,
AUTHOR = "Fan, Y. and Ma, X. J. and Wu, R. J. and Du, Y. T. and Li, J. Q. and Gao, Z. and Li, Q.",
TITLE = "Videoagent: A Memory-augmented Multimodal Agent for Video Understanding",
BOOKTITLE = ECCV24,
YEAR = "2024",
PAGES = "XXII: 75--92",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238719"}
@inproceedings{bb243828,
AUTHOR = "Wang, S. J. and Zhao, Q. and Do, M. Q. and Agarwal, N. and Lee, K. and Sun, C.",
TITLE = "{VAMOS}: Versatile Action Models for Video Understanding",
BOOKTITLE = ECCV24,
YEAR = "2024",
PAGES = "XII: 142--160",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238720"}
@inproceedings{bb243829,
AUTHOR = "Lebreton, P. and Le Callet, P. and Birkbeck, N. and Wang, Y. L. and Adsumilli, B.",
TITLE = "A Dataset for Understanding Open {UGC} Video Datasets",
BOOKTITLE = ICIP24,
YEAR = "2024",
PAGES = "165--171",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238721"}
@inproceedings{bb243830,
AUTHOR = "Wu, J. T. and Mo, S. T. and Atito, S. and Feng, Z. H. and Kittler, J. V. and Husain, S. S. and Awais, M.",
TITLE = "Masked Momentum Contrastive Learning for Semantic Understanding by
Observation",
BOOKTITLE = ICIP24,
YEAR = "2024",
PAGES = "263--269",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238722"}
@inproceedings{bb243831,
AUTHOR = "Yun, H. and Ahn, J. and Kim, M. and Kim, E. S.",
TITLE = "Compositional Video Understanding with Spatiotemporal Structure-based
Transformers",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "18751--18760",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238723"}
@inproceedings{bb243832,
AUTHOR = "Papalampidi, P. and Koppula, S. and Pathak, S. and Chiu, J. and Heyward, J. and Patraucean, V. and Shen, J. J. and Miech, A. and Zisserman, A. and Nematzdeh, A.",
TITLE = "A Simple Recipe for Contrastively Pre-Training Video-First Encoders
Beyond 16 Frames",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "14386--14397",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238724"}
@inproceedings{bb243833,
AUTHOR = "Wang, A. D. and Wu, B. and Chen, S. and Chen, Z. F. and Guan, H. T. and Lee, W. N. and Li, L. E. and Gan, C.",
TITLE = "{SOK-Bench}: A Situated Video Reasoning Benchmark with Aligned
Open-World Knowledge",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "13384--13394",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238725"}
@inproceedings{bb243834,
AUTHOR = "Zhong, Y. and Baghel, B. K.",
TITLE = "Multimodal Understanding of Memes with Fair Explanations",
BOOKTITLE = MULA24,
YEAR = "2024",
PAGES = "2007--2017",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238726"}
@inproceedings{bb243835,
AUTHOR = "Sheng, D. and Chen, D. D. and Tan, Z. T. and Liu, Q. and Chu, Q. and Bao, J. M. and Gong, T. and Liu, B. and Xu, S. W. and Yu, N. H.",
TITLE = "Towards More Unified In-Context Visual Understanding",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "13362--13372",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238727"}
@inproceedings{bb243836,
AUTHOR = "Ma, F. and Jin, X. J. and Wang, H. and Xian, Y. C. and Feng, J. S. and Yang, Y.",
TITLE = "Vista-llama: Reducing Hallucination in Video Language Models via
Equal Distance to Visual Tokens",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "13151--13160",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238728"}
@inproceedings{bb243837,
AUTHOR = "Tan, C. L. and Lai, J. H. and Zheng, W. S. and Hu, J. F.",
TITLE = "Siamese Learning with Joint Alignment and Regression for
Weakly-Supervised Video Paragraph Grounding",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "13569--13580",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238729"}
@inproceedings{bb243838,
AUTHOR = "Jin, P. and Takanobu, R. and Zhang, W. and Cao, X. C. and Yuan, L.",
TITLE = "{Chat-UniVi}: Unified Visual Representation Empowers Large Language
Models with Image and Video Understanding",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "13700--13710",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238730"}
@inproceedings{bb243839,
AUTHOR = "Chalk, J. and Huh, J. and Kazakos, E. and Zisserman, A. and Damen, D.",
TITLE = "{TIM}: A Time Interval Machine for Audio-Visual Action Recognition",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "18153--18163",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238731"}
@inproceedings{bb243840,
AUTHOR = "Wang, J. and Chen, D. D. and Luo, C. and He, B. and Yuan, L. and Wu, Z. X. and Jiang, Y. G.",
TITLE = "{OmniViD}: A Generative Framework for Universal Video Understanding",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "18209--18220",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238732"}
@inproceedings{bb243841,
AUTHOR = "Zeng, R. and Chen, X. Y. and Liang, J. M. and Wu, H. and Cao, G. Z. and Guo, Y.",
TITLE = "Benchmarking the Robustness of Temporal Action Detection Models
Against Temporal Corruptions",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "18263--18274",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238733"}
@inproceedings{bb243842,
AUTHOR = "Peirone, S. A. and Pistilli, F. and Alliegro, A. and Averta, G.",
TITLE = "A Backpack Full of Skills: Egocentric Video Understanding with
Diverse Task Perspectives",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "18275--18285",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238734"}
@inproceedings{bb243843,
AUTHOR = "Nguyen, T. T. and Nguyen, P. and Luu, K.",
TITLE = "{HIG}: Hierarchical Interlacement Graph Approach to Scene Graph
Generation in Video Understanding",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "18384--18394",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238735"}
@inproceedings{bb243844,
AUTHOR = "Tores, J. and Sassatelli, L. and Wu, H. Y. and Bergman, C. and Andolfi, L. and Ecrement, V. and Precioso, F. and Devars, T. and Guaresi, M. and Julliard, V. and Lecossais, S.",
TITLE = "Visual Objectification in Films: Towards a New {AI} Task for Video
Interpretation",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "10864--10874",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238736"}
@inproceedings{bb243845,
AUTHOR = "Jamal, M. A. and Mohareri, O.",
TITLE = "{M33D}: Learning {3D} priors using Multi-Modal Masked Autoencoders for {2D}
image and video understanding",
BOOKTITLE = WACV24,
YEAR = "2024",
PAGES = "2532--2542",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238737"}
@inproceedings{bb243846,
AUTHOR = "Li, K. C. and Wang, Y. L. and He, Y. and Li, Y. Z. and Wang, Y. and Wang, L. M. and Qiao, Y.",
TITLE = "{UniFormerV2}: Unlocking the Potential of Image {ViTs} for Video
Understanding",
BOOKTITLE = ICCV23,
YEAR = "2023",
PAGES = "1632--1643",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238738"}
@inproceedings{bb243847,
AUTHOR = "Zhao, Y. C. and Luo, C. and Tang, C. X. and Chen, D. D. and Codella, N. and Zha, Z. J.",
TITLE = "Streaming Video Model",
BOOKTITLE = CVPR23,
YEAR = "2023",
PAGES = "14602--14612",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238739"}
@inproceedings{bb243848,
AUTHOR = "Maiya, S. R. and Girish, S. and Ehrlich, M. and Wang, H. Y. and Lee, K. S. and Poirson, P. and Wu, P. X. and Wang, C. and Shrivastava, A.",
TITLE = "{NIRVANA}: Neural Implicit Representations of Videos with Adaptive
Networks and Autoregressive Patch-Wise Modeling",
BOOKTITLE = CVPR23,
YEAR = "2023",
PAGES = "14378--14387",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238740"}
@inproceedings{bb243849,
AUTHOR = "Zhang, Y. T. and Bai, Y. and Liu, C. and Wang, H. and Li, S. and Fu, Y.",
TITLE = "Frame Flexible Network",
BOOKTITLE = CVPR23,
YEAR = "2023",
PAGES = "10504--10513",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238741"}
@inproceedings{bb243850,
AUTHOR = "Dessalene, E. and Maynord, M. and Fermuller, C. and Aloimonos, Y. F.",
TITLE = "Therbligs in Action: Video Understanding through Motion Primitives",
BOOKTITLE = CVPR23,
YEAR = "2023",
PAGES = "10618--10626",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238742"}
@inproceedings{bb243851,
AUTHOR = "Zhao, Y. and Misra, I. and Krahenbuhl, P. and Girdhar, R.",
TITLE = "Learning Video Representations from Large Language Models",
BOOKTITLE = CVPR23,
YEAR = "2023",
PAGES = "6586--6597",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238743"}
@inproceedings{bb243852,
AUTHOR = "Wang, R. and Chen, D. D. and Wu, Z. X. and Chen, Y. P. and Dai, X. and Liu, M. C. and Yuan, L. and Jiang, Y. G.",
TITLE = "Masked Video Distillation: Rethinking Masked Feature Modeling for
Self-supervised Video Representation Learning",
BOOKTITLE = CVPR23,
YEAR = "2023",
PAGES = "6312--6322",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238744"}
@inproceedings{bb243853,
AUTHOR = "Foo, L. G. and Gong, J. and Fan, Z. P. and Liu, J.",
TITLE = "System-Status-Aware Adaptive Network for Online Streaming Video
Understanding",
BOOKTITLE = CVPR23,
YEAR = "2023",
PAGES = "10514--10523",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238745"}
@inproceedings{bb243854,
AUTHOR = "Dong, S. and Hu, H. Z. and Lian, D. Z. and Luo, W. X. and Qian, Y. C. and Gao, S. H.",
TITLE = "Weakly Supervised Video Representation Learning with Unaligned Text
for Sequential Videos",
BOOKTITLE = CVPR23,
YEAR = "2023",
PAGES = "2437--2447",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238746"}
@inproceedings{bb243855,
AUTHOR = "Zhang, H. and Liu, D. and Zheng, Q. and Su, B.",
TITLE = "Modeling Video as Stochastic Processes for Fine-Grained Video
Representation Learning",
BOOKTITLE = CVPR23,
YEAR = "2023",
PAGES = "2225--2234",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238747"}
@inproceedings{bb243856,
AUTHOR = "Kumar, Y. and Mishra, A.",
TITLE = "Few-Shot Referring Relationships in Videos",
BOOKTITLE = CVPR23,
YEAR = "2023",
PAGES = "2289--2298",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238748"}
@inproceedings{bb243857,
AUTHOR = "Harzig, P. and Einfalt, M. and Lienhart, R.",
TITLE = "Synchronized Audio-Visual Frames with Fractional Positional Encoding
for Transformers in Video-to-Text Translation",
BOOKTITLE = ICIP22,
YEAR = "2022",
PAGES = "2041--2045",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238749"}
@inproceedings{bb243858,
AUTHOR = "Wiles, O. and Carreira, J. and Barr, I. and Zisserman, A. and Malinowski, M.",
TITLE = "Compressed Vision for Efficient Video Understanding",
BOOKTITLE = ACCV22,
YEAR = "2022",
PAGES = "VII:679--695",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238750"}
@inproceedings{bb243859,
AUTHOR = "Rho, D. and Cho, J. and Ko, J. H. and Park, E.",
TITLE = "Neural Residual Flow Fields for Efficient Video Representations",
BOOKTITLE = ACCV22,
YEAR = "2022",
PAGES = "II:458--474",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238751"}
@inproceedings{bb243860,
AUTHOR = "Tian, F. R. and Fan, J. W. and Yu, X. and Du, S. Y. and Song, M. and Zhao, Y.",
TITLE = "{TCVM}: Temporal Contrasting Video Montage Framework for Self-Supervised
Video Representation Learning",
BOOKTITLE = ACCV22,
YEAR = "2022",
PAGES = "II:526--542",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238752"}
@inproceedings{bb243861,
AUTHOR = "Huang, Z. M. and Jia, C. M. and Wang, S. S. and Ma, S. W.",
TITLE = "A Compressive Prior Guided Mask Predictive Coding Approach for Video
Analysis",
BOOKTITLE = ACCV22,
YEAR = "2022",
PAGES = "IV:469--484",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238753"}
@inproceedings{bb243862,
AUTHOR = "Li, L. and Zhuang, L. S. and Gao, S. H. and Wang, S.",
TITLE = "Havit: Hybrid-attention Based Vision Transformer for Video
Classification",
BOOKTITLE = ACCV22,
YEAR = "2022",
PAGES = "IV:502--517",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238754"}
@inproceedings{bb243863,
AUTHOR = "Zhang, H. L. and Pirsiavash, H. and Liu, X.",
TITLE = "{MASTAF}: A Model-Agnostic Spatio-Temporal Attention Fusion Network for
Few-shot Video Classification",
BOOKTITLE = WACV23,
YEAR = "2023",
PAGES = "2507--2516",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238755"}
@inproceedings{bb243864,
AUTHOR = "Senocak, A. and Kim, J. and Oh, T. H. and Li, D. Z. and Kweon, I. S.",
TITLE = "Event-Specific Audio-Visual Fusion Layers:
A Simple and New Perspective on Video Understanding",
BOOKTITLE = WACV23,
YEAR = "2023",
PAGES = "2236--2246",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238756"}
@inproceedings{bb243865,
AUTHOR = "Xia, B. Y. and Wu, W. H. and Wang, H. R. and Su, R. and He, D. L. and Yang, H. and Fan, X. R. and Ouyang, W. L.",
TITLE = "{NSNet}: Non-saliency Suppression Sampler for Efficient Video Recognition",
BOOKTITLE = ECCV22,
YEAR = "2022",
PAGES = "XXXIV:705--723",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238757"}
@inproceedings{bb243866,
AUTHOR = "Xia, B. Y. and Wang, Z. H. and Wu, W. H. and Wang, H. R. and Han, J. G.",
TITLE = "Temporal Saliency Query Network for Efficient Video Recognition",
BOOKTITLE = ECCV22,
YEAR = "2022",
PAGES = "XXXIV:741--759",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238758"}
@inproceedings{bb243867,
AUTHOR = "Habibian, A. and Yahia, H. B. and Abati, D. and Gavves, E. and Porikli, F. M.",
TITLE = "Delta Distillation for Efficient Video Processing",
BOOKTITLE = ECCV22,
YEAR = "2022",
PAGES = "XXXV:213--229",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238759"}
@inproceedings{bb243868,
AUTHOR = "Li, Z. Z. and Wang, M. M. and Pi, H. J. and Xu, K. and Mei, J. B. and Liu, Y.",
TITLE = "{E-NeRV}: Expedite Neural Video Representation with Disentangled
Spatial-Temporal Context",
BOOKTITLE = ECCV22,
YEAR = "2022",
PAGES = "XXXV:267--284",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238760"}
@inproceedings{bb243869,
AUTHOR = "Kosman, E. and di Castro, D.",
TITLE = "{GraphVid}: It only Takes a Few Nodes to Understand a Video",
BOOKTITLE = ECCV22,
YEAR = "2022",
PAGES = "XXXV:195--212",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238761"}
@inproceedings{bb243870,
AUTHOR = "Ju, C. and Han, T. and Zheng, K. and Zhang, Y. and Xie, W.",
TITLE = "Prompting Visual-Language Models for Efficient Video Understanding",
BOOKTITLE = ECCV22,
YEAR = "2022",
PAGES = "XXXV:105--124",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238762"}
@inproceedings{bb243871,
AUTHOR = "Liang, S. X. and Shen, X. and Huang, J. Q. and Hua, X. S.",
TITLE = "Delving into Details: Synopsis-to-Detail Networks for Video Recognition",
BOOKTITLE = ECCV22,
YEAR = "2022",
PAGES = "IV:262--278",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238763"}
@inproceedings{bb243872,
AUTHOR = "Ur Rehman, Y. A. and Gao, Y. and Shen, J. J. and de Gusmao, P. P. B. and Lane, N.",
TITLE = "Federated Self-supervised Learning for Video Understanding",
BOOKTITLE = ECCV22,
YEAR = "2022",
PAGES = "XXXI:506--522",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238764"}
@inproceedings{bb243873,
AUTHOR = "Dadashzadeh, A. and Whone, A. and Mirmehdi, M.",
TITLE = "Auxiliary Learning for Self-Supervised Video Representation via
Similarity-based Knowledge Distillation",
BOOKTITLE = L3D-IVU22,
YEAR = "2022",
PAGES = "4230--4239",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238765"}
@inproceedings{bb243874,
AUTHOR = "Li, Y. and Vasconcelos, N. M.",
TITLE = "Improving Video Model Transfer with Dynamic Representation Learning",
BOOKTITLE = CVPR22,
YEAR = "2022",
PAGES = "19258--19269",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238766"}
@inproceedings{bb243875,
AUTHOR = "Guo, S. and Xiong, Z. H. and Zhong, Y. J. and Wang, L. M. and Guo, X. B. and Han, B. and Huang, W. L.",
TITLE = "Cross-Architecture Self-supervised Video Representation Learning",
BOOKTITLE = CVPR22,
YEAR = "2022",
PAGES = "19248--19257",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238767"}
@inproceedings{bb243876,
AUTHOR = "Xu, X. Y. and Li, Y. L. and Lu, C.",
TITLE = "Learning to Anticipate Future with Dynamic Context Removal",
BOOKTITLE = CVPR22,
YEAR = "2022",
PAGES = "12724--12734",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238768"}
@inproceedings{bb243877,
AUTHOR = "Gadre, S. Y. and Ehsani, K. and Song, S. and Mottaghi, R.",
TITLE = "Continuous Scene Representations for Embodied {AI}",
BOOKTITLE = CVPR22,
YEAR = "2022",
PAGES = "14829--14839",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238769"}
@inproceedings{bb243878,
AUTHOR = "Liang, C. and Wang, W. G. and Zhou, T. F. and Yang, Y.",
TITLE = "Visual Abductive Reasoning",
BOOKTITLE = CVPR22,
YEAR = "2022",
PAGES = "15544--15554",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238770"}
@inproceedings{bb243879,
AUTHOR = "Kinfu, K. A. and Vidal, R.",
TITLE = "Analysis and Extensions of Adversarial Training for Video
Classification",
BOOKTITLE = RoSe22,
YEAR = "2022",
PAGES = "3415--3424",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238771"}
@inproceedings{bb243880,
AUTHOR = "Xiao, F. and Kundu, K. and Tighe, J. and Modolo, D.",
TITLE = "Hierarchical Self-supervised Representation Learning for Movie
Understanding",
BOOKTITLE = CVPR22,
YEAR = "2022",
PAGES = "9717--9726",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238772"}
@inproceedings{bb243881,
AUTHOR = "Li, L. L. and Zhou, T. F. and Wang, W. G. and Yang, L. and Li, J. W. and Yang, Y.",
TITLE = "Locality-Aware Inter-and Intra-Video Reconstruction for
Self-Supervised Correspondence Learning",
BOOKTITLE = CVPR22,
YEAR = "2022",
PAGES = "8709--8720",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238773"}
@inproceedings{bb243882,
AUTHOR = "Jiang, Y. F. and Gong, X. Y. and Wu, J. and Shi, H. and Yan, Z. C. and Wang, Z. Y.",
TITLE = "{Auto-X3D}: Ultra-Efficient Video Understanding via Finer-Grained
Neural Architecture Search",
BOOKTITLE = WACV22,
YEAR = "2022",
PAGES = "2354--2363",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238774"}
@inproceedings{bb243883,
AUTHOR = "Chen, N. L. and Chu, L. and Pan, H. and Lu, Y. and Wang, W. P.",
TITLE = "Self-Supervised Image Representation Learning with Geometric Set
Consistency",
BOOKTITLE = CVPR22,
YEAR = "2022",
PAGES = "19270--19280",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238775"}
@inproceedings{bb243884,
AUTHOR = "Lin, Y. Z. and Guo, X. and Lu, Y.",
TITLE = "Self-Supervised Video Representation Learning with Meta-Contrastive
Network",
BOOKTITLE = ICCV21,
YEAR = "2021",
PAGES = "8219--8229",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238776"}
@inproceedings{bb243885,
AUTHOR = "Guo, X. D. and Guo, X. and Lu, Y.",
TITLE = "{SSAN}: Separable Self-Attention Network for Video Representation
Learning",
BOOKTITLE = CVPR21,
YEAR = "2021",
PAGES = "12613--12622",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238777"}
@inproceedings{bb243886,
AUTHOR = "Yang, X. T. and Fan, H. Q. and Torresani, L. and Davis, L. S. and Wang, H.",
TITLE = "Beyond Short Clips:
End-to-End Video-Level Learning with Collaborative Memories",
BOOKTITLE = CVPR21,
YEAR = "2021",
PAGES = "7563--7572",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238778"}
@inproceedings{bb243887,
AUTHOR = "Zhang, C. H. and Gupta, A. and Zisserman, A.",
TITLE = "Temporal Query Networks for Fine-grained Video Understanding",
BOOKTITLE = CVPR21,
YEAR = "2021",
PAGES = "4484--4494",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238779"}
@inproceedings{bb243888,
AUTHOR = "Kangaspunta, J. and Piergiovanni, A. and Jonschkowski, R. and Ryoo, M. and Angelova, A.",
TITLE = "Adaptive Intermediate Representations for Video Understanding",
BOOKTITLE = MULA21,
YEAR = "2021",
PAGES = "1602--1612",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238780"}
@inproceedings{bb243889,
AUTHOR = "Duan, H. D. and Zhao, Y. and Xiong, Y. J. and Liu, W. T. and Lin, D.",
TITLE = "Omni-sourced Webly-supervised Learning for Video Recognition",
BOOKTITLE = ECCV20,
YEAR = "2020",
PAGES = "XV:670--688",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238781"}
@inproceedings{bb243890,
AUTHOR = "Jha, A. and Kumar, A. and Pande, S. and Banerjee, B. and Chaudhuri, S.",
TITLE = "{MT-UNET}: A Novel {U-Net} Based Multi-Task Architecture For Visual Scene
Understanding",
BOOKTITLE = ICIP20,
YEAR = "2020",
PAGES = "2191--2195",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238782"}
@inproceedings{bb243891,
AUTHOR = "Diba, A. and Fayyaz, M. and Sharma, V. and Paluri, M. and Gall, J. and Stiefelhagen, R. and Van Gool, L. J.",
TITLE = "Large Scale Holistic Video Understanding",
BOOKTITLE = ECCV20,
YEAR = "2020",
PAGES = "V:593--610",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238783"}
@inproceedings{bb243892,
AUTHOR = "Voigtlaender, P. and Changpinyo, S. and Pont Tuset, J. and Soricut, R. and Ferrari, V.",
TITLE = "Connecting Vision and Language with Video Localized Narratives",
BOOKTITLE = CVPR23,
YEAR = "2023",
PAGES = "2461--2471",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238784"}
@inproceedings{bb243893,
AUTHOR = "Pont Tuset, J. and Uijlings, J. and Changpinyo, S. and Soricut, R. and Ferrari, V.",
TITLE = "Connecting Vision and Language with Localized Narratives",
BOOKTITLE = ECCV20,
YEAR = "2020",
PAGES = "V:647--664",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238785"}
@inproceedings{bb243894,
AUTHOR = "Hu, A. and Cotter, F. and Mohan, N. and Gurau, C. and Kendall, A.",
TITLE = "Probabilistic Future Prediction for Video Scene Understanding",
BOOKTITLE = ECCV20,
YEAR = "2020",
PAGES = "XVI: 767--785",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238786"}
@inproceedings{bb243895,
AUTHOR = "Mavroudi, E. and Haro, B. B. and Vidal, R.",
TITLE = "Representation Learning on Visual-Symbolic Graphs for Video
Understanding",
BOOKTITLE = ECCV20,
YEAR = "2020",
PAGES = "XXIX: 71--90",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238787"}
@inproceedings{bb243896,
AUTHOR = "Sener, F. and Singhania, D. and Yao, A.",
TITLE = "Temporal Aggregate Representations for Long-range Video Understanding",
BOOKTITLE = ECCV20,
YEAR = "2020",
PAGES = "XVI: 154--171",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238788"}
@inproceedings{bb243897,
AUTHOR = "Tosi, F. and Aleotti, F. and Ramirez, P. Z. and Poggi, M. and Salti, S. and di Stefano, L. and Mattoccia, S.",
TITLE = "Distilled Semantics for Comprehensive Scene Understanding from Videos",
BOOKTITLE = CVPR20,
YEAR = "2020",
PAGES = "4653--4664",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238789"}
@inproceedings{bb243898,
AUTHOR = "Piergiovanni, A. J. and Angelova, A. and Ryoo, M. S.",
TITLE = "Evolving Losses for Unsupervised Video Representation Learning",
BOOKTITLE = CVPR20,
YEAR = "2020",
PAGES = "130--139",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238790"}
@inproceedings{bb243899,
AUTHOR = "Xiong, Y. and Huang, Q. and Guo, L. and Zhou, H. and Zhou, B. and Lin, D.",
TITLE = "A Graph-Based Framework to Bridge Movies and Synopses",
BOOKTITLE = ICCV19,
YEAR = "2019",
PAGES = "4591--4600",
BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT238791"}
Last update: Nov 26, 2025 at 20:24:09