% Video-understanding bibliography entries, keys bb242300 through bb242399.
% Layout: one entry per block, one field per line, original field order kept.
% BOOKTITLE / JOURNAL values (CVPR24, ECCV22, PAMI, CirSysVideo, ...) are bare
% string macros; none of them is defined in this part of the file, so they are
% presumably declared elsewhere -- confirm before compiling this chunk alone.
% A Roman-numeral prefix inside PAGES (e.g. "XVI:767--785") appears to be the
% proceedings volume/part followed by the page range -- TODO confirm.
% BIBSOURCE holds a visionbib.com URL for each entry (presumably the page the
% entry was generated from).
% Braces inside TITLE protect acronyms and proper names from style recasing.
%
% Entries bb242300 to bb242382 carry BIBSOURCE URLs on applicat825vu1.html.

@inproceedings{bb242300,
  AUTHOR    = "Chalk, J. and Huh, J. and Kazakos, E. and Zisserman, A. and Damen, D.",
  TITLE     = "{TIM}: A Time Interval Machine for Audio-Visual Action Recognition",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "18153--18163",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237201"
}

@inproceedings{bb242301,
  AUTHOR    = "Wang, J. and Chen, D. D. and Luo, C. and He, B. and Yuan, L. and Wu, Z. X. and Jiang, Y. G.",
  TITLE     = "{OmniViD}: A Generative Framework for Universal Video Understanding",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "18209--18220",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237202"
}

@inproceedings{bb242302,
  AUTHOR    = "Zeng, R. and Chen, X. Y. and Liang, J. M. and Wu, H. and Cao, G. Z. and Guo, Y.",
  TITLE     = "Benchmarking the Robustness of Temporal Action Detection Models Against Temporal Corruptions",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "18263--18274",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237203"
}

@inproceedings{bb242303,
  AUTHOR    = "Peirone, S. A. and Pistilli, F. and Alliegro, A. and Averta, G.",
  TITLE     = "A Backpack Full of Skills: Egocentric Video Understanding with Diverse Task Perspectives",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "18275--18285",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237204"
}

@inproceedings{bb242304,
  AUTHOR    = "Nguyen, T. T. and Nguyen, P. and Luu, K.",
  TITLE     = "{HIG}: Hierarchical Interlacement Graph Approach to Scene Graph Generation in Video Understanding",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "18384--18394",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237205"
}

@inproceedings{bb242305,
  AUTHOR    = "Tores, J. and Sassatelli, L. and Wu, H. Y. and Bergman, C. and Andolfi, L. and Ecrement, V. and Precioso, F. and Devars, T. and Guaresi, M. and Julliard, V. and Lecossais, S.",
  TITLE     = "Visual Objectification in Films: Towards a New {AI} Task for Video Interpretation",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "10864--10874",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237206"
}

@inproceedings{bb242306,
  AUTHOR    = "Jamal, M. A. and Mohareri, O.",
  TITLE     = "{M33D}: Learning {3D} priors using Multi-Modal Masked Autoencoders for {2D} image and video understanding",
  BOOKTITLE = WACV24,
  YEAR      = "2024",
  PAGES     = "2532--2542",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237207"
}

@inproceedings{bb242307,
  AUTHOR    = "Li, K. C. and Wang, Y. L. and He, Y. and Li, Y. Z. and Wang, Y. and Wang, L. M. and Qiao, Y.",
  TITLE     = "{UniFormerV2}: Unlocking the Potential of Image {ViTs} for Video Understanding",
  BOOKTITLE = ICCV23,
  YEAR      = "2023",
  PAGES     = "1632--1643",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237208"
}

@inproceedings{bb242308,
  AUTHOR    = "Zhao, Y. C. and Luo, C. and Tang, C. X. and Chen, D. D. and Codella, N. and Zha, Z. J.",
  TITLE     = "Streaming Video Model",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "14602--14612",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237209"
}

@inproceedings{bb242309,
  AUTHOR    = "Maiya, S. R. and Girish, S. and Ehrlich, M. and Wang, H. Y. and Lee, K. S. and Poirson, P. and Wu, P. X. and Wang, C. and Shrivastava, A.",
  TITLE     = "{NIRVANA}: Neural Implicit Representations of Videos with Adaptive Networks and Autoregressive Patch-Wise Modeling",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "14378--14387",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237210"
}

@inproceedings{bb242310,
  AUTHOR    = "Zhang, Y. T. and Bai, Y. and Liu, C. and Wang, H. and Li, S. and Fu, Y.",
  TITLE     = "Frame Flexible Network",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "10504--10513",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237211"
}

@inproceedings{bb242311,
  AUTHOR    = "Dessalene, E. and Maynord, M. and Fermuller, C. and Aloimonos, Y. F.",
  TITLE     = "Therbligs in Action: Video Understanding through Motion Primitives",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "10618--10626",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237212"
}

@inproceedings{bb242312,
  AUTHOR    = "Zhao, Y. and Misra, I. and Krahenbuhl, P. and Girdhar, R.",
  TITLE     = "Learning Video Representations from Large Language Models",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "6586--6597",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237213"
}

@inproceedings{bb242313,
  AUTHOR    = "Wang, R. and Chen, D. D. and Wu, Z. X. and Chen, Y. P. and Dai, X. and Liu, M. C. and Yuan, L. and Jiang, Y. G.",
  TITLE     = "Masked Video Distillation: Rethinking Masked Feature Modeling for Self-supervised Video Representation Learning",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "6312--6322",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237214"
}

@inproceedings{bb242314,
  AUTHOR    = "Foo, L. G. and Gong, J. and Fan, Z. P. and Liu, J.",
  TITLE     = "System-Status-Aware Adaptive Network for Online Streaming Video Understanding",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "10514--10523",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237215"
}

@inproceedings{bb242315,
  AUTHOR    = "Dong, S. and Hu, H. Z. and Lian, D. Z. and Luo, W. X. and Qian, Y. C. and Gao, S. H.",
  TITLE     = "Weakly Supervised Video Representation Learning with Unaligned Text for Sequential Videos",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "2437--2447",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237216"
}

@inproceedings{bb242316,
  AUTHOR    = "Zhang, H. and Liu, D. and Zheng, Q. and Su, B.",
  TITLE     = "Modeling Video as Stochastic Processes for Fine-Grained Video Representation Learning",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "2225--2234",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237217"
}

@inproceedings{bb242317,
  AUTHOR    = "Kumar, Y. and Mishra, A.",
  TITLE     = "Few-Shot Referring Relationships in Videos",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "2289--2298",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237218"
}

@inproceedings{bb242318,
  AUTHOR    = "Harzig, P. and Einfalt, M. and Lienhart, R.",
  TITLE     = "Synchronized Audio-Visual Frames with Fractional Positional Encoding for Transformers in Video-to-Text Translation",
  BOOKTITLE = ICIP22,
  YEAR      = "2022",
  PAGES     = "2041--2045",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237219"
}

@inproceedings{bb242319,
  AUTHOR    = "Wiles, O. and Carreira, J. and Barr, I. and Zisserman, A. and Malinowski, M.",
  TITLE     = "Compressed Vision for Efficient Video Understanding",
  BOOKTITLE = ACCV22,
  YEAR      = "2022",
  PAGES     = "VII:679--695",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237220"
}

@inproceedings{bb242320,
  AUTHOR    = "Rho, D. and Cho, J. and Ko, J. H. and Park, E.",
  TITLE     = "Neural Residual Flow Fields for Efficient Video Representations",
  BOOKTITLE = ACCV22,
  YEAR      = "2022",
  PAGES     = "II:458--474",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237221"
}

@inproceedings{bb242321,
  AUTHOR    = "Tian, F. R. and Fan, J. W. and Yu, X. and Du, S. Y. and Song, M. and Zhao, Y.",
  TITLE     = "{TCVM}: Temporal Contrasting Video Montage Framework for Self-Supervised Video Representation Learning",
  BOOKTITLE = ACCV22,
  YEAR      = "2022",
  PAGES     = "II:526--542",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237222"
}

@inproceedings{bb242322,
  AUTHOR    = "Huang, Z. M. and Jia, C. M. and Wang, S. S. and Ma, S. W.",
  TITLE     = "A Compressive Prior Guided Mask Predictive Coding Approach for Video Analysis",
  BOOKTITLE = ACCV22,
  YEAR      = "2022",
  PAGES     = "IV:469--484",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237223"
}

@inproceedings{bb242323,
  AUTHOR    = "Li, L. and Zhuang, L. S. and Gao, S. H. and Wang, S.",
  TITLE     = "Havit: Hybrid-attention Based Vision Transformer for Video Classification",
  BOOKTITLE = ACCV22,
  YEAR      = "2022",
  PAGES     = "IV:502--517",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237224"
}

@inproceedings{bb242324,
  AUTHOR    = "Zhang, H. L. and Pirsiavash, H. and Liu, X.",
  TITLE     = "{MASTAF}: A Model-Agnostic Spatio-Temporal Attention Fusion Network for Few-shot Video Classification",
  BOOKTITLE = WACV23,
  YEAR      = "2023",
  PAGES     = "2507--2516",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237225"
}

@inproceedings{bb242325,
  AUTHOR    = "Senocak, A. and Kim, J. and Oh, T. H. and Li, D. Z. and Kweon, I. S.",
  TITLE     = "Event-Specific Audio-Visual Fusion Layers: A Simple and New Perspective on Video Understanding",
  BOOKTITLE = WACV23,
  YEAR      = "2023",
  PAGES     = "2236--2246",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237226"
}

@inproceedings{bb242326,
  AUTHOR    = "Xia, B. Y. and Wu, W. H. and Wang, H. R. and Su, R. and He, D. L. and Yang, H. and Fan, X. R. and Ouyang, W. L.",
  TITLE     = "{NSNet}: Non-saliency Suppression Sampler for Efficient Video Recognition",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "XXXIV:705--723",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237227"
}

@inproceedings{bb242327,
  AUTHOR    = "Xia, B. Y. and Wang, Z. H. and Wu, W. H. and Wang, H. R. and Han, J. G.",
  TITLE     = "Temporal Saliency Query Network for Efficient Video Recognition",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "XXXIV:741--759",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237228"
}

@inproceedings{bb242328,
  AUTHOR    = "Habibian, A. and Yahia, H. B. and Abati, D. and Gavves, E. and Porikli, F. M.",
  TITLE     = "Delta Distillation for Efficient Video Processing",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "XXXV:213--229",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237229"
}

@inproceedings{bb242329,
  AUTHOR    = "Li, Z. Z. and Wang, M. M. and Pi, H. J. and Xu, K. and Mei, J. B. and Liu, Y.",
  TITLE     = "{E-NeRV}: Expedite Neural Video Representation with Disentangled Spatial-Temporal Context",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "XXXV:267--284",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237230"
}

@inproceedings{bb242330,
  AUTHOR    = "Kosman, E. and di Castro, D.",
  TITLE     = "{GraphVid}: It only Takes a Few Nodes to Understand a Video",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "XXXV:195--212",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237231"
}

@inproceedings{bb242331,
  AUTHOR    = "Ju, C. and Han, T. and Zheng, K. and Zhang, Y. and Xie, W.",
  TITLE     = "Prompting Visual-Language Models for Efficient Video Understanding",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "XXXV:105--124",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237232"
}

@inproceedings{bb242332,
  AUTHOR    = "Liang, S. X. and Shen, X. and Huang, J. Q. and Hua, X. S.",
  TITLE     = "Delving into Details: Synopsis-to-Detail Networks for Video Recognition",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "IV:262--278",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237233"
}

@inproceedings{bb242333,
  AUTHOR    = "Ur Rehman, Y. A. and Gao, Y. and Shen, J. J. and de Gusmao, P. P. B. and Lane, N.",
  TITLE     = "Federated Self-supervised Learning for Video Understanding",
  BOOKTITLE = ECCV22,
  YEAR      = "2022",
  PAGES     = "XXXI:506--522",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237234"
}

@inproceedings{bb242334,
  AUTHOR    = "Dadashzadeh, A. and Whone, A. and Mirmehdi, M.",
  TITLE     = "Auxiliary Learning for Self-Supervised Video Representation via Similarity-based Knowledge Distillation",
  BOOKTITLE = L3D-IVU22,
  YEAR      = "2022",
  PAGES     = "4230--4239",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237235"
}

@inproceedings{bb242335,
  AUTHOR    = "Li, Y. and Vasconcelos, N. M.",
  TITLE     = "Improving Video Model Transfer with Dynamic Representation Learning",
  BOOKTITLE = CVPR22,
  YEAR      = "2022",
  PAGES     = "19258--19269",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237236"
}

@inproceedings{bb242336,
  AUTHOR    = "Guo, S. and Xiong, Z. H. and Zhong, Y. J. and Wang, L. M. and Guo, X. B. and Han, B. and Huang, W. L.",
  TITLE     = "Cross-Architecture Self-supervised Video Representation Learning",
  BOOKTITLE = CVPR22,
  YEAR      = "2022",
  PAGES     = "19248--19257",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237237"
}

@inproceedings{bb242337,
  AUTHOR    = "Xu, X. Y. and Li, Y. L. and Lu, C.",
  TITLE     = "Learning to Anticipate Future with Dynamic Context Removal",
  BOOKTITLE = CVPR22,
  YEAR      = "2022",
  PAGES     = "12724--12734",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237238"
}

@inproceedings{bb242338,
  AUTHOR    = "Gadre, S. Y. and Ehsani, K. and Song, S. and Mottaghi, R.",
  TITLE     = "Continuous Scene Representations for Embodied {AI}",
  BOOKTITLE = CVPR22,
  YEAR      = "2022",
  PAGES     = "14829--14839",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237239"
}

@inproceedings{bb242339,
  AUTHOR    = "Liang, C. and Wang, W. G. and Zhou, T. F. and Yang, Y.",
  TITLE     = "Visual Abductive Reasoning",
  BOOKTITLE = CVPR22,
  YEAR      = "2022",
  PAGES     = "15544--15554",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237240"
}

@inproceedings{bb242340,
  AUTHOR    = "Kinfu, K. A. and Vidal, R.",
  TITLE     = "Analysis and Extensions of Adversarial Training for Video Classification",
  BOOKTITLE = RoSe22,
  YEAR      = "2022",
  PAGES     = "3415--3424",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237241"
}

@inproceedings{bb242341,
  AUTHOR    = "Xiao, F. and Kundu, K. and Tighe, J. and Modolo, D.",
  TITLE     = "Hierarchical Self-supervised Representation Learning for Movie Understanding",
  BOOKTITLE = CVPR22,
  YEAR      = "2022",
  PAGES     = "9717--9726",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237242"
}

@inproceedings{bb242342,
  AUTHOR    = "Li, L. L. and Zhou, T. F. and Wang, W. G. and Yang, L. and Li, J. W. and Yang, Y.",
  TITLE     = "Locality-Aware Inter-and Intra-Video Reconstruction for Self-Supervised Correspondence Learning",
  BOOKTITLE = CVPR22,
  YEAR      = "2022",
  PAGES     = "8709--8720",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237243"
}

@inproceedings{bb242343,
  AUTHOR    = "Jiang, Y. F. and Gong, X. Y. and Wu, J. and Shi, H. and Yan, Z. C. and Wang, Z. Y.",
  TITLE     = "{Auto-X3D}: Ultra-Efficient Video Understanding via Finer-Grained Neural Architecture Search",
  BOOKTITLE = WACV22,
  YEAR      = "2022",
  PAGES     = "2354--2363",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237244"
}

@inproceedings{bb242344,
  AUTHOR    = "Chen, N. L. and Chu, L. and Pan, H. and Lu, Y. and Wang, W. P.",
  TITLE     = "Self-Supervised Image Representation Learning with Geometric Set Consistency",
  BOOKTITLE = CVPR22,
  YEAR      = "2022",
  PAGES     = "19270--19280",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237245"
}

@inproceedings{bb242345,
  AUTHOR    = "Lin, Y. Z. and Guo, X. and Lu, Y.",
  TITLE     = "Self-Supervised Video Representation Learning with Meta-Contrastive Network",
  BOOKTITLE = ICCV21,
  YEAR      = "2021",
  PAGES     = "8219--8229",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237246"
}

@inproceedings{bb242346,
  AUTHOR    = "Guo, X. D. and Guo, X. and Lu, Y.",
  TITLE     = "{SSAN}: Separable Self-Attention Network for Video Representation Learning",
  BOOKTITLE = CVPR21,
  YEAR      = "2021",
  PAGES     = "12613--12622",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237247"
}

@inproceedings{bb242347,
  AUTHOR    = "Yang, X. T. and Fan, H. Q. and Torresani, L. and Davis, L. S. and Wang, H.",
  TITLE     = "Beyond Short Clips: End-to-End Video-Level Learning with Collaborative Memories",
  BOOKTITLE = CVPR21,
  YEAR      = "2021",
  PAGES     = "7563--7572",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237248"
}

@inproceedings{bb242348,
  AUTHOR    = "Zhang, C. H. and Gupta, A. and Zisserman, A.",
  TITLE     = "Temporal Query Networks for Fine-grained Video Understanding",
  BOOKTITLE = CVPR21,
  YEAR      = "2021",
  PAGES     = "4484--4494",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237249"
}

@inproceedings{bb242349,
  AUTHOR    = "Kangaspunta, J. and Piergiovanni, A. and Jonschkowski, R. and Ryoo, M. and Angelova, A.",
  TITLE     = "Adaptive Intermediate Representations for Video Understanding",
  BOOKTITLE = MULA21,
  YEAR      = "2021",
  PAGES     = "1602--1612",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237250"
}

@inproceedings{bb242350,
  AUTHOR    = "Duan, H. D. and Zhao, Y. and Xiong, Y. J. and Liu, W. T. and Lin, D.",
  TITLE     = "Omni-sourced Webly-supervised Learning for Video Recognition",
  BOOKTITLE = ECCV20,
  YEAR      = "2020",
  PAGES     = "XV:670--688",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237251"
}

@inproceedings{bb242351,
  AUTHOR    = "Jha, A. and Kumar, A. and Pande, S. and Banerjee, B. and Chaudhuri, S.",
  TITLE     = "{MT-UNET}: A Novel {U-Net} Based Multi-Task Architecture For Visual Scene Understanding",
  BOOKTITLE = ICIP20,
  YEAR      = "2020",
  PAGES     = "2191--2195",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237252"
}

@inproceedings{bb242352,
  AUTHOR    = "Diba, A. and Fayyaz, M. and Sharma, V. and Paluri, M. and Gall, J. and Stiefelhagen, R. and Van Gool, L. J.",
  TITLE     = "Large Scale Holistic Video Understanding",
  BOOKTITLE = ECCV20,
  YEAR      = "2020",
  PAGES     = "V:593--610",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237253"
}

@inproceedings{bb242353,
  AUTHOR    = "Voigtlaender, P. and Changpinyo, S. and Pont Tuset, J. and Soricut, R. and Ferrari, V.",
  TITLE     = "Connecting Vision and Language with Video Localized Narratives",
  BOOKTITLE = CVPR23,
  YEAR      = "2023",
  PAGES     = "2461--2471",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237254"
}

@inproceedings{bb242354,
  AUTHOR    = "Pont Tuset, J. and Uijlings, J. and Changpinyo, S. and Soricut, R. and Ferrari, V.",
  TITLE     = "Connecting Vision and Language with Localized Narratives",
  BOOKTITLE = ECCV20,
  YEAR      = "2020",
  PAGES     = "V:647--664",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237255"
}

@inproceedings{bb242355,
  AUTHOR    = "Hu, A. and Cotter, F. and Mohan, N. and Gurau, C. and Kendall, A.",
  TITLE     = "Probabilistic Future Prediction for Video Scene Understanding",
  BOOKTITLE = ECCV20,
  YEAR      = "2020",
  PAGES     = "XVI:767--785",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237256"
}

@inproceedings{bb242356,
  AUTHOR    = "Mavroudi, E. and Haro, B. B. and Vidal, R.",
  TITLE     = "Representation Learning on Visual-Symbolic Graphs for Video Understanding",
  BOOKTITLE = ECCV20,
  YEAR      = "2020",
  PAGES     = "XXIX:71--90",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237257"
}

@inproceedings{bb242357,
  AUTHOR    = "Sener, F. and Singhania, D. and Yao, A.",
  TITLE     = "Temporal Aggregate Representations for Long-range Video Understanding",
  BOOKTITLE = ECCV20,
  YEAR      = "2020",
  PAGES     = "XVI:154--171",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237258"
}

@inproceedings{bb242358,
  AUTHOR    = "Tosi, F. and Aleotti, F. and Ramirez, P. Z. and Poggi, M. and Salti, S. and di Stefano, L. and Mattoccia, S.",
  TITLE     = "Distilled Semantics for Comprehensive Scene Understanding from Videos",
  BOOKTITLE = CVPR20,
  YEAR      = "2020",
  PAGES     = "4653--4664",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237259"
}

@inproceedings{bb242359,
  AUTHOR    = "Piergiovanni, A. J. and Angelova, A. and Ryoo, M. S.",
  TITLE     = "Evolving Losses for Unsupervised Video Representation Learning",
  BOOKTITLE = CVPR20,
  YEAR      = "2020",
  PAGES     = "130--139",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237260"
}

@inproceedings{bb242360,
  AUTHOR    = "Xiong, Y. and Huang, Q. and Guo, L. and Zhou, H. and Zhou, B. and Lin, D.",
  TITLE     = "A Graph-Based Framework to Bridge Movies and Synopses",
  BOOKTITLE = ICCV19,
  YEAR      = "2019",
  PAGES     = "4591--4600",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237261"
}

@inproceedings{bb242361,
  AUTHOR    = "Kanehira, A. and Takemoto, K. and Inayoshi, S. and Harada, T.",
  TITLE     = "Multimodal Explanations by Predicting Counterfactuality in Videos",
  BOOKTITLE = CVPR19,
  YEAR      = "2019",
  PAGES     = "8586--8594",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237262"
}

@inproceedings{bb242362,
  AUTHOR    = "Kanehira, A. and Harada, T.",
  TITLE     = "Learning to Explain With Complemental Examples",
  BOOKTITLE = CVPR19,
  YEAR      = "2019",
  PAGES     = "8595--8603",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237263"
}

@inproceedings{bb242363,
  AUTHOR    = "Zhou, L. and Kalantidis, Y. and Chen, X. L. and Corso, J. J. and Rohrbach, M.",
  TITLE     = "Grounded Video Description",
  BOOKTITLE = CVPR19,
  YEAR      = "2019",
  PAGES     = "6571--6580",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237264"
}

@inproceedings{bb242364,
  AUTHOR    = "Liu, X. Y. and Lee, J. Y. and Jin, H. L.",
  TITLE     = "Learning Video Representations From Correspondence Proposals",
  BOOKTITLE = CVPR19,
  YEAR      = "2019",
  PAGES     = "4268--4276",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237265"
}

@inproceedings{bb242365,
  AUTHOR    = "Xiong, B. and Kalantidis, Y. and Ghadiyaram, D. and Grauman, K.",
  TITLE     = "Less Is More: Learning Highlight Detection From Video Duration",
  BOOKTITLE = CVPR19,
  YEAR      = "2019",
  PAGES     = "1258--1267",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237266"
}

@inproceedings{bb242366,
  AUTHOR    = "Zhang, D. and Dai, X. and Wang, X. and Wang, Y. F. and Davis, L. S.",
  TITLE     = "{MAN}: Moment Alignment Network for Natural Language Moment Retrieval via Iterative Graph Adjustment",
  BOOKTITLE = CVPR19,
  YEAR      = "2019",
  PAGES     = "1247--1257",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237267"
}

@inproceedings{bb242367,
  AUTHOR    = "Fan, L. and Huang, W. and Gan, C. and Ermon, S. and Gong, B. and Huang, J.",
  TITLE     = "End-to-End Learning of Motion Representation for Video Understanding",
  BOOKTITLE = CVPR18,
  YEAR      = "2018",
  PAGES     = "6016--6025",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237268"
}

@inproceedings{bb242368,
  AUTHOR    = "Huang, D. and Ramanathan, V. and Mahajan, D. and Torresani, L. and Paluri, M. and Fei Fei, L. and Niebles, J. C.",
  TITLE     = "What Makes a Video a Video: Analyzing Temporal Information in Video Understanding Models and Datasets",
  BOOKTITLE = CVPR18,
  YEAR      = "2018",
  PAGES     = "7366--7375",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237269"
}

@inproceedings{bb242369,
  AUTHOR    = "Mahdisoltani, F. and Memisevic, R. and Fleet, D. J.",
  TITLE     = "Hierarchical Video Understanding",
  BOOKTITLE = WiCV-E18,
  YEAR      = "2018",
  PAGES     = "IV:659--663",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237270"
}

@inproceedings{bb242370,
  AUTHOR    = "Shin, K. S. and Jeon, J. and Lee, S. and Lim, B. and Jeong, M. S. and Nang, J.",
  TITLE     = "Approach for Video Classification with Multi-label on {YouTube-8M} Dataset",
  BOOKTITLE = Large-Scale18,
  YEAR      = "2018",
  PAGES     = "IV:317--324",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237271"
}

@inproceedings{bb242371,
  AUTHOR    = "Skalic, M. and Austin, D.",
  TITLE     = "Building A Size Constrained Predictive Models for Video Classification",
  BOOKTITLE = Large-Scale18,
  YEAR      = "2018",
  PAGES     = "IV:297--305",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237272"
}

@inproceedings{bb242372,
  AUTHOR    = "Garg, S.",
  TITLE     = "Learning Video Features for Multi-label Classification",
  BOOKTITLE = Large-Scale18,
  YEAR      = "2018",
  PAGES     = "IV:325--337",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237273"
}

@inproceedings{bb242373,
  AUTHOR    = "Cho, C. and Antin, B. and Arora, S. and Ashrafi, S. and Duan, P. L. and Huynh, D. T. and James, L. and Nguyen, H. T. and Solgi, M. and Than, C. V.",
  TITLE     = "Large-Scale Video Classification with Feature Space Augmentation Coupled with Learned Label Relations and Ensembling",
  BOOKTITLE = Large-Scale18,
  YEAR      = "2018",
  PAGES     = "IV:338--346",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237274"
}

@inproceedings{bb242374,
  AUTHOR    = "Lin, R. C. and Xiao, J. and Fan, J. P.",
  TITLE     = "{NeXtVLAD}: An Efficient Neural Network to Aggregate Frame-Level Features for Large-Scale Video Classification",
  BOOKTITLE = Large-Scale18,
  YEAR      = "2018",
  PAGES     = "IV:206--218",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237275"
}

@inproceedings{bb242375,
  AUTHOR    = "Tang, Y. Y. and Zhang, X. and Wang, J. W. and Chen, S. X. and Ma, L. and Jiang, Y. G.",
  TITLE     = "Non-local {NetVLAD} Encoding for Video Classification",
  BOOKTITLE = Large-Scale18,
  YEAR      = "2018",
  PAGES     = "IV:219--228",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237276"
}

@inproceedings{bb242376,
  AUTHOR    = "Kmiec, S. and Bae, J. and An, R. J.",
  TITLE     = "Learnable Pooling Methods for Video Classification",
  BOOKTITLE = Large-Scale18,
  YEAR      = "2018",
  PAGES     = "IV:229--238",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237277"
}

@inproceedings{bb242377,
  AUTHOR    = "Liu, T. Q. and Liu, B.",
  TITLE     = "Constrained-Size {Tensorflow} Models for {YouTube-8M} Video Understanding Challenge",
  BOOKTITLE = Large-Scale18,
  YEAR      = "2018",
  PAGES     = "IV:239--249",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237278"
}

@inproceedings{bb242378,
  AUTHOR    = "Lee, J. and Natsev, A. P. and Reade, W. and Sukthankar, R. and Toderici, G.",
  TITLE     = "The 2nd {YouTube-8M} Large-Scale Video Understanding Challenge",
  BOOKTITLE = Large-Scale18,
  YEAR      = "2018",
  PAGES     = "IV:193--205",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237279"
}

@inproceedings{bb242379,
  AUTHOR    = "Zolfaghari, M. and Singh, K. and Brox, T.",
  TITLE     = "{ECO}: Efficient Convolutional Network for Online Video Understanding",
  BOOKTITLE = ECCV18,
  YEAR      = "2018",
  PAGES     = "II:713--730",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237280"
}

@inproceedings{bb242380,
  AUTHOR    = "Sah, S. and Nguyen, T. and Dominguez, M. and Such, F. P. and Ptucha, R.",
  TITLE     = "Temporally Steered {Gaussian} Attention for Video Understanding",
  BOOKTITLE = DeepLearn-T17,
  YEAR      = "2017",
  PAGES     = "2208--2216",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237281"
}

@inproceedings{bb242381,
  AUTHOR    = "Jiang, Y. G. and Ye, G. and Chang, S. F. and Ellis, D. and Loui, A. C.",
  TITLE     = "Consumer video understanding: a benchmark database and an evaluation of human and machine performance",
  BOOKTITLE = ICMR11,
  YEAR      = "2011",
  PAGES     = "29",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237282"
}

@inproceedings{bb242382,
  AUTHOR    = "Yang, Y. and Liu, J. G. and Shah, M.",
  TITLE     = "Video Scene Understanding Using Multi-scale Analysis",
  BOOKTITLE = ICCV09,
  YEAR      = "2009",
  PAGES     = "1669--1676",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825vu1.html#TT237283"
}

% Entries bb242383 to bb242399 carry BIBSOURCE URLs on applicat825lovu2.html.
% MONTH uses the standard three-letter month macros (dec, may), unquoted.

@article{bb242383,
  AUTHOR    = "Pang, B. and Peng, G. and Li, Y. Z. and Lu, C.",
  TITLE     = "Markov Progressive Framework, a Universal Paradigm for Modeling Long Videos",
  JOURNAL   = PAMI,
  VOLUME    = "46",
  YEAR      = "2024",
  NUMBER    = "12",
  MONTH     = dec,
  PAGES     = "9749--9765",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825lovu2.html#TT237284"
}

@article{bb242384,
  AUTHOR    = "You, Z. and Wen, Z. Q. and Chen, Y. F. and Li, X. and Zeng, R. H. and Wang, Y. W. and Tan, M. K.",
  TITLE     = "Toward Long Video Understanding via Fine-Detailed Video Story Generation",
  JOURNAL   = CirSysVideo,
  VOLUME    = "35",
  YEAR      = "2025",
  NUMBER    = "5",
  MONTH     = may,
  PAGES     = "4592--4607",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825lovu2.html#TT237285"
}

@inproceedings{bb242385,
  AUTHOR    = "Liu, S. M. and Zhao, C. and Xu, T. Q. and Ghanem, B.",
  TITLE     = "{BOLT}: Boost Large Vision-Language Model Without Training for Long-Form Video Understanding",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "3318--3327",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825lovu2.html#TT237286"
}

@inproceedings{bb242386,
  AUTHOR    = "Jang, H. and Yu, S. and Shin, J. and Abbeel, P. and Seo, Y.",
  TITLE     = "Efficient Long Video Tokenization via Coordinate-based Patch Reconstruction",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "22853--22863",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825lovu2.html#TT237287"
}

@inproceedings{bb242387,
  AUTHOR    = "Man, Y. B. and Huang, Y. and Zhang, C. M. and Li, B. Z. and Niu, W. and Yin, M.",
  TITLE     = "{AdaCM2}: On Understanding Extremely Long-Term Video with Adaptive Cross-Modality Memory Reduction",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "8534--8544",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825lovu2.html#TT237288"
}

@inproceedings{bb242388,
  AUTHOR    = "Ren, W. M. and Yang, H. and Min, J. and Wei, C. and Chen, W.",
  TITLE     = "{VISTA}: Enhancing Long-Duration and High-Resolution Video Understanding by {VIdeo} {SpatioTemporal} Augmentation",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "3804--3814",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825lovu2.html#TT237289"
}

@inproceedings{bb242389,
  AUTHOR    = "Wang, Z. Y. and Yu, S. and Stengel Eskin, E. and Yoon, J. and Cheng, F. and Bertasius, G. and Bansal, M.",
  TITLE     = "{VideoTree}: Adaptive Tree-based Video Representation for {LLM} Reasoning on Long Videos",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "3272--3282",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825lovu2.html#TT237290"
}

@inproceedings{bb242390,
  AUTHOR    = "Ye, J. H. and Wang, Z. and Sun, H. and Chandrasegaran, K. and Durante, Z. and Eyzaguirre, C. and Bisk, Y. and Niebles, J. C. and Adeli, E. and Fei Fei, L. and Wu, J. J. and Li, M.",
  TITLE     = "Re-thinking Temporal Search for Long-Form Video Understanding",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "8579--8591",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825lovu2.html#TT237291"
}

@inproceedings{bb242391,
  AUTHOR    = "Wang, L. and Chen, Y. J. and Tran, D. and Boddeti, V. N. and Chu, W. S.",
  TITLE     = "{SEAL}: {SEmantic} Attention Learning for Long Video Representation",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "26192--26201",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825lovu2.html#TT237292"
}

@inproceedings{bb242392,
  AUTHOR    = "Pan, Y. and Zhang, C. and Bertasius, G.",
  TITLE     = "Basket: A Large-Scale Video Dataset for Fine-Grained Skill Estimation",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "28952--28962",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825lovu2.html#TT237293"
}

@inproceedings{bb242393,
  AUTHOR    = "Zhou, J. J. and Shu, Y. and Zhao, B. and Wu, B. and Liang, Z. Y. and Xiao, S. T. and Qin, M. H. and Yang, X. and Xiong, Y. P. and Zhang, B. and Huang, T. J. and Liu, Z.",
  TITLE     = "{MLVU}: Benchmarking Multi-task Long Video Understanding",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "13691--13701",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825lovu2.html#TT237294"
}

@inproceedings{bb242394,
  AUTHOR    = "Shu, Y. and Liu, Z. and Zhang, P. and Qin, M. H. and Zhou, J. J. and Liang, Z. Y. and Huang, T. J. and Zhao, B.",
  TITLE     = "{Video-XL}: Extra-Long Vision Language Model for Hour-Scale Video Understanding",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "26160--26169",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825lovu2.html#TT237295"
}

@inproceedings{bb242395,
  AUTHOR    = "Tang, X. and Qiu, J. and Xie, L. X. and Tian, Y. J. and Jiao, J. B. and Ye, Q. X.",
  TITLE     = "Adaptive Keyframe Sampling for Long Video Understanding",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "29118--29128",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825lovu2.html#TT237296"
}

@inproceedings{bb242396,
  AUTHOR    = "Ventura, L. and Yang, A. and Schmid, C. and Varol, G.",
  TITLE     = "{Chapter-Llama}: Efficient Chaptering in Hour-Long Videos with {LLMs}",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "18947--18958",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825lovu2.html#TT237297"
}

@inproceedings{bb242397,
  AUTHOR    = "Geng, T. T. and Zhang, J. and Wang, Q. and Wang, T. and Duan, J. M. and Zheng, F.",
  TITLE     = "{LongVALE}: Vision-Audio-Language-Event Benchmark Towards Time-Aware Omni-Modal Perception of Long Videos",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "18959--18969",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825lovu2.html#TT237298"
}

@inproceedings{bb242398,
  AUTHOR    = "Kim, J. and Kim, H. and Lee, H. and Ro, Y. M.",
  TITLE     = "{SALOVA}: Segment-Augmented Long Video Assistant for Targeted Retrieval and Routing in Long-Form Video Analysis",
  BOOKTITLE = CVPR25,
  YEAR      = "2025",
  PAGES     = "3352--3362",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825lovu2.html#TT237299"
}

@inproceedings{bb242399,
  AUTHOR    = "Song, E. and Chai, W. H. and Wang, G. and Zhang, Y. C. and Zhou, H. Y. and Wu, F. and Chi, H. Z. and Guo, X. and Ye, T. and Zhang, Y. T. and Lu, Y. and Hwang, J. N. and Wang, G. A.",
  TITLE     = "{MovieChat}: From Dense Token to Sparse Memory for Long Video Understanding",
  BOOKTITLE = CVPR24,
  YEAR      = "2024",
  PAGES     = "18221--18232",
  BIBSOURCE = "http://www.visionbib.com/bibliography/applicat825lovu2.html#TT237300"
}