Keith Price Bibliography Bibtex Entry (ANCHOR 282200 URL https://doi.org/10.1007/s11263-024-02261-x PAGES 1644-1664 YEAR 2025 MONTH April NUMBER 4 BIBSOURCE http://www.visionbib.com/bibliography/people916.html#TT276897 VOLUME 133 JOURNAL IJCV AUTHOR Zhou, J.X. and Shen, X.Y. and Wang, J.Y. and Zhang, J.Y. and Sun, W.X. and Zhang, J. and Birchfield, S. and Guo, D. and Kong, L.P. and Wang, M. and Zhong, Y.R. TITLE Audio-Visual Segmentation with Semantics)


@article{bb282200,
        AUTHOR = "Zhou, J.X. and Shen, X.Y. and Wang, J.Y. and Zhang, J.Y. and Sun, W.X. and Zhang, J. and Birchfield, S. and Guo, D. and Kong, L.P. and Wang, M. and Zhong, Y.R.",
        TITLE = "Audio-Visual Segmentation with Semantics",
        JOURNAL = IJCV,
        VOLUME = "133",
        YEAR = "2025",
        NUMBER = "4",
        MONTH = apr,
        PAGES = "1644--1664",
        DOI = "10.1007/s11263-024-02261-x",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276897"}

@article{bb282201,
        AUTHOR = "Gao, J.Y. and Chen, M.Y. and Xu, C.S.",
        TITLE = "Learning Probabilistic Presence-Absence Evidence for
Weakly-Supervised Audio-Visual Event Perception",
        JOURNAL = PAMI,
        VOLUME = "47",
        YEAR = "2025",
        NUMBER = "6",
        MONTH = jun,
        PAGES = "4787--4802",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276898"}

@inproceedings{bb282202,
        AUTHOR = "Gao, J.Y. and Chen, M.Y. and Xu, C.S.",
        TITLE = "Collecting Cross-Modal Presence-Absence Evidence for
Weakly-Supervised Audio-Visual Event Perception",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "18827--18836",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276899"}

@article{bb282203,
        AUTHOR = "Lv, Y. and Liu, Z. and Chang, X.J.",
        TITLE = "Consistency-Queried Transformer for Audio-Visual Segmentation",
        JOURNAL = IP,
        VOLUME = "34",
        YEAR = "2025",
        PAGES = "2616--2627",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276900"}

@article{bb282204,
        AUTHOR = "Zhu, Y. and Li, K. and Yang, Z.X.",
        TITLE = "Exploiting EfficientSAM and Temporal Coherence for Audio-Visual
Segmentation",
        JOURNAL = MultMed,
        VOLUME = "27",
        YEAR = "2025",
        PAGES = "2999--3008",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276901"}

@article{bb282205,
        AUTHOR = "Liao, J.H. and Duan, H.H. and Feng, K.H. and Zhao, W.B. and Yang, Y.B. and Chen, L.Y. and Chen, Y.R.",
        TITLE = "LR-ASD: Lightweight and Robust Network for Active Speaker Detection",
        JOURNAL = IJCV,
        VOLUME = "133",
        YEAR = "2025",
        NUMBER = "7",
        MONTH = jul,
        PAGES = "4749--4769",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276902"}

@article{bb282206,
        AUTHOR = "Mao, Y.X. and Zhang, J. and Xiang, M. and Lv, Y.Q. and Li, D. and Zhong, Y.R. and Dai, Y.C.",
        TITLE = "Contrastive Conditional Latent Diffusion for Audio-Visual
Segmentation",
        JOURNAL = IP,
        VOLUME = "34",
        YEAR = "2025",
        PAGES = "4108--4119",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276903"}

@article{bb282207,
        AUTHOR = "Li, K.W. and Chen, H. and Du, J. and Zhou, H.S. and Siniscalchi, S.M. and Niu, S.T. and Xiong, S.F.",
        TITLE = "Lightweight Audio-Visual Wake Word Spotting With Diverse Acoustic
Knowledge Distillation",
        JOURNAL = CirSysVideo,
        VOLUME = "35",
        YEAR = "2025",
        NUMBER = "7",
        MONTH = jul,
        PAGES = "7308--7320",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276904"}

@comment{NOTE(review): bb282208 PAGES is the placeholder "xx-yy" as received; replace with the real page range or article number once known.}
@article{bb282208,
        AUTHOR = "Vilaca, L. and Yu, Y. and Viana, P.",
        TITLE = "A Survey of Recent Advances and Challenges in Deep Audio-Visual
Correlation Learning",
        JOURNAL = Surveys,
        VOLUME = "57",
        YEAR = "2025",
        NUMBER = "12",
        MONTH = jul,
        PAGES = "xx-yy",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276905"}

@article{bb282209,
        AUTHOR = "Zhu, C.Z. and Shao, J.L. and Lin, J.X. and Wang, Y.J. and Wang, J. and Tang, J.H. and Li, K.",
        TITLE = "fMRI2GES: Co-Speech Gesture Reconstruction From fMRI Signal With Dual
Brain Decoding Alignment",
        JOURNAL = CirSysVideo,
        VOLUME = "35",
        YEAR = "2025",
        NUMBER = "9",
        MONTH = sep,
        PAGES = "9017--9029",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276906"}

@article{bb282210,
        AUTHOR = "Zhou, D.L. and Zhang, Y.K. and Wu, J.H. and Zhang, X.Y. and Xie, L. and Yin, E.",
        TITLE = "AVE Speech: A Comprehensive Multimodal Dataset for Speech Recognition
Integrating Audio, Visual, and Electromyographic Signals",
        JOURNAL = HMS,
        VOLUME = "55",
        YEAR = "2025",
        NUMBER = "4",
        MONTH = aug,
        PAGES = "559--568",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276907"}

@article{bb282211,
        AUTHOR = "Xuan, H.Y. and Liu, T.X. and Dong, W.X. and Li, Z.H. and Chen, S.",
        TITLE = "X-STA: Cross-Modal Spatial-Temporal Alignment Network for Unified
Audio-Visual Segmentation",
        JOURNAL = SPLetters,
        VOLUME = "32",
        YEAR = "2025",
        PAGES = "2883--2887",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276908"}

@article{bb282212,
        AUTHOR = "Gong, S. and Zhuge, Y.Z. and Zhang, L. and Wang, Y.F. and Zhang, P.P. and Wang, L.J. and Lu, H.C.",
        TITLE = "AVS-Mamba: Exploring Temporal and Multi-Modal Mamba for Audio-Visual
Segmentation",
        JOURNAL = MultMed,
        VOLUME = "27",
        YEAR = "2025",
        PAGES = "5413--5425",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276909"}

@article{bb282213,
        AUTHOR = {Attia, D. and Benazza Benyahia, A.},
        TITLE = {Recognizing of Vocal Fold Disorders From High Speed Video: Use of Spatio-Temporal Deep Neural Networks},
        JOURNAL = IJIST,
        YEAR = {2025},
        VOLUME = {35},
        NUMBER = {5},
        PAGES = {e70170},
        BIBSOURCE = {http://www.visionbib.com/bibliography/people916.html#TT276910}}

@inproceedings{bb282214,
        AUTHOR = "Huang, S. and Wu, J.X. and Wei, X.Y. and Cai, Y. and Jiang, D.M. and Wang, Y.W.",
        TITLE = "Sound Bridge: Associating Egocentric and Exocentric Videos via Audio
Cues",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "28942--28951",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276911"}

@inproceedings{bb282215,
        AUTHOR = "Du, H.H. and Li, G.Y. and Zhou, C. and Zhang, C.J. and Zhao, A. and Hu, D.",
        TITLE = "Crab: A Unified Audio-Visual Scene Understanding Model with Explicit
Cooperation",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "18804--18814",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276912"}

@inproceedings{bb282216,
        AUTHOR = "Shaar, E. and Shaulov, A. and Chechik, G. and Wolf, L.B.",
        TITLE = "Adapting to the Unknown: Training-Free Audio-Visual Event Perception
with Dynamic Thresholds",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "3142--3151",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276913"}

@inproceedings{bb282217,
        AUTHOR = "Huang, S.F. and Ling, R. and Hui, T.R. and Li, H.Y. and Zhou, X. and Zhang, S.F. and Liu, S. and Hong, R.C. and Wang, M.",
        TITLE = "Revisiting Audio-Visual Segmentation with Vision-Centric Transformer",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "8352--8361",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276914"}

@inproceedings{bb282218,
        AUTHOR = "Wu, X.C. and Sun, H. and Wang, Y.F. and Nie, J. and Zhang, J. and Wang, Y.B. and Xue, J.X. and He, L.",
        TITLE = "AVF-MAE++: Scaling Affective Video Facial Masked Autoencoders via
Efficient Audio-Visual Self-Supervised Learning",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "9142--9153",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276915"}

@inproceedings{bb282219,
        AUTHOR = "Lai, Y.H. and Ebbers, J. and Wang, Y.C.A.F. and Germain, F. and Jones, M.J. and Chatterjee, M.",
        TITLE = "UWAV: Uncertainty-Weighted Weakly-Supervised Audio-Visual Video
Parsing",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "13561--13570",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276916"}

@inproceedings{bb282220,
        AUTHOR = "Guo, R. and Ying, X.H. and Chen, Y. and Niu, D. and Li, G.Y. and Qu, L. and Qi, Y. and Zhou, J.X. and Xing, B. and Yue, W.Z. and Shi, J. and Wang, Q. and Zhang, P.L. and Liang, B.",
        TITLE = "Audio-Visual Instance Segmentation",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "13550--13560",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276917"}

@inproceedings{bb282221,
        AUTHOR = "Zhang, Y.H. and Yang, S. and Shan, S.G. and Chen, X.L.",
        TITLE = "ES3: Evolving Self-Supervised Learning of Robust Audio-Visual Speech
Representations",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "27059--27069",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276918"}

@inproceedings{bb282222,
        AUTHOR = "Yang, Q. and Nie, X. and Li, T. and Gao, P.F. and Guo, Y. and Zhen, C. and Yan, P.F. and Xiang, S.M.",
        TITLE = "Cooperation Does Matter: Exploring Multi-Order Bilateral Relations
for Audio-Visual Segmentation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "27124--27133",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276919"}

@inproceedings{bb282223,
        AUTHOR = "Xiong, J.W. and Zhang, P. and You, T. and Li, C.Y. and Huang, W. and Zha, Y.F.",
        TITLE = "DiffSal: Joint Audio and Video Learning for Diffusion Saliency
Prediction",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "27263--27273",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276920"}

@inproceedings{bb282224,
        AUTHOR = "Li, X. and Wang, J.L. and Xu, X.H. and Peng, X.L. and Singh, R. and Lu, Y. and Raj, B.",
        TITLE = "QDFormer: Towards Robust Audiovisual Segmentation in Complex
Environments with Quantization-based Semantic Decomposition",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "3402--3413",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276921"}

@inproceedings{bb282225,
        AUTHOR = "Singh, N. and Wu, C.W. and Orife, I. and Kalayeh, M.",
        TITLE = "Looking Similar, Sounding Different: Leveraging Counterfactual
Cross-Modal Pairs for Audiovisual Representation Learning",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26897--26908",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276922"}

@inproceedings{bb282226,
        AUTHOR = "Liu, J.X. and Liu, Y.K. and Zhang, F. and Ju, C. and Zhang, Y. and Wang, Y.F.",
        TITLE = "Audio-Visual Segmentation via Unlabeled Frame Exploitation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26318--26329",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276923"}

@inproceedings{bb282227,
        AUTHOR = "Jia, W.Q. and Liu, M. and Jiang, H. and Ananthabhotla, I. and Rehg, J.M. and Ithapu, V.K. and Gao, R.H.",
        TITLE = "The Audio-Visual Conversational Graph: From an Egocentric-Exocentric
Perspective",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26386--26395",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276924"}

@inproceedings{bb282228,
        AUTHOR = "Chen, Y.H. and Liu, Y. and Wang, H. and Liu, F. and Wang, C. and Frazer, H. and Carneiro, G.",
        TITLE = "Unraveling Instance Associations: A Closer Look for Audio-Visual
Segmentation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26487--26497",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276925"}

@inproceedings{bb282229,
        AUTHOR = "Guo, Y.X. and Sun, S.Y. and Ma, S. and Zheng, K. and Bao, X.Y. and Ma, S.J. and Zou, W. and Zheng, Y.",
        TITLE = "CrossMAE: Cross-Modality Masked Autoencoders for Region-Aware
Audio-Visual Pre-Training",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26711--26721",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276926"}

@inproceedings{bb282230,
        AUTHOR = "Mo, S.T. and Morgado, P.",
        TITLE = "Unveiling the Power of Audio-Visual Early Fusion Transformers with
Dense Interactions Through Masked Modeling",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "27176--27186",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276927"}

@inproceedings{bb282231,
        AUTHOR = "Wang, K. and Tian, Y.P. and Hatzinakos, D.",
        TITLE = "Towards Efficient Audio-Visual Learners via Empowering Pre-trained
Vision Transformers with Cross-Modal Adaptation",
        BOOKTITLE = WhatNext24,
        YEAR = "2024",
        PAGES = "1837--1846",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276928"}

@inproceedings{bb282232,
        AUTHOR = "Ryumina, E. and Markitantov, M. and Ryumin, D. and Kaya, H. and Karpov, A.",
        TITLE = "Zero-Shot Audio-Visual Compound Expression Recognition Method based
on Emotion Probability Fusion",
        BOOKTITLE = ABAW24,
        YEAR = "2024",
        PAGES = "4752--4760",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276929"}

@inproceedings{bb282233,
        AUTHOR = "Mahmud, T. and Mo, S.T. and Tian, Y.P. and Marculescu, D.",
        TITLE = "MA-AVT: Modality Alignment for Parameter-Efficient Audio-Visual
Transformers",
        BOOKTITLE = ECV24,
        YEAR = "2024",
        PAGES = "7996--8005",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276930"}

@inproceedings{bb282234,
        AUTHOR = "Yang, Z.Y. and Lin, J.G. and Chen, P.H. and Cherian, A. and Marks, T.K. and Le Roux, J. and Gan, C.",
        TITLE = "RILA: Reflective and Imaginative Language Agent for Zero-Shot
Semantic Audio-Visual Navigation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "16251--16261",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276931"}

@inproceedings{bb282235,
        AUTHOR = "Dai, Y.S. and Chen, H. and Du, J. and Wang, R. and Chen, S.H. and Wang, H.T. and Lee, C.H.",
        TITLE = "A Study of Dropout-Induced Modality Bias on Robustness to Missing
Video Frames for Audio-Visual Speech Recognition",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "27435--27445",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276932"}

@inproceedings{bb282236,
        AUTHOR = "Galland, L. and Pelachaud, C. and Pecune, F.",
        TITLE = "Seeing and Hearing What Has Not Been Said: A multimodal client
behavior classifier in Motivational Interviewing with interpretable
fusion",
        BOOKTITLE = FG24,
        YEAR = "2024",
        PAGES = "1--9",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276933"}

@inproceedings{bb282237,
        AUTHOR = "Praveen, R.G. and Alam, J.",
        TITLE = "Audio-Visual Person Verification Based on Recursive Fusion of Joint
Cross-Attention",
        BOOKTITLE = FG24,
        YEAR = "2024",
        PAGES = "1--5",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276934"}

@inproceedings{bb282238,
        AUTHOR = "Praveen, R.G. and Alam, J.",
        TITLE = "Dynamic Cross Attention for Audio-Visual Person Verification",
        BOOKTITLE = FG24,
        YEAR = "2024",
        PAGES = "1--5",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276935"}

@inproceedings{bb282239,
        AUTHOR = "He, Y.H. and Shin, S. and Cherian, A. and Trigoni, N. and Markham, A.",
        TITLE = "Sound3DVDet: 3D Sound Source Detection using Multiview Microphone
Array and RGB Images",
        BOOKTITLE = WACV24,
        YEAR = "2024",
        PAGES = "5484--5495",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276936"}

@inproceedings{bb282240,
        AUTHOR = "Ghaleb, E. and Burenko, I. and Rasenberg, M. and Pouw, W. and Uhrig, P. and Holler, J. and Toni, I. and Ozyurek, A. and Fernandez, R.",
        TITLE = "Co-Speech Gesture Detection through Multi-Phase Sequence Labeling",
        BOOKTITLE = WACV24,
        YEAR = "2024",
        PAGES = "3995--4003",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276937"}

@inproceedings{bb282241,
        AUTHOR = "Liu, J.X. and Wang, Y. and Ju, C. and Ma, C.F. and Zhang, Y. and Xie, W.",
        TITLE = "Annotation-free Audio-Visual Segmentation",
        BOOKTITLE = WACV24,
        YEAR = "2024",
        PAGES = "5592--5602",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276938"}

@inproceedings{bb282242,
        AUTHOR = "Xu, Y.T. and Hu, C.H. and Lee, G.H.",
        TITLE = "Rethink Cross-Modal Fusion in Weakly-Supervised Audio-Visual Video
Parsing",
        BOOKTITLE = WACV24,
        YEAR = "2024",
        PAGES = "5603--5612",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276939"}

@inproceedings{bb282243,
        AUTHOR = "Rachavarapu, K.K. and Ramakrishnan, K. and Rajagopalan, A.N.",
        TITLE = "Weakly-Supervised Audio-Visual Video Parsing with Prototype-Based
Pseudo-Labeling",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "18952--18962",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276940"}

@inproceedings{bb282244,
        AUTHOR = "Rachavarapu, K.K. and Rajagopalan, A.N.",
        TITLE = "Boosting Positive Segments for Weakly-Supervised Audio-Visual Video
Parsing",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "10158--10168",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276941"}

@inproceedings{bb282245,
        AUTHOR = "Chen, J. and Wang, W.G. and Liu, S. and Li, H.S. and Yang, Y.",
        TITLE = "Omnidirectional Information Gathering for Knowledge Transfer-based
Audio-Visual Navigation",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "10959--10969",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276942"}

@inproceedings{bb282246,
        AUTHOR = "Cheng, X.Z. and Jin, T. and Huang, R.J. and Li, L.J. and Lin, W. and Wang, Z. and Wang, Y. and Liu, H.D. and Yin, A.X. and Zhao, Z.",
        TITLE = "MixSpeech: Cross-Modality Self-Learning with Audio-Visual Stream
Mixup for Visual Speech Translation and Recognition",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "15689--15699",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276943"}

@inproceedings{bb282247,
        AUTHOR = "Georgescu, M.I. and Fonseca, E. and Ionescu, R.T. and Lucic, M. and Schmid, C. and Arnab, A.",
        TITLE = "Audiovisual Masked Autoencoders",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "16098--16108",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276944"}

@inproceedings{bb282248,
        AUTHOR = "Chen, M.F. and Su, K. and Shlizerman, E.",
        TITLE = "Be Everywhere - Hear Everything (BEE): Audio Scene Reconstruction by
Sparse Audio-Visual Samples",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "7819--7828",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276945"}

@inproceedings{bb282249,
        AUTHOR = "Xie, H.X. and Lee, M.X. and Chen, T.J. and Chen, H.J. and Liu, H.I. and Shuai, H.H. and Cheng, W.H.",
        TITLE = "Most Important Person-guided Dual-branch Cross-Patch Attention for
Group Affect Recognition",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "20541--20551",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276946"}

@inproceedings{bb282250,
        AUTHOR = "Djilali, Y.A.D. and Narayan, S. and Boussaid, H. and Almazrouei, E. and Debbah, M.",
        TITLE = "Lip2Vec: Efficient and Robust Visual Speech Recognition via
Latent-to-Latent Visual to Audio Representation Mapping",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "13744--13755",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276947"}

@inproceedings{bb282251,
        AUTHOR = "Chen, G.Y. and Zhang, D. and Liu, T. and Du, X.Y.",
        TITLE = "Local-Global Contrast for Learning Voice-Face Representations",
        BOOKTITLE = ICIP23,
        YEAR = "2023",
        PAGES = "51--55",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276948"}

@inproceedings{bb282252,
        AUTHOR = "Hong, J. and Kim, M. and Choi, J. and Ro, Y.M.",
        TITLE = "Watch or Listen: Robust Audio-Visual Speech Recognition with Visual
Corruption Modeling and Reliability Scoring",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "18783--18794",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276949"}

@inproceedings{bb282253,
        AUTHOR = "Porgali, B. and Albiero, V. and Ryda, J. and Ferrer, C.C. and Hazirbas, C.",
        TITLE = "The Casual Conversations v2 Dataset: A diverse, large benchmark for
measuring fairness and robustness in audio/vision/speech models",
        BOOKTITLE = FaDE-TCV23,
        YEAR = "2023",
        PAGES = "10--17",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276950"}

@inproceedings{bb282254,
        AUTHOR = "Xiong, J.W. and Wang, G. and Zhang, P. and Huang, W. and Zha, Y.F. and Zhai, G.T.",
        TITLE = "CASP-Net: Rethinking Video Saliency Prediction from an Audio-Visual
Consistency Perceptual Perspective",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "6441--6450",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276951"}

@inproceedings{bb282255,
        AUTHOR = "Liao, J.H. and Duan, H.H. and Feng, K.H. and Zhao, W.B. and Yang, Y.B. and Chen, L.Y.",
        TITLE = "A Light Weight Model for Active Speaker Detection",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "22932--22941",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276952"}

@inproceedings{bb282256,
        AUTHOR = "Seo, P.H. and Nagrani, A. and Schmid, C.",
        TITLE = "AVFormer: Injecting Vision into Frozen Speech Models for Zero-Shot
AV-ASR",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "22922--22931",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276953"}

@inproceedings{bb282257,
        AUTHOR = "Feng, D. and Yang, S. and Shan, S.G. and Chen, X.L.",
        TITLE = "Audio-Driven Deformation Flow for Effective Lip Reading",
        BOOKTITLE = "ICPR22",
        YEAR = "2022",
        PAGES = "274--280",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276954"}

@inproceedings{bb282258,
        AUTHOR = "Varshney, M. and Yadav, R. and Namboodiri, V.P. and Hegde, R.M.",
        TITLE = "Learning Speaker-specific Lip-to-Speech Generation",
        BOOKTITLE = "ICPR22",
        YEAR = "2022",
        PAGES = "491--498",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276955"}

@inproceedings{bb282259,
        AUTHOR = "Shi, C. and Yang, S.",
        TITLE = "Spatial and Visual Perspective-Taking via View Rotation and Relation
Reasoning for Embodied Reference Understanding",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVI:201--218",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276956"}

@inproceedings{bb282260,
        AUTHOR = "Hayes, T. and Zhang, S.Y. and Yin, X. and Pang, G. and Sheng, S. and Yang, H. and Ge, S.W. and Hu, Q.Y. and Parikh, D.",
        TITLE = "MUGEN: A Playground for Video-Audio-Text Multimodal Understanding and
GENeration",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "VIII:431--449",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276957"}

@inproceedings{bb282261,
        AUTHOR = "van Horn, G. and Qian, R. and Wilber, K. and Adam, H. and Aodha, O.M. and Belongie, S.",
        TITLE = "Exploring Fine-Grained Audiovisual Categorization with the SSW60
Dataset",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "VIII:271--289",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276958"}

@inproceedings{bb282262,
        AUTHOR = "Yu, S. and Wu, P. and Liang, P.P. and Salakhutdinov, R. and Morency, L.P.",
        TITLE = "PACS: A Dataset for Physical Audiovisual CommonSense Reasoning",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVII:292--309",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276959"}

@inproceedings{bb282263,
        AUTHOR = "Cheng, H.Y. and Liu, Z.Y. and Zhou, H. and Qian, C. and Wu, W. and Wang, L.M.",
        TITLE = "Joint-Modal Label Denoising for Weakly-Supervised Audio-Visual Video
Parsing",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXIV:431--448",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276960"}

@inproceedings{bb282264,
        AUTHOR = "Zhang, Z.Q. and Zhang, J. and Zhang, J.S. and Wu, M.H. and Fang, X. and Dai, L.R.",
        TITLE = "Learning Contextually Fused Audio-Visual Representations for
Audio-Visual Speech Recognition",
        BOOKTITLE = ICIP22,
        YEAR = "2022",
        PAGES = "1346--1350",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276961"}

@inproceedings{bb282265,
        AUTHOR = "Montesinos, J.F. and Kadandale, V.S. and Haro, G.",
        TITLE = "VoViT: Low Latency Graph-Based Audio-Visual Voice Separation
Transformer",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVII:310--326",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276962"}

@inproceedings{bb282266,
        AUTHOR = "Tzinis, E. and Wisdom, S. and Remez, T. and Hershey, J.R.",
        TITLE = "AudioScopeV2: Audio-Visual Attention Architectures for Calibrated
Open-Domain On-Screen Sound Separation",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVII:368--385",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276963"}

@inproceedings{bb282267,
        AUTHOR = "Zhou, J.X. and Wang, J.Y. and Zhang, J.Y. and Sun, W.X. and Zhang, J. and Birchfield, S. and Guo, D. and Kong, L.P. and Wang, M. and Zhong, Y.R.",
        TITLE = "Audio-Visual Segmentation",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVII:386--403",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276964"}

@inproceedings{bb282268,
        AUTHOR = "Alcazar, J.L. and Cordes, M. and Zhao, C. and Ghanem, B.",
        TITLE = "End-to-End Active Speaker Detection",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVII:126--143",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276965"}

@inproceedings{bb282269,
        AUTHOR = "Chen, C.G. and Gao, R.H. and Calamia, P. and Grauman, K.",
        TITLE = "Visual Acoustic Matching",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "18836--18846",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276966"}

@inproceedings{bb282270,
        AUTHOR = "Lee, S. and Kim, H.I. and Ro, Y.M.",
        TITLE = "Weakly Paired Associative Learning for Sound and Image
Representations via Bimodal Associative Memory",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "10524--10533",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276967"}

@inproceedings{bb282271,
        AUTHOR = "Vasudevan, A.B. and Dai, D.X. and Van Gool, L.J.",
        TITLE = "Sound and Visual Representation Learning with Multiple Pretraining
Tasks",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "14596--14606",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276968"}

@inproceedings{bb282272,
        AUTHOR = "Ng, E. and Joo, H. and Hu, L.W. and Li, H. and Darrell, T.J. and Kanazawa, A. and Ginosar, S.",
        TITLE = "Learning to Listen: Modeling Non-Deterministic Dyadic Facial Motion",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "20363--20373",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276969"}

@inproceedings{bb282273,
        AUTHOR = "Kurzendorfer, D. and Mercea, O.B. and Koepke, A.S. and Akata, Z.",
        TITLE = "Audio-Visual Generalized Zero-Shot Learning using Pre-Trained Large
Multi-Modal Models",
        BOOKTITLE = L3D-IVU24,
        YEAR = "2024",
        PAGES = "2627--2638",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276970"}

@inproceedings{bb282274,
        AUTHOR = "Mercea, O.B. and Hummel, T. and Koepke, A.S. and Akata, Z.",
        TITLE = "Temporal and Cross-modal Attention for Audio-Visual Zero-Shot Learning",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XX:488--505",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276971"}

@inproceedings{bb282275,
        AUTHOR = "Mercea, O.B. and Riesch, L. and Koepke, A.S. and Akata, Z.",
        TITLE = "Audiovisual Generalised Zero-shot Learning with Cross-modal Attention
and Language",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "10543--10553",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276972"}

@inproceedings{bb282276,
        AUTHOR = "Karas, V. and Tellamekala, M.K. and Mallol Ragolta, A. and Valstar, M. and Schuller, B.W.",
        TITLE = "Time-Continuous Audiovisual Fusion with Recurrence vs Attention for
In-The-Wild Affect Recognition",
        BOOKTITLE = ABAW22,
        YEAR = "2022",
        PAGES = "2381--2390",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276973"}

@inproceedings{bb282277,
        AUTHOR = "Yang, K. and Markovic, D. and Krenn, S. and Agrawal, V. and Richard, A.",
        TITLE = "Audio-Visual Speech Codecs: Rethinking Audio-Visual Speech
Enhancement by Re-Synthesis",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "8217--8227",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276974"}

@inproceedings{bb282278,
        AUTHOR = "Kim, M. and Hong, J. and Park, S.J. and Ro, Y.M.",
        TITLE = "Multi-modality Associative Bridging through Memory:
Speech Sound Recollected from Face Video",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "296--306",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276975"}

@inproceedings{bb282279,
        AUTHOR = "Li, J. and Kang, D. and Pei, W.J. and Zhe, X.F. and Zhang, Y. and He, Z.Y. and Bao, L.C.",
        TITLE = "Audio2Gestures: Generating Diverse Gestures from Speech Audio with
Conditional Variational Autoencoders",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "11273--11282",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276976"}

@inproceedings{bb282280,
        AUTHOR = "Ye, M. and You, Q. Z. and Ma, F. L.",
        TITLE = "{QUALIFIER}: Question-Guided Self-Attentive Multimodal Fusion Network
for Audio Visual Scene-Aware Dialog",
        BOOKTITLE = WACV22,
        YEAR = "2022",
        PAGES = "2503--2511",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276977"}

@inproceedings{bb282281,
        AUTHOR = "Yao, S. and Min, X. K. and Zhai, G. T.",
        TITLE = "Deep Audio-Visual Fusion Neural Network for Saliency Estimation",
        BOOKTITLE = ICIP21,
        YEAR = "2021",
        PAGES = "1604--1608",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276978"}

@inproceedings{bb282282,
        AUTHOR = "Krishnamurthy, S.",
        TITLE = "Learning Self-supervised Audio-Visual Representations for Sound
Recommendations",
        BOOKTITLE = ISVC21,
        YEAR = "2021",
        PAGES = "II:124--138",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276979"}

@inproceedings{bb282283,
        AUTHOR = "Shi, W. J. and Pattichis, M. S. and Celedon Pattichis, S. and LopezLeiva, C.",
        TITLE = "Talking Detection in Collaborative Learning Environments",
        BOOKTITLE = CAIP21,
        YEAR = "2021",
        PAGES = "II:242--251",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276980"}

@inproceedings{bb282284,
        AUTHOR = "Wang, G. and Chen, C. L. Z. and Fan, D. P. and Hao, A. and Qin, H.",
        TITLE = "From Semantic Categories to Fixations: A Novel Weakly-supervised
Visual-auditory Saliency Detection Approach",
        BOOKTITLE = CVPR21,
        YEAR = "2021",
        PAGES = "15114--15123",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276981"}

@inproceedings{bb282285,
        AUTHOR = "Wen, P. S. and Xu, Q. Q. and Jiang, Y. B. Y. and Yang, Z. Y. and He, Y. and Huang, Q. M.",
        TITLE = "Seeking the Shape of Sound:
An Adaptive Framework for Learning Voice-Face Association",
        BOOKTITLE = CVPR21,
        YEAR = "2021",
        PAGES = "16342--16351",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276982"}

@inproceedings{bb282286,
        AUTHOR = "Monfort, M. and Jin, S. and Liu, A. and Harwath, D. and Feris, R. S. and Glass, J. and Oliva, A.",
        TITLE = "Spoken Moments: Learning Joint Audio-Visual Representations from
Video Descriptions",
        BOOKTITLE = CVPR21,
        YEAR = "2021",
        PAGES = "14866--14876",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276983"}

@inproceedings{bb282287,
        AUTHOR = "Tian, Y. P. and Xu, C. L.",
        TITLE = "Can audio-visual integration strengthen robustness under multimodal
attacks?",
        BOOKTITLE = CVPR21,
        YEAR = "2021",
        PAGES = "5597--5607",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276984"}

@inproceedings{bb282288,
        AUTHOR = "Morgado, P. and Vasconcelos, N. M. and Misra, I.",
        TITLE = "Audio-Visual Instance Discrimination with Cross-Modal Agreement",
        BOOKTITLE = CVPR21,
        YEAR = "2021",
        PAGES = "12470--12481",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276985"}

@inproceedings{bb282289,
        AUTHOR = "Morgado, P. and Misra, I. and Vasconcelos, N. M.",
        TITLE = "Robust Audio-Visual Instance Discrimination",
        BOOKTITLE = CVPR21,
        YEAR = "2021",
        PAGES = "12929--12940",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276986"}

@inproceedings{bb282290,
        AUTHOR = "Chen, Y. B. and Xian, Y. Q. and Koepke, A. S. and Shan, Y. and Akata, Z.",
        TITLE = "Distilling Audio-Visual Knowledge by Compositional Contrastive
Learning",
        BOOKTITLE = CVPR21,
        YEAR = "2021",
        PAGES = "7012--7021",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276987"}

@inproceedings{bb282291,
        AUTHOR = "Zhang, Z. M. and Li, L. C. and Ding, Y. and Fan, C. J.",
        TITLE = "Flow-guided One-shot Talking Face Generation with a High-resolution
Audio-visual Dataset",
        BOOKTITLE = CVPR21,
        YEAR = "2021",
        PAGES = "3660--3669",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276988"}

@inproceedings{bb282292,
        AUTHOR = "Gao, R. H. and Grauman, K.",
        TITLE = "{VisualVoice}: Audio-Visual Speech Separation with Cross-Modal
Consistency",
        BOOKTITLE = CVPR21,
        YEAR = "2021",
        PAGES = "15490--15500",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276989"}

@inproceedings{bb282293,
        AUTHOR = "Mazumder, P. and Sing, P. and Parida, K. K. and Namboodiri, V. P.",
        TITLE = "{AVGZSLNet}: Audio-Visual Generalized Zero-Shot Learning by
Reconstructing Label Features from Multi-Modal Embeddings",
        BOOKTITLE = WACV21,
        YEAR = "2021",
        PAGES = "3089--3098",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276990"}

@inproceedings{bb282294,
        AUTHOR = "Ishikawa, R. and Hachiuma, R. and Kurobe, A. and Saito, H.",
        TITLE = "Single-modal Incremental Terrain Clustering from Self-Supervised
Audio-Visual Feature Learning",
        BOOKTITLE = ICPR21,
        YEAR = "2021",
        PAGES = "9399--9406",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276991"}

@inproceedings{bb282295,
        AUTHOR = "Madrigal, F. and Lerasle, F. and Pibre, L. and Ferrane, I.",
        TITLE = "Audio-Video detection of the active speaker in meetings",
        BOOKTITLE = ICPR21,
        YEAR = "2021",
        PAGES = "2536--2543",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276992"}

@inproceedings{bb282296,
        AUTHOR = "Tellamekala, M. K. and Valstar, M. and Pound, M. and Giesbrecht, T.",
        TITLE = "Audio-Visual Predictive Coding for Self-Supervised Visual
Representation Learning",
        BOOKTITLE = ICPR21,
        YEAR = "2021",
        PAGES = "9912--9919",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276993"}

@inproceedings{bb282297,
        AUTHOR = "Liu, H. and Wang, Y. and Yang, B.",
        TITLE = "Mutual Alignment between Audiovisual Features for End-to-End
Audiovisual Speech Recognition",
        BOOKTITLE = ICPR21,
        YEAR = "2021",
        PAGES = "5348--5353",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276994"}

@inproceedings{bb282298,
        AUTHOR = "Liu, H. and Xu, W. L. and Yang, B.",
        TITLE = "Audio-Visual Speech Recognition Using A Two-Step Feature Fusion
Strategy",
        BOOKTITLE = ICPR21,
        YEAR = "2021",
        PAGES = "1896--1903",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276995"}

@inproceedings{bb282299,
        AUTHOR = "Liu, H. and Li, W. H. and Yang, B.",
        TITLE = "Robust Audio-Visual Speech Recognition Based on Hybrid Fusion",
        BOOKTITLE = ICPR21,
        YEAR = "2021",
        PAGES = "7580--7586",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT276996"}
Last update: Oct 6, 2025 at 14:07:43