@article{bb284000,
AUTHOR = "Zhu, D.D. and Zhang, K.W. and Zhang, N. and Zhou, Q.Q. and Min, X.K. and Zhai, G.T. and Yang, X.K.",
TITLE = "Unified Audio-Visual Saliency Model for Omnidirectional Videos With
Spatial Audio",
JOURNAL = MultMed,
VOLUME = "26",
YEAR = "2024",
PAGES = "764--775",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278687"}
@article{bb284001,
AUTHOR = "Zhu, Y.X. and Duan, H.Y. and Zhang, K.W. and Zhu, Y.C. and Zhu, X. and Teng, L. and Min, X.K. and Zhai, G.T.",
TITLE = "How Does Audio Influence Visual Attention in Omnidirectional Videos?
Database and Model",
JOURNAL = IP,
VOLUME = "34",
YEAR = "2025",
PAGES = "3447--3462",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278688"}
@inproceedings{bb284002,
AUTHOR = "Li, J. and Zhai, G.T. and Zhu, Y.C. and Zhou, J. and Zhang, X.P.",
TITLE = "How Sound Affects Visual Attention in Omnidirectional Videos",
BOOKTITLE = ICIP22,
YEAR = "2022",
PAGES = "3066--3070",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278689"}
@article{bb284003,
AUTHOR = "Qian, X.Y. and Xue, W. and Zhang, Q. and Tao, R.J. and Li, H.Z.",
TITLE = "Deep Cross-Modal Retrieval Between Spatial Image and Acoustic Speech",
JOURNAL = MultMed,
VOLUME = "26",
YEAR = "2024",
PAGES = "4480--4489",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278690"}
% IVC is a bare string macro; its definition is not part of this chunk.
@article{bb284004,
  author    = {Xie, J.W. and Liu, Z. and Li, G.Y. and Song, Y.J.},
  title     = {Audio-visual saliency prediction with multisensory perception and integration},
  journal   = IVC,
  volume    = {143},
  year      = {2024},
  pages     = {104955},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT278691}
}
@article{bb284005,
AUTHOR = "Sun, X. and Wang, X. and Liu, Q. and Zhou, X.",
TITLE = "Multi-Level Signal Fusion for Enhanced Weakly-Supervised Audio-Visual
Video Parsing",
JOURNAL = SPLetters,
VOLUME = "31",
YEAR = "2024",
PAGES = "1149--1153",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278692"}
@article{bb284006,
AUTHOR = "Han, H.C. and Zheng, Q.H. and Luo, M.N. and Miao, K.Y. and Tian, F. and Chen, Y.",
TITLE = "Noise-Tolerant Learning for Audio-Visual Action Recognition",
JOURNAL = MultMed,
VOLUME = "26",
YEAR = "2024",
PAGES = "7761--7774",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278693"}
% IVC is a bare string macro; its definition is not part of this chunk.
@article{bb284007,
  author    = {Xiao, Y.W. and Liu, X.M. and Zhu, A. and Huang, J.},
  title     = {Relational-branchformer: Novel framework for audio-visual speech recognition},
  journal   = IVC,
  volume    = {149},
  year      = {2024},
  pages     = {105182},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT278694}
}
@article{bb284008,
AUTHOR = "Li, W.R. and Wang, P.H. and Xiong, R.Q. and Fan, X.P.",
TITLE = "Spiking {Tucker} Fusion Transformer for Audio-Visual Zero-Shot Learning",
JOURNAL = IP,
VOLUME = "33",
YEAR = "2024",
PAGES = "4840--4852",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278695"}
@article{bb284009,
AUTHOR = "Li, W.R. and Wang, P.H. and Wang, X.T. and Zuo, W.M. and Fan, X.P. and Tian, Y.H.",
TITLE = "Multi-Timescale Motion-Decoupled Spiking Transformer for Audio-Visual
Zero-Shot Learning",
JOURNAL = CirSysVideo,
VOLUME = "35",
YEAR = "2025",
NUMBER = "11",
MONTH = nov,
PAGES = "10772--10786",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278696"}
@article{bb284010,
AUTHOR = "Li, K. and Xie, F. and Chen, H. and Yuan, K. and Hu, X.L.",
TITLE = "An Audio-Visual Speech Separation Model Inspired by
Cortico-Thalamo-Cortical Circuits",
JOURNAL = PAMI,
VOLUME = "46",
YEAR = "2024",
NUMBER = "10",
MONTH = oct,
PAGES = "6637--6651",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278697"}
@article{bb284011,
AUTHOR = "Zhou, J.X. and Guo, D. and Zhong, Y.R. and Wang, M.",
TITLE = "Advancing Weakly-Supervised Audio-Visual Video Parsing via Segment-Wise
Pseudo Labeling",
JOURNAL = IJCV,
VOLUME = "132",
YEAR = "2024",
NUMBER = "11",
MONTH = nov,
PAGES = "5308--5329",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278698"}
@inproceedings{bb284012,
AUTHOR = "Zhou, J.X. and Guo, D. and Mao, Y.X. and Zhong, Y.R. and Chang, X.J. and Wang, M.",
TITLE = "Label-anticipated Event Disentanglement for Audio-visual Video Parsing",
BOOKTITLE = ECCV24,
YEAR = "2024",
PAGES = "X:35--51",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278699"}
@article{bb284013,
AUTHOR = "Liu, J. and Chen, S. and He, X.J. and Guo, L.T. and Zhu, X.X. and Wang, W.N. and Tang, J.H.",
TITLE = "{VALOR}: Vision-Audio-Language Omni-Perception Pretraining Model and
Dataset",
JOURNAL = PAMI,
VOLUME = "47",
YEAR = "2025",
NUMBER = "2",
MONTH = feb,
PAGES = "708--724",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278700"}
@article{bb284014,
AUTHOR = "Steinmetz, N. and Balal, N.",
TITLE = "Feasibility Study of Real-Time Speech Detection and Characterization
Using Millimeter-Wave Micro-{Doppler} Radar",
JOURNAL = RS,
VOLUME = "17",
YEAR = "2025",
NUMBER = "1",
PAGES = "91",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278701"}
@article{bb284015,
AUTHOR = "Chen, T.X. and Tan, Z.T. and Gong, T. and Chu, Q. and Wu, Y. and Liu, B. and Yu, N.H. and Lu, L. and Ye, J.P.",
TITLE = "Bootstrapping Audio-Visual Video Segmentation by Strengthening Audio
Cues",
JOURNAL = CirSysVideo,
VOLUME = "35",
YEAR = "2025",
NUMBER = "3",
MONTH = mar,
PAGES = "2398--2409",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278702"}
@article{bb284016,
AUTHOR = "Zhou, J.X. and Shen, X.Y. and Wang, J.Y. and Zhang, J.Y. and Sun, W.X. and Zhang, J. and Birchfield, S. and Guo, D. and Kong, L.P. and Wang, M. and Zhong, Y.R.",
TITLE = "Audio-Visual Segmentation with Semantics",
JOURNAL = IJCV,
VOLUME = "133",
YEAR = "2025",
NUMBER = "4",
MONTH = apr,
PAGES = "1644--1664",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278703"}
@article{bb284017,
AUTHOR = "Gao, J.Y. and Chen, M.Y. and Xu, C.S.",
TITLE = "Learning Probabilistic Presence-Absence Evidence for
Weakly-Supervised Audio-Visual Event Perception",
JOURNAL = PAMI,
VOLUME = "47",
YEAR = "2025",
NUMBER = "6",
MONTH = jun,
PAGES = "4787--4802",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278704"}
@inproceedings{bb284018,
AUTHOR = "Gao, J.Y. and Chen, M.Y. and Xu, C.S.",
TITLE = "Collecting Cross-Modal Presence-Absence Evidence for
Weakly-Supervised Audio-Visual Event Perception",
BOOKTITLE = CVPR23,
YEAR = "2023",
PAGES = "18827--18836",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278705"}
@article{bb284019,
AUTHOR = "Lv, Y. and Liu, Z. and Chang, X.J.",
TITLE = "Consistency-Queried Transformer for Audio-Visual Segmentation",
JOURNAL = IP,
VOLUME = "34",
YEAR = "2025",
PAGES = "2616--2627",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278706"}
@article{bb284020,
AUTHOR = "Zhu, Y. and Li, K. and Yang, Z.X.",
TITLE = "Exploiting {EfficientSAM} and Temporal Coherence for Audio-Visual
Segmentation",
JOURNAL = MultMed,
VOLUME = "27",
YEAR = "2025",
PAGES = "2999--3008",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278707"}
@article{bb284021,
AUTHOR = "Liao, J.H. and Duan, H.H. and Feng, K.H. and Zhao, W.B. and Yang, Y.B. and Chen, L.Y. and Chen, Y.R.",
TITLE = "{LR-ASD}: Lightweight and Robust Network for Active Speaker Detection",
JOURNAL = IJCV,
VOLUME = "133",
YEAR = "2025",
NUMBER = "7",
MONTH = jul,
PAGES = "4749--4769",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278708"}
@article{bb284022,
AUTHOR = "Mao, Y.X. and Zhang, J. and Xiang, M. and Lv, Y.Q. and Li, D. and Zhong, Y.R. and Dai, Y.C.",
TITLE = "Contrastive Conditional Latent Diffusion for Audio-Visual
Segmentation",
JOURNAL = IP,
VOLUME = "34",
YEAR = "2025",
PAGES = "4108--4119",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278709"}
@article{bb284023,
AUTHOR = "Li, K.W. and Chen, H. and Du, J. and Zhou, H.S. and Siniscalchi, S.M. and Niu, S.T. and Xiong, S.F.",
TITLE = "Lightweight Audio-Visual Wake Word Spotting With Diverse Acoustic
Knowledge Distillation",
JOURNAL = CirSysVideo,
VOLUME = "35",
YEAR = "2025",
NUMBER = "7",
MONTH = jul,
PAGES = "7308--7320",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278710"}
% NOTE(review): PAGES below holds "xx-yy", which looks like a placeholder rather
% than a real page range; replace it with the article number or pages once confirmed.
@article{bb284024,
AUTHOR = "Vilaca, L. and Yu, Y. and Viana, P.",
TITLE = "A Survey of Recent Advances and Challenges in Deep Audio-Visual
Correlation Learning",
JOURNAL = Surveys,
VOLUME = "57",
YEAR = "2025",
NUMBER = "12",
MONTH = jul,
PAGES = "xx-yy",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278711"}
@article{bb284025,
AUTHOR = "Zhu, C.Z. and Shao, J.L. and Lin, J.X. and Wang, Y.J. and Wang, J. and Tang, J.H. and Li, K.",
TITLE = "{fMRI2GES}: Co-Speech Gesture Reconstruction From {fMRI} Signal With Dual
Brain Decoding Alignment",
JOURNAL = CirSysVideo,
VOLUME = "35",
YEAR = "2025",
NUMBER = "9",
MONTH = sep,
PAGES = "9017--9029",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278712"}
@article{bb284026,
AUTHOR = "Zhou, D.L. and Zhang, Y.K. and Wu, J.H. and Zhang, X.Y. and Xie, L. and Yin, E.",
TITLE = "{AVE} Speech: A Comprehensive Multimodal Dataset for Speech Recognition
Integrating Audio, Visual, and Electromyographic Signals",
JOURNAL = HMS,
VOLUME = "55",
YEAR = "2025",
NUMBER = "4",
MONTH = aug,
PAGES = "559--568",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278713"}
@article{bb284027,
AUTHOR = "Xuan, H.Y. and Liu, T.X. and Dong, W.X. and Li, Z.H. and Chen, S.",
TITLE = "{X-STA}: Cross-Modal Spatial-Temporal Alignment Network for Unified
Audio-Visual Segmentation",
JOURNAL = SPLetters,
VOLUME = "32",
YEAR = "2025",
PAGES = "2883--2887",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278714"}
@article{bb284028,
AUTHOR = "Gong, S. and Zhuge, Y.Z. and Zhang, L. and Wang, Y.F. and Zhang, P.P. and Wang, L.J. and Lu, H.C.",
TITLE = "{AVS-Mamba}: Exploring Temporal and Multi-Modal {Mamba} for Audio-Visual
Segmentation",
JOURNAL = MultMed,
VOLUME = "27",
YEAR = "2025",
PAGES = "5413--5425",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278715"}
% IJIST is a bare string macro; its definition is not part of this chunk.
@article{bb284029,
  author    = {Attia, D. and Benazza Benyahia, A.},
  title     = {Recognizing of Vocal Fold Disorders From High Speed Video: Use of Spatio-Temporal Deep Neural Networks},
  journal   = IJIST,
  volume    = {35},
  year      = {2025},
  number    = {5},
  pages     = {e70170},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT278716}
}
@article{bb284030,
AUTHOR = "Gong, S. and Zhuge, Y.Z. and Zhang, L. and Zhang, P.P. and Lu, H.C.",
TITLE = "Complementary and Contrastive Learning for Audio-Visual Segmentation",
JOURNAL = MultMed,
VOLUME = "27",
YEAR = "2025",
PAGES = "7407--7418",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278717"}
@inproceedings{bb284031,
AUTHOR = "Huang, S. and Wu, J.X. and Wei, X.Y. and Cai, Y. and Jiang, D.M. and Wang, Y.W.",
TITLE = "Sound Bridge: Associating Egocentric and Exocentric Videos via Audio
Cues",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "28942--28951",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278718"}
@inproceedings{bb284032,
AUTHOR = "Du, H.H. and Li, G.Y. and Zhou, C. and Zhang, C.J. and Zhao, A. and Hu, D.",
TITLE = "{Crab}: A Unified Audio-Visual Scene Understanding Model with Explicit
Cooperation",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "18804--18814",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278719"}
@inproceedings{bb284033,
AUTHOR = "Shaar, E. and Shaulov, A. and Chechik, G. and Wolf, L.B.",
TITLE = "Adapting to the Unknown: Training-Free Audio-Visual Event Perception
with Dynamic Thresholds",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "3142--3151",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278720"}
@inproceedings{bb284034,
AUTHOR = "Huang, S.F. and Ling, R. and Hui, T.R. and Li, H.Y. and Zhou, X. and Zhang, S.F. and Liu, S. and Hong, R.C. and Wang, M.",
TITLE = "Revisiting Audio-Visual Segmentation with Vision-Centric Transformer",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "8352--8361",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278721"}
@inproceedings{bb284035,
AUTHOR = "Wu, X.C. and Sun, H. and Wang, Y.F. and Nie, J.Y. and Zhang, J. and Wang, Y.B. and Xue, J.X. and He, L.",
TITLE = "{AVF-MAE++}: Scaling Affective Video Facial Masked Autoencoders via
Efficient Audio-Visual Self-Supervised Learning",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "9142--9153",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278722"}
@inproceedings{bb284036,
AUTHOR = "Lai, Y.H. and Ebbers, J. and Wang, Y.C.A.F. and Germain, F. and Jones, M.J. and Chatterjee, M.",
TITLE = "{UWAV}: Uncertainty-Weighted Weakly-Supervised Audio-Visual Video
Parsing",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "13561--13570",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278723"}
@inproceedings{bb284037,
AUTHOR = "Guo, R. and Ying, X.H. and Chen, Y. and Niu, D. and Li, G.Y. and Qu, L. and Qi, Y.Y. and Zhou, J.X. and Xing, B. and Yue, W.Z. and Shi, J. and Wang, Q. and Zhang, P.L. and Liang, B.",
TITLE = "Audio-Visual Instance Segmentation",
BOOKTITLE = CVPR25,
YEAR = "2025",
PAGES = "13550--13560",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278724"}
@inproceedings{bb284038,
AUTHOR = "Zhang, Y.H. and Yang, S. and Shan, S.G. and Chen, X.L.",
TITLE = "{ES3}: Evolving Self-Supervised Learning of Robust Audio-Visual Speech
Representations",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "27059--27069",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278725"}
@inproceedings{bb284039,
AUTHOR = "Yang, Q. and Nie, X. and Li, T. and Gao, P.F. and Guo, Y. and Zhen, C. and Yan, P.F. and Xiang, S.M.",
TITLE = "Cooperation Does Matter: Exploring Multi-Order Bilateral Relations
for Audio-Visual Segmentation",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "27124--27133",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278726"}
@inproceedings{bb284040,
AUTHOR = "Xiong, J.W. and Zhang, P. and You, T. and Li, C.Y. and Huang, W. and Zha, Y.F.",
TITLE = "{DiffSal}: Joint Audio and Video Learning for Diffusion Saliency
Prediction",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "27263--27273",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278727"}
@inproceedings{bb284041,
AUTHOR = "Li, X. and Wang, J.L. and Xu, X.H. and Peng, X.L. and Singh, R. and Lu, Y. and Raj, B.",
TITLE = "{QDFormer}: Towards Robust Audiovisual Segmentation in Complex
Environments with Quantization-based Semantic Decomposition",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "3402--3413",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278728"}
@inproceedings{bb284042,
AUTHOR = "Singh, N. and Wu, C.W. and Orife, I. and Kalayeh, M.",
TITLE = "Looking Similar, Sounding Different: Leveraging Counterfactual
Cross-Modal Pairs for Audiovisual Representation Learning",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "26897--26908",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278729"}
@inproceedings{bb284043,
AUTHOR = "Liu, J.X. and Liu, Y.K. and Zhang, F. and Ju, C. and Zhang, Y. and Wang, Y.F.",
TITLE = "Audio-Visual Segmentation via Unlabeled Frame Exploitation",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "26318--26329",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278730"}
@inproceedings{bb284044,
AUTHOR = "Jia, W.Q. and Liu, M. and Jiang, H. and Ananthabhotla, I. and Rehg, J.M. and Ithapu, V.K. and Gao, R.H.",
TITLE = "The Audio-Visual Conversational Graph: From an Egocentric-Exocentric
Perspective",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "26386--26395",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278731"}
@inproceedings{bb284045,
AUTHOR = "Chen, Y.H. and Liu, Y. and Wang, H. and Liu, F. and Wang, C. and Frazer, H. and Carneiro, G.",
TITLE = "Unraveling Instance Associations: A Closer Look for Audio-Visual
Segmentation",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "26487--26497",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278732"}
@inproceedings{bb284046,
AUTHOR = "Guo, Y.X. and Sun, S.Y. and Ma, S. and Zheng, K. and Bao, X.Y. and Ma, S.J. and Zou, W. and Zheng, Y.",
TITLE = "{CrossMAE}: Cross-Modality Masked Autoencoders for Region-Aware
Audio-Visual Pre-Training",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "26711--26721",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278733"}
@inproceedings{bb284047,
AUTHOR = "Mo, S.T. and Morgado, P.",
TITLE = "Unveiling the Power of Audio-Visual Early Fusion Transformers with
Dense Interactions Through Masked Modeling",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "27176--27186",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278734"}
@inproceedings{bb284048,
AUTHOR = "Wang, K. and Tian, Y.P. and Hatzinakos, D.",
TITLE = "Towards Efficient Audio-Visual Learners via Empowering Pre-trained
Vision Transformers with Cross-Modal Adaptation",
BOOKTITLE = WhatNext24,
YEAR = "2024",
PAGES = "1837--1846",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278735"}
@inproceedings{bb284049,
AUTHOR = "Ryumina, E. and Markitantov, M. and Ryumin, D. and Kaya, H. and Karpov, A.",
TITLE = "Zero-Shot Audio-Visual Compound Expression Recognition Method based
on Emotion Probability Fusion",
BOOKTITLE = ABAW24,
YEAR = "2024",
PAGES = "4752--4760",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278736"}
@inproceedings{bb284050,
AUTHOR = "Mahmud, T. and Mo, S.T. and Tian, Y.P. and Marculescu, D.",
TITLE = "{MA-AVT}: Modality Alignment for Parameter-Efficient Audio-Visual
Transformers",
BOOKTITLE = ECV24,
YEAR = "2024",
PAGES = "7996--8005",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278737"}
@inproceedings{bb284051,
AUTHOR = "Yang, Z.Y. and Lin, J.G. and Chen, P.H. and Cherian, A. and Marks, T.K. and Le Roux, J. and Gan, C.",
TITLE = "{RILA}: Reflective and Imaginative Language Agent for Zero-Shot
Semantic Audio-Visual Navigation",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "16251--16261",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278738"}
@inproceedings{bb284052,
AUTHOR = "Dai, Y.S. and Chen, H. and Du, J. and Wang, R. and Chen, S.H. and Wang, H.T. and Lee, C.H.",
TITLE = "A Study of Dropout-Induced Modality Bias on Robustness to Missing
Video Frames for Audio-Visual Speech Recognition",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "27435--27445",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278739"}
@inproceedings{bb284053,
AUTHOR = "Galland, L. and Pelachaud, C. and Pecune, F.",
TITLE = "Seeing and Hearing What Has Not Been Said: A multimodal client
behavior classifier in Motivational Interviewing with interpretable
fusion",
BOOKTITLE = FG24,
YEAR = "2024",
PAGES = "1--9",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278740"}
@inproceedings{bb284054,
AUTHOR = "Praveen, R.G. and Alam, J.",
TITLE = "Audio-Visual Person Verification Based on Recursive Fusion of Joint
Cross-Attention",
BOOKTITLE = FG24,
YEAR = "2024",
PAGES = "1--5",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278741"}
@inproceedings{bb284055,
AUTHOR = "Praveen, R.G. and Alam, J.",
TITLE = "Dynamic Cross Attention for Audio-Visual Person Verification",
BOOKTITLE = FG24,
YEAR = "2024",
PAGES = "1--5",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278742"}
@inproceedings{bb284056,
AUTHOR = "He, Y.H. and Shin, S. and Cherian, A. and Trigoni, N. and Markham, A.",
TITLE = "{Sound3DVDet}: {3D} Sound Source Detection using Multiview Microphone
Array and {RGB} Images",
BOOKTITLE = WACV24,
YEAR = "2024",
PAGES = "5484--5495",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278743"}
@inproceedings{bb284057,
AUTHOR = "Ghaleb, E. and Burenko, I. and Rasenberg, M. and Pouw, W. and Uhrig, P. and Holler, J. and Toni, I. and Ozyurek, A. and Fernandez, R.",
TITLE = "Co-Speech Gesture Detection through Multi-Phase Sequence Labeling",
BOOKTITLE = WACV24,
YEAR = "2024",
PAGES = "3995--4003",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278744"}
@inproceedings{bb284058,
AUTHOR = "Liu, J.X. and Wang, Y. and Ju, C. and Ma, C.F. and Zhang, Y. and Xie, W.",
TITLE = "Annotation-free Audio-Visual Segmentation",
BOOKTITLE = WACV24,
YEAR = "2024",
PAGES = "5592--5602",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278745"}
@inproceedings{bb284059,
AUTHOR = "Xu, Y.T. and Hu, C.H. and Lee, G.H.",
TITLE = "Rethink Cross-Modal Fusion in Weakly-Supervised Audio-Visual Video
Parsing",
BOOKTITLE = WACV24,
YEAR = "2024",
PAGES = "5603--5612",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278746"}
@inproceedings{bb284060,
AUTHOR = "Rachavarapu, K.K. and Ramakrishnan, K. and Rajagopalan, A.N.",
TITLE = "Weakly-Supervised Audio-Visual Video Parsing with Prototype-Based
Pseudo-Labeling",
BOOKTITLE = CVPR24,
YEAR = "2024",
PAGES = "18952--18962",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278747"}
@inproceedings{bb284061,
AUTHOR = "Rachavarapu, K.K. and Rajagopalan, A.N.",
TITLE = "Boosting Positive Segments for Weakly-Supervised Audio-Visual Video
Parsing",
BOOKTITLE = ICCV23,
YEAR = "2023",
PAGES = "10158--10168",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278748"}
@inproceedings{bb284062,
AUTHOR = "Chen, J. and Wang, W.G. and Liu, S. and Li, H.S. and Yang, Y.",
TITLE = "Omnidirectional Information Gathering for Knowledge Transfer-based
Audio-Visual Navigation",
BOOKTITLE = ICCV23,
YEAR = "2023",
PAGES = "10959--10969",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278749"}
@inproceedings{bb284063,
AUTHOR = "Cheng, X.Z. and Jin, T. and Huang, R.J. and Li, L.J. and Lin, W. and Wang, Z. and Wang, Y. and Liu, H.D. and Yin, A.X. and Zhao, Z.",
TITLE = "{MixSpeech}: Cross-Modality Self-Learning with Audio-Visual Stream
Mixup for Visual Speech Translation and Recognition",
BOOKTITLE = ICCV23,
YEAR = "2023",
PAGES = "15689--15699",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278750"}
@inproceedings{bb284064,
AUTHOR = "Georgescu, M.I. and Fonseca, E. and Ionescu, R.T. and Lucic, M. and Schmid, C. and Arnab, A.",
TITLE = "Audiovisual Masked Autoencoders",
BOOKTITLE = ICCV23,
YEAR = "2023",
PAGES = "16098--16108",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278751"}
@inproceedings{bb284065,
AUTHOR = "Chen, M.F. and Su, K. and Shlizerman, E.",
TITLE = "Be Everywhere - Hear Everything ({BEE}): Audio Scene Reconstruction by
Sparse Audio-Visual Samples",
BOOKTITLE = ICCV23,
YEAR = "2023",
PAGES = "7819--7828",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278752"}
@inproceedings{bb284066,
AUTHOR = "Xie, H.X. and Lee, M.X. and Chen, T.J. and Chen, H.J. and Liu, H.I. and Shuai, H.H. and Cheng, W.H.",
TITLE = "Most Important Person-guided Dual-branch Cross-Patch Attention for
Group Affect Recognition",
BOOKTITLE = ICCV23,
YEAR = "2023",
PAGES = "20541--20551",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278753"}
@inproceedings{bb284067,
AUTHOR = "Djilali, Y.A.D. and Narayan, S. and Boussaid, H. and Almazrouei, E. and Debbah, M.",
TITLE = "{Lip2Vec}: Efficient and Robust Visual Speech Recognition via
Latent-to-Latent Visual to Audio Representation Mapping",
BOOKTITLE = ICCV23,
YEAR = "2023",
PAGES = "13744--13755",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278754"}
@inproceedings{bb284068,
AUTHOR = "Chen, G.Y. and Zhang, D. and Liu, T. and Du, X.Y.",
TITLE = "Local-Global Contrast for Learning Voice-Face Representations",
BOOKTITLE = ICIP23,
YEAR = "2023",
PAGES = "51--55",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278755"}
@inproceedings{bb284069,
AUTHOR = "Hong, J. and Kim, M. and Choi, J. and Ro, Y.M.",
TITLE = "Watch or Listen: Robust Audio-Visual Speech Recognition with Visual
Corruption Modeling and Reliability Scoring",
BOOKTITLE = CVPR23,
YEAR = "2023",
PAGES = "18783--18794",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278756"}
@inproceedings{bb284070,
AUTHOR = "Porgali, B. and Albiero, V. and Ryda, J. and Ferrer, C.C. and Hazirbas, C.",
TITLE = "The Casual Conversations v2 Dataset: A diverse, large benchmark for
measuring fairness and robustness in audio/vision/speech models",
BOOKTITLE = FaDE-TCV23,
YEAR = "2023",
PAGES = "10--17",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278757"}
@inproceedings{bb284071,
AUTHOR = "Xiong, J.W. and Wang, G. and Zhang, P. and Huang, W. and Zha, Y.F. and Zhai, G.T.",
TITLE = "{CASP-Net}: Rethinking Video Saliency Prediction from an Audio-Visual
Consistency Perceptual Perspective",
BOOKTITLE = CVPR23,
YEAR = "2023",
PAGES = "6441--6450",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278758"}
@inproceedings{bb284072,
AUTHOR = "Liao, J.H. and Duan, H.H. and Feng, K.H. and Zhao, W.B. and Yang, Y.B. and Chen, L.Y.",
TITLE = "A Light Weight Model for Active Speaker Detection",
BOOKTITLE = CVPR23,
YEAR = "2023",
PAGES = "22932--22941",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278759"}
@inproceedings{bb284073,
AUTHOR = "Seo, P.H. and Nagrani, A. and Schmid, C.",
TITLE = "{AVFormer}: Injecting Vision into Frozen Speech Models for Zero-Shot
{AV-ASR}",
BOOKTITLE = CVPR23,
YEAR = "2023",
PAGES = "22922--22931",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278760"}
% NOTE(review): BOOKTITLE is the quoted literal "ICPR22" here, whereas other
% entries in this file use bare macro names (e.g. ICIP22). Confirm whether an
% ICPR22 string macro was intended before changing it.
@inproceedings{bb284074,
AUTHOR = "Feng, D. and Yang, S. and Shan, S.G. and Chen, X.L.",
TITLE = "Audio-Driven Deformation Flow for Effective Lip Reading",
BOOKTITLE = "ICPR22",
YEAR = "2022",
PAGES = "274--280",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278761"}
% NOTE(review): BOOKTITLE is the quoted literal "ICPR22" here, whereas other
% entries in this file use bare macro names (e.g. ICIP22). Confirm whether an
% ICPR22 string macro was intended before changing it.
@inproceedings{bb284075,
AUTHOR = "Varshney, M. and Yadav, R. and Namboodiri, V.P. and Hegde, R.M.",
TITLE = "Learning Speaker-specific Lip-to-Speech Generation",
BOOKTITLE = "ICPR22",
YEAR = "2022",
PAGES = "491--498",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278762"}
@inproceedings{bb284076,
AUTHOR = "Shi, C. and Yang, S.",
TITLE = "Spatial and Visual Perspective-Taking via View Rotation and Relation
Reasoning for Embodied Reference Understanding",
BOOKTITLE = ECCV22,
YEAR = "2022",
PAGES = "XXXVI:201--218",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278763"}
@inproceedings{bb284077,
AUTHOR = "Hayes, T. and Zhang, S.Y. and Yin, X. and Pang, G. and Sheng, S. and Yang, H. and Ge, S.W. and Hu, Q.Y. and Parikh, D.",
TITLE = "{MUGEN}: A Playground for Video-Audio-Text Multimodal Understanding and
{GENeration}",
BOOKTITLE = ECCV22,
YEAR = "2022",
PAGES = "VIII:431--449",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278764"}
@inproceedings{bb284078,
AUTHOR = "van Horn, G. and Qian, R. and Wilber, K. and Adam, H. and Mac Aodha, O. and Belongie, S.",
TITLE = "Exploring Fine-Grained Audiovisual Categorization with the {SSW60}
Dataset",
BOOKTITLE = ECCV22,
YEAR = "2022",
PAGES = "VIII:271--289",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278765"}
@inproceedings{bb284079,
AUTHOR = "Yu, S. and Wu, P. and Liang, P.P. and Salakhutdinov, R. and Morency, L.P.",
TITLE = "{PACS}: A Dataset for Physical Audiovisual {CommonSense} Reasoning",
BOOKTITLE = ECCV22,
YEAR = "2022",
PAGES = "XXXVII:292--309",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278766"}
@inproceedings{bb284080,
AUTHOR = "Cheng, H.Y. and Liu, Z.Y. and Zhou, H. and Qian, C. and Wu, W. and Wang, L.M.",
TITLE = "Joint-Modal Label Denoising for Weakly-Supervised Audio-Visual Video
Parsing",
BOOKTITLE = ECCV22,
YEAR = "2022",
PAGES = "XXXIV:431--448",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278767"}
@inproceedings{bb284081,
AUTHOR = "Zhang, Z.Q. and Zhang, J. and Zhang, J.S. and Wu, M.H. and Fang, X. and Dai, L.R.",
TITLE = "Learning Contextually Fused Audio-Visual Representations for
Audio-Visual Speech Recognition",
BOOKTITLE = ICIP22,
YEAR = "2022",
PAGES = "1346--1350",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278768"}
@inproceedings{bb284082,
AUTHOR = "Montesinos, J.F. and Kadandale, V.S. and Haro, G.",
TITLE = "{VoViT}: Low Latency Graph-Based Audio-Visual Voice Separation
Transformer",
BOOKTITLE = ECCV22,
YEAR = "2022",
PAGES = "XXXVII:310--326",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278769"}
@inproceedings{bb284083,
AUTHOR = "Tzinis, E. and Wisdom, S. and Remez, T. and Hershey, J.R.",
TITLE = "{AudioScopeV2}: Audio-Visual Attention Architectures for Calibrated
Open-Domain On-Screen Sound Separation",
BOOKTITLE = ECCV22,
YEAR = "2022",
PAGES = "XXXVII:368--385",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278770"}
@inproceedings{bb284084,
AUTHOR = "Zhou, J.X. and Wang, J.Y. and Zhang, J.Y. and Sun, W.X. and Zhang, J. and Birchfield, S. and Guo, D. and Kong, L.P. and Wang, M. and Zhong, Y.R.",
TITLE = "Audio-Visual Segmentation",
BOOKTITLE = ECCV22,
YEAR = "2022",
PAGES = "XXXVII:386--403",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278771"}
@inproceedings{bb284085,
AUTHOR = "Alcazar, J.L. and Cordes, M. and Zhao, C. and Ghanem, B.",
TITLE = "End-to-End Active Speaker Detection",
BOOKTITLE = ECCV22,
YEAR = "2022",
PAGES = "XXXVII:126--143",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278772"}
@inproceedings{bb284086,
AUTHOR = "Chen, C.G. and Gao, R.H. and Calamia, P. and Grauman, K.",
TITLE = "Visual Acoustic Matching",
BOOKTITLE = CVPR22,
YEAR = "2022",
PAGES = "18836--18846",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278773"}
@inproceedings{bb284087,
AUTHOR = "Lee, S. and Kim, H.I. and Ro, Y.M.",
TITLE = "Weakly Paired Associative Learning for Sound and Image
Representations via Bimodal Associative Memory",
BOOKTITLE = CVPR22,
YEAR = "2022",
PAGES = "10524--10533",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278774"}
@inproceedings{bb284088,
AUTHOR = "Vasudevan, A.B. and Dai, D.X. and Van Gool, L.J.",
TITLE = "Sound and Visual Representation Learning with Multiple Pretraining
Tasks",
BOOKTITLE = CVPR22,
YEAR = "2022",
PAGES = "14596--14606",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278775"}
@inproceedings{bb284089,
AUTHOR = "Ng, E. and Joo, H. and Hu, L.W. and Li, H. and Darrell, T.J. and Kanazawa, A. and Ginosar, S.",
TITLE = "Learning to Listen: Modeling Non-Deterministic Dyadic Facial Motion",
BOOKTITLE = CVPR22,
YEAR = "2022",
PAGES = "20363--20373",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278776"}
@inproceedings{bb284090,
AUTHOR = "Kurzendorfer, D. and Mercea, O.B. and Koepke, A.S. and Akata, Z.",
TITLE = "Audio-Visual Generalized Zero-Shot Learning using Pre-Trained Large
Multi-Modal Models",
BOOKTITLE = L3D-IVU24,
YEAR = "2024",
PAGES = "2627--2638",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278777"}
@inproceedings{bb284091,
AUTHOR = "Mercea, O.B. and Hummel, T. and Koepke, A.S. and Akata, Z.",
TITLE = "Temporal and Cross-modal Attention for Audio-Visual Zero-Shot Learning",
BOOKTITLE = ECCV22,
YEAR = "2022",
PAGES = "XX:488--505",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278778"}
@inproceedings{bb284092,
AUTHOR = "Mercea, O.B. and Riesch, L. and Koepke, A.S. and Akata, Z.",
TITLE = "Audiovisual Generalised Zero-shot Learning with Cross-modal Attention
and Language",
BOOKTITLE = CVPR22,
YEAR = "2022",
PAGES = "10543--10553",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278779"}
@inproceedings{bb284093,
AUTHOR = "Karas, V. and Tellamekala, M.K. and Mallol Ragolta, A. and Valstar, M. and Schuller, B.W.",
TITLE = "Time-Continuous Audiovisual Fusion with Recurrence vs Attention for
In-The-Wild Affect Recognition",
BOOKTITLE = ABAW22,
YEAR = "2022",
PAGES = "2381--2390",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278780"}
@inproceedings{bb284094,
AUTHOR = "Yang, K. and Markovic, D. and Krenn, S. and Agrawal, V. and Richard, A.",
TITLE = "Audio-Visual Speech Codecs: Rethinking Audio-Visual Speech
Enhancement by Re-Synthesis",
BOOKTITLE = CVPR22,
YEAR = "2022",
PAGES = "8217--8227",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278781"}
@inproceedings{bb284095,
AUTHOR = "Kim, M. and Hong, J. and Park, S.J. and Ro, Y.M.",
TITLE = "Multi-modality Associative Bridging through Memory:
Speech Sound Recollected from Face Video",
BOOKTITLE = ICCV21,
YEAR = "2021",
PAGES = "296--306",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278782"}
@inproceedings{bb284096,
AUTHOR = "Li, J. and Kang, D. and Pei, W.J. and Zhe, X.F. and Zhang, Y. and He, Z.Y. and Bao, L.C.",
TITLE = "{Audio2Gestures}: Generating Diverse Gestures from Speech Audio with
Conditional Variational Autoencoders",
BOOKTITLE = ICCV21,
YEAR = "2021",
PAGES = "11273--11282",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278783"}
@inproceedings{bb284097,
AUTHOR = "Ye, M. and You, Q.Z. and Ma, F.L.",
TITLE = "{QUALIFIER}: Question-Guided Self-Attentive Multimodal Fusion Network
for Audio Visual Scene-Aware Dialog",
BOOKTITLE = WACV22,
YEAR = "2022",
PAGES = "2503--2511",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278784"}
@inproceedings{bb284098,
AUTHOR = "Yao, S. and Min, X.K. and Zhai, G.T.",
TITLE = "Deep Audio-Visual Fusion Neural Network for Saliency Estimation",
BOOKTITLE = ICIP21,
YEAR = "2021",
PAGES = "1604--1608",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278785"}
@inproceedings{bb284099,
AUTHOR = "Krishnamurthy, S.",
TITLE = "Learning Self-supervised Audio-Visual Representations for Sound
Recommendations",
BOOKTITLE = ISVC21,
YEAR = "2021",
PAGES = "II:124--138",
BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT278786"}
Last update: Nov 26, 2025 at 20:24:09