% Bibliography records bb273600 through bb273699, one entry per record and
% one field per line.
%
% JOURNAL and BOOKTITLE values written without quotes (IVC, PAMI, CVPR24, ...)
% are string macros. No definitions for them are visible in this part of the
% file; presumably they come from a companion abbreviations file -- TODO confirm.
%
% NOTE(review): bb273650 and bb273651 give BOOKTITLE as the quoted literal
% "ICPR22" while every other venue here is a bare macro. Left unchanged because
% an ICPR22 macro definition cannot be confirmed from this part of the file.
%
% Some PAGES values carry a roman-numeral prefix before a colon
% (e.g. XXXVII:386--403); presumably the proceedings part number -- TODO confirm.
% Page ranges use the double hyphen so styles typeset an en-dash.
% MONTH uses the standard three-letter macros (feb, mar, oct, nov).

@article{bb273600,
  AUTHOR = "Xiao, Y.W. and Liu, X.M. and Zhu, A. and Huang, J.",
  TITLE = "Relational-branchformer: Novel framework for audio-visual speech recognition",
  JOURNAL = IVC,
  VOLUME = "149",
  YEAR = "2024",
  PAGES = "105182",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268325"
}

@article{bb273601,
  AUTHOR = "Li, W.R. and Wang, P. and Xiong, R.Q. and Fan, X.P.",
  TITLE = "Spiking Tucker Fusion Transformer for Audio-Visual Zero-Shot Learning",
  JOURNAL = IP,
  VOLUME = "33",
  YEAR = "2024",
  PAGES = "4840--4852",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268326"
}

@article{bb273602,
  AUTHOR = "Li, K. and Xie, F. and Chen, H. and Yuan, K. and Hu, X.L.",
  TITLE = "An Audio-Visual Speech Separation Model Inspired by Cortico-Thalamo-Cortical Circuits",
  JOURNAL = PAMI,
  VOLUME = "46",
  YEAR = "2024",
  NUMBER = "10",
  MONTH = oct,
  PAGES = "6637--6651",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268327"
}

@article{bb273603,
  AUTHOR = "Zhou, J.X. and Guo, D. and Zhong, Y.R. and Wang, M.",
  TITLE = "Advancing Weakly-Supervised Audio-Visual Video Parsing via Segment-Wise Pseudo Labeling",
  JOURNAL = IJCV,
  VOLUME = "132",
  YEAR = "2024",
  NUMBER = "11",
  MONTH = nov,
  PAGES = "5308--5329",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268328"
}

@inproceedings{bb273604,
  AUTHOR = "Zhou, J.X. and Guo, D. and Mao, Y.X. and Zhong, Y. and Chang, X.J. and Wang, M.",
  TITLE = "Label-anticipated Event Disentanglement for Audio-visual Video Parsing",
  BOOKTITLE = ECCV24,
  YEAR = "2024",
  PAGES = "X:35--51",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268329"
}

@article{bb273605,
  AUTHOR = "Liu, J. and Chen, S. and He, X.J. and Guo, L.T. and Zhu, X.X. and Wang, W.N. and Tang, J.H.",
  TITLE = "VALOR: Vision-Audio-Language Omni-Perception Pretraining Model and Dataset",
  JOURNAL = PAMI,
  VOLUME = "47",
  YEAR = "2025",
  NUMBER = "2",
  MONTH = feb,
  PAGES = "708--724",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268330"
}

@article{bb273606,
  AUTHOR = "Wang, Y. and Qian, X.H. and Zhou, W.",
  TITLE = "Transformer-Prompted Network: Efficient Audio-Visual Segmentation via Transformer and Prompt Learning",
  JOURNAL = SPLetters,
  VOLUME = "32",
  YEAR = "2025",
  PAGES = "516--520",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268331"
}

@article{bb273607,
  AUTHOR = "Steinmetz, N. and Balal, N.",
  TITLE = "Feasibility Study of Real-Time Speech Detection and Characterization Using Millimeter-Wave Micro-Doppler Radar",
  JOURNAL = RS,
  VOLUME = "17",
  YEAR = "2025",
  NUMBER = "1",
  PAGES = "91",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268332"
}

@article{bb273608,
  AUTHOR = "Shi, Z.F. and Wu, Q.B. and Meng, F.M. and Xu, L.F. and Li, H.L.",
  TITLE = "Cross-Modal Cognitive Consensus Guided Audio-Visual Segmentation",
  JOURNAL = MultMed,
  VOLUME = "27",
  YEAR = "2025",
  PAGES = "209--223",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268333"
}

@article{bb273609,
  AUTHOR = "Zhang, J.X. and Wan, G. and Gao, J.Q. and Ling, Z.H.",
  TITLE = "Audio-visual representation learning via knowledge distillation from speech foundation models",
  JOURNAL = PR,
  VOLUME = "162",
  YEAR = "2025",
  PAGES = "111432",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268334"
}

@article{bb273610,
  AUTHOR = "Chen, T.X. and Tan, Z.T. and Gong, T. and Chu, Q. and Wu, Y. and Liu, B. and Yu, N.H. and Lu, L. and Ye, J.P.",
  TITLE = "Bootstrapping Audio-Visual Video Segmentation by Strengthening Audio Cues",
  JOURNAL = CirSysVideo,
  VOLUME = "35",
  YEAR = "2025",
  NUMBER = "3",
  MONTH = mar,
  PAGES = "2398--2409",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268335"
}

@inproceedings{bb273611,
  AUTHOR = "Zhang, Y.H. and Yang, S. and Shan, S.G. and Chen, X.L.",
  TITLE = "ES3: Evolving Self-Supervised Learning of Robust Audio-Visual Speech Representations",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "27059--27069",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268336"
}

@inproceedings{bb273612,
  AUTHOR = "Yang, Q. and Nie, X. and Li, T. and Gao, P.F. and Guo, Y. and Zhen, C. and Yan, P.F. and Xiang, S.M.",
  TITLE = "Cooperation Does Matter: Exploring Multi-Order Bilateral Relations for Audio-Visual Segmentation",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "27124--27133",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268337"
}

@inproceedings{bb273613,
  AUTHOR = "Xiong, J.W. and Zhang, P. and You, T. and Li, C.Y. and Huang, W. and Zha, Y.F.",
  TITLE = "DiffSal: Joint Audio and Video Learning for Diffusion Saliency Prediction",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "27263--27273",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268338"
}

@inproceedings{bb273614,
  AUTHOR = "Liu, C. and Li, P.P. and Yu, Q. and Sheng, H.W. and Wang, D.D. and Li, L. and Yu, X.",
  TITLE = "Benchmarking Audio Visual Segmentation for Long-Untrimmed Videos",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "22712--22722",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268339"
}

@inproceedings{bb273615,
  AUTHOR = "Li, X. and Wang, J.L. and Xu, X.H. and Peng, X.L. and Singh, R. and Lu, Y. and Raj, B.",
  TITLE = "QDFormer: Towards Robust Audiovisual Segmentation in Complex Environments with Quantization-based Semantic Decomposition",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "3402--3413",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268340"
}

@inproceedings{bb273616,
  AUTHOR = "Singh, N. and Wu, C.W. and Orife, I. and Kalayeh, M.",
  TITLE = "Looking Similar, Sounding Different: Leveraging Counterfactual Cross-Modal Pairs for Audiovisual Representation Learning",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "26897--26908",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268341"
}

@inproceedings{bb273617,
  AUTHOR = "Liu, J.X. and Liu, Y.K. and Zhang, F. and Ju, C. and Zhang, Y. and Wang, Y.F.",
  TITLE = "Audio-Visual Segmentation via Unlabeled Frame Exploitation",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "26318--26329",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268342"
}

@inproceedings{bb273618,
  AUTHOR = "Jia, W.Q. and Liu, M. and Jiang, H. and Ananthabhotla, I. and Rehg, J.M. and Ithapu, V.K. and Gao, R.H.",
  TITLE = "The Audio-Visual Conversational Graph: From an Egocentric-Exocentric Perspective",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "26386--26395",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268343"
}

@inproceedings{bb273619,
  AUTHOR = "Chen, Y.H. and Liu, Y. and Wang, H. and Liu, F. and Wang, C. and Frazer, H. and Carneiro, G.",
  TITLE = "Unraveling Instance Associations: A Closer Look for Audio-Visual Segmentation",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "26487--26497",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268344"
}

@inproceedings{bb273620,
  AUTHOR = "Guo, Y.X. and Sun, S.Y. and Ma, S. and Zheng, K. and Bao, X.Y. and Ma, S.J. and Zou, W. and Zheng, Y.",
  TITLE = "CrossMAE: Cross-Modality Masked Autoencoders for Region-Aware Audio-Visual Pre-Training",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "26711--26721",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268345"
}

@inproceedings{bb273621,
  AUTHOR = "Mo, S.T. and Morgado, P.",
  TITLE = "Unveiling the Power of Audio-Visual Early Fusion Transformers with Dense Interactions Through Masked Modeling",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "27176--27186",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268346"
}

@inproceedings{bb273622,
  AUTHOR = "Wang, K. and Tian, Y. and Hatzinakos, D.",
  TITLE = "Towards Efficient Audio-Visual Learners via Empowering Pre-trained Vision Transformers with Cross-Modal Adaptation",
  BOOKTITLE = WhatNext24,
  YEAR = "2024",
  PAGES = "1837--1846",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268347"
}

@inproceedings{bb273623,
  AUTHOR = "Ryumina, E. and Markitantov, M. and Ryumin, D. and Kaya, H. and Karpov, A.",
  TITLE = "Zero-Shot Audio-Visual Compound Expression Recognition Method based on Emotion Probability Fusion",
  BOOKTITLE = ABAW24,
  YEAR = "2024",
  PAGES = "4752--4760",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268348"
}

@inproceedings{bb273624,
  AUTHOR = "Mahmud, T. and Mo, S.T. and Tian, Y. and Marculescu, D.",
  TITLE = "MA-AVT: Modality Alignment for Parameter-Efficient Audio-Visual Transformers",
  BOOKTITLE = ECV24,
  YEAR = "2024",
  PAGES = "7996--8005",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268349"
}

@inproceedings{bb273625,
  AUTHOR = "Yang, Z. and Lin, J. and Chen, P.H. and Cherian, A. and Marks, T.K. and Le Roux, J. and Gan, C.",
  TITLE = "RILA: Reflective and Imaginative Language Agent for Zero-Shot Semantic Audio-Visual Navigation",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "16251--16261",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268350"
}

@inproceedings{bb273626,
  AUTHOR = "Dai, Y.S. and Chen, H. and Du, J. and Wang, R. and Chen, S.H. and Wang, H.T. and Lee, C.H.",
  TITLE = "A Study of Dropout-Induced Modality Bias on Robustness to Missing Video Frames for Audio-Visual Speech Recognition",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "27435--27445",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268351"
}

@inproceedings{bb273627,
  AUTHOR = "Galland, L. and Pelachaud, C. and Pecune, F.",
  TITLE = "Seeing and Hearing What Has Not Been Said: A multimodal client behavior classifier in Motivational Interviewing with interpretable fusion",
  BOOKTITLE = FG24,
  YEAR = "2024",
  PAGES = "1--9",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268352"
}

@inproceedings{bb273628,
  AUTHOR = "Praveen, R.G. and Alam, J.",
  TITLE = "Audio-Visual Person Verification Based on Recursive Fusion of Joint Cross-Attention",
  BOOKTITLE = FG24,
  YEAR = "2024",
  PAGES = "1--5",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268353"
}

@inproceedings{bb273629,
  AUTHOR = "Praveen, R.G. and Alam, J.",
  TITLE = "Dynamic Cross Attention for Audio-Visual Person Verification",
  BOOKTITLE = FG24,
  YEAR = "2024",
  PAGES = "1--5",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268354"
}

@inproceedings{bb273630,
  AUTHOR = "He, Y.H. and Shin, S. and Cherian, A. and Trigoni, N. and Markham, A.",
  TITLE = "Sound3DVDet: 3D Sound Source Detection using Multiview Microphone Array and RGB Images",
  BOOKTITLE = WACV24,
  YEAR = "2024",
  PAGES = "5484--5495",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268355"
}

@inproceedings{bb273631,
  AUTHOR = "Ghaleb, E. and Burenko, I. and Rasenberg, M. and Pouw, W. and Uhrig, P. and Holler, J. and Toni, I. and Ozyurek, A. and Fernandez, R.",
  TITLE = "Co-Speech Gesture Detection through Multi-Phase Sequence Labeling",
  BOOKTITLE = WACV24,
  YEAR = "2024",
  PAGES = "3995--4003",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268356"
}

@inproceedings{bb273632,
  AUTHOR = "Liu, J.X. and Wang, Y. and Ju, C. and Ma, C.F. and Zhang, Y. and Xie, W.",
  TITLE = "Annotation-free Audio-Visual Segmentation",
  BOOKTITLE = WACV24,
  YEAR = "2024",
  PAGES = "5592--5602",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268357"
}

@inproceedings{bb273633,
  AUTHOR = "Xu, Y.T. and Hu, C.H. and Lee, G.H.",
  TITLE = "Rethink Cross-Modal Fusion in Weakly-Supervised Audio-Visual Video Parsing",
  BOOKTITLE = WACV24,
  YEAR = "2024",
  PAGES = "5603--5612",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268358"
}

@inproceedings{bb273634,
  AUTHOR = "Rachavarapu, K.K. and Ramakrishnan, K. and Rajagopalan, A.N.",
  TITLE = "Weakly-Supervised Audio-Visual Video Parsing with Prototype-Based Pseudo-Labeling",
  BOOKTITLE = CVPR24,
  YEAR = "2024",
  PAGES = "18952--18962",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268359"
}

@inproceedings{bb273635,
  AUTHOR = "Rachavarapu, K.K. and Rajagopalan, A.N.",
  TITLE = "Boosting Positive Segments for Weakly-Supervised Audio-Visual Video Parsing",
  BOOKTITLE = ICCV23,
  YEAR = "2023",
  PAGES = "10158--10168",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268360"
}

@inproceedings{bb273636,
  AUTHOR = "Chen, J. and Wang, W.G. and Liu, S. and Li, H.S. and Yang, Y.",
  TITLE = "Omnidirectional Information Gathering for Knowledge Transfer-based Audio-Visual Navigation",
  BOOKTITLE = ICCV23,
  YEAR = "2023",
  PAGES = "10959--10969",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268361"
}

@inproceedings{bb273637,
  AUTHOR = "Cheng, X. and Jin, T. and Huang, R.J. and Li, L.J. and Lin, W. and Wang, Z. and Wang, Y. and Liu, H. and Yin, A. and Zhao, Z.",
  TITLE = "MixSpeech: Cross-Modality Self-Learning with Audio-Visual Stream Mixup for Visual Speech Translation and Recognition",
  BOOKTITLE = ICCV23,
  YEAR = "2023",
  PAGES = "15689--15699",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268362"
}

@inproceedings{bb273638,
  AUTHOR = "Georgescu, M.I. and Fonseca, E. and Ionescu, R.T. and Lucic, M. and Schmid, C. and Arnab, A.",
  TITLE = "Audiovisual Masked Autoencoders",
  BOOKTITLE = ICCV23,
  YEAR = "2023",
  PAGES = "16098--16108",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268363"
}

@inproceedings{bb273639,
  AUTHOR = "Chen, M.F. and Su, K. and Shlizerman, E.",
  TITLE = "Be Everywhere - Hear Everything (BEE): Audio Scene Reconstruction by Sparse Audio-Visual Samples",
  BOOKTITLE = ICCV23,
  YEAR = "2023",
  PAGES = "7819--7828",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268364"
}

@inproceedings{bb273640,
  AUTHOR = "Xie, H.X. and Lee, M.X. and Chen, T.J. and Chen, H.J. and Liu, H.I. and Shuai, H.H. and Cheng, W.H.",
  TITLE = "Most Important Person-guided Dual-branch Cross-Patch Attention for Group Affect Recognition",
  BOOKTITLE = ICCV23,
  YEAR = "2023",
  PAGES = "20541--20551",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268365"
}

@inproceedings{bb273641,
  AUTHOR = "Djilali, Y.A.D. and Narayan, S. and Boussaid, H. and Almazrouei, E. and Debbah, M.",
  TITLE = "Lip2Vec: Efficient and Robust Visual Speech Recognition via Latent-to-Latent Visual to Audio Representation Mapping",
  BOOKTITLE = ICCV23,
  YEAR = "2023",
  PAGES = "13744--13755",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268366"
}

@inproceedings{bb273642,
  AUTHOR = "Chen, G.Y. and Zhang, D. and Liu, T. and Du, X.Y.",
  TITLE = "Local-Global Contrast for Learning Voice-Face Representations",
  BOOKTITLE = ICIP23,
  YEAR = "2023",
  PAGES = "51--55",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268367"
}

@inproceedings{bb273643,
  AUTHOR = "Hong, J. and Kim, M. and Choi, J. and Ro, Y.M.",
  TITLE = "Watch or Listen: Robust Audio-Visual Speech Recognition with Visual Corruption Modeling and Reliability Scoring",
  BOOKTITLE = CVPR23,
  YEAR = "2023",
  PAGES = "18783--18794",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268368"
}

@inproceedings{bb273644,
  AUTHOR = "Gao, J.Y. and Chen, M.Y. and Xu, C.S.",
  TITLE = "Collecting Cross-Modal Presence-Absence Evidence for Weakly-Supervised Audio-Visual Event Perception",
  BOOKTITLE = CVPR23,
  YEAR = "2023",
  PAGES = "18827--18836",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268369"
}

@inproceedings{bb273645,
  AUTHOR = "Porgali, B. and Albiero, V. and Ryda, J. and Ferrer, C.C. and Hazirbas, C.",
  TITLE = "The Casual Conversations v2 Dataset: A diverse, large benchmark for measuring fairness and robustness in audio/vision/speech models",
  BOOKTITLE = FaDE-TCV23,
  YEAR = "2023",
  PAGES = "10--17",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268370"
}

@inproceedings{bb273646,
  AUTHOR = "Xiong, J.W. and Wang, G. and Zhang, P. and Huang, W. and Zha, Y.F. and Zhai, G.T.",
  TITLE = "CASP-Net: Rethinking Video Saliency Prediction from an Audio-Visual Consistency Perceptual Perspective",
  BOOKTITLE = CVPR23,
  YEAR = "2023",
  PAGES = "6441--6450",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268371"
}

@inproceedings{bb273647,
  AUTHOR = "Huang, C. and Tian, Y. and Kumar, A. and Xu, C.L.",
  TITLE = "Egocentric Audio-Visual Object Localization",
  BOOKTITLE = CVPR23,
  YEAR = "2023",
  PAGES = "22910--22921",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268372"
}

@inproceedings{bb273648,
  AUTHOR = "Liao, J.H. and Duan, H.H. and Feng, K.H. and Zhao, W.B. and Yang, Y.B. and Chen, L.Y.",
  TITLE = "A Light Weight Model for Active Speaker Detection",
  BOOKTITLE = CVPR23,
  YEAR = "2023",
  PAGES = "22932--22941",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268373"
}

@inproceedings{bb273649,
  AUTHOR = "Seo, P.H. and Nagrani, A. and Schmid, C.",
  TITLE = "AVFormer: Injecting Vision into Frozen Speech Models for Zero-Shot AV-ASR",
  BOOKTITLE = CVPR23,
  YEAR = "2023",
  PAGES = "22922--22931",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268374"
}

@inproceedings{bb273650,
  AUTHOR = "Feng, D. and Yang, S. and Shan, S.G. and Chen, X.L.",
  TITLE = "Audio-Driven Deformation Flow for Effective Lip Reading",
  BOOKTITLE = "ICPR22",
  YEAR = "2022",
  PAGES = "274--280",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268375"
}

@inproceedings{bb273651,
  AUTHOR = "Varshney, M. and Yadav, R. and Namboodiri, V.P. and Hegde, R.M.",
  TITLE = "Learning Speaker-specific Lip-to-Speech Generation",
  BOOKTITLE = "ICPR22",
  YEAR = "2022",
  PAGES = "491--498",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268376"
}

@inproceedings{bb273652,
  AUTHOR = "Shi, C. and Yang, S.",
  TITLE = "Spatial and Visual Perspective-Taking via View Rotation and Relation Reasoning for Embodied Reference Understanding",
  BOOKTITLE = ECCV22,
  YEAR = "2022",
  PAGES = "XXXVI:201--218",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268377"
}

@inproceedings{bb273653,
  AUTHOR = "Hayes, T. and Zhang, S.Y. and Yin, X. and Pang, G. and Sheng, S. and Yang, H. and Ge, S.W. and Hu, Q.Y. and Parikh, D.",
  TITLE = "MUGEN: A Playground for Video-Audio-Text Multimodal Understanding and GENeration",
  BOOKTITLE = ECCV22,
  YEAR = "2022",
  PAGES = "VIII:431--449",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268378"
}

@inproceedings{bb273654,
  AUTHOR = "van Horn, G. and Qian, R. and Wilber, K. and Adam, H. and Aodha, O.M. and Belongie, S.",
  TITLE = "Exploring Fine-Grained Audiovisual Categorization with the SSW60 Dataset",
  BOOKTITLE = ECCV22,
  YEAR = "2022",
  PAGES = "VIII:271--289",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268379"
}

@inproceedings{bb273655,
  AUTHOR = "Yu, S. and Wu, P. and Liang, P.P. and Salakhutdinov, R. and Morency, L.P.",
  TITLE = "PACS: A Dataset for Physical Audiovisual CommonSense Reasoning",
  BOOKTITLE = ECCV22,
  YEAR = "2022",
  PAGES = "XXXVII:292--309",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268380"
}

@inproceedings{bb273656,
  AUTHOR = "Cheng, H.Y. and Liu, Z.Y. and Zhou, H. and Qian, C. and Wu, W. and Wang, L.M.",
  TITLE = "Joint-Modal Label Denoising for Weakly-Supervised Audio-Visual Video Parsing",
  BOOKTITLE = ECCV22,
  YEAR = "2022",
  PAGES = "XXXIV:431--448",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268381"
}

@inproceedings{bb273657,
  AUTHOR = "Zhang, Z.Q. and Zhang, J. and Zhang, J.S. and Wu, M.H. and Fang, X. and Dai, L.R.",
  TITLE = "Learning Contextually Fused Audio-Visual Representations for Audio-Visual Speech Recognition",
  BOOKTITLE = ICIP22,
  YEAR = "2022",
  PAGES = "1346--1350",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268382"
}

@inproceedings{bb273658,
  AUTHOR = "Mo, S.T. and Morgado, P.",
  TITLE = "Localizing Visual Sounds the Easy Way",
  BOOKTITLE = ECCV22,
  YEAR = "2022",
  PAGES = "XXXVII:218--234",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268383"
}

@inproceedings{bb273659,
  AUTHOR = "Montesinos, J.F. and Kadandale, V.S. and Haro, G.",
  TITLE = "VoViT: Low Latency Graph-Based Audio-Visual Voice Separation Transformer",
  BOOKTITLE = ECCV22,
  YEAR = "2022",
  PAGES = "XXXVII:310--326",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268384"
}

@inproceedings{bb273660,
  AUTHOR = "Tzinis, E. and Wisdom, S. and Remez, T. and Hershey, J.R.",
  TITLE = "AudioScopeV2: Audio-Visual Attention Architectures for Calibrated Open-Domain On-Screen Sound Separation",
  BOOKTITLE = ECCV22,
  YEAR = "2022",
  PAGES = "XXXVII:368--385",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268385"
}

@inproceedings{bb273661,
  AUTHOR = "Zhou, J.X. and Wang, J.Y. and Zhang, J.Y. and Sun, W.X. and Zhang, J. and Birchfield, S. and Guo, D. and Kong, L.P. and Wang, M. and Zhong, Y.",
  TITLE = "Audio-Visual Segmentation",
  BOOKTITLE = ECCV22,
  YEAR = "2022",
  PAGES = "XXXVII:386--403",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268386"
}

@inproceedings{bb273662,
  AUTHOR = "Alcazar, J.L. and Cordes, M. and Zhao, C. and Ghanem, B.",
  TITLE = "End-to-End Active Speaker Detection",
  BOOKTITLE = ECCV22,
  YEAR = "2022",
  PAGES = "XXXVII:126--143",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268387"
}

@inproceedings{bb273663,
  AUTHOR = "Chen, C.G. and Gao, R.H. and Calamia, P. and Grauman, K.",
  TITLE = "Visual Acoustic Matching",
  BOOKTITLE = CVPR22,
  YEAR = "2022",
  PAGES = "18836--18846",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268388"
}

@inproceedings{bb273664,
  AUTHOR = "Lee, S. and Kim, H.I. and Ro, Y.M.",
  TITLE = "Weakly Paired Associative Learning for Sound and Image Representations via Bimodal Associative Memory",
  BOOKTITLE = CVPR22,
  YEAR = "2022",
  PAGES = "10524--10533",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268389"
}

@inproceedings{bb273665,
  AUTHOR = "Vasudevan, A.B. and Dai, D.X. and Van Gool, L.J.",
  TITLE = "Sound and Visual Representation Learning with Multiple Pretraining Tasks",
  BOOKTITLE = CVPR22,
  YEAR = "2022",
  PAGES = "14596--14606",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268390"
}

@inproceedings{bb273666,
  AUTHOR = "Xia, Y. and Zhao, Z.",
  TITLE = "Cross-modal Background Suppression for Audio-Visual Event Localization",
  BOOKTITLE = CVPR22,
  YEAR = "2022",
  PAGES = "19957--19966",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268391"
}

@inproceedings{bb273667,
  AUTHOR = "Jiang, H. and Murdock, C. and Ithapu, V.K.",
  TITLE = "Egocentric Deep Multi-Channel Audio-Visual Active Speaker Localization",
  BOOKTITLE = CVPR22,
  YEAR = "2022",
  PAGES = "10534--10542",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268392"
}

@inproceedings{bb273668,
  AUTHOR = "Ng, E. and Joo, H. and Hu, L.W. and Li, H. and Darrell, T.J. and Kanazawa, A. and Ginosar, S.",
  TITLE = "Learning to Listen: Modeling Non-Deterministic Dyadic Facial Motion",
  BOOKTITLE = CVPR22,
  YEAR = "2022",
  PAGES = "20363--20373",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268393"
}

@inproceedings{bb273669,
  AUTHOR = "Kurzendorfer, D. and Mercea, O.B. and Koepke, A.S. and Akata, Z.",
  TITLE = "Audio-Visual Generalized Zero-Shot Learning using Pre-Trained Large Multi-Modal Models",
  BOOKTITLE = L3D-IVU24,
  YEAR = "2024",
  PAGES = "2627--2638",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268394"
}

@inproceedings{bb273670,
  AUTHOR = "Mercea, O.B. and Hummel, T. and Koepke, A.S. and Akata, Z.",
  TITLE = "Temporal and Cross-modal Attention for Audio-Visual Zero-Shot Learning",
  BOOKTITLE = ECCV22,
  YEAR = "2022",
  PAGES = "XX:488--505",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268395"
}

@inproceedings{bb273671,
  AUTHOR = "Mercea, O.B. and Riesch, L. and Koepke, A.S. and Akata, Z.",
  TITLE = "Audiovisual Generalised Zero-shot Learning with Cross-modal Attention and Language",
  BOOKTITLE = CVPR22,
  YEAR = "2022",
  PAGES = "10543--10553",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268396"
}

@inproceedings{bb273672,
  AUTHOR = "Karas, V. and Tellamekala, M.K. and Mallol Ragolta, A. and Valstar, M. and Schuller, B.W.",
  TITLE = "Time-Continuous Audiovisual Fusion with Recurrence vs Attention for In-The-Wild Affect Recognition",
  BOOKTITLE = ABAW22,
  YEAR = "2022",
  PAGES = "2381--2390",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268397"
}

@inproceedings{bb273673,
  AUTHOR = "Yang, K. and Markovic, D. and Krenn, S. and Agrawal, V. and Richard, A.",
  TITLE = "Audio-Visual Speech Codecs: Rethinking Audio-Visual Speech Enhancement by Re-Synthesis",
  BOOKTITLE = CVPR22,
  YEAR = "2022",
  PAGES = "8217--8227",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268398"
}

@inproceedings{bb273674,
  AUTHOR = "Kim, M. and Hong, J. and Park, S.J. and Ro, Y.M.",
  TITLE = "Multi-modality Associative Bridging through Memory: Speech Sound Recollected from Face Video",
  BOOKTITLE = ICCV21,
  YEAR = "2021",
  PAGES = "296--306",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268399"
}

@inproceedings{bb273675,
  AUTHOR = "Li, J. and Kang, D. and Pei, W.J. and Zhe, X.F. and Zhang, Y. and He, Z.Y. and Bao, L.C.",
  TITLE = "Audio2Gestures: Generating Diverse Gestures from Speech Audio with Conditional Variational Autoencoders",
  BOOKTITLE = ICCV21,
  YEAR = "2021",
  PAGES = "11273--11282",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268400"
}

@inproceedings{bb273676,
  AUTHOR = "Ye, M. and You, Q.Z. and Ma, F.L.",
  TITLE = "QUALIFIER: Question-Guided Self-Attentive Multimodal Fusion Network for Audio Visual Scene-Aware Dialog",
  BOOKTITLE = WACV22,
  YEAR = "2022",
  PAGES = "2503--2511",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268401"
}

@inproceedings{bb273677,
  AUTHOR = "Yao, S. and Min, X.K. and Zhai, G.T.",
  TITLE = "Deep Audio-Visual Fusion Neural Network for Saliency Estimation",
  BOOKTITLE = ICIP21,
  YEAR = "2021",
  PAGES = "1604--1608",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268402"
}

@inproceedings{bb273678,
  AUTHOR = "Krishnamurthy, S.",
  TITLE = "Learning Self-supervised Audio-Visual Representations for Sound Recommendations",
  BOOKTITLE = ISVC21,
  YEAR = "2021",
  PAGES = "II:124--138",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268403"
}

@inproceedings{bb273679,
  AUTHOR = "Shi, W.J. and Pattichis, M.S. and Celedon Pattichis, S. and LopezLeiva, C.",
  TITLE = "Talking Detection in Collaborative Learning Environments",
  BOOKTITLE = CAIP21,
  YEAR = "2021",
  PAGES = "II:242--251",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268404"
}

@inproceedings{bb273680,
  AUTHOR = "Wang, G. and Chen, C.Z. and Fan, D.P. and Hao, A. and Qin, H.",
  TITLE = "From Semantic Categories to Fixations: A Novel Weakly-supervised Visual-auditory Saliency Detection Approach",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "15114--15123",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268405"
}

@inproceedings{bb273681,
  AUTHOR = "Wen, P.S. and Xu, Q.Q. and Jiang, Y.B. and Yang, Z.Y. and He, Y. and Huang, Q.M.",
  TITLE = "Seeking the Shape of Sound: An Adaptive Framework for Learning Voice-Face Association",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "16342--16351",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268406"
}

@inproceedings{bb273682,
  AUTHOR = "Monfort, M. and Jin, S. and Liu, A. and Harwath, D. and Feris, R.S. and Glass, J. and Oliva, A.",
  TITLE = "Spoken Moments: Learning Joint Audio-Visual Representations from Video Descriptions",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "14866--14876",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268407"
}

@inproceedings{bb273683,
  AUTHOR = "Tian, Y.P. and Xu, C.L.",
  TITLE = "Can audio-visual integration strengthen robustness under multimodal attacks?",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "5597--5607",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268408"
}

@inproceedings{bb273684,
  AUTHOR = "Morgado, P. and Vasconcelos, N.M. and Misra, I.",
  TITLE = "Audio-Visual Instance Discrimination with Cross-Modal Agreement",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "12470--12481",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268409"
}

@inproceedings{bb273685,
  AUTHOR = "Morgado, P. and Misra, I. and Vasconcelos, N.M.",
  TITLE = "Robust Audio-Visual Instance Discrimination",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "12929--12940",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268410"
}

@inproceedings{bb273686,
  AUTHOR = "Chen, Y.B. and Xian, Y.Q. and Koepke, A.S. and Shan, Y. and Akata, Z.",
  TITLE = "Distilling Audio-Visual Knowledge by Compositional Contrastive Learning",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "7012--7021",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268411"
}

@inproceedings{bb273687,
  AUTHOR = "Zhang, Z.M. and Li, L.C. and Ding, Y. and Fan, C.J.",
  TITLE = "Flow-guided One-shot Talking Face Generation with a High-resolution Audio-visual Dataset",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "3660--3669",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268412"
}

@inproceedings{bb273688,
  AUTHOR = "Gao, R.H. and Grauman, K.",
  TITLE = "VisualVoice: Audio-Visual Speech Separation with Cross-Modal Consistency",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "15490--15500",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268413"
}

@inproceedings{bb273689,
  AUTHOR = "Lee, J.Y. and Chung, S.W. and Kim, S. and Kang, H.G. and Sohn, K.H.",
  TITLE = "Looking into Your Speech: Learning Cross-modal Affinity for Audio-visual Speech Separation",
  BOOKTITLE = CVPR21,
  YEAR = "2021",
  PAGES = "1336--1345",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268414"
}

@inproceedings{bb273690,
  AUTHOR = "Mazumder, P. and Sing, P. and Parida, K.K. and Namboodiri, V.P.",
  TITLE = "AVGZSLNet: Audio-Visual Generalized Zero-Shot Learning by Reconstructing Label Features from Multi-Modal Embeddings",
  BOOKTITLE = WACV21,
  YEAR = "2021",
  PAGES = "3089--3098",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268415"
}

@inproceedings{bb273691,
  AUTHOR = "Ishikawa, R. and Hachiuma, R. and Kurobe, A. and Saito, H.",
  TITLE = "Single-modal Incremental Terrain Clustering from Self-Supervised Audio-Visual Feature Learning",
  BOOKTITLE = ICPR21,
  YEAR = "2021",
  PAGES = "9399--9406",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268416"
}

@inproceedings{bb273692,
  AUTHOR = "Madrigal, F. and Lerasle, F. and Pibre, L. and Ferrane, I.",
  TITLE = "Audio-Video detection of the active speaker in meetings",
  BOOKTITLE = ICPR21,
  YEAR = "2021",
  PAGES = "2536--2543",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268417"
}

@inproceedings{bb273693,
  AUTHOR = "Tellamekala, M.K. and Valstar, M. and Pound, M. and Giesbrecht, T.",
  TITLE = "Audio-Visual Predictive Coding for Self-Supervised Visual Representation Learning",
  BOOKTITLE = ICPR21,
  YEAR = "2021",
  PAGES = "9912--9919",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268418"
}

@inproceedings{bb273694,
  AUTHOR = "Liu, H. and Wang, Y. and Yang, B.",
  TITLE = "Mutual Alignment between Audiovisual Features for End-to-End Audiovisual Speech Recognition",
  BOOKTITLE = ICPR21,
  YEAR = "2021",
  PAGES = "5348--5353",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268419"
}

@inproceedings{bb273695,
  AUTHOR = "Liu, H. and Xu, W.L. and Yang, B.",
  TITLE = "Audio-Visual Speech Recognition Using A Two-Step Feature Fusion Strategy",
  BOOKTITLE = ICPR21,
  YEAR = "2021",
  PAGES = "1896--1903",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268420"
}

@inproceedings{bb273696,
  AUTHOR = "Liu, H. and Li, W.H. and Yang, B.",
  TITLE = "Robust Audio-Visual Speech Recognition Based on Hybrid Fusion",
  BOOKTITLE = ICPR21,
  YEAR = "2021",
  PAGES = "7580--7586",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268421"
}

@inproceedings{bb273697,
  AUTHOR = "Chao, F.Y. and Ozcinar, C. and Zhang, L. and Hamidouche, W. and Deforges, O. and Smolic, A.",
  TITLE = "Towards Audio-Visual Saliency Prediction for Omnidirectional Video with Spatial Audio",
  BOOKTITLE = VCIP20,
  YEAR = "2020",
  PAGES = "355--358",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268422"
}

@inproceedings{bb273698,
  AUTHOR = "Zhou, H. and Xu, X.D. and Lin, D. and Wang, X.G. and Liu, Z.W.",
  TITLE = "Sep-stereo: Visually Guided Stereophonic Audio Generation by Associating Source Separation",
  BOOKTITLE = ECCV20,
  YEAR = "2020",
  PAGES = "XII:52--69",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268423"
}

@inproceedings{bb273699,
  AUTHOR = "Tian, Y.P. and Li, D.Z. and Xu, C.L.",
  TITLE = "Unified Multisensory Perception: Weakly-supervised Audio-visual Video Parsing",
  BOOKTITLE = ECCV20,
  YEAR = "2020",
  PAGES = "III:436--454",
  BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268424"
}