@article{bb273600,
  author    = {Xiao, Y.W. and Liu, X.M. and Zhu, A. and Huang, J.},
  title     = {Relational-branchformer: Novel framework for audio-visual speech recognition},
  journal   = IVC,
  volume    = {149},
  year      = {2024},
  pages     = {105182},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT268325},
}

@article{bb273601,
        AUTHOR = "Li, W.R. and Wang, P. and Xiong, R.Q. and Fan, X.P.",
        TITLE = "Spiking {Tucker} Fusion Transformer for Audio-Visual Zero-Shot Learning",
        JOURNAL = IP,
        VOLUME = "33",
        YEAR = "2024",
        PAGES = "4840--4852",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268326"}

@article{bb273602,
        AUTHOR = "Li, K. and Xie, F. and Chen, H. and Yuan, K. and Hu, X.L.",
        TITLE = "An Audio-Visual Speech Separation Model Inspired by
Cortico-Thalamo-Cortical Circuits",
        JOURNAL = PAMI,
        VOLUME = "46",
        YEAR = "2024",
        NUMBER = "10",
        MONTH = oct,
        PAGES = "6637--6651",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268327"}

@article{bb273603,
        AUTHOR = "Zhou, J.X. and Guo, D. and Zhong, Y.R. and Wang, M.",
        TITLE = "Advancing Weakly-Supervised Audio-Visual Video Parsing via Segment-Wise
Pseudo Labeling",
        JOURNAL = IJCV,
        VOLUME = "132",
        YEAR = "2024",
        NUMBER = "11",
        MONTH = nov,
        PAGES = "5308--5329",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268328"}

@inproceedings{bb273604,
        AUTHOR = "Zhou, J.X. and Guo, D. and Mao, Y.X. and Zhong, Y. and Chang, X.J. and Wang, M.",
        TITLE = "Label-anticipated Event Disentanglement for Audio-visual Video Parsing",
        BOOKTITLE = ECCV24,
        YEAR = "2024",
        PAGES = "X:35--51",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268329"}

@article{bb273605,
        AUTHOR = "Liu, J. and Chen, S. and He, X.J. and Guo, L.T. and Zhu, X.X. and Wang, W.N. and Tang, J.H.",
        TITLE = "{VALOR}: Vision-Audio-Language Omni-Perception Pretraining Model and
Dataset",
        JOURNAL = PAMI,
        VOLUME = "47",
        YEAR = "2025",
        NUMBER = "2",
        MONTH = feb,
        PAGES = "708--724",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268330"}

@article{bb273606,
        AUTHOR = "Wang, Y. and Qian, X.H. and Zhou, W.",
        TITLE = "Transformer-Prompted Network: Efficient Audio-Visual Segmentation via
Transformer and Prompt Learning",
        JOURNAL = SPLetters,
        VOLUME = "32",
        YEAR = "2025",
        PAGES = "516--520",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268331"}

@article{bb273607,
        AUTHOR = "Steinmetz, N. and Balal, N.",
        TITLE = "Feasibility Study of Real-Time Speech Detection and Characterization
Using Millimeter-Wave Micro-{Doppler} Radar",
        JOURNAL = RS,
        VOLUME = "17",
        YEAR = "2025",
        NUMBER = "1",
        PAGES = "91",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268332"}

@article{bb273608,
        AUTHOR = "Shi, Z.F. and Wu, Q.B. and Meng, F.M. and Xu, L.F. and Li, H.L.",
        TITLE = "Cross-Modal Cognitive Consensus Guided Audio-Visual Segmentation",
        JOURNAL = MultMed,
        VOLUME = "27",
        YEAR = "2025",
        PAGES = "209--223",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268333"}

@article{bb273609,
  author    = {Zhang, J.X. and Wan, G. and Gao, J.Q. and Ling, Z.H.},
  title     = {Audio-visual representation learning via knowledge distillation from speech foundation models},
  journal   = PR,
  volume    = {162},
  year      = {2025},
  pages     = {111432},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT268334},
}

@article{bb273610,
        AUTHOR = "Chen, T.X. and Tan, Z.T. and Gong, T. and Chu, Q. and Wu, Y. and Liu, B. and Yu, N.H. and Lu, L. and Ye, J.P.",
        TITLE = "Bootstrapping Audio-Visual Video Segmentation by Strengthening Audio
Cues",
        JOURNAL = CirSysVideo,
        VOLUME = "35",
        YEAR = "2025",
        NUMBER = "3",
        MONTH = mar,
        PAGES = "2398--2409",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268335"}

@inproceedings{bb273611,
        AUTHOR = "Zhang, Y.H. and Yang, S. and Shan, S.G. and Chen, X.L.",
        TITLE = "{ES3}: Evolving Self-Supervised Learning of Robust Audio-Visual Speech
Representations",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "27059--27069",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268336"}

@inproceedings{bb273612,
        AUTHOR = "Yang, Q. and Nie, X. and Li, T. and Gao, P.F. and Guo, Y. and Zhen, C. and Yan, P.F. and Xiang, S.M.",
        TITLE = "Cooperation Does Matter: Exploring Multi-Order Bilateral Relations
for Audio-Visual Segmentation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "27124--27133",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268337"}

@inproceedings{bb273613,
        AUTHOR = "Xiong, J.W. and Zhang, P. and You, T. and Li, C.Y. and Huang, W. and Zha, Y.F.",
        TITLE = "{DiffSal}: Joint Audio and Video Learning for Diffusion Saliency
Prediction",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "27263--27273",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268338"}

@inproceedings{bb273614,
        AUTHOR = "Liu, C. and Li, P.P. and Yu, Q. and Sheng, H.W. and Wang, D.D. and Li, L. and Yu, X.",
        TITLE = "Benchmarking Audio Visual Segmentation for Long-Untrimmed Videos",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "22712--22722",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268339"}

@inproceedings{bb273615,
        AUTHOR = "Li, X. and Wang, J.L. and Xu, X.H. and Peng, X.L. and Singh, R. and Lu, Y. and Raj, B.",
        TITLE = "{QDFormer}: Towards Robust Audiovisual Segmentation in Complex
Environments with Quantization-based Semantic Decomposition",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "3402--3413",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268340"}

@inproceedings{bb273616,
        AUTHOR = "Singh, N. and Wu, C.W. and Orife, I. and Kalayeh, M.",
        TITLE = "Looking Similar, Sounding Different: Leveraging Counterfactual
Cross-Modal Pairs for Audiovisual Representation Learning",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26897--26908",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268341"}

@inproceedings{bb273617,
        AUTHOR = "Liu, J.X. and Liu, Y.K. and Zhang, F. and Ju, C. and Zhang, Y. and Wang, Y.F.",
        TITLE = "Audio-Visual Segmentation via Unlabeled Frame Exploitation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26318--26329",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268342"}

@inproceedings{bb273618,
        AUTHOR = "Jia, W.Q. and Liu, M. and Jiang, H. and Ananthabhotla, I. and Rehg, J.M. and Ithapu, V.K. and Gao, R.H.",
        TITLE = "The Audio-Visual Conversational Graph: From an Egocentric-Exocentric
Perspective",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26386--26395",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268343"}

@inproceedings{bb273619,
        AUTHOR = "Chen, Y.H. and Liu, Y. and Wang, H. and Liu, F. and Wang, C. and Frazer, H. and Carneiro, G.",
        TITLE = "Unraveling Instance Associations: A Closer Look for Audio-Visual
Segmentation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26487--26497",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268344"}

@inproceedings{bb273620,
        AUTHOR = "Guo, Y.X. and Sun, S.Y. and Ma, S. and Zheng, K. and Bao, X.Y. and Ma, S.J. and Zou, W. and Zheng, Y.",
        TITLE = "{CrossMAE}: Cross-Modality Masked Autoencoders for Region-Aware
Audio-Visual Pre-Training",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "26711--26721",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268345"}

@inproceedings{bb273621,
        AUTHOR = "Mo, S.T. and Morgado, P.",
        TITLE = "Unveiling the Power of Audio-Visual Early Fusion Transformers with
Dense Interactions Through Masked Modeling",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "27176--27186",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268346"}

@inproceedings{bb273622,
        AUTHOR = "Wang, K. and Tian, Y. and Hatzinakos, D.",
        TITLE = "Towards Efficient Audio-Visual Learners via Empowering Pre-trained
Vision Transformers with Cross-Modal Adaptation",
        BOOKTITLE = WhatNext24,
        YEAR = "2024",
        PAGES = "1837--1846",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268347"}

@inproceedings{bb273623,
        AUTHOR = "Ryumina, E. and Markitantov, M. and Ryumin, D. and Kaya, H. and Karpov, A.",
        TITLE = "Zero-Shot Audio-Visual Compound Expression Recognition Method based
on Emotion Probability Fusion",
        BOOKTITLE = ABAW24,
        YEAR = "2024",
        PAGES = "4752--4760",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268348"}

@inproceedings{bb273624,
        AUTHOR = "Mahmud, T. and Mo, S.T. and Tian, Y. and Marculescu, D.",
        TITLE = "{MA-AVT}: Modality Alignment for Parameter-Efficient Audio-Visual
Transformers",
        BOOKTITLE = ECV24,
        YEAR = "2024",
        PAGES = "7996--8005",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268349"}

@inproceedings{bb273625,
        AUTHOR = "Yang, Z. and Lin, J. and Chen, P.H. and Cherian, A. and Marks, T.K. and Le Roux, J. and Gan, C.",
        TITLE = "{RILA}: Reflective and Imaginative Language Agent for Zero-Shot
Semantic Audio-Visual Navigation",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "16251--16261",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268350"}

@inproceedings{bb273626,
        AUTHOR = "Dai, Y.S. and Chen, H. and Du, J. and Wang, R. and Chen, S.H. and Wang, H.T. and Lee, C.H.",
        TITLE = "A Study of Dropout-Induced Modality Bias on Robustness to Missing
Video Frames for Audio-Visual Speech Recognition",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "27435--27445",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268351"}

@inproceedings{bb273627,
        AUTHOR = "Galland, L. and Pelachaud, C. and Pecune, F.",
        TITLE = "Seeing and Hearing What Has Not Been Said: A multimodal client
behavior classifier in Motivational Interviewing with interpretable
fusion",
        BOOKTITLE = FG24,
        YEAR = "2024",
        PAGES = "1--9",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268352"}

@inproceedings{bb273628,
        AUTHOR = "Praveen, R.G. and Alam, J.",
        TITLE = "Audio-Visual Person Verification Based on Recursive Fusion of Joint
Cross-Attention",
        BOOKTITLE = FG24,
        YEAR = "2024",
        PAGES = "1--5",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268353"}

@inproceedings{bb273629,
        AUTHOR = "Praveen, R.G. and Alam, J.",
        TITLE = "Dynamic Cross Attention for Audio-Visual Person Verification",
        BOOKTITLE = FG24,
        YEAR = "2024",
        PAGES = "1--5",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268354"}

@inproceedings{bb273630,
        AUTHOR = "He, Y.H. and Shin, S. and Cherian, A. and Trigoni, N. and Markham, A.",
        TITLE = "{Sound3DVDet}: {3D} Sound Source Detection using Multiview Microphone
Array and {RGB} Images",
        BOOKTITLE = WACV24,
        YEAR = "2024",
        PAGES = "5484--5495",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268355"}

@inproceedings{bb273631,
        AUTHOR = "Ghaleb, E. and Burenko, I. and Rasenberg, M. and Pouw, W. and Uhrig, P. and Holler, J. and Toni, I. and Ozyurek, A. and Fernandez, R.",
        TITLE = "Co-Speech Gesture Detection through Multi-Phase Sequence Labeling",
        BOOKTITLE = WACV24,
        YEAR = "2024",
        PAGES = "3995--4003",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268356"}

@inproceedings{bb273632,
        AUTHOR = "Liu, J.X. and Wang, Y. and Ju, C. and Ma, C.F. and Zhang, Y. and Xie, W.",
        TITLE = "Annotation-free Audio-Visual Segmentation",
        BOOKTITLE = WACV24,
        YEAR = "2024",
        PAGES = "5592--5602",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268357"}

@inproceedings{bb273633,
        AUTHOR = "Xu, Y.T. and Hu, C.H. and Lee, G.H.",
        TITLE = "Rethink Cross-Modal Fusion in Weakly-Supervised Audio-Visual Video
Parsing",
        BOOKTITLE = WACV24,
        YEAR = "2024",
        PAGES = "5603--5612",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268358"}

@inproceedings{bb273634,
        AUTHOR = "Rachavarapu, K.K. and Ramakrishnan, K. and Rajagopalan, A.N.",
        TITLE = "Weakly-Supervised Audio-Visual Video Parsing with Prototype-Based
Pseudo-Labeling",
        BOOKTITLE = CVPR24,
        YEAR = "2024",
        PAGES = "18952--18962",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268359"}

@inproceedings{bb273635,
        AUTHOR = "Rachavarapu, K.K. and Rajagopalan, A.N.",
        TITLE = "Boosting Positive Segments for Weakly-Supervised Audio-Visual Video
Parsing",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "10158--10168",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268360"}

@inproceedings{bb273636,
        AUTHOR = "Chen, J. and Wang, W.G. and Liu, S. and Li, H.S. and Yang, Y.",
        TITLE = "Omnidirectional Information Gathering for Knowledge Transfer-based
Audio-Visual Navigation",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "10959--10969",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268361"}

@inproceedings{bb273637,
        AUTHOR = "Cheng, X. and Jin, T. and Huang, R.J. and Li, L.J. and Lin, W. and Wang, Z. and Wang, Y. and Liu, H. and Yin, A. and Zhao, Z.",
        TITLE = "{MixSpeech}: Cross-Modality Self-Learning with Audio-Visual Stream
Mixup for Visual Speech Translation and Recognition",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "15689--15699",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268362"}

@inproceedings{bb273638,
        AUTHOR = "Georgescu, M.I. and Fonseca, E. and Ionescu, R.T. and Lucic, M. and Schmid, C. and Arnab, A.",
        TITLE = "Audiovisual Masked Autoencoders",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "16098--16108",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268363"}

@inproceedings{bb273639,
        AUTHOR = "Chen, M.F. and Su, K. and Shlizerman, E.",
        TITLE = "Be Everywhere - Hear Everything ({BEE}): Audio Scene Reconstruction by
Sparse Audio-Visual Samples",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "7819--7828",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268364"}

@inproceedings{bb273640,
        AUTHOR = "Xie, H.X. and Lee, M.X. and Chen, T.J. and Chen, H.J. and Liu, H.I. and Shuai, H.H. and Cheng, W.H.",
        TITLE = "Most Important Person-guided Dual-branch Cross-Patch Attention for
Group Affect Recognition",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "20541--20551",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268365"}

@inproceedings{bb273641,
        AUTHOR = "Djilali, Y.A.D. and Narayan, S. and Boussaid, H. and Almazrouei, E. and Debbah, M.",
        TITLE = "{Lip2Vec}: Efficient and Robust Visual Speech Recognition via
Latent-to-Latent Visual to Audio Representation Mapping",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "13744--13755",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268366"}

@inproceedings{bb273642,
        AUTHOR = "Chen, G.Y. and Zhang, D. and Liu, T. and Du, X.Y.",
        TITLE = "Local-Global Contrast for Learning Voice-Face Representations",
        BOOKTITLE = ICIP23,
        YEAR = "2023",
        PAGES = "51--55",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268367"}

@inproceedings{bb273643,
        AUTHOR = "Hong, J. and Kim, M. and Choi, J. and Ro, Y.M.",
        TITLE = "Watch or Listen: Robust Audio-Visual Speech Recognition with Visual
Corruption Modeling and Reliability Scoring",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "18783--18794",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268368"}

@inproceedings{bb273644,
        AUTHOR = "Gao, J.Y. and Chen, M.Y. and Xu, C.S.",
        TITLE = "Collecting Cross-Modal Presence-Absence Evidence for
Weakly-Supervised Audio-Visual Event Perception",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "18827--18836",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268369"}

@inproceedings{bb273645,
        AUTHOR = "Porgali, B. and Albiero, V. and Ryda, J. and Ferrer, C.C. and Hazirbas, C.",
        TITLE = "The {Casual Conversations} v2 Dataset: A diverse, large benchmark for
measuring fairness and robustness in audio/vision/speech models",
        BOOKTITLE = FaDE-TCV23,
        YEAR = "2023",
        PAGES = "10--17",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268370"}

@inproceedings{bb273646,
        AUTHOR = "Xiong, J.W. and Wang, G. and Zhang, P. and Huang, W. and Zha, Y.F. and Zhai, G.T.",
        TITLE = "{CASP-Net}: Rethinking Video Saliency Prediction from an Audio-Visual
Consistency Perceptual Perspective",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "6441--6450",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268371"}

@inproceedings{bb273647,
        AUTHOR = "Huang, C. and Tian, Y. and Kumar, A. and Xu, C.L.",
        TITLE = "Egocentric Audio-Visual Object Localization",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "22910--22921",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268372"}

@inproceedings{bb273648,
        AUTHOR = "Liao, J.H. and Duan, H.H. and Feng, K.H. and Zhao, W.B. and Yang, Y.B. and Chen, L.Y.",
        TITLE = "A Light Weight Model for Active Speaker Detection",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "22932--22941",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268373"}

@inproceedings{bb273649,
        AUTHOR = "Seo, P.H. and Nagrani, A. and Schmid, C.",
        TITLE = "{AVFormer}: Injecting Vision into Frozen Speech Models for Zero-Shot
{AV-ASR}",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "22922--22931",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268374"}

% NOTE(review): BOOKTITLE below is a quoted literal, whereas sibling entries use
% bare string macros (e.g. ICIP22). Confirm whether an ICPR22 macro is defined
% before switching; the value is left unchanged here.
@inproceedings{bb273650,
        AUTHOR = "Feng, D. and Yang, S. and Shan, S.G. and Chen, X.L.",
        TITLE = "Audio-Driven Deformation Flow for Effective Lip Reading",
        BOOKTITLE = "ICPR22",
        YEAR = "2022",
        PAGES = "274--280",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268375"}

% NOTE(review): BOOKTITLE below is a quoted literal, whereas sibling entries use
% bare string macros (e.g. ICIP22). Confirm whether an ICPR22 macro is defined
% before switching; the value is left unchanged here.
@inproceedings{bb273651,
        AUTHOR = "Varshney, M. and Yadav, R. and Namboodiri, V.P. and Hegde, R.M.",
        TITLE = "Learning Speaker-specific Lip-to-Speech Generation",
        BOOKTITLE = "ICPR22",
        YEAR = "2022",
        PAGES = "491--498",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268376"}

@inproceedings{bb273652,
        AUTHOR = "Shi, C. and Yang, S.",
        TITLE = "Spatial and Visual Perspective-Taking via View Rotation and Relation
Reasoning for Embodied Reference Understanding",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVI:201--218",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268377"}

@inproceedings{bb273653,
        AUTHOR = "Hayes, T. and Zhang, S.Y. and Yin, X. and Pang, G. and Sheng, S. and Yang, H. and Ge, S.W. and Hu, Q.Y. and Parikh, D.",
        TITLE = "{MUGEN}: A Playground for Video-Audio-Text Multimodal Understanding and
{GENeration}",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "VIII:431--449",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268378"}

@inproceedings{bb273654,
        AUTHOR = "van Horn, G. and Qian, R. and Wilber, K. and Adam, H. and Mac Aodha, O. and Belongie, S.",
        TITLE = "Exploring Fine-Grained Audiovisual Categorization with the {SSW60}
Dataset",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "VIII:271--289",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268379"}

@inproceedings{bb273655,
        AUTHOR = "Yu, S. and Wu, P. and Liang, P.P. and Salakhutdinov, R. and Morency, L.P.",
        TITLE = "{PACS}: A Dataset for Physical Audiovisual {CommonSense} Reasoning",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVII:292--309",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268380"}

@inproceedings{bb273656,
        AUTHOR = "Cheng, H.Y. and Liu, Z.Y. and Zhou, H. and Qian, C. and Wu, W. and Wang, L.M.",
        TITLE = "Joint-Modal Label Denoising for Weakly-Supervised Audio-Visual Video
Parsing",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXIV:431--448",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268381"}

@inproceedings{bb273657,
        AUTHOR = "Zhang, Z.Q. and Zhang, J. and Zhang, J.S. and Wu, M.H. and Fang, X. and Dai, L.R.",
        TITLE = "Learning Contextually Fused Audio-Visual Representations for
Audio-Visual Speech Recognition",
        BOOKTITLE = ICIP22,
        YEAR = "2022",
        PAGES = "1346--1350",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268382"}

@inproceedings{bb273658,
        AUTHOR = "Mo, S.T. and Morgado, P.",
        TITLE = "Localizing Visual Sounds the Easy Way",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVII:218--234",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268383"}

@inproceedings{bb273659,
        AUTHOR = "Montesinos, J.F. and Kadandale, V.S. and Haro, G.",
        TITLE = "{VoViT}: Low Latency Graph-Based Audio-Visual Voice Separation
Transformer",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVII:310--326",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268384"}

@inproceedings{bb273660,
        AUTHOR = "Tzinis, E. and Wisdom, S. and Remez, T. and Hershey, J.R.",
        TITLE = "{AudioScopeV2}: Audio-Visual Attention Architectures for Calibrated
Open-Domain On-Screen Sound Separation",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVII:368--385",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268385"}

@inproceedings{bb273661,
        AUTHOR = "Zhou, J.X. and Wang, J.Y. and Zhang, J.Y. and Sun, W.X. and Zhang, J. and Birchfield, S. and Guo, D. and Kong, L.P. and Wang, M. and Zhong, Y.",
        TITLE = "Audio-Visual Segmentation",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVII:386--403",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268386"}

@inproceedings{bb273662,
        AUTHOR = "Alcazar, J.L. and Cordes, M. and Zhao, C. and Ghanem, B.",
        TITLE = "End-to-End Active Speaker Detection",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVII:126--143",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268387"}

@inproceedings{bb273663,
        AUTHOR = "Chen, C.G. and Gao, R.H. and Calamia, P. and Grauman, K.",
        TITLE = "Visual Acoustic Matching",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "18836--18846",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268388"}

@inproceedings{bb273664,
        AUTHOR = "Lee, S. and Kim, H.I. and Ro, Y.M.",
        TITLE = "Weakly Paired Associative Learning for Sound and Image
Representations via Bimodal Associative Memory",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "10524--10533",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268389"}

@inproceedings{bb273665,
        AUTHOR = "Vasudevan, A.B. and Dai, D.X. and Van Gool, L.J.",
        TITLE = "Sound and Visual Representation Learning with Multiple Pretraining
Tasks",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "14596--14606",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268390"}

@inproceedings{bb273666,
        AUTHOR = "Xia, Y. and Zhao, Z.",
        TITLE = "Cross-modal Background Suppression for Audio-Visual Event
Localization",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "19957--19966",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268391"}

@inproceedings{bb273667,
        AUTHOR = "Jiang, H. and Murdock, C. and Ithapu, V.K.",
        TITLE = "Egocentric Deep Multi-Channel Audio-Visual Active Speaker
Localization",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "10534--10542",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268392"}

@inproceedings{bb273668,
        AUTHOR = "Ng, E. and Joo, H. and Hu, L.W. and Li, H. and Darrell, T.J. and Kanazawa, A. and Ginosar, S.",
        TITLE = "Learning to Listen: Modeling Non-Deterministic Dyadic Facial Motion",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "20363--20373",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268393"}

@inproceedings{bb273669,
        AUTHOR = "Kurzendorfer, D. and Mercea, O.B. and Koepke, A.S. and Akata, Z.",
        TITLE = "Audio-Visual Generalized Zero-Shot Learning using Pre-Trained Large
Multi-Modal Models",
        BOOKTITLE = L3D-IVU24,
        YEAR = "2024",
        PAGES = "2627--2638",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268394"}

@inproceedings{bb273670,
        AUTHOR = "Mercea, O.B. and Hummel, T. and Koepke, A.S. and Akata, Z.",
        TITLE = "Temporal and Cross-modal Attention for Audio-Visual Zero-Shot Learning",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XX:488--505",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268395"}

@inproceedings{bb273671,
        AUTHOR = "Mercea, O.B. and Riesch, L. and Koepke, A.S. and Akata, Z.",
        TITLE = "Audiovisual Generalised Zero-shot Learning with Cross-modal Attention
and Language",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "10543--10553",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268396"}

@inproceedings{bb273672,
        AUTHOR = "Karas, V. and Tellamekala, M.K. and Mallol Ragolta, A. and Valstar, M. and Schuller, B.W.",
        TITLE = "Time-Continuous Audiovisual Fusion with Recurrence vs Attention for
In-The-Wild Affect Recognition",
        BOOKTITLE = ABAW22,
        YEAR = "2022",
        PAGES = "2381--2390",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268397"}

@inproceedings{bb273673,
        AUTHOR = "Yang, K. and Markovic, D. and Krenn, S. and Agrawal, V. and Richard, A.",
        TITLE = "Audio-Visual Speech Codecs: Rethinking Audio-Visual Speech
Enhancement by Re-Synthesis",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "8217--8227",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268398"}

@inproceedings{bb273674,
        AUTHOR = "Kim, M. and Hong, J. and Park, S.J. and Ro, Y.M.",
        TITLE = "Multi-modality Associative Bridging through Memory:
Speech Sound Recollected from Face Video",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "296--306",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268399"}

@inproceedings{bb273675,
        AUTHOR = "Li, J. and Kang, D. and Pei, W.J. and Zhe, X.F. and Zhang, Y. and He, Z.Y. and Bao, L.C.",
        TITLE = "{Audio2Gestures}: Generating Diverse Gestures from Speech Audio with
Conditional Variational Autoencoders",
        BOOKTITLE = ICCV21,
        YEAR = "2021",
        PAGES = "11273--11282",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268400"}

@inproceedings{bb273676,
        AUTHOR = "Ye, M. and You, Q.Z. and Ma, F.L.",
        TITLE = "{QUALIFIER}: Question-Guided Self-Attentive Multimodal Fusion Network
for Audio Visual Scene-Aware Dialog",
        BOOKTITLE = WACV22,
        YEAR = "2022",
        PAGES = "2503--2511",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268401"}

@inproceedings{bb273677,
        AUTHOR = "Yao, S. and Min, X.K. and Zhai, G.T.",
        TITLE = "Deep Audio-Visual Fusion Neural Network for Saliency Estimation",
        BOOKTITLE = ICIP21,
        YEAR = "2021",
        PAGES = "1604--1608",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268402"}

@inproceedings{bb273678,
        AUTHOR = "Krishnamurthy, S.",
        TITLE = "Learning Self-supervised Audio-Visual Representations for Sound
Recommendations",
        BOOKTITLE = ISVC21,
        YEAR = "2021",
        PAGES = "II:124--138",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268403"}

@inproceedings{bb273679,
        AUTHOR = "Shi, W.J. and Pattichis, M.S. and Celedon Pattichis, S. and LopezLeiva, C.",
        TITLE = "Talking Detection in Collaborative Learning Environments",
        BOOKTITLE = CAIP21,
        YEAR = "2021",
        PAGES = "II:242--251",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268404"}

@inproceedings{bb273680,
        AUTHOR = "Wang, G. and Chen, C.Z. and Fan, D.P. and Hao, A. and Qin, H.",
        TITLE = "From Semantic Categories to Fixations: A Novel Weakly-supervised
Visual-auditory Saliency Detection Approach",
        BOOKTITLE = CVPR21,
        YEAR = "2021",
        PAGES = "15114--15123",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268405"}

@inproceedings{bb273681,
        AUTHOR = "Wen, P.S. and Xu, Q.Q. and Jiang, Y.B. and Yang, Z.Y. and He, Y. and Huang, Q.M.",
        TITLE = "Seeking the Shape of Sound:
An Adaptive Framework for Learning Voice-Face Association",
        BOOKTITLE = CVPR21,
        YEAR = "2021",
        PAGES = "16342--16351",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268406"}

@inproceedings{bb273682,
        AUTHOR = "Monfort, M. and Jin, S. and Liu, A. and Harwath, D. and Feris, R.S. and Glass, J. and Oliva, A.",
        TITLE = "{Spoken Moments}: Learning Joint Audio-Visual Representations from
Video Descriptions",
        BOOKTITLE = CVPR21,
        YEAR = "2021",
        PAGES = "14866--14876",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268407"}

@inproceedings{bb273683,
  author    = {Tian, Y.P. and Xu, C.L.},
  title     = {Can audio-visual integration strengthen robustness under multimodal attacks?},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {5597-5607},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT268408}
}

@inproceedings{bb273684,
  author    = {Morgado, P. and Vasconcelos, N.M. and Misra, I.},
  title     = {Audio-Visual Instance Discrimination with Cross-Modal Agreement},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {12470-12481},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT268409}
}

@inproceedings{bb273685,
  author    = {Morgado, P. and Misra, I. and Vasconcelos, N.M.},
  title     = {Robust Audio-Visual Instance Discrimination},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {12929-12940},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT268410}
}

@inproceedings{bb273686,
  author    = {Chen, Y.B. and Xian, Y.Q. and Koepke, A.S. and Shan, Y. and Akata, Z.},
  title     = {Distilling Audio-Visual Knowledge by Compositional Contrastive Learning},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {7012-7021},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT268411}
}

@inproceedings{bb273687,
  author    = {Zhang, Z.M. and Li, L.C. and Ding, Y. and Fan, C.J.},
  title     = {Flow-guided One-shot Talking Face Generation with a High-resolution Audio-visual Dataset},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {3660-3669},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT268412}
}

@inproceedings{bb273688,
  author    = {Gao, R.H. and Grauman, K.},
  title     = {VisualVoice: Audio-Visual Speech Separation with Cross-Modal Consistency},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {15490-15500},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT268413}
}

@inproceedings{bb273689,
  author    = {Lee, J.Y. and Chung, S.W. and Kim, S. and Kang, H.G. and Sohn, K.H.},
  title     = {Looking into Your Speech: Learning Cross-modal Affinity for Audio-visual Speech Separation},
  booktitle = CVPR21,
  year      = {2021},
  pages     = {1336-1345},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT268414}
}

@comment{bb273690: second author surname corrected to Singh (P. Singh), following the published WACV 2021 author list. NOTE(review): confirm against the proceedings.}
@inproceedings{bb273690,
        AUTHOR = "Mazumder, P. and Singh, P. and Parida, K.K. and Namboodiri, V.P.",
        TITLE = "AVGZSLNet: Audio-Visual Generalized Zero-Shot Learning by
Reconstructing Label Features from Multi-Modal Embeddings",
        BOOKTITLE = WACV21,
        YEAR = "2021",
        PAGES = "3089-3098",
        BIBSOURCE = "http://www.visionbib.com/bibliography/people916.html#TT268415"}

@inproceedings{bb273691,
  author    = {Ishikawa, R. and Hachiuma, R. and Kurobe, A. and Saito, H.},
  title     = {Single-modal Incremental Terrain Clustering from Self-Supervised Audio-Visual Feature Learning},
  booktitle = ICPR21,
  year      = {2021},
  pages     = {9399-9406},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT268416}
}

@inproceedings{bb273692,
  author    = {Madrigal, F. and Lerasle, F. and Pibre, L. and Ferrane, I.},
  title     = {Audio-Video detection of the active speaker in meetings},
  booktitle = ICPR21,
  year      = {2021},
  pages     = {2536-2543},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT268417}
}

@inproceedings{bb273693,
  author    = {Tellamekala, M.K. and Valstar, M. and Pound, M. and Giesbrecht, T.},
  title     = {Audio-Visual Predictive Coding for Self-Supervised Visual Representation Learning},
  booktitle = ICPR21,
  year      = {2021},
  pages     = {9912-9919},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT268418}
}

@inproceedings{bb273694,
  author    = {Liu, H. and Wang, Y. and Yang, B.},
  title     = {Mutual Alignment between Audiovisual Features for End-to-End Audiovisual Speech Recognition},
  booktitle = ICPR21,
  year      = {2021},
  pages     = {5348-5353},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT268419}
}

@inproceedings{bb273695,
  author    = {Liu, H. and Xu, W.L. and Yang, B.},
  title     = {Audio-Visual Speech Recognition Using A Two-Step Feature Fusion Strategy},
  booktitle = ICPR21,
  year      = {2021},
  pages     = {1896-1903},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT268420}
}

@inproceedings{bb273696,
  author    = {Liu, H. and Li, W.H. and Yang, B.},
  title     = {Robust Audio-Visual Speech Recognition Based on Hybrid Fusion},
  booktitle = ICPR21,
  year      = {2021},
  pages     = {7580-7586},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT268421}
}

@inproceedings{bb273697,
  author    = {Chao, F.Y. and Ozcinar, C. and Zhang, L. and Hamidouche, W. and Deforges, O. and Smolic, A.},
  title     = {Towards Audio-Visual Saliency Prediction for Omnidirectional Video with Spatial Audio},
  booktitle = VCIP20,
  year      = {2020},
  pages     = {355-358},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT268422}
}

@inproceedings{bb273698,
  author    = {Zhou, H. and Xu, X.D. and Lin, D. and Wang, X.G. and Liu, Z.W.},
  title     = {Sep-stereo: Visually Guided Stereophonic Audio Generation by Associating Source Separation},
  booktitle = ECCV20,
  year      = {2020},
  pages     = {XII: 52-69},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT268423}
}

@inproceedings{bb273699,
  author    = {Tian, Y.P. and Li, D.Z. and Xu, C.L.},
  title     = {Unified Multisensory Perception: Weakly-supervised Audio-visual Video Parsing},
  booktitle = ECCV20,
  year      = {2020},
  pages     = {III:436-454},
  bibsource = {http://www.visionbib.com/bibliography/people916.html#TT268424}
}

Last update: Mar 29, 2025 at 10:46:14