Keith Price Bibliography Bibtex Entry (ANCHOR 237500 URL http://dx.doi.org/10.1109/CVPRW63382.2024.00754 TYPE CONFERENCE PAGES 7587-7597 YEAR 2024 MONTH NIL BIBSOURCE http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232465 VOLUME NIL JOURNAL OpenSUN3D24 AUTHOR Qian, S.Y. and Chen, W.F. and Bai, M. and Zhou, X. and Tu, Z.W. and Li, L.E. TITLE AffordanceLLM: Grounding Affordance from Vision Language Models)


@inproceedings{bb237500,
        AUTHOR = "Qian, S. Y. and Chen, W. F. and Bai, M. and Zhou, X. and Tu, Z. W. and Li, L. E.",
        TITLE = "{AffordanceLLM}: Grounding Affordance from Vision Language Models",
        BOOKTITLE = OpenSUN3D24,
        YEAR = "2024",
        PAGES = "7587--7597",
        DOI = "10.1109/CVPRW63382.2024.00754",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232465"}

@inproceedings{bb237501,
        AUTHOR = "Miyanishi, T. and Azuma, D. and Kurita, S. and Kawanabe, M.",
        TITLE = "{Cross3DVG}: Cross-Dataset {3D} Visual Grounding on Different {RGB-D} Scans",
        BOOKTITLE = "3DV24",
        YEAR = "2024",
        PAGES = "717--727",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232466"}

@inproceedings{bb237502,
        AUTHOR = "Gong, R. and Huang, J. Y. and Zhao, Y. Z. and Geng, H. R. and Gao, X. F. and Wu, Q. Y. and Ai, W. and Zhou, Z. H. and Terzopoulos, D. and Zhu, S. C. and Jia, B. X. and Huang, S. Y.",
        TITLE = "{ARNOLD}: A Benchmark for Language-Grounded Task Learning With
Continuous States in Realistic {3D} Scenes",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "20426--20438",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232467"}

@inproceedings{bb237503,
        AUTHOR = "Wu, Y. and Wei, Y. and Wang, H. Z. and Liu, Y. F. and Yang, S. and He, X. M.",
        TITLE = "Grounded Image Text Matching with Mismatched Relation Reasoning",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "2964--2975",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232468"}

@inproceedings{bb237504,
        AUTHOR = "Lee, C. and Kumar, M. G. and Tan, C.",
        TITLE = "{DetermiNet}: A Large-Scale Diagnostic Dataset for Complex
Visually-Grounded Referencing using Determiners",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "19962--19971",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232469"}

@inproceedings{bb237505,
        AUTHOR = "Lin, K. Q. and Zhang, P. and Chen, J. and Pramanick, S. and Gao, D. F. and Wang, A. J. P. and Yan, R. and Shou, M. Z.",
        TITLE = "{UniVTG}: Towards Unified Video-Language Temporal Grounding",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "2782--2792",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232470"}

@inproceedings{bb237506,
        AUTHOR = "Liu, Y. and Zhang, J. H. and Chen, Q. C. and Peng, Y. X.",
        TITLE = "Confidence-aware Pseudo-label Learning for Weakly Supervised Visual
Grounding",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "2816--2826",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232471"}

@inproceedings{bb237507,
        AUTHOR = "Khoshsirat, S. and Kambhamettu, C.",
        TITLE = "Sentence Attention Blocks for Answer Grounding",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "6057--6067",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232472"}

@inproceedings{bb237508,
        AUTHOR = "Li, H. X. and Cao, M. and Cheng, X. and Li, Y. W. and Zhu, Z. H. and Zou, Y. X.",
        TITLE = "{G2L}: Semantically Aligned and Uniform Video Grounding via Geodesic
and Game Theory",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "11998--12008",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232473"}

@inproceedings{bb237509,
        AUTHOR = "Li, H. and Shu, X. J. and He, S. and Qiao, R. Z. and Wen, W. and Guo, T. and Gan, B. and Sun, X.",
        TITLE = "{D3G}: Exploring {Gaussian} Prior for Temporal Sentence Grounding with
Glance Annotation",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "13688--13700",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232474"}

@inproceedings{bb237510,
        AUTHOR = "Pan, Y. L. and He, X. T. and Gong, B. and Lv, Y. L. and Shen, Y. J. and Peng, Y. X. and Zhao, D. L.",
        TITLE = "Scanning Only Once: An End-to-end Framework for Fast Temporal
Grounding in Long Videos",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "13721--13731",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232475"}

@inproceedings{bb237511,
        AUTHOR = "Jang, J. and Park, J. and Kim, J. and Kwon, H. and Sohn, K. H.",
        TITLE = "Knowing Where to Focus: Event-aware Transformer for Video Grounding",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "13800--13810",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232476"}

% NOTE(review): the page range below starts and ends on the same page
% (15179), which looks like a truncated range in the upstream listing.
% Verify against the ICCV 2023 proceedings before relying on it.
@inproceedings{bb237512,
        AUTHOR = "Zhang, Y. M. and Gong, Z. and Chang, A. X.",
        TITLE = "{Multi3DRefer}: Grounding Text Description to Multiple {3D} Objects",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "15179--15179",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232477"}

@inproceedings{bb237513,
        AUTHOR = "Li, H. and Wei, P. and Ma, Z. and Zheng, N. N.",
        TITLE = "Inverse Compositional Learning for Weakly-supervised Relation
Grounding",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "15431--15441",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232478"}

@inproceedings{bb237514,
        AUTHOR = "Chen, D. Z. Y. and Hu, R. H. and Chen, X. L. and Nie{\ss}ner, M. and Chang, A. X.",
        TITLE = "{UniT3D}: A Unified Transformer for {3D} Dense Captioning and Visual
Grounding",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "18063--18073",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232479"}

@inproceedings{bb237515,
        AUTHOR = "de la Jara, I. M. and Rodriguez Opazo, C. and Marrese Taylor, E. and Bravo Marquez, F.",
        TITLE = "An empirical study of the effect of video encoders on Temporal Video
Grounding",
        BOOKTITLE = CLVL23,
        YEAR = "2023",
        PAGES = "2842--2847",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232480"}

@inproceedings{bb237516,
        AUTHOR = "Wang, Z. and Huang, H. F. and Zhao, Y. and Li, L. J. and Cheng, X. Z. and Zhu, Y. C. and Yin, A. and Zhao, Z.",
        TITLE = "Distilling Coarse-to-Fine Semantic Matching Knowledge for Weakly
Supervised {3D} Visual Grounding",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "2662--2671",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232481"}

@inproceedings{bb237517,
        AUTHOR = "Guo, Z. and Tang, Y. W. and Zhang, R. and Wang, D. and Wang, Z. G. and Zhao, B. and Li, X. L.",
        TITLE = "{ViewRefer}: Grasp the Multi-view Knowledge for {3D} Visual Grounding",
        BOOKTITLE = ICCV23,
        YEAR = "2023",
        PAGES = "15326--15337",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232482"}

@inproceedings{bb237518,
        AUTHOR = "Hsu, J. and Mao, J. Y. and Wu, J. J.",
        TITLE = "{NS3D}: Neuro-Symbolic Grounding of {3D} Objects and Relations",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "2614--2623",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232483"}

@inproceedings{bb237519,
        AUTHOR = "Yi, J. and Uzkent, B. and Ignat, O. and Li, Z. L. and Garg, A. and Yu, X. and Liu, L.",
        TITLE = "Augment the Pairs: Semantics-Preserving Image-Caption Pair
Augmentation for Grounding-Based Vision and Language Models",
        BOOKTITLE = WACV24,
        YEAR = "2024",
        PAGES = "5508--5518",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232484"}

@inproceedings{bb237520,
        AUTHOR = "Uzkent, B. and Garg, A. and Zhu, W. T. and Doshi, K. and Yi, J. and Wang, X. L. and Omar, M.",
        TITLE = "Dynamic Inference with Grounding Based Vision and Language Models",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "2624--2633",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232485"}

@inproceedings{bb237521,
        AUTHOR = "Cao, M. and Wei, F. Y. and Xu, C. and Geng, X. and Chen, L. and Zhang, C. and Zou, Y. X. and Shen, T. and Jiang, D. X.",
        TITLE = "Iterative Proposal Refinement for Weakly-Supervised Video Grounding",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "6524--6534",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232486"}

@inproceedings{bb237522,
        AUTHOR = "Lu, Z. J. and Iftekhar, A. S. M. and Mittal, G. and Meng, T. J. and Wang, X. and Zhao, C. and Kukkala, R. and Elhamifar, E. and Chen, M.",
        TITLE = "{DeCafNet}: Delegate and Conquer for Efficient Temporal Grounding in
Long Videos",
        BOOKTITLE = CVPR25,
        YEAR = "2025",
        PAGES = "24066--24076",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232487"}

@inproceedings{bb237523,
        AUTHOR = "Wang, L. and Mittal, G. and Sajeev, S. and Yu, Y. and Hall, M. and Boddeti, V. N. and Chen, M.",
        TITLE = "{ProT{\'e}G{\'e}}: Untrimmed Pretraining for Video Temporal Grounding by Video
Temporal Grounding",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "6575--6585",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232488"}

@inproceedings{bb237524,
        AUTHOR = "Hwang, M. Y. and Jeong, J. Y. and Kim, M. S. and Oh, Y. and Oh, S. H.",
        TITLE = "{Meta-Explore}: Exploratory Hierarchical Vision-and-Language Navigation
Using Scene Object Spectrum Grounding",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "6683--6693",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232489"}

@inproceedings{bb237525,
        AUTHOR = "Chen, J. and Gao, D. F. and Lin, K. Q. and Shou, M. Z.",
        TITLE = "Affordance Grounding from Demonstration Video to Target Image",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "6799--6808",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232490"}

@inproceedings{bb237526,
        AUTHOR = "Shaharabany, T. and Wolf, L. B.",
        TITLE = "Similarity Maps for Self-Training Weakly-Supervised Phrase Grounding",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "6925--6934",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232491"}

@inproceedings{bb237527,
        AUTHOR = "Su, W. and Miao, P. H. and Dou, H. Z. and Wang, G. A. and Qiao, L. and Li, Z. Y. and Li, X.",
        TITLE = "Language Adaptive Weight Generation for Multi-Task Visual Grounding",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "10857--10866",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232492"}

@inproceedings{bb237528,
        AUTHOR = "Li, G. and Jampani, V. and Sun, D. Q. and Sevilla Lara, L.",
        TITLE = "{LOCATE}: Localize and Transfer Object Parts for Weakly Supervised
Affordance Grounding",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "10922--10931",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232493"}

@inproceedings{bb237529,
        AUTHOR = "Kim, S. and Oh, J. and Lee, S. and Yu, S. and Do, J. and Taghavi, T.",
        TITLE = "Grounding Counterfactual Explanation of Image Classifiers to Textual
Concept Space",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "10942--10950",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232494"}

@inproceedings{bb237530,
        AUTHOR = "Zhang, Y. M. and Chen, X. and Jia, J. H. and Liu, S. and Ding, K.",
        TITLE = "Text-Visual Prompting for Efficient {2D} Temporal Video Grounding",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "14794--14804",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232495"}

@inproceedings{bb237531,
        AUTHOR = "Chen, Z. H. and Zhang, R. and Song, Y. B. and Wan, X. and Li, G. B.",
        TITLE = "Advancing Visual Grounding with Scene Knowledge: Benchmark and Method",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "15039--15049",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232496"}

@inproceedings{bb237532,
        AUTHOR = "Tan, C. L. and Lin, Z. H. and Hu, J. F. and Zheng, W. S. and Lai, J. H.",
        TITLE = "Hierarchical Semantic Correspondence Networks for Video Paragraph
Grounding",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "18973--18982",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232497"}

@inproceedings{bb237533,
        AUTHOR = "Yang, Z. Y. and Kafle, K. and Dernoncourt, F. and Ordonez, V.",
        TITLE = "Improving Visual Grounding by Encouraging Consistent Gradient-Based
Explanations",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "19165--19174",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232498"}

@inproceedings{bb237534,
        AUTHOR = "Wu, Y. M. and Cheng, X. H. and Zhang, R. R. and Cheng, Z. and Zhang, J.",
        TITLE = "{EDA}: Explicit Text-Decoupling and Dense Alignment for {3D} Visual
Grounding",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "19231--19242",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232499"}

@inproceedings{bb237535,
        AUTHOR = "Li, M. Z. and Wang, H. and Zhang, W. Q. and Miao, J. X. and Zhao, Z. and Zhang, S. Y. and Ji, W. and Wu, F.",
        TITLE = "{WINNER}: Weakly-supervised {hIerarchical} {decompositioN} and {aligNment}
for {spatio-tEmporal} video {gRounding}",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "23090--23099",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232500"}

@inproceedings{bb237536,
        AUTHOR = "Lin, Z. H. and Tan, C. L. and Hu, J. F. and Jin, Z. and Ye, T. and Zheng, W. S.",
        TITLE = "Collaborative Static and Dynamic Vision-Language Streams for
Spatio-Temporal Video Grounding",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "23100--23109",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232501"}

@inproceedings{bb237537,
        AUTHOR = "Yang, L. and Kong, Q. and Yang, H. K. and Kehl, W. and Sato, Y. and Kobori, N.",
        TITLE = "{DeCo}: Decomposition and Reconstruction for Compositional Temporal
Grounding via Coarse-to-Fine Contrastive Ranking",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "23130--23140",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232502"}

@inproceedings{bb237538,
        AUTHOR = "Zhou, L. and Zhou, Z. and Mao, K. and He, Z. Y.",
        TITLE = "Joint Visual Grounding and Tracking with Natural Language
Specification",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "23151--23160",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232503"}

@inproceedings{bb237539,
        AUTHOR = "Devaraj, C. and Fermuller, C. and Aloimonos, Y. F.",
        TITLE = "Incorporating Visual Grounding In {GCN} For Zero-shot Learning Of Human
Object Interaction Actions",
        BOOKTITLE = L3D-IVU23,
        YEAR = "2023",
        PAGES = "5008--5017",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232504"}

@inproceedings{bb237540,
        AUTHOR = "Fang, X. and Liu, D. Z. and Zhou, P. and Nan, G. S.",
        TITLE = "You Can Ground Earlier than See: An Effective and Efficient Pipeline
for Temporal Sentence Grounding in Compressed Videos",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "2448--2460",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232505"}

@inproceedings{bb237541,
        AUTHOR = "Fu, T. J. and Li, L. J. and Gan, Z. and Lin, K. and Wang, W. Y. and Wang, L. J. and Liu, Z. C.",
        TITLE = "An Empirical Study of End-to-End Video-Language Transformers with
Masked Visual Modeling",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "22898--22909",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232506"}

@inproceedings{bb237542,
        AUTHOR = "Li, L. J. and Gan, Z. and Lin, K. and Lin, C. C. and Liu, Z. C. and Liu, C. and Wang, L. J.",
        TITLE = "{LAVENDER}: Unifying Video-Language Understanding as Masked Language
Modeling",
        BOOKTITLE = CVPR23,
        YEAR = "2023",
        PAGES = "23119--23129",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232507"}

@inproceedings{bb237543,
        AUTHOR = "Dong, J. X. and Yin, Z. Z.",
        TITLE = "Boundary-aware Temporal Sentence Grounding with Adaptive Proposal
Refinement",
        BOOKTITLE = ACCV22,
        YEAR = "2022",
        PAGES = "IV:641--657",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232508"}

@inproceedings{bb237544,
        AUTHOR = "Gao, Y. Z. and Lu, Z. W.",
        TITLE = "{SST-VLM}: Sparse Sampling-twice Inspired Video-language Model",
        BOOKTITLE = ACCV22,
        YEAR = "2022",
        PAGES = "IV:537--553",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232509"}

@inproceedings{bb237545,
        AUTHOR = "Pacheco Ortega, A. and Mayol Cuervas, W.",
        TITLE = "One-shot Learning for Human Affordance Detection",
        BOOKTITLE = CVMeta22,
        YEAR = "2022",
        PAGES = "758--766",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232510"}

@inproceedings{bb237546,
        AUTHOR = "Ho, C. H. and Appalaraju, S. and Jasani, B. and Manmatha, R. and Vasconcelos, N. M.",
        TITLE = "{YORO} - Lightweight End to End Visual Grounding",
        BOOKTITLE = CMMP22,
        YEAR = "2022",
        PAGES = "3--23",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232511"}

@inproceedings{bb237547,
        AUTHOR = "Kim, D. and Park, J. and Lee, J. Y. and Park, S. and Sohn, K. H.",
        TITLE = "Language-free Training for Zero-shot Video Grounding",
        BOOKTITLE = WACV23,
        YEAR = "2023",
        PAGES = "2538--2547",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232512"}

@inproceedings{bb237548,
        AUTHOR = "Chou, S. H. and Fan, Z. C. and Little, J. J. and Sigal, L.",
        TITLE = "Semi-Supervised Grounding Alignment for Multi-Modal Feature Learning",
        BOOKTITLE = CRV22,
        YEAR = "2022",
        PAGES = "48--57",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232513"}

% The model name was listed upstream as the space-separated "D 3 Net",
% i.e. a superscript lost in extraction; it is restored here as D-cubed-Net.
@inproceedings{bb237549,
        AUTHOR = "Chen, D. Z. Y. and Wu, Q. R. and Nie{\ss}ner, M. and Chang, A. X.",
        TITLE = "{D$^3$Net}: A Unified Speaker-Listener Architecture for
{3D} Dense Captioning and Visual Grounding",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXII:487--505",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232514"}

@inproceedings{bb237550,
        AUTHOR = "Parcalabescu, L. and Frank, A.",
        TITLE = "Exploring Phrase Grounding without Training: Contextualisation and
Extension to Text-Based Image Retrieval",
        BOOKTITLE = MULWS20,
        YEAR = "2020",
        PAGES = "4137--4146",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232515"}

@inproceedings{bb237551,
        AUTHOR = "Tung, H. and Harley, A. W. and Huang, L. and Fragkiadaki, K.",
        TITLE = "Reward Learning from Narrated Demonstrations",
        BOOKTITLE = CVPR18,
        YEAR = "2018",
        PAGES = "7004--7013",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232516"}

@inproceedings{bb237552,
        AUTHOR = "Cohen, N. and Gal, R. and Meirom, E. A. and Chechik, G. and Atzmon, Y.",
        TITLE = "'This Is My Unicorn, {Fluffy}':
Personalizing Frozen Vision-Language Representations",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XX:558--577",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232517"}

@inproceedings{bb237553,
        AUTHOR = "Lee, J. H. and Kang, J. W.",
        TITLE = "Relation Enhanced Vision Language Pre-Training",
        BOOKTITLE = ICIP22,
        YEAR = "2022",
        PAGES = "2286--2290",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232518"}

@inproceedings{bb237554,
        AUTHOR = "Khan, Z. and Kumar, B. G. V. and Yu, X. and Schulter, S. and Chandraker, M. and Fu, Y.",
        TITLE = "Single-Stream Multi-level Alignment for Vision-Language Pretraining",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVI:735--751",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232519"}

@inproceedings{bb237555,
        AUTHOR = "Wang, R. and Zhao, H. and Gao, Y.",
        TITLE = "{CYBORGS}: Contrastively Bootstrapping Object Representations by
Grounding in Segmentation",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXI:260--277",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232520"}

@inproceedings{bb237556,
        AUTHOR = "Yang, Z. Y. and Gan, Z. and Wang, J. F. and Hu, X. W. and Ahmed, F. and Liu, Z. C. and Lu, Y. and Wang, L. J.",
        TITLE = "{UniTAB}: Unifying Text and Box Outputs for Grounded Vision-Language
Modeling",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVI:521--539",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232521"}

@inproceedings{bb237557,
        AUTHOR = "Li, H. and Wei, P. and Li, J. P. and Ma, Z. and Shang, J. H. and Zheng, N. N.",
        TITLE = "Asymmetric Relation Consistency Reasoning for Video Relation Grounding",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXV:125--141",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232522"}

@inproceedings{bb237558,
        AUTHOR = "Dvornik, N. and Hadji, I. and Pham, H. and Bhatt, D. and Martinez, B. and Fazly, A. and Jepson, A. D.",
        TITLE = "Flow Graph to Video Grounding for Weakly-Supervised Multi-step
Localization",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXV:319--335",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232523"}

@inproceedings{bb237559,
        AUTHOR = "Qu, M. X. and Wu, Y. and Liu, W. and Gong, Q. Q. and Liang, X. D. and Russakovsky, O. and Zhao, Y. and Wei, Y. C.",
        TITLE = "{SiRi}: A Simple Selective Retraining Mechanism for Transformer-Based
Visual Grounding",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXV:546--562",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232524"}

@inproceedings{bb237560,
        AUTHOR = "Zhu, C. Y. and Zhou, Y. Y. and Shen, Y. H. and Luo, G. and Pan, X. J. and Chen, M. B. L. C. and Cao, L. J. and Sun, X. S. and Ji, R. R.",
        TITLE = "{SeqTR}: A Simple Yet Universal Network for Visual Grounding",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXV:598--615",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232525"}

@inproceedings{bb237561,
        AUTHOR = "Hao, J. C. and Sun, H. F. and Ren, P. F. and Wang, J. Y. and Qi, Q. and Liao, J. X.",
        TITLE = "Can Shuffling Video Benefit Temporal Bias Problem: A Novel Training
Framework for Temporal Grounding",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVI:130--147",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232526"}

@inproceedings{bb237562,
        AUTHOR = "Jain, A. and Gkanatsios, N. and Mediratta, I. and Fragkiadaki, K.",
        TITLE = "Bottom Up Top Down Detection Transformers for Language Grounding in
Images and Point Clouds",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVI:417--433",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232527"}

@inproceedings{bb237563,
        AUTHOR = "Heisler, M. and Banitalebi Dehkordi, A. and Zhang, Y.",
        TITLE = "{SemAug}: Semantically Meaningful Image Augmentations for Object
Detection Through Language Grounding",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVI:610--626",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232528"}

@inproceedings{bb237564,
        AUTHOR = "Min, S. and Park, N. and Kim, S. and Park, S. H. and Kim, J.",
        TITLE = "Grounding Visual Representations with Texts for Domain Generalization",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXVII:37--53",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232529"}

@inproceedings{bb237565,
        AUTHOR = "Wang, J. and Wu, H. Y. and Chen, J. C. and Shuai, H. H. and Cheng, W. H.",
        TITLE = "Residual Graph Attention Network and Expression-Respect Data
Augmentation Aided Visual Grounding",
        BOOKTITLE = ICIP22,
        YEAR = "2022",
        PAGES = "326--330",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232530"}

@inproceedings{bb237566,
        AUTHOR = "Xiong, Z. and Liu, D. and Zhou, P.",
        TITLE = "Gaussian Kernel-Based Cross Modal Network for Spatio-Temporal Video
Grounding",
        BOOKTITLE = ICIP22,
        YEAR = "2022",
        PAGES = "2481--2485",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232531"}

@inproceedings{bb237567,
        AUTHOR = "Alaniz, S. and Federici, M. and Akata, Z.",
        TITLE = "Compositional Mixture Representations for Vision and Text",
        BOOKTITLE = L3D-IVU22,
        YEAR = "2022",
        PAGES = "4201--4210",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232532"}

@inproceedings{bb237568,
        AUTHOR = "Cho, J. and Yoon, Y. and Kwak, S.",
        TITLE = "Collaborative Transformers for Grounded Situation Recognition",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "19627--19636",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232533"}

@inproceedings{bb237569,
        AUTHOR = "Singh, A. and Hu, R. H. and Goswami, V. and Couairon, G. and Galuba, W. and Rohrbach, M. and Kiela, D.",
        TITLE = "{FLAVA}: A Foundational Language And Vision Alignment Model",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "15617--15629",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232534"}

@inproceedings{bb237570,
        AUTHOR = "Saini, N. and Pham, K. and Shrivastava, A.",
        TITLE = "Disentangling Visual Embeddings for Attributes and Objects",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "13648--13657",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232535"}

@inproceedings{bb237571,
        AUTHOR = "Ge, Y. Y. and Ge, Y. X. and Liu, X. H. and Wang, J. P. and Wu, J. P. and Shan, Y. and Qie, X. and Luo, P.",
        TITLE = "{MILES}: Visual {BERT} Pre-training with Injected Language Semantics for
Video-Text Retrieval",
        BOOKTITLE = ECCV22,
        YEAR = "2022",
        PAGES = "XXXV:691--708",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232536"}

@inproceedings{bb237572,
        AUTHOR = "Wang, A. J. P. and Ge, Y. X. and Cai, G. and Yan, R. and Lin, X. D. and Shan, Y. and Qie, X. and Shou, M. Z.",
        TITLE = "Object-aware Video-language Pre-training for Retrieval",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "3303--3312",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232537"}

@inproceedings{bb237573,
        AUTHOR = "Li, D. X. and Li, J. N. and Li, H. D. and Niebles, J. C. and Hoi, S. C. H.",
        TITLE = "Align and Prompt: Video-and-Language Pre-training with Entity Prompts",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "4943--4953",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232538"}

@inproceedings{bb237574,
        AUTHOR = "Xue, H. W. and Hang, T. and Zeng, Y. H. and Sun, Y. C. and Liu, B. and Yang, H. and Fu, J. L. and Guo, B. N.",
        TITLE = "Advancing High-Resolution Video-Language Representation with
Large-Scale Video Transcriptions",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "5026--5035",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232539"}

@inproceedings{bb237575,
        AUTHOR = "Sammani, F. and Mukherjee, T. and Deligiannis, N.",
        TITLE = "{NLX-GPT}: A Model for Natural Language Explanations in Vision and
Vision-Language Tasks",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "8312--8322",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232540"}

@inproceedings{bb237576,
        AUTHOR = "Lin, B. Q. and Zhu, Y. and Chen, Z. C. and Liang, X. and Liu, J. Z. and Liang, X. D.",
        TITLE = "{ADAPT}: Vision-Language Navigation with Modality-Aligned Action
Prompts",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "15375--15385",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232541"}

@inproceedings{bb237577,
        AUTHOR = "Dou, Z. Y. and Xu, Y. C. and Gan, Z. and Wang, J. F. and Wang, S. H. and Wang, L. J. and Zhu, C. G. and Zhang, P. C. and Yuan, L. and Peng, N. and Liu, Z. C. and Zeng, M.",
        TITLE = "An Empirical Study of Training End-to-End Vision-and-Language
Transformers",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "18145--18155",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232542"}

@inproceedings{bb237578,
        AUTHOR = "Xu, Z. P. and Lin, T. W. and Tang, H. and Li, F. and He, D. L. and Sebe, N. and Timofte, R. and Van Gool, L. J. and Ding, E.",
        TITLE = "Predict, Prevent, and Evaluate: Disentangled Text-Driven Image
Manipulation Empowered by Pre-Trained Vision-Language Model",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "18208--18217",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232543"}

@inproceedings{bb237579,
        AUTHOR = "Du, Y. and Wei, F. Y. and Zhang, Z. H. and Shi, M. J. and Gao, Y. and Li, G. Q.",
        TITLE = "Learning to Prompt for Open-Vocabulary Object Detection with
Vision-Language Model",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "14064--14073",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232544"}

@inproceedings{bb237580,
        AUTHOR = "Chang, Y. S. and Cao, G. H. and Narang, M. and Gao, J. F. and Suzuki, H. and Bisk, Y.",
        TITLE = "{WebQA}: Multihop and Multimodal {QA}",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "16474--16483",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232545"}

@inproceedings{bb237581,
        AUTHOR = "Zellers, R. and Lu, J. and Lu, X.M. and Yu, Y. and Zhao, Y.P. and Salehi, M. and Kusupati, A. and Hessel, J. and Farhadi, A. and Choi, Y.",
        TITLE = "{MERLOT RESERVE}: Neural Script Knowledge through Vision and Language and Sound",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "16354--16366",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232546"}

@inproceedings{bb237582,
        AUTHOR = "Gupta, T. and Kamath, A. and Kembhavi, A. and Hoiem, D.",
        TITLE = "Towards General Purpose Vision Systems: An End-to-End Task-Agnostic Vision-Language Architecture",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "16378--16388",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232547"}

@inproceedings{bb237583,
        AUTHOR = "Suris, D. and Epstein, D. and Vondrick, C.",
        TITLE = "Globetrotter: Connecting Languages by Connecting Images",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "16453--16463",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232548"}

@inproceedings{bb237584,
        AUTHOR = "Sung, Y.L. and Cho, J. and Bansal, M.",
        TITLE = "{VL-ADAPTER}: Parameter-Efficient Transfer Learning for Vision-and-Language Tasks",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "5217--5227",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232549"}

@inproceedings{bb237585,
        AUTHOR = "Wu, D.M. and Dong, X.P. and Shao, L. and Shen, J.B.",
        TITLE = "Multi-Level Representation Learning with Semantic Alignment for Referring Video Object Segmentation",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "4986--4995",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232550"}

@inproceedings{bb237586,
        AUTHOR = "Gao, K. and Chen, L. and Niu, Y. and Shao, J. and Xiao, J.",
        TITLE = "Classification-Then-Grounding: Reformulating Video Scene Graphs as Temporal Bipartite Graphs",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "19475--19484",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232551"}

@inproceedings{bb237587,
        AUTHOR = "Kesen, I. and Can, O.A. and Erdem, E. and Erdem, A. and Yuret, D.",
        TITLE = "Modulating Bottom-Up and Top-Down Visual Processing via Language-Conditional Filters",
        BOOKTITLE = MULA22,
        YEAR = "2022",
        PAGES = "4609--4619",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232552"}

@inproceedings{bb237588,
        AUTHOR = "Nebbia, G. and Kovashka, A.",
        TITLE = "Doubling down: sparse grounding with an additional, almost-matching caption for detection-oriented multimodal pretraining",
        BOOKTITLE = MULA22,
        YEAR = "2022",
        PAGES = "4641--4650",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232553"}

@inproceedings{bb237589,
        AUTHOR = "Ye, J. and Tian, J.F. and Yan, M. and Yang, X.S. and Wang, X. and Zhang, J. and He, L. and Lin, X.",
        TITLE = "Shifting More Attention to Visual Backbone: Query-modulated Refinement Networks for End-to-End Visual Grounding",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "15481--15491",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232554"}

@inproceedings{bb237590,
        AUTHOR = "Jiang, H.J. and Lin, Y.Z. and Han, D.C. and Song, S. and Huang, G.",
        TITLE = "{Pseudo-Q}: Generating Pseudo Language Queries for Visual Grounding",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "15492--15502",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232555"}

@inproceedings{bb237591,
        AUTHOR = "Huang, S. and Chen, Y.L. and Jia, J.Y. and Wang, L.W.",
        TITLE = "Multi-View Transformer for {3D} Visual Grounding",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "15503--15512",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232556"}

@inproceedings{bb237592,
        AUTHOR = "Chen, S. and Li, B.",
        TITLE = "Multi-Modal Dynamic Graph Transformer for Visual Grounding",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "15513--15522",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232557"}

@inproceedings{bb237593,
        AUTHOR = "Mavroudi, E. and Vidal, R.",
        TITLE = "Weakly-Supervised Generation and Grounding of Visual Descriptions with Conditional Generative Models",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "15523--15533",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232558"}

@inproceedings{bb237594,
        AUTHOR = "Chen, S. and Zhao, Q.",
        TITLE = "{REX}: Reasoning-aware and Grounded Explanation",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "15565--15574",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232559"}

@inproceedings{bb237595,
        AUTHOR = "Lou, C. and Han, W.J. and Lin, Y. and Zheng, Z.L.",
        TITLE = "Unsupervised Vision-Language Parsing: Seamlessly Bridging Visual Scene Graphs with Language Structures via Dependency Relationships",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "15586--15595",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232560"}

@inproceedings{bb237596,
        AUTHOR = "Luo, J.Y. and Fu, J.H. and Kong, X.H. and Gao, C. and Ren, H.B. and Shen, H. and Xia, H.X. and Liu, S.",
        TITLE = "{3D-SPS}: Single-Stage {3D} Visual Grounding via Referred Point Progressive Selection",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "16433--16442",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232561"}

@inproceedings{bb237597,
        AUTHOR = "Cai, D. and Zhao, L.C. and Zhang, J. and Sheng, L. and Xu, D.",
        TITLE = "{3DJCG}: A Unified Framework for Joint Dense Captioning and Visual Grounding on {3D} Point Clouds",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "16443--16452",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232562"}

@inproceedings{bb237598,
        AUTHOR = "Luo, H.C. and Zhai, W. and Zhang, J. and Cao, Y. and Tao, D.C.",
        TITLE = "Learning Affordance Grounding from Exocentric Images",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "2242--2251",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232563"}

@inproceedings{bb237599,
        AUTHOR = "Jiang, X. and Xu, X. and Zhang, J. and Shen, F.M. and Cao, Z. and Shen, H.T.",
        TITLE = "Semi-supervised Video Paragraph Grounding with Contrastive Encoder",
        BOOKTITLE = CVPR22,
        YEAR = "2022",
        PAGES = "2456--2465",
        BIBSOURCE = "http://www.visionbib.com/bibliography/applicat803vgr2.html#TT232564"}
Last update: Nov 26, 2025 at 20:24:09