@article{bb241200,
  author    = {Yang, J. and Wei, P.},
  title     = {Learning unified patterns of multimodalities for video temporal grounding},
  journal   = PR,
  volume    = {172},
  year      = {2026},
  pages     = {112484},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236116},
}
@article{bb241201,
  author    = {Dai, M. and Cheng, W.X. and Liu, J.J. and Yang, L.F. and Feng, Z.H. and Yang, W.K. and Wang, J.D.},
  title     = {Improving Generalized Visual Grounding With Instance-Aware Joint Learning},
  journal   = PAMI,
  volume    = {48},
  year      = {2026},
  number    = {1},
  month     = {January},
  pages     = {448-465},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236117},
}
@article{bb241202,
  author    = {Zhu, Y. and Chen, D. and Wang, H. and Jia, T. and Deng, S.Z.},
  title     = {DCART: A dual contrastive alignment residual transformer model for visual grounding},
  journal   = PR,
  volume    = {172},
  year      = {2026},
  pages     = {112688},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236118},
}
@article{bb241203,
  author    = {Shi, L.T. and Liu, T. and Hu, X.T. and Hu, Y. and Yin, Q. and Hong, R.C.},
  title     = {SwimVG: Step-Wise Multimodal Fusion and Adaption for Visual Grounding},
  journal   = MultMed,
  volume    = {27},
  year      = {2025},
  pages     = {9776-9787},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236119},
}
@article{bb241204,
  author    = {Fu, Z. and Mao, Z.D. and Zhang, L. and Zhang, Y.D.},
  title     = {Boosting Faithful Multi-Modal LLMs via Complementary Visual Grounding},
  journal   = IP,
  volume    = {34},
  year      = {2025},
  pages     = {8641-8655},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236120},
}
@article{bb241205,
  author    = {Liu, Y. and Zheng, M.H. and Chen, Q.C. and Gong, S.G. and Peng, Y.X.},
  title     = {Large-Scale Pre-Trained Models Empowering Phrase Generalization in Temporal Sentence Localization},
  journal   = IJCV,
  volume    = {134},
  year      = {2026},
  number    = {2},
  month     = {February},
  pages     = {53},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236121},
}
@inproceedings{bb241206,
  author    = {Zheng, M.H. and Cai, X.H. and Chen, Q.C. and Peng, Y.X. and Liu, Y.},
  title     = {Training-Free Video Temporal Grounding Using Large-Scale Pre-Trained Models},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {LXXXII: 20-37},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236122},
}
@article{bb241207,
  author    = {Hu, M. and Yang, K. and Li, J.},
  title     = {Text-Injected Discriminative Model for Remote Sensing Visual Grounding},
  journal   = RS,
  volume    = {18},
  year      = {2026},
  number    = {1},
  pages     = {161},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236123},
}
@article{bb241208,
  author    = {Li, A. and Liu, H.J. and Zhu, Y.Q. and Ge, Y.X.},
  title     = {Efficient Pre-Trained Semantics Refinement for Video Temporal Grounding},
  journal   = CirSysVideo,
  volume    = {36},
  year      = {2026},
  number    = {2},
  month     = {February},
  pages     = {1406-1418},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236124},
}
@article{bb241209,
  author    = {Xiao, F. and Xu, H.B. and Zhao, G. and Kang, W.X.},
  title     = {LSVG: Language-Guided Scene Graphs with 2D-Assisted Multi-Modal Encoding for 3D Visual Grounding},
  journal   = PR,
  volume    = {174},
  year      = {2026},
  pages     = {112926},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236125},
}
@article{bb241210,
  author    = {Xiao, L.H. and Yang, X.S. and Lan, X.Y. and Wang, Y.W. and Xu, C.S.},
  title     = {Toward Visual Grounding: A Survey},
  journal   = PAMI,
  volume    = {48},
  year      = {2026},
  number    = {3},
  month     = {March},
  pages     = {2749-2771},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236126},
}
@article{bb241211,
  author    = {Moon, W.J. and Hyun, S. and Lee, S. and Heo, J.P.},
  title     = {Correlation-guided calibration of query dependency for video temporal grounding},
  journal   = PR,
  volume    = {174},
  year      = {2026},
  pages     = {112984},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236127},
}
@article{bb241212,
  author    = {Zhou, Y. and Chen, J. and Zhang, Z. and Huang, P.H. and Ding, R. and Zou, Z.T. and Gao, P. and Wei, Y.C. and Li, K. and Yang, X. and Jiang, X. and Yang, H.X. and Li, J.},
  title     = {DVGBench: Implicit-to-explicit visual grounding benchmark in UAV imagery with large vision-language models},
  journal   = PandRS,
  volume    = {232},
  year      = {2026},
  pages     = {831-847},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236128},
}
@inproceedings{bb241213,
  author    = {Iwakata, S. and Oshima, R. and Tsunashima, H. and Feng, Q. and Kataoka, H. and Morishima, S.},
  title     = {Viewpoint-Dependent 3D Visual Grounding for Mobile Robots},
  booktitle = ICIP25,
  year      = {2025},
  pages     = {1690-1695},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236129},
}
@inproceedings{bb241214,
  author    = {Zhao, H.R. and Chen, T.Y. and Wang, Z.},
  title     = {On the Robustness of GUI Grounding Models Against Image Attacks},
  booktitle = TrustworthyOpen25,
  year      = {2025},
  pages     = {1609-1614},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236130},
}
@inproceedings{bb241215,
  author    = {Zheng, Y.H. and Lin, G.S. and Chang, K.Y.},
  title     = {Transformer-based Visual Grounding with Inter-Modality Cross Attention},
  booktitle = MVA25,
  year      = {2025},
  pages     = {1-6},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236131},
}
@inproceedings{bb241216,
  author    = {Lin, X.W. and Lin, T.W. and Huang, L.C. and Xie, H.Y. and Su, Z.Z.},
  title     = {BIP3D: Bridging 2D Images and 3D Perception for Embodied Intelligence},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {9007-9016},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236132},
}
@inproceedings{bb241217,
  author    = {Chang, Y. and Fermoselle, L. and Ta, D. and Bucher, B. and Carlone, L. and Wang, J.},
  title     = {ASHiTA: Automatic Scene-Grounded HIerarchical Task Analysis},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {29458-29468},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236133},
}
@inproceedings{bb241218,
  author    = {Yang, Y. and Yang, H. and Zhou, J.C. and Chen, P.H. and Zhang, H.X. and Du, Y.L. and Gan, C.},
  title     = {3D-Mem: 3D Scene Memory for Embodied Exploration and Reasoning},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {17294-17303},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236134},
}
@inproceedings{bb241219,
  author    = {Peng, Q.H. and Zheng, H. and Huang, G.},
  title     = {ProxyTransformation: Preshaping Point Cloud Manifold With Proxy Attention For 3D Visual Grounding},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {24582-24592},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236135},
}
@inproceedings{bb241220,
  author    = {Zong, Y.S. and Zhang, Q. and An, D.S. and Li, Z.H. and Xu, X. and Xu, L.H. and Tu, Z.W. and Xing, Y.F. and Dabeer, O.},
  title     = {Ground-V: Teaching VLMs to Ground Complex Instructions in Pixels},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {24635-24645},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236136},
}
@inproceedings{bb241221,
  author    = {Wang, Y.X. and Wu, A. and Yang, M. and Min, Y. and Zhu, Y.H. and Deng, C.},
  title     = {Reasoning Mamba: Hypergraph-Guided Region Relation Calculating for Weakly Supervised Affordance Grounding},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {27618-27627},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236137},
}
@inproceedings{bb241222,
  author    = {Kim, C.D. and Moon, J. and Moon, S. and Yun, H. and Lee, S. and Kembhavi, A. and Lee, S. and Kim, G. and Lee, S.H. and Clark, C.},
  title     = {ReSpec: Relevance and Specificity Grounded Online Filtering for Learning on Video-Text Data Streams},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {29040-29049},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236138},
}
@inproceedings{bb241223,
  author    = {Deng, A.D. and Gao, Z. and Choudhuri, A. and Planche, B. and Zheng, M. and Wang, B. and Chen, T. and Chen, C. and Wu, Z.Y.},
  title     = {Seq2Time: Sequential Knowledge Transfer for Video LLM Temporal Grounding},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {13766-13775},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236139},
}
@inproceedings{bb241224,
  author    = {Shao, Y.W. and Zhai, W. and Yang, Y.H. and Luo, H.C. and Cao, Y. and Zha, Z.J.},
  title     = {GREAT: Geometry-Intention Collaborative Inference for Open-Vocabulary 3D Object Affordance Grounding},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {17326-17336},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236140},
}
% Title colon follows the model name directly, with no space before it.
@inproceedings{bb241225,
  author    = {Munasinghe, S. and Gani, H. and Zhu, W.Q. and Cao, J. and Xing, E. and Khan, F.S. and Khan, S.},
  title     = {VideoGLaMM: A Large Multimodal Model for Pixel-Level Visual Grounding in Videos},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {19036-19046},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236141},
}
@inproceedings{bb241226,
  author    = {Garg, A. and Kumar, A. and Rawat, Y.S.},
  title     = {STPro: Spatial and Temporal Progressive Learning for Weakly Supervised Spatio-Temporal Grounding},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {3384-3394},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236142},
}
@inproceedings{bb241227,
  author    = {Vogel, F. and Bousselham, W. and Kukleva, A. and Shvetsova, N. and Kuehne, H.},
  title     = {VideoGEM: Training-Free Action Grounding in Videos},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {3374-3383},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236143},
}
@inproceedings{bb241228,
  author    = {Wang, J.Y. and Chen, M.H. and Karaev, N. and Vedaldi, A. and Rupprecht, C. and Novotny, D.},
  title     = {VGGT: Visual Geometry Grounded Transformer},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {5294-5306},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236144},
}
@inproceedings{bb241229,
  author    = {Li, R. and Li, S.J. and Kong, L. and Yang, X. and Liang, J.W.},
  title     = {SeeGround: See and Ground for Zero-Shot Open-Vocabulary 3D Visual Grounding},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {3707-3717},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236145},
}
@inproceedings{bb241230,
  author    = {Guo, W.X. and Xu, X.W. and Wang, Z.W. and Feng, J.J. and Zhou, J. and Lu, J.W.},
  title     = {Text-guided Sparse Voxel Pruning for Efficient 3D Visual Grounding},
  booktitle = CVPR25,
  year      = {2025},
  pages     = {3666-3675},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236146},
}
% The ampersand in the title is written as \& because a bare ampersand is a
% LaTeX alignment character and raises an error when the title is typeset.
@inproceedings{bb241231,
  author    = {Kukal, R. and Patravali, J. and Yu, F. and Singh, S. and Karianakis, N. and Madhok, R.},
  title     = {Click\&Describe: Multimodal Grounding and Tracking for Aerial Objects},
  booktitle = WACV25,
  year      = {2025},
  pages     = {6011-6021},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236147},
}
@inproceedings{bb241232,
  author    = {Dey, S. and Unal, O. and Sakaridis, C. and Van Gool, L.J.},
  title     = {Fine-Grained Spatial and Verbal Losses for 3D Visual Grounding},
  booktitle = WACV25,
  year      = {2025},
  pages     = {4852-4861},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236148},
}
@inproceedings{bb241233,
  author    = {Weerakoon, D. and Subbaraju, V. and Lim, J.H. and Misra, A.},
  title     = {NeuroViG: Integrating Event Cameras for Resource-Efficient Video Grounding},
  booktitle = WACV25,
  year      = {2025},
  pages     = {5781-5790},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236149},
}
@inproceedings{bb241234,
  author    = {Cao, Z. and Zhang, B.Q. and Du, H.M. and Yu, X. and Li, X. and Wang, S.},
  title     = {FlashVTG: Feature Layering and Adaptive Score Handling Network for Video Temporal Grounding},
  booktitle = WACV25,
  year      = {2025},
  pages     = {9226-9236},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236150},
}
@inproceedings{bb241235,
  author    = {Singhi, N. and Kim, J.M. and Roth, K. and Akata, Z.},
  title     = {Improving Intervention Efficacy via Concept Realignment in Concept Bottleneck Models},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {XXVI: 422-438},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236151},
}
@inproceedings{bb241236,
  author    = {Lee, P. and Byun, H.R.},
  title     = {BAM-DETR: Boundary-aligned Moment Detection Transformer for Temporal Sentence Grounding in Videos},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {II: 220-238},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236152},
}
@inproceedings{bb241237,
  author    = {Ma, C. and Jiang, Y. and Wu, J.N. and Yuan, Z.H. and Qi, X.J.},
  title     = {GROMA: Localized Visual Tokenization for Grounding Multimodal Large Language Models},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {VI: 417-435},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236153},
}
@inproceedings{bb241238,
  author    = {Huang, Z. and Satoh, S.},
  title     = {LOA-TRANS: Enhancing Visual Grounding by Location-aware Transformers},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {VII: 405-421},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236154},
}
% "3D" is capitalized in the title, matching how the other entries in this
% file spell it.
@inproceedings{bb241239,
  author    = {Zhu, C. and Wang, T. and Zhang, W.W. and Chen, K. and Liu, X.H.},
  title     = {SCANREASON: Empowering 3D Visual Grounding with Reasoning Capabilities},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {VIII: 151-168},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236155},
}
@inproceedings{bb241240,
  author    = {Xiao, Z. and Gong, M. and Cascante Bonilla, P. and Zhang, X.Y. and Wu, J. and Ordonez, V.},
  title     = {Grounding Language Models for Visual Entity Recognition},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {XI: 393-411},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236156},
}
@inproceedings{bb241241,
  author    = {Cheng, Z.X. and Pu, Y.J. and Gong, S.G. and Kordjamshidi, P. and Kong, Y.},
  title     = {Shine: Saliency-aware Hierarchical Negative Ranking for Compositional Temporal Grounding},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {XIX: 398-416},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236157},
}
@inproceedings{bb241242,
  author    = {Lee, P.Y. and Sung, M.},
  title     = {Reground: Improving Textual and Spatial Grounding at No Cost},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {XXIII: 275-292},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236158},
}
@inproceedings{bb241243,
  author    = {Jiang, H.B. and Lu, Z.Q.},
  title     = {Visual Grounding for Object-level Generalization in Reinforcement Learning},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {XXX: 55-72},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236159},
}
@inproceedings{bb241244,
  author    = {Sun, P.L. and Song, Y.X. and Pan, X.L. and Kang, W.T. and Liu, G. and Shah, M. and Yan, Y.},
  title     = {SEGVG: Transferring Object Bounding Box to Segmentation for Visual Grounding},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {XXXVIII: 57-75},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236160},
}
@inproceedings{bb241245,
  author    = {Kang, D. and Cho, M.},
  title     = {In Defense of Lazy Visual Grounding for Open-vocabulary Semantic Segmentation},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {XLI: 143-164},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236161},
}
% The superscript in the method name is set in math mode and brace-protected;
% a bare caret in running text stops LaTeX with "Missing $ inserted".
% NOTE(review): the exponent is 2 (the method is published as R-squared
% tuning); confirm against the ECCV 2024 proceedings record.
@inproceedings{bb241246,
  author    = {Liu, Y. and He, J. and Li, W. and Kim, J. and Wei, D.L. and Pfister, H. and Chen, C.W.},
  title     = {{$R^2$}-tuning: Efficient Image-to-video Transfer Learning for Video Temporal Grounding},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {XLI: 421-438},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236162},
}
@inproceedings{bb241247,
  author    = {Zhang, H. and Li, H.Y. and Li, F. and Ren, T. and Zou, X. and Liu, S.L. and Huang, S.J. and Gao, J.F. and Zhang, L. and Li, C.Y. and Yang, J.W.},
  title     = {LLAVA-Grounding: Grounded Visual Chat with Large Multimodal Models},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {XLIII: 19-35},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236163},
}
@inproceedings{bb241248,
  author    = {Yang, J. and Ding, R. and Brown, E. and Qi, X.J. and Xie, S.},
  title     = {V-IRL: Grounding Virtual Intelligence in Real Life},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {XLV: 36-55},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236164},
}
@inproceedings{bb241249,
  author    = {Chen, W. and Chen, L. and Wu, Y.},
  title     = {An Efficient and Effective Transformer Decoder-based Framework for Multi-task Visual Grounding},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {XLV: 125-141},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236165},
}
% "3D" is capitalized in the title, matching how the other entries in this
% file spell it.
@inproceedings{bb241250,
  author    = {Qian, Z.P. and Ma, Y.W. and Lin, Z.K. and Ji, J.Y. and Zheng, X. and Sun, X.S. and Ji, R.R.},
  title     = {Multi-branch Collaborative Learning Network for 3D Visual Grounding},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {XLVI: 381-398},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236166},
}
% "DINO" is written in capitals, the same spelling this file uses in the
% VideoGrounding-DINO entry.
@inproceedings{bb241251,
  author    = {Liu, S.L. and Zeng, Z.Y. and Ren, T. and Li, F. and Zhang, H. and Yang, J. and Jiang, Q. and Li, C.Y. and Yang, J.W. and Su, H. and Zhu, J. and Zhang, L.},
  title     = {Grounding DINO: Marrying DINO with Grounded Pre-training for Open-set Object Detection},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {XLVII: 38-55},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236167},
}
@inproceedings{bb241252,
  author    = {Jin, Y. and Mu, Y.D.},
  title     = {Weakly-supervised Spatio-temporal Video Grounding with Variational Cross-modal Alignment},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {XLVIII: 412-429},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236168},
}
@inproceedings{bb241253,
  author    = {Fujiwara, K. and Tanaka, M. and Yu, Q.},
  title     = {Chronologically Accurate Retrieval for Temporal Grounding of Motion-language Models},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {LVIII: 323-339},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236169},
}
% "3D" is capitalized as in the other entries of this file.
% NOTE(review): the method name is spelled MASt3R here from the published
% title; confirm against the ECCV 2024 proceedings record.
@inproceedings{bb241254,
  author    = {Leroy, V. and Cabon, Y. and Revaud, J.},
  title     = {Grounding Image Matching in 3D with MASt3R},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {LXXII: 71-91},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236170},
}
% "3D" is capitalized in the title, matching how the other entries in this
% file spell it.
@inproceedings{bb241255,
  author    = {Unal, O. and Sakaridis, C. and Saha, S. and Van Gool, L.J.},
  title     = {Four Ways to Improve Verbo-visual Fusion for Dense 3D Visual Grounding},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {LXXVI: 196-213},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236171},
}
@inproceedings{bb241256,
  author    = {Wan, D. and Cho, J. and Stengel Eskin, E. and Bansal, M.},
  title     = {Contrastive Region Guidance: Improving Grounding in Vision-language Models Without Training},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {LXXIX: 198-215},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236172},
}
@inproceedings{bb241257,
  author    = {Bao, P.J. and Shao, Z. and Yang, W.H. and Ng, B.P. and Kot, A.C.},
  title     = {E3m: Zero-shot Spatio-temporal Video Grounding with Expectation-maximization Multimodal Modulation},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {LXXXIII: 227-243},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236173},
}
@inproceedings{bb241258,
  author    = {Hannan, T. and Islam, M.M. and Seidl, T. and Bertasius, G.},
  title     = {RGNET: A Unified Clip Retrieval and Grounding Network for Long Videos},
  booktitle = ECCV24,
  year      = {2024},
  pages     = {XXI: 352-369},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236174},
}
@inproceedings{bb241259,
  author    = {Khoshsirat, S. and Kambhamettu, C.},
  title     = {Embedding Attention Blocks for Answer Grounding},
  booktitle = ICIP24,
  year      = {2024},
  pages     = {521-527},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236175},
}
@inproceedings{bb241260,
  author    = {Hamilton, M. and Zisserman, A. and Hershey, J.R. and Freeman, W.T.},
  title     = {Separating the 'Chirp' from the 'Chat': Self-supervised Visual Grounding of Sound and Language},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13117-13127},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236176},
}
@inproceedings{bb241261,
  author    = {Shen, Y.H. and Fu, C.Y. and Chen, P.X. and Zhang, M. and Li, K. and Sun, X. and Wu, Y.S. and Lin, S.H. and Ji, R.R.},
  title     = {Aligning and Prompting Everything All at Once for Universal Visual Perception},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13193-13203},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236177},
}
@inproceedings{bb241262,
  author    = {Wu, T.H. and Biamby, G. and Chan, D. and Dunlap, L. and Gupta, R. and Wang, X.D. and Gonzalez, J.E. and Darrell, T.J.},
  title     = {See, Say, and Segment: Teaching LMMs to Overcome False Premises},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13459-13469},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236178},
}
@inproceedings{bb241263,
  author    = {Wang, Y. and Li, Y. and Wang, S.J.},
  title     = {G3-LQ: Marrying Hyperbolic Alignment with Explicit Semantic-Geometric Modeling for 3D Visual Grounding},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13917-13926},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236179},
}
@inproceedings{bb241264,
  author    = {Rizve, M.N. and Fei, F. and Unnikrishnan, J. and Tran, S. and Yao, B.Z. and Zeng, B. and Shah, M. and Chilimbi, T.},
  title     = {VidLA: Video-Language Alignment at Scale},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {14043-14055},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236180},
}
@inproceedings{bb241265,
  author    = {Shi, X.X. and Wu, Z.H. and Lee, S.},
  title     = {Viewpoint-Aware Visual Grounding in 3D Scenes},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {14056-14065},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236181},
}
@inproceedings{bb241266,
  author    = {Feng, C.J. and Zhong, Y.J. and Jie, Z.Q. and Xie, W. and Ma, L.},
  title     = {InstaGen: Enhancing Object Detection by Training on Synthetic Dataset},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {14121-14130},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236182},
}
% The ampersand in the title is written as \& because a bare ampersand is a
% LaTeX alignment character and raises an error when the title is typeset.
@inproceedings{bb241267,
  author    = {Chang, C.P. and Wang, S.X. and Pagani, A. and Stricker, D.},
  title     = {MiKASA: Multi-Key-Anchor \& Scene-Aware Transformer for 3D Visual Grounding},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {14131-14140},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236183},
}
@inproceedings{bb241268,
  author    = {Wang, S. and Lin, Y.T. and Wu, Y.},
  title     = {Omni-Q: Omni-Directional Scene Understanding for Unsupervised Visual Grounding},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {14261-14270},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236184},
}
@inproceedings{bb241269,
  author    = {Shen, Y.H. and Wang, H.Y. and Yang, X.T. and Feiszli, M. and Elhamifar, E. and Torresani, L. and Mavroudi, E.},
  title     = {Learning to Segment Referred Objects from Narrated Egocentric Videos},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {14510-14520},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236185},
}
@inproceedings{bb241270,
  author    = {Xu, C. and Han, Y.H. and Xu, R. and Hui, L. and Xie, J. and Yang, J.},
  title     = {Multi-Attribute Interactions Matter for 3D Visual Grounding},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {17253-17262},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236186},
}
@inproceedings{bb241271,
  author    = {Gu, X. and Fan, H. and Huang, Y. and Luo, T.J. and Zhang, L.B.},
  title     = {Context-Guided Spatio-Temporal Video Grounding},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {18330-18339},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236187},
}
% "Spatio-Temporal" is one hyphenated compound, with no space after the
% hyphen, as it is spelled in the other titles of this file.
@inproceedings{bb241272,
  author    = {Chen, B. and Shvetsova, N. and Rouditchenko, A. and Kondermann, D. and Thomas, S. and Chang, S.F. and Feris, R. and Glass, J. and Kuehne, H.},
  title     = {What, When, and Where? Self-Supervised Spatio-Temporal Grounding in Untrimmed Multi-Action Videos from Narrated Instructions},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {18419-18429},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236188},
}
% "Spatio-Temporal" is one hyphenated compound, with no space after the
% hyphen, as it is spelled in the other titles of this file.
@inproceedings{bb241273,
  author    = {Wasim, S.T. and Naseer, M. and Khan, S. and Yang, M.H. and Khan, F.S.},
  title     = {VideoGrounding-DINO: Towards Open-Vocabulary Spatio-Temporal Video Grounding},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {18909-18918},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236189},
}
@inproceedings{bb241274,
  author    = {Tao, M. and Bai, B. and Lin, H.Z. and Wang, H. and Wang, Y. and Luo, L. and Fang, L.},
  title     = {When Visual Grounding Meets Gigapixel-Level Large-Scale Scenes: Benchmark and Approach},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {22119-22128},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236190},
}
@inproceedings{bb241275,
  author    = {Rasheed, H. and Maaz, M. and Shaji, S. and Shaker, A. and Khan, S. and Cholakkal, H. and Anwer, R.M. and Xing, E. and Yang, M.H. and Khan, F.S.},
  title     = {GLaMM: Pixel Grounding Large Multimodal Model},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13009-13018},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236191},
}
@inproceedings{bb241276,
  author    = {Zhang, Y.Q. and Luo, H. and Lei, Y.J.},
  title     = {Towards CLIP-Driven Language-Free 3D Visual Grounding via 2D-3D Relational Enhancement and Consistency},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {13063-13072},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236192},
}
@inproceedings{bb241277,
  author    = {Zhang, C. and Li, M. and Budvytis, I. and Liwicki, S.},
  title     = {DiaLoc: An Iterative Approach to Embodied Dialog Localization},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {12585-12593},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236193},
}
@inproceedings{bb241278,
  author    = {Xiao, B. and Wu, H.P. and Xu, W.J. and Dai, X.Y. and Hu, H.D. and Lu, Y. and Zeng, M. and Liu, C. and Yuan, L.},
  title     = {Florence-2: Advancing a Unified Representation for a Variety of Vision Tasks},
  booktitle = CVPR24,
  year      = {2024},
  pages     = {4818-4829},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236194},
}
@inproceedings{bb241279,
  author    = {Qian, S.Y. and Chen, W.F. and Bai, M. and Zhou, X. and Tu, Z.W. and Li, L.E.},
  title     = {AffordanceLLM: Grounding Affordance from Vision Language Models},
  booktitle = OpenSUN3D24,
  year      = {2024},
  pages     = {7587-7597},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236195},
}
@inproceedings{bb241280,
  author    = {Miyanishi, T. and Azuma, D. and Kurita, S. and Kawanabe, M.},
  title     = {Cross3DVG: Cross-Dataset 3D Visual Grounding on Different RGB-D Scans},
  booktitle = {3DV24},
  year      = {2024},
  pages     = {717-727},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236196},
}
@inproceedings{bb241281,
  author    = {Gong, R. and Huang, J.Y. and Zhao, Y.Z. and Geng, H.R. and Gao, X.F. and Wu, Q.Y. and Ai, W. and Zhou, Z.H. and Terzopoulos, D. and Zhu, S.C. and Jia, B.X. and Huang, S.Y.},
  title     = {ARNOLD: A Benchmark for Language-Grounded Task Learning With Continuous States in Realistic 3D Scenes},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {20426-20438},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236197},
}
@inproceedings{bb241282,
  author    = {Wu, Y. and Wei, Y. and Wang, H.Z. and Liu, Y.F. and Yang, S. and He, X.M.},
  title     = {Grounded Image Text Matching with Mismatched Relation Reasoning},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {2964-2975},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236198},
}
@inproceedings{bb241283,
  author    = {Lee, C. and Kumar, M.G. and Tan, C.},
  title     = {DetermiNet: A Large-Scale Diagnostic Dataset for Complex Visually-Grounded Referencing using Determiners},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {19962-19971},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236199},
}
@inproceedings{bb241284,
  author    = {Lin, K.Q.H. and Zhang, P.C. and Chen, J. and Pramanick, S. and Gao, D.F. and Wang, A.J.P. and Yan, R. and Shou, M.Z.},
  title     = {UniVTG: Towards Unified Video-Language Temporal Grounding},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {2782-2792},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236200},
}
@inproceedings{bb241285,
  author    = {Liu, Y. and Zhang, J.H. and Chen, Q.C. and Peng, Y.X.},
  title     = {Confidence-aware Pseudo-label Learning for Weakly Supervised Visual Grounding},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {2816-2826},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236201},
}
@inproceedings{bb241286,
  author    = {Khoshsirat, S. and Kambhamettu, C.},
  title     = {Sentence Attention Blocks for Answer Grounding},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {6057-6067},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236202},
}
@inproceedings{bb241287,
  author    = {Li, H.X. and Cao, M. and Cheng, X. and Li, Y.W. and Zhu, Z.H. and Zou, Y.X.},
  title     = {G2L: Semantically Aligned and Uniform Video Grounding via Geodesic and Game Theory},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {11998-12008},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236203},
}
@inproceedings{bb241288,
  author    = {Li, H. and Shu, X.J. and He, S. and Qiao, R.Z. and Wen, W. and Guo, T. and Gan, B. and Sun, X.},
  title     = {D3G: Exploring Gaussian Prior for Temporal Sentence Grounding with Glance Annotation},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {13688-13700},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236204},
}
@inproceedings{bb241289,
  author    = {Pan, Y.L. and He, X.T. and Gong, B. and Lv, Y.L. and Shen, Y.J. and Peng, Y.X. and Zhao, D.L.},
  title     = {Scanning Only Once: An End-to-end Framework for Fast Temporal Grounding in Long Videos},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {13721-13731},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236205},
}
@inproceedings{bb241290,
  author    = {Jang, J. and Park, J. and Kim, J. and Kwon, H. and Sohn, K.H.},
  title     = {Knowing Where to Focus: Event-aware Transformer for Video Grounding},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {13800-13810},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236206},
}
% NOTE(review): the page range below starts and ends on the same page (15179),
% which looks like a truncated end page; confirm against the publisher record.
@inproceedings{bb241291,
  author    = {Zhang, Y.M. and Gong, Z. and Chang, A.X.},
  title     = {Multi3DRefer: Grounding Text Description to Multiple 3D Objects},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {15179-15179},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236207},
}
@inproceedings{bb241292,
  author    = {Li, H. and Wei, P. and Ma, Z. and Zheng, N.N.},
  title     = {Inverse Compositional Learning for Weakly-supervised Relation Grounding},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {15431-15441},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236208},
}
% The sharp s in the fourth author's surname is written as {\ss} so the entry
% stays plain ASCII like the rest of this file and is handled as a single
% letter by classic BibTeX.
@inproceedings{bb241293,
  author    = {Chen, D.Z.Y. and Hu, R.H. and Chen, X.L. and Nie{\ss}ner, M. and Chang, A.X.},
  title     = {UniT3D: A Unified Transformer for 3D Dense Captioning and Visual Grounding},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {18063-18073},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236209},
}
@inproceedings{bb241294,
  author    = {de la Jara, I.M. and Rodriguez Opazo, C. and Marrese Taylor, E. and Bravo Marquez, F.},
  title     = {An empirical study of the effect of video encoders on Temporal Video Grounding},
  booktitle = CLVL23,
  year      = {2023},
  pages     = {2842-2847},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236210},
}
@inproceedings{bb241295,
  author    = {Wang, Z. and Huang, H.F. and Zhao, Y. and Li, L.J. and Cheng, X.Z. and Zhu, Y.C. and Yin, A. and Zhao, Z.},
  title     = {Distilling Coarse-to-Fine Semantic Matching Knowledge for Weakly Supervised 3D Visual Grounding},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {2662-2671},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236211},
}
@inproceedings{bb241296,
  author    = {Guo, Z. and Tang, Y.W. and Zhang, R. and Wang, D. and Wang, Z.G. and Zhao, B. and Li, X.L.},
  title     = {ViewRefer: Grasp the Multi-view Knowledge for 3D Visual Grounding},
  booktitle = ICCV23,
  year      = {2023},
  pages     = {15326-15337},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236212},
}
@inproceedings{bb241297,
  author    = {Hsu, J. and Mao, J.Y. and Wu, J.J.},
  title     = {NS3D: Neuro-Symbolic Grounding of 3D Objects and Relations},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {2614-2623},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236213},
}
@inproceedings{bb241298,
  author    = {Yi, J. and Uzkent, B. and Ignat, O. and Li, Z.L. and Garg, A. and Yu, X. and Liu, L.},
  title     = {Augment the Pairs: Semantics-Preserving Image-Caption Pair Augmentation for Grounding-Based Vision and Language Models},
  booktitle = WACV24,
  year      = {2024},
  pages     = {5508-5518},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236214},
}
@inproceedings{bb241299,
  author    = {Uzkent, B. and Garg, A. and Zhu, W.T. and Doshi, K. and Yi, J. and Wang, X.L. and Omar, M.},
  title     = {Dynamic Inference with Grounding Based Vision and Language Models},
  booktitle = CVPR23,
  year      = {2023},
  pages     = {2624-2633},
  bibsource = {http://www.visionbib.com/bibliography/applicat803vgr2.html#TT236215},
}
Last update: Feb 26, 2026 at 10:58:24