Tracking multiple objects based on textual queries is a challenging task that requires linking language understanding with object association across frames. Previous works typically train the whole process end-to-end or integrate an additional referring text module into a multi-object tracker, but they both require supervised training and potentially struggle with generalization to open-set queries. In this work, we introduce ReferGPT, a novel zero-shot referring multi-object tracking framework. We provide a multi-modal large language model (MLLM) with spatial knowledge, enabling it to generate 3D-aware captions. This enhances its descriptive capabilities and supports a more flexible referring vocabulary without training. We also propose a robust query-matching strategy, leveraging CLIP-based semantic encoding and fuzzy matching to associate MLLM-generated captions with user queries. Extensive experiments on Refer-KITTI, Refer-KITTIv2 and Refer-KITTI+ demonstrate that ReferGPT achieves competitive performance against trained methods, showcasing its robustness and zero-shot capabilities in autonomous driving. The code will be publicly available on GitHub.
Chamiti, T, Di Bella, L, Munteanu, A & Deligiannis, N 2025, ReferGPT: Towards Zero-Shot Referring Multi-Object Tracking. in 2025 IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW). IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops, IEEE, pp. 3849-3858. https://doi.org/10.1109/CVPRW67362.2025.00370
Chamiti, T., Di Bella, L., Munteanu, A., & Deligiannis, N. (2025). ReferGPT: Towards Zero-Shot Referring Multi-Object Tracking. In 2025 IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW) (pp. 3849-3858). (IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops). IEEE. https://doi.org/10.1109/CVPRW67362.2025.00370
% ReferGPT (CVPR Workshops 2025) -- conference paper entry.
% Notes for maintainers:
%   * Author names use the unambiguous "Last, First" form. The two-word
%     surname "Di Bella" needs no escaping in this form; the exported
%     "\{Di Bella\}" would have typeset literal brace characters.
%   * "ReferGPT" is brace-protected in the title so sentence-casing styles
%     do not lower-case it.
%   * The citation key is the exporter-generated identifier and is kept
%     unchanged so existing citations continue to resolve.
@inproceedings{1507b5894cd24a2da374ced5d2235136,
  author    = {Chamiti, Tzoulio and Di Bella, Leandro and Munteanu, Adrian and Deligiannis, Nikos},
  title     = {{ReferGPT}: Towards Zero-Shot Referring Multi-Object Tracking},
  booktitle = {2025 {IEEE/CVF} Conference on Computer Vision and Pattern Recognition Workshops ({CVPRW})},
  series    = {IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops},
  publisher = {IEEE},
  year      = {2025},
  pages     = {3849--3858},
  doi       = {10.1109/CVPRW67362.2025.00370},
  isbn      = {9798331599959},
  language  = {English},
  note      = {Publisher Copyright: {\textcopyright} 2025 IEEE.},
  abstract  = {Tracking multiple objects based on textual queries is a challenging task that requires linking language understanding with object association across frames. Previous works typically train the whole process end-to-end or integrate an additional referring text module into a multi-object tracker, but they both require supervised training and potentially struggle with generalization to open-set queries. In this work, we introduce ReferGPT, a novel zero-shot referring multi-object tracking framework. We provide a multi-modal large language model (MLLM) with spatial knowledge enabling it to generate 3D-aware captions. This enhances its descriptive capabilities and supports a more flexible referring vocabulary without training. We also propose a robust query-matching strategy, leveraging CLIP-based semantic encoding and fuzzy matching to associate MLLM generated captions with user queries. Extensive experiments on Refer-KITTI, Refer-KITTIv2 and Refer-KITTI+ demonstrate that ReferGPT achieves competitive performance against trained methods, showcasing its robustness and zero-shot capabilities in autonomous driving. The codes will be publicly available on github.},
}