Natalia Wojak-Strzelecka, Szymon Bobek, Mehrdad Asadi, Ann Nowe, Krzysztof Kutt, José García, Grzegorz J. Nalepa
Anomaly detection is critical in industrial domains such as quality control and predictive maintenance, where combining it with visual inspection and explainability enhances trust and reduces errors. This study evaluates a pre-trained multimodal foundation model for visual anomaly detection and explanation on the MVTec dataset, using a post hoc fusion strategy that integrates outputs from independent models. The setup includes comparisons with PatchCore and extended configurations incorporating metadata such as heatmaps, segmentation masks, and patches. Results show that the multimodal model outperforms PatchCore on texture categories (mean F1: 0.960 vs. 0.947), but underperforms on object categories. It offers interpretable explanations in simple cases, but limited classification accuracy, marginal benefits from metadata augmentation, and reduced specificity in complex scenes indicate that further refinement is needed. These findings highlight the potential of multimodal models for explainable anomaly detection while underscoring current limitations in handling complex object structures.
Wojak-Strzelecka, N, Bobek, S, Asadi, M, Nowe, A, Kutt, K, García, J & Nalepa, GJ 2026, Explainable Visual Anomaly Detection with Multimodal Models and Metadata-Augmented Prompts. in I Koprinska, J Mendes-Moreira & P Branco (eds), Machine Learning and Principles and Practice of Knowledge Discovery in Databases - International Workshops of ECML PKDD 2025, Revised Selected Papers. Communications in Computer and Information Science, vol. 2842 CCIS, Springer, pp. 156-165, European Conference on Machine Learning and Principles and Practice of Knowledge Discovery in Databases (ECML PKDD 2025), Porto, Portugal, 15/09/25. https://doi.org/10.1007/978-3-032-19105-2_12
Wojak-Strzelecka, N., Bobek, S., Asadi, M., Nowe, A., Kutt, K., García, J., & Nalepa, G. J. (2026). Explainable Visual Anomaly Detection with Multimodal Models and Metadata-Augmented Prompts. In I. Koprinska, J. Mendes-Moreira, & P. Branco (Eds.), Machine Learning and Principles and Practice of Knowledge Discovery in Databases - International Workshops of ECML PKDD 2025, Revised Selected Papers (pp. 156-165). (Communications in Computer and Information Science; Vol. 2842 CCIS). Springer. https://doi.org/10.1007/978-3-032-19105-2_12
% Workshop paper published in the Springer CCIS series, vol. 2842 (ECML PKDD 2025 workshops).
% Authors are stored in "Last, First" form so that "Grzegorz J. Nalepa" parses with surname Nalepa.
% The doi field identifies the paper itself; the url field is the conference website.
@inproceedings{9d132116d6da4e02ae527c20fb15a014,
title = "Explainable Visual Anomaly Detection with Multimodal Models and Metadata-Augmented Prompts",
abstract = "Anomaly detection is critical in industrial domains such as quality control and predictive maintenance, where combining it with visual inspection and explainability enhances trust and reduces errors. This study evaluates a pre-trained multimodal foundation model for visual anomaly detection and explanation on the MVTec dataset, using a post hoc fusion strategy that integrates outputs from independent models. The setup includes comparisons with PatchCore and extended configurations incorporating metadata such as heatmaps, segmentation masks, and patches. Results show that the multimodal model outperforms PatchCore on texture categories (mean F1: 0.960 vs. 0.947), but underperforms on object categories. It offers interpretable explanations in simple cases, but limited classification accuracy, marginal benefits from metadata augmentation, and reduced specificity in complex scenes indicate that further refinement is needed. These findings highlight the potential of multimodal models for explainable anomaly detection while underscoring current limitations in handling complex object structures.",
author = "Wojak-Strzelecka, Natalia and Bobek, Szymon and Asadi, Mehrdad and Nowe, Ann and Kutt, Krzysztof and Garc{\'i}a, Jos{\'e} and Nalepa, Grzegorz J.",
note = "Publisher Copyright: {\textcopyright} The Author(s), under exclusive license to Springer Nature Switzerland AG 2026.; European Conference on Machine Learning and Principles and Practice of Knowledge Discovery in Databases (ECML PKDD 2025), ECML PKDD 2025 ; Conference date: 15-09-2025 Through 19-09-2025",
year = "2026",
doi = "10.1007/978-3-032-19105-2_12",
language = "English",
isbn = "9783032191045",
series = "Communications in Computer and Information Science",
volume = "2842",
publisher = "Springer",
pages = "156--165",
editor = "Irena Koprinska and Jo{\~a}o Mendes-Moreira and Paula Branco",
booktitle = "Machine Learning and Principles and Practice of Knowledge Discovery in Databases - International Workshops of {ECML} {PKDD} 2025, Revised Selected Papers",
url = "https://ecmlpkdd.org/2025/",
}