% DIMVA 2025 paper. The `type` and `key` fields are kept as exported; presumably external tooling reads them (verify before removing).
@inproceedings{tripathi2025towards,
  type      = {inproceedings},
  key       = {tripathi2025towards},
  author    = {Tripathi, Jayesh and Gomes, Heitor and Botacin, Marcus},
  title     = {Towards Explainable Drift Detection and Early Retrain in {ML}-Based Malware Detection Pipelines},
  booktitle = {International Conference on Detection of Intrusions and Malware, and Vulnerability Assessment},
  series    = {Lecture Notes in Computer Science},
  volume    = {15748},
  pages     = {3--24},
  year      = {2025},
  month     = jul,
  doi       = {10.1007/978-3-031-97623-0_1},
  abstract  = {The current largest challenge in ML-based malware detection is maintaining high detection rates while samples evolve. Although multiple works have proposed drift detectors and retraining-aware pipelines that work with reasonable efficiency, none of these detectors and pipelines are currently explainable, which limits our understanding of the threats’ evolution and the detector’s efficiency. Despite previous works that presented taxonomies of concept drift events, no practical solution for explainable drift detection in malware pipelines existed until this work. Our insight to change this scenario is to split the classifier knowledge into two: (1) the knowledge about the frontier between Malware (M) and Goodware (G); and (2) the knowledge about the concept of the (M and G) classes. Thus, we can understand whether the concept or the classification frontier changed by measuring the variations in these two domains. We make this approach practical by deploying a pipeline with meta-classifiers to measure these sub-classes of the main malware detector. We demonstrate via 5K+ experiment runs the viability of our solution by (1) illustrating how it explains every drift point of the DREBIN and AndroZoo datasets and (2) how an explainable drift detector makes online retraining to achieve higher rates and requires fewer retraining points.},
}