% Barz and Denzler, VISAPP 2022: conference paper on weakly-supervised
% localization of multiple objects using the cosine loss (Dense Class Maps).
% The page range uses a double hyphen, and the percent sign in the abstract is
% escaped so that styles which print the abstract do not comment out the
% remainder of the line.
@inproceedings{barz2022weaklysupervised,
  author       = {Barz, Björn and Denzler, Joachim},
  title        = {Weakly-Supervised Localization of Multiple Objects in Images using Cosine Loss},
  booktitle    = {International Conference on Computer Vision Theory and Applications ({VISAPP})},
  organization = {INSTICC},
  publisher    = {SciTePress},
  pages        = {287--296},
  year         = {2022},
  doi          = {10.5220/0010760800003124},
  abstract     = {Can we learn to localize objects in images from just image-level class labels? Previous research has shown that this ability can be added to convolutional neural networks (CNNs) trained for image classification post hoc without additional cost or effort using so-called class activation maps (CAMs). However, while CAMs can localize a particular known class in the image quite accurately, they cannot detect and localize instances of multiple different classes in a single image. This limitation is a consequence of the missing comparability of prediction scores between classes, which results from training with the cross-entropy loss after a softmax activation. We find that CNNs trained with the cosine loss instead of cross-entropy do not exhibit this limitation and propose a variation of CAMs termed Dense Class Maps (DCMs) that fuse predictions for multiple classes into a coarse semantic segmentation of the scene. Even though the network has only been trained for single-label classification at the image level, DCMs allow for detecting the presence of multiple objects in an image and locating them. Our approach outperforms CAMs on the MS COCO object detection dataset by a relative increase of 27\% in mean average precision.},
  type         = {inproceedings},
  key          = {barz2022weaklysupervised},
}