% Journal version (TPAMI vol. 42, no. 3, 2020). The "19" in the citation key
% presumably reflects the 2019 pre-print mentioned in the note field; the key
% is kept unchanged so existing \cite commands continue to resolve.
@article{Simon19:Implicit,
  type     = {article},
  key      = {Simon19:Implicit},
  author   = {Simon, Marcel and Rodner, Erik and Darrell, Trevor and Denzler, Joachim},
  title    = {The Whole Is More Than Its Parts? From Explicit to Implicit Pose Normalization},
  journal  = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year     = {2020},
  month    = mar,
  volume   = {42},
  number   = {3},
  pages    = {749--763},
  note     = {(Pre-print published in 2019.)},
  abstract = {Fine-grained classification describes the automated recognition of visually similar object categories like birds species. Previous works were usually based on explicit pose normalization, i.e., the detection and description of object parts. However, recent models based on a final global average or bilinear pooling have achieved a comparable accuracy without this concept. In this paper, we analyze the advantages of these approaches over generic CNNs and explicit pose normalization approaches. We also show how they can achieve an implicit normalization of the object pose. A novel visualization technique called activation flow is introduced to investigate limitations in pose handling in traditional CNNs like AlexNet and VGG. Afterward, we present and compare the explicit pose normalization approach neural activation constellations and a generalized framework for the final global average and bilinear pooling called α-pooling. We observe that the latter often achieves a higher accuracy improving common CNN models by up to 22.9%, but lacks the interpretability of the explicit approaches. We present a visualization approach for understanding and analyzing predictions of the model to address this issue. Furthermore, we show that our approaches for fine-grained recognition are beneficial for other fields like action recognition.},
}