% Deng et al., "Visual Grounding via Accumulated Attention" (IEEE TPAMI).
% NOTE(review): the export carried empty volume/number/month and a placeholder
% page range (1-1), which looks like an early-access record. Those fields are
% omitted here; TODO add the final volume/number/pages once verified against
% the publisher record, then remove the note field.
@article{9201037,
  author   = {Deng, C. and Wu, Q. and Wu, Q. and Lyu, F. and Hu, F. and Tan, M.},
  title    = {Visual Grounding via Accumulated Attention},
  journal  = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year     = {2020},
  note     = {Early Access},
  doi      = {10.1109/TPAMI.2020.3023438},
  issn     = {1939-3539},
  keywords = {Proposals;Visualization;Training;Feature extraction;Task analysis;Grounding;Cognition;Visual Grounding;Accumulated Attention;Noised Training Strategy;Bounding Box Regression},
  abstract = {Visual Grounding (VG) aims to locate the most relevant object or region in an image, based on a natural language query. In real-world VG applications, however, we usually have to deal with ambiguous queries and images with complicated scene structures. Identifying the target based on highly redundant and correlated information can be very challenging, leading to unsatisfactory performance. To tackle this, in this paper, we exploit an attention module for each kind of information to reduce the internal redundancies. We then propose the Accumulated Attention mechanism to reason among all the attention modules jointly, thus the correlations among different kinds of information can be explicitly captured. Moreover, to improve the performance and robustness of our VG models, we introduce some noises into the training procedure to bridge the distribution gap between the human-labeled training data and the real-world poor quality data. With this ``noised'' training strategy, we further learn a bounding box regressor, which can be used to refine the bounding box of the target object. We evaluate the proposed methods on four benchmark datasets. The experimental results show that our methods significantly outperform all previous works on every dataset in terms of both speed and accuracy.},
}