<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1 20151215//EN" "http://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xml:lang="en" article-type="research-article" dtd-version="1.1">
<front>
<journal-meta>
<journal-id journal-id-type="pmc">CMC</journal-id>
<journal-id journal-id-type="nlm-ta">CMC</journal-id>
<journal-id journal-id-type="publisher-id">CMC</journal-id>
<journal-title-group>
<journal-title>Computers, Materials &#x0026; Continua</journal-title>
</journal-title-group>
<issn pub-type="epub">1546-2226</issn>
<issn pub-type="ppub">1546-2218</issn>
<publisher>
<publisher-name>Tech Science Press</publisher-name>
<publisher-loc>USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">54841</article-id>
<article-id pub-id-type="doi">10.32604/cmc.2024.054841</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Article</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>A Concise and Varied Visual Features-Based Image Captioning Model with Visual Selection</article-title>
<alt-title alt-title-type="left-running-head">A Concise and Varied Visual Features-Based Image Captioning Model with Visual Selection</alt-title>
<alt-title alt-title-type="right-running-head">A Concise and Varied Visual Features-Based Image Captioning Model with Visual Selection</alt-title>
</title-group>
<contrib-group>
<contrib id="author-1" contrib-type="author" corresp="yes">
<name name-style="western"><surname>Thobhani</surname><given-names>Alaa</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><email>althobhanialaa@gmail.com</email></contrib>
<contrib id="author-2" contrib-type="author">
<name name-style="western"><surname>Zou</surname><given-names>Beiji</given-names></name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
<contrib id="author-3" contrib-type="author">
<name name-style="western"><surname>Kui</surname><given-names>Xiaoyan</given-names></name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
<contrib id="author-4" contrib-type="author">
<name name-style="western"><surname>Abdussalam</surname><given-names>Amr</given-names></name><xref ref-type="aff" rid="aff-2">2</xref></contrib>
<contrib id="author-5" contrib-type="author">
<name name-style="western"><surname>Asim</surname><given-names>Muhammad</given-names></name><xref ref-type="aff" rid="aff-3">3</xref></contrib>
<contrib id="author-6" contrib-type="author">
<name name-style="western"><surname>Ahmed</surname><given-names>Naveed</given-names></name><xref ref-type="aff" rid="aff-4">4</xref></contrib>
<contrib id="author-7" contrib-type="author">
<name name-style="western"><surname>Ali Alshara</surname><given-names>Mohammed</given-names></name><xref ref-type="aff" rid="aff-4">4</xref><xref ref-type="aff" rid="aff-5">5</xref></contrib>
<aff id="aff-1"><label>1</label><institution>School of Computer Science and Engineering, Central South University</institution>, <addr-line>Changsha, 410083</addr-line>, <country>China</country></aff>
<aff id="aff-2"><label>2</label><institution>Electronic Engineering and Information Science Department, University of Science and Technology of China</institution>, <addr-line>Hefei, 230026</addr-line>, <country>China</country></aff>
<aff id="aff-3"><label>3</label><institution>EIAS Data Science Lab, College of Computer and Information Sciences, Prince Sultan University</institution>, <addr-line>Riyadh, 11586</addr-line>, <country>Saudi Arabia</country></aff>
<aff id="aff-4"><label>4</label><institution>College of Computer and Information Sciences, Prince Sultan University</institution>, <addr-line>Riyadh, 11586</addr-line>, <country>Saudi Arabia</country></aff>
<aff id="aff-5"><label>5</label><institution>College of Computer and Information Sciences, Imam Mohammad Ibn Saud Islamic University</institution>, <addr-line>Riyadh, 11432</addr-line>, <country>Saudi Arabia</country></aff>
</contrib-group>
<author-notes>
<corresp id="cor1"><label>&#x002A;</label>Corresponding Author: Alaa Thobhani. Email: <email>althobhanialaa@gmail.com</email></corresp>
</author-notes>
<pub-date date-type="collection" publication-format="electronic">
<year>2024</year>
</pub-date>
<pub-date date-type="pub" publication-format="electronic">
<day>18</day><month>11</month><year>2024</year>
</pub-date>
<volume>81</volume>
<issue>2</issue>
<fpage>2873</fpage>
<lpage>2894</lpage>
<history>
<date date-type="received">
<day>09</day>
<month>6</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>29</day>
<month>9</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>&#x00A9; 2024 The Authors.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Published by Tech Science Press.</copyright-holder>
<license xlink:href="https://creativecommons.org/licenses/by/4.0/">
<license-p>This work is licensed under a <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="TSP_CMC_54841.pdf"></self-uri>
<abstract>
<p>Image captioning has gained increasing attention in recent years. Visual characteristics found in input images play a crucial role in generating high-quality captions. Prior studies have used visual attention mechanisms to dynamically focus on localized regions of the input image, improving the effectiveness of identifying relevant image regions at each step of caption generation. However, providing image captioning models with the capability of selecting the most relevant visual features from the input image and attending to them can significantly improve the utilization of these features. Consequently, this leads to enhanced captioning network performance. In light of this, we present an image captioning framework that efficiently exploits the extracted representations of the image. Our framework comprises three key components: the Visual Feature Detector module (VFD), the Visual Feature Visual Attention module (VFVA), and the language model. The VFD module is responsible for detecting a subset of the most pertinent features from the local visual features, creating an updated visual features matrix. Subsequently, the VFVA directs its attention to the visual features matrix generated by the VFD, resulting in an updated context vector employed by the language model to generate an informative description. Integrating the VFD and VFVA modules introduces an additional layer of processing for the visual features, thereby contributing to enhancing the image captioning model&#x2019;s performance. Using the MS-COCO dataset, our experiments show that the proposed framework competes well with state-of-the-art methods, effectively leveraging visual representations to improve performance. The implementation code can be found here: <ext-link ext-link-type="uri" xlink:href="https://github.com/althobhani/VFDICM">https://github.com/althobhani/VFDICM</ext-link> (accessed on 30 July 2024).</p>
</abstract>
<kwd-group kwd-group-type="author">
<kwd>Visual attention</kwd>
<kwd>image captioning</kwd>
<kwd>visual feature detector</kwd>
<kwd>visual feature visual attention</kwd>
</kwd-group>
<funding-group>
<award-group id="awg1">
<funding-source>National Natural Science Foundation of China</funding-source>
<award-id>U22A2034</award-id>
<award-id>62177047</award-id>
</award-group>
<award-group id="awg2">
<funding-source>Central South University Research Programme of Advanced Interdisciplinary Studies</funding-source>
<award-id>2023QYJC020</award-id>
</award-group>
</funding-group>
</article-meta>
</front>
<body>
<sec id="s1">
<label>1</label>
<title>Introduction</title>
<p>In image captioning, the model faces the formidable task of accurately discerning the salient objects within an image, comprehending their inherent characteristics and attributes, and effectively conveying the intricate interactions between these detected objects. Image captioning networks typically adhere to the encoder-decoder framework. Ingrained in Convolutional Neural Network (CNN), the encoder module diligently extracts the visual features and representations embedded within the input image. In parallel, the decoder module, founded on Recurrent Neural Networks (RNN), assumes the crucial role of generating a coherent textual description that encapsulates the essence of the image&#x2019;s content. This structured approach enables the model to seamlessly connect the visual and linguistic realms, transforming visual data into meaningful and interpretable textual descriptions, thus bridging the gap between computer vision [<xref ref-type="bibr" rid="ref-1">1</xref>&#x2013;<xref ref-type="bibr" rid="ref-3">3</xref>] and natural language processing.</p>
<p>Despite the notable progress made in previous studies on image captioning, there are still inherent limitations in existing approaches. Specifically, conventional visual-based image captioning methods have a tendency to rely on the same set of visual features throughout all time steps. This uniform treatment persists even when many object features in the input image may not be pertinent to the linguistic context required for generating the subsequent word in the caption. This reliance on irrelevant visual features poses a significant challenge. The inclusion of unrelated objects in the visual input has the potential to divert the attention of the image captioning model towards incorrect visual elements. This, in turn, can result in the generation of inaccurate words within the captions produced by the model. Therefore, it becomes imperative to delve into and discern the most relevant visual features at each time step. This exploration aims to augment the visual attention module, with the ultimate goal of refining the performance of caption generators. By identifying and focusing on the most contextually relevant visual cues during each stage of caption generation, we seek to enhance the accuracy and overall quality of the generated captions.</p>
<p>In our research work, we aim to design an image captioning model that can develop image descriptors capable of efficiently exploiting the visual features of images. This proposed method can contribute to boosting the performance of the image captioning models and generating high-quality descriptions. The proposed new method, Visual Features Detection Based Image Captioning Model (VFDICM), aims to exploit the visual features of the input image effectively to enhance the performance of the image captioning models and generate more informative descriptions with higher quality. The proposed image captioning model is essentially built on the UpDown [<xref ref-type="bibr" rid="ref-4">4</xref>] framework and incorporates two additional modules that help leverage the visual features of the input image. These two modules are the visual feature detector module (VFD) and the visual feature visual attention module (VFVA). The VFD module is used to dynamically select the most related features from the visual features to generate a new visual matrix, which consists of the top-k most related visual features to the current linguistic context. Meanwhile, the VFVA module is used to attend to the selected visual features and generate a new visual context vector. This generated vector is fed into the language Long Short-Term Memory (LSTM) layer and used to generate the next word of the partial caption. An illustration of the newly proposed image captioning model is shown in <xref ref-type="fig" rid="fig-1">Fig. 1</xref>.</p>
<fig id="fig-1">
<label>Figure 1</label>
<caption>
<title>An overview of the proposed pipeline for VFDICM model</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_54841-fig-1.tif"/>
</fig>
<p>The primary objective of our algorithm is to capture the visual attributes within the input image and emphasize their importance, as well as to enhance the performance and scoring of the model for the captioning of the input image. In order to determine the effectiveness of our captioning network, a comprehensive and thorough evaluation was conducted using the MS-COCO [<xref ref-type="bibr" rid="ref-5">5</xref>] dataset. The captions generated by our model display a level of quality that is on par with those produced by numerous state-of-the-art methods when it comes to evaluation metrics, demonstrating the competitiveness of our approach. As a result of our extensive experiments, we have conclusively demonstrated that the visual feature matrix predicted by the VFD serves as a great guide for the decoder network, resulting in the generation of captions that are much more informative. This significantly outperforms numerous recent image captioning models, marking a significant achievement in the field.</p>
<p>This work makes several notable contributions, which can be summarized as follows:
<list list-type="bullet">
<list-item>
<p>We explore the impact of utilizing various sets of visual features at each time step to enhance the quality of the generated captions.</p></list-item>
<list-item>
<p>We propose a new method, Visual Features Detection based Image Captioning Model (VFDICM), which aims to exploit the visual features of the input image effectively to enhance the performance of the image captioning models and generate more informative descriptions with higher quality.</p></list-item>
<list-item>
<p>We propose the VFD module for dynamically predicting the most relevant visual features, and the VFVA module for additional attention-based processing of the visual features.</p></list-item>
<list-item>
<p>We evaluate our model VFDICM on the MS-COCO dataset, and the results show that the proposed method performs comparably to the latest state-of-the-art techniques according to evaluation metrics.</p></list-item>
</list></p>
</sec>
<sec id="s2">
<label>2</label>
<title>Related Work</title>
<p>A dynamic visual attention mechanism was introduced in [<xref ref-type="bibr" rid="ref-6">6</xref>&#x2013;<xref ref-type="bibr" rid="ref-8">8</xref>], attention was integrated into the captioning process [<xref ref-type="bibr" rid="ref-7">7</xref>], and adaptive attention was adopted [<xref ref-type="bibr" rid="ref-9">9</xref>]. Other developments encompass merging top-down and bottom-up attention mechanisms [<xref ref-type="bibr" rid="ref-4">4</xref>], enhancing attention through memory mechanisms [<xref ref-type="bibr" rid="ref-10">10</xref>], as well as introducing task-adaptive attention to non-visual words through new models of attention [<xref ref-type="bibr" rid="ref-11">11</xref>]. The quality of captions can also be improved if two attentions are focused on pyramid images simultaneously [<xref ref-type="bibr" rid="ref-12">12</xref>], as well as when grounding networks based on clusters are considered [<xref ref-type="bibr" rid="ref-13">13</xref>]. A new metric, Proposal Attention Correctness (PAC) [<xref ref-type="bibr" rid="ref-13">13</xref>], provides a bridge between evaluation of the proposal&#x2019;s performance and its visual grounding. Transformer, which is a multimodal model that uses multiple modalities [<xref ref-type="bibr" rid="ref-14">14</xref>], in conjunction with a multiview feature learning system [<xref ref-type="bibr" rid="ref-15">15</xref>], extends image captioning capabilities, collectively refining attention mechanisms and diversifying captioning techniques in the field. Previous work [<xref ref-type="bibr" rid="ref-16">16</xref>] presents an innovative technique for generating captions for images based on wavelet decomposition and convolutional neural networks in order to achieve comprehensive information extraction. 
Reference [<xref ref-type="bibr" rid="ref-17">17</xref>] introduces novel image captioning Hybrid Attention Networks (HANs) that combine human captioning attention with machine attention mechanisms to address issues like &#x201C;object hallucination&#x201D; and enhance caption diversity. Reference [<xref ref-type="bibr" rid="ref-18">18</xref>] introduces novel attention mechanisms (LSA and LSF) to enhance local visual modeling by leveraging grid features. The authors in [<xref ref-type="bibr" rid="ref-19">19</xref>] present a framework for refined visual attention (RVA), in which the internal reweighting of visual attention is dependent upon the language context. Reference [<xref ref-type="bibr" rid="ref-20">20</xref>] introduces a GVA-based approach to image caption generation, enhancing the quality of captions by re-adjusting attentional weights. Reference [<xref ref-type="bibr" rid="ref-21">21</xref>] introduces JRAN, an image captioning approach that enhances caption coherence by investigating the relationships between region and semantic features. Despite the effectiveness of these methods, they still grapple with challenges. In particular, these approaches often use the same visual features at all time steps, even when many features are irrelevant to the context needed for the next word. This reliance on irrelevant features can misdirect the model&#x2019;s attention, resulting in inaccurate words in the captions. This highlights the necessity of developing a mechanism to investigate and identify the most relevant visual features at each time step. Such an effort aims to refine the visual attention module, ultimately enhancing the performance of caption generators by pinpointing and emphasizing the most contextually relevant visual cues during each stage of caption generation.</p>
<p>Image attributes have been used alongside image features to enhance caption quality [<xref ref-type="bibr" rid="ref-22">22</xref>], which involved multimodal attribute detectors trained together with captioning models [<xref ref-type="bibr" rid="ref-23">23</xref>]. Additionally, PoS information has been incorporated into models, guiding information flow and caption generation [<xref ref-type="bibr" rid="ref-24">24</xref>&#x2013;<xref ref-type="bibr" rid="ref-26">26</xref>]. Topics extracted from caption corpora have been integrated into captioning tasks, influencing sentence generation [<xref ref-type="bibr" rid="ref-27">27</xref>&#x2013;<xref ref-type="bibr" rid="ref-30">30</xref>]. Some models adopt saliency mechanisms that enhance image representations based on visual, semantic, and sample-related saliency [<xref ref-type="bibr" rid="ref-31">31</xref>]. Attention components, such as semantic and text-guided attention, have also been employed to identify semantic attributes associated with image representations [<xref ref-type="bibr" rid="ref-32">32</xref>]. Multi-stage image descriptors like Stack-VS have been designed to efficiently exploit semantic and visual information through top-down and bottom-up techniques [<xref ref-type="bibr" rid="ref-33">33</xref>]. These diverse approaches collectively contribute to improving image captioning by integrating various sources of information and enhancing model performance. Prior work [<xref ref-type="bibr" rid="ref-34">34</xref>] introduces FUSECAP, enriching captions with visual expert insights and a large language model, creating 12 million improved caption pairs. These enhanced captions improve image captioning models and benefit image-text retrieval. Reference [<xref ref-type="bibr" rid="ref-35">35</xref>] presents a novel semantic-guided attention network for image captioning, integrating external knowledge into a Transformer-based model. 
Reference [<xref ref-type="bibr" rid="ref-36">36</xref>] introduces the Face-Att model, focusing on generating attribute-centric image captions with a special emphasis on facial features. However, semantic attention in image captioning faces limitations such as incomplete coverage of relevant image attributes, reliance on predefined attributes limiting adaptability to new visual elements, sensitivity to noise in part-of-speech (PoS) information, and challenges in effectively capturing diverse topics while avoiding biases from saliency mechanisms.</p>
<p>In tackling the challenge of encapsulating comprehensive visual content within a single caption, certain image captioning methods have opted for generating multiple descriptions encompassing various facets of an image. One such approach involves a multi-sentence image captioning model utilizing conditional Generative Adversarial Networks (GAN) [<xref ref-type="bibr" rid="ref-37">37</xref>]. This model takes an input image and a random vector, facilitating caption diversity through the joint training of a generator and an evaluator. The objective is to describe various image details by leveraging multiple sentences. Another innovative approach in this domain is the introduction of a multi-caption image captioning network based on topics [<xref ref-type="bibr" rid="ref-38">38</xref>]. In this model, an image and a topic are used as input to generate a topic-related caption while maintaining topical consistency. This is achieved through the fusion gate unit and the utilization of a topic classifier for accurate topic prediction. These approaches collectively contribute to effectively addressing the complexity of image content representation and diversity in captioning. More recently, a novel model presented in [<xref ref-type="bibr" rid="ref-39">39</xref>] introduces a unique approach that takes into account the number of ground truth captions available for an image during training. This model learns from the numbers associated with these captions and utilizes them to generate diverse captions for the image. Rather than solely relying on the semantic information provided by ground truth captions, this model capitalizes on the quantitative availability of multiple captions to create a varied set of captions for images. This innovative strategy contributes to a significant advancement in the domain of image captioning. However, generating multiple descriptions introduces complexity, and evaluating performance requires adapted metrics. 
Obtaining diverse training data is essential but may be limited. User preferences for the number and style of sentences are subjective, and further challenges include computational resource demands, the risk of redundancy, and interpretability issues.</p>
<p>Cross-entropy loss functions are used to predict the next word in ground truth captions, with evaluation metrics used post-generation. These non-differentiable evaluation metrics have been used to optimize image captioning models using reinforcement learning methods in recent years [<xref ref-type="bibr" rid="ref-40">40</xref>&#x2013;<xref ref-type="bibr" rid="ref-42">42</xref>]. Self-critical sequence training (SCST) [<xref ref-type="bibr" rid="ref-43">43</xref>] leverages the CIDEr metric for optimization, demonstrating significant improvements in model performance, particularly in CIDEr. Based on the global-local discrimination objective, Reference [<xref ref-type="bibr" rid="ref-40">40</xref>] introduces a reinforcement learning-based optimization approach, incorporating local and global constraints to generate more descriptive captions with finer visual details. Another model [<xref ref-type="bibr" rid="ref-44">44</xref>] incorporates a Kullback-Leibler (KL) divergence term in order to differentiate between accurate and inaccurate predictions, leveraging knowledge graphs to enhance description quality. Hierarchical Attention Fusion (HAF) [<xref ref-type="bibr" rid="ref-45">45</xref>] serves as a reinforcement learning baseline for image captioning, incorporating feature mapping for a number of levels and a revaluing scheme for word and sentence-level rewards. Vocabulary-Critical Sequence Training (VCST) [<xref ref-type="bibr" rid="ref-41">41</xref>] uses a word replacement-based vocabulary critic as a means of providing nuanced credit to words, with efficient algorithms for BLEU and CIDEr-D metric computation. Collectively, these approaches improve the quality of descriptive captions through the optimization of models based on evaluation metrics and directly improving the accuracy of the captions.</p>
</sec>
<sec id="s3">
<label>3</label>
<title>Methodology</title>
<p>Our model focuses on optimizing the use of visual features extracted from input images in order to improve the performance of image captioning models. By leveraging advanced techniques, we aim to capture more nuanced details and context from images, which significantly contributes to more accurate and descriptive captions. As illustrated in <xref ref-type="fig" rid="fig-1">Fig. 1</xref>, the comprehensive workflow begins with a Faster Region-based Convolutional Neural Network (Faster-RCNN) network employed to extract visual features from the input image. Subsequently, the visual features collected are input into the visual attention module and then into the Visual Feature Detector (VFD) module to predict the most pertinent visual features matrix. Then, the generated visual matrix is utilized in the Visual Feature Visual Attention (VFVA) module. This module attends to the selected visual features and generates a visual context vector, which is crucial to the prediction of the next word of the partially generated caption.</p>
<sec id="s3_1">
<label>3.1</label>
<title>Input Image Visual Features</title>
<p>Within our captioning network, the visual features are initially extracted from the input image to enable their utilization in subsequent processing by the language model. The initial phase of generating a description for an image involves acquiring the visual representations of the input image. A Faster-RCNN network utilizing ResNet-101 extracts object features from input images, producing an object feature matrix, represented as <italic>V</italic>, which contains <italic>N</italic> object feature vectors.
<disp-formula id="eqn-1"><label>(1)</label><mml:math id="mml-eqn-1" display="block"><mml:mi>V</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:msub><mml:mi>v</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:msub><mml:mi>v</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>v</mml:mi><mml:mi>N</mml:mi></mml:msub><mml:mo>}</mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>In this representation, <inline-formula id="ieqn-1"><mml:math id="mml-ieqn-1"><mml:msub><mml:mi>v</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mi>d</mml:mi></mml:msup></mml:math></inline-formula> refers to a vector that represents the features of an object, in which <inline-formula id="ieqn-2"><mml:math id="mml-ieqn-2"><mml:mi>i</mml:mi></mml:math></inline-formula> ranges from 1 to <italic>N</italic>, and <inline-formula id="ieqn-3"><mml:math id="mml-ieqn-3"><mml:mi>V</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mi>N</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>d</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> represents the matrix of features of the objects. In addition, a mean-pooled object features <inline-formula id="ieqn-4"><mml:math id="mml-ieqn-4"><mml:mrow><mml:mover><mml:mi>v</mml:mi><mml:mo stretchy="false">&#x00AF;</mml:mo></mml:mover></mml:mrow></mml:math></inline-formula> of the input image is also used as an extra input to the image captioning system, with the following definition:
<disp-formula id="eqn-2"><label>(2)</label><mml:math id="mml-eqn-2" display="block"><mml:mrow><mml:mover><mml:mi>v</mml:mi><mml:mo stretchy="false">&#x00AF;</mml:mo></mml:mover></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>N</mml:mi></mml:mfrac><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:munderover><mml:msub><mml:mi>v</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:math></disp-formula></p>
<p>Here, <inline-formula id="ieqn-5"><mml:math id="mml-ieqn-5"><mml:mrow><mml:mover><mml:mi>v</mml:mi><mml:mo stretchy="false">&#x00AF;</mml:mo></mml:mover></mml:mrow><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mi>d</mml:mi></mml:msup></mml:math></inline-formula>.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Visual Attention Mechanism</title>
<p>According to the framework we propose, local visual features of the input image are essential for boosting the model&#x2019;s performance. By accurately capturing these details, we can significantly enhance the overall effectiveness of the image captioning process. To focus on these local visual features, our model relies on a conventional visual attention module, which enables the model to selectively emphasize important aspects of the image. This visual attention process can be described by a set of formulas that enable precise attention to the most important visual features as a result of this operation.
<disp-formula id="eqn-3"><label>(3)</label><mml:math id="mml-eqn-3" display="block"><mml:msubsup><mml:mi>&#x03B1;</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi></mml:msubsup><mml:mo>=</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mi>c</mml:mi></mml:msub><mml:mo>&#x22C5;</mml:mo><mml:mi>tanh</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mi>a</mml:mi></mml:msub><mml:mo>&#x22C5;</mml:mo><mml:msubsup><mml:mi>h</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi></mml:msubsup><mml:mo>+</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mi>b</mml:mi></mml:msub><mml:mo>&#x22C5;</mml:mo><mml:msub><mml:mi>v</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mspace linebreak="newline" /></mml:math></disp-formula>
<disp-formula id="eqn-4"><label>(4)</label><mml:math id="mml-eqn-4" display="block"><mml:msub><mml:mi>&#x03B2;</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>softmax</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula>
<disp-formula id="eqn-5"><label>(5)</label><mml:math id="mml-eqn-5" display="block"><mml:msub><mml:mrow><mml:mover><mml:mi>v</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:munderover><mml:msubsup><mml:mi>&#x03B2;</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi></mml:msubsup><mml:mo>&#x2299;</mml:mo><mml:msub><mml:mi>v</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:math></disp-formula>where <inline-formula id="ieqn-6"><mml:math id="mml-ieqn-6"><mml:msub><mml:mrow><mml:mover><mml:mi>v</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mi>d</mml:mi></mml:msup></mml:math></inline-formula>, <inline-formula id="ieqn-7"><mml:math id="mml-ieqn-7"><mml:msub><mml:mi>&#x03B2;</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mi>N</mml:mi></mml:msup></mml:math></inline-formula>, and <inline-formula id="ieqn-8"><mml:math id="mml-ieqn-8"><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mi>N</mml:mi></mml:msup></mml:math></inline-formula>. 
<inline-formula id="ieqn-9"><mml:math id="mml-ieqn-9"><mml:msub><mml:mi>W</mml:mi><mml:mi>c</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mi>e</mml:mi></mml:msup></mml:math></inline-formula>, <inline-formula id="ieqn-10"><mml:math id="mml-ieqn-10"><mml:msub><mml:mi>W</mml:mi><mml:mi>a</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>e</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, and <inline-formula id="ieqn-11"><mml:math id="mml-ieqn-11"><mml:msub><mml:mi>W</mml:mi><mml:mi>b</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>e</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> are trainable weights. <inline-formula id="ieqn-12"><mml:math id="mml-ieqn-12"><mml:msubsup><mml:mi>h</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi></mml:msubsup><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mi>g</mml:mi></mml:msup></mml:math></inline-formula> is the hidden state of the attention LSTM.</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Language Model</title>
<p>A diagram illustrating the language model architecture is shown in <xref ref-type="fig" rid="fig-2">Fig. 2</xref>. As a basis for the approach we propose, the UpDown framework is used as the baseline structure, known for its effectiveness in image captioning tasks. There are two LSTM layers in the framework: one for language LSTMs, denoted <inline-formula id="ieqn-13"><mml:math id="mml-ieqn-13"><mml:mi>L</mml:mi><mml:mi>S</mml:mi><mml:mi>T</mml:mi><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, and another for attention LSTMs, denoted <inline-formula id="ieqn-14"><mml:math id="mml-ieqn-14"><mml:mi>L</mml:mi><mml:mi>S</mml:mi><mml:mi>T</mml:mi><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. As a result of leveraging these LSTM layers, the model can better capture and integrate sequential information from the input data. The hidden states of the attention LSTM, represented as <inline-formula id="ieqn-15"><mml:math id="mml-ieqn-15"><mml:msubsup><mml:mi>h</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi></mml:msubsup><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mi>g</mml:mi></mml:msup></mml:math></inline-formula>, and the language LSTM, represented as <inline-formula id="ieqn-16"><mml:math id="mml-ieqn-16"><mml:msubsup><mml:mi>h</mml:mi><mml:mi>t</mml:mi><mml:mi>l</mml:mi></mml:msubsup><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mi>g</mml:mi></mml:msup></mml:math></inline-formula>, can be determined by the following equations, which ensure that the interaction between visual features and language generation is precise and dynamic.</p>
<p><disp-formula id="eqn-6"><label>(6)</label><mml:math id="mml-eqn-6" display="block"><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mi>t</mml:mi><mml:mi>a</mml:mi></mml:msubsup><mml:mo>=</mml:mo><mml:mi>L</mml:mi><mml:mi>S</mml:mi><mml:mi>T</mml:mi><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mtext>&#x00A0;</mml:mtext><mml:mrow><mml:mo>(</mml:mo><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>a</mml:mi></mml:msubsup><mml:mo>;</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mrow><mml:mo>[</mml:mo><mml:msubsup><mml:mi>h</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>l</mml:mi></mml:msubsup><mml:mo>,</mml:mo><mml:mi>E</mml:mi><mml:mo>&#x22C5;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mrow><mml:mover><mml:mi>v</mml:mi><mml:mo stretchy="false">&#x00AF;</mml:mo></mml:mover></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula>
<disp-formula id="eqn-7"><label>(7)</label><mml:math id="mml-eqn-7" display="block"><mml:msubsup><mml:mi>h</mml:mi><mml:mi>t</mml:mi><mml:mi>l</mml:mi></mml:msubsup><mml:mo>=</mml:mo><mml:mi>L</mml:mi><mml:mi>S</mml:mi><mml:mi>T</mml:mi><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mtext>lan</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>l</mml:mi></mml:msubsup><mml:mo>;</mml:mo><mml:mrow><mml:mo>[</mml:mo><mml:msubsup><mml:mi>h</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi></mml:msubsup><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>v</mml:mi><mml:mo stretchy="false">&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>v</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub><mml:mo>]</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>Here, <inline-formula id="ieqn-17"><mml:math id="mml-ieqn-17"><mml:mi>E</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>q</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> represents the word embeddings matrix, and <inline-formula id="ieqn-18"><mml:math id="mml-ieqn-18"><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mi>m</mml:mi></mml:msup></mml:math></inline-formula> denotes the token generated in the previous time step. These embeddings play a crucial role in capturing the semantic meaning of the tokens. <inline-formula id="ieqn-19"><mml:math id="mml-ieqn-19"><mml:msub><mml:mrow><mml:mover><mml:mi>v</mml:mi><mml:mo stretchy="false">&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mi>d</mml:mi></mml:msup></mml:math></inline-formula> is an updated context vector which will be explained in <xref ref-type="sec" rid="s3_5">Section 3.5</xref>. To predict the next token, the hidden state of the language LSTM, <inline-formula id="ieqn-20"><mml:math id="mml-ieqn-20"><mml:msubsup><mml:mi>h</mml:mi><mml:mi>t</mml:mi><mml:mi>l</mml:mi></mml:msubsup></mml:math></inline-formula>, is fed into a fully connected layer with a softmax activation function. 
This setup allows the model to generate a probability distribution <inline-formula id="ieqn-21"><mml:math id="mml-ieqn-21"><mml:msub><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:math></inline-formula> over the entire vocabulary, ensuring that the most likely subsequent token is selected based on the context provided by <inline-formula id="ieqn-22"><mml:math id="mml-ieqn-22"><mml:msubsup><mml:mi>h</mml:mi><mml:mi>t</mml:mi><mml:mi>l</mml:mi></mml:msubsup></mml:math></inline-formula>. The process is described by the following equation:
<disp-formula id="eqn-8"><label>(8)</label><mml:math id="mml-eqn-8" display="block"><mml:msub><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mi>softmax</mml:mi></mml:mrow><mml:mo>&#x2061;</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mrow><mml:mo>(</mml:mo><mml:msubsup><mml:mi>h</mml:mi><mml:mi>t</mml:mi><mml:mi>l</mml:mi></mml:msubsup><mml:mo>&#x22C5;</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mi>g</mml:mi></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula>where <inline-formula id="ieqn-23"><mml:math id="mml-ieqn-23"><mml:msub><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mi>m</mml:mi></mml:msup></mml:math></inline-formula>. <inline-formula id="ieqn-24"><mml:math id="mml-ieqn-24"><mml:msub><mml:mi>W</mml:mi><mml:mi>g</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>m</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> represents the weights to be trained.</p>
<fig id="fig-2">
<label>Figure 2</label>
<caption>
<title>A description of the internal architecture of VFDICM&#x2019;s language model for image captioning</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_54841-fig-2.tif"/>
</fig>
<p>The input word to the attention LSTM at each time step during the training phase of our models is taken from the ground truth annotation. This approach ensures that the model learns from accurate data. Conversely, the input word for the attention LSTM in the testing phase, as defined in <xref ref-type="disp-formula" rid="eqn-6">Eq. (6)</xref>, comes from the word predicted in the previous time step. This method allows the model to generate sequences based on its learned predictions. As an initial input in both training and testing, a special &#x201C;begin-of-sequence&#x201D; token is used. As the generation of the description&#x2019;s words continues, a special &#x201C;end-of-sequence&#x201D; token will be predicted or a maximum description length will be reached indicating the end of the generated description, ensuring a coherent and contextually relevant output.</p>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Visual Feature Detector (VFD)</title>
<p>The visual feature detector module (VFD) is an ordinary neural network that serves as a detector to dynamically select the most relevant visual features at each time step, in our case the number of top features is referred to as <inline-formula id="ieqn-25"><mml:math id="mml-ieqn-25"><mml:mi>k</mml:mi></mml:math></inline-formula>. The VFD module consists of a concatenation layer, a fully connected layer (FC), and a softmax layer. The input to VFD comprises the output of the conventional visual attention module <inline-formula id="ieqn-26"><mml:math id="mml-ieqn-26"><mml:msub><mml:mrow><mml:mover><mml:mi>v</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub></mml:math></inline-formula> and the hidden state of the attention LSTM <inline-formula id="ieqn-27"><mml:math id="mml-ieqn-27"><mml:msubsup><mml:mi>h</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi></mml:msubsup></mml:math></inline-formula>. The VFD module generates an output matrix known as the selected top-related features matrix (STF), denoted as <inline-formula id="ieqn-28"><mml:math id="mml-ieqn-28"><mml:msub><mml:mi>U</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:math></inline-formula>, which consists of the top-k selected local visual features of the input image. The internal structure of the VFD module is illustrated in <xref ref-type="fig" rid="fig-3">Fig. 3</xref>. 
Given the object features matrix <italic>V</italic>, the context vector <inline-formula id="ieqn-29"><mml:math id="mml-ieqn-29"><mml:msub><mml:mrow><mml:mover><mml:mi>v</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub></mml:math></inline-formula>, and the hidden state of the attention LSTM <inline-formula id="ieqn-30"><mml:math id="mml-ieqn-30"><mml:msubsup><mml:mi>h</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi></mml:msubsup><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mi>g</mml:mi></mml:msup></mml:math></inline-formula> as input to the VFD module, we first concatenate <inline-formula id="ieqn-31"><mml:math id="mml-ieqn-31"><mml:msub><mml:mrow><mml:mover><mml:mi>v</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub></mml:math></inline-formula> and <inline-formula id="ieqn-32"><mml:math id="mml-ieqn-32"><mml:msubsup><mml:mi>h</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi></mml:msubsup></mml:math></inline-formula> as follows:<disp-formula id="eqn-9"><label>(9)</label><mml:math id="mml-eqn-9" display="block"><mml:msub><mml:mi>x</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>[</mml:mo><mml:msubsup><mml:mi>h</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi></mml:msubsup><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>v</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub><mml:mo>]</mml:mo></mml:mrow></mml:math></disp-formula>where [ ] refers to the concatenation operation. 
Then, a fully connected layer is used to map the resulting vector <inline-formula id="ieqn-33"><mml:math id="mml-ieqn-33"><mml:msub><mml:mi>x</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mo>+</mml:mo><mml:mi>g</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> into another vector <inline-formula id="ieqn-34"><mml:math id="mml-ieqn-34"><mml:msub><mml:mrow><mml:mover><mml:mi>x</mml:mi><mml:mo stretchy="false">&#x00AF;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> whose length is equal to the number of local visual features <italic>N</italic> as follows:
<disp-formula id="eqn-10"><label>(10)</label><mml:math id="mml-eqn-10" display="block"><mml:msub><mml:mrow><mml:mover><mml:mi>x</mml:mi><mml:mo stretchy="false">&#x00AF;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mi>x</mml:mi></mml:msub><mml:mo>&#x22C5;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:math></disp-formula>where <inline-formula id="ieqn-35"><mml:math id="mml-ieqn-35"><mml:msub><mml:mi>W</mml:mi><mml:mi>x</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>d</mml:mi><mml:mo>+</mml:mo><mml:mi>g</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x00D7;</mml:mo><mml:mi>N</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> is a learnable parameter matrix. Next, we apply the softmax activation function on <inline-formula id="ieqn-36"><mml:math id="mml-ieqn-36"><mml:msub><mml:mrow><mml:mover><mml:mi>x</mml:mi><mml:mo stretchy="false">&#x00AF;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub></mml:math></inline-formula> to generate a new vector <inline-formula id="ieqn-37"><mml:math id="mml-ieqn-37"><mml:msub><mml:mrow><mml:mover><mml:mi>x</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> which represents a probability distribution over <italic>N</italic> as follows:
<disp-formula id="eqn-11"><label>(11)</label><mml:math id="mml-eqn-11" display="block"><mml:msub><mml:mrow><mml:mover><mml:mi>x</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>softmax</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>x</mml:mi><mml:mo stretchy="false">&#x00AF;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
<fig id="fig-3">
<label>Figure 3</label>
<caption>
<title>The internal structure of the VFD</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_54841-fig-3.tif"/>
</fig>
<p>The elements of <inline-formula id="ieqn-38"><mml:math id="mml-ieqn-38"><mml:msub><mml:mrow><mml:mover><mml:mi>x</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub></mml:math></inline-formula> are probability values and each element of <inline-formula id="ieqn-39"><mml:math id="mml-ieqn-39"><mml:msub><mml:mrow><mml:mover><mml:mi>x</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub></mml:math></inline-formula> represents the probability of its corresponding local visual feature. After that, the indexes of the <inline-formula id="ieqn-40"><mml:math id="mml-ieqn-40"><mml:mi>k</mml:mi></mml:math></inline-formula> elements of <inline-formula id="ieqn-41"><mml:math id="mml-ieqn-41"><mml:msub><mml:mrow><mml:mover><mml:mi>x</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub></mml:math></inline-formula> with the highest probabilities are determined and their corresponding local visual features in <inline-formula id="ieqn-42"><mml:math id="mml-ieqn-42"><mml:mi>V</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mi>N</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>d</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> are selected to form the matrix <inline-formula id="ieqn-43"><mml:math id="mml-ieqn-43"><mml:msub><mml:mi>U</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:math></inline-formula> which is given by:
<disp-formula id="eqn-12"><label>(12)</label><mml:math id="mml-eqn-12" display="block"><mml:msub><mml:mi>U</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mo fence="false" stretchy="false">{</mml:mo><mml:msubsup><mml:mi>u</mml:mi><mml:mi>t</mml:mi><mml:mn>1</mml:mn></mml:msubsup><mml:mo>,</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:msubsup><mml:mi>u</mml:mi><mml:mi>t</mml:mi><mml:mn>2</mml:mn></mml:msubsup><mml:mo>,</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msubsup><mml:mi>u</mml:mi><mml:mi>t</mml:mi><mml:mi>k</mml:mi></mml:msubsup><mml:mo fence="false" stretchy="false">}</mml:mo></mml:math></disp-formula>where <inline-formula id="ieqn-44"><mml:math id="mml-ieqn-44"><mml:msubsup><mml:mi>u</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi></mml:msubsup><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mi>d</mml:mi></mml:msup></mml:math></inline-formula> and <inline-formula id="ieqn-45"><mml:math id="mml-ieqn-45"><mml:msub><mml:mi>U</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>d</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>. <inline-formula id="ieqn-46"><mml:math id="mml-ieqn-46"><mml:msub><mml:mi>U</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:math></inline-formula> represents a subset of the local visual features the most related to the current linguistic context at the current time step <inline-formula id="ieqn-47"><mml:math id="mml-ieqn-47"><mml:mi>t</mml:mi></mml:math></inline-formula>. <xref ref-type="fig" rid="fig-4">Fig. 4</xref> provides an illustration of the selection of the local visual features according to their corresponding probabilities in the VFD module.</p>
<fig id="fig-4">
<label>Figure 4</label>
<caption>
<title>An illustration of the selection of the local visual features according to their corresponding probabilities in the VFD module</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_54841-fig-4.tif"/>
</fig>
</sec>
<sec id="s3_5">
<label>3.5</label>
<title>Visual Feature Visual Attention (VFVA)</title>
<p>After generating the STF matrix from the VFD module, we have developed an additional module that takes the STF as its input. This module is originally a visual attention module and is referred to as the Visual Feature Visual Attention (VFVA). The main purpose of this module is to attend to the visual features of the STF matrix, yielding an updated context vector known as the Updated Context Vector (UCV). The UCV is then fed into the language LSTM of the language model to guide the generation of the subsequent word. Considering the updated context vector <inline-formula id="ieqn-48"><mml:math id="mml-ieqn-48"><mml:msub><mml:mrow><mml:mover><mml:mi>v</mml:mi><mml:mo stretchy="false">&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub></mml:math></inline-formula> as defined in the following equations, the VFVA module&#x2019;s formulas demonstrate its role in attending to the importance of each of the various visual feature vectors.
<disp-formula id="eqn-13"><label>(13)</label><mml:math id="mml-eqn-13" display="block"><mml:msubsup><mml:mi>&#x03B4;</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi></mml:msubsup><mml:mo>=</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mi>d</mml:mi></mml:msub><mml:mo>&#x22C5;</mml:mo><mml:mi>tanh</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mi>e</mml:mi></mml:msub><mml:mo>&#x22C5;</mml:mo><mml:msubsup><mml:mi>h</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi></mml:msubsup><mml:mo>+</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mi>f</mml:mi></mml:msub><mml:mo>&#x22C5;</mml:mo><mml:msubsup><mml:mi>u</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi></mml:msubsup><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula>
<disp-formula id="eqn-14"><label>(14)</label><mml:math id="mml-eqn-14" display="block"><mml:msub><mml:mi>&#x03B3;</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>softmax</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>&#x03B4;</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula>
<disp-formula id="eqn-15"><label>(15)</label><mml:math id="mml-eqn-15" display="block"><mml:msub><mml:mrow><mml:mover><mml:mi>v</mml:mi><mml:mo stretchy="false">&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>k</mml:mi></mml:munderover><mml:msubsup><mml:mi>&#x03B3;</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi></mml:msubsup><mml:mo>&#x2299;</mml:mo><mml:msubsup><mml:mi>u</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi></mml:msubsup></mml:math></disp-formula>where <inline-formula id="ieqn-49"><mml:math id="mml-ieqn-49"><mml:msub><mml:mrow><mml:mover><mml:mi>v</mml:mi><mml:mo stretchy="false">&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mi>d</mml:mi></mml:msup></mml:math></inline-formula>, <inline-formula id="ieqn-50"><mml:math id="mml-ieqn-50"><mml:msub><mml:mi>&#x03B4;</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mi>k</mml:mi></mml:msup></mml:math></inline-formula>, and <inline-formula id="ieqn-51"><mml:math id="mml-ieqn-51"><mml:msub><mml:mi>&#x03B3;</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mi>k</mml:mi></mml:msup></mml:math></inline-formula>. 
<inline-formula id="ieqn-52"><mml:math id="mml-ieqn-52"><mml:msub><mml:mi>W</mml:mi><mml:mi>e</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>e</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, <inline-formula id="ieqn-53"><mml:math id="mml-ieqn-53"><mml:msub><mml:mi>W</mml:mi><mml:mi>f</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>e</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, <inline-formula id="ieqn-54"><mml:math id="mml-ieqn-54"><mml:msub><mml:mi>W</mml:mi><mml:mi>d</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mi>e</mml:mi></mml:msup></mml:math></inline-formula> are learnable weights.</p>
</sec>
<sec id="s3_6">
<label>3.6</label>
<title>Loss Functions</title>
<p>Our image annotating network is trained using two stages: cross-entropy (XE) and CIDEr optimization. In the first stage, standard cross-entropy loss is applied, which is determined as follows:
<disp-formula id="eqn-16"><label>(16)</label><mml:math id="mml-eqn-16" display="block"><mml:msub><mml:mi>Loss</mml:mi><mml:mrow><mml:mi>X</mml:mi><mml:mi>E</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>T</mml:mi></mml:mfrac><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>T</mml:mi></mml:munderover><mml:mo>&#x2212;</mml:mo><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x2223;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>:</mml:mo><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>V</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>In the second stage, Self-Critical Sequence Training (SCST) is utilized alongside CIDEr-D for optimizing and training the model. The loss function at this stage is specified as follows:
<disp-formula id="eqn-17"><label>(17)</label><mml:math id="mml-eqn-17" display="block"><mml:msub><mml:mi>Loss</mml:mi><mml:mrow><mml:mi>R</mml:mi><mml:mi>L</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>:</mml:mo><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo>&#x223C;</mml:mo><mml:mi>&#x03B8;</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mi>r</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>:</mml:mo><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>In this context, <inline-formula id="ieqn-55"><mml:math id="mml-ieqn-55"><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>:</mml:mo><mml:mi>T</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> refers to the sampled annotation, while <inline-formula id="ieqn-56"><mml:math id="mml-ieqn-56"><mml:mi>r</mml:mi></mml:math></inline-formula> indicates the CIDEr-D score of the sampled annotation. Gradient approximation for <inline-formula id="ieqn-57"><mml:math id="mml-ieqn-57"><mml:msub><mml:mi>Loss</mml:mi><mml:mrow><mml:mi>R</mml:mi><mml:mi>L</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, symbolized as <inline-formula id="ieqn-58"><mml:math id="mml-ieqn-58"><mml:msub><mml:mi mathvariant="normal">&#x2207;</mml:mi><mml:mi>&#x03B8;</mml:mi></mml:msub><mml:msub><mml:mi>Loss</mml:mi><mml:mrow><mml:mi>R</mml:mi><mml:mi>L</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, is detailed in <xref ref-type="disp-formula" rid="eqn-18">Eq. (18)</xref>. Here, <inline-formula id="ieqn-59"><mml:math id="mml-ieqn-59"><mml:mi>r</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>w</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>:</mml:mo><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> refers to the CIDEr reward for the maximally sampled annotation, and <inline-formula id="ieqn-60"><mml:math id="mml-ieqn-60"><mml:mi>r</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msubsup><mml:mi>w</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>:</mml:mo><mml:mi>T</mml:mi></mml:mrow><mml:mi>s</mml:mi></mml:msubsup><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> denotes the CIDEr reward corresponding to the randomly sampled annotation. This approach enables the model to refine the generated annotations by aligning them more closely with human references. 
Utilizing the CIDEr-D score helps the model to effectively grasp the intricacies and relevance within the sampled annotations. Consequently, this method enhances the overall quality and accuracy of the generated captions. Additionally, it provides a robust framework for training models in tasks that require a nuanced understanding of content similarity.
<disp-formula id="eqn-18"><label>(18)</label><mml:math id="mml-eqn-18" display="block"><mml:msub><mml:mi mathvariant="normal">&#x2207;</mml:mi><mml:mi>&#x03B8;</mml:mi></mml:msub><mml:msub><mml:mi>Loss</mml:mi><mml:mrow><mml:mi>R</mml:mi><mml:mi>L</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mrow><mml:mo>(</mml:mo><mml:mi>r</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mrow><mml:mo>(</mml:mo><mml:msubsup><mml:mi>w</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>:</mml:mo><mml:mi>T</mml:mi></mml:mrow><mml:mi>s</mml:mi></mml:msubsup><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mi>r</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>w</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>:</mml:mo><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mtext>&#x00A0;</mml:mtext><mml:msub><mml:mi mathvariant="normal">&#x2207;</mml:mi><mml:mi>&#x03B8;</mml:mi></mml:msub><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mrow><mml:mo>(</mml:mo><mml:mi>p</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mrow><mml:mo>(</mml:mo><mml:msubsup><mml:mi>w</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>:</mml:mo><mml:mi>T</mml:mi></mml:mrow><mml:mi>s</mml:mi></mml:msubsup><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>Algorithm 1 outlines the training process for the VFDICM model. It begins by extracting feature representations from the input images using <xref ref-type="disp-formula" rid="eqn-1">Eq. (1)</xref>. Then, for each word position in the caption, the algorithm computes the output of the conventional visual attention module, the selected top-related feature matrix from the VFD module, and the output of the VFVA module using <xref ref-type="disp-formula" rid="eqn-5">Eqs. (5)</xref>, <xref ref-type="disp-formula" rid="eqn-12">(12)</xref>, and <xref ref-type="disp-formula" rid="eqn-15">(15)</xref>, respectively. These computations aid in generating the probability distribution for the next word using <xref ref-type="disp-formula" rid="eqn-8">Eq. (8)</xref>. The algorithm continues updating the cross-entropy loss and reinforcement learning loss based on <xref ref-type="disp-formula" rid="eqn-16">Eqs. (16)</xref> and <xref ref-type="disp-formula" rid="eqn-17">(17)</xref> until convergence is achieved. Finally, it returns the generated word probabilities and the model parameters.</p>
<fig id="fig-6">
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_54841-fig-6.tif"/>
</fig>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experiments and Results</title>
<p>This section discusses various crucial aspects related to our conducted experiments, including the evaluation metrics, the dataset used, the model&#x2019;s configurations, and the training process for our image annotation networks. Additionally, we present and analyze the experimental results and comparisons of our networks. Furthermore, we offer an assessment of the generated text&#x2019;s quality in detail.</p>
<sec id="s4_1">
<label>4.1</label>
<title>Datasets and Evaluation Metrics</title>
<p>Extensive evaluations were conducted using the MS-COCO dataset [<xref ref-type="bibr" rid="ref-5">5</xref>] in image captioning. Owing to its diversity and numerous caption-image pairs, this popular dataset served as a solid basis for evaluating the effectiveness of the models. The MS-COCO dataset, frequently employed in tasks involving image annotation, comprises 123,287 images. Following Karpathy&#x2019;s well-established data splitting method [<xref ref-type="bibr" rid="ref-46">46</xref>], 113,287 images were allocated for training, 5000 images for validation, and another 5000 images for testing. Notably, most of the MS-COCO images are associated with a total of five ground truth captions, resulting in a substantial collection of about 616,747 different ground truth captions for all the MS-COCO images. These captions range in length from 5 to 49 words, offering a wide spectrum of scenarios and contexts for model training and evaluation.</p>
<p>To assess our model&#x2019;s performance, the model was tested using a series of evaluation metrics, including CIDEr [<xref ref-type="bibr" rid="ref-47">47</xref>], METEOR [<xref ref-type="bibr" rid="ref-48">48</xref>], BLEU [<xref ref-type="bibr" rid="ref-49">49</xref>], ROUGE-L [<xref ref-type="bibr" rid="ref-50">50</xref>], and SPICE [<xref ref-type="bibr" rid="ref-51">51</xref>]. As a result, each of these metrics offers unique insight into a variety of different characteristics of the performance of the model. BLEU, a precision-based metric originally intended for machine translation, has been shown to be highly correlated with human evaluation, emphasizing n-gram precision. METEOR evaluates machine-generated translations by employing a generalized concept of unigram matching with human reference translations, which allows for a balanced assessment of precision and recall. As part of its similarity calculation, CIDEr uses cosine similarity between words in candidate and reference captions based on Term Frequency-Inverse Document Frequency weighted n-grams, thereby taking both recall and precision into account, which is particularly useful for image captioning tasks.</p>
<p>ROUGE, on the other hand, measures the quality of a summary by comparing it with a human-generated summary, which helps in understanding how well the model can generate concise and relevant descriptions. SPICE assesses how effectively captions represent attributes, objects, as well as their relationships, offering a more semantic assessment of captions generated. To simplify the representation of these metrics, we denote METEOR, CIDEr, ROUGE-L, SPICE, and BLEU-n (n &#x003D; 1, 2, 3, 4), as M, C, R, S, and B-n (n &#x003D; 1, 2, 3, 4), respectively.</p>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Experimental Settings</title>
<p>To extract features of objects from images, Faster-RCNN is used based on ResNet-101, resulting in object features of dimensions 36 <inline-formula id="ieqn-67"><mml:math id="mml-ieqn-67"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> 2048. To manage longer sentences, those exceeding 16 tokens are truncated. In constructing our vocabulary, only words occurring more than five times are included, yielding a vocabulary of 9487 words for MS-COCO. Word embeddings are 1000-dimensional, and the hidden states of both LSTMs are set to 1000. During our experiments, we select <inline-formula id="ieqn-68"><mml:math id="mml-ieqn-68"><mml:mi>k</mml:mi></mml:math></inline-formula> to be equal to 10 since it gives us the best scores.</p>
<p>In our image captioning model, we utilize a visual features vector of size <inline-formula id="ieqn-69"><mml:math id="mml-ieqn-69"><mml:mi>d</mml:mi><mml:mo>=</mml:mo><mml:mn>2048</mml:mn></mml:math></inline-formula> to represent the input image. LSTM hidden state sizes <inline-formula id="ieqn-70"><mml:math id="mml-ieqn-70"><mml:mi>g</mml:mi><mml:mo>=</mml:mo><mml:mn>1000</mml:mn></mml:math></inline-formula> allow for the capture of complex linguistic patterns during caption generation. Words in our vocabulary are embedded in vectors of length <inline-formula id="ieqn-71"><mml:math id="mml-ieqn-71"><mml:mi>q</mml:mi><mml:mo>=</mml:mo><mml:mn>1000</mml:mn></mml:math></inline-formula>. The features of <inline-formula id="ieqn-72"><mml:math id="mml-ieqn-72"><mml:mi>N</mml:mi><mml:mo>=</mml:mo><mml:mn>36</mml:mn></mml:math></inline-formula> objects in the image are used for data extraction. The vocabulary for our captioning task consists of <inline-formula id="ieqn-73"><mml:math id="mml-ieqn-73"><mml:mi>m</mml:mi><mml:mo>=</mml:mo><mml:mn>9487</mml:mn></mml:math></inline-formula> unique words for MS-COCO. Additionally, we incorporate an internal hidden attention mechanism of size <inline-formula id="ieqn-74"><mml:math id="mml-ieqn-74"><mml:mi>e</mml:mi><mml:mo>=</mml:mo><mml:mn>512</mml:mn></mml:math></inline-formula> to improve the model&#x2019;s ability to generate captions for relevant sections of an image. The number of selected most relevant visual features is set to <inline-formula id="ieqn-75"><mml:math id="mml-ieqn-75"><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>10</mml:mn></mml:math></inline-formula>.</p>
<p>For training our annotation networks, we employ the Adam optimizer and conduct training for 50 epochs in the cross-entropy stage, followed by 100 epochs in the CIDEr optimization stage. Initially, the learning rate is set at 0.0005 and subsequently decreases by a factor of 0.8 every 5 epochs during cross-entropy training and every 10 epochs during CIDEr optimization. Regarding the batch size, a batch size of 40 is chosen. The scheduled sampling percentage increases by 5% every 5 epochs until it reaches 25% during cross-entropy training. The gradients are clipped to an absolute maximum of 0.1. We employ a dropout ratio of 0.5 in our model. Testing is conducted with a beam size of 3 with the beam search strategy. Our networks are developed using the PyTorch framework.</p>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Building the Vocabulary and Preparing Captions</title>
<p>To construct the vocabulary for our proposed model, we underwent several processing steps with the ground truth caption corpus. Initially, we analyzed all the ground truth sentences, totaling 6,454,115 words, and identified 27,929 unique words. Following word counting, we retained words that appeared more than five times, resulting in a vocabulary of 9486 unique words, while discarding 18,443 unique words. These discarded words constituted approximately 66% of the total unique words in the caption corpus. However, in terms of overall word count, they represented merely 0.5%. While the percentage of discarded unique words may seem high, the actual impact on the model&#x2019;s performance is negligible due to their small contribution to the total word count. All discarded words were replaced with the &#x2018;Unknown&#x2019; special token, which was subsequently added to the vocabulary, expanding it to 9487 words. Additionally, we tokenized all ground truth sentences, replacing each word with its corresponding unique index or integer value assigned from the vocabulary. Sentences longer than 16 words were truncated, and those shorter than 16 words were padded using a &#x2018;0&#x2019; digit, serving as zero padding. Furthermore, we prepended the beginning-of-sequence token to the start of each ground truth caption and appended the end-of-sequence token to the end. These preprocessing steps prepared the ground truth captions for training our models.</p>
</sec>
<sec id="s4_4">
<label>4.4</label>
<title>Quantitative Scores</title>
<p><xref ref-type="table" rid="table-1">Table 1</xref> reports the performance of our model under cross-entropy and CIDEr optimization on the MS-COCO dataset. The cross-entropy results indicate that our proposed image captioning model surpasses the baseline in most evaluation metrics, notably excelling in METEOR, BLEU-4, SPICE, CIDEr, and ROUGE. Furthermore, experimental results underscore the superiority of our model over the baseline in terms of CIDEr optimization scores across various evaluation metrics, with significant differences observed in BLEU-4, BLEU-1, ROUGE, METEOR, and CIDEr metrics. These scores unequivocally demonstrate the superior performance of our proposed framework, highlighting the effectiveness and value of our approach.</p>
<table-wrap id="table-1">
<label>Table 1</label>
<caption>
<title>The scores of our model on the MS-COCO dataset for cross-entropy (XE) and CIDEr optimization (RL)</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th>Model</th>
<th>B1</th>
<th>B4</th>
<th>M</th>
<th>R</th>
<th>C</th>
<th>S</th>
</tr>
</thead>
<tbody>
<tr>
<td>Baseline (XE) [<xref ref-type="bibr" rid="ref-4">4</xref>]</td>
<td>76.6</td>
<td>36.2</td>
<td>27.0</td>
<td>56.4</td>
<td>113.5</td>
<td>20.3</td>
</tr>
<tr>
<td>VFDICM (XE)</td>
<td>76.4</td>
<td>36.3</td>
<td>27.7</td>
<td>56.6</td>
<td>113.9</td>
<td>20.6</td>
</tr>
<tr>
<td>Baseline (CIDEr) [<xref ref-type="bibr" rid="ref-4">4</xref>]</td>
<td>79.8</td>
<td>36.3</td>
<td>27.7</td>
<td>56.9</td>
<td>120.1</td>
<td>21.4</td>
</tr>
<tr>
<td>VFDICM (CIDEr)</td>
<td>80.8</td>
<td>37.2</td>
<td>28.3</td>
<td>57.9</td>
<td>122.4</td>
<td>21.5</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>VFDICM demonstrated significant enhancement during the CIDEr phase, where optimization considers caption quality and diversity. Significant advancements were observed in the BLEU-4, BLEU-1, METEOR, CIDEr, and ROUGE metrics. The integration of VFD and VFVA within our network significantly improves the image captioning algorithm&#x2019;s ability to capture rich visual representations and features, resulting in improved overall performance and description generation. The VFD module updates the visual features matrix, while the VFVA module directs attention to this matrix, generating an updated context vector for the language model to produce informative descriptions.</p>
</sec>
<sec id="s4_5">
<label>4.5</label>
<title>Comparison Results</title>
<p>The comparison of other models with our model using the MS-COCO dataset is detailed in <xref ref-type="table" rid="table-2">Tables 2</xref> and <xref ref-type="table" rid="table-3">3</xref>, covering the cross-entropy and CIDEr optimization stages. <xref ref-type="table" rid="table-2">Table 2</xref> focuses on the cross-entropy results, where our model demonstrates remarkable performance. Specifically, our model outperforms most other models in several key metrics, including ROUGE-L, BLEU-4, BLEU-3, and SPICE. Additionally, it achieves the second-highest scores in CIDEr, BLEU-1, and BLEU-2. It is noteworthy that the r-GRU model secured the highest METEOR score in this phase. Significant differences in scores between our model and others across the majority of metrics underline its greater capabilities during the cross-entropy training phase. Moving on to <xref ref-type="table" rid="table-3">Table 3</xref>, which displays the results from the CIDEr optimization stage, our model continues to excel. It achieves top scores in several metrics such as ROUGE-L, BLEU-3, BLEU-4, BLEU-2, BLEU-1, and METEOR. However, it ranks second in the CIDEr and SPICE metrics. The significant variance between our model&#x2019;s performance and that of other models in metrics like METEOR, BLEU-3, BLEU-2, and BLEU-1 further highlights the effectiveness of our model at this optimization stage.</p>
<table-wrap id="table-2">
<label>Table 2</label>
<caption>
<title>The performance of other models and VFDICM trained on MS-COCO using cross-entropy optimization (XE) is compared, with scores for the top two positions highlighted in bold and underline, respectively</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th>Model</th>
<th>B1</th>
<th>B2</th>
<th>B3</th>
<th>B4</th>
<th>M</th>
<th>R</th>
<th>C</th>
<th>S</th>
</tr>
</thead>
<tbody>
<tr>
<td>RFNet [<xref ref-type="bibr" rid="ref-52">52</xref>]</td>
<td><underline>76.4</underline></td>
<td><underline>60.4</underline></td>
<td><underline>46.6</underline></td>
<td>35.8</td>
<td>27.4</td>
<td><underline>56.5</underline></td>
<td>112.5</td>
<td><underline>20.5</underline></td>
</tr>
<tr>
<td>UpDown [<xref ref-type="bibr" rid="ref-4">4</xref>]</td>
<td><bold>77.2</bold></td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td><underline>36.2</underline></td>
<td>27.0</td>
<td>56.4</td>
<td>113.5</td>
<td>20.3</td>
</tr>
<tr>
<td>RecallNet [<xref ref-type="bibr" rid="ref-53">53</xref>]</td>
<td>73.4</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>32.2</td>
<td>25.9</td>
<td>53.9</td>
<td>101.6</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>VIS_SAS [<xref ref-type="bibr" rid="ref-31">31</xref>]</td>
<td>72.5</td>
<td>52.6</td>
<td>38.2</td>
<td>28.1</td>
<td>23.7</td>
<td>55.4</td>
<td>82.1</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>SCST [<xref ref-type="bibr" rid="ref-43">43</xref>]</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>30.0</td>
<td>25.9</td>
<td>53.4</td>
<td>99.4</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>HAF [<xref ref-type="bibr" rid="ref-45">45</xref>]</td>
<td>75.9</td>
<td>59.5</td>
<td>45.4</td>
<td>34.4</td>
<td>26.8</td>
<td>&#x2013;</td>
<td>109.0</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>MRRC [<xref ref-type="bibr" rid="ref-54">54</xref>]</td>
<td>75.5</td>
<td>59.8</td>
<td>46.0</td>
<td>35.2</td>
<td>26.5</td>
<td>55.9</td>
<td>108.0</td>
<td>19.7</td>
</tr>
<tr>
<td>Vis-to-lang [<xref ref-type="bibr" rid="ref-32">32</xref>]</td>
<td>73.9</td>
<td>56.4</td>
<td>41.7</td>
<td>30.9</td>
<td>27.1</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>TAAIC [<xref ref-type="bibr" rid="ref-11">11</xref>]</td>
<td>71.0</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>27.7</td>
<td>23.8</td>
<td>51.1</td>
<td>93.2</td>
<td>18.3</td>
</tr>
<tr>
<td>r-GRU [<xref ref-type="bibr" rid="ref-55">55</xref>]</td>
<td><bold>77.2</bold></td>
<td><bold>61.3</bold></td>
<td>46.3</td>
<td>35.6</td>
<td><bold>30.2</bold></td>
<td>55.7</td>
<td>109.2</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>NumCap [<xref ref-type="bibr" rid="ref-39">39</xref>]</td>
<td>66.9</td>
<td>49.4</td>
<td>36.5</td>
<td>27.3</td>
<td>24.1</td>
<td>50.7</td>
<td>85.3</td>
<td>17.0</td>
</tr>
<tr>
<td>CSA [<xref ref-type="bibr" rid="ref-56">56</xref>]</td>
<td><bold>77.2</bold></td>
<td>59.8</td>
<td>46.0</td>
<td>36.2</td>
<td>27.9</td>
<td>56.4</td>
<td><bold>114.6</bold></td>
<td>&#x2013;</td>
</tr>
<tr>
<td>VFDICM (ours)</td>
<td><underline>76.4</underline></td>
<td><underline>60.4</underline></td>
<td><bold>46.9</bold></td>
<td><bold>36.3</bold></td>
<td>27.7</td>
<td><bold>56.6</bold></td>
<td><underline>113.9</underline></td>
<td><bold>20.6</bold></td>
</tr>
</tbody>
</table>
</table-wrap><table-wrap id="table-3">
<label>Table 3</label>
<caption>
<title>Comparison of the performance of other models and VFDICM trained on the MS-COCO dataset using CIDEr optimization (RL). Scores for the top two places are bolded and underlined, respectively</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th>Model</th>
<th>B1</th>
<th>B2</th>
<th>B3</th>
<th>B4</th>
<th>M</th>
<th>R</th>
<th>C</th>
<th>S</th>
</tr>
</thead>
<tbody>
<tr>
<td>RFNet [<xref ref-type="bibr" rid="ref-52">52</xref>]</td>
<td>79.1</td>
<td>63.1</td>
<td>48.4</td>
<td>36.5</td>
<td>27.7</td>
<td>57.3</td>
<td>121.9</td>
<td>21.2</td>
</tr>
<tr>
<td>UpDown [<xref ref-type="bibr" rid="ref-4">4</xref>]</td>
<td>79.8</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>36.3</td>
<td>27.7</td>
<td>56.9</td>
<td>120.1</td>
<td>21.4</td>
</tr>
<tr>
<td>RecallNet [<xref ref-type="bibr" rid="ref-53">53</xref>]</td>
<td>75.8</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>33.1</td>
<td>24.7</td>
<td>54.9</td>
<td>103.7</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>HAF [<xref ref-type="bibr" rid="ref-45">45</xref>]</td>
<td><underline>80.5</underline></td>
<td>62.9</td>
<td>47.7</td>
<td>35.5</td>
<td>27.3</td>
<td>&#x2013;</td>
<td>116.4</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>Stack-VS [<xref ref-type="bibr" rid="ref-33">33</xref>]</td>
<td>79.4</td>
<td><underline>63.6</underline></td>
<td><underline>49.0</underline></td>
<td><bold>37.2</bold></td>
<td><underline>27.9</underline></td>
<td><underline>57.7</underline></td>
<td><bold>122.6</bold></td>
<td><bold>21.6</bold></td>
</tr>
<tr>
<td>SCST [<xref ref-type="bibr" rid="ref-43">43</xref>]</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>34.2</td>
<td>26.7</td>
<td>55.7</td>
<td>114.0</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>TDA&#x002B;GLD [<xref ref-type="bibr" rid="ref-40">40</xref>]</td>
<td>78.8</td>
<td>62.6</td>
<td>48.0</td>
<td>36.1</td>
<td>27.8</td>
<td>57.1</td>
<td>121.1</td>
<td><bold>21.6</bold></td>
</tr>
<tr>
<td>TAAIC [<xref ref-type="bibr" rid="ref-11">11</xref>]</td>
<td>78.6</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td><underline>37.1</underline></td>
<td>27.5</td>
<td>57.2</td>
<td>119.6</td>
<td>21.2</td>
</tr>
<tr>
<td>MRRC [<xref ref-type="bibr" rid="ref-54">54</xref>]</td>
<td>75.3</td>
<td>59.7</td>
<td>46.0</td>
<td>35.3</td>
<td>26.6</td>
<td>55.7</td>
<td>108.2</td>
<td>19.7</td>
</tr>
<tr>
<td>VFDICM (ours)</td>
<td><bold>80.8</bold></td>
<td><bold>64.2</bold></td>
<td><bold>49.3</bold></td>
<td><bold>37.2</bold></td>
<td><bold>28.3</bold></td>
<td><bold>57.9</bold></td>
<td><underline>122.4</underline></td>
<td><underline>21.5</underline></td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In more detail, the cross-entropy phase results in <xref ref-type="table" rid="table-2">Table 2</xref> reveal that our model is not only competitive but often leading in many critical areas. For example, its performance in BLEU-3 and BLEU-4 indicates strong capabilities in generating accurate and contextually appropriate sequences. The high scores in ROUGE-L and SPICE metrics suggest that our model excels in generating linguistically rich and semantically relevant annotations. Despite the r-GRU model&#x2019;s achievement of the highest METEOR score, our model&#x2019;s close performance in this metric indicates its robustness and reliability. Similarly, the CIDEr optimization results shown in <xref ref-type="table" rid="table-3">Table 3</xref> confirm our model&#x2019;s strong performance across multiple evaluation criteria. Its leading scores in BLEU-1 through BLEU-4 metrics indicate consistent and high-quality output. The model&#x2019;s top performance in METEOR and ROUGE-L underscores its ability to produce both precise and contextually appropriate annotations. While the Stack-VS model attains the highest CIDEr score, our model&#x2019;s second-place ranking in this metric and others like SPICE demonstrates its overall superior performance across the board.</p>

<p>These findings strongly underscore the significant improvements our proposed method brings to model performance. The incorporation of the Visual Feature Decoder (VFD) and Visual Feature Visual Attention (VFVA) modules plays a crucial role in this enhancement. By integrating these modules, our model can more effectively harness visual features from input images, which leads to the generation of more detailed and informative descriptions. The VFD module is designed to dynamically identify and extract relevant features from the visual input. This process results in the creation of a new visual matrix that encapsulates the most important aspects of the image. In turn, the newly formed visual matrix is used by the VFVA module, focusing its attention on these critical features. In this way, VFVA updates the contextual data that the language model uses to predict the next word.</p>
<p>This sophisticated interplay between the VFD and VFVA modules significantly boosts model performance on a range of standard evaluation metrics. The VFD module&#x2019;s ability to distill relevant visual information ensures that the model captures essential details from the input images. Meanwhile, the VFVA module&#x2019;s attention mechanism allows the model to maintain a contextual understanding of these visual features, thereby enhancing the accuracy and relevance of the generated descriptions. A further advantage of this method is that it is designed to ensure that each word prediction is informed by a comprehensive understanding of the visual context, which in turn contributes to the model&#x2019;s robust performance. This approach not only improves the model&#x2019;s descriptive capabilities but also ensures consistency and coherence in the generated text. As a result, the model excels across multiple evaluation metrics, demonstrating superior performance compared to methods that do not leverage such advanced visual feature integration.</p>
</sec>
<sec id="s4_6">
<label>4.6</label>
<title>Ablation Studies</title>
<p>We conducted several experiments using MS-COCO dataset, refer to <xref ref-type="table" rid="table-4">Table 4</xref>. In experiment VFDICM_sig2, we employed a sigmoid activation function within the VFD component, instead of the softmax activation function of the VFD. In the language LSTM, we concatenated only two inputs: <inline-formula id="ieqn-76"><mml:math id="mml-ieqn-76"><mml:msubsup><mml:mi>h</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi></mml:msubsup></mml:math></inline-formula> and <inline-formula id="ieqn-77"><mml:math id="mml-ieqn-77"><mml:msub><mml:mrow><mml:mover><mml:mi>v</mml:mi><mml:mo stretchy="false">&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub></mml:math></inline-formula>, removing <inline-formula id="ieqn-78"><mml:math id="mml-ieqn-78"><mml:mrow><mml:mover><mml:mi>v</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow></mml:math></inline-formula> from the inputs of the language LSTM. In contrast, in experiment VFDICM_soft2, we used the same two inputs in the language LSTM as the VFDICM_sig2 but applied a softmax activation function in the VFD component.</p>
<table-wrap id="table-4">
<label>Table 4</label>
<caption>
<title>Ablation study of the model with different activation functions and various numbers of inputs to the language LSTM using MS-COCO dataset</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th>Model</th>
<th>B1</th>
<th>B2</th>
<th>B3</th>
<th>B4</th>
<th>M</th>
<th>R</th>
<th>C</th>
<th>S</th>
</tr>
</thead>
<tbody>
<tr>
<td>VFDICM_sig2</td>
<td>79.7</td>
<td>62.5</td>
<td>47.4</td>
<td>35.1</td>
<td>27.5</td>
<td>56.9</td>
<td>117.1</td>
<td>20.7</td>
</tr>
<tr>
<td>VFDICM_soft2</td>
<td>79.4</td>
<td>62.1</td>
<td>47.1</td>
<td>34.9</td>
<td>27.4</td>
<td>56.8</td>
<td>116.3</td>
<td>20.7</td>
</tr>
<tr>
<td>VFDICM_sig3</td>
<td>80.6</td>
<td>63.8</td>
<td>49.0</td>
<td>36.7</td>
<td>28.3</td>
<td>57.9</td>
<td>122.5</td>
<td>21.5</td>
</tr>
<tr>
<td>VFDICM (VFDICM_soft3)</td>
<td>80.8</td>
<td>64.2</td>
<td>49.3</td>
<td>37.2</td>
<td>28.3</td>
<td>57.9</td>
<td>122.4</td>
<td>21.5</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In experiment VFDICM_sig3, we again used a sigmoid activation function within the VFD component. However, in the language LSTM, we concatenated three inputs: the output of the first attention mechanism (<inline-formula id="ieqn-79"><mml:math id="mml-ieqn-79"><mml:msub><mml:mrow><mml:mover><mml:mi>v</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub></mml:math></inline-formula>), along with <inline-formula id="ieqn-80"><mml:math id="mml-ieqn-80"><mml:msubsup><mml:mi>h</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi></mml:msubsup></mml:math></inline-formula> and <inline-formula id="ieqn-81"><mml:math id="mml-ieqn-81"><mml:msub><mml:mrow><mml:mover><mml:mi>v</mml:mi><mml:mo stretchy="false">&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub></mml:math></inline-formula>. The VFDICM_soft3 experiment is our proposed model which uses softmax activation function in the VFD with three inputs into the language LSTM.</p>
<p>To demonstrate and analyze the significance of the second attention mechanism and the impact of the absence of <inline-formula id="ieqn-82"><mml:math id="mml-ieqn-82"><mml:msub><mml:mrow><mml:mover><mml:mi>v</mml:mi><mml:mo stretchy="false">&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub></mml:math></inline-formula>, we conducted several experiments using MS-COCO dataset, refer to <xref ref-type="table" rid="table-5">Table 5</xref>, where the attention mechanism was replaced with the mean of features (<inline-formula id="ieqn-83"><mml:math id="mml-ieqn-83"><mml:msub><mml:mrow><mml:mover><mml:mi>v</mml:mi><mml:mo stretchy="false">&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>k</mml:mi></mml:mfrac><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>k</mml:mi></mml:munderover><mml:msubsup><mml:mi>u</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi></mml:msubsup></mml:math></inline-formula>).</p>
<table-wrap id="table-5">
<label>Table 5</label>
<caption>
<title>Ablation study for replacing VFD component with the mean of features with different activation functions and various numbers of inputs to the language LSTM using MS-COCO dataset</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th>Model</th>
<th>B1</th>
<th>B2</th>
<th>B3</th>
<th>B4</th>
<th>M</th>
<th>R</th>
<th>C</th>
<th>S</th>
</tr>
</thead>
<tbody>
<tr>
<td>VFDICM_soft2_m</td>
<td>78.8</td>
<td>61.2</td>
<td>46.2</td>
<td>34.2</td>
<td>27.0</td>
<td>56.1</td>
<td>112.6</td>
<td>20.1</td>
</tr>
<tr>
<td>VFDICM_sig2_m</td>
<td>79.0</td>
<td>61.7</td>
<td>46.7</td>
<td>34.6</td>
<td>27.3</td>
<td>56.5</td>
<td>114.0</td>
<td>20.4</td>
</tr>
<tr>
<td>VFDICM_soft3_m</td>
<td>80.9</td>
<td>63.9</td>
<td>48.8</td>
<td>36.6</td>
<td>28.1</td>
<td>57.7</td>
<td>121.1</td>
<td>21.5</td>
</tr>
<tr>
<td>VFDICM_sig3_m</td>
<td>80.8</td>
<td>63.9</td>
<td>49.0</td>
<td>36.8</td>
<td>28.2</td>
<td>57.9</td>
<td>122.0</td>
<td>21.3</td>
</tr>
<tr>
<td>VFDICM</td>
<td>80.8</td>
<td>64.2</td>
<td>49.3</td>
<td>37.2</td>
<td>28.3</td>
<td>57.9</td>
<td>122.4</td>
<td>21.5</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In experiment VFDICM_sig2_m, we used a sigmoid activation function in the VFD component and two inputs in the language LSTM: <inline-formula id="ieqn-84"><mml:math id="mml-ieqn-84"><mml:msubsup><mml:mi>h</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi></mml:msubsup></mml:math></inline-formula> and the mean of features (<inline-formula id="ieqn-85"><mml:math id="mml-ieqn-85"><mml:msub><mml:mrow><mml:mover><mml:mi>v</mml:mi><mml:mo stretchy="false">&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub></mml:math></inline-formula>). Conversely, in experiment VFDICM_soft2_m, we used the same inputs in the language LSTM but with a softmax activation function.</p>
<p>For experiment VFDICM_sig3_m, we utilized a sigmoid activation function in the VFD component and three inputs in the language LSTM: <inline-formula id="ieqn-86"><mml:math id="mml-ieqn-86"><mml:msub><mml:mrow><mml:mover><mml:mi>v</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub></mml:math></inline-formula>, <inline-formula id="ieqn-87"><mml:math id="mml-ieqn-87"><mml:msubsup><mml:mi>h</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi></mml:msubsup></mml:math></inline-formula>, and <inline-formula id="ieqn-88"><mml:math id="mml-ieqn-88"><mml:msub><mml:mrow><mml:mover><mml:mi>v</mml:mi><mml:mo stretchy="false">&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mi>t</mml:mi></mml:msub></mml:math></inline-formula>. Similarly, in experiment VFDICM_soft3_m, we used the same inputs in the language LSTM as VFDICM_sig3_m but with a softmax activation function in the VFD component.</p>
<p>To evaluate the impact of selecting the most relevant visual features at every time step (top-k) on VFDICM performance, we performed ablation experiments varying the <inline-formula id="ieqn-89"><mml:math id="mml-ieqn-89"><mml:mi>k</mml:mi></mml:math></inline-formula> parameter using the MS-COCO dataset, refer to <xref ref-type="table" rid="table-6">Table 6</xref>. Different versions were compared with top-k values of 1, 10, 15, 20. We observed a significant correlation between parameter <inline-formula id="ieqn-90"><mml:math id="mml-ieqn-90"><mml:mi>k</mml:mi></mml:math></inline-formula> and overall performance. With <inline-formula id="ieqn-91"><mml:math id="mml-ieqn-91"><mml:mi>k</mml:mi></mml:math></inline-formula> &#x003D; 10, the version outperformed the others with <inline-formula id="ieqn-92"><mml:math id="mml-ieqn-92"><mml:mi>k</mml:mi></mml:math></inline-formula> &#x003D; 1, 15, and 20. Therefore, we chose the configuration with <inline-formula id="ieqn-93"><mml:math id="mml-ieqn-93"><mml:mi>k</mml:mi></mml:math></inline-formula> &#x003D; 10 since it yielded the best results for integration.</p>
<table-wrap id="table-6">
<label>Table 6</label>
<caption>
<title>Ablation studies involving different top-k parameters in the phase of CIDEr optimization applied to the MS-COCO dataset</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th>Top-k</th>
<th>B1</th>
<th>B2</th>
<th>B3</th>
<th>B4</th>
<th>M</th>
<th>R</th>
<th>C</th>
<th>S</th>
</tr>
</thead>
<tbody>
<tr>
<td><italic>k</italic> &#x003D; 1</td>
<td>80.6</td>
<td>63.8</td>
<td>48.8</td>
<td>36.5</td>
<td>28.1</td>
<td>57.8</td>
<td>120.7</td>
<td>21.3</td>
</tr>
<tr>
<td><italic>k</italic> &#x003D; 10</td>
<td><styled-content style="color:#FF0000">80.8</styled-content></td>
<td><styled-content style="color:#FF0000">64.2</styled-content></td>
<td><styled-content style="color:#FF0000">49.3</styled-content></td>
<td><styled-content style="color:#FF0000">37.2</styled-content></td>
<td><styled-content style="color:#FF0000">28.3</styled-content></td>
<td><styled-content style="color:#FF0000">57.9</styled-content></td>
<td><styled-content style="color:#FF0000">122.4</styled-content></td>
<td><styled-content style="color:#FF0000">21.5</styled-content></td>
</tr>
<tr>
<td><italic>k</italic> &#x003D; 15</td>
<td>81.0</td>
<td>64.2</td>
<td>49.3</td>
<td>37.0</td>
<td>28.2</td>
<td>57.9</td>
<td>122.4</td>
<td>21.5</td>
</tr>
<tr>
<td><italic>k</italic> &#x003D; 20</td>
<td>80.6</td>
<td>63.8</td>
<td>48.9</td>
<td>36.7</td>
<td>28.1</td>
<td>57.7</td>
<td>121.7</td>
<td>21.5</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4_7">
<label>4.7</label>
<title>Qualitative Evaluation</title>
<p>In addition to conducting quantitative score analysis, we must assess the quality of the captions produced by VFDICM. <xref ref-type="fig" rid="fig-5">Fig. 5</xref> displays a selection of sample images from the test dataset, accompanied by their respective captions. Each image in <xref ref-type="fig" rid="fig-5">Fig. 5</xref> is associated with two different types of description. First, we describe the image using our proposed model, followed by the ground truth image captions.</p>
<fig id="fig-5">
<label>Figure 5</label>
<caption>
<title>Examples of the generated captions from VFDICM model. The VFDICM generates caption V, while the GT represents the ground truth captions</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_54841-fig-5.tif"/>
</fig>
<p>For instance, consider the picture located in row one, column one, top left. The caption for our proposed model, &#x201C;two men are skiing on a snowboard in the snow,&#x201D; describes the scene in greater detail by identifying &#x201C;two men.&#x201D; This caption is very similar to the human-generated caption, &#x201C;Two men use their snowboards to go down a snowy incline,&#x201D; demonstrating our model&#x2019;s capability to generate human-like captions. In another example, look at the image in the first column and first row from the right. According to our model, &#x201C;a teddy bear sitting on top of a wooden table,&#x201D; accurately depicts the image. This caption is very similar to the human-annotated ground truth caption, &#x201C;A cake shaped as a Teddy Bear on a wooden table.&#x201D; VFDICM&#x2019;s performance and quality of generated descriptions remain high, significantly exceeding that of the baseline on standard evaluation metrics, as shown by the scores in <xref ref-type="table" rid="table-3">Table 3</xref>. These examples illustrate the model&#x2019;s ability to understand and describe various contexts effectively. Moreover, the consistency in generating accurate captions across different images underscores the robustness of our model.</p>

</sec>
<sec id="s4_8">
<label>4.8</label>
<title>Discussion</title>
<p>In this work, we have introduced an innovative image captioning model that utilizes the visual characteristics of the input image to produce informative sentences. This model places a strong emphasis on the local visual features of the input image, enabling it to guide the prediction of the next word in the evolving caption. Through the integration of the Visual Feature Detector (VFD) and Visual Feature Visual Attention (VFVA) modules, the model effectively harnesses the visual representations of the input image, leading to a significant enhancement in the performance of the image captioning model.</p>
<p>VFDICM exclusively relies on the visual attributes of the input image, without depending on semantic information such as topics, attributes, or Parts of Speech (PoS). This reliance solely on the visual features empowers the model to exploit the richness of the visual content within the input image, ultimately generating high-quality text. Additionally, the proposed framework operates independently of any external data sources, other than the adopted dataset, allowing the model to concentrate entirely on the available visual data within the image content. Owing to its simplified architecture and fewer hyperparameters, our model is also a good choice as a baseline for more complicated models and architectures.</p>
<p>Furthermore, the VFDICM model, as proposed, makes a substantial contribution to improving the efficiency of image captioning algorithms and producing high-quality descriptions. This novel approach effectively exploits the visual features of the input image to improve the performance of image captioning models and generate more accurate and informative descriptions. Our model incorporates two additional modules, VFD and VFVA, to facilitate this. The VFD module dynamically selects the most relevant features of the input, creating a new visual matrix that consists of these relevant visual features in the current linguistic context. Meanwhile, the VFVA module attends to these selected visual features and generates a new visual context vector, which is then utilized for generating a new word in the evolving caption.</p>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Conclusion</title>
<p>An innovative approach for improving image captioning is presented in this paper. The VFDICM model comprises the Visual Feature Detector (VFD) and Visual Feature Visual Attention (VFVA) modules, which dynamically select and emphasize relevant visual features at each time step. Through rigorous experimentation to substantiate the advantages of VFD and VFVA modules, the outcomes illustrate that the methodology attains state-of-the-art performance when the model is trained with both cross-entropy loss and RL-based loss. By selectively emphasizing pertinent visual features, we aim to enhance caption accuracy and relevance, contributing to the advancement of image captioning technology. This work deepens the understanding of how selective attention to visual features can improve image captioning quality. In future work, we aim to explore the integration of Transformer architectures, specifically by incorporating VFD and VFVA within the Transformer decoder and multi-head self-attention into our model architecture. This research also offers promising directions for future studies in this field, ultimately leading to more accurate and informative image descriptions for real-world applications.</p>
</sec>
</body>
<back>
<ack><p>The authors would like to thank Prince Sultan University for their support.</p>
</ack>
<sec><title>Funding Statement</title>
<p>This work is supported by the National Natural Science Foundation of China (Nos. U22A2034, 62177047), High Caliber Foreign Experts Introduction Plan funded by MOST, and Central South University Research Programme of Advanced Interdisciplinary Studies (No. 2023QYJC020). Also, the authors would like to thank Prince Sultan University for paying the APC of this article.</p>
</sec>
<sec><title>Author Contributions</title>
<p>Alaa Thobhani: Software, Project administration, Investigation, Conceptualization, Visualization. Beiji Zou: Supervision. Xiaoyan Kui: Writing review and editing. Amr Abdussalam: Validation. Muhammad Asim: Resources, Formal analysis. Naveed Ahmed: Investigation. Mohammed Ali Alshara: Writing review. All authors reviewed the results and approved the final version of the manuscript.</p>
</sec>
<sec sec-type="data-availability"><title>Availability of Data and Materials</title>
<p>We used the well-known MS COCO dataset, which is publicly available.</p>
</sec>
<sec><title>Ethics Approval</title>
<p>Not applicable.</p>
</sec>
<sec sec-type="COI-statement"><title>Conflicts of Interest</title>
<p>The authors declare no conflicts of interest to report regarding the present study.</p>
</sec>
<ref-list content-type="authoryear">
<title>References</title>
<ref id="ref-1"><label>[1]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>M. H.</given-names> <surname>Bashir</surname></string-name>, <string-name><given-names>M.</given-names> <surname>Ahmad</surname></string-name>, <string-name><given-names>D. R.</given-names> <surname>Rizvi</surname></string-name>, and <string-name><given-names>A. A. A.</given-names> <surname>El-Latif</surname></string-name></person-group>, &#x201C;<article-title>Efficient CNN-based disaster events classification using UAV-aided images for emergency response application</article-title>,&#x201D; <source>Neural Comput. Appl.</source>, vol. <volume>36</volume>, no. <issue>18</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>14</lpage>, <year>2024</year>. doi: <pub-id pub-id-type="doi">10.1007/s00521-024-09610-4</pub-id>.</mixed-citation></ref>
<ref id="ref-2"><label>[2]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>H.</given-names> <surname>Ibrahim</surname></string-name> <etal>et al.</etal></person-group>, &#x201C;<article-title>Efficient color image enhancement using piecewise linear transformation and gamma correction</article-title>,&#x201D; <source>J. Opt.</source>, vol. <volume>53</volume>, pp. <fpage>2027</fpage>&#x2013;<lpage>2037</lpage>, <year>2024</year>. doi: <pub-id pub-id-type="doi">10.1007/s12596-023-01171-4</pub-id>.</mixed-citation></ref>
<ref id="ref-3"><label>[3]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>S. R.</given-names> <surname>Waheed</surname></string-name>, <string-name><given-names>N. M.</given-names> <surname>Suaib</surname></string-name>, <string-name><given-names>M. S. M.</given-names> <surname>Rahim</surname></string-name>, <string-name><given-names>A. R.</given-names> <surname>Khan</surname></string-name>, <string-name><given-names>S. A.</given-names> <surname>Bahaj</surname></string-name> and <string-name><given-names>T.</given-names> <surname>Saba</surname></string-name></person-group>, &#x201C;<article-title>Synergistic integration of transfer learning and deep learning for enhanced object detection in digital images</article-title>,&#x201D; <source>IEEE Access</source>, vol. <volume>12</volume>, pp. <fpage>13525</fpage>&#x2013;<lpage>13536</lpage>, <year>2024</year>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2024.3354706</pub-id>.</mixed-citation></ref>
<ref id="ref-4"><label>[4]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>P.</given-names> <surname>Anderson</surname></string-name> <etal>et al.</etal></person-group>, &#x201C;<article-title>Bottom-up and top-down attention for image captioning and visual question answering</article-title>,&#x201D; in <conf-name>Proc. IEEE Conf. Comput. Vis. Pattern Recognit.</conf-name>, <year>2018</year>, pp. <fpage>6077</fpage>&#x2013;<lpage>6086</lpage>.</mixed-citation></ref>
<ref id="ref-5"><label>[5]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>T. Y.</given-names> <surname>Lin</surname></string-name> <etal>et al.</etal></person-group>, &#x201C;<article-title>Microsoft COCO: Common objects in context</article-title>,&#x201D; in <conf-name>Comput. Vis.&#x2013;ECCV 2014: 13th Eur. Conf.</conf-name>, <publisher-loc>Zurich, Switzerland</publisher-loc>, <publisher-name>Springer</publisher-name>, <year>2014</year>, pp. <fpage>740</fpage>&#x2013;<lpage>755</lpage>.</mixed-citation></ref>
<ref id="ref-6"><label>[6]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>Q.</given-names> <surname>You</surname></string-name>, <string-name><given-names>H.</given-names> <surname>Jin</surname></string-name>, <string-name><given-names>Z.</given-names> <surname>Wang</surname></string-name>, <string-name><given-names>C.</given-names> <surname>Fang</surname></string-name>, and <string-name><given-names>J.</given-names> <surname>Luo</surname></string-name></person-group>, &#x201C;<article-title>Image captioning with semantic attention</article-title>,&#x201D; in <conf-name>Proc. IEEE Conf. Comput. Vis. Pattern Recognit.</conf-name>, <year>2016</year>, pp. <fpage>4651</fpage>&#x2013;<lpage>4659</lpage>.</mixed-citation></ref>
<ref id="ref-7"><label>[7]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>K.</given-names> <surname>Xu</surname></string-name> <etal>et al.</etal></person-group>, &#x201C;<article-title>Show, attend and tell: Neural image caption generation with visual attention</article-title>,&#x201D; in <conf-name>Int. Conf. on Mach. Learn.</conf-name>, <publisher-name>PMLR</publisher-name>, <year>2015</year>, pp. <fpage>2048</fpage>&#x2013;<lpage>2057</lpage>.</mixed-citation></ref>
<ref id="ref-8"><label>[8]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>W.</given-names> <surname>Jiang</surname></string-name>, <string-name><given-names>W.</given-names> <surname>Wang</surname></string-name>, and <string-name><given-names>H.</given-names> <surname>Hu</surname></string-name></person-group>, &#x201C;<article-title>Bi-directional co-attention network for image captioning</article-title>,&#x201D; <source>ACM Trans. Multimedia Comput., Commun., Appl. (TOMM)</source>, vol. <volume>17</volume>, no. <issue>4</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>20</lpage>, <year>2021</year>. doi: <pub-id pub-id-type="doi">10.1145/3460474</pub-id>.</mixed-citation></ref>
<ref id="ref-9"><label>[9]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>J.</given-names> <surname>Lu</surname></string-name>, <string-name><given-names>C.</given-names> <surname>Xiong</surname></string-name>, <string-name><given-names>D.</given-names> <surname>Parikh</surname></string-name>, and <string-name><given-names>R.</given-names> <surname>Socher</surname></string-name></person-group>, &#x201C;<article-title>Knowing when to look: Adaptive attention via a visual sentinel for image captioning</article-title>,&#x201D; in <conf-name>Proc. IEEE Conf. Comput. Vis. Pattern Recognit.</conf-name>, <year>2017</year>, pp. <fpage>375</fpage>&#x2013;<lpage>383</lpage>.</mixed-citation></ref>
<ref id="ref-10"><label>[10]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>J.</given-names> <surname>Ji</surname></string-name>, <string-name><given-names>C.</given-names> <surname>Xu</surname></string-name>, <string-name><given-names>X.</given-names> <surname>Zhang</surname></string-name>, <string-name><given-names>B.</given-names> <surname>Wang</surname></string-name>, and <string-name><given-names>X.</given-names> <surname>Song</surname></string-name></person-group>, &#x201C;<article-title>Spatio-temporal memory attention for image captioning</article-title>,&#x201D; <source>IEEE Trans. Image Process.</source>, vol. <volume>29</volume>, pp. <fpage>7615</fpage>&#x2013;<lpage>7628</lpage>, <year>2020</year>. doi: <pub-id pub-id-type="doi">10.1109/TIP.2020.3004729</pub-id>.</mixed-citation></ref>
<ref id="ref-11"><label>[11]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>C.</given-names> <surname>Yan</surname></string-name> <etal>et al.</etal></person-group>, &#x201C;<article-title>Task-adaptive attention for image captioning</article-title>,&#x201D; <source>IEEE Trans. Circuits Syst. Video Technol.</source>, vol. <volume>32</volume>, no. <issue>1</issue>, pp. <fpage>43</fpage>&#x2013;<lpage>51</lpage>, <year>2022</year>. doi: <pub-id pub-id-type="doi">10.1109/TCSVT.2021.3067449</pub-id>.</mixed-citation></ref>
<ref id="ref-12"><label>[12]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>L.</given-names> <surname>Yu</surname></string-name>, <string-name><given-names>J.</given-names> <surname>Zhang</surname></string-name>, and <string-name><given-names>Q.</given-names> <surname>Wu</surname></string-name></person-group>, &#x201C;<article-title>Dual attention on pyramid feature maps for image captioning</article-title>,&#x201D; <source>IEEE Trans. Multimedia</source>, vol. <volume>24</volume>, pp. <fpage>1775</fpage>&#x2013;<lpage>1786</lpage>, <year>2021</year>. doi: <pub-id pub-id-type="doi">10.1109/TMM.2021.3072479</pub-id>.</mixed-citation></ref>
<ref id="ref-13"><label>[13]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>W.</given-names> <surname>Jiang</surname></string-name>, <string-name><given-names>M.</given-names> <surname>Zhu</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Fang</surname></string-name>, <string-name><given-names>G.</given-names> <surname>Shi</surname></string-name>, <string-name><given-names>X.</given-names> <surname>Zhao</surname></string-name> and <string-name><given-names>Y.</given-names> <surname>Liu</surname></string-name></person-group>, &#x201C;<article-title>Visual cluster grounding for image captioning</article-title>,&#x201D; <source>IEEE Trans. Image Process.</source>, vol. <volume>31</volume>, pp. <fpage>3920</fpage>&#x2013;<lpage>3934</lpage>, <year>2022</year>. doi: <pub-id pub-id-type="doi">10.1109/TIP.2022.3177318</pub-id>; <pub-id pub-id-type="pmid">35635813</pub-id></mixed-citation></ref>
<ref id="ref-14"><label>[14]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>J.</given-names> <surname>Yu</surname></string-name>, <string-name><given-names>J.</given-names> <surname>Li</surname></string-name>, <string-name><given-names>Z.</given-names> <surname>Yu</surname></string-name>, and <string-name><given-names>Q.</given-names> <surname>Huang</surname></string-name></person-group>, &#x201C;<article-title>Multimodal transformer with multi-view visual representation for image captioning</article-title>,&#x201D; <source>IEEE Trans. Circuits Syst. Video Technol.</source>, vol. <volume>30</volume>, no. <issue>12</issue>, pp. <fpage>4467</fpage>&#x2013;<lpage>4480</lpage>, <year>2019</year>. doi: <pub-id pub-id-type="doi">10.1109/TCSVT.2019.2947482</pub-id>.</mixed-citation></ref>
<ref id="ref-15"><label>[15]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>A. A.</given-names> <surname>Liu</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Zhai</surname></string-name>, <string-name><given-names>N.</given-names> <surname>Xu</surname></string-name>, <string-name><given-names>W.</given-names> <surname>Nie</surname></string-name>, <string-name><given-names>W.</given-names> <surname>Li</surname></string-name> and <string-name><given-names>Y.</given-names> <surname>Zhang</surname></string-name></person-group>, &#x201C;<article-title>Region-aware image captioning via interaction learning</article-title>,&#x201D; <source>IEEE Trans. Circuits Syst. Video Technol.</source>, vol. <volume>32</volume>, no. <issue>6</issue>, pp. <fpage>3685</fpage>&#x2013;<lpage>3696</lpage>, <year>2021</year>. doi: <pub-id pub-id-type="doi">10.1109/TCSVT.2021.3107035</pub-id>.</mixed-citation></ref>
<ref id="ref-16"><label>[16]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>M. H.</given-names> <surname>Guo</surname></string-name>, <string-name><given-names>C. Z.</given-names> <surname>Lu</surname></string-name>, <string-name><given-names>Z. N.</given-names> <surname>Liu</surname></string-name>, <string-name><given-names>M. M.</given-names> <surname>Cheng</surname></string-name>, and <string-name><given-names>S. M.</given-names> <surname>Hu</surname></string-name></person-group>, &#x201C;<article-title>Visual attention network</article-title>,&#x201D; <source>Comput. Vis. Media</source>, vol. <volume>9</volume>, no. <issue>4</issue>, pp. <fpage>733</fpage>&#x2013;<lpage>752</lpage>, <year>2023</year>. doi: <pub-id pub-id-type="doi">10.1007/s41095-023-0364-2</pub-id>.</mixed-citation></ref>
<ref id="ref-17"><label>[17]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>W.</given-names> <surname>Jiang</surname></string-name>, <string-name><given-names>Q.</given-names> <surname>Li</surname></string-name>, <string-name><given-names>K.</given-names> <surname>Zhan</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Fang</surname></string-name>, and <string-name><given-names>F.</given-names> <surname>Shen</surname></string-name></person-group>, &#x201C;<article-title>Hybrid attention network for image captioning</article-title>,&#x201D; <source>Displays</source>, vol. <volume>73</volume>, <year>2022</year>, Art. no. 102238. doi: <pub-id pub-id-type="doi">10.1016/j.displa.2022.102238</pub-id>.</mixed-citation></ref>
<ref id="ref-18"><label>[18]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>Y.</given-names> <surname>Ma</surname></string-name>, <string-name><given-names>J.</given-names> <surname>Ji</surname></string-name>, <string-name><given-names>X.</given-names> <surname>Sun</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Zhou</surname></string-name>, and <string-name><given-names>R.</given-names> <surname>Ji</surname></string-name></person-group>, &#x201C;<article-title>Towards local visual modeling for image captioning</article-title>,&#x201D; <source>Pattern Recognit.</source>, vol. <volume>138</volume>, <year>2023</year>, Art. no. 109420. doi: <pub-id pub-id-type="doi">10.1016/j.patcog.2023.109420</pub-id>.</mixed-citation></ref>
<ref id="ref-19"><label>[19]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>M.</given-names> <surname>Al-Qatf</surname></string-name> <etal>et al.</etal></person-group>, &#x201C;<article-title>RVAIC: Refined visual attention for improved image captioning</article-title>,&#x201D; <source>J. Intell. Fuzzy Syst.</source>, vol. <volume>46</volume>, pp. <fpage>1</fpage>&#x2013;<lpage>13</lpage>, <year>2024</year>. doi: <pub-id pub-id-type="doi">10.3233/JIFS-233004</pub-id>.</mixed-citation></ref>
<ref id="ref-20"><label>[20]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>M. B.</given-names> <surname>Hossen</surname></string-name>, <string-name><given-names>Z.</given-names> <surname>Ye</surname></string-name>, <string-name><given-names>A.</given-names> <surname>Abdussalam</surname></string-name>, and <string-name><given-names>M. I.</given-names> <surname>Hossain</surname></string-name></person-group>, &#x201C;<article-title>GVA: Guided visual attention approach for automatic image caption generation</article-title>,&#x201D; <source>Multimed. Syst.</source>, vol. <volume>30</volume>, no. <issue>1</issue>, <year>2024</year>, <comment>Art. no. 50</comment>. doi: <pub-id pub-id-type="doi">10.1007/s00530-023-01249-w</pub-id>.</mixed-citation></ref>
<ref id="ref-21"><label>[21]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>C.</given-names> <surname>Wang</surname></string-name> and <string-name><given-names>X.</given-names> <surname>Gu</surname></string-name></person-group>, &#x201C;<article-title>Learning joint relationship attention network for image captioning</article-title>,&#x201D; <source>Expert. Syst. Appl.</source>, vol. <volume>211</volume>, no. <issue>20</issue>, <year>2023</year>, Art. no. 118474. doi: <pub-id pub-id-type="doi">10.1016/j.eswa.2022.118474</pub-id>.</mixed-citation></ref>
<ref id="ref-22"><label>[22]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>M.</given-names> <surname>Zhang</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Yang</surname></string-name>, <string-name><given-names>H.</given-names> <surname>Zhang</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Ji</surname></string-name>, <string-name><given-names>H. T.</given-names> <surname>Shen</surname></string-name> and <string-name><given-names>T. S.</given-names> <surname>Chua</surname></string-name></person-group>, &#x201C;<article-title>More is better: Precise and detailed image captioning using online positive recall and missing concepts mining</article-title>,&#x201D; <source>IEEE Trans. Image Process.</source>, vol. <volume>28</volume>, no. <issue>1</issue>, pp. <fpage>32</fpage>&#x2013;<lpage>44</lpage>, <year>2018</year>. doi: <pub-id pub-id-type="doi">10.1109/TIP.2018.2855415</pub-id>; <pub-id pub-id-type="pmid">30010565</pub-id></mixed-citation></ref>
<ref id="ref-23"><label>[23]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>Y.</given-names> <surname>Huang</surname></string-name>, <string-name><given-names>J.</given-names> <surname>Chen</surname></string-name>, <string-name><given-names>W.</given-names> <surname>Ouyang</surname></string-name>, <string-name><given-names>W.</given-names> <surname>Wan</surname></string-name>, and <string-name><given-names>Y.</given-names> <surname>Xue</surname></string-name></person-group>, &#x201C;<article-title>Image captioning with end-to-end attribute detection and subsequent attributes prediction</article-title>,&#x201D; <source>IEEE Trans. Image Process.</source>, vol. <volume>29</volume>, pp. <fpage>4013</fpage>&#x2013;<lpage>4026</lpage>, <year>2020</year>. doi: <pub-id pub-id-type="doi">10.1109/TIP.2020.2969330</pub-id>; <pub-id pub-id-type="pmid">32012014</pub-id></mixed-citation></ref>
<ref id="ref-24"><label>[24]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>J. W.</given-names> <surname>Bae</surname></string-name>, <string-name><given-names>S. H.</given-names> <surname>Lee</surname></string-name>, <string-name><given-names>W. Y.</given-names> <surname>Kim</surname></string-name>, <string-name><given-names>J. H.</given-names> <surname>Seong</surname></string-name>, and <string-name><given-names>D. H.</given-names> <surname>Seo</surname></string-name></person-group>, &#x201C;<article-title>Image captioning model using part-of-speech guidance module for description with diverse vocabulary</article-title>,&#x201D; <source>IEEE Access</source>, vol. <volume>10</volume>, no. <issue>11</issue>, pp. <fpage>45219</fpage>&#x2013;<lpage>45229</lpage>, <year>2022</year>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2022.3169781</pub-id>.</mixed-citation></ref>
<ref id="ref-25"><label>[25]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>J.</given-names> <surname>Zhang</surname></string-name>, <string-name><given-names>K.</given-names> <surname>Mei</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Zheng</surname></string-name>, and <string-name><given-names>J.</given-names> <surname>Fan</surname></string-name></person-group>, &#x201C;<article-title>Integrating part of speech guidance for image captioning</article-title>,&#x201D; <source>IEEE Trans. Multimedia</source>, vol. <volume>23</volume>, pp. <fpage>92</fpage>&#x2013;<lpage>104</lpage>, <year>2020</year>. doi: <pub-id pub-id-type="doi">10.1109/TMM.2020.2976552</pub-id>.</mixed-citation></ref>
<ref id="ref-26"><label>[26]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>M.</given-names> <surname>Al-Qatf</surname></string-name> <etal>et al.</etal></person-group>, &#x201C;<article-title>NPoSC-A3: A novel part of speech clues-aware adaptive attention mechanism for image captioning</article-title>,&#x201D; <source>Eng. Appl. Artif. Intell.</source>, vol. <volume>131</volume>, no. <issue>4</issue>, <year>2024</year>, Art. no. 107732. doi: <pub-id pub-id-type="doi">10.1016/j.engappai.2023.107732</pub-id>.</mixed-citation></ref>
<ref id="ref-27"><label>[27]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>N.</given-names> <surname>Yu</surname></string-name>, <string-name><given-names>X.</given-names> <surname>Hu</surname></string-name>, <string-name><given-names>B.</given-names> <surname>Song</surname></string-name>, <string-name><given-names>J.</given-names> <surname>Yang</surname></string-name>, and <string-name><given-names>J.</given-names> <surname>Zhang</surname></string-name></person-group>, &#x201C;<article-title>Topic-oriented image captioning based on order-embedding</article-title>,&#x201D; <source>IEEE Trans. Image Process.</source>, vol. <volume>28</volume>, no. <issue>6</issue>, pp. <fpage>2743</fpage>&#x2013;<lpage>2754</lpage>, <year>2018</year>. doi: <pub-id pub-id-type="doi">10.1109/TIP.2018.2889922</pub-id>; <pub-id pub-id-type="pmid">30596577</pub-id></mixed-citation></ref>
<ref id="ref-28"><label>[28]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>H.</given-names> <surname>Wei</surname></string-name>, <string-name><given-names>Z.</given-names> <surname>Li</surname></string-name>, <string-name><given-names>F.</given-names> <surname>Huang</surname></string-name>, <string-name><given-names>C.</given-names> <surname>Zhang</surname></string-name>, <string-name><given-names>H.</given-names> <surname>Ma</surname></string-name> and <string-name><given-names>Z.</given-names> <surname>Shi</surname></string-name></person-group>, &#x201C;<article-title>Integrating scene semantic knowledge into image captioning</article-title>,&#x201D; <source>ACM Trans. Multimedia Comput., Commun., Appl. (TOMM)</source>, vol. <volume>17</volume>, no. <issue>2</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>22</lpage>, <year>2021</year>. doi: <pub-id pub-id-type="doi">10.1145/3439734</pub-id>.</mixed-citation></ref>
<ref id="ref-29"><label>[29]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>M.</given-names> <surname>Liu</surname></string-name>, <string-name><given-names>H.</given-names> <surname>Hu</surname></string-name>, <string-name><given-names>L.</given-names> <surname>Li</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Yu</surname></string-name>, and <string-name><given-names>W.</given-names> <surname>Guan</surname></string-name></person-group>, &#x201C;<article-title>Chinese image caption generation via visual attention and topic modeling</article-title>,&#x201D; <source>IEEE Trans. Cybern.</source>, vol. <volume>52</volume>, no. <issue>2</issue>, pp. <fpage>1247</fpage>&#x2013;<lpage>1257</lpage>, <year>2020</year>. doi: <pub-id pub-id-type="doi">10.1109/TCYB.2020.2997034</pub-id>; <pub-id pub-id-type="pmid">32568717</pub-id></mixed-citation></ref>
<ref id="ref-30"><label>[30]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>M.</given-names> <surname>Al-Qatf</surname></string-name>, <string-name><given-names>X.</given-names> <surname>Wang</surname></string-name>, <string-name><given-names>A.</given-names> <surname>Hawbani</surname></string-name>, <string-name><given-names>A.</given-names> <surname>Abdusallam</surname></string-name>, and <string-name><given-names>S. H.</given-names> <surname>Alsamhi</surname></string-name></person-group>, &#x201C;<article-title>Image captioning with novel topics guidance and retrieval-based topics re-weighting</article-title>,&#x201D; <source>IEEE Trans. Multimedia</source>, vol. <volume>25</volume>, pp. <fpage>5984</fpage>&#x2013;<lpage>5999</lpage>, <year>2023</year>. doi: <pub-id pub-id-type="doi">10.1109/TMM.2022.3202690</pub-id>.</mixed-citation></ref>
<ref id="ref-31"><label>[31]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>L.</given-names> <surname>Zhou</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Zhang</surname></string-name>, <string-name><given-names>Y. G.</given-names> <surname>Jiang</surname></string-name>, <string-name><given-names>T.</given-names> <surname>Zhang</surname></string-name>, and <string-name><given-names>W.</given-names> <surname>Fan</surname></string-name></person-group>, &#x201C;<article-title>Re-caption: Saliency-enhanced image captioning through two-phase learning</article-title>,&#x201D; <source>IEEE Trans. Image Process.</source>, vol. <volume>29</volume>, pp. <fpage>694</fpage>&#x2013;<lpage>709</lpage>, <year>2019</year>. doi: <pub-id pub-id-type="doi">10.1109/TIP.2019.2928144</pub-id>; <pub-id pub-id-type="pmid">31331893</pub-id></mixed-citation></ref>
<ref id="ref-32"><label>[32]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>X.</given-names> <surname>Li</surname></string-name>, <string-name><given-names>A.</given-names> <surname>Yuan</surname></string-name>, and <string-name><given-names>X.</given-names> <surname>Lu</surname></string-name></person-group>, &#x201C;<article-title>Vision-to-language tasks based on attributes and attention mechanism</article-title>,&#x201D; <source>IEEE Trans. Cybern.</source>, vol. <volume>51</volume>, no. <issue>2</issue>, pp. <fpage>913</fpage>&#x2013;<lpage>926</lpage>, <year>2019</year>. doi: <pub-id pub-id-type="doi">10.1109/TCYB.2019.2914351</pub-id>; <pub-id pub-id-type="pmid">31107679</pub-id></mixed-citation></ref>
<ref id="ref-33"><label>[33]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>L.</given-names> <surname>Cheng</surname></string-name>, <string-name><given-names>W.</given-names> <surname>Wei</surname></string-name>, <string-name><given-names>X.</given-names> <surname>Mao</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Liu</surname></string-name>, and <string-name><given-names>C.</given-names> <surname>Miao</surname></string-name></person-group>, &#x201C;<article-title>Stack-VS: Stacked visual-semantic attention for image caption generation</article-title>,&#x201D; <source>IEEE Access</source>, vol. <volume>8</volume>, pp. <fpage>154953</fpage>&#x2013;<lpage>154965</lpage>, <year>2020</year>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2020.3018752</pub-id>.</mixed-citation></ref>
<ref id="ref-34"><label>[34]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><given-names>N.</given-names> <surname>Rotstein</surname></string-name>, <string-name><given-names>D.</given-names> <surname>Bensaid</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Brody</surname></string-name>, <string-name><given-names>R.</given-names> <surname>Ganz</surname></string-name>, and <string-name><given-names>R.</given-names> <surname>Kimmel</surname></string-name></person-group>, &#x201C;<article-title>FuseCap: Leveraging large language models to fuse visual data into enriched image captions</article-title>,&#x201D; <year>2023</year>, <italic>arXiv:2305.17718</italic>.</mixed-citation></ref>
<ref id="ref-35"><label>[35]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>D. A.</given-names> <surname>Hafeth</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Kollias</surname></string-name>, and <string-name><given-names>M.</given-names> <surname>Ghafoor</surname></string-name></person-group>, &#x201C;<article-title>Semantic representations with attention networks for boosting image captioning</article-title>,&#x201D; <source>IEEE Access</source>, vol. <volume>11</volume>, pp. <fpage>40230</fpage>&#x2013;<lpage>40239</lpage>, <year>2023</year>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2023.3268744</pub-id>.</mixed-citation></ref>
<ref id="ref-36"><label>[36]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><given-names>N.</given-names> <surname>Haque</surname></string-name>, <string-name><given-names>I.</given-names> <surname>Labiba</surname></string-name>, and <string-name><given-names>S.</given-names> <surname>Akter</surname></string-name></person-group>, &#x201C;<article-title>FaceAtt: Enhancing image captioning with facial attributes for portrait images</article-title>,&#x201D; <year>2023</year>, <italic>arXiv:2309.13601</italic>.</mixed-citation></ref>
<ref id="ref-37"><label>[37]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>B.</given-names> <surname>Dai</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Fidler</surname></string-name>, <string-name><given-names>R.</given-names> <surname>Urtasun</surname></string-name>, and <string-name><given-names>D.</given-names> <surname>Lin</surname></string-name></person-group>, &#x201C;<article-title>Towards diverse and natural image descriptions via a conditional gan</article-title>,&#x201D; in <conf-name>Proc. of the IEEE Int. Conf. on Comput. Vis.</conf-name>, <year>2017</year>, pp. <fpage>2970</fpage>&#x2013;<lpage>2979</lpage>.</mixed-citation></ref>
<ref id="ref-38"><label>[38]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><given-names>Y.</given-names> <surname>Mao</surname></string-name>, <string-name><given-names>C.</given-names> <surname>Zhou</surname></string-name>, <string-name><given-names>X.</given-names> <surname>Wang</surname></string-name>, and <string-name><given-names>R.</given-names> <surname>Li</surname></string-name></person-group>, &#x201C;<chapter-title>Show and tell more: Topic-oriented multi-sentence image captioning</chapter-title>,&#x201D; in <source>IJCAI</source>, <year>2018</year>, pp. <fpage>4258</fpage>&#x2013;<lpage>4264</lpage>.</mixed-citation></ref>
<ref id="ref-39"><label>[39]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>A.</given-names> <surname>Abdussalam</surname></string-name>, <string-name><given-names>Z.</given-names> <surname>Ye</surname></string-name>, <string-name><given-names>A.</given-names> <surname>Hawbani</surname></string-name>, <string-name><given-names>M.</given-names> <surname>Al-Qatf</surname></string-name>, and <string-name><given-names>R.</given-names> <surname>Khan</surname></string-name></person-group>, &#x201C;<article-title>NumCap: A number-controlled multi-caption image captioning network</article-title>,&#x201D; <source>ACM Trans. Multimedia Comput., Commun. Appl.</source>, vol. <volume>19</volume>, no. <issue>4</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>24</lpage>, <year>2023</year>. doi: <pub-id pub-id-type="doi">10.1145/3576927</pub-id>.</mixed-citation></ref>
<ref id="ref-40"><label>[40]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>J.</given-names> <surname>Wu</surname></string-name>, <string-name><given-names>T.</given-names> <surname>Chen</surname></string-name>, <string-name><given-names>H.</given-names> <surname>Wu</surname></string-name>, <string-name><given-names>Z.</given-names> <surname>Yang</surname></string-name>, <string-name><given-names>G.</given-names> <surname>Luo</surname></string-name> and <string-name><given-names>L.</given-names> <surname>Lin</surname></string-name></person-group>, &#x201C;<article-title>Fine-grained image captioning with global-local discriminative objective</article-title>,&#x201D; <source>IEEE Trans. Multimedia</source>, vol. <volume>23</volume>, pp. <fpage>2413</fpage>&#x2013;<lpage>2427</lpage>, <year>2020</year>. doi: <pub-id pub-id-type="doi">10.1109/TMM.2020.3011317</pub-id>.</mixed-citation></ref>
<ref id="ref-41"><label>[41]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>H.</given-names> <surname>Liu</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Zhang</surname></string-name>, <string-name><given-names>K.</given-names> <surname>Lin</surname></string-name>, <string-name><given-names>J.</given-names> <surname>Wen</surname></string-name>, <string-name><given-names>J.</given-names> <surname>Li</surname></string-name> and <string-name><given-names>X.</given-names> <surname>Hu</surname></string-name></person-group>, &#x201C;<article-title>Vocabulary-wide credit assignment for training image captioning models</article-title>,&#x201D; <source>IEEE Trans. Image Process.</source>, vol. <volume>30</volume>, pp. <fpage>2450</fpage>&#x2013;<lpage>2460</lpage>, <year>2021</year>. doi: <pub-id pub-id-type="doi">10.1109/TIP.2021.3051476</pub-id>; <pub-id pub-id-type="pmid">33471759</pub-id></mixed-citation></ref>
<ref id="ref-42"><label>[42]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>N.</given-names> <surname>Xu</surname></string-name> <etal>et al.</etal></person-group>, &#x201C;<article-title>Multi-level policy and reward-based deep reinforcement learning framework for image captioning</article-title>,&#x201D; <source>IEEE Trans. Multimedia</source>, vol. <volume>22</volume>, no. <issue>5</issue>, pp. <fpage>1372</fpage>&#x2013;<lpage>1383</lpage>, <year>2019</year>. doi: <pub-id pub-id-type="doi">10.1109/TMM.2019.2941820</pub-id>.</mixed-citation></ref>
<ref id="ref-43"><label>[43]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>S. J.</given-names> <surname>Rennie</surname></string-name>, <string-name><given-names>E.</given-names> <surname>Marcheret</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Mroueh</surname></string-name>, <string-name><given-names>J.</given-names> <surname>Ross</surname></string-name>, and <string-name><given-names>V.</given-names> <surname>Goel</surname></string-name></person-group>, &#x201C;<article-title>Self-critical sequence training for image captioning</article-title>,&#x201D; in <conf-name>Proc. IEEE Conf. Comput. Vis. Pattern Recognit.</conf-name>, <year>2017</year>, pp. <fpage>7008</fpage>&#x2013;<lpage>7024</lpage>.</mixed-citation></ref>
<ref id="ref-44"><label>[44]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>Y.</given-names> <surname>Zhang</surname></string-name>, <string-name><given-names>X.</given-names> <surname>Shi</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Mi</surname></string-name>, and <string-name><given-names>X.</given-names> <surname>Yang</surname></string-name></person-group>, &#x201C;<article-title>Image captioning with transformer and knowledge graph</article-title>,&#x201D; <source>Pattern Recognit. Lett.</source>, vol. <volume>143</volume>, no. <issue>6</issue>, pp. <fpage>43</fpage>&#x2013;<lpage>49</lpage>, <year>2021</year>. doi: <pub-id pub-id-type="doi">10.1016/j.patrec.2020.12.020</pub-id>.</mixed-citation></ref>
<ref id="ref-45"><label>[45]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>C.</given-names> <surname>Wu</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Yuan</surname></string-name>, <string-name><given-names>H.</given-names> <surname>Cao</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Wei</surname></string-name>, and <string-name><given-names>L.</given-names> <surname>Wang</surname></string-name></person-group>, &#x201C;<article-title>Hierarchical attention-based fusion for image caption with multi-grained rewards</article-title>,&#x201D; <source>IEEE Access</source>, vol. <volume>8</volume>, pp. <fpage>57943</fpage>&#x2013;<lpage>57951</lpage>, <year>2020</year>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2020.2981513</pub-id>.</mixed-citation></ref>
<ref id="ref-46"><label>[46]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>A.</given-names> <surname>Karpathy</surname></string-name> and <string-name><given-names>L.</given-names> <surname>Fei</surname></string-name></person-group>, &#x201C;<article-title>Deep visual-semantic alignments for generating image descriptions</article-title>,&#x201D; in <conf-name>Proc. IEEE Conf. Comput. Vis. Pattern Recognit.</conf-name>, <year>2015</year>, pp. <fpage>3128</fpage>&#x2013;<lpage>3137</lpage>.</mixed-citation></ref>
<ref id="ref-47"><label>[47]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>R.</given-names> <surname>Vedantam</surname></string-name>, <string-name><given-names>C.</given-names> <surname>Lawrence Zitnick</surname></string-name>, and <string-name><given-names>D.</given-names> <surname>Parikh</surname></string-name></person-group>, &#x201C;<article-title>CIDEr: Consensus-based image description evaluation</article-title>,&#x201D; in <conf-name>Proc. IEEE Conf. Comput. Vis. Pattern Recognition</conf-name>, <year>2015</year>, pp. <fpage>4566</fpage>&#x2013;<lpage>4575</lpage>.</mixed-citation></ref>
<ref id="ref-48"><label>[48]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>S.</given-names> <surname>Banerjee</surname></string-name> and <string-name><given-names>A.</given-names> <surname>Lavie</surname></string-name></person-group>, &#x201C;<article-title>METEOR: An automatic metric for MT evaluation with improved correlation with human judgments</article-title>,&#x201D; in <conf-name>Proc. ACL Workshop Intrinsic Extrinsic Eval. Meas. for Mach. Transl. Summarizat.</conf-name>, <year>2005</year>, pp. <fpage>65</fpage>&#x2013;<lpage>72</lpage>.</mixed-citation></ref>
<ref id="ref-49"><label>[49]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>K.</given-names> <surname>Papineni</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Roukos</surname></string-name>, <string-name><given-names>T.</given-names> <surname>Ward</surname></string-name>, and <string-name><given-names>W. J.</given-names> <surname>Zhu</surname></string-name></person-group>, &#x201C;<article-title>BLEU: A method for automatic evaluation of machine translation</article-title>,&#x201D; in <conf-name>Proc. 40th Annu. Meet. Assoc. Computat. Linguistics</conf-name>, <year>2002</year>, pp. <fpage>311</fpage>&#x2013;<lpage>318</lpage>.</mixed-citation></ref>
<ref id="ref-50"><label>[50]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><given-names>C. Y.</given-names> <surname>Lin</surname></string-name></person-group>, &#x201C;<chapter-title>Rouge: A package for automatic evaluation of summaries</chapter-title>,&#x201D; in <source>Text Summarization Branches Out</source>, <publisher-loc>Barcelona, Spain</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>, <year>2004</year>, pp. <fpage>74</fpage>&#x2013;<lpage>81</lpage>.</mixed-citation></ref>
<ref id="ref-51"><label>[51]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>P.</given-names> <surname>Anderson</surname></string-name>, <string-name><given-names>B.</given-names> <surname>Fernando</surname></string-name>, <string-name><given-names>M.</given-names> <surname>Johnson</surname></string-name>, and <string-name><given-names>S.</given-names> <surname>Gould</surname></string-name></person-group>, &#x201C;<article-title>Spice: Semantic propositional image caption evaluation</article-title>,&#x201D; in <conf-name>Comput. Vis.&#x2013;ECCV 2016</conf-name>: <publisher-loc>Amsterdam, The Netherlands</publisher-loc>, <publisher-name>Springer</publisher-name>, <year>2016</year>, pp. <fpage>14</fpage>&#x2013;<lpage>398</lpage>.</mixed-citation></ref>
<ref id="ref-52"><label>[52]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>W.</given-names> <surname>Jiang</surname></string-name>, <string-name><given-names>L.</given-names> <surname>Ma</surname></string-name>, <string-name><given-names>Y. -G.</given-names> <surname>Jiang</surname></string-name>, <string-name><given-names>W.</given-names> <surname>Liu</surname></string-name>, and <string-name><given-names>T.</given-names> <surname>Zhang</surname></string-name></person-group>, &#x201C;<article-title>Recurrent fusion network for image captioning</article-title>,&#x201D; in <conf-name>Proc. Eur. Conf. Comput. Vis. (ECCV)</conf-name>, <year>2018</year>, pp. <fpage>499</fpage>&#x2013;<lpage>515</lpage>.</mixed-citation></ref>
<ref id="ref-53"><label>[53]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>L.</given-names> <surname>Wu</surname></string-name>, <string-name><given-names>M.</given-names> <surname>Xu</surname></string-name>, <string-name><given-names>J.</given-names> <surname>Wang</surname></string-name>, and <string-name><given-names>S.</given-names> <surname>Perry</surname></string-name></person-group>, &#x201C;<article-title>Recall what you see continually using gridlstm in image captioning</article-title>,&#x201D; <source>IEEE Trans. Multimedia</source>, vol. <volume>22</volume>, no. <issue>3</issue>, pp. <fpage>808</fpage>&#x2013;<lpage>818</lpage>, <year>2019</year>. doi: <pub-id pub-id-type="doi">10.1109/TMM.2019.2931815</pub-id>.</mixed-citation></ref>
<ref id="ref-54"><label>[54]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>C.</given-names> <surname>Sur</surname></string-name></person-group>, &#x201C;<article-title>MRRC: Multiple role representation crossover interpretation for image captioning with R-CNN feature distribution composition (FDC)</article-title>,&#x201D; <source>Multimed. Tools Appl.</source>, vol. <volume>80</volume>, no. <issue>12</issue>, pp. <fpage>18413</fpage>&#x2013;<lpage>18443</lpage>, <year>2021</year>. doi: <pub-id pub-id-type="doi">10.1007/s11042-021-10578-9</pub-id>.</mixed-citation></ref>
<ref id="ref-55"><label>[55]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>T.</given-names> <surname>do Carmo Nogueira</surname></string-name>, <string-name><given-names>C. D. N.</given-names> <surname>Vinhal</surname></string-name>, <string-name><given-names>G.</given-names> <surname>da Cruz J&#x00FA;nior</surname></string-name>, <string-name><given-names>M. R. D.</given-names> <surname>Ullmann</surname></string-name>, and <string-name><given-names>T. C.</given-names> <surname>Marques</surname></string-name></person-group>, &#x201C;<article-title>A reference-based model using deep learning for image captioning</article-title>,&#x201D; <source>Multimed. Syst.</source>, vol. <volume>29</volume>, no. <issue>3</issue>, pp. <fpage>1665</fpage>&#x2013;<lpage>1681</lpage>, <year>2023</year>. doi: <pub-id pub-id-type="doi">10.1007/s00530-022-00937-3</pub-id>.</mixed-citation></ref>
<ref id="ref-56"><label>[56]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>D.</given-names> <surname>Zhao</surname></string-name>, <string-name><given-names>R.</given-names> <surname>Yang</surname></string-name>, <string-name><given-names>Z.</given-names> <surname>Wang</surname></string-name>, and <string-name><given-names>Z.</given-names> <surname>Qi</surname></string-name></person-group>, &#x201C;<article-title>A cooperative approach based on self-attention with interactive attribute for image caption</article-title>,&#x201D; <source>Multimed. Tools Appl.</source>, vol. <volume>82</volume>, no. <issue>1</issue>, pp. <fpage>1223</fpage>&#x2013;<lpage>1236</lpage>, <year>2023</year>. doi: <pub-id pub-id-type="doi">10.1007/s11042-022-13279-z</pub-id>.</mixed-citation></ref>
</ref-list>
</back></article>