<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1 20151215//EN" "http://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xml:lang="en" article-type="research-article" dtd-version="1.1">
<front>
<journal-meta>
<journal-id journal-id-type="pmc">CMC</journal-id>
<journal-id journal-id-type="nlm-ta">CMC</journal-id>
<journal-id journal-id-type="publisher-id">CMC</journal-id>
<journal-title-group>
<journal-title>Computers, Materials &#x0026; Continua</journal-title>
</journal-title-group>
<issn pub-type="epub">1546-2226</issn>
<issn pub-type="ppub">1546-2218</issn>
<publisher>
<publisher-name>Tech Science Press</publisher-name>
<publisher-loc>USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">50767</article-id>
<article-id pub-id-type="doi">10.32604/cmc.2024.050767</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Article</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>MSD-Net: Pneumonia Classification Model Based on Multi-Scale Directional Feature Enhancement</article-title>
<alt-title alt-title-type="left-running-head">MSD-Net: Pneumonia Classification Model Based on Multi-Scale Directional Feature Enhancement</alt-title>
<alt-title alt-title-type="right-running-head">MSD-Net: Pneumonia Classification Model Based on Multi-Scale Directional Feature Enhancement</alt-title>
</title-group>
<contrib-group>
<contrib id="author-1" contrib-type="author">
<name name-style="western"><surname>Zhou</surname><given-names>Tao</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><xref ref-type="aff" rid="aff-3">3</xref></contrib>
<contrib id="author-2" contrib-type="author" corresp="yes">
<name name-style="western"><surname>Guo</surname><given-names>Yujie</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><xref ref-type="aff" rid="aff-3">3</xref><email>guo_yujie0815@163.com</email></contrib>
<contrib id="author-3" contrib-type="author">
<name name-style="western"><surname>Peng</surname><given-names>Caiyue</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><xref ref-type="aff" rid="aff-3">3</xref></contrib>
<contrib id="author-4" contrib-type="author">
<name name-style="western"><surname>Niu</surname><given-names>Yuxia</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><xref ref-type="aff" rid="aff-3">3</xref></contrib>
<contrib id="author-5" contrib-type="author">
<name name-style="western"><surname>Pan</surname><given-names>Yunfeng</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><xref ref-type="aff" rid="aff-3">3</xref></contrib>
<contrib id="author-6" contrib-type="author">
<name name-style="western"><surname>Lu</surname><given-names>Huiling</given-names></name><xref ref-type="aff" rid="aff-2">2</xref></contrib>
<aff id="aff-1"><label>1</label><institution>School of Computer Science and Engineering, North Minzu University</institution>, <addr-line>Yinchuan, 750021</addr-line>, <country>China</country></aff>
<aff id="aff-2"><label>2</label><institution>School of Medical Information &#x0026; Engineering, Ningxia Medical University</institution>, <addr-line>Yinchuan, 750004</addr-line>, <country>China</country></aff>
<aff id="aff-3"><label>3</label><institution>Key Laboratory of Image and Graphics Intelligent Processing of State Ethnic Affairs Commission, North Minzu University</institution>, <addr-line>Yinchuan, 750021</addr-line>, <country>China</country></aff>
</contrib-group>
<author-notes>
<corresp id="cor1"><label>&#x002A;</label>Corresponding Author: Yujie Guo. Email: <email>guo_yujie0815@163.com</email></corresp>
</author-notes>
<pub-date date-type="collection" publication-format="electronic">
<year>2024</year></pub-date>
<pub-date date-type="pub" publication-format="electronic">
<day>20</day>
<month>6</month>
<year>2024</year></pub-date>
<volume>79</volume>
<issue>3</issue>
<fpage>4863</fpage>
<lpage>4882</lpage>
<history>
<date date-type="received">
<day>17</day>
<month>2</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>30</day>
<month>4</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>&#x00A9; 2024 Zhou et al.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Zhou et al.</copyright-holder>
<license xlink:href="https://creativecommons.org/licenses/by/4.0/">
<license-p>This work is licensed under a <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="TSP_CMC_50767.pdf"></self-uri>
<abstract>
<p>Computer-aided diagnosis of pneumonia based on deep learning is a research hotspot. However, existing methods do not sufficiently extract features of different sizes and different directions from lung X-ray images. A pneumonia classification model based on multi-scale directional feature enhancement, MSD-Net, is proposed in this paper. The main innovations are as follows: Firstly, the Multi-scale Residual Feature Extraction Module (MRFEM) is designed to effectively extract multi-scale features. The MRFEM uses dilated convolutions with different expansion rates to increase the receptive field and extract multi-scale features effectively. Secondly, the Multi-scale Directional Feature Perception Module (MDFPM) is designed, which uses a three-branch structure with convolutions of different sizes to transmit directional features layer by layer, and focuses on the target region to enhance the feature information. Thirdly, the Axial Compression Former Module (ACFM) is designed to perform global calculations to enhance the perception of global features in different directions. To verify the effectiveness of MSD-Net, comparative experiments and ablation experiments are carried out. On the COVID-19 RADIOGRAPHY DATABASE, the Accuracy, Recall, Precision, F1 Score, and Specificity of MSD-Net are 97.76%, 95.57%, 95.52%, 95.52%, and 98.51%, respectively. On the chest X-ray dataset, the Accuracy, Recall, Precision, F1 Score, and Specificity of MSD-Net are 97.78%, 95.22%, 96.49%, 95.58%, and 98.11%, respectively. This model effectively improves the accuracy of lung image recognition and provides an important clinical reference for the computer-aided diagnosis of pneumonia.</p>
</abstract>
<kwd-group kwd-group-type="author">
<kwd>Pneumonia</kwd>
<kwd>X-ray image</kwd>
<kwd>ResNet</kwd>
<kwd>multi-scale feature</kwd>
<kwd>direction feature</kwd>
<kwd>transformer</kwd>
</kwd-group>
<funding-group>
<award-group id="awg1">
<funding-source>National Natural Science Foundation of China</funding-source>
<award-id>62062003</award-id>
</award-group>
<award-group id="awg2">
<funding-source>Natural Science Foundation of Ningxia</funding-source>
<award-id>2023AAC03293</award-id>
</award-group>
</funding-group>
</article-meta>
</front>
<body>
<sec id="s1">
<label>1</label>
<title>Introduction</title>
<p>Pneumonia causes significant morbidity and mortality worldwide. It is the leading infectious cause of death across all age groups [<xref ref-type="bibr" rid="ref-1">1</xref>]. Pneumonia is an inflammation of the terminal airways, alveoli, and pulmonary interstitium, which is caused by bacteria, viruses, or other pathogens. Once the pathogen enters the lungs, the alveoli are filled with inflammatory cells and fluid. The infection prevents gas from being exchanged in the lungs, increasing the patient&#x2019;s risk of death [<xref ref-type="bibr" rid="ref-2">2</xref>]. Different pathogens cause different symptoms of pneumonia, and the treatment strategy differs accordingly. Therefore, early recognition is critical for determining the treatment strategy. Pneumonia diagnosis based on X-ray images is an important means of screening in radiology departments. Lung X-ray imaging is less costly and less time-consuming, and it shows the lung structure clearly, so it is widely used in clinical medicine [<xref ref-type="bibr" rid="ref-3">3</xref>]. However, different lung X-ray images have different lesion sizes and lesion directions. It is difficult to distinguish different lesions relying only on the doctor&#x2019;s experience. Clinical images of pneumonia lack specificity, which brings challenges to the early diagnosis of the disease.</p>
<p>The development of deep learning provides new ideas and methods for Computer-Aided Diagnosis in pneumonia images [<xref ref-type="bibr" rid="ref-4">4</xref>]. CNN (Convolutional Neural Network) uses convolutional layers to extract image features and greatly improves performance [<xref ref-type="bibr" rid="ref-5">5</xref>]. ResNet (Residual Network) [<xref ref-type="bibr" rid="ref-6">6</xref>] uses skip connections in the internal residual blocks to reuse features. It alleviates the vanishing gradient problem in deep neural networks and improves the expressive ability of the network. Khurana et al. [<xref ref-type="bibr" rid="ref-7">7</xref>] propose a machine learning-based time-series Facebook NeuralProphet model, which aims to determine categorical predictions for COVID-19 (Corona Virus Disease 2019). Akbulut [<xref ref-type="bibr" rid="ref-8">8</xref>] proposes a powerful algorithm based on a new customized deep learning model, in which attention and LSTM (Long Short-Term Memory) models are trained synchronously with CNN models to classify healthy, COVID-19, and pneumonia cases. Kaur et al. [<xref ref-type="bibr" rid="ref-9">9</xref>] propose a new image processing-based technique for health care systems named &#x201C;C19D-Net&#x201D;. The proposed system extracts deep learning features by applying the InceptionV4 architecture and uses a Multiclass SVM (Support Vector Machine) classifier to classify and detect COVID-19 infection into four different classes. Zhou et al. [<xref ref-type="bibr" rid="ref-10">10</xref>] propose a COVID-ResNet auxiliary diagnosis model based on CT images. This model focuses on the lesion region by introducing an attention mechanism into the residual block, which improves the classification performance of convolutional neural networks and the accuracy of COVID classification.</p>
<p>Although the residual network can improve the accuracy of lung X-ray image classification, the size of pneumonia lesions varies, and the features cannot be extracted by a single-size convolution operation. Zhou et al. [<xref ref-type="bibr" rid="ref-11">11</xref>] propose a cross-modal cross-scale global-local attention detection model, which obtains rich multi-scale features by grouping multi-scale attention for feature fusion and improves the model&#x2019;s ability to extract lesion features. Xiao et al. [<xref ref-type="bibr" rid="ref-12">12</xref>] propose a multi-scale spatial channel attention module and a multi-feature fusion global-local attention module, which effectively solve the problem of low classification accuracy. Huo et al. [<xref ref-type="bibr" rid="ref-13">13</xref>] propose a hierarchical multi-scale feature fusion network for medical image classification. The model can extract local and global features effectively on different semantic scales, and it can improve the classification accuracy of various medical images. Although the multi-scale module can improve the feature extraction ability, it lacks the ability to perceive directional features in lung X-ray images.</p>
<p>In summary, existing networks do not sufficiently extract lesion size and directional features. To solve the above problems, a pneumonia classification model based on multi-scale directional feature enhancement is proposed in this paper. The main contributions of this model are as follows:
<list list-type="order">
<list-item>
<p>To solve the different lesion size problems in lung X-ray images, a Multi-scale Residual Feature Extraction Module (MRFEM) is designed. The MRFEM uses dilated convolutions with different expansion rates to extract features. It improves the model&#x2019;s adaptability to different sizes in pneumonia lesions, and the model&#x2019;s ability to capture multi-scale features.</p></list-item>
<list-item>
<p>A Multi-scale Directional Feature Perception Module (MDFPM) is designed to further enhance the feature extraction ability in the backbone network. The MDFPM uses three different sizes of convolutions, and the multi-branch structure is used to gather highly correlated features to enhance the lesion features. It can further improve the feature extraction ability of the model.</p></list-item>
<list-item>
<p>To obtain the direction features in lung X-ray images and increase the extraction ability of global features, the Axial Compression Former Module (ACFM) is added at the end of the feature extraction network. The ACFM performs global calculations in different directions. It enhances the global features perception in different directions and improves the classification performance in pneumonia diseases.</p></list-item>
</list></p>
</sec>
<sec id="s2">
<label>2</label>
<title>Related Work</title>
<p>The residual network effectively alleviates the problems of vanishing gradients and network degradation caused by the increase in network depth. It can greatly improve the generalization ability and robustness of deep networks, and it has made a breakthrough in the field of pneumonia image classification.</p>
<sec id="s2_1">
<label>2.1</label>
<title>Pneumonia Classification Method Based on Residual Unit</title>
<p>In the residual unit, the problems of vanishing and exploding gradients during deep neural network training are solved by introducing skip connections. Gopatoti et al. [<xref ref-type="bibr" rid="ref-14">14</xref>] propose a multi-textural multi-class attention recurrent residual convolutional neural network, which classifies CXR (chest X-ray) images into normal, COVID-19, viral pneumonia, and lung opacity using extracted multi-textural features with improved accuracy. Zhang et al. [<xref ref-type="bibr" rid="ref-15">15</xref>] propose a dimension-driven multi-path attention residual network, in which a dimension-driven multi-path attention residual block is developed to effectively obtain multi-scale features and to treat features containing different amounts of information differently through the channel attention mechanism, so that the deep features of the data are better expressed. Sreedevi et al. [<xref ref-type="bibr" rid="ref-16">16</xref>] propose a dual attention method based on ResNet-50 with a bidirectional gated recurrent unit for image classification, in which the GRU (gated recurrent unit) is combined with ResNet-50 to enhance the expressiveness of the model.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Pneumonia Classification Method Based on Overall Structure</title>
<p>The optimization of ResNet based on the overall structure can effectively reduce overfitting and enhance the ability of the network to learn features. Hassan et al. [<xref ref-type="bibr" rid="ref-17">17</xref>] propose an architecture called Medical Quantum Convolutional Neural Network, based on the Quantum Convolutional Neural Networks model and a modified ResNet pre-trained model, for enhancing the biomedical image classification in the MNIST medical dataset. Ejiyi et al. [<xref ref-type="bibr" rid="ref-18">18</xref>] propose a classification network named ResfEANet, which is built upon ResNet and incorporates an External Attention mechanism to extract features effectively. Nawaz et al. [<xref ref-type="bibr" rid="ref-19">19</xref>] propose a swish-based improved ResNet model, which introduced multiple dense layers at the end of the proposed CNN structure to ensure more robust sample features for classification purposes.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Method</title>
<p>Compared with traditional neural networks, ResNet is a deep learning model that alleviates the vanishing gradient problem, which makes deep models easier to train. However, it is limited in capturing local and global features. In addition, there are different lesion sizes and lesion directions in lung X-ray images. Therefore, the MSD-Net model is proposed. Its structure is shown in <xref ref-type="fig" rid="fig-1">Fig. 1</xref>, and it mainly introduces three modules: the Multi-scale Residual Feature Extraction Module (MRFEM), the Multi-scale Directional Feature Perception Module (MDFPM), and the Axial Compression Former Module (ACFM). In this model, ResNet50 is used as the backbone network, which consists of five stages, Stage I&#x2013;Stage V. Firstly, the lung X-ray image is fed into the Conv7 &#x00D7; 7 and Maxpool layers to extract shallow features. Then, high-level semantic information is obtained stage by stage through the first four stages, Stage I&#x2013;Stage IV. Each stage is composed of several MRFEMs, and rich multi-scale features are obtained through dilated convolutions with different expansion rates. In Stage V, the ACFM is designed to extract global semantic features in different spatial directions, which improves the model&#x2019;s ability to perceive the global features of lung X-ray images. Secondly, an MDFPM is designed between each stage and the next stage. This module uses convolutions of different sizes and directional pooling to enhance the acquisition of multi-scale and directional features. Finally, classification is performed by a fully connected (FC) layer. The pseudo-code of the overall architecture is shown in <xref ref-type="table" rid="table-1">Table 1</xref>.</p>
<fig id="fig-1">
<label>Figure 1</label>
<caption>
<title>MSD-Net overall framework</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_50767-fig-1.tif"/>
</fig><table-wrap id="table-1">
<label>Table 1</label>
<caption>
<title>Pseudo-code for the overall architecture</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
</colgroup>
<tbody>
<tr>
<td>Input: X-Ray image of pneumonia</td>
</tr>
<tr>
<td>Output: Results of pneumonia image classification</td>
</tr>
<tr>
<td>1. input X</td>
</tr>
<tr>
<td>2. X<sub>0</sub> &#x003D; Maxpool (Conv7 &#x00D7; 7 (X))</td>
</tr>
<tr>
<td>3. for (i &#x003D; 1; i &#x003C; 6; i&#x002B;&#x002B;) {</td>
</tr>
<tr>
<td>4. if (i &#x003D; 1) {X<sub>i</sub> &#x003D; MRFEM (X<sub>0</sub>);}</td>
</tr>
<tr>
<td>5. else if (i &#x003D; 2) {X<sub>m</sub> &#x003D; MRFEM (X<sub>1</sub>); X<sub>n</sub> &#x003D; MDFPM (X<sub>1</sub>); X<sub>i</sub> &#x003D; Concat (X<sub>m</sub>, X<sub>n</sub>);}</td>
</tr>
<tr>
<td>6. else if (2 &#x003C; i &#x003C; 5) {X<sub>m</sub> &#x003D; MRFEM (X<sub>i</sub>); X<sub>n</sub> &#x003D; MDFPM (X<sub>n</sub>); X<sub>i</sub> &#x003D; Concat (X<sub>m</sub>, X<sub>n</sub>);}</td>
</tr>
<tr>
<td>7. else {X<sub>m</sub> &#x003D; ACFM (X<sub>i</sub>); X<sub>n</sub> &#x003D; MDFPM (X<sub>n</sub>); X<sub>i</sub> &#x003D; Add (X<sub>m</sub>, X<sub>n</sub>);} }</td>
</tr>
<tr>
<td>8. F &#x003D; FC (X<sub>i</sub>);</td>
</tr>
<tr>
<td>9. Output F;</td>
</tr>
</tbody>
</table>
</table-wrap>
<sec id="s3_1">
<label>3.1</label>
<title>Multi-Scale Residual Feature Extraction Module</title>
<p>ResNet is a typical deep learning network with great feature extraction ability. However, the basic residual block extracts features using a single convolution operation, which limits the receptive range over the input images. To solve this problem, a Multi-scale Residual Feature Extraction Module (MRFEM) is proposed in this paper. In this module, dilated convolutions with different expansion rates are added to enlarge the receptive field, which enables the model to handle multi-scale and multi-level features well. It improves the ability of feature extraction and expression in lung X-ray images. Its structure is shown in <xref ref-type="fig" rid="fig-2">Fig. 2</xref>.</p>
<fig id="fig-2">
<label>Figure 2</label>
<caption>
<title>Multi-scale residual feature extraction module</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_50767-fig-2.tif"/>
</fig>
<p>There are four branches in this module. The first branch preserves the initial features. In branches 2&#x2013;4, three dilated convolutions with different expansion rates are used to extract features. The sampling pattern of dilated convolution resembles a chessboard. Since the feature maps of each layer are obtained by convolving the feature maps of the previous layer, the convolution result lacks interdependence and continuity, which is known as the gridding effect [<xref ref-type="bibr" rid="ref-20">20</xref>]. Therefore, a residual connection is used to obtain pixel information over a wider range, which avoids the gridding effect. The features of each branch are added to the next branch, so that the features of all branches are fully fused. Finally, a 1 &#x00D7; 1 convolution operation is used to adjust the number of channels.</p>
<p>The specific process is as follows: The feature map <inline-formula id="ieqn-1"><mml:math id="mml-ieqn-1"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mi>C</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> is inputted into <inline-formula id="ieqn-2"><mml:math id="mml-ieqn-2"><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>, and the results are divided into 4 feature map subsets, represented by <inline-formula id="ieqn-3"><mml:math id="mml-ieqn-3"><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula id="ieqn-4"><mml:math id="mml-ieqn-4"><mml:mi>i</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mo fence="false" stretchy="false">{</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mn>3</mml:mn><mml:mo>,</mml:mo><mml:mn>4</mml:mn><mml:mo fence="false" stretchy="false">}</mml:mo></mml:math></inline-formula>. Each feature subset <inline-formula id="ieqn-5"><mml:math id="mml-ieqn-5"><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> has the same spatial size as the input features and 1/4 of the number of channels. Except for <inline-formula id="ieqn-6"><mml:math id="mml-ieqn-6"><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>, each <inline-formula id="ieqn-7"><mml:math id="mml-ieqn-7"><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> has a corresponding 3 &#x00D7; 3 dilated convolution, and the expansion rate is increased gradually with <italic>i</italic>. 
Three different receptive fields are obtained by the three dilated convolutions, which improves the perception ability of the module. Because the four branches differ in their ability to extract features, the features of each branch are sufficiently fused by passing the output of each branch into the next branch. The residual connection is added to avoid the gridding effect caused by dilated convolution. The specific formulas are shown in <xref ref-type="disp-formula" rid="eqn-1">(1)</xref>&#x2013;<xref ref-type="disp-formula" rid="eqn-4">(4)</xref>:
<disp-formula id="eqn-1"><label>(1)</label><mml:math id="mml-eqn-1" display="block"><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>S</mml:mi><mml:mi>p</mml:mi><mml:mi>l</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mspace width="1em"/><mml:mi>i</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mo fence="false" stretchy="false">{</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mn>3</mml:mn><mml:mo>,</mml:mo><mml:mn>4</mml:mn><mml:mo fence="false" stretchy="false">}</mml:mo></mml:math></disp-formula>
<disp-formula id="eqn-2"><label>(2)</label><mml:math id="mml-eqn-2" display="block"><mml:msubsup><mml:mi>x</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo>+</mml:mo><mml:mi>D</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn><mml:mo>,</mml:mo><mml:mi>d</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula>
<disp-formula id="eqn-3"><label>(3)</label><mml:math id="mml-eqn-3" display="block"><mml:msubsup><mml:mi>x</mml:mi><mml:mrow><mml:mn>3</mml:mn></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo>+</mml:mo><mml:mi>D</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn><mml:mo>,</mml:mo><mml:mi>d</mml:mi><mml:mo>=</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>D</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn><mml:mo>,</mml:mo><mml:mi>d</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula>
<disp-formula id="eqn-4"><label>(4)</label><mml:math id="mml-eqn-4" display="block"><mml:msubsup><mml:mi>x</mml:mi><mml:mrow><mml:mn>4</mml:mn></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo>+</mml:mo><mml:mi>D</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn><mml:mo>,</mml:mo><mml:mi>d</mml:mi><mml:mo>=</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>D</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn><mml:mo>,</mml:mo><mml:mi>d</mml:mi><mml:mo>=</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>D</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn><mml:mo>,</mml:mo><mml:mi>d</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>4</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula>where, <inline-formula id="ieqn-8"><mml:math 
id="mml-ieqn-8"><mml:mi>D</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn><mml:mo>,</mml:mo><mml:mi>d</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula id="ieqn-9"><mml:math id="mml-ieqn-9"><mml:mi>D</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn><mml:mo>,</mml:mo><mml:mi>d</mml:mi><mml:mo>=</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula id="ieqn-10"><mml:math id="mml-ieqn-10"><mml:mi>D</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn><mml:mo>,</mml:mo><mml:mi>d</mml:mi><mml:mo>=</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> represent dilated convolution with expansion rates of 1, 2, 3. 
Finally, the feature maps <inline-formula id="ieqn-11"><mml:math id="mml-ieqn-11"><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula id="ieqn-12"><mml:math id="mml-ieqn-12"><mml:msubsup><mml:mi>x</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>, <inline-formula id="ieqn-13"><mml:math id="mml-ieqn-13"><mml:msubsup><mml:mi>x</mml:mi><mml:mrow><mml:mn>3</mml:mn></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>, <inline-formula id="ieqn-14"><mml:math id="mml-ieqn-14"><mml:msubsup><mml:mi>x</mml:mi><mml:mrow><mml:mn>4</mml:mn></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> from all branches are merged into the channel dimension. Then a 1 &#x00D7; 1 convolution operation is carried out on the fused features, and the final output result is obtained by adding residual connection. The specific operation is shown in <xref ref-type="disp-formula" rid="eqn-5">formula (5)</xref>.
<disp-formula id="eqn-5"><label>(5)</label><mml:math id="mml-eqn-5" display="block"><mml:mi>o</mml:mi><mml:mi>u</mml:mi><mml:mi>t</mml:mi><mml:mi>p</mml:mi><mml:mi>u</mml:mi><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msubsup><mml:mi>x</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mi>x</mml:mi><mml:mrow><mml:mn>3</mml:mn></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mi>x</mml:mi><mml:mrow><mml:mn>4</mml:mn></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msub></mml:math></disp-formula></p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Multi-Scale Directional Feature Perception Module</title>
<p>In X-ray images of pneumonia, lesions have complex shapes and varying sizes, so it is difficult to extract the size, shape and direction information of different lesions. Moreover, single-scale convolution is insufficient to extract the size and direction features of lesion regions, and the features of small lesion regions are easily ignored. In this paper, a Multi-scale Directional Feature Perception Module (MDFPM) is designed to enhance features at three different scales. Two different ways are used to enlarge the perceptive field: Firstly, convolutions with kernel sizes 3, 5 and 7 are used to obtain features at different scales, and the output features are pooled in the H and W directions to obtain the lesion direction features. Then, dilated convolutions with dilation rates 1, 2 and 3 are used to enlarge the perceptive field again and capture multi-scale context features. Finally, the features at different scales are fused in order. In this module, directional features are obtained by a three-branch structure, which improves the ability of the network to locate targets of interest. Its structure is shown in <xref ref-type="fig" rid="fig-3">Fig. 3</xref>.</p>
<fig id="fig-3">
<label>Figure 3</label>
<caption>
<title>Multi-scale directional feature perception module</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_50767-fig-3.tif"/>
</fig>
<p>There are three branches in MDFPM. Convolutions of size 3 &#x00D7; 3, 5 &#x00D7; 5, and 7 &#x00D7; 7 are processed in parallel to obtain the multi-scale features. The features are aggregated by two spatial directions to capture cross-channel and direction-perception features. It helps the model to obtain spatial features and restrain irrelevant features. <inline-formula id="ieqn-15"><mml:math id="mml-ieqn-15"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>2</mml:mn><mml:mi>T</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is supplemented by the feature map <inline-formula id="ieqn-16"><mml:math id="mml-ieqn-16"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mi>T</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> using the 7 &#x00D7; 7 convolution, and then <inline-formula id="ieqn-17"><mml:math id="mml-ieqn-17"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mi>T</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is supplemented by the feature map <inline-formula id="ieqn-18"><mml:math id="mml-ieqn-18"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mi>T</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> using the 7 &#x00D7; 7 convolution and the feature map <inline-formula id="ieqn-19"><mml:math id="mml-ieqn-19"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>2</mml:mn><mml:mi>T</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> using the 5 &#x00D7; 5 convolution. Thus, the feature supplement of the three branches is realized. Dilated convolution is added to improve the extraction ability of local features further. The specific process is as follows:</p>
<p>Firstly, the feature map <inline-formula id="ieqn-20"><mml:math id="mml-ieqn-20"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is processed by convolutions with three different kernel sizes to obtain three feature maps. The specific formulas are shown in <xref ref-type="disp-formula" rid="eqn-6">(6)</xref>&#x2013;<xref ref-type="disp-formula" rid="eqn-8">(8)</xref>:
<disp-formula id="eqn-6"><label>(6)</label><mml:math id="mml-eqn-6" display="block"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula>
<disp-formula id="eqn-7"><label>(7)</label><mml:math id="mml-eqn-7" display="block"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>5</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>5</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula>
<disp-formula id="eqn-8"><label>(8)</label><mml:math id="mml-eqn-8" display="block"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>7</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>7</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula></p>
<p>Secondly, taking the first branch as an example, the feature map <inline-formula id="ieqn-21"><mml:math id="mml-ieqn-21"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mi>C</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> is calculated by average pooling in the H and W direction, respectively, which can get the attention maps <inline-formula id="ieqn-22"><mml:math id="mml-ieqn-22"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mn>1</mml:mn><mml:mrow><mml:mo>(</mml:mo><mml:mi>H</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mi>C</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> and <inline-formula id="ieqn-23"><mml:math id="mml-ieqn-23"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mn>1</mml:mn><mml:mrow><mml:mo>(</mml:mo><mml:mi>W</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mi>C</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>. Specifically, the input feature <inline-formula id="ieqn-24"><mml:math id="mml-ieqn-24"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> is encoded by a pooling kernel (H-avgpool, H-Ap) with size 1 &#x00D7; W in the horizontal direction. The output result formula is shown in <xref ref-type="disp-formula" rid="eqn-9">(9)</xref>:
<disp-formula id="eqn-9"><label>(9)</label><mml:math id="mml-eqn-9" display="block"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mn>1</mml:mn><mml:mrow><mml:mo>(</mml:mo><mml:mi>H</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>A</mml:mi><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>H</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula></p>
<p><inline-formula id="ieqn-25"><mml:math id="mml-ieqn-25"><mml:mi>A</mml:mi><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>H</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula> is a pooling kernel of size 1 &#x00D7; W. Similarly, the input feature is encoded by a pooling kernel (W-avgpool, W-Ap) with size H &#x00D7; 1 in the vertical direction. The output result formula is shown in <xref ref-type="disp-formula" rid="eqn-10">(10)</xref>:
<disp-formula id="eqn-10"><label>(10)</label><mml:math id="mml-eqn-10" display="block"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mn>1</mml:mn><mml:mrow><mml:mo>(</mml:mo><mml:mi>W</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>A</mml:mi><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>W</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula></p>
<p><inline-formula id="ieqn-26"><mml:math id="mml-ieqn-26"><mml:mi>A</mml:mi><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>W</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula> is a pooling kernel of size H &#x00D7; 1. The feature maps in the H and W directions are concatenated, and then the <inline-formula id="ieqn-27"><mml:math id="mml-ieqn-27"><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> operation, <inline-formula id="ieqn-28"><mml:math id="mml-ieqn-28"><mml:mi>B</mml:mi><mml:mi>N</mml:mi></mml:math></inline-formula> operation and <inline-formula id="ieqn-29"><mml:math id="mml-ieqn-29"><mml:mi>S</mml:mi><mml:mi>i</mml:mi><mml:mi>g</mml:mi><mml:mi>m</mml:mi><mml:mi>o</mml:mi><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:math></inline-formula> operation are applied to obtain the fused feature map <inline-formula id="ieqn-30"><mml:math id="mml-ieqn-30"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mi>m</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mi>C</mml:mi><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:mi>r</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, where r is the scale factor. The formula is shown in <xref ref-type="disp-formula" rid="eqn-11">(11)</xref>:
<disp-formula id="eqn-11"><label>(11)</label><mml:math id="mml-eqn-11" display="block"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mi>m</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>S</mml:mi><mml:mi>i</mml:mi><mml:mi>g</mml:mi><mml:mi>m</mml:mi><mml:mi>o</mml:mi><mml:mi>i</mml:mi><mml:mi>d</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>B</mml:mi><mml:mi>N</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mn>1</mml:mn><mml:mrow><mml:mo>(</mml:mo><mml:mi>H</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mn>1</mml:mn><mml:mrow><mml:mo>(</mml:mo><mml:mi>W</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula></p>
<p>Thirdly, the feature map <inline-formula id="ieqn-31"><mml:math id="mml-ieqn-31"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mi>m</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is split into two independent feature maps <inline-formula id="ieqn-32"><mml:math id="mml-ieqn-32"><mml:msubsup><mml:mi>X</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>H</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mi>C</mml:mi><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:mi>r</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> and <inline-formula id="ieqn-33"><mml:math id="mml-ieqn-33"><mml:msubsup><mml:mi>X</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>W</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mi>C</mml:mi><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:mi>r</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> along the spatial dimension. The attention maps in the H and W directions are obtained by a 1 &#x00D7; 1 convolution followed by Sigmoid, and are then fused with the original features to output <inline-formula id="ieqn-34"><mml:math id="mml-ieqn-34"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mi>T</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. The specific formulas are shown in <xref ref-type="disp-formula" rid="eqn-12">(12)</xref> and <xref ref-type="disp-formula" rid="eqn-13">(13)</xref>:
<disp-formula id="eqn-12"><label>(12)</label><mml:math id="mml-eqn-12" display="block"><mml:msubsup><mml:mi>X</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>H</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mi>X</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>W</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mi>S</mml:mi><mml:mi>p</mml:mi><mml:mi>l</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mi>m</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula>
<disp-formula id="eqn-13"><label>(13)</label><mml:math id="mml-eqn-13" display="block"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x00D7;</mml:mo><mml:mrow><mml:mtext>Sigmoid</mml:mtext></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>X</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>H</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x00D7;</mml:mo><mml:mrow><mml:mtext>Sigmoid</mml:mtext></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>X</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>W</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula></p>
<p>Similarly, the feature maps <inline-formula id="ieqn-35"><mml:math id="mml-ieqn-35"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>2</mml:mn><mml:mi>T</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula id="ieqn-36"><mml:math id="mml-ieqn-36"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mi>T</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> are obtained.</p>
<p>Finally, in order to improve the degree of feature complementarity among different branches, the features are transferred from the feature extraction branch with the larger perceptive field to the feature extraction branch with the smaller perceptive field. Then the enhanced features are obtained by using dilated convolutions with different dilation rates. <inline-formula id="ieqn-37"><mml:math id="mml-ieqn-37"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mi>T</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is used to obtain the enhanced features by a 3 &#x00D7; 3 convolution with dilation rate 3. <inline-formula id="ieqn-38"><mml:math id="mml-ieqn-38"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>2</mml:mn><mml:mi>T</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is used to obtain the enhanced features by a 3 &#x00D7; 3 convolution with dilation rate 2. <inline-formula id="ieqn-39"><mml:math id="mml-ieqn-39"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mi>T</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is used to obtain the enhanced features by a 3 &#x00D7; 3 convolution with dilation rate 1. The final features of each branch are obtained by 1 &#x00D7; 1 convolution. These features are concatenated, and the feature dimension is adjusted by a 1 &#x00D7; 1 convolution to output the final result <inline-formula id="ieqn-40"><mml:math id="mml-ieqn-40"><mml:msubsup><mml:mi>X</mml:mi><mml:mi>T</mml:mi><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:msubsup></mml:math></inline-formula>. The specific formulas are shown in <xref ref-type="disp-formula" rid="eqn-14">(14)</xref>&#x2013;<xref ref-type="disp-formula" rid="eqn-17">(17)</xref>:
<disp-formula id="eqn-14"><label>(14)</label><mml:math id="mml-eqn-14" display="block"><mml:msubsup><mml:mi>X</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>D</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn><mml:mo>,</mml:mo><mml:mi>d</mml:mi><mml:mo>=</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula>
<disp-formula id="eqn-15"><label>(15)</label><mml:math id="mml-eqn-15" display="block"><mml:msubsup><mml:mi>X</mml:mi><mml:mrow><mml:mn>2</mml:mn><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>D</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn><mml:mo>,</mml:mo><mml:mi>d</mml:mi><mml:mo>=</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>2</mml:mn><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>2</mml:mn><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula>
<disp-formula id="eqn-16"><label>(16)</label><mml:math id="mml-eqn-16" display="block"><mml:msubsup><mml:mi>X</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>D</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn><mml:mo>,</mml:mo><mml:mi>d</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>2</mml:mn><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>2</mml:mn><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula>
<disp-formula id="eqn-17"><label>(17)</label><mml:math id="mml-eqn-17" display="block"><mml:msubsup><mml:mi>X</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msubsup><mml:mi>X</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mi>X</mml:mi><mml:mrow><mml:mn>2</mml:mn><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mi>X</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>In order to resize the feature map while retaining more features, a hybrid downsampling module is used in this paper. Its structure is shown in <xref ref-type="fig" rid="fig-4">Fig. 4</xref>. The steps of the module are as follows: Firstly, the feature map <inline-formula id="ieqn-41"><mml:math id="mml-ieqn-41"><mml:msubsup><mml:mi>X</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> is processed by 3 &#x00D7; 3 convolution, average pooling (Avgp) and max pooling (Maxp) operations to retain more useful features. Secondly, the feature maps obtained from the three branches are concatenated. Finally, the number of channels is adjusted by GroupNorm and 1 &#x00D7; 1 convolution, and the final feature map is output. The specific formula of the module is shown in <xref ref-type="disp-formula" rid="eqn-18">(18)</xref>:</p>
<p><disp-formula id="eqn-18"><label>(18)</label><mml:math id="mml-eqn-18" display="block"><mml:msubsup><mml:mi>X</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi><mml:mi mathvariant="normal">&#x2032;</mml:mi><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mi>G</mml:mi><mml:mi>r</mml:mi><mml:mi>o</mml:mi><mml:mi>u</mml:mi><mml:mi>p</mml:mi><mml:mi>N</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>m</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:msubsup><mml:mi>X</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mi>M</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msubsup><mml:mi>X</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mi>A</mml:mi><mml:mi>v</mml:mi><mml:mi>g</mml:mi><mml:mi>p</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>X</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup><mml:mo stretchy="false">)</mml:mo><mml:mo 
stretchy="false">)</mml:mo><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
<fig id="fig-4">
<label>Figure 4</label>
<caption>
<title>Hybrid downsampling module</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_50767-fig-4.tif"/>
</fig>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Axial Compression Former Module</title>
<p>CNNs are well suited to local feature extraction. However, in lung X-ray images, lesion regions are widely distributed and vary in shape and size, while the perceptive field of the convolution operation is limited. In contrast, the Transformer model [<xref ref-type="bibr" rid="ref-21">21</xref>] can extract global features by capturing long-distance dependencies, but the ability to perceive the lesion direction is ignored in the Transformer. Therefore, an Axial Compression Former Module is designed in this paper. Its structure is shown in <xref ref-type="fig" rid="fig-5">Fig. 5</xref>.</p>
<fig id="fig-5">
<label>Figure 5</label>
<caption>
<title>Axial compression former module</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_50767-fig-5.tif"/>
</fig>
<p>The input feature map is processed by 1 &#x00D7; 1 convolution to obtain Q, K and V, which are then processed by three branches. In the first branch, Q, K, V are compressed along the horizontal direction to obtain horizontal direction features <inline-formula id="ieqn-42"><mml:math id="mml-ieqn-42"><mml:msub><mml:mi>Q</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>h</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula id="ieqn-43"><mml:math id="mml-ieqn-43"><mml:msub><mml:mi>K</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>h</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula id="ieqn-44"><mml:math id="mml-ieqn-44"><mml:msub><mml:mi>V</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>h</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula>. In the second branch, Q, K, V are compressed along the vertical direction to obtain vertical direction features <inline-formula id="ieqn-45"><mml:math id="mml-ieqn-45"><mml:msub><mml:mi>Q</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>v</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula id="ieqn-46"><mml:math id="mml-ieqn-46"><mml:msub><mml:mi>K</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>v</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula id="ieqn-47"><mml:math id="mml-ieqn-47"><mml:msub><mml:mi>V</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>v</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula>. Since different dimensions have different relationships and features, self-attention is calculated in these two branches, and long-distance context information is captured in the horizontal and vertical directions.
In this way, the ability of the Transformer to perceive direction features is improved, which enables the model to capture multiple relationships and features. In the third branch, Q, K and V are concatenated along the channel dimension and processed by deep convolution to supplement the detail features. Then, the global features with direction and position perception and the enhanced local features are fused, and a feature map with rich global semantic and local detail features is obtained. The specific process is as follows.</p>
<p>Firstly, the input feature <inline-formula id="ieqn-48"><mml:math id="mml-ieqn-48"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mi>C</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> is mapped to obtain Query <inline-formula id="ieqn-49"><mml:math id="mml-ieqn-49"><mml:mi>Q</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>q</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>&#x00D7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, Key <inline-formula id="ieqn-50"><mml:math id="mml-ieqn-50"><mml:mi>K</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>&#x00D7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, and Value <inline-formula id="ieqn-51"><mml:math id="mml-ieqn-51"><mml:mi>V</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>v</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>&#x00D7;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> by 1 &#x00D7; 1 convolution operations. The calculation formulas are shown in <xref ref-type="disp-formula" rid="eqn-19">(19)</xref>&#x2013;<xref ref-type="disp-formula" rid="eqn-21">(21)</xref>:
<disp-formula id="eqn-19"><label>(19)</label><mml:math id="mml-eqn-19" display="block"><mml:mi>Q</mml:mi><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula>
<disp-formula id="eqn-20"><label>(20)</label><mml:math id="mml-eqn-20" display="block"><mml:mi>K</mml:mi><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula>
<disp-formula id="eqn-21"><label>(21)</label><mml:math id="mml-eqn-21" display="block"><mml:mi>V</mml:mi><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula></p>
<p>Secondly, in order to obtain <inline-formula id="ieqn-52"><mml:math id="mml-ieqn-52"><mml:msub><mml:mi>Q</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>h</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>q</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>&#x00D7;</mml:mo><mml:mi>H</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, <inline-formula id="ieqn-53"><mml:math id="mml-ieqn-53"><mml:msub><mml:mi>K</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>h</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>&#x00D7;</mml:mo><mml:mi>H</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, <inline-formula id="ieqn-54"><mml:math id="mml-ieqn-54"><mml:msub><mml:mi>V</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>h</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>v</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>&#x00D7;</mml:mo><mml:mi>H</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, Q, K, V are compressed into two-dimensional features along the horizontal direction. Q is reshaped and multiplied with the key K, and an attention map with size H &#x00D7; H in the horizontal direction is obtained by Softmax. It is then multiplied with the value V to obtain the horizontal attention map. The attention formula is shown in <xref ref-type="disp-formula" rid="eqn-22">(22)</xref>:
<disp-formula id="eqn-22"><label>(22)</label><mml:math id="mml-eqn-22" display="block"><mml:mi>A</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:msub><mml:mi>n</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>h</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>S</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mfrac><mml:mrow><mml:msubsup><mml:mi>Q</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>h</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup><mml:msub><mml:mi>K</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>h</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub></mml:mrow><mml:msqrt><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mi>K</mml:mi></mml:mrow></mml:msub></mml:msqrt></mml:mfrac><mml:mo>)</mml:mo></mml:mrow><mml:msubsup><mml:mi>V</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>h</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup></mml:math></disp-formula></p>
<p>Thirdly, in order to obtain <inline-formula id="ieqn-55"><mml:math id="mml-ieqn-55"><mml:msub><mml:mi>Q</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>v</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>q</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>&#x00D7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, <inline-formula id="ieqn-56"><mml:math id="mml-ieqn-56"><mml:msub><mml:mi>K</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>v</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>&#x00D7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, <inline-formula id="ieqn-57"><mml:math id="mml-ieqn-57"><mml:msub><mml:mi>V</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>v</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>v</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>&#x00D7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, Q, K, V are compressed into two-dimensional features along the vertical direction. Q is reshaped and multiplied with the key K, and an attention map with size W &#x00D7; W in the vertical direction is obtained by Softmax. It is then multiplied with the value V to obtain the vertical attention map. The attention formula is shown in <xref ref-type="disp-formula" rid="eqn-23">(23)</xref>. 
Then the attention map <inline-formula id="ieqn-58"><mml:math id="mml-ieqn-58"><mml:mi>A</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:msub><mml:mi>n</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>h</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula> obtained in the horizontal direction and the attention map <inline-formula id="ieqn-59"><mml:math id="mml-ieqn-59"><mml:mi>A</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:msub><mml:mi>n</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>v</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula> obtained in the vertical direction are fused together as the output result <inline-formula id="ieqn-60"><mml:math id="mml-ieqn-60"><mml:mi>y</mml:mi></mml:math></inline-formula>. The attention formula is shown in <xref ref-type="disp-formula" rid="eqn-24">(24)</xref>:
<disp-formula id="eqn-23"><label>(23)</label><mml:math id="mml-eqn-23" display="block"><mml:mi>A</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:msub><mml:mi>n</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>v</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>S</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mfrac><mml:mrow><mml:msubsup><mml:mi>Q</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>v</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup><mml:msub><mml:mi>K</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>v</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub></mml:mrow><mml:msqrt><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mi>K</mml:mi></mml:mrow></mml:msub></mml:msqrt></mml:mfrac><mml:mo>)</mml:mo></mml:mrow><mml:msubsup><mml:mi>V</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>v</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup></mml:math></disp-formula>
<disp-formula id="eqn-24"><label>(24)</label><mml:math id="mml-eqn-24" display="block"><mml:mi>y</mml:mi><mml:mo>=</mml:mo><mml:mi>A</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:msub><mml:mi>n</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>h</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>A</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:msub><mml:mi>n</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>v</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub></mml:math></disp-formula></p>
<p>Fourthly, although the global semantic features are improved by the compress operation effectively, local detail features are lost in some degree. Therefore, the convolution operation is used to enhance spatial features in this paper. Q, K, V are concatenated in the channels, and the 3 &#x00D7; 3 depth wise convolution (<inline-formula id="ieqn-61"><mml:math id="mml-ieqn-61"><mml:mi>D</mml:mi><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi><mml:mn>3</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>) and BN operations are performed. Then the features of Cq &#x002B; Ck &#x002B; Cv dimension are compressed into C channels after ReLU6, BN operation and 1 &#x00D7; 1 convolution operation. The detail enhancement features are got. The formula is shown in <xref ref-type="disp-formula" rid="eqn-25">(25)</xref> and <xref ref-type="disp-formula" rid="eqn-26">(26)</xref>:
<disp-formula id="eqn-25"><label>(25)</label><mml:math id="mml-eqn-25" display="block"><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>Q</mml:mi><mml:mo>,</mml:mo><mml:mi>K</mml:mi><mml:mo>,</mml:mo><mml:mi>V</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula>
<disp-formula id="eqn-26"><label>(26)</label><mml:math id="mml-eqn-26" display="block"><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>B</mml:mi><mml:mi>N</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>L</mml:mi><mml:mi>U</mml:mi><mml:mn>6</mml:mn><mml:mo stretchy="false">(</mml:mo><mml:mi>B</mml:mi><mml:mi>N</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>D</mml:mi><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi><mml:mn>3</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula></p>
<p>The global features obtained by 1 &#x00D7; 1 convolution are fused with the local features. The output <inline-formula id="ieqn-62"><mml:math id="mml-ieqn-62"><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is obtained after the residual connection operation. The calculation formula is shown in <xref ref-type="disp-formula" rid="eqn-27">(27)</xref>:
<disp-formula id="eqn-27"><label>(27)</label><mml:math id="mml-eqn-27" display="block"><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mi>y</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x00D7;</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula></p>
<p>Finally, the feature maps are passed through the LayerNorm and the MLP layer. Then the residual connection operation is used to better capture the complex relationship among features. The final output of this stage is <inline-formula id="ieqn-63"><mml:math id="mml-ieqn-63"><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>O</mml:mi><mml:mi>u</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. The specific formula is shown in <xref ref-type="disp-formula" rid="eqn-28">(28)</xref>:
<disp-formula id="eqn-28"><label>(28)</label><mml:math id="mml-eqn-28" display="block"><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>O</mml:mi><mml:mi>u</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>M</mml:mi><mml:mi>L</mml:mi><mml:mi>P</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>L</mml:mi><mml:mi>a</mml:mi><mml:mi>y</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi><mml:mi>N</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>m</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub></mml:math></disp-formula></p>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experiments and Analysis</title>
<sec id="s4_1">
<label>4.1</label>
<title>Datasets and Data Pre-Processing</title>
<p>The dataset used in this paper is a publicly available dataset from the COVID-19 RADIOGRAPHY DATABASE, created by a team of researchers from Qatar University and Dhaka University in Bangladesh, along with collaborators from Pakistan and Malaysia, in collaboration with physicians [<xref ref-type="bibr" rid="ref-22">22</xref>,<xref ref-type="bibr" rid="ref-23">23</xref>]. A total of 5374 medical images are selected, including 1332 images of COVID-19, 1335 images of lung opacity, 1362 images of normal lung and 1345 images of viral pneumonia. The images are shown in <xref ref-type="fig" rid="fig-6">Fig. 6</xref>. The images are divided into the training set and the validation set at a ratio of 9:1. In order to match the model input size, each image is randomly cropped to 224 &#x00D7; 224, then it is converted to vector format and the pixel values are normalized.</p>
<fig id="fig-6">
<label>Figure 6</label>
<caption>
<title>Lung X-ray image samples</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_50767-fig-6.tif"/>
</fig>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Experimental Environment</title>
<p>The experimental environment for this experiment is a 64-bit Windows Server 2019 Datacenter system equipped with an Intel(R) Xeon(R) Gold 5218 CPU @ 2.3 GHz. The computer has 64 GB of RAM and uses NVIDIA TITAN RTX graphics cards to speed up image processing. The program is written in Python, based on the GPU version of the PyTorch framework for network construction and training. Optimization is performed using the Adam optimizer with a learning rate decay value of 0.001. The number of training epochs for the lung X-ray dataset is set to 150 and the training batch size is set to 8. The loss function is the cross-entropy loss. Cross entropy is a measure of how similar two distributions are. In machine learning, it is expressed as the difference between the true probability distribution and the predicted probability distribution. The lower the value of cross-entropy is, the better the model prediction effect is. The cross-entropy loss function for multiple classes is:
<disp-formula id="eqn-29"><label>(29)</label><mml:math id="mml-eqn-29" display="block"><mml:mi>L</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>N</mml:mi></mml:mfrac><mml:msub><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>N</mml:mi></mml:mfrac><mml:msub><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:msubsup><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>c</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>M</mml:mi></mml:mrow></mml:msubsup><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula>where <italic>N</italic> is the number of samples; <italic>M</italic> is the number of categories; <italic>i</italic> denotes the <italic>i<sub>th</sub></italic> sample; <italic>y<sub>ic</sub></italic> is the label indicating whether the <italic>i<sub>th</sub></italic> sample belongs to class <italic>c</italic> (0 or 1); <italic>p<sub>ic</sub></italic> is the predicted probability that the <italic>i<sub>th</sub></italic> sample belongs to class <italic>c</italic>. Finally, the model architecture is tested and evaluated.</p>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Evaluation Metrics</title>
<p>An evaluation index is a quantitative index for model performance. In order to reasonably and comprehensively evaluate the classification performance of the overall structure, and to facilitate the comparison with other networks, the confusion matrix of each model for pneumonia classification is visualized by using four index values of true positive (TP), false positive (FP), true negative (TN) and false negative (FN). Accuracy (Acc), Recall-Macro (Rec), Precision-Macro (Pre), F1 Score-Macro (F1), and Specificity (Spe) are used as the evaluation criteria to explore the effect of the improved network model on the classification of different pneumonia. The calculation formula of each evaluation index is as follows:
<disp-formula id="eqn-30"><label>(30)</label><mml:math id="mml-eqn-30" display="block"><mml:mi>A</mml:mi><mml:mi>c</mml:mi><mml:mi>c</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>T</mml:mi><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>N</mml:mi><mml:mo>+</mml:mo><mml:mi>T</mml:mi><mml:mi>N</mml:mi></mml:mrow></mml:mfrac></mml:math></disp-formula>
<disp-formula id="eqn-31"><label>(31)</label><mml:math id="mml-eqn-31" display="block"><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>C</mml:mi></mml:mfrac><mml:msubsup><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msubsup><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:math></disp-formula>
<disp-formula id="eqn-32"><label>(32)</label><mml:math id="mml-eqn-32" display="block"><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>C</mml:mi></mml:mfrac><mml:msubsup><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msubsup><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:math></disp-formula>
<disp-formula id="eqn-33"><label>(33)</label><mml:math id="mml-eqn-33" display="block"><mml:mi>F</mml:mi><mml:mn>1</mml:mn><mml:mspace width="thinmathspace" /><mml:mi>S</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>2</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mo>+</mml:mo><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:mfrac></mml:math></disp-formula>
<disp-formula id="eqn-34"><label>(34)</label><mml:math id="mml-eqn-34" display="block"><mml:mi>S</mml:mi><mml:mi>p</mml:mi><mml:mi>e</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>C</mml:mi></mml:mfrac><mml:msubsup><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msubsup><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:math></disp-formula></p>
</sec>
<sec id="s4_4">
<label>4.4</label>
<title>Ablation Experiment</title>
<p>To verify the validity of the methods used in this paper, ablation experiments are performed on the same dataset. As shown in <xref ref-type="table" rid="table-2">Table 2</xref>, a total of 8 groups of ablation experiments are designed, as follows.</p>
<table-wrap id="table-2">
<label>Table 2</label>
<caption>
<title>Design of the ablation experiment</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th></th>
<th>ResBlock</th>
<th>MRFEM</th>
<th>ACFM</th>
<th>MDFPM</th>
</tr>
</thead>
<tbody>
<tr>
<td>Experiment_1</td>
<td>&#x221A;</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>Experiment_2</td>
<td></td>
<td>&#x221A;</td>
<td></td>
<td></td>
</tr>
<tr>
<td>Experiment_3</td>
<td>&#x221A;</td>
<td></td>
<td>&#x221A;</td>
<td></td>
</tr>
<tr>
<td>Experiment_4</td>
<td>&#x221A;</td>
<td></td>
<td></td>
<td>&#x221A;</td>
</tr>
<tr>
<td>Experiment_5</td>
<td>&#x221A;</td>
<td></td>
<td>&#x221A;</td>
<td>&#x221A;</td>
</tr>
<tr>
<td>Experiment_6</td>
<td></td>
<td>&#x221A;</td>
<td>&#x221A;</td>
<td></td>
</tr>
<tr>
<td>Experiment_7</td>
<td></td>
<td>&#x221A;</td>
<td></td>
<td>&#x221A;</td>
</tr>
<tr>
<td>Experiment_8</td>
<td></td>
<td>&#x221A;</td>
<td>&#x221A;</td>
<td>&#x221A;</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Experiment_1: Basic Resnet50 network.</p>
<p>Experiment_2: The Multi-scale Residual Feature Extraction Module is used to replace the original residual block for feature extraction, and then it is passed into FC layer for classification.</p>
<p>Experiment_3: Original residual blocks are used to extract features. And the Axial Compression Former Module is added to enhance the global semantic information of the backbone network.</p>
<p>Experiment_4: Original residual blocks are used to extract features. A multi-scale direction feature perception module is added after residual blocks of each stage. It expands the receptive field of the entire network to capture diverse features.</p>
<p>Experiment_5: Original residual blocks are used to extract features. A multi-scale direction feature perception module is added after residual blocks of each stage. At the same time, the Axial Compression Former Module is added to enhance the global semantic information of backbone network.</p>
<p>Experiment_6: The Multi-scale Residual Feature Extraction Module is used for feature extraction, and the Axial Compression Former Module is added to enhance the global semantic information.</p>
<p>Experiment_7: The Multi-scale Residual Feature Extraction Module is used for feature extraction. And the multi-scale directional feature perception is added to enrich the features.</p>
<p>Experiment_8 (the MSD-net in this paper): The Multi-scale Residual Feature Extraction Module is used for feature extraction. A multi-scale direction feature perception module is added to enhance the direction feature. The Axial Compression Former Module is added to obtain the global information.</p>
<p>The comparative results with different networks of ablation experiments are shown in <xref ref-type="table" rid="table-3">Table 3</xref>. Experiment_2, Experiment_3, Experiment_4 are improved in various indicators compared to Experiment_1. Experiment_2 is improved in various indicators compared to Experiment_1, accuracy, precision, recall rate, F1 value and Spe value increase by 1.02%, 2.17%, 2.06%, 2.08% and 0.69%. It shows that the Multi-scale Residual Feature Extraction Module can improve the ability of the model to extract multi-scale features. Experiment_3 is improved in various indicators compared to Experiment_1, accuracy, precision, recall rate, F1 value and Spe value increase by 1.12%, 2.56%, 2.23%, 2.24% and 0.75%. It is proved that the Axial Compression Former Module enhances the extraction of global and direction features. It can complement the insufficiency of convolution in the extraction of features. Experiment_4 is improved in various indicators compared to Experiment_1, accuracy, precision, recall rate, F1 value and Spe value increase by 1.58%, 3.39%, 3.18%, 3.23%, 1.06%. It is evident that the Multi-scale Direction Feature Perception Module can improve the ability of the network to extract direction features. Experiment_5 is improved in various indicators compared to Experiment_1, accuracy, precision, recall rate, F1 value and Spe value increase by 1.68%, 3.31%, 3.37%, 3.39%, 1.12%. Experiment_6 is improved in various indicators compared to Experiment_1, accuracy, precision, recall rate, F1 value and Spe value increase by 1.77%, 3.47%, 3.56%, 3.58%, 1.19%. Experiment_7 is improved in various indicators compared to Experiment_1, accuracy, precision, recall rate, F1 value and Spe value increase by 1.86%, 3.78%, 3.75%, 3.74%, 1.25%. Experiment_8 is improved in various indicators compared to Experiment_1, accuracy, precision, recall rate, F1 value and Spe value increase by 2.70%, 5.37%, 5.42%, 5.44%, 1.81%. 
Experiment_5, Experiment_6 and Experiment_7 prove that the pairwise concatenation of modules can further improve the feature extraction ability of the model. Compared to Experiment_1, MSD-net (Experiment_8) has the best results. The accuracy of the model increased from 95.06% to 97.76%, the precision increased from 90.20% to 95.57%, the recall rate from 90.10% to 95.52%, the F1 Score from 90.08% to 95.52%, and the Spe value increased from 96.70% to 98.51%.</p>
<table-wrap id="table-3">
<label>Table 3</label>
<caption>
<title>Comparative results of ablation experiments</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th>Model</th>
<th>Acc</th>
<th>Pre</th>
<th>Rec</th>
<th>F1 Score</th>
<th>Spe</th>
</tr>
</thead>
<tbody>
<tr>
<td>Experiment_1</td>
<td>0.9506</td>
<td>0.9020</td>
<td>0.9010</td>
<td>0.9008</td>
<td>0.9670</td>
</tr>
<tr>
<td>Experiment_2</td>
<td>0.9608</td>
<td>0.9237</td>
<td>0.9216</td>
<td>0.9216</td>
<td>0.9739</td>
</tr>
<tr>
<td>Experiment_3</td>
<td>0.9618</td>
<td>0.9276</td>
<td>0.9233</td>
<td>0.9232</td>
<td>0.9745</td>
</tr>
<tr>
<td>Experiment_4</td>
<td>0.9664</td>
<td>0.9359</td>
<td>0.9328</td>
<td>0.9331</td>
<td>0.9776</td>
</tr>
<tr>
<td>Experiment_5</td>
<td>0.9674</td>
<td>0.9351</td>
<td>0.9347</td>
<td>0.9347</td>
<td>0.9782</td>
</tr>
<tr>
<td>Experiment_6</td>
<td>0.9683</td>
<td>0.9367</td>
<td>0.9366</td>
<td>0.9366</td>
<td>0.9789</td>
</tr>
<tr>
<td>Experiment_7</td>
<td>0.9692</td>
<td>0.9398</td>
<td>0.9385</td>
<td>0.9382</td>
<td>0.9795</td>
</tr>
<tr>
<td>Experiment_8</td>
<td><bold>0.9776</bold><break/><bold>(&#x2191;2.7%)</bold></td>
<td><bold>0.9557 (&#x2191;5.37%)</bold></td>
<td><bold>0.9552 (&#x2191;5.42%)</bold></td>
<td><bold>0.9552 (&#x2191;5.44%)</bold></td>
<td><bold>0.9851 (&#x2191;1.81%)</bold></td>
</tr>
</tbody>
</table>
</table-wrap>
<p>It can be seen that the MSD-net has a better classification effect for the four types of lung X-ray images. The experiments are compared more intuitively by drawing the radar map of the ablation experiment results in this paper, as shown in <xref ref-type="fig" rid="fig-7">Fig. 7</xref>. MSD-net represented by the red line is at the outer end, which proves that the model has the best performance.</p>
<fig id="fig-7">
<label>Figure 7</label>
<caption>
<title>Radar chart of the ablation experiment results</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_50767-fig-7.tif"/>
</fig>
<p>In addition, to investigate the difference between the prediction label and the true case for the classification of test samples in different models, the results of each ablation experiment in the test set are visualized by using the confusion matrix. The visualization result is shown in <xref ref-type="fig" rid="fig-8">Fig. 8</xref>. In the confusion matrix, the diagonal elements represent the number of samples whose predicted labels are the same as the true labels. The higher the values of the diagonal elements, the better the classification accuracy of the model. The number of samples whose predicted labels match the true labels is the highest for MSD-Net. It can be seen that the MSD-Net has a more balanced detection and better classification effect for the four types of lung X-ray images in the confusion matrix, which indicates that this model can achieve accurate classification of pneumonia.</p>
<fig id="fig-8">
<label>Figure 8</label>
<caption>
<title>Confusion matrix of each model in ablation experiments</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_50767-fig-8.tif"/>
</fig>
</sec>
<sec id="s4_5">
<label>4.5</label>
<title>Comparison Experiment</title>
<p>This paper mainly discusses the classification effect of 11 network architectures and MSD-Net in this paper on the same dataset. The experiments are divided into three categories: The first uses the original convolutional neural network architecture. The second uses Transformer network architecture. The third is the network architecture proposed in this paper. The experimental evaluation indexes are Acc, Pre, Rec, F1 Score and Spe for quantitative analysis. The specific experimental results are shown in <xref ref-type="table" rid="table-4">Table 4</xref>.</p>
<table-wrap id="table-4">
<label>Table 4</label>
<caption>
<title>Classification results of pneumonia X-ray images for each model</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th>Model</th>
<th>Acc</th>
<th>Pre</th>
<th>Rec</th>
<th>F1 Score</th>
<th>Spe</th>
</tr>
</thead>
<tbody>
<tr>
<td>ResNet50 [<xref ref-type="bibr" rid="ref-6">6</xref>]</td>
<td>0.9506</td>
<td>0.9020</td>
<td>0.9010</td>
<td>0.9008</td>
<td>0.9670</td>
</tr>
<tr>
<td>DenseNet121 [<xref ref-type="bibr" rid="ref-24">24</xref>]</td>
<td>0.9664</td>
<td>0.9349</td>
<td>0.9328</td>
<td>0.9327</td>
<td>0.9776</td>
</tr>
<tr>
<td>ResNeXt50 [<xref ref-type="bibr" rid="ref-25">25</xref>]</td>
<td>0.9524</td>
<td>0.9060</td>
<td>0.9048</td>
<td>0.9042</td>
<td>0.9682</td>
</tr>
<tr>
<td>Res2Net50 [<xref ref-type="bibr" rid="ref-26">26</xref>]</td>
<td>0.9049</td>
<td>0.8129</td>
<td>0.8093</td>
<td>0.8088</td>
<td>0.9366</td>
</tr>
<tr>
<td>RegNet [<xref ref-type="bibr" rid="ref-27">27</xref>]</td>
<td>0.9151</td>
<td>0.8298</td>
<td>0.8298</td>
<td>0.8298</td>
<td>0.9434</td>
</tr>
<tr>
<td>ResNest50 [<xref ref-type="bibr" rid="ref-28">28</xref>]</td>
<td>0.9412</td>
<td>0.8856</td>
<td>0.8822</td>
<td>0.8797</td>
<td>0.9608</td>
</tr>
<tr>
<td>ConvNeXt [<xref ref-type="bibr" rid="ref-29">29</xref>]</td>
<td>0.9132</td>
<td>0.8277</td>
<td>0.8261</td>
<td>0.8261</td>
<td>0.9422</td>
</tr>
<tr>
<td>Vision transformer [<xref ref-type="bibr" rid="ref-30">30</xref>]</td>
<td>0.9384</td>
<td>0.8794</td>
<td>0.8767</td>
<td>0.8767</td>
<td>0.9590</td>
</tr>
<tr>
<td>Swin transformer [<xref ref-type="bibr" rid="ref-31">31</xref>]</td>
<td>0.9496</td>
<td>0.9023</td>
<td>0.8992</td>
<td>0.8991</td>
<td>0.9664</td>
</tr>
<tr>
<td>MobileViT [<xref ref-type="bibr" rid="ref-32">32</xref>]</td>
<td>0.9674</td>
<td>0.9351</td>
<td>0.9348</td>
<td>0.9347</td>
<td>0.9783</td>
</tr>
<tr>
<td>SMT [<xref ref-type="bibr" rid="ref-33">33</xref>]</td>
<td>0.9608</td>
<td>0.9245</td>
<td>0.9217</td>
<td>0.9213</td>
<td>0.9739</td>
</tr>
<tr>
<td>Ours</td>
<td><bold>0.9776</bold><break/><bold>(&#x2191;2.7%)</bold></td>
<td><bold>0.9557 (&#x2191;5.35%)</bold></td>
<td><bold>0.9552 (&#x2191;5.42%)</bold></td>
<td><bold>0.9552 (&#x2191;5.44%)</bold></td>
<td><bold>0.9851 (&#x2191;1.81%)</bold></td>
</tr>
</tbody>
</table>
</table-wrap>
<p>From the data in <xref ref-type="table" rid="table-4">Table 4</xref>, the Acc of the model in this paper is 97.76%, the Pre is 95.57%, the Rec is 95.52%, the F1 Score is 95.52% and the Spe is 98.51%. For convolutional networks, the MSD-Net model is better than the common classification convolutional networks. MSD-Net is improved in various indicators compared to DenseNet121: Acc, Pre, Rec, F1 Score and Spe increase by 1.12%, 2.08%, 2.24%, 2.25% and 0.75%. MSD-Net is improved in various indicators compared to ResNeXt50: Acc, Pre, Rec, F1 Score and Spe increase by 2.52%, 4.97%, 5.04%, 5.1% and 1.69%. MSD-Net is improved in various indicators compared to Res2Net50: Acc, Pre, Rec, F1 Score and Spe increase by 7.27%, 14.28%, 14.59%, 14.64% and 4.85%. MSD-Net is improved in various indicators compared to RegNet: Acc, Pre, Rec, F1 Score and Spe increase by 6.25%, 12.59%, 12.54%, 12.54% and 4.17%. MSD-Net is improved in various indicators compared to ResNest50: Acc, Pre, Rec, F1 Score and Spe increase by 3.64%, 7.01%, 7.3%, 7.55% and 2.43%. MSD-Net is improved in various indicators compared to ConvNeXt: Acc, Pre, Rec, F1 Score and Spe increase by 6.44%, 12.8%, 12.91%, 12.91% and 4.29%. For Transformer, MSD-Net is improved in various indicators compared to Vision Transformer: Acc, Pre, Rec, F1 Score and Spe increase by 3.92%, 7.63%, 7.85%, 7.85% and 2.61%. MSD-Net is improved in various indicators compared to Swin Transformer: Acc, Pre, Rec, F1 Score and Spe increase by 2.8%, 5.34%, 5.6%, 5.61% and 1.87%. MSD-Net is improved in various indicators compared to MobileViT: Acc, Pre, Rec, F1 Score and Spe increase by 1.02%, 2.06%, 2.04%, 2.05% and 0.68%. MSD-Net is improved in various indicators compared to SMT: Acc, Pre, Rec, F1 Score and Spe increase by 1.68%, 3.12%, 3.35%, 3.39% and 1.12%.</p>
<p>From the comparison, it can be seen that the MSD-Net is superior to other networks and its classification performance is better than the other 11 network models. At the same time, in order to compare various classification networks more intuitively, the comparison experiment results are visualized by drawing radar charts. As shown in <xref ref-type="fig" rid="fig-9">Fig. 9</xref>, MSD-Net is shown as a red polyline in the diagram and located at the outermost edge. Therefore, the classification effect of this model is better than other models.</p>
<fig id="fig-9">
<label>Figure 9</label>
<caption>
<title>Radar chart of pneumonia classification results of different models</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_50767-fig-9.tif"/>
</fig>
<p>At the same time, this paper uses confusion matrices to visualize the results of each model in the comparison experiment. The visualized result is shown in <xref ref-type="fig" rid="fig-10">Fig. 10</xref>. From the comparison results, MSD-Net has the largest number of samples whose predicted labels match the true labels. It can be seen that the recognition effect of MSD-Net is the best and significantly better than the other models.</p>
<fig id="fig-10">
<label>Figure 10</label>
<caption>
<title>Confusion matrix of classification results of pneumonia X-ray images of each model</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_50767-fig-10.tif"/>
</fig>
<p>To highlight the validity of the model in this paper, a new four-classification dataset is used [<xref ref-type="bibr" rid="ref-34">34</xref>]. On this dataset, the proposed model is compared with the basic network model, as shown in <xref ref-type="table" rid="table-5">Table 5</xref>. The Acc of the model in this paper is 97.78%, the Pre is 95.22%, the Rec is 96.49%, the F1 Score is 95.58% and the Spe is 98.11%. MSD-Net is improved in various indicators compared to ResNet50, Acc, Pre, Rec, F1 Score, and Spe increase by 2.71%, 2.28%, 3.66%, 3.29%, and 1.96%. MSD-Net is improved in various indicators compared to ResNeXt50, Acc, Pre, Rec, F1 Score, and Spe increase by 1.29%, 4.95%, 3.51%, 4.82%, and 1.08%. MSD-Net is improved in various indicators compared to ResNest50, Acc, Pre, Rec, F1 Score, and Spe increase by 2.71%, 5.52%, 5.94%, 5.47%, and 2.53%. MSD-Net is improved in various indicators compared to ConvNeXt, Acc, Pre, Rec, F1 Score, and Spe increase by 7.25%, 18.11%, 16.26%, 18.22%, and 5.69%. MSD-Net is improved in various indicators compared to Swin Transformer, Acc, Pre, Rec, F1 Score, and Spe increase by 2.84%, 2.99%, 4.91%, 4.69%, and 2.94%. MSD-Net is improved in various indicators compared to SMT, Acc, Pre, Rec, F1 Score, and Spe increase by 1.87%, 2.28%, 3.4%, 3.14%, and 1.91%. It can be seen that the generalization ability of the model is still good despite the different datasets.</p>
<table-wrap id="table-5">
<label>Table 5</label>
<caption>
<title>Classification results of pneumonia X-ray images for each model on the chest X-ray dataset</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th>Model</th>
<th>Acc</th>
<th>Pre</th>
<th>Rec</th>
<th>F1 Score</th>
<th>Spe</th>
</tr>
</thead>
<tbody>
<tr>
<td>ResNet50</td>
<td>0.9507</td>
<td>0.9294</td>
<td>0.9283</td>
<td>0.9229</td>
<td>0.9615</td>
</tr>
<tr>
<td>ResNeXt50</td>
<td>0.9649</td>
<td>0.9027</td>
<td>0.9298</td>
<td>0.9076</td>
<td>0.9703</td>
</tr>
<tr>
<td>ResNest50</td>
<td>0.9507</td>
<td>0.8970</td>
<td>0.9055</td>
<td>0.9011</td>
<td>0.9558</td>
</tr>
<tr>
<td>ConvNeXt</td>
<td>0.9053</td>
<td>0.7711</td>
<td>0.8023</td>
<td>0.7736</td>
<td>0.9242</td>
</tr>
<tr>
<td>Swin transformer</td>
<td>0.9494</td>
<td>0.9223</td>
<td>0.9158</td>
<td>0.9089</td>
<td>0.9517</td>
</tr>
<tr>
<td>SMT</td>
<td>0.9591</td>
<td>0.9294</td>
<td>0.9309</td>
<td>0.9244</td>
<td>0.9620</td>
</tr>
<tr>
<td>Ours</td>
<td><bold>0.9778 (&#x2191;2.71%)</bold></td>
<td><bold>0.9522 (&#x2191;2.28%)</bold></td>
<td><bold>0.9649 (&#x2191;3.66%)</bold></td>
<td><bold>0.9558 (&#x2191;3.29%)</bold></td>
<td><bold>0.9811 (&#x2191;1.96%)</bold></td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Conclusion</title>
<p>There is a problem that lesion size features and direction features are extracted insufficiently in lung X-ray images. To solve it, a multi-scale directional feature enhanced pneumonia classification model MSD-Net is proposed. In this model, three different methods are used to enhance the ability to extract lesion size features and lesion direction features. Firstly, in the ablation experiment, the Acc, Pre, Rec, F1 Score and Spe of MRFEM are 96.08%, 92.37%, 92.16%, 92.16%, and 97.39%, respectively, which proves that MRFEM can effectively extract multi-scale features. The Acc, Pre, Rec, F1 Score and Spe of MDFPM are 96.64%, 93.59%, 93.28%, 93.31%, and 97.76%, respectively, which proves that MDFPM can enhance the detailed and direction features. The Acc, Pre, Rec, F1 Score and Spe of ACFM are 96.18%, 92.76%, 92.33%, 92.32%, and 97.45%, respectively, which proves that ACFM can capture the direction features of the lesion and fully extract the global features. Then, through comparative experiments with different datasets, in the COVID-19 RADIOGRAPHY DATABASE, the Acc, Pre, Rec, F1 Score and Spe of MSD-Net are 97.76%, 95.57%, 95.52%, 95.52%, and 98.51%, respectively. In the chest X-ray dataset, the Acc, Pre, Rec, F1 Score, and Spe of MSD-Net are 97.78%, 95.22%, 96.49%, 95.58%, and 98.11%, respectively. The growth of each evaluation index shows that the feature enhancement method proposed in this paper can improve the model&#x2019;s perception ability of feature size and direction. Finally, the MSD-Net can improve the precision of lung X-ray diagnosis and has positive significance for pneumonia Computer-Aided Diagnosis.</p>
</sec>
</body>
<back>
<ack>
<p>The authors would like to thank the anonymous editors and reviewers for their critical and constructive comments and suggestions.</p>
</ack>
<sec><title>Funding Statement</title>
<p>This work was supported in part by the National Natural Science Foundation of China (Grant No. 62062003) and the Natural Science Foundation of Ningxia (Grant No. 2023AAC03293).</p>
</sec>
<sec><title>Author Contributions</title>
<p>Tao Zhou: The framework of the overall paper is proposed; Provision of experimental environment, including a 64-bit Windows Server 2019 Datacenter system equipped with NVIDIA TITAN RTX graphics cards; Fund support, including the National Natural Science Foundation of China and the Natural Science Foundation of Ningxia; Evaluation of experimental data; Revision of paper. Yujie Guo: Three innovative points are proposed in the article, including: MRFEM, MDFPM, and ACFM; The code implementation of the article model; Analysis of experimental data, including tabulation of ablation experiment and comparison experiment, analysis of confusion matrix and radar chart; The writing of the paper. Caiyue Peng: Drawing of model diagrams in the article; Reference search, analysis, and citations, about the application of transformer in lung disease classification; Correction of incorrect statements in the article. Yuxia Niu: Reference search, analysis, and citations, about the characteristics of pneumonia, the imaging characteristics of pneumonia X-ray images, and the application of convolutional neural networks in lung disease classification. Yunfeng Pan: Drawing of model diagrams in the article; The layout of the article, check, proofread; adjust the overall structure; Correction of incorrect statements in the article. Huiling Lu: Search, preprocessing, and analysis of the two datasets, including division into the training set and the verification set at a ratio of 9:1; Check of the final layout of the article.</p>
</sec>
<sec sec-type="data-availability"><title>Availability of Data and Materials</title>
<p>The datasets used in this study are the public COVID-19 RADIOGRAPHY DATABASE, which can be downloaded from the link: <ext-link ext-link-type="uri" xlink:href="https://www.kaggle.com/datasets/tawsifurrahman/covid19-radiography-database">https://www.kaggle.com/datasets/tawsifurrahman/covid19-radiography-database</ext-link>, and the Chest X-Ray pneumonia dataset, which can be downloaded from the link: <ext-link ext-link-type="uri" xlink:href="https://www.kaggle.com/datasets/paultimothymooney/chest-xray-pneumonia">https://www.kaggle.com/datasets/paultimothymooney/chest-xray-pneumonia</ext-link>.</p>
</sec>
<sec sec-type="COI-statement"><title>Conflicts of Interest</title>
<p>The authors declare that they have no conflicts of interest to report regarding the present study.</p>
</sec>
<ref-list content-type="authoryear">
<title>References</title>
<ref id="ref-1"><label>[1]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>A. G.</given-names> <surname>Mathioudakis</surname></string-name> <etal>et al.</etal></person-group>, &#x201C;<article-title>Clinical trials of Pneumonia management assess heterogeneous outcomes and measurement instruments</article-title>,&#x201D; <source>J. Clin. Epidemiol.</source>, vol. <volume>164</volume>, pp. <fpage>88</fpage>&#x2013;<lpage>95</lpage>, <year>2023</year>; <pub-id pub-id-type="pmid">37898460</pub-id></mixed-citation></ref>
<ref id="ref-2"><label>[2]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>P. F.</given-names> <surname>Dequin</surname></string-name> <etal>et al.</etal></person-group>, &#x201C;<article-title>Hydrocortisone in severe community-acquired pneumonia</article-title>,&#x201D; <source>New Engl. J. Med.</source>, vol. <volume>388</volume>, no. <issue>21</issue>, pp. <fpage>1931</fpage>&#x2013;<lpage>1941</lpage>, <year>2023</year>. doi: <pub-id pub-id-type="doi">10.1056/NEJMoa2215145</pub-id>; <pub-id pub-id-type="pmid">36942789</pub-id></mixed-citation></ref>
<ref id="ref-3"><label>[3]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>S. X.</given-names> <surname>Xing</surname></string-name>, <string-name><given-names>Z. H.</given-names> <surname>Ju</surname></string-name>, <string-name><given-names>Z. J.</given-names> <surname>Liu</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Wang</surname></string-name>, and <string-name><given-names>F. Q.</given-names> <surname>Fan</surname></string-name></person-group>, &#x201C;<article-title>Multi-label classification of chest X-ray images with pre-trained vision Transformer model</article-title>,&#x201D; <comment>(in Chinese)</comment>, <source>J. Image Graph.</source>, vol. <volume>28</volume>, no. <issue>4</issue>, pp. <fpage>1186</fpage>&#x2013;<lpage>1197</lpage>, <year>2023</year>. doi: <pub-id pub-id-type="doi">10.11834/jig.220284</pub-id>.</mixed-citation></ref>
<ref id="ref-4"><label>[4]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>T.</given-names> <surname>Zhou</surname></string-name>, <string-name><given-names>B. Q.</given-names> <surname>Huo</surname></string-name>, <string-name><given-names>H. L.</given-names> <surname>Lu</surname></string-name>, and <string-name><given-names>H. B.</given-names> <surname>Shi</surname></string-name></person-group>, &#x201C;<article-title>Progress of residual neural network optimization algorithm for medical imaging disease diagnosis</article-title>,&#x201D; <source>J. Image Graph.</source>, vol. <volume>25</volume>, no. <issue>10</issue>, pp. <fpage>2079</fpage>&#x2013;<lpage>2092</lpage>, <year>2020</year>.</mixed-citation></ref>
<ref id="ref-5"><label>[5]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>T.</given-names> <surname>Zhou</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Liu</surname></string-name>, <string-name><given-names>Y. L.</given-names> <surname>Dong</surname></string-name>, <string-name><given-names>J.</given-names> <surname>Bai</surname></string-name>, and <string-name><given-names>H. L.</given-names> <surname>Lu</surname></string-name></person-group>, &#x201C;<article-title>Parallel decomposition adaptive fusion model: Cross-modal image fusion of lung tumors</article-title>,&#x201D; <source>J. Image Graph.</source>, vol. <volume>28</volume>, no. <issue>1</issue>, pp. <fpage>221</fpage>&#x2013;<lpage>233</lpage>, <year>2023</year>.</mixed-citation></ref>
<ref id="ref-6"><label>[6]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>K.</given-names> <surname>He</surname></string-name>, <string-name><given-names>X.</given-names> <surname>Zhang</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Ren</surname></string-name>, and <string-name><given-names>J.</given-names> <surname>Sun</surname></string-name></person-group>, &#x201C;<article-title>Deep residual learning for image recognition</article-title>,&#x201D; in <conf-name>Proc. IEEE Conf. Computer Vis. Pattern Recognit. (CVPR)</conf-name>, <publisher-loc>Las Vegas, USA</publisher-loc>, <year>2016</year>, pp. <fpage>770</fpage>&#x2013;<lpage>778</lpage>.</mixed-citation></ref>
<ref id="ref-7"><label>[7]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>S.</given-names> <surname>Khurana</surname></string-name> <etal>et al.</etal></person-group>, &#x201C;<article-title>An intelligent fine-tuned forecasting technique for COVID-19 prediction using neuralprophet model</article-title>,&#x201D; <source>Comput. Mater. Contin.</source>, vol. <volume>71</volume>, pp. <fpage>629</fpage>&#x2013;<lpage>649</lpage>, <year>2022</year>.</mixed-citation></ref>
<ref id="ref-8"><label>[8]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>Y.</given-names> <surname>Akbulut</surname></string-name></person-group>, &#x201C;<article-title>Automated pneumonia based lung diseases classification with robust technique based on a customized deep learning approach</article-title>,&#x201D; <source>Diagnostics</source>, vol. <volume>13</volume>, no. <issue>2</issue>, pp. <fpage>260</fpage>, <year>2023</year>. doi: <pub-id pub-id-type="doi">10.3390/diagnostics13020260</pub-id>; <pub-id pub-id-type="pmid">36673070</pub-id></mixed-citation></ref>
<ref id="ref-9"><label>[9]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>P.</given-names> <surname>Kaur</surname></string-name> <etal>et al.</etal></person-group>, &#x201C;<article-title>A hybrid convolutional neural network model for diagnosis of COVID-19 using chest X-ray images</article-title>,&#x201D; <source>Int. J. Environ. Res. Public Health</source>, vol. <volume>18</volume>, no. <issue>22</issue>, pp. <fpage>12191</fpage>, <year>2021</year>. doi: <pub-id pub-id-type="doi">10.3390/ijerph182212191</pub-id>; <pub-id pub-id-type="pmid">34831960</pub-id></mixed-citation></ref>
<ref id="ref-10"><label>[10]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>T.</given-names> <surname>Zhou</surname></string-name>, <string-name><given-names>X.</given-names> <surname>Chang</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Liu</surname></string-name>, <string-name><given-names>X.</given-names> <surname>Ye</surname></string-name>, <string-name><given-names>H.</given-names> <surname>Lu</surname></string-name> and <string-name><given-names>F.</given-names> <surname>Hu</surname></string-name></person-group>, &#x201C;<article-title>COVID-ResNet: COVID-19 recognition based on improved attention ResNet</article-title>,&#x201D; <source>Electronics</source>, vol. <volume>12</volume>, no. <issue>6</issue>, pp. <fpage>1413</fpage>, <year>2023</year>. doi: <pub-id pub-id-type="doi">10.3390/electronics12061413</pub-id>.</mixed-citation></ref>
<ref id="ref-11"><label>[11]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>T.</given-names> <surname>Zhou</surname></string-name>, <string-name><given-names>F.</given-names> <surname>Liu</surname></string-name>, <string-name><given-names>X.</given-names> <surname>Ye</surname></string-name>, <string-name><given-names>H.</given-names> <surname>Wang</surname></string-name>, and <string-name><given-names>H.</given-names> <surname>Lu</surname></string-name></person-group>, &#x201C;<article-title>CCGL-YOLOV5: A cross-modal cross-scale global-local attention YOLOV5 lung tumor detection model</article-title>,&#x201D; <source>Comput. Biol. Med.</source>, vol. <volume>165</volume>, no. <issue>14</issue>, pp. <fpage>107387</fpage>, <year>2023</year>. doi: <pub-id pub-id-type="doi">10.1016/j.compbiomed.2023.107387</pub-id>; <pub-id pub-id-type="pmid">37659112</pub-id></mixed-citation></ref>
<ref id="ref-12"><label>[12]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>H.</given-names> <surname>Xiao</surname></string-name>, <string-name><given-names>Q.</given-names> <surname>Liu</surname></string-name>, and <string-name><given-names>L.</given-names> <surname>Li</surname></string-name></person-group>, &#x201C;<article-title>MFMANet: Multi-feature Multi-attention Network for efficient subtype classification on non-small cell lung cancer CT images</article-title>,&#x201D; <source>Biomed. Signal Process. Control</source>, vol. <volume>84</volume>, no. <issue>3</issue>, pp. <fpage>104768</fpage>, <year>2023</year>. doi: <pub-id pub-id-type="doi">10.1016/j.bspc.2023.104768</pub-id>.</mixed-citation></ref>
<ref id="ref-13"><label>[13]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>X.</given-names> <surname>Huo</surname></string-name> <etal>et al.</etal></person-group>, &#x201C;<article-title>HiFuse: Hierarchical multi-scale feature fusion network for medical image classification</article-title>,&#x201D; <source>Biomed. Signal Process. Control</source>, vol. <volume>87</volume>, no. <issue>7660</issue>, pp. <fpage>105534</fpage>, <year>2024</year>. doi: <pub-id pub-id-type="doi">10.1016/j.bspc.2023.105534</pub-id>.</mixed-citation></ref>
<ref id="ref-14"><label>[14]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>A.</given-names> <surname>Gopatoti</surname></string-name> <etal>et al.</etal></person-group>, &#x201C;<article-title>MTMC-AUR2CNet: Multi-textural multi-class attention recurrent residual convolutional neural network for COVID-19 classification using chest X-ray images</article-title>,&#x201D; <source>Biomed. Signal Process. Control</source>, vol. <volume>85</volume>, no. <issue>4</issue>, pp. <fpage>104857</fpage>, <year>2023</year>. doi: <pub-id pub-id-type="doi">10.1016/j.bspc.2023.104857</pub-id>; <pub-id pub-id-type="pmid">36968651</pub-id></mixed-citation></ref>
<ref id="ref-15"><label>[15]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>X.</given-names> <surname>Zhang</surname></string-name>, <string-name><given-names>W.</given-names> <surname>Li</surname></string-name>, <string-name><given-names>C.</given-names> <surname>Gao</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Yang</surname></string-name>, and <string-name><given-names>K.</given-names> <surname>Chang</surname></string-name></person-group>, &#x201C;<article-title>Hyperspectral pathology image classification using dimension-driven multi-path attention residual network</article-title>,&#x201D; <source>Expert. Syst. Appl.</source>, vol. <volume>230</volume>, no. <issue>1</issue>, pp. <fpage>120615</fpage>, <year>2023</year>. doi: <pub-id pub-id-type="doi">10.1016/j.eswa.2023.120615</pub-id>.</mixed-citation></ref>
<ref id="ref-16"><label>[16]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>P.</given-names> <surname>Sreedevi</surname></string-name> <etal>et al.</etal></person-group>, &#x201C;<article-title>Da-resbigru-brain tumor classification using Dual attention residual bi directional gated recurrent unit using MRI images</article-title>,&#x201D; <source>Biomed. Signal Process. Control</source>, vol. <volume>88</volume>, pp. <fpage>105596</fpage>, <year>2024</year>.</mixed-citation></ref>
<ref id="ref-17"><label>[17]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>E.</given-names> <surname>Hassan</surname></string-name> <etal>et al.</etal></person-group>, &#x201C;<article-title>A quantum convolutional network and ResNet (50)-based classification architecture for the MNIST medical dataset</article-title>,&#x201D; <source>Biomed. Signal Process. Control</source>, vol. <volume>87</volume>, no. <issue>7792</issue>, pp. <fpage>105560</fpage>, <year>2024</year>. doi: <pub-id pub-id-type="doi">10.1016/j.bspc.2023.105560</pub-id>.</mixed-citation></ref>
<ref id="ref-18"><label>[18]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>C. J.</given-names> <surname>Ejiyi</surname></string-name> <etal>et al.</etal></person-group>, &#x201C;<article-title>ResfEANet: ResNet-fused external attention network for tuberculosis diagnosis using chest X-ray images</article-title>,&#x201D; <source>Comput. Methods Programs Biomed. Update</source>, vol. <volume>5</volume>, no. <issue>11</issue>, pp. <fpage>100133</fpage>, <year>2024</year>. doi: <pub-id pub-id-type="doi">10.1016/j.cmpbup.2023.100133</pub-id>.</mixed-citation></ref>
<ref id="ref-19"><label>[19]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>M.</given-names> <surname>Nawaz</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Saleem</surname></string-name>, <string-name><given-names>M.</given-names> <surname>Masood</surname></string-name>, <string-name><given-names>J.</given-names> <surname>Rashid</surname></string-name>, and <string-name><given-names>T.</given-names> <surname>Nazir</surname></string-name></person-group>, &#x201C;<article-title>COVID-ECG-RSNet: COVID-19 classification from ECG images using swish-based improved ResNet model</article-title>,&#x201D; <source>Biomed. Signal Process. Control</source>, vol. <volume>89</volume>, no. <issue>2</issue>, pp. <fpage>105801</fpage>, <year>2024</year>. doi: <pub-id pub-id-type="doi">10.1016/j.bspc.2023.105801</pub-id>.</mixed-citation></ref>
<ref id="ref-20"><label>[20]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>P.</given-names> <surname>Wang</surname></string-name> <etal>et al.</etal></person-group>, &#x201C;<article-title>Understanding convolution for semantic segmentation</article-title>,&#x201D; in <conf-name>2018 IEEE Winter Conf. Appli. Comput. Vis. (WACV)</conf-name>, <publisher-loc>Lake Tahoe, Nevada</publisher-loc>, <publisher-name>IEEE</publisher-name>, <year>2018</year>, pp. <fpage>1451</fpage>&#x2013;<lpage>1460</lpage>.</mixed-citation></ref>
<ref id="ref-21"><label>[21]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>T.</given-names> <surname>Zhou</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Niu</surname></string-name>, <string-name><given-names>H.</given-names> <surname>Lu</surname></string-name>, <string-name><given-names>C.</given-names> <surname>Peng</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Guo</surname></string-name> and <string-name><given-names>H.</given-names> <surname>Zhou</surname></string-name></person-group>, &#x201C;<article-title>Vision transformer: To discover the &#x201C;Four secrets&#x201D; of image patches</article-title>,&#x201D; <source>Inf. Fusion</source>, vol. <volume>105</volume>, pp. <fpage>102248</fpage>, <year>2024</year>. doi: <pub-id pub-id-type="doi">10.1016/j.inffus.2024.102248</pub-id>.</mixed-citation></ref>
<ref id="ref-22"><label>[22]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>M. E. H.</given-names> <surname>Chowdhury</surname></string-name> <etal>et al.</etal></person-group>, &#x201C;<article-title>Can AI help in screening viral and COVID-19 pneumonia?</article-title>,&#x201D; <source>IEEE Access</source>, vol. <volume>8</volume>, pp. <fpage>132665</fpage>&#x2013;<lpage>132676</lpage>, <year>2020</year>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2020.3010287</pub-id>.</mixed-citation></ref>
<ref id="ref-23"><label>[23]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>T.</given-names> <surname>Rahman</surname></string-name> <etal>et al.</etal></person-group>, &#x201C;<article-title>Exploring the effect of image enhancement techniques on COVID-19 detection using chest X-ray images</article-title>,&#x201D; <source>Comput. Biol. Med.</source>, vol. <volume>132</volume>, no. <issue>2</issue>, pp. <fpage>104319</fpage>, <year>2021</year>. doi: <pub-id pub-id-type="doi">10.1016/j.compbiomed.2021.104319</pub-id>; <pub-id pub-id-type="pmid">33799220</pub-id></mixed-citation></ref>
<ref id="ref-24"><label>[24]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>G.</given-names> <surname>Huang</surname></string-name>, <string-name><given-names>Z.</given-names> <surname>Liu</surname></string-name>, <string-name><given-names>L.</given-names> <surname>van der Maaten</surname></string-name>, and <string-name><given-names>K. Q.</given-names> <surname>Weinberger</surname></string-name></person-group>, &#x201C;<article-title>Densely connected convolutional networks</article-title>,&#x201D; in <conf-name>Proc. IEEE Conf. Computer Vis. Pattern Recognit. (CVPR)</conf-name>, <publisher-loc>Honolulu, Hawaii, USA</publisher-loc>, <year>2017</year>, pp. <fpage>4700</fpage>&#x2013;<lpage>4708</lpage>.</mixed-citation></ref>
<ref id="ref-25"><label>[25]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>S.</given-names> <surname>Xie</surname></string-name>, <string-name><given-names>R.</given-names> <surname>Girshick</surname></string-name>, <string-name><given-names>P.</given-names> <surname>Doll&#x00E1;r</surname></string-name>, <string-name><given-names>Z.</given-names> <surname>Tu</surname></string-name>, and <string-name><given-names>K.</given-names> <surname>He</surname></string-name></person-group>, &#x201C;<article-title>Aggregated residual transformations for deep neural networks</article-title>,&#x201D; in <conf-name>Proc. IEEE Conf. Comput. Vis. Pattern Recognit. (CVPR)</conf-name>, <publisher-loc>Honolulu, Hawaii, USA</publisher-loc>, <year>2017</year>, pp. <fpage>1492</fpage>&#x2013;<lpage>1500</lpage>.</mixed-citation></ref>
<ref id="ref-26"><label>[26]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>S.</given-names> <surname>Gao</surname></string-name>, <string-name><given-names>M.</given-names> <surname>Cheng</surname></string-name>, <string-name><given-names>K.</given-names> <surname>Zhao</surname></string-name>, <string-name><given-names>X.</given-names> <surname>Zhang</surname></string-name>, <string-name><given-names>M.</given-names> <surname>Yang</surname></string-name> and <string-name><given-names>P.</given-names> <surname>Torr</surname></string-name></person-group>, &#x201C;<article-title>Res2Net: A new multiscale backbone architecture</article-title>,&#x201D; <source>IEEE Trans. Pattern Anal. Mach. Intell.</source>, vol. <volume>43</volume>, no. <issue>2</issue>, pp. <fpage>652</fpage>&#x2013;<lpage>662</lpage>, <year>2021</year>. doi: <pub-id pub-id-type="doi">10.1109/TPAMI.2019.2938758</pub-id>; <pub-id pub-id-type="pmid">31484108</pub-id></mixed-citation></ref>
<ref id="ref-27"><label>[27]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>I.</given-names> <surname>Radosavovic</surname></string-name>, <string-name><given-names>R. P.</given-names> <surname>Kosaraju</surname></string-name>, <string-name><given-names>R.</given-names> <surname>Girshick</surname></string-name>, <string-name><given-names>K.</given-names> <surname>He</surname></string-name>, and <string-name><given-names>P.</given-names> <surname>Doll&#x00E1;r</surname></string-name></person-group>, &#x201C;<article-title>Designing network design spaces</article-title>,&#x201D; in <conf-name>Proc. IEEE/CVF Conf. Comput. Vis. Pattern Recognit. (CVPR)</conf-name>, <publisher-loc>Seattle, USA</publisher-loc>, <year>2020</year>, pp. <fpage>10428</fpage>&#x2013;<lpage>10436</lpage>.</mixed-citation></ref>
<ref id="ref-28"><label>[28]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>H.</given-names> <surname>Zhang</surname></string-name> <etal>et al.</etal></person-group>, &#x201C;<article-title>ResNeSt: Split-attention networks</article-title>,&#x201D; in <conf-name>Proc. IEEE/CVF Conf. Comput. Vis. Pattern Recognit. (CVPR)</conf-name>, <publisher-loc>New Orleans, LA, USA</publisher-loc>, <year>2022</year>, pp. <fpage>2736</fpage>&#x2013;<lpage>2746</lpage>.</mixed-citation></ref>
<ref id="ref-29"><label>[29]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>Z.</given-names> <surname>Liu</surname></string-name>, <string-name><given-names>H.</given-names> <surname>Mao</surname></string-name>, <string-name><given-names>C. Y.</given-names> <surname>Wu</surname></string-name>, <string-name><given-names>C.</given-names> <surname>Feichtenhofer</surname></string-name>, <string-name><given-names>T.</given-names> <surname>Darrell</surname></string-name>, and <string-name><given-names>S.</given-names> <surname>Xie</surname></string-name></person-group>, &#x201C;<article-title>A convnet for the 2020s</article-title>,&#x201D; in <conf-name>Proc. IEEE/CVF Conf. Comput. Vis. Pattern Recognit. (CVPR)</conf-name>, <publisher-loc>New Orleans, LA, USA</publisher-loc>, <year>2022</year>, pp. <fpage>11976</fpage>&#x2013;<lpage>11986</lpage>.</mixed-citation></ref>
<ref id="ref-30"><label>[30]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>A.</given-names> <surname>Dosovitskiy</surname></string-name> <etal>et al.</etal></person-group>, &#x201C;<article-title>An image is worth 16x16 words: Transformers for image recognition at scale</article-title>,&#x201D; <year>2020</year>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2010.11929</pub-id>.</mixed-citation></ref>
<ref id="ref-31"><label>[31]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>Z.</given-names> <surname>Liu</surname></string-name> <etal>et al.</etal></person-group>, &#x201C;<article-title>Swin transformer: Hierarchical vision transformer using shifted windows</article-title>,&#x201D; in <conf-name>Proc. IEEE/CVF Int. Conf. Comput. Vis. (ICCV)</conf-name>, <publisher-loc>Montreal, QC, Canada</publisher-loc>, <year>2021</year>, pp. <fpage>10012</fpage>&#x2013;<lpage>10022</lpage>.</mixed-citation></ref>
<ref id="ref-32"><label>[32]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>S.</given-names> <surname>Mehta</surname></string-name> and <string-name><given-names>M.</given-names> <surname>Rastegari</surname></string-name></person-group>, &#x201C;<article-title>MobileViT: Light-weight, general-purpose, and mobile-friendly vision transformer</article-title>,&#x201D; <year>2021</year>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2110.02178</pub-id>.</mixed-citation></ref>
<ref id="ref-33"><label>[33]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>W.</given-names> <surname>Lin</surname></string-name>, <string-name><given-names>Z.</given-names> <surname>Wu</surname></string-name>, <string-name><given-names>J.</given-names> <surname>Chen</surname></string-name>, <string-name><given-names>J.</given-names> <surname>Huang</surname></string-name>, and <string-name><given-names>L.</given-names> <surname>Jin</surname></string-name></person-group>, &#x201C;<article-title>Scale-aware modulation meet transformer</article-title>,&#x201D; in <conf-name>Proc. IEEE/CVF Int. Conf. Comput. Vis. (ICCV)</conf-name>, <publisher-loc>Paris, France</publisher-loc>, <year>2023</year>, pp. <fpage>6015</fpage>&#x2013;<lpage>6026</lpage>.</mixed-citation></ref>
<ref id="ref-34"><label>[34]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>D. S.</given-names> <surname>Kermany</surname></string-name> <etal>et al.</etal></person-group>, &#x201C;<article-title>Identifying medical diagnoses and treatable diseases by image-based deep learning</article-title>,&#x201D; <source>Cell</source>, vol. <volume>172</volume>, no. <issue>5</issue>, pp. <fpage>1122</fpage>&#x2013;<lpage>1131</lpage>, <year>2018</year>. doi: <pub-id pub-id-type="doi">10.1016/j.cell.2018.02.010</pub-id>; <pub-id pub-id-type="pmid">29474911</pub-id></mixed-citation></ref>
</ref-list>
</back></article>