<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1 20151215//EN" "http://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xml:lang="en" article-type="research-article" dtd-version="1.1">
<front>
<journal-meta>
<journal-id journal-id-type="pmc">CMES</journal-id>
<journal-id journal-id-type="nlm-ta">CMES</journal-id>
<journal-id journal-id-type="publisher-id">CMES</journal-id>
<journal-title-group>
<journal-title>Computer Modeling in Engineering &#x0026; Sciences</journal-title>
</journal-title-group>
<issn pub-type="epub">1526-1506</issn>
<issn pub-type="ppub">1526-1492</issn>
<publisher>
<publisher-name>Tech Science Press</publisher-name>
<publisher-loc>USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">52622</article-id>
<article-id pub-id-type="doi">10.32604/cmes.2024.052622</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Article</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>DPAL-BERT: A Faster and Lighter Question Answering Model</article-title>
<alt-title alt-title-type="left-running-head">DPAL-BERT: A Faster and Lighter Question Answering Model</alt-title>
<alt-title alt-title-type="right-running-head">DPAL-BERT: A Faster and Lighter Question Answering Model</alt-title>
</title-group>
<contrib-group>
<contrib id="author-1" contrib-type="author">
<name name-style="western"><surname>Yin</surname><given-names>Lirong</given-names></name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
<contrib id="author-2" contrib-type="author">
<name name-style="western"><surname>Wang</surname><given-names>Lei</given-names></name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
<contrib id="author-3" contrib-type="author">
<name name-style="western"><surname>Cai</surname><given-names>Zhuohang</given-names></name><xref ref-type="aff" rid="aff-2">2</xref></contrib>
<contrib id="author-4" contrib-type="author" corresp="yes">
<name name-style="western"><surname>Lu</surname><given-names>Siyu</given-names></name><xref ref-type="aff" rid="aff-2">2</xref><email>siyu.lu@std.uestc.edu.cn</email></contrib>
<contrib id="author-5" contrib-type="author">
<name name-style="western"><surname>Wang</surname><given-names>Ruiyang</given-names></name><xref ref-type="aff" rid="aff-2">2</xref></contrib>
<contrib id="author-6" contrib-type="author">
<name name-style="western"><surname>AlSanad</surname><given-names>Ahmed</given-names></name><xref ref-type="aff" rid="aff-3">3</xref></contrib>
<contrib id="author-7" contrib-type="author">
<name name-style="western"><surname>AlQahtani</surname><given-names>Salman A.</given-names></name><xref ref-type="aff" rid="aff-3">3</xref></contrib>
<contrib id="author-8" contrib-type="author">
<name name-style="western"><surname>Chen</surname><given-names>Xiaobing</given-names></name><xref ref-type="aff" rid="aff-4">4</xref></contrib>
<contrib id="author-9" contrib-type="author">
<name name-style="western"><surname>Yin</surname><given-names>Zhengtong</given-names></name><xref ref-type="aff" rid="aff-5">5</xref></contrib>
<contrib id="author-10" contrib-type="author">
<name name-style="western"><surname>Li</surname><given-names>Xiaolu</given-names></name><xref ref-type="aff" rid="aff-6">6</xref></contrib>
<contrib id="author-11" contrib-type="author" corresp="yes">
<name name-style="western"><surname>Zheng</surname><given-names>Wenfeng</given-names></name><xref ref-type="aff" rid="aff-2">2</xref><xref ref-type="aff" rid="aff-3">3</xref><email>winfirms@ieee.org</email></contrib>
<aff id="aff-1"><label>1</label><institution>Department of Geography and Anthropology, Louisiana State University</institution>, <addr-line>Baton Rouge</addr-line>, <addr-line>LA</addr-line> <addr-line>70803</addr-line>, <country>USA</country></aff>
<aff id="aff-2"><label>2</label><institution>School of Automation, University of Electronic Science and Technology of China</institution>, <addr-line>Chengdu, 610054</addr-line>, <country>China</country></aff>
<aff id="aff-3"><label>3</label><institution>College of Computer and Information Sciences, King Saud University</institution>, <addr-line>Riyadh, 11574</addr-line>, <country>Saudi Arabia</country></aff>
<aff id="aff-4"><label>4</label><institution>School of Electrical and Computer Engineering, Louisiana State University</institution>, <addr-line>Baton Rouge</addr-line>, <addr-line>LA</addr-line> <addr-line>70803</addr-line>, <country>USA</country></aff>
<aff id="aff-5"><label>5</label><institution>College of Resources and Environmental Engineering, Guizhou University</institution>, <addr-line>Guiyang, 550025</addr-line>, <country>China</country></aff>
<aff id="aff-6"><label>6</label><institution>School of Geographical Sciences, Southwest University</institution>, <addr-line>Chongqing, 400715</addr-line>, <country>China</country></aff>
</contrib-group>
<author-notes>
<corresp id="cor1"><label>&#x002A;</label>Corresponding Authors: Siyu Lu. Email: <email>siyu.lu@std.uestc.edu.cn</email>; Wenfeng Zheng. Email: <email>winfirms@ieee.org</email></corresp>
</author-notes>
<pub-date date-type="collection" publication-format="electronic">
<year>2024</year></pub-date>
<pub-date date-type="pub" publication-format="electronic"><day>20</day><month>8</month><year>2024</year></pub-date>
<volume>141</volume>
<issue>1</issue>
<fpage>771</fpage>
<lpage>786</lpage>
<history>
<date date-type="received">
<day>09</day>
<month>4</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>02</day>
<month>7</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>&#x00A9; 2024 The Authors.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Published by Tech Science Press.</copyright-holder>
<license xlink:href="https://creativecommons.org/licenses/by/4.0/">
<license-p>This work is licensed under a <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="TSP_CMES_52622.pdf"></self-uri>
<abstract>
<p>Recent advancements in natural language processing have given rise to numerous pre-training language models in question-answering systems. However, with the constant evolution of algorithms, data, and computing power, the increasing size and complexity of these models have led to increased training costs and reduced efficiency. This study aims to minimize the inference time of such models while maintaining computational performance. It also proposes a novel Distillation model for PAL-BERT (DPAL-BERT), which employs knowledge distillation, using the PAL-BERT model as the teacher model to train two student models: DPAL-BERT-Bi and DPAL-BERT-C. This research enhances the dataset through techniques such as masking, replacement, and n-gram sampling to optimize knowledge transfer. The experimental results showed that the distilled models greatly outperform models trained from scratch. In addition, although the distilled models exhibit a slight decrease in performance compared to PAL-BERT, they significantly reduce inference time to just 0.25% of the original. This demonstrates the effectiveness of the proposed approach in balancing model performance and efficiency.</p>
</abstract>
<kwd-group kwd-group-type="author">
<kwd>DPAL-BERT</kwd>
<kwd>question answering systems</kwd>
<kwd>knowledge distillation</kwd>
<kwd>model compression</kwd>
<kwd>BERT</kwd>
<kwd>Bi-directional long short-term memory (BiLSTM)</kwd>
<kwd>knowledge information transfer</kwd>
<kwd>PAL-BERT</kwd>
<kwd>training efficiency</kwd>
<kwd>natural language processing</kwd>
</kwd-group>
<funding-group>
<award-group id="awg1">
<funding-source>Sichuan Science and Technology Program</funding-source>
<award-id>2023YFSY0026</award-id>
<award-id>2023YFH0004</award-id>
</award-group>
</funding-group>
</article-meta>
</front>
<body>
<sec>
<title>Highlight</title>
<p>1. A novel Distillation model on PAL-BERT (DPAL-BERT) is proposed for the question-answering task.</p>
<p>2. BiLSTM is adopted as the student model to shorten inference time.</p>
<p>3. The PAL-BERT model is used as the teacher model to achieve high accuracy.</p>
<p>4. DPAL-BERT achieves competitive performance and significantly reduces the inference time.</p>
</sec>
<sec id="s1">
<label>1</label>
<title>Introduction</title>
<p>In natural language processing (NLP) tasks, deep learning (DL) has gathered considerable attention and is currently widely used. The advent of pre-trained language models in recent years has significantly enhanced the technology of question-answering systems. After pre-training the question-answering system, its transfer learning ability will be stronger, and its application range will be wider. In the training process of the Question Answering Model, complex models and many computing resources are needed to extract information from large and highly redundant datasets. In the experiment, the best models are often large-scale, such as Chat Generative Pre-trained Transformer (ChatGPT) [<xref ref-type="bibr" rid="ref-1">1</xref>], BERT [<xref ref-type="bibr" rid="ref-2">2</xref>], or even integrated by multiple models [<xref ref-type="bibr" rid="ref-3">3</xref>].</p>
<p>Deploying large models in service environments faces several common challenges, which can significantly impede their practicality and efficiency [<xref ref-type="bibr" rid="ref-4">4</xref>&#x2013;<xref ref-type="bibr" rid="ref-6">6</xref>]. These challenges include: 1. Slower inference speed, leading to delays in obtaining results and reduced system responsiveness. 2. High demands on deployment resources, such as memory, make the process resource-intensive. 3. Stringent constraints are required during deployment to achieve low latency and efficient use of computing resources, necessitating careful optimization and planning. Due to the rapid development of portable equipment, some special application situations, such as devices with little memory and low computing capacity, do not support the online calculation of large models. Hence, it becomes imperative to downscale the model to ensure performance [<xref ref-type="bibr" rid="ref-7">7</xref>,<xref ref-type="bibr" rid="ref-8">8</xref>].</p>
<p>Currently, the prevalent techniques for compressing models can be broadly categorized into four groups: (1) parameter pruning and quantization [<xref ref-type="bibr" rid="ref-9">9</xref>], which mainly deletes redundant parameters in the model; (2) low-rank factorization [<xref ref-type="bibr" rid="ref-10">10</xref>], which uses tensor factorization to estimate the parameters of neural networks; (3) transferred/compact convolutional filters [<xref ref-type="bibr" rid="ref-11">11</xref>], which design a particular structure of convolutional filters that can reduce parameter space and save memory; (4) knowledge distillation.</p>
<p>General experience holds that similar scale models must be maintained to retain similar knowledge [<xref ref-type="bibr" rid="ref-12">12</xref>]. This indicates that the parameters of a model determine the amount of knowledge contained in the data captured by the model. This understanding is correct, but the relationship between the parameter quantity contained in a model and the knowledge quantity that can be captured from the original data is not a stable linear relationship but a curve form in which, as the parameter quantity increases, the marginal return gradually decreases. In contrast, even when two models possess identical structures and equivalent parameters, they can assimilate different types of knowledge when trained on the same dataset. One of the critical factors is the selection of training methods. An appropriate training method can help the model capture as much knowledge as possible with a few parameters. This is the primary idea used in knowledge distillation [<xref ref-type="bibr" rid="ref-6">6</xref>,<xref ref-type="bibr" rid="ref-13">13</xref>,<xref ref-type="bibr" rid="ref-14">14</xref>].</p>
<p>Knowledge distillation fundamentally represents a technique for compressing models [<xref ref-type="bibr" rid="ref-14">14</xref>]. The fundamental concept of knowledge distillation is to direct the training of a lightweight model using the trained complex model as a guide and then get a lightweight model with the effect as close as possible to the complex model while simultaneously reducing the computational burden, decreasing the model scale and training time. The complex structure of the teacher network can train a suitable probability distribution, and the small model is the student network. The output probability distribution is employed to fit the distribution of the teacher network to realize knowledge transfer and performance improvement. In general, no distinction will be made between the models used in training and deployment, but there are some inconsistencies between training and deployment.</p>
<p>Hinton et al. [<xref ref-type="bibr" rid="ref-13">13</xref>] put forward the approach of relevant knowledge distillation as early as 2014. He proposed that using a &#x201C;soft label&#x201D; to perform model distillation can improve the effect of the &#x201C;student&#x201D; model. He reported that for the classification prediction probabilities obtained by the complex model after training, although cross-entropy is chosen as the loss function, the score in the category the model considers correct will be particularly large, while the scores in other categories will be particularly low. However, this value with a particularly low score still has a relative role; that is, it can represent the correlation between classes, which cannot be reflected in the original annotation data. For example, in the MNIST handwritten data recognition dataset, the handwritten font &#x201C;2&#x201D; is often very similar to the handwritten font &#x201C;3&#x201D;, but people will only tell it that it corresponds to the label &#x201C;2&#x201D;. Such a hard label results in the model not considering the correlation between handwritten font &#x201C;2&#x201D; and &#x201C;3&#x201D; in the input data. Hence, that study introduced the concept of &#x201C;t&#x201D; (temperature) in the loss function to make a corresponding scaling for the probability value predicted by the model to enlarge the score results of other categories in the model and then let the student model learn this corresponding feature in the distillation stage, thereby reducing the loss of accuracy.</p>
<p>Growing model parameters and slower training speed of pre-training language models make more scholars begin to study the related work of lightweight pre-training language models. Researchers at Hugging Face proposed the DistilBERT model and performed the corresponding knowledge distillation strategy based on BERT [<xref ref-type="bibr" rid="ref-15">15</xref>]. Finally, under the condition of reducing the parameters of BERT by 40%, it can still maintain the original accuracy of BERT by 97% and improve the prediction speed by 60%. The study proposes that in the knowledge distillation stage, in addition to continuing to follow the &#x201C;soft label&#x201D; strategy proposed by Hinton, adding the hidden layer vector between &#x201C;teacher BERT&#x201D; and &#x201C;student BERT&#x201D; can also improve the effect of &#x201C;student BERT&#x201D;. Huawei Noah Ark laboratory has proposed the &#x201C;TinyBERT&#x201D; [<xref ref-type="bibr" rid="ref-16">16</xref>] model, which has made corresponding innovations in the relevant characteristics involved in the knowledge distillation strategy. When calculating the loss function, TinyBERT not only considers the &#x201C;soft label&#x201D; but also believes that the parameters of BERT in the output layer, the hidden layer vector in the transformer structure, and the attention vector positively affect knowledge distillation. Scholars of Huawei Noah&#x2019;s Ark believe that BERT&#x2019;s original &#x201C;pre-training fine-tuning&#x201D; model will cause some difficulty in knowledge distillation, and its semantic difference between the pre-training stage and fine-tuning [<xref ref-type="bibr" rid="ref-17">17</xref>] stage will result in a &#x201C;teacher model&#x201D; that single-stage knowledge distillation cannot learn well. Therefore, TinyBERT proposed a two-stage knowledge distillation strategy. 
Finally, the model parameters of TinyBERT are 7.5 times lower than the original BERT, and the prediction speed is 9.4 times faster. On average, TinyBERT is only 3% lower than the original BERT in nine downstream natural language processing tasks.</p>
<p>This study combines the advantages of the high accuracy of the PAL-BERT model [<xref ref-type="bibr" rid="ref-18">18</xref>] with the short inference time of a small-scale model as BiLSTM [<xref ref-type="bibr" rid="ref-19">19</xref>]. The internal knowledge information of a large model PAL-BERT is transferred to a small model using the method of knowledge distillation to shorten the inference time without compromising model performance.</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Dataset</title>
<p>The Stanford Question Answering Dataset (SQuAD) [<xref ref-type="bibr" rid="ref-20">20</xref>] is widely acknowledged as a benchmark in machine reading comprehension. The dataset comprises a diverse array of elements, including articles, the corresponding fragments within those articles, and questions paired with answers that are directly related to these fragments. Therefore, SQuAD 1.1 and SQuAD 2.0 are used as English datasets. Compared to version 1.1, SQuAD 2.0 expands some simple manually written negative samples other than automatically generated ones. In addition, machine reading comprehension models must account for the presence of unanswerable questions. These models should be capable of determining if a question can be answered based on the provided context. If the context does not support the question, the model should refrain from providing an answer, enhancing the model&#x2019;s practical application value. The so-called &#x201C;sample&#x201D; is a problem corresponding to a fragment in an article. In version 2.0, the ratio of positive to negative samples in the training set of the SQuAD dataset is about 2:1, and the proportion of articles that do not contain negative samples and articles that contain negative samples is also 2:1. However, the development set and test set remove those articles that do not contain negative samples in version 1.1, making the proportion about 1:1. The number and distribution of samples of SQuAD are shown in <xref ref-type="table" rid="table-1">Table 1</xref>.</p>
<table-wrap id="table-1">
<label>Table 1</label>
<caption>
<title>Number and distribution of positive and negative samples in the SQuAD dataset</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th></th>
<th></th>
<th>SQuAD1.1</th>
<th>SQuAD2.0</th>
</tr>
</thead>
<tbody>
<tr>
<td rowspan="4">Train</td>
<td>Total samples</td>
<td>87,599</td>
<td>130,319</td>
</tr>
<tr>
<td>Negative samples</td>
<td>0</td>
<td>43,498</td>
</tr>
<tr>
<td>Total articles</td>
<td>442</td>
<td>442</td>
</tr>
<tr>
<td>Articles with negatives</td>
<td>0</td>
<td>285</td>
</tr>
<tr>
<td rowspan="4">Development</td>
<td>Total samples</td>
<td>10,570</td>
<td>11,873</td>
</tr>
<tr>
<td>Negative samples</td>
<td>0</td>
<td>5945</td>
</tr>
<tr>
<td>Total articles</td>
<td>48</td>
<td>35</td>
</tr>
<tr>
<td>Articles with negatives</td>
<td>0</td>
<td>35</td>
</tr>
<tr>
<td rowspan="4">Test</td>
<td>Total samples</td>
<td>10,570</td>
<td>11,873</td>
</tr>
<tr>
<td>Negative samples</td>
<td>0</td>
<td>5945</td>
</tr>
<tr>
<td>Total articles</td>
<td>48</td>
<td>35</td>
</tr>
<tr>
<td>Articles with negatives</td>
<td>0</td>
<td>35</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The new dataset version includes manually labeled &#x201C;unanswerable&#x201D; questions, serving as diverse negative samples. Even if these negative samples have no correct answers, the model can still pay attention to some relevant texts and give predicted fragments, which seem correct but often wrong, thus increasing the difficulty of the whole task.</p>
<p>In addition, the Chinese machine reading comprehension dataset CMRC 2018 [<xref ref-type="bibr" rid="ref-21">21</xref>] is also used in this research. The dataset content is sourced from Chinese Wikipedia, with manually crafted questions. The training set comprises about 10,000 pieces of data. The preprocessed data portion is listed in <xref ref-type="table" rid="table-2">Table 2</xref>. Given the gaps between Chinese and English, it is also a supplement for non-English cases. Every article provides multiple relevant questions, each accompanied by several manually annotated reference answers. The six problem types are displayed in <xref ref-type="table" rid="table-3">Table 3</xref>.</p>
<table-wrap id="table-2">
<label>Table 2</label>
<caption>
<title>CMRC 2018 sample quantity</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th></th>
<th>Train</th>
<th>Development</th>
<th>Test</th>
<th>Challenge</th>
</tr>
</thead>
<tbody>
<tr>
<td>Number of questions</td>
<td>10,321</td>
<td>3351</td>
<td>4895</td>
<td>504</td>
</tr>
<tr>
<td>Average answers per question</td>
<td>1</td>
<td>3</td>
<td>3</td>
<td>3</td>
</tr>
<tr>
<td>Maximum article characters</td>
<td>962</td>
<td>961</td>
<td>980</td>
<td>916</td>
</tr>
<tr>
<td>Maximum question characters</td>
<td>89</td>
<td>56</td>
<td>50</td>
<td>47</td>
</tr>
<tr>
<td>Maximum answer characters</td>
<td>100</td>
<td>85</td>
<td>92</td>
<td>77</td>
</tr>
<tr>
<td>Average article characters</td>
<td>452</td>
<td>469</td>
<td>472</td>
<td>464</td>
</tr>
<tr>
<td>Average question characters</td>
<td>15</td>
<td>15</td>
<td>15</td>
<td>18</td>
</tr>
<tr>
<td>Average answer characters</td>
<td>17</td>
<td>9</td>
<td>9</td>
<td>19</td>
</tr>
</tbody>
</table>
</table-wrap><table-wrap id="table-3">
<label>Table 3</label>
<caption>
<title>CMRC2018 question type statistics</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th>Question type</th>
<th>Percentage</th>
</tr>
</thead>
<tbody>
<tr>
<td>When</td>
<td>12.8%</td>
</tr>
<tr>
<td>Where</td>
<td>12.3%</td>
</tr>
<tr>
<td>Who</td>
<td>8.6%</td>
</tr>
<tr>
<td>What</td>
<td>7.8%</td>
</tr>
<tr>
<td>Why</td>
<td>5.7%</td>
</tr>
<tr>
<td>How</td>
<td>1.2%</td>
</tr>
<tr>
<td>Others</td>
<td>51.4%</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3">
<label>3</label>
<title>Method</title>
<sec id="s3_1">
<label>3.1</label>
<title>Knowledge Distillation</title>
<sec id="s3_1_1">
<label>3.1.1</label>
<title>Soft Label-Based Knowledge Distillation</title>
<p>In the process of knowledge distillation, this study calls the original large model teacher model, the new small model student model, the label in the training set hard label, the probability output predicted by the teacher model soft label, and temperature (T) is employed to adjust the hyperparameters of the soft label, as depicted in <xref ref-type="fig" rid="fig-1">Fig. 1</xref>.</p>
<fig id="fig-1">
<label>Figure 1</label>
<caption>
<title>Soft label-based knowledge distillation</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_52622-fig-1.tif"/>
</fig>
<p>When training the student model, the KL divergence within the probability distribution of the output category is added to the loss function for classification tasks. The teacher model output <italic>T</italic> can be expressed as <inline-formula id="ieqn-1"><mml:math id="mml-ieqn-1"><mml:msup><mml:mi>P</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>a</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>, where <inline-formula id="ieqn-2"><mml:math id="mml-ieqn-2"><mml:msub><mml:mi>a</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the previous layer output of SoftMax, and student model output S can be described as <inline-formula id="ieqn-3"><mml:math id="mml-ieqn-3"><mml:msup><mml:mi>P</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>a</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>, where <inline-formula id="ieqn-4"><mml:math id="mml-ieqn-4"><mml:msub><mml:mi>a</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the previous layer output. 
Knowledge distillation makes the output of the student model <inline-formula id="ieqn-5"><mml:math id="mml-ieqn-5"><mml:msup><mml:mi>P</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> close to that of the teacher model <inline-formula id="ieqn-6"><mml:math id="mml-ieqn-6"><mml:msup><mml:mi>P</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> through loss function. Due to the operation of <inline-formula id="ieqn-7"><mml:math id="mml-ieqn-7"><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>a</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>, the model&#x2019;s output for a specific class can exhibit a high probability value nearing 1, while simultaneously displaying low probabilities nearing 0 for other classes, so that the output is close to one-hot encoding. Therefore, a temperature parameter <inline-formula id="ieqn-8"><mml:math id="mml-ieqn-8"><mml:mi>&#x03C4;</mml:mi><mml:mo>&#x2265;</mml:mo><mml:mn>1</mml:mn></mml:math></inline-formula> is usually added to the operation to make the output distribution more average. At the same time, smoothing the outputs of the teacher and student models yields the following:
<disp-formula id="eqn-1"><label>(1)</label><mml:math id="mml-eqn-1" display="block"><mml:msubsup><mml:mi>P</mml:mi><mml:mrow><mml:mi>&#x03C4;</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mfrac><mml:msub><mml:mi>a</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mi>&#x03C4;</mml:mi></mml:mfrac><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula>
<disp-formula id="eqn-2"><label>(2)</label><mml:math id="mml-eqn-2" display="block"><mml:msubsup><mml:mi>P</mml:mi><mml:mrow><mml:mi>&#x03C4;</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mfrac><mml:msub><mml:mi>a</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub><mml:mi>&#x03C4;</mml:mi></mml:mfrac><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>The loss function of knowledge distillation can be expressed as follows:</p>
<p><disp-formula id="eqn-3"><label>(3)</label><mml:math id="mml-eqn-3" display="block"><mml:mtable columnalign="right left right left right left right left right left right left" rowspacing="3pt" columnspacing="0em 2em 0em 2em 0em 2em 0em 2em 0em 2em 0em" displaystyle="true"><mml:mtr><mml:mtd><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>k</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">S</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mi></mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>&#x03B1;</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2217;</mml:mo><mml:mi>H</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>u</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msup><mml:mi>P</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msup><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mi>&#x03B1;</mml:mi><mml:mo>&#x2217;</mml:mo><mml:mi>K</mml:mi><mml:mi>L</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msubsup><mml:mi>P</mml:mi><mml:mrow><mml:mi>&#x03C4;</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x2225;</mml:mo><mml:msubsup><mml:mi>P</mml:mi><mml:mrow><mml:mi>&#x03C4;</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msubsup><mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd 
/><mml:mtd><mml:mi></mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>&#x03B1;</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2217;</mml:mo><mml:mi>H</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>u</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msup><mml:mi>P</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msup><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mi>&#x03B1;</mml:mi><mml:mo>&#x2217;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mi>H</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msubsup><mml:mi>P</mml:mi><mml:mrow><mml:mi>&#x03C4;</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mi>P</mml:mi><mml:mrow><mml:mi>&#x03C4;</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msubsup><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mi>H</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msubsup><mml:mi>P</mml:mi><mml:mrow><mml:mi>&#x03C4;</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>where <inline-formula id="ieqn-9"><mml:math id="mml-ieqn-9"><mml:msub><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">S</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the parameter of the student model; <inline-formula id="ieqn-10"><mml:math id="mml-ieqn-10"><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>u</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the distribution of real labels; <inline-formula id="ieqn-11"><mml:math id="mml-ieqn-11"><mml:mi>K</mml:mi><mml:mi>L</mml:mi><mml:mo>,</mml:mo><mml:mi>H</mml:mi></mml:math></inline-formula> are divergence and cross-entropy; <inline-formula 
id="ieqn-12"><mml:math id="mml-ieqn-12"><mml:mi>&#x03B1;</mml:mi><mml:mo>,</mml:mo><mml:mi>&#x03B1;</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">]</mml:mo></mml:math></inline-formula> is a hyperparameter controlling the balance between the cross-entropy of the model output distribution with the actual labels and the divergence between the student model output and the teacher model output.</p>
</sec>
<sec id="s3_1_2">
<label>3.1.2</label>
<title>Representation-Based Knowledge Distillation</title>
<p>The schematic diagram depicting knowledge distillation based on representation is given in <xref ref-type="fig" rid="fig-2">Fig. 2</xref>. This approach compares the output representation of the teacher model&#x2019;s <italic>i</italic>-th layer and that of the student model&#x2019;s <italic>j</italic>-th layer. The dimensions of these representations can differ, and the corresponding relationships between the dimensions can also vary. In order to address this, a linear regression can be performed to align the output representation of the student model with that of the teacher model. The loss function for knowledge distillation, as depicted in <xref ref-type="disp-formula" rid="eqn-4">Eq. (4)</xref>, captures this alignment process.</p>
<p><disp-formula id="eqn-4"><label>(4)</label><mml:math id="mml-eqn-4" display="block"><mml:msubsup><mml:mi>L</mml:mi><mml:mrow><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">T</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">S</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msubsup><mml:mi>U</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo>(</mml:mo><mml:mi>x</mml:mi><mml:mo>;</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:msubsup><mml:mi>U</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msubsup><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:mi>x</mml:mi><mml:mo>;</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:mfrac><mml:mo>&#x2212;</mml:mo><mml:msup><mml:mfrac><mml:mrow><mml:msubsup><mml:mi>U</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo>(</mml:mo><mml:mi>x</mml:mi><mml:mo>;</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">s</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2217;</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mrow><mml:mi 
mathvariant="bold-italic">r</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msubsup><mml:mi>U</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo>(</mml:mo><mml:mi>x</mml:mi><mml:mo>;</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2217;</mml:mo><mml:msup><mml:mrow><mml:msub><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">r</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:mfrac><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:math></disp-formula>where <inline-formula id="ieqn-13"><mml:math id="mml-ieqn-13"><mml:msub><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> are parameters of the teacher model and student model, respectively; <inline-formula id="ieqn-14"><mml:math id="mml-ieqn-14"><mml:msubsup><mml:mi>U</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mi>U</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> are the calculation functions of the teacher model and student model for input <inline-formula id="ieqn-15"><mml:math id="mml-ieqn-15"><mml:mi>x</mml:mi></mml:math></inline-formula> to Transformer output of layer <inline-formula id="ieqn-16"><mml:math id="mml-ieqn-16"><mml:mi>i</mml:mi></mml:math></inline-formula> and layer <inline-formula id="ieqn-17"><mml:math id="mml-ieqn-17"><mml:mi>j</mml:mi></mml:math></inline-formula>; <inline-formula id="ieqn-18"><mml:math id="mml-ieqn-18"><mml:msub><mml:mi 
mathvariant="bold-italic">W</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">r</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is regression parameter matrix.</p>
<fig id="fig-2">
<label>Figure 2</label>
<caption>
<title>Representation-based knowledge distillation</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_52622-fig-2.tif"/>
</fig>
</sec>
<sec id="s3_1_3">
<label>3.1.3</label>
<title>Attention-Based Knowledge Distillation</title>
<p>The 2-norm of the feature vector at different positions of the image output by the convolution layer can represent the attention distribution of the model over the image [<xref ref-type="bibr" rid="ref-22">22</xref>]. The self-attention layer in the transformer structure contains the attention distribution of each input word to all other words in the text. In a self-attention layer, the attention distribution matrix for all input words is <inline-formula id="ieqn-19"><mml:math id="mml-ieqn-19"><mml:mi>A</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>l</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, where <inline-formula id="ieqn-20"><mml:math id="mml-ieqn-20"><mml:mi>l</mml:mi></mml:math></inline-formula> is the input text length.</p>
<p>The loss of knowledge distillation between the attention matrix of the output of layer <inline-formula id="ieqn-21"><mml:math id="mml-ieqn-21"><mml:mi>i</mml:mi></mml:math></inline-formula> of the teacher model and the attention matrix of the output of layer <inline-formula id="ieqn-22"><mml:math id="mml-ieqn-22"><mml:mi>j</mml:mi></mml:math></inline-formula> of the student model is as shown in <xref ref-type="disp-formula" rid="eqn-5">Eq. (5)</xref>.</p>
<p><disp-formula id="eqn-5"><label>(5)</label><mml:math id="mml-eqn-5" display="block"><mml:msubsup><mml:mi>L</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">T</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">s</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mo symmetric="true">&#x2016;</mml:mo><mml:mi>A</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:msubsup><mml:mi>n</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo>(</mml:mo><mml:mi>x</mml:mi><mml:mo>;</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">T</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mi>A</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:msubsup><mml:mi>n</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo>(</mml:mo><mml:mi>x</mml:mi><mml:mo>;</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">S</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo symmetric="true">&#x2016;</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:math></disp-formula>where <inline-formula id="ieqn-23"><mml:math 
id="mml-ieqn-23"><mml:mi>A</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:msubsup><mml:mi>n</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mi>A</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:msubsup><mml:mi>n</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> are the calculation functions of the teacher model and student model for the self-attention output of layer <italic>i</italic> and layer <italic>j</italic>, respectively.</p>
<p>The schematic diagram of attention-based knowledge distillation is shown in <xref ref-type="fig" rid="fig-3">Fig. 3</xref>.</p>
<fig id="fig-3">
<label>Figure 3</label>
<caption>
<title>Schematic diagram of attention-based knowledge distillation</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_52622-fig-3.tif"/>
</fig>
<p>Combined with these three knowledge distillation methods, this study simultaneously adds the losses of the above three knowledge distillations to the training objectives so the student model can learn the teacher model from multiple angles. The loss of mixed knowledge distillation is given in <xref ref-type="disp-formula" rid="eqn-6">Eq. (6)</xref>.<disp-formula id="eqn-6"><label>(6)</label><mml:math id="mml-eqn-6" display="block"><mml:mi>L</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>k</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mi>&#x03B2;</mml:mi><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:munder><mml:msup><mml:mi>&#x03B7;</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msup><mml:msubsup><mml:mi>L</mml:mi><mml:mrow><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi 
mathvariant="bold-italic">W</mml:mi><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mi>&#x03B3;</mml:mi><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:munder><mml:msup><mml:mi>&#x03B7;</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msup><mml:msubsup><mml:mi>L</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mi>t</mml:mi><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula>where <inline-formula id="ieqn-24"><mml:math id="mml-ieqn-24"><mml:mi>&#x03B2;</mml:mi><mml:mo>,</mml:mo><mml:mi>&#x03B3;</mml:mi></mml:math></inline-formula> are hyperparameters controlling the proportion of the loss of middle layer representation and attention in the final loss function, respectively; <inline-formula id="ieqn-25"><mml:math id="mml-ieqn-25"><mml:msup><mml:mi>&#x03B7;</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mi>&#x03B7;</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msup><mml:mo>&#x2208;</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">]</mml:mo></mml:math></inline-formula> is the weight of knowledge distillation loss in different layers.</p>
</sec>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Design of Knowledge Distillation Model Based on PAL-BERT</title>
<sec id="s3_2_1">
<label>3.2.1</label>
<title>Distillation Scheme</title>
<p>Although ALBERT (A Lite BERT)-based models have considerably fewer parameters than the original BERT model, they are still too large for practical online applications. In addition, although ALBERT&#x2019;s model compression is evident during training, it does not reduce inference time during the inference stage. For offline data processing scenarios, where time requirements are generally less demanding, the ALBERT [<xref ref-type="bibr" rid="ref-23">23</xref>] model can be effectively employed due to its significant performance gains. However, further compression of the model is crucial to reduce inference time for online tasks. This study optimizes the PAL-BERT model using the method of knowledge distillation, reducing inference time while preserving accuracy as much as possible. PAL-BERT is a first-order pruning model proposed based on the ALBERT model, demonstrating outstanding performance in question-answering tasks. PAL-BERT can provide good efficiency while maintaining high performance, which is ideal for teacher models as it requires processing a large amount of input data and generating high-quality outputs for student model learning.</p>
<p>At present, most knowledge distillation based on a pre-training language model often needs to be carried out in the pre-training stage or fine-tuning stage. For example, the knowledge distillation strategy adopted in distilling BERT is to distill knowledge while pre-training, and the obtained student model is directly used for fine-tuning each downstream task. In TinyBERT [<xref ref-type="bibr" rid="ref-16">16</xref>], it proposes a two-stage distillation strategy. In the pre-training task stage, knowledge distillation is performed through large-scale unsupervised corpus to obtain the student model in the general field. Then, during fine-tuning, the general student model obtained in the previous step is employed for knowledge distillation to obtain the student model finally used for specific downstream tasks.</p>
<p>This study refers to the knowledge distillation strategy of the above literature [<xref ref-type="bibr" rid="ref-24">24</xref>&#x2013;<xref ref-type="bibr" rid="ref-26">26</xref>], and combined with the current QA scene, it is considered that the distillation strategy can be conducted in the fine-tuning stage. The specific process uses the original pre-training model to get the teacher model in the downstream task fine-tuning and then keeps all the trainable parameters unchanged. Then, the trained teacher model is employed to facilitate the training of the student model, enabling the latter to acquire the pre-existing knowledge possessed by the former. Unlike previous methods that perform knowledge distillation during pre-training, this paper introduces a strategy that applies distillation during the fine-tuning stage for QA. This approach is more efficient because the already pre-trained model requires less time to adapt to the task. The fine-tuning process allows the model to concentrate on QA-specific patterns, enhancing the distillation&#x2019;s relevance and effectiveness. Additionally, distillation in pre-training aims to maintain model portability, which is unnecessary for our focused QA scenario. By distilling knowledge directly related to QA during fine-tuning, this study ensures that only the essential knowledge is transferred, optimizing the training process.</p>
<p>In this process, the teacher model is the source of knowledge and success for the student model, which acts as the recipient. The specific structure is depicted in <xref ref-type="fig" rid="fig-4">Fig. 4</xref>. Distillation loss refers to loss calculated with both the student and teacher models, which are the representation loss and attention loss, while student loss refers to loss only correlated with the student model, which is <inline-formula id="ieqn-26"><mml:math id="mml-ieqn-26"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>k</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>.</p>
<fig id="fig-4">
<label>Figure 4</label>
<caption>
<title>Knowledge distillation structure</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_52622-fig-4.tif"/>
</fig>
<p>The knowledge distillation process typically comprises two stages: the original model training stage and the small model training stage. During the former, the focus is on training the teacher model, characterized by its complexity and ability to effectively capture information from the original data. It can even consist of multiple separately trained models. In the latter stage, the objective is to train the student models, which are typically smaller with fewer parameters and a simpler model structure.</p>
<p>The teacher model used in this study is PAL-BERT, and the student models include BiLSTM and TextCNN [<xref ref-type="bibr" rid="ref-27">27</xref>]. The distillation model based on PAL-BERT is named DPAL-BERT.</p>
</sec>
<sec id="s3_2_2">
<label>3.2.2</label>
<title>Data Augmentation</title>
<p>In the task of knowledge distillation, a small dataset cannot effectively let the teacher network express all its information. Therefore, many unlabeled data with the prediction results of the teacher network are needed to expand the dataset so that effective knowledge can be fully displayed.</p>
<p>Data augmentation in NLP is much more difficult than in image processing. Image data can generate near-natural images by rotating, adding noise, or other deformations. However, if a sentence in natural language processing is manually operated, the fluency of the sentence becomes lower, and this approach does not play a prominent role in NLP.</p>
<p>In order to expand the amount of data, the method of modifying sentences is employed in a manner similar to the masked language model in BERT. This study refers to the data augmentation method in [<xref ref-type="bibr" rid="ref-28">28</xref>] with some modifications. There are three data augmentation methods:</p>
<p>1. Masking. For each word in the text, a symbol &#x003C;mask&#x003E; would replace it with a certain probability <inline-formula id="ieqn-27"><mml:math id="mml-ieqn-27"><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. It helps to understand the contribution of different words in the text to the label.</p>
<p>2. Replacing. For a word in the text, it is replaced with another randomly sampled synonym with a certain probability <inline-formula id="ieqn-28"><mml:math id="mml-ieqn-28"><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mrow><mml:mtext>syn</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula>.</p>
<p>3. N-gram sampling. For text data, an n-gram is randomly sampled with a certain probability <inline-formula id="ieqn-29"><mml:math id="mml-ieqn-29"><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>n</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, n ranges from 1 to 5. This method randomly selects a sequence of n consecutive words (an n-gram) from the text, and all other words are masked or removed. It is an extreme masking approach.</p>
<p>The specific use process is as follows: for the text to be processed, each position is iterated based on the uniform distribution. For each word &#x1D714;, a real number <inline-formula id="ieqn-30"><mml:math id="mml-ieqn-30"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is randomly generated between 0 and 1. If <inline-formula id="ieqn-31"><mml:math id="mml-ieqn-31"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x003C;</mml:mo><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, it will be masked. If <inline-formula id="ieqn-32"><mml:math id="mml-ieqn-32"><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>&#x003C;</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x003C;</mml:mo><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mi>y</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, it will be replaced. Masking and replacing are mutually exclusive operations; once one rule is satisfied, the other is ignored. After the iteration, the processed samples are sampled at all locations with probability <inline-formula id="ieqn-33"><mml:math id="mml-ieqn-33"><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>n</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. Finally, the resulting sample is added to the dataset as unlabeled data. For each data item, this study iterates n times to obtain up to n samples and discards the repeated samples.</p>
</sec>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experiments and Results</title>
<p>For experiments, two variants of DPAL-BERT models are built: DPAL-BERT-Bi and DPAL-BERT-C. Both models adopt the PAL-BERT model [<xref ref-type="bibr" rid="ref-18">18</xref>] as the teacher network. BiLSTM [<xref ref-type="bibr" rid="ref-19">19</xref>] and TextCNN [<xref ref-type="bibr" rid="ref-27">27</xref>] are used as the student network for constructing DPAL-BERT-Bi and DPAL-BERT-C, respectively. For the parameters of the data augmentation part, <inline-formula id="ieqn-34"><mml:math id="mml-ieqn-34"><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mi>y</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>0.1</mml:mn><mml:mo>,</mml:mo><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>n</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>0.25</mml:mn><mml:mo>,</mml:mo><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mn>10.</mml:mn></mml:math></inline-formula></p>
<sec id="s4_1">
<label>4.1</label>
<title>Optimizing Random Masking for Adjacent Word Segmentations</title>
<p>This section introduces an optimization technique involving the application of masks to adjacent word segments instead of random individual words. The masking step is to sample a subset <inline-formula id="ieqn-35"><mml:math id="mml-ieqn-35"><mml:mi>Y</mml:mi></mml:math></inline-formula> from the word set <inline-formula id="ieqn-36"><mml:math id="mml-ieqn-36"><mml:mi>X</mml:mi></mml:math></inline-formula> and replace it with another word set. In ALBERT, the subset <inline-formula id="ieqn-37"><mml:math id="mml-ieqn-37"><mml:mi>Y</mml:mi></mml:math></inline-formula> is selected at random, and the selection of each word is independent. The subset <inline-formula id="ieqn-38"><mml:math id="mml-ieqn-38"><mml:mi>Y</mml:mi></mml:math></inline-formula> accounts for 15% of the word set <inline-formula id="ieqn-39"><mml:math id="mml-ieqn-39"><mml:mi>X</mml:mi></mml:math></inline-formula>. 80% of the words in the subset <inline-formula id="ieqn-40"><mml:math id="mml-ieqn-40"><mml:mi>Y</mml:mi></mml:math></inline-formula> are substituted by [MASK], and 10% of the words are replaced by random words according to the unigram distribution, leaving 10% unchanged.</p>
<p>In this study, the model subset <inline-formula id="ieqn-41"><mml:math id="mml-ieqn-41"><mml:mi>Y</mml:mi></mml:math></inline-formula> is obtained by selecting adjacent word segmentation, and the scale and masking method of the model are unchanged. Specifically, for each word sequence <inline-formula id="ieqn-42"><mml:math id="mml-ieqn-42"><mml:mi>X</mml:mi><mml:mo>=</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>, words are selected by iteratively sampling the word segmentation of the text until the masking scale (15% of the whole word set) is reached and a subset is formed. The process begins by sampling the length of each word segment from a geometric distribution <inline-formula id="ieqn-43"><mml:math id="mml-ieqn-43"><mml:mi>I</mml:mi><mml:mo>&#x223C;</mml:mo><mml:mi>G</mml:mi><mml:mi>e</mml:mi><mml:mi>o</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>p</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>, where <inline-formula id="ieqn-44"><mml:math id="mml-ieqn-44"><mml:mi>p</mml:mi></mml:math></inline-formula> is set to 0.2. This sampling determines the number of words in each segment. To ensure manageable segments, the maximum allowable length of any given word segment is ten words. The geometric distribution is skewed and tends to shorten word segmentation, with an average word segmentation length of 3.8 words. The starting point of word segmentation is randomly selected. Combined with the above text length, the subset <inline-formula id="ieqn-45"><mml:math id="mml-ieqn-45"><mml:mi>Y</mml:mi></mml:math></inline-formula> can be obtained by sampling.</p>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Ablation Study on T and <inline-formula id="ieqn-46"><mml:math id="mml-ieqn-46"><mml:mi mathvariant="bold-italic">&#x03B1;</mml:mi></mml:math></inline-formula></title>
<p>In the distillation model, two hyperparameters T and <italic>&#x03B1;</italic> must be determined during the experiment. Grid search is applied to find the best <inline-formula id="ieqn-47"><mml:math id="mml-ieqn-47"><mml:mi>&#x03B1;</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:mn>0.1</mml:mn><mml:mo>,</mml:mo><mml:mn>0.2</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:mn>0.9</mml:mn><mml:mo stretchy="false">]</mml:mo></mml:math></inline-formula> and <inline-formula id="ieqn-48"><mml:math id="mml-ieqn-48"><mml:mi>T</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mn>3</mml:mn><mml:mo stretchy="false">]</mml:mo></mml:math></inline-formula> with DPAL-BERT-Bi model to find the best hyperparameters. The experimental results of the impact of two parameters on the final accuracy are listed in <xref ref-type="table" rid="table-4">Table 4</xref>.</p>
<table-wrap id="table-4">
<label>Table 4</label>
<caption>
<title>Experimental results of different combinations of parameters T and &#x03B1;</title>
</caption>
<table frame="hsides">
<colgroup>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
</colgroup>
<thead>
<tr>
<th><italic>&#x03B1;</italic></th>
<th>0.1</th>
<th>0.2</th>
<th>0.3</th>
<th>0.4</th>
<th>0.5</th>
<th>0.6</th>
<th>0.7</th>
<th>0.8</th>
<th>0.9</th>
</tr>
</thead>
<tbody>
<tr>
<td>T &#x003D; 1</td>
<td>0.751</td>
<td>0.769</td>
<td>0.786</td>
<td>0.791</td>
<td>0.793</td>
<td>0.766</td>
<td>0.763</td>
<td>0.762</td>
<td>0.761</td>
</tr>
<tr>
<td>T &#x003D; 2</td>
<td>0.775</td>
<td>0.790</td>
<td>0.789</td>
<td>0.801</td>
<td>0.795</td>
<td>0.784</td>
<td>0.769</td>
<td>0.752</td>
<td>0.751</td>
</tr>
<tr>
<td>T &#x003D; 3</td>
<td>0.735</td>
<td>0.748</td>
<td>0.763</td>
<td>0.745</td>
<td>0.767</td>
<td>0.756</td>
<td>0.743</td>
<td>0.736</td>
<td>0.748</td>
</tr>
</tbody>
</table>
</table-wrap>
<p><xref ref-type="fig" rid="fig-5">Fig. 5</xref> indicates that the optimal configuration for the hyperparameters is achieved with a combination of <inline-formula id="ieqn-49"><mml:math id="mml-ieqn-49"><mml:mi>T</mml:mi><mml:mo>=</mml:mo><mml:mn>2</mml:mn></mml:math></inline-formula> and <inline-formula id="ieqn-50"><mml:math id="mml-ieqn-50"><mml:mi>&#x03B1;</mml:mi><mml:mo>=</mml:mo><mml:mn>0.5</mml:mn></mml:math></inline-formula>. Hence, these values are adopted as the standard settings for these hyperparameters in all subsequent experiments.</p>
<fig id="fig-5">
<label>Figure 5</label>
<caption>
<title>Model performance on varying T and <italic>&#x03B1;</italic></title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_52622-fig-5.tif"/>
</fig>
<p>The impact of varying the temperature parameter, particularly when it is increased to 3, can be understood in terms of its effect on the student network&#x2019;s attention to negative labels during training. When the temperature is low, less attention is paid to negative labels, especially those significantly lower than the average value. However, as the temperature rises, the relative importance of these negative labels increases, causing the student network to focus more on them.</p>
<p>Although negative labels contain helpful information, particularly those with values significantly above the average, the training process of the teacher network often introduces substantial noise in these labels. This noise tends to reduce the reliability of information from negative labels, especially as their values decrease. Hence, an excessively high-temperature value can lead to a decrease in the student network&#x2019;s accuracy.</p>
<p>The following selection rules can be applied to optimize the use of the temperature parameter in training: 1. A higher temperature should be used when learning from negative labels that carry meaningful information. 2. A lower temperature is preferable to minimize the influence of noise on negative labels.</p>
<p>The data augmentation technique employed in this study can generate a substantial volume of unlabeled data, significantly expanding the dataset used for training. The impact of data augmentation on the performance is shown in <xref ref-type="table" rid="table-5">Table 5</xref>.</p>
<table-wrap id="table-5">
<label>Table 5</label>
<caption>
<title>Impact of data enlargement on model performance</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th>Models</th>
<th>Precision</th>
<th>Recall</th>
<th>F1</th>
</tr>
</thead>
<tbody>
<tr>
<td>With data enlargement</td>
<td>0.803</td>
<td>0.791</td>
<td>0.785</td>
</tr>
<tr>
<td>Without data enlargement</td>
<td>0.766</td>
<td>0.748</td>
<td>0.753</td>
</tr>
</tbody>
</table>
</table-wrap>
<p><xref ref-type="table" rid="table-5">Table 5</xref> indicates that incorporating data augmentation, coupled with the addition of unlabeled data to the training process, results in a performance improvement, with an accuracy increase of approximately 4%. This shows that applying unlabeled data augmentation in knowledge distillation is very necessary. A plausible explanation for this enhancement is that using a large volume of unlabeled data allows for a more comprehensive representation of relevant knowledge from the larger model. Then, the smaller model can learn more effectively, improving overall performance. The effectiveness of this approach is further evidenced by the prediction results on the SQuAD 2.0 and CMRC 2018 development sets, as detailed in <xref ref-type="table" rid="table-6">Tables 6</xref> and <xref ref-type="table" rid="table-7">7</xref>, respectively.</p>
<table-wrap id="table-6">
<label>Table 6</label>
<caption>
<title>SQuAD 2.0 development set sample forecast results example</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
</colgroup>
<tbody>
<tr>
<td><bold>[article]</bold> In 2014, economists with the Standard &#x0026; Poor&#x2019;s rating agency concluded that the widening disparity between the US&#x2019;s wealthiest citizens and the rest of the nation had slowed its recovery from the 2008&#x2013;2009 recession and made it more prone to boom-and-bust cycles. To partially remedy the wealth gap and the resulting slow growth, S&#x0026;P recommended increasing access to education. It estimated that if the average United States worker had completed just one more year of school, it would add $105 billion in growth to the country&#x2019;s economy over five years.</td>
</tr>
<tr>
<td><bold>[question 1]</bold> How much potential economic growth could the US amass if everyone went through more schooling?</td>
</tr>
<tr>
<td><bold>Reference Answer 1</bold>: $105 billion</td>
</tr>
<tr>
<td><bold>[Forecast Answer]</bold> $105 billion</td>
</tr>
<tr>
<td><bold>[question 2]</bold> What is the United States at risk for because of the recession of 2008?</td>
</tr>
<tr>
<td><bold>Reference Answer 1</bold>: boom-and-bust cycles</td>
</tr>
<tr>
<td><bold>[Forecast Answer]</bold> boom-and-bust cycles</td>
</tr>
<tr>
<td><bold>[question 3]</bold> Who concluded that the rising income inequality gap was not getting better?</td>
</tr>
<tr>
<td><bold>Reference Answer 1</bold>: Standard &#x0026; Poor</td>
</tr>
<tr>
<td><bold>Reference Answer 2</bold>: economists with the Standard &#x0026; Poor&#x2019;s rating agency</td>
</tr>
<tr>
<td><bold>[Forecast Answer]</bold> &#x003C;No Answer&#x003E;</td>
</tr>
<tr>
<td><bold>[question 4]</bold> What is the United States at risk for because of the recession of 2000?</td>
</tr>
<tr>
<td><bold>Reference Answer 1</bold>: &#x003C;No Answer&#x003E;</td>
</tr>
<tr>
<td><bold>[Forecast Answer]</bold> &#x003C;No Answer&#x003E;</td>
</tr>
</tbody>
</table>
</table-wrap><table-wrap id="table-7">
<label>Table 7</label>
<caption>
<title>CMRC 2018 development set sample forecast result example</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
</colgroup>
<tbody>
<tr>
<td><bold>[article]</bold> Electrostatic induction is the redistribution of charge in an object due to the influence of external charge. This phenomenon was discovered by the British scientist John Canton and a Swedish scientist in 1753 and 1762, respectively. Normal substances have the same amount of positive and negative charges, so they are generally uncharged. If a charged object is placed close to an uncharged conductor, such as a piece of metal, the charge on the conductor will be redistributed. For example, if a positively charged object is brought close to a metal, the negative charge on the metal will be attracted, and the positive charge will be repelled. This leads to a negative charge in the part of the metal close to the external charge and a positive charge in the part far away from the external charge.</td>
</tr>
<tr>
<td><bold>[question]</bold> When was electrostatic induction discovered?</td>
</tr>
<tr>
<td><bold>Reference Answer 1</bold>: 1753 and 1762</td>
</tr>
<tr>
<td><bold>Reference Answer 2</bold>: It was discovered in 1753 and 1762</td>
</tr>
<tr>
<td><bold>[Forecast Answer]</bold> 1753 and 1762</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Model Performance of DPAL-BERT</title>
<p>To evaluate the model performance and robustness of the proposed DPAL-BERT, two variants of DPAL-BERT models, DPAL-BERT-Bi and DPAL-BERT-C, are tested in the CMRC dataset. Results are given in <xref ref-type="table" rid="table-8">Table 8</xref>.</p>
<table-wrap id="table-8">
<label>Table 8</label>
<caption>
<title>Comparison of results of knowledge distillation models</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th>Models</th>
<th>Precision</th>
<th>Recall</th>
<th>F1</th>
</tr>
</thead>
<tbody>
<tr>
<td>DPAL-BERT-Bi</td>
<td>0.803</td>
<td>0.791</td>
<td>0.785</td>
</tr>
<tr>
<td>DPAL-BERT-C</td>
<td>0.786</td>
<td>0.778</td>
<td>0.776</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>BiLSTM and TextCNN are trained from scratch without a word vector to evaluate the effectiveness of knowledge distillation. The obtained results are listed in <xref ref-type="table" rid="table-9">Table 9</xref>. This study reveals that knowledge distillation significantly outperforms the small models trained without word vectors. BiLSTM and TextCNN, when trained directly on the dataset, achieve a maximum accuracy of only 67.7%. It indicates the challenges small models face in capturing the intricacies of diverse samples. In contrast, after applying knowledge distillation, the accuracy of the distilled models exceeds 80%, which is nearly 13% higher than the small models and about 4% higher than traditional models utilizing word vectors. Through the above experiments, the knowledge distillation demonstrates a remarkable efficiency in enhancing the accuracy of smaller models.</p>
<table-wrap id="table-9">
<label>Table 9</label>
<caption>
<title>Performance of BiLSTM and TextCNN without word vectors</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th>Models</th>
<th>Precision</th>
<th>Recall</th>
<th>F1</th>
</tr>
</thead>
<tbody>
<tr>
<td>BiLSTM</td>
<td>0.677</td>
<td>0.594</td>
<td>0.683</td>
</tr>
<tr>
<td>TextCNN</td>
<td>0.661</td>
<td>0.678</td>
<td>0.615</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4_4">
<label>4.4</label>
<title>Inference Speed Comparison</title>
<p><xref ref-type="table" rid="table-10">Table 10</xref> compares the number of parameters and inference time between the distilled model DPAL-BERT-Bi and the teacher model PAL-BERT. The inference time is the duration required to process the dataset using the trained models. For a fair comparison, the batch size for both models is set to 32. The results reveal that the DPAL-BERT-Bi model has nearly 20 times fewer parameters than the PAL-BERT model. In addition, its inference time is substantially lower. Specifically, the distilled model&#x2019;s inference process is approximately 423 times faster than that of the PAL-BERT model.</p>
<table-wrap id="table-10">
<label>Table 10</label>
<caption>
<title>Comparison of parameters and inference time between distillation model and original large model</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th>Models</th>
<th>Parameter quantity (millions)</th>
<th>Inference time (seconds)</th>
</tr>
</thead>
<tbody>
<tr>
<td>PAL-BERT</td>
<td>19</td>
<td>88836</td>
</tr>
<tr>
<td>DPAL-BERT-Bi</td>
<td>0.97</td>
<td>210</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Discussion</title>
<p>The knowledge distillation method has many advantages, such as reducing the depth of the model, significantly reducing the computational cost, and directly accelerating the model without specific hardware requirements. Developing more methods based on knowledge distillation and exploring how to improve its performance is paramount. Using the method of knowledge distillation, the PAL-BERT model was employed as the teacher network, with BiLSTM and TextCNN serving as the student networks to develop two models, DPAL-BERT-Bi and DPAL-BERT-C, and the effectiveness of the method was verified through experiments.</p>
<p>Knowledge distillation successfully facilitates knowledge transfer from the large model PAL-BERT to the small models such as BiLSTM and TextCNN. After knowledge distillation, the accuracy is 13% higher than training directly on the small model. However, it is essential to acknowledge the inherent limitations in the representational capacity of smaller models compared to more complex ones like ALBERT. Although a significant portion of knowledge from PAL-BERT is transferred to BiLSTM, some knowledge remains untransferred. This limitation is represented in the performance of the distilled small models, which, despite being markedly better than the outcomes of direct training or traditional word vectorization, still do not match the performance level of PAL-BERT.</p>
<p>Nevertheless, the primary advantage of the proposed DPAL-BERT is the substantial reduction in inference time while retaining as much computational accuracy as possible. The distilled model requires only the computation time typical of smaller models, significantly speeding up the inference process compared to the original large model. DPAL-BERT-Bi, which employs knowledge distillation, reduces its parameter count by nearly 20 times compared to the original model, and the inference speed increases by approximately 423 times.</p>
</sec>
<sec id="s6">
<label>6</label>
<title>Conclusion</title>
<p>This study applies knowledge distillation to BERT-based models to reduce the inference time. Based on PAL-BERT, the DPAL-BERT-Bi and DPAL-BERT-C models are introduced. Experiments show a significant improvement in model performance compared to smaller models trained from scratch without using word vectors. There is an enhancement in effectiveness compared to smaller models trained either directly or after using word vectors. Although the performance after distillation is slightly lower than PAL-BERT, the model&#x2019;s inference time is greatly reduced. This acceleration is especially beneficial for online applications, where the slight trade-off in performance is outweighed by substantial gains in processing speed.</p>
<p>However, there are still some limitations in the research. In terms of knowledge distillation, this study only uses soft labels, but in the following research, other features in the model can be introduced, such as hidden layer vector in the transformer or feature representation of the embedded layer. These can be further studied in combination with question-answering scenarios.</p>
</sec>
</body>
<back>
<ack><p>The authors extend their appreciation to the Distinguished Scientist Fellowship Program (DSPF), King Saud University, Riyadh, Saudi Arabia.</p>
</ack>
<sec><title>Funding Statement</title>
<p>This study was supported by Sichuan Science and Technology Program (2023YFSY0026, 2023YFH0004).</p>
</sec>
<sec><title>Author Contributions</title>
<p>Conceptualization: Wenfeng Zheng; methodology: Zhuohang Cai; software: Zhuohang Cai; formal analysis: Ahmed AlSanad, and Salman A. AlQahtani; data curation: Zhuohang Cai, Zhengtong Yin, Xiaobing Chen, Xiaolu Li, and Lei Wang; writing&#x2014;original draft preparation: Lirong Yin, Siyu Lu, and Wenfeng Zheng; writing&#x2014;review and editing: Lirong Yin, Siyu Lu, Ruiyang Wang, and Wenfeng Zheng; funding acquisition: Wenfeng Zheng. All authors reviewed the results and approved the final version of the manuscript.</p>
</sec>
<sec sec-type="data-availability"><title>Availability of Data and Materials</title>
<p>Not applicable.</p>
</sec>
<sec sec-type="COI-statement"><title>Conflicts of Interest</title>
<p>The authors declare that they have no conflicts of interest to report regarding the present study.</p>
</sec>
<ref-list content-type="authoryear">
<title>References</title>
<ref id="ref-1"><label>1.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Radford</surname> <given-names>A</given-names></string-name>, <string-name><surname>Narasimhan</surname> <given-names>K</given-names></string-name>, <string-name><surname>Salimans</surname> <given-names>T</given-names></string-name>, <string-name><surname>Sutskever</surname> <given-names>I</given-names></string-name></person-group>. <article-title>Improving language understanding by generative pre-training</article-title>. <source>OpenAI</source>. <year>2018</year>.</mixed-citation></ref>
<ref id="ref-2"><label>2.</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Devlin</surname> <given-names>J</given-names></string-name>, <string-name><surname>Chang</surname> <given-names>MW</given-names></string-name>, <string-name><surname>Lee</surname> <given-names>K</given-names></string-name>, <string-name><surname>Toutanova</surname> <given-names>K</given-names></string-name></person-group>. <source>BERT: pre-training of deep bidirectional transformers for language understanding</source>. <publisher-loc>Minneapolis, Minnesota</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>; <year>2019</year>. p. <fpage>4171</fpage>&#x2013;<lpage>4186</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/N19-1423</pub-id>.</mixed-citation></ref>
<ref id="ref-3"><label>3.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Zaib</surname> <given-names>M</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>WE</given-names></string-name>, <string-name><surname>Sheng</surname> <given-names>QZ</given-names></string-name>, <string-name><surname>Mahmood</surname> <given-names>A</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>Conversational question answering: a survey</article-title>. <source>Knowl Inf Syst</source>. <year>2022</year>;<volume>64</volume>:<fpage>3151</fpage>&#x2013;<lpage>95</lpage>. doi:<pub-id pub-id-type="doi">10.1007/s10115-022-01744-y</pub-id>.</mixed-citation></ref>
<ref id="ref-4"><label>4.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Lan</surname> <given-names>W</given-names></string-name>, <string-name><surname>Cheung</surname> <given-names>YM</given-names></string-name>, <string-name><surname>Jiang</surname> <given-names>J</given-names></string-name>, <string-name><surname>Hu</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Li</surname> <given-names>M</given-names></string-name></person-group>. <article-title>Compact neural network via stacking hybrid units</article-title>. <source>IEEE Trans Pattern Anal Mach Intell</source>. <year>2024</year>;<volume>46</volume>(<issue>1</issue>):<fpage>103</fpage>&#x2013;<lpage>16</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TPAMI.2023.3323496</pub-id>; <pub-id pub-id-type="pmid">37815976</pub-id></mixed-citation></ref>
<ref id="ref-5"><label>5.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Menghani</surname> <given-names>G</given-names></string-name></person-group>. <article-title>Efficient deep learning: a survey on making deep learning models smaller, faster, and better</article-title>. <source>ACM Comput Surv</source>. <year>2023</year>;<volume>55</volume>(<issue>12</issue>):<fpage>259</fpage>. doi:<pub-id pub-id-type="doi">10.1145/3578938</pub-id>.</mixed-citation></ref>
<ref id="ref-6"><label>6.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Huang</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Hao</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Xu</surname> <given-names>J</given-names></string-name>, <string-name><surname>Xu</surname> <given-names>B</given-names></string-name></person-group>. <article-title>Compressing speaker extraction model with ultra-low precision quantization and knowledge distillation</article-title>. <source>Neural Netw</source>. <year>2022</year>;<volume>154</volume>(<issue>1</issue>):<fpage>13</fpage>&#x2013;<lpage>21</lpage>. doi:<pub-id pub-id-type="doi">10.1016/j.neunet.2022.06.026</pub-id>; <pub-id pub-id-type="pmid">35841810</pub-id></mixed-citation></ref>
<ref id="ref-7"><label>7.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Choudhary</surname> <given-names>T</given-names></string-name>, <string-name><surname>Mishra</surname> <given-names>V</given-names></string-name>, <string-name><surname>Goswami</surname> <given-names>A</given-names></string-name>, <string-name><surname>Sarangapani</surname> <given-names>J</given-names></string-name></person-group>. <article-title>A comprehensive survey on model compression and acceleration</article-title>. <source>Artif Intell Rev</source>. <year>2020</year>;<volume>53</volume>(<issue>7</issue>):<fpage>5113</fpage>&#x2013;<lpage>55</lpage>. doi:<pub-id pub-id-type="doi">10.1007/s10462-020-09816-7</pub-id>.</mixed-citation></ref>
<ref id="ref-8"><label>8.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Mo</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Wu</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>X</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>F</given-names></string-name>, <string-name><surname>Liao</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>Review the state-of-the-art technologies of semantic segmentation based on deep learning</article-title>. <source>Neurocomputing</source>. <year>2022</year>;<volume>493</volume>:<fpage>626</fpage>&#x2013;<lpage>46</lpage>. doi:<pub-id pub-id-type="doi">10.1016/j.neucom.2022.01.005</pub-id>.</mixed-citation></ref>
<ref id="ref-9"><label>9.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Liang</surname> <given-names>T</given-names></string-name>, <string-name><surname>Glossner</surname> <given-names>J</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>L</given-names></string-name>, <string-name><surname>Shi</surname> <given-names>S</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>X</given-names></string-name></person-group>. <article-title>Pruning and quantization for deep neural network acceleration: a survey</article-title>. <source>Neurocomputing</source>. <year>2021</year>;<volume>461</volume>(<issue>18</issue>):<fpage>370</fpage>&#x2013;<lpage>403</lpage>. doi:<pub-id pub-id-type="doi">10.1016/j.neucom.2021.07.045</pub-id>.</mixed-citation></ref>
<ref id="ref-10"><label>10.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Swaminathan</surname> <given-names>S</given-names></string-name>, <string-name><surname>Garg</surname> <given-names>D</given-names></string-name>, <string-name><surname>Kannan</surname> <given-names>R</given-names></string-name>, <string-name><surname>Andres</surname> <given-names>F</given-names></string-name></person-group>. <article-title>Sparse low rank factorization for deep neural network compression</article-title>. <source>Neurocomputing</source>. <year>2020</year>;<volume>398</volume>(<issue>11</issue>):<fpage>185</fpage>&#x2013;<lpage>96</lpage>. doi:<pub-id pub-id-type="doi">10.1016/j.neucom.2020.02.035</pub-id>.</mixed-citation></ref>
<ref id="ref-11"><label>11.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Guo</surname> <given-names>S</given-names></string-name>, <string-name><surname>Lai</surname> <given-names>B</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>S</given-names></string-name>, <string-name><surname>Zhao</surname> <given-names>J</given-names></string-name>, <string-name><surname>Shen</surname> <given-names>F</given-names></string-name></person-group>. <article-title>Sensitivity pruner: filter-level compression algorithm for deep neural networks</article-title>. <source>Pattern Recogn</source>. <year>2023</year>;<volume>140</volume>(<issue>2</issue>):<fpage>109508</fpage>. doi:<pub-id pub-id-type="doi">10.1016/j.patcog.2023.109508</pub-id>.</mixed-citation></ref>
<ref id="ref-12"><label>12.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Gou</surname> <given-names>J</given-names></string-name>, <string-name><surname>Sun</surname> <given-names>L</given-names></string-name>, <string-name><surname>Yu</surname> <given-names>B</given-names></string-name>, <string-name><surname>Du</surname> <given-names>L</given-names></string-name>, <string-name><surname>Ramamohanarao</surname> <given-names>K</given-names></string-name>, <string-name><surname>Tao</surname> <given-names>D</given-names></string-name></person-group>. <article-title>Collaborative knowledge distillation via multiknowledge transfer</article-title>. <source>IEEE Trans Neural Netw Learn Syst</source>. <year>2024</year>;<volume>35</volume>(<issue>5</issue>):<fpage>6718</fpage>&#x2013;<lpage>30</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TNNLS.2022.3212733</pub-id>; <pub-id pub-id-type="pmid">36264723</pub-id></mixed-citation></ref>
<ref id="ref-13"><label>13.</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Hinton</surname> <given-names>G</given-names></string-name>, <string-name><surname>Vinyals</surname> <given-names>O</given-names></string-name>, <string-name><surname>Dean</surname> <given-names>J</given-names></string-name></person-group>. <article-title>Distilling the knowledge in a neural network</article-title>. <year>2015</year>. doi:<pub-id pub-id-type="doi">10.48550/arXiv.1503.02531</pub-id>.</mixed-citation></ref>
<ref id="ref-14"><label>14.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Gou</surname> <given-names>J</given-names></string-name>, <string-name><surname>Yu</surname> <given-names>B</given-names></string-name>, <string-name><surname>Maybank</surname> <given-names>SJ</given-names></string-name>, <string-name><surname>Tao</surname> <given-names>D</given-names></string-name></person-group>. <article-title>Knowledge distillation: a survey</article-title>. <source>Int J Comput Vis</source>. <year>2021</year>;<volume>129</volume>(<issue>6</issue>):<fpage>1789</fpage>&#x2013;<lpage>819</lpage>. doi:<pub-id pub-id-type="doi">10.1007/s11263-021-01453-z</pub-id>.</mixed-citation></ref>
<ref id="ref-15"><label>15.</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Sanh</surname> <given-names>V</given-names></string-name>, <string-name><surname>Debut</surname> <given-names>L</given-names></string-name>, <string-name><surname>Chaumond</surname> <given-names>J</given-names></string-name>, <string-name><surname>Wolf</surname> <given-names>T</given-names></string-name></person-group>. <article-title>DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter</article-title>. <year>2019</year>. doi:<pub-id pub-id-type="doi">10.48550/arXiv.1910.01108</pub-id>.</mixed-citation></ref>
<ref id="ref-16"><label>16.</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Jiao</surname> <given-names>X</given-names></string-name>, <string-name><surname>Yin</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Shang</surname> <given-names>L</given-names></string-name>, <string-name><surname>Jiang</surname> <given-names>X</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>X</given-names></string-name>, <string-name><surname>Li</surname> <given-names>L</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>Tinybert: distilling bert for natural language understanding</article-title>. In: <conf-name>Findings of the Association for Computational Linguistics: EMNLP 2020</conf-name>; <year>2020</year>. p. <fpage>4163</fpage>&#x2013;<lpage>74</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/2020.findings-emnlp.372</pub-id>.</mixed-citation></ref>
<ref id="ref-17"><label>17.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Jiao</surname> <given-names>X</given-names></string-name>, <string-name><surname>Chang</surname> <given-names>H</given-names></string-name>, <string-name><surname>Yin</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Shang</surname> <given-names>L</given-names></string-name>, <string-name><surname>Jiang</surname> <given-names>X</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>X</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>Improving task-agnostic BERT distillation with layer mapping search</article-title>. <source>Neurocomputing</source>. <year>2021</year>;<volume>461</volume>:<fpage>194</fpage>&#x2013;<lpage>203</lpage>. doi:<pub-id pub-id-type="doi">10.1016/j.neucom.2021.07.050</pub-id>.</mixed-citation></ref>
<ref id="ref-18"><label>18.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Zheng</surname> <given-names>W</given-names></string-name>, <string-name><surname>Lu</surname> <given-names>S</given-names></string-name>, <string-name><surname>Cai</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>R</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>L</given-names></string-name>, <string-name><surname>Yin</surname> <given-names>L</given-names></string-name></person-group>. <article-title>PAL-BERT: an improved question answering model</article-title>. <source>Comp Model Eng</source>. <year>2023</year>;<volume>139</volume>(<issue>3</issue>):<fpage>2729</fpage>&#x2013;<lpage>45</lpage>. doi:<pub-id pub-id-type="doi">10.32604/cmes.2023.046692</pub-id>.</mixed-citation></ref>
<ref id="ref-19"><label>19.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Wang</surname> <given-names>H</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Liang</surname> <given-names>J</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>L</given-names></string-name></person-group>. <article-title>DAFA-BiLSTM: deep autoregression feature augmented bidirectional LSTM network for time series prediction</article-title>. <source>Neural Netw</source>. <year>2023</year>;<volume>157</volume>(<issue>2</issue>):<fpage>240</fpage>&#x2013;<lpage>56</lpage>. doi:<pub-id pub-id-type="doi">10.1016/j.neunet.2022.10.009</pub-id>; <pub-id pub-id-type="pmid">36399979</pub-id></mixed-citation></ref>
<ref id="ref-20"><label>20.</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Rajpurkar</surname> <given-names>P</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>J</given-names></string-name>, <string-name><surname>Lopyrev</surname> <given-names>K</given-names></string-name>, <string-name><surname>Liang</surname> <given-names>P</given-names></string-name></person-group>. <article-title>SQuAD: 100,000&#x002B; questions for machine comprehension of text</article-title>. In: <conf-name>Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing</conf-name>; <year>2016 Nov</year>; <publisher-loc>Austin, TX, USA</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>. p. <fpage>2383</fpage>&#x2013;<lpage>92</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/D16-1264</pub-id>.</mixed-citation></ref>
<ref id="ref-21"><label>21.</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Cui</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>T</given-names></string-name>, <string-name><surname>Che</surname> <given-names>W</given-names></string-name>, <string-name><surname>Xiao</surname> <given-names>L</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Ma</surname> <given-names>W</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>A span-extraction dataset for chinese machine reading comprehension</article-title>. In: <conf-name>Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)</conf-name>; <year>2019</year>; <publisher-loc>Hong Kong, China</publisher-loc>. p. <fpage>5883</fpage>&#x2013;<lpage>9</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/D19-1600</pub-id>.</mixed-citation></ref>
<ref id="ref-22"><label>22.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Gou</surname> <given-names>J</given-names></string-name>, <string-name><surname>Sun</surname> <given-names>L</given-names></string-name>, <string-name><surname>Yu</surname> <given-names>B</given-names></string-name>, <string-name><surname>Wan</surname> <given-names>S</given-names></string-name>, <string-name><surname>Ou</surname> <given-names>W</given-names></string-name>, <string-name><surname>Yi</surname> <given-names>Z</given-names></string-name></person-group>. <article-title>Multilevel attention-based sample correlations for knowledge distillation</article-title>. <source>IEEE T Ind Inform</source>. <year>2023</year>;<volume>19</volume>(<issue>5</issue>):<fpage>7099</fpage>&#x2013;<lpage>109</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TII.2022.3209672</pub-id>.</mixed-citation></ref>
<ref id="ref-23"><label>23.</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Lan</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>M</given-names></string-name>, <string-name><surname>Goodman</surname> <given-names>S</given-names></string-name>, <string-name><surname>Gimpel</surname> <given-names>K</given-names></string-name>, <string-name><surname>Sharma</surname> <given-names>P</given-names></string-name>, <string-name><surname>Soricut</surname> <given-names>R</given-names></string-name></person-group>. <article-title>ALBERT: a lite BERT for self-supervised learning of language representations</article-title>; In: <conf-name>The Eighth International Conference on Learning Representations</conf-name>; <year>2020</year>.</mixed-citation></ref>
<ref id="ref-24"><label>24.</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Peters</surname> <given-names>ME</given-names></string-name>, <string-name><surname>Ruder</surname> <given-names>S</given-names></string-name>, <string-name><surname>Smith</surname> <given-names>NA</given-names></string-name></person-group>. <article-title>To tune or not to tune? Adapting pretrained representations to diverse tasks</article-title>. In: <conf-name>Proceedings of the 4th Workshop on Representation Learning for NLP (RepL4NLP-2019)</conf-name>; <year>2019 Aug</year>; <publisher-loc> Florence, Italy</publisher-loc>. p. <fpage>7</fpage>&#x2013;<lpage>14</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/W19-4302</pub-id>.</mixed-citation></ref>
<ref id="ref-25"><label>25.</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Clark</surname> <given-names>K</given-names></string-name>, <string-name><surname>Luong</surname> <given-names>MT</given-names></string-name>, <string-name><surname>Manning</surname> <given-names>CD</given-names></string-name>, <string-name><surname>Le</surname> <given-names>QV</given-names></string-name></person-group>. <article-title>Semi-supervised sequence modeling with cross-view training</article-title>. In: <conf-name>Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing</conf-name>; <year>2018 Oct 31&#x2013;Nov 4</year>; <publisher-loc>Brussels, Belgium</publisher-loc>. p. <fpage>1914</fpage>&#x2013;<lpage>25</lpage>.</mixed-citation></ref>
<ref id="ref-26"><label>26.</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Li</surname> <given-names>K</given-names></string-name>, <string-name><surname>Wigington</surname> <given-names>C</given-names></string-name>, <string-name><surname>Tensmeyer</surname> <given-names>C</given-names></string-name>, <string-name><surname>Morariu</surname> <given-names>VI</given-names></string-name>, <string-name><surname>Zhao</surname> <given-names>H</given-names></string-name>, <string-name><surname>Varun</surname> <given-names>M</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>Improving cross-domain detection with self-supervised learning</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>; <year>2023</year>; <publisher-loc>Vancouver, BC, Canada</publisher-loc>. p. <fpage>4746</fpage>&#x2013;<lpage>55</lpage>.</mixed-citation></ref>
<ref id="ref-27"><label>27.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Jiang</surname> <given-names>X</given-names></string-name>, <string-name><surname>Song</surname> <given-names>C</given-names></string-name>, <string-name><surname>Xu</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Li</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Peng</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>Research on sentiment classification for netizens based on the BERT-BiLSTM-TextCNN model</article-title>. <source>PeerJ Comput Sci</source>. <year>2022</year>;<volume>8</volume>(<issue>3</issue>):<fpage>e1005</fpage>. doi:<pub-id pub-id-type="doi">10.7717/peerj-cs.1005</pub-id>; <pub-id pub-id-type="pmid">35721405</pub-id>.</mixed-citation></ref>
<ref id="ref-28"><label>28.</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Tang</surname> <given-names>R</given-names></string-name>, <string-name><surname>Lu</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>L</given-names></string-name>, <string-name><surname>Mou</surname> <given-names>L</given-names></string-name>, <string-name><surname>Vechtomova</surname> <given-names>O</given-names></string-name>, <string-name><surname>Lin</surname> <given-names>J</given-names></string-name></person-group>. <article-title>Distilling task-specific knowledge from BERT into simple neural networks</article-title>. <year>2019</year>. doi:<pub-id pub-id-type="doi">10.48550/arXiv.1903.12136</pub-id>.</mixed-citation></ref>
</ref-list>
</back></article>