<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1 20151215//EN" "http://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xml:lang="en" article-type="research-article" dtd-version="1.1">
<front>
<journal-meta>
<journal-id journal-id-type="pmc">CMC</journal-id>
<journal-id journal-id-type="nlm-ta">CMC</journal-id>
<journal-id journal-id-type="publisher-id">CMC</journal-id>
<journal-title-group>
<journal-title>Computers, Materials &#x0026; Continua</journal-title>
</journal-title-group>
<issn pub-type="epub">1546-2226</issn>
<issn pub-type="ppub">1546-2218</issn>
<publisher>
<publisher-name>Tech Science Press</publisher-name>
<publisher-loc>USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">73798</article-id>
<article-id pub-id-type="doi">10.32604/cmc.2025.073798</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Article</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>AFI: Blackbox Backdoor Detection Method Based on Adaptive Feature Injection</article-title>
<alt-title alt-title-type="left-running-head">AFI: Blackbox Backdoor Detection Method Based on Adaptive Feature Injection</alt-title>
<alt-title alt-title-type="right-running-head">AFI: Blackbox Backdoor Detection Method Based on Adaptive Feature Injection</alt-title>
</title-group>
<contrib-group>
<contrib id="author-1" contrib-type="author">
<name name-style="western"><surname>Tang</surname><given-names>Simin</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><xref ref-type="aff" rid="aff-2">2</xref><xref ref-type="aff" rid="aff-3">3</xref><xref ref-type="aff" rid="aff-4">4</xref></contrib>
<contrib id="author-2" contrib-type="author" corresp="yes">
<name name-style="western"><surname>Zhang</surname><given-names>Zhiyong</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><xref ref-type="aff" rid="aff-2">2</xref><xref ref-type="aff" rid="aff-3">3</xref><xref ref-type="aff" rid="aff-4">4</xref><email>xidianzzy@126.com</email></contrib>
<contrib id="author-3" contrib-type="author">
<name name-style="western"><surname>Pan</surname><given-names>Junyan</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><xref ref-type="aff" rid="aff-2">2</xref><xref ref-type="aff" rid="aff-3">3</xref><xref ref-type="aff" rid="aff-4">4</xref></contrib>
<contrib id="author-4" contrib-type="author">
<name name-style="western"><surname>Quan</surname><given-names>Gaoyuan</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><xref ref-type="aff" rid="aff-2">2</xref><xref ref-type="aff" rid="aff-3">3</xref><xref ref-type="aff" rid="aff-4">4</xref></contrib>
<contrib id="author-5" contrib-type="author">
<name name-style="western"><surname>Wang</surname><given-names>Weiguo</given-names></name><xref ref-type="aff" rid="aff-5">5</xref></contrib>
<contrib id="author-6" contrib-type="author">
<name name-style="western"><surname>Jing</surname><given-names>Junchang</given-names></name><xref ref-type="aff" rid="aff-6">6</xref></contrib>
<aff id="aff-1"><label>1</label><institution>Information Engineering College, Henan University of Science and Technology</institution>, <addr-line>Luoyang, 471023</addr-line>, <country>China</country></aff>
<aff id="aff-2"><label>2</label><institution>Henan International Joint Laboratory of Cyberspace Security Applications, Henan University of Science and Technology</institution>, <addr-line>Luoyang, 471023</addr-line>, <country>China</country></aff>
<aff id="aff-3"><label>3</label><institution>Henan Intelligent Manufacturing Big Data Development Innovation Laboratory, Henan University of Science and Technology</institution>, <addr-line>Luoyang, 471023</addr-line>, <country>China</country></aff>
<aff id="aff-4"><label>4</label><institution>Institute of Artificial Intelligence Innovations, Henan University of Science and Technology</institution>, <addr-line>Luoyang, 471023</addr-line>, <country>China</country></aff>
<aff id="aff-5"><label>5</label><institution>Education Technology Department, New H3C Technologies Co., Ltd.</institution>, <addr-line>Beijing, 100102</addr-line>, <country>China</country></aff>
<aff id="aff-6"><label>6</label><institution>College of Computer and Information Engineering, Henan Normal University</institution>, <addr-line>Xinxiang, 453007</addr-line>, <country>China</country></aff>
</contrib-group>
<author-notes>
<corresp id="cor1"><label>&#x002A;</label>Corresponding Author: Zhiyong Zhang. Email: <email>xidianzzy@126.com</email></corresp>
</author-notes>
<pub-date date-type="collection" publication-format="electronic">
<year>2026</year>
</pub-date>
<pub-date date-type="pub" publication-format="electronic">
<day>10</day><month>2</month><year>2026</year>
</pub-date>
<volume>87</volume>
<issue>1</issue>
<elocation-id>79</elocation-id>
<history>
<date date-type="received">
<day>25</day>
<month>09</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>05</day>
<month>12</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>&#x00A9; 2026 The Authors.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Published by Tech Science Press.</copyright-holder>
<license xlink:href="https://creativecommons.org/licenses/by/4.0/">
<license-p>This work is licensed under a <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="TSP_CMC_73798.pdf"></self-uri>
<abstract>
<p>At inference time, deep neural networks are susceptible to backdoor attacks, which can produce attacker-controlled outputs when inputs contain carefully crafted triggers. Existing defense methods often focus on specific attack types or incur high costs, such as data cleaning or model fine-tuning. In contrast, we argue that it is possible to achieve effective and generalizable defense without removing triggers or incurring high model-cleaning costs. From the attacker&#x2019;s perspective and based on characteristics of vulnerable neuron activation anomalies, we propose an Adaptive Feature Injection (AFI) method for black-box backdoor detection. AFI employs a pre-trained image encoder to extract multi-level deep features and constructs a dynamic weight fusion mechanism for precise identification and interception of poisoned samples. Specifically, we select the control samples with the largest feature differences from the clean dataset via feature-space analysis, and generate blended sample pairs with the test sample using dynamic linear interpolation. The detection statistic is computed by measuring the divergence <inline-formula id="ieqn-1"><mml:math id="mml-ieqn-1"><mml:mi>G</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> in model output responses. We systematically evaluate the effectiveness of AFI against representative backdoor attacks, including BadNets, Blend, WaNet, and IAB, on three benchmark datasets: MNIST, CIFAR-10, and ImageNet. Experimental results show that AFI can effectively detect poisoned samples, achieving average detection rates of 95.20%, 94.15%, and 86.49% on these datasets, respectively. Compared with existing methods, AFI demonstrates strong cross-domain generalization ability and robustness to unknown attacks.</p>
</abstract>
<kwd-group kwd-group-type="author">
<kwd>Deep learning</kwd>
<kwd>backdoor attacks</kwd>
<kwd>universal detection</kwd>
<kwd>feature fusion</kwd>
<kwd>backward reasoning</kwd>
</kwd-group>
<funding-group>
<award-group id="awg1">
<funding-source>National Natural Science Foundation of China</funding-source>
<award-id>61972133</award-id>
</award-group>
<award-group id="awg2">
<funding-source>Leading Talents in Science and Technology Innovation for Thousands</funding-source>
<award-id>204200510021</award-id>
</award-group>
<award-group id="awg3">
<funding-source>Key Research and Development Plan Special Project of Henan Province</funding-source>
<award-id>241111211400</award-id>
</award-group>
</funding-group>
</article-meta>
</front>
<body>
<sec id="s1">
<label>1</label>
<title>Introduction</title>
<p>In recent years, deep learning [<xref ref-type="bibr" rid="ref-1">1</xref>] has been widely applied across fields such as image classification, natural language processing, and pattern recognition, becoming a major focus of research in artificial intelligence. Given a test image, its predicted category can be obtained by computing the similarity between the image features and the textual features of category descriptions. However, as neural networks grow increasingly complex&#x2014;larger parameter scales and deeper architectures&#x2014;accuracy may improve, but robustness often decreases, introducing additional security vulnerabilities. As a result, neural networks become inherently susceptible to backdoor attacks [<xref ref-type="bibr" rid="ref-2">2</xref>].</p>
<p>Backdoor attacks [<xref ref-type="bibr" rid="ref-3">3</xref>], which constitute a highly covert security threat, rely on the core mechanism of injecting specific malicious behavior patterns into the model through training data poisoning. In such attacks, attackers first carefully design backdoor triggers, which may be particular pixel patterns in the image (such as local color blocks used by BadNets [<xref ref-type="bibr" rid="ref-2">2</xref>]), special word sequences in natural language (such as specific combinations of harmless words as shown in [<xref ref-type="bibr" rid="ref-4">4</xref>]), or specific data features across modalities (such as frequency domain perturbations proposed in [<xref ref-type="bibr" rid="ref-5">5</xref>]). Subsequently, attackers implicitly establish associations between triggers and target outputs during the learning process by contaminating the training dataset (typically, only 1%&#x2013;5% of the data needs to be contaminated [<xref ref-type="bibr" rid="ref-2">2</xref>]). A key characteristic of this attack is that the model implanted with a backdoor performs similarly to a clean model under normal input (the difference in test accuracy is usually less than 0.5%). Still, once the input contains preset triggers (such as specific pixel combinations in image corners or special character sequences in text), the model will perform malicious actions predetermined by the attacker, such as misclassifying any input into the target category, generating harmful content, and even leaking private data. More seriously, such triggers can be imperceptible to the human eye and have minimal impact on the normal functioning of the model, posing significant challenges to traditional anomaly-detection-based defense methods. The main process of backdoor attack and defense is shown in <xref ref-type="fig" rid="fig-1">Fig. 1</xref>.</p>
<fig id="fig-1">
<label>Figure 1</label>
<caption>
<title>Basic process of backdoor attack and defense</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73798-fig-1.tif"/>
</fig>
<p>The current research on defense against backdoor attacks in deep neural networks mainly faces two limitations: firstly, most existing defense methods are designed for specific types of backdoor attacks, making it difficult to cope with the constantly evolving and diverse attack methods. Secondly, mainstream defense solutions often require high computational costs, including but not limited to: (1) global parameter adjustment or retraining of pre-trained models [<xref ref-type="bibr" rid="ref-6">6</xref>], which can require thousands of GPU hours for large models such as ViT-Huge; and (2) pruning defective neurons [<xref ref-type="bibr" rid="ref-7">7</xref>], which demands a large number of clean samples that are difficult to obtain in practice. Such targeted and computationally intensive properties severely restrict the practical deployment of defense methods in real-time systems and resource-constrained environments.</p>
<p>To address the above issues, we select multiple pairs of reference samples with maximal mutual differences from the clean dataset based on feature space analysis. We use dynamic linear interpolation to generate mixed sample pairs with the test sample and construct detection statistics by observing the model output response dispersion G(x). The ability to induce misclassification errors is a general characteristic of backdoored samples. Therefore, the proposed defense is attack-agnostic [<xref ref-type="bibr" rid="ref-6">6</xref>], which distinguishes it from existing defenses.</p>
<p>The main contributions are as follows:
<list list-type="simple">
<list-item><label>1)</label><p>We propose a general black-box backdoor detection method based on a hybrid injection strategy. This method fuses the input sample under inspection with multiple clean samples that have significant differences in their features from each other, and infers whether the sample contains a trigger based on the model output on the fused input. It does not rely on the model structure or parameters, and does not require modification of the original inputs or the model itself, which provides good generalization and black-box adaptability.</p></list-item>
<list-item><label>2)</label><p>We construct a dynamic-weight trigger fusion mechanism. This mechanism utilizes the characteristics of trigger-induced neuron activation value maximization and manipulation of model predictions in backdoor attacks. By comparing and analyzing the model&#x2019;s output responses, it determines whether the input sample is influenced by a trigger, thereby effectively identifying poisoned samples.</p></list-item>
<list-item><label>3)</label><p>We propose a new defense evaluation metric (Detection Stability Performance, DSP) to verify the cross-domain generalization ability and robustness against unknown attacks of our method. The experimental results show that the proposed method can effectively detect poisoned samples, with average detection rates of 95.20%, 94.15%, and 86.49% on three benchmark datasets, respectively.</p></list-item>
</list></p>
<p>The remainder of this article is organized as follows. <xref ref-type="sec" rid="s2">Section 2</xref> reviews related work on backdoor attacks and defenses. <xref ref-type="sec" rid="s3">Section 3</xref> presents the design of the proposed AFI defense method. <xref ref-type="sec" rid="s4">Section 4</xref> reports experimental results demonstrating the effectiveness of AFI. Finally, <xref ref-type="sec" rid="s5">Section 5</xref> concludes the paper and outlines potential research directions for backdoor defense.</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Related Work</title>
<sec id="s2_1">
<label>2.1</label>
<title>Backdoor Attacks</title>
<p>Backdoor attacks typically aim to implant malicious behavior into deep learning models by injecting a small number of poisoned samples into the training dataset. Specifically, once the model is trained on these poisoned samples, it will misclassify the samples into the target class when the trigger is activated. However, when the trigger is not present, the backdoor model behaves the same as a normal model and does not show anomalous behavior. According to the attack method, existing backdoor attacks can be classified into two categories: <bold>(1) poison-label attacks</bold> [<xref ref-type="bibr" rid="ref-8">8</xref>&#x2013;<xref ref-type="bibr" rid="ref-12">12</xref>], which connect the trigger and the target class by changing the labels of the toxic samples to the target labels to enhance the attack effect [<xref ref-type="bibr" rid="ref-9">9</xref>,<xref ref-type="bibr" rid="ref-13">13</xref>] or to hide the traces of the attack [<xref ref-type="bibr" rid="ref-14">14</xref>&#x2013;<xref ref-type="bibr" rid="ref-16">16</xref>]. <bold>(2) Clean-label attacks</bold> [<xref ref-type="bibr" rid="ref-17">17</xref>&#x2013;<xref ref-type="bibr" rid="ref-19">19</xref>], which keep the original labels of the samples unchanged and only poison samples within the target class by injecting triggers. Although clean label attacks are more stealthy, they may sometimes fail to successfully implant a backdoor [<xref ref-type="bibr" rid="ref-19">19</xref>,<xref ref-type="bibr" rid="ref-20">20</xref>].</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Backdoor Defense</title>
<p>Existing backdoor defense methods can be broadly divided into three categories: <bold>(1) Model reconstruction defenses.</bold> These methods directly modify a suspicious model to suppress or remove backdoor behaviors, usually by first synthesizing triggers and then mitigating their influence [<xref ref-type="bibr" rid="ref-6">6</xref>,<xref ref-type="bibr" rid="ref-7">7</xref>,<xref ref-type="bibr" rid="ref-21">21</xref>,<xref ref-type="bibr" rid="ref-22">22</xref>]. This type of method largely relies on the quality of synthesized triggers, so the defense effect may not be satisfactory when facing more complex triggers [<xref ref-type="bibr" rid="ref-23">23</xref>,<xref ref-type="bibr" rid="ref-24">24</xref>]. For instance, Neural Cleanse (NC) [<xref ref-type="bibr" rid="ref-6">6</xref>] generates a trigger for each category and employs Median Absolute Deviation (MAD) for outlier detection, followed by a forgetting strategy to eliminate the backdoor. Zhu et al. [<xref ref-type="bibr" rid="ref-7">7</xref>] use the strong reconstruction capability of GANs to detect and &#x201C;clean&#x201D; neural backdoors without requiring access to the training data, showing robustness and efficiency across various settings. A new Trojan network detection mechanism [<xref ref-type="bibr" rid="ref-21">21</xref>] locates a &#x201C;winning Trojan lottery ticket&#x201D; that retains almost complete Trojan information but only chance-level performance on clean input, and then restores the triggers embedded in this already isolated subnetwork. The Shapley Pruning [<xref ref-type="bibr" rid="ref-22">22</xref>] can identify and remove less than 1% of infected neurons while maintaining model structure and accuracy, even with extremely limited data (one or zero samples per class). <bold>(2) Pruning-based defenses. </bold>These approaches aim to detect and prune malicious neurons and typically require access to clean labeled data, which is often impractical in real-world scenarios. 
For example, Fine-Pruning [<xref ref-type="bibr" rid="ref-25">25</xref>] combines neuron pruning with fine-tuning to suppress or eliminate backdoor behavior. <bold>(3) Input detection defenses.</bold> This category focuses on identifying poisoned samples at inference time without modifying the model. Activation-based defenses such as STRIP [<xref ref-type="bibr" rid="ref-26">26</xref>] detect backdoors by measuring prediction entropy under input perturbations, while SentiNet [<xref ref-type="bibr" rid="ref-27">27</xref>] localizes suspicious regions using model interpretability. MNTD [<xref ref-type="bibr" rid="ref-28">28</xref>] trains a meta-classifier to distinguish clean and Trojaned models from their behavior. Defenses can be categorized by the defender&#x2019;s knowledge into sample-, model-, and training data-level approaches [<xref ref-type="bibr" rid="ref-29">29</xref>]. For example, CCA-UD [<xref ref-type="bibr" rid="ref-29">29</xref>] operates at the training data level by clustering samples to identify clean and poisoned ones, which is computationally expensive. In contrast, our method achieves high-level defense directly at the sample level.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Our Method: Adaptive Feature Injection (AFI)</title>
<sec id="s3_1">
<label>3.1</label>
<title>Basic Settings</title>
<p><bold>Attack Setting:</bold> In backdoor attacks on classification tasks, we train a DNN model <inline-formula id="ieqn-2"><mml:math id="mml-ieqn-2"><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>&#x03B8;</mml:mi></mml:mrow></mml:msub><mml:mspace width="negativethinmathspace" /><mml:mo>:</mml:mo><mml:mspace width="negativethinmathspace" /><mml:mrow><mml:mi>&#x1D4B3;</mml:mi></mml:mrow><mml:mspace width="negativethinmathspace" /><mml:mo stretchy="false">&#x2192;</mml:mo><mml:mspace width="negativethinmathspace" /><mml:mrow><mml:mi>&#x1D4B4;</mml:mi></mml:mrow></mml:math></inline-formula>, where <inline-formula id="ieqn-3"><mml:math id="mml-ieqn-3"><mml:mrow><mml:mi>&#x1D4B3;</mml:mi></mml:mrow><mml:mspace width="negativethinmathspace" /><mml:mo>&#x2286;</mml:mo><mml:mspace width="negativethinmathspace" /><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mi>d</mml:mi></mml:msup></mml:math></inline-formula> denotes the input space and <inline-formula id="ieqn-4"><mml:math id="mml-ieqn-4"><mml:mrow><mml:mi>&#x1D4B4;</mml:mi></mml:mrow><mml:mspace width="negativethinmathspace" /><mml:mo>=</mml:mo><mml:mspace width="negativethinmathspace" /><mml:mo fence="false" stretchy="false">{</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:mi>K</mml:mi><mml:mo fence="false" stretchy="false">}</mml:mo></mml:math></inline-formula> denotes the set of class labels. 
The training dataset is denoted by <inline-formula id="ieqn-5"><mml:math id="mml-ieqn-5"><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">n</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mspace width="negativethinmathspace" /><mml:mo>=</mml:mo><mml:mspace width="negativethinmathspace" /><mml:mo fence="false" stretchy="false">{</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>n</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>n</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:msubsup><mml:mo fence="false" stretchy="false">}</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>, where <inline-formula id="ieqn-6"><mml:math id="mml-ieqn-6"><mml:msub><mml:mi>x</mml:mi><mml:mi>n</mml:mi></mml:msub><mml:mspace width="negativethinmathspace" /><mml:mo>&#x2208;</mml:mo><mml:mspace width="negativethinmathspace" /><mml:mrow><mml:mi>&#x1D4B3;</mml:mi></mml:mrow></mml:math></inline-formula> is the <inline-formula id="ieqn-7"><mml:math id="mml-ieqn-7"><mml:mi>n</mml:mi></mml:math></inline-formula>-th input sample and <inline-formula id="ieqn-8"><mml:math id="mml-ieqn-8"><mml:msub><mml:mi>y</mml:mi><mml:mi>n</mml:mi></mml:msub><mml:mspace width="negativethinmathspace" /><mml:mo>&#x2208;</mml:mo><mml:mspace width="negativethinmathspace" /><mml:mrow><mml:mi>&#x1D4B4;</mml:mi></mml:mrow></mml:math></inline-formula> is its corresponding class label. 
For a clean input <inline-formula id="ieqn-9"><mml:math id="mml-ieqn-9"><mml:msub><mml:mi>x</mml:mi><mml:mi>n</mml:mi></mml:msub></mml:math></inline-formula>, the model output satisfies <inline-formula id="ieqn-10"><mml:math id="mml-ieqn-10"><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>&#x03B8;</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>n</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mspace width="negativethinmathspace" /><mml:mo>=</mml:mo><mml:mspace width="negativethinmathspace" /><mml:msub><mml:mi>y</mml:mi><mml:mi>n</mml:mi></mml:msub></mml:math></inline-formula>, indicating that the prediction is correct on normal samples. For an input sample <inline-formula id="ieqn-11"><mml:math id="mml-ieqn-11"><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mi>x</mml:mi><mml:mo>+</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> with a backdoor trigger <inline-formula id="ieqn-12"><mml:math id="mml-ieqn-12"><mml:mi>t</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>, the model satisfies <inline-formula id="ieqn-13"><mml:math id="mml-ieqn-13"><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>&#x03B8;</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:math></inline-formula>, where <inline-formula id="ieqn-14"><mml:math id="mml-ieqn-14"><mml:msub><mml:mi>y</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:math></inline-formula> is the target label specified by the attacker. 
To implement the backdoor attack, <inline-formula id="ieqn-15"><mml:math id="mml-ieqn-15"><mml:mi>&#x03B1;</mml:mi><mml:mi>N</mml:mi></mml:math></inline-formula> (where <inline-formula id="ieqn-16"><mml:math id="mml-ieqn-16"><mml:mn>0</mml:mn><mml:mo>&#x003C;</mml:mo><mml:mi>&#x03B1;</mml:mi><mml:mo>&#x003C;</mml:mo><mml:mn>1</mml:mn></mml:math></inline-formula>) samples are selected from the training dataset as poisoned samples, and the following modifications are applied: (1) <bold>Label modification:</bold> change the labels of the poisoned samples to the target label <inline-formula id="ieqn-17"><mml:math id="mml-ieqn-17"><mml:msub><mml:mi>y</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:math></inline-formula>. (2) <bold>Trigger injection:</bold> add a trigger <inline-formula id="ieqn-18"><mml:math id="mml-ieqn-18"><mml:mi>t</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> to each poisoned sample so that <inline-formula id="ieqn-19"><mml:math id="mml-ieqn-19"><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mi>x</mml:mi><mml:mo>+</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> becomes a triggered sample. After processing the samples, we train the model <inline-formula id="ieqn-20"><mml:math id="mml-ieqn-20"><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>&#x03B8;</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> on the modified training set so that it behaves normally on clean samples while being activated by the embedded triggers for backdoor attacks.</p>
<p><bold>Defense Setting:</bold> This study considers a realistic black-box defense scenario, where the defender obtains a pre-trained model from an untrusted source without access to its training process, and is only given clean samples and test samples for inspection. (1) <bold>Defense Objective:</bold> Detect poisoned samples with backdoor triggers while maintaining accuracy on clean samples comparable to standard defenses. (2) <bold>Defense Capability:</bold> Our method detects backdoored samples without modifying the dataset or model, preventing trigger-target associations. It generalizes to multiple types of backdoors and various image classification datasets without requiring knowledge of model architecture or parameters.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Defense Model</title>
<p>Our backdoor defense adopts the inverse reasoning concept of &#x201C;input-output&#x201D; to achieve accurate sample detection. Firstly, a convolutional neural network (CNN) extracts feature vectors for all images in the dataset. By calculating the difference between feature vectors, we select the two clean images with the largest difference in features between them. Secondly, the test image is fused with each of the two most dissimilar images at a certain ratio. Finally, the two mixed samples are input into the model to obtain two outputs, and the consistency of these outputs is used to determine whether the input samples are poisoned, thereby achieving backdoor detection. The specific detection flowchart is shown in <xref ref-type="fig" rid="fig-2">Fig. 2</xref>.</p>
<fig id="fig-2">
<label>Figure 2</label>
<caption>
<title>AFI process diagram</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73798-fig-2.tif"/>
</fig>
<p>It is worth noting that in order to carry out effective backdoor attacks, attackers must carefully design the spatial location and pattern of triggers. Attackers typically optimize the trigger mask to maximize the activation values of one or more neurons, thereby creating trigger-sensitive internal neurons. This design reinforces the activation of vulnerable neurons and establishes a strong association between the trigger and these neurons. As a result, once the model detects the trigger, these neurons are strongly activated, leading to a corresponding misclassification.</p>
<p>From a defense perspective, this characteristic can be exploited to identify poisoned inputs. The conceptual basis of AFI is as follows: poisoned samples contain a backdoor trigger that is specifically designed to maximize the activation of a small set of neurons highly sensitive to the trigger, establishing a strong association between the trigger and these neurons. Once the model detects the trigger, these neurons are significantly activated and dominate the output. Even when a poisoned sample is blended with other images, this dominant activation persists, resulting in consistently predicted labels. In contrast, clean samples lack such dominant features, so blending introduces variability in their representations, leading to unstable predictions. This difference in prediction stability under blending provides the conceptual foundation for AFI&#x2019;s ability to distinguish poisoned samples from clean ones. When the input contains a trigger, it causes significant increases in neuron activations, which heavily influence the model output. Therefore, when the poisoned sample is fused with the clean sample, the model output will still be affected by the trigger, and each output matches the attacker-specified label. When the sample to be detected is a clean sample, the model output will be influenced by the two samples with which it is fused; owing to the large differences in their characteristics, the outputs show significant differences, as illustrated in <xref ref-type="fig" rid="fig-2">Fig. 2</xref>.</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>AFI</title>
<sec id="s3_3_1">
<label>3.3.1</label>
<title>Feature Extraction</title>
<p>In order to successfully extract two image samples with significant feature differences from a clean dataset, we employ the widely used ResNet-18 as a generic feature encoder, denoted as <inline-formula id="ieqn-21"><mml:math id="mml-ieqn-21"><mml:mi>&#x03D5;</mml:mi></mml:math></inline-formula>. This choice is based on the following considerations. First, the ResNet architecture, which effectively mitigates the degradation problem in deep networks through residual connections, is a representative model widely adopted in the field of computer vision. Its strong performance across diverse tasks, from MNIST to the ImageNet subset, ensures the applicability of our method. Second, to maintain consistency in the experimental framework, the target classifiers <italic>C</italic> under attack in this study and the feature encoder <inline-formula id="ieqn-22"><mml:math id="mml-ieqn-22"><mml:mi>&#x03D5;</mml:mi></mml:math></inline-formula> used here are both based on the ResNet architecture. This approach aims to control variables and provide a purer evaluation of the defense method&#x2019;s own effectiveness.</p>
<p>Furthermore, we note that the choice of model architecture may affect the specific geometry of the feature space. However, the core mechanism of our method relies on a more universal principle: that backdoor attacks cause poisoned samples to become statistical outliers in the deep feature space. Extensive research has shown that high-level features obtained from pre-training on different CNN architectures (e.g., VGG, ResNet), despite differences in spatial geometry, maintain consistency in their discriminability for semantic content. Therefore, the effectiveness of the extreme disparity sample pairs identified based on ResNet-18 features is expected to transfer to other modern architectures, ensuring the robustness of our method&#x2019;s core conclusions and confirming that it is not limited to a specific network.</p>
<p>The general process and formulas for image feature extraction are outlined as follows.</p>
<p>Firstly, convolutional layers serve as the foundation for image feature extraction. The convolution operation involves sliding a filter (or kernel) over the input image and performing dot products to generate feature maps.</p>
<p>Assuming the input image is denoted as <italic>I</italic> and the filter weights as <italic>W</italic>, the convolution operation can be expressed as:
<disp-formula id="eqn-1"><label>(1)</label><mml:math id="mml-eqn-1" display="block"><mml:mi>O</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:munder><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:munder><mml:mi>I</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo>+</mml:mo><mml:mi>m</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo>+</mml:mo><mml:mi>n</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mi>W</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>m</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:mi>b</mml:mi></mml:math></disp-formula></p>
<p><inline-formula id="ieqn-23"><mml:math id="mml-ieqn-23"><mml:mi>O</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> denotes the value at position <inline-formula id="ieqn-24"><mml:math id="mml-ieqn-24"><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> in the output feature map, <inline-formula id="ieqn-25"><mml:math id="mml-ieqn-25"><mml:mi>I</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo>+</mml:mo><mml:mi>m</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo>+</mml:mo><mml:mi>n</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> represents the pixel value at the corresponding position in the input image, <inline-formula id="ieqn-26"><mml:math id="mml-ieqn-26"><mml:mi>W</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>m</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> refers to the weights of the convolution kernel, and <inline-formula id="ieqn-27"><mml:math id="mml-ieqn-27"><mml:mi>b</mml:mi></mml:math></inline-formula> is the bias term.</p>
<p>In order to introduce nonlinearity in the model, an activation function is typically applied after the convolution operation. The most commonly used activation function is the Rectified Linear Unit (ReLU), which is defined as follows:
<disp-formula id="eqn-2"><label>(2)</label><mml:math id="mml-eqn-2" display="block"><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mo movablelimits="true" form="prefix">max</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula></p>
<p>The activation function performs a nonlinear transformation on the output of the convolutional layer.</p>
<p>The pooling layer is used to reduce the size of the feature map while retaining important features. A common pooling operation is maximum pooling with the formula:
<disp-formula id="eqn-3"><label>(3)</label><mml:math id="mml-eqn-3" display="block"><mml:mi>O</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:munder><mml:mrow><mml:mo movablelimits="true" form="prefix">max</mml:mo></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:munder><mml:mspace width="thinmathspace" /><mml:mo stretchy="false">(</mml:mo><mml:mi>I</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo>+</mml:mo><mml:mi>m</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo>+</mml:mo><mml:mi>n</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula>where <inline-formula id="ieqn-28"><mml:math id="mml-ieqn-28"><mml:mi>I</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo>+</mml:mo><mml:mi>m</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo>+</mml:mo><mml:mi>n</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> is the pixel value within the pooling window and <inline-formula id="ieqn-29"><mml:math id="mml-ieqn-29"><mml:mi>O</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> is the output value after pooling.</p>
<p>At the end of the encoder, the extracted features are usually flattened into a vector, and a fully connected layer is used to generate the final feature encoding. The formula for the fully connected layer is:
<disp-formula id="eqn-4"><label>(4)</label><mml:math id="mml-eqn-4" display="block"><mml:mi>Y</mml:mi><mml:mo>=</mml:mo><mml:mi>W</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>x</mml:mi><mml:mo>+</mml:mo><mml:mi>b</mml:mi></mml:math></disp-formula>where <inline-formula id="ieqn-30"><mml:math id="mml-ieqn-30"><mml:mi>x</mml:mi></mml:math></inline-formula> is the input vector (flattened feature map), <italic>W</italic> is the weight matrix, <inline-formula id="ieqn-31"><mml:math id="mml-ieqn-31"><mml:mi>b</mml:mi></mml:math></inline-formula> is the bias term, and <italic>Y</italic> is the output feature vector.</p>
<p>The second step is to encode and output features. After multiple layers of convolution, pooling, and fully connected operations, the image is finally encoded into a feature vector <inline-formula id="ieqn-32"><mml:math id="mml-ieqn-32"><mml:mi>h</mml:mi></mml:math></inline-formula>.</p>
<p>Finally, by using Euclidean distance to calculate the difference between the feature vectors, the two images with the largest difference are found, and the formula is as follows:
<disp-formula id="eqn-5"><label>(5)</label><mml:math id="mml-eqn-5" display="block"><mml:mi>D</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>h</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>h</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:msqrt><mml:munderover><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>n</mml:mi></mml:munderover><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>h</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>h</mml:mi><mml:mrow><mml:mi>j</mml:mi><mml:mo>,</mml:mo><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mn>2</mml:mn></mml:msup></mml:msqrt></mml:math></disp-formula>where <inline-formula id="ieqn-33"><mml:math id="mml-ieqn-33"><mml:msub><mml:mi>h</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula id="ieqn-34"><mml:math id="mml-ieqn-34"><mml:msub><mml:mi>h</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> are the feature vectors of the two images, respectively, <inline-formula id="ieqn-35"><mml:math id="mml-ieqn-35"><mml:msub><mml:mi>h</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula id="ieqn-36"><mml:math id="mml-ieqn-36"><mml:msub><mml:mi>h</mml:mi><mml:mrow><mml:mi>j</mml:mi><mml:mo>,</mml:mo><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> are the <inline-formula id="ieqn-37"><mml:math id="mml-ieqn-37"><mml:mi>k</mml:mi></mml:math></inline-formula>-th components in the feature vector, and <inline-formula id="ieqn-38"><mml:math id="mml-ieqn-38"><mml:mi>n</mml:mi></mml:math></inline-formula> is the dimension of the feature vector.</p>
<p>We use Euclidean distance for its simplicity and interpretability. Since our method mainly relies on poisoned samples being statistical outliers in the feature space, the choice of distance metric is not critical, and other reasonable metrics are expected to yield similar results.</p>
<p>After the feature extraction operation using <inline-formula id="ieqn-39"><mml:math id="mml-ieqn-39"><mml:mi>&#x03D5;</mml:mi></mml:math></inline-formula>, the two clean samples with the most disparate feature vectors <inline-formula id="ieqn-40"><mml:math id="mml-ieqn-40"><mml:msub><mml:mi>h</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:math></inline-formula> and <inline-formula id="ieqn-41"><mml:math id="mml-ieqn-41"><mml:msub><mml:mi>h</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:math></inline-formula> are identified and denoted as <inline-formula id="ieqn-42"><mml:math id="mml-ieqn-42"><mml:msub><mml:mi>c</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:math></inline-formula> and <inline-formula id="ieqn-43"><mml:math id="mml-ieqn-43"><mml:msub><mml:mi>c</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:math></inline-formula>, respectively. The flowchart for selecting feature sample pairs is shown in <xref ref-type="fig" rid="fig-3">Fig. 3</xref>.</p>
<fig id="fig-3">
<label>Figure 3</label>
<caption>
<title>Process diagram for selecting feature sample pairs</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73798-fig-3.tif"/>
</fig>
</sec>
<sec id="s3_3_2">
<label>3.3.2</label>
<title>Sample Fusion</title>
<p>We adopt the Blended Injection Strategy (BIS) [<xref ref-type="bibr" rid="ref-30">30</xref>]. The image samples to be detected are fused with two clean samples that exhibit significant feature differences, in order to construct hybrid samples with discriminative properties. Specifically, if the original detection image is <inline-formula id="ieqn-44"><mml:math id="mml-ieqn-44"><mml:mi>c</mml:mi></mml:math></inline-formula> and the two feature comparison samples are <inline-formula id="ieqn-45"><mml:math id="mml-ieqn-45"><mml:msub><mml:mi>c</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:math></inline-formula> and <inline-formula id="ieqn-46"><mml:math id="mml-ieqn-46"><mml:msub><mml:mi>c</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:math></inline-formula>, the fusion process can be formally expressed as:
<disp-formula id="eqn-6"><label>(6)</label><mml:math id="mml-eqn-6" display="block"><mml:msub><mml:mi>c</mml:mi><mml:mrow><mml:mrow><mml:mtext>mix</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>c</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>&#x00D7;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>&#x03B1;</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:mi>c</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>&#x03B1;</mml:mi><mml:mo>,</mml:mo><mml:mspace width="1em" /><mml:mi>i</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mo fence="false" stretchy="false">{</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>2</mml:mn><mml:mo fence="false" stretchy="false">}</mml:mo></mml:math></disp-formula>where <inline-formula id="ieqn-47"><mml:math id="mml-ieqn-47"><mml:mi>&#x03B1;</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> represents the image fusion ratio, which controls the blending weights between the original detection image and the two clean feature contrast samples.</p>
</sec>
<sec id="s3_3_3">
<label>3.3.3</label>
<title>Reverse Reasoning</title>
<p>Through the above feature fusion step, two mixed samples can be constructed respectively and recorded as <inline-formula id="ieqn-48"><mml:math id="mml-ieqn-48"><mml:msub><mml:mi>c</mml:mi><mml:mrow><mml:mtext>mix1</mml:mtext></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula id="ieqn-49"><mml:math id="mml-ieqn-49"><mml:msub><mml:mi>c</mml:mi><mml:mrow><mml:mtext>mix2</mml:mtext></mml:mrow></mml:msub></mml:math></inline-formula>. Then, these mixed samples are input into the target classifier <italic>C</italic> to obtain their corresponding prediction outputs <inline-formula id="ieqn-50"><mml:math id="mml-ieqn-50"><mml:mrow><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mi>i</mml:mi><mml:mi>x</mml:mi><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> and <inline-formula id="ieqn-51"><mml:math id="mml-ieqn-51"><mml:mrow><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mi>i</mml:mi><mml:mi>x</mml:mi><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>. 
By comparing the classification results of <inline-formula id="ieqn-52"><mml:math id="mml-ieqn-52"><mml:mrow><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> and <inline-formula id="ieqn-53"><mml:math id="mml-ieqn-53"><mml:mrow><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula>, we can infer whether the sample is controlled by the potential backdoor trigger. If <inline-formula id="ieqn-54"><mml:math id="mml-ieqn-54"><mml:mrow><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula>, it indicates that the model maintains a stable prediction of a particular category under the fusion disturbance, which means that the sample to be detected contains triggers and thus activates backdoor behavior in the model. On the contrary, if the two prediction results are significantly different, it means that the sample to be detected is normal, and its decision boundary is vulnerable to fusion disturbance. The decision function <inline-formula id="ieqn-55"><mml:math id="mml-ieqn-55"><mml:mi>G</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> is defined as follows:
<disp-formula id="eqn-7"><label>(7)</label><mml:math id="mml-eqn-7" display="block"><mml:mi>G</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mtable columnalign="left left" rowspacing=".2em" columnspacing="1em" displaystyle="false"><mml:mtr><mml:mtd><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>&#x2260;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mtd></mml:mtr></mml:mtable><mml:mo fence="true" stretchy="true" symmetric="true"></mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>If <inline-formula id="ieqn-56"><mml:math id="mml-ieqn-56"><mml:mi>G</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:math></inline-formula>, the sample to be detected is suspected to be a poisoned sample. Conversely, it is considered to be a clean sample. This discriminative mechanism does not need to access the model structure and parameters, which makes it suitable for black-box detection, and the fusion disturbance amplifies the difference in output stability between poisoned and clean samples, thus improving the accuracy and robustness of detection.</p>
</sec>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Selection Principle and Strategy for the Fusion Ratio <bold><inline-formula id="ieqn-57"><mml:math id="mml-ieqn-57"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula></bold></title>
<p>The fusion ratio <inline-formula id="ieqn-58"><mml:math id="mml-ieqn-58"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula> is a key parameter that balances the influence between the test sample <inline-formula id="ieqn-59"><mml:math id="mml-ieqn-59"><mml:mi>x</mml:mi></mml:math></inline-formula> and the clean reference sample <inline-formula id="ieqn-60"><mml:math id="mml-ieqn-60"><mml:msub><mml:mi>c</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:math></inline-formula>. Extensive experiments show that an effective range for <inline-formula id="ieqn-61"><mml:math id="mml-ieqn-61"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula> consistently lies between <inline-formula id="ieqn-62"><mml:math id="mml-ieqn-62"><mml:mn>0.3</mml:mn></mml:math></inline-formula> and <inline-formula id="ieqn-63"><mml:math id="mml-ieqn-63"><mml:mn>0.6</mml:mn></mml:math></inline-formula>. The underlying principle is that <inline-formula id="ieqn-64"><mml:math id="mml-ieqn-64"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula> should be large enough to preserve potential trigger signals in the test sample, yet small enough to allow the clean reference to provide a strong and correct semantic context for consistency checking.</p>
<p>Based on an in-depth analysis of the spatial characteristics of triggers from different backdoor attacks, we summarize the following empirical selection strategy:
<list list-type="bullet">
<list-item>
<p><bold>For Local Trigger Attacks:</bold> Such as BadNets and IAB, where the trigger is confined to a small region of the image. We recommend using a lower <inline-formula id="ieqn-65"><mml:math id="mml-ieqn-65"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula> value (approximately <inline-formula id="ieqn-66"><mml:math id="mml-ieqn-66"><mml:mn>0.3</mml:mn></mml:math></inline-formula>&#x2013;<inline-formula id="ieqn-67"><mml:math id="mml-ieqn-67"><mml:mn>0.5</mml:mn></mml:math></inline-formula>). A smaller fusion ratio helps retain the local trigger pattern in the blended image without overwhelming it with clean-sample semantics.</p></list-item>
<list-item>
<p><bold>For Global Trigger Attacks:</bold> Such as Blend and WaNet, where the trigger is distributed across the entire image as subtle perturbations or warping. We recommend using a higher <inline-formula id="ieqn-68"><mml:math id="mml-ieqn-68"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula> value (approximately <inline-formula id="ieqn-69"><mml:math id="mml-ieqn-69"><mml:mn>0.4</mml:mn></mml:math></inline-formula>&#x2013;<inline-formula id="ieqn-70"><mml:math id="mml-ieqn-70"><mml:mn>0.6</mml:mn></mml:math></inline-formula>) to ensure that the global trigger maintains sufficient influence in the fused image.</p></list-item>
</list></p>
<p>The above strategy provides a robust guideline for scenarios where the attack type is known or can be inferred. For completely unknown cases, we suggest a simple grid search procedure over <inline-formula id="ieqn-71"><mml:math id="mml-ieqn-71"><mml:mi>&#x03B1;</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:mn>0.3</mml:mn><mml:mo>,</mml:mo><mml:mn>0.6</mml:mn><mml:mo stretchy="false">]</mml:mo></mml:math></inline-formula> with a step size of <inline-formula id="ieqn-72"><mml:math id="mml-ieqn-72"><mml:mn>0.01</mml:mn></mml:math></inline-formula>, using a small validation subset to select the optimal value. This protocol has been verified effective in our experiments, ensuring the AFI method&#x2019;s robustness and reproducibility across diverse datasets and attack types.</p>
</sec>
<sec id="s3_5">
<label>3.5</label>
<title>Summary</title>
<p>Firstly, we propose a sample-level universal backdoor detection method, which thoroughly embodies the dialectical unity of attack and defense. This approach employs a straightforward mechanism to achieve accurate and efficient defense. Secondly, for the backdoor defender, the method only requires a clean sample dataset. By extracting features and computing the feature distances, it selects the two samples with the most significant feature discrepancy. Compared with conventional backdoor defense techniques, our method greatly reduces the amount of data computation and storage. For users, this method does not require any operation on the dataset; they simply upload the image samples to be detected, which makes it convenient to use. Thirdly, our method focuses on the new defense concept of trigger detection. It only needs to detect the poisoned samples containing triggers and prevent these poisoned samples from entering the model, which not only avoids backdoor attacks but also retains the availability of the model on clean samples. The AFI algorithm is shown in Algorithm 1.</p>
<fig id="fig-7">
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73798-fig-7.tif"/>
</fig>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experiments</title>
<sec id="s4_1">
<label>4.1</label>
<title>Datasets and Models</title>
<p>The experiment involves three datasets, including MNIST [<xref ref-type="bibr" rid="ref-31">31</xref>], CIFAR-10 [<xref ref-type="bibr" rid="ref-32">32</xref>] and ImageNet subsets [<xref ref-type="bibr" rid="ref-33">33</xref>]. These datasets cover a variety of recognition tasks, including the classification of common objects, fine-grained classification, and action recognition.</p>
<p>All datasets are initially composed of clean samples. We implement four prominent backdoor attacks on these datasets: BadNets, Blend [<xref ref-type="bibr" rid="ref-14">14</xref>], WaNet [<xref ref-type="bibr" rid="ref-16">16</xref>], and IAB [<xref ref-type="bibr" rid="ref-13">13</xref>]. The ResNet-18 architecture [<xref ref-type="bibr" rid="ref-34">34</xref>] serves as the target model for both backdoor attack implantation and defense evaluation.</p>
<p><bold>Attack baseline:</bold> We carried out four common backdoor attacks, including BadNets [<xref ref-type="bibr" rid="ref-9">9</xref>], Blend, WaNet, and IAB. In BadNets, WaNet, and IAB, we choose 1 (<inline-formula id="ieqn-86"><mml:math id="mml-ieqn-86"><mml:mrow><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:math></inline-formula>) as the target label, and in Blend, we choose 0 (<inline-formula id="ieqn-87"><mml:math id="mml-ieqn-87"><mml:mrow><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:math></inline-formula>) as the target label. The poisoning rate for all attacks is set to 10%, meaning 10% of the training samples are poisoned. For poison-label attacks (BadNets, WaNet, IAB), this involves modifying both the sample and its label. For the clean-label attack (Blend), only the sample is modified while its original label is preserved.</p>
<p><bold>Defense baseline:</bold> We compare our method with six existing backdoor defense methods, including fine pruning (FP) [<xref ref-type="bibr" rid="ref-25">25</xref>], neural attention distillation (NAD) [<xref ref-type="bibr" rid="ref-35">35</xref>], anti-backdoor learning (ABL) [<xref ref-type="bibr" rid="ref-36">36</xref>], STRIP [<xref ref-type="bibr" rid="ref-26">26</xref>], SentiNet [<xref ref-type="bibr" rid="ref-27">27</xref>], and MNTD [<xref ref-type="bibr" rid="ref-28">28</xref>]. Since FP, NAD, and ABL are sensitive to their hyperparameters, we optimize their best results by grid search.</p>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Evaluation Indicators</title>
<p>In order to evaluate the performance of AFI, we use three indicators: the accuracy of clean images (ACC), the success rate of detecting backdoor images (DSR), and the detection stability (DSP).</p>
<p>ACC reflects the classification ability of the model on a clean image. A higher value indicates that the model is not disturbed by backdoor attacks.
<disp-formula id="eqn-8"><label>(8)</label><mml:math id="mml-eqn-8" display="block"><mml:mrow><mml:mi>A</mml:mi><mml:mi>C</mml:mi><mml:mrow><mml:msub><mml:mrow><mml:mi>C</mml:mi></mml:mrow><mml:mrow><mml:mtext mathvariant="italic">clean</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:msub><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext mathvariant="italic">clean</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:mfrac><mml:munderover><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:munderover><mml:mrow><mml:mo>&#x220F;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mi>C</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></disp-formula>where <inline-formula id="ieqn-88"><mml:math id="mml-ieqn-88"><mml:msub><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mtext mathvariant="italic">clean</mml:mtext></mml:mrow></mml:msub></mml:math></inline-formula> is the total number of clean images, <inline-formula id="ieqn-89"><mml:math id="mml-ieqn-89"><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> is the prediction label of the model for sample <inline-formula id="ieqn-90"><mml:math 
id="mml-ieqn-90"><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula>, <inline-formula id="ieqn-91"><mml:math id="mml-ieqn-91"><mml:mrow><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> is the true label of sample <inline-formula id="ieqn-92"><mml:math id="mml-ieqn-92"><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula>, and <inline-formula id="ieqn-93"><mml:math id="mml-ieqn-93"><mml:mo>&#x220F;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> is the indicator function. When the prediction is correct, it returns 1; otherwise, it returns 0.</p>
<p>DSR reflects the ability of AFI to successfully detect whether the sample is poisoned. Ideally, we hope that the defense success rate is as close as possible to 1, which means that the defense can effectively distinguish between clean samples and poisoned samples.
<disp-formula id="eqn-9"><label>(9)</label><mml:math id="mml-eqn-9" display="block"><mml:mi>D</mml:mi><mml:mi>S</mml:mi><mml:mi>R</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mn>2</mml:mn></mml:mfrac><mml:mrow><mml:mo>[</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:mfrac><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munderover><mml:mo>&#x220F;</mml:mo><mml:mstyle scriptlevel="0"><mml:mrow><mml:mo maxsize="1.2em" minsize="1.2em">(</mml:mo></mml:mrow></mml:mstyle><mml:msub><mml:mi>C</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>i</mml:mi><mml:mi>x</mml:mi><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2260;</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>i</mml:mi><mml:mi>x</mml:mi><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mstyle scriptlevel="0"><mml:mrow><mml:mo maxsize="1.2em" 
minsize="1.2em">)</mml:mo></mml:mrow></mml:mstyle><mml:mo>+</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:mfrac><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munderover><mml:mo>&#x220F;</mml:mo><mml:mstyle scriptlevel="0"><mml:mrow><mml:mo maxsize="1.2em" minsize="1.2em">(</mml:mo></mml:mrow></mml:mstyle><mml:msub><mml:mi>C</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>i</mml:mi><mml:mi>x</mml:mi><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>i</mml:mi><mml:mi>x</mml:mi><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mstyle scriptlevel="0"><mml:mrow><mml:mo maxsize="1.2em" minsize="1.2em">)</mml:mo></mml:mrow></mml:mstyle><mml:mo>]</mml:mo></mml:mrow></mml:math></disp-formula>where <inline-formula id="ieqn-94"><mml:math id="mml-ieqn-94"><mml:msub><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the total number of clean images without embedded backdoor triggers, <inline-formula id="ieqn-95"><mml:math 
id="mml-ieqn-95"><mml:msub><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the total number of images with backdoor triggers. The predictions <inline-formula id="ieqn-96"><mml:math id="mml-ieqn-96"><mml:mi>C</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>C</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">m</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">x</mml:mi><mml:mn>1</mml:mn></mml:mrow></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> and <inline-formula id="ieqn-97"><mml:math id="mml-ieqn-97"><mml:mi>C</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>C</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">m</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">x</mml:mi><mml:mn>2</mml:mn></mml:mrow></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> are obtained by fusing the <inline-formula id="ieqn-98"><mml:math id="mml-ieqn-98"><mml:mi>i</mml:mi></mml:math></inline-formula>-th clean image with the two reference samples. 
<inline-formula id="ieqn-99"><mml:math id="mml-ieqn-99"><mml:mi>C</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>C</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">m</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">x</mml:mi><mml:mn>1</mml:mn></mml:mrow></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> and <inline-formula id="ieqn-100"><mml:math id="mml-ieqn-100"><mml:mi>C</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>C</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">m</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">x</mml:mi><mml:mn>2</mml:mn></mml:mrow></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msubsup><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> represent the prediction labels from blending the <inline-formula id="ieqn-101"><mml:math id="mml-ieqn-101"><mml:mi>j</mml:mi></mml:math></inline-formula>-th poisoned image with the two reference samples. <inline-formula id="ieqn-102"><mml:math id="mml-ieqn-102"><mml:mo>&#x220F;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> is the indicator function. In the clean sample detection calculation, if the prediction labels of two fused images are different, it returns 1; otherwise, it returns 0. In the calculation of poisoned sample detection, if the prediction labels of two fused images are the same, it returns 1; otherwise, it returns 0. 
The detection success rate DSR of the AFI is the average of <inline-formula id="ieqn-103"><mml:math id="mml-ieqn-103"><mml:mi>D</mml:mi><mml:mi>S</mml:mi><mml:mrow><mml:msub><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> and <inline-formula id="ieqn-104"><mml:math id="mml-ieqn-104"><mml:mi>D</mml:mi><mml:mi>S</mml:mi><mml:mrow><mml:msub><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula>.</p>
<p>We propose DSP (Detection Stability Performance) as a new evaluation metric to assess the universality and robustness of detection methods under practical conditions. DSP measures performance consistency across different datasets and attack methodologies. This metric addresses the real-world challenge where users cannot predetermine the attack presence or specific attack types, thus requiring defenses that remain effective in cross-domain scenarios.</p>
<p>DSP combines performance metrics across datasets and attacks, including average clean accuracy (<inline-formula id="ieqn-105"><mml:math id="mml-ieqn-105"><mml:mi>D</mml:mi><mml:mi>S</mml:mi><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">A</mml:mi><mml:mi mathvariant="normal">C</mml:mi><mml:mi mathvariant="normal">C</mml:mi></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula>) and average detection rate on poisoned samples (<inline-formula id="ieqn-106"><mml:math id="mml-ieqn-106"><mml:mi>D</mml:mi><mml:mi>S</mml:mi><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">D</mml:mi><mml:mi mathvariant="normal">S</mml:mi><mml:mi mathvariant="normal">R</mml:mi></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula>), formulated as:
<disp-formula id="eqn-10"><label>(10)</label><mml:math id="mml-eqn-10" display="block"><mml:mi>D</mml:mi><mml:mi>S</mml:mi><mml:mi>P</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mn>2</mml:mn></mml:mfrac><mml:mrow><mml:mo>[</mml:mo><mml:msqrt><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>m</mml:mi></mml:mrow></mml:mfrac><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:munderover><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:munderover><mml:mi>A</mml:mi><mml:mi>C</mml:mi><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:msqrt><mml:mo>+</mml:mo><mml:msqrt><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>m</mml:mi></mml:mrow></mml:mfrac><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:munderover><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:munderover><mml:mi>D</mml:mi><mml:mi>S</mml:mi><mml:msub><mml:mi>R</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:msqrt><mml:mo>]</mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>The square root operation normalizes magnitude differences across heterogeneous datasets and attack types, preventing any single condition from dominating the overall DSP. The equal weighting balances the contributions of classification accuracy and detection success rate, which may differ in scale or importance depending on the scenario. These design choices enhance the stability and interpretability of DSP while maintaining sufficient sensitivity to distinguish between methods.</p>
<p>The formula highlights how the square root reduces the influence of extreme values, and the weighting balances the two components, yielding a robust and interpretable evaluation metric across diverse datasets and attack types.</p>
<p>A larger DSP indicates that the detection method is more consistent and stable across scenarios, making it a valuable measure of universality and deployment potential in complex, open environments.</p>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Result</title>
<p>To verify the superiority of AFI, we conduct four types of backdoor attacks across three datasets using ResNet. While the attacked models maintain high classification accuracy and attack success rates, the results of applying AFI to each of them are shown in <xref ref-type="table" rid="table-1">Table 1</xref>.</p>
<table-wrap id="table-1">
<label>Table 1</label>
<caption>
<title>Defense-effect of AFI detection</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/> </colgroup>
<tbody>
<tr>
<td rowspan="2"><bold>Dataset</bold></td>
<td></td>
<td></td>
<td align="center" colspan="4"><bold>Attack method</bold></td>
<td rowspan="2"><bold>DSP</bold></td>
</tr>
<tr>
<td></td>
<td></td>
<td><bold>BadNets</bold></td>
<td><bold>Blend</bold></td>
<td><bold>WaNet</bold></td>
<td><bold>IAB</bold></td>
</tr>
<tr>
<td rowspan="4">MNIST</td>
<td>Before</td>
<td>ACC</td>
<td>98.2</td>
<td>99.1</td>
<td>97.8</td>
<td>96.5</td>
<td>/</td>
</tr>
<tr>
<td></td>
<td>DSR</td>
<td>0.9</td>
<td>0</td>
<td>0.5</td>
<td>0.1</td>
<td>/</td>
</tr>
<tr>
<td>After</td>
<td>ACC</td>
<td>97.1</td>
<td>99.1</td>
<td>96.5</td>
<td>94.7</td>
<td rowspan="2">95.20</td>
</tr>
<tr>
<td></td>
<td>DSR</td>
<td>93.5</td>
<td>94.6</td>
<td>92.0</td>
<td>94.1</td>
</tr>
<tr>
<td rowspan="4">CIFAR-10</td>
<td>Before</td>
<td>ACC</td>
<td>94.9</td>
<td>94.1</td>
<td>93.6</td>
<td>94.2</td>
<td>/</td>
</tr>
<tr>
<td></td>
<td>DSR</td>
<td>0</td>
<td>1.7</td>
<td>0.1</td>
<td>0</td>
<td>/</td>
</tr>
<tr>
<td>After</td>
<td>ACC</td>
<td>94.1</td>
<td>94.0</td>
<td>93.4</td>
<td>93.9</td>
<td rowspan="2">94.15</td>
</tr>
<tr>
<td></td>
<td>DSR</td>
<td>94.5</td>
<td>93.8</td>
<td>94.5</td>
<td>95.1</td>
</tr>
<tr>
<td rowspan="4">ImageNet</td>
<td>Before</td>
<td>ACC</td>
<td>79.5</td>
<td>82.5</td>
<td>79.1</td>
<td>78.2</td>
<td>/</td>
</tr>
<tr>
<td></td>
<td>DSR</td>
<td>0.2</td>
<td>0.5</td>
<td>1.1</td>
<td>0.4</td>
<td>/</td>
</tr>
<tr>
<td>After</td>
<td>ACC</td>
<td>78.1</td>
<td>81.9</td>
<td>78.5</td>
<td>76.5</td>
<td rowspan="2">86.49</td>
</tr>
<tr>
<td></td>
<td>DSR</td>
<td>96.3</td>
<td>92.4</td>
<td>93.3</td>
<td>94.9</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="table-1fn1" fn-type="other">
<p>Note: ACC: Accuracy (%); DSR: Detection Success Rate (%).</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>Evaluation results in <xref ref-type="table" rid="table-1">Table 1</xref> confirm AFI&#x2019;s universal defense capability. The method elevates DSR from near-zero to &#x003E;92% across all tested configurations while preserving ACC within 2% of its original value. Consistent DSP performance (86.49&#x2013;95.20) demonstrates robustness against diverse attack patterns (BadNets, Blend, WaNet, IAB) and dataset complexities, establishing AFI as an effective sample-level backdoor detection solution.</p>

</sec>
<sec id="s4_4">
<label>4.4</label>
<title>Comparative</title>
<sec id="s4_4_1">
<label>4.4.1</label>
<title>Comparison of Similar Methods</title>
<p>In order to verify the effectiveness of AFI, we select six representative and widely used backdoor defenses as comparison baselines, namely FP, NAD, ABL, STRIP, SentiNet, and MNTD. We systematically evaluate four typical backdoor attacks on the CIFAR-10 dataset, which are BadNets, Blend, WaNet, and IAB. <xref ref-type="table" rid="table-2">Table 2</xref> summarizes the model classification ACC and DSR under different defense settings on the CIFAR-10 dataset.</p>
<table-wrap id="table-2">
<label>Table 2</label>
<caption>
<title>Comparison of different defense effects on CIFAR-10</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/> </colgroup>
<tbody>
<tr>
<td rowspan="2"><bold>Dataset</bold></td>
<td rowspan="2"><bold>Attack</bold></td>
<td align="center" colspan="4"><bold>CIFAR-10</bold></td>
<td rowspan="2"><bold>DSP</bold></td>
<td></td>
</tr>
<tr>
<td><bold>BadNets</bold></td>
<td><bold>Blend</bold></td>
<td><bold>WaNet</bold></td>
<td><bold>IAB</bold></td>
<td></td>
</tr>
<tr>
<td rowspan="2">No defense</td>
<td>ACC</td>
<td>94.9</td>
<td>94.1</td>
<td>93.6</td>
<td>94.2</td>
<td>94.20</td>
<td rowspan="2">47.33</td>
</tr>
<tr>
<td>DSR</td>
<td>0</td>
<td>1.7</td>
<td>0.1</td>
<td>0</td>
<td>0.45</td>
</tr>
<tr>
<td rowspan="2">FP</td>
<td>ACC</td>
<td>93.9</td>
<td>92.9</td>
<td>90.4</td>
<td>89.3</td>
<td>91.63</td>
<td rowspan="2">61.37</td>
</tr>
<tr>
<td>DSR</td>
<td>98.2</td>
<td><underline>22.9</underline></td>
<td><underline>1.4</underline></td>
<td><underline>1.9</underline></td>
<td>31.10</td>
</tr>
<tr>
<td rowspan="2">NAD</td>
<td>ACC</td>
<td>88.2</td>
<td>85.8</td>
<td><underline>71.3</underline></td>
<td>82.8</td>
<td>82.03</td>
<td rowspan="2">88.66</td>
</tr>
<tr>
<td>DSR</td>
<td>95.4</td>
<td>96.6</td>
<td>93.3</td>
<td>95.8</td>
<td>95.28</td>
</tr>
<tr>
<td rowspan="2">ABL</td>
<td>ACC</td>
<td>93.8</td>
<td>91.9</td>
<td><underline>84.1</underline></td>
<td>93.4</td>
<td>90.80</td>
<td rowspan="2"><bold>94.15</bold></td>
</tr>
<tr>
<td>DSR</td>
<td>98.9</td>
<td>98.4</td>
<td>97.8</td>
<td>94.9</td>
<td><bold>97.50</bold></td>
</tr>
<tr>
<td rowspan="2">STRIP</td>
<td>ACC</td>
<td>94.6</td>
<td>89.3</td>
<td>88.7</td>
<td>91.2</td>
<td>90.95</td>
<td rowspan="2">90.22</td>
</tr>
<tr>
<td>DSR</td>
<td>98.5</td>
<td>86.2</td>
<td><underline>80.4</underline></td>
<td>92.8</td>
<td>89.48</td>
</tr>
<tr>
<td rowspan="2">SentiNet</td>
<td>ACC</td>
<td>93.1</td>
<td>85.7</td>
<td>86.4</td>
<td>91.8</td>
<td>89.25</td>
<td rowspan="2">87.73</td>
</tr>
<tr>
<td>DSR</td>
<td>97.6</td>
<td><underline>79.5</underline></td>
<td><underline>81.4</underline></td>
<td>86.3</td>
<td>86.20</td>
</tr>
<tr>
<td rowspan="2">MNTD</td>
<td>ACC</td>
<td>93.9</td>
<td>92.8</td>
<td><underline>92.3</underline></td>
<td>89.1</td>
<td>92.03</td>
<td rowspan="2">92.72</td>
</tr>
<tr>
<td>DSR</td>
<td>97.2</td>
<td>96.5</td>
<td>93.6</td>
<td><underline>86.3</underline></td>
<td>93.40</td>
</tr>
<tr>
<td rowspan="2">AFI (ours)</td>
<td>ACC</td>
<td>94.1</td>
<td>94.0</td>
<td>93.4</td>
<td>93.9</td>
<td><bold>93.85</bold></td>
<td rowspan="2"><bold>94.15</bold></td>
</tr>
<tr>
<td>DSR</td>
<td>94.5</td>
<td>93.8</td>
<td>94.5</td>
<td>95.1</td>
<td>94.45</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="table-2fn1" fn-type="other">
<p>Note: Bold data means excellent performance. Underlined data indicates poor performance.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>The experimental results show that all attacks achieve high classification accuracy (<inline-formula id="ieqn-107"><mml:math id="mml-ieqn-107"><mml:mo>&#x003E;</mml:mo><mml:mspace width="negativethinmathspace" /><mml:mn>93</mml:mn><mml:mi mathvariant="normal">&#x0025;</mml:mi></mml:math></inline-formula>) in the absence of defense, while the DSR remains nearly zero. This indicates that the model is successfully implanted with a backdoor yet still maintains strong clean-sample performance. FP achieves a high DSR against BadNets, but its defensive capability drops sharply when facing more complex and covert attacks such as Blend, WaNet, and IAB. Although NAD attains a 97.8% defense success rate on WaNet, this comes at the cost of nearly a 20% reduction in accuracy, significantly compromising the original model performance. ABL provides stronger defense across the four attacks, but still suffers from a notable accuracy drop under WaNet, suggesting limitations in handling warped triggers. STRIP and SentiNet, as representative inference-time detection approaches, exhibit considerable performance variability across different attack paradigms. STRIP achieves excellent performance against BadNets (DSR: 98.5%), yet its detection rate decreases substantially on WaNet (DSR: 80.4%), highlighting the inherent limitations of its input-perturbation-based entropy analysis when confronted with global or feature-space perturbations. SentiNet also displays strong attack-dependent behavior, achieving only 79.5% detection on Blend due to the failure of its saliency-based spatial localization assumptions under globally distributed triggers. MNTD, which adopts a meta-learning framework, shows relatively more stable performance with an average DSR of 93.40%. However, its detection rate on IAB (86.3%) is comparatively lower, and its average ACC (92.03%) is also somewhat affected. 
These results indicate that the method&#x2019;s effectiveness still depends on the diversity and coverage of attack patterns used during training.</p>
<p>In contrast, the proposed AFI method demonstrates consistently strong performance across all attack scenarios. It achieves a high detection rate (94.45% DSR) while preserving excellent classification accuracy (93.85% ACC), nearly matching the undefended baseline. Compared with inference-time detectors, AFI improves DSR by more than 13 percentage points on complex attacks and avoids the need for meta-training. These results confirm that AFI achieves an optimal balance between detection effectiveness, model preservation, and generalization capability.</p>
<p><xref ref-type="table" rid="table-3">Table 3</xref> reports the ACC and DSR of models trained on ImageNet under different defense methods (FP, NAD, ABL, AFI) against four typical backdoor attacks (BadNets, Blend, WaNet, IAB).</p>
<table-wrap id="table-3">
<label>Table 3</label>
<caption>
<title>Comparison of different defense effects on ImageNet</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/> </colgroup>
<tbody>
<tr>
<td rowspan="2"><bold>Dataset</bold></td>
<td rowspan="2"><bold>Attack</bold></td>
<td align="center" colspan="4"><bold>ImageNet</bold></td>
<td rowspan="2"><bold>DSP</bold></td>
<td></td>
</tr>
<tr>
<td><bold>BadNets</bold></td>
<td><bold>Blend</bold></td>
<td><bold>WaNet</bold></td>
<td><bold>IAB</bold></td>
<td></td>
</tr>
<tr>
<td rowspan="2">No defense</td>
<td>ACC</td>
<td>79.5</td>
<td>82.5</td>
<td>79.1</td>
<td>78.2</td>
<td>79.83</td>
<td rowspan="2">40.19</td>
</tr>
<tr>
<td>DSR</td>
<td>0.2</td>
<td>0.5</td>
<td>1.1</td>
<td>0.4</td>
<td>0.55</td>
</tr>
<tr>
<td rowspan="2">FP</td>
<td>ACC</td>
<td>70.3</td>
<td>63.4</td>
<td>58.2</td>
<td>58.7</td>
<td>62.65</td>
<td rowspan="2">58.86</td>
</tr>
<tr>
<td>DSR</td>
<td>98.4</td>
<td>90.5</td>
<td><underline>15.6</underline></td>
<td><underline>15.8</underline></td>
<td>55.08</td>
</tr>
<tr>
<td rowspan="2">NAD</td>
<td>ACC</td>
<td><underline>65.1</underline></td>
<td><underline>64.8</underline></td>
<td><underline>63.8</underline></td>
<td><underline>63.8</underline></td>
<td>64.38</td>
<td rowspan="2">81.28</td>
</tr>
<tr>
<td>DSR</td>
<td>94.9</td>
<td>99.7</td>
<td>98.7</td>
<td>99.4</td>
<td><bold>98.18</bold></td>
</tr>
<tr>
<td rowspan="2">ABL</td>
<td>ACC</td>
<td>81.9</td>
<td>82.3</td>
<td>80.6</td>
<td>83.1</td>
<td><bold>81.98</bold></td>
<td rowspan="2">77.23</td>
</tr>
<tr>
<td>DSR</td>
<td>99.7</td>
<td><underline>0</underline></td>
<td>90.2</td>
<td>100</td>
<td>72.48</td>
</tr>
<tr>
<td rowspan="2">AFI (ours)</td>
<td>ACC</td>
<td>78.1</td>
<td>81.9</td>
<td>78.5</td>
<td>76.5</td>
<td>78.80</td>
<td rowspan="2"><bold>86.49</bold></td>
</tr>
<tr>
<td>DSR</td>
<td>96.3</td>
<td>92.4</td>
<td>93.3</td>
<td>94.9</td>
<td>94.23</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="table-3fn1" fn-type="other">
<p>Note: Bold data means excellent performance. Underlined data indicates poor performance.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>The experimental results show that, under baseline conditions without defense, all four backdoor attacks maintain high classification accuracy (above 78%), while the DSR remains close to 0. This indicates that the attacks successfully manipulate model behavior with minimal impact on clean-sample classification. FP demonstrates good defensive performance against BadNets and Blend, but performs poorly against more stealthy attacks. For example, the DSR under WaNet and IAB drops to around 15%, indicating that FP is limited and cannot generalize across diverse attack types. NAD achieves consistently high DSR across all attacks, but at the cost of significantly reducing the model&#x2019;s classification accuracy. ABL exhibits highly polarized performance, achieving 100% defense success under IAB but failing (0% DSR) against Blend. In contrast, AFI maintains stable and balanced performance, preserving both high ACC and robust DSR across all attack scenarios.</p>
<p>AFI demonstrates strong robustness, good transferability, and controllable accuracy degradation when defending against diverse backdoor attacks, highlighting its potential for practical deployment. <xref ref-type="fig" rid="fig-4">Fig. 4</xref> compares the performance of AFI with three baseline defenses under various backdoor attacks on the ImageNet and CIFAR-10 datasets. As shown in <xref ref-type="fig" rid="fig-4">Fig. 4a</xref>, both FP and AFI maintain high and stable classification accuracy across the four attacks. However, <xref ref-type="fig" rid="fig-4">Fig. 4b</xref> indicates that FP&#x2019;s DSR fluctuates substantially, revealing its instability across different attack types. Similarly, <xref ref-type="fig" rid="fig-4">Fig. 4c</xref> shows that ABL and AFI perform well overall, but <xref ref-type="fig" rid="fig-4">Fig. 4d</xref> reveals that ABL completely fails against the Blend attack, with its detection success rate dropping to 0%. Overall, AFI achieves more robust and reliable defensive performance across attacks and datasets, outperforming the comparative baselines.</p>
<fig id="fig-4">
<label>Figure 4</label>
<caption>
<title>Comparison of indicators based on different datasets</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73798-fig-4.tif"/>
</fig>
</sec>
<sec id="s4_4_2">
<label>4.4.2</label>
<title>Comparison of Different Fusion Proportions</title>
<p>It should be noted that the optimal fusion ratio of AFI varies across different attack types, generally falling between 0.3 and 0.6, which is a key factor influencing detection performance. (1) For BadNets, the trigger is local and typically resides in background regions. Thus, even when the fused image occupies a large proportion, the trigger remains identifiable. Increasing the fusion ratio improves clean-sample detection while still preserving the trigger in poisoned images. (2) In Blend, the trigger is globally embedded across the entire image, and the target-object region of the fused image introduces interference. Therefore, the fusion ratio must be reduced to maintain a high proportion of the poisoned image and preserve attack effectiveness. (3) Similarly, WaNet embeds its trigger globally via geometric warping, requiring the original image to retain a dominant proportion during fusion to prevent the trigger from being suppressed. (4) For IAB, the trigger is placed in non-object regions, and a larger fusion ratio is suitable, which is consistent with the behavior observed in BadNets.</p>
<p>This section examines the factors affecting the fusion ratio and provides representative results in <xref ref-type="fig" rid="fig-5">Fig. 5</xref>. <xref ref-type="fig" rid="fig-5">Fig. 5a</xref>&#x2013;<xref ref-type="fig" rid="fig-5">c</xref> respectively show the defensive accuracy of AFI against BadNets and Blend on MNIST and CIFAR-10.</p>
<fig id="fig-5">
<label>Figure 5</label>
<caption>
<title>Impact of different fusion ratios on model performance</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73798-fig-5.tif"/>
</fig>
<p>Comparing <xref ref-type="fig" rid="fig-5">Fig. 5a</xref> and <xref ref-type="fig" rid="fig-5">b</xref>, it can be observed that when BadNets is applied to MNIST, the DSR of AFI on clean samples decreases as the fusion ratio increases, while the DSR on poisoned samples increases. The optimal fusion ratio in this setting is 0.3. When BadNets is performed on CIFAR-10, the optimal fusion ratio becomes 0.52. These results indicate that for the same attack across different datasets, the optimal fusion ratio of AFI varies. <xref ref-type="fig" rid="fig-5">Fig. 5c</xref> shows the results of Blend on MNIST, where the optimal fusion ratio is 0.58. Combined with <xref ref-type="fig" rid="fig-5">Fig. 5a</xref> and <xref ref-type="fig" rid="fig-5">c</xref>, it can be concluded that AFI also requires different optimal fusion ratios for different attack types on the same dataset. Therefore, when facing unknown attack types or datasets, a grid search over the range of 0.3&#x2013;0.6 (with a step size of 0.01) can be used to identify the optimal fusion ratio.</p>
<p>Taking the experiment in <xref ref-type="fig" rid="fig-5">Fig. 5</xref> as an example, the optimal fusion ratio corresponding to <xref ref-type="fig" rid="fig-5">Fig. 5a</xref> is 0.3. The optimal fusion ratio corresponding to <xref ref-type="fig" rid="fig-5">Fig. 5b</xref> is 0.52, and the visualized image samples with different fusion ratios are shown in <xref ref-type="fig" rid="fig-6">Fig. 6a</xref> and <xref ref-type="fig" rid="fig-6">b</xref>, respectively. Poisoned samples are generated by applying BadNets to clean samples. Clean samples and poisoned samples serve as mutual control groups. Regardless of whether the sample to be tested is poisoned or not, when the fusion ratio <inline-formula id="ieqn-108"><mml:math id="mml-ieqn-108"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula> &#x003C; 0.3, the two results are identical; when the fusion ratio <inline-formula id="ieqn-109"><mml:math id="mml-ieqn-109"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula> &#x003E; 0.3, the two results differ. In both of these ranges, it is impossible to determine whether the sample under detection is poisoned based on the results. In <xref ref-type="fig" rid="fig-6">Fig. 
6a</xref>, when the fusion ratio <inline-formula id="ieqn-110"><mml:math id="mml-ieqn-110"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula> &#x003D; 0.3, if the sample to be detected is clean, <inline-formula id="ieqn-111"><mml:math id="mml-ieqn-111"><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>u</mml:mi><mml:mi>l</mml:mi><mml:mi>t</mml:mi><mml:mn>1</mml:mn><mml:mo>&#x2260;</mml:mo><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>u</mml:mi><mml:mi>l</mml:mi><mml:mi>t</mml:mi><mml:mn>2</mml:mn></mml:math></inline-formula>; If the sample to be tested is poisoned, <inline-formula id="ieqn-112"><mml:math id="mml-ieqn-112"><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>u</mml:mi><mml:mi>l</mml:mi><mml:mi>t</mml:mi><mml:mn>1</mml:mn><mml:mo>=</mml:mo><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>u</mml:mi><mml:mi>l</mml:mi><mml:mi>t</mml:mi><mml:mn>2</mml:mn></mml:math></inline-formula>. Therefore, we first use a large amount of data in the dataset to test the optimal fusion ratio, and then infer whether the sample to be tested is toxic by judging whether result1 is equal to result2. <xref ref-type="fig" rid="fig-6">Fig. 6b</xref> shows that when <inline-formula id="ieqn-113"><mml:math id="mml-ieqn-113"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula> &#x003C; 0.52, the classification results of the two fused images are different, and when <inline-formula id="ieqn-114"><mml:math id="mml-ieqn-114"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula> &#x003E; 0.52, the classification results of the two fused images are the same, making it impossible to determine the poisoning status of the test sample based on the classification results.</p>
<fig id="fig-6">
<label>Figure 6</label>
<caption>
<title>Comparison of model outputs under different fusion ratios</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73798-fig-6.tif"/>
</fig>
</sec>
</sec>
<sec id="s4_5">
<label>4.5</label>
<title>Robustness Analysis against Adaptive Attackers</title>
<p>This section discusses the theoretical robustness of the AFI method when facing a strong, adaptive attacker with full knowledge of the defense mechanism. We consider a white-box attack scenario: the attacker is fully aware of AFI&#x2019;s detection pipeline (including the feature extractor, blending ratio <inline-formula id="ieqn-115"><mml:math id="mml-ieqn-115"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula>, and decision rule) and aims to design a trigger <inline-formula id="ieqn-116"><mml:math id="mml-ieqn-116"><mml:mi>t</mml:mi></mml:math></inline-formula> that can evade detection.</p>
<p>The attacker&#x2019;s objective is formalized as a dual-goal optimization problem: the trigger must ensure that not only is the poisoned sample <inline-formula id="ieqn-117"><mml:math id="mml-ieqn-117"><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mi>x</mml:mi><mml:mo>+</mml:mo><mml:mi>t</mml:mi></mml:math></inline-formula> classified as the target label <inline-formula id="ieqn-118"><mml:math id="mml-ieqn-118"><mml:msub><mml:mi>y</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:math></inline-formula>, but also that the blended output <inline-formula id="ieqn-119"><mml:math id="mml-ieqn-119"><mml:mi>C</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>c</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>&#x00D7;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>&#x03B1;</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup><mml:mo>&#x00D7;</mml:mo><mml:mi>&#x03B1;</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> remains <inline-formula id="ieqn-120"><mml:math id="mml-ieqn-120"><mml:msub><mml:mi>y</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:math></inline-formula> when fused with any clean reference sample <inline-formula id="ieqn-121"><mml:math id="mml-ieqn-121"><mml:msub><mml:mi>c</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:math></inline-formula> from the holdout set <inline-formula id="ieqn-122"><mml:math id="mml-ieqn-122"><mml:mrow><mml:mi>&#x1D49E;</mml:mi></mml:mrow></mml:math></inline-formula>, thereby breaking the prediction consistency check upon which AFI relies.</p>
<p>Theoretically, we argue that successfully crafting such a trigger is inherently difficult for the adversary. AFI&#x2019;s fusion mechanism forces the attacker to solve a conflicting optimization objective: the trigger must be effective on the original poisoned sample <inline-formula id="ieqn-123"><mml:math id="mml-ieqn-123"><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> while simultaneously maintaining its dominance over the model&#x2019;s prediction even when <inline-formula id="ieqn-124"><mml:math id="mml-ieqn-124"><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> is significantly &#x201C;diluted&#x201D; by the semantic content of a clean sample. This effectively necessitates the trigger to possess a global and overwhelmingly strong semantic influence in the feature space.</p>
<p>However, achieving this likely comes at a high cost for the attacker, leading to a critical trade-off:
<list list-type="bullet">
<list-item>
<p><bold>Loss of Stealth:</bold> An overly potent and global trigger required to withstand arbitrary fusion becomes more susceptible to detection via visual inspection or statistical anomaly detection, compromising the fundamental requirement of a stealthy backdoor.</p></list-item>
<list-item>
<p><bold>Degradation of Model Utility:</bold> Embedding such a powerful backdoor functionality often interferes with the model&#x2019;s normal decision-making process, potentially leading to a noticeable drop in clean data accuracy (<italic>ACC</italic>), which could reveal the presence of the attack.</p></list-item>
</list></p>
<p>In conclusion, the AFI mechanism does not attempt to create an impenetrable defense but rather fundamentally raises the bar for a successful attack. It forces the attacker into a difficult trilemma, having to balance attack effectiveness, trigger stealth, and model utility. While the implementation and evaluation of more complex adaptive attacks constitute an important direction for our future work, the above analysis demonstrates that AFI provides a foundation for practical backdoor defense with inherent robustness by making adaptive attacks more costly and difficult to conceal.</p>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Conclusion</title>
<p>We propose a feature-comparison fusion strategy that combines the sample under inspection with two contrasting reference samples and leverages model predictions for reverse reasoning, enabling effective identification of poisoned samples. Extensive experiments demonstrate that AFI robustly defends against four common backdoor attacks, outperforming three mainstream defense methods in detection success rate, accuracy, and stability across different datasets and attack types. Overall, AFI provides a novel and practical approach for backdoor attack defense.</p>
</sec>
</body>
<back>
<ack>
<p>The authors would like to thank all those who provided support and constructive feedback during the preparation of this work.</p>
</ack>
<sec>
<title>Funding Statement</title>
<p>This work was supported by the National Natural Science Foundation of China Grant (No. 61972133), Project of Leading Talents in Science and Technology Innovation for Thousands of People Plan in Henan Province Grant (No. 204200510021), and the Key Research and Development Plan Special Project of Henan Province Grant (No. 241111211400).</p>
</sec>
<sec>
<title>Author Contributions</title>
<p>The authors confirm contribution to the paper as follows: Conceptualization, Simin Tang and Zhiyong Zhang; Data curation, Simin Tang; Formal analysis, Simin Tang; Funding acquisition, Zhiyong Zhang; Investigation, Simin Tang; methodology, Simin Tang; Project administration, Zhiyong Zhang; Resources, Simin Tang and Junyan Pan; Software, Simin Tang; Supervision, Zhiyong Zhang, Weiguo Wang and Junchang Jing; Validation, Simin Tang; Visualization, Simin Tang; Writing&#x2014;original draft, Simin Tang; Writing&#x2014;review &#x0026; editing, Zhiyong Zhang, Junyan Pan, Gaoyuan Quan, Weiguo Wang and Junchang Jing. All authors reviewed the results and approved the final version of the manuscript.</p>
</sec>
<sec sec-type="data-availability">
<title>Availability of Data and Materials</title>
<p>The datasets used in this study are publicly available. Specifically, MNIST [<xref ref-type="bibr" rid="ref-31">31</xref>], CIFAR-10 [<xref ref-type="bibr" rid="ref-32">32</xref>], and ImageNet [<xref ref-type="bibr" rid="ref-33">33</xref>] can be accessed from their official repositories. The implementation code is available from the corresponding author upon reasonable request.</p>
</sec>
<sec>
<title>Ethics Approval</title>
<p>Not applicable. This study does not involve human participants or animals. The experiments were conducted using publicly available datasets: MNIST [<xref ref-type="bibr" rid="ref-31">31</xref>], CIFAR-10 [<xref ref-type="bibr" rid="ref-32">32</xref>], and ImageNet [<xref ref-type="bibr" rid="ref-33">33</xref>].</p>
</sec>
<sec sec-type="COI-statement">
<title>Conflicts of Interest</title>
<p>The authors declare no conflicts of interest to report regarding the present study.</p>
</sec>
<glossary content-type="abbreviations" id="glossary-1">
<title>Abbreviations</title>
<def-list>
<def-item>
<term>AFI</term>
<def>
<p>Adaptive Feature Injection</p>
</def>
</def-item>
<def-item>
<term>CNN</term>
<def>
<p>Convolutional Neural Network</p>
</def>
</def-item>
<def-item>
<term>ACC</term>
<def>
<p>Accuracy</p>
</def>
</def-item>
<def-item>
<term>DSR</term>
<def>
<p>Defense Success Rate</p>
</def>
</def-item>
<def-item>
<term>DSP</term>
<def>
<p>Detection Stability and Portability</p>
</def>
</def-item>
</def-list>
</glossary>
<ref-list content-type="authoryear">
<title>References</title>
<ref id="ref-1"><label>[1]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Sze</surname> <given-names>V</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>YH</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>TJ</given-names></string-name>, <string-name><surname>Emer</surname> <given-names>JS</given-names></string-name></person-group>. <article-title>Efficient processing of deep neural networks: a tutorial and survey</article-title>. <source>Proc IEEE</source>. <year>2017</year>;<volume>105</volume>(<issue>12</issue>):<fpage>2295</fpage>&#x2013;<lpage>329</lpage>. doi:<pub-id pub-id-type="doi">10.1109/JPROC.2017.2761740</pub-id>.</mixed-citation></ref>
<ref id="ref-2"><label>[2]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Gu</surname> <given-names>T</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>K</given-names></string-name>, <string-name><surname>Dolan-Gavitt</surname> <given-names>B</given-names></string-name>, <string-name><surname>Garg</surname> <given-names>S</given-names></string-name></person-group>. <article-title>BadNets: evaluating backdooring attacks on deep neural networks</article-title>. <source>IEEE Access</source>. <year>2019</year>;<volume>7</volume>:<fpage>47230</fpage>&#x2013;<lpage>44</lpage>. doi:<pub-id pub-id-type="doi">10.1109/ACCESS.2019.2909068</pub-id>.</mixed-citation></ref>
<ref id="ref-3"><label>[3]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Bai</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Xing</surname> <given-names>G</given-names></string-name>, <string-name><surname>Wu</surname> <given-names>H</given-names></string-name>, <string-name><surname>Rao</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Ma</surname> <given-names>C</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>S</given-names></string-name></person-group>. <article-title>Backdoor attack and defense on deep learning: a survey</article-title>. <source>IEEE Trans Comput Soc Syst</source>. <year>2024</year>;<volume>12</volume>(<issue>1</issue>):<fpage>404</fpage>&#x2013;<lpage>34</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TCSS.2024.3482723</pub-id>.</mixed-citation></ref>
<ref id="ref-4"><label>[4]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Qi</surname> <given-names>F</given-names></string-name>, <string-name><surname>Yao</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Xu</surname> <given-names>S</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Sun</surname> <given-names>M</given-names></string-name></person-group>. <article-title>Turn the combination lock: learnable textual backdoor attacks via word substitution</article-title>. In: <conf-name>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics</conf-name>; <year>2021 Aug 1&#x2013;6</year>; <publisher-loc>Online</publisher-loc>. p. <fpage>4873</fpage>&#x2013;<lpage>83</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/2021.acl-long.377</pub-id>.</mixed-citation></ref>
<ref id="ref-5"><label>[5]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Zeng</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Park</surname> <given-names>W</given-names></string-name>, <string-name><surname>Mao</surname> <given-names>ZM</given-names></string-name>, <string-name><surname>Jia</surname> <given-names>R</given-names></string-name></person-group>. <article-title>Rethinking the backdoor attacks&#x2019; triggers: a frequency perspective</article-title>. In: <conf-name>Proceedings of the IEEE/CVF International Conference on Computer Vision; 2021 Oct 11&#x2013;17</conf-name>; <publisher-loc>Montreal, QC, Canada</publisher-loc>. p. <fpage>16473</fpage>&#x2013;<lpage>81</lpage>. doi:<pub-id pub-id-type="doi">10.1109/ICCV48922.2021.01616</pub-id>.</mixed-citation></ref>
<ref id="ref-6"><label>[6]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Wang</surname> <given-names>B</given-names></string-name>, <string-name><surname>Yao</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Shan</surname> <given-names>S</given-names></string-name>, <string-name><surname>Li</surname> <given-names>H</given-names></string-name>, <string-name><surname>Viswanath</surname> <given-names>B</given-names></string-name>, <string-name><surname>Zheng</surname> <given-names>H</given-names></string-name></person-group>. <article-title>Neural cleanse: identifying and mitigating backdoor attacks in neural networks</article-title>. In: <conf-name>Proceedings of the IEEE Symposium on Security and Privacy; 2019 May 19&#x2013;23</conf-name>; <publisher-loc>San Francisco, CA, USA</publisher-loc>. p. <fpage>707</fpage>&#x2013;<lpage>23</lpage>. doi:<pub-id pub-id-type="doi">10.1109/SP.2019.00031</pub-id>.</mixed-citation></ref>
<ref id="ref-7"><label>[7]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Zhu</surname> <given-names>L</given-names></string-name>, <string-name><surname>Ning</surname> <given-names>R</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>C</given-names></string-name>, <string-name><surname>Xin</surname> <given-names>C</given-names></string-name>, <string-name><surname>Wu</surname> <given-names>H</given-names></string-name></person-group>. <article-title>Gangsweep: Sweep out neural backdoors by GAN</article-title>. In: <conf-name>Proceedings of the 28th ACM International Conference on Multimedia; 2020 Oct 12&#x2013;16</conf-name>; <publisher-loc>Seattle, WA, USA</publisher-loc>. p. <fpage>3173</fpage>&#x2013;<lpage>81</lpage>. doi:<pub-id pub-id-type="doi">10.1145/3394171.3413546</pub-id>.</mixed-citation></ref>
<ref id="ref-8"><label>[8]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Bai</surname> <given-names>J</given-names></string-name>, <string-name><surname>Gao</surname> <given-names>K</given-names></string-name>, <string-name><surname>Gong</surname> <given-names>D</given-names></string-name>, <string-name><surname>Xia</surname> <given-names>ST</given-names></string-name>, <string-name><surname>Li</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>W</given-names></string-name></person-group>. <article-title>Hardly perceptible trojan attack against neural networks with bit flips</article-title>. In: <conf-name>European Conference on Computer Vision</conf-name>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>; <year>2022</year>. p. <fpage>104</fpage>&#x2013;<lpage>21</lpage>. doi:<pub-id pub-id-type="doi">10.1007/978-3-031-20065-6_7</pub-id>.</mixed-citation></ref>
<ref id="ref-9"><label>[9]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Gu</surname> <given-names>T</given-names></string-name>, <string-name><surname>Dolan-Gavitt</surname> <given-names>B</given-names></string-name>, <string-name><surname>Garg</surname> <given-names>S</given-names></string-name></person-group>. <article-title>Badnets: identifying vulnerabilities in the machine learning model supply chain</article-title>. <comment>arXiv:1708.06733. 2017</comment>.</mixed-citation></ref>
<ref id="ref-10"><label>[10]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Li</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Lyu</surname> <given-names>X</given-names></string-name>, <string-name><surname>Koren</surname> <given-names>N</given-names></string-name>, <string-name><surname>Lyu</surname> <given-names>L</given-names></string-name>, <string-name><surname>Li</surname> <given-names>B</given-names></string-name>, <string-name><surname>Ma</surname> <given-names>X</given-names></string-name></person-group>. <article-title>Anti-backdoor learning: Training clean models on poisoned data</article-title>. In: <conf-name>Proceedings of the 35th Conference on Neural Information Processing Systems</conf-name>; <year>2021 Dec 6&#x2013;14</year>; <publisher-loc>Online</publisher-loc>. p. <fpage>14900</fpage>&#x2013;<lpage>12</lpage>.</mixed-citation></ref>
<ref id="ref-11"><label>[11]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Wenger</surname> <given-names>E</given-names></string-name>, <string-name><surname>Passananti</surname> <given-names>J</given-names></string-name>, <string-name><surname>Bhagoji</surname> <given-names>AN</given-names></string-name>, <string-name><surname>Yao</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Zheng</surname> <given-names>H</given-names></string-name>, <string-name><surname>Zhao</surname> <given-names>BY</given-names></string-name></person-group>. <article-title>Backdoor attacks against deep learning systems in the physical world</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition; 2021 Jun 19&#x2013;25</conf-name>; <publisher-loc>Online</publisher-loc>. p. <fpage>6206</fpage>&#x2013;<lpage>15</lpage>. doi:<pub-id pub-id-type="doi">10.1109/CVPR46437.2021.00614</pub-id>.</mixed-citation></ref>
<ref id="ref-12"><label>[12]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Zhao</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>X</given-names></string-name>, <string-name><surname>Xuan</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Dong</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>D</given-names></string-name>, <string-name><surname>Liang</surname> <given-names>K</given-names></string-name></person-group>. <article-title>Defeat: deep hidden feature backdoor attacks by imperceptible perturbation and latent representation constraints</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition; 2022 Jun 19&#x2013;24</conf-name>; <publisher-loc>New Orleans, LA, USA</publisher-loc>. p. <fpage>15213</fpage>&#x2013;<lpage>22</lpage>. doi:<pub-id pub-id-type="doi">10.1109/CVPR52688.2022.01478</pub-id>.</mixed-citation></ref>
<ref id="ref-13"><label>[13]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Nguyen</surname> <given-names>TA</given-names></string-name>, <string-name><surname>Tran</surname> <given-names>A</given-names></string-name></person-group>. <article-title>Input-aware dynamic backdoor attack</article-title>. <source>Adv Neural Inf Process Syst</source>. <year>2020</year>;<volume>33</volume>:<fpage>3454</fpage>&#x2013;<lpage>64</lpage>.</mixed-citation></ref>
<ref id="ref-14"><label>[14]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Liu</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Ma</surname> <given-names>X</given-names></string-name>, <string-name><surname>Bailey</surname> <given-names>J</given-names></string-name>, <string-name><surname>Lu</surname> <given-names>F</given-names></string-name></person-group>. <article-title>Reflection backdoor: a natural backdoor attack on deep neural networks</article-title>. In: <conf-name>European Conference on Computer Vision</conf-name>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>; <year>2020</year>. p. <fpage>182</fpage>&#x2013;<lpage>99</lpage>.</mixed-citation></ref>
<ref id="ref-15"><label>[15]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Nguyen</surname> <given-names>A</given-names></string-name>, <string-name><surname>Tran</surname> <given-names>A</given-names></string-name></person-group>. <article-title>WaNet: imperceptible warping-based backdoor attack</article-title>. In: <conf-name>Proceedings of the 9th International Conference on Learning Representations; 2021 May 3&#x2013;7</conf-name>; <publisher-loc>Online</publisher-loc>.</mixed-citation></ref>
<ref id="ref-16"><label>[16]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Gan</surname> <given-names>L</given-names></string-name>, <string-name><surname>Li</surname> <given-names>J</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>T</given-names></string-name>, <string-name><surname>Li</surname> <given-names>X</given-names></string-name>, <string-name><surname>Meng</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Wu</surname> <given-names>F</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>Triggerless backdoor attack for NLP tasks with clean labels</article-title>. In: <conf-name>Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies; 2022 Jul 10&#x2013;15</conf-name>; <publisher-loc>Seattle, WA, USA</publisher-loc>. p. <fpage>2942</fpage>&#x2013;<lpage>52</lpage>.</mixed-citation></ref>
<ref id="ref-17"><label>[17]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Zhao</surname> <given-names>S</given-names></string-name>, <string-name><surname>Xu</surname> <given-names>X</given-names></string-name>, <string-name><surname>Xiao</surname> <given-names>L</given-names></string-name>, <string-name><surname>Wen</surname> <given-names>J</given-names></string-name>, <string-name><surname>Tuan</surname> <given-names>LA</given-names></string-name></person-group>. <article-title>Clean-label backdoor attack and defense: an examination of language model vulnerability</article-title>. <source>Expert Syst Appl</source>. <year>2025</year>;<volume>265</volume>:<fpage>125856</fpage>. doi:<pub-id pub-id-type="doi">10.1016/j.eswa.2024.125856</pub-id>.</mixed-citation></ref>
<ref id="ref-18"><label>[18]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Chen</surname> <given-names>W</given-names></string-name>, <string-name><surname>Xu</surname> <given-names>X</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>X</given-names></string-name>, <string-name><surname>Zhou</surname> <given-names>H</given-names></string-name>, <string-name><surname>Li</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>Invisible backdoor attack with attention and steganography</article-title>. <source>Comput Vis Image Underst</source>. <year>2024</year>;<volume>249</volume>(<issue>1</issue>):<fpage>104208</fpage>. doi:<pub-id pub-id-type="doi">10.1016/j.cviu.2024.104208</pub-id>.</mixed-citation></ref>
<ref id="ref-19"><label>[19]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Li</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Jiang</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Li</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Xia</surname> <given-names>ST</given-names></string-name></person-group>. <article-title>Backdoor learning: a survey</article-title>. <source>IEEE Trans Neural Netw Learn Syst</source>. <year>2024</year>;<volume>35</volume>(<issue>1</issue>):<fpage>5</fpage>&#x2013;<lpage>22</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TNNLS.2022.3182979</pub-id>; <pub-id pub-id-type="pmid">35731760</pub-id></mixed-citation></ref>
<ref id="ref-20"><label>[20]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Zhao</surname> <given-names>S</given-names></string-name>, <string-name><surname>Ma</surname> <given-names>X</given-names></string-name>, <string-name><surname>Zheng</surname> <given-names>X</given-names></string-name>, <string-name><surname>Bailey</surname> <given-names>J</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>J</given-names></string-name>, <string-name><surname>Jiang</surname> <given-names>YG</given-names></string-name></person-group>. <article-title>Clean-label backdoor attacks on video recognition models</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition; 2020 Jun 14&#x2013;19</conf-name>; <publisher-loc>Seattle, WA, USA</publisher-loc>. p. <fpage>14443</fpage>&#x2013;<lpage>52</lpage>. doi:<pub-id pub-id-type="doi">10.1109/CVPR42600.2020.01445</pub-id>.</mixed-citation></ref>
<ref id="ref-21"><label>[21]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Chen</surname> <given-names>T</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Chang</surname> <given-names>S</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>S</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>Z</given-names></string-name></person-group>. <article-title>Quarantine: sparsity can uncover the trojan attack trigger for free</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition; 2022 Jun 19&#x2013;24</conf-name>; <publisher-loc>New Orleans, LA, USA</publisher-loc>. p. <fpage>598</fpage>&#x2013;<lpage>609</lpage>. doi:<pub-id pub-id-type="doi">10.1109/CVPR52688.2022.00068</pub-id>.</mixed-citation></ref>
<ref id="ref-22"><label>[22]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Guan</surname> <given-names>J</given-names></string-name>, <string-name><surname>Tu</surname> <given-names>Z</given-names></string-name>, <string-name><surname>He</surname> <given-names>R</given-names></string-name>, <string-name><surname>Tao</surname> <given-names>D</given-names></string-name></person-group>. <article-title>Few-shot backdoor defense using Shapley estimation</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition; 2022 Jun 19&#x2013;24</conf-name>; <publisher-loc>New Orleans, LA, USA</publisher-loc>. p. <fpage>13358</fpage>&#x2013;<lpage>67</lpage>.</mixed-citation></ref>
<ref id="ref-23"><label>[23]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Doan</surname> <given-names>K</given-names></string-name>, <string-name><surname>Lao</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Zhao</surname> <given-names>W</given-names></string-name>, <string-name><surname>Li</surname> <given-names>P</given-names></string-name></person-group>. <article-title>LIRA: learnable, imperceptible and robust backdoor attacks</article-title>. In: <conf-name>Proceedings of the IEEE/CVF International Conference on Computer Vision; 2021 Oct 11&#x2013;17</conf-name>; <publisher-loc>Montreal, QC, Canada</publisher-loc>. p. <fpage>11966</fpage>&#x2013;<lpage>76</lpage>. doi:<pub-id pub-id-type="doi">10.1109/ICCV48922.2021.01175</pub-id>.</mixed-citation></ref>
<ref id="ref-24"><label>[24]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Salem</surname> <given-names>A</given-names></string-name>, <string-name><surname>Wen</surname> <given-names>R</given-names></string-name>, <string-name><surname>Backes</surname> <given-names>M</given-names></string-name>, <string-name><surname>Ma</surname> <given-names>S</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>Dynamic backdoor attacks against machine learning models</article-title>. In: <conf-name>Proceedings of the 2022 IEEE European Symposium on Security and Privacy (EuroS&#x0026;P)</conf-name>; <year>2022</year> Jun 6&#x2013;10; <publisher-loc>Genoa, Italy.</publisher-loc> p. <fpage>703</fpage>&#x2013;<lpage>18</lpage>. doi:<pub-id pub-id-type="doi">10.1109/EuroSP53844.2022.00499</pub-id>.</mixed-citation></ref>
<ref id="ref-25"><label>[25]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Liu</surname> <given-names>K</given-names></string-name>, <string-name><surname>Dolan-Gavitt</surname> <given-names>B</given-names></string-name>, <string-name><surname>Garg</surname> <given-names>S</given-names></string-name></person-group>. <article-title>Fine-pruning: defending against backdooring attacks on deep neural networks</article-title>. In: <conf-name>International Symposium on Research in Attacks, Intrusions, and Defenses</conf-name>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>; <year>2018</year>. p. <fpage>273</fpage>&#x2013;<lpage>94</lpage>.</mixed-citation></ref>
<ref id="ref-26"><label>[26]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Gao</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Xu</surname> <given-names>C</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>D</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>S</given-names></string-name>, <string-name><surname>Ranasinghe</surname> <given-names>DC</given-names></string-name>, <string-name><surname>Nepal</surname> <given-names>S</given-names></string-name></person-group>. <article-title>STRIP: a defence against trojan attacks on deep neural networks</article-title>. In: <conf-name>Proceedings of the Annual Computer Security Applications Conference; 2019 Dec 9&#x2013;13</conf-name>; <publisher-loc>San Juan, PR, USA</publisher-loc>. p. <fpage>113</fpage>&#x2013;<lpage>25</lpage>.</mixed-citation></ref>
<ref id="ref-27"><label>[27]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Chou</surname> <given-names>E</given-names></string-name>, <string-name><surname>Tram&#x00E8;r</surname> <given-names>F</given-names></string-name>, <string-name><surname>Pellegrino</surname> <given-names>G</given-names></string-name>, <string-name><surname>Boneh</surname> <given-names>D</given-names></string-name></person-group>. <article-title>Sentinet: detecting physical attacks against deep learning systems</article-title>. <comment>arXiv:1812.00292. 2018</comment>.</mixed-citation></ref>
<ref id="ref-28"><label>[28]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Rajabi</surname> <given-names>A</given-names></string-name>, <string-name><surname>Asokraj</surname> <given-names>S</given-names></string-name>, <string-name><surname>Jiang</surname> <given-names>F</given-names></string-name>, <string-name><surname>Niu</surname> <given-names>L</given-names></string-name>, <string-name><surname>Ramasubramanian</surname> <given-names>B</given-names></string-name>, <string-name><surname>Ritcey</surname> <given-names>J</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>MDTD: a multi-domain Trojan detector for deep neural networks</article-title>. In: <conf-name>Proceedings of the ACM Conference on Computer and Communications Security; 2023 Nov 26&#x2013;30</conf-name>; <publisher-loc>Copenhagen, Denmark</publisher-loc>. p. <fpage>2232</fpage>&#x2013;<lpage>46</lpage>.</mixed-citation></ref>
<ref id="ref-29"><label>[29]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Guo</surname> <given-names>W</given-names></string-name>, <string-name><surname>Tondi</surname> <given-names>B</given-names></string-name>, <string-name><surname>Barni</surname> <given-names>M</given-names></string-name></person-group>. <article-title>Universal detection of backdoor attacks via density-based clustering and centroids analysis</article-title>. <source>IEEE Trans Inf Forensics Secur</source>. <year>2023</year>;<volume>19</volume>(<issue>7</issue>):<fpage>970</fpage>&#x2013;<lpage>84</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TIFS.2023.3329426</pub-id>.</mixed-citation></ref>
<ref id="ref-30"><label>[30]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Chen</surname> <given-names>X</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>C</given-names></string-name>, <string-name><surname>Li</surname> <given-names>B</given-names></string-name>, <string-name><surname>Lu</surname> <given-names>K</given-names></string-name>, <string-name><surname>Song</surname> <given-names>D</given-names></string-name></person-group>. <article-title>Targeted backdoor attacks on deep learning systems using data poisoning</article-title>. <comment>arXiv:1712.05526. 2017</comment>.</mixed-citation></ref>
<ref id="ref-31"><label>[31]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>LeCun</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Bottou</surname> <given-names>L</given-names></string-name>, <string-name><surname>Bengio</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Haffner</surname> <given-names>P</given-names></string-name></person-group>. <article-title>Gradient-based learning applied to document recognition</article-title>. <source>Proc IEEE</source>. <year>1998</year>;<volume>86</volume>(<issue>11</issue>):<fpage>2278</fpage>&#x2013;<lpage>324</lpage>. doi:<pub-id pub-id-type="doi">10.1109/5.726791</pub-id>.</mixed-citation></ref>
<ref id="ref-32"><label>[32]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Krizhevsky</surname> <given-names>A</given-names></string-name>, <string-name><surname>Hinton</surname> <given-names>G</given-names></string-name></person-group>. <article-title>Learning multiple layers of features from tiny images</article-title>. <comment>Technical report</comment>. <publisher-loc>Toronto, ON, Canada</publisher-loc>: <publisher-name>University of Toronto</publisher-name>; <year>2009</year>.</mixed-citation></ref>
<ref id="ref-33"><label>[33]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Deng</surname> <given-names>J</given-names></string-name>, <string-name><surname>Dong</surname> <given-names>W</given-names></string-name>, <string-name><surname>Socher</surname> <given-names>R</given-names></string-name>, <string-name><surname>Li</surname> <given-names>LJ</given-names></string-name>, <string-name><surname>Li</surname> <given-names>K</given-names></string-name>, <string-name><surname>Fei-Fei</surname> <given-names>L</given-names></string-name></person-group>. <article-title>ImageNet: A large-scale hierarchical image database</article-title>. In: <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition; 2009 Jun 20&#x2013;25</conf-name>; <publisher-loc>Miami, FL, USA</publisher-loc>. p. <fpage>248</fpage>&#x2013;<lpage>55</lpage>. doi:<pub-id pub-id-type="doi">10.1109/CVPR.2009.5206848</pub-id>.</mixed-citation></ref>
<ref id="ref-34"><label>[34]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>He</surname> <given-names>K</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>X</given-names></string-name>, <string-name><surname>Ren</surname> <given-names>S</given-names></string-name>, <string-name><surname>Sun</surname> <given-names>J</given-names></string-name></person-group>. <article-title>Deep residual learning for image recognition</article-title>. In: <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition; 2016 Jun 27&#x2013;30</conf-name>; <publisher-loc>Las Vegas, NV, USA</publisher-loc>. p. <fpage>770</fpage>&#x2013;<lpage>8</lpage>.</mixed-citation></ref>
<ref id="ref-35"><label>[35]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Li</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Lyu</surname> <given-names>X</given-names></string-name>, <string-name><surname>Koren</surname> <given-names>N</given-names></string-name>, <string-name><surname>Lyu</surname> <given-names>L</given-names></string-name>, <string-name><surname>Li</surname> <given-names>B</given-names></string-name>, <string-name><surname>Ma</surname> <given-names>X</given-names></string-name></person-group>. <article-title>Neural attention distillation: erasing backdoor triggers from deep neural networks</article-title>. <comment>arXiv:2101.05930. 2021</comment>.</mixed-citation></ref>
<ref id="ref-36"><label>[36]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Li</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Zhai</surname> <given-names>T</given-names></string-name>, <string-name><surname>Jiang</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Li</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Xia</surname> <given-names>ST</given-names></string-name></person-group>. <article-title>Backdoor attack in the physical world</article-title>. <comment>arXiv:2104.02361. 2021</comment>.</mixed-citation></ref>
</ref-list>
</back></article>