<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1 20151215//EN" "http://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xml:lang="en" article-type="research-article" dtd-version="1.1">
<front>
<journal-meta>
<journal-id journal-id-type="pmc">CMC</journal-id>
<journal-id journal-id-type="nlm-ta">CMC</journal-id>
<journal-id journal-id-type="publisher-id">CMC</journal-id>
<journal-title-group>
<journal-title>Computers, Materials &#x0026; Continua</journal-title>
</journal-title-group>
<issn pub-type="epub">1546-2226</issn>
<issn pub-type="ppub">1546-2218</issn>
<publisher>
<publisher-name>Tech Science Press</publisher-name>
<publisher-loc>USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">44506</article-id>
<article-id pub-id-type="doi">10.32604/cmc.2023.044506</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Article</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>An Intelligent Approach for Intrusion Detection in Industrial Control System</article-title>
<alt-title alt-title-type="left-running-head">An Intelligent Approach for Intrusion Detection in Industrial Control System</alt-title>
<alt-title alt-title-type="right-running-head">An Intelligent Approach for Intrusion Detection in Industrial Control System</alt-title>
</title-group>
<contrib-group>
<contrib id="author-1" contrib-type="author" corresp="yes">
<name name-style="western"><surname>Alkhalil</surname><given-names>Adel</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><email>a.alkalel@uoh.edu.sa</email></contrib>
<contrib id="author-2" contrib-type="author">
<name name-style="western"><surname>Aljaloud</surname><given-names>Abdulaziz</given-names></name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
<contrib id="author-3" contrib-type="author">
<name name-style="western"><surname>Uliyan</surname><given-names>Diaa</given-names></name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
<contrib id="author-4" contrib-type="author">
<name name-style="western"><surname>Altamimi</surname><given-names>Mohammed</given-names></name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
<contrib id="author-5" contrib-type="author">
<name name-style="western"><surname>Abdelrhman</surname><given-names>Magdy</given-names></name><xref ref-type="aff" rid="aff-2">2</xref><xref ref-type="aff" rid="aff-3">3</xref></contrib>
<contrib id="author-6" contrib-type="author">
<name name-style="western"><surname>Altameemi</surname><given-names>Yaser</given-names></name><xref ref-type="aff" rid="aff-4">4</xref></contrib>
<contrib id="author-7" contrib-type="author">
<name name-style="western"><surname>Ahmad</surname><given-names>Aakash</given-names></name><xref ref-type="aff" rid="aff-5">5</xref></contrib>
<contrib id="author-8" contrib-type="author">
<name name-style="western"><surname>Fouad Mansour</surname><given-names>Romany</given-names></name><xref ref-type="aff" rid="aff-6">6</xref></contrib>
<aff id="aff-1"><label>1</label><institution>Department of Information and Computer Science, College of Computer Science and Engineering, University of Ha&#x2019;il</institution>, <addr-line>Ha&#x2019;il, 81481</addr-line>, <country>Saudi Arabia</country></aff>
<aff id="aff-2"><label>2</label><institution>Applied College, University of Ha&#x2019;il</institution>, <addr-line>Ha&#x2019;il, 81481</addr-line>, <country>Saudi Arabia</country></aff>
<aff id="aff-3"><label>3</label><institution>College of Education, New Valley University</institution>, <addr-line>El-Kharga, 72511</addr-line>, <country>Egypt</country></aff>
<aff id="aff-4"><label>4</label><institution>College of Art, University of Ha&#x2019;il</institution>, <addr-line>Ha&#x2019;il, 81481</addr-line>, <country>Saudi Arabia</country></aff>
<aff id="aff-5"><label>5</label><institution>School of Computing and Communications, Lancaster University</institution>, <addr-line>Leipzig, 04109</addr-line>, <country>Germany</country></aff>
<aff id="aff-6"><label>6</label><institution>College of Science, New Valley University</institution>, <addr-line>El-Kharga, 72511</addr-line>, <country>Egypt</country></aff>
</contrib-group>
<author-notes>
<corresp id="cor1"><label>&#x002A;</label>Corresponding Author: Adel Alkhalil. Email: <email>a.alkalel@uoh.edu.sa</email></corresp>
</author-notes>
<pub-date date-type="collection" publication-format="electronic">
<year>2023</year></pub-date>
<pub-date date-type="pub" publication-format="electronic">
<day>29</day>
<month>11</month>
<year>2023</year></pub-date>
<volume>77</volume>
<issue>2</issue>
<fpage>2049</fpage>
<lpage>2078</lpage>
<history>
<date date-type="received">
<day>01</day>
<month>8</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>12</day>
<month>10</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>&#x00A9; 2023 Alkhalil et al.</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Alkhalil et al.</copyright-holder>
<license xlink:href="https://creativecommons.org/licenses/by/4.0/">
<license-p>This work is licensed under a <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="TSP_CMC_44506.pdf"></self-uri>
<abstract>
<p>Supervisory control and data acquisition (SCADA) systems are computer systems that gather and analyze real-time data, distributed control systems are specially designed automated control systems that consist of geographically distributed control elements, and other smaller control systems such as programmable logic controllers are industrial solid-state computers that monitor inputs and outputs and make logic-based decisions. In recent years, there has been a lot of focus on the security of industrial control systems. Due to the advancement in information technologies, the risk of cyberattacks on industrial control systems has drastically increased. Because they are so inextricably tied to human life, any damage to them might have devastating consequences. To provide an efficient solution to such problems, this paper proposes a new approach to intrusion detection. First, the important features in the dataset are determined by the difference between the distribution of unlabeled and positive data which is deployed for the learning process. Then, a prior estimation of the class is proposed based on a support vector machine. Simulation results show that the proposed approach has better anomaly detection performance than existing algorithms.</p>
</abstract>
<kwd-group kwd-group-type="author">
<kwd>Industrial control system</kwd>
<kwd>anomaly detection</kwd>
<kwd>intrusion detection</kwd>
<kwd>system protection</kwd>
</kwd-group>
<funding-group>
<award-group id="awg1">
<funding-source>Research Deanship at the University of Ha&#x2019;il -Saudi Arabia</funding-source>
<award-id>RG-20146</award-id>
</award-group>
</funding-group>
</article-meta>
</front>
<body>
<sec id="s1">
<label>1</label>
<title>Introduction</title>
<p>The Industrial Control System is a control system for industrial production, and is an important part of national infrastructure, widely used in key fields such as water conservancy, nuclear power, and energy, as the core control equipment of national infrastructure; its security is related to the national economy and people&#x2019;s livelihood [<xref ref-type="bibr" rid="ref-1">1</xref>].</p>
<p>With the fast growth of industrial control systems, which are now extensively employed, security problems are becoming more common. The &#x201C;Stuxnet&#x201D; virus outbreak in 2010 immediately caused substantial damage to the centrifuges of Iran&#x2019;s nuclear plants. After the Stuxnet virus spread, the industrial control system eventually became one of the primary targets of attackers [<xref ref-type="bibr" rid="ref-2">2</xref>]. The global WannaCry ransomware epidemic in 2017 made use of the high-risk vulnerability &#x201C;EternalBlue&#x201D; to spread globally, disrupting major businesses such as energy, transportation, and communications in many nations [<xref ref-type="bibr" rid="ref-3">3</xref>]. In March 2018, the United States Computer Emergency Readiness Team issued security warning TA18-074A, which detailed a cyber-attack on a power facility in the United States by Russian hackers. The goal of this attack was to gather intelligence and record pertinent information for the computer implantation programmed to attack, resulting in massive losses for the power plant [<xref ref-type="bibr" rid="ref-4">4</xref>]. In 2019, a network attack targeted the computer system control center of the Guri Hydropower Station, Venezuela&#x2019;s largest power plant, creating a statewide power outage and affecting around 30 million people. The Guri Hydropower Station in Venezuela was attacked again in July of the same year, resulting in widespread outages in 16 states, including Lagas [<xref ref-type="bibr" rid="ref-5">5</xref>]. Because industrial control systems are such an important aspect of national infrastructure, assaults on them frequently result in more significant consequences and bigger economic losses.</p>
<p>Given the security dangers to the industrial control system, using intrusion detection measures to defend is a critical step. Now, various elements of intrusion detection based on the industrial control system are being explored, and intelligent detection of infiltration of the industrial control system is accomplished by combining the machine-learning model. Among the different machine learning models, the one-class support vector machine (OCSVM) model requires just one sort of data on the training data, allowing it to detect unknown intrusions, and as a result, it has become a popular approach for intrusion detection in industrial control systems. Due to the lack of negative example training data, the trained model will have a high FPR (False Positive Rate), therefore this work provides the learning model for intrusion detection, trains the model using regular traffic as positive example label data, and retains the model for unknown intrusions. While enhancing the model&#x2019;s detection ability, the model&#x2019;s intrusion detection ability is enhanced. Because the suggested learning model employs both a class of labeled data and unlabeled data to be identified for model training, its classification performance is frequently superior to that of the anomaly detection model.</p>
<p>The main contributions of this paper can be summarized as follows:
<list list-type="bullet">
<list-item>
<p>Because the trained model will have a high FPR (False Positive Rate) due to a lack of negative example training data, this work supplies the learning model for intrusion detection, trains the model using ordinary traffic as positive example label data, and retains the model for unknown intrusions. The model&#x2019;s intrusion detection ability is improved while its detection ability is improved. Because the proposed learning model uses both labeled and unlabeled data to train the model, its classification performance is typically superior to that of the anomaly detection model.</p></list-item>
<list-item>
<p>This paper analyses the class prior probability estimation algorithm based on the positive label frequency, divides the reliable positive example set through the one-class SVM model, improves the calculation method of the positive label frequency, and reduces the error in the prior probability estimate.</p></list-item>
<list-item>
<p>Based on the concealment characteristics of industrial control system attacks, positive unlabeled learning is applied to the intrusion detection of industrial control systems, a neural network is built for learning, and the classification model is trained using only normal traffic as label data, and a public data set experiment is performed. Experiments confirm the model&#x2019;s efficacy.</p></list-item>
</list></p>
<p>This paper is structured as follows. <xref ref-type="sec" rid="s2">Section 2</xref> presents the research status of intrusion detection and positive unlabeled learning in industrial control systems. <xref ref-type="sec" rid="s3">Section 3</xref> is the main research content of this paper. <xref ref-type="sec" rid="s4">Section 4</xref> verifies the effectiveness of the proposed algorithm through experiments. <xref ref-type="sec" rid="s5">Section 5</xref> summarizes the article.</p>
<sec id="s1_1">
<label>1.1</label>
<title>Symbols and Notation</title>
<p><xref ref-type="table" rid="table-1">Table 1</xref> lists the symbols and corresponding descriptions.</p>
<table-wrap id="table-1">
<label>Table 1</label>
<caption>
<title>Symbols and description</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th>Symbol</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>ICS</td>
<td>Industrial control system</td>
</tr>
<tr>
<td>PLC</td>
<td>Programmable logic controller</td>
</tr>
<tr>
<td>SCADA</td>
<td>Supervisory control and data acquisition</td>
</tr>
<tr>
<td>IDS</td>
<td>Intrusion detection system</td>
</tr>
<tr>
<td>ERP</td>
<td>Enterprise resource planning</td>
</tr>
<tr>
<td>HMI</td>
<td>Human-machine interface</td>
</tr>
<tr>
<td>MES</td>
<td>Manufacturing execution system</td>
</tr>
<tr>
<td>MOMS</td>
<td>Manufacturing operations management system</td>
</tr>
<tr>
<td>SVM</td>
<td>Support vector machine</td>
</tr>
<tr>
<td>TCP</td>
<td>Transmission control protocol</td>
</tr>
<tr>
<td>IP</td>
<td>Internet protocol</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s2">
<label>2</label>
<title>Related Work</title>
<sec id="s2_1">
<label>2.1</label>
<title>Overview of Industrial Control System</title>
<p>From top to bottom, the industrial control network layer model is separated into five layers: enterprise resource layer, production management layer, process monitoring layer, field control layer, and field device layer. The requirements for real-time depend on the layer. As indicated in <xref ref-type="fig" rid="fig-1">Fig. 1</xref>, the enterprise resource layer primarily consists of the functional units of the ERP system that are utilized to offer decision-making operation methods for the employees of the enterprise decision-making layer.</p>
<fig id="fig-1">
<label>Figure 1</label>
<caption>
<title>Industrial control system architecture</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_44506-fig-1.tif"/>
</fig>
<p>The field device layer is the lowest level of industrial control and contains certain field devices such as sensors, monitors, and other execution equipment units that are used to perceive and run the production process.</p>
<p>Field devices are monitored and controlled using the process monitoring layer and the field control layer. SCADA and HMI are the primary components of the process monitoring layer. SCADA may monitor and operate on-site operational equipment to perform data acquisition, equipment control, measurement, parameter modification, and other operations. HMI stands for human-machine interface, and it is used to communicate information between the system and the user. The on-site control layer is mostly PLC, which communicates with the HMI, receives control orders and query requests and communicates with field devices, controlling them by delivering operation instructions.</p>
<p>The production management layer includes MES and MOMS, which are used to manage the production process, such as manufacturing data management, production scheduling management, etc.</p>
<p>The top layer is the enterprise resource layer, where the enterprise resource planning (ERP) system manages core business processes, such as production or product planning, material management, and financial conditions.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Features of Intrusion Detection in Industrial Control System</title>
<p>There are significant differences between the intrusion detection of industrial control systems and the intrusion detection of the Internet. Due to the particularity of the environment of industrial control systems, it has unique characteristics [<xref ref-type="bibr" rid="ref-6">6</xref>]:
<list list-type="bullet">
<list-item>
<p>High real-time performance. Industrial control systems are usually deployed in fields such as electric power and nuclear energy, and the systems have high real-time performance, so intrusion detection also requires high real-time performance.</p></list-item>
<list-item>
<p>The resources of industrial control equipment are limited. Industrial control systems contain a large number of sensors and actuators that perform specific operations. To reduce costs, their computing and storage resources are usually very limited.</p></list-item>
<list-item>
<p>The device is difficult to update and reboot. The industrial control system is closely connected with the physical world, and it is usually impossible to suspend work, otherwise, it will cause serious harm to the entire industrial control system, personnel, and the environment.</p></list-item>
</list></p>
<p>Based on the characteristics of the above industrial control system, higher requirements are put forward for the intrusion detection system:
<list list-type="bullet">
<list-item>
<p>Real-time. Industrial control systems have higher real-time requirements for intrusion detection, requiring intrusion detection systems to use real-time information from industrial control systems for intrusion detection.</p></list-item>
<list-item>
<p>Resources are limited. The limited resources of the industrial control system restrict the methods of intrusion detection and require the intrusion detection model to have low resource consumption. The time complexity of some algorithms based on deep learning is relatively high, especially the deep learning model, regardless of the training time, some deep neural network models have a very large number of complex network structure parameters, and the required training and prediction time is also longer. In the case of resources first, some complex deep neural network models are difficult to apply to intrusion detection of industrial control systems. Therefore, when applying the neural network model to the intrusion detection of industrial control systems, it is necessary to focus on the complexity of the model and make the neural network structure as simple as possible while ensuring accuracy.</p></list-item>
<list-item>
<p>The device is difficult to update and restart. This feature limits the performance of intrusion detection models. First of all, because it is difficult for the equipment to update the model, it needs to have good generalization performance, that is, the model trained on the training data also needs to have good performance when applied to the real data. The second is the requirement of indicators. Since the device cannot be restarted or suspended, it is necessary to have a high precision rate for intrusion detection, that is, it is better to miss than to falsely report.</p></list-item>
</list></p>
<p>The above are the characteristics of industrial control systems. When performing intrusion detection, it is usually necessary to analyze based on its traffic. The characteristics of industrial data are high dimensionality and strong correlation, which will increase the training time of the intrusion detection model. Therefore, it is necessary to perform feature extraction on the industrial data to reduce the complexity of subsequent data modeling and processing.</p>
<p>Based on the requirements of high precision rate and low resource consumption of industrial control systems, as well as the difficulty of obtaining data labels, this paper constructs a shallow neural network for PU learning, which is used for intrusion detection of industrial control systems. At the same time, given the high dimensionality and strong correlation of industrial control system data, a feature selection algorithm based on PU learning is proposed for data dimensionality reduction.</p>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Literature of Intrusion Detection Methods in Industrial Control Systems</title>
<p>Industrial control system intrusion detection can be divided into traffic-based detection, device state-based detection, and protocol-based detection. In terms of traffic, construct features through the real traffic of the industrial control system, such as flow duration, port, and other information, and then combine some machine learning models for detection, such as one-class support vector machine (SVM) [<xref ref-type="bibr" rid="ref-7">7</xref>]. In terms of equipment status, reference [<xref ref-type="bibr" rid="ref-8">8</xref>] proposed an intrusion detection method based on the CUSUM algorithm. In this method, the difference between the actual value obtained by the sensor and the predicted value of the model is used as the statistical sequence, and the offset is designed according to the <inline-formula id="ieqn-1"><mml:math id="mml-ieqn-1"><mml:mn>3</mml:mn><mml:mi>&#x03C3;</mml:mi></mml:math></inline-formula> principle. The constant determines the threshold, and finally, the method is verified in experiments to effectively detect deviation attacks and geometric attacks. In terms of protocols, some industrial control protocols are open, and detection rules can be formulated according to the specifications of these protocols to detect specific industrial control protocols, such as the Modbus protocol [<xref ref-type="bibr" rid="ref-8">8</xref>,<xref ref-type="bibr" rid="ref-9">9</xref>].</p>
<p>With the rapid development of machine learning and artificial intelligence, its influence gradually radiates to the field of intrusion detection. A large number of machine learning models are used for intrusion detection. Different applicable machine learning algorithms can be divided into traditional classification models and clustering models [<xref ref-type="bibr" rid="ref-10">10</xref>,<xref ref-type="bibr" rid="ref-11">11</xref>], ensemble models, anomaly detection models, and neural networks. Due to the rapid development of neural networks and better classification performance than traditional machine learning models, intrusion detection based on traditional classification models is gradually cooling down. The integrated model and the anomaly detection model have their characteristics. The integrated model has better classification performance by integrating multiple base classifiers, and it is like a random forest [<xref ref-type="bibr" rid="ref-12">12</xref>]. The advantages of anomaly detection such as OCSVM are: 1) It can detect unknown intrusions; 2) Only background traffic is required as training data. With the deepening of research, neural networks such as autoencoders are used for unsupervised anomaly detection [<xref ref-type="bibr" rid="ref-13">13</xref>].</p>
<p>The most commonly used anomaly detection algorithm for intrusion detection is OCSVM. Reference [<xref ref-type="bibr" rid="ref-14">14</xref>] investigated the application of a one-class SVM algorithm in the intrusion detection of industrial control systems. On the network layer and the transport layer, the OCSVM algorithm is used for TCP/IP traffic anomaly detection of the SCADA system. On the application layer, the OCSVM model is trained based on the normal communication flow of ModbusTCP for intrusion detection. At the same time, the paper also pointed out that there are three main problems in OCSVM anomaly detection for industrial control systems: feature construction, parameter optimization, and a high false positive rate.</p>
<p>Dynamic control center architectures are vulnerable to a variety of potentially active and passive cyber-attacks, as already explored by various industrial control protocols such as IEEE C37.118, IEC 61850, DNP3, IEC-104c, which put a variety of power system assets, such as RTUs, PMUs, protection systems, or relays, as well as control room servers, in danger. MITM attacks, data spoofings (such as inserting fake commands to trip lines or manipulating PMU measurement information), eavesdropping, or reconnaissance assaults are examples of common active or passive attack types. Intrusion detection systems (IDSs) enable the detection of unlawful activities or occurrences in ICT systems and reduce cyberattacks on vital infrastructures as a common defensive strategy. To identify cyber-attacks occurring during the PMU data transmission based on the IEEE C37.118 protocol, specification-based NIDS with a variety of stateful or stateless deep packet inspections are provided.</p>
<p>Compared with classic anomaly detection models such as one-class SVM, the deep learning model has improved the detection rate, but it takes longer to train the model.</p>
<p><xref ref-type="table" rid="table-2">Table 2</xref> summarizes the work related to intrusion detection of industrial control systems based on machine learning in recent years. From the analysis of related work, the research on intrusion detection of industrial control systems has the following trends:</p>
<p><list list-type="bullet">
<list-item>
<p>Tend to anomaly detection. The intrusion detection of industrial control systems is more often treated as an anomaly detection problem. In terms of model selection, a classification model such as one-class SVM or an unsupervised model such as AE is preferred for identification [<xref ref-type="bibr" rid="ref-15">15</xref>].</p></list-item>
<list-item>
<p>Tend to high precision. In recent years of research work, some researchers tend to optimize model parameters through some parameter optimization algorithms such as Particle Swarm Optimization (PSO) and Gravitational Search Algorithm (GSA), so that the model has better classification performance.</p></list-item>
<list-item>
<p>Tend to be real-time and efficient. Due to limited resources, the industrial control system requires the model to have a small calculation cost. From the perspective of related work, the intrusion detection of industrial control systems pays more attention to the model with low calculation consumption. At the same time, most of the models are trained through feature selection or feature extraction methods, such as principal component analysis (PCA) and Fisher score for dimensionality reduction, thereby reducing the time and computation required for model training. The long short-term memory network (LSTM) is also compared.</p></list-item>
</list></p>
<table-wrap id="table-2">
<label>Table 2</label>
<caption>
<title>Summary of various state-of-the-art methods</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th>Ref.</th>
<th>Method</th>
<th>Advantage</th>
<th>Disadvantage</th>
</tr>
</thead>
<tbody>
<tr>
<td>[<xref ref-type="bibr" rid="ref-8">8</xref>]</td>
<td>One-class SVM</td>
<td>Can detect unknown attacks, the training process has strong robustness to noise, good real-time performance, and online detection</td>
<td>The class of the exception is not recognized</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-11">11</xref>]</td>
<td>Time-series differential clustering</td>
<td>Can be executed under distributed system</td>
<td>The k-means algorithm has limitations</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-13">13</xref>]</td>
<td>AE-IDS</td>
<td>Dimensionality reduction through random forest, and then anomaly detection through autoencoder, most of the data sets have better classification results in the experiment</td>
<td>The classification performance is poor on some data sets, and the generalization performance of the model is insufficient</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-15">15</xref>]</td>
<td>PCA-OCSVM</td>
<td>Reduce the training time of OCSVM through PCA dimensionality reduction</td>
<td>The class of the exception is not recognized</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-16">16</xref>]</td>
<td>LSTM &#x002B; CUSUM</td>
<td>Low false alarm rate</td>
<td>Identify exceptions only in the P1 process</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-17">17</xref>]</td>
<td>LSTM</td>
<td>Multi-classification, you can find the specific category of intrusion, and the classification accuracy rate reaches 98.30%</td>
<td>The detection accuracy for some attack categories needs to be improved</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-18">18</xref>]</td>
<td>DBN &#x002B; SVM</td>
<td>High detection accuracy</td>
<td>The classification performance of SVM is poor, and the training time of DBN is long</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-19">19</xref>]</td>
<td>1D-CNN, LSTM</td>
<td>Low false alarm rate</td>
<td>LSTM training time is long</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-20">20</xref>]</td>
<td>LSTM, AE</td>
<td>Strong scalability and high detection rate</td>
<td>Long training time</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-21">21</xref>]</td>
<td>CNN, BiLSTM</td>
<td>CNN can extract local features of data, LSTM can obtain data context information, and the classification accuracy of the model can reach up to 99.21%</td>
<td>The time complexity of model training is high</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-22">22</xref>]</td>
<td>AMPSO &#x002B; SVM, K-means&#x002B;&#x002B;</td>
<td>Classify the strong classes through the SVM model, and perform K-means&#x002B;&#x002B; clustering on the weak classes. The detection accuracy of each attack type is relatively high</td>
<td>Can only detect known types of attacks</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-23">23</xref>]</td>
<td>SVM &#x002B; Random forest</td>
<td>Low time complexity</td>
<td>Low precision</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-24">24</xref>]</td>
<td>Hybrid-Multilevel IDS</td>
<td>The model is 97% accurate and can detect zero-day attacks</td>
<td>Low recall</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-25">25</xref>]</td>
<td>1D-CNN, AE</td>
<td>The F1-score indicator is high</td>
<td>Unstable</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-26">26</xref>]</td>
<td>MAD-GAN</td>
<td>High precision, recall, and F1-score indicators</td>
<td>Higher FPR</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-27">27</xref>]</td>
<td>EPCA-HG-CNN</td>
<td>The amount of calculation is reduced by EPCA dimensionality reduction, and then one-dimensional convolution is used for classification, the model precision rate reaches 98.02%, and the recall rate reaches 98.39%</td>
<td>None</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-28">28</xref>]</td>
<td>SVPSO &#x002B; SVM</td>
<td>Solve the problem that the PSO algorithm is easy to fall into the local optimum in the later stage of the search. After parameter optimization, the model detection accuracy is 98.75%, and the false alarm rate is 1.22%.</td>
<td>Can only detect known types of attacks</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-29">29</xref>]</td>
<td>IGSA &#x002B; TWSVM</td>
<td>Improved the gravity search algorithm, increased its convergence speed, the model detection accuracy reached 98.2%, and the false positive<break/> rate was only 45%</td>
<td>Can only detect known types of attacks</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-30">30</xref>]</td>
<td>HAQPSO &#x002B; ELM</td>
<td>The input weight and hidden layer nodes are optimized by the HAQPSO algorithm, the model accuracy rate reaches 98.6%, and the recall rate reaches 97.86%</td>
<td>The detection accuracy of some attacks is not high</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s2_4">
<label>2.4</label>
<title>Positive Unlabeled Learning</title>
<p>Positive unlabeled learning is a neural network-based anomaly detection approach that estimates the binary classification error using positive and unlabeled data sets, allowing the positive unlabeled learning model to attain classification performance similar to the binary classification model. Because positive unlabeled learning requires training the model with both positive and unlabeled data sets, the mixing ratio of positive and negative samples in the unlabeled data set must first be estimated before the data set is applied to positive unlabeled learning [<xref ref-type="bibr" rid="ref-31">31</xref>,<xref ref-type="bibr" rid="ref-32">32</xref>]; this step is also known as class prior estimation. The main method of class prior probability estimation is to start from the distributions of the positive and unlabeled data sets. Because the distribution of the unlabeled data set is a combination of the positive data distribution and the negative data distribution, the class prior probability can be obtained by comparing the distributions of the positive and unlabeled data sets [<xref ref-type="bibr" rid="ref-33">33</xref>&#x2013;<xref ref-type="bibr" rid="ref-35">35</xref>]. In addition, the class prior probability estimation algorithm based on the positive label frequency is one of the most advanced algorithms at present. Reference [<xref ref-type="bibr" rid="ref-36">36</xref>] proposed the TIcE algorithm, which separates out reliable positive examples in the unlabeled data set to estimate the frequency of positive labels, and which is currently the algorithm with the lowest time complexity.</p>
<p>Reference [<xref ref-type="bibr" rid="ref-37">37</xref>] first theoretically analyzed the positive unlabeled learning problem, compared positive unlabeled learning with the binary classification model, and estimated the loss of the binary classification sample under the condition of known class prior probability <inline-formula id="ieqn-2"><mml:math id="mml-ieqn-2"><mml:mi>&#x03C0;</mml:mi></mml:math></inline-formula>; the resulting model can theoretically obtain the same decision surface as the binary classification model and is called uPU (unbiased Positive-unlabeled learning). Because the loss function of the uPU model needs to satisfy the symmetric condition, reference [<xref ref-type="bibr" rid="ref-38">38</xref>] continued to carry out research, gave a method of applying the loss function that does not satisfy the symmetric condition to the uPU, and verified that the non-convex loss function and the convex loss function have similar precision.</p>
<p>Reference [<xref ref-type="bibr" rid="ref-39">39</xref>] further compared the positive unlabeled learning model with the binary classification model and analyzed the reasons why the positive unlabeled learning model performed better than the binary classification model in some cases.</p>
<p>Reference [<xref ref-type="bibr" rid="ref-40">40</xref>] proposed the nnPU (Positive-unlabeled learning with Non-negative risk estimator) algorithm to solve the problem that uPU is prone to overfitting. Based on uPU, it changes the way the binary classification loss is estimated: it ensures that the estimated negative example loss is never negative, thereby avoiding the problem caused by the estimated loss being negative, and points out that the performance of nnPU is better than that of uPU. Finally, reference [<xref ref-type="bibr" rid="ref-41">41</xref>] summarized the existing positive unlabeled learning and analyzed the seven main problems of positive unlabeled learning in the article, including the assumptions of positive unlabeled learning, evaluation indicators, main models, and class priors.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Proposed Intrusion Detection Learning Mechanism</title>
<p>The problem of intrusion detection in industrial control systems has received the attention of scholars as an anomaly detection problem, but some classic anomaly detection algorithms such as the one-class SVM algorithm have a high false positive rate, and the classification performance has a large gap compared with the binary classification model. This paper proposes to use positive unlabeled learning for intrusion detection. This method has been proved to have classification performance close to binary classification, and at the same time, it only needs one type of label data on the training data like the one-class SVM model.</p>
<p>The intrusion detection process based on positive unlabeled learning is shown in <xref ref-type="fig" rid="fig-2">Fig. 2</xref>. In feature engineering, it is necessary to analyze features through positive label data and unlabeled data, select key features, reduce data dimensions, and reduce the impact of irrelevant features on model classification performance. At the same time, the class prior probability of positive unlabeled learning is used as prior knowledge, which needs to be estimated in parallel with feature engineering. By analyzing positive data and unlabeled data, a model is built to estimate the class prior probability of unlabeled data sets. Then the positive label data, unlabeled data, and class prior probability after feature selection are combined to train the positive unlabeled learning model, and finally the model and the classification labels of the unlabeled data set are output.</p>
<fig id="fig-2">
<label>Figure 2</label>
<caption>
<title>Proposed algorithm flowchart</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_44506-fig-2.tif"/>
</fig>
<p>Based on the above process, the main research content of this part is divided into three parts: First, explore a feature selection algorithm based on positive unlabeled learning, and analyze the importance of features based on positive label data and unlabeled data. Secondly, research class prior probability estimation algorithm, improve the accuracy of class prior probability estimation and provide important prior knowledge for positive unlabeled learning. Finally, based on the data after feature selection and the estimated class prior probability, the classification model is trained by positive unlabeled learning.</p>
<p>In this paper, the problems of anomaly detection are answered in a targeted manner:
<list list-type="bullet">
<list-item>
<p>In terms of feature engineering, this paper studies the calculation method of feature importance based on positive unlabeled learning, which can be used as a feature selection metric for feature selection of industrial control system data;</p></list-item>
<list-item>
<p>In terms of resource constraints and real-time issues in industrial control systems, this paper chooses a shallow neural network, which requires less storage resources and computing resources, which meets the needs of industrial control systems;</p></list-item>
<list-item>
<p>In terms of false alarm rate, positive unlabeled learning has been shown to perform similarly to the binary classification model and to have a higher accuracy rate than the unsupervised anomaly detection model.</p></list-item>
</list></p>
<sec id="s3_1">
<label>3.1</label>
<title>Feature Importance</title>
<p>In industrial control systems, data has the characteristics of high dimensionality and strong correlation. Many machine learning problems become difficult when the data dimensionality is high, a phenomenon known as the curse of dimensionality. Feature selection is an important part of feature engineering. Its principle is to extract key features from all features, to achieve the purpose of dimensionality reduction. Feature selection methods can be divided into two categories: encapsulation and filtering. Among them, the encapsulation feature selection usually selects a base model for multiple rounds of training and gradually screens out redundant features according to the classification performance of the trained model. Filtering feature selection is to calculate the importance of features, set a threshold to filter out irrelevant features, and further filter out redundant features through correlation.</p>
<p>In positive unlabeled learning, since there is only one class of labeled samples, it is difficult to evaluate the classification performance of the base model that encapsulation feature selection relies on. Therefore, the filtering feature selection method is used in this paper. The commonly used feature importance calculation methods are shown in <xref ref-type="table" rid="table-3">Table 3</xref>.</p>
<table-wrap id="table-3">
<label>Table 3</label>
<caption>
<title>Feature determination methods</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th>Feature</th>
<th>Expression</th>
</tr>
</thead>
<tbody>
<tr>
<td>Correlation coefficient</td>
<td><inline-formula id="ieqn-3"><mml:math id="mml-ieqn-3"><mml:mi>r</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mfrac><mml:mrow><mml:mrow><mml:mtext>Cov</mml:mtext></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:msqrt><mml:mrow><mml:mtext>Var</mml:mtext></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>x</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mtext>Var</mml:mtext></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>y</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:msqrt></mml:mfrac></mml:mstyle></mml:math></inline-formula></td>
</tr>
<tr>
<td>Mutual information/information gain</td>
<td><inline-formula id="ieqn-4"><mml:math id="mml-ieqn-4"><mml:mi>I</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>X</mml:mi><mml:mo>;</mml:mo><mml:mi>Y</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:munder><mml:mrow><mml:mo movablelimits="false">&#x2211;</mml:mo></mml:mrow><mml:mrow><mml:mi>x</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi>X</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi>Y</mml:mi></mml:mrow></mml:munder><mml:mo>&#x2061;</mml:mo><mml:mi>p</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mfrac><mml:mrow><mml:mi>p</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>p</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>x</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mi>p</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>y</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mstyle><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula></td>
</tr>
<tr>
<td>Symmetric uncertainty</td>
<td><inline-formula id="ieqn-5"><mml:math id="mml-ieqn-5"><mml:mrow><mml:mtext>SU</mml:mtext></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>X</mml:mi><mml:mo>,</mml:mo><mml:mi>Y</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mfrac><mml:mrow><mml:mn>2</mml:mn><mml:mi>I</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>X</mml:mi><mml:mo>;</mml:mo><mml:mi>Y</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>H</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>X</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mi>H</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>Y</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mstyle></mml:math></inline-formula></td>
</tr>
<tr>
<td>Information distance</td>
<td><inline-formula id="ieqn-6"><mml:math id="mml-ieqn-6"><mml:mrow><mml:mtext>d</mml:mtext></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>X</mml:mi><mml:mo>,</mml:mo><mml:mi>Y</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mfrac><mml:mrow><mml:mi>H</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>X</mml:mi><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mi>Y</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mi>H</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>Y</mml:mi><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mi>X</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mn>2</mml:mn></mml:mfrac></mml:mstyle></mml:math></inline-formula></td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The importance calculation method of the filtering feature selection method is calculated by evaluating the correlation between features and labels, and it is considered that the features that have an obvious correlation with the target category are the key features. However, in positive unlabeled learning, there is only one class of labeled samples, and the feature importance calculation method in the binary classification model cannot be directly used. Therefore, it is necessary to find a feature importance calculation method suitable for positive unlabeled learning scenarios.</p>
<p>Inspired by the importance calculation idea of this binary classification, this paper presents a key feature identification method for PU learning: Considering that the unlabeled data set is a mixture of positive samples and negative samples, the attribute value of the feature in the unlabeled data set includes two parts: the positive value and the negative value. If the feature is strongly related to the class label, then the unlabeled data distribution of attribute values of this feature should show obvious bimodal or multimodal features and the distribution of different class sample features is quite different, as shown in <xref ref-type="fig" rid="fig-3">Fig. 3</xref>. When the feature is weakly correlated with the class label, the feature distribution of the positive samples is similar to that of the negative samples.</p>
<fig id="fig-3">
<label>Figure 3</label>
<caption>
<title>Comparison of feature relation of data correlation</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_44506-fig-3.tif"/>
</fig>
<p>The distribution difference of the feature on the positive data set and the unlabeled data set can be used as the importance of the feature. The Kullback&#x2013;Leibler (KL) divergence can describe the difference between two distributions, and its discrete form is shown in <xref ref-type="disp-formula" rid="eqn-1">formula (1)</xref>.
<disp-formula id="eqn-1"><label>(1)</label><mml:math id="mml-eqn-1" display="block"><mml:mrow><mml:mtext>KL</mml:mtext></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mi>U</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mo>&#x2211;</mml:mo><mml:mi>P</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>x</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mfrac><mml:mrow><mml:mi>P</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mi>U</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mfrac><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>The KL divergence requires the probability of a feature attribute value when calculating the difference between two feature distributions. First of all, considering that the value range of the attribute value of the feature is not limited, it is necessary to standardize the maximum and minimum values before the calculation, to limit the attribute value after normalization to the [0,1] interval. Secondly, there are two forms of continuous and discrete attribute values of features. To deal with them uniformly, the [0,1] interval is equally divided in the algorithm, and the KL divergence is calculated by using the frequency of samples in each small area as a probability. The specific steps are shown in Algorithm 1.</p>
<fig id="fig-14">
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_44506-fig-14.tif"/>
</fig>
<p><bold>Time complexity analysis:</bold> In the third step of the algorithm, data standardization is carried out, and the data standardization method adopted is the standardization of maximum and minimum values, and the time complexity of this step is <inline-formula id="ieqn-18"><mml:math id="mml-ieqn-18"><mml:mi>O</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>m</mml:mi><mml:mi>n</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula>. Steps 4 to 6 are to calculate the feature&#x2019;s importance. By dividing the [0,1] interval into equal parts, the KL divergence is calculated with the frequency in each small interval as the probability. The time complexity of this part is <inline-formula id="ieqn-19"><mml:math id="mml-ieqn-19"><mml:mi>O</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>m</mml:mi><mml:mi>n</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula>. So the total time complexity of the algorithm is <inline-formula id="ieqn-20"><mml:math id="mml-ieqn-20"><mml:mi>O</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>m</mml:mi><mml:mi>n</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula>.</p>
<p>Through KL divergence, the estimated value of feature importance can be given in the scene with only positive label data, and key features can be distinguished from irrelevant features. In the case of redundant features, features can be filtered based on feature importance, such as setting feature importance thresholds or specifying the number of selected features.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Class Prior Probability Estimation for PU Learning</title>
<p>In the industrial control system, it is very difficult to collect a large amount of intrusion data, but the collection of traffic and status codes in the normal operation of the system is relatively simple. Taking the data in the normal state as the positive label data for positive unlabeled learning is in line with the actual situation of the industrial control system. In positive unlabeled learning, it is very important to analyze the data to be detected and obtain the class prior probability. The class prior probability of positive unlabeled learning is defined as <inline-formula id="ieqn-21"><mml:math id="mml-ieqn-21"><mml:mi>&#x03C0;</mml:mi><mml:mo>=</mml:mo><mml:mi>p</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>y</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>. When the collection of samples satisfies the SCAR (selected completely at random) assumption, the class prior probability is the proportion of positive samples in the unlabeled data set.</p>
<p><bold>Definition 1 (SCAR assumption):</bold> The collection of samples has nothing to do with the attributes of the samples and is completely random, namely:
<disp-formula id="eqn-2"><label>(2)</label><mml:math id="mml-eqn-2" display="block"><mml:mi>p</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mi>y</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>p</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula></p>
<p>According to the different sources of positive data, data collection can be divided into two categories: One Sample (OS) and Two Samples (TS). In the OS scenario, random sampling is performed only once: a part of the real data is randomly collected, some positive data are identified in the collected data and labeled, and the remaining data are treated as unlabeled data. In the TS scenario, sampling is performed twice: a part of the positive label data is first randomly collected, and the unlabeled data set is then obtained by random sampling from the real data.</p>
<p>Since the positive data is randomly selected in the unlabeled data set, an intermediate variable <inline-formula id="ieqn-22"><mml:math id="mml-ieqn-22"><mml:mi>c</mml:mi></mml:math></inline-formula> is generated in this scenario, which is called the positive label frequency (label frequency) and is defined as <inline-formula id="ieqn-23"><mml:math id="mml-ieqn-23"><mml:mi>c</mml:mi><mml:mo>=</mml:mo><mml:mi>p</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mi>y</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>, where <inline-formula id="ieqn-24"><mml:math id="mml-ieqn-24"><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:math></inline-formula> indicates that the sample is selected to be labeled. The relationship between label frequency and class prior probability can be expressed by <xref ref-type="disp-formula" rid="eqn-3">Eq. (3)</xref>.
<disp-formula id="eqn-3"><label>(3)</label><mml:math id="mml-eqn-3" display="block"><mml:mi>p</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>y</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mi>x</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>c</mml:mi></mml:mfrac><mml:mi>p</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula></p>
<p>Therefore, the class prior probability can be estimated by estimating the positive label frequency <inline-formula id="ieqn-25"><mml:math id="mml-ieqn-25"><mml:mi>c</mml:mi></mml:math></inline-formula>. In particular, in the TS scenario, the positive data and unlabeled data can be mixed, the positive samples can be regarded as randomly selected and labeled positive samples, and the frequency of positive labels can also be estimated.</p>
<p>The TIcE algorithm uses a decision tree to obtain a lower bound of the positive label frequency and takes it as the estimated value of the positive label frequency. In this paper, the one-class SVM algorithm is used to improve the TIcE algorithm: the one-class SVM algorithm is used to divide the reliable positive example set, and the positive label frequency is then estimated.</p>
<p>The one-class SVM algorithm is a classic anomaly detection algorithm. When it uses the RBF kernel function, its performance is similar to that of support vector data description (SVDD). It can be considered that the one-class SVM algorithm finds a hypersphere in the feature space, contains the positive samples in the hypersphere, and makes the radius of the hypersphere the smallest. Its problem description is shown in <xref ref-type="disp-formula" rid="eqn-4">formula (4)</xref>.
<disp-formula id="eqn-4"><label>(4)</label><mml:math id="mml-eqn-4" display="block"><mml:mrow><mml:mo>{</mml:mo><mml:mtable columnalign="left left" rowspacing=".2em" columnspacing="1em" displaystyle="false"><mml:mtr><mml:mtd><mml:mo movablelimits="true" form="prefix">min</mml:mo><mml:mo>&#x003A;</mml:mo><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mfrac><mml:mn>1</mml:mn><mml:mn>2</mml:mn></mml:mfrac></mml:mstyle><mml:msup><mml:mi>w</mml:mi><mml:mrow><mml:mrow><mml:mtext>T</mml:mtext></mml:mrow></mml:mrow></mml:msup><mml:mi>w</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mi>&#x03C1;</mml:mi><mml:mo>+</mml:mo><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:mi>v</mml:mi><mml:mi>N</mml:mi></mml:mrow></mml:mfrac></mml:mstyle><mml:msubsup><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:msubsup><mml:msub><mml:mi>&#x03BE;</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:mtext>s</mml:mtext></mml:mrow><mml:mo>.</mml:mo><mml:mrow><mml:mtext>t</mml:mtext></mml:mrow><mml:mo>.</mml:mo><mml:mi>w</mml:mi><mml:mi>x</mml:mi><mml:mo>&#x2265;</mml:mo><mml:mi>&#x03C1;</mml:mi><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>&#x03BE;</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x03BE;</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2265;</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mrow><mml:mo>|</mml:mo><mml:mi>w</mml:mi><mml:mo>|</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mtd></mml:mtr></mml:mtable><mml:mo fence="true" stretchy="true" symmetric="true"></mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>The <inline-formula id="ieqn-26"><mml:math id="mml-ieqn-26"><mml:mi>v</mml:mi></mml:math></inline-formula> in the formula is the upper bound of the proportion of outliers, so the number of samples classified as outliers in the positive data set can be limited by setting this parameter, which avoids the estimation deviation caused by too few positive samples in the set divided by the model. At the same time, when the upper bound of the proportion of abnormal points is set larger, the radius of the hypersphere is smaller, and the samples classified as positive by the model can be used as reliable positive samples.</p>
<p>On the estimation of positive label frequency, the estimated value can be given by Chebyshev&#x2019;s inequality. Through Chebyshev&#x2019;s inequality, the number <inline-formula id="ieqn-27"><mml:math id="mml-ieqn-27"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> of labeled samples in the positive example set <inline-formula id="ieqn-28"><mml:math id="mml-ieqn-28"><mml:mi>S</mml:mi></mml:math></inline-formula> satisfies the <xref ref-type="disp-formula" rid="eqn-5">formula (5)</xref>.
<disp-formula id="eqn-5"><label>(5)</label><mml:math id="mml-eqn-5" display="block"><mml:mi>P</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mo>|</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mi>&#x03BC;</mml:mi><mml:mo>|</mml:mo></mml:mrow><mml:mo>&#x2265;</mml:mo><mml:mi>&#x03B5;</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2264;</mml:mo><mml:mfrac><mml:msup><mml:mi>&#x03C3;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:msup><mml:mi>&#x03B5;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mfrac></mml:math></disp-formula>where <inline-formula id="ieqn-29"><mml:math id="mml-ieqn-29"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> obeys the binomial distribution, and the expectation of the random variable <inline-formula id="ieqn-30"><mml:math id="mml-ieqn-30"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is <inline-formula id="ieqn-31"><mml:math id="mml-ieqn-31"><mml:mi>E</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>L</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>c</mml:mi><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, the variance is <inline-formula id="ieqn-32"><mml:math id="mml-ieqn-32"><mml:mi>D</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>L</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>c</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>c</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, and <inline-formula id="ieqn-33"><mml:math id="mml-ieqn-33"><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the total number of samples in the positive example set 
<inline-formula id="ieqn-34"><mml:math id="mml-ieqn-34"><mml:mi>S</mml:mi></mml:math></inline-formula>. Substitute into <xref ref-type="disp-formula" rid="eqn-5">formula (5)</xref> to get:
<disp-formula id="eqn-6"><label>(6)</label><mml:math id="mml-eqn-6" display="block"><mml:mi>P</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mo>|</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mi>c</mml:mi><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo></mml:mrow><mml:mo>&#x2265;</mml:mo><mml:mi>&#x03B5;</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2264;</mml:mo><mml:mfrac><mml:mrow><mml:mi>c</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>c</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:msup><mml:mi>&#x03B5;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mfrac></mml:math></disp-formula></p>
<p>Let <inline-formula id="ieqn-35"><mml:math id="mml-ieqn-35"><mml:mi>&#x03B4;</mml:mi><mml:mo>=</mml:mo><mml:mi>c</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>c</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:msup><mml:mi>&#x03B5;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>, then <xref ref-type="disp-formula" rid="eqn-6">formula (6)</xref> is equivalent to:
<disp-formula id="eqn-7"><label>(7)</label><mml:math id="mml-eqn-7" display="block"><mml:mi>P</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mo>|</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mi>c</mml:mi><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo></mml:mrow><mml:mo>&#x2265;</mml:mo><mml:msqrt><mml:mfrac><mml:mrow><mml:mi>c</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>c</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mi>&#x03B4;</mml:mi></mml:mfrac></mml:msqrt><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2264;</mml:mo><mml:mi>&#x03B4;</mml:mi></mml:math></disp-formula></p>
<p>Through <xref ref-type="disp-formula" rid="eqn-7">formula (7)</xref>, the upper and lower bounds of the positive label frequency <inline-formula id="ieqn-36"><mml:math id="mml-ieqn-36"><mml:mi>c</mml:mi></mml:math></inline-formula> can be constrained with probability <inline-formula id="ieqn-37"><mml:math id="mml-ieqn-37"><mml:mi>&#x03B4;</mml:mi></mml:math></inline-formula>, as shown in <xref ref-type="disp-formula" rid="eqn-8">formula (8)</xref>.
<disp-formula id="eqn-8"><label>(8)</label><mml:math id="mml-eqn-8" display="block"><mml:mrow><mml:mo>{</mml:mo><mml:mtable columnalign="left left" rowspacing=".2em" columnspacing="1em" displaystyle="false"><mml:mtr><mml:mtd><mml:mi>P</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>c</mml:mi><mml:mo>&#x2264;</mml:mo><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mfrac><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:mfrac></mml:mstyle><mml:mo>&#x2212;</mml:mo><mml:msqrt><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mfrac><mml:mrow><mml:mi>c</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>c</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>&#x03B4;</mml:mi><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mstyle></mml:msqrt><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2264;</mml:mo><mml:mi>&#x03B4;</mml:mi></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mi>P</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>c</mml:mi><mml:mo>&#x2265;</mml:mo><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mfrac><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:mfrac></mml:mstyle><mml:mo>+</mml:mo><mml:msqrt><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mfrac><mml:mrow><mml:mi>c</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>c</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>&#x03B4;</mml:mi><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mstyle></mml:msqrt><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2264;</mml:mo><mml:mi>&#x03B4;</mml:mi></mml:mtd></mml:mtr></mml:mtable><mml:mo fence="true" stretchy="true" symmetric="true"></mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>In the TIcE algorithm, the algorithm for finding reliable positive examples is a decision tree. As the decision tree is divided, the number of samples in each leaf node decreases, and there will be some leaf nodes that deviate from the real sample mixing ratio. The lower bound of the experimental probability estimate is constrained. However, by dividing the reliable positive example set with the one-class SVM algorithm, the sample number of the positive example set can be constrained, so the midpoint of the interval can be taken as the estimated value of the positive example label frequency <inline-formula id="ieqn-38"><mml:math id="mml-ieqn-38"><mml:mi>c</mml:mi></mml:math></inline-formula>, and then the class prior probability can be calculated. This method is called the one-class SVM class prior probability estimation algorithm (one-class SVM-cE, also written OCSVM-cE).</p>
<p>Compared with the TIcE algorithm, the one-class SVM first converts the algorithm for finding positive examples from decision trees to one-class SVM. On the one hand, this can limit the number of samples of reliable positive examples through the parameters of the one-class SVM model, and avoid the problem caused by reliable positive examples. On the other hand, the data used in the training model is optimized. TIcE needs to use both the positive data set and the unlabeled data set when constructing the decision tree.</p>
<p>When the TIcE algorithm estimates the class prior probability, it needs to repeatedly construct decision trees based on different unlabeled data sets, which is expensive in practical applications. However, the one-class SVM-cE algorithm only needs positive data sets when building the model, and the trained model can be used in different unlabeled data sets, so after the model is trained, the time complexity of the OCSVM-cE algorithm is reduced to <inline-formula id="ieqn-39"><mml:math id="mml-ieqn-39"><mml:mi>O</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>n</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>. The specific steps are shown in Algorithm 2.</p>
<fig id="fig-15">
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_44506-fig-15.tif"/>
</fig>
<p>Algorithm 2 can analyze the industrial control data to be detected, estimate its class prior probability, provide important prior knowledge for positive unlabeled learning, and avoid collecting industrial control system intrusion detection data, greatly reducing labor costs.</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Neural Network in Positive Unlabeled Learning</title>
<p>In industrial control systems, intrusions are highly concealed and evolve quickly. From &#x201C;Stuxnet&#x201D; to &#x201C;Duqu&#x201D;, and then to the &#x201C;Flame&#x201D; virus, traditional classification-based intrusion detection technology has had difficulty keeping up with such updates, so intrusion detection is instead treated as anomaly detection. Although this approach cannot identify the type of intrusion, it is able to raise a warning in the face of unknown intrusions. In this paper, the positive unlabeled learning method is used for intrusion detection, and the normal traffic is used as the labeled data, which participates in the training of the model together with the data to be detected. The positive unlabeled learning approach, like anomaly detection algorithms, can detect unknown attacks, and it has been demonstrated that the trained model has an accuracy comparable to the binary classification model.</p>
<sec id="s3_3_1">
<label>3.3.1</label>
<title>Positive Unlabeled Learning under Data Imbalance</title>
<p>In PU learning, the unlabeled data set is treated as a negative example data set containing samples with noisy labels, and the binary classification loss is computed using the class prior probability. <xref ref-type="disp-formula" rid="eqn-9">Formula (9)</xref> gives the expected binary classification loss.
<disp-formula id="eqn-9"><label>(9)</label><mml:math id="mml-eqn-9" display="block"><mml:mover><mml:mi>R</mml:mi><mml:mo accent="false">&#x00AF;</mml:mo></mml:mover><mml:mrow><mml:mo>(</mml:mo><mml:mi>f</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>&#x03C0;</mml:mi><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mi>P</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mi>l</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>&#x03C0;</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mi>l</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>However, in positive unlabeled learning, there are no labeled negative examples, so the loss of negative examples cannot be directly calculated. In nnPU, it is proposed to estimate the loss of negative examples through unlabeled data sets, which is also the core idea of nnPU. The unlabeled data set mixes positive and negative samples, and it is regarded as a negative data set containing wrongly labeled samples, then the loss expectation can be expressed as follows:
<disp-formula id="eqn-10"><label>(10)</label><mml:math id="mml-eqn-10" display="block"><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mi>U</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mi>l</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>&#x03C0;</mml:mi><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mrow><mml:msub><mml:mi>U</mml:mi><mml:mrow><mml:mi>P</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mi>l</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>&#x03C0;</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mi>l</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula>where <inline-formula id="ieqn-56"><mml:math id="mml-ieqn-56"><mml:mi>&#x03C0;</mml:mi></mml:math></inline-formula> is the class prior probability in the unlabeled dataset, <inline-formula id="ieqn-57"><mml:math id="mml-ieqn-57"><mml:mi>l</mml:mi></mml:math></inline-formula> is the loss function, and <inline-formula id="ieqn-58"><mml:math 
id="mml-ieqn-58"><mml:msub><mml:mi>U</mml:mi><mml:mrow><mml:mi>P</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the set of positive samples in the unlabeled dataset. In <xref ref-type="disp-formula" rid="eqn-10">formula (10)</xref>, <inline-formula id="ieqn-59"><mml:math id="mml-ieqn-59"><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mi>U</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mi>l</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> can be directly calculated, and <inline-formula id="ieqn-60"><mml:math id="mml-ieqn-60"><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mi>l</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> is the negative sample loss to be estimated, so the problem is transformed into calculation <inline-formula id="ieqn-61"><mml:math id="mml-ieqn-61"><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mrow><mml:msub><mml:mi>U</mml:mi><mml:mrow><mml:mi>P</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mi>l</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula>.</p>
<p>In the TS scenario, both the positively labeled dataset and the unlabeled dataset are obtained by random sampling, so the expected loss of the positively labeled dataset and the expected loss of the positive sample in the unlabeled dataset are approximate, as follows:
<disp-formula id="eqn-11"><label>(11)</label><mml:math id="mml-eqn-11" display="block"><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mi>P</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mi>l</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mrow><mml:msub><mml:mi>U</mml:mi><mml:mrow><mml:mi>P</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mi>l</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>Combine <xref ref-type="disp-formula" rid="eqn-10">formulas (10)</xref> and <xref ref-type="disp-formula" rid="eqn-11">(11)</xref> to get the method of estimating binary classification error, as shown in <xref ref-type="disp-formula" rid="eqn-12">formula (12)</xref>.
<disp-formula id="eqn-12"><label>(12)</label><mml:math id="mml-eqn-12" display="block"><mml:msub><mml:mover><mml:mi>R</mml:mi><mml:mo accent="false">&#x00AF;</mml:mo></mml:mover><mml:mrow><mml:mrow><mml:mtext>PU</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>&#x03C0;</mml:mi><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mi>P</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mi>l</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mo movablelimits="true" form="prefix">max</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mi>U</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>l</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mi>&#x03C0;</mml:mi><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mi>P</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mi>l</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>The formula is called Non-negative risk estimator [<xref ref-type="bibr" rid="ref-40">40</xref>], where, <inline-formula id="ieqn-62"><mml:math id="mml-ieqn-62"><mml:mo movablelimits="true" form="prefix">max</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mi>U</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>l</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mi>&#x03C0;</mml:mi><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mi>P</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mi>l</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> is the estimated counter-example loss, <inline-formula id="ieqn-63"><mml:math id="mml-ieqn-63"><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mi>P</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mi>l</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> is the expectation of the positive sample loss.</p>
<p>When performing intrusion detection, normal traffic is taken as a positive sample, so the proportion of positive samples in the unlabeled data set to be detected is usually much larger than that of negative examples, and there is a problem of data imbalance.</p>
<p>To deal with the data imbalance problem caused by the small prior probability of the class, the loss function of positive unlabeled learning is set as the focal loss, which is illustrated in <xref ref-type="fig" rid="fig-4">Fig. 4</xref>. The focal loss can be written as:</p>
<p><disp-formula id="eqn-13"><label>(13)</label><mml:math id="mml-eqn-13" display="block"><mml:mi>f</mml:mi><mml:mi>l</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>t</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mtable columnalign="left left" rowspacing=".2em" columnspacing="1em" displaystyle="false"><mml:mtr><mml:mtd><mml:mo>&#x2212;</mml:mo><mml:mi>&#x03B1;</mml:mi><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>t</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mi>&#x03B3;</mml:mi></mml:mrow></mml:msup><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>&#x2212;</mml:mo><mml:mi>&#x03B1;</mml:mi><mml:msup><mml:mi>t</mml:mi><mml:mrow><mml:mi>&#x03B3;</mml:mi></mml:mrow></mml:msup><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mtd></mml:mtr></mml:mtable><mml:mo fence="true" stretchy="true" symmetric="true"></mml:mo></mml:mrow></mml:math></disp-formula></p>
<fig id="fig-4">
<label>Figure 4</label>
<caption>
<title>Loss comparison under various values of the focusing parameter <inline-formula id="ieqn-66"><mml:math id="mml-ieqn-66"><mml:mi>&#x03B3;</mml:mi></mml:math></inline-formula></title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_44506-fig-4.tif"/>
</fig>
<p>During the training process of the model, when positive samples are misidentified, they will be regarded as difficult samples. At this time, there is a gap of tens or even hundreds of times between <inline-formula id="ieqn-64"><mml:math id="mml-ieqn-64"><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:mi>f</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mi>&#x03B3;</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> and <inline-formula id="ieqn-65"><mml:math id="mml-ieqn-65"><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>f</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mi>&#x03B3;</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, the weight of difficult samples can be increased to improve the classification performance of nnPU under data imbalance. The modified non-negative risk estimator is shown in <xref ref-type="disp-formula" rid="eqn-14">formula (14)</xref>.
<disp-formula id="eqn-14"><label>(14)</label><mml:math id="mml-eqn-14" display="block"><mml:mover><mml:mi>R</mml:mi><mml:mo accent="false">&#x00AF;</mml:mo></mml:mover><mml:mrow><mml:mo>(</mml:mo><mml:mi>f</mml:mi><mml:mi>l</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>&#x03C0;</mml:mi><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mi>P</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mi>f</mml:mi><mml:mi>l</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mo movablelimits="true" form="prefix">max</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mi>U</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>f</mml:mi><mml:mi>l</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mn>0</mml:mn><mml:mo stretchy="false">)</mml:mo><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mi>&#x03C0;</mml:mi><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mi>P</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mi>f</mml:mi><mml:mi>l</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mn>0</mml:mn><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>The specific steps of positive unlabeled learning are shown in Algorithm 3.</p>
<fig id="fig-16">
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_44506-fig-16.tif"/>
</fig>
<p>From the above analysis, it can be seen that compared with the binary classification model, positive unlabeled learning is adjusted in the error calculation, the binary classification error is estimated through the risk estimator, and the estimated binary classification error is used for backpropagation to adjust the parameters of the neural network model.</p>
</sec>
<sec id="s3_3_2">
<label>3.3.2</label>
<title>Neural Network Settings</title>
<p>In the process of using machine learning methods for industrial control system intrusion detection, it is necessary to pay attention to the real-time requirements of the industrial control system for the model, and the model is required to quickly make judgments on the input data. Therefore, the neural network structure used needs to be simplified as much as possible. On the one hand, the simplified model can reduce the detection response time and improve the real-time performance of the model. On the other hand, it can reduce the demand for computing resources and is more in line with the application scenarios of industrial control systems.</p>
<p>Positive unlabeled learning is a learning algorithm based on neural networks, which trains neural network models by estimating classification errors in scenarios where there is only one type of labeled data. The difference in neural network structure will also affect the performance of the model. In this section, we discuss two positive unlabeled learning models with different network structures.</p>
<p>The first is a fully connected deep neural network (DNN). It is a neural network with multiple hidden layers. In theory, a DNN can fit any function. Reference [<xref ref-type="bibr" rid="ref-42">42</xref>] discussed the classification performance of DNNs with different numbers of hidden layers in intrusion detection, and the results show that when performing binary classification, the DNN model with three hidden layers can have relatively high classification performance, and as the number of layers increases, the classification performance does not improve significantly. Therefore, in this paper, a DNN model with 3 hidden layers is selected, and the numbers of nodes in the three hidden layers are 256, 64, and 16, respectively. The network structure settings of the model are shown in <xref ref-type="table" rid="table-4">Table 4</xref>.</p>
<table-wrap id="table-4">
<label>Table 4</label>
<caption>
<title>Parameters of the neural network model</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th>S. No.</th>
<th>Type</th>
<th>Output</th>
<th>Node</th>
<th>Activation function</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>Fully connected layer</td>
<td>(None, 256)</td>
<td>256</td>
<td>ReLu</td>
</tr>
<tr>
<td>2</td>
<td>Batch normalization</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>3</td>
<td>Fully connected layer</td>
<td>(None, 64)</td>
<td>64</td>
<td>ReLu</td>
</tr>
<tr>
<td>4</td>
<td>Batch normalization</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>5</td>
<td>Fully connected layer</td>
<td>(None, 16)</td>
<td>16</td>
<td>ReLu</td>
</tr>
<tr>
<td>6</td>
<td>Batch normalization</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>7</td>
<td>Fully connected layer</td>
<td>(None, 1)</td>
<td>1</td>
<td>Sigmoid</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The positive unlabeled learning completes a binary classification task through DNN, divides all samples to be detected into normal traffic and intrusion traffic, and the output of DNN is mapped to the [0,1] interval through the Sigmoid function to complete the binary classification task.</p>
<p>In DNN, batch normalization (BN) is performed between two fully connected layers, that is, the output of each hidden layer neuron is standardized, so that the input value of the nonlinear transformation function falls into an area that is more sensitive to the input. The use of BN can speed up the convergence of the neural network. In addition, BN allows the model to use a higher learning rate and reduces the model&#x2019;s requirements for network parameter initialization. It can also act as a regularizer, and in some cases can eliminate the need for dropout.</p>
<p>The activation function in DNN is the ReLu function, which has the following advantages. (1) It can speed up network training. Compared with sigmoid and tanh, its derivative is faster to compute. (2) It prevents the gradient from vanishing. When the input value is too large or too small, the derivatives of sigmoid and tanh are close to 0, whereas ReLu is an unsaturated activation function, for which this problem does not exist. (3) It makes the network sparse.</p>
<p>The weight update algorithm uses the Adam algorithm, which is an adaptive learning rate optimization algorithm, and has the advantages of fast convergence and less memory usage.</p>
<p>The second is the Convolutional Neural Network (CNN). In this paper, a simple CNN structure, Lenet-5, is adopted. Because Lenet-5 is a network for processing two-dimensional images, its input is required to be 32 <inline-formula id="ieqn-77"><mml:math id="mml-ieqn-77"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> 32, whereas the data of industrial control systems are usually one-dimensional vectors. The network structure is therefore adjusted by replacing the two-dimensional convolution in Lenet-5 with one-dimensional convolution, so that the input size is 32 <inline-formula id="ieqn-78"><mml:math id="mml-ieqn-78"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> 1. Therefore, it is necessary to perform feature selection and reduce the dimension to 32 dimensions before training the model. The first layer of the network uses a 5 <inline-formula id="ieqn-79"><mml:math id="mml-ieqn-79"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> 1 convolution, and after passing through the first layer, six feature maps with a size of 28 <inline-formula id="ieqn-80"><mml:math id="mml-ieqn-80"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> 1 are obtained, and then through the maximum pooling sampling with a size of 2, the size is changed to 14 <inline-formula id="ieqn-81"><mml:math id="mml-ieqn-81"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> 1. 
The second convolutional layer uses 5 <inline-formula id="ieqn-82"><mml:math id="mml-ieqn-82"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> 1 convolution to output 16 feature maps with a size of 10 <inline-formula id="ieqn-83"><mml:math id="mml-ieqn-83"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> 1, and then changes it to 5 <inline-formula id="ieqn-84"><mml:math id="mml-ieqn-84"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> 1 through the maximum pooling sampling with a size of 2, and finally flattens all feature maps and feeds them into the fully connected part. The fully connected part has two layers: the number of neurons in the first layer is 120, and the number of neurons in the second layer is 84. Finally, according to the classification category, the output is performed through the softmax function. The model structure of the industrial control system based on Lenet-5 is shown in <xref ref-type="fig" rid="fig-5">Fig. 5</xref>. The &#x201C;?&#x201D; in the input (?, 32, 1) represents the batch size, and the activation function uses the ReLu function.</p>
<fig id="fig-5">
<label>Figure 5</label>
<caption>
<title>CNN-based framework for intrusion detection in positive unlabeled learning</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_44506-fig-5.tif"/>
</fig>
<p>So far, the model structure based on positive unlabeled learning can be obtained. The offline training steps of the intrusion detection model based on positive unlabeled learning are as follows:
<list list-type="bullet">
<list-item>
<p>Read data, including positive label data and unlabeled data to be detected, and perform data preprocessing;</p></list-item>
<list-item>
<p>Using the OCSVM-cE technique, estimate the class prior probability of the unlabeled data set and save the OCSVM model;</p></list-item>
<list-item>
<p>Calculate the feature importance through the KL divergence, set the threshold <italic>th</italic> or the selected feature number <italic>K</italic>, and perform feature selection according to the feature importance to obtain a new training data set;</p></list-item>
<list-item>
<p>Initialize a deep neural network and use the new training data set after feature selection to train the PU learning model. The training process is shown in Algorithm 3;</p></list-item>
<list-item>
<p>Export the trained neural network and return the predicted value of the unlabeled dataset.</p></list-item>
</list></p>
</sec>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experimental Results</title>
<sec id="s4_1">
<label>4.1</label>
<title>Data Introduction and Analysis</title>
<p>Three publicly available datasets for intrusion detection are used in the experiments: NSL-KDD [<xref ref-type="bibr" rid="ref-43">43</xref>], UNSW-NB15 [<xref ref-type="bibr" rid="ref-44">44</xref>], and WADI [<xref ref-type="bibr" rid="ref-45">45</xref>]. Among them, the data in the NSL-KDD and UNSW-NB15 datasets are based on the characteristics extracted from Internet traffic, including the basic characteristics of the flow (such as transport layer protocol type, port, etc.), time information of the flow, connection content characteristics, etc. These characteristics can also be provided as industrial control system traffic. At the same time, to further verify the effectiveness of the model in the industrial control scene, the WADI dataset is introduced. On the one hand, it consists of industrial control data provided by an industrial control test bench. On the other hand, it reflects the unbalanced characteristics of industrial control data.</p>
<p>In terms of attack types, the NSL-KDD dataset is an improvement on the KDDCUP99 dataset in which some redundant data are removed. The dataset contains normal traffic and 22 types of attack traffic. The attack traffic mainly falls into four categories: denial of service attack (DoS), monitoring and detection (Probing), remote machine illegal access (R2L), and ordinary user unauthorized access (U2R). The UNSW-NB15 dataset is an intrusion detection dataset generated by the Australian Cyber Security Center, including samples of 9 types of attacks including DoS and Backdoors. The WADI dataset is collected on an attached test rig, which consists of many large tanks that supply water to user tanks. The WADI dataset contains 16 attacks whose goal is to stop the water supply to the user tanks.</p>
<p>In the experiment, the UNSW-NB15 dataset uses the training and testing data sets provided by the official website, with a total of 257,673 samples. The WADI dataset uses labeled data from October 2019. The sample size of each data set is shown in <xref ref-type="table" rid="table-5">Table 5</xref>.</p>
<table-wrap id="table-5">
<label>Table 5</label>
<caption>
<title>Description of datasets</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th>Dataset</th>
<th>Number of abnormal samples</th>
<th>Number of normal samples</th>
<th>Dimension</th>
</tr>
</thead>
<tbody>
<tr>
<td>NSL-KDD</td>
<td>83206</td>
<td>90503</td>
<td>41</td>
</tr>
<tr>
<td>UNSW-NB15</td>
<td>164673</td>
<td>93000</td>
<td>39</td>
</tr>
<tr>
<td>WADI</td>
<td>9977</td>
<td>162824</td>
<td>127</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Data Preprocessing</title>
<p>In the division of training and test data sets, based on the true labels of the samples, a specified number of positive samples are randomly selected from the positive data as the training set, and the remaining data are used as the test set. In terms of data processing, for the string data existing in NSL-KDD, such as protocol types and services, one-hot encoding is required to convert the string into a vector, and the dimension of the NSL-KDD dataset after encoding is increased from 41 dimensions to 122 dimensions. The data in the UNSW-NB15 and WADI datasets do not have null values and strings, so they can be used directly.</p>
<p>The equipment used in this experiment: the processor is Intel core i7 8750H, the operating system is 64-bit Windows 10 Home Chinese Edition, the hard disk is Western Digital SN720, and the memory is 16 GB.</p>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Evaluation Index</title>
<p>After the model is trained, the data set to be predicted is classified through the model, and based on the judgment result of the model, the confusion matrix shown in <xref ref-type="table" rid="table-6">Table 6</xref> can be established.</p>
<table-wrap id="table-6">
<label>Table 6</label>
<caption>
<title>Positive and counterexample matrix</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th></th>
<th>Positive example</th>
<th>Counter example</th>
</tr>
</thead>
<tbody>
<tr>
<td>Positive example</td>
<td>TP</td>
<td>FN</td>
</tr>
<tr>
<td>Counter example</td>
<td>FP</td>
<td>TN</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>As shown in <xref ref-type="table" rid="table-6">Table 6</xref>, the row represents the true category of the data, and the column represents the predicted category of the model. In intrusion detection, the focus is on the ability of the model to identify intrusion samples. Therefore, the precision and recall of intrusion samples are used as evaluation indicators. Precision is the proportion of samples predicted as positive examples whose true label is positive, as shown in <xref ref-type="disp-formula" rid="eqn-15">formula (15)</xref>.
<disp-formula id="eqn-15"><label>(15)</label><mml:math id="mml-eqn-15" display="block"><mml:mrow><mml:mtext>precision</mml:mtext></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mtext>TP</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mtext>TP</mml:mtext></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mtext>FP</mml:mtext></mml:mrow></mml:mrow></mml:mfrac></mml:math></disp-formula></p>
<p>The recall rate is shown in <xref ref-type="disp-formula" rid="eqn-16">formula (16)</xref>. The recall rate describes the proportion of samples whose true category is positive that the model recognizes as positive examples.
<disp-formula id="eqn-16"><label>(16)</label><mml:math id="mml-eqn-16" display="block"><mml:mrow><mml:mtext>recall</mml:mtext></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mtext>TP</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mtext>TP</mml:mtext></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mtext>FN</mml:mtext></mml:mrow></mml:mrow></mml:mfrac></mml:math></disp-formula></p>
<p>The F1-score is also often used as an evaluation index. The F1-score is the harmonic mean of precision and recall, as shown in <xref ref-type="disp-formula" rid="eqn-17">formula (17)</xref>.
<disp-formula id="eqn-17"><label>(17)</label><mml:math id="mml-eqn-17" display="block"><mml:mrow><mml:mtext>F</mml:mtext></mml:mrow><mml:mn>1</mml:mn><mml:mo>=</mml:mo><mml:mfrac><mml:mn>2</mml:mn><mml:mrow><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:mtext>precision</mml:mtext></mml:mrow></mml:mfrac><mml:mo>+</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:mtext>recall</mml:mtext></mml:mrow></mml:mfrac></mml:mrow></mml:mfrac><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>2</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mrow><mml:mtext>precision</mml:mtext></mml:mrow><mml:mo>&#x00D7;</mml:mo><mml:mrow><mml:mtext>recall</mml:mtext></mml:mrow></mml:mrow><mml:mrow><mml:mrow><mml:mtext>precision</mml:mtext></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mtext>recall</mml:mtext></mml:mrow></mml:mrow></mml:mfrac></mml:math></disp-formula></p>
<p>In addition to the above indicators, in the intrusion detection scenario, due to the large amount of data faced, the time taken for model training and prediction is also an important indicator to measure the performance of the model.</p>
</sec>
<sec id="s4_4">
<label>4.4</label>
<title>Analysis of Results</title>
<sec id="s4_4_1">
<label>4.4.1</label>
<title>Effectiveness Analysis and Time Efficiency of Feature Importance</title>
<p>In this experiment, the importance of each feature is first calculated by random forest in the binary classification scenario, and compared with the feature importance calculated based on KL divergence to verify the effectiveness of the feature weight calculated using KL divergence.</p>
<p>In the experiment, 2000 positive samples were randomly selected from all samples as the positive label data set, and then 2000 positive samples and 4000 negative samples were mixed as the unlabeled data set, and all the remaining samples were used as the test set. <xref ref-type="fig" rid="fig-6">Figs. 6</xref> and <xref ref-type="fig" rid="fig-7">7</xref> show the experimental results of the KLOCSVM and KDE-OCSVM algorithms in the UNSW-NB15 dataset and the NSL-KDD dataset, respectively.</p>
<fig id="fig-6">
<label>Figure 6</label>
<caption>
<title>Evaluation of feature importance (UNSW-NB15)</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_44506-fig-6.tif"/>
</fig><fig id="fig-7">
<label>Figure 7</label>
<caption>
<title>Evaluation of feature importance (NSL-KDD)</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_44506-fig-7.tif"/>
</fig>
<p>Further, the correlation of the feature importance obtained by the two algorithms is calculated and the correlation test is carried out. By calculation, under the UNSW-NB15 data set, the average correlation coefficient of the feature importance of the two algorithms after normalization is 0.72, and the <italic>p</italic> value of the test is 4.29 <inline-formula id="ieqn-85"><mml:math id="mml-ieqn-85"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula>10<sup>&#x2013;7</sup>. The correlation coefficient on the NSL-KDD data set is 0.9364, and the <italic>p</italic> value of the test is 1.15 <inline-formula id="ieqn-86"><mml:math id="mml-ieqn-86"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula>10<sup>&#x2013;56</sup>. With a significance level of 0.05, it can be concluded that there is a significant correlation between the feature importance calculated by KL divergence and the feature importance in the case of binary classification, that is, the feature importance calculated by KL divergence is effective.</p>
</sec>
<sec id="s4_4_2">
<label>4.4.2</label>
<title>Class Prior Probability Estimation</title>
<p>To verify the effectiveness of the OCSVM-cE algorithm proposed in this paper, it is compared with the following class prior probability algorithm.
<list list-type="bullet">
<list-item>
<p>KM1/KM2 algorithm. This algorithm embeds it into the kernel space by calculating the distribution of positive and negative data sets and can solve the class prior probability by solving a quadratic programming problem. The algorithm is an algorithm with high estimation accuracy at present.</p></list-item>
<list-item>
<p>TIcE algorithm. This algorithm divides all samples based on a decision tree, raises the lower bound of the label frequency of positive samples through subsets, obtains the estimated value of the label frequency of positive samples, and then calculates the class prior. This algorithm is currently the algorithm with the lowest time complexity for class-prior probability estimation.</p></list-item>
<list-item>
<p>One-class SVM-cE algorithm. The algorithm proposed in this paper trains the One-class SVM model to find reliable positive examples of unlabeled datasets, estimates the label frequency of positive examples through the reliable positive examples, and then calculates the class prior probability.</p></list-item>
</list></p>
<p>In the class prior probability estimation problem, the core evaluation index is the estimation accuracy, that is, the error between the estimated value and the real value. In addition, the time complexity of the algorithm is also an important evaluation index.</p>
<p>Based on the above evaluation indicators, the following two experiments are designed for verification: 1) To verify the accuracy of class prior probability estimation, construct unlabeled data sets with different class prior probabilities in the experiment, and estimate the class prior probabilities of the constructed unlabeled data sets through four different baseline algorithms, analyze the error between the estimated value of different algorithms and the real value; 2) Verify the time complexity of the algorithm. In this experiment, we first compare the time required for each algorithm to estimate the class prior probability under the same sample size, and then estimate the time trend of the class prior probability under different sample sizes.</p>
<p>The first is the accuracy of class prior probability estimates. In the experiment, the sample size of the positive label dataset is set to 1000, and the number of negative samples in the unlabeled dataset is 2000, respectively, constructing unlabeled datasets with class prior probabilities of 0.1, 0.2, 0.3, 0.4, and 0.5. The class prior probabilities were estimated for the constructed datasets using baseline algorithms, respectively.</p>
<p>The experimental results are shown in <xref ref-type="fig" rid="fig-8">Figs. 8</xref> and <xref ref-type="fig" rid="fig-9">9</xref>. The abscissa is the true class prior probability, and the ordinate is the absolute value of the error between the estimated value and the true value. The experimental results show that the one-class SVM-cE algorithm can maintain high estimation accuracy on the two data sets, the error is close to that of the KM2 algorithm and maintained below 0.05, and the stability of the algorithm estimation is better.</p>
<fig id="fig-8">
<label>Figure 8</label>
<caption>
<title>Error comparison of algorithms (UNSW-NB15)</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_44506-fig-8.tif"/>
</fig><fig id="fig-9">
<label>Figure 9</label>
<caption>
<title>Error comparison of algorithms (NSL-KDD)</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_44506-fig-9.tif"/>
</fig>
<p>During the experiment, the TIcE algorithm has a large positive error. This is because the TIcE algorithm estimates the real label frequency by seeking the lower bound of the label frequency, which will cause the estimated label frequency to be lower than the real value, so the estimated class prior probability is larger than the true value. In the one-class SVM-cE algorithm, the one-class SVM algorithm is used to find reliable positive examples, avoiding the use of lower bounds, and improving the accuracy of estimation.</p>
<p>To further test the stability of the one-class SVM-cE algorithm estimation, the number of samples in the positive label data set is set to 2000, a value is randomly selected in the interval [0.1,0.9] as the class prior probability to construct the unlabeled data set, the experiment is repeated 100 times, and the error between the class prior probability estimate and the true value is computed.</p>
<p><xref ref-type="fig" rid="fig-10">Fig. 10</xref> shows the boxplot of 100 repeated experiments. It can be found that the predicted effect of the one-class SVM-cE algorithm on the NSL-KDD and UNSW-NB15 data sets is better than that on the WADI data set, and the quartiles of the estimated error are less than 0.05, while the lower quartile of the estimated error on the WADI dataset is 0.0407, the median is 0.0672, and the upper quartile is 0.0884. There are only two outliers, so the estimated value of WADI is relatively stable, and the error is concentrated in the [0.05, 0.1] interval. Combining the estimation results of the three data sets, OCSVM-cE is a stable class prior probability estimation algorithm.</p>
<fig id="fig-10">
<label>Figure 10</label>
<caption>
<title>Error comparison of various datasets</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_44506-fig-10.tif"/>
</fig>
<p><xref ref-type="fig" rid="fig-10">Fig. 10</xref> shows the boxplot of 100 repeated experiments. It can be found that the predicted effect of the one-class SVM-cE algorithm on the NSL-KDD and UNSW-NB15 datasets is better than that on the WADI dataset, and the quartiles of the estimated error are less than 0.05, while the lower quartile of the estimated error on the WADI dataset is 0.0407, the median is 0.0672, and the upper quartile is 0.0884. There are only two outliers, so the estimated value of WADI is relatively stable, and the error is concentrated in the [0.05, 0.1] interval. Combining the estimation results of the three datasets, one-class SVM-cE is a stable class prior probability estimation algorithm.</p>
<p>In PU learning, class prior probability is important prior knowledge, and its estimated error will directly affect the performance of the trained model. Through experiments, we further explore the influence of class prior probability estimation error on model performance. In the experiment, the true class prior probability of the unlabeled data set is set to 0.4, and different values in the interval [0,1], taken with a step of 0.05, are used as the estimated value of the class prior probability. The results are shown in <xref ref-type="fig" rid="fig-11">Fig. 11</xref>, which presents the experimental results of setting the number of positive label samples to 10,000 and the number of negative samples in the unlabeled dataset to 20,000 under the UNSW-NB15 dataset. The abscissa is the estimated class prior probability, and the ordinate is the F1-score. It can be observed that when the estimated class prior probability is 0.4, the F1-score achieves its highest value and the model&#x2019;s performance is the best, and that the performance degrades as the error between the estimated and true class prior probabilities increases. When the estimated value is zero, all unlabeled samples are classified as negative, and when the estimated value is one, all unlabeled samples are classified as positive. According to the F1-score analysis, the estimation error of the class prior probability should be less than 0.05 to guarantee that the model has satisfactory classification performance.</p>
<fig id="fig-11">
<label>Figure 11</label>
<caption>
<title>F1-score evaluation under class prior</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_44506-fig-11.tif"/>
</fig>
<p><xref ref-type="fig" rid="fig-12">Fig. 12</xref> shows that when the number of fixed positive samples is 1000, the time required by the one-class SVM-cE algorithm and the TIcE algorithm is positively correlated with the number of unlabeled samples [<xref ref-type="bibr" rid="ref-46">46</xref>,<xref ref-type="bibr" rid="ref-47">47</xref>]. Considering that in one-class SVM-cE, only positive samples are needed to train the OCSVM model, it can be considered that the one-class SVM-cE algorithm is more suitable for intrusion detection application scenarios, and the one-class SVM model trained in the process can be reused. When the class prior probability estimation is performed on an unlabeled dataset, the model can be directly loaded to classify reliable positive examples.</p>
<fig id="fig-12">
<label>Figure 12</label>
<caption>
<title>Comparison of estimation time of the algorithms (UNSW-NB15)</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_44506-fig-12.tif"/>
</fig>
<p><xref ref-type="fig" rid="fig-13">Fig. 13</xref> compares the runtime of the proposed algorithm under the IEC 60870-5-104 [<xref ref-type="bibr" rid="ref-48">48</xref>] and DNP3 [<xref ref-type="bibr" rid="ref-49">49</xref>] datasets. As can be seen from <xref ref-type="fig" rid="fig-13">Fig. 13</xref>, the runtime of the proposed algorithm under the IEC 60870-5-104 dataset is better than that under the DNP3 dataset.</p>
<fig id="fig-13">
<label>Figure 13</label>
<caption>
<title>Comparison of estimation time of the proposed algorithm under IEC 60870-5-104 and DNP3 datasets</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_44506-fig-13.tif"/>
</fig>
</sec>
<sec id="s4_4_3">
<label>4.4.3</label>
<title>Positive Unlabeled Learning Performance Analysis</title>
<p>The neural network settings of the compared binary classification models are as follows. The DNN settings are the same as the DNN network model used for PU learning: the model contains three hidden layers, the number of neurons in the first layer is 256, the number of neurons in the second layer is 64, and the number of neurons in the third layer is 16, but the positive and negative samples with real labels are used for training during the training process. The network structure of the CNN uses the same LeNet-5 structure. The input is an image of 32 <inline-formula id="ieqn-87"><mml:math id="mml-ieqn-87"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> 32, and the first layer uses a convolution of 5 <inline-formula id="ieqn-88"><mml:math id="mml-ieqn-88"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> 5 [<xref ref-type="bibr" rid="ref-50">50</xref>]. After passing through the first layer, six feature maps with a size of 28 <inline-formula id="ieqn-89"><mml:math id="mml-ieqn-89"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> 28 are obtained, and through the maximum pooling sampling of 2 <inline-formula id="ieqn-90"><mml:math id="mml-ieqn-90"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> 2, they are changed to a size of 14 <inline-formula id="ieqn-91"><mml:math id="mml-ieqn-91"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> 14. The second convolutional layer uses a convolution of 5 <inline-formula id="ieqn-92"><mml:math id="mml-ieqn-92"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> 5 to output 16 feature maps with a size of 10 <inline-formula id="ieqn-93"><mml:math id="mml-ieqn-93"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> 10, which, after the maximum pooling sampling of 2 <inline-formula id="ieqn-94"><mml:math id="mml-ieqn-94"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> 2, are changed to 5 <inline-formula id="ieqn-95"><mml:math id="mml-ieqn-95"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> 5. Finally, all the feature maps are flattened and input into the fully connected layers, where the number of neurons in the first layer is 120 and the number of neurons in the second layer is 84, and the output is produced through the softmax function according to the classification category. RNN sets the number of hidden layer nodes to 80.</p>
<p>In the experiment, the number of positive labeled samples is set to 10,000, the number of negative examples in the unlabeled data set is 2000, the class prior probability is 0.9, the learning rate is 0.01, and the number of iterations is 50.</p>
<p><xref ref-type="table" rid="table-7">Table 7</xref> shows the comparison results of positive unlabeled learning and binary classification models. The comparison experiments in the table can be divided into two categories: a comparison of positive unlabeled learning and binary classification performance under the same network structure (DNN/CNN), and a comparison of positive unlabeled learning with the currently better-performing binary classification models. According to the experimental data, the precision of positive unlabeled learning under the same network topology is comparable to that of the binary classification model; however, there is a small gap in the recall rate [<xref ref-type="bibr" rid="ref-51">51</xref>]. According to the previous analysis, industrial control intrusion detection requires higher precision of the model, and it is expected to &#x201C;prefer false negatives rather than false positives&#x201D;, so positive unlabeled learning is suitable for industrial control intrusion detection, and compared with the current advanced CNN-BiLSTM and other models, it still maintains only a small gap in precision. At the same time, compared with the binary classification model, positive unlabeled learning reduces the requirements for training data: only one type of labeled data is needed, which can effectively reduce the data collection work. In addition, only positive and unlabeled data are used for training, so that the model can mine unknown types of intrusion.</p>
<table-wrap id="table-7">
<label>Table 7</label>
<caption>
<title>Proposed algorithm evaluation on various datasets (binary classification)</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th>Dataset</th>
<th>AUC</th>
<th>Precision</th>
<th>Recall</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" rowspan="6">NSL-KDD</td>
<td>0.9801</td>
<td>0.9972</td>
<td>0.8689</td>
</tr>
<tr>
<td>0.9694</td>
<td>0.9915</td>
<td>0.8888</td>
</tr>
<tr>
<td>0.9974</td>
<td>0.9982</td>
<td>0.9491</td>
</tr>
<tr>
<td>0.9981</td>
<td>0.9986</td>
<td>0.9433</td>
</tr>
<tr>
<td>0.9979</td>
<td>0.9986</td>
<td>0.9228</td>
</tr>
<tr>
<td>0.9979</td>
<td>0.9990</td>
<td>0.9435</td>
</tr>
<tr>
<td valign="middle" rowspan="6">UNSW-NB15</td>
<td>0.9171</td>
<td>0.9973</td>
<td>0.6893</td>
</tr>
<tr>
<td>0.9145</td>
<td>0.9948</td>
<td>0.6751</td>
</tr>
<tr>
<td>0.9819</td>
<td>0.9964</td>
<td>0.7997</td>
</tr>
<tr>
<td>0.9791</td>
<td>0.9981</td>
<td>0.7236</td>
</tr>
<tr>
<td>0.9798</td>
<td>0.9979</td>
<td>0.7456</td>
</tr>
<tr>
<td>0.9822</td>
<td>0.9983</td>
<td>0.7791</td>
</tr>
<tr>
<td valign="middle" rowspan="6">WADI</td>
<td>0.9861</td>
<td>0.9772</td>
<td>0.8921</td>
</tr>
<tr>
<td>0.9811</td>
<td>0.9008</td>
<td>0.9271</td>
</tr>
<tr>
<td>0.9993</td>
<td>0.9718</td>
<td>0.9705</td>
</tr>
<tr>
<td>0.9991</td>
<td>0.9847</td>
<td>0.9937</td>
</tr>
<tr>
<td>0.9993</td>
<td>0.9801</td>
<td>0.9575</td>
</tr>
<tr>
<td>0.9995</td>
<td>0.9875</td>
<td>0.9773</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In <xref ref-type="table" rid="table-7">Table 7</xref>, the experiment compares the performance of PU learning and binary classification model, then compares the proposed learning and anomaly detection model, and analyzes the performance difference under the same condition of only one type of label data. From the research analysis listed in <xref ref-type="table" rid="table-2">Table 2</xref>, the anomaly detection models currently used for intrusion detection are mainly AE and one-class SVM, where AE is an unsupervised model, and consists of two parts: Encoder and decoder. The role of the encoder is to find the compressed representation of the given data. The decoder is used to reconstruct the original input and perform anomaly detection by calculating the error between the reconstructed input and the original input [<xref ref-type="bibr" rid="ref-52">52</xref>]. At the same time, looking at the research of one-class SVM for intrusion detection, its main work is focused on feature engineering. In this experiment, feature selection is based on the feature importance metric of positive unlabeled learning, and one-class SVM is used for anomaly detection. In terms of parameter setting, one-class SVM sets the upper limit of the error to 0.1, and the parameters of AE adopt the default settings in the source code.</p>
<p><xref ref-type="table" rid="table-8">Table 8</xref> shows the comparison results of the positive unlabeled learning and anomaly detection models. In particular, the indicators in the table show that the performance of the one-class SVM and AE models on the WADI dataset is poor, which is caused by the imbalance of the test data. The ratio of positive data to negative data in the test data set is about 16:1, which also shows that the one-class SVM and AE algorithms are insufficient when dealing with unbalanced data, whereas positive unlabeled learning improves the performance of the model under unbalanced data through focal loss [<xref ref-type="bibr" rid="ref-53">53</xref>]. Therefore, positive unlabeled learning has significantly improved the precision rate and recall rate. On the three data sets, the proposed algorithm has significantly better performance than AE and one-class SVM in terms of precision rate.</p>
<table-wrap id="table-8">
<label>Table 8</label>
<caption>
<title>Proposed algorithm evaluation on various datasets (anomaly detection)</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th>Dataset</th>
<th>AUC</th>
<th>Precision</th>
<th>Recall</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" rowspan="4">NSL-KDD</td>
<td>0.9801</td>
<td>0.9972</td>
<td>0.8689</td>
</tr>
<tr>
<td>0.9694</td>
<td>0.9915</td>
<td>0.8888</td>
</tr>
<tr>
<td>&#x2013;</td>
<td>0.9098</td>
<td>0.9120</td>
</tr>
<tr>
<td>0.5752</td>
<td>0.5743</td>
<td>0.7089</td>
</tr>
<tr>
<td valign="middle" rowspan="4">UNSW-NB15</td>
<td>0.9171</td>
<td>0.9973</td>
<td>0.6893</td>
</tr>
<tr>
<td>0.9145</td>
<td>0.9948</td>
<td>0.6751</td>
</tr>
<tr>
<td>&#x2013;</td>
<td>0.8487</td>
<td>0.2852</td>
</tr>
<tr>
<td>0.5739</td>
<td>0.7326</td>
<td>0.5028</td>
</tr>
<tr>
<td valign="middle" rowspan="4">WADI</td>
<td>0.9861</td>
<td>0.9772</td>
<td>0.8921</td>
</tr>
<tr>
<td>0.9811</td>
<td>0.9008</td>
<td>0.9271</td>
</tr>
<tr>
<td>&#x2013;</td>
<td>0.2469</td>
<td>0.4747</td>
</tr>
<tr>
<td>0.5742</td>
<td>0.4927</td>
<td>0.1581</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Combining the results of <xref ref-type="table" rid="table-7">Tables 7</xref> and <xref ref-type="table" rid="table-8">8</xref>, it is not difficult to find that although the proposed learning is similar to the anomaly detection algorithm in terms of training data, in that only one type of label data is needed, the classification performance of the trained model is considerably better than that of the anomaly detection algorithm. Especially in industrial control scenarios, taking the WADI dataset as an example, where the ratio of normal data to abnormal data is as high as 16:1, the proposed learning can still maintain a high precision and recall rate, and compared with some binary classification algorithms, there is only a slight difference in precision. Combined with the previously described characteristics of industrial control scenarios, the proposed learning is suitable for anomaly detection in industrial control scenarios.</p>
<p>To sum up, this paper proposes to use proposed learning for intrusion detection. It is an algorithm similar to anomaly detection, but it needs to label positive data on the training data, and the positive data needs to meet the SCAR condition. It can provide intrusion detection with high precision and high recall, and its precision and recall are significantly improved compared with unsupervised anomaly detection models. In particular, it is close to the binary classification model in terms of precision.</p>
</sec>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Conclusion</title>
<p>Industrial control systems are mostly utilized in nuclear power, water conservation, and other critical infrastructures. It is important to assure the safety of industrial control systems. The intrusion detection system ensures network security and is an important component of industrial control system security. In this study, a positive unlabeled learning approach is proposed for intrusion detection in industrial control systems, which uses normal traffic as labeled data to find aberrant samples in the data. A feature significance calculation approach for feature selection is deployed to address the high dimensionality and strong correlation of industrial control system data. Simultaneously, the class prior probability estimation algorithm is enhanced, and the one-class SVM-cE algorithm for class prior probability estimation is employed, which increases the estimate&#x2019;s stability and accuracy. Finally, experiments are performed to validate the efficiency of the suggested learning. When compared to a supervised binary classification model, the proposed learning model maintains a high accuracy rate while having a slightly lower recall rate. Although the suggested learning approach avoids using negative data, it also imposes limits on positive data: positive samples must be picked randomly, that is, their distribution is the same as the distribution of positive samples in the unlabeled dataset. This is a shortcoming of the suggested learning, and future research can concentrate on executing positive unlabeled learning on a data set with selection bias.</p>
</sec>
</body>
<back>
<ack><p>This research is supported by the University of Ha&#x2019;il, Saudi Arabia.</p>
</ack>
<sec><title>Funding Statement</title>
<p>This research has been funded by the Research Deanship at the University of Ha&#x2019;il, Saudi Arabia, through Project Number RG-20146.</p>
</sec>
<sec><title>Author Contributions</title>
<p>The authors confirm their contribution to the paper as follows: study conception and design: A. Alkhalil, D. Uliyan; data collection: M. Altamimi; analysis and interpretation of results: A. Abdelrhman, Y. Altameemi; draft manuscript preparation: A. Ahmad, R. Mansour, A. Alkhalil. All authors reviewed the results and approved the final version of the manuscript.</p>
</sec>
<sec sec-type="data-availability"><title>Availability of Data and Materials</title>
<p>The data used for the findings of this study is available within this article.</p>
</sec>
<sec sec-type="COI-statement"><title>Conflicts of Interest</title>
<p>The authors declare that they have no conflicts of interest to report regarding the present study.</p>
</sec>
<ref-list content-type="authoryear">
<title>References</title>
<ref id="ref-1"><label>[1]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>O.</given-names> <surname>Pospisil</surname></string-name>, <string-name><given-names>P.</given-names> <surname>Blazek</surname></string-name>, <string-name><given-names>K.</given-names> <surname>Kuchar</surname></string-name>, <string-name><given-names>R.</given-names> <surname>Fujdiak</surname></string-name> and <string-name><given-names>J.</given-names> <surname>Misurec</surname></string-name></person-group>, &#x201C;<article-title>Application perspective on cybersecurity testbed for industrial control systems</article-title>,&#x201D; <source>Sensors</source>, vol. <volume>21</volume>, no. <issue>23</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>24</lpage>, <year>2021</year>.</mixed-citation></ref>
<ref id="ref-2"><label>[2]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>J.</given-names> <surname>Hajda</surname></string-name>, <string-name><given-names>R.</given-names> <surname>Jakuszewski</surname></string-name> and <string-name><given-names>S.</given-names> <surname>Ogonowski</surname></string-name></person-group>, &#x201C;<article-title>Security challenges in industry 4.0 PLC systems</article-title>,&#x201D; <source>Applied Sciences</source>, vol. <volume>11</volume>, no. <issue>21</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>20</lpage>, <year>2021</year>.</mixed-citation></ref>
<ref id="ref-3"><label>[3]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>A.</given-names> <surname>Hocky</surname></string-name></person-group>, &#x201C;<article-title>Uncovering the cyber security challenges in healthcare</article-title>,&#x201D; <source>Network Security</source>, vol. <volume>20</volume>, no. <issue>4</issue>, pp. <fpage>18</fpage>&#x2013;<lpage>19</lpage>, <year>2020</year>.</mixed-citation></ref>
<ref id="ref-4"><label>[4]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>P.</given-names> <surname>Radanliev</surname></string-name> and <string-name><given-names>D.</given-names> <surname>Roure</surname></string-name></person-group>, &#x201C;<article-title>Advancing the cybersecurity of the healthcare system with self-optimizing and self-adaptive artificial intelligence (part 2)</article-title>,&#x201D; <source>Health and Technology</source>, vol. <volume>12</volume>, no. <issue>3</issue>, pp. <fpage>923</fpage>&#x2013;<lpage>929</lpage>, <year>2022</year>; <pub-id pub-id-type="pmid">35975178</pub-id></mixed-citation></ref>
<ref id="ref-5"><label>[5]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>S.</given-names> <surname>Pal</surname></string-name> and <string-name><given-names>Z.</given-names> <surname>Jadidi</surname></string-name></person-group>, &#x201C;<article-title>Analysis of security issues and countermeasures for the industrial Internet of Things</article-title>,&#x201D; <source>Applied Sciences</source>, vol. <volume>11</volume>, no. <issue>20</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>19</lpage>, <year>2021</year>.</mixed-citation></ref>
<ref id="ref-6"><label>[6]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>L.</given-names> <surname>Wu</surname></string-name>, <string-name><given-names>W.</given-names> <surname>Zhang</surname></string-name> and <string-name><given-names>W.</given-names> <surname>Zhao</surname></string-name></person-group>, &#x201C;<article-title>Privacy-preserving data aggregation for smart grid with user anonymity and designated recipients</article-title>,&#x201D; <source>Symmetry</source>, vol. <volume>14</volume>, no. <issue>5</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>18</lpage>, <year>2022</year>.</mixed-citation></ref>
<ref id="ref-7"><label>[7]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>M.</given-names> <surname>Sun</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Lai</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Wang</surname></string-name>, <string-name><given-names>J.</given-names> <surname>Liu</surname></string-name>, <string-name><given-names>B.</given-names> <surname>Mao</surname></string-name> <etal>et al.</etal></person-group><italic>,</italic> &#x201C;<article-title>Intrusion detection system based on in-depth understandings of industrial control logic</article-title>,&#x201D; <source>IEEE Transactions on Industrial Informatics</source>, vol. <volume>1</volume>, no. <issue>3</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>12</lpage>, <year>2022</year>.</mixed-citation></ref>
<ref id="ref-8"><label>[8]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>S.</given-names> <surname>Mubarak</surname></string-name>, <string-name><given-names>M.</given-names> <surname>Habaebi</surname></string-name>, <string-name><given-names>M.</given-names> <surname>Islam</surname></string-name>, <string-name><given-names>F.</given-names> <surname>Rahman</surname></string-name> and <string-name><given-names>M.</given-names> <surname>Tahir</surname></string-name></person-group>, &#x201C;<article-title>Anomaly detection in ICS datasets with machine learning algorithms</article-title>,&#x201D; <source>Computer Systems Science and Engineering</source>, vol. <volume>37</volume>, no. <issue>1</issue>, pp. <fpage>33</fpage>&#x2013;<lpage>46</lpage>, <year>2021</year>.</mixed-citation></ref>
<ref id="ref-9"><label>[9]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>Y.</given-names> <surname>Chu</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Lai</surname></string-name> and <string-name><given-names>J.</given-names> <surname>Liu</surname></string-name></person-group>, &#x201C;<article-title>Industrial control intrusion detection approach based on multiclassification GoogLeNet-LSTM model</article-title>,&#x201D; <source>Security and Communication Networks</source>, vol. <volume>19</volume>, no. <issue>1</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>10</lpage>, <year>2019</year>.</mixed-citation></ref>
<ref id="ref-10"><label>[10]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>I.</given-names> <surname>Butun</surname></string-name>, <string-name><given-names>I.</given-names> <surname>Ra</surname></string-name> and <string-name><given-names>R.</given-names> <surname>Sankar</surname></string-name></person-group>, &#x201C;<article-title>An intrusion detection system based on multi-level clustering for hierarchical wireless sensor networks</article-title>,&#x201D; <source>Sensors</source>, vol. <volume>15</volume>, no. <issue>11</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>19</lpage>, <year>2015</year>.</mixed-citation></ref>
<ref id="ref-11"><label>[11]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>H.</given-names> <surname>Wang</surname></string-name>, <string-name><given-names>H.</given-names> <surname>Zhou</surname></string-name>, <string-name><given-names>Z.</given-names> <surname>Hao</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Hu</surname></string-name>, <string-name><given-names>J.</given-names> <surname>Li</surname></string-name> <etal>et al.</etal></person-group><italic>,</italic> &#x201C;<article-title>Network traffic analysis over clustering-based collective anomaly detection</article-title>,&#x201D; <source>Computer Networks</source>, vol. <volume>205</volume>, no. <issue>3</issue>, pp. <fpage>1087</fpage>&#x2013;<lpage>1098</lpage>, <year>2022</year>.</mixed-citation></ref>
<ref id="ref-12"><label>[12]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>X.</given-names> <surname>Duan</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Fu</surname></string-name> and <string-name><given-names>K.</given-names> <surname>Wang</surname></string-name></person-group>, &#x201C;<article-title>Network traffic anomaly detection method based on a multi-scale residual classifier</article-title>,&#x201D; <source>Computer Communications</source>, vol. <volume>198</volume>, no. <issue>2</issue>, pp. <fpage>206</fpage>&#x2013;<lpage>216</lpage>, <year>2023</year>.</mixed-citation></ref>
<ref id="ref-13"><label>[13]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>X.</given-names> <surname>Li</surname></string-name>, <string-name><given-names>W.</given-names> <surname>Chen</surname></string-name> and <string-name><given-names>Q.</given-names> <surname>Zhang</surname></string-name></person-group>, &#x201C;<article-title>Building auto-encoder intrusion detection system based on random forest feature selection</article-title>,&#x201D; <source>Computers &#x0026; Security</source>, vol. <volume>95</volume>, no. <issue>4</issue>, pp. <fpage>943</fpage>&#x2013;<lpage>961</lpage>, <year>2020</year>.</mixed-citation></ref>
<ref id="ref-14"><label>[14]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>A.</given-names> <surname>Tama</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Lee</surname></string-name> and <string-name><given-names>J.</given-names> <surname>Lee</surname></string-name></person-group>, &#x201C;<article-title>A systematic mapping study and empirical comparison of data-driven intrusion detection techniques in industrial control networks</article-title>,&#x201D; <source>Archives of Computational Methods in Engineering</source>, vol. <volume>29</volume>, no. <issue>5</issue>, pp. <fpage>5353</fpage>&#x2013;<lpage>5380</lpage>, <year>2022</year>.</mixed-citation></ref>
<ref id="ref-15"><label>[15]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>S.</given-names> <surname>Zhang</surname></string-name>, <string-name><given-names>Z.</given-names> <surname>Liu</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Jia</surname></string-name>, <string-name><given-names>J.</given-names> <surname>Ren</surname></string-name> and <string-name><given-names>X.</given-names> <surname>Zhao</surname></string-name></person-group>, &#x201C;<article-title>Network intrusion detection method based on PCA and Bayes algorithm</article-title>,&#x201D; <source>Security and Communication Networks</source>, vol. <volume>18</volume>, no. <issue>4</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>10</lpage>, <year>2018</year>.</mixed-citation></ref>
<ref id="ref-16"><label>[16]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>O.</given-names> <surname>Tushkanova</surname></string-name>, <string-name><given-names>D.</given-names> <surname>Levshun</surname></string-name>, <string-name><given-names>A.</given-names> <surname>Branitskiy</surname></string-name>, <string-name><given-names>E.</given-names> <surname>Fedorchenko</surname></string-name>, <string-name><given-names>E.</given-names> <surname>Novikova</surname></string-name> <etal>et al.</etal></person-group><italic>,</italic> &#x201C;<article-title>Detection of cyberattacks and anomalies in cyber-physical systems: Approaches, data sources, evaluation</article-title>,&#x201D; <source>Algorithms</source>, vol. <volume>16</volume>, no. <issue>2</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>17</lpage>, <year>2023</year>.</mixed-citation></ref>
<ref id="ref-17"><label>[17]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>J.</given-names> <surname>Ling</surname></string-name>, <string-name><given-names>Z.</given-names> <surname>Zhu</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Luo</surname></string-name> and <string-name><given-names>H.</given-names> <surname>Wang</surname></string-name></person-group>, &#x201C;<article-title>An intrusion detection method for industrial control systems based on bidirectional simple recurrent unit</article-title>,&#x201D; <source>Computers &#x0026; Electrical Engineering</source>, vol. <volume>91</volume>, no. <issue>5</issue>, pp. <fpage>7049</fpage>&#x2013;<lpage>7063</lpage>, <year>2021</year>.</mixed-citation></ref>
<ref id="ref-18"><label>[18]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>S.</given-names> <surname>Huda</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Yearwood</surname></string-name> and <string-name><given-names>M.</given-names> <surname>Hassan</surname></string-name></person-group>, &#x201C;<article-title>Securing the operations in SCADA-IoT platform based industrial control system using an ensemble of deep belief networks</article-title>,&#x201D; <source>Applied Soft Computing</source>, vol. <volume>71</volume>, no. <issue>1</issue>, pp. <fpage>66</fpage>&#x2013;<lpage>77</lpage>, <year>2018</year>.</mixed-citation></ref>
<ref id="ref-19"><label>[19]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>J.</given-names> <surname>Nedeljkovic</surname></string-name> and <string-name><given-names>Z.</given-names> <surname>Jakovljevic</surname></string-name></person-group>, &#x201C;<article-title>CNN-based method for the development of cyber-attacks detection algorithms in industrial control systems</article-title>,&#x201D; <source>Computers &#x0026; Security</source>, vol. <volume>114</volume>, no. <issue>3</issue>, pp. <fpage>2585</fpage>&#x2013;<lpage>2598</lpage>, <year>2022</year>.</mixed-citation></ref>
<ref id="ref-20"><label>[20]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>N.</given-names> <surname>Ahmed</surname></string-name>, <string-name><given-names>V.</given-names> <surname>Krishnan</surname></string-name> and <string-name><given-names>S.</given-names> <surname>Foroutan</surname></string-name></person-group>, &#x201C;<article-title>Cyber-physical security analytics for anomalies in transmission protection systems</article-title>,&#x201D; <source>IEEE Transactions on Industry Applications</source>, vol. <volume>55</volume>, no. <issue>6</issue>, pp. <fpage>6313</fpage>&#x2013;<lpage>6323</lpage>, <year>2019</year>.</mixed-citation></ref>
<ref id="ref-21"><label>[21]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>Z.</given-names> <surname>Wang</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Lai</surname></string-name>, <string-name><given-names>Z.</given-names> <surname>Liu</surname></string-name> and <string-name><given-names>J.</given-names> <surname>Liu</surname></string-name></person-group>, &#x201C;<article-title>Explaining the attributes of a deep learning based intrusion detection system for industrial control networks</article-title>,&#x201D; <source>Sensors</source>, vol. <volume>20</volume>, no. <issue>14</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>23</lpage>, <year>2020</year>.</mixed-citation></ref>
<ref id="ref-22"><label>[22]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>S.</given-names> <surname>Li</surname></string-name>, <string-name><given-names>J.</given-names> <surname>Yang</surname></string-name> and <string-name><given-names>J.</given-names> <surname>Wu</surname></string-name></person-group>, &#x201C;<article-title>Web intrusion detection system combined with feature analysis and SVM optimization</article-title>,&#x201D; <source>EURASIP Journal on Wireless Communications and Networking</source>, vol. <volume>33</volume>, no. <issue>1</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>18</lpage>, <year>2020</year>.</mixed-citation></ref>
<ref id="ref-23"><label>[23]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>S.</given-names> <surname>Han</surname></string-name>, <string-name><given-names>Q.</given-names> <surname>Wu</surname></string-name> and <string-name><given-names>Y.</given-names> <surname>Yang</surname></string-name></person-group>, &#x201C;<article-title>Machine learning for internet of things anomaly detection under low-quality data</article-title>,&#x201D; <source>International Journal of Distributed Sensor Networks</source>, vol. <volume>18</volume>, no. <issue>10</issue>, pp. <fpage>717</fpage>&#x2013;<lpage>731</lpage>, <year>2022</year>.</mixed-citation></ref>
<ref id="ref-24"><label>[24]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>A.</given-names> <surname>Khan</surname></string-name>, <string-name><given-names>D.</given-names> <surname>Pi</surname></string-name> and <string-name><given-names>Z.</given-names> <surname>Khan</surname></string-name></person-group>, &#x201C;<article-title>HML-IDS: A hybrid-multilevel anomaly prediction approach for intrusion detection in SCADA systems</article-title>,&#x201D; <source>IEEE Access</source>, vol. <volume>7</volume>, no. <issue>5</issue>, pp. <fpage>89507</fpage>&#x2013;<lpage>89521</lpage>, <year>2019</year>.</mixed-citation></ref>
<ref id="ref-25"><label>[25]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>Z.</given-names> <surname>Liu</surname></string-name>, <string-name><given-names>C.</given-names> <surname>Wang</surname></string-name> and <string-name><given-names>W.</given-names> <surname>Wang</surname></string-name></person-group>, &#x201C;<article-title>Online cyber-attack detection in the industrial control system: A deep reinforcement learning approach</article-title>,&#x201D; <source>Mathematical Problems in Engineering</source>, vol. <volume>22</volume>, no. <issue>7</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>8</lpage>, <year>2022</year>.</mixed-citation></ref>
<ref id="ref-26"><label>[26]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>L.</given-names> <surname>Xu</surname></string-name>, <string-name><given-names>K.</given-names> <surname>Xu</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Qin</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Li</surname></string-name> and <string-name><given-names>X.</given-names> <surname>Huang</surname></string-name></person-group>, &#x201C;<article-title>TGAN-AD: Transformer-based GAN for anomaly detection of time series data</article-title>,&#x201D; <source>Applied Sciences</source>, vol. <volume>12</volume>, no. <issue>16</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>15</lpage>, <year>2022</year>.</mixed-citation></ref>
<ref id="ref-27"><label>[27]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>S.</given-names> <surname>Priyanga</surname></string-name> and <string-name><given-names>K.</given-names> <surname>Krithivasan</surname></string-name></person-group>, &#x201C;<article-title>Detection of cyberattacks in industrial control systems using enhanced principal component analysis and hypergraph-based convolution neural network (EPCA-HG-CNN)</article-title>,&#x201D; <source>IEEE Transactions on Industry Applications</source>, vol. <volume>56</volume>, no. <issue>4</issue>, pp. <fpage>4394</fpage>&#x2013;<lpage>4404</lpage>, <year>2020</year>.</mixed-citation></ref>
<ref id="ref-28"><label>[28]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>F.</given-names> <surname>Ayo</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Folorunso</surname></string-name>, <string-name><given-names>A.</given-names> <surname>Alli</surname></string-name>, <string-name><given-names>A.</given-names> <surname>Adekunle</surname></string-name> and <string-name><given-names>J.</given-names> <surname>Awotunde</surname></string-name></person-group>, &#x201C;<article-title>Network intrusion detection based on deep learning model optimized with rule-based hybrid feature selection</article-title>,&#x201D; <source>Information Security Journal: A Global Perspective</source>, vol. <volume>29</volume>, no. <issue>6</issue>, pp. <fpage>267</fpage>&#x2013;<lpage>283</lpage>, <year>2020</year>.</mixed-citation></ref>
<ref id="ref-29"><label>[29]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>L.</given-names> <surname>Zhu</surname></string-name>, <string-name><given-names>S.</given-names> <surname>He</surname></string-name>, <string-name><given-names>L.</given-names> <surname>Wang</surname></string-name>, <string-name><given-names>W.</given-names> <surname>Zeng</surname></string-name> and <string-name><given-names>J.</given-names> <surname>Yang</surname></string-name></person-group>, &#x201C;<article-title>Feature selection using an improved gravitational search algorithm</article-title>,&#x201D; <source>IEEE Access</source>, vol. <volume>7</volume>, no. <issue>3</issue>, pp. <fpage>114440</fpage>&#x2013;<lpage>114448</lpage>, <year>2019</year>.</mixed-citation></ref>
<ref id="ref-30"><label>[30]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>L.</given-names> <surname>Lv</surname></string-name>, <string-name><given-names>W.</given-names> <surname>Wang</surname></string-name>, <string-name><given-names>Z.</given-names> <surname>Zhang</surname></string-name> and <string-name><given-names>X.</given-names> <surname>Liu</surname></string-name></person-group>, &#x201C;<article-title>A novel intrusion detection system based on an optimal hybrid kernel extreme learning machine</article-title>,&#x201D; <source>Knowledge-Based Systems</source>, vol. <volume>195</volume>, no. <issue>4</issue>, pp. <fpage>648</fpage>&#x2013;<lpage>661</lpage>, <year>2020</year>.</mixed-citation></ref>
<ref id="ref-31"><label>[31]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>Q.</given-names> <surname>Zhang</surname></string-name>, <string-name><given-names>V.</given-names> <surname>Wild</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Filippi</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Flaxman</surname></string-name> and <string-name><given-names>D.</given-names> <surname>Sejdinovic</surname></string-name></person-group>, &#x201C;<article-title>Bayesian kernel two-sample testing</article-title>,&#x201D; <source>Journal of Computational and Graphical Statistics</source>, vol. <volume>31</volume>, no. <issue>4</issue>, pp. <fpage>1164</fpage>&#x2013;<lpage>1176</lpage>, <year>2022</year>.</mixed-citation></ref>
<ref id="ref-32"><label>[32]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>A.</given-names> <surname>Scott</surname></string-name></person-group>, &#x201C;<article-title>A rate of convergence for mixture proportion estimation, with application to learning from noisy labels</article-title>,&#x201D; <source>Artificial Intelligence and Statistics</source>, vol. <volume>15</volume>, no. <issue>3</issue>, pp. <fpage>838</fpage>&#x2013;<lpage>846</lpage>, <year>2015</year>.</mixed-citation></ref>
<ref id="ref-33"><label>[33]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>M.</given-names> <surname>Plessis</surname></string-name> and <string-name><given-names>M.</given-names> <surname>Sugiyama</surname></string-name></person-group>, &#x201C;<article-title>Semi-supervised learning of class balance under class-prior change by distribution matching</article-title>,&#x201D; <source>Neural Networks</source>, vol. <volume>50</volume>, no. <issue>3</issue>, pp. <fpage>110</fpage>&#x2013;<lpage>119</lpage>, <year>2014</year>; <pub-id pub-id-type="pmid">24300548</pub-id></mixed-citation></ref>
<ref id="ref-34"><label>[34]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>M.</given-names> <surname>Plessis</surname></string-name> and <string-name><given-names>M.</given-names> <surname>Sugiyama</surname></string-name></person-group>, &#x201C;<article-title>Class prior estimation from positive and unlabeled data</article-title>,&#x201D; <source>IEICE Transactions on Information and Systems</source>, vol. <volume>97</volume>, no. <issue>5</issue>, pp. <fpage>1358</fpage>&#x2013;<lpage>1362</lpage>, <year>2014</year>.</mixed-citation></ref>
<ref id="ref-35"><label>[35]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>M.</given-names> <surname>Plessis</surname></string-name>, <string-name><given-names>G.</given-names> <surname>Niu</surname></string-name> and <string-name><given-names>M.</given-names> <surname>Sugiyama</surname></string-name></person-group>, &#x201C;<article-title>Class-prior estimation for learning from positive and unlabeled data</article-title>,&#x201D; <source>Machine Learning</source>, vol. <volume>106</volume>, no. <issue>4</issue>, pp. <fpage>463</fpage>&#x2013;<lpage>492</lpage>, <year>2017</year>.</mixed-citation></ref>
<ref id="ref-36"><label>[36]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>M.</given-names> <surname>Lazecka</surname></string-name>, <string-name><given-names>J.</given-names> <surname>Mielniczuk</surname></string-name> and <string-name><given-names>P.</given-names> <surname>Teisseyre</surname></string-name></person-group>, &#x201C;<article-title>Estimating the class prior for positive and unlabeled data via logistic regression</article-title>,&#x201D; <source>Advances in Data Analysis and Classification</source>, vol. <volume>15</volume>, no. <issue>1</issue>, pp. <fpage>1039</fpage>&#x2013;<lpage>1068</lpage>, <year>2021</year>.</mixed-citation></ref>
<ref id="ref-37"><label>[37]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>M.</given-names> <surname>Plessis</surname></string-name>, <string-name><given-names>G.</given-names> <surname>Niu</surname></string-name> and <string-name><given-names>M.</given-names> <surname>Sugiyama</surname></string-name></person-group>, &#x201C;<article-title>Analysis of learning from positive and unlabeled data</article-title>,&#x201D; <source>Advances in Neural Information Processing Systems</source>, vol. <volume>7</volume>, no. <issue>2</issue>, pp. <fpage>703</fpage>&#x2013;<lpage>711</lpage>, <year>2014</year>.</mixed-citation></ref>
<ref id="ref-38"><label>[38]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>H.</given-names> <surname>Bao</surname></string-name>, <string-name><given-names>T.</given-names> <surname>Sakai</surname></string-name>, <string-name><given-names>I.</given-names> <surname>Sato</surname></string-name> and <string-name><given-names>M.</given-names> <surname>Sugiyama</surname></string-name></person-group>, &#x201C;<article-title>Convex formulation of multiple instance learning from positive and unlabeled bags</article-title>,&#x201D; <source>Neural Networks</source>, vol. <volume>105</volume>, no. <issue>7</issue>, pp. <fpage>132</fpage>&#x2013;<lpage>141</lpage>, <year>2018</year>; <pub-id pub-id-type="pmid">29804041</pub-id></mixed-citation></ref>
<ref id="ref-39"><label>[39]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>A.</given-names> <surname>Wolf</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Regnery</surname></string-name>, <string-name><given-names>R.</given-names> <surname>Tarnawski</surname></string-name>, <string-name><given-names>B.</given-names> <surname>Billewicz</surname></string-name>, <string-name><given-names>J.</given-names> <surname>Polanska</surname></string-name> <etal>et al.</etal></person-group><italic>,</italic> &#x201C;<article-title>Weakly supervised learning with positive and unlabeled data for automatic brain tumor segmentation</article-title>,&#x201D; <source>Applied Sciences</source>, vol. <volume>12</volume>, no. <issue>21</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>18</lpage>, <year>2022</year>.</mixed-citation></ref>
<ref id="ref-40"><label>[40]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>L.</given-names> <surname>Zhang</surname></string-name>, <string-name><given-names>F.</given-names> <surname>Zhu</surname></string-name>, <string-name><given-names>X.</given-names> <surname>Ling</surname></string-name> and <string-name><given-names>Q.</given-names> <surname>Liu</surname></string-name></person-group>, &#x201C;<article-title>Best-in-class imitation: Non-negative positive-unlabeled imitation learning from imperfect demonstrations</article-title>,&#x201D; <source>Information Sciences</source>, vol. <volume>601</volume>, no. <issue>2</issue>, pp. <fpage>71</fpage>&#x2013;<lpage>89</lpage>, <year>2022</year>.</mixed-citation></ref>
<ref id="ref-41"><label>[41]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>J.</given-names> <surname>Bekker</surname></string-name> and <string-name><given-names>J.</given-names> <surname>Davis</surname></string-name></person-group>, &#x201C;<article-title>Learning from positive and unlabeled data: A survey</article-title>,&#x201D; <source>Machine Learning</source>, vol. <volume>109</volume>, no. <issue>4</issue>, pp. <fpage>719</fpage>&#x2013;<lpage>760</lpage>, <year>2020</year>.</mixed-citation></ref>
<ref id="ref-42"><label>[42]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>Y.</given-names> <surname>Kim</surname></string-name> and <string-name><given-names>P.</given-names> <surname>Panda</surname></string-name></person-group>, &#x201C;<article-title>Revisiting batch normalization for training low-latency deep spiking neural networks from scratch</article-title>,&#x201D; <source>Frontiers in Neuroscience</source>, vol. <volume>15</volume>, no. <issue>4</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>13</lpage>, <year>2021</year>.</mixed-citation></ref>
<ref id="ref-43"><label>[43]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>Y.</given-names> <surname>Tang</surname></string-name>, <string-name><given-names>L.</given-names> <surname>Gu</surname></string-name> and <string-name><given-names>L.</given-names> <surname>Wang</surname></string-name></person-group>, &#x201C;<article-title>Deep stacking network for intrusion detection</article-title>,&#x201D; <source>Sensors</source>, vol. <volume>22</volume>, no. <issue>1</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>18</lpage>, <year>2022</year>.</mixed-citation></ref>
<ref id="ref-44"><label>[44]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>N.</given-names> <surname>Moustafa</surname></string-name> and <string-name><given-names>J.</given-names> <surname>Slay</surname></string-name></person-group>, &#x201C;<article-title>UNSW-NB15: A comprehensive data set for network intrusion detection systems (UNSW-NB15 network data set)</article-title>,&#x201D; in <conf-name>Conf. Military Communications and Information Systems (MilCIS)</conf-name>, <publisher-loc>Sydney, Australia</publisher-loc>, pp. <fpage>1</fpage>&#x2013;<lpage>6</lpage>, <year>2015</year>.</mixed-citation></ref>
<ref id="ref-45"><label>[45]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>M.</given-names> <surname>Ahmed</surname></string-name>, <string-name><given-names>V.</given-names> <surname>Palleti</surname></string-name> and <string-name><given-names>A.</given-names> <surname>Mathur</surname></string-name></person-group>, &#x201C;<article-title>WADI: A water distribution testbed for research in the design of secure cyber-physical systems</article-title>,&#x201D; in <conf-name>IEEE 3rd Int. Workshop on Cyber-Physical Systems for Smart Water Networks</conf-name>, <publisher-loc>New York, USA</publisher-loc>, pp. <fpage>25</fpage>&#x2013;<lpage>28</lpage>, <year>2017</year>.</mixed-citation></ref>
<ref id="ref-46"><label>[46]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>W.</given-names> <surname>Lin</surname></string-name>, <string-name><given-names>H.</given-names> <surname>Lin</surname></string-name> and <string-name><given-names>P.</given-names> <surname>Wang</surname></string-name></person-group>, &#x201C;<article-title>Using convolutional neural networks to network intrusion detection for cyber threats</article-title>,&#x201D; in <conf-name>IEEE Int. Conf. on Applied System Invention</conf-name>, <publisher-loc>Seoul, South Korea</publisher-loc>, pp. <fpage>1107</fpage>&#x2013;<lpage>1110</lpage>, <year>2018</year>.</mixed-citation></ref>
<ref id="ref-47"><label>[47]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>L.</given-names> <surname>Yin</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Zhu</surname></string-name> and <string-name><given-names>J.</given-names> <surname>Fei</surname></string-name></person-group>, &#x201C;<article-title>A deep learning approach for intrusion detection using recurrent neural networks</article-title>,&#x201D; <source>IEEE Access</source>, vol. <volume>5</volume>, no. <issue>3</issue>, pp. <fpage>21954</fpage>&#x2013;<lpage>21961</lpage>, <year>2017</year>.</mixed-citation></ref>
<ref id="ref-48"><label>[48]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><collab>IEC 60870-5-104 Intrusion Detection Dataset</collab></person-group>. [Online]. Available: <ext-link ext-link-type="uri" xlink:href="https://zenodo.org/record/7108614">https://zenodo.org/record/7108614</ext-link></mixed-citation></ref>
<ref id="ref-49"><label>[49]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><collab>DNP3 Intrusion Detection Dataset</collab></person-group>. [Online]. Available: <ext-link ext-link-type="uri" xlink:href="https://zenodo.org/record/7348493">https://zenodo.org/record/7348493</ext-link></mixed-citation></ref>
<ref id="ref-50"><label>[50]</label><mixed-citation publication-type="other"><ext-link ext-link-type="uri" xlink:href="https://standards.ieee.org/ieee/C37.118.1/4902/">https://standards.ieee.org/ieee/C37.118.1/4902/</ext-link></mixed-citation></ref>
<ref id="ref-51"><label>[51]</label><mixed-citation publication-type="other"><ext-link ext-link-type="uri" xlink:href="https://iec61850.dvl.iec.ch/">https://iec61850.dvl.iec.ch/</ext-link></mixed-citation></ref>
<ref id="ref-52"><label>[52]</label><mixed-citation publication-type="other"><ext-link ext-link-type="uri" xlink:href="https://www.dnp.org/About/Overview-of-DNP3-Protocol">https://www.dnp.org/About/Overview-of-DNP3-Protocol</ext-link></mixed-citation></ref>
<ref id="ref-53"><label>[53]</label><mixed-citation publication-type="other"><ext-link ext-link-type="uri" xlink:href="https://www.ipcomm.de/protocol/IEC104/en/sheet.html">https://www.ipcomm.de/protocol/IEC104/en/sheet.html</ext-link></mixed-citation></ref>
</ref-list>
</back></article>