<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1 20151215//EN" "http://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xml:lang="en" article-type="research-article" dtd-version="1.1">
<front>
<journal-meta>
<journal-id journal-id-type="pmc">CMES</journal-id>
<journal-id journal-id-type="nlm-ta">CMES</journal-id>
<journal-id journal-id-type="publisher-id">CMES</journal-id>
<journal-title-group>
<journal-title>Computer Modeling in Engineering &#x0026; Sciences</journal-title>
</journal-title-group>
<issn pub-type="epub">1526-1506</issn>
<issn pub-type="ppub">1526-1492</issn>
<publisher>
<publisher-name>Tech Science Press</publisher-name>
<publisher-loc>USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">71669</article-id>
<article-id pub-id-type="doi">10.32604/cmes.2026.071669</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Article</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Human Activity Recognition Using Weighted Average Ensemble by Selected Deep Learning Models</article-title>
<alt-title alt-title-type="left-running-head">Human Activity Recognition Using Weighted Average Ensemble by Selected Deep Learning Models</alt-title>
<alt-title alt-title-type="right-running-head">Human Activity Recognition Using Weighted Average Ensemble by Selected Deep Learning Models</alt-title>
</title-group>
<contrib-group>
<contrib id="author-1" contrib-type="author">
<name name-style="western"><surname>Akhtar</surname><given-names>Waseem</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><xref ref-type="aff" rid="aff-2">2</xref></contrib>
<contrib id="author-2" contrib-type="author">
<name name-style="western"><surname>Ilyas</surname><given-names>Mahwish</given-names></name><xref ref-type="aff" rid="aff-3">3</xref></contrib>
<contrib id="author-3" contrib-type="author" corresp="yes">
<name name-style="western"><surname>Aziz</surname><given-names>Romana</given-names></name><xref ref-type="aff" rid="aff-4">4</xref><email>raabdulaziz@pnu.edu.sa</email></contrib>
<contrib id="author-4" contrib-type="author">
<name name-style="western"><surname>Aldehim</surname><given-names>Ghadah</given-names></name><xref ref-type="aff" rid="aff-4">4</xref></contrib>
<contrib id="author-5" contrib-type="author">
<name name-style="western"><surname>Iqbal</surname><given-names>Tassawar</given-names></name><xref ref-type="aff" rid="aff-5">5</xref></contrib>
<contrib id="author-6" contrib-type="author">
<name name-style="western"><surname>Ramzan</surname><given-names>Muhammad</given-names></name><xref ref-type="aff" rid="aff-6">6</xref></contrib>
<aff id="aff-1"><label>1</label><institution>Department of Computer Science, University of Wah</institution>, <addr-line>Wah Cantt</addr-line>, <country>Pakistan</country></aff>
<aff id="aff-2"><label>2</label><institution>Department of Computer Science, National Excellence Institute</institution>, <addr-line>Islamabad</addr-line>, <country>Pakistan</country></aff>
<aff id="aff-3"><label>3</label><institution>Department of Computer Science, University of Rasul</institution>, <addr-line>Mandi Bahaud Din</addr-line>, <country>Pakistan</country></aff>
<aff id="aff-4"><label>4</label><institution>Department of Information Systems, College of Computer and Information Sciences, Princess Nourah bint Abdulrahman University</institution>, <addr-line>P.O. Box 84428, Riyadh</addr-line>, <country>Saudi Arabia</country></aff>
<aff id="aff-5"><label>5</label><institution>Department of Computer Science, COMSATS University Islamabad, Wah Campus</institution>, <addr-line>Wah Cantt</addr-line>, <country>Pakistan</country></aff>
<aff id="aff-6"><label>6</label><institution>Department of Software Engineering, Faculty of Computing and Information Technology, University of Sargodha</institution>, <addr-line>Sargodha</addr-line>, <country>Pakistan</country></aff>
</contrib-group>
<author-notes>
<corresp id="cor1"><label>&#x002A;</label>Corresponding Author: Romana Aziz. Email: <email>raabdulaziz@pnu.edu.sa</email></corresp>
</author-notes>
<pub-date date-type="collection" publication-format="electronic">
<year>2026</year>
</pub-date>
<pub-date date-type="pub" publication-format="electronic">
<day>26</day><month>2</month><year>2026</year>
</pub-date>
<volume>146</volume>
<issue>2</issue>
<elocation-id>34</elocation-id>
<history>
<date date-type="received">
<day>09</day>
<month>08</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>14</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>&#x00A9; 2026 The Authors.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Published by Tech Science Press.</copyright-holder>
<license xlink:href="https://creativecommons.org/licenses/by/4.0/">
<license-p>This work is licensed under a <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="TSP_CMES_71669.pdf"></self-uri>
<abstract>
<p>Human Activity Recognition (HAR) is a novel area for computer vision. It has a great impact on healthcare, smart environments, and surveillance because it is able to automatically detect human behavior. It plays a vital role in many applications, such as smart homes, healthcare, human-computer interaction, sports analysis, and especially, intelligent surveillance. In this paper, we propose a robust and efficient HAR system by leveraging deep learning paradigms, including pre-trained models, CNN architectures, and their average-weighted fusion. However, due to the diversity of human actions and various environmental influences, as well as a lack of data and resources, achieving high recognition accuracy remains elusive. In this work, a weighted average ensemble technique is employed to fuse three deep learning models: EfficientNet, ResNet50, and a custom CNN. The results of this study indicate that using a weighted average ensemble strategy for developing more effective HAR models may be a promising idea for detection and classification of human activities. Experiments on the benchmark dataset showed that the proposed weighted ensemble approach outperformed existing approaches in terms of accuracy and other key performance measures. The combined average-weighted ensemble of pre-trained and CNN models obtained an accuracy of 98%, compared to 97%, 96%, and 95% for the customized CNN, EfficientNet, and ResNet50 models, respectively.</p>
</abstract>
<kwd-group kwd-group-type="author">
<kwd>Artificial intelligence</kwd>
<kwd>computer vision</kwd>
<kwd>deep learning</kwd>
<kwd>recognition</kwd>
<kwd>human activity</kwd>
<kwd>classification</kwd>
<kwd>image processing</kwd>
</kwd-group>
<funding-group>
<award-group id="awg1">
<funding-source>Princess Nourah bint Abdulrahman University Researchers</funding-source>
<award-id>PNURSP2026R765</award-id>
</award-group>
</funding-group>
</article-meta>
</front>
<body>
<sec id="s1">
<label>1</label>
<title>Introduction</title>
<p>HAR is an emerging area of artificial intelligence that recognises and classifies human actions utilising data from sensors, cameras, and wearable devices. Its applications include healthcare, smart homes, surveillance, violent activity detection [<xref ref-type="bibr" rid="ref-1">1</xref>], academia, driver behaviors, and fitness monitoring. HAR systems analyse motion patterns to detect different types of activities. HAR activities are classified as either normal or abnormal. Normal activities are those which are described as regular or daily, for instance, walking, sitting, standing, running, climbing stairs, and so on. Such activities illustrate standard human motion patterns and are widely utilized to train models for behavior understanding. On the other hand, abnormal activities are unpredictable or anomalous events, such as falling, fighting, elements of sudden collapse, or even some strange dance movements, which are culturally consistent and could be regarded as anomalies but of a different nature.</p>
<p>In surveillance and security, HAR also helps identify suspicious behaviors and enhance security protocols. It also allows for more natural interfaces in interactive media and human-computer interaction. Growing demands for automation, personalized services, and real-time monitoring have made HAR a key research area that demands greater accuracy, adaptability, and efficiency.</p>
<p>In HAR, accurately identifying and categorising complex human activities, such as fights, snatching, and running, remains difficult, especially when each activity has multiple subtypes with significant intra-class variation. Most existing HAR systems struggle to distinguish visually similar actions, handle occlusions, and remain robust in dynamic, real-world environments with multiple viewpoints.</p>
<p>Deep learning and pre-trained methods have shown significant success in feature extraction, and Convolutional Neural Networks (CNNs) have been effective in acquiring spatial patterns for activity identification [<xref ref-type="bibr" rid="ref-2">2</xref>]. Beyond the limits of conventional models, CNNs are a powerful tool for analyzing spatial and image data. Additionally, CNNs preserve spatial relationships, ensuring coherence even when objects are repositioned. They are better suited for image classification and object detection tasks because they can handle noise and slight data variations, learn deeper features in deeper networks, and perform well on large and complex datasets. Detection of these challenging activities creates problems because the discussed activities are heterogeneous and complex [<xref ref-type="bibr" rid="ref-3">3</xref>].</p>
<p>To overcome these challenges, this study first examines the performance of CNN-based architectures and pre-trained models separately, then presents an average-weighted fusion method that capitalizes on their respective advantages. The fusion strategy attains higher recognition accuracy by exploiting the spatial learning capabilities of CNNs and enhanced feature extraction of pre-trained models.</p>
<p>The main objective of this research is to develop a deep learning-based framework for classifying human activities using a labelled dataset comprising three primary activity classes: fighting, snatching, and running, each with five distinct subtypes. This research aims to create a robust human activity detection system capable of detecting safety-critical events. It explores and fine-tunes advanced deep learning models to assess their performance in precisely classifying three different classes, using five variations of each class. The majority of the literature focuses on relatively simple actions such as walking, jogging, clapping, and wrestling, which limits their applicability to real-world, high-risk circumstances. There are few models specifically designed to identify complex, safety-critical actions, such as violent or abnormal behaviour (e.g., cellphone theft or fighting), which are very important for surveillance and security applications.</p>
<p>To address this limitation, we selected a benchmark dataset that encompasses cellphone snatching, fighting, and running activities. Deep learning was also used with CNNs and three pre-trained models, which were combined using a Weighted Average Ensemble to achieve robust and accurate classification of critical actions. The results of experiments demonstrate that the proposed fusion scheme has better performance than stand-alone models, implying the need to consider a holistic view in improving HAR performance. The study focuses on abnormal and high-risk activities (e.g., cellphone snatching, fighting, and running) that are rarely addressed in HAR research. Additionally, it utilized the Weighted Average Ensemble of CNNs with three pre-trained models, which improves generalization and reliability for surveillance-oriented HAR.</p>
<p>The main research contributions include:
<list list-type="bullet">
<list-item>
<p>We investigated many cutting-edge pre-trained deep learning models for classifying human activities, evaluating their strengths and limitations in extracting useful information.</p></list-item>
<list-item>
<p>We proposed the CNN model for the classification and detection of given types of activities in the used dataset.</p></list-item>
<list-item>
<p>To improve classification performance, we implemented an average-weighted fusion of pre-trained and CNN models, which combines functional characteristics from both architectures.</p></list-item>
<list-item>
<p>We developed efficient image preprocessing methods, such as cropping and resizing, to standardise input dimensions and maintain motion-related information.</p></list-item>
<list-item>
<p>An effective preprocessing technique has been proposed to improve feature visibility and recognition accuracy in deep learning models by enhancing the low-resolution and low-contrast images.</p></list-item>
</list></p>
<p>The remaining sections of the paper are organized as follows: <xref ref-type="sec" rid="s2">Section 2</xref> presents a literature review; <xref ref-type="sec" rid="s3">Section 3</xref> briefly describes the proposed methodology; <xref ref-type="sec" rid="s4">Section 4</xref> discusses the results and their implications; <xref ref-type="sec" rid="s5">Section 5</xref> highlights the discussion and finally, <xref ref-type="sec" rid="s6">Section 6</xref> provides the conclusion.</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Literature Review</title>
<p>This section examines current studies on human activity recognition that employed machine learning, deep learning, and Internet of Things (IoT)-based techniques. HAR uses information gathered from sensors, cameras and other sources to categorise and recognise human activities. This section summarizes some of the significant recent contributions to HAR across different approaches, datasets, results, and challenges. The work aimed to develop the model and report on its accuracy using a test population of individuals aged 19 to 48 years. Various procedures were applied to clean and process the data, as well as to identify different patterns of human activities. In terms of model learning, it performed better than other types of models, including machine learning models, with the highest accuracy rate of 88.6% among machine learning models. It was also the best deep learning model, achieving an accuracy of 84.5% on Gated Recurrent Unit (GRU) [<xref ref-type="bibr" rid="ref-4">4</xref>].</p>
<p>The purpose of this work [<xref ref-type="bibr" rid="ref-5">5</xref>] was to develop an activity recognition method, based on deep learning, by extracting features from the raw input data. Two Hybrid Learning Algorithms (HLA) are proposed to identify sequential and spatial patterns using CNN and Recurrent Neural Networks (RNNs). A hybrid optimisation method, combining the Whale Optimisation Algorithm and the Grey Wolf Optimiser, enhanced feature selection. This evolved the entire optimization process.</p>
<p>Ref. [<xref ref-type="bibr" rid="ref-6">6</xref>] proposed a dynamic method for HAR that combines a deep bidirectional long short-term memory (BiLSTM) network with a pre-trained feature extraction technique. Initially, Convolutional Neural Network models, particularly MobileNetV2, were used to extract deep-level characteristics from video frames. These extracted features were subsequently analyzed using an optimized Deep BiLSTM network, which effectively captured dependencies and increased prediction accuracy. During the testing phase, an iterative fine-tuning approach was employed to update high-level parameters and enhance flexibility in response to various conditions. The model&#x2019;s performance was assessed using three benchmark datasets: UCF Sport [<xref ref-type="bibr" rid="ref-7">7</xref>], UCF11 [<xref ref-type="bibr" rid="ref-8">8</xref>], and JHMDB [<xref ref-type="bibr" rid="ref-9">9</xref>], with accuracies of 93.3%, 99.20%, and 76.30%, respectively.</p>
<p>This work [<xref ref-type="bibr" rid="ref-10">10</xref>] used XGBoost to build an activity recognition system. Initially, a Hue, Saturation, Value (HSV) colour transformation was used to improve video frame clarity, followed by noise reduction. Silhouettes were extracted using multiple object tracking (MOT) and Video Inference for Body Estimation (VIBE) methods. For feature extraction, Textone maps and Features from the Accelerated Segment Test (FAST) [<xref ref-type="bibr" rid="ref-11">11</xref>] were used. Independent Component Analysis (ICA) was then used to determine the most informative components for feature discrimination. Finally, the composed features were loaded into XGBoost for categorization as relevant human activities. Experiments conducted using the Stony Brook University (SBU) [<xref ref-type="bibr" rid="ref-12">12</xref>] Interaction dataset achieved a recognition rate of 91%. The AI-based behavior biometrics framework used a dynamic attention fusion unit and a temporal-spatial fusion method to improve the detection of human activity in surveillance systems. The system records temporal, geographical, and behavioral dependencies in video data streams and extracts important features employing a lightweight EfficientNetB0 backbone. The system outperformed state-of-the-art techniques, showing high accuracies of 80.342%, 98.987%, 98.734%, and 98.927%, respectively, using four publicly accessible datasets (HMDB51 [<xref ref-type="bibr" rid="ref-13">13</xref>], UCF50 [<xref ref-type="bibr" rid="ref-14">14</xref>], UCF101 [<xref ref-type="bibr" rid="ref-15">15</xref>], and YouTube Action [<xref ref-type="bibr" rid="ref-8">8</xref>]).</p>
<p>Some artificial intelligence models have been created to recognise human activity but have produced poor results in HAR in real-world, long-term settings because they extract insufficient temporal and spatial features [<xref ref-type="bibr" rid="ref-16">16</xref>]. RNN and the CNN model are applied to examine the best efficiency of daily activities, such as walking, and to attain the best figure of accuracy [<xref ref-type="bibr" rid="ref-17">17</xref>]. One feature of the Wi-Fi signal is Channel State Information (CSI), which identifies various human activities. Human activity data was collected using a Raspberry Pi 4, and techniques including 1D-CNN, LSTM, and Bi-directional LSTM were applied to achieve 95% accuracy [<xref ref-type="bibr" rid="ref-18">18</xref>]. Similarly, 100% success is achieved in recognising human posture and action using a CNN model, as compared to the Microsoft Research (MSR) and Kinect Activity Recognition Dataset (KARD) [<xref ref-type="bibr" rid="ref-19">19</xref>]. A novel acceleration-based HAR method is employed in conjunction with the CNN model, utilising a large dataset of 31,688 samples. Here, eight distinctive human activities were examined and achieved 93.8% accuracy [<xref ref-type="bibr" rid="ref-20">20</xref>]. Furthermore, an advanced system for human image tracking (HIT) was developed to detect both simple and complex user movements. A smart camera was used to collect the dataset for the model of a region-based CNN. As a result, 98.53% accuracy was achieved through the proposed model [<xref ref-type="bibr" rid="ref-21">21</xref>]. Another deep learning model, CNN-LSTM, was identified to recognise normal individuals and straightforward actions, including driving and eating. Here, CNN was used to extract features from sensor data, and an LSTM was used to obtain long-term dependencies between two events. The proposed model achieved a maximum recognition rate of 95.8% [<xref ref-type="bibr" rid="ref-22">22</xref>]. 
In some places, a combination of three models was used, including LSTM, CNN, and CNN-LSTM, to organize human activities. These models were trained on a dataset comprising six classes of activities, collected from 36 people. Activities included walking, sitting, jogging, standing, and climbing stairs. The created model was evaluated using the TensorFlow framework, and the testing accuracy was 94.51% for the CNN, 96.61% for the LSTM, and 97.76% for the CNN-LSTM. The accuracy was checked by a precision of 97.75%, a recall of 97.77%, F1-Measure 97.76%, and an area under the curve of 100% for the CNN-LSTM model [<xref ref-type="bibr" rid="ref-23">23</xref>].</p>
<p>The model&#x2019;s effectiveness in personalised and adaptive learning environments was validated through experimental results on datasets, including the recently created Human Activities and Postural Transitions (HAPT) dataset [<xref ref-type="bibr" rid="ref-24">24</xref>], which achieved better classification accuracy (97.84% for transitional activities and 99.04% for dynamic activities) than state-of-the-art methods.</p>
<p>The research introduced an ensemble deep learning approach for capturing temporal correlations in sensory data. The authors proposed a hybrid LSTM-GRU architecture with dropout and batch normalization and evaluated it on the UCI-HAR and Wireless Sensor Data Mining (WISDM) [<xref ref-type="bibr" rid="ref-25">25</xref>] datasets.</p>
<p>The method was developed to promote a culture of fluid, scalable, and ubiquitous learning. The results on HAPT outperformed all the state-of-the-art methods (97.84% for transitional activities and 99.04% for dynamic activities), which demonstrates its effectiveness in personalized and adaptive learning applications [<xref ref-type="bibr" rid="ref-26">26</xref>]. Another study used a marker-based motion capturing system (MBMCS) and a marker-less motion capturing system (MLMCS) and compared them for human gait analysis and activity recognition. The results demonstrate that MBMCS achieves up to 99.3% accuracy for person recognition and 98.1% for activity recognition using K-Nearest Neighbors, while MLMCS provides a cost-effective solution with comparable performance and the potential for further improvement through advanced feature extraction [<xref ref-type="bibr" rid="ref-27">27</xref>]. The existing work is summarized in <xref ref-type="table" rid="table-1">Table 1</xref>.</p>
<table-wrap id="table-1">
<label>Table 1</label>
<caption>
<title>Summary of the existing research work for HAR.</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/> </colgroup>
<thead>
<tr>
<th>Ref. No.</th>
<th>Year</th>
<th>Dataset Used</th>
<th>Method/Model</th>
<th>Proposed Technique/Contribution</th>
<th>Performance Measures</th>
</tr>
</thead>
<tbody>
<tr>
<td>[<xref ref-type="bibr" rid="ref-2">2</xref>]</td>
<td>2022</td>
<td>Data created</td>
<td>CNN on encrypted in-app mobile data</td>
<td>Introduces CNN approach using encrypted app telemetry for activity detection.</td>
<td>Accuracy 92%</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-3">3</xref>]</td>
<td>2023</td>
<td>WISDM, PAMAP2, and KU-HAR</td>
<td>Ensemble deep learning</td>
<td>Ensemble model to handle heterogeneous activity complexity.</td>
<td>Accuracy of 99.98%, 99.64%, and 99.98% for simple, complex, and heterogeneous</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-4">4</xref>]</td>
<td>2024</td>
<td>Twelve Human Activities (Self-created Dataset)</td>
<td>Hybrid DL (likely CNN &#x002B; LSTM or CNN &#x002B; Transformer)</td>
<td>Hybrid approaches for HAR and detecting postural transitions using mobile sensors.</td>
<td>Accuracy using (Machine Learning): 88.6%, Accuracy (Deep Learning): 84.50%</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-5">5</xref>]</td>
<td>2025</td>
<td>Time-series data collected from wearable sensors and smartphones.</td>
<td>Hybrid DL &#x002B; metaheuristic optimization (GWO&#x2013;WOA)</td>
<td>Combines hybrid DL with Grey Wolf Optimizer &#x002B; Whale Optimization for hyperparameter/feature selection.</td>
<td>Accuracy 95%</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-6">6</xref>]</td>
<td>2024</td>
<td>UCF11, UCF Sport, and JHMDB</td>
<td>Bi-LSTM &#x002B; transfer-learning feature extractor</td>
<td>Uses transfer learning to extract features, then Bi-LSTM for temporal modeling of dynamic activities.</td>
<td>Accuracy: 99.20% for UCF11, 93.3% for UCF Sports, and 76.30% for JHMDB</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-7">7</xref>]</td>
<td>2024</td>
<td>The-Wild Smart Watch Activity Dataset Huawei Locomotion, Extra Sensory Dataset</td>
<td>Deep Polynomial Neural Network (novel NN variant)</td>
<td>IoT-enabled HAR &#x002B; localization using polynomial neural nets.</td>
<td>Accuracy obtained by Deep Polynomial Neural Network (DPNN) and Multilayer Perceptron (MLP) on three datasets are: (1) 95% and 86% (2) 97% and 91% (3) 92% and 90%</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-8">8</xref>]</td>
<td>2024</td>
<td>SBU Interaction dataset</td>
<td>ICA &#x002B; XGBoost (classical &#x002B; ML)</td>
<td>Combine ICA for feature extraction with XGBoost for classification.</td>
<td>Recognition 91%</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-9">9</xref>]</td>
<td>2024</td>
<td>UCF101 and HMDB51 datasets</td>
<td>AI-driven behavior biometrics (likely CNN &#x002B; temporal modules)</td>
<td>Behavior-biometric centric HAR framework for surveillance robustness.</td>
<td>Accuracy: 98.52% for UCF101, 84.25% for HMDB51</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-16">16</xref>]</td>
<td>2022</td>
<td>New self-created dataset</td>
<td>Hybrid deep learning</td>
<td>Hybrid architecture combining spatial &#x002B; temporal models.</td>
<td>Accuracy 90.89%</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-19">19</xref>]</td>
<td>2023</td>
<td>Kinect Activity Recognition dataset (KARD) Microsoft Research (MSR)</td>
<td>DL on 3D posture data</td>
<td>Uses 3D posture data with DL to classify activity types.</td>
<td>Not reported</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-20">20</xref>]</td>
<td>2016</td>
<td>Time-series sequences of 3D joint coordinates</td>
<td>DL for single-accelerometer input</td>
<td>Demonstrated DL can work with single-sensor input for HAR.</td>
<td>Average accuracy 93.8%</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-21">21</xref>]</td>
<td>2022</td>
<td>New dataset created using a smartphone camera</td>
<td>Deep learning (image-based HAR)</td>
<td>Novel pipeline/architecture for image-based HAR.</td>
<td>Accuracy achieved 98.53%</td>
</tr>
<tr>
<td>[<xref ref-type="bibr" rid="ref-25">25</xref>]</td>
<td>2025</td>
<td>Newly produced HAPT (Human Activities and Postural Transitions)</td>
<td>Ensemble deep learning (transition-aware)</td>
<td>Ensemble framework that is transition-aware (handles transitions between activities).</td>
<td>99.04% for dynamic human activities. 97.84% for transitional activities</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3">
<label>3</label>
<title>Proposed Research Methodology</title>
<p>The proposed methodology has been discussed in this section. The diagram of the proposed framework is shown in <xref ref-type="fig" rid="fig-1">Fig. 1</xref>. The basic steps of the proposed method are given below:
<list list-type="simple">
<list-item><label>(a)</label><p>Data set selection</p>
</list-item>
<list-item><label>(b)</label><p>Pre-processing</p></list-item>
<list-item><label>(c)</label><p>Feature extraction and classification</p></list-item>
<list-item><label>(d)</label><p>Performance measures</p></list-item>
</list></p>
<fig id="fig-1">
<label>Figure 1</label>
<caption>
<title>Proposed framework for the weighted ensemble model using selected deep learning methods.</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_71669-fig-1.tif"/>
</fig>
<sec id="s3_1">
<label>3.1</label>
<title>Data Acquisition</title>
<p>The dataset serves as a standard for recognising human behaviours, with a focus on violence detection, including frames for cell phone snatching, fighting, and running.</p>
<p>A brief description of the selected dataset is provided in this section, including sample images of each class with varying variations, as shown in <xref ref-type="fig" rid="fig-2">Fig. 2</xref>.</p>
<fig id="fig-2">
<label>Figure 2</label>
<caption>
<title>Dataset samples of three classes with five different forms [<xref ref-type="bibr" rid="ref-28">28</xref>].</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_71669-fig-2.tif"/>
</fig>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Pre-Processing</title>
<p>In the pre-processing, raw data were cleaned and organized so that it would be easier for the proposed model to understand [<xref ref-type="bibr" rid="ref-29">29</xref>]. It makes it easier to identify and correct errors, fill in missing portions, and improve the overall quality of the selected data. Pre-processing steps focused on the selected datasets. The selected dataset of human activity recognition consists of the following tasks: running, fighting, and snatching. Cropping removes unnecessary parts of the image frame, such as the background, so the focus remains on the person acting. This allows the model to focus only on essential activities. After cropping, all frames are resized to a fixed size. Since varying images frequently have varying sizes, scaling ensures the model analyzes each input consistently, thereby reducing memory use. Adjusting the resolution entails either increasing the clarity for greater detail or decreasing it to accelerate training. High-resolution displays can capture finer information (such as hand movements in fighting). In contrast, low-resolution displays can capture larger movements (such as running) while allowing the model to run faster. The following steps have been utilized for the pre-processing:
<list list-type="bullet">
<list-item>
<p>To focus on the region of interest (ROI) where human activity occurs, cropping is also applied to some images where it is needed to remove unnecessary background elements.</p></list-item>
<list-item>
<p>Images in the given dataset were in different sizes and were resized according to the proposed CNN model and other pretrained models.</p></list-item>
<list-item>
<p>A few images in the given dataset are of low resolution and contrast, so histogram processing is used to improve the brightness [<xref ref-type="bibr" rid="ref-30">30</xref>] and contrast of the image data. It modifies the pixel values to enhance regions in dark or light ranges for better visibility. This helps make human actions within the frames clearly distinguishable. The transformation using histogram equalization is given as</p></list-item>
</list>
<disp-formula id="eqn-1"><label>(1)</label><mml:math id="mml-eqn-1" display="block"><mml:msub><mml:mi>r</mml:mi><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>T</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>i</mml:mi><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:msubsup><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup><mml:mfrac><mml:msub><mml:mi>n</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mi>n</mml:mi></mml:mfrac></mml:math></disp-formula>where <inline-formula id="ieqn-1"><mml:math id="mml-ieqn-1"><mml:msub><mml:mi>i</mml:mi><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the input intensity level, <inline-formula id="ieqn-2"><mml:math id="mml-ieqn-2"><mml:msub><mml:mi>r</mml:mi><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the output pixel value, <italic>n</italic> is the total number of pixels, and <inline-formula id="ieqn-3"><mml:math id="mml-ieqn-3"><mml:msub><mml:mi>n</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the number of pixels having input intensity <inline-formula id="ieqn-4"><mml:math id="mml-ieqn-4"><mml:msub><mml:mi>i</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>.</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Feature Extraction and Classification</title>
<p>We employed the pre-trained models EfficientNet [<xref ref-type="bibr" rid="ref-31">31</xref>], ResNet50 [<xref ref-type="bibr" rid="ref-32">32</xref>], and the proposed CNN [<xref ref-type="bibr" rid="ref-33">33</xref>] model for feature extraction and classification.</p>
<sec id="s3_3_1">
<label>3.3.1</label>
<title>Fine-Tune Efficient Net</title>
<p>EfficientNet is a collection of CNNs designed to improve accuracy and efficiency through compound scaling. It carefully balances depth, width, and resolution to improve performance while retaining computational efficiency. The architecture enhances feature representation with Mobile Inverted Bottleneck Convolution layers and Squeeze-and-Excitation blocks. EfficientNet achieves higher accuracy with fewer parameters, making it a good choice for a variety of computer vision applications, especially Human Activity recognition.</p>
</sec>
<sec id="s3_3_2">
<label>3.3.2</label>
<title>Fine-Tune Residual Network (ResNet50)</title>
<p>ResNet is a deep network with 50 layers designed to address the challenges of training deep networks, introducing residual learning by using residual blocks to mitigate the vanishing gradient problem and enable the training of deeper networks. Each residual block comprises skip connections that avoid one or more layers, permitting the network to learn residual mappings. This architecture enables the network to learn an identity function, which helps in preserving the gradient flow during backpropagation and improves training efficiency. The symbols used in the equation are explained in <xref ref-type="table" rid="table-2">Table 2</xref>.</p>
<table-wrap id="table-2">
<label>Table 2</label>
<caption>
<title>Symbols used in the equations.</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/> </colgroup>
<thead>
<tr>
<th>Symbols</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>I(<italic>x</italic>, <italic>y</italic>)</td>
<td>Pixel intensity at coordinates <inline-formula id="ieqn-5"><mml:math id="mml-ieqn-5"><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> in the original image</td>
</tr>
<tr>
<td>Z</td>
<td>Zooming factor</td>
</tr>
<tr>
<td><inline-formula id="ieqn-6"><mml:math id="mml-ieqn-6"><mml:mrow><mml:mi mathvariant="normal">&#x0398;</mml:mi></mml:mrow></mml:math></inline-formula></td>
<td>Rotation angle</td>
</tr>
<tr>
<td><inline-formula id="ieqn-7"><mml:math id="mml-ieqn-7"><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula></td>
<td>Center of the image for rotation and zooming</td>
</tr>
<tr>
<td><inline-formula id="ieqn-8"><mml:math id="mml-ieqn-8"><mml:mrow><mml:mover><mml:mrow><mml:mtext>I</mml:mtext></mml:mrow><mml:mo>&#x00B4;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mover><mml:mrow><mml:mtext>x</mml:mtext></mml:mrow><mml:mo>&#x00B4;</mml:mo></mml:mover></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:mover><mml:mrow><mml:mtext>y</mml:mtext></mml:mrow><mml:mo>&#x00B4;</mml:mo></mml:mover></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula></td>
<td>Augmented image</td>
</tr>
<tr>
<td><inline-formula id="ieqn-9"><mml:math id="mml-ieqn-9"><mml:mrow><mml:mtext>W</mml:mtext></mml:mrow></mml:math></inline-formula></td>
<td>Width of image</td>
</tr>
<tr>
<td><inline-formula id="ieqn-10"><mml:math id="mml-ieqn-10"><mml:mrow><mml:mtext>A&#xA0;</mml:mtext></mml:mrow></mml:math></inline-formula></td>
<td>Validation split ratio</td>
</tr>
<tr>
<td><inline-formula id="ieqn-11"><mml:math id="mml-ieqn-11"><mml:mrow><mml:mtext>N</mml:mtext></mml:mrow></mml:math></inline-formula></td>
<td>Total number of images</td>
</tr>
<tr>
<td><inline-formula id="ieqn-12"><mml:math id="mml-ieqn-12"><mml:msub><mml:mrow><mml:mtext>N</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mtext>t</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula></td>
<td>Number of training images</td>
</tr>
<tr>
<td><inline-formula id="ieqn-13"><mml:math id="mml-ieqn-13"><mml:msub><mml:mrow><mml:mtext>N</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mtext>v</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula></td>
<td>Number of validation images</td>
</tr>
<tr>
<td><inline-formula id="ieqn-14"><mml:math id="mml-ieqn-14"><mml:mrow><mml:mtext>F</mml:mtext></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mtext>x</mml:mtext></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula></td>
<td>Applied the objective function</td>
</tr>
<tr>
<td><inline-formula id="ieqn-15"><mml:math id="mml-ieqn-15"><mml:mrow><mml:mtext>x</mml:mtext></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:mtext>X</mml:mtext></mml:mrow></mml:math></inline-formula></td>
<td>Input feature map to the residual block</td>
</tr>
<tr>
<td><inline-formula id="ieqn-16"><mml:math id="mml-ieqn-16"><mml:msub><mml:mrow><mml:mtext>W</mml:mtext></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mtext>&#xA0;and&#xA0;</mml:mtext></mml:mrow><mml:msub><mml:mrow><mml:mtext>W</mml:mtext></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula></td>
<td>Weights for the convolutional layers for applied models</td>
</tr>
<tr>
<td><inline-formula id="ieqn-17"><mml:math id="mml-ieqn-17"><mml:msub><mml:mrow><mml:mtext>b</mml:mtext></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mtext>&#xA0;and&#xA0;</mml:mtext></mml:mrow><mml:msub><mml:mrow><mml:mtext>b</mml:mtext></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula></td>
<td>Biases for the layers of applied models</td>
</tr>
<tr>
<td><inline-formula id="ieqn-18"><mml:math id="mml-ieqn-18"><mml:mrow><mml:mtext>BatchNorm</mml:mtext></mml:mrow></mml:math></inline-formula></td>
<td>Training is stabilized and accelerated through the use of batch normalization</td>
</tr>
<tr>
<td><inline-formula id="ieqn-19"><mml:math id="mml-ieqn-19"><mml:mrow><mml:mtext>ReLU</mml:mtext></mml:mrow></mml:math></inline-formula></td>
<td>activation function (Rectified Linear Unit)</td>
</tr>
<tr>
<td><inline-formula id="ieqn-20"><mml:math id="mml-ieqn-20"><mml:mrow><mml:mtext>M</mml:mtext></mml:mrow></mml:math></inline-formula></td>
<td>The operation used Max-Pooling</td>
</tr>
<tr>
<td><inline-formula id="ieqn-21"><mml:math id="mml-ieqn-21"><mml:msub><mml:mrow><mml:mtext>L</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mtext>f</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula></td>
<td>Flatten Layer</td>
</tr>
<tr>
<td><inline-formula id="ieqn-22"><mml:math id="mml-ieqn-22"><mml:mrow><mml:mtext>Y</mml:mtext></mml:mrow></mml:math></inline-formula></td>
<td>Output feature</td>
</tr>
<tr>
<td><inline-formula id="ieqn-23"><mml:math id="mml-ieqn-23"><mml:mrow><mml:mtext>i</mml:mtext></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:mtext>j</mml:mtext></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:mtext>k</mml:mtext></mml:mrow></mml:math></inline-formula></td>
<td>Spatial and depth indices</td>
</tr>
<tr>
<td><inline-formula id="ieqn-24"><mml:math id="mml-ieqn-24"><mml:mrow><mml:mtext>C</mml:mtext></mml:mrow></mml:math></inline-formula></td>
<td>Channel Number</td>
</tr>
<tr>
<td><inline-formula id="ieqn-25"><mml:math id="mml-ieqn-25"><mml:msub><mml:mrow><mml:mtext>X</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mtext>i</mml:mtext></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mtext>m</mml:mtext></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:mtext>j</mml:mtext></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mtext>n</mml:mtext></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:mtext>k</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula></td>
<td>Input feature mapping at spatial location <inline-formula id="ieqn-26"><mml:math id="mml-ieqn-26"><mml:mrow><mml:mtext>i</mml:mtext></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mtext>m</mml:mtext></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:mtext>j</mml:mtext></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mtext>n</mml:mtext></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:mtext>k</mml:mtext></mml:mrow></mml:math></inline-formula> for the c-th channel</td>
</tr>
<tr>
<td><inline-formula id="ieqn-27"><mml:math id="mml-ieqn-27"><mml:mrow><mml:mtext>m</mml:mtext></mml:mrow><mml:mtext>&#x00A0;</mml:mtext><mml:mrow><mml:mtext>and</mml:mtext></mml:mrow><mml:mtext>&#x00A0;</mml:mtext><mml:mrow><mml:mtext>n</mml:mtext></mml:mrow></mml:math></inline-formula></td>
<td>Span the spatial dimensions of the filters</td>
</tr>
<tr>
<td><inline-formula id="ieqn-28"><mml:math id="mml-ieqn-28"><mml:mrow><mml:mtext>C</mml:mtext></mml:mrow></mml:math></inline-formula></td>
<td>Span the input channel.</td>
</tr>
<tr>
<td><inline-formula id="ieqn-29"><mml:math id="mml-ieqn-29"><mml:mrow><mml:mtext>U</mml:mtext></mml:mrow></mml:math></inline-formula></td>
<td>Number of Classes</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Proposed CNN Model</title>
<p>Deep neural networks, such as CNNs, are widely used for tasks like image categorisation and object detection. When features have been extracted and dimensionality reduced, the fully connected layer can evaluate the degree to which values and class labels are related. The convolutional layer is responsible for creating feature maps by extracting features from the input pictures. By using techniques such as max pooling and average pooling, the spatial dimensions are compressed in the pooling layer without losing valuable data. CNNs, which were inspired by the human visual cortex, are particularly good at maintaining the spatial relationships between pixels and identifying fine-grained details in images.</p>
<p>The proposed CNN model with six layers is designed to efficiently handle the recognition of human activities from the dataset while maintaining a balance between computational complexity and accuracy, as shown in <xref ref-type="fig" rid="fig-2">Fig. 2</xref>. This model consists of an input layer, convolutional layers, max-pooling layers, a flatten layer, dense layers, an activation function, and an output layer. Each layer plays a distinct role in the pipeline&#x2019;s processing, contributing to the model&#x2019;s ability to learn from the input data.</p>
<sec id="s3_4_1">
<label>3.4.1</label>
<title>Input Layer</title>
<p>The first layer receives the input image on which the selected model is trained.</p>
</sec>
<sec id="s3_4_2">
<label>3.4.2</label>
<title>Convolutional Layer</title>
<p>This layer applies a set of learnable filters to the input image. It captures local patterns, such as edges and textures, and can be computed as in <xref ref-type="disp-formula" rid="eqn-2">Eq. (2)</xref>. Multiple convolutional layers are used to extract features from the input, each with ten filters of size 3 &#x00D7; 3. To create feature maps that capture the spatial hierarchies of features, this layer applies filters to the input. The symbols used in the equation are explained in <xref ref-type="table" rid="table-2">Table 2</xref>.
<disp-formula id="eqn-2"><label>(2)</label><mml:math id="mml-eqn-2" display="block"><mml:mi>Y</mml:mi><mml:mrow><mml:mo>[</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo>,</mml:mo><mml:mi>k</mml:mi><mml:mo>]</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msubsup><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>m</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>M</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:msubsup><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:munderover><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>c</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>C</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:munderover><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>+</mml:mo><mml:mi>m</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo>+</mml:mo><mml:mi>n</mml:mi><mml:mo>,</mml:mo><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo>&#x22C5;</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi><mml:mo>,</mml:mo><mml:mi>c</mml:mi><mml:mo>,</mml:mo><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>b</mml:mi><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:math></disp-formula></p>

</sec>
<sec id="s3_4_3">
<label>3.4.3</label>
<title>Max Pooling Layer</title>
<p>A max pooling layer with a 2 &#x00D7; 2 pool size comes after each convolutional layer to reduce the spatial dimensions of the feature maps while keeping the features and reducing the computational complexity. This helps in making the model more robust to variation in the input.</p>
</sec>
<sec id="s3_4_4">
<label>3.4.4</label>
<title>Flatten Connected Layer</title>
<p>A flattened layer receives the output from the last max pooling layer and converts the multi-dimensional feature map into a one-dimensional vector. This step prepared the data for a fully connected layer.</p>
</sec>
<sec id="s3_4_5">
<label>3.4.5</label>
<title>Fully Connected Layer</title>
<p>In the fully connected stage, also called the dense stage, the output of the flatten layer is fed into a series of fully connected layers as follows:
<list list-type="bullet">
<list-item>
<p>Dense layer including 3 neurons</p></list-item>
<list-item>
<p>Dense layer including 64 neurons</p></list-item>
<list-item>
<p>Dense layer including 128 neurons</p></list-item>
<list-item>
<p>Dense layer including 512 neurons</p></list-item>
<list-item>
<p>Dense layer including 1024 neurons</p></list-item>
</list></p>
<p>Each fully connected layer applies a linear transformation followed by a non-linear activation function to capture the complex patterns and relationships in the data.</p>
</sec>
<sec id="s3_4_6">
<label>3.4.6</label>
<title>Activation Function</title>
<p>The final dense layer with three units is followed by a SoftMax activation function, which normalizes the outputs into a probability distribution over the output classes, enabling classification into one of the three activity categories, as shown in <xref ref-type="disp-formula" rid="eqn-3">Eq. (3)</xref>.
<disp-formula id="eqn-3"><label>(3)</label><mml:math id="mml-eqn-3" display="block"><mml:mrow><mml:mtext>softmax</mml:mtext></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>z</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:msup><mml:mi>e</mml:mi><mml:mrow><mml:mrow><mml:msub><mml:mi>z</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msup><mml:mrow><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>U</mml:mi></mml:mrow></mml:munderover><mml:msup><mml:mi>e</mml:mi><mml:mrow><mml:mrow><mml:msub><mml:mi>z</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msup></mml:mrow></mml:mfrac></mml:math></disp-formula></p>
<p>The symbols used in the equation are explained in <xref ref-type="table" rid="table-2">Table 2</xref>.</p>

</sec>
<sec id="s3_4_7">
<label>3.4.7</label>
<title>Output Layer</title>
<p>The output of the softmax layer is the final classification result, indicating the class to which the input image belongs. Overall, the proposed CNN model combines convolution, max pooling, fully connected, and softmax layers to capture features and patterns in the input image. The proposed CNN model architecture is shown in <xref ref-type="fig" rid="fig-3">Fig. 3</xref>.</p>
<fig id="fig-3">
<label>Figure 3</label>
<caption>
<title>Proposed CNN model architecture.</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_71669-fig-3.tif"/>
</fig>
</sec>
</sec>
<sec id="s3_5">
<label>3.5</label>
<title>Weighted Average Ensemble Model for Human Activity Recognition (HAR)</title>
<p>Ensemble learning, a machine learning technique, combines different models to improve cumulative predictive performance. The fundamental idea is that aggregating the results of multiple models improves the final prediction&#x2019;s reliability, accuracy, and generalizability. In this study, ensemble learning with a weighted-average fusion of EfficientNet, ResNet50, and CNN models improves Human Activity Recognition (HAR) by utilizing their complementary capabilities. EfficientNet brings fine-grained spatial information compression, ResNet50 generates hierarchical motion representations through deep residual connections, and CNNs learn local spatial features from raw sensor data or video frames. The ensemble balances accuracy and stability by weighting its predictions with the best weight assignments. This prevents overfitting and helps capture diverse activity patterns for better generalization. In this work, an ensemble approach is employed to enhance human activity recognition. The approach had three branches, with five variations each. In this research, two pre-trained models and one customized CNN deep learning model were initially constructed, and their performances were thoroughly evaluated on the dataset. The findings demonstrated that ResNet50, EfficientNet, and our own CNN were best suited to acquiring the complex features within images of human activities. Based on this baseline assessment, we developed an ensemble model by aggregating the prediction powers of the three base classifiers. We selected the weighted average ensemble, which leverages the variety of multiple models to generate a more reliable and accurate prediction system. Every base classifier in this ensemble paradigm makes predictions, which are weighted based on how well it performs on a validation set. 
The weighted averaging technique maximizes the ensemble&#x2019;s overall performance by ensuring that classifiers with higher predictive abilities have a greater influence in the final decision-making process.</p>
<p>The weighted average ensemble for combining predictions from multiple models can be represented mathematically as:
<disp-formula id="eqn-4"><label>(4)</label><mml:math id="mml-eqn-4" display="block"><mml:mrow><mml:mover><mml:mi>P</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mo>=</mml:mo><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x22C5;</mml:mo><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></disp-formula>where <inline-formula id="ieqn-30"><mml:math id="mml-ieqn-30"><mml:mrow><mml:mover><mml:mi>P</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow></mml:math></inline-formula> represents the final predicted output, <italic>N</italic> represents the total number of selected models, <inline-formula id="ieqn-31"><mml:math id="mml-ieqn-31"><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> represents the prediction obtained from the <italic>i</italic>-th model, and <inline-formula id="ieqn-32"><mml:math id="mml-ieqn-32"><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the weight assigned to the <italic>i</italic>-th model. The structure of the weighted average ensemble is shown in <xref ref-type="fig" rid="fig-4">Fig. 4</xref>.</p>
<fig id="fig-4">
<label>Figure 4</label>
<caption>
<title>Model structure for weighted average ensemble.</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_71669-fig-4.tif"/>
</fig>
</sec>
<sec id="s3_6">
<label>3.6</label>
<title>Experimental Setup</title>
<p>For the experimental setup of our study, we begin by selecting a dataset for empirical analysis. Pre-processing then standardises image size and shape to ensure the integrity and balance of the data. The experimental setup is summarised in <xref ref-type="table" rid="table-3">Table 3</xref>.</p>
<table-wrap id="table-3">
<label>Table 3</label>
<caption>
<title>Experimental setup and configuration.</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/> </colgroup>
<thead>
<tr>
<th>Component</th>
<th>Description/Specification</th>
</tr>
</thead>
<tbody>
<tr>
<td>Framework/Libraries</td>
<td>TensorFlow 2.x, Keras, OpenCV, NumPy, Pandas, Scikit-learn [<xref ref-type="bibr" rid="ref-34">34</xref>&#x2013;<xref ref-type="bibr" rid="ref-36">36</xref>]</td>
</tr>
<tr>
<td>Hardware Used</td>
<td>Intel Core i7 Processor/NVIDIA GPU (e.g., RTX 3060, 12 GB VRAM)/16 GB RAM</td>
</tr>
<tr>
<td>Integrated Development Environment (IDE)/Environment</td>
<td>Google Colab</td>
</tr>
<tr>
<td>Image Resolution</td>
<td>224 &#x00D7; 224 pixels</td>
</tr>
<tr>
<td>Training&#x2013;Testing Split</td>
<td>70% Training&#x2013;30% Testing</td>
</tr>
<tr>
<td>Evaluation Metrics</td>
<td>Accuracy, Precision, Recall, F1-Score</td>
</tr>
<tr>
<td>Software Dependencies</td>
<td>TensorFlow, OpenCV</td>
</tr>
<tr>
<td>Optimizer</td>
<td>Adam for all models</td>
</tr>
<tr>
<td>Learning Rate</td>
<td>0.001 for CNN models and 0.0001 for other pre-trained models</td>
</tr>
<tr>
<td>Batch Size</td>
<td>64 for the CNN model, and the other pre-trained models&#x2019; size is 32</td>
</tr>
<tr>
<td>Epochs</td>
<td>100 for all models</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Additionally, we outline the standard measures used to compare the applied method with existing studies. The following performance metrics were used for the analysis of the results.
<disp-formula id="eqn-5"><label>(5)</label><mml:math id="mml-eqn-5" display="block"><mml:mrow><mml:mtext>Accuracy</mml:mtext></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mrow><mml:mtext>TP&#xA0;</mml:mtext></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mtext>&#xA0;TN&#xA0;</mml:mtext></mml:mrow></mml:mrow><mml:mrow><mml:mrow><mml:mtext>TN&#xA0;</mml:mtext></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mtext>&#xA0;FN&#xA0;</mml:mtext></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mtext>&#xA0;FP&#xA0;</mml:mtext></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mtext>&#xA0;TP</mml:mtext></mml:mrow></mml:mrow></mml:mfrac></mml:math></disp-formula>
<disp-formula id="eqn-6"><label>(6)</label><mml:math id="mml-eqn-6" display="block"><mml:mrow><mml:mtext>Precision</mml:mtext></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mtext>TP&#xA0;</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mtext>TP&#xA0;</mml:mtext></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mtext>&#xA0;FP&#xA0;</mml:mtext></mml:mrow></mml:mrow></mml:mfrac></mml:math></disp-formula>
<disp-formula id="eqn-7"><label>(7)</label><mml:math id="mml-eqn-7" display="block"><mml:mrow><mml:mtext>Sensitivity</mml:mtext></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mtext>TP&#xA0;</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mtext>FN&#xA0;</mml:mtext></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mtext>&#xA0;TP&#xA0;</mml:mtext></mml:mrow></mml:mrow></mml:mfrac></mml:math></disp-formula>
<disp-formula id="eqn-8"><label>(8)</label><mml:math id="mml-eqn-8" display="block"><mml:mrow><mml:mtext>Specificity</mml:mtext></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mtext>TN&#xA0;</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mtext>TN</mml:mtext></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mtext>FP&#xA0;</mml:mtext></mml:mrow></mml:mrow></mml:mfrac></mml:math></disp-formula>
<disp-formula id="eqn-9"><label>(9)</label><mml:math id="mml-eqn-9" display="block"><mml:mrow><mml:mtext>F</mml:mtext></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mrow><mml:mtext>Score</mml:mtext></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>2</mml:mn><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mtext>Precision&#xA0;</mml:mtext></mml:mrow><mml:mo>&#x00D7;</mml:mo><mml:mrow><mml:mtext>&#xA0;Sensitivity</mml:mtext></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mrow><mml:mtext>&#xA0;Precision</mml:mtext></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mtext>&#xA0;Sensitivity</mml:mtext></mml:mrow></mml:mrow></mml:mfrac></mml:math></disp-formula></p>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Results</title>
<p>The study results were evaluated using three DL models: ResNet50, EfficientNet, and the proposed CNN. Then, these models were fused by weighted averaging for further performance enhancement. ResNet50, with its residual learning framework, provided excellent feature extraction; however, it struggled to capture more complicated activity sequences. EfficientNet, which uses a compound scaling method, is computationally efficient and achieved better accuracy than ResNet50. The proposed CNN model, tailored to human activity recognition through salient activity features, has shown promising results. However, as the models are complementary, their performance varied across the different activity classes, reflecting the unique strengths and weaknesses of each model. A weighted-average fusion scheme was implemented to address this concern. The ResNet50, EfficientNet, and proposed CNN predictions are weighted based on the performances of the individual models. Benefiting from the complementarity of the three models, the integrated approach significantly improves the classification accuracy, precision, sensitivity, and F1 value. In the case of human activity recognition, such an ensemble method has been proven to improve accuracy and to reduce classification errors and over-fitting. Our experiments demonstrate that the fusion strategy is a simple yet effective approach to boost the accuracy of activity detection tasks by mitigating the limitations of single models.</p>
<sec id="s4_1">
<label>4.1</label>
<title>Accuracy and Model Loss Plot for Training and Validation Using Selected Model</title>
<p>The model loss and accuracy graphs were crucial for the analysis of the performance measure during the training and testing of our Human Activity Recognition system using both pre-trained and weighted average ensemble models. The loss graph illustrates how the training and validation loss values change over time, showing how well the model learns from the data.</p>
<p>Similarly, the accuracy graph illustrates the evolution of training and validation accuracy over time. Both curves exhibit an upward trend, indicating that the model is improving at accurately classifying human activity. Due to prior learning, the accuracy and loss curves for the pre-trained model converged more quickly, whereas the suggested CNN model needed more epochs but still performed competitively. Comparing the performance, modifying hyperparameters, and choosing the best model configuration were all made easier by these visualisations. The model loss and accuracy graphs using ResNet, EfficientNet, the proposed CNN, and the proposed weighted average ensemble method are shown in <xref ref-type="fig" rid="fig-5">Fig. 5</xref>.</p>
<fig id="fig-5">
<label>Figure 5</label>
<caption>
<title>Model loss and model accuracy using: (<bold>a</bold>) ResNet, (<bold>b</bold>) EfficientNet, (<bold>c</bold>) proposed CNN, and (<bold>d</bold>) Weighted average ensemble.</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_71669-fig-5.tif"/>
</fig>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Confusion Matrix Using Selected Model</title>
<p>The confusion matrix is an essential resource for assessing class-wise accuracy and identifying misclassified activities. We examined the confusion matrices of the selected models, namely ResNet50, EfficientNet, the proposed CNN, and the weighted average ensemble (hybrid) architecture. The confusion matrix thus directs additional tuning or data balancing and aids in identifying model generalization flaws. The most accurate architecture for deployment can also be chosen by comparing the confusion matrices of various models. The confusion matrices for all selected models are shown in <xref ref-type="fig" rid="fig-6">Fig. 6</xref>.</p>
<fig id="fig-6">
<label>Figure 6</label>
<caption>
<title>Confusion matrix by using all selected models: (<bold>a</bold>) ResNet, (<bold>b</bold>) EfficientNet, (<bold>c</bold>) proposed CNN, and (<bold>d</bold>) Weighted average ensemble.</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_71669-fig-6.tif"/>
</fig>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Accuracy, Precision, Recall, and F-Score by Using the Selected Model</title>
<p>By using ResNet, EfficientNet, CNN, and a weighted average ensemble model, the accuracy, precision, recall, and F-measure values are presented in <xref ref-type="table" rid="table-4">Table 4</xref>.</p>
<table-wrap id="table-4">
<label>Table 4</label>
<caption>
<title>Accuracy, precision, recall, and F-score for using selected models (a) ResNet, (b) EfficientNet, (c) proposed CNN, (d) Weighted average ensemble.</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/> </colgroup>
<thead>
<tr>
<th>Selected Model</th>
<th>Name of Class</th>
<th>Precision</th>
<th>Recall</th>
<th>F1-Score</th>
<th>Support</th>
</tr>
</thead>
<tbody>
<tr>
<td><bold>ResNet</bold></td>
<td>Running (Five different forms)</td>
<td>0.95</td>
<td>0.98</td>
<td>0.96</td>
<td>423</td>
</tr>
<tr>
<td/>
<td>Phone-snatching (Five different forms)</td>
<td>0.99</td>
<td>0.89</td>
<td>0.94</td>
<td>397</td>
</tr>
<tr>
<td/>
<td>Fighting (Five different forms)</td>
<td>0.92</td>
<td>0.99</td>
<td>0.95</td>
<td>389</td>
</tr>
<tr>
<td align="center" colspan="6"><bold>Accuracy 0.95</bold></td>
</tr>
<tr>
<td><bold>EfficientNet</bold></td>
<td>Running (Five different forms)</td>
<td>0.95</td>
<td>0.96</td>
<td>0.95</td>
<td>423</td>
</tr>
<tr>
<td/>
<td>Phone-snatching (Five different forms)</td>
<td>0.97</td>
<td>0.96</td>
<td>0.97</td>
<td>397</td>
</tr>
<tr>
<td/>
<td>Fighting (Five different forms)</td>
<td>0.97</td>
<td>0.96</td>
<td>0.97</td>
<td>389</td>
</tr>
<tr>
<td align="center" colspan="6"><bold>Accuracy 0.96</bold></td>
</tr>
<tr>
<td><bold>Proposed CNN</bold></td>
<td>Running (Five different forms)</td>
<td>0.97</td>
<td>0.96</td>
<td>0.97</td>
<td>0.097</td>
</tr>
<tr>
<td/>
<td>Phone-snatching (Five different forms)</td>
<td>0.99</td>
<td>0.97</td>
<td>0.98</td>
<td>0.99</td>
</tr>
<tr>
<td/>
<td>Fighting (Five different forms)</td>
<td>0.95</td>
<td>0.98</td>
<td>0.97</td>
<td>0.95</td>
</tr>
<tr>
<td align="center" colspan="6"><bold>Accuracy 0.97</bold></td>
</tr>
<tr>
<td><bold>Proposed Weighted Average Ensemble</bold></td>
<td>Running (Five different forms)</td>
<td>0.98</td>
<td>0.97</td>
<td>0.97</td>
<td>617</td>
</tr>
<tr>
<td/>
<td>Phone-snatching (Five different forms)</td>
<td>0.98</td>
<td>0.98</td>
<td>0.98</td>
<td>594</td>
</tr>
<tr>
<td/>
<td>Fighting (Five different forms)</td>
<td>0.98</td>
<td>0.99</td>
<td>0.98</td>
<td>603</td>
</tr>
<tr>
<td align="center" colspan="6"><bold>Accuracy 0.98</bold></td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Discussion</title>
<p>The proposed weighted average ensemble framework for HAR combines various deep learning architectures to improve classification accuracy and model robustness. The CNN detects low-level spatial features, ResNet uses skip connections to learn deeper features, and EfficientNet provides optimized scaling to balance accuracy and computational efficiency. The weighted average ensemble aggregates predictions and assigns importance based on model performance, combining the individual models&#x2019; strengths. A comparison of the results obtained using ResNet, EfficientNet, CNN, and the proposed model with the weighted average ensemble is shown in <xref ref-type="fig" rid="fig-7">Fig. 7</xref>.</p>
<fig id="fig-7">
<label>Figure 7</label>
<caption>
<title>Comparison of the results of the proposed model with weighted average ensemble with other models.</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMES_71669-fig-7.tif"/>
</fig>
</sec>
<sec id="s6">
<label>6</label>
<title>Conclusion</title>
<p>In this work, we aimed to effectively classify and recognize complex human activity patterns using deep learning models. To this end, we evaluated several state-of-the-art convolutional neural networks, including ResNet, EfficientNet, a custom-designed CNN model, and a proposed model integrated with a weighted-average ensemble approach. The experimental results demonstrated that while standard pre-trained models performed reasonably well, the proposed CNN model showed improved classification performance by capturing more discriminative spatial features. In addition, combining several models with the weighted-average ensemble method achieved the best accuracy and stability for all activity types and subtypes. In summary, the proposed ensemble model outperformed the baseline models, suggesting that it is able to effectively capture the complexity and diversity of real-world activity recognition. These results demonstrate that ensemble-based deep learning methods are very promising for enhancing the performance and robustness of HAR systems, especially in safety-critical applications such as public security, crime prevention, and surveillance. The proposed method yielded encouraging results, but the processing speed and overall efficiency may be further enhanced. Future work will focus on performing real-time experiments, addressing the problem of imbalanced data, and extending the current model to predict more human activities in diverse domains. Multimodal sensor data and self-supervised learning methods can also be incorporated to achieve an even higher level of robustness and generalization.</p>
</sec>
</body>
<back>
<ack>
<p>This work was supported by Princess Nourah bint Abdulrahman University Researchers Supporting Project number (PNURSP2026R765), Princess Nourah bint Abdulrahman University, Riyadh, Saudi Arabia.</p>
</ack>
<sec>
<title>Funding Statement</title>
<p>This work was supported by Princess Nourah bint Abdulrahman University Researchers Supporting Project number (PNURSP2026R765), Princess Nourah bint Abdulrahman University, Riyadh, Saudi Arabia.</p>
</sec>
<sec>
<title>Author Contributions</title>
<p>Waseem Akhtar: Data preparation, data analysis, literature review, initial draft, methodology. Mahwish Ilyas: Data preparation, manuscript revision, experiments and implementation. Romana Aziz: Supervision, final draft review, correspondence, manuscript improvement. Ghadah Aldehim: Critical review, conceptualization, visualization. Tassawar Iqbal: Analysis, writing improvement, editing, and revision. Muhammad Ramzan: Methodology enhancement and results interpretation. All authors reviewed and approved the final version of the manuscript.</p>
</sec>
<sec sec-type="data-availability">
<title>Availability of Data and Materials</title>
<p>The dataset used in this research is available at: <ext-link ext-link-type="uri" xlink:href="https://data.mendeley.com/datasets/67bbcr5ssp/1">https://data.mendeley.com/datasets/67bbcr5ssp/1</ext-link>.</p>
</sec>
<sec>
<title>Ethics Approval</title>
<p>Ethics approval was not required, as a previously published dataset was used in this research work.</p>
</sec>
<sec sec-type="COI-statement">
<title>Conflicts of Interest</title>
<p>The authors declare no conflicts of interest to report.</p>
</sec>
<ref-list content-type="authoryear">
<title>References</title>
<ref id="ref-1"><label>[1]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Ramzan</surname> <given-names>M</given-names></string-name>, <string-name><surname>Abid</surname> <given-names>A</given-names></string-name>, <string-name><surname>Khan</surname> <given-names>HU</given-names></string-name>, <string-name><surname>Awan</surname> <given-names>SM</given-names></string-name>, <string-name><surname>Ismail</surname> <given-names>A</given-names></string-name>, <string-name><surname>Ahmed</surname> <given-names>M</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>A review on state-of-the-art violence detection techniques</article-title>. <source>IEEE Access</source>. <year>2019</year>;<volume>7</volume>:<fpage>107560</fpage>&#x2013;<lpage>75</lpage>. doi:<pub-id pub-id-type="doi">10.1109/ACCESS.2019.2932114</pub-id>.</mixed-citation></ref>
<ref id="ref-2"><label>[2]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Pathmaperuma</surname> <given-names>MH</given-names></string-name>, <string-name><surname>Rahulamathavan</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Dogan</surname> <given-names>S</given-names></string-name>, <string-name><surname>Kondoz</surname> <given-names>A</given-names></string-name></person-group>. <article-title>CNN for user activity detection using encrypted in-app mobile data</article-title>. <source>Future Internet</source>. <year>2022</year>;<volume>14</volume>(<issue>2</issue>):<fpage>67</fpage>. doi:<pub-id pub-id-type="doi">10.3390/fi14020067</pub-id>.</mixed-citation></ref>
<ref id="ref-3"><label>[3]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Kumar</surname> <given-names>P</given-names></string-name>, <string-name><surname>Suresh</surname> <given-names>S</given-names></string-name></person-group>. <article-title>Deep-HAR: an ensemble deep learning model for recognizing the simple, complex, and heterogeneous human activities</article-title>. <source>Multimed Tools Appl</source>. <year>2023</year>;<volume>82</volume>(<issue>20</issue>):<fpage>30435</fpage>&#x2013;<lpage>62</lpage>. doi:<pub-id pub-id-type="doi">10.1007/s11042-023-14492-0</pub-id>; <pub-id pub-id-type="pmid">36851913</pub-id></mixed-citation></ref>
<ref id="ref-4"><label>[4]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Chadha</surname> <given-names>J</given-names></string-name>, <string-name><surname>Jain</surname> <given-names>A</given-names></string-name>, <string-name><surname>Kumar</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Modi</surname> <given-names>N</given-names></string-name></person-group>. <article-title>Hybrid deep learning approaches for human activity recognition and postural transitions using mobile device sensors</article-title>. <source>SN Comput Sci</source>. <year>2024</year>;<volume>5</volume>(<issue>7</issue>):<fpage>925</fpage>. doi:<pub-id pub-id-type="doi">10.1007/s42979-024-03300-7</pub-id>.</mixed-citation></ref>
<ref id="ref-5"><label>[5]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Thakur</surname> <given-names>D</given-names></string-name>, <string-name><surname>Dangi</surname> <given-names>S</given-names></string-name>, <string-name><surname>Lalwani</surname> <given-names>P</given-names></string-name></person-group>. <article-title>A novel hybrid deep learning approach with GWO-WOA optimization technique for human activity recognition</article-title>. <source>Biomed Signal Process Control</source>. <year>2025</year>;<volume>99</volume>(<issue>3</issue>):<fpage>106870</fpage>. doi:<pub-id pub-id-type="doi">10.1016/j.bspc.2024.106870</pub-id>.</mixed-citation></ref>
<ref id="ref-6"><label>[6]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Hassan</surname> <given-names>N</given-names></string-name>, <string-name><surname>Miah</surname> <given-names>ASM</given-names></string-name>, <string-name><surname>Shin</surname> <given-names>J</given-names></string-name></person-group>. <article-title>A deep bidirectional LSTM model enhanced by transfer-learning-based feature extraction for dynamic human activity recognition</article-title>. <source>Appl Sci</source>. <year>2024</year>;<volume>14</volume>(<issue>2</issue>):<fpage>603</fpage>. doi:<pub-id pub-id-type="doi">10.3390/app14020603</pub-id>.</mixed-citation></ref>
<ref id="ref-7"><label>[7]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Rodriguez</surname> <given-names>MD</given-names></string-name>, <string-name><surname>Ahmed</surname> <given-names>J</given-names></string-name>, <string-name><surname>Shah</surname> <given-names>M</given-names></string-name></person-group>. <article-title>Action MACH: a spatio-temporal Maximum Average Correlation Height filter for action recognition</article-title>. In: <conf-name>Proceedings of the 2008 IEEE Conference on Computer Vision and Pattern Recognition; 2008 Jun 23&#x2013;28</conf-name>; <publisher-loc>Anchorage, AK, USA</publisher-loc>. doi:<pub-id pub-id-type="doi">10.1109/CVPR.2008.4587727</pub-id>.</mixed-citation></ref>
<ref id="ref-8"><label>[8]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Liu</surname> <given-names>J</given-names></string-name>, <string-name><surname>Luo</surname> <given-names>J</given-names></string-name>, <string-name><surname>Shah</surname> <given-names>M</given-names></string-name></person-group>. <article-title>Recognizing realistic actions from videos in the wild</article-title>. In: <conf-name>Proceedings of the 2009 IEEE Conference on Computer Vision and Pattern Recognition; 2009 Jun 20&#x2013;25</conf-name>; <publisher-loc>Miami, FL, USA</publisher-loc>. doi:<pub-id pub-id-type="doi">10.1109/cvpr.2009.5206744</pub-id>.</mixed-citation></ref>
<ref id="ref-9"><label>[9]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Jhuang</surname> <given-names>H</given-names></string-name>, <string-name><surname>Gall</surname> <given-names>J</given-names></string-name>, <string-name><surname>Zuffi</surname> <given-names>S</given-names></string-name>, <string-name><surname>Schmid</surname> <given-names>C</given-names></string-name>, <string-name><surname>Black</surname> <given-names>MJ</given-names></string-name></person-group>. <article-title>Towards understanding action recognition</article-title>. In: <conf-name>Proceedings of the 2013 IEEE International Conference on Computer Vision; 2013 Dec 1&#x2013;8</conf-name>; <publisher-loc>Sydney, Australia</publisher-loc>. doi:<pub-id pub-id-type="doi">10.1109/iccv.2013.396</pub-id>.</mixed-citation></ref>
<ref id="ref-10"><label>[10]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Bukht</surname> <given-names>TFN</given-names></string-name>, <string-name><surname>Jalal</surname> <given-names>A</given-names></string-name></person-group>. <article-title>A robust model of human activity recognition using independent component analysis and XGBoost</article-title>. In: <conf-name>Proceedings of the 2024 5th International Conference on Advancements in Computational Sciences (ICACS); 2024 Feb 19&#x2013;20</conf-name>; <publisher-loc>Lahore, Pakistan</publisher-loc>. doi:<pub-id pub-id-type="doi">10.1109/ICACS60934.2024.10473238</pub-id>.</mixed-citation></ref>
<ref id="ref-11"><label>[11]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Rosten</surname> <given-names>E</given-names></string-name>, <string-name><surname>Drummond</surname> <given-names>T</given-names></string-name></person-group>. <chapter-title>Machine learning for high-speed corner detection</chapter-title>. In: <source>Computer Vision&#x2014;ECCV 2006</source>. <publisher-loc>Berlin/Heidelberg, Germany</publisher-loc>: <publisher-name>Springer</publisher-name>; <year>2006</year>. p. <fpage>430</fpage>&#x2013;<lpage>43</lpage>. doi:<pub-id pub-id-type="doi">10.1007/11744023_34</pub-id>.</mixed-citation></ref>
<ref id="ref-12"><label>[12]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Yun</surname> <given-names>K</given-names></string-name>, <string-name><surname>Honorio</surname> <given-names>J</given-names></string-name>, <string-name><surname>Chattopadhyay</surname> <given-names>D</given-names></string-name>, <string-name><surname>Berg</surname> <given-names>TL</given-names></string-name>, <string-name><surname>Samaras</surname> <given-names>D</given-names></string-name></person-group>. <article-title>Two-person interaction detection using body-pose features and multiple instance learning</article-title>. In: <conf-name>Proceedings of the 2012 IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops; 2012 Jun 16&#x2013;21</conf-name>; <publisher-loc>Providence, RI, USA</publisher-loc>. doi:<pub-id pub-id-type="doi">10.1109/CVPRW.2012.6239234</pub-id>.</mixed-citation></ref>
<ref id="ref-13"><label>[13]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Kuehne</surname> <given-names>H</given-names></string-name>, <string-name><surname>Jhuang</surname> <given-names>H</given-names></string-name>, <string-name><surname>Garrote</surname> <given-names>E</given-names></string-name>, <string-name><surname>Poggio</surname> <given-names>T</given-names></string-name>, <string-name><surname>Serre</surname> <given-names>T</given-names></string-name></person-group>. <article-title>HMDB: a large video database for human motion recognition</article-title>. In: <conf-name>Proceedings of the 2011 International Conference on Computer Vision; 2011 Nov 6&#x2013;13</conf-name>; <publisher-loc>Barcelona, Spain</publisher-loc>. doi:<pub-id pub-id-type="doi">10.1109/ICCV.2011.6126543</pub-id>.</mixed-citation></ref>
<ref id="ref-14"><label>[14]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Reddy</surname> <given-names>KK</given-names></string-name>, <string-name><surname>Shah</surname> <given-names>M</given-names></string-name></person-group>. <article-title>Recognizing 50 human action categories of web videos</article-title>. <source>Mach Vis Appl</source>. <year>2013</year>;<volume>24</volume>(<issue>5</issue>):<fpage>971</fpage>&#x2013;<lpage>81</lpage>. doi:<pub-id pub-id-type="doi">10.1007/s00138-012-0450-4</pub-id>.</mixed-citation></ref>
<ref id="ref-15"><label>[15]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Soomro</surname> <given-names>K</given-names></string-name>, <string-name><surname>Zamir</surname> <given-names>AR</given-names></string-name>, <string-name><surname>Shah</surname> <given-names>M</given-names></string-name></person-group>. <source>UCF101: a dataset of 101 human actions classes from videos in the wild</source>. <publisher-loc>Orlando, FL, USA</publisher-loc>: <publisher-name>University of Central Florida</publisher-name>; <year>2012</year>.</mixed-citation></ref>
<ref id="ref-16"><label>[16]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Hussain</surname> <given-names>A</given-names></string-name>, <string-name><surname>Khan</surname> <given-names>SU</given-names></string-name>, <string-name><surname>Khan</surname> <given-names>N</given-names></string-name>, <string-name><surname>Shabaz</surname> <given-names>M</given-names></string-name>, <string-name><surname>Baik</surname> <given-names>SW</given-names></string-name></person-group>. <article-title>AI-driven behavior biometrics framework for robust human activity recognition in surveillance systems</article-title>. <source>Eng Appl Artif Intell</source>. <year>2024</year>;<volume>127</volume>:<fpage>107218</fpage>. doi:<pub-id pub-id-type="doi">10.1016/j.engappai.2023.107218</pub-id>.</mixed-citation></ref>
<ref id="ref-17"><label>[17]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Batool</surname> <given-names>S</given-names></string-name>, <string-name><surname>Khan</surname> <given-names>MH</given-names></string-name>, <string-name><surname>Farid</surname> <given-names>MS</given-names></string-name></person-group>. <article-title>An ensemble deep learning model for human activity analysis using wearable sensory data</article-title>. <source>Appl Soft Comput</source>. <year>2024</year>;<volume>159</volume>:<fpage>111599</fpage>. doi:<pub-id pub-id-type="doi">10.1016/j.asoc.2024.111599</pub-id>.</mixed-citation></ref>
<ref id="ref-18"><label>[18]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Abbaspour</surname> <given-names>S</given-names></string-name>, <string-name><surname>Fotouhi</surname> <given-names>F</given-names></string-name>, <string-name><surname>Sedaghatbaf</surname> <given-names>A</given-names></string-name>, <string-name><surname>Fotouhi</surname> <given-names>H</given-names></string-name>, <string-name><surname>Vahabi</surname> <given-names>M</given-names></string-name>, <string-name><surname>Linden</surname> <given-names>M</given-names></string-name></person-group>. <article-title>A comparative analysis of hybrid deep learning models for human activity recognition</article-title>. <source>Sensors</source>. <year>2020</year>;<volume>20</volume>(<issue>19</issue>):<fpage>5707</fpage>. doi:<pub-id pub-id-type="doi">10.3390/s20195707</pub-id>; <pub-id pub-id-type="pmid">33036479</pub-id></mixed-citation></ref>
<ref id="ref-19"><label>[19]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Fard Moshiri</surname> <given-names>P</given-names></string-name>, <string-name><surname>Shahbazian</surname> <given-names>R</given-names></string-name>, <string-name><surname>Nabati</surname> <given-names>M</given-names></string-name>, <string-name><surname>Ali Ghorashi</surname> <given-names>S</given-names></string-name></person-group>. <article-title>A CSI-based human activity recognition using deep learning</article-title>. <source>Sensors</source>. <year>2021</year>;<volume>21</volume>(<issue>21</issue>):<fpage>7225</fpage>. doi:<pub-id pub-id-type="doi">10.3390/s21217225</pub-id>; <pub-id pub-id-type="pmid">34770532</pub-id></mixed-citation></ref>
<ref id="ref-20"><label>[20]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>&#x00C7;al&#x0131;&#x015F;kan</surname> <given-names>A</given-names></string-name></person-group>. <article-title>Detecting human activity types from 3D posture data using deep learning models</article-title>. <source>Biomed Signal Process Control</source>. <year>2023</year>;<volume>81</volume>:<fpage>104479</fpage>. doi:<pub-id pub-id-type="doi">10.1016/j.bspc.2022.104479</pub-id>.</mixed-citation></ref>
<ref id="ref-21"><label>[21]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Chen</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Xue</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>A deep learning approach to human activity recognition based on single accelerometer</article-title>. In: <conf-name>Proceedings of the 2015 IEEE International Conference on Systems, Man, and Cybernetics; 2015 Oct 9&#x2013;12</conf-name>; <publisher-loc>Hong Kong, China</publisher-loc>. doi:<pub-id pub-id-type="doi">10.1109/SMC.2015.263</pub-id>.</mixed-citation></ref>
<ref id="ref-22"><label>[22]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Poulose</surname> <given-names>A</given-names></string-name>, <string-name><surname>Kim</surname> <given-names>JH</given-names></string-name>, <string-name><surname>Han</surname> <given-names>DS</given-names></string-name></person-group>. <article-title>HIT HAR: human image threshing machine for human activity recognition using deep learning models</article-title>. <source>Comput Intell Neurosci</source>. <year>2022</year>;<volume>2022</volume>:<fpage>1808990</fpage>. doi:<pub-id pub-id-type="doi">10.1155/2022/1808990</pub-id>; <pub-id pub-id-type="pmid">36248917</pub-id></mixed-citation></ref>
<ref id="ref-23"><label>[23]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Wang</surname> <given-names>H</given-names></string-name>, <string-name><surname>Zhao</surname> <given-names>J</given-names></string-name>, <string-name><surname>Li</surname> <given-names>J</given-names></string-name>, <string-name><surname>Tian</surname> <given-names>L</given-names></string-name>, <string-name><surname>Tu</surname> <given-names>P</given-names></string-name>, <string-name><surname>Cao</surname> <given-names>T</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>Wearable sensor-based human activity recognition using hybrid deep learning techniques</article-title>. <source>Secur Commun Netw</source>. <year>2020</year>;<volume>2020</volume>:<fpage>2132138</fpage>. doi:<pub-id pub-id-type="doi">10.1155/2020/2132138</pub-id>.</mixed-citation></ref>
<ref id="ref-24"><label>[24]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Anguita</surname> <given-names>D</given-names></string-name>, <string-name><surname>Ghio</surname> <given-names>A</given-names></string-name>, <string-name><surname>Oneto</surname> <given-names>L</given-names></string-name>, <string-name><surname>Parra</surname> <given-names>X</given-names></string-name>, <string-name><surname>Reyes-Ortiz</surname> <given-names>JL</given-names></string-name></person-group>. <article-title>A public domain dataset for human activity recognition using smartphones</article-title>. In: <conf-name>Proceedings of the European Symposium on Artificial Neural Networks; 2013 Apr 24&#x2013;26</conf-name>; <publisher-loc>Bruges, Belgium</publisher-loc>.</mixed-citation></ref>
<ref id="ref-25"><label>[25]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Kwapisz</surname> <given-names>JR</given-names></string-name>, <string-name><surname>Weiss</surname> <given-names>GM</given-names></string-name>, <string-name><surname>Moore</surname> <given-names>SA</given-names></string-name></person-group>. <article-title>Activity recognition using cell phone accelerometers</article-title>. <source>SIGKDD Explor Newsl</source>. <year>2011</year>;<volume>12</volume>(<issue>2</issue>):<fpage>74</fpage>&#x2013;<lpage>82</lpage>. doi:<pub-id pub-id-type="doi">10.1145/1964897.1964918</pub-id>.</mixed-citation></ref>
<ref id="ref-26"><label>[26]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Maharana</surname> <given-names>K</given-names></string-name>, <string-name><surname>Mondal</surname> <given-names>S</given-names></string-name>, <string-name><surname>Nemade</surname> <given-names>B</given-names></string-name></person-group>. <article-title>A review: data pre-processing and data augmentation techniques</article-title>. <source>Glob Transit Proc</source>. <year>2022</year>;<volume>3</volume>(<issue>1</issue>):<fpage>91</fpage>&#x2013;<lpage>9</lpage>. doi:<pub-id pub-id-type="doi">10.1016/j.gltp.2022.04.020</pub-id>.</mixed-citation></ref>
<ref id="ref-27"><label>[27]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Binish Zahra</surname> <given-names>S</given-names></string-name>, <string-name><surname>Adnan Khan</surname> <given-names>M</given-names></string-name>, <string-name><surname>Abbas</surname> <given-names>S</given-names></string-name>, <string-name><surname>Masood Khan</surname> <given-names>K</given-names></string-name>, <string-name><surname>Al Ghamdi</surname> <given-names>MA</given-names></string-name>, <string-name><surname>Almotiri</surname> <given-names>SH</given-names></string-name></person-group>. <article-title>Marker-based and marker-less motion capturing video data: person &#x0026; activity identification comparison based on machine learning approaches</article-title>. <source>Comput Mater Contin</source>. <year>2021</year>;<volume>66</volume>(<issue>2</issue>):<fpage>1269</fpage>&#x2013;<lpage>82</lpage>. doi:<pub-id pub-id-type="doi">10.32604/cmc.2020.012778</pub-id>.</mixed-citation></ref>
<ref id="ref-28"><label>[28]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Saddique</surname> <given-names>M</given-names></string-name>, <string-name><surname>Muneer</surname> <given-names>I</given-names></string-name></person-group>. <article-title>Dataset for human activity recognition. Version 1</article-title>. <source>Mendeley Data</source>. <year>2024</year>. doi:<pub-id pub-id-type="doi">10.17632/67BBCR5SSP.1</pub-id>.</mixed-citation></ref>
<ref id="ref-29"><label>[29]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Deng</surname> <given-names>W</given-names></string-name>, <string-name><surname>Xie</surname> <given-names>G</given-names></string-name></person-group>. <article-title>Image contrast enhancement and brightness preservation based on an adaptive histogram correction framework</article-title>. <source>Appl Opt</source>. <year>2025</year>;<volume>64</volume>(<issue>13</issue>):<fpage>3502</fpage>&#x2013;<lpage>15</lpage>. doi:<pub-id pub-id-type="doi">10.1364/AO.557280</pub-id>; <pub-id pub-id-type="pmid">40793216</pub-id></mixed-citation></ref>
<ref id="ref-30"><label>[30]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Wang</surname> <given-names>W</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>A histogram equalization model for color image contrast enhancement</article-title>. <source>Signal Image Video Process</source>. <year>2024</year>;<volume>18</volume>(<issue>2</issue>):<fpage>1725</fpage>&#x2013;<lpage>32</lpage>. doi:<pub-id pub-id-type="doi">10.1007/s11760-023-02881-9</pub-id>.</mixed-citation></ref>
<ref id="ref-31"><label>[31]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Tan</surname> <given-names>M</given-names></string-name>, <string-name><surname>Le</surname> <given-names>QV</given-names></string-name></person-group>. <article-title>EfficientNet: rethinking model scaling for convolutional neural networks</article-title>. In: <conf-name>Proceedings of the 36th International Conference on Machine Learning (ICML); 2019 Jun 10&#x2013;15</conf-name>; <publisher-loc>Long Beach, CA, USA</publisher-loc>.</mixed-citation></ref>
<ref id="ref-32"><label>[32]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>He</surname> <given-names>K</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>X</given-names></string-name>, <string-name><surname>Ren</surname> <given-names>S</given-names></string-name>, <string-name><surname>Sun</surname> <given-names>J</given-names></string-name></person-group>. <article-title>Deep residual learning for image recognition</article-title>. In: <conf-name>Proceedings of the 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR); 2016 Jun 27&#x2013;30</conf-name>; <publisher-loc>Las Vegas, NV, USA</publisher-loc>. doi:<pub-id pub-id-type="doi">10.1109/CVPR.2016.90</pub-id>.</mixed-citation></ref>
<ref id="ref-33"><label>[33]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Gu</surname> <given-names>J</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Kuen</surname> <given-names>J</given-names></string-name>, <string-name><surname>Ma</surname> <given-names>L</given-names></string-name>, <string-name><surname>Shahroudy</surname> <given-names>A</given-names></string-name>, <string-name><surname>Shuai</surname> <given-names>B</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>Recent advances in convolutional neural networks</article-title>. <source>Pattern Recognit</source>. <year>2018</year>;<volume>77</volume>:<fpage>354</fpage>&#x2013;<lpage>77</lpage>. doi:<pub-id pub-id-type="doi">10.1016/j.patcog.2017.10.013</pub-id>.</mixed-citation></ref>
<ref id="ref-34"><label>[34]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Abadi</surname> <given-names>M</given-names></string-name>, <string-name><surname>Agarwal</surname> <given-names>A</given-names></string-name>, <string-name><surname>Barham</surname> <given-names>P</given-names></string-name>, <string-name><surname>Brevdo</surname> <given-names>E</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Corrado</surname> <given-names>GS</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>TensorFlow: large-scale machine learning on heterogeneous systems [Internet]</article-title>. <comment>[cited 2026 Jan 13]</comment>. Available from: <ext-link ext-link-type="uri" xlink:href="https://www.tensorflow.org">https://www.tensorflow.org</ext-link>.</mixed-citation></ref>
<ref id="ref-35"><label>[35]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Bradski</surname> <given-names>G</given-names></string-name>, <string-name><surname>Kaehler</surname> <given-names>A</given-names></string-name></person-group>. <source>Learning OpenCV: computer vision with the OpenCV library</source>. <publisher-loc>Sebastopol, CA, USA</publisher-loc>: <publisher-name>O&#x2019;Reilly Media</publisher-name>; <year>2008</year>.</mixed-citation></ref>
<ref id="ref-36"><label>[36]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Pedregosa</surname> <given-names>F</given-names></string-name>, <string-name><surname>Varoquaux</surname> <given-names>G</given-names></string-name>, <string-name><surname>Gramfort</surname> <given-names>A</given-names></string-name>, <string-name><surname>Michel</surname> <given-names>V</given-names></string-name>, <string-name><surname>Thirion</surname> <given-names>B</given-names></string-name>, <string-name><surname>Grisel</surname> <given-names>O</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>Scikit-learn: machine learning in Python</article-title>. <source>J Mach Learn Res</source>. <year>2011</year>;<volume>12</volume>:<fpage>2825</fpage>&#x2013;<lpage>30</lpage>.</mixed-citation></ref>
</ref-list>
</back></article>