<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1 20151215//EN" "http://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xml:lang="en" article-type="research-article" dtd-version="1.1">
<front>
<journal-meta>
<journal-id journal-id-type="pmc">CMC</journal-id>
<journal-id journal-id-type="nlm-ta">CMC</journal-id>
<journal-id journal-id-type="publisher-id">CMC</journal-id>
<journal-title-group>
<journal-title>Computers, Materials &#x0026; Continua</journal-title>
</journal-title-group>
<issn pub-type="epub">1546-2226</issn>
<issn pub-type="ppub">1546-2218</issn>
<publisher>
<publisher-name>Tech Science Press</publisher-name>
<publisher-loc>USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">71988</article-id>
<article-id pub-id-type="doi">10.32604/cmc.2025.071988</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Article</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Intelligent Human Interaction Recognition with Multi-Modal Feature Extraction and Bidirectional LSTM</article-title>
<alt-title alt-title-type="left-running-head">Intelligent Human Interaction Recognition with Multi-Modal Feature Extraction and Bidirectional LSTM</alt-title>
<alt-title alt-title-type="right-running-head">Intelligent Human Interaction Recognition with Multi-Modal Feature Extraction and Bidirectional LSTM</alt-title>
</title-group>
<contrib-group>
<contrib id="author-1" contrib-type="author">
<name name-style="western"><surname>Azhar</surname><given-names>Muhammad Hamdan</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><xref ref-type="aff" rid="aff-2">2</xref><xref ref-type="author-notes" rid="afn1">#</xref></contrib>
<contrib id="author-2" contrib-type="author">
<name name-style="western"><surname>Wu</surname><given-names>Yanfeng</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><xref ref-type="author-notes" rid="afn1">#</xref></contrib>
<contrib id="author-3" contrib-type="author">
<name name-style="western"><surname>Almujally</surname><given-names>Nouf Abdullah</given-names></name><xref ref-type="aff" rid="aff-3">3</xref></contrib>
<contrib id="author-4" contrib-type="author">
<name name-style="western"><surname>Alharbi</surname><given-names>Shuaa S.</given-names></name><xref ref-type="aff" rid="aff-4">4</xref></contrib>
<contrib id="author-5" contrib-type="author">
<name name-style="western"><surname>Algarni</surname><given-names>Asaad</given-names></name><xref ref-type="aff" rid="aff-5">5</xref></contrib>
<contrib id="author-6" contrib-type="author">
<name name-style="western"><surname>Jalal</surname><given-names>Ahmad</given-names></name><xref ref-type="aff" rid="aff-2">2</xref><xref ref-type="aff" rid="aff-6">6</xref></contrib>
<contrib id="author-7" contrib-type="author" corresp="yes">
<name name-style="western"><surname>Liu</surname><given-names>Hui</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><xref ref-type="aff" rid="aff-7">7</xref><xref ref-type="aff" rid="aff-8">8</xref><email>hui.liu@uni-bremen.de</email></contrib>
<aff id="aff-1"><label>1</label><institution>Guodian Nanjing Automation Co., Ltd.</institution>, <addr-line>Nanjing, 600268</addr-line>, <country>China</country></aff>
<aff id="aff-2"><label>2</label><institution>Faculty of Computing and AI, Air University</institution>, <addr-line>Islamabad, 44000</addr-line>, <country>Pakistan</country></aff>
<aff id="aff-3"><label>3</label><institution>Department of Information Systems, College of Computer and Information Sciences, Princess Nourah bint Abdulrahman University</institution>, <addr-line>Riyadh, 11671</addr-line>, <country>Saudi Arabia</country></aff>
<aff id="aff-4"><label>4</label><institution>Department of Information Technology, College of Computer, Qassim University</institution>, <addr-line>Buraydah, 52571</addr-line>, <country>Saudi Arabia</country></aff>
<aff id="aff-5"><label>5</label><institution>Department of Computer Sciences, Faculty of Computing and Information Technology, Northern Border University</institution>, <addr-line>Rafha, 91911</addr-line>, <country>Saudi Arabia</country></aff>
<aff id="aff-6"><label>6</label><institution>Department of Computer Science and Engineering, College of Informatics, Korea University</institution>, <addr-line>Seoul, 02841</addr-line>, <country>Republic of Korea</country></aff>
<aff id="aff-7"><label>7</label><institution>Jiangsu Key Laboratory of Intelligent Medical Image Computing, School of Future Technology, Nanjing University of Information Science and Technology</institution>, <addr-line>Nanjing, 210044</addr-line>, <country>China</country></aff>
<aff id="aff-8"><label>8</label><institution>Cognitive Systems Lab, University of Bremen</institution>, <addr-line>Bremen, 28359</addr-line>, <country>Germany</country></aff>
</contrib-group>
<author-notes>
<corresp id="cor1"><label>&#x002A;</label>Corresponding Author: Hui Liu. Email: <email>hui.liu@uni-bremen.de</email></corresp>
<fn id="afn1">
<p><sup>#</sup>These authors contributed equally to this work</p>
</fn>
</author-notes>
<pub-date date-type="collection" publication-format="electronic">
<year>2026</year>
</pub-date>
<pub-date date-type="pub" publication-format="electronic">
<day>10</day><month>2</month><year>2026</year>
</pub-date>
<volume>87</volume>
<issue>1</issue>
<elocation-id>68</elocation-id>
<history>
<date date-type="received">
<day>17</day>
<month>08</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>22</day>
<month>10</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>&#x00A9; 2026 The Authors.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Published by Tech Science Press.</copyright-holder>
<license xlink:href="https://creativecommons.org/licenses/by/4.0/">
<license-p>This work is licensed under a <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="TSP_CMC_71988.pdf"></self-uri>
<abstract>
<p>Recognizing human interactions in RGB videos is a critical task in computer vision, with applications in video surveillance. Existing deep learning-based architectures have achieved strong results, but are computationally intensive, sensitive to video resolution changes and often fail in crowded scenes. We propose a novel hybrid system that is computationally efficient, robust to degraded video quality and able to filter out irrelevant individuals, making it suitable for real-life use. The system leverages multi-modal handcrafted features for interaction representation and a deep learning classifier for capturing complex dependencies. Using Mask R-CNN and YOLO11-Pose, we extract grayscale silhouettes and keypoint coordinates of interacting individuals, while filtering out irrelevant individuals using a proposed algorithm. From these, we extract silhouette-based features (local ternary pattern and histogram of optical flow) and keypoint-based features (distances, angles and velocities) that capture distinct spatial and temporal information. A Bidirectional Long Short-Term Memory network (BiLSTM) then classifies the interactions. Extensive experiments on the UT Interaction, SBU Kinect Interaction and the ISR-UOL 3D social activity datasets demonstrate that our system achieves competitive accuracy. They also validate the effectiveness of the chosen features and classifier, along with the proposed system&#x2019;s computational efficiency and robustness to occlusion.</p>
</abstract>
<kwd-group kwd-group-type="author">
<kwd>Human interaction recognition</kwd>
<kwd>keypoint coordinates</kwd>
<kwd>grayscale silhouettes</kwd>
<kwd>bidirectional long short-term memory network</kwd>
</kwd-group>
<funding-group>
<award-group id="awg1">
<funding-source>Princess Nourah bint Abdulrahman University</funding-source>
<award-id>PNURSP2025R410</award-id>
</award-group>
</funding-group>
</article-meta>
</front>
<body>
<sec id="s1">
<label>1</label>
<title>Introduction</title>
<p>Human Interaction Recognition (HIR) involves the automated understanding of social interactions between individuals. These interactions can range from everyday actions, like shaking hands or exchanging objects to more critical or suspicious behaviors, such as pushing or punching. Recognizing these interactions across different modalities, such as RGB video, depth data, or sensor readings, has significant applications in areas like security, healthcare, and surveillance [<xref ref-type="bibr" rid="ref-1">1</xref>].</p>
<p>HIR in RGB videos falls within the computer vision domain, focused on classifying the interactions between individuals. Compared to single-person activity recognition, HIR presents greater challenges due to complex spatial and temporal relationships between individuals. Additionally, poor video resolution can significantly degrade classification accuracy. While much work has been done in this domain, existing systems still have limitations. Some perform poorly in noisy or low-resolution videos, some struggle to classify interactions in multi-person settings when irrelevant persons are also present and others are very computationally intensive, making them expensive for use in real-world situations.</p>
<p>In this paper, we propose a system that leverages pretrained deep learning models to segment individuals and extract their keypoint coordinates within a region of interest (ROI), effectively removing irrelevant individuals. We then use carefully selected handcrafted spatial and temporal features, which are faster to compute, followed by a deep learning model for classification. Our approach prioritizes computational efficiency, relying on deep learning models only when classical methods cannot achieve comparable performance. This balance allows our system to maintain strong performance while remaining efficient, making it suitable for real-world deployment. The proposed system has the following key contributions:
<list list-type="bullet">
<list-item>
<p>Pretrained instance segmentation (Mask R-CNN) and keypoint detection (YOLO11-Pose) models are used to provide accurate silhouettes and keypoint coordinates respectively, even in low-resolution videos. An algorithm is proposed to filter out irrelevant individuals from the scene.</p></list-item>
<list-item>
<p>Efficient features are employed to capture the most relevant spatial (Local Ternary Pattern, Distance, Angle) and temporal (Histogram of Optical Flow, Velocity) information for human interaction recognition.</p></list-item>
<list-item>
<p>A BiLSTM is used for incorporating temporal dependencies from both forward and backward directions, enabling more accurate classification.</p></list-item>
<list-item>
<p>Comprehensive analyses of feature contributions and classifier performance show that each selected feature contributes meaningful dependencies and that the chosen classifier is well-suited for our use case. Furthermore, evaluations of computational efficiency and occlusion robustness demonstrate that the system is lightweight and reliable under challenging conditions.</p></list-item>
</list></p>
</sec>
<sec id="s2">
<label>2</label>
<title>Literature Review</title>
<p>This section gives an overview of human interaction classification by handcrafted feature extraction-based approaches and deep learning feature extraction-based approaches.</p>
<sec id="s2_1">
<label>2.1</label>
<title>Handcrafted Feature Extraction-Based Approaches</title>
<p>These approaches typically begin with preprocessing steps, such as histogram equalization or bilateral filtering to reduce noise. Then segmentation techniques, such as Watershed, GrabCut and Active Contours, are commonly used for human segmentation; however, these techniques are highly sensitive to image quality. For keypoint detection, techniques such as skeletonization of silhouettes and contour-based joint estimation are applied to estimate joint positions.</p>
<p>Then a variety of features are extracted to capture the spatial and temporal dynamics of the interaction. For segmentation, this includes features like Scale-Invariant Feature Transform, Histogram of Oriented Gradients, Motion History Image, Bag of Words-based descriptors and Motion Boundary Histograms while for keypoints, this includes keypoint angles, distances and pairwise keypoint displacements. Then these features are optionally optimized using techniques like Linear Discriminant Analysis or t-SNE before being fed into machine learning classifiers such as Support Vector Machines, Random Forests or k-Nearest Neighbors for final classification. Examples are works done by [<xref ref-type="bibr" rid="ref-2">2</xref>,<xref ref-type="bibr" rid="ref-3">3</xref>].</p>
<p>Some more recent works also incorporated depth information alongside RGB data. For example, in [<xref ref-type="bibr" rid="ref-4">4</xref>], both gray scale and depth silhouette features along with point-based features were extracted before cross entropy optimization and classification using a Maximum Entropy Markov Model.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Deep Learning Feature Extraction-Based Approaches</title>
<p>Many deep learning approaches for human interaction classification typically use fine-tuned Convolutional Neural Network (CNN) architectures to extract spatial features from individual frames followed by 3D CNNs or Recurrent Neural Networks to model temporal dynamics across sequences. Segmentation-based models such as Mask R-CNN [<xref ref-type="bibr" rid="ref-5">5</xref>], trained on diverse RGB datasets, have also gained popularity for their robustness to varying video resolutions. For example, [<xref ref-type="bibr" rid="ref-6">6</xref>] used a DenseNet encoder with a custom decoder for semantic segmentation, refining inputs by suppressing irrelevant regions before passing them into an EfficientNet BiLSTM. Multi-stream architectures improve performance by letting each stream specialize in a different modality, such as RGB frames, optical flow or keypoint coordinates. The information is then combined using fusion techniques, as explored in [<xref ref-type="bibr" rid="ref-7">7</xref>].</p>
<p>There also has been increasing focus on systems that explicitly model spatial and temporal relationships between interacting individuals. With the advent of advanced keypoint detection models like OpenPose [<xref ref-type="bibr" rid="ref-8">8</xref>] and HRNet [<xref ref-type="bibr" rid="ref-9">9</xref>], modeling interactions through body joint movements has become more feasible. Works such as [<xref ref-type="bibr" rid="ref-10">10</xref>,<xref ref-type="bibr" rid="ref-11">11</xref>] have leveraged Interactional Relational Networks and Graph Convolutional Networks to effectively represent both intra and inter-person relationships, offering promising results in complex interaction scenarios.</p>
<p>However, recent works demonstrate that intelligent fusion of heterogeneous features significantly improves model reproducibility and generalization [<xref ref-type="bibr" rid="ref-12">12</xref>]. The proposed system builds on this idea, fusing features from different modalities that are both efficient to extract and effective for accurate classification.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Proposed System Methodology</title>
<p>The proposed system begins with dividing the RGB video into frames and selecting a fixed number for classification. Then instance segmentation and keypoint coordinates extraction is performed on each selected frame, focusing on relevant individuals within the ROI. From the resulting gray scale silhouettes and keypoints, spatial and temporal features are extracted followed by classification using a deep learning model. <xref ref-type="fig" rid="fig-1">Fig. 1</xref> provides an overview of the system, with details of each stage provided in the following subsections:</p>
<fig id="fig-1">
<label>Figure 1</label>
<caption>
<title>Architecture of the proposed system</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_71988-fig-1.tif"/>
</fig>
<sec id="s3_1">
<label>3.1</label>
<title>Segmentation</title>
<p>Accurate segmentation is essential for effective feature extraction in activity classification. To achieve this, we use Mask R-CNN [<xref ref-type="bibr" rid="ref-5">5</xref>], which performs accurate instance-level segmentation by generating separate masks for each person. This enables easy removal of irrelevant individuals. It extends Faster R-CNN [<xref ref-type="bibr" rid="ref-13">13</xref>] by adding a parallel branch for mask prediction. The overall loss function <inline-formula id="ieqn-1"><mml:math id="mml-ieqn-1"><mml:mi>L</mml:mi></mml:math></inline-formula> for training Mask R-CNN is defined as a multi-task loss combining classification loss <inline-formula id="ieqn-2"><mml:math id="mml-ieqn-2"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, bounding box regression <inline-formula id="ieqn-3"><mml:math id="mml-ieqn-3"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>o</mml:mi><mml:mi>x</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, and mask prediction loss <inline-formula id="ieqn-4"><mml:math id="mml-ieqn-4"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. The classification loss <inline-formula id="ieqn-5"><mml:math id="mml-ieqn-5"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is computed as:
<disp-formula id="eqn-1"><label>(1)</label><mml:math id="mml-eqn-1" display="block"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:munder><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula>where <inline-formula id="ieqn-6"><mml:math id="mml-ieqn-6"><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the ground-truth label for class <inline-formula id="ieqn-7"><mml:math id="mml-ieqn-7"><mml:mi>c</mml:mi></mml:math></inline-formula> (1 if the RoI belongs to class <inline-formula id="ieqn-8"><mml:math id="mml-ieqn-8"><mml:mi>c</mml:mi></mml:math></inline-formula>, 0 otherwise), and <inline-formula id="ieqn-9"><mml:math id="mml-ieqn-9"><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the predicted probability for class <inline-formula id="ieqn-10"><mml:math id="mml-ieqn-10"><mml:mi>c</mml:mi></mml:math></inline-formula>. The bounding box regression loss <inline-formula id="ieqn-11"><mml:math id="mml-ieqn-11"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>o</mml:mi><mml:mi>x</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is defined as:
<disp-formula id="eqn-2"><label>(2)</label><mml:math id="mml-eqn-2" display="block"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>o</mml:mi><mml:mi>x</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mspace width="thinmathspace" /><mml:mi>y</mml:mi><mml:mo>,</mml:mo><mml:mspace width="thinmathspace" /><mml:mi>w</mml:mi><mml:mo>,</mml:mo><mml:mspace width="thinmathspace" /><mml:mi>h</mml:mi><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:munder><mml:mi>s</mml:mi><mml:mi>m</mml:mi><mml:mi>o</mml:mi><mml:mi>o</mml:mi><mml:mi>t</mml:mi><mml:msub><mml:mi>h</mml:mi><mml:mrow><mml:mi>L</mml:mi><mml:mn>1</mml:mn><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub></mml:math></disp-formula>where <inline-formula id="ieqn-12"><mml:math id="mml-ieqn-12"><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> represents the predicted bounding box coordinates (center coordinates <inline-formula id="ieqn-13"><mml:math id="mml-ieqn-13"><mml:mrow><mml:mo>(</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> and dimensions <inline-formula id="ieqn-14"><mml:math id="mml-ieqn-14"><mml:mo stretchy="false">(</mml:mo><mml:mi>w</mml:mi><mml:mo>,</mml:mo><mml:mi>h</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>), <inline-formula id="ieqn-15"><mml:math id="mml-ieqn-15"><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> represents the ground-truth bounding box coordinates, and <inline-formula id="ieqn-16"><mml:math 
id="mml-ieqn-16"><mml:mi>s</mml:mi><mml:mi>m</mml:mi><mml:mi>o</mml:mi><mml:mi>o</mml:mi><mml:mi>t</mml:mi><mml:msub><mml:mi>h</mml:mi><mml:mrow><mml:mi>L</mml:mi><mml:mn>1</mml:mn><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula> is the smooth <inline-formula id="ieqn-17"><mml:math id="mml-ieqn-17"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> loss function, which is less sensitive to outliers compared to the standard <inline-formula id="ieqn-18"><mml:math id="mml-ieqn-18"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> loss. Finally, the mask prediction loss <inline-formula id="ieqn-19"><mml:math id="mml-ieqn-19"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is defined as the average binary cross-entropy loss over the predicted mask computed for each ROI and ground-truth class <inline-formula id="ieqn-20"><mml:math id="mml-ieqn-20"><mml:mi>k</mml:mi></mml:math></inline-formula>:
<disp-formula id="eqn-3"><label>(3)</label><mml:math id="mml-eqn-3" display="block"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:msup><mml:mi>m</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mfrac><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x2264;</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo>&#x2264;</mml:mo><mml:mi>m</mml:mi></mml:mrow></mml:munder><mml:mrow><mml:mo>[</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:msubsup><mml:mrow><mml:mover><mml:mi>y</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:msubsup><mml:mrow><mml:mover><mml:mi>y</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup><mml:mo>)</mml:mo></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:math></disp-formula>where <inline-formula id="ieqn-21"><mml:math id="mml-ieqn-21"><mml:mi>m</mml:mi></mml:math></inline-formula> is the resolution of the predicted mask, <inline-formula id="ieqn-22"><mml:math 
id="mml-ieqn-22"><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the ground-truth binary mask value at pixel <inline-formula id="ieqn-23"><mml:math id="mml-ieqn-23"><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>, and <inline-formula id="ieqn-24"><mml:math id="mml-ieqn-24"><mml:msubsup><mml:mrow><mml:mover><mml:mi>y</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> is the predicted mask value for class <inline-formula id="ieqn-25"><mml:math id="mml-ieqn-25"><mml:mi>k</mml:mi></mml:math></inline-formula> at pixel <inline-formula id="ieqn-26"><mml:math id="mml-ieqn-26"><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>.</p>
<p>For our experiments, we used a pre-trained Mask R-CNN. RGB frames were resized to an optimal resolution, and a confidence threshold was applied to segment each individual in the frame. Using the ROI boxes, only gray scale silhouettes of relevant individuals were extracted, effectively removing irrelevant individuals. <xref ref-type="fig" rid="fig-2">Fig. 2</xref> shows instance segmentation results on the UT Interaction dataset (UT) [<xref ref-type="bibr" rid="ref-14">14</xref>], which we evaluated on for our experiments. In the top row, red bounding boxes indicate ROIs, while different colors distinguish individual person instances detected by Mask R-CNN. The bottom row displays corresponding grayscale silhouettes of the segmented individuals within the ROIs.</p>
<fig id="fig-2">
<label>Figure 2</label>
<caption>
<title>Segmentation results for hugging, kicking, and punching activities</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_71988-fig-2.tif"/>
</fig>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Keypoint Detection</title>
<p>For keypoints detection and tracking of interacting individuals across frames in ROI, we use YOLO11-Pose [<xref ref-type="bibr" rid="ref-15">15</xref>] from the YOLO framework. YOLO11-Pose builds on YOLOv8&#x2019;s loss structure. Detection loss includes bounding box loss <inline-formula id="ieqn-27"><mml:math id="mml-ieqn-27"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>o</mml:mi><mml:mi>x</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> (IoU or Smooth L1), classification loss <inline-formula id="ieqn-28"><mml:math id="mml-ieqn-28"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> (Binary Cross-Entropy), and Distribution Focal Loss <inline-formula id="ieqn-29"><mml:math id="mml-ieqn-29"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mi>f</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> (DFL). For pose estimation, additional losses are used for keypoint localization loss <inline-formula id="ieqn-30"><mml:math id="mml-ieqn-30"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>k</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula> (Smooth L1) and keypoint visibility loss <inline-formula id="ieqn-31"><mml:math id="mml-ieqn-31"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>k</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mi>v</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula> (Binary Cross-Entropy), combined in a weighted total loss computed as:
<disp-formula id="eqn-4"><label>(4)</label><mml:math id="mml-eqn-4" display="block"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>o</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x03BB;</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>o</mml:mi><mml:mi>x</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2217;</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>o</mml:mi><mml:mi>x</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x03BB;</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2217;</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x03BB;</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mi>f</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2217;</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mi>f</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x03BB;</mml:mi><mml:mrow><mml:mi>k</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x2217;</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>k</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x03BB;</mml:mi><mml:mrow><mml:mi>k</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mi>v</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x2217;</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>k</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mi>v</mml:mi
><mml:mi>i</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msub></mml:math></disp-formula>where <inline-formula id="ieqn-32"><mml:math id="mml-ieqn-32"><mml:msub><mml:mi>&#x03BB;</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>o</mml:mi><mml:mi>x</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula id="ieqn-33"><mml:math id="mml-ieqn-33"><mml:msub><mml:mi>&#x03BB;</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula id="ieqn-34"><mml:math id="mml-ieqn-34"><mml:msub><mml:mi>&#x03BB;</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mi>f</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula id="ieqn-35"><mml:math id="mml-ieqn-35"><mml:msub><mml:mi>&#x03BB;</mml:mi><mml:mrow><mml:mi>k</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula id="ieqn-36"><mml:math id="mml-ieqn-36"><mml:msub><mml:mi>&#x03BB;</mml:mi><mml:mrow><mml:mi>k</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mi>v</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula> are task-specific weighting factors. A key innovation in YOLO11 is the introduction of new Cross Stage Partial with Spatial Attention (C2PSA) and C3k2 blocks which enhance feature extraction while maintaining efficiency. C2PSA uses Spatial Pyramid Pooling, which is fast for multi-scale aggregation with a position-sensitive attention mechanism. C3k2 unlike the C2f block used in previous YOLO versions, employs two smaller convolutions instead of one large convolution. 
Both blocks split the input feature map <inline-formula id="ieqn-37"><mml:math id="mml-ieqn-37"><mml:mi>x</mml:mi></mml:math></inline-formula> into two branches. One branch retains the input features, and the other processes them through specialized modules before being concatenated and fused via a 1 &#x00D7; 1 convolution. The equations are given below:
<disp-formula id="eqn-5"><label>(5)</label><mml:math id="mml-eqn-5" display="block"><mml:mtable columnalign="right left right left right left right left right left right left" rowspacing="3pt" columnspacing="0em 2em 0em 2em 0em 2em 0em 2em 0em 2em 0em" displaystyle="true"><mml:mtr><mml:mtd /><mml:mtd><mml:mi>C</mml:mi><mml:mn>2</mml:mn><mml:mi>P</mml:mi><mml:mi>S</mml:mi><mml:mi>A</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>x</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mtext>Con</mml:mtext></mml:mrow><mml:msub><mml:mrow><mml:mtext>v</mml:mtext></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mtext>Concat</mml:mtext></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>a</mml:mi><mml:mo>,</mml:mo><mml:mi>P</mml:mi><mml:mi>S</mml:mi><mml:mi>A</mml:mi><mml:mi>B</mml:mi><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>c</mml:mi><mml:mi>k</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>b</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="eqn-6"><label>(6)</label><mml:math id="mml-eqn-6" display="block"><mml:mtable columnalign="right left right left right left right left right left right left" rowspacing="3pt" columnspacing="0em 2em 0em 2em 0em 2em 0em 2em 0em 2em 0em" displaystyle="true"><mml:mtr><mml:mtd /><mml:mtd><mml:mi>C</mml:mi><mml:mn>3</mml:mn><mml:mi>k</mml:mi><mml:mn>2</mml:mn><mml:mrow><mml:mo>(</mml:mo><mml:mi>x</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mtext>Con</mml:mtext></mml:mrow><mml:msub><mml:mrow><mml:mtext>v</mml:mtext></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mtext>Concat</mml:mtext></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula></p>
<p>In C2PSA, <inline-formula id="ieqn-38"><mml:math id="mml-ieqn-38"><mml:mi>a</mml:mi></mml:math></inline-formula> and <inline-formula id="ieqn-39"><mml:math id="mml-ieqn-39"><mml:mi>b</mml:mi></mml:math></inline-formula> are the split branches of <inline-formula id="ieqn-40"><mml:math id="mml-ieqn-40"><mml:mi>x</mml:mi></mml:math></inline-formula>, and <inline-formula id="ieqn-41"><mml:math id="mml-ieqn-41"><mml:mi>P</mml:mi><mml:mi>S</mml:mi><mml:mi>A</mml:mi><mml:mi>B</mml:mi><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>c</mml:mi><mml:mi>k</mml:mi></mml:math></inline-formula> applies multi-head attention and feed-forward transformations. In C3k2, <inline-formula id="ieqn-42"><mml:math id="mml-ieqn-42"><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula id="ieqn-43"><mml:math id="mml-ieqn-43"><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> are the initial split branches, and <inline-formula id="ieqn-44"><mml:math id="mml-ieqn-44"><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> are the outputs of the C3k or Bottleneck modules.</p>
<p>For our experiments, we used a pretrained YOLO11x-Pose, the largest YOLO11-Pose variant, due to its superior keypoints detection performance, with ByteTrack tracker for correct keypoints assignment across frames. ByteTrack assigns unique IDs to persons across frames by first matching high-confidence detections using Intersection over Union, then re-associating low-confidence detections to reduce identity switches. Again, using the ROI boxes, keypoints of irrelevant individuals were removed.</p>
<p>YOLO11-Pose detects 17 keypoints, namely nose, eyes, ears, shoulders, elbows, wrists, hips, knees, and ankles. In our approach, we exclude the ear and eye keypoints because they are highly correlated with the nose and often suffer from unreliable detection depending on the person&#x2019;s orientation in the frame. For example, when an individual is in a lateral view, one eye or ear may not be detected, introducing inconsistencies and noise into the feature representation which ultimately results in lower classification accuracy. By retaining 13 keypoints, each representing distinct and stable body part positions, our system preserves the most discriminative information for effective classification. <xref ref-type="fig" rid="fig-3">Fig. 3</xref> visualizes keypoint detection and tracking on the UT dataset. Red bounding boxes indicate ROIs and colored markers represent individual keypoints, track IDs and the bounding boxes of individuals.</p>
<fig id="fig-3">
<label>Figure 3</label>
<caption>
<title>Keypoint detection and tracking for pointing, kicking, and shaking hands activities</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_71988-fig-3.tif"/>
</fig>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Algorithm for ROI-Based Segmentation and Keypoint Detection</title>
<p>The following Algorithm 1 is used in our system to filter out irrelevant individuals using the ROI boxes during segmentation and keypoint detection, so features of only relevant individuals are extracted later:</p>
<fig id="fig-10">
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_71988-fig-10.tif"/>
</fig>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Feature Extraction</title>
<p>Our system captures both spatial and temporal features from gray scale silhouettes and keypoints. This ensures the extraction of features essential for accurate classification. The details are provided below:</p>
<sec id="s3_4_1">
<label>3.4.1</label>
<title>Local Ternary Pattern Features</title>
<p>For the spatial features of grayscale silhouettes, we extract Local Ternary Pattern (LTP) features, which capture local texture information by encoding the relationships between pixel intensities in a neighborhood. The LTP algorithm extends the Local Binary Pattern by introducing a ternary encoding scheme, which divides pixel differences into three categories: positive, negative, and neutral. This allows for more robust feature representation, especially in scenarios with varying illumination or noise. Given a grayscale image <inline-formula id="ieqn-45"><mml:math id="mml-ieqn-45"><mml:mi>I</mml:mi></mml:math></inline-formula> and a center pixel <inline-formula id="ieqn-46"><mml:math id="mml-ieqn-46"><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, the LTP code for a neighboring pixel <inline-formula id="ieqn-47"><mml:math id="mml-ieqn-47"><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is computed as follows:
<disp-formula id="eqn-7"><label>(7)</label><mml:math id="mml-eqn-7" display="block"><mml:mrow><mml:mtext>LTP</mml:mtext></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mrow><mml:mtext>I</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mtext>p</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mtable columnalign="right center" rowspacing="4pt" columnspacing="1em"><mml:mtr><mml:mtd><mml:mn>1</mml:mn><mml:mo>,</mml:mo></mml:mtd><mml:mtd><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo>&#x003E;</mml:mo><mml:mi>&#x03C4;</mml:mi></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo></mml:mtd><mml:mtd><mml:mspace width="thinmathspace" /><mml:mspace width="thinmathspace" /><mml:mspace width="thinmathspace" /><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo>&#x003C;</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mi>&#x03C4;</mml:mi></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mn>0</mml:mn><mml:mo>,</mml:mo></mml:mtd><mml:mtd><mml:mi>o</mml:mi><mml:mi>t</mml:mi><mml:mi>h</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi><mml:mi>w</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi></mml:mtd></mml:mtr></mml:mtable><mml:mo fence="true" stretchy="true" symmetric="true"></mml:mo></mml:mrow></mml:math></disp-formula>where <inline-formula id="ieqn-48"><mml:math id="mml-ieqn-48"><mml:mi>&#x03C4;</mml:mi></mml:math></inline-formula> is a predefined threshold. The positive LTP and negative LTP codes are processed separately to generate histograms. 
These histograms are concatenated to form the final LTP feature vector. <xref ref-type="fig" rid="fig-4">Fig. 4</xref> shows a visualization of LTP features for different interactions from the UT dataset.</p>
<fig id="fig-4">
<label>Figure 4</label>
<caption>
<title>LTP features for shaking hands, pushing and hugging activities</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_71988-fig-4.tif"/>
</fig>
</sec>
<sec id="s3_4_2">
<label>3.4.2</label>
<title>Histogram of Optical Flow (HOF) Features</title>
<p>To capture temporal motion information of interacting individuals across frames, we extract Histogram of Optical Flow (HOF) features from gray scale silhouettes. These features encode the motion patterns between consecutive frames by analyzing the displacement of pixels over time. The optical flow is computed using the Farneback method, which estimates the dense motion vectors between two consecutive frames. The flow vectors <inline-formula id="ieqn-49"><mml:math id="mml-ieqn-49"><mml:mi>F</mml:mi></mml:math></inline-formula> are decomposed into their magnitude <inline-formula id="ieqn-50"><mml:math id="mml-ieqn-50"><mml:mrow><mml:mo>|</mml:mo><mml:mi>F</mml:mi><mml:mo>|</mml:mo></mml:mrow></mml:math></inline-formula> and angle <inline-formula id="ieqn-51"><mml:math id="mml-ieqn-51"><mml:mi>&#x03B8;</mml:mi></mml:math></inline-formula>, defined as:
<disp-formula id="eqn-8"><label>(8)</label><mml:math id="mml-eqn-8" display="block"><mml:mtable columnalign="right left right left right left right left right left right left" rowspacing="3pt" columnspacing="0em 2em 0em 2em 0em 2em 0em 2em 0em 2em 0em" displaystyle="true"><mml:mtr><mml:mtd /><mml:mtd><mml:mrow><mml:mo>|</mml:mo><mml:mi>F</mml:mi><mml:mo>|</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mtext>sqrt</mml:mtext></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:msubsup><mml:mi>F</mml:mi><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>+</mml:mo><mml:msubsup><mml:mi>F</mml:mi><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="eqn-9"><label>(9)</label><mml:math id="mml-eqn-9" display="block"><mml:mtable columnalign="right left right left right left right left right left right left" rowspacing="3pt" columnspacing="0em 2em 0em 2em 0em 2em 0em 2em 0em 2em 0em" displaystyle="true"><mml:mtr><mml:mtd /><mml:mtd><mml:mi>&#x03B8;</mml:mi><mml:mo>=</mml:mo><mml:mi>arctan</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mn>2</mml:mn><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>y</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>x</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>where <inline-formula id="ieqn-52"><mml:math id="mml-ieqn-52"><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>x</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula id="ieqn-53"><mml:math id="mml-ieqn-53"><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>y</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> are the horizontal and vertical components of the flow vectors, respectively. The angles <inline-formula id="ieqn-54"><mml:math id="mml-ieqn-54"><mml:mi>&#x03B8;</mml:mi></mml:math></inline-formula> are binned into predefined number of bins (covering 0&#x00B0; to 360&#x00B0;), and the corresponding magnitudes are used to weight the histogram. This results in a feature vector for each frame pair, representing the distribution of motion directions weighted by their magnitudes. HOF features are robust to small variations in motion and noise, making them a good feature descriptor for capturing dynamic patterns in actions. <xref ref-type="fig" rid="fig-5">Fig. 5</xref> shows a visualization of HOF features for different interactions from the UT dataset.</p>
<fig id="fig-5">
<label>Figure 5</label>
<caption>
<title>HOF features for punching, kicking and shaking hands activities</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_71988-fig-5.tif"/>
</fig>
</sec>
<sec id="s3_4_3">
<label>3.4.3</label>
<title>Distance Features</title>
<p>To capture the spatial relationships between keypoints, we extract inter-person distance features. These features characterize the relative positions of body parts across individuals and are particularly useful for distinguishing interactions that involve varying degrees of physical proximity, such as hugging, approaching, or punching. Specifically, they describe how close or far apart certain joints or body regions are, which can reflect the nature and intensity of the interaction. Euclidean distance between two keypoints <inline-formula id="ieqn-55"><mml:math id="mml-ieqn-55"><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> and <inline-formula id="ieqn-56"><mml:math id="mml-ieqn-56"><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> is computed as:
<disp-formula id="eqn-10"><label>(10)</label><mml:math id="mml-eqn-10" display="block"><mml:mi>d</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>e</mml:mi><mml:mo>=</mml:mo><mml:msqrt><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>+</mml:mo><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:msqrt></mml:math></disp-formula></p>
<p>We use a fully connected approach, computing pairwise distances between all keypoints of the interacting individuals. This allows the model to learn meaningful interaction patterns without relying on handcrafted keypoint pairs. This approach captures a comprehensive spatial representation of the interaction, helping the system generalize well across various postures and actions. <xref ref-type="fig" rid="fig-6">Fig. 6</xref> shows a visualization of these distance features for several interactions from the UT dataset.</p>
<fig id="fig-6">
<label>Figure 6</label>
<caption>
<title>Distance features for punching, pushing and kicking activities</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_71988-fig-6.tif"/>
</fig>
</sec>
<sec id="s3_4_4">
<label>3.4.4</label>
<title>Angle Features</title>
<p>To capture orientation information between keypoints, we extract intra-person angle features. These features reflect the angular relationships between connected joints within an individual&#x2019;s body, providing insight into how different body parts are aligned or directed during an interaction. Such information is especially useful for distinguishing between actions that may involve similar spatial distances across interacting individuals but differ in limbs orientation of each individual, such as pushing vs. punching. The angle <inline-formula id="ieqn-57"><mml:math id="mml-ieqn-57"><mml:mi>&#x03B8;</mml:mi></mml:math></inline-formula> between two keypoints <inline-formula id="ieqn-58"><mml:math id="mml-ieqn-58"><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> and <inline-formula id="ieqn-59"><mml:math id="mml-ieqn-59"><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> is computed as:
<disp-formula id="eqn-11"><label>(11)</label><mml:math id="mml-eqn-11" display="block"><mml:mi>&#x03B8;</mml:mi><mml:mo>=</mml:mo><mml:mi>arctan</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mn>2</mml:mn><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mspace width="thinmathspace" /><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>Similar to distance features, we employ the same fully connected approach, computing angular relationships between all keypoints of each person. This method ensures that the model captures the complete set of angular dynamics, without any selective feature extraction. <xref ref-type="fig" rid="fig-7">Fig. 7</xref> shows a visualization of angle features for different interactions from the UT dataset.</p>
<fig id="fig-7">
<label>Figure 7</label>
<caption>
<title>Angle features for shaking hands, pushing and hugging activities</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_71988-fig-7.tif"/>
</fig>
</sec>
<sec id="s3_4_5">
<label>3.4.5</label>
<title>Velocity Features</title>
<p>To capture temporal motion dynamics, we extract velocity features from each person&#x2019;s keypoints across consecutive frames. Velocity features can complement spatial features in further distinguishing activities with the same spatial patterns but different temporal patterns like approaching or departing. The velocity for a keypoint between two consecutive frames is computed as the Euclidean distance between its coordinates in the current frame <inline-formula id="ieqn-60"><mml:math id="mml-ieqn-60"><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mspace width="thinmathspace" /><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> and the previous frame <inline-formula id="ieqn-61"><mml:math id="mml-ieqn-61"><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mspace width="thinmathspace" /><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>:
<disp-formula id="eqn-12"><label>(12)</label><mml:math id="mml-eqn-12" display="block"><mml:mrow><mml:mtext>velocity</mml:mtext></mml:mrow><mml:mo>=</mml:mo><mml:msqrt><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>+</mml:mo><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:msqrt></mml:math></disp-formula></p>
<p>These velocity features encode the motion patterns of keypoints over time which can be helpful for accurate classification. <xref ref-type="fig" rid="fig-8">Fig. 8</xref> presents a heatmap-style visualization of velocity features for various interactions from the UT dataset. The colors show how fast different body parts are moving and how movement patterns change between activities.</p>
<fig id="fig-8">
<label>Figure 8</label>
<caption>
<title>Velocity features for punching, shaking hands and kicking activities. Dark blue, light blue, and brown keypoints represent very high, high, and low velocities, respectively</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_71988-fig-8.tif"/>
</fig>
</sec>
</sec>
<sec id="s3_5">
<label>3.5</label>
<title>Classification</title>
<p>Long Short-Term Memory Networks (LSTMs) have been widely used for activity classification, due to their ability to model temporal dependencies across frame sequences. They use gating mechanisms to control the flow of information. The input gate regulates how much new information is added to the cell state, while the forget gate controls how much of the previous cell state should be forgotten. The candidate cell state determines the potential update to cell state, and the output gate decides how much of the current cell state is exposed to the next layer. These interactions collectively update both the cell state and the hidden state at each time step.</p>
<p>Bidirectional LSTMs (BiLSTMs) process sequences in both forward and backward directions, showing better performance than regular LSTMs. At each time step, hidden states from both passes are concatenated, and the final representation combines the last forward and backward states, yielding rich contextual features for classification. We use BiLSTM for this very reason. This final representation is fed into a fully connected layer and softmax to generate class probabilities. For our experiments, we explored various BiLSTM hyperparameters to achieve the best possible accuracy. <xref ref-type="table" rid="table-1">Table 1</xref> shows the hyperparameters of BiLSTM that gave the best accuracies on the UT interaction (UT) [<xref ref-type="bibr" rid="ref-14">14</xref>], SBU kinect interaction (SBU) [<xref ref-type="bibr" rid="ref-3">3</xref>] and ISR-UoL 3D social activity (ISR-UoL-3D) [<xref ref-type="bibr" rid="ref-16">16</xref>] datasets used in our experiments.</p>
<table-wrap id="table-1">
<label>Table 1</label>
<caption>
<title>Best hyperparameters of BiLSTM found on the three datasets</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th>Dataset</th>
<th>Number of layers</th>
<th>Hidden dimension</th>
<th>Learning rate</th>
<th>Learning rate scheduler</th>
<th>Optimizer</th>
<th>Epochs</th>
</tr>
</thead>
<tbody>
<tr>
<td>UT</td>
<td>2</td>
<td>128</td>
<td>0.001</td>
<td>Reduce learning rate on Plateau</td>
<td>Adam</td>
<td>100</td>
</tr>
<tr>
<td>SBU</td>
<td>2</td>
<td>256</td>
<td>0.001</td>
<td>Exponential learning rate</td>
<td>Adam</td>
<td>100</td>
</tr>
<tr>
<td>ISR-UoL-3D</td>
<td>1</td>
<td>128</td>
<td>0.001</td>
<td>Reduce learning rate on Plateau</td>
<td>Adam</td>
<td>100</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experiments</title>
<sec id="s4_1">
<label>4.1</label>
<title>Datasets Description</title>
<p>Evaluation of the proposed system is performed on three benchmark datasets: UT interaction (UT) [<xref ref-type="bibr" rid="ref-14">14</xref>], SBU kinect interaction (SBU) [<xref ref-type="bibr" rid="ref-3">3</xref>] and ISR-UoL 3D social activity (ISR-UoL-3D) [<xref ref-type="bibr" rid="ref-16">16</xref>] datasets. Details of these datasets are as follows:</p>
<sec id="s4_1_1">
<label>4.1.1</label>
<title>UT Interaction Dataset</title>
<p>The UT dataset contains six two-person interactions: shaking hands, pointing, hugging, pushing, kicking, and punching, with 20 videos per interaction at 720 &#x00D7; 480 resolution and 30 FPS. It is divided into two sets: UT-1 (parking lot) and UT-2 (windy lawn with more background activity). We merged both sets and performed 20-fold cross-validation on the combined data. From each video, central 40 frames were extracted as input, assuming they carry the most informative content. These environmental differences enable a robust evaluation of the system&#x2019;s generalization across diverse scenarios.</p>
</sec>
<sec id="s4_1_2">
<label>4.1.2</label>
<title>SBU Kinect Interaction Dataset</title>
<p>The SBU dataset consists of eight types of two-person interactions: approaching, departing, kicking, punching, pushing, hugging, exchanging objects, and shaking hands. It includes 282 video sequences, each divided into frames with a 640 &#x00D7; 480 resolution. In addition to RGB frames, the dataset provides depth maps for each frame and 3D keypoint coordinates of the individuals, offering rich spatial and temporal information for analysis. Following the authors&#x2019; recommended 5-fold cross-validation setup, we used the central 25 frames from each sequence as input to our system.</p>
</sec>
<sec id="s4_1_3">
<label>4.1.3</label>
<title>ISR-UoL 3D Social Activity Dataset</title>
<p>The ISR-UoL-3D dataset contains RGB, depth, and skeletal data captured with a Kinect 2 sensor across eight interaction types: shaking hands, hugging, helping walk, helping stand-up, fighting, pushing, talking, and drawing attention. It includes 167,442 interaction instances across ten sessions. Each interaction is repeated multiple times per video (up to 4060 repetitions) and includes RGB, 8/16-bit depth, and 15-joint skeleton data. Following the original 10-fold cross-validation setup, we sampled a subset of instances and used the central 40 frames from each as input to our system.</p>
</sec>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Experimental Results</title>
<sec id="s4_2_1">
<label>4.2.1</label>
<title>Confusion Matrix Calculations</title>
<p>We calculated the confusion matrix for each dataset for a better understanding of our system&#x2019;s classification performance, shown in the following <xref ref-type="table" rid="table-2">Tables 2</xref>&#x2013;<xref ref-type="table" rid="table-4">4</xref>:</p>
<table-wrap id="table-2">
<label>Table 2</label>
<caption>
<title>Confusion matrix of activities of the UT dataset</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th>Classes</th>
<th>SH</th>
<th>HU</th>
<th>KI</th>
<th>PO</th>
<th>PU</th>
<th>PUS</th>
</tr>
</thead>
<tbody>
<tr>
<td>SH</td>
<td>98.00</td>
<td>0.00</td>
<td>1.00</td>
<td>0.00</td>
<td>1.00</td>
<td>0.00</td>
</tr>
<tr>
<td>HU</td>
<td>0.00</td>
<td>99.00</td>
<td>0.00</td>
<td>1.00</td>
<td>0.00</td>
<td>0.00</td>
</tr>
<tr>
<td>KI</td>
<td>3.00</td>
<td>0.00</td>
<td>87.00</td>
<td>0.00</td>
<td>10.00</td>
<td>0.00</td>
</tr>
<tr>
<td>PO</td>
<td>0.00</td>
<td>0.00</td>
<td>0.00</td>
<td>100.00</td>
<td>0.00</td>
<td>0.00</td>
</tr>
<tr>
<td>PU</td>
<td>4.00</td>
<td>3.00</td>
<td>7.00</td>
<td>0.00</td>
<td>84.00</td>
<td>2.00</td>
</tr>
<tr>
<td>PUS</td>
<td>0.00</td>
<td>0.00</td>
<td>0.00</td>
<td>0.00</td>
<td>2.00</td>
<td>98.00</td>
</tr>
<tr>
<td align="center" colspan="7">Mean accuracy &#x003D; 94.29%</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="table-2fn1" fn-type="other">
<p>Note: SH &#x003D; shaking hands, HU &#x003D; hugging, PO &#x003D; pointing, PUS &#x003D; pushing, KI &#x003D; kicking, PU &#x003D; punching.</p>
</fn>
</table-wrap-foot>
</table-wrap><table-wrap id="table-3">
<label>Table 3</label>
<caption>
<title>Confusion matrix of activities of the SBU dataset</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th>Classes</th>
<th>AP</th>
<th>DE</th>
<th>KI</th>
<th>PUS</th>
<th>SH</th>
<th>HU</th>
<th>EO</th>
<th>PU</th>
</tr>
</thead>
<tbody>
<tr>
<td>AP</td>
<td>94.00</td>
<td>4.00</td>
<td>0.00</td>
<td>0.00</td>
<td>0.00</td>
<td>0.00</td>
<td>2.00</td>
<td>0.00</td>
</tr>
<tr>
<td>DE</td>
<td>1.00</td>
<td>97.00</td>
<td>0.00</td>
<td>0.00</td>
<td>0.00</td>
<td>0.00</td>
<td>2.00</td>
<td>0.00</td>
</tr>
<tr>
<td>KI</td>
<td>0.00</td>
<td>0.00</td>
<td>91.00</td>
<td>3.00</td>
<td>0.00</td>
<td>0.00</td>
<td>2.00</td>
<td>4.00</td>
</tr>
<tr>
<td>PUS</td>
<td>0.00</td>
<td>1.00</td>
<td>2.00</td>
<td>88.00</td>
<td>3.00</td>
<td>0.00</td>
<td>2.00</td>
<td>4.00</td>
</tr>
<tr>
<td>SH</td>
<td>0.00</td>
<td>0.00</td>
<td>0.00</td>
<td>6.00</td>
<td>87.00</td>
<td>6.00</td>
<td>1.00</td>
<td>0.00</td>
</tr>
<tr>
<td>HU</td>
<td>0.00</td>
<td>0.00</td>
<td>1.00</td>
<td>4.00</td>
<td>0.00</td>
<td>90.00</td>
<td>5.00</td>
<td>0.00</td>
</tr>
<tr>
<td>EO</td>
<td>0.00</td>
<td>0.00</td>
<td>0.00</td>
<td>0.00</td>
<td>0.00</td>
<td>0.00</td>
<td>99.00</td>
<td>1.00</td>
</tr>
<tr>
<td>PU</td>
<td>0.00</td>
<td>0.00</td>
<td>3.00</td>
<td>3.00</td>
<td>0.00</td>
<td>2.00</td>
<td>0.00</td>
<td>92.00</td>
</tr>
<tr>
<td align="center" colspan="9">Mean accuracy &#x003D; 92.65%</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="table-3fn1" fn-type="other">
<p>Note: AP &#x003D; approaching, DE &#x003D; departing, KI &#x003D; kicking, PUS &#x003D; pushing, SH &#x003D; shaking hand, HU &#x003D; hugging, EO &#x003D; exchanging objects, PU &#x003D; punching.</p>
</fn>
</table-wrap-foot>
</table-wrap><table-wrap id="table-4">
<label>Table 4</label>
<caption>
<title>Confusion matrix of activities of the ISR-UoL dataset</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th>Classes</th>
<th>SH</th>
<th>HU</th>
<th>HW</th>
<th>HS</th>
<th>FI</th>
<th>PUS</th>
<th>TA</th>
<th>DA</th>
</tr>
</thead>
<tbody>
<tr>
<td>SH</td>
<td>97.00</td>
<td>2.00</td>
<td>0.00</td>
<td>0.00</td>
<td>0.00</td>
<td>1.00</td>
<td>0.00</td>
<td>0.00</td>
</tr>
<tr>
<td>HU</td>
<td>0.00</td>
<td>89.00</td>
<td>0.00</td>
<td>0.00</td>
<td>1.00</td>
<td>10.00</td>
<td>0.00</td>
<td>0.00</td>
</tr>
<tr>
<td>HW</td>
<td>0.00</td>
<td>0.00</td>
<td>100.00</td>
<td>0.00</td>
<td>0.00</td>
<td>0.00</td>
<td>0.00</td>
<td>0.00</td>
</tr>
<tr>
<td>HS</td>
<td>2.00</td>
<td>0.00</td>
<td>4.00</td>
<td>89.00</td>
<td>0.00</td>
<td>3.00</td>
<td>2.00</td>
<td>0.00</td>
</tr>
<tr>
<td>FI</td>
<td>0.00</td>
<td>3.00</td>
<td>0.00</td>
<td>0.00</td>
<td>93.00</td>
<td>4.00</td>
<td>0.00</td>
<td>0.00</td>
</tr>
<tr>
<td>PUS</td>
<td>3.00</td>
<td>7.00</td>
<td>0.00</td>
<td>0.00</td>
<td>1.00</td>
<td>88.00</td>
<td>1.00</td>
<td>0.00</td>
</tr>
<tr>
<td>TA</td>
<td>0.00</td>
<td>0.00</td>
<td>0.00</td>
<td>0.00</td>
<td>0.00</td>
<td>0.00</td>
<td>100.00</td>
<td>0.00</td>
</tr>
<tr>
<td>DA</td>
<td>2.00</td>
<td>0.00</td>
<td>0.00</td>
<td>0.00</td>
<td>0.00</td>
<td>1.00</td>
<td>0.00</td>
<td>97.00</td>
</tr>
<tr>
<td align="center" colspan="9">Mean accuracy &#x003D; 94.30%</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="table-4fn1" fn-type="other">
<p>Note: SH &#x003D; shaking hand, HU &#x003D; hugging, HW &#x003D; helping walk, HS &#x003D; helping stand up, FI &#x003D; fighting, PUS &#x003D; pushing, TA &#x003D; talking, DA &#x003D; drawing attention.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s4_2_2">
<label>4.2.2</label>
<title>Precision, Recall, and F1-Score</title>
<p>We calculated the precision, recall and F1-score achieved by our system across all datasets. These are shown for each dataset separately in the following <xref ref-type="table" rid="table-5">Tables 5</xref>&#x2013;<xref ref-type="table" rid="table-7">7</xref>. The last row in each table shows the average over all classes.</p>
<table-wrap id="table-5">
<label>Table 5</label>
<caption>
<title>Measures of precision, recall and F1 score over the UT dataset</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th>Classes</th>
<th>Precision</th>
<th>Recall</th>
<th>F1 Score</th>
</tr>
</thead>
<tbody>
<tr>
<td>SH</td>
<td>96.00</td>
<td>98.00</td>
<td>96.00</td>
</tr>
<tr>
<td>HU</td>
<td>98.00</td>
<td>99.00</td>
<td>98.00</td>
</tr>
<tr>
<td>KI</td>
<td>84.00</td>
<td>88.00</td>
<td>85.00</td>
</tr>
<tr>
<td>PO</td>
<td>99.00</td>
<td>100.00</td>
<td>99.00</td>
</tr>
<tr>
<td>PU</td>
<td>91.00</td>
<td>85.00</td>
<td>85.00</td>
</tr>
<tr>
<td>PUS</td>
<td>98.00</td>
<td>98.00</td>
<td>98.00</td>
</tr>
<tr>
<td><bold>Average</bold></td>
<td>94.00</td>
<td>95.00</td>
<td>94.00</td>
</tr>
</tbody>
</table>
</table-wrap><table-wrap id="table-6">
<label>Table 6</label>
<caption>
<title>Measures of precision, recall and F1 score over the SBU dataset</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th>Classes</th>
<th>Precision</th>
<th>Recall</th>
<th>F1 Score</th>
</tr>
</thead>
<tbody>
<tr>
<td>AP</td>
<td>98.00</td>
<td>94.00</td>
<td>96.00</td>
</tr>
<tr>
<td>DE</td>
<td>95.00</td>
<td>96.00</td>
<td>96.00</td>
</tr>
<tr>
<td>KI</td>
<td>94.00</td>
<td>91.00</td>
<td>92.00</td>
</tr>
<tr>
<td>PUS</td>
<td>91.00</td>
<td>88.00</td>
<td>89.00</td>
</tr>
<tr>
<td>SH</td>
<td>91.00</td>
<td>87.00</td>
<td>89.00</td>
</tr>
<tr>
<td>HU</td>
<td>90.00</td>
<td>89.00</td>
<td>90.00</td>
</tr>
<tr>
<td>EO</td>
<td>90.00</td>
<td>99.00</td>
<td>94.00</td>
</tr>
<tr>
<td>PU</td>
<td>91.00</td>
<td>92.00</td>
<td>92.00</td>
</tr>
<tr>
<td><bold>Average</bold></td>
<td>93.00</td>
<td>92.00</td>
<td>92.00</td>
</tr>
</tbody>
</table>
</table-wrap><table-wrap id="table-7">
<label>Table 7</label>
<caption>
<title>Measures of precision, recall and F1 score over the ISR-UoL 3D dataset</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th>Classes</th>
<th>Precision</th>
<th>Recall</th>
<th>F1 Score</th>
</tr>
</thead>
<tbody>
<tr>
<td>SH</td>
<td>94.00</td>
<td>97.00</td>
<td>96.00</td>
</tr>
<tr>
<td>HU</td>
<td>90.00</td>
<td>89.00</td>
<td>89.00</td>
</tr>
<tr>
<td>HW</td>
<td>97.00</td>
<td>100.00</td>
<td>98.00</td>
</tr>
<tr>
<td>HS</td>
<td>100.00</td>
<td>89.00</td>
<td>92.00</td>
</tr>
<tr>
<td>FI</td>
<td>98.00</td>
<td>93.00</td>
<td>95.00</td>
</tr>
<tr>
<td>PUS</td>
<td>82.00</td>
<td>88.00</td>
<td>84.00</td>
</tr>
<tr>
<td>TA</td>
<td>98.00</td>
<td>100.00</td>
<td>99.00</td>
</tr>
<tr>
<td>DA</td>
<td>100.00</td>
<td>98.00</td>
<td>99.00</td>
</tr>
<tr>
<td><bold>Average</bold></td>
<td>95.00</td>
<td>94.00</td>
<td>94.00</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4_2_3">
<label>4.2.3</label>
<title>Feature Contribution Analysis</title>
<p>To understand each feature&#x2019;s contribution towards classification, we conducted an experiment. Features were first tested individually. Then only keypoint features, only silhouette features and finally all combined features were used. Results are shown in <xref ref-type="table" rid="table-8">Table 8</xref>. The distance feature achieves the highest accuracy on the SBU dataset, slightly surpassing the combined features. However, on the UT dataset, it performs worse than all other individual features, indicating that its effectiveness may vary across datasets.</p>
<table-wrap id="table-8">
<label>Table 8</label>
<caption>
<title>Feature contribution analysis results on the three datasets</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th>LTP</th>
<th>HOF</th>
<th>Distance</th>
<th>Angle</th>
<th>Velocity</th>
<th>UT</th>
<th>SBU</th>
<th>ISR-UoL-3D</th>
</tr>
</thead>
<tbody>
<tr>
<td>&#x2713;</td>
<td>&#x2717;</td>
<td>&#x2717;</td>
<td>&#x2717;</td>
<td>&#x2717;</td>
<td>77.73</td>
<td>57.05</td>
<td>71.80</td>
</tr>
<tr>
<td>&#x2717;</td>
<td>&#x2713;</td>
<td>&#x2717;</td>
<td>&#x2717;</td>
<td>&#x2717;</td>
<td>77.66</td>
<td>59.39</td>
<td>66.56</td>
</tr>
<tr>
<td>&#x2717;</td>
<td>&#x2717;</td>
<td>&#x2713;</td>
<td>&#x2717;</td>
<td>&#x2717;</td>
<td>74.45</td>
<td>92.92</td>
<td>88.83</td>
</tr>
<tr>
<td>&#x2717;</td>
<td>&#x2717;</td>
<td>&#x2717;</td>
<td>&#x2713;</td>
<td>&#x2717;</td>
<td>81.63</td>
<td>83.53</td>
<td>79.30</td>
</tr>
<tr>
<td>&#x2717;</td>
<td>&#x2717;</td>
<td>&#x2717;</td>
<td>&#x2717;</td>
<td>&#x2713;</td>
<td>88.72</td>
<td>79.27</td>
<td>76.17</td>
</tr>
<tr>
<td>&#x2713;</td>
<td>&#x2713;</td>
<td>&#x2717;</td>
<td>&#x2717;</td>
<td>&#x2717;</td>
<td>88.91</td>
<td>68.25</td>
<td>81.56</td>
</tr>
<tr>
<td>&#x2717;</td>
<td>&#x2717;</td>
<td>&#x2713;</td>
<td>&#x2713;</td>
<td>&#x2713;</td>
<td>88.54</td>
<td>92.26</td>
<td>92.89</td>
</tr>
<tr>
<td>&#x2713;</td>
<td>&#x2713;</td>
<td>&#x2713;</td>
<td>&#x2713;</td>
<td>&#x2713;</td>
<td>94.29</td>
<td>92.65</td>
<td>94.30</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4_2_4">
<label>4.2.4</label>
<title>Classifier Performance Analysis</title>
<p>To demonstrate that the BiLSTM classifier used in our system outperforms conventional alternatives, we conducted a classifier performance analysis. Three classifiers were evaluated: Support Vector Machine (SVM), Multi-Layer Perceptron (MLP) and LSTM. For SVM, the frame-level feature matrix of each video was flattened into a single feature vector while for MLP, each sample&#x2019;s feature vector was obtained by averaging features across all frames. All models were tuned for optimal hyperparameters, and the best performance was reported. <xref ref-type="table" rid="table-9">Table 9</xref> presents the results across the three datasets. Our BiLSTM consistently achieved the highest accuracy, confirming the effectiveness of our proposed approach.</p>
<table-wrap id="table-9">
<label>Table 9</label>
<caption>
<title>Different classifiers&#x2019; performance on the three datasets</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th>Classifier</th>
<th>UT</th>
<th>SBU</th>
<th>ISR-UoL-3D</th>
</tr>
</thead>
<tbody>
<tr>
<td>SVM</td>
<td>73.30</td>
<td>85.75</td>
<td>81.25</td>
</tr>
<tr>
<td>MLP</td>
<td>72.81</td>
<td>82.55</td>
<td>89.60</td>
</tr>
<tr>
<td>LSTM</td>
<td>93.60</td>
<td>91.30</td>
<td>93.90</td>
</tr>
<tr>
<td>BiLSTM</td>
<td>94.29</td>
<td>92.65</td>
<td>94.30</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4_2_5">
<label>4.2.5</label>
<title>Computational Efficiency Analysis</title>
<p>To evaluate the computational efficiency of our proposed system, we measured the time each system component took to process an input RGB video for classification. After saving the best models, inference was performed on a TESLA T4 GPU (15 GB RAM) and CPU with &#x007E;13 GB RAM. <xref ref-type="table" rid="table-10">Table 10</xref> shows the time taken by each component rounded to 2 decimal places. LTP feature extraction was the most time-consuming, indicating a need for optimization, while velocity, distance and angle features required negligible time. Overall, segmentation and its feature extraction (LTP and HOF) can be seen as key targets for improving inference speed.</p>
<table-wrap id="table-10">
<label>Table 10</label>
<caption>
<title>Computation time for each component of the proposed system</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th>Dataset</th>
<th>Segmen<break/>tation</th>
<th>Keypoint detection</th>
<th>LTP</th>
<th>HOF</th>
<th>Distance</th>
<th>Angle</th>
<th>Velocity</th>
<th>Classification</th>
<th>Total</th>
</tr>
</thead>
<tbody>
<tr>
<td>UT</td>
<td>5.66</td>
<td>2.93</td>
<td>14.44</td>
<td>5.97</td>
<td>0.03</td>
<td>0.01</td>
<td>0</td>
<td>0.06</td>
<td>29.13</td>
</tr>
<tr>
<td>SBU</td>
<td>3.40</td>
<td>2.37</td>
<td>9.82</td>
<td>2.90</td>
<td>0.02</td>
<td>0.01</td>
<td>0</td>
<td>0.15</td>
<td>18.7</td>
</tr>
<tr>
<td>ISR-UoL-3D</td>
<td>5.55</td>
<td>2.60</td>
<td>12.72</td>
<td>4.13</td>
<td>0.03</td>
<td>0.01</td>
<td>0</td>
<td>0.06</td>
<td>25.13</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4_2_6">
<label>4.2.6</label>
<title>Occlusion Robustness Analysis</title>
<p>Finally, we evaluated the performance of the proposed system under occluded conditions. Each dataset&#x2019;s RGB image was normalized to the 0&#x2013;1 range, injected with random Gaussian noise (mean &#x003D; 0, std &#x003D; 0.03) and subjected to salt-and-pepper noise where 2% of the pixels were set to white or black. This increased the difficulty of segmentation and keypoint detection, providing a reliable benchmark for system robustness. <xref ref-type="fig" rid="fig-9">Fig. 9</xref> shows an image from the UT dataset before and after noise injection with the specified parameters.</p>
<fig id="fig-9">
<label>Figure 9</label>
<caption>
<title>Visualization of an image from the UT dataset before and after noise injection</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_71988-fig-9.tif"/>
</fig>
<p>For evaluation, we used the same training protocols as with the clean datasets for fair comparison. <xref ref-type="table" rid="table-11">Table 11</xref> shows the accuracy obtained on noisy vs. original conditions on the three datasets. A minimal accuracy drop is observed for the SBU Kinect and ISR-UoL-3D datasets, demonstrating the resilience of our system. However, since the UT interaction dataset is already occlusion heavy, it proved more challenging. Due to the added noise, YOLO11-Pose was unable to assign track IDs consistently to individuals across frames. This degraded the features extracted from keypoints, which ultimately resulted in a much lower accuracy. This highlights a direction for future improvements.</p>
<table-wrap id="table-11">
<label>Table 11</label>
<caption>
<title>Accuracy comparison for occlusion and original datasets</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th>Dataset</th>
<th>Occluded</th>
<th>Original</th>
</tr>
</thead>
<tbody>
<tr>
<td>UT</td>
<td>65.19</td>
<td>94.29</td>
</tr>
<tr>
<td>SBU</td>
<td>91.86</td>
<td>92.65</td>
</tr>
<tr>
<td>ISR-UoL-3D</td>
<td>89.68</td>
<td>94.30</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Comparison with State-of-the-Art Systems</title>
<p>We now evaluate our proposed system by comparing it with state-of-the-art systems. The results in <xref ref-type="table" rid="table-12">Table 12</xref> demonstrate that our system performs better than most existing approaches, highlighting its effectiveness. However, it underperforms [<xref ref-type="bibr" rid="ref-11">11</xref>] on the UT and SBU datasets and [<xref ref-type="bibr" rid="ref-10">10</xref>] on the SBU dataset. Specifically, [<xref ref-type="bibr" rid="ref-11">11</xref>] used OpenPose [<xref ref-type="bibr" rid="ref-8">8</xref>] for keypoint detection, which is much more accurate but considerably more computationally demanding than YOLO11-Pose used in our system. Also, [<xref ref-type="bibr" rid="ref-10">10</xref>] relied on ground truth keypoint coordinates, giving it an inherent advantage. These observations suggest potential directions for further improving our system to achieve even stronger results.</p>
<table-wrap id="table-12">
<label>Table 12</label>
<caption>
<title>Comparisons of the proposed system with other systems</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th>Methods</th>
<th>UT</th>
<th>SBU</th>
<th>ISR-UoL-3D</th>
</tr>
</thead>
<tbody>
<tr>
<td>Spatial temporal relationship match kernel [<xref ref-type="bibr" rid="ref-14">14</xref>]</td>
<td>70.8</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>Multi temporal scales discriminative model [<xref ref-type="bibr" rid="ref-17">17</xref>]</td>
<td>86.6</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>Structural context models and ranking score [<xref ref-type="bibr" rid="ref-18">18</xref>]</td>
<td>92.5</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>CHARM [<xref ref-type="bibr" rid="ref-19">19</xref>]</td>
<td>&#x2013;</td>
<td>84</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>Statistical and Geometrical features [<xref ref-type="bibr" rid="ref-20">20</xref>]</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>85.56</td>
</tr>
<tr>
<td>Skeletal data [<xref ref-type="bibr" rid="ref-21">21</xref>]</td>
<td>&#x2013;</td>
<td>88</td>
<td>87</td>
</tr>
<tr>
<td>Hybrid descriptors [<xref ref-type="bibr" rid="ref-22">22</xref>]</td>
<td>&#x2013;</td>
<td>91.63</td>
<td>90.13</td>
</tr>
<tr>
<td>LSTM-IRN [<xref ref-type="bibr" rid="ref-11">11</xref>]</td>
<td>97.5</td>
<td>98.2</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>2P-GCN [<xref ref-type="bibr" rid="ref-10">10</xref>]</td>
<td>&#x2013;</td>
<td>98.90</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>Proposed system accuracy</td>
<td>94.29</td>
<td>92.65</td>
<td>94.30</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Conclusion</title>
<p>The proposed system demonstrates strong performance on the benchmark datasets. Further experiments validate the effectiveness of the selected features and classifier types, while highlighting the system&#x2019;s high computational efficiency and robustness to occlusion, thus making it suitable for real-world deployment. However, it has some limitations. Silhouette-based features (LTP and HOF) are relatively time consuming to extract; we will explore faster handcrafted feature representations. YOLO11-Pose struggles under severe occlusion, sometimes failing to track individuals correctly or misidentifying objects as people. This can be addressed by fine tuning or using more advanced keypoint detection models with robust tracking mechanisms. Similarly, MASK R-CNN can merge relevant and irrelevant individuals into a single mask when they are in close proximity. It is also very computationally intensive. This indicates searching for alternatives or developing custom architectures for better computational efficiency. Additionally, the BiLSTM classifier requires substantial training data to capture long-term dependencies. As alternatives, we will explore pretraining, data augmentation and other temporal models. Future work will integrate these improvements into the system. We will also explore combining silhouette-based CNN features and Graph Convolution Network-based keypoint features with handcrafted features for higher accuracy and efficiency.</p>
</sec>
</body>
<back>
<ack>
<p>Princess Nourah bint Abdulrahman University Researchers Supporting Project number (PNURSP2025R410), Princess Nourah bint Abdulrahman University, Riyadh, Saudi Arabia.</p>
</ack>
<sec>
<title>Funding Statement</title>
<p>This research is supported and funded by Princess Nourah bint Abdulrahman University Researchers Supporting Project number (PNURSP2025R410), Princess Nourah bint Abdulrahman University, Riyadh, Saudi Arabia.</p>
</sec>
<sec>
<title>Author Contributions</title>
<p>Study conception and design: Muhammad Hamdan Azhar and Yanfeng Wu; data collection: Asaad Algarni and Nouf Abdullah Almujally; analysis and interpretation of results: Shuaa S. Alharbi and Hui Liu; draft manuscript preparation: Muhammad Hamdan Azhar, Ahmad Jalal and Hui Liu. All authors reviewed the results and approved the final version of the manuscript.</p>
</sec>
<sec sec-type="data-availability">
<title>Availability of Data and Materials</title>
<p>All publicly available datasets are used in the study.</p>
</sec>
<sec>
<title>Ethics Approval</title>
<p>Not applicable.</p>
</sec>
<sec sec-type="COI-statement">
<title>Conflicts of Interest</title>
<p>The authors declare no conflicts of interest to report regarding the present study.</p>
</sec>
<ref-list content-type="authoryear">
<title>References</title>
<ref id="ref-1"><label>[1]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Asadi-Aghbolaghi</surname> <given-names>M</given-names></string-name>, <string-name><surname>Clap&#x00E9;s</surname> <given-names>A</given-names></string-name>, <string-name><surname>Bellantonio</surname> <given-names>M</given-names></string-name>, <string-name><surname>Escalante</surname> <given-names>HJ</given-names></string-name>, <string-name><surname>Ponce-L&#x00F3;pez</surname> <given-names>V</given-names></string-name>, <string-name><surname>Bar&#x00F3;</surname> <given-names>X</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>A survey on deep learning based approaches for action and gesture recognition in image sequences</article-title>. In: <conf-name>Proceedings of the 2017 12th IEEE International Conference on Automatic Face &#x0026; Gesture Recognition (FG 2017); 2017 May 30&#x2013;Jun 3</conf-name>; <publisher-loc>Washington, DC, USA</publisher-loc>. p. <fpage>476</fpage>&#x2013;<lpage>83</lpage>. doi:<pub-id pub-id-type="doi">10.1109/FG.2017.150</pub-id>.</mixed-citation></ref>
<ref id="ref-2"><label>[2]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Kamal</surname> <given-names>S</given-names></string-name>, <string-name><surname>Alhasson</surname> <given-names>HF</given-names></string-name>, <string-name><surname>Alnusayri</surname> <given-names>M</given-names></string-name>, <string-name><surname>Alatiyyah</surname> <given-names>M</given-names></string-name>, <string-name><surname>Aljuaid</surname> <given-names>H</given-names></string-name>, <string-name><surname>Jalal</surname> <given-names>A</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>Vision sensor for automatic recognition of human activities via hybrid features and multi-class support vector machine</article-title>. <source>Sensors</source>. <year>2025</year>;<volume>25</volume>(<issue>1</issue>):<fpage>200</fpage>. doi:<pub-id pub-id-type="doi">10.3390/s25010200</pub-id>; <pub-id pub-id-type="pmid">39796988</pub-id></mixed-citation></ref>
<ref id="ref-3"><label>[3]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Yun</surname> <given-names>K</given-names></string-name>, <string-name><surname>Honorio</surname> <given-names>J</given-names></string-name>, <string-name><surname>Chattopadhyay</surname> <given-names>D</given-names></string-name>, <string-name><surname>Berg</surname> <given-names>TL</given-names></string-name>, <string-name><surname>Samaras</surname> <given-names>D</given-names></string-name></person-group>. <article-title>Two-person interaction detection using body-pose features and multiple instance learning</article-title>. In: <conf-name>Proceedings of the 2012 IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops; 2012 Jun 16&#x2013;21</conf-name>; <publisher-loc>Providence, RI, USA</publisher-loc>. p. <fpage>28</fpage>&#x2013;<lpage>35</lpage>. doi:<pub-id pub-id-type="doi">10.1109/CVPRW.2012.6239234</pub-id>.</mixed-citation></ref>
<ref id="ref-4"><label>[4]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Jalal</surname> <given-names>A</given-names></string-name>, <string-name><surname>Khalid</surname> <given-names>N</given-names></string-name>, <string-name><surname>Kim</surname> <given-names>K</given-names></string-name></person-group>. <article-title>Automatic recognition of human interaction via hybrid descriptors and maximum entropy Markov model using depth sensors</article-title>. <source>Entropy</source>. <year>2020</year>;<volume>22</volume>(<issue>8</issue>):<fpage>817</fpage>. doi:<pub-id pub-id-type="doi">10.3390/e22080817</pub-id>; <pub-id pub-id-type="pmid">33286588</pub-id></mixed-citation></ref>
<ref id="ref-5"><label>[5]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>He</surname> <given-names>K</given-names></string-name>, <string-name><surname>Gkioxari</surname> <given-names>G</given-names></string-name>, <string-name><surname>Doll&#x00E1;r</surname> <given-names>P</given-names></string-name>, <string-name><surname>Girshick</surname> <given-names>R</given-names></string-name></person-group>. <article-title>Mask R-CNN</article-title>. In: <conf-name>Proceedings of the 2017 IEEE International Conference on Computer Vision (ICCV); 2017 Oct 22&#x2013;29</conf-name>; <publisher-loc>Venice, Italy</publisher-loc>. p. <fpage>2961</fpage>&#x2013;<lpage>9</lpage>. doi:<pub-id pub-id-type="doi">10.1109/ICCV.2017.322</pub-id>.</mixed-citation></ref>
<ref id="ref-6"><label>[6]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Gupta</surname> <given-names>S</given-names></string-name>, <string-name><surname>Vishwakarma</surname> <given-names>DK</given-names></string-name>, <string-name><surname>Puri</surname> <given-names>NK</given-names></string-name></person-group>. <article-title>A human activity recognition framework in videos using segmented human subject focus</article-title>. <source>Vis Comput</source>. <year>2024</year>;<volume>40</volume>(<issue>10</issue>):<fpage>6983</fpage>&#x2013;<lpage>99</lpage>. doi:<pub-id pub-id-type="doi">10.1007/s00371-023-03256-4</pub-id>.</mixed-citation></ref>
<ref id="ref-7"><label>[7]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Park</surname> <given-names>E</given-names></string-name>, <string-name><surname>Han</surname> <given-names>X</given-names></string-name>, <string-name><surname>Berg</surname> <given-names>TL</given-names></string-name>, <string-name><surname>Berg</surname> <given-names>AC</given-names></string-name></person-group>. <article-title>Combining multiple sources of knowledge in deep CNNs for action recognition</article-title>. In: <conf-name>Proceedings of the 2016 IEEE Winter Conference on Applications of Computer Vision (WACV); 2016 Mar 7&#x2013;10</conf-name>; <publisher-loc>Lake Placid, NY, USA</publisher-loc>. p. <fpage>1</fpage>&#x2013;<lpage>8</lpage>. doi:<pub-id pub-id-type="doi">10.1109/WACV.2016.7477589</pub-id>.</mixed-citation></ref>
<ref id="ref-8"><label>[8]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Cao</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Simon</surname> <given-names>T</given-names></string-name>, <string-name><surname>Wei</surname> <given-names>SE</given-names></string-name>, <string-name><surname>Sheikh</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>Realtime multi-person 2D pose estimation using part affinity fields</article-title>. In: <conf-name>Proceedings of the 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR); 2017 Jul 21&#x2013;26</conf-name>; <publisher-loc>Honolulu, HI, USA</publisher-loc>. p. <fpage>1302</fpage>&#x2013;<lpage>10</lpage>. doi:<pub-id pub-id-type="doi">10.1109/CVPR.2017.143</pub-id>.</mixed-citation></ref>
<ref id="ref-9"><label>[9]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Sun</surname> <given-names>K</given-names></string-name>, <string-name><surname>Xiao</surname> <given-names>B</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>D</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>J</given-names></string-name></person-group>. <article-title>Deep high-resolution representation learning for human pose estimation</article-title>. In: <conf-name>Proceedings of the2019 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR); 2019 Jun 15&#x2013;20</conf-name>; <publisher-loc>Long Beach, CA, USA</publisher-loc>. p. <fpage>5686</fpage>&#x2013;<lpage>96</lpage>. doi:<pub-id pub-id-type="doi">10.1109/CVPR.2019.00584</pub-id>.</mixed-citation></ref>
<ref id="ref-10"><label>[10]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Li</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Li</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Tang</surname> <given-names>L</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>T</given-names></string-name>, <string-name><surname>Su</surname> <given-names>J</given-names></string-name></person-group>. <article-title>Two-person graph convolutional network for skeleton-based human interaction recognition</article-title>. <source>IEEE Trans Circ Syst Video Technol</source>. <year>2023</year>;<volume>33</volume>(<issue>7</issue>):<fpage>3333</fpage>&#x2013;<lpage>42</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TCSVT.2022.3232373</pub-id>.</mixed-citation></ref>
<ref id="ref-11"><label>[11]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Perez</surname> <given-names>M</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>J</given-names></string-name>, <string-name><surname>Kot</surname> <given-names>AC</given-names></string-name></person-group>. <article-title>Interaction relational network for mutual action recognition</article-title>. <source>IEEE Trans Multimed</source>. <year>2022</year>;<volume>24</volume>:<fpage>366</fpage>&#x2013;<lpage>76</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TMM.2021.3050642</pub-id>.</mixed-citation></ref>
<ref id="ref-12"><label>[12]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Zhang</surname> <given-names>H</given-names></string-name>, <string-name><surname>Zhao</surname> <given-names>P</given-names></string-name>, <string-name><surname>Tang</surname> <given-names>G</given-names></string-name>, <string-name><surname>Li</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Yuan</surname> <given-names>Z</given-names></string-name></person-group>. <article-title>Reproducible and generalizable speech emotion recognition via an intelligent fusion network</article-title>. <source>Biomed Signal Process Control</source>. <year>2025</year>;<volume>109</volume>:<fpage>107996</fpage>. doi:<pub-id pub-id-type="doi">10.1016/j.bspc.2025.107996</pub-id>.</mixed-citation></ref>
<ref id="ref-13"><label>[13]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Ren</surname> <given-names>S</given-names></string-name>, <string-name><surname>He</surname> <given-names>K</given-names></string-name>, <string-name><surname>Girshick</surname> <given-names>R</given-names></string-name>, <string-name><surname>Sun</surname> <given-names>J</given-names></string-name></person-group>. <article-title>Faster R-CNN: towards real-time object detection with region proposal networks</article-title>. <source>IEEE Trans Pattern Anal Mach Intell</source>. <year>2017</year>;<volume>39</volume>(<issue>6</issue>):<fpage>1137</fpage>&#x2013;<lpage>49</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TPAMI.2016.2577031</pub-id>; <pub-id pub-id-type="pmid">27295650</pub-id></mixed-citation></ref>
<ref id="ref-14"><label>[14]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Ryoo</surname> <given-names>MS</given-names></string-name>, <string-name><surname>Aggarwal</surname> <given-names>JK</given-names></string-name></person-group>. <article-title>Spatio-temporal relationship match: video structure comparison for recognition of complex human activities</article-title>. In: <conf-name>Proceedings of the 2009 IEEE 12th International Conference on Computer Vision; 2009 Sep 29&#x2013;Oct 2</conf-name>; <publisher-loc>Kyoto, Japan</publisher-loc>. p. <fpage>1593</fpage>&#x2013;<lpage>600</lpage>. doi:<pub-id pub-id-type="doi">10.1109/ICCV.2009.5459361</pub-id>.</mixed-citation></ref>
<ref id="ref-15"><label>[15]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Jocher</surname> <given-names>G</given-names></string-name>, <string-name><surname>Qiu</surname> <given-names>J</given-names></string-name></person-group>. <article-title>Ultralytics YOLO11 [Internet]</article-title>. <comment>2024 [cited 2025 Apr 21]</comment>. Available from: <ext-link ext-link-type="uri" xlink:href="https://github.com/ultralytics/ultralytics">https://github.com/ultralytics/ultralytics</ext-link>.</mixed-citation></ref>
<ref id="ref-16"><label>[16]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Coppola</surname> <given-names>C</given-names></string-name>, <string-name><surname>Faria</surname> <given-names>DR</given-names></string-name>, <string-name><surname>Nunes</surname> <given-names>U</given-names></string-name>, <string-name><surname>Bellotto</surname> <given-names>N</given-names></string-name></person-group>. <article-title>Social activity recognition based on probabilistic merging of skeleton features with proximity priors from RGB-D data</article-title>. In: <conf-name>Proceedings of the 2016 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS); 2016 Oct 9&#x2013;14</conf-name>; <publisher-loc>Daejeon, Republic of Korea</publisher-loc>. p. <fpage>5055</fpage>&#x2013;<lpage>61</lpage>. doi:<pub-id pub-id-type="doi">10.1109/IROS.2016.7759742</pub-id>.</mixed-citation></ref>
<ref id="ref-17"><label>[17]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Kong</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Kit</surname> <given-names>D</given-names></string-name>, <string-name><surname>Fu</surname> <given-names>Y</given-names></string-name></person-group>. <chapter-title>A discriminative model with multiple temporal scales for action prediction</chapter-title>. In: <person-group person-group-type="editor"><string-name><surname>Fleet</surname> <given-names>D</given-names></string-name>, <string-name><surname>Pajdla</surname> <given-names>T</given-names></string-name>, <string-name><surname>Schiele</surname> <given-names>B</given-names></string-name>, <string-name><surname>Tuytelaars</surname> <given-names>T</given-names></string-name></person-group>, editors. <source>Computer vision&#x2014;ECCV 2014</source>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>; <year>2014</year>. p. <fpage>596</fpage>&#x2013;<lpage>611</lpage>. doi:<pub-id pub-id-type="doi">10.1007/978-3-319-10602-1_39</pub-id>.</mixed-citation></ref>
<ref id="ref-18"><label>[18]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Ke</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Bennamoun</surname> <given-names>M</given-names></string-name>, <string-name><surname>An</surname> <given-names>S</given-names></string-name>, <string-name><surname>Sohel</surname> <given-names>F</given-names></string-name>, <string-name><surname>Boussaid</surname> <given-names>F</given-names></string-name></person-group>. <article-title>Leveraging structural context models and ranking score fusion for human interaction prediction</article-title>. <source>IEEE Trans Multimed</source>. <year>2018</year>;<volume>20</volume>(<issue>7</issue>):<fpage>1712</fpage>&#x2013;<lpage>23</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TMM.2017.2778559</pub-id>.</mixed-citation></ref>
<ref id="ref-19"><label>[19]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Li</surname> <given-names>W</given-names></string-name>, <string-name><surname>Wen</surname> <given-names>L</given-names></string-name>, <string-name><surname>Chuah</surname> <given-names>MC</given-names></string-name>, <string-name><surname>Lyu</surname> <given-names>S</given-names></string-name></person-group>. <article-title>Category-blind human action recognition: a practical recognition system</article-title>. In: <conf-name>Proceedings of the 2015 IEEE International Conference on Computer Vision (ICCV); 2015 Dec 7&#x2013;13</conf-name>; <publisher-loc>Santiago, Chile</publisher-loc>. p. <fpage>4444</fpage>&#x2013;<lpage>52</lpage>. doi:<pub-id pub-id-type="doi">10.1109/ICCV.2015.505</pub-id>.</mixed-citation></ref>
<ref id="ref-20"><label>[20]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Coppola</surname> <given-names>C</given-names></string-name>, <string-name><surname>Cosar</surname> <given-names>S</given-names></string-name>, <string-name><surname>Faria</surname> <given-names>DR</given-names></string-name>, <string-name><surname>Bellotto</surname> <given-names>N</given-names></string-name></person-group>. <article-title>Automatic detection of human interactions from RGB-D data for social activity classification</article-title>. In: <conf-name>Proceedings of the 2017 26th IEEE International Symposium on Robot and Human Interactive Communication (RO-MAN); 2017 Aug 28&#x2013;Sep 1</conf-name>; <publisher-loc>Lisbon, Portugal</publisher-loc>. p. <fpage>871</fpage>&#x2013;<lpage>6</lpage>. doi:<pub-id pub-id-type="doi">10.1109/ROMAN.2017.8172405</pub-id>.</mixed-citation></ref>
<ref id="ref-21"><label>[21]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Manzi</surname> <given-names>A</given-names></string-name>, <string-name><surname>Fiorini</surname> <given-names>L</given-names></string-name>, <string-name><surname>Limosani</surname> <given-names>R</given-names></string-name>, <string-name><surname>Dario</surname> <given-names>P</given-names></string-name>, <string-name><surname>Cavallo</surname> <given-names>F</given-names></string-name></person-group>. <article-title>Two-person activity recognition using skeleton data</article-title>. <source>IET Comput Vis</source>. <year>2018</year>;<volume>12</volume>(<issue>1</issue>):<fpage>27</fpage>&#x2013;<lpage>35</lpage>. doi:<pub-id pub-id-type="doi">10.1049/iet-cvi.2017.0118</pub-id>.</mixed-citation></ref>
<ref id="ref-22"><label>[22]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Waheed</surname> <given-names>M</given-names></string-name>, <string-name><surname>Jalal</surname> <given-names>A</given-names></string-name>, <string-name><surname>Alarfaj</surname> <given-names>M</given-names></string-name>, <string-name><surname>Ghadi</surname> <given-names>YY</given-names></string-name>, <string-name><surname>Al Shloul</surname> <given-names>T</given-names></string-name>, <string-name><surname>Kamal</surname> <given-names>S</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>An LSTM-based approach for understanding human interactions using hybrid feature descriptors over depth sensors</article-title>. <source>IEEE Access</source>. <year>2021</year>;<volume>9</volume>:<fpage>167434</fpage>&#x2013;<lpage>46</lpage>. doi:<pub-id pub-id-type="doi">10.1109/ACCESS.2021.3130613</pub-id>.</mixed-citation></ref>
</ref-list>
</back></article>