<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1 20151215//EN" "http://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xml:lang="en" article-type="research-article" dtd-version="1.1">
<front>
<journal-meta>
<journal-id journal-id-type="pmc">CMC</journal-id>
<journal-id journal-id-type="nlm-ta">CMC</journal-id>
<journal-id journal-id-type="publisher-id">CMC</journal-id>
<journal-title-group>
<journal-title>Computers, Materials &#x0026; Continua</journal-title>
</journal-title-group>
<issn pub-type="epub">1546-2226</issn>
<issn pub-type="ppub">1546-2218</issn>
<publisher>
<publisher-name>Tech Science Press</publisher-name>
<publisher-loc>USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">72508</article-id>
<article-id pub-id-type="doi">10.32604/cmc.2025.072508</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Article</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Transformer-Driven Multimodal for Human-Object Detection and Recognition for Intelligent Robotic Surveillance</article-title>
<alt-title alt-title-type="left-running-head">Transformer-Driven Multimodal for Human-Object Detection and Recognition for Intelligent Robotic Surveillance</alt-title>
<alt-title alt-title-type="right-running-head">Transformer-Driven Multimodal for Human-Object Detection and Recognition for Intelligent Robotic Surveillance</alt-title>
</title-group>
<contrib-group>
<contrib id="author-1" contrib-type="author">
<name name-style="western"><surname>Ullah</surname><given-names>Aman</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><xref ref-type="aff" rid="aff-2">2</xref><xref ref-type="author-notes" rid="afn1">#</xref></contrib>
<contrib id="author-2" contrib-type="author">
<name name-style="western"><surname>Wu</surname><given-names>Yanfeng</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><xref ref-type="author-notes" rid="afn1">#</xref></contrib>
<contrib id="author-3" contrib-type="author">
<name name-style="western"><surname>Najam</surname><given-names>Shaheryar</given-names></name><xref ref-type="aff" rid="aff-3">3</xref></contrib>
<contrib id="author-4" contrib-type="author">
<name name-style="western"><surname>Almujally</surname><given-names>Nouf Abdullah</given-names></name><xref ref-type="aff" rid="aff-4">4</xref></contrib>
<contrib id="author-5" contrib-type="author" corresp="yes">
<name name-style="western"><surname>Jalal</surname><given-names>Ahmad</given-names></name><xref ref-type="aff" rid="aff-5">5</xref><xref ref-type="aff" rid="aff-6">6</xref><email>ahmjal@yahoo.com</email></contrib>
<contrib id="author-6" contrib-type="author" corresp="yes">
<name name-style="western"><surname>Liu</surname><given-names>Hui</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><xref ref-type="aff" rid="aff-7">7</xref><xref ref-type="aff" rid="aff-8">8</xref><email>hui.liu@uni-bremen.de</email></contrib>
<aff id="aff-1"><label>1</label><institution>Guodian Nanjing Automation Co., Ltd.</institution>, <addr-line>Nanjing, 210003</addr-line>, <country>China</country></aff>
<aff id="aff-2"><label>2</label><institution>Department of Biomedical Engineering, Riphah International University, I-14</institution>, <addr-line>Islamabad, 44000</addr-line>, <country>Pakistan</country></aff>
<aff id="aff-3"><label>3</label><institution>Department of Electrical Engineering, Bahria University, H-11</institution>, <addr-line>Islamabad, 44000</addr-line>, <country>Pakistan</country></aff>
<aff id="aff-4"><label>4</label><institution>Department of Information Systems, College of Computer and Information Sciences, Princess Nourah bint Abdulrahman University</institution>, <addr-line>Riyadh, 11671</addr-line>, <country>Saudi Arabia</country></aff>
<aff id="aff-5"><label>5</label><institution>Department of Computer Science, Air University, E-9</institution>, <addr-line>Islamabad, 44000</addr-line>, <country>Pakistan</country></aff>
<aff id="aff-6"><label>6</label><institution>Department of Computer Science and Engineering, College of Informatics, Korea University</institution>, <addr-line>Seoul, 02841</addr-line>, <country>Republic of Korea</country></aff>
<aff id="aff-7"><label>7</label><institution>Jiangsu Key Laboratory of Intelligent Medical Image Computing, School of Artificial Intelligence (School of Future Technology), Nanjing University of Information Science and Technology</institution>, <addr-line>Nanjing, 210003</addr-line>, <country>China</country></aff>
<aff id="aff-8"><label>8</label><institution>Cognitive Systems Lab, University of Bremen</institution>, <addr-line>Bremen, 28359</addr-line>, <country>Germany</country></aff>
</contrib-group>
<author-notes>
<corresp id="cor1"><label>&#x002A;</label>Corresponding Authors: Ahmad Jalal. Email: <email>ahmjal@yahoo.com</email>; Hui Liu. Email: <email>hui.liu@uni-bremen.de</email></corresp>
<fn id="afn1">
<p><sup>#</sup>These authors contributed equally to this work</p>
</fn>
</author-notes>
<pub-date date-type="collection" publication-format="electronic">
<year>2026</year>
</pub-date>
<pub-date date-type="pub" publication-format="electronic">
<day>10</day><month>2</month><year>2026</year>
</pub-date>
<volume>87</volume>
<issue>1</issue>
<elocation-id>56</elocation-id>
<history>
<date date-type="received">
<day>28</day>
<month>08</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>29</day>
<month>10</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>&#x00A9; 2026 The Authors.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Published by Tech Science Press.</copyright-holder>
<license xlink:href="https://creativecommons.org/licenses/by/4.0/">
<license-p>This work is licensed under a <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="TSP_CMC_72508.pdf"></self-uri>
<abstract>
<p>Human object detection and recognition is essential for elderly monitoring and assisted living; however, models relying solely on pose or scene context often struggle in cluttered or visually ambiguous settings. To address this, we present SCENET-3D, a transformer-driven multimodal framework that unifies human-centric skeleton features with scene-object semantics for intelligent robotic vision through a three-stage pipeline. In the first stage, scene analysis, rich geometric and texture descriptors are extracted from RGB frames, including surface-normal histograms, angles between neighboring normals, Zernike moments, directional standard deviation, and Gabor-filter responses. In the second stage, scene-object analysis, non-human objects are segmented and represented using local feature descriptors and complementary surface-normal information. In the third stage, human-pose estimation, silhouettes are processed through an enhanced MoveNet to obtain 2D anatomical keypoints, which are fused with depth information and converted into RGB-based point clouds to construct pseudo-3D skeletons. Features from all three stages are fused and fed into a transformer encoder with multi-head attention to resolve visually similar activities. Experiments on UCLA (95.8%), ETRI-Activity3D (89.4%), and CAD-120 (91.2%) demonstrate that combining pseudo-3D skeletons with rich scene-object fusion significantly improves generalizable activity recognition, enabling safer elderly care, natural human&#x2013;robot interaction, and robust context-aware robotic perception in real-world environments.</p>
</abstract>
<kwd-group kwd-group-type="author">
<kwd>Human object detection</kwd>
<kwd>elderly care</kwd>
<kwd>RGB-based pose estimation</kwd>
<kwd>scene context analysis</kwd>
<kwd>object recognition</kwd>
<kwd>Gabor features</kwd>
<kwd>point cloud reconstruction</kwd>
</kwd-group>
<funding-group>
<award-group id="awg1">
<funding-source>Princess Nourah bint Abdulrahman University</funding-source>
<award-id>PNURSP2025R410</award-id>
</award-group>
</funding-group>
</article-meta>
</front>
<body>
<sec id="s1">
<label>1</label>
<title>Introduction</title>
<p>Human Activity Recognition (HAR) is a fundamental task in computer vision, with growing relevance in ambient assisted living, particularly for elderly care. In home-based settings, the ability to monitor daily routines, detect abnormal behavior, or respond to critical events such as falls can significantly enhance safety and well-being [<xref ref-type="bibr" rid="ref-1">1</xref>]. However, accurate activity recognition in such environments remains a non-trivial challenge. Most existing HAR systems prioritize the analysis of human pose or motion patterns, often overlooking the broader environmental context in which these activities occur. This narrow focus can result in misinterpretations, especially when actions are ambiguous without reference to surrounding objects or scene structure.</p>
<p>Despite its significance, the incorporation of scene understanding into HAR remains underexplored. This is partly due to the challenges posed by visual clutter, occlusion, and environmental variability in real-world scenes. Depth sensors offer spatial cues that can aid segmentation and object localization, but their cost, limited operational range, and hardware requirements restrict their use in home environments [<xref ref-type="bibr" rid="ref-2">2</xref>]. In contrast, RGB cameras are inexpensive, non-intrusive, and widely available, making them a more practical choice for in-home monitoring.</p>
<p>To address these challenges, we propose SCENET-3D, a three-stage human activity recognition pipeline integrating scene analysis, scene-object analysis, and human-pose estimation using pseudo-3D point clouds from monocular RGB. The first stage extracts holistic scene features via surface-normal and Gabor descriptors; the second models segmented non-human objects with local feature detectors; and the third derives 3D human-skeleton point clouds from MoveNet keypoints and pseudo-depth maps. Features from all stages are fused and classified via a transformer, capturing fine-grained spatial geometry and human&#x2013;environment interactions to improve recognition in complex, cluttered scenarios.</p>
<p><bold>The key contributions of this work are as follows:</bold>
<list list-type="bullet">
<list-item>
<p>We introduce SCENET-3D, a novel human-activity recognition pipeline that jointly integrates holistic scene understanding, scene-object analysis, and human-pose estimation.</p></list-item>
<list-item>
<p>We propose unique pseudo-3D point cloud estimation from monocular RGB frames to capture fine-grained spatial structures of the human subject along with 3D key body joints extracted from a unique implementation of 2D-to-3D keypoint conversion using monocular depth from MiDaS.</p></list-item>
<list-item>
<p>We designed a diverse and complementary feature extraction strategy that combines pose estimation, pseudo-depth-based surface structure, local keypoint descriptors, and texture statistics, enabling robust scene and action understanding from RGB-only input.</p></list-item>
<list-item>
<p>We employ early feature fusion and optimize the unified feature vector before classifying activities using a transformer classifier. This ensures the system can handle real-world ambiguity while providing interpretable decision-making for assistive applications.</p></list-item>
</list></p>
</sec>
<sec id="s2">
<label>2</label>
<title>Related Work</title>
<p>Human Activity Recognition (HAR) has shifted from wearable sensors to vision-based methods. Wearables provide precise motion data but cause discomfort and limited use. Vision-based approaches are non-intrusive; depth sensors offer strong spatial cues but are costly, while RGB cameras are low-cost, widely available, and effective for in-home monitoring. With spatial augmentation and deep models, RGB-based HAR achieves competitive performance [<xref ref-type="bibr" rid="ref-3">3</xref>]. A recent survey reviews RGB-D-based HAR, emphasizing multimodal fusion strengths while noting challenges in scalability and real-world generalization [<xref ref-type="bibr" rid="ref-4">4</xref>]. Monocular SLAM and pseudo-RGBD generation further infer scene geometry for point cloud reconstruction and silhouette extraction. Lightweight keypoint extractors, e.g., MoveNet, yield spatiotemporal descriptors directly from RGB frames [<xref ref-type="bibr" rid="ref-5">5</xref>]. More recently, transformer-based architectures such as the Global-local Motion Transformer have demonstrated strong performance on skeleton-based action learning, particularly on the UCLA dataset, aligning closely with the objectives of this work [<xref ref-type="bibr" rid="ref-6">6</xref>]. By employing diffusion-based generative modeling with geometrically consistent conditioning, dense 3D point clouds can be reconstructed from single RGB images, removing the need for depth-sensing hardware [<xref ref-type="bibr" rid="ref-7">7</xref>].</p>
<p>However, motion or pose alone is insufficient in cluttered environments where different actions may appear visually similar. Here, scene understanding is critical. Environmental semantics such as layout, object identities, and spatial relations help disambiguate similar motions (e.g., distinguishing &#x201C;reaching for an object&#x201D; vs. &#x201C;stretching&#x201D;). To this end, scene context-aware graph convolutional networks have been proposed, combining skeleton features with environmental cues to improve action recognition in cluttered settings [<xref ref-type="bibr" rid="ref-8">8</xref>]. This is especially relevant in domestic elderly care where clutter and object proximity strongly affect interpretation. Hybrid approaches integrating pose with scene context have shown improved robustness [<xref ref-type="bibr" rid="ref-9">9</xref>]. Yet, many remain limited by fixed viewpoints or simplified backgrounds [<xref ref-type="bibr" rid="ref-10">10</xref>]. Despite achieving real-time monitoring through IoT-based decision-aware designs, assisted-living systems still face reduced adaptability in cluttered or dynamically changing environments [<xref ref-type="bibr" rid="ref-11">11</xref>].</p>
<p>Deep learning models (CNNs, LSTMs) support tasks such as fall detection, but their generalization suffers from small datasets and uncontrolled settings [<xref ref-type="bibr" rid="ref-12">12</xref>]. To enhance feature discrimination, Hu et al. introduced an attention-guided method based on close-up maximum activation for human activity recognition [<xref ref-type="bibr" rid="ref-13">13</xref>]. Building on this, scene-level semantics are increasingly incorporated to capture object relationships and spatial configurations [<xref ref-type="bibr" rid="ref-14">14</xref>,<xref ref-type="bibr" rid="ref-15">15</xref>]. Mid-level features such as Gabor filters provide texture and orientation cues from background structures [<xref ref-type="bibr" rid="ref-16">16</xref>], while entropy-based segmentation and super-pixel decomposition enhance scene partitioning for contextual feature extraction. Recent works in elderly care emphasize both contextual awareness and computational efficiency. Some focus on scene analysis to infer context-sensitive behaviors [<xref ref-type="bibr" rid="ref-17">17</xref>], while others prioritize lightweight models for real-time deployment.</p>
<p>This need for richer contextual modeling has also been highlighted in works that integrate action recognition with object detection and human&#x2013;object interaction to enhance robustness in diverse and cluttered environments [<xref ref-type="bibr" rid="ref-18">18</xref>&#x2013;<xref ref-type="bibr" rid="ref-20">20</xref>]. Building on these, our framework fuses pose-centric features (MoveNet keypoints, pseudo-depth point clouds) with scene-level descriptors (Gabor filters, super-pixel-based segmentation) into a unified HAR model.</p>
</sec>
<sec id="s3">
<label>3</label>
<title>Materials and Methods</title>
<p>This section presents SCENET-3D, a HAR pipeline that combines human structural cues with environmental context to improve classification. RGB frames undergo preprocessing, surface normal computation, semantic segmentation, feature extraction, and multimodal fusion for classification (<xref ref-type="fig" rid="fig-1">Fig. 1</xref>).</p>
<fig id="fig-1">
<label>Figure 1</label>
<caption>
<title>System architecture of SCENET-3D: a unified RGB framework with fusion of human-centric pose features and scene-level contextual descriptors for robust activity recognition</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_72508-fig-1.tif"/>
</fig>
<sec id="s3_1">
<label>3.1</label>
<title>Data Preprocessing</title>
<p>Frames from diverse RGB datasets (offices, homes, clinical settings) undergo a standardized preprocessing pipeline for spatial and intensity consistency. Each frame <inline-formula id="ieqn-1"><mml:math id="mml-ieqn-1"><mml:mi>I</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>H</mml:mtext></mml:mrow><mml:mo>&#x00D7;</mml:mo><mml:mrow><mml:mtext>W</mml:mtext></mml:mrow><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> is resized via bilinear interpolation to <inline-formula id="ieqn-2"><mml:math id="mml-ieqn-2"><mml:msup><mml:mi>I</mml:mi><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mtext>H</mml:mtext></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup><mml:mo>&#x00D7;</mml:mo><mml:msup><mml:mrow><mml:mtext>W</mml:mtext></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>, ensuring spatial continuity and color fidelity. Pixel intensities are then normalized using min&#x2013;max scaling using <xref ref-type="disp-formula" rid="eqn-1">Eq. (1)</xref>.
<disp-formula id="eqn-1"><label>(1)</label><mml:math id="mml-eqn-1" display="block"><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo>,</mml:mo><mml:mi>c</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msup><mml:mi>I</mml:mi><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo>,</mml:mo><mml:mi>c</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mo movablelimits="true" form="prefix">min</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:msup><mml:mi>I</mml:mi><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mo movablelimits="true" form="prefix">max</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:msup><mml:mi>I</mml:mi><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mo movablelimits="true" form="prefix">min</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:msup><mml:mi>I</mml:mi><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mfrac><mml:mo>,</mml:mo><mml:mi>c</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mo fence="false" stretchy="false">{</mml:mo><mml:mi>R</mml:mi><mml:mo>,</mml:mo><mml:mi>G</mml:mi><mml:mo>,</mml:mo><mml:mi>B</mml:mi><mml:mo fence="false" stretchy="false">}</mml:mo><mml:mo>,</mml:mo></mml:math></disp-formula>which maps values to <inline-formula id="ieqn-3"><mml:math id="mml-ieqn-3"><mml:mo stretchy="false">[</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">]</mml:mo></mml:math></inline-formula> and mitigates illumination and contrast discrepancies. 
The preprocessing step is illustrated in <xref ref-type="fig" rid="fig-2">Fig. 2</xref>. Finally, Gaussian smoothing is applied to suppress sensor noise and compression artifacts using <xref ref-type="disp-formula" rid="eqn-2">Eq. (2)</xref>, where the kernel is defined as <xref ref-type="disp-formula" rid="eqn-3">Eq. (3)</xref>.<disp-formula id="eqn-2"><label>(2)</label><mml:math id="mml-eqn-2" display="block"><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:munderover><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:munderover><mml:mi>G</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x22C5;</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo>+</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo>+</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo></mml:math></disp-formula>
<disp-formula id="eqn-3"><label>(3)</label><mml:math id="mml-eqn-3" display="block"><mml:mi>G</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:mn>2</mml:mn><mml:mi>&#x03C0;</mml:mi><mml:msup><mml:mi>&#x03C3;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:mfrac><mml:mi>exp</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mfrac><mml:mrow><mml:msup><mml:mi>i</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>+</mml:mo><mml:msup><mml:mi>j</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:mn>2</mml:mn><mml:msup><mml:mi>&#x03C3;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:mfrac><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
<fig id="fig-2">
<label>Figure 2</label>
<caption>
<title>Preprocessing pipeline. (<bold>a</bold>) Raw RGB frames and (<bold>b</bold>) Preprocessed RGB frames</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_72508-fig-2.tif"/>
</fig>
<p>This preprocessing ensures uniform scale, normalized intensity distribution, and improved visual quality, providing consistent inputs for subsequent multimodal feature extraction.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Semantic Segmentation Using Mask R-CNN</title>
<p>Semantic instance segmentation is applied to each RGB frame using Mask R-CNN with a ResNet-50 backbone and FPN, implemented in Detectron2, to decouple human and environmental components. The network, pretrained on COCO, produces instance-specific binary masks <inline-formula id="ieqn-4"><mml:math id="mml-ieqn-4"><mml:mo fence="false" stretchy="false">{</mml:mo><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:msubsup><mml:mo fence="false" stretchy="false">}</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>, class labels <inline-formula id="ieqn-5"><mml:math id="mml-ieqn-5"><mml:mo fence="false" stretchy="false">{</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:msubsup><mml:mo fence="false" stretchy="false">}</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>, and confidence scores <inline-formula id="ieqn-6"><mml:math id="mml-ieqn-6"><mml:mo fence="false" stretchy="false">{</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:msubsup><mml:mo fence="false" stretchy="false">}</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> from an input image <inline-formula id="ieqn-7"><mml:math id="mml-ieqn-7"><mml:mi>I</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mi>H</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>W</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>. 
The overall multi-task loss integrates classification, bounding box regression, and mask prediction using <xref ref-type="disp-formula" rid="eqn-5">Eq. (5)</xref>.
<disp-formula id="eqn-4"><label>(4)</label><mml:math id="mml-eqn-4" display="block"><mml:mtable columnalign="right left right left right left right left right left right left" rowspacing="3pt" columnspacing="0em 2em 0em 2em 0em 2em 0em 2em 0em 2em 0em" displaystyle="true"><mml:mtr><mml:mtd /><mml:mtd><mml:mrow><mml:mo>&#x1D4AE;</mml:mo></mml:mrow><mml:mo>&#x003A;</mml:mo><mml:mi>I</mml:mi><mml:mo stretchy="false">&#x21A6;</mml:mo><mml:mo fence="false" stretchy="false">{</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:msubsup><mml:mo fence="false" stretchy="false">}</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:mo fence="false" stretchy="false">{</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:msup><mml:mo fence="false" stretchy="false">}</mml:mo><mml:mrow><mml:mi>H</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>W</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo>&#x1D49E;</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="eqn-5"><label>(5)</label><mml:math id="mml-eqn-5" display="block"><mml:mtable columnalign="right left right left right left right left right left right left" rowspacing="3pt" columnspacing="0em 2em 0em 2em 0em 2em 0em 2em 0em 2em 0em" displaystyle="true"><mml:mtr><mml:mtd /><mml:mtd><mml:msub><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>total</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>cls</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>box</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>mask</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>here, the classification loss is the softmax cross-entropy by <xref ref-type="disp-formula" rid="eqn-6">Eq. (6)</xref>, the bounding-box regression loss employs Smooth L1 using <xref ref-type="disp-formula" rid="eqn-7">Eq. (7)</xref>, and the mask prediction loss applies pixel-wise binary cross-entropy while region proposals are generated by the RPN using classification and bounding-box regression losses.
<disp-formula id="eqn-6"><label>(6)</label><mml:math id="mml-eqn-6" display="block"><mml:mtable columnalign="right left right left right left right left right left right left" rowspacing="3pt" columnspacing="0em 2em 0em 2em 0em 2em 0em 2em 0em 2em 0em" displaystyle="true"><mml:mtr><mml:mtd /><mml:mtd><mml:msub><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>cls</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:mo>&#x1D49E;</mml:mo></mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow></mml:mrow></mml:munderover><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>p</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="eqn-7"><label>(7)</label><mml:math id="mml-eqn-7" display="block"><mml:mtable columnalign="right left right left right left right left right left right left" rowspacing="3pt" columnspacing="0em 2em 0em 2em 0em 2em 0em 2em 0em 2em 0em" displaystyle="true"><mml:mtr><mml:mtd /><mml:mtd><mml:msub><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mrow><mml:mtext>box</mml:mtext></mml:mrow></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mo fence="false" stretchy="false">{</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo>,</mml:mo><mml:mi>w</mml:mi><mml:mo>,</mml:mo><mml:mi>h</mml:mi><mml:mo fence="false" stretchy="false">}</mml:mo></mml:mrow></mml:munder><mml:msub><mml:mrow><mml:mtext>smooth</mml:mtext></mml:mrow><mml:mrow><mml:mi>L</mml:mi><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>t</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mtext>smooth</mml:mtext></mml:mrow><mml:mrow><mml:mi>L</mml:mi><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mtable columnalign="left left" rowspacing=".2em" columnspacing="1em" displaystyle="false"><mml:mtr><mml:mtd><mml:mn>0.5</mml:mn><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>,</mml:mo></mml:mtd><mml:mtd><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mi>x</mml:mi><mml:mrow><mml:mo 
stretchy="false">|</mml:mo></mml:mrow><mml:mo>&#x003C;</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mi>x</mml:mi><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mn>0.5</mml:mn><mml:mo>,</mml:mo></mml:mtd><mml:mtd><mml:mrow><mml:mtext>otherwise</mml:mtext></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable><mml:mo fence="true" stretchy="true" symmetric="true"></mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula></p>
<p>For human-centered analysis, the largest connected component across all predicted instance masks is selected as the primary human silhouette using <xref ref-type="disp-formula" rid="eqn-8">Eq. (8)</xref>.
<disp-formula id="eqn-8"><label>(8)</label><mml:math id="mml-eqn-8" display="block"><mml:msup><mml:mi>M</mml:mi><mml:mrow><mml:mrow><mml:mo>&#x2217;</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mi>arg</mml:mi><mml:mo>&#x2061;</mml:mo><mml:munder><mml:mo movablelimits="true" form="prefix">max</mml:mo><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2286;</mml:mo><mml:munder><mml:mover><mml:mo>&#x22C3;</mml:mo><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:mover><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:munder><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munder><mml:mrow><mml:mtext>Area</mml:mtext></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo></mml:math></disp-formula>where <inline-formula id="ieqn-8"><mml:math id="mml-ieqn-8"><mml:mo fence="false" stretchy="false">{</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo fence="false" stretchy="false">}</mml:mo></mml:math></inline-formula> denotes the set of connected components. Remaining components correspond to scene objects and are preserved for downstream context-aware reasoning (e.g., affordance learning, pose&#x2013;scene interaction). An example of this segmentation is shown in <xref ref-type="fig" rid="fig-3">Fig. 3</xref>.</p>
<fig id="fig-3">
<label>Figure 3</label>
<caption>
<title>Semantic segmentation: (<bold>a</bold>) input RGB frames, (<bold>b</bold>) extracted human silhouette, (<bold>c</bold>) class-specific scene objects, and (<bold>d</bold>) reconstructed scene with preserved objects</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_72508-fig-3.tif"/>
</fig>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Silhouette-Based Human Pose Estimation</title>
<p>Following segmentation, the human is isolated as an RGB silhouette, serving as input for human-centric feature extraction. From this silhouette, 2D pose keypoints and a 3D point cloud are derived, capturing spatial, structural, and geometric cues essential for motion-based activity analysis.</p>
<p><italic>Keypoint Extraction via MoveNet</italic></p>
<p>To capture the articulation and spatial configuration of the human body, we employ MoveNet [<xref ref-type="bibr" rid="ref-8">8</xref>], a lightweight yet accurate real-time pose estimator that predicts 17 anatomical keypoints, denoted as <inline-formula id="ieqn-9"><mml:math id="mml-ieqn-9"><mml:mrow><mml:mtext mathvariant="bold">K</mml:mtext></mml:mrow><mml:mo>=</mml:mo><mml:mo fence="false" stretchy="false">{</mml:mo><mml:msub><mml:mrow><mml:mtext mathvariant="bold">k</mml:mtext></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:msubsup><mml:mo fence="false" stretchy="false">}</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>17</mml:mn></mml:mrow></mml:msubsup></mml:math></inline-formula>, where each keypoint is <inline-formula id="ieqn-10"><mml:math id="mml-ieqn-10"><mml:msub><mml:mrow><mml:mtext mathvariant="bold">k</mml:mtext></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>c</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>. 
Here, <inline-formula id="ieqn-11"><mml:math id="mml-ieqn-11"><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> are 2D coordinates and <inline-formula id="ieqn-12"><mml:math id="mml-ieqn-12"><mml:msub><mml:mi>c</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">]</mml:mo></mml:math></inline-formula> is the confidence score. These landmarks span the head, torso, and limbs, enabling fine-grained skeletal representation. The estimation process leverages multiple loss functions to ensure spatial coherence, anatomical plausibility, and robustness under occlusion.</p>
<p>Keypoints are represented as 2D Gaussian heatmaps, and their spatial locations are optimized using mean squared error. Coordinate-level refinement is performed via Smooth L1 regression, while confidence-weighted losses mitigate the influence of occluded or low-confidence keypoints. These losses collectively enforce accurate, anatomically plausible, and robust keypoint estimation under varying visual conditions.</p>
<p><bold>Structural Consistency Loss:</bold> Limb geometry is preserved via relative displacement as provided in <xref ref-type="disp-formula" rid="eqn-9">Eq. (9)</xref>.<disp-formula id="eqn-9"><label>(9)</label><mml:math id="mml-eqn-9" display="block"><mml:msub><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>struct</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mi>&#x2130;</mml:mi></mml:mrow></mml:mrow></mml:munder><mml:mo>&#x2225;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mrow><mml:mtext mathvariant="bold">k</mml:mtext></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mrow><mml:mtext mathvariant="bold">k</mml:mtext></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mrow><mml:mtext mathvariant="bold">k</mml:mtext></mml:mrow><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mrow><mml:mtext mathvariant="bold">k</mml:mtext></mml:mrow><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:msup><mml:mo>&#x2225;</mml:mo><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>,</mml:mo></mml:math></disp-formula>where <inline-formula id="ieqn-13"><mml:math id="mml-ieqn-13"><mml:mrow><mml:mi>&#x2130;</mml:mi></mml:mrow></mml:math></inline-formula> denotes anatomically connected keypoint pairs. <bold>Bone-Length Consistency Loss</bold> maintains anthropomorphic proportions.</p>
<p><bold>Laplacian Smoothness Loss</bold> encourages local pose smoothness and is calculated using <xref ref-type="disp-formula" rid="eqn-10">Eq. (10)</xref>.
<disp-formula id="eqn-10"><label>(10)</label><mml:math id="mml-eqn-10" display="block"><mml:msub><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>lap</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover><mml:mo>&#x2225;</mml:mo><mml:msub><mml:mrow><mml:mtext mathvariant="bold">k</mml:mtext></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:mo>&#x1D4A9;</mml:mo></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow></mml:mrow></mml:mfrac><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo>&#x1D4A9;</mml:mo></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:munder><mml:msub><mml:mrow><mml:mtext mathvariant="bold">k</mml:mtext></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:msup><mml:mo stretchy="false">&#x2225;</mml:mo><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:math></disp-formula>where <inline-formula id="ieqn-14"><mml:math id="mml-ieqn-14"><mml:mrow><mml:mo>&#x1D4A9;</mml:mo></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> is the set of adjacent joints for keypoint <inline-formula id="ieqn-15"><mml:math id="mml-ieqn-15"><mml:mi>i</mml:mi></mml:math></inline-formula>. <bold>Scale-Invariant Loss</bold> normalizes errors by a reference body scale using <xref ref-type="disp-formula" rid="eqn-11">Eq. (11)</xref>.
<disp-formula id="eqn-11"><label>(11)</label><mml:math id="mml-eqn-11" display="block"><mml:msub><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>scale</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:msup><mml:mi>s</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mfrac><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover><mml:mo>&#x2225;</mml:mo><mml:msub><mml:mrow><mml:mtext mathvariant="bold">k</mml:mtext></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mrow><mml:mtext mathvariant="bold">k</mml:mtext></mml:mrow><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:msup><mml:mo>&#x2225;</mml:mo><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mi>s</mml:mi><mml:mo>=&#x2225;</mml:mo><mml:msub><mml:mrow><mml:mtext mathvariant="bold">k</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mtext>shoulder</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mrow><mml:mtext mathvariant="bold">k</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mtext>hip</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x2225;</mml:mo><mml:mo>,</mml:mo></mml:math></disp-formula></p>
<p>To enhance robustness, the keypoint schema was refined to 13 landmarks, with the nose serving as the cranial proxy and the shoulder midpoint defining the torso vector. Each 2D keypoint is mapped to pseudo-3D by sampling MiDaS depth, normalizing to a reference joint, and combining with 2D coordinates to form pseudo-3D joints. We approximate <inline-formula id="ieqn-16"><mml:math id="mml-ieqn-16"><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> by <inline-formula id="ieqn-17"><mml:math id="mml-ieqn-17"><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>z</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> with <inline-formula id="ieqn-18"><mml:math id="mml-ieqn-18"><mml:msub><mml:mi>z</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> from MiDaS, producing a sparse 3-D point cloud of skeletal landmarks. Followed by root-relative normalization with the pelvis joint <inline-formula id="ieqn-19"><mml:math id="mml-ieqn-19"><mml:msub><mml:mrow><mml:mtext mathvariant="bold">k</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mtext>root</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula> to remove global translation and standardize scale using <xref ref-type="disp-formula" rid="eqn-12">Eq. (12)</xref>.
<disp-formula id="eqn-12"><label>(12)</label><mml:math id="mml-eqn-12" display="block"><mml:msub><mml:mrow><mml:mover><mml:mrow><mml:mtext mathvariant="bold">k</mml:mtext></mml:mrow><mml:mo stretchy="false">&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mrow><mml:mtext mathvariant="bold">k</mml:mtext></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mrow><mml:mtext mathvariant="bold">k</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mtext>root</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:mrow><mml:mi>s</mml:mi></mml:mfrac><mml:mo>,</mml:mo><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:munder><mml:mo movablelimits="true" form="prefix">max</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:munder><mml:mo>&#x2225;</mml:mo><mml:msub><mml:mrow><mml:mtext mathvariant="bold">k</mml:mtext></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mrow><mml:mtext mathvariant="bold">k</mml:mtext></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mo stretchy="false">&#x2225;</mml:mo><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></disp-formula></p>
<p>The resulting normalized descriptor, <inline-formula id="ieqn-20"><mml:math id="mml-ieqn-20"><mml:msub><mml:mrow><mml:mtext mathvariant="bold">f</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mtext>pose</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mrow><mml:mtext mathvariant="bold">k</mml:mtext></mml:mrow><mml:mo stretchy="false">&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mrow><mml:mtext mathvariant="bold">k</mml:mtext></mml:mrow><mml:mo stretchy="false">&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mrow><mml:mtext mathvariant="bold">k</mml:mtext></mml:mrow><mml:mo stretchy="false">&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>13</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">]</mml:mo></mml:math></inline-formula>, was subsequently used for classification and temporal modeling. The extracted key body points and corresponding silhouette are illustrated in <xref ref-type="fig" rid="fig-4">Fig. 4</xref>.</p>
<fig id="fig-4">
<label>Figure 4</label>
<caption>
<title>MoveNet-based human keypoint estimation showing (<bold>a</bold>) the detected keypoints on the silhouette and (<bold>b</bold>) the corresponding skeleton visualization plot</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_72508-fig-4a.tif"/>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_72508-fig-4b.tif"/>
</fig>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Point Cloud Reconstruction from Monocular RGB Silhouettes</title>
<p>To infer 3D structure from monocular silhouettes, we use MiDaS [<xref ref-type="bibr" rid="ref-5">5</xref>], a state-of-the-art scale-invariant depth estimator suited for silhouettes without metric references. Depth is optimized using the scale-invariant loss in <xref ref-type="disp-formula" rid="eqn-13">Eq. (13)</xref>.
<disp-formula id="eqn-13"><label>(13)</label><mml:math id="mml-eqn-13" display="block"><mml:msub><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>si</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>n</mml:mi></mml:mfrac><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:munder><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>D</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:msubsup><mml:mi>D</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo>&#x2217;</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>+</mml:mo><mml:mi>&#x03B1;</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>&#x2212;</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:msup><mml:mi>n</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mfrac><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:munder><mml:mrow><mml:mo>(</mml:mo><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>D</mml:mi><mml:mo 
stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:msubsup><mml:mi>D</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo>&#x2217;</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>+</mml:mo><mml:mi>&#x03B1;</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mi>&#x03B1;</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>n</mml:mi></mml:mfrac><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:munder><mml:mrow><mml:mo>(</mml:mo><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:msubsup><mml:mi>D</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo>&#x2217;</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x2212;</mml:mo><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>D</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>Predicted log-depth values are mapped back as <inline-formula id="ieqn-21"><mml:math id="mml-ieqn-21"><mml:mrow><mml:mover><mml:mi>D</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mi>exp</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>&#x03B8;</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>I</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>, where <inline-formula id="ieqn-22"><mml:math id="mml-ieqn-22"><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>&#x03B8;</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> denotes the network output for pixel <inline-formula id="ieqn-23"><mml:math id="mml-ieqn-23"><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>. The depth map is lifted into a 3D point cloud using the intrinsic matrix, with multiple maps fused by confidence weighting (<xref ref-type="disp-formula" rid="eqn-14">Eq. (14)</xref>).
<disp-formula id="eqn-14"><label>(14)</label><mml:math id="mml-eqn-14" display="block"><mml:msub><mml:mrow><mml:mover><mml:mi>D</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mrow><mml:mtext>fused</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:munder><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x22C5;</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>D</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:munder><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mfrac><mml:mo>,</mml:mo><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mi>exp</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>&#x03C3;</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:msup><mml:mo 
stretchy="false">)</mml:mo><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:math></disp-formula>where <inline-formula id="ieqn-24"><mml:math id="mml-ieqn-24"><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> encodes pixel-wise reliability. Surface normals are estimated using <xref ref-type="disp-formula" rid="eqn-15">Eq. (15)</xref>, providing local geometric consistency.
<disp-formula id="eqn-15"><label>(15)</label><mml:math id="mml-eqn-15" display="block"><mml:msub><mml:mrow><mml:mtext mathvariant="bold">n</mml:mtext></mml:mrow><mml:mrow><mml:mi>u</mml:mi><mml:mi>v</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mrow><mml:mtext mathvariant="bold">P</mml:mtext></mml:mrow><mml:mrow><mml:mi>u</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mi>v</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mrow><mml:mtext mathvariant="bold">P</mml:mtext></mml:mrow><mml:mrow><mml:mi>u</mml:mi><mml:mi>v</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x00D7;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mrow><mml:mtext mathvariant="bold">P</mml:mtext></mml:mrow><mml:mrow><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mrow><mml:mtext mathvariant="bold">P</mml:mtext></mml:mrow><mml:mrow><mml:mi>u</mml:mi><mml:mi>v</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mo stretchy="false">&#x2225;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mrow><mml:mtext mathvariant="bold">P</mml:mtext></mml:mrow><mml:mrow><mml:mi>u</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mi>v</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mrow><mml:mtext mathvariant="bold">P</mml:mtext></mml:mrow><mml:mrow><mml:mi>u</mml:mi><mml:mi>v</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x00D7;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mrow><mml:mtext mathvariant="bold">P</mml:mtext></mml:mrow><mml:mrow><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mrow><mml:mtext 
mathvariant="bold">P</mml:mtext></mml:mrow><mml:mrow><mml:mi>u</mml:mi><mml:mi>v</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">&#x2225;</mml:mo></mml:mrow></mml:mfrac></mml:math></disp-formula></p>
<p>To suppress unreliable 3D points, a depth confidence function is defined using <xref ref-type="disp-formula" rid="eqn-16">Eq. (16)</xref>.
<disp-formula id="eqn-16"><label>(16)</label><mml:math id="mml-eqn-16" display="block"><mml:mi>C</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mi>exp</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:mover><mml:mi>D</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mover><mml:mi>D</mml:mi><mml:mo accent="false">&#x00AF;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:msub><mml:mi>&#x03C3;</mml:mi><mml:mrow><mml:mi>D</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula>where <inline-formula id="ieqn-25"><mml:math id="mml-ieqn-25"><mml:mover><mml:mi>D</mml:mi><mml:mo accent="false">&#x00AF;</mml:mo></mml:mover></mml:math></inline-formula> and <inline-formula id="ieqn-26"><mml:math id="mml-ieqn-26"><mml:msub><mml:mi>&#x03C3;</mml:mi><mml:mrow><mml:mi>D</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> denote the mean and standard deviation of predicted depths, respectively. This formulation ensures that the resulting point cloud remains anthropometrically realistic, as illustrated in <xref ref-type="fig" rid="fig-5">Fig. 5</xref>.</p>
<fig id="fig-5">
<label>Figure 5</label>
<caption>
<title>Visualization of point cloud in various poses: RGB silhouette, predicted depth map, and reconstructed 3D point cloud</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_72508-fig-5.tif"/>
</fig>
</sec>
<sec id="s3_5">
<label>3.5</label>
<title>Object-Oriented Scene Feature Analysis</title>
<p>Environmental context is modeled by analyzing non-human scene components via semantic segmentation. AKAZE detects keypoints and MSER extracts regions, with features computed only on non-human masks to remain independent of the actor&#x2019;s silhouette. Both AKAZE and MSER feature visualizations are illustrated in <xref ref-type="fig" rid="fig-6">Fig. 6</xref>.</p>
<fig id="fig-6">
<label>Figure 6</label>
<caption>
<title>Scene-based feature extraction: (<bold>a1</bold>,<bold>a2</bold>) RGB scene frames; (<bold>b1</bold>,<bold>b2</bold>) object masks (human removed); (<bold>c1</bold>,<bold>c2</bold>) AKAZE keypoints; (<bold>d1</bold>,<bold>d2</bold>) MSER regions highlighting blob-like features</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_72508-fig-6.tif"/>
</fig>
<sec id="s3_5_1">
<label>3.5.1</label>
<title>AKAZE Keypoint Detection</title>
<p>For each object mask <inline-formula id="ieqn-27"><mml:math id="mml-ieqn-27"><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2282;</mml:mo><mml:mrow><mml:mi mathvariant="normal">&#x03A9;</mml:mi></mml:mrow></mml:math></inline-formula>, a masked grayscale image is defined using <xref ref-type="disp-formula" rid="eqn-17">Eq. (17)</xref>.
<disp-formula id="eqn-17"><label>(17)</label><mml:math id="mml-eqn-17" display="block"><mml:msubsup><mml:mi>I</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>gray</mml:mtext></mml:mrow></mml:mrow></mml:msubsup><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mrow><mml:mtext>Grayscale</mml:mtext></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>I</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x22C5;</mml:mo><mml:msub><mml:mn>1</mml:mn><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:math></disp-formula>where <inline-formula id="ieqn-28"><mml:math id="mml-ieqn-28"><mml:mn>1</mml:mn></mml:math></inline-formula> is the indicator function. To extract scale-invariant features, AKAZE constructs a nonlinear diffusion scale space <inline-formula id="ieqn-29"><mml:math id="mml-ieqn-29"><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo>,</mml:mo><mml:mi>&#x03C3;</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> using Perona&#x2013;Malik variable conductance diffusion as in <xref ref-type="disp-formula" rid="eqn-18">Eq. (18)</xref>.
<disp-formula id="eqn-18"><label>(18)</label><mml:math id="mml-eqn-18" display="block"><mml:mfrac><mml:mrow><mml:mi mathvariant="normal">&#x2202;</mml:mi><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2202;</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:mfrac><mml:mo>=</mml:mo><mml:mi mathvariant="normal">&#x2207;</mml:mi><mml:mo>&#x22C5;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mi>c</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo>,</mml:mo><mml:mi>&#x03C3;</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mi mathvariant="normal">&#x2207;</mml:mi><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mi>c</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo>,</mml:mo><mml:mi>&#x03C3;</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mi>exp</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mo>&#x2212;</mml:mo><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:mfrac><mml:mrow><mml:mo stretchy="false">&#x2225;</mml:mo><mml:mi mathvariant="normal">&#x2207;</mml:mi><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow><mml:mo stretchy="false">&#x2225;</mml:mo></mml:mrow><mml:mi>k</mml:mi></mml:mfrac><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:math></disp-formula>where <inline-formula id="ieqn-30"><mml:math id="mml-ieqn-30"><mml:mi>c</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>u</mml:mi><mml:mo>,</mml:mo><mml:mi>v</mml:mi><mml:mo>,</mml:mo><mml:mi>&#x03C3;</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> controls edge preservation.</p>
</sec>
<sec id="s3_5_2">
<label>3.5.2</label>
<title>MSER Region Segmentation</title>
<p>MSER detects intensity-stable regions in <inline-formula id="ieqn-31"><mml:math id="mml-ieqn-31"><mml:msubsup><mml:mi>I</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>gray</mml:mtext></mml:mrow></mml:mrow></mml:msubsup></mml:math></inline-formula> by thresholding across <inline-formula id="ieqn-32"><mml:math id="mml-ieqn-32"><mml:mi>&#x03C4;</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>255</mml:mn><mml:mo stretchy="false">]</mml:mo></mml:math></inline-formula>, extracting connected components, and tracking their evolution. The resulting extremal regions are formally defined in <xref ref-type="disp-formula" rid="eqn-19">Eq. (19)</xref>.<disp-formula id="eqn-19"><label>(19)</label><mml:math id="mml-eqn-19" display="block"><mml:mrow><mml:mi mathvariant="script">R</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>&#x03C4;</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mo fence="false" stretchy="false">{</mml:mo><mml:mi>R</mml:mi><mml:mo>&#x2286;</mml:mo><mml:msubsup><mml:mi>I</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>gray</mml:mtext></mml:mrow></mml:mrow></mml:msubsup><mml:mo>&#x2223;</mml:mo><mml:mi mathvariant="normal">&#x2200;</mml:mi><mml:mi>p</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi>R</mml:mi><mml:mo>,</mml:mo><mml:mi mathvariant="normal">&#x2200;</mml:mi><mml:mi>q</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi mathvariant="normal">&#x2202;</mml:mi><mml:mi>R</mml:mi><mml:mo>&#x003A;</mml:mo><mml:msubsup><mml:mi>I</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>gray</mml:mtext></mml:mrow></mml:mrow></mml:msubsup><mml:mo stretchy="false">(</mml:mo><mml:mi>p</mml:mi><mml:mo 
stretchy="false">)</mml:mo><mml:mo>&#x003C;</mml:mo><mml:mi>&#x03C4;</mml:mi><mml:mo>&#x003C;</mml:mo><mml:msubsup><mml:mi>I</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>gray</mml:mtext></mml:mrow></mml:mrow></mml:msubsup><mml:mo stretchy="false">(</mml:mo><mml:mi>q</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mrow><mml:mtext>&#xA0;or&#xA0;</mml:mtext></mml:mrow><mml:msubsup><mml:mi>I</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>gray</mml:mtext></mml:mrow></mml:mrow></mml:msubsup><mml:mo stretchy="false">(</mml:mo><mml:mi>p</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x003E;</mml:mo><mml:mi>&#x03C4;</mml:mi><mml:mo>&#x003E;</mml:mo><mml:msubsup><mml:mi>I</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>gray</mml:mtext></mml:mrow></mml:mrow></mml:msubsup><mml:mo stretchy="false">(</mml:mo><mml:mi>q</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo fence="false" stretchy="false">}</mml:mo><mml:mo>,</mml:mo></mml:math></disp-formula>where <inline-formula id="ieqn-33"><mml:math id="mml-ieqn-33"><mml:mi>R</mml:mi></mml:math></inline-formula> denotes a connected component at threshold <inline-formula id="ieqn-34"><mml:math id="mml-ieqn-34"><mml:mi>&#x03C4;</mml:mi></mml:math></inline-formula>. The stability of a region is quantified as provided in <xref ref-type="disp-formula" rid="eqn-20">Eq. (20)</xref> yielding maximally stable regions <inline-formula id="ieqn-35"><mml:math id="mml-ieqn-35"><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mrow><mml:mo>&#x2217;</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:math></inline-formula> that persist across scales.
<disp-formula id="eqn-20"><label>(20)</label><mml:math id="mml-eqn-20" display="block"><mml:mi>&#x03B4;</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>R</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:msub><mml:mi>R</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>+</mml:mo><mml:mrow><mml:mi mathvariant="normal">&#x0394;</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>R</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:msub><mml:mi>R</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow></mml:mrow></mml:mfrac><mml:mo>,</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mrow><mml:mo>&#x2217;</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:munder><mml:mrow><mml:mrow><mml:mtext>argmin</mml:mtext></mml:mrow></mml:mrow><mml:mrow><mml:mi>R</mml:mi></mml:mrow></mml:munder><mml:mo>&#x2061;</mml:mo><mml:mi>&#x03B4;</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>R</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo></mml:math></disp-formula></p>
<p>MSERs are enclosed by convex hulls to capture blob-like regions, with AKAZE descriptors <inline-formula id="ieqn-36"><mml:math id="mml-ieqn-36"><mml:msub><mml:mrow><mml:mtext mathvariant="bold">f</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mtext>AKAZE</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:mo fence="false" stretchy="false">{</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:msup><mml:mo fence="false" stretchy="false">}</mml:mo><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> encoding local keypoints and MSER descriptors <inline-formula id="ieqn-37"><mml:math id="mml-ieqn-37"><mml:msub><mml:mrow><mml:mtext mathvariant="bold">f</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mtext>MSER</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> representing stable regions. These, along with global descriptors <inline-formula id="ieqn-38"><mml:math id="mml-ieqn-38"><mml:msub><mml:mrow><mml:mtext mathvariant="bold">f</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mtext>scene</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula>, are concatenated into a unified feature vector integrating local, regional, and global semantics for activity recognition.</p>
</sec>
</sec>
<sec id="s3_6">
<label>3.6</label>
<title>Global Scene Features and Frequency Analysis</title>
<p>The second stage extracts geometric and frequency descriptors: surface normals yield histograms, directional vectors, and Zernike moments, while Gabor filtering captures structural and textural cues without explicit segmentation.</p>
<sec id="s3_6_1">
<label>3.6.1</label>
<title>Scene Surface Normal Estimation</title>
<p>To approximate the underlying surface geometry of the 2D scene, we compute surface normals using Sobel derivatives, as depicted in <xref ref-type="fig" rid="fig-7">Fig. 7</xref>. Given a grayscale image <inline-formula id="ieqn-39"><mml:math id="mml-ieqn-39"><mml:mi>I</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>, the horizontal and vertical gradients capture local intensity changes as <inline-formula id="ieqn-40"><mml:math id="mml-ieqn-40"><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>x</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi mathvariant="normal">&#x2202;</mml:mi><mml:mi>I</mml:mi><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:mi mathvariant="normal">&#x2202;</mml:mi><mml:mi>x</mml:mi></mml:math></inline-formula> and <inline-formula id="ieqn-41"><mml:math id="mml-ieqn-41"><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>y</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi mathvariant="normal">&#x2202;</mml:mi><mml:mi>I</mml:mi><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:mi mathvariant="normal">&#x2202;</mml:mi><mml:mi>y</mml:mi></mml:math></inline-formula>. These gradients are treated as partial derivatives of a height function to estimate the surface normal at each pixel and are calculated using <xref ref-type="disp-formula" rid="eqn-21">Eq. 
(21)</xref>, resulting in a normalized 3D orientation vector <inline-formula id="ieqn-42"><mml:math id="mml-ieqn-42"><mml:mrow><mml:mtext mathvariant="bold">N</mml:mtext></mml:mrow></mml:math></inline-formula> for every image location.<disp-formula id="eqn-21"><label>(21)</label><mml:math id="mml-eqn-21" display="block"><mml:mrow><mml:mtext mathvariant="bold">N</mml:mtext></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>x</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mrow><mml:mi>y</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">]</mml:mo></mml:mrow><mml:msqrt><mml:msubsup><mml:mi>G</mml:mi><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>+</mml:mo><mml:msubsup><mml:mi>G</mml:mi><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:msqrt></mml:mfrac><mml:mo>,</mml:mo></mml:math></disp-formula></p>
<fig id="fig-7">
<label>Figure 7</label>
<caption>
<title>Visualization of surface normal (<bold>a</bold>) RGB frames, (<bold>b</bold>) Estimated surface normal capturing the geometric structure of scene</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_72508-fig-7.tif"/>
</fig>
<p>From this normal map, several compact descriptors are derived, including Azimuth and Inclination Histograms, Directional Variation, Zernike Moments, and Histogram of Angular Differences.</p>
</sec>
<sec id="s3_6_2">
<label>3.6.2</label>
<title>Gabor Filter-Based Texture and Edge Analysis</title>
<p>Gabor filters are applied to RGB frames to capture fine textures and oriented patterns, encoding spatial frequency and orientation for effective edge and gradient detection (<xref ref-type="disp-formula" rid="eqn-22">Eq. (22)</xref>).
<disp-formula id="eqn-22"><label>(22)</label><mml:math id="mml-eqn-22" display="block"><mml:mi>G</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mi>exp</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mfrac><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mi>p</mml:mi><mml:msup><mml:mo stretchy="false">)</mml:mo><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow><mml:msup><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mfrac><mml:mo>+</mml:mo><mml:mfrac><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>y</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mi>q</mml:mi><mml:msup><mml:mo stretchy="false">)</mml:mo><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow><mml:msup><mml:mi>&#x03B2;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mfrac><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x22C5;</mml:mo><mml:mi>exp</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mi>j</mml:mi><mml:mn>2</mml:mn><mml:mi>&#x03C0;</mml:mi><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mi>cos</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mi>&#x03B8;</mml:mi><mml:mo>+</mml:mo><mml:mi>y</mml:mi><mml:mi>sin</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mi>&#x03B8;</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula>here, <inline-formula id="ieqn-43"><mml:math id="mml-ieqn-43"><mml:mo stretchy="false">(</mml:mo><mml:mi>p</mml:mi><mml:mo>,</mml:mo><mml:mi>q</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> defines the kernel center, <inline-formula id="ieqn-44"><mml:math id="mml-ieqn-44"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula> and <inline-formula id="ieqn-45"><mml:math 
id="mml-ieqn-45"><mml:mi>&#x03B2;</mml:mi></mml:math></inline-formula> control the Gaussian spread, <inline-formula id="ieqn-46"><mml:math id="mml-ieqn-46"><mml:mi>f</mml:mi></mml:math></inline-formula> specifies the frequency, and <inline-formula id="ieqn-47"><mml:math id="mml-ieqn-47"><mml:mi>&#x03B8;</mml:mi></mml:math></inline-formula> sets the orientation. The real component emphasizes contours, and the imaginary component highlights directional edges, allowing localized frequency&#x2013;orientation analysis (<xref ref-type="fig" rid="fig-8">Fig. 8</xref>). Each grayscale image is convolved with 2D Gabor kernels across multiple wavelengths and orientations, producing 24 directional responses that capture spatial&#x2013;frequency and texture details. These responses are summarized using entropy, energy, skewness, and kurtosis to compactly represent texture complexity, structural patterns, and contextual semantics for improved activity recognition. Gabor filter responses, each emphasizing different spatial&#x2013;frequency and orientation characteristics, are provided in <xref ref-type="fig" rid="fig-9">Fig. 9</xref>.</p>
<fig id="fig-8">
<label>Figure 8</label>
<caption>
<title>Real (even-symmetric) and imaginary (odd-symmetric) components of the complex Gabor function, capturing contour and directional edge information</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_72508-fig-8.tif"/>
</fig><fig id="fig-9">
<label>Figure 9</label>
<caption>
<title>Visualization of 12 Gabor filter responses, each emphasizing different spatial&#x2013;frequency and orientation characteristics</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_72508-fig-9.tif"/>
</fig>
</sec>
</sec>
<sec id="s3_7">
<label>3.7</label>
<title>Feature Fusion and Classification</title>
<p>To comprehensively encode human motion and scene context, SCENET-3D constructs a composite latent embedding by jointly modelling skeletal dynamics, object-centric structure, and global environmental signatures. Let <inline-formula id="ieqn-48"><mml:math id="mml-ieqn-48"><mml:mi>S</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msup><mml:mo>,</mml:mo></mml:math></inline-formula> <inline-formula id="ieqn-49"><mml:math id="mml-ieqn-49"><mml:mi>O</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mi>O</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msup></mml:math></inline-formula> and <inline-formula id="ieqn-50"><mml:math id="mml-ieqn-50"><mml:mi>G</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msup></mml:math></inline-formula> denote the skeleton-, object-, and global-scene descriptors, respectively. 
Each modality is first normalized and projected into a shared latent space of dimension <italic>m</italic> via trainable linear maps <inline-formula id="ieqn-51"><mml:math id="mml-ieqn-51"><mml:msup><mml:mi>s</mml:mi><mml:mrow><mml:mo>&#x223C;</mml:mo></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mi>S</mml:mi><mml:mo>+</mml:mo><mml:msub><mml:mi>b</mml:mi><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula id="ieqn-52"><mml:math id="mml-ieqn-52"><mml:msup><mml:mi>o</mml:mi><mml:mrow><mml:mo>&#x223C;</mml:mo></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>o</mml:mi></mml:mrow></mml:msub><mml:mi>O</mml:mi><mml:mo>+</mml:mo><mml:msub><mml:mi>b</mml:mi><mml:mrow><mml:mi>o</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula id="ieqn-53"><mml:math id="mml-ieqn-53"><mml:msup><mml:mi>g</mml:mi><mml:mrow><mml:mo>&#x223C;</mml:mo></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mi>G</mml:mi><mml:mo>+</mml:mo><mml:msub><mml:mi>b</mml:mi><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> with <inline-formula id="ieqn-54"><mml:math id="mml-ieqn-54"><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>d</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>.</mml:mo></mml:math></inline-formula> Introducing non-negative modality weights <inline-formula id="ieqn-55"><mml:math 
id="mml-ieqn-55"><mml:mi>&#x03B1;</mml:mi><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mo>[</mml:mo><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mi>o</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mo>]</mml:mo></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x22A4;</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, the fusion operator <inline-formula id="ieqn-56"><mml:math id="mml-ieqn-56"><mml:mi>F</mml:mi><mml:mo>&#x003A;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msup><mml:mo>&#x00D7;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msup><mml:mo>&#x00D7;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msup><mml:mo stretchy="false">&#x2192;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> is defined using <xref ref-type="disp-formula" rid="eqn-23">Eq. (23)</xref>.
<disp-formula id="eqn-23"><label>(23)</label><mml:math id="mml-eqn-23" display="block"><mml:mi>F</mml:mi><mml:mo>=</mml:mo><mml:mi>&#x03C3;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:msup><mml:mi>s</mml:mi><mml:mrow><mml:mo>&#x223C;</mml:mo></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mi>o</mml:mi></mml:mrow></mml:msub><mml:msup><mml:mi>o</mml:mi><mml:mrow><mml:mo>&#x223C;</mml:mo></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x03B1;</mml:mi><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:msup><mml:mi>g</mml:mi><mml:mrow><mml:mo>&#x223C;</mml:mo></mml:mrow></mml:msup><mml:mo>+</mml:mo><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mi>o</mml:mi><mml:mo>,</mml:mo><mml:mi>g</mml:mi><mml:mo>}</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mi>i</mml:mi><mml:mo>&#x003C;</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:munder><mml:msub><mml:mi>T</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:msup><mml:mi>i</mml:mi><mml:mrow><mml:mo>&#x223C;</mml:mo></mml:mrow></mml:msup><mml:mo>&#x2299;</mml:mo><mml:msup><mml:mi>j</mml:mi><mml:mrow><mml:mo>&#x223C;</mml:mo></mml:mrow></mml:msup><mml:mo>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula>where &#x2299; denotes element-wise interaction capturing cross-modal correlations, <inline-formula id="ieqn-57"><mml:math 
id="mml-ieqn-57"><mml:msub><mml:mi>T</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mtext>R</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mtext>m</mml:mtext></mml:mrow><mml:mo>&#x00D7;</mml:mo><mml:mrow><mml:mtext>m</mml:mtext></mml:mrow></mml:mrow></mml:msup></mml:math></inline-formula> are bilinear interaction matrices, and &#x03C3;(&#x00B7;) is a pointwise non-linearity. If <inline-formula id="ieqn-58"><mml:math id="mml-ieqn-58"><mml:mi>x</mml:mi><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mo>[</mml:mo><mml:msup><mml:mrow><mml:msup><mml:mi>s</mml:mi><mml:mrow><mml:mo>&#x223C;</mml:mo></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x22A4;</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:msup><mml:mi>o</mml:mi><mml:mrow><mml:mo>&#x223C;</mml:mo></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x22A4;</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:msup><mml:mi>g</mml:mi><mml:mrow><mml:mo>&#x223C;</mml:mo></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x22A4;</mml:mi></mml:mrow></mml:msup><mml:mo>]</mml:mo></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x22A4;</mml:mi></mml:mrow></mml:msup><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>R</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mi>m</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> denotes the concatenation of projected modalities, then this fusion can also be written compactly using <xref ref-type="disp-formula" rid="eqn-24">Eq. (24)</xref>.
<disp-formula id="eqn-24"><label>(24)</label><mml:math id="mml-eqn-24" display="block"><mml:mi>F</mml:mi><mml:mo>=</mml:mo><mml:mi>&#x03C3;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mi>x</mml:mi><mml:mo>+</mml:mo><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mi mathvariant="normal">&#x22A4;</mml:mi></mml:mrow></mml:msup><mml:mi>K</mml:mi><mml:mi>x</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>Here, <inline-formula id="ieqn-59"><mml:math id="mml-ieqn-59"><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mtext>R</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mtext>m</mml:mtext></mml:mrow><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn><mml:mrow><mml:mtext>m</mml:mtext></mml:mrow></mml:mrow></mml:msup></mml:math></inline-formula> is the linear fusion matrix and <inline-formula id="ieqn-60"><mml:math id="mml-ieqn-60"><mml:mi>K</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mtext>R</mml:mtext></mml:mrow><mml:mrow><mml:mn>3</mml:mn><mml:mrow><mml:mtext>m</mml:mtext></mml:mrow><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn><mml:mrow><mml:mtext>m</mml:mtext></mml:mrow></mml:mrow></mml:msup></mml:math></inline-formula> is a symmetric kernel capturing all second-order cross-terms. The resulting <inline-formula id="ieqn-61"><mml:math id="mml-ieqn-61"><mml:mi>F</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mtext>R</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mtext>m</mml:mtext></mml:mrow></mml:mrow></mml:msup></mml:math></inline-formula> constitutes the composite latent embedding unifying pose, object, and scene cues. Fusing human motion and environmental semantics, the vector is processed by a transformer encoder with multi-head self-attention. For an input sequence <inline-formula id="ieqn-62"><mml:math id="mml-ieqn-62"><mml:mrow><mml:mtext mathvariant="bold">X</mml:mtext></mml:mrow><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>d</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, queries, keys, and values are computed using <xref ref-type="disp-formula" rid="eqn-25">Eq. (25)</xref>:
<disp-formula id="eqn-25"><label>(25)</label><mml:math id="mml-eqn-25" display="block"><mml:mi>Q</mml:mi><mml:mo>=</mml:mo><mml:mi>X</mml:mi><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>Q</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>K</mml:mi><mml:mo>=</mml:mo><mml:mi>X</mml:mi><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>K</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>V</mml:mi><mml:mo>=</mml:mo><mml:mi>X</mml:mi><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>V</mml:mi></mml:mrow></mml:msub></mml:math></disp-formula></p>
<p>The attention mechanism is then defined using <xref ref-type="disp-formula" rid="eqn-26">Eq. (26)</xref>.
<disp-formula id="eqn-26"><label>(26)</label><mml:math id="mml-eqn-26" display="block"><mml:mrow><mml:mtext>Attention</mml:mtext></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>Q</mml:mi><mml:mo>,</mml:mo><mml:mi>K</mml:mi><mml:mo>,</mml:mo><mml:mi>V</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mrow><mml:mtext>softmax</mml:mtext></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mfrac><mml:mrow><mml:mi>Q</mml:mi><mml:msup><mml:mi>K</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:msqrt><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:msqrt></mml:mfrac><mml:mo>)</mml:mo></mml:mrow><mml:mi>V</mml:mi></mml:math></disp-formula></p>
<p>Multi-head attention aggregates <inline-formula id="ieqn-63"><mml:math id="mml-ieqn-63"><mml:mi>h</mml:mi></mml:math></inline-formula> parallel attention heads given in <xref ref-type="disp-formula" rid="eqn-27">Eq. (27)</xref>.<disp-formula id="eqn-27"><label>(27)</label><mml:math id="mml-eqn-27" display="block"><mml:mrow><mml:mtext>MHA</mml:mtext></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>Q</mml:mi><mml:mo>,</mml:mo><mml:mi>K</mml:mi><mml:mo>,</mml:mo><mml:mi>V</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mrow><mml:mtext>Concat</mml:mtext></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mrow><mml:mtext>head</mml:mtext></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mtext>head</mml:mtext></mml:mrow><mml:mrow><mml:mi>h</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>O</mml:mi></mml:mrow></mml:msub></mml:math></disp-formula>with <inline-formula id="ieqn-64"><mml:math id="mml-ieqn-64"><mml:msub><mml:mrow><mml:mtext>head</mml:mtext></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mtext>Attention</mml:mtext></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>Q</mml:mi><mml:msubsup><mml:mi>W</mml:mi><mml:mrow><mml:mi>Q</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mi>K</mml:mi><mml:msubsup><mml:mi>W</mml:mi><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mi>V</mml:mi><mml:msubsup><mml:mi>W</mml:mi><mml:mrow><mml:mi>V</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msubsup><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>. 
The encoder output <inline-formula id="ieqn-65"><mml:math id="mml-ieqn-65"><mml:mrow><mml:mtext mathvariant="bold">Z</mml:mtext></mml:mrow></mml:math></inline-formula> is passed through a feed-forward network with residual connections and normalization.</p>
<p>Finally, the activity class probabilities are computed using a softmax layer using <xref ref-type="disp-formula" rid="eqn-28">Eq. (28)</xref>.<disp-formula id="eqn-28"><label>(28)</label><mml:math id="mml-eqn-28" display="block"><mml:mrow><mml:mover><mml:mi>y</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mtext>softmax</mml:mtext></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>W</mml:mi><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo>&#x22C5;</mml:mo><mml:msub><mml:mrow><mml:mtext mathvariant="bold">z</mml:mtext></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>b</mml:mi><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula>where <inline-formula id="ieqn-66"><mml:math id="mml-ieqn-66"><mml:msub><mml:mrow><mml:mtext mathvariant="bold">z</mml:mtext></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the classification token embedding. This fusion&#x2013;transformer pipeline enables robust recognition by jointly reasoning over pose and scene features, effectively disambiguating visually similar activities in cluttered environments.</p>
<p>As given in <xref ref-type="table" rid="table-1">Table 1</xref>, the transformer-based model was optimally configured with 6 encoder layers, a model dimension of 768, and a feed-forward hidden size of 3072, using GeLU activation and a dropout rate of 0.1. Each encoder layer employed 12 attention heads with 64-dimensional per-head projections and attention dropout of 0.1. Layer normalization was applied in a pre-norm configuration with &#x03B5; set to 1e&#x2212;5. The fully connected classification head comprised two hidden layers of 512 neurons each with ReLU activation, followed by a Softmax output layer for 10 classes, resulting in a total parameter count of approximately 199k. This setup balances model complexity and performance, providing effective representation learning while controlling overfitting.</p>
<table-wrap id="table-1">
<label>Table 1</label>
<caption>
<title>Selected hyperparameters and architectural settings of the transformer model</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/> </colgroup>
<thead>
<tr>
<th>Layer/Component</th>
<th>Parameter name</th>
<th>Values</th>
<th>Optimal value</th>
</tr>
</thead>
<tbody>
<tr>
<td rowspan="5">Transformer encoder</td>
<td>Number of layers (L)</td>
<td>4&#x2013;12</td>
<td>6</td>
</tr>
<tr>
<td>Model dimension (dmodel)</td>
<td>512&#x2013;1024</td>
<td>768</td>
</tr>
<tr>
<td>Feed-forward hidden size (dff)</td>
<td>2048&#x2013;4096</td>
<td>3072</td>
</tr>
<tr>
<td>Dropout rate</td>
<td>0.05&#x2013;0.2</td>
<td>0.1</td>
</tr>
<tr>
<td>Activation function</td>
<td>ReLU/GeLU</td>
<td>GeLU</td>
</tr>
<tr>
<td rowspan="2">Multi-head attention (Inside Encoder)</td>
<td>Number of attention heads (H)</td>
<td>8&#x2013;16</td>
<td>12</td>
</tr>
<tr>
<td>Per-head dimension (dk)</td>
<td>64&#x2013;128</td>
<td>64</td>
</tr>
<tr>
<td/>
<td>Attention dropout</td>
<td>0.05&#x2013;0.2</td>
<td>0.1</td>
</tr>
<tr>
<td rowspan="2">Add &#x0026; LayerNorm blocks (Inside Encoder)</td>
<td>Normalization type</td>
<td>LayerNorm/ RMSNorm</td>
<td>LayerNorm</td>
</tr>
<tr>
<td>&#x03B5; (epsilon) parameter</td>
<td>1e&#x2212;6&#x2013;1e&#x2212;4</td>
<td>1e&#x2212;5</td>
</tr>
<tr>
<td/>
<td>Residual connection placement</td>
<td>Pre-/Post-Norm</td>
<td>Pre-Norm</td>
</tr>
<tr>
<td rowspan="3">Fully connected classification layer</td>
<td>Input neurons</td>
<td>512&#x2013;1024</td>
<td>768</td>
</tr>
<tr>
<td>Hidden layers (Count)</td>
<td>1&#x2013;3</td>
<td>2</td>
</tr>
<tr>
<td>Neurons per hidden layer</td>
<td>128&#x2013;512</td>
<td>512</td>
</tr>
<tr>
<td/>
<td>Activation function (Hidden Layer)</td>
<td>ReLU/Tanh</td>
<td>ReLU</td>
</tr>
<tr><td/>
<td>Output neurons (Classes)</td>
<td>Fixed by task (10)</td>
<td>10</td>
</tr>
<tr><td/>
<td>Activation function (Output Layer)</td>
<td>Softmax (fixed)</td>
<td>Softmax</td>
</tr>
<tr><td/>
<td>Parameter count</td>
<td>&#x2248;150k&#x2013;&#x2248;500k</td>
<td>&#x2248;199k</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Results and Discussion</title>
<p>The results section evaluates the proposed HAR system on UCLA, ETRI-Activity3D, and CAD-120 datasets. Performance is assessed via confusion matrices, ROC curves, and metrics including accuracy, precision, recall, and F1-score. Comparative tables highlight the method&#x2019;s effectiveness and robustness across datasets.</p>
<sec id="s4_1">
<label>4.1</label>
<title>Experimental Setup</title>
<p>All experiments for SCENET-3D were conducted in a cloud-based environment using Google Colab kernels, which provided access to an NVIDIA Tesla T4 GPU (16 GB GDDR6, 2560 CUDA cores, Turing architecture), a dual-core Intel Xeon CPU (2.20 GHz), and 27 GB of system RAM. Storage and dataset management were handled through Google Drive integration with temporary VM scratch space for caching. The system comprising the Transformer backbone, pseudo-3D skeleton encoder, and scene-object fusion module&#x2014;was implemented in Python 3.10 with TensorFlow 2.6.0 and the Keras API, accelerated using CUDA 11.0 and cuDNN 8.0 for parallelized training and inference. NumPy, Pandas, and OpenCV supported numerical processing, structured data handling, and augmentation, while Matplotlib, Seaborn, and TensorBoard were used to visualize learning dynamics.</p>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Datasets</title>
<p>We evaluated on three benchmarks: N-UCLA Multiview Action 3D (RGB, depth, skeletons of 10 actions across three Kinect views), ETRI-Activity3D (55&#x002B; daily activities with RGB-D and motion capture), and CAD-120 (10 complex activities with RGB-D, skeletons, object tracks, and sub-activity labels). These datasets cover diverse modalities and environments for comprehensive evaluation.</p>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Confusion Matrix Analysis</title>
<p>The UCLA confusion matrix (<xref ref-type="fig" rid="fig-10">Fig. 10</xref>) shows strong diagonal dominance, with most activities (Ex1, Ex4, Ex5, Ex6, Ex9, Ex10) above 94% accuracy. On ETRI-Activity3D (<xref ref-type="fig" rid="fig-10">Fig. 10</xref>), the model achieves average precision, recall, and F1 of 0.8058, 0.8056, and 0.8047. Distinctive activities (Ex49&#x2013;Ex55) exceed 0.94, while overlapping motions (Ex2, Ex7, Ex8, Ex29) lower discriminability. Well-defined actions (Ex6, Ex28, Ex38&#x2013;Ex55) show near-perfect recognition, highlighting the need for temporal modeling or data augmentation for ambiguous classes. On CAD-120 (<xref ref-type="fig" rid="fig-10">Fig. 10</xref>), the framework achieves &#x003E;90% accuracy for Cleaning, Having a meal, Microwaving, Putting, Stacking, and Taking objects.</p>
<fig id="fig-10">
<label>Figure 10</label>
<caption>
<title>Normalized confusion matrices on three datasets: (<bold>a</bold>) UCLA, (<bold>b</bold>) ETRI-Activity3D, and (<bold>c</bold>) CAD-120</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_72508-fig-10.tif"/>
</fig><fig id="fig-11">
<label>Figure 11</label>
<caption>
<title>ROC curves for the proposed model across three datasets: (<bold>a</bold>) UCLA, (<bold>b</bold>) ETRI-Activity3D, and (<bold>c</bold>) CAD-120</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_72508-fig-11.tif"/>
</fig>
</sec>
<sec id="s4_4">
<label>4.4</label>
<title>ROC Curves Analysis</title>
<p>ROC curves offer threshold-independent evaluation of TPR&#x2013;FPR trade-offs. UCLA curves cluster near the top-left, showing near-perfect separability. CAD-120 also approaches the ideal, confirming robust recognition of structured activities. ETRI shows greater variability, with lower sensitivity at low FPRs due to subtle motions and complex backgrounds. Overall, UCLA performs best, CAD-120 is balanced, and ETRI needs stronger feature discrimination (<xref ref-type="fig" rid="fig-11">Fig. 11</xref>).</p>
</sec>
<sec id="s4_5">
<label>4.5</label>
<title>Performance Analysis</title>
<p><xref ref-type="table" rid="table-2">Table 2</xref> shows performance across three benchmarks. UCLA achieves the highest accuracy (95.8%) with balanced precision, recall, and F1. ETRI scores slightly lower due to variability, while CAD-120 attains competitive metrics, demonstrating strong generalization to structured activities. All experiments on UCLA, ETRI-Activity3D and CAD-120 were repeated five times with different random seeds, and <xref ref-type="table" rid="table-2">Table 2</xref> reports mean &#x00B1; standard deviation for accuracy, precision, recall and F1-score. These statistics provide direct estimates of variability and thus approximate 95% confidence intervals. We further performed paired two-tailed <italic>t</italic>-tests comparing our full model with the strongest baseline for each dataset. All improvements were significant at <italic>p</italic> &#x003C; 0.05, confirming that the reported gains are statistically robust rather than due to random variation.</p>
<table-wrap id="table-2">
<label>Table 2</label>
<caption>
<title>Classification performance metrics for the proposed system across three datasets</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/> </colgroup>
<thead>
<tr>
<th>Dataset</th>
<th>Accuracy (% &#x00B1;SD)</th>
<th>Precision (&#x00B1;SD)</th>
<th>Recall (&#x00B1;SD)</th>
<th>F1-score (&#x00B1;SD)</th>
<th><italic>p</italic>-value (vs. Strongest baseline)</th>
</tr>
</thead>
<tbody>
<tr>
<td>UCLA</td>
<td>95.8 &#x00B1; 0.7</td>
<td>0.907 &#x00B1; 0.010</td>
<td>0.911 &#x00B1; 0.011</td>
<td>0.908 &#x00B1; 0.012</td>
<td>&#x003C;0.05</td>
</tr>
<tr>
<td>ETRI</td>
<td>89.4 &#x00B1; 0.8</td>
<td>0.806 &#x00B1; 0.015</td>
<td>0.806 &#x00B1; 0.016</td>
<td>0.805 &#x00B1; 0.017</td>
<td>&#x003C;0.05</td>
</tr>
<tr>
<td>CAD-120</td>
<td>91.2 &#x00B1; 0.6</td>
<td>0.853 &#x00B1; 0.012</td>
<td>0.862 &#x00B1; 0.013</td>
<td>0.862 &#x00B1; 0.014</td>
<td>&#x003C;0.05</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4_6">
<label>4.6</label>
<title>Ablation Study</title>
<p>Ablation results (<xref ref-type="table" rid="table-3">Table 3</xref>) show that removing preprocessing reduces accuracy by &#x007E;5%&#x2013;6%, while excluding point clouds or MoveNet keypoints causes the largest drop (20%&#x2013;25%), confirming the necessity of 3D structure and skeleton cues. Object-wise scene analysis adds 8%&#x2013;12%, and surface normals plus Gabor features contribute 3%&#x2013;5%, refining recognition in cluttered settings.</p>
<table-wrap id="table-3">
<label>Table 3</label>
<caption>
<title>Ablation study showing the contribution of each module to classification accuracy</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/> </colgroup>
<thead>
<tr>
<th>Method/Component</th>
<th>Description</th>
<th>UCLA (%)</th>
<th>ETRI (%)</th>
<th>CAD-120 (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td>Full model</td>
<td>Skeleton &#x002B; scene descriptors &#x002B; transformer &#x002B; cross-modal fusion</td>
<td>95.8 &#x00B1; 0.7</td>
<td>89.4 &#x00B1; 0.8</td>
<td>91.2 &#x00B1; 0.6</td>
</tr>
<tr>
<td>Without preprocessing</td>
<td>Full model without normalization, denoising, or augmentation</td>
<td>90.1 &#x00B1; 0.9</td>
<td>84.2 &#x00B1; 1.0</td>
<td>85.4 &#x00B1; 0.8</td>
</tr>
<tr>
<td>Without point clouds</td>
<td>Excludes raw 3D point cloud input</td>
<td>75.1 &#x00B1; 1.1</td>
<td>68.2 &#x00B1; 1.2</td>
<td>69.3 &#x00B1; 1.0</td>
</tr>
<tr>
<td>Without MoveNet keypoints</td>
<td>Excludes skeleton joint coordinates via MoveNet</td>
<td>69.6 &#x00B1; 1.0</td>
<td>66.9 &#x00B1; 1.1</td>
<td>71.2 &#x00B1; 0.9</td>
</tr>
<tr>
<td>Without object-wise scene analysis</td>
<td>Removes scene descriptors/object-level analysis</td>
<td>79.9 &#x00B1; 1.0</td>
<td>78.9 &#x00B1; 0.9</td>
<td>82.2 &#x00B1; 1.0</td>
</tr>
<tr>
<td>Without surface normals &#x0026; features</td>
<td>Excludes depth &#x0026; surface normal features</td>
<td>85.2 &#x00B1; 0.8</td>
<td>86.2 &#x00B1; 0.7</td>
<td>83.3 &#x00B1; 0.9</td>
</tr>
<tr>
<td>Without Gabor filters &#x0026; features</td>
<td>Excludes Gabor-based texture features</td>
<td>87.8 &#x00B1; 0.9</td>
<td>86.7 &#x00B1; 0.8</td>
<td>81.3 &#x00B1; 1.1</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>SCENET-3D employs a three-stage pipeline&#x2014;scene analysis, scene-object analysis, and human-pose estimation&#x2014;combining handcrafted descriptors, transformer-based representations, and pseudo-3D point clouds for robust, interpretable understanding. Stage one captures holistic scene attributes (lighting, geometry, texture) via Zernike moments and Gabor filters. Stage two localizes objects using AKAZE and MSER keypoints to preserve local geometric and textural features. The transformer encoder enhances handcrafted and point cloud features by modeling cross-modal and temporal dependencies among skeleton, scene-object, and 3D spatial representations. This context-aware reasoning helps disambiguate visually similar actions. Ablation studies show that removing object, global scene, or point cloud features reduces accuracy by 8%&#x2013;12%, 3%&#x2013;5%, and 5%&#x2013;7%, respectively, highlighting the importance of integrating local, global, temporal, and spatial cues. This hybrid approach outperforms pipelines using only deep learning, handcrafted, or point-cloud features.</p>
</sec>
<sec id="s4_7">
<label>4.7</label>
<title>Computational Cost Analysis</title>
<p><xref ref-type="table" rid="table-4">Table 4</xref> highlights the significant variation in computational demands across different vision modules. Tasks like semantic segmentation and point-cloud processing are highly resource-intensive, reflecting their complexity in analyzing large-scale scene and spatial data. In contrast, modules such as region detection, feature extraction, and pose estimation are relatively lightweight, offering efficient processing for targeted analyses. Intermediate techniques, like multi-scale Gabor filtering, balance complexity and efficiency, providing rich feature representations without excessive computational cost.</p>
<table-wrap id="table-4">
<label>Table 4</label>
<caption>
<title>Computational performance of each module (processing time per frame) and corresponding FLOPs</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/> </colgroup>
<thead>
<tr>
<th>Module</th>
<th>Estimated time (ms/Frame)</th>
<th>Estimated FLOPs (TFLOPs/Frame)</th>
</tr>
</thead>
<tbody>
<tr>
<td>Semantic segmentation (scene)</td>
<td>16,483.11 ms</td>
<td>0.019600 TFLOPs</td>
</tr>
<tr>
<td>MSER (region detection)</td>
<td>112.05 ms</td>
<td>0.00000104 TFLOPs</td>
</tr>
<tr>
<td>AKAZE (feature extraction &#x0026; descriptors)</td>
<td>312.80 ms</td>
<td>0.00002 TFLOPs</td>
</tr>
<tr>
<td>MoveNet (pose estimation)</td>
<td>73.04 ms</td>
<td>0.000960 TFLOPs</td>
</tr>
<tr>
<td>Surface normals (from depth/point cloud)</td>
<td>45.49 ms</td>
<td>0.000057 TFLOPs</td>
</tr>
<tr>
<td>Gabor filter bank (multi-scale/orientation)</td>
<td>125.07 ms</td>
<td>0.00401 TFLOPs</td>
</tr>
<tr>
<td>Point-cloud processing</td>
<td>10,632.88 ms</td>
<td>0.5061 TFLOPs</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4_8">
<label>4.8</label>
<title>Comparative Analysis</title>
<p><xref ref-type="table" rid="table-5">Table 5</xref> compares classification accuracies across state-of-the-art methods. On UCLA, several models exceed 90%, with the proposed system achieving the highest (95.8%). For the more complex ETRI-Activity3D dataset, accuracies are lower overall, yet our approach leads with 89.4%. On CAD-120, results are more balanced, with the proposed method slightly surpassing prior work, demonstrating robustness across varying dataset complexities.</p>
<table-wrap id="table-5">
<label>Table 5</label>
<caption>
<title>Comparison of reported accuracy for UCLA, ETRI-Activity3D, and CAD-120 datasets</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th colspan="2">UCLA</th>
<th colspan="2">ETRI-Activity3D</th>
<th colspan="2">CAD-120</th>
</tr>
<tr>
<th><bold>Author(s)</bold></th>
<th><bold>Acc. (%)</bold></th>
<th><bold>Author(s)</bold></th>
<th><bold>Acc. (%)</bold></th>
<th><bold>Author(s)</bold></th>
<th><bold>Acc. (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td>Al-Faris et al. [<xref ref-type="bibr" rid="ref-21">21</xref>]</td>
<td>91.5</td>
<td>Xie et al. [<xref ref-type="bibr" rid="ref-31">31</xref>]</td>
<td>82.4</td>
<td>Li et al. [<xref ref-type="bibr" rid="ref-33">33</xref>]</td>
<td>91.8</td>
</tr>
<tr>
<td>Lee et al. [<xref ref-type="bibr" rid="ref-22">22</xref>]</td>
<td>89.7</td>
<td>Xu et al. [<xref ref-type="bibr" rid="ref-32">32</xref>]</td>
<td>83.0</td>
<td>Almushyti and Li [<xref ref-type="bibr" rid="ref-36">36</xref>]</td>
<td>88.54</td>
</tr>
<tr>
<td>Siddiqui et al. [<xref ref-type="bibr" rid="ref-23">23</xref>]</td>
<td>95.2</td>
<td>Li et al. [<xref ref-type="bibr" rid="ref-33">33</xref>]</td>
<td>83.3</td>
<td>Qi et al. [<xref ref-type="bibr" rid="ref-37">37</xref>]</td>
<td>88.9</td>
</tr>
<tr>
<td>Zakka et al. [<xref ref-type="bibr" rid="ref-24">24</xref>]</td>
<td>94.6</td>
<td>Yan et al. [<xref ref-type="bibr" rid="ref-34">34</xref>]</td>
<td>86.8</td>
<td><bold>Proposed system</bold></td>
<td>91.2</td>
</tr>
<tr>
<td>Baptista et al. [<xref ref-type="bibr" rid="ref-25">25</xref>]</td>
<td>86.9</td>
<td>Tayyab and Jalal [<xref ref-type="bibr" rid="ref-35">35</xref>]</td>
<td>87.5</td>
<td></td>
<td></td>
</tr>
<tr>
<td rowspan="2">Cheng et al. [<xref ref-type="bibr" rid="ref-26">26</xref>]</td>
<td rowspan="2">93.3</td>
<td>Kim et al. [<xref ref-type="bibr" rid="ref-17">17</xref>]</td>
<td>59.4</td>
<td rowspan="2"></td>
<td rowspan="2"></td>
</tr>
<tr>
<td><bold>Proposed system</bold></td>
<td>89.4</td>
</tr>
<tr>
<td>Liu et al. [<xref ref-type="bibr" rid="ref-27">27</xref>]</td>
<td>92.7</td>
<td colspan="4"></td>
</tr>
<tr>
<td>Yang et al. [<xref ref-type="bibr" rid="ref-28">28</xref>]</td>
<td>94.0</td>
<td/>
<td/>
<td/>
<td/>
</tr>
<tr>
<td>Das et al. [<xref ref-type="bibr" rid="ref-29">29</xref>]</td>
<td>54.2</td>
<td/>
<td/>
<td/>
<td/>
</tr>
<tr>
<td>Kim et al. [<xref ref-type="bibr" rid="ref-30">30</xref>]</td>
<td>90.4</td>
<td/>
<td/>
<td/>
<td/>
</tr>
<tr>
<td><bold>Proposed system SCENET-3D</bold></td>
<td>95.8</td>
<td/>
<td/>
<td/>
<td/>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Conclusion and Future Work</title>
<p>This work presents a unified RGB-based framework integrating pose-aware features with scene-level context for robust HAR in elderly care. Combining MoveNet keypoints, MiDaS pseudo-3D cues, and multi-level scene descriptors, it achieves high accuracy on UCLA (95.8%), ETRI-Activity3D (89.4%), and CAD-120 (91.2%). While effective, the pipeline is sensitive to occlusion, illumination, and clutter, and current fusion does not fully capture temporal transitions. Future work will explore transformer-based temporal modeling, adaptive feature fusion, and lightweight designs for real-time deployment.</p>
</sec>
</body>
<back>
<ack>
<p>Princess Nourah bint Abdulrahman University Researchers Supporting Project number (PNURSP2025R410), Princess Nourah bint Abdulrahman University, Riyadh, Saudi Arabia.</p>
</ack>
<sec>
<title>Funding Statement</title>
<p>This research is supported and funded by Princess Nourah bint Abdulrahman University Researchers Supporting Project number (PNURSP2025R410), Princess Nourah bint Abdulrahman University, Riyadh, Saudi Arabia.</p>
</sec>
<sec>
<title>Author Contributions</title>
<p>Study conception and design: Aman Aman Ullah and Yanfeng Wu; data collection: Nouf Abdullah Almujally and Shaheryar Najam; analysis and interpretation of results: Ahmad Jalal, Hui Liu, and Shaheryar Najam; draft manuscript preparation: Ahmad Jalal and Shaheryar Najam. All authors reviewed the results and approved the final version of the manuscript.</p>
</sec>
<sec sec-type="data-availability">
<title>Availability of Data and Materials</title>
<p>All publicly available datasets are used in the study.</p>
</sec>
<sec>
<title>Ethics Approval</title>
<p>Not applicable.</p>
</sec>
<sec sec-type="COI-statement">
<title>Conflicts of Interest</title>
<p>The authors declare no conflicts of interest to report regarding the present study.</p>
</sec>
<ref-list content-type="authoryear">
<title>References</title>
<ref id="ref-1"><label>[1]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Shoaib</surname> <given-names>M</given-names></string-name>, <string-name><surname>Dragon</surname> <given-names>R</given-names></string-name>, <string-name><surname>Ostermann</surname> <given-names>J</given-names></string-name></person-group>. <article-title>Context-aware visual analysis of elderly activity in a cluttered home environment</article-title>. <source>EURASIP J Adv Signal Process</source>. <year>2011</year>;<volume>2011</volume>(<issue>1</issue>):<fpage>129</fpage>. doi:<pub-id pub-id-type="doi">10.1186/1687-6180-2011-129</pub-id>.</mixed-citation></ref>
<ref id="ref-2"><label>[2]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Wang</surname> <given-names>P</given-names></string-name>, <string-name><surname>Li</surname> <given-names>W</given-names></string-name>, <string-name><surname>Ogunbona</surname> <given-names>P</given-names></string-name>, <string-name><surname>Wan</surname> <given-names>J</given-names></string-name>, <string-name><surname>Escalera</surname> <given-names>S</given-names></string-name></person-group>. <article-title>RGB-D-based human motion recognition with deep learning: a survey</article-title>. <source>Comput Vis Image Underst</source>. <year>2018</year>;<volume>171</volume>(<issue>3</issue>):<fpage>118</fpage>&#x2013;<lpage>39</lpage>. doi:<pub-id pub-id-type="doi">10.1016/j.cviu.2018.04.007</pub-id>.</mixed-citation></ref>
<ref id="ref-3"><label>[3]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Shin</surname> <given-names>J</given-names></string-name>, <string-name><surname>Hassan</surname> <given-names>N</given-names></string-name>, <string-name><surname>Miah</surname> <given-names>ASM</given-names></string-name>, <string-name><surname>Nishimura</surname> <given-names>S</given-names></string-name></person-group>. <article-title>A comprehensive methodological survey of human activity recognition across diverse data modalities</article-title>. <source>Sensors</source>. <year>2025</year>;<volume>25</volume>(<issue>13</issue>):<fpage>4028</fpage>. doi:<pub-id pub-id-type="doi">10.3390/s25134028</pub-id>; <pub-id pub-id-type="pmid">40648284</pub-id></mixed-citation></ref>
<ref id="ref-4"><label>[4]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Zhang</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>A comprehensive survey on RGB-D-based human action recognition: algorithms, datasets and popular applications</article-title>. <source>EURASIP J Image Video Process</source>. <year>2025</year>;<volume>2025</volume>(<issue>1</issue>):<fpage>15</fpage>. doi:<pub-id pub-id-type="doi">10.1186/s13640-025-00677-0</pub-id>.</mixed-citation></ref>
<ref id="ref-5"><label>[5]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Joshi</surname> <given-names>RBD</given-names></string-name>, <string-name><surname>Joshi</surname> <given-names>D</given-names></string-name></person-group>. <article-title>MoveNet: a deep neural network for joint profile prediction across variable walking speeds and slopes</article-title>. <source>IEEE Trans Instrum Meas</source>. <year>2021</year>;<volume>70</volume>:<fpage>1</fpage>&#x2013;<lpage>11</lpage>. doi:<pub-id pub-id-type="doi">10.1109/tim.2021.3073720</pub-id>.</mixed-citation></ref>
<ref id="ref-6"><label>[6]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Granata</surname> <given-names>C</given-names></string-name>, <string-name><surname>Ibanez</surname> <given-names>A</given-names></string-name>, <string-name><surname>Bidaud</surname> <given-names>P</given-names></string-name></person-group>. <article-title>Human activity-understanding: a multilayer approach combining body movements and contextual descriptors analysis</article-title>. <source>Int J Adv Rob Syst</source>. <year>2015</year>;<volume>12</volume>(<issue>7</issue>):<fpage>89</fpage>. doi:<pub-id pub-id-type="doi">10.5772/60525</pub-id>.</mixed-citation></ref>
<ref id="ref-7"><label>[7]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Melas-Kyriazi</surname> <given-names>L</given-names></string-name>, <string-name><surname>Rupprecht</surname> <given-names>C</given-names></string-name>, <string-name><surname>Vedaldi</surname> <given-names>A</given-names></string-name></person-group>. <article-title>PC2: projection-conditioned point cloud diffusion for single-image 3D reconstruction</article-title>. In: <conf-name>Proceedings of the 2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR); 2023 Jun 17&#x2013;24</conf-name>; <publisher-loc>Vancouver, BC, Canada</publisher-loc>. p. <fpage>12923</fpage>&#x2013;<lpage>32</lpage>. doi:<pub-id pub-id-type="doi">10.1109/CVPR52729.2023.01242</pub-id>.</mixed-citation></ref>
<ref id="ref-8"><label>[8]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Ullah</surname> <given-names>R</given-names></string-name>, <string-name><surname>Asghar</surname> <given-names>I</given-names></string-name>, <string-name><surname>Akbar</surname> <given-names>S</given-names></string-name>, <string-name><surname>Evans</surname> <given-names>G</given-names></string-name>, <string-name><surname>Vermaak</surname> <given-names>J</given-names></string-name>, <string-name><surname>Alblwi</surname> <given-names>A</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>Vision-based activity recognition for unobtrusive monitoring of the elderly in care settings</article-title>. <source>Technologies</source>. <year>2025</year>;<volume>13</volume>(<issue>5</issue>):<fpage>184</fpage>. doi:<pub-id pub-id-type="doi">10.3390/technologies13050184</pub-id>.</mixed-citation></ref>
<ref id="ref-9"><label>[9]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Lee</surname> <given-names>JJ</given-names></string-name>, <string-name><surname>Benes</surname> <given-names>B</given-names></string-name></person-group>. <article-title>RGB2Point: 3D point cloud generation from single RGB images</article-title>. In: <conf-name>Proceedings of the 2025 IEEE/CVF Winter Conference on Applications of Computer Vision (WACV); 2025 Feb 26&#x2013;Mar 6</conf-name>; <publisher-loc>Tucson, AZ, USA. Piscataway, NJ, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>; <year>2025</year>. p. <fpage>2952</fpage>&#x2013;<lpage>62</lpage>. doi:<pub-id pub-id-type="doi">10.1109/WACV61041.2025.00292</pub-id>.</mixed-citation></ref>
<ref id="ref-10"><label>[10]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Al Farid</surname> <given-names>F</given-names></string-name>, <string-name><surname>Bari</surname> <given-names>A</given-names></string-name>, <string-name><surname>Miah</surname> <given-names>ASM</given-names></string-name>, <string-name><surname>Mansor</surname> <given-names>S</given-names></string-name>, <string-name><surname>Uddin</surname> <given-names>J</given-names></string-name>, <string-name><surname>Kumaresan</surname> <given-names>SP</given-names></string-name></person-group>. <article-title>A structured and methodological review on multi-view human activity recognition for ambient assisted living</article-title>. <source>J Imaging</source>. <year>2025</year>;<volume>11</volume>(<issue>6</issue>):<fpage>182</fpage>. doi:<pub-id pub-id-type="doi">10.3390/jimaging11060182</pub-id>; <pub-id pub-id-type="pmid">40558781</pub-id></mixed-citation></ref>
<ref id="ref-11"><label>[11]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Ghorbani</surname> <given-names>F</given-names></string-name>, <string-name><surname>Ahmadi</surname> <given-names>A</given-names></string-name>, <string-name><surname>Kia</surname> <given-names>M</given-names></string-name>, <string-name><surname>Rahman</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Delrobaei</surname> <given-names>M</given-names></string-name></person-group>. <article-title>A decision-aware ambient assisted living system with IoT embedded device for in-home monitoring of older adults</article-title>. <source>Sensors</source>. <year>2023</year>;<volume>23</volume>(<issue>5</issue>):<fpage>2673</fpage>. doi:<pub-id pub-id-type="doi">10.3390/s23052673</pub-id>; <pub-id pub-id-type="pmid">36904877</pub-id></mixed-citation></ref>
<ref id="ref-12"><label>[12]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Gaya-Morey</surname> <given-names>FX</given-names></string-name>, <string-name><surname>Manresa-Yee</surname> <given-names>C</given-names></string-name>, <string-name><surname>Buades-Rubio</surname> <given-names>JM</given-names></string-name></person-group>. <article-title>Deep learning for computer vision based activity recognition and fall detection of the elderly: a systematic review</article-title>. <source>Appl Intell</source>. <year>2024</year>;<volume>54</volume>(<issue>19</issue>):<fpage>8982</fpage>&#x2013;<lpage>9007</lpage>. doi:<pub-id pub-id-type="doi">10.1007/s10489-024-05645-1</pub-id>.</mixed-citation></ref>
<ref id="ref-13"><label>[13]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Hu</surname> <given-names>T</given-names></string-name>, <string-name><surname>Zhu</surname> <given-names>X</given-names></string-name>, <string-name><surname>Guo</surname> <given-names>W</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>S</given-names></string-name>, <string-name><surname>Zhu</surname> <given-names>J</given-names></string-name></person-group>. <article-title>Human action recognition based on scene semantics</article-title>. <source>Multimed Tools Appl</source>. <year>2019</year>;<volume>78</volume>(<issue>20</issue>):<fpage>28515</fpage>&#x2013;<lpage>36</lpage>. doi:<pub-id pub-id-type="doi">10.1007/s11042-017-5496-x</pub-id>.</mixed-citation></ref>
<ref id="ref-14"><label>[14]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Zhang</surname> <given-names>W</given-names></string-name></person-group>. <article-title>Scene context-aware graph convolutional network for skeleton-based action recognition</article-title>. <source>IET Comput Vis</source>. <year>2024</year>;<volume>18</volume>(<issue>3</issue>):<fpage>343</fpage>&#x2013;<lpage>54</lpage>. doi:<pub-id pub-id-type="doi">10.1049/cvi2.12253</pub-id>.</mixed-citation></ref>
<ref id="ref-15"><label>[15]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Rafique</surname> <given-names>AA</given-names></string-name>, <string-name><surname>Gochoo</surname> <given-names>M</given-names></string-name>, <string-name><surname>Jalal</surname> <given-names>A</given-names></string-name>, <string-name><surname>Kim</surname> <given-names>K</given-names></string-name></person-group>. <article-title>Maximum entropy scaled super pixels segmentation for multi-object detection and scene recognition via deep belief network</article-title>. <source>Multimed Tools Appl</source>. <year>2023</year>;<volume>82</volume>(<issue>9</issue>):<fpage>13401</fpage>&#x2013;<lpage>30</lpage>. doi:<pub-id pub-id-type="doi">10.1007/s11042-022-13717-y</pub-id>.</mixed-citation></ref>
<ref id="ref-16"><label>[16]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Achirei</surname> <given-names>SD</given-names></string-name>, <string-name><surname>Heghea</surname> <given-names>MC</given-names></string-name>, <string-name><surname>Lupu</surname> <given-names>RG</given-names></string-name>, <string-name><surname>Manta</surname> <given-names>VI</given-names></string-name></person-group>. <article-title>Human activity recognition for assisted living based on scene understanding</article-title>. <source>Appl Sci</source>. <year>2022</year>;<volume>12</volume>(<issue>21</issue>):<fpage>10743</fpage>. doi:<pub-id pub-id-type="doi">10.3390/app122110743</pub-id>.</mixed-citation></ref>
<ref id="ref-17"><label>[17]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Kim</surname> <given-names>D</given-names></string-name>, <string-name><surname>Lee</surname> <given-names>I</given-names></string-name>, <string-name><surname>Kim</surname> <given-names>D</given-names></string-name>, <string-name><surname>Lee</surname> <given-names>S</given-names></string-name></person-group>. <article-title>Action recognition using close-up of maximum activation and ETRI-Activity3D LivingLab dataset</article-title>. <source>Sensors</source>. <year>2021</year>;<volume>21</volume>(<issue>20</issue>):<fpage>6774</fpage>. doi:<pub-id pub-id-type="doi">10.3390/s21206774</pub-id>; <pub-id pub-id-type="pmid">34695988</pub-id></mixed-citation></ref>
<ref id="ref-18"><label>[18]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Han</surname> <given-names>G</given-names></string-name>, <string-name><surname>Zhao</surname> <given-names>J</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>L</given-names></string-name>, <string-name><surname>Deng</surname> <given-names>F</given-names></string-name></person-group>. <article-title>A survey of human-object interaction detection with deep learning</article-title>. <source>IEEE Trans Emerg Top Comput Intell</source>. <year>2025</year>;<volume>9</volume>(<issue>1</issue>):<fpage>3</fpage>&#x2013;<lpage>26</lpage>. doi:<pub-id pub-id-type="doi">10.1109/tetci.2024.3518613</pub-id>.</mixed-citation></ref>
<ref id="ref-19"><label>[19]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Maheriya</surname> <given-names>K</given-names></string-name>, <string-name><surname>Rahevar</surname> <given-names>M</given-names></string-name>, <string-name><surname>Mewada</surname> <given-names>H</given-names></string-name>, <string-name><surname>Parmar</surname> <given-names>M</given-names></string-name>, <string-name><surname>Patel</surname> <given-names>A</given-names></string-name></person-group>. <article-title>Insights into aerial intelligence: assessing CNN-based algorithms for human action recognition and object detection in diverse environments</article-title>. <source>Multimed Tools Appl</source>. <year>2025</year>;<volume>84</volume>(<issue>16</issue>):<fpage>16481</fpage>&#x2013;<lpage>523</lpage>. doi:<pub-id pub-id-type="doi">10.1007/s11042-024-19611-z</pub-id>.</mixed-citation></ref>
<ref id="ref-20"><label>[20]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Su</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>H</given-names></string-name></person-group>. <article-title>A novel part refinement tandem transformer for human-object interaction detection</article-title>. <source>Sensors</source>. <year>2024</year>;<volume>24</volume>(<issue>13</issue>):<fpage>4278</fpage>. doi:<pub-id pub-id-type="doi">10.3390/s24134278</pub-id>; <pub-id pub-id-type="pmid">39001055</pub-id></mixed-citation></ref>
<ref id="ref-21"><label>[21]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Al-Faris</surname> <given-names>M</given-names></string-name>, <string-name><surname>Chiverton</surname> <given-names>JP</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Ndzi</surname> <given-names>D</given-names></string-name></person-group>. <article-title>Multi-view region-adaptive multi-temporal DMM and RGB action recognition</article-title>. <source>Pattern Anal Appl</source>. <year>2020</year>;<volume>23</volume>(<issue>4</issue>):<fpage>1587</fpage>&#x2013;<lpage>602</lpage>. doi:<pub-id pub-id-type="doi">10.1007/s10044-020-00886-5</pub-id>.</mixed-citation></ref>
<ref id="ref-22"><label>[22]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Lee</surname> <given-names>I</given-names></string-name>, <string-name><surname>Kim</surname> <given-names>D</given-names></string-name>, <string-name><surname>Wee</surname> <given-names>D</given-names></string-name>, <string-name><surname>Lee</surname> <given-names>S</given-names></string-name></person-group>. <article-title>An efficient human instance-guided framework for video action recognition</article-title>. <source>Sensors</source>. <year>2021</year>;<volume>21</volume>(<issue>24</issue>):<fpage>8309</fpage>. doi:<pub-id pub-id-type="doi">10.3390/s21248309</pub-id>; <pub-id pub-id-type="pmid">34960404</pub-id></mixed-citation></ref>
<ref id="ref-23"><label>[23]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Siddiqui</surname> <given-names>N</given-names></string-name>, <string-name><surname>Tirupattur</surname> <given-names>P</given-names></string-name>, <string-name><surname>Shah</surname> <given-names>M</given-names></string-name></person-group>. <article-title>DVANet: disentangling view and action features for multi-view action recognition 2024 [Internet]. [cited 2025 Jan 1]</article-title>. Available from: <ext-link ext-link-type="uri" xlink:href="https://ojs.aaai.org/index.php/AAAI/article/view/28290">https://ojs.aaai.org/index.php/AAAI/article/view/28290</ext-link>.</mixed-citation></ref>
<ref id="ref-24"><label>[24]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Zakka</surname> <given-names>VG</given-names></string-name>, <string-name><surname>Dai</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Manso</surname> <given-names>LJ</given-names></string-name></person-group>. <article-title>Action recognition in real-world ambient assisted living environment</article-title>. <source>Big Data Min Anal</source>. <year>2025</year>;<volume>8</volume>(<issue>4</issue>):<fpage>914</fpage>&#x2013;<lpage>32</lpage>. doi:<pub-id pub-id-type="doi">10.26599/bdma.2025.9020003</pub-id>.</mixed-citation></ref>
<ref id="ref-25"><label>[25]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Baptista</surname> <given-names>R</given-names></string-name>, <string-name><surname>Ghorbel</surname> <given-names>E</given-names></string-name>, <string-name><surname>Papadopoulos</surname> <given-names>K</given-names></string-name>, <string-name><surname>Demisse</surname> <given-names>GG</given-names></string-name>, <string-name><surname>Aouada</surname> <given-names>D</given-names></string-name>, <string-name><surname>Ottersten</surname> <given-names>B</given-names></string-name></person-group>. <article-title>View-invariant action recognition from RGB data via 3D pose estimation</article-title>. In: <conf-name>Proceedings of the ICASSP 2019&#x2014;2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP); 2019 May 12&#x2013;17</conf-name>; <publisher-loc>Brighton, UK</publisher-loc>. p. <fpage>2542</fpage>&#x2013;<lpage>6</lpage>. doi:<pub-id pub-id-type="doi">10.1109/icassp.2019.8682904</pub-id>.</mixed-citation></ref>
<ref id="ref-26"><label>[26]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Cheng</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Cheng</surname> <given-names>J</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Ren</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>J</given-names></string-name></person-group>. <article-title>A dense-sparse complementary network for human action recognition based on RGB and skeleton modalities</article-title>. <source>Expert Syst Appl</source>. <year>2024</year>;<volume>244</volume>(<issue>3</issue>):<fpage>123061</fpage>. doi:<pub-id pub-id-type="doi">10.1016/j.eswa.2023.123061</pub-id>.</mixed-citation></ref>
<ref id="ref-27"><label>[27]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Liu</surname> <given-names>H</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Ren</surname> <given-names>M</given-names></string-name>, <string-name><surname>Hu</surname> <given-names>J</given-names></string-name>, <string-name><surname>Luo</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Hou</surname> <given-names>G</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>Balanced representation learning for long-tailed skeleton-based action recognition</article-title>. <source>Mach Intell Res</source>. <year>2025</year>;<volume>22</volume>(<issue>3</issue>):<fpage>466</fpage>&#x2013;<lpage>83</lpage>. doi:<pub-id pub-id-type="doi">10.1007/s11633-023-1487-8</pub-id>.</mixed-citation></ref>
<ref id="ref-28"><label>[28]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Yang</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Liang</surname> <given-names>G</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>C</given-names></string-name>, <string-name><surname>Wu</surname> <given-names>X</given-names></string-name></person-group>. <article-title>Trunk-branch contrastive network with multi-view deformable aggregation for multi-view action recognition</article-title>. <source>Pattern Recognit</source>. <year>2026</year>;<volume>169</volume>(<issue>10</issue>):<fpage>111923</fpage>. doi:<pub-id pub-id-type="doi">10.1016/j.patcog.2025.111923</pub-id>.</mixed-citation></ref>
<ref id="ref-29"><label>[29]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Das</surname> <given-names>S</given-names></string-name>, <string-name><surname>Dai</surname> <given-names>R</given-names></string-name>, <string-name><surname>Koperski</surname> <given-names>M</given-names></string-name>, <string-name><surname>Minciullo</surname> <given-names>L</given-names></string-name>, <string-name><surname>Garattoni</surname> <given-names>L</given-names></string-name>, <string-name><surname>Bremond</surname> <given-names>F</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>Toyota smarthome: real-world activities of daily living</article-title>. In: <conf-name>Proceedings of the 2019 IEEE/CVF International Conference on Computer Vision (ICCV); 2019 Oct 27&#x2013;Nov 2</conf-name>; <publisher-loc>Seoul, Republic of Korea</publisher-loc>. p. <fpage>833</fpage>&#x2013;<lpage>42</lpage>. doi:<pub-id pub-id-type="doi">10.1109/iccv.2019.00092</pub-id>.</mixed-citation></ref>
<ref id="ref-30"><label>[30]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Kim</surname> <given-names>B</given-names></string-name>, <string-name><surname>Chang</surname> <given-names>HJ</given-names></string-name>, <string-name><surname>Kim</surname> <given-names>J</given-names></string-name>, <string-name><surname>Choi</surname> <given-names>JY</given-names></string-name></person-group>. <article-title>Global-local motion transformer for unsupervised skeleton-based action learning</article-title>. In: <conf-name>Proceedings of the 17th European Conference on Computer Vision&#x2014;ECCV 2022; 2022 Oct 23&#x2013;27</conf-name>; <publisher-loc>Tel Aviv, Israel. Cham, Switzerland</publisher-loc>: <publisher-name>Springer Nature</publisher-name>; <year>2022</year>. p. <fpage>209</fpage>&#x2013;<lpage>25</lpage>. doi:<pub-id pub-id-type="doi">10.1007/978-3-031-19772-7_13</pub-id>.</mixed-citation></ref>
<ref id="ref-31"><label>[31]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Xie</surname> <given-names>C</given-names></string-name>, <string-name><surname>Li</surname> <given-names>C</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>B</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>C</given-names></string-name>, <string-name><surname>Han</surname> <given-names>J</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>J</given-names></string-name></person-group>. <article-title>Memory attention networks for skeleton-based action recognition</article-title>. In: <conf-name>Proceedings of the Twenty-Seventh International Joint Conference on Artificial Intelligence; 2018 Jul 13&#x2013;19</conf-name>; <publisher-loc>Stockholm, Sweden</publisher-loc>. p. <fpage>1639</fpage>&#x2013;<lpage>45</lpage>. doi:<pub-id pub-id-type="doi">10.24963/ijcai.2018/227</pub-id>.</mixed-citation></ref>
<ref id="ref-32"><label>[32]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Xu</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Cheng</surname> <given-names>J</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>L</given-names></string-name>, <string-name><surname>Xia</surname> <given-names>H</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>F</given-names></string-name>, <string-name><surname>Tao</surname> <given-names>D</given-names></string-name></person-group>. <article-title>Ensemble one-dimensional convolution neural networks for skeleton-based action recognition</article-title>. <source>IEEE Signal Process Lett</source>. <year>2018</year>;<volume>25</volume>(<issue>7</issue>):<fpage>1044</fpage>&#x2013;<lpage>8</lpage>. doi:<pub-id pub-id-type="doi">10.1109/LSP.2018.2841649</pub-id>.</mixed-citation></ref>
<ref id="ref-33"><label>[33]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Li</surname> <given-names>C</given-names></string-name>, <string-name><surname>Zhong</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Xie</surname> <given-names>D</given-names></string-name>, <string-name><surname>Pu</surname> <given-names>S</given-names></string-name></person-group>. <article-title>Skeleton-based action recognition with convolutional neural networks</article-title>. In: <conf-name>Proceedings of the 2017 IEEE International Conference on Multimedia &#x0026; Expo Workshops (ICMEW); 2017 Jul 10&#x2013;14</conf-name>; <publisher-loc>Hong Kong, China</publisher-loc>. p. <fpage>597</fpage>&#x2013;<lpage>600</lpage>. doi:<pub-id pub-id-type="doi">10.1109/ICMEW.2017.8026285</pub-id>.</mixed-citation></ref>
<ref id="ref-34"><label>[34]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Yan</surname> <given-names>S</given-names></string-name>, <string-name><surname>Xiong</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Lin</surname> <given-names>D</given-names></string-name></person-group>. <article-title>Spatial temporal graph convolutional networks for skeleton-based action recognition</article-title>. <source>Proc AAAI Conf Artif Intell</source>. <year>2018</year>;<volume>32</volume>(<issue>1</issue>):<fpage>1</fpage>&#x2013;<lpage>9</lpage>. doi:<pub-id pub-id-type="doi">10.1609/aaai.v32i1.12328</pub-id>.</mixed-citation></ref>
<ref id="ref-35"><label>[35]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Tayyab</surname> <given-names>M</given-names></string-name>, <string-name><surname>Jalal</surname> <given-names>A</given-names></string-name></person-group>. <article-title>Disabled rehabilitation monitoring and patients healthcare recognition using machine learning</article-title>. In: <conf-name>Proceedings of the 2025 6th International Conference on Advancements in Computational Sciences (ICACS); 2025 Feb 18&#x2013;19</conf-name>; <publisher-loc>Lahore, Pakistan</publisher-loc>. p. <fpage>1</fpage>&#x2013;<lpage>7</lpage>. doi:<pub-id pub-id-type="doi">10.1109/icacs64902.2025.10937871</pub-id>.</mixed-citation></ref>
<ref id="ref-36"><label>[36]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Almushyti</surname> <given-names>M</given-names></string-name>, <string-name><surname>Li</surname> <given-names>FWB</given-names></string-name></person-group>. <article-title>Distillation of human-object interaction contexts for action recognition</article-title>. <source>Comput Animat Virtual Worlds</source>. <year>2022</year>;<volume>33</volume>(<issue>5</issue>):<fpage>e2107</fpage>. doi:<pub-id pub-id-type="doi">10.1002/cav.2107</pub-id>.</mixed-citation></ref>
<ref id="ref-37"><label>[37]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Qi</surname> <given-names>S</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>W</given-names></string-name>, <string-name><surname>Jia</surname> <given-names>B</given-names></string-name>, <string-name><surname>Shen</surname> <given-names>J</given-names></string-name>, <string-name><surname>Zhu</surname> <given-names>SC</given-names></string-name></person-group>. <article-title>Learning human-object interactions by graph parsing neural networks</article-title>. In: <conf-name>Proceedings of the 15th European Conference on Computer Vision&#x2014;ECCV 2018; 2018 Sep 8&#x2013;14</conf-name>; <publisher-loc>Munich, Germany. Cham, Switzerland</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>; <year>2018</year>. p. <fpage>407</fpage>&#x2013;<lpage>23</lpage>. doi:<pub-id pub-id-type="doi">10.1007/978-3-030-01240-3_25</pub-id>.</mixed-citation></ref>
</ref-list>
</back></article>













