<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1 20151215//EN" "http://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xml:lang="en" article-type="research-article" dtd-version="1.1">
<front>
<journal-meta>
<journal-id journal-id-type="pmc">CMC</journal-id>
<journal-id journal-id-type="nlm-ta">CMC</journal-id>
<journal-id journal-id-type="publisher-id">CMC</journal-id>
<journal-title-group>
<journal-title>Computers, Materials &#x0026; Continua</journal-title>
</journal-title-group>
<issn pub-type="epub">1546-2226</issn>
<issn pub-type="ppub">1546-2218</issn>
<publisher>
<publisher-name>Tech Science Press</publisher-name>
<publisher-loc>USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">61396</article-id>
<article-id pub-id-type="doi">10.32604/cmc.2025.061396</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Article</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Video Action Recognition Method Based on Personalized Federated Learning and Spatiotemporal Features</article-title>
<alt-title alt-title-type="left-running-head">Video Action Recognition Method Based on Personalized Federated Learning and Spatiotemporal Features</alt-title>
<alt-title alt-title-type="right-running-head">Video Action Recognition Method Based on Personalized Federated Learning and Spatiotemporal Features</alt-title>
</title-group>
<contrib-group>
<contrib id="author-1" contrib-type="author">
<name name-style="western"><surname>Wu</surname><given-names>Rongsen</given-names></name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
<contrib id="author-2" contrib-type="author">
<name name-style="western"><surname>Xu</surname><given-names>Jie</given-names></name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
<contrib id="author-3" contrib-type="author">
<name name-style="western"><surname>Zhang</surname><given-names>Yuhang</given-names></name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
<contrib id="author-4" contrib-type="author" corresp="yes">
<name name-style="western"><surname>Zhao</surname><given-names>Changming</given-names></name><xref ref-type="aff" rid="aff-2">2</xref><xref rid="cor1" ref-type="corresp">&#x002A;</xref><email>zcm84@cuit.edu.cn</email></contrib>
<contrib id="author-5" contrib-type="author">
<name name-style="western"><surname>Xie</surname><given-names>Yiweng</given-names></name><xref ref-type="aff" rid="aff-3">3</xref></contrib>
<contrib id="author-6" contrib-type="author">
<name name-style="western"><surname>Wu</surname><given-names>Zelei</given-names></name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
<contrib id="author-7" contrib-type="author">
<name name-style="western"><surname>Li</surname><given-names>Yunji</given-names></name><xref ref-type="aff" rid="aff-2">2</xref></contrib>
<contrib id="author-8" contrib-type="author">
<name name-style="western"><surname>Guo</surname><given-names>Jinhong</given-names></name><xref ref-type="aff" rid="aff-4">4</xref></contrib>
<contrib id="author-9" contrib-type="author">
<name name-style="western"><surname>Tang</surname><given-names>Shiyang</given-names></name><xref ref-type="aff" rid="aff-5">5</xref><xref ref-type="aff" rid="aff-6">6</xref></contrib>
<aff id="aff-1"><label>1</label><institution>School of Information and Communication Engineering, University of Electronic Science and Technology of China</institution>, <addr-line>Chengdu, 611731</addr-line>, <country>China</country></aff>
<aff id="aff-2"><label>2</label><institution>School of Computer Science, Chengdu University of Information Technology</institution>, <addr-line>Chengdu, 610225</addr-line>, <country>China</country></aff>
<aff id="aff-3"><label>3</label><institution>Shanghai Key Lab of Intelligent Information Processing, School of CS, Fudan University</institution>, <addr-line>Shanghai, 200433</addr-line>, <country>China</country></aff>
<aff id="aff-4"><label>4</label><institution>School of Sensing Science and Engineering, Shanghai Jiao Tong University</institution>, <addr-line>Shanghai, 200240</addr-line>, <country>China</country></aff>
<aff id="aff-5"><label>5</label><institution>School of Mechanical and Manufacturing Engineering, University of New South Wales</institution>, <addr-line>Sydney, 2052</addr-line>, <country>Australia</country></aff>
<aff id="aff-6"><label>6</label><institution>School of Electronics and Computer Science, University of Southampton</institution>, <addr-line>Southampton, SO17 1BJ</addr-line>, <country>UK</country></aff>
</contrib-group>
<author-notes>
<corresp id="cor1"><label>&#x002A;</label>Corresponding Author: Changming Zhao. Email: <email>zcm84@cuit.edu.cn</email></corresp>
</author-notes>
<pub-date date-type="collection" publication-format="electronic">
<year>2025</year>
</pub-date>
<pub-date date-type="pub" publication-format="electronic">
<day>19</day><month>05</month><year>2025</year>
</pub-date>
<volume>83</volume>
<issue>3</issue>
<fpage>4961</fpage>
<lpage>4978</lpage>
<history>
<date date-type="received">
<day>29</day>
<month>12</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>07</day>
<month>03</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>&#x00A9; 2025 The Authors.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Published by Tech Science Press.</copyright-holder>
<license xlink:href="https://creativecommons.org/licenses/by/4.0/">
<license-p>This work is licensed under a <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="TSP_CMC_61396.pdf"></self-uri>
<abstract>
<p>With the rapid development of artificial intelligence and Internet of Things technologies, video action recognition technology is widely applied in various scenarios, such as personal life and industrial production. However, while enjoying the convenience brought by this technology, it is crucial to effectively protect the privacy of users&#x2019; video data. Therefore, this paper proposes a video action recognition method based on personalized federated learning and spatiotemporal features. Under the framework of federated learning, a video action recognition method leveraging spatiotemporal features is designed. For the local spatiotemporal features of the video, a new differential information extraction scheme is proposed to extract differential features with a single RGB frame as the center, and a spatial-temporal module based on local information is designed to improve the effectiveness of local feature extraction; for the global temporal features, a method of extracting action rhythm features using differential technology is proposed, and a time module based on global information is designed. Different translational strides are used in the module to obtain bidirectional differential features under different action rhythms. Additionally, to address user data privacy issues, the method divides model parameters into local private parameters and public parameters based on the structure of the video action recognition model. This approach enhances model training performance and ensures the security of video data. The experimental results show that under personalized federated learning conditions, an average accuracy of 97.792% was achieved on the UCF-101 dataset, which is non-independent and identically distributed (non-IID). This research provides technical support for privacy protection in video action recognition.</p>
</abstract>
<kwd-group kwd-group-type="author">
<kwd>Video action recognition</kwd>
<kwd>personalized federated learning</kwd>
<kwd>spatiotemporal features</kwd>
<kwd>data privacy</kwd>
</kwd-group>
<funding-group>
<award-group id="awg1">
<funding-source>National Natural Science Foundation of China</funding-source>
<award-id>62071098</award-id>
</award-group>
<award-group id="awg2">
<funding-source>Sichuan Science and Technology Program</funding-source>
<award-id>2022YFG0319</award-id>
<award-id>2023YFG0301</award-id>
<award-id>2023YFG0018</award-id>
</award-group>
</funding-group>
</article-meta>
</front>
<body>
<sec id="s1">
<label>1</label>
<title>Introduction</title>
<p>In recent years, video action recognition technology has made significant progress and is widely applied in fields such as intelligent surveillance, human-computer interaction, and sports analysis. Due to breakthroughs in deep learning algorithms, especially the widespread use of models such as convolutional neural networks (CNNs) and long short-term memory (LSTM) networks, the accuracy and robustness of video action recognition have been greatly improved [<xref ref-type="bibr" rid="ref-1">1</xref>,<xref ref-type="bibr" rid="ref-2">2</xref>]. However, with the further development of these technologies, data privacy and security issues related to video action recognition have become increasingly prominent, emerging as key factors that constrain its broader adoption [<xref ref-type="bibr" rid="ref-3">3</xref>,<xref ref-type="bibr" rid="ref-4">4</xref>]. For instance, in a home environment, a surveillance camera must continuously record images of household members, which inherently involves capturing sensitive data related to the user&#x2019;s daily life and privacy. Therefore, we need to consider how to ensure user privacy in data collection and processing. On the one hand, we must ensure that data processing and application tasks are performed locally, with no possibility of uploading the data to a server. This requires us to consider security in the design of data processing systems to avoid data leaks and abuse. On the other hand, we need to prevent the remote transmission and storage of data from introducing security risks.</p>
<p>Federated learning algorithms can effectively solve the privacy issue of user data in video monitoring. Under the framework of federated learning, the model can use scarce data locally to complete training, and the models of each user are aggregated through different methods and strategies on the central server. The data do not need to be transmitted to the central server, ensuring the security of data privacy. Utilizing a federated learning framework for video action recognition can significantly enhance the security of user privacy [<xref ref-type="bibr" rid="ref-5">5</xref>&#x2013;<xref ref-type="bibr" rid="ref-9">9</xref>].</p>
<p>This paper integrates federated learning, fully considering user data security and privacy, and proposes a video action recognition method based on personalized federated learning and spatiotemporal features. Under the framework of federated learning, this method designs a video action model that extracts spatiotemporal features using differential methods. The model embeds a spatiotemporal module based on local information and a temporal module based on global information within a residual network structure. These two modules respectively use RGB differences and feature differences to extract local spatiotemporal features and global temporal features, thereby improving model efficiency while ensuring recognition performance. The main contributions of this paper are as follows:
<list list-type="simple">
<list-item><label>(1)</label><p>To address the needs of privacy protection and data security in video surveillance, this paper combines federated learning with video action recognition models to propose a method based on personalized federated learning and spatiotemporal features.</p></list-item>
<list-item><label>(2)</label><p>Regarding the spatiotemporal features of video segments, this paper proposes a spatial-temporal module based on local information, which uses a new differential information extraction method to provide complementary spatial static information with temporal features.</p></list-item>
<list-item><label>(3)</label><p>Regarding the time-based features of complete videos, this paper proposes a time module based on global information that utilizes differential information of local features to extract action rhythm features, thereby improving the extraction effect of time-based features.</p></list-item>
</list></p>
</sec>
<sec id="s2">
<label>2</label>
<title>Related Work</title>
<p>In the field of personalized federated learning for video action recognition, scholars have proposed various innovative solutions. Zhao et al. [<xref ref-type="bibr" rid="ref-10">10</xref>] proposed an activity recognition system that uses semi-supervised federated learning, where clients use unlabeled local data to learn general representations through long short-term memory autoencoders, and the cloud server uses labeled data with a Softmax classifier for supervised learning. Experimental results show that their proposed system achieves higher accuracy than centralized systems and semi-supervised federated learning with data augmentation, and its accuracy is comparable to that of supervised federated learning systems. Shome et al. [<xref ref-type="bibr" rid="ref-11">11</xref>] proposed a federated learning framework for facial expression recognition that uses a small amount of labeled private facial expression data to train local models in each training round and aggregates all local model weights on the central server to obtain the global optimal model. Rehman et al. [<xref ref-type="bibr" rid="ref-12">12</xref>] proposed FedVSSL, a general federated learning (FL) framework based on stochastic weight averaging (SWA) for pre-training video self-supervised learning (SSL) methods in FL. This method shows strong competitiveness in action recognition tasks compared to FedAvg and centralized video SSL. Doshi et al. [<xref ref-type="bibr" rid="ref-13">13</xref>] proposed an effective federated learning solution based on 2D CNN models for detecting distracted driver activities. This solution trains the detection model in a distributed manner while protecting privacy and reducing data communication. Tu et al. [<xref ref-type="bibr" rid="ref-14">14</xref>] proposed FedFSLAR, a federated few-shot learning framework that collaboratively learns classification models from multiple FL clients using a small number of labeled video samples to recognize unknown actions.</p>
</sec>
<sec id="s3">
<label>3</label>
<title>Video Action Recognition Method Based on Personalized Federated Learning and Spatiotemporal Features</title>
<p>The proposed video action recognition method based on personalized federated learning and spatiotemporal features incorporates the characteristics of the video action recognition model to divide the parameters into private and public parameters. First, the complete process of the personalized federated learning method and the method for constructing non-independent and identically distributed (non-IID) video action recognition datasets are given. Second, this section introduces the specific structure of the video action recognition model.</p>
<sec id="s3_1">
<label>3.1</label>
<title>Overview</title>
<p>The proposed personalized federated learning scheme combines the characteristics of video action recognition models to divide private parameters and public parameters. Meanwhile, federated learning allows for training directly on edge devices, eliminating the need to transmit raw data from client devices to a central server, thereby reducing the risk of data leakage [<xref ref-type="bibr" rid="ref-15">15</xref>]. The video action recognition model is divided into three parts: input, local feature extraction, and global feature extraction. The following description takes an overall segment count of <inline-formula id="ieqn-1"><mml:math id="mml-ieqn-1"><mml:mrow><mml:mi mathvariant="italic">n</mml:mi></mml:mrow></mml:math></inline-formula> &#x003D; 3 as an example; <inline-formula id="ieqn-2"><mml:math id="mml-ieqn-2"><mml:mrow><mml:mi mathvariant="italic">n</mml:mi></mml:mrow></mml:math></inline-formula> &#x003D; 8 and <inline-formula id="ieqn-3"><mml:math id="mml-ieqn-3"><mml:mrow><mml:mi mathvariant="italic">n</mml:mi></mml:mrow></mml:math></inline-formula> &#x003D; 16 are used in the subsequent experiments. In the input stage, video data are divided into three segments; for each segment, one RGB frame is sampled first, then the two frames before and the two frames after the selected frame are taken, and the difference between each of them and the selected frame is calculated to obtain four RGB differences. In the local feature extraction stage, the differences are stacked and passed through a pooling layer to obtain initial local difference information, which is then input together with the video frame into the second stage network of ResNet for spatial feature extraction. Another copy of the initial difference information is also input into the second stage network of ResNet to extract temporal features, and the spatial features are added to the temporal features to obtain the final local spatiotemporal features of each segment.
In the global feature extraction stage, each segment&#x2019;s local features are compressed in the channel dimension and a bidirectional global difference is obtained by translating each segment&#x2019;s features. Through a convolutional neural network, global features are obtained, corresponding to the last three stages of ResNet with different stacking layers. Three rounds of global feature extraction are performed in total. Finally, the global features are input into a classifier to obtain the final video action recognition results.</p>
<p>When performing local feature extraction, the model focuses more on the static information of video data, including some key image information such as human body, color, and objects, so it is highly dependent on training data. Local features, serving as private parameters for users, are retained on the users&#x2019; local devices and are not uploaded to the central server. This ensures that users&#x2019; original video data and personalized features never leave their devices, thereby greatly enhancing privacy protection. Conversely, global feature extraction focuses more on dynamic information and extracts features that change with time. In personalized federated learning, local feature extraction is more suitable for training and storage at the user&#x2019;s local device, corresponding to the private parameters of the local model, whereas global features are better suited for aggregation at a central server, resulting in shared parameters that are saved as the public model. <xref ref-type="fig" rid="fig-1">Fig. 1</xref> shows the personalized federated learning-based video action recognition method.</p>
<fig id="fig-1">
<label>Figure 1</label>
<caption>
<title>Diagram of the video action recognition framework based on personalized federated learning and spatiotemporal features</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_61396-fig-1.tif"/>
</fig>
<p>This paper combines the video action recognition model with the personalized federated learning mechanism. For the model in the four feature extraction stages, the parameters of the local feature extraction stage are used as personalized models for each user, while the parameters of the global feature extraction stage are used as global shared parameters. In <xref ref-type="fig" rid="fig-1">Fig. 1</xref>, the blue rectangle represents the private parameters of user A, the red rectangle represents the private parameters of user B, and the green diagonal rectangle represents the public parameters downloaded from the central server after step &#x2460;. After backpropagation and parameter update in local training (step &#x2461;), the entire model undergoes different changes for each user. Then, the updated public parameters are uploaded to the central server through step &#x2462; and the parameters are aggregated again by the central server in step &#x2463; to obtain new public parameters. The process is repeated for a new round of federated learning communication.</p>
<sec id="s3_1_1">
<label>3.1.1</label>
<title>Spatial-Temporal Module Based on Local Information</title>
<p>In recent years, some efficient methods for obtaining temporal information have been proposed. Among them, RGB difference and temporal shift methods are both simple and effective. RGB difference can simply obtain boundary and action information by performing a difference between RGB frames, whereas a temporal shift shifts the feature map in the time dimension, allowing features to overlap in time and extract dynamic information during further feature extraction.</p>
<p>The local spatiotemporal module proposed in this paper uses a convolutional neural network to extract features from RGB frames, obtaining local spatial features, and then extracts supplementary temporal information from the difference between multiple frames of RGB over a period of time to obtain the local spatiotemporal information of the current video segment. This module addresses the issues of traditional methods relying on complex data preprocessing and time-consuming processes when extracting short-term temporal information from local regions. It achieves a more efficient way to obtain temporal information and enhances model performance. The structure of the local module is shown in <xref ref-type="fig" rid="fig-2">Fig. 2</xref> and the entire module can be divided into two branches. For the input frame <inline-formula id="ieqn-4"><mml:math id="mml-ieqn-4"><mml:mrow><mml:msub><mml:mi mathvariant="italic">I</mml:mi><mml:mi mathvariant="italic">t</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> at time <inline-formula id="ieqn-5"><mml:math id="mml-ieqn-5"><mml:mrow><mml:mi mathvariant="italic">t</mml:mi></mml:mrow></mml:math></inline-formula>, the first branch directly inputs the raw input data into the convolution layer to extract features, obtaining the static information of the current video frame and the original spatial features of the current segment. The second branch obtains data from two frames before and after <inline-formula id="ieqn-6"><mml:math id="mml-ieqn-6"><mml:mrow><mml:mi mathvariant="italic">t</mml:mi></mml:mrow></mml:math></inline-formula>, performs a difference operation on a total of five frames, smooths the feature in the channel dimension, passes through an average pooling layer in the planar dimension, and then adds the pooled feature to input into the convolutional network. At this time, it can obtain the supplementary temporal features of the current video segment. 
These features are divided into two paths: one directly inputs into the second-stage network of ResNet to extract features and the other combines with the static feature and upsamples according to the feature shape of the first branch, adding it to the first branch feature and inputting into ResNet. Finally, the two feature maps are re-scaled and added to obtain the final local spatiotemporal feature.</p>
<fig id="fig-2">
<label>Figure 2</label>
<caption>
<title>Spatial-temporal module based on local information and time module based on global information</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_61396-fig-2.tif"/>
</fig>
<p>Theoretically, the neighboring frames of the input frame <inline-formula id="ieqn-7"><mml:math id="mml-ieqn-7"><mml:mrow><mml:msub><mml:mi mathvariant="italic">I</mml:mi><mml:mi mathvariant="italic">t</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> at time <inline-formula id="ieqn-8"><mml:math id="mml-ieqn-8"><mml:mi>t</mml:mi></mml:math></inline-formula> are <inline-formula id="ieqn-9"><mml:math id="mml-ieqn-9"><mml:mrow><mml:msub><mml:mi mathvariant="italic">I</mml:mi><mml:mrow><mml:mi mathvariant="italic">t</mml:mi><mml:mo mathvariant="italic">&#x2212;</mml:mo><mml:mrow><mml:mtext>2</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula>, <inline-formula id="ieqn-10"><mml:math id="mml-ieqn-10"><mml:mrow><mml:msub><mml:mi mathvariant="italic">I</mml:mi><mml:mrow><mml:mi mathvariant="italic">t</mml:mi><mml:mo mathvariant="italic">&#x2212;</mml:mo><mml:mrow><mml:mtext>1</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula>, <inline-formula id="ieqn-11"><mml:math id="mml-ieqn-11"><mml:mrow><mml:msub><mml:mi mathvariant="italic">I</mml:mi><mml:mrow><mml:mi mathvariant="italic">t</mml:mi><mml:mo mathvariant="italic">+</mml:mo><mml:mrow><mml:mtext>1</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula>, and <inline-formula id="ieqn-12"><mml:math id="mml-ieqn-12"><mml:mrow><mml:msub><mml:mi mathvariant="italic">I</mml:mi><mml:mrow><mml:mi mathvariant="italic">t</mml:mi><mml:mo mathvariant="italic">+</mml:mo><mml:mrow><mml:mtext>2</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula>. 
The difference between these 4 frames and <inline-formula id="ieqn-13"><mml:math id="mml-ieqn-13"><mml:mrow><mml:msub><mml:mi mathvariant="italic">I</mml:mi><mml:mi mathvariant="italic">t</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is taken, with <inline-formula id="ieqn-14"><mml:math id="mml-ieqn-14"><mml:mrow><mml:msub><mml:mi mathvariant="italic">F</mml:mi><mml:mrow><mml:mi mathvariant="italic">d</mml:mi><mml:mi mathvariant="italic">i</mml:mi><mml:mi mathvariant="italic">f</mml:mi><mml:mi mathvariant="italic">f</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> representing the differential features and <inline-formula id="ieqn-15"><mml:math id="mml-ieqn-15"><mml:mrow><mml:msub><mml:mi mathvariant="italic">F</mml:mi><mml:mrow><mml:mi mathvariant="italic">R</mml:mi><mml:mi mathvariant="italic">G</mml:mi><mml:mi mathvariant="italic">B</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> representing the static features. The calculation process of the two paths can be represented by <xref ref-type="disp-formula" rid="eqn-1">Eqs. (1)</xref> and <xref ref-type="disp-formula" rid="eqn-2">(2)</xref>:
<disp-formula id="eqn-1"><label>(1)</label><mml:math id="mml-eqn-1" display="block"><mml:mtable columnalign="right left right left right left right left right left right left" rowspacing="3pt" columnspacing="0em 2em 0em 2em 0em 2em 0em 2em 0em 2em 0em" displaystyle="true"><mml:mtr><mml:mtd /><mml:mtd><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mtext>Conv</mml:mtext></mml:mrow><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:mrow></mml:munder><mml:mrow><mml:mtext>Avg</mml:mtext></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="eqn-2"><label>(2)</label><mml:math id="mml-eqn-2" display="block"><mml:mtable columnalign="right left right left right left right left right left right left" rowspacing="3pt" columnspacing="0em 2em 0em 2em 0em 2em 0em 2em 0em 2em 0em" displaystyle="true"><mml:mtr><mml:mtd /><mml:mtd><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>R</mml:mi><mml:mi>G</mml:mi><mml:mi>B</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mtext>Conv</mml:mtext></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula></p>
<p>In the equations, <inline-formula id="ieqn-16"><mml:math id="mml-ieqn-16"><mml:mrow><mml:mtext>Avg</mml:mtext></mml:mrow></mml:math></inline-formula> represents the average pooling layer mapping, and <inline-formula id="ieqn-17"><mml:math id="mml-ieqn-17"><mml:mrow><mml:mtext>Conv</mml:mtext></mml:mrow></mml:math></inline-formula> represents the convolutional layer mapping. Finally, <inline-formula id="ieqn-18"><mml:math id="mml-ieqn-18"><mml:mrow><mml:msub><mml:mi mathvariant="italic">F</mml:mi><mml:mi mathvariant="italic">L</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> represents the features of the local module, which can be expressed by <xref ref-type="disp-formula" rid="eqn-3">Eq. (3)</xref>:
<disp-formula id="eqn-3"><label>(3)</label><mml:math id="mml-eqn-3" display="block"><mml:msub><mml:mi>F</mml:mi><mml:mi>L</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>N</mml:mi><mml:mi>e</mml:mi><mml:mi>t</mml:mi><mml:mo stretchy="false">[</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>R</mml:mi><mml:mi>G</mml:mi><mml:mi>B</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>U</mml:mi><mml:mi>p</mml:mi><mml:mi>S</mml:mi><mml:mi>a</mml:mi><mml:mi>m</mml:mi><mml:mi>p</mml:mi><mml:mi>l</mml:mi><mml:mi>e</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">]</mml:mo><mml:mo>+</mml:mo><mml:mi>U</mml:mi><mml:mi>p</mml:mi><mml:mi>S</mml:mi><mml:mi>a</mml:mi><mml:mi>m</mml:mi><mml:mi>p</mml:mi><mml:mi>l</mml:mi><mml:mi>e</mml:mi><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>N</mml:mi><mml:mi>e</mml:mi><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">]</mml:mo></mml:math></disp-formula></p>
<p>In the equation, <inline-formula id="ieqn-19"><mml:math id="mml-ieqn-19"><mml:mi>U</mml:mi><mml:mi>p</mml:mi><mml:mi>S</mml:mi><mml:mi>a</mml:mi><mml:mi>m</mml:mi><mml:mi>p</mml:mi><mml:mi>l</mml:mi><mml:mi>e</mml:mi></mml:math></inline-formula> represents the upsampling of features, and <inline-formula id="ieqn-20"><mml:math id="mml-ieqn-20"><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>N</mml:mi><mml:mi>e</mml:mi><mml:mi>t</mml:mi></mml:math></inline-formula> represents the residual network mapping.</p>
</sec>
<sec id="s3_1_2">
<label>3.1.2</label>
<title>Time Module Based on Global Information</title>
<p>After obtaining the local spatiotemporal features of video segments, it is necessary to further acquire the temporal features between segments. Both local and global temporal features are important for action recognition. Some video actions may occur in a few moments but have a fixed order. For such actions, it is necessary to extract local spatiotemporal features and then let these features interact across the entire temporal dimension of the video. Other actions are slow and continuous, requiring the model to grasp the data features of each stage.</p>
<p>For this reason, this paper further proposes a time module based on global information, which uses feature differencing to extract action rhythm information. The input of the module is the local spatiotemporal features of each segment. For features of different segments, differential interaction can be performed through fixed time rules to extract the time features of fixed action rhythms. In the model proposed in this paper, different time intervals essentially represent different action rhythms.</p>
<p><xref ref-type="fig" rid="fig-2">Fig. 2</xref> illustrates the overall structure of the time module based on global information. For the local feature <inline-formula id="ieqn-21"><mml:math id="mml-ieqn-21"><mml:mrow><mml:msub><mml:mi mathvariant="italic">F</mml:mi><mml:mi mathvariant="italic">n</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> of the nth segment, a backup is first saved and directly connected to the lower-level network, which is theoretically similar to a residual network, preventing gradient and degradation problems and accelerating propagation. Meanwhile, the original frame-level features can also be retained in the current module. Secondly, the original <inline-formula id="ieqn-22"><mml:math id="mml-ieqn-22"><mml:mrow><mml:msub><mml:mi mathvariant="italic">F</mml:mi><mml:mi mathvariant="italic">n</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is input into a convolutional network to achieve compression in the channel dimension, smoothing the features. This is because there is a large gap between the features of different segments and direct differencing operations may introduce a significant amount of noise, disrupting the original spatiotemporal features. Smoothing the features in the channel dimension before differencing makes the differencing features more effective. The smoothed features are also backed up for later differencing calculations. Another copy is input into the convolutional layer for feature extraction and then the difference is calculated with the backup features of other segments to obtain the difference information of different segment features. Different features participate in differencing interactions under different translation strides.</p>
<p>In existing video action recognition models based on action rhythm features, different action rhythm features are often extracted by sampling at different data frequencies, and different input sizes require separate network channels for feature extraction, greatly increasing the model parameter count and training time cost. The approach proposed in this paper for extracting action rhythm features directly implements differential feature extraction at different intervals within existing local features through different translation strides, thereby controlling the model size while improving recognition performance.</p>
<p>For the extraction of action rhythm, as shown in the feature vector at the top of <xref ref-type="fig" rid="fig-2">Fig. 2</xref>, different features of interaction under different displacement step sizes are marked on the vector. When the step size is small, action changes within a relatively short period of time can be obtained, which is suitable for extracting fast-paced action features. Similarly, when the step size is large, it captures slow-paced action changes. In this paper, the method of calculating differences is still used to obtain the temporal features between segments. This module addresses the issue of efficiently extracting video action rhythm features while reducing noise interference by smoothing the features before differencing.</p>
<p>In the specific implementation process, for the data feature <inline-formula id="ieqn-23"><mml:math id="mml-ieqn-23"><mml:mrow><mml:msub><mml:mi mathvariant="italic">F</mml:mi><mml:mi mathvariant="italic">t</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>, displaced features are obtained through bidirectional translation. The diagrams below <xref ref-type="fig" rid="fig-2">Fig. 2</xref> illustrate displacement step sizes of 1, 2, and 3. On this basis, features beyond the boundaries are removed and the blank features are filled in to obtain three feature vectors that are displaced in the time dimension. Then, subtracting the three vectors yields differential information for different time spans. Blank features appearing after displacement are directly filled with null values.</p>
<p>Using the above method, bidirectional differential features can be obtained after bidirectional translation. Then, the features are divided into three paths: the first path passes through pooling layers, convolutional layers, and upsampling layers; the second path passes through convolutional layers only; and the third path directly transmits the original features downward. The outputs of the three paths are added in the next layer. This approach can further enhance the robustness of the time module, making the smoothing operation on different segment features more effective. Subsequently, the features are fused deeply again through convolutional layers and activation layers, and the fused bidirectional features are added together to obtain the bidirectional differential features of the current video segment. Afterwards, the differential features are multiplied element-wise with the original features, which is equivalent to treating the differential features as attention parameters of the original features. Attention mechanisms are often more effective at higher levels of network structure, so this paper applies them to the time module. Finally, the segment features with attention mechanisms are added to the original features to obtain the final features of the time module.</p>
<p>Using <italic>D</italic> to represent the differential function, the differential calculation process can be expressed by <xref ref-type="disp-formula" rid="eqn-4">Eq. (4)</xref>:
<disp-formula id="eqn-4"><label>(4)</label><mml:math id="mml-eqn-4" display="block"><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>n</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mi>n</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula></p>
<p>Here, <italic>F</italic> represents the smoothed features, and <inline-formula id="ieqn-24"><mml:math id="mml-ieqn-24"><mml:mrow><mml:msub><mml:mi mathvariant="italic">D</mml:mi><mml:mrow><mml:mi mathvariant="italic">n</mml:mi><mml:mo mathvariant="italic">,</mml:mo><mml:mi mathvariant="italic">n</mml:mi><mml:mo mathvariant="italic">&#x2212;</mml:mo><mml:mrow><mml:mtext>1</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> represents the local feature difference between segments <inline-formula id="ieqn-25"><mml:math id="mml-ieqn-25"><mml:mi>n</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:math></inline-formula> and <inline-formula id="ieqn-26"><mml:math id="mml-ieqn-26"><mml:mi>n</mml:mi></mml:math></inline-formula>.</p>
<p>Next, using <italic>H</italic> to represent the merged features of the three paths, and <inline-formula id="ieqn-27"><mml:math id="mml-ieqn-27"><mml:mrow><mml:msup><mml:mi mathvariant="italic">H</mml:mi><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> to represent the fused features, the calculation process of the unidirectional features is shown in <xref ref-type="disp-formula" rid="eqn-5">Eqs. (5)</xref> and <xref ref-type="disp-formula" rid="eqn-6">(6)</xref>:
<disp-formula id="eqn-5"><label>(5)</label><mml:math id="mml-eqn-5" display="block"><mml:mtable columnalign="right left right left right left right left right left right left" rowspacing="3pt" columnspacing="0em 2em 0em 2em 0em 2em 0em 2em 0em 2em 0em" displaystyle="true"><mml:mtr><mml:mtd /><mml:mtd><mml:msub><mml:mi>H</mml:mi><mml:mrow><mml:mi>n</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>n</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>n</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi><mml:mo stretchy="false">[</mml:mo><mml:mi>U</mml:mi><mml:mi>p</mml:mi><mml:mi>S</mml:mi><mml:mi>a</mml:mi><mml:mi>m</mml:mi><mml:mi>p</mml:mi><mml:mi>l</mml:mi><mml:mi>e</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>n</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">]</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="eqn-6"><label>(6)</label><mml:math id="mml-eqn-6" display="block"><mml:mtable columnalign="right left right left right left right left right left right left" rowspacing="3pt" columnspacing="0em 2em 0em 2em 0em 2em 0em 2em 0em 2em 0em" displaystyle="true"><mml:mtr><mml:mtd /><mml:mtd><mml:msubsup><mml:mi>H</mml:mi><mml:mrow><mml:mi>n</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mi>S</mml:mi><mml:mi>i</mml:mi><mml:mi>g</mml:mi><mml:mi>m</mml:mi><mml:mi>o</mml:mi><mml:mi>i</mml:mi><mml:mi>d</mml:mi><mml:mo stretchy="false">[</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>H</mml:mi><mml:mrow><mml:mi>n</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">]</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula></p>
<p>Here, <inline-formula id="ieqn-28"><mml:math id="mml-ieqn-28"><mml:mi>U</mml:mi><mml:mi>p</mml:mi><mml:mi>S</mml:mi><mml:mi>a</mml:mi><mml:mi>m</mml:mi><mml:mi>p</mml:mi><mml:mi>l</mml:mi><mml:mi>e</mml:mi></mml:math></inline-formula> represents upsampling; the upsampling function is used again in the global module to unify the size of the three path features. <inline-formula id="ieqn-29"><mml:math id="mml-ieqn-29"><mml:mi>S</mml:mi><mml:mi>i</mml:mi><mml:mi>g</mml:mi><mml:mi>m</mml:mi><mml:mi>o</mml:mi><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:math></inline-formula> is the activation function used in this layer of the temporal module. Finally, the final features output by the temporal module, denoted <italic>F</italic>&#x2032;, are computed as shown in <xref ref-type="disp-formula" rid="eqn-7">Eq. (7)</xref>:
<disp-formula id="eqn-7"><label>(7)</label><mml:math id="mml-eqn-7" display="block"><mml:msubsup><mml:mi>F</mml:mi><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>F</mml:mi><mml:mo>&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mi>n</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mrow><mml:mover><mml:mi>F</mml:mi><mml:mo>&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mi>n</mml:mi></mml:msub><mml:mo>&#x2299;</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mn>2</mml:mn></mml:mfrac><mml:mo stretchy="false">[</mml:mo><mml:msubsup><mml:mi>H</mml:mi><mml:mrow><mml:mi>n</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup><mml:mo>+</mml:mo><mml:msubsup><mml:mi>H</mml:mi><mml:mrow><mml:mi>n</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup><mml:mo stretchy="false">]</mml:mo></mml:math></disp-formula></p>
<p>Here, <inline-formula id="ieqn-30"><mml:math id="mml-ieqn-30"><mml:msub><mml:mrow><mml:mover><mml:mi>F</mml:mi><mml:mo>&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mi>n</mml:mi></mml:msub></mml:math></inline-formula> represents the original local features of the <italic>n</italic>th segment, and <inline-formula id="ieqn-31"><mml:math id="mml-ieqn-31"><mml:mo>&#x2299;</mml:mo></mml:math></inline-formula> represents element-wise multiplication. That is, the original features are multiplied element-wise by the average of the two directional features, and the original features are then added to the result to obtain the temporal information of the current stage.</p>
</sec>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Federated Learning of Video Action Recognition Dataset</title>
<p>To verify the performance of the video action recognition model under the federated learning training mechanism, a federated learning video dataset was created using the publicly available video dataset UCF-101.</p>
<p>As for federated learning, considering user privacy, each user trains the model using local data. In the field of machine learning, datasets often follow the Independent Identically Distributed (IID) assumption, but in the practical application scenario of federated learning, the data distribution of each user is irregular and belongs to the Non-Independent Identically Distributed (non-IID) dataset [<xref ref-type="bibr" rid="ref-16">16</xref>,<xref ref-type="bibr" rid="ref-17">17</xref>]. Existing federated learning research usually groups data based on existing public datasets, mainly in two ways. As shown in <xref ref-type="fig" rid="fig-3">Fig. 3</xref>, taking five users as an example, each user contains three types of data, where the vertical axis represents the user number, the horizontal axis represents the number of data samples, and different colors represent different data categories. The first method in <xref ref-type="fig" rid="fig-3">Fig. 3</xref> directly divides the dataset into categories and assigns fixed category data to each user, with no overlap between users. Each category may have different sample sizes and limited public datasets provide different granularities of classification. Reference can be made to the large category grouping provided in the dataset for user-specific data allocation.</p>
<fig id="fig-3">
<label>Figure 3</label>
<caption>
<title>Example of dataset grouping using Dirichlet distribution and fixed category groups. (a) Grouping data into fixed categories; (b) Dirichlet distribution data grouping</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_61396-fig-3.tif"/>
</fig>
<p>The second method in <xref ref-type="fig" rid="fig-3">Fig. 3</xref> groups the data using the Dirichlet distribution. The dataset contains <italic>N</italic> classes, and it is assumed that each user&#x2019;s data subset is drawn independently from a categorical distribution over the classes with parameter vector <inline-formula id="ieqn-32"><mml:math id="mml-ieqn-32"><mml:mi>q</mml:mi></mml:math></inline-formula>, which satisfies the condition given in <xref ref-type="disp-formula" rid="eqn-8">Eq. (8)</xref>:
<disp-formula id="eqn-8"><label>(8)</label><mml:math id="mml-eqn-8" display="block"><mml:msub><mml:mi>q</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>&#x223C;</mml:mo><mml:mi>D</mml:mi><mml:mi>i</mml:mi><mml:mi>r</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>&#x03B1;</mml:mi><mml:mi>p</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mspace width="1em" /><mml:msub><mml:mi>q</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>&#x2265;</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mspace width="1em" /><mml:mi>i</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mi>N</mml:mi><mml:mo stretchy="false">]</mml:mo><mml:mspace width="1em" /><mml:mi>a</mml:mi><mml:mi>n</mml:mi><mml:mi>d</mml:mi><mml:mspace width="1em" /><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:mi>q</mml:mi><mml:mo fence="false" stretchy="false">&#x2016;</mml:mo><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:math></disp-formula></p>
<p><inline-formula id="ieqn-33"><mml:math id="mml-ieqn-33"><mml:mrow><mml:mi mathvariant="italic">D</mml:mi><mml:mi mathvariant="italic">i</mml:mi><mml:mi mathvariant="italic">r</mml:mi><mml:mo mathvariant="italic" stretchy="false">(</mml:mo><mml:mo mathvariant="italic">&#x2217;</mml:mo><mml:mo mathvariant="italic" stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> represents the Dirichlet distribution, <inline-formula id="ieqn-34"><mml:math id="mml-ieqn-34"><mml:mi>p</mml:mi></mml:math></inline-formula> is the prior class distribution over the <italic>N</italic> classes, and <inline-formula id="ieqn-35"><mml:math id="mml-ieqn-35"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula> &#x003E; 0 is the concentration parameter that controls the degree of heterogeneity among the user-specific data subsets. As <inline-formula id="ieqn-36"><mml:math id="mml-ieqn-36"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula> approaches infinity, the category distribution of each subset approaches that of the original dataset. On the other hand, as <inline-formula id="ieqn-37"><mml:math id="mml-ieqn-37"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula> approaches 0, each user&#x2019;s subset contains only one randomly assigned category.</p>
<p><xref ref-type="fig" rid="fig-4">Fig. 4</xref> shows the Dirichlet grouping of the UCF-101 dataset under different values of <inline-formula id="ieqn-38"><mml:math id="mml-ieqn-38"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula>. The experiment used the UCF-101 dataset, whose 101 classes are displayed on the vertical axis. The horizontal axis represents 50 users, and the darkness of the colors represents the percentage of samples assigned to a particular category for a given user out of the total samples. When the colors are the same or similar along a row, this indicates that the category was evenly distributed among the 50 users, resulting in each user receiving an equal proportion of samples. It can be seen that when <inline-formula id="ieqn-39"><mml:math id="mml-ieqn-39"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula> &#x003D; 500, most categories are evenly distributed among users, whereas when <inline-formula id="ieqn-40"><mml:math id="mml-ieqn-40"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula> &#x003D; 5, all categories are scattered and disorganized, forming a non-independent and identically distributed (non-IID) dataset across the subsets.</p>
<fig id="fig-4">
<label>Figure 4</label>
<caption>
<title>Dirichlet grouping of the UCF-101 dataset under different values of <inline-formula id="ieqn-41"><mml:math id="mml-ieqn-41"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula></title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_61396-fig-4.tif"/>
</fig>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experiments</title>
<p>In this section, experiments were conducted to verify the effectiveness of video action recognition models based on spatiotemporal features and personalized federated learning methods. The performance of the models was compared across multiple indicators and the recognition accuracy on the UCF-101 dataset was provided.</p>
<sec id="s4_1">
<label>4.1</label>
<title>Experimental Environment</title>
<p>The experiments in this paper were conducted on the Ubuntu 22.04.1 LTS operating system with an Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40 GHz. A GPU was used for model training and testing; the graphics card was an NVIDIA GeForce RTX 3090 Ti with 24 GB of memory. The software environment was Python 3.7.15, PyTorch 1.10, and CUDA 11.3. If the hardware resources, especially the cache size, are reduced, the model training time will increase.</p>
<p>The proposed model has a computational complexity of approximately 4.1 GFLOPs and a total of about 25.6 M parameters. With an input image size of 224 <inline-formula id="ieqn-42"><mml:math id="mml-ieqn-42"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> 224, the memory usage is around 100&#x2013;200 MB. The real-time performance is influenced by the hardware conditions. On the RTX 3090 Ti GPU, the inference time of this model is approximately 5&#x2013;7 ms per frame.</p>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Datasets</title>
<p>The primary research objective of this paper is to investigate how to protect user data privacy in video action recognition scenarios under a personalized federated learning framework, rather than specifically optimizing the accuracy of video action recognition models. Therefore, the widely used UCF-101 dataset was adopted as the experimental dataset for this study. The UCF-101 dataset contains 13,320 videos across 101 action categories, covering a wide range of human actions in various environments. This dataset is considered one of the most comprehensive and diverse datasets for video action recognition. In the experiments of this section, the dataset was divided into training and testing sets using the holdout method, with a split ratio of approximately 7:3. To ensure the accuracy of the experimental results, the dataset was randomly divided three times and the final experimental results are the average results of the three partitioning methods. In a realistic federated learning training environment, one node corresponds to one device. The training strategy adopted in this paper is to simulate the entire federated learning process using a single device to mimic multiple nodes.</p>
<p>When testing the personalized federated learning method, the Dirichlet distribution method mentioned above was used to partition the dataset into non-independent and identically distributed (non-IID) sub-datasets, simulating the federated learning scenario. This section of the experiment also tested different values of the parameter <inline-formula id="ieqn-43"><mml:math id="mml-ieqn-43"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula>.</p>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Experimental Results and Analysis</title>
<p>In this section of the experiment, we first conducted experimental validation of the proposed spatiotemporal feature-based video action recognition method on the publicly available UCF-101 dataset. Subsequently, we verified the effectiveness of the proposed personalized federated learning-based video action recognition method on a non-independent and identically distributed (non-IID) version of the UCF-101 dataset.</p>
<sec id="s4_3_1">
<label>4.3.1</label>
<title>Ablation Study</title>
<p>This paper proposes to obtain the difference information by subtracting the previous and next 2 frames from the sampled frame, instead of subtracting each consecutive adjacent frame separately. In addition, considering that for fast actions, subtracting frames with a large time interval may introduce significant noise to the difference information, rendering it ineffective, a pooling layer is added in the channel dimension to smooth features and extract key information. Based on these three schemes for extracting differential RGB information, comparative experiments are conducted in this section to test the performance of each scheme.</p>
<p>As shown in <xref ref-type="table" rid="table-1">Table 1</xref>, <inline-formula id="ieqn-44"><mml:math id="mml-ieqn-44"><mml:msub><mml:mi>I</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:mi>t</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo fence="false" stretchy="false">{</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mn>3</mml:mn><mml:mo>,</mml:mo><mml:mn>4</mml:mn><mml:mo>,</mml:mo><mml:mn>5</mml:mn><mml:mo fence="false" stretchy="false">}</mml:mo></mml:mrow></mml:math></inline-formula> in the table represents the RGB frames at time <inline-formula id="ieqn-45"><mml:math id="mml-ieqn-45"><mml:mi>t</mml:mi></mml:math></inline-formula>, where <inline-formula id="ieqn-46"><mml:math id="mml-ieqn-46"><mml:msub><mml:mi>I</mml:mi><mml:mn>3</mml:mn></mml:msub></mml:math></inline-formula> is randomly sampled for spatial feature extraction, and the other 4 frames are the 2 frames before and after time <italic>t</italic>. <inline-formula id="ieqn-47"><mml:math id="mml-ieqn-47"><mml:mi>D</mml:mi><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> represents the differential information between frame <inline-formula id="ieqn-48"><mml:math id="mml-ieqn-48"><mml:msub><mml:mi>I</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:math></inline-formula> and frame <inline-formula id="ieqn-49"><mml:math id="mml-ieqn-49"><mml:msub><mml:mi>I</mml:mi><mml:mi>j</mml:mi></mml:msub></mml:math></inline-formula>. To demonstrate the effectiveness of differential RGB, the model performance without using differential information was first tested. 
The <inline-formula id="ieqn-50"><mml:math id="mml-ieqn-50"><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi></mml:math></inline-formula> function was used to directly concatenate the sampled frame with the 2 frames before and after it for information extraction. Experimental results show that the spatiotemporal module using differential information performs better, reaching a Top-1 accuracy of 85.851%.</p>
<table-wrap id="table-1">
<label>Table 1</label>
<caption>
<title>Recognition effect under different differential feature extraction methods</title>
</caption>
<table>
<colgroup>
<col/>
<col/>
<col/>
<col/>
</colgroup>
<thead>
<tr>
<th>Number</th>
<th>Input data</th>
<th>Smooth features</th>
<th>Acc. Top1</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td><inline-formula id="ieqn-51"><mml:math id="mml-ieqn-51"><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mn>3</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mn>4</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mn>5</mml:mn></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula></td>
<td>&#x2013;</td>
<td>79.434%</td>
</tr>
<tr>
<td>2</td>
<td><inline-formula id="ieqn-52"><mml:math id="mml-ieqn-52"><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>D</mml:mi><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>D</mml:mi><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mn>2</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>D</mml:mi><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mn>4</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>D</mml:mi><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mn>4</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mn>5</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula></td>
<td>No</td>
<td>85.444%</td>
</tr>
<tr>
<td>3</td>
<td><inline-formula id="ieqn-53"><mml:math id="mml-ieqn-53"><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>D</mml:mi><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>D</mml:mi><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>D</mml:mi><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mn>4</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>D</mml:mi><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mn>5</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula></td>
<td>No</td>
<td>82.008%</td>
</tr>
<tr>
<td>4</td>
<td><inline-formula id="ieqn-54"><mml:math id="mml-ieqn-54"><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>D</mml:mi><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>D</mml:mi><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>D</mml:mi><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mn>4</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>D</mml:mi><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mn>3</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mn>5</mml:mn></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula></td>
<td>Yes</td>
<td><bold>85.851%</bold></td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Replacing the differential between adjacent frames with the differential from sampled frames reduces accuracy. Since greater time distance between RGB frames introduces more noise, this paper applies average pooling to each frame after obtaining the differential frames. The pooled features are stacked and further smoothed through a channel-wise average pooling layer, compressing differences between features at different time points. This method improved experimental results, achieving 85.851% accuracy on the UCF-101 dataset.</p>
<p>In the time module, to extract action rhythm information, the translation stride during differential interaction is also an important experimental parameter worthy of consideration. Different schemes have been detailed in the previous section, and in this section, experimental results and performance analysis are directly provided.</p>
<p><xref ref-type="table" rid="table-2">Table 2</xref> shows the accuracy Top1 and Top5 achieved when performing local feature differential in the time module with different translation strides. From the experimental results, it can be seen that the model with a stride of 1-1-2 achieves a higher accuracy Top1, reaching 85.931%. Compared to the original scheme 1-1-1, it improves the accuracy by 0.487%. The model with a stride of 1-2-2 obtains a 0.027% improvement in accuracy Top5 compared to the original scheme, reaching 97.159%, thereby verifying the effectiveness of global stage differential features on the UCF-101 dataset. However, when the stride is set to 1-2-3, the recognition accuracy significantly decreases, indicating that differential information with a large time span is no longer effective and may even affect recognition performance.</p>
<table-wrap id="table-2">
<label>Table 2</label>
<caption>
<title>Recognition performance of different translation strides</title>
</caption>
<table>
<colgroup>
<col/>
<col/>
<col/>
<col/>
</colgroup>
<thead>
<tr>
<th>Number</th>
<th>Strides</th>
<th>Acc. Top1</th>
<th>Acc. Top5</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>1-1-1</td>
<td>85.444%</td>
<td>97.132%</td>
</tr>
<tr>
<td>2</td>
<td>2-2-2</td>
<td>85.038%</td>
<td>96.943%</td>
</tr>
<tr>
<td>3</td>
<td>1-1-2</td>
<td><bold>85.931%</bold></td>
<td>97.051%</td>
</tr>
<tr>
<td>4</td>
<td>1-2-1</td>
<td>85.092%</td>
<td>96.997%</td>
</tr>
<tr>
<td>5</td>
<td>1-2-2</td>
<td>85.363%</td>
<td><bold>97.159%</bold></td>
</tr>
<tr>
<td>6</td>
<td>1-2-3</td>
<td>84.686%</td>
<td>96.510%</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4_3_2">
<label>4.3.2</label>
<title>Optimal Accuracy of Video Recognition Model</title>
<p>Finally, based on the best model scheme and hyperparameters obtained from the experimental tests, this paper provides the optimal recognition accuracy based on the UCF-101 dataset.</p>
<p><xref ref-type="table" rid="table-3">Table 3</xref> presents a comparison of the accuracy of the proposed model with other action recognition models. Among them, the TSM and MANet models were pre-trained only on ImageNet. Under the same simple pre-training conditions, the proposed model achieved the highest accuracy of 87%. The TDN, HoCNet, TSM, F2D-SIFPNet, MEACI-NET, MTNet, and CANet models were further pre-trained on the large-scale Kinetics-400 dataset. Due to the much larger number of samples in this dataset compared to UCF-101, these models can learn more complex data representations, resulting in a significant improvement in final accuracy. Even under the condition of pre-training on both ImageNet and Kinetics-400, the proposed model still achieved the highest recognition accuracy of 97.6%.</p>
<table-wrap id="table-3">
<label>Table 3</label>
<caption>
<title>Comparison of the performance of the model proposed in this paper with other models</title>
</caption>
<table>
<colgroup>
<col/>
<col/>
<col/>
<col/>
</colgroup>
<thead>
<tr>
<th>Model</th>
<th>Pre-training</th>
<th>Backbone</th>
<th>Acc. Top1</th>
</tr>
</thead>
<tbody>
<tr>
<td>TSM [<xref ref-type="bibr" rid="ref-18">18</xref>]</td>
<td>ImageNet</td>
<td>ResNet50</td>
<td>83.2%</td>
</tr>
<tr>
<td>MANet [<xref ref-type="bibr" rid="ref-19">19</xref>]</td>
<td>ImageNet</td>
<td>ResNet50</td>
<td>86.2%</td>
</tr>
<tr>
<td>TDN [<xref ref-type="bibr" rid="ref-20">20</xref>]</td>
<td>ImageNet&#x002B;Kinetics</td>
<td>ResNet50</td>
<td>97.4%</td>
</tr>
<tr>
<td>HoCNet [<xref ref-type="bibr" rid="ref-21">21</xref>]</td>
<td>ImageNet&#x002B;Kinetics</td>
<td>ResNet50</td>
<td>94.0%</td>
</tr>
<tr>
<td>TSM [<xref ref-type="bibr" rid="ref-18">18</xref>]</td>
<td>ImageNet&#x002B;Kinetics</td>
<td>ResNet50</td>
<td>94.5%</td>
</tr>
<tr>
<td>F2D-SIFPNet [<xref ref-type="bibr" rid="ref-22">22</xref>]</td>
<td>ImageNet&#x002B;Kinetics</td>
<td>ResNet50</td>
<td>96.3%</td>
</tr>
<tr>
<td>MEACI-NET [<xref ref-type="bibr" rid="ref-23">23</xref>]</td>
<td>ImageNet&#x002B;Kinetics</td>
<td>ResNet50</td>
<td>96.4%</td>
</tr>
<tr>
<td>MTNet [<xref ref-type="bibr" rid="ref-24">24</xref>]</td>
<td>ImageNet&#x002B;Kinetics</td>
<td>ResNet50</td>
<td>96.5%</td>
</tr>
<tr>
<td>CANet [<xref ref-type="bibr" rid="ref-25">25</xref>]</td>
<td>ImageNet&#x002B;Kinetics</td>
<td>ResNet50</td>
<td>96.6%</td>
</tr>
<tr>
<td>Our model</td>
<td>ImageNet</td>
<td>ResNet50</td>
<td><bold>87.0%</bold></td>
</tr>
<tr>
<td>Our model</td>
<td>ImageNet&#x002B;Kinetics</td>
<td>ResNet50</td>
<td><bold>97.6%</bold></td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Specifically, the recognition accuracy for continuous actions with a strong rhythmic pattern was improved. Examples include BlowDryHair, CleanAndJerk, HorseRiding, JugglingBalls, and Rowing, shown in <xref ref-type="fig" rid="fig-5">Fig. 5</xref>. These five actions involve the subject performing highly repetitive movements throughout the video, maintaining a certain frequency, and exhibiting a distinct action rhythm. This demonstrates the effectiveness of extracting action rhythm features through feature differences at different scales.</p>
<fig id="fig-5">
<label>Figure 5</label>
<caption>
<title>Diagram of significant categories of action rhythm information</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_61396-fig-5.tif"/>
</fig>
<p>In summary, the video action recognition model based on spatiotemporal features proposed in this paper can effectively improve the recognition accuracy and achieve better performance on the UCF-101 dataset. We initialized our model using the pre-trained model weights from ImageNet and Kinetics-400. ImageNet is a large-scale image dataset that contains over 14 million images, spanning 1000 categories; whereas Kinetics-400 is a large-scale video dataset that includes 400 action categories. By leveraging these pre-trained models, we ensured that our model has learned a rich and diverse feature representation, allowing it to benefit from broader and more varied training data. This approach also enabled the accuracy of the model proposed in this paper to reach the top level in the field.</p>
</sec>
<sec id="s4_3_3">
<label>4.3.3</label>
<title>Personalized Federated Learning Method</title>
<p>In this section, we first conducted tests on multiple hyperparameters of federated learning, including the number of training rounds, the number of user samples, and the degree of dataset distribution. Based on these tests, we then validated the effectiveness of the personalized federated learning mechanism proposed in this paper for video action recognition.</p>
<p>After deploying the dataset and model into the FedML framework, experimental tests are first conducted on the setting of hyperparameters. In this section, the total number of users <italic>C</italic> is set to 20 as a fixed parameter and kept constant. Experimental results are tested with different numbers of users sampled per round <italic>S</italic> and the number of training epochs <italic>E</italic> for each user.</p>
<p>As shown in <xref ref-type="table" rid="table-4">Table 4</xref>, when the user training epoch is 1, the model converges after 450 communication rounds, while when the user training epoch is 5, it converges after 145 communication rounds. Although the number of communication rounds decreases, the total number of training epochs increases from 1 <inline-formula id="ieqn-55"><mml:math id="mml-ieqn-55"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> 450 &#x003D; 450 to 5 <inline-formula id="ieqn-56"><mml:math id="mml-ieqn-56"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> 145 &#x003D; 725 epochs, significantly increasing the training cost. Furthermore, when the training epoch further increases to 10, the recognition accuracy actually decreases.</p>
<table-wrap id="table-4">
<label>Table 4</label>
<caption>
<title>The experimental results based on different training epochs for each user</title>
</caption>
<table>
<colgroup>
<col/>
<col/>
<col/>
<col/>
<col/>
</colgroup>
<thead>
<tr>
<th>Number</th>
<th>Users/round</th>
<th>Epoch</th>
<th>Communication round</th>
<th>Acc.</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>4</td>
<td>1</td>
<td>450</td>
<td>96.077%</td>
</tr>
<tr>
<td>2</td>
<td>4</td>
<td>5</td>
<td>145</td>
<td>96.531%</td>
</tr>
<tr>
<td>3</td>
<td>4</td>
<td>10</td>
<td>85</td>
<td>95.696%</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>From the results, it can be observed that the training effect of the model under federated learning is not stable and does not steadily increase with the increase in local training epochs of users. This is due to the non-independent and identically distributed nature of the data, resulting in significant differences between the data distribution of each user and the overall dataset. In traditional deep learning training, each training epoch allows the model to learn complete data features, and the model is optimized with increasing training epochs. However, under federated learning conditions, an increase in training epochs can lead to the model learning too many individual characteristics of user local data, causing the model to overfit. This not only increases the training cost but also fails to achieve better performance. Increasing the aggregation frequency of the model can make the global model closer to the original optimal parameters.</p>
<p>After determining the user training epoch, this paper also conducted experiments based on different numbers of users sampled per round. The test results are shown in <xref ref-type="table" rid="table-5">Table 5</xref>. It can be observed that as the number of users sampled per round increases, the training effect of the model also improves. This is because when the total number of users is fixed, the more users sampled per round, the more data participates in the training, and the impact of each user&#x2019;s individual characteristics on the aggregation is reduced. The parameters aggregated by the central server tend to be more balanced. However, the training time cost will inevitably increase with the increase in the number of sampled users. From the perspective of simulating a real federated learning environment, the sampling value cannot be set too high. Therefore, in the following experiments, the number of sampled users per round was set to <italic>S</italic> &#x003D; 4, meaning that 1/5 of the users (data) participate in the training each round.</p>
<table-wrap id="table-5">
<label>Table 5</label>
<caption>
<title>The experimental results based on different numbers of users sampled per round</title>
</caption>
<table>
<colgroup>
<col/>
<col/>
<col/>
<col/>
<col/>
</colgroup>
<thead>
<tr>
<th>Number</th>
<th>Users/round</th>
<th>Epoch</th>
<th>Communication round</th>
<th>Acc.</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>2</td>
<td>1</td>
<td>480</td>
<td>95.448%</td>
</tr>
<tr>
<td>2</td>
<td>4</td>
<td>1</td>
<td>450</td>
<td>96.077%</td>
</tr>
<tr>
<td>3</td>
<td>5</td>
<td>1</td>
<td>450</td>
<td>96.558%</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The following experiments are conducted based on different data distribution scenarios, with reference to federated learning datasets from other fields to set parameters, using two grouping methods: Dirichlet data distribution and uniform grouping. The experimental results are shown in <xref ref-type="table" rid="table-6">Table 6</xref>, where <inline-formula id="ieqn-57"><mml:math id="mml-ieqn-57"><mml:mrow><mml:mi mathvariant="italic">D</mml:mi><mml:mi mathvariant="italic">i</mml:mi><mml:mi mathvariant="italic">r</mml:mi><mml:mo mathvariant="italic" stretchy="false">(</mml:mo><mml:mo mathvariant="italic">&#x2217;</mml:mo><mml:mo mathvariant="italic" stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> represents the Dirichlet distribution, and the parameter <inline-formula id="ieqn-58"><mml:math id="mml-ieqn-58"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula> controls the degree of data dispersion. In other studies, for datasets such as MNIST and CIFAR-10 with 10 categories, <inline-formula id="ieqn-59"><mml:math id="mml-ieqn-59"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula> is often set to 0.5. Since this paper&#x2019;s experiments use a video behavior recognition dataset with a large amount of data and large sample sizes, it is necessary to determine the best data grouping method experimentally. The uniform grouping data distribution matches the original dataset and is used to compare the impact of non-IID data grouping on training effectiveness.</p>
<table-wrap id="table-6">
<label>Table 6</label>
<caption>
<title>Experimental results based on different data distributions</title>
</caption>
<table>
<colgroup>
<col/>
<col/>
<col/>
<col/>
</colgroup>
<thead>
<tr>
<th>Number</th>
<th>Data distribution</th>
<th>Communication round</th>
<th>Acc.</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td><inline-formula id="ieqn-60"><mml:math id="mml-ieqn-60"><mml:mrow><mml:mi mathvariant="italic">D</mml:mi><mml:mi mathvariant="italic">i</mml:mi><mml:mi mathvariant="italic">r</mml:mi><mml:mo mathvariant="italic" stretchy="false">(</mml:mo><mml:mn mathvariant="italic">0.5</mml:mn><mml:mo mathvariant="italic" stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula></td>
<td>480</td>
<td>96.023%</td>
</tr>
<tr>
<td>2</td>
<td><inline-formula id="ieqn-61"><mml:math id="mml-ieqn-61"><mml:mrow><mml:mi mathvariant="italic">D</mml:mi><mml:mi mathvariant="italic">i</mml:mi><mml:mi mathvariant="italic">r</mml:mi><mml:mo mathvariant="italic" stretchy="false">(</mml:mo><mml:mn mathvariant="italic">1</mml:mn><mml:mo mathvariant="italic" stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula></td>
<td>450</td>
<td>96.077%</td>
</tr>
<tr>
<td>3</td>
<td><inline-formula id="ieqn-62"><mml:math id="mml-ieqn-62"><mml:mrow><mml:mi mathvariant="italic">D</mml:mi><mml:mi mathvariant="italic">i</mml:mi><mml:mi mathvariant="italic">r</mml:mi><mml:mo mathvariant="italic" stretchy="false">(</mml:mo><mml:mn mathvariant="italic">10</mml:mn><mml:mo mathvariant="italic" stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula></td>
<td>420</td>
<td>96.377%</td>
</tr>
<tr>
<td>4</td>
<td>Uniform grouping</td>
<td>180</td>
<td>96.402%</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>From the experimental results, it can be observed that under uniform grouping, the model&#x2019;s convergence speed in federated learning training is the fastest. However, when using Dirichlet distribution for grouping, as the <inline-formula id="ieqn-63"><mml:math id="mml-ieqn-63"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula> value decreases, the data distribution becomes more scattered, requiring more rounds for model convergence. This also affects the model&#x2019;s optimization process, leading to suboptimal training effectiveness and impacting the highest recognition accuracy after convergence. The experimental results further illustrate the impact of unevenly distributed data storage on recognition performance in the federated learning environment. Considering the use of Dirichlet distribution for data grouping, each user&#x2019;s allocation of data types and quantities is completely random, resulting in fewer common features among user local data. This is suitable for scenarios where public models are used for parameter training. In practical applications, however, each user&#x2019;s local data often exhibits strong personalized characteristics. Similar to datasets like MNIST and CIFAR-10 with special labels, they are more conducive to personalized federated learning research. Therefore, in testing the personalized federated learning scheme, we ensure each user&#x2019;s training and test sets have the same sample distribution, with the same data categories proportionally represented in both sets. Based on the experimental results of hyperparameters and dataset grouping methods, the total number of users <italic>C</italic> is set to 20, the number of users sampled in each federated learning communication round <italic>S</italic> is set to 4, and the number of local training rounds <italic>E</italic> for each user is set to 1. 
The dataset grouping method is <inline-formula id="ieqn-64"><mml:math id="mml-ieqn-64"><mml:mrow><mml:mi mathvariant="italic">D</mml:mi><mml:mi mathvariant="italic">i</mml:mi><mml:mi mathvariant="italic">r</mml:mi><mml:mo mathvariant="italic" stretchy="false">(</mml:mo><mml:mn mathvariant="italic">1</mml:mn><mml:mo mathvariant="italic" stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>. Under the above parameter settings, experiments were conducted to verify the personalized federated learning-based optimization model for video action recognition proposed in this paper, comparing the experimental results under conventional federated learning training and personalized federated learning conditions.</p>
<p><xref ref-type="table" rid="table-7">Table 7</xref> presents the highest accuracy rates Top1 and Top5 achieved by the model in this paper on local datasets of 20 users under conventional federated learning and personalized federated learning. From the average accuracy rates, the personalized federated learning approach proposed in this paper achieves better results on both indicators, with the Top1 accuracy reaching 97.792%, an improvement of 1.155%, and the Top5 accuracy reaching 99.861%, an improvement of 0.079%.</p>
<table-wrap id="table-7">
<label>Table 7</label>
<caption>
<title>Comparison of experimental results between conventional federated learning and personalized federated learning</title>
</caption>
<table>
<colgroup>
<col/>
<col/>
<col/>
<col/>
<col/>
</colgroup>
<thead>
<tr>
<th>ID</th>
<th colspan="2">Acc. Top1</th>
<th colspan="2">Acc. Top5</th>
</tr>
<tr>
<th></th>
<th>FL</th>
<th>PFL</th>
<th>FL</th>
<th>PFL</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>96.795%</td>
<td>97.312%</td>
<td>100%</td>
<td>100%</td>
</tr>
<tr>
<td>2</td>
<td>97.701%</td>
<td>98.077%</td>
<td>100%</td>
<td>100%</td>
</tr>
<tr>
<td>3</td>
<td>97.312%</td>
<td>98.333%</td>
<td>100%</td>
<td>99.444%</td>
</tr>
<tr>
<td>4</td>
<td>97.222%</td>
<td>97.778%</td>
<td>100%</td>
<td>99.444%</td>
</tr>
<tr>
<td>5</td>
<td>95.312%</td>
<td>95.556%</td>
<td>99.479%</td>
<td>99.444%</td>
</tr>
<tr>
<td>6</td>
<td>97.396%</td>
<td>96.774%</td>
<td>99.479%</td>
<td>99.462%</td>
</tr>
<tr>
<td>7</td>
<td>95.402%</td>
<td>99.405%</td>
<td>98.851%</td>
<td>100%</td>
</tr>
<tr>
<td>8</td>
<td>97.849%</td>
<td>97.849%</td>
<td>100%</td>
<td>100%</td>
</tr>
<tr>
<td>9</td>
<td>94.444%</td>
<td>97.312%</td>
<td>100%</td>
<td>100%</td>
</tr>
<tr>
<td>10</td>
<td>94.624%</td>
<td>97.312%</td>
<td>98.925%</td>
<td>100%</td>
</tr>
<tr>
<td>11</td>
<td>94.444%</td>
<td>97.222%</td>
<td>99.444%</td>
<td>100%</td>
</tr>
<tr>
<td>12</td>
<td>94.048%</td>
<td>96.237%</td>
<td>100%</td>
<td>100%</td>
</tr>
<tr>
<td>13</td>
<td>100%</td>
<td>97.222%</td>
<td>100%</td>
<td>99.444%</td>
</tr>
<tr>
<td>14</td>
<td>96.354%</td>
<td>98.387%</td>
<td>100%</td>
<td>100%</td>
</tr>
<tr>
<td>15</td>
<td>96.774%</td>
<td>98.718%</td>
<td>100%</td>
<td>100%</td>
</tr>
<tr>
<td>16</td>
<td>99.444%</td>
<td>98.889%</td>
<td>100%</td>
<td>100%</td>
</tr>
<tr>
<td>17</td>
<td>96.774%</td>
<td>97.222%</td>
<td>99.462%</td>
<td>100%</td>
</tr>
<tr>
<td>18</td>
<td>98.925%</td>
<td>98.925%</td>
<td>100%</td>
<td>100%</td>
</tr>
<tr>
<td>19</td>
<td>96.237%</td>
<td>97.849%</td>
<td>100%</td>
<td>100%</td>
</tr>
<tr>
<td>20</td>
<td>95.699%</td>
<td>99.462%</td>
<td>100%</td>
<td>100%</td>
</tr>
<tr>
<td>Avg.</td>
<td>96.637%</td>
<td><bold>97.792%</bold></td>
<td>99.782%</td>
<td><bold>99.861%</bold></td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The experimental results validate the necessity of users holding private parameters, especially in the application scenario of surveillance video action recognition. Local user data inherently possesses strong individual characteristics. For example, in a home surveillance environment, static features such as background and main subjects can vary significantly between users and do not need to be included in the public aggregation on the central server. The proposed video action recognition method based on personalized federated learning and spatiotemporal features designates the first two layers of the network, which focus on extracting static features, as private layers. The parameters from the subsequent three stages, which extract action features, are used for public aggregation, thereby enhancing the model&#x2019;s training performance.</p>
</sec>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Conclusion</title>
<p>This paper addresses the need for data privacy protection and data security in video surveillance by proposing a video action recognition method based on personalized federated learning and spatiotemporal features. First, the complete process of the personalized federated learning method and the production method of non-independent and identically distributed (non-IID) video action recognition datasets are introduced. Then, for video action recognition, a new spatiotemporal feature-based video action recognition algorithm is proposed, which includes two main modules: a spatial-temporal module based on local information and a time module based on global information. The local module extracts local spatiotemporal features based on each video segment while the global module interacts with local features through a differential approach on different action rhythms based on local information, and further uses neural networks to extract bidirectional action features. Subsequently, a personalized federated learning training scheme is provided. In the experimental analysis phase, multiple optional parameters for the modules were evaluated and experiments were conducted for different learning rate settings. Finally, leveraging the personalized federated learning framework, which incorporates stage-by-stage extraction of local spatiotemporal and global temporal features, the proposed method achieved an average accuracy of 97.792% on the non-independent and identically distributed UCF-101 public dataset. Additionally, a comprehensive comparison was made between the results of traditional and personalized federated learning. By processing local and global features separately without uploading users&#x2019; original video data or personalized features to the central server, the risk of user privacy data leakage is minimized, making federated learning an effective mechanism for enhancing model performance while protecting user privacy.</p>
<p>Future work will focus on optimizing the proposed model, particularly in terms of its adaptability to various real-world scenarios. The current effectiveness of the method relies on the quality and quantity of local data, and potential improvements include introducing argumentation-based methods to enhance model interpretability. In scenarios with long-tail data distribution, some users may have limited or low-quality local data, which can constrain the training effectiveness during the local feature extraction phase and impact overall performance. Future research aims to investigate asynchronous federated mechanisms and dynamic feature calibration methods to address these issues, achieving a better balance between privacy protection and model performance.</p>
</sec>
</body>
<back>
<ack>
<p>None.</p>
</ack>
<sec>
<title>Funding Statement</title>
<p>This work was supported by the National Natural Science Foundation of China (Grant No. 62071098); Sichuan Science and Technology Program (Grants 2022YFG0319, 2023YFG0301 and 2023YFG0018).</p>
</sec>
<sec>
<title>Author Contributions</title>
<p>Study conception and design: Rongsen Wu and Yuhang Zhang; data collection: Zelei Wu, Shiyang Tang and Yunji Li; analysis and interpretation of results: Jie Xu, Changming Zhao and Yiweng Xie; draft manuscript preparation: Rongsen Wu, Jie Xu, Yuhang Zhang, Changming Zhao and Jinhong Guo. All authors reviewed the results and approved the final version of the manuscript.</p>
</sec>
<sec sec-type="data-availability">
<title>Availability of Data and Materials</title>
<p>All data in this paper can be found in Google Scholar at <ext-link ext-link-type="uri" xlink:href="https://scholar.google.com">https://scholar.google.com</ext-link>.</p>
</sec>
<sec>
<title>Ethics Approval</title>
<p>Not applicable.</p>
</sec>
<sec sec-type="COI-statement">
<title>Conflicts of Interest</title>
<p>The authors declare no conflicts of interest to report regarding the present study.</p>
</sec>
<ref-list content-type="authoryear">
<title>References</title>
<ref id="ref-1"><label>[1]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Li</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Liang</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Gan</surname> <given-names>B</given-names></string-name>, <string-name><surname>Cui</surname> <given-names>X</given-names></string-name></person-group>. <article-title>Action recognition and detection based on deep learning: a comprehensive summary</article-title>. <source>Comput Mater Contin</source>. <year>2023</year>;<volume>77</volume>(<issue>1</issue>):<fpage>1</fpage>&#x2013;<lpage>23</lpage>. doi:<pub-id pub-id-type="doi">10.32604/cmc.2023.042494</pub-id>.</mixed-citation></ref>
<ref id="ref-2"><label>[2]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Liu</surname> <given-names>S</given-names></string-name>, <string-name><surname>Luo</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Li</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Fu</surname> <given-names>W</given-names></string-name>, <string-name><surname>Ding</surname> <given-names>W</given-names></string-name></person-group>. <article-title>Solution of wide and micro background bias in contrastive action representation learning</article-title>. <source>Eng Appl Artif Intell</source>. <year>2024</year>;<volume>133</volume>(<issue>11</issue>):<fpage>108244</fpage>. doi:<pub-id pub-id-type="doi">10.1016/j.engappai.2024.108244</pub-id>.</mixed-citation></ref>
<ref id="ref-3"><label>[3]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Khean</surname> <given-names>V</given-names></string-name>, <string-name><surname>Kim</surname> <given-names>C</given-names></string-name>, <string-name><surname>Ryu</surname> <given-names>S</given-names></string-name>, <string-name><surname>Khan</surname> <given-names>A</given-names></string-name>, <string-name><surname>Hong</surname> <given-names>MK</given-names></string-name>, <string-name><surname>Kim</surname> <given-names>EY</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>Human interaction recognition in surveillance videos using hybrid deep learning and machine learning models</article-title>. <source>Comput Mater Contin</source>. <year>2024</year>;<volume>81</volume>(<issue>1</issue>):<fpage>773</fpage>&#x2013;<lpage>87</lpage>. doi:<pub-id pub-id-type="doi">10.32604/cmc.2024.056767</pub-id>.</mixed-citation></ref>
<ref id="ref-4"><label>[4]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Xu</surname> <given-names>J</given-names></string-name>, <string-name><surname>Song</surname> <given-names>R</given-names></string-name>, <string-name><surname>Wei</surname> <given-names>H</given-names></string-name>, <string-name><surname>Guo</surname> <given-names>J</given-names></string-name>, <string-name><surname>Zhou</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Huang</surname> <given-names>X</given-names></string-name></person-group>. <article-title>A fast human action recognition network based on spatio-temporal features</article-title>. <source>Neurocomputing</source>. <year>2021</year>;<volume>441</volume>(<issue>2</issue>):<fpage>350</fpage>&#x2013;<lpage>8</lpage>. doi:<pub-id pub-id-type="doi">10.1016/j.neucom.2020.04.150</pub-id>.</mixed-citation></ref>
<ref id="ref-5"><label>[5]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Li</surname> <given-names>T</given-names></string-name>, <string-name><surname>Sahu</surname> <given-names>AK</given-names></string-name>, <string-name><surname>Talwalkar</surname> <given-names>A</given-names></string-name>, <string-name><surname>Smith</surname> <given-names>V</given-names></string-name></person-group>. <article-title>Federated learning: challenges, methods, and future directions</article-title>. <source>IEEE Signal Process Mag</source>. <year>2020</year>;<volume>37</volume>(<issue>3</issue>):<fpage>50</fpage>&#x2013;<lpage>60</lpage>. doi:<pub-id pub-id-type="doi">10.1109/MSP.2020.2975749</pub-id>.</mixed-citation></ref>
<ref id="ref-6"><label>[6]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Tyagi</surname> <given-names>S</given-names></string-name>, <string-name><surname>Rajput</surname> <given-names>IS</given-names></string-name>, <string-name><surname>Pandey</surname> <given-names>R</given-names></string-name></person-group>. <article-title>Federated learning: applications, security hazards and defense measures</article-title>. In: <conf-name>2023 International Conference on Device Intelligence, Computing and Communication Technologies, (DICCT)</conf-name>; <year>2023</year>; <publisher-loc>Dehradun, India</publisher-loc>: <publisher-name>IEEE</publisher-name>. p. <fpage>477</fpage>&#x2013;<lpage>82</lpage>.</mixed-citation></ref>
<ref id="ref-7"><label>[7]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Kairouz</surname> <given-names>P</given-names></string-name>, <string-name><surname>McMahan</surname> <given-names>HB</given-names></string-name>, <string-name><surname>Avent</surname> <given-names>B</given-names></string-name>, <string-name><surname>Bellet</surname> <given-names>A</given-names></string-name>, <string-name><surname>Bennis</surname> <given-names>M</given-names></string-name>, <string-name><surname>Bhagoji</surname> <given-names>AN</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>Advances and open problems in federated learning</article-title>. <source>Found Trends&#x00AE; Mach Learn</source>. <year>2021</year>;<volume>14</volume>(<issue>1&#x2013;2</issue>):<fpage>1</fpage>&#x2013;<lpage>210</lpage>. doi:<pub-id pub-id-type="doi">10.1561/2200000083</pub-id>.</mixed-citation></ref>
<ref id="ref-8"><label>[8]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Aggarwal</surname> <given-names>M</given-names></string-name>, <string-name><surname>Khullar</surname> <given-names>V</given-names></string-name>, <string-name><surname>Rani</surname> <given-names>S</given-names></string-name>, <string-name><surname>Prola</surname> <given-names>TA</given-names></string-name>, <string-name><surname>Bhattacharjee</surname> <given-names>SB</given-names></string-name>, <string-name><surname>Shawon</surname> <given-names>SM</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>Federated learning on internet of things: extensive and systematic review</article-title>. <source>Comput Mater Contin</source>. <year>2024</year>;<volume>79</volume>(<issue>2</issue>):<fpage>1795</fpage>&#x2013;<lpage>834</lpage>. doi:<pub-id pub-id-type="doi">10.32604/cmc.2024.049846</pub-id>.</mixed-citation></ref>
<ref id="ref-9"><label>[9]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Caroprese</surname> <given-names>L</given-names></string-name>, <string-name><surname>Ruga</surname> <given-names>T</given-names></string-name>, <string-name><surname>Vocaturo</surname> <given-names>E</given-names></string-name>, <string-name><surname>Zumpano</surname> <given-names>E</given-names></string-name></person-group>. <article-title>Lung cancer detection via federated learning</article-title>. In: <conf-name>2023 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)</conf-name>; <year>2023</year>; <publisher-loc>Istanbul, Turkiye</publisher-loc>. p. <fpage>3862</fpage>&#x2013;<lpage>7</lpage>.</mixed-citation></ref>
<ref id="ref-10"><label>[10]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Zhao</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>H</given-names></string-name>, <string-name><surname>Li</surname> <given-names>H</given-names></string-name>, <string-name><surname>Barnaghi</surname> <given-names>P</given-names></string-name>, <string-name><surname>Haddadi</surname> <given-names>H</given-names></string-name></person-group>. <article-title>Semi-supervised federated learning for activity recognition</article-title>. <comment>arXiv:2011.00851</comment>. <year>2020</year>.</mixed-citation></ref>
<ref id="ref-11"><label>[11]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Shome</surname> <given-names>D</given-names></string-name>, <string-name><surname>Kar</surname> <given-names>T</given-names></string-name></person-group>. <article-title>FedAffect: few-shot federated learning for facial expression recognition</article-title>. In: <conf-name>Proceedings of the IEEE/CVF International Conference on Computer Vision</conf-name>; <year>2021</year>. p. <fpage>4168</fpage>&#x2013;<lpage>75</lpage>.</mixed-citation></ref>
<ref id="ref-12"><label>[12]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Rehman</surname> <given-names>YAU</given-names></string-name>, <string-name><surname>Gao</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Shen</surname> <given-names>J</given-names></string-name>, <string-name><surname>de Gusmao</surname> <given-names>PPB</given-names></string-name>, <string-name><surname>Lane</surname> <given-names>N</given-names></string-name></person-group>. <article-title>Federated self-supervised learning for video understanding</article-title>. In: <conf-name>European Conference on Computer Vision</conf-name>; <year>2022</year>; <publisher-loc>Tel Aviv, Israel</publisher-loc>: <publisher-name>Springer</publisher-name>. p. <fpage>506</fpage>&#x2013;<lpage>22</lpage>.</mixed-citation></ref>
<ref id="ref-13"><label>[13]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Doshi</surname> <given-names>K</given-names></string-name>, <string-name><surname>Yilmaz</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>Federated learning-based driver activity recognition for edge devices</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>; <year>2022</year>; <publisher-loc>New Orleans, LA, USA</publisher-loc>. p. <fpage>3338</fpage>&#x2013;<lpage>46</lpage>.</mixed-citation></ref>
<ref id="ref-14"><label>[14]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Tu</surname> <given-names>NA</given-names></string-name>, <string-name><surname>Abu</surname> <given-names>A</given-names></string-name>, <string-name><surname>Aikyn</surname> <given-names>N</given-names></string-name>, <string-name><surname>Makhanov</surname> <given-names>N</given-names></string-name>, <string-name><surname>Lee</surname> <given-names>MH</given-names></string-name>, <string-name><surname>Le-Huy</surname> <given-names>K</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>FedFSLAR: a federated learning framework for few-shot action recognition</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision</conf-name>; <year>2024</year>; <publisher-loc>Waikoloa, HI, USA</publisher-loc>. p. <fpage>270</fpage>&#x2013;<lpage>9</lpage>.</mixed-citation></ref>
<ref id="ref-15"><label>[15]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Luo</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Fu</surname> <given-names>W</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>S</given-names></string-name>, <string-name><surname>Anwar</surname> <given-names>S</given-names></string-name>, <string-name><surname>Saqib</surname> <given-names>M</given-names></string-name>, <string-name><surname>Bakshi</surname> <given-names>S</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>Cefdet: cognitive effectiveness network based on fuzzy inference for action detection</article-title>. In: <conf-name>Proceedings of the 32nd ACM International Conference on Multimedia. MM &#x2019;24</conf-name>; <year>2024</year>; <publisher-loc>New York, NY, USA</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>. p. <fpage>7985</fpage>&#x2013;<lpage>94</lpage>.</mixed-citation></ref>
<ref id="ref-16"><label>[16]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Yurochkin</surname> <given-names>M</given-names></string-name>, <string-name><surname>Agarwal</surname> <given-names>M</given-names></string-name>, <string-name><surname>Ghosh</surname> <given-names>S</given-names></string-name>, <string-name><surname>Greenewald</surname> <given-names>K</given-names></string-name>, <string-name><surname>Hoang</surname> <given-names>N</given-names></string-name>, <string-name><surname>Khazaeni</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>Bayesian nonparametric federated learning of neural networks</article-title>. In: <conf-name>International Conference on Machine Learning</conf-name>; <year>2019</year>; <publisher-loc>Long Beach, CA, USA</publisher-loc>: <publisher-name>PMLR</publisher-name>. p. <fpage>7252</fpage>&#x2013;<lpage>61</lpage>.</mixed-citation></ref>
<ref id="ref-17"><label>[17]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Zhao</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Li</surname> <given-names>M</given-names></string-name>, <string-name><surname>Lai</surname> <given-names>L</given-names></string-name>, <string-name><surname>Suda</surname> <given-names>N</given-names></string-name>, <string-name><surname>Civin</surname> <given-names>D</given-names></string-name>, <string-name><surname>Chandra</surname> <given-names>V</given-names></string-name></person-group>. <article-title>Federated learning with non-iid data</article-title>. <comment>arXiv:1806.00582</comment>. <year>2018</year>.</mixed-citation></ref>
<ref id="ref-18"><label>[18]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Lin</surname> <given-names>J</given-names></string-name>, <string-name><surname>Gan</surname> <given-names>C</given-names></string-name>, <string-name><surname>Han</surname> <given-names>S</given-names></string-name></person-group>. <article-title>TSM: temporal shift module for efficient video understanding</article-title>. In: <conf-name>Proceedings of the IEEE/CVF International Conference on Computer Vision</conf-name>; <year>2019</year>; <publisher-loc>Seoul, Republic of Korea</publisher-loc>. p. <fpage>7083</fpage>&#x2013;<lpage>93</lpage>.</mixed-citation></ref>
<ref id="ref-19"><label>[19]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Li</surname> <given-names>X</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>W</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>K</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>T</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>C</given-names></string-name></person-group>. <article-title>MANet: motion-aware network for video action recognition</article-title>. <source>Complex Intell Syst</source>. <year>2025</year>;<volume>11</volume>(<issue>3</issue>):<fpage>167</fpage>. doi:<pub-id pub-id-type="doi">10.1007/s40747-024-01774-9</pub-id>.</mixed-citation></ref>
<ref id="ref-20"><label>[20]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Wang</surname> <given-names>L</given-names></string-name>, <string-name><surname>Tong</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Ji</surname> <given-names>B</given-names></string-name>, <string-name><surname>Wu</surname> <given-names>G</given-names></string-name></person-group>. <article-title>TDN: temporal difference networks for efficient action recognition</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>; <year>2021</year>. p. <fpage>1895</fpage>&#x2013;<lpage>904</lpage>.</mixed-citation></ref>
<ref id="ref-21"><label>[21]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Dong</surname> <given-names>W</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>B</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>J</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>Q</given-names></string-name></person-group>. <article-title>High-order correlation network for video recognition</article-title>. In: <conf-name>2022 International Joint Conference on Neural Networks (IJCNN)</conf-name>; <year>2022</year>; <publisher-loc>Seoul, Republic of Korea</publisher-loc>: <publisher-name>IEEE</publisher-name>. p. <fpage>1</fpage>&#x2013;<lpage>7</lpage>.</mixed-citation></ref>
<ref id="ref-22"><label>[22]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Ming</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Zhou</surname> <given-names>J</given-names></string-name>, <string-name><surname>Jia</surname> <given-names>X</given-names></string-name>, <string-name><surname>Zheng</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Xiong</surname> <given-names>L</given-names></string-name>, <string-name><surname>Feng</surname> <given-names>F</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>F2D-SIFPNet: a frequency 2D Slow-I-Fast-P network for faster compressed video action recognition</article-title>. <source>Appl Intell</source>. <year>2024</year>;<volume>54</volume>(<issue>7</issue>):<fpage>5197</fpage>&#x2013;<lpage>215</lpage>. doi:<pub-id pub-id-type="doi">10.1007/s10489-024-05408-y</pub-id>.</mixed-citation></ref>
<ref id="ref-23"><label>[23]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Li</surname> <given-names>B</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>J</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>D</given-names></string-name>, <string-name><surname>Bao</surname> <given-names>X</given-names></string-name>, <string-name><surname>Huang</surname> <given-names>D</given-names></string-name></person-group>. <article-title>Representation learning for compressed video action recognition via attentive cross-modal interaction with motion enhancement</article-title>. <comment>arXiv:2205.03569</comment>. <year>2022</year>.</mixed-citation></ref>
<ref id="ref-24"><label>[24]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Sheng</surname> <given-names>X</given-names></string-name>, <string-name><surname>Li</surname> <given-names>K</given-names></string-name>, <string-name><surname>Shen</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Xiao</surname> <given-names>G</given-names></string-name></person-group>. <article-title>A progressive difference method for capturing visual tempos on action recognition</article-title>. <source>IEEE Trans Circuits Syst Video Technol</source>. <year>2022</year>;<volume>33</volume>(<issue>3</issue>):<fpage>977</fpage>&#x2013;<lpage>87</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TCSVT.2022.3207518</pub-id>.</mixed-citation></ref>
<ref id="ref-25"><label>[25]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Gao</surname> <given-names>X</given-names></string-name>, <string-name><surname>Chang</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Ran</surname> <given-names>X</given-names></string-name>, <string-name><surname>Lu</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>CANet: comprehensive attention network for video-based action recognition</article-title>. <source>Knowl Based Syst</source>. <year>2024</year>;<volume>296</volume>(<issue>8</issue>):<fpage>111852</fpage>. doi:<pub-id pub-id-type="doi">10.1016/j.knosys.2024.111852</pub-id>.</mixed-citation></ref>
</ref-list>
</back></article>