<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1 20151215//EN" "http://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xml:lang="en" article-type="research-article" dtd-version="1.1">
<front>
<journal-meta>
<journal-id journal-id-type="pmc">CMC</journal-id>
<journal-id journal-id-type="nlm-ta">CMC</journal-id>
<journal-id journal-id-type="publisher-id">CMC</journal-id>
<journal-title-group>
<journal-title>Computers, Materials &#x0026; Continua</journal-title>
</journal-title-group>
<issn pub-type="epub">1546-2226</issn>
<issn pub-type="ppub">1546-2218</issn>
<publisher>
<publisher-name>Tech Science Press</publisher-name>
<publisher-loc>USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">58586</article-id>
<article-id pub-id-type="doi">10.32604/cmc.2024.058586</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Article</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Pseudo Label Purification with Dual Contrastive Learning for Unsupervised Vehicle Re-Identification</article-title>
<alt-title alt-title-type="left-running-head">Pseudo Label Purification with Dual Contrastive Learning for Unsupervised Vehicle Re-Identification</alt-title>
<alt-title alt-title-type="right-running-head">Pseudo Label Purification with Dual Contrastive Learning for Unsupervised Vehicle Re-Identification</alt-title>
</title-group>
<contrib-group>
<contrib id="author-1" contrib-type="author">
<name name-style="western"><surname>Xu</surname><given-names>Jiyang</given-names></name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
<contrib id="author-2" contrib-type="author" corresp="yes">
<name name-style="western"><surname>Wang</surname><given-names>Qi</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><email>wangqi@ncu.edu.cn</email></contrib>
<contrib id="author-3" contrib-type="author">
<name name-style="western"><surname>Xiong</surname><given-names>Xin</given-names></name><xref ref-type="aff" rid="aff-2">2</xref></contrib>
<contrib id="author-4" contrib-type="author">
<name name-style="western"><surname>Min</surname><given-names>Weidong</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><xref ref-type="aff" rid="aff-3">3</xref></contrib>
<contrib id="author-5" contrib-type="author">
<name name-style="western"><surname>Luo</surname><given-names>Jiang</given-names></name><xref ref-type="aff" rid="aff-4">4</xref></contrib>
<contrib id="author-6" contrib-type="author">
<name name-style="western"><surname>Gai</surname><given-names>Di</given-names></name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
<contrib id="author-7" contrib-type="author">
<name name-style="western"><surname>Han</surname><given-names>Qing</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><xref ref-type="aff" rid="aff-3">3</xref></contrib>
<aff id="aff-1"><label>1</label><institution>School of Mathematics and Computer Sciences, Nanchang University</institution>, <addr-line>Nanchang, 330031</addr-line>, <country>China</country></aff>
<aff id="aff-2"><label>2</label><institution>The First Affiliated Hospital, Jiangxi Medical College, Nanchang University</institution>, <addr-line>Nanchang, 330006</addr-line>, <country>China</country></aff>
<aff id="aff-3"><label>3</label><institution>Institute of Metaverse, Nanchang University</institution>, <addr-line>Nanchang, 330031</addr-line>, <country>China</country></aff>
<aff id="aff-4"><label>4</label><institution>Jiangxi Fangxing Technology Company Limited</institution>, <addr-line>Nanchang, 330025</addr-line>, <country>China</country></aff>
</contrib-group>
<author-notes>
<corresp id="cor1"><label>&#x002A;</label>Corresponding Author: Qi Wang. Email: <email>wangqi@ncu.edu.cn</email></corresp>
</author-notes>
<pub-date date-type="collection" publication-format="electronic">
<year>2025</year></pub-date>
<pub-date date-type="pub" publication-format="electronic">
<day>06</day>
<month>03</month>
<year>2025</year></pub-date>
<volume>82</volume>
<issue>3</issue>
<fpage>3921</fpage>
<lpage>3941</lpage>
<history>
<date date-type="received">
<day>15</day>
<month>9</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>13</day>
<month>12</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>&#x00A9; 2025 The Authors.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Published by Tech Science Press.</copyright-holder>
<license xlink:href="https://creativecommons.org/licenses/by/4.0/">
<license-p>This work is licensed under a <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="TSP_CMC_58586.pdf"></self-uri>
<abstract>
<p>The unsupervised vehicle re-identification task aims at identifying specific vehicles in surveillance videos without utilizing annotation information. Due to the higher similarity in appearance between vehicles compared to pedestrians, pseudo-labels generated through clustering are ineffective in mitigating the impact of noise, and the feature distance between inter-class and intra-class samples has not been adequately improved. To address the aforementioned issues, we design a dual contrastive learning method based on knowledge distillation. During each iteration, we utilize a teacher model to randomly partition the entire dataset into two sub-domains based on clustering pseudo-label categories. By conducting contrastive learning between the two student models, we extract more discernible vehicle identity cues to alleviate the problem of imbalanced data distribution. Subsequently, we propose a context-aware pseudo label refinement strategy that leverages contextual features by progressively associating granularity information from different bottleneck blocks. To produce more trustworthy pseudo-labels and lessen noise interference during the clustering process, the context-aware scores are obtained by calculating the similarity between global features and contextual ones, which are subsequently added to the pseudo-label encoding process. The proposed method has achieved excellent performance in overcoming label noise and optimizing data distribution through extensive experimental results on publicly available datasets.</p>
</abstract>
<kwd-group kwd-group-type="author">
<kwd>Unsupervised vehicle re-identification</kwd>
<kwd>dual contrastive learning</kwd>
<kwd>pseudo label refinement</kwd>
<kwd>knowledge distillation</kwd>
</kwd-group>
<funding-group>
<award-group id="awg1">
<funding-source>National Natural Science Foundation of China</funding-source>
<award-id>62461037</award-id>
<award-id>62076117</award-id>
<award-id>62166026</award-id>
</award-group>
<award-group id="awg2">
<funding-source>Jiangxi Provincial Natural Science Foundation</funding-source>
<award-id>20224BAB212011</award-id>
<award-id>20232BAB202051</award-id>
<award-id>20232BAB212008</award-id>
<award-id>20242BAB25078</award-id>
</award-group>
<award-group id="awg3">
<funding-source>Jiangxi Provincial Key Laboratory of Virtual Reality</funding-source>
<award-id>2024SSY03151</award-id>
</award-group>
</funding-group>
</article-meta>
</front>
<body>
<sec id="s1">
<label>1</label>
<title>Introduction</title>
<p>The purpose of vehicle re-identification (Re-ID) is to retrieve vehicles with specific identities under cross-camera surveillance systems [<xref ref-type="bibr" rid="ref-1">1</xref>,<xref ref-type="bibr" rid="ref-2">2</xref>]. Unsupervised vehicle Re-ID refers to the accurate retrieval of a given vehicle image from gallery datasets without any data annotation [<xref ref-type="bibr" rid="ref-3">3</xref>&#x2013;<xref ref-type="bibr" rid="ref-5">5</xref>]. Due to the high similarity in appearance of vehicles, the goal of unsupervised vehicle Re-ID is to effectively distinguish the feature distribution within the data domain. Recently, with the development of neural networks [<xref ref-type="bibr" rid="ref-6">6</xref>,<xref ref-type="bibr" rid="ref-7">7</xref>] in the field of computer vision, unsupervised vehicle Re-ID methods have achieved significant performance on public datasets using clustering labeled pseudo labels [<xref ref-type="bibr" rid="ref-8">8</xref>&#x2013;<xref ref-type="bibr" rid="ref-11">11</xref>]. However, existing works only consider the use of global features as clustering inputs to generate pseudo-labels, which not only generates a large amount of pseudo-label noise but also affects the optimization of unsupervised data distribution.</p>
<p>Due to the high similarity among vehicles, this poses challenges to the judgment in some unsupervised vehicle Re-ID works [<xref ref-type="bibr" rid="ref-8">8</xref>,<xref ref-type="bibr" rid="ref-9">9</xref>,<xref ref-type="bibr" rid="ref-12">12</xref>] that rely on visual representation. Recent studies have employed contrastive learning techniques to address the distance relationships between samples [<xref ref-type="bibr" rid="ref-11">11</xref>,<xref ref-type="bibr" rid="ref-13">13</xref>&#x2013;<xref ref-type="bibr" rid="ref-15">15</xref>]. By constructing positive and negative sample pairs and optimizing the inter-cluster distance, these methods effectively segregate hard negative samples and train the network. For instance, Dai et al. [<xref ref-type="bibr" rid="ref-13">13</xref>] established contrastive learning within the centroid of clustering to optimize the distance between samples and centroids. Lan et al. [<xref ref-type="bibr" rid="ref-15">15</xref>] segmented the image into three parts and utilized three centroid banks for contrastive learning. However, this approach may lead to excessive memory consumption. Although these contrastive learning methods have demonstrated remarkable performance in unsupervised vehicle Re-ID tasks, they overlook the distributional characteristics in the data domain. Consequently, the development of methodologies that optimize data distribution and construct superior visual representations remains an issue deserving of further investigation.</p>
<p>To tackle the issue of pseudo-label noise, previous works utilize the pseudo-label refinement strategies [<xref ref-type="bibr" rid="ref-9">9</xref>,<xref ref-type="bibr" rid="ref-10">10</xref>,<xref ref-type="bibr" rid="ref-15">15</xref>,<xref ref-type="bibr" rid="ref-16">16</xref>]. These methods use clustering filtering or knowledge distillation-based feature optimization to generate accurate label information. For example, Chen et al. [<xref ref-type="bibr" rid="ref-10">10</xref>] used contrastive learning by calculating the similarity score between the original image and the enhanced image as a pseudo label of the image. Wang et al. [<xref ref-type="bibr" rid="ref-9">9</xref>] adopted a joint clustering filtering method with a teacher network to filter out labels with low similarity scores and realize the assignment of false labels to images. However, due to the loss of feature information extracted by convolutional neural network (CNN), these methods rely on global features as the basis for image clustering or pseudo-label assignment, while neglecting the information between image contexts. The above-mentioned methods may make it more difficult for the Re-ID model to distinguish between hard samples.</p>
<p>Our motivation is to explore the granularity information of bottleneck blocks, reduce the noise interference by clustering, and optimize the distribution of vehicle features. Specifically, our goal is to increase the distance between inter-class features while reducing the distance between intra-class features. To overcome the limitations of aforementioned methods, this paper proposes a novel fully unsupervised vehicle Re-ID framework consisting of two components: the distillation-based dual contrastive learning method (DCL) and context-aware pseudo label refinement (CPLR). The proposed method gradually correlates the granularity of information at different levels of the network, effectively reducing noise interference in the process of generating pseudo labels for global features. Additionally, we constructed a contrastive learning method between the student network and the teacher network to deeply explore the feature distribution within the data domain.</p>
<p>Our contributions can be summarized as follows:
<list list-type="bullet">
<list-item>
<p>A dual contrastive learning framework based on knowledge distillation is designed to improve the distribution of unsupervised sample features. In the clustering stage, the teacher model is used to divide the domain data after clustering, provide the student models with joint contrastive learning, and discover sample information with more discriminative ability.</p></list-item>
<list-item>
<p>We propose a context-aware pseudo label refinement strategy to improve the awareness of image context. The contextual features of images are calculated using the differences in granularity information between different levels of the network, and the context-aware score calculated with the global features is used to provide reliable pseudo-labels.</p></list-item>
<list-item>
<p>Extensive experimental results demonstrate the effectiveness of our method, significantly outperforming existing state-of-the-art methods on several mainstream vehicle Re-ID tasks.</p></list-item>
</list></p>
<p>The remainder of this paper is organized as follows. <xref ref-type="sec" rid="s2">Section 2</xref> reviews the related work. <xref ref-type="sec" rid="s3">Section 3</xref> provides a detailed introduction to the proposed methods. <xref ref-type="sec" rid="s4">Section 4</xref> presents experimental data to validate the superiority of the proposed methods. <xref ref-type="sec" rid="s5">Section 5</xref> concludes this paper.</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Related Work</title>
<sec id="s2_1">
<label>2.1</label>
<title>Unsupervised Vehicle Re-ID</title>
<p>Existing unsupervised vehicle Re-ID methods mainly focus on how to smoothly assign the one-hot label weights to other categories after clustering. The feature space-based label smoothing methodology primarily entails establishing potential correlation between global and localized features [<xref ref-type="bibr" rid="ref-15">15</xref>&#x2013;<xref ref-type="bibr" rid="ref-18">18</xref>], or augmenting the global feature representation through the incorporation of supplementary modal information [<xref ref-type="bibr" rid="ref-1">1</xref>,<xref ref-type="bibr" rid="ref-19">19</xref>]. Cho et al. [<xref ref-type="bibr" rid="ref-16">16</xref>] proposed the partially guided pseudo-label refinement (PPLR) method, which exploits the complementary relationship between global and local features to reduce label noise. He et al. [<xref ref-type="bibr" rid="ref-19">19</xref>] proposed a graph-based progressive fusion network to fuse the RGB features and multi-infrared features of vehicles. Furthermore, inspired by transfer learning methodologies that leverage intra-domain category relationships, several endeavors [<xref ref-type="bibr" rid="ref-4">4</xref>,<xref ref-type="bibr" rid="ref-20">20</xref>&#x2013;<xref ref-type="bibr" rid="ref-22">22</xref>] have employed style transfer techniques to generate samples characterized by distinct domain styles and then mine the intra-domain and inter-domain category relationships to smooth label weights. Wang et al. [<xref ref-type="bibr" rid="ref-20">20</xref>] proposed dual constrained label smoothing to monitor unlabeled source domain data from few-sample source domain data to mine the information of source domain data, and guide the style transfer of different domain data through domain difference penalty. Ding et al. [<xref ref-type="bibr" rid="ref-21">21</xref>] proposed adaptive exploration to deal with the uneven distribution of image domains after clustering. 
These methods have demonstrated excellent performance in addressing noise and data domains. However, label smoothing in these methods only utilizes global and local features, and irrelevant local features may introduce excessive redundancy, thereby increasing computational load and potentially reducing feature discriminative ability. This work explores how to extract contextual features from the model blocks to improve the quality of pseudo labels.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Contrastive Learning</title>
<p>The contrastive learning method in unsupervised vehicle Re-ID tasks is mainly based on Momentum Contrast (MoCo) [<xref ref-type="bibr" rid="ref-23">23</xref>] to optimize feature distribution by constructing positive and negative samples. The first paradigm relies on clustering outcomes, wherein datasets are divided into positive and negative sample pairs, followed by contrastive learning conducted on these instances [<xref ref-type="bibr" rid="ref-10">10</xref>,<xref ref-type="bibr" rid="ref-11">11</xref>,<xref ref-type="bibr" rid="ref-24">24</xref>,<xref ref-type="bibr" rid="ref-25">25</xref>]. Ge et al. [<xref ref-type="bibr" rid="ref-11">11</xref>] proposed a self-paced contrastive learning framework to provide hybrid supervision through multiple different forms of category prototypes to fully exploit the distribution of data within clusters. Hu et al. [<xref ref-type="bibr" rid="ref-25">25</xref>] proposed a hard sample-guided hybrid contrastive learning method to improve feature representation by contrastive learning clustering centers and instance samples. The second paradigm uses samples to perform contrastive learning on the centroids of clusters, and updates the features of centroids with momentum in each learning process [<xref ref-type="bibr" rid="ref-13">13</xref>,<xref ref-type="bibr" rid="ref-15">15</xref>,<xref ref-type="bibr" rid="ref-26">26</xref>]. Dai et al. [<xref ref-type="bibr" rid="ref-13">13</xref>] proposed the centroid contrastive learning method to better partition the feature distribution of different instances. Yang et al. [<xref ref-type="bibr" rid="ref-26">26</xref>] proposed a contour-guided mask autoencoder method to extract the edge information of the vehicle contour to improve the quality of the label. These methods have contributed to exploring and optimizing the feature distribution of data, but they have generally neglected the imbalance of sample distribution in unsupervised processes. 
In contrast, our motivation is to explore the imbalance of the initial pseudo-label assignment and further optimize the feature distribution of the samples.</p>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Knowledge Distillation</title>
<p>Knowledge distillation is an approach for teaching the knowledge of a complicated model to a simple model [<xref ref-type="bibr" rid="ref-27">27</xref>], which tries to lead the training of a student model on a downstream task by using the prior knowledge of a teacher model on an upstream job. Several recent research studies [<xref ref-type="bibr" rid="ref-9">9</xref>,<xref ref-type="bibr" rid="ref-15">15</xref>,<xref ref-type="bibr" rid="ref-28">28</xref>,<xref ref-type="bibr" rid="ref-29">29</xref>] have incorporated knowledge distillation into the unsupervised vehicle Re-ID task. Typically, these methods use the teacher model for clustering to guide the training of the student model, and the student network model updates the teacher network using exponential moving average (EMA). Wang et al. [<xref ref-type="bibr" rid="ref-9">9</xref>] proposed an uncertainty-aware clustering method that assigns pseudo-labels through collaborative filtering of teacher and student networks. Lan et al. [<xref ref-type="bibr" rid="ref-15">15</xref>] employed an off-line distillation approach by training a teacher model from noisy pseudo-labels, which is then used to guide the learning of a student model. Ge et al. [<xref ref-type="bibr" rid="ref-28">28</xref>] utilized the joint training method of multi-teacher networks to perform joint label smoothing operations on the labels of images through the joint learning of two student networks. The aforementioned methods illustrate the superior efficacy of distillation techniques in the Re-ID field. To some extent, these methods rely on the quality of the pseudo-labels extracted by teacher model but ignore the noise existing in the initial teacher model. Distinct from those methods, we endeavor not only to use the teacher model for the extraction of high-quality pseudo-labels but also to derive more reliable information from the student model.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Proposed Method</title>
<sec id="s3_1">
<label>3.1</label>
<title>The Overall Framework</title>
<p>As shown in <xref ref-type="fig" rid="fig-1">Fig. 1</xref>, we extract contextual features from the training set. In the clustering stage, the teacher model is used to calculate the similarity distance matrix of the global features and the contextual features respectively, and the two pair-wise distances are clustered after similarity fusion to reduce the noise influence of global features. At each iteration, the training set is randomly divided into two subdomains according to the pseudo label category to simulate different data distributions for the learning of the dual contrastive model. Then, each student model predicts the two extracted features and performs label smoothing between the prediction vector and the one-hot label to obtain more reliable pseudo-labels for the learning of the loss function. The student model performs parameter updates under the joint supervision of softmax-triplet contrastive loss, context-aware identity discrimination loss, and centroid contrastive loss. After each round of learning, the dual contrastive model updates the parameters of the teacher model by collaborative EMA (Co-EMA) to obtain a more stable distillation effect.</p>
<fig id="fig-1">
<label>Figure 1</label>
<caption>
<title>The overall flow of the proposed method</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_58586-fig-1.tif"/>
</fig>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Distillation-Based Dual Contrastive Learning</title>
<p>The classification of unsupervised samples is limited by the quality of feature extraction, particularly during the early training phase when the pseudo-label distribution is uneven, which makes it challenging for the model to learn to differentiate hard negative data. Inspired by Shi et al. [<xref ref-type="bibr" rid="ref-30">30</xref>], who employed contrastive learning to optimize the distance of data features in the dataset, we consider the variances in data distribution within clusters and introduce a dual contrastive learning framework incorporating knowledge distillation. This framework consists of two student models with identical network topology and a teacher model.</p>
<p>Given <inline-formula id="ieqn-1"><mml:math id="mml-ieqn-1"><mml:mrow><mml:mi>&#x1D49F;</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula> denote the unlabeled training dataset, the global feature <inline-formula id="ieqn-2"><mml:math id="mml-ieqn-2"><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:msubsup><mml:mi>f</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mi>f</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msubsup><mml:mi>f</mml:mi><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msubsup><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula> is obtained by the network <inline-formula id="ieqn-3"><mml:math id="mml-ieqn-3"><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>&#x03B8;</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> extraction. In the clustering phase, we first use the teacher network to cluster the data domain <inline-formula id="ieqn-4"><mml:math id="mml-ieqn-4"><mml:mrow><mml:mi>&#x1D49F;</mml:mi></mml:mrow></mml:math></inline-formula>. 
Subsequently, the samples of each cluster are randomly divided into two subdomains <inline-formula id="ieqn-5"><mml:math id="mml-ieqn-5"><mml:msub><mml:mrow><mml:mi>&#x1D49F;</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula id="ieqn-6"><mml:math id="mml-ieqn-6"><mml:msub><mml:mrow><mml:mi>&#x1D49F;</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> based on the <inline-formula id="ieqn-7"><mml:math id="mml-ieqn-7"><mml:mi>&#x03B2;</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> ratio. We allocate centroid memory banks for each subdomain to facilitate online learning for students. The centroid is obtained through <xref ref-type="disp-formula" rid="eqn-1">Eq. (1)</xref>:
<disp-formula id="eqn-1"><label>(1)</label><mml:math id="mml-eqn-1" display="block"><mml:msub><mml:mi>&#x03C6;</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:mo>|</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo></mml:mrow></mml:mfrac><mml:msub><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:msubsup><mml:mi>f</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo></mml:math></disp-formula>where <inline-formula id="ieqn-8"><mml:math id="mml-ieqn-8"><mml:msub><mml:mi mathvariant="bold-italic">C</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the number of samples in category <inline-formula id="ieqn-9"><mml:math id="mml-ieqn-9"><mml:mi mathvariant="bold-italic">j</mml:mi></mml:math></inline-formula>. The value of <inline-formula id="ieqn-10"><mml:math id="mml-ieqn-10"><mml:msub><mml:mi mathvariant="bold-italic">C</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> changes with the clustering results of each round. During each iteration of training, the centroid will be updated. 
The update rule is: <inline-formula id="ieqn-11"><mml:math id="mml-ieqn-11"><mml:msubsup><mml:mi mathvariant="bold-italic">&#x03C6;</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">j</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="bold-italic">t</mml:mi></mml:mrow></mml:msubsup><mml:mo stretchy="false">&#x2190;</mml:mo><mml:mi mathvariant="bold-italic">m</mml:mi><mml:msubsup><mml:mi mathvariant="bold-italic">&#x03C6;</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">j</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="bold-italic">t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mrow><mml:mtext mathvariant="bold">1</mml:mtext></mml:mrow></mml:mrow></mml:msubsup><mml:mo>+</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mtext mathvariant="bold">1</mml:mtext></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mi mathvariant="bold-italic">m</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:msubsup><mml:mi mathvariant="bold-italic">f</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">i</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="bold-italic">g</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>, where <italic>m</italic> is the momentum update factor.</p>
<p>The pioneering works of Wei et al. [<xref ref-type="bibr" rid="ref-3">3</xref>] and Lan et al. [<xref ref-type="bibr" rid="ref-15">15</xref>] employed the cluster-level noise-contrastive estimation (ClusterNCE) loss for contrastive learning, optimizing feature distribution by generating pair-wise positive and negative samples. However, information from hard samples may be erroneously grouped into the same cluster. We utilize soft label information (<xref ref-type="sec" rid="s3_3">Section 3.3</xref>) to guide ClusterNCE, aiming to balance the convergence rates of different types of samples through soft labels. As shown in <xref ref-type="fig" rid="fig-2">Fig. 2</xref>, we employ a strongly supervised prototype as supervisory information, considering pairs within the same cluster as positive and those between different clusters as negative. The centroid contrastive loss <inline-formula id="ieqn-12"><mml:math id="mml-ieqn-12"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>c</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> can be defined as <xref ref-type="disp-formula" rid="eqn-2">Eq. 
(2)</xref>:<disp-formula id="eqn-2"><label>(2)</label><mml:math id="mml-eqn-2" display="block"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>c</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:msubsup><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>D</mml:mi></mml:mrow></mml:msubsup><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x22C5;</mml:mo><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>g</mml:mi><mml:mfrac><mml:mrow><mml:mi>e</mml:mi><mml:mi>x</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mo>&#x27E8;</mml:mo><mml:msubsup><mml:mi>f</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x22C5;</mml:mo><mml:msubsup><mml:mi>&#x03C6;</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x27E9;</mml:mo></mml:mrow><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:mi>&#x03C4;</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:msubsup><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msubsup><mml:mi>e</mml:mi><mml:mi>x</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mo>&#x27E8;</mml:mo><mml:msubsup><mml:mi>f</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x22C5;</mml:mo><mml:msubsup><mml:mi>&#x03C6;</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x27E9;</mml:mo></mml:mrow><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:mi>&#x03C4;</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mfrac><mml:mo>,</mml:mo></mml:math></disp-formula>where <inline-formula 
id="ieqn-13"><mml:math id="mml-ieqn-13"><mml:mrow><mml:mo>&#x27E8;</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mo>&#x27E9;</mml:mo></mml:mrow></mml:math></inline-formula> indicates cosine similarity, <inline-formula id="ieqn-14"><mml:math id="mml-ieqn-14"><mml:mi mathvariant="bold-italic">&#x03C4;</mml:mi></mml:math></inline-formula> is a temperature hyper-parameter, <inline-formula id="ieqn-15"><mml:math id="mml-ieqn-15"><mml:msub><mml:mi mathvariant="bold-italic">y</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the pseudo label of instance <inline-formula id="ieqn-16"><mml:math id="mml-ieqn-16"><mml:mi mathvariant="bold-italic">i</mml:mi></mml:math></inline-formula>. The pseudo labels are allocated from each subdomain. Therefore, the total loss for one student training is as <xref ref-type="disp-formula" rid="eqn-3">Eq. (3)</xref>:
<disp-formula id="eqn-3"><label>(3)</label><mml:math id="mml-eqn-3" display="block"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>c</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:math></disp-formula>where <inline-formula id="ieqn-17"><mml:math id="mml-ieqn-17"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="bold-italic">c</mml:mi><mml:mi mathvariant="bold-italic">i</mml:mi><mml:mi mathvariant="bold-italic">d</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> represents the context-aware identity discrimination loss (<xref ref-type="sec" rid="s3_3">Section 3.3</xref>). After each iteration, the teacher model will be jointly updated by the dual contrastive model in the <xref ref-type="disp-formula" rid="eqn-4">Eq. (4)</xref> in a Co-EMA manner:
<disp-formula id="eqn-4"><label>(4)</label><mml:math id="mml-eqn-4" display="block"><mml:msubsup><mml:mi>f</mml:mi><mml:mrow><mml:msub><mml:mi>&#x03B8;</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mi>&#x03B1;</mml:mi><mml:msubsup><mml:mi>f</mml:mi><mml:mrow><mml:msub><mml:mi>&#x03B8;</mml:mi><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mo>+</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>&#x03B1;</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>&#x03B2;</mml:mi><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mrow><mml:msub><mml:mi>&#x03B8;</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>&#x03B2;</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mrow><mml:msub><mml:mi>&#x03B8;</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mspace width="negativethinmathspace" /><mml:mo>,</mml:mo></mml:math></disp-formula>where <inline-formula id="ieqn-18"><mml:math id="mml-ieqn-18"><mml:msubsup><mml:mi mathvariant="bold-italic">f</mml:mi><mml:mrow><mml:msub><mml:mi mathvariant="bold-italic">&#x03B8;</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">T</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi mathvariant="bold-italic">t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mrow><mml:mtext mathvariant="bold">1</mml:mtext></mml:mrow></mml:mrow></mml:msubsup></mml:math></inline-formula> denotes <inline-formula id="ieqn-19"><mml:math id="mml-ieqn-19"><mml:mi mathvariant="bold-italic">t</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mrow><mml:mtext 
mathvariant="bold">1</mml:mtext></mml:mrow></mml:math></inline-formula> iteration teacher model parameters, <inline-formula id="ieqn-20"><mml:math id="mml-ieqn-20"><mml:msub><mml:mi mathvariant="bold-italic">f</mml:mi><mml:mrow><mml:mrow><mml:msub><mml:mi mathvariant="bold-italic">&#x03B8;</mml:mi><mml:mrow><mml:mrow><mml:mtext mathvariant="bold">1</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula id="ieqn-21"><mml:math id="mml-ieqn-21"><mml:msub><mml:mi mathvariant="bold-italic">f</mml:mi><mml:mrow><mml:mrow><mml:msub><mml:mi mathvariant="bold-italic">&#x03B8;</mml:mi><mml:mrow><mml:mrow><mml:mtext mathvariant="bold">2</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula> are the parameters of the two student models, respectively. The parameter <inline-formula id="ieqn-22"><mml:math id="mml-ieqn-22"><mml:mi mathvariant="bold-italic">&#x03B1;</mml:mi></mml:math></inline-formula> is the hyper-parameter for updating the momentum of the model, and <inline-formula id="ieqn-23"><mml:math id="mml-ieqn-23"><mml:mi mathvariant="bold-italic">&#x03B2;</mml:mi></mml:math></inline-formula> is the scaling factor used to partition the data domain <inline-formula id="ieqn-24"><mml:math id="mml-ieqn-24"><mml:mrow><mml:mi>&#x1D49F;</mml:mi></mml:mrow></mml:math></inline-formula>.</p>
<fig id="fig-2">
<label>Figure 2</label>
<caption>
<title>Loss <inline-formula id="ieqn-25"><mml:math id="mml-ieqn-25"><mml:msub><mml:mrow><mml:mi mathvariant="bold-script">L</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext mathvariant="bold">ccl</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula> description: supervised by the label smoothing method, simple samples are closer to the centroid, while hard samples approach the centroid more slowly</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_58586-fig-2.tif"/>
</fig>
<p>To handle the global features <inline-formula id="ieqn-26"><mml:math id="mml-ieqn-26"><mml:msubsup><mml:mi mathvariant="bold-italic">f</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">i</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="bold-italic">g</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> and their pseudo-labels <inline-formula id="ieqn-27"><mml:math id="mml-ieqn-27"><mml:msub><mml:mi mathvariant="bold-italic">y</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> that are extracted by each student network, we also utilize softmax-triplet loss [<xref ref-type="bibr" rid="ref-31">31</xref>]. In addition, we merge the features extracted from the sub-domains, and jointly calculate the softmax-triplet contrast loss of the dual contrastive network and the teacher network, to better eliminate mistake amplification. The expression is as <xref ref-type="disp-formula" rid="eqn-5">Eq. (5)</xref>:
<disp-formula id="eqn-5"><label>(5)</label><mml:math id="mml-eqn-5" display="block"><mml:mtable columnalign="right right" rowspacing="4pt" columnspacing="1em"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>i</mml:mi><mml:mi mathvariant="normal">&#x005F;</mml:mi><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:mtd><mml:mtd><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mo>&#x2211;</mml:mo><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>g</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mfrac><mml:msup><mml:mi>e</mml:mi><mml:mrow><mml:mo stretchy="false">&#x2225;</mml:mo><mml:msubsup><mml:mi>f</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x2212;</mml:mo><mml:msubsup><mml:mi>f</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msubsup><mml:mo stretchy="false">&#x2225;</mml:mo></mml:mrow></mml:msup><mml:mrow><mml:msup><mml:mi>e</mml:mi><mml:mrow><mml:mo stretchy="false">&#x2225;</mml:mo><mml:msubsup><mml:mi>f</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x2212;</mml:mo><mml:msubsup><mml:mi>f</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msubsup><mml:mo stretchy="false">&#x2225;</mml:mo></mml:mrow></mml:msup><mml:mo>&#x2212;</mml:mo><mml:msup><mml:mi>e</mml:mi><mml:mrow><mml:mo stretchy="false">&#x2225;</mml:mo><mml:msubsup><mml:mi>f</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x2212;</mml:mo><mml:msubsup><mml:mi>f</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msubsup><mml:mo 
stretchy="false">&#x2225;</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:mfrac></mml:mstyle><mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>i</mml:mi><mml:mi mathvariant="normal">&#x005F;</mml:mi><mml:mi>T</mml:mi></mml:mrow></mml:msub></mml:mtd><mml:mtd><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mo>&#x2211;</mml:mo><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>g</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mfrac><mml:msup><mml:mi>e</mml:mi><mml:mrow><mml:mo stretchy="false">&#x2225;</mml:mo><mml:mrow><mml:mover><mml:msubsup><mml:mi>f</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mrow><mml:mover><mml:msubsup><mml:mi>f</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">&#x2225;</mml:mo></mml:mrow></mml:msup><mml:mrow><mml:msup><mml:mi>e</mml:mi><mml:mrow><mml:mo stretchy="false">&#x2225;</mml:mo><mml:mrow><mml:mover><mml:msubsup><mml:mi>f</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mrow><mml:mover><mml:msubsup><mml:mi>f</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">&#x2225;</mml:mo></mml:mrow></mml:msup><mml:mo>&#x2212;</mml:mo><mml:msup><mml:mi>e</mml:mi><mml:mrow><mml:mo 
stretchy="false">&#x2225;</mml:mo><mml:mrow><mml:mover><mml:msubsup><mml:mi>f</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mrow><mml:mover><mml:msubsup><mml:mi>f</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">&#x2225;</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:mfrac></mml:mstyle><mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable><mml:mo>,</mml:mo></mml:math></disp-formula>where <inline-formula id="ieqn-28"><mml:math id="mml-ieqn-28"><mml:msubsup><mml:mi mathvariant="bold-italic">f</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">i</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="bold-italic">g</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>,<inline-formula id="ieqn-29"><mml:math id="mml-ieqn-29"><mml:mrow><mml:mover><mml:msubsup><mml:mi mathvariant="bold-italic">f</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">i</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="bold-italic">g</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x007E;</mml:mo></mml:mover></mml:mrow></mml:math></inline-formula> respectively from the student network <inline-formula id="ieqn-30"><mml:math id="mml-ieqn-30"><mml:msub><mml:mi mathvariant="bold-italic">f</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">&#x03B8;</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> and teacher network <inline-formula id="ieqn-31"><mml:math id="mml-ieqn-31"><mml:msub><mml:mi mathvariant="bold-italic">f</mml:mi><mml:mrow><mml:mrow><mml:msub><mml:mi mathvariant="bold-italic">&#x03B8;</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">T</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula> extract query instance <inline-formula 
id="ieqn-32"><mml:math id="mml-ieqn-32"><mml:mi mathvariant="bold-italic">i</mml:mi></mml:math></inline-formula>&#x2019;s global features. <inline-formula id="ieqn-33"><mml:math id="mml-ieqn-33"><mml:mrow><mml:mo>(</mml:mo><mml:mi mathvariant="bold-italic">i</mml:mi><mml:mo>,</mml:mo><mml:mi mathvariant="bold-italic">n</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mspace width="negativethinmathspace" /><mml:mo>,</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mi mathvariant="bold-italic">i</mml:mi><mml:mo>,</mml:mo><mml:mi mathvariant="bold-italic">p</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> denote the negative and positive samples of query instance <inline-formula id="ieqn-34"><mml:math id="mml-ieqn-34"><mml:mi mathvariant="bold-italic">i</mml:mi></mml:math></inline-formula>, respectively.</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Context-Aware Pseudo Label Refinement</title>
<p><bold>Contextual feature extraction.</bold> Granularity information typically refers to the degree to which data or information is subdivided at different levels or scales. As illustrated in <xref ref-type="fig" rid="fig-3">Fig. 3</xref>, there are significant differences in the scale of image features presented by different layers of network blocks. It is noteworthy that in this work, our granularity information is cross-hierarchical. We hierarchically extract granularity information from four network blocks and achieve progressive correlation through a self-attention (SA) mechanism, effectively establishing long-term dependencies between these features.</p>
<fig id="fig-3">
<label>Figure 3</label>
<caption>
<title>Illustration of the contextual feature extraction. Association of feature maps extracted from different levels in the network through a progressive approach</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_58586-fig-3.tif"/>
</fig>
<p>Specifically, we applied <inline-formula id="ieqn-35"><mml:math id="mml-ieqn-35"><mml:mrow><mml:mtext mathvariant="bold">1</mml:mtext></mml:mrow><mml:mo>&#x00D7;</mml:mo><mml:mrow><mml:mtext mathvariant="bold">1</mml:mtext></mml:mrow></mml:math></inline-formula> convolution, global average pooling (GAP), ReLU and batch norm (BN) to feature maps from each stage at different scales, aligning their feature dimensions to 2048. Subsequently, we utilize the feature maps from the previous layer as Q, and those from the subsequent layer as K and V for feature enhancement. The contextual features of each layer can be defined as <xref ref-type="disp-formula" rid="eqn-6">Eq. (6)</xref>:
<disp-formula id="eqn-6"><label>(6)</label><mml:math id="mml-eqn-6" display="block"><mml:mi>S</mml:mi><mml:mi>A</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x22C5;</mml:mo><mml:msubsup><mml:mi>f</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:msqrt><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:msqrt></mml:mfrac><mml:mo>)</mml:mo></mml:mrow><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:math></disp-formula>where <inline-formula id="ieqn-36"><mml:math id="mml-ieqn-36"><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> from the current layer as K &#x0026; V, <inline-formula id="ieqn-37"><mml:math id="mml-ieqn-37"><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> from the prior layer as Q. In particular, we employ the multi-head self-attention (MSA) mechanism to establish the relationship of features, which is an extended form of multiple independent <inline-formula id="ieqn-38"><mml:math id="mml-ieqn-38"><mml:mi>S</mml:mi><mml:mi>A</mml:mi></mml:math></inline-formula> modules. 
It is denoted as <inline-formula id="ieqn-39"><mml:math id="mml-ieqn-39"><mml:mi>M</mml:mi><mml:mi>S</mml:mi><mml:mi>A</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mo>[</mml:mo><mml:mi>S</mml:mi><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>S</mml:mi><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:mi>S</mml:mi><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msub><mml:mo>]</mml:mo></mml:mrow><mml:mi>W</mml:mi></mml:math></inline-formula>, where <inline-formula id="ieqn-40"><mml:math id="mml-ieqn-40"><mml:mi>W</mml:mi></mml:math></inline-formula> is the projection matrix. We empirically set <italic>m</italic> to 4.</p>
<p>Based on <xref ref-type="disp-formula" rid="eqn-6">Eq. (6)</xref>, we compute the contextual feature information between the current layer and the previous layer in a progressive manner, thereby deriving the contextual features of the image. Let us consider whether SA-based contextual feature extraction is necessary. Although the implementation of SA undoubtedly escalates the complexity of the model, it is noteworthy that, in comparison to traditional methods of direct feature concatenation, SA can effectively capture the dependency relationships among the various components of the input data. This capability facilitates a more robust contextual understanding during the feature extraction process, especially considering the recognition interference that may arise from the high similarity between vehicle models. We deem that, in addition to relying on the network for extracting global features of the image, it is also necessary to focus on the contextual information of the image (such as fine-grained vehicle information: license plates, headlights, logos, etc.). In subsequent experiments, we conducted further analysis on the impact of contextual feature extraction on the overall performance of the model.</p>
<p><bold>Optimization of cluster noise.</bold> Owing to the inherent informational biases in isolated global feature clustering, we employ a teacher network to extract both contextual features <inline-formula id="ieqn-41"><mml:math id="mml-ieqn-41"><mml:msubsup><mml:mi mathvariant="bold-italic">f</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">i</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="bold-italic">c</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> and global features <inline-formula id="ieqn-42"><mml:math id="mml-ieqn-42"><mml:msubsup><mml:mi mathvariant="bold-italic">f</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">i</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="bold-italic">g</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>. Subsequently, during the clustering phase, we compute the Jaccard distance matrix for each feature across the entire sample set, denoted <inline-formula id="ieqn-43"><mml:math id="mml-ieqn-43"><mml:msub><mml:mi mathvariant="bold-italic">D</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">g</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">D</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">c</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. The weighted pair-wise distance is implemented as follows:
<disp-formula id="eqn-7"><label>(7)</label><mml:math id="mml-eqn-7" display="block"><mml:mi>D</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>&#x03BB;</mml:mi><mml:mrow><mml:mi>D</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x03BB;</mml:mi><mml:mrow><mml:mi>D</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:math></disp-formula>where <inline-formula id="ieqn-44"><mml:math id="mml-ieqn-44"><mml:msub><mml:mi mathvariant="bold-italic">&#x03BB;</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">D</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the pair-wise matrix weighting factor. In line with cluster-based methods like Chen et al. [<xref ref-type="bibr" rid="ref-10">10</xref>] and PPLR [<xref ref-type="bibr" rid="ref-16">16</xref>], we generate one-hot labels using the DBSCAN [<xref ref-type="bibr" rid="ref-32">32</xref>] clustering algorithm. 
This allows us to establish hard pseudo-labels <inline-formula id="ieqn-45"><mml:math id="mml-ieqn-45"><mml:mi mathvariant="bold-italic">y</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">y</mml:mi><mml:mrow><mml:mrow><mml:mtext mathvariant="bold">1</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">y</mml:mi><mml:mrow><mml:mrow><mml:mtext mathvariant="bold">2</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">y</mml:mi><mml:mrow><mml:msup><mml:mi mathvariant="bold-italic">n</mml:mi><mml:mrow><mml:mi mathvariant="bold">&#x2032;</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula> for the training dataset. Because of outliers, the number of clustered samples <inline-formula id="ieqn-46"><mml:math id="mml-ieqn-46"><mml:msup><mml:mi mathvariant="bold-italic">n</mml:mi><mml:mrow><mml:mi mathvariant="bold">&#x2032;</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, is smaller than the training set sample <inline-formula id="ieqn-47"><mml:math id="mml-ieqn-47"><mml:mi mathvariant="bold-italic">n</mml:mi></mml:math></inline-formula>.</p>
<p><bold>Pseudo-label refinement.</bold> Although contextual features are employed during the clustering phase to mitigate biases introduced by global features, the resulting labels remain fundamentally hard labels. Feature extraction and clustering algorithms can impact the quality of label assignment, thereby complicating the attainment of effective generalization. Owing to variations in visual attention regions, contextual features and global features convey complementary information. By leveraging the global features <inline-formula id="ieqn-48"><mml:math id="mml-ieqn-48"><mml:msubsup><mml:mi mathvariant="bold-italic">f</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">i</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="bold-italic">g</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> and contextual features <inline-formula id="ieqn-49"><mml:math id="mml-ieqn-49"><mml:msubsup><mml:mi mathvariant="bold-italic">f</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">i</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="bold-italic">c</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> of a query image, we employ cosine similarity to compute a similarity-aware score <inline-formula id="ieqn-50"><mml:math id="mml-ieqn-50"><mml:msub><mml:mi mathvariant="bold-italic">S</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, as <xref ref-type="disp-formula" rid="eqn-8">Eq. (8)</xref>:
<disp-formula id="eqn-8"><label>(8)</label><mml:math id="mml-eqn-8" display="block"><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msubsup><mml:mi>f</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x22C5;</mml:mo><mml:msubsup><mml:mi>f</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mrow><mml:mo stretchy="false">&#x2225;</mml:mo><mml:msubsup><mml:mi>f</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msubsup><mml:msub><mml:mo>&#x2225;</mml:mo><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>&#x2217;</mml:mo><mml:mo>&#x2225;</mml:mo><mml:msubsup><mml:mi>f</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msubsup><mml:msub><mml:mo stretchy="false">&#x2225;</mml:mo><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:mfrac><mml:mo>,</mml:mo></mml:math></disp-formula></p>
<p>A high similarity perception score means that there is a significant correlation between global features and semantic context features, and the two information can complement each other to provide a more comprehensive feature representation. Conversely, a low similarity perception score means that the intersection of the two provides unreliable information. <inline-formula id="ieqn-51"><mml:math id="mml-ieqn-51"><mml:mi mathvariant="bold-italic">q</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">q</mml:mi><mml:mrow><mml:mrow><mml:mtext mathvariant="bold">1</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">q</mml:mi><mml:mrow><mml:mrow><mml:mtext mathvariant="bold">2</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">q</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">C</mml:mi></mml:mrow></mml:msub><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula> is obtained by <inline-formula id="ieqn-52"><mml:math id="mml-ieqn-52"><mml:msub><mml:mi mathvariant="bold-italic">f</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">&#x03B8;</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">x</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> prediction get <inline-formula id="ieqn-53"><mml:math id="mml-ieqn-53"><mml:mi mathvariant="bold-italic">C</mml:mi></mml:math></inline-formula> category labels. The label <inline-formula id="ieqn-54"><mml:math id="mml-ieqn-54"><mml:msub><mml:mi mathvariant="bold-italic">y</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is smoothed as <xref ref-type="disp-formula" rid="eqn-9">Eq. (9)</xref>:
<disp-formula id="eqn-9"><label>(9)</label><mml:math id="mml-eqn-9" display="block"><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>&#x03BB;</mml:mi><mml:mrow><mml:mi>h</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:msubsup><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi></mml:mrow></mml:msubsup><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x03BB;</mml:mi><mml:mrow><mml:mi>h</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:msubsup><mml:mi>q</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msubsup><mml:mo>+</mml:mo><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mspace width="thinmathspace" /><mml:msubsup><mml:mi>q</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msubsup><mml:mo>]</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:math></disp-formula>where <inline-formula id="ieqn-55"><mml:math id="mml-ieqn-55"><mml:msub><mml:mi mathvariant="bold-italic">&#x03BB;</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">h</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is a constant and is set to 0.7 in the experiments, <inline-formula id="ieqn-56"><mml:math id="mml-ieqn-56"><mml:msubsup><mml:mi mathvariant="bold-italic">y</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">i</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="bold-italic">h</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> represents one hot label, derived from clustering algorithm. 
<inline-formula id="ieqn-57"><mml:math id="mml-ieqn-57"><mml:msubsup><mml:mi mathvariant="bold-italic">q</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">i</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="bold-italic">g</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mi mathvariant="bold-italic">q</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">i</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="bold-italic">c</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> extracted from global features and contextual features respectively. During the training phase, the student model&#x2019;s loss function (<xref ref-type="sec" rid="s3_2">Section 3.2</xref>) is computed using the trustworthy pseudo labels that we acquired from label smoothing. The loss associated with context-aware identity discrimination <inline-formula id="ieqn-58"><mml:math id="mml-ieqn-58"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="bold-italic">c</mml:mi><mml:mi mathvariant="bold-italic">i</mml:mi><mml:mi mathvariant="bold-italic">d</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> can be defined as <xref ref-type="disp-formula" rid="eqn-10">Eq. (10)</xref>:
<disp-formula id="eqn-10"><label>(10)</label><mml:math id="mml-eqn-10" display="block"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x22C5;</mml:mo><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mfrac><mml:mrow><mml:mi>e</mml:mi><mml:mi>x</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msubsup><mml:mi>q</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msubsup><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:munderover><mml:mi>e</mml:mi><mml:mi>x</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msubsup><mml:mi>q</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msubsup><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mfrac><mml:mo>+</mml:mo><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mfrac><mml:mrow><mml:mi>e</mml:mi><mml:mi>x</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msubsup><mml:mi>q</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msubsup><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:munderover><mml:mi>e</mml:mi><mml:mi>x</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msubsup><mml:mi>q</mml:mi><mml:mrow><
mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msubsup><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mfrac><mml:mo>)</mml:mo></mml:mrow><mml:mspace width="negativethinmathspace" /><mml:mo>,</mml:mo></mml:math></disp-formula></p>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Training Objective and Real-Life Application</title>
<p>Overall, the training loss arises from two student networks and a teacher network. The calculation of the overall framework&#x2019;s training loss is as <xref ref-type="disp-formula" rid="eqn-11">Eq. (11)</xref>:<disp-formula id="eqn-11"><label>(11)</label><mml:math id="mml-eqn-11" display="block"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>o</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>&#x03B2;</mml:mi><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mrow><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>&#x03B2;</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mi>e</mml:mi><mml:mrow><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>&#x03B3;</mml:mi><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>i</mml:mi><mml:mi mathvariant="normal">&#x005F;</mml:mi><mml:mi>S</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>&#x03B3;</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>i</mml:mi><mml:mi mathvariant="normal">&#x005F;</mml:mi><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo></mml:math></disp-formula>where <inline-formula id="ieqn-59"><mml:math id="mml-ieqn-59"><mml:mi mathvariant="bold-italic">&#x03B3;</mml:mi></mml:math></inline-formula> is the comparison weight parameter of the triplet loss of the teacher network and the dual 
contrastive network, and <inline-formula id="ieqn-60"><mml:math id="mml-ieqn-60"><mml:mi mathvariant="bold-italic">&#x03B2;</mml:mi></mml:math></inline-formula> is the training dataset partition factor, which is used to balance the loss weight of the two student networks.</p>
<p>Clearly, the two student networks update their parameters using <inline-formula id="ieqn-61"><mml:math id="mml-ieqn-61"><mml:msub><mml:mrow><mml:mi mathvariant="bold-script">L</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext mathvariant="bold">total</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula>, and the teacher network is jointly updated with the two student networks in a Co-EMA manner (details in <xref ref-type="disp-formula" rid="eqn-4">Eq. (4)</xref>). The overall training process is shown in Algorithm 1.</p>
<p>In summary, the real-life application scenarios of the model include:</p>
<p>(1) Vehicle tracking: achieve continuous tracking of target vehicles across cameras. For example, in cross-camera traffic scenarios, the vehicle re-identification system can integrate data from various security surveillance cameras in the city, accurately identify and associate the same vehicle in different video frames, and construct a complete driving trajectory of suspected vehicles including fake license plates and obscured license plates, providing key clues for case investigation.</p>
<p>(2) Cross-city model deployment: based on unsupervised learning methods, the system can explore the potential patterns and structures of the data itself, learn directly from a large amount of unlabeled monitoring data, and does not rely on labeled data for specific traffic scenarios. Therefore, it has better cross-city retrieval capabilities, can adapt to the traffic environment of different cities, and achieve generalized deployment of the model.</p>
<fig id="fig-9">
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_58586-fig-9.tif"/>
</fig>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experiments</title>
<p>In this section, we will experimentally analyze the performance of the proposed method. The following four issues need to be considered: <bold>RQ1:</bold> Is the effect of label smoothing better than that of other methods? <bold>RQ2:</bold> How does DCL affect the performance of models? <bold>RQ3:</bold> How to assess the impact of label smoothing beyond accuracy? <bold>RQ4:</bold> How to evaluate the contribution of contextual features to the global feature representation?</p>
<sec id="s4_1">
<label>4.1</label>
<title>Dataset and Evaluation Protocols</title>
<p>VeRi-776 [<xref ref-type="bibr" rid="ref-33">33</xref>] is a basic dataset widely used in vehicle re-identification research. It consists of over 50,000 images captured by 20 cameras covering 776 different vehicles. The training set contains 37,781 images of 576 vehicles, the query set contains 1678 images of 200 vehicles, and the gallery set contains 11,579 images of the same 200 vehicles.</p>
<p>VERI-Wild [<xref ref-type="bibr" rid="ref-34">34</xref>] is a large-scale vehicle re-identification dataset consisting of 416,314 images of 40,671 vehicles captured by 174 cameras. Different from the VeRi-776 [<xref ref-type="bibr" rid="ref-33">33</xref>] dataset, the VERI-Wild dataset has differences in illumination, weather and night changes caused by time span. The training set contains 277,797 images of 30,671 vehicles, and the test set contains 128,517 images of 10,000 vehicles and is further subdivided into three subsets of different sizes: Test3000, Test5000, and Test10000.</p>
<p>Following the general evaluation metrics in the vehicle Re-ID task, we use cumulative matching curve (CMC) and the mean average precision (mAP) proposed by Zheng et al. [<xref ref-type="bibr" rid="ref-35">35</xref>] to evaluate the performance of the proposed method.</p>
<p>Rank-k. Rank-k in the CMC curve is used to evaluate the matching degree of the model at different rankings. Rank-k calculation is as follows:<disp-formula id="eqn-12"><label>(12)</label><mml:math id="mml-eqn-12" display="block"><mml:mi>R</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi><mml:mi>k</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mi>K</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover><mml:mi>g</mml:mi><mml:mi>t</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>k</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mi>N</mml:mi></mml:mfrac><mml:mo>,</mml:mo></mml:math></disp-formula>where <inline-formula id="ieqn-83"><mml:math id="mml-ieqn-83"><mml:mi mathvariant="bold-italic">N</mml:mi></mml:math></inline-formula> represents the total number of vehicle images in the query set. When there are accurately matched images in the <italic>K</italic>-th retrieved images <inline-formula id="ieqn-84"><mml:math id="mml-ieqn-84"><mml:mrow><mml:mi mathvariant="bold-italic">g</mml:mi><mml:mi mathvariant="bold-italic">t</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi mathvariant="bold-italic">i</mml:mi><mml:mo>,</mml:mo><mml:mi mathvariant="bold-italic">k</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mrow><mml:mtext mathvariant="bold">1</mml:mtext></mml:mrow></mml:math></inline-formula>, otherwise <inline-formula id="ieqn-85"><mml:math id="mml-ieqn-85"><mml:mrow><mml:mi mathvariant="bold-italic">g</mml:mi><mml:mi mathvariant="bold-italic">t</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi mathvariant="bold-italic">i</mml:mi><mml:mo>,</mml:mo><mml:mi mathvariant="bold-italic">k</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mrow><mml:mtext mathvariant="bold">0</mml:mtext></mml:mrow></mml:math></inline-formula>.</p>
<p>mAP. The average precision (AP) for each image in the query set is calculated as follows:<disp-formula id="eqn-13"><label>(13)</label><mml:math id="mml-eqn-13" display="block"><mml:mi>A</mml:mi><mml:mi>P</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>M</mml:mi></mml:mrow></mml:munderover><mml:mi>P</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>k</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x00D7;</mml:mo><mml:mi>g</mml:mi><mml:mi>t</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>k</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>g</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mfrac><mml:mo>,</mml:mo></mml:math></disp-formula>where <italic>M</italic> is the length of the entire gallery set, <inline-formula id="ieqn-86"><mml:math id="mml-ieqn-86"><mml:msub><mml:mi mathvariant="bold-italic">N</mml:mi><mml:mrow><mml:mi mathvariant="bold-italic">g</mml:mi><mml:mi mathvariant="bold-italic">t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> denotes the number of images in the gallery set with the same ID as the query image, and <inline-formula id="ieqn-87"><mml:math id="mml-ieqn-87"><mml:mi mathvariant="bold-italic">P</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi mathvariant="bold-italic">k</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> denotes the accuracy of the top <italic>k</italic> query result. 
If the ID of the <italic>k</italic>-th image is the same as the query image <inline-formula id="ieqn-88"><mml:math id="mml-ieqn-88"><mml:mrow><mml:mi mathvariant="bold-italic">g</mml:mi><mml:mi mathvariant="bold-italic">t</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi mathvariant="bold-italic">k</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mrow><mml:mtext mathvariant="bold">1</mml:mtext></mml:mrow></mml:math></inline-formula>, otherwise <inline-formula id="ieqn-89"><mml:math id="mml-ieqn-89"><mml:mrow><mml:mi mathvariant="bold-italic">g</mml:mi><mml:mi mathvariant="bold-italic">t</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi mathvariant="bold-italic">k</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mrow><mml:mtext mathvariant="bold">0</mml:mtext></mml:mrow></mml:math></inline-formula>. mAP is the average AP value of the entire query set N, which can be defined as:</p>
<p><disp-formula id="eqn-14"><label>(14)</label><mml:math id="mml-eqn-14" display="block"><mml:mi>m</mml:mi><mml:mi>A</mml:mi><mml:mi>P</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover><mml:mi>A</mml:mi><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mi>N</mml:mi></mml:mfrac><mml:mo>.</mml:mo></mml:math></disp-formula></p>
<p>mAP comprehensively reflects the accuracy of the model across all retrieval results. A higher mAP value indicates superior performance of the model in accurately matching vehicles. Conversely, the Rank-k metric signifies the probability that at least one of the top k retrieved results is a positive sample, with Rank-1 and Rank-5 being common evaluation criteria. These Rank-k metrics can more directly indicate the retrieval performance of the model in comparison to mAP. Nevertheless, neither Rank-k nor mAP alone can accurately describe the performance of the re-identification system. Therefore, both indicators must be considered simultaneously to comprehensively represent the retrieval performance of the model.</p>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Implementation Details</title>
<p>We adopt ResNet50 [<xref ref-type="bibr" rid="ref-36">36</xref>] as our backbone; we remove all sub-module layers after the fourth layer and add the GAP operation as the representation of global features. Initialize these two student network parameters using ImageNet [<xref ref-type="bibr" rid="ref-37">37</xref>]. All experiments were performed on 2<inline-formula id="ieqn-90"><mml:math id="mml-ieqn-90"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> NVIDIA Tesla V100 GPU. Our training process is divided into two stages. The whole training process is divided into 50 epochs, in the image clustering stage, we use DBSCAN [<xref ref-type="bibr" rid="ref-32">32</xref>] as the clustering algorithm to assign pseudo-labels to images. On the VeRi-776 dataset, the maximum distance d is set to 0.7, while on the VERI-Wild datasets, the maximum distance d is set to 0.6. Due to device limitations, all our VERI-Wild experiments use a uniform set of 40,000 images as the training set. During training phase, we chose an initial learning rate of 3 <inline-formula id="ieqn-91"><mml:math id="mml-ieqn-91"><mml:mo>&#x00D7;</mml:mo><mml:mspace width="thinmathspace" /><mml:mspace width="thinmathspace" /><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mn>4</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>, decreasing by a factor of 10 every 20 epochs. Use the Adam optimizer to optimize the weights of the network and set the weight decay to 5 <inline-formula id="ieqn-92"><mml:math id="mml-ieqn-92"><mml:mo>&#x00D7;</mml:mo><mml:mspace width="thinmathspace" /><mml:mspace width="thinmathspace" /><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mn>4</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>. To augment the data, we used random horizontal flips and random occlusion [<xref ref-type="bibr" rid="ref-38">38</xref>], both with probability set to 0.5. 
During training phase, we set the weighting factor <inline-formula id="ieqn-93"><mml:math id="mml-ieqn-93"><mml:msub><mml:mi>&#x03BB;</mml:mi><mml:mrow><mml:mi>D</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> of the pair-wise matrix to 0.7. In addition, the weight coefficient <inline-formula id="ieqn-94"><mml:math id="mml-ieqn-94"><mml:mi>&#x03B3;</mml:mi></mml:math></inline-formula> of <inline-formula id="ieqn-95"><mml:math id="mml-ieqn-95"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>i</mml:mi><mml:mi mathvariant="normal">&#x005F;</mml:mi><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula id="ieqn-96"><mml:math id="mml-ieqn-96"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>i</mml:mi><mml:mi mathvariant="normal">&#x005F;</mml:mi><mml:mi>T</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is set to 0.8.</p>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Ablation Studies</title>
<p><bold>Influence of the Different Modules:</bold> To verify the effectiveness of the proposed framework in the unsupervised vehicle Re-ID task, we conduct experiments to analyze the combination effect of the CPLR and DCL method. The explanation for each ablation module is as follows:
<list list-type="bullet">
<list-item>
<p>&#x201C;Baseline&#x201D; means using the traditional clustering-based unsupervised pipeline without any ablation modules.</p></list-item>
<list-item>
<p>&#x201C;<bold>w/</bold> CPLR&#x201D; indicates that only context-aware pseudo label refinement strategy based on &#x201C;Baseline&#x201D;.</p></list-item>
<list-item>
<p>&#x201C;<bold>w/</bold> DCL&#x201D; indicates employing only dual contrastive learning methods based on &#x201C;Baseline&#x201D;.</p></list-item>
<list-item>
<p>&#x201C;<bold>Ours</bold>&#x201D; indicates the use of two proposed ablation modules.</p></list-item>
</list></p>
<p>In <xref ref-type="table" rid="table-1">Table 1</xref>, we compare the results of different combinations of modules. The results show that DCL exhibits significant performance in both datasets, owing to its ability to differentiate the distribution of samples within the two simulated subdomains. Additionally, employing CPLR improved R-1 and mAP by 4.9% and 4.7% respectively over the &#x201C;Baseline&#x201D; in VeRi-776, demonstrating its effectiveness in label purification. When integrated, the combination further enhanced accuracy, clearly illustrating the complementarity of the two modules, which offers more robust information in the realm of unsupervised vehicle re-identification.</p>
<table-wrap id="table-1">
<label>Table 1</label>
<caption>
<title>Ablation studies on the impacts of individual components in VeRi-776 and VERI-Wild. &#x201C;<bold>w</bold>/&#x201D; denotes only using individual ablation modules. &#x201C;R-1&#x201D; and &#x201C;R-5&#x201D; represent the accuracy of Rank-1 and Rank-5, respectively. In subsequent experiments, we will keep the definitions of these indicators unchanged</title>
</caption>
<table>
<colgroup>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
</colgroup>
<thead>
<tr>
<th>Methods</th>
<th align="center" colspan="3">VeRi-776</th>
<th align="center" colspan="3">VERI-Wild (Test3000)</th>
</tr>
<tr>
<th/>
<th>R-1</th>
<th>R-5</th>
<th>mAP</th>
<th>R-1</th>
<th>R-5</th>
<th>mAP</th>
</tr>
</thead>
<tbody>
<tr>
<td>Baseline</td>
<td>79.6</td>
<td>85.6</td>
<td>35.1</td>
<td>51.8</td>
<td>75.9</td>
<td>27.3</td>
</tr>
<tr>
<td><bold>w/</bold> DCL</td>
<td>85.2</td>
<td>90.5</td>
<td>40.6</td>
<td>60.7</td>
<td>79.7</td>
<td>31.4</td>
</tr>
<tr>
<td><bold>w/</bold> CPLR</td>
<td>84.5</td>
<td>90.0</td>
<td>39.8</td>
<td>58.9</td>
<td>80.0</td>
<td>30.8</td>
</tr>
<tr>
<td><bold>Ours</bold></td>
<td><bold>87.8</bold></td>
<td><bold>92.1</bold></td>
<td><bold>43.2</bold></td>
<td><bold>62.8</bold></td>
<td><bold>82.8</bold></td>
<td><bold>32.8</bold></td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Influence of the Partitioning Factor: To explore the effect of random partitioning factor <inline-formula id="ieqn-97"><mml:math id="mml-ieqn-97"><mml:mi mathvariant="bold-italic">&#x03B2;</mml:mi></mml:math></inline-formula> in different dataset domains, we conduct experiments on VeRi-776 and VERI-Wild datasets. In each experiment, we keep the other hyper-parameters fixed and only adjust the partition factor <inline-formula id="ieqn-98"><mml:math id="mml-ieqn-98"><mml:mi mathvariant="bold-italic">&#x03B2;</mml:mi></mml:math></inline-formula>. The experimental results are shown in <xref ref-type="table" rid="table-2">Table 2</xref>. Experimental results show that a small value of <inline-formula id="ieqn-99"><mml:math id="mml-ieqn-99"><mml:mi mathvariant="bold-italic">&#x03B2;</mml:mi></mml:math></inline-formula> will lead to unreliable identity information and confidence, while a high value of <inline-formula id="ieqn-100"><mml:math id="mml-ieqn-100"><mml:mi mathvariant="bold-italic">&#x03B2;</mml:mi></mml:math></inline-formula> will reduce the accuracy of the model. Due to the partition of the data domain, the centroids update frequency in the subdomain and the loss function of the student network change, which leads to the network influence on the learning of the information in the sample. Experimental results prove that the process of pseudo-label assignment will lead to the difference of data distribution in the domain. Based on the above experimental results, we choose the value of <inline-formula id="ieqn-101"><mml:math id="mml-ieqn-101"><mml:mi mathvariant="bold-italic">&#x03B2;</mml:mi></mml:math></inline-formula> 0.7 and 0.8 as the basic parameters of VERI-Wild and VeRi-776 datasets in the subsequent experiments.</p>
<table-wrap id="table-2">
<label>Table 2</label>
<caption>
<title>Factor of partitioning factor <inline-formula id="ieqn-102"><mml:math id="mml-ieqn-102"><mml:mi>&#x03B2;</mml:mi></mml:math></inline-formula> value results on the VERI-Wild and VeRi-776 datasets</title>
</caption>
<table frame="hsides">
<colgroup>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
</colgroup>
<thead>
<tr>
<th>Parameters <inline-formula id="ieqn-103"><mml:math id="mml-ieqn-103"><mml:mi mathvariant="bold-italic">&#x03B2;</mml:mi></mml:math></inline-formula></th>
<th align="center" colspan="3">VeRi-776</th>
<th align="center" colspan="3">VERI-Wild (Test3000)</th>
</tr>
<tr>
<th/>
<th>R-1</th>
<th>R-5</th>
<th>mAP</th>
<th>R-1</th>
<th>R-5</th>
<th>mAP</th>
</tr>
</thead>
<tbody>
<tr>
<td>0.1</td>
<td>80.1</td>
<td>87.8</td>
<td>34.9</td>
<td>51.7</td>
<td>74.9</td>
<td>24.3</td>
</tr>
<tr>
<td>0.2</td>
<td>81.5</td>
<td>88.3</td>
<td>35.4</td>
<td>53.7</td>
<td>77.8</td>
<td>25.4</td>
</tr>
<tr>
<td>0.3</td>
<td>81.8</td>
<td>88.9</td>
<td>35.5</td>
<td>56.6</td>
<td>78.4</td>
<td>28.8</td>
</tr>
<tr>
<td>0.4</td>
<td>83.6</td>
<td>90.5</td>
<td>37.6</td>
<td>59.1</td>
<td>80.0</td>
<td>30.3</td>
</tr>
<tr>
<td>0.5</td>
<td>84.7</td>
<td>90.5</td>
<td>39.6</td>
<td>61.5</td>
<td>81.1</td>
<td>31.4</td>
</tr>
<tr>
<td>0.6</td>
<td>85.6</td>
<td>91.7</td>
<td>40.8</td>
<td>61.8</td>
<td>81.1</td>
<td>32.0</td>
</tr>
<tr>
<td>0.7</td>
<td>87.2</td>
<td>91.5</td>
<td>42.1</td>
<td><bold>62.8</bold></td>
<td><bold>82.8</bold></td>
<td><bold>32.8</bold></td>
</tr>
<tr>
<td>0.8</td>
<td><bold>87.8</bold></td>
<td><bold>92.1</bold></td>
<td><bold>43.2</bold></td>
<td>62.4</td>
<td><bold>82.8</bold></td>
<td>32.6</td>
</tr>
<tr>
<td>0.9</td>
<td>86.0</td>
<td>90.4</td>
<td>41.2</td>
<td>61.3</td>
<td>80.9</td>
<td>31.2</td>
</tr>
</tbody>
</table>
</table-wrap>
<p><bold>Analysis of Loss Function:</bold> We explore the effect of different loss functions on model performance on VeRi-776 and VERI-Wild, and <xref ref-type="table" rid="table-3">Table 3</xref> displays the results of our loss function ablation. The first row illustrates the results of utilizing only <inline-formula id="ieqn-104"><mml:math id="mml-ieqn-104"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>tri</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula> loss, which reduces mAP and R-1 by 8.1% and 12%, respectively. This indicates that the softmax-triplet loss cannot effectively optimize the model performance. Rows 2 and 3 show the ablation results of removing <inline-formula id="ieqn-105"><mml:math id="mml-ieqn-105"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>ccl</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula id="ieqn-106"><mml:math id="mml-ieqn-106"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>cid</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula>, respectively, with mAP dropped by 4.4% and 2.7% in VeRi-776, respectively. This demonstrates that by including contextual feature, feature quality can be improved and features with distinct identities may be effectively distinguished in <inline-formula id="ieqn-107"><mml:math id="mml-ieqn-107"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> loss. 
Additionally, <inline-formula id="ieqn-108"><mml:math id="mml-ieqn-108"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>ccl</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula> pushes inter-class data farther and intra-class data closer, which will enhance the feature distribution.</p>
<table-wrap id="table-3">
<label>Table 3</label>
<caption>
<title>Ablation studies on the effects of different loss function in VeRi-776 and VERI-Wild</title>
</caption>
<table frame="hsides">
<colgroup>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
</colgroup>
<thead>
<tr>
<th><inline-formula id="ieqn-109"><mml:math id="mml-ieqn-109"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>(<inline-formula id="ieqn-110"><mml:math id="mml-ieqn-110"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>i</mml:mi><mml:mi mathvariant="normal">&#x005F;</mml:mi><mml:mi>S</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>i</mml:mi><mml:mi mathvariant="normal">&#x005F;</mml:mi><mml:mi>T</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>)</th>
<th rowspan="2"><inline-formula id="ieqn-111"><mml:math id="mml-ieqn-111"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>c</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula></th>
<th rowspan="2"><inline-formula id="ieqn-112"><mml:math id="mml-ieqn-112"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula></th>
<th align="center" colspan="3">VeRi-776</th>
<th align="center" colspan="3">VERI-Wild (Test3000)</th>
</tr>
<tr>
<th/>
<th>R-1</th>
<th>R-5</th>
<th>mAP</th>
<th>R-1</th>
<th>R-5</th>
<th>mAP</th>
</tr>
</thead>
<tbody>
<tr>
<td>&#x221A;</td>
<td></td>
<td></td>
<td>80.1</td>
<td>87.6</td>
<td>35.2</td>
<td>51.6</td>
<td>73.3</td>
<td>25.1</td>
</tr>
<tr>
<td>&#x221A;</td>
<td></td>
<td>&#x221A;</td>
<td>83.6</td>
<td>90.0</td>
<td>38.8</td>
<td>59.6</td>
<td>80.6</td>
<td>29.6</td>
</tr>
<tr>
<td>&#x221A;</td>
<td>&#x221A;</td>
<td></td>
<td>85.1</td>
<td>90.6</td>
<td>40.5</td>
<td>60.8</td>
<td>81.4</td>
<td>31.0</td>
</tr>
<tr>
<td>&#x221A;</td>
<td>&#x221A;</td>
<td>&#x221A;</td>
<td><bold>87.8</bold></td>
<td><bold>92.1</bold></td>
<td><bold>43.2</bold></td>
<td><bold>62.8</bold></td>
<td><bold>82.8</bold></td>
<td><bold>32.8</bold></td>
</tr>
</tbody>
</table>
</table-wrap>
<p><bold>Effect of the label refinement strategy:</bold> To answer <bold>RQ1</bold>. We investigated multiple pseudo-labeling strategies, ensuring fair comparisons by conducting all experiments within the &#x201C;Baseline&#x201D; model, as detailed in <xref ref-type="table" rid="table-4">Table 4</xref>. The &#x201C;One-hot&#x201D; label is derived from clustering results, with the correct cluster assigned a value of 1 and all others set to 0. The &#x201C;LSR&#x201D; strategy, introduced by Szegedy et al. [<xref ref-type="bibr" rid="ref-39">39</xref>], assigns a weight of 0.9 to the correct label, distributing the remaining weights evenly at 0.1 each. The &#x201C;OLS&#x201D; strategy, proposed by Zhang et al. [<xref ref-type="bibr" rid="ref-40">40</xref>], is an online label smoothing technique that leverages correct classifications from past epochs to refine label smoothing in the current epoch. The aforementioned method demonstrates that the hard labels produced by clustering are unreliable. However, these approaches all depend on the accuracy of global feature extraction. In contrast, &#x201C;CPLR&#x201D; exhibits superior effectiveness by synergistically smoothing labels using both contextual and global features, without relying on the quality of single feature extraction, thereby more effectively mitigating the impact of label noise.</p>
<table-wrap id="table-4">
<label>Table 4</label>
<caption>
<title>Ablation studies on the effects of the label refinement strategy in VeRi-776 and VERI-Wild</title>
</caption>
<table frame="hsides">
<colgroup>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
</colgroup>
<thead>
<tr>
<th>Methods</th>
<th align="center" colspan="3">VeRi-776</th>
<th align="center" colspan="3">VERI-Wild (Test3000)</th>
</tr>
<tr>
<th/>
<th>R-1</th>
<th>R-5</th>
<th>mAP</th>
<th>R-1</th>
<th>R-5</th>
<th>mAP</th>
</tr>
</thead>
<tbody>
<tr>
<td>One-hot</td>
<td>79.6</td>
<td>85.6</td>
<td>35.1</td>
<td>51.8</td>
<td>75.9</td>
<td>27.3</td>
</tr>
<tr>
<td>LSR [<xref ref-type="bibr" rid="ref-39">39</xref>]</td>
<td>80.6</td>
<td>88.5</td>
<td>37.1</td>
<td>53.2</td>
<td>77.2</td>
<td>27.5</td>
</tr>
<tr>
<td>OLS [<xref ref-type="bibr" rid="ref-40">40</xref>]</td>
<td>81.7</td>
<td>89.2</td>
<td>38.8</td>
<td>55.6</td>
<td>78.8</td>
<td>28.5</td>
</tr>
<tr>
<td><bold>CPLR</bold></td>
<td><bold>84.5</bold></td>
<td><bold>90.0</bold></td>
<td><bold>39.8</bold></td>
<td><bold>58.9</bold></td>
<td><bold>80.0</bold></td>
<td><bold>30.8</bold></td>
</tr>
</tbody>
</table>
</table-wrap>
<p><bold>Analysis of DCL:</bold> To answer <bold>RQ2</bold>, we conducted an in-depth analysis of DCL&#x2019;s impact on model performance. As shown in <xref ref-type="table" rid="table-5">Table 5</xref>, removing the Memory Bank (row 2) led to a significant drop in performance. This is because DCL depends on contrastive learning to extract intra-domain feature distributions. When momentum updates were removed (row 3), the model became heavily reliant on the quality of feature extraction at the start of each iteration, failing to utilize historical data feature distributions. We further examined the reliance on label information during contrastive learning (row 4). By removing label information from <inline-formula id="ieqn-113"><mml:math id="mml-ieqn-113"><mml:msub><mml:mrow><mml:mi>&#x02112;</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mtext>ccl</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula> (<xref ref-type="disp-formula" rid="eqn-2">Eq. (2)</xref>) and using ClusterNCE for the contrastive loss, the mAP of both datasets decreased by 1% and 0.8%, respectively, suggesting that label refinement somewhat lessens the reliance on label information. Additionally, we analyzed the impact of the teacher model on performance (row 5) by substituting the teacher model task in framework <xref ref-type="fig" rid="fig-1">Fig. 1</xref> with a student model, encompassing pseudo-label clustering assignments, data domain partitioning, and loss training. The results showed that if the dual contrast network only learns intra-domain information, it can easily lead to model overfitting. The teacher model enhances model robustness by jointly updating parameters with feature information from their respective sub-domain distributions, facilitated by Co-EMA methods of the student networks.</p>
<table-wrap id="table-5">
<label>Table 5</label>
<caption>
<title>Analysis of DCL on model performance in VeRi-776 and VERI-Wild</title>
</caption>
<table frame="hsides">
<colgroup>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
</colgroup>
<thead>
<tr>
<th>Methods</th>
<th align="center" colspan="3">VeRi-776</th>
<th align="center" colspan="3">VERI-Wild (Test3000)</th>
</tr>
<tr>
<th/>
<th>R-1</th>
<th>R-5</th>
<th>mAP</th>
<th>R-1</th>
<th>R-5</th>
<th>mAP</th>
</tr>
</thead>
<tbody>
<tr>
<td><bold>Ours</bold></td>
<td><bold>87.8</bold></td>
<td><bold>92.1</bold></td>
<td><bold>43.2</bold></td>
<td><bold>62.8</bold></td>
<td><bold>82.8</bold></td>
<td><bold>32.8</bold></td>
</tr>
<tr>
<td>w/o Memory bank</td>
<td>83.6</td>
<td>90.0</td>
<td>38.8</td>
<td>59.6</td>
<td>80.6</td>
<td>29.6</td>
</tr>
<tr>
<td>w/o Momentum updating</td>
<td>83.7</td>
<td>90.8</td>
<td>39.9</td>
<td>60.3</td>
<td>81.6</td>
<td>30.6</td>
</tr>
<tr>
<td>w/o Label refinement</td>
<td>86.9</td>
<td>91.5</td>
<td>42.3</td>
<td>62.1</td>
<td>82.2</td>
<td>31.6</td>
</tr>
<tr>
<td>w/o Knowledge distillation</td>
<td>84.9</td>
<td>91.2</td>
<td>40.1</td>
<td>57.5</td>
<td>80.0</td>
<td>30.7</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Additionally, we analyzed the impact of various momentum update rates m on the model, as illustrated in <xref ref-type="fig" rid="fig-4">Fig. 4</xref>. The momentum coefficient m closer to 1 indicates a slower update rate. A higher m value increases the model training process&#x2019;s dependency on the quality of centroid feature extraction. Conversely, as m approaches 0, centroid feature updates tend towards the current sample, potentially causing frequent updates and a consequent loss of information from other features within the same cluster. The optimal model accuracy is achieved when m approaches 0.1.</p>
<fig id="fig-4">
<label>Figure 4</label>
<caption>
<title>Ablation study of the momentum value m on model performance in VeRi-776 and VERI-Wild</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_58586-fig-4.tif"/>
</fig>
</sec>
<sec id="s4_4">
<label>4.4</label>
<title>Comparison with State-of-the-Arts</title>
<p>We evaluate our method with other state-of-the-art unsupervised vehicle Re-ID techniques, including UDA and USL methods. As shown in <xref ref-type="table" rid="table-6">Table 6</xref>, we report the results for the two widely used vehicle datasets. We also use some open-source code in the field of object or person Re-ID in our research, as pure unsupervised vehicle re-identification approaches are currently limited. These include MMT [<xref ref-type="bibr" rid="ref-28">28</xref>], ICE [<xref ref-type="bibr" rid="ref-10">10</xref>], HHCL [<xref ref-type="bibr" rid="ref-25">25</xref>], RLCC [<xref ref-type="bibr" rid="ref-41">41</xref>], UCF [<xref ref-type="bibr" rid="ref-9">9</xref>], and Lan et al. [<xref ref-type="bibr" rid="ref-15">15</xref>]. All results are from experiments conducted with their open-source code or on their published vehicle Re-ID dataset. For a fair comparison, we have kept the basic parameters of the model consistent in the open-source code. Despite the simplicity of the approach, we demonstrated strong competitive performance across both datasets.</p>
<table-wrap id="table-6">
<label>Table 6</label>
<caption>
<title>Comparison with the state-of-the-art methods on VeRi-776 and VERI-Wild. &#x201C;&#x002A;&#x201D; represents unsupervised domain adaptation (UDA) method</title>
</caption>
<table frame="hsides">
<colgroup>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
</colgroup>
<thead>
<tr>
<th>Methods</th>
<th>References</th>
<th align="center" colspan="3">VeRi-776</th>
<th align="center" colspan="9">VERI-Wild</th>
</tr>
<tr>
<th/>
<th/>
<th>R-1</th>
<th>R-5</th>
<th>mAP</th>
<th align="center" colspan="3">Test3000</th>
<th align="center" colspan="3">Test5000</th>
<th align="center" colspan="3">Test10000</th>
</tr>
<tr>
<th/>
<th/>
<th/>
<th/>
<th/>
<th>R-1</th>
<th>R-5</th>
<th>mAP</th>
<th>R-1</th>
<th>R-5</th>
<th>mAP</th>
<th>R-1</th>
<th>R-5</th>
<th>mAP</th>
</tr>
</thead>
<tbody>
<tr>
<td>MMT [<xref ref-type="bibr" rid="ref-28">28</xref>]&#x002A;</td>
<td>ICLR&#x2019;20</td>
<td>60.9</td>
<td>69.0</td>
<td>25.4</td>
<td>45.6</td>
<td>67.1</td>
<td>21.8</td>
<td>39.2</td>
<td>61.2</td>
<td>18.3</td>
<td>30.4</td>
<td>50.8</td>
<td>14.1</td>
</tr>
<tr>
<td>SPCL [<xref ref-type="bibr" rid="ref-11">11</xref>]</td>
<td>NIPS&#x2019;20</td>
<td>79.9</td>
<td>86.8</td>
<td>36.9</td>
<td>52.8</td>
<td>77.6</td>
<td>27.6</td>
<td>48.5</td>
<td>72.8</td>
<td>26.4</td>
<td>38.1</td>
<td>61.9</td>
<td>20.3</td>
</tr>
<tr>
<td>HHCL [<xref ref-type="bibr" rid="ref-25">25</xref>]</td>
<td>IC-NIDC&#x2019;21</td>
<td>69.6</td>
<td>75.6</td>
<td>31.0</td>
<td>56.3</td>
<td>79.7</td>
<td>30.2</td>
<td>49.2</td>
<td>73.3</td>
<td>26.1</td>
<td>38.3</td>
<td>62.3</td>
<td>20.5</td>
</tr>
<tr>
<td>RLCC [<xref ref-type="bibr" rid="ref-41">41</xref>]</td>
<td>CVPR&#x2019;21</td>
<td>83.4</td>
<td>88.8</td>
<td>39.6</td>
<td>55.2</td>
<td>79.4</td>
<td>29.1</td>
<td>47.3</td>
<td>72.8</td>
<td>24.9</td>
<td>37.0</td>
<td>62.2</td>
<td>19.5</td>
</tr>
<tr>
<td>ICE [<xref ref-type="bibr" rid="ref-10">10</xref>]</td>
<td>ICCV&#x2019;21</td>
<td>82.1</td>
<td>87.1</td>
<td>37.9</td>
<td>54.7</td>
<td>78.6</td>
<td>28.7</td>
<td>47.0</td>
<td>71.3</td>
<td>24.8</td>
<td>37.0</td>
<td>61.1</td>
<td>19.3</td>
</tr>
<tr>
<td>CACL [<xref ref-type="bibr" rid="ref-43">43</xref>]</td>
<td>TIP&#x2019;22</td>
<td>62.4</td>
<td>73.5</td>
<td>27.3</td>
<td>57.0</td>
<td>80.3</td>
<td>30.3</td>
<td>48.8</td>
<td>74.0</td>
<td>26.1</td>
<td>38.2</td>
<td>63.6</td>
<td>20.5</td>
</tr>
<tr>
<td>CTFRN [<xref ref-type="bibr" rid="ref-29">29</xref>]&#x002A;</td>
<td>PR&#x2019;22</td>
<td>76.7</td>
<td>81.5</td>
<td>37.1</td>
<td>61.3</td>
<td>82.4</td>
<td>32.3</td>
<td>51.8</td>
<td>74.9</td>
<td>27.5</td>
<td>42.1</td>
<td>65.7</td>
<td>22.1</td>
</tr>
<tr>
<td>Cluster-Contrast [<xref ref-type="bibr" rid="ref-13">13</xref>]</td>
<td>ACCV&#x2019;22</td>
<td>86.2</td>
<td>90.5</td>
<td>40.8</td>
<td>56.2</td>
<td>78.7</td>
<td>29.5</td>
<td>48.6</td>
<td>72.9</td>
<td>26.0</td>
<td>37.8</td>
<td>62.2</td>
<td>19.5</td>
</tr>
<tr>
<td>PPLR [<xref ref-type="bibr" rid="ref-16">16</xref>]</td>
<td>CVPR&#x2019;22</td>
<td>85.6</td>
<td>88.7</td>
<td>41.6</td>
<td>59.6</td>
<td>82.1</td>
<td>31.4</td>
<td>51.5</td>
<td>74.5</td>
<td>26.9</td>
<td>42.1</td>
<td>65.3</td>
<td>22.2</td>
</tr>
<tr>
<td>Lan et al. [<xref ref-type="bibr" rid="ref-15">15</xref>]</td>
<td>TIP&#x2019;23</td>
<td>78.5</td>
<td>84.8</td>
<td>35.1</td>
<td>56.3</td>
<td>80.4</td>
<td>30.2</td>
<td>47.3</td>
<td>71.0</td>
<td>25.5</td>
<td>36.5</td>
<td>60.8</td>
<td>19.9</td>
</tr>
<tr>
<td>TDSR [<xref ref-type="bibr" rid="ref-3">3</xref>]&#x002A;</td>
<td>TITS&#x2019;23</td>
<td>86.8</td>
<td>92.1</td>
<td>40.0</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
<td>&#x2013;</td>
</tr>
<tr>
<td>UCF [<xref ref-type="bibr" rid="ref-9">9</xref>]&#x002A;</td>
<td>TMM&#x2019;23</td>
<td>85.2</td>
<td>89.2</td>
<td>40.5</td>
<td>62.3</td>
<td>82.9</td>
<td>32.3</td>
<td>51.0</td>
<td>74.3</td>
<td>26.1</td>
<td>41.8</td>
<td>65.2</td>
<td>21.8</td>
</tr>
<tr>
<td>Ours (ResNet50)</td>
<td>This paper</td>
<td>87.8</td>
<td><bold>92.1</bold></td>
<td><bold>43.2</bold></td>
<td>62.8</td>
<td>82.8</td>
<td>32.8</td>
<td>53.0</td>
<td>75.6</td>
<td>27.5</td>
<td>43.5</td>
<td>67.3</td>
<td>22.6</td>
</tr>
<tr>
<td><bold>Ours (IBN-ResNet50)</bold></td>
<td><bold>This paper</bold></td>
<td><bold>88.4</bold></td>
<td>91.8</td>
<td>42.6</td>
<td><bold>63.4</bold></td>
<td><bold>83.3</bold></td>
<td><bold>33.2</bold></td>
<td><bold>53</bold>.<bold>2</bold></td>
<td><bold>76.2</bold></td>
<td><bold>28.1</bold></td>
<td><bold>44.2</bold></td>
<td><bold>68.5</bold></td>
<td><bold>22</bold>.<bold>8</bold></td>
</tr>
</tbody>
</table>
</table-wrap>
<p>UDA models (e.g., MMT [<xref ref-type="bibr" rid="ref-28">28</xref>], UCF [<xref ref-type="bibr" rid="ref-9">9</xref>], CTFRN [<xref ref-type="bibr" rid="ref-29">29</xref>], and TDSR [<xref ref-type="bibr" rid="ref-3">3</xref>]) first undergo a fully supervised learning phase in the source domain, followed by an unsupervised training phase in the target domain. These methods primarily address the challenges of data adaptability and domain discrepancies. For instance, MMT learns representations from the source domain data and uses a dual-teacher network to perform joint smoothing operations on the images, thereby facilitating domain adaptation. Our method is fully unsupervised and does not require the use of fully supervised source domain data for training. Our method has been implemented on ResNet50 [<xref ref-type="bibr" rid="ref-36">36</xref>] and IBN-ResNet50 [<xref ref-type="bibr" rid="ref-42">42</xref>] backbone networks. Specifically, the performance is achieved at mAP &#x003D; 43.2%, 32.8%, R-1 &#x003D; 87.8%, 62.8%, and R-5 &#x003D; 92.1%, 82.8% on the ResNet50 backbone for VeRi-776 and VERI-Wild (Test3000), respectively.</p>
<p>The methods SPCL [<xref ref-type="bibr" rid="ref-11">11</xref>], Cluster-Contrast [<xref ref-type="bibr" rid="ref-13">13</xref>], HHCL [<xref ref-type="bibr" rid="ref-25">25</xref>] and CACL [<xref ref-type="bibr" rid="ref-43">43</xref>] are based on contrastive learning techniques. These methods focus on how to optimize features within the data domain. For example, SPCL proposes an automatic contrastive learning framework that makes use of a range of distinct category prototypes to provide hybrid supervision. HHCL proposes instance contrastive learning to mine the information between instances. Although these methods have explored optimizing feature space distribution, they have higher requirements for pseudo labels in data allocation. It is notable that the primary parameters of contrastive learning methods encompass the quantity of input data and update size for each iteration. Under the same momentum update parameters and batch size, our mAP on VeRi-776 and VERI-Wild (Test3000) improves by 6.3% and 5.2%, respectively, in comparison to SPCL.</p>
<p>Furthermore, our method outperforms several label refinement techniques, including UCF [<xref ref-type="bibr" rid="ref-9">9</xref>], Lan et al. [<xref ref-type="bibr" rid="ref-15">15</xref>], ICE [<xref ref-type="bibr" rid="ref-10">10</xref>] and PPLR [<xref ref-type="bibr" rid="ref-16">16</xref>]. These methods employ teacher models or local features for label smoothing to mitigate pseudo-label noise. Although PPLR uses a label smoothing method, like most cluster-based algorithms, it only uses global features for cluster assignment pseudo-labels and relies on the reliability of the clusters. To lessen the inherent bias of global features, our method considers the contextual information of the images during the clustering process. These methods are based on the clustering label smoothing paradigm, and we uphold the consistency of the clustering algorithm, encompassing same clustering parameters, backbone architecture, training epochs and batch size. Compared to PPLR, our mAP in VeRi-776 and VERI-Wild (Test3000) improved by 1.6% and 1.4%, respectively.</p>
</sec>
<sec id="s4_5">
<label>4.5</label>
<title>Further Analysis</title>
<p><bold>Effect of different backbones.</bold> Considering that unsupervised vehicle Re-ID relies on the quality of visual feature extraction, we studied various mainstream CNN visual extraction backbone architectures (PCB, OSNet, DenseNet121, ResNet50) in <xref ref-type="table" rid="table-7">Table 7</xref>. It can be observed that the proposed method achieves optimal Re-ID performance when using ResNet50 as the backbone network. Compared to DenseNet, ResNet establishes residual connections between network blocks without additional parameters. This allows ResNet to maintain efficient performance while having fewer parameters and shorter backpropagation time. In addition, although PCB and OSNet are widely used as universal backbone networks for Re-ID, their scalability is limited because they are mainly designed for pedestrian features, and they cannot effectively extract a wide range of contextual features for the granularity information of vehicles. Therefore, the experimental results and analysis have verified the rationality and superior performance of choosing ResNet50 as the backbone network in this paper.</p>
<table-wrap id="table-7">
<label>Table 7</label>
<caption>
<title>Comparison of different backbones on VeRi-776 and VERI-Wild</title>
</caption>
<table frame="hsides">
<colgroup>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
</colgroup>
<thead>
<tr>
<th>Methods</th>
<th align="center" colspan="3">VeRi-776</th>
<th align="center" colspan="3">VERI-Wild (Test3000)</th>
</tr>
<tr>
<th/>
<th>R-1</th>
<th>R-5</th>
<th>mAP</th>
<th>R-1</th>
<th>R-5</th>
<th>mAP</th>
</tr>
</thead>
<tbody>
<tr>
<td>PCB [<xref ref-type="bibr" rid="ref-44">44</xref>]</td>
<td>78.3</td>
<td>81.8</td>
<td>36.5</td>
<td>52.3</td>
<td>76.5</td>
<td>28.3</td>
</tr>
<tr>
<td>OSNet [<xref ref-type="bibr" rid="ref-45">45</xref>]</td>
<td>84.9</td>
<td>89.3</td>
<td>39.6</td>
<td>59.2</td>
<td>80.6</td>
<td>30.7</td>
</tr>
<tr>
<td>DenseNet121 [<xref ref-type="bibr" rid="ref-46">46</xref>]</td>
<td>83.4</td>
<td>88.0</td>
<td>39.8</td>
<td>56.7</td>
<td>78.6</td>
<td>29.6</td>
</tr>
<tr>
<td><bold>ResNet50 [<xref ref-type="bibr" rid="ref-36">36</xref>]</bold></td>
<td><bold>87.8</bold></td>
<td><bold>92.1</bold></td>
<td><bold>43.2</bold></td>
<td><bold>62.8</bold></td>
<td><bold>82.8</bold></td>
<td><bold>32.8</bold></td>
</tr>
</tbody>
</table>
</table-wrap>
<p><bold>Performance analysis of the model.</bold> We evaluated the performance distinctions between &#x201C;Ours&#x201D; and other unsupervised methodologies in terms of models. Specifically, we conducted a thorough evaluation of the model across two dimensions: spatial complexity and time complexity. To maintain parity in the comparison, the time complexity indicator only uses the time consumption of a single epoch during the model training phase. As delineated in <xref ref-type="table" rid="table-8">Table 8</xref>, Lan et al. [<xref ref-type="bibr" rid="ref-15">15</xref>] introduced a teacher-guided student model optimization framework that markedly escalates the time complexity when contrasted with single contrastive learning techniques such as Cluster-Contrast [<xref ref-type="bibr" rid="ref-13">13</xref>] and HHCL [<xref ref-type="bibr" rid="ref-25">25</xref>] by segmenting the image into three parts for contrastive learning. Furthermore, MMT [<xref ref-type="bibr" rid="ref-28">28</xref>] and CTFRN [<xref ref-type="bibr" rid="ref-29">29</xref>] implement a dual teacher-student model, which collaboratively smooths the labels of their respective region images, thereby incurring supplementary time consumption during the backpropagation process. &#x201C;Ours&#x201D; leverages self-attention to associate network bottleneck blocks to extract contextual features, demanding additional memory resources and failing to exhibit superiority over a solitary ResNet50 architecture. In summary, our approach is at an intermediary level, but within an acceptable performance overhead margin, it demonstrates superior model performance compared to other methods.</p>
<table-wrap id="table-8">
<label>Table 8</label>
<caption>
<title>Compare the performance of the model with other methods. &#x201C;Params&#x201D; represents the size of model parameters and are used to evaluate spatial complexity. &#x201C;Time (VeRi-776)&#x201D; and &#x201C;time (VERI-Wild)&#x201D; represent the running time in each epoch of their respective datasets, which are employed to evaluate time complexity</title>
</caption>
<table frame="hsides">
<colgroup>
<col/>
<col/>
<col/>
<col/>
</colgroup>
<thead>
<tr>
<th>Methods</th>
<th>Params (M)</th>
<th>Time (VeRi-776)</th>
<th>Time (VERI-Wild)</th>
</tr>
</thead>
<tbody>
<tr>
<td>MMT [<xref ref-type="bibr" rid="ref-28">28</xref>]</td>
<td>90</td>
<td>16.2 m</td>
<td>19.3 m</td>
</tr>
<tr>
<td>SPCL [<xref ref-type="bibr" rid="ref-11">11</xref>]</td>
<td>23.5</td>
<td>6.7 m</td>
<td>9.5 m</td>
</tr>
<tr>
<td>HHCL [<xref ref-type="bibr" rid="ref-25">25</xref>]</td>
<td>23.5</td>
<td>8.5 m</td>
<td>10.2 m</td>
</tr>
<tr>
<td>ICE [<xref ref-type="bibr" rid="ref-10">10</xref>]</td>
<td>23.5</td>
<td>9.1 m</td>
<td>12.7 m</td>
</tr>
<tr>
<td>CACL [<xref ref-type="bibr" rid="ref-43">43</xref>]</td>
<td>44.9</td>
<td>14.6 m</td>
<td>19.2 m</td>
</tr>
<tr>
<td>CTFRN [<xref ref-type="bibr" rid="ref-29">29</xref>]</td>
<td>90</td>
<td>15.2 m</td>
<td>20.6 m</td>
</tr>
<tr>
<td>Cluster-Contrast [<xref ref-type="bibr" rid="ref-13">13</xref>]</td>
<td>23.5</td>
<td>7.7 m</td>
<td>11.4 m</td>
</tr>
<tr>
<td>PPLR [<xref ref-type="bibr" rid="ref-16">16</xref>]</td>
<td>23.5</td>
<td>7.3 m</td>
<td>10.5 m</td>
</tr>
<tr>
<td>Lan et al. [<xref ref-type="bibr" rid="ref-15">15</xref>]</td>
<td>44.9</td>
<td>12.3 m</td>
<td>15.6 m</td>
</tr>
<tr>
<td>UCF [<xref ref-type="bibr" rid="ref-9">9</xref>]</td>
<td>44.9</td>
<td>10.2 m</td>
<td>14.1 m</td>
</tr>
<tr>
<td>Ours (ResNet50)</td>
<td>33.4</td>
<td>9.5 m</td>
<td>13.8 m</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4_6">
<label>4.6</label>
<title>Visual Quality</title>
<p><bold>T-SNE visualization:</bold> To intuitively demonstrate the clustering effectiveness of our proposed method, we employed T-SNE [<xref ref-type="bibr" rid="ref-47">47</xref>] to analyze the feature extraction results of different components. We randomly selected 15 samples from the VeRi-776 dataset, with each category represented by one sample. As illustrated in <xref ref-type="fig" rid="fig-5">Fig. 5</xref>, considering the red ID sample, the features extracted by the &#x201C;baseline&#x201D; have a relatively scattered distribution in the feature space, making it difficult to effectively distinguish individual samples. The DCL method demonstrates a more focused feature distribution for simple data, yet it is not sufficiently distinct for distinguishing challenging samples. Following the integration of the CPLR, the discriminability of the feature distribution has improved; however, the features are still not sufficiently concentrated within the feature space, leading to confusion with similar data. By combining both modules, the extracted features demonstrate a more tightly clustered distribution within the feature space, facilitating clear differentiation between categories.</p>
<fig id="fig-5">
<label>Figure 5</label>
<caption>
<title>We selected 15 categories from VeRi-776 for T-SNE of different ablation modules, with different color points representing different categories. (a) Baseline; (b) w/ DCL; (c) w/ CPLR; (d) Ours</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_58586-fig-5.tif"/>
</fig>
<p><bold>Cluster pseudo-label quality:</bold> To answer <bold>RQ3</bold>, we followed the pair-wise precision proposed by Wang et al. [<xref ref-type="bibr" rid="ref-48">48</xref>] to evaluate the quality of the pseudo labels generated by our method and the baseline during clustering. Due to the limitations of DBSCAN on clustering results, we uniformly set the maximum distance d to 0.7. Firstly, we construct <inline-formula id="ieqn-114"><mml:math id="mml-ieqn-114"><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, <inline-formula id="ieqn-115"><mml:math id="mml-ieqn-115"><mml:msub><mml:mi mathvariant="normal">&#x03A9;</mml:mi><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> matrices for the entire sample. <inline-formula id="ieqn-116"><mml:math id="mml-ieqn-116"><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> indicates whether samples i and j share the same pseudo label, where <inline-formula id="ieqn-117"><mml:math id="mml-ieqn-117"><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:math></inline-formula> represents the same cluster, <inline-formula id="ieqn-118"><mml:math id="mml-ieqn-118"><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:math></inline-formula> represents different clusters. 
Similar usage of <inline-formula id="ieqn-119"><mml:math id="mml-ieqn-119"><mml:msub><mml:mi mathvariant="normal">&#x03A9;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> represents whether samples i and j have the same true label, <inline-formula id="ieqn-120"><mml:math id="mml-ieqn-120"><mml:msub><mml:mi mathvariant="normal">&#x03A9;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:math></inline-formula> represents a same cluster, <inline-formula id="ieqn-121"><mml:math id="mml-ieqn-121"><mml:msub><mml:mi mathvariant="normal">&#x03A9;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:math></inline-formula> represents different clusters. We use TP to represent positive sample pairs and FP to represent negative sample pairs. However, when calculating TP and FP, we consider the existence of outliers. 
The accuracy P of clustering is calculated as: <inline-formula id="ieqn-122"><mml:math id="mml-ieqn-122"><mml:mi>P</mml:mi><mml:mo>=</mml:mo><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>P</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>, where TP denotes <inline-formula id="ieqn-123"><mml:math id="mml-ieqn-123"><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mspace width="thinmathspace" /><mml:mi mathvariant="normal">&#x0026;</mml:mi><mml:mspace width="thinmathspace" /><mml:msub><mml:mi mathvariant="normal">&#x03A9;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:math></inline-formula>, FP denotes <inline-formula id="ieqn-124"><mml:math id="mml-ieqn-124"><mml:msub><mml:mi>&#x03C9;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mspace width="thinmathspace" /><mml:mi mathvariant="normal">&#x0026;</mml:mi><mml:mspace width="thinmathspace" /><mml:msub><mml:mi mathvariant="normal">&#x03A9;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:math></inline-formula>. As shown in <xref ref-type="fig" rid="fig-6">Fig. 6</xref>, our method significantly improves the accuracy of clustering, with higher quality clustering and fewer outliers, providing reliable pseudo labels for model training.</p>
<fig id="fig-6">
<label>Figure 6</label>
<caption>
<title>Quantification of pseudo label quality (%) for the proposed method and baseline on the VeRi-776 dataset. Pair-wise precision represents the accuracy of clustering</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_58586-fig-6.tif"/>
</fig>
<p><bold>Rank-list visualization:</bold> To verify the qualitative outcomes of our proposed method, we compared the rank-list visualization with the &#x201C;Baseline&#x201D; and chose three different viewing angles for the retrieval. As shown in <xref ref-type="fig" rid="fig-7">Fig. 7</xref>, the experimental results reveal that the &#x201C;Baseline&#x201D; tends to include images with similar backgrounds and resolutions in the query results, leading to incorrect matches that resemble the query samples in appearance and viewpoint. In contrast, &#x201C;Ours&#x201D; mitigates the disturbances from variations in viewpoint, background, and lighting intensity. This indicates that our approach can better distinguish vehicles by capturing their contextual information and can effectively reject negative samples.</p>
<fig id="fig-7">
<label>Figure 7</label>
<caption>
<title>Retrieve Rank-5 visualization. Matched and unmatched images are marked in green and red, respectively. (a) Baseline; (b) w/ DCL; (c) w/ CPLR; (d) Ours</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_58586-fig-7.tif"/>
</fig>
<p><bold>Attention map visualization:</bold> To answer <bold>RQ4</bold>, we utilized Grad-CAM [<xref ref-type="bibr" rid="ref-49">49</xref>] for the ablation analysis of model feature predictions. We randomly selected three groups of images with different views, as shown in <xref ref-type="fig" rid="fig-8">Fig. 8</xref>. The focus distribution of the &#x201C;Baseline&#x201D; is relatively scattered, making it susceptible to viewpoint changes. However, with the integration of the CPLR module, additional contextual information is incorporated. This suggests that semantic contextual features can focus on fine-grained information within the image, resulting in more precise prediction outcomes. When DCL is combined with CPLR, the attention information in the image is further amplified.</p>
<fig id="fig-8">
<label>Figure 8</label>
<caption>
<title>Visual analysis of Grad CAM ablation models for three sample vehicles. (a) Baseline; (b) w/ DCL; (c) w/ CPLR; (d) Ours</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_58586-fig-8.tif"/>
</fig>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Conclusion and Future Works</title>
<p>In this work, we propose a novel unsupervised vehicle Re-ID framework to mitigate the label noise and data domain distribution problem. First, we design a DCL training method to optimize the distance distribution of data features, which effectively improves the imbalance of data domain distribution through online joint training of teachers and dual contrastive networks. Furthermore, we introduce a CPLR strategy that progressively integrates granular information from various layers of the network to extract contextual features, thereby generating more reliable pseudo-labels in conjunction with global features. Extensive experiments have confirmed the effectiveness of our approach. In future work, we will extend our research to vehicle Re-ID in video streams, with a particular emphasis on enhancing the model&#x2019;s comprehension of spatial-temporal features. Additionally, we will address the challenges posed by occlusion and variations in image quality, while further investigating the performance of vehicle Re-ID in real-world scenarios.</p>
</sec>
</body>
<back>
<ack><p>The authors thank all editors and anonymous reviewers for suggestions, as well as to all members who have supported and contributed to this work.</p>
</ack>
<sec><title>Funding Statement</title>
<p>This work was supported by the National Natural Science Foundation of China under Grant Nos. 62461037, 62076117 and 62166026, the Jiangxi Provincial Natural Science Foundation under Grant Nos. 20224BAB212011, 20232BAB202051, 20232BAB212008 and 20242BAB25078 and the Jiangxi Provincial Key Laboratory of Virtual Reality under Grant No. 2024SSY03151.</p>
</sec>
<sec><title>Author Contributions</title>
<p>Study conception and design: Jiyang Xu, Qi Wang, Xin Xiong, Weidong Min; data collection: Jiang Luo, Di Gai, Qing Han; analysis and interpretation of results: Jiyang Xu, Qi Wang, Weidong Min, Di Gai; draft manuscript preparation: Jiyang Xu, Qi Wang, Xin Xiong. All authors reviewed the results and approved the final version of the manuscript.</p>
</sec>
<sec sec-type="data-availability"><title>Availability of Data and Materials</title>
<p>Data will be made available on request.</p>
</sec>
<sec><title>Ethics Approval</title>
<p>Not applicable.</p>
</sec>
<sec sec-type="COI-statement"><title>Conflicts of Interest</title>
<p>The authors declare no conflicts of interest to report regarding the present study.</p>
</sec>
<ref-list content-type="authoryear">
<title>References</title>
<ref id="ref-1"><label>[1]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Li</surname> <given-names>H</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>J</given-names></string-name>, <string-name><surname>Zheng</surname> <given-names>A</given-names></string-name>, <string-name><surname>Wu</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Luo</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>Day-night cross-domain vehicle re-identification</article-title>. In: <source>IEEE/CVF Conference on Computer Vision and Pattern Recognition</source>; <year>2024 Jun</year>. p. <fpage>12626</fpage>&#x2013;<lpage>35</lpage>. doi:<pub-id pub-id-type="doi">10.1109/cvpr52733.2024.01200</pub-id>.</mixed-citation></ref>
<ref id="ref-2"><label>[2]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Zhang</surname> <given-names>X</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>X</given-names></string-name>, <string-name><surname>Sun</surname> <given-names>W</given-names></string-name>, <string-name><surname>He</surname> <given-names>X</given-names></string-name></person-group>. <article-title>Vehicle re-identification model based on optimized DenseNet121 with joint loss</article-title>. <source>Comput Mater Contin</source>. <year>2021 Jan</year>;<volume>67</volume>(<issue>3</issue>):<fpage>3933</fpage>&#x2013;<lpage>48</lpage>. doi:<pub-id pub-id-type="doi">10.32604/cmc.2021.016560</pub-id>.</mixed-citation></ref>
<ref id="ref-3"><label>[3]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Wei</surname> <given-names>R</given-names></string-name>, <string-name><surname>Gu</surname> <given-names>J</given-names></string-name>, <string-name><surname>He</surname> <given-names>S</given-names></string-name>, <string-name><surname>Jiang</surname> <given-names>W</given-names></string-name></person-group>. <article-title>Transformer-based domain-specific representation for unsupervised domain adaptive vehicle re-identification</article-title>. <source>IEEE Trans Intell Transp Syst</source>. <year>2022 Dec</year>;<volume>24</volume>(<issue>3</issue>):<fpage>2935</fpage>&#x2013;<lpage>46</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TITS.2022.3225025</pub-id>.</mixed-citation></ref>
<ref id="ref-4"><label>[4]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Wang</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Min</surname> <given-names>W</given-names></string-name>, <string-name><surname>Han</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Zha</surname> <given-names>C</given-names></string-name>, <string-name><surname>Zhao</surname> <given-names>H</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>Inter-domain adaptation label for data augmentation in vehicle re-identification</article-title>. <source>IEEE Trans Multimed</source>. <year>2021 Aug</year>;<volume>24</volume>:<fpage>1031</fpage>&#x2013;<lpage>41</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TMM.2021.3104141</pub-id>.</mixed-citation></ref>
<ref id="ref-5"><label>[5]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Sun</surname> <given-names>W</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>X</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>X</given-names></string-name>, <string-name><surname>Dai</surname> <given-names>G</given-names></string-name>, <string-name><surname>Chang</surname> <given-names>P</given-names></string-name>, <string-name><surname>He</surname> <given-names>X</given-names></string-name></person-group>. <article-title>A multi-feature learning model with enhanced local attention for vehicle re-identification</article-title>. <source>Comput Mater Contin</source>. <year>2021 Jan</year>;<volume>69</volume>(<issue>3</issue>):<fpage>3549</fpage>&#x2013;<lpage>61</lpage>. doi:<pub-id pub-id-type="doi">10.32604/cmc.2021.021627</pub-id>.</mixed-citation></ref>
<ref id="ref-6"><label>[6]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Li</surname> <given-names>J</given-names></string-name>, <string-name><surname>Pang</surname> <given-names>M</given-names></string-name>, <string-name><surname>Dong</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Jia</surname> <given-names>J</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>B</given-names></string-name></person-group>. <article-title>Graph neural network explanations are fragile</article-title>. <source>Proc 41st Int Conf Mach Learn</source>. <year>2024</year>;<volume>235</volume>:<fpage>28551</fpage>&#x2013;<lpage>67</lpage>.</mixed-citation></ref>
<ref id="ref-7"><label>[7]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Pang</surname> <given-names>M</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>B</given-names></string-name>, <string-name><surname>Ye</surname> <given-names>M</given-names></string-name>, <string-name><surname>Cheung</surname> <given-names>Y-M</given-names></string-name>, <string-name><surname>Zhou</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Huang</surname> <given-names>W</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>Heterogeneous prototype learning from contaminated faces across domains via disentangling latent factors</article-title>. <source>IEEE Trans Neural Netw Learn Syst</source>. <year>2024 Jan</year>;<fpage>1</fpage>&#x2013;<lpage>15</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TNNLS.2024.3393072</pub-id>; <pub-id pub-id-type="pmid">38691434</pub-id></mixed-citation></ref>
<ref id="ref-8"><label>[8]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Zheng</surname> <given-names>A</given-names></string-name>, <string-name><surname>Sun</surname> <given-names>X</given-names></string-name>, <string-name><surname>Li</surname> <given-names>C</given-names></string-name>, <string-name><surname>Tang</surname> <given-names>J</given-names></string-name></person-group>. <article-title>Viewpoint-aware progressive clustering for unsupervised vehicle re-identification</article-title>. <source>IEEE Trans Intell Transp Syst</source>. <year>2021 Aug</year>;<volume>23</volume>(<issue>8</issue>):<fpage>11422</fpage>&#x2013;<lpage>35</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TITS.2021.3103961</pub-id>.</mixed-citation></ref>
<ref id="ref-9"><label>[9]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Wang</surname> <given-names>P</given-names></string-name>, <string-name><surname>Ding</surname> <given-names>C</given-names></string-name>, <string-name><surname>Tan</surname> <given-names>W</given-names></string-name>, <string-name><surname>Gong</surname> <given-names>M</given-names></string-name>, <string-name><surname>Jia</surname> <given-names>K</given-names></string-name>, <string-name><surname>Tao</surname> <given-names>D</given-names></string-name></person-group>. <article-title>Uncertainty-aware clustering for unsupervised domain adaptive object re-identification</article-title>. <source>IEEE Trans Multimed</source>. <year>2022 Feb</year>;<volume>25</volume>:<fpage>2624</fpage>&#x2013;<lpage>35</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TMM.2022.3149629</pub-id>.</mixed-citation></ref>
<ref id="ref-10"><label>[10]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Chen</surname> <given-names>H</given-names></string-name>, <string-name><surname>Lagadec</surname> <given-names>B</given-names></string-name>, <string-name><surname>Bremond</surname> <given-names>F</given-names></string-name></person-group>. <article-title>ICE: inter-instance contrastive encoding for unsupervised person re-identification</article-title>. In: <conf-name>Proceedings of the 2021 IEEE/CVF International Conference on Computer Vision (ICCV)</conf-name>; <year>2021 Oct</year>. doi:<pub-id pub-id-type="doi">10.1109/iccv48922.2021.01469</pub-id>.</mixed-citation></ref>
<ref id="ref-11"><label>[11]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Ge</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Zhu</surname> <given-names>F</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>D</given-names></string-name>, <string-name><surname>Zhao</surname> <given-names>R</given-names></string-name>, <string-name><surname>Li</surname> <given-names>H</given-names></string-name></person-group>. <article-title>Self-paced contrastive learning with hybrid memory for domain adaptive object Re-ID</article-title>. <source>Neural Inform Process Syst</source>. <year>2020 Jun</year>;<volume>33</volume>:<fpage>11309</fpage>&#x2013;<lpage>21</lpage>.</mixed-citation></ref>
<ref id="ref-12"><label>[12]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Lu</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Lin</surname> <given-names>R</given-names></string-name>, <string-name><surname>He</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Hu</surname> <given-names>H</given-names></string-name></person-group>. <article-title>Mask-aware pseudo label denoising for unsupervised vehicle re-identification</article-title>. <source>IEEE Trans Intell Transp Syst</source>. <year>2023 Jan</year>;<volume>24</volume>(<issue>4</issue>):<fpage>4333</fpage>&#x2013;<lpage>47</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TITS.2022.3233565</pub-id>.</mixed-citation></ref>
<ref id="ref-13"><label>[13]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Dai</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>G</given-names></string-name>, <string-name><surname>Yuan</surname> <given-names>W</given-names></string-name>, <string-name><surname>Zhu</surname> <given-names>S</given-names></string-name>, <string-name><surname>Tan</surname> <given-names>P</given-names></string-name></person-group>. <article-title>Cluster contrast for unsupervised person re-identification</article-title>. In: <conf-name>Proceedings of the Asian Conference on Computer Vision</conf-name>; <year>2022</year>. p. <fpage>1142</fpage>&#x2013;<lpage>60</lpage>.</mixed-citation></ref>
<ref id="ref-14"><label>[14]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Pang</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>C</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>J</given-names></string-name>, <string-name><surname>Zhao</surname> <given-names>L</given-names></string-name></person-group>. <article-title>Reliability modeling and contrastive learning for unsupervised person re-identification</article-title>. <source>Knowl-Based Syst</source>. <year>2023 Jan</year>;<volume>263</volume>(<issue>6</issue>):<fpage>110263</fpage>. doi:<pub-id pub-id-type="doi">10.1016/j.knosys.2023.110263</pub-id>.</mixed-citation></ref>
<ref id="ref-15"><label>[15]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Lan</surname> <given-names>L</given-names></string-name>, <string-name><surname>Teng</surname> <given-names>X</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>J</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>X</given-names></string-name>, <string-name><surname>Tao</surname> <given-names>D</given-names></string-name></person-group>. <article-title>Learning to purification for unsupervised person re-identification</article-title>. <source>IEEE Trans Image Process</source>. <year>2023 Jan</year>;<volume>32</volume>(<issue>34</issue>):<fpage>3338</fpage>&#x2013;<lpage>53</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TIP.2023.3278860</pub-id>; <pub-id pub-id-type="pmid">37235471</pub-id></mixed-citation></ref>
<ref id="ref-16"><label>[16]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Cho</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Kim</surname> <given-names>WJ</given-names></string-name>, <string-name><surname>Hong</surname> <given-names>S</given-names></string-name>, <string-name><surname>Yoon</surname> <given-names>S-E</given-names></string-name></person-group>. <article-title>Part-based pseudo label refinement for unsupervised person re-identification</article-title>. In: <conf-name>Proceedings of the 2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>; <year>2022 Jun</year>. doi:<pub-id pub-id-type="doi">10.1109/cvpr52688.2022.00716</pub-id>.</mixed-citation></ref>
<ref id="ref-17"><label>[17]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Ni</surname> <given-names>H</given-names></string-name>, <string-name><surname>Li</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Gao</surname> <given-names>L</given-names></string-name>, <string-name><surname>Shen</surname> <given-names>HT</given-names></string-name>, <string-name><surname>Song</surname> <given-names>J</given-names></string-name></person-group>. <article-title>Part-aware transformer for generalizable person re-identification</article-title>. In: <conf-name>2023 IEEE/CVF International Conference on Computer Vision (ICCV)</conf-name>; <year>2023</year>. doi:<pub-id pub-id-type="doi">10.1109/iccv51070.2023.01036</pub-id>.</mixed-citation></ref>
<ref id="ref-18"><label>[18]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Zhang</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>H</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>S</given-names></string-name>, <string-name><surname>Xie</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Durrani</surname> <given-names>TS</given-names></string-name></person-group>. <article-title>Part-guided graph convolution networks for person re-identification</article-title>. <source>Pattern Recognit</source>. <year>2021 Jun</year>;<volume>120</volume>(<issue>3</issue>):<fpage>108155</fpage>. doi:<pub-id pub-id-type="doi">10.1016/j.patcog.2021.108155</pub-id>.</mixed-citation></ref>
<ref id="ref-19"><label>[19]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>He</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Lu</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Hu</surname> <given-names>H</given-names></string-name></person-group>. <article-title>Graph-based progressive fusion network for multi-modality vehicle re-identification</article-title>. <source>IEEE Trans Intell Transp Syst</source>. <year>2023 Jun</year>;<volume>24</volume>(<issue>11</issue>):<fpage>12431</fpage>&#x2013;<lpage>47</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TITS.2023.3285758</pub-id>.</mixed-citation></ref>
<ref id="ref-20"><label>[20]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Wang</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Zhong</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Min</surname> <given-names>W</given-names></string-name>, <string-name><surname>Zhao</surname> <given-names>H</given-names></string-name>, <string-name><surname>Gai</surname> <given-names>D</given-names></string-name>, <string-name><surname>Han</surname> <given-names>Q</given-names></string-name></person-group>. <article-title>Dual similarity pre-training and domain difference encouragement learning for vehicle re-identification in the wild</article-title>. <source>Pattern Recognit</source>. <year> 2023 Jul</year>;<volume>139</volume>(<issue>2</issue>):<fpage>109513</fpage>. doi:<pub-id pub-id-type="doi">10.1016/j.patcog.2023.109513</pub-id>.</mixed-citation></ref>
<ref id="ref-21"><label>[21]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Ding</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Fan</surname> <given-names>H</given-names></string-name>, <string-name><surname>Xu</surname> <given-names>M</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>Adaptive exploration for unsupervised person re-identification</article-title>. <source>ACM Trans Multimed Comput Commun Appl</source>. <year>2020 Feb</year>;<volume>16</volume>(<issue>1</issue>):<fpage>1</fpage>&#x2013;<lpage>19</lpage>. doi:<pub-id pub-id-type="doi">10.1145/3369393</pub-id>.</mixed-citation></ref>
<ref id="ref-22"><label>[22]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Zhong</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Zheng</surname> <given-names>L</given-names></string-name>, <string-name><surname>Luo</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Li</surname> <given-names>S</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>Invariance matters: Exemplar memory for domain adaptive person re-identification</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>; <year>2019</year>. p. <fpage>598</fpage>&#x2013;<lpage>607</lpage>.</mixed-citation></ref>
<ref id="ref-23"><label>[23]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>He</surname> <given-names>K</given-names></string-name>, <string-name><surname>Fan</surname> <given-names>H</given-names></string-name>, <string-name><surname>Wu</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Xie</surname> <given-names>S</given-names></string-name>, <string-name><surname>Girshick</surname> <given-names>R</given-names></string-name></person-group>. <article-title>Momentum contrast for unsupervised visual representation learning</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>; <year>2020</year>. p. <fpage>9729</fpage>&#x2013;<lpage>38</lpage>.</mixed-citation></ref>
<ref id="ref-24"><label>[24]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Qiu</surname> <given-names>M</given-names></string-name>, <string-name><surname>Lu</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Li</surname> <given-names>X</given-names></string-name>, <string-name><surname>Lu</surname> <given-names>Q</given-names></string-name></person-group>. <article-title>Camera-aware differentiated clustering with focal contrastive learning for unsupervised vehicle re-identification</article-title>. <source>IEEE Trans Circuits Syst Video Technol</source>. <year> 2024 Jan</year>;<volume>34</volume>(<issue>10</issue>):<fpage>10121</fpage>&#x2013;<lpage>34</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TCSVT.2024.3402109</pub-id>.</mixed-citation></ref>
<ref id="ref-25"><label>[25]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Hu</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Zhu</surname> <given-names>C</given-names></string-name>, <string-name><surname>He</surname> <given-names>G</given-names></string-name></person-group>. <article-title>Hard-sample guided hybrid contrast learning for unsupervised person re-identification</article-title>. In: <conf-name>2021 7th IEEE International Conference on Network Intelligence and Digital Content (IC-NIDC)</conf-name>; <year>2021</year>. p. <fpage>91</fpage>&#x2013;<lpage>5</lpage>. doi:<pub-id pub-id-type="doi">10.1109/ic-nidc54101.2021.9660560</pub-id>.</mixed-citation></ref>
<ref id="ref-26"><label>[26]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Yang</surname> <given-names>J</given-names></string-name>, <string-name><surname>Fang</surname> <given-names>J</given-names></string-name>, <string-name><surname>Xu</surname> <given-names>H</given-names></string-name></person-group>. <article-title>ConMAE: Contour guided MAE for unsupervised vehicle re-identification</article-title>. In: <conf-name>2023 35th Chinese Control and Decision Conference (CCDC)</conf-name>; <year>2023 May</year>; <publisher-loc>Yichang, China</publisher-loc>. p. <fpage>4616</fpage>&#x2013;<lpage>22</lpage>. doi:<pub-id pub-id-type="doi">10.1109/CCDC58219.2023.10327202</pub-id>.</mixed-citation></ref>
<ref id="ref-27"><label>[27]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Hinton</surname> <given-names>G</given-names></string-name>, <string-name><surname>Vinyals</surname> <given-names>O</given-names></string-name>, <string-name><surname>Dean</surname> <given-names>J</given-names></string-name></person-group>. <article-title>Distilling the knowledge in a neural network</article-title>. <comment>arXiv:1503.02531. 2015</comment>.</mixed-citation></ref>
<ref id="ref-28"><label>[28]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Ge</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>D</given-names></string-name>, <string-name><surname>Li</surname> <given-names>H</given-names></string-name></person-group>. <article-title>Mutual mean-teaching: Pseudo label refinery for unsupervised domain adaptation on person re-identification</article-title>. In: <conf-name>Proceedings of the International Conference on Learning Representations</conf-name>; <year>2020</year>.</mixed-citation></ref>
<ref id="ref-29"><label>[29]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Zheng</surname> <given-names>D</given-names></string-name>, <string-name><surname>Xiao</surname> <given-names>J</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>K</given-names></string-name>, <string-name><surname>Huang</surname> <given-names>X</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>L</given-names></string-name>, <string-name><surname>Zhao</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>Soft pseudo-label shrinkage for unsupervised domain adaptive person re-identification</article-title>. <source>Pattern Recognit</source>. <year>2022 Feb</year>;<volume>127</volume>(<issue>2</issue>):<fpage>108615</fpage>. doi:<pub-id pub-id-type="doi">10.1016/j.patcog.2022.108615</pub-id>.</mixed-citation></ref>
<ref id="ref-30"><label>[30]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Shi</surname> <given-names>J</given-names></string-name>, <string-name><surname>Zheng</surname> <given-names>S</given-names></string-name>, <string-name><surname>Yin</surname> <given-names>X</given-names></string-name>, <string-name><surname>Lu</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Xie</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Qu</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>CLIP-guided federated learning on heterogeneity and long-tailed data</article-title>. <source>Proc AAAI Conf Artif Intell</source>. <year>2024 Mar</year>;<volume>38</volume>(<issue>13</issue>):<fpage>14955</fpage>&#x2013;<lpage>63</lpage>. doi:<pub-id pub-id-type="doi">10.1609/aaai.v38i13.29416</pub-id>.</mixed-citation></ref>
<ref id="ref-31"><label>[31]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Hermans</surname> <given-names>A</given-names></string-name>, <string-name><surname>Beyer</surname> <given-names>L</given-names></string-name>, <string-name><surname>Leibe</surname> <given-names>B</given-names></string-name></person-group>. <article-title>In defense of the triplet loss for person re-identification</article-title>. <comment>arXiv:1703.07737. 2017</comment>.</mixed-citation></ref>
<ref id="ref-32"><label>[32]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Ester</surname> <given-names>M</given-names></string-name>, <string-name><surname>Kriegel</surname> <given-names>H-P</given-names></string-name>, <string-name><surname>Sander</surname> <given-names>J</given-names></string-name>, <string-name><surname>Xu</surname> <given-names>X</given-names></string-name></person-group>. <article-title>A density-based algorithm for discovering clusters in large spatial databases with noise</article-title>. In: <conf-name>Proceedings of the Second International Conference on Knowledge Discovery and Data Mining</conf-name>; <year>1996 Jan</year>. p. <fpage>226</fpage>&#x2013;<lpage>31</lpage>.</mixed-citation></ref>
<ref id="ref-33"><label>[33]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Liu</surname> <given-names>H</given-names></string-name>, <string-name><surname>Tian</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Pang</surname> <given-names>L</given-names></string-name>, <string-name><surname>Huang</surname> <given-names>T</given-names></string-name></person-group>. <article-title>Deep relative distance learning: tell the difference between similar vehicles</article-title>. In: <conf-name> IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>; <year>2016 Jun</year>. p. <fpage>2167</fpage>&#x2013;<lpage>75</lpage>. doi:<pub-id pub-id-type="doi">10.1109/cvpr.2016.238</pub-id>.</mixed-citation></ref>
<ref id="ref-34"><label>[34]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Lou</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Bai</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>J</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>S</given-names></string-name>, <string-name><surname>Duan</surname> <given-names>L</given-names></string-name></person-group>. <article-title>VERI-Wild: A large dataset and a new method for vehicle re-identification in the wild</article-title>. <source>Comput Vis Pattern Recognit</source>. <year>2019 Jun</year>;<fpage>3230</fpage>&#x2013;<lpage>8</lpage>. doi:<pub-id pub-id-type="doi">10.1109/cvpr.2019.00335</pub-id>.</mixed-citation></ref>
<ref id="ref-35"><label>[35]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Zheng</surname> <given-names>L</given-names></string-name>, <string-name><surname>Shen</surname> <given-names>L</given-names></string-name>, <string-name><surname>Tian</surname> <given-names>L</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>S</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>J</given-names></string-name>, <string-name><surname>Tian</surname> <given-names>Q</given-names></string-name></person-group>. <article-title>Scalable person re-identification: a benchmark</article-title>. In: <source>IEEE/CVF Conference on Computer Vision</source>; <year>2015 Dec</year>. p. <fpage>1116</fpage>&#x2013;<lpage>24</lpage>. doi:<pub-id pub-id-type="doi">10.1109/iccv.2015.133</pub-id>.</mixed-citation></ref>
<ref id="ref-36"><label>[36]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>He</surname> <given-names>K</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>X</given-names></string-name>, <string-name><surname>Ren</surname> <given-names>S</given-names></string-name>, <string-name><surname>Sun</surname> <given-names>J</given-names></string-name></person-group>. <article-title>Deep residual learning for image recognition</article-title>. In: <source>IEEE/CVF Conference on Computer Vision and Pattern Recognition</source>; <year>2016 Jun</year>. doi:<pub-id pub-id-type="doi">10.1109/cvpr.2016.90</pub-id>.</mixed-citation></ref>
<ref id="ref-37"><label>[37]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Deng</surname> <given-names>J</given-names></string-name>, <string-name><surname>Dong</surname> <given-names>W</given-names></string-name>, <string-name><surname>Socher</surname> <given-names>R</given-names></string-name>, <string-name><surname>Li</surname> <given-names>L-J</given-names></string-name>, <string-name><surname>Li</surname> <given-names>K</given-names></string-name>, <string-name><surname>Fei-Fei</surname> <given-names>L</given-names></string-name></person-group>. <article-title>ImageNet: a large-scale hierarchical image database</article-title>. In: <conf-name>2009 IEEE Conference on Computer Vision and Pattern Recognition</conf-name>; <year>2009 Jun</year>. doi:<pub-id pub-id-type="doi">10.1109/cvpr.2009.5206848</pub-id>.</mixed-citation></ref>
<ref id="ref-38"><label>[38]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Zhong</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Zheng</surname> <given-names>L</given-names></string-name>, <string-name><surname>Kang</surname> <given-names>G</given-names></string-name>, <string-name><surname>Li</surname> <given-names>S</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>Random erasing data augmentation</article-title>. <source>Proc AAAI Conf Artif Intell</source>. <year>2020 Apr</year>;<volume>34</volume>(<issue>7</issue>):<fpage>13001</fpage>&#x2013;<lpage>8</lpage>. doi:<pub-id pub-id-type="doi">10.1609/aaai.v34i07.7000</pub-id>.</mixed-citation></ref>
<ref id="ref-39"><label>[39]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Szegedy</surname> <given-names>C</given-names></string-name>, <string-name><surname>Vanhoucke</surname> <given-names>V</given-names></string-name>, <string-name><surname>Ioffe</surname> <given-names>S</given-names></string-name>, <string-name><surname>Shlens</surname> <given-names>J</given-names></string-name>, <string-name><surname>Wojna</surname> <given-names>Z</given-names></string-name></person-group>. <article-title>Rethinking the inception architecture for computer vision</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>; <year>2016</year>. p. <fpage>2818</fpage>&#x2013;<lpage>26</lpage>.</mixed-citation></ref>
<ref id="ref-40"><label>[40]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Zhang</surname> <given-names>C-B</given-names></string-name>, <string-name><surname>Jiang</surname> <given-names>P-T</given-names></string-name>, <string-name><surname>Hou</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Wei</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Han</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Li</surname> <given-names>Z</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>Delving deep into label smoothing</article-title>. <source>IEEE Trans Image Process</source>. <year>2021 Jan</year>;<volume>30</volume>:<fpage>5984</fpage>&#x2013;<lpage>96</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TIP.2021.3089942</pub-id>; <pub-id pub-id-type="pmid">34166191</pub-id></mixed-citation></ref>
<ref id="ref-41"><label>[41]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Zhang</surname> <given-names>X</given-names></string-name>, <string-name><surname>Ge</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Qiao</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Li</surname> <given-names>H</given-names></string-name></person-group>. <article-title>Refining pseudo labels with clustering consensus over generations for unsupervised object re-identification</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>; <year>2021 Jun</year>. p. <fpage>3435</fpage>&#x2013;<lpage>44</lpage>. doi:<pub-id pub-id-type="doi">10.1109/cvpr46437.2021.00344</pub-id>.</mixed-citation></ref>
<ref id="ref-42"><label>[42]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Pan</surname> <given-names>X</given-names></string-name>, <string-name><surname>Luo</surname> <given-names>P</given-names></string-name>, <string-name><surname>Shi</surname> <given-names>J</given-names></string-name>, <string-name><surname>Tang</surname> <given-names>X</given-names></string-name></person-group>. <article-title>Two at once: enhancing learning and generalization capacities via IBN-Net</article-title>. In: <conf-name>Proceedings of the European Conference on Computer Vision (ECCV)</conf-name>; <year>2018</year>. p. <fpage>484</fpage>&#x2013;<lpage>500</lpage>. doi:<pub-id pub-id-type="doi">10.1007/978-3-030-01225-0_29</pub-id>.</mixed-citation></ref>
<ref id="ref-43"><label>[43]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Li</surname> <given-names>M</given-names></string-name>, <string-name><surname>Li</surname> <given-names>C-G</given-names></string-name>, <string-name><surname>Guo</surname> <given-names>J</given-names></string-name></person-group>. <article-title>Cluster-guided asymmetric contrastive learning for unsupervised person re-identification</article-title>. <source>IEEE Trans Image Process</source>. <year>2022 Jan</year>;<volume>31</volume>:<fpage>3606</fpage>&#x2013;<lpage>17</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TIP.2022.3173163</pub-id>; <pub-id pub-id-type="pmid">35576408</pub-id></mixed-citation></ref>
<ref id="ref-44"><label>[44]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Sun</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Zheng</surname> <given-names>L</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Tian</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>S</given-names></string-name></person-group>. <article-title>Beyond part models: person retrieval with refined part pooling (and a strong convolutional baseline)</article-title>. In: <conf-name> Proceedings of the European Conference on Computer Vision (ECCV)</conf-name>; <year>2018</year>. p. <fpage>480</fpage>&#x2013;<lpage>96</lpage>.</mixed-citation></ref>
<ref id="ref-45"><label>[45]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Zhou</surname> <given-names>K</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Cavallaro</surname> <given-names>A</given-names></string-name>, <string-name><surname>Xiang</surname> <given-names>T</given-names></string-name></person-group>. <article-title>Omni-scale feature learning for person re-identification</article-title>. In: <conf-name>Proceedings of the IEEE/CVF International Conference on Computer Vision</conf-name>; <year>2019 Oct</year>. p. <fpage>3701</fpage>&#x2013;<lpage>11</lpage>. doi:<pub-id pub-id-type="doi">10.1109/iccv.2019.00380</pub-id>.</mixed-citation></ref>
<ref id="ref-46"><label>[46]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Huang</surname> <given-names>G</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>Z</given-names></string-name>, <string-name><surname>van der Maaten</surname> <given-names>L</given-names></string-name>, <string-name><surname>Weinberger</surname> <given-names>KQ</given-names></string-name></person-group>. <article-title>Densely connected convolutional networks</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>; <year>2017</year>. p. <fpage>4700</fpage>&#x2013;<lpage>8</lpage>.</mixed-citation></ref>
<ref id="ref-47"><label>[47]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>van der Maaten</surname> <given-names>L</given-names></string-name>, <string-name><surname>Hinton</surname> <given-names>G</given-names></string-name></person-group>. <article-title>Visualizing data using t-SNE</article-title>. <source>J Mach Learn Res</source>. <year>2008 Jan</year>;<volume>9</volume>(<issue>86</issue>):<fpage>2579</fpage>&#x2013;<lpage>605</lpage>.</mixed-citation></ref>
<ref id="ref-48"><label>[48]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Wang</surname> <given-names>H</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>M</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>J</given-names></string-name>, <string-name><surname>Zheng</surname> <given-names>W-S</given-names></string-name></person-group>. <article-title>Pseudo-label noise prevention, suppression and softening for unsupervised person re-identification</article-title>. <source>IEEE Trans Inf Forensics Secur</source>. <year>2023 Jan</year>;<volume>18</volume>:<fpage>3222</fpage>&#x2013;<lpage>37</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TIFS.2023.3277694</pub-id>.</mixed-citation></ref>
<ref id="ref-49"><label>[49]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Selvaraju</surname> <given-names>RR</given-names></string-name>, <string-name><surname>Cogswell</surname> <given-names>M</given-names></string-name>, <string-name><surname>Das</surname> <given-names>A</given-names></string-name>, <string-name><surname>Vedantam</surname> <given-names>R</given-names></string-name>, <string-name><surname>Parikh</surname> <given-names>D</given-names></string-name>, <string-name><surname>Batra</surname> <given-names>D</given-names></string-name></person-group>. <article-title>Grad-CAM: visual explanations from deep networks via gradient-based localization</article-title>. <source>Int J Comput Vis</source>. <year>2019 Oct</year>;<volume>128</volume>(<issue>2</issue>):<fpage>336</fpage>&#x2013;<lpage>59</lpage>. doi:<pub-id pub-id-type="doi">10.1007/s11263-019-01228-7</pub-id>.</mixed-citation></ref>
</ref-list>
</back></article>