<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1 20151215//EN" "http://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xml:lang="en" article-type="research-article" dtd-version="1.1">
<front>
<journal-meta>
<journal-id journal-id-type="pmc">CMC</journal-id>
<journal-id journal-id-type="nlm-ta">CMC</journal-id>
<journal-id journal-id-type="publisher-id">CMC</journal-id>
<journal-title-group>
<journal-title>Computers, Materials &#x0026; Continua</journal-title>
</journal-title-group>
<issn pub-type="epub">1546-2226</issn>
<issn pub-type="ppub">1546-2218</issn>
<publisher>
<publisher-name>Tech Science Press</publisher-name>
<publisher-loc>USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">72633</article-id>
<article-id pub-id-type="doi">10.32604/cmc.2026.072633</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Article</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>A Cooperative Hybrid Learning Framework for Automated Dandruff Severity Grading</article-title>
<alt-title alt-title-type="left-running-head">A Cooperative Hybrid Learning Framework for Automated Dandruff Severity Grading</alt-title>
<alt-title alt-title-type="right-running-head">A Cooperative Hybrid Learning Framework for Automated Dandruff Severity Grading</alt-title>
</title-group>
<contrib-group>
<contrib id="author-1" contrib-type="author">
<name name-style="western"><surname>Jhong</surname><given-names>Sin-Ye</given-names></name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
<contrib id="author-2" contrib-type="author">
<name name-style="western"><surname>Hsu</surname><given-names>Hui-Che</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><xref ref-type="aff" rid="aff-2">2</xref></contrib>
<contrib id="author-3" contrib-type="author">
<name name-style="western"><surname>Huang</surname><given-names>Hsin-Hua</given-names></name><xref ref-type="aff" rid="aff-2">2</xref></contrib>
<contrib id="author-4" contrib-type="author" corresp="yes">
<name name-style="western"><surname>Hsia</surname><given-names>Chih-Hsien</given-names></name><xref ref-type="aff" rid="aff-3">3</xref><xref ref-type="aff" rid="aff-4">4</xref><email>hsiach@niu.edu.tw</email></contrib>
<contrib id="author-5" contrib-type="author">
<name name-style="western"><surname>Harjoseputro</surname><given-names>Yulius</given-names></name><xref ref-type="aff" rid="aff-2">2</xref><xref ref-type="aff" rid="aff-5">5</xref></contrib>
<contrib id="author-6" contrib-type="author" corresp="yes">
<name name-style="western"><surname>Chen</surname><given-names>Yung-Yao</given-names></name><xref ref-type="aff" rid="aff-2">2</xref><email>yungyaochen@gapps.ntust.edu.tw</email></contrib>
<aff id="aff-1"><label>1</label><institution>Graduate Institute of Intelligent Manufacturing Technology, National Taiwan University of Science and Technology</institution>, Taipei, <addr-line>106335</addr-line>, <country>Taiwan</country></aff>
<aff id="aff-2"><label>2</label><institution>Department of Electronic and Computer Engineering, National Taiwan University of Science and Technology</institution>, Taipei, <addr-line>106335</addr-line>, <country>Taiwan</country></aff>
<aff id="aff-3"><label>3</label><institution>Department of Computer Science and Information Engineering, National Ilan University</institution>, <addr-line>Yilan, 26047</addr-line>, <country>Taiwan</country></aff>
<aff id="aff-4"><label>4</label><institution>Office of Research and Industry-Academia Development, Chaoyang University of Technology</institution>, <addr-line>Taichung City, 413310</addr-line>, <country>Taiwan</country></aff>
<aff id="aff-5"><label>5</label><institution>Department of Informatics, Universitas Atma Jaya Yogyakarta</institution>, <addr-line>Yogyakarta, 55281</addr-line>, <country>Indonesia</country></aff>
</contrib-group>
<author-notes>
<corresp id="cor1"><label>&#x002A;</label>Corresponding Authors: Chih-Hsien Hsia. Email: <email>hsiach@niu.edu.tw</email>; Yung-Yao Chen. Email: <email>yungyaochen@gapps.ntust.edu.tw</email></corresp>
</author-notes>
<pub-date date-type="collection" publication-format="electronic">
<year>2026</year>
</pub-date>
<pub-date date-type="pub" publication-format="electronic">
<day>10</day><month>2</month><year>2026</year>
</pub-date>
<volume>87</volume>
<issue>1</issue>
<elocation-id>95</elocation-id>
<history>
<date date-type="received">
<day>31</day>
<month>08</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>04</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>&#x00A9; 2026 The Authors.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Published by Tech Science Press.</copyright-holder>
<license xlink:href="https://creativecommons.org/licenses/by/4.0/">
<license-p>This work is licensed under a <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="TSP_CMC_72633.pdf"></self-uri>
<abstract>
<p>Automated grading of dandruff severity is a clinically significant but challenging task due to the inherent ordinal nature of severity levels and the high prevalence of label noise from subjective expert annotations. Standard classification methods fail to address these dual challenges, limiting their real-world performance. In this paper, a novel, three-phase training framework is proposed that learns a robust ordinal classifier directly from noisy labels. The approach synergistically combines a rank-based ordinal regression backbone with a cooperative, semi-supervised learning strategy to dynamically partition the data into clean and noisy subsets. A hybrid training objective is then employed, applying a supervised ordinal loss to the clean set. The noisy set is simultaneously trained using a dual-objective that combines a semi-supervised ordinal loss with a parallel, label-agnostic contrastive loss. This design allows the model to learn from the entire noisy subset while using contrastive learning to mitigate the risk of error propagation from potentially corrupt supervision. Extensive experiments on a new, large-scale, multi-site clinical dataset validate our approach. The method achieves state-of-the-art performance with 80.71% accuracy and a 76.86% F1-score, significantly outperforming existing approaches, including a 2.26% improvement over the strongest baseline method. This work provides not only a robust solution for a practical medical imaging problem but also a generalizable framework for other tasks plagued by noisy ordinal labels.</p>
</abstract>
<kwd-group kwd-group-type="author">
<kwd>Dandruff severity grading</kwd>
<kwd>ordinal regression</kwd>
<kwd>noisy label learning</kwd>
<kwd>self-supervised learning</kwd>
<kwd>contrastive learning</kwd>
<kwd>medical image analysis</kwd>
</kwd-group>
</article-meta>
</front>
<body>
<sec id="s1">
<label>1</label>
<title>Introduction</title>
<p>The application of deep learning and artificial intelligence is rapidly transforming medical image analysis, offering tools that promise to enhance the speed, objectivity, and accessibility of clinical diagnosis [<xref ref-type="bibr" rid="ref-1">1</xref>]. Within this field, the automated analysis of dermatological conditions via computer vision is a significant and advancing area of research. Among common afflictions, scalp health issues such as excessive dandruff affect a large global population. Beyond physical discomfort, these conditions can significantly impact an individual&#x2019;s psychological well-being and daily life, while also serving as potential indicators for inflammatory disorders like seborrheic dermatitis [<xref ref-type="bibr" rid="ref-2">2</xref>]. Although deep learning has achieved remarkable success in medical imaging, its application to the fine-grained task of grading dandruff severity from microscope images presents a unique and challenging set of problems that have not been fully addressed by prior work [<xref ref-type="bibr" rid="ref-3">3</xref>&#x2013;<xref ref-type="bibr" rid="ref-5">5</xref>].</p>
<p>The primary challenges are rooted in the fundamental nature of the data and its annotation. First, severity grading is an ordinal classification task. The labels, ranging from &#x201C;no dandruff&#x201D; to &#x201C;severe,&#x201D; possess a natural monotonic order. Standard classification methods that treat categories as independent and nominal ignore this crucial relationship, which can lead to clinically implausible errors, such as penalizing a misprediction from &#x201C;severe&#x201D; to &#x201C;moderate&#x201D; equally as one to &#x201C;none&#x201D;. Second, collecting large-scale, accurately labeled medical datasets is notoriously difficult. The visual assessment of dandruff is highly subjective, leading to significant inter- and intra-rater variability. This issue is exacerbated in scalp analysis, where symptoms often manifest as subtle, fragmented visual cues. This results in minimal inter-class variance between adjacent severity levels, making consistent annotation exceptionally difficult even for trained experts [<xref ref-type="bibr" rid="ref-4">4</xref>]. Consequently, any realistic training dataset is inevitably corrupted with substantial label noise, which severely degrades the generalization performance of deep neural networks that are prone to memorizing incorrect labels.</p>
<p>This presents a significant dilemma for existing methods. While various approaches have been proposed to tackle either ordinal regression [<xref ref-type="bibr" rid="ref-6">6</xref>] or learning with noisy labels (LNL) [<xref ref-type="bibr" rid="ref-7">7</xref>] independently, they are insufficient for this problem. State-of-the-art LNL frameworks are class-agnostic, meaning they ignore the crucial monotonic order of severity and thus risk producing ordinally inconsistent results. Conversely, most ordinal regression methods inherently assume that the training labels are accurate and can be severely compromised by the high degree of annotation noise. This leaves a critical, unaddressed research gap for a framework that can simultaneously handle both the ordinal constraints and the endemic label noise in a unified manner.</p>
<p>To bridge this specific gap, our primary objective is to develop a novel framework for dandruff severity grading that learns robustly from noisy, ordered data. Our approach synergizes a self-supervised strategy for noisy label handling with an ordinal regression objective. Inspired by semi-supervised learning techniques, our method dynamically partitions the training data into presumed &#x201C;clean&#x201D; and &#x201C;noisy&#x201D; sets based on an ordinally-aware loss metric. For the noisy samples, where labels are deemed unreliable, a dual-objective is employed. This objective trains the noisy subset using a semi-supervised ordinal loss derived from pseudo-labels in parallel with a label-agnostic contrastive loss. This design allows the model to leverage the full dataset while the contrastive component acts as a robust regularizer, mitigating the risk of error propagation from potentially incorrect pseudo-labels. This entire process is built upon an ordinal regression backbone, which recasts the multi-class problem into a series of simpler, rank-consistent binary tasks. This ensures the model&#x2019;s predictions respect the inherent order of dandruff severity, leading to more reliable and clinically meaningful results.</p>
<p>In summary, our main contributions are as follows:
<list list-type="bullet">
<list-item>
<p>We are the first to propose a deep learning framework that jointly addresses the coupled challenges of ordinal classification and label noise for automated dandruff severity grading.</p></list-item>
<list-item>
<p>We introduce an ordinally-aware sample partitioning strategy that leverages the rank-based ordinal loss as its metric. This enables the partitioning process to be sensitive to the magnitude of ordinal errors, providing a more reliable separation of clean and noisy data than standard class-agnostic approaches.</p></list-item>
<list-item>
<p>A hybrid training objective is proposed, which applies a supervised ordinal loss to the clean set while simultaneously training the noisy set with both a semi-supervised ordinal loss and a label-agnostic contrastive loss. This dual-objective design mitigates the risk of error propagation, as the contrastive loss provides a robust, label-agnostic learning signal to counteract potential errors introduced by the pseudo-labels.</p></list-item>
<list-item>
<p>An ordinal regression objective is incorporated throughout the learning process, enforcing the monotonic relationship between severity levels and significantly reducing the frequency of large, clinically implausible prediction errors.</p></list-item>
<list-item>
<p>This study is validated through extensive experiments on a new, large-scale, multi-site dandruff severity grading dataset, demonstrating that the proposed method achieves state-of-the-art performance and significantly outperforms baseline approaches.</p></list-item>
</list></p>
</sec>
<sec id="s2">
<label>2</label>
<title>Related Works</title>
<p>Our research addresses the automated grading of dandruff severity, a task situated at the intersection of medical image analysis, ordinal classification, and robust learning under label noise. In this section, we review the literature from these three perspectives to contextualize our contribution.</p>
<sec id="s2_1">
<label>2.1</label>
<title>Automated Dandruff Severity Grading</title>
<p>The application of deep learning to scalp analysis has gained significant traction. Early works focused on general scalp problem classification using CNN-based architectures [<xref ref-type="bibr" rid="ref-8">8</xref>,<xref ref-type="bibr" rid="ref-9">9</xref>] or robust representation learning in edge-cloud systems [<xref ref-type="bibr" rid="ref-10">10</xref>]. More recent work has shifted focus towards the more nuanced task of grading the severity of these conditions. Some approaches have repurposed object detection models, inferring severity from the density of detected problem areas [<xref ref-type="bibr" rid="ref-11">11</xref>,<xref ref-type="bibr" rid="ref-12">12</xref>]. However, this indirect method can be biased by non-scalp features or fail to capture fine-grained, fragmented symptoms. Consequently, a more direct approach based on established medical grading standards [<xref ref-type="bibr" rid="ref-13">13</xref>] has become prevalent. Jhong et al. enhanced a CNN with attention mechanisms [<xref ref-type="bibr" rid="ref-2">2</xref>], while Jin et al. [<xref ref-type="bibr" rid="ref-3">3</xref>] fine-tuned an EfficientNet for fine dandruff classification. Other works have explored ensemble models for robustness on limited data [<xref ref-type="bibr" rid="ref-5">5</xref>], or employed Vision Transformers (ViT) for grading multiple scalp conditions simultaneously [<xref ref-type="bibr" rid="ref-4">4</xref>]. While these studies demonstrate the potential of deep learning, they predominantly treat severity grading as a standard classification task. This overlooks two critical challenges: (1) the labels are ordinal in nature, and standard classification losses fail to capture this intrinsic order; (2) the labels are inherently noisy due to subjective expert assessment, which can degrade model performance. Our work explicitly addresses these two challenges in tandem.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Ordinal Classification</title>
<p>Ordinal classification (or regression) aims to solve tasks where labels possess a natural ranking. A prominent and modular strategy involves decomposing a K-class ordinal problem into K-1 independent binary classification sub-problems [<xref ref-type="bibr" rid="ref-14">14</xref>]. This rank-based approach is particularly adept at addressing the challenge of ambiguous instances, which are samples that fall near the blurred boundaries between adjacent categories and are a common issue in subjective grading. Foundational deep learning models like MO-CNN [<xref ref-type="bibr" rid="ref-14">14</xref>] first operationalized this concept by training multiple binary classifiers to distinguish adjacent ranks. More recent methods, such as CORAL [<xref ref-type="bibr" rid="ref-15">15</xref>], have refined this approach to enforce rank consistency with greater efficiency. In contrast, other families of methods, such as Distribution Ordering Learning, seek to model the ordinal relationship by learning the label distribution directly. Techniques in this category include using soft labels [<xref ref-type="bibr" rid="ref-16">16</xref>]. While effective in certain contexts, these distribution-based methods often rely heavily on the precise location of the ground-truth label to anchor the distribution. This can make them potentially more sensitive to the high degree of label noise present in our dataset, where the ground-truth label itself is often unreliable. Therefore, for our framework, we adopt the rank-based decomposition strategy. Its proven effectiveness, modularity, and high compatibility with other learning paradigms make it an ideal backbone for integrating the noisy label learning methods we leverage.</p>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Learning with Noisy Labels</title>
<p>To combat the memorization effect of deep networks on incorrect labels, various Learning with Noisy Labels (LNL) strategies have been proposed. These approaches can be broadly categorized into methods like Loss Adjustment and Sample Selection. Loss adjustment methods, such as Bootstrapping [<xref ref-type="bibr" rid="ref-17">17</xref>], attempt to correct the training signal for all samples, for example by using the model&#x2019;s own predictions as a soft target. However, these correction-based approaches risk error propagation, especially in high-noise environments; if the model&#x2019;s initial predictions are wrong, the corrected label will also be wrong, potentially reinforcing the error. In contrast, sample selection methods are particularly effective and relevant to our work. They operate on the principle that models learn from clean examples with small losses before fitting to noisy examples with large losses. Early methods in this domain, such as Co-teaching [<xref ref-type="bibr" rid="ref-18">18</xref>], utilized multiple networks to cross-filter clean samples for each other, mitigating confirmation bias. A leading approach, DivideMix [<xref ref-type="bibr" rid="ref-19">19</xref>], operationalizes this by modeling the per-sample loss distribution with a Gaussian Mixture Model (GMM) to dynamically separate the dataset into a likely clean set and a noisy set. Instead of discarding the noisy data, DivideMix reframes the task as a semi-supervised problem, using model predictions as pseudo-labels for the noisy subset. To further enhance feature learning on this subset without relying on potentially flawed pseudo-labels, contrastive learning has proven powerful for learning robust representations from the underlying images alone [<xref ref-type="bibr" rid="ref-20">20</xref>].</p>
<p>While these fields have advanced independently, a framework that synergistically integrates them is non-trivial and remains a critical gap. Ordinal regression methods typically assume that training labels are accurate [<xref ref-type="bibr" rid="ref-15">15</xref>,<xref ref-type="bibr" rid="ref-16">16</xref>]. Conversely, state-of-the-art LNL frameworks are class-agnostic [<xref ref-type="bibr" rid="ref-19">19</xref>] and do not enforce ordinal constraints. For instance, a standard class-agnostic loss metric, as used in many LNL methods, cannot differentiate minor from major ordinal errors, which is a vital distinction for robustly partitioning the data. Furthermore, reliance on pseudo-labeling for noisy data, a common LNL strategy, risks propagating ordinally-inconsistent errors. Our work bridges this gap by proposing the first unified framework built upon a rank-based ordinal backbone. This framework introduces two key innovations: it uses an ordinally-aware metric for robust sample partitioning and employs a dual-objective with contrastive learning to mitigate pseudo-label error propagation.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Methodology</title>
<sec id="s3_1">
<label>3.1</label>
<title>Overall Framework</title>
<p>The task of grading dandruff severity presents dual challenges: the ordinal nature of severity levels and the prevalence of label noise. To address these, we propose a robust three-phase training framework that synergizes ordinal regression with a cooperative noisy label learning strategy. Our pipeline, illustrated in <xref ref-type="fig" rid="fig-1">Fig. 1</xref>, progressively refines the model&#x2019;s capability to handle noisy ordinal data. The process begins with a crucial supervised pre-training phase. Two parallel networks are independently warmed-up on the entire noisy dataset using an ordinal regression loss. This crucial first step ensures the models develop an initial understanding of the ranking relationships between severity grades, which is a prerequisite for generating meaningful loss distributions for sample partitioning. Leveraging these pre-trained models, the framework then proceeds to a cooperative sample partitioning phase. At the start of each subsequent epoch, the two networks use each other&#x2019;s per-sample loss distributions to dynamically divide the dataset into a high-confidence clean subset and a low-confidence noisy subset. This cross-supervision or co-training design is critical for mitigating the confirmation bias inherent to self-training systems. With the data partitioned, the framework enters its main hybrid training phase, which employs a dual-objective learning strategy tailored to these subsets. A semi-supervised ordinal loss is applied to data generated via MixUp from both subsets, allowing the model to learn from the entire dataset. Concurrently, a label-agnostic contrastive loss is applied exclusively to the noisy subset to learn robust, generalizable features without relying on their corrupt labels. The subsequent sections provide a detailed exposition of each component. The complete procedure is formalized in Algorithm 1.</p>
<fig id="fig-1">
<label>Figure 1</label>
<caption>
<title>Overview of our proposed three-phase training framework for robust ordinal classification from noisy labels. (Phase 1) Supervised Pre-training: Two parallel networks are warmed-up on the full noisy dataset with an ordinal loss to establish a baseline understanding of the severity ranking. (Phase 2) Cooperative Sample Partitioning: Each network&#x2019;s loss distribution is used to partition the data into clean and noisy subsets for its peer, a cross-supervision strategy that mitigates confirmation bias. (Phase 3) Hybrid Training: A dual-objective strategy is employed. A semi-supervised ordinal loss is computed on Mixup-augmented data from both subsets, while a contrastive loss learns label-agnostic features exclusively from the noisy subset. The total loss combines these objectives for a comprehensive training update</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_72633-fig-1.tif"/>
</fig>
<fig id="fig-6">
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_72633-fig-6.tif"/>
</fig>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Ordinal Regression for Severity Classification</title>
<p>Standard multi-class classification, which typically employs a softmax output with categorical cross-entropy loss, is suboptimal for severity grading as it treats labels as independent nominal entities. This approach ignores the critical ordinal relationship between grades (e.g., grade 4 is more severe than grade 2). To formally integrate this structure, we adopt a rank-based methodology inspired by [<xref ref-type="bibr" rid="ref-15">15</xref>]. We reframe the K-class ordinal problem into <inline-formula id="ieqn-48"><mml:math id="mml-ieqn-48"><mml:mi>K</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:math></inline-formula> simpler binary classification tasks. Specifically, a ground-truth label <inline-formula id="ieqn-49"><mml:math id="mml-ieqn-49"><mml:mi>y</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mo fence="false" stretchy="false">{</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:mi>K</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo fence="false" stretchy="false">}</mml:mo></mml:math></inline-formula> is transformed into a binary vector <inline-formula id="ieqn-50"><mml:math id="mml-ieqn-50"><mml:msup><mml:mi>y</mml:mi><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mo>{</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo>}</mml:mo></mml:mrow><mml:mrow><mml:mi>K</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>, where the <inline-formula id="ieqn-51"><mml:math id="mml-ieqn-51"><mml:mi>k</mml:mi></mml:math></inline-formula>-th element indicates whether the severity grade is greater than rank <inline-formula id="ieqn-52"><mml:math id="mml-ieqn-52"><mml:mi>k</mml:mi></mml:math></inline-formula>:
<disp-formula id="eqn-1"><label>(1)</label><mml:math id="mml-eqn-1" display="block"><mml:msup><mml:mi>y</mml:mi><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>k</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mrow><mml:mtext>I</mml:mtext></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>y</mml:mi><mml:mo>&#x2265;</mml:mo><mml:mi>k</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mspace width="thinmathspace" /><mml:mi>f</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mspace width="thinmathspace" /><mml:mi>k</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mo fence="false" stretchy="false">{</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:mi>K</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn><mml:mo fence="false" stretchy="false">}</mml:mo></mml:math></disp-formula></p>
<p>To guarantee rank consistency in predictions (e.g., <inline-formula id="ieqn-53"><mml:math id="mml-ieqn-53"><mml:mi>P</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>y</mml:mi><mml:mo>&#x2265;</mml:mo><mml:mn>4</mml:mn><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2264;</mml:mo><mml:mi>P</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>y</mml:mi><mml:mo>&#x2265;</mml:mo><mml:mn>2</mml:mn><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula>), we constrain the network such that the final <inline-formula id="ieqn-54"><mml:math id="mml-ieqn-54"><mml:mi>K</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:math></inline-formula> output neurons share the weights of the penultimate feature layer but each maintains an independent bias term. The model is then trained by minimizing the sum of binary cross-entropy (BCE) losses across all tasks. For a batch of <inline-formula id="ieqn-55"><mml:math id="mml-ieqn-55"><mml:mi>B</mml:mi></mml:math></inline-formula> samples, this supervised loss <inline-formula id="ieqn-56"><mml:math id="mml-ieqn-56"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is defined as:
<disp-formula id="eqn-2"><label>(2)</label><mml:math id="mml-eqn-2" display="block"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>B</mml:mi></mml:mfrac><mml:munderover><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>B</mml:mi></mml:mrow></mml:munderover><mml:munderover><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>K</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:munderover><mml:mrow><mml:mtext>BCE</mml:mtext></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>&#x03C3;</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>g</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>&#x03B8;</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:msub><mml:mi>b</mml:mi><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msubsup><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>k</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula>where <inline-formula id="ieqn-57"><mml:math id="mml-ieqn-57"><mml:mi>g</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>&#x03B8;</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> denotes the feature vector from the penultimate layer, and <inline-formula id="ieqn-58"><mml:math id="mml-ieqn-58"><mml:msub><mml:mi>b</mml:mi><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the bias for the <inline-formula id="ieqn-59"><mml:math 
id="mml-ieqn-59"><mml:mi>k</mml:mi></mml:math></inline-formula>-th task. Minimizing this objective ensures that the learned biases are monotonically non-increasing <inline-formula id="ieqn-60"><mml:math id="mml-ieqn-60"><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>b</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x2265;</mml:mo><mml:msub><mml:mi>b</mml:mi><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>&#x2265;</mml:mo><mml:mo>&#x22EF;</mml:mo><mml:mo>&#x2265;</mml:mo><mml:msub><mml:mi>b</mml:mi><mml:mrow><mml:mi>K</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula>, thereby guaranteeing rank-consistent probability estimates. This ordinal loss serves as the core objective for the pre-training (Phase 1) and the supervised component of the hybrid training (Phase 3).</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Self-Supervised Learning via Cooperative Partitioning</title>
<p>A significant challenge in dandruff severity grading is the subtle visual distinction between adjacent levels, leading to inevitable label noise that can degrade model generalization. To address this, we employ a robust training strategy inspired by semi-supervised learning [<xref ref-type="bibr" rid="ref-19">19</xref>]. Instead of directly correcting labels, our approach dynamically partitions the data into a trusted clean set and an untrusted noisy set.</p>
<p>We posit that for a well-trained network, correctly labeled samples tend to exhibit lower losses than mislabeled ones. To formalize this, at the beginning of each epoch, we compute the per-sample ordinal loss for all training data. We then fit a two-component Gaussian Mixture Model (GMM) to the loss distribution to statistically separate clean and noisy samples. This process yields a posterior probability <inline-formula id="ieqn-61"><mml:math id="mml-ieqn-61"><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>P</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:msub><mml:mi>l</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> for each sample <inline-formula id="ieqn-62"><mml:math id="mml-ieqn-62"><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. To prevent error accumulation from a single model (self-confirmation bias), we implement a cooperative &#x201C;co-training&#x201D; framework. Specifically, the partition derived from Model 1&#x2019;s loss distribution is used to train Model 2, and <italic>vice-versa</italic>. 
Samples with <inline-formula id="ieqn-63"><mml:math id="mml-ieqn-63"><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> exceeding a threshold &#x03C4; form the clean set <inline-formula id="ieqn-64"><mml:math id="mml-ieqn-64"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> while the remainder form the noisy set <inline-formula id="ieqn-65"><mml:math id="mml-ieqn-65"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> for which the original labels are discarded.</p>
<p>For samples in <inline-formula id="ieqn-66"><mml:math id="mml-ieqn-66"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, we further mitigate potential noise by applying a label refinement mechanism, as shown in <xref ref-type="fig" rid="fig-2">Fig. 2</xref>. We compute a &#x201C;soft&#x201D; target <inline-formula id="ieqn-67"><mml:math id="mml-ieqn-67"><mml:msubsup><mml:mover><mml:mi>y</mml:mi><mml:mo accent="false">&#x00AF;</mml:mo></mml:mover><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:mrow></mml:msubsup></mml:math></inline-formula> by linearly combining the original ground-truth label <inline-formula id="ieqn-68"><mml:math id="mml-ieqn-68"><mml:msubsup><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:mrow></mml:msubsup></mml:math></inline-formula> with the model&#x2019;s own prediction <inline-formula id="ieqn-69"><mml:math id="mml-ieqn-69"><mml:msubsup><mml:mi>p</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:mrow></mml:msubsup></mml:math></inline-formula> (averaged over augmentations):<disp-formula id="eqn-3"><label>(3)</label><mml:math id="mml-eqn-3" display="block"><mml:msubsup><mml:mover><mml:mi>y</mml:mi><mml:mo accent="false">&#x00AF;</mml:mo></mml:mover><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x22C5;</mml:mo><mml:msubsup><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mi 
mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:mrow></mml:msubsup><mml:mo>+</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x22C5;</mml:mo><mml:msubsup><mml:mi>p</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:mrow></mml:msubsup></mml:math></disp-formula></p>
<fig id="fig-2">
<label>Figure 2</label>
<caption>
<title>The label refinement mechanism for the clean set. A stabilized model prediction <inline-formula id="ieqn-70"><mml:math id="mml-ieqn-70"><mml:msubsup><mml:mi>p</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> is obtained by averaging the outputs from two distinct augmentations of an input image. This prediction is then combined with the ground-truth label <inline-formula id="ieqn-71"><mml:math id="mml-ieqn-71"><mml:msubsup><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> in a weighted sum, where the weight <inline-formula id="ieqn-72"><mml:math id="mml-ieqn-72"><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the sample&#x2019;s estimated probability of being clean. <inline-formula id="ieqn-73"><mml:math id="mml-ieqn-73"><mml:msubsup><mml:mover><mml:mi>y</mml:mi><mml:mo accent="false">&#x00AF;</mml:mo></mml:mover><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> adaptively balances the influence of the original annotation and the model&#x2019;s own prediction</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_72633-fig-2.tif"/>
</fig>
<p>This mechanism adaptively balances supervision: it trusts the provided label for high-confidence samples (where <inline-formula id="ieqn-74"><mml:math id="mml-ieqn-74"><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2248;</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> while relying more on the model&#x2019;s prediction for ambiguous cases.</p>
</sec>
<sec id="s3_4">
<label>3.4</label>
<title>Hybrid Training Strategy</title>
<p>With the data partitioned into a clean set <inline-formula id="ieqn-75"><mml:math id="mml-ieqn-75"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> (using refined labels <inline-formula id="ieqn-76"><mml:math id="mml-ieqn-76"><mml:msubsup><mml:mover><mml:mi>y</mml:mi><mml:mo accent="false">&#x00AF;</mml:mo></mml:mover><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>) and an unlabeled noisy set <inline-formula id="ieqn-77"><mml:math id="mml-ieqn-77"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, we employ a hybrid training strategy to maximize feature learning. This strategy combines a semi-supervised ordinal objective with a label-agnostic contrastive objective. First, we generate targets for the noisy subset <inline-formula id="ieqn-78"><mml:math id="mml-ieqn-78"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. As illustrated in <xref ref-type="fig" rid="fig-3">Fig. 3</xref>, we compute a robust ensemble prediction by averaging the outputs of both co-trained networks, serving as the pseudo-label for samples in <inline-formula id="ieqn-79"><mml:math id="mml-ieqn-79"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. We then apply MixUp augmentation [<xref ref-type="bibr" rid="ref-21">21</xref>] to construct interpolated training batches combining refined samples from <inline-formula id="ieqn-80"><mml:math id="mml-ieqn-80"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> and pseudo-labeled samples from <inline-formula id="ieqn-81"><mml:math id="mml-ieqn-81"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. 
The supervised ordinal loss <inline-formula id="ieqn-82"><mml:math id="mml-ieqn-82"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is minimized on these mixed batches.</p>
<fig id="fig-3">
<label>Figure 3</label>
<caption>
<title>Ensemble pseudo-label generation for the noisy set. The process for generating pseudo-labels for samples in the noisy set <inline-formula id="ieqn-86"><mml:math id="mml-ieqn-86"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. We leverage the predictions from both co-trained networks (Model 1 and Model 2) to form a robust ensemble estimate. Specifically, a pair of augmented views of a noisy image is fed through both networks. The resulting output predictions are averaged to produce a single, high-confidence soft pseudo-label, which is then used as the target in the semi-supervised learning phase</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_72633-fig-3.tif"/>
</fig>
<p>To prevent the model from collapsing to a trivial solution (e.g., predicting a single class for all inputs), we incorporate a regularization term <inline-formula id="ieqn-83"><mml:math id="mml-ieqn-83"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>r</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. This forces the moving average of the model&#x2019;s predictions <inline-formula id="ieqn-84"><mml:math id="mml-ieqn-84"><mml:mover><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>o</mml:mi><mml:mi>d</mml:mi><mml:mi>e</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo accent="false">&#x00AF;</mml:mo></mml:mover></mml:math></inline-formula> to align with the prior class distribution of the dataset <inline-formula id="ieqn-85"><mml:math id="mml-ieqn-85"><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>g</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>:
<disp-formula id="eqn-4"><label>(4)</label><mml:math id="mml-eqn-4" display="block"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>r</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>K</mml:mi><mml:mi>L</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>g</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2223;&#x2223;</mml:mo><mml:mover><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>o</mml:mi><mml:mi>d</mml:mi><mml:mi>e</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo accent="false">&#x00AF;</mml:mo></mml:mover><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula></p>
<p>Concurrently, to extract discriminative features from the noisy data without relying on potentially erroneous pseudo-labels, we apply a contrastive loss <inline-formula id="ieqn-87"><mml:math id="mml-ieqn-87"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>u</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> exclusively to <inline-formula id="ieqn-88"><mml:math id="mml-ieqn-88"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. We utilize the InfoNCE objective to maximize the similarity between two augmented views <inline-formula id="ieqn-89"><mml:math id="mml-ieqn-89"><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>z</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msubsup><mml:mi>z</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mo>+</mml:mo></mml:mrow></mml:msubsup><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> of the same image while minimizing similarity with other samples:<disp-formula id="eqn-5"><label>(5)</label><mml:math id="mml-eqn-5" display="block"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>u</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mfrac><mml:msup><mml:mi>e</mml:mi><mml:mrow><mml:mfrac><mml:mrow><mml:mrow><mml:mtext>sim</mml:mtext></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>z</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msubsup><mml:mi>z</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mo>+</mml:mo></mml:mrow></mml:msubsup><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mi>K</mml:mi></mml:mfrac></mml:mrow></mml:msup><mml:mrow><mml:msubsup><mml:mo 
movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msubsup><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mi>j</mml:mi><mml:mo>&#x2260;</mml:mo><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:msup><mml:mi>e</mml:mi><mml:mrow><mml:mfrac><mml:mrow><mml:mrow><mml:mtext>sim</mml:mtext></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>z</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>z</mml:mi><mml:mrow><mml:mi>b</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mi>K</mml:mi></mml:mfrac></mml:mrow></mml:msup></mml:mrow></mml:mfrac></mml:math></disp-formula>where <inline-formula id="ieqn-90"><mml:math id="mml-ieqn-90"><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>m</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mo>,</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> denotes cosine similarity, <inline-formula id="ieqn-91"><mml:math id="mml-ieqn-91"><mml:mi>K</mml:mi></mml:math></inline-formula> is the temperature hyperparameter, and <inline-formula id="ieqn-92"><mml:math id="mml-ieqn-92"><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is the batch size of the noisy subset. 
The total training objective is a weighted summation:<disp-formula id="eqn-6"><label>(6)</label><mml:math id="mml-eqn-6" display="block"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>o</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mrow><mml:msub><mml:mi>&#x03BB;</mml:mi><mml:mrow><mml:mi>u</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>&#x22C5;</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>u</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mrow><mml:msub><mml:mi>&#x03BB;</mml:mi><mml:mrow><mml:mrow><mml:mtext>r</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:mrow><mml:mo>&#x22C5;</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mrow><mml:mtext>r</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:math></disp-formula>where <inline-formula id="ieqn-93"><mml:math id="mml-ieqn-93"><mml:msub><mml:mi>&#x03BB;</mml:mi><mml:mrow><mml:mi>u</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula id="ieqn-94"><mml:math id="mml-ieqn-94"><mml:msub><mml:mi>&#x03BB;</mml:mi><mml:mrow><mml:mrow><mml:mtext>r</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula> are balancing hyperparameters.</p>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experiments</title>
<p>In this section, we present a comprehensive empirical evaluation of our proposed framework. We begin by detailing our newly collected dataset and the evaluation protocol. We then provide a quantitative comparison against state-of-the-art (SOTA) methods, followed by a thorough ablation study to dissect the contribution of each core component of our methodology.</p>
<sec id="s4_1">
<label>4.1</label>
<title>Dataset and Evaluation Metrics</title>
<p>To facilitate this research, we constructed a new, large-scale dataset for dandruff severity grading. The data was collected in collaboration with Taipei Hospital, Ministry of Health and Welfare and MacroHI Co., Ltd. in Taiwan, ensuring a diversity of clinical settings. This study was conducted in strict accordance with the Declaration of Helsinki and was approved by the Institutional Review Board (IRB) of Taipei Hospital, Ministry of Health and Welfare (Protocol No. TH-IRB-0022-0021). Using a digital microscope, clinicians and therapists captured images from eight distinct scalp regions, resulting in varied resolutions (768 &#x00D7; 576, 800 &#x00D7; 600, and 1024 &#x00D7; 768). A team of one dermatologist and three trained scalp therapists annotated a total of 22,537 images. To ensure a consistent and high-quality standard across this large-scale dataset, a rigorous adjudication protocol was employed. The trained therapists performed the initial annotations based on the clinically validated Adherent Scalp Flaking Score (ASFS) standard [<xref ref-type="bibr" rid="ref-13">13</xref>]. The dermatologist then served as the final expert adjudicator, reviewing annotations and defining the ground-truth label for each image used in the study. Each image was assigned to one of six severity levels {0, 2, 4, 6, 8, 10} based on the ASFS standard. The dataset exhibits a natural class imbalance representative of a clinical population: grade 0 (5817 images), grade 2 (9062), grade 4 (2966), grade 6 (2085), grade 8 (1893), and grade 10 (714). We randomly partitioned the data, allocating 18,871 images (&#x007E;85%) for training and 3666 for testing, ensuring a consistent class distribution across splits. <xref ref-type="fig" rid="fig-4">Fig. 4</xref> shows example images for different severity grades. The creation of this multi-site, expert-annotated dataset is a key contribution to our work, providing a challenging and realistic benchmark for this task. 
Following standard practice, we report overall Accuracy. To provide a more robust assessment of our imbalanced dataset, we also report macro-averaged Precision, Recall, and F1-Score. The F1-Score is particularly important as it provides a balanced measure of performance across all six severity classes.</p>
<fig id="fig-4">
<label>Figure 4</label>
<caption>
<title>Representative images from our dandruff severity dataset. The six grades are annotated according to ASFS standard</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_72633-fig-4.tif"/>
</fig>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Experiment Settings</title>
<p>All experiments were conducted on an NVIDIA GeForce RTX 4090 GPU using a ResNet-50 backbone. For our method, the backbone was modified in two ways: the final classification layer was changed to have 5 outputs to suit the ordinal regression task, and a 128-dimensional projection head was added in parallel to facilitate contrastive learning. For data preprocessing, all input images were resized and randomly cropped to 224 &#x00D7; 224. Across all experiments, we utilized the Adabelief optimizer with <inline-formula id="ieqn-95"><mml:math id="mml-ieqn-95"><mml:msub><mml:mrow><mml:mi mathvariant="normal">&#x03B2;</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>0.9</mml:mn></mml:math></inline-formula> and <inline-formula id="ieqn-96"><mml:math id="mml-ieqn-96"><mml:msub><mml:mrow><mml:mi mathvariant="normal">&#x03B2;</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>0.999</mml:mn></mml:math></inline-formula>, paired with a cosine annealing learning rate schedule. The training protocols were set as follows. 
For baseline and SOTA comparisons, models were trained for 450 epochs with a batch size of 64 and an initial learning rate of <inline-formula id="ieqn-97"><mml:math id="mml-ieqn-97"><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mn>4</mml:mn></mml:mrow></mml:msup><mml:mo>.</mml:mo></mml:math></inline-formula> Our proposed method involved a 10-epoch warm-up phase, followed by 250 epochs of hybrid training with a batch size of 32, an initial learning rate of <inline-formula id="ieqn-98"><mml:math id="mml-ieqn-98"><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>, and a weight decay of <inline-formula id="ieqn-99"><mml:math id="mml-ieqn-99"><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mn>5</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>. The key hyperparameters for our framework were set as: GMM partition threshold &#x03C4; &#x003D; 0.5, contrastive learning temperature <italic>K</italic> &#x003D; 0.05, and regularization loss weight <inline-formula id="ieqn-100"><mml:math id="mml-ieqn-100"><mml:msub><mml:mrow><mml:mi mathvariant="normal">&#x03BB;</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> &#x003D; 0.01. The contrastive loss weight <inline-formula id="ieqn-101"><mml:math id="mml-ieqn-101"><mml:msub><mml:mrow><mml:mi mathvariant="normal">&#x03BB;</mml:mi></mml:mrow><mml:mrow><mml:mi>u</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> was linearly ramped up from 0 to 0.06 over the first 100 epochs to stabilize initial training.</p>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Comparison with State-of-the-Art Methods</title>
<p>We compare our framework against several SOTA methods, as shown in <xref ref-type="table" rid="table-1">Table 1</xref>. As the literature on robust ordinal learning for dandruff grading is nascent, we benchmark against high-performing models from related scalp analysis tasks [<xref ref-type="bibr" rid="ref-2">2</xref>&#x2013;<xref ref-type="bibr" rid="ref-5">5</xref>,<xref ref-type="bibr" rid="ref-8">8</xref>]. All competing methods were trained on our dataset using the parameter settings reported in their original papers. As shown in <xref ref-type="table" rid="table-1">Table 1</xref>, our proposed method significantly outperforms all other approaches across all metrics. Our ResNet-50-based model achieves an Accuracy of 80.71% and an F1-Score of 76.86%. This represents a substantial improvement of 2.26% in accuracy and 1.25% in F1-score over the next best method, the Triple ensemble model proposed by Kim et al. [<xref ref-type="bibr" rid="ref-5">5</xref>]. Notably, our single, principled framework surpasses not only this complex ensemble approach but also advanced architectures such as the Vision Transformer (ViT-B/16). These results validate that our framework&#x2019;s core principles of explicitly modeling ordinal relationships and robustly handling label noise provide a decisive advantage over methods relying solely on architectural sophistication or ensembling.</p>
<table-wrap id="table-1">
<label>Table 1</label>
<caption>
<title>Performance comparison with SOTA methods</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/> </colgroup>
<thead>
<tr>
<th>Method</th>
<th>Backbone</th>
<th>Accuracy</th>
<th>Precision</th>
<th>Recall</th>
<th>F1-Score</th>
</tr>
</thead>
<tbody>
<tr>
<td>Ha et al. [<xref ref-type="bibr" rid="ref-4">4</xref>]</td>
<td>ViT-B/16</td>
<td>69.91%</td>
<td>68.66%</td>
<td>65.67%</td>
<td>66.78%</td>
</tr>
<tr>
<td>Jin et al. [<xref ref-type="bibr" rid="ref-3">3</xref>]</td>
<td>EfficientNet-B0</td>
<td>74.71%</td>
<td>73.39%</td>
<td>70.48%</td>
<td>71.50%</td>
</tr>
<tr>
<td>Wang et al. [<xref ref-type="bibr" rid="ref-8">8</xref>]</td>
<td>VGG-16</td>
<td>77.85%</td>
<td>75.81%</td>
<td>74.28%</td>
<td>74.81%</td>
</tr>
<tr>
<td>Jhong et al. [<xref ref-type="bibr" rid="ref-2">2</xref>]</td>
<td>Enhanced DenseNet-121</td>
<td>78.18%</td>
<td>76.50%</td>
<td>73.67%</td>
<td>75.06%</td>
</tr>
<tr>
<td>Kim et al. [<xref ref-type="bibr" rid="ref-5">5</xref>]</td>
<td>Triple ensemble model</td>
<td>78.45%</td>
<td>77.16%</td>
<td><bold>75.10%</bold></td>
<td>75.61%</td>
</tr>
<tr>
<td><bold>This work</bold></td>
<td><bold>Resnet-50</bold></td>
<td><bold>80.71%</bold></td>
<td><bold>79.95%</bold></td>
<td>74.80%</td>
<td><bold>76.86%</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="table-1fn1" fn-type="other">
<p>Note: The bold values indicate the best performance.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s4_4">
<label>4.4</label>
<title>Ablation Studies</title>
<p>To verify the contribution of each component of our framework, we conducted a detailed ablation study. Starting with a standard supervised ResNet-50 baseline, we incrementally integrated Ordinal Regression (OR), our Self-Supervised noisy label learning strategy (SSL), and Contrastive Learning (CL). The quantitative results are summarized in <xref ref-type="table" rid="table-2">Table 2</xref>. The baseline model achieves a respectable accuracy of 78.23%. Introducing the OR formulation provides immediate gains, confirming the importance of modeling the inherent ordinal structure of severity grades. Subsequently, incorporating the SSL strategy with cooperative partitioning yields the most significant performance leap. As detailed in the per-class F1-scores, this gain is largely driven by a substantial improvement in Class 0 (rising from 75.68% to 80.03%), strongly suggesting that the &#x201C;no dandruff&#x201D; category contains considerable label noise which our SSL mechanism successfully mitigates. Finally, the addition of the CL module provides a further performance boost, achieving the highest Accuracy (80.71%) and Macro F1-score (76.86%). To rigorously assess performance on our imbalanced dataset, we also evaluated the Weighted F1-score. The proposed method achieves a Weighted F1 of 80.56%, surpassing the baseline&#x2019;s 78.18%. This metric confirms that our framework maintains robust classification capabilities across all severity levels, rather than bias towards majority classes.</p>
<table-wrap id="table-2">
<label>Table 2</label>
<caption>
<title>Ablation study on each component for this study</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/> </colgroup>
<thead>
<tr>
<th colspan="3">Method</th>
<th colspan="6">Per-class F1-score (%)</th>
<th colspan="3">Overall performance (%)</th>
<th rowspan="2">Runtime</th>
</tr>
<tr>
<th></th>
<th></th>
<th></th>
<th>0</th>
<th>2</th>
<th>4</th>
<th>6</th>
<th>8</th>
<th>10</th>
<th>Macro F1</th>
<th>Weighted F1</th>
<th>Accuracy</th>
</tr>
</thead>
<tbody>
<tr>
<td colspan="3">Baseline</td>
<td>76.39</td>
<td>82.58</td>
<td>66.67</td>
<td>74.77</td>
<td>80.97</td>
<td>71.76</td>
<td>75.52</td>
<td>78.18%</td>
<td>78.23</td>
<td>0.00264 s</td>
</tr>
<tr>
<td>OR</td>
<td>SSL</td>
<td>CL</td>
<td colspan="10" align="center"><italic>Proposed method</italic></td>
</tr>
<tr>
<td><bold>&#x221A;</bold></td>
<td></td>
<td></td>
<td>75.68</td>
<td>82.60</td>
<td>68.46</td>
<td>75.07</td>
<td>82.45</td>
<td>73.33</td>
<td>76.27</td>
<td>78.45%</td>
<td>78.45</td>
<td>0.00257 s</td>
</tr>
<tr>
<td></td>
<td><bold>&#x221A;</bold></td>
<td></td>
<td>80.47</td>
<td>84.12</td>
<td>65.91</td>
<td>75.61</td>
<td>79.55</td>
<td>70.19</td>
<td>75.97</td>
<td>79.58%</td>
<td>79.60</td>
<td>0.00464 s</td>
</tr>
<tr>
<td><bold>&#x221A;</bold></td>
<td><bold>&#x221A;</bold></td>
<td></td>
<td>80.03</td>
<td>85.23</td>
<td>67.14</td>
<td>75.36</td>
<td>81.20</td>
<td>71.43</td>
<td>76.73</td>
<td>80.29%</td>
<td>80.36</td>
<td>0.00455 s</td>
</tr>
<tr>
<td><bold>&#x221A;</bold></td>
<td><bold>&#x221A;</bold></td>
<td><bold>&#x221A;</bold></td>
<td>80.28</td>
<td>85.80</td>
<td>67.17</td>
<td>73.78</td>
<td>81.88</td>
<td>72.27</td>
<td>76.86</td>
<td>80.56%</td>
<td>80.71</td>
<td>0.00476 s</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>To further investigate the sources of these performance gains, we visualized the confusion matrices for the key ablation stages in <xref ref-type="fig" rid="fig-5">Fig. 5</xref>. As observed in <xref ref-type="fig" rid="fig-5">Fig. 5a</xref>, the Baseline model exhibits a dispersed prediction pattern, with misclassifications frequently spanning across non-adjacent classes (e.g., confusing Grade 2 with Grade 6). Incorporating the OR module (<xref ref-type="fig" rid="fig-5">Fig. 5b</xref>) effectively constrains these predictions, resulting in a concentration of values along the diagonal and a significant reduction in large error margins. Furthermore, <xref ref-type="fig" rid="fig-5">Fig. 5c</xref>,<xref ref-type="fig" rid="fig-5">d</xref> illustrates the distinct contributions of the self-supervised strategies. The standalone SSL strategy (<xref ref-type="fig" rid="fig-5">Fig. 5c</xref>) significantly corrects misclassifications in the noisy Grade 0 class. Subsequently, the intermediate OR &#x002B; SSL stage (<xref ref-type="fig" rid="fig-5">Fig. 5d</xref>) demonstrates how combining these approaches begins to align noise robustness with rank consistency. Finally, <xref ref-type="fig" rid="fig-5">Fig. 5e</xref> presents the full Proposed Method (OR &#x002B; SSL &#x002B; CL). By integrating the contrastive learning objective, the model achieves the clearest diagonal structure with minimal off-axis errors. This visual evidence corroborates that our cooperative hybrid framework synergistically mitigates label noise while preserving the intrinsic ordinal structure.</p>
<fig id="fig-5">
<label>Figure 5</label>
<caption>
<title>Visual comparison of confusion matrices across different ablation stages. (<bold>a</bold>) The Baseline model shows considerable confusion between adjacent severity grades. (<bold>b</bold>) The addition of OR concentrates predictions along the diagonal and reduces extreme outliers. (<bold>c</bold>) The SSL strategy significantly corrects misclassifications in the noisy Grade 0 class. (<bold>d</bold>) Combining OR &#x002B; SSL further refines the diagonal structure. (<bold>e</bold>) The Proposed Method (OR &#x002B; SSL &#x002B; CL) achieves the most distinct diagonal with minimal off-axis misclassifications, demonstrating superior robustness against label noise</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_72633-fig-5.tif"/>
</fig>
<p>Beyond classification performance, we also addressed the practical requirements for clinical deployment by evaluating inference speed (Runtime) as reported in <xref ref-type="table" rid="table-2">Table 2</xref>. While our hybrid framework introduces a marginal increase in computational cost compared to the baseline (0.00476 s vs. 0.00264 s per image) due to the architectural design, the system remains highly efficient. With an inference speed capable of processing over 200 frames per second, our method proves to be computationally lightweight and well-suited for real-time diagnostic applications.</p>

</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Discussion</title>
<p>The experimental results presented in this study substantiate the efficacy of the proposed cooperative hybrid learning framework in addressing the dual challenges of ordinal severity grading and label noise. A critical analysis of the performance metrics reveals that standard deep learning models often fail to capture the subtle inter-class variations characteristic of dandruff severity, leading to ordinally inconsistent predictions. By contrast, our method explicitly enforces rank consistency through the ordinal regression backbone, which significantly reduces clinically implausible errors. This is visually corroborated by the confusion matrices, where the predictions of our model are densely concentrated along the diagonal. Unlike the baseline model that exhibits dispersed errors across non-adjacent classes, our approach ensures that even when misclassification occurs, the predicted grade remains proximal to the ground truth. This characteristic is vital for clinical decision support systems, as it minimizes the risk of drastic misdiagnosis that could adversely affect treatment planning.</p>
<p>The ablation studies further highlight the specific contributions of the self-supervised partitioning and contrastive learning components. A notable finding is the substantial performance gain in identifying Grade 0 samples. In clinical practice, distinguishing between a healthy scalp (Grade 0) and mild dandruff (Grade 2) is notoriously difficult due to subjective interpretation, resulting in high label noise for these categories. The proposed cooperative partitioning strategy successfully identified and filtered these ambiguous samples, preventing the model from overfitting to inconsistent annotations. Furthermore, the integration of contrastive learning on the noisy subset proved essential. By pulling representations of the same image together while pushing others apart, the model learned robust and discriminative features directly from the visual data without relying on potentially corrupt labels. This synergy explains why our method outperforms complex architectures like Vision Transformers, which, despite their capacity, lack specific mechanisms to handle the ordinal and noisy nature of this specific medical task.</p>
<p>While the proposed framework demonstrates SOTA performance, we acknowledge certain limitations regarding the dataset demographics and imaging modality. The current study utilized a large-scale dataset collected from clinical centers in Taiwan, representing a specific ethnic group with predominantly dark hair and specific scalp characteristics. Consequently, the generalization of the model to other ethnic groups with different hair textures or scalp pigmentations has not yet been empirically verified. Additionally, the study focused exclusively on high-resolution digital microscope images. While this modality provides detailed visual information necessary for fine-grained grading, it limits the immediate applicability of the model to images captured by standard consumer devices or smartphones without dedicated optical attachments. Future research will aim to address these limitations by incorporating multi-ethnic datasets and exploring domain adaptation techniques to extend the framework&#x2019;s applicability across diverse populations and imaging devices.</p>
</sec>
<sec id="s6">
<label>6</label>
<title>Conclusions</title>
<p>This study presents a novel cooperative hybrid learning framework designed to overcome the persistent challenges of label noise and ordinality in automated dandruff severity grading. By synergizing a rank-consistent ordinal regression backbone with a self-supervised sample partitioning strategy, the proposed method effectively filters subjective annotation errors while preserving the intrinsic severity ranking. The incorporation of contrastive learning further enhances feature discrimination on noisy data, ensuring robust performance even when ground truth labels are unreliable. Extensive empirical validation on a large-scale clinical dataset confirms that our approach significantly outperforms state-of-the-art methods in both accuracy and stability. Beyond its theoretical contributions to noisy ordinal learning, this framework holds substantial practical value for dermatological healthcare. It offers a reliable, automated second opinion capable of standardizing diagnostic criteria and reducing inter-rater variability among clinicians. Furthermore, the high inference efficiency of the model supports its feasibility for deployment in real-time clinical decision support systems. To further extend the applicability of this framework, future research will prioritize enhancing generalization capabilities by incorporating diverse multi-ethnic datasets and exploring domain adaptation techniques for smartphone-based image analysis. These advancements will ultimately aim to broaden the accessibility of professional scalp health diagnostics to a wider global population.</p>
</sec>
</body>
<back>
<ack>
<p>This work was partially supported by MacroHI Co., Ltd. through the provisioning of the scalp image dataset. We extend our sincere gratitude to the physicians at Taipei Hospital, Ministry of Health and Welfare and the professional scalp therapists at MacroHI Co., Ltd. for their expert contributions to data annotation and for providing invaluable clinical insights into scalp conditions.</p>
</ack>
<sec>
<title>Funding Statement</title>
<p>The authors received no specific funding for this study.</p>
</sec>
<sec>
<title>Author Contributions</title>
<p>The authors confirm contribution to the paper as follows: conceptualization, Sin-Ye Jhong and Chih-Hsien Hsia; methodology, Sin-Ye Jhong, Chih-Hsien Hsia and Hsin-Hua Huang; software, Hsin-Hua Huang; validation, Hsin-Hua Huang; formal analysis, Sin-Ye Jhong; investigation, Sin-Ye Jhong; resources, Chih-Hsien Hsia and Yung-Yao Chen; data curation, Hui-Che Hsu and Hsin-Hua Huang; writing&#x2014;original draft preparation, Sin-Ye Jhong and Yung-Yao Chen; writing&#x2014;review and editing, Sin-Ye Jhong, Chih-Hsien Hsia and Hui-Che Hsu; visualization, Hui-Che Hsu and Hsin-Hua Huang; supervision, Yulius Harjoseputro and Yung-Yao Chen; project administration, Sin-Ye Jhong and Hui-Che Hsu; funding acquisition, Chih-Hsien Hsia and Yung-Yao Chen. All authors reviewed the results and approved the final version of the manuscript.</p>
</sec>
<sec sec-type="data-availability">
<title>Availability of Data and Materials</title>
<p>The data that support the findings of this study are subject to privacy and commercial restrictions as they were provided by a collaborating hospital and an industry partner. Therefore, the data are not publicly available.</p>
</sec>
<sec>
<title>Ethics Approval</title>
<p>The study was conducted in accordance with the Declaration of Helsinki and approved by the Institutional Review Board of Taipei Hospital, Ministry of Health and Welfare (Protocol No. TH-IRB-0022-0021).</p>
</sec>
<sec>
<title>Informed Consent</title>
<p>Informed consent was obtained from all subjects involved in the study.</p>
</sec>
<sec sec-type="COI-statement">
<title>Conflicts of Interest</title>
<p>The authors declare no conflicts of interest to report regarding the present study.</p>
</sec>
<ref-list content-type="authoryear">
<title>References</title>
<ref id="ref-1"><label>[1]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Mir</surname> <given-names>AN</given-names></string-name>, <string-name><surname>Rizvi</surname> <given-names>DR</given-names></string-name></person-group>. <article-title>Advancements in deep learning and explainable artificial intelligence for enhanced medical image analysis: a comprehensive survey and future directions</article-title>. <source>Eng Appl Artif Intell</source>. <year>2025</year>;<volume>158</volume>:<fpage>111413</fpage>. doi:<pub-id pub-id-type="doi">10.1016/j.engappai.2025.111413</pub-id>.</mixed-citation></ref>
<ref id="ref-2"><label>[2]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Jhong</surname> <given-names>SY</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>PY</given-names></string-name>, <string-name><surname>Hsia</surname> <given-names>CH</given-names></string-name></person-group>. <article-title>An expert smart scalp inspection system using deep learning</article-title>. <source>Sens Mater</source>. <year>2022</year>;<volume>34</volume>(<issue>4</issue>):<fpage>1265</fpage>. doi:<pub-id pub-id-type="doi">10.18494/sam3462</pub-id>.</mixed-citation></ref>
<ref id="ref-3"><label>[3]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Jin</surname> <given-names>YJ</given-names></string-name>, <string-name><surname>Park</surname> <given-names>YS</given-names></string-name>, <string-name><surname>Kang</surname> <given-names>SH</given-names></string-name>, <string-name><surname>Kim</surname> <given-names>DH</given-names></string-name>, <string-name><surname>Lee</surname> <given-names>JY</given-names></string-name></person-group>. <article-title>A study on the development of a web platform for scalp diagnosis using EfficientNet</article-title>. <source>Appl Sci</source>. <year>2024</year>;<volume>14</volume>(<issue>17</issue>):<fpage>7574</fpage>. doi:<pub-id pub-id-type="doi">10.3390/app14177574</pub-id>.</mixed-citation></ref>
<ref id="ref-4"><label>[4]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Ha</surname> <given-names>C</given-names></string-name>, <string-name><surname>Go</surname> <given-names>T</given-names></string-name>, <string-name><surname>Choi</surname> <given-names>W</given-names></string-name></person-group>. <article-title>Intelligent healthcare platform for diagnosis of scalp and hair disorders</article-title>. <source>Appl Sci</source>. <year>2024</year>;<volume>14</volume>(<issue>5</issue>):<fpage>1734</fpage>. doi:<pub-id pub-id-type="doi">10.3390/app14051734</pub-id>.</mixed-citation></ref>
<ref id="ref-5"><label>[5]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Kim</surname> <given-names>M</given-names></string-name>, <string-name><surname>Gil</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Kim</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Kim</surname> <given-names>J</given-names></string-name></person-group>. <article-title>Deep-learning-based scalp image analysis using limited data</article-title>. <source>Electronics</source>. <year>2023</year>;<volume>12</volume>(<issue>6</issue>):<fpage>1380</fpage>. doi:<pub-id pub-id-type="doi">10.3390/electronics12061380</pub-id>.</mixed-citation></ref>
<ref id="ref-6"><label>[6]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Wang</surname> <given-names>J</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>J</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>J</given-names></string-name>, <string-name><surname>Tang</surname> <given-names>D</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>DZ</given-names></string-name>, <string-name><surname>Wu</surname> <given-names>J</given-names></string-name></person-group>. <article-title>A survey on ordinal regression: applications, advances and prospects</article-title>. <comment>arXiv:2503.00952</comment>. <year>2025</year>.</mixed-citation></ref>
<ref id="ref-7"><label>[7]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Song</surname> <given-names>H</given-names></string-name>, <string-name><surname>Kim</surname> <given-names>M</given-names></string-name>, <string-name><surname>Park</surname> <given-names>D</given-names></string-name>, <string-name><surname>Shin</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Lee</surname> <given-names>JG</given-names></string-name></person-group>. <article-title>Learning from noisy labels with deep neural networks: a survey</article-title>. <source>IEEE Trans Neural Netw Learn Syst</source>. <year>2023</year>;<volume>34</volume>(<issue>11</issue>):<fpage>8135</fpage>&#x2013;<lpage>53</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TNNLS.2022.3152527</pub-id>; <pub-id pub-id-type="pmid">35254993</pub-id></mixed-citation></ref>
<ref id="ref-8"><label>[8]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Wang</surname> <given-names>WC</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>LB</given-names></string-name>, <string-name><surname>Chang</surname> <given-names>WJ</given-names></string-name></person-group>. <article-title>Development and experimental evaluation of machine-learning techniques for an intelligent hairy scalp detection system</article-title>. <source>Appl Sci</source>. <year>2018</year>;<volume>8</volume>(<issue>6</issue>):<fpage>853</fpage>. doi:<pub-id pub-id-type="doi">10.3390/app8060853</pub-id>.</mixed-citation></ref>
<ref id="ref-9"><label>[9]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Roy</surname> <given-names>M</given-names></string-name>, <string-name><surname>Protity</surname> <given-names>AT</given-names></string-name></person-group>. <article-title>Hair and scalp disease detection using machine learning and image processing</article-title>. <comment>arXiv:2301.00122</comment>. <year>2023</year>.</mixed-citation></ref>
<ref id="ref-10"><label>[10]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Jhong</surname> <given-names>SY</given-names></string-name>, <string-name><surname>Li</surname> <given-names>GT</given-names></string-name>, <string-name><surname>Hsia</surname> <given-names>CH</given-names></string-name></person-group>. <article-title>An edge-cloud collaborative scalp inspection system based on robust representation learning</article-title>. <source>IEEE Trans Consum Electron</source>. <year>2025</year>;<volume>71</volume>(<issue>1</issue>):<fpage>1551</fpage>&#x2013;<lpage>62</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TCE.2024.3474911</pub-id>.</mixed-citation></ref>
<ref id="ref-11"><label>[11]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Chang</surname> <given-names>WJ</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>LB</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>MC</given-names></string-name>, <string-name><surname>Chiu</surname> <given-names>YC</given-names></string-name>, <string-name><surname>Lin</surname> <given-names>JY</given-names></string-name></person-group>. <article-title>ScalpEye: a deep learning-based scalp hair inspection and diagnosis system for scalp health</article-title>. <source>IEEE Access</source>. <year>2020</year>;<volume>8</volume>:<fpage>134826</fpage>&#x2013;<lpage>37</lpage>. doi:<pub-id pub-id-type="doi">10.1109/ACCESS.2020.3010847</pub-id>.</mixed-citation></ref>
<ref id="ref-12"><label>[12]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Chen</surname> <given-names>LB</given-names></string-name>, <string-name><surname>Chang</surname> <given-names>WJ</given-names></string-name>, <string-name><surname>Chiu</surname> <given-names>YC</given-names></string-name>, <string-name><surname>Huang</surname> <given-names>XR</given-names></string-name></person-group>. <article-title>An efficient scalp inspection and diagnosis system using multiple deep learning-based modules</article-title>. <source>IEEE Can J Electr Comput Eng</source>. <year>2024</year>;<volume>47</volume>(<issue>1</issue>):<fpage>22</fpage>&#x2013;<lpage>35</lpage>. doi:<pub-id pub-id-type="doi">10.1109/icjece.2024.3354291</pub-id>.</mixed-citation></ref>
<ref id="ref-13"><label>[13]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Bacon</surname> <given-names>RA</given-names></string-name>, <string-name><surname>Mizoguchi</surname> <given-names>H</given-names></string-name>, <string-name><surname>Schwartz</surname> <given-names>JR</given-names></string-name></person-group>. <article-title>Assessing therapeutic effectiveness of scalp treatments for dandruff and seborrheic dermatitis, part 1: a reliable and relevant method based on the adherent scalp flaking score (ASFS)</article-title>. <source>J Dermatolog Treat</source>. <year>2014</year>;<volume>25</volume>(<issue>3</issue>):<fpage>232</fpage>&#x2013;<lpage>6</lpage>. doi:<pub-id pub-id-type="doi">10.3109/09546634.2012.687089</pub-id>; <pub-id pub-id-type="pmid">22515728</pub-id></mixed-citation></ref>
<ref id="ref-14"><label>[14]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Niu</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Zhou</surname> <given-names>M</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>L</given-names></string-name>, <string-name><surname>Gao</surname> <given-names>X</given-names></string-name>, <string-name><surname>Hua</surname> <given-names>G</given-names></string-name></person-group>. <article-title>Ordinal regression with multiple output CNN for age estimation</article-title>. In: <conf-name>Proceedings of the 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR); 2016 Jun 27&#x2013;30; Las Vegas, NV, USA</conf-name>. p. <fpage>4920</fpage>&#x2013;<lpage>8</lpage>. doi:<pub-id pub-id-type="doi">10.1109/CVPR.2016.532</pub-id>.</mixed-citation></ref>
<ref id="ref-15"><label>[15]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Cao</surname> <given-names>W</given-names></string-name>, <string-name><surname>Mirjalili</surname> <given-names>V</given-names></string-name>, <string-name><surname>Raschka</surname> <given-names>S</given-names></string-name></person-group>. <article-title>Rank consistent ordinal regression for neural networks with application to age estimation</article-title>. <source>Pattern Recognit Lett</source>. <year>2020</year>;<volume>140</volume>(<issue>24</issue>):<fpage>325</fpage>&#x2013;<lpage>31</lpage>. doi:<pub-id pub-id-type="doi">10.1016/j.patrec.2020.11.008</pub-id>.</mixed-citation></ref>
<ref id="ref-16"><label>[16]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>D&#x00ED;az</surname> <given-names>R</given-names></string-name>, <string-name><surname>Marathe</surname> <given-names>A</given-names></string-name></person-group>. <article-title>Soft labels for ordinal regression</article-title>. In: <conf-name>Proceedings of the 2019 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR); 2019 Jun 15&#x2013;20</conf-name>; <publisher-loc>Long Beach, CA, USA</publisher-loc>. p. <fpage>4733</fpage>&#x2013;<lpage>42</lpage>. doi:<pub-id pub-id-type="doi">10.1109/CVPR.2019.00487</pub-id>.</mixed-citation></ref>
<ref id="ref-17"><label>[17]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Reed</surname> <given-names>S</given-names></string-name>, <string-name><surname>Lee</surname> <given-names>H</given-names></string-name>, <string-name><surname>Anguelov</surname> <given-names>D</given-names></string-name>, <string-name><surname>Szegedy</surname> <given-names>C</given-names></string-name>, <string-name><surname>Erhan</surname> <given-names>D</given-names></string-name>, <string-name><surname>Rabinovich</surname> <given-names>A</given-names></string-name></person-group>. <article-title>Training deep neural networks on noisy labels with bootstrapping</article-title>. <comment>arXiv:1412.6596</comment>. <year>2015</year>.</mixed-citation></ref>
<ref id="ref-18"><label>[18]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Han</surname> <given-names>B</given-names></string-name>, <string-name><surname>Yao</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Yu</surname> <given-names>X</given-names></string-name>, <string-name><surname>Niu</surname> <given-names>G</given-names></string-name>, <string-name><surname>Xu</surname> <given-names>M</given-names></string-name>, <string-name><surname>Hu</surname> <given-names>W</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>Co-teaching: robust training of deep neural networks with extremely noisy labels</article-title>. In: <conf-name>Neural Information Processing Systems; 2018 Dec 3&#x2013;8</conf-name>; <publisher-loc>Montr&#x00E9;al, QC, Canada</publisher-loc>. p. <fpage>8527</fpage>&#x2013;<lpage>37</lpage>.</mixed-citation></ref>
<ref id="ref-19"><label>[19]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Li</surname> <given-names>J</given-names></string-name>, <string-name><surname>Socher</surname> <given-names>R</given-names></string-name>, <string-name><surname>Hoi</surname> <given-names>SCH</given-names></string-name></person-group>. <article-title>DivideMix: learning with noisy labels as semi-supervised learning</article-title>. In: <conf-name>International Conference on Learning Representations; 2020 Apr 26&#x2013;30</conf-name>; <publisher-loc>Addis Ababa</publisher-loc>, <publisher-name>Ethiopia</publisher-name>. p. <fpage>1</fpage>&#x2013;<lpage>14</lpage>.</mixed-citation></ref>
<ref id="ref-20"><label>[20]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Chen</surname> <given-names>T</given-names></string-name>, <string-name><surname>Kornblith</surname> <given-names>S</given-names></string-name>, <string-name><surname>Norouzi</surname> <given-names>M</given-names></string-name>, <string-name><surname>Hinton</surname> <given-names>G</given-names></string-name></person-group>. <article-title>A simple framework for contrastive learning of visual representations</article-title>. In: <conf-name>Proceedings of the International Conference on Machine Learning; 2020 Jul 13&#x2013;18</conf-name>; <publisher-loc>Vienna, Austria</publisher-loc>. p. <fpage>1597</fpage>&#x2013;<lpage>607</lpage>.</mixed-citation></ref>
<ref id="ref-21"><label>[21]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Zhang</surname> <given-names>H</given-names></string-name>, <string-name><surname>Cisse</surname> <given-names>M</given-names></string-name>, <string-name><surname>Dauphin</surname> <given-names>YN</given-names></string-name>, <string-name><surname>Lopez-Paz</surname> <given-names>D</given-names></string-name></person-group>. <article-title>Mixup: beyond empirical risk minimization</article-title>. In: <conf-name>Proceedings of the International Conference on Learning Representations; 2018 Apr 30&#x2013;May 3</conf-name>; <publisher-loc>Vancouver, BC, Canada</publisher-loc>. p. <fpage>1</fpage>&#x2013;<lpage>13</lpage>.</mixed-citation></ref>
</ref-list>
</back></article>