<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1 20151215//EN" "http://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xml:lang="en" article-type="research-article" dtd-version="1.1">
<front>
<journal-meta>
<journal-id journal-id-type="pmc">CMC</journal-id>
<journal-id journal-id-type="nlm-ta">CMC</journal-id>
<journal-id journal-id-type="publisher-id">CMC</journal-id>
<journal-title-group>
<journal-title>Computers, Materials &#x0026; Continua</journal-title>
</journal-title-group>
<issn pub-type="epub">1546-2226</issn>
<issn pub-type="ppub">1546-2218</issn>
<publisher>
<publisher-name>Tech Science Press</publisher-name>
<publisher-loc>USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">63880</article-id>
<article-id pub-id-type="doi">10.32604/cmc.2025.063880</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Article</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Addressing Class Overlap in Sonic Hedgehog Medulloblastoma Molecular Subtypes Classification Using Under-Sampling and SVD-Enhanced Multinomial Regression</article-title>
<alt-title alt-title-type="left-running-head">Addressing Class Overlap in Sonic Hedgehog Medulloblastoma Molecular Subtypes Classification Using Under-Sampling and SVD-Enhanced Multinomial Regression</alt-title>
<alt-title alt-title-type="right-running-head">Addressing Class Overlap in Sonic Hedgehog Medulloblastoma Molecular Subtypes Classification Using Under-Sampling and SVD-Enhanced Multinomial Regression</alt-title>
</title-group>
<contrib-group>
<contrib id="author-1" contrib-type="author">
<name name-style="western"><surname>Mohammed</surname><given-names>Isra</given-names></name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
<contrib id="author-2" contrib-type="author">
<name name-style="western"><surname>Musa</surname><given-names>Mohamed Elhafiz M.</given-names></name><xref ref-type="aff" rid="aff-2">2</xref></contrib>
<contrib id="author-3" contrib-type="author" corresp="yes">
<name name-style="western"><surname>Elbashir</surname><given-names>Murtada K.</given-names></name><xref ref-type="aff" rid="aff-3">3</xref><email>mkelfaki@ju.edu.sa</email></contrib>
<contrib id="author-4" contrib-type="author">
<name name-style="western"><surname>Mostafa</surname><given-names>Ayman Mohamed</given-names></name><xref ref-type="aff" rid="aff-3">3</xref></contrib>
<contrib id="author-5" contrib-type="author">
<name name-style="western"><surname>Adam</surname><given-names>Amin Ibrahim</given-names></name><xref ref-type="aff" rid="aff-4">4</xref></contrib>
<contrib id="author-6" contrib-type="author">
<name name-style="western"><surname>Mahmood</surname><given-names>Mahmood A.</given-names></name><xref ref-type="aff" rid="aff-3">3</xref></contrib>
<contrib id="author-7" contrib-type="author">
<name name-style="western"><surname>Faggad</surname><given-names>Areeg S.</given-names></name><xref ref-type="aff" rid="aff-5">5</xref></contrib>
<aff id="aff-1"><label>1</label><institution>Department of Statistics, Faculty of Mathematical and Computer Sciences, University of Gezira</institution>, <addr-line>Wad Madani, 21113</addr-line>, <country>Sudan</country></aff>
<aff id="aff-2"><label>2</label><institution>Department of Computer Science, College of Computer and Information Sciences, Jouf University</institution>, <addr-line>Sakaka, 72388</addr-line>, <country>Saudi Arabia</country></aff>
<aff id="aff-3"><label>3</label><institution>Department of Information Systems, College of Computer and Information Sciences, Jouf University</institution>, <addr-line>Sakaka, 72388</addr-line>, <country>Saudi Arabia</country></aff>
<aff id="aff-4"><label>4</label><institution>Department of Statistics, Faculty of Economic and Social Studies, Omdurman Islamic University</institution>, <addr-line>Khartoum, 11111</addr-line>, <country>Sudan</country></aff>
<aff id="aff-5"><label>5</label><institution>Department of Molecular Biology, National Cancer Institute, University of Gezira</institution>, <addr-line>Wad Madani, 21113</addr-line>, <country>Sudan</country></aff>
</contrib-group>
<author-notes>
<corresp id="cor1"><label>&#x002A;</label>Corresponding Author: Murtada K. Elbashir. Email: <email>mkelfaki@ju.edu.sa</email></corresp>
</author-notes>
<pub-date date-type="collection" publication-format="electronic">
<year>2025</year>
</pub-date>
<pub-date date-type="pub" publication-format="electronic">
<day>03</day><month>07</month><year>2025</year>
</pub-date>
<volume>84</volume>
<issue>2</issue>
<fpage>3749</fpage>
<lpage>3763</lpage>
<history>
<date date-type="received">
<day>27</day>
<month>1</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>29</day>
<month>4</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>&#x00A9; 2025 The Authors.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Published by Tech Science Press.</copyright-holder>
<license xlink:href="https://creativecommons.org/licenses/by/4.0/">
<license-p>This work is licensed under a <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="TSP_CMC_63880.pdf"></self-uri>
<abstract>
<p>Sonic Hedgehog Medulloblastoma (SHH-MB) is one of the four primary molecular subgroups of Medulloblastoma. It is estimated to be responsible for nearly one-third of all MB cases. Using transcriptomic and DNA methylation profiling techniques, new developments in this field determined four molecular subtypes for SHH-MB. SHH-MB subtypes show distinct DNA methylation patterns that allow their discrimination from overlapping subtypes and predict clinical outcomes. Class overlapping occurs when two or more classes share common features, making it difficult to distinguish them as separate. Using the DNA methylation dataset, a novel classification technique is presented to address the issue of overlapping SHH-MB subtypes. Penalized multinomial regression (PMR), Tomek links (TL), and singular value decomposition (SVD) were all smoothly integrated into a single framework. SVD and group lasso improve computational efficiency, address the problem of high-dimensional datasets, and clarify class distinctions by removing redundant or irrelevant features that might lead to class overlap. As a method to eliminate the issues of decision boundary overlap and class imbalance in the classification task, TL enhances dataset balance and increases the clarity of decision boundaries through the elimination of overlapping samples. Using fivefold cross-validation, our proposed method (TL-SVDPMR) achieved a remarkable overall accuracy of almost 95% in the classification of SHH-MB molecular subtypes. The results demonstrate the strong performance of the proposed classification model among the various SHH-MB subtypes given a high average of the area under the curve (AUC) values. Additionally, the statistical significance test indicates that TL-SVDPMR is more accurate than both SVM and random forest algorithms in classifying the overlapping SHH-MB subtypes, highlighting its importance for precision medicine applications. 
Our findings emphasized the success of combining SVD, TL, and PMR techniques to improve the classification performance for biomedical applications with many features and overlapping subtypes.</p>
</abstract>
<kwd-group kwd-group-type="author">
<kwd>Class overlap</kwd>
<kwd>SHH-MB molecular subtypes</kwd>
<kwd>under-sampling</kwd>
<kwd>singular value decomposition</kwd>
<kwd>penalized multinomial regression</kwd>
<kwd>DNA methylation profiles</kwd>
</kwd-group>
<funding-group>
<award-group id="awg1">
<funding-source>Jouf University</funding-source>
<award-id>DGSSR-2024-02-01137</award-id>
</award-group>
</funding-group>
</article-meta>
</front>
<body>
<sec id="s1">
<label>1</label>
<title>Introduction</title>
<p>Medulloblastoma (MB) starts in the cerebellum, the part of the brain crucial for voluntary muscle movements like balance and motor control. MB is the most prevalent malignant brain tumor in children, even though it can also happen in adults [<xref ref-type="bibr" rid="ref-1">1</xref>]. The standard treatment regimen for MB, which includes surgical resection, radiation therapy, and chemotherapy, has resulted in favorable short-term outcomes. The pressing need for less toxic and more targeted therapy calls for a greater understanding of the heterogeneity within and amongst MB tumors. The Sonic Hedgehog MB (SHH-MB) molecular subgroup represents 30% of cases and is characterized by an overactive Sonic Hedgehog pathway caused by acquired or inherited mutations [<xref ref-type="bibr" rid="ref-2">2</xref>]. Distinct subtypes of SHH-MB have been established with the advancement of molecular studies using transcriptomics and DNA methylation profiling methods. There are probably four molecular subtypes of SHH-MB: SHH_alpha, SHH_beta, SHH_delta, and SHH_gamma [<xref ref-type="bibr" rid="ref-3">3</xref>,<xref ref-type="bibr" rid="ref-4">4</xref>]. These classifications could enhance personalized plans of therapy and improve patient outcomes by improving the accuracy of predictions. Recent research into SHH-MB has significantly advanced our understanding of the molecular subtypes of the disease and their implications for tumor biology and therapy. Recent studies contributed noteworthy conclusions. A comprehensive review of current developments in SHH-MB molecular analysis has underlined significant intertumoral variability among SHH-MB tumors and led to the identification of new subtypes with distinct biochemical and clinical features [<xref ref-type="bibr" rid="ref-4">4</xref>]. 
The authors of [<xref ref-type="bibr" rid="ref-5">5</xref>] emphasized the combination of computational analysis with single-nucleus RNA sequencing to investigate the relationships between tumor heterogeneity and developmental trajectories in SHH MBs. The study emphasized the potential for differentiated treatment of SHH-MBs by identifying cell types associated with the various phases of granule neuron development. Investigating 96 samples of SHH-MB that were identified using RNA sequencing, targeted DNA sequencing, and genome-wide DNA methylation profiling, the authors of [<xref ref-type="bibr" rid="ref-2">2</xref>] detected molecular subtypes of this tumor and assessed their prognostic relevance. Their study concluded that adult MBs, although histologically uniform, demonstrate remarkable molecular heterogeneity. Furthermore, they deduced that the identification of two distinct molecular subtypes was integral to understanding their disparate clinical behaviors and outcomes. A cohort of 250 human SHH-MB samples was analyzed by strand-specific RNA sequencing with the incorporation of DNA methylation and whole-genome sequencing to investigate their molecular subtypes and underlying biology. The findings support the presence of four clinical subtypes based on presentation and age: SHH_alpha, SHH_beta, SHH_gamma, and SHH_delta [<xref ref-type="bibr" rid="ref-6">6</xref>].</p>
<p>Several factors influence the performance of any classification model, particularly in the case of multiclass classification. The effectiveness of machine learning algorithms in practical settings may be hampered by class imbalances, but this effect is dependent on class overlap as well [<xref ref-type="bibr" rid="ref-7">7</xref>]. Class overlap occurs when samples from different classes exist in common areas within the feature space, thereby creating ambiguity in the decision domains [<xref ref-type="bibr" rid="ref-8">8</xref>]. The underlying classifier&#x2019;s overall performance gets worse with the increasing level of class overlap since it incorrectly classifies the samples along the boundary line [<xref ref-type="bibr" rid="ref-9">9</xref>]. Different strategies have been proposed to address the class overlap issue in the classification problem. An under-sampling-based supervised learning was created by the authors of [<xref ref-type="bibr" rid="ref-10">10</xref>] to address the class imbalance in binary datasets, where there is a class overlap issue. Their proposed under-sampling framework effectively addressed the challenge of class imbalance and class overlap in binary datasets. The visibility of the minority class was enhanced by identifying and removing overlapping majority-class samples. Their suggested method&#x2019;s sensitivity was comparable to the state-of-the-art approaches. Wang et al. [<xref ref-type="bibr" rid="ref-11">11</xref>] introduced Extreme SMOTE and Synchronous Sampling Learning Methods as effective methods for improving financial distress detection in listed companies that have issues with class imbalance and class overlap. Their proposed method, while focusing on refining decision boundaries and optimizing company selection, significantly improved the classification performance of various machine learning algorithms. An under-sampling algorithm based on random forest cleaning rule (RFCL) was introduced by Zhang et al. 
[<xref ref-type="bibr" rid="ref-12">12</xref>]. It provides a solid solution for managing class imbalance and class overlap in classification problems. By effectively defining and applying a new decision boundary, RFCL enhances model performance, outperforming other under-sampling methods. Entropy and neighborhood-based under-sampling (ENU), a method recently developed by Kumar et al. [<xref ref-type="bibr" rid="ref-13">13</xref>], was intended to remove majority-class samples from the overlapping regions to manage class overlap in classification tasks. ENU computes entropy scores for the majority-class samples and establishes a threshold to determine which samples can be removed. The results indicate that ENU significantly outperforms several state-of-the-art methods across various metrics.</p>
<p>The earlier studies on class overlap in the classification task have certain drawbacks. Their primary focus is on binary classification tasks. Multiclass classification, however, adds more complexity, which, in turn, requires further research and customized methods. In addition, although certain studies remove majority-class samples from areas where they overlap with samples from other classes, the choice of distance measure can change how the overlapping regions are perceived [<xref ref-type="bibr" rid="ref-10">10</xref>,<xref ref-type="bibr" rid="ref-13">13</xref>]. Moreover, certain studies overlook the potential risk of overfitting that arises from oversampling specific types of samples, particularly when those samples are excessively similar to the training set. In our previous study, we successfully classified imbalanced MB subgroups using singular value decomposition-based penalized multinomial regression (SVDPMR) [<xref ref-type="bibr" rid="ref-14">14</xref>]. In the present study, we aimed at further sub-classifying a main molecular subgroup of MB, namely SHH-MB, by applying Tomek links-based SVDPMR (TL-SVDPMR) to analyze DNA methylation probe features from samples of SHH-MB patients. The penalized multinomial regression (PMR) model is helpful for multiclass classification tasks, particularly when working with high-dimensional data. The PMR model is a penalized generalized linear model that is fitted by minimizing the penalized loss function. Numerical techniques are employed to estimate the coefficients in the PMR model. We employed SVD and group lasso as dimensionality reduction techniques. SVD created a new low-dimensional input feature space from the training set. Then, the group lasso selected features that distinctly represent each SHH-MB subtype. Tomek Links (TL) finds the sample pairings of different subtypes that are closest to one another [<xref ref-type="bibr" rid="ref-15">15</xref>]. 
By eliminating the sample from the majority subtype in these pairs, TL helps to improve model performance and decision boundaries by removing samples that lead to class overlap. By reducing the number of features and selectively decreasing samples from the majority classes, these methods could enhance the clarity of class boundaries and improve the performance of the classification model. Our ultimate objective was to achieve high performance in correctly distinguishing the four subtypes within SHH-MB with less error.</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Materials and Methods</title>
<sec id="s2_1">
<label>2.1</label>
<title>Dataset</title>
<p>This study&#x2019;s dataset represents the genome-wide DNA methylation profiling of 763 primary MB samples. Each MB sample has 321,174 DNA methylation probe features. The samples were separated into subtypes within the four main molecular subgroups of MB. We chose a subset of the methylation dataset that includes only the four SHH-MB subtypes. The dataset is found on Gene Expression Omnibus with accession number GSE85212.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Data Splitting</title>
<p>We used stratified fivefold cross-validation to ensure the generalization of the proposed models to the unseen samples. This resampling method involves randomly splitting the SHH-MB samples into five groups that are roughly equal in size. One fold is used as a test set, while the remaining four folds are used to fit the proposed method, and the performance of the fitted models is recorded based on the test set. This procedure is repeated until each of the five folds has been used as a test set. Consequently, there are five different training sets (Training Set 1, Training Set 2, Training Set 3, Training Set 4, and Training Set 5) in fivefold cross-validation and a corresponding testing set for each training set. To perform fivefold cross-validation, we used a k-fold function from the Bimba R package, version 1.3.14. The model&#x2019;s performance metrics were then calculated as the average of the reported scores. <xref ref-type="table" rid="table-1">Table 1</xref> shows the distribution (count) of SHH-MB subtypes for each training set and the whole methylation dataset. There is an unequal distribution of SHH-MB samples among their subtypes in each training set and high feature dimensionality (321,174 features).</p>
<table-wrap id="table-1">
<label>Table 1</label>
<caption>
<title>SHH-MB subtypes distribution (count) for the whole methylation dataset and for the training datasets</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th align="center">Subtypes within SHH</th>
<th align="center">The methylation dataset</th>
<th align="center">Training set 1</th>
<th align="center">Training set 2</th>
<th align="center">Training set 3</th>
<th align="center">Training set 4</th>
<th align="center">Training set 5</th>
</tr>
</thead>
<tbody>
<tr>
<td>SHH_alpha</td>
<td>65</td>
<td>52</td>
<td>52</td>
<td>52</td>
<td>52</td>
<td>52</td>
</tr>
<tr>
<td>SHH_beta</td>
<td>35</td>
<td>28</td>
<td>28</td>
<td>28</td>
<td>28</td>
<td>28</td>
</tr>
<tr>
<td>SHH_delta</td>
<td>76</td>
<td>61</td>
<td>61</td>
<td>60</td>
<td>61</td>
<td>61</td>
</tr>
<tr>
<td>SHH_gamma</td>
<td>47</td>
<td>38</td>
<td>37</td>
<td>38</td>
<td>37</td>
<td>38</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Proposed Method</title>
<p>The proposed strategy combined various techniques to solve the class overlap problem in classifying SHH-MB subtypes, as presented in <xref ref-type="fig" rid="fig-1">Fig. 1</xref>. In the first step, SVD was applied to the training set to map the original input feature space to a new low-dimensional input feature space. SVD is used to overcome the impact of correlated or redundant features on the classification method&#x2019;s performance. Moreover, it reduces the dimension of the training set while preserving the most important information. Second, by undersampling the majority subtypes in the area where they overlap with the minority subtypes, TL improves the clarity of the decision boundaries separating the SHH-MB subtypes. Third, PMR was implemented on the standardized new input feature space to create the classification models. In this step, group lasso, as a penalization method, plays a crucial role in selecting features that can help clarify the class boundaries between SHH-MB subtypes. Finally, we evaluated the classification models on the testing sets using standard evaluation metrics.</p>
<fig id="fig-1">
<label>Figure 1</label>
<caption>
<title>Flow chart of the proposed method for SHH-MB subtypes classification</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_63880-fig-1.tif"/>
</fig>
<sec id="s2_3_1">
<label>2.3.1</label>
<title>Singular Value Decomposition (SVD)</title>
<p>SVD can be used to decompose any real matrix. It extends the eigenvalue decomposition of a positive semidefinite normal matrix to any <inline-formula id="ieqn-1"><mml:math id="mml-ieqn-1"><mml:mi>n</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>p</mml:mi></mml:math></inline-formula> matrix. The singular value decomposition of the target dataset <inline-formula id="ieqn-2"><mml:math id="mml-ieqn-2"><mml:mo stretchy="false">(</mml:mo><mml:mi>A</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> is given by
<disp-formula id="eqn-1"><label>(1)</label><mml:math id="mml-eqn-1" display="block"><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>U</mml:mi><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:msubsup><mml:mi>V</mml:mi><mml:mrow><mml:mi>p</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msubsup></mml:math></disp-formula>where <inline-formula id="ieqn-3"><mml:math id="mml-ieqn-3"><mml:mi>d</mml:mi><mml:mo>&#x2264;</mml:mo><mml:mi>m</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>n</mml:mi><mml:mo>,</mml:mo><mml:mi>p</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> is the number of the new features, <inline-formula id="ieqn-4"><mml:math id="mml-ieqn-4"><mml:mi>n</mml:mi></mml:math></inline-formula> is the number of SHH-MB samples, <inline-formula id="ieqn-5"><mml:math id="mml-ieqn-5"><mml:mi>p</mml:mi></mml:math></inline-formula> is the number of DNA methylation probe features, <inline-formula id="ieqn-6"><mml:math id="mml-ieqn-6"><mml:mi>S</mml:mi></mml:math></inline-formula> is a diagonal matrix with the singular values organized in descending order, the columns of <inline-formula id="ieqn-7"><mml:math id="mml-ieqn-7"><mml:mi>U</mml:mi></mml:math></inline-formula> and <inline-formula id="ieqn-8"><mml:math id="mml-ieqn-8"><mml:mi>V</mml:mi></mml:math></inline-formula> are the eigenvectors of <inline-formula id="ieqn-9"><mml:math id="mml-ieqn-9"><mml:mi>A</mml:mi><mml:msup><mml:mi>A</mml:mi><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> and <inline-formula id="ieqn-10"><mml:math 
id="mml-ieqn-10"><mml:msup><mml:mi>A</mml:mi><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup><mml:mi>A</mml:mi></mml:math></inline-formula>, respectively. The new reduced matrix <italic>X</italic> can be found by:
<disp-formula id="eqn-2"><label>(2)</label><mml:math id="mml-eqn-2" display="block"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:mo>&#x00D7;</mml:mo><mml:msub><mml:mi>V</mml:mi><mml:mrow><mml:mi>p</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mo>=</mml:mo><mml:msub><mml:mi>U</mml:mi><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>The explained variances of all the newly added features up to a particular feature are added together to create cumulative explained variance [<xref ref-type="bibr" rid="ref-14">14</xref>,<xref ref-type="bibr" rid="ref-16">16</xref>]. For the application of the SVD method to the methylation dataset, we used the SVD function in the base R package (version 0.5.2).</p>
</sec>
<sec id="s2_3_2">
<label>2.3.2</label>
<title>Undersampling-Based Tomek Links</title>
<p>The Euclidean distance between two samples, <italic>x</italic> and <italic>y</italic>, which belong to different classes, is represented by <inline-formula id="ieqn-11"><mml:math id="mml-ieqn-11"><mml:mi>d</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>. If there is no sample <italic>z</italic> from any class such that <inline-formula id="ieqn-12"><mml:math id="mml-ieqn-12"><mml:mi>d</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>z</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x003C;</mml:mo><mml:mi>d</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> or <inline-formula id="ieqn-13"><mml:math id="mml-ieqn-13"><mml:mi>d</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>y</mml:mi><mml:mo>,</mml:mo><mml:mi>z</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x003C;</mml:mo><mml:mi>d</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>, then the (<italic>x</italic>, <italic>y</italic>) pair is referred to as a Tomek link (TL). If two examples form a Tomek link, either one of them is noise or both are borderline. Tomek links can be utilized as a data-cleaning approach or as an under-sampling strategy. As a data-cleaning strategy, examples from both classes are removed, while as an under-sampling technique, only examples from the majority class are removed [<xref ref-type="bibr" rid="ref-17">17</xref>,<xref ref-type="bibr" rid="ref-18">18</xref>]. We used the identify_tomek_links function from the bimba R package, version 0.0.0.9000, to apply the under-sampling technique based on TL to the SHH-MB samples.</p>
</sec>
<sec id="s2_3_3">
<label>2.3.3</label>
<title>Penalized Multinomial Regression (PMR)</title>
<p>Consider a design matrix <inline-formula id="ieqn-14"><mml:math id="mml-ieqn-14"><mml:mi mathvariant="bold-italic">X</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>d</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> where <inline-formula id="ieqn-15"><mml:math id="mml-ieqn-15"><mml:mi>n</mml:mi></mml:math></inline-formula> is the number of the SHH-MB samples, <inline-formula id="ieqn-16"><mml:math id="mml-ieqn-16"><mml:mi>d</mml:mi></mml:math></inline-formula> is the number of the transformed probe features by SVD, and <inline-formula id="ieqn-17"><mml:math id="mml-ieqn-17"><mml:msub><mml:mi mathvariant="bold-italic">x</mml:mi><mml:mrow><mml:mrow><mml:mtext>i</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> is a row vector in <inline-formula id="ieqn-18"><mml:math id="mml-ieqn-18"><mml:mi mathvariant="bold-italic">X</mml:mi></mml:math></inline-formula>. 
Additionally, let <italic>y</italic> be a discrete outcomes vector that denotes the SHH-MB subtypes and <inline-formula id="ieqn-19"><mml:math id="mml-ieqn-19"><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>J</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> takes the value <inline-formula id="ieqn-20"><mml:math id="mml-ieqn-20"><mml:mn>1</mml:mn></mml:math></inline-formula> if the <inline-formula id="ieqn-21"><mml:math id="mml-ieqn-21"><mml:mi>i</mml:mi><mml:mrow><mml:mtext>th</mml:mtext></mml:mrow></mml:math></inline-formula> SHH-MB sample falls in the <inline-formula id="ieqn-22"><mml:math id="mml-ieqn-22"><mml:mi>j</mml:mi><mml:mrow><mml:mtext>th</mml:mtext></mml:mrow></mml:math></inline-formula> SHH-MB subtype and <inline-formula id="ieqn-23"><mml:math id="mml-ieqn-23"><mml:mn>0</mml:mn></mml:math></inline-formula> otherwise, where <inline-formula id="ieqn-24"><mml:math id="mml-ieqn-24"><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> acting as an indicator variable, <inline-formula id="ieqn-25"><mml:math id="mml-ieqn-25"><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:mi>n</mml:mi></mml:math></inline-formula>, and <inline-formula id="ieqn-26"><mml:math 
id="mml-ieqn-26"><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:mi>J</mml:mi></mml:math></inline-formula>. Furthermore, let <inline-formula id="ieqn-27"><mml:math id="mml-ieqn-27"><mml:msub><mml:mi>&#x03C0;</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>&#x03C0;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x03C0;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x03C0;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>J</mml:mi></mml:mrow></mml:msub><mml:msup><mml:mo stretchy="false">)</mml:mo><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> where <inline-formula id="ieqn-28"><mml:math id="mml-ieqn-28"><mml:msub><mml:mi>&#x03C0;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>P</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> represents the probability that the <inline-formula id="ieqn-29"><mml:math id="mml-ieqn-29"><mml:mi>i</mml:mi><mml:mrow><mml:mtext>th</mml:mtext></mml:mrow></mml:math></inline-formula> SHH-MB sample falls in the <inline-formula id="ieqn-30"><mml:math id="mml-ieqn-30"><mml:mi>j</mml:mi><mml:mrow><mml:mtext>th</mml:mtext></mml:mrow></mml:math></inline-formula> SHH-MB subtype and <inline-formula id="ieqn-31"><mml:math 
id="mml-ieqn-31"><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>J</mml:mi></mml:mrow></mml:munderover><mml:msub><mml:mi>&#x03C0;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:math></inline-formula> [<xref ref-type="bibr" rid="ref-14">14</xref>]. Suppose:
<disp-formula id="eqn-3"><label>(3)</label><mml:math id="mml-eqn-3" display="block"><mml:msub><mml:mi>&#x03C0;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>P</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>j</mml:mi><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:msub><mml:mi mathvariant="bold-italic">x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>exp</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:msub><mml:mi>&#x03B7;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>}</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>J</mml:mi></mml:mrow></mml:munderover><mml:mi>exp</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:msub><mml:mi>&#x03B7;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:math></disp-formula>
<disp-formula id="eqn-4"><label>(4)</label><mml:math id="mml-eqn-4" display="block"><mml:msub><mml:mi>&#x03B7;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x03B2;</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x03B2;</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mi>&#x03B2;</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></disp-formula>where <inline-formula id="ieqn-32"><mml:math id="mml-ieqn-32"><mml:msub><mml:mi>&#x03B2;</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> is a vector of regression coefficient, <inline-formula id="ieqn-33"><mml:math id="mml-ieqn-33"><mml:mi>&#x03B2;</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:msup><mml:mi>&#x03B2;</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mi>&#x03B2;</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mn>2</mml:mn><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msup><mml:mi>&#x03B2;</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>d</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi 
mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>J</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x00D7;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mi>d</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msup></mml:math></inline-formula>, <inline-formula id="ieqn-34"><mml:math id="mml-ieqn-34"><mml:msup><mml:mi>&#x03B2;</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>k</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mi>J</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, and <inline-formula id="ieqn-35"><mml:math id="mml-ieqn-35"><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:mi>d</mml:mi></mml:math></inline-formula>. Assuming that <inline-formula id="ieqn-36"><mml:math id="mml-ieqn-36"><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> are independent when each of them is conditioned on its own <inline-formula id="ieqn-37"><mml:math id="mml-ieqn-37"><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula id="ieqn-38"><mml:math id="mml-ieqn-38"><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>J</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mi 
mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> follows a multinomial distribution with probability <inline-formula id="ieqn-39"><mml:math id="mml-ieqn-39"><mml:msub><mml:mi>&#x03C0;</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>&#x03C0;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x03C0;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x03C0;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>J</mml:mi></mml:mrow></mml:msub><mml:msup><mml:mo stretchy="false">)</mml:mo><mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> the log-likelihood is:
<disp-formula id="eqn-5"><label>(5)</label><mml:math id="mml-eqn-5" display="block"><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mi>L</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>&#x03B2;</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:munderover><mml:mrow><mml:mo>[</mml:mo><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>J</mml:mi></mml:mrow></mml:munderover><mml:msub><mml:mrow><mml:mtext>y</mml:mtext></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>&#x03B7;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>J</mml:mi></mml:mrow></mml:munderover><mml:msup><mml:mi>e</mml:mi><mml:mrow><mml:mrow><mml:msub><mml:mi>&#x03B7;</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:msup><mml:mo>)</mml:mo></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>The value of <inline-formula id="ieqn-40"><mml:math id="mml-ieqn-40"><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mi>L</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>&#x03B2;</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> in <xref ref-type="disp-formula" rid="eqn-5">Eq. (5)</xref> is the sum, over all samples, of the log-probability that each sample belongs to its true SHH-MB subtype.</p>
<p>Regularization techniques use penalization, or the penalized loss function, to address the statistical difficulties of high-dimensional data. It is possible to partition the high-dimensional parameter matrix <inline-formula id="ieqn-41"><mml:math id="mml-ieqn-41"><mml:mi>&#x03B2;</mml:mi></mml:math></inline-formula> &#x003D; <inline-formula id="ieqn-42"><mml:math id="mml-ieqn-42"><mml:mrow><mml:mo>(</mml:mo><mml:msup><mml:mi>&#x03B2;</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mi>&#x03B2;</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mn>2</mml:mn><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msup><mml:mi>&#x03B2;</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>d</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> into <inline-formula id="ieqn-43"><mml:math id="mml-ieqn-43"><mml:mi>d</mml:mi></mml:math></inline-formula> groups, each of which has <inline-formula id="ieqn-44"><mml:math id="mml-ieqn-44"><mml:mi>J</mml:mi></mml:math></inline-formula> parameters. &#x201C;Group lasso&#x201D; is the name of this regularization technique [<xref ref-type="bibr" rid="ref-19">19</xref>]. The group lasso provides a sparse collection of groups; that is, if a group is included in the model, all of its parameters will be nonzero. The parameter matrix is estimated as a minimizer of the group lasso penalized negative-log-likelihood.
<disp-formula id="eqn-6"><label>(6)</label><mml:math id="mml-eqn-6" display="block"><mml:msup><mml:mrow><mml:mover><mml:mi>&#x03B2;</mml:mi><mml:mo stretchy="false">&#x005E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>g</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:munder><mml:mrow><mml:mi>arg</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mi>min</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x03B2;</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mi>J</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>d</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:munder><mml:mrow><mml:mo>[</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mi>L</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>&#x03B2;</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mi>&#x03BB;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:munderover><mml:msub><mml:mrow><mml:mo symmetric="true">&#x2016;</mml:mo><mml:msub><mml:mi>&#x03B2;</mml:mi><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo symmetric="true">&#x2016;</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>The penalty term, written as <inline-formula id="ieqn-45"><mml:math id="mml-ieqn-45"><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:munderover><mml:msub><mml:mrow><mml:mo symmetric="true">&#x2016;</mml:mo><mml:msub><mml:mi>&#x03B2;</mml:mi><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo symmetric="true">&#x2016;</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>, is the sum over all groups of each group&#x2019;s L2 norm, i.e., the square root of the sum of the squared parameter values in that group. It is possible to select the penalty parameter &#x03BB; during implementation [<xref ref-type="bibr" rid="ref-20">20</xref>].</p>
<p>Coordinate descent was used to get the estimate of coefficients in the penalized negative-log-likelihood problem in <xref ref-type="disp-formula" rid="eqn-6">Eq. (6)</xref> [<xref ref-type="bibr" rid="ref-21">21</xref>]. We utilized the fit function from the msgl R package, version 2.3.9, to perform PMR.</p>
</sec>
<sec id="s2_3_4">
<label>2.3.4</label>
<title>Performance Metrics</title>
<p>Different performance metrics were employed to determine the effectiveness of our proposed classification model on unseen samples. Overall accuracy (ACC), specificity, recall, area under the curve (AUC), NLL, and the 95% confidence interval (CI) for ACC are some of these performance metrics [<xref ref-type="bibr" rid="ref-14">14</xref>,<xref ref-type="bibr" rid="ref-22">22</xref>,<xref ref-type="bibr" rid="ref-23">23</xref>]. Four values are crucial to understand to assess the classification model&#x2019;s performance: (1) the number of samples that belong to a class and are accurately predicted is known as the class&#x2019;s TPs (<inline-formula id="ieqn-46"><mml:math id="mml-ieqn-46"><mml:mi>T</mml:mi><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>). (2) <inline-formula id="ieqn-47"><mml:math id="mml-ieqn-47"><mml:mi>F</mml:mi><mml:mi>P</mml:mi><mml:mi>s</mml:mi></mml:math></inline-formula> of a class (<inline-formula id="ieqn-48"><mml:math id="mml-ieqn-48"><mml:mi>F</mml:mi><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>) is the number of samples that are not related to that class but are mistakenly anticipated to be so. (3) <inline-formula id="ieqn-49"><mml:math id="mml-ieqn-49"><mml:mi>T</mml:mi><mml:mi>N</mml:mi><mml:mi>s</mml:mi></mml:math></inline-formula> of a class <inline-formula id="ieqn-50"><mml:math id="mml-ieqn-50"><mml:mo stretchy="false">(</mml:mo><mml:mi>T</mml:mi><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> is the number of samples that do not belong to a particular class and are accurately predicted to be non-class members. 
(4) <inline-formula id="ieqn-51"><mml:math id="mml-ieqn-51"><mml:mi>F</mml:mi><mml:mi>N</mml:mi><mml:mi>s</mml:mi></mml:math></inline-formula> of a class <inline-formula id="ieqn-52"><mml:math id="mml-ieqn-52"><mml:mo stretchy="false">(</mml:mo><mml:mi>F</mml:mi><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> is the number of samples that belong to a class but are mistakenly predicted to be outside of that class. The percentage of correctly identified samples, or ACC, is as follows:
<disp-formula id="eqn-7"><label>(7)</label><mml:math id="mml-eqn-7" display="block"><mml:mi>A</mml:mi><mml:mi>C</mml:mi><mml:mi>C</mml:mi><mml:mo>=</mml:mo><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>J</mml:mi></mml:mrow></mml:munderover><mml:mi>T</mml:mi><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:mi>N</mml:mi></mml:math></disp-formula>where <inline-formula id="ieqn-53"><mml:math id="mml-ieqn-53"><mml:mi>N</mml:mi></mml:math></inline-formula> is the number of predictions. Class <inline-formula id="ieqn-54"><mml:math id="mml-ieqn-54"><mml:mi>j</mml:mi></mml:math></inline-formula> recall, also known as sensitivity, is the percentage of all class <inline-formula id="ieqn-55"><mml:math id="mml-ieqn-55"><mml:mi>j</mml:mi></mml:math></inline-formula> samples that were correctly identified and it is computed as:
<disp-formula id="eqn-8"><label>(8)</label><mml:math id="mml-eqn-8" display="block"><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>j</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>T</mml:mi><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>T</mml:mi><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:math></disp-formula></p>
<p>The following formula is used to calculate the specificity of class <inline-formula id="ieqn-56"><mml:math id="mml-ieqn-56"><mml:mi>j</mml:mi></mml:math></inline-formula>, which is the percentage of samples from the other classes that were correctly classified as not belonging to that class.
<disp-formula id="eqn-9"><label>(9)</label><mml:math id="mml-eqn-9" display="block"><mml:mi>S</mml:mi><mml:mi>p</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>f</mml:mi><mml:mi>i</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi><mml:mi>y</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mi>j</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>T</mml:mi><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo>/</mml:mo></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mi>T</mml:mi><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>To determine the ACC and 95% CI for our classification approach, we utilized the confusionMatrix function from the caret R package, version 6.0.93. Recall and specificity were obtained by using ml_test from the mltest package, version 1.0.1. Furthermore, a fitted model&#x2019;s residual deviance is equal to minus twice its log-likelihood, i.e., &#x2212;2 log-likelihood (NLL) in <xref ref-type="disp-formula" rid="eqn-6">Eq. (6)</xref> [<xref ref-type="bibr" rid="ref-24">24</xref>]. Deviance is defined as the average of &#x2212;2 log-likelihood for a dataset with n observations. When the highest predicted probability deviates from the correct class, the NLL increases. We used the Err function from the sglOptim R package, version 1.3.8, to get the values of NLL. Receiver operating characteristic (ROC) curves were also utilized to evaluate the proposed classifier. The area under the curve (AUC), which is commonly used for technique comparison in the ROC context, measures a binary classifier&#x2019;s performance. A better classification is indicated by a higher AUC. The &#x201C;one vs. all&#x201D; (OvA) method can be used to extend ROC curves from binary classification to multiclass classification as follows: (1) create a binary classification task for each class in your dataset. This implies that one class will be considered as the positive class and the other classes as the negative classes. (2) Based on the predicted probabilities related to each class, determine the sensitivity and specificity for each binary variable associated with that class for various thresholds. Four predicted probability vectors relating to the SHH-MB subtypes and four binary variables representing each subtype were used in this study. The ROC curve represents the specificity and sensitivity of a classification at a given threshold. Trapezoids are used to compute AUCs [<xref ref-type="bibr" rid="ref-25">25</xref>,<xref ref-type="bibr" rid="ref-26">26</xref>]. 
The roc function from the pROC package, version 1.18.0, was used to get the AUC values for each SHH-MB subtype. Moreover, to plot ROC curves, we utilized the ggplot function from the ggplot2 package, version 3.5.1.</p>
</sec>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Experimental Results</title>
<sec id="s3_1">
<label>3.1</label>
<title>Classification Performance of SVDPMR with Different Resampling Methods</title>
<p>We employed fivefold cross-validation as a way to assess the performance of SVDPMR with different resampling methods in the classification of the four overlapped molecular subtypes within SHH-MB. These resampling methods include the synthetic minority oversampling technique (SMOTE), edited nearest neighbors (ENN), and Tomek links (TL). The aforementioned techniques were applied to increase the minority class (SHH_beta) samples&#x2019; visibility in the area where they overlap with the other classes.</p>
<p>Employing the SMOTE method, we tried to improve the representation of the minority class by creating synthetic samples. The ENN technique was applied to clean the dataset from the noisy samples. Finally, Tomek links were used to find the nearest neighbor pairs of samples from different classes. The technique cleans the dataset and lessens class overlap by eliminating the majority-class instances that make up these links.</p>
<p>After training our classification model with different numbers of SVD-transformed features, we selected the optimal number of features for our classification job, which enhanced the model&#x2019;s performance. The majority of the information in our dataset, which consists of 178 transformed features, is captured by the first two transformed features in each of the five folds, which account for at least 96% of the variance. <xref ref-type="fig" rid="fig-2">Fig. 2</xref> shows a plot of each patient in each training dataset in the first new 2-dimensional feature space obtained via SVD using colors that match their SHH-MB subtype to allow us to investigate the degree of class overlap between the SHH-MB subtypes. The findings from <xref ref-type="fig" rid="fig-2">Fig. 2a</xref>&#x2013;<xref ref-type="fig" rid="fig-2">e</xref> demonstrate that for each of the five training datasets, SHH_delta samples and SHH_alpha samples showed some overlap. In contrast, SHH_beta shows a considerable overlap with SHH_gamma. <xref ref-type="table" rid="table-2">Table 2</xref> summarizes the SVDPMR model&#x2019;s performance through the previously described resampling techniques. It displays the ACC with 95% CI and NLL using fivefold cross-validation. The results presented in <xref ref-type="table" rid="table-2">Table 2</xref> demonstrate that the best accuracy was obtained when SVDPMR with TL models is applied across all cross-validation folds except the third fold, with an average ACC value of 94.6%. The SVDPMR with the ENN method also showed a competitive average ACC of 92.8%, suggesting it could be a viable alternative. On the other hand, the lowest ACC was obtained when SVDPMR with SMOTE was applied across all cross-validation folds except the third fold; an average ACC value of 90.2% was achieved. Moreover, the average NLL of using SVDPMR with TL is 0.308, indicating that the sum of the probabilities that each sample belongs to the incorrect class is around 0.3.</p>
<fig id="fig-2">
<label>Figure 2</label>
<caption>
<title>2D-SVD plot representing the SHH-MB samples in the (<bold>a</bold>) Training Set 1, (<bold>b</bold>) Training Set 2, (<bold>c</bold>) Training Set 3, (<bold>d</bold>) Training Set 4, and (<bold>e</bold>) Training Set 5</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_63880-fig-2.tif"/>
</fig><table-wrap id="table-2">
<label>Table 2</label>
<caption>
<title>ACC (%) with 95% CI and NLL of SVDPMR with different resampling methods</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th align="center">Method</th>
<th align="center">Performance metrics</th>
<th align="center">Fold 1</th>
<th align="center">Fold 2</th>
<th align="center">Fold 3</th>
<th align="center">Fold 4</th>
<th align="center">Fold 5</th>
<th align="center">Average</th>
</tr>
</thead>
<tbody>
<tr>
<td rowspan="2">None</td>
<td>ACC, 95% CI</td>
<td>0.955, (0.845, 0.994)</td>
<td>0.933, (0.817, 0.986)</td>
<td>0.911, (0.788, 0.975)</td>
<td>0.844, (0.705, 0.935)</td>
<td>0.886, (0.754, 0.962)</td>
<td>0.902, (0.782, 0.970)</td>
</tr>
<tr>
<td>NLL</td>
<td>0.23</td>
<td>0.33</td>
<td>0.30</td>
<td>0.37</td>
<td>0.32</td>
<td>0.31</td>
</tr>
<tr>
<td rowspan="2">TL</td>
<td>ACC, 95% CI</td>
<td><bold>0.977</bold>, (0.880, 0.999)</td>
<td>0.956, (0.849, 0.995)</td>
<td>0.933, (0.817, 0.986)</td>
<td>0.933, (0.817, 0.986)</td>
<td>0.932, (0.813, 0.986)</td>
<td>0.946, (0.835, 0.990)</td>
</tr>
<tr>
<td>NLL</td>
<td><bold>0.23</bold></td>
<td>0.35</td>
<td>0.27</td>
<td>0.38</td>
<td>0.31</td>
<td>0.31</td>
</tr>
<tr>
<td rowspan="2">ENN [<xref ref-type="bibr" rid="ref-17">17</xref>]</td>
<td>ACC, 95% CI</td>
<td>0.955, (0.845, 0.994)</td>
<td>0.956, (0.849, 0.995)</td>
<td>0.933, (0.817, 0.986)</td>
<td>0.889, (0.760, 0.963)</td>
<td>0.909, (0.783, 0.975)</td>
<td>0.928, (0.811, 0.983)</td>
</tr>
<tr>
<td>NLL</td>
<td><bold>0.23</bold></td>
<td>0.32</td>
<td>0.28</td>
<td>0.38</td>
<td>0.30</td>
<td>0.30</td>
</tr>
<tr>
<td rowspan="2">SMOTE [<xref ref-type="bibr" rid="ref-27">27</xref>]</td>
<td>ACC, 95% CI</td>
<td>0.955, (0.845, 0.994)</td>
<td>0.911, (0.788, 0.975)</td>
<td>0.956, (0.849, 0.995)</td>
<td>0.822, (0.680, 0.920)</td>
<td>0.864, (0.727, 0.948)</td>
<td>0.902, (0.778, 0.966)</td>
</tr>
<tr>
<td>NLL</td>
<td>0.26</td>
<td>0.35</td>
<td>0.30</td>
<td>0.41</td>
<td>0.33</td>
<td>0.33</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In brief, applying the SVDPMR model with TL (TL-SVDPMR) on the dataset across all folds of cross-validation performed well in maximizing the value of ACC. Moreover, TL-SVDPMR&#x2019;s average NLL is 0.31, which is lower than SVDPMR with SMOTE but comparable to SVDPMR and SVDPMR with ENN approaches.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Classification Performance of TL-SVDPMR for the Subtypes within SHH-MB</title>
<p>To explore the effect of TL-SVDPMR, which can be used to solve class overlap issues, we used confusion matrices as a highly effective analytical procedure. First, we identified TL between SHH-MB subtypes using various feature sets across multiple folds. We specifically looked at SHH_alpha and SHH_delta in one context and SHH_beta and SHH_gamma in another. For training sets 1, 2, 3, 4, and 5, we utilized the first 150, 117, 42, 9, and 29 transformed features by SVD, respectively. To build the trained models by PMR, we used different numbers of features from each training set. We utilized the first 15, 15, 70, 89, and 9 features, respectively, from training sets 1, 2, 3, 4, and 5. Additionally, we set the penalization parameter to 0.05.</p>
<p><xref ref-type="fig" rid="fig-3">Fig. 3</xref> illustrates the confusion matrices resulting from applying the TL-SVDPMR models in each testing fold. As apparent in <xref ref-type="fig" rid="fig-3">Fig. 3a</xref>, the models discovered that one SHH_beta sample had been incorrectly classified as a SHH_alpha sample in the first fold. <xref ref-type="fig" rid="fig-3">Fig. 3b</xref>&#x2019;s findings demonstrate that when the TL-SVDPMR model is used in fold 2, one SHH_gamma patient was falsely classified as a SHH_delta patient, and one SHH_alpha patient was incorrectly classified as a SHH_gamma patient. When the proposed model was applied to fold 3, <xref ref-type="fig" rid="fig-3">Fig. 3c</xref> shows that two SHH_gamma patients and one SHH_delta patient were incorrectly classified as SHH_beta and SHH_alpha, respectively. <xref ref-type="fig" rid="fig-3">Fig. 3d</xref>&#x2019;s findings demonstrate that when the model was applied to fold 4, two SHH_alpha patients were mistakenly classified as SHH_gamma and SHH_delta patients, while one SHH_beta patient was mistakenly classified as a SHH_gamma patient. Lastly, <xref ref-type="fig" rid="fig-3">Fig. 3e</xref> demonstrates that two SHH_alpha patients and one SHH_gamma patient were misclassified as SHH_delta and SHH_alpha, respectively, when the proposed model was applied to fold 5.</p>
<fig id="fig-3">
<label>Figure 3</label>
<caption>
<title>Confusion matrices of TL-SVDPMR based on (<bold>a</bold>) fold 1, (<bold>b</bold>) fold 2, (<bold>c</bold>) fold 3, (<bold>d</bold>) fold 4, and (<bold>e</bold>) fold 5</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_63880-fig-3.tif"/>
</fig>
<p>In short, the TL-SVDPMR approach may be able to successfully handle class overlap problems, but more work is needed to improve accuracy across all SHH-MB subtypes. <xref ref-type="fig" rid="fig-4">Fig. 4</xref> represents the ROC curve of our classifier&#x2019;s performance across around 50 thresholds. Based on the predicted probability for each subtype, it plotted the specificities against the sensitivities across around 50 thresholds. For each subtype, we consider it as the positive class and all combined other subtypes as the negative class. This results in a separate ROC curve for each subtype. From <xref ref-type="fig" rid="fig-4">Fig. 4a</xref>&#x2013;<xref ref-type="fig" rid="fig-4">e</xref>, the results indicate that, on each testing fold, a greater sensitivity value corresponds to a lower specificity value, and <italic>vice versa</italic>. The average area under the curve (AUC) for the subtypes SHH_alpha, SHH_beta, SHH_delta, and SHH_gamma was 0.984, 0.996, 0.997, and 0.975, respectively. This trade-off highlights the balance between correctly identifying positive cases (sensitivity) and minimizing false positives (specificity). The results indicate a strong performance of the proposed classification model across different subtypes of SHH-MB based on the average AUC values.</p>
<fig id="fig-4">
<label>Figure 4</label>
<caption>
<title>Multiclass ROC curves of TL-SVDPMR on (<bold>a</bold>) fold 1, (<bold>b</bold>) fold 2, (<bold>c</bold>) fold 3, (<bold>d</bold>) fold 4, and (<bold>e</bold>) fold 5</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_63880-fig-4a.tif"/>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_63880-fig-4b.tif"/>
</fig>
<p>The findings in <xref ref-type="table" rid="table-3">Table 3</xref> shed light on the best thresholds, along with the associated recall and specificity, for the different SHH-MB subtypes when using the TL-SVDPMR method in a binary classification framework. Each subtype is treated as the positive class in this situation, and the others are considered as negative classes. The sensitivity and specificity of the TL-SVDPMR models at each threshold were calculated using <xref ref-type="disp-formula" rid="eqn-8">Eqs. (8)</xref> and <xref ref-type="disp-formula" rid="eqn-9">(9)</xref>, respectively. Excellent sensitivity was demonstrated by the high recall rates (1) achieved for multiple subtypes across folds, and the specificity values were also generally high.</p>
<table-wrap id="table-3">
<label>Table 3</label>
<caption>
<title>The ideal threshold and classification measures for SHH-MB subtypes using TL-SVDPMR</title>
</caption>
<table>
<colgroup>
<col/>
<col/>
<col/>
<col/>
<col/>
</colgroup>
<thead>
<tr>
<th>Fold</th>
<th>Subtype</th>
<th>Optimal threshold</th>
<th>Recall</th>
<th>Specificity</th>
</tr>
</thead>
<tbody>
<tr>
<td rowspan="4">Fold 1</td>
<td>SHH_alpha</td>
<td>0.32</td>
<td>1</td>
<td>0.968</td>
</tr>
<tr>

<td>SHH_beta</td>
<td>0.26</td>
<td>1</td>
<td>0.919</td>
</tr>
<tr>

<td>SHH_delta</td>
<td>0.52</td>
<td>1</td>
<td>1</td>
</tr>
<tr>

<td>SHH_gamma</td>
<td>0.40</td>
<td>1</td>
<td>1</td>
</tr>
<tr>
<td rowspan="4">Fold 2</td>
<td>SHH_alpha</td>
<td>0.45</td>
<td>0.923</td>
<td>1</td>
</tr>
<tr>

<td>SHH_beta</td>
<td>0.50</td>
<td>1</td>
<td>1</td>
</tr>
<tr>

<td>SHH_delta</td>
<td>0.50</td>
<td>1</td>
<td>0.967</td>
</tr>
<tr>

<td>SHH_gamma</td>
<td>0.45</td>
<td>0.900</td>
<td>0.971</td>
</tr>
<tr>
<td rowspan="4">Fold 3</td>
<td>SHH_alpha</td>
<td>0.31</td>
<td>1</td>
<td>0.938</td>
</tr>
<tr>

<td>SHH_beta</td>
<td>0.76</td>
<td>1</td>
<td>1</td>
</tr>
<tr>

<td>SHH_delta</td>
<td>0.41</td>
<td>0.938</td>
<td>1</td>
</tr>
<tr>

<td>SHH_gamma</td>
<td>0.26</td>
<td>1</td>
<td>1</td>
</tr>
<tr>
<td rowspan="4">Fold 4</td>
<td>SHH_alpha</td>
<td>0.28</td>
<td>0.923</td>
<td>0.938</td>
</tr>
<tr>

<td>SHH_beta</td>
<td>0.33</td>
<td>1</td>
<td>0.974</td>
</tr>
<tr>

<td>SHH_delta</td>
<td>0.40</td>
<td>1</td>
<td>0.967</td>
</tr>
<tr>

<td>SHH_gamma</td>
<td>0.40</td>
<td>1</td>
<td>0.914</td>
</tr>
<tr>
<td rowspan="4">Fold 5</td>
<td>SHH_alpha</td>
<td>0.09</td>
<td>1</td>
<td>0.903</td>
</tr>
<tr>

<td>SHH_beta</td>
<td>0.50</td>
<td>1</td>
<td>1</td>
</tr>
<tr>

<td>SHH_delta</td>
<td>0.57</td>
<td>1</td>
<td>1</td>
</tr>
<tr>

<td>SHH_gamma</td>
<td>0.52</td>
<td>0.889</td>
<td>1</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Discussion</title>
<p>A robust machine for predicting overlapped molecular subtypes in SHH-MB using DNA methylation data is created by using SVD for dimensionality reduction, TL for undersampling the majority subtypes that are located in the overlapping region with the minority subtypes, PMR as a classifier, and Coordinate Descent for optimization. The mean accuracy of about 95% achieved shows the feasibility of this approach, indicating the utility of this strategy for precision medicine and tailored therapies.</p>
<p>Employing DNA methylation data, we compared our proposed method (TL-SVDPMR) with a random forest (RF) algorithm [<xref ref-type="bibr" rid="ref-28">28</xref>] and support vector machine (SVM) [<xref ref-type="bibr" rid="ref-29">29</xref>]. We used SVD to reduce the dimensionality of the features before using RF and SVM techniques. To train effective RF and SVM models, we selected the number of first features derived by SVD based on variance explained and model performance. TL is used to make the decision boundaries for separating the SHH subtypes less blurry by reducing the number of samples in the majority classes. In addition, we chose a numerical optimization method, which can take advantage of minimizing the loss function, expressed as the sum of the probabilities that each sample is assigned to the wrong class. Based on fivefold cross-validation, the results shown in <xref ref-type="table" rid="table-4">Table 4</xref> summarize the performance of the three methods in classifying SHH-MB subtypes using the DNA methylation dataset. The assessment is provided by AUC values, specificity, and sensitivity metrics in a multiclass classification context.</p>
<table-wrap id="table-4">
<label>Table 4</label>
<caption>
<title>Comparison with the State-of-the-Art in classifying the SHH-MB subtypes</title>
</caption>
<table>
<colgroup>
<col/>
<col/>
<col/>
<col/>
<col/>
</colgroup>
<thead>
<tr>
<th rowspan="2">Method</th>
<th rowspan="2">Subtypes within SHH</th>
<th colspan="3">Results</th>
</tr>
<tr>
<th>AUC</th>
<th>Recall</th>
<th>Specificity</th>
</tr>
</thead>
<tbody>
<tr>
<td rowspan="4">Our Proposed Method</td>
<td>SHH_alpha</td>
<td>0.984</td>
<td>0.923<sup>a</sup></td>
<td>0.980<sup>b</sup></td>
</tr>
<tr>

<td>SHH_beta</td>
<td>0.996<sup>a</sup></td>
<td>0.948</td>
<td>0.989</td>
</tr>
<tr>

<td>SHH_delta</td>
<td>0.997<sup>c</sup></td>
<td>0.988<sup>b</sup></td>
<td>0.986<sup>c</sup></td>
</tr>
<tr>

<td>SHH_gamma</td>
<td>0.975</td>
<td>0.913</td>
<td>0.979<sup>a</sup></td>
</tr>
<tr>
<td rowspan="4">RF</td>
<td>SHH_alpha</td>
<td>0.978</td>
<td>0.892</td>
<td>0.967</td>
</tr>
<tr>

<td>SHH_beta</td>
<td>0.989</td>
<td>0.829</td>
<td>0.994</td>
</tr>
<tr>

<td>SHH_delta</td>
<td>0.990</td>
<td>0.975</td>
<td>0.949</td>
</tr>
<tr>

<td>SHH_gamma</td>
<td>0.969</td>
<td>0.891</td>
<td>0.958</td>
</tr>
<tr>
<td rowspan="4">SVM</td>
<td>SHH_alpha</td>
<td>0.983</td>
<td>0.908</td>
<td>0.967</td>
</tr>
<tr>

<td>SHH_beta</td>
<td>0.993</td>
<td>0.914</td>
<td>0.983</td>
</tr>
<tr>

<td>SHH_delta</td>
<td>0.993</td>
<td>0.948</td>
<td>0.957</td>
</tr>
<tr>

<td>SHH_gamma</td>
<td>0.986</td>
<td>0.891</td>
<td>0.975</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="table-4fn1" fn-type="other">
<p>Note: <sup>a</sup>statistical significance using a paired <italic>t</italic>-test over RF with alpha &#x003D; 0.08. <sup>b</sup>statistical significance using a paired <italic>t</italic>-test over SVM with alpha &#x003D; 0.08. <sup>c</sup>statistical significance using a paired <italic>t</italic>-test over both RF and SVM with alpha &#x003D; 0.08.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>Among the SHH-MB subtypes, SHH_delta has the highest average AUC value (0.997), indicating our proposed method&#x2019;s exceptional classification ability. Next in line is SHH_beta at 0.996. For RF and SVM, the AUC values were competitive, ranging from 0.969 to 0.990 and from 0.983 to 0.993, respectively. However, our method consistently outperformed both.</p>
<p>The recall metrics show the high sensitivity of our method, particularly for SHH_delta (0.988) and SHH_beta (0.948). These figures suggest that positive cases within these subtypes can be identified with a high degree of accuracy. Both RF and SVM showed lower recall values, with RF reaching a maximum of 0.975 for SHH_delta and SVM reaching 0.948 for SHH_delta. With high specificity, our method demonstrated good negative case classification for all subtypes, particularly SHH_beta (0.989). In conclusion, our proposed method performs significantly better in the classification of SHH-MB subtypes than RF and SVM. The statistical significance of these results indicates the robustness of the method, making it a valuable tool for doctors to correctly diagnose and treat SHH-MB subtypes. Applying TL-SVDPMR has several benefits. First, utilizing SVD to reduce feature dimensions enhances computational efficiency and effectively handles high-dimensional data. Second, TL enhances the clarity of decision boundaries, making it a valuable technique for tackling class overlap and class imbalance in classification tasks. Furthermore, the method obtains the probabilities of a new sample belonging to each of the four SHH subtypes, adding a layer of interpretability to the classification process.</p>
<p>While our study provides significant advancements in the classification of SHH-MB subtypes, several limitations should be acknowledged. Because of the small sample size of some SHH-MB subtypes, performance metrics can differ significantly between folds. This variability could lead to inconsistent results and interpretations. Although we compared our approach to RF and SVM, many more machine-learning algorithms and approaches could be explored. A broader comparison could provide a more comprehensive understanding of the model&#x2019;s performance compared to the state of the art. Finally, external datasets have not yet been used to test the model&#x2019;s robustness. Confirming the model&#x2019;s efficacy and robustness in clinical practice requires validation on independent datasets.</p>
</sec>
<sec id="s5">
<label>5</label>
<title>Conclusions</title>
<p>Our study demonstrates the utility of TL-SVDPMR as a method for developing precision medicine approaches toward accurate categorization of the molecular subtypes within SHH-MB based on DNA methylation data. Effectively mitigating class overlap, class imbalance, and high feature dimensionality issues, this framework improves the discriminative nature of decision boundaries, which in turn allows for more accurate classification. The combination of SVD for dimensionality reduction, TL for undersampling the majority classes, group lasso for feature selection, and MR for the classification task has resulted in an effective classification framework for SHH-MB subtypes. Using the fivefold cross-validation technique, our proposed method produced an average overall accuracy of about 95%. Converting the data into a lower-dimensional space while keeping important features, SVD reduced computational complexity. This also leads to a better relevance of information in DNA methylation data. TL improves the intelligibility of decision boundaries by resulting in a more balanced dataset, which is essential to efficiently address class overlap and class imbalance in the classification problem. We automatically selected informative features, and then used them as regressors in our classification model, by applying a group lasso that forces sparsity at the group level (a subtype of SHH-MB). Coordinate descent is used to solve our loss function associated with the group lasso, which promotes sparsity.</p>
<p>Using our proposed approach, the classification model showed excellent performance on the different overlapping SHH-MB subtypes across the various performance metrics.</p>
</sec>
</body>
<back>
<ack>
<p>The authors acknowledge the Deanship of Graduate Studies and Scientific Research at Jouf University.</p>
</ack>
<sec>
<title>Funding Statement</title>
<p>This work was funded by the Deanship of Graduate Studies and Scientific Research at Jouf University under grant No. (DGSSR-2024-02-01137).</p>
</sec>
<sec>
<title>Author Contributions</title>
<p>Isra Mohammed: Conceptualization, Methodology, Formal Analysis, Software, Writing&#x2014;Original Draft. Mohamed Elhafiz M. Musa: Software. Murtada K. Elbashir: Conceptualization, Supervision, Writing&#x2014;Reviewing and Editing. Ayman Mohamed Mostafa: Reviewing. Amin Ibrahim Adam: Supervision. Areeg S. Faggad: Supervision, Writing&#x2014;Reviewing and Editing. Mahmood A. Mahmood: Validation. All authors reviewed the results and approved the final version of the manuscript.</p>
</sec>
<sec sec-type="data-availability">
<title>Availability of Data and Materials</title>
<p>The experimental data and the simulation results that support the findings of this study are available at the following website: <ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE85212">https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE85212</ext-link> (accessed on 28 April 2025).</p>
</sec>
<sec>
<title>Ethics Approval</title>
<p>Not applicable.</p>
</sec>
<sec sec-type="COI-statement">
<title>Conflicts of Interest</title>
<p>The authors declare no conflicts of interest to report regarding the present study.</p>
</sec>
<ref-list content-type="authoryear">
<title>References</title>
<ref id="ref-1"><label>[1]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Northcott</surname> <given-names>PA</given-names></string-name>, <string-name><surname>Robinson</surname> <given-names>GW</given-names></string-name>, <string-name><surname>Kratz</surname> <given-names>CP</given-names></string-name>, <string-name><surname>Mabbott</surname> <given-names>DJ</given-names></string-name>, <string-name><surname>Pomeroy</surname> <given-names>SL</given-names></string-name>, <string-name><surname>Clifford</surname> <given-names>SC</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>Medulloblastoma</article-title>. <source>Nat Rev Dis Primers</source>. <year>2019</year>;<volume>5</volume>:<fpage>11</fpage>; <pub-id pub-id-type="pmid">30765705</pub-id></mixed-citation></ref>
<ref id="ref-2"><label>[2]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Korshunov</surname> <given-names>A</given-names></string-name>, <string-name><surname>Okonechnikov</surname> <given-names>K</given-names></string-name>, <string-name><surname>Stichel</surname> <given-names>D</given-names></string-name>, <string-name><surname>Ryzhova</surname> <given-names>M</given-names></string-name>, <string-name><surname>Schrimpf</surname> <given-names>D</given-names></string-name>, <string-name><surname>Sahm</surname> <given-names>F</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>Integrated molecular analysis of adult sonic hedgehog (SHH)-activated medulloblastomas reveals two clinically relevant tumor subsets with VEGFA as potent prognostic indicator</article-title>. <source>Neuro-Oncol</source>. <year>2021</year>;<volume>23</volume>(<issue>9</issue>):<fpage>1576</fpage>&#x2013;<lpage>85</lpage>. doi:<pub-id pub-id-type="doi">10.1093/neuonc/noab031</pub-id>; <pub-id pub-id-type="pmid">33589929</pub-id></mixed-citation></ref>
<ref id="ref-3"><label>[3]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Cavalli</surname> <given-names>FMG</given-names></string-name>, <string-name><surname>Remke</surname> <given-names>M</given-names></string-name>, <string-name><surname>Rampasek</surname> <given-names>L</given-names></string-name>, <string-name><surname>Peacock</surname> <given-names>J</given-names></string-name>, <string-name><surname>Shih</surname> <given-names>DJH</given-names></string-name>, <string-name><surname>Luu</surname> <given-names>B</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>Intertumoral heterogeneity within medulloblastoma subgroups</article-title>. <source>Cancer Cell</source>. <year>2017</year>;<volume>31</volume>:<fpage>737</fpage>&#x2013;<lpage>54</lpage>. doi:<pub-id pub-id-type="doi">10.1016/j.ccell.2017.05.005</pub-id>; <pub-id pub-id-type="pmid">28609654</pub-id></mixed-citation></ref>
<ref id="ref-4"><label>[4]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Garcia-Lopez</surname> <given-names>J</given-names></string-name>, <string-name><surname>Kumar</surname> <given-names>R</given-names></string-name>, <string-name><surname>Smith</surname> <given-names>KS</given-names></string-name>, <string-name><surname>Northcott</surname> <given-names>PA</given-names></string-name></person-group>. <article-title>Deconstructing sonic hedgehog medulloblastoma: molecular subtypes, drivers, and beyond</article-title>. <source>Trends Genet</source>. <year>2021</year>;<volume>37</volume>(<issue>3</issue>):<fpage>235</fpage>&#x2013;<lpage>50</lpage>. doi:<pub-id pub-id-type="doi">10.1016/j.tig.2020.11.001</pub-id>; <pub-id pub-id-type="pmid">33272592</pub-id></mixed-citation></ref>
<ref id="ref-5"><label>[5]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Gold</surname> <given-names>MP</given-names></string-name>, <string-name><surname>Ong</surname> <given-names>W</given-names></string-name>, <string-name><surname>Masteller</surname> <given-names>AM</given-names></string-name>, <string-name><surname>Ghasemi</surname> <given-names>DR</given-names></string-name>, <string-name><surname>Galindo</surname> <given-names>JA</given-names></string-name>, <string-name><surname>Park</surname> <given-names>NR</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>Developmental basis of SHH medulloblastoma heterogeneity</article-title>. <source>Nat Commun</source>. <year>2024</year>;<volume>15</volume>(<issue>1</issue>):<fpage>270</fpage>. doi:<pub-id pub-id-type="doi">10.1038/s41467-023-44300-0</pub-id>; <pub-id pub-id-type="pmid">38191555</pub-id></mixed-citation></ref>
<ref id="ref-6"><label>[6]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Skowron</surname> <given-names>P</given-names></string-name>, <string-name><surname>Farooq</surname> <given-names>H</given-names></string-name>, <string-name><surname>Cavalli</surname> <given-names>FMG</given-names></string-name>, <string-name><surname>Morrissy</surname> <given-names>AS</given-names></string-name>, <string-name><surname>Ly</surname> <given-names>M</given-names></string-name>, <string-name><surname>Hendrikse</surname> <given-names>LD</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>The transcriptional landscape of Shh medulloblastoma</article-title>. <source>Nat Commun</source>. <year>2021</year>;<volume>12</volume>:<fpage>1749</fpage>. doi:<pub-id pub-id-type="doi">10.1038/s41467-021-21883-0</pub-id>; <pub-id pub-id-type="pmid">33741928</pub-id></mixed-citation></ref>
<ref id="ref-7"><label>[7]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Prati</surname> <given-names>RC</given-names></string-name>, <string-name><surname>Batista</surname> <given-names>GEAPA</given-names></string-name>, <string-name><surname>Monard</surname> <given-names>MC</given-names></string-name></person-group>. <article-title>Class imbalances versus class overlapping: an analysis of a learning system behavior</article-title>. In: <conf-name>Proceedings of the MICAI 2004: Advances in Artificial Intelligence</conf-name>; <year>2004 Apr 26&#x2013;30</year>; <publisher-loc>Mexico City, Mexico</publisher-loc>.</mixed-citation></ref>
<ref id="ref-8"><label>[8]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Santos</surname> <given-names>MS</given-names></string-name>, <string-name><surname>Abreu</surname> <given-names>PH</given-names></string-name>, <string-name><surname>Japkowicz</surname> <given-names>N</given-names></string-name>, <string-name><surname>Fern&#x00E1;ndez</surname> <given-names>A</given-names></string-name>, <string-name><surname>Santos</surname> <given-names>J</given-names></string-name></person-group>. <article-title>A unifying view of class overlap and imbalance: key concepts, multi-view panorama, and open avenues for research</article-title>. <source>Inf Fusion</source>. <year>2023</year>;<volume>89</volume>(<issue>2</issue>):<fpage>228</fpage>&#x2013;<lpage>53</lpage>. doi:<pub-id pub-id-type="doi">10.1016/j.inffus.2022.08.017</pub-id>.</mixed-citation></ref>
<ref id="ref-9"><label>[9]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Mahmood</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Safran</surname> <given-names>M</given-names></string-name>, <collab>Abdussamad</collab>, <string-name><surname>Alfarhood</surname> <given-names>S</given-names></string-name>, <string-name><surname>Ashraf</surname> <given-names>I</given-names></string-name></person-group>. <article-title>Algorithmic and mathematical modeling for synthetically controlled overlapping</article-title>. <source>Sci Rep</source>. <year>2025</year>;<volume>15</volume>(<issue>1</issue>):<fpage>7517</fpage>. doi:<pub-id pub-id-type="doi">10.1038/s41598-025-87992-8</pub-id>; <pub-id pub-id-type="pmid">40032834</pub-id></mixed-citation></ref>
<ref id="ref-10"><label>[10]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Vuttipittayamongkol</surname> <given-names>P</given-names></string-name>, <string-name><surname>Elyan</surname> <given-names>E</given-names></string-name></person-group>. <article-title>Neighbourhood-based undersampling approach for handling imbalanced and overlapped data</article-title>. <source>Inf Sci</source>. <year>2020</year>;<volume>509</volume>(<issue>2</issue>):<fpage>47</fpage>&#x2013;<lpage>70</lpage>. doi:<pub-id pub-id-type="doi">10.1016/j.ins.2019.08.062</pub-id>.</mixed-citation></ref>
<ref id="ref-11"><label>[11]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Wang</surname> <given-names>X</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>R</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>Z</given-names></string-name></person-group>. <article-title>A novel hybrid sampling method ESMOTE &#x002B; SSLM for handling the problem of class imbalance with overlap in financial distress detection</article-title>. <source>Neural Process Lett</source>. <year>2022</year>;<volume>55</volume>(<issue>3</issue>):<fpage>3081</fpage>&#x2013;<lpage>105</lpage>. doi:<pub-id pub-id-type="doi">10.1007/s11063-022-10998-0</pub-id>.</mixed-citation></ref>
<ref id="ref-12"><label>[12]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Zhang</surname> <given-names>R</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>D</given-names></string-name></person-group>. <article-title>RFCL: a new under-sampling method of reducing the degree of imbalance and overlap</article-title>. <source>Pattern Anal Appl</source>. <year>2021</year>;<volume>24</volume>(<issue>2</issue>):<fpage>641</fpage>&#x2013;<lpage>54</lpage>. doi:<pub-id pub-id-type="doi">10.1007/s10044-020-00929-x</pub-id>.</mixed-citation></ref>
<ref id="ref-13"><label>[13]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Kumar</surname> <given-names>A</given-names></string-name>, <string-name><surname>Singh</surname> <given-names>D</given-names></string-name>, <string-name><surname>Yadav</surname> <given-names>RS</given-names></string-name></person-group>. <article-title>Entropy and improved k-nearest neighbor search based under-sampling (ENU) method to handle class overlap in imbalanced datasets</article-title>. <source>Concurr Comput Pract Exp</source>. <year>2024</year>;<volume>36</volume>(<issue>2</issue>):<fpage>e7894</fpage>. doi:<pub-id pub-id-type="doi">10.1002/cpe.7894</pub-id>.</mixed-citation></ref>
<ref id="ref-14"><label>[14]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Mohammed</surname> <given-names>I</given-names></string-name>, <string-name><surname>Elbashir</surname> <given-names>MK</given-names></string-name>, <string-name><surname>Faggad</surname> <given-names>AS</given-names></string-name></person-group>. <article-title>Singular value decomposition-based penalized multinomial regression for classifying imbalanced medulloblastoma subgroups using methylation data</article-title>. <source>J Comput Biol</source>. <year>2024</year>;<volume>31</volume>(<issue>5</issue>):<fpage>458</fpage>&#x2013;<lpage>71</lpage>. doi:<pub-id pub-id-type="doi">10.1089/cmb.2023.0198</pub-id>; <pub-id pub-id-type="pmid">38752890</pub-id></mixed-citation></ref>
<ref id="ref-15"><label>[15]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Guzm&#x00E1;n-Ponce</surname> <given-names>A</given-names></string-name>, <string-name><surname>Valdovinos</surname> <given-names>RM</given-names></string-name>, <string-name><surname>S&#x00E1;nchez</surname> <given-names>JS</given-names></string-name>, <string-name><surname>Marcial-Romero</surname> <given-names>JR</given-names></string-name></person-group>. <article-title>A new under-sampling method to face class overlap and imbalance</article-title>. <source>Appl Sci</source>. <year>2020</year>;<volume>10</volume>(<issue>15</issue>):<fpage>5164</fpage>. doi:<pub-id pub-id-type="doi">10.3390/app10155164</pub-id>.</mixed-citation></ref>
<ref id="ref-16"><label>[16]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Wall</surname> <given-names>ME</given-names></string-name>, <string-name><surname>Rechtsteiner</surname> <given-names>A</given-names></string-name>, <string-name><surname>Rocha</surname> <given-names>LM</given-names></string-name></person-group>. <chapter-title>Singular value decomposition and principal component analysis</chapter-title>. In: <person-group person-group-type="editor"><string-name><surname>Berrar</surname> <given-names>DP</given-names></string-name>, <string-name><surname>Dubitzky</surname> <given-names>W</given-names></string-name>, <string-name><surname>Granzow</surname> <given-names>M</given-names></string-name></person-group>, editors. <source>A practical approach to microarray data analysis</source>. <publisher-loc>Berlin/Heidelberg, Germany</publisher-loc>: <publisher-name>Springer</publisher-name>; <year>2003</year>. p. <fpage>91</fpage>&#x2013;<lpage>109</lpage>. doi:<pub-id pub-id-type="doi">10.1007/0-306-47815-3_5</pub-id>.</mixed-citation></ref>
<ref id="ref-17"><label>[17]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Batista</surname> <given-names>GEAPA</given-names></string-name>, <string-name><surname>Prati</surname> <given-names>RC</given-names></string-name>, <string-name><surname>Monard</surname> <given-names>MC</given-names></string-name></person-group>. <article-title>A study of the behavior of several methods for balancing machine learning training data</article-title>. <source>ACM SIGKDD Explor Newsl</source>. <year>2004</year>;<volume>6</volume>(<issue>1</issue>):<fpage>20</fpage>&#x2013;<lpage>9</lpage>. doi:<pub-id pub-id-type="doi">10.1145/1007730.1007735</pub-id>.</mixed-citation></ref>
<ref id="ref-18"><label>[18]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Elhassan</surname> <given-names>AT</given-names></string-name>, <string-name><surname>Aljourf</surname> <given-names>M</given-names></string-name>, <string-name><surname>Al-Mohanna</surname> <given-names>F</given-names></string-name>, <string-name><surname>Shoukri</surname> <given-names>M</given-names></string-name></person-group>. <article-title>Classification of imbalance data using Tomek Link (T-Link) combined with random under-sampling (RUS) as a data reduction method</article-title>. <source>Glob J Technol Optim</source>. <year>2016</year>;<volume>1</volume>:<fpage>1</fpage>&#x2013;<lpage>11</lpage>. doi:<pub-id pub-id-type="doi">10.4172/2229-8711.S1111</pub-id>.</mixed-citation></ref>
<ref id="ref-19"><label>[19]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Meier</surname> <given-names>L</given-names></string-name>, <string-name><surname>Van De Geer</surname> <given-names>S</given-names></string-name>, <string-name><surname>B&#x00FC;hlmann</surname> <given-names>P</given-names></string-name></person-group>. <article-title>The group lasso for logistic regression</article-title>. <source>J R Stat Soc Ser B Stat Methodol</source>. <year>2008</year>;<volume>70</volume>(<issue>1</issue>):<fpage>53</fpage>&#x2013;<lpage>71</lpage>. doi:<pub-id pub-id-type="doi">10.1111/j.1467-9868.2007.00627.x</pub-id>.</mixed-citation></ref>
<ref id="ref-20"><label>[20]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Vincent</surname> <given-names>M</given-names></string-name>, <string-name><surname>Perell</surname> <given-names>K</given-names></string-name>, <string-name><surname>Nielsen</surname> <given-names>FC</given-names></string-name>, <string-name><surname>Daugaard</surname> <given-names>G</given-names></string-name>, <string-name><surname>Hansen</surname> <given-names>NR</given-names></string-name></person-group>. <article-title>Modeling tissue contamination to improve molecular identification of the primary tumor site of metastases</article-title>. <source>Bioinformatics</source>. <year>2014</year>;<volume>30</volume>(<issue>10</issue>):<fpage>1417</fpage>&#x2013;<lpage>23</lpage>. doi:<pub-id pub-id-type="doi">10.1093/bioinformatics/btu044</pub-id>; <pub-id pub-id-type="pmid">24463184</pub-id></mixed-citation></ref>
<ref id="ref-21"><label>[21]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Vincent</surname> <given-names>M</given-names></string-name>, <string-name><surname>Hansen</surname> <given-names>NR</given-names></string-name></person-group>. <article-title>Sparse group lasso and high dimensional multinomial classification</article-title>. <source>Comput Stat Data Anal</source>. <year>2014</year>;<volume>71</volume>(<issue>4</issue>):<fpage>771</fpage>&#x2013;<lpage>86</lpage>. doi:<pub-id pub-id-type="doi">10.1016/j.csda.2013.06.004</pub-id>.</mixed-citation></ref>
<ref id="ref-22"><label>[22]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Tharwat</surname> <given-names>A</given-names></string-name></person-group>. <article-title>Classification assessment methods</article-title>. <source>Appl Comput Inform</source>. <year>2018</year>;<volume>17</volume>:<fpage>168</fpage>&#x2013;<lpage>92</lpage>.</mixed-citation></ref>
<ref id="ref-23"><label>[23]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Franco</surname> <given-names>C</given-names></string-name>, <string-name><surname>Little</surname> <given-names>RJA</given-names></string-name>, <string-name><surname>Louis</surname> <given-names>TA</given-names></string-name>, <string-name><surname>Slud</surname> <given-names>EV</given-names></string-name></person-group>. <article-title>Comparative study of confidence intervals for proportions in complex sample surveys</article-title>. <source>J Surv Stat Methodol</source>. <year>2019</year>;<volume>7</volume>(<issue>3</issue>):<fpage>334</fpage>&#x2013;<lpage>64</lpage>. doi:<pub-id pub-id-type="doi">10.1093/jssam/smy019</pub-id>; <pub-id pub-id-type="pmid">31428658</pub-id></mixed-citation></ref>
<ref id="ref-24"><label>[24]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Hastie</surname> <given-names>T</given-names></string-name>, <string-name><surname>Tibshirani</surname> <given-names>R</given-names></string-name>, <string-name><surname>Friedman</surname> <given-names>J</given-names></string-name></person-group>. <source>The elements of statistical learning: data mining, inference, and prediction</source>. <edition>2nd ed</edition>. <publisher-loc>Berlin/Heidelberg, Germany</publisher-loc>: <publisher-name>Springer</publisher-name>; <year>2009</year>. <fpage>745</fpage> p. doi:<pub-id pub-id-type="doi">10.1007/978-0-387-84858-7</pub-id>.</mixed-citation></ref>
<ref id="ref-25"><label>[25]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Fawcett</surname> <given-names>T</given-names></string-name></person-group>. <article-title>An introduction to ROC analysis</article-title>. <source>Pattern Recognit Lett</source>. <year>2006</year>;<volume>27</volume>:<fpage>861</fpage>&#x2013;<lpage>74</lpage>. doi:<pub-id pub-id-type="doi">10.1016/j.patrec.2005.10.010</pub-id>.</mixed-citation></ref>
<ref id="ref-26"><label>[26]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Robin</surname> <given-names>X</given-names></string-name>, <string-name><surname>Turck</surname> <given-names>N</given-names></string-name>, <string-name><surname>Hainard</surname> <given-names>A</given-names></string-name>, <string-name><surname>Tiberti</surname> <given-names>N</given-names></string-name>, <string-name><surname>Lisacek</surname> <given-names>F</given-names></string-name>, <string-name><surname>Sanchez</surname> <given-names>JC</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>pROC: an open-source package for R and S&#x002B; to analyze and compare ROC curves</article-title>. <source>BMC Bioinform</source>. <year>2011</year>;<volume>12</volume>(<issue>1</issue>):<fpage>1</fpage>&#x2013;<lpage>8</lpage>. doi:<pub-id pub-id-type="doi">10.1186/1471-2105-12-77</pub-id>; <pub-id pub-id-type="pmid">21414208</pub-id></mixed-citation></ref>
<ref id="ref-27"><label>[27]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Chawla</surname> <given-names>NV</given-names></string-name>, <string-name><surname>Bowyer</surname> <given-names>KW</given-names></string-name>, <string-name><surname>Hall</surname> <given-names>LO</given-names></string-name>, <string-name><surname>Kegelmeyer</surname> <given-names>WP</given-names></string-name></person-group>. <article-title>SMOTE: synthetic minority over-sampling technique</article-title>. <source>J Artif Intell Res</source>. <year>2002</year>;<volume>16</volume>:<fpage>321</fpage>&#x2013;<lpage>57</lpage>. doi:<pub-id pub-id-type="doi">10.1613/jair.953</pub-id>.</mixed-citation></ref>
<ref id="ref-28"><label>[28]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Capper</surname> <given-names>D</given-names></string-name>, <string-name><surname>Jones</surname> <given-names>DTW</given-names></string-name>, <string-name><surname>Sill</surname> <given-names>M</given-names></string-name>, <string-name><surname>Hovestadt</surname> <given-names>V</given-names></string-name>, <string-name><surname>Schrimpf</surname> <given-names>D</given-names></string-name>, <string-name><surname>Sturm</surname> <given-names>D</given-names></string-name>, <etal>et al.</etal></person-group> <article-title>DNA methylation-based classification of central nervous system tumours</article-title>. <source>Nature</source>. <year>2018</year>;<volume>555</volume>(<issue>7697</issue>):<fpage>469</fpage>&#x2013;<lpage>74</lpage>. doi:<pub-id pub-id-type="doi">10.1038/nature26000</pub-id>; <pub-id pub-id-type="pmid">29539639</pub-id></mixed-citation></ref>
<ref id="ref-29"><label>[29]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Jiang</surname> <given-names>J</given-names></string-name>, <string-name><surname>Li</surname> <given-names>L</given-names></string-name>, <string-name><surname>Yin</surname> <given-names>G</given-names></string-name>, <string-name><surname>Luo</surname> <given-names>H</given-names></string-name>, <string-name><surname>Li</surname> <given-names>J</given-names></string-name></person-group>. <article-title>A molecular typing method for invasive breast cancer by serum Raman spectroscopy</article-title>. <source>Clin Breast Cancer</source>. <year>2024</year>;<volume>24</volume>(<issue>4</issue>):<fpage>376</fpage>&#x2013;<lpage>83</lpage>. doi:<pub-id pub-id-type="doi">10.1016/j.clbc.2024.02.008</pub-id>; <pub-id pub-id-type="pmid">38492997</pub-id></mixed-citation></ref>
</ref-list>
</back></article>






