<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1 20151215//EN" "http://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xml:lang="en" article-type="research-article" dtd-version="1.1">
<front>
<journal-meta>
<journal-id journal-id-type="pmc">CMC</journal-id>
<journal-id journal-id-type="nlm-ta">CMC</journal-id>
<journal-id journal-id-type="publisher-id">CMC</journal-id>
<journal-title-group>
<journal-title>Computers, Materials &#x0026; Continua</journal-title>
</journal-title-group>
<issn pub-type="epub">1546-2226</issn>
<issn pub-type="ppub">1546-2218</issn>
<publisher>
<publisher-name>Tech Science Press</publisher-name>
<publisher-loc>USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">74141</article-id>
<article-id pub-id-type="doi">10.32604/cmc.2025.074141</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Article</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Effective Token Masking Augmentation Using Term-Document Frequency for Language Model-Based Legal Case Classification</article-title>
<alt-title alt-title-type="left-running-head">Effective Token Masking Augmentation Using Term-Document Frequency For Language Model-Based Legal Case Classification</alt-title>
<alt-title alt-title-type="right-running-head">Effective Token Masking Augmentation Using Term-Document Frequency For Language Model-Based Legal Case Classification</alt-title>
</title-group>
<contrib-group>
<contrib id="author-1" contrib-type="author">
<contrib-id contrib-id-type="orcid">https://orcid.org/0009-0009-8428-2239</contrib-id>
<name name-style="western"><surname>Park</surname><given-names>Ye-Chan</given-names></name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
<contrib id="author-2" contrib-type="author">
<contrib-id contrib-id-type="orcid">https://orcid.org/0000-0002-4010-3990</contrib-id>
<name name-style="western"><surname>Zulkifley</surname><given-names>Mohd Asyraf</given-names></name><xref ref-type="aff" rid="aff-2">2</xref></contrib>
<contrib id="author-3" contrib-type="author">
<contrib-id contrib-id-type="orcid">https://orcid.org/0000-0003-4656-5659</contrib-id>
<name name-style="western"><surname>Sohn</surname><given-names>Bong-Soo</given-names></name><xref ref-type="aff" rid="aff-3">3</xref></contrib>
<contrib id="author-4" contrib-type="author" corresp="yes">
<contrib-id contrib-id-type="orcid">https://orcid.org/0000-0002-3757-3510</contrib-id>
<name name-style="western"><surname>Lee</surname><given-names>Jaesung</given-names></name><xref ref-type="aff" rid="aff-4">4</xref><email>curseor@cau.ac.kr</email></contrib>
<aff id="aff-1"><label>1</label><institution>Department of Artificial Intelligence, Chung-Ang University</institution>, <addr-line>Seoul, 06974</addr-line>, <country>Republic of Korea</country></aff>
<aff id="aff-2"><label>2</label><institution>Department of Electrical, Electronic and Systems Engineering, Universiti Kebangsaan Malaysia</institution>, <addr-line>Bangi, 43600</addr-line>, <country>Malaysia</country></aff>
<aff id="aff-3"><label>3</label><institution>School of Computer Science and Engineering, Chung-Ang University</institution>, <addr-line>84 Heukseok-ro, Dongjak-gu, Seoul, 06974</addr-line>, <country>Republic of Korea</country></aff>
<aff id="aff-4"><label>4</label><institution>AI/ML Innovation Research Center, Chung-Ang University</institution>, <addr-line>84 Heukseok-ro, Dongjak-gu, Seoul, 06974</addr-line>, <country>Republic of Korea</country></aff>
</contrib-group>
<author-notes>
<corresp id="cor1"><label>&#x002A;</label>Corresponding Author: Jaesung Lee. Email: <email>curseor@cau.ac.kr</email></corresp>
</author-notes>
<pub-date date-type="collection" publication-format="electronic">
<year>2026</year>
</pub-date>
<pub-date date-type="pub" publication-format="electronic">
<day>10</day><month>2</month><year>2026</year>
</pub-date>
<volume>87</volume>
<issue>1</issue>
<elocation-id>36</elocation-id>
<history>
<date date-type="received">
<day>03</day>
<month>10</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>20</day>
<month>11</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>&#x00A9; 2026 The Authors.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Published by Tech Science Press.</copyright-holder>
<license xlink:href="https://creativecommons.org/licenses/by/4.0/">
<license-p>This work is licensed under a <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="TSP_CMC_74141.pdf"></self-uri>
<abstract>
<p>Legal case classification involves the categorization of legal documents into predefined categories, which facilitates legal information retrieval and case management. However, real-world legal datasets often suffer from class imbalances due to the uneven distribution of case types across legal domains. This leads to biased model performance, in the form of high accuracy for overrepresented categories and underperformance for minority classes. To address this issue, in this study, we propose a data augmentation method that masks unimportant terms within a document selectively while preserving key terms from the perspective of the legal domain. This approach enhances data diversity and improves the generalization capability of conventional models. Our experiments demonstrate consistent improvements achieved by the proposed augmentation strategy in terms of accuracy and F1 score across all models, validating the effectiveness of the proposed method in legal case classification.</p>
</abstract>
<kwd-group kwd-group-type="author">
<kwd>Legal case classification</kwd>
<kwd>class imbalance</kwd>
<kwd>data augmentation</kwd>
<kwd>token masking</kwd>
<kwd>legal NLP</kwd>
</kwd-group>
<funding-group>
<award-group id="awg1">
<funding-source>Institute of Information &#x0026; Communications Technology Planning &#x0026; Evaluation (IITP)</funding-source>
<award-id>RS-2021-II211341</award-id>
</award-group>
<award-group id="awg2">
<funding-source>Artificial Intelligence Graduate School Program</funding-source>
</award-group>
<award-group id="awg3">
<funding-source>Chung-Ang University Graduate Research Scholarship in 2024</funding-source>
</award-group>
</funding-group>
</article-meta>
</front>
<body>
<sec id="s1">
<label>1</label>
<title>Introduction</title>
<p>Legal case classification is a key task in legal natural language processing (NLP), which aims to organize judicial documents in terms of their factual content and underlying legal principles. This supports legal information retrieval, decision making, and analysis by improving the precision of access to relevant precedents and facilitating a systematic understanding of legal trends [<xref ref-type="bibr" rid="ref-1">1</xref>&#x2013;<xref ref-type="bibr" rid="ref-3">3</xref>]. In practice, certain types of cases, such as contract disputes and criminal theft, are more frequent because of their prevalence in society. In contrast, other types, such as antitrust and intellectual property cases, are relatively rare. This class imbalance leads to biased model predictions, where models tend to favor majority classes and perform poorly for underrepresented categories, reducing their generalizability [<xref ref-type="bibr" rid="ref-4">4</xref>].</p>
<p>Data augmentation is a well-known strategy for improving model performance. Masking-based augmentation methods, which remove unimportant terms selectively while retaining key legal expressions, have garnered significant attention over recent years [<xref ref-type="bibr" rid="ref-5">5</xref>]. In such methods, the masking algorithm determines the candidate terms to be masked based on statistical salience, such as term frequency (TF) and inverse document frequency (IDF), which are popular concepts in text analysis. Notable methods include TF-IDF-based masking [<xref ref-type="bibr" rid="ref-6">6</xref>], which improves learning efficiency by removing less significant terms but may inadvertently eliminate legally critical ones; difference masking [<xref ref-type="bibr" rid="ref-7">7</xref>], which refines term selection; and iterative mask filling (IMF) [<xref ref-type="bibr" rid="ref-8">8</xref>], which generates augmented documents using masked language models.</p>
<p>In legal documents, different sets of essential terms for case classification appear in specific cases. For example, &#x201C;revocation&#x201D; and &#x201C;rescission&#x201D; are commonly used in civil or administrative cases, yet they are typically absent from criminal law because they pertain to the legal validity of contracts, registrations, or administrative actions, which are central to civil proceedings but irrelevant in the context of criminal offenses. Importantly, these terms inherently exhibit low TF values and, consequently, low TF-IDF values, as they usually appear only once in the corresponding document. Thus, existing masking methods are likely to mask these essential terms, degrading case classification performance [<xref ref-type="bibr" rid="ref-9">9</xref>,<xref ref-type="bibr" rid="ref-10">10</xref>].</p>
<p>To address this issue, we propose a new masking method that selectively masks unimportant terms while preserving key legal expressions. Specifically, to protect essential legal terms that occur infrequently in the corresponding documents (i.e., with low TF values), the proposed algorithm uses term frequency-document frequency (TF-DF) instead of TF-IDF to assign masking likelihood to terms. The main contributions of this study are as follows:
<list list-type="bullet">
<list-item>
<p>We propose a TF-DF-based augmentation method tailored for legal text classification.</p></list-item>
<list-item>
<p>We provide a comprehensive analysis of why TF-IDF-based masking fails in legal domains.</p></list-item>
<list-item>
<p>We conduct an in-depth analysis of the actual masked outputs based on legal domain knowledge, offering practical insights for legal practitioners.</p></list-item>
</list></p>
</sec>
<sec id="s2">
<label>2</label>
<title>Related Work</title>
<p>Legal text classification is a fundamental task in legal NLP that facilitates information retrieval, case analysis, and judicial decision support. Early studies primarily relied on general NLP models without domain adaptation, which limited their ability to capture the nuances of legal terminology and reasoning. Foundational work such as Katz [<xref ref-type="bibr" rid="ref-11">11</xref>] highlighted the importance of quantitative approaches for legal prediction, underscoring the need for domain-adapted methods in legal NLP. In recent times, transformer-based architectures, e.g., Bidirectional Encoder Representations from Transformers (BERT) [<xref ref-type="bibr" rid="ref-1">1</xref>], and domain-specific variants, e.g., LegalBERT [<xref ref-type="bibr" rid="ref-2">2</xref>], have demonstrated significant performance improvements through pretraining on large-scale legal corpora that better reflect legal language. Further specialization has been achieved with encoders such as CaseLaw-BERT [<xref ref-type="bibr" rid="ref-12">12</xref>], which are tailored to judicial opinions and show improved performance on benchmark datasets like EURLEX and LexGLUE [<xref ref-type="bibr" rid="ref-3">3</xref>]. Nevertheless, despite these advances, transformer-based models continue to struggle with under-represented or low-resource legal categories, which motivates ongoing research into few-shot and zero-shot learning paradigms [<xref ref-type="bibr" rid="ref-4">4</xref>] as a means of enhancing generalization.</p>
<p>Data augmentation is a popular technique used to address data sparsity and improve model generalization, especially in scenarios with limited labeled data. Early work such as back-translation [<xref ref-type="bibr" rid="ref-13">13</xref>] illustrated the effectiveness of simple cross-lingual transformations for expanding training corpora. Rule-based techniques (e.g., back-translation and synonym replacement) are widely adopted in general NLP [<xref ref-type="bibr" rid="ref-14">14</xref>]. These approaches offer interpretability and ease of implementation, but often introduce noise or distort legal semantics when applied directly to legal texts, which typically exhibit rigid syntactic structures and formal language. Token-level strategies, such as term replacement, random swapping, and POS-guided deletion [<xref ref-type="bibr" rid="ref-5">5</xref>], aim to perturb input sequences without significantly altering their meaning. However, these methods do not account for the structural roles of tokens&#x2014;especially in legal documents, where function words and modifiers may carry substantive legal implications. To address this, self-supervised approaches have been proposed. Contextual consistency training [<xref ref-type="bibr" rid="ref-15">15</xref>] encourages models to produce consistent outputs under augmented inputs, while manifold-based methods, such as SSMBA [<xref ref-type="bibr" rid="ref-16">16</xref>], perturb hidden representations to improve robustness against out-of-distribution data. Although promising, these approaches are primarily validated on general NLP tasks (e.g., sentiment classification or QA) and may fail to account for the domain-specific precision required in legal applications. Lightweight schemes like AEDA (An Easier Data Augmentation) [<xref ref-type="bibr" rid="ref-17">17</xref>] offer computational efficiency by randomly inserting punctuation or replacing characters. While such methods improve training diversity, they risk violating the syntactic and semantic constraints of legal text, leading to unnatural or misleading outputs.</p>
<p>Masking-based augmentation removes and replaces tokens selectively to facilitate pattern learning. TF-IDF masking has been shown to be effective in sentiment analysis [<xref ref-type="bibr" rid="ref-6">6</xref>], since it highlights discriminative words and suppresses redundant ones. In the legal domain, however, it is prone to semantic drift, as legally decisive terminology often appears infrequently and thus receives disproportionately low scores. Alternative strategies such as difference masking [<xref ref-type="bibr" rid="ref-7">7</xref>] and IMF [<xref ref-type="bibr" rid="ref-8">8</xref>] refine token selection and replacement, yet neither explicitly safeguards critical legal expressions nor addresses the persistent issue of class imbalance in legal text classification.</p>
<p>Domain-aware alternatives have also been proposed. LegalBERT is pretrained on legal corpora [<xref ref-type="bibr" rid="ref-10">10</xref>], enhancing representation quality, and TF-IDF representations occasionally outperform neural embeddings in legal classification [<xref ref-type="bibr" rid="ref-9">9</xref>]. Token deletion guided by corpus-passage frequency has shown promise in general-domain dense retrieval settings [<xref ref-type="bibr" rid="ref-18">18</xref>], motivating further exploration in domain-specific contexts such as legal NLP. Active learning pipelines further reduce annotation costs [<xref ref-type="bibr" rid="ref-19">19</xref>]. More recently, Ghosh et al. [<xref ref-type="bibr" rid="ref-20">20</xref>] introduced DALE, a selective masking approach tailored to legal language, and Kasthuriarachchy et al. [<xref ref-type="bibr" rid="ref-21">21</xref>] further refined this line of work through meaning-sensitive masking. Duffy et al. [<xref ref-type="bibr" rid="ref-22">22</xref>] examined a hybrid approach combining rule-based and generative augmentation in contract document classification, showing that simpler rules can sometimes outperform more complex generators. Sheik et al. [<xref ref-type="bibr" rid="ref-23">23</xref>] employed prompt engineering and pseudo-labeled data generation in overrule prediction, demonstrating that augmented models consistently outperformed non-augmented baselines and even surpassed few-shot GPT-3 in F1 score. Despite these advances, none of the existing methods integrates frequency-based token importance with the preservation of essential legal terms, both of which are crucial for robust legal text augmentation.</p>
<p>To address the limitations of prior methods, we propose a masking strategy that preserves legally salient terms while filtering peripheral ones. By leveraging corpus-level token statistics, our method enhances semantic fidelity and improves classification robustness in legal NLP tasks.</p>
</sec>
<sec id="s3">
<label>3</label>
<title>Proposed Method</title>
<p>In this section, we first explain the rationale behind the proposed masking strategy by comparing it with conventional TF-IDF-based masking strategies. Next, the data preparation procedure is introduced, and the details of the legal case dataset are presented. Finally, we describe the proposed masking method.</p>
<sec id="s3_1">
<label>3.1</label>
<title>Rationale</title>
<p><xref ref-type="fig" rid="fig-1">Fig. 1</xref> illustrates the masking tendencies of different strategies for different legal terms based on their TF and DF values. Existing augmentation methods often rely on TF-IDF to identify key legal terms. However, case-specific legal terms, e.g., &#x201C;cancellation&#x201D; and &#x201C;inheritance,&#x201D; tend to appear in only a narrow range of cases, thereby exhibiting low TF-IDF values. Consequently, they are unintentionally treated as unimportant and may be masked during augmentation, degrading classification performance. This mismatch highlights the need for a domain-aware weighting scheme that can distinguish between genuinely irrelevant terms and legally decisive yet sparsely distributed expressions. We contend that masking terms with low TF-IDF values is not necessarily the same as masking unimportant terms in the legal domain.</p>
<fig id="fig-1">
<label>Figure 1</label>
<caption>
<title>TF-DF masking tendency map. Each term is positioned based on its TF and DF within the corpus. Essential legal terms (e.g., &#x201C;Cancellation,&#x201D; &#x201C;Inheritance,&#x201D; &#x201C;Public Official&#x201D;) appear in the circle zone, showing that TF-IDF masking may incorrectly remove them. The diamond zone denotes terms preserved by both strategies, while the triangle zone indicates generic terms masked by both</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_74141-fig-1.tif"/>
</fig>
<p>The <inline-formula id="ieqn-1"><mml:math id="mml-ieqn-1"><mml:mi>x</mml:mi></mml:math></inline-formula>-axis represents document frequency (DF), whereas IDF decreases as DF increases (<inline-formula id="ieqn-2"><mml:math id="mml-ieqn-2"><mml:mrow><mml:mtext>IDF</mml:mtext></mml:mrow><mml:mo>=</mml:mo><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mfrac><mml:mi>N</mml:mi><mml:mrow><mml:mi>D</mml:mi><mml:mi>F</mml:mi></mml:mrow></mml:mfrac></mml:math></inline-formula>); therefore, the right-hand side of the plot indicates tokens with smaller IDF values, corresponding to more common terms within the corpus. Each marker in <xref ref-type="fig" rid="fig-1">Fig. 1</xref> corresponds to the TF-DF coordinate of an individual token, illustrating token-level masking outcomes rather than predefined regions: circles (&#x2022;) indicating their likelihood to be masked by TF-IDF, squares (<inline-formula id="ieqn-3"><mml:math id="mml-ieqn-3"><mml:mi>&#x25FC;</mml:mi></mml:math></inline-formula>) denote tokens masked only by the proposed TF-DF strategy, triangles (<inline-formula id="ieqn-4"><mml:math id="mml-ieqn-4"><mml:mi>&#x25B4;</mml:mi></mml:math></inline-formula>) represent tokens masked by both methods, and diamonds (<inline-formula id="ieqn-5"><mml:math id="mml-ieqn-5"><mml:mi>&#x29EB;</mml:mi></mml:math></inline-formula>) mark tokens preserved by both. This clarification distinguishes token-specific masking behavior from the broader conceptual zones described in the rationale, ensuring consistent interpretation of <xref ref-type="fig" rid="fig-1">Fig. 1</xref>.
<list list-type="bullet">
<list-item>
<p>The term &#x201C;Cancellation,&#x201D; located in the lower-left region of the TF-DF landscape (close to the <inline-formula id="ieqn-6"><mml:math id="mml-ieqn-6"><mml:mi>x</mml:mi></mml:math></inline-formula>-axis, indicating both low term frequency and low document frequency), is essential for identifying cases involving administrative disposition cancellations, and facilitates their distinction from other administrative or tax-related matters.</p></list-item>
<list-item>
<p>The term &#x201C;Inheritance,&#x201D; also situated in the lower-left region near the <inline-formula id="ieqn-7"><mml:math id="mml-ieqn-7"><mml:mi>x</mml:mi></mml:math></inline-formula>-axis, pertains to civil-law disputes over succession and property division. It appears in many judgments, but only a few times per document (low TF), placing it close to the <inline-formula id="ieqn-8"><mml:math id="mml-ieqn-8"><mml:mi>x</mml:mi></mml:math></inline-formula>-axis. Therefore, TF-IDF tends to mask it despite it being a strong signal for the &#x201C;inheritance-related civil case&#x201D; category. Ideally, it should be retained.</p></list-item>
<list-item>
<p>The term &#x201C;Public Official,&#x201D; positioned in the lower-central region (relatively low TF and moderate DF), is an indicator of cases involving the responsibilities or duties of public officials and helps classify administrative disputes.</p></list-item>
<list-item>
<p>The term &#x201C;Cause,&#x201D; appearing in the lower-right region (moderate TF and high DF), occurs in almost every opinion. Its high DF and moderate TF motivate TF-DF to down-weight it; therefore, masking this token removes generic terminology that does not aid fine-grained classification.</p></list-item>
<list-item>
<p>The term &#x201C;Principle,&#x201D; located slightly rightward in the lower-right region, is frequently used in abstract phrases such as &#x201C;principle of good faith&#x201D; or &#x201C;principle of proportionality&#x201D;. As it is ubiquitous across judgments and rarely decisive in the outcome, TF-DF correctly identifies it as a low-salience token to be masked.</p></list-item>
<list-item>
<p>The term &#x201C;Remand,&#x201D; placed at the far right of the lower region (relatively low TF and high DF), signals procedural posture&#x2014;namely, when an appellate court sends a case back to a lower court. Although it can dominate term counts in appellate opinions, it conveys little substantive information about the underlying legal issue; therefore, masking it prevents the model from relying on procedural cues rather than topic-specific content.</p></list-item>
</list></p>
<p><xref ref-type="fig" rid="fig-1">Fig. 1</xref> reveals a critical drawback of the TF-IDF strategy&#x2014;many domain-specific and case-specific terms, e.g., &#x201C;Cancellation,&#x201D; &#x201C;Inheritance,&#x201D; and &#x201C;Public Official&#x201D; are located in the lower-left area (low TF and low DF), indicating their likelihood to be masked by TF-IDF. Although they appear infrequently across the corpus, these terms are essential for determining the legal context of specific cases. In contrast, the proposed TF-DF-based masking exhibits improved sensitivity to such legal relevance by preserving these terms. This figure supports the rationale behind the proposed TF-DF-based masking strategy and highlights its difference from existing TF-IDF-based preservation, demonstrating its ability to differentiate legally significant terms from generic ones. This figure substantiates the rationale of the proposed TF-DF approach and highlights its ability to distinguish legally salient tokens from generic ones, ensuring that data augmentation does not distort the core legal reasoning in case texts.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Data Preparation</title>
<p>Data preparation in this study follows a systematic multi-stage pipeline designed to transform raw judicial texts into a standardized format suitable for classification, as illustrated in <xref ref-type="fig" rid="fig-2">Fig. 2</xref>. The dataset is derived from South Korean court rulings, originally provided by the Korean Ministry of Government Legislation via Law Open Data (<ext-link ext-link-type="uri" xlink:href="https://open.law.go.kr">https://open.law.go.kr</ext-link>, accessed on 18 November 2025), and consists of 87,160 legal cases spanning more than seven decades, from 13 January 1952 to 29 February 2024. This long temporal coverage ensures that the dataset captures the evolution of judicial language and legal reasoning in Korea, providing a valuable resource for both historical and contemporary analyses.</p>
<fig id="fig-2">
<label>Figure 2</label>
<caption>
<title>Overview of the legal judgment preprocessing and labeling pipeline, from raw court documents to standardized classification. The process includes personal information masking, legal reference extraction, segment selection, and global case type unification</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_74141-fig-2.tif"/>
</fig>
<p><xref ref-type="table" rid="table-1">Table 1</xref> presents the number of cases for each class along with descriptive statistics, including quartiles, means, and Std of the token lengths per case. In this study, we denote the Intellectual Property Law (IP Property) for brevity. This category encompasses legal disputes concerning patents, trademarks, copyrights, and design rights. A substantial class imbalance is observed: while the largest class (Civil Law) contains 39,830 cases, the smallest category has fewer than 1273 cases, reflecting the uneven distribution of case types in real-world judicial practice and making it challenging for models to learn minority categories effectively.</p>
<table-wrap id="table-1">
<label>Table 1</label>
<caption>
<title>Dataset characteristics across legal categories. Q1, Median, and Q3 represent the 25th, 50th, and 75th percentiles of the number of tokens per case. Mean and standard deviation (Std) are also reported</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/> </colgroup>
<thead>
<tr>
<th>Law class</th>
<th>Cases</th>
<th>Q1 (25%)</th>
<th>Median</th>
<th>Q3 (75%)</th>
<th>Max</th>
<th>Mean <inline-formula id="ieqn-9"><mml:math id="mml-ieqn-9"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> Std</th>
</tr>
</thead>
<tbody>
<tr>
<td>Civil</td>
<td>39,830</td>
<td>316</td>
<td>570</td>
<td>1000</td>
<td>29,976</td>
<td>845.30 <inline-formula id="ieqn-10"><mml:math id="mml-ieqn-10"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 989.64</td>
</tr>
<tr>
<td>Criminal</td>
<td>20,454</td>
<td>228</td>
<td>443</td>
<td>858</td>
<td>111,734</td>
<td>931.65 <inline-formula id="ieqn-11"><mml:math id="mml-ieqn-11"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 2501.44</td>
</tr>
<tr>
<td>Administrative</td>
<td>12,660</td>
<td>318</td>
<td>569</td>
<td>1018</td>
<td>42,415</td>
<td>860.93 <inline-formula id="ieqn-12"><mml:math id="mml-ieqn-12"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 1071.01</td>
</tr>
<tr>
<td>Taxation</td>
<td>9656</td>
<td>264</td>
<td>445</td>
<td>772</td>
<td>26,339</td>
<td>668.09 <inline-formula id="ieqn-13"><mml:math id="mml-ieqn-13"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 816.49</td>
</tr>
<tr>
<td>Intellectual property</td>
<td>3287</td>
<td>251</td>
<td>384</td>
<td>668.5</td>
<td>16,567</td>
<td>643.36 <inline-formula id="ieqn-14"><mml:math id="mml-ieqn-14"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 873.08</td>
</tr>
<tr>
<td>Family</td>
<td>1273</td>
<td>214</td>
<td>392.5</td>
<td>772</td>
<td>7835</td>
<td>620.45 <inline-formula id="ieqn-15"><mml:math id="mml-ieqn-15"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 674.98</td>
</tr>
<tr>
<td><bold>Total</bold></td>
<td>87,160</td>
<td>281</td>
<td>513</td>
<td>934</td>
<td>111,734</td>
<td>838.18 <inline-formula id="ieqn-16"><mml:math id="mml-ieqn-16"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 1505.41</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>A primary challenge in preparing this dataset lies in the accurate separation of judicial decisions from legal reasoning, particularly in cases involving complex or multi-layered arguments. To address this, case facts, claims, and judicial decisions were extracted selectively using custom Python scripts, which systematically remove irrelevant metadata such as judge names, court divisions, and procedural annotations. This ensures that the processed text focuses on substantive legal content. The process begins with data extraction from court archives and relevant sources, followed by preprocessing steps that filter out extraneous information and personal identifiers to maintain textual consistency. The cleaned documents are then organized and assigned to their corresponding classification categories, resulting in a corpus that is both structured and legally interpretable.</p>
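<p>As a minimal illustration of this filtering stage, the following Python sketch removes procedural metadata with regular expressions. The patterns shown are illustrative assumptions modeled on the example in <xref ref-type="table" rid="table-2">Table 2</xref>; the actual scripts operate on Korean-language rulings and use more elaborate rules.</p>
<preformat preformat-type="code">import re

# Illustrative patterns only (assumed); the actual pipeline uses
# Korean-language rules for party names, judges, and court divisions.
METADATA_PATTERNS = [
    r"Plaintiff:\s*[^,.]+",                                   # party names
    r"Defendant:\s*[^,.]+",                                   # party names
    r"(Supreme Court Justice|Presiding Judge|Judge)\s*\S*",   # judge names
]

def strip_metadata(text):
    """Remove procedural metadata and identifiers from one ruling."""
    for pattern in METADATA_PATTERNS:
        text = re.sub(pattern, "", text)
    return re.sub(r"\s{2,}", " ", text).strip()  # tidy leftover whitespace</preformat>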
<p>Given the hybrid nature of the South Korean legal system, which integrates statutory law from civil law traditions with precedent-based reasoning from common law, the dataset requires meticulous structuring [<xref ref-type="bibr" rid="ref-24">24</xref>]. Legal phrase identification is performed to accurately segment case decisions into linguistically meaningful units. This is particularly important in Korean, an agglutinative language where grammatical particles and suffixes convey critical semantic cues. To ensure ethical compliance, personally identifiable information (e.g., party names, addresses, and references to individuals) is systematically removed, thereby safeguarding privacy while preserving the essential content required for downstream NLP tasks.</p>
<p>Unlike existing datasets [<xref ref-type="bibr" rid="ref-25">25</xref>], which primarily focus on criminal and civil law, the proposed dataset encompasses a broader spectrum of legal domains, including family law, intellectual property law, taxation, and administrative law. This broader coverage not only improves the representativeness of the dataset but also enables the training of models capable of handling the diversity of real-world legal cases. <xref ref-type="table" rid="table-2">Table 2</xref> summarizes the outputs at each stage of the dataset refinement pipeline.</p>
<table-wrap id="table-2">
<label>Table 2</label>
<caption>
<title>Example of the output at each processing stage (The content of the legal document is shortened for clarity.)</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/> </colgroup>
<thead>
<tr>
<th>Stage</th>
<th>Output example</th>
</tr>
</thead>
<tbody>
<tr>
<td>Original</td>
<td>Plaintiff: Hong Gil-dong, Defendant: Kim Cheol-soo. Judgment Outline: The plaintiff&#x2019;s claim is dismissed. Supreme Court Justice OO, Judge OO, Presiding Judge.</td>
</tr>
<tr>
<td>Case filtering</td>
<td>The case interprets law under supplementary provisions (Article 23(1), Labor Standards Act).</td>
</tr>
<tr>
<td>Add legal ref</td>
<td>Key Laws: Labor Standards Act 23 Provision: Dismissal is prohibited without just cause.</td>
</tr>
<tr>
<td>Labeling</td>
<td>Detailed Categories: Wrongful Dismissal Claim. Referenced Cases: 2010Da98765, 2015Da12345. Text: Plaintiff&#x2019;s claim is dismissed for lack of just cause.</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Proposed Masking Method</title>
<p>The proposed augmentation method aims to preserve legally important expressions, while introducing corpus-aware variability via probabilistic masking. Instead of relying on uniform or randomly applied masking, we leverage corpus-level statistics to determine the tokens that should be replaced by <monospace>[MASK]</monospace> symbols. The full masking algorithm is described in Algorithm 1. The algorithm begins by creating an empty container <inline-formula id="ieqn-17"><mml:math id="mml-ieqn-17"><mml:msup><mml:mrow><mml:mi>&#x1D49F;</mml:mi></mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:msup></mml:math></inline-formula> to store the augmented corpus, thereby establishing a dedicated repository for masked documents that will be generated in the subsequent steps (Line 1). This initialization step, though seemingly simple, is crucial in ensuring that the augmented data are systematically collected and preserved in a manner that is completely separable from the original corpus, thus preventing unintended data leakage or overwriting.</p>
<fig id="fig-6">
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_74141-fig-6.tif"/>
</fig>
<p>Next, every legal document in the corpus is tokenized using the KLUE/BERT WordPiece tokenizer, which is designed to segment words into subword units suitable for transformer-based language models (Line 2). This tokenization is not merely a mechanical preprocessing step but an essential foundation for frequency-based analysis, since subword segmentation captures rare and morphologically complex legal terms more effectively than word-level tokenization. Once tokenized, the document frequency <inline-formula id="ieqn-39"><mml:math id="mml-ieqn-39"><mml:mrow><mml:mi mathvariant="normal">d</mml:mi><mml:mi mathvariant="normal">f</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>&#x03C9;</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> of each token <inline-formula id="ieqn-40"><mml:math id="mml-ieqn-40"><mml:mi>&#x03C9;</mml:mi></mml:math></inline-formula> is computed at the corpus level (Line 3). Here, <inline-formula id="ieqn-41"><mml:math id="mml-ieqn-41"><mml:mrow><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">f</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>&#x03C9;</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> denotes the number of occurrences of <inline-formula id="ieqn-42"><mml:math id="mml-ieqn-42"><mml:mi>&#x03C9;</mml:mi></mml:math></inline-formula> within a document, and <inline-formula id="ieqn-43"><mml:math id="mml-ieqn-43"><mml:mrow><mml:mi mathvariant="normal">d</mml:mi><mml:mi mathvariant="normal">f</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>&#x03C9;</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> the number of documents containing <inline-formula id="ieqn-44"><mml:math id="mml-ieqn-44"><mml:mi>&#x03C9;</mml:mi></mml:math></inline-formula>. These corpus-level statistics play a pivotal role in guiding subsequent masking decisions, because they reveal how widely distributed each token is across documents rather than just within a single text.</p>
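<p>The tokenization and document-frequency steps (Lines 2&#x2013;3) can be sketched in Python as follows; the Hugging Face model identifier <monospace>klue/bert-base</monospace> is assumed to correspond to the KLUE/BERT WordPiece tokenizer used in this study.</p>
<preformat preformat-type="code">from collections import Counter
from transformers import AutoTokenizer

# KLUE/BERT WordPiece tokenizer (Hugging Face model identifier assumed)
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")

def tokenize_corpus(corpus):
    """Segment every document into WordPiece subword tokens (Line 2)."""
    return [tokenizer.tokenize(doc) for doc in corpus]

def document_frequency(tokenized_corpus):
    """df(w): number of documents containing token w (Line 3)."""
    df = Counter()
    for tokens in tokenized_corpus:
        df.update(set(tokens))  # count each token at most once per document
    return df</preformat>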
<p>For each individual document <inline-formula id="ieqn-45"><mml:math id="mml-ieqn-45"><mml:mi>d</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mi>&#x1D49F;</mml:mi></mml:mrow></mml:math></inline-formula>, the algorithm then computes the term frequency <inline-formula id="ieqn-46"><mml:math id="mml-ieqn-46"><mml:mrow><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">f</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>&#x03C9;</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> for all tokens and assigns an importance score to each token based on a TF-DF formulation:
<disp-formula id="eqn-1"><label>(1)</label><mml:math id="mml-eqn-1" display="block"><mml:mi>w</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>&#x03C9;</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mrow><mml:mtext>tf</mml:mtext></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>&#x03C9;</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mi>log</mml:mi><mml:mo>&#x2061;</mml:mo><mml:mstyle scriptlevel="0"><mml:mrow><mml:mo maxsize="1.2em" minsize="1.2em">(</mml:mo></mml:mrow></mml:mstyle><mml:mn>1</mml:mn><mml:mo>+</mml:mo><mml:mrow><mml:mtext>df</mml:mtext></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>&#x03C9;</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mstyle scriptlevel="0"><mml:mrow><mml:mo maxsize="1.2em" minsize="1.2em">)</mml:mo></mml:mrow></mml:mstyle><mml:mo>.</mml:mo></mml:math></disp-formula></p>
<p>The logarithmic smoothing prevents excessively large <inline-formula id="ieqn-47"><mml:math id="mml-ieqn-47"><mml:mrow><mml:mi mathvariant="normal">d</mml:mi><mml:mi mathvariant="normal">f</mml:mi></mml:mrow></mml:math></inline-formula> values from dominating the weighting scheme, thereby avoiding a situation where frequent but uninformative tokens (e.g., procedural markers such as &#x201C;submitted,&#x201D; &#x201C;record,&#x201D; or &#x201C;hearing&#x201D;) overwhelm more discriminative but less frequent terms. It yields smoother scaling across large corpora and reduces sensitivity to corpus size, maintaining consistent importance estimation across datasets. Adding 1 inside the logarithm ensures numerical stability by avoiding undefined values when <inline-formula id="ieqn-48"><mml:math id="mml-ieqn-48"><mml:mi>d</mml:mi><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>&#x03C9;</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:math></inline-formula>. This design mirrors the stabilizing role of the logarithmic component in TF-IDF while reversing its intent&#x2014;to emphasize legally meaningful expressions that recur across multiple cases rather than penalizing them.</p>
<p>To ensure comparability and stable scaling across tokens, the scores are subsequently normalized using min&#x2013;max scaling:
<disp-formula id="eqn-2"><label>(2)</label><mml:math id="mml-eqn-2" display="block"><mml:mrow><mml:mover><mml:mi>w</mml:mi><mml:mo stretchy="false">&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>&#x03C9;</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>w</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>&#x03C9;</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mo movablelimits="true" form="prefix">min</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mi>w</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mo movablelimits="true" form="prefix">max</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mi>w</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mo movablelimits="true" form="prefix">min</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mi>w</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mo>+</mml:mo><mml:mo>&#x03F5;</mml:mo></mml:mrow></mml:mfrac><mml:mo>,</mml:mo></mml:math></disp-formula>where <inline-formula id="ieqn-49"><mml:math id="mml-ieqn-49"><mml:mo>&#x03F5;</mml:mo>
</mml:math></inline-formula> is a small constant introduced for numerical stability. This normalization compresses all weights into the interval <inline-formula id="ieqn-50"><mml:math id="mml-ieqn-50"><mml:mo stretchy="false">[</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">]</mml:mo></mml:math></inline-formula>, thus allowing them to be directly interpreted as probabilistic scaling factors for masking (Lines 4&#x2013;7). Subsequently, for each document <inline-formula id="ieqn-51"><mml:math id="mml-ieqn-51"><mml:mi>d</mml:mi></mml:math></inline-formula>, a copy <inline-formula id="ieqn-52"><mml:math id="mml-ieqn-52"><mml:msup><mml:mi>d</mml:mi><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:msup></mml:math></inline-formula> is created to preserve the original text (Line 8). This duplication ensures that the original legal record remains intact for reference and evaluation, while all augmentation operations are confined to the copy. For each token <inline-formula id="ieqn-53"><mml:math id="mml-ieqn-53"><mml:mi>&#x03C9;</mml:mi></mml:math></inline-formula> in <inline-formula id="ieqn-54"><mml:math id="mml-ieqn-54"><mml:msup><mml:mi>d</mml:mi><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:msup></mml:math></inline-formula>, the algorithm samples a random value <inline-formula id="ieqn-55"><mml:math id="mml-ieqn-55"><mml:mi>r</mml:mi><mml:mo>&#x223C;</mml:mo><mml:mrow><mml:mi>&#x1D4B0;</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> and applies the masking rule: if <inline-formula id="ieqn-56"><mml:math id="mml-ieqn-56"><mml:mi>r</mml:mi><mml:mo>&#x2264;</mml:mo><mml:mi>&#x03B1;</mml:mi><mml:mo>&#x22C5;</mml:mo><mml:mrow><mml:mover><mml:mi>w</mml:mi><mml:mo stretchy="false">&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>&#x03C9;</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>, where <inline-formula id="ieqn-57"><mml:math id="mml-ieqn-57"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula> is a user-defined masking intensity parameter, then the token is replaced with <monospace>[MASK]</monospace> (Lines 9&#x2013;13). This stochastic mechanism introduces controlled randomness into the augmentation process. By linking the masking probability directly to <inline-formula id="ieqn-58"><mml:math id="mml-ieqn-58"><mml:mrow><mml:mover><mml:mi>w</mml:mi><mml:mo stretchy="false">&#x007E;</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>&#x03C9;</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>, the algorithm biases masking toward less-informative and frequently occurring tokens, while simultaneously lowering the likelihood of masking legally significant expressions such as &#x201C;trust asset&#x201D; or &#x201C;unjust dismissal.&#x201D;</p>
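<p>A minimal sketch of the weighting and masking steps (Eqs. (1) and (2), Lines 4&#x2013;13) is given below, assuming the document-frequency table <monospace>df</monospace> from the previous snippet; the arguments <monospace>alpha</monospace> and <monospace>eps</monospace> mirror the masking intensity &#x03B1; and the stability constant &#x03F5;.</p>
<preformat preformat-type="code">import math
import random
from collections import Counter

def mask_document(tokens, df, alpha=0.2, eps=1e-8, mask="[MASK]"):
    """TF-DF-weighted stochastic masking for one tokenized document."""
    tf = Counter(tokens)
    # Eq. (1): w(token) = tf(token) * log(1 + df(token))
    w = {t: tf[t] * math.log(1 + df.get(t, 0)) for t in tf}
    lo, hi = min(w.values()), max(w.values())
    # Eq. (2): min-max normalization; eps keeps the division stable
    w_norm = {t: (v - lo) / (hi - lo + eps) for t, v in w.items()}
    out = []
    for t in tokens:
        r = random.random()  # r ~ U(0, 1)
        # Mask when r &lt;= alpha * w_norm(t): generic, widely distributed
        # tokens receive high weights and are masked more often, while
        # sparse but legally decisive terms tend to be preserved.
        out.append(mask if r &lt;= alpha * w_norm[t] else t)
    return out</preformat>
<p>Applying <monospace>mask_document</monospace> to a copy of each document and appending the result to the augmented container reproduces Lines 8&#x2013;16 of Algorithm 1.</p>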
<p>A notable advantage of the stochastic masking strategy is its ability to reduce redundancy in the augmented corpus. Deterministic masking would consistently replace the same tokens across documents, resulting in a less diverse dataset [<xref ref-type="bibr" rid="ref-15">15</xref>]. In contrast, the stochastic rule introduces variability between augmented instances, enriching the training distribution with multiple plausible variants of the same document. In this study, we empirically set the masking intensity parameter <inline-formula id="ieqn-59"><mml:math id="mml-ieqn-59"><mml:mi>&#x03B1;</mml:mi><mml:mo>=</mml:mo><mml:mn>0.2</mml:mn></mml:math></inline-formula>, following prior works [<xref ref-type="bibr" rid="ref-17">17</xref>,<xref ref-type="bibr" rid="ref-20">20</xref>,<xref ref-type="bibr" rid="ref-26">26</xref>] that demonstrated its effectiveness in balancing coverage and diversity in stochastic masking; a sensitivity analysis was also performed to validate the stability of this choice.</p>
<p>Once all tokens in a document have been processed under this stochastic masking regime, the resulting augmented document <inline-formula id="ieqn-60"><mml:math id="mml-ieqn-60"><mml:msup><mml:mi>d</mml:mi><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:msup></mml:math></inline-formula> is appended to the container <inline-formula id="ieqn-61"><mml:math id="mml-ieqn-61"><mml:msup><mml:mrow><mml:mi>&#x1D49F;</mml:mi></mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:msup></mml:math></inline-formula> (Line 14). This process repeats for every document in the input corpus, gradually populating <inline-formula id="ieqn-62"><mml:math id="mml-ieqn-62"><mml:msup><mml:mrow><mml:mi>&#x1D49F;</mml:mi></mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:msup></mml:math></inline-formula> with augmented versions that maintain the core semantic and legal reasoning of the originals while discarding extraneous information. After all documents have been processed, the fully constructed augmented corpus <inline-formula id="ieqn-63"><mml:math id="mml-ieqn-63"><mml:msup><mml:mrow><mml:mi>&#x1D49F;</mml:mi></mml:mrow><mml:mi mathvariant="normal">&#x2032;</mml:mi></mml:msup></mml:math></inline-formula> is returned as output (Lines 15&#x2013;16). This corpus serves as the foundation for downstream training, offering a richer and more balanced dataset for classification tasks.</p>
<p>Through stochastic masking guided by TF-DF weighting, the proposed method suppresses peripheral terms such as dates or procedural phrases while preserving legally decisive expressions, thereby operationalizing the algorithmic rationale described above in a concrete and systematic manner. <xref ref-type="fig" rid="fig-3">Fig. 3</xref> illustrates the overall workflow of this masking process in greater detail, highlighting how corpus-level statistics are used to determine token salience and guide the replacement of low-importance terms during augmentation.</p>
<fig id="fig-3">
<label>Figure 3</label>
<caption>
<title>Overview of proposed augmentation process. The pipeline includes tokenization, document frequency calculation, weight computation, and selective masking</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_74141-fig-3.tif"/>
</fig>
<p>Furthermore, <xref ref-type="table" rid="table-3">Table 3</xref> presents illustrative examples of masked outputs, demonstrating that high-frequency but legally irrelevant expressions&#x2014;such as dates, specific object names, procedural details, or other peripheral descriptors frequently appearing in judicial texts&#x2014;are effectively suppressed, whereas pivotal legal terms remain intact, thereby preserving the core semantic structure required for correct legal interpretation. These observations highlight the importance of maintaining semantic fidelity in legal text augmentation, as even minor hallucinations can compromise downstream classification and retrieval tasks by subtly altering the factual framing or doctrinal meaning of a case.</p>
<table-wrap id="table-3">
<label>Table 3</label>
<caption>
<title>Examples of masked terms across different categories</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/> </colgroup>
<thead>
<tr>
<th>Type</th>
<th>Augmentation (Before)</th>
<th>Augmentation (After)</th>
</tr>
</thead>
<tbody>
<tr>
<td>Date, Time, Monetary values</td>
<td>2017, 1981. 11. 16</td>
<td>The wedding ceremony on May [MASK], 2010</td>
</tr>
<tr>
<td>Specific entities, Individuals</td>
<td>(with a pencil sharpener)</td>
<td>(with a [MASK] sharpener)</td>
</tr>
<tr>
<td>Legal procedural details</td>
<td>(Tax notice for revocation of seizure disposition)</td>
<td>(Tax notice for revocation of [MASK] disposition)</td>
</tr>
</tbody>
</table>
</table-wrap>
<p><xref ref-type="table" rid="table-4">Table 4</xref> shows that while AEDA introduces only minor structural noise, DALE fundamentally alters the factual and legal nature of the case itself. Such distortions illustrate why generative augmentation methods are unsuitable for high-fidelity legal datasets, where even subtle lexical substitutions can invert judicial meaning.</p>
<table-wrap id="table-4">
<label>Table 4</label>
<caption>
<title>Examples of semantic and structural distortions caused by AEDA [<xref ref-type="bibr" rid="ref-17">17</xref>] and DALE [<xref ref-type="bibr" rid="ref-20">20</xref>] in legal judgments</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/> </colgroup>
<thead>
<tr>
<th>Method</th>
<th>Description and Example</th>
</tr>
</thead>
<tbody>
<tr>
<td><bold>TF-DF masking (Proposed)</bold></td>
<td><italic>Original:</italic> &#x201C;The defendant shall bear the litigation costs.&#x201D; <italic>Augmented:</italic> &#x201C;The [MASK] shall bear the litigation costs.&#x201D;</td>
</tr>
<tr>
<td><bold>AEDA</bold></td>
<td><italic>Original:</italic> &#x201C;According to Article 54-2 Paragraph 1&#x201D; <italic>Augmented:</italic> &#x201C;According to Article 54-2, Paragraph 1&#x201D;</td>
</tr>
<tr>
<td><bold>DALE</bold></td>
<td><italic>Original:</italic> &#x201C;The court annuls the defendant&#x2019;s <bold>damages disposition</bold>.&#x201D; <italic>Generated:</italic> &#x201C;The court annuls the defendant&#x2019;s <bold>information disclosure refusal</bold>.&#x201D;</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experimental Results</title>
<p>This section presents the experimental validation of the proposed method in terms of legal text classification performance. The performance of classification models is assessed with and without data augmentation.</p>
<sec id="s4_1">
<label>4.1</label>
<title>Experimental Settings</title>
<p>For evaluation, the dataset is split into three subsets: 60% for training, 20% for validation, and 20% for testing. To mitigate the imbalance problem, balanced augmentation [<xref ref-type="bibr" rid="ref-27">27</xref>] is applied after augmented samples are generated through the TF-DF masking procedure: the number of cases in each category is adjusted to match the largest class by adding non-redundant masked documents, rather than by simply oversampling or duplicating smaller classes. This ensures that underrepresented categories are not disproportionately overlooked. Balanced augmentation only equalizes the number of samples across legal categories and does not modify the textual content of any document, which inherently isolates the contribution of each augmentation strategy. All augmentations were evaluated under identical conditions using the same balanced dataset and hyperparameter settings; therefore, any observed performance differences arise solely from the augmentation strategy itself rather than from variations in training data or optimization.</p>
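<p>A sketch of this balancing step is shown below, assuming masked variants are drawn with a routine such as the <monospace>mask_document</monospace> sketch in Section 3.3; the grouping and sampling details of the actual implementation may differ.</p>
<preformat preformat-type="code">from collections import defaultdict

def balance_with_masking(docs, labels, df, alpha=0.2):
    """Grow every class to the size of the largest class by adding
    non-redundant masked variants of its own documents."""
    by_class = defaultdict(list)
    for tokens, label in zip(docs, labels):
        by_class[label].append(tokens)
    target = max(len(members) for members in by_class.values())
    out_docs, out_labels = [], []
    for label, members in by_class.items():
        augmented = list(members)
        seen = {tuple(t) for t in members}
        i = 0
        # Attempt cap avoids looping forever on degenerate documents
        while len(augmented) &lt; target and i &lt; 100 * target:
            variant = mask_document(members[i % len(members)], df, alpha)
            if tuple(variant) not in seen:  # no simple duplication
                seen.add(tuple(variant))
                augmented.append(variant)
            i += 1
        out_docs.extend(augmented)
        out_labels.extend([label] * len(augmented))
    return out_docs, out_labels</preformat>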
<p>To ensure a fair comparison, we evaluated four configurations: a &#x2018;No Augmentation&#x2019; baseline that trains on the original, unaltered dataset and serves as a reference for evaluating the contribution of each augmentation method; a POS Deletion baseline that randomly removes part-of-speech&#x2013;based tokens; a TF-IDF Masking baseline that masks tokens according to their inverse document frequency; and the proposed TF-DF Masking method. <xref ref-type="fig" rid="fig-4">Fig. 4</xref> illustrates the class distributions before and after augmentation.</p>
<fig id="fig-4">
<label>Figure 4</label>
<caption>
<title>Number of samples per category before and after augmentation. Augmentation is applied to balance all categories to match the largest class, rather than simply duplicating smaller ones</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_74141-fig-4.tif"/>
</fig>
<p>Transformer-based models fine-tuned for legal case classification are considered. The models are trained using an AdamW optimizer with hyperparameters <inline-formula id="ieqn-64"><mml:math id="mml-ieqn-64"><mml:msub><mml:mi>&#x03B2;</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:mn>0.9</mml:mn></mml:math></inline-formula>; <inline-formula id="ieqn-65"><mml:math id="mml-ieqn-65"><mml:msub><mml:mi>&#x03B2;</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:mn>0.999</mml:mn></mml:math></inline-formula>; weight decay &#x003D; 0.01; initial learning rate &#x003D; <inline-formula id="ieqn-66"><mml:math id="mml-ieqn-66"><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mn>5</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>, adjusted via a linear scheduler; number of epochs &#x003D; 20; and batch size &#x003D; 16. Each experiment is repeated 10 times to ensure robust and reliable evaluation. Model performance is evaluated using accuracy and F1 score, two metrics widely used in text classification. Accuracy measures the proportion of correctly classified cases relative to all legal categories and is defined as follows:
<disp-formula id="eqn-3"><label>(3)</label><mml:math id="mml-eqn-3" display="block"><mml:mrow><mml:mtext>Accuracy</mml:mtext></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>T</mml:mi><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>T</mml:mi><mml:mi>N</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>N</mml:mi></mml:mrow></mml:mfrac><mml:mo>,</mml:mo></mml:math></disp-formula>where <italic>TP</italic>, <italic>TN</italic>, <italic>FP</italic>, and <italic>FN</italic> denote true positives, true negatives, false positives, and false negatives, respectively.</p>
<p>The F1 score, which represents the harmonic mean of precision <inline-formula id="ieqn-67"><mml:math id="mml-ieqn-67"><mml:mrow><mml:mo>(</mml:mo><mml:mstyle displaystyle="false" scriptlevel="0"><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>P</mml:mi></mml:mrow></mml:mfrac></mml:mstyle><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> and recall <inline-formula id="ieqn-68"><mml:math id="mml-ieqn-68"><mml:mrow><mml:mo>(</mml:mo><mml:mstyle displaystyle="false" scriptlevel="0"><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>N</mml:mi></mml:mrow></mml:mfrac></mml:mstyle><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula>, is defined as
<disp-formula id="eqn-4"><label>(4)</label><mml:math id="mml-eqn-4" display="block"><mml:mrow><mml:mtext>F1</mml:mtext></mml:mrow><mml:mo>=</mml:mo><mml:mn>2</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mfrac><mml:mrow><mml:mrow><mml:mtext>Precision</mml:mtext></mml:mrow><mml:mo>&#x00D7;</mml:mo><mml:mrow><mml:mtext>Recall</mml:mtext></mml:mrow></mml:mrow><mml:mrow><mml:mrow><mml:mtext>Precision</mml:mtext></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mtext>Recall</mml:mtext></mml:mrow></mml:mrow></mml:mfrac><mml:mo>.</mml:mo></mml:math></disp-formula></p>
<p>This metric provides a balanced evaluation of classification performance by combining both precision and recall.</p>
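<p>Both metrics can be computed directly from model predictions; a minimal sketch using <monospace>scikit-learn</monospace> follows, where <monospace>y_true</monospace> and <monospace>y_pred</monospace> are hypothetical label arrays.</p>
<preformat preformat-type="code">from sklearn.metrics import accuracy_score, f1_score

# Hypothetical gold and predicted category indices
y_true = [0, 1, 2, 2, 5, 3]
y_pred = [0, 1, 2, 4, 5, 3]

accuracy = accuracy_score(y_true, y_pred)             # Eq. (3)
macro_f1 = f1_score(y_true, y_pred, average="macro")  # Eq. (4), macro-averaged
print(f"Accuracy: {accuracy:.4f}, Macro F1: {macro_f1:.4f}")</preformat>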
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Performance Comparison</title>
<p>We evaluated the effectiveness of the proposed TF-DF&#x2013;based masking augmentation method using two representative models, BERT [<xref ref-type="bibr" rid="ref-1">1</xref>] and LegalBERT [<xref ref-type="bibr" rid="ref-2">2</xref>]. <xref ref-type="table" rid="table-5">Tables 5</xref> and <xref ref-type="table" rid="table-6">6</xref> summarize the results in terms of accuracy and macro F1. Paired <italic>t</italic>-tests conducted on accuracy across ten runs confirmed statistically significant improvements for both models (<inline-formula id="ieqn-69"><mml:math id="mml-ieqn-69"><mml:mi>p</mml:mi><mml:mo>&#x003C;</mml:mo><mml:mn>0.01</mml:mn></mml:math></inline-formula>): LegalBERT (<inline-formula id="ieqn-70"><mml:math id="mml-ieqn-70"><mml:mi>p</mml:mi><mml:mo>&#x2248;</mml:mo><mml:mn>0.007</mml:mn></mml:math></inline-formula>) and BERT (<inline-formula id="ieqn-71"><mml:math id="mml-ieqn-71"><mml:mi>p</mml:mi><mml:mo>&#x003C;</mml:mo><mml:mn>0.0001</mml:mn></mml:math></inline-formula>) both showed meaningful accuracy gains following TF-DF masking. For LegalBERT, the proposed method achieved an accuracy of 0.8551, outperforming TF-IDF (0.8384), POS-based deletion (0.7775), and no augmentation (0.7436). Similarly, the macro F1 score reached 0.8453, a clear improvement over TF-IDF (0.7916), POS (0.6913), and no augmentation (0.5952). For BERT, the proposed method also achieved the best performance, with an accuracy of 0.9704 and a macro F1 score of 0.9640. The proposed augmentation outperformed the TF-IDF and POS-based methods across both models.</p>
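<p>The significance test can be reproduced with a paired <italic>t</italic>-test over per-run accuracies; a minimal sketch using <monospace>scipy</monospace> follows, with hypothetical score arrays for two configurations.</p>
<preformat preformat-type="code">from scipy.stats import ttest_rel

# Hypothetical per-run accuracies over ten paired runs
acc_tfdf = [0.856, 0.849, 0.861, 0.853, 0.847,
            0.858, 0.850, 0.862, 0.845, 0.857]
acc_tfidf = [0.840, 0.835, 0.842, 0.838, 0.831,
             0.839, 0.836, 0.844, 0.830, 0.841]

t_stat, p_value = ttest_rel(acc_tfdf, acc_tfidf)
print(f"t = {t_stat:.3f}, p = {p_value:.4g}")  # p below 0.01 indicates significance</preformat>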
<table-wrap id="table-5">
<label>Table 5</label>
<caption>
<title>Comparison of accuracy <inline-formula id="ieqn-72"><mml:math id="mml-ieqn-72"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> standard deviation under different augmentation methods: the proposed method, TF-IDF, POS deletion, and no augmentation</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/> </colgroup>
<thead>
<tr>
<th>Model</th>
<th>Proposed</th>
<th>TF-IDF</th>
<th>POS</th>
<th>No augmentation</th>
</tr>
</thead>
<tbody>
<tr>
<td>LegalBERT</td>
<td>0.8551 <inline-formula id="ieqn-73"><mml:math id="mml-ieqn-73"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0090</td>
<td>0.8384 <inline-formula id="ieqn-74"><mml:math id="mml-ieqn-74"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0103</td>
<td>0.7775 <inline-formula id="ieqn-75"><mml:math id="mml-ieqn-75"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0128</td>
<td>0.7436 <inline-formula id="ieqn-76"><mml:math id="mml-ieqn-76"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0091</td>
</tr>
<tr>
<td>BERT</td>
<td>0.9704 <inline-formula id="ieqn-77"><mml:math id="mml-ieqn-77"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0028</td>
<td>0.9591 <inline-formula id="ieqn-78"><mml:math id="mml-ieqn-78"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0007</td>
<td>0.9643 <inline-formula id="ieqn-79"><mml:math id="mml-ieqn-79"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0013</td>
<td>0.9203 <inline-formula id="ieqn-80"><mml:math id="mml-ieqn-80"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0012</td>
</tr>
</tbody>
</table>
</table-wrap><table-wrap id="table-6">
<label>Table 6</label>
<caption>
<title>Comparison of macro F1 scores <inline-formula id="ieqn-88"><mml:math id="mml-ieqn-88"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> standard deviation under different augmentation methods: the proposed method, TF-IDF, POS deletion, and no augmentation</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/> </colgroup>
<thead>
<tr>
<th>Model</th>
<th>Proposed</th>
<th>TF-IDF</th>
<th>POS</th>
<th>No augmentation</th>
</tr>
</thead>
<tbody>
<tr>
<td>LegalBERT</td>
<td>0.8453 <inline-formula id="ieqn-89"><mml:math id="mml-ieqn-89"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0100</td>
<td>0.7916 <inline-formula id="ieqn-90"><mml:math id="mml-ieqn-90"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0155</td>
<td>0.6913 <inline-formula id="ieqn-91"><mml:math id="mml-ieqn-91"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0198</td>
<td>0.5952 <inline-formula id="ieqn-92"><mml:math id="mml-ieqn-92"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0327</td>
</tr>
<tr>
<td>BERT</td>
<td>0.9640 <inline-formula id="ieqn-93"><mml:math id="mml-ieqn-93"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0031</td>
<td>0.9475 <inline-formula id="ieqn-94"><mml:math id="mml-ieqn-94"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0017</td>
<td>0.9530 <inline-formula id="ieqn-95"><mml:math id="mml-ieqn-95"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0034</td>
<td>0.8973 <inline-formula id="ieqn-96"><mml:math id="mml-ieqn-96"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0023</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The macro F1 improvements observed in <xref ref-type="table" rid="table-6">Table 6</xref> are primarily attributable to the category-level gains highlighted in <xref ref-type="table" rid="table-7">Table 7</xref>. Most notably, the Administrative Law category improved substantially under the proposed method (0.9013 <inline-formula id="ieqn-81"><mml:math id="mml-ieqn-81"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0100) compared to TF-IDF (0.8171 <inline-formula id="ieqn-82"><mml:math id="mml-ieqn-82"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0030); because macro F1 weights every category equally, this large per-category gain contributed substantially to the overall score. In addition, the Family Law category, characterized by cultural specificity and nuanced linguistic expressions, benefited from TF-DF masking, improving from 0.7795 <inline-formula id="ieqn-83"><mml:math id="mml-ieqn-83"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0390 (TF-IDF) to 0.8990 <inline-formula id="ieqn-84"><mml:math id="mml-ieqn-84"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0076. This demonstrates that the proposed method effectively preserves contextually critical tokens (e.g., kinship or familial roles) that are often decisive for classification but may be indiscriminately masked under TF-IDF. The Taxation category likewise exhibited meaningful gains (0.8936 <inline-formula id="ieqn-85"><mml:math id="mml-ieqn-85"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0149 <inline-formula id="ieqn-86"><mml:math id="mml-ieqn-86"><mml:mo stretchy="false">&#x2192;</mml:mo></mml:math></inline-formula> 0.9384 <inline-formula id="ieqn-87"><mml:math id="mml-ieqn-87"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0047).</p>
<table-wrap id="table-7">
<label>Table 7</label>
<caption>
<title>Comparison of per-category F1 scores <inline-formula id="ieqn-97"><mml:math id="mml-ieqn-97"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> standard deviation between the proposed method and TF-IDF-based masking, using BERT-based classification</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/> </colgroup>
<thead>
<tr>
<th>Category</th>
<th>Patterns</th>
<th>Proposed</th>
<th>TF-IDF</th>
</tr>
</thead>
<tbody>
<tr>
<td>Civil</td>
<td>23,898</td>
<td>0.9626 <inline-formula id="ieqn-98"><mml:math id="mml-ieqn-98"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0040</td>
<td>0.9306 <inline-formula id="ieqn-99"><mml:math id="mml-ieqn-99"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0005</td>
</tr>
<tr>
<td>Criminal</td>
<td>12,272</td>
<td>0.9921 <inline-formula id="ieqn-100"><mml:math id="mml-ieqn-100"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0015</td>
<td>0.9810 <inline-formula id="ieqn-101"><mml:math id="mml-ieqn-101"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0010</td>
</tr>
<tr>
<td>Administrative</td>
<td>7596</td>
<td>0.9013 <inline-formula id="ieqn-102"><mml:math id="mml-ieqn-102"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0100</td>
<td>0.8171 <inline-formula id="ieqn-103"><mml:math id="mml-ieqn-103"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0030</td>
</tr>
<tr>
<td>Taxation</td>
<td>5794</td>
<td>0.9384 <inline-formula id="ieqn-104"><mml:math id="mml-ieqn-104"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0047</td>
<td>0.8936 <inline-formula id="ieqn-105"><mml:math id="mml-ieqn-105"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0149</td>
</tr>
<tr>
<td>Intellectual property</td>
<td>1972</td>
<td>0.9853 <inline-formula id="ieqn-106"><mml:math id="mml-ieqn-106"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0055</td>
<td>0.9663 <inline-formula id="ieqn-107"><mml:math id="mml-ieqn-107"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0022</td>
</tr>
<tr>
<td>Family</td>
<td>763</td>
<td>0.8990 <inline-formula id="ieqn-108"><mml:math id="mml-ieqn-108"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0076</td>
<td>0.7795 <inline-formula id="ieqn-109"><mml:math id="mml-ieqn-109"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0390</td>
</tr>
</tbody>
</table>
</table-wrap>
<p><xref ref-type="table" rid="table-8">Table 8</xref> presents a qualitative comparison of classification results obtained using different masking strategies. This comparison illustrates how the proposed strategy enhances classification reliability by selectively masking peripheral expressions while retaining critical legal terms, thus maintaining semantic fidelity in the augmented corpus. Each block of three rows corresponds to one case: the original judgment text, the version augmented with the proposed TF-DF masking, and the version augmented with TF-IDF masking. In the &#x201C;Sentence&#x201D; column, tokens surrounded by brackets (e.g., <bold>[word]</bold>) indicate the terms that would have been masked during augmentation according to each strategy. The last column reports the ground-truth label and the prediction produced by LegalBERT when trained with the corresponding input. As shown in Cases 1, 3, the proposed masking strategy preserves legally decisive terms such as &#x201C;seizure invalid&#x201D;, &#x201C;gift tax&#x201D;, enabling the model to predict the correct category. In contrast, TF-IDF masking often removes these essential expressions, which leads to semantic drift and incorrect predictions (e.g., misclassifying taxation as civil law or family law). When TF-IDF masking eliminates key tax-related legal terms such as &#x2018;tax&#x2019;, &#x2018;seizure&#x2019;, and &#x2018;trust Asset&#x2019;, the decisive linguistic markers necessary to situate the dispute within the domain of taxation law are lost. With the fiscal and administrative context removed, the residual sentence can be interpreted merely as a conflict concerning property ownership or possession. Consequently, the model fails to recognize the case as a taxation dispute and instead misclassifies it as a matter falling within the scope of general civil law, particularly property rights disputes.</p>
<table-wrap id="table-8">
<label>Table 8</label>
<caption>
<title>Comparison of legal case classification results obtained using different masking strategies. In each sentence, [word] marks a token that would have been masked during training if selected by the corresponding strategy. The proposed masking strategy retains critical legal expressions, allowing LegalBERT to predict correctly, whereas TF-IDF masking often removes essential terms</title>
</caption>
<table>
<colgroup>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/> </colgroup>
<thead>
<tr>
<th>No.</th>
<th>Version</th>
<th>Sentence (with [word])</th>
<th>Ground truth/Prediction</th>
</tr>
</thead>
<tbody>
<tr>
<td align="center" rowspan="3">1</td>
<td>Original</td>
<td>The tax authority seized property based on unpaid taxes, but the property was trust asset of the plaintiff, making the seizure invalid.</td>
<td>Taxation</td>
</tr>
<tr>
<td>Proposed &#x002B; LegalBERT</td>
<td>The tax authority seized <bold>[property]</bold> based on unpaid taxes, but the property was trust asset of the plaintiff, making the seizure invalid.</td>
<td>Taxation</td>
</tr>
<tr>
<td>TF-IDF &#x002B; LegalBERT</td>
<td>The tax authority seized <bold>[property]</bold> based on unpaid <bold>[taxes]</bold>, but the property was <bold>[trust asset]</bold> of the plaintiff, making the <bold>[seizure invalid]</bold>.</td>
<td>Civil law</td>
</tr>
<tr>
<td align="center" rowspan="3">2</td>
<td>Original</td>
<td>The defendant unlawfully entered the victim&#x2019;s house at night with a pencil sharpener and stole valuable items.</td>
<td>Criminal law</td>
</tr>
<tr>
<td rowspan="2">2</td>
<td>Proposed &#x002B; LegalBERT</td>
<td>The defendant unlawfully entered the <bold>[house]</bold> at night with a <bold>[pencil]</bold> sharpener and committed theft of valuable items.</td>
<td>Criminal law</td>
</tr>
<tr>
<td>TF-IDF &#x002B; LegalBERT</td>
<td>The <bold>[defendant]</bold> unlawfully entered the <bold>[house]</bold> at night with a <bold>[pencil]</bold> sharpener and <bold>[committed theft]</bold> of valuable items.</td>
<td>Administrative</td>
</tr>
<tr>
<td align="center" rowspan="3">3</td>
<td>Original</td>
<td>The tax office imposed gift taxes on family members, but some charges were revoked because the recipients were minors.</td>
<td>Taxation</td>
</tr>
<tr>
<td>Proposed &#x002B; LegalBERT</td>
<td>The tax office imposed <bold>[gift tax]</bold> on family members, but some charges were revoked because recipients were <bold>[minors]</bold>.</td>
<td>Taxation</td>
</tr>
<tr>
<td>TF-IDF &#x002B; LegalBERT</td>
<td>The tax office imposed <bold>[gift tax]</bold> on <bold>[family members]</bold>, but some charges were revoked because recipients were <bold>[minors]</bold>.</td>
<td>Civil law</td>
</tr>
<tr>
<td align="center" rowspan="3">4</td>
<td>Original</td>
<td>The tenant failed to pay rent for two months, so the landlord terminated the lease contract and claimed delivery of the building.</td>
<td>Civil law</td>
</tr>
<tr>
<td>Proposed &#x002B; LegalBERT</td>
<td>The tenant failed to pay rent <bold>[for two months]</bold>, so the landlord <bold>[terminated]</bold> the lease <bold>[contract]</bold> and claimed <bold>[building delivery]</bold>.</td>
<td>Civil law</td>
</tr>
<tr>
<td>TF-IDF &#x002B; LegalBERT</td>
<td>The <bold>[tenant]</bold> failed to pay <bold>[rent]</bold> for two months, so the <bold>[landlord] [terminated]</bold> the <bold>[lease]</bold> contract and claimed <bold>[building delivery]</bold>.</td>
<td>Administrative</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>By contrast, the proposed masking strategy preserves critical legal expressions such as &#x2018;seizure&#x2019;, &#x2018;trust asset&#x2019;, and &#x2018;invalid&#x2019;, thereby maintaining the fiscal and administrative character of the dispute. Even though some peripheral expressions are masked, the presence of these domain-specific tokens enables the model to correctly identify the case as involving the validity of a tax levy, specifically the annulment of a taxation disposition. Taken together, the substantial improvement in Administrative Law, the enhanced handling of culturally specific Family Law cases, and the robust recognition of domain-critical terms in Taxation collectively explain the consistent macro F1 gains of the proposed method across both LegalBERT and BERT.</p>
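<p>To make this token-selection contrast concrete, the following minimal Python sketch masks the least important fraction of tokens given per-token importance scores. The <monospace>importance</monospace> dictionary is an illustrative stand-in for the TF-DF weighting, not the exact formulation used in our experiments:</p>
<preformat>
def mask_low_importance(tokens, importance, alpha=0.2):
    """Mask the fraction alpha of tokens judged least important.

    importance maps tokens to scores; higher means more domain-critical.
    The scores are a simplified stand-in for the TF-DF weighting.
    """
    n_mask = max(1, int(alpha * len(tokens)))
    # Rank token positions from least to most important
    order = sorted(range(len(tokens)),
                   key=lambda i: importance.get(tokens[i], 0.0))
    masked = set(order[:n_mask])
    return ["[MASK]" if i in masked else tok for i, tok in enumerate(tokens)]

# Illustrative scores: decisive legal terms score high, peripheral words low
importance = {"the": 0.1, "tax": 8.0, "authority": 5.0, "seized": 6.0,
              "property": 1.2, "making": 0.3, "seizure": 9.0, "invalid": 8.5}
sentence = "the tax authority seized property making the seizure invalid"
print(" ".join(mask_low_importance(sentence.split(), importance)))
# -> [MASK] tax authority seized property making the seizure invalid
</preformat>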
<p>To assess parameter robustness, we varied the masking intensity parameter <inline-formula id="ieqn-110"><mml:math id="mml-ieqn-110"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula> across four levels: 0.05, 0.1, 0.2, and 0.3. As shown in <xref ref-type="fig" rid="fig-5">Fig. 5</xref>, the F1 scores remained highly consistent across settings, demonstrating that the proposed TF-DF masking is insensitive to small perturbations in <inline-formula id="ieqn-111"><mml:math id="mml-ieqn-111"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula>. Both BERT and LegalBERT achieved their highest performance at <inline-formula id="ieqn-112"><mml:math id="mml-ieqn-112"><mml:mi>&#x03B1;</mml:mi><mml:mo>=</mml:mo><mml:mn>0.2</mml:mn></mml:math></inline-formula>, suggesting that a moderate masking intensity provides an optimal balance between lexical diversity and semantic retention.</p>
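<p>The sweep itself is straightforward to script. The sketch below reuses <monospace>mask_low_importance</monospace> and the illustrative <monospace>importance</monospace> scores from the previous listing; <monospace>train_and_evaluate</monospace> is a hypothetical stub standing in for the actual BERT/LegalBERT fine-tuning and validation pipeline:</p>
<preformat>
def train_and_evaluate(augmented_docs):
    # Hypothetical stub: real code would fine-tune BERT/LegalBERT on the
    # augmented corpus and return the validation macro F1
    return 0.0

# Placeholder corpus; real experiments use the legal judgment dataset
train_docs = [
    "the tax authority seized property making the seizure invalid",
    "the tenant failed to pay rent so the landlord terminated the lease",
]

results = {}
for alpha in (0.05, 0.1, 0.2, 0.3):
    # Re-augment the corpus at each masking intensity
    augmented = [mask_low_importance(doc.split(), importance, alpha)
                 for doc in train_docs]
    results[alpha] = train_and_evaluate(augmented)

best_alpha = max(results, key=results.get)
print(f"F1 by alpha: {results}; best alpha = {best_alpha}")
</preformat>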
<fig id="fig-5">
<label>Figure 5</label>
<caption>
<title>F1 score comparison across different masking intensities (<inline-formula id="ieqn-113"><mml:math id="mml-ieqn-113"><mml:mi>&#x03B1;</mml:mi></mml:math></inline-formula>) for BERT and LegalBERT. The performance remained stable across <inline-formula id="ieqn-114"><mml:math id="mml-ieqn-114"><mml:mi>&#x03B1;</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mo fence="false" stretchy="false">{</mml:mo><mml:mn>0.05</mml:mn><mml:mo>,</mml:mo><mml:mn>0.1</mml:mn><mml:mo>,</mml:mo><mml:mn>0.2</mml:mn><mml:mo>,</mml:mo><mml:mn>0.3</mml:mn><mml:mo fence="false" stretchy="false">}</mml:mo></mml:math></inline-formula></title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_74141-fig-5.tif"/>
</fig>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Discussion</title>
<p>As the size of the training corpus increases, the marginal benefit of data augmentation naturally diminishes, since larger datasets already encompass diverse lexical and syntactic patterns. TF-DF remains particularly useful for low-resource categories or specialized sub-domains where data imbalance continues to limit model generalization.</p>
<p>In terms of computational complexity, the proposed TF-DF masking operates in linear time with respect to the corpus size, requiring only a single pass to compute term and document frequencies, i.e., <inline-formula id="ieqn-115"><mml:math id="mml-ieqn-115"><mml:mi>O</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>N</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>L</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>, where <italic>N</italic> and <italic>L</italic> denote the number of documents and the average number of tokens per document, respectively. By contrast, generative augmentation frameworks such as DALE or Meaning-Sensitive Masking require repeated model inference for each masked token, resulting in <inline-formula id="ieqn-116"><mml:math id="mml-ieqn-116"><mml:mi>O</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>N</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>L</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>M</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> complexity, where <italic>M</italic> represents the cost of a forward pass through a large language model. Consequently, TF-DF achieves comparable semantic fidelity with approximately 3&#x2013;5<inline-formula id="ieqn-117"><mml:math id="mml-ieqn-117"><mml:mo>&#x00D7;</mml:mo></mml:math></inline-formula> lower preprocessing time while avoiding additional GPU-based fine-tuning.</p>
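<p>The linear-time behavior follows from the fact that both statistics are gathered in one pass over the corpus. A minimal sketch is shown below; whitespace tokenization is a simplification standing in for the WordPiece tokenizer used in our experiments:</p>
<preformat>
from collections import Counter

def tfdf_counts(docs):
    """Single pass over the corpus: O(N * L) for N documents with an
    average of L tokens each."""
    tf, df = Counter(), Counter()
    for doc in docs:
        tokens = doc.split()      # whitespace split stands in for WordPiece
        tf.update(tokens)         # corpus-level term frequency
        df.update(set(tokens))    # document frequency: once per document
    return tf, df

docs = ["the tax authority seized property",
        "the plaintiff claimed the seizure invalid"]
tf, df = tfdf_counts(docs)
print(tf["the"], df["the"])       # 3 occurrences, present in 2 documents
</preformat>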
<p>Although the TF-DF framework effectively captures corpus-level token importance, its reliance on statistical weighting may reduce stability when applied to extremely short legal texts such as claims or briefs, where term occurrences are sparse. In such cases, contextual or embedding-based weighting could complement TF-DF by providing semantic cues independent of token frequency. Moreover, because the proposed TF-DF masking relies solely on corpus-level token and document statistics rather than language-specific lexical features, it is inherently language-agnostic and can be applied to legal corpora across different jurisdictions. This design enables the method to generalize beyond Korean texts without requiring additional linguistic adaptation. In addition, we verified its applicability on the LEGAR [<xref ref-type="bibr" rid="ref-3">3</xref>] English legal dataset, where it improved the F1 score from 0.8555 <inline-formula id="ieqn-118"><mml:math id="mml-ieqn-118"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0045 to 0.8629 <inline-formula id="ieqn-119"><mml:math id="mml-ieqn-119"><mml:mo>&#x00B1;</mml:mo></mml:math></inline-formula> 0.0064, further confirming that the method generalizes across jurisdictions and linguistic contexts.</p>
<p>Regarding tokenization, this study relied solely on the WordPiece tokenizer; we acknowledge this as a limitation and suggest that future research examine how alternative tokenization strategies affect masking behavior across languages. Future work could also integrate embedding-based or contextual weighting to improve performance on the short, sparse texts discussed above. In addition, the present study focuses on South Korean legal judgments; future extensions will include multilingual and cross-jurisdictional corpora to test the generality of the proposed approach.</p>
</sec>
<sec id="s6">
<label>6</label>
<title>Conclusions</title>
<p>In this study, we propose a TF-DF-based masking method as a novel data augmentation technique designed to address data imbalances in legal text classification. Unlike conventional augmentation methods, which suffer from semantic drift or struggle to preserve domain-specific legal terminology, the proposed approach selectively masks unimportant terms while preserving key legal expressions.</p>
<p>We evaluate the effectiveness of the proposed method with transformer-based models, including BERT and LegalBERT, on a large-scale legal classification task. The results demonstrate consistent improvements in accuracy and F1 score, particularly for underrepresented legal categories. LegalBERT exhibits the most substantial performance improvements, highlighting the strength of domain-adaptive augmentation.</p>
<p>Beyond the quantitative evaluation, the proposed method is compared with rule-based POS deletion. Although the two methods achieve similar metrics in some settings, our approach preserves essential legal semantics more reliably; for example, it retains crucial modifiers such as &#x201C;asset&#x201D; in &#x201C;trust asset&#x201D;. The proposed method also corrects previous misclassifications, especially in semantically complex domains such as taxation and civil law.</p>
<p>Although the proposed method relies on statistical weighting and cannot fully guarantee the preservation of every legally essential term, it significantly reduces the likelihood of masking such terms compared with traditional approaches. The current study focuses on South Korean legal texts; future work should explore the generalizability of the approach to other legal systems and languages. In addition, we intend to investigate adaptive augmentation strategies using contextual embeddings and attention-based token selection to further enhance performance in legal NLP tasks.</p>
</sec>
</body>
<back>
<ack>
<p>The authors would like to thank Chung-Ang University for providing computational resources and administrative support that contributed to this work.</p>
</ack>
<sec>
<title>Funding Statement</title>
<p>This work was supported by the Institute of Information &#x0026; Communications Technology Planning &#x0026; Evaluation (IITP) grant funded by the Korea government (MSIT) [RS-2021-II211341, Artificial Intelligence Graduate School Program (Chung-Ang University)], and by the Chung-Ang University Graduate Research Scholarship in 2024.</p>
</sec>
<sec>
<title>Author Contributions</title>
<p>The authors confirm contribution to the paper as follows: Conceptualization, Ye-Chan Park; methodology, Ye-Chan Park; software, Ye-Chan Park; validation, Ye-Chan Park, Mohd Asyraf Zulkifley, and Bong-Soo Sohn; investigation, Ye-Chan Park; resources, Ye-Chan Park; writing&#x2014;original draft preparation, Ye-Chan Park; writing&#x2014;review and editing, Mohd Asyraf Zulkifley, Bong-Soo Sohn, and Jaesung Lee; visualization, Ye-Chan Park; supervision, Jaesung Lee; project administration, Jaesung Lee. All authors reviewed the results and approved the final version of the manuscript.</p>
</sec>
<sec sec-type="data-availability">
<title>Availability of Data and Materials</title>
<p>The data that support the findings of this study are openly available at <ext-link ext-link-type="uri" xlink:href="https://huggingface.co/datasets/Yeeachan/korleg">https://huggingface.co/datasets/Yeeachan/korleg</ext-link> (accessed on 18 November 2025).</p>
</sec>
<sec>
<title>Ethics Approval</title>
<p>Not applicable. This study did not involve human participants or animals. All legal case documents used were publicly available and anonymized to remove personal identifiers.</p>
</sec>
<sec sec-type="COI-statement">
<title>Conflicts of Interest</title>
<p>The authors declare no conflicts of interest to report regarding the present study.</p>
</sec>
<ref-list content-type="authoryear">
<title>References</title>
<ref id="ref-1"><label>[1]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Chalkidis</surname> <given-names>I</given-names></string-name>, <string-name><surname>Androutsopoulos</surname> <given-names>I</given-names></string-name>, <string-name><surname>Aletras</surname> <given-names>N</given-names></string-name></person-group>. <article-title>Neural legal judgment prediction in English</article-title>. In: <conf-name>Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics</conf-name>. <publisher-loc>Radnor, PA, USA</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>; <year>2019</year>. p. <fpage>4317</fpage>&#x2013;<lpage>23</lpage>.</mixed-citation></ref>
<ref id="ref-2"><label>[2]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Chalkidis</surname> <given-names>I</given-names></string-name>, <string-name><surname>Fergadiotis</surname> <given-names>M</given-names></string-name>, <string-name><surname>Malakasiotis</surname> <given-names>P</given-names></string-name>, <string-name><surname>Aletras</surname> <given-names>N</given-names></string-name>, <string-name><surname>Androutsopoulos</surname> <given-names>I</given-names></string-name></person-group>. <article-title>LEGAL-BERT: the muppets straight out of law school</article-title>. In: <conf-name>Findings of the Association for Computational Linguistics: EMNLP 2020</conf-name>. <publisher-loc>Radnor, PA, USA</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>; <year>2020</year>. p. <fpage>2898</fpage>&#x2013;<lpage>904</lpage>.</mixed-citation></ref>
<ref id="ref-3"><label>[3]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Chalkidis</surname> <given-names>I</given-names></string-name>, <string-name><surname>Jana</surname> <given-names>A</given-names></string-name>, <string-name><surname>Hartung</surname> <given-names>D</given-names></string-name>, <string-name><surname>Bommarito</surname> <given-names>M</given-names></string-name>, <string-name><surname>Androutsopoulos</surname> <given-names>I</given-names></string-name>, <string-name><surname>Katz</surname> <given-names>D</given-names></string-name>, <etal>et al</etal></person-group>. <chapter-title>LexGLUE: a benchmark dataset for legal language understanding in English</chapter-title>. In: <person-group person-group-type="editor"><string-name><surname>Muresan</surname> <given-names>S</given-names></string-name>, <string-name><surname>Nakov</surname> <given-names>P</given-names></string-name>, <string-name><surname>Villavicencio</surname> <given-names>A</given-names></string-name></person-group>, editors. <source>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics</source>. <publisher-loc>Radnor, PA, USA</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>; <year>2022</year>. p. <fpage>4310</fpage>&#x2013;<lpage>30</lpage>.</mixed-citation></ref>
<ref id="ref-4"><label>[4]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Hakimi Parizi</surname> <given-names>A</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Nokku</surname> <given-names>P</given-names></string-name>, <string-name><surname>Gholamian</surname> <given-names>S</given-names></string-name>, <string-name><surname>Emerson</surname> <given-names>D</given-names></string-name></person-group>. <article-title>A comparative study of prompting strategies for legal text classification</article-title>. In: <conf-name>Proceedings of the Natural Legal Language Processing Workshop 2023</conf-name>. <publisher-loc>Singapore</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>; <year>2023</year>. p. <fpage>258</fpage>&#x2013;<lpage>65</lpage>.</mixed-citation></ref>
<ref id="ref-5"><label>[5]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Chen</surname> <given-names>J</given-names></string-name>, <string-name><surname>Tam</surname> <given-names>D</given-names></string-name>, <string-name><surname>Raffel</surname> <given-names>C</given-names></string-name>, <string-name><surname>Bansal</surname> <given-names>M</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>D</given-names></string-name></person-group>. <article-title>An empirical survey of data augmentation for limited data learning in NLP</article-title>. <source>Tran Assoc Comput Linguist</source>. <year>2023</year>;<volume>11</volume>:<fpage>191</fpage>&#x2013;<lpage>211</lpage>. doi:<pub-id pub-id-type="doi">10.1162/tacl_a_00542</pub-id>.</mixed-citation></ref>
<ref id="ref-6"><label>[6]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Hsu</surname> <given-names>TW</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>CC</given-names></string-name>, <string-name><surname>Huang</surname> <given-names>HH</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>HH</given-names></string-name></person-group>. <article-title>Semantics-preserved data augmentation for aspect-based sentiment analysis</article-title>. In: <conf-name>Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing</conf-name>. <publisher-loc>Radnor, PA, USA</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>; <year>2021</year>. p. <fpage>4417</fpage>&#x2013;<lpage>22</lpage>.</mixed-citation></ref>
<ref id="ref-7"><label>[7]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Wilf</surname> <given-names>A</given-names></string-name>, <string-name><surname>Akter</surname> <given-names>S</given-names></string-name>, <string-name><surname>Mathur</surname> <given-names>L</given-names></string-name>, <string-name><surname>Liang</surname> <given-names>P</given-names></string-name>, <string-name><surname>Mathew</surname> <given-names>S</given-names></string-name>, <string-name><surname>Shou</surname> <given-names>M</given-names></string-name>, <etal>et al</etal></person-group>. <chapter-title>Difference-masking: choosing what to mask in continued pretraining</chapter-title>. In: <source>Findings of the Association for Computational Linguistics: EMNLP 2023</source>. <publisher-loc>Radnor, PA, USA</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>; <year>2023</year>. p. <fpage>13222</fpage>&#x2013;<lpage>34</lpage>.</mixed-citation></ref>
<ref id="ref-8"><label>[8]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Kesgin</surname> <given-names>HT</given-names></string-name>, <string-name><surname>Amasyali</surname> <given-names>MF</given-names></string-name></person-group>. <article-title>Iterative mask filling: an effective text augmentation method using masked language modeling</article-title>. In: <conf-name>Proceedings of International Conference on Advanced Engineering, Technology and Applications</conf-name>. <publisher-loc>Cham, Swizterland</publisher-loc>: <publisher-name>Springer</publisher-name>; <year>2023</year>. p. <fpage>450</fpage>&#x2013;<lpage>63</lpage>.</mixed-citation></ref>
<ref id="ref-9"><label>[9]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Costa</surname> <given-names>JAF</given-names></string-name>, <string-name><surname>Dantas</surname> <given-names>NCD</given-names></string-name></person-group>, <article-title>Silva EDSA</article-title>. In: <conf-name>Evaluating text classification in the legal domain using BERT embeddings</conf-name>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer Nature</publisher-name>; <year>2023</year>. p. <fpage>51</fpage>&#x2013;<lpage>63</lpage>.</mixed-citation></ref>
<ref id="ref-10"><label>[10]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Nair</surname> <given-names>I</given-names></string-name>, <string-name><surname>Modani</surname> <given-names>N</given-names></string-name></person-group>. <article-title>Exploiting language characteristics for legal domain-specific language model pretraining</article-title>. In: <conf-name>Findings of the Association for Computational Linguistics: EACL 2023</conf-name>. <publisher-loc>Radnor, PA, USA</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>; <year>2023</year>. p. <fpage>2516</fpage>&#x2013;<lpage>26</lpage>.</mixed-citation></ref>
<ref id="ref-11"><label>[11]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Katz</surname> <given-names>DM</given-names></string-name></person-group>. <article-title>Quantitative legal prediction-or-how i learned to stop worrying and start preparing for the data-driven future of the legal services industry</article-title>. <source>Emory LJ</source>. <year>2012</year>;<volume>62</volume>:<fpage>909</fpage>.</mixed-citation></ref>
<ref id="ref-12"><label>[12]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Bender</surname> <given-names>EM</given-names></string-name>, <string-name><surname>Koller A. Climbing</surname> <given-names>towards NLU</given-names></string-name></person-group>: <article-title>on meaning, form, and understanding in the age of data</article-title>. In: <conf-name>Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics</conf-name>. <publisher-loc>Radnor, PA, USA</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>; <year>2020</year>. p. <fpage>5185</fpage>&#x2013;<lpage>98</lpage>.</mixed-citation></ref>
<ref id="ref-13"><label>[13]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Sennrich</surname> <given-names>R</given-names></string-name>, <string-name><surname>Haddow</surname> <given-names>B</given-names></string-name>, <string-name><surname>Birch</surname> <given-names>A</given-names></string-name></person-group>. <chapter-title>Improving neural machine translation models with monolingual data</chapter-title>. In: <person-group person-group-type="editor"><string-name><surname>Erk</surname> <given-names>K</given-names></string-name>, <string-name><surname>Smith</surname> <given-names>NA</given-names></string-name></person-group>, editors. <source>Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics</source>. <publisher-loc>Radnor, PA, USA</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>; <year>2016</year>. p. <fpage>86</fpage>&#x2013;<lpage>96</lpage> doi:<pub-id pub-id-type="doi">10.1162/tacl_a_00395</pub-id>.</mixed-citation></ref>
<ref id="ref-14"><label>[14]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Wei</surname> <given-names>J</given-names></string-name>, <string-name><surname>Zou</surname> <given-names>K</given-names></string-name></person-group>. <article-title>EDA: easy data augmentation techniques for boosting performance on text classification tasks</article-title>. In: <conf-name>Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing</conf-name>. <publisher-loc>Radnor, PA, USA</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>; <year>2019</year>. p. <fpage>6382</fpage>&#x2013;<lpage>8</lpage>.</mixed-citation></ref>
<ref id="ref-15"><label>[15]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Xie</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Dai</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Hovy</surname> <given-names>E</given-names></string-name>, <string-name><surname>Luong</surname> <given-names>T</given-names></string-name>, <string-name><surname>Le</surname> <given-names>Q</given-names></string-name></person-group>. <article-title>Unsupervised data augmentation for consistency training</article-title>. <source>Adv Neural Inform Process Syst</source>. <year>2020</year>;<volume>33</volume>:<fpage>6256</fpage>&#x2013;<lpage>68</lpage>.</mixed-citation></ref>
<ref id="ref-16"><label>[16]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Ng</surname> <given-names>N</given-names></string-name>, <string-name><surname>Cho</surname> <given-names>K</given-names></string-name>, <string-name><surname>Ghassemi</surname> <given-names>M</given-names></string-name></person-group>. <article-title>SSMBA: self-supervised manifold based data augmentation for improving out-of-domain robustness</article-title>. In: <conf-name>Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing</conf-name>. <publisher-loc>Radnor, PA, USA</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>; <year>2020</year>. p. <fpage>1268</fpage>&#x2013;<lpage>83</lpage>.</mixed-citation></ref>
<ref id="ref-17"><label>[17]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Karimi</surname> <given-names>A</given-names></string-name>, <string-name><surname>Rossi</surname> <given-names>L</given-names></string-name>, <string-name><surname>Prati</surname> <given-names>A</given-names></string-name></person-group>. <article-title>AEDA: an easier data augmentation technique for text classification</article-title>. In: <conf-name>Findings of the Association for Computational Linguistics: EMNLP 2021</conf-name>. <publisher-loc>Radnor, PA, USA</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>; <year>2021</year>. p. <fpage>2748</fpage>&#x2013;<lpage>54</lpage>.</mixed-citation></ref>
<ref id="ref-18"><label>[18]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Kim</surname> <given-names>KM</given-names></string-name></person-group>. <article-title>A study of data augmentation for dense passage retrieval using corpus-passage frequency-based token deletion [master&#x2019;s thesis]. Seoul, Republic of Korea: Chung-Ang University; 2024</article-title>.</mixed-citation></ref>
<ref id="ref-19"><label>[19]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Mamooler</surname> <given-names>S</given-names></string-name>, <string-name><surname>Lebret</surname> <given-names>R</given-names></string-name>, <string-name><surname>Massonnet</surname> <given-names>S</given-names></string-name>, <string-name><surname>Aberer</surname> <given-names>K</given-names></string-name></person-group>. <article-title>An efficient active learning pipeline for legal text classification</article-title>. In: <conf-name>Proceedings of the Natural Legal Language Processing Workshop 2022</conf-name>. <publisher-loc>Radnor, PA, USA</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>; <year>2022</year>. p. <fpage>345</fpage>&#x2013;<lpage>58</lpage>.</mixed-citation></ref>
<ref id="ref-20"><label>[20]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Ghosh</surname> <given-names>S</given-names></string-name>, <string-name><surname>Evuru</surname> <given-names>CKR</given-names></string-name>, <string-name><surname>Kumar</surname> <given-names>S</given-names></string-name>, <string-name><surname>Ramaneswaran</surname> <given-names>S</given-names></string-name>, <string-name><surname>Sakshi</surname> <given-names>S</given-names></string-name>, <string-name><surname>Tyagi</surname> <given-names>U</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>DALE: generative data augmentation for low-resource legal NLP</article-title>. In: <conf-name>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</conf-name>. <publisher-loc>Singapore</publisher-loc>: <publisher-name>Springer</publisher-name>; <year>2023</year>. p. <fpage>8511</fpage>&#x2013;<lpage>65</lpage>.</mixed-citation></ref>
<ref id="ref-21"><label>[21]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Kasthuriarachchy</surname> <given-names>B</given-names></string-name>, <string-name><surname>Chetty</surname> <given-names>M</given-names></string-name>, <string-name><surname>Shatte</surname> <given-names>A</given-names></string-name>, <string-name><surname>Walls</surname> <given-names>D</given-names></string-name></person-group>. <article-title>Meaning-sensitive text data augmentation with intelligent masking</article-title>. <source>ACM Trans Intell Syst Technol</source>. <year>2023</year>;<volume>14</volume>(<issue>6</issue>):<fpage>1</fpage>&#x2013;<lpage>20</lpage>. doi:<pub-id pub-id-type="doi">10.1145/3623403</pub-id>.</mixed-citation></ref>
<ref id="ref-22"><label>[22]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Duffy</surname> <given-names>W</given-names></string-name>, <string-name><surname>O&#x2019;Connell</surname> <given-names>E</given-names></string-name>, <string-name><surname>McCarroll</surname> <given-names>N</given-names></string-name>, <string-name><surname>Sloan</surname> <given-names>K</given-names></string-name>, <string-name><surname>Curran</surname> <given-names>K</given-names></string-name>, <string-name><surname>McNamee</surname> <given-names>E</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>Evaluating rule-based and generative data augmentation techniques for legal document classification</article-title>. <source>Knowl Inform Syst</source>. <year>2025</year>;<volume>67</volume>(<issue>9</issue>):<fpage>7825</fpage>&#x2013;<lpage>46</lpage>. doi:<pub-id pub-id-type="doi">10.1007/s10115-025-02454-x</pub-id>.</mixed-citation></ref>
<ref id="ref-23"><label>[23]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Sheik</surname> <given-names>R</given-names></string-name>, <string-name><surname>Siva Sundara</surname> <given-names>K</given-names></string-name>, <string-name><surname>Nirmala</surname> <given-names>SJ</given-names></string-name></person-group>. <article-title>Neural data augmentation for legal overruling task: small deep learning models vs. large language models</article-title>. <source>Neural Process Lett</source>. <year>2024</year>;<volume>56</volume>(<issue>2</issue>):<fpage>121</fpage>. doi:<pub-id pub-id-type="doi">10.1007/s11063-024-11574-4</pub-id>.</mixed-citation></ref>
<ref id="ref-24"><label>[24]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Kim</surname> <given-names>MC</given-names></string-name>, <string-name><surname>Penrod</surname> <given-names>SD</given-names></string-name></person-group>. <article-title>Legal decision making among Korean and American legal professionals and lay people</article-title>. <source>Int J Law Crime Justice</source>. <year>2010</year>;<volume>38</volume>(<issue>4</issue>):<fpage>175</fpage>&#x2013;<lpage>97</lpage>. doi:<pub-id pub-id-type="doi">10.1016/j.ijlcj.2011.01.004</pub-id>.</mixed-citation></ref>
<ref id="ref-25"><label>[25]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Hwang</surname> <given-names>W</given-names></string-name>, <string-name><surname>Lee</surname> <given-names>D</given-names></string-name>, <string-name><surname>Cho</surname> <given-names>K</given-names></string-name>, <string-name><surname>Lee</surname> <given-names>H</given-names></string-name>, <string-name><surname>Seo</surname> <given-names>M</given-names></string-name></person-group>. <article-title>A multi-task benchmark for Korean legal language understanding and judgement prediction</article-title>. <source>Adv Neural Inform Process Syst</source>. <year>2022</year>;<volume>35</volume>:<fpage>32537</fpage>&#x2013;<lpage>51</lpage>.</mixed-citation></ref>
<ref id="ref-26"><label>[26]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Mizrahi</surname> <given-names>D</given-names></string-name>, <string-name><surname>Bachmann</surname> <given-names>R</given-names></string-name>, <string-name><surname>Kar</surname> <given-names>O</given-names></string-name>, <string-name><surname>Yeo</surname> <given-names>T</given-names></string-name>, <string-name><surname>Gao</surname> <given-names>M</given-names></string-name>, <string-name><surname>Dehghan</surname> <given-names>A</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>4m: massively multimodal masked modeling</article-title>. <source>Adv Neural Inform Process Syst</source>. <year>2023</year>;<volume>36</volume>:<fpage>58363</fpage>&#x2013;<lpage>408</lpage>.</mixed-citation></ref>
<ref id="ref-27"><label>[27]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Chawla</surname> <given-names>NV</given-names></string-name>, <string-name><surname>Bowyer</surname> <given-names>KW</given-names></string-name>, <string-name><surname>Hall</surname> <given-names>LO</given-names></string-name>, <string-name><surname>Kegelmeyer</surname> <given-names>WP</given-names></string-name></person-group>. <article-title>SMOTE: synthetic minority over-sampling technique</article-title>. <source>J Artif Intell Res</source>. <year>2002</year>;<volume>16</volume>:<fpage>321</fpage>&#x2013;<lpage>57</lpage>. doi:<pub-id pub-id-type="doi">10.1613/jair.953</pub-id>.</mixed-citation></ref>
</ref-list>
</back></article>