<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1 20151215//EN" "http://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xml:lang="en" article-type="research-article" dtd-version="1.1">
<front>
<journal-meta>
<journal-id journal-id-type="pmc">CMC</journal-id>
<journal-id journal-id-type="nlm-ta">CMC</journal-id>
<journal-id journal-id-type="publisher-id">CMC</journal-id>
<journal-title-group>
<journal-title>Computers, Materials &#x0026; Continua</journal-title>
</journal-title-group>
<issn pub-type="epub">1546-2226</issn>
<issn pub-type="ppub">1546-2218</issn>
<publisher>
<publisher-name>Tech Science Press</publisher-name>
<publisher-loc>USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">73771</article-id>
<article-id pub-id-type="doi">10.32604/cmc.2025.073771</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Article</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>OPOR-Bench: Evaluating Large Language Models on Online Public Opinion Report Generation</article-title>
<alt-title alt-title-type="left-running-head">OPOR-Bench: Evaluating Large Language Models on Online Public Opinion Report Generation</alt-title>
<alt-title alt-title-type="right-running-head">OPOR-Bench: Evaluating Large Language Models on Online Public Opinion Report Generation</alt-title>
</title-group>
<contrib-group>
<contrib id="author-1" contrib-type="author">
<name name-style="western"><surname>Yu</surname><given-names>Jinzheng</given-names></name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
<contrib id="author-2" contrib-type="author">
<name name-style="western"><surname>Xu</surname><given-names>Yang</given-names></name><xref ref-type="aff" rid="aff-2">2</xref></contrib>
<contrib id="author-3" contrib-type="author">
<name name-style="western"><surname>Li</surname><given-names>Haozhen</given-names></name><xref ref-type="aff" rid="aff-2">2</xref></contrib>
<contrib id="author-4" contrib-type="author">
<name name-style="western"><surname>Li</surname><given-names>Junqi</given-names></name><xref ref-type="aff" rid="aff-3">3</xref></contrib>
<contrib id="author-5" contrib-type="author">
<name name-style="western"><surname>Zhu</surname><given-names>Ligu</given-names></name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
<contrib id="author-6" contrib-type="author" corresp="yes">
<name name-style="western"><surname>Shen</surname><given-names>Hao</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><xref rid="cor1" ref-type="corresp">&#x002A;</xref><email>shenhao@cuc.edu.cn</email></contrib>
<contrib id="author-7" contrib-type="author" corresp="yes">
<name name-style="western"><surname>Shi</surname><given-names>Lei</given-names></name><xref ref-type="aff" rid="aff-1">1</xref><xref rid="cor1" ref-type="corresp">&#x002A;</xref><email>leiky_shi@cuc.edu.cn</email></contrib>
<aff id="aff-1"><label>1</label><institution>State Key Laboratory of Media Convergence and Communication, Communication University of China</institution>, <addr-line>Beijing, 100024</addr-line>, <country>China</country></aff>
<aff id="aff-2"><label>2</label><institution>Research Center for Social Computing and Interactive Robotics, Harbin Institute of Technology</institution>, <addr-line>Harbin, 150001</addr-line>, <country>China</country></aff>
<aff id="aff-3"><label>3</label><institution>Scientific and Information Technical Research Institute, China Academy of Railway Sciences Corporation Limited</institution>, <addr-line>Beijing, 100081</addr-line>, <country>China</country></aff>
</contrib-group>
<author-notes>
<corresp id="cor1"><label>&#x002A;</label>Corresponding Authors: Hao Shen. Email: <email>shenhao@cuc.edu.cn</email>; Lei Shi. Email: <email>leiky_shi@cuc.edu.cn</email></corresp>
</author-notes>
<pub-date date-type="collection" publication-format="electronic">
<year>2026</year>
</pub-date>
<pub-date date-type="pub" publication-format="electronic">
<day>10</day><month>2</month><year>2026</year>
</pub-date>
<volume>87</volume>
<issue>1</issue>
<elocation-id>58</elocation-id>
<history>
<date date-type="received">
<day>25</day>
<month>09</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>24</day>
<month>11</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>&#x00A9; 2026 The Authors.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Published by Tech Science Press.</copyright-holder>
<license xlink:href="https://creativecommons.org/licenses/by/4.0/">
<license-p>This work is licensed under a <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="TSP_CMC_73771.pdf"></self-uri>
<abstract>
<p>Online Public Opinion Reports consolidate news and social media for timely crisis management by governments and enterprises. While large language models (LLMs) enable automated report generation, this specific domain lacks formal task definitions and corresponding benchmarks. To bridge this gap, we define the Automated Online Public Opinion Report Generation (OPOR-Gen) task and construct OPOR-Bench, an event-centric dataset with 463 crisis events across 108 countries (comprising 8.8K news articles and 185K tweets). To evaluate report quality, we propose OPOR-Eval, a novel agent-based framework that simulates human expert evaluation. Validation experiments show OPOR-Eval achieves a high Spearman&#x2019;s correlation (&#x03C1; &#x003D; 0.70) with human judgments, though challenges in temporal reasoning persist. This work establishes an initial foundation for advancing automated public opinion reporting research.</p>
</abstract>
<kwd-group kwd-group-type="author">
<kwd>Online public opinion reports</kwd>
<kwd>crisis management</kwd>
<kwd>large language models</kwd>
<kwd>agent-based evaluation</kwd>
</kwd-group>
<funding-group>
<award-group id="awg1">
<funding-source>Fundamental Research Funds for the Central Universities</funding-source>
<award-id>CUC25SG013</award-id>
</award-group>
<award-group id="awg2">
<funding-source>Foundation of Key Laboratory of Education Informatization for Nationalities (Yunnan Normal University)</funding-source>
<funding-source>Ministry of Education</funding-source>
<award-id>EIN2024C006</award-id>
</award-group>
</funding-group>
</article-meta>
</front>
<body>
<sec id="s1">
<label>1</label>
<title>Introduction</title>
<p>Online Public Opinion Reports are critical tools that consolidate news articles and social media posts about crisis events (e.g., earthquakes, floods) into structured reports, enabling governments and enterprises to respond promptly to these rapidly spreading incidents [<xref ref-type="bibr" rid="ref-1">1</xref>].</p>
<p>However, the industry&#x2019;s reliance on manual report generation and evaluation is time-consuming and inefficient, often causing responsible parties to miss optimal response windows and potentially worsening crises. Large language models (LLMs) [<xref ref-type="bibr" rid="ref-2">2</xref>] have made automated report generation technically feasible, yet systematic research and practical deployment remain hindered by two critical gaps: the lack of a formal definition for this complex, multi-source generation task, and the absence of a corresponding benchmark dataset.</p>
<p>While the task of OPOR-Gen involves synthesizing multiple documents, it is fundamentally distinct from traditional Multi-Document Summarization (MDS). Traditional MDS primarily focuses on information consolidation, aiming to summarize homogeneous sources (e.g., news-only) into a single unstructured paragraph. In stark contrast, OPOR-Gen demands a full-cycle analytical product. It requires synthesizing highly heterogeneous sources (i.e., formal news and informal social media) into a structured, multi-section report, and critically, moves beyond mere summarization to require the analysis of diverse public viewpoints (Event Focus) and the generation of actionable recommendations (Event Suggestions). Evaluation frameworks present a further barrier. Traditional metrics like ROUGE are known to be inadequate for long-form, structured content. While recent LLM-as-a-judge methods show promise [<xref ref-type="bibr" rid="ref-3">3</xref>,<xref ref-type="bibr" rid="ref-4">4</xref>], they typically assess holistic quality and are not tailored to the unique, multi-faceted structural demands of OPOR-Gen, such as simultaneously validating timeline accuracy, opinion diversity, and suggestion feasibility.</p>
<p>To address these challenges, we define the Automated Online Public Opinion Report Generation (OPOR-Gen) task, which challenges models to synthesize documents from diverse sources (news and social media) about a single crisis event into a structured report (as shown in <xref ref-type="fig" rid="fig-1">Fig. 1b</xref>). We construct OPOR-Bench, an event-centric dataset with 463 crisis events (2012&#x2013;2025) across 108 countries, comprising 8842 news articles and 185,554 tweets. Each event includes news articles, social media posts, and a structured reference report.</p>
<fig id="fig-1">
<label>Figure 1</label>
<caption>
<title>(<bold>a</bold>) Traditional methods require manual information consolidation from diverse sources (e.g., news, social media) and labor-intensive report writing and evaluation. In contrast, our (<bold>b</bold>) Automated approach generates and evaluates reports automatically, significantly accelerating the feedback loop</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73771-fig-1.tif"/>
</fig>
<p>Furthermore, recognizing that evaluating these complex reports is another major challenge, we develop OPOR-Eval, a novel agent-based framework that simulates human expert judgment through multi-dimensional analysis: factual accuracy (event details verification), opinion mining (sentiment and stance extraction), and solution reasoning (recommendation quality). OPOR-Eval decomposes the evaluation into section-specific assessments, applying tailored criteria for each dimension on a 5-point Likert scale and achieving &#x03C1; &#x003D; 0.70 correlation with human judgments. This automated approach can replace labor-intensive manual evaluation, significantly accelerating the feedback loop for crisis response.</p>
<p>This work makes the following key contributions:
<list list-type="bullet">
<list-item>
<p><bold>A New Task and the First Supporting Benchmark.</bold> We define a new task, Automated Online Public Opinion Report Generation (OPOR-Gen), and introduce OPOR-Bench, the first event-centric benchmark designed to support it, along with a dedicated annotation tool for quality assurance.</p></list-item>
<list-item>
<p><bold>An Innovative and Reliable Evaluation Framework.</bold> We propose OPOR-Eval, an agent-based framework for evaluating long-form, structured reports, addressing the limitations of traditional metrics.</p></list-item>
<list-item>
<p><bold>Comprehensive Baselines and In-depth Analysis.</bold> We establish strong baselines using frontier models and conduct in-depth analysis of both generation and evaluation. Our findings reveal universal challenges in temporal reasoning and systematic evaluation biases, providing concrete directions for future research.</p></list-item>
</list></p>
</sec>
<sec id="s2">
<label>2</label>
<title>Preliminaries: The Structure of an Online Public Opinion Report</title>
<p>In this section, we break down the structure of an online public opinion report, detailing the five key components that our study aims to automatically generate.</p>
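<p>For concreteness, the five components detailed below can be viewed as one nested structure. The following minimal Python sketch illustrates this organization; the field names are chosen for exposition and are not the benchmark&#x2019;s exact keys.</p>
<code language="python"># Illustrative schema of an Online Public Opinion Report (field names assumed).
REPORT_SCHEMA = {
    "event_title": str,        # Section 2.1: crisis name, type, and time
    "event_summary": {         # Section 2.2: 5W1H plus Impact
        "what": str, "where": str, "when": str,
        "why_how": str, "who": str, "impact": str,
    },
    "event_timeline": {        # Section 2.3: three lifecycle phases
        "incubation": {"dates": str, "sub_events": list},
        "peak": {"dates": str, "sub_events": list},
        "decline": {"dates": str, "sub_events": list},
    },
    "event_focus": {           # Section 2.4: per-group opinion analysis
        group: {"core_topics": list, "sentiment_stance": str, "key_viewpoints": list}
        for group in ("netizens", "authoritative_institutions")
    },
    "event_suggestions": list,  # Section 2.5: actionable recommendations
}</code>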
<sec id="s2_1">
<label>2.1</label>
<title>Event Title</title>
<p>The Event Title serves as a concise identifier that allows readers to immediately grasp the event&#x2019;s essence. A well-formed title conveys the crisis name, type, and time, facilitating efficient storage and retrieval.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Event Summary</title>
<p>The Event Summary offers a condensed overview of the crisis for rapid comprehension. Inspired by the classic 5W1H framework [<xref ref-type="bibr" rid="ref-5">5</xref>], it covers the Crisis Name (What), Location (Where), Time (When), Cause (Why/How), and Involved Parties (Who). Crucially, we extend this framework with an Impact component, which highlights the event&#x2019;s consequences to underscore its severity and prompt responsible parties to take action.</p>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Event Timeline</title>
<p>By analyzing the public opinion lifecycle, the Event Timeline provides stage-specific guidance, thereby shifting crisis management from reactive to proactive [<xref ref-type="bibr" rid="ref-6">6</xref>]. While established theories often divide the lifecycle into four phases (Incubation, Outbreak, Diffusion, Decline), the sudden and fast-spreading nature of crises typically merges the Outbreak and Diffusion stages into a single, intense Peak Period, leading us to adopt a three-phase timeline: <bold>Incubation</bold>, <bold>Peak</bold>, and <bold>Decline</bold>.</p>
<p><bold>The Incubation Period</bold> features discussion limited to directly affected stakeholders, making detection difficult without specialized monitoring. Despite low activity, these topics possess significant eruption potential&#x2014;a controversial sub-event can rapidly trigger widespread attention. This period thus serves as a critical early-warning phase for anticipating crisis development.</p>
<p><bold>The Peak Period</bold> exhibits exponential growth in attention, participation, volume, and velocity [<xref ref-type="bibr" rid="ref-7">7</xref>]. This expansion broadens scope and deepens complexity through derivative sub-events like official announcements and public controversies. The phase is a critical window for shaping public opinion, as it forges the public&#x2019;s long-term perception of the event [<xref ref-type="bibr" rid="ref-6">6</xref>].</p>
<p><bold>The Decline Period</bold> shows diminishing public interest as focus shifts to newer events [<xref ref-type="bibr" rid="ref-8">8</xref>]. Discussion reverts to directly affected stakeholders, where unresolved issues persist and can reactivate under specific conditions. Thus, rather than a final resolution, this period marks a decline in widespread attention that leaves a lasting reputational impact [<xref ref-type="bibr" rid="ref-9">9</xref>].</p>
</sec>
<sec id="s2_4">
<label>2.4</label>
<title>Event Focus</title>
<p>During the Peak Period, an event triggers exponential growth in online discussions, leading to topic polarization, rumor surges, and emotional instability [<xref ref-type="bibr" rid="ref-6">6</xref>,<xref ref-type="bibr" rid="ref-8">8</xref>]. Emotional contagion drives this growth, with anger fueling sharing and anxiety driving information-seeking [<xref ref-type="bibr" rid="ref-7">7</xref>]. This volatile mix necessitates multi-perspective analysis beyond event timelines, making group-specific analysis essential for prediction and intervention [<xref ref-type="bibr" rid="ref-10">10</xref>]. The Event Focus deconstructs public opinion by analyzing two participant groups&#x2014;Netizens and Authoritative Institutions. For each group, the analysis extracts three key insights: (1) Core Topics to reveal their primary concerns; (2) Sentiment Stance to gauge their overall emotional orientation; and (3) Key Viewpoints to highlight their core arguments and stances.</p>
</sec>
<sec id="s2_5">
<label>2.5</label>
<title>Event Suggestions</title>
<p>The Event Suggestions section converts complex event data&#x2014;foundational context (Summary), thematic evolution (Timeline), and divergent viewpoints (Focus)&#x2014;into actionable recommendations, such as targeted communication strategies or policy adjustments [<xref ref-type="bibr" rid="ref-11">11</xref>]. This translation from analysis to action is critical for accelerating the official response, thereby mitigating the public anxiety and mistrust that stem from delays.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Task Definition and Dataset</title>
<sec id="s3_1">
<label>3.1</label>
<title>Task Definition</title>
<p>The OPOR-Gen task aims to generate a structured, multi-section report <inline-formula id="ieqn-1"><mml:math id="mml-ieqn-1"><mml:msub><mml:mi>R</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> for a given public opinion event <inline-formula id="ieqn-2"><mml:math id="mml-ieqn-2"><mml:msub><mml:mi>e</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula>. The input for each event consists of a set of news articles <inline-formula id="ieqn-3"><mml:math id="mml-ieqn-3"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo fence="false" stretchy="false">{</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>M</mml:mi></mml:mrow></mml:msub><mml:mo fence="false" stretchy="false">}</mml:mo></mml:math></inline-formula> and a set of social media posts <inline-formula id="ieqn-4"><mml:math id="mml-ieqn-4"><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo fence="false" stretchy="false">{</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>K</mml:mi></mml:mrow></mml:msub><mml:mo fence="false" stretchy="false">}</mml:mo></mml:math></inline-formula>. Formally, the task is defined as:
<disp-formula id="eqn-1"><label>(1)</label><mml:math id="mml-eqn-1" display="block"><mml:msub><mml:mi>R</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mtext>LLM</mml:mtext></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mrow><mml:mtext>gen</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula>where <inline-formula id="ieqn-5"><mml:math id="mml-ieqn-5"><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mrow><mml:mtext>gen</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula> represents the generation prompt.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>OPOR-Bench</title>
<p>While several crisis-related datasets exist [<xref ref-type="bibr" rid="ref-12">12</xref>&#x2013;<xref ref-type="bibr" rid="ref-16">16</xref>], they prove inadequate for the OPOR-Gen task due to two critical shortcomings. First, they contain only social media posts, overlooking the formal perspective provided by news articles. Second, their effective scale is limited; after thorough quality validation, we find only 95 unique events suitable for our purposes, which is insufficient for a reliable benchmark.</p>
<sec id="s3_2_1">
<label>3.2.1</label>
<title>Event-Centric Corpus Construction</title>
<p>As shown in <xref ref-type="fig" rid="fig-2">Fig. 2a</xref>, we begin by collecting crisis events from authoritative sources and then gather documents.</p>
<fig id="fig-2">
<label>Figure 2</label>
<caption>
<title>Overview of OPOR-Bench construction pipeline. (<bold>a</bold>) Event-Centric Corpus Construction: Starting from authoritative databases (EM-DAT) and curated lists (Wikipedia), we identify crisis events and collect corresponding multi-source documents&#x2014;news articles from Wikipedia references and social media posts from X/Twitter API. (<bold>b</bold>) Dataset Annotation: Three-layered annotation process transforms raw documents into structured data. Human experts annotate timeline phases, while our LLM framework handles factual attribute extraction and social media author classification, ultimately producing a comprehensive reference for each event</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73771-fig-2.tif"/>
</fig>
<p>Crisis Event Collection</p>
<p>The first stage of corpus construction is to identify a large and diverse set of crisis events that have received sufficient media coverage and generated widespread social media discussion. We gather crisis events (2018&#x2013;2025) from two sources: the EM-DAT international disaster database [<xref ref-type="bibr" rid="ref-17">17</xref>] for standardized records, and Wikipedia&#x2019;s curated disaster lists for events with high public interest. After deduplication against 95 seed events from prior datasets, we obtain 368 new events.
<list list-type="bullet">
<list-item>
<p><bold>EM-DAT International Disaster Database:</bold> Maintained by the Center for Research on the Epidemiology of Disasters (CRED), providing standardized records with verified impact metrics.</p></list-item>
<list-item>
<p><bold>Wikipedia&#x2019;s Curated Disaster Lists:</bold> These lists are community-vetted and often link to well-referenced articles, making them a reliable source. Specific lists we utilize include, but are not limited to, earthquakes<xref ref-type="fn" rid="fn-1"><sup>1</sup></xref><fn id="fn-1"><label>1</label><p><ext-link ext-link-type="uri" xlink:href="https://en.wikipedia.org/wiki/Lists_of_earthquakes">https://en.wikipedia.org/wiki/Lists_of_earthquakes</ext-link> (accessed on 01 September 2025).</p></fn>, floods<xref ref-type="fn" rid="fn-2"><sup>2</sup></xref><fn id="fn-2"><label>2</label><p><ext-link ext-link-type="uri" xlink:href="https://en.wikipedia.org/wiki/List_of_floods_in_Europe">https://en.wikipedia.org/wiki/List_of_floods_in_Europe</ext-link> (accessed on 01 September 2025).</p></fn>, and wildfires<xref ref-type="fn" rid="fn-3"><sup>3</sup></xref><fn id="fn-3"><label>3</label><p><ext-link ext-link-type="uri" xlink:href="https://en.wikipedia.org/wiki/List_of_wildfires">https://en.wikipedia.org/wiki/List_of_wildfires</ext-link> (accessed on 01 September 2025).</p></fn>.</p></list-item>
</list></p>
<p>Document Collection</p>
<p>For each event, we collect multi-source documents from two parallel streams.
<list list-type="bullet">
<list-item>
<p><bold>News Articles:</bold> To ensure both quality and relevance, we employ a two-phase collection strategy. We begin by crawling articles from the vetted &#x201C;References&#x201D; section of an event&#x2019;s official Wikipedia page (e.g., the page for the &#x201C;2025 Table Mountain fire&#x201D;<xref ref-type="fn" rid="fn-4"><sup>4</sup></xref><fn id="fn-4"><label>4</label><p><ext-link ext-link-type="uri" xlink:href="https://en.wikipedia.org/wiki/2025_Table_Mountain_fire">https://en.wikipedia.org/wiki/2025_Table_Mountain_fire</ext-link> (accessed on 01 September 2025).</p></fn>). After this initial crawl, we refine the collection by using a BM25 retriever [<xref ref-type="bibr" rid="ref-18">18</xref>] to rerank articles based on their relevance to event-specific keywords (a minimal sketch of this reranking step follows the list).</p></list-item>
<list-item>
<p><bold>Social Media Posts:</bold> Simultaneously, we acquire social media posts via the official X (Twitter) API<xref ref-type="fn" rid="fn-5"><sup>5</sup></xref><fn id="fn-5"><label>5</label><p><ext-link ext-link-type="uri" xlink:href="https://developer.x.com/en/docs/x-api">https://developer.x.com/en/docs/x-api</ext-link> (accessed on 01 September 2025).</p></fn>, targeting a window from one week before to one month after each event.</p></list-item>
</list></p>
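<p>The sketch below illustrates the BM25 reranking step using the <italic>rank_bm25</italic> package; whitespace tokenization and the <italic>top_k</italic> cutoff are simplifying assumptions.</p>
<code language="python"># Sketch of the BM25 reranking step (rank_bm25 package; tokenization simplified).
from rank_bm25 import BM25Okapi

def rerank_articles(articles, event_keywords, top_k=20):
    """Score crawled articles against event-specific keywords; keep the top_k."""
    tokenized_corpus = [doc.lower().split() for doc in articles]
    bm25 = BM25Okapi(tokenized_corpus)
    scores = bm25.get_scores(event_keywords.lower().split())
    ranked = sorted(zip(scores, articles), key=lambda pair: pair[0], reverse=True)
    return [doc for _, doc in ranked[:top_k]]</code>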
</sec>
<sec id="s3_2_2">
<label>3.2.2</label>
<title>Annotation Pipeline and Quality Assurance</title>
<p>To fulfill the multifaceted requirements of the OPOR-Gen task, we perform three distinct annotation tasks (as shown in <xref ref-type="fig" rid="fig-2">Fig. 2b</xref>): (1) Reference Annotation, to extract key factual attributes about each event; (2) Social Media Annotation, to classify the author type of each post; and (3) Timeline Annotation, to pinpoint the start and end dates of each key phase in the public opinion lifecycle. The first two tasks are automated via our protocol-guided LLM framework, while the third, more complex task is handled by human experts to ensure the highest quality.</p>
<p>LLM-Based Annotation Framework</p>
<p>To address the prohibitive expense and time of manual annotation, we develop a protocol-guided LLM framework with three key components: (1) clear label definitions, (2) detailed annotation criteria, and (3) diverse few-shot examples.</p>
<p>We evaluate a set of frontier LLMs on the Social Media Annotation task. We use the pre-labeled data from the CrisisLexT26 dataset [<xref ref-type="bibr" rid="ref-12">12</xref>] as the ground truth for this evaluation. Each model is prompted to classify the author type of tweets from the dataset. As shown in <xref ref-type="table" rid="table-1">Table 1</xref>, GPT-4o-mini demonstrates the optimal balance between annotation quality and cost, so we select it as the primary model for our protocol-guided annotation tasks.</p>
<table-wrap id="table-1">
<label>Table 1</label>
<caption>
<title>Performance comparison of different models on the Social Media Annotation. The best result is highlighted in bold</title>
</caption>
<table>
<colgroup>
<col align="center" width="48mm"/>
<col align="center" width="52mm"/> </colgroup>
<thead>
<tr>
<th>Model</th>
<th>Macro-F1</th>
</tr>
</thead>
<tbody>
<tr>
<td>Gemini 2.5 Pro</td>
<td><bold>0.8053</bold></td>
</tr>
<tr>
<td>DeepSeek-V3</td>
<td>0.8012</td>
</tr>
<tr>
<td>GPT-4o</td>
<td>0.8009</td>
</tr>
<tr>
<td>DeepSeek-R1</td>
<td>0.7934</td>
</tr>
<tr>
<td>GPT-4o-mini</td>
<td>0.7800</td>
</tr>
<tr>
<td>Grok-3 Reasoner</td>
<td>0.6235</td>
</tr>
<tr>
<td>Claude 3.7</td>
<td>0.6127</td>
</tr>
</tbody>
</table>
</table-wrap>
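<p>The comparison in <xref ref-type="table" rid="table-1">Table 1</xref> reduces to a standard Macro-F1 computation against the CrisisLexT26 gold labels; a hedged sketch follows, where <italic>llm_annotate</italic> is a placeholder for the protocol-guided annotation call.</p>
<code language="python"># Sketch of the Table 1 evaluation; llm_annotate is a hypothetical wrapper
# around the protocol-guided prompt (label definitions, criteria, few-shot examples).
from sklearn.metrics import f1_score

def evaluate_annotator(tweets, gold_labels, llm_annotate):
    """Macro-F1 of predicted author types vs. CrisisLexT26 ground truth."""
    predictions = [llm_annotate(tweet) for tweet in tweets]
    return f1_score(gold_labels, predictions, average="macro")</code>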
<p>Reference and Social Media Annotation</p>
<p>In the Reference Annotation task, our framework distills key factual attributes (e.g., location, time, cause) from Wikipedia pages to produce <inline-formula id="ieqn-6"><mml:math id="mml-ieqn-6"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, the structured metadata used for factual evaluation. The subsequent Social Media Annotation task classifies tweet authors into <italic>Netizens</italic> or <italic>Authoritative Institutions</italic>, providing a crucial prerequisite for the Event Focus section.</p>
<p>Human Timeline Annotation</p>
<p>Given the intricate nature of identifying public opinion phases, the timeline annotation is performed entirely by human experts to ensure high quality and reliability. Six in-house experts, all Master&#x2019;s or PhD students familiar with public opinion lifecycles, conduct the annotation using a dedicated tool<xref ref-type="fn" rid="fn-6"><sup>6</sup></xref><fn id="fn-6"><label>6</label><p>Our dedicated annotation tool significantly streamlines the process by addressing two primary challenges: (1) providing a unified interface where annotators can view all documents and visualizations simultaneously, and (2) automatically enforcing standardized JSON output to ensure data consistency and eliminate manual formatting errors. Internal tests show this tool reduces annotation time per event by over 50%.</p></fn> and following a rigorous, multi-stage protocol. The process proceeds as follows:
<list list-type="bullet">
<list-item>
<p>The 463 events are evenly divided among three groups, with two annotators per group.</p></list-item>
<list-item>
<p>Both annotators within each group work independently on the same set of assigned events.</p></list-item>
<list-item>
<p>The two partners then compare and consolidate their results, and are required to discuss every disagreement until a consensus is reached.</p></list-item>
<list-item>
<p>For the rare cases where a consensus cannot be reached, a senior researcher performs a final adjudication. If ambiguity persists, the event is discarded to ensure data integrity.</p></list-item>
</list></p>
</sec>
<sec id="s3_2_3">
<label>3.2.3</label>
<title>Dataset Statistics and Analysis</title>
<p>Volume and Length Distribution</p>
<p>As shown in <xref ref-type="table" rid="table-2">Table 2</xref>, the OPOR-Bench provides comprehensive multi-source coverage for each crisis event. The substantial token count per event (averaging 32K&#x002B;) highlights the significant information distillation challenge inherent in the OPOR-Gen task.</p>
<table-wrap id="table-2">
<label>Table 2</label>
<caption>
<title>Statistics of the OPOR-Bench dataset</title>
</caption>
<table>
<colgroup>
<col align="center" width="27mm"/>
<col align="center" width="19mm"/>
<col align="center" width="18mm"/>
<col align="center" width="18mm"/>
<col align="center" width="18mm"/> </colgroup>
<thead>
<tr>
<th></th>
<th>Events</th>
<th>News</th>
<th>Tweets</th>
<th>Ref.</th>
</tr>
</thead>
<tbody>
<tr>
<td>Total Num</td>
<td>463</td>
<td>8842</td>
<td>185,554</td>
<td>463</td>
</tr>
<tr>
<td>Avg. Num/Event</td>
<td>&#x2013;</td>
<td>19.1</td>
<td>400.8</td>
<td>1.0</td>
</tr>
<tr>
<td>Avg. Token Len.&#x002A;</td>
<td>32,531</td>
<td>787.2</td>
<td>42.5</td>
<td>471.0</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="table-2fn1" fn-type="other">
<p>Note: &#x002A;Token lengths measured using cl100k_base tokenizer from tiktoken.</p>
</fn>
</table-wrap-foot>
</table-wrap>
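<p>As the footnote of <xref ref-type="table" rid="table-2">Table 2</xref> notes, token lengths are measured with the cl100k_base encoding from <italic>tiktoken</italic>; the per-event statistics can be reproduced along the lines of the sketch below.</p>
<code language="python"># Token counting as described in the Table 2 footnote (tiktoken, cl100k_base).
import tiktoken

ENCODING = tiktoken.get_encoding("cl100k_base")

def token_length(text):
    return len(ENCODING.encode(text))

def avg_tokens_per_event(events):
    """events: list of document lists (news, tweets, reference) for each event."""
    totals = [sum(token_length(doc) for doc in docs) for docs in events]
    return sum(totals) / len(totals)</code>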
<p>Geographic Distribution</p>
<p><xref ref-type="table" rid="table-3">Table 3</xref> shows our dataset spans 108 countries across six continents, ensuring geographic diversity and mitigating potential regional bias.</p>
<table-wrap id="table-3">
<label>Table 3</label>
<caption>
<title>Geographic distribution of the OPOR-Bench dataset across the six populated continents</title>
</caption>
<table>
<colgroup>
<col align="center" width="34mm"/>
<col align="center" width="29mm"/>
<col align="center" width="38mm"/> </colgroup>
<thead>
<tr>
<th>Continent</th>
<th>#Events</th>
<th>Percentage</th>
</tr>
</thead>
<tbody>
<tr>
<td>Asia</td>
<td>185</td>
<td>40.0%</td>
</tr>
<tr>
<td>North America</td>
<td>136</td>
<td>29.4%</td>
</tr>
<tr>
<td>Europe</td>
<td>58</td>
<td>12.5%</td>
</tr>
<tr>
<td>Africa</td>
<td>48</td>
<td>10.4%</td>
</tr>
<tr>
<td>Oceania</td>
<td>21</td>
<td>4.5%</td>
</tr>
<tr>
<td>South America</td>
<td>15</td>
<td>3.2%</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Type and Category Distribution</p>
<p><xref ref-type="fig" rid="fig-3">Fig. 3</xref> shows our dataset&#x2019;s balanced coverage of natural disasters (44.9%) and human-caused crises (55.1%). <xref ref-type="table" rid="table-4">Table 4</xref> reveals distinct media coverage patterns across event types&#x2014;&#x201C;Political&#x201D; events generate more news coverage while &#x201C;Wildfires&#x201D; trigger higher social media activity&#x2014;underscoring the necessity of multi-source integration.</p>
<fig id="fig-3">
<label>Figure 3</label>
<caption>
<title>The distribution of crisis event types in our dataset. The <bold>inner</bold> ring shows the top-level classification into Natural Disasters (44.9%) and Human-caused Crises (55.1%). The <bold>outer</bold> ring displays the breakdown into more specific sub-categories</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73771-fig-3.tif"/>
</fig><table-wrap id="table-4">
<label>Table 4</label>
<caption>
<title>Detailed statistics for each event sub-type, including the number of events and the average number of associated news articles and social media posts</title>
</caption>
<table>
<colgroup>
<col align="center" width="48mm"/>
<col align="center" width="18mm"/>
<col align="center" width="15mm"/>
<col align="center" width="19mm"/> </colgroup>
<thead>
<tr>
<th>Event sub-type</th>
<th>#Events</th>
<th>#News</th>
<th>#Tweets</th>
</tr>
</thead>
<tbody>
<tr>
<td colspan="4"><italic>Natural disasters</italic></td>
</tr>
<tr>
<td>Cyclones</td>
<td>120</td>
<td>21.3</td>
<td>400.9</td>
</tr>
<tr>
<td>Floods</td>
<td>26</td>
<td>14.0</td>
<td>551.3</td>
</tr>
<tr>
<td>Wildfires</td>
<td>21</td>
<td>16.9</td>
<td>768.2</td>
</tr>
<tr>
<td>Earthquakes</td>
<td>17</td>
<td>20.7</td>
<td>761.2</td>
</tr>
<tr>
<td>Volcanic</td>
<td>9</td>
<td>24.2</td>
<td>210.3</td>
</tr>
<tr>
<td>Others (Natural)</td>
<td>15</td>
<td>25.2</td>
<td>854.5</td>
</tr>
<tr>
<td colspan="4"><italic>Human-caused crises</italic></td>
</tr>
<tr>
<td>Traffic</td>
<td>87</td>
<td>16.5</td>
<td>274.8</td>
</tr>
<tr>
<td>Industrial</td>
<td>76</td>
<td>15.2</td>
<td>341.5</td>
</tr>
<tr>
<td>Health</td>
<td>26</td>
<td>22.0</td>
<td>455.0</td>
</tr>
<tr>
<td>Violence</td>
<td>26</td>
<td>22.5</td>
<td>268.3</td>
</tr>
<tr>
<td>Political</td>
<td>22</td>
<td>24.6</td>
<td>114.0</td>
</tr>
<tr>
<td>Others (Human)</td>
<td>18</td>
<td>17.5</td>
<td>442.3</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>OPOR-Eval</title>
<sec id="s4_1">
<label>4.1</label>
<title>Overview</title>
<p>In our framework, the evaluator role (shown in <xref ref-type="fig" rid="fig-1">Fig. 1b</xref>) is realized through an agent-based architecture that manages three specialized tools. Given a generated report <inline-formula id="ieqn-7"><mml:math id="mml-ieqn-7"><mml:msub><mml:mi>R</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, its associated social media posts <inline-formula id="ieqn-8"><mml:math id="mml-ieqn-8"><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, and reference <inline-formula id="ieqn-9"><mml:math id="mml-ieqn-9"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, the evaluation task produces a 15-dimensional score vector <inline-formula id="ieqn-10"><mml:math id="mml-ieqn-10"><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>15</mml:mn></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula>, where each score <inline-formula id="ieqn-11"><mml:math id="mml-ieqn-11"><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is rated on a 5-point Likert scale:
<disp-formula id="eqn-2"><label>(2)</label><mml:math id="mml-eqn-2" display="block"><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mtext>LLM</mml:mtext></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mrow><mml:mtext>eval</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>R</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
<p><inline-formula id="ieqn-12"><mml:math id="mml-ieqn-12"><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mrow><mml:mtext>eval</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula> is the evaluation prompt containing detailed scoring criteria.</p>
<p>Our 15 dimensions are established through iterative expert consensus. As shown in <xref ref-type="fig" rid="fig-4">Fig. 4</xref>, the OPOR-Eval agent (<xref ref-type="sec" rid="s4_2_1">Section 4.2.1</xref>) employs three specialized tools: (1) <bold>Fact-Checker</bold> (<xref ref-type="sec" rid="s4_2_2">Section 4.2.2</xref>) for verifying accuracy against references, (2) <bold>Opinion-Miner</bold> (<xref ref-type="sec" rid="s4_2_3">Section 4.2.3</xref>) for evaluating public opinion coverage, and (3) <bold>Solution-Counselor</bold> (<xref ref-type="sec" rid="s4_2_4">Section 4.2.4</xref>) for evaluating recommendations. This transforms evaluation from black-box judgment into a traceable analytical process. See <xref ref-type="app" rid="app-1">Appendix A</xref> for the Event Title scoring guidelines.</p>
<fig id="fig-4">
<label>Figure 4</label>
<caption>
<title>The OPOR-Eval architecture: An evaluation agent manages three specialized tools (Fact-Checker, Opinion-Miner, Solution-Counselor) through structured task assignment</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73771-fig-4.tif"/>
</fig>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Architecture</title>
<sec id="s4_2_1">
<label>4.2.1</label>
<title>OPOR-Eval Agent</title>
<p>The OPOR-Eval Agent manages the evaluation process by analyzing report components, selecting appropriate tools, and producing the comprehensive score vector. Following a reasoning-acting protocol, the agent explicitly externalizes reasoning before each action (see <xref ref-type="app" rid="app-2">Appendix B</xref> for implementation details). This process is formalized as:
<disp-formula id="eqn-3"><label>(3)</label><mml:math id="mml-eqn-3" display="block"><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mtext>Agent</mml:mtext></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>R</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mrow><mml:mtext>fact</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x2295;</mml:mo><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mrow><mml:mtext>opin</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>&#x2295;</mml:mo><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mrow><mml:mtext>sol</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:math></disp-formula>where <inline-formula id="ieqn-13"><mml:math id="mml-ieqn-13"><mml:mo>&#x2295;</mml:mo></mml:math></inline-formula> denotes vector concatenation and the components are produced by each specialized tool.</p>
</sec>
<sec id="s4_2_2">
<label>4.2.2</label>
<title>Fact-Checker Tool</title>
<p>The Fact-Checker Tool (<inline-formula id="ieqn-14"><mml:math id="mml-ieqn-14"><mml:msub><mml:mi>T</mml:mi><mml:mrow><mml:mrow><mml:mtext>fact</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula>) verifies the factual accuracy of the report&#x2019;s <italic>Title</italic>, <italic>Summary</italic>, and the <italic>Date Accuracy</italic> of the Event Timeline by comparing them against the reference data (<inline-formula id="ieqn-15"><mml:math id="mml-ieqn-15"><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>). This tool evaluates the model&#x2019;s Factual Consistency&#x2014;its ability to stay faithful to the provided source material.
<disp-formula id="eqn-4"><label>(4)</label><mml:math id="mml-eqn-4" display="block"><mml:msub><mml:mi>T</mml:mi><mml:mrow><mml:mrow><mml:mtext>fact</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>R</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mrow><mml:mtext>title</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>R</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mrow><mml:mtext>summary</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>R</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mrow><mml:mtext>timeline</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>Z</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mrow><mml:mtext>fact</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>7</mml:mn></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
</sec>
<sec id="s4_2_3">
<label>4.2.3</label>
<title>Opinion-Miner Tool</title>
<p>The Opinion-Miner Tool (<inline-formula id="ieqn-16"><mml:math id="mml-ieqn-16"><mml:msub><mml:mi>T</mml:mi><mml:mrow><mml:mrow><mml:mtext>opin</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula>) evaluates the <italic>Sub-Events</italic> of the <italic>Event Timeline</italic> and the entire <italic>Event Focus</italic> section by comparing them against the source social media posts (<inline-formula id="ieqn-17"><mml:math id="mml-ieqn-17"><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>). It measures the model&#x2019;s Multi-source Synthesis&#x2014;extracting key insights from large volumes of noisy, unstructured text.
<disp-formula id="eqn-5"><label>(5)</label><mml:math id="mml-eqn-5" display="block"><mml:msub><mml:mi>T</mml:mi><mml:mrow><mml:mrow><mml:mtext>opin</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>R</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mrow><mml:mtext>timeline</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>R</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mrow><mml:mtext>focus</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mrow><mml:mtext>opin</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>8</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>9</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>10</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>11</mml:mn></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
</sec>
<sec id="s4_2_4">
<label>4.2.4</label>
<title>Solution-Counselor Tool</title>
<p>The Solution-Counselor Tool (<inline-formula id="ieqn-18"><mml:math id="mml-ieqn-18"><mml:msub><mml:mi>T</mml:mi><mml:mrow><mml:mrow><mml:mtext>sol</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula>) leverages the LLM&#x2019;s internal knowledge and reasoning to evaluate the report&#x2019;s <italic>Event Suggestions</italic> based on criteria such as their feasibility, relevance, and innovation. This directly tests the model&#x2019;s Practical Reasoning&#x2014;generating novel, actionable solutions from parametric knowledge.
<disp-formula id="eqn-6"><label>(6)</label><mml:math id="mml-eqn-6" display="block"><mml:msub><mml:mi>T</mml:mi><mml:mrow><mml:mrow><mml:mtext>sol</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>R</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mrow><mml:mtext>suggestions</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mrow><mml:mtext>sol</mml:mtext></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>12</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>13</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>14</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>15</mml:mn></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
</sec>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Experiments</title>
<sec id="s5_1">
<label>5.1</label>
<title>Experimental Setup</title>
<sec id="s5_1_1">
<label>5.1.1</label>
<title>Models and Strategies</title>
<p>We evaluate five frontier LLMs with 128K&#x002B; context windows: GPT-4o, DeepSeek-R1, DeepSeek-V3, Gemini 2.5 Pro, and Llama-3.3-70B. For each model, we employ two distinct generation strategies (<xref ref-type="fig" rid="fig-5">Fig. 5</xref>): modular generation, which creates sections independently and then assembles them [<xref ref-type="bibr" rid="ref-19">19</xref>], and end-to-end generation, which produces complete reports in a single pass. See <xref ref-type="app" rid="app-3">Appendix C</xref> for prompt templates.</p>
<fig id="fig-5">
<label>Figure 5</label>
<caption>
<title>Comparison of two OPOR-Gen strategies. (<bold>Left</bold>): Modular generation decomposes the task into five sequential subtasks (title, summary, timeline, focus, and suggestions), with each component generated independently. (<bold>Right):</bold> End-to-end generation produces all five report components simultaneously in a single pass, maintaining global coherence throughout the document</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73771-fig-5.tif"/>
</fig>
</sec>
<sec id="s5_1_2">
<label>5.1.2</label>
<title>Implementation Details</title>
<p>We use a temperature of 0.7 for generation and 0.3 for evaluation. All other hyperparameters follow model defaults. Complete configuration details are provided in our dataset repository.</p>
</sec>
<sec id="s5_1_3">
<label>5.1.3</label>
<title>Evaluation Setup</title>
<p>We implement OPOR-Eval with GPT-4o and DeepSeek-V3 to evaluate the generated reports across 15 dimensions following identical protocols. Additionally, we conduct human evaluation on a subset of these reports to validate the overall effectiveness.</p>
</sec>
</sec>
<sec id="s5_2">
<label>5.2</label>
<title>Evaluation Framework Validation</title>
<sec id="s5_2_1">
<label>5.2.1</label>
<title>Human Evaluation Protocol</title>
<p>To validate OPOR-Eval, we conduct a two-phase human evaluation with three experts who also design the scoring criteria. Using a dedicated annotation tool ensuring blind evaluation, experts independently score each report. The protocol includes calibration and formal evaluation phases:
<list list-type="bullet">
<list-item>
<p>The calibration phase utilizes a preliminary set of 50 reports (5 events by our five baseline models under both generation strategies) to ensure all three experts share a consistent understanding of the criteria. In this phase, the experts independently rate the reports, and their agreement is measured using the Intraclass Correlation Coefficient (ICC); a minimal computation sketch follows this list.</p></list-item>
<list-item>
<p>The formal evaluation phase uses a distinct and larger corpus of 500 reports (from 50 events). The three calibrated experts then independently score this entire corpus, yielding three complete sets of ratings for our human-agent agreement analysis.</p></list-item>
</list></p>
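<p>Assuming ratings are collected in long format (one row per report, expert, and score), the calibration agreement can be computed with the <italic>pingouin</italic> package as sketched below; the column names are illustrative, and the ICC3 variant (fixed raters) corresponds to the values reported in <xref ref-type="table" rid="table-5">Table 5</xref>.</p>
<code language="python"># Sketch of the calibration-phase agreement check (pingouin; column names assumed).
import pandas as pd
import pingouin as pg

def calibration_icc(ratings):
    """ratings: iterable of (report_id, expert_id, score) for one dimension."""
    df = pd.DataFrame(ratings, columns=["report", "expert", "score"])
    icc = pg.intraclass_corr(data=df, targets="report",
                             raters="expert", ratings="score")
    return icc.loc[icc["Type"] == "ICC3", "ICC"].item()</code>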
</sec>
<sec id="s5_2_2">
<label>5.2.2</label>
<title>Agreement Analysis and Results</title>
<p>In this section, we analyze the results of our human evaluation to answer two key questions: (1) How reliable are our human experts? and (2) How strong is the agreement between our human experts and the OPOR-Eval agents?</p>
<p><bold><italic>Answer 1: Our human experts demonstrate high inter-rater reliability</italic>.</bold></p>
<p><xref ref-type="table" rid="table-5">Table 5</xref> shows high inter-rater reliability among human experts. Based on the classification where Intraclass Correlation Coefficient (ICC) values are categorized as poor (&#x003C;0.50), moderate (0.50&#x2013;0.75), good (0.75&#x2013;0.90), and excellent (&#x003E;0.90), most dimensions achieve good to excellent agreement with ICC &#x003E; 0.75. The relatively lower agreement on Event Suggestions (ICC &#x003D; 0.64, moderate) reflects the inherent subjectivity in evaluating recommendation quality.</p>
<table-wrap id="table-5">
<label>Table 5</label>
<caption>
<title>The ICC scores confirm high inter-rater reliability among human experts</title>
</caption>
<table>
<colgroup>
<col align="center" width="53mm"/>
<col align="center" width="47mm"/> </colgroup>
<thead>
<tr>
<th>Dimension</th>
<th>ICC3</th>
</tr>
</thead>
<tbody>
<tr>
<td><italic>Event title</italic></td>
<td>0.843</td>
</tr>
<tr>
<td><italic>Event summary</italic></td>
<td></td>
</tr>
<tr>
<td>Event nature</td>
<td>0.868</td>
</tr>
<tr>
<td>Time &#x0026; loc.</td>
<td>0.860</td>
</tr>
<tr>
<td>Involved parties</td>
<td>0.856</td>
</tr>
<tr>
<td>Causes</td>
<td>0.879</td>
</tr>
<tr>
<td>Impact</td>
<td>0.887</td>
</tr>
<tr>
<td><italic>Event timeline</italic></td>
<td></td>
</tr>
<tr>
<td>Date acc.</td>
<td>0.839</td>
</tr>
<tr>
<td>Sub events</td>
<td>0.793</td>
</tr>
<tr>
<td><italic>Event focus</italic></td>
<td></td>
</tr>
<tr>
<td>Contro. topic</td>
<td>0.894</td>
</tr>
<tr>
<td>Repr. stmt.</td>
<td>0.877</td>
</tr>
<tr>
<td>Emo. anal.</td>
<td>0.893</td>
</tr>
<tr>
<td><italic>Event suggestions</italic></td>
<td></td>
</tr>
<tr>
<td>Rel.</td>
<td>0.625</td>
</tr>
<tr>
<td>Feas.</td>
<td>0.621</td>
</tr>
<tr>
<td>Emo. guide.</td>
<td>0.640</td>
</tr>
<tr>
<td>Innov.</td>
<td>0.676</td>
</tr>
</tbody>
</table>
</table-wrap>
<p><bold><italic>Answer 2: The OPOR-Eval framework achieves strong human-agent alignment with GPT-4o</italic>.</bold></p>
<p>Correlation is generally categorized as weak (&#x003C;0.50), moderate (0.50&#x2013;0.70), or strong (&#x003E;0.70). For MAE, lower values indicate better alignment. Our human-agent agreement analysis (<xref ref-type="table" rid="table-6">Table 6</xref>) reveals that GPT-4o achieves strong overall alignment (<inline-formula id="ieqn-19"><mml:math id="mml-ieqn-19"><mml:mrow><mml:mi mathvariant="normal">&#x03C1;</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mn>0.69</mml:mn></mml:math></inline-formula>, MAE &#x003D; 0.53). In contrast, DeepSeek-V3 shows moderate performance, with notably poor correlation on the subjective Opinion Mining dimensions (<inline-formula id="ieqn-20"><mml:math id="mml-ieqn-20"><mml:mrow><mml:mi mathvariant="normal">&#x03C1;</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mn>0.13</mml:mn></mml:math></inline-formula>). This confirms that while OPOR-Eval is effective with a strong model, human oversight remains valuable for these subjective dimensions.</p>
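<p>For reference, the metrics in <xref ref-type="table" rid="table-6">Table 6</xref> can be computed per dimension as sketched below, pairing human and agent scores for the same reports.</p>
<code language="python"># Sketch of the per-dimension human-agent agreement metrics in Table 6.
from scipy.stats import kendalltau, spearmanr

def agreement(human_scores, agent_scores):
    rho, _ = spearmanr(human_scores, agent_scores)
    tau, _ = kendalltau(human_scores, agent_scores)
    mae = sum(abs(h - a) for h, a in zip(human_scores, agent_scores)) / len(human_scores)
    return {"spearman_rho": rho, "kendall_tau": tau, "mae": mae}</code>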
<table-wrap id="table-6">
<label>Table 6</label>
<caption>
<title>GPT-4o demonstrates superior human-agent alignment over DeepSeek-V3</title>
</caption>
<table>
<colgroup>
<col align="center" width="28mm"/>
<col align="center" width="16mm"/>
<col align="center" width="10mm"/>
<col align="center" width="16mm"/>
<col align="center" width="10mm"/>
<col align="center" width="16mm"/>
<col align="center" width="10mm"/> </colgroup>
<thead>
<tr>
<th align="center" rowspan="2">Dimension</th>
<th colspan="2">Spearman&#x2019;s &#x03C1;</th>
<th colspan="2">Kendall&#x2019;s &#x03C4;</th>
<th colspan="2">MAE</th>
</tr>
<tr>
<th>DeepSeek-V3</th>
<th>GPT-4o</th>
<th>DeepSeek-V3</th>
<th>GPT-4o</th>
<th>DeepSeek-V3</th>
<th>GPT-4o</th>
</tr>
</thead>
<tbody>
<tr>
<td><italic>Event title</italic></td>
<td>0.55</td>
<td>0.72</td>
<td>0.48</td>
<td>0.63</td>
<td>0.64</td>
<td>0.54</td>
</tr>
<tr>
<td><italic>Event summary</italic></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>Event nature</td>
<td>0.57</td>
<td>0.75</td>
<td>0.49</td>
<td>0.66</td>
<td>0.73</td>
<td>0.52</td>
</tr>
<tr>
<td>Time &#x0026; loc.</td>
<td>0.64</td>
<td>0.78</td>
<td>0.54</td>
<td>0.68</td>
<td>0.70</td>
<td>0.52</td>
</tr>
<tr>
<td>Involved parties</td>
<td>0.45</td>
<td>0.83</td>
<td>0.37</td>
<td>0.73</td>
<td>0.96</td>
<td>0.45</td>
</tr>
<tr>
<td>Causes</td>
<td>0.56</td>
<td>0.84</td>
<td>0.46</td>
<td>0.73</td>
<td>0.85</td>
<td>0.50</td>
</tr>
<tr>
<td>Impact</td>
<td>0.41</td>
<td>0.85</td>
<td>0.33</td>
<td>0.75</td>
<td>0.91</td>
<td>0.63</td>
</tr>
<tr>
<td><italic>Event timeline</italic></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>Date acc.</td>
<td>0.18</td>
<td>0.80</td>
<td>0.14</td>
<td>0.70</td>
<td>1.36</td>
<td>0.45</td>
</tr>
<tr>
<td>Sub events</td>
<td>0.19</td>
<td>0.75</td>
<td>0.16</td>
<td>0.64</td>
<td>1.02</td>
<td>0.58</td>
</tr>
<tr>
<td><italic>Event focus</italic></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>Contro. topic</td>
<td>0.07</td>
<td>0.81</td>
<td>0.06</td>
<td>0.70</td>
<td>1.24</td>
<td>0.54</td>
</tr>
<tr>
<td>Repr. stmt.</td>
<td>0.15</td>
<td>0.77</td>
<td>0.12</td>
<td>0.66</td>
<td>1.20</td>
<td>0.52</td>
</tr>
<tr>
<td>Emo. anal.</td>
<td>0.11</td>
<td>0.82</td>
<td>0.09</td>
<td>0.71</td>
<td>1.12</td>
<td>0.51</td>
</tr>
<tr>
<td><italic>Event suggestions</italic></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>Rel.</td>
<td>0.11</td>
<td>0.47</td>
<td>0.09</td>
<td>0.41</td>
<td>0.64</td>
<td>0.47</td>
</tr>
<tr>
<td>Feas.</td>
<td>&#x2013;</td>
<td>0.22</td>
<td>&#x2013;</td>
<td>0.19</td>
<td>0.57</td>
<td>0.56</td>
</tr>
<tr>
<td>Emo. guide.</td>
<td>0.11</td>
<td>0.50</td>
<td>0.10</td>
<td>0.43</td>
<td>0.60</td>
<td>0.59</td>
</tr>
<tr>
<td>Innov.</td>
<td>0.04</td>
<td>0.52</td>
<td>0.03</td>
<td>0.45</td>
<td>0.71</td>
<td>0.59</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s5_3">
<label>5.3</label>
<title>General Results</title>
<p>Overall Performance</p>
<p>Given GPT-4o&#x2019;s strong alignment with human judgment, we report the average of its scores across both generation strategies as each model&#x2019;s final performance. As shown in <xref ref-type="table" rid="table-7">Table 7</xref>, Gemini 2.5 Pro leads with an average score of 3.71, followed by DeepSeek-R1 (3.67), DeepSeek-V3 (3.63), GPT-4o (3.61), and Llama-3.3-70B (3.52). The narrow 5.12% gap between the best and worst models indicates that all five possess a solid baseline capability for the OPOR-Gen task.</p>
<table-wrap id="table-7">
<label>Table 7</label>
<caption>
<title>Overall performance comparison of five LLMs using two generation strategies (end-to-end and modular), evaluated by two distinct LLM evaluators (DeepSeek-V3 and GPT-4o). Asterisks indicate statistically significant differences between end-to-end and modular strategies within the same evaluator-generator pair, determined by paired <italic>t</italic>-tests (&#x002A;<italic>p</italic> &#x003C; 0.05, &#x002A;&#x002A;<italic>p</italic> &#x003C; 0.01, &#x002A;&#x002A;&#x002A;<italic>p</italic> &#x003C; 0.001). Within each experimental block, the highest score for each evaluation dimension is highlighted in bold. Model abbreviations: DS-R1 &#x003D; DeepSeek-R1, DS-V3 &#x003D; DeepSeek-V3, Gemini-2.5 &#x003D; Gemini 2.5 Pro, Llama-70B &#x003D; Llama-3.3-70B</title>
</caption>
<table>
<colgroup>
<col align="center" width="15mm"/>
<col align="center" width="20mm"/>
<col align="center" width="18mm"/>
<col align="center" width="15mm"/>
<col align="center" width="15mm"/>
<col align="center" width="15mm"/>
<col align="center" width="15mm"/>
<col align="center" width="15mm"/>
<col align="center" width="15mm"/> </colgroup>
<thead>
<tr>
<th>Evaluator</th>
<th>Gen strategy</th>
<th>Model</th>
<th>Event title</th>
<th>Event summary</th>
<th>Event timeline</th>
<th>Event focus</th>
<th>Event suggestions</th>
<th>Avg. score</th>
</tr>
</thead>
<tbody>
<tr>
<td rowspan="10">DS-V3</td>
<td rowspan="5">End-to-end</td>
<td>DS-R1</td>
<td><bold>4.48&#x002A;&#x002A;&#x002A;</bold></td>
<td>4.25&#x002A;&#x002A;</td>
<td>3.20</td>
<td>4.07</td>
<td>4.16</td>
<td>4.03</td>
</tr>
<tr>
<td>DS-V3</td>
<td>4.37&#x002A;&#x002A;</td>
<td>4.19&#x002A;</td>
<td>2.99</td>
<td>4.02</td>
<td>4.12</td>
<td>3.94</td>
</tr>
<tr>
<td>Gemini-2.5</td>
<td>4.37</td>
<td><bold>4.27&#x002A;&#x002A;</bold></td>
<td><bold>3.40</bold></td>
<td><bold>4.27</bold></td>
<td><bold>4.20&#x002A;&#x002A;</bold></td>
<td><bold>4.10</bold></td>
</tr>
<tr>	
<td>GPT-4o</td>
<td>4.29&#x002A;&#x002A;</td>
<td>4.10&#x002A;&#x002A;</td>
<td>3.06</td>
<td>4.03</td>
<td>4.05</td>
<td>3.91</td>
</tr>
<tr>
<td>Llama-70B</td>
<td>4.15</td>
<td>3.95&#x002A;</td>
<td>2.67</td>
<td>3.89</td>
<td>3.95</td>
<td>3.72</td>
</tr>
<tr>
<td rowspan="5">Modular<break/>[<xref ref-type="bibr" rid="ref-19">19</xref>]</td>
<td>DS-R1</td>
<td>4.34</td>
<td><bold>4.18</bold></td>
<td><bold>3.56&#x002A;&#x002A;&#x002A;</bold></td>
<td>4.25&#x002A;&#x002A;&#x002A;</td>
<td><bold>4.22&#x002A;&#x002A;</bold></td>
<td><bold>4.11&#x002A;&#x002A;</bold></td>
</tr>
<tr>
<td>DS-V3</td>
<td>4.28</td>
<td>4.14</td>
<td>3.46&#x002A;</td>
<td>4.23&#x002A;&#x002A;&#x002A;</td>
<td>4.17&#x002A;</td>
<td>4.06&#x002A;</td>
</tr>
<tr>
<td>Gemini-2.5</td>
<td><bold>4.38&#x002A;&#x002A;</bold></td>
<td>4.15</td>
<td>3.52&#x002A;</td>
<td><bold>4.29&#x002A;</bold></td>
<td>4.17</td>
<td>4.10</td>
</tr>
<tr>
<td>GPT-4o</td>
<td>4.12</td>
<td>3.99</td>
<td>3.40&#x002A;&#x002A;</td>
<td>4.21&#x002A;&#x002A;&#x002A;</td>
<td>4.12&#x002A;&#x002A;</td>
<td>3.97&#x002A;&#x002A;</td>
</tr>
<tr>
<td>Llama-70B</td>
<td>4.15</td>
<td>3.86</td>
<td>3.20&#x002A;&#x002A;&#x002A;</td>
<td>4.19&#x002A;&#x002A;&#x002A;</td>
<td>4.02&#x002A;&#x002A;</td>
<td>3.88&#x002A;&#x002A;&#x002A;</td>
</tr>
<tr>
<td rowspan="10">GPT-4o</td>
<td rowspan="5">End-to-end</td>
<td>DS-R1</td>
<td><bold>4.50&#x002A;&#x002A;&#x002A;</bold></td>
<td>3.65&#x002A;&#x002A;</td>
<td>2.56</td>
<td>3.75</td>
<td>3.99</td>
<td>3.69&#x002A;</td>
</tr>
<tr>
<td>DS-V3</td>
<td>4.40&#x002A;&#x002A;</td>
<td>3.57&#x002A;</td>
<td>2.52</td>
<td>3.75</td>
<td>3.99&#x002A;&#x002A;</td>
<td>3.65&#x002A;&#x002A;</td>
</tr>
<tr>
<td>Gemini-2.5</td>
<td>4.47&#x002A;&#x002A;</td>
<td><bold>3.73&#x002A;&#x002A;</bold></td>
<td><bold>2.71&#x002A;</bold></td>
<td><bold>3.85&#x002A;</bold></td>
<td><bold>4.00&#x002A;&#x002A;&#x002A;</bold></td>
<td><bold>3.75&#x002A;&#x002A;</bold></td>
</tr>
<tr>
<td>GPT-4o</td>
<td>4.33&#x002A;</td>
<td>3.52&#x002A;&#x002A;</td>
<td>2.56</td>
<td>3.77</td>
<td>3.95</td>
<td>3.63&#x002A;</td>
</tr>
<tr>
<td>Llama-70B</td>
<td>4.27&#x002A;</td>
<td>3.42&#x002A;</td>
<td>2.24</td>
<td>3.68</td>
<td>3.92</td>
<td>3.51</td>
</tr>
<tr>
<td rowspan="5">Modular<break/> [<xref ref-type="bibr" rid="ref-19">19</xref>]</td>
<td>DS-R1</td>
<td>4.16</td>
<td>3.55</td>
<td>2.67&#x002A;</td>
<td><bold>3.85&#x002A;&#x002A;</bold></td>
<td><bold>4.01&#x002A;&#x002A;</bold></td>
<td>3.65</td>
</tr>
<tr>
<td>DS-V3</td>
<td>4.14</td>
<td>3.54</td>
<td>2.63&#x002A;</td>
<td>3.78&#x002A;&#x002A;</td>
<td>3.97</td>
<td>3.61</td>
</tr>
<tr>
<td>Gemini-2.5</td>
<td><bold>4.26</bold></td>
<td><bold>3.57</bold></td>
<td><bold>2.70</bold></td>
<td>3.84</td>
<td>3.98</td>
<td><bold>3.67</bold></td>
</tr>
<tr>
<td>GPT-4o</td>
<td>4.04</td>
<td>3.42</td>
<td>2.68&#x002A;&#x002A;</td>
<td>3.84&#x002A;&#x002A;</td>
<td>3.97&#x002A;&#x002A;</td>
<td>3.59</td>
</tr>
<tr>
<td>Llama-70B</td>
<td>4.13</td>
<td>3.27</td>
<td>2.56&#x002A;&#x002A;&#x002A;</td>
<td>3.74&#x002A;&#x002A;</td>
<td>3.94&#x002A;&#x002A;</td>
<td>3.53&#x002A;&#x002A;</td>
</tr>
</tbody>
</table>
</table-wrap>
<p><bold>Comparison of Generation Strategies</bold></p>
<p>As shown in <xref ref-type="table" rid="table-7">Table 7</xref>, while the end-to-end generation strategy slightly outperforms the modular approach on average (GPT-4o: 3.65 vs. 3.61), our results reveal a clear trade-off: end-to-end excels at high-level synthesis (Title, Summary), while the modular approach is superior for detailed, multi-perspective analysis (Timeline, Focus). This key finding suggests that the optimal strategy is task-dependent, pointing towards hybrid approaches as a promising direction for future research.</p>
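<p>For clarity, the significance markers in <xref ref-type="table" rid="table-7">Table 7</xref> follow the usual paired-test recipe; a minimal sketch, assuming aligned per-event score arrays for one generator-evaluator pair (all values are illustrative):</p>
<preformat># Paired t-test between the two generation strategies for one model pair.
# Assumes end_to_end[i] and modular[i] score the same event i (illustrative).
import numpy as np
from scipy.stats import ttest_rel

end_to_end = np.array([3.8, 4.1, 3.5, 4.0, 3.9, 3.6])
modular    = np.array([3.6, 4.0, 3.7, 3.8, 3.7, 3.5])

t_stat, p_value = ttest_rel(end_to_end, modular)

stars = ""                              # map p-value to the markers used in Table 7
for threshold, mark in [(0.001, "***"), (0.01, "**"), (0.05, "*")]:
    if p_value &#x003C; threshold:
        stars = mark
        break
print(f"t={t_stat:.2f}, p={p_value:.4f}{stars}")</preformat>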

</sec>
</sec>
<sec id="s6">
<label>6</label>
<title>Analysis and Discussion</title>
<sec id="s6_1">
<label>6.1</label>
<title>Task Complexity Analysis</title>
<sec id="s6_1_1">
<label>6.1.1</label>
<title>Temporal Reasoning Is a Universal Challenge for LLMs</title>
<p>Analysis of GPT-4o evaluation results (<xref ref-type="table" rid="table-7">Table 7</xref>) reveals a notable weakness across all models in the Event Timeline dimension. This weakness is rooted in a dramatic failure on the Date Accuracy sub-dimension (average: 1.25). This failure is consistent even for the top-performing model (Gemini 2.5 Pro: 1.28) and stems from the task&#x2019;s demand for complex temporal reasoning&#x2014;identifying inflection points in data trends rather than simply extracting dates from documents.</p>

</sec>
<sec id="s6_1_2">
<label>6.1.2</label>
<title>Difficulty Stems from Information Structure, Not Thematic Content</title>
<p>A consistent pattern across all models (<xref ref-type="fig" rid="fig-6">Fig. 6</xref>) supports this claim: performance is significantly higher for Human-caused Disasters than for Natural Disasters. Further examination at the sub-category level (<xref ref-type="fig" rid="fig-7">Fig. 7</xref>) sharpens the distinction: models excel on events with well-defined information structures, such as &#x201C;Industrial&#x201D; and &#x201C;Traffic&#x201D; accidents, but struggle with events characterized by diffuse information, such as &#x201C;Wildfires&#x201D; and &#x201C;Floods&#x201D;. We hypothesize that this disparity arises because human-caused disasters typically feature clear causal chains and structured data (e.g., official investigation reports) that are easily processed by LLMs. In contrast, natural disasters generate fragmented information from diverse sources with ambiguous temporal boundaries, posing a fundamental challenge to fact extraction and timeline segmentation.</p>
<fig id="fig-6">
<label>Figure 6</label>
<caption>
<title>Consistent performance gap between human-caused (higher) and natural disasters (lower) across all models</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73771-fig-6.tif"/>
</fig><fig id="fig-7">
<label>Figure 7</label>
<caption>
<title>Performance correlates with information structure: structured events (Industrial, Traffic) score highest while diffuse events (Wildfires, Floods) score lowest</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73771-fig-7.tif"/>
</fig>
</sec>
</sec>
<sec id="s6_2">
<label>6.2</label>
<title>Generator Performance Analysis</title>
<sec id="s6_2_1">
<label>6.2.1</label>
<title>LLM Performance Is Remarkably Consistent across All Three Evaluation Categories</title>
<p>Our evaluation framework assesses three distinct capabilities: Factual Consistency (via the Fact-Checker Tool), Multi-source Synthesis (via the Opinion Mining Tool), and Practical Reasoning (via the Solution Counselor Tool). As shown in <xref ref-type="table" rid="table-8">Table 8</xref>, model performance rankings remain remarkably stable across these three capabilities. Pearson correlations between all category pairs exceed 0.84 (<italic>p</italic> &#x003C; 0.001), indicating high consistency in model performance across dimensions. This high inter-category correlation provides statistical evidence that current LLMs demonstrate coherent performance across factual verification, information integration, and strategic reasoning, rather than excelling only in isolated dimensions.</p>
<table-wrap id="table-8">
<label>Table 8</label>
<caption>
<title>Average model performance across three evaluation categories, with bold values indicating the best performance scores</title>
</caption>
<table>
<colgroup>
<col align="center" width="28mm"/>
<col align="center" width="30mm"/>
<col align="center" width="35mm"/>
<col align="center" width="35mm"/> </colgroup>
<thead>
<tr>
<th>Model</th>
<th>Factual consist.</th>
<th>Multi-Src synthesis</th>
<th>Practical reasoning</th>
</tr>
</thead>
<tbody>
<tr>
<td>DeepSeek-R1</td>
<td>3.06</td>
<td>3.89</td>
<td><bold>4.00</bold></td>
</tr>
<tr>
<td>DeepSeek-V3</td>
<td>3.02</td>
<td>3.84</td>
<td>3.98</td>
</tr>
<tr>
<td>Gemini 2.5 Pro</td>
<td><bold>3.10</bold></td>
<td><bold>3.99</bold></td>
<td>3.99</td>
</tr>
<tr>
<td>GPT-4o</td>
<td>2.99</td>
<td>3.87</td>
<td>3.96</td>
</tr>
<tr>
<td>Llama-3.3-70B</td>
<td>2.91</td>
<td>3.66</td>
<td>3.93</td>
</tr>
</tbody>
</table>
</table-wrap>
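<p>A minimal sketch of this consistency check, assuming per-category score vectors aligned over the same units of analysis (the lists below reuse the per-model means from <xref ref-type="table" rid="table-8">Table 8</xref> purely for illustration):</p>
<preformat># Pairwise Pearson correlation between the three capability score vectors.
from itertools import combinations
from scipy.stats import pearsonr

scores = {
    "factual":   [3.06, 3.02, 3.10, 2.99, 2.91],  # per-model means from Table 8
    "synthesis": [3.89, 3.84, 3.99, 3.87, 3.66],
    "reasoning": [4.00, 3.98, 3.99, 3.96, 3.93],
}
for a, b in combinations(scores, 2):
    r, p = pearsonr(scores[a], scores[b])
    print(f"{a} vs {b}: r={r:.2f} (p={p:.3f})")</preformat>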
</sec>
<sec id="s6_2_2">
<label>6.2.2</label>
<title>LLMs Struggle with Information Overload and Multi-Document Synthesis</title>
<p>Our analysis reveals that report quality does not simply increase with source document volume, exposing a critical limitation of current LLMs in coping with information overload and performing multi-document synthesis. Since Title and Summary generation primarily relies on news articles while Timeline and Focus depend on social media data, we analyze their correlations separately.</p>
<p><bold>News Articles:</bold> <xref ref-type="table" rid="table-9">Table 9</xref> shows a negative correlation between article count and the scores for both the Title and Summary, suggesting information overload. <xref ref-type="fig" rid="fig-8">Fig. 8a</xref> identifies the optimal range: 10&#x2013;20 articles.</p>
<table-wrap id="table-9">
<label>Table 9</label>
<caption>
<title>Correlation between news/tweet count and evaluation scores</title>
</caption>
<table>
<colgroup>
<col align="center" width="29mm"/>
<col align="center" width="33mm"/>
<col align="center" width="38mm"/> </colgroup>
<thead>
<tr>
<th>Score category</th>
<th>News count corr. (r)</th>
<th>Tweet count corr. (r)</th>
</tr>
</thead>
<tbody>
<tr>
<td>Title</td>
<td>&#x2212;0.0342</td>
<td>&#x2014;</td>
</tr>
<tr>
<td>Summary</td>
<td>&#x2212;0.1373</td>
<td>&#x2014;</td>
</tr>
<tr>
<td>Timeline</td>
<td>&#x2014;</td>
<td>0.2134</td>
</tr>
<tr>
<td>Focus</td>
<td>&#x2014;</td>
<td>&#x2212;0.4411</td>
</tr>
</tbody>
</table>
</table-wrap><fig id="fig-8">
<label>Figure 8</label>
<caption>
<title>Source document volume exhibits complex relationships with report quality. (<bold>a</bold>) Title and Summary scores peak with 10&#x2013;20 news articles, declining with information overload. (<bold>b</bold>) Tweet volume creates opposing effects: Timeline benefits from more data while Focus degrades, with a critical threshold around 700 tweets</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73771-fig-8.tif"/>
</fig>
<p><bold>Social Media:</bold> <xref ref-type="table" rid="table-9">Table 9</xref> shows opposing effects: Timeline accuracy benefits from more tweets (r &#x003D; &#x002B;0.21, <italic>p</italic> &#x003C; 0.001), while Event Focus suffers dramatically (r &#x003D; &#x2212;0.44, <italic>p</italic> &#x003C; 0.001). <xref ref-type="fig" rid="fig-8">Fig. 8b</xref> reveals a critical threshold at approximately 700 tweets, where this trade-off becomes pronounced.</p>
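<p>A sketch of this volume-quality analysis, assuming per-event tweet counts and section scores (all arrays and the exact split point are illustrative):</p>
<preformat># Correlation between source-document volume and section scores.
import numpy as np
from scipy.stats import pearsonr

tweet_counts = np.array([120, 350, 680, 710, 900, 1500])  # hypothetical counts
timeline     = np.array([2.1, 2.4, 2.6, 2.7, 2.9, 3.1])   # hypothetical scores
focus        = np.array([4.2, 4.1, 3.9, 3.7, 3.5, 3.2])   # hypothetical scores

r_tl, p_tl = pearsonr(tweet_counts, timeline)   # expected positive
r_fc, p_fc = pearsonr(tweet_counts, focus)      # expected negative

# Inspect the trade-off around the ~700-tweet threshold seen in Fig. 8b.
below = focus[tweet_counts &#x003C; 700].mean()
above = focus[tweet_counts &#x003E;= 700].mean()
print(f"Timeline r={r_tl:+.2f}, Focus r={r_fc:+.2f}; "
      f"Focus below/above 700: {below:.2f}/{above:.2f}")</preformat>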

</sec>
</sec>
<sec id="s6_3">
<label>6.3</label>
<title>Evaluator Bias Analysis</title>
<sec id="s6_3_1">
<label>6.3.1</label>
<title>LLM Evaluators Display Inherent Scoring Biases</title>
<p>DeepSeek-V3 consistently assigns higher scores than GPT-4o for identical reports (average: 4.02 vs. 3.62, <italic>p</italic> &#x003C; 0.001). <xref ref-type="fig" rid="fig-9">Fig. 9a</xref> visualizes this bias: GPT-4o shows lower median scores with a wider distribution, indicating stricter criteria and better discrimination, whereas DeepSeek-V3&#x2019;s scores cluster higher with less variance. This systematic difference demonstrates that absolute scores from different LLM evaluators are not directly comparable, highlighting the need for score normalization or calibration in practical applications.</p>
<fig id="fig-9">
<label>Figure 9</label>
<caption>
<title>LLM evaluator characteristics. (<bold>a</bold>) Systematic scoring bias: DeepSeek-V3 assigns consistently higher scores with less variance compared to GPT-4o&#x2019;s stricter, more discriminative scoring. (<bold>b</bold>) Negligible self-evaluation bias: both evaluators maintain objectivity with minimal self-preference (DeepSeek-V3: &#x002B;0.03) or self-criticism (GPT-4o: &#x2212;0.02), validating their use in automated evaluation systems</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_73771-fig-9.tif"/>
</fig>
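<p>One simple calibration option consistent with this observation is to z-normalize each evaluator&#x2019;s scores before any cross-evaluator comparison; the sketch below is an illustrative suggestion rather than a component of OPOR-Eval:</p>
<preformat># Z-score calibration so scores from two LLM evaluators share a common scale.
import numpy as np

def zscore(scores):
    s = np.asarray(scores, dtype=float)
    return (s - s.mean()) / s.std()

ds_v3  = [4.2, 4.0, 3.9, 4.1, 4.3]   # hypothetical lenient, low-variance evaluator
gpt_4o = [3.8, 3.2, 3.0, 3.9, 4.2]   # hypothetical stricter, wider-spread evaluator

# After calibration both sets are mean-0/std-1; ranks are preserved, so
# calibrated scores can be pooled or compared across evaluators.
print(zscore(ds_v3).round(2), zscore(gpt_4o).round(2))</preformat>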
</sec>
<sec id="s6_3_2">
<label>6.3.2</label>
<title>LLM Evaluators Show Strong Objectivity with Negligible Self-Evaluation Bias</title>
<p>To investigate self-evaluation bias, we compare the scores an agent assigns to its own reports (&#x201C;Self-Evaluation&#x201D;) with those it assigns to reports from other models (&#x201C;Cross-Evaluation&#x201D;). Our results (<xref ref-type="fig" rid="fig-9">Fig. 9b</xref>) show that while self-evaluation biases are statistically significant, their practical impact is negligible: DeepSeek-V3 shows a minor self-preference (&#x002B;0.03 average score, <italic>p</italic> &#x003C; 0.001), while GPT-4o exhibits slight self-criticism (&#x2212;0.02 average score, <italic>p</italic> &#x003C; 0.05). These differences represent less than 1% of the rating scale. This finding confirms a high degree of objectivity in these LLM evaluators, enhancing the credibility and practical viability of LLM-based evaluation systems.</p>
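<p>A sketch of this self-preference check, assuming each score record carries its generator and evaluator identifiers (the record structure and the use of Welch&#x2019;s <italic>t</italic>-test are illustrative assumptions):</p>
<preformat># Self- vs. cross-evaluation comparison for one evaluator.
# Each record: (generator_model, evaluator_model, score); structure is hypothetical.
import numpy as np
from scipy.stats import ttest_ind

records = [
    ("deepseek-v3", "deepseek-v3", 4.1), ("gpt-4o", "deepseek-v3", 4.0),
    ("deepseek-v3", "deepseek-v3", 4.2), ("llama-70b", "deepseek-v3", 4.0),
    ("gemini-2.5", "deepseek-v3", 3.9), ("deepseek-r1", "deepseek-v3", 4.1),
    # ... one record per (report, evaluator) pair
]

self_scores  = [s for g, e, s in records if g == e]
cross_scores = [s for g, e, s in records if g != e]
bias = np.mean(self_scores) - np.mean(cross_scores)
t, p = ttest_ind(self_scores, cross_scores, equal_var=False)  # Welch's t-test
print(f"self-preference = {bias:+.2f} (p={p:.3f})")</preformat>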

</sec>
</sec>
</sec>
<sec id="s7">
<label>7</label>
<title>Related Work</title>
<sec id="s7_1">
<label>7.1</label>
<title>Multi-Document Summarization</title>
<p>Multi-Document Summarization (MDS) generates comprehensive summaries from document collections on the same topic [<xref ref-type="bibr" rid="ref-20">20</xref>], with applications in news extraction, social media mining, and review analysis [<xref ref-type="bibr" rid="ref-21">21</xref>&#x2013;<xref ref-type="bibr" rid="ref-24">24</xref>]. Research explores both extractive [<xref ref-type="bibr" rid="ref-25">25</xref>,<xref ref-type="bibr" rid="ref-26">26</xref>] and abstractive approaches [<xref ref-type="bibr" rid="ref-27">27</xref>&#x2013;<xref ref-type="bibr" rid="ref-30">30</xref>].</p>
<p>However, while OPOR-Gen involves synthesizing multiple documents, it is fundamentally distinct from traditional MDS. Traditional MDS primarily focuses on information consolidation, aiming to summarize homogeneous sources (e.g., news-only) into a single unstructured paragraph. In contrast, OPOR-Gen demands a full-cycle analytical product that synthesizes highly heterogeneous sources (i.e., formal news and informal social media) into a structured, multi-section report. Critically, it moves beyond mere summarization to require analysis of diverse public viewpoints through sentiment and stance detection (Event Focus) and generation of actionable recommendations (Event Suggestions).</p>
<p>This structural complexity also means that traditional n-gram metrics (ROUGE, BLEU) [<xref ref-type="bibr" rid="ref-31">31</xref>&#x2013;<xref ref-type="bibr" rid="ref-34">34</xref>], commonly used for MDS, are inadequate for evaluating long-form, multi-faceted reports [<xref ref-type="bibr" rid="ref-35">35</xref>]. Similar challenges exist in open-ended text generation tasks [<xref ref-type="bibr" rid="ref-36">36</xref>], where gold references are absent and human evaluation suffers from expertise limitations and subjectivity [<xref ref-type="bibr" rid="ref-37">37</xref>,<xref ref-type="bibr" rid="ref-38">38</xref>].</p>
</sec>
<sec id="s7_2">
<label>7.2</label>
<title>Text Generation Evaluation</title>
<p>Traditional metrics like ROUGE are inadequate for evaluating long-form, structured content, while recent LLM-as-a-judge methods [<xref ref-type="bibr" rid="ref-3">3</xref>,<xref ref-type="bibr" rid="ref-4">4</xref>] typically assess holistic quality without addressing the multi-faceted structural demands of OPOR-Gen (e.g., timeline accuracy, opinion diversity, suggestion feasibility).</p>
<p>LLM-based evaluation has recently emerged as a promising solution to the limitations of traditional metrics, demonstrating a strong correlation with human judgment while offering superior reproducibility, speed, and cost-effectiveness [<xref ref-type="bibr" rid="ref-4">4</xref>,<xref ref-type="bibr" rid="ref-39">39</xref>&#x2013;<xref ref-type="bibr" rid="ref-42">42</xref>]. A variety of strategies have been developed. For reference-free evaluation, methods employ techniques like chain-of-thought prompting or proxy question-answering [<xref ref-type="bibr" rid="ref-3">3</xref>,<xref ref-type="bibr" rid="ref-43">43</xref>,<xref ref-type="bibr" rid="ref-44">44</xref>]. Other research focuses on creating benchmarks to evaluate specific attributes, such as instruction following [<xref ref-type="bibr" rid="ref-33">33</xref>,<xref ref-type="bibr" rid="ref-40">40</xref>], factual consistency [<xref ref-type="bibr" rid="ref-45">45</xref>,<xref ref-type="bibr" rid="ref-46">46</xref>], response alignment [<xref ref-type="bibr" rid="ref-47">47</xref>], and even leveraging multi-agent systems for evaluation [<xref ref-type="bibr" rid="ref-48">48</xref>].</p>
<p>Building on these advances, our OPOR-Eval framework employs LLMs as intelligent agents, simulating expert evaluation by using generated reports as contextual background and applying 5-point Likert scale scoring tailored to OPOR-Gen&#x2019;s unique requirements.</p>
</sec>
</sec>
<sec id="s8">
<label>8</label>
<title>Conclusion</title>
<p>In this paper, we address the critical inefficiency of manual online public opinion reporting. To tackle this, we introduce three core contributions: the OPOR-Gen task for automated report generation; OPOR-Bench, the first multi-source benchmark to support it; and OPOR-Eval, a reliable agent-based evaluation framework achieving strong human correlation that can be generalized to other long-form structured generation tasks. Our experiments establish strong baselines and reveal key challenges, such as complex temporal reasoning and systematic biases inherent in different LLM evaluators. We believe this work not only provides practical guidance for public opinion management but also serves as a valuable resource for related NLP tasks like multi-document summarization and event extraction.</p>
</sec>
<sec id="s9">
<label>9</label>
<title>Limitations</title>
<p>Dataset Scope and Generalizability: The current OPOR-Bench dataset is text-only and predominantly English, which limits generalizability and does not fully reflect the true multimodal and multilingual nature of real-world public opinion.</p>
<p>Reproducibility and Scalability: Reproducibility is constrained by the framework&#x2019;s reliance on costly, proprietary models (like GPT-4o) and evolving APIs. Furthermore, scalability is challenged by potential dataset biases that may underrepresent marginalized voices.</p>
<p>Ethical Implications: We acknowledge that automated public opinion reporting tools carry a significant risk of misuse (e.g., surveillance or propaganda). Therefore, robust safeguards, bias auditing, and human oversight are necessary for any real-world deployment.</p>
</sec>
<sec id="s10">
<label>10</label>
<title>Future Work</title>
<p>Dataset Expansion: Future work will prioritize expanding OPOR-Bench to be both multilingual and multimodal. This involves incorporating non-English data from diverse cultural contexts and integrating critical visual content (e.g., images, videos) to ensure global applicability.</p>
<p>Hybrid Evaluation: To mitigate circularity and improve robustness, future work should develop hybrid evaluation frameworks. This involves combining LLM evaluators with specialized, verifiable modules (e.g., fact-checking tools, sentiment models) and structured human oversight.</p>
<p>Addressing LLM Evaluation Limitations: Deeper investigation is needed into LLM evaluator reliability, particularly shared capability weaknesses (e.g., blindness to temporal errors) and decision transparency. Future work should explore explainable frameworks to address this opacity.</p>
<p>Shared Capability Weaknesses: Evaluators may be &#x201C;blind&#x201D; to errors they are also prone to making, such as the universal temporal reasoning challenge we identified. Future work must integrate hybrid architectures with specialized, verifiable modules (e.g., temporal reasoners, fact-checkers) to address these capability blind spots.</p>
<p>Decision Transparency: Despite our agent-based design, the internal logic behind specific score assignments remains opaque. Future work should explore explainable frameworks that incorporate uncertainty quantification and adversarial testing with deliberately flawed reports to improve transparency.</p>
<p>Reproducibility and Real-World Deployment: Future work will focus on improving reproducibility by developing smaller, efficient, open-source agents for generation and evaluation. We will also establish clear ethical guidelines for practical deployment, including robust safeguards for bias auditing and privacy protection.</p>
<p>Bias Mitigation: Systematic research in bias mitigation is needed to address diverse voice representation within the dataset and models. This includes developing new bias detection tools and establishing fairness metrics for public opinion analysis.</p>
</sec>
</body>
<back>
<ack>
<p>Not applicable.</p>
</ack>
<sec>
<title>Funding Statement</title>
<p>This work is supported by the Fundamental Research Funds for the Central Universities (No. CUC25SG013) and the Foundation of Key Laboratory of Education Informatization for Nationalities (Yunnan Normal University), Ministry of Education (No. EIN2024C006).</p>
</sec>
<sec>
<title>Author Contributions</title>
<p>Conceptualization, Jinzheng Yu, Yang Xu, Haozhen Li; methodology, Jinzheng Yu, Yang Xu, Haozhen Li; software, Jinzheng Yu, Junqi Li; validation, Jinzheng Yu, Yang Xu, Haozhen Li, Junqi Li; formal analysis, Jinzheng Yu, Yang Xu; investigation, Jinzheng Yu, Yang Xu, Haozhen Li; resources, Jinzheng Yu, Junqi Li; data curation, Junqi Li; writing&#x2014;original draft preparation, Jinzheng Yu; writing&#x2014;review and editing, Jinzheng Yu, Yang Xu; visualization, Jinzheng Yu, Yang Xu; supervision, Ligu Zhu, Hao Shen, Lei Shi; project administration, Ligu Zhu, Hao Shen, Lei Shi; funding acquisition, Ligu Zhu, Hao Shen, Lei Shi. All authors reviewed the results and approved the final version of the manuscript.</p>
</sec>
<sec sec-type="data-availability">
<title>Availability of Data and Materials</title>
<p>Data available on request from the authors. The data that support the findings of this study are available from the Corresponding Author, Lei Shi, upon reasonable request.</p>
</sec>
<sec>
<title>Ethics Approval</title>
<p>Not applicable.</p>
</sec>
<sec sec-type="COI-statement">
<title>Conflicts of Interest</title>
<p>The authors declare no conflicts of interest to report regarding the present study.</p>
</sec>
<app-group id="appg-1">
<app id="app-1">
<title>Appendix A Scoring Guideline for Event Title</title>
<p>The following sections provide the detailed scoring criteria for Event Title. Adopting a methodology similar to that of Kocmi and Federmann [<xref ref-type="bibr" rid="ref-49">49</xref>], each dimension is rated on a 5-point Likert scale, where a score of 1 indicates an unacceptable generation and 5 represents an excellent one.</p>
<p><boxed-text id="box1"><caption><title>Scoring Guideline for Event Title</title></caption>
<p>The quality of the &#x201C;Event_Title&#x201D; is rated on a scale of 1 to 5. Your evaluation should assess to what extent the title incorporates the official &#x201C;event name&#x201D; and relevant &#x201C;keywords&#x201D; to be clear, specific, and instantly recognizable.</p>
<p><bold>Score 5 (Excellent):</bold></p>
<p>The title perfectly incorporates the official event name (or its recognized alternative) and key keywords in a clear and coherent manner, precisely and unambiguously identifying the crisis event.</p>
<p><bold>Score 4 (Good):</bold></p>
<p>The title clearly references the event name and relevant keywords, allowing readers to readily identify the crisis, though there may be minor room for improvement.</p>
<p><bold>Score 3 (Fair):</bold></p>
<p>The title partially mentions the event name or a few keywords, broadly pointing to the correct crisis but lacking clarity and completeness.</p>
<p><bold>Score 2 (Poor):</bold></p>
<p>The title provides only minimal or vague hints related to the event, leaving the specific crisis unclear to the reader.</p>
<p><bold>Score 1 (Unacceptable):</bold></p>
<p>The title completely fails to mention the event name or any relevant keywords, providing no clear indication of the crisis.</p>
</boxed-text></p>
</app>
<app id="app-2">
<title>Appendix B Prompts for Evaluation Framework (OPOR-Eval)</title>
<p><boxed-text id="box2"><caption><title>Prompt for the Evaluation Agent</title></caption>
<p>Try your best to evaluate the quality of the given public opinion report comprehensively.</p>
<p>&#x003C;tool introduction&#x003E;</p>
<p>You have access to three specialized evaluation tools:</p>
<p><bold>Fact-Checker Tool:</bold> Verifies factual accuracy by comparing report content against reference data (<inline-formula id="ieqn-21"><mml:math id="mml-ieqn-21"><mml:msub><mml:mrow><mml:mtext>Z</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mtext>i</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula>). Use this for Event Title, Event Summary, and Timeline date accuracy.</p>
<p><bold>Opinion Mining Tool:</bold> Analyzes public opinion coverage by examining social media posts (<inline-formula id="ieqn-22"><mml:math id="mml-ieqn-22"><mml:msub><mml:mrow><mml:mtext>Y</mml:mtext></mml:mrow><mml:mrow><mml:mrow><mml:mtext>i</mml:mtext></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula>). Use this for Timeline coverage completeness and Event Focus evaluation.</p>
<p><bold>Solution Counselor Tool:</bold> Evaluates recommendation quality using your expert knowledge. Use this for Event Suggestions evaluation.</p>
<p><bold>Use the following format:</bold></p>
<p><bold>Initial Input:</bold> the public opinion report to be evaluated. If the report is too long, focus on the section relevant to current evaluation.</p>
<p><bold>Thought:</bold> analyze which aspect needs to be evaluated and which tool to use.</p>
<p><bold>Tool to Use:</bold> should be one of [Fact-Checker, Opinion-Mining, Solution-Counselor]</p>
<p><bold>Tool Input:</bold> the specific content for the selected tool</p>
<p><bold>Observation:</bold> the evaluation score (1&#x2013;5) with reasoning from the tool</p>
<p>&#x2026; (this Thought/Tool to Use/Tool Input/Observation can repeat N times for each report section)</p>
<p><bold>Thought:</bold> I have completed evaluating all sections and can provide final scores</p>
<p><bold>Final Scores:</bold> The final output for the i-th report is a 15-dimensional score vector <inline-formula id="ieqn-23"><mml:math id="mml-ieqn-23"><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>15</mml:mn></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> where:
<list list-type="bullet">
<list-item>
<p><inline-formula id="ieqn-24"><mml:math id="mml-ieqn-24"><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>: Title score (from Fact-Checker Tool)</p></list-item>
<list-item>
<p><inline-formula id="ieqn-25"><mml:math id="mml-ieqn-25"><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> to <inline-formula id="ieqn-26"><mml:math id="mml-ieqn-26"><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>6</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>: Summary scores (from Fact-Checker Tool)</p></list-item>
<list-item>
<p><inline-formula id="ieqn-27"><mml:math id="mml-ieqn-27"><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>7</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>: Timeline Date Accuracy score (from Fact-Checker Tool)</p></list-item>
<list-item>
<p><inline-formula id="ieqn-28"><mml:math id="mml-ieqn-28"><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>8</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>: Timeline Coverage score (from Opinion Mining Tool)</p></list-item>
<list-item>
<p><inline-formula id="ieqn-29"><mml:math id="mml-ieqn-29"><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>9</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> to <inline-formula id="ieqn-30"><mml:math id="mml-ieqn-30"><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>11</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>: Focus scores (from Opinion Mining Tool)</p></list-item>
<list-item>
<p><inline-formula id="ieqn-31"><mml:math id="mml-ieqn-31"><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>12</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> to <inline-formula id="ieqn-32"><mml:math id="mml-ieqn-32"><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>15</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>: Suggestion scores (from Solution Counselor Tool)</p></list-item>
</list></p>
</boxed-text></p>
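<p>For illustration, the 15 sub-dimension scores defined above can be collapsed into the five report-section scores used in our result tables. The slicing in the following sketch follows the vector layout listed above; the function and variable names themselves are ours:</p>
<preformat># Collapse one 15-dimensional score vector S_i into five section scores.
import numpy as np

SECTION_SLICES = {                      # mapping taken from the layout above
    "event_title":       slice(0, 1),   # s_i,1
    "event_summary":     slice(1, 6),   # s_i,2 .. s_i,6
    "event_timeline":    slice(6, 8),   # s_i,7 and s_i,8
    "event_focus":       slice(8, 11),  # s_i,9 .. s_i,11
    "event_suggestions": slice(11, 15), # s_i,12 .. s_i,15
}

def section_scores(s_i):
    s_i = np.asarray(s_i, dtype=float)  # one vector per evaluated report
    return {name: s_i[sl].mean() for name, sl in SECTION_SLICES.items()}

print(section_scores([4, 4, 5, 4, 3, 4, 1, 3, 4, 4, 4, 4, 3, 4, 4]))</preformat>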
</app>
<app id="app-3">
<title>Appendix C</title>
<p>All prompt templates presented in this appendix were carefully designed with input from domain experts in crisis management and underwent iterative refinement through multiple validation rounds. Due to space constraints, we present representative examples here. We make the complete prompt collection, along with detailed design rationale and optimization process, publicly available with our dataset to ensure full reproducibility.</p>
<sec id="s13">
<title>C.1 Prompt for End-to-End Strategy</title>
<p><boxed-text id="box3"><caption><title>Prompt Template for End-to-end Generation Approach</title></caption>
<p><bold>[SYSTEM PROMPT]</bold></p>
<p><bold>1. Role and Goal</bold></p>
<p>You are an expert public opinion analyst. Your primary task is to analyze the provided context (news articles) and input (tweets) to generate a complete, structured public opinion report in a single pass. The output must be a valid JSON object.</p>
<p><bold>2. Field-by-Field Generation Instructions</bold></p>
<p>You must generate content for all five report sections, adhering to the following key guidelines:
<list list-type="bullet">
<list-item>
<p><bold>Event_Title:</bold> Generate a concise and accurate event title.</p></list-item>
<list-item>
<p><bold>Event_Summary:</bold> Generate a detailed summary covering the five core dimensions (Crisis Type, Time/Location, Cause, Impact, etc.).</p></list-item>
<list-item>
<p><bold>Event_Focus:</bold> Classify tweets (Netizens/Authoritative Institutions), perform topic clustering and sentiment analysis, and extract 2&#x2013;3 key viewpoints for each group.</p></list-item>
<list-item>
<p>... (and so on for <italic>Event_Timeline</italic> and <italic>Event_Suggestions</italic>, with their respective constraints).</p></list-item>
</list></p>
<p><bold>3. Few-Shot Examples</bold></p>
<p><italic>This section illustrates the expected style and depth for each field</italic>.</p>
<p>&#x2013; Content Examples for &#x201C;Event_Summary&#x201D; field &#x2013;</p>
<p>{summary_style_examples}</p>
<p>&#x2013; Content Examples for &#x201C;Event_Focus&#x201D; field &#x2013;</p>
<p>{focus_style_examples}</p>
<p>... (and so on for other fields)</p>
<p><bold>[TASK DATA]</bold></p>
<p><bold>&#x2013; News Data (Context) Below &#x2013;</bold> &#x003C;input&#x003E;</p>
<p><bold>&#x2013; Twitter Data (Input) Below &#x2013;</bold> &#x003C;input&#x003E;</p>
</boxed-text></p>
</sec>
<sec id="s14">
<title>C.2 Prompts for Modular Strategy</title>
<p><boxed-text id="box4"><caption><title>Prompt Template for Event Title Generation</title></caption>
<p><bold>1. Role and Goal</bold></p>
<p>You are an expert public opinion analyst. Your task is to generate a concise, neutral, and highly descriptive title based on the news content provided below.</p>
<p><bold>2. Field-by-Field Generation Instructions</bold>
<list list-type="bullet">
<list-item>
<p><bold>Event_Title:</bold> The title should capture the core essence of the event in a single, clear phrase.</p></list-item>
</list></p>
<p><bold>3. Few-Shot Examples</bold>
<list list-type="bullet">
<list-item>
<p><bold>Input:</bold> &#x003C;input_example_1&#x003E;</p></list-item>
<list-item>
<p><bold>Output Title:</bold> &#x003C;title_example_1&#x003E;</p></list-item>
</list></p>
<p><bold>&#x2013; News Content (Context) Below &#x2013;</bold> &#x003C;input&#x003E;</p>
</boxed-text></p>
</sec>
</app>
</app-group>
<ref-list content-type="authoryear">
<title>References</title>
<ref id="ref-1"><label>[1]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Wang</surname> <given-names>B</given-names></string-name>, <string-name><surname>Zi</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Zhao</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Deng</surname> <given-names>P</given-names></string-name>, <string-name><surname>Qin</surname> <given-names>B</given-names></string-name></person-group>. <article-title>ESDM: early sensing depression model in social media streams</article-title>. In: <conf-name>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024); 2024 May 20&#x2013;25</conf-name>; <publisher-loc>Torino, Italia</publisher-loc>. p. <fpage>6288</fpage>&#x2013;<lpage>98</lpage>.</mixed-citation></ref>
<ref id="ref-2"><label>[2]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Che</surname> <given-names>W</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Qin</surname> <given-names>L</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>J</given-names></string-name>, <string-name><surname>Zhou</surname> <given-names>J</given-names></string-name></person-group>. <article-title>Unlocking the capabilities of thought: a reasoning boundary framework to quantify and optimize chain-of-thought</article-title>. In: <conf-name>Proceedings of the Advances in Neural Information Processing Systems 37; 2024 Dec 10&#x2013;15</conf-name>; <publisher-loc>Vancouver, BC, Canada</publisher-loc>. p. <fpage>54872</fpage>&#x2013;<lpage>904</lpage>. doi:<pub-id pub-id-type="doi">10.52202/079017-1740</pub-id>.</mixed-citation></ref>
<ref id="ref-3"><label>[3]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Liu</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Iter</surname> <given-names>D</given-names></string-name>, <string-name><surname>Xu</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>S</given-names></string-name>, <string-name><surname>Xu</surname> <given-names>R</given-names></string-name>, <string-name><surname>Zhu</surname> <given-names>C</given-names></string-name></person-group>. <article-title>G-eval: NLG evaluation using gpt-4 with better human alignment</article-title>. In: <conf-name>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing; 2023 Dec 6&#x2013;10</conf-name>; <publisher-loc>Singapore</publisher-loc>. p. <fpage>2511</fpage>&#x2013;<lpage>22</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/2023.emnlp-main.153</pub-id>.</mixed-citation></ref>
<ref id="ref-4"><label>[4]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Chiang</surname> <given-names>CH</given-names></string-name>, <string-name><surname>Lee</surname> <given-names>HY</given-names></string-name></person-group>. <article-title>Can large language models be an alternative to human evaluations?</article-title> In: <conf-name>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics; 2023 July 9&#x2013;14</conf-name>; <publisher-loc>Toronto, ON, Canada</publisher-loc>. p. <fpage>15607</fpage>&#x2013;<lpage>31</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/2023.acl-long.870</pub-id>.</mixed-citation></ref>
<ref id="ref-5"><label>[5]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Wu</surname> <given-names>W</given-names></string-name>, <string-name><surname>Huang</surname> <given-names>S</given-names></string-name>, <string-name><surname>Jiang</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Xie</surname> <given-names>P</given-names></string-name>, <string-name><surname>Huang</surname> <given-names>F</given-names></string-name>, <string-name><surname>Zhao</surname> <given-names>H</given-names></string-name></person-group>. <chapter-title>Unfolding the headline: iterative self-questioning for news retrieval and timeline summarization</chapter-title>. In: <person-group person-group-type="editor"><string-name><surname>Chiruzzo</surname> <given-names>L</given-names></string-name>, <string-name><surname>Ritter</surname> <given-names>A</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>L</given-names></string-name></person-group>, editors. <source>Findings of the association for computational linguistics: NAACL 2025</source>. <publisher-loc>Stroudsburg, PA, USA</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>; <year>2025</year>. p. <fpage>4385</fpage>&#x2013;<lpage>98</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/2025.findings-naacl.248</pub-id>.</mixed-citation></ref>
<ref id="ref-6"><label>[6]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Yang</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Fan</surname> <given-names>C</given-names></string-name>, <string-name><surname>Gong</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Yeoh</surname> <given-names>W</given-names></string-name>, <string-name><surname>Li</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>Forwarding in social media: forecasting popularity of public opinion with deep learning</article-title>. <source>IEEE Trans Comput Soc Syst</source>. <year>2025</year>;<volume>12</volume>(<issue>2</issue>):<fpage>749</fpage>&#x2013;<lpage>63</lpage>. doi:<pub-id pub-id-type="doi">10.1109/TCSS.2024.3468721</pub-id>.</mixed-citation></ref>
<ref id="ref-7"><label>[7]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Liu</surname> <given-names>J</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>L</given-names></string-name>, <string-name><surname>Tu</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Li</surname> <given-names>S</given-names></string-name>, <string-name><surname>Li</surname> <given-names>Z</given-names></string-name></person-group>. <article-title>Multi-stage Internet public opinion risk grading analysis of public health emergencies: an empirical study on Microblog in COVID-19</article-title>. <source>Inf Process Manag</source>. <year>2022</year>;<volume>59</volume>(<issue>1</issue>):<fpage>102796</fpage>. doi:<pub-id pub-id-type="doi">10.1016/j.ipm.2021.102796</pub-id>; <pub-id pub-id-type="pmid">34744256</pub-id></mixed-citation></ref>
<ref id="ref-8"><label>[8]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Liu</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Son</surname> <given-names>H</given-names></string-name></person-group>. <article-title>The memory cycle of time-series public opinion data: validation based on deep learning prediction</article-title>. <source>Inf Process Manag</source>. <year>2025</year>;<volume>62</volume>(<issue>4</issue>):<fpage>104168</fpage>. doi:<pub-id pub-id-type="doi">10.1016/j.ipm.2025.104168</pub-id>.</mixed-citation></ref>
<ref id="ref-9"><label>[9]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Jiang</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Liang</surname> <given-names>R</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>J</given-names></string-name>, <string-name><surname>Sun</surname> <given-names>J</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Qian</surname> <given-names>Y</given-names></string-name></person-group>. <article-title>Network public opinion detection during the coronavirus pandemic: a short-text relational topic model</article-title>. <source>ACM Trans Knowl Discov Data</source>. <year>2022</year>;<volume>16</volume>(<issue>3</issue>):<fpage>1</fpage>&#x2013;<lpage>27</lpage>. doi:<pub-id pub-id-type="doi">10.1145/3480246</pub-id>.</mixed-citation></ref>
<ref id="ref-10"><label>[10]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Wang</surname> <given-names>B</given-names></string-name>, <string-name><surname>Zhao</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Lu</surname> <given-names>X</given-names></string-name>, <string-name><surname>Qin</surname> <given-names>B</given-names></string-name></person-group>. <article-title>Cognitive distortion based explainable depression detection and analysis technologies for the adolescent internet users on social media</article-title>. <source>Front Public Health</source>. <year>2023</year>;<volume>10</volume>:<fpage>1045777</fpage>. doi:<pub-id pub-id-type="doi">10.3389/fpubh.2022.1045777</pub-id>; <pub-id pub-id-type="pmid">36733285</pub-id></mixed-citation></ref>
<ref id="ref-11"><label>[11]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Liu</surname> <given-names>S</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>S</given-names></string-name>, <string-name><surname>Wu</surname> <given-names>P</given-names></string-name>, <string-name><surname>Wu</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Zhou</surname> <given-names>L</given-names></string-name>, <string-name><surname>Deveci</surname> <given-names>M</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>An integrated CRITIC-EDAS approach for assessing enterprise crisis management effectiveness based on Weibo</article-title>. <source>J Contingencies Crisis Manag</source>. <year>2024</year>;<volume>32</volume>(<issue>2</issue>):<fpage>e12572</fpage>. doi:<pub-id pub-id-type="doi">10.1111/1468-5973.12572</pub-id>.</mixed-citation></ref>
<ref id="ref-12"><label>[12]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Olteanu</surname> <given-names>A</given-names></string-name>, <string-name><surname>Vieweg</surname> <given-names>S</given-names></string-name>, <string-name><surname>Castillo</surname> <given-names>C</given-names></string-name></person-group>. <article-title>What to expect when the unexpected happens: social media communications across crises</article-title>. In: <conf-name>Proceedings of the 18th ACM Conference on Computer Supported Cooperative Work &#x0026; Social Computing; 2015 Mar 14&#x2013;18</conf-name>; <publisher-loc>Vancouver, BC, Canada</publisher-loc>. p. <fpage>994</fpage>&#x2013;<lpage>1009</lpage>. doi:<pub-id pub-id-type="doi">10.1145/2675133.2675242</pub-id>.</mixed-citation></ref>
<ref id="ref-13"><label>[13]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Imran</surname> <given-names>M</given-names></string-name>, <string-name><surname>Mitra</surname> <given-names>P</given-names></string-name>, <string-name><surname>Castillo</surname> <given-names>C</given-names></string-name></person-group>. <article-title>Twitter as a lifeline: human-annotated Twitter corpora for NLP of crisis-related messages</article-title>. In: <conf-name>Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC&#x2019;16); 2016 May 23&#x2013;28</conf-name>; <publisher-loc>Portoro&#x017E;, Slovenia</publisher-loc>. p. <fpage>1638</fpage>&#x2013;<lpage>43</lpage>.</mixed-citation></ref>
<ref id="ref-14"><label>[14]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Alam</surname> <given-names>F</given-names></string-name>, <string-name><surname>Ofli</surname> <given-names>F</given-names></string-name>, <string-name><surname>Imran</surname> <given-names>M</given-names></string-name></person-group>. <article-title>CrisisMMD: multimodal Twitter datasets from natural disasters</article-title>. <source>Proc Int AAAI Conf Web Soc Media</source>. <year>2018</year>;<volume>12</volume>(<issue>1</issue>):<fpage>465</fpage>&#x2013;<lpage>73</lpage>. doi:<pub-id pub-id-type="doi">10.1609/icwsm.v12i1.14983</pub-id>.</mixed-citation></ref>
<ref id="ref-15"><label>[15]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Suwaileh</surname> <given-names>R</given-names></string-name>, <string-name><surname>Elsayed</surname> <given-names>T</given-names></string-name>, <string-name><surname>Imran</surname> <given-names>M</given-names></string-name></person-group>. <article-title>IDRISI-RE: a generalizable dataset with benchmarks for location mention recognition on disaster tweets</article-title>. <source>Inf Process Manag</source>. <year>2023</year>;<volume>60</volume>(<issue>3</issue>):<fpage>103340</fpage>. doi:<pub-id pub-id-type="doi">10.1016/j.ipm.2023.103340</pub-id>.</mixed-citation></ref>
<ref id="ref-16"><label>[16]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Alam</surname> <given-names>F</given-names></string-name>, <string-name><surname>Sajjad</surname> <given-names>H</given-names></string-name>, <string-name><surname>Imran</surname> <given-names>M</given-names></string-name>, <string-name><surname>Ofli</surname> <given-names>F</given-names></string-name></person-group>. <chapter-title>CrisisBench: benchmarking crisis-related social media datasets for humanitarian information processing</chapter-title>. <source>Proc Int AAAI Conf Web Soc Media</source>. <year>2021</year>;<volume>15</volume>:<fpage>923</fpage>&#x2013;<lpage>32</lpage>. doi:<pub-id pub-id-type="doi">10.1609/icwsm.v15i1.18115</pub-id>.</mixed-citation></ref>
<ref id="ref-17"><label>[17]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Delforge</surname> <given-names>D</given-names></string-name>, <string-name><surname>Wathelet</surname> <given-names>V</given-names></string-name>, <string-name><surname>Below</surname> <given-names>R</given-names></string-name>, <string-name><surname>Sofia</surname> <given-names>CL</given-names></string-name>, <string-name><surname>Tonnelier</surname> <given-names>M</given-names></string-name>, <string-name><surname>van Loenhout</surname> <given-names>JAF</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>EM-DAT: the emergency events database</article-title>. <source>Int J Disaster Risk Reduct</source>. <year>2025</year>;<volume>124</volume>:<fpage>105509</fpage>. doi:<pub-id pub-id-type="doi">10.1016/j.ijdrr.2025.105509</pub-id>.</mixed-citation></ref>
<ref id="ref-18"><label>[18]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Lin</surname> <given-names>J</given-names></string-name>, <string-name><surname>Ma</surname> <given-names>X</given-names></string-name>, <string-name><surname>Lin</surname> <given-names>SC</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>JH</given-names></string-name>, <string-name><surname>Pradeep</surname> <given-names>R</given-names></string-name>, <string-name><surname>Nogueira</surname> <given-names>R</given-names></string-name></person-group>. <article-title>Pyserini: a Python toolkit for reproducible information retrieval research with sparse and dense representations</article-title>. In: <conf-name>Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval; 2021 Jul 11&#x2013;15; Online</conf-name>. p. <fpage>2356</fpage>&#x2013;<lpage>62</lpage>. doi:<pub-id pub-id-type="doi">10.1145/3404835.3463238</pub-id>.</mixed-citation></ref>
<ref id="ref-19"><label>[19]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Bai</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>J</given-names></string-name>, <string-name><surname>Lv</surname> <given-names>X</given-names></string-name>, <string-name><surname>Zheng</surname> <given-names>L</given-names></string-name>, <string-name><surname>Zhu</surname> <given-names>S</given-names></string-name>, <string-name><surname>Hou</surname> <given-names>L</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>LongWriter: unleashing 10,000&#x002B; word generation from long context LLMs</article-title>. In: <conf-name>Proceedings of the 13th International Conference on Learning Representations; 2025 Apr 24&#x2013;28</conf-name>; <publisher-loc>Vienna, Austria</publisher-loc>. p. <fpage>36528</fpage>&#x2013;<lpage>46</lpage>.</mixed-citation></ref>
<ref id="ref-20"><label>[20]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Liu</surname> <given-names>R</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>M</given-names></string-name>, <string-name><surname>Yu</surname> <given-names>M</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>H</given-names></string-name>, <string-name><surname>Jiang</surname> <given-names>J</given-names></string-name>, <string-name><surname>Li</surname> <given-names>G</given-names></string-name>, <etal>et al</etal></person-group>. <chapter-title>SumSurvey: an abstractive dataset of scientific survey papers for long document summarization</chapter-title>. In: <person-group person-group-type="editor"><string-name><surname>Ku</surname> <given-names>LW</given-names></string-name>, <string-name><surname>Martins</surname> <given-names>A</given-names></string-name>, <string-name><surname>Srikumar</surname> <given-names>V</given-names></string-name></person-group>, editors. <source>Findings of the association for computational linguistics ACL 2024</source>. <publisher-loc>Stroudsburg, PA, USA</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>; <year>2024</year>. p. <fpage>9632</fpage>&#x2013;<lpage>51</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/2024.findings-acl.574</pub-id>.</mixed-citation></ref>
<ref id="ref-21"><label>[21]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><surname>Bilal</surname> <given-names>IM</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>B</given-names></string-name>, <string-name><surname>Tsakalidis</surname> <given-names>A</given-names></string-name>, <string-name><surname>Nguyen</surname> <given-names>D</given-names></string-name>, <string-name><surname>Procter</surname> <given-names>R</given-names></string-name>, <string-name><surname>Liakata</surname> <given-names>M</given-names></string-name></person-group>. <article-title>Template-based abstractive microblog opinion summarization</article-title>. <source>Trans Assoc Comput Linguist</source>. <year>2022</year>;<volume>10</volume>(<issue>1</issue>):<fpage>1229</fpage>&#x2013;<lpage>48</lpage>. doi:<pub-id pub-id-type="doi">10.1162/tacl_a_00516</pub-id>.</mixed-citation></ref>
<ref id="ref-22"><label>[22]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Angelidis</surname> <given-names>S</given-names></string-name>, <string-name><surname>Lapata</surname> <given-names>M</given-names></string-name></person-group>. <article-title>Summarizing opinions: aspect extraction meets sentiment prediction and they are both weakly supervised</article-title>. In: <conf-name>Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing; 2018 Oct 31&#x2013;Nov 4</conf-name>; <publisher-loc>Brussels, Belgium</publisher-loc>. p. <fpage>3675</fpage>&#x2013;<lpage>86</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/d18-1403</pub-id>.</mixed-citation></ref>
<ref id="ref-23"><label>[23]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Nallapati</surname> <given-names>R</given-names></string-name>, <string-name><surname>Zhou</surname> <given-names>B</given-names></string-name>, <string-name><surname>dos Santos</surname> <given-names>C</given-names></string-name>, <string-name><surname>Gulcehre</surname> <given-names>C</given-names></string-name>, <string-name><surname>Xiang</surname> <given-names>B</given-names></string-name></person-group>. <article-title>Abstractive text summarization using sequence-to-sequence RNNs and beyond</article-title>. In: <conf-name>Proceedings of the 20th SIGNLL Conference on Computational Natural Language Learning; 2016 Aug 11&#x2013;12</conf-name>; <publisher-loc>Berlin/Heidelberg, Germany</publisher-loc>. p. <fpage>280</fpage>&#x2013;<lpage>90</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/k16-1028</pub-id>.</mixed-citation></ref>
<ref id="ref-24"><label>[24]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Huang</surname> <given-names>KH</given-names></string-name>, <string-name><surname>Laban</surname> <given-names>P</given-names></string-name>, <string-name><surname>Fabbri</surname> <given-names>A</given-names></string-name>, <string-name><surname>Choubey</surname> <given-names>PK</given-names></string-name>, <string-name><surname>Joty</surname> <given-names>S</given-names></string-name>, <string-name><surname>Xiong</surname> <given-names>C</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>Embrace divergence for richer insights: a multi-document summarization benchmark and a case study on summarizing diverse information from news articles</article-title>. In: <conf-name>Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies; 2016 Aug 11&#x2013;12</conf-name>; <publisher-loc>Mexico City, Mexico</publisher-loc>. p. <fpage>570</fpage>&#x2013;<lpage>93</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/2024.naacl-long.32</pub-id>.</mixed-citation></ref>
<ref id="ref-25"><label>[25]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Mao</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Qu</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Xie</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Ren</surname> <given-names>X</given-names></string-name>, <string-name><surname>Han</surname> <given-names>J</given-names></string-name></person-group>. <article-title>Multi-document summarization with maximal marginal relevance-guided reinforcement learning</article-title>. In: <conf-name>Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP); 2020 Nov 16&#x2013;20; Online</conf-name>. p. <fpage>1737</fpage>&#x2013;<lpage>51</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/2020.emnlp-main.136</pub-id>.</mixed-citation></ref>
<ref id="ref-26"><label>[26]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Zheng</surname> <given-names>X</given-names></string-name>, <string-name><surname>Sun</surname> <given-names>A</given-names></string-name>, <string-name><surname>Li</surname> <given-names>J</given-names></string-name>, <string-name><surname>Muthuswamy</surname> <given-names>K</given-names></string-name></person-group>. <article-title>Subtopic-driven multi-document summarization</article-title>. In: <conf-name>Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP); 2019 Nov 3&#x2013;7</conf-name>; <publisher-loc>Hong Kong, China</publisher-loc>. p. <fpage>3151</fpage>&#x2013;<lpage>60</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/d19-1311</pub-id>.</mixed-citation></ref>
<ref id="ref-27"><label>[27]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Chen</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Qin</surname> <given-names>L</given-names></string-name>, <string-name><surname>Guo</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Lv</surname> <given-names>H</given-names></string-name>, <string-name><surname>Zou</surname> <given-names>Y</given-names></string-name>, <etal>et al</etal></person-group>. <chapter-title>What are the essential factors in crafting effective long context multi-hop instruction datasets? Insights and best practices</chapter-title>. In: <person-group person-group-type="editor"><string-name><surname>Che</surname> <given-names>W</given-names></string-name>, <string-name><surname>Nabende</surname> <given-names>J</given-names></string-name>, <string-name><surname>Shutova</surname> <given-names>E</given-names></string-name>, <string-name><surname>Pilehvar</surname> <given-names>MT</given-names></string-name></person-group>, editors. <source>Proceedings of the 63rd annual meeting of the association for computational linguistics (Volume 1: Long Papers)</source>. <publisher-loc>Stroudsburg, PA, USA</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>; <year>2025</year>. p. <fpage>27129</fpage>&#x2013;<lpage>51</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/2025.acl-long.1316</pub-id>.</mixed-citation></ref>
<ref id="ref-28"><label>[28]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Ye</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Feng</surname> <given-names>X</given-names></string-name>, <string-name><surname>Feng</surname> <given-names>X</given-names></string-name>, <string-name><surname>Ma</surname> <given-names>W</given-names></string-name>, <string-name><surname>Qin</surname> <given-names>L</given-names></string-name>, <string-name><surname>Xu</surname> <given-names>D</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>GlobeSumm: a challenging benchmark towards unifying multi-lingual, cross-lingual and multi-document news summarization</article-title>. In: <conf-name>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing; 2024 Nov 12&#x2013;16</conf-name>; <publisher-loc>Miami, FL, USA</publisher-loc>. p. <fpage>10803</fpage>&#x2013;<lpage>21</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/2024.emnlp-main.603</pub-id>.</mixed-citation></ref>
<ref id="ref-29"><label>[29]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Ernst</surname> <given-names>O</given-names></string-name>, <string-name><surname>Caciularu</surname> <given-names>A</given-names></string-name>, <string-name><surname>Shapira</surname> <given-names>O</given-names></string-name>, <string-name><surname>Pasunuru</surname> <given-names>R</given-names></string-name>, <string-name><surname>Bansal</surname> <given-names>M</given-names></string-name>, <string-name><surname>Goldberger</surname> <given-names>J</given-names></string-name>, <etal>et al</etal></person-group>. <chapter-title>Proposition-level clustering for multi-document summarization</chapter-title>. In: <person-group person-group-type="editor"><string-name><surname>Carpuat</surname> <given-names>M</given-names></string-name>, <string-name><surname>de Marneffe</surname> <given-names>MC</given-names></string-name>, <string-name><surname>Ruiz</surname> <given-names>Meza</given-names> <suffix>IV</suffix></string-name></person-group>, editors. <source>Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</source>. <publisher-loc>Stroudsburg, PA, USA</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>; <year>2022</year>. p. <fpage>1765</fpage>&#x2013;<lpage>79</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/2022.naacl-main.128</pub-id>.</mixed-citation></ref>
<ref id="ref-30"><label>[30]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Cho</surname> <given-names>S</given-names></string-name>, <string-name><surname>Song</surname> <given-names>K</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>X</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>F</given-names></string-name>, <string-name><surname>Yu</surname> <given-names>D</given-names></string-name></person-group>. <article-title>Toward unifying text segmentation and long document summarization</article-title>. In: <conf-name>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing; 2022 Dec 7&#x2013;11; Abu Dhabi, United Arab Emirates</conf-name>. p. <fpage>106</fpage>&#x2013;<lpage>18</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/2022.emnlp-main.8</pub-id>.</mixed-citation></ref>
<ref id="ref-31"><label>[31]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Bai</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Lv</surname> <given-names>X</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>J</given-names></string-name>, <string-name><surname>Lyu</surname> <given-names>H</given-names></string-name>, <string-name><surname>Tang</surname> <given-names>J</given-names></string-name>, <string-name><surname>Huang</surname> <given-names>Z</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>LongBench: a bilingual, multitask benchmark for long context understanding</article-title>. In: <conf-name>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguisticss; 2024 Aug 11&#x2013;16</conf-name>; <publisher-loc>Bangkok, Thailand</publisher-loc>. p. <fpage>3119</fpage>&#x2013;<lpage>37</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/2024.acl-long.172</pub-id>.</mixed-citation></ref>
<ref id="ref-32"><label>[32]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Giorgi</surname> <given-names>J</given-names></string-name>, <string-name><surname>Soldaini</surname> <given-names>L</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>B</given-names></string-name>, <string-name><surname>Bader</surname> <given-names>G</given-names></string-name>, <string-name><surname>Lo</surname> <given-names>K</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>L</given-names></string-name>, <etal>et al</etal></person-group>. <chapter-title>Open domain multi-document summarization: a comprehensive study of model brittleness under retrieval</chapter-title>. In: <person-group person-group-type="editor"><string-name><surname>Bouamor</surname> <given-names>H</given-names></string-name>, <string-name><surname>Pino</surname> <given-names>J</given-names></string-name>, <string-name><surname>Bali</surname> <given-names>K</given-names></string-name></person-group>, editors. <source>Findings of the association for computational linguistics: EMNLP 2023</source>. <publisher-loc>Stroudsburg, PA, USA</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>; <year>2023</year>. p. <fpage>8177</fpage>&#x2013;<lpage>99</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/2023.findings-emnlp.549</pub-id>.</mixed-citation></ref>
<ref id="ref-33"><label>[33]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>An</surname> <given-names>C</given-names></string-name>, <string-name><surname>Gong</surname> <given-names>S</given-names></string-name>, <string-name><surname>Zhong</surname> <given-names>M</given-names></string-name>, <string-name><surname>Zhao</surname> <given-names>X</given-names></string-name>, <string-name><surname>Li</surname> <given-names>M</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>J</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>L-eval: instituting standardized evaluation for long context language models</article-title>. In: <conf-name>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics; 2024 Aug 11&#x2013;16</conf-name>; <publisher-loc>Bangkok, Thailand</publisher-loc>. p. <fpage>14388</fpage>&#x2013;<lpage>411</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/2024.acl-long.776</pub-id>.</mixed-citation></ref>
<ref id="ref-34"><label>[34]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Fabbri</surname> <given-names>A</given-names></string-name>, <string-name><surname>Li</surname> <given-names>I</given-names></string-name>, <string-name><surname>She</surname> <given-names>T</given-names></string-name>, <string-name><surname>Li</surname> <given-names>S</given-names></string-name>, <string-name><surname>Radev</surname> <given-names>D</given-names></string-name></person-group>. <article-title>Multi-news: a large-scale multi-document summarization dataset and abstractive hierarchical model</article-title>. In: <conf-name>Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics; 2019 Jul 28&#x2013;Aug 2</conf-name>; <publisher-loc>Florence, Italy</publisher-loc>. p. <fpage>1074</fpage>&#x2013;<lpage>84</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/p19-1102</pub-id>.</mixed-citation></ref>
<ref id="ref-35"><label>[35]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Krishna</surname> <given-names>K</given-names></string-name>, <string-name><surname>Roy</surname> <given-names>A</given-names></string-name>, <string-name><surname>Iyyer</surname> <given-names>M</given-names></string-name></person-group>. <article-title>Hurdles to progress in long-form question answering</article-title>. In: <conf-name>Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies; 2021 Jun 6&#x2013;11; Online</conf-name>. p. <fpage>4940</fpage>&#x2013;<lpage>57</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/2021.naacl-main.393</pub-id>.</mixed-citation></ref>
<ref id="ref-36"><label>[36]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Ni</surname> <given-names>X</given-names></string-name>, <string-name><surname>Cai</surname> <given-names>H</given-names></string-name>, <string-name><surname>Wei</surname> <given-names>X</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>S</given-names></string-name>, <string-name><surname>Yin</surname> <given-names>D</given-names></string-name>, <string-name><surname>Li</surname> <given-names>P</given-names></string-name></person-group>. <article-title>XL<sup>2</sup>Bench: a benchmark for extremely long context understanding with long-range dependencies</article-title>. <comment>arXiv:2404.05446. 2024</comment>.</mixed-citation></ref>
<ref id="ref-37"><label>[37]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Xu</surname> <given-names>F</given-names></string-name>, <string-name><surname>Song</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Iyyer</surname> <given-names>M</given-names></string-name>, <string-name><surname>Choi</surname> <given-names>E</given-names></string-name></person-group>. <article-title>A critical evaluation of evaluations for long-form question answering</article-title>. In: <conf-name>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics; 2023 Jul 9&#x2013;14</conf-name>; <publisher-loc>Toronto, ON, Canada</publisher-loc>. p. <fpage>3225</fpage>&#x2013;<lpage>45</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/2023.acl-long.181</pub-id>.</mixed-citation></ref>
<ref id="ref-38"><label>[38]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Wang</surname> <given-names>D</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>K</given-names></string-name>, <string-name><surname>Zhu</surname> <given-names>H</given-names></string-name>, <string-name><surname>Yang</surname> <given-names>X</given-names></string-name>, <string-name><surname>Cohen</surname> <given-names>A</given-names></string-name>, <string-name><surname>Li</surname> <given-names>L</given-names></string-name>, <etal>et al</etal></person-group>. <chapter-title>Learning personalized alignment for evaluating open-ended text generation</chapter-title>. In: <person-group person-group-type="editor"><string-name><surname>Al-Onaizan</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Bansal</surname> <given-names>M</given-names></string-name>, <string-name><surname>Chen</surname> <given-names>YN</given-names></string-name></person-group>, editors. <source>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</source>. <publisher-loc>Stroudsburg, PA, USA</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>; <year>2024</year>. p. <fpage>13274</fpage>&#x2013;<lpage>92</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/2024.emnlp-main.737</pub-id>.</mixed-citation></ref>
<ref id="ref-39"><label>[39]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Shen</surname> <given-names>C</given-names></string-name>, <string-name><surname>Cheng</surname> <given-names>L</given-names></string-name>, <string-name><surname>Nguyen</surname> <given-names>XP</given-names></string-name>, <string-name><surname>You</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Bing</surname> <given-names>L</given-names></string-name></person-group>. <chapter-title>Large language models are not yet human-level evaluators for abstractive summarization</chapter-title>. In: <person-group person-group-type="editor"><string-name><surname>Bouamor</surname> <given-names>H</given-names></string-name>, <string-name><surname>Pino</surname> <given-names>J</given-names></string-name>, <string-name><surname>Bali</surname> <given-names>K</given-names></string-name></person-group>, editors. <source>Findings of the Association for Computational Linguistics: EMNLP 2023</source>. <publisher-loc>Stroudsburg, PA, USA</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>; <year>2023</year>. p. <fpage>4215</fpage>&#x2013;<lpage>33</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/2023.findings-emnlp.278</pub-id>.</mixed-citation></ref>
<ref id="ref-40"><label>[40]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Liu</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Yu</surname> <given-names>J</given-names></string-name>, <string-name><surname>Xu</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Li</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Zhu</surname> <given-names>Q</given-names></string-name></person-group>. <article-title>A survey on transformer context extension: approaches and evaluation</article-title>. <comment>arXiv:2503.13299. 2025</comment>.</mixed-citation></ref>
<ref id="ref-41"><label>[41]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Li</surname> <given-names>J</given-names></string-name>, <string-name><surname>Wang</surname> <given-names>M</given-names></string-name>, <string-name><surname>Zheng</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Zhang</surname> <given-names>M</given-names></string-name></person-group>. <chapter-title>LooGLE: can long-context language models understand long contexts?</chapter-title> In: <article-title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics; 2024 Aug 11&#x2013;16</article-title>; <publisher-loc>Bangkok, Thailand</publisher-loc>. p. <fpage>16304</fpage>&#x2013;<lpage>33</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/2024.acl-long.859</pub-id>.</mixed-citation></ref>
<ref id="ref-42"><label>[42]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Calderon</surname> <given-names>N</given-names></string-name>, <string-name><surname>Reichart</surname> <given-names>R</given-names></string-name>, <string-name><surname>Dror</surname> <given-names>R</given-names></string-name></person-group>. <chapter-title>The alternative annotator test for LLM-as-a-judge: how to statistically justify replacing human annotators with LLMs</chapter-title>. In: <person-group person-group-type="editor"><string-name><surname>Che</surname> <given-names>W</given-names></string-name>, <string-name><surname>Nabende</surname> <given-names>J</given-names></string-name>, <string-name><surname>Shutova</surname> <given-names>E</given-names></string-name>, <string-name><surname>Pilehvar</surname> <given-names>MT</given-names></string-name></person-group>, editors. <source>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</source>. <publisher-loc>Stroudsburg, PA, USA</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>; <year>2025</year>. p. <fpage>16051</fpage>&#x2013;<lpage>81</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/2025.acl-long.782</pub-id>.</mixed-citation></ref>
<ref id="ref-43"><label>[43]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Tan</surname> <given-names>H</given-names></string-name>, <string-name><surname>Guo</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Shi</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Xu</surname> <given-names>L</given-names></string-name>, <string-name><surname>Liu</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Feng</surname> <given-names>Y</given-names></string-name>, <etal>et al</etal></person-group>. <chapter-title>ProxyQA: an alternative framework for evaluating long-form text generation with large language models</chapter-title>. In: <person-group person-group-type="editor"><string-name><surname>Ku</surname> <given-names>LW</given-names></string-name>, <string-name><surname>Martins</surname> <given-names>A</given-names></string-name>, <string-name><surname>Srikumar</surname> <given-names>V</given-names></string-name></person-group>, editors. <source>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (volume 1: Long Papers)</source>. <publisher-loc>Stroudsburg, PA, USA</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>; <year>2024</year>. p. <fpage>6806</fpage>&#x2013;<lpage>27</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/2024.acl-long.368</pub-id>.</mixed-citation></ref>
<ref id="ref-44"><label>[44]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Chiang</surname> <given-names>CH</given-names></string-name>, <string-name><surname>Lee</surname> <given-names>HY</given-names></string-name>, <string-name><surname>Lukasik</surname> <given-names>M</given-names></string-name></person-group>. <chapter-title>TRACT: regression-aware fine-tuning meets chain-of-thought reasoning for LLM-as-a-judge</chapter-title>. In: <person-group person-group-type="editor"><string-name><surname>Che</surname> <given-names>W</given-names></string-name>, <string-name><surname>Nabende</surname> <given-names>J</given-names></string-name>, <string-name><surname>Shutova</surname> <given-names>E</given-names></string-name>, <string-name><surname>Pilehvar</surname> <given-names>MT</given-names></string-name></person-group>, editors. <source>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</source>. <publisher-loc>Stroudsburg, PA, USA</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>; <year>2025</year>. p. <fpage>2934</fpage>&#x2013;<lpage>52</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/2025.acl-long.147</pub-id>.</mixed-citation></ref>
<ref id="ref-45"><label>[45]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Luo</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Xie</surname> <given-names>Q</given-names></string-name>, <string-name><surname>Ananiadou</surname> <given-names>S</given-names></string-name></person-group>. <article-title>ChatGPT as a factual inconsistency evaluator for text summarization</article-title>. <comment>arXiv:2303.15621. 2023</comment>.</mixed-citation></ref>
<ref id="ref-46"><label>[46]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>D&#x2019;Souza</surname> <given-names>J</given-names></string-name>, <string-name><surname>Babaei Giglou</surname> <given-names>H</given-names></string-name>, <string-name><surname>M&#x00FC;nch</surname> <given-names>Q</given-names></string-name></person-group>. <chapter-title>YESciEval: robust LLM-as-a-judge for scientific question answering</chapter-title>. In: <person-group person-group-type="editor"><string-name><surname>Che</surname> <given-names>W</given-names></string-name>, <string-name><surname>Nabende</surname> <given-names>J</given-names></string-name>, <string-name><surname>Shutova</surname> <given-names>E</given-names></string-name>, <string-name><surname>Pilehvar</surname> <given-names>MT</given-names></string-name></person-group>, editors. <source>Proceedings of the 63rd annual meeting of the association for computational linguistics (volume 1: long papers)</source>. <publisher-loc>Stroudsburg, PA, USA</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>; <year>2025</year>. p. <fpage>13749</fpage>&#x2013;<lpage>83</lpage>. doi:<pub-id pub-id-type="doi">10.18653/v1/2025.acl-long.675</pub-id>.</mixed-citation></ref>
<ref id="ref-47"><label>[47]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><surname>Zheng</surname> <given-names>L</given-names></string-name>, <string-name><surname>Chiang</surname> <given-names>WL</given-names></string-name>, <string-name><surname>Sheng</surname> <given-names>Y</given-names></string-name>, <string-name><surname>Zhuang</surname> <given-names>S</given-names></string-name>, <string-name><surname>Wu</surname> <given-names>Z</given-names></string-name>, <string-name><surname>Zhuang</surname> <given-names>Y</given-names></string-name>, <etal>et al</etal></person-group>. <article-title>Judging LLM-as-a-judge with MT-bench and chatbot arena</article-title>. <comment>arXiv:2306.05685. 2023</comment>.</mixed-citation></ref>
<ref id="ref-48"><label>[48]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><surname>Wu</surname> <given-names>N</given-names></string-name>, <string-name><surname>Gong</surname> <given-names>M</given-names></string-name>, <string-name><surname>Shou</surname> <given-names>L</given-names></string-name>, <string-name><surname>Liang</surname> <given-names>S</given-names></string-name>, <string-name><surname>Jiang</surname> <given-names>D</given-names></string-name></person-group>. <chapter-title>Large language models are diverse role-players for summarization evaluation</chapter-title>. In: <source>Natural language processing and Chinese computing</source>. <publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer</publisher-name>; <year>2023</year>. p. <fpage>695</fpage>&#x2013;<lpage>707</lpage>. doi:<pub-id pub-id-type="doi">10.1007/978-3-031-44693-1_54</pub-id>.</mixed-citation></ref>
<ref id="ref-49"><label>[49]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><surname>Kocmi</surname> <given-names>T</given-names></string-name>, <string-name><surname>Federmann</surname> <given-names>C</given-names></string-name></person-group>. <article-title>Large language models are state-of-the-art evaluators of translation quality</article-title>. In: <conf-name>Proceedings of the 24th Annual Conference of the European Association for Machine Translation; 2023 Jun 12&#x2013;15</conf-name>; <publisher-loc>Tampere, Finland</publisher-loc>. p. <fpage>193</fpage>&#x2013;<lpage>203</lpage>.</mixed-citation></ref>
</ref-list>
</back></article>