<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1 20151215//EN" "http://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xml:lang="en" article-type="research-article" dtd-version="1.1">
<front>
<journal-meta>
<journal-id journal-id-type="pmc">CMC</journal-id>
<journal-id journal-id-type="nlm-ta">CMC</journal-id>
<journal-id journal-id-type="publisher-id">CMC</journal-id>
<journal-title-group>
<journal-title>Computers, Materials &#x0026; Continua</journal-title>
</journal-title-group>
<issn pub-type="epub">1546-2226</issn>
<issn pub-type="ppub">1546-2218</issn>
<publisher>
<publisher-name>Tech Science Press</publisher-name>
<publisher-loc>USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">39228</article-id>
<article-id pub-id-type="doi">10.32604/cmc.2023.039228</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Article</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Thalassemia Screening by Sentiment Analysis on Social Media Platform Twitter</article-title>
<alt-title alt-title-type="left-running-head">Thalassemia Screening by Sentiment Analysis on Social Media Platform Twitter</alt-title>
<alt-title alt-title-type="right-running-head">Thalassemia Screening by Sentiment Analysis on Social Media Platform Twitter</alt-title>
</title-group>
<contrib-group>
<contrib id="author-1" contrib-type="author">
<name name-style="western"><surname>Aqlan</surname><given-names>Wadhah Mohammed M.</given-names></name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
<contrib id="author-2" contrib-type="author" corresp="yes">
<name name-style="western"><surname>Ali</surname><given-names>Ghassan Ahmed</given-names></name><xref ref-type="aff" rid="aff-2">2</xref><email>Alhabeb@gmail.com</email></contrib>
<contrib id="author-3" contrib-type="author">
<name name-style="western"><surname>Rajab</surname><given-names>Khairan</given-names></name><xref ref-type="aff" rid="aff-2">2</xref></contrib>
<contrib id="author-4" contrib-type="author">
<name name-style="western"><surname>Rajab</surname><given-names>Adel</given-names></name><xref ref-type="aff" rid="aff-2">2</xref></contrib>
<contrib id="author-5" contrib-type="author">
<name name-style="western"><surname>Shaikh</surname><given-names>Asadullah</given-names></name><xref ref-type="aff" rid="aff-2">2</xref></contrib>
<contrib id="author-6" contrib-type="author">
<name name-style="western"><surname>Olayah</surname><given-names>Fekry</given-names></name><xref ref-type="aff" rid="aff-2">2</xref></contrib>
<contrib id="author-7" contrib-type="author" corresp="yes">
<name name-style="western"><surname>Alzaeemi</surname><given-names>Shehab Abdulhabib Saeed</given-names></name><xref ref-type="aff" rid="aff-3">3</xref><email>shehab@uthm.edu.my</email></contrib>
<contrib id="author-8" contrib-type="author">
<name name-style="western"><surname>Tay</surname><given-names>Kim Gaik</given-names></name><xref ref-type="aff" rid="aff-3">3</xref></contrib>
<contrib id="author-9" contrib-type="author">
<name name-style="western"><surname>Omar</surname><given-names>Mohd Adib</given-names></name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
<contrib id="author-10" contrib-type="author">
<name name-style="western"><surname>Mangantig</surname><given-names>Ernest</given-names></name><xref ref-type="aff" rid="aff-4">4</xref></contrib>
<aff id="aff-1"><label>1</label><institution>School of Computer Sciences, Universiti Sains Malaysia, USM</institution>, <addr-line>11800, Penang</addr-line>, <country>Malaysia</country></aff>
<aff id="aff-2"><label>2</label><institution>College of Computer Science and Information Systems, Najran University</institution>, <addr-line>Najran, 61441</addr-line>, <country>Saudi Arabia</country></aff>
<aff id="aff-3"><label>3</label><institution>Faculty of Electrical and Electronic Engineering, Universiti Tun Hussein Onn Malaysia</institution>, <addr-line>86400, Johor</addr-line>, <country>Malaysia</country></aff>
<aff id="aff-4"><label>4</label><institution>IPPT, Universiti Sains Malaysia, USM</institution>, <addr-line>11800, Penang</addr-line>, <country>Malaysia</country></aff>
</contrib-group>
<author-notes>
<corresp id="cor1"><label>&#x002A;</label>Corresponding Authors: Ghassan Ahmed Ali. Email: <email>Alhabeb@gmail.com</email>; Shehab Abdulhabib Saeed Alzaeemi. Email: <email>shehab@uthm.edu.my</email></corresp>
</author-notes>
<pub-date date-type="collection" publication-format="electronic">
<year>2023</year></pub-date>
<pub-date date-type="pub" publication-format="electronic"><day>09</day>
<month>6</month>
<year>2023</year></pub-date>
<volume>76</volume>
<issue>1</issue>
<fpage>665</fpage>
<lpage>686</lpage>
<history>
<date date-type="received">
<day>16</day>
<month>1</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>12</day>
<month>4</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>&#x00A9; 2023 Aqlan et al.</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Aqlan et al.</copyright-holder>
<license xlink:href="https://creativecommons.org/licenses/by/4.0/">
<license-p>This work is licensed under a <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="TSP_CMC_39228.pdf"></self-uri>
<abstract>
<p>Thalassemia syndrome is a genetic blood disorder induced by the reduction of normal hemoglobin production, resulting in a drop in the size of red blood cells. In severe forms, it can lead to death. This genetic disorder has posed a major burden on public health wherein patients with severe thalassemia need periodic therapy of iron chelation and blood transfusion for survival. Therefore, controlling thalassemia is extremely important and is achieved by promoting screening to the general population, particularly among thalassemia carriers. Today, Twitter is one of the most influential social media platforms for sharing opinions and discussing different topics like people&#x2019;s health conditions and major public health affairs. Exploring individuals&#x2019; sentiments in these tweets helps the research centers to formulate strategies to promote thalassemia screening to the public. An effective Lexicon-based approach has been introduced in this study by highlighting a classifier called valence aware dictionary for sentiment reasoning (VADER). The Twitter intelligence tool (TWINT), Natural Language Toolkit (NLTK), and VADER constitute the three main tools applied in this study. VADER represents a gold-standard sentiment lexicon, which is basically tailored to attitudes that are communicated by using social media. The contribution of this study is to introduce an effective Lexicon-based approach by highlighting a classifier called VADER to analyze the sentiment of the general population, particularly among thalassemia carriers on the social media platform Twitter. In this study, the results showed that the proposed approach achieved 0.829, 0.816, and 0.818 for precision, recall, and F-score, respectively. The tweets were crawled using the search keywords &#x201C;thalassemia screening,&#x201D; &#x201C;thalassemia test,&#x201D; and &#x201C;thalassemia diagnosis.&#x201D;
Finally, results showed that India and Pakistan ranked the highest in mentions in tweets by the public&#x2019;s conversations on thalassemia screening with 181 and 164 tweets, respectively.</p>
</abstract>
<kwd-group kwd-group-type="author">
<kwd>Social media platform</kwd>
<kwd>Twitter</kwd>
<kwd>screening</kwd>
<kwd>thalassemia</kwd>
<kwd>lexicon-based</kwd>
<kwd>VADER</kwd>
</kwd-group>
<funding-group>
<award-group id="awg1">
<funding-source>Deanship of Scientific Research at Najran University</funding-source>
<award-id>NU/RC/SERC/11/5</award-id>
</award-group>
</funding-group>
</article-meta>
</front>
<body>
<sec id="s1">
<label>1</label>
<title>Introduction</title>
<p>Thalassemia is a significant global health problem that posed a heavy burden on the affected individual&#x2019;s quality of life and healthcare [<xref ref-type="bibr" rid="ref-1">1</xref>]. Patients with severe forms of thalassemia typically suffer from severe anemia, enlargement of the spleen, retardation of growth, and endocrine systems complications due to the body&#x2019;s inability to produce hemoglobin, a substance that transports oxygen to the whole-body tissue. Moreover, severe thalassemia patients need a long-lasting therapy of blood transfusion, in addition to costly iron chelation for survival [<xref ref-type="bibr" rid="ref-2">2</xref>]. Worldwide estimates revealed that every year more than 50,000 new affected people are inherently born with a severe thalassemia form (i.e., beta&#x2011;thalassemia major, as well as HbE beta&#x2011;thalassemia) and about 80 percent of the infected births take place in several developing countries [<xref ref-type="bibr" rid="ref-3">3</xref>].</p>
<p>For high-risk couples who are both carriers of thalassemia mutation, prenatal thalassemia screening is a must for the expecting mother. However, despite the health care effort for prenatal and premarital thalassemia screening, the incidence of new thalassemia births continues to rise [<xref ref-type="bibr" rid="ref-4">4</xref>]. Accordingly, to control this disease more effectively, screening should be promoted to the general population before marriage. This indicates that more effort is needed to improve thalassemia awareness in the general population. Therefore, understanding the public&#x2019;s opinion and sentiment toward thalassemia screening is important in formulating strategies to promote thalassemia screening to the public. Social media represents a crucial source, whereby the general public&#x2019;s opinion is grasped and, therefore, it has been progressively growing in recent years [<xref ref-type="bibr" rid="ref-5">5</xref>]. Around the globe, people share thoughts and communicate a wide range of topics like health conditions and issues related to public health on social media platforms. Sharing these pieces of information gives advantages to researchers over traditional data sources, such as real-time data availability, ease of access, and low cost. Sentiment analysis is often conducted on these thoughts to help the healthcare sector analyze people&#x2019;s interests and opinions, thereby helping them in making decisions and providing effective solutions [<xref ref-type="bibr" rid="ref-6">6</xref>].</p>
<p>Sentiment Analysis represents the most employed tool of text classification (also known as opinion mining), which analyses textual materials and classifies an underlying sentiment into positive or negative opinions, and neutral sentiments. The principal objective of sentiment analysis involves identifying the user&#x2019;s or audience&#x2019;s viewpoint on a target object by analyzing a vast amount of text from several sources. Sentiment analysis approaches are widely used in various fields, namely marketing, political, and sociological [<xref ref-type="bibr" rid="ref-7">7</xref>]. They are classified into two main methods, including the Lexicon-Based approach and the Machine Learning approach [<xref ref-type="bibr" rid="ref-8">8</xref>]. <xref ref-type="table" rid="table-1">Table 1</xref> explains all abbreviations and symbols used in this study.</p>
<table-wrap id="table-1">
<label>Table 1</label>
<caption>
<title>List of abbreviations and symbols</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th>Abbreviations</th>
<th>Explanation</th>
<th>Abbreviations</th>
<th>Explanation</th>
</tr>
</thead>
<tbody>
<tr>
<td>NLP</td>
<td>Natural Language Processing</td>
<td>HTML</td>
<td>Hyper Text Markup Language</td>
</tr>
<tr>
<td>NLTK</td>
<td>Natural Language Toolkit</td>
<td>VADER</td>
<td>Valence Aware Dictionary for Sentiment Reasoning</td>
</tr>
<tr>
<td>API</td>
<td>Application Programming Interfaces</td>
<td>TWINT</td>
<td>Twitter Intelligence Tool</td>
</tr>
<tr>
<td>RegEx</td>
<td>Regular Expression</td>
<td>POST</td>
<td>Part of Speech Tagging</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s2">
<label>2</label>
<title>Lexicon-Based Approach</title>
<p>The Lexicon-Based approach is unsupervised as it aims to perform the analysis using the lexicon and scoring method for evaluating opinions. A sentiment lexicon is a lexical list of features, which is labeled based on the semantic orientation of such features [<xref ref-type="bibr" rid="ref-9">9</xref>,<xref ref-type="bibr" rid="ref-10">10</xref>], interpreted as positive sentiments, negative sentiments, or neutral sentiments.</p>
<p>This lexicon-based method has been introduced to compute polarity scores at two levels, involving the word level and the sentence level using a manually created corpus of patient-provided medical drug reviews [<xref ref-type="bibr" rid="ref-11">11</xref>]. The results showed that the proposed technique obtained 79 percent accuracy and achieved 81 percent accuracy on average at the word level and the sentence level, respectively.</p>
<p>Next, a sentiment lexicon is built to provide comprehensive coverage of health-related words through a hybrid approach that combines two strategies: bootstrapping and lexicon-based [<xref ref-type="bibr" rid="ref-12">12</xref>]. Furthermore, a proper polarity class can be defined for every word by suggesting a specific count-based probability measurement. The results obtained by this approach were 0.89, 0.79, and 0.83 scores in precision and recall, in addition to F1, respectively.</p>
<p>In [<xref ref-type="bibr" rid="ref-13">13</xref>], an approach for the analysis of the aspect-level opinion is introduced according to the SentiWordNet (SWN) lexicon and the ontologies within the domain of diabetes. The N-gram techniques were applied by the authors (i.e., N-gram-after and N-gram-before, as well as N-gram around) for calculating a given aspect-level sentiment by heeding the words around the aspect. A Twitter dataset corpus has been collected and manually labeled at the aspect level to ternary classes for evaluating the proposed approach. Furthermore, it emerges that the N-gram-around method achieves better results by 0.819, 0.811, and 0.812 in precision and recall, as well as <italic>F</italic>-measure, correspondingly.</p>
<p>Another lexicon-based method has been employed for analyzing people&#x2019;s thoughts and insights communicated via Twitter about Cardiovascular diseases [<xref ref-type="bibr" rid="ref-14">14</xref>]. About two million Tweets, including one of the terms cardiovascular disease or heart disease, were extracted using the Twitter application programming interfaces (API).</p>
<p>Lastly, Wong and his team have proposed a lexicon-based sentiment analysis model called VADER for classifying the public&#x2019;s attitudes on Twitter about breast cancer screening in the US [<xref ref-type="bibr" rid="ref-15">15</xref>], linking the sentiments on Twitter to an actual screening of breast cancer patterns from the behavioral risk factor surveillance system (BRFSS) to investigate how sentiments can be possibly related to screening uptake behaviour. Based on the results, the projected method in this study achieved an accuracy of 77.2%.</p>
</sec>
<sec id="s3">
<label>3</label>
<title>Machine Learning Approach</title>
<p>Machine learning requires the application of feature extraction, as well as model training with the help of a feature set and some dataset. Indeed, this approach utilizes the classification technique for categorizing the text into classes.</p>
<p>A sentiment analysis approach is proposed on various medical forums devoted to Hearing Loss (HL) [<xref ref-type="bibr" rid="ref-16">16</xref>]. Evaluations were made using three different supervised learning-based classifiers: Na&#x00EF;ve Bayes, support vector machine (SVM), and logistic regression. A feature extraction method was introduced by the authors depending on data about the speech parts. To assess the performance of this approach, a set of experiments was conducted using a dataset manually labeled as positive, negative, or neutral sentiment. The results showed that logistic regression performed best with the selected features, with an average F1 score of 0.685.</p>
<p>In medical forums, a new approach for analyzing sentiments has been proposed by Arbane et al., [<xref ref-type="bibr" rid="ref-17">17</xref>]. They created a new lexicon called HealthAffect and used two algorithms of classification: Na&#x00EF;ve Bayes and <italic>k</italic>-Nearest Neighbors (KNN), for solving a multiclass sentiment classification problem. The results showed that the used Na&#x00EF;ve Bayes algorithm provided better performance compared with the KNN algorithm by achieving a 0.518 F1 score.</p>
<p>Another sentiment analysis method is proposed by creating a vast corpus of manually labeled data [<xref ref-type="bibr" rid="ref-18">18</xref>]. Data is collected from medical forums related to depression, anxiety, asthma, and allergy. Additionally, they built a deep Convolutional Neural Network (CNN) model for evaluation based on the medical sentiment analysis system for predicting a potential medical sentiment classification for two categories, including &#x2018;medical condition&#x2019; and &#x2018;medication&#x2019; categorization schemes. They achieved a significant performance with a precision of 0.86 and 0.68, respectively.</p>
<p>Lastly, an enhanced machine learning method has been proposed for extracting sentiments from people&#x2019;s tweets related to the HPV vaccine [<xref ref-type="bibr" rid="ref-19">19</xref>]. The authors manually annotated these tweets, and they completed a hierarchical categorization utilizing the SVM standard. According to the results, better performance with a 0.744 F-score was achieved compared with different baseline models.</p>
<p>Although the machine learning approach was successfully applied in many domains, it still has some disadvantages compared to the lexicon-based approach, including (1) dependence on a labeled dataset; (2) the need for a massive training set with many features, which is tricky to obtain on social media with short, sparse text; and (3) the high computational cost (time, memory, and processing) of the training and testing process [<xref ref-type="bibr" rid="ref-20">20</xref>,<xref ref-type="bibr" rid="ref-21">21</xref>]. On the other hand, although the lexicon-based approach demands linguistic resources, which are limited for some languages, studies showed that the lexicon-based method outperformed the supervised machine learning techniques not just in performance but also in the economy of time and effort [<xref ref-type="bibr" rid="ref-22">22</xref>].</p>
<p>Therefore, this work aims to use the lexicon-based sentiment analysis method by using the VADER lexicon, which is successfully applied in many works [<xref ref-type="bibr" rid="ref-23">23</xref>&#x2013;<xref ref-type="bibr" rid="ref-26">26</xref>]. Moreover, according to the prior studies, all papers have focused on sentiment analysis in particular health domains such as diabetes [<xref ref-type="bibr" rid="ref-13">13</xref>], Cardiovascular Disease [<xref ref-type="bibr" rid="ref-14">14</xref>], breast cancer screening [<xref ref-type="bibr" rid="ref-15">15</xref>], and SARS-CoV-2 [<xref ref-type="bibr" rid="ref-27">27</xref>,<xref ref-type="bibr" rid="ref-28">28</xref>].</p>
<p>Besides, there is a lack of published studies focusing on the thalassemia screening domain. Consequently, this paper focuses on identifying the public&#x2019;s sentiment polarity on thalassemia screening on Twitter using a lexicon-based approach. The subsequent sections of this paper outline in detail the architecture and components of the introduced method.</p>
</sec>
<sec id="s4">
<label>4</label>
<title>Methodology</title>
<sec id="s4_1">
<label>4.1</label>
<title>Proposed Method</title>
<p>This study comprises three main stages: (1) the first stage involves acquiring data from Twitter; (2) the second stage concentrates on the initial operation of preprocessing, which was conducted for cleaning and filtering out irrelevant information (i.e., punctuation, stop words, and retweet symbols) from the tweets; (3) the third stage involves utilizing NLTK&#x2019;s VADER classifier and identifying the most frequent terms used in public conversations on thalassemia screening. The scoring method was applied to the results of VADER to assess the method&#x2019;s capability of classifying the collected tweets on a three-point scale (positive, negative, or neutral). The architecture of the introduced approach is shown in <xref ref-type="fig" rid="fig-1">Fig. 1</xref>.</p>
<fig id="fig-1">
<label>Figure 1</label>
<caption>
<title>Proposed approach</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_39228-fig-1.tif"/>
</fig>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Data Acquisition</title>
<p>The data in the context of this study comprise a total of 3,376 English tweets, which were gathered between February 2009 and September 2020, applying the TWINT application. The collected tweets were crawled utilizing several search keywords, e.g., &#x201C;thalassemia screening.&#x201D; <xref ref-type="fig" rid="fig-2">Fig. 2</xref> shows the number of tweets posted each year, starting from 2009 until 2020. It is observed from the Figure that the number of tweets published related to thalassemia increases over time. This confirms how people are aware of thalassemia screening and its impact on society.</p>
<fig id="fig-2">
<label>Figure 2</label>
<caption>
<title>Number of posted tweets every year</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_39228-fig-2.tif"/>
</fig>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Data Preprocessing</title>
<p>A tweet can be defined as a specific microblog message, which is posted on the Twitter platform and is confined to a total of 280 characters. Often, users do not apply the proper language structure when they post opinions or sentiments about a specific topic. Instead, slang, misspellings, different emoticons, abbreviations, and sometimes puns complicate the analysis of these structures. Complex textual data significantly influence the performance of analyzing sentiments because the quality of the output depends on the input [<xref ref-type="bibr" rid="ref-29">29</xref>]. Therefore, a series of preprocessing steps has been performed to remove irrelevant data from the collected tweets because the cleaner the information is, the more appropriate this information will be for mining, as well as feature extraction, thereby improving the findings&#x2019; accuracy and precision [<xref ref-type="bibr" rid="ref-30">30</xref>,<xref ref-type="bibr" rid="ref-31">31</xref>]. For the preprocessing, we have utilized Python&#x2019;s natural language processing (NLP) toolkit (NLTK). Initially, a regular expression (RegEx) in Python is applied to detect and eliminate uniform resource locator (URL) links, retweet symbols (RT), and user mentions (@), and to decode hypertext markup language (HTML) entities into their equivalent characters. Hashtags (#) describe the subject of the tweet. Because they carry useful information about the topic of the tweet, they are retained as part of the tweet, except for the symbol &#x201C;#&#x201D;, which is deleted [<xref ref-type="bibr" rid="ref-31">31</xref>]. Once the preprocessing steps are complete, the dataset is ready for the classification of sentiments by using the algorithm of VADER. Various NLTK functions are also used as additional preprocessing steps and are required to calculate the terms&#x2019; frequency and thus to visualize them in a word cloud. These steps include:
<list list-type="order">
<list-item>
<p>Convert the tweets to lowercase to reduce redundancy.</p></list-item>
<list-item>
<p>Tokenize these tweets into separate tokens or words.</p></list-item>
<list-item>
<p>Apply part of speech (POS) tagging (i.e., adjectives, adverbs, verbs, and nouns). The reason behind the application of POS tagging prior to the lemmatization and elimination operation involves retaining the structure of linguistic categories and maintaining the quality of the analysis.</p></list-item>
<list-item>
<p>Lemmatize the tweets by eliminating inflectional endings; thus, obtaining the original dictionary or base form of this word utilizing the NLTK wordnet lemmatizer.</p></list-item>
<list-item>
<p>Remove the special characters and numbers.</p></list-item>
<list-item>
<p>Remove single characters.</p></list-item>
<list-item>
<p>Remove the common query words that emerge in almost all tweets (i.e., thalassemia, screening, test, and diagnosis).</p></list-item>
<list-item>
<p>Remove stop words that do not express any meaning in tweets (i.e., a, the, he, them, etc.).</p></list-item>
</list></p>
</sec>
<sec id="s4_4">
<label>4.4</label>
<title>Sentiment Classification</title>
<p>The analysis of sentiment can be generally used for examining the context polarity, i.e., whether it is a positive, negative, or neutral sentiment. During this phase, a largely approved human-validated sentiment analysis instrument was used, i.e., Valence Aware Dictionary for Sentiment Reasoning (VADER). This analysis tool was used because it represents a lexicon-based sentiment analysis engine, which is specially tailored for expressions on social media sentiments [<xref ref-type="bibr" rid="ref-22">22</xref>]. It is a wholly open-sourced lexicon under the MIT License. We use the VADER analyzer to examine both the sentiment&#x2019;s polarity and intensity in each tweet. This operation results in four sentiment scores, including positive, negative, neutral, and compound. The compound score is a unidimensional metric with a value between &#x2212;1 (extremely negative) and 1 (extremely positive), which is a very helpful metric to measure the overall sentiment in each tweet. <xref ref-type="table" rid="table-2">Table 2</xref> demonstrates the compound threshold values utilized for classifying tweets as positive, negative, or neutral.</p>
<table-wrap id="table-2">
<label>Table 2</label>
<caption>
<title>Sentiment type according to the compound score threshold</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th style="background:#FFFFFF;">Type of sentiment</th>
<th style="background:#FFFFFF;">Compound score</th>
</tr>
</thead>
<tbody>
<tr>
<td style="background:#FFFFFF;">Positive</td>
<td style="background:#FFFFFF;">&#x2265; 0.05</td>
</tr>
<tr>
<td style="background:#FFFFFF;">Negative</td>
<td style="background:#FFFFFF;">&#x2264; &#x2212;0.05</td>
</tr>
<tr>
<td style="background:#FFFFFF;">Neutral</td>
<td style="background:#FFFFFF;">&#x003E; &#x2212;0.05 and &#x003C; 0.05</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Based on <xref ref-type="table" rid="table-2">Table 2</xref>, 0.05 and &#x2212;0.05 were used as threshold values to interpret compound values as positive, negative, or neutral. If the compound value is above or equal to 0.05, it signifies a positive value. If it is less than or equal to &#x2212;0.05, it signifies a negative value. Otherwise, it signifies a neutral value.</p>

</sec>
<sec id="s4_5">
<label>4.5</label>
<title>Modification of VADER</title>
<p>Although VADER has been substantiated by researchers on typical tweets according to Hutto et al. [<xref ref-type="bibr" rid="ref-22">22</xref>], VADER&#x2019;s performance in categorizing tweets linked to public health issues, especially thalassemia screening, requires additional validation. Accordingly, validation has been performed using a randomly selected subset of 320 tweets obtained from the original set of thalassemia-screening tweets. The sentiment polarity of each of the 320 tweets was interpreted by domain experts and taken as the actual result. However, a poor F1 score (&#x003C;0.7) was observed for the classification of the original VADER, and the primary reason for it was identified.</p>
<p>In the original lexical dictionary of VADER, some words that appeared in tweets can reverse the polarity score from negative to positive and vice versa, relying on their score values in VADER. For example, the tweet &#x201C;I did thalassemia test; unfortunately, it was positive,&#x201D; shows a positive compound score of 0.296. In fact, it originally expresses a negative sentiment. Its polarity score has been changed to positive due to the word &#x2018;positive&#x2019; appearing in the tweet with a high sentiment value (i.e., 2.6), which has affected the final compound score to be positive at the end. Similarly, the word &#x2018;negative&#x2019; refers to thalassemia screening results containing a high negative sentiment value (i.e., &#x2212;2.7).</p>
<p>This can affect the sentiment polarity towards the negative side once it is a part of thalassemia screening conversations, affecting sentiment classification accuracy. In addition, we have also removed the word &#x2018;cancer&#x2019; from the original lexical dictionary since these words do not contain any positive or negative connotations as they are a part of the health domain. Consequently, that has resulted in a more favorable F1 score of 0.818. Thus, other classification variations between using VADER and utilizing a human rater refer to challenging obstacles faced in classifying sentiments, including ambiguity, sarcasm, as well as mixed sentiments, which seem quite difficult for the human rater to identify. Eventually, this study presented the VADER analyzer&#x2019;s adjusted version to measure thalassemia screening tweets&#x2019; sentiment scores.</p>
</sec>
<sec id="s4_6">
<label>4.6</label>
<title>Tools</title>
<p>TWINT, NLTK (Natural Language Toolkit), and VADER constitute the three main tools, which were applied in this study. TWINT has been applied for collecting the raw data from Twitter. NLTK, a text analysis Python library, has been utilized for data pre-processing activities to prepare it for sentiment classification. VADER, a sentiment lexicon, has been used to categorize the relevant tweets&#x2019; sentiments into ternary classes.</p>
</sec>
<sec id="s4_7">
<label>4.7</label>
<title>TWINT</title>
<p>TWINT represents a sophisticated Twitter scraping instrument, which is written in Python and aims at enabling scholars to scrape stored tweets collected from profiles on Twitter without using API authentication. TWINT utilizes the search operators of Twitter for scraping the tweets. Additionally, it aims to scrape tweets, which are linked to hashtags, topics, and trends. It also aims to sort out sensitive data from the Tweets&#x2019; emails and users&#x2019; phone numbers in a helpful way. Moreover, TWINT provides queries to Twitter, and enables scraping the users&#x2019; followers on Twitter, special tweets, which are loved by users, and their likes and follows without using an API authentication or even browser emulation [<xref ref-type="bibr" rid="ref-32">32</xref>]. TWINT is a standard Python library used in various text analysis research as a primary tool for data acquisition [<xref ref-type="bibr" rid="ref-26">26</xref>,<xref ref-type="bibr" rid="ref-33">33</xref>].</p>
</sec>
<sec id="s4_8">
<label>4.8</label>
<title>NLTK</title>
<p>The Natural Language Toolkit (NLTK) is a Python library. It provides a specific base for building Python programs, as well as data classification. The toolbox also plays a key role in converting textual data to a certain format, where sentiments are extracted. NLTK mainly aims to perform natural language processing via performing analysis of human language data. Also, NLTK provides different functions for pre-processing data to perform all NLP methodologies, such as part-of-speech tagging, tokenizing, lemmatizing, stemming, parsing, and performing sentiment analysis for specified datasets. By doing so, the available data can be fit for feature extraction and mining [<xref ref-type="bibr" rid="ref-34">34</xref>].</p>
</sec>
<sec id="s4_9">
<label>4.9</label>
<title>VADER</title>
<p>VADER represents a gold-standard sentiment lexicon; it is mainly attuned to contexts on social media platforms. It aims to combine several lexical features considering 5 generalizable rules: (1) Punctuation, (2) Capitalization, (3) Degree modifiers, (4) Polarity shift due to Conjunctions, and (5) Catching Polarity Negation. The rules are syntactic and grammatical conventions used by people when they highlight or express the intensity of sentiment. Hutto et al. [<xref ref-type="bibr" rid="ref-22">22</xref>] compared VADER&#x2019;s efficiency to eleven benchmark models, involving Linguistic Inquiry and Word Count (LIWC), Affective Norms for English Words (ANEW), the General Inquirer, SentiWordNet, and other techniques of machine learning, including Naive Bayes, Maximum Entropy, as well as SVM algorithms. They concluded that the VADER sentiment lexicon generalized more favorably over contexts than other models. Botchway et al. [<xref ref-type="bibr" rid="ref-26">26</xref>] and Kumaresh et al. [<xref ref-type="bibr" rid="ref-25">25</xref>] examined various lexicons in text classification and concluded that VADER had beaten other lexicons in terms of accuracy. Indeed, the VADER sentiment lexicon distinguishes itself from other models as being more sensitive to some sentiment phrases in certain social media texts. VADER enjoys the capability of generalizing more efficiently in other domains. Besides common dictionary words, it also gives sentiment scores on emoticons, slang (nah, meh, giggly, etc.), and acronyms (LOL, OMG, etc.) [<xref ref-type="bibr" rid="ref-22">22</xref>].</p>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Results</title>
<p>This section provides the results of the sentiment analysis on Twitter with the application of the following tools: VADER and NLTK.</p>
<sec id="s5_1">
<label>5.1</label>
<title>Descriptive Analysis</title>
<p>Primarily, 3,375 tweets associated with the thalassemia screening topic were found during the data collection period. The baseline monthly volume of thalassemia screening tweets fluctuated between 20 and 50 tweets, whereby an explosive number of tweets was observed in May due to World Thalassemia Day, celebrated every year on May 8th. The tweets memorialized thalassemia victims and encouraged those who are struggling to live with the disease, raising awareness among vulnerable people against thalassemia risks and encouraging them to do screening early. The volume of tweets gradually dropped down and returned to the baseline, as illustrated in <xref ref-type="fig" rid="fig-3">Fig. 3</xref>.</p>
<fig id="fig-3">
<label>Figure 3</label>
<caption>
<title>Temporal trends of thalassemia screening tweet volume</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_39228-fig-3.tif"/>
</fig>
<p><xref ref-type="fig" rid="fig-4">Fig. 4</xref> displays the 25 most frequently used unigrams in people&#x2019;s conversations about thalassemia screening. As observed in <xref ref-type="fig" rid="fig-4">Fig. 4</xref>, many words are relevant to blood, screen, get, marriage, and patient. Similarly, <xref ref-type="fig" rid="fig-5">Fig. 5</xref> displays the 25 most frequent bigrams, i.e., common word pairs, such as (make, mandatory), (blood, marriage), (pregnant, women), (get, marry), and (blood, transfusion). For example, the terms &#x201C;Getting Married&#x201D;, &#x201C;Pregnant Woman&#x201D;, &#x201C;Genetic&#x201D;, &#x201C;Carrier&#x201D; and &#x201C;Prevent&#x201D; were observed in both bigram and unigram graphs, emphasizing the dire need for thalassemia screening for people who will get married and for pregnant women, which allowed them to access their status of thalassemia at an early stage. Therefore, they have adequate time to identify the risks and prevent the disease from being passed from parents to children through genes. Moreover, the terms &#x201C;Blood Transfusion&#x201D;, &#x201C;Blood Donation&#x201D;, &#x201C;Blood Group&#x201D; and &#x201C;Iron Deficiency&#x201D; emphasized an urgent need for blood donation because of the lack of iron and the periodic need of thalassemia patients for blood transfusions.</p>
<fig id="fig-4">
<label>Figure 4</label>
<caption>
<title>The 25 most frequent unigrams</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_39228-fig-4.tif"/>
</fig><fig id="fig-5">
<label>Figure 5</label>
<caption>
<title>The 25 most frequent bigrams</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_39228-fig-5.tif"/>
</fig>
<p><xref ref-type="fig" rid="fig-6">Fig. 6</xref> displays the top co-occurring bigrams visualization in the dataset as a network diagram with the help of a Python package named NetworkX. Based on the graph in <xref ref-type="fig" rid="fig-6">Fig. 6</xref>, it is easier to understand the relationship between the words (nodes), which frequently appeared together by drawing the edge that determines the words&#x2019; connectedness to each other. For instance, combinations of connected terms in sickle-cell-anemia disease were observed, indicating that most patients of thalassemia have severe forms of the disease and suffer from chronic anemia (sickle cell disease). Additionally, a combination of &#x201C;hiv-hepatitis&#x201D; terms was observed with the co-occurrence of words for three main reasons:</p>
<fig id="fig-6">
<label>Figure 6</label>
<caption>
<title>Network of 25 co-occurring bigrams</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_39228-fig-6.tif"/>
</fig>
<p><list list-type="bullet">
<list-item>
<p>Some tweets complained that most people do not pay attention to thalassemia screening as much as they are concerned about HIV and hepatitis tests.</p></list-item>
<list-item>
<p>Another group of users demanded a mandatory thalassemia test before marriage besides HIV and Hepatitis tests.</p></list-item>
<list-item>
<p>Some tweets mentioned that some patients with thalassemia suffered from HIV and Hepatitis positive results due to allegedly receiving infected blood during the transfusion stage. Therefore, examining the donated blood is a priority to prevent thalassemia patients from contracting infectious diseases, thus alleviating their pain.</p></list-item>
</list></p>
<p><xref ref-type="fig" rid="fig-7">Fig. 7</xref> maps the countries mentioned in public conversations regarding thalassemia screening. This map helped determine the correlation between countries with a high rate of thalassemia prevalence and active conversations over thalassemia screening on Twitter. <xref ref-type="table" rid="table-3">Table 3</xref> lists the 10 countries most frequently mentioned in people&#x2019;s conversations about thalassemia screening.</p>
<fig id="fig-7">
<label>Figure 7</label>
<caption>
<title>Geospatial distribution of mentioned countries</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_39228-fig-7.tif"/>
</fig><table-wrap id="table-3">
<label>Table 3</label>
<caption>
<title>Top 10 countries mentioned by people with their frequencies</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th>Country</th>
<th>Frequency mentioned</th>
</tr>
</thead>
<tbody>
<tr>
<td>India</td>
<td>181</td>
</tr>
<tr>
<td>Pakistan</td>
<td>164</td>
</tr>
<tr>
<td>UAE</td>
<td>22</td>
</tr>
<tr>
<td>Cyprus</td>
<td>17</td>
</tr>
<tr>
<td>Oman</td>
<td>13</td>
</tr>
<tr>
<td>Thailand</td>
<td>12</td>
</tr>
<tr>
<td>Nepal</td>
<td>12</td>
</tr>
<tr>
<td>Maldives</td>
<td>11</td>
</tr>
<tr>
<td>Egypt</td>
<td>11</td>
</tr>
<tr>
<td>Mali</td>
<td>10</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>As remarked in [<xref ref-type="bibr" rid="ref-35">35</xref>], more than 70,000 babies are born with thalassemia worldwide each year. This defect is observed more often in the Indian subcontinent, the Mediterranean, Southeast Asia, and West Africa. Also, most children with thalassemia are born to women in low-income countries. Thalassemia was first confined to the tropics and subtropics. Currently, it is commonly found worldwide because of substantial population migrations. Moreover, beta-thalassemia is most prevalent among people of Mediterranean, African, and Southeast Asian descent, whereas alpha-thalassemia is much more widespread among people of African and Southeast Asian descent.</p>
<p>From <xref ref-type="table" rid="table-3">Table 3</xref>, we can observe that the countries mentioned most often in tweets were India and Pakistan, which appeared in the extracted data in 181 and 164 tweets, respectively. According to the study conducted by [<xref ref-type="bibr" rid="ref-36">36</xref>], most of the available information about thalassemia in the South Asian region originated from studies in India. Because of severe heterogeneity, a variable frequency of beta-thalassemia heterozygotes (carriers), ranging between 1% and 10%, was registered in different areas of India. Nevertheless, the general prevalence of beta-thalassemia carriers in India was between 2.78% and 4%. This represents nearly 30&#x2013;48 million of India&#x2019;s population. In Pakistan, there are approximately 5&#x2013;12 million carriers of thalassemia, with a rate ranging between 5% and 7% [<xref ref-type="bibr" rid="ref-37">37</xref>&#x2013;<xref ref-type="bibr" rid="ref-39">39</xref>].</p>
</sec>
<sec id="s5_2">
<label>5.2</label>
<title>VADER Sentiment Classification</title>
<p>The VADER lexicon-based algorithm was utilized for extracting features via importing SentimentIntensityAnalyzer from vaderSentiment in the NLTK library. The utilized method &#x201C;polarity_scores()&#x201D; from the SentimentIntensityAnalyzer() module has provided four sentiment scores as the output scores, comprising positive, negative, as well as neutral scores, with the compound scores. Regarding the compound scores, every sentiment orientation of the tweet has been identified in accordance with its values. These scores are in a dictionary in Python, whereas the compound scores are extracted for measuring whether each tweet is a positive tweet, a negative tweet, or a neutral tweet with a value ranging from &#x2212;1 to &#x002B;1. <xref ref-type="table" rid="table-4">Table 4</xref> shows the tweets&#x2019; sentiments with corresponding values after using the threshold (Table x) to classify tweets into three predefined categories. The values of [1, 0, &#x2212;1] were set to refer to positive, neutral, and negative public sentiments, respectively [<xref ref-type="bibr" rid="ref-40">40</xref>].</p>
<table-wrap id="table-4">
<label>Table 4</label>
<caption>
<title>Classification of tweets by sentiment</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th>Original_Tweet</th>
<th>Tweet_SA</th>
<th>Neg</th>
<th>Neu</th>
<th>Pos</th>
<th>Compound</th>
<th>Sentiment</th>
</tr>
</thead>
<tbody>
<tr>
<td>@BorealPenguin Yeah, that helps! The thing the doc doesn&#x2019;t know where the blood came from, so a diet test (since I have thalassemia) and retina scans are needed</td>
<td>Yeah, that helps! The thing is the doc doesn&#x2019;t know where the blood came from, so a diet test (since I have thalassemia) and retina scans are needed</td>
<td>0.00</td>
<td>0.836</td>
<td>0.164</td>
<td>0.6239</td>
<td>1</td>
</tr>
<tr>
<td>@narcissists_the yeh, but I think they do test for thalassemia, and they should&#x2013;it&#x2019;s a legit concern bc some of my red blood cells aren&#x2019;t shaped normally. But def I would lie about the gay thing&#x2026;</td>
<td>yeh, but I think they do test for thalassemia, and they should&#x2013;it&#x2019;s a legit concern bc some of my red blood cells aren&#x2019;t shaped normally. But def I would lie about the gay thing or living in Ger&#x2026;.</td>
<td>0.00</td>
<td>1.00</td>
<td>0.00</td>
<td>0.000</td>
<td>0</td>
</tr>
<tr>
<td>@ftrhshrf Hi Dr I want to ask why malaysia does not require thalassemia test before marriage like HIV, hepatitis B n blood group. I understand that is a choice to every couple to do the test or &#x2026;</td>
<td>Hi Dr I want to ask why malaysia does not require thalassemia test before marriage like HIV, hepatitis B n blood group. I understand that is a choice to every couple to do the test or not but wh&#x2026;</td>
<td>0.07</td>
<td>0.877</td>
<td>0.053</td>
<td>&#x2212;0.4497</td>
<td>&#x2212;1</td>
</tr>
</tbody>
</table>
</table-wrap>
<p><xref ref-type="fig" rid="fig-8">Fig. 8</xref> displays the sentiment results by the number of tweets of each class in a bar chart using the matplotlib.pyplot Python library.</p>
<fig id="fig-8">
<label>Figure 8</label>
<caption>
<title>Number of the tweets by sentiment</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_39228-fig-8.tif"/>
</fig>
<p>Based on the number of tweets by sentiment in <xref ref-type="fig" rid="fig-8">Fig. 8</xref> and <xref ref-type="table" rid="table-5">Table 5</xref>, there are 3376 tweets in total, incorporating all sentiment categories. Most tweets referred to positive and neutral sentiments regarding thalassemia screening. Remarkably, 1569 (46.5%) of the obtained tweets indicated positive sentiments, while 1302 (38.6%) of these tweets indicated neutral opinions, and 503 (14.9%) referred to negative sentiments. The positive tweets were the highest in number compared with other classifications as the value of the given compound threshold expressed positive sentiments regarding thalassemia screening.</p>
<table-wrap id="table-5">
<label>Table 5</label>
<caption>
<title>Results of the VADER algorithm</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th style="background:#FFFFFF;">Total tweets</th>
<th style="background:#FFFFFF;">3376</th>
</tr>
</thead>
<tbody>
<tr>
<td style="background:#FFFFFF;">Positive tweets</td>
<td style="background:#FFFFFF;">1569</td>
</tr>
<tr>
<td style="background:#FFFFFF;">Neutral tweets</td>
<td style="background:#FFFFFF;">1302</td>
</tr>
<tr>
<td style="background:#FFFFFF;">Negative tweets</td>
<td style="background:#FFFFFF;">503</td>
</tr>
</tbody>
</table>
</table-wrap>
<p><xref ref-type="fig" rid="fig-9">Fig. 9</xref> illustrates the word clouds of the negative and positive tweets labeled through VADER to manifest the dominant words for each class. The search keywords (i.e., thalassemia and screening), and stop words (e.g., they, I, do, them) were removed because they appeared in almost all tweets. This helped Wordcloud focus on terms only and be more productive in gaining insights precisely. It was observed that the most frequent words appearing in both word clouds are domain-related, such as blood, disease, disorder, alpha, beta, genetic, etc., as we expected. By looking at the Wordcloud of positive tweets, some words were observed like prevent, awareness, free, support, mandatory, marriage, camp, and pregnancy. The results showed considerable solidarity and support among the thalassemia community on Twitter translated by organizing free test camps, spreading awareness among the public, and making mandatory tests before marriage and during pregnancy. The Wordcloud of negative tweets showed the following words: child, suffer, HIV, and negligence, which highlighted the suffering of children from thalassemia during blood transfusion who were infected with HIV due to negligence in blood screening.</p>

<p>A word cloud, also called a tag cloud or a text cloud, is a tool for visually summarizing a vast amount of text [<xref ref-type="bibr" rid="ref-25">25</xref>]. In this study, <xref ref-type="fig" rid="fig-9">Fig. 9</xref> shows the word cloud used to visualize the tweets for every sentiment classification based on the VADER scores after removing the meaningless information.</p>
<fig id="fig-9">
<label>Figure 9</label>
<caption>
<title>Wordcloud of positive (right) and negative (left) tweets</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_39228-fig-9.tif"/>
</fig>
<p><xref ref-type="fig" rid="fig-10">Fig. 10</xref> demonstrates the distribution of the negative and positive compound scores as a histogram plot after neglecting the neutral ones.</p>
<fig id="fig-10">
<label>Figure 10</label>
<caption>
<title>Compound score distribution: negative tweets (left) and positive tweets (right)</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_39228-fig-10.tif"/>
</fig>
<p>Based on <xref ref-type="fig" rid="fig-11">Fig. 11</xref>, the neutral opinions do not correspond to the positive and negative sentiments. The positive tweets have a significant numerical advantage over the negative ones in terms of the compound score. The mean compound score of the positive tweets is 0.4758, whereas the negative tweets have a mean of about &#x2212;0.4234.</p>
<fig id="fig-11">
<label>Figure 11</label>
<caption>
<title>Compound score distribution in boxplot</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_39228-fig-11.tif"/>
</fig>
<p>As shown in <xref ref-type="fig" rid="fig-11">Fig. 11</xref>, a box plot gives graphical information concerning the positive and negative tweets such as batch location, dispersion, and the data set&#x2019;s skewness. Also, a boxplot drew attention to specific possible outliers. Therefore, it is easier to compare and reflect on these 4 features of our data sets.
<list list-type="bullet">
<list-item>
<p>Assessment of batch location: The figure above indicates that the median compound score of the obtained positive tweets was larger compared with the negative tweets.</p></list-item>
<list-item>
<p>Assessment of data dispersion: These interquartile ranges were not parallel, as indicated by the length of boxes for positive tweets and negative tweets and the data set&#x2019;s total range for positive tweets is greater. This is illustrated through the distance between the two whiskers&#x2019; ends for every boxplot.</p></list-item>
<list-item>
<p>The difference in skewness: Both data batches appeared to be left-skewed, and the negative tweet batch is a little more skewed compared with the positive tweets. The sample skewness of the compound score was &#x2212;0.184 for positive tweets and &#x2212;0.287 for negative tweets. Thus, both skewnesses were negative, and the negative tweets&#x2019; value was slightly larger in magnitude, which corresponds to a further apparent absence of symmetry; however, neither of the skewnesses was especially large.</p></list-item>
<list-item>
<p>Assessment of potential outliers: Neither of the data sets showed any questionable far-out rates.</p></list-item>
</list></p>
</sec>
<sec id="s5_3">
<label>5.3</label>
<title>Model Evaluation</title>
<p>Regarding the model evaluation of the proposed approach, we used four performance measures, namely True Positive (TP), True Negative (TN), False Positive (FP), and False Negative (FN). These four performance measures represent the confusion matrix and are shown in <xref ref-type="fig" rid="fig-12">Fig. 12</xref>. The confusion matrix represents a mixture of predicted and actual observations to measure the current algorithm&#x0027;s effectiveness by determining precision and recall, in addition to the F1 Score.</p>
<fig id="fig-12">
<label>Figure 12</label>
<caption>
<title>VADER confusion matrix</title>
</caption>
<graphic mimetype="image" mime-subtype="tif" xlink:href="CMC_39228-fig-12.tif"/>
</fig>
<p>&#x25A0; Precision</p>
<p>Precision represents the ratio of the observations correctly predicted as positive to the total observations predicted as positive.</p>
<p><disp-formula id="eqn-1"><label>(1)</label><mml:math id="mml-eqn-1" display="block"><mml:mi>Precision</mml:mi><mml:mo>=</mml:mo><mml:mi>TP</mml:mi><mml:mo>/</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mi>TP</mml:mi><mml:mo>+</mml:mo><mml:mi>FP</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>&#x25A0; Recall</p>
<p>Recall, which is also referred to as sensitivity, signifies the ratio of the observations correctly predicted as positive to all observations in the actual positive class.</p>
<p><disp-formula id="eqn-2"><label>(2)</label><mml:math id="mml-eqn-2" display="block"><mml:mi>Recall</mml:mi><mml:mo>=</mml:mo><mml:mi>TP</mml:mi><mml:mo>/</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mi>TP</mml:mi><mml:mo>+</mml:mo><mml:mi>FN</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
<p>&#x25A0; F-measure</p>
<p>F-measure or F-score refers to a performance measure, which combines precision with recall by determining the weighted harmonic mean covering accuracy flaws with the skewed data.</p>
<p><disp-formula id="eqn-3"><label>(3)</label><mml:math id="mml-eqn-3" display="block"><mml:mtext>F-measure</mml:mtext><mml:mo>=</mml:mo><mml:mn>2</mml:mn><mml:mo>&#x2217;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mi>Recall</mml:mi><mml:mo>&#x2217;</mml:mo><mml:mi>Precision</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>/</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mi>Recall</mml:mi><mml:mo>+</mml:mo><mml:mi>Precision</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula></p>
<p><xref ref-type="fig" rid="fig-12">Fig. 12</xref> shows the confusion matrix of 320 randomly selected tweets. They were manually labeled by three domain expert annotators at the tweet level for identifying ternary classes (actual labels) to be compared to the results of the VADER classifier (predicted labels). According to the obtained results, the total numbers of actual positive, neutral, and negative sentiments are 137, 130, and 53, respectively. When the VADER lexicon was used, the model predicted 148 positive, 109 neutral, and 63 negative sentiments.</p>
<p><xref ref-type="table" rid="table-6">Table 6</xref> compares several related studies about public health regarding precision (<italic>P</italic>), recall (<italic>R</italic>), and <italic>F</italic>-score (<italic>F</italic>1). It is, therefore, concluded that the proposed approach achieved promising results, which outperformed many proposals with 0.829 precision, 0.816 recall, and 0.818 <italic>F</italic>1 score. However, holding a comparison between approaches is a difficult task for a couple of reasons. First, these approaches are oriented toward various sentiment analysis stages [<xref ref-type="bibr" rid="ref-41">41</xref>]. Second, the proposed methods are based on various approaches (i.e., machine learning or lexicon-based), topics, as well as domains, whereby the linguistic resources utilized vary in size and context, making the comparison between the proposals so tricky. Therefore, to perform an appropriate comparison between the proposals, the same corpus is required in all evaluations.</p>
<table-wrap id="table-6">
<label>Table 6</label>
<caption>
<title>Comparison of results of previous related works</title>
</caption>
<table frame="hsides">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
<col align="left"/>
</colgroup>
<thead>
<tr>
<th style="background:#FFFFFF;">Work</th>
<th style="background:#FFFFFF;">Dataset</th>
<th style="background:#FFFFFF;">P</th>
<th style="background:#FFFFFF;">R</th>
<th style="background:#FFFFFF;">F1</th>
<th style="background:#FFFFFF;">Approach</th>
</tr>
</thead>
<tbody>
<tr>
<td style="background:#FFFFFF;">[<xref ref-type="bibr" rid="ref-12">12</xref>]</td>
<td style="background:#FFFFFF;">Health reviews</td>
<td style="background:#FFFFFF;">0.89</td>
<td style="background:#FFFFFF;">0.79</td>
<td style="background:#FFFFFF;">0.83</td>
<td style="background:#FFFFFF;">Hybrid (UMLS &#x002B; SWN)</td>
</tr>
<tr>
<td style="background:#FFFFFF;">[<xref ref-type="bibr" rid="ref-13">13</xref>]</td>
<td style="background:#FFFFFF;">Diabetes</td>
<td style="background:#FFFFFF;">0.819</td>
<td style="background:#FFFFFF;">0.811</td>
<td style="background:#FFFFFF;">0.812</td>
<td style="background:#FFFFFF;">SWN</td>
</tr>
<tr>
<td style="background:#FFFFFF;">[<xref ref-type="bibr" rid="ref-16">16</xref>]</td>
<td style="background:#FFFFFF;">Hearing loss</td>
<td style="background:#FFFFFF;">0.688</td>
<td style="background:#FFFFFF;">0.686</td>
<td style="background:#FFFFFF;">0.685</td>
<td style="background:#FFFFFF;">Linear Reg.</td>
</tr>
<tr>
<td style="background:#FFFFFF;">[<xref ref-type="bibr" rid="ref-17">17</xref>]</td>
<td style="background:#FFFFFF;">Health reviews</td>
<td style="background:#FFFFFF;">0.527</td>
<td style="background:#FFFFFF;">0.541</td>
<td style="background:#FFFFFF;">0.518</td>
<td style="background:#FFFFFF;">Na&#x00EF;ve Bayes</td>
</tr>
<tr>
<td style="background:#FFFFFF;" rowspan="2">[<xref ref-type="bibr" rid="ref-18">18</xref>]</td>
<td style="background:#FFFFFF;">Medical Condition</td>
<td style="background:#FFFFFF;">0.68</td>
<td style="background:#FFFFFF;">0.60</td>
<td style="background:#FFFFFF;">0.63</td>
<td style="background:#FFFFFF;">CNN</td>
</tr>
<tr>
<td style="background:#FFFFFF;">Medication</td>
<td style="background:#FFFFFF;">0.86</td>
<td style="background:#FFFFFF;">0.77</td>
<td style="background:#FFFFFF;">0.82</td>
<td style="background:#FFFFFF;">CNN</td>
</tr>
<tr>
<td style="background:#FFFFFF;">[<xref ref-type="bibr" rid="ref-19">19</xref>]</td>
<td style="background:#FFFFFF;">HPV vaccines</td>
<td style="background:#FFFFFF;">&#x2013;</td>
<td style="background:#FFFFFF;">&#x2013;</td>
<td style="background:#FFFFFF;">0.744</td>
<td style="background:#FFFFFF;">SVM</td>
</tr>
<tr>
<td style="background:#FFFFFF;">Our approach</td>
<td style="background:#FFFFFF;">Thalassemia screening</td>
<td style="background:#FFFFFF;">0.829</td>
<td style="background:#FFFFFF;">0.816</td>
<td style="background:#FFFFFF;">0.818</td>
<td style="background:#FFFFFF;">VADER</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>VADER is a gold-standard sentiment lexicon that is particularly developed for microblogs such as Twitter. It combines the lexical features with consideration for five generalizable rules (punctuation, capitalization, degree modifiers, contrastive conjunction, and tri-gram examination to identify negation), which are grammatical and syntactic conventions that humans use to express and emphasize sentiment intensity. Indeed, VADER has outperformed individual human raters at correctly classifying the sentiment of tweets into positive, neutral, or negative classes, with an F1 measure of 0.96 compared with 0.84 [<xref ref-type="bibr" rid="ref-22">22</xref>]. The VADER sentiment lexicon distinguishes itself from others in that it is more sensitive to sentiment expressions in social media contexts and generalizes better to other domains. Besides common dictionary words, it also gives information on emoticons, slang (nah, meh, giggly, etc.), and acronyms (LOL, OMG, etc.) [<xref ref-type="bibr" rid="ref-22">22</xref>]. We have chosen VADER in the present study because it is becoming broadly adopted and has even been implemented as a component of the NLTK Python library. However, several limitations can be outlined in this study. Firstly, the introduced approach handles English tweets only although a wealth of information about thalassemia is adequately available in different languages. It is, therefore, highly recommended that this method is applied to other languages, e.g., Arabic. Secondly, the general opinion lexicon is inadequate to capture the meanings of health-related texts.</p>
</sec>
</sec>
<sec id="s6">
<label>6</label>
<title>Conclusion</title>
<p>Due to the lack of relevant studies about the utilization of sentiment analysis in the public health domain, particularly thalassemia screening, this study utilized one of the social media platforms, i.e., Twitter platform. The study explored people&#x2019;s sentiments regarding thalassemia screening with the application of the VADER lexicon-based algorithm. The results revealed that the suggested approach had achieved 0.829, 0.816, and 0.818 for precision, recall, and the F1 measure, respectively. It was found that the results of the VADER analysis are remarkably promising. This study showed, by digging into tweets and based on the most frequent words obtained, that there is a significant amount of support and solidarity among the thalassemia Twitter community, with numerous popular terms related to positive emotions and sentiments observed such as &#x2018;make,&#x2019; &#x2018;mandatory,&#x2019; &#x2018;marriage,&#x2019; &#x2018;free,&#x2019; &#x2018;child,&#x2019; and &#x2018;prevent&#x2019;. Also, the final results showed the correlation between countries with a high prevalence of thalassemia and the active conversations about thalassemia screening on Twitter, where India and Pakistan were the countries most frequently mentioned in the public&#x2019;s conversations on thalassemia screening, with 181 and 164 tweets, respectively. In the future, the system can be developed by establishing a domain-related lexicon along with the current one to achieve better results.</p>
</sec>
</body>
<back>
<ack>
<p>The authors are thankful to the Deanship of Scientific Research at Najran University and registrar of Universiti Tun Hussein Onn Malaysia.</p>
</ack>
<sec><title>Funding Statement</title>
<p>The authors are thankful to the Deanship of Scientific Research at Najran University for funding this work under the Research Collaboration Funding program grant code NU/RC/SERC/11/5.</p>
</sec>
<sec sec-type="COI-statement"><title>Conflicts of Interest</title>
<p>The authors declare that they have no conflicts of interest to report regarding the present study.</p>
</sec>
<ref-list content-type="authoryear">
<title>References</title>
<ref id="ref-1"><label>[1]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>M.</given-names> <surname>Angastiniotis</surname></string-name> and <string-name><given-names>S.</given-names> <surname>Lobitz</surname></string-name></person-group>, &#x201C;<article-title>Thalassemias: An overview</article-title>,&#x201D; <source>International Journal of Neonatal Screening</source>, vol. <volume>5</volume>, no. <issue>1</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>11</lpage>, <year>2019</year>. <pub-id pub-id-type="doi">10.3390/ijns5010016</pub-id>; <pub-id pub-id-type="pmid">33072976</pub-id></mixed-citation></ref>
<ref id="ref-2"><label>[2]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>M. T.</given-names> <surname>Riquier</surname></string-name></person-group>, &#x201C;<article-title>The ethics of genetic screening for beta thalassemia in Vietnam</article-title>,&#x201D; <source>Developing World Bioethics</source>, vol. <volume>22</volume>, no. <issue>1</issue>, pp. <fpage>44</fpage>&#x2013;<lpage>52</lpage>, <year>2022</year>; <pub-id pub-id-type="pmid">33974342</pub-id></mixed-citation></ref>
<ref id="ref-3"><label>[3]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>A.</given-names> <surname>Kantharaj</surname></string-name> and <string-name><given-names>S.</given-names> <surname>Chandrashekar</surname></string-name></person-group>, &#x201C;<article-title>Coping with the burden of thalassemia: Aiming for a thalassemia free world</article-title>,&#x201D; <source>Global Journal of Transfusion Medicine</source>, vol. <volume>3</volume>, no. <issue>1</issue>, pp. <fpage>1</fpage>, <year>2018</year>. <pub-id pub-id-type="doi">10.4103/gjtm.gjtm_19_18</pub-id></mixed-citation></ref>
<ref id="ref-4"><label>[4]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>H. Al</given-names> <surname>Sabbah</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Khan</surname></string-name>, <string-name><given-names>A.</given-names> <surname>Hamadna</surname></string-name>, <string-name><given-names>L. A.</given-names> <surname>Ghazaleh</surname></string-name>, <string-name><given-names>A.</given-names> <surname>Dudin</surname></string-name> <etal>et al.</etal></person-group><italic>,</italic> &#x201C;<article-title>Factors associated with continuing emergence of &#x03B2;-thalassemia major despite prenatal testing: A cross-sectional survey</article-title>,&#x201D; <source>International Journal of Women&#x2019;s Health</source>, vol. <volume>9</volume>, no. <issue>9</issue>, pp. <fpage>673</fpage>&#x2013;<lpage>679</lpage>, <year>2017</year>. <pub-id pub-id-type="doi">10.2147/IJWH.S141936</pub-id>; <pub-id pub-id-type="pmid">29026336</pub-id></mixed-citation></ref>
<ref id="ref-5"><label>[5]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><given-names>N.</given-names> <surname>Yadav</surname></string-name>, <string-name><given-names>O.</given-names> <surname>Kudale</surname></string-name>, <string-name><given-names>A.</given-names> <surname>Rao</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Gupta</surname></string-name> and <string-name><given-names>A.</given-names> <surname>Shitole</surname></string-name></person-group>, &#x201C;<chapter-title>Twitter sentiment analysis using supervised machine learning</chapter-title>,&#x201D; in <source>Intelligent Data Communication Technologies and Internet of Things</source>, vol. <volume>57</volume>. <publisher-loc>Singapore</publisher-loc>: <publisher-name>Springer</publisher-name>, pp. <fpage>631</fpage>&#x2013;<lpage>642</lpage>, <year>2021</year>.</mixed-citation></ref>
<ref id="ref-6"><label>[6]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>E.</given-names> <surname>Smailhodzic</surname></string-name>, <string-name><given-names>W.</given-names> <surname>Hooijsma</surname></string-name>, <string-name><given-names>A.</given-names> <surname>Boonstra</surname></string-name> and <string-name><given-names>D. J.</given-names> <surname>Langley</surname></string-name></person-group>, &#x201C;<article-title>Social media use in healthcare: A systematic review of effects on patients and on their relationship with healthcare professionals</article-title>,&#x201D; <source>BMC Health Services Research</source>, vol. <volume>16</volume>, no. <issue>1</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>15</lpage>, <year>2016</year>. <pub-id pub-id-type="doi">10.1186/s12913-016-1691-0</pub-id>; <pub-id pub-id-type="pmid">27562728</pub-id></mixed-citation></ref>
<ref id="ref-7"><label>[7]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>A.</given-names> <surname>D&#x2019;Andrea</surname></string-name>, <string-name><given-names>F.</given-names> <surname>Ferri</surname></string-name>, <string-name><given-names>P.</given-names> <surname>Grifoni</surname></string-name> and <string-name><given-names>T.</given-names> <surname>Guzzo</surname></string-name></person-group>, &#x201C;<article-title>Approaches, tools and applications for sentiment analysis implementation</article-title>,&#x201D; <source>International Journal of Computer Applications</source>, vol. <volume>125</volume>, no. <issue>3</issue>, pp. <fpage>26</fpage>&#x2013;<lpage>33</lpage>, <year>2015</year>. <pub-id pub-id-type="doi">10.5120/ijca2015905866</pub-id></mixed-citation></ref>
<ref id="ref-8"><label>[8]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>A.</given-names> <surname>Mittal</surname></string-name> and <string-name><given-names>S.</given-names> <surname>Patidar</surname></string-name></person-group>, &#x201C;<article-title>Sentiment analysis on twitter data: A survey</article-title>,&#x201D; in <conf-name>Proc. of the 7th Int. Conf. on Computer and Communications Management</conf-name>, <publisher-loc>New York, United States</publisher-loc>, pp. <fpage>91</fpage>&#x2013;<lpage>95</lpage>, <year>2019</year>. <pub-id pub-id-type="doi">10.1145/3348445.3348466</pub-id></mixed-citation></ref>
<ref id="ref-9"><label>[9]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>R. K.</given-names> <surname>Botchway</surname></string-name>, <string-name><given-names>A. B.</given-names> <surname>Jibril</surname></string-name>, <string-name><given-names>Z. K.</given-names> <surname>Oplatkov&#x00E1;</surname></string-name> and <string-name><given-names>M.</given-names> <surname>Chovancov&#x00E1;</surname></string-name></person-group>, &#x201C;<article-title>Deductions from a sub-saharan African bank&#x2019;s tweets: A sentiment analysis approach</article-title>,&#x201D; <source>Cogent Economics &#x0026; Finance</source>, vol. <volume>8</volume>, no. <issue>1</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>19</lpage>, <year>2020</year>. <pub-id pub-id-type="doi">10.1080/23322039.2020.1776006</pub-id></mixed-citation></ref>
<ref id="ref-10"><label>[10]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>Y. C.</given-names> <surname>Phang</surname></string-name>, <string-name><given-names>A. M.</given-names> <surname>Kassim</surname></string-name> and <string-name><given-names>E.</given-names> <surname>Mangantig</surname></string-name></person-group>, &#x201C;<article-title>Concerns of thalassemia patients, carriers, and their caregivers in Malaysia: Text mining information shared on social media</article-title>,&#x201D; <source>Healthcare Informatics Research</source>, vol. <volume>27</volume>, no. <issue>3</issue>, pp. <fpage>200</fpage>&#x2013;<lpage>213</lpage>, <year>2021</year>; <pub-id pub-id-type="pmid">34384202</pub-id></mixed-citation></ref>
<ref id="ref-11"><label>[11]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>M.</given-names> <surname>Zubair</surname></string-name>, <string-name><given-names>M.</given-names> <surname>Qasim</surname></string-name>, <string-name><given-names>B.</given-names> <surname>Ahmad</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Ahmad</surname></string-name>, <string-name><given-names>A.</given-names> <surname>Khan</surname></string-name> <etal>et al.</etal></person-group><italic>,</italic> &#x201C;<article-title>Health miner: Opinion extraction from user generated health reviews</article-title>,&#x201D; <source>International Journal of Academic Research</source>, vol. <volume>5</volume>, no. <issue>6</issue>, pp. <fpage>279</fpage>&#x2013;<lpage>284</lpage>, <year>2013</year>. <pub-id pub-id-type="doi">10.7813/2075-4124.2013/5-6/A.35</pub-id></mixed-citation></ref>
<ref id="ref-12"><label>[12]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>M. Z.</given-names> <surname>Asghar</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Ahmad</surname></string-name>, <string-name><given-names>M.</given-names> <surname>Qasim</surname></string-name>, <string-name><given-names>S. R.</given-names> <surname>Zahra</surname></string-name> and <string-name><given-names>F. M.</given-names> <surname>Kundi</surname></string-name></person-group>, &#x201C;<article-title>SentiHealth: Creating health-related sentiment lexicon using hybrid approach</article-title>,&#x201D; <source>Springerplus</source>, vol. <volume>5</volume>, no. <issue>1</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>23</lpage>, <year>2016</year>. <pub-id pub-id-type="doi">10.1186/s40064-016-2809-x</pub-id>; <pub-id pub-id-type="pmid">27504237</pub-id></mixed-citation></ref>
<ref id="ref-13"><label>[13]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>M. D. P.</given-names> <surname>Salas-Z&#x00E1;rate</surname></string-name>, <string-name><given-names>J.</given-names> <surname>Medina-Moreira</surname></string-name>, <string-name><given-names>K.</given-names> <surname>Lagos-Ortiz</surname></string-name>, <string-name><given-names>H.</given-names> <surname>Luna-Aveiga</surname></string-name>, <string-name><given-names>M.&#x00C1;.</given-names> <surname>Rodr&#x00ED;guez-Garc&#x00ED;a</surname></string-name> <etal>et al.</etal></person-group><italic>,</italic> &#x201C;<article-title>Sentiment analysis on tweets about diabetes: An aspect-level approach</article-title>,&#x201D; <source>Computational and Mathematical Methods in Medicine</source>, vol. <volume>2017</volume>, no. <issue>5</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>9</lpage>, <year>2017</year>. <pub-id pub-id-type="doi">10.1155/2017/5140631</pub-id>; <pub-id pub-id-type="pmid">28316638</pub-id></mixed-citation></ref>
<ref id="ref-14"><label>[14]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>L.</given-names> <surname>Verma</surname></string-name> and <string-name><given-names>V.</given-names> <surname>Sapra</surname></string-name></person-group>, &#x201C;<article-title>Semantic analysis of cardiovascular disease sentiment in online social media</article-title>,&#x201D; in <conf-name>Proc. of Int. Conf. on Advancements in Computing &#x0026; Management (ICACM)</conf-name>, <publisher-loc>Dhaka, Bangladesh</publisher-loc>, pp. <fpage>1078</fpage>&#x2013;<lpage>1082</lpage>, <year>2019</year>. <pub-id pub-id-type="doi">10.2139/ssrn.3462426</pub-id></mixed-citation></ref>
<ref id="ref-15"><label>[15]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>K. O.</given-names> <surname>Wong</surname></string-name>, <string-name><given-names>F. G.</given-names> <surname>Davis</surname></string-name>, <string-name><given-names>O. R.</given-names> <surname>Za&#x00EF;ane</surname></string-name> and <string-name><given-names>Y.</given-names> <surname>Yasui</surname></string-name></person-group>, &#x201C;<article-title>Sentiment analysis of breast cancer screening in the United States using Twitter</article-title>,&#x201D; in <conf-name>IC3K 2016&#x2014;Proc. 8th Int. Jt. Conf. Knowl. Discov. Knowl. Eng. Knowl. Manag</conf-name>, <publisher-loc>Porto, Portugal</publisher-loc>, vol. <volume>1</volume>, pp. <fpage>265</fpage>&#x2013;<lpage>274</lpage>, <year>2016</year>. </mixed-citation></ref>
<ref id="ref-16"><label>[16]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>T.</given-names> <surname>Ali</surname></string-name>, <string-name><given-names>D.</given-names> <surname>Schramm</surname></string-name>, <string-name><given-names>M.</given-names> <surname>Sokolova</surname></string-name> and <string-name><given-names>D.</given-names> <surname>Inkpen</surname></string-name></person-group>, &#x201C;<article-title>Can I hear you? Sentiment analysis on medical forums</article-title>,&#x201D; in <conf-name>Proc. of the Sixth Int. Joint Conf. on Natural Language Processing</conf-name>, <publisher-loc>Nagoya, Japan</publisher-loc>, vol. <volume>14&#x2013;18</volume>, pp. <fpage>667</fpage>&#x2013;<lpage>673</lpage>, <year>2013</year>. </mixed-citation></ref>
<ref id="ref-17"><label>[17]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>M.</given-names> <surname>Arbane</surname></string-name>, <string-name><given-names>R.</given-names> <surname>Benlamri</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Brik</surname></string-name> and <string-name><given-names>A. D.</given-names> <surname>Alahmar</surname></string-name></person-group>, &#x201C;<article-title>Social media-based COVID-19 sentiment classification model using Bi-LSTM</article-title>,&#x201D; <source>Expert Systems with Applications</source>, vol. <volume>212</volume>, no. <issue>6</issue>, pp. <fpage>118710</fpage>, <year>2023</year>. <pub-id pub-id-type="doi">10.1016/j.eswa.2022.118710</pub-id>; <pub-id pub-id-type="pmid">36060151</pub-id></mixed-citation></ref>
<ref id="ref-18"><label>[18]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>S.</given-names> <surname>Yadav</surname></string-name>, <string-name><given-names>A.</given-names> <surname>Ekbal</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Saha</surname></string-name> and <string-name><given-names>P.</given-names> <surname>Bhattacharyya</surname></string-name></person-group>, &#x201C;<article-title>Medical sentiment analysis using social media: Towards building a patient assisted system</article-title>,&#x201D; in <conf-name>Proc. of the Eleventh Int. Conf. on Language Resources and Evaluation (LREC 2018)</conf-name>, <publisher-loc>Miyazaki, Japan</publisher-loc>, pp. <fpage>2790</fpage>&#x2013;<lpage>2797</lpage>, <year>2018</year>. </mixed-citation></ref>
<ref id="ref-19"><label>[19]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>J.</given-names> <surname>Du</surname></string-name>, <string-name><given-names>J.</given-names> <surname>Xu</surname></string-name>, <string-name><given-names>H.</given-names> <surname>Song</surname></string-name>, <string-name><given-names>X.</given-names> <surname>Liu</surname></string-name> and <string-name><given-names>C.</given-names> <surname>Tao</surname></string-name></person-group>, &#x201C;<article-title>Optimization on machine learning based approaches for sentiment analysis on HPV vaccines related tweets</article-title>,&#x201D; <source>Journal of Biomedical Semantics</source>, vol. <volume>8</volume>, no. <issue>1</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>8</lpage>, <year>2017</year>. <pub-id pub-id-type="doi">10.1186/s13326-017-0120-6</pub-id>; <pub-id pub-id-type="pmid">28253919</pub-id></mixed-citation></ref>
<ref id="ref-20"><label>[20]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>N.</given-names> <surname>Mukhtar</surname></string-name>, <string-name><given-names>M. A.</given-names> <surname>Khan</surname></string-name> and <string-name><given-names>N.</given-names> <surname>Chiragh</surname></string-name></person-group>, &#x201C;<article-title>Lexicon-based approach outperforms supervised machine learning approach for Urdu sentiment analysis in multiple domains</article-title>,&#x201D; <source>Telematics and Informatics</source>, vol. <volume>35</volume>, no. <issue>8</issue>, pp. <fpage>2173</fpage>&#x2013;<lpage>2183</lpage>, <year>2018</year>. <pub-id pub-id-type="doi">10.1016/j.tele.2018.08.003</pub-id></mixed-citation></ref>
<ref id="ref-21"><label>[21]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>A.</given-names> <surname>Jurek</surname></string-name>, <string-name><given-names>M. D.</given-names> <surname>Mulvenna</surname></string-name> and <string-name><given-names>Y.</given-names> <surname>Bi</surname></string-name></person-group>, &#x201C;<article-title>Improved lexicon-based sentiment analysis for social media analytics</article-title>,&#x201D; <source>Security Informatics</source>, vol. <volume>4</volume>, no. <issue>1</issue>, pp. <fpage>9166</fpage>, <year>2015</year>. <pub-id pub-id-type="doi">10.1186/s13388-015-0024-x</pub-id></mixed-citation></ref>
<ref id="ref-22"><label>[22]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>C. J.</given-names> <surname>Hutto</surname></string-name> and <string-name><given-names>E.</given-names> <surname>Gilbert</surname></string-name></person-group>, &#x201C;<article-title>VADER: A parsimonious rule-based model for sentiment analysis of social media text</article-title>,&#x201D; in <conf-name>Proc. of the Int. AAAI Conf. on Web and Social Media</conf-name>, <publisher-loc>Atlanta</publisher-loc>, <publisher-name>Georgia Institute of Technology</publisher-name>, pp. <fpage>216</fpage>&#x2013;<lpage>225</lpage>, <year>2014</year>. </mixed-citation></ref>
<ref id="ref-23"><label>[23]</label><mixed-citation publication-type="book"><person-group person-group-type="author"><string-name><given-names>V. L.</given-names> <surname>Narasamma</surname></string-name>, <string-name><given-names>M.</given-names> <surname>Sreedevi</surname></string-name>, <string-name><given-names>G. V.</given-names> <surname>Kumar</surname></string-name> and <string-name><given-names>A.</given-names> <surname>Pradesh</surname></string-name></person-group>, &#x201C;<chapter-title>TweetShort text data analysis on COVID-19 outbreak</chapter-title>,&#x201D; in <source>Smart Technologies in Data Science and Communication</source>, vol. <volume>29</volume>. <publisher-loc>Singapore</publisher-loc>: <publisher-name>Springer</publisher-name>, pp. <fpage>183</fpage>&#x2013;<lpage>193</lpage>, <year>2021</year>.</mixed-citation></ref>
<ref id="ref-24"><label>[24]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>L.</given-names> <surname>He</surname></string-name> and <string-name><given-names>K.</given-names> <surname>Zheng</surname></string-name></person-group>, &#x201C;<article-title>How do general-purpose sentiment analyzers perform when applied to health-related online social media data?</article-title>,&#x201D; <source>Studies in Health Technology and Informatics</source>, vol. <volume>264</volume>, pp. <fpage>1208</fpage>&#x2013;<lpage>1212</lpage>, <year>2019</year>; <pub-id pub-id-type="pmid">31438117</pub-id></mixed-citation></ref>
<ref id="ref-25"><label>[25]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>N.</given-names> <surname>Kumaresh</surname></string-name>, <string-name><given-names>V.</given-names> <surname>Bonta</surname></string-name> and <string-name><given-names>N.</given-names> <surname>Janardhan</surname></string-name></person-group>, &#x201C;<article-title>A comprehensive study on lexicon based approaches for sentiment analysis</article-title>,&#x201D; <source>Asian Journal of Computer Science and Technology</source>, vol. <volume>8</volume>, no. <issue>S2</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>6</lpage>, <year>2019</year>. <pub-id pub-id-type="doi">10.51983/ajcst-2019.8.S2.2037</pub-id></mixed-citation></ref>
<ref id="ref-26"><label>[26]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>R. K.</given-names> <surname>Botchway</surname></string-name>, <string-name><given-names>A. B.</given-names> <surname>Jibril</surname></string-name>, <string-name><given-names>M. A.</given-names> <surname>Kwarteng</surname></string-name>, <string-name><given-names>M.</given-names> <surname>Chovancova</surname></string-name> and <string-name><given-names>Z. K.</given-names> <surname>Oplatkov&#x00E1;</surname></string-name></person-group>, &#x201C;<article-title>A review of social media posts from Unicredit bank in Europe: A sentiment analysis approach</article-title>,&#x201D; in <conf-name>Proc. of the 3rd Int. Conf. on Business and Information Management</conf-name>, <publisher-loc>New York, United States</publisher-loc>, pp. <fpage>74</fpage>&#x2013;<lpage>79</lpage>, <year>2020</year>. </mixed-citation></ref>
<ref id="ref-27"><label>[27]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>J.</given-names> <surname>Al-Garaady</surname></string-name> and <string-name><given-names>M.</given-names> <surname>Mahyoob</surname></string-name></person-group>, &#x201C;<article-title>Public sentiment analysis in social media on the SARS-CoV-2 vaccination using VADER lexicon polarity</article-title>,&#x201D; <source>Humanities and Educational Sciences Journal</source>, vol. <volume>22</volume>, no. <issue>1</issue>, pp. <fpage>591</fpage>&#x2013;<lpage>609</lpage>, <year>2022</year>.</mixed-citation></ref>
<ref id="ref-28"><label>[28]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>T.</given-names> <surname>Mustaqim</surname></string-name>, <string-name><given-names>K.</given-names> <surname>Umam</surname></string-name> and <string-name><given-names>M. A.</given-names> <surname>Muslim</surname></string-name></person-group>, &#x201C;<article-title>Twitter text mining for sentiment analysis on government&#x2019;s response to forest fires with vader lexicon polarity detection and k-nearest neighbor algorithm</article-title>,&#x201D; <source>Journal of Physics: Conference Series</source>, vol. <volume>1567</volume>, no. <issue>3</issue>, pp. <fpage>8</fpage>&#x2013;<lpage>15</lpage>, <year>2020</year>. <pub-id pub-id-type="doi">10.1088/1742-6596/1567/3/032024</pub-id></mixed-citation></ref>
<ref id="ref-29"><label>[29]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>E.</given-names> <surname>Apostolova</surname></string-name> and <string-name><given-names>R. A.</given-names> <surname>Kreek</surname></string-name></person-group>, &#x201C;<article-title>Training and prediction data discrepancies: Challenges of text classification with noisy, historical data</article-title>,&#x201D; <source>W-NUT</source>, <year>2018</year>. <pub-id pub-id-type="doi">10.18653/v1/w18-6114</pub-id></mixed-citation></ref>
<ref id="ref-30"><label>[30]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>A.</given-names> <surname>Krouska</surname></string-name>, <string-name><given-names>C.</given-names> <surname>Troussas</surname></string-name> and <string-name><given-names>M.</given-names> <surname>Virvou</surname></string-name></person-group>, &#x201C;<article-title>The effect of preprocessing techniques on Twitter sentiment analysis</article-title>,&#x201D; in <conf-name>2016 7th Int. Conf. on Information, Intelligence, Systems &#x0026; Applications (IISA)</conf-name>, <publisher-loc>Chalkidiki, Greece</publisher-loc>, <publisher-name>IEEE</publisher-name>, pp. <fpage>1</fpage>&#x2013;<lpage>5</lpage>, <year>2016</year>. </mixed-citation></ref>
<ref id="ref-31"><label>[31]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>S.</given-names> <surname>Elbagir</surname></string-name> and <string-name><given-names>J.</given-names> <surname>Yang</surname></string-name></person-group>, &#x201C;<article-title>Twitter sentiment analysis using natural language toolkit and Vader sentiment</article-title>,&#x201D; in <conf-name>Proc. of the Int. Multiconference of Engineers and Computer Scientists</conf-name>, <publisher-loc>Hong Kong</publisher-loc>, vol. <volume>2239</volume>, pp. <fpage>12</fpage>&#x2013;<lpage>16</lpage>, <year>2019</year>. </mixed-citation></ref>
<ref id="ref-32"><label>[32]</label><mixed-citation publication-type="other"><person-group person-group-type="author"><string-name><given-names>C.</given-names> <surname>Zacharias</surname></string-name> and <string-name><given-names>F.</given-names> <surname>Poldi</surname></string-name></person-group>, &#x201C;<article-title>GitHub - twintproject/twint: An advanced Twitter scraping &#x0026; OSINT tool written in Python that doesn&#x2019;t use Twitter&#x2019;s API, allowing you to scrape a user&#x2019;s followers, following, tweets and more while evading most API limitations</article-title>,&#x201D; <comment>February 2020</comment>. Available: <ext-link ext-link-type="uri" xlink:href="https://github.com/twintproject/twint">https://github.com/twintproject/twint</ext-link></mixed-citation></ref>
<ref id="ref-33"><label>[33]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>T.</given-names> <surname>Mehta</surname></string-name>, <string-name><given-names>G.</given-names> <surname>Kolase</surname></string-name>, <string-name><given-names>V.</given-names> <surname>Tekade</surname></string-name>, <string-name><given-names>R.</given-names> <surname>Sathe</surname></string-name> and <string-name><given-names>A.</given-names> <surname>Dhawale</surname></string-name></person-group>, &#x201C;<article-title>Price prediction and analysis of financial markets based on news, social feed, and sentiment index using machine learning and market data</article-title>,&#x201D; <source>International Research Journal of Engineering and Technology</source>, vol. <volume>7</volume>, no. <issue>6</issue>, pp. <fpage>483</fpage>&#x2013;<lpage>489</lpage>, <year>2020</year>.</mixed-citation></ref>
<ref id="ref-34"><label>[34]</label><mixed-citation publication-type="conf-proc"><person-group person-group-type="author"><string-name><given-names>E.</given-names> <surname>Loper</surname></string-name> and <string-name><given-names>S.</given-names> <surname>Bird</surname></string-name></person-group>, &#x201C;<article-title>NLTK: The natural language toolkit</article-title>,&#x201D; in <conf-name>Proc. of the ACL-02 Workshop on Effective Tools and Methodologies for Teaching Natural Language Processing and Computational Linguistics</conf-name>, <publisher-loc>Philadelphia, Pennsylvania</publisher-loc>, vol. <volume>1</volume>, pp. <fpage>63</fpage>&#x2013;<lpage>70</lpage>, <year>2002</year>. <pub-id pub-id-type="doi">10.3115/1118108.1118117</pub-id></mixed-citation></ref>
<ref id="ref-35"><label>[35]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>T.</given-names> <surname>Banerjee</surname></string-name> and <string-name><given-names>R. B.</given-names> <surname>Aniyery</surname></string-name></person-group>, &#x201C;<article-title>Thalassemia and its management during pregnancy</article-title>,&#x201D; <source>World Journal of Anemia</source>, vol. <volume>1</volume>, no. <issue>1</issue>, pp. <fpage>5</fpage>&#x2013;<lpage>17</lpage>, <year>2017</year>. <pub-id pub-id-type="doi">10.5005/jp-journals-10065</pub-id></mixed-citation></ref>
<ref id="ref-36"><label>[36]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>M. S.</given-names> <surname>Hossain</surname></string-name>, <string-name><given-names>E.</given-names> <surname>Raheem</surname></string-name>, <string-name><given-names>T. A.</given-names> <surname>Sultana</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Ferdous</surname></string-name>, <string-name><given-names>N.</given-names> <surname>Nahar</surname></string-name> <etal>et al.</etal></person-group><italic>,</italic> &#x201C;<article-title>Thalassemias in South Asia: Clinical lessons learnt from Bangladesh</article-title>,&#x201D; <source>Orphanet Journal of Rare Diseases</source>, vol. <volume>12</volume>, no. <issue>1</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>9</lpage>, <year>2017</year>. <pub-id pub-id-type="doi">10.1186/s13023-017-0643-z</pub-id>; <pub-id pub-id-type="pmid">28521805</pub-id></mixed-citation></ref>
<ref id="ref-37"><label>[37]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>B.</given-names> <surname>Cartwright</surname></string-name>, <string-name><given-names>R.</given-names> <surname>Frank</surname></string-name>, <string-name><given-names>G.</given-names> <surname>Weir</surname></string-name> and <string-name><given-names>K.</given-names> <surname>Padda</surname></string-name></person-group>, &#x201C;<article-title>Detecting and responding to hostile disinformation activities on social media using machine learning and deep neural networks</article-title>,&#x201D; <source>Neural Computing and Applications</source>, vol. <volume>34</volume>, no. <issue>18</issue>, pp. <fpage>15141</fpage>&#x2013;<lpage>15163</lpage>, <year>2022</year>. <pub-id pub-id-type="doi">10.1007/s00521-022-07296-0</pub-id></mixed-citation></ref>
<ref id="ref-38"><label>[38]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>S.</given-names> <surname>He</surname></string-name>, <string-name><given-names>Q.</given-names> <surname>Qin</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Yi</surname></string-name>, <string-name><given-names>Y.</given-names> <surname>Wei</surname></string-name>, <string-name><given-names>L.</given-names> <surname>Lin</surname></string-name> <etal>et al.</etal></person-group><italic>,</italic> &#x201C;<article-title>Prevalence and genetic analysis of &#x03B1;-and &#x03B2;-thalassemia in Baise region, a multi-ethnic region in southern China</article-title>,&#x201D; <source>Gene</source>, vol. <volume>619</volume>, no. <issue>1</issue>, pp. <fpage>71</fpage>&#x2013;<lpage>75</lpage>, <year>2017</year>. <pub-id pub-id-type="doi">10.1016/j.gene.2016.02.014</pub-id>; <pub-id pub-id-type="pmid">26877226</pub-id></mixed-citation></ref>
<ref id="ref-39"><label>[39]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>A.</given-names> <surname>Kathuria</surname></string-name>, <string-name><given-names>A.</given-names> <surname>Gupta</surname></string-name> and <string-name><given-names>R. K.</given-names> <surname>Singla</surname></string-name></person-group>, &#x201C;<article-title>AOH-senti: Aspect-oriented hybrid approach to sentiment analysis of students&#x2019; feedback</article-title>,&#x201D; <source>SN Computer Science</source>, vol. <volume>4</volume>, no. <issue>2</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>23</lpage>, <year>2023</year>. <pub-id pub-id-type="doi">10.1007/s42979-022-01611-1</pub-id></mixed-citation></ref>
<ref id="ref-40"><label>[40]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>H.</given-names> <surname>Li</surname></string-name>, <string-name><given-names>X. B.</given-names> <surname>Bruce</surname></string-name>, <string-name><given-names>G.</given-names> <surname>Li</surname></string-name> and <string-name><given-names>H.</given-names> <surname>Gao</surname></string-name></person-group>, &#x201C;<article-title>Restaurant survival prediction using customer-generated content: An aspect-based sentiment analysis of online reviews</article-title>,&#x201D; <source>Tourism Management</source>, vol. <volume>96</volume>, no. <issue>7</issue>, pp. <fpage>104707</fpage>, <year>2023</year>. <pub-id pub-id-type="doi">10.1016/j.tourman.2022.104707</pub-id></mixed-citation></ref>
<ref id="ref-41"><label>[41]</label><mixed-citation publication-type="journal"><person-group person-group-type="author"><string-name><given-names>R.</given-names> <surname>Jain</surname></string-name>, <string-name><given-names>A.</given-names> <surname>Kumar</surname></string-name>, <string-name><given-names>A.</given-names> <surname>Nayyar</surname></string-name>, <string-name><given-names>K.</given-names> <surname>Dewan</surname></string-name>, <string-name><given-names>R.</given-names> <surname>Garg</surname></string-name> <etal>et al.</etal></person-group><italic>,</italic> &#x201C;<article-title>Explaining sentiment analysis results on social media texts through visualization</article-title>,&#x201D; <source>Multimedia Tools and Applications</source>, vol. <volume>6</volume>, no. <issue>6</issue>, pp. <fpage>1</fpage>&#x2013;<lpage>17</lpage>, <year>2023</year>. <pub-id pub-id-type="doi">10.1007/s11042-023-14432-y</pub-id>; <pub-id pub-id-type="pmid">36747895</pub-id></mixed-citation></ref>
</ref-list>
</back>
</article>