1- Install and load the required packages:

# Load the AFINN sentiment lexicon through tidytext; the join later in this
# script relies on its `word` and `value` columns.
afinn <- tidytext::get_sentiments(lexicon = "afinn")

2- Preparing the data for analysis

# Read the raw tweet export from the working directory into a tibble.
# NOTE(review): the printed column specification shows id, replyToSID and
# replyToUID parsed as double; if exact IDs are ever needed, confirm they are
# represented exactly or read those columns as character.
tweets <- readr::read_csv(file = "tweets.csv")

## New names:
## Rows: 4000 Columns: 17
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (4): text, replyToSN, statusSource, screenName dbl (8): ...1, favoriteCount,
## replyToSID, id, replyToUID, retweetCount, lon... lgl (4): favorited, truncated,
## isRetweet, retweeted dttm (1): created
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

# Preview the first six rows to sanity-check the import.
head(tweets, n = 6L)

## # A tibble: 6 × 17
##    ...1 text     favorited favoriteCount replyToSN created             truncated
##   <dbl> <chr>    <lgl>             <dbl> <chr>     <dttm>              <lgl>    
## 1     1 "RT @Ka… FALSE                 0 <NA>      2017-11-30 23:59:14 FALSE    
## 2     2 "RT @Ka… FALSE                 0 <NA>      2017-11-30 23:58:58 FALSE    
## 3     3 "RT @Ka… FALSE                 0 <NA>      2017-11-30 23:57:29 FALSE    
## 4     4 "RT @ju… FALSE                 0 <NA>      2017-11-30 23:57:07 FALSE    
## 5     5 "RT @Ka… FALSE                 0 <NA>      2017-11-30 23:56:33 FALSE    
## 6     6 "RT @Ka… FALSE                 0 <NA>      2017-11-30 23:56:32 FALSE    
## # ℹ 10 more variables: replyToSID <dbl>, id <dbl>, replyToUID <dbl>,
## #   statusSource <chr>, screenName <chr>, retweetCount <dbl>, isRetweet <lgl>,
## #   retweeted <lgl>, longitude <dbl>, latitude <dbl>

# Give every tweet a sequential integer id, keep only the id and the text,
# then split the text into one word token per row.
tidy_tweets <- tweets %>%
  rowid_to_column(var = "tweet_id") %>%
  select(all_of(c("tweet_id", "text"))) %>%
  unnest_tokens(output = word, input = text)

# Preview the first six tokens
head(tidy_tweets, n = 6L)

## # A tibble: 6 × 2
##   tweet_id word      
##      <int> <chr>     
## 1        1 rt        
## 2        1 karnamikko
## 3        1 i         
## 4        1 guess     
## 5        1 spain     
## 6        1 will

3- Performing sentiment analysis

# Attach an AFINN value to every token (tokens missing from the lexicon get
# NA) and keep the result grouped by tweet.
sentiment_scores <- group_by(
  left_join(tidy_tweets, afinn, by = "word"),
  tweet_id
)

# Net score per tweet: the sum of the matched values. Unmatched (NA) tokens are
# ignored, so a tweet without any lexicon word scores 0.
sentiment_summary <- summarise(
  sentiment_scores,
  sentiment_score = sum(value, na.rm = TRUE)
)

# Print the per-tweet scores
sentiment_summary

## # A tibble: 4,000 × 2
##    tweet_id sentiment_score
##       <int>           <dbl>
##  1        1              -2
##  2        2              -2
##  3        3              -2
##  4        4              -2
##  5        5              -2
##  6        6              -2
##  7        7              -2
##  8        8              -2
##  9        9              -2
## 10       10              -2
## # ℹ 3,990 more rows

4- Visualizing the results

# Percentage of tweets whose net score is negative (-1), neutral (0) or
# positive (1). The factor levels are fixed explicitly so that all three
# categories are always present in the table: a category with no tweets is
# reported as 0 instead of silently disappearing.
sentiment_percentage <- prop.table(
  table(factor(sign(sentiment_summary$sentiment_score), levels = c(-1, 0, 1)))
) * 100
sentiment_percentage

## 
##     -1      0      1 
## 32.375 46.425 21.200

# Pie chart of the sentiment shares: a single stacked bar drawn in polar
# coordinates, with every slice labelled by its percentage.
data.frame(
  sentiment = names(sentiment_percentage),
  percentage = as.numeric(sentiment_percentage)
) %>%
  ggplot(aes(x = "", y = percentage, fill = sentiment)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  geom_text(
    aes(label = sprintf("%.2f%%", percentage)),
    position = position_stack(vjust = 0.5),
    color = "black"
  ) +
  coord_polar(theta = "y") +
  scale_fill_manual(values = c("-1" = "red", "0" = "grey", "1" = "green")) +
  labs(title = "Sentiment Distribution", fill = "Sentiment Category") +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    axis.title = element_blank(),
    panel.grid = element_blank(),
    # NOTE(review): negative margins on all four sides, presumably to let the
    # pie fill more of the plotting area -- confirm this is intended.
    plot.margin = unit(c(-2, -2, -2, -2), "cm")
  )

We read the data set tweets.csv and performed a sentiment analysis using the AFINN lexicon, one of the most commonly used sentiment lexicons in R. We then categorized the sentiment scores into three groups: negative, neutral and positive. Finally, we calculated the percentage share of each group and drew a pie chart to visualize the result clearly (negative = red, neutral = grey, positive = green).

According to the results, most of the tweets are neutral (approximately 46.4%). About 32.4% of the tweets have a negative sentiment, so negative tweets are more frequent than positive ones (21.2%).

It’s important to note that these percentages are based on the sentiment scores calculated using the AFINN lexicon, and that the categorization into negative, neutral, and positive is determined by the sign of each tweet’s sentiment score.

Text Mining - Sentiment Analysis

Gizem Güleli

2023-11-24

1- Install and load the required packages:

2- Preparing the data for analysis

3- Performing sentiment analysis

4- Visualizing the results