1 Library

library(tm)
## Warning: package 'tm' was built under R version 4.3.3
## Loading required package: NLP
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.3.3
## Loading required package: RColorBrewer
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.3.3
library(tidyr)
library(readr)
library(stringr)
library(syuzhet)
## Warning: package 'syuzhet' was built under R version 4.3.3
library(text)
## Warning: package 'text' was built under R version 4.3.3
## This is text (version 1.2.1).
## Text is new and still rapidly improving.
##                
## Newer versions may have improved functions and updated defaults to reflect current understandings of the state-of-the-art.
##                Please send us feedback based on your experience.
## 
## For more information about the package see www.r-text.org.

2 Memasukkan Data

# Load the scraped headlines from the working directory into a tibble.
dataasli <- read_csv(file = "PRAKMDS.scraping.csv")
## Rows: 60 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): _id, titles, dates, links
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows of the raw data.
head(dataasli, n = 6L)
## # A tibble: 6 × 4
##   `_id`                    titles                                    dates links
##   <chr>                    <chr>                                     <chr> <chr>
## 1 666053e9d5c7826f4f019b11 Cek Fakta: Tidak Benar Informasi Gebyar … 6/5/… http…
## 2 666053e9d5c7826f4f019b12 Cek Fakta: Tidak Benar Gebyar Undian Ban… 6/5/… http…
## 3 666053e9d5c7826f4f019b13 Hoaks Undiah dari Bank Terus Berkembang,… 6/4/… http…
## 4 666053e9d5c7826f4f019b14 Tokoh Terkenal Dicatut Hoaks Pendaftaran… 6/4/… http…
## 5 666053e9d5c7826f4f019b15 Waspada Praktik Calo dan Penipuan, Polri… 6/4/… http…
## 6 666053e9d5c7826f4f019b16 Cek Fakta: Tidak Benar Pendaftaran Progr… 6/4/… http…

3 Mengatasi Duplikasi Data

# Row count of the raw scrape, before any cleaning.
jumlahdataasli <- nrow(dataasli)

# Keep one row per distinct headline; the remaining columns are retained.
data <- distinct(dataasli, titles, .keep_all = TRUE)

# Row count once duplicate headlines have been removed.
jumlahdata <- nrow(data)

# Before/after summary table of the two row counts.
kondisi_labels <- c("Sebelum Penghapusan", "Setelah Penghapusan")
jumlah_data <- data.frame(
  Kondisi = kondisi_labels,
  Jumlah = c(jumlahdataasli, jumlahdata)
)
jumlah_data
##               Kondisi Jumlah
## 1 Sebelum Penghapusan     60
## 2 Setelah Penghapusan     42
# Lollipop chart comparing the number of rows before and after de-duplication.
ggplot(jumlah_data, aes(x = Kondisi, y = Jumlah, fill = Kondisi)) +
  geom_segment(
    aes(xend = Kondisi, yend = 0),
    color = "#CD6600", linewidth = 1
  ) +
  geom_point(size = 10, color = "#FF8C00") +
  # Count label drawn inside the point.
  geom_text(aes(label = Jumlah), vjust = 0.5, color = "white", size = 5) +
  # The names must match the values of `Kondisi` exactly; otherwise the
  # manual scale cannot assign a colour to either level.
  scale_fill_manual(
    values = c(
      "Sebelum Penghapusan" = "#CD6600",
      "Setelah Penghapusan" = "#FF8C00"
    )
  ) +
  coord_flip() +
  labs(
    title = "Jumlah Data Berita",
    x = "Kondisi",
    y = "Jumlah Data"
  ) +
  theme_minimal() +
  theme(
    plot.background = element_rect(fill = "black"),
    panel.background = element_rect(fill = "black"),
    panel.grid.major = element_line(color = "black"),
    panel.grid.minor = element_blank(),
    axis.text = element_text(color = "white"),
    axis.title = element_text(color = "darkorange"),
    # Centered, bold title.
    plot.title = element_text(
      color = "darkorange", hjust = 0.5, size = 15, face = "bold"
    ),
    # `linewidth` replaces the `size` argument deprecated in ggplot2 3.4.0.
    panel.border = element_rect(color = "lightyellow", fill = NA, linewidth = 1.5),
    legend.position = "none"
  )
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

4 Visualisasi Data

4.1 Top 10 Words in Headline

# readr returns strings as UTF-8, so re-decoding them as ISO-8859-1 would
# double-encode every non-ASCII character (curly quotes, dashes, ...).
# Validate the text as UTF-8 instead and drop any byte sequence that is not
# valid. NOTE(review): assumes the CSV itself is UTF-8 - confirm against the
# export that produced it.
data$titles <- iconv(data$titles, from = "UTF-8", to = "UTF-8", sub = "")

# One corpus document per headline.
corpus <- Corpus(VectorSource(data$titles))

# Hand-picked stopword list (mostly Indonesian function words plus a few
# low-information headline terms) removed before counting.
stopwords_id <- c(
  "yang", "dan", "di", "ke", "dari", "ini", "itu", "untuk", "dengan", "pada",
  "adalah", "sebagai", "juga", "dalam", "tidak", "akan", "atau", "saya",
  "kami", "kita", "mereka", "anda", "ia", "saja", "the", "setelah", "juta",
  "nex", "hasil", "kena", "sabet", "apa", "catat", "ada", "gelar", "bisa",
  "hingga", "hadirkan", "ikut", "jadi", "saat", "bagaimana", "jika", "terus",
  "mau"
)

# Normalise the text: lower-case, strip punctuation and digits, remove the
# stopwords, then collapse the runs of whitespace left behind.
corpus <- corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(removeWords, stopwords_id) %>%
  tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(., content_transformer(tolower)): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeWords, stopwords_id): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
# Pull the cleaned text back out of the corpus, one string per headline.
# vapply() guarantees a character vector regardless of corpus size.
cleaned_titles <- vapply(corpus, as.character, character(1))

# Tokenise all headlines at once by splitting on whitespace.
all_titles <- paste(cleaned_titles, collapse = " ")
words <- unlist(str_split(all_titles, "\\s+"))

# Stopword removal can leave leading/trailing blanks, which the split turns
# into empty-string tokens; drop them so "" is never counted as a word.
words <- words[nzchar(words)]

# Frequency table sorted from most to least common.
word_freq <- as.data.frame(table(words))
word_freq <- word_freq %>% arrange(desc(Freq))

most_common_words <- head(word_freq, 10)
print(most_common_words)
##        words Freq
## 1      hoaks   25
## 2      simak    9
## 3        cek    8
## 4      fakta    8
## 5       bank    7
## 6     undian    7
## 7   penipuan    6
## 8    seputar    6
## 9      benar    5
## 10 berhadiah    5
# Dark colour scheme for the top-words chart: black canvas, grey grid,
# orange text.
top_words_theme <- theme(
  plot.title = element_text(
    color = "darkorange", hjust = 0.5, size = 20, face = "bold"
  ),
  axis.title = element_text(color = "darkorange"),
  axis.text = element_text(color = "darkorange"),
  panel.grid.minor = element_line(color = "grey", linetype = "dashed"),
  panel.grid.major = element_line(color = "grey"),
  panel.background = element_rect(fill = "black"),
  plot.background = element_rect(fill = "black")
)

# Horizontal bar chart of the ten most frequent words, ordered by frequency
# and shaded by count.
ggplot(
  data = most_common_words,
  mapping = aes(x = reorder(words, Freq), y = Freq, fill = Freq)
) +
  geom_col(show.legend = FALSE) +
  # Frequency label placed just inside the end of each bar.
  geom_text(mapping = aes(label = Freq), hjust = 1.5, color = "white", size = 4) +
  coord_flip() +
  scale_fill_gradient(low = "#FF4500", high = "#FF8C00") +
  labs(title = "Top 10 Words in Headline", x = "Kata", y = "Frekuensi") +
  theme_minimal() +
  top_words_theme

4.2 Wordcloud

# Fix the RNG so the word layout is reproducible. (013 is not octal in R; it
# is simply 13, so write it that way.)
set.seed(13)

# Draw the word cloud on a black background. par() returns the previous
# setting, which is restored afterwards so later base-graphics plots are not
# affected.
old_par <- par(bg = "black")
wordcloud(
  words = word_freq$words, freq = word_freq$Freq, min.freq = 1,
  max.words = 100, random.order = FALSE, rot.per = 0.35,
  colors = colorRampPalette(c("#FF7F50", "lightblue", "#FF8C00"))(100)
)
par(old_par)

# Show the ten most frequent words.
most_common_words <- head(word_freq, 10)
print(most_common_words)
##        words Freq
## 1      hoaks   25
## 2      simak    9
## 3        cek    8
## 4      fakta    8
## 5       bank    7
## 6     undian    7
## 7   penipuan    6
## 8    seputar    6
## 9      benar    5
## 10 berhadiah    5

4.3 Top 10 Bigrams in Headline

# Split every headline into overlapping two-word sequences. Headlines with
# fewer than two words yield a missing bigram, which is dropped so it cannot
# be counted as a category.
bigrams <- data %>%
  unnest_tokens(bigram, titles, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram))

# Count each bigram, most frequent first, and keep the top ten.
bigram_counts <- bigrams %>%
  count(bigram, sort = TRUE)
top_10_bigrams <- head(bigram_counts, 10)

# The earlier separate()/unite() round trip only rebuilt the same string (and
# would turn a missing bigram into the literal "NA NA"), so the table is
# used as is. The name is kept for the plotting code that follows.
top_10_bigrams_separated <- top_10_bigrams
print(top_10_bigrams_separated)
## # A tibble: 10 × 2
##    bigram               n
##    <chr>            <int>
##  1 cek fakta            8
##  2 fakta tidak          5
##  3 tidak benar          5
##  4 dari bank            4
##  5 hoaks seputar        4
##  6 kumpulan hoaks       4
##  7 undian berhadiah     4
##  8 berhadiah dari       3
##  9 biar tak             3
## 10 hoaks yang           3
# Dark colour scheme for the bigram chart: black canvas, grey grid,
# light-blue text.
bigram_theme <- theme(
  plot.title = element_text(
    color = "lightblue", hjust = 0.5, size = 20, face = "bold"
  ),
  axis.title = element_text(color = "lightblue"),
  axis.text = element_text(color = "lightblue"),
  panel.grid.minor = element_line(color = "grey", linetype = "dashed"),
  panel.grid.major = element_line(color = "grey"),
  panel.background = element_rect(fill = "black"),
  plot.background = element_rect(fill = "black")
)

# Horizontal bar chart of the ten most frequent bigrams, ordered by count.
ggplot(
  data = top_10_bigrams_separated,
  mapping = aes(x = reorder(bigram, n), y = n, fill = n)
) +
  geom_col(show.legend = FALSE) +
  # Count label placed just inside the end of each bar.
  geom_text(mapping = aes(label = n), hjust = 1.5, color = "black", size = 4) +
  coord_flip() +
  # Light blue for low counts through to pure blue for high counts.
  scale_fill_gradient(low = "#ADD8E6", high = "#0000FF") +
  labs(title = "Top 10 Bigrams in Headline", x = "Bigram", y = "Frekuensi") +
  theme_minimal() +
  bigram_theme

4.4 Top 10 Trigrams in Headline

# Split every headline into overlapping three-word sequences. Headlines with
# fewer than three words yield a missing trigram, which is dropped so it
# cannot be counted as a category.
trigrams <- data %>%
  unnest_tokens(trigram, titles, token = "ngrams", n = 3) %>%
  filter(!is.na(trigram))

# Count each trigram, most frequent first, and keep the top ten.
trigram_counts <- trigrams %>%
  count(trigram, sort = TRUE)
top_10_trigrams <- head(trigram_counts, 10)

# The earlier separate()/unite() round trip only rebuilt the same string (and
# would turn a missing trigram into the literal "NA NA NA"), so the table is
# used as is. The name is kept for the plotting code that follows.
top_10_trigrams_separated <- top_10_trigrams
# Dark colour scheme for the trigram chart: black canvas, grey grid,
# light-green text.
trigram_theme <- theme(
  plot.title = element_text(
    color = "lightgreen", hjust = 0.5, size = 20, face = "bold"
  ),
  axis.title = element_text(color = "lightgreen"),
  axis.text = element_text(color = "lightgreen"),
  panel.grid.minor = element_line(color = "grey", linetype = "dashed"),
  panel.grid.major = element_line(color = "grey"),
  panel.background = element_rect(fill = "black"),
  plot.background = element_rect(fill = "black")
)

# Horizontal bar chart of the ten most frequent trigrams, ordered by count.
ggplot(
  data = top_10_trigrams_separated,
  mapping = aes(x = reorder(trigram, n), y = n, fill = n)
) +
  geom_col(show.legend = FALSE) +
  # Count label placed just inside the end of each bar.
  geom_text(mapping = aes(label = n), hjust = 1.5, color = "black", size = 4) +
  coord_flip() +
  # Pale green for low counts through to dark green for high counts.
  scale_fill_gradient(low = "#98FB98", high = "#006400") +
  labs(title = "Top 10 Trigrams in Headline", x = "Trigram", y = "Frekuensi") +
  theme_minimal() +
  trigram_theme

4.5 Analisis Sentimen

# Score each headline with the "syuzhet" method.
# NOTE(review): the headlines are Indonesian; confirm that this lexicon
# covers the language, since unmatched words contribute nothing to the score.
sentiment_scores <- get_sentiment(data$titles, method = "syuzhet")

# Attach the score and bucket it by sign: below zero is negative, above zero
# is positive, anything else is neutral.
data <- data %>%
  mutate(
    Sentiment = sentiment_scores,
    Sentiment_Class = case_when(
      Sentiment < 0 ~ "Negatif",
      Sentiment > 0 ~ "Positif",
      TRUE ~ "Netral"
    )
  )

# Write the scored data to a new CSV file in the current working directory.
output_path <- "sentiment_analysis_results.csv"
write_csv(x = data, file = output_path)

# Number of headlines in each sentiment class.
sentiment_count <- count(data, Sentiment_Class)
sentiment_count
## # A tibble: 2 × 2
##   Sentiment_Class     n
##   <chr>           <int>
## 1 Netral             40
## 2 Positif             2
# Share of each class in percent, plus the mid-point of each slice along the
# stacked axis (used to anchor the outside labels and their leader lines).
sentiment_count <- sentiment_count %>%
  mutate(percentage = (n / sum(n)) * 100) %>%
  arrange(desc(Sentiment_Class)) %>%
  mutate(ypos = cumsum(percentage) - 0.5 * percentage)

# Pie chart: a single stacked bar wrapped into polar coordinates.
ggplot(sentiment_count, aes(x = "", y = percentage, fill = Sentiment_Class)) +
  # Black border separates the slices.
  geom_bar(stat = "identity", width = 1, color = "black") +
  coord_polar(theta = "y") +
  theme_void() +
  # Raw count centred inside each slice.
  geom_text(
    aes(label = n),
    position = position_stack(vjust = 0.5), color = "white", size = 5
  ) +
  # Class name and percentage in a box outside the pie.
  geom_label(
    aes(
      x = 1.7, y = ypos,
      label = paste0(Sentiment_Class, ": ", round(percentage, 1), "%")
    ),
    color = "black", fill = "white", label.size = 0.25, hjust = 0
  ) +
  # Leader line from the slice to its outside label.
  geom_segment(
    aes(x = 1.1, xend = 1.7, y = ypos, yend = ypos),
    color = "black"
  ) +
  # All three classes the classifier can emit get a colour, so a "Negatif"
  # slice is not left unfilled if one appears.
  scale_fill_manual(
    values = c(
      "Positif" = "#CD6600",
      "Netral" = "#FF8C00",
      "Negatif" = "#8B4500"
    )
  ) +
  theme(legend.position = "none")