Top 10 Words in Headline
# Re-encode the headline text from ISO-8859-1 to UTF-8 before tokenising.
data$titles <- iconv(data$titles, from = "ISO-8859-1", to = "UTF-8")

# Custom stopword list stripped from the headlines before words are counted.
stopwords_id <- c(
  "yang", "dan", "di", "ke", "dari", "ini", "itu", "untuk", "dengan",
  "pada", "adalah", "sebagai", "juga", "dalam", "tidak", "akan", "atau",
  "saya", "kami", "kita", "mereka", "anda", "ia", "saja", "the", "setelah",
  "juta", "nex", "hasil", "kena", "sabet", "apa", "catat", "ada", "gelar",
  "bisa", "hingga", "hadirkan", "ikut", "jadi", "saat", "bagaimana", "jika",
  "terus", "mau"
)

# Build the corpus from the headlines and clean it in one pipeline:
# lower-case, drop punctuation and digits, remove the stopwords, then
# collapse runs of whitespace. Each tm_map() step on this corpus emits a
# "transformation drops documents" warning.
corpus <- data$titles %>%
  VectorSource() %>%
  Corpus() %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(removeWords, stopwords_id) %>%
  tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(., content_transformer(tolower)): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeWords, stopwords_id): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
# Count how often each cleaned word occurs across all headlines and print
# the ten most frequent ones.
cleaned_titles <- sapply(corpus, as.character)
all_titles <- paste(cleaned_titles, collapse = " ")
words <- unlist(str_split(all_titles, "\\s+"))
# Stopword removal can leave a leading or trailing space on the joined text,
# which makes str_split() return empty strings; drop them so "" is never
# counted (or drawn) as a word.
words <- words[nzchar(words)]
word_freq <- as.data.frame(table(words))
word_freq <- word_freq %>%
  arrange(desc(Freq))
most_common_words <- head(word_freq, 10)
print(most_common_words)
## words Freq
## 1 hoaks 25
## 2 simak 9
## 3 cek 8
## 4 fakta 8
## 5 bank 7
## 6 undian 7
## 7 penipuan 6
## 8 seputar 6
## 9 benar 5
## 10 berhadiah 5
# Horizontal bar chart of the ten most frequent headline words, drawn in an
# orange gradient on a black background with the count printed inside each bar.
ggplot(
  data = most_common_words,
  mapping = aes(x = reorder(words, Freq), y = Freq, fill = Freq)
) +
  geom_col(show.legend = FALSE) +
  geom_text(
    mapping = aes(label = Freq),
    colour = "white",
    size = 4,
    hjust = 1.5
  ) +
  scale_fill_gradient(low = "#FF4500", high = "#FF8C00") +
  coord_flip() +
  labs(title = "Top 10 Words in Headline", x = "Kata", y = "Frekuensi") +
  theme_minimal() +
  theme(
    plot.title = element_text(
      colour = "darkorange", face = "bold", size = 20, hjust = 0.5
    ),
    axis.title = element_text(colour = "darkorange"),
    axis.text = element_text(colour = "darkorange"),
    panel.grid.major = element_line(colour = "grey"),
    panel.grid.minor = element_line(colour = "grey", linetype = "dashed"),
    panel.background = element_rect(fill = "black"),
    plot.background = element_rect(fill = "black")
  )

Wordcloud
# Fix the RNG seed so the word placement is reproducible between runs.
set.seed(013)
# Draw the word cloud on a black background. The text colours are interpolated
# from coral through light blue to dark orange (100 steps).
# par() returns the previous settings, which are restored afterwards so the
# black background does not leak into later base-graphics plots.
old_par <- par(bg = "black")
wordcloud(
  words = word_freq$words, freq = word_freq$Freq, min.freq = 1,
  max.words = 100, random.order = FALSE, rot.per = 0.35,
  colors = colorRampPalette(c("#FF7F50", "lightblue", "#FF8C00"))(100)
)
par(old_par)

# Show the ten most frequent words (the first ten rows of word_freq).
most_common_words <- word_freq %>%
  head(n = 10)
print(most_common_words)
## words Freq
## 1 hoaks 25
## 2 simak 9
## 3 cek 8
## 4 fakta 8
## 5 bank 7
## 6 undian 7
## 7 penipuan 6
## 8 seputar 6
## 9 benar 5
## 10 berhadiah 5
Top 10 Bigrams in Headline
# Tokenise the raw headlines into two-word sequences (bigrams).
bigrams <- data %>%
  unnest_tokens(bigram, titles, token = "ngrams", n = 2)

# Count each bigram, most frequent first. Headlines shorter than two words
# produce an NA bigram; drop those so a missing token is never ranked.
bigram_counts <- bigrams %>%
  dplyr::filter(!is.na(bigram)) %>%
  count(bigram, sort = TRUE)

top_10_bigrams <- head(bigram_counts, 10)

# The former separate()/unite() round trip rebuilt the same "word1 word2"
# strings (and would have turned an NA into the text "NA NA"), so the top-10
# table is used directly. The name is kept because the plot below reads it.
top_10_bigrams_separated <- top_10_bigrams
print(top_10_bigrams_separated)
## # A tibble: 10 × 2
## bigram n
## <chr> <int>
## 1 cek fakta 8
## 2 fakta tidak 5
## 3 tidak benar 5
## 4 dari bank 4
## 5 hoaks seputar 4
## 6 kumpulan hoaks 4
## 7 undian berhadiah 4
## 8 berhadiah dari 3
## 9 biar tak 3
## 10 hoaks yang 3
# Horizontal bar chart of the ten most frequent bigrams, shaded from
# LightBlue to Blue on a black background with the count inside each bar.
ggplot(
  data = top_10_bigrams_separated,
  mapping = aes(x = reorder(bigram, n), y = n, fill = n)
) +
  geom_col(show.legend = FALSE) +
  geom_text(
    mapping = aes(label = n),
    colour = "black",
    size = 4,
    hjust = 1.5
  ) +
  scale_fill_gradient(low = "#ADD8E6", high = "#0000FF") +
  coord_flip() +
  labs(title = "Top 10 Bigrams in Headline", x = "Bigram", y = "Frekuensi") +
  theme_minimal() +
  theme(
    plot.title = element_text(
      colour = "lightblue", face = "bold", size = 20, hjust = 0.5
    ),
    axis.title = element_text(colour = "lightblue"),
    axis.text = element_text(colour = "lightblue"),
    panel.grid.major = element_line(colour = "grey"),
    panel.grid.minor = element_line(colour = "grey", linetype = "dashed"),
    panel.background = element_rect(fill = "black"),
    plot.background = element_rect(fill = "black")
  )

Top 10 Trigrams in Headline
# Tokenise the raw headlines into three-word sequences (trigrams).
trigrams <- data %>%
  unnest_tokens(trigram, titles, token = "ngrams", n = 3)

# Count each trigram, most frequent first. Headlines shorter than three words
# produce an NA trigram; drop those so a missing token is never ranked.
trigram_counts <- trigrams %>%
  dplyr::filter(!is.na(trigram)) %>%
  count(trigram, sort = TRUE)

top_10_trigrams <- head(trigram_counts, 10)

# The former separate()/unite() round trip rebuilt the same three-word strings
# (and would have turned an NA into the text "NA NA NA"), so the top-10 table
# is used directly. The name is kept because the plot below reads it.
top_10_trigrams_separated <- top_10_trigrams
# Horizontal bar chart of the ten most frequent trigrams, shaded from
# PaleGreen to DarkGreen on a black background with the count inside each bar.
ggplot(
  data = top_10_trigrams_separated,
  mapping = aes(x = reorder(trigram, n), y = n, fill = n)
) +
  geom_col(show.legend = FALSE) +
  geom_text(
    mapping = aes(label = n),
    colour = "black",
    size = 4,
    hjust = 1.5
  ) +
  scale_fill_gradient(low = "#98FB98", high = "#006400") +
  coord_flip() +
  labs(title = "Top 10 Trigrams in Headline", x = "Trigram", y = "Frekuensi") +
  theme_minimal() +
  theme(
    plot.title = element_text(
      colour = "lightgreen", face = "bold", size = 20, hjust = 0.5
    ),
    axis.title = element_text(colour = "lightgreen"),
    axis.text = element_text(colour = "lightgreen"),
    panel.grid.major = element_line(colour = "grey"),
    panel.grid.minor = element_line(colour = "grey", linetype = "dashed"),
    panel.background = element_rect(fill = "black"),
    plot.background = element_rect(fill = "black")
  )

Analisis Sentimen
# Score every headline with get_sentiment() using the "syuzhet" method.
# NOTE(review): the stopword list suggests the headlines are Indonesian, while
# this lexicon is presumably English -- confirm the scores are meaningful.
data$Sentiment <- get_sentiment(data$titles, method = "syuzhet")

# Label each score as negative, positive or neutral; anything that is neither
# below nor above zero falls through to "Netral".
data <- mutate(
  data,
  Sentiment_Class = case_when(
    Sentiment < 0 ~ "Negatif",
    Sentiment > 0 ~ "Positif",
    TRUE ~ "Netral"
  )
)

# Write the scored data to a new CSV file; the path is relative, so it is
# resolved against the current working directory.
output_path <- "sentiment_analysis_results.csv"
write_csv(data, output_path)

# Tally the number of headlines in each sentiment class and display it.
sentiment_count <- count(data, Sentiment_Class)
sentiment_count
## # A tibble: 2 × 2
## Sentiment_Class n
## <chr> <int>
## 1 Netral 40
## 2 Positif 2
# Share of each sentiment class, in percent of all headlines.
sentiment_count$percentage <- (sentiment_count$n / sum(sentiment_count$n)) * 100

# Mid-point of each slice along the stacked axis, used to anchor the outside
# labels. Rows are sorted in descending class order before the cumulative sum
# so the positions line up with the order in which the slices are stacked.
sentiment_count <- sentiment_count %>%
  arrange(desc(Sentiment_Class)) %>%
  mutate(ypos = cumsum(percentage) - 0.5 * percentage)

# Pie chart: a single stacked bar wrapped into polar coordinates. The raw
# counts are printed inside the slices; class name and percentage sit in a
# label outside, joined to the slice by a short connector line.
ggplot(sentiment_count, aes(x = "", y = percentage, fill = Sentiment_Class)) +
  geom_bar(stat = "identity", width = 1, color = "black") + # black slice borders
  coord_polar(theta = "y") +
  theme_void() +
  geom_text(
    aes(label = n),
    position = position_stack(vjust = 0.5), color = "white", size = 5
  ) +
  geom_label(
    aes(
      x = 1.7, y = ypos,
      label = paste0(Sentiment_Class, ": ", round(percentage, 1), "%")
    ),
    color = "black", fill = "white", label.size = 0.25, hjust = 0
  ) +
  geom_segment(
    aes(x = 1.1, xend = 1.7, y = ypos, yend = ypos),
    color = "black"
  ) +
  # Every class the classification step can emit needs a colour; without an
  # entry for "Negatif" such a slice would be drawn with the NA fill.
  scale_fill_manual(
    values = c(
      "Positif" = "#CD6600",
      "Netral" = "#FF8C00",
      "Negatif" = "#8B4500"
    )
  ) +
  theme(legend.position = "none")

Trends of News Headlines over Time
# Parse the month/day/year date strings into Date values.
data$dates <- as.Date(data$dates, format = "%m/%d/%Y")

# Tabulate the number of headlines per day and expose it as a two-column
# data frame (Date, Count).
# NOTE(review): as.data.frame() on a table keeps the dates as factor labels,
# so Date is not a Date column here -- confirm that is intended for plotting.
news_count_per_day <- table(data$dates)
df_news_count <- setNames(
  as.data.frame(news_count_per_day),
  c("Date", "Count")
)
## Date Count
## 1 2024-06-04 4
## 2 2024-06-05 3
## 3 2024-06-06 4
## 4 2024-06-07 5
## 5 2024-06-08 3
## 6 2024-06-09 5
## 7 2024-06-10 2
## 8 2024-06-11 7
## 9 2024-06-12 3
## 10 2024-06-14 6
# Bin the daily counts into five equal-width intervals; the bin index drives
# the colour of the line.
df_news_count$Color <- as.numeric(cut(df_news_count$Count, breaks = 5))
gradient_palette <- scale_color_gradient(low = "lightyellow", high = "orange")

# Line chart of the number of headlines per day on a black background, with
# each point labelled by its count.
ggplot(df_news_count, aes(x = Date, y = Count, group = 1, color = Color)) +
  # `linewidth` replaces `size` for lines, which is deprecated since
  # ggplot2 3.4.0.
  geom_line(linewidth = 1.2) +
  geom_point(color = "#CD6600", size = 3) +
  geom_text(aes(label = Count), vjust = -1.2, color = "darkorange") +
  gradient_palette +
  labs(
    title = "Trends of News Headlines over Time",
    x = "Tanggal",
    y = "Jumlah Artikel"
  ) +
  theme_minimal() +
  theme(
    plot.background = element_rect(fill = "black", color = NA),
    panel.background = element_rect(fill = "black", color = NA),
    panel.grid.major = element_line(color = "black"),
    panel.grid.minor = element_blank(),
    plot.title = element_text(
      hjust = 0.5, color = "orange", size = 18, face = "bold"
    ),
    axis.title.x = element_text(color = "lightyellow", size = 10),
    axis.title.y = element_text(color = "lightyellow", size = 10),
    axis.text.x = element_text(angle = 45, hjust = 1, color = "orange"),
    axis.text.y = element_text(color = "orange"),
    plot.margin = margin(t = 10, r = 10, b = 30, l = 30),
    # Frame drawn around the plotting panel.
    panel.border = element_rect(
      color = "lightyellow", fill = NA, linewidth = 1.5
    ),
    legend.position = "none"
  ) +
  # Take the y limits from the data instead of hard-coding c(2, 7), so days
  # with fewer or more headlines are not silently dropped. The extra expansion
  # at the top keeps the label above the highest point from being clipped.
  scale_y_continuous(
    limits = range(df_news_count$Count),
    expand = expansion(mult = c(0.09, 0.15))
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
