1 Library

library(tm)

## Warning: package 'tm' was built under R version 4.3.3

## Loading required package: NLP

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(wordcloud)

## Warning: package 'wordcloud' was built under R version 4.3.3

## Loading required package: RColorBrewer

library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(tidytext)

## Warning: package 'tidytext' was built under R version 4.3.3

library(tidyr)
library(readr)
library(stringr)
library(syuzhet)

## Warning: package 'syuzhet' was built under R version 4.3.3

library(text)

## Warning: package 'text' was built under R version 4.3.3

## This is text (version 1.2.1).
## Text is new and still rapidly improving.
##                
## Newer versions may have improved functions and updated defaults to reflect current understandings of the state-of-the-art.
##                Please send us feedback based on your experience.
## 
## For more information about the package see www.r-text.org.

2 Memasukkan Data

# Load the scraped headline data from the CSV file into a tibble.
dataasli <- read_csv("PRAKMDS.scraping.csv")

## Rows: 60 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): _id, titles, dates, links
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Preview the first rows of the raw data.
head(dataasli)

## # A tibble: 6 × 4
##   `_id`                    titles                                    dates links
##   <chr>                    <chr>                                     <chr> <chr>
## 1 666053e9d5c7826f4f019b11 Cek Fakta: Tidak Benar Informasi Gebyar … 6/5/… http…
## 2 666053e9d5c7826f4f019b12 Cek Fakta: Tidak Benar Gebyar Undian Ban… 6/5/… http…
## 3 666053e9d5c7826f4f019b13 Hoaks Undiah dari Bank Terus Berkembang,… 6/4/… http…
## 4 666053e9d5c7826f4f019b14 Tokoh Terkenal Dicatut Hoaks Pendaftaran… 6/4/… http…
## 5 666053e9d5c7826f4f019b15 Waspada Praktik Calo dan Penipuan, Polri… 6/4/… http…
## 6 666053e9d5c7826f4f019b16 Cek Fakta: Tidak Benar Pendaftaran Progr… 6/4/… http…

3 Mengatasi Duplikasi Data

# Row count of the raw scrape, before any de-duplication.
jumlahdataasli <- nrow(dataasli)

# Keep one row per distinct headline; the other columns of that row are
# retained.
data <- distinct(dataasli, titles, .keep_all = TRUE)

# Row count once duplicated headlines are gone.
jumlahdata <- nrow(data)

# Before/after summary table used by the comparison chart.
jumlah_data <- data.frame(
  Kondisi = c("Sebelum Penghapusan", "Setelah Penghapusan"),
  Jumlah = c(jumlahdataasli, jumlahdata)
)
jumlah_data

##               Kondisi Jumlah
## 1 Sebelum Penghapusan     60
## 2 Setelah Penghapusan     42

# Lollipop chart comparing the number of rows before and after removing
# duplicated headlines.
ggplot(jumlah_data, aes(x = Kondisi, y = Jumlah, fill = Kondisi)) +
  geom_segment(aes(xend = Kondisi, yend = 0), color = "#CD6600", linewidth = 1) +
  geom_point(size = 10, color = "#FF8C00") +
  # Count label drawn inside each point.
  geom_text(aes(label = Jumlah), vjust = 0.5, color = "white", size = 5) +
  # Palette names must equal the values of `Kondisi`, otherwise the manual
  # scale cannot match them.
  scale_fill_manual(values = c(
    "Sebelum Penghapusan" = "#CD6600",
    "Setelah Penghapusan" = "#FF8C00"
  )) +
  coord_flip() +
  labs(title = 'Jumlah Data Berita',x = "Kondisi",
       y = "Jumlah Data") +
  theme_minimal() +
  theme(
    plot.background = element_rect(fill = "black"),
    panel.background = element_rect(fill = "black"),
    panel.grid.major = element_line(color = "black"),
    panel.grid.minor = element_blank(),
    axis.text = element_text(color = "white"),
    axis.title = element_text(color = "darkorange"),
    # hjust = 0.5 centres the title.
    plot.title = element_text(color = "darkorange", hjust = 0.5, size = 15, face = "bold"),
    # `linewidth` replaces the `size` argument deprecated in ggplot2 3.4.0.
    panel.border = element_rect(color = "lightyellow", fill = NA, linewidth = 1.5),
    legend.position = "none"
  )

## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

4 Visualisasi Data

4.1 Top 10 Words in Headline

# Re-encode the headlines, treating the stored bytes as ISO-8859-1.
# NOTE(review): if the titles are already UTF-8 this step would garble
# non-ASCII characters -- confirm the encoding of the source CSV.
data$titles <- iconv(data$titles, from = "ISO-8859-1", to = "UTF-8")

# One corpus document per headline.
corpus <- Corpus(VectorSource(data$titles))

# Hand-picked stop words removed from the headlines before counting.
stopwords_id <- c(
  "yang", "dan", "di", "ke", "dari", "ini", "itu", "untuk", "dengan",
  "pada", "adalah", "sebagai", "juga", "dalam", "tidak", "akan", "atau",
  "saya", "kami", "kita", "mereka", "anda", "ia", "saja", "the", "setelah",
  "juta", "nex", "hasil", "kena", "sabet", "apa", "catat", "ada", "gelar",
  "bisa", "hingga", "hadirkan", "ikut", "jadi", "saat", "bagaimana", "jika",
  "terus", "mau"
)

# Cleaning steps, applied one after another: lower-case, strip punctuation
# and digits, drop the stop words, then collapse repeated whitespace.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords_id)
corpus <- tm_map(corpus, stripWhitespace)

## Warning in tm_map.SimpleCorpus(., content_transformer(tolower)): transformation
## drops documents

## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents

## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops
## documents

## Warning in tm_map.SimpleCorpus(., removeWords, stopwords_id): transformation
## drops documents

## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents

# Build the word-frequency table from the cleaned corpus and print the ten
# most frequent words.

# One cleaned string per document; vapply() keeps the result a character
# vector regardless of the input size.
cleaned_titles <- vapply(corpus, as.character, character(1))
all_titles <- paste(cleaned_titles, collapse = " ")

# Split on runs of whitespace. A leading or trailing space in the joined text
# yields an empty token, which must not be counted as a word.
words <- unlist(str_split(all_titles, "\\s+"))
words <- words[nzchar(words)]

# table(words) names the column after the variable, so the result has the
# columns `words` and `Freq`.
word_freq <- as.data.frame(table(words))
word_freq <- word_freq %>% arrange(desc(Freq))
most_common_words <- head(word_freq, 10)
print(most_common_words)

##        words Freq
## 1      hoaks   25
## 2      simak    9
## 3        cek    8
## 4      fakta    8
## 5       bank    7
## 6     undian    7
## 7   penipuan    6
## 8    seputar    6
## 9      benar    5
## 10 berhadiah    5

# Horizontal bar chart of the ten most frequent headline words, ordered by
# frequency, on a black background with an orange gradient.
ggplot(
  data = most_common_words,
  mapping = aes(x = reorder(words, Freq), y = Freq, fill = Freq)
) +
  # Bars first, then the frequency value printed on each bar.
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = Freq), hjust = 1.5, color = "white", size = 4) +
  coord_flip() +
  scale_fill_gradient(low = "#FF4500", high = "#FF8C00") +
  labs(title = "Top 10 Words in Headline", x = "Kata", y = "Frekuensi") +
  theme_minimal() +
  theme(
    # Text elements.
    plot.title = element_text(color = "darkorange", hjust = 0.5, size = 20, face = "bold"),
    axis.title = element_text(color = "darkorange"),
    axis.text = element_text(color = "darkorange"),
    # Backgrounds.
    plot.background = element_rect(fill = "black"),
    panel.background = element_rect(fill = "black"),
    # Grid lines.
    panel.grid.major = element_line(color = "grey"),
    panel.grid.minor = element_line(color = "grey", linetype = "dashed")
  )

4.2 Wordcloud

# Fix the random seed before drawing the word cloud.
set.seed(13)

# Word cloud on a black canvas; the text colours come from a 100-step ramp
# running coral -> light blue -> dark orange.
par(bg = "black")
with(
  word_freq,
  wordcloud(
    words = words, freq = Freq,
    min.freq = 1, max.words = 100,
    random.order = FALSE, rot.per = 0.35,
    colors = colorRampPalette(c("#FF7F50", "lightblue", "#FF8C00"))(100)
  )
)

# Show the ten most frequent words again alongside the cloud.
most_common_words <- word_freq %>% head(10)
print(most_common_words)

##        words Freq
## 1      hoaks   25
## 2      simak    9
## 3        cek    8
## 4      fakta    8
## 5       bank    7
## 6     undian    7
## 7   penipuan    6
## 8    seputar    6
## 9      benar    5
## 10 berhadiah    5

4.3 Top 10 bigrams in Headline

# Count two-word sequences (bigrams) in the headlines and keep the ten most
# frequent ones.
# NOTE(review): the n-grams are built from `data$titles`, not from the
# stop-word-filtered corpus, so pairs such as "dari bank" are included --
# confirm this is intended.
bigrams <- data %>%
  unnest_tokens(bigram, titles, token = "ngrams", n = 2) %>%
  # Headlines shorter than two words presumably yield an NA bigram; drop
  # those so NA is never counted as a phrase.
  filter(!is.na(bigram))
bigram_counts <- bigrams %>%
  count(bigram, sort = TRUE)
top_10_bigrams <- head(bigram_counts, 10)

# The former separate()/unite() round trip only rebuilt the same string (and
# would have turned an NA into the literal "NA NA"); the table is reused
# directly under the name the plotting code expects.
top_10_bigrams_separated <- top_10_bigrams
print(top_10_bigrams_separated)

## # A tibble: 10 × 2
##    bigram               n
##    <chr>            <int>
##  1 cek fakta            8
##  2 fakta tidak          5
##  3 tidak benar          5
##  4 dari bank            4
##  5 hoaks seputar        4
##  6 kumpulan hoaks       4
##  7 undian berhadiah     4
##  8 berhadiah dari       3
##  9 biar tak             3
## 10 hoaks yang           3

# Horizontal bar chart of the ten most frequent bigrams, ordered by count,
# on a black background with a blue gradient.
ggplot(
  data = top_10_bigrams_separated,
  mapping = aes(x = reorder(bigram, n), y = n, fill = n)
) +
  # Bars first, then the count printed on each bar.
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = n), hjust = 1.5, color = "black", size = 4) +
  coord_flip() +
  # Gradient from light blue (low counts) to blue (high counts).
  scale_fill_gradient(low = "#ADD8E6", high = "#0000FF") +
  labs(title = "Top 10 Bigrams in Headline", x = "Bigram", y = "Frekuensi") +
  theme_minimal() +
  theme(
    # Text elements.
    plot.title = element_text(color = "lightblue", hjust = 0.5, size = 20, face = "bold"),
    axis.title = element_text(color = "lightblue"),
    axis.text = element_text(color = "lightblue"),
    # Backgrounds.
    plot.background = element_rect(fill = "black"),
    panel.background = element_rect(fill = "black"),
    # Grid lines.
    panel.grid.major = element_line(color = "grey"),
    panel.grid.minor = element_line(color = "grey", linetype = "dashed")
  )

4.4 Top 10 Trigrams in Headline

# Count three-word sequences (trigrams) in the headlines and keep the ten
# most frequent ones.
trigrams <- data %>%
  unnest_tokens(trigram, titles, token = "ngrams", n = 3) %>%
  # Headlines shorter than three words presumably yield an NA trigram; drop
  # those so NA is never counted as a phrase.
  filter(!is.na(trigram))
trigram_counts <- trigrams %>%
  count(trigram, sort = TRUE)
top_10_trigrams <- head(trigram_counts, 10)

# The former separate()/unite() round trip only rebuilt the same string (and
# would have turned an NA into the literal "NA NA NA"); the table is reused
# directly under the name the plotting code expects.
top_10_trigrams_separated <- top_10_trigrams

# Horizontal bar chart of the ten most frequent trigrams, ordered by count,
# on a black background with a green gradient.
ggplot(
  data = top_10_trigrams_separated,
  mapping = aes(x = reorder(trigram, n), y = n, fill = n)
) +
  # Bars first, then the count printed on each bar.
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = n), hjust = 1.5, color = "black", size = 4) +
  coord_flip() +
  # Gradient from pale green (low counts) to dark green (high counts).
  scale_fill_gradient(low = "#98FB98", high = "#006400") +
  labs(title = "Top 10 Trigrams in Headline", x = "Trigram", y = "Frekuensi") +
  theme_minimal() +
  theme(
    # Text elements.
    plot.title = element_text(color = "lightgreen", hjust = 0.5, size = 20, face = "bold"),
    axis.title = element_text(color = "lightgreen"),
    axis.text = element_text(color = "lightgreen"),
    # Backgrounds.
    plot.background = element_rect(fill = "black"),
    panel.background = element_rect(fill = "black"),
    # Grid lines.
    panel.grid.major = element_line(color = "grey"),
    panel.grid.minor = element_line(color = "grey", linetype = "dashed")
  )

4.5 Analisis Sentimen

# Score every headline with the "syuzhet" method.
# NOTE(review): the rendered output classifies 40 of 42 headlines as neutral;
# presumably the lexicon does not cover Indonesian words -- confirm before
# relying on these scores.
data <- data %>%
  mutate(Sentiment = get_sentiment(titles, method = "syuzhet"))

# Map the numeric score to a label: below zero is negative, above zero is
# positive, anything else is neutral.
data$Sentiment_Class <- case_when(
  data$Sentiment < 0 ~ "Negatif",
  data$Sentiment > 0 ~ "Positif",
  TRUE ~ "Netral"
)

# Write the scored data to a new CSV file in the current working directory.
output_path <- "sentiment_analysis_results.csv"
write_csv(data, output_path)

# Number of headlines in each sentiment class.
sentiment_count <- count(data, Sentiment_Class)
sentiment_count

## # A tibble: 2 × 2
##   Sentiment_Class     n
##   <chr>           <int>
## 1 Netral             40
## 2 Positif             2

# Pie chart of the sentiment classes with the raw count inside each slice and
# a "class: percent" label connected to it from outside.

# Share of each class in percent of all headlines.
sentiment_count$percentage <- (sentiment_count$n / sum(sentiment_count$n)) * 100

# ypos is the cumulative percentage minus half of the slice, i.e. the
# mid-point of each slice when the classes are taken in descending name order.
sentiment_count <- sentiment_count %>%
  arrange(desc(Sentiment_Class)) %>%
  mutate(ypos = cumsum(percentage) - 0.5 * percentage)
ggplot(sentiment_count, aes(x = "", y = percentage, fill = Sentiment_Class)) +
  geom_bar(stat = "identity", width = 1, color = "black") +  # black border around each slice
  coord_polar(theta = "y") +
  theme_void() +
  geom_text(aes(label = n), 
            position = position_stack(vjust = 0.5), color = "white", size = 5) +
  geom_label(aes(x = 1.7, y = ypos, label = paste0(Sentiment_Class, ": ", round(percentage, 1), "%")), 
             color = "black", fill = "white", label.size = 0.25, hjust = 0) +  
  geom_segment(aes(x = 1.1, xend = 1.7, y = ypos, yend = ypos), color = "black") +  
  # The classification step can produce "Negatif" as well, so the palette
  # needs an entry for it; otherwise that slice would have no assigned colour.
  scale_fill_manual(values = c(
    "Positif" = "#CD6600",
    "Netral" = "#FF8C00",
    "Negatif" = "#8B4500"
  )) +
  theme(legend.position = "none")

4.6 Trends of News Headlines over Time

# Parse the month/day/year strings into Date values.
data <- data %>%
  mutate(dates = as.Date(dates, format = '%m/%d/%Y'))

# Number of headlines published on each date.
news_count_per_day <- table(data$dates)

# Two-column data frame (date, count) used by the trend chart.
df_news_count <- setNames(
  as.data.frame(news_count_per_day),
  c('Date', 'Count')
)

df_news_count

##          Date Count
## 1  2024-06-04     4
## 2  2024-06-05     3
## 3  2024-06-06     4
## 4  2024-06-07     5
## 5  2024-06-08     3
## 6  2024-06-09     5
## 7  2024-06-10     2
## 8  2024-06-11     7
## 9  2024-06-12     3
## 10 2024-06-14     6

# Line chart of the number of headlines per day on a black background.

# Bin the counts into five intervals; the bin index drives the line colour.
df_news_count$Color <- as.numeric(cut(df_news_count$Count, breaks = 5))
gradient_palette <- scale_color_gradient(low = "lightyellow", high = "orange")
ggplot(df_news_count, aes(x = Date, y = Count, group = 1, color = Color)) +
  # `linewidth` replaces the `size` aesthetic deprecated for lines in
  # ggplot2 3.4.0.
  geom_line(linewidth = 1.2) +
  geom_point(color = '#CD6600', size = 3) +
  # Count printed just above each point.
  geom_text(aes(label = Count), vjust = -1.2, color = 'darkorange') +
  gradient_palette +
  labs(title = 'Trends of News Headlines over Time', x = 'Tanggal', y = 'Jumlah Artikel') +
  theme_minimal() +
  theme(
    plot.background = element_rect(fill = "black", color = NA),
    panel.background = element_rect(fill = "black", color = NA),
    panel.grid.major = element_line(color = "black"),
    panel.grid.minor = element_blank(),
    plot.title = element_text(hjust = 0.5, color = "orange", size = 18, face = "bold"),
    axis.title.x = element_text(color = "lightyellow", size = 10),
    axis.title.y = element_text(color = "lightyellow", size = 10),
    axis.text.x = element_text(angle = 45, hjust = 1, color = "orange"),
    axis.text.y = element_text(color = "orange"),
    plot.margin = margin(t = 10, r = 10, b = 30, l = 30),
    # Frame around the plotting area; `linewidth` replaces the deprecated
    # `size` argument of element_rect().
    panel.border = element_rect(color = "lightyellow", fill = NA, linewidth = 1.5),
    legend.position = "none"
  ) +
  # Limits follow the data instead of a fixed 2..7 range, so no point is
  # dropped when the counts change; the expansion leaves room for the label
  # above the highest point.
  scale_y_continuous(
    limits = range(df_news_count$Count),
    expand = expansion(mult = c(0.09, 0.15))
  )

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Scraping Liputan6: Cek Fakta

Dwi Fitrianti (G1501231013)

2024-06-14

1 Library

2 Memasukkan Data

3 Mengatasi Duplikasi Data

4 Visualisasi Data

4.1 Top 10 Words in Headline

4.2 Wordcloud

4.3 Top 10 bigrams in Headline

4.4 Top 10 Trigrams in Headline

4.5 Analisis Sentimen

4.6 Trends of News Headlines over Time