DMBG #T4

Libraries

# Load required libraries
library(rvest)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.4.3
library(ggplot2)

Step 1: Get article URLs from Kompas Money

get_finansial_articles <- function(page_num = 1, max_articles = 100) {
  if (page_num == 1) {
    url <- "https://indeks.kompas.com/?source=navbar&site=money"
  } else {
    url <- paste0("https://indeks.kompas.com/?source=navbar&site=money&page=", page_num)
  }
  
  cat("Accessing URL:", url, "\n")
  
  tryCatch({
    page <- read_html(url)
    
    # Collect all <a> tags and their hrefs, keeping the two vectors aligned
    links <- html_nodes(page, "a")
    hrefs <- html_attr(links, "href")
    keep <- !is.na(hrefs)
    links <- links[keep]
    hrefs <- hrefs[keep]
    
    # Keep only Kompas Money article links
    is_article <- grepl("^https://money\\.kompas\\.com/read", hrefs)
    article_links <- hrefs[is_article]
    article_texts <- html_text(links[is_article], trim = TRUE)
    
    # Drop duplicate URLs together with their matching titles
    dedup <- !duplicated(article_links)
    article_links <- article_links[dedup]
    article_texts <- article_texts[dedup]
    
    # Fall back to a generic title if the anchor text is empty
    article_titles <- ifelse(article_texts == "" | is.na(article_texts),
                             paste("Article", seq_along(article_links)),
                             article_texts)
    
    # Cap the number of articles
    n <- min(length(article_links), max_articles)
    
    df <- data.frame(
      url = article_links[1:n],
      title = article_titles[1:n],
      stringsAsFactors = FALSE
    )
    
    cat("Successfully extracted", nrow(df), "article URLs\n")
    return(df)
    
  }, error = function(e) {
    cat("Error:", e$message, "\n")
    return(data.frame(url = character(0), title = character(0), stringsAsFactors = FALSE))
  })
}
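A quick usage sketch (network access is required, and the number of rows returned depends on the live index page at crawl time):

preview <- get_finansial_articles(page_num = 1, max_articles = 5)
head(preview$title)
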
library(knitr)

komparasi <- data.frame(
  Component = c(
    "HTML selector",
    "Title extraction",
    "Page structure",
    "URL pattern",
    "Title fallback",
    "URL normalization"
  ),
  Kompas = c(
    '`html_nodes("a")` then filter the URLs',
    'Text taken directly from `<a>`',
    'More consistent',
    '`money.kompas.com/read`',
    '`paste("Article", i)` when empty',
    'Not needed (already absolute)'
  ),
  Detik = c(
    '`html_nodes("article")`, check `<a>` & title',
    'Check `<a>`, then `<h2>`, `.title`, etc.',
    'Needs fallback selectors (`.media__title`, etc.)',
    '`finance.detik.com/...`',
    'Same, but with an `html_text()` fallback from other elements',
    'Needed (sometimes relative, e.g. `/read/...`)'
  )
)

kable(komparasi, format = "html", escape = FALSE, caption = "Comparison of Article Extraction: Kompas vs Detik")
Comparison of Article Extraction: Kompas vs Detik

| Component         | Kompas                               | Detik                                                      |
|-------------------|--------------------------------------|------------------------------------------------------------|
| HTML selector     | html_nodes("a") then filter the URLs | html_nodes("article"), check <a> & title                   |
| Title extraction  | Text taken directly from <a>         | Check <a>, then <h2>, .title, etc.                         |
| Page structure    | More consistent                      | Needs fallback selectors (.media__title, etc.)             |
| URL pattern       | money.kompas.com/read                | finance.detik.com/...                                      |
| Title fallback    | paste("Article", i) when empty       | Same, but with an html_text() fallback from other elements |
| URL normalization | Not needed (already absolute)        | Needed (sometimes relative, e.g. /read/...)                |

The main simplification in the Kompas function follows from its more consistent page structure: article links already use a fixed domain and path pattern (money.kompas.com/read), so the complex selectors of the Detik version are unnecessary, as in the sketch below.
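The hrefs here are made up for illustration (no network access needed); only Kompas Money article URLs survive the filter, and duplicates collapse to one entry:

hrefs <- c(
  "https://money.kompas.com/read/2025/04/15/000000000/contoh-artikel",
  "https://www.kompas.com/tag/ekonomi",
  "https://money.kompas.com/read/2025/04/15/000000000/contoh-artikel"
)
unique(hrefs[grepl("^https://money\\.kompas\\.com/read", hrefs)])
## [1] "https://money.kompas.com/read/2025/04/15/000000000/contoh-artikel"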

Step 2: Function to extract article content from Kompas Money

extract_article_content <- function(url) {
  cat("Crawling article:", url, "\n")
  
  tryCatch({
    page <- read_html(url)
    
    # Helper: try multiple selectors
    try_selectors <- function(selectors, default = NA, type = "text", attr = NULL) {
      for (selector in selectors) {
        element <- page %>% html_node(selector)
        
        if (!is.null(element) && !inherits(element, "xml_missing")) {
          if (type == "text") {
            content <- element %>% html_text(trim = TRUE)
            if (!is.na(content) && content != "") return(content)
          } else if (type == "attr" && !is.null(attr)) {
            content <- element %>% html_attr(attr)
            if (!is.na(content) && content != "") return(content)
          } else if (type == "nodes") {
            nodes <- page %>% html_nodes(selector)
            if (length(nodes) > 0) {
              content <- nodes %>% html_text(trim = TRUE) %>% paste(collapse = "\n\n")
              if (!is.na(content) && content != "") return(content)
            }
          }
        }
      }
      return(default)
    }
    
    # ==== Extract each component ====
    
    # Title
    title <- try_selectors(c("h1.read__title", "h1", ".article__title"), "No Title Found")
    
    # Author
    author <- try_selectors(c("div.credit-title-name"), "Unknown Author")
    
    # Date
    date_raw <- try_selectors(c(".read__time", ".article__date", ".date"), NA)
    date_clean <- str_extract(date_raw, "\\d{1,2}/\\d{1,2}/\\d{4}")
    parsed_date <- tryCatch({
      as.Date(date_clean, format = "%d/%m/%Y")
    }, error = function(e) Sys.Date())
    
    if (is.na(parsed_date)) {
      date_match <- str_extract(date_raw, "\\d{4}-\\d{2}-\\d{2}")
      if (!is.na(date_match)) {
        parsed_date <- as.Date(date_match)
      } else {
        parsed_date <- Sys.Date()
      }
    }
    
    # Content
    content <- try_selectors(c(".read__content p", ".read__content", "article p", ".article__body"), "No content found", "nodes")
    if (content == "No content found") {
      content <- try_selectors(c(".read__content", ".article-content", "article"), "No content found")
    }
    
    # Image
    image_url <- try_selectors(c(".photo img", ".photo__wrap img", "figure img", "img"), NA, "attr", "src")
    
    # Category (from the breadcrumb; a meta tag stores its value in its
    # "content" attribute, so it needs attribute extraction rather than text)
    category <- try_selectors(c(".breadcrumb__item a", ".breadcrumb a"), NA)
    if (is.na(category)) {
      category <- try_selectors(c("meta[property='article:section']"), "Money", "attr", "content")
    }
    
    # Tags (usually not available, but check anyway)
    tags <- try_selectors(c(".tag a", ".tags a", ".read__tag a"), "", "nodes")
    
    # Return as data frame row
    return(data.frame(
      url = url,
      title = title,
      author = author,
      date = parsed_date,
      publish_date_raw = date_raw,
      content = content,
      image_url = image_url,
      category = category,
      tags = tags,
      crawl_time = Sys.time(),
      stringsAsFactors = FALSE
    ))
    
  }, error = function(e) {
    cat("Error extracting content from", url, ":", e$message, "\n")
    return(data.frame(
      url = url,
      title = "Error extracting content",
      author = NA,
      date = Sys.Date(),
      publish_date_raw = NA,
      content = paste("Error:", e$message),
      image_url = NA,
      category = NA,
      tags = NA,
      crawl_time = Sys.time(),
      stringsAsFactors = FALSE
    ))
  })
}
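For a quick single-article sanity check, the function can be pointed at one of the URLs crawled later in this report (network access required, and the site's markup may change over time):

one_article <- extract_article_content(
  "https://money.kompas.com/read/2025/04/15/095745726/pembuatan-npwp-online-berapa-lama"
)
one_article[, c("title", "author", "date", "category")]
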
knitr::kable(
  data.frame(
    Component = c(
      "Page structure", "Title", "Author", "Publication date",
      "Article content", "Image", "Category", "Tags", "Date parsing", "Error handling"
    ),
    Kompas = c(
      "Consistent; article links follow the fixed `money.kompas.com/read` pattern",
      "`h1.read__title`, `h1`, `.article__title`",
      "`div.credit-title-name`",
      "`.read__time`, `.article__date`, or `.date` → parsed as `%d/%m/%Y`",
      "`.read__content p`, `article p`, `.article__body`, fallback to `.read__content`",
      "`.photo img`, `figure img`, `img`",
      "`.breadcrumb__item a`, `meta[property='article:section']`",
      "`.read__tag a`, `.tags a`",
      "On format failure, fall back to a regex for other date patterns or `Sys.Date()`",
      "Returns a default `data.frame` carrying the error message on failure"
    ),
    Detik = c(
      "More complex; needs many fallbacks and selector combinations",
      "Many selector variants: `h1.detail__title`, `.article-title h1`, `h1`, etc.",
      "`span.author`, `.byline`, `.detail__author`, etc.",
      "More date variants and formats (up to 5 parsing attempts)",
      "`.detail__body-text p`, `article-content p`, fallback to the full content area",
      "`.detail__media-image img`, `figure img`, `article img`",
      "`.breadcrumb__item`, `.nav__item--active`, `breadcrumb a`",
      "`.detail__body-tag a`, `.tags a`",
      "Tries several formats (`%d %b %Y`, `%d/%m/%Y`) with a more complex regex fallback",
      "Same, but with stricter, more specific date parsing"
    ),
    check.names = FALSE
  ),
  caption = "Comparison of the `extract_article_content` Function for Kompas and Detik",
  format = "html",
  escape = FALSE
)
Comparison of the extract_article_content Function for Kompas and Detik

| Component        | Kompas                                                                    | Detik                                                                  |
|------------------|---------------------------------------------------------------------------|------------------------------------------------------------------------|
| Page structure   | Consistent; article links follow the fixed money.kompas.com/read pattern  | More complex; needs many fallbacks and selector combinations          |
| Title            | h1.read__title, h1, .article__title                                       | Many selector variants: h1.detail__title, .article-title h1, h1, etc. |
| Author           | div.credit-title-name                                                     | span.author, .byline, .detail__author, etc.                           |
| Publication date | .read__time, .article__date, or .date → parsed as %d/%m/%Y                | More date variants and formats (up to 5 parsing attempts)             |
| Article content  | .read__content p, article p, .article__body, fallback to .read__content  | .detail__body-text p, article-content p, fallback to full content area |
| Image            | .photo img, figure img, img                                               | .detail__media-image img, figure img, article img                     |
| Category         | .breadcrumb__item a, meta[property='article:section']                     | .breadcrumb__item, .nav__item--active, breadcrumb a                   |
| Tags             | .read__tag a, .tags a                                                     | .detail__body-tag a, .tags a                                          |
| Date parsing     | On format failure, fall back to a regex for other patterns or Sys.Date()  | Tries several formats (%d %b %Y, %d/%m/%Y) with a more complex regex  |
| Error handling   | Returns a default data.frame carrying the error message on failure        | Same, but with stricter, more specific date parsing                   |
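To make the Kompas date-parsing chain concrete, here is a small standalone sketch; parse_kompas_date is a hypothetical helper mirroring the logic inside extract_article_content, and the timestamp strings are made up:

library(stringr)

parse_kompas_date <- function(date_raw) {
  # First try the dd/mm/yyyy pattern typical of Kompas timestamps
  parsed <- as.Date(str_extract(date_raw, "\\d{1,2}/\\d{1,2}/\\d{4}"), format = "%d/%m/%Y")
  if (is.na(parsed)) {
    # Fall back to an ISO yyyy-mm-dd pattern, then to today's date
    iso <- str_extract(date_raw, "\\d{4}-\\d{2}-\\d{2}")
    parsed <- if (!is.na(iso)) as.Date(iso) else Sys.Date()
  }
  parsed
}

parse_kompas_date("Kompas.com - 15/04/2025, 11:00 WIB")
## [1] "2025-04-15"
parse_kompas_date("2025-04-15T11:00:00+07:00")
## [1] "2025-04-15"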

Step 3: Function to analyze financial news articles

analyze_financial_news <- function(articles_df) {
  # Ensure we have articles to analyze
  if (nrow(articles_df) == 0) {
    cat("No articles to analyze\n")
    return(NULL)
  }
  
  # Create analysis result structure
  analysis_results <- list()
  
  # 1. Basic statistics
  analysis_results$basic_stats <- list(
    total_articles = nrow(articles_df),
    date_range = range(articles_df$date, na.rm = TRUE),
    unique_authors = length(unique(articles_df$author[!is.na(articles_df$author)])),
    avg_content_length = mean(nchar(articles_df$content), na.rm = TRUE)
  )
  
  # 2. Time series analysis
  if ("date" %in% names(articles_df) && any(!is.na(articles_df$date))) {
    # Count articles by date
    articles_by_date <- articles_df %>%
      group_by(date) %>%
      summarise(count = n()) %>%
      arrange(date)
    
    analysis_results$time_series <- articles_by_date
  }
  
  # 3. Author analysis
  if ("author" %in% names(articles_df) && any(!is.na(articles_df$author))) {
    # Count articles by author
    articles_by_author <- articles_df %>%
      filter(!is.na(author) & author != "Unknown Author") %>%
      group_by(author) %>%
      summarise(
        count = n(),
        avg_length = mean(nchar(content), na.rm = TRUE)
      ) %>%
      arrange(desc(count))
    
    analysis_results$author_stats <- head(articles_by_author, 10)
  }
  
  # 4. Text analysis - requires tidytext
  if ("content" %in% names(articles_df) && any(!is.na(articles_df$content))) {
    # Define Indonesian stopwords
    id_stopwords <- c("yang", "dan", "di", "dengan", "untuk", "pada", "ini", "dari", "dalam", "akan",
                      "tidak", "juga", "ke", "atau", "tersebut", "bisa", "oleh", "ada", "itu", "jika",
                      "telah", "sudah", "seperti", "karena", "hanya", "bahwa", "saat")
    
    # Tokenize content and remove stopwords
    word_counts <- articles_df %>%
      unnest_tokens(word, content) %>%
      filter(
        !word %in% id_stopwords,
        nchar(word) > 2,
        !grepl("^\\d+$", word)  # Remove numbers
      ) %>%
      count(word, sort = TRUE)
    
    analysis_results$top_words <- head(word_counts, 20)
    
    # Find common bigrams
    bigrams <- articles_df %>%
      unnest_tokens(bigram, content, token = "ngrams", n = 2) %>%
      filter(!is.na(bigram)) %>%
      count(bigram, sort = TRUE)
    
    analysis_results$top_bigrams <- head(bigrams, 20)
    
    # Sentiment analysis (if you have a sentiment lexicon for Indonesian)
    # This is a placeholder - you would need an actual sentiment lexicon
    # analysis_results$sentiment <- sentiment_analysis_results
  }
  
  # 5. Tag/Category analysis
  if ("tags" %in% names(articles_df) && any(!is.na(articles_df$tags))) {
    # Tags are joined with "\n\n" in extract_article_content, so split on that
    all_tags <- unlist(strsplit(articles_df$tags[!is.na(articles_df$tags)], "\n\n", fixed = TRUE))
    all_tags <- all_tags[all_tags != ""]
    tag_counts <- table(all_tags)
    tag_df <- data.frame(
      tag = names(tag_counts),
      count = as.numeric(tag_counts),
      stringsAsFactors = FALSE
    ) %>%
      arrange(desc(count))
    
    analysis_results$tag_stats <- head(tag_df, 15)
  }
  
  # 6. Entity recognition (simplified)
  # Look for monetary values
  if ("content" %in% names(articles_df) && any(!is.na(articles_df$content))) {
    # Pattern for Indonesian Rupiah amounts
    rupiah_pattern <- "Rp\\s*\\d+[\\.,]?\\d*\\s*(?:ribu|juta|miliar|triliun)?"
    all_content <- paste(articles_df$content, collapse = " ")
    rupiah_values <- str_extract_all(all_content, rupiah_pattern)[[1]]
    
    analysis_results$rupiah_mentions <- head(table(rupiah_values), 10)
    
    # Look for company names (simplified approach)
    company_keywords <- c("PT", "Tbk", "Persero", "Bank", "Perusahaan", "Grup")
    company_pattern <- paste0("(?:[A-Z][a-z]+\\s+)+(?:", paste(company_keywords, collapse = "|"), ")")
    company_mentions <- str_extract_all(all_content, company_pattern)[[1]]
    
    analysis_results$company_mentions <- head(table(company_mentions), 10)
  }
  
  return(analysis_results)
}
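To illustrate the simplified entity patterns above without crawling, they can be run on a single made-up sentence (sample_text is illustrative only):

library(stringr)

sample_text <- "PT Contoh Sejahtera Tbk membukukan laba Rp 1,5 triliun tahun lalu."

# Monetary values: "Rp" followed by digits and an optional magnitude word
str_extract_all(sample_text, "Rp\\s*\\d+[\\.,]?\\d*\\s*(?:ribu|juta|miliar|triliun)?")[[1]]
## [1] "Rp 1,5 triliun"

# Company-like names: capitalized words ending in a corporate keyword
# (the pattern skips all-caps tokens such as "PT" itself)
str_extract_all(sample_text, "(?:[A-Z][a-z]+\\s+)+(?:PT|Tbk|Persero|Bank|Perusahaan|Grup)")[[1]]
## [1] "Contoh Sejahtera Tbk"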

Step 4: Function to visualize analysis results

visualize_analysis <- function(analysis_results, articles_df) {
  # List to store visualizations
  visualizations <- list()
  
  # 1. Articles over time
  if (!is.null(analysis_results$time_series) && nrow(analysis_results$time_series) > 1) {
    time_plot <- ggplot(analysis_results$time_series, aes(x = date, y = count)) +
      geom_line() +
      geom_point() +
      labs(title = "Articles Published Over Time",
           x = "Date",
           y = "Number of Articles") +
      theme_minimal()
    
    visualizations$time_series <- time_plot
  }
  
  # 2. Top words
  if (!is.null(analysis_results$top_words) && nrow(analysis_results$top_words) > 0) {
    word_plot <- ggplot(head(analysis_results$top_words, 15), aes(x = reorder(word, n), y = n)) +
      geom_col() +
      coord_flip() +
      labs(title = "Most Frequent Words in Financial News",
           x = NULL,
           y = "Count") +
      theme_minimal()
    
    visualizations$word_frequency <- word_plot
  }
  
  # 3. Top authors
  if (!is.null(analysis_results$author_stats) && nrow(analysis_results$author_stats) > 0) {
    author_plot <- ggplot(head(analysis_results$author_stats, 10), aes(x = reorder(author, count), y = count)) +
      geom_col() +
      coord_flip() +
      labs(title = "Top News Authors",
           x = NULL,
           y = "Number of Articles") +
      theme_minimal()
    
    visualizations$author_counts <- author_plot
  }
  
  # 4. Tag cloud (simplified as bar chart)
  if (!is.null(analysis_results$tag_stats) && nrow(analysis_results$tag_stats) > 0) {
    tag_plot <- ggplot(head(analysis_results$tag_stats, 10), aes(x = reorder(tag, count), y = count)) +
      geom_col() +
      coord_flip() +
      labs(title = "Most Common Article Tags",
           x = NULL,
           y = "Count") +
      theme_minimal()
    
    visualizations$tag_counts <- tag_plot
  }
  
  return(visualizations)
}

Main execution function

crawl_and_analyze_financial_news <- function(max_pages = 3, max_articles_per_page = 20, delay = 2) {
  # Step 1: Collect article links
  cat("==== COLLECTING ARTICLE LINKS ====\n")
  all_articles <- data.frame(url = character(), title = character(), stringsAsFactors = FALSE)
  
  for (page in 1:max_pages) {
    cat("\n----- Processing page", page, "-----\n")
    page_articles <- get_finansial_articles(page, max_articles_per_page)
    
    if (nrow(page_articles) > 0) {
      # Remove duplicates based on URL
      all_articles <- rbind(all_articles, page_articles)
      all_articles <- all_articles[!duplicated(all_articles$url), ]
      
      cat("Total unique articles collected so far:", nrow(all_articles), "\n")
      
      # Add delay between requests
      if (page < max_pages) {
        cat("Waiting", delay, "seconds before next page...\n")
        Sys.sleep(delay)
      }
    } else {
      cat("No articles found on this page or reached the end\n")
      break
    }
  }
  
  if (nrow(all_articles) == 0) {
    cat("No articles were found. Exiting.\n")
    return(NULL)
  }
  
  # Step 2: Crawl each article
  cat("\n==== CRAWLING INDIVIDUAL ARTICLES ====\n")
  articles_content <- data.frame()
  
  for (i in 1:nrow(all_articles)) {
    cat("\nProcessing article", i, "of", nrow(all_articles), "\n")
    article_data <- extract_article_content(all_articles$url[i])
    
    if (!is.null(article_data) && nrow(article_data) > 0) {
      articles_content <- rbind(articles_content, article_data)
    }
    
    # Add delay between requests
    if (i < nrow(all_articles)) {
      cat("Waiting", delay, "seconds before next article...\n")
      Sys.sleep(delay)
    }
  }
  
  if (nrow(articles_content) == 0) {
    cat("Failed to extract content from any articles. Exiting.\n")
    return(NULL)
  }
  
  # Save the raw data
  timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
  csv_filename <- paste0("kompas_finansial_articles_", timestamp, ".csv")
  write.csv(articles_content, csv_filename, row.names = FALSE)
  cat("\nRaw article data saved to", csv_filename, "\n")
  
  # Step 3: Analyze the articles
  cat("\n==== ANALYZING ARTICLES ====\n")
  analysis_results <- analyze_financial_news(articles_content)
  
  # Step 4: Visualize the analysis
  cat("\n==== CREATING VISUALIZATIONS ====\n")
  visualizations <- visualize_analysis(analysis_results, articles_content)
  
  # Print text-based analysis results
  cat("\n==== ANALYSIS RESULTS ====\n")
  cat("\nBasic Statistics:\n")
  cat("Total articles:", analysis_results$basic_stats$total_articles, "\n")
  cat("Date range:", format(analysis_results$basic_stats$date_range[1]), "to", 
      format(analysis_results$basic_stats$date_range[2]), "\n")
  cat("Unique authors:", analysis_results$basic_stats$unique_authors, "\n")
  cat("Average content length:", round(analysis_results$basic_stats$avg_content_length), "characters\n")
  
  cat("\nTop 10 Words:\n")
  print(head(analysis_results$top_words, 10))
  
  cat("\nTop 5 Authors:\n")
  print(head(analysis_results$author_stats, 5))
  
  cat("\nTop 5 Tags:\n")
  print(head(analysis_results$tag_stats, 5))
  
  # Save plots if available
  if (length(visualizations) > 0) {
    # gridExtra is loaded in case plots need arranging on a grid; here each
    # plot is simply printed on its own page
    if (require(gridExtra)) {
      pdf_filename <- paste0("kompas_finansial_analysis_", timestamp, ".pdf")
      pdf(pdf_filename, width = 10, height = 8)
      
      # Print each plot on its own page
      for (plot in visualizations) {
        print(plot)
      }
      
      dev.off()
      cat("\nVisualizations saved to", pdf_filename, "\n")
    } else {
      cat("\nCould not save visualizations. The gridExtra package is required.\n")
    }
  }
  
  # Return the complete results
  return(list(
    articles = articles_content,
    analysis = analysis_results,
    visualizations = visualizations
  ))
}

Execute the full workflow

cat("\n====== STARTING FINANCIAL NEWS CRAWLING AND ANALYSIS ======\n")
## 
## ====== STARTING FINANCIAL NEWS CRAWLING AND ANALYSIS ======
result <- crawl_and_analyze_financial_news(max_pages = 2, max_articles_per_page = 10, delay = 3)
## ==== COLLECTING ARTICLE LINKS ====
## 
## ----- Processing page 1 -----
## Accessing URL: https://indeks.kompas.com/?source=navbar&site=money 
## Successfully extracted 10 article URLs
## Total unique articles collected so far: 10 
## Waiting 3 seconds before next page...
## 
## ----- Processing page 2 -----
## Accessing URL: https://indeks.kompas.com/?source=navbar&site=money&page=2 
## Successfully extracted 10 article URLs
## Total unique articles collected so far: 20 
## 
## ==== CRAWLING INDIVIDUAL ARTICLES ====
## 
## Processing article 1 of 20 
## Crawling article: https://money.kompas.com/read/2025/04/15/110000926/lp3es--pembebasan-kuota-impor-bisa-jadi-alternatif-asal-dibarengi-tarif- 
## Waiting 3 seconds before next article...
## 
## Processing article 2 of 20 
## Crawling article: https://money.kompas.com/read/2025/04/15/104439826/viral-video-parkir-rp-60000-di-pasar-tanah-abang-ylki-meresahkan-pembeli-bisa 
## Waiting 3 seconds before next article...
## 
## Processing article 3 of 20 
## Crawling article: https://money.kompas.com/read/2025/04/15/103521926/danantara-dan-badan-investasi-qatar-kelola-dana-bersama-rp-672-triliun 
## Waiting 3 seconds before next article...
## 
## Processing article 4 of 20 
## Crawling article: https://money.kompas.com/read/2025/04/15/103232126/saham-fore-ara-lagi-usai-resmi-melantai-di-bursa-efek 
## Waiting 3 seconds before next article...
## 
## Processing article 5 of 20 
## Crawling article: https://money.kompas.com/read/2025/04/15/102411326/kata-ibu-rumah-tangga-soal-tutupnya-tupperware-ada-yang-lebih-murah 
## Waiting 3 seconds before next article...
## 
## Processing article 6 of 20 
## Crawling article: https://money.kompas.com/read/2025/04/15/101819826/harga-emas-terbaru-hari-ini-15-april-2025-di-pegadaian 
## Waiting 3 seconds before next article...
## 
## Processing article 7 of 20 
## Crawling article: https://money.kompas.com/read/2025/04/15/101412726/mdla-resmi-melantai-di-bursa-himpun-dana-rp-685-miliar-lewat-ipo 
## Waiting 3 seconds before next article...
## 
## Processing article 8 of 20 
## Crawling article: https://money.kompas.com/read/2025/04/15/101409826/simak-daftar-kurs-rupiah-di-5-bank-besar-indonesia 
## Waiting 3 seconds before next article...
## 
## Processing article 9 of 20 
## Crawling article: https://money.kompas.com/read/2025/04/15/100000326/plts-terapung-tembesi-di-batam-dapat-suntikan-dana-siap-masuk-tahap-konstruksi 
## Waiting 3 seconds before next article...
## 
## Processing article 10 of 20 
## Crawling article: https://money.kompas.com/read/2025/04/15/095745726/pembuatan-npwp-online-berapa-lama 
## Waiting 3 seconds before next article...
## 
## Processing article 11 of 20 
## Crawling article: https://money.kompas.com/read/2025/04/15/084758926/indonesia-jajaki-fta-dengan-rusia-dan-eurasia-target-selesai-semester-i-2025 
## Waiting 3 seconds before next article...
## 
## Processing article 12 of 20 
## Crawling article: https://money.kompas.com/read/2025/04/15/083541926/antam-catat-penjualan-emas-tertinggi-sepanjang-sejarah-pada-2024 
## Waiting 3 seconds before next article...
## 
## Processing article 13 of 20 
## Crawling article: https://money.kompas.com/read/2025/04/15/082213926/indonesia-siap-negosiasi-dengan-trump-bawa-strategi-apa-saja 
## Waiting 3 seconds before next article...
## 
## Processing article 14 of 20 
## Crawling article: https://money.kompas.com/read/2025/04/15/081632926/harga-emas-dunia-melemah-setelah-cetak-rekor-tertinggi 
## Waiting 3 seconds before next article...
## 
## Processing article 15 of 20 
## Crawling article: https://money.kompas.com/read/2025/04/15/081147326/ihsg-berpeluang-lanjutkan-kenaikan-simak-analisis-dan-saham-pilihan 
## Waiting 3 seconds before next article...
## 
## Processing article 16 of 20 
## Crawling article: https://money.kompas.com/read/2025/04/15/075942826/prabowo-beli-banyak-fosfat-dari-yordania-untuk-bahan-baku-pupuk-indonesia 
## Waiting 3 seconds before next article...
## 
## Processing article 17 of 20 
## Crawling article: https://money.kompas.com/read/2025/04/15/075602326/menaker-eks-pegawai-sritex-segera-kembali-bekerja-tunggu-proses-kurator-dan 
## Waiting 3 seconds before next article...
## 
## Processing article 18 of 20 
## Crawling article: https://money.kompas.com/read/2025/04/15/075126026/ipo-fore-cetak-ara-di-tengah-pasar-saham-yang-bergejolak 
## Waiting 3 seconds before next article...
## 
## Processing article 19 of 20 
## Crawling article: https://money.kompas.com/read/2025/04/15/074736326/impor-garam-industri-dihentikan-pengusaha-soroti-kualitas-dan-cuaca 
## Waiting 3 seconds before next article...
## 
## Processing article 20 of 20 
## Crawling article: https://money.kompas.com/read/2025/04/15/074107126/ramai-di-medsos-maxim-pastikan-tak-ada-pungutan-biaya-untuk-driver 
## 
## Raw article data saved to kompas_finansial_articles_20250415_111117.csv 
## 
## ==== ANALYZING ARTICLES ====
## 
## ==== CREATING VISUALIZATIONS ====
## 
## ==== ANALYSIS RESULTS ====
## 
## Basic Statistics:
## Total articles: 20 
## Date range: 2025-04-15 to 2025-04-15 
## Unique authors: 11 
## Average content length: 2280 characters
## 
## Top 10 Words:
##         word  n
## 1  indonesia 60
## 2       baca 42
## 3      harga 39
## 4      saham 34
## 5       emas 30
## 6       fore 30
## 7    jakarta 30
## 8      impor 28
## 9      pasar 27
## 10    produk 27
## 
## Top 5 Authors:
## # A tibble: 5 × 3
##   author                                                        count avg_length
##   <chr>                                                         <int>      <dbl>
## 1 "Agustinus Rangga Respati,\n                        Teuku Mu…     5      2532 
## 2 "Dian Erika Nugraheny,\n                        Teuku Muhamm…     4      2434.
## 3 "Elsa Catriana,\n                        Teuku Muhammad Vald…     2      1814 
## 4 "Nirmala Maulana Achmad,\n                        Aprillia I…     2      2552 
## 5 "Agustinus Rangga Respati,\n                        Aprillia…     1      1718 
## 
## Top 5 Tags:
##                                                                                                                      tag
## 1                                    cara buat npwp online\n\npembuatan NPWP online berapa lama\n\nberapa lama buat NPWP
## 2                                                                                            emas\n\nAntam\n\nemas Antam
## 3                                                                  Fore Coffee\n\nFore Kopi IPO\n\nFore IPO\n\nFore Kopi
## 4                                                              fore\n\nSaham Fore ARA\n\nfore ara\n\nsaham kopi fore ARA
## 5 harga emas UBS\n\nharga emas batangan\n\nharga emas\n\nharga emas galeri 24\n\nharga emas hari ini\n\nharga emas antam
##   count
## 1     1
## 2     1
## 3     1
## 4     1
## 5     1
## Loading required package: gridExtra
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## Visualizations saved to kompas_finansial_analysis_20250415_111117.pdf
cat("\n====== FINISHED FINANCIAL NEWS CRAWLING AND ANALYSIS ======\n")
## 
## ====== FINISHED FINANCIAL NEWS CRAWLING AND ANALYSIS ======