library(httr)
library(jsonlite)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidytext)
library(ggplot2)
library(tidyr)
library(stringr)



# Function to Get articles from NYT Article Search API

#' Fetch one page of results from the NYT Article Search API
#'
#' Sends a GET request to the `articlesearch.json` endpoint and converts the
#' `response$docs` element of the JSON reply into a data frame.
#'
#' @param api_key API key, sent as the `api-key` query parameter.
#' @param query Search term, sent as the `q` query parameter.
#' @param page Page number, sent as the `page` query parameter (default 0).
#' @return A data frame built from the flattened `response$docs` element,
#'   one row per returned article.
#' @details Signals an error when the HTTP status is not 200 or when the
#'   reply contains no articles.
Get_NYT_Articles <- function(api_key, query, page = 0) {
  base_url <- "https://api.nytimes.com/svc/search/v2/articlesearch.json"
  response <- GET(
    url = base_url,
    query = list(q = query, page = page, "api-key" = api_key)
  )

  # Fail fast on any non-success status so the parsing below only ever sees
  # a successful reply.
  if (status_code(response) != 200) {
    stop("Request failed with status: ", status_code(response), call. = FALSE)
  }

  # Pass the encoding explicitly so httr does not have to guess it, and keep
  # the result in a name that does not shadow httr::content().
  body_text <- content(response, as = "text", encoding = "UTF-8")
  parsed <- fromJSON(body_text, flatten = TRUE)
  docs <- parsed$response$docs

  # An empty JSON array is parsed as an empty list; also reject a result
  # that has columns but no rows.
  if (length(docs) == 0 || NROW(docs) == 0) {
    stop("No articles found", call. = FALSE)
  }

  as.data.frame(docs)
}

# Read the API key from the environment rather than committing it to source.
# Define it once, e.g. with a line `NYT_API_KEY=<your key>` in ~/.Renviron.
api_key <- Sys.getenv("NYT_API_KEY")
if (!nzchar(api_key)) {
  stop(
    "Set the NYT_API_KEY environment variable to your NYT API key",
    call. = FALSE
  )
}

# Get articles matching the search term "CUNY"
df_articles <- Get_NYT_Articles(api_key, "CUNY")

# Inspect the columns, their types and the first few values
glimpse(df_articles)
## Rows: 10
## Columns: 27
## $ abstract                <chr> "Citing questions about the integrity of the p…
## $ web_url                 <chr> "https://www.nytimes.com/2023/10/28/health/cas…
## $ snippet                 <chr> "Citing questions about the integrity of the p…
## $ lead_paragraph          <chr> "The City University of New York is pausing it…
## $ source                  <chr> "The New York Times", "The New York Times", "T…
## $ multimedia              <list> [<data.frame[74 x 19]>], [<data.frame[74 x 19…
## $ keywords                <list> [<data.frame[9 x 4]>], [<data.frame[12 x 4]>]…
## $ pub_date                <chr> "2023-10-28T13:40:45+0000", "2023-10-14T17:07:…
## $ document_type           <chr> "article", "article", "article", "article", "a…
## $ news_desk               <chr> "Science", "Science", "Metro", "Metro", "Obits…
## $ section_name            <chr> "Health", "Health", "New York", "New York", "N…
## $ type_of_material        <chr> "News", "News", "briefing", "News", "Obituary …
## $ `_id`                   <chr> "nyt://article/5ef816ec-aea9-5ca5-a929-37e7199…
## $ word_count              <int> 414, 745, 1489, 851, 1282, 945, 917, 794, 1195…
## $ uri                     <chr> "nyt://article/5ef816ec-aea9-5ca5-a929-37e7199…
## $ print_section           <chr> NA, "A", NA, "A", "B", "D", NA, NA, "MB", NA
## $ print_page              <chr> NA, "12", NA, "18", "12", "4", NA, NA, "3", NA
## $ headline.main           <chr> "CUNY Halts Investigation of Alzheimer’s Resea…
## $ headline.kicker         <chr> NA, NA, "New York Today", NA, NA, NA, NA, NA, …
## $ headline.content_kicker <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## $ headline.print_headline <chr> NA, "Report Faults Scientists Working on Alzhe…
## $ headline.name           <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## $ headline.seo            <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## $ headline.sub            <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## $ byline.original         <chr> "By Apoorva Mandavilli", "By Apoorva Mandavill…
## $ byline.person           <list> [<data.frame[1 x 8]>], [<data.frame[1 x 8]>], …
## $ byline.organization     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA

What are the most used words?

# Combine headline, snippet and lead paragraph into one text field per
# article. Missing fields are replaced with "" first: paste() would otherwise
# turn NA into the literal string "NA", which then gets counted as a word.
df_articles$text <- paste(
  coalesce(df_articles$headline.main, ""),
  coalesce(df_articles$snippet, ""),
  coalesce(df_articles$lead_paragraph, "")
)

# Replace typographic apostrophes with the plain ASCII one so contractions
# such as "it’s" match the straight-apostrophe entries in stop_words.
df_articles$text <- str_replace_all(df_articles$text, "[\u2018\u2019]", "'")

# Tokenize the text into one lower-cased word per row and drop stop words
words <- df_articles %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")

# Count the words, most frequent first
word_counts <- words %>%
  count(word, sort = TRUE)

# Display the most common words
head(word_counts, n = 10)
##           word  n
## 1          law 10
## 2   university  8
## 3         city  7
## 4         cuny  6
## 5        mayor  6
## 6       school  6
## 7        adams  5
## 8      college  5
## 9         york  5
## 10 alzheimer’s  4
# Plot the ten most frequent words as horizontal bars, longest bar on top
word_counts %>%
  head(n = 10) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col() +
  coord_flip() + # words on the vertical axis keep the labels readable
  labs(x = "Word", y = "Frequency") +
  theme_minimal()