library(httr)
library(jsonlite)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidytext)
library(ggplot2)
library(tidyr)
library(stringr)
# Fetch one page of results from the NYT Article Search API.
#
# Arguments:
#   api_key  NYT developer API key, sent as the `api-key` query parameter.
#   query    Search term, sent as the `q` query parameter.
#   page     Results page to request, sent as the `page` parameter (default 0).
#
# Returns:
#   A data frame built from the `response$docs` element of the JSON reply.
#   Nested JSON objects are flattened by fromJSON(flatten = TRUE).
#
# Errors:
#   Calls stop() when the HTTP status is not 200, or when the reply
#   contains no articles.
Get_NYT_Articles <- function(api_key, query, page = 0) {
  base_url <- "https://api.nytimes.com/svc/search/v2/articlesearch.json"
  response <- GET(
    url = base_url,
    query = list(q = query, page = page, "api-key" = api_key)
  )

  # Fail fast on any non-200 reply. call. = FALSE keeps the calling
  # expression (which may contain a literal API key) out of the error text.
  status <- status_code(response)
  if (status != 200) {
    stop("Request failed with status: ", status, call. = FALSE)
  }

  # Decode explicitly as UTF-8 so httr does not have to guess the encoding
  # (and does not emit a "No encoding supplied" message when it cannot).
  body_text <- content(response, as = "text", encoding = "UTF-8")
  docs <- fromJSON(body_text, flatten = TRUE)$response$docs

  # NROW() is 0 for NULL, for an empty list and for a zero-row data frame,
  # so one check covers every "no results" shape.
  if (NROW(docs) == 0) {
    stop("No articles found", call. = FALSE)
  }

  as.data.frame(docs)
}
# Read the NYT API key from the environment instead of hard-coding it, so the
# credential never ends up in version control or in rendered output.
# Define it outside the script, e.g. a line NYT_API_KEY=<your key> in ~/.Renviron.
# NOTE(review): a key was previously committed in this file; it should be
# treated as leaked and rotated.
api_key <- Sys.getenv("NYT_API_KEY")
if (!nzchar(api_key)) {
  stop("Set the NYT_API_KEY environment variable before running this script.",
       call. = FALSE)
}
# Get articles matching the query "CUNY"
df_articles <- Get_NYT_Articles(api_key, "CUNY")
# Show the structure (column names, types, first values) of the data frame
glimpse(df_articles)
## Rows: 10
## Columns: 27
## $ abstract <chr> "Citing questions about the integrity of the p…
## $ web_url <chr> "https://www.nytimes.com/2023/10/28/health/cas…
## $ snippet <chr> "Citing questions about the integrity of the p…
## $ lead_paragraph <chr> "The City University of New York is pausing it…
## $ source <chr> "The New York Times", "The New York Times", "T…
## $ multimedia <list> [<data.frame[74 x 19]>], [<data.frame[74 x 19…
## $ keywords <list> [<data.frame[9 x 4]>], [<data.frame[12 x 4]>]…
## $ pub_date <chr> "2023-10-28T13:40:45+0000", "2023-10-14T17:07:…
## $ document_type <chr> "article", "article", "article", "article", "a…
## $ news_desk <chr> "Science", "Science", "Metro", "Metro", "Obits…
## $ section_name <chr> "Health", "Health", "New York", "New York", "N…
## $ type_of_material <chr> "News", "News", "briefing", "News", "Obituary …
## $ `_id` <chr> "nyt://article/5ef816ec-aea9-5ca5-a929-37e7199…
## $ word_count <int> 414, 745, 1489, 851, 1282, 945, 917, 794, 1195…
## $ uri <chr> "nyt://article/5ef816ec-aea9-5ca5-a929-37e7199…
## $ print_section <chr> NA, "A", NA, "A", "B", "D", NA, NA, "MB", NA
## $ print_page <chr> NA, "12", NA, "18", "12", "4", NA, NA, "3", NA
## $ headline.main <chr> "CUNY Halts Investigation of Alzheimer’s Resea…
## $ headline.kicker <chr> NA, NA, "New York Today", NA, NA, NA, NA, NA, …
## $ headline.content_kicker <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## $ headline.print_headline <chr> NA, "Report Faults Scientists Working on Alzhe…
## $ headline.name <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## $ headline.seo <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## $ headline.sub <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
## $ byline.original <chr> "By Apoorva Mandavilli", "By Apoorva Mandavill…
## $ byline.person <list> [<data.frame[1 x 8]>], [<data.frame[1 x 8]>], …
## $ byline.organization <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
# What are the most used words?
# Build one text field per article from the headline, snippet and lead
# paragraph. Missing parts are replaced with "" first: paste() would otherwise
# turn NA into the literal string "NA", which would then be counted as a word.
text_cols <- c("headline.main", "snippet", "lead_paragraph")
text_parts <- lapply(df_articles[text_cols], function(part) {
  part <- as.character(part)
  part[is.na(part)] <- ""
  part
})
df_articles$text <- do.call(paste, c(unname(text_parts), sep = " "))
# Tokenize the text into one word per row and remove stop words
words <- df_articles %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word)
# Count occurrences of each remaining word, most frequent first
word_counts <- count(words, word, sort = TRUE)
# Display the ten most common words
head(word_counts, n = 10)
## word n
## 1 law 10
## 2 university 8
## 3 city 7
## 4 cuny 6
## 5 mayor 6
## 6 school 6
## 7 adams 5
## 8 college 5
## 9 york 5
## 10 alzheimer’s 4
# Horizontal bar chart of the ten most frequent words
word_counts %>%
  head(n = 10) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col() +
  labs(x = "Word", y = "Frequency") +
  coord_flip() + # flip the axes so the bars run horizontally
  theme_minimal()
