This notebook loads data from the NY Times API using the jsonlite package in R. A personal API key and an installed copy of jsonlite are prerequisites for running this notebook.
# p_load() installs any of these packages that are missing, then loads them all
pacman::p_load(dbplyr, tidyr, magrittr, stringr, udpipe, tm, lattice, tidytext, jsonlite, dplyr)
# This uses the Article Search API; one request per query term
election <- fromJSON('https://api.nytimes.com/svc/search/v2/articlesearch.json?q=election&api-key=MnxPBpfA2doPVbkRhXJVw0N7YlPjTTaW', flatten = TRUE) %>% data.frame()
space <- fromJSON('https://api.nytimes.com/svc/search/v2/articlesearch.json?q=space&api-key=MnxPBpfA2doPVbkRhXJVw0N7YlPjTTaW', flatten = TRUE) %>% data.frame()
nyc <- fromJSON('https://api.nytimes.com/svc/search/v2/articlesearch.json?q=nyc&api-key=MnxPBpfA2doPVbkRhXJVw0N7YlPjTTaW', flatten = TRUE) %>% data.frame()
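Hardcoding the key and repeating the full URL works, but a small helper keeps the key out of the notebook. A minimal sketch, assuming the key is stored in an NYT_API_KEY environment variable (e.g. set in ~/.Renviron; the variable and function names are mine):
# Hypothetical helper: build the Article Search URL for any query term,
# reading the API key from the environment instead of the source file
nyt_search <- function(term, api_key = Sys.getenv("NYT_API_KEY")) {
  url <- paste0('https://api.nytimes.com/svc/search/v2/articlesearch.json',
                '?q=', URLencode(term), '&api-key=', api_key)
  fromJSON(url, flatten = TRUE) %>% data.frame()
}
# election <- nyt_search("election")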
names(nyc)
## [1] "status"
## [2] "copyright"
## [3] "response.docs.abstract"
## [4] "response.docs.web_url"
## [5] "response.docs.snippet"
## [6] "response.docs.lead_paragraph"
## [7] "response.docs.print_section"
## [8] "response.docs.print_page"
## [9] "response.docs.source"
## [10] "response.docs.multimedia"
## [11] "response.docs.keywords"
## [12] "response.docs.pub_date"
## [13] "response.docs.document_type"
## [14] "response.docs.news_desk"
## [15] "response.docs.section_name"
## [16] "response.docs.type_of_material"
## [17] "response.docs._id"
## [18] "response.docs.word_count"
## [19] "response.docs.uri"
## [20] "response.docs.subsection_name"
## [21] "response.docs.headline.main"
## [22] "response.docs.headline.kicker"
## [23] "response.docs.headline.content_kicker"
## [24] "response.docs.headline.print_headline"
## [25] "response.docs.headline.name"
## [26] "response.docs.headline.seo"
## [27] "response.docs.headline.sub"
## [28] "response.docs.byline.original"
## [29] "response.docs.byline.person"
## [30] "response.docs.byline.organization"
## [31] "response.meta.hits"
## [32] "response.meta.offset"
## [33] "response.meta.time"
This cell drops every column from the data frame in which more than 25% of the values are NA.
nyc <- nyc[, !(sapply(nyc, function(x) mean(is.na(x))) > 0.25)]
space <- space[, !(sapply(space, function(x) mean(is.na(x))) > 0.25)]
election <- election[, !(sapply(election, function(x) mean(is.na(x))) > 0.25)]
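The same filter can also be written with dplyr's select() and where() (a sketch, assuming dplyr >= 1.0.0; it is equivalent to the base-R subsetting above):
# Keep only the columns with at most 25% missing values
nyc <- select(nyc, where(~ mean(is.na(.x)) <= 0.25))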
names(nyc)
## [1] "status" "copyright"
## [3] "response.docs.abstract" "response.docs.web_url"
## [5] "response.docs.snippet" "response.docs.lead_paragraph"
## [7] "response.docs.source" "response.docs.multimedia"
## [9] "response.docs.keywords" "response.docs.pub_date"
## [11] "response.docs.document_type" "response.docs.news_desk"
## [13] "response.docs.section_name" "response.docs.type_of_material"
## [15] "response.docs._id" "response.docs.word_count"
## [17] "response.docs.uri" "response.docs.headline.main"
## [19] "response.docs.byline.original" "response.docs.byline.person"
## [21] "response.meta.hits" "response.meta.offset"
## [23] "response.meta.time"
Select the three main columns that contain the text of each article. I used the snippet instead of the lead paragraph because it describes the content better; I did not use both, to avoid redundancy.
text_nyc <- select(nyc, "response.docs.source", "response.docs.snippet", "response.docs.headline.main")
text_election <- select(election, "response.docs.source", "response.docs.snippet", "response.docs.headline.main")
text_space <- select(space, "response.docs.source", "response.docs.snippet", "response.docs.headline.main")
head(text_space)
## response.docs.source
## 1 The New York Times
## 2 The New York Times
## 3 The New York Times
## 4 The New York Times
## 5 The New York Times
## 6 The New York Times
## response.docs.snippet
## 1 While two Apollo 15 crewmen roamed the lunar surface on a scientific mission, he took valuable photographs from the space capsule.
## 2 Scientists proposed a technique that would allow us to see more of the unseeable.
## 3 The planet is shedding its atmosphere into the void, a signal that was recorded but overlooked in 1986 when the robotic spacecraft flew past.
## 4 The Hayabusa2 spacecraft’s explosive encounter reveals that a space rock called Ryugu looks extremely young for its age.
## 5 A new explanation for the rocky world’s jumbled landscape opens a possibility that it could have had ingredients for habitability.
## 6 Work will be suspended at two NASA centers, a setback that could end hopes for sending astronauts back to the moon in 2024.
## response.docs.headline.main
## 1 Alfred Worden, 88, Dies; Orbited Moon and Walked in Deep Space
## 2 Infinite Visions Were Hiding in the First Black Hole Image’s Rings
## 3 Uranus Ejected a Giant Plasma Bubble During Voyager 2’s Visit
## 4 Remember When Japan Blasted an Asteroid? Here’s What We Learned
## 5 Life on the Planet Mercury? ‘It’s Not Completely Nuts’
## 6 Coronavirus Delays Work on NASA’s Moon Rocket and Capsule
Remove the stop words to cut the noise and surface the most used words in each dataset.
# Build a word-boundary regex of English stop words, then strip them from the snippets
stopwords_regex <- paste(stopwords('en'), collapse = '\\b|\\b')
stopwords_regex <- paste0('\\b', stopwords_regex, '\\b')
text_space$response.docs.snippet <- stringr::str_replace_all(text_space$response.docs.snippet, stopwords_regex, '')
text_election$response.docs.snippet <- stringr::str_replace_all(text_election$response.docs.snippet, stopwords_regex, '')
text_nyc$response.docs.snippet <- stringr::str_replace_all(text_nyc$response.docs.snippet, stopwords_regex, '')
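One caveat: stopwords('en') is all lower-case, so the pattern above leaves capitalized stop words ("The", "A", ...) at sentence starts untouched. A case-insensitive variant is a small change with stringr::regex(), sketched here for the space snippets:
# Case-insensitive match also removes capitalized stop words
text_space$response.docs.snippet <- stringr::str_replace_all(
  text_space$response.docs.snippet,
  stringr::regex(stopwords_regex, ignore_case = TRUE), '')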
# Download model from udpipe
model <- udpipe_download_model(language = "english")
## Downloading udpipe model from https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.4/master/inst/udpipe-ud-2.4-190531/english-ewt-ud-2.4-190531.udpipe to /Users/kevinpotter/Documents/spring_2020_ms/data_607/data_607/english-ewt-ud-2.4-190531.udpipe
## Visit https://github.com/jwijffels/udpipe.models.ud.2.4 for model license details
# Load the downloaded English model for annotation
udmodel_english <- udpipe_load_model(model$file_model)
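On re-runs, the download step can be skipped when the .udpipe file is already on disk. A minimal sketch, assuming the file name shown in the download message above:
# Reuse the model file if it is already present (file name from the message above)
model_file <- "english-ewt-ud-2.4-190531.udpipe"
if (!file.exists(model_file)) {
  model_file <- udpipe_download_model(language = "english")$file_model
}
udmodel_english <- udpipe_load_model(file = model_file)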
# Annotate the election snippets (tokenization, POS tagging) with udpipe
s <- udpipe_annotate(udmodel_english, text_election$response.docs.snippet)
x <- data.frame(s)
# Keep only the nouns, count token frequencies, and order the levels for plotting
stats <- subset(x, upos %in% c("NOUN"))
stats <- txt_freq(stats$token)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = head(stats, 20), col = "cadetblue",
main = "Most Occurring Nouns in Election Articles", xlab = "Freq")
s <- udpipe_annotate(udmodel_english, text_space$response.docs.snippet)
x <- data.frame(s)
stats <- subset(x, upos %in% c("NOUN"))
stats <- txt_freq(stats$token)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = head(stats, 20), col = "cadetblue",
main = "Most Occuring Nouns in Space Articles", xlab = "Freq")
s <- udpipe_annotate(udmodel_english, text_nyc$response.docs.snippet)
x <- data.frame(s)
stats <- subset(x, upos %in% c("NOUN"))
stats <- txt_freq(stats$token)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = head(stats, 20), col = "cadetblue",
main = "Most Occuring Nouns in NYC Articles", xlab = "Freq")
The results are what I would expect: the most common non-stop words differ across the three sets of articles. The most frequent word was "space" for the space articles, "people" for the NYC articles, and "intelligence" for the election articles. I was surprised not to see anything related to the virus outbreak in the NYC and election categories; aside from that, the word counts make sense.