This notebook loads data from the NY Times API using the jsonlite package in R. A personal API key and an installed copy of jsonlite are prerequisites for running this notebook.
# p_load() installs any of these packages that are missing, then loads them all
pacman::p_load(dbplyr, tidyr, magrittr, stringr, udpipe, tm, lattice, tidytext, jsonlite, dplyr)
# This uses the Article Search API; one request per query term
election <- fromJSON('https://api.nytimes.com/svc/search/v2/articlesearch.json?q=election&api-key=MnxPBpfA2doPVbkRhXJVw0N7YlPjTTaW', flatten = TRUE) %>% data.frame()
space <- fromJSON('https://api.nytimes.com/svc/search/v2/articlesearch.json?q=space&api-key=MnxPBpfA2doPVbkRhXJVw0N7YlPjTTaW', flatten = TRUE) %>% data.frame()
nyc <- fromJSON('https://api.nytimes.com/svc/search/v2/articlesearch.json?q=nyc&api-key=MnxPBpfA2doPVbkRhXJVw0N7YlPjTTaW', flatten = TRUE) %>% data.frame()
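Hardcoding the key and repeating the full URL works, but a small helper keeps the key out of the notebook. A minimal sketch, assuming the key is stored in an NYT_API_KEY environment variable (e.g. set in ~/.Renviron; the variable and function names are mine):
# Hypothetical helper: build the Article Search URL for any query term,
# reading the API key from the environment instead of the source file
nyt_search <- function(term, api_key = Sys.getenv("NYT_API_KEY")) {
  url <- paste0('https://api.nytimes.com/svc/search/v2/articlesearch.json',
                '?q=', URLencode(term), '&api-key=', api_key)
  fromJSON(url, flatten = TRUE) %>% data.frame()
}
# election <- nyt_search("election")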
names(nyc)
## [1] "status"
## [2] "copyright"
## [3] "response.docs.abstract"
## [4] "response.docs.web_url"
## [5] "response.docs.snippet"
## [6] "response.docs.lead_paragraph"
## [7] "response.docs.print_section"
## [8] "response.docs.print_page"
## [9] "response.docs.source"
## [10] "response.docs.multimedia"
## [11] "response.docs.keywords"
## [12] "response.docs.pub_date"
## [13] "response.docs.document_type"
## [14] "response.docs.news_desk"
## [15] "response.docs.section_name"
## [16] "response.docs.type_of_material"
## [17] "response.docs._id"
## [18] "response.docs.word_count"
## [19] "response.docs.uri"
## [20] "response.docs.subsection_name"
## [21] "response.docs.headline.main"
## [22] "response.docs.headline.kicker"
## [23] "response.docs.headline.content_kicker"
## [24] "response.docs.headline.print_headline"
## [25] "response.docs.headline.name"
## [26] "response.docs.headline.seo"
## [27] "response.docs.headline.sub"
## [28] "response.docs.byline.original"
## [29] "response.docs.byline.person"
## [30] "response.docs.byline.organization"
## [31] "response.meta.hits"
## [32] "response.meta.offset"
## [33] "response.meta.time"
This cell drops every column from the data frame in which more than 25% of the values are NA.
nyc <- nyc[, !(sapply(nyc, function(x) mean(is.na(x))) > 0.25)]
space <- space[, !(sapply(space, function(x) mean(is.na(x))) > 0.25)]
election <- election[, !(sapply(election, function(x) mean(is.na(x))) > 0.25)]
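The same filter can also be written with dplyr's select() and where() (a sketch, assuming dplyr >= 1.0.0; it is equivalent to the base-R subsetting above):
# Keep only the columns with at most 25% missing values
nyc <- select(nyc, where(~ mean(is.na(.x)) <= 0.25))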
names(nyc)
## [1] "status" "copyright"
## [3] "response.docs.abstract" "response.docs.web_url"
## [5] "response.docs.snippet" "response.docs.lead_paragraph"
## [7] "response.docs.source" "response.docs.multimedia"
## [9] "response.docs.keywords" "response.docs.pub_date"
## [11] "response.docs.document_type" "response.docs.news_desk"
## [13] "response.docs.section_name" "response.docs.type_of_material"
## [15] "response.docs._id" "response.docs.word_count"
## [17] "response.docs.uri" "response.docs.headline.main"
## [19] "response.docs.byline.original" "response.docs.byline.person"
## [21] "response.meta.hits" "response.meta.offset"
## [23] "response.meta.time"
Select the three main columns that contain the text of each article. I used the snippet instead of the lead paragraph because it describes the content better; I did not use both, to avoid redundancy.
text_nyc <- select(nyc, "response.docs.source", "response.docs.snippet", "response.docs.headline.main")
text_election <- select(election, "response.docs.source", "response.docs.snippet", "response.docs.headline.main")
text_space <- select(space, "response.docs.source", "response.docs.snippet", "response.docs.headline.main")
head(text_space)
## response.docs.source
## 1 The New York Times
## 2 The New York Times
## 3 The New York Times
## 4 The New York Times
## 5 The New York Times
## 6 The New York Times
## response.docs.snippet
## 1 While two Apollo 15 crewmen roamed the lunar surface on a scientific mission, he took valuable photographs from the space capsule.
## 2 Scientists proposed a technique that would allow us to see more of the unseeable.
## 3 The planet is shedding its atmosphere into the void, a signal that was recorded but overlooked in 1986 when the robotic spacecraft flew past.
## 4 The Hayabusa2 spacecraft’s explosive encounter reveals that a space rock called Ryugu looks extremely young for its age.
## 5 A new explanation for the rocky world’s jumbled landscape opens a possibility that it could have had ingredients for habitability.
## 6 Work will be suspended at two NASA centers, a setback that could end hopes for sending astronauts back to the moon in 2024.
## response.docs.headline.main
## 1 Alfred Worden, 88, Dies; Orbited Moon and Walked in Deep Space
## 2 Infinite Visions Were Hiding in the First Black Hole Image’s Rings
## 3 Uranus Ejected a Giant Plasma Bubble During Voyager 2’s Visit
## 4 Remember When Japan Blasted an Asteroid? Here’s What We Learned
## 5 Life on the Planet Mercury? ‘It’s Not Completely Nuts’
## 6 Coronavirus Delays Work on NASA’s Moon Rocket and Capsule
Remove the stop words to cut the noise and surface the most used words in each dataset.
# Build a word-boundary regex of English stop words, then strip them from the snippets
stopwords_regex <- paste(stopwords('en'), collapse = '\\b|\\b')
stopwords_regex <- paste0('\\b', stopwords_regex, '\\b')
text_space$response.docs.snippet <- stringr::str_replace_all(text_space$response.docs.snippet, stopwords_regex, '')
text_election$response.docs.snippet <- stringr::str_replace_all(text_election$response.docs.snippet, stopwords_regex, '')
text_nyc$response.docs.snippet <- stringr::str_replace_all(text_nyc$response.docs.snippet, stopwords_regex, '')
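One caveat: stopwords('en') is all lower-case, so the pattern above leaves capitalized stop words ("The", "A", ...) at sentence starts untouched. A case-insensitive variant is a small change with stringr::regex(), sketched here for the space snippets:
# Case-insensitive match also removes capitalized stop words
text_space$response.docs.snippet <- stringr::str_replace_all(
  text_space$response.docs.snippet,
  stringr::regex(stopwords_regex, ignore_case = TRUE), '')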
# Download model from udpipe
model <- udpipe_download_model(language = "english")
## Downloading udpipe model from https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.4/master/inst/udpipe-ud-2.4-190531/english-ewt-ud-2.4-190531.udpipe to /Users/kevinpotter/Documents/spring_2020_ms/data_607/data_607/english-ewt-ud-2.4-190531.udpipe
## Visit https://github.com/jwijffels/udpipe.models.ud.2.4 for model license details
# Load the downloaded English model for annotation
udmodel_english <- udpipe_load_model(model$file_model)
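On re-runs, the download step can be skipped when the .udpipe file is already on disk. A minimal sketch, assuming the file name shown in the download message above:
# Reuse the model file if it is already present (file name from the message above)
model_file <- "english-ewt-ud-2.4-190531.udpipe"
if (!file.exists(model_file)) {
  model_file <- udpipe_download_model(language = "english")$file_model
}
udmodel_english <- udpipe_load_model(file = model_file)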
# Annotate the election snippets (tokenization, POS tagging) with udpipe
s <- udpipe_annotate(udmodel_english, text_election$response.docs.snippet)
x <- data.frame(s)
# Keep only the nouns, count token frequencies, and order the levels for plotting
stats <- subset(x, upos %in% c("NOUN"))
stats <- txt_freq(stats$token)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = head(stats, 20), col = "cadetblue",
main = "Most Occurring Nouns in Election Articles", xlab = "Freq")
s <- udpipe_annotate(udmodel_english, text_space$response.docs.snippet)
x <- data.frame(s)
stats <- subset(x, upos %in% c("NOUN"))
stats <- txt_freq(stats$token)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = head(stats, 20), col = "cadetblue",
main = "Most Occuring Nouns in Space Articles", xlab = "Freq")
s <- udpipe_annotate(udmodel_english, text_nyc$response.docs.snippet)
x <- data.frame(s)
stats <- subset(x, upos %in% c("NOUN"))
stats <- txt_freq(stats$token)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = head(stats, 20), col = "cadetblue",
main = "Most Occuring Nouns in NYC Articles", xlab = "Freq")
The results are what I would expect: the most common non-stop words differ across the three sets of articles. The most frequent word was "space" for the space articles, "people" for the NYC articles, and "intelligence" for the election articles. I was surprised not to see anything related to the virus outbreak in the NYC and election categories; aside from that, the word counts make sense.