library(jsonlite)
library(ggplot2)
library(dplyr)
library(tidyr)
library(mongolite)
library(corrplot)
library(DT)
# Required:
# 1. New York Times API key: https://developer.nytimes.com/signup (1,000 requests per day limit)
# 2. MongoDB connection. Used mLab for this study; credentials hidden for security.
# NYT API key, read from an environment variable rather than hard-coded
# (NYT_API_KEY and MONGO_URL are illustrative variable names)
api_key <- Sys.getenv("NYT_API_KEY")
# MongoDB connection
con <- mongo(collection = "data607", url = Sys.getenv("MONGO_URL"))
# Function that loops through multiple pages and requests data from the NYT article search API
# Obtain - OSEMN
download <- function(query, pages){
  for (p in seq(0, pages)){
    # Print page iteration
    cat(paste("Page", p, "\n"))
    # Replace spaces with URL-friendly characters
    url_query <- gsub(' ', '%20', query)
    # Compose a URL with the first day of 2012 as the beginning of the query window
    ai_url <- paste0("https://api.nytimes.com/svc/search/v2/articlesearch.json?",
                     "api-key=", api_key,
                     "&q=", url_query,
                     "&fl=", "pub_date,headline,news_desk",
                     "&begin_date=", "20120101",
                     "&page=", p,
                     "&sort=newest")
    # Request the page response as JSON
    ai_json <- fromJSON(ai_url, simplifyVector = TRUE)
    # Convert the JSON into a data frame
    ai <- data.frame(ai_json)
    ai$tag <- query
    # Insert into MongoDB
    con$insert(ai)
    # 2-second delay so as not to exceed the NYT rate limits
    Sys.sleep(2)
  }
}
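Since long-running API loops occasionally hit transient errors or rate-limit responses, a defensive variant could wrap each request in tryCatch(). This is a minimal sketch; fetch_page, max_retries, and wait are illustrative names, not part of the original script:
# Sketch: retry wrapper around a single page request (hypothetical helper)
fetch_page <- function(url, max_retries = 3, wait = 5) {
  for (attempt in seq_len(max_retries)) {
    result <- tryCatch(fromJSON(url, simplifyVector = TRUE),
                       error = function(e) NULL)
    if (!is.null(result)) return(result)
    cat(paste("Attempt", attempt, "failed; retrying in", wait, "seconds\n"))
    Sys.sleep(wait)
  }
  stop("All retries failed for: ", url)
}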
# Perform data collection.
# Obtain, function trigger - OSEMN
# Args: (query, number of pages)
#download('artificial intelligence', 100)
#download('data', 100)
#download('data science', 100)
#download('machine learning', 100)
#download('big data', 100)
# Scrub - OSEMN
# Data extraction from MongoDB
all_data <- con$find()
# Keep only the publication date and query tag columns
# (using `=` inside data.frame; the original `<-` mangled the column names,
# which is why a separate rename step was needed)
df <- data.frame(date = all_data$response_docs_pub_date,
                 tag  = all_data$tag,
                 stringsAsFactors = FALSE)
The collected data is now stored in a cloud-based MongoDB collection. It consists of one thousand returned articles for each of the following queries: 'artificial intelligence', 'data', 'data science', 'machine learning', and 'big data'.
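As a quick sanity check, the per-tag document counts can be verified directly in MongoDB with an aggregation; a minimal sketch, assuming the same con connection created above:
# Sketch: count stored documents per query tag
con$aggregate('[{"$group": {"_id": "$tag", "count": {"$sum": 1}}}]')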
It is now time for an analysis and visual exploration of the collected articles.
# Explore - OSEMN
# Convert date into Year-Month and count frequency
frequency_monthly <- as.data.frame.matrix(table(format(as.Date(df$date, '%Y-%m-%d'), '%Y-%m'), df$tag))
# Find correlation between number of articles and each subject
correlation_monthly <- cor(frequency_monthly)
# Plot Correlation
corrplot(correlation_monthly, type = "upper", order = "hclust", tl.col = "black", tl.srt = 45)
# Determine quartiles and outliers using boxplot
boxplot(frequency_monthly)
datatable(frequency_monthly)
While frequency aggregation by month was a fair attempt, the number of matches found per category varied too much to draw any conclusions. The 'data' query in particular returned a thousand matches within just the last two months. Since the NYT API is limited to 1,000 requests per day, I could not go beyond the already collected data.
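To make that imbalance concrete, a small sketch (using the frequency_monthly table built above) compares each category's total match count and month-to-month spread:
# Sketch: total matches and month-to-month spread per category
colSums(frequency_monthly)
sapply(frequency_monthly, sd)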
Let’s break it down by daily results:
# Explore - OSEMN
frequency_daily <- as.data.frame.matrix(table(as.Date(df$date, '%Y-%m-%d'), df$tag))
head(frequency_daily)
## artificial intelligence big data data data science
## 2013-05-08 0 0 0 0
## 2013-05-11 0 0 0 0
## 2013-05-12 0 0 0 0
## 2013-05-13 0 0 0 0
## 2013-05-15 0 0 0 0
## 2013-05-16 0 0 0 0
## machine learning
## 2013-05-08 1
## 2013-05-11 1
## 2013-05-12 1
## 2013-05-13 2
## 2013-05-15 2
## 2013-05-16 1
summary(frequency_daily)
## artificial intelligence big data data
## Min. : 0.0000 Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.0000
## Median : 0.0000 Median : 0.0000 Median : 0.0000
## Mean : 0.9637 Mean : 0.9637 Mean : 0.9637
## 3rd Qu.: 1.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. :10.0000 Max. :20.0000 Max. :79.0000
## data science machine learning
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :1.0000
## Mean :0.9637 Mean :0.9447
## 3rd Qu.:2.0000 3rd Qu.:1.0000
## Max. :9.0000 Max. :6.0000
# Explore - OSEMN
# Scale the values and examine quartiles and means via multiple joined boxplots
scaled_frequency_daily <- scale(frequency_daily)
boxplot(scaled_frequency_daily)
Each query was bounded by a begin date of 2012-01-01 or by the 1,000-result cap, whichever came first. Machine learning had the fewest results per day, which allowed the download loop to reach back to 2013.
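To verify how far back each query actually reached, a small dplyr sketch over the df built earlier:
# Sketch: earliest and latest publication date per query tag
df %>%
  mutate(date = as.Date(date, '%Y-%m-%d')) %>%
  group_by(tag) %>%
  summarise(earliest = min(date), latest = max(date), n = n())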
It seems we can only run a fair comparison across days on which the 'artificial intelligence' query returned at least one article:
# Model more - OSEMN
# Subset the daily frequency table built in the previous block to days where AI > 0
ai_nonzero <- subset(frequency_daily, frequency_daily$`artificial intelligence`>0)
head(ai_nonzero)
## artificial intelligence big data data data science
## 2015-09-18 2 0 0 0
## 2015-09-20 2 0 0 0
## 2015-09-21 1 0 0 0
## 2015-09-24 1 0 0 0
## 2015-09-25 2 0 0 0
## 2015-10-01 2 0 0 0
## machine learning
## 2015-09-18 3
## 2015-09-20 2
## 2015-09-21 1
## 2015-09-24 0
## 2015-09-25 2
## 2015-10-01 1
ai_nonzero goes back to the third quarter of 2015 and contains 518 rows, each representing an individual day (days with no 'artificial intelligence' articles are excluded).
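Those figures can be confirmed directly from the subset:
# Sketch: row count and earliest day in the subset
nrow(ai_nonzero)
head(rownames(ai_nonzero), 1)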
# Model more - OSEMN
# Find correlation between number of articles and each subject
corr_ai_nonzero <- cor(ai_nonzero)
# Plot Correlation
corrplot(corr_ai_nonzero, type = "upper", order = "hclust", tl.col = "black", tl.srt = 45)
There appears to be a positive correlation among all categories.
The final test will determine the correlation of the AI category with each 'data' group individually. This will include only the days with non-zero values in both categories being compared.
# iNterpret - OSEMN
# Pairwise subsets: days where both AI and each data-related query are non-zero
ai_bd <- subset(frequency_daily, (frequency_daily$`artificial intelligence` > 0) &
(frequency_daily$`big data` > 0))
ai_d <- subset(frequency_daily, (frequency_daily$`artificial intelligence` > 0) &
(frequency_daily$data > 0))
ai_ds <- subset(frequency_daily, (frequency_daily$`artificial intelligence` > 0) &
(frequency_daily$`data science` > 0))
ai_ml <- subset(frequency_daily, (frequency_daily$`artificial intelligence` > 0) &
(frequency_daily$`machine learning` > 0))
corr_ai_bd <- cor(ai_bd$`artificial intelligence`, ai_bd$`big data`)
corr_ai_d <- cor(ai_d$`artificial intelligence`, ai_d$data)
corr_ai_ds <- cor(ai_ds$`artificial intelligence`, ai_ds$`data science`)
corr_ai_ml <- cor(ai_ml$`artificial intelligence`, ai_ml$`machine learning`)
cat(paste0("Correlation between AI and Big Data: ", round(corr_ai_bd, 2), "\n"))
## Correlation between AI and Big Data: 0.45
cat(paste0("Correlation between AI and Data: ", round(corr_ai_d, 2), "\n"))
## Correlation between AI and Data: 0.57
cat(paste0("Correlation between AI and Data Science: ", round(corr_ai_ds, 2), "\n"))
## Correlation between AI and Data Science: 0.24
cat(paste0("Correlation between AI and Machine Learning: ", round(corr_ai_ml, 2), "\n"))
## Correlation between AI and Machine Learning: 0.42
A low to moderate positive correlation was found between the number of articles that appeared in The New York Times on artificial intelligence and each of the four data-related categories. The results suggest there is a relationship among all the topics included in this observational study. We can assume that when AI is 'talked about', data-related articles are more likely to be discussed as well.
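As a possible extension beyond this analysis, cor.test() would attach a confidence interval and p-value to each of these estimates; a sketch for the AI and Big Data pair, reusing the ai_bd subset from above:
# Sketch: significance test for the AI / Big Data correlation
cor.test(ai_bd$`artificial intelligence`, ai_bd$`big data`)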