Data Extraction and Storage in MongoDB

library(jsonlite)
library(ggplot2)
library(dplyr)
library(tidyr)
library(mongolite)
library(corrplot)
library(DT)
# NYT API key (redacted; obtain one at https://developer.nytimes.com/signup)
api_key <- '<YOUR_NYT_API_KEY>'
# MongoDB connection (credentials redacted)
con <- mongo(collection = "data607", url = "mongodb://<username>:<password>@ds135876.mlab.com:35876/data607")


# Required:
# 1. New York Times API key: https://developer.nytimes.com/signup (1,000-requests-per-day limit)
# 2. MongoDB connection. mLab was used for this study; credentials are hidden for security.
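# An alternative (a sketch, not the original setup): keep secrets out of the
# source entirely by reading them from environment variables. NYT_API_KEY and
# MONGO_URL are hypothetical names; set them in .Renviron or the shell before
# knitting, then uncomment:
# api_key <- Sys.getenv("NYT_API_KEY")
# con <- mongo(collection = "data607", url = Sys.getenv("MONGO_URL"))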


# Function that loops through multiple pages and requests data from the NYT Article Search API
# Obtain - OSEMN
download <- function(query, pages){
  for (p in seq(0, pages - 1)){  # pages are zero-indexed; 10 results per page
    # Print Page Iteration
    cat(paste("Page", p, "\n"))
    
    # Replace spaces with URL-friendly encoding (base R's URLencode() covers the general case)
    url_query <- gsub(' ', '%20', query)
    
    # Compose the request URL, with 2012-01-01 as the earliest publication date
    ai_url <- paste0("https://api.nytimes.com/svc/search/v2/articlesearch.json?", 
                  "api-key=", api_key,
                  "&q=", url_query,
                  "&fl=", "pub_date,headline,news_desk",
                  "&begin_date=", "20120101",
                  "&page=", p,
                  "&sort=newest")
    
    
    
    # Request the page response as json
    ai_json <- fromJSON(ai_url, simplifyVector = TRUE)

    # Convert the json into a dataframe
    ai <- data.frame(ai_json)
    ai$tag <- query
    
    # Insert into mongoDB
    con$insert(ai)
    
    # 2-second delay to stay within the NYT rate limits
    Sys.sleep(2)
  }
}
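The loop assumes every request succeeds; one failed page would abort the whole run. A defensive variant (a sketch, not part of the original collection run) wraps the request so a transient failure is logged and skipped:

# Sketch: a wrapper that returns NULL on failure instead of stopping the loop;
# inside download(), the caller would skip the insert when NULL comes back.
safe_fromJSON <- function(url) {
  tryCatch(fromJSON(url, simplifyVector = TRUE),
           error = function(e) {
             message("Request failed: ", conditionMessage(e))
             NULL
           })
}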

# Perform Data Collection.
# Obtain - OSEMN
# Args: (query, number of pages)

#download('artificial intelligence', 100)
#download('data', 100)
#download('data science', 100)
#download('machine learning', 100)
#download('big data', 100)

Data Reading, Cleanup and Transformation

# Scrub - OSEMN

# Data Extraction from mongoDB
all_data <- con$find()


# Keep only the selected columns; dates stay as character strings for now
df <- data.frame(date = all_data$response_docs_pub_date,
                 tag = all_data$tag,
                 stringsAsFactors = FALSE)
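Since dplyr is already attached, the same selection can be written as a single transmute call (a sketch; it assumes the flattened column names returned by con$find() above):

# Sketch: dplyr equivalent of the base-R selection above.
df <- all_data %>%
  transmute(date = response_docs_pub_date,
            tag = tag)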

The collected data is now stored in a cloud-based MongoDB collection. It consists of up to one thousand returned articles for each of the following queries: 'artificial intelligence', 'data', 'data science', 'machine learning', and 'big data'.
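A quick sanity check of the collection size is available straight from the connection object (count() is part of mongolite):

# Sketch: total stored documents; roughly 5 queries x 1,000 articles expected.
con$count()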

It is now time for an analysis and visual exploration of the collected articles.

Statistical Analysis and Charting

# Explore - OSEMN
# Convert date into Year-Month and count frequency
frequency_monthly <- as.data.frame.matrix(table(format(as.Date(df$date, '%Y-%m-%d'), '%Y-%m'), df$tag))
# Find correlation between number of articles and each subject
correlation_monthly <- cor(frequency_monthly)

# Plot Correlation
corrplot(correlation_monthly, type = "upper", order = "hclust", tl.col = "black", tl.srt = 45)

# Determine quartiles and outliers using a boxplot
boxplot(frequency_monthly)

datatable(frequency_monthly)

While frequency aggregation by month was a fair first attempt, the number of matches per category varied too much to draw any conclusions. The 'data' query in particular returned a thousand matches within just the last two months. Since the NYT API allows at most 1,000 requests per day, I could not go beyond the already-collected data.
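The imbalance is easy to see when the monthly counts are drawn per tag (a sketch using the already-loaded ggplot2 and tidyr, reusing frequency_monthly from above):

# Sketch: reshape the monthly table to long form and draw one line per tag.
frequency_monthly %>%
  mutate(month = rownames(frequency_monthly)) %>%
  gather(tag, articles, -month) %>%
  ggplot(aes(x = month, y = articles, colour = tag, group = tag)) +
  geom_line() +
  theme(axis.text.x = element_text(angle = 90, size = 6))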

Let’s break it down by daily results:

# Explore - OSEMN

frequency_daily <- as.data.frame.matrix(table(as.Date(df$date, '%Y-%m-%d'), df$tag))
head(frequency_daily)
##            artificial intelligence big data data data science
## 2013-05-08                       0        0    0            0
## 2013-05-11                       0        0    0            0
## 2013-05-12                       0        0    0            0
## 2013-05-13                       0        0    0            0
## 2013-05-15                       0        0    0            0
## 2013-05-16                       0        0    0            0
##            machine learning
## 2013-05-08                1
## 2013-05-11                1
## 2013-05-12                1
## 2013-05-13                2
## 2013-05-15                2
## 2013-05-16                1
summary(frequency_daily)
##  artificial intelligence    big data            data        
##  Min.   : 0.0000         Min.   : 0.0000   Min.   : 0.0000  
##  1st Qu.: 0.0000         1st Qu.: 0.0000   1st Qu.: 0.0000  
##  Median : 0.0000         Median : 0.0000   Median : 0.0000  
##  Mean   : 0.9637         Mean   : 0.9637   Mean   : 0.9637  
##  3rd Qu.: 1.0000         3rd Qu.: 0.0000   3rd Qu.: 0.0000  
##  Max.   :10.0000         Max.   :20.0000   Max.   :79.0000  
##   data science    machine learning
##  Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :1.0000  
##  Mean   :0.9637   Mean   :0.9447  
##  3rd Qu.:2.0000   3rd Qu.:1.0000  
##  Max.   :9.0000   Max.   :6.0000
# Explore - OSEMN
# Scale the values and examine quartiles and means via side-by-side boxplots
scaled_frequency_daily <- scale(frequency_daily)
boxplot(scaled_frequency_daily)
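For reference, scale() standardizes each column by subtracting the column mean and dividing by the column standard deviation; a hand-rolled equivalent:

# Sketch: column-wise standardization equivalent to scale(frequency_daily).
z <- sweep(frequency_daily, 2, colMeans(frequency_daily), "-")
z <- sweep(z, 2, apply(frequency_daily, 2, sd), "/")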

The collection window reached back to 2012-01-01 or 1,000 results, whichever came first. Machine learning appears to have had the fewest matches per day, which allowed the download loop to reach back to 2013.
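This can be verified by checking the earliest collected date per tag (a sketch against the df built earlier):

# Sketch: earliest publication date reached by each query.
df %>%
  group_by(tag) %>%
  summarise(earliest = min(as.Date(date, '%Y-%m-%d')))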

It seems we can only run a fair comparison on days in which:

  1. Each of the categories being compared returned some data ( > 0 ).
  2. The 'artificial intelligence' query returned one or more results, since this is the main subject we are trying to find correlations for.
# Model - OSEMN

# Subset the daily frequency initialized in the previous block where AI >0

ai_nonzero <- subset(frequency_daily, frequency_daily$`artificial intelligence`>0)
head(ai_nonzero)
##            artificial intelligence big data data data science
## 2015-09-18                       2        0    0            0
## 2015-09-20                       2        0    0            0
## 2015-09-21                       1        0    0            0
## 2015-09-24                       1        0    0            0
## 2015-09-25                       2        0    0            0
## 2015-10-01                       2        0    0            0
##            machine learning
## 2015-09-18                3
## 2015-09-20                2
## 2015-09-21                1
## 2015-09-24                0
## 2015-09-25                2
## 2015-10-01                1

ai_nonzero goes back to the third quarter of 2015 with 518 rows, each representing an individual day (days on which the 'artificial intelligence' query returned no articles are excluded).
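Both figures are easy to confirm from the subset itself:

# Sketch: row count and earliest day in the AI-nonzero subset.
nrow(ai_nonzero)           # expected: 518
min(rownames(ai_nonzero))  # earliest day with at least one AI article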

# Model more - OSEMN

# Find correlation between number of articles and each subject
corr_ai_nonzero <- cor(ai_nonzero)

# Plot Correlation
corrplot(corr_ai_nonzero, type = "upper", order = "hclust", tl.col = "black", tl.srt = 45)

There appears to be a positive correlation between all categories.

The final test will determine the correlation of the 'artificial intelligence' category with each 'data' group individually. Each pairwise subset includes only the days with non-zero counts for both categories.

# iNterpret - OSEMN
# Artificial intelligence and big data
ai_bd <- subset(frequency_daily, (frequency_daily$`artificial intelligence` > 0) & 
                  (frequency_daily$`big data` > 0))

ai_d <- subset(frequency_daily, (frequency_daily$`artificial intelligence` > 0) & 
                  (frequency_daily$data > 0))

ai_ds <- subset(frequency_daily, (frequency_daily$`artificial intelligence` > 0) & 
                  (frequency_daily$`data science` > 0))

ai_ml <- subset(frequency_daily, (frequency_daily$`artificial intelligence` > 0) & 
                  (frequency_daily$`machine learning` > 0))

corr_ai_bd <- cor(ai_bd$`artificial intelligence`, ai_bd$`big data`)
corr_ai_d <- cor(ai_d$`artificial intelligence`, ai_d$data)
corr_ai_ds <- cor(ai_ds$`artificial intelligence`, ai_ds$`data science`)
corr_ai_ml <- cor(ai_ml$`artificial intelligence`, ai_ml$`machine learning`)

cat(paste0("Correlation between AI and Big Data: ", round(corr_ai_bd, 2), "\n"))
## Correlation between AI and Big Data: 0.45
cat(paste0("Correlation between AI and Data: ", round(corr_ai_d, 2), "\n"))
## Correlation between AI and Data: 0.57
cat(paste0("Correlation between AI and Data Science: ", round(corr_ai_ds, 2), "\n"))
## Correlation between AI and Data Science: 0.24
cat(paste0("Correlation between AI and Machine Learning: ", round(corr_ai_ml, 2), "\n"))
## Correlation between AI and Machine Learning: 0.42
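These figures are point estimates only; cor.test attaches a confidence interval and p-value to each pair. A sketch for the AI and Big Data pair (the other pairs follow the same pattern):

# Sketch: significance test for the AI / Big Data daily correlation.
cor.test(ai_bd$`artificial intelligence`, ai_bd$`big data`)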

Conclusion

A low-to-moderate positive correlation was found between the number of articles that appeared in The New York Times on artificial intelligence and each of the four data-related categories. The results suggest a relationship among all the topics included in this observational study. We can assume that when AI is 'talked about', data-related articles are more likely to be published as well.