Source files: https://github.com/djlofland/DATA607_F2019/tree/master/Week9
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(jsonlite)
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
library(formattable)
# We will be working with the NYT Movies API
NYT_MOVIE_API_BASE <- "https://api.nytimes.com/svc/movies/v2"
NYT_ACTION_PICKS <- "/reviews/picks.json?"
NYT_ACTION_CRITIC <- "/critics/A.%20O.%20Scott.json?"
NYT_ACTION_CRITICS <- "/critics/all.json?"
NYT_ACTION_SEARCH <- "/reviews/search.json?"
# Note that my NYT API key is stored in my .Renviron file, making it available as an environment variable in any project.
APIKEY <- paste("&api-key=", Sys.getenv("NYTIMES_APIKEY_D607"), "&", sep="")
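# (Aside, an assumption about setup rather than code from this project) One way to store the
# key is to open ~/.Renviron, e.g. with usethis::edit_r_environ(), and add a line such as:
#   NYTIMES_APIKEY_D607=<your key here>
# After restarting R, Sys.getenv("NYTIMES_APIKEY_D607") will return that value.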
# Local cache data file to use if available
local_cache_fn <- './data/reviews.csv'
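The output that follows shows the fully assembled all-critics endpoint. The chunk that produced it is not echoed here; a plausible reconstruction (my assumption, using the constants defined above) is:

# Plausible reconstruction (assumption): assemble and print the all-critics URL
paste(NYT_MOVIE_API_BASE, NYT_ACTION_CRITICS, APIKEY, sep="")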
## [1] "https://api.nytimes.com/svc/movies/v2/critics/all.json?&api-key=CMGVPoNAFeGZSrQtDOZAEonuEsOLA5xX&"
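The next output is the Critics' Picks endpoint, presumably built the same way (again a plausible reconstruction rather than echoed source):

# Plausible reconstruction (assumption): assemble and print the Critics' Picks URL
paste(NYT_MOVIE_API_BASE, NYT_ACTION_PICKS, APIKEY, sep="")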
## [1] "https://api.nytimes.com/svc/movies/v2/reviews/picks.json?&api-key=CMGVPoNAFeGZSrQtDOZAEonuEsOLA5xX&"
Note that the NYT API has throttle limits: roughly 10 queries per minute (the loop below sleeps just over 6 seconds between requests) and a maximum of 4,000 queries per day. In this example, I set up a loop to see if I could scrape all of the movie reviews. After several attempts, it's clear that if I really want all movie reviews, I'll have to babysit the process for the better part of a day. So, I let it run for 758 pages at 20 reviews per page, for a total of 15,160 rows (movie reviews). This is more than sufficient for proof of concept in this project.
# Check if we have a local copy of the data available to load
isCacheFound <- FALSE
# Check whether we previously downloaded the review data from the NYT API. If found, we will use the cached copy. If not, we'll kick off a job to pull the data from NYT (note: this takes a while)
if(file.exists(local_cache_fn)) {
msg <- paste('Found cached copy: ', local_cache_fn, sep='')
isCacheFound <- TRUE
}
if(!isCacheFound) {
# Kick off the process to download data from the NYT API
# List that will hold each page of results returned from the API
reviews <- list()
# We will need to check the has_more column
has_more <- TRUE
page <- 0
while(has_more) {
# construct the URL - note we add an offset to handle 20 results per page
# in the response, if the has_more column is TRUE, we will need to request the next page
url <- paste(NYT_MOVIE_API_BASE, NYT_ACTION_SEARCH, APIKEY, "offset=", page*20, sep="")
# download results from API
results <- fromJSON(url, flatten = TRUE) %>%
data.frame()
message("Retrieving page: ", page + 1)
# save off the current page of results
reviews[[page+1]] <- results
# check whether there are more results we'll need to pull
has_more <- results$has_more[1]
page <- page + 1
# Optional limiter we can uncomment to only grab the first few pages of results
# if(page==3) {
# has_more <- FALSE
# }
# The NYT API has a rate limit of 10 requests per minute (HTTP error 429 if this is exceeded)
Sys.sleep(6.1)
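# (Aside, not part of the original run) If a 429 response ever slips through despite the pause,
# one option would be to wrap the fromJSON() call above in tryCatch(), sleep for a minute on
# error, and retry the same page before advancing.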
}
# combine the list of dataframes into a single main dataframe
data_df <- data.table::rbindlist(reviews)
# Cache the processed data as a CSV for future runs
write.csv(data_df, local_cache_fn, row.names=FALSE, na="")
} else {
# Now load the data from local cache CSV
data_df <- read_csv(local_cache_fn, col_names = TRUE)
isDataLoaded <- TRUE
msg <- 'Cached CSV data loaded.'
}

## Parsed with column specification:
## cols(
## .default = col_character(),
## has_more = col_logical(),
## num_results = col_double(),
## results.critics_pick = col_double(),
## results.publication_date = col_date(format = ""),
## results.opening_date = col_date(format = ""),
## results.date_updated = col_datetime(format = ""),
## results.multimedia.width = col_double(),
## results.multimedia.height = col_double(),
## results.multimedia = col_logical()
## )
## See spec(...) for full column specifications.
## Warning: 77 parsing failures.
## row col expected actual file
## 106 results.opening_date valid date 0000-00-00 './data/reviews.csv'
## 248 results.opening_date valid date 0000-00-00 './data/reviews.csv'
## 1074 results.opening_date valid date 0000-00-00 './data/reviews.csv'
## 1135 results.opening_date valid date 1969-00-00 './data/reviews.csv'
## 1207 results.opening_date valid date 0000-00-00 './data/reviews.csv'
## .... .................... .......... .......... ....................
## See problems(...) for more details.
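The warning above suggests inspecting the failures with problems(); a quick sketch (assuming data_df was loaded from the cached CSV via read_csv, as in the branch above):

# Inspect which rows/columns failed to parse (sketch)
parse_issues <- problems(data_df)
head(parse_issues)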
# Drop rows with missing dates - since I'm not really interested in dates and am not
# doing any analysis where the loss of rows might affect interpretation, it's safe to just
# drop rows with any missing dates. Note: there are also nulls in the rating column
# for movies that didn't receive an official MPAA rating letter.
# For these, I'll replace nulls with 'Not Rated'. Next, the bylines are a mess - I'll do some
# quick spot cleanup.
data_df2 <- data_df %>%
drop_na(results.publication_date, results.opening_date, results.date_updated) %>%
replace_na(list(results.mpaa_rating = "Not Rated")) %>%
mutate(results.byline = str_to_upper(results.byline)) %>%
mutate(results.byline = str_replace_all(results.byline, "[\\.,]", "")) %>%
mutate(results.byline = str_replace_all(results.byline, " ", "")) %>%
mutate(results.byline = str_replace_all(results.byline, "BY ", "")) %>%
mutate(results.byline = str_replace_all(results.byline, "&NBSP;", "")) %>%
mutate(results.byline = str_trim(results.byline, side=c("both"))) %>%
mutate(results.byline = str_replace_all(results.byline, "[;]", "")) %>%
mutate(results.byline = str_squish(results.byline))

# How many reviews were authored by each critic - show the top 10 most prolific
critics_counts <- data_df2 %>%
group_by(results.byline) %>%
summarize(review_count = length(results.byline)) %>%
arrange(desc(review_count)) %>%
top_n(10)

## Selecting by review_count
| results.byline | review_count |
|---|---|
| STEPHEN HOLDEN | 1496 |
| JANET MASLIN | 1211 |
| A O SCOTT | 1068 |
| MANOHLA DARGIS | 851 |
| JEANNETTE CATSOULIS | 774 |
| GLENN KENNY | 374 |
| BEN KENIGSBERG | 353 |
| NEIL GENZLINGER | 339 |
| VINCENT CANBY | 304 |
| ELVIS MITCHELL | 284 |
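As an aside, the same top-10 summary can be written with dplyr's count(), which the attached dplyr 0.8.3 supports. This is just a sketch, not the code that produced the table above:

# Equivalent top-10 count using count() (sketch)
critics_counts_alt <- data_df2 %>%
  count(results.byline, sort = TRUE) %>%
  top_n(10, n)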
# How many reviews did critics post for each mpaa rating?
rating_counts <- data_df2 %>%
group_by(results.byline, results.mpaa_rating) %>%
summarize(rating_count = length(results.mpaa_rating)) %>%
arrange(results.byline) %>%
spread(results.mpaa_rating, rating_count) %>%
select(results.byline, G, PG, `PG-13`, R, `NC-17`, X, `Not Rated`) %>%
replace_na(list(G = 0,PG=0, `PG-13`=0, R=0, `NC-17`=0, X=0, `Not Rated`=0))
formattable(rating_counts,
align =c("l", "c", "c", "c", "c", "c", "c", "c"),
list(`results.byline` = formatter(
"span", style = ~ style(color = "grey",font.weight = "bold")))
)

| results.byline | G | PG | PG-13 | R | NC-17 | X | Not Rated |
|---|---|---|---|---|---|---|---|
| A O SCOTT | 18 | 104 | 369 | 438 | 1 | 0 | 138 |
| AISHA HARRIS | 0 | 1 | 3 | 4 | 0 | 0 | 5 |
| ANDREA HIGBIE | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| ANDREW GELLER | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| ANDY WEBSTER | 3 | 22 | 41 | 45 | 0 | 0 | 123 |
| ANITA GATES | 9 | 10 | 10 | 22 | 0 | 0 | 30 |
| ANNA KISSELGOFF | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| AO SCOTT | 0 | 13 | 68 | 115 | 0 | 0 | 57 |
| AS TOLD TO MANOHLA DARGIS LORD DARGIS | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| BEN KENIGSBERG | 0 | 24 | 37 | 73 | 1 | 0 | 218 |
| BERNADINE MORRIS | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| BERNARD WEINRAUB | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| BILGE EBIRI | 0 | 6 | 3 | 4 | 0 | 0 | 13 |
| BRIAN SEIBERT | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
| CANDICE FREDERICK | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
| CARINA CHOCANO | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| CARYN JAMES | 8 | 40 | 46 | 100 | 2 | 0 | 34 |
| CHRIS VOGNAR | 0 | 0 | 0 | 1 | 0 | 0 | 2 |
| DANA STEVENS | 0 | 2 | 0 | 0 | 0 | 0 | 0 |
| DANIEL M GOLD | 0 | 2 | 2 | 8 | 0 | 0 | 57 |
| DAVE KEHR | 5 | 20 | 17 | 27 | 0 | 0 | 22 |
| DAVID DEWITT | 1 | 1 | 1 | 3 | 0 | 0 | 19 |
| EDWARD ROTHSTEIN | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| ELISABETH VINCENTELLI | 0 | 0 | 1 | 1 | 0 | 0 | 4 |
| ELVIS MITCHELL | 7 | 23 | 106 | 125 | 1 | 0 | 22 |
| FLETCHER ROBERTS | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| FRANCINE PROSE | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| FRANCIS X CLINES | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| GIA KOURLAS | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| GLENN KENNY | 3 | 23 | 54 | 95 | 0 | 0 | 199 |
| HELEN T VERONGOS | 0 | 1 | 1 | 5 | 0 | 0 | 27 |
| JAMIE DIAMOND | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| JANET MASLIN | 23 | 157 | 269 | 627 | 6 | 1 | 128 |
| JASON BAILEY | 0 | 0 | 0 | 0 | 0 | 0 | 8 |
| JASON ZINOMAN | 0 | 1 | 2 | 1 | 0 | 0 | 10 |
| JEANETTE CATSOULIS | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| JEANNETTE CATSOULIS | 8 | 41 | 120 | 199 | 1 | 0 | 405 |
| JEANNETTE CATSOULIS NATHAN LEE | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| JEANNETTE CATSOULIS NATHAN LEE NEIL GENZLINGER LAURA KERN | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| JENNIFER DUNNING | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| JENNIFER SZALAI | 0 | 0 | 0 | 2 | 0 | 0 | 2 |
| JOHN ANDERSON | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| JOHN WILLIAMS | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| JON CARAMANICA | 0 | 1 | 2 | 0 | 0 | 0 | 5 |
| JON PARELES | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
| JULIE LEW | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| KAREN HAN | 0 | 0 | 0 | 1 | 0 | 0 | 2 |
| KEN JAWOROWSKI | 1 | 7 | 10 | 11 | 0 | 0 | 79 |
| KENNETH C DAVIS | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
| KYLE TURNER | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| LAURA KERN | 0 | 2 | 3 | 7 | 0 | 0 | 10 |
| LAWRENCE VAN GELDER | 11 | 33 | 35 | 73 | 0 | 0 | 36 |
| LAWRNENCE VAN GELDER | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| LINDA LEE | 0 | 1 | 0 | 2 | 0 | 0 | 0 |
| MANOHLA DARGIS | 12 | 65 | 271 | 374 | 1 | 0 | 128 |
| MARTIN TSAI | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| MATT ZOLLER SEITZ | 1 | 7 | 13 | 18 | 0 | 0 | 22 |
| MICHAEL ERIC DYSON | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| MICHAEL KIMMELMAN | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| MIKE HALE | 2 | 12 | 13 | 17 | 0 | 0 | 48 |
| MIRIAM BALE | 0 | 4 | 2 | 5 | 0 | 0 | 15 |
| MONICA CASTILLO | 0 | 1 | 1 | 3 | 0 | 0 | 9 |
| NATHAN LEE | 2 | 5 | 18 | 12 | 0 | 0 | 39 |
| NEIL GENZLINGER | 13 | 40 | 56 | 71 | 0 | 0 | 159 |
| NICOLAS RAPOLD | 1 | 12 | 21 | 24 | 0 | 0 | 96 |
| NICOLE HERRINGTON | 0 | 0 | 1 | 5 | 0 | 0 | 14 |
| PATRICIA S MCCORMICK | 3 | 5 | 0 | 2 | 0 | 0 | 0 |
| PAUL BRUNICK | 0 | 0 | 1 | 1 | 0 | 0 | 7 |
| PAUL GOLDBERGER | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| PETER M NICHOLS | 0 | 0 | 1 | 1 | 0 | 0 | 1 |
| PHILLIP LOPATE | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| RACHEL SALTZ | 1 | 2 | 12 | 4 | 0 | 0 | 72 |
| ROBERTA SMITH | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| SAMUEL G FREEDMAN | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| SCOTT TOBIAS | 0 | 0 | 1 | 0 | 0 | 0 | 4 |
| SEAN T COLLINS | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| STEPHEN HOLDEN | 32 | 151 | 310 | 596 | 5 | 0 | 402 |
| SUZANNE O’CONNOR | 0 | 0 | 2 | 0 | 0 | 0 | 1 |
| TEO BUGBEE | 1 | 9 | 3 | 15 | 0 | 0 | 45 |
| THE NEW YORK TIMES | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| VICTOR NAVASKY | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| VINCENT CANBY | 3 | 41 | 76 | 148 | 2 | 0 | 34 |
| WALTER GOODMAN | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| WESLEY MORRIS | 0 | 0 | 4 | 4 | 0 | 0 | 8 |
| WILLIAM GRIMES | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
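As a final aside, the rating_counts pipeline above relies on spread(), which tidyr has since superseded. Assuming tidyr >= 1.0.0 (the session above shows 0.8.3), the reshape step could be written with pivot_wider() instead; a sketch, not what produced the table above:

# Sketch of the same reshape with pivot_wider() (requires tidyr >= 1.0.0)
rating_counts_alt <- data_df2 %>%
  count(results.byline, results.mpaa_rating) %>%
  pivot_wider(names_from = results.mpaa_rating,
              values_from = n,
              values_fill = list(n = 0)) %>%
  select(results.byline, G, PG, `PG-13`, R, `NC-17`, X, `Not Rated`)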