SETUP

Source files: https://github.com/djlofland/DATA607_F2019/tree/master/Week9

knitr::opts_chunk$set(echo = TRUE)

library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(forcats)
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
## 
##     flatten
library(formattable)

# We will be working with the NYT Movies API's
NYT_MOVIE_API_BASE <- "https://api.nytimes.com/svc/movies/v2"
NYT_ACTION_PICKS <- "/reviews/picks.json?"

NYT_ACTION_CRITIC <- "/critics/A.%20O.%20Scott.json?"
NYT_ACTION_CRITICS <- "/critics/all.json?"

NYT_ACTION_SEARCH <- "/reviews/search.json?"

# Note that my NYT API Key is stored in my .Renviron file making it available as an envioronmental variable in any projects.
APIKEY <- paste("&api-key=", Sys.getenv("NYTIMES_APIKEY_D607"), "&", sep="")

# Local cache data file to use if available
local_cache_fn <- './data/reviews.csv'

Load Critics

We will be working with the NYT Movies API’s

url <- paste(NYT_MOVIE_API_BASE, NYT_ACTION_CRITICS, APIKEY, sep="")
print(url)
## [1] "https://api.nytimes.com/svc/movies/v2/critics/all.json?&api-key=CMGVPoNAFeGZSrQtDOZAEonuEsOLA5xX&"
critics <- fromJSON(url, flatten = TRUE) %>% 
  data.frame() 

Load Movie Picks

We will be working with the NYT Movies API’s

url <- paste(NYT_MOVIE_API_BASE, NYT_ACTION_PICKS, APIKEY, sep="")
print(url)
## [1] "https://api.nytimes.com/svc/movies/v2/reviews/picks.json?&api-key=CMGVPoNAFeGZSrQtDOZAEonuEsOLA5xX&"
movie_picks <- fromJSON(url, flatten = TRUE) %>% 
  data.frame() 

Load Movie Reviews

We will be working with the NYT Movies API’s. Note that the NYUT API has throttlelimits that only allow 6 queries/min and a maximum of 4000 queries per day. In this example, I setup a loop to see if I could scrape all the Movie Reviews. After several attempts, its clear that if I really want all movie reviews, I’ll have to babysit the process for the better part of a day. So, I allowed it to run 758 times at 20 reviews pare page for a total of 15160 rows (or movie reviews). This is more than sufficient for proof of concept in this project.

# Check if we have a local copy of the data available to load
isCacheFound <- FALSE

# Check whether we previously downloaded the review data from NYT API.  If found, we will used the cached copy.  If not found, then we'll kick off job to pull the data from NYT (note: this takes a while)
if(file.exists(local_cache_fn)) {
  msg <- paste('Found cached copy: ', local_cache_fn, sep='')
  isCacheFound <- TRUE
}

if(!isCacheFound) {
  # Kick of process to download data from NYT API
  
  # List that will hold each page of results returned from the API
  reviews <- list()
  
  # We will need to check the has_more column 
  has_more <- TRUE
  page <- 0
    
  while(has_more) {
    # construct the URL - note we add an offset to handle 20 results per page
    # in the respone, if has_more column is TRUE, we will need to request the next page
    url <- paste(NYT_MOVIE_API_BASE, NYT_ACTION_SEARCH, APIKEY, "offset=", page*20, sep="")
    
    # download results from API
    results <- fromJSON(url, flatten = TRUE) %>% 
      data.frame() 
    
    message("Retrieving page: ", page + 1)
    
    # save off the current page of results
    reviews[[page+1]] <- results
    
    # scrape whether we have more results we'll need to pull
    has_more <- results$has_more[1]
    page <- page + 1
    
    # Throttle we can use to only grab the first few pages of results
    #  if(page==3) {
    #    has_more <- FALSE
    #  }
    
    # NYT API has a rate limmit of 10 requests per minute (HTTP error 429 if this is exceeded)
    Sys.sleep(6.1)
  }
  
  # combine the list of dataframes into a single main dataframe
  data_df <- data.table::rbindlist(reviews) 
  
  # Cache the processed as CSV for future
  write.csv(data_df, local_cache_fn, row.names=FALSE, na="")
  
} else {
  # Now load the data from local cache CSV
  data_df <- read_csv(local_cache_fn, col_names = TRUE)
  isDataLoaded <- TRUE
  msg <- 'Cached CSV data loaded.'
}
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   has_more = col_logical(),
##   num_results = col_double(),
##   results.critics_pick = col_double(),
##   results.publication_date = col_date(format = ""),
##   results.opening_date = col_date(format = ""),
##   results.date_updated = col_datetime(format = ""),
##   results.multimedia.width = col_double(),
##   results.multimedia.height = col_double(),
##   results.multimedia = col_logical()
## )
## See spec(...) for full column specifications.
## Warning: 77 parsing failures.
##  row                  col   expected     actual                 file
##  106 results.opening_date valid date 0000-00-00 './data/reviews.csv'
##  248 results.opening_date valid date 0000-00-00 './data/reviews.csv'
## 1074 results.opening_date valid date 0000-00-00 './data/reviews.csv'
## 1135 results.opening_date valid date 1969-00-00 './data/reviews.csv'
## 1207 results.opening_date valid date 0000-00-00 './data/reviews.csv'
## .... .................... .......... .......... ....................
## See problems(...) for more details.

Cleanup Reviews

# drop rows with missing dates - since I'm not really interested in dates and am not
# doing any analysis where loss of rows might affect interpretation, it's safe to just
# drop rows with any missing dates.  Note: There are also nulls in the rating column 
# for movies that didn't receive an official mpaa rating letter.  
# For these, I'll replace nulls with 'not rated'.  Next, teh ByLines are a mess - I'll do some
# quick spot cleanup.
data_df2 <- data_df %>% 
  drop_na(results.publication_date, results.opening_date, results.date_updated) %>%
  replace_na(list(results.mpaa_rating = "Not Rated")) %>%
  mutate(results.byline = str_to_upper(results.byline)) %>%
  mutate(results.byline = str_replace_all(results.byline, "[\\.,]", "")) %>%
  mutate(results.byline = str_replace_all(results.byline, "&#160;", "")) %>%
  mutate(results.byline = str_replace_all(results.byline, "BY ", "")) %>%
  mutate(results.byline = str_replace_all(results.byline, "&NBSP;", "")) %>%
  mutate(results.byline = str_trim(results.byline, side=c("both"))) %>%
  mutate(results.byline = str_replace_all(results.byline, "[;]", "")) %>%
  mutate(results.byline = str_squish(results.byline))

Analysis

Top 10 Posting Critics

# How many reviews were authored by critic - show the top 10 most prolific
critics_counts <- data_df2 %>%
  group_by(results.byline) %>%
  summarize(review_count = length(results.byline)) %>%
  arrange(desc(review_count)) %>%
  top_n(10)
## Selecting by review_count
formattable(critics_counts,
            align=c("l", "c"))
results.byline review_count
STEPHEN HOLDEN 1496
JANET MASLIN 1211
A O SCOTT 1068
MANOHLA DARGIS 851
JEANNETTE CATSOULIS 774
GLENN KENNY 374
BEN KENIGSBERG 353
NEIL GENZLINGER 339
VINCENT CANBY 304
ELVIS MITCHELL 284

Reviews by Critical & Rating

# How many reviews did critics post for each mpaa rating?
rating_counts <- data_df2 %>%
  group_by(results.byline, results.mpaa_rating) %>%
  summarize(rating_count = length(results.mpaa_rating)) %>%
  arrange(results.byline) %>%
  spread(results.mpaa_rating, rating_count) %>%
  select(results.byline, G, PG, `PG-13`, R, `NC-17`, X, `Not Rated`) %>%
  replace_na(list(G = 0,PG=0, `PG-13`=0, R=0, `NC-17`=0, X=0, `Not Rated`=0))

formattable(rating_counts,
            align =c("l", "c", "c", "c", "c", "c", "c", "c"), 
            list(`results.byline` = formatter(
              "span", style = ~ style(color = "grey",font.weight = "bold")))
            )
results.byline G PG PG-13 R NC-17 X Not Rated
A O SCOTT 18 104 369 438 1 0 138
AISHA HARRIS 0 1 3 4 0 0 5
ANDREA HIGBIE 0 1 0 0 0 0 0
ANDREW GELLER 0 0 0 1 0 0 0
ANDY WEBSTER 3 22 41 45 0 0 123
ANITA GATES 9 10 10 22 0 0 30
ANNA KISSELGOFF 0 0 0 0 0 0 1
AO SCOTT 0 13 68 115 0 0 57
AS TOLD TO MANOHLA DARGIS LORD DARGIS 0 1 0 0 0 0 0
BEN KENIGSBERG 0 24 37 73 1 0 218
BERNADINE MORRIS 0 0 0 1 0 0 0
BERNARD WEINRAUB 0 0 0 1 0 0 0
BILGE EBIRI 0 6 3 4 0 0 13
BRIAN SEIBERT 0 0 0 0 0 0 2
CANDICE FREDERICK 0 0 0 0 0 0 3
CARINA CHOCANO 0 0 0 0 0 0 1
CARYN JAMES 8 40 46 100 2 0 34
CHRIS VOGNAR 0 0 0 1 0 0 2
DANA STEVENS 0 2 0 0 0 0 0
DANIEL M GOLD 0 2 2 8 0 0 57
DAVE KEHR 5 20 17 27 0 0 22
DAVID DEWITT 1 1 1 3 0 0 19
EDWARD ROTHSTEIN 0 0 0 0 0 0 1
ELISABETH VINCENTELLI 0 0 1 1 0 0 4
ELVIS MITCHELL 7 23 106 125 1 0 22
FLETCHER ROBERTS 0 0 0 0 0 0 1
FRANCINE PROSE 0 1 0 0 0 0 0
FRANCIS X CLINES 0 0 0 1 0 0 0
GIA KOURLAS 0 0 0 0 0 0 1
GLENN KENNY 3 23 54 95 0 0 199
HELEN T VERONGOS 0 1 1 5 0 0 27
JAMIE DIAMOND 0 0 1 0 0 0 0
JANET MASLIN 23 157 269 627 6 1 128
JASON BAILEY 0 0 0 0 0 0 8
JASON ZINOMAN 0 1 2 1 0 0 10
JEANETTE CATSOULIS 0 0 0 1 0 0 0
JEANNETTE CATSOULIS 8 41 120 199 1 0 405
JEANNETTE CATSOULIS NATHAN LEE 0 1 0 0 0 0 0
JEANNETTE CATSOULIS NATHAN LEE NEIL GENZLINGER LAURA KERN 0 0 1 0 0 0 0
JENNIFER DUNNING 0 0 0 0 0 0 1
JENNIFER SZALAI 0 0 0 2 0 0 2
JOHN ANDERSON 0 0 0 0 0 0 1
JOHN WILLIAMS 0 0 1 0 0 0 0
JON CARAMANICA 0 1 2 0 0 0 5
JON PARELES 0 0 1 1 0 0 0
JULIE LEW 0 0 1 0 0 0 0
KAREN HAN 0 0 0 1 0 0 2
KEN JAWOROWSKI 1 7 10 11 0 0 79
KENNETH C DAVIS 0 1 0 1 0 0 0
KYLE TURNER 0 0 0 0 0 0 1
LAURA KERN 0 2 3 7 0 0 10
LAWRENCE VAN GELDER 11 33 35 73 0 0 36
LAWRNENCE VAN GELDER 0 0 0 0 0 0 1
LINDA LEE 0 1 0 2 0 0 0
MANOHLA DARGIS 12 65 271 374 1 0 128
MARTIN TSAI 0 0 0 0 0 0 1
MATT ZOLLER SEITZ 1 7 13 18 0 0 22
MICHAEL ERIC DYSON 0 0 0 1 0 0 0
MICHAEL KIMMELMAN 0 0 0 0 0 0 1
MIKE HALE 2 12 13 17 0 0 48
MIRIAM BALE 0 4 2 5 0 0 15
MONICA CASTILLO 0 1 1 3 0 0 9
NATHAN LEE 2 5 18 12 0 0 39
NEIL GENZLINGER 13 40 56 71 0 0 159
NICOLAS RAPOLD 1 12 21 24 0 0 96
NICOLE HERRINGTON 0 0 1 5 0 0 14
PATRICIA S MCCORMICK 3 5 0 2 0 0 0
PAUL BRUNICK 0 0 1 1 0 0 7
PAUL GOLDBERGER 0 0 0 0 0 0 1
PETER M NICHOLS 0 0 1 1 0 0 1
PHILLIP LOPATE 0 1 0 0 0 0 0
RACHEL SALTZ 1 2 12 4 0 0 72
ROBERTA SMITH 0 0 0 1 0 0 0
SAMUEL G FREEDMAN 0 0 0 1 0 0 0
SCOTT TOBIAS 0 0 1 0 0 0 4
SEAN T COLLINS 0 0 0 0 0 0 1
STEPHEN HOLDEN 32 151 310 596 5 0 402
SUZANNE O’CONNOR 0 0 2 0 0 0 1
TEO BUGBEE 1 9 3 15 0 0 45
THE NEW YORK TIMES 0 0 0 0 0 0 1
VICTOR NAVASKY 0 0 1 0 0 0 0
VINCENT CANBY 3 41 76 148 2 0 34
WALTER GOODMAN 0 0 0 0 0 0 1
WESLEY MORRIS 0 0 4 4 0 0 8
WILLIAM GRIMES 0 0 0 1 0 0 0