Source files: https://github.com/djlofland/DATA607_F2019/tree/master/Week9
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(jsonlite)
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
library(formattable)
# We will be working with the NYT Movies API
NYT_MOVIE_API_BASE <- "https://api.nytimes.com/svc/movies/v2"
NYT_ACTION_PICKS <- "/reviews/picks.json?"
NYT_ACTION_CRITIC <- "/critics/A.%20O.%20Scott.json?"
NYT_ACTION_CRITICS <- "/critics/all.json?"
NYT_ACTION_SEARCH <- "/reviews/search.json?"
# Note that my NYT API key is stored in my .Renviron file, making it available as an environment variable in any project.
APIKEY <- paste("&api-key=", Sys.getenv("NYTIMES_APIKEY_D607"), "&", sep="")
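# (Aside, an assumption about setup rather than code from this project) One way to store the
# key is to open ~/.Renviron, e.g. with usethis::edit_r_environ(), and add a line such as:
#   NYTIMES_APIKEY_D607=<your key here>
# After restarting R, Sys.getenv("NYTIMES_APIKEY_D607") will return that value.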
# Local cache data file to use if available
local_cache_fn <- './data/reviews.csv'
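The output that follows shows the fully assembled all-critics endpoint. The chunk that produced it is not echoed here; a plausible reconstruction (my assumption, using the constants defined above) is:

# Plausible reconstruction (assumption): assemble and print the all-critics URL
paste(NYT_MOVIE_API_BASE, NYT_ACTION_CRITICS, APIKEY, sep="")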
## [1] "https://api.nytimes.com/svc/movies/v2/critics/all.json?&api-key=CMGVPoNAFeGZSrQtDOZAEonuEsOLA5xX&"
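The next output is the Critics' Picks endpoint, presumably built the same way (again a plausible reconstruction rather than echoed source):

# Plausible reconstruction (assumption): assemble and print the Critics' Picks URL
paste(NYT_MOVIE_API_BASE, NYT_ACTION_PICKS, APIKEY, sep="")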
## [1] "https://api.nytimes.com/svc/movies/v2/reviews/picks.json?&api-key=CMGVPoNAFeGZSrQtDOZAEonuEsOLA5xX&"
Note that the NYT API has throttle limits: roughly 10 queries per minute (the loop below sleeps just over 6 seconds between requests) and a maximum of 4,000 queries per day. In this example, I set up a loop to see if I could scrape all of the movie reviews. After several attempts, it's clear that if I really want all movie reviews, I'll have to babysit the process for the better part of a day. So, I let it run for 758 pages at 20 reviews per page, for a total of 15,160 rows (movie reviews). This is more than sufficient for proof of concept in this project.
# Check if we have a local copy of the data available to load
isCacheFound <- FALSE
# Check whether we previously downloaded the review data from the NYT API. If found, we will use the cached copy. If not, we'll kick off a job to pull the data from NYT (note: this takes a while)
if(file.exists(local_cache_fn)) {
msg <- paste('Found cached copy: ', local_cache_fn, sep='')
isCacheFound <- TRUE
}
if(!isCacheFound) {
# Kick off the process to download data from the NYT API
# List that will hold each page of results returned from the API
reviews <- list()
# We will need to check the has_more column
has_more <- TRUE
page <- 0
while(has_more) {
# construct the URL - note we add an offset to handle 20 results per page
# in the response, if the has_more column is TRUE, we will need to request the next page
url <- paste(NYT_MOVIE_API_BASE, NYT_ACTION_SEARCH, APIKEY, "offset=", page*20, sep="")
# download results from API
results <- fromJSON(url, flatten = TRUE) %>%
data.frame()
message("Retrieving page: ", page + 1)
# save off the current page of results
reviews[[page+1]] <- results
# check whether there are more results we'll need to pull
has_more <- results$has_more[1]
page <- page + 1
# Optional limiter we can uncomment to only grab the first few pages of results
# if(page==3) {
# has_more <- FALSE
# }
# The NYT API has a rate limit of 10 requests per minute (HTTP error 429 if this is exceeded)
Sys.sleep(6.1)
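# (Aside, not part of the original run) If a 429 response ever slips through despite the pause,
# one option would be to wrap the fromJSON() call above in tryCatch(), sleep for a minute on
# error, and retry the same page before advancing.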
}
# combine the list of dataframes into a single main dataframe
data_df <- data.table::rbindlist(reviews)
# Cache the processed data as a CSV for future runs
write.csv(data_df, local_cache_fn, row.names=FALSE, na="")
} else {
# Now load the data from local cache CSV
data_df <- read_csv(local_cache_fn, col_names = TRUE)
isDataLoaded <- TRUE
msg <- 'Cached CSV data loaded.'
}

## Parsed with column specification:
## cols(
## .default = col_character(),
## has_more = col_logical(),
## num_results = col_double(),
## results.critics_pick = col_double(),
## results.publication_date = col_date(format = ""),
## results.opening_date = col_date(format = ""),
## results.date_updated = col_datetime(format = ""),
## results.multimedia.width = col_double(),
## results.multimedia.height = col_double(),
## results.multimedia = col_logical()
## )
## See spec(...) for full column specifications.
## Warning: 77 parsing failures.
## row col expected actual file
## 106 results.opening_date valid date 0000-00-00 './data/reviews.csv'
## 248 results.opening_date valid date 0000-00-00 './data/reviews.csv'
## 1074 results.opening_date valid date 0000-00-00 './data/reviews.csv'
## 1135 results.opening_date valid date 1969-00-00 './data/reviews.csv'
## 1207 results.opening_date valid date 0000-00-00 './data/reviews.csv'
## .... .................... .......... .......... ....................
## See problems(...) for more details.
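The warning above suggests inspecting the failures with problems(); a quick sketch (assuming data_df was loaded from the cached CSV via read_csv, as in the branch above):

# Inspect which rows/columns failed to parse (sketch)
parse_issues <- problems(data_df)
head(parse_issues)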
# Drop rows with missing dates - since I'm not really interested in dates and am not
# doing any analysis where the loss of rows might affect interpretation, it's safe to just
# drop rows with any missing dates. Note: there are also nulls in the rating column
# for movies that didn't receive an official MPAA rating letter.
# For these, I'll replace nulls with 'Not Rated'. Next, the bylines are a mess - I'll do some
# quick spot cleanup.
data_df2 <- data_df %>%
drop_na(results.publication_date, results.opening_date, results.date_updated) %>%
replace_na(list(results.mpaa_rating = "Not Rated")) %>%
mutate(results.byline = str_to_upper(results.byline)) %>%
mutate(results.byline = str_replace_all(results.byline, "[\\.,]", "")) %>%
mutate(results.byline = str_replace_all(results.byline, " ", "")) %>%
mutate(results.byline = str_replace_all(results.byline, "BY ", "")) %>%
mutate(results.byline = str_replace_all(results.byline, "&NBSP;", "")) %>%
mutate(results.byline = str_trim(results.byline, side=c("both"))) %>%
mutate(results.byline = str_replace_all(results.byline, "[;]", "")) %>%
mutate(results.byline = str_squish(results.byline))

# How many reviews were authored by each critic - show the top 10 most prolific
critics_counts <- data_df2 %>%
group_by(results.byline) %>%
summarize(review_count = length(results.byline)) %>%
arrange(desc(review_count)) %>%
top_n(10)

## Selecting by review_count
| results.byline | review_count |
|---|---|
| STEPHEN HOLDEN | 1496 |
| JANET MASLIN | 1211 |
| A O SCOTT | 1068 |
| MANOHLA DARGIS | 851 |
| JEANNETTE CATSOULIS | 774 |
| GLENN KENNY | 374 |
| BEN KENIGSBERG | 353 |
| NEIL GENZLINGER | 339 |
| VINCENT CANBY | 304 |
| ELVIS MITCHELL | 284 |
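As an aside, the same top-10 summary can be written with dplyr's count(), which the attached dplyr 0.8.3 supports. This is just a sketch, not the code that produced the table above:

# Equivalent top-10 count using count() (sketch)
critics_counts_alt <- data_df2 %>%
  count(results.byline, sort = TRUE) %>%
  top_n(10, n)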
# How many reviews did critics post for each mpaa rating?
rating_counts <- data_df2 %>%
group_by(results.byline, results.mpaa_rating) %>%
summarize(rating_count = length(results.mpaa_rating)) %>%
arrange(results.byline) %>%
spread(results.mpaa_rating, rating_count) %>%
select(results.byline, G, PG, `PG-13`, R, `NC-17`, X, `Not Rated`) %>%
replace_na(list(G = 0,PG=0, `PG-13`=0, R=0, `NC-17`=0, X=0, `Not Rated`=0))
formattable(rating_counts,
align =c("l", "c", "c", "c", "c", "c", "c", "c"),
list(`results.byline` = formatter(
"span", style = ~ style(color = "grey",font.weight = "bold")))
)

| results.byline | G | PG | PG-13 | R | NC-17 | X | Not Rated |
|---|---|---|---|---|---|---|---|
| A O SCOTT | 18 | 104 | 369 | 438 | 1 | 0 | 138 |
| AISHA HARRIS | 0 | 1 | 3 | 4 | 0 | 0 | 5 |
| ANDREA HIGBIE | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| ANDREW GELLER | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| ANDY WEBSTER | 3 | 22 | 41 | 45 | 0 | 0 | 123 |
| ANITA GATES | 9 | 10 | 10 | 22 | 0 | 0 | 30 |
| ANNA KISSELGOFF | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| AO SCOTT | 0 | 13 | 68 | 115 | 0 | 0 | 57 |
| AS TOLD TO MANOHLA DARGIS LORD DARGIS | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| BEN KENIGSBERG | 0 | 24 | 37 | 73 | 1 | 0 | 218 |
| BERNADINE MORRIS | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| BERNARD WEINRAUB | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| BILGE EBIRI | 0 | 6 | 3 | 4 | 0 | 0 | 13 |
| BRIAN SEIBERT | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
| CANDICE FREDERICK | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
| CARINA CHOCANO | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| CARYN JAMES | 8 | 40 | 46 | 100 | 2 | 0 | 34 |
| CHRIS VOGNAR | 0 | 0 | 0 | 1 | 0 | 0 | 2 |
| DANA STEVENS | 0 | 2 | 0 | 0 | 0 | 0 | 0 |
| DANIEL M GOLD | 0 | 2 | 2 | 8 | 0 | 0 | 57 |
| DAVE KEHR | 5 | 20 | 17 | 27 | 0 | 0 | 22 |
| DAVID DEWITT | 1 | 1 | 1 | 3 | 0 | 0 | 19 |
| EDWARD ROTHSTEIN | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| ELISABETH VINCENTELLI | 0 | 0 | 1 | 1 | 0 | 0 | 4 |
| ELVIS MITCHELL | 7 | 23 | 106 | 125 | 1 | 0 | 22 |
| FLETCHER ROBERTS | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| FRANCINE PROSE | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| FRANCIS X CLINES | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| GIA KOURLAS | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| GLENN KENNY | 3 | 23 | 54 | 95 | 0 | 0 | 199 |
| HELEN T VERONGOS | 0 | 1 | 1 | 5 | 0 | 0 | 27 |
| JAMIE DIAMOND | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| JANET MASLIN | 23 | 157 | 269 | 627 | 6 | 1 | 128 |
| JASON BAILEY | 0 | 0 | 0 | 0 | 0 | 0 | 8 |
| JASON ZINOMAN | 0 | 1 | 2 | 1 | 0 | 0 | 10 |
| JEANETTE CATSOULIS | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| JEANNETTE CATSOULIS | 8 | 41 | 120 | 199 | 1 | 0 | 405 |
| JEANNETTE CATSOULIS NATHAN LEE | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| JEANNETTE CATSOULIS NATHAN LEE NEIL GENZLINGER LAURA KERN | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| JENNIFER DUNNING | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| JENNIFER SZALAI | 0 | 0 | 0 | 2 | 0 | 0 | 2 |
| JOHN ANDERSON | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| JOHN WILLIAMS | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| JON CARAMANICA | 0 | 1 | 2 | 0 | 0 | 0 | 5 |
| JON PARELES | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
| JULIE LEW | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| KAREN HAN | 0 | 0 | 0 | 1 | 0 | 0 | 2 |
| KEN JAWOROWSKI | 1 | 7 | 10 | 11 | 0 | 0 | 79 |
| KENNETH C DAVIS | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
| KYLE TURNER | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| LAURA KERN | 0 | 2 | 3 | 7 | 0 | 0 | 10 |
| LAWRENCE VAN GELDER | 11 | 33 | 35 | 73 | 0 | 0 | 36 |
| LAWRNENCE VAN GELDER | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| LINDA LEE | 0 | 1 | 0 | 2 | 0 | 0 | 0 |
| MANOHLA DARGIS | 12 | 65 | 271 | 374 | 1 | 0 | 128 |
| MARTIN TSAI | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| MATT ZOLLER SEITZ | 1 | 7 | 13 | 18 | 0 | 0 | 22 |
| MICHAEL ERIC DYSON | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| MICHAEL KIMMELMAN | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| MIKE HALE | 2 | 12 | 13 | 17 | 0 | 0 | 48 |
| MIRIAM BALE | 0 | 4 | 2 | 5 | 0 | 0 | 15 |
| MONICA CASTILLO | 0 | 1 | 1 | 3 | 0 | 0 | 9 |
| NATHAN LEE | 2 | 5 | 18 | 12 | 0 | 0 | 39 |
| NEIL GENZLINGER | 13 | 40 | 56 | 71 | 0 | 0 | 159 |
| NICOLAS RAPOLD | 1 | 12 | 21 | 24 | 0 | 0 | 96 |
| NICOLE HERRINGTON | 0 | 0 | 1 | 5 | 0 | 0 | 14 |
| PATRICIA S MCCORMICK | 3 | 5 | 0 | 2 | 0 | 0 | 0 |
| PAUL BRUNICK | 0 | 0 | 1 | 1 | 0 | 0 | 7 |
| PAUL GOLDBERGER | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| PETER M NICHOLS | 0 | 0 | 1 | 1 | 0 | 0 | 1 |
| PHILLIP LOPATE | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| RACHEL SALTZ | 1 | 2 | 12 | 4 | 0 | 0 | 72 |
| ROBERTA SMITH | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| SAMUEL G FREEDMAN | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| SCOTT TOBIAS | 0 | 0 | 1 | 0 | 0 | 0 | 4 |
| SEAN T COLLINS | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| STEPHEN HOLDEN | 32 | 151 | 310 | 596 | 5 | 0 | 402 |
| SUZANNE O’CONNOR | 0 | 0 | 2 | 0 | 0 | 0 | 1 |
| TEO BUGBEE | 1 | 9 | 3 | 15 | 0 | 0 | 45 |
| THE NEW YORK TIMES | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| VICTOR NAVASKY | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| VINCENT CANBY | 3 | 41 | 76 | 148 | 2 | 0 | 34 |
| WALTER GOODMAN | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| WESLEY MORRIS | 0 | 0 | 4 | 4 | 0 | 0 | 8 |
| WILLIAM GRIMES | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
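As a final aside, the rating_counts pipeline above relies on spread(), which tidyr has since superseded. Assuming tidyr >= 1.0.0 (the session above shows 0.8.3), the reshape step could be written with pivot_wider() instead; a sketch, not what produced the table above:

# Sketch of the same reshape with pivot_wider() (requires tidyr >= 1.0.0)
rating_counts_alt <- data_df2 %>%
  count(results.byline, results.mpaa_rating) %>%
  pivot_wider(names_from = results.mpaa_rating,
              values_from = n,
              values_fill = list(n = 0)) %>%
  select(results.byline, G, PG, `PG-13`, R, `NC-17`, X, `Not Rated`)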