Libraries
# This package is required for accessing APIs (HTTP or HTTPS URLs on the web)
library(httr)
# This package exposes some additional functions to convert json/text to data frame
library(jsonlite)
# This library is used to manipulate data
library(tidyverse)
## ── Attaching packages ────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ───────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x purrr::flatten() masks jsonlite::flatten()
## x dplyr::lag() masks stats::lag()
# This puts the key into your computer’s key and credential manager for storage
library(keyring)
# Renviron file using the {usethis} package function
library(usethis)
# Add datetime formats
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# friendly for color blind people
library(ggthemes)
# extract elements from HTML files
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
##
## pluck
## The following object is masked from 'package:readr':
##
## guess_encoding
# edit key access
# usethis::edit_r_environ() # I named this key "NASA_KEY_SECURE"
# Request the InSight weather feed from the NASA API.
# The key stored in the keyring under "NASA_KEY_SECURE" is sent as the
# `api_key` query parameter (the parameter name the endpoint URL uses), so the
# request is authenticated with the personal key rather than the shared
# DEMO_KEY.
jsonMarsWeather <- GET(
  "https://api.nasa.gov/insight_weather/",
  query = list(
    api_key = key_get("NASA_KEY_SECURE"),
    feedtype = "json",
    ver = "1.0"
  )
)
# Media type of the response returned by the GET() call
jsonMarsWeather %>% http_type()
## [1] "application/json"
# TRUE when the response carries an HTTP error status
jsonMarsWeather %>% http_error()
## [1] FALSE
# Numeric HTTP status code of the request
jsonMarsWeather %>% status_code()
## [1] 200
# glimpse(jsonMarsWeather)
# Extract the response body as a single raw text string. The encoding is
# stated explicitly so that content() does not have to guess it.
jsonMarsWeatherText <- content(jsonMarsWeather, as = "text", encoding = "UTF-8")
# print(jsonMarsWeatherText)
# Convert the JSON response text into an R list using the jsonlite package
MarsWeatherList <- fromJSON(jsonMarsWeatherText)
# check the structure
#glimpse(MarsWeatherList)
# Drop the feed's bookkeeping entries by name, not by position, so that only
# the per-sol records remain however many sols the feed returns.
# NOTE(review): "sol_keys" and "validity_checks" are the element names the
# InSight feed documents for its two non-sol entries - confirm against
# names(MarsWeatherList).
MarsWeatherList02 <- MarsWeatherList[
  setdiff(names(MarsWeatherList), c("sol_keys", "validity_checks"))
]
# One row per remaining list element, each held in a list-column
MarsWeather <- tibble(MarsWeather = MarsWeatherList02)
# Spread every record into columns, then lift av / mn / mx out of the AT column
MarsWeather02 <- MarsWeather %>%
  unnest_wider(MarsWeather) %>%
  hoist(.col = AT, Average = "av", Minimum = "mn", Maximum = "mx")
# Keep the date part of First_UTC and reshape to one row per date and statistic
MarsWeather03 <- MarsWeather02 %>%
  separate(First_UTC, c("MarsDate", "MarsTime"), sep = "T") %>%
  select(Average:Maximum, MarsDate, Season) %>%
  mutate(MarsDate = ymd(MarsDate)) %>% # lubridate
  pivot_longer(c(Average:Maximum), names_to = "Temp", values_to = "values")
# Line chart of the average, minimum and maximum temperature per date.
# The subtitle reports the most recent date and season together with that
# date's high and low. Both are looked up through the Temp column, so they do
# not depend on the table having a fixed number of rows.
ggplot(data = MarsWeather03, aes(x = MarsDate, y = values, color = Temp)) +
  geom_line() +
  geom_point() +
  theme_bw() +
  scale_color_colorblind() +
  labs(
    title = "Latest Temperature at Elysium Planitia (on Mars) ",
    x = "Date on Mars", y = "Degree (°C)",
    subtitle = paste(
      tail(MarsWeather03$MarsDate, 1), " ",
      "Season:", tail(MarsWeather03$Season, 1), " ",
      # last Maximum row = high of the most recent date
      "High:",
      round(tail(MarsWeather03$values[MarsWeather03$Temp == "Maximum"], 1)),
      "°C", " ",
      # last Minimum row = low of the most recent date
      "Low:",
      round(tail(MarsWeather03$values[MarsWeather03$Temp == "Minimum"], 1)),
      "°C"
    )
  )
IMDB has a list of the Oscar Best Picture Winners.
Scrape the following elements:
- Number
- Title
- Year
- MPAA Rating
- Length in minutes
- Genre
- Star Rating
- Metascore Rating
- Gross Receipts
Convert the data into a tibble, tidy it, and clean it to answer the following questions:
# Download and parse the IMDB "Oscar Best Picture winners" listing page
html_obj <- read_html("https://www.imdb.com/search/title/?count=100&groups=oscar_best_picture_winners&sort=year%2Cdesc&ref_=nv_ch_osc")
# Select every node of interest with one CSS selector
# (selector found with the SelectorGadget browser extension)
OscarPicsHtml <- html_obj %>%
  html_nodes(css = ".ghost~ .text-muted+ span , .ratings-metascore , .ratings-imdb-rating strong , .genre , .certificate , .runtime , .unbold , .lister-item-header a")
# Keep only the text of each selected node
OscarPicsText <- OscarPicsHtml %>% html_text()
OscarPicsText %>% head()
## [1] "1."
## [2] "Parasite"
## [3] "(2019)"
## [4] "R"
## [5] "132 min"
## [6] "\nComedy, Drama, Thriller "
# Flag the list-number entries ("1.", "2.", ...) and use their running count
# as a per-film id; anything that appears before the first number is dropped.
# To check that the number of films matches the website:
# sum(OscarPics$isPicsNumber)
OscarPics <- tibble(text = OscarPicsText) %>%
  mutate(
    isPicsNumber = str_detect(text, "^\\d+\\.$"),
    movieNum = cumsum(isPicsNumber)
  ) %>%
  filter(movieNum > 0)
# Scrape the list numbers and titles on their own, then pair them up into a
# two-column lookup table (Rank, pics).
NumsTitles <- html_nodes(html_obj,
  css = ".lister-item-header a , .text-primary"
)
NumsTitlesText <- html_text(NumsTitles)
# check the downloaded data
# head(NumsTitlesText)
picsNums <- tibble(text = NumsTitlesText) %>%
  mutate(
    rownum = row_number(),
    # odd rows go to the "FALSE" column (Rank), even rows to "TRUE" (pics)
    iseven = rownum %% 2 == 0,
    # consecutive pairs of rows share an id: 1, 1, 2, 2, ...
    # Derived from the row number so it works for any number of films.
    movie = (rownum + 1) %/% 2
  ) %>%
  select(-rownum) %>%
  pivot_wider(names_from = iseven, values_from = text) %>%
  select(-movie, "Rank" = "FALSE", pics = "TRUE") %>%
  mutate(Rank = parse_number(Rank))
# Add one logical flag per field, marking which scraped strings belong to it
OscarPics <- OscarPics %>%
  mutate(
    # Title: the text is one of the titles collected in picsNums
    isTitle = text %in% picsNums$pics,
    # Year: digits wrapped in parentheses
    isYear = str_detect(text, "\\(\\d+\\)"),
    # Length in minutes: 1-3 digits, whitespace, then 1-3 non-digits
    isMins = str_detect(text, "^\\d{1,3}+\\s+\\D{1,3}$"),
    # Genre: leading newline(s) followed by a non-digit
    isGenre = str_detect(text, "^\\n+\\D"),
    # Star rating: digits, dot(s), one digit
    isStar = str_detect(text, "^\\d+\\.+\\d$"),
    # Metascore: leading newline(s) followed by a digit
    isMeta = str_detect(text, "^\\n+\\d"),
    # Gross receipts: starts with a dollar sign
    isGross = str_detect(text, "^\\$")
  )
# MPAA rating is left to the final step because it has no pattern of its own
# OscarPics %>%
# group_by(isPicsNumber, isTitle, isYear, isMins, isGenre, isStar, isMeta, isGross) %>%
# count()
# Process of elimination: whatever matched none of the other flags is MPAA
OscarPics <- OscarPics %>%
  mutate(
    isMPAA = !(isPicsNumber | isTitle | isYear | isMins |
      isGenre | isStar | isMeta | isGross)
  )
# Turn the flags into a single field label, then spread to one row per film
OscarPics02 <- OscarPics %>%
  mutate(
    key = case_when(
      isPicsNumber ~ "number",
      isTitle ~ "title",
      isYear ~ "year",
      isMPAA ~ "MPAA",
      isMins ~ "minutes",
      isGenre ~ "genre",
      isStar ~ "starRating",
      isMeta ~ "Metascore",
      isGross ~ "gross"
    )
  ) %>%
  select(key, text, movieNum) %>%
  pivot_wider(names_from = key, values_from = text)
# head(OscarPics02)
# Give every column a suitable type, then drop the helper id movieNum
OscarPics03 <- OscarPics02 %>%
  mutate(
    number = parse_number(number),
    year = parse_number(year),
    minutes = parse_number(minutes),
    # strip newlines, then collapse the remaining whitespace
    genre = str_squish(str_replace_all(genre, "\\n", "")),
    starRating = parse_number(starRating),
    # strip newlines, keep the leading run of up to three digits, parse it
    Metascore = parse_number(
      str_extract(str_replace_all(Metascore, "\\n", ""), "\\d{0,3}")
    ),
    gross = parse_number(gross) # Million
  ) %>%
  select(-movieNum)
# Count the non-missing values in every column.
# across() replaces the deprecated summarise_all(funs(...)) form.
OscarPics03 %>%
  summarise(across(everything(), ~ sum(!is.na(.x))))
## # A tibble: 1 x 9
## number title year MPAA minutes genre starRating Metascore gross
## <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 93 93 93 93 93 93 93 76 83
# Keep only the films whose gross receipts are known
OscarPicsNoGrossNAs <- OscarPics03 %>%
  filter(!is.na(gross))
# Scatter plot of gross receipts against running time, coloured by MPAA rating
OscarPicsNoGrossNAs %>%
  ggplot(mapping = aes(x = minutes, y = gross, color = MPAA)) +
  geom_point() +
  theme_bw() +
  labs(
    x = "Film in Minutes",
    y = "Gross Receipts (million)"
  )
# Keep only the films that have a Metascore
OscarPicsNoMetaNAs <- OscarPics03 %>%
  filter(!is.na(Metascore))
# Star rating (y-axis, the value being predicted) against Metascore, with a
# fitted straight line and no confidence band
OscarPicsNoMetaNAs %>%
  ggplot(aes(x = Metascore, y = starRating)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'
# Simple linear regression, written as lm(y ~ x)
# x = predictor variable = independent variable
# y = response variable = dependent variable
scoreSLR <- lm(starRating ~ Metascore, data = OscarPicsNoMetaNAs)
summary(scoreSLR)
##
## Call:
## lm(formula = starRating ~ Metascore, data = OscarPicsNoMetaNAs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.75784 -0.28695 0.03461 0.30735 0.99310
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.376689 0.558256 11.423 < 2e-16 ***
## Metascore 0.018302 0.006576 2.783 0.00683 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.53 on 74 degrees of freedom
## Multiple R-squared: 0.09474, Adjusted R-squared: 0.08251
## F-statistic: 7.745 on 1 and 74 DF, p-value: 0.006832
# Distribution of gross receipts within each MPAA rating
OscarPicsNoGrossNAs %>%
  ggplot(aes(x = MPAA, y = gross)) +
  geom_boxplot() +
  theme_bw()
# The ten films with the largest gross receipts
OscarPicsNoGrossNAs %>%
  arrange(desc(gross)) %>%
  select(title, year, MPAA, gross) %>%
  slice(seq_len(10))
## # A tibble: 10 x 4
## title year MPAA gross
## <chr> <dbl> <chr> <dbl>
## 1 Titanic 1997 PG-13 659.
## 2 The Lord of the Rings: The Return of the King 2003 PG-13 378.
## 3 Forrest Gump 1994 PG-13 330.
## 4 Gone with the Wind 1939 Passed 199.
## 5 Gladiator 2000 R 188.
## 6 Dances with Wolves 1990 PG-13 184.
## 7 Rain Man 1988 R 179.
## 8 A Beautiful Mind 2001 PG-13 171.
## 9 Chicago 2002 PG-13 171.
## 10 The Sound of Music 1965 G 163.
# One-way analysis of variance of gross receipts across MPAA ratings
# (formula form: value ~ group)
allDiff <- aov(formula = gross ~ MPAA, data = OscarPicsNoGrossNAs)
summary(allDiff)
## Df Sum Sq Mean Sq F value Pr(>F)
## MPAA 7 228706 32672 4.835 0.000156 ***
## Residuals 75 506820 6758
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1