Problem 1: Trump Approval
Trump Approval1.Download new Trump approval data using the url for the 538 website: https://projects.fivethirtyeight.com/trump-approval-data/approval_topline.csv. Be sure to download it from this link directly into R - do not download the .csv then load from your files. Use read_csv() and just paste in the url surrounded by “quotes”!
approval_topline <- read_csv(file = "https://projects.fivethirtyeight.com/trump-approval-data/approval_topline.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## president = col_character(),
## subgroup = col_character(),
## modeldate = col_character(),
## approve_estimate = col_double(),
## approve_hi = col_double(),
## approve_lo = col_double(),
## disapprove_estimate = col_double(),
## disapprove_hi = col_double(),
## disapprove_lo = col_double(),
## timestamp = col_character()
## )
approval_topline <- approval_topline %>%
mutate(modeldate = mdy(modeldate))
Plot the approval (approve_estimate) of the President using data that aggregates all polls (check the subgroup variable). It might also be helpful to add the disapproval estimate to the same plot for extra context.
approval_topline %>%
filter(subgroup == "All polls") %>%
ggplot() +
geom_line(aes(x = modeldate, y = approve_estimate), color = 'green') +
geom_line(aes(x = modeldate, y = disapprove_estimate), color = 'red') +
ggtitle('Average Trump Approval Rating (%)') +
theme(plot.title = element_text(size = 13, face = 'bold', hjust = 0.5),
axis.text.y = element_text(size = 12, face = 'bold'),
axis.text.x = element_text(size = 12, face = 'bold'),
axis.title.y = element_blank(),
axis.title.x = element_blank(),
panel.grid.minor = element_blank()) +
scale_x_date(date_breaks = "4 month", date_labels = "%b") +
annotate("text", x = as.Date('2021-01-01'),
label = 'Disapproval', y = 50, size = 3, fontface = 'bold') +
annotate("text", x = as.Date('2021-01-01'),
label = 'Approval', y = 37, size = 3, fontface = 'bold')
Next, pick a period where approval ratings are out of the ordinary, or display some sort of unusual trend. Explain this trend using the events of that time period. For example, approval rating in January of 2019 was lower than normal due to the government shutdown. Answer: The approval ratings fell drastically in December, as there was an investigation in the the accusation of Russian interference in the 2016 election and reports of sexual misconduct. There was also major disapproval for the tax reform that Trump signed this month that lowered the corporate tax rate and dropped the individual tax rate for the richest people in America and put an expiration date on how long it would help the lower and middle class families and individuals that it did moderately help.
approval_topline %>%
filter(subgroup == "All polls", modeldate >= as.Date("2017-12-01"), modeldate <= as.Date("2018-01-01")) %>%
ggplot() +
geom_line(aes(x = modeldate, y = approve_estimate), color = 'green') +
geom_line(aes(x = modeldate, y = disapprove_estimate), color = 'red') +
ggtitle('Average Trump Approval Rating December 2017 (%)') +
theme(plot.title = element_text(size = 13, face = 'bold', hjust = 0.5),
axis.text.y = element_text(size = 12, face = 'bold'),
axis.text.x = element_text(size = 12, face = 'bold'),
axis.title.y = element_blank(),
axis.title.x = element_blank(),
panel.grid.minor = element_blank()) +
annotate("text", x = as.Date('2017-12-20'),
label = 'Disapproval', y = 55, size = 3, fontface = 'bold') +
annotate("text", x = as.Date('2017-12-20'),
label = 'Approval', y = 35, size = 3, fontface = 'bold')
Problem 2: Flight Delays In the flights data, are delays more likely on certain days of the week? This requires knowing how many flights leave each day. Answer: Yes. Departure and Arrival delays are more likely to happen on Thursdays.
make_datetime_100 <- function(year, month, day, time) {
make_datetime(year, month, day, time %/% 100, time %% 100)
}
flights_dt <- flights %>%
filter(!is.na(dep_time), !is.na(arr_time)) %>%
mutate(
dep_time = make_datetime_100(year, month, day, dep_time),
arr_time = make_datetime_100(year, month, day, arr_time),
sched_dep_time = make_datetime_100(year, month, day, sched_dep_time),
sched_arr_time = make_datetime_100(year, month, day, sched_arr_time)
) %>%
select(origin, dest, ends_with("delay"), ends_with("time"))
flights_dt %>%
mutate(dayofweek = wday(sched_dep_time, label = TRUE)) %>%
group_by(dayofweek) %>%
summarize(avg_dep_delay = mean(dep_delay, na.rm = TRUE),
avg_arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
gather(key = 'delay', value = 'minutes', 2:3) %>%
ggplot() +
geom_col(mapping = aes(x = dayofweek, y = minutes, fill = delay),
position = 'dodge') +
labs(y = "Average Delay (in minutes)",
x = "Day of the Week") +
scale_fill_discrete(name = "Delay", labels = c("Average Arrival Delay", "Average Departure Delay"))
Problem 3: Textbook How does the average delay time change over the course of a day? Should you use dep_time or sched_dep_time? Why? Answer: We should look at the sched_dep_time, because it will tell us how much of a delay that we have from when the flight was supposed to leave and the scheduled departure time is when we expect to leave when we go to book a flight.
flights_dt %>%
mutate(sched_dep_hour = hour(sched_dep_time)) %>%
group_by(sched_dep_hour) %>%
summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
ggplot(aes(x = sched_dep_hour, y = avg_dep_delay)) +
geom_point() +
geom_smooth() +
labs(y = "Average Departure Delay (in minutes)",
x = "Hour of the Day")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Problem 4: Flight Cancellations In addition to delays, there are also cancellations. When, during the day, is your flight most likely to be cancelled? Group the flights into 15 minute intervals, and then plot when your flight is most likely to be cancelled. For simplicity, let’s pool days/weeks etc. together. For simplicity, start the day at 5 AM (since effectively no flights are scheduled to depart before then). Hint: the rounding functions and update will be helpful in answering this question. Answer: 19:30:00 or 7:30pm.
flights_can <- flights %>%
filter(is.na(dep_time), is.na(arr_time), sched_dep_time >= 500) %>%
mutate(
dep_time = make_datetime_100(year, month, day, dep_time),
arr_time = make_datetime_100(year, month, day, arr_time),
sched_dep_time = make_datetime_100(year, month, day, sched_dep_time),
sched_arr_time = make_datetime_100(year, month, day, sched_arr_time)
) %>%
select(origin, dest, ends_with("time"))
binned_dep <- flights_can %>%
mutate(binned_dep = floor_date(sched_dep_time, minutes(15))) %>%
select(binned_dep, everything())
binned_dep %>%
mutate(., binned_dep = update(binned_dep, yday = 1)) %>%
ggplot(aes(x = binned_dep)) +
geom_bar() +
scale_x_datetime(date_labels = "%H:%M", date_breaks = "15 mins") +
labs(y = "Number of Flights",
x = "Time of Day (in 15 minute intervals)") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
Problem 5: Stock Data Download stock data for the last few years from Apple (AAPL) and Microsoft (MSFT) using the code below, grabbing the stock prices by day using the tidyquant package. This package allows you to super easily pull stock price data into R directly from Yahoo Finance’s API. Cool, right?
prices <- tq_get(c('AAPL','MSFT'),
from = "2017-01-01",
to = "2020-04-17",
get = "stock.prices")
prices %>%
ggplot(aes(x = date,y = close, color = symbol)) +
labs(title = "Apple and Microsoft Stock Prices January 1, 2017 - April 4, 2020",
subtitle = "End of Day Closing Prices") +
xlab("Date") + ylab("Price") +
scale_color_manual(values = c("Green", "Blue")) +
geom_line()
prices2 <- tq_get(c('ETSY','NCLH'),
from = "2020-02-01",
to = "2021-05-03",
get = "stock.prices")
prices2 %>%
ggplot(aes(x = date,y = close, color = symbol)) +
labs(title = "Etsy and Norwegian Cruise Line Stock Prices Feburary 1, 2020 - May 3, 2021",
subtitle = "End of Day Closing Prices") +
xlab("Date") + ylab("Price") +
scale_color_manual(values = c("Green", "Blue")) +
geom_line()