us_tuition <- read.csv("/Users/yuhe/Downloads/us_avg_tuition.csv")
glimpse(who)
## Rows: 7,240
## Columns: 60
## $ country <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan…
## $ iso2 <chr> "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF…
## $ iso3 <chr> "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "…
## $ year <dbl> 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 198…
## $ new_sp_m014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
who_0 <- who %>%
pivot_longer(cols = new_sp_m014:newrel_f65, names_to = "key", values_to = "cases", values_drop_na = T) %>%
mutate(key = str_replace(key, "newrel", "new_rel")) %>%
separate(key, c("new", "type", "sexage"), sep = "_") %>%
select(-new, -iso2, -iso3) %>%
separate(sexage, c("sex", "age"), sep = 1)
who_0
## # A tibble: 76,046 × 6
## country year type sex age cases
## <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 Afghanistan 1997 sp m 014 0
## 2 Afghanistan 1997 sp m 1524 10
## 3 Afghanistan 1997 sp m 2534 6
## 4 Afghanistan 1997 sp m 3544 3
## 5 Afghanistan 1997 sp m 4554 5
## 6 Afghanistan 1997 sp m 5564 2
## 7 Afghanistan 1997 sp m 65 0
## 8 Afghanistan 1997 sp f 014 5
## 9 Afghanistan 1997 sp f 1524 38
## 10 Afghanistan 1997 sp f 2534 36
## # ℹ 76,036 more rows
Which country has the highest cases and which type has the most cases?
who_0 %>%
group_by(country) %>%
summarise(cases = sum(cases)) %>%
arrange(desc(cases)) %>%
head(4)
## # A tibble: 4 × 2
## country cases
## <chr> <dbl>
## 1 China 8389839
## 2 India 7098552
## 3 South Africa 3010272
## 4 Indonesia 2909925
who_0 %>%
filter(country == "China") %>%
group_by(type) %>%
summarise(cases = sum(cases)) %>%
arrange(desc(cases))
## # A tibble: 4 × 2
## type cases
## <chr> <dbl>
## 1 sp 5637562
## 2 sn 1885757
## 3 rel 847176
## 4 ep 19344
Answer: China has the most cases, and the type of sp cases is the most.
glimpse(us_tuition)
## Rows: 50
## Columns: 13
## $ State <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Co…
## $ X2004.05 <chr> "$5,683", "$4,328", "$5,138", "$5,772", "$5,286", "$4,704", "…
## $ X2005.06 <chr> "$5,841", "$4,633", "$5,416", "$6,082", "$5,528", "$5,407", "…
## $ X2006.07 <chr> "$5,753", "$4,919", "$5,481", "$6,232", "$5,335", "$5,596", "…
## $ X2007.08 <chr> "$6,008", "$5,070", "$5,682", "$6,415", "$5,672", "$6,227", "…
## $ X2008.09 <chr> "$6,475", "$5,075", "$6,058", "$6,417", "$5,898", "$6,284", "…
## $ X2009.10 <chr> "$7,189", "$5,455", "$7,263", "$6,627", "$7,259", "$6,948", "…
## $ X2010.11 <chr> "$8,071", "$5,759", "$8,840", "$6,901", "$8,194", "$7,748", "…
## $ X2011.12 <chr> "$8,452", "$5,762", "$9,967", "$7,029", "$9,436", "$8,316", "…
## $ X2012.13 <chr> "$9,098", "$6,026", "$10,134", "$7,287", "$9,361", "$8,793", …
## $ X2013.14 <chr> "$9,359", "$6,012", "$10,296", "$7,408", "$9,274", "$9,293", …
## $ X2014.15 <chr> "$9,496", "$6,149", "$10,414", "$7,606", "$9,187", "$9,299", …
## $ X2015.16 <chr> "$9,751", "$6,571", "$10,646", "$7,867", "$9,270", "$9,748", …
us_tuition_0 <- us_tuition %>%
pivot_longer(c(X2004.05, X2005.06, X2006.07, X2007.08, X2008.09, X2009.10, X2010.11, X2011.12, X2012.13, X2013.14, X2014.15, X2015.16), names_to = "date", values_to = "avg_tuition") %>%
separate(date, into = c("X", "academic_year"), sep = 1) %>%
select(State, academic_year, avg_tuition) %>%
mutate(academic_year = str_replace(academic_year, "\\.", "-20")) %>%
mutate(avg_tuition = str_remove(avg_tuition, ",")) %>%
separate(avg_tuition, into = c("currency", "avg_tuition"), sep = 1, convert = T) %>%
select(State, academic_year, avg_tuition)
us_tuition_0
## # A tibble: 600 × 3
## State academic_year avg_tuition
## <chr> <chr> <int>
## 1 Alabama 2004-2005 5683
## 2 Alabama 2005-2006 5841
## 3 Alabama 2006-2007 5753
## 4 Alabama 2007-2008 6008
## 5 Alabama 2008-2009 6475
## 6 Alabama 2009-2010 7189
## 7 Alabama 2010-2011 8071
## 8 Alabama 2011-2012 8452
## 9 Alabama 2012-2013 9098
## 10 Alabama 2013-2014 9359
## # ℹ 590 more rows
us_tuition_0 %>%
ggplot() +
stat_summary(aes(x = State, y = avg_tuition), fun = 'mean', geom = 'bar') +
labs(title = "State Average Tuition Of All Years ", x = "State", y = "Average Tuition") +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.2)),
axis.text.x = element_text(angle = 45))
us_tuition_0 %>%
group_by(State) %>%
summarise(avg_tuition_year = mean(avg_tuition)) %>%
arrange(desc(avg_tuition_year))
## # A tibble: 50 × 2
## State avg_tuition_year
## <chr> <dbl>
## 1 Vermont 13067.
## 2 New Hampshire 12781.
## 3 New Jersey 12054.
## 4 Pennsylvania 11970
## 5 Illinois 11228.
## 6 Michigan 10477
## 7 South Carolina 10377.
## 8 Delaware 10099.
## 9 Massachusetts 10058.
## 10 Ohio 9942.
## # ℹ 40 more rows
Answer: Vermont has the highest average tuition. Wyoming has the lowest average tuition.
us_tuition_0 %>%
pivot_wider(names_from = academic_year, values_from = avg_tuition) %>%
group_by(State) %>%
summarise(rate_of_change = (`2015-2016`-`2004-2005`)/`2004-2005`) %>%
ggplot(aes(rate_of_change, reorder(State, rate_of_change), fill = rate_of_change)) +
geom_col() +
labs(title = "Rate of Change in College Tuition by State From 2004 -2015",
x = "Rate of Change",
y = "State")
Answer: Hawaii has the fastest rate while Ohio is the slowest.
flights2 <- flights %>%
filter(!is.na(dest)) %>%
group_by(dest, month, day) %>%
summarise(flights_count_day = length(dest))
## `summarise()` has grouped output by 'dest', 'month'. You can override using the
## `.groups` argument.
flights3 <- flights2 %>%
group_by(dest) %>%
summarise(avg_flights_day = sum(flights_count_day)/length(dest))
airports %>%
semi_join(flights3, c("faa" = "dest")) %>%
left_join(flights3, c("faa" = "dest")) %>%
ggplot(aes(lon, lat)) +
annotation_borders("state") +
geom_point(aes(colour = avg_flights_day)) +
scale_color_gradient(low = "green", high = "red") +
coord_quickmap() +
labs(title = "Flights and Counts from NYC",
x = "Longitude",
y = "Latitude") +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)))
flights_weather <- left_join(weather, flights, by = c("origin", "time_hour"))
flights_weather <- flights_weather %>%
select(origin, temp, wind_speed, precip, visib, time_hour, dep_delay) %>%
filter(!is.na(dep_delay))
ggplot(flights_weather) +
geom_point(aes(temp, dep_delay)) +
scale_x_continuous(name = "Temperature (F)") +
scale_y_continuous(name = "Departure Delay Time") +
labs(title = "Density Graph of Temperature and Departure Delay Time")
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggplot(flights_weather) +
geom_density_2d_filled(aes(wind_speed, dep_delay)) +
scale_x_continuous(name = "Wind Speed (mph)") +
scale_y_continuous(name = "Departure Delay Time", limits = c(-25, 25)) +
labs(title = "Density Graph of Wind Speed and Departure Delay Time")
## Warning: Removed 54147 rows containing non-finite outside the scale range
## (`stat_density2d_filled()`).
ggplot(flights_weather) +
geom_point(aes(precip, dep_delay)) +
scale_x_continuous(name = "Precipitation (inch)") +
scale_y_continuous(name = "Departure Delay Time") +
labs(title = "Density Graph of Precipitation and Departure Delay Time")
ggplot(flights_weather) +
geom_point(aes(visib, dep_delay)) +
geom_smooth(aes(visib, dep_delay)) +
scale_x_continuous(name = "Visibility (miles)") +
scale_y_continuous(name = "Departure Delay Time") +
labs(title = "Density Graph of Visibility and Departure Delay Time")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Answer: Based on the graphs, there is no strong correlation between visibility, precipitation, wind speed, temperature and flights departure delay.
june_13_delay <- flights %>%
filter(year == 2013, month == 6, day == 13, !is.na(arr_delay)) %>%
group_by(dest) %>%
summarise(avg_arr_delay = mean(arr_delay), flight_count = length(arr_delay)) %>%
inner_join(airports, by = c("dest" = "faa"))
ggplot(june_13_delay, aes(lon, lat)) +
annotation_borders("state") +
geom_point(aes(colour = avg_arr_delay, size = flight_count)) +
scale_color_gradient(low = "green", high = "red") +
coord_quickmap() +
labs(title = "Flights Arrival Delay and Counts in US",
x = "Longitude",
y = "Latitude") +
theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)))
Answer: On June 13, 2013, a powerful derecho raced to the south and east of the US caused a lot of damage in that area. It must forced a lot of flights delay their arrive to avoid the storm.