us_tuition <- read.csv("/Users/yuhe/Downloads/us_avg_tuition.csv")

1.who data set in the tidyr package

a.Read the lecture notebook that tidies the who data set, understand all steps needed, then reproduce the whole tidying steps by yourself without referring to my notebook.

glimpse(who)
## Rows: 7,240
## Columns: 60
## $ country      <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan…
## $ iso2         <chr> "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF…
## $ iso3         <chr> "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "…
## $ year         <dbl> 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 198…
## $ new_sp_m014  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m65   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f014  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f65   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m014  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m65   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f014  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f65   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m014  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m65   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f014  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f65   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m014  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m65   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f014  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f65   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
who_0 <- who %>%
  pivot_longer(cols = new_sp_m014:newrel_f65, names_to = "key", values_to = "cases", values_drop_na = T) %>%
  mutate(key = str_replace(key, "newrel", "new_rel")) %>%
  separate(key, c("new", "type", "sexage"), sep = "_") %>%
  select(-new, -iso2, -iso3) %>%
  separate(sexage, c("sex", "age"), sep = 1)

who_0
## # A tibble: 76,046 × 6
##    country      year type  sex   age   cases
##    <chr>       <dbl> <chr> <chr> <chr> <dbl>
##  1 Afghanistan  1997 sp    m     014       0
##  2 Afghanistan  1997 sp    m     1524     10
##  3 Afghanistan  1997 sp    m     2534      6
##  4 Afghanistan  1997 sp    m     3544      3
##  5 Afghanistan  1997 sp    m     4554      5
##  6 Afghanistan  1997 sp    m     5564      2
##  7 Afghanistan  1997 sp    m     65        0
##  8 Afghanistan  1997 sp    f     014       5
##  9 Afghanistan  1997 sp    f     1524     38
## 10 Afghanistan  1997 sp    f     2534     36
## # ℹ 76,036 more rows

b.Using the tidied data, explore one question of your interest and answer it with visualization or summary table.

Which country has the highest cases and which type has the most cases?

who_0 %>%
  group_by(country) %>%
  summarise(cases = sum(cases)) %>%
  arrange(desc(cases)) %>%
  head(4)
## # A tibble: 4 × 2
##   country        cases
##   <chr>          <dbl>
## 1 China        8389839
## 2 India        7098552
## 3 South Africa 3010272
## 4 Indonesia    2909925
who_0 %>%
  filter(country == "China") %>%
  group_by(type) %>%
  summarise(cases = sum(cases)) %>%
  arrange(desc(cases))
## # A tibble: 4 × 2
##   type    cases
##   <chr>   <dbl>
## 1 sp    5637562
## 2 sn    1885757
## 3 rel    847176
## 4 ep      19344

Answer: China has the most cases, and the type of sp cases is the most.

2.Tidy and visualize the US_average_tuition data set (attached on Canvas)

glimpse(us_tuition)
## Rows: 50
## Columns: 13
## $ State    <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Co…
## $ X2004.05 <chr> "$5,683", "$4,328", "$5,138", "$5,772", "$5,286", "$4,704", "…
## $ X2005.06 <chr> "$5,841", "$4,633", "$5,416", "$6,082", "$5,528", "$5,407", "…
## $ X2006.07 <chr> "$5,753", "$4,919", "$5,481", "$6,232", "$5,335", "$5,596", "…
## $ X2007.08 <chr> "$6,008", "$5,070", "$5,682", "$6,415", "$5,672", "$6,227", "…
## $ X2008.09 <chr> "$6,475", "$5,075", "$6,058", "$6,417", "$5,898", "$6,284", "…
## $ X2009.10 <chr> "$7,189", "$5,455", "$7,263", "$6,627", "$7,259", "$6,948", "…
## $ X2010.11 <chr> "$8,071", "$5,759", "$8,840", "$6,901", "$8,194", "$7,748", "…
## $ X2011.12 <chr> "$8,452", "$5,762", "$9,967", "$7,029", "$9,436", "$8,316", "…
## $ X2012.13 <chr> "$9,098", "$6,026", "$10,134", "$7,287", "$9,361", "$8,793", …
## $ X2013.14 <chr> "$9,359", "$6,012", "$10,296", "$7,408", "$9,274", "$9,293", …
## $ X2014.15 <chr> "$9,496", "$6,149", "$10,414", "$7,606", "$9,187", "$9,299", …
## $ X2015.16 <chr> "$9,751", "$6,571", "$10,646", "$7,867", "$9,270", "$9,748", …

a.Tidy the data following what we learned in class.

us_tuition_0 <- us_tuition %>%
  pivot_longer(c(X2004.05, X2005.06, X2006.07, X2007.08, X2008.09, X2009.10, X2010.11, X2011.12, X2012.13, X2013.14, X2014.15, X2015.16), names_to = "date", values_to = "avg_tuition") %>%
  separate(date, into = c("X", "academic_year"), sep = 1) %>%
  select(State, academic_year, avg_tuition) %>%
  mutate(academic_year = str_replace(academic_year, "\\.", "-20")) %>%
  mutate(avg_tuition = str_remove(avg_tuition, ",")) %>%
  separate(avg_tuition, into = c("currency", "avg_tuition"), sep = 1, convert = T) %>%
  select(State, academic_year, avg_tuition)

us_tuition_0
## # A tibble: 600 × 3
##    State   academic_year avg_tuition
##    <chr>   <chr>               <int>
##  1 Alabama 2004-2005            5683
##  2 Alabama 2005-2006            5841
##  3 Alabama 2006-2007            5753
##  4 Alabama 2007-2008            6008
##  5 Alabama 2008-2009            6475
##  6 Alabama 2009-2010            7189
##  7 Alabama 2010-2011            8071
##  8 Alabama 2011-2012            8452
##  9 Alabama 2012-2013            9098
## 10 Alabama 2013-2014            9359
## # ℹ 590 more rows

b.Make an informative visualization of the data to show the average tuition across all years in the data set for each state. Which state has the highest tuition? Which state has the lowest?

us_tuition_0 %>%
  ggplot() +
  stat_summary(aes(x = State, y = avg_tuition), fun = 'mean', geom = 'bar') +
  labs(title = "State Average Tuition Of All Years ", x = "State", y = "Average Tuition") +
  theme(plot.title = element_text(hjust = 0.5, size = rel(1.2)),
        axis.text.x = element_text(angle = 45))

us_tuition_0 %>%
  group_by(State) %>%
  summarise(avg_tuition_year = mean(avg_tuition)) %>%
  arrange(desc(avg_tuition_year))
## # A tibble: 50 × 2
##    State          avg_tuition_year
##    <chr>                     <dbl>
##  1 Vermont                  13067.
##  2 New Hampshire            12781.
##  3 New Jersey               12054.
##  4 Pennsylvania             11970 
##  5 Illinois                 11228.
##  6 Michigan                 10477 
##  7 South Carolina           10377.
##  8 Delaware                 10099.
##  9 Massachusetts            10058.
## 10 Ohio                      9942.
## # ℹ 40 more rows

Answer: Vermont has the highest average tuition. Wyoming has the lowest average tuition.

c.Make an informative visualization of the data to show the increasing rate of average tuition from 2004-2005 to 2015-2016 in each state. Which state’s tuition increased at the fastest rate? Which one the slowest?

us_tuition_0 %>%
  pivot_wider(names_from = academic_year, values_from = avg_tuition) %>%
  group_by(State) %>%
  summarise(rate_of_change = (`2015-2016`-`2004-2005`)/`2004-2005`) %>%
  ggplot(aes(rate_of_change, reorder(State, rate_of_change), fill = rate_of_change)) +
  geom_col() +
  labs(title = "Rate of Change in College Tuition by State From 2004 -2015",
       x = "Rate of Change", 
       y = "State")

Answer: Hawaii has the fastest rate while Ohio is the slowest.

3.Analyze flights and weather data sets in nycflights13 (relational data)

a.Finish the lab exercise - Create a airport map with each airport location marked on the map and colored by the number of flights per day from NYC to each airport.

flights2 <- flights %>%
  filter(!is.na(dest)) %>%
  group_by(dest, month, day) %>%
  summarise(flights_count_day = length(dest))
## `summarise()` has grouped output by 'dest', 'month'. You can override using the
## `.groups` argument.
flights3 <- flights2 %>%
  group_by(dest) %>%
  summarise(avg_flights_day = sum(flights_count_day)/length(dest))

airports %>%
  semi_join(flights3, c("faa" = "dest")) %>%
  left_join(flights3, c("faa" = "dest")) %>%
  ggplot(aes(lon, lat)) +
    annotation_borders("state") +
    geom_point(aes(colour = avg_flights_day)) +
    scale_color_gradient(low = "green", high = "red") +
    coord_quickmap() +
    labs(title = "Flights and Counts from NYC",
          x = "Longitude",
          y = "Latitude") +
    theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)))

b.What weather conditions make it more likely to see a departure delay? hot or cold weather? windy weather? rainy or snowy? foggy? Create a proper data frame and use proper visualization or summary table to answer the question.

flights_weather <- left_join(weather, flights, by = c("origin", "time_hour"))

flights_weather <- flights_weather %>%
  select(origin, temp, wind_speed, precip, visib, time_hour, dep_delay) %>%
  filter(!is.na(dep_delay))
ggplot(flights_weather) +
  geom_point(aes(temp, dep_delay)) +
  scale_x_continuous(name = "Temperature (F)") +
  scale_y_continuous(name = "Departure Delay Time") +
  labs(title = "Density Graph of Temperature and Departure Delay Time")
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggplot(flights_weather) +
  geom_density_2d_filled(aes(wind_speed, dep_delay)) +
  scale_x_continuous(name = "Wind Speed (mph)") +
  scale_y_continuous(name = "Departure Delay Time", limits = c(-25, 25)) +
  labs(title = "Density Graph of Wind Speed and Departure Delay Time")
## Warning: Removed 54147 rows containing non-finite outside the scale range
## (`stat_density2d_filled()`).

ggplot(flights_weather) +
  geom_point(aes(precip, dep_delay)) +
  scale_x_continuous(name = "Precipitation (inch)") +
  scale_y_continuous(name = "Departure Delay Time") +
  labs(title = "Density Graph of Precipitation and Departure Delay Time")

ggplot(flights_weather) +
  geom_point(aes(visib, dep_delay)) +
  geom_smooth(aes(visib, dep_delay)) +
  scale_x_continuous(name = "Visibility (miles)") +
  scale_y_continuous(name = "Departure Delay Time") +
  labs(title = "Density Graph of Visibility and Departure Delay Time")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Answer: Based on the graphs, there is no strong correlation between visibility, precipitation, wind speed, temperature and flights departure delay.

c.Display the spatial pattern of arrival delays on June 13, 2013 using a map, and then use Google to cross-reference with the weather. Explain how the weather condition might have affected the spatial pattern of arrival delays.

june_13_delay <- flights %>%
  filter(year == 2013, month == 6, day == 13, !is.na(arr_delay)) %>%
  group_by(dest) %>%
  summarise(avg_arr_delay = mean(arr_delay), flight_count = length(arr_delay)) %>%
  inner_join(airports, by = c("dest" = "faa"))

  ggplot(june_13_delay, aes(lon, lat)) +
    annotation_borders("state") +
    geom_point(aes(colour = avg_arr_delay, size = flight_count)) +
    scale_color_gradient(low = "green", high = "red") +
    coord_quickmap() +
    labs(title = "Flights Arrival Delay and Counts in US",
          x = "Longitude",
          y = "Latitude") +
    theme(plot.title = element_text(hjust = 0.5, size = rel(1.5)))

Answer: On June 13, 2013, a powerful derecho raced to the south and east of the US caused a lot of damage in that area. It must forced a lot of flights delay their arrive to avoid the storm.