#step 1: load data:
glimpse(who)
## Rows: 7,240
## Columns: 60
## $ country <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan…
## $ iso2 <chr> "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF…
## $ iso3 <chr> "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "…
## $ year <dbl> 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 198…
## $ new_sp_m014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
#step 2: pivit longer:
who1 <- who %>%
pivot_longer(
cols = new_sp_m014:newrel_f65,
names_to = "key",
values_to = "cases",
values_drop_na = TRUE
)
who1
## # A tibble: 76,046 × 6
## country iso2 iso3 year key cases
## <chr> <chr> <chr> <dbl> <chr> <dbl>
## 1 Afghanistan AF AFG 1997 new_sp_m014 0
## 2 Afghanistan AF AFG 1997 new_sp_m1524 10
## 3 Afghanistan AF AFG 1997 new_sp_m2534 6
## 4 Afghanistan AF AFG 1997 new_sp_m3544 3
## 5 Afghanistan AF AFG 1997 new_sp_m4554 5
## 6 Afghanistan AF AFG 1997 new_sp_m5564 2
## 7 Afghanistan AF AFG 1997 new_sp_m65 0
## 8 Afghanistan AF AFG 1997 new_sp_f014 5
## 9 Afghanistan AF AFG 1997 new_sp_f1524 38
## 10 Afghanistan AF AFG 1997 new_sp_f2534 36
## # ℹ 76,036 more rows
#step 3: clean the key
who2<- who1 %>%
mutate(key = str_replace(key, "newrel", "new_rel"))
who2
## # A tibble: 76,046 × 6
## country iso2 iso3 year key cases
## <chr> <chr> <chr> <dbl> <chr> <dbl>
## 1 Afghanistan AF AFG 1997 new_sp_m014 0
## 2 Afghanistan AF AFG 1997 new_sp_m1524 10
## 3 Afghanistan AF AFG 1997 new_sp_m2534 6
## 4 Afghanistan AF AFG 1997 new_sp_m3544 3
## 5 Afghanistan AF AFG 1997 new_sp_m4554 5
## 6 Afghanistan AF AFG 1997 new_sp_m5564 2
## 7 Afghanistan AF AFG 1997 new_sp_m65 0
## 8 Afghanistan AF AFG 1997 new_sp_f014 5
## 9 Afghanistan AF AFG 1997 new_sp_f1524 38
## 10 Afghanistan AF AFG 1997 new_sp_f2534 36
## # ℹ 76,036 more rows
#step 4: seperate to different column
who3 <- who2 %>%
separate(key, c("new","type", "sexage" ), sep="_")
who3
## # A tibble: 76,046 × 8
## country iso2 iso3 year new type sexage cases
## <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 Afghanistan AF AFG 1997 new sp m014 0
## 2 Afghanistan AF AFG 1997 new sp m1524 10
## 3 Afghanistan AF AFG 1997 new sp m2534 6
## 4 Afghanistan AF AFG 1997 new sp m3544 3
## 5 Afghanistan AF AFG 1997 new sp m4554 5
## 6 Afghanistan AF AFG 1997 new sp m5564 2
## 7 Afghanistan AF AFG 1997 new sp m65 0
## 8 Afghanistan AF AFG 1997 new sp f014 5
## 9 Afghanistan AF AFG 1997 new sp f1524 38
## 10 Afghanistan AF AFG 1997 new sp f2534 36
## # ℹ 76,036 more rows
#step 5: remove iso2, iso3 and new
who4<- who3%>%
dplyr ::select(-iso2,-iso3,-new)
who4
## # A tibble: 76,046 × 5
## country year type sexage cases
## <chr> <dbl> <chr> <chr> <dbl>
## 1 Afghanistan 1997 sp m014 0
## 2 Afghanistan 1997 sp m1524 10
## 3 Afghanistan 1997 sp m2534 6
## 4 Afghanistan 1997 sp m3544 3
## 5 Afghanistan 1997 sp m4554 5
## 6 Afghanistan 1997 sp m5564 2
## 7 Afghanistan 1997 sp m65 0
## 8 Afghanistan 1997 sp f014 5
## 9 Afghanistan 1997 sp f1524 38
## 10 Afghanistan 1997 sp f2534 36
## # ℹ 76,036 more rows
#step 6: seperate age and sex
who5 <- who4 %>%
separate(sexage, c("sex", "age"), sep = 1)
who5
## # A tibble: 76,046 × 6
## country year type sex age cases
## <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 Afghanistan 1997 sp m 014 0
## 2 Afghanistan 1997 sp m 1524 10
## 3 Afghanistan 1997 sp m 2534 6
## 4 Afghanistan 1997 sp m 3544 3
## 5 Afghanistan 1997 sp m 4554 5
## 6 Afghanistan 1997 sp m 5564 2
## 7 Afghanistan 1997 sp m 65 0
## 8 Afghanistan 1997 sp f 014 5
## 9 Afghanistan 1997 sp f 1524 38
## 10 Afghanistan 1997 sp f 2534 36
## # ℹ 76,036 more rows
who6<- who5%>%
filter(year>=2000)
ggplot(who6)+
stat_summary(aes(x = year, y = cases, fill = age), fun = 'sum', geom = 'bar', position = 'dodge') +
labs(title = "TB cases in the world over after 2000", x = "Years", y = "Case Counts") +
scale_fill_discrete(labels = c("0-14 yrs", "15-24 yrs", "25-34 yrs", "35-44 yrs", "45-54 yrs", "55-64 yrs", "> 65 yrs"))
us_avg_tuition <- read.csv("/Users/linhle/Downloads/School material/D-Visual/us_avg_tuition.csv")
glimpse(us_avg_tuition)
## Rows: 50
## Columns: 13
## $ State <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Co…
## $ X2004.05 <chr> "$5,683", "$4,328", "$5,138", "$5,772", "$5,286", "$4,704", "…
## $ X2005.06 <chr> "$5,841", "$4,633", "$5,416", "$6,082", "$5,528", "$5,407", "…
## $ X2006.07 <chr> "$5,753", "$4,919", "$5,481", "$6,232", "$5,335", "$5,596", "…
## $ X2007.08 <chr> "$6,008", "$5,070", "$5,682", "$6,415", "$5,672", "$6,227", "…
## $ X2008.09 <chr> "$6,475", "$5,075", "$6,058", "$6,417", "$5,898", "$6,284", "…
## $ X2009.10 <chr> "$7,189", "$5,455", "$7,263", "$6,627", "$7,259", "$6,948", "…
## $ X2010.11 <chr> "$8,071", "$5,759", "$8,840", "$6,901", "$8,194", "$7,748", "…
## $ X2011.12 <chr> "$8,452", "$5,762", "$9,967", "$7,029", "$9,436", "$8,316", "…
## $ X2012.13 <chr> "$9,098", "$6,026", "$10,134", "$7,287", "$9,361", "$8,793", …
## $ X2013.14 <chr> "$9,359", "$6,012", "$10,296", "$7,408", "$9,274", "$9,293", …
## $ X2014.15 <chr> "$9,496", "$6,149", "$10,414", "$7,606", "$9,187", "$9,299", …
## $ X2015.16 <chr> "$9,751", "$6,571", "$10,646", "$7,867", "$9,270", "$9,748", …
Uat1<-us_avg_tuition %>%
pivot_longer(
cols = X2004.05:X2015.16,
names_to = "years",
values_to = "tuition")
Uat1
## # A tibble: 600 × 3
## State years tuition
## <chr> <chr> <chr>
## 1 Alabama X2004.05 $5,683
## 2 Alabama X2005.06 $5,841
## 3 Alabama X2006.07 $5,753
## 4 Alabama X2007.08 $6,008
## 5 Alabama X2008.09 $6,475
## 6 Alabama X2009.10 $7,189
## 7 Alabama X2010.11 $8,071
## 8 Alabama X2011.12 $8,452
## 9 Alabama X2012.13 $9,098
## 10 Alabama X2013.14 $9,359
## # ℹ 590 more rows
Uat2 <- Uat1 %>%
separate(years, c("X", "years"), sep = 1)%>%
dplyr ::select(-X)
Uat2
## # A tibble: 600 × 3
## State years tuition
## <chr> <chr> <chr>
## 1 Alabama 2004.05 $5,683
## 2 Alabama 2005.06 $5,841
## 3 Alabama 2006.07 $5,753
## 4 Alabama 2007.08 $6,008
## 5 Alabama 2008.09 $6,475
## 6 Alabama 2009.10 $7,189
## 7 Alabama 2010.11 $8,071
## 8 Alabama 2011.12 $8,452
## 9 Alabama 2012.13 $9,098
## 10 Alabama 2013.14 $9,359
## # ℹ 590 more rows
Uat3<- Uat2 %>%
mutate(tuition_clean = as.numeric(str_remove_all(tuition, "[$,]")))
Uat3
## # A tibble: 600 × 4
## State years tuition tuition_clean
## <chr> <chr> <chr> <dbl>
## 1 Alabama 2004.05 $5,683 5683
## 2 Alabama 2005.06 $5,841 5841
## 3 Alabama 2006.07 $5,753 5753
## 4 Alabama 2007.08 $6,008 6008
## 5 Alabama 2008.09 $6,475 6475
## 6 Alabama 2009.10 $7,189 7189
## 7 Alabama 2010.11 $8,071 8071
## 8 Alabama 2011.12 $8,452 8452
## 9 Alabama 2012.13 $9,098 9098
## 10 Alabama 2013.14 $9,359 9359
## # ℹ 590 more rows
Uat4 <-Uat3 %>%
group_by(State)%>%
summarise(Average_tuition_of_all_years = mean(tuition_clean, na.rm = TRUE))%>%
arrange(Average_tuition_of_all_years)
Uat4
## # A tibble: 50 × 2
## State Average_tuition_of_all_years
## <chr> <dbl>
## 1 Wyoming 4307.
## 2 Florida 5131.
## 3 Utah 5228.
## 4 Nevada 5367
## 5 Louisiana 5478.
## 6 Alaska 5480.
## 7 New Mexico 5485.
## 8 North Carolina 5589.
## 9 Idaho 5633.
## 10 West Virginia 5652.
## # ℹ 40 more rows
ggplot(Uat4) +
geom_bar(aes(x = reorder(State, Average_tuition_of_all_years),
y = Average_tuition_of_all_years),
stat = "identity") +
coord_flip() +
labs(x = "State", y = "Average Tuition of All Years")
Answer : We can tell Vermort has the highest average tuition over years and Wyoming has the lowest one
Uat5 <-Uat3 %>%
filter(years == "2004.05" | years== "2015.16") %>%
dplyr ::select(State, years, tuition_clean ) %>%
group_by(State) %>%
summarize(ir = (tuition_clean[years == "2015.16"] / tuition_clean[years == "2004.05"])^(1/11) - 1)
Uat5
## # A tibble: 50 × 2
## State ir
## <chr> <dbl>
## 1 Alabama 0.0503
## 2 Alaska 0.0387
## 3 Arizona 0.0685
## 4 Arkansas 0.0286
## 5 California 0.0524
## 6 Colorado 0.0685
## 7 Connecticut 0.0329
## 8 Delaware 0.0309
## 9 Florida 0.0467
## 10 Georgia 0.0633
## # ℹ 40 more rows
ggplot(Uat5) +
geom_bar(aes(x = reorder(State, ir),
y = ir),
stat = "identity") +
coord_flip() +
labs(x = "State", title = "Increasing rate of average tuition from 2004-2005 to 2015-2016 in each state", y = "Increasing rate")
Answer : Hawaii’s tuition increased at the fastest rate and Ohio is the slowest
because I don’t know which day you meant so I made to graph, one is average flights per day and the other one is just flights per day but across 12 days.
fpd<- flights%>%
group_by(year, month, day, dest)%>%
summarise(flight_per_day = n(), .groups = "drop")%>%
group_by(dest) %>%
summarise(avg_flights_per_day = mean(flight_per_day))
fpd
## # A tibble: 105 × 2
## dest avg_flights_per_day
## <chr> <dbl>
## 1 ABQ 1
## 2 ACK 1.71
## 3 ALB 1.69
## 4 ANC 1
## 5 ATL 47.2
## 6 AUS 6.68
## 7 AVL 1.11
## 8 BDL 1.76
## 9 BGR 1.44
## 10 BHM 1
## # ℹ 95 more rows
airports %>%
semi_join(fpd, c("faa" = "dest")) %>%
left_join(fpd, by = c("faa" = "dest")) %>%
ggplot(aes(lon, lat)) +
borders("state") +
geom_point(aes(color = avg_flights_per_day) ) +
coord_quickmap()+
labs(
title = "Average Number of Flights Per Day from NYC",
color = "Avg Flights/Day",
x = "Longitude",
y = "Latitude"
)
## Warning: `borders()` was deprecated in ggplot2 4.0.0.
## ℹ Please use `annotation_borders()` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
fpd_12<- flights%>%
group_by(year, month, day, dest)%>%
summarise(flight_per_day = n(), .groups = "drop")
fpd_12
## # A tibble: 31,229 × 5
## year month day dest flight_per_day
## <int> <int> <int> <chr> <int>
## 1 2013 1 1 ALB 3
## 2 2013 1 1 ATL 40
## 3 2013 1 1 AUS 6
## 4 2013 1 1 AVL 1
## 5 2013 1 1 BDL 2
## 6 2013 1 1 BNA 10
## 7 2013 1 1 BOS 25
## 8 2013 1 1 BQN 3
## 9 2013 1 1 BTV 7
## 10 2013 1 1 BUF 16
## # ℹ 31,219 more rows
airports %>%
semi_join(fpd_12, c("faa" = "dest")) %>%
left_join(fpd_12, by = c("faa" = "dest")) %>%
ggplot(aes(lon, lat)) +
borders("state") +
geom_point(aes(color =flight_per_day ) ) +
coord_quickmap()+
labs(
title = "Flights Per Day accross 12 days from NYC",
color = " Flights/Day",
x = "Longitude",
y = "Latitude"
)
glimpse(weather)
## Rows: 26,115
## Columns: 15
## $ origin <chr> "EWR", "EWR", "EWR", "EWR", "EWR", "EWR", "EWR", "EWR", "EW…
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ hour <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, …
## $ temp <dbl> 39.02, 39.02, 39.02, 39.92, 39.02, 37.94, 39.02, 39.92, 39.…
## $ dewp <dbl> 26.06, 26.96, 28.04, 28.04, 28.04, 28.04, 28.04, 28.04, 28.…
## $ humid <dbl> 59.37, 61.63, 64.43, 62.21, 64.43, 67.21, 64.43, 62.21, 62.…
## $ wind_dir <dbl> 270, 250, 240, 250, 260, 240, 240, 250, 260, 260, 260, 330,…
## $ wind_speed <dbl> 10.35702, 8.05546, 11.50780, 12.65858, 12.65858, 11.50780, …
## $ wind_gust <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 20.…
## $ precip <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ pressure <dbl> 1012.0, 1012.3, 1012.5, 1012.2, 1011.9, 1012.4, 1012.2, 101…
## $ visib <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,…
## $ time_hour <dttm> 2013-01-01 01:00:00, 2013-01-01 02:00:00, 2013-01-01 03:00…
fw<- flights %>%
left_join(weather)
## Joining with `by = join_by(year, month, day, origin, hour, time_hour)`
fw1 <- fw%>%
group_by(dep_delay, temp, wind_speed, precip, visib )%>%
dplyr ::select(dep_delay, temp, wind_speed, precip, visib)%>%
arrange(desc(dep_delay))
fw1_c <- fw1%>%
filter(!is.na(dep_delay),!is.na(temp),!is.na(wind_speed),!is.na(precip),!is.na(visib))
tribble(
~Delay_by, ~cor,
#--|--|----
"Temp", cor(fw1_c$dep_delay, fw1_c$temp),
"Wind speed", cor(fw1_c$dep_delay, fw1_c$wind_speed),
"Precipitation", cor(fw1_c$dep_delay, fw1_c$precip),
"Visibility",cor(fw1_c$dep_delay, fw1_c$visib)
)
## # A tibble: 4 × 2
## Delay_by cor
## <chr> <dbl>
## 1 Temp 0.0615
## 2 Wind speed 0.0474
## 3 Precipitation 0.0904
## 4 Visibility -0.0942
Based on the correlation, the precipitation and the visibility might be more likely to see a departure delay but It seems like the correlation is quite weak here so actually none of the weather variables meaningfully predict flight delays based on this method.
flights %>%
filter(!is.na(arr_delay), year == 2013, month == 6, day == 13) %>%
group_by(dest) %>%
summarise(avg_delay = mean(arr_delay)) -> delay_data
airports %>%
semi_join(delay_data, by = c("faa" = "dest")) %>%
left_join(delay_data, by = c("faa" = "dest")) %>%
ggplot(aes(lon, lat)) +
borders("state") +
geom_point(aes(color = avg_delay)) +
scale_color_gradient( low = "green", high = "red") +
coord_quickmap()
Answer: It seems like the there is the high delay in the Northeast/Mid-Atlantic region (NYC, Boston, Philadelphia, DC area). and by google, A derecho formed in the Midwest on June 12, tracking eastward into Ohio, Pennsylvania, and the Mid-Atlantic states. It produced damaging winds, hail up to 2.75 inches (7.0 cm) in diameter, and five tornadoes in Iowa.
=> This is the reason why they need to delay the flight there