## Warning: package 'knitr' was built under R version 3.2.5
## Warning: package 'lubridate' was built under R version 3.2.5
## Warning: package 'ggplot2' was built under R version 3.2.4
## Warning: package 'dplyr' was built under R version 3.2.5
## Warning: package 'readr' was built under R version 3.2.5

Administrative:

Please indicate

  • Who you collaborated with: Me, Myself, and Albert Kim. (office hours on Tuesday Oct 4th.)

  • Roughly how much time you spent on this HW so far: Likely three or four hours. I sped up as I practiced more throughout the week.

  • The RPubs URL of the published document here.

  • What gave you the most trouble: The month function. Understanding conceptually what was happening with the grouping function and what variables to use.

  • Any comments you have: I would like the homework to be worth a larger percentage of our grade, because it is where the learning happens. Alternatively, the class exercises could be worth something. Incentives.

Question 1:

Plot a “time series” of the proportion of flights that were delayed by > 30 minutes on each day. i.e.

  • the x-axis should be some notion of time
  • the y-axis should be the proportion.

Using this plot, describe the seasonality of when delays over 30 minutes tend to occur.

date2 num_flights_30
07 2792
06 2528
05 2319
04 2193
03 2095
12 2063
08 1793
01 1716
02 1668
10 1497
11 1415
09 1394
date2 num_flights num_flights_30 proportion_flight_delayed
01 18910 1716 0.0907456
02 17128 1668 0.0973844
03 19470 2095 0.1076014
04 18593 2193 0.1179476
05 19172 2319 0.1209576
06 19600 2528 0.1289796
07 20548 2792 0.1358770
08 20176 1793 0.0888680
09 18065 1394 0.0771658
10 18696 1497 0.0800706
11 18021 1415 0.0785195
12 19117 2063 0.1079144

Question 2:

Some people prefer flying on older planes. Even though they aren’t as nice, they tend to have more room. Which airlines should these people favor?

## Source: local data frame [15 x 2]
## 
##    carrier mean_years
##      <chr>      <dbl>
## 1       AA   1986.675
## 2       AS   2005.554
## 3       B6   2006.080
## 4       CO   2001.137
## 5       DL   1990.240
## 6       EV   2004.586
## 7       F9   2003.878
## 8       FL   2002.110
## 9       MQ   1981.579
## 10      OO   2004.983
## 11      UA   1996.365
## 12      US   1991.922
## 13      WN   1997.943
## 14      XE   2000.442
## 15      YV   2003.886

Question 3:

  • What states did Southwest Airlines’ flight paths tend to fly to?
  • What states did Southwest Airlines’ flights tend to fly to?

For example, Southwest Airlines Flight 60 to Dallas consists of a single flight path, but since it flew 299 times in 2013, it would be counted as 299 flights.

# Attach airport details to every flight by matching the airport code to the
# flight's destination, so the destination state is available per flight.
sw_join <- full_join(airports, flights, by = c("iata" = "dest"))

# Southwest ("WN") flights only. Rows with a missing flight number (e.g.
# airports that matched no flight in the full join) are discarded.
sw_flights <- filter(sw_join, !is.na(flight), carrier == "WN")

# Total number of Southwest flights, kept as a one-row table.
sw_tot <- summarise(sw_flights, tot_sw = n())

# Flight paths: how many times each flight number flew to each state.
# Grouping is by flight number and state rather than by destination airport.
sw_fp <- sw_flights %>%
  group_by(flight, state) %>%
  tally() %>%
  rename(flight_dest = n)
sw_fp
## Source: local data frame [2,894 x 3]
## Groups: flight [1,969]
## 
##    flight state flight_dest
##     <int> <chr>       <int>
## 1       1    CO          35
## 2       1    FL          12
## 3       1    MS          56
## 4       1    NJ           1
## 5       1    TX         137
## 6       2    TX         256
## 7       3    FL          25
## 8       3    MS          20
## 9       3    NM           1
## 10      3    TX         210
## ..    ...   ...         ...
   # head(10) %>%
  #  format(big.mark = ",") %>%
   # knitr::kable(sw_fp)

#gives us the common flight destinations for flight paths.
# Find where Southwest flights tend to fly to: count every individual flight
# (not distinct flight paths) per destination state and airport, then show the
# ten busiest destinations.
sw_states <- sw_flights %>%
  group_by(state, iata) %>%
  tally() %>%
  rename(state_count = n) %>%
  # tally() leaves the result grouped by state; drop that so the sorting and
  # number formatting below are applied to the table as a whole.
  ungroup() %>%
  # Busiest destinations first. Without this sort, head(10) only returns the
  # first ten state/airport pairs in their existing (alphabetical) order.
  arrange(desc(state_count)) %>%
  head(10) %>%
  # Add thousands separators to the count column only, instead of calling
  # format() on the whole table.
  mutate(state_count = format(state_count, big.mark = ",")) %>%
  # One alignment per column: state, iata, state_count.
  knitr::kable("markdown", align = c("l", "l", "r"))

# Print the table so it is rendered; assigning it alone displays nothing.
sw_states

Question 4:

I want to know proportionately what regions (NE, south, west, midwest) each carrier flies to/from Houston in the month of July. Consider the month() function from the lubridate package.

# Question 4: for each carrier, what proportion of its July flights goes to
# each region?
#   proportion = (carrier's July flights to a region) /
#                (carrier's total July flights)

# Attach airport details (including the state) to every flight via its
# destination code, then attach the region of each state.
# sw_join is rebuilt here so this chunk does not depend on earlier chunks.
sw_join <- full_join(airports, flights, by = c("iata" = "dest"))
region <- full_join(sw_join, states, by = "state")

# Keep actual flights only: the full joins also produce rows with no carrier
# or date, which are not flights. Then restrict to July using
# lubridate::month().
July_Flight <- region %>%
  filter(!is.na(carrier) & !is.na(date)) %>%
  mutate(month = month(date)) %>%
  filter(month == 7)

# Numerator: number of July flights per carrier and region.
regionTot <- July_Flight %>%
  group_by(region, carrier) %>%
  tally() %>%
  rename(region_fly_tot = n) %>%
  ungroup()

# Denominator: total number of July flights per carrier. Grouping by carrier
# (not by region) makes the proportions of each carrier sum to 1 across
# regions, which is what "proportionately what regions each carrier flies to"
# asks for.
flight_tot <- July_Flight %>%
  group_by(carrier) %>%
  tally() %>%
  rename(tot_flights = n)

# Join the per-carrier totals onto the carrier/region counts and compute the
# proportion.
props <- left_join(regionTot, flight_tot, by = "carrier") %>%
  mutate(prop_region = region_fly_tot / tot_flights)

# One point per carrier/region pair, coloured by region.
ggplot(data = props, aes(x = carrier, y = prop_region, col = region)) +
  geom_point() +
  labs(x = "Airline Carrier", y = "Proportion", 
       title = "Proportionately what regions each 
airline carrier flies to in July")