Lab 2: Intro to Data

library(tidyverse)
library(openintro)

# Load the nycflights data set and list its column names.
data("nycflights")
colnames(nycflights)

##  [1] "year"      "month"     "day"       "dep_time"  "dep_delay" "arr_time" 
##  [7] "arr_delay" "carrier"   "tailnum"   "flight"    "origin"    "dest"     
## [13] "air_time"  "distance"  "hour"      "minute"

Exercise 1

The three histograms show the same data with different bin sizes. Comparing them, the smaller the bins are, the more of the small differences in the data you can see. With a bin width of 15, small changes from bin to bin are visible. With a bin width of 150, those small features are obscured to the point that they are not noticeable at all in the graph.

# Histogram of departure delays using the default number of bins.
nycflights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of departure delays with narrow bins (15 wide).
nycflights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram(binwidth = 15)

# Histogram of departure delays with wide bins (150 wide).
nycflights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram(binwidth = 150)

Exercise 2

After creating the sfo_feb_flights data frame, we can see that there are 68 flights that meet these criteria.

# Keep only flights whose destination is SFO and whose month is 2.
sfo_feb_flights <- filter(nycflights, dest == "SFO", month == 2)

Exercise 3

Below, the distribution of the arrival delays is described using a histogram and appropriate summary statistics.

library(ggplot2)
# Histogram of arrival delays for the sfo_feb_flights subset (default bins).
sfo_feb_flights %>%
  ggplot(aes(x = arr_delay)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Centre (mean, median) and extremes (min, max) of arrival delay
# for the sfo_feb_flights subset.
summarise(
  sfo_feb_flights,
  mean_arr = mean(arr_delay),
  median_arr = median(arr_delay),
  min_arr = min(arr_delay),
  max_arr = max(arr_delay)
)

## # A tibble: 1 x 4
##   mean_arr median_arr min_arr max_arr
##      <dbl>      <dbl>   <dbl>   <dbl>
## 1     -4.5        -11     -66     196

Exercise 4

The carriers with the most variable arrival delays are DL and UA, which are tied with an IQR of 22; their median arrival delays are -15 and -10, respectively.

# Per-carrier median and interquartile range of arrival delay.
sfo_feb_flights %>%
  group_by(carrier) %>%
  summarise(
    median_arr = median(arr_delay),
    iqr_arr = IQR(arr_delay)
  )

## # A tibble: 5 x 3
##   carrier median_arr iqr_arr
##   <chr>        <dbl>   <dbl>
## 1 AA             5      17.5
## 2 B6           -10.5    12.2
## 3 DL           -15      22  
## 4 UA           -10      22  
## 5 VX           -22.5    21.2

Exercise 5

The advantage of the first option is that it gives the mean departure delay for each month, which in this case tells us that the month with the lowest average delay is October. The second option, on the other hand, gives the median departure delay, which is lowest in September and October. The drawback of both is that the values are very close together, differing by only a few points, so it can be hard to see which month is actually the best; you would be stuck choosing between September, October, and November.

##Option 1

# Mean departure delay per month, sorted so the month with the lowest
# mean comes first. mean() already yields a single value per group, so no
# min() is needed inside summarise(); arrange() does the ordering.
nycflights %>%
  group_by(month) %>%
  summarise(low_mean = mean(dep_delay)) %>%
  arrange(low_mean)

## # A tibble: 12 x 2
##    month low_mean
##    <int>    <dbl>
##  1    10     5.88
##  2    11     6.10
##  3     9     6.87
##  4     1    10.2 
##  5     2    10.7 
##  6     8    12.6 
##  7     5    13.3 
##  8     3    13.5 
##  9     4    14.6 
## 10    12    17.4 
## 11     6    20.4 
## 12     7    20.8

## Option 2 

# Median departure delay per month, sorted so the month with the lowest
# median comes first. median() already yields a single value per group, so
# no min() is needed inside summarise(); arrange() does the ordering.
nycflights %>%
  group_by(month) %>%
  summarise(low_medi = median(dep_delay)) %>%
  arrange(low_medi)

## # A tibble: 12 x 2
##    month low_medi
##    <int>    <dbl>
##  1     9       -3
##  2    10       -3
##  3     1       -2
##  4     2       -2
##  5     4       -2
##  6    11       -2
##  7     3       -1
##  8     5       -1
##  9     8       -1
## 10     6        0
## 11     7        0
## 12    12        1

Exercise 6

I would choose LGA airport since it has the highest on-time departure rate, about 73%, compared with about 69% for JFK and 64% for EWR.

# Label each flight "on time" when its departure delay is under 5,
# otherwise "delayed", and store the label as a new dep_type column.
nycflights <- mutate(
  nycflights,
  dep_type = ifelse(dep_delay < 5, "on time", "delayed")
)

# Share of on-time departures per origin airport, highest rate first.
nycflights %>%
  group_by(origin) %>%
  summarise(
    ot_dep_rate = sum(dep_type == "on time") / n()
  ) %>%
  arrange(desc(ot_dep_rate))

## # A tibble: 3 x 2
##   origin ot_dep_rate
##   <chr>        <dbl>
## 1 LGA          0.728
## 2 JFK          0.694
## 3 EWR          0.637

Exercise 7

I first converted the air time to hours by dividing it by 60 (the number of minutes in an hour), and then I used mutate to add the average speed, distance divided by air time in hours, as a new column in the flights data.

# convert the air time in hours
# Add both derived columns in one step: air_time divided by 60, then
# distance divided by that result. mutate() evaluates the columns in order,
# so avg_speed can use air_time_in_hrs.
nycflights <- mutate(
  nycflights,
  air_time_in_hrs = air_time / 60,
  avg_speed = distance / air_time_in_hrs
)

Exercise 8

There seems to be a positive association between average speed and distance: the higher the average speed, the greater the distance traveled. However, there are some flights that had a high average speed but flew only a small distance.

library(ggplot2)
# Scatterplot with avg_speed on the x-axis and distance on the y-axis.
nycflights %>%
  ggplot(aes(x = avg_speed, y = distance)) +
  geom_point()

Exercise 9

Here is the code for the scatterplot. It came out exactly as the lab wanted; I had to map carrier to color instead of fill.

# Keep only flights whose carrier is AA, DL, or UA, then print the result.
nyc <- filter(nycflights, carrier %in% c("AA", "DL", "UA"))
nyc

## # A tibble: 13,709 x 19
##     year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
##    <int> <int> <int>    <int>     <dbl>    <int>     <dbl> <chr>   <chr>  
##  1  2013     5     7     1657        -3     2104        10 DL      N3760C 
##  2  2013    12     8      859        -1     1238        11 DL      N712TW 
##  3  2013     5    14     1841        -4     2122       -34 DL      N914DL 
##  4  2013     1     1     1817        -3     2008         3 AA      N3AXAA 
##  5  2013     9    26      725       -10     1027        -8 AA      N3FSAA 
##  6  2013     8     5      757        -3     1041       -23 DL      N380DA 
##  7  2013    10    21      859        -1     1036        11 UA      N57852 
##  8  2013     2     1      729         9     1018        -5 UA      N36247 
##  9  2013     6     8     1839        -1     2120       -12 DL      N3751B 
## 10  2013     9    18      933        -7     1037       -43 AA      N514AA 
## # ... with 13,699 more rows, and 10 more variables: flight <int>, origin <chr>,
## #   dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   dep_type <chr>, air_time_in_hrs <dbl>, avg_speed <dbl>

# Departure delay (x) against arrival delay (y), points colored by carrier.
nyc %>%
  ggplot(aes(x = dep_delay, y = arr_delay, color = carrier)) +
  geom_point()