Problem 1

Load the nycflights13 dataset in R using the following code:

#load the nycflights13 dataset into R
library(nycflights13)

Problem 2

Give the dataframes in the nycflights13 dataset distinctive names (flights, planes,and airports)

#datasets named to make them easily distinguishable
flights <- nycflights13::flights
planes <- nycflights13::planes
airports <- nycflights13::airports

Problem 3

Using a filter join, determine the number of records in the flights dataset that have tail numbers matching records in the planes dataset.

#ran library tidyverse to run the code below
library(tidyverse)
#semi join is used to keep all observations from the flights dataframe that have matching tail numbers in the 
#planes dataframe
flights %>% semi_join(planes, by = "tailnum") -> filter_join
filter_join
## # A tibble: 284,170 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      849            851
## # … with 284,160 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# number of records that have matching tail numbers
nrow(filter_join)
## [1] 284170

Problem 4

Using a filter join, determine the number of records in the flights dataset with tail numbers that do not match records in the planes dataset.

#anti join is used to drop all observations in flights that have a match in planes
flights %>% anti_join(planes, by = "tailnum") -> anti_join
anti_join
## # A tibble: 52,606 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      558            600        -2      753            745
##  2  2013     1     1      559            600        -1      941            910
##  3  2013     1     1      600            600         0      837            825
##  4  2013     1     1      602            605        -3      821            805
##  5  2013     1     1      608            600         8      807            735
##  6  2013     1     1      611            600        11      945            931
##  7  2013     1     1      623            610        13      920            915
##  8  2013     1     1      624            630        -6      840            830
##  9  2013     1     1      628            630        -2     1137           1140
## 10  2013     1     1      629            630        -1      824            810
## # … with 52,596 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# number of records that do not match
nrow(anti_join)
## [1] 52606

Problem 5

Using a filter join, filter the airports dataset for those airports that do not have matching destination values in the flights dataset.

# anti join used again and since they do not have the same column name faa = dest
airports %>% anti_join(flights, by = c("faa" = "dest")) -> airport_antijoin
airport_antijoin
## # A tibble: 1,357 × 8
##    faa   name                             lat    lon   alt    tz dst   tzone    
##    <chr> <chr>                          <dbl>  <dbl> <dbl> <dbl> <chr> <chr>    
##  1 04G   Lansdowne Airport               41.1  -80.6  1044    -5 A     America/…
##  2 06A   Moton Field Municipal Airport   32.5  -85.7   264    -6 A     America/…
##  3 06C   Schaumburg Regional             42.0  -88.1   801    -6 A     America/…
##  4 06N   Randall Airport                 41.4  -74.4   523    -5 A     America/…
##  5 09J   Jekyll Island Airport           31.1  -81.4    11    -5 A     America/…
##  6 0A9   Elizabethton Municipal Airport  36.4  -82.2  1593    -5 A     America/…
##  7 0G6   Williams County Airport         41.5  -84.5   730    -5 A     America/…
##  8 0G7   Finger Lakes Regional Airport   42.9  -76.8   492    -5 A     America/…
##  9 0P2   Shoestring Aviation Airfield    39.8  -76.6  1000    -5 U     America/…
## 10 0S9   Jefferson County Intl           48.1 -123.    108    -8 A     America/…
## # … with 1,347 more rows
# number of records that do not match
nrow(airport_antijoin)
## [1] 1357

Problem 6

Load the data_Windmill csv file into R and perform a regression

#Windmill file loaded
Windmill <- read.csv("G:/Other computers/My Laptop/Documents/Richard 621/Week 7/data_Windmill.csv")

#a) Perform regression on this dataset using Velocity as the covariate (i.e., the independent variable), and use Output as the target variable.
windmill_1 <- lm(Output ~ Velocity, data = Windmill)
summary(windmill_1)
## 
## Call:
## lm(formula = Output ~ Velocity, data = Windmill)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.59869 -0.14099  0.06059  0.17262  0.32184 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.13088    0.12599   1.039     0.31    
## Velocity     0.24115    0.01905  12.659 7.55e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2361 on 23 degrees of freedom
## Multiple R-squared:  0.8745, Adjusted R-squared:  0.869 
## F-statistic: 160.3 on 1 and 23 DF,  p-value: 7.546e-12

Create a scatter plot comparing the residuals to the predictions from the regression

#b)scatter plot to evaluate linearity
plot(windmill_1$fitted.values, windmill_1$residuals, pch = 20, col = "blue")
abline(h = 0)

Determine the best value for λ to use in a Box-Cox transformation since the scatter plot is non linear and create a new regression based on the ideal value of λ

#c)
# library used to help determine λ 
library(MASS)

# determine λ   λ=2
boxcox(windmill_1)

# transformation using the value of λ
windmill_1b = lm(I(Output^2) ~ Velocity, data = Windmill)
plot(windmill_1b$fitted.values, windmill_1b$residuals, pch = 20, col = "blue")
abline(h = 0)

Problem 7

Load the data_Windmill2 and data_Windmill3 csv files into R.

#windmill2 dataset
Windmill_2<- read.csv("G:/Other computers/My Laptop/Documents/Richard 621/Week 7/data_Windmill2.csv")
#of rows in the dataset
nrow(Windmill_2)
## [1] 25
#windmill3 dataset
Windmill_3<- read.csv("G:/Other computers/My Laptop/Documents/Richard 621/Week 7/data_Windmill3.csv")
#of rows in the dataset
nrow(Windmill_3)
## [1] 11

Using a set operation, determine the number of rows that are found in both datasets.

#a)
# of rows found in both data sets using intersect (total of 10)
intersect(Windmill_2, Windmill_3)
##    Velocity Output
## 1      9.70  2.386
## 2      9.55  2.294
## 3      3.05  0.558
## 4      8.15  2.166
## 5      6.20  1.866
## 6      2.90  0.653
## 7      6.35  1.930
## 8      4.60  1.562
## 9      5.80  1.737
## 10     7.40  2.088

Using a set operation, determine the number of rows that are contained in the data_Windmill2 file, but not in the data_Windmill3 file.

#b)
# of rows found in the windmill_2 dataset but not in the windmill_3 data set (total of 15)
setdiff(Windmill_2, Windmill_3)
##    Velocity Output
## 1      5.00  1.582
## 2      6.00  1.822
## 3      3.40  1.057
## 4      2.70  0.500
## 5     10.00  2.236
## 6      3.60  1.137
## 7      7.85  2.179
## 8      8.80  2.112
## 9      7.00  1.800
## 10     5.45  1.501
## 11     9.10  2.303
## 12    10.20  2.310
## 13     4.10  1.194
## 14     3.95  1.144
## 15     2.45  0.123

Using a set operation, determine the number of unique rows that are contained in the datasets.

#c)
# of rows unique rows in datasets (total of 26)
union(Windmill_2, Windmill_3)
##    Velocity Output
## 1      5.00  1.582
## 2      6.00  1.822
## 3      3.40  1.057
## 4      2.70  0.500
## 5     10.00  2.236
## 6      9.70  2.386
## 7      9.55  2.294
## 8      3.05  0.558
## 9      8.15  2.166
## 10     6.20  1.866
## 11     2.90  0.653
## 12     6.35  1.930
## 13     4.60  1.562
## 14     5.80  1.737
## 15     7.40  2.088
## 16     3.60  1.137
## 17     7.85  2.179
## 18     8.80  2.112
## 19     7.00  1.800
## 20     5.45  1.501
## 21     9.10  2.303
## 22    10.20  2.310
## 23     4.10  1.194
## 24     3.95  1.144
## 25     2.45  0.123
## 26     7.10  2.100