#load the nycflights13 dataset into R
library(nycflights13)
#datasets named to make them easily distinguishable
flights <- nycflights13::flights
planes <- nycflights13::planes
airports <- nycflights13::airports
#ran library tidyverse to run the code below
library(tidyverse)
#semi join is used to keep all observations from the flights dataframe that have matching tail numbers in the
#planes dataframe
flights %>% semi_join(planes, by = "tailnum") -> filter_join
filter_join
## # A tibble: 284,170 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 849 851
## # … with 284,160 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# number of records that have matching tail numbers
nrow(filter_join)
## [1] 284170
#anti join is used to drop all observations in flights that have a match in planes
flights %>% anti_join(planes, by = "tailnum") -> anti_join
anti_join
## # A tibble: 52,606 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 558 600 -2 753 745
## 2 2013 1 1 559 600 -1 941 910
## 3 2013 1 1 600 600 0 837 825
## 4 2013 1 1 602 605 -3 821 805
## 5 2013 1 1 608 600 8 807 735
## 6 2013 1 1 611 600 11 945 931
## 7 2013 1 1 623 610 13 920 915
## 8 2013 1 1 624 630 -6 840 830
## 9 2013 1 1 628 630 -2 1137 1140
## 10 2013 1 1 629 630 -1 824 810
## # … with 52,596 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# number of records that do not match
nrow(anti_join)
## [1] 52606
# anti join used again and since they do not have the same column name faa = dest
airports %>% anti_join(flights, by = c("faa" = "dest")) -> airport_antijoin
airport_antijoin
## # A tibble: 1,357 × 8
## faa name lat lon alt tz dst tzone
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/…
## 2 06A Moton Field Municipal Airport 32.5 -85.7 264 -6 A America/…
## 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/…
## 4 06N Randall Airport 41.4 -74.4 523 -5 A America/…
## 5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A America/…
## 6 0A9 Elizabethton Municipal Airport 36.4 -82.2 1593 -5 A America/…
## 7 0G6 Williams County Airport 41.5 -84.5 730 -5 A America/…
## 8 0G7 Finger Lakes Regional Airport 42.9 -76.8 492 -5 A America/…
## 9 0P2 Shoestring Aviation Airfield 39.8 -76.6 1000 -5 U America/…
## 10 0S9 Jefferson County Intl 48.1 -123. 108 -8 A America/…
## # … with 1,347 more rows
# number of records that do not match
nrow(airport_antijoin)
## [1] 1357
#Windmill file loaded
Windmill <- read.csv("G:/Other computers/My Laptop/Documents/Richard 621/Week 7/data_Windmill.csv")
#a) Perform regression on this dataset using Velocity as the covariate (i.e., the independent variable), and use Output as the target variable.
windmill_1 <- lm(Output ~ Velocity, data = Windmill)
summary(windmill_1)
##
## Call:
## lm(formula = Output ~ Velocity, data = Windmill)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.59869 -0.14099 0.06059 0.17262 0.32184
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.13088 0.12599 1.039 0.31
## Velocity 0.24115 0.01905 12.659 7.55e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2361 on 23 degrees of freedom
## Multiple R-squared: 0.8745, Adjusted R-squared: 0.869
## F-statistic: 160.3 on 1 and 23 DF, p-value: 7.546e-12
#b)scatter plot to evaluate linearity
plot(windmill_1$fitted.values, windmill_1$residuals, pch = 20, col = "blue")
abline(h = 0)
#c)
# library used to help determine λ
library(MASS)
# determine λ λ=2
boxcox(windmill_1)
# transformation using the value of λ
windmill_1b = lm(I(Output^2) ~ Velocity, data = Windmill)
plot(windmill_1b$fitted.values, windmill_1b$residuals, pch = 20, col = "blue")
abline(h = 0)
#windmill2 dataset
Windmill_2<- read.csv("G:/Other computers/My Laptop/Documents/Richard 621/Week 7/data_Windmill2.csv")
#of rows in the dataset
nrow(Windmill_2)
## [1] 25
#windmill3 dataset
Windmill_3<- read.csv("G:/Other computers/My Laptop/Documents/Richard 621/Week 7/data_Windmill3.csv")
#of rows in the dataset
nrow(Windmill_3)
## [1] 11
#a)
# of rows found in both data sets using intersect (total of 10)
intersect(Windmill_2, Windmill_3)
## Velocity Output
## 1 9.70 2.386
## 2 9.55 2.294
## 3 3.05 0.558
## 4 8.15 2.166
## 5 6.20 1.866
## 6 2.90 0.653
## 7 6.35 1.930
## 8 4.60 1.562
## 9 5.80 1.737
## 10 7.40 2.088
#b)
# of rows found in the windmill_2 dataset but not in the windmill_3 data set (total of 15)
setdiff(Windmill_2, Windmill_3)
## Velocity Output
## 1 5.00 1.582
## 2 6.00 1.822
## 3 3.40 1.057
## 4 2.70 0.500
## 5 10.00 2.236
## 6 3.60 1.137
## 7 7.85 2.179
## 8 8.80 2.112
## 9 7.00 1.800
## 10 5.45 1.501
## 11 9.10 2.303
## 12 10.20 2.310
## 13 4.10 1.194
## 14 3.95 1.144
## 15 2.45 0.123
#c)
# of rows unique rows in datasets (total of 26)
union(Windmill_2, Windmill_3)
## Velocity Output
## 1 5.00 1.582
## 2 6.00 1.822
## 3 3.40 1.057
## 4 2.70 0.500
## 5 10.00 2.236
## 6 9.70 2.386
## 7 9.55 2.294
## 8 3.05 0.558
## 9 8.15 2.166
## 10 6.20 1.866
## 11 2.90 0.653
## 12 6.35 1.930
## 13 4.60 1.562
## 14 5.80 1.737
## 15 7.40 2.088
## 16 3.60 1.137
## 17 7.85 2.179
## 18 8.80 2.112
## 19 7.00 1.800
## 20 5.45 1.501
## 21 9.10 2.303
## 22 10.20 2.310
## 23 4.10 1.194
## 24 3.95 1.144
## 25 2.45 0.123
## 26 7.10 2.100