Load the nycflights13 dataset in R using the following code: library(nycflights13)
library(nycflights13)
The nycflights13 dataset contains multiple data frames. One of them is called flights, another is called airports, and another is called planes. To make it easier to refer to these data frames without needing to type the name of the dataset (i.e., nycflights13), type the following code in R: flights <- nycflights13::flights; planes <- nycflights13::planes; airports <- nycflights13::airports
# Bind the three nycflights13 tables to short workspace names so the package
# prefix is not needed below.
airports <- nycflights13::airports
planes <- nycflights13::planes
flights <- nycflights13::flights
# List the column names of flights.
names(flights)
## [1] "year" "month" "day" "dep_time"
## [5] "sched_dep_time" "dep_delay" "arr_time" "sched_arr_time"
## [9] "arr_delay" "carrier" "flight" "tailnum"
## [13] "origin" "dest" "air_time" "distance"
## [17] "hour" "minute" "time_hour"
# Re-wrap flights with data.frame() and show the type of every column.
flights <- data.frame(flights)
str(flights)
## 'data.frame': 336776 obs. of 19 variables:
## $ year : int 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_time : int 517 533 542 544 554 554 555 557 557 558 ...
## $ sched_dep_time: int 515 529 540 545 600 558 600 600 600 600 ...
## $ dep_delay : num 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
## $ arr_time : int 830 850 923 1004 812 740 913 709 838 753 ...
## $ sched_arr_time: int 819 830 850 1022 837 728 854 723 846 745 ...
## $ arr_delay : num 11 20 33 -18 -25 12 19 -14 -8 8 ...
## $ carrier : chr "UA" "UA" "AA" "B6" ...
## $ flight : int 1545 1714 1141 725 461 1696 507 5708 79 301 ...
## $ tailnum : chr "N14228" "N24211" "N619AA" "N804JB" ...
## $ origin : chr "EWR" "LGA" "JFK" "JFK" ...
## $ dest : chr "IAH" "IAH" "MIA" "BQN" ...
## $ air_time : num 227 227 160 183 116 150 158 53 140 138 ...
## $ distance : num 1400 1416 1089 1576 762 ...
## $ hour : num 5 5 5 5 6 5 6 6 6 6 ...
## $ minute : num 15 29 40 45 0 58 0 0 0 0 ...
## $ time_hour : POSIXct, format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
# List the column names of planes.
names(planes)
## [1] "tailnum" "year" "type" "manufacturer" "model"
## [6] "engines" "seats" "speed" "engine"
# Re-wrap planes with data.frame() and show the type of every column.
planes <- data.frame(planes)
str(planes)
## 'data.frame': 3322 obs. of 9 variables:
## $ tailnum : chr "N10156" "N102UW" "N103US" "N104UW" ...
## $ year : int 2004 1998 1999 1999 2002 1999 1999 1999 1999 1999 ...
## $ type : chr "Fixed wing multi engine" "Fixed wing multi engine" "Fixed wing multi engine" "Fixed wing multi engine" ...
## $ manufacturer: chr "EMBRAER" "AIRBUS INDUSTRIE" "AIRBUS INDUSTRIE" "AIRBUS INDUSTRIE" ...
## $ model : chr "EMB-145XR" "A320-214" "A320-214" "A320-214" ...
## $ engines : int 2 2 2 2 2 2 2 2 2 2 ...
## $ seats : int 55 182 182 182 55 182 182 182 182 182 ...
## $ speed : int NA NA NA NA NA NA NA NA NA NA ...
## $ engine : chr "Turbo-fan" "Turbo-fan" "Turbo-fan" "Turbo-fan" ...
# List the column names of airports.
names(airports)
## [1] "faa" "name" "lat" "lon" "alt" "tz" "dst" "tzone"
# Re-wrap airports with data.frame() and show the type of every column.
airports <- data.frame(airports)
str(airports)
## 'data.frame': 1458 obs. of 8 variables:
## $ faa : chr "04G" "06A" "06C" "06N" ...
## $ name : chr "Lansdowne Airport" "Moton Field Municipal Airport" "Schaumburg Regional" "Randall Airport" ...
## $ lat : num 41.1 32.5 42 41.4 31.1 ...
## $ lon : num -80.6 -85.7 -88.1 -74.4 -81.4 ...
## $ alt : num 1044 264 801 523 11 ...
## $ tz : num -5 -6 -6 -5 -5 -5 -5 -5 -5 -8 ...
## $ dst : chr "A" "A" "A" "A" ...
## $ tzone: chr "America/New_York" "America/Chicago" "America/Chicago" "America/New_York" ...
# Row counts of the two tables being compared.
nrow(flights)
## [1] 336776
nrow(planes)
## [1] 3322
# Keep the flights rows whose tailnum has a match in planes, then count them.
filter1 <- semi_join(flights, planes, by = "tailnum")
nrow(filter1)
## [1] 284170
The number of records in the flights dataset whose tail numbers match a record in the planes dataset is 284,170.
# Row counts of the two tables being compared.
nrow(flights)
## [1] 336776
nrow(planes)
## [1] 3322
# Keep the flights rows whose tailnum has no match in planes, then count them.
filter2 <- anti_join(flights, planes, by = "tailnum")
nrow(filter2)
## [1] 52606
The number of records in the flights dataset whose tail numbers do not match any record in the planes dataset is 52,606.
# Row counts of the two tables being compared.
nrow(flights)
## [1] 336776
nrow(airports)
## [1] 1458
# Keep the airports rows whose faa code never occurs as a dest value in
# flights, then count them.
filter3 <- anti_join(airports, flights, by = c("faa" = "dest"))
nrow(filter3)
## [1] 1357
The number of airports that do not have a matching destination value in the flights dataset is 1,357.
Load the data_Windmill csv file into R. This datafile can be found in the “Week 7” folder of our Google Drive.
# Read the windmill measurements from the local homework folder.
windmill <- read.csv("C:/Users/justt/Desktop/School/621/Assignment/Homework 4/data_Windmill.csv")
colnames(windmill)
## [1] "Observation_Number" "Velocity" "Output"
# Observation_Number is not relevant as it functions as more of a label and will not be used in these models.
# Keep the two modelling columns by name rather than dropping column 1 by
# position, so the step does not depend on the column order of the CSV.
windmill <- windmill[, c("Velocity", "Output")]
# Show the remaining columns and their types.
str(windmill)
## 'data.frame': 25 obs. of 2 variables:
## $ Velocity: num 5 6 3.4 2.7 10 9.7 9.55 3.05 8.15 6.2 ...
## $ Output : num 1.58 1.82 1.06 0.5 2.24 ...
colnames(windmill)
## [1] "Velocity" "Output"
# Fit a simple linear regression of Output on Velocity using the windmill data.
model1 <- lm(Output ~ Velocity, data = windmill)
# Print the coefficient table and overall fit statistics for model1.
summary(model1)
##
## Call:
## lm(formula = Output ~ Velocity, data = windmill)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.59869 -0.14099 0.06059 0.17262 0.32184
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.13088 0.12599 1.039 0.31
## Velocity 0.24115 0.01905 12.659 7.55e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2361 on 23 degrees of freedom
## Multiple R-squared: 0.8745, Adjusted R-squared: 0.869
## F-statistic: 160.3 on 1 and 23 DF, p-value: 7.546e-12
# Residuals of model1 plotted against its fitted values, with a horizontal
# reference line at zero.
plot(model1$fitted.values, model1$residuals, main = "Scatter Plot", pch = 20)
abline(h=0, col="blue")
# Box-Cox plot for model1, used to choose the power (lambda) applied to Output.
# NOTE(review): boxcox() is not loaded in the visible code; presumably
# MASS::boxcox -- confirm.
boxcox(model1)
# The nearest integer to the lambda read from the plot is 2.
# Refit with Output squared (lambda = 2) as the response.
model2 <- lm(I((Output)^2) ~ Velocity, data = windmill)
# Repeat the Box-Cox plot on the transformed model to check whether a further
# transformation is suggested.
boxcox(model2)
The value of λ to use in a Box-Cox transformation is 2, as seen in the Box-Cox plot for model1. A new regression, model2, was then fit with Output squared as the response. The nearest integer value of λ in the Box-Cox plot for model2 is 1, meaning no additional transformation is needed.
Load the data_Windmill2 and data_Windmill3 csv files into R. These datafiles can also be found in the “Week 7” folder of our Google Drive.
# Read the second windmill file, then list its columns and their types.
windmill2 <- read.csv(file = "C:/Users/justt/Desktop/School/621/Assignment/Homework 4/data_Windmill2.csv")
names(windmill2)
## [1] "Velocity" "Output"
str(windmill2)
## 'data.frame': 25 obs. of 2 variables:
## $ Velocity: num 5 6 3.4 2.7 10 9.7 9.55 3.05 8.15 6.2 ...
## $ Output : num 1.58 1.82 1.06 0.5 2.24 ...
# Read the third windmill file, then list its columns and their types.
windmill3 <- read.csv(file = "C:/Users/justt/Desktop/School/621/Assignment/Homework 4/data_Windmill3.csv")
names(windmill3)
## [1] "Velocity" "Output"
str(windmill3)
## 'data.frame': 11 obs. of 2 variables:
## $ Velocity: num 9.7 9.55 3.05 8.15 6.2 7.1 2.9 6.35 4.6 5.8 ...
## $ Output : num 2.386 2.294 0.558 2.166 1.866 ...
# Stack the two files end to end. This gives the total row count (25 + 11),
# not the rows the files share.
combined <- rbind(windmill2, windmill3)
nrow(combined)
## [1] 36
# Rows present in both files are a set intersection, not a stack.
# The count follows from the other results in this document: windmill2 has 25
# rows and 15 of them are absent from windmill3, leaving 10 shared rows
# (equivalently 25 + 11 - 26 unique rows).
intersect1 <- intersect(windmill2, windmill3)
nrow(intersect1)
## [1] 10
Stacking the two datasets gives 36 rows in total, but the number of rows that are found in both datasets is 10.
# Rows of windmill2 that have no identical row in windmill3, then their count.
setdiff1 <- windmill2 %>% setdiff(windmill3)
nrow(setdiff1)
## [1] 15
The number of rows that are contained in the data_Windmill2 file but not in the data_Windmill3 file is 15.
# Distinct rows drawn from windmill2 and windmill3 together, then their count.
union1 <- windmill2 %>% union(windmill3)
nrow(union1)
## [1] 26
The number of unique rows contained across the two datasets is 26.