1.Load the nycflights13 dataset in R using the following code:
library(nycflights13)
flights <- nycflights13::flights
planes <- nycflights13::planes
airports <- nycflights13::airports
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
flights %>% semi_join(planes, by = "tailnum") -> joined_data
nrow(joined_data)
## [1] 284170
4.Using a filter join, determine the number of records in the flights dataset with tail numbers that do not match records in the planes dataset.
flights %>% anti_join(planes, by = "tailnum") -> join_data2
nrow(join_data2)
## [1] 52606
airports %>% anti_join(flights,by = c("faa" = "dest")) -> join_data3
nrow(join_data3)
## [1] 1357
W1 <- read.csv("/Users/jusimioni/Desktop/data_Windmill.csv")
W2 <- read.csv("/Users/jusimioni/Desktop/data_Windmill2.csv")
W3 <- read.csv("/Users/jusimioni/Desktop/data_Windmill3.csv")
colnames(W1)
## [1] "Observation_Number" "Velocity" "Output"
model <- lm(Velocity ~ Output, data = W1)
summary(model)
##
## Call:
## lm(formula = Velocity ~ Output, data = W1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3594 -0.7940 -0.2882 0.7315 1.7089
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2950 0.4961 0.595 0.558
## Output 3.6264 0.2865 12.659 7.55e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9154 on 23 degrees of freedom
## Multiple R-squared: 0.8745, Adjusted R-squared: 0.869
## F-statistic: 160.3 on 1 and 23 DF, p-value: 7.546e-12
plot(model$fitted.values, model$residuals, pch = 20, col = "blue")
abline(h = 0)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
boxcox(model)
Based on the results a reciprocal transformation may be beneficial to the analysis.
model1b = lm(I((1/W1$Velocity)) ~ Output, data = W1)
summary(model1b)
##
## Call:
## lm(formula = I((1/W1$Velocity)) ~ Output, data = W1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0293165 -0.0089289 0.0006147 0.0104300 0.0185666
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.424932 0.007286 58.32 <2e-16 ***
## Output -0.141325 0.004207 -33.59 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01344 on 23 degrees of freedom
## Multiple R-squared: 0.98, Adjusted R-squared: 0.9792
## F-statistic: 1128 on 1 and 23 DF, p-value: < 2.2e-16
plot(model1b$fitted.values, model1b$residuals, pch = 20, col = "blue")
abline(h = 0)
boxcox(model1b)
intersect(W2,W3)
## Velocity Output
## 1 9.70 2.386
## 2 9.55 2.294
## 3 3.05 0.558
## 4 8.15 2.166
## 5 6.20 1.866
## 6 2.90 0.653
## 7 6.35 1.930
## 8 4.60 1.562
## 9 5.80 1.737
## 10 7.40 2.088
No data is found in both data files.
setdiff(W2,W3)
## Velocity Output
## 1 5.00 1.582
## 2 6.00 1.822
## 3 3.40 1.057
## 4 2.70 0.500
## 5 10.00 2.236
## 6 3.60 1.137
## 7 7.85 2.179
## 8 8.80 2.112
## 9 7.00 1.800
## 10 5.45 1.501
## 11 9.10 2.303
## 12 10.20 2.310
## 13 4.10 1.194
## 14 3.95 1.144
## 15 2.45 0.123
19 rows
c. Using a set operation, determine the number of unique rows that are
contained in the datasets.
union(W2,W3)
## Velocity Output
## 1 5.00 1.582
## 2 6.00 1.822
## 3 3.40 1.057
## 4 2.70 0.500
## 5 10.00 2.236
## 6 9.70 2.386
## 7 9.55 2.294
## 8 3.05 0.558
## 9 8.15 2.166
## 10 6.20 1.866
## 11 2.90 0.653
## 12 6.35 1.930
## 13 4.60 1.562
## 14 5.80 1.737
## 15 7.40 2.088
## 16 3.60 1.137
## 17 7.85 2.179
## 18 8.80 2.112
## 19 7.00 1.800
## 20 5.45 1.501
## 21 9.10 2.303
## 22 10.20 2.310
## 23 4.10 1.194
## 24 3.95 1.144
## 25 2.45 0.123
## 26 7.10 2.100