library(ISLR2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Compact column-by-column preview of the Auto data frame (types and first values).
dplyr::glimpse(Auto)
## Rows: 392
## Columns: 9
## $ mpg <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, 15, 14, 15, 14, 2…
## $ cylinders <int> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 6, 6, 6, 4, …
## $ displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390, 383, 34…
## $ horsepower <int> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190, 170, 16…
## $ weight <int> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 4425, 385…
## $ acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10.0, 8.5, …
## $ year <int> 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 7…
## $ origin <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, …
## $ name <fct> chevrolet chevelle malibu, buick skylark 320, plymouth sa…
# Copy of Auto without the `name` column; used later for the per-variable fits.
Auto_numeric <- dplyr::select(.data = Auto, -name)

# Total number of missing cells in Auto.
sum(is.na(Auto))
## [1] 0
# Minimum and maximum of each of the first seven columns of Auto, laid out as a
# two-row table (one column per variable).
range_Auto <- data.frame(
  vapply(Auto[, 1:7], range, FUN.VALUE = numeric(2))
)
row.names(range_Auto) <- c("min:", "max:")
range_Auto
## mpg cylinders displacement horsepower weight acceleration year
## min: 9.0 3 68 46 1613 8.0 70
## max: 46.6 8 455 230 5140 24.8 82
# Mean of each of the first seven columns of Auto (named numeric vector).
vapply(Auto[, 1:7], mean, FUN.VALUE = numeric(1))
## mpg cylinders displacement horsepower weight acceleration
## 23.445918 5.471939 194.411990 104.469388 2977.584184 15.541327
## year
## 75.979592
# Standard deviation of each of the first seven columns of Auto.
vapply(Auto[, 1:7], sd, FUN.VALUE = numeric(1))
## mpg cylinders displacement horsepower weight acceleration
## 7.805007 1.705783 104.644004 38.491160 849.402560 2.758864
## year
## 3.683737
# Drop rows 10 through 85 (by position) and recompute the min/max table on the
# remaining observations.
Auto_2 <- Auto[-(10:85), ]
range_Auto_2 <- data.frame(
  vapply(Auto_2[, 1:7], range, FUN.VALUE = numeric(2))
)
row.names(range_Auto_2) <- c("min:", "max:")
range_Auto_2
## mpg cylinders displacement horsepower weight acceleration year
## min: 11.0 3 68 46 1649 8.5 70
## max: 46.6 8 455 230 4997 24.8 82
# Mean of each of the first seven columns of the reduced data set Auto_2.
vapply(Auto_2[, 1:7], mean, FUN.VALUE = numeric(1))
## mpg cylinders displacement horsepower weight acceleration
## 24.404430 5.373418 187.240506 100.721519 2935.971519 15.726899
## year
## 77.145570
# Standard deviation of each of the first seven columns of Auto_2.
vapply(Auto_2[, 1:7], sd, FUN.VALUE = numeric(1))
## mpg cylinders displacement horsepower weight acceleration
## 7.867283 1.654179 99.678367 35.708853 811.300208 2.693721
## year
## 3.106217
# Scatterplot matrix of the first seven columns of Auto.
pairs(Auto[1:7])

# Turn the origin codes into a labelled factor; the labels are assigned to the
# sorted distinct values of `origin`, in that order.
Auto[["origin"]] <- factor(
  Auto[["origin"]],
  labels = c("American", "European", "Japanese")
)
library(ggplot2)
# Scatterplot of acceleration against weight for all rows of Auto, with
# thousands separators on the x axis.
ggplot(data = Auto, mapping = aes(x = weight, y = acceleration)) +
  geom_point() +
  scale_x_continuous(labels = scales::comma_format()) +
  ggtitle("Correlation between weight and acceleration") +
  xlab("Weight") +
  ylab("Acceleration") +
  theme(legend.position = "none")

# Same weight/acceleration scatterplot, split into one panel per origin and
# coloured by origin, with a linear fit drawn in each panel.
ggplot(
  data = Auto,
  mapping = aes(x = weight, y = acceleration, colour = origin)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(vars(origin)) +
  scale_x_continuous(labels = scales::comma_format()) +
  ggtitle("Correlation between weight and acceleration, by origin") +
  xlab("Weight") +
  ylab("Acceleration") +
  theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'

# Jittered scatterplot of displacement against calendar year; `year` is stored
# as a two-digit value, so 1900 is added for the axis.
ggplot(data = Auto, mapping = aes(x = year + 1900, y = displacement)) +
  geom_jitter() +
  ggtitle("Engine Displacement (trends over time)") +
  xlab("Year") +
  ylab("Displacement") +
  theme(legend.position = "none")

# Simple linear regression of displacement on year, summarised.
summary(lm(formula = displacement ~ year, data = Auto))
##
## Call:
## lm(formula = displacement ~ year, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -175.73 -75.58 -13.45 69.69 229.28
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 992.691 101.661 9.765 < 2e-16 ***
## year -10.506 1.336 -7.862 3.75e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 97.35 on 390 degrees of freedom
## Multiple R-squared: 0.1368, Adjusted R-squared: 0.1346
## F-statistic: 61.8 on 1 and 390 DF, p-value: 3.748e-14
# Displacement against calendar year, one panel per origin, each with its own
# linear fit. `origin` has already been converted to a factor earlier in the
# script, so it is mapped to colour directly (no extra factor() wrapper), which
# matches how the weight/acceleration-by-origin plot maps it.
ggplot(
  data = Auto,
  mapping = aes(x = year + 1900, y = displacement, colour = origin)
) +
  geom_jitter() +
  geom_smooth(method = "lm") +
  facet_wrap(~ origin) +
  theme(legend.position = "none") +
  labs(
    x = "Year",
    y = "Displacement",
    title = "Engine Displacement (trends over time), by Origin"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Regression of displacement on year, origin, and their interaction, so each
# origin level gets its own intercept and year slope.
summary(
  lm(formula = displacement ~ year * origin, data = Auto)
)
##
## Call:
## lm(formula = displacement ~ year * origin, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -217.920 -26.784 -4.066 33.700 174.813
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1258.425 91.656 13.730 < 2e-16 ***
## year -13.373 1.211 -11.042 < 2e-16 ***
## originEuropean -1252.625 208.468 -6.009 4.33e-09 ***
## originJapanese -1215.032 190.072 -6.392 4.71e-10 ***
## year:originEuropean 14.745 2.752 5.357 1.46e-07 ***
## year:originJapanese 14.139 2.466 5.734 1.98e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 69.25 on 386 degrees of freedom
## Multiple R-squared: 0.5677, Adjusted R-squared: 0.5621
## F-statistic: 101.4 on 5 and 386 DF, p-value: < 2.2e-16
# Derive a `brand` factor from the first word of each car name, mapping known
# misspellings and abbreviations onto their canonical brand.
Auto$brand <- local({
  # Misspelt / abbreviated brand (name) -> canonical brand (value).
  brand_fixes <- c(
    vokswagen = "volkswagen",
    vw        = "volkswagen",
    toyouta   = "toyota",
    chevroelt = "chevrolet",
    chevy     = "chevrolet",
    maxda     = "mazda"
  )

  # First space-separated token of every name.
  first_word <- vapply(
    strsplit(as.character(Auto$name), split = " "),
    function(tokens) tokens[1],
    FUN.VALUE = character(1)
  )

  # Replace only the entries that appear in the lookup table.
  to_fix <- first_word %in% names(brand_fixes)
  first_word[to_fix] <- unname(brand_fixes[first_word[to_fix]])

  factor(first_word)
})
library(forcats)
# Keep the nine most frequent brands and pool all remaining brands into a
# single "uncommon" level, then tabulate the resulting counts.
Auto[["brand"]] <- forcats::fct_lump(
  Auto[["brand"]],
  n = 9,
  other_level = "uncommon"
)
table(Auto[["brand"]])
##
## amc buick chevrolet datsun dodge ford plymouth
## 27 17 47 23 28 48 31
## toyota volkswagen uncommon
## 26 22 123
# Boxplots of mpg for each level of the derived `brand` factor.
ggplot(data = Auto, mapping = aes(x = brand, y = mpg, fill = brand)) +
  geom_boxplot() +
  ggtitle("Brand vs Mpg - Boxplot", subtitle = "Engineered feature") +
  xlab("Brand") +
  ylab("MPG") +
  theme(legend.position = "none")

# Boxplots of mpg for each origin level.
ggplot(data = Auto, mapping = aes(x = origin, y = mpg, fill = origin)) +
  geom_boxplot() +
  ggtitle("Origin vs Mpg - Boxplot") +
  xlab("Origin") +
  ylab("MPG") +
  theme(legend.position = "none")

# Fit one simple linear regression of mpg on every other column of
# Auto_numeric and pick the predictor whose model has the highest R-squared.
#
# The list of models is named by predictor: lapply() keeps the names of its
# input, and vapply() carries them over to `r_squared`. Without those names,
# names() on the R-squared vector is NULL and no variable can be reported.
models <- lapply(
  setNames(nm = setdiff(names(Auto_numeric), "mpg")),
  function(var) {
    lm(reformulate(var, response = "mpg"), data = Auto_numeric)
  }
)

# One R-squared per model, as a named numeric vector.
r_squared <- vapply(
  models,
  function(model) summary(model)$r.squared,
  FUN.VALUE = numeric(1)
)

# Name of the predictor with the largest R-squared.
best_var <- names(which.max(r_squared))
best_var