library(ISLR2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Structural overview of the Auto data: one line per column with its type
# and the first few values.
Auto %>% glimpse()
## Rows: 392
## Columns: 9
## $ mpg          <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, 15, 14, 15, 14, 2…
## $ cylinders    <int> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 6, 6, 6, 4, …
## $ displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390, 383, 34…
## $ horsepower   <int> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190, 170, 16…
## $ weight       <int> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 4425, 385…
## $ acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10.0, 8.5, …
## $ year         <int> 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 7…
## $ origin       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, …
## $ name         <fct> chevrolet chevelle malibu, buick skylark 320, plymouth sa…
# Copy of the data without the `name` factor, leaving only the numeric and
# integer columns.
Auto_numeric <- Auto %>% select(-name)
# Total number of missing cells across the whole data frame.
sum(is.na(Auto)) # 0
## [1] 0
# Range, mean and standard deviation of the first seven columns
# (mpg through year). vapply() pins the shape of each per-column result.
range_Auto <- data.frame(
  vapply(Auto[1:7], range, FUN.VALUE = numeric(2)),
  row.names = c("min:", "max:")
)
range_Auto
##       mpg cylinders displacement horsepower weight acceleration year
## min:  9.0         3           68         46   1613          8.0   70
## max: 46.6         8          455        230   5140         24.8   82
vapply(Auto[1:7], mean, FUN.VALUE = numeric(1))
##          mpg    cylinders displacement   horsepower       weight acceleration 
##    23.445918     5.471939   194.411990   104.469388  2977.584184    15.541327 
##         year 
##    75.979592
vapply(Auto[1:7], sd, FUN.VALUE = numeric(1))
##          mpg    cylinders displacement   horsepower       weight acceleration 
##     7.805007     1.705783   104.644004    38.491160   849.402560     2.758864 
##         year 
##     3.683737
# Repeat the range / mean / sd summaries after dropping rows 10 through 85.
Auto_2 <- Auto[-(10:85), ]
range_Auto_2 <- data.frame(
  vapply(Auto_2[1:7], range, FUN.VALUE = numeric(2)),
  row.names = c("min:", "max:")
)
range_Auto_2
##       mpg cylinders displacement horsepower weight acceleration year
## min: 11.0         3           68         46   1649          8.5   70
## max: 46.6         8          455        230   4997         24.8   82
vapply(Auto_2[1:7], mean, FUN.VALUE = numeric(1))
##          mpg    cylinders displacement   horsepower       weight acceleration 
##    24.404430     5.373418   187.240506   100.721519  2935.971519    15.726899 
##         year 
##    77.145570
vapply(Auto_2[1:7], sd, FUN.VALUE = numeric(1))
##          mpg    cylinders displacement   horsepower       weight acceleration 
##     7.867283     1.654179    99.678367    35.708853   811.300208     2.693721 
##         year 
##     3.106217
# Scatterplot matrix of the first seven columns (mpg through year).
pairs(Auto[1:7])

# Turn the integer origin codes into a labelled factor; the labels are
# applied to the sorted unique codes in order.
Auto[["origin"]] <- factor(
  Auto[["origin"]],
  labels = c("American", "European", "Japanese")
)
library(ggplot2)
# Scatter plot of acceleration against weight for every car; the x axis
# uses comma-separated thousands.
Auto %>%
  ggplot(mapping = aes(x = weight, y = acceleration)) +
  geom_point() +
  scale_x_continuous(labels = scales::comma_format()) +
  xlab("Weight") +
  ylab("Acceleration") +
  ggtitle("Correlation between weight and acceleration") +
  theme(legend.position = "none")

# Same weight/acceleration scatter, one panel per origin, each with its own
# linear fit. Colour follows origin; the legend is hidden because the panel
# titles already identify the groups.
Auto %>%
  ggplot(mapping = aes(x = weight, y = acceleration, colour = origin)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~ origin) +
  scale_x_continuous(labels = scales::comma_format()) +
  xlab("Weight") +
  ylab("Acceleration") +
  ggtitle("Correlation between weight and acceleration, by origin") +
  theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'

# Engine displacement over time. `year` is stored as a two-digit value, so
# 1900 is added for the axis; points are jittered to reduce overplotting.
Auto %>%
  ggplot(mapping = aes(x = year + 1900, y = displacement)) +
  geom_jitter() +
  xlab("Year") +
  ylab("Displacement") +
  ggtitle("Engine Displacement (trends over time)") +
  theme(legend.position = "none")

# Simple linear regression of displacement on (two-digit) year.
lm(formula = displacement ~ year, data = Auto) %>%
  summary()
## 
## Call:
## lm(formula = displacement ~ year, data = Auto)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -175.73  -75.58  -13.45   69.69  229.28 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  992.691    101.661   9.765  < 2e-16 ***
## year         -10.506      1.336  -7.862 3.75e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 97.35 on 390 degrees of freedom
## Multiple R-squared:  0.1368, Adjusted R-squared:  0.1346 
## F-statistic:  61.8 on 1 and 390 DF,  p-value: 3.748e-14
# Displacement over time again, now one panel per origin with a per-panel
# linear fit. Colour follows origin and the legend is hidden.
Auto %>%
  ggplot(
    mapping = aes(x = year + 1900, y = displacement, colour = factor(origin))
  ) +
  geom_jitter() +
  geom_smooth(method = "lm") +
  facet_wrap(~ origin) +
  xlab("Year") +
  ylab("Displacement") +
  ggtitle("Engine Displacement (trends over time), by Origin") +
  theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'

# Regression with a year-by-origin interaction, so every origin gets its own
# intercept and its own slope over time.
lm(formula = displacement ~ year * origin, data = Auto) %>%
  summary()
## 
## Call:
## lm(formula = displacement ~ year * origin, data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -217.920  -26.784   -4.066   33.700  174.813 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1258.425     91.656  13.730  < 2e-16 ***
## year                  -13.373      1.211 -11.042  < 2e-16 ***
## originEuropean      -1252.625    208.468  -6.009 4.33e-09 ***
## originJapanese      -1215.032    190.072  -6.392 4.71e-10 ***
## year:originEuropean    14.745      2.752   5.357 1.46e-07 ***
## year:originJapanese    14.139      2.466   5.734 1.98e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 69.25 on 386 degrees of freedom
## Multiple R-squared:  0.5677, Adjusted R-squared:  0.5621 
## F-statistic: 101.4 on 5 and 386 DF,  p-value: < 2.2e-16
# Engineer a `brand` feature from the car name. local() keeps the helper
# objects out of the global environment; only the finished factor is stored.
Auto$brand <- local({
  # The brand is the first space-separated token of each name.
  tokens <- strsplit(as.character(Auto$name), split = " ")
  first_token <- vapply(tokens, function(parts) parts[1], character(1))

  # Misspellings and abbreviations mapped to their canonical spelling.
  canonical <- c(
    vokswagen = "volkswagen",
    vw        = "volkswagen",
    toyouta   = "toyota",
    chevroelt = "chevrolet",
    chevy     = "chevrolet",
    maxda     = "mazda"
  )
  to_fix <- first_token %in% names(canonical)
  first_token[to_fix] <- unname(canonical[first_token[to_fix]])

  factor(first_token)
})

library(forcats)
# Keep the 9 most frequent brands and pool everything else into "uncommon",
# giving 10 categories in total.
Auto$brand <- fct_lump(Auto$brand, n = 9, other_level = "uncommon")

table(Auto$brand)
## 
##        amc      buick  chevrolet     datsun      dodge       ford   plymouth 
##         27         17         47         23         28         48         31 
##     toyota volkswagen   uncommon 
##         26         22        123
# Distribution of mpg for each engineered brand category.
Auto %>%
  ggplot(mapping = aes(x = brand, y = mpg, fill = brand)) +
  geom_boxplot() +
  xlab("Brand") +
  ylab("MPG") +
  ggtitle("Brand vs Mpg - Boxplot", subtitle = "Engineered feature") +
  theme(legend.position = "none")

# Distribution of mpg for each origin.
Auto %>%
  ggplot(mapping = aes(x = origin, y = mpg, fill = origin)) +
  geom_boxplot() +
  xlab("Origin") +
  ylab("MPG") +
  ggtitle("Origin vs Mpg - Boxplot") +
  theme(legend.position = "none")

# Fit one simple regression `mpg ~ <predictor>` per remaining column of
# Auto_numeric and report the predictor with the highest R-squared.
#
# The model list is named by predictor. lapply() over a bare character
# vector returns an unnamed list, which left `r_squared` without names and
# made `best_var` come out as NULL.
predictors <- setdiff(names(Auto_numeric), "mpg")
models <- lapply(setNames(predictors, predictors), function(var) {
  lm(reformulate(var, response = "mpg"), data = Auto_numeric)
})
# vapply() guarantees one numeric R-squared per model and keeps the names.
r_squared <- vapply(
  models,
  function(model) summary(model)$r.squared,
  numeric(1)
)
# which.max() returns the (named) position of the largest R-squared.
best_var <- names(which.max(r_squared))
best_var
## [1] "weight"