```{r}
# Install ISLR only when it is not already available, so that re-knitting the
# document does not reinstall the package (or require a CRAN mirror) each run.
if (!requireNamespace("ISLR", quietly = TRUE)) {
  install.packages("ISLR")
}
```
```{r}
# Attach ISLR, load its Auto data set into the workspace, and show the
# first rows.
library(ISLR)
data("Auto")
head(Auto)
```
```{r}
# Read Auto.csv from the statlearning.com URL; the first row is the header
# and "?" entries are read as NA.
url <- "https://www.statlearning.com/s/Auto.csv"
auto_data <- read.csv(
  url,
  header = TRUE,
  na.strings = "?"
)
head(auto_data)
```
(a) Which of the predictors are quantitative, and which are qualitative?

Quantitative:

- mpg - Miles per gallon
- cylinders - Number of cylinders between 4 and 8
- displacement - Engine displacement (cu. inches)
- horsepower - Engine horsepower
- weight - Vehicle weight (lbs.)
- acceleration - Time to accelerate from 0 to 60 mph (sec.)
- year - Model year (modulo 100)

Qualitative:

- origin - Origin of car (1. American, 2. European, 3. Japanese)
- name - Vehicle name
```{r}
# Minimum and maximum of each of the first seven columns of Auto, laid out
# as a two-row table labelled "min:" / "max:".
range_Auto <- data.frame(
  sapply(Auto[, 1:7], range),
  row.names = c("min:", "max:")
)
range_Auto
```
```{r}
# Mean of each of the first seven columns of Auto (one number per column).
vapply(Auto[, 1:7], mean, numeric(1))
```
```{r}
# Standard deviation of each of the first seven columns of Auto.
vapply(Auto[, 1:7], sd, numeric(1))
```
```{r}
# Copy of Auto with rows 10 through 85 dropped.
Auto_2 <- Auto[-(10:85), ]
```
```{r}
# Minimum and maximum of each of the first seven columns of the reduced
# data set, as a two-row table labelled "min:" / "max:".
range_Auto_2 <- data.frame(
  sapply(Auto_2[, 1:7], range),
  row.names = c("min:", "max:")
)
range_Auto_2
```
```{r}
# Mean of each of the first seven columns of the reduced data set.
vapply(Auto_2[, 1:7], mean, numeric(1))
```
```{r}
# Standard deviation of each of the first seven columns of the reduced
# data set.
vapply(Auto_2[, 1:7], sd, numeric(1))
```
```{r}
# Scatterplot matrix of the first seven columns of Auto.
pairs(Auto[, seq_len(7)])
```
```{r}
# Linear regression of displacement on year; print the model summary.
summary(
  lm(displacement ~ year, data = Auto)
)
```
```{r}
# Linear regression of displacement on year, origin, and their interaction;
# print the model summary.
# NOTE(review): if origin is stored as numeric codes (1/2/3) it enters here
# as a continuous term -- confirm that is intended, or use factor(origin).
summary(
  lm(displacement ~ year * origin, data = Auto)
)
```
```{r}
# Derive a brand column from the vehicle name: the brand is taken to be the
# first space-separated word of `name`.
Auto$brand <- sapply(strsplit(as.character(Auto$name), split = " "),
                     function(x) x[1]) # extract the first item from each list element

# Normalise known misspellings / abbreviations to a single brand label, then
# store the result as a factor.
Auto$brand <- factor(ifelse(Auto$brand %in% c("vokswagen", "vw"), "volkswagen",
                     ifelse(Auto$brand == "toyouta", "toyota",
                     ifelse(Auto$brand %in% c("chevroelt", "chevy"), "chevrolet",
                     ifelse(Auto$brand == "maxda", "mazda",
                     Auto$brand))))) # fixing typos

# Number of cars per brand.
table(Auto$brand)
```
```{r}
library(forcats)

# Brand = first space-separated word of the vehicle name.
name_parts <- strsplit(as.character(Auto$name), split = " ")
brand_raw <- vapply(name_parts, function(parts) parts[1], character(1))

# Map known misspellings / abbreviations onto the canonical brand label;
# anything not listed is left unchanged.
brand_fixes <- c(
  vokswagen = "volkswagen",
  vw        = "volkswagen",
  toyouta   = "toyota",
  chevroelt = "chevrolet",
  chevy     = "chevrolet",
  maxda     = "mazda"
)
needs_fix <- brand_raw %in% names(brand_fixes)
brand_raw[needs_fix] <- unname(brand_fixes[brand_raw[needs_fix]])
Auto$brand <- factor(brand_raw)

# Keep the 9 most frequent brands and pool the rest under "uncommon".
Auto$brand <- fct_lump(Auto$brand, n = 9, other_level = "uncommon")

# Number of cars per (lumped) brand.
table(Auto$brand)
```
```{r}
library(ggplot2)

# Boxplot of mpg for each brand level; boxes are coloured by brand and the
# (redundant) legend is hidden.
ggplot(data = Auto, mapping = aes(x = brand, y = mpg, fill = brand)) +
  geom_boxplot() +
  theme(legend.position = "none") +
  labs(
    title = "Brand vs Mpg - Boxplot",
    subtitle = "Engineered feature",
    x = "Brand",
    y = "MPG"
  )
```
```{r}
# Boxplot of mpg for each origin. origin is converted with factor() so that
# ggplot treats it as a discrete grouping variable: with a numeric x/fill,
# geom_boxplot() draws one box for all rows instead of one box per origin.
ggplot(Auto, aes(x = factor(origin), y = mpg, fill = factor(origin))) +
  geom_boxplot() +
  theme(legend.position = "none") + # fill duplicates the x axis, so hide the legend
  labs(
    title = "Origin vs Mpg - Boxplot",
    x = "Origin",
    y = "MPG"
  )
```