# Install ISLR only if it is not already available, so that re-running the
# script does not reinstall the package on every execution.
if (!requireNamespace("ISLR", quietly = TRUE)) {
  install.packages("ISLR")
}
library(ISLR)
data("Auto") # Example dataset from ISLR package

# Also read the CSV copy of the data. The ISLR lab reads this file with
# na.strings = "?" because missing values are recorded as "?"; without it the
# affected column is read as text instead of numbers.
url <- "https://www.statlearning.com/s/Auto.csv"
auto_data <- read.csv(url, header = TRUE, na.strings = "?")
# (a) Which of the predictors are quantitative, and which are qualitative?
# Quantitative Variables:
# mpg - Miles per gallon
# cylinders - Number of cylinders (3 to 8)
# displacement - Engine displacement (cu. inches)
# horsepower - Engine horsepower
# weight - Vehicle weight (lbs.)
# acceleration - Time to accelerate from 0 to 60 mph (sec.)
# year - Model year (modulo 100)
# Qualitative Variables:
# origin - Origin of car (1. American, 2. European, 3. Japanese)
# name - Vehicle name
# (b) What is the range of each quantitative predictor?
# Columns 1:7 (mpg through year) are the numeric columns; range() returns
# c(min, max) for each, giving a 2-row table once the rows are labelled.
range_Auto <- data.frame(vapply(Auto[, 1:7], range, numeric(2)))
rownames(range_Auto) <- c("min:", "max:")
range_Auto
# (c) What is the mean and standard deviation of each quantitative predictor?
# Per-column mean, then per-column standard deviation, over columns 1:7.
vapply(Auto[, 1:7], mean, numeric(1))
vapply(Auto[, 1:7], sd, numeric(1))
# (d) Now remove the 10th through 85th observations. What is the range, mean, and standard deviation of each predictor in the subset of the data that remains?
# Drop observations 10 through 85, then tabulate the min and max of each of
# the first seven columns of what remains.
rows_to_drop <- 10:85
Auto_2 <- Auto[-rows_to_drop, ]
range_Auto_2 <- data.frame(
  vapply(Auto_2[, 1:7], range, numeric(2)),
  row.names = c("min:", "max:")
)
range_Auto_2 # Display the result
## mpg cylinders displacement horsepower weight acceleration year
## min: 11.0 3 68 46 1649 8.5 70
## max: 46.6 8 455 230 4997 24.8 82
# Column-wise mean and standard deviation for the first seven columns of the
# reduced data set.
subset_cols <- Auto_2[, 1:7]
vapply(subset_cols, mean, numeric(1))
## mpg cylinders displacement horsepower weight acceleration
## 24.404430 5.373418 187.240506 100.721519 2935.971519 15.726899
## year
## 77.145570
vapply(subset_cols, sd, numeric(1))
## mpg cylinders displacement horsepower weight acceleration
## 7.867283 1.654179 99.678367 35.708853 811.300208 2.693721
## year
## 3.106217
# (e) Using the full data set, investigate the predictors graphically, using scatterplots or other tools of your choice. Create some plots highlighting the relationships among the predictors. Comment on your findings.
# Scatterplot matrix of the first seven columns (mpg through year).
first_seven <- Auto[, 1:7]
pairs(first_seven)
# Simple linear regression of displacement on model year; print the summary.
fit_year <- lm(displacement ~ year, data = Auto)
summary(fit_year)
##
## Call:
## lm(formula = displacement ~ year, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -175.73 -75.58 -13.45 69.69 229.28
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 992.691 101.661 9.765 < 2e-16 ***
## year -10.506 1.336 -7.862 3.75e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 97.35 on 390 degrees of freedom
## Multiple R-squared: 0.1368, Adjusted R-squared: 0.1346
## F-statistic: 61.8 on 1 and 390 DF, p-value: 3.748e-14
# Regress displacement on year, origin, and their interaction.
# NOTE(review): origin is a numeric code here, so the model fits a single
# linear slope for it instead of one effect per region; consider
# factor(origin) if per-region effects are intended.
fit_year_origin <- lm(displacement ~ year * origin, data = Auto)
summary(fit_year_origin)
##
## Call:
## lm(formula = displacement ~ year * origin, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -208.809 -49.130 -1.753 47.496 180.801
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1980.870 169.234 11.705 < 2e-16 ***
## year -21.902 2.222 -9.855 < 2e-16 ***
## origin -771.920 96.358 -8.011 1.34e-14 ***
## year:origin 9.098 1.254 7.256 2.18e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 73.33 on 388 degrees of freedom
## Multiple R-squared: 0.5128, Adjusted R-squared: 0.509
## F-statistic: 136.1 on 3 and 388 DF, p-value: < 2.2e-16
# (f) Suppose that we wish to predict gas mileage (mpg) on the basis of the other variables. Do your plots suggest that any of the other variables might be useful in predicting mpg? Justify your answer.
# Engineer a `brand` feature: take the first word of each car name, then map
# known misspellings and abbreviations onto the canonical make.
Auto$brand <- vapply(
  strsplit(as.character(Auto$name), split = " "),
  function(x) x[1], # extract the first item from each list element
  character(1)
)
Auto$brand <- factor(
  ifelse(Auto$brand %in% c("vokswagen", "vw"), "volkswagen",
    ifelse(Auto$brand == "toyouta", "toyota",
      ifelse(Auto$brand %in% c("chevroelt", "chevy"), "chevrolet",
        ifelse(Auto$brand == "maxda", "mazda", Auto$brand)
      )
    )
  )
) # fixing typos
table(Auto$brand)
# Derive the manufacturer from the first word of each car name.
name_parts <- strsplit(as.character(Auto$name), split = " ")
maker <- vapply(name_parts, function(parts) parts[1], character(1))

# Map known misspellings and abbreviations to the canonical spelling.
spelling_fixes <- c(
  vokswagen = "volkswagen",
  vw = "volkswagen",
  toyouta = "toyota",
  chevroelt = "chevrolet",
  chevy = "chevrolet",
  maxda = "mazda"
)
needs_fix <- maker %in% names(spelling_fixes)
maker[needs_fix] <- spelling_fixes[maker[needs_fix]]

library(forcats)
# Keep the 9 most frequent brands and pool the rest as "uncommon", which
# leaves 10 categories in total.
Auto$brand <- fct_lump(factor(maker), n = 9, other_level = "uncommon")
table(Auto$brand)
##
## amc buick chevrolet datsun dodge ford plymouth
## 27 17 47 23 28 48 31
## toyota volkswagen uncommon
## 26 22 123
library(ggplot2) # Load ggplot2 package
# Inspect the data frame once after adding `brand`: column names, column
# types, and the first rows. Nothing modifies Auto between these calls, so a
# single pass is enough.
colnames(Auto) # Check column names of the Auto dataset
## [1] "mpg" "cylinders" "displacement" "horsepower" "weight"
## [6] "acceleration" "year" "origin" "name" "brand"
str(Auto) # Inspect structure of the Auto dataset
## 'data.frame': 392 obs. of 10 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : num 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : num 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : num 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## $ brand : Factor w/ 10 levels "amc","buick",..: 3 2 7 1 6 6 3 7 10 1 ...
head(Auto) # View the first few rows of the Auto dataset
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name brand
## 1 chevrolet chevelle malibu chevrolet
## 2 buick skylark 320 buick
## 3 plymouth satellite plymouth
## 4 amc rebel sst amc
## 5 ford torino ford
## 6 ford galaxie 500 ford
# Boxplot of mpg by region of origin.
# `origin` is stored as a numeric code (1 = American, 2 = European,
# 3 = Japanese). With a continuous x, ggplot2 cannot form one box per region
# and drops the fill aesthetic, so plot from a copy in which origin is a
# labelled factor. Auto itself is left unchanged.
origin_plot_data <- transform(
  Auto,
  origin = factor(
    origin,
    levels = 1:3,
    labels = c("American", "European", "Japanese")
  )
)
ggplot(origin_plot_data, aes(x = origin, y = mpg, fill = origin)) +
  geom_boxplot() +
  theme(legend.position = "none") +
  labs(title = "Origin vs Mpg - Boxplot", x = "Origin", y = "MPG")
# Rebuild `brand` from the car names: first word of the name, with known
# misspellings and abbreviations replaced by the canonical make.
first_words <- vapply(
  strsplit(as.character(Auto$name), split = " "),
  `[`,
  character(1),
  1
)
misspelt <- c("vokswagen", "vw", "toyouta", "chevroelt", "chevy", "maxda")
canonical <- c(
  "volkswagen", "volkswagen", "toyota", "chevrolet", "chevrolet", "mazda"
)
hit <- match(first_words, misspelt)
found <- !is.na(hit)
first_words[found] <- canonical[hit[found]]
Auto$brand <- factor(first_words)

library(forcats)
# Nine most frequent brands stay as-is; all others become "uncommon".
Auto$brand <- fct_lump(Auto$brand, n = 9, other_level = "uncommon")
table(Auto$brand)
##
## amc buick chevrolet datsun dodge ford plymouth
## 27 17 47 23 28 48 31
## toyota volkswagen uncommon
## 26 22 123
# Boxplot of mpg for each (lumped) brand, one fill colour per brand. The
# legend is hidden because the x axis already names every brand.
ggplot(Auto, aes(x = brand, y = mpg, fill = brand)) +
  geom_boxplot() +
  theme(legend.position = "none") +
  labs(
    title = "Brand vs Mpg - Boxplot",
    subtitle = "Engineered feature",
    x = "Brand",
    y = "MPG"
  )
# Boxplot of mpg by region of origin.
# `origin` holds numeric codes (1 = American, 2 = European, 3 = Japanese);
# used directly as x it is treated as continuous, which collapses the boxes
# and drops the fill. Convert it to a labelled factor in a plotting copy so
# that each region gets its own box; Auto itself is not modified.
mpg_by_origin <- transform(
  Auto,
  origin = factor(
    origin,
    levels = 1:3,
    labels = c("American", "European", "Japanese")
  )
)
ggplot(mpg_by_origin, aes(x = origin, y = mpg, fill = origin)) +
  geom_boxplot() +
  theme(legend.position = "none") +
  labs(title = "Origin vs Mpg - Boxplot", x = "Origin", y = "MPG")