#install.packages("ISLR")
library(ISLR) # Contains the Auto dataset
## Warning: package 'ISLR' was built under R version 4.4.2
library(dplyr) # For data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Load the Auto data set into the workspace
data(Auto)
# Drop any rows that contain missing values before computing statistics
Auto <- na.omit(Auto)
(a) Which of the predictors are quantitative, and which are qualitative?
# Show each column's storage type to help tell quantitative from qualitative variables
str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : num 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : num 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : num 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
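– Quantitative predictors: mpg, cylinders, displacement, horsepower, weight, acceleration, year. – Qualitative predictors: name and origin. Note that origin is stored as a number (values 1 to 3), but those values are category codes rather than quantities, so it still appears in the numeric summaries below.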
(b) What is the range of each quantitative predictor? You can answer this using the range() function (or min() and max()) in R.
# Range (minimum and maximum) of every numeric column of Auto.
# Auto was already passed through na.omit() when it was loaded, so no further
# NA removal is needed here.
# NOTE(review): the numeric columns include origin, which is stored as a number
# although part (a) classifies it as qualitative.
# vapply() pins the result type; drop = FALSE keeps a data frame even if only
# one column qualifies.
is_quantitative <- vapply(Auto, is.numeric, logical(1))
quantitative_vars <- Auto[, is_quantitative, drop = FALSE]
# range() returns c(min, max) for each column
range_list <- lapply(quantitative_vars, range)
# One row per variable: its name, its minimum and its maximum
range_df <- data.frame(
  Variable = names(range_list),
  Min = vapply(range_list, function(x) x[1], numeric(1)),
  Max = vapply(range_list, function(x) x[2], numeric(1))
)
# Row names repeat the Variable column, so suppress them when printing
print(range_df, row.names = FALSE)
## Variable Min Max
## mpg 9 46.6
## cylinders 3 8.0
## displacement 68 455.0
## horsepower 46 230.0
## weight 1613 5140.0
## acceleration 8 24.8
## year 70 82.0
## origin 1 3.0
(c) What is the mean and standard deviation of each quantitative predictor?
# Per-column mean and standard deviation of the quantitative variables.
# vapply() guarantees exactly one double per column instead of relying on
# sapply()'s input-dependent simplification.
means <- vapply(quantitative_vars, mean, numeric(1))
sds <- vapply(quantitative_vars, sd, numeric(1))
# Both vectors carry the column names, which become the row names of the table
res_val <- data.frame(Mean = means, Standard_Dev = sds)
res_val # Print the results
## Mean Standard_Dev
## mpg 23.445918 7.8050075
## cylinders 5.471939 1.7057832
## displacement 194.411990 104.6440039
## horsepower 104.469388 38.4911599
## weight 2977.584184 849.4025600
## acceleration 15.541327 2.7588641
## year 75.979592 3.6837365
## origin 1.576531 0.8055182
(d) Now remove the 10th through 85th observations. What is the range, mean, and standard deviation of each predictor in the subset of the data that remains?
# Drop observations 10 through 85 by negative indexing
Auto_subset <- Auto[-(10:85), ]
# Keep the seven quantitative variables (name and origin are left out)
quantitative_cols <- c(
  "mpg", "cylinders", "displacement", "horsepower",
  "weight", "acceleration", "year"
)
Auto_filter <- Auto_subset[, quantitative_cols]
# Four statistics per column. The named template makes vapply() fail loudly if
# a column ever yields anything other than four doubles, and fixes the row
# labels of the result.
column_stats <- vapply(
  Auto_filter,
  function(x) c(Min = min(x), Max = max(x), Mean = mean(x), SD = sd(x)),
  c(Min = 0, Max = 0, Mean = 0, SD = 0)
)
# vapply() puts statistics in rows and variables in columns; transpose so that
# each variable becomes one row
stats_df <- as.data.frame(t(column_stats))
# Print the formatted results
print(stats_df)
## Min Max Mean SD
## mpg 11.0 46.6 24.404430 7.867283
## cylinders 3.0 8.0 5.373418 1.654179
## displacement 68.0 455.0 187.240506 99.678367
## horsepower 46.0 230.0 100.721519 35.708853
## weight 1649.0 4997.0 2935.971519 811.300208
## acceleration 8.5 24.8 15.726899 2.693721
## year 70.0 82.0 77.145570 3.106217
# Scatterplot matrix of the seven quantitative variables in Auto
pairs(
  Auto[, c(
    "mpg", "cylinders", "displacement", "horsepower",
    "weight", "acceleration", "year"
  )]
)
The pairs plot helps visualize the relationships between different predictors and their patterns.
Interestingly, the number of cylinders doesn’t show a simple linear relationship with MPG. Instead, there’s a sweet spot—cars with 4 cylinders tend to have the best fuel efficiency, while 3-cylinder cars are the least efficient. Meanwhile, 6- and 8-cylinder cars also show lower MPG compared to 4-cylinder ones.
Displacement, horsepower, and weight are all negatively correlated with MPG. In other words, as these factors increase, fuel efficiency decreases. Newer car models tend to have better MPG compared to older ones, indicating improvements in fuel efficiency over time.
# Additional scatterplots
# MPG against horsepower, overlaid with a fitted linear-model line
ggplot(data = Auto, mapping = aes(x = horsepower, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "blue") +
  ggtitle("MPG vs Horsepower")
## `geom_smooth()` using formula = 'y ~ x'
# MPG against weight, overlaid with a fitted linear-model line
ggplot(data = Auto, mapping = aes(x = weight, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "red") +
  ggtitle("MPG vs Weight")
## `geom_smooth()` using formula = 'y ~ x'
# MPG against displacement, overlaid with a fitted linear-model line
ggplot(Auto, aes(x = displacement, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm", col = "green") +
  # Title spelling corrected from "Displacemnt"
  ggtitle("MPG vs Displacement")
## `geom_smooth()` using formula = 'y ~ x'
The scatter plots reveal a strong negative relationship between MPG and each of weight, horsepower, and displacement.
Heavier cars with more horsepower and larger displacement generally have lower fuel efficiency, indicating that these factors could be valuable in predicting gas mileage.
# Load necessary library
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.2
##
## Attaching package: 'ISLR2'
## The following object is masked _by_ '.GlobalEnv':
##
## Auto
## The following object is masked from 'package:MASS':
##
## Boston
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
# Load the Boston dataset
# NOTE(review): both MASS and ISLR2 are attached and each provides Boston; the
# attach messages report the MASS copy as masked by ISLR2 - confirm that the
# ISLR2 version is the one intended.
data("Boston")
dim(Boston) # Number of rows and columns
## [1] 506 13
The Boston data frame has 506 rows and 13 columns.
Boston
This data frame contains the following columns:
# Scatterplot matrix across every column of Boston
pairs(
  Boston,
  main = "Pairwise Scatterplots of Boston Housing Data"
)
Higher crime rates and increased nitrogen oxide concentration in an area tend to drive down median home values.
Homes with more rooms and closer proximity to employment centers generally have higher median values.
# Correlation of the crime rate (crim) with every other column of Boston.
# A logical mask replaces -which(): if "crim" were ever absent, which() would
# return integer(0) and the negative index would silently select zero columns.
cor(Boston$crim, Boston[, names(Boston) != "crim"])
## zn indus chas nox rm age dis
## [1,] -0.2004692 0.4065834 -0.05589158 0.4209717 -0.2192467 0.3527343 -0.3796701
## rad tax ptratio lstat medv
## [1,] 0.6255051 0.5827643 0.2899456 0.4556215 -0.3883046
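Crime rate is most strongly correlated with access to radial highways (rad, 0.63), followed by the property tax rate (tax, 0.58), the share of lower-status population (lstat, 0.46), nitrogen oxide concentration (nox, 0.42), and the proportion of non-retail business acres (indus, 0.41).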
There is a negative correlation between crime rate and distance to employment centers, meaning areas farther from job hubs tend to have lower crime rates.
# Quartiles, mean and extremes of the three variables, used to judge how far
# the largest values sit from the bulk of the tracts
summary(Boston$crim) # Crime rate summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00632 0.08204 0.25651 3.61352 3.67708 88.97620
summary(Boston$tax) # Tax rate summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 187.0 279.0 330.0 408.2 666.0 711.0
summary(Boston$ptratio) # Pupil-teacher ratio summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12.60 17.40 19.05 18.46 20.20 22.00
# Top 5% crime rates: tracts strictly above the 95th percentile of crim
crim_cutoff <- quantile(Boston$crim, 0.95)
Boston[Boston$crim > crim_cutoff, ]
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 375 18.4982 0 18.1 0 0.668 4.138 100.0 1.1370 24 666 20.2 37.97 13.8
## 376 19.6091 0 18.1 0 0.671 7.313 97.9 1.3163 24 666 20.2 13.44 15.0
## 379 23.6482 0 18.1 0 0.671 6.380 96.2 1.3861 24 666 20.2 23.69 13.1
## 380 17.8667 0 18.1 0 0.671 6.223 100.0 1.3861 24 666 20.2 21.78 10.2
## 381 88.9762 0 18.1 0 0.671 6.968 91.9 1.4165 24 666 20.2 17.21 10.4
## 382 15.8744 0 18.1 0 0.671 6.545 99.1 1.5192 24 666 20.2 21.08 10.9
## 385 20.0849 0 18.1 0 0.700 4.368 91.2 1.4395 24 666 20.2 30.63 8.8
## 386 16.8118 0 18.1 0 0.700 5.277 98.1 1.4261 24 666 20.2 30.81 7.2
## 387 24.3938 0 18.1 0 0.700 4.652 100.0 1.4672 24 666 20.2 28.28 10.5
## 388 22.5971 0 18.1 0 0.700 5.000 89.5 1.5184 24 666 20.2 31.99 7.4
## 399 38.3518 0 18.1 0 0.693 5.453 100.0 1.4896 24 666 20.2 30.59 5.0
## 401 25.0461 0 18.1 0 0.693 5.987 100.0 1.5888 24 666 20.2 26.77 5.6
## 404 24.8017 0 18.1 0 0.693 5.349 96.0 1.7028 24 666 20.2 19.77 8.3
## 405 41.5292 0 18.1 0 0.693 5.531 85.4 1.6074 24 666 20.2 27.38 8.5
## 406 67.9208 0 18.1 0 0.693 5.683 100.0 1.4254 24 666 20.2 22.98 5.0
## 407 20.7162 0 18.1 0 0.659 4.138 100.0 1.1781 24 666 20.2 23.34 11.9
## 411 51.1358 0 18.1 0 0.597 5.757 100.0 1.4130 24 666 20.2 10.11 15.0
## 413 18.8110 0 18.1 0 0.597 4.628 100.0 1.5539 24 666 20.2 34.37 17.9
## 414 28.6558 0 18.1 0 0.597 5.155 100.0 1.5894 24 666 20.2 20.08 16.3
## 415 45.7461 0 18.1 0 0.693 4.519 100.0 1.6582 24 666 20.2 36.98 7.0
## 416 18.0846 0 18.1 0 0.679 6.434 100.0 1.8347 24 666 20.2 29.05 7.2
## 418 25.9406 0 18.1 0 0.679 5.304 89.1 1.6475 24 666 20.2 26.64 10.4
## 419 73.5341 0 18.1 0 0.679 5.957 100.0 1.8026 24 666 20.2 20.62 8.8
## 426 15.8603 0 18.1 0 0.679 5.896 95.4 1.9096 24 666 20.2 24.39 8.3
## 428 37.6619 0 18.1 0 0.679 6.202 78.7 1.8629 24 666 20.2 14.52 10.9
## 441 22.0511 0 18.1 0 0.740 5.818 92.4 1.8662 24 666 20.2 22.11 10.5
# Top 5% tax rates: tracts strictly above the 95th percentile of tax
tax_cutoff <- quantile(Boston$tax, 0.95)
Boston[Boston$tax > tax_cutoff, ]
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 489 0.15086 0 27.74 0 0.609 5.454 92.7 1.8209 4 711 20.1 18.06 15.2
## 490 0.18337 0 27.74 0 0.609 5.414 98.3 1.7554 4 711 20.1 23.97 7.0
## 491 0.20746 0 27.74 0 0.609 5.093 98.0 1.8226 4 711 20.1 29.68 8.1
## 492 0.10574 0 27.74 0 0.609 5.983 98.8 1.8681 4 711 20.1 18.07 13.6
## 493 0.11132 0 27.74 0 0.609 5.983 83.5 2.1099 4 711 20.1 13.35 20.1
# Top 5% pupil-teacher ratios: tracts strictly above the 95th percentile of ptratio
ptratio_cutoff <- quantile(Boston$ptratio, 0.95)
Boston[Boston$ptratio > ptratio_cutoff, ]
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 55 0.01360 75 4.00 0 0.410 5.888 47.6 7.3197 3 469 21.1 14.80 18.9
## 128 0.25915 0 21.89 0 0.624 5.693 96.0 1.7883 4 437 21.2 17.19 16.2
## 129 0.32543 0 21.89 0 0.624 6.431 98.8 1.8125 4 437 21.2 15.39 18.0
## 130 0.88125 0 21.89 0 0.624 5.637 94.7 1.9799 4 437 21.2 18.34 14.3
## 131 0.34006 0 21.89 0 0.624 6.458 98.9 2.1185 4 437 21.2 12.60 19.2
## 132 1.19294 0 21.89 0 0.624 6.326 97.7 2.2710 4 437 21.2 12.26 19.6
## 133 0.59005 0 21.89 0 0.624 6.372 97.9 2.3274 4 437 21.2 11.12 23.0
## 134 0.32982 0 21.89 0 0.624 5.822 95.4 2.4699 4 437 21.2 15.03 18.4
## 135 0.97617 0 21.89 0 0.624 5.757 98.4 2.3460 4 437 21.2 17.31 15.6
## 136 0.55778 0 21.89 0 0.624 6.335 98.2 2.1107 4 437 21.2 16.96 18.1
## 137 0.32264 0 21.89 0 0.624 5.942 93.5 1.9669 4 437 21.2 16.90 17.4
## 138 0.35233 0 21.89 0 0.624 6.454 98.4 1.8498 4 437 21.2 14.59 17.1
## 139 0.24980 0 21.89 0 0.624 5.857 98.2 1.6686 4 437 21.2 21.32 13.3
## 140 0.54452 0 21.89 0 0.624 6.151 97.9 1.6687 4 437 21.2 18.46 17.8
## 141 0.29090 0 21.89 0 0.624 6.174 93.6 1.6119 4 437 21.2 24.16 14.0
## 142 1.62864 0 21.89 0 0.624 5.019 100.0 1.4394 4 437 21.2 34.41 14.4
## 355 0.04301 80 1.91 0 0.413 5.663 21.9 10.5857 4 334 22.0 8.05 18.2
## 356 0.10659 80 1.91 0 0.413 5.936 19.5 10.5857 4 334 22.0 5.57 20.6
# Count the tracts whose chas flag equals 1 (summing a logical vector counts its TRUE entries)
sum(Boston$chas == 1)
## [1] 35
The number of census tracts bounding the Charles River is 35.
# Median of the ptratio column across all tracts
median(Boston$ptratio)
## [1] 19.05
The median pupil-teacher ratio is found by calculating the median of the ptratio column and is equal to 19.05
# Tracts with the lowest median home value (medv).
# which.min() reports only the first minimum, but medv is tied at its minimum:
# tracts 399 and 406 both show medv = 5.0 in the top-5% crime listing. Every
# tied tract is therefore displayed. Comparing against min() of the same
# vector is an exact match, so == is safe here.
min_medv_index <- which.min(Boston$medv) # First minimum only
min_medv_tracts <- which(Boston$medv == min(Boston$medv))
Boston[min_medv_tracts, ] # Display values for these tracts
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 399 38.3518 0 18.1 0 0.693 5.453 100 1.4896 24 666 20.2 30.59 5
## 406 67.9208 0 18.1 0 0.693 5.683 100 1.4254 24 666 20.2 22.98 5
summary(Boston) # Compare to overall predictor ranges
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio lstat
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 1.73
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.: 6.95
## Median : 5.000 Median :330.0 Median :19.05 Median :11.36
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :12.65
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:16.95
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :37.97
## medv
## Min. : 5.00
## 1st Qu.:17.02
## Median :21.20
## Mean :22.53
## 3rd Qu.:25.00
## Max. :50.00
The suburb with the lowest median home value stands out when we compare its characteristics to other areas.
Crime rates here are significantly higher than the average for most suburbs, and there aren’t many large residential zones nearby.
This suburb has more non-retail business land compared to the average across other areas.
It’s not located near the Charles River, and nitrogen oxide levels are much higher than in other places.
It is close to the employment centers (dis = 1.49, below the first quartile of 2.10) and has the highest index of accessibility to radial highways (rad = 24, the maximum), so it is not isolated.
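The area has a high percentage of lower-status residents (lstat = 30.59, well above the third quartile of 16.95). The version of the Boston data used here has no variable describing racial composition, so no statement about it can be drawn from this data.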
# Logical masks for tracts with rm above 7 and above 8; the second mask is
# reused to list the matching tracts
over_seven_rooms <- Boston$rm > 7
over_eight_rooms <- Boston$rm > 8
sum(over_seven_rooms)
## [1] 64
sum(over_eight_rooms)
## [1] 13
Boston[over_eight_rooms, ]
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 98 0.12083 0 2.89 0 0.4450 8.069 76.0 3.4952 2 276 18.0 4.21 38.7
## 164 1.51902 0 19.58 1 0.6050 8.375 93.9 2.1620 5 403 14.7 3.32 50.0
## 205 0.02009 95 2.68 0 0.4161 8.034 31.9 5.1180 4 224 14.7 2.88 50.0
## 225 0.31533 0 6.20 0 0.5040 8.266 78.3 2.8944 8 307 17.4 4.14 44.8
## 226 0.52693 0 6.20 0 0.5040 8.725 83.0 2.8944 8 307 17.4 4.63 50.0
## 227 0.38214 0 6.20 0 0.5040 8.040 86.5 3.2157 8 307 17.4 3.13 37.6
## 233 0.57529 0 6.20 0 0.5070 8.337 73.3 3.8384 8 307 17.4 2.47 41.7
## 234 0.33147 0 6.20 0 0.5070 8.247 70.4 3.6519 8 307 17.4 3.95 48.3
## 254 0.36894 22 5.86 0 0.4310 8.259 8.4 8.9067 7 330 19.1 3.54 42.8
## 258 0.61154 20 3.97 0 0.6470 8.704 86.9 1.8010 5 264 13.0 5.12 50.0
## 263 0.52014 20 3.97 0 0.6470 8.398 91.5 2.2885 5 264 13.0 5.91 48.8
## 268 0.57834 20 3.97 0 0.5750 8.297 67.0 2.4216 5 264 13.0 7.44 50.0
## 365 3.47428 0 18.10 1 0.7180 8.780 82.9 1.9047 24 666 20.2 5.29 21.9
Out of the 64 suburbs with homes featuring more than 7 rooms, and the 13 with homes having more than 8 rooms:
The suburbs with homes having more than 8 rooms generally have lower crime rates, a larger portion of land zoned for residential use, and are often located along the Charles River.
These areas also tend to have average nitrogen oxide levels and are much older than other Boston suburbs.
They are not too far from employment hubs and are located closer to major highways.
The property tax rates in these suburbs are about average for Boston, and the pupil-teacher ratios are in a healthy range.
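These suburbs have a much lower percentage of lower-status residents (lstat between 2.47 and 7.44, against a median of 11.36 across all tracts). The version of the Boston data used here has no variable describing racial composition, so it supports no statement about it.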
The median home values in these suburbs are about double the average of other areas, placing them among the highest in the region.