library(MASS)
## Warning: package 'MASS' was built under R version 4.3.2
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.3.2
##
## Attaching package: 'ISLR2'
## The following object is masked from 'package:MASS':
##
## Boston
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
library(GGally)
## Warning: package 'GGally' was built under R version 4.3.2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
This exercise involves the Auto data set studied in the lab. Make sure that the missing values have been removed from the data.
(a) Which of the predictors are quantitative, and which are qualitative?
(b) What is the range of each quantitative predictor? You can answer this using the range() function.
(c) What is the mean and standard deviation of each quantitative predictor?
(d) Now remove the 10th through 85th observations. What is the range, mean, and standard deviation of each predictor in the subset of the data that remains?
(e) Using the full data set, investigate the predictors graphically, using scatterplots or other tools of your choice. Create some plots highlighting the relationships among the predictors. Comment on your findings.
(f) Suppose that we wish to predict gas mileage (mpg) on the basis of the other variables. Do your plots suggest that any of the other variables might be useful in predicting mpg? Justify your answer.
# Load Auto from ISLR2 by name so the source of the data does not depend on
# which packages happen to be attached.
data("Auto", package = "ISLR2")
# The exercise requires missing values to be removed before any analysis.
# na.omit() drops every row containing an NA and leaves the data frame
# untouched when there is none, so this is safe to run unconditionally.
Auto <- na.omit(Auto)
# Compact overview of column types and leading values
str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
## ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
# Open the help page describing the Auto data set and its columns
help("Auto")
## starting httpd help server ... done
(a.) Identify quantitative and qualitative predictors:
# Classify every column of Auto by storage type. vapply() guarantees a named
# logical vector with one entry per column, whatever the input looks like.
# NOTE(review): is.numeric() is a storage-type test, so integer-coded
# categorical columns are reported as quantitative. `origin` looks like such
# a code (values 1-3), and `mpg` is the response rather than a predictor --
# confirm against ?Auto before relying on this split.
quantitative_predictors <- vapply(Auto, is.numeric, logical(1))
# Report the columns flagged as quantitative
cat(
  "Quantitative Predictors:",
  names(quantitative_predictors)[quantitative_predictors],
  "\n"
)
## Quantitative Predictors: mpg cylinders displacement horsepower weight acceleration year origin
# Factor columns are treated as qualitative
qualitative_predictors <- vapply(Auto, is.factor, logical(1))
# Report the columns flagged as qualitative
cat(
  "Qualitative Predictors:",
  names(qualitative_predictors)[qualitative_predictors],
  "\n"
)
## Qualitative Predictors: name
(b.) Range of each quantitative predictor:
# Minimum and maximum of each quantitative column. vapply() pins the result
# to a 2-row numeric matrix (row 1 = min, row 2 = max, one column per
# predictor) instead of relying on sapply()'s best-effort simplification.
quantitative_ranges <- vapply(
  Auto[, quantitative_predictors],
  range,
  numeric(2)
)
# Show the min/max matrix
print(quantitative_ranges)
## mpg cylinders displacement horsepower weight acceleration year origin
## [1,] 9.0 3 68 46 1613 8.0 70 1
## [2,] 46.6 8 455 230 5140 24.8 82 3
(c.) Mean and standard deviation of each quantitative predictor:
# Quantitative columns, selected once and reused for both statistics
quantitative_data <- Auto[, quantitative_predictors]
# Mean of each quantitative column
quantitative_means <- colMeans(quantitative_data, na.rm = TRUE)
# Standard deviation of each quantitative column. Iterating over the columns
# with vapply() avoids apply()'s coercion of the data frame to a matrix and
# guarantees one numeric value per column.
quantitative_sd <- vapply(quantitative_data, sd, numeric(1), na.rm = TRUE)
# Printing the mean and standard deviation of each quantitative predictor
cat("Mean of Quantitative Predictors:\n")
## Mean of Quantitative Predictors:
print(quantitative_means)
## mpg cylinders displacement horsepower weight acceleration
## 23.445918 5.471939 194.411990 104.469388 2977.584184 15.541327
## year origin
## 75.979592 1.576531
cat("\nStandard Deviation of Quantitative Predictors:\n")
##
## Standard Deviation of Quantitative Predictors:
print(quantitative_sd)
## mpg cylinders displacement horsepower weight acceleration
## 7.8050075 1.7057832 104.6440039 38.4911599 849.4025600 2.7588641
## year origin
## 3.6837365 0.8055182
(d.) Remove 10th through 85th observations and calculate range, mean, and standard deviation:
# Drop rows 10 through 85 (negative indices exclude rows)
Auto_subset <- Auto[-(10:85), ]
# Quantitative columns of the reduced data, selected once
subset_quantitative <- Auto_subset[, quantitative_predictors]
# Range, mean and standard deviation of each quantitative column in the
# subset. vapply() fixes the shape of each result (2 values per column for
# the range, 1 for the standard deviation) and avoids apply()'s coercion of
# the data frame to a matrix.
subset_ranges <- vapply(subset_quantitative, range, numeric(2))
subset_means <- colMeans(subset_quantitative, na.rm = TRUE)
subset_sd <- vapply(subset_quantitative, sd, numeric(1), na.rm = TRUE)
# Printing the results for the subset
cat("Range of Quantitative Predictors in Subset:\n")
## Range of Quantitative Predictors in Subset:
print(subset_ranges)
## mpg cylinders displacement horsepower weight acceleration year origin
## [1,] 11.0 3 68 46 1649 8.5 70 1
## [2,] 46.6 8 455 230 4997 24.8 82 3
cat("\nMean of Quantitative Predictors in Subset:\n")
##
## Mean of Quantitative Predictors in Subset:
print(subset_means)
## mpg cylinders displacement horsepower weight acceleration
## 24.404430 5.373418 187.240506 100.721519 2935.971519 15.726899
## year origin
## 77.145570 1.601266
cat("\nStandard Deviation of Quantitative Predictors in Subset:\n")
##
## Standard Deviation of Quantitative Predictors in Subset:
print(subset_sd)
## mpg cylinders displacement horsepower weight acceleration
## 7.867283 1.654179 99.678367 35.708853 811.300208 2.693721
## year origin
## 3.106217 0.819910
(e.) Investigate predictors graphically using scatterplot:
# Fuel economy against engine power, drawn through the formula interface
plot(
  mpg ~ horsepower,
  data = Auto,
  main = "Horsepower vs MPG",
  xlab = "Horsepower",
  ylab = "Miles per Gallon"
)
# Fuel economy against vehicle weight
plot(
  mpg ~ weight,
  data = Auto,
  main = "Weight vs MPG",
  xlab = "Weight",
  ylab = "Miles per Gallon"
)
(f.) Investigate relationships for predicting gas mileage (mpg):
# Names of the columns flagged as quantitative, minus the response itself.
# names(quantitative_predictors) on its own returns every column name
# (including the factor `name` and `mpg`), so the logical flags must be
# applied before iterating.
mpg_predictors <- setdiff(
  names(quantitative_predictors)[quantitative_predictors],
  "mpg"
)
# One scatterplot per quantitative predictor against mpg, each with a fitted
# linear trend. The column is looked up by name through the .data pronoun
# (aes_string() is deprecated); the x label is set explicitly so the axis
# shows the column name rather than the lookup expression.
scatterplots_mpg <- lapply(mpg_predictors, function(predictor) {
  ggplot(Auto, aes(x = .data[[predictor]], y = mpg)) +
    geom_point() +
    geom_smooth(method = "lm") +
    theme_minimal() +
    labs(
      title = paste("Scatterplot of", predictor, "vs. mpg"),
      x = predictor
    )
})
# Display scatterplots
print(scatterplots_mpg)
## [[1]]
## `geom_smooth()` using formula = 'y ~ x'
##
## [[2]]
## `geom_smooth()` using formula = 'y ~ x'
##
## [[3]]
## `geom_smooth()` using formula = 'y ~ x'
##
## [[4]]
## `geom_smooth()` using formula = 'y ~ x'
##
## [[5]]
## `geom_smooth()` using formula = 'y ~ x'
##
## [[6]]
## `geom_smooth()` using formula = 'y ~ x'
##
## [[7]]
## `geom_smooth()` using formula = 'y ~ x'
This exercise involves the Boston housing data set.
(a) To begin, load in the Boston data set. The Boston data set is part of the ISLR2 library.
> library(ISLR2)
Now the data set is contained in the object Boston.
> Boston
Read about the data set:
> ?Boston
How many rows are in this data set? How many columns? What do the rows and columns represent?
(b) Make some pairwise scatterplots of the predictors (columns) in this data set. Describe your findings.
(c) Are any of the predictors associated with per capita crime rate? If so, explain the relationship.
(d) Do any of the census tracts of Boston appear to have particularly high crime rates? Tax rates? Pupil-teacher ratios? Comment on the range of each predictor.
(e) How many of the census tracts in this data set bound the Charles river?
(f) What is the median pupil-teacher ratio among the towns in this data set?
(g) Which census tract of Boston has lowest median value of owner occupied homes? What are the values of the other predictors for that census tract, and how do those values compare to the overall ranges for those predictors? Comment on your findings.
(h) In this data set, how many of the census tracts average more than seven rooms per dwelling? More than eight rooms per dwelling? Comment on the census tracts that average more than eight rooms per dwelling.
# Both MASS and ISLR2 are attached and each ships a data set called Boston
# (the startup messages report ISLR2 masking the MASS copy). Naming the
# package makes the choice explicit instead of depending on attach order.
data("Boston", package = "ISLR2")
# Compact overview of column types and leading values
str(Boston)
## 'data.frame': 506 obs. of 13 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Open the help page describing the Boston data set and its columns
help("Boston")
# dim() returns c(rows, columns); split it into the two counts
boston_dims <- dim(Boston)
n_rows <- boston_dims[1]
n_cols <- boston_dims[2]
# Report both counts
cat("Number of rows:", n_rows, "\n")
## Number of rows: 506
cat("Number of columns:", n_cols, "\n")
## Number of columns: 13
The Boston data set contains information about various housing-related features in Boston. Each row represents a census tract, and each column represents different predictors such as crime rate, tax rates, pupil-teacher ratios, etc.
# Scatterplot matrix of every column against every other column
# (plot() on a data frame with more than two columns draws a pairs plot)
plot(Boston)
# Pairwise correlations between all columns of Boston
correlations <- cor(Boston)
# Correlation of every column with per capita crime rate, selected by column
# name rather than by position
correlations_with_crime <- correlations[, "crim"]
# Columns whose absolute correlation with crim exceeds 0.5. crim is removed
# from the result: its correlation with itself is always 1, so it would
# otherwise be reported as a "predictor" of itself.
high_correlation_predictors <- setdiff(
  names(which(abs(correlations_with_crime) > 0.5)),
  "crim"
)
# Printing the predictors with high correlation
cat("Predictors with high correlation to per capita crime rate:", high_correlation_predictors, "\n")
## Predictors with high correlation to per capita crime rate: rad tax
# Per-tract logical flags for unusually high values. The cut-offs (20, 500,
# 20) are hand-picked thresholds.
# NOTE(review): these flags are not referenced again in the visible part of
# the file -- confirm whether they are meant to be summarised or reported.
high_crime_tracts <- with(Boston, crim > 20)
high_tax_tracts <- with(Boston, tax > 500)
high_ptratio_tracts <- with(Boston, ptratio > 20)
# Five-number summary plus mean for each of the three columns
summary(Boston[["crim"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00632 0.08204 0.25651 3.61352 3.67708 88.97620
summary(Boston[["tax"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 187.0 279.0 330.0 408.2 666.0 711.0
summary(Boston[["ptratio"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12.60 17.40 19.05 18.46 20.20 22.00
# Tracts whose chas indicator equals 1, counted via their row positions
charles_river_tracts <- length(which(Boston[["chas"]] == 1))
cat("Number of census tracts bounding the Charles River:", charles_river_tracts, "\n")
## Number of census tracts bounding the Charles River: 35
# Middle value of the pupil-teacher ratio column
median_ptratio <- with(Boston, median(ptratio))
cat("Median pupil-teacher ratio among the towns:", median_ptratio, "\n")
## Median pupil-teacher ratio among the towns: 19.05
# Row position of the smallest medv value.
# NOTE(review): which.min() returns only the first position of the minimum;
# if several tracts share the lowest medv, the others are not reported here.
medv_values <- Boston[["medv"]]
lowest_median_value_tract <- which.min(medv_values)
# Report the row position, then the full record for that tract
cat("Census tract with the lowest median value of owner-occupied homes:", lowest_median_value_tract, "\n")
## Census tract with the lowest median value of owner-occupied homes: 399
cat("Values of predictors for this census tract:\n")
## Values of predictors for this census tract:
print(Boston[lowest_median_value_tract, , drop = FALSE])
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 399 38.3518 0 18.1 0 0.693 5.453 100 1.4896 24 666 20.2 30.59 5
# Average rooms per dwelling, pulled out once for both counts
rooms_per_dwelling <- Boston[["rm"]]
# Tracts averaging strictly more than seven rooms
more_than_seven_rooms <- length(which(rooms_per_dwelling > 7))
# Tracts averaging strictly more than eight rooms
more_than_eight_rooms <- length(which(rooms_per_dwelling > 8))
cat("Number of census tracts with more than seven rooms per dwelling:", more_than_seven_rooms, "\n")
## Number of census tracts with more than seven rooms per dwelling: 64
cat("Number of census tracts with more than eight rooms per dwelling:", more_than_eight_rooms, "\n")
## Number of census tracts with more than eight rooms per dwelling: 13