# Load necessary libraries
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.3
library(ggplot2)

9. This exercise involves the Auto data set studied in the lab. We have to make sure that the missing values have been removed from the data.

# Fetch the Auto data shipped with ISLR2, then discard any incomplete rows
data("Auto", package = "ISLR2")
Auto <- na.omit(Auto)

(a) Which of the predictors are quantitative, and which are qualitative?

# (a) Identify quantitative and qualitative predictors.
# Storage class of every column. `name` is the only factor; note that
# `origin` is stored as an integer but, per the ISLR2 help page for `Auto`,
# is a region code (1 = American, 2 = European, 3 = Japanese), so its
# storage class alone does not make it a quantitative predictor.
vapply(Auto, function(col) class(col)[1L], character(1))
##          mpg    cylinders displacement   horsepower       weight acceleration 
##    "numeric"    "integer"    "numeric"    "integer"    "integer"    "numeric" 
##         year       origin         name 
##    "integer"    "integer"     "factor"
# Compact overview of the same information plus the first few values
str(Auto)
## 'data.frame':    392 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : int  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : int  3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : int  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
##  - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
##   ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...

(b) Range of Quantitative Predictors

# Minimum (row 1) and maximum (row 2) of every numerically stored column
vapply(Filter(is.numeric, Auto), range, numeric(2))
##       mpg cylinders displacement horsepower weight acceleration year origin
## [1,]  9.0         3           68         46   1613          8.0   70      1
## [2,] 46.6         8          455        230   5140         24.8   82      3

(c) Mean and Standard Deviation of Quantitative Predictors

# Mean and standard deviation of every numerically stored column,
# returned as a 2-row matrix (rows: mean, sd)
vapply(
  Filter(is.numeric, Auto),
  function(col) c(mean = mean(col), sd = sd(col)),
  numeric(2)
)
##            mpg cylinders displacement horsepower    weight acceleration
## mean 23.445918  5.471939      194.412  104.46939 2977.5842    15.541327
## sd    7.805007  1.705783      104.644   38.49116  849.4026     2.758864
##           year    origin
## mean 75.979592 1.5765306
## sd    3.683737 0.8055182

(d) Now remove the 10th through 85th observations. What is the range, mean, and standard deviation of each predictor in the subset of the data that remains?

# Drop observations 10 through 85 and summarise what remains.
# Note: the "range" row is the width of the range (max - min), not the
# min/max pair reported in part (b).
Auto_subset <- Auto[-(10:85), ]
vapply(
  Filter(is.numeric, Auto_subset),
  function(col) {
    c(range = max(col) - min(col), mean = mean(col), sd = sd(col))
  },
  numeric(3)
)
##             mpg cylinders displacement horsepower    weight acceleration
## range 35.600000  5.000000    387.00000  184.00000 3348.0000    16.300000
## mean  24.404430  5.373418    187.24051  100.72152 2935.9715    15.726899
## sd     7.867283  1.654179     99.67837   35.70885  811.3002     2.693721
##            year   origin
## range 12.000000 2.000000
## mean  77.145570 1.601266
## sd     3.106217 0.819910

This part creates a subset of the data, excluding observations 10-85, and calculates range, mean, and standard deviation for numeric variables.

(e) Using the full data set, investigate the predictors graphically, using scatterplots or other tools of your choice. Create some plots highlighting the relationships among the predictors. Comment on your findings.

# Pairwise scatterplots of mpg and the six numeric predictors plotted here
pairs(
  Auto[c(
    "mpg", "cylinders", "displacement", "horsepower",
    "weight", "acceleration", "year"
  )]
)

Based on the pairs plot:

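  1. MPG does not fall off linearly with cylinder count. 4-cylinder vehicles tend to be the most fuel-efficient and 8-cylinder vehicles the least, while the few 3-cylinder models get noticeably lower MPG than 4-cylinder cars despite having fewer cylinders.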

  2. MPG decreases as displacement, horsepower, and weight increase, showing a negative correlation.

  3. Newer car models generally demonstrate improved fuel efficiency compared to older ones.

These observations highlight the complex relationships between various vehicle characteristics and fuel economy.

(f) Suppose that we wish to predict gas mileage (mpg) on the basis of the other variables. Do your plots suggest that any of the other variables might be useful in predicting mpg? Justify your answer.

# mpg against horsepower, with ggplot2's default smoother drawn on top
ggplot(data = Auto, mapping = aes(x = horsepower, y = mpg)) +
  geom_point() +
  geom_smooth() +
  labs(
    title = "MPG vs Horsepower",
    x = "Horsepower",
    y = "Miles per Gallon"
  )
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Distribution of mpg within each cylinder count (cylinders treated as groups)
ggplot(data = Auto, mapping = aes(x = factor(cylinders), y = mpg)) +
  geom_boxplot() +
  labs(
    title = "MPG by Number of Cylinders",
    x = "Number of Cylinders",
    y = "Miles per Gallon"
  )

# Scatterplot of mpg vs weight, colored by origin.
# `origin` is stored as an integer code (1-3); mapping it to colour directly
# makes ggplot2 use a continuous gradient, so it is converted to a factor to
# get one distinct colour per origin.
ggplot(Auto, aes(x = weight, y = mpg, color = factor(origin))) +
  geom_point() +
  labs(title = "MPG vs Weight by Origin",
       x = "Weight",
       y = "Miles per Gallon",
       color = "Origin")

# One panel per origin code: mpg against displacement with a fitted
# straight line in each panel
ggplot(data = Auto, mapping = aes(x = displacement, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~ origin) +
  labs(
    title = "MPG vs Displacement by Origin",
    x = "Displacement",
    y = "Miles per Gallon"
  )
## `geom_smooth()` using formula = 'y ~ x'

Exercise 10

(a) Loading and Examining the Boston Dataset

# Attach ISLR2 and load its Boston housing data
library(ISLR2)
data("Boston", package = "ISLR2")
# Number of rows, then number of columns
c(nrow(Boston), ncol(Boston))
## [1] 506  13
# Column-by-column structure
str(Boston)
## 'data.frame':    506 obs. of  13 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...

The Boston Housing data set is a classic dataset frequently used in statistics and machine learning for regression tasks. The version loaded here consists of 506 rows, where each row represents a suburb of Boston, and 13 columns, which include various features (predictors) as well as the target variable (housing prices). The original Boston housing data has a 14th column, b, which is listed below for completeness but is not part of the data frame loaded above. Here’s an explanation of each column:

Feature Description
crim Crime rate per person by town.
zn Percentage of residential land zoned for large lots (greater than 25,000 sq. ft.).
indus Percentage of land in a town used for non-retail business purposes.
chas A binary variable indicating whether the area is near the Charles River (1 = yes, 0 = no).
nox Concentration of nitrogen oxides (measured in parts per 10 million).
rm Average number of rooms per house.
age Percentage of houses occupied by owners that were built before 1940.
dis Weighted average distance to major employment hubs in Boston.
rad Measure of how easily accessible radial highways are.
tax Property tax rate per $10,000 of property value.
ptratio Ratio of students to teachers in town schools.
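b Proportion of residents of African American descent (a column of the original Boston housing data; it is not among the 13 columns of the data frame loaded here).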
lstat Percentage of the population considered lower-income or lower-status.
medv Median value of homes occupied by their owners (in $1,000s).

(b) Make some pairwise scatter plots of the predictors in this data set. Describe your findings.

# Pairwise scatterplots of the first six columns of Boston
pairs(Boston[, c("crim", "zn", "indus", "chas", "nox", "rm")])

(c) Are any of the predictors associated with per capita crime rate? If so, explain the relationship.

# Correlation of the per-capita crime rate with every column of Boston
# (the first entry is crim with itself, hence 1)
cor(x = Boston[["crim"]], y = Boston)
##      crim         zn     indus        chas       nox         rm       age
## [1,]    1 -0.2004692 0.4065834 -0.05589158 0.4209717 -0.2192467 0.3527343
##             dis       rad       tax   ptratio     lstat       medv
## [1,] -0.3796701 0.6255051 0.5827643 0.2899456 0.4556215 -0.3883046

(d) Do any of the suburbs of Boston appear to have particularly high crime rates? Tax rates? Pupil-teacher ratios? Comment on the range of each predictor.

# Five-number summary plus mean for crime rate, tax rate and pupil-teacher ratio
summary(Boston[c("crim", "tax", "ptratio")])
##       crim               tax           ptratio     
##  Min.   : 0.00632   Min.   :187.0   Min.   :12.60  
##  1st Qu.: 0.08205   1st Qu.:279.0   1st Qu.:17.40  
##  Median : 0.25651   Median :330.0   Median :19.05  
##  Mean   : 3.61352   Mean   :408.2   Mean   :18.46  
##  3rd Qu.: 3.67708   3rd Qu.:666.0   3rd Qu.:20.20  
##  Max.   :88.97620   Max.   :711.0   Max.   :22.00

Crime Rates: The crime rates in the data set span a broad range, from as low as 0.00632 to as high as 88.9762. This indicates a significant disparity, with the majority of suburbs having minimal crime, while a few experience exceptionally high levels.

Tax Rates: Property tax rates vary considerably, ranging from 187 to 711. This highlights that some towns have relatively modest tax burdens, whereas others impose substantially higher taxes.

Pupil-Teacher Ratios: The pupil-teacher ratio falls within a narrower range of 12.60 to 22.00 compared to crime or tax rates. However, this still reflects some differences in the availability of educational resources across the suburbs.

(e) How many of the suburbs in this data set bound the Charles river?

# Count the rows whose chas indicator equals 1
sum(Boston[["chas"]] == 1L)
## [1] 35

(f) What is the median pupil-teacher ratio among the towns in this data set?

# Median of the pupil-teacher ratio column
median(Boston[["ptratio"]])
## [1] 19.05

(g) Which suburb of Boston has lowest median value of owner-occupied homes? What are the values of the other predictors for that suburb, and how do those values compare to the overall ranges for those predictors? Comment on your findings.

# Rows with the lowest median home value. which.min() would report only the
# first minimum, so compare against the minimum instead to keep every suburb
# tied for the lowest medv (the comparison is exact because min() returns one
# of the stored values).
lowest_medv <- which(Boston$medv == min(Boston$medv))
Boston[lowest_medv, ]
##        crim zn indus chas   nox    rm age    dis rad tax ptratio lstat medv
## 399 38.3518  0  18.1    0 0.693 5.453 100 1.4896  24 666    20.2 30.59    5
## 406 67.9208  0  18.1    0 0.693 5.683 100 1.4254  24 666    20.2 22.98    5
# Range of the predictors: minimum, maximum and mean of every column, for
# comparison with the rows printed above
smry <- data.frame(
  vapply(
    Boston,
    function(x) c(min = min(x), max = max(x), avg = mean(x)),
    numeric(3)
  )
)
smry
##          crim        zn    indus       chas       nox       rm      age
## min  0.006320   0.00000  0.46000 0.00000000 0.3850000 3.561000   2.9000
## max 88.976200 100.00000 27.74000 1.00000000 0.8710000 8.780000 100.0000
## avg  3.613524  11.36364 11.13678 0.06916996 0.5546951 6.284634  68.5749
##           dis       rad      tax  ptratio    lstat     medv
## min  1.129600  1.000000 187.0000 12.60000  1.73000  5.00000
## max 12.126500 24.000000 711.0000 22.00000 37.97000 50.00000
## avg  3.795043  9.549407 408.2372 18.45553 12.65306 22.53281

(h) In this data set, how many of the suburbs average more than seven rooms per dwelling? More than eight rooms per dwelling? Comment on the suburbs that average more than eight rooms per dwelling.

# Number of suburbs averaging more than seven rooms per dwelling
sum(Boston$rm > 7)
## [1] 64
# Number of suburbs averaging more than eight rooms per dwelling
sum(Boston$rm > 8)
## [1] 13
# Column means over the suburbs with rm > 8. The column is named explicitly:
# writing `Avg = mean(x)` inside an anonymous function does not name the
# result, which left data.frame() to invent a column name from the call text.
mt8 <- data.frame(avg = colMeans(Boston[Boston$rm > 8, ]))
mt8
##                 avg
## crim      0.7187954
## zn       13.6153846
## indus     7.0784615
## chas      0.1538462
## nox       0.5392385
## rm        8.3485385
## age      71.5384615
## dis       3.4301923
## rad       7.4615385
## tax     325.0769231
## ptratio  16.3615385
## lstat     4.3100000
## medv     44.2000000

There are 64 suburbs that average more than 7 rooms per dwelling, and 13 suburbs that average more than 8 rooms per dwelling. The suburbs averaging more than 8 rooms generally exhibit distinct features: