#install.packages("ISLR")

library(ISLR)  # Contains the Auto dataset

## Warning: package 'ISLR' was built under R version 4.4.2

library(dplyr) # For data manipulation

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

Question 9: This exercise involves the Auto data set studied in the lab. Make sure that the missing values have been removed from the data.

data(Auto)

Remove missing values

Auto <- na.omit(Auto)

(a) Which of the predictors are quantitative, and which are qualitative?

str(Auto)

## 'data.frame':    392 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : num  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : num  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : num  3504 3693 3436 3433 3449 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : num  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...

- Quantitative: mpg, cylinders, displacement, horsepower, weight, acceleration, year. - Qualitative: name, origin (origin is stored as a number, but its values 1–3 are codes rather than measurements).

(b) What is the range of each quantitative predictor? You can answer this using the range() function in R.

# Range (min and max) of every numerically stored column in Auto.
# Missing values were already dropped right after loading the data, so
# na.omit() is not repeated here.
# NOTE: `origin` is stored as a number, so is.numeric() selects it even though
# part (a) classifies it as qualitative; it therefore appears in the table.
quantitative_vars <- Auto[, vapply(Auto, is.numeric, logical(1)), drop = FALSE]

# range() returns c(min, max) for each column
range_list <- lapply(quantitative_vars, range)

# One row per variable; vapply() guarantees exactly one number per column,
# so Variable, Min and Max always have the same length
range_df <- data.frame(
  Variable = names(range_list),
  Min = vapply(range_list, function(x) x[1], numeric(1)),
  Max = vapply(range_list, function(x) x[2], numeric(1))
)

# Print the formatted range table (row names would just repeat Variable)
print(range_df, row.names = FALSE)

##      Variable  Min    Max
##           mpg    9   46.6
##     cylinders    3    8.0
##  displacement   68  455.0
##    horsepower   46  230.0
##        weight 1613 5140.0
##  acceleration    8   24.8
##          year   70   82.0
##        origin    1    3.0

(c) What is the mean and standard deviation of each quantitative predictor?

# Mean and standard deviation of each column of quantitative_vars.
# vapply() is the type-stable form of sapply(): it enforces exactly one
# numeric result per column and keeps the column names.
means <- vapply(quantitative_vars, mean, numeric(1))
sds <- vapply(quantitative_vars, sd, numeric(1))

# Combine into one table; row names are the variable names
res_val <- data.frame(Mean = means, Standard_Dev = sds)
res_val  # Print the results

##                     Mean Standard_Dev
## mpg            23.445918    7.8050075
## cylinders       5.471939    1.7057832
## displacement  194.411990  104.6440039
## horsepower    104.469388   38.4911599
## weight       2977.584184  849.4025600
## acceleration   15.541327    2.7588641
## year           75.979592    3.6837365
## origin          1.576531    0.8055182

(d) Now remove the 10th through 85th observations. What is the range, mean, and standard deviation of each predictor in the subset of the data that remains?

# Drop observations 10 through 85 (a negative index excludes those rows)
Auto_subset <- Auto[-(10:85), ]

# Keep the quantitative variables only (origin and name are left out)
Auto_filter <- Auto_subset[, c(
  "mpg", "cylinders", "displacement", "horsepower",
  "weight", "acceleration", "year"
)]

# Min, max, mean and SD per variable. vapply() returns a matrix with the four
# statistics in rows and the variables in columns; t() flips it so that each
# variable becomes one row of the final data frame.
stats_df <- as.data.frame(t(vapply(
  Auto_filter,
  function(x) c(Min = min(x), Max = max(x), Mean = mean(x), SD = sd(x)),
  c(Min = 0, Max = 0, Mean = 0, SD = 0)
)))

# Print the formatted results
print(stats_df)

##                 Min    Max        Mean         SD
## mpg            11.0   46.6   24.404430   7.867283
## cylinders       3.0    8.0    5.373418   1.654179
## displacement   68.0  455.0  187.240506  99.678367
## horsepower     46.0  230.0  100.721519  35.708853
## weight       1649.0 4997.0 2935.971519 811.300208
## acceleration    8.5   24.8   15.726899   2.693721
## year           70.0   82.0   77.145570   3.106217

(e) Visualize relationships between predictors

# Scatterplot matrix of mpg and the quantitative predictors
pairs(
  Auto[, c(
    "mpg", "cylinders", "displacement", "horsepower",
    "weight", "acceleration", "year"
  )]
)

The pairs plot helps visualize the relationships between different predictors and their patterns.

Interestingly, the number of cylinders doesn’t show a simple linear relationship with MPG. Instead, there’s a sweet spot—cars with 4 cylinders tend to have the best fuel efficiency, while 3-cylinder cars are the least efficient. Meanwhile, 6- and 8-cylinder cars also show lower MPG compared to 4-cylinder ones.

Displacement, horsepower, and weight are all negatively correlated with MPG. In other words, as these factors increase, fuel efficiency decreases. Newer car models tend to have better MPG compared to older ones, indicating improvements in fuel efficiency over time.

# Additional scatterplots: mpg against horsepower with a fitted straight line
ggplot(data = Auto, mapping = aes(x = horsepower, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "blue") +
  labs(title = "MPG vs Horsepower")

## `geom_smooth()` using formula = 'y ~ x'

# mpg against weight with a fitted straight line
ggplot(data = Auto, mapping = aes(x = weight, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "red") +
  labs(title = "MPG vs Weight")

## `geom_smooth()` using formula = 'y ~ x'

# mpg against displacement with a fitted straight line
# (the plot title previously misspelled "Displacement")
ggplot(Auto, aes(x = displacement, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm", col = "green") +
  ggtitle("MPG vs Displacement")

## `geom_smooth()` using formula = 'y ~ x'

The scatter plots reveal a strong negative correlation between MPG and each of weight, horsepower, and displacement.

Heavier cars with more horsepower and larger displacement generally have lower fuel efficiency, indicating that these factors could be valuable in predicting gas mileage.

10. This exercise involves the Boston housing data set.

# Load necessary library
library(ISLR2)

## Warning: package 'ISLR2' was built under R version 4.4.2

## 
## Attaching package: 'ISLR2'

## The following object is masked _by_ '.GlobalEnv':
## 
##     Auto

## The following object is masked from 'package:MASS':
## 
##     Boston

## The following objects are masked from 'package:ISLR':
## 
##     Auto, Credit

# Load the Boston dataset.
# Both MASS and ISLR2 are attached and each provides a `Boston` data set, so
# the package is named explicitly instead of relying on attach order.
data("Boston", package = "ISLR2")

(a) Check the structure and dimensions of the dataset

dim(Boston)  # Number of rows and columns

## [1] 506  13

Boston Housing Data Analysis

Description

The Boston data frame (ISLR2 version, as loaded above) has 506 rows and 13 columns.

Usage

Boston

Format

This data frame contains the following columns:

crim: per capita crime rate by town.
zn: proportion of residential land zoned for lots over 25,000 sq.ft.
indus: proportion of non-retail business acres per town.
chas: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
nox: nitrogen oxides concentration (parts per 10 million).
rm: average number of rooms per dwelling.
age: proportion of owner-occupied units built prior to 1940.
dis: weighted mean of distances to five Boston employment centres.
rad: index of accessibility to radial highways.
tax: full-value property-tax rate per $10,000.
ptratio: pupil-teacher ratio by town.
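black: 1000(Bk − 0.63)^2 where Bk is the proportion of blacks by town. (This column exists only in the MASS version of Boston; the ISLR2 version loaded here does not contain it.)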
lstat: lower status of the population (percent).
medv: median value of owner-occupied homes in $1000s.

(b) Create pairwise scatterplots of predictors

pairs(Boston, main="Pairwise Scatterplots of Boston Housing Data")

Higher crime rates and increased nitrogen oxide concentration in an area tend to drive down median home values.

Homes with more rooms and closer proximity to employment centers generally have higher median values.

(c) Check correlation of predictors with per capita crime rate

# Correlation of crim with every other column.
# A logical mask is used instead of -which(...): if the name were ever absent,
# -which() would produce an empty index and silently drop every column.
cor(Boston$crim, Boston[, names(Boston) != "crim"])

##              zn     indus        chas       nox         rm       age        dis
## [1,] -0.2004692 0.4065834 -0.05589158 0.4209717 -0.2192467 0.3527343 -0.3796701
##            rad       tax   ptratio     lstat       medv
## [1,] 0.6255051 0.5827643 0.2899456 0.4556215 -0.3883046

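Crime rate is most strongly correlated with access to radial highways (rad, 0.63), followed by the property-tax rate (tax, 0.58), the share of lower-status population (lstat, 0.46), nitrogen oxide concentration (nox, 0.42), and the proportion of non-retail business acres (indus, 0.41).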

There is a negative correlation between crime rate and distance to employment centers, meaning areas farther from job hubs tend to have lower crime rates.

(d) Checking high values in crime rate, tax rate, and pupil-teacher ratio

summary(Boston$crim)  # Crime rate summary

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00632  0.08204  0.25651  3.61352  3.67708 88.97620

summary(Boston$tax)   # Tax rate summary

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   187.0   279.0   330.0   408.2   666.0   711.0

summary(Boston$ptratio)  # Pupil-teacher ratio summary

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   12.60   17.40   19.05   18.46   20.20   22.00

Key Observations

Crime Rates: The crime rate varies widely, ranging from 0.00632 to 88.9762. While most suburbs have low crime rates, some towns experience significantly high crime levels.
Tax Rates: Property tax rates range from 187 to 711, showing substantial variability among different suburbs.
Pupil-Teacher Ratios: The pupil-teacher ratio ranges from 12.60 to 22.00, indicating some disparities in educational resources across towns.

Identify census tracts with high crime, tax, and pupil-teacher ratios

Boston[Boston$crim > quantile(Boston$crim, 0.95), ]  # Top 5% crime rates

##        crim zn indus chas   nox    rm   age    dis rad tax ptratio lstat medv
## 375 18.4982  0  18.1    0 0.668 4.138 100.0 1.1370  24 666    20.2 37.97 13.8
## 376 19.6091  0  18.1    0 0.671 7.313  97.9 1.3163  24 666    20.2 13.44 15.0
## 379 23.6482  0  18.1    0 0.671 6.380  96.2 1.3861  24 666    20.2 23.69 13.1
## 380 17.8667  0  18.1    0 0.671 6.223 100.0 1.3861  24 666    20.2 21.78 10.2
## 381 88.9762  0  18.1    0 0.671 6.968  91.9 1.4165  24 666    20.2 17.21 10.4
## 382 15.8744  0  18.1    0 0.671 6.545  99.1 1.5192  24 666    20.2 21.08 10.9
## 385 20.0849  0  18.1    0 0.700 4.368  91.2 1.4395  24 666    20.2 30.63  8.8
## 386 16.8118  0  18.1    0 0.700 5.277  98.1 1.4261  24 666    20.2 30.81  7.2
## 387 24.3938  0  18.1    0 0.700 4.652 100.0 1.4672  24 666    20.2 28.28 10.5
## 388 22.5971  0  18.1    0 0.700 5.000  89.5 1.5184  24 666    20.2 31.99  7.4
## 399 38.3518  0  18.1    0 0.693 5.453 100.0 1.4896  24 666    20.2 30.59  5.0
## 401 25.0461  0  18.1    0 0.693 5.987 100.0 1.5888  24 666    20.2 26.77  5.6
## 404 24.8017  0  18.1    0 0.693 5.349  96.0 1.7028  24 666    20.2 19.77  8.3
## 405 41.5292  0  18.1    0 0.693 5.531  85.4 1.6074  24 666    20.2 27.38  8.5
## 406 67.9208  0  18.1    0 0.693 5.683 100.0 1.4254  24 666    20.2 22.98  5.0
## 407 20.7162  0  18.1    0 0.659 4.138 100.0 1.1781  24 666    20.2 23.34 11.9
## 411 51.1358  0  18.1    0 0.597 5.757 100.0 1.4130  24 666    20.2 10.11 15.0
## 413 18.8110  0  18.1    0 0.597 4.628 100.0 1.5539  24 666    20.2 34.37 17.9
## 414 28.6558  0  18.1    0 0.597 5.155 100.0 1.5894  24 666    20.2 20.08 16.3
## 415 45.7461  0  18.1    0 0.693 4.519 100.0 1.6582  24 666    20.2 36.98  7.0
## 416 18.0846  0  18.1    0 0.679 6.434 100.0 1.8347  24 666    20.2 29.05  7.2
## 418 25.9406  0  18.1    0 0.679 5.304  89.1 1.6475  24 666    20.2 26.64 10.4
## 419 73.5341  0  18.1    0 0.679 5.957 100.0 1.8026  24 666    20.2 20.62  8.8
## 426 15.8603  0  18.1    0 0.679 5.896  95.4 1.9096  24 666    20.2 24.39  8.3
## 428 37.6619  0  18.1    0 0.679 6.202  78.7 1.8629  24 666    20.2 14.52 10.9
## 441 22.0511  0  18.1    0 0.740 5.818  92.4 1.8662  24 666    20.2 22.11 10.5

Boston[Boston$tax > quantile(Boston$tax, 0.95), ]    # Top 5% tax rates

##        crim zn indus chas   nox    rm  age    dis rad tax ptratio lstat medv
## 489 0.15086  0 27.74    0 0.609 5.454 92.7 1.8209   4 711    20.1 18.06 15.2
## 490 0.18337  0 27.74    0 0.609 5.414 98.3 1.7554   4 711    20.1 23.97  7.0
## 491 0.20746  0 27.74    0 0.609 5.093 98.0 1.8226   4 711    20.1 29.68  8.1
## 492 0.10574  0 27.74    0 0.609 5.983 98.8 1.8681   4 711    20.1 18.07 13.6
## 493 0.11132  0 27.74    0 0.609 5.983 83.5 2.1099   4 711    20.1 13.35 20.1

Boston[Boston$ptratio > quantile(Boston$ptratio, 0.95), ]  # Top 5% pupil-teacher ratios

##        crim zn indus chas   nox    rm   age     dis rad tax ptratio lstat medv
## 55  0.01360 75  4.00    0 0.410 5.888  47.6  7.3197   3 469    21.1 14.80 18.9
## 128 0.25915  0 21.89    0 0.624 5.693  96.0  1.7883   4 437    21.2 17.19 16.2
## 129 0.32543  0 21.89    0 0.624 6.431  98.8  1.8125   4 437    21.2 15.39 18.0
## 130 0.88125  0 21.89    0 0.624 5.637  94.7  1.9799   4 437    21.2 18.34 14.3
## 131 0.34006  0 21.89    0 0.624 6.458  98.9  2.1185   4 437    21.2 12.60 19.2
## 132 1.19294  0 21.89    0 0.624 6.326  97.7  2.2710   4 437    21.2 12.26 19.6
## 133 0.59005  0 21.89    0 0.624 6.372  97.9  2.3274   4 437    21.2 11.12 23.0
## 134 0.32982  0 21.89    0 0.624 5.822  95.4  2.4699   4 437    21.2 15.03 18.4
## 135 0.97617  0 21.89    0 0.624 5.757  98.4  2.3460   4 437    21.2 17.31 15.6
## 136 0.55778  0 21.89    0 0.624 6.335  98.2  2.1107   4 437    21.2 16.96 18.1
## 137 0.32264  0 21.89    0 0.624 5.942  93.5  1.9669   4 437    21.2 16.90 17.4
## 138 0.35233  0 21.89    0 0.624 6.454  98.4  1.8498   4 437    21.2 14.59 17.1
## 139 0.24980  0 21.89    0 0.624 5.857  98.2  1.6686   4 437    21.2 21.32 13.3
## 140 0.54452  0 21.89    0 0.624 6.151  97.9  1.6687   4 437    21.2 18.46 17.8
## 141 0.29090  0 21.89    0 0.624 6.174  93.6  1.6119   4 437    21.2 24.16 14.0
## 142 1.62864  0 21.89    0 0.624 5.019 100.0  1.4394   4 437    21.2 34.41 14.4
## 355 0.04301 80  1.91    0 0.413 5.663  21.9 10.5857   4 334    22.0  8.05 18.2
## 356 0.10659 80  1.91    0 0.413 5.936  19.5 10.5857   4 334    22.0  5.57 20.6

(e) Count of census tracts bounding Charles River

sum(Boston$chas == 1)

## [1] 35

The number of census tracts bounding the Charles River is 35.

(f) Compute median pupil-teacher ratio

median(Boston$ptratio)

## [1] 19.05

The median pupil-teacher ratio is found by calculating the median of the ptratio column and is equal to 19.05

(g) Find the census tract with the lowest median home value

# Locate the tract with the lowest median home value.
# which.min() returns the index of the first minimum only.
# NOTE(review): tract 406 also has medv = 5 (it appears in the top-5% crime
# table), so this shows just one of the tied lowest-value tracts.
min_medv_index <- which.min(Boston$medv)
Boston[min_medv_index, ]  # Display values for this tract

##        crim zn indus chas   nox    rm age    dis rad tax ptratio lstat medv
## 399 38.3518  0  18.1    0 0.693 5.453 100 1.4896  24 666    20.2 30.59    5

summary(Boston)  # Compare to overall predictor ranges

##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          lstat      
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   : 1.73  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.: 6.95  
##  Median : 5.000   Median :330.0   Median :19.05   Median :11.36  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :12.65  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:16.95  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :37.97  
##       medv      
##  Min.   : 5.00  
##  1st Qu.:17.02  
##  Median :21.20  
##  Mean   :22.53  
##  3rd Qu.:25.00  
##  Max.   :50.00

The suburb with the lowest median home value stands out when we compare its characteristics to other areas.

Crime rates here are significantly higher than the average for most suburbs, and there aren’t many large residential zones nearby.

This suburb has more non-retail business land compared to the average across other areas.

It’s not located near the Charles River, and nitrogen oxide levels are much higher than in other places.

It is close to the employment centers (dis = 1.49, below the first quartile of 2.10) and has the highest index of accessibility to radial highways (rad = 24), so it is not an isolated area.

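The area has a high percentage of lower-status residents (lstat = 30.59, well above the third quartile of 16.95). The ISLR2 version of Boston has no variable describing racial composition, so nothing can be concluded about it from this data.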

(h) Count of census tracts with more than 7 and 8 rooms per dwelling

sum(Boston$rm > 7)

## [1] 64

sum(Boston$rm > 8)

## [1] 13

Display census tracts with more than 8 rooms per dwelling

Boston[Boston$rm > 8, ]

##        crim zn indus chas    nox    rm  age    dis rad tax ptratio lstat medv
## 98  0.12083  0  2.89    0 0.4450 8.069 76.0 3.4952   2 276    18.0  4.21 38.7
## 164 1.51902  0 19.58    1 0.6050 8.375 93.9 2.1620   5 403    14.7  3.32 50.0
## 205 0.02009 95  2.68    0 0.4161 8.034 31.9 5.1180   4 224    14.7  2.88 50.0
## 225 0.31533  0  6.20    0 0.5040 8.266 78.3 2.8944   8 307    17.4  4.14 44.8
## 226 0.52693  0  6.20    0 0.5040 8.725 83.0 2.8944   8 307    17.4  4.63 50.0
## 227 0.38214  0  6.20    0 0.5040 8.040 86.5 3.2157   8 307    17.4  3.13 37.6
## 233 0.57529  0  6.20    0 0.5070 8.337 73.3 3.8384   8 307    17.4  2.47 41.7
## 234 0.33147  0  6.20    0 0.5070 8.247 70.4 3.6519   8 307    17.4  3.95 48.3
## 254 0.36894 22  5.86    0 0.4310 8.259  8.4 8.9067   7 330    19.1  3.54 42.8
## 258 0.61154 20  3.97    0 0.6470 8.704 86.9 1.8010   5 264    13.0  5.12 50.0
## 263 0.52014 20  3.97    0 0.6470 8.398 91.5 2.2885   5 264    13.0  5.91 48.8
## 268 0.57834 20  3.97    0 0.5750 8.297 67.0 2.4216   5 264    13.0  7.44 50.0
## 365 3.47428  0 18.10    1 0.7180 8.780 82.9 1.9047  24 666    20.2  5.29 21.9

Out of the 64 suburbs with homes featuring more than 7 rooms, and the 13 with homes having more than 8 rooms:

The suburbs with homes having more than 8 rooms generally have lower crime rates, a larger portion of land zoned for residential use, and are often located along the Charles River.

These areas also tend to have average nitrogen oxide levels and are much older than other Boston suburbs.

They are not too far from employment hubs and are located closer to major highways.

The property tax rates in these suburbs are about average for Boston, and the pupil-teacher ratios are in a healthy range.

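These suburbs have a low percentage of lower-status residents (lstat between 2.47 and 7.44, versus a median of 11.36 overall). The ISLR2 version of Boston has no variable describing racial composition, so no statement about it can be drawn from this data.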

The median home values in these suburbs are about double the average of other areas, placing them among the highest in the region.

Week-2 Lab-H515

Sai Pranam

2025-02-03