# Attach ISLR2 so that the Auto data set is available
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.2
# Preview the first five rows of Auto
head(Auto, n = 5)
Response: The quantitative predictors are mpg, cylinders, displacement, horsepower, weight, acceleration, and year. The qualitative predictors are origin and name.
# Check whether Auto contains any missing values before summarising it
anyNA(Auto)
## [1] FALSE
# FALSE above: there is nothing to remove from this dataset

# Print the range (minimum and maximum) of each quantitative predictor.
# The vector maps a column name to the exact label printed in front of
# that column's range.
range_labels <- c(
  mpg = "mpg range: ",
  cylinders = "cylinders range: ",
  displacement = "Displacement range: ",
  horsepower = "Horsepower range: ",
  weight = "Weight range: ",
  acceleration = "Acceleration range: ",
  year = "Year range: "
)
for (col_name in names(range_labels)) {
  cat(range_labels[[col_name]], range(Auto[[col_name]]), "\n")
}
## mpg range: 9 46.6
## cylinders range: 3 8
## Displacement range: 68 455
## Horsepower range: 46 230
## Weight range: 1613 5140
## Acceleration range: 8 24.8
## Year range: 70 82
Response: The above are the ranges for the various quantitative predictors.
# Mean and standard deviation of every quantitative predictor in Auto.
# origin and name are qualitative, so they are the columns to leave out
# (the vector holds the excluded columns, hence its name).
qualitative_cols <- c("origin", "name")
quantitative_data <- Auto[, !names(Auto) %in% qualitative_cols]

cat("Mean of quantitative predictors\n")
## Mean of quantitative predictors
# vapply() guarantees one numeric value per column
vapply(quantitative_data, mean, numeric(1), na.rm = TRUE)
## mpg cylinders displacement horsepower weight acceleration
## 23.445918 5.471939 194.411990 104.469388 2977.584184 15.541327
## year
## 75.979592

# Standard deviation of the same columns
cat("\n\nStandard Deviations for quantitative predictors\n")
##
##
## Standard Deviations for quantitative predictors
vapply(quantitative_data, sd, numeric(1), na.rm = TRUE)
## mpg cylinders displacement horsepower weight acceleration
## 7.805007 1.705783 104.644004 38.491160 849.402560 2.758864
## year
## 3.683737
Response: The means and standard deviations of the quantitative predictors are shown above.
# Drop the 10th through 85th observations, then recompute the mean and
# standard deviation of every quantitative predictor on what remains.
Auto_subset <- Auto[-(10:85), ]

# origin and name are qualitative, so they are excluded from the summaries
qualitative_cols <- c("origin", "name")
# Select the columns from Auto_subset (not Auto) so the removed rows are
# really left out of the statistics.
subset_quantitative <- Auto_subset[, !names(Auto_subset) %in% qualitative_cols]

cat("Mean of quantitative predictors\n")
## Mean of quantitative predictors
vapply(subset_quantitative, mean, numeric(1), na.rm = TRUE)
# NOTE(review): the output previously recorded here was computed on the
# full Auto data and matched the full-data summary exactly; re-run this
# chunk to record the subset values.

# Standard deviation of the same subset columns
cat("\n\nStandard Deviations for quantitative predictors\n")
##
##
## Standard Deviations for quantitative predictors
vapply(subset_quantitative, sd, numeric(1), na.rm = TRUE)
# NOTE(review): re-run to record the subset standard deviations.
# Horsepower (y-axis) against miles per gallon (x-axis).
# mpg is passed as x and horsepower as y so that the plotted data agree
# with the axis labels and the title.
plot(
  Auto$mpg,
  Auto$horsepower,
  col = "blue",
  pch = 19,
  xlab = "Miles Per Gallon",
  ylab = "Horsepower",
  main = "Horsepower vs MPG"
)
# Relationship between car weight (x-axis) and horsepower (y-axis).
# weight is passed as x and horsepower as y so that the plotted data agree
# with the axis labels and the title.
plot(
  Auto$weight,
  Auto$horsepower,
  pch = 19,
  col = "blue",
  xlab = "Weight",
  ylab = "Horsepower",
  main = "Horsepower vs. Weight"
)
# mpg (y-axis) against weight (x-axis), with a lowess smoother on top.
# weight is passed as x and mpg as y so that the plotted data agree with
# the axis labels and the title.
plot(
  Auto$weight,
  Auto$mpg,
  pch = 19,
  col = "blue",
  xlab = "Weight",
  ylab = "mpg",
  main = "mpg vs. Weight"
)
# The smoother uses the same x/y order as the scatterplot
lines(lowess(Auto$weight, Auto$mpg), col = "red", lwd = 2)
# mpg (y-axis) against acceleration (x-axis), with a lowess smoother.
# acceleration is passed as x and mpg as y so that the plotted data agree
# with the axis labels and the title.
plot(
  Auto$acceleration,
  Auto$mpg,
  pch = 19,
  col = "blue",
  xlab = "acceleration",
  ylab = "mpg",
  main = "mpg vs. acceleration"
)
# The smoother uses the same x/y order as the scatterplot
lines(lowess(Auto$acceleration, Auto$mpg), col = "red", lwd = 2)
# mpg (y-axis) against horsepower (x-axis), with a lowess smoother.
# horsepower is passed as x and mpg as y so that the plotted data agree
# with the axis labels and the title.
plot(
  Auto$horsepower,
  Auto$mpg,
  pch = 19,
  col = "blue",
  xlab = "horsepower",
  ylab = "mpg",
  main = "mpg vs. horsepower"
)
# The smoother uses the same x/y order as the scatterplot
lines(lowess(Auto$horsepower, Auto$mpg), col = "red", lwd = 2)
# The dataset documentation can be opened with the help call below
#?Boston
# Per capita crime rate (y-axis) against median home value (x-axis).
# medv is passed as x and crim as y so that the plotted data agree with
# the axis labels.
boston_ds <- Boston
plot(
  boston_ds$medv,
  boston_ds$crim,
  pch = 19,
  col = "blue",
  xlab = "Median Home Value",
  ylab = "Per Capita Crime Rate"
)
# The smoother uses the same x/y order as the scatterplot
lines(lowess(boston_ds$medv, boston_ds$crim), col = "red", lwd = 2)
# Nitrogen oxides concentration (y-axis) against the proportion of
# non-retail business areas per town (x-axis).
# indus is passed as x and nox as y so that the plotted data agree with
# the axis labels.
boston_ds <- Boston
plot(
  boston_ds$indus,
  boston_ds$nox,
  pch = 19,
  col = "blue",
  xlab = "industrial areas proportion per town",
  ylab = "Nitrogen Oxides concentrations"
)
# The smoother uses the same x/y order as the scatterplot
lines(lowess(boston_ds$indus, boston_ds$nox), col = "red", lwd = 2)
# Range (minimum and maximum) of three Boston predictors:
# crime rate, tax rate, and pupil-teacher ratio.
# The vector maps a column name to the exact label printed before its range.
boston_range_labels <- c(
  crim = "Crime Rate Range:\t",
  tax = "Tax Rate Range:\t",
  ptratio = "Pupil-teacher ratio Range:\t"
)
for (col_name in names(boston_range_labels)) {
  cat(boston_range_labels[[col_name]])
  # Explicit print() because values are not auto-printed inside a loop
  print(range(boston_ds[[col_name]]))
}
## Crime Rate Range:
## [1] 0.00632 88.97620
## Tax Rate Range:
## [1] 187 711
## Pupil-teacher ratio Range:
## [1] 12.6 22.0
For crime rate, the value ranges from 0.00632 to 88.97620 suggesting that there are areas with very low crime rate, and others with extremely high crime rate.
For tax rate, the value ranges from 187 to 711, indicating also a large disparity in tax rate in Boston homes.
For pupil-teacher ratio, the value ranges from 12.6 to 22.0. The difference is quite glaring, i.e. roughly 12 students per teacher vs. 22 students per teacher. This means that some neighborhoods in Boston have crowded classrooms, which could imply that students there don't get as much teacher support.
Overall, for the predictors that we have considered above, Boston exhibits significant variation across its census tracts.
# chas is 1 for a tract that borders the Charles River and 0 otherwise,
# so summing the column counts the tracts that border the river.
cat("Census tracts bounded by the Charles River:\t")
## Census tracts bounded by the Charles River:
sum(boston_ds$chas)
## [1] 35
# Median pupil-teacher ratio across all census tracts
cat("Median pupil-teacher ratio:\t")
## Median pupil-teacher ratio:
median(boston_ds$ptratio)
## [1] 19.05
# Find the row index of the census tract with the lowest median home
# value (medv), then display every column of that tract.
# which.min() returns the index of the first minimum it encounters.
min_medv_index <- which.min(boston_ds[["medv"]])
boston_ds[min_medv_index, ]
The above census tract has the lowest median value of owner-occupied homes. This census tract has a tax rate of 666, which is near the maximum of the range (187, 711). Ideally, you would expect neighborhoods with low median home values to have lower tax rates.
For the pupil-teacher ratio, the census tract has a value of 20.2, again near the maximum of the range (12.6, 22.0). This suggests crowded classrooms.
Lastly, for crime rate, this census tract has a relatively high per capita crime rate of about 38, within a range of (0.00632, 88.97620). This, too, was expected, since neighborhoods with low median home values tend to be associated with higher crime rates.
# rm column of the Boston data, reused for both counts below
rooms <- boston_ds$rm
# Count the census tracts whose rm value exceeds 7
cat("number of census tracts with more than 7 rooms:\t")
## number of census tracts with more than 7 rooms:
sum(rooms > 7)
## [1] 64
# Count the census tracts whose rm value exceeds 8
cat("\nnumber of census tracts with more than 8 rooms:\t")
##
## number of census tracts with more than 8 rooms:
sum(rooms > 8)
## [1] 13