library(ISLR)
## Warning: package 'ISLR' was built under R version 4.3.2
library(ggplot2)
data(Auto)
# Report whether any cell of Auto is missing.
Auto1 <- any(is.na(Auto))
if (Auto1) {
  cat("there are missing values")
} else {
  # The `else` matters: a bare `{ ... }` after the `if` is a separate
  # statement and would print this message unconditionally.
  cat("No missing values")
}
## No missing values
Q9 a. Quantitative and Qualitative predictors
# Inspect the column types to separate quantitative from qualitative
# predictors: numeric columns are reported as `num`, `name` as a Factor.
str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : num 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : num 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : num 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
# The message is assembled with paste0() so the source can wrap without
# putting a line break inside the string literal (the earlier version
# printed "weig\nht").
# NOTE(review): `origin` is stored as a number, but it looks like a coded
# category rather than a measurement -- confirm against the ISLR `Auto`
# documentation whether it belongs with the qualitative predictors.
print(paste0(
  "The following are quantitative predictors: mpg, cylinders, displacement, ",
  "horsepower, weight, acceleration, year, origin"
))
## [1] "The following are quantitative predictors: mpg, cylinders, displacement, horsepower, weight, acceleration, year, origin"
print("The following are qualitative predictors: name")
## [1] "The following are qualitative predictors: name"
Q9 b. Range of each quantitative predictor
# Logical flag per column of Auto: TRUE where the column is numeric.
quantitative_predictors <- vapply(Auto, is.numeric, logical(1))
# Getting the range of each flagged column: one column per predictor,
# first row the minimum, second row the maximum.
quantitative_ranges <- vapply(
  Auto[, quantitative_predictors],
  range,
  numeric(2)
)
# Print a heading followed by the range matrix.
cat("Range of Quantitative Predictors:\n")
print(quantitative_ranges)
Q9 c. Mean and standard deviation of each quantitative predictor
# Mean and standard deviation of each quantitative predictor in Auto,
# printed one predictor per line.
# The columns are listed explicitly so the report does not depend on any
# variable defined elsewhere in the script.
quantitative_columns <- c(
  "mpg", "cylinders", "displacement", "horsepower",
  "weight", "acceleration", "year", "origin"
)
for (column in quantitative_columns) {
  values <- Auto[[column]]
  # The trailing "\n" keeps each predictor on its own output line.
  cat(
    "Mean of", column, "and Std. Deviation of", column, "are",
    mean(values), "and", sd(values), "\n"
  )
}
Q9 d.Now remove the 10th through 85th observations. What is the range, mean, and standard deviation of each predictor in the subset of the data that remains?
# Drop observations 10 through 85 (negative indexing removes those rows)
# and summarise what remains.
subset_auto <- Auto[-(10:85), ]

# Columns to report, in output order.
# NOTE(review): `origin` is numeric and is reported in part (c) but is not
# included here; add "origin" to this vector if it should be covered too.
subset_columns <- c(
  "mpg", "cylinders", "displacement", "horsepower",
  "weight", "acceleration", "year"
)

# Display the results: for each column print min, max, mean and standard
# deviation on one line. range() contributes the first two numbers.
for (column in subset_columns) {
  values <- subset_auto[[column]]
  cat(
    "Range, Mean, and Standard Deviation for", column, "in the subset:",
    range(values), mean(values), sd(values), "\n"
  )
}
## Range, Mean, and Standard Deviation for mpg in the subset: 11 46.6 24.40443 7.867283
## Range, Mean, and Standard Deviation for cylinders in the subset: 3 8 5.373418 1.654179
## Range, Mean, and Standard Deviation for displacement in the subset: 68 455 187.2405 99.67837
## Range, Mean, and Standard Deviation for horsepower in the subset: 46 230 100.7215 35.70885
## Range, Mean, and Standard Deviation for weight in the subset: 1649 4997 2935.972 811.3002
## Range, Mean, and Standard Deviation for acceleration in the subset: 8.5 24.8 15.7269 2.693721
## Range, Mean, and Standard Deviation for year in the subset: 70 82 77.14557 3.106217
Q9 e.Using the full data set, investigate the predictors graphically, using scatterplots or other tools of your choice. Create some plots highlighting the relationships among the predictors. Comment on your findings.
# MPG against vehicle weight.
ggplot(data = Auto, mapping = aes(x = weight, y = mpg)) +
  geom_point() +
  ggtitle("Scatterplot of MPG vs. Weight") +
  xlab("Weight") +
  ylab("MPG") +
  theme_classic()
# Acceleration against horsepower.
ggplot(data = Auto, mapping = aes(x = horsepower, y = acceleration)) +
  geom_point() +
  ggtitle("Scatterplot of Horsepower vs. Acceleration") +
  xlab("Horsepower") +
  ylab("Acceleration") +
  theme_classic()
# MPG against horsepower, drawn as one panel per cylinder count
# (a faceted scatterplot rather than a true scatterplot matrix).
ggplot(data = Auto, mapping = aes(x = horsepower, y = mpg)) +
  geom_point() +
  facet_wrap(~cylinders) +
  ggtitle("Scatterplot Matrix of MPG vs. Horsepower (Faceted by Cylinders)") +
  xlab("Horsepower") +
  ylab("MPG") +
  theme_classic()
Main trend here: there is a negative correlation between weight and mileage (mpg).
Q9 f. Suppose that we wish to predict gas mileage (mpg) on the basis of the
other variables. Do your plots suggest that any of the other variables
might be useful in predicting mpg? Justify your answer.
When evaluating the potential predictive value of various variables in relation to gas mileage (MPG), an analysis of scatterplots reveals certain observations:
In the context of MPG vs. Weight, a discernible negative correlation is evident. Lighter vehicles generally exhibit higher MPG, suggesting that weight could be a valuable predictor of fuel efficiency.
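Turning attention to the relationship between Horsepower and Acceleration, no clear pattern or trend emerges. This lack of a distinct association implies that acceleration may not serve as a robust predictor of MPG. Note, however, that this plot does not include MPG itself, so a plot of MPG against acceleration would be needed to assess acceleration as a predictor directly.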
A more nuanced perspective is gained when examining MPG vs. Horsepower, particularly when faceted by Cylinders. In certain cylinder categories, a negative correlation between horsepower and MPG is observed. The faceted scatterplot offers insights into the variability of this relationship across different cylinder levels.
Considering the interaction with the cylinders variable, Horsepower emerges as a potentially valuable predictor of MPG. The negative relationship between horsepower and MPG, especially within specific cylinder categories, suggests its potential utility in predicting fuel efficiency.
Q10
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.3.2
##
## Attaching package: 'ISLR2'
## The following object is masked _by_ '.GlobalEnv':
##
## Auto
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
Q10 a. Number of rows and columns of the Boston dataset?
# dim() returns c(number of rows, number of columns), which is what the
# question asks for; summary() alone only shows per-column statistics.
dim(Boston)
# Per-column five-number summary and mean, kept as an overview.
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio lstat
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 1.73
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.: 6.95
## Median : 5.000 Median :330.0 Median :19.05 Median :11.36
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :12.65
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:16.95
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :37.97
## medv
## Min. : 5.00
## 1st Qu.:17.02
## Median :21.20
## Mean :22.53
## 3rd Qu.:25.00
## Max. :50.00
Q10 b. Make some pairwise scatterplots of the predictors (columns) in this data set. Describe your findings.
# Draw every pairwise scatterplot of the Boston columns in one grid.
graphics::pairs(Boston)
We can observe that there are correlations between variables.
Q10 c. Are any of the predictors associated with per capita crime rate? If so, explain the relationship.
# Correlate per capita crime rate (crim) with every other column of Boston
# and collect the estimate and p-value of each test in one data frame.
crime_column <- "crim"
other_columns <- setdiff(colnames(Boston), crime_column)

# One cor.test() per predictor, each returned as a one-row data frame.
# Only the crim pairs are computed, instead of scanning every column pair.
result_rows <- lapply(other_columns, function(predictor) {
  correlation_test <- cor.test(Boston[[crime_column]], Boston[[predictor]])
  data.frame(
    i = crime_column,
    j = predictor,
    # $estimate is a named value; its name ("cor") becomes the row label.
    cor = correlation_test$estimate,
    p = correlation_test$p.value,
    stringsAsFactors = FALSE
  )
})

# Bind all rows in a single call rather than growing the data frame inside
# a loop; rbind() de-duplicates the row labels (cor, cor1, cor2, ...).
result_df <- do.call(rbind, result_rows)
# Print the result data frame
print(result_df)
## i j cor p
## cor crim zn -0.20046922 5.506472e-06
## cor1 crim indus 0.40658341 1.450349e-21
## cor2 crim chas -0.05589158 2.094345e-01
## cor3 crim nox 0.42097171 3.751739e-23
## cor4 crim rm -0.21924670 6.346703e-07
## cor5 crim age 0.35273425 2.854869e-16
## cor6 crim dis -0.37967009 8.519949e-19
## cor7 crim rad 0.62550515 2.693844e-56
## cor8 crim tax 0.58276431 2.357127e-47
## cor9 crim ptratio 0.28994558 2.942922e-11
## cor10 crim lstat 0.45562148 2.654277e-27
## cor11 crim medv -0.38830461 1.173987e-19
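Yes. Per capita crime rate is significantly correlated with every predictor except chas (p = 0.21). The strongest positive associations are with rad (0.63), tax (0.58), lstat (0.46), nox (0.42) and indus (0.41); crime rate is negatively associated with medv (-0.39), dis (-0.38), rm (-0.22) and zn (-0.20).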
Q10 d. Do any of the census tracts of Boston appear to have particularly high crime rates? Tax rates? Pupil-teacher ratios? Comment on the range of each predictor.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Predictors examined in this part. The vector records the selection; the
# plots below name each column directly.
selected_predictors <- c("crim", "tax", "ptratio")
# Distribution of per capita crime rate as a single box (constant x).
ggplot(data = Boston, mapping = aes(x = 1, y = crim)) +
  geom_boxplot() +
  ggtitle("Boxplot of Crime Rates") +
  xlab("") +
  ylab("Crime Rate")
# Distribution of the tax rate.
ggplot(data = Boston, mapping = aes(x = 1, y = tax)) +
  geom_boxplot() +
  ggtitle("Boxplot of Tax Rates") +
  xlab("") +
  ylab("Tax Rate")
# Distribution of the pupil-teacher ratio.
ggplot(data = Boston, mapping = aes(x = 1, y = ptratio)) +
  geom_boxplot() +
  ggtitle("Boxplot of Pupil-Teacher Ratios") +
  xlab("") +
  ylab("Pupil-Teacher Ratio")
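1. Indications from the boxplot point to relatively modest crime rates in most census tracts (median 0.26, third quartile 3.68), but the distribution is strongly right-skewed: a number of tracts deviate sharply from the norm, with rates ranging up to 88.98.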
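2. Tax rates range from 187 to 711 with a median of 330, so the typical tract is on the lower side. The third quartile is 666, close to the maximum, which indicates a large group of tracts sharing a high tax rate; because the interquartile range is this wide (279 to 666), no tract falls outside the whiskers at either extreme.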
3. The pupil-teacher ratio ranges from 12.6 to 22.0, with a median of 19.05 and the middle half of the tracts between 17.4 and 20.2. No tract has an unusually high ratio, but the distribution has a tail toward low values: the minimum of 12.6 lies below the lower whisker (17.4 - 1.5 x 2.8 = 13.2), so a small number of tracts have particularly low pupil-teacher ratios.
Q10 e. How many of the census tracts in this data set bound the Charles river?
# Count the tracts whose chas indicator equals 1.
sum(Boston[["chas"]] == 1)
## [1] 35
Q10 f. What is the median pupil-teacher ratio among the towns in this data set?
# Median of the ptratio column, printed after a label.
median_ptratio <- median(Boston[["ptratio"]])
cat("median pupil-teacher ratio among the towns :", median_ptratio)
## median pupil-teacher ratio among the towns : 19.05
Q10 g. Which census tract of Boston has the lowest median value of owner-occupied homes? What are the values of the other predictors for that census tract, and how do those values compare to the overall ranges for those predictors? Comment on your findings.
# Reorder the tracts by ascending medv so the lowest-valued one is first.
selection <- Boston[order(Boston[["medv"]]), ]
# Show only the first row of the ordered data.
# NOTE(review): if several tracts tie for the minimum medv, only the first
# one is shown here -- confirm whether ties need to be reported.
head(selection, n = 1)
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 399 38.3518 0 18.1 0 0.693 5.453 100 1.4896 24 666 20.2 30.59 5
Suburb 399 holds the least median value among owner-occupied residences.
# `selection` holds every Boston row (only reordered), so this prints the
# overall per-column ranges to compare the lowest-medv tract against.
summary(object = selection)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio lstat
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 1.73
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.: 6.95
## Median : 5.000 Median :330.0 Median :19.05 Median :11.36
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :12.65
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:16.95
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :37.97
## medv
## Min. : 5.00
## 1st Qu.:17.02
## Median :21.20
## Mean :22.53
## 3rd Qu.:25.00
## Max. :50.00
Neighborhood #399 exhibits an exceptionally elevated crime rate in stark contrast to the median and average rates across all Boston neighborhoods. Additionally, over 50% of Boston neighborhoods share the characteristic of lacking residential land zoned for lots exceeding 25,000 sq.ft. In comparison to the majority of suburbs, this area stands out for its notably high proportion of non-retail business acres per town. It’s worth noting that this suburb does not fall within the category of those bordering the Charles River. Furthermore, the concentration of nitrogen oxides, measured in parts per 10 million, ranks among the highest in this particular neighborhood.
Q10 h. In this data set, how many of the census tracts average more than seven rooms per dwelling? More than eight rooms per dwelling? Comment on the census tracts that average more than eight rooms per dwelling.
# Keep the tracts whose average rooms per dwelling (rm) exceeds 7,
# then count them.
rm_over_7 <- Boston[which(Boston[["rm"]] > 7), ]
nrow(rm_over_7)
## [1] 64
There are 64 suburbs having more than 7 rooms per dwelling!
# Keep the tracts whose average rooms per dwelling (rm) exceeds 8,
# then count them.
rm_over_8 <- Boston[which(Boston[["rm"]] > 8), ]
nrow(rm_over_8)
## [1] 13
There are 13 suburbs having more than 8 rooms per dwelling
# Per-column summary restricted to the tracts averaging more than 8 rooms.
summary(object = rm_over_8)
## crim zn indus chas
## Min. :0.02009 Min. : 0.00 Min. : 2.680 Min. :0.0000
## 1st Qu.:0.33147 1st Qu.: 0.00 1st Qu.: 3.970 1st Qu.:0.0000
## Median :0.52014 Median : 0.00 Median : 6.200 Median :0.0000
## Mean :0.71879 Mean :13.62 Mean : 7.078 Mean :0.1538
## 3rd Qu.:0.57834 3rd Qu.:20.00 3rd Qu.: 6.200 3rd Qu.:0.0000
## Max. :3.47428 Max. :95.00 Max. :19.580 Max. :1.0000
## nox rm age dis
## Min. :0.4161 Min. :8.034 Min. : 8.40 Min. :1.801
## 1st Qu.:0.5040 1st Qu.:8.247 1st Qu.:70.40 1st Qu.:2.288
## Median :0.5070 Median :8.297 Median :78.30 Median :2.894
## Mean :0.5392 Mean :8.349 Mean :71.54 Mean :3.430
## 3rd Qu.:0.6050 3rd Qu.:8.398 3rd Qu.:86.50 3rd Qu.:3.652
## Max. :0.7180 Max. :8.780 Max. :93.90 Max. :8.907
## rad tax ptratio lstat medv
## Min. : 2.000 Min. :224.0 Min. :13.00 Min. :2.47 Min. :21.9
## 1st Qu.: 5.000 1st Qu.:264.0 1st Qu.:14.70 1st Qu.:3.32 1st Qu.:41.7
## Median : 7.000 Median :307.0 Median :17.40 Median :4.14 Median :48.3
## Mean : 7.462 Mean :325.1 Mean :16.36 Mean :4.31 Mean :44.2
## 3rd Qu.: 8.000 3rd Qu.:307.0 3rd Qu.:17.40 3rd Qu.:5.12 3rd Qu.:50.0
## Max. :24.000 Max. :666.0 Max. :20.20 Max. :7.44 Max. :50.0
Census tracts with an average of more than eight rooms per dwelling typically exhibit characteristics such as higher affluence (high medv, low lstat), somewhat older housing stock (age describes the dwellings, not the residents), and larger residences. Concurrently, these areas demonstrate a heightened level of safety (lower crime rates) and a reduced presence of industrial activities.