library(ISLR)
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
library(ggplot2)
library(corrplot)
## corrplot 0.95 loaded
# Load Auto from ISLR2 explicitly: both ISLR and ISLR2 are attached and each
# ships an Auto data set (ISLR2 masks ISLR, see the attach message above).
data("Auto", package = "ISLR2")
# Preview the first six rows.
head(Auto, n = 6L)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
# Drop any rows that contain missing values before computing statistics.
Auto <- na.omit(Auto)
# Inspect column types; the output below reports 392 observations of
# 9 variables and an "na.action" attribute naming five omitted rows.
str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
## ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
# a
# Classify the Auto columns: numeric measurements versus categorical labels.
quantitative_vars <- c(
  "mpg", "cylinders", "displacement", "horsepower",
  "weight", "acceleration", "year"
)
qualitative_vars <- c("origin", "name")
# Report each group on a single space-separated line.
cat("Quantitative Variables:", paste(quantitative_vars, collapse = " "), "\n")
## Quantitative Variables: mpg cylinders displacement horsepower weight acceleration year
cat("Qualitative Variables:", paste(qualitative_vars, collapse = " "), "\n")
## Qualitative Variables: origin name
#b
# Min (row 1) and max (row 2) of every quantitative column.
vapply(Auto[quantitative_vars], range, numeric(2))
## mpg cylinders displacement horsepower weight acceleration year
## [1,] 9.0 3 68 46 1613 8.0 70
## [2,] 46.6 8 455 230 5140 24.8 82
#c
# Mean and standard deviation of each quantitative column, one row per
# variable (row names come from the column names).
summary_stats <- local({
  quant_cols <- Auto[quantitative_vars]
  data.frame(
    Mean = vapply(quant_cols, mean, numeric(1)),
    Std_Dev = vapply(quant_cols, sd, numeric(1))
  )
})
summary_stats
## Mean Std_Dev
## mpg 23.445918 7.805007
## cylinders 5.471939 1.705783
## displacement 194.411990 104.644004
## horsepower 104.469388 38.491160
## weight 2977.584184 849.402560
## acceleration 15.541327 2.758864
## year 75.979592 3.683737
# d
# Drop observations 10 through 85 and keep the quantitative columns.
# Columns are selected by name through quantitative_vars so that this part
# summarises the same variables as parts (b) and (c): the previous positional
# index -c(4, 9) silently dropped horsepower (column 4) and kept origin,
# which is classified as qualitative above.
# NOTE: the variable is still called `subset` in case later code refers to
# it; calls to the base function subset() keep working because R skips
# non-function objects when looking up a function name.
subset <- Auto[-(10:85), quantitative_vars]
# Min (row 1) and max (row 2) of each remaining quantitative column.
sapply(subset, range)
## mpg cylinders displacement horsepower weight acceleration year
## [1,] 11.0 3 68 46 1649 8.5 70
## [2,] 46.6 8 455 230 4997 24.8 82
#e
# Scatterplot matrix of every column except `name`. `name` is a factor with
# 304 levels; pairs() would coerce it to integer level codes and draw panels
# that carry no information while shrinking the useful ones.
pairs(Auto[, setdiff(names(Auto), "name")])
From the plots we can see that a car's mpg decreases as its displacement
and number of cylinders increase. Newer models also have higher mpg than
older ones; this can be seen in the panel that plots mpg against year.
# f
# Make sure horsepower is stored as a double before correlating.
Auto[["horsepower"]] <- as.numeric(Auto[["horsepower"]])
# Pearson correlation of horsepower with weight ...
with(Auto, cor(weight, horsepower))
## [1] 0.8645377
# ... and with engine displacement.
with(Auto, cor(displacement, horsepower))
## [1] 0.897257
#10
# a
# Load the Boston housing data; of the two attached ISLR packages only
# ISLR2 provides it.
data("Boston", package = "ISLR2")
# Number of rows and columns.
dim(Boston)
## [1] 506 13
# Column names and storage types.
str(object = Boston)
## 'data.frame': 506 obs. of 13 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
#b
# Scatterplot matrix for a handful of columns of interest.
pairs(Boston[c("crim", "tax", "rm", "medv", "ptratio")])
# Correlations between every pair of columns, drawn as a colour-coded grid.
cor_matrix <- cor(x = Boston)
corrplot(corr = cor_matrix, method = "color")
#c
# Correlation of per-capita crime rate with every other column
# (crim is the first column, so it is the one left out of x).
cor(x = Boston[names(Boston) != "crim"], y = Boston[["crim"]])
## [,1]
## zn -0.20046922
## indus 0.40658341
## chas -0.05589158
## nox 0.42097171
## rm -0.21924670
## age 0.35273425
## dis -0.37967009
## rad 0.62550515
## tax 0.58276431
## ptratio 0.28994558
## lstat 0.45562148
## medv -0.38830461
# Scatterplot matrix restricted to rows with a crime rate below 20.
pairs(with(Boston, Boston[crim < 20, ]))
#d
# Rows strictly above the 90th percentile of crime rate, tax rate and
# pupil-teacher ratio respectively.
high_crime <- with(Boston, Boston[crim > quantile(crim, probs = 0.90), ])
high_tax <- with(Boston, Boston[tax > quantile(tax, probs = 0.90), ])
high_ptratio <- with(Boston, Boston[ptratio > quantile(ptratio, probs = 0.90), ])
# Five-number summary plus mean for the three columns, to show their spread.
summary(Boston[c("crim", "tax", "ptratio")])
## crim tax ptratio
## Min. : 0.00632 Min. :187.0 Min. :12.60
## 1st Qu.: 0.08205 1st Qu.:279.0 1st Qu.:17.40
## Median : 0.25651 Median :330.0 Median :19.05
## Mean : 3.61352 Mean :408.2 Mean :18.46
## 3rd Qu.: 3.67708 3rd Qu.:666.0 3rd Qu.:20.20
## Max. :88.97620 Max. :711.0 Max. :22.00
#e
# Count the rows whose chas indicator equals 1.
length(which(Boston[["chas"]] == 1))
## [1] 35
#f
# Median pupil-teacher ratio across all rows.
median(Boston[["ptratio"]])
## [1] 19.05
#g
# Rows whose medv equals the smallest medv in the data set.
lowest_medv <- with(Boston, Boston[medv == min(medv), ])
print(x = lowest_medv)
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 399 38.3518 0 18.1 0 0.693 5.453 100 1.4896 24 666 20.2 30.59 5
## 406 67.9208 0 18.1 0 0.693 5.683 100 1.4254 24 666 20.2 22.98 5
# Overall distribution of every column, for comparison with the rows above.
summary(object = Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio lstat
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 1.73
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.: 6.95
## Median : 5.000 Median :330.0 Median :19.05 Median :11.36
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :12.65
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:16.95
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :37.97
## medv
## Min. : 5.00
## 1st Qu.:17.02
## Median :21.20
## Mean :22.53
## 3rd Qu.:25.00
## Max. :50.00
#h
# Number of rows with rm above 7 ...
sum(Boston[["rm"]] > 7)
## [1] 64
# ... and above 8.
sum(Boston[["rm"]] > 8)
## [1] 13
# Full records for the rows with rm above 8.
Boston[Boston[["rm"]] > 8, , drop = FALSE]
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 98 0.12083 0 2.89 0 0.4450 8.069 76.0 3.4952 2 276 18.0 4.21 38.7
## 164 1.51902 0 19.58 1 0.6050 8.375 93.9 2.1620 5 403 14.7 3.32 50.0
## 205 0.02009 95 2.68 0 0.4161 8.034 31.9 5.1180 4 224 14.7 2.88 50.0
## 225 0.31533 0 6.20 0 0.5040 8.266 78.3 2.8944 8 307 17.4 4.14 44.8
## 226 0.52693 0 6.20 0 0.5040 8.725 83.0 2.8944 8 307 17.4 4.63 50.0
## 227 0.38214 0 6.20 0 0.5040 8.040 86.5 3.2157 8 307 17.4 3.13 37.6
## 233 0.57529 0 6.20 0 0.5070 8.337 73.3 3.8384 8 307 17.4 2.47 41.7
## 234 0.33147 0 6.20 0 0.5070 8.247 70.4 3.6519 8 307 17.4 3.95 48.3
## 254 0.36894 22 5.86 0 0.4310 8.259 8.4 8.9067 7 330 19.1 3.54 42.8
## 258 0.61154 20 3.97 0 0.6470 8.704 86.9 1.8010 5 264 13.0 5.12 50.0
## 263 0.52014 20 3.97 0 0.6470 8.398 91.5 2.2885 5 264 13.0 5.91 48.8
## 268 0.57834 20 3.97 0 0.5750 8.297 67.0 2.4216 5 264 13.0 7.44 50.0
## 365 3.47428 0 18.10 1 0.7180 8.780 82.9 1.9047 24 666 20.2 5.29 21.9