Question 7: KNN Distance Calculations

(a) Euclidean Distances

# Six training observations: predictors X1/X2 and class label Y.
df <- data.frame(
  X1 = c(0, 2, 0, 0, 1, 2),
  X2 = c(3, 0, 1, 2, 1, 3),
  Y = c("Red", "Red", "Red", "Green", "Green", "Red")
)
# Query point whose class is to be predicted.
test_point <- c(1, 2)
# Euclidean distance from every observation to the query point.
df$Distance <- with(
  df,
  sqrt((X1 - test_point[1])^2 + (X2 - test_point[2])^2)
)
# Nearest neighbours first; tied distances keep their original row order.
df <- df[order(df[["Distance"]]), ]
print(df)
##   X1 X2     Y Distance
## 4  0  2 Green 1.000000
## 5  1  1 Green 1.000000
## 1  0  3   Red 1.414214
## 3  0  1   Red 1.414214
## 6  2  3   Red 1.414214
## 2  2  0   Red 2.236068

(b) K = 1 Prediction

# With K = 1 the prediction is the label of the single closest row
# (df is sorted by ascending Distance at this point).
cat("K=1 Prediction:", head(df$Y, 1), "\n")
## K=1 Prediction: Green

(c) K = 3 Prediction

# Majority vote among the three closest rows of the distance-sorted df.
cat(
  "K=3 Prediction:",
  names(which.max(table(head(df$Y, 3)))),
  "\n"
)
## K=3 Prediction: Green

Question 9: Auto Dataset Analysis

(a) Identify quantitative vs. qualitative predictors

library(ISLR)
# Load the Auto data set, drop any rows containing missing values,
# and display the column types.
data("Auto")
Auto <- stats::na.omit(Auto)
utils::str(Auto)
## 'data.frame':    392 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : num  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : num  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : num  3504 3693 3436 3433 3449 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : num  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...

(b) Range of quantitative predictors

# `origin` is stored as a number but is a category code (qualitative; see the
# ISLR `Auto` help page), so it is excluded from the quantitative summaries.
is_quant <- vapply(Auto, is.numeric, logical(1)) & names(Auto) != "origin"
# vapply() pins the result shape: one (min, max) pair per column.
vapply(Auto[, is_quant], range, numeric(2))
##       mpg cylinders displacement horsepower weight acceleration year
## [1,]  9.0         3           68         46   1613          8.0   70
## [2,] 46.6         8          455        230   5140         24.8   82

(c) Mean and standard deviation

# `origin` is stored as a number but is a category code (qualitative; see the
# ISLR `Auto` help page), so its mean and sd are not meaningful and it is
# excluded from the quantitative summaries.
is_quant <- vapply(Auto, is.numeric, logical(1)) & names(Auto) != "origin"
# Per-column mean of the quantitative variables.
vapply(Auto[, is_quant], mean, numeric(1))
##          mpg    cylinders displacement   horsepower       weight acceleration 
##    23.445918     5.471939   194.411990   104.469388  2977.584184    15.541327 
##         year 
##    75.979592
# Per-column standard deviation of the quantitative variables.
vapply(Auto[, is_quant], sd, numeric(1))
##          mpg    cylinders displacement   horsepower       weight acceleration 
##     7.805007     1.705783   104.644004    38.491160   849.402560     2.758864 
##         year 
##     3.683737

(d) With rows 10–85 removed

# Drop observations 10 through 85 (by position) and recompute the summaries.
Auto_new <- Auto[-(10:85), ]
# `origin` is stored as a number but is a category code (qualitative; see the
# ISLR `Auto` help page), so it is excluded from the quantitative summaries.
is_quant <- vapply(Auto_new, is.numeric, logical(1)) &
  names(Auto_new) != "origin"
# Range: one (min, max) pair per quantitative column.
vapply(Auto_new[, is_quant], range, numeric(2))
##       mpg cylinders displacement horsepower weight acceleration year
## [1,] 11.0         3           68         46   1649          8.5   70
## [2,] 46.6         8          455        230   4997         24.8   82
# Mean of each quantitative column.
vapply(Auto_new[, is_quant], mean, numeric(1))
##          mpg    cylinders displacement   horsepower       weight acceleration 
##    24.404430     5.373418   187.240506   100.721519  2935.971519    15.726899 
##         year 
##    77.145570
# Standard deviation of each quantitative column.
vapply(Auto_new[, is_quant], sd, numeric(1))
##          mpg    cylinders displacement   horsepower       weight acceleration 
##     7.867283     1.654179    99.678367    35.708853   811.300208     2.693721 
##         year 
##     3.106217

(e) Scatterplot matrix and correlation

# Draw pairwise scatterplots for every column of Auto.
graphics::pairs(x = Auto)
Figure: Scatterplot matrix for the Auto dataset

# Pairwise correlations between all numerically stored columns of Auto
# (the factor column `name` is filtered out).
cor(Filter(is.numeric, Auto))
##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
## year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199
## origin        0.5652088 -0.5689316   -0.6145351 -0.4551715 -0.5850054
##              acceleration       year     origin
## mpg             0.4233285  0.5805410  0.5652088
## cylinders      -0.5046834 -0.3456474 -0.5689316
## displacement   -0.5438005 -0.3698552 -0.6145351
## horsepower     -0.6891955 -0.4163615 -0.4551715
## weight         -0.4168392 -0.3091199 -0.5850054
## acceleration    1.0000000  0.2903161  0.2127458
## year            0.2903161  1.0000000  0.1815277
## origin          0.2127458  0.1815277  1.0000000

Question 10: Boston Dataset

(a) Dimensions

library(MASS)
# Load the Boston housing data and report its size as (rows, columns).
data("Boston")
c(nrow(Boston), ncol(Boston))
## [1] 506  14

(b) Scatterplot matrix

# Pairwise scatterplots of the first six Boston columns.
pairs(Boston[seq_len(6)])
Figure: Scatterplot matrix for the Boston dataset

(c) Correlation with crime

# Correlation of per-capita crime rate with every other Boston column.
# A logical mask is used instead of `-which(...)`: if the name were ever
# absent, `-which()` would yield `-integer(0)` and silently select no columns,
# whereas the mask then simply keeps every column.
cor(Boston$crim, Boston[, names(Boston) != "crim"])
##              zn     indus        chas       nox         rm       age        dis
## [1,] -0.2004692 0.4065834 -0.05589158 0.4209717 -0.2192467 0.3527343 -0.3796701
##            rad       tax   ptratio      black     lstat       medv
## [1,] 0.6255051 0.5827643 0.2899456 -0.3850639 0.4556215 -0.3883046

(d) Outliers

# Distribution summaries for crime rate, tax rate and pupil-teacher ratio;
# compare each maximum against its quartiles to spot unusually high suburbs.
summary(Boston[["crim"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00632  0.08205  0.25651  3.61352  3.67708 88.97620
summary(Boston[["tax"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   187.0   279.0   330.0   408.2   666.0   711.0
summary(Boston[["ptratio"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   12.60   17.40   19.05   18.46   20.20   22.00

(e) Number of suburbs bounding the Charles River

# Count the rows whose Charles River indicator `chas` equals 1.
with(Boston, sum(chas == 1))
## [1] 35

(f) Median pupil-teacher ratio

# Median of the pupil-teacher ratio column across all rows.
with(Boston, median(ptratio))
## [1] 19.05

(g) Suburb with lowest median value

# Show every row that attains the minimum `medv`. `which.min()` would return
# only the first row when several are tied for the lowest value; comparing
# against min() keeps all of them. (The comparison is exact because min()
# returns one of the stored values.)
Boston[Boston$medv == min(Boston$medv), ]

(h) Suburbs averaging more than 7 and more than 8 rooms per dwelling

# Tabulate how many rows have `rm` above 7 (TRUE) versus not (FALSE).
table(Boston[["rm"]] > 7)
## 
## FALSE  TRUE 
##   442    64
# Same tabulation with the threshold raised to 8.
table(Boston[["rm"]] > 8)
## 
## FALSE  TRUE 
##   493    13