# Load MASS library
library(MASS)

# Bring the Boston housing data frame (shipped with MASS) into the workspace
data("Boston")

# Keep every column except 'chas' and 'rad'
Boston <- Boston[, setdiff(names(Boston), c("chas", "rad"))]

#summary(Boston)

# Load psych library
library(psych)
## Warning: package 'psych' was built under R version 4.1.3
# Q3: Print a table of descriptive statistics with one row per feature,
# built by psych's describe()
psych::describe(Boston)
##         vars   n   mean     sd median trimmed    mad    min    max  range  skew
## crim       1 506   3.61   8.60   0.26    1.68   0.33   0.01  88.98  88.97  5.19
## zn         2 506  11.36  23.32   0.00    5.08   0.00   0.00 100.00 100.00  2.21
## indus      3 506  11.14   6.86   9.69   10.93   9.37   0.46  27.74  27.28  0.29
## nox        4 506   0.55   0.12   0.54    0.55   0.13   0.38   0.87   0.49  0.72
## rm         5 506   6.28   0.70   6.21    6.25   0.51   3.56   8.78   5.22  0.40
## age        6 506  68.57  28.15  77.50   71.20  28.98   2.90 100.00  97.10 -0.60
## dis        7 506   3.80   2.11   3.21    3.54   1.91   1.13  12.13  11.00  1.01
## tax        8 506 408.24 168.54 330.00  400.04 108.23 187.00 711.00 524.00  0.67
## ptratio    9 506  18.46   2.16  19.05   18.66   1.70  12.60  22.00   9.40 -0.80
## black     10 506 356.67  91.29 391.44  383.17   8.09   0.32 396.90 396.58 -2.87
## lstat     11 506  12.65   7.14  11.36   11.90   7.11   1.73  37.97  36.24  0.90
## medv      12 506  22.53   9.20  21.20   21.56   5.93   5.00  50.00  45.00  1.10
##         kurtosis   se
## crim       36.60 0.38
## zn          3.95 1.04
## indus      -1.24 0.30
## nox        -0.09 0.01
## rm          1.84 0.03
## age        -0.98 1.25
## dis         0.46 0.09
## tax        -1.15 7.49
## ptratio    -0.30 0.10
## black       7.10 4.06
## lstat       0.46 0.32
## medv        1.45 0.41
# Load corrplot library
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.1.3
## corrplot 0.92 loaded
# Pairwise correlations between all remaining Boston features
# (cor() defaults to the Pearson coefficient)
corr_matrix <- cor(Boston)

# Plot the lower triangle of the correlation matrix as coloured tiles, without
# the diagonal. Variables are ordered by first principal component ("FPC"),
# and each tile is annotated with its coefficient expressed as a percentage.
# addCoef.col is a colour argument: pass an explicit colour instead of relying
# on the logical TRUE being coerced to palette index 1 (black by default).
corrplot(
  corr_matrix,
  method = "color",
  type = "lower",
  order = "FPC",
  diag = FALSE,
  addCoef.col = "black",
  addCoefasPercent = TRUE
)

# Rank every feature by the strength of its linear relationship with 'medv'.
# The coefficients are squared so that strong negative correlations rank
# alongside strong positive ones.
correlations <- cor(Boston)
corr_mdev <- correlations[, "medv"]^2
corr_mdev <- sort(corr_mdev, decreasing = TRUE)

# 'medv' correlates perfectly with itself (squared value of 1), so exclude it
# by name rather than assuming it occupies the first position after sorting.
ranked_features <- setdiff(names(corr_mdev), "medv")
stopifnot(length(ranked_features) >= 2L)

# Names of the two features most strongly correlated with 'medv'
highest_corr_feature <- ranked_features[1]
second_highest_corr_feature <- ranked_features[2]

# Scatter plot of the most strongly correlated feature vs. 'medv'
plot(
  Boston[[highest_corr_feature]], Boston$medv,
  xlab = highest_corr_feature, ylab = "medv"
)

# Scatter plot of the second most strongly correlated feature vs. 'medv'
plot(
  Boston[[second_highest_corr_feature]], Boston$medv,
  xlab = second_highest_corr_feature, ylab = "medv"
)