require(ggthemes)
library(tidyverse)
library(magrittr)
library(TTR)
library(tidyr)
library(dplyr)
library(lubridate)
library(ggplot2)
library(plotly)
library(fpp2)
library(forecast)
library(caTools)
library(reshape2)
library(psych)
require(graphics)
require(Matrix)
library(corrplot)
library(mltools)
library(fBasics)
library(kableExtra)
library(DMwR)
library(caret)
library(gridExtra)
library(DAAG)
df <- read.csv(file="uscrime.csv",stringsAsFactors = F, header=T)
head(df,2)
## M So Ed Po1 Po2 LF M.F Pop NW U1 U2 Wealth Ineq Prob
## 1 15.1 1 9.1 5.8 5.6 0.510 95.0 33 30.1 0.108 4.1 3940 26.1 0.084602
## 2 14.3 0 11.3 10.3 9.5 0.583 101.2 13 10.2 0.096 3.6 5570 19.4 0.029599
## Time Crime
## 1 26.2011 791
## 2 25.2999 1635
#Checking if NA exist in Data set
is.null(df)
## [1] FALSE
#Statistical Summary of original data
df_unistats<- basicStats(df)[c("Minimum", "Maximum", "1. Quartile", "3. Quartile", "Mean", "Median",
"Variance", "Stdev", "Skewness", "Kurtosis"), ] %>% t() %>% as.data.frame()
kable(df_unistats)
Minimum | Maximum |
|
|
Mean | Median | Variance | Stdev | Skewness | Kurtosis | |
---|---|---|---|---|---|---|---|---|---|---|
M | 11.9000 | 17.700000 | 13.000000 | 14.60000 | 13.857447 | 13.6000 | 1.579454e+00 | 1.256763 | 0.821917 | 0.377753 |
So | 0.0000 | 1.000000 | 0.000000 | 1.00000 | 0.340426 | 0.0000 | 2.294170e-01 | 0.478975 | 0.652139 | -1.607569 |
Ed | 8.7000 | 12.200000 | 9.750000 | 11.45000 | 10.563830 | 10.8000 | 1.251489e+00 | 1.118700 | -0.318987 | -1.149253 |
Po1 | 4.5000 | 16.600000 | 6.250000 | 10.45000 | 8.500000 | 7.8000 | 8.832174e+00 | 2.971897 | 0.890124 | 0.162309 |
Po2 | 4.1000 | 15.700000 | 5.850000 | 9.70000 | 8.023404 | 7.3000 | 7.818353e+00 | 2.796132 | 0.844367 | 0.008590 |
LF | 0.4800 | 0.641000 | 0.530500 | 0.59300 | 0.561191 | 0.5600 | 1.633000e-03 | 0.040412 | 0.270675 | -0.888732 |
M.F | 93.4000 | 107.100000 | 96.450000 | 99.20000 | 98.302128 | 97.7000 | 8.683256e+00 | 2.946737 | 0.993223 | 0.652010 |
Pop | 3.0000 | 168.000000 | 10.000000 | 41.50000 | 36.617021 | 25.0000 | 1.449415e+03 | 38.071188 | 1.854230 | 3.078936 |
NW | 0.2000 | 42.300000 | 2.400000 | 13.25000 | 10.112766 | 7.6000 | 1.057377e+02 | 10.282882 | 1.379966 | 1.077648 |
U1 | 0.0700 | 0.142000 | 0.080500 | 0.10400 | 0.095468 | 0.0920 | 3.250000e-04 | 0.018029 | 0.774876 | -0.131208 |
U2 | 2.0000 | 5.800000 | 2.750000 | 3.85000 | 3.397872 | 3.4000 | 7.132560e-01 | 0.844545 | 0.542264 | 0.173008 |
Wealth | 2880.0000 | 6890.000000 | 4595.000000 | 5915.00000 | 5253.829787 | 5370.0000 | 9.310502e+05 | 964.909442 | -0.381952 | -0.613169 |
Ineq | 12.6000 | 27.600000 | 16.550000 | 22.75000 | 19.400000 | 17.6000 | 1.591696e+01 | 3.989606 | 0.367063 | -1.138824 |
Prob | 0.0069 | 0.119804 | 0.032701 | 0.05445 | 0.047091 | 0.0421 | 5.170000e-04 | 0.022737 | 0.883336 | 0.749579 |
Time | 12.1996 | 44.000400 | 21.600350 | 30.45075 | 26.597921 | 25.8006 | 5.022408e+01 | 7.086895 | 0.371275 | -0.413474 |
Crime | 342.0000 | 1993.000000 | 658.500000 | 1057.50000 | 905.085106 | 831.0000 | 1.495854e+05 | 386.762697 | 1.053927 | 0.777628 |
# Visualizations via Box plots
meltData <- melt(df)
## No id variables; using all as measure variables
p <- ggplot(meltData, aes(factor(variable), value))
p + geom_boxplot() + facet_wrap(~variable, scale="free")+theme_economist()+scale_colour_economist()
#density plots of original data
density <- df %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_density()+theme_economist()+scale_colour_economist()
density
Sampling of Predictors
Certain predictors exhibit high multicollinearity issues
headers = c("Ed","Po1","U2","Ineq","Ed","Po2","LF","Wealth","Pop")
newdata <- df[headers]
correlation = cor(newdata )
corrplot(correlation, method = 'number',type='upper',bg="lightblue")
mtext("Fig1a: Correlation Matrix", at=.95, line=-15, cex=1.2)
Observation1:
pca_result<-prcomp(df[,1:15],scale=T)
names(pca_result)
## [1] "sdev" "rotation" "center" "scale" "x"
summary(pca_result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.4534 1.6739 1.4160 1.07806 0.97893 0.74377 0.56729
## Proportion of Variance 0.4013 0.1868 0.1337 0.07748 0.06389 0.03688 0.02145
## Cumulative Proportion 0.4013 0.5880 0.7217 0.79920 0.86308 0.89996 0.92142
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.55444 0.48493 0.44708 0.41915 0.35804 0.26333 0.2418
## Proportion of Variance 0.02049 0.01568 0.01333 0.01171 0.00855 0.00462 0.0039
## Cumulative Proportion 0.94191 0.95759 0.97091 0.98263 0.99117 0.99579 0.9997
## PC15
## Standard deviation 0.06793
## Proportion of Variance 0.00031
## Cumulative Proportion 1.00000
# means
pca_result$center
## M So Ed Po1 Po2 LF
## 1.385745e+01 3.404255e-01 1.056383e+01 8.500000e+00 8.023404e+00 5.611915e-01
## M.F Pop NW U1 U2 Wealth
## 9.830213e+01 3.661702e+01 1.011277e+01 9.546809e-02 3.397872e+00 5.253830e+03
## Ineq Prob Time
## 1.940000e+01 4.709138e-02 2.659792e+01
# standard deviations
pca_result$scale
## M So Ed Po1 Po2 LF
## 1.25676339 0.47897516 1.11869985 2.97189736 2.79613186 0.04041181
## M.F Pop NW U1 U2 Wealth
## 2.94673654 38.07118801 10.28288187 0.01802878 0.84454499 964.90944200
## Ineq Prob Time
## 3.98960606 0.02273697 7.08689519
pca_result$rotation <- -pca_result$rotation #removing the default direction
pca_result$rotation
## PC1 PC2 PC3 PC4 PC5
## M 0.30371194 -0.06280357 -0.1724199946 0.02035537 0.35832737
## So 0.33088129 0.15837219 -0.0155433104 -0.29247181 0.12061130
## Ed -0.33962148 -0.21461152 -0.0677396249 -0.07974375 0.02442839
## Po1 -0.30863412 0.26981761 -0.0506458161 -0.33325059 0.23527680
## Po2 -0.31099285 0.26396300 -0.0530651173 -0.35192809 0.20473383
## LF -0.17617757 -0.31943042 -0.2715301768 0.14326529 0.39407588
## M.F -0.11638221 -0.39434428 0.2031621598 -0.01048029 0.57877443
## Pop -0.11307836 0.46723456 -0.0770210971 0.03210513 0.08317034
## NW 0.29358647 0.22801119 -0.0788156621 -0.23925971 0.36079387
## U1 -0.04050137 -0.00807439 0.6590290980 0.18279096 0.13136873
## U2 -0.01812228 0.27971336 0.5785006293 0.06889312 0.13499487
## Wealth -0.37970331 0.07718862 -0.0100647664 -0.11781752 -0.01167683
## Ineq 0.36579778 0.02752240 0.0002944563 0.08066612 0.21672823
## Prob 0.25888661 -0.15831708 0.1176726436 -0.49303389 -0.16562829
## Time 0.02062867 0.38014836 -0.2235664632 0.54059002 0.14764767
## PC6 PC7 PC8 PC9 PC10 PC11
## M 0.449132706 0.15707378 0.55367691 -0.15474793 0.01443093 -0.39446657
## So 0.100500743 -0.19649727 -0.22734157 0.65599872 -0.06141452 -0.23397868
## Ed 0.008571367 0.23943629 0.14644678 0.44326978 -0.51887452 0.11821954
## Po1 0.095776709 -0.08011735 -0.04613156 -0.19425472 0.14320978 0.13042001
## Po2 0.119524780 -0.09518288 -0.03168720 -0.19512072 0.05929780 0.13885912
## LF -0.504234275 0.15931612 -0.25513777 -0.14393498 -0.03077073 -0.38532827
## M.F 0.074501901 -0.15548197 0.05507254 0.24378252 0.35323357 0.28029732
## Pop -0.547098563 -0.09046187 0.59078221 0.20244830 0.03970718 -0.05849643
## NW -0.051219538 0.31154195 -0.20432828 -0.18984178 -0.49201966 0.20695666
## U1 -0.017385981 0.17354115 0.20206312 -0.02069349 -0.22765278 0.17857891
## U2 -0.048155286 0.07526787 -0.24369650 -0.05576010 0.04750100 -0.47021842
## Wealth 0.154683104 0.14859424 -0.08630649 0.23196695 0.11219383 -0.31955631
## Ineq -0.272027031 -0.37483032 -0.07184018 0.02494384 0.01390576 0.18278697
## Prob -0.283535996 0.56159383 0.08598908 0.05306898 0.42530006 0.08978385
## Time 0.148203050 0.44199877 -0.19507812 0.23551363 0.29264326 0.26363121
## PC12 PC13 PC14 PC15
## M -0.16580189 0.05142365 -0.04901705 -0.0051398012
## So 0.05753357 0.29368483 0.29364512 -0.0084369230
## Ed -0.47786536 -0.19441949 -0.03964277 0.0280052040
## Po1 -0.22611207 0.18592255 0.09490151 0.6894155129
## Po2 -0.19088461 0.13454940 0.08259642 -0.7200270100
## LF -0.02705134 0.27742957 0.15385625 -0.0336823193
## M.F 0.23925913 -0.31624667 0.04125321 -0.0097922075
## Pop 0.18350385 -0.12651689 0.05326383 -0.0001496323
## NW 0.36671707 -0.22901695 -0.13227774 0.0370783671
## U1 0.09314897 0.59039450 0.02335942 -0.0111359325
## U2 -0.28440496 -0.43292853 0.03985736 -0.0073618948
## Wealth 0.32172821 0.14077972 -0.70031840 0.0025685109
## Ineq -0.43762828 0.12181090 -0.59279037 -0.0177570357
## Prob -0.15567100 0.03547596 -0.04761011 -0.0293376260
## Time -0.13536989 0.05738113 0.04488401 -0.0376754405
Principal Component Scores
scores: The positions of each observation in the new coordinate system of principal components
pca_result$x <- - pca_result$x
head(pca_result$x,2)
## PC1 PC2 PC3 PC4 PC5 PC6
## [1,] 4.199284 1.0938312 1.11907395 -0.67178115 -0.05528338 -0.3073383
## [2,] -1.172663 -0.6770136 0.05244634 0.08350709 1.17319982 0.5832373
## PC7 PC8 PC9 PC10 PC11 PC12
## [1,] 0.5664082 0.007801727 -0.2235099 -0.4527437 0.08474542 -0.2209664
## [2,] -0.1956112 -0.154566472 -0.4367772 -0.2120859 0.03391661 -0.3568652
## PC13 PC14 PC15
## [1,] 0.1126168 -0.3269649 -0.02338401
## [2,] -0.2975165 -0.2523567 0.06076368
variance <- (pca_result$sdev)^2
loadings <- pca_result$rotation
rownames(loadings) <- colnames(df[,1:15])
scores <- pca_result$x
#setting up plot variables
varPercent <- variance/sum(variance) * 100
par(bg = 'lightgrey')
barplot(varPercent, xlab='PC', ylab='Percent Variance',names.arg=1:length(varPercent), las=1, col='cyan', main="Fig1b: Scree Plot")
grid (lty = 3, col = "white")
box( col = 'black')
abline(h=1/ncol(df[,1:15])*100, col='red')
round(loadings, 2)[ , 1:5]
## PC1 PC2 PC3 PC4 PC5
## M 0.30 -0.06 -0.17 0.02 0.36
## So 0.33 0.16 -0.02 -0.29 0.12
## Ed -0.34 -0.21 -0.07 -0.08 0.02
## Po1 -0.31 0.27 -0.05 -0.33 0.24
## Po2 -0.31 0.26 -0.05 -0.35 0.20
## LF -0.18 -0.32 -0.27 0.14 0.39
## M.F -0.12 -0.39 0.20 -0.01 0.58
## Pop -0.11 0.47 -0.08 0.03 0.08
## NW 0.29 0.23 -0.08 -0.24 0.36
## U1 -0.04 -0.01 0.66 0.18 0.13
## U2 -0.02 0.28 0.58 0.07 0.13
## Wealth -0.38 0.08 -0.01 -0.12 -0.01
## Ineq 0.37 0.03 0.00 0.08 0.22
## Prob 0.26 -0.16 0.12 -0.49 -0.17
## Time 0.02 0.38 -0.22 0.54 0.15
Observation2:
From loadings table above and focusing only on PC1 (as its the most important PC):
PC<-pca_result$x[,1:5]
uscrimePC<-cbind(PC,df[,16])
pca.model<- lm( V6 ~., data = as.data.frame(uscrimePC))
summary(pca.model)
##
## Call:
## lm(formula = V6 ~ ., data = as.data.frame(uscrimePC))
##
## Residuals:
## Min 1Q Median 3Q Max
## -420.79 -185.01 12.21 146.24 447.86
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 905.09 35.59 25.428 < 2e-16 ***
## PC1 -65.22 14.67 -4.447 6.51e-05 ***
## PC2 70.08 21.49 3.261 0.00224 **
## PC3 -25.19 25.41 -0.992 0.32725
## PC4 -69.45 33.37 -2.081 0.04374 *
## PC5 229.04 36.75 6.232 2.02e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 244 on 41 degrees of freedom
## Multiple R-squared: 0.6452, Adjusted R-squared: 0.6019
## F-statistic: 14.91 on 5 and 41 DF, p-value: 2.446e-08
Parameters of PCA
Transformation to original predictors (variables)
Scaled.intercept <- pca.model$coefficients[1]#intercept from scaled PCA regression
Scaled.intercept
## (Intercept)
## 905.0851
Scaled.Coefficients <- pca.model$coefficients[2:6]#slopes from scaled PCA regression
Scaled.Coefficients
## PC1 PC2 PC3 PC4 PC5
## -65.21593 70.08312 -25.19408 -69.44603 229.04282
#per lecture slides: Implied regression coefficients for xj from PCA
a<-pca_result$rotation[,1:5]%*%Scaled.Coefficients
t(a)
## M So Ed Po1 Po2 LF M.F Pop
## [1,] 60.79435 37.84824 19.94776 117.3449 111.4508 76.2549 108.1266 58.88024
## NW U1 U2 Wealth Ineq Prob Time
## [1,] 98.07179 2.866783 32.34551 35.93336 22.1037 -34.64026 27.20502
-Unscaling the Coefficients & Intercept
A.unscaled<- a/pca_result$scale # unscaled slopes
A.unscaled
## [,1]
## M 4.837374e+01
## So 7.901922e+01
## Ed 1.783120e+01
## Po1 3.948484e+01
## Po2 3.985892e+01
## LF 1.886946e+03
## M.F 3.669366e+01
## Pop 1.546583e+00
## NW 9.537384e+00
## U1 1.590115e+02
## U2 3.829933e+01
## Wealth 3.724014e-02
## Ineq 5.540321e+00
## Prob -1.523521e+03
## Time 3.838779e+00
Intercept.unscaled<- Scaled.intercept-sum(a*pca_result$center/pca_result$scale)#unscaled intercept
Intercept.unscaled
## (Intercept)
## -5933.837
set.seed(123)
#fit the base regression model
base.model <- lm(Crime ~. ,data = df)
#view the output of the base regression model
summary(base.model)
##
## Call:
## lm(formula = Crime ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -395.74 -98.09 -6.69 112.99 512.67
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.984e+03 1.628e+03 -3.675 0.000893 ***
## M 8.783e+01 4.171e+01 2.106 0.043443 *
## So -3.803e+00 1.488e+02 -0.026 0.979765
## Ed 1.883e+02 6.209e+01 3.033 0.004861 **
## Po1 1.928e+02 1.061e+02 1.817 0.078892 .
## Po2 -1.094e+02 1.175e+02 -0.931 0.358830
## LF -6.638e+02 1.470e+03 -0.452 0.654654
## M.F 1.741e+01 2.035e+01 0.855 0.398995
## Pop -7.330e-01 1.290e+00 -0.568 0.573845
## NW 4.204e+00 6.481e+00 0.649 0.521279
## U1 -5.827e+03 4.210e+03 -1.384 0.176238
## U2 1.678e+02 8.234e+01 2.038 0.050161 .
## Wealth 9.617e-02 1.037e-01 0.928 0.360754
## Ineq 7.067e+01 2.272e+01 3.111 0.003983 **
## Prob -4.855e+03 2.272e+03 -2.137 0.040627 *
## Time -3.479e+00 7.165e+00 -0.486 0.630708
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 209.1 on 31 degrees of freedom
## Multiple R-squared: 0.8031, Adjusted R-squared: 0.7078
## F-statistic: 8.429 on 15 and 31 DF, p-value: 3.539e-07
#(RMSE) of the base linear model
print(sprintf("R-squared of Base Model = %0.3f", summary(base.model)$r.squared ))
## [1] "R-squared of Base Model = 0.803"
print(sprintf("RMSE of Base Model = %0.2f", sigma(base.model) ))
## [1] "RMSE of Base Model = 209.06"
set.seed(123)
model2 <- lm( Crime ~ M + Ed + Po1 + U2 + Ineq + Prob, data = df)#reduced predictors with only significant p-values
c <- cv.lm(df,model2,m=5) #5 fold
## Analysis of Variance Table
##
## Response: Crime
## Df Sum Sq Mean Sq F value Pr(>F)
## M 1 55084 55084 1.37 0.24914
## Ed 1 725967 725967 18.02 0.00013 ***
## Po1 1 3173852 3173852 78.80 5.3e-11 ***
## U2 1 217386 217386 5.40 0.02534 *
## Ineq 1 848273 848273 21.06 4.3e-05 ***
## Prob 1 249308 249308 6.19 0.01711 *
## Residuals 40 1611057 40276
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## fold 1
## Observations in test set: 9
## 1 3 17 18 19 22 36 38 40
## Predicted 810.83 386 527.4 800 1221 728 1102 544.4 1140.8
## cvpred 785.36 345 492.2 701 1240 702 1127 544.7 1168.2
## Crime 791.00 578 539.0 929 750 439 1272 566.0 1151.0
## CV residual 5.64 233 46.8 228 -490 -263 145 21.3 -17.2
##
## Sum of squares = 439507 Mean square = 48834 n = 9
##
## fold 2
## Observations in test set: 10
## 4 6 12 25 28 32 34 41 44 46
## Predicted 1897.2 730.3 673 579.1 1259.0 774 997.5 796 1178 748
## cvpred 1882.7 781.8 684 621.4 1238.3 788 1013.9 778 1159 808
## Crime 1969.0 682.0 849 523.0 1216.0 754 923.0 880 1030 508
## CV residual 86.3 -99.8 165 -98.4 -22.3 -34 -90.9 102 -129 -300
##
## Sum of squares = 181038 Mean square = 18104 n = 10
##
## fold 3
## Observations in test set: 10
## 5 8 9 11 15 23 37 39 43 47
## Predicted 1269.8 1354 719 1118 828.3 938 992 787 1017 976
## cvpred 1266.8 1243 724 946 826.3 754 1077 717 1080 1038
## Crime 1234.0 1555 856 1674 798.0 1216 831 826 823 849
## CV residual -32.8 312 132 728 -28.3 462 -246 109 -257 -189
##
## Sum of squares = 1033612 Mean square = 103361 n = 10
##
## fold 4
## Observations in test set: 9
## 7 13 14 20 24 27 30 35 45
## Predicted 733 739 713.6 1203.0 919.4 312.2 668.0 808 622
## cvpred 760 770 730.1 1247.9 953.7 297.2 638.9 851 691
## Crime 963 511 664.0 1225.0 968.0 342.0 696.0 653 455
## CV residual 203 -259 -66.1 -22.9 14.3 44.8 57.1 -198 -236
##
## Sum of squares = 213398 Mean square = 23711 n = 9
##
## fold 5
## Observations in test set: 9
## 2 10 16 21 26 29 31 33 42
## Predicted 1388 787.3 1004 783.3 1789 1495 440.4 874 369
## cvpred 1356 723.7 1047 819.7 1795 1664 456.6 858 261
## Crime 1635 705.0 946 742.0 1993 1043 373.0 1072 542
## CV residual 279 -18.7 -101 -77.7 198 -621 -83.6 214 281
##
## Sum of squares = 650990 Mean square = 72332 n = 9
##
## Overall (Sum over all 9 folds)
## ms
## 53586
SStot <- sum((df$Crime - mean(df$Crime))^2)
SSres_model2 <- sum(model2$residuals^2)
SSres_c <- attr(c,"ms")*nrow(df)
rsquare.cv<-1 - SSres_c/SStot # cross-validated
print(sprintf("R-squared of CV Model = %0.3f", rsquare.cv))
## [1] "R-squared of CV Model = 0.634"
print(sprintf("RMSE of CV Model = %0.3f", sqrt(53586 )))
## [1] "RMSE of CV Model = 231.487"
y.pred<-Intercept.unscaled+as.matrix(df[,1:15])%*%A.unscaled
#Calculating accuracy measures
rss <- sum((y.pred - df[,16]) ^ 2) ## residual sum of squares
tss <- sum((df[,16] - mean(df[,16])) ^ 2) ## total sum of squares
rsq <- 1 - rss/tss
print(sprintf("R-squared of PCA linear regression = %0.2f", rsq ))
## [1] "R-squared of PCA linear regression = 0.65"
#calculate RMSE
rmse.pca<-sqrt(mean((df[,16]-y.pred)^2))
print(sprintf("RMSE of PCA linear regression = %0.2f", rmse.pca))
## [1] "RMSE of PCA linear regression = 227.91"
M = 14.0,So = 0,Ed = 10.0,Po1 = 12.0,Po2 = 15.5,LF = 0.640,M.F = 94.0,Pop = 150,NW = 1.1,U1 = 0.120,U2 = 3.6
Wealth = 3200,Ineq = 20.1,Prob = 0.04 & Time = 39.0
#Given test points from HW8.2
testpts <-data.frame(M = 14.0,So = 0, Ed = 10.0, Po1 = 12.0, Po2 = 15.5,LF = 0.640, M.F = 94.0, Pop = 150, NW = 1.1, U1 = 0.120, U2 = 3.6, Wealth = 3200, Ineq = 20.1, Prob = 0.040,Time = 39.0)
#Predict the crime rate for test data point
pred.pca <- Intercept.unscaled+as.matrix(testpts)%*%A.unscaled
print(sprintf("Crime Prediction using PCA linear regression = %0.0f", pred.pca))
## [1] "Crime Prediction using PCA linear regression = 1389"
- Following hints from HW8.2, we knew there were excess parameters or factors for such a small sample size, leading to overfitting. In this exercise, we used PCA as a tool to reduce dimensions and simultaneously reducing multicollinearity
- To produce a less-overfitting and more robust linear model, we used the following items as our quality guide:
(1) Scree Plot to systematically tell us which Principal Components explained the most variance in the reduced dimensional space; five principal components were discovered
(2) Loadings Table showed which variables having high loadings (positive or negative) on each principal component, that is, which variables contribute most strongly to each PC. Examining this table can give you a good sense of what each principal
component represents, in terms of the original data
Based on 5 PCs:
(a) PCA linear regression equation (scaled): Y = 905.09-65.22PC1+70.08PC2-25.19PC3-69.4PC4+PC3PC5
Base on original variables:
(b) PCA linear regression equation (unscaled):
Y = -5933.837+4.837374e+01M+7.901922e+01So+1.783120e+3.948484e+01Po1+3.985892e+01Po2+1.886946e+03LF+3.669366e+01M.F+1.546583e+00Pop+9.537384e+00NW+1.590115e+02U1+3.829933e+01U2+3.724014e-02Wealth+5.540321e+00Ineq-1.523521e+03Prob+3.838779e+00Time
- Quality comparisions between base model and cross-validated model from HW8.2 showed the following:
(1) Not surprisingly, PCA's quality based on R-Squared was lower than base model but pretty much in-line with CV linear model. HW8.2's base model was grossly-overfitted while CV's reduced predictors were similar to PCA's linear regression
Note: PCA's linear regression equation were derived from 5 reduced PC's coeffients which was then transformed back to the original 15 variables (dimensions). In other words, we travelled full circle; reduced dimensions to 5 PCs to remove multicollinearities and discovered 5 PCs that explained the most variances along the way. Unscaled and transformed a reduced PCA linear regression back to original 15-dimensioned variables linear regression for comparision and prediction purposes
- Finally, the new PCA linear equation was used to predict the new city's crime rate which was 1389. How accurate is this prediction is anyone's guess? But it appeared reasonable as the predicted value is not out of bounds of the predictors