setwd("~/Documents/Grad School Analytics/Spring 2018/ISYE 6501 - Analytics Modeling/Homework 6")
library(DAAG)
rm(list = ls())
set.seed(1)
dat <- read.table("uscrime.txt",stringsAsFactors = FALSE, header = TRUE)
my.prc <- prcomp(dat[,-16], center=T, scale=T)
#lets view a summary of the principal components
summary(my.prc)
Importance of components:
                         PC1   PC2   PC3    PC4    PC5    PC6    PC7    PC8    PC9   PC10
Standard deviation     2.453 1.674 1.416 1.0781 0.9789 0.7438 0.5673 0.5544 0.4849 0.4471
Proportion of Variance 0.401 0.187 0.134 0.0775 0.0639 0.0369 0.0215 0.0205 0.0157 0.0133
Cumulative Proportion  0.401 0.588 0.722 0.7992 0.8631 0.9000 0.9214 0.9419 0.9576 0.9709
                         PC11    PC12    PC13   PC14    PC15
Standard deviation     0.4191 0.35804 0.26333 0.2418 0.06793
Proportion of Variance 0.0117 0.00855 0.00462 0.0039 0.00031
Cumulative Proportion  0.9826 0.99117 0.99579 0.9997 1.00000
#The summary ranks the proportion of variance of each principal component. #PCs 1-3 have a significant amount of variation relative to the rest. Shown in the plot below. 
#get back eigenvalues, square the standard deviation 
var <- my.prc$sdev^2
#get proportional variance by dividing each eigenvalue by the sum of eigenvalues
propvar <- var/sum(var)
plot(propvar,
     xlab = "Principal Component",
     ylab = "Proportion of Variance",
     ylim = c(0,1) , type= "b")

#Determine which PC variables are import. Kaiser method suggests any stdev greater than
#one is important. 
screeplot(my.prc,main = "Scree Plot", type = "line")
abline(h=1, col="red")

#based on this technique, we would choose to use the first 5 PCs in our model, but lets dig further

r2 <- numeric(15)
r2cross <- numeric(15)

#we will run a loop to calculate R-squared and 5 fold Cross validated R-squared values of a PCA model
#using all PCs from 1 to 15. we will plot the results, evaluate, and then chooose a number of
#PCs we think will give our most robust and accurate model
for (i in 1:15){
  PClist <- my.prc$x[,1:i]
  pcc <- cbind(dat[,16],PClist)
  #calculate R-squared, store in list
  model <- lm(V1~., data = as.data.frame(pcc))
  r2[i] <- 1 -sum(model$residuals^2)/sum((dat$Crime - mean(dat$Crime))^2)
   #calculate cross validated R-squared, store in list
  par(mfrow = c(3,5))
  c <- cv.lm(as.data.frame(pcc), model, m = 5, plotit = TRUE, printit = FALSE)
  r2cross[i] <- 1 - attr(c,"ms")*nrow(dat) / sum((dat$Crime - mean(dat$Crime))^2)
}
#plot the results
plot(r2,xlab = "Principal Component",ylab = "R^2 at x Principal Components", ylim = c(0,1), type = "b", col = "blue")

plot(r2cross,xlab = "Principal Component",ylab = "Cross-Validation R^2 at x Principal Components", ylim = c(0,1), type = "b", col = "blue")

#store R-squared data in a table
tab <- data.frame(r2, r2cross, c(1:length(r2)))
tab
#the results indicate using 5 or 14 PCs will provide the most accurate and highest quality model.
#the whole point of this assignment is to use less variables and decrease the complexity of the
#model. We will use 5 prinicipal components
#number of PCs we want to test = k
k  = 5
#we now combine PCs 1:k with the crime data from our original data set
PCcrime <- cbind(my.prc$x[,1:k],dat[,16])
#using PCs combined with crime data, we create a linear regression model
#The advantage of doing this is to reduce the complexity of the model
#while also making it more robust
model <- lm(V6~., data = as.data.frame(PCcrime))
summary(model)

Call:
lm(formula = V6 ~ ., data = as.data.frame(PCcrime))

Residuals:
   Min     1Q Median     3Q    Max 
-420.8 -185.0   12.2  146.2  447.9 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)    905.1       35.6   25.43  < 2e-16 ***
PC1             65.2       14.7    4.45  6.5e-05 ***
PC2            -70.1       21.5   -3.26   0.0022 ** 
PC3             25.2       25.4    0.99   0.3272    
PC4             69.4       33.4    2.08   0.0437 *  
PC5           -229.0       36.8   -6.23  2.0e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 244 on 41 degrees of freedom
Multiple R-squared:  0.645, Adjusted R-squared:  0.602 
F-statistic: 14.9 on 5 and 41 DF,  p-value: 2.45e-08
#now to do our transformation, we first need our intercept
beta0 <- model$coefficients[1]
#below we pull out our model coefficients, and make the Beta vector
betas <- model$coefficients[2:(k+1)]
#now multply the coefficients by our rotated matrix, A to create alpha vector
alpha <- my.prc$rotation[,1:k] %*% betas
#we recover our original alpha values by dividing the alpha vector by sigma
#and our original beta by subtracting from the intercept the sum of (alpha*mu)/sigma
mu <- sapply(dat[,1:15],mean)
sigma <- sapply(dat[,1:15],sd)
origAlpha <- alpha/sigma
origBeta0 <- beta0 - sum(alpha*mu /sigma)
#estimates now gives us our model Y = aX + b
#where a is our scaled alpha and b is our original intercept
estimates <- as.matrix(dat[,1:15]) %*% origAlpha + origBeta0
#we can now use our estimates to calculate the R-squared values 
#to observe the accuracy of our model
SSE = sum((estimates - dat[,16])^2)
SStot = sum((dat[,16] - mean(dat[,16]))^2)
R2 <- 1 - SSE/SStot
R2
[1] 0.645
R2_adjust <- R2 - (1-R2)*k/(nrow(dat)-k-1)
R2_adjust
[1] 0.602
#now we will use the new_city data given from last week to see what 
#our improved model predicts the crime rate to be
new_city <- data.frame(M= 14.0, So = 0, Ed = 10.0, Po1 = 12.0, Po2 = 15.5,
                    LF = 0.640, M.F = 94.0, Pop = 150, NW = 1.1, U1 = 0.120, U2 = 3.6, Wealth = 3200, Ineq = 20.1, Prob = 0.040,Time = 39.0)
#first we apply the PCA data onto the new city data so we can apply our model
pred_df <- data.frame(predict(my.prc, new_city)) 
#now predict the Crime rate using Principal components and new city data
pred <- predict(model, pred_df)
pred #1389 this value makes sense relative to the other Crime values
   1 
1389 

relative to last weeks prediction of 1304 and an R-squared of .671, this model seems slightly less sufficient at prescribing values. But this was only a small test to compare, and we observed that with significantly less predictors, a PCA model can deliver nearly the same accuracy.

LS0tCnRpdGxlOiAiSG9tZXdvcmsgNiAtIElTWUUgNjUwMSAtIFByaW5jaXBhbCBDb21wb25lbnQgQW5hbHlzaXMgKFBDQSkgLSBNQlMgMi8xOC8xOCIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CnNldHdkKCJ+L0RvY3VtZW50cy9HcmFkIFNjaG9vbCBBbmFseXRpY3MvU3ByaW5nIDIwMTgvSVNZRSA2NTAxIC0gQW5hbHl0aWNzIE1vZGVsaW5nL0hvbWV3b3JrIDYiKQpsaWJyYXJ5KERBQUcpCnJtKGxpc3QgPSBscygpKQpzZXQuc2VlZCgxKQpkYXQgPC0gcmVhZC50YWJsZSgidXNjcmltZS50eHQiLHN0cmluZ3NBc0ZhY3RvcnMgPSBGQUxTRSwgaGVhZGVyID0gVFJVRSkKCm15LnByYyA8LSBwcmNvbXAoZGF0WywtMTZdLCBjZW50ZXI9VCwgc2NhbGU9VCkKI2xldHMgdmlldyBhIHN1bW1hcnkgb2YgdGhlIHByaW5jaXBhbCBjb21wb25lbnRzCnN1bW1hcnkobXkucHJjKQojVGhlIHN1bW1hcnkgcmFua3MgdGhlIHByb3BvcnRpb24gb2YgdmFyaWFuY2Ugb2YgZWFjaCBwcmluY2lwYWwgY29tcG9uZW50LiAjUENzIDEtMyBoYXZlIGEgc2lnbmlmaWNhbnQgYW1vdW50IG9mIHZhcmlhdGlvbiByZWxhdGl2ZSB0byB0aGUgcmVzdC4gU2hvd24gaW4gdGhlIHBsb3QgYmVsb3cuIAoKI2dldCBiYWNrIGVpZ2VudmFsdWVzLCBzcXVhcmUgdGhlIHN0YW5kYXJkIGRldmlhdGlvbiAKdmFyIDwtIG15LnByYyRzZGV2XjIKI2dldCBwcm9wb3J0aW9uYWwgdmFyaWFuY2UgYnkgZGl2aWRpbmcgZWFjaCBlaWdlbnZhbHVlIGJ5IHRoZSBzdW0gb2YgZWlnZW52YWx1ZXMKcHJvcHZhciA8LSB2YXIvc3VtKHZhcikKcGxvdChwcm9wdmFyLAogICAgIHhsYWIgPSAiUHJpbmNpcGFsIENvbXBvbmVudCIsCiAgICAgeWxhYiA9ICJQcm9wb3J0aW9uIG9mIFZhcmlhbmNlIiwKICAgICB5bGltID0gYygwLDEpICwgdHlwZT0gImIiKQoKYGBgCgoKYGBge3J9CiNEZXRlcm1pbmUgd2hpY2ggUEMgdmFyaWFibGVzIGFyZSBpbXBvcnQuIEthaXNlciBtZXRob2Qgc3VnZ2VzdHMgYW55IHN0ZGV2IGdyZWF0ZXIgdGhhbgojb25lIGlzIGltcG9ydGFudC4gCnNjcmVlcGxvdChteS5wcmMsbWFpbiA9ICJTY3JlZSBQbG90IiwgdHlwZSA9ICJsaW5lIikKYWJsaW5lKGg9MSwgY29sPSJyZWQiKQojYmFzZWQgb24gdGhpcyB0ZWNobmlxdWUsIHdlIHdvdWxkIGNob29zZSB0byB1c2UgdGhlIGZpcnN0IDUgUENzIGluIG91ciBtb2RlbCwgYnV0IGxldHMgZGlnIGZ1cnRoZXIKYGBgCgpgYGB7cn0KCnIyIDwtIG51bWVyaWMoMTUpCnIyY3Jvc3MgPC0gbnVtZXJpYygxNSkKCiN3ZSB3aWxsIHJ1biBhIGxvb3AgdG8gY2FsY3VsYXRlIFItc3F1YXJlZCBhbmQgNSBmb2xkIENyb3NzIHZhbGlkYXRlZCBSLXNxdWFyZWQgdmFsdWVzIG9mIGEgUENBIG1vZGVsCiN1c2luZyBhbGwgUENzIGZyb20gMSB0byAxNS4gd2Ugd2lsbCBwbG90IHRoZSByZXN1bHRzLCBldmFsdWF0ZSwgYW5kIHRoZW4gY2hvb29zZSBhIG51bWJlciBvZgojUENzIHdlIHRoaW5rIHdpbGwgZ2l2ZSBvdXIgbW9zdCByb2J1c3QgYW5kIGFjY3VyYXRlIG1vZGVsCmZvciAoaSBpbiAxOjE1KXsKICBQQ2xpc3QgPC0gbXkucHJjJHhbLDE6aV0KICBwY2MgPC0gY2JpbmQoZGF0WywxNl0sUENsaXN0KQogICNjYWxjdWxhdGUgUi1zcXVhcmVkLCBzdG9yZSBpbiBsaXN0CiAgbW9kZWwgPC0gbG0oVjF+LiwgZGF0YSA9IGFzLmRhdGEuZnJhbWUocGNjKSkKICByMltpXSA8LSAxIC1zdW0obW9kZWwkcmVzaWR1YWxzXjIpL3N1bSgoZGF0JENyaW1lIC0gbWVhbihkYXQkQ3JpbWUpKV4yKQogICAjY2FsY3VsYXRlIGNyb3NzIHZhbGlkYXRlZCBSLXNxdWFyZWQsIHN0b3JlIGluIGxpc3QKICBwYXIobWZyb3cgPSBjKDMsNSkpCiAgYyA8LSBjdi5sbShhcy5kYXRhLmZyYW1lKHBjYyksIG1vZGVsLCBtID0gNSwgcGxvdGl0ID0gVFJVRSwgcHJpbnRpdCA9IEZBTFNFKQogIHIyY3Jvc3NbaV0gPC0gMSAtIGF0dHIoYywibXMiKSpucm93KGRhdCkgLyBzdW0oKGRhdCRDcmltZSAtIG1lYW4oZGF0JENyaW1lKSleMikKfQoKCmBgYApgYGB7cn0KI3Bsb3QgdGhlIHJlc3VsdHMKcGxvdChyMix4bGFiID0gIlByaW5jaXBhbCBDb21wb25lbnQiLHlsYWIgPSAiUl4yIGF0IHggUHJpbmNpcGFsIENvbXBvbmVudHMiLCB5bGltID0gYygwLDEpLCB0eXBlID0gImIiLCBjb2wgPSAiYmx1ZSIpCgpwbG90KHIyY3Jvc3MseGxhYiA9ICJQcmluY2lwYWwgQ29tcG9uZW50Iix5bGFiID0gIkNyb3NzLVZhbGlkYXRpb24gUl4yIGF0IHggUHJpbmNpcGFsIENvbXBvbmVudHMiLCB5bGltID0gYygwLDEpLCB0eXBlID0gImIiLCBjb2wgPSAiYmx1ZSIpCgojc3RvcmUgUi1zcXVhcmVkIGRhdGEgaW4gYSB0YWJsZQp0YWIgPC0gZGF0YS5mcmFtZShyMiwgcjJjcm9zcywgYygxOmxlbmd0aChyMikpKQp0YWIKCgojdGhlIHJlc3VsdHMgaW5kaWNhdGUgdXNpbmcgNSBvciAxNCBQQ3Mgd2lsbCBwcm92aWRlIHRoZSBtb3N0IGFjY3VyYXRlIGFuZCBoaWdoZXN0IHF1YWxpdHkgbW9kZWwuCiN0aGUgd2hvbGUgcG9pbnQgb2YgdGhpcyBhc3NpZ25tZW50IGlzIHRvIHVzZSBsZXNzIHZhcmlhYmxlcyBhbmQgZGVjcmVhc2UgdGhlIGNvbXBsZXhpdHkgb2YgdGhlCiNtb2RlbC4gV2Ugd2lsbCB1c2UgNSBwcmluaWNpcGFsIGNvbXBvbmVudHMKYGBgCgoKCgpgYGB7cn0KCiNudW1iZXIgb2YgUENzIHdlIHdhbnQgdG8gdGVzdCA9IGsKayAgPSA1Cgojd2Ugbm93IGNvbWJpbmUgUENzIDE6ayB3aXRoIHRoZSBjcmltZSBkYXRhIGZyb20gb3VyIG9yaWdpbmFsIGRhdGEgc2V0ClBDY3JpbWUgPC0gY2JpbmQobXkucHJjJHhbLDE6a10sZGF0WywxNl0pCiN1c2luZyBQQ3MgY29tYmluZWQgd2l0aCBjcmltZSBkYXRhLCB3ZSBjcmVhdGUgYSBsaW5lYXIgcmVncmVzc2lvbiBtb2RlbAojVGhlIGFkdmFudGFnZSBvZiBkb2luZyB0aGlzIGlzIHRvIHJlZHVjZSB0aGUgY29tcGxleGl0eSBvZiB0aGUgbW9kZWwKI3doaWxlIGFsc28gbWFraW5nIGl0IG1vcmUgcm9idXN0Cm1vZGVsIDwtIGxtKFY2fi4sIGRhdGEgPSBhcy5kYXRhLmZyYW1lKFBDY3JpbWUpKQpzdW1tYXJ5KG1vZGVsKQoKI25vdyB0byBkbyBvdXIgdHJhbnNmb3JtYXRpb24sIHdlIGZpcnN0IG5lZWQgb3VyIGludGVyY2VwdApiZXRhMCA8LSBtb2RlbCRjb2VmZmljaWVudHNbMV0KI2JlbG93IHdlIHB1bGwgb3V0IG91ciBtb2RlbCBjb2VmZmljaWVudHMsIGFuZCBtYWtlIHRoZSBCZXRhIHZlY3RvcgpiZXRhcyA8LSBtb2RlbCRjb2VmZmljaWVudHNbMjooaysxKV0KCiNub3cgbXVsdHBseSB0aGUgY29lZmZpY2llbnRzIGJ5IG91ciByb3RhdGVkIG1hdHJpeCwgQSB0byBjcmVhdGUgYWxwaGEgdmVjdG9yCmFscGhhIDwtIG15LnByYyRyb3RhdGlvblssMTprXSAlKiUgYmV0YXMKCiN3ZSByZWNvdmVyIG91ciBvcmlnaW5hbCBhbHBoYSB2YWx1ZXMgYnkgZGl2aWRpbmcgdGhlIGFscGhhIHZlY3RvciBieSBzaWdtYQojYW5kIG91ciBvcmlnaW5hbCBiZXRhIGJ5IHN1YnRyYWN0aW5nIGZyb20gdGhlIGludGVyY2VwdCB0aGUgc3VtIG9mIChhbHBoYSptdSkvc2lnbWEKbXUgPC0gc2FwcGx5KGRhdFssMToxNV0sbWVhbikKc2lnbWEgPC0gc2FwcGx5KGRhdFssMToxNV0sc2QpCgpvcmlnQWxwaGEgPC0gYWxwaGEvc2lnbWEKb3JpZ0JldGEwIDwtIGJldGEwIC0gc3VtKGFscGhhKm11IC9zaWdtYSkKCiNlc3RpbWF0ZXMgbm93IGdpdmVzIHVzIG91ciBtb2RlbCBZID0gYVggKyBiCiN3aGVyZSBhIGlzIG91ciBzY2FsZWQgYWxwaGEgYW5kIGIgaXMgb3VyIG9yaWdpbmFsIGludGVyY2VwdAplc3RpbWF0ZXMgPC0gYXMubWF0cml4KGRhdFssMToxNV0pICUqJSBvcmlnQWxwaGEgKyBvcmlnQmV0YTAKCiN3ZSBjYW4gbm93IHVzZSBvdXIgZXN0aW1hdGVzIHRvIGNhbGN1bGF0ZSB0aGUgUi1zcXVhcmVkIHZhbHVlcyAKI3RvIG9ic2VydmUgdGhlIGFjY3VyYWN5IG9mIG91ciBtb2RlbApTU0UgPSBzdW0oKGVzdGltYXRlcyAtIGRhdFssMTZdKV4yKQpTU3RvdCA9IHN1bSgoZGF0WywxNl0gLSBtZWFuKGRhdFssMTZdKSleMikKCgpSMiA8LSAxIC0gU1NFL1NTdG90ClIyClIyX2FkanVzdCA8LSBSMiAtICgxLVIyKSprLyhucm93KGRhdCktay0xKQpSMl9hZGp1c3QKCgpgYGAKCmBgYHtyfQojbm93IHdlIHdpbGwgdXNlIHRoZSBuZXdfY2l0eSBkYXRhIGdpdmVuIGZyb20gbGFzdCB3ZWVrIHRvIHNlZSB3aGF0IAojb3VyIGltcHJvdmVkIG1vZGVsIHByZWRpY3RzIHRoZSBjcmltZSByYXRlIHRvIGJlCm5ld19jaXR5IDwtIGRhdGEuZnJhbWUoTT0gMTQuMCwgU28gPSAwLCBFZCA9IDEwLjAsIFBvMSA9IDEyLjAsIFBvMiA9IDE1LjUsCiAgICAgICAgICAgICAgICAgICAgTEYgPSAwLjY0MCwgTS5GID0gOTQuMCwgUG9wID0gMTUwLCBOVyA9IDEuMSwgVTEgPSAwLjEyMCwgVTIgPSAzLjYsIFdlYWx0aCA9IDMyMDAsIEluZXEgPSAyMC4xLCBQcm9iID0gMC4wNDAsVGltZSA9IDM5LjApCgojZmlyc3Qgd2UgYXBwbHkgdGhlIFBDQSBkYXRhIG9udG8gdGhlIG5ldyBjaXR5IGRhdGEgc28gd2UgY2FuIGFwcGx5IG91ciBtb2RlbApwcmVkX2RmIDwtIGRhdGEuZnJhbWUocHJlZGljdChteS5wcmMsIG5ld19jaXR5KSkgCiNub3cgcHJlZGljdCB0aGUgQ3JpbWUgcmF0ZSB1c2luZyBQcmluY2lwYWwgY29tcG9uZW50cyBhbmQgbmV3IGNpdHkgZGF0YQpwcmVkIDwtIHByZWRpY3QobW9kZWwsIHByZWRfZGYpCnByZWQgIzEzODkgdGhpcyB2YWx1ZSBtYWtlcyBzZW5zZSByZWxhdGl2ZSB0byB0aGUgb3RoZXIgQ3JpbWUgdmFsdWVzCgoKYGBgCgojcmVsYXRpdmUgdG8gbGFzdCB3ZWVrcyBwcmVkaWN0aW9uIG9mIDEzMDQgYW5kIGFuIFItc3F1YXJlZCBvZiAuNjcxLCB0aGlzIG1vZGVsIHNlZW1zIHNsaWdodGx5IGxlc3Mgc3VmZmljaWVudCBhdCBwcmVzY3JpYmluZyB2YWx1ZXMuIEJ1dCB0aGlzIHdhcyBvbmx5IGEgc21hbGwgdGVzdCB0byBjb21wYXJlLCBhbmQgd2Ugb2JzZXJ2ZWQgdGhhdCB3aXRoIHNpZ25pZmljYW50bHkgbGVzcyBwcmVkaWN0b3JzLCBhIFBDQSBtb2RlbCBjYW4gZGVsaXZlciBuZWFybHkgdGhlIHNhbWUgYWNjdXJhY3kuIAoKCg==