# cleaning the working memory and load the data
rm(list=ls())
data("USArrests")
summary(USArrests)
## Murder Assault UrbanPop Rape
## Min. : 0.800 Min. : 45.0 Min. :32.00 Min. : 7.30
## 1st Qu.: 4.075 1st Qu.:109.0 1st Qu.:54.50 1st Qu.:15.07
## Median : 7.250 Median :159.0 Median :66.00 Median :20.10
## Mean : 7.788 Mean :170.8 Mean :65.54 Mean :21.23
## 3rd Qu.:11.250 3rd Qu.:249.0 3rd Qu.:77.75 3rd Qu.:26.18
## Max. :17.400 Max. :337.0 Max. :91.00 Max. :46.00
# explore the distribution of each variable
library(ggplot2)
for (var in names(USArrests)) {
plt <- ggplot(aes(x = USArrests[, var]), data = USArrests) +
geom_histogram(bins = 15) +
labs(x = var)
print(plt)
}




# do the log transformation to Rape variable
USArrests$logRape <- log(USArrests$Rape)
USArrests$Rape <- NULL
summary(USArrests)
## Murder Assault UrbanPop logRape
## Min. : 0.800 Min. : 45.0 Min. :32.00 Min. :1.988
## 1st Qu.: 4.075 1st Qu.:109.0 1st Qu.:54.50 1st Qu.:2.713
## Median : 7.250 Median :159.0 Median :66.00 Median :3.001
## Mean : 7.788 Mean :170.8 Mean :65.54 Mean :2.959
## 3rd Qu.:11.250 3rd Qu.:249.0 3rd Qu.:77.75 3rd Qu.:3.265
## Max. :17.400 Max. :337.0 Max. :91.00 Max. :3.829
# conduct the PCA
pcs <- prcomp(~Murder + Assault + UrbanPop + logRape, data=USArrests, scale. = T)
summary(pcs)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.5940 0.9930 0.54644 0.41769
## Proportion of Variance 0.6352 0.2465 0.07465 0.04362
## Cumulative Proportion 0.6352 0.8817 0.95638 1.00000
# biplot the first 2 PCs
biplot(pcs, scale = 0, cex = 0.6)

# for comparison, build the pcs without scaling
pcs.unscaled <- prcomp(~Murder + Assault + UrbanPop + logRape, data = USArrests, scale. = F)
pcs.unscaled
## Standard deviations (1, .., p=4):
## [1] 83.497812 13.982885 2.523684 0.291433
##
## Rotation (n x k) = (4 x 4):
## PC1 PC2 PC3 PC4
## Murder -0.041810210 0.047906448 -0.99725886 0.037837046
## Assault -0.998053720 0.044131837 0.04402688 0.001669238
## UrbanPop -0.046117192 -0.997841072 -0.04561010 0.010301396
## logRape -0.003725906 -0.008399323 -0.03815909 -0.999229430
biplot(pcs.unscaled, scale = 0, cex = 0.6)

# generating new feature by using the first PC
Usarrests <- USArrests
Usarrests$crime1 <- pcs$x[, 1]
head(Usarrests)
## Murder Assault UrbanPop logRape crime1
## Alabama 13.2 236 58 3.054001 -1.08857173
## Alaska 10.0 263 48 3.795489 -1.59517817
## Arizona 8.1 294 80 3.433987 -1.74286818
## Arkansas 8.8 190 50 2.970414 0.02590682
## California 9.0 276 91 3.703768 -2.27067947
## Colorado 7.9 204 78 3.655840 -1.33306093
# new feature generation by using the 3 loadings
Usarrests$crime2 <- pcs$rotation[1, 1] * scale(USArrests[, 1]) +
pcs$rotation[2, 1] * scale(USArrests[, 2]) +
pcs$rotation[4, 1] * scale(USArrests[, 4])
head(Usarrests)
## Murder Assault UrbanPop logRape crime1 crime2
## Alabama 13.2 236 58 3.054001 -1.08857173 -1.2326877
## Alaska 10.0 263 48 3.795489 -1.59517817 -1.9304293
## Arizona 8.1 294 80 3.433987 -1.74286818 -1.4664867
## Arkansas 8.8 190 50 2.970414 0.02590682 -0.2711172
## California 9.0 276 91 3.703768 -2.27067947 -1.7840493
## Colorado 7.9 204 78 3.655840 -1.33306093 -1.0949065
# new feature generation by using the 3 variables
pcs2 <- prcomp(USArrests[, c(1, 2, 4)], scale. = T)
Usarrests$crime3 <- pcs2$x[, 1]
head(Usarrests)
## Murder Assault UrbanPop logRape crime1 crime2 crime3
## Alabama 13.2 236 58 3.054001 -1.08857173 -1.2326877 -1.3085142
## Alaska 10.0 263 48 3.795489 -1.59517817 -1.9304293 -1.9744320
## Arizona 8.1 294 80 3.433987 -1.74286818 -1.4664867 -1.5022655
## Arkansas 8.8 190 50 2.970414 0.02590682 -0.2711172 -0.2874132
## California 9.0 276 91 3.703768 -2.27067947 -1.7840493 -1.8219787
## Colorado 7.9 204 78 3.655840 -1.33306093 -1.0949065 -1.1004001
# drop the original variables
Usarrests$Murder <- NULL
Usarrests$Assault <- NULL
Usarrests$logRape <- NULL
head(Usarrests)
## UrbanPop crime1 crime2 crime3
## Alabama 58 -1.08857173 -1.2326877 -1.3085142
## Alaska 48 -1.59517817 -1.9304293 -1.9744320
## Arizona 80 -1.74286818 -1.4664867 -1.5022655
## Arkansas 50 0.02590682 -0.2711172 -0.2874132
## California 91 -2.27067947 -1.7840493 -1.8219787
## Colorado 78 -1.33306093 -1.0949065 -1.1004001