# cleaning the working memory and load the data
rm(list=ls())

data("USArrests")
summary(USArrests)
##      Murder          Assault         UrbanPop          Rape      
##  Min.   : 0.800   Min.   : 45.0   Min.   :32.00   Min.   : 7.30  
##  1st Qu.: 4.075   1st Qu.:109.0   1st Qu.:54.50   1st Qu.:15.07  
##  Median : 7.250   Median :159.0   Median :66.00   Median :20.10  
##  Mean   : 7.788   Mean   :170.8   Mean   :65.54   Mean   :21.23  
##  3rd Qu.:11.250   3rd Qu.:249.0   3rd Qu.:77.75   3rd Qu.:26.18  
##  Max.   :17.400   Max.   :337.0   Max.   :91.00   Max.   :46.00
# explore the distribution of each variable
library(ggplot2)

for (var in names(USArrests)) {
  plt <- ggplot(aes(x = USArrests[, var]), data = USArrests) +
    geom_histogram(bins = 15) +
    labs(x = var)
  print(plt)
}

# do the log transformation to Rape variable
USArrests$logRape <- log(USArrests$Rape)
USArrests$Rape <- NULL
summary(USArrests)
##      Murder          Assault         UrbanPop        logRape     
##  Min.   : 0.800   Min.   : 45.0   Min.   :32.00   Min.   :1.988  
##  1st Qu.: 4.075   1st Qu.:109.0   1st Qu.:54.50   1st Qu.:2.713  
##  Median : 7.250   Median :159.0   Median :66.00   Median :3.001  
##  Mean   : 7.788   Mean   :170.8   Mean   :65.54   Mean   :2.959  
##  3rd Qu.:11.250   3rd Qu.:249.0   3rd Qu.:77.75   3rd Qu.:3.265  
##  Max.   :17.400   Max.   :337.0   Max.   :91.00   Max.   :3.829
# conduct the PCA
pcs <- prcomp(~Murder + Assault + UrbanPop + logRape, data=USArrests, scale. = T)
summary(pcs)
## Importance of components:
##                           PC1    PC2     PC3     PC4
## Standard deviation     1.5940 0.9930 0.54644 0.41769
## Proportion of Variance 0.6352 0.2465 0.07465 0.04362
## Cumulative Proportion  0.6352 0.8817 0.95638 1.00000
# biplot the first 2 PCs
biplot(pcs, scale = 0, cex = 0.6)

# for comparison, build the pcs without scaling
pcs.unscaled <- prcomp(~Murder + Assault + UrbanPop + logRape, data = USArrests, scale. = F)
pcs.unscaled
## Standard deviations (1, .., p=4):
## [1] 83.497812 13.982885  2.523684  0.291433
## 
## Rotation (n x k) = (4 x 4):
##                   PC1          PC2         PC3          PC4
## Murder   -0.041810210  0.047906448 -0.99725886  0.037837046
## Assault  -0.998053720  0.044131837  0.04402688  0.001669238
## UrbanPop -0.046117192 -0.997841072 -0.04561010  0.010301396
## logRape  -0.003725906 -0.008399323 -0.03815909 -0.999229430
biplot(pcs.unscaled, scale = 0, cex = 0.6)

# generating new feature by using the first PC
Usarrests <- USArrests

Usarrests$crime1 <- pcs$x[, 1]
head(Usarrests)
##            Murder Assault UrbanPop  logRape      crime1
## Alabama      13.2     236       58 3.054001 -1.08857173
## Alaska       10.0     263       48 3.795489 -1.59517817
## Arizona       8.1     294       80 3.433987 -1.74286818
## Arkansas      8.8     190       50 2.970414  0.02590682
## California    9.0     276       91 3.703768 -2.27067947
## Colorado      7.9     204       78 3.655840 -1.33306093
# new feature generation by using the 3 loadings
Usarrests$crime2 <- pcs$rotation[1, 1] * scale(USArrests[, 1]) +
  pcs$rotation[2, 1] * scale(USArrests[, 2]) +
  pcs$rotation[4, 1] * scale(USArrests[, 4])

head(Usarrests)
##            Murder Assault UrbanPop  logRape      crime1     crime2
## Alabama      13.2     236       58 3.054001 -1.08857173 -1.2326877
## Alaska       10.0     263       48 3.795489 -1.59517817 -1.9304293
## Arizona       8.1     294       80 3.433987 -1.74286818 -1.4664867
## Arkansas      8.8     190       50 2.970414  0.02590682 -0.2711172
## California    9.0     276       91 3.703768 -2.27067947 -1.7840493
## Colorado      7.9     204       78 3.655840 -1.33306093 -1.0949065
# new feature generation by using the 3 variables
pcs2 <- prcomp(USArrests[, c(1, 2, 4)], scale. = T)
Usarrests$crime3 <- pcs2$x[, 1]

head(Usarrests)
##            Murder Assault UrbanPop  logRape      crime1     crime2     crime3
## Alabama      13.2     236       58 3.054001 -1.08857173 -1.2326877 -1.3085142
## Alaska       10.0     263       48 3.795489 -1.59517817 -1.9304293 -1.9744320
## Arizona       8.1     294       80 3.433987 -1.74286818 -1.4664867 -1.5022655
## Arkansas      8.8     190       50 2.970414  0.02590682 -0.2711172 -0.2874132
## California    9.0     276       91 3.703768 -2.27067947 -1.7840493 -1.8219787
## Colorado      7.9     204       78 3.655840 -1.33306093 -1.0949065 -1.1004001
# drop the original variables
Usarrests$Murder <- NULL
Usarrests$Assault <- NULL
Usarrests$logRape <- NULL

head(Usarrests)
##            UrbanPop      crime1     crime2     crime3
## Alabama          58 -1.08857173 -1.2326877 -1.3085142
## Alaska           48 -1.59517817 -1.9304293 -1.9744320
## Arizona          80 -1.74286818 -1.4664867 -1.5022655
## Arkansas         50  0.02590682 -0.2711172 -0.2874132
## California       91 -2.27067947 -1.7840493 -1.8219787
## Colorado         78 -1.33306093 -1.0949065 -1.1004001