Belajar Dimension Reduction

Sumber :

Buku Belajar Statistika dengan R dari Prana Ugiana Gio dan Buku Data Mining for Business Analytics (yang kaya kalau tertarik bisa dibeli bukunya, kalau tak mampu bisa dicari di google)

Dataset : boston

cereal

Load Package

library(dplyr)
library(ggplot2)
library(scales)
library(factoextra)

Baca Data

data<-read.csv("BostonHousing.csv")

Ringkasan Statistik

#1
summary(data)
##       CRIM                ZN             INDUS            CHAS        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       NOX               RM             AGE              DIS        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       RAD              TAX           PTRATIO          LSTAT      
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   : 1.73  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.: 6.95  
##  Median : 5.000   Median :330.0   Median :19.05   Median :11.36  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :12.65  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:16.95  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :37.97  
##       MEDV         CAT..MEDV    
##  Min.   : 5.00   Min.   :0.000  
##  1st Qu.:17.02   1st Qu.:0.000  
##  Median :21.20   Median :0.000  
##  Mean   :22.53   Mean   :0.166  
##  3rd Qu.:25.00   3rd Qu.:0.000  
##  Max.   :50.00   Max.   :1.000
#2
data.frame(mean=sapply(data,mean),
           sd=sapply(data,sd),
           min=sapply(data,min),
           max=sapply(data, max),
           median=sapply(data, median),
           length=sapply(data, length),
           miss.value=sapply(data, function(x)
             sum(length(which(is.na(x))))))
##                   mean          sd       min      max    median length
## CRIM        3.61352356   8.6015451   0.00632  88.9762   0.25651    506
## ZN         11.36363636  23.3224530   0.00000 100.0000   0.00000    506
## INDUS      11.13677866   6.8603529   0.46000  27.7400   9.69000    506
## CHAS        0.06916996   0.2539940   0.00000   1.0000   0.00000    506
## NOX         0.55469506   0.1158777   0.38500   0.8710   0.53800    506
## RM          6.28463439   0.7026171   3.56100   8.7800   6.20850    506
## AGE        68.57490119  28.1488614   2.90000 100.0000  77.50000    506
## DIS         3.79504269   2.1057101   1.12960  12.1265   3.20745    506
## RAD         9.54940711   8.7072594   1.00000  24.0000   5.00000    506
## TAX       408.23715415 168.5371161 187.00000 711.0000 330.00000    506
## PTRATIO    18.45553360   2.1649455  12.60000  22.0000  19.05000    506
## LSTAT      12.65306324   7.1410615   1.73000  37.9700  11.36000    506
## MEDV       22.53280632   9.1971041   5.00000  50.0000  21.20000    506
## CAT..MEDV   0.16600791   0.3724560   0.00000   1.0000   0.00000    506
##           miss.value
## CRIM               0
## ZN                 0
## INDUS              0
## CHAS               0
## NOX                0
## RM                 0
## AGE                0
## DIS                0
## RAD                0
## TAX                0
## PTRATIO            0
## LSTAT              0
## MEDV               0
## CAT..MEDV          0
#melihat korelasi antar variabel (karena semua variabel numeric bisa dilihat korelasi langsung)
cor<-round(cor(data),2)
cor
##            CRIM    ZN INDUS  CHAS   NOX    RM   AGE   DIS   RAD   TAX PTRATIO
## CRIM       1.00 -0.20  0.41 -0.06  0.42 -0.22  0.35 -0.38  0.63  0.58    0.29
## ZN        -0.20  1.00 -0.53 -0.04 -0.52  0.31 -0.57  0.66 -0.31 -0.31   -0.39
## INDUS      0.41 -0.53  1.00  0.06  0.76 -0.39  0.64 -0.71  0.60  0.72    0.38
## CHAS      -0.06 -0.04  0.06  1.00  0.09  0.09  0.09 -0.10 -0.01 -0.04   -0.12
## NOX        0.42 -0.52  0.76  0.09  1.00 -0.30  0.73 -0.77  0.61  0.67    0.19
## RM        -0.22  0.31 -0.39  0.09 -0.30  1.00 -0.24  0.21 -0.21 -0.29   -0.36
## AGE        0.35 -0.57  0.64  0.09  0.73 -0.24  1.00 -0.75  0.46  0.51    0.26
## DIS       -0.38  0.66 -0.71 -0.10 -0.77  0.21 -0.75  1.00 -0.49 -0.53   -0.23
## RAD        0.63 -0.31  0.60 -0.01  0.61 -0.21  0.46 -0.49  1.00  0.91    0.46
## TAX        0.58 -0.31  0.72 -0.04  0.67 -0.29  0.51 -0.53  0.91  1.00    0.46
## PTRATIO    0.29 -0.39  0.38 -0.12  0.19 -0.36  0.26 -0.23  0.46  0.46    1.00
## LSTAT      0.46 -0.41  0.60 -0.05  0.59 -0.61  0.60 -0.50  0.49  0.54    0.37
## MEDV      -0.39  0.36 -0.48  0.18 -0.43  0.70 -0.38  0.25 -0.38 -0.47   -0.51
## CAT..MEDV -0.15  0.37 -0.37  0.11 -0.23  0.64 -0.19  0.12 -0.20 -0.27   -0.44
##           LSTAT  MEDV CAT..MEDV
## CRIM       0.46 -0.39     -0.15
## ZN        -0.41  0.36      0.37
## INDUS      0.60 -0.48     -0.37
## CHAS      -0.05  0.18      0.11
## NOX        0.59 -0.43     -0.23
## RM        -0.61  0.70      0.64
## AGE        0.60 -0.38     -0.19
## DIS       -0.50  0.25      0.12
## RAD        0.49 -0.38     -0.20
## TAX        0.54 -0.47     -0.27
## PTRATIO    0.37 -0.51     -0.44
## LSTAT      1.00 -0.74     -0.47
## MEDV      -0.74  1.00      0.79
## CAT..MEDV -0.47  0.79      1.00
#biar lebih enak liatnya
cor[upper.tri(cor)]=NA
cor
##            CRIM    ZN INDUS  CHAS   NOX    RM   AGE   DIS   RAD   TAX PTRATIO
## CRIM       1.00    NA    NA    NA    NA    NA    NA    NA    NA    NA      NA
## ZN        -0.20  1.00    NA    NA    NA    NA    NA    NA    NA    NA      NA
## INDUS      0.41 -0.53  1.00    NA    NA    NA    NA    NA    NA    NA      NA
## CHAS      -0.06 -0.04  0.06  1.00    NA    NA    NA    NA    NA    NA      NA
## NOX        0.42 -0.52  0.76  0.09  1.00    NA    NA    NA    NA    NA      NA
## RM        -0.22  0.31 -0.39  0.09 -0.30  1.00    NA    NA    NA    NA      NA
## AGE        0.35 -0.57  0.64  0.09  0.73 -0.24  1.00    NA    NA    NA      NA
## DIS       -0.38  0.66 -0.71 -0.10 -0.77  0.21 -0.75  1.00    NA    NA      NA
## RAD        0.63 -0.31  0.60 -0.01  0.61 -0.21  0.46 -0.49  1.00    NA      NA
## TAX        0.58 -0.31  0.72 -0.04  0.67 -0.29  0.51 -0.53  0.91  1.00      NA
## PTRATIO    0.29 -0.39  0.38 -0.12  0.19 -0.36  0.26 -0.23  0.46  0.46    1.00
## LSTAT      0.46 -0.41  0.60 -0.05  0.59 -0.61  0.60 -0.50  0.49  0.54    0.37
## MEDV      -0.39  0.36 -0.48  0.18 -0.43  0.70 -0.38  0.25 -0.38 -0.47   -0.51
## CAT..MEDV -0.15  0.37 -0.37  0.11 -0.23  0.64 -0.19  0.12 -0.20 -0.27   -0.44
##           LSTAT MEDV CAT..MEDV
## CRIM         NA   NA        NA
## ZN           NA   NA        NA
## INDUS        NA   NA        NA
## CHAS         NA   NA        NA
## NOX          NA   NA        NA
## RM           NA   NA        NA
## AGE          NA   NA        NA
## DIS          NA   NA        NA
## RAD          NA   NA        NA
## TAX          NA   NA        NA
## PTRATIO      NA   NA        NA
## LSTAT      1.00   NA        NA
## MEDV      -0.74 1.00        NA
## CAT..MEDV -0.47 0.79         1

aggregation tables

Kalau saya lebih enak pake package dplyr dari pada aggregate. Tentang dplyr bisa dibaca disini

#1
table(data$CHAS)
## 
##   0   1 
## 471  35
#tapi saya lebi enak
count(data,CHAS)
##   CHAS   n
## 1    0 471
## 2    1  35
data$rm.bin<-.bincode(data$RM, c(1:9))
#2
aggregate(data$MEDV, by=list(RM=data$rm.bin,CHAS=data$CHAS), FUN=mean)
##    RM CHAS        x
## 1   3    0 25.30000
## 2   4    0 15.40714
## 3   5    0 17.20000
## 4   6    0 21.76917
## 5   7    0 35.96444
## 6   8    0 45.70000
## 7   5    1 22.21818
## 8   6    1 25.91875
## 9   7    1 44.06667
## 10  8    1 35.95000
#lebih enak
data%>%
  group_by(rm.bin,CHAS)%>%
  summarise("rata-rata" = mean(MEDV))
## # A tibble: 10 x 3
## # Groups:   rm.bin [6]
##    rm.bin  CHAS `rata-rata`
##     <int> <int>       <dbl>
##  1      3     0        25.3
##  2      4     0        15.4
##  3      5     0        17.2
##  4      5     1        22.2
##  5      6     0        21.8
##  6      6     1        25.9
##  7      7     0        36.0
##  8      7     1        44.1
##  9      8     0        45.7
## 10      8     1        36.0

pivot table

library(reshape)
mlt<-melt(data, id=c("rm.bin", "CHAS"), measure.vars = c("MEDV"))

cast(mlt, rm.bin ~ CHAS, subset = variable=="MEDV", margins = c("grand_row","grand_col"),mean)
##   rm.bin        0        1    (all)
## 1      3 25.30000      NaN 25.30000
## 2      4 15.40714      NaN 15.40714
## 3      5 17.20000 22.21818 17.55159
## 4      6 21.76917 25.91875 22.01599
## 5      7 35.96444 44.06667 36.91765
## 6      8 45.70000 35.95000 44.20000
## 7  (all) 22.09384 28.44000 22.53281

reducing the number of categories in categorical variables

tbl<-table(data$CAT..MEDV, data$ZN)
prop.tbl<-prop.table(tbl,margin = 2)
barplot(prop.tbl)
axis(2, at= seq(0,1,0.2), paste(seq(0,100,20),"%"))

prop.tbl1<-as.data.frame(prop.tbl)
prop.tbl1$Var2<-as.factor(prop.tbl1$Var2)
p<-ggplot(prop.tbl1, aes(Var2, Freq, fill= Var1)) + geom_bar(stat = "identity")+
  scale_y_continuous(labels = scales::percent_format(decimal.mark = ".") )+
  xlab("ZN")
p+guides(fill=guide_legend(title = "CAT..MEDV"),x= guide_axis(n.dodge = 2))

Misal, disini kita anggap variabel ZN sebagai variabel categorik. Berdasarkan grafik diatas, disini kita dapat mengurangi kategori dari variabel ZN dengan menggabungkan categori (17.5, 90, 95, 100) menjadi satu kategori. Kemudian kita dapat menggabungkan kategori (12.5, 18, 25, 28, 30, 52.5, 70, 85) menjadi satu kategori.

PCA

cereal<-read.csv("Cereals.csv")

#PCA on the variable calories and rating
pcs<-prcomp(data.frame(cereal$calories, cereal$rating))
summary(pcs)
## Importance of components:
##                            PC1    PC2
## Standard deviation     22.3165 8.8844
## Proportion of Variance  0.8632 0.1368
## Cumulative Proportion   0.8632 1.0000
pcs$rotation
##                        PC1       PC2
## cereal.calories  0.8470535 0.5315077
## cereal.rating   -0.5315077 0.8470535
pcs.cor<-prcomp(na.omit(cereal[,c("calories", "rating")]), scale. = T)
pcs.cor$rotation
##                 PC1       PC2
## calories  0.7071068 0.7071068
## rating   -0.7071068 0.7071068
score<-pcs$x
pcs$rotation
##                        PC1       PC2
## cereal.calories  0.8470535 0.5315077
## cereal.rating   -0.5315077 0.8470535
score<-as.data.frame(score)
cereal1<-bind_cols(cereal, score)
cereal1$name<-as.factor(cereal1$name)
cereal1$no<-as.numeric(cereal1$name)
ggplot(cereal1, aes(PC1, PC2))+
  geom_point(size=5, col="red",pch=1)+
  geom_text(aes(label=no))

#PCA on 13 variable
pcs1<-prcomp(na.omit(cereal[,-c(1:3)]))
summary(pcs1)
## Importance of components:
##                            PC1     PC2      PC3      PC4     PC5     PC6
## Standard deviation     83.7641 70.9143 22.64375 19.18148 8.42323 2.09167
## Proportion of Variance  0.5395  0.3867  0.03943  0.02829 0.00546 0.00034
## Cumulative Proportion   0.5395  0.9262  0.96560  0.99389 0.99935 0.99968
##                            PC7     PC8     PC9    PC10   PC11    PC12      PC13
## Standard deviation     1.69942 0.77963 0.65783 0.37043 0.1864 0.06302 5.334e-08
## Proportion of Variance 0.00022 0.00005 0.00003 0.00001 0.0000 0.00000 0.000e+00
## Cumulative Proportion  0.99991 0.99995 0.99999 1.00000 1.0000 1.00000 1.000e+00
pcs1.cor<-prcomp(na.omit(cereal[,-c(1:3)]), scale. = T)
summary(pcs1.cor)
## Importance of components:
##                           PC1    PC2    PC3     PC4    PC5     PC6     PC7
## Standard deviation     1.9062 1.7743 1.3818 1.00969 0.9947 0.84974 0.81946
## Proportion of Variance 0.2795 0.2422 0.1469 0.07842 0.0761 0.05554 0.05166
## Cumulative Proportion  0.2795 0.5217 0.6685 0.74696 0.8231 0.87861 0.93026
##                            PC8     PC9    PC10    PC11    PC12      PC13
## Standard deviation     0.64515 0.56192 0.30301 0.25194 0.13897 1.499e-08
## Proportion of Variance 0.03202 0.02429 0.00706 0.00488 0.00149 0.000e+00
## Cumulative Proportion  0.96228 0.98657 0.99363 0.99851 1.00000 1.000e+00
score1<-pcs1.cor$x
pcs1.cor$rotation
##                  PC1        PC2          PC3         PC4         PC5
## calories  0.29954236  0.3931479 -0.114857453  0.20435870  0.20389885
## protein  -0.30735632  0.1653233 -0.277281953  0.30074318  0.31974897
## fat       0.03991542  0.3457243  0.204890102  0.18683311  0.58689327
## sodium    0.18339651  0.1372205 -0.389431009  0.12033726 -0.33836424
## fiber    -0.45349036  0.1798119 -0.069766079  0.03917361 -0.25511906
## carbo     0.19244902 -0.1494483 -0.562452458  0.08783547  0.18274252
## sugars    0.22806849  0.3514345  0.355405174 -0.02270716 -0.31487243
## potass   -0.40196429  0.3005442 -0.067620183  0.09087843 -0.14836048
## vitamins  0.11598020  0.1729092 -0.387858660 -0.60411064 -0.04928672
## shelf    -0.17126336  0.2650503  0.001531036 -0.63887859  0.32910135
## weight    0.05029930  0.4503085 -0.247138314  0.15342874 -0.22128334
## cups      0.29463553 -0.2122479 -0.139999705  0.04748909  0.12081645
## rating   -0.43837841 -0.2515389 -0.181842433  0.03831622  0.05758420
##                  PC6         PC7          PC8         PC9        PC10
## calories -0.25590627  0.02559550 -0.002477488 -0.02990943  0.49953153
## protein   0.12075193 -0.28270494 -0.426631915 -0.53471801 -0.02156642
## fat       0.34796727  0.05115470  0.063050445  0.45968763 -0.14508724
## sodium    0.66437216  0.28370316  0.176720469 -0.21510290 -0.00109092
## fiber     0.06424358 -0.11232541  0.216215570  0.24433313  0.29510679
## carbo    -0.32639279  0.26046798  0.167436379  0.11680691  0.24055692
## sugars   -0.15208230 -0.22798517 -0.063088121 -0.22538436  0.25165896
## potass    0.02515384 -0.14880826  0.262222466  0.16651285  0.17722073
## vitamins  0.12948578 -0.29427617 -0.457040884  0.34615102  0.05229538
## shelf    -0.05204414  0.17483438  0.414145832 -0.41619893 -0.04610229
## weight   -0.39877368 -0.01392058  0.075247652  0.06507258 -0.69159851
## cups      0.09946094 -0.74856682  0.498958957 -0.04952533 -0.07705936
## rating   -0.18614524 -0.06344456  0.014944982  0.06327447  0.01223508
##                  PC11          PC12          PC13
## calories -0.213580948 -4.921985e-01 -2.339398e-01
## protein   0.032302627  9.969927e-02  1.863851e-01
## fat      -0.066832137  2.914159e-01 -9.013898e-02
## sodium   -0.087460805  5.371283e-02 -2.387372e-01
## fiber    -0.530700425  5.917321e-02  4.417036e-01
## carbo     0.178979001  4.759935e-01  2.250341e-01
## sugars   -0.003250769  6.142503e-01 -1.672565e-01
## potass    0.729209878 -1.207196e-01 -1.275321e-01
## vitamins  0.019464859 -4.477121e-05 -6.043328e-02
## shelf    -0.059398709  1.654411e-02 -1.203515e-09
## weight   -0.112920514 -3.062288e-02 -4.106761e-09
## cups     -0.055325530 -3.213026e-02  1.409513e-09
## rating   -0.275809192  1.892103e-01 -7.428180e-01
score1<-as.data.frame(score1)
cereal2<-na.omit(cereal)
cereal3<-bind_cols(cereal2,score1)
cereal3$name<-as.factor(cereal3$name)
cereal3$no<-as.numeric(cereal3$name)

#pake ggplot mah bingung, jelek graf na
ggplot(cereal3, aes(PC1, PC2))+
  geom_point(size=5, col="red",pch=1)+
  geom_text(aes(label=no))

#percentage of variance explained by each principal component
fviz_eig(pcs1.cor)

#individuals with similar profile are grouped together
fviz_pca_ind(pcs1.cor, repel = T, col.ind = "cos2",
             gradient.cols=c("#00AFBB","#E7B800","#FC4E07"))

#graph of variables. Positive correlated variables point to the same side of the plot, negative correlated variables point to opposite sides of the graph
fviz_pca_var(pcs1.cor,col.var = "contrib", repel = T)

#individuals and variables
fviz_pca_biplot(pcs1.cor, repel = T )