Sumber :
Buku Belajar Statistika dengan R dari Prana Ugiana Gio dan Buku Data Mining for Business Analytics (yang kaya kalau tertarik bisa dibeli bukunya, kalau tak mampu bisa dicari di google)
Dataset : boston
Load Package
library(dplyr)
library(ggplot2)
library(scales)
library(factoextra)
Baca Data
data<-read.csv("BostonHousing.csv")
Ringkasan Statistik
#1
summary(data)
## CRIM ZN INDUS CHAS
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## NOX RM AGE DIS
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## RAD TAX PTRATIO LSTAT
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 1.73
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.: 6.95
## Median : 5.000 Median :330.0 Median :19.05 Median :11.36
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :12.65
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:16.95
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :37.97
## MEDV CAT..MEDV
## Min. : 5.00 Min. :0.000
## 1st Qu.:17.02 1st Qu.:0.000
## Median :21.20 Median :0.000
## Mean :22.53 Mean :0.166
## 3rd Qu.:25.00 3rd Qu.:0.000
## Max. :50.00 Max. :1.000
#2
data.frame(mean=sapply(data,mean),
sd=sapply(data,sd),
min=sapply(data,min),
max=sapply(data, max),
median=sapply(data, median),
length=sapply(data, length),
miss.value=sapply(data, function(x)
sum(length(which(is.na(x))))))
## mean sd min max median length
## CRIM 3.61352356 8.6015451 0.00632 88.9762 0.25651 506
## ZN 11.36363636 23.3224530 0.00000 100.0000 0.00000 506
## INDUS 11.13677866 6.8603529 0.46000 27.7400 9.69000 506
## CHAS 0.06916996 0.2539940 0.00000 1.0000 0.00000 506
## NOX 0.55469506 0.1158777 0.38500 0.8710 0.53800 506
## RM 6.28463439 0.7026171 3.56100 8.7800 6.20850 506
## AGE 68.57490119 28.1488614 2.90000 100.0000 77.50000 506
## DIS 3.79504269 2.1057101 1.12960 12.1265 3.20745 506
## RAD 9.54940711 8.7072594 1.00000 24.0000 5.00000 506
## TAX 408.23715415 168.5371161 187.00000 711.0000 330.00000 506
## PTRATIO 18.45553360 2.1649455 12.60000 22.0000 19.05000 506
## LSTAT 12.65306324 7.1410615 1.73000 37.9700 11.36000 506
## MEDV 22.53280632 9.1971041 5.00000 50.0000 21.20000 506
## CAT..MEDV 0.16600791 0.3724560 0.00000 1.0000 0.00000 506
## miss.value
## CRIM 0
## ZN 0
## INDUS 0
## CHAS 0
## NOX 0
## RM 0
## AGE 0
## DIS 0
## RAD 0
## TAX 0
## PTRATIO 0
## LSTAT 0
## MEDV 0
## CAT..MEDV 0
#melihat korelasi antar variabel (karena semua variabel numeric bisa dilihat korelasi langsung)
cor<-round(cor(data),2)
cor
## CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO
## CRIM 1.00 -0.20 0.41 -0.06 0.42 -0.22 0.35 -0.38 0.63 0.58 0.29
## ZN -0.20 1.00 -0.53 -0.04 -0.52 0.31 -0.57 0.66 -0.31 -0.31 -0.39
## INDUS 0.41 -0.53 1.00 0.06 0.76 -0.39 0.64 -0.71 0.60 0.72 0.38
## CHAS -0.06 -0.04 0.06 1.00 0.09 0.09 0.09 -0.10 -0.01 -0.04 -0.12
## NOX 0.42 -0.52 0.76 0.09 1.00 -0.30 0.73 -0.77 0.61 0.67 0.19
## RM -0.22 0.31 -0.39 0.09 -0.30 1.00 -0.24 0.21 -0.21 -0.29 -0.36
## AGE 0.35 -0.57 0.64 0.09 0.73 -0.24 1.00 -0.75 0.46 0.51 0.26
## DIS -0.38 0.66 -0.71 -0.10 -0.77 0.21 -0.75 1.00 -0.49 -0.53 -0.23
## RAD 0.63 -0.31 0.60 -0.01 0.61 -0.21 0.46 -0.49 1.00 0.91 0.46
## TAX 0.58 -0.31 0.72 -0.04 0.67 -0.29 0.51 -0.53 0.91 1.00 0.46
## PTRATIO 0.29 -0.39 0.38 -0.12 0.19 -0.36 0.26 -0.23 0.46 0.46 1.00
## LSTAT 0.46 -0.41 0.60 -0.05 0.59 -0.61 0.60 -0.50 0.49 0.54 0.37
## MEDV -0.39 0.36 -0.48 0.18 -0.43 0.70 -0.38 0.25 -0.38 -0.47 -0.51
## CAT..MEDV -0.15 0.37 -0.37 0.11 -0.23 0.64 -0.19 0.12 -0.20 -0.27 -0.44
## LSTAT MEDV CAT..MEDV
## CRIM 0.46 -0.39 -0.15
## ZN -0.41 0.36 0.37
## INDUS 0.60 -0.48 -0.37
## CHAS -0.05 0.18 0.11
## NOX 0.59 -0.43 -0.23
## RM -0.61 0.70 0.64
## AGE 0.60 -0.38 -0.19
## DIS -0.50 0.25 0.12
## RAD 0.49 -0.38 -0.20
## TAX 0.54 -0.47 -0.27
## PTRATIO 0.37 -0.51 -0.44
## LSTAT 1.00 -0.74 -0.47
## MEDV -0.74 1.00 0.79
## CAT..MEDV -0.47 0.79 1.00
#biar lebih enak liatnya
cor[upper.tri(cor)]=NA
cor
## CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO
## CRIM 1.00 NA NA NA NA NA NA NA NA NA NA
## ZN -0.20 1.00 NA NA NA NA NA NA NA NA NA
## INDUS 0.41 -0.53 1.00 NA NA NA NA NA NA NA NA
## CHAS -0.06 -0.04 0.06 1.00 NA NA NA NA NA NA NA
## NOX 0.42 -0.52 0.76 0.09 1.00 NA NA NA NA NA NA
## RM -0.22 0.31 -0.39 0.09 -0.30 1.00 NA NA NA NA NA
## AGE 0.35 -0.57 0.64 0.09 0.73 -0.24 1.00 NA NA NA NA
## DIS -0.38 0.66 -0.71 -0.10 -0.77 0.21 -0.75 1.00 NA NA NA
## RAD 0.63 -0.31 0.60 -0.01 0.61 -0.21 0.46 -0.49 1.00 NA NA
## TAX 0.58 -0.31 0.72 -0.04 0.67 -0.29 0.51 -0.53 0.91 1.00 NA
## PTRATIO 0.29 -0.39 0.38 -0.12 0.19 -0.36 0.26 -0.23 0.46 0.46 1.00
## LSTAT 0.46 -0.41 0.60 -0.05 0.59 -0.61 0.60 -0.50 0.49 0.54 0.37
## MEDV -0.39 0.36 -0.48 0.18 -0.43 0.70 -0.38 0.25 -0.38 -0.47 -0.51
## CAT..MEDV -0.15 0.37 -0.37 0.11 -0.23 0.64 -0.19 0.12 -0.20 -0.27 -0.44
## LSTAT MEDV CAT..MEDV
## CRIM NA NA NA
## ZN NA NA NA
## INDUS NA NA NA
## CHAS NA NA NA
## NOX NA NA NA
## RM NA NA NA
## AGE NA NA NA
## DIS NA NA NA
## RAD NA NA NA
## TAX NA NA NA
## PTRATIO NA NA NA
## LSTAT 1.00 NA NA
## MEDV -0.74 1.00 NA
## CAT..MEDV -0.47 0.79 1
aggregation tables
Kalau saya lebih enak pake package dplyr dari pada aggregate. Tentang dplyr bisa dibaca disini
#1
table(data$CHAS)
##
## 0 1
## 471 35
#tapi saya lebi enak
count(data,CHAS)
## CHAS n
## 1 0 471
## 2 1 35
data$rm.bin<-.bincode(data$RM, c(1:9))
#2
aggregate(data$MEDV, by=list(RM=data$rm.bin,CHAS=data$CHAS), FUN=mean)
## RM CHAS x
## 1 3 0 25.30000
## 2 4 0 15.40714
## 3 5 0 17.20000
## 4 6 0 21.76917
## 5 7 0 35.96444
## 6 8 0 45.70000
## 7 5 1 22.21818
## 8 6 1 25.91875
## 9 7 1 44.06667
## 10 8 1 35.95000
#lebih enak
data%>%
group_by(rm.bin,CHAS)%>%
summarise("rata-rata" = mean(MEDV))
## # A tibble: 10 x 3
## # Groups: rm.bin [6]
## rm.bin CHAS `rata-rata`
## <int> <int> <dbl>
## 1 3 0 25.3
## 2 4 0 15.4
## 3 5 0 17.2
## 4 5 1 22.2
## 5 6 0 21.8
## 6 6 1 25.9
## 7 7 0 36.0
## 8 7 1 44.1
## 9 8 0 45.7
## 10 8 1 36.0
pivot table
library(reshape)
mlt<-melt(data, id=c("rm.bin", "CHAS"), measure.vars = c("MEDV"))
cast(mlt, rm.bin ~ CHAS, subset = variable=="MEDV", margins = c("grand_row","grand_col"),mean)
## rm.bin 0 1 (all)
## 1 3 25.30000 NaN 25.30000
## 2 4 15.40714 NaN 15.40714
## 3 5 17.20000 22.21818 17.55159
## 4 6 21.76917 25.91875 22.01599
## 5 7 35.96444 44.06667 36.91765
## 6 8 45.70000 35.95000 44.20000
## 7 (all) 22.09384 28.44000 22.53281
reducing the number of categories in categorical variables
tbl<-table(data$CAT..MEDV, data$ZN)
prop.tbl<-prop.table(tbl,margin = 2)
barplot(prop.tbl)
axis(2, at= seq(0,1,0.2), paste(seq(0,100,20),"%"))
prop.tbl1<-as.data.frame(prop.tbl)
prop.tbl1$Var2<-as.factor(prop.tbl1$Var2)
p<-ggplot(prop.tbl1, aes(Var2, Freq, fill= Var1)) + geom_bar(stat = "identity")+
scale_y_continuous(labels = scales::percent_format(decimal.mark = ".") )+
xlab("ZN")
p+guides(fill=guide_legend(title = "CAT..MEDV"),x= guide_axis(n.dodge = 2))
Misal, disini kita anggap variabel ZN sebagai variabel categorik. Berdasarkan grafik diatas, disini kita dapat mengurangi kategori dari variabel ZN dengan menggabungkan categori (17.5, 90, 95, 100) menjadi satu kategori. Kemudian kita dapat menggabungkan kategori (12.5, 18, 25, 28, 30, 52.5, 70, 85) menjadi satu kategori.
PCA
cereal<-read.csv("Cereals.csv")
#PCA on the variable calories and rating
pcs<-prcomp(data.frame(cereal$calories, cereal$rating))
summary(pcs)
## Importance of components:
## PC1 PC2
## Standard deviation 22.3165 8.8844
## Proportion of Variance 0.8632 0.1368
## Cumulative Proportion 0.8632 1.0000
pcs$rotation
## PC1 PC2
## cereal.calories 0.8470535 0.5315077
## cereal.rating -0.5315077 0.8470535
pcs.cor<-prcomp(na.omit(cereal[,c("calories", "rating")]), scale. = T)
pcs.cor$rotation
## PC1 PC2
## calories 0.7071068 0.7071068
## rating -0.7071068 0.7071068
score<-pcs$x
pcs$rotation
## PC1 PC2
## cereal.calories 0.8470535 0.5315077
## cereal.rating -0.5315077 0.8470535
score<-as.data.frame(score)
cereal1<-bind_cols(cereal, score)
cereal1$name<-as.factor(cereal1$name)
cereal1$no<-as.numeric(cereal1$name)
ggplot(cereal1, aes(PC1, PC2))+
geom_point(size=5, col="red",pch=1)+
geom_text(aes(label=no))
#PCA on 13 variable
pcs1<-prcomp(na.omit(cereal[,-c(1:3)]))
summary(pcs1)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 83.7641 70.9143 22.64375 19.18148 8.42323 2.09167
## Proportion of Variance 0.5395 0.3867 0.03943 0.02829 0.00546 0.00034
## Cumulative Proportion 0.5395 0.9262 0.96560 0.99389 0.99935 0.99968
## PC7 PC8 PC9 PC10 PC11 PC12 PC13
## Standard deviation 1.69942 0.77963 0.65783 0.37043 0.1864 0.06302 5.334e-08
## Proportion of Variance 0.00022 0.00005 0.00003 0.00001 0.0000 0.00000 0.000e+00
## Cumulative Proportion 0.99991 0.99995 0.99999 1.00000 1.0000 1.00000 1.000e+00
pcs1.cor<-prcomp(na.omit(cereal[,-c(1:3)]), scale. = T)
summary(pcs1.cor)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.9062 1.7743 1.3818 1.00969 0.9947 0.84974 0.81946
## Proportion of Variance 0.2795 0.2422 0.1469 0.07842 0.0761 0.05554 0.05166
## Cumulative Proportion 0.2795 0.5217 0.6685 0.74696 0.8231 0.87861 0.93026
## PC8 PC9 PC10 PC11 PC12 PC13
## Standard deviation 0.64515 0.56192 0.30301 0.25194 0.13897 1.499e-08
## Proportion of Variance 0.03202 0.02429 0.00706 0.00488 0.00149 0.000e+00
## Cumulative Proportion 0.96228 0.98657 0.99363 0.99851 1.00000 1.000e+00
score1<-pcs1.cor$x
pcs1.cor$rotation
## PC1 PC2 PC3 PC4 PC5
## calories 0.29954236 0.3931479 -0.114857453 0.20435870 0.20389885
## protein -0.30735632 0.1653233 -0.277281953 0.30074318 0.31974897
## fat 0.03991542 0.3457243 0.204890102 0.18683311 0.58689327
## sodium 0.18339651 0.1372205 -0.389431009 0.12033726 -0.33836424
## fiber -0.45349036 0.1798119 -0.069766079 0.03917361 -0.25511906
## carbo 0.19244902 -0.1494483 -0.562452458 0.08783547 0.18274252
## sugars 0.22806849 0.3514345 0.355405174 -0.02270716 -0.31487243
## potass -0.40196429 0.3005442 -0.067620183 0.09087843 -0.14836048
## vitamins 0.11598020 0.1729092 -0.387858660 -0.60411064 -0.04928672
## shelf -0.17126336 0.2650503 0.001531036 -0.63887859 0.32910135
## weight 0.05029930 0.4503085 -0.247138314 0.15342874 -0.22128334
## cups 0.29463553 -0.2122479 -0.139999705 0.04748909 0.12081645
## rating -0.43837841 -0.2515389 -0.181842433 0.03831622 0.05758420
## PC6 PC7 PC8 PC9 PC10
## calories -0.25590627 0.02559550 -0.002477488 -0.02990943 0.49953153
## protein 0.12075193 -0.28270494 -0.426631915 -0.53471801 -0.02156642
## fat 0.34796727 0.05115470 0.063050445 0.45968763 -0.14508724
## sodium 0.66437216 0.28370316 0.176720469 -0.21510290 -0.00109092
## fiber 0.06424358 -0.11232541 0.216215570 0.24433313 0.29510679
## carbo -0.32639279 0.26046798 0.167436379 0.11680691 0.24055692
## sugars -0.15208230 -0.22798517 -0.063088121 -0.22538436 0.25165896
## potass 0.02515384 -0.14880826 0.262222466 0.16651285 0.17722073
## vitamins 0.12948578 -0.29427617 -0.457040884 0.34615102 0.05229538
## shelf -0.05204414 0.17483438 0.414145832 -0.41619893 -0.04610229
## weight -0.39877368 -0.01392058 0.075247652 0.06507258 -0.69159851
## cups 0.09946094 -0.74856682 0.498958957 -0.04952533 -0.07705936
## rating -0.18614524 -0.06344456 0.014944982 0.06327447 0.01223508
## PC11 PC12 PC13
## calories -0.213580948 -4.921985e-01 -2.339398e-01
## protein 0.032302627 9.969927e-02 1.863851e-01
## fat -0.066832137 2.914159e-01 -9.013898e-02
## sodium -0.087460805 5.371283e-02 -2.387372e-01
## fiber -0.530700425 5.917321e-02 4.417036e-01
## carbo 0.178979001 4.759935e-01 2.250341e-01
## sugars -0.003250769 6.142503e-01 -1.672565e-01
## potass 0.729209878 -1.207196e-01 -1.275321e-01
## vitamins 0.019464859 -4.477121e-05 -6.043328e-02
## shelf -0.059398709 1.654411e-02 -1.203515e-09
## weight -0.112920514 -3.062288e-02 -4.106761e-09
## cups -0.055325530 -3.213026e-02 1.409513e-09
## rating -0.275809192 1.892103e-01 -7.428180e-01
score1<-as.data.frame(score1)
cereal2<-na.omit(cereal)
cereal3<-bind_cols(cereal2,score1)
cereal3$name<-as.factor(cereal3$name)
cereal3$no<-as.numeric(cereal3$name)
#pake ggplot mah bingung, jelek graf na
ggplot(cereal3, aes(PC1, PC2))+
geom_point(size=5, col="red",pch=1)+
geom_text(aes(label=no))
#percentage of variance explained by each principal component
fviz_eig(pcs1.cor)
#individuals with similar profile are grouped together
fviz_pca_ind(pcs1.cor, repel = T, col.ind = "cos2",
gradient.cols=c("#00AFBB","#E7B800","#FC4E07"))
#graph of variables. Positive correlated variables point to the same side of the plot, negative correlated variables point to opposite sides of the graph
fviz_pca_var(pcs1.cor,col.var = "contrib", repel = T)
#individuals and variables
fviz_pca_biplot(pcs1.cor, repel = T )