#data
raw <- read.csv("creditcard.csv")
str(raw)
## 'data.frame': 284807 obs. of 31 variables:
## $ Time : num 0 0 1 1 2 2 4 7 7 9 ...
## $ V1 : num -1.36 1.192 -1.358 -0.966 -1.158 ...
## $ V2 : num -0.0728 0.2662 -1.3402 -0.1852 0.8777 ...
## $ V3 : num 2.536 0.166 1.773 1.793 1.549 ...
## $ V4 : num 1.378 0.448 0.38 -0.863 0.403 ...
## $ V5 : num -0.3383 0.06 -0.5032 -0.0103 -0.4072 ...
## $ V6 : num 0.4624 -0.0824 1.8005 1.2472 0.0959 ...
## $ V7 : num 0.2396 -0.0788 0.7915 0.2376 0.5929 ...
## $ V8 : num 0.0987 0.0851 0.2477 0.3774 -0.2705 ...
## $ V9 : num 0.364 -0.255 -1.515 -1.387 0.818 ...
## $ V10 : num 0.0908 -0.167 0.2076 -0.055 0.7531 ...
## $ V11 : num -0.552 1.613 0.625 -0.226 -0.823 ...
## $ V12 : num -0.6178 1.0652 0.0661 0.1782 0.5382 ...
## $ V13 : num -0.991 0.489 0.717 0.508 1.346 ...
## $ V14 : num -0.311 -0.144 -0.166 -0.288 -1.12 ...
## $ V15 : num 1.468 0.636 2.346 -0.631 0.175 ...
## $ V16 : num -0.47 0.464 -2.89 -1.06 -0.451 ...
## $ V17 : num 0.208 -0.115 1.11 -0.684 -0.237 ...
## $ V18 : num 0.0258 -0.1834 -0.1214 1.9658 -0.0382 ...
## $ V19 : num 0.404 -0.146 -2.262 -1.233 0.803 ...
## $ V20 : num 0.2514 -0.0691 0.525 -0.208 0.4085 ...
## $ V21 : num -0.01831 -0.22578 0.248 -0.1083 -0.00943 ...
## $ V22 : num 0.27784 -0.63867 0.77168 0.00527 0.79828 ...
## $ V23 : num -0.11 0.101 0.909 -0.19 -0.137 ...
## $ V24 : num 0.0669 -0.3398 -0.6893 -1.1756 0.1413 ...
## $ V25 : num 0.129 0.167 -0.328 0.647 -0.206 ...
## $ V26 : num -0.189 0.126 -0.139 -0.222 0.502 ...
## $ V27 : num 0.13356 -0.00898 -0.05535 0.06272 0.21942 ...
## $ V28 : num -0.0211 0.0147 -0.0598 0.0615 0.2152 ...
## $ Amount: num 149.62 2.69 378.66 123.5 69.99 ...
## $ Class : int 0 0 0 0 0 0 0 0 0 0 ...
summary(raw)
## Time V1 V2
## Min. : 0 Min. :-56.40751 Min. :-72.71573
## 1st Qu.: 54202 1st Qu.: -0.92037 1st Qu.: -0.59855
## Median : 84692 Median : 0.01811 Median : 0.06549
## Mean : 94814 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.:139321 3rd Qu.: 1.31564 3rd Qu.: 0.80372
## Max. :172792 Max. : 2.45493 Max. : 22.05773
## V3 V4 V5
## Min. :-48.3256 Min. :-5.68317 Min. :-113.74331
## 1st Qu.: -0.8904 1st Qu.:-0.84864 1st Qu.: -0.69160
## Median : 0.1799 Median :-0.01985 Median : -0.05434
## Mean : 0.0000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 1.0272 3rd Qu.: 0.74334 3rd Qu.: 0.61193
## Max. : 9.3826 Max. :16.87534 Max. : 34.80167
## V6 V7 V8
## Min. :-26.1605 Min. :-43.5572 Min. :-73.21672
## 1st Qu.: -0.7683 1st Qu.: -0.5541 1st Qu.: -0.20863
## Median : -0.2742 Median : 0.0401 Median : 0.02236
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.3986 3rd Qu.: 0.5704 3rd Qu.: 0.32735
## Max. : 73.3016 Max. :120.5895 Max. : 20.00721
## V9 V10 V11
## Min. :-13.43407 Min. :-24.58826 Min. :-4.79747
## 1st Qu.: -0.64310 1st Qu.: -0.53543 1st Qu.:-0.76249
## Median : -0.05143 Median : -0.09292 Median :-0.03276
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.59714 3rd Qu.: 0.45392 3rd Qu.: 0.73959
## Max. : 15.59500 Max. : 23.74514 Max. :12.01891
## V12 V13 V14
## Min. :-18.6837 Min. :-5.79188 Min. :-19.2143
## 1st Qu.: -0.4056 1st Qu.:-0.64854 1st Qu.: -0.4256
## Median : 0.1400 Median :-0.01357 Median : 0.0506
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.6182 3rd Qu.: 0.66251 3rd Qu.: 0.4931
## Max. : 7.8484 Max. : 7.12688 Max. : 10.5268
## V15 V16 V17
## Min. :-4.49894 Min. :-14.12985 Min. :-25.16280
## 1st Qu.:-0.58288 1st Qu.: -0.46804 1st Qu.: -0.48375
## Median : 0.04807 Median : 0.06641 Median : -0.06568
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.64882 3rd Qu.: 0.52330 3rd Qu.: 0.39968
## Max. : 8.87774 Max. : 17.31511 Max. : 9.25353
## V18 V19 V20
## Min. :-9.498746 Min. :-7.213527 Min. :-54.49772
## 1st Qu.:-0.498850 1st Qu.:-0.456299 1st Qu.: -0.21172
## Median :-0.003636 Median : 0.003735 Median : -0.06248
## Mean : 0.000000 Mean : 0.000000 Mean : 0.00000
## 3rd Qu.: 0.500807 3rd Qu.: 0.458949 3rd Qu.: 0.13304
## Max. : 5.041069 Max. : 5.591971 Max. : 39.42090
## V21 V22 V23
## Min. :-34.83038 Min. :-10.933144 Min. :-44.80774
## 1st Qu.: -0.22839 1st Qu.: -0.542350 1st Qu.: -0.16185
## Median : -0.02945 Median : 0.006782 Median : -0.01119
## Mean : 0.00000 Mean : 0.000000 Mean : 0.00000
## 3rd Qu.: 0.18638 3rd Qu.: 0.528554 3rd Qu.: 0.14764
## Max. : 27.20284 Max. : 10.503090 Max. : 22.52841
## V24 V25 V26
## Min. :-2.83663 Min. :-10.29540 Min. :-2.60455
## 1st Qu.:-0.35459 1st Qu.: -0.31715 1st Qu.:-0.32698
## Median : 0.04098 Median : 0.01659 Median :-0.05214
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.43953 3rd Qu.: 0.35072 3rd Qu.: 0.24095
## Max. : 4.58455 Max. : 7.51959 Max. : 3.51735
## V27 V28 Amount
## Min. :-22.565679 Min. :-15.43008 Min. : 0.00
## 1st Qu.: -0.070840 1st Qu.: -0.05296 1st Qu.: 5.60
## Median : 0.001342 Median : 0.01124 Median : 22.00
## Mean : 0.000000 Mean : 0.00000 Mean : 88.35
## 3rd Qu.: 0.091045 3rd Qu.: 0.07828 3rd Qu.: 77.17
## Max. : 31.612198 Max. : 33.84781 Max. :25691.16
## Class
## Min. :0.000000
## 1st Qu.:0.000000
## Median :0.000000
## Mean :0.001728
## 3rd Qu.:0.000000
## Max. :1.000000
names(raw) <- tolower(names(raw))
#columns are now: time, v1~v28, amount, class (int)
summary(raw$time)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 54202 84692 94814 139321 172792
table(raw$class)
##
## 0 1
## 284315 492
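#to see the imbalance as a proportion (a quick check; roughly 0.17% of rows are fraud):
prop.table(table(raw$class))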
#package
#install.packages("dplyr")
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#install.packages("ggplot2")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.6.3
# library(plotly)
# library(patchwork)
library(outliers)
library(FNN)
## Warning: package 'FNN' was built under R version 3.6.1
library(dbscan)
## Warning: package 'dbscan' was built under R version 3.6.3
library(isofor)
library(cluster)
#install.packages("OutlierDetection")
library(OutlierDetection)
## Warning: package 'OutlierDetection' was built under R version 3.6.3
## Registered S3 method overwritten by 'spatstat':
## method from
## print.boxx cli
##1.boxplot
raw_long <- raw %>% gather(varia, val, starts_with("v")) #reshape to long format
raw_box <- raw_long %>% ggplot(aes(x = varia, y = val)) + geom_boxplot() #and draw the boxplot.
#Wait, aren't v1~v28 possibly on different scales?
sc_raw <- as.data.frame(scale(raw[,c(2:29)])) #just scale() them. There are other ways to normalize/standardize, but scale() is the most convenient.
sc_raw <- cbind(raw[,c(1, 30:31)], sc_raw) #then bind the remaining variables back on.
sc_raw_long <- sc_raw %>% gather(varia, val, starts_with("v")) #and reshape to long again
sc_raw_long$varia <- as.factor(sc_raw_long$varia)
#sc_raw_box <- sc_raw_long %>% ggplot(aes(x = varia, y = val)) + geom_boxplot() #plot it again,,, looks no different from before???
#sc_raw_box
#Also, drawing this takes a long time: the data was 284807 rows to begin with, and we just multiplied that by 28.
sc_raw_long %>% ggplot(aes(x = varia, y = val)) +
geom_boxplot(fill='slategrey',color='darkslategrey',width=0.3, outlier.colour = "red", outlier.alpha = 0.3) #add some color while we're at it

#why isn't patchwork working?
#raw_box + sc_raw_box
#Not sure finding outliers with boxplots is all that meaningful for this dataset, though.
##Hold on, reading fraud as 0 vs 1 is awkward. Let's relabel.
sc_raw_long$class <- factor(sc_raw_long$class, levels = c("0", "1"), labels = c("NF", "FRA"))
##Also, let's sample the data down so the training split is about 1/5 of it
set.seed(123)
idx <- sample(size = NROW(sc_raw), x = c("tr", "vl", "te"), replace= T, prob = c(1,2,2))
tr <- sc_raw[idx == 'tr', ]
te <- sc_raw[idx == 'te', ]
vl <- sc_raw[idx == 'vl', ]
NROW(tr) #down to 57218 rows.
## [1] 57218
tr_long <- tr %>% gather(varia, val, starts_with("v")) #to long format!
##2. Grubbs test: assumes a normal distribution. Look at a histogram of the variable first, check whether it is roughly normal, then run the test; you also need to be able to interpret the result.
#https://statkclee.github.io/model/model-anomaly.html
#https://sosal.kr/945
#https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h1.htm
#https://www.youtube.com/watch?v=ovIoVApeH_Q
#https://www.youtube.com/watch?v=rcYN89Trt4Y
#let's look at just v24: histogram first.
tr_long_v24 <- tr_long %>% filter(varia == "v24")
tr_long_v24 %>% ggplot(aes(x = val)) + geom_histogram() #hmm, that skewness and kurtosis,,, let's call it roughly normal
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

grubbs.test(tr$v24)
##
## Grubbs test for one outlier
##
## data: tr$v24
## G = 6.64222, U = 0.99923, p-value = 8.765e-07
## alternative hypothesis: highest value 6.6283560145564 is an outlier
#The p-value above is for the test that the single highest value is an outlier (null hypothesis: no outlier in the data).
#But the test assumes normality and, by default (type = 10), only examines the most extreme value, so it's hard to put to real use here.
#grubbs.test(x, type = 10, opposite = FALSE, two.sided = FALSE)
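#That said, grubbs.test is not limited to the maximum. A quick sketch on the same column:
#opposite = TRUE tests the lowest value instead, and type = 11 tests both extremes at once.
grubbs.test(tr$v24, opposite = TRUE)
grubbs.test(tr$v24, type = 11)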
##3. AnomalyDetectionVec: for finding anomalies in a plain vector.
#http://research.sualab.com/introduction/review/2020/01/30/anomaly-detection-overview-1.html
#AnomalyDetectionVec(data, period = , direction = , plot = T/F)
#for example: river_anomalies <- AnomalyDetectionVec(x = river$nitrate, period = 12, direction = 'both', plot = T)
#what time-series data should we bring in? Let's grab the avocado data
raw_avo <- read.csv("avocado.csv")
names(raw_avo) <- tolower(names(raw_avo))
str(raw_avo)
## 'data.frame': 18249 obs. of 14 variables:
## $ x : int 0 1 2 3 4 5 6 7 8 9 ...
## $ date : Factor w/ 169 levels "2015-01-04","2015-01-11",..: 52 51 50 49 48 47 46 45 44 43 ...
## $ averageprice: num 1.33 1.35 0.93 1.08 1.28 1.26 0.99 0.98 1.02 1.07 ...
## $ total.volume: num 64237 54877 118220 78992 51040 ...
## $ x4046 : num 1037 674 795 1132 941 ...
## $ x4225 : num 54455 44639 109150 71976 43838 ...
## $ x4770 : num 48.2 58.3 130.5 72.6 75.8 ...
## $ total.bags : num 8697 9506 8145 5811 6184 ...
## $ small.bags : num 8604 9408 8042 5677 5986 ...
## $ large.bags : num 93.2 97.5 103.1 133.8 197.7 ...
## $ xlarge.bags : num 0 0 0 0 0 0 0 0 0 0 ...
## $ type : Factor w/ 2 levels "conventional",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ region : Factor w/ 54 levels "Albany","Atlanta",..: 1 1 1 1 1 1 1 1 1 1 ...
raw_avo_avprice <- raw_avo %>% select(x, date, averageprice)
raw_avo_avprice %>% ggplot(aes(as.Date(date), averageprice)) + geom_line() #date is a factor; convert it so the x axis is a real time axis

raw_avo_avprice_ts <- ts(na.omit(raw_avo_avprice$averageprice), frequency = 52) #rough: the file stacks 54 regions x 2 types, so this is not one clean weekly series, but fine for a demo
raw_avo_avprice_ts %>% ts.plot()

#Can't get the package to install. What's the problem,,,, oh well, moving on.
#AnomalyDetectionVec(x=raw_avo_avprice$averageprice, period = 52, direction = "both", plot = T)
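#For reference, a sketch of how the call would go if the GitHub-only AnomalyDetection
#package did install (it is not on CRAN; max_anoms = 0.02 is an illustrative choice):
#remotes::install_github("twitter/AnomalyDetection")
#library(AnomalyDetection)
#avo_anoms <- AnomalyDetectionVec(x = raw_avo_avprice$averageprice, max_anoms = 0.02,
#                                 period = 52, direction = "both", plot = TRUE)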
##4. Finally, KNN
#Method 1: the nn() function from the OutlierDetection package
#https://rdrr.io/cran/OutlierDetection/man/nn.html
# knn_tr <- nn(tr, k = 0.05 * nrow(tr), cutoff = 0.95, Method = "euclidean", rnames = FALSE, boottimes = 100)
#It errors out, saying it cannot allocate a 12.2 GB vector: nn() builds the full pairwise distance matrix.
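#One workaround before giving up (a sketch: the 2000-row subsample is an arbitrary size,
#and columns 4:31 are the scaled v1~v28 columns):
# tr_small <- tr[sample(nrow(tr), 2000), 4:31]
# knn_tr_small <- nn(tr_small, k = 0.05 * nrow(tr_small), cutoff = 0.95,
#                    Method = "euclidean", rnames = FALSE, boottimes = 100)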
#Or just try it on a smaller dataset
knn_te <- nn(iris[,1:4], k = 0.05*nrow(iris), cutoff = 0.95, Method = "euclidean", rnames = FALSE, boottimes =100)
knn_te
## $`Outlier Observations`
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 107 4.9 2.5 4.5 1.7
## 118 7.7 3.8 6.7 2.2
## 119 7.7 2.6 6.9 2.3
## 132 7.9 3.8 6.4 2.0
##
## $`Location of Outlier`
## [1] 107 118 119 132
##
## $`Outlier Probability`
## [1] 1 1 1 1
##
## $`3Dplot`
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
#Wow, it renders a 3D plotly plot? Fun.
#Let's try another dataset too! The 2015 world happiness data
happy_2015 <- read.csv("2015.csv")
str(happy_2015)
## 'data.frame': 158 obs. of 12 variables:
## $ Country : Factor w/ 158 levels "Afghanistan",..: 136 59 38 106 25 46 100 135 101 7 ...
## $ Region : Factor w/ 10 levels "Australia and New Zealand",..: 10 10 10 10 6 10 10 10 1 1 ...
## $ Happiness.Rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Happiness.Score : num 7.59 7.56 7.53 7.52 7.43 ...
## $ Standard.Error : num 0.0341 0.0488 0.0333 0.0388 0.0355 ...
## $ Economy..GDP.per.Capita. : num 1.4 1.3 1.33 1.46 1.33 ...
## $ Family : num 1.35 1.4 1.36 1.33 1.32 ...
## $ Health..Life.Expectancy. : num 0.941 0.948 0.875 0.885 0.906 ...
## $ Freedom : num 0.666 0.629 0.649 0.67 0.633 ...
## $ Trust..Government.Corruption.: num 0.42 0.141 0.484 0.365 0.33 ...
## $ Generosity : num 0.297 0.436 0.341 0.347 0.458 ...
## $ Dystopia.Residual : num 2.52 2.7 2.49 2.47 2.45 ...
free_gen <- happy_2015[,c("Freedom", "Generosity")]
free_gen_happy <- nn(free_gen , k = 0.05*nrow(free_gen), cutoff = 0.95, Method = "manhattan", rnames = FALSE)
free_gen_happy
## $`Outlier Observations`
## Freedom Generosity
## 102 0.07699 0.00000
## 112 0.00000 0.17922
## 129 0.44017 0.79588
## 156 0.15684 0.47179
##
## $`Location of Outlier`
## [1] 102 112 129 156
##
## $`Outlier Probability`
## [1] 1.00 0.96 1.00 1.00
##
## $`Scatter plot`

#Visualizing outliers with knn like this is fun, and you can try different distance metrics: Euclidean before, Manhattan this time.
#Method 2: get.knn
sc_free_gen <- as.data.frame(scale(free_gen)) #scaling is a must when working with distances (scale() returns a matrix, hence the as.data.frame wrapper)
g_knn_happy <- get.knn(data = sc_free_gen, k = 5) #usage: dataknn <- get.knn(data, k = )
names(g_knn_happy) #returns nn.index and nn.dist.
## [1] "nn.index" "nn.dist"
head(g_knn_happy$nn.dist, 3)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.2830544 0.3019585 0.3546693 0.3563672 0.3681585
## [2,] 0.1492079 0.1744009 0.1935559 0.2458342 0.3135681
## [3,] 0.1372279 0.1420937 0.1812864 0.1857843 0.3681585
# nn.dist holds the distances: rows are observations, columns the k nearest neighbors.
# [1,1] is the distance from the first observation to its nearest neighbor; [1,2] is its distance to the second-nearest.
free_gen$score <- rowMeans(g_knn_happy$nn.dist)
g_knn_happy_score <- rowMeans(g_knn_happy$nn.dist)
#average each observation's distances to its k neighbors.
which.max(g_knn_happy_score)
## [1] 129
#then find the observation with the largest average distance. There's our outlier!
plot(Freedom ~ Generosity, cex = sqrt(score), free_gen, pch =20)

#visualize by drawing the far-off points bigger!
#Now, doing the same on the card data,,, is tedious, and the dimensionality is far higher. (Note tr still contains the unscaled time and amount columns plus the class label, which dominate these distances; tr[, 4:31] would be cleaner.)
g_knn_tr <- get.knn(data = tr, k = 5)
knn_tr_score <- rowMeans(g_knn_tr$nn.dist)
which.max(knn_tr_score)
## [1] 57103
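#do the top-scoring rows line up with actual frauds? a quick check (10 is an arbitrary cutoff):
top10 <- order(knn_tr_score, decreasing = TRUE)[1:10]
tr$class[top10] #1 = fraud, 0 = not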
##5. LOF (local outlier factor): also accounts for local density (it compares the density around point p with the average density around its k neighbors; the denser the neighbors are relative to p, the higher the LOF)
#https://godongyoung.github.io/%EB%A8%B8%EC%8B%A0%EB%9F%AC%EB%8B%9D/2019/03/11/Local-Outlier-Factor(LOF).html
#https://jayhey.github.io/novelty%20detection/2017/11/10/Novelty_detection_LOF/
#https://m.blog.naver.com/PostView.nhn?blogId=wjddudwo209&logNo=220309587111&proxyReferer=https:%2F%2Fwww.google.com%2F
#https://buillee.tistory.com/93
library(dbscan)
library(DMwR)
## Warning: package 'DMwR' was built under R version 3.6.1
## Loading required package: lattice
## Loading required package: grid
## Registered S3 method overwritten by 'xts':
## method from
## as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'DMwR'
## The following object is masked from 'package:dbscan':
##
## kNN
#usage: lof(scale(data), k = )  (dbscan)
#usage: lofactor(data, k = )  (DMwR)
#an LOF value above 1 marks a point as a possible outlier.
lof_happy <- lof(sc_free_gen, k = 5)
plot(density(lof_happy), main = "distr of out") #plot the density of the LOF scores

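#given the >1 rule of thumb above, a quick count of how many points exceed it:
sum(lof_happy > 1)
mean(lof_happy > 1) #as a share of all countries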
#pick out just the top 3
top3outlier <- order(lof_happy, decreasing = T)[1:3]
top3outlier
## [1] 129 102 81
#visualize outliers with plots
n <- nrow(sc_free_gen)
labels <- 1:n
labels[-top3outlier] <- "."
biplot(prcomp(sc_free_gen), cex = .98, xlabs = labels)

pch <- rep(".", n)
pch[top3outlier] <- "+"
col <- rep("black", n)
col[top3outlier] <- "red"
pairs(sc_free_gen, pch = pch, col = col)

#compare with knn.... huh, it finds the same point
# Calculate and append kNN distance as a new column
sc_free_gen_nn <- get.knn(sc_free_gen, k = 5)
sc_free_gen$score_knn <- rowMeans(sc_free_gen_nn$nn.dist)
# Calculate and append LOF as a new column (on the original two columns only, so score_knn doesn't leak into the distances)
sc_free_gen$score_lof <- lof(sc_free_gen[, c("Freedom", "Generosity")], k = 10)
# Find the row location of highest kNN
which.max(sc_free_gen$score_knn)
## [1] 129
# Find the row location of highest LOF
which.max(sc_free_gen$score_lof)
## [1] 129
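#beyond the single maximum, a quick sketch for comparing the two rankings side by side:
head(order(sc_free_gen$score_knn, decreasing = TRUE)) #top rows by mean kNN distance
head(order(sc_free_gen$score_lof, decreasing = TRUE)) #top rows by LOF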
##6.isolation tree
#https://donghwa-kim.github.io/iforest.html
#https://www.kaggle.com/norealityshows/outlier-detection-with-isolation-forest-in-r
# #datacamp version
# #install.packages("remotes")
# remotes::install_github("Zelazny7/isofor")
# library(isofor)
# #itree <- iForest(data, nt = #)
# #predict(itree, newdata = )
#
# # Build an isolation tree
# sc_free_gen_tree <- iForest(sc_free_gen, nt = 1)
# # Create isolation score
# sc_free_gen$tree_score <- predict(sc_free_gen_tree, newdata = sc_free_gen)
# # Histogram plot of the scores
# hist(sc_free_gen$tree_score, breaks = 40)
# # Fit isolation forest
# sc_free_gen_forest <- iForest(sc_free_gen, nt = 100, phi = 200)
# # Create isolation score from forest
# sc_free_gen_score <- predict(sc_free_gen_forest, newdata = sc_free_gen)
# # Append score to the happy data
# sc_free_gen$score <- sc_free_gen_score
# contourplot(score ~ Freedom + Generosity, data = sc_free_gen, region = TRUE)
#
# #the version I saw on Kaggle... which doesn't run... why.
# #install.packages("solitude")
# #library(solitude)
#
# #first, prepare a different dataset
# econ_free_hs <- happy_2015[,c("Economy..GDP.per.Capita.", "Freedom", "Happiness.Score")]
# econ_free_hs <- as.data.frame(scale(econ_free_hs))
# econ_free_hs %>% ggplot(aes(x = Happiness.Score, y = Economy..GDP.per.Capita.)) + geom_point()
#
# #initiate an iso forest
# iso <- isolationForest$new()
# econ_free_hsX <- econ_free_hs[,setdiff(colnames(econ_free_hs), "econ_free_hs")] #note: this setdiff drops nothing, since no column is named "econ_free_hs"
# # iso$fit(econ_free_hsX) #errors out. Sigh.
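#Since library(isofor) did load earlier, a minimal sketch of the datacamp version on the
#two scaled happiness columns (nt and phi are illustrative values; phi is kept below the 158-row count):
iso_dat <- sc_free_gen[, c("Freedom", "Generosity")]
iso_forest <- iForest(iso_dat, nt = 100, phi = 100) #100 trees, each grown on a 100-point subsample
iso_score <- predict(iso_forest, newdata = iso_dat) #higher score = easier to isolate
which.max(iso_score) #does it also point at row 129?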
##7. What about categorical data? Gower distance (distance-based)
#https://www.rdocumentation.org/packages/StatMatch/versions/1.3.0/topics/gower.dist
#https://www.youtube.com/watch?v=0qp7p98Su6U
#https://www.rdocumentation.org/packages/cluster/versions/2.1.0/topics/daisy
#hamming distance
library(cluster)
# usage:
# daisy(data, metric = "gower")
#need to prepare the data again
glimpse(happy_2015)
## Rows: 158
## Columns: 12
## $ Country <fct> Switzerland, Iceland, Denmark, N...
## $ Region <fct> Western Europe, Western Europe, ...
## $ Happiness.Rank <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1...
## $ Happiness.Score <dbl> 7.587, 7.561, 7.527, 7.522, 7.42...
## $ Standard.Error <dbl> 0.03411, 0.04884, 0.03328, 0.038...
## $ Economy..GDP.per.Capita. <dbl> 1.39651, 1.30232, 1.32548, 1.459...
## $ Family <dbl> 1.34951, 1.40223, 1.36058, 1.330...
## $ Health..Life.Expectancy. <dbl> 0.94143, 0.94784, 0.87464, 0.885...
## $ Freedom <dbl> 0.66557, 0.62877, 0.64938, 0.669...
## $ Trust..Government.Corruption. <dbl> 0.41978, 0.14145, 0.48357, 0.365...
## $ Generosity <dbl> 0.29678, 0.43630, 0.34139, 0.346...
## $ Dystopia.Residual <dbl> 2.51738, 2.70201, 2.49204, 2.465...
happy_four <- happy_2015 %>% select(Country, Region, Happiness.Rank, Happiness.Score)
happy_four$Happiness.Rank <- as.numeric(happy_four$Happiness.Rank)
daisy_happy <- as.matrix(daisy(happy_four, metric = "gower"))
#daisy returns a dissimilarity (distance) matrix;
#k-means can't be run on a distance matrix,
#but k-medoids (pam) and hierarchical clustering can, as sketched below.
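#a minimal sketch of both alternatives on the Gower matrix (k = 3 is an arbitrary choice):
pam_happy <- pam(as.dist(daisy_happy), k = 3) #k-medoids straight on the dissimilarities
hc_happy <- hclust(as.dist(daisy_happy), method = "average") #agglomerative clustering
plot(hc_happy, labels = FALSE)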
#Approach families covered:
#statistical dispersion
#distance
#density
#tree
#per-variable similarity (Gower): nominal: 1 if p = q, else 0 / ordinal: 1 - |p-q|/(n-1) / interval: 1 - |p-q|/range
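#A sanity-check sketch of the formula above: hand-compute the Gower distance between
#rows 1 and 2 of happy_four and compare with the daisy() result (dissimilarity = 1 - similarity).
i <- 1; j <- 2
rng_rank <- diff(range(happy_four$Happiness.Rank))
rng_score <- diff(range(happy_four$Happiness.Score))
d_manual <- mean(c(
  happy_four$Country[i] != happy_four$Country[j], #nominal: 0/1
  happy_four$Region[i] != happy_four$Region[j], #nominal: 0/1
  abs(happy_four$Happiness.Rank[i] - happy_four$Happiness.Rank[j]) / rng_rank, #interval
  abs(happy_four$Happiness.Score[i] - happy_four$Happiness.Score[j]) / rng_score #interval
))
d_manual
daisy_happy[1, 2] #should agree with the manual value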