#data
raw <- read.csv("creditcard.csv")
str(raw)
## 'data.frame': 284807 obs. of 31 variables:
## $ Time : num 0 0 1 1 2 2 4 7 7 9 ...
## $ V1 : num -1.36 1.192 -1.358 -0.966 -1.158 ...
## $ V2 : num -0.0728 0.2662 -1.3402 -0.1852 0.8777 ...
## $ V3 : num 2.536 0.166 1.773 1.793 1.549 ...
## $ V4 : num 1.378 0.448 0.38 -0.863 0.403 ...
## $ V5 : num -0.3383 0.06 -0.5032 -0.0103 -0.4072 ...
## $ V6 : num 0.4624 -0.0824 1.8005 1.2472 0.0959 ...
## $ V7 : num 0.2396 -0.0788 0.7915 0.2376 0.5929 ...
## $ V8 : num 0.0987 0.0851 0.2477 0.3774 -0.2705 ...
## $ V9 : num 0.364 -0.255 -1.515 -1.387 0.818 ...
## $ V10 : num 0.0908 -0.167 0.2076 -0.055 0.7531 ...
## $ V11 : num -0.552 1.613 0.625 -0.226 -0.823 ...
## $ V12 : num -0.6178 1.0652 0.0661 0.1782 0.5382 ...
## $ V13 : num -0.991 0.489 0.717 0.508 1.346 ...
## $ V14 : num -0.311 -0.144 -0.166 -0.288 -1.12 ...
## $ V15 : num 1.468 0.636 2.346 -0.631 0.175 ...
## $ V16 : num -0.47 0.464 -2.89 -1.06 -0.451 ...
## $ V17 : num 0.208 -0.115 1.11 -0.684 -0.237 ...
## $ V18 : num 0.0258 -0.1834 -0.1214 1.9658 -0.0382 ...
## $ V19 : num 0.404 -0.146 -2.262 -1.233 0.803 ...
## $ V20 : num 0.2514 -0.0691 0.525 -0.208 0.4085 ...
## $ V21 : num -0.01831 -0.22578 0.248 -0.1083 -0.00943 ...
## $ V22 : num 0.27784 -0.63867 0.77168 0.00527 0.79828 ...
## $ V23 : num -0.11 0.101 0.909 -0.19 -0.137 ...
## $ V24 : num 0.0669 -0.3398 -0.6893 -1.1756 0.1413 ...
## $ V25 : num 0.129 0.167 -0.328 0.647 -0.206 ...
## $ V26 : num -0.189 0.126 -0.139 -0.222 0.502 ...
## $ V27 : num 0.13356 -0.00898 -0.05535 0.06272 0.21942 ...
## $ V28 : num -0.0211 0.0147 -0.0598 0.0615 0.2152 ...
## $ Amount: num 149.62 2.69 378.66 123.5 69.99 ...
## $ Class : int 0 0 0 0 0 0 0 0 0 0 ...
summary(raw)
## Time V1 V2
## Min. : 0 Min. :-56.40751 Min. :-72.71573
## 1st Qu.: 54202 1st Qu.: -0.92037 1st Qu.: -0.59855
## Median : 84692 Median : 0.01811 Median : 0.06549
## Mean : 94814 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.:139321 3rd Qu.: 1.31564 3rd Qu.: 0.80372
## Max. :172792 Max. : 2.45493 Max. : 22.05773
## V3 V4 V5
## Min. :-48.3256 Min. :-5.68317 Min. :-113.74331
## 1st Qu.: -0.8904 1st Qu.:-0.84864 1st Qu.: -0.69160
## Median : 0.1799 Median :-0.01985 Median : -0.05434
## Mean : 0.0000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 1.0272 3rd Qu.: 0.74334 3rd Qu.: 0.61193
## Max. : 9.3826 Max. :16.87534 Max. : 34.80167
## V6 V7 V8
## Min. :-26.1605 Min. :-43.5572 Min. :-73.21672
## 1st Qu.: -0.7683 1st Qu.: -0.5541 1st Qu.: -0.20863
## Median : -0.2742 Median : 0.0401 Median : 0.02236
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.3986 3rd Qu.: 0.5704 3rd Qu.: 0.32735
## Max. : 73.3016 Max. :120.5895 Max. : 20.00721
## V9 V10 V11
## Min. :-13.43407 Min. :-24.58826 Min. :-4.79747
## 1st Qu.: -0.64310 1st Qu.: -0.53543 1st Qu.:-0.76249
## Median : -0.05143 Median : -0.09292 Median :-0.03276
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.59714 3rd Qu.: 0.45392 3rd Qu.: 0.73959
## Max. : 15.59500 Max. : 23.74514 Max. :12.01891
## V12 V13 V14
## Min. :-18.6837 Min. :-5.79188 Min. :-19.2143
## 1st Qu.: -0.4056 1st Qu.:-0.64854 1st Qu.: -0.4256
## Median : 0.1400 Median :-0.01357 Median : 0.0506
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.6182 3rd Qu.: 0.66251 3rd Qu.: 0.4931
## Max. : 7.8484 Max. : 7.12688 Max. : 10.5268
## V15 V16 V17
## Min. :-4.49894 Min. :-14.12985 Min. :-25.16280
## 1st Qu.:-0.58288 1st Qu.: -0.46804 1st Qu.: -0.48375
## Median : 0.04807 Median : 0.06641 Median : -0.06568
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.64882 3rd Qu.: 0.52330 3rd Qu.: 0.39968
## Max. : 8.87774 Max. : 17.31511 Max. : 9.25353
## V18 V19 V20
## Min. :-9.498746 Min. :-7.213527 Min. :-54.49772
## 1st Qu.:-0.498850 1st Qu.:-0.456299 1st Qu.: -0.21172
## Median :-0.003636 Median : 0.003735 Median : -0.06248
## Mean : 0.000000 Mean : 0.000000 Mean : 0.00000
## 3rd Qu.: 0.500807 3rd Qu.: 0.458949 3rd Qu.: 0.13304
## Max. : 5.041069 Max. : 5.591971 Max. : 39.42090
## V21 V22 V23
## Min. :-34.83038 Min. :-10.933144 Min. :-44.80774
## 1st Qu.: -0.22839 1st Qu.: -0.542350 1st Qu.: -0.16185
## Median : -0.02945 Median : 0.006782 Median : -0.01119
## Mean : 0.00000 Mean : 0.000000 Mean : 0.00000
## 3rd Qu.: 0.18638 3rd Qu.: 0.528554 3rd Qu.: 0.14764
## Max. : 27.20284 Max. : 10.503090 Max. : 22.52841
## V24 V25 V26
## Min. :-2.83663 Min. :-10.29540 Min. :-2.60455
## 1st Qu.:-0.35459 1st Qu.: -0.31715 1st Qu.:-0.32698
## Median : 0.04098 Median : 0.01659 Median :-0.05214
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.43953 3rd Qu.: 0.35072 3rd Qu.: 0.24095
## Max. : 4.58455 Max. : 7.51959 Max. : 3.51735
## V27 V28 Amount
## Min. :-22.565679 Min. :-15.43008 Min. : 0.00
## 1st Qu.: -0.070840 1st Qu.: -0.05296 1st Qu.: 5.60
## Median : 0.001342 Median : 0.01124 Median : 22.00
## Mean : 0.000000 Mean : 0.00000 Mean : 88.35
## 3rd Qu.: 0.091045 3rd Qu.: 0.07828 3rd Qu.: 77.17
## Max. : 31.612198 Max. : 33.84781 Max. :25691.16
## Class
## Min. :0.000000
## 1st Qu.:0.000000
## Median :0.000000
## Mean :0.001728
## 3rd Qu.:0.000000
## Max. :1.000000
names(raw) <- tolower(names(raw))
#columns are now: time, v1~v28, amount, class (int)
summary(raw$time)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 54202 84692 94814 139321 172792
table(raw$class)
##
## 0 1
## 284315 492
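#to see the imbalance as a proportion (a quick check; roughly 0.17% of rows are fraud):
prop.table(table(raw$class))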
#package
#install.packages("dplyr")
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#install.packages("ggplot2")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.6.3
# library(plotly)
# library(patchwork)
library(outliers)
library(FNN)
## Warning: package 'FNN' was built under R version 3.6.1
library(dbscan)
## Warning: package 'dbscan' was built under R version 3.6.3
library(isofor)
library(cluster)
#install.packages("OutlierDetection")
library(OutlierDetection)
## Warning: package 'OutlierDetection' was built under R version 3.6.3
## Registered S3 method overwritten by 'spatstat':
## method from
## print.boxx cli
##1.boxplot
raw_long <- raw %>% gather(varia, val, starts_with("v")) #reshape to long format
raw_box <- raw_long %>% ggplot(aes(x = varia, y = val)) + geom_boxplot() #and draw the boxplot.
#Wait, aren't v1~v28 possibly on different scales?
sc_raw <- as.data.frame(scale(raw[,c(2:29)])) #just scale() them. There are other ways to normalize/standardize, but scale() is the most convenient.
sc_raw <- cbind(raw[,c(1, 30:31)], sc_raw) #then bind the remaining variables back on.
sc_raw_long <- sc_raw %>% gather(varia, val, starts_with("v")) #and reshape to long again
sc_raw_long$varia <- as.factor(sc_raw_long$varia)
#sc_raw_box <- sc_raw_long %>% ggplot(aes(x = varia, y = val)) + geom_boxplot() #plot it again,,, looks no different from before???
#sc_raw_box
#Also, drawing this takes a long time: the data was 284807 rows to begin with, and we just multiplied that by 28.
sc_raw_long %>% ggplot(aes(x = varia, y = val)) +
geom_boxplot(fill='slategrey',color='darkslategrey',width=0.3, outlier.colour = "red", outlier.alpha = 0.3) #add some color while we're at it

#why isn't patchwork working?
#raw_box + sc_raw_box
#Not sure finding outliers with boxplots is all that meaningful for this dataset, though.
##Hold on, reading fraud as 0 vs 1 is awkward. Let's relabel.
sc_raw_long$class <- factor(sc_raw_long$class, levels = c("0", "1"), labels = c("NF", "FRA"))
##Also, let's sample the data down so the training split is about 1/5 of it
set.seed(123)
idx <- sample(size = NROW(sc_raw), x = c("tr", "vl", "te"), replace= T, prob = c(1,2,2))
tr <- sc_raw[idx == 'tr', ]
te <- sc_raw[idx == 'te', ]
vl <- sc_raw[idx == 'vl', ]
NROW(tr) #down to 57218 rows.
## [1] 57218
tr_long <- tr %>% gather(varia, val, starts_with("v")) #to long format!
##2. Grubbs test: assumes a normal distribution. Look at a histogram of the variable first, check whether it is roughly normal, then run the test; you also need to be able to interpret the result.
#https://statkclee.github.io/model/model-anomaly.html
#https://sosal.kr/945
#https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h1.htm
#https://www.youtube.com/watch?v=ovIoVApeH_Q
#https://www.youtube.com/watch?v=rcYN89Trt4Y
#let's look at just v24: histogram first.
tr_long_v24 <- tr_long %>% filter(varia == "v24")
tr_long_v24 %>% ggplot(aes(x = val)) + geom_histogram() #hmm, that skewness and kurtosis,,, let's call it roughly normal
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

grubbs.test(tr$v24)
##
## Grubbs test for one outlier
##
## data: tr$v24
## G = 6.64222, U = 0.99923, p-value = 8.765e-07
## alternative hypothesis: highest value 6.6283560145564 is an outlier
#The p-value above is for the test that the single highest value is an outlier (null hypothesis: no outlier in the data).
#But the test assumes normality and, by default (type = 10), only examines the most extreme value, so it's hard to put to real use here.
#grubbs.test(x, type = 10, opposite = FALSE, two.sided = FALSE)
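#That said, grubbs.test is not limited to the maximum. A quick sketch on the same column:
#opposite = TRUE tests the lowest value instead, and type = 11 tests both extremes at once.
grubbs.test(tr$v24, opposite = TRUE)
grubbs.test(tr$v24, type = 11)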
##3. AnomalyDetectionVec: for finding anomalies in a plain vector.
#http://research.sualab.com/introduction/review/2020/01/30/anomaly-detection-overview-1.html
#AnomalyDetectionVec(data, period = , direction = , plot = T/F)
#for example: river_anomalies <- AnomalyDetectionVec(x = river$nitrate, period = 12, direction = 'both', plot = T)
#what time-series data should we bring in? Let's grab the avocado data
raw_avo <- read.csv("avocado.csv")
names(raw_avo) <- tolower(names(raw_avo))
str(raw_avo)
## 'data.frame': 18249 obs. of 14 variables:
## $ x : int 0 1 2 3 4 5 6 7 8 9 ...
## $ date : Factor w/ 169 levels "2015-01-04","2015-01-11",..: 52 51 50 49 48 47 46 45 44 43 ...
## $ averageprice: num 1.33 1.35 0.93 1.08 1.28 1.26 0.99 0.98 1.02 1.07 ...
## $ total.volume: num 64237 54877 118220 78992 51040 ...
## $ x4046 : num 1037 674 795 1132 941 ...
## $ x4225 : num 54455 44639 109150 71976 43838 ...
## $ x4770 : num 48.2 58.3 130.5 72.6 75.8 ...
## $ total.bags : num 8697 9506 8145 5811 6184 ...
## $ small.bags : num 8604 9408 8042 5677 5986 ...
## $ large.bags : num 93.2 97.5 103.1 133.8 197.7 ...
## $ xlarge.bags : num 0 0 0 0 0 0 0 0 0 0 ...
## $ type : Factor w/ 2 levels "conventional",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ region : Factor w/ 54 levels "Albany","Atlanta",..: 1 1 1 1 1 1 1 1 1 1 ...
raw_avo_avprice <- raw_avo %>% select(x, date, averageprice)
raw_avo_avprice %>% ggplot(aes(as.Date(date), averageprice)) + geom_line() #date is a factor; convert it so the x axis is a real time axis

raw_avo_avprice_ts <- ts(na.omit(raw_avo_avprice$averageprice), frequency = 52) #rough: the file stacks 54 regions x 2 types, so this is not one clean weekly series, but fine for a demo
raw_avo_avprice_ts %>% ts.plot()

#Can't get the package to install. What's the problem,,,, oh well, moving on.
#AnomalyDetectionVec(x=raw_avo_avprice$averageprice, period = 52, direction = "both", plot = T)
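#For reference, a sketch of how the call would go if the GitHub-only AnomalyDetection
#package did install (it is not on CRAN; max_anoms = 0.02 is an illustrative choice):
#remotes::install_github("twitter/AnomalyDetection")
#library(AnomalyDetection)
#avo_anoms <- AnomalyDetectionVec(x = raw_avo_avprice$averageprice, max_anoms = 0.02,
#                                 period = 52, direction = "both", plot = TRUE)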
##4. Finally, KNN
#Method 1: the nn() function from the OutlierDetection package
#https://rdrr.io/cran/OutlierDetection/man/nn.html
# knn_tr <- nn(tr, k = 0.05 * nrow(tr), cutoff = 0.95, Method = "euclidean", rnames = FALSE, boottimes = 100)
#It errors out, saying it cannot allocate a 12.2 GB vector: nn() builds the full pairwise distance matrix.
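#One workaround before giving up (a sketch: the 2000-row subsample is an arbitrary size,
#and columns 4:31 are the scaled v1~v28 columns):
# tr_small <- tr[sample(nrow(tr), 2000), 4:31]
# knn_tr_small <- nn(tr_small, k = 0.05 * nrow(tr_small), cutoff = 0.95,
#                    Method = "euclidean", rnames = FALSE, boottimes = 100)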
#Or just try it on a smaller dataset
knn_te <- nn(iris[,1:4], k = 0.05*nrow(iris), cutoff = 0.95, Method = "euclidean", rnames = FALSE, boottimes =100)
knn_te
## $`Outlier Observations`
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 107 4.9 2.5 4.5 1.7
## 118 7.7 3.8 6.7 2.2
## 119 7.7 2.6 6.9 2.3
## 132 7.9 3.8 6.4 2.0
##
## $`Location of Outlier`
## [1] 107 118 119 132
##
## $`Outlier Probability`
## [1] 1 1 1 1
##
## $`3Dplot`
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
#Wow, it renders a 3D plotly plot? Fun.
#Let's try another dataset too! The 2015 world happiness data
happy_2015 <- read.csv("2015.csv")
str(happy_2015)
## 'data.frame': 158 obs. of 12 variables:
## $ Country : Factor w/ 158 levels "Afghanistan",..: 136 59 38 106 25 46 100 135 101 7 ...
## $ Region : Factor w/ 10 levels "Australia and New Zealand",..: 10 10 10 10 6 10 10 10 1 1 ...
## $ Happiness.Rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Happiness.Score : num 7.59 7.56 7.53 7.52 7.43 ...
## $ Standard.Error : num 0.0341 0.0488 0.0333 0.0388 0.0355 ...
## $ Economy..GDP.per.Capita. : num 1.4 1.3 1.33 1.46 1.33 ...
## $ Family : num 1.35 1.4 1.36 1.33 1.32 ...
## $ Health..Life.Expectancy. : num 0.941 0.948 0.875 0.885 0.906 ...
## $ Freedom : num 0.666 0.629 0.649 0.67 0.633 ...
## $ Trust..Government.Corruption.: num 0.42 0.141 0.484 0.365 0.33 ...
## $ Generosity : num 0.297 0.436 0.341 0.347 0.458 ...
## $ Dystopia.Residual : num 2.52 2.7 2.49 2.47 2.45 ...
free_gen <- happy_2015[,c("Freedom", "Generosity")]
free_gen_happy <- nn(free_gen , k = 0.05*nrow(free_gen), cutoff = 0.95, Method = "manhattan", rnames = FALSE)
free_gen_happy
## $`Outlier Observations`
## Freedom Generosity
## 102 0.07699 0.00000
## 112 0.00000 0.17922
## 129 0.44017 0.79588
## 156 0.15684 0.47179
##
## $`Location of Outlier`
## [1] 102 112 129 156
##
## $`Outlier Probability`
## [1] 1.00 0.96 1.00 1.00
##
## $`Scatter plot`

#Visualizing outliers with knn like this is fun, and you can try different distance metrics: Euclidean before, Manhattan this time.
#Method 2: get.knn
sc_free_gen <- as.data.frame(scale(free_gen)) #scaling is a must when working with distances (scale() returns a matrix, hence the as.data.frame wrapper)
g_knn_happy <- get.knn(data = sc_free_gen, k = 5) #usage: dataknn <- get.knn(data, k = )
names(g_knn_happy) #returns nn.index and nn.dist.
## [1] "nn.index" "nn.dist"
head(g_knn_happy$nn.dist, 3)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.2830544 0.3019585 0.3546693 0.3563672 0.3681585
## [2,] 0.1492079 0.1744009 0.1935559 0.2458342 0.3135681
## [3,] 0.1372279 0.1420937 0.1812864 0.1857843 0.3681585
# nn.dist holds the distances: rows are observations, columns the k nearest neighbors.
# [1,1] is the distance from the first observation to its nearest neighbor; [1,2] is its distance to the second-nearest.
free_gen$score <- rowMeans(g_knn_happy$nn.dist)
g_knn_happy_score <- rowMeans(g_knn_happy$nn.dist)
#average each observation's distances to its k neighbors.
which.max(g_knn_happy_score)
## [1] 129
#then find the observation with the largest average distance. There's our outlier!
plot(Freedom ~ Generosity, cex = sqrt(score), free_gen, pch =20)

#visualize by drawing the far-off points bigger!
#Now, doing the same on the card data,,, is tedious, and the dimensionality is far higher. (Note tr still contains the unscaled time and amount columns plus the class label, which dominate these distances; tr[, 4:31] would be cleaner.)
g_knn_tr <- get.knn(data = tr, k = 5)
knn_tr_score <- rowMeans(g_knn_tr$nn.dist)
which.max(knn_tr_score)
## [1] 57103
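#do the top-scoring rows line up with actual frauds? a quick check (10 is an arbitrary cutoff):
top10 <- order(knn_tr_score, decreasing = TRUE)[1:10]
tr$class[top10] #1 = fraud, 0 = not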
##5. LOF (local outlier factor): also accounts for local density (it compares the density around point p with the average density around its k neighbors; the denser the neighbors are relative to p, the higher the LOF)
#https://godongyoung.github.io/%EB%A8%B8%EC%8B%A0%EB%9F%AC%EB%8B%9D/2019/03/11/Local-Outlier-Factor(LOF).html
#https://jayhey.github.io/novelty%20detection/2017/11/10/Novelty_detection_LOF/
#https://m.blog.naver.com/PostView.nhn?blogId=wjddudwo209&logNo=220309587111&proxyReferer=https:%2F%2Fwww.google.com%2F
#https://buillee.tistory.com/93
library(dbscan)
library(DMwR)
## Warning: package 'DMwR' was built under R version 3.6.1
## Loading required package: lattice
## Loading required package: grid
## Registered S3 method overwritten by 'xts':
## method from
## as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'DMwR'
## The following object is masked from 'package:dbscan':
##
## kNN
#usage: lof(scale(data), k = )  (dbscan)
#usage: lofactor(data, k = )  (DMwR)
#an LOF value above 1 marks a point as a possible outlier.
lof_happy <- lof(sc_free_gen, k = 5)
plot(density(lof_happy), main = "distr of out") #plot the density of the LOF scores

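#given the >1 rule of thumb above, a quick count of how many points exceed it:
sum(lof_happy > 1)
mean(lof_happy > 1) #as a share of all countries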
#pick out just the top 3
top3outlier <- order(lof_happy, decreasing = T)[1:3]
top3outlier
## [1] 129 102 81
#visualize outliers with plots
n <- nrow(sc_free_gen)
labels <- 1:n
labels[-top3outlier] <- "."
biplot(prcomp(sc_free_gen), cex = .98, xlabs = labels)

pch <- rep(".", n)
pch[top3outlier] <- "+"
col <- rep("black", n)
col[top3outlier] <- "red"
pairs(sc_free_gen, pch = pch, col = col)

#compare with knn.... huh, it finds the same point
# Calculate and append kNN distance as a new column
sc_free_gen_nn <- get.knn(sc_free_gen, k = 5)
sc_free_gen$score_knn <- rowMeans(sc_free_gen_nn$nn.dist)
# Calculate and append LOF as a new column (on the original two columns only, so score_knn doesn't leak into the distances)
sc_free_gen$score_lof <- lof(sc_free_gen[, c("Freedom", "Generosity")], k = 10)
# Find the row location of highest kNN
which.max(sc_free_gen$score_knn)
## [1] 129
# Find the row location of highest LOF
which.max(sc_free_gen$score_lof)
## [1] 129
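#beyond the single maximum, a quick sketch for comparing the two rankings side by side:
head(order(sc_free_gen$score_knn, decreasing = TRUE)) #top rows by mean kNN distance
head(order(sc_free_gen$score_lof, decreasing = TRUE)) #top rows by LOF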
##6.isolation tree
#https://donghwa-kim.github.io/iforest.html
#https://www.kaggle.com/norealityshows/outlier-detection-with-isolation-forest-in-r
# #datacamp version
# #install.packages("remotes")
# remotes::install_github("Zelazny7/isofor")
# library(isofor)
# #itree <- iForest(data, nt = #)
# #predict(itree, newdata = )
#
# # Build an isolation tree
# sc_free_gen_tree <- iForest(sc_free_gen, nt = 1)
# # Create isolation score
# sc_free_gen$tree_score <- predict(sc_free_gen_tree, newdata = sc_free_gen)
# # Histogram plot of the scores
# hist(sc_free_gen$tree_score, breaks = 40)
# # Fit isolation forest
# sc_free_gen_forest <- iForest(sc_free_gen, nt = 100, phi = 200)
# # Create isolation score from forest
# sc_free_gen_score <- predict(sc_free_gen_forest, newdata = sc_free_gen)
# # Append score to the happy data
# sc_free_gen$score <- sc_free_gen_score
# contourplot(score ~ Freedom + Generosity, data = sc_free_gen, region = TRUE)
#
# #the version I saw on Kaggle... which doesn't run... why.
# #install.packages("solitude")
# #library(solitude)
#
# #first, prepare a different dataset
# econ_free_hs <- happy_2015[,c("Economy..GDP.per.Capita.", "Freedom", "Happiness.Score")]
# econ_free_hs <- as.data.frame(scale(econ_free_hs))
# econ_free_hs %>% ggplot(aes(x = Happiness.Score, y = Economy..GDP.per.Capita.)) + geom_point()
#
# #initiate an iso forest
# iso <- isolationForest$new()
# econ_free_hsX <- econ_free_hs[,setdiff(colnames(econ_free_hs), "econ_free_hs")] #note: this setdiff drops nothing, since no column is named "econ_free_hs"
# # iso$fit(econ_free_hsX) #errors out. Sigh.
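#Since library(isofor) did load earlier, a minimal sketch of the datacamp version on the
#two scaled happiness columns (nt and phi are illustrative values; phi is kept below the 158-row count):
iso_dat <- sc_free_gen[, c("Freedom", "Generosity")]
iso_forest <- iForest(iso_dat, nt = 100, phi = 100) #100 trees, each grown on a 100-point subsample
iso_score <- predict(iso_forest, newdata = iso_dat) #higher score = easier to isolate
which.max(iso_score) #does it also point at row 129?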
##7. What about categorical data? Gower distance (distance-based)
#https://www.rdocumentation.org/packages/StatMatch/versions/1.3.0/topics/gower.dist
#https://www.youtube.com/watch?v=0qp7p98Su6U
#https://www.rdocumentation.org/packages/cluster/versions/2.1.0/topics/daisy
#hamming distance
library(cluster)
# usage:
# daisy(data, metric = "gower")
#need to prepare the data again
glimpse(happy_2015)
## Rows: 158
## Columns: 12
## $ Country <fct> Switzerland, Iceland, Denmark, N...
## $ Region <fct> Western Europe, Western Europe, ...
## $ Happiness.Rank <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1...
## $ Happiness.Score <dbl> 7.587, 7.561, 7.527, 7.522, 7.42...
## $ Standard.Error <dbl> 0.03411, 0.04884, 0.03328, 0.038...
## $ Economy..GDP.per.Capita. <dbl> 1.39651, 1.30232, 1.32548, 1.459...
## $ Family <dbl> 1.34951, 1.40223, 1.36058, 1.330...
## $ Health..Life.Expectancy. <dbl> 0.94143, 0.94784, 0.87464, 0.885...
## $ Freedom <dbl> 0.66557, 0.62877, 0.64938, 0.669...
## $ Trust..Government.Corruption. <dbl> 0.41978, 0.14145, 0.48357, 0.365...
## $ Generosity <dbl> 0.29678, 0.43630, 0.34139, 0.346...
## $ Dystopia.Residual <dbl> 2.51738, 2.70201, 2.49204, 2.465...
happy_four <- happy_2015 %>% select(Country, Region, Happiness.Rank, Happiness.Score)
happy_four$Happiness.Rank <- as.numeric(happy_four$Happiness.Rank)
daisy_happy <- as.matrix(daisy(happy_four, metric = "gower"))
#daisy returns a dissimilarity (distance) matrix;
#k-means can't be run on a distance matrix,
#but k-medoids (pam) and hierarchical clustering can, as sketched below.
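#a minimal sketch of both alternatives on the Gower matrix (k = 3 is an arbitrary choice):
pam_happy <- pam(as.dist(daisy_happy), k = 3) #k-medoids straight on the dissimilarities
hc_happy <- hclust(as.dist(daisy_happy), method = "average") #agglomerative clustering
plot(hc_happy, labels = FALSE)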
#Approach families covered:
#statistical dispersion
#distance
#density
#tree
#per-variable similarity (Gower): nominal: 1 if p = q, else 0 / ordinal: 1 - |p-q|/(n-1) / interval: 1 - |p-q|/range
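#A sanity-check sketch of the formula above: hand-compute the Gower distance between
#rows 1 and 2 of happy_four and compare with the daisy() result (dissimilarity = 1 - similarity).
i <- 1; j <- 2
rng_rank <- diff(range(happy_four$Happiness.Rank))
rng_score <- diff(range(happy_four$Happiness.Score))
d_manual <- mean(c(
  happy_four$Country[i] != happy_four$Country[j], #nominal: 0/1
  happy_four$Region[i] != happy_four$Region[j], #nominal: 0/1
  abs(happy_four$Happiness.Rank[i] - happy_four$Happiness.Rank[j]) / rng_rank, #interval
  abs(happy_four$Happiness.Score[i] - happy_four$Happiness.Score[j]) / rng_score #interval
))
d_manual
daisy_happy[1, 2] #should agree with the manual value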