1. Handling Missing Values

Load Dataset

# Work on a copy of the built-in airquality data set.
data <- datasets::airquality
# Total number of NA cells over all columns.
sum(is.na(data))

## [1] 44

Terlihat bahwa dataset airquality mengandung 44 missing values

Detect Missing Value

library(mice)
# Tabulate the missing-data patterns of `data` with mice: one row per
# pattern, 1 = observed and 0 = missing for each variable.
md.pattern(data)

##     Wind Temp Month Day Solar.R Ozone   
## 111    1    1     1   1       1     1  0
## 35     1    1     1   1       1     0  1
## 5      1    1     1   1       0     1  1
## 2      1    1     1   1       0     0  2
##        0    0     0   0       7    37 44

Output di atas menunjukkan bahwa 111 object lengkap, 35 object hanya mengandung missing value pada variabel Ozone, 5 object hanya mengandung missing value pada variabel Solar.R, dan 2 object mengandung missing value pada kedua variabel. Secara keseluruhan terdapat 37 missing value pada Ozone dan 7 pada Solar.R (total 44).

# List, for every column, the row positions that hold NA.
# lapply() over the columns always returns a named list, whereas
# apply(..., 2, which) collapses to a matrix whenever every column happens
# to have the same number of NAs.
lapply(data, function(column) which(is.na(column)))

## $Ozone
##  [1]   5  10  25  26  27  32  33  34  35  36  37  39  42  43  45  46  52  53  54
## [20]  55  56  57  58  59  60  61  65  72  75  83  84 102 103 107 115 119 150
## 
## $Solar.R
## [1]  5  6 11 27 96 97 98
## 
## $Wind
## integer(0)
## 
## $Temp
## integer(0)
## 
## $Month
## integer(0)
## 
## $Day
## integer(0)

Output di atas menunjukkan object mana yang mengandung missing value.

# Rows of `data` that contain at least one missing value.
# Derived from the data instead of a hand-typed index list, so it stays
# correct if `data` changes; for airquality it selects the same 42 rows.
df <- data[!complete.cases(data), ]

library(ggplot2)
# Scatter plot of Wind against Temp, coloured by Ozone on a green-to-blue
# gradient. Rows whose Ozone is NA are drawn in red through `na.value`.
ggplot(data = data, mapping = aes(x = Wind, y = Temp, color = Ozone)) +
  geom_point(show.legend = TRUE) +
  scale_color_gradient(
    low = "green",
    high = "blue",
    na.value = "red",
    guide = "legend"
  ) +
  labs(x = "Wind", y = "Temperature", color = "Ozone") +
  theme_minimal() +
  theme(legend.position = "right")

Plot di atas menunjukkan sebaran object berwarna merah, yaitu object yang nilai Ozone-nya hilang (NA). Object lainnya diwarnai menurut gradasi nilai Ozone (hijau = rendah, biru = tinggi). Selanjutnya, object yang mengandung missing value akan diimputasi dengan metode KNN dengan k = 5.

KNN Imputation

library(caret)
library(dplyr)
library(RANN)
# Fit a k-nearest-neighbour imputation model (k = 5) on the six airquality
# columns; each missing value is replaced by the mean of its neighbours.
preProcValues <- preProcess(
  dplyr::select(data, Wind, Temp, Ozone, Solar.R, Month, Day),
  method = "knnImpute",
  k = 5,
  knnSummary = mean
)
# Apply the fitted model to `data`; the result comes back standardised.
imputasi <- predict(preProcValues, data, na.action = na.pass)
head(imputasi)

##         Ozone      Solar.R       Wind       Temp     Month       Day
## 1 -0.03423409  0.045176154 -0.7259482 -1.1497140 -1.407294 -1.670019
## 2 -0.18580489 -0.754304874 -0.5556388 -0.6214670 -1.407294 -1.557210
## 3 -0.91334473 -0.410083876  0.7500660 -0.4101682 -1.407294 -1.444401
## 4 -0.73145977  1.410956244  0.4378323 -1.6779609 -1.407294 -1.331592
## 5 -0.81027658 -0.221317522  1.2326091 -2.3118573 -1.407294 -1.218782
## 6 -0.42831817  0.007422883  1.4029185 -1.2553634 -1.407294 -1.105973

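Hasil imputasi masih dalam bentuk terstandarisasi, karena metode knnImpute pada preProcess otomatis melakukan centering dan scaling. Untuk mengembalikannya ke skala asli, setiap kolom perlu dikalikan dengan simpangan bakunya lalu ditambah dengan rata-ratanya.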

# Collect the per-column centring and scaling statistics stored by
# preProcess().
procNames <- data.frame(
  col = names(preProcValues$mean),
  mean = preProcValues$mean,
  sd = preProcValues$std
)
# Undo the standardisation column by column: x * sd + mean.
for (col_name in procNames$col) {
  imputasi[col_name] <-
    imputasi[col_name] * preProcValues$std[col_name] +
    preProcValues$mean[col_name]
}
imputasi

##     Ozone Solar.R Wind Temp Month Day
## 1    41.0   190.0  7.4   67     5   1
## 2    36.0   118.0  8.0   72     5   2
## 3    12.0   149.0 12.6   74     5   3
## 4    18.0   313.0 11.5   62     5   4
## 5    15.4   166.0 14.3   56     5   5
## 6    28.0   186.6 14.9   66     5   6
## 7    23.0   299.0  8.6   65     5   7
## 8    19.0    99.0 13.8   59     5   8
## 9     8.0    19.0 20.1   61     5   9
## 10   21.0   194.0  8.6   69     5  10
## 11    7.0   253.4  6.9   74     5  11
## 12   16.0   256.0  9.7   69     5  12
## 13   11.0   290.0  9.2   66     5  13
## 14   14.0   274.0 10.9   68     5  14
## 15   18.0    65.0 13.2   58     5  15
## 16   14.0   334.0 11.5   64     5  16
## 17   34.0   307.0 12.0   66     5  17
## 18    6.0    78.0 18.4   57     5  18
## 19   30.0   322.0 11.5   68     5  19
## 20   11.0    44.0  9.7   62     5  20
## 21    1.0     8.0  9.7   59     5  21
## 22   11.0   320.0 16.6   73     5  22
## 23    4.0    25.0  9.7   61     5  23
## 24   32.0    92.0 12.0   61     5  24
## 25   16.6    66.0 16.6   57     5  25
## 26   20.6   266.0 14.9   58     5  26
## 27   13.6    41.2  8.0   57     5  27
## 28   23.0    13.0 12.0   67     5  28
## 29   45.0   252.0 14.9   81     5  29
## 30  115.0   223.0  5.7   79     5  30
## 31   37.0   279.0  7.4   76     5  31
## 32   59.0   286.0  8.6   78     6   1
## 33   32.0   287.0  9.7   74     6   2
## 34   15.0   242.0 16.1   67     6   3
## 35   43.6   186.0  9.2   84     6   4
## 36   43.6   220.0  8.6   85     6   5
## 37   36.2   264.0 14.3   79     6   6
## 38   29.0   127.0  9.7   82     6   7
## 39   73.6   273.0  6.9   87     6   8
## 40   71.0   291.0 13.8   90     6   9
## 41   39.0   323.0 11.5   87     6  10
## 42   67.2   259.0 10.9   93     6  11
## 43   77.8   250.0  9.2   92     6  12
## 44   23.0   148.0  8.0   82     6  13
## 45   39.0   332.0 13.8   80     6  14
## 46   30.8   322.0 11.5   79     6  15
## 47   21.0   191.0 14.9   77     6  16
## 48   37.0   284.0 20.7   72     6  17
## 49   20.0    37.0  9.2   65     6  18
## 50   12.0   120.0 11.5   73     6  19
## 51   13.0   137.0 10.3   76     6  20
## 52   50.0   150.0  6.3   77     6  21
## 53   39.4    59.0  1.7   76     6  22
## 54   26.2    91.0  4.6   76     6  23
## 55   74.4   250.0  6.3   76     6  24
## 56   36.6   135.0  8.0   75     6  25
## 57   53.6   127.0  8.0   78     6  26
## 58   17.6    47.0 10.3   73     6  27
## 59   24.0    98.0 11.5   80     6  28
## 60   20.6    31.0 14.9   77     6  29
## 61   77.8   138.0  8.0   83     6  30
## 62  135.0   269.0  4.1   84     7   1
## 63   49.0   248.0  9.2   85     7   2
## 64   32.0   236.0  9.2   81     7   3
## 65   35.6   101.0 10.9   84     7   4
## 66   64.0   175.0  4.6   83     7   5
## 67   40.0   314.0 10.9   83     7   6
## 68   77.0   276.0  5.1   88     7   7
## 69   97.0   267.0  6.3   92     7   8
## 70   97.0   272.0  5.7   92     7   9
## 71   85.0   175.0  7.4   89     7  10
## 72   45.0   139.0  8.6   82     7  11
## 73   10.0   264.0 14.3   73     7  12
## 74   27.0   175.0 14.9   81     7  13
## 75   47.2   291.0 14.9   91     7  14
## 76    7.0    48.0 14.3   80     7  15
## 77   48.0   260.0  6.9   81     7  16
## 78   35.0   274.0 10.3   82     7  17
## 79   61.0   285.0  6.3   84     7  18
## 80   79.0   187.0  5.1   87     7  19
## 81   63.0   220.0 11.5   85     7  20
## 82   16.0     7.0  6.9   74     7  21
## 83   63.4   258.0  9.7   81     7  22
## 84   53.6   295.0 11.5   82     7  23
## 85   80.0   294.0  8.6   86     7  24
## 86  108.0   223.0  8.0   85     7  25
## 87   20.0    81.0  8.6   82     7  26
## 88   52.0    82.0 12.0   86     7  27
## 89   82.0   213.0  7.4   88     7  28
## 90   50.0   275.0  7.4   86     7  29
## 91   64.0   253.0  7.4   83     7  30
## 92   59.0   254.0  9.2   81     7  31
## 93   39.0    83.0  6.9   81     8   1
## 94    9.0    24.0 13.8   81     8   2
## 95   16.0    77.0  7.4   82     8   3
## 96   78.0   198.0  6.9   86     8   4
## 97   35.0   147.8  7.4   85     8   5
## 98   66.0   185.2  4.6   87     8   6
## 99  122.0   255.0  4.0   89     8   7
## 100  89.0   229.0 10.3   90     8   8
## 101 110.0   207.0  8.0   90     8   9
## 102  85.0   222.0  8.6   92     8  10
## 103  66.0   137.0 11.5   86     8  11
## 104  44.0   192.0 11.5   86     8  12
## 105  28.0   273.0 11.5   82     8  13
## 106  65.0   157.0  9.7   80     8  14
## 107  23.2    64.0 11.5   79     8  15
## 108  22.0    71.0 10.3   77     8  16
## 109  59.0    51.0  6.3   79     8  17
## 110  23.0   115.0  7.4   76     8  18
## 111  31.0   244.0 10.9   78     8  19
## 112  44.0   190.0 10.3   78     8  20
## 113  21.0   259.0 15.5   77     8  21
## 114   9.0    36.0 14.3   72     8  22
## 115  31.0   255.0 12.6   75     8  23
## 116  45.0   212.0  9.7   79     8  24
## 117 168.0   238.0  3.4   81     8  25
## 118  73.0   215.0  8.0   86     8  26
## 119  85.4   153.0  5.7   88     8  27
## 120  76.0   203.0  9.7   97     8  28
## 121 118.0   225.0  2.3   94     8  29
## 122  84.0   237.0  6.3   96     8  30
## 123  85.0   188.0  6.3   94     8  31
## 124  96.0   167.0  6.9   91     9   1
## 125  78.0   197.0  5.1   92     9   2
## 126  73.0   183.0  2.8   93     9   3
## 127  91.0   189.0  4.6   93     9   4
## 128  47.0    95.0  7.4   87     9   5
## 129  32.0    92.0 15.5   84     9   6
## 130  20.0   252.0 10.9   80     9   7
## 131  23.0   220.0 10.3   78     9   8
## 132  21.0   230.0 10.9   75     9   9
## 133  24.0   259.0  9.7   73     9  10
## 134  44.0   236.0 14.9   81     9  11
## 135  21.0   259.0 15.5   76     9  12
## 136  28.0   238.0  6.3   77     9  13
## 137   9.0    24.0 10.9   71     9  14
## 138  13.0   112.0 11.5   71     9  15
## 139  46.0   237.0  6.9   78     9  16
## 140  18.0   224.0 13.8   67     9  17
## 141  13.0    27.0 10.3   76     9  18
## 142  24.0   238.0 10.3   68     9  19
## 143  16.0   201.0  8.0   82     9  20
## 144  13.0   238.0 12.6   64     9  21
## 145  23.0    14.0  9.2   71     9  22
## 146  36.0   139.0 10.3   81     9  23
## 147   7.0    49.0 10.3   69     9  24
## 148  14.0    20.0 16.6   63     9  25
## 149  30.0   193.0  6.9   70     9  26
## 150  31.8   145.0 13.2   77     9  27
## 151  14.0   191.0 14.3   75     9  28
## 152  18.0   131.0  8.0   76     9  29
## 153  20.0   223.0 11.5   68     9  30

Catatan: terdapat script/syntax KNN imputation yang lebih sederhana. Lihat pada materi Decision Tree

# Count the NA cells left in the imputed data frame (0 means none remain).
sum(is.na(imputasi))

## [1] 0

Data frame terakhir yang terbentuk sudah tidak mengandung missing value. Imputasi berhasil.

df2 adalah dataframe dari object-object yang sebelumnya mengandung missing value.

# Imputed versions of the rows that originally had at least one NA.
# The rows are located from `data` itself rather than a hand-typed index
# list; `imputasi` keeps the row order of `data`, so the same logical mask
# applies.
df2 <- imputasi[!complete.cases(data), ]

Delete Object

# Listwise deletion: drop every row of `data` that has any NA.
clear <- na.omit(data)

Mean Concept Imputation

# Scatter plot of Wind against Temp, coloured by Month on a green-to-red
# gradient.
ggplot(data = data, mapping = aes(x = Wind, y = Temp, color = Month)) +
  geom_point(show.legend = TRUE) +
  scale_color_gradient(low = "green", high = "red", guide = "legend") +
  labs(
    x = "Wind",
    y = "Temperature",
    title = "Berdasarkan Bulan",
    color = "Month"
  ) +
  theme_minimal() +
  theme(legend.position = "right")

# Monthly means of Ozone and Solar.R, used as the imputation values.
# They are computed on `clear` (complete rows only), so a row that is
# missing just one of the two variables contributes to neither mean.
rata <- clear %>%
  group_by(Month) %>%
  summarise(
    rataOzone = mean(Ozone),
    rataSolar = mean(Solar.R)
  )
rata

## # A tibble: 5 x 3
##   Month rataOzone rataSolar
##   <int>     <dbl>     <dbl>
## 1     5      24.1      182.
## 2     6      29.4      184.
## 3     7      59.1      216.
## 4     8      60        173.
## 5     9      31.4      168.

# Mean-concept imputation: replace each missing Ozone / Solar.R value with
# the mean of that variable for the same Month (taken from `rata`).
imputasi2 <- data

# Row of `rata` that corresponds to each observation's month. Matching on
# the Month value avoids hard-coding the position of every month in `rata`.
month_row <- match(imputasi2$Month, rata$Month)

# Fill only the cells that are actually missing; observed values are kept.
ozone_missing <- is.na(imputasi2$Ozone)
solar_missing <- is.na(imputasi2$Solar.R)
imputasi2$Ozone[ozone_missing] <- rata$rataOzone[month_row[ozone_missing]]
imputasi2$Solar.R[solar_missing] <- rata$rataSolar[month_row[solar_missing]]

2. Handling Imbalance Data

Oversampling SMOTE

library(scutr)
# Build an imbalanced two-class sample from iris rows 90 to 150
# (11 versicolor and 50 virginica observations).
data2 <- iris[90:150, ]

# Drop the now-empty setosa level before plotting the class counts.
data2$Species <- droplevels(data2$Species)
plot(data2$Species)

# Oversample the minority class (versicolor) with SMOTE, asking for 35 rows.
coba <- oversample_smote(data2, "versicolor", "Species", 35)
nrow(coba)

## [1] 35

# Combine the oversampled minority class with the remaining class.
# oversample_smote() returns the whole oversampled class (the original
# versicolor rows plus synthetic ones), so binding it onto all of `data2`
# would duplicate the original versicolor rows; only the other class is
# taken from `data2`.
hasil_oversamp <- rbind(data2[data2$Species != "versicolor", ], coba)
plot(hasil_oversamp$Species)

# Sepal length against petal length before oversampling.
ggplot(
  data = data2,
  mapping = aes(x = Sepal.Length, y = Petal.Length, color = Species)
) +
  geom_point(show.legend = TRUE) +
  labs(x = "Sepal Length", y = "Petal Length", color = "Species")

# The same view after oversampling.
ggplot(
  data = hasil_oversamp,
  mapping = aes(x = Sepal.Length, y = Petal.Length, color = Species)
) +
  geom_point(show.legend = TRUE) +
  labs(x = "Sepal Length", y = "Petal Length", color = "Species")

Undersampling TOMEK LINKs

library(readr)
# Read the AB_NYC_2019 CSV file with readr.
# NOTE(review): the path is an absolute, machine-specific Windows location;
# the script only runs where the file exists at exactly this path.
data3 <- read_csv("D:/1. DATA MINING/AB_NYC_2019.csv")

## Rows: 48895 Columns: 16
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr   (5): name, host_name, neighbourhood_group, neighbourhood, room_type
## dbl  (10): id, host_id, latitude, longitude, price, minimum_nights, number_o...
## date  (1): last_review
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Keep the first 4000 listings and treat room_type as a categorical variable.
data3 <- data3[seq_len(4000), ]
data3$room_type <- as.factor(data3$room_type)
# Class counts per room type.
summary(data3$room_type)

## Entire home/apt    Private room     Shared room 
##            2422            1525              53

library(dplyr)
# Keep the columns used below and restrict the data to the two largest
# room types.
data3 <- data3 %>%
  dplyr::select(
    id, host_id, neighbourhood_group, neighbourhood,
    room_type, price, availability_365, minimum_nights
  ) %>%
  dplyr::filter(room_type %in% c("Private room", "Entire home/apt"))

# Remove the now-unused "Shared room" level before plotting the class counts.
data3$room_type <- droplevels(data3$room_type)
plot(data3$room_type)

Anggap saja kedua level pada factor room_type timpang (imbalanced).

library(scutr)

# Undersample the majority class ("Entire home/apt") towards 1700 rows using
# Tomek links.
# NOTE(review): `data3` still holds non-numeric columns; the distance
# computation presumably coerces them to NA, which would explain the
# coercion warning this call emits. Confirm before relying on the result.
undersamp <- undersample_tomek(
  data3,
  "Entire home/apt",
  "room_type",
  1700,
  tomek = "diff",
  force_m = TRUE
)

## Warning in dist(data[, -which(names(data) == cls_col)], method = dist_calc):
## NAs introduced by coercion

# Number of rows kept for the undersampled class (the request was 1700).
nrow(undersamp)

## [1] 1670

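Catatan: masih terdapat warning dan hasil undersampling tidak sesuai sasaran (diperoleh 1670 object, bukan 1700 seperti yang diminta). Warning tersebut kemungkinan muncul karena kolom non-numerik (misalnya neighbourhood_group dan neighbourhood) berubah menjadi NA saat jarak dihitung dengan dist().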

# The class that was not undersampled is kept unchanged.
private <- filter(data3, room_type == "Private room")

# Availability against minimum nights before undersampling.
# TODO(review): "APA YA JUDULNYA" is a placeholder title; pick a real one.
ggplot(
  data = data3,
  mapping = aes(x = availability_365, y = minimum_nights, color = room_type)
) +
  geom_point(show.legend = TRUE) +
  labs(
    x = "Availability",
    y = "Minimum Nights",
    title = "APA YA JUDULNYA",
    color = "room_type"
  )

# Combine the undersampled majority class with the untouched other class.
hasil_undersamp <- rbind(undersamp, private)
plot(hasil_undersamp$room_type)

# The same view after undersampling.
ggplot(
  data = hasil_undersamp,
  mapping = aes(x = availability_365, y = minimum_nights, color = room_type)
) +
  geom_point(show.legend = TRUE) +
  labs(
    x = "Availability",
    y = "Minimum Nights",
    title = "APA YA JUDULNYA",
    color = "room_type"
  )

Pertemuan Data Mining

212011525_Annisa Syifaulhaq

2023-03-11