# Work on a copy of the built-in airquality data set.
data <- airquality
# Total number of missing cells in the whole data frame.
sum(is.na(data))
## [1] 44
Terlihat bahwa dataset airquality mengandung 44 missing value.
library(mice)
# Tabulate which combinations of variables are missing together in `data`.
md.pattern(x = data)
## Wind Temp Month Day Solar.R Ozone
## 111 1 1 1 1 1 1 0
## 35 1 1 1 1 1 0 1
## 5 1 1 1 1 0 1 1
## 2 1 1 1 1 0 0 2
## 0 0 0 0 7 37 44
Output di atas menunjukkan bahwa 111 object lengkap, 35 object hanya mengandung missing value pada variabel Ozone, 5 object hanya mengandung missing value pada variabel Solar.R, dan 2 object mengandung missing value pada kedua variabel tersebut. Secara keseluruhan terdapat 37 missing value pada Ozone dan 7 pada Solar.R (total 44).
# Row indices of the missing values in each column.
# lapply() always returns a list named by column, whereas apply() would
# collapse the result to a matrix whenever every column happens to contain the
# same number of missing values.
lapply(data, function(kolom) which(is.na(kolom)))
## $Ozone
## [1] 5 10 25 26 27 32 33 34 35 36 37 39 42 43 45 46 52 53 54
## [20] 55 56 57 58 59 60 61 65 72 75 83 84 102 103 107 115 119 150
##
## $Solar.R
## [1] 5 6 11 27 96 97 98
##
## $Wind
## integer(0)
##
## $Temp
## integer(0)
##
## $Month
## integer(0)
##
## $Day
## integer(0)
Output di atas menunjukkan object mana yang mengandung missing value.
# Rows that contain at least one missing value. They are derived from the data
# itself instead of a hand-typed index list, which silently goes stale as soon
# as the data changes.
df <- data[!complete.cases(data), ]
library(ggplot2)
# Wind against temperature, coloured by Ozone; rows whose Ozone is NA are
# drawn in red through `na.value`.
ggplot(data, aes(x = Wind, y = Temp, color = Ozone)) +
  geom_point(show.legend = TRUE) +
  labs(x = "Wind", y = "Temperature", color = "Ozone") +
  scale_color_gradient(
    low = "green", high = "blue",
    na.value = "red", guide = "legend"
  ) +
  theme_minimal() +
  theme(legend.position = "right")
Pada plot di atas, titik merah adalah object yang nilai Ozone-nya hilang (missing value), sedangkan titik lainnya diwarnai menurut tingkatan nilai Ozone. Selanjutnya, object yang mengandung missing value akan diimputasi dengan metode KNN dengan k = 5.
library(caret)
library(dplyr)
library(RANN)
# Fit a KNN imputation pre-processor on the six columns (in this order):
# 5 neighbours, combined by their mean.
preProcValues <- preProcess(
  dplyr::select(data, Wind, Temp, Ozone, Solar.R, Month, Day),
  method = c("knnImpute"),
  k = 5,
  knnSummary = mean
)
# Apply the fitted pre-processor to `data` to fill in the missing values.
imputasi <- predict(preProcValues, data, na.action = na.pass)
# Preview the first rows of the imputed data.
head(imputasi)
## Ozone Solar.R Wind Temp Month Day
## 1 -0.03423409 0.045176154 -0.7259482 -1.1497140 -1.407294 -1.670019
## 2 -0.18580489 -0.754304874 -0.5556388 -0.6214670 -1.407294 -1.557210
## 3 -0.91334473 -0.410083876 0.7500660 -0.4101682 -1.407294 -1.444401
## 4 -0.73145977 1.410956244 0.4378323 -1.6779609 -1.407294 -1.331592
## 5 -0.81027658 -0.221317522 1.2326091 -2.3118573 -1.407294 -1.218782
## 6 -0.42831817 0.007422883 1.4029185 -1.2553634 -1.407294 -1.105973
Hasil imputasi masih dalam bentuk terstandarisasi. Untuk mengembalikannya ke skala asli, setiap kolom perlu ditransformasi balik: nilai terstandarisasi × simpangan baku + rata-rata.
# Lookup table of the per-column mean and standard deviation stored in the
# fitted pre-processor.
procNames <- data.frame(
  col = names(preProcValues$mean),
  mean = preProcValues$mean,
  sd = preProcValues$std
)
# Undo the standardisation column by column: value * sd + mean.
for (nama in procNames$col) {
  imputasi[[nama]] <- imputasi[[nama]] * preProcValues$std[[nama]] +
    preProcValues$mean[[nama]]
}
# Print the full back-transformed data frame.
imputasi
## Ozone Solar.R Wind Temp Month Day
## 1 41.0 190.0 7.4 67 5 1
## 2 36.0 118.0 8.0 72 5 2
## 3 12.0 149.0 12.6 74 5 3
## 4 18.0 313.0 11.5 62 5 4
## 5 15.4 166.0 14.3 56 5 5
## 6 28.0 186.6 14.9 66 5 6
## 7 23.0 299.0 8.6 65 5 7
## 8 19.0 99.0 13.8 59 5 8
## 9 8.0 19.0 20.1 61 5 9
## 10 21.0 194.0 8.6 69 5 10
## 11 7.0 253.4 6.9 74 5 11
## 12 16.0 256.0 9.7 69 5 12
## 13 11.0 290.0 9.2 66 5 13
## 14 14.0 274.0 10.9 68 5 14
## 15 18.0 65.0 13.2 58 5 15
## 16 14.0 334.0 11.5 64 5 16
## 17 34.0 307.0 12.0 66 5 17
## 18 6.0 78.0 18.4 57 5 18
## 19 30.0 322.0 11.5 68 5 19
## 20 11.0 44.0 9.7 62 5 20
## 21 1.0 8.0 9.7 59 5 21
## 22 11.0 320.0 16.6 73 5 22
## 23 4.0 25.0 9.7 61 5 23
## 24 32.0 92.0 12.0 61 5 24
## 25 16.6 66.0 16.6 57 5 25
## 26 20.6 266.0 14.9 58 5 26
## 27 13.6 41.2 8.0 57 5 27
## 28 23.0 13.0 12.0 67 5 28
## 29 45.0 252.0 14.9 81 5 29
## 30 115.0 223.0 5.7 79 5 30
## 31 37.0 279.0 7.4 76 5 31
## 32 59.0 286.0 8.6 78 6 1
## 33 32.0 287.0 9.7 74 6 2
## 34 15.0 242.0 16.1 67 6 3
## 35 43.6 186.0 9.2 84 6 4
## 36 43.6 220.0 8.6 85 6 5
## 37 36.2 264.0 14.3 79 6 6
## 38 29.0 127.0 9.7 82 6 7
## 39 73.6 273.0 6.9 87 6 8
## 40 71.0 291.0 13.8 90 6 9
## 41 39.0 323.0 11.5 87 6 10
## 42 67.2 259.0 10.9 93 6 11
## 43 77.8 250.0 9.2 92 6 12
## 44 23.0 148.0 8.0 82 6 13
## 45 39.0 332.0 13.8 80 6 14
## 46 30.8 322.0 11.5 79 6 15
## 47 21.0 191.0 14.9 77 6 16
## 48 37.0 284.0 20.7 72 6 17
## 49 20.0 37.0 9.2 65 6 18
## 50 12.0 120.0 11.5 73 6 19
## 51 13.0 137.0 10.3 76 6 20
## 52 50.0 150.0 6.3 77 6 21
## 53 39.4 59.0 1.7 76 6 22
## 54 26.2 91.0 4.6 76 6 23
## 55 74.4 250.0 6.3 76 6 24
## 56 36.6 135.0 8.0 75 6 25
## 57 53.6 127.0 8.0 78 6 26
## 58 17.6 47.0 10.3 73 6 27
## 59 24.0 98.0 11.5 80 6 28
## 60 20.6 31.0 14.9 77 6 29
## 61 77.8 138.0 8.0 83 6 30
## 62 135.0 269.0 4.1 84 7 1
## 63 49.0 248.0 9.2 85 7 2
## 64 32.0 236.0 9.2 81 7 3
## 65 35.6 101.0 10.9 84 7 4
## 66 64.0 175.0 4.6 83 7 5
## 67 40.0 314.0 10.9 83 7 6
## 68 77.0 276.0 5.1 88 7 7
## 69 97.0 267.0 6.3 92 7 8
## 70 97.0 272.0 5.7 92 7 9
## 71 85.0 175.0 7.4 89 7 10
## 72 45.0 139.0 8.6 82 7 11
## 73 10.0 264.0 14.3 73 7 12
## 74 27.0 175.0 14.9 81 7 13
## 75 47.2 291.0 14.9 91 7 14
## 76 7.0 48.0 14.3 80 7 15
## 77 48.0 260.0 6.9 81 7 16
## 78 35.0 274.0 10.3 82 7 17
## 79 61.0 285.0 6.3 84 7 18
## 80 79.0 187.0 5.1 87 7 19
## 81 63.0 220.0 11.5 85 7 20
## 82 16.0 7.0 6.9 74 7 21
## 83 63.4 258.0 9.7 81 7 22
## 84 53.6 295.0 11.5 82 7 23
## 85 80.0 294.0 8.6 86 7 24
## 86 108.0 223.0 8.0 85 7 25
## 87 20.0 81.0 8.6 82 7 26
## 88 52.0 82.0 12.0 86 7 27
## 89 82.0 213.0 7.4 88 7 28
## 90 50.0 275.0 7.4 86 7 29
## 91 64.0 253.0 7.4 83 7 30
## 92 59.0 254.0 9.2 81 7 31
## 93 39.0 83.0 6.9 81 8 1
## 94 9.0 24.0 13.8 81 8 2
## 95 16.0 77.0 7.4 82 8 3
## 96 78.0 198.0 6.9 86 8 4
## 97 35.0 147.8 7.4 85 8 5
## 98 66.0 185.2 4.6 87 8 6
## 99 122.0 255.0 4.0 89 8 7
## 100 89.0 229.0 10.3 90 8 8
## 101 110.0 207.0 8.0 90 8 9
## 102 85.0 222.0 8.6 92 8 10
## 103 66.0 137.0 11.5 86 8 11
## 104 44.0 192.0 11.5 86 8 12
## 105 28.0 273.0 11.5 82 8 13
## 106 65.0 157.0 9.7 80 8 14
## 107 23.2 64.0 11.5 79 8 15
## 108 22.0 71.0 10.3 77 8 16
## 109 59.0 51.0 6.3 79 8 17
## 110 23.0 115.0 7.4 76 8 18
## 111 31.0 244.0 10.9 78 8 19
## 112 44.0 190.0 10.3 78 8 20
## 113 21.0 259.0 15.5 77 8 21
## 114 9.0 36.0 14.3 72 8 22
## 115 31.0 255.0 12.6 75 8 23
## 116 45.0 212.0 9.7 79 8 24
## 117 168.0 238.0 3.4 81 8 25
## 118 73.0 215.0 8.0 86 8 26
## 119 85.4 153.0 5.7 88 8 27
## 120 76.0 203.0 9.7 97 8 28
## 121 118.0 225.0 2.3 94 8 29
## 122 84.0 237.0 6.3 96 8 30
## 123 85.0 188.0 6.3 94 8 31
## 124 96.0 167.0 6.9 91 9 1
## 125 78.0 197.0 5.1 92 9 2
## 126 73.0 183.0 2.8 93 9 3
## 127 91.0 189.0 4.6 93 9 4
## 128 47.0 95.0 7.4 87 9 5
## 129 32.0 92.0 15.5 84 9 6
## 130 20.0 252.0 10.9 80 9 7
## 131 23.0 220.0 10.3 78 9 8
## 132 21.0 230.0 10.9 75 9 9
## 133 24.0 259.0 9.7 73 9 10
## 134 44.0 236.0 14.9 81 9 11
## 135 21.0 259.0 15.5 76 9 12
## 136 28.0 238.0 6.3 77 9 13
## 137 9.0 24.0 10.9 71 9 14
## 138 13.0 112.0 11.5 71 9 15
## 139 46.0 237.0 6.9 78 9 16
## 140 18.0 224.0 13.8 67 9 17
## 141 13.0 27.0 10.3 76 9 18
## 142 24.0 238.0 10.3 68 9 19
## 143 16.0 201.0 8.0 82 9 20
## 144 13.0 238.0 12.6 64 9 21
## 145 23.0 14.0 9.2 71 9 22
## 146 36.0 139.0 10.3 81 9 23
## 147 7.0 49.0 10.3 69 9 24
## 148 14.0 20.0 16.6 63 9 25
## 149 30.0 193.0 6.9 70 9 26
## 150 31.8 145.0 13.2 77 9 27
## 151 14.0 191.0 14.3 75 9 28
## 152 18.0 131.0 8.0 76 9 29
## 153 20.0 223.0 11.5 68 9 30
Catatan: terdapat script/syntax KNN imputation yang lebih sederhana; lihat pada materi Decision Tree.
# Count the remaining missing cells column by column and add them up.
sum(vapply(imputasi, function(kolom) sum(is.na(kolom)), integer(1)))
## [1] 0
Data frame terakhir yang terbentuk sudah tidak mengandung missing value. Imputasi berhasil.
df2 adalah dataframe dari object-object yang sebelumnya mengandung missing value.
# Imputed rows that had at least one missing value in the raw data. The rows
# are selected from `data` itself rather than from a hand-typed index list.
df2 <- imputasi[!complete.cases(data), ]
# Complete rows only; they are the basis for the per-month means below.
clear <- na.omit(data)
# Wind against temperature, coloured by month.
ggplot(data, aes(x = Wind, y = Temp, color = Month)) +
  geom_point(show.legend = TRUE) +
  labs(
    x = "Wind", y = "Temperature", title = "Berdasarkan Bulan",
    color = "Month"
  ) +
  scale_color_gradient(low = "green", high = "red", guide = "legend") +
  theme_minimal() +
  theme(legend.position = "right")
# Mean Ozone and Solar.R per month, computed from the complete rows.
rata <- clear %>%
  group_by(Month) %>%
  summarise(
    rataOzone = mean(Ozone),
    rataSolar = mean(Solar.R)
  )
rata
## # A tibble: 5 x 3
## Month rataOzone rataSolar
## <int> <dbl> <dbl>
## 1 5 24.1 182.
## 2 6 29.4 184.
## 3 7 59.1 216.
## 4 8 60 173.
## 5 9 31.4 168.
# Impute by month: every missing Ozone / Solar.R value is replaced with the
# mean of its month taken from `rata`.
imputasi2 <- data
# Position of each row's month inside `rata`. Looking the month up by value
# removes the hard-coded month numbers and row positions; a month without an
# entry in `rata` yields NA, so its missing values simply stay missing.
baris_rata <- match(imputasi2$Month, rata$Month)
ozone_kosong <- is.na(imputasi2$Ozone)
imputasi2$Ozone[ozone_kosong] <- rata$rataOzone[baris_rata[ozone_kosong]]
solar_kosong <- is.na(imputasi2$Solar.R)
imputasi2$Solar.R[solar_kosong] <- rata$rataSolar[baris_rata[solar_kosong]]
library(scutr)
# Keep iris rows 90 to 150 and drop the factor levels that no longer occur.
data2 <- droplevels(iris[90:150, ])
# Bar plot of the class counts before oversampling.
plot(data2$Species)
# SMOTE oversampling of the "versicolor" class with a target of 35.
coba <- oversample_smote(data2, "versicolor", "Species", 35)
# Number of rows returned by the oversampling step.
nrow(coba)
## [1] 35
# Combine the untouched majority class with the oversampled "versicolor" rows.
# oversample_smote() returns the whole oversampled class (its original rows
# plus the synthetic ones), so binding `coba` to all of `data2` would include
# every original "versicolor" row twice.
# NOTE(review): based on the scutr documentation of `m` - confirm.
hasil_oversamp <- rbind(data2[data2$Species != "versicolor", ], coba)
# Bar plot of the class counts after oversampling.
plot(hasil_oversamp$Species)
# Scatter plot of the data before oversampling.
ggplot(data2, aes(x = Sepal.Length, y = Petal.Length, color = Species)) +
  geom_point(show.legend = TRUE) +
  labs(x = "Sepal Length", y = "Petal Length", color = "Species")
# The same scatter plot after oversampling.
ggplot(
  hasil_oversamp,
  aes(x = Sepal.Length, y = Petal.Length, color = Species)
) +
  geom_point(show.legend = TRUE) +
  labs(x = "Sepal Length", y = "Petal Length", color = "Species")
library(readr)
# Read the AB_NYC_2019 CSV file from its local path.
data3 <- read_csv(file = "D:/1. DATA MINING/AB_NYC_2019.csv")
## Rows: 48895 Columns: 16
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (5): name, host_name, neighbourhood_group, neighbourhood, room_type
## dbl (10): id, host_id, latitude, longitude, price, minimum_nights, number_o...
## date (1): last_review
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Restrict the data to its first 4000 rows.
data3 <- data3[seq_len(4000), ]
# Treat room_type as a categorical variable and show the count per level.
data3[["room_type"]] <- as.factor(data3[["room_type"]])
summary(data3[["room_type"]])
## Entire home/apt Private room Shared room
## 2422 1525 53
library(dplyr)
# Keep only the columns used in this example.
data3 <- dplyr::select(
  data3,
  id, host_id, neighbourhood_group, neighbourhood,
  room_type, price, availability_365, minimum_nights
)
# Keep the two room types of interest.
data3 <- dplyr::filter(
  data3,
  room_type %in% c("Private room", "Entire home/apt")
)
# Remove the factor level that no longer occurs after filtering.
data3$room_type <- droplevels(data3$room_type)
# Bar plot of the remaining class counts.
plot(data3$room_type)
Anggap saja kedua level factor room_type tersebut timpang (tidak seimbang).
library(scutr)
# Tomek-link undersampling of the "Entire home/apt" class with a target of
# 1700 rows. `TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
# NOTE(review): the dist() warning below presumably comes from the character
# columns (neighbourhood_group, neighbourhood) being passed to the distance
# computation - confirm, and consider undersampling on numeric columns only.
undersamp <- undersample_tomek(
  data3, "Entire home/apt", "room_type", 1700,
  tomek = "diff", force_m = TRUE
)
## Warning in dist(data[, -which(names(data) == cls_col)], method = dist_calc):
## NAs introduced by coercion
# Number of rows returned by the undersampling step.
nrow(undersamp)
## [1] 1670
Catatan: masih terdapat warning, dan hasil undersampling (1670 baris) tidak sesuai sasaran (1700 baris).
# The "Private room" rows, kept in full so they can be combined with the
# undersampled "Entire home/apt" rows.
private <- data3 %>% dplyr::filter(room_type == "Private room")
# Scatter plot before undersampling. The placeholder title was replaced so the
# before and after plots can be told apart.
ggplot(
  data3,
  aes(x = availability_365, y = minimum_nights, color = room_type)
) +
  geom_point(show.legend = TRUE) +
  labs(
    x = "Availability", y = "Minimum Nights",
    title = "Sebelum Undersampling", color = "room_type"
  )
# Undersampled majority class plus the untouched "Private room" class.
hasil_undersamp <- rbind(undersamp, private)
# Bar plot of the class counts after undersampling.
plot(hasil_undersamp$room_type)
# The same scatter plot after undersampling.
ggplot(
  hasil_undersamp,
  aes(x = availability_365, y = minimum_nights, color = room_type)
) +
  geom_point(show.legend = TRUE) +
  labs(
    x = "Availability", y = "Minimum Nights",
    title = "Setelah Undersampling", color = "room_type"
  )