1. Package
Berikut ini adalah package yang dipakai dalam analisa week 4 ini.
library(gam)## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.20
library(rpart)
library(pROC)## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(tidyverse)## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.3 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x purrr::accumulate() masks foreach::accumulate()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::when() masks foreach::when()
library(mlr3verse)## Loading required package: mlr3
library(mlr3extralearners)##
## Attaching package: 'mlr3extralearners'
## The following objects are masked from 'package:mlr3':
##
## lrn, lrns
library(precrec)##
## Attaching package: 'precrec'
## The following object is masked from 'package:pROC':
##
## auc
library(adabag)## Loading required package: caret
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
library(ROCR)
library(ROCit)
library(magrittr)##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(visdat)
library(naniar)
library(UpSetR)##
## Attaching package: 'UpSetR'
## The following object is masked from 'package:lattice':
##
## histogram
library(laeken)
library(vcd)## Loading required package: grid
library(VIM)## Loading required package: colorspace
##
## Attaching package: 'colorspace'
## The following object is masked from 'package:pROC':
##
## coords
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(sm)## Package 'sm', version 2.2-5.7: type help(sm) for summary information
library(ggplot2)
library(dplyr)
library(mlbench)
library(caret)
library(mlr3verse)
library(mlr3fselect)
library(DataExplorer)
library(skimr)##
## Attaching package: 'skimr'
## The following object is masked from 'package:naniar':
##
## n_complete
## The following object is masked from 'package:mlr3':
##
## partition
library(corrplot)## corrplot 0.92 loaded
library(leaps)
library(olsrr)##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(kableExtra) #Tampilan Tabel##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(agricolae) #Pemeriksaan Asumsi
library(lmtest) #Untuk pengecekan asumsi## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(car) #Untuk pengecekan asumsi## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:ROCit':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(tseries) #Untuk pengecekan asumsi## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(glmnet)## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-2
library(glmnetUtils)##
## Attaching package: 'glmnetUtils'
## The following objects are masked from 'package:glmnet':
##
## cv.glmnet, glmnet
library(broom)
library(ggpubr)
library(modelr)##
## Attaching package: 'modelr'
## The following object is masked from 'package:broom':
##
## bootstrap
## The following object is masked from 'package:mlr3':
##
## resample
library(precrec)
library(adabag)
library(rpart.plot)
library(mice)##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
2. Data Training
Data training yang dipakai adalah data training hasil pre-processing pada week sebelumnya dengan sedikit ada perubahan yaitu merubah variabel factor menjadi variabel dummy.
data2 <- read.csv2("D:/Magister IPB/Kuliah/Semester 2/STA582_Pembelajaran_Mesin_Statistika/Praktikum/Tugas_UTS/Problem 2/datatrainnew.csv",stringsAsFactors = TRUE)
dim(data2)## [1] 49692 13
glimpse(data2)## Rows: 49,692
## Columns: 13
## $ property_type <fct> Apartment, Apartment, Apartment, Apartment, Apartm~
## $ room_type <fct> Entire home/apt, Entire home/apt, Entire home/apt,~
## $ accommodates <int> 3, 7, 5, 2, 2, 3, 2, 2, 2, 2, 2, 6, 2, 2, 2, 2, 8,~
## $ bathrooms <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1,~
## $ bed_type <fct> Real Bed, Real Bed, Real Bed, Real Bed, Real Bed, ~
## $ cancellation_policy <fct> strict, strict, moderate, moderate, strict, modera~
## $ cleaning_fee <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TR~
## $ city <fct> NYC, NYC, NYC, DC, SF, LA, LA, SF, LA, NYC, LA, DC~
## $ instant_bookable <fct> f, t, t, t, t, t, f, f, t, f, f, t, f, f, f, f, t,~
## $ bedrooms <int> 1, 3, 1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 3,~
## $ beds <int> 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 2, 3,~
## $ price <dbl> 5.010635, 5.129899, 4.976734, 4.744932, 4.442651, ~
## $ sum_amenities <int> 9, 15, 19, 12, 10, 21, 26, 21, 13, 15, 17, 25, 7, ~
data2$bed_type <- ifelse(data2$bed_type == "Airbed", 1, ifelse(data2$bed_type == "Couch", 2, ifelse(data2$bed_type == "Futon", 3, ifelse(data2$bed_type == "Pull-out Sofa", 4,5))))
data2$cancellation_policy <- ifelse(data2$cancellation_policy == "flexible", 1, ifelse(data2$cancellation_policy == "moderate", 2, ifelse(data2$cancellation_policy == "strict", 3, ifelse(data2$cancellation_policy == "Psuper_strict_30", 4,5))))
data2$city <- ifelse(data2$city == "Boston", 1, ifelse(data2$city == "Chicago", 2, ifelse(data2$city == "DC", 3, ifelse(data2$city == "LA", 4,ifelse(data2$city == "NYC", 5,6)))))
data2$property_type <- ifelse(data2$property_type == "Apartment", 1,
ifelse(data2$property_type == "Bed & Breakfast", 2,
ifelse(data2$property_type == "Boat", 3,
ifelse(data2$property_type == "Boutique hotel", 4,
ifelse(data2$property_type == "Bungalow", 5,
ifelse(data2$property_type == "Cabin", 6,
ifelse(data2$property_type == "Camper/RV", 7,
ifelse(data2$property_type == "Castle", 8,
ifelse(data2$property_type == "Cave", 9,
ifelse(data2$property_type == "Chalet", 10,
ifelse(data2$property_type == "Condominium", 11,
ifelse(data2$property_type == "Dorm", 12,
ifelse(data2$property_type == "Earth House", 13,
ifelse(data2$property_type == "Guest suite", 14,
ifelse(data2$property_type == "Guesthouse", 15,
ifelse(data2$property_type == "Hostel", 16,
ifelse(data2$property_type == "House", 17,
ifelse(data2$property_type == "Hut", 18,
ifelse(data2$property_type == "In-law", 19,
ifelse(data2$property_type == "Loft", 20,
ifelse(data2$property_type == "Other", 21,
ifelse(data2$property_type == "Serviced apartment", 22,
ifelse(data2$property_type == "Tent", 23,
ifelse(data2$property_type == "Timeshare", 24,
ifelse(data2$property_type == "Townhouse", 25,
ifelse(data2$property_type == "Train", 26,
ifelse(data2$property_type == "Treehouse", 27,
ifelse(data2$property_type == "Vacation home", 28,
ifelse(data2$property_type == "Villa", 29,
30)))))))))))))))))))))))))))))
str(data2)## 'data.frame': 49692 obs. of 13 variables:
## $ property_type : num 1 1 1 1 1 1 11 17 17 1 ...
## $ room_type : Factor w/ 3 levels "Entire home/apt",..: 1 1 1 1 2 1 1 2 2 2 ...
## $ accommodates : int 3 7 5 2 2 3 2 2 2 2 ...
## $ bathrooms : int 1 1 1 1 1 1 1 1 1 1 ...
## $ bed_type : num 5 5 5 5 5 5 5 5 5 5 ...
## $ cancellation_policy: num 3 3 2 2 3 2 2 2 2 3 ...
## $ cleaning_fee : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ city : num 5 5 5 3 6 4 4 6 4 5 ...
## $ instant_bookable : Factor w/ 2 levels "f","t": 1 2 2 2 2 2 1 1 2 1 ...
## $ bedrooms : int 1 3 1 0 1 1 1 1 1 1 ...
## $ beds : int 1 3 3 1 1 1 1 1 1 1 ...
## $ price : num 5.01 5.13 4.98 4.74 4.44 ...
## $ sum_amenities : int 9 15 19 12 10 21 26 21 13 15 ...
anyNA(data2)## [1] FALSE
3. Data Testing
Data testing tetap menggunakan data test awal, namun ada sedikit modifkasi yaitu adanya proses imputasi data hilang agar bisa dieksekuis prediksinya dalam model yang sudah dibangun. Metode dalam imputasi menggunakan random forest.
data_test <- read.csv2("D:/Magister IPB/Kuliah/Semester 2/STA582_Pembelajaran_Mesin_Statistika/Praktikum/Tugas_UTS/Problem 2/test2.csv",stringsAsFactors = TRUE)
data_test<- data_test[,c(2,3,5,6,7,8,9,10,17,27,28,29)]
data_test$bed_type <- ifelse(data_test$bed_type == "Airbed", 1, ifelse(data_test$bed_type == "Couch", 2, ifelse(data_test$bed_type == "Futon", 3, ifelse(data_test$bed_type == "Pull-out Sofa", 4,5))))
data_test$cancellation_policy <- ifelse(data_test$cancellation_policy == "flexible", 1, ifelse(data_test$cancellation_policy == "moderate", 2, ifelse(data_test$cancellation_policy == "strict", 3, ifelse(data_test$cancellation_policy == "Psuper_strict_30", 4,5))))
data_test$city <- ifelse(data_test$city == "Boston", 1, ifelse(data_test$city == "Chicago", 2, ifelse(data_test$city == "DC", 3, ifelse(data_test$city == "LA", 4,ifelse(data_test$city == "NYC", 5,6)))))
data_test$property_type <- ifelse(data_test$property_type == "Apartment", 1,
ifelse(data_test$property_type == "Bed & Breakfast", 2,
ifelse(data_test$property_type == "Boat", 3,
ifelse(data_test$property_type == "Boutique hotel", 4,
ifelse(data_test$property_type == "Bungalow", 5,
ifelse(data_test$property_type == "Cabin", 6,
ifelse(data_test$property_type == "Camper/RV", 7,
ifelse(data_test$property_type == "Castle", 8,
ifelse(data_test$property_type == "Cave", 9,
ifelse(data_test$property_type == "Chalet", 10,
ifelse(data_test$property_type == "Condominium", 11,
ifelse(data_test$property_type == "Dorm", 12,
ifelse(data_test$property_type == "Earth House", 13,
ifelse(data_test$property_type == "Guest suite", 14,
ifelse(data_test$property_type == "Guesthouse", 15,
ifelse(data_test$property_type == "Hostel", 16,
ifelse(data_test$property_type == "House", 17,
ifelse(data_test$property_type == "Hut", 18,
ifelse(data_test$property_type == "In-law", 19,
ifelse(data_test$property_type == "Loft", 20,
ifelse(data_test$property_type == "Other", 21,
ifelse(data_test$property_type == "Serviced apartment", 22,
ifelse(data_test$property_type == "Tent", 23,
ifelse(data_test$property_type == "Timeshare", 24,
ifelse(data_test$property_type == "Townhouse", 25,
ifelse(data_test$property_type == "Train", 26,
ifelse(data_test$property_type == "Treehouse", 27,
ifelse(data_test$property_type == "Vacation home", 28,
ifelse(data_test$property_type == "Villa", 29,
30)))))))))))))))))))))))))))))
glimpse(data_test)## Rows: 22,232
## Columns: 12
## $ property_type <dbl> 17, 1, 1, 17, 1, 1, 1, 1, 1, 1, 17, 17, 1, 1, 1, 1~
## $ room_type <fct> Entire home/apt, Private room, Entire home/apt, Pr~
## $ accommodates <int> 4, 2, 4, 2, 4, 6, 4, 8, 2, 1, 2, 2, 2, 5, 6, 1, 3,~
## $ bathrooms <dbl> 1.5, 1.5, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ~
## $ bed_type <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,~
## $ cancellation_policy <dbl> 3, 2, 2, 2, 2, 3, 3, 2, 1, 1, 3, 3, 1, 3, 3, 1, 2,~
## $ cleaning_fee <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FA~
## $ city <dbl> 4, 2, 4, 4, 3, 5, 4, 4, 5, 5, 5, 2, 4, 5, 1, 5, 4,~
## $ instant_bookable <fct> f, f, t, f, f, f, f, t, f, f, t, f, f, f, f, f, f,~
## $ bedrooms <int> 2, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 0, 2, 0, 1,~
## $ beds <int> 2, 1, 1, 1, 2, 3, 3, 7, 1, 1, 1, 1, 1, 2, 4, 1, 1,~
## $ sum_amenities <int> 22, 12, 29, 16, 14, 44, 27, 15, 14, 16, 22, 20, 15~
str(data_test)## 'data.frame': 22232 obs. of 12 variables:
## $ property_type : num 17 1 1 17 1 1 1 1 1 1 ...
## $ room_type : Factor w/ 3 levels "Entire home/apt",..: 1 2 1 2 1 1 1 2 1 2 ...
## $ accommodates : int 4 2 4 2 4 6 4 8 2 1 ...
## $ bathrooms : num 1.5 1.5 1 1 1 1 1 1 1 1 ...
## $ bed_type : num 5 5 5 5 5 5 5 5 5 5 ...
## $ cancellation_policy: num 3 2 2 2 2 3 3 2 1 1 ...
## $ cleaning_fee : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ city : num 4 2 4 4 3 5 4 4 5 5 ...
## $ instant_bookable : Factor w/ 2 levels "f","t": 1 1 2 1 1 1 1 2 1 1 ...
## $ bedrooms : int 2 1 1 1 1 3 2 1 1 1 ...
## $ beds : int 2 1 1 1 2 3 3 7 1 1 ...
## $ sum_amenities : int 22 12 29 16 14 44 27 15 14 16 ...
Cek data hilang
sum_mis2<-miss_var_summary(data_test)
sum_mis_plot2<-head(sum_mis2,7)
sum_mis_plot2Imputasi data hilang
my_imptest<- mice(data_test, m = 2, method = "rf", maxit = 10, seed = 52500)##
## iter imp variable
## 1 1 bathrooms bedrooms beds
## 1 2 bathrooms bedrooms beds
## 2 1 bathrooms bedrooms beds
## 2 2 bathrooms bedrooms beds
## 3 1 bathrooms bedrooms beds
## 3 2 bathrooms bedrooms beds
## 4 1 bathrooms bedrooms beds
## 4 2 bathrooms bedrooms beds
## 5 1 bathrooms bedrooms beds
## 5 2 bathrooms bedrooms beds
## 6 1 bathrooms bedrooms beds
## 6 2 bathrooms bedrooms beds
## 7 1 bathrooms bedrooms beds
## 7 2 bathrooms bedrooms beds
## 8 1 bathrooms bedrooms beds
## 8 2 bathrooms bedrooms beds
## 9 1 bathrooms bedrooms beds
## 9 2 bathrooms bedrooms beds
## 10 1 bathrooms bedrooms beds
## 10 2 bathrooms bedrooms beds
summary(my_imptest)## Class: mids
## Number of multiple imputations: 2
## Imputation methods:
## property_type room_type accommodates bathrooms
## "" "" "" "rf"
## bed_type cancellation_policy cleaning_fee city
## "" "" "" ""
## instant_bookable bedrooms beds sum_amenities
## "" "rf" "rf" ""
## PredictorMatrix:
## property_type room_type accommodates bathrooms bed_type
## property_type 0 1 1 1 1
## room_type 1 0 1 1 1
## accommodates 1 1 0 1 1
## bathrooms 1 1 1 0 1
## bed_type 1 1 1 1 0
## cancellation_policy 1 1 1 1 1
## cancellation_policy cleaning_fee city instant_bookable
## property_type 1 1 1 1
## room_type 1 1 1 1
## accommodates 1 1 1 1
## bathrooms 1 1 1 1
## bed_type 1 1 1 1
## cancellation_policy 0 1 1 1
## bedrooms beds sum_amenities
## property_type 1 1 1
## room_type 1 1 1
## accommodates 1 1 1
## bathrooms 1 1 1
## bed_type 1 1 1
## cancellation_policy 1 1 1
data_test_final = complete(my_imptest,2)Cek kembali data hilang
anyNA(data_test_final)## [1] FALSE
sum_mis3<-miss_var_summary(data_test_final)
sum_mis_plot3<-head(sum_mis3,7)
sum_mis_plot3Data hilang sudah berhasil diselesaikan.
4. Eksplorasi Korelasi Data
Berikut ini adalah pola hubungan setiap variabel prediktor dengan variabel respon.
DataExplorer::plot_scatterplot(data = data2,
by = "price",nrow = 3,ncol = 3,geom_point_args = list(color="Steelblue"))5. Pemodelan
Pada pertemuan ini beberapa metode pemodelan yang akan dicobakan diantaranya regresi spline dan loess, serta juga akan disandingkan dengan metode regresi linier sebagai baseline.
Regresi Spline
Pertama kita akan bangun regresi spline dengan terlebih dahulu mengeluarkan variabel yang tidak bisa digunakan dalam metode ini agar model dapat berjalan dengan baik.
formula_spline1 <- str_c("s(",
names(data2 %>% select(-price, -room_type, -cleaning_fee, -instant_bookable)),
",df=",300,")",
collapse = "+")
formula_spline1 <- as.formula(str_c("price~",formula_spline1))
formula_spline1## price ~ s(property_type, df = 300) + s(accommodates, df = 300) +
## s(bathrooms, df = 300) + s(bed_type, df = 300) + s(cancellation_policy,
## df = 300) + s(city, df = 300) + s(bedrooms, df = 300) + s(beds,
## df = 300) + s(sum_amenities, df = 300)
mod_spline1 <- gam(formula_spline1,data=data2,family = "gaussian")
summary(mod_spline1)##
## Call: gam(formula = formula_spline1, family = "gaussian", data = data2)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.759570 -0.304236 -0.009553 0.297008 1.802362
##
## (Dispersion Parameter for gaussian family taken to be 0.2)
##
## Null Deviance: 19932.27 on 49691 degrees of freedom
## Residual Deviance: 9910.823 on 49546 degrees of freedom
## AIC: 61199.52
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(property_type, df = 300) 1 2.1 2.1 10.530 0.001176 **
## s(accommodates, df = 300) 1 5434.7 5434.7 27168.983 < 2.2e-16 ***
## s(bathrooms, df = 300) 1 173.1 173.1 865.152 < 2.2e-16 ***
## s(bed_type, df = 300) 1 44.2 44.2 220.916 < 2.2e-16 ***
## s(cancellation_policy, df = 300) 1 15.4 15.4 76.887 < 2.2e-16 ***
## s(city, df = 300) 1 268.6 268.6 1342.857 < 2.2e-16 ***
## s(bedrooms, df = 300) 1 523.4 523.4 2616.791 < 2.2e-16 ***
## s(beds, df = 300) 1 47.6 47.6 238.090 < 2.2e-16 ***
## s(sum_amenities, df = 300) 1 44.1 44.1 220.572 < 2.2e-16 ***
## Residuals 49546 9910.8 0.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(property_type, df = 300) 28 43.53 < 2.2e-16 ***
## s(accommodates, df = 300) 14 240.82 < 2.2e-16 ***
## s(bathrooms, df = 300) 7 13.17 < 2.2e-16 ***
## s(bed_type, df = 300) 3 1.71 0.1628
## s(cancellation_policy, df = 300) 2 58.90 < 2.2e-16 ***
## s(city, df = 300) 4 691.83 < 2.2e-16 ***
## s(bedrooms, df = 300) 9 372.96 < 2.2e-16 ***
## s(beds, df = 300) 15 4.38 2.546e-08 ***
## s(sum_amenities, df = 300) 54 3.25 1.066e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod_spline1$aic## [1] 61199.52
Setelah itu kita coba predict dengan menggunakan data latih untuk menguji mae nya.
#Predict data training
pred_spline1 <- predict(mod_spline1,newdata=data2)
maespline<- abs(pred_spline1-data2$price)
maespline<-as.data.frame(maespline)
mae <- function(response,pred){
mean(abs(response-pred),na.rm = TRUE)
}
mae_spline<-mae(data2$price, pred_spline1)
mae_spline## [1] 0.3567572
Kemudian kita simulasikan dengan data test yang sebelumnya kita siapkan. Data test kita kembalikan ke bentuk asalnya karena data test yang kita gunakan sudah ditransformasi ke bentuk logaritma natural.
#Predict data test
pred_spline2 <- predict(mod_spline1,newdata=data_test_final)
#Mengembalikan fungsi log natural
pred_spline2<- 2.718282^(pred_spline2)
pred_spline2<-as.data.frame(pred_spline2)Pada tahap terahir, setelah data prediksi price pada data test sudah didapatkan, selanjutnya ada hal yang perlu kita lakukan agar hasil prediksi tsb bisa dimasukkan ke dalam keggle, yaitu dengan menduga nilai price dengan menggunakan imputasi dengan metode random forest.
#Imputasi data test untuk submit keggle
data_test_imp <- read.csv2("D:/Magister IPB/Kuliah/Semester 2/STA582_Pembelajaran_Mesin_Statistika/Praktikum/Tugas_UTS/Problem 2/Week 3/datatestweek3.csv",stringsAsFactors = TRUE)
glimpse(data_test_imp)## Rows: 22,232
## Columns: 13
## $ property_type <int> 17, 1, 1, 17, 1, 1, 1, 1, 1, 1, 17, 17, 1, 1, 1, 1~
## $ room_type <fct> Entire home/apt, Private room, Entire home/apt, Pr~
## $ accommodates <int> 4, 2, 4, 2, 4, 6, 4, 8, 2, 1, 2, 2, 2, 5, 6, 1, 3,~
## $ bathrooms <dbl> 1.5, 1.5, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ~
## $ bed_type <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,~
## $ cancellation_policy <int> 3, 2, 2, 2, 2, 3, 3, 2, 1, 1, 3, 3, 1, 3, 3, 1, 2,~
## $ cleaning_fee <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FA~
## $ city <int> 4, 2, 4, 4, 3, 5, 4, 4, 5, 5, 5, 2, 4, 5, 1, 5, 4,~
## $ instant_bookable <fct> f, f, t, f, f, f, f, t, f, f, t, f, f, f, f, f, f,~
## $ bedrooms <int> 2, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 0, 2, 0, 1,~
## $ beds <int> 2, 1, 1, 1, 2, 3, 3, 7, 1, 1, 1, 1, 1, 2, 4, 1, 1,~
## $ sum_amenities <int> 22, 12, 29, 16, 14, 44, 27, 15, 14, 16, 22, 20, 15~
## $ price <dbl> 158.29650, 67.76210, 129.78215, 69.24783, 124.9959~
my_imptest2<- mice(data_test_imp, m = 2, method = "rf", maxit = 10, seed = 52500)##
## iter imp variable
## 1 1 price
## 1 2 price
## 2 1 price
## 2 2 price
## 3 1 price
## 3 2 price
## 4 1 price
## 4 2 price
## 5 1 price
## 5 2 price
## 6 1 price
## 6 2 price
## 7 1 price
## 7 2 price
## 8 1 price
## 8 2 price
## 9 1 price
## 9 2 price
## 10 1 price
## 10 2 price
summary(my_imptest2)## Class: mids
## Number of multiple imputations: 2
## Imputation methods:
## property_type room_type accommodates bathrooms
## "" "" "" ""
## bed_type cancellation_policy cleaning_fee city
## "" "" "" ""
## instant_bookable bedrooms beds sum_amenities
## "" "" "" ""
## price
## "rf"
## PredictorMatrix:
## property_type room_type accommodates bathrooms bed_type
## property_type 0 1 1 1 1
## room_type 1 0 1 1 1
## accommodates 1 1 0 1 1
## bathrooms 1 1 1 0 1
## bed_type 1 1 1 1 0
## cancellation_policy 1 1 1 1 1
## cancellation_policy cleaning_fee city instant_bookable
## property_type 1 1 1 1
## room_type 1 1 1 1
## accommodates 1 1 1 1
## bathrooms 1 1 1 1
## bed_type 1 1 1 1
## cancellation_policy 0 1 1 1
## bedrooms beds sum_amenities price
## property_type 1 1 1 1
## room_type 1 1 1 1
## accommodates 1 1 1 1
## bathrooms 1 1 1 1
## bed_type 1 1 1 1
## cancellation_policy 1 1 1 1
data_test_final2 = complete(my_imptest2,2)
anyNA(data_test_final2)## [1] FALSE
Local Regression (Loess)
Pada kali ini kita akan menyelesaikan case tsb dengan menggunakan model loess.
#Generato model loess
formula_loess <- str_c("lo(",
names(data2 %>% select( -price, -room_type, -cleaning_fee, -instant_bookable)),
")",
collapse = "+")
formula_loess <- as.formula(str_c("price~",formula_loess))
mod_loess <- gam(formula_loess,data=data2,family = "gaussian")## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## span too small. fewer data values than degrees of freedom.
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## pseudoinverse used at 0.98
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## neighborhood radius 1.02
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## reciprocal condition number 0
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## There are other near singularities as well. 1.0404
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## span too small. fewer data values than degrees of freedom.
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## pseudoinverse used at 0.98
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## neighborhood radius 1.02
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## reciprocal condition number 0
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## There are other near singularities as well. 4.0804
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## pseudoinverse used at 3
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## neighborhood radius 1
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## reciprocal condition number -0
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## There are other near singularities as well. 1
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## span too small. fewer data values than degrees of freedom.
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## pseudoinverse used at 0.98
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## neighborhood radius 1.02
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## reciprocal condition number 0
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## There are other near singularities as well. 1.0404
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## span too small. fewer data values than degrees of freedom.
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## pseudoinverse used at 0.98
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## neighborhood radius 1.02
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## reciprocal condition number 0
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## There are other near singularities as well. 4.0804
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## pseudoinverse used at 3
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## neighborhood radius 1
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## reciprocal condition number -0
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## There are other near singularities as well. 1
summary(mod_loess)##
## Call: gam(formula = formula_loess, family = "gaussian", data = data2)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.161596 -0.318884 -0.004896 0.307604 1.983106
##
## (Dispersion Parameter for gaussian family taken to be 0.2129)
##
## Null Deviance: 19932.27 on 49691 degrees of freedom
## Residual Deviance: 10569.14 on 49653.81 degrees of freedom
## AIC: 64179.65
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## lo(property_type) 1 0.3 0.3 1.622 0.2028
## lo(accommodates) 1 6338.2 6338.2 29776.685 <2e-16 ***
## lo(bathrooms) 1 115.9 115.9 544.592 <2e-16 ***
## lo(bed_type) 1 60.0 60.0 281.682 <2e-16 ***
## lo(cancellation_policy) 1 23.8 23.8 111.870 <2e-16 ***
## lo(city) 1 265.7 265.7 1248.387 <2e-16 ***
## lo(bedrooms) 1 151.2 151.2 710.316 <2e-16 ***
## lo(beds) 1 40.1 40.1 188.575 <2e-16 ***
## lo(sum_amenities) 1 52.7 52.7 247.504 <2e-16 ***
## Residuals 49654 10569.1 0.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## lo(property_type) 2.4 229.15 < 2.2e-16 ***
## lo(accommodates) 2.8 1001.83 < 2.2e-16 ***
## lo(bathrooms) 3.6 38.25 < 2.2e-16 ***
## lo(bed_type) 3.0 1.11 0.3452
## lo(cancellation_policy) 2.0 57.95 < 2.2e-16 ***
## lo(city) 4.0 691.39 < 2.2e-16 ***
## lo(bedrooms) 4.2 283.78 < 2.2e-16 ***
## lo(beds) 3.2 20.39 5.074e-14 ***
## lo(sum_amenities) 2.9 26.95 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod_loess$aic## [1] 64179.65
Setelah model didapatkan, kita uji dengan membuat prediksi dengan data latih.
#Prediksi data train
pred_loess1 <- predict(mod_loess,newdata=data2)## Warning in gam.lo(data[["lo(bed_type)"]], z, w, span = 0.5, degree = 1, : span
## too small. fewer data values than degrees of freedom.
## Warning in gam.lo(data[["lo(bed_type)"]], z, w, span = 0.5, degree = 1, :
## pseudoinverse used at 0.98
## Warning in gam.lo(data[["lo(bed_type)"]], z, w, span = 0.5, degree = 1, :
## neighborhood radius 1.02
## Warning in gam.lo(data[["lo(bed_type)"]], z, w, span = 0.5, degree = 1, :
## reciprocal condition number 0
## Warning in gam.lo(data[["lo(bed_type)"]], z, w, span = 0.5, degree = 1, : There
## are other near singularities as well. 1.0404
## Warning in gam.lo(data[["lo(cancellation_policy)"]], z, w, span = 0.5, degree =
## 1, : span too small. fewer data values than degrees of freedom.
## Warning in gam.lo(data[["lo(cancellation_policy)"]], z, w, span = 0.5, degree =
## 1, : pseudoinverse used at 0.98
## Warning in gam.lo(data[["lo(cancellation_policy)"]], z, w, span = 0.5, degree =
## 1, : neighborhood radius 1.02
## Warning in gam.lo(data[["lo(cancellation_policy)"]], z, w, span = 0.5, degree =
## 1, : reciprocal condition number 0
## Warning in gam.lo(data[["lo(cancellation_policy)"]], z, w, span = 0.5, degree =
## 1, : There are other near singularities as well. 4.0804
## Warning in gam.lo(data[["lo(city)"]], z, w, span = 0.5, degree = 1, ncols = 1, :
## pseudoinverse used at 3
## Warning in gam.lo(data[["lo(city)"]], z, w, span = 0.5, degree = 1, ncols = 1, :
## neighborhood radius 1
## Warning in gam.lo(data[["lo(city)"]], z, w, span = 0.5, degree = 1, ncols = 1, :
## reciprocal condition number -0
## Warning in gam.lo(data[["lo(city)"]], z, w, span = 0.5, degree = 1, ncols = 1, :
## There are other near singularities as well. 1
maeloess<- abs(pred_loess1-data2$price)
maeloess<-as.data.frame(maeloess)
mae_loess<-mae(data2$price, pred_loess1)
mae_loess## [1] 0.3681556
Selanutnya kita juga uji dengan menggunakan data test yang sudah diimputasi.
#Predict data test
pred_loess2 <- predict(mod_loess,newdata=data_test_final)## Warning in gam.lo(data[["lo(bed_type)"]], z, w, span = 0.5, degree = 1, : span
## too small. fewer data values than degrees of freedom.
## Warning in gam.lo(data[["lo(bed_type)"]], z, w, span = 0.5, degree = 1, :
## pseudoinverse used at 0.98
## Warning in gam.lo(data[["lo(bed_type)"]], z, w, span = 0.5, degree = 1, :
## neighborhood radius 1.02
## Warning in gam.lo(data[["lo(bed_type)"]], z, w, span = 0.5, degree = 1, :
## reciprocal condition number 0
## Warning in gam.lo(data[["lo(bed_type)"]], z, w, span = 0.5, degree = 1, : There
## are other near singularities as well. 1.0404
## Warning in gam.lo(data[["lo(cancellation_policy)"]], z, w, span = 0.5, degree =
## 1, : span too small. fewer data values than degrees of freedom.
## Warning in gam.lo(data[["lo(cancellation_policy)"]], z, w, span = 0.5, degree =
## 1, : pseudoinverse used at 0.98
## Warning in gam.lo(data[["lo(cancellation_policy)"]], z, w, span = 0.5, degree =
## 1, : neighborhood radius 1.02
## Warning in gam.lo(data[["lo(cancellation_policy)"]], z, w, span = 0.5, degree =
## 1, : reciprocal condition number 0
## Warning in gam.lo(data[["lo(cancellation_policy)"]], z, w, span = 0.5, degree =
## 1, : There are other near singularities as well. 4.0804
## Warning in gam.lo(data[["lo(city)"]], z, w, span = 0.5, degree = 1, ncols = 1, :
## pseudoinverse used at 3
## Warning in gam.lo(data[["lo(city)"]], z, w, span = 0.5, degree = 1, ncols = 1, :
## neighborhood radius 1
## Warning in gam.lo(data[["lo(city)"]], z, w, span = 0.5, degree = 1, ncols = 1, :
## reciprocal condition number -0
## Warning in gam.lo(data[["lo(city)"]], z, w, span = 0.5, degree = 1, ncols = 1, :
## There are other near singularities as well. 1
pred_loess2<- 2.718282^(pred_loess2)
pred_loess2<-as.data.frame(pred_loess2)Linier Regression
Sebagai pembanding berikut ini kita akan coba simulasikan dengan menggunakan regresi linier.
mod_linear <- glm(price~.,data=data2,family = "gaussian")
summary(mod_linear)##
## Call:
## glm(formula = price ~ ., family = "gaussian", data = data2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.41747 -0.28852 -0.01181 0.26908 1.76612
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.114345 0.024525 167.765 < 2e-16 ***
## property_type -0.001247 0.000250 -4.990 6.07e-07 ***
## room_typePrivate room -0.634547 0.004478 -141.715 < 2e-16 ***
## room_typeShared room -1.085193 0.012065 -89.945 < 2e-16 ***
## accommodates 0.066480 0.001806 36.801 < 2e-16 ***
## bathrooms 0.101465 0.004581 22.149 < 2e-16 ***
## bed_type 0.016970 0.004477 3.791 0.00015 ***
## cancellation_policy 0.006996 0.002322 3.013 0.00259 **
## cleaning_feeTRUE -0.030862 0.004563 -6.764 1.36e-11 ***
## city 0.066340 0.001581 41.971 < 2e-16 ***
## instant_bookablet -0.060605 0.004225 -14.343 < 2e-16 ***
## bedrooms 0.158703 0.003545 44.773 < 2e-16 ***
## beds -0.038494 0.002879 -13.369 < 2e-16 ***
## sum_amenities 0.006597 0.000288 22.907 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1669608)
##
## Null deviance: 19932.3 on 49691 degrees of freedom
## Residual deviance: 8294.3 on 49678 degrees of freedom
## AIC: 52087
##
## Number of Fisher Scoring iterations: 2
mod_linear$aic## [1] 52087.31
Setelah model didapatkan, kita uji dengan membuat prediksi dengan data latih.
#Pred data train
pred_linier1 <- predict(mod_linear,newdata=data2)
maelinier<- abs(pred_linier1-data2$price)
maelinier<-as.data.frame(maelinier)
mae_linier<-mae(data2$price, pred_linier1)
mae_linier## [1] 0.3281817
Selanjutnya kita juga uji dengan menggunakan data test yang sudah diimputasi.
#Predict data test
pred_linier2 <- predict(mod_linear,newdata=data_test_final)
pred_linier2<- 2.718282^(pred_linier2)
pred_linier2<-as.data.frame(pred_linier2)6. Komparasi Model
Setelah semua proses pemodelan selesai dilakukan, tahap selanjutnya adalah membuat tabel pembandingan semua model yang mengacu pada nilai mae nya sehingga didapatkan hasil perbandingan sebagai berikut.
komp_mae<-as.data.frame(c(mae_spline,mae_loess, mae_linier))
colnames(komp_mae)## [1] "c(mae_spline, mae_loess, mae_linier)"
names(komp_mae)[names(komp_mae) == "c(mae_spline, mae_loess, mae_linier)"] <- "Nilai MAE"
Model<-c("Spline", "Loess", "Linier")
komp_mae<- cbind(Model, komp_mae)
komp_maeBerdasarkan hasil komparasi tsb, didapatkan model regresi yang paling baik adalah dengan menggunakan model regresi linier
7. Kesimpulan
Adapun yang dapat disimpulkan dari simulasi di atas adalah sbb:
- Model terbaik adalah regresi linier
- Metode spline dan loess tidak cocok dengan problem ini karena pada spline dan loess hanya bisa untuk variabel numerik minimum nilai unik = 4
- Metode spline dan loess akan lebih baik diterapkan pada variabel dengan skala interval atau rasio
- Untuk memprediksi dengan model loess dan spline tidak boleh ada data yang hilang
- Regresi linier lebih baik karena jumlah variabel bebas yang dimasukkan bisa lebih banyak dan fleksibel
Referensi
Breheny P. (n.d.). Getting started with grpreg. GitHub Pages. https://pbreheny.github.io/grpreg/articles/getting-started.html
Hastie T. (2013, May 9). glmnet: Lasso and elastic-net regularization in R. Revolutions. https://blog.revolutionanalytics.com/2013/05/hastie-glmnet.html
Post J. (2014, September 29). LASSO, Ridge, and Elastic Net. https://www4.stat.ncsu.edu/~post/josh/LASSO_Ridge_Elastic_Net_-_Examples.html
Tibshirani, R., Saunders, M., Rosset, S., Zhu, J., & Knight, K. (2005). Sparsity and smoothness via the fused lasso. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 67(1), 91-108. https://doi.org/10.1111/j.1467-9868.2005.00490.x
Tripathy A. (2013, July 14). Regularization – Predictive Modeling Beyond Ordinary Least Squares Fit. ShatterLine Blog. https://shatterline.com/blog/2013/07/
Xiaotong C., Chen G., & Chong W. (n.d.). Statistical Learning and Data Mining Codes. Biostatistics - Academic Divisions - School of Public Health - University of Minnesota. https://www.biostat.umn.edu/~weip/course/dm/examples/