Es útil pensar la validación cruzada como lo que es: un proceso iterativo.
Un proceso iterativo es un tipo de proceso que debe repetirse. Lo central es que en cada repetición cambian los inputs del proceso, pero no la función que se aplica.
Ejemplos de procesos iterativos:
# Iterative process: the same print call runs once for each value from 1 to 10.
for (i in seq_len(10)) {
  print(
    paste0("Hola soy R y estoy contando del 1 al 10 y voy en : ", i)
  )
}
## [1] "Hola soy R y estoy contando del 1 al 10 y voy en : 1"
## [1] "Hola soy R y estoy contando del 1 al 10 y voy en : 2"
## [1] "Hola soy R y estoy contando del 1 al 10 y voy en : 3"
## [1] "Hola soy R y estoy contando del 1 al 10 y voy en : 4"
## [1] "Hola soy R y estoy contando del 1 al 10 y voy en : 5"
## [1] "Hola soy R y estoy contando del 1 al 10 y voy en : 6"
## [1] "Hola soy R y estoy contando del 1 al 10 y voy en : 7"
## [1] "Hola soy R y estoy contando del 1 al 10 y voy en : 8"
## [1] "Hola soy R y estoy contando del 1 al 10 y voy en : 9"
## [1] "Hola soy R y estoy contando del 1 al 10 y voy en : 10"
# Iterative process that stores its results: position i of A holds i squared.
# The vector is preallocated so it is not regrown (and copied) on every
# iteration, and seq_along() ties the loop bounds to the vector's length.
A <- numeric(10)
for (i in seq_along(A)) {
  A[i] <- i^2
}
A
## [1] 1 4 9 16 25 36 49 64 81 100
Instalamos los paquetes necesarios para este laboratorio (caret y jtools).
Cargamos los paquetes para este laboratorio: los dos que acabamos de instalar, además de data.table.
library(data.table)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(jtools)
Abrimos los datos de Airbnb de NYC disponibles en webcursos.
# Load the NYC Airbnb listings and print per-column summary statistics.
# NOTE(review): the path is absolute and machine-specific; point it to wherever
# AB_NYC_2019.csv lives on your machine.
# The path is passed as `file =` on purpose: it contains spaces, and fread()'s
# first argument (`input`) interprets a string with spaces that is not an
# existing file as a shell command instead of reporting a missing file.
airbnb <- fread(
  file = "/Users/hassansantiago/Desktop/uai/data science/ultimas 3 clases/AB_NYC_2019.csv"
)
summary(airbnb)
## id name host_id host_name
## Min. : 2539 Length:48895 Min. : 2438 Length:48895
## 1st Qu.: 9471945 Class :character 1st Qu.: 7822033 Class :character
## Median :19677284 Mode :character Median : 30793816 Mode :character
## Mean :19017143 Mean : 67620011
## 3rd Qu.:29152178 3rd Qu.:107434423
## Max. :36487245 Max. :274321313
##
## neighbourhood_group neighbourhood latitude longitude
## Length:48895 Length:48895 Min. :40.50 Min. :-74.24
## Class :character Class :character 1st Qu.:40.69 1st Qu.:-73.98
## Mode :character Mode :character Median :40.72 Median :-73.96
## Mean :40.73 Mean :-73.95
## 3rd Qu.:40.76 3rd Qu.:-73.94
## Max. :40.91 Max. :-73.71
##
## room_type price minimum_nights number_of_reviews
## Length:48895 Min. : 0.0 Min. : 1.00 Min. : 0.00
## Class :character 1st Qu.: 69.0 1st Qu.: 1.00 1st Qu.: 1.00
## Mode :character Median : 106.0 Median : 3.00 Median : 5.00
## Mean : 152.7 Mean : 7.03 Mean : 23.27
## 3rd Qu.: 175.0 3rd Qu.: 5.00 3rd Qu.: 24.00
## Max. :10000.0 Max. :1250.00 Max. :629.00
##
## last_review reviews_per_month calculated_host_listings_count
## Min. :2011-03-28 Min. : 0.010 Min. : 1.000
## 1st Qu.:2018-07-08 1st Qu.: 0.190 1st Qu.: 1.000
## Median :2019-05-19 Median : 0.720 Median : 1.000
## Mean :2018-10-04 Mean : 1.373 Mean : 7.144
## 3rd Qu.:2019-06-23 3rd Qu.: 2.020 3rd Qu.: 2.000
## Max. :2019-07-08 Max. :58.500 Max. :327.000
## NA's :10052 NA's :10052
## availability_365
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 45.0
## Mean :112.8
## 3rd Qu.:227.0
## Max. :365.0
##
Vamos a transformar las variables del barrio, grupo del barrio y el tipo de habitación a factor.
# Add factor versions of the neighbourhood group, neighbourhood and room type
# columns to airbnb in a single by-reference update.
airbnb[, `:=`(
  nbhg = as.factor(neighbourhood_group),
  nbh = as.factor(neighbourhood),
  roomt = as.factor(room_type)
)]
# Model 1: linear regression of price on room type only.
reg1 <- lm(price ~ roomt, data = airbnb)
summary(reg1)
##
## Call:
## lm(formula = price ~ roomt, data = airbnb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -211.8 -59.8 -29.8 9.2 9910.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 211.794 1.456 145.43 <2e-16 ***
## roomtPrivate room -122.013 2.130 -57.30 <2e-16 ***
## roomtShared room -141.667 6.970 -20.32 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 232.1 on 48892 degrees of freedom
## Multiple R-squared: 0.06561, Adjusted R-squared: 0.06558
## F-statistic: 1717 on 2 and 48892 DF, p-value: < 2.2e-16
# Model 2: adds the number of reviews and the minimum nights to room type.
reg2 <- lm(
  price ~ roomt + number_of_reviews + minimum_nights,
  data = airbnb
)
summary(reg2)
##
## Call:
## lm(formula = price ~ roomt + number_of_reviews + minimum_nights,
## data = airbnb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -353.9 -60.2 -27.8 9.6 9882.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 215.19734 1.62337 132.562 < 2e-16 ***
## roomtPrivate room -120.96531 2.13282 -56.716 < 2e-16 ***
## roomtShared room -142.66648 6.96325 -20.488 < 2e-16 ***
## number_of_reviews -0.23776 0.02362 -10.067 < 2e-16 ***
## minimum_nights 0.23838 0.05143 4.635 3.58e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 231.8 on 48890 degrees of freedom
## Multiple R-squared: 0.06811, Adjusted R-squared: 0.06804
## F-statistic: 893.3 on 4 and 48890 DF, p-value: < 2.2e-16
# Predicted price of model 1 for every observation, kept both as a standalone
# vector and as a new column of airbnb.
pred1 <- predict(reg1)
airbnb[, pred1 := predict(reg1)]
# Error metrics computed on the same data the model was fitted on.
predicciones1 <- data.table(
  RMSE = RMSE(pred1, airbnb[["price"]]),
  MAE = MAE(pred1, airbnb[["price"]])
)
predicciones1
## RMSE MAE
## 1: 232.1394 75.89648
# Predicted prices of model 2 for every observation.
pred2 <- predict(reg2)
# Error metrics computed on the same data the model was fitted on.
predicciones2 <- data.table(
  RMSE = RMSE(pred2, airbnb[["price"]]),
  MAE = MAE(pred2, airbnb[["price"]])
)
predicciones2
## RMSE MAE
## 1: 231.8289 76.53866
set.seed(12345) # set a seed
# Resampling setup shared by both models: 5-fold cross-validation.
setupKCV <- trainControl(method = "cv", number = 5)
# NOTE(review): the seed is set only once, so the two train() calls below do not
# share fold assignments (their reported fold sizes differ). Re-seed before each
# call if the models must be compared on identical folds.
predkfolds1 <- train(
  price ~ roomt,
  data = airbnb,
  method = "lm",
  trControl = setupKCV
)
predkfolds2 <- train(
  price ~ roomt + number_of_reviews + minimum_nights,
  data = airbnb,
  method = "lm",
  trControl = setupKCV
)
print(predkfolds1)
## Linear Regression
##
## 48895 samples
## 1 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 39115, 39117, 39116, 39116, 39116
## Resampling results:
##
## RMSE Rsquared MAE
## 231.1046 0.06742812 75.89747
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Show the 5-fold cross-validation results for the second model.
print(predkfolds2)
## Linear Regression
##
## 48895 samples
## 3 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 39114, 39117, 39116, 39116, 39117
## Resampling results:
##
## RMSE Rsquared MAE
## 230.9595 0.06957219 76.53617
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
set.seed(12345) # set a seed
# Resampling setup: 5-fold cross-validation repeated 3 times.
setupRKCV <- trainControl(method = "repeatedcv", number = 5, repeats = 3)
# Same predictors as model 2 plus the neighbourhood group factor.
predRKCV <- train(
  price ~ roomt + number_of_reviews + minimum_nights + nbhg,
  data = airbnb,
  method = "lm",
  trControl = setupRKCV
)
print(predRKCV)
## Linear Regression
##
## 48895 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 39115, 39117, 39116, 39116, 39116, 39117, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 228.5229 0.08608289 73.7174
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
set.seed(12345) # set a seed
# Hold-out validation: createDataPartition() is asked for p = 0.7 of the rows
# (based on price); those rows train the model and the remaining rows test it.
training.samples <- createDataPartition(airbnb$price, p = 0.7, list = FALSE)
train.data <- airbnb[training.samples, ]
test.data <- airbnb[-training.samples, ]
# Fit on the training rows only, then predict the held-out rows.
model <- lm(
  price ~ roomt + number_of_reviews + minimum_nights,
  data = train.data
)
predictions <- predict(model, newdata = test.data)
# Out-of-sample performance metrics.
data.frame(
  R2 = R2(predictions, test.data[["price"]]),
  RMSE = RMSE(predictions, test.data[["price"]]),
  MAE = MAE(predictions, test.data[["price"]])
)
## R2 RMSE MAE
## 1 0.07560695 215.0031 76.20859
set.seed(12345) # set a seed
# Resampling setup: leave-one-out cross-validation.
setupLOO <- trainControl(method = "LOOCV")
# Only the first 1000 rows of airbnb are used for this model.
predLOO <- train(
  price ~ roomt + number_of_reviews + minimum_nights + nbhg,
  data = airbnb[seq_len(1000)],
  method = "lm",
  trControl = setupLOO
)
print(predLOO)
## Linear Regression
##
## 1000 samples
## 4 predictor
##
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation
## Summary of sample sizes: 999, 999, 999, 999, 999, 999, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 143.1674 0.112473 63.99506
##
## Tuning parameter 'intercept' was held constant at a value of TRUE