Es útil pensar la validación cruzada como lo que son: procesos iterativos.
Un proceso iterativo es un tipo de proceso que debe repetirse. Lo central es cambiar los input del proceso, pero no la función que se aplica
Ejemplos de proceso iterativo:
for(i in 1:10){
print(paste0("Hola soy R y estoy contando del 1 al 10 y voy en : ",i))
}
## [1] "Hola soy R y estoy contando del 1 al 10 y voy en : 1"
## [1] "Hola soy R y estoy contando del 1 al 10 y voy en : 2"
## [1] "Hola soy R y estoy contando del 1 al 10 y voy en : 3"
## [1] "Hola soy R y estoy contando del 1 al 10 y voy en : 4"
## [1] "Hola soy R y estoy contando del 1 al 10 y voy en : 5"
## [1] "Hola soy R y estoy contando del 1 al 10 y voy en : 6"
## [1] "Hola soy R y estoy contando del 1 al 10 y voy en : 7"
## [1] "Hola soy R y estoy contando del 1 al 10 y voy en : 8"
## [1] "Hola soy R y estoy contando del 1 al 10 y voy en : 9"
## [1] "Hola soy R y estoy contando del 1 al 10 y voy en : 10"
A<-NULL
for(i in 1:10){
A[i]<-i^2
}
A
## [1] 1 4 9 16 25 36 49 64 81 100
install.packages("caret")
install.packages("jtools")
library(data.table)
library(caret)
## Warning: package 'caret' was built under R version 4.0.3
## Loading required package: lattice
## Loading required package: ggplot2
library(jtools)
## Warning: package 'jtools' was built under R version 4.0.3
airbnb<-fread("AB_NYC_2019.csv")
airbnb[,nbhg:=as.factor(neighbourhood_group)]
airbnb[,nbh:=as.factor(neighbourhood)]
airbnb[,roomt:=as.factor(room_type)]
reg1<-lm(data=airbnb,formula =price~roomt)
summary(reg1)
##
## Call:
## lm(formula = price ~ roomt, data = airbnb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -211.8 -59.8 -29.8 9.2 9910.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 211.794 1.456 145.43 <2e-16 ***
## roomtPrivate room -122.013 2.130 -57.30 <2e-16 ***
## roomtShared room -141.667 6.970 -20.32 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 232.1 on 48892 degrees of freedom
## Multiple R-squared: 0.06561, Adjusted R-squared: 0.06558
## F-statistic: 1717 on 2 and 48892 DF, p-value: < 2.2e-16
reg2<-lm(data=airbnb,formula=price~roomt+number_of_reviews+minimum_nights+nbhg)
summary(reg2)
##
## Call:
## lm(formula = price ~ roomt + number_of_reviews + minimum_nights +
## nbhg, data = airbnb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -304.3 -63.1 -23.9 11.7 9920.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 165.92245 7.13445 23.257 < 2e-16 ***
## roomtPrivate room -110.94748 2.14630 -51.692 < 2e-16 ***
## roomtShared room -134.82447 6.91878 -19.487 < 2e-16 ***
## number_of_reviews -0.20908 0.02345 -8.915 < 2e-16 ***
## minimum_nights 0.16007 0.05108 3.134 0.00173 **
## nbhgBrooklyn 21.23645 7.15621 2.968 0.00300 **
## nbhgManhattan 77.83906 7.16043 10.871 < 2e-16 ***
## nbhgQueens 9.29678 7.60385 1.223 0.22147
## nbhgStaten Island 13.75862 13.79607 0.997 0.31863
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 229.9 on 48886 degrees of freedom
## Multiple R-squared: 0.08336, Adjusted R-squared: 0.08321
## F-statistic: 555.7 on 8 and 48886 DF, p-value: < 2.2e-16
pred1<-predict(reg1) ## calcular los y predichos para cada observación
airbnb[,pred1:=predict(reg1)]
predicciones1<-data.table(RMSE=RMSE(pred1,airbnb$price),
MAE=MAE(pred1,airbnb$price))
predicciones1
## RMSE MAE
## 1: 232.1394 75.89648
pred2<-predict(reg2) ## calcular los precios predichos de este modelo
predicciones2<-data.table(RMSE=RMSE(pred2,airbnb$price),
MAE=MAE(pred2,airbnb$price))
predicciones2
## RMSE MAE
## 1: 229.9244 73.68691
k<-5
set.seed(12345) ## setear una semilla
airbnb[,fold:=sample(k,size=nrow(airbnb),replace=TRUE)] # Divide la muestra en 5 grupos
airbnb[,nr:=.I] # Genera una columna que corresponde al número de la observación
ind<-list() # Creamos una lista
for(i in 1:k){
regk<-lm(data=airbnb[fold!=i],formula=price~roomt+number_of_reviews+minimum_nights)
coef<-coef(regk)
airbnb[fold==i,predk:=coef[1]+ifelse(roomt=="Private room",coef[2],0)+ifelse(roomt=="Shared room",coef[3],0)+number_of_reviews*coef[4]+minimum_nights*coef[5]]
prediccionesk<-data.table(RMSE=RMSE(airbnb[fold==i,predk],airbnb[fold==i,price]),
MAE=MAE(airbnb[fold==i,predk],airbnb[fold==i,price]))
ind[i]<-list(airbnb[fold!=i,nr])
if(i==1) {predkf<-prediccionesk
} else {predkf<-rbind(predkf,prediccionesk)}
}
prediccioneskk<-data.table(RMSE=RMSE(airbnb[,predk],airbnb[,price]),
MAE=MAE(airbnb[,predk],airbnb[,price]))
prediccioneskk
## RMSE MAE
## 1: 231.8942 76.57592
train.control <- trainControl(index=ind)
set.seed(12345) ## setear una semilla
pred<-train(price~roomt+number_of_reviews+minimum_nights,data=airbnb,method="lm",trControl= train.control)
print(pred)
## Linear Regression
##
## 48895 samples
## 3 predictor
##
## No pre-processing
## Resampling: Bootstrapped (5 reps)
## Summary of sample sizes: 39015, 39239, 38956, 39152, 39218
## Resampling results:
##
## RMSE Rsquared MAE
## 231.0026 0.06962121 76.58142
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
set.seed(12345) ## setear una semilla
setupKCV <- trainControl(method = "cv" , number = 5)
predkfolds1<-train(price~roomt,data=airbnb,method="lm",trControl= setupKCV)
predkfolds2<-train(price~roomt+number_of_reviews+minimum_nights+nbhg,data=airbnb,method="lm",trControl= setupKCV)
print(predkfolds1)
## Linear Regression
##
## 48895 samples
## 1 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 39115, 39117, 39116, 39116, 39116
## Resampling results:
##
## RMSE Rsquared MAE
## 231.1046 0.06742812 75.89747
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
print(predkfolds2)
## Linear Regression
##
## 48895 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 39114, 39117, 39116, 39116, 39117
## Resampling results:
##
## RMSE Rsquared MAE
## 229.056 0.0850292 73.68725
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
set.seed(12345)## setear una semilla
setupRKCV <- trainControl(method = "repeatedcv" , number = 5, repeats= 3)
predRKCV<-train(price~roomt+number_of_reviews+minimum_nights+nbhg,data=airbnb,method="lm",trControl= setupRKCV)
print(predRKCV)
## Linear Regression
##
## 48895 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 39115, 39117, 39116, 39116, 39116, 39117, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 228.5229 0.08608289 73.7174
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
set.seed(12345)## setear una semilla
setupLOO <- trainControl(method = "LOOCV")
predLOO<-train(price~roomt+number_of_reviews+minimum_nights+nbhg,data=airbnb[1:1000],method="lm",trControl= setupLOO)
print(predLOO)
## Linear Regression
##
## 1000 samples
## 4 predictor
##
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation
## Summary of sample sizes: 999, 999, 999, 999, 999, 999, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 143.1674 0.112473 63.99506
##
## Tuning parameter 'intercept' was held constant at a value of TRUE