library(tibble)
library(dplyr)
library(Hmisc)
library(MASS)
library(randomForest)
library(nnet)
library(gamlss.add)
# RMSE computed on the log scale (i.e., the RMSLE metric)
RMSE <- function(X, Y){
  Error <- (log(X) - log(Y))^2
  RMSE <- sqrt(mean(Error))
  return(RMSE)
}
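A quick sanity check of the metric with made-up prices (hypothetical values, not from the data): identical vectors give 0, and a small relative error gives a small score.
RMSE(c(100000, 200000), c(100000, 200000))  # 0
RMSE(c(100000, 200000), c(110000, 190000))  # ~0.077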
train <- read.csv("train.csv")
test <- read.csv("test.csv")
# Count NAs per column
na_count <- sapply(train, function(y) sum(is.na(y)))
# Collect the counts in a data frame
na_count <- data.frame(na_count)
# Rearrange: variable names as a column, sorted by NA count
na_count <- na_count %>%
  tibble::rownames_to_column("Variable") %>%
  arrange(desc(na_count))
head(na_count)
## Variable na_count
## 1 PoolQC 1453
## 2 MiscFeature 1406
## 3 Alley 1369
## 4 Fence 1179
## 5 FireplaceQu 690
## 6 LotFrontage 259
# Columns with more than 500 NAs are discarded;
# their names are stored first
cols_delete <- na_count$Variable[1:5]
cols_delete
## [1] "PoolQC" "MiscFeature" "Alley" "Fence" "FireplaceQu"
# Columns to impute
cols_impute <- na_count$Variable[6:19]
cols_impute
## [1] "LotFrontage" "GarageType" "GarageYrBlt" "GarageFinish" "GarageQual"
## [6] "GarageCond" "BsmtExposure" "BsmtFinType2" "BsmtQual" "BsmtCond"
## [11] "BsmtFinType1" "MasVnrType" "MasVnrArea" "Electrical"
A function is created that:

- Drops the columns with more than 500 NAs identified above.
- Converts year variables, which are numeric, to factors.
- Converts strings to factors.
- Imputes missing values with the mean or the mode, as appropriate, using the recommended Hmisc package.
# FUNCTION THAT TRANSFORMS THE DATA SETS
trans_df <- function(df){
  # Drop the columns with more than 500 NAs
  df <- df[!names(df) %in% cols_delete]
  # Year variables are treated as factors rather than numbers
  cols_factor <- c("YearBuilt", "YearRemodAdd")
  df[cols_factor] <- lapply(df[cols_factor], as.factor)
  # Strings as factors
  df <- df %>% mutate_if(is.character, as.factor)
  # Impute with the mean (integer columns) or the mode (factors)
  for (i in seq_along(cols_impute)){
    if (class(df[, cols_impute[i]]) == "integer"){
      df[, cols_impute[i]] <- impute(df[, cols_impute[i]], mean)
    }
    else if (class(df[, cols_impute[i]]) == "factor"){
      df[, cols_impute[i]] <- impute(df[, cols_impute[i]], mode)
    }
  }
  # Output
  return(df)
}
# Apply the transformation function to each data set
train <- trans_df(train)
test <- trans_df(test)
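As a quick check (not in the original script), the imputed columns can be verified to contain no remaining NAs:
# Should be 0 after imputation
sum(is.na(train[cols_impute]))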
set.seed(99)
# Hold out 100 rows of train as a validation set
indices_validacion <- sample(1:nrow(train), size = 100)
validacion <- train[indices_validacion, ]
train2 <- anti_join(train, validacion, by = "Id")
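The next block evaluates a model called mod_fit that is not defined in this section. A minimal sketch, assuming an ordinary linear model on the same predictors used for the random forest below (the actual baseline may differ, e.g. a stepwise fit with MASS::stepAIC):
mod_fit <- lm(SalePrice ~ MSZoning + Street + LotShape + LandContour +
  LotConfig + LandSlope + Neighborhood + Condition1 + Condition2 +
  OverallQual + MasVnrArea + BsmtFinSF1 + LotArea + TotalBsmtSF +
  GrLivArea + TotRmsAbvGrd + Fireplaces + GarageArea + WoodDeckSF,
  data = train2)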
# On the training set
RMSE(train2$SalePrice, predict(mod_fit, newdata = train2))
## [1] 0.1170181
# On the validation set
RMSE(validacion$SalePrice, predict(mod_fit, newdata = validacion))
## [1] 0.144431
mod_rf <- randomForest(formula = SalePrice ~ MSZoning +
Street + LotShape + LandContour + LotConfig + LandSlope +
Neighborhood + Condition1 + Condition2 + OverallQual + MasVnrArea +
BsmtFinSF1 + LotArea + TotalBsmtSF + GrLivArea + TotRmsAbvGrd +
Fireplaces + GarageArea + WoodDeckSF, data = train2
)
mod_rf
##
## Call:
## randomForest(formula = SalePrice ~ MSZoning + Street + LotShape + LandContour + LotConfig + LandSlope + Neighborhood + Condition1 + Condition2 + OverallQual + MasVnrArea + BsmtFinSF1 + LotArea + TotalBsmtSF + GrLivArea + TotRmsAbvGrd + Fireplaces + GarageArea + WoodDeckSF, data = train2)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 6
##
## Mean of squared residuals: 794371154
## % Var explained: 87.44
# On the training set
RMSE(train2$SalePrice, predict(mod_rf, newdata = train2))
## [1] 0.06987373
# On the validation set
RMSE(validacion$SalePrice, predict(mod_rf, newdata = validacion))
## [1] 0.1490881
mod_rf2 <- randomForest(formula = SalePrice ~ LandContour +
Neighborhood + OverallQual + MasVnrArea +
BsmtFinSF1 + LotArea + TotalBsmtSF + GrLivArea + TotRmsAbvGrd +
Fireplaces + GarageArea + WoodDeckSF, data = train2,
importance = TRUE
)
mod_rf2
##
## Call:
## randomForest(formula = SalePrice ~ LandContour + Neighborhood + OverallQual + MasVnrArea + BsmtFinSF1 + LotArea + TotalBsmtSF + GrLivArea + TotRmsAbvGrd + Fireplaces + GarageArea + WoodDeckSF, data = train2, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 790019312
## % Var explained: 87.51
varImpPlot(mod_rf2)
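The importance plot is not reproduced here; the same ranking can be read numerically (this assumes the fit above used importance = TRUE, so the permutation-based %IncMSE column is available):
# Variables ranked by permutation importance
imp <- importance(mod_rf2)
imp[order(imp[, "%IncMSE"], decreasing = TRUE), ]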
# On the training set
RMSE(train2$SalePrice, predict(mod_rf2, newdata = train2))
## [1] 0.06949556
# On the validation set
RMSE(validacion$SalePrice, predict(mod_rf2, newdata = validacion))
## [1] 0.1541612
cc <- nn.control(size = 3, decay = 1e-3, linout = TRUE, skip = TRUE,
                 maxit = 1000, Hess = TRUE)
# GAMLSS with a gamma response (log link) and a neural-network term
mod_gamlss <- gamlss(SalePrice ~ nn(~ LandContour +
Neighborhood + OverallQual + MasVnrArea +
BsmtFinSF1 + LotArea + TotalBsmtSF + GrLivArea + TotRmsAbvGrd +
Fireplaces + GarageArea + WoodDeckSF,
size=3,
control=cc),
data = train2,
family = GA())
## GAMLSS-RS iteration 1: Global Deviance = 31213.73
## GAMLSS-RS iteration 2: Global Deviance = 31213.72
# On the training set
RMSE(train2$SalePrice, predict(mod_gamlss, newdata = train2, type = "response"))
## new prediction
## [1] 0.1465688
# On the validation set
RMSE(validacion$SalePrice, predict(mod_gamlss, newdata = validacion, type = "response"))
## new prediction
## [1] 0.1551517
pred_gamlss <- predict(mod_gamlss, newdata = test, type = "response")
## new prediction
intento_gamlss <- data.frame(Id = 1461:2919, SalePrice = pred_gamlss)
# Consistency fix: replace any negative prediction with a fixed positive value
intento_gamlss <- intento_gamlss %>%
  mutate(SalePrice = case_when(SalePrice < 0 ~ 17627,
                               TRUE ~ as.numeric(SalePrice)))
# Some NAs appeared in the predictions; fill them with the mean
intento_gamlss$SalePrice[is.na(intento_gamlss$SalePrice)] <- mean(na.omit(intento_gamlss$SalePrice))
write.csv(intento_gamlss, "intento_gamlss.csv", row.names = FALSE)
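A final sanity check before submitting (a small sketch, not part of the original script):
# The submission must have 1459 rows and only positive, non-missing prices
stopifnot(nrow(intento_gamlss) == 1459,
          !anyNA(intento_gamlss$SalePrice),
          all(intento_gamlss$SalePrice > 0))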