Esta aplicação tem como objetivo aplicar redes neurais (e outros modelos) em problemas de regressão. Para isso, será utilizada a base de dados da competição do Kaggle denominada House Prices. Os dados podem ser obtidos no seguinte endereço: https://www.kaggle.com/c/house-prices-advanced-regression-techniques.
Carregando pacotes e criando as matrizes de dados
Inicialmente, vamos carregar os pacotes do tidyverse e fazer uma limpeza nos dados. Essa é uma fase muito importante, pois é aqui que iremos gerar a matéria-prima para nosso modelo.
suppressMessages(suppressWarnings(library(tidyverse)))
library(keras)
# Load the previously saved models from disk.
# NOTE(review): `model` is assigned again further down by
# keras_model_sequential(); confirm which of the two objects is meant to be used.
model <- keras::load_model_hdf5(filepath = "modelo_rnn")
mod_boost <- xgboost::xgb.load(modelfile = "mod_boost1")
## [00:22:22] WARNING: amalgamation/../src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
# Training data, read a single time with readr.
# NOTE(review): `na` lists the strings parsed as missing, so na = "character"
# keeps literal "NA" cells as text instead of turning them into NA; this looks
# deliberate but should be confirmed.
train <- readr::read_csv("/opt/datasets/houseprices/train.csv", na = "character")
## Parsed with column specification:
## cols(
## .default = col_character(),
## Id = col_double(),
## MSSubClass = col_double(),
## LotArea = col_double(),
## OverallQual = col_double(),
## OverallCond = col_double(),
## YearBuilt = col_double(),
## YearRemodAdd = col_double(),
## BsmtFinSF1 = col_double(),
## BsmtFinSF2 = col_double(),
## BsmtUnfSF = col_double(),
## TotalBsmtSF = col_double(),
## `1stFlrSF` = col_double(),
## `2ndFlrSF` = col_double(),
## LowQualFinSF = col_double(),
## GrLivArea = col_double(),
## BsmtFullBath = col_double(),
## BsmtHalfBath = col_double(),
## FullBath = col_double(),
## HalfBath = col_double(),
## BedroomAbvGr = col_double()
## # ... with 15 more columns
## )
## See spec(...) for full column specifications.
# Test set and sample submission file of the competition. The separator and
# decimal mark are passed explicitly.
test <- read.csv(
  file = "/opt/datasets/houseprices/test.csv",
  sep = ",",
  dec = "."
)
submission <- read.csv(
  file = "/opt/datasets/houseprices/sample_submission.csv",
  sep = ",",
  dec = "."
)
Primeiro, vamos trabalhar nas variáveis númericas, em especial nas variaveis que indicam tamanho de área e sobre vendas preços de venda.
Antes de fazer algo, vamos observar a distribuição dessas variáveis.
Começando pelas variáveis de área.
# Histograms (25 bins) of every numeric column whose name ends in "Area",
# one facet per variable, each facet with its own axis scales.
train %>%
  select_at(.vars = vars(ends_with("Area"))) %>%
  select_if(is.numeric) %>%
  gather(key = "var", value = "value") %>%
  ggplot(mapping = aes(x = value)) +
  facet_wrap(~var, scales = "free") +
  geom_histogram(bins = 25, fill = "#2b8cbe", colour = "black") +
  theme_light()
Veja que a variável GarageArea possui vários valores como zero e que ela parece ser assimétrica. A assimetria parece estar presente também em GrLivArea e PoolArea, sendo que são pouquíssimas casas com valores diferentes de zero para esta variável.
Variáveis sobre vendas.
# Histograms (10 bins) of every numeric column whose name ends in "Sold",
# one facet per variable, each facet with its own axis scales.
train %>%
  select_at(.vars = vars(ends_with("Sold"))) %>%
  select_if(is.numeric) %>%
  gather(key = "var", value = "value") %>%
  ggplot(mapping = aes(x = value)) +
  facet_wrap(~var, scales = "free") +
  geom_histogram(bins = 10, fill = "#2b8cbe", colour = "black") +
  theme_light()
Aqui não temos nada muito fora dos padrões, somente que YrSold possui casas com ano de venda recentes.
Variável resposta/target, SalePrice.
# Histogram (25 bins) of the numeric column(s) whose name ends in "Price",
# i.e. the response on its original scale.
train %>%
  select_at(.vars = vars(ends_with("Price"))) %>%
  select_if(is.numeric) %>%
  gather(key = "var", value = "value") %>%
  ggplot(mapping = aes(x = value)) +
  facet_wrap(~var, scales = "free") +
  geom_histogram(bins = 25, fill = "#2b8cbe", colour = "black") +
  theme_light()
A variável resposta mostra-se bastante assimétrica à direita. Para essas variáveis assimétricas, serão feitas transformações logaritmicas. Provavelmente teremos problemas de estimação nestas caudas.
A seguir o target com transformação log.
# Same histogram as for the raw price, but with the natural log applied to the
# values inside the aesthetic mapping.
train %>%
  select_at(.vars = vars(ends_with("Price"))) %>%
  select_if(is.numeric) %>%
  gather(key = "var", value = "value") %>%
  ggplot(mapping = aes(x = log(value))) +
  facet_wrap(~var, scales = "free") +
  geom_histogram(bins = 25, fill = "#2b8cbe", colour = "black") +
  theme_light()
Parece ter ficado muito bom.
Utilizaremos todas as variáveis numéricas e também algumas categóricas. As categóricas que serão utilizadas são: MSZoning, MSSubClass, OverallQual, OverallCond, GarageCars e Neighborhood.
# Numeric predictors only: drop the row id, the response and the numeric codes
# that are re-attached below as factors, then apply log(x + 1) to the columns
# whose names end in "Area", "Sold" or "Price".
train_num <- train %>%
  select_if(is.numeric) %>%
  select(
    -Id, -MSSubClass, -SalePrice,
    -OverallQual, -OverallCond, -GarageCars
  ) %>%
  mutate_at(.vars = vars(ends_with("Area")), .funs = function(x) log(x + 1)) %>%
  mutate_at(.vars = vars(ends_with("Sold")), .funs = function(x) log(x + 1)) %>%
  mutate_at(.vars = vars(ends_with("Price")), .funs = function(x) log(x + 1))

# Long tables (key, xbar) and (key, s) with the mean and standard deviation of
# every variable. They are used later to normalise the submission data with
# the training statistics.
mean_train <- train_num %>%
  summarise_if(is.numeric, mean, na.rm = TRUE) %>%
  tidyr::gather(key, xbar)
sd_train <- train_num %>%
  summarise_if(is.numeric, sd, na.rm = TRUE) %>%
  tidyr::gather(key, s)

# Standardise every numeric column, then add the id, the hold-out flag and the
# categorical predictors. The seed fixes the random split.
set.seed(123)
train_num <- train_num %>%
  mutate_all(.funs = function(x) (x - mean(x)) / sd(x)) %>%
  mutate(
    Id = train$Id,
    # Flag == 1 marks a hold-out row (drawn with probability 0.2)
    Flag = sample(
      c(1, 0),
      size = nrow(train), replace = TRUE, prob = c(.2, .8)
    ),
    MSZoning = factor(train$MSZoning),
    MSSubClass = factor(train$MSSubClass),
    OverallQual = factor(train$OverallQual),
    OverallCond = factor(train$OverallCond),
    GarageCars = factor(train$GarageCars),
    Neighborhood = factor(train$Neighborhood)
  )

# Responses on the original scale, split with the same flag
y_train <- train$SalePrice[train_num$Flag == 0]
y_train_test <- train$SalePrice[train_num$Flag == 1]

# Training design matrix (formula without intercept) built from the rows
# with Flag == 0
Xtrain <- model.matrix(
  ~ . - 1,
  data = train_num %>% filter(Flag == 0) %>% dplyr::select(-Id, -Flag)
)
shape <- dim(Xtrain)
shape[2]## [1] 93
Vamos começar treinando um modelo de Rede Neural utilizando keras.
# Sequential network: dense layers of 200, 50 and 15 units, dropout after the
# first two, and a single output unit.
# NOTE(review): every hidden layer uses activation = "linear", so apart from
# dropout the stack is a composition of linear maps; confirm this is intended.
model <- keras_model_sequential() %>%
  layer_dense(
    units = 200,
    activation = "linear",
    input_shape = shape[2],
    kernel_initializer = "normal"
  ) %>%
  layer_dropout(rate = 0.3) %>%
  layer_dense(
    units = 50,
    activation = "linear",
    kernel_initializer = "normal"
  ) %>%
  layer_dropout(rate = 0.15) %>%
  layer_dense(
    units = 15,
    activation = "linear",
    kernel_initializer = "normal"
  ) %>%
  layer_dense(units = 1)

# Mean squared error loss, Adam optimiser, MAE reported as an extra metric
compile(
  model,
  optimizer = optimizer_adam(),
  loss = "mse",
  metrics = list("mean_absolute_error")
)

# The target is the log of the sale price; 20% of the rows passed in are used
# for validation during fitting
history <- fit(
  model,
  x = Xtrain,
  y = log(y_train),
  epochs = 20,
  validation_split = 0.2,
  verbose = 1
)
Vamos testar em nossa base de teste.
# Design matrix for the hold-out rows (Flag == 1), built with the same formula
# as the training matrix
Xtrain_test <- model.matrix(
  ~ . - 1,
  train_num %>%
    filter(Flag == 1) %>%
    dplyr::select(-Id, -Flag)
)
# Loss and mean absolute error against the log of the hold-out prices
evaluate(
  model,
  x = Xtrain_test,
  y = log(y_train_test)
)
## $loss
## [1] 0.3701013
##
## $mean_absolute_error
## [1] 0.3578395
# Observed vs. predicted prices on the original scale. The predictions are
# computed once and reused for both the plot and the RMSE.
holdout_pred <- data.frame(
  y = y_train_test,
  yhat = exp(predict(model, Xtrain_test)[, 1])
)
plot(holdout_pred, pch = 19)
# Identity line: points on it are predicted exactly
curve(1 * x, add = TRUE, col = "red")
# RMSE on the original scale. mean() divides by the actual number of hold-out
# rows, so the result stays correct if the split size changes.
holdout_pred %>%
  summarise(rmse = sqrt(mean((y - yhat)^2)))
## rmse
## 1 473130.7
Precisamos aplicar o modelo na base de submissão do Kaggle e, para isso, precisamos tratar essa base de dados. A primeira coisa a se fazer é normalizar as variáveis numéricas de acordo com a base de treinamento. A função a seguir traz o vetor de médias e de desvios-padrão de cada variável (calculados na base de treino) e aplica a normalização.
# Normalise the columns of `d` with the training-set statistics.
#
# For every column of `d`, the mean (`xbar`) and standard deviation (`s`)
# stored under the same name (`key`) in the global tables `mean_train` and
# `sd_train` are looked up, and (x - xbar) / s is returned. A column with no
# matching `key` comes back as NA.
#
# Args:
#   d: data frame (or tibble) of numeric columns.
# Returns:
#   A tibble with the same column names and order as `d`. A zero-column input
#   yields an empty tibble.
normaliza_teste <- function(d) {
  name_var <- colnames(d)
  # match() locates each column's row in the statistics tables (NA if absent)
  xbar <- mean_train[["xbar"]][match(name_var, mean_train[["key"]])]
  s <- sd_train[["s"]][match(name_var, sd_train[["key"]])]
  res <- Map(
    function(x, centre, spread) (x - centre) / spread,
    as.list(d), xbar, s
  )
  names(res) <- name_var
  as_tibble(res)
}
Algumas variáveis foram carregadas com o nome das colunas diferente da base de treino. Vamos arrumar =).
# Variables of the training data, without the bookkeeping columns. They are
# removed by name so the result does not depend on column positions.
for_test <- setdiff(names(train_num), c("Id", "Flag"))
# Fix the column names that were loaded with an "X" prefix so they match the
# names used in the training data
test2 <- test %>%
  dplyr::rename(
    `1stFlrSF` = X1stFlrSF,
    `2ndFlrSF` = X2ndFlrSF,
    `3SsnPorch` = X3SsnPorch
  )
# Checking whether every training variable is present in the test data.
# NOTE(review): this inspects `test`, not the renamed `test2`, which is why
# the three renamed columns are reported as FALSE.
for_test%in%names(test)## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE
## [12] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [23] TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [34] TRUE TRUE
# Assemble the submission predictors: keep the training variables, apply the
# same log(x + 1) transforms, normalise with the training statistics and
# attach the categorical variables as factors.
test2 <- test2[, for_test] %>%
  dplyr::select_if(is.numeric) %>%
  dplyr::select(-MSSubClass, -OverallQual, -OverallCond, -GarageCars) %>%
  dplyr::mutate_at(
    .vars = vars(ends_with("Area")),
    .funs = function(x) log(x + 1)
  ) %>%
  dplyr::mutate_at(
    .vars = vars(ends_with("Sold")),
    .funs = function(x) log(x + 1)
  ) %>%
  dplyr::mutate_at(
    .vars = vars(ends_with("Price")),
    .funs = function(x) log(x + 1)
  ) %>%
  normaliza_teste() %>%
  mutate(
    MSZoning = factor(test$MSZoning),
    # MSSubClass 150 is recoded to 160 and GarageCars 5 to 4 before building
    # the factors (presumably because those levels are absent from the
    # training data; confirm)
    MSSubClass = factor(
      if_else(test$MSSubClass == 150, 160, as.numeric(test$MSSubClass))
    ),
    OverallQual = factor(test$OverallQual),
    OverallCond = factor(test$OverallCond),
    GarageCars = factor(
      if_else(test$GarageCars == 5, 4, as.numeric(test$GarageCars))
    ),
    Neighborhood = factor(test$Neighborhood)
  )
Ainda, a base de submissao possui variáveis com valores faltantes. Precisamos inputar estes dados de alguma forma. Eu usarei modelos de árvore de decisão e regressão linear. Pode-se criar modelos bem acurados para estimar estes dados, mas isso fica para outra hora. =P
## # A tibble: 5 x 9
## BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0534 0.604 -0.673 -0.400 -0.820 -0.241
## 2 1.05 -0.289 -0.365 0.619 -0.820 -0.241
## 3 0.762 -0.289 -0.974 -0.295 -0.820 -0.241
## 4 0.347 -0.289 -0.550 -0.300 -0.820 -0.241
## 5 -0.396 -0.289 1.02 0.507 -0.820 -0.241
## # … with 3 more variables: GarageArea <dbl>, MSZoning <fct>,
## # GarageCars <fct>
# Fitting the imputation models.
# Conditional inference tree for GarageCars on the training data (without Id
# and Flag); the terms subtracted in the formula are left out of the
# predictor set.
mod_GarageCars <- partykit::ctree(
  formula = GarageCars ~ . - (BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF +
    TotalBsmtSF + BsmtFullBath +
    BsmtHalfBath + GarageArea + MSZoning),
  data = dplyr::select(train_num, -Id, -Flag)
)
# Cross-tabulate the submission values (rows) against the tree's predictions
table(test2[["GarageCars"]], predict(mod_GarageCars, newdata = test2))##
## 0 1 2 3 4
## 0 0 44 32 0 0
## 1 1 341 65 0 0
## 2 2 168 563 37 0
## 3 0 8 63 122 0
## 4 0 3 8 1 0
# Conditional inference tree for MSZoning on the training data (without Id
# and Flag); the terms subtracted in the formula are left out of the
# predictor set.
mod_MSZoning <- partykit::ctree(
  formula = MSZoning ~ . - (BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF +
    TotalBsmtSF + BsmtFullBath +
    BsmtHalfBath + GarageArea + MSZoning),
  data = dplyr::select(train_num, -Id, -Flag)
)
# Cross-tabulate the submission values (rows) against the tree's predictions
table(test2[["MSZoning"]], predict(mod_MSZoning, newdata = test2))##
## C (all) FV RH RL RM
## C (all) 0 0 0 1 14
## FV 0 70 0 4 0
## RH 0 0 2 4 4
## RL 0 14 13 1062 25
## RM 0 0 0 15 227
# Linear model for BsmtFinSF1 on the training data (without Id and Flag).
# The formula starts from every other column and subtracts the terms listed
# in parentheses (presumably the variables with missing values in the
# submission data; confirm).
mod_BsmtFinSF1 <- lm(BsmtFinSF1~.-(BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+
TotalBsmtSF+BsmtFullBath+
BsmtHalfBath+GarageArea+MSZoning),
data = dplyr::select(train_num,
-Id,-Flag))
# step() reduces the model; trace = 0 suppresses its progress log
mod_BsmtFinSF1 <- step(mod_BsmtFinSF1,trace = 0)
summary(mod_BsmtFinSF1)##
## Call:
## lm(formula = BsmtFinSF1 ~ YearBuilt + `1stFlrSF` + `2ndFlrSF` +
## GrLivArea + FullBath + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd +
## Fireplaces + WoodDeckSF + PoolArea + MSSubClass + OverallQual +
## Neighborhood, data = dplyr::select(train_num, -Id, -Flag))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7202 -0.5468 -0.0016 0.5379 4.0315
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.800778 0.632490 -2.847 0.004476 **
## YearBuilt 0.216033 0.060406 3.576 0.000360 ***
## `1stFlrSF` 0.723127 0.085566 8.451 < 2e-16 ***
## `2ndFlrSF` 0.331490 0.090027 3.682 0.000240 ***
## GrLivArea -0.409239 0.110884 -3.691 0.000232 ***
## FullBath -0.110590 0.034765 -3.181 0.001500 **
## BedroomAbvGr -0.136404 0.035128 -3.883 0.000108 ***
## KitchenAbvGr -0.082557 0.037732 -2.188 0.028835 *
## TotRmsAbvGrd -0.095035 0.049000 -1.940 0.052641 .
## Fireplaces 0.087623 0.027369 3.201 0.001398 **
## WoodDeckSF 0.044882 0.023178 1.936 0.053019 .
## PoolArea 0.066101 0.022567 2.929 0.003455 **
## MSSubClass30 -0.222643 0.143030 -1.557 0.119787
## MSSubClass40 0.194204 0.414144 0.469 0.639193
## MSSubClass45 -0.488466 0.253841 -1.924 0.054520 .
## MSSubClass50 0.091131 0.130435 0.699 0.484874
## MSSubClass60 0.328914 0.137776 2.387 0.017104 *
## MSSubClass70 0.268156 0.187117 1.433 0.152054
## MSSubClass75 0.245874 0.269502 0.912 0.361754
## MSSubClass80 -0.002696 0.117379 -0.023 0.981680
## MSSubClass85 0.525119 0.187629 2.799 0.005201 **
## MSSubClass90 0.362723 0.178783 2.029 0.042662 *
## MSSubClass120 0.068913 0.120708 0.571 0.568157
## MSSubClass160 0.290572 0.188351 1.543 0.123126
## MSSubClass180 -0.160992 0.328307 -0.490 0.623950
## MSSubClass190 0.935154 0.197607 4.732 2.44e-06 ***
## OverallQual2 0.005300 0.740253 0.007 0.994288
## OverallQual3 0.414301 0.611218 0.678 0.497992
## OverallQual4 0.588168 0.588651 0.999 0.317881
## OverallQual5 0.723435 0.588681 1.229 0.219313
## OverallQual6 0.624612 0.592260 1.055 0.291779
## OverallQual7 0.517609 0.597058 0.867 0.386128
## OverallQual8 0.494234 0.602162 0.821 0.411919
## OverallQual9 0.746426 0.615708 1.212 0.225600
## OverallQual10 1.181198 0.632447 1.868 0.062018 .
## NeighborhoodBlueste 0.657722 0.630536 1.043 0.297074
## NeighborhoodBrDale 0.952339 0.339546 2.805 0.005105 **
## NeighborhoodBrkSide 0.994497 0.276191 3.601 0.000328 ***
## NeighborhoodClearCr 1.275565 0.276676 4.610 4.39e-06 ***
## NeighborhoodCollgCr 0.967998 0.233793 4.140 3.67e-05 ***
## NeighborhoodCrawfor 1.215841 0.267549 4.544 5.98e-06 ***
## NeighborhoodEdwards 1.139622 0.253000 4.504 7.21e-06 ***
## NeighborhoodGilbert 0.664110 0.247037 2.688 0.007267 **
## NeighborhoodIDOTRR 1.031108 0.288030 3.580 0.000356 ***
## NeighborhoodMeadowV 1.065368 0.344439 3.093 0.002020 **
## NeighborhoodMitchel 1.303499 0.257652 5.059 4.77e-07 ***
## NeighborhoodNAmes 1.166863 0.245848 4.746 2.28e-06 ***
## NeighborhoodNoRidge 1.306603 0.267429 4.886 1.15e-06 ***
## NeighborhoodNPkVill 1.200363 0.355180 3.380 0.000746 ***
## NeighborhoodNridgHt 0.930973 0.236800 3.931 8.85e-05 ***
## NeighborhoodNWAmes 1.125668 0.249850 4.505 7.18e-06 ***
## NeighborhoodOldTown 0.990796 0.266990 3.711 0.000215 ***
## NeighborhoodSawyer 1.184551 0.257369 4.603 4.55e-06 ***
## NeighborhoodSawyerW 1.156202 0.247572 4.670 3.30e-06 ***
## NeighborhoodSomerst 0.724574 0.241346 3.002 0.002728 **
## NeighborhoodStoneBr 1.251064 0.268157 4.665 3.37e-06 ***
## NeighborhoodSWISU 1.201435 0.303871 3.954 8.08e-05 ***
## NeighborhoodTimber 1.277049 0.261647 4.881 1.18e-06 ***
## NeighborhoodVeenker 1.346675 0.328161 4.104 4.30e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8044 on 1401 degrees of freedom
## Multiple R-squared: 0.3787, Adjusted R-squared: 0.3529
## F-statistic: 14.72 on 58 and 1401 DF, p-value: < 2.2e-16
# Linear model for BsmtFinSF2 on the training data (without Id and Flag).
# The formula starts from every other column and subtracts the terms listed
# in parentheses (presumably the variables with missing values in the
# submission data; confirm).
mod_BsmtFinSF2 <- lm(BsmtFinSF2~.-(BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+
TotalBsmtSF+BsmtFullBath+
BsmtHalfBath+GarageArea+MSZoning),
data = dplyr::select(train_num,-Id,-Flag))
# step() reduces the model; trace = 0 suppresses its progress log
mod_BsmtFinSF2 <- step(mod_BsmtFinSF2,trace = 0)
summary(mod_BsmtFinSF2)##
## Call:
## lm(formula = BsmtFinSF2 ~ LotArea + YearBuilt + `1stFlrSF` +
## BedroomAbvGr + WoodDeckSF + EnclosedPorch + `3SsnPorch` +
## ScreenPorch + MSSubClass + OverallQual, data = dplyr::select(train_num,
## -Id, -Flag))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2993 -0.4076 -0.1690 -0.0044 8.7751
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.13110 0.68920 -0.190 0.849161
## LotArea 0.11268 0.03966 2.841 0.004563 **
## YearBuilt -0.25642 0.05589 -4.588 4.86e-06 ***
## `1stFlrSF` 0.12255 0.04139 2.961 0.003117 **
## BedroomAbvGr -0.05640 0.03322 -1.698 0.089765 .
## WoodDeckSF 0.06411 0.02746 2.335 0.019701 *
## EnclosedPorch 0.04683 0.02835 1.652 0.098719 .
## `3SsnPorch` -0.04006 0.02549 -1.572 0.116261
## ScreenPorch 0.06881 0.02631 2.616 0.009003 **
## MSSubClass30 -0.59603 0.15285 -3.900 0.000101 ***
## MSSubClass40 -0.75819 0.48820 -1.553 0.120640
## MSSubClass45 -0.67073 0.29442 -2.278 0.022867 *
## MSSubClass50 -0.48307 0.11701 -4.128 3.86e-05 ***
## MSSubClass60 0.02985 0.09542 0.313 0.754437
## MSSubClass70 -0.76083 0.17815 -4.271 2.08e-05 ***
## MSSubClass75 -0.82436 0.28108 -2.933 0.003412 **
## MSSubClass80 0.05145 0.13575 0.379 0.704741
## MSSubClass85 -0.09746 0.22189 -0.439 0.660556
## MSSubClass90 -0.26955 0.14730 -1.830 0.067456 .
## MSSubClass120 0.11637 0.13547 0.859 0.390453
## MSSubClass160 0.38553 0.16763 2.300 0.021601 *
## MSSubClass180 0.73745 0.33311 2.214 0.026997 *
## MSSubClass190 -0.72521 0.19965 -3.632 0.000291 ***
## OverallQual2 0.10321 0.87970 0.117 0.906620
## OverallQual3 0.01420 0.71747 0.020 0.984211
## OverallQual4 0.11808 0.69137 0.171 0.864411
## OverallQual5 0.40356 0.68921 0.586 0.558282
## OverallQual6 0.28706 0.69162 0.415 0.678165
## OverallQual7 0.20076 0.69600 0.288 0.773042
## OverallQual8 0.14037 0.70141 0.200 0.841415
## OverallQual9 -0.21480 0.71619 -0.300 0.764277
## OverallQual10 -0.23383 0.74453 -0.314 0.753519
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9632 on 1428 degrees of freedom
## Multiple R-squared: 0.09201, Adjusted R-squared: 0.0723
## F-statistic: 4.668 on 31 and 1428 DF, p-value: 1.155e-15
# Linear model for BsmtUnfSF on the training data (without Id and Flag).
# The formula starts from every other column and subtracts the terms listed
# in parentheses.
# NOTE(review): unlike the BsmtFinSF1 and BsmtFinSF2 models in this file,
# MSZoning is not subtracted here; confirm that this is intended.
mod_BsmtUnfSF <- lm(BsmtUnfSF~.-(BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+
TotalBsmtSF+BsmtFullBath+
BsmtHalfBath+GarageArea),
data = dplyr::select(train_num,
-Id,-Flag))
# step() reduces the model; trace = 0 suppresses its progress log
mod_BsmtUnfSF <- step(mod_BsmtUnfSF,trace = 0)
summary(mod_BsmtUnfSF)##
## Call:
## lm(formula = BsmtUnfSF ~ YearBuilt + `2ndFlrSF` + GrLivArea +
## FullBath + BedroomAbvGr + KitchenAbvGr + Fireplaces + WoodDeckSF +
## PoolArea + MSSubClass + OverallQual + GarageCars + Neighborhood,
## data = dplyr::select(train_num, -Id, -Flag))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.50598 -0.57032 -0.03428 0.54607 2.87743
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.179397 0.652849 1.807 0.071049 .
## YearBuilt 0.115485 0.063339 1.823 0.068473 .
## `2ndFlrSF` -0.303597 0.062631 -4.847 1.39e-06 ***
## GrLivArea 0.324735 0.052816 6.148 1.02e-09 ***
## FullBath 0.075727 0.036336 2.084 0.037337 *
## BedroomAbvGr 0.206930 0.033307 6.213 6.85e-10 ***
## KitchenAbvGr 0.118046 0.038203 3.090 0.002041 **
## Fireplaces -0.095690 0.028309 -3.380 0.000744 ***
## WoodDeckSF -0.060861 0.024078 -2.528 0.011591 *
## PoolArea -0.038155 0.022821 -1.672 0.094762 .
## MSSubClass30 0.628689 0.148489 4.234 2.45e-05 ***
## MSSubClass40 0.178031 0.428637 0.415 0.677955
## MSSubClass45 0.863666 0.263995 3.272 0.001096 **
## MSSubClass50 0.216339 0.129138 1.675 0.094106 .
## MSSubClass60 -0.370699 0.137094 -2.704 0.006935 **
## MSSubClass70 0.191794 0.187898 1.021 0.307557
## MSSubClass75 0.164667 0.271033 0.608 0.543583
## MSSubClass80 -0.436150 0.121191 -3.599 0.000331 ***
## MSSubClass85 -0.555997 0.195597 -2.843 0.004540 **
## MSSubClass90 -0.702143 0.188005 -3.735 0.000195 ***
## MSSubClass120 -0.103553 0.128055 -0.809 0.418848
## MSSubClass160 -0.522347 0.190481 -2.742 0.006180 **
## MSSubClass180 -0.516206 0.341107 -1.513 0.130423
## MSSubClass190 -0.659040 0.205847 -3.202 0.001397 **
## OverallQual2 -0.008521 0.771933 -0.011 0.991194
## OverallQual3 -0.311560 0.632300 -0.493 0.622273
## OverallQual4 -0.132353 0.610533 -0.217 0.828410
## OverallQual5 -0.195325 0.611318 -0.320 0.749384
## OverallQual6 0.056667 0.614232 0.092 0.926507
## OverallQual7 0.200870 0.619150 0.324 0.745662
## OverallQual8 0.261634 0.625622 0.418 0.675866
## OverallQual9 0.183519 0.642034 0.286 0.775043
## OverallQual10 0.119246 0.663276 0.180 0.857349
## GarageCars1 -0.011519 0.112561 -0.102 0.918507
## GarageCars2 -0.163171 0.115196 -1.416 0.156860
## GarageCars3 0.104252 0.146072 0.714 0.475530
## GarageCars4 0.051580 0.395930 0.130 0.896367
## NeighborhoodBlueste -0.999934 0.655046 -1.527 0.127109
## NeighborhoodBrDale -0.676323 0.353723 -1.912 0.056079 .
## NeighborhoodBrkSide -0.969109 0.286267 -3.385 0.000731 ***
## NeighborhoodClearCr -1.207335 0.288907 -4.179 3.11e-05 ***
## NeighborhoodCollgCr -0.907552 0.243722 -3.724 0.000204 ***
## NeighborhoodCrawfor -1.301920 0.278730 -4.671 3.29e-06 ***
## NeighborhoodEdwards -1.153361 0.263785 -4.372 1.32e-05 ***
## NeighborhoodGilbert -0.657741 0.259372 -2.536 0.011324 *
## NeighborhoodIDOTRR -0.857441 0.299933 -2.859 0.004316 **
## NeighborhoodMeadowV -0.899838 0.357761 -2.515 0.012008 *
## NeighborhoodMitchel -1.227167 0.268096 -4.577 5.13e-06 ***
## NeighborhoodNAmes -1.161110 0.256657 -4.524 6.58e-06 ***
## NeighborhoodNoRidge -1.170028 0.271322 -4.312 1.73e-05 ***
## NeighborhoodNPkVill -0.876430 0.369491 -2.372 0.017827 *
## NeighborhoodNridgHt -0.814525 0.245182 -3.322 0.000916 ***
## NeighborhoodNWAmes -1.094725 0.262511 -4.170 3.23e-05 ***
## NeighborhoodOldTown -0.854935 0.276956 -3.087 0.002062 **
## NeighborhoodSawyer -1.230536 0.268578 -4.582 5.02e-06 ***
## NeighborhoodSawyerW -1.388686 0.257719 -5.388 8.33e-08 ***
## NeighborhoodSomerst -0.654013 0.250242 -2.614 0.009058 **
## NeighborhoodStoneBr -1.190713 0.278332 -4.278 2.01e-05 ***
## NeighborhoodSWISU -1.127279 0.315972 -3.568 0.000372 ***
## NeighborhoodTimber -1.349352 0.271458 -4.971 7.49e-07 ***
## NeighborhoodVeenker -1.541636 0.342139 -4.506 7.16e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8353 on 1399 degrees of freedom
## Multiple R-squared: 0.331, Adjusted R-squared: 0.3023
## F-statistic: 11.53 on 60 and 1399 DF, p-value: < 2.2e-16
# Linear model for TotalBsmtSF on the training data (without Id and Flag).
# The formula starts from every other column and subtracts the terms listed
# in parentheses.
# NOTE(review): unlike the BsmtFinSF1 and BsmtFinSF2 models in this file,
# MSZoning is not subtracted here; confirm that this is intended.
mod_TotalBsmtSF <- lm(TotalBsmtSF~.-(BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+
TotalBsmtSF+BsmtFullBath+
BsmtHalfBath+GarageArea),
data = dplyr::select(train_num,
-Id,-Flag))
# step() reduces the model; trace = 0 suppresses its progress log
mod_TotalBsmtSF <- step(mod_TotalBsmtSF,trace = 0)
summary(mod_TotalBsmtSF)##
## Call:
## lm(formula = TotalBsmtSF ~ YearBuilt + YearRemodAdd + `1stFlrSF` +
## GrLivArea + FullBath + BedroomAbvGr + TotRmsAbvGrd + OpenPorchSF +
## PoolArea + MSSubClass + OverallQual + GarageCars, data = dplyr::select(train_num,
## -Id, -Flag))
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4729 -0.0942 0.0574 0.1998 3.6108
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.801315 0.369902 -2.166 0.030455 *
## YearBuilt 0.228480 0.031516 7.250 6.83e-13 ***
## YearRemodAdd -0.035751 0.018394 -1.944 0.052136 .
## `1stFlrSF` 0.800214 0.035009 22.857 < 2e-16 ***
## GrLivArea -0.114470 0.047547 -2.407 0.016189 *
## FullBath -0.046661 0.021119 -2.209 0.027305 *
## BedroomAbvGr 0.045065 0.021534 2.093 0.036547 *
## TotRmsAbvGrd -0.062438 0.029150 -2.142 0.032364 *
## OpenPorchSF 0.049799 0.014706 3.386 0.000728 ***
## PoolArea 0.039728 0.013812 2.876 0.004084 **
## MSSubClass30 0.205957 0.082015 2.511 0.012142 *
## MSSubClass40 0.215035 0.259443 0.829 0.407339
## MSSubClass45 0.093209 0.156143 0.597 0.550637
## MSSubClass50 0.208767 0.074932 2.786 0.005405 **
## MSSubClass60 0.041821 0.080451 0.520 0.603263
## MSSubClass70 0.257919 0.108657 2.374 0.017743 *
## MSSubClass75 0.175930 0.164441 1.070 0.284860
## MSSubClass80 -0.417776 0.073186 -5.708 1.39e-08 ***
## MSSubClass85 -0.066452 0.117475 -0.566 0.571710
## MSSubClass90 -0.365006 0.082863 -4.405 1.14e-05 ***
## MSSubClass120 -0.013991 0.065852 -0.212 0.831779
## MSSubClass160 0.007177 0.092446 0.078 0.938127
## MSSubClass180 -0.388593 0.168558 -2.305 0.021288 *
## MSSubClass190 0.191921 0.112500 1.706 0.088233 .
## OverallQual2 0.139565 0.466262 0.299 0.764734
## OverallQual3 0.221217 0.382658 0.578 0.563284
## OverallQual4 0.602469 0.369623 1.630 0.103332
## OverallQual5 0.796172 0.370245 2.150 0.031692 *
## OverallQual6 0.885938 0.371835 2.383 0.017321 *
## OverallQual7 0.921029 0.375098 2.455 0.014190 *
## OverallQual8 0.992678 0.378228 2.625 0.008769 **
## OverallQual9 1.112491 0.386444 2.879 0.004052 **
## OverallQual10 1.485552 0.400966 3.705 0.000220 ***
## GarageCars1 -0.055653 0.068052 -0.818 0.413610
## GarageCars2 -0.135426 0.068726 -1.971 0.048971 *
## GarageCars3 0.052839 0.086572 0.610 0.541728
## GarageCars4 0.312740 0.237994 1.314 0.189036
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5077 on 1423 degrees of freedom
## Multiple R-squared: 0.7486, Adjusted R-squared: 0.7422
## F-statistic: 117.7 on 36 and 1423 DF, p-value: < 2.2e-16
# Linear model for BsmtFullBath on the training data (without Id and Flag).
# The formula starts from every other column and subtracts the terms listed
# in parentheses.
# NOTE(review): unlike the BsmtFinSF1 and BsmtFinSF2 models in this file,
# MSZoning is not subtracted here; confirm that this is intended.
mod_BsmtFullBath <- lm(BsmtFullBath~.-(BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+
TotalBsmtSF+BsmtFullBath+
BsmtHalfBath+GarageArea),
data = dplyr::select(train_num,
-Id,-Flag))
# step() reduces the model; trace = 0 suppresses its progress log
mod_BsmtFullBath <- step(mod_BsmtFullBath,trace = 0)
summary(mod_BsmtFullBath)##
## Call:
## lm(formula = BsmtFullBath ~ YearBuilt + YearRemodAdd + `1stFlrSF` +
## FullBath + BedroomAbvGr + KitchenAbvGr + Fireplaces + WoodDeckSF +
## OpenPorchSF + EnclosedPorch + PoolArea + YrSold + MSSubClass +
## Neighborhood, data = dplyr::select(train_num, -Id, -Flag))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9659 -0.6845 -0.2349 0.7560 4.5779
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.24808 0.25254 -4.942 8.66e-07 ***
## YearBuilt 0.12528 0.06527 1.919 0.055128 .
## YearRemodAdd 0.05733 0.03221 1.780 0.075298 .
## `1stFlrSF` 0.21163 0.03816 5.545 3.50e-08 ***
## FullBath -0.22675 0.03657 -6.200 7.38e-10 ***
## BedroomAbvGr -0.13109 0.03209 -4.085 4.66e-05 ***
## KitchenAbvGr -0.10721 0.03935 -2.725 0.006515 **
## Fireplaces 0.07112 0.02949 2.412 0.016010 *
## WoodDeckSF 0.09054 0.02539 3.566 0.000375 ***
## OpenPorchSF 0.05155 0.02641 1.952 0.051135 .
## EnclosedPorch 0.04825 0.02603 1.853 0.064025 .
## PoolArea 0.04178 0.02401 1.740 0.082018 .
## YrSold 0.06235 0.02361 2.641 0.008367 **
## MSSubClass30 -0.20751 0.15410 -1.347 0.178335
## MSSubClass40 -0.01439 0.44999 -0.032 0.974497
## MSSubClass45 -0.44666 0.27515 -1.623 0.104736
## MSSubClass50 -0.16186 0.11944 -1.355 0.175600
## MSSubClass60 0.08776 0.09412 0.932 0.351262
## MSSubClass70 0.02914 0.17314 0.168 0.866388
## MSSubClass75 0.04697 0.26587 0.177 0.859806
## MSSubClass80 0.10210 0.12579 0.812 0.417133
## MSSubClass85 0.69184 0.20454 3.382 0.000738 ***
## MSSubClass90 0.82612 0.19675 4.199 2.85e-05 ***
## MSSubClass120 0.25227 0.12864 1.961 0.050062 .
## MSSubClass160 0.10435 0.17690 0.590 0.555367
## MSSubClass180 1.13187 0.35599 3.179 0.001507 **
## MSSubClass190 1.04860 0.21271 4.930 9.21e-07 ***
## NeighborhoodBlueste 1.38094 0.68756 2.008 0.044785 *
## NeighborhoodBrDale 0.81116 0.36752 2.207 0.027465 *
## NeighborhoodBrkSide 1.18561 0.29613 4.004 6.56e-05 ***
## NeighborhoodClearCr 1.44391 0.29850 4.837 1.46e-06 ***
## NeighborhoodCollgCr 1.23790 0.25164 4.919 9.71e-07 ***
## NeighborhoodCrawfor 1.14303 0.28737 3.978 7.32e-05 ***
## NeighborhoodEdwards 1.35068 0.27162 4.973 7.41e-07 ***
## NeighborhoodGilbert 0.98202 0.26551 3.699 0.000225 ***
## NeighborhoodIDOTRR 0.95279 0.31265 3.047 0.002350 **
## NeighborhoodMeadowV 1.00962 0.37139 2.719 0.006638 **
## NeighborhoodMitchel 1.45226 0.27560 5.270 1.58e-07 ***
## NeighborhoodNAmes 1.22447 0.26380 4.642 3.78e-06 ***
## NeighborhoodNoRidge 1.48040 0.28203 5.249 1.76e-07 ***
## NeighborhoodNPkVill 0.71171 0.38488 1.849 0.064645 .
## NeighborhoodNridgHt 1.03827 0.25136 4.131 3.83e-05 ***
## NeighborhoodNWAmes 1.01191 0.26931 3.757 0.000179 ***
## NeighborhoodOldTown 0.99579 0.28823 3.455 0.000567 ***
## NeighborhoodSawyer 1.24833 0.27553 4.531 6.38e-06 ***
## NeighborhoodSawyerW 1.32058 0.26618 4.961 7.86e-07 ***
## NeighborhoodSomerst 0.91984 0.26177 3.514 0.000456 ***
## NeighborhoodStoneBr 1.34422 0.28502 4.716 2.64e-06 ***
## NeighborhoodSWISU 1.36174 0.32692 4.165 3.30e-05 ***
## NeighborhoodTimber 1.41545 0.28166 5.025 5.67e-07 ***
## NeighborhoodVeenker 1.59697 0.35521 4.496 7.50e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.882 on 1409 degrees of freedom
## Multiple R-squared: 0.2488, Adjusted R-squared: 0.2221
## F-statistic: 9.334 on 50 and 1409 DF, p-value: < 2.2e-16
# Linear model for BsmtHalfBath on the training data (without Id and Flag).
# The formula starts from every other column and subtracts the terms listed
# in parentheses.
# NOTE(review): unlike the BsmtFinSF1 and BsmtFinSF2 models in this file,
# MSZoning is not subtracted here; confirm that this is intended.
mod_BsmtHalfBath <- lm(BsmtHalfBath~.-(BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+
TotalBsmtSF+BsmtFullBath+
BsmtHalfBath+GarageArea),
data = dplyr::select(train_num,
-Id,-Flag))
# step() reduces the model; trace = 0 suppresses its progress log
mod_BsmtHalfBath <- step(mod_BsmtHalfBath,trace = 0)
summary(mod_BsmtHalfBath)##
## Call:
## lm(formula = BsmtHalfBath ~ LotArea + FullBath + BedroomAbvGr +
## KitchenAbvGr + YrSold + MSSubClass + OverallCond, data = dplyr::select(train_num,
## -Id, -Flag))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4953 -0.3376 -0.1715 -0.0531 7.8570
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0780020 0.9887295 0.079 0.937130
## LotArea 0.0616563 0.0366905 1.680 0.093090 .
## FullBath -0.0694509 0.0326080 -2.130 0.033352 *
## BedroomAbvGr 0.0841374 0.0334244 2.517 0.011936 *
## KitchenAbvGr -0.0939319 0.0421468 -2.229 0.025990 *
## YrSold -0.0520713 0.0257892 -2.019 0.043661 *
## MSSubClass30 -0.3092915 0.1327309 -2.330 0.019933 *
## MSSubClass40 -0.3307621 0.4926840 -0.671 0.502108
## MSSubClass45 0.0179536 0.2893769 0.062 0.950538
## MSSubClass50 -0.1283112 0.0962164 -1.334 0.182557
## MSSubClass60 -0.0016445 0.0786134 -0.021 0.983314
## MSSubClass70 -0.3958858 0.1411449 -2.805 0.005103 **
## MSSubClass75 -0.2611251 0.2548455 -1.025 0.305706
## MSSubClass80 0.4801960 0.1355926 3.541 0.000411 ***
## MSSubClass85 -0.0989491 0.2233247 -0.443 0.657780
## MSSubClass90 0.4491064 0.2122450 2.116 0.034519 *
## MSSubClass120 0.3652255 0.1339309 2.727 0.006470 **
## MSSubClass160 0.1306647 0.1676245 0.780 0.435809
## MSSubClass180 -0.0179613 0.3346941 -0.054 0.957210
## MSSubClass190 -0.0263683 0.2181277 -0.121 0.903799
## OverallCond2 0.0074328 1.0745857 0.007 0.994482
## OverallCond3 0.0492902 1.0043333 0.049 0.960864
## OverallCond4 0.0173333 0.9939545 0.017 0.986089
## OverallCond5 -0.2244319 0.9881123 -0.227 0.820354
## OverallCond6 0.0291618 0.9875742 0.030 0.976447
## OverallCond7 0.0005022 0.9880353 0.001 0.999595
## OverallCond8 0.2863048 0.9923111 0.289 0.772988
## OverallCond9 0.9608746 1.0087683 0.953 0.340993
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9762 on 1432 degrees of freedom
## Multiple R-squared: 0.06475, Adjusted R-squared: 0.04711
## F-statistic: 3.672 on 27 and 1432 DF, p-value: 8.942e-10
# Linear model for GarageArea on the training data (without Id and Flag).
# The formula starts from every other column and subtracts the basement
# variables and MSZoning listed in parentheses (presumably the variables with
# missing values in the submission data; confirm).
mod_GarageArea <- lm(GarageArea~.-(BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+
TotalBsmtSF+BsmtFullBath+
BsmtHalfBath+MSZoning),
data = dplyr::select(train_num,
-Id,-Flag))
# step() reduces the model; trace = 0 suppresses its progress log
mod_GarageArea <- step(mod_GarageArea,trace = 0)
summary(mod_GarageArea)##
## Call:
## lm(formula = GarageArea ~ LotArea + YearBuilt + `1stFlrSF` +
## `2ndFlrSF` + LowQualFinSF + FullBath + HalfBath + BedroomAbvGr +
## TotRmsAbvGrd + Fireplaces + EnclosedPorch + MSSubClass +
## OverallQual + OverallCond + GarageCars + Neighborhood, data = dplyr::select(train_num,
## -Id, -Flag))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.40990 -0.06740 -0.00964 0.05611 0.74216
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.028881 0.125775 -32.033 < 2e-16 ***
## LotArea 0.016962 0.005640 3.007 0.002683 **
## YearBuilt 0.013748 0.009517 1.445 0.148780
## `1stFlrSF` 0.044434 0.006709 6.623 5.02e-11 ***
## `2ndFlrSF` 0.033552 0.009678 3.467 0.000542 ***
## LowQualFinSF 0.008937 0.003615 2.472 0.013549 *
## FullBath -0.008881 0.005578 -1.592 0.111575
## HalfBath -0.011495 0.004922 -2.336 0.019655 *
## BedroomAbvGr -0.010265 0.005138 -1.998 0.045920 *
## TotRmsAbvGrd -0.014920 0.007039 -2.120 0.034218 *
## Fireplaces -0.009444 0.004054 -2.330 0.019958 *
## EnclosedPorch 0.006732 0.003534 1.905 0.056967 .
## MSSubClass30 -0.011521 0.021157 -0.545 0.586148
## MSSubClass40 -0.066058 0.061232 -1.079 0.280859
## MSSubClass45 -0.068730 0.037362 -1.840 0.066047 .
## MSSubClass50 -0.020056 0.018476 -1.086 0.277865
## MSSubClass60 0.020339 0.020082 1.013 0.311312
## MSSubClass70 -0.042476 0.026806 -1.585 0.113299
## MSSubClass75 -0.032665 0.040403 -0.808 0.418951
## MSSubClass80 0.039670 0.017170 2.310 0.021011 *
## MSSubClass85 0.032759 0.027750 1.181 0.237994
## MSSubClass90 -0.003119 0.019850 -0.157 0.875178
## MSSubClass120 -0.043732 0.019419 -2.252 0.024474 *
## MSSubClass160 -0.044424 0.029640 -1.499 0.134158
## MSSubClass180 -0.049761 0.049370 -1.008 0.313669
## MSSubClass190 0.022821 0.027733 0.823 0.410721
## OverallQual2 -0.065016 0.138980 -0.468 0.639994
## OverallQual3 -0.096658 0.126137 -0.766 0.443630
## OverallQual4 -0.057239 0.124398 -0.460 0.645496
## OverallQual5 -0.060826 0.124609 -0.488 0.625529
## OverallQual6 -0.089031 0.124780 -0.714 0.475651
## OverallQual7 -0.080433 0.125233 -0.642 0.520803
## OverallQual8 -0.053539 0.125974 -0.425 0.670905
## OverallQual9 -0.104116 0.127723 -0.815 0.415113
## OverallQual10 -0.084601 0.130455 -0.649 0.516761
## OverallCond2 0.098473 0.181681 0.542 0.587898
## OverallCond3 0.062954 0.170979 0.368 0.712783
## OverallCond4 0.073754 0.173783 0.424 0.671338
## OverallCond5 0.064749 0.173546 0.373 0.709136
## OverallCond6 0.098833 0.173583 0.569 0.569196
## OverallCond7 0.071406 0.173626 0.411 0.680945
## OverallCond8 0.086234 0.174089 0.495 0.620434
## OverallCond9 0.134005 0.175608 0.763 0.445538
## GarageCars1 3.895497 0.016251 239.708 < 2e-16 ***
## GarageCars2 4.267971 0.016510 258.510 < 2e-16 ***
## GarageCars3 4.516565 0.020971 215.369 < 2e-16 ***
## GarageCars4 4.592017 0.056184 81.732 < 2e-16 ***
## NeighborhoodBlueste 0.104734 0.092998 1.126 0.260275
## NeighborhoodBrDale 0.133330 0.050138 2.659 0.007921 **
## NeighborhoodBrkSide 0.038963 0.041187 0.946 0.344312
## NeighborhoodClearCr 0.034064 0.043078 0.791 0.429222
## NeighborhoodCollgCr 0.089074 0.035372 2.518 0.011907 *
## NeighborhoodCrawfor -0.009271 0.040139 -0.231 0.817363
## NeighborhoodEdwards 0.061156 0.038176 1.602 0.109394
## NeighborhoodGilbert -0.044374 0.037624 -1.179 0.238437
## NeighborhoodIDOTRR 0.118138 0.043023 2.746 0.006112 **
## NeighborhoodMeadowV 0.168689 0.050800 3.321 0.000921 ***
## NeighborhoodMitchel 0.062555 0.039042 1.602 0.109329
## NeighborhoodNAmes 0.079753 0.037066 2.152 0.031599 *
## NeighborhoodNoRidge 0.025054 0.039795 0.630 0.529071
## NeighborhoodNPkVill 0.071492 0.052440 1.363 0.173008
## NeighborhoodNridgHt 0.112076 0.035407 3.165 0.001582 **
## NeighborhoodNWAmes 0.061193 0.037990 1.611 0.107458
## NeighborhoodOldTown 0.046884 0.039703 1.181 0.237858
## NeighborhoodSawyer 0.054093 0.038853 1.392 0.164071
## NeighborhoodSawyerW 0.054017 0.037388 1.445 0.148751
## NeighborhoodSomerst 0.124631 0.036105 3.452 0.000573 ***
## NeighborhoodStoneBr 0.055408 0.040259 1.376 0.168955
## NeighborhoodSWISU 0.045674 0.045796 0.997 0.318777
## NeighborhoodTimber 0.034353 0.039761 0.864 0.387745
## NeighborhoodVeenker 0.042966 0.049757 0.864 0.388006
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1181 on 1389 degrees of freedom
## Multiple R-squared: 0.9867, Adjusted R-squared: 0.9861
## F-statistic: 1474 on 70 and 1389 DF, p-value: < 2.2e-16
# Imputing the missing data
# For each column below, observed values are kept and only the NA entries are
# replaced by the prediction of the model fitted for that column, with the
# test2 data frame passed to predict() as newdata.
test2 <- test2 %>%
  mutate(
    BsmtFinSF1 = if_else(
      is.na(BsmtFinSF1),
      predict(mod_BsmtFinSF1, newdata = test2),
      BsmtFinSF1
    ),
    BsmtFinSF2 = if_else(
      is.na(BsmtFinSF2),
      predict(mod_BsmtFinSF2, newdata = test2),
      BsmtFinSF2
    ),
    BsmtUnfSF = if_else(
      is.na(BsmtUnfSF),
      predict(mod_BsmtUnfSF, newdata = test2),
      BsmtUnfSF
    ),
    TotalBsmtSF = if_else(
      is.na(TotalBsmtSF),
      predict(mod_TotalBsmtSF, newdata = test2),
      TotalBsmtSF
    ),
    BsmtFullBath = if_else(
      is.na(BsmtFullBath),
      predict(mod_BsmtFullBath, newdata = test2),
      BsmtFullBath
    ),
    BsmtHalfBath = if_else(
      is.na(BsmtHalfBath),
      predict(mod_BsmtHalfBath, newdata = test2),
      BsmtHalfBath
    ),
    GarageCars = if_else(
      is.na(GarageCars),
      predict(mod_GarageCars, newdata = test2),
      GarageCars
    ),
    MSZoning = if_else(
      is.na(MSZoning),
      predict(mod_MSZoning, newdata = test2),
      MSZoning
    )
  )
# GarageArea is imputed in its own step: GarageCars is one of the predictors of
# mod_GarageArea, so predict() has to read test2 after the GarageCars
# imputation has been assigned back to it. Id is restored from the raw test set.
test2 <- test2 %>%
  mutate(
    GarageArea = if_else(
      is.na(GarageArea),
      predict(mod_GarageArea, newdata = test2),
      GarageArea
    ),
    Id = test$Id
  )
A hora da verdade hehe.
## [1] 1459 93
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [15] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [29] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [43] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [57] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [71] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [85] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## Warning in ks.test(pred, log(y_train)): p-value will be approximate in the
## presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: pred and log(y_train)
## D = 0.10215, p-value = 2.445e-06
## alternative hypothesis: two-sided
Agora vamos aplicar um modelo mais comum em problemas de regressão, o XGBoost.
Para utilizar esse modelo, precisaremos carregar o pacote xgboost.
# Gradient-boosted regression fitted on the log-transformed target
# (log(y_train)).
# Best value found was 0.097 (used as eta below).
#
# subsample = 0.5 makes the fit stochastic. Older releases of the xgboost R
# package ignore `seed` inside `params` and draw from R's own RNG, so R is
# seeded as well to make the row subsampling reproducible.
set.seed(123)
mod_boost <- xgboost::xgboost(
  data = Xtrain,
  label = log(y_train),
  # Number of boosting rounds. The stray `nround = 100` entry that used to sit
  # in `params` was dropped: it is not a booster parameter and contradicted
  # this value.
  nrounds = 300,
  eval_metric = "rmse",
  params = list(
    eta = 0.097, # obtained from an intensive search
    gamma = 0,
    max_depth = 15,
    subsample = 0.5, # we do not want overfitting
    seed = 123,
    # "reg:linear" is deprecated in favour of "reg:squarederror" (same loss).
    objective = "reg:squarederror"
  )
)
Hora da verdade 2 !!!
# Prediction quality on the test base (Xtrain_test / y_train_test).
# Predictions are made on the log scale, so exp() maps them back to the scale
# of y before comparing. The data frame is built once and reused for the plot
# and for the error metric.
boost_holdout <- data.frame(
  yhat = exp(predict(mod_boost, newdata = Xtrain_test)),
  y = y_train_test
)

# Predicted vs. observed; points on the red identity line are exact predictions.
plot(boost_holdout)
abline(a = 0, b = 1, col = "red")

# RMSE on the original scale. mean() divides by the actual number of rows
# instead of a hard-coded 298, so the metric stays correct if the split changes.
boost_holdout %>%
  summarise(rmse = sqrt(mean((y - yhat)^2)))
## rmse
## 1 12300.85
# Agreement with the training distribution: two-sample Kolmogorov-Smirnov test
# between the back-transformed (exp) predictions on Xtest and y_train.
ks.test(exp(predict(mod_boost,newdata = Xtest)),y_train) # Decent, but there is still a lot of room for improvement
## Warning in ks.test(exp(predict(mod_boost, newdata = Xtest)), y_train): p-
## value will be approximate in the presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: exp(predict(mod_boost, newdata = Xtest)) and y_train
## D = 0.047527, p-value = 0.105
## alternative hypothesis: two-sided