O programa tem como objetivo fazer uma rede neural que estime a duraçao media de uma viagem de taxi na cidade de Nova Iorque, foi usada a base de dados no link a seguir: https://www.kaggle.com/c/nyc-taxi-trip-duration/notebooks
suppressMessages(suppressWarnings(library(tidyverse)))
library(keras)
model <- keras::load_model_hdf5(filepath = "modelo_rnn")
mod_boost <- xgboost::xgb.load(modelfile = "mod_boost1")
Parte do código acima tem como finalidade carregar os pacotes e os arquivos que serão necessários para a resolução do problema.
train <- read.csv("/opt/datasets/houseprices/train.csv",sep = ",",
dec = ".")
train <- readr::read_csv("/opt/datasets/houseprices/train.csv",na = "character")
test <- read.csv("/opt/datasets/houseprices/test.csv",sep = ",",dec = ".")
submission <- read.csv("/opt/datasets/houseprices/sample_submission.csv",sep = ",",dec = ".")
train %>%
select_at(.vars = vars(ends_with("Area"))) %>%
select_if(is.numeric) %>%
gather(var,value) %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 25,
fill = "#2b8cbe",
colour = "black") +
theme_light() +
facet_wrap(~var, scales = "free")
Nessa parte foi feita a analise d4escritiva dos dados, para entendermos em qul tipo de distribuiçaomela se encontra.
train %>%
select_at(.vars = vars(ends_with("Sold"))) %>%
select_if(is.numeric) %>%
gather(var,value) %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 10,
fill = "#2b8cbe",
colour = "black") +
theme_light() +
facet_wrap(~var, scales = "free")
Já abaixo, será feita uma outra relação, porém com os tempos da viagem, onde é possível observar a assimetria.
train %>%
select_at(.vars = vars(ends_with("Price"))) %>%
select_if(is.numeric) %>%
gather(var,value) %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 25,
fill = "#2b8cbe",
colour = "black") +
theme_light() +
facet_wrap(~var, scales = "free")
Ao contrário da anterior, aplicando o log, vemos que esta se torna simétrica.
train %>%
select_at(.vars = vars(ends_with("Price"))) %>%
select_if(is.numeric) %>%
gather(var,value) %>%
ggplot(aes(x = log(value))) +
geom_histogram(bins = 25,
fill = "#2b8cbe",
colour = "black") +
theme_light() +
facet_wrap(~var, scales = "free")
train_num <- train %>%
select_if(is.numeric) %>%
select(-Id,-MSSubClass,-SalePrice,-OverallQual,-OverallCond,-GarageCars) %>%
mutate_at(.vars = vars(ends_with("Area")),.funs = function(x)log(x+1)) %>%
mutate_at(.vars = vars(ends_with("Sold")),.funs = function(x)log(x+1)) %>%
mutate_at(.vars = vars(ends_with("Price")),.funs = function(x)log(x+1))
mean_train <- train_num %>%
summarise_if(is.numeric,mean,na.rm = TRUE) %>%
tidyr::gather(key,xbar)
sd_train <- train_num %>%
summarise_if(is.numeric,sd,na.rm = TRUE) %>%
tidyr::gather(key,s)
set.seed(123)
train_num <- train_num %>%
mutate_all(.funs = function(x) (x - mean(x))/sd(x)) %>%
mutate(Id = train$Id,
Flag = sample(c(1,0),size = nrow(train),replace = T,prob = c(.2,.8)),
MSZoning = factor(train$MSZoning),
MSSubClass = factor(train$MSSubClass),
OverallQual = factor(train$OverallQual),
OverallCond = factor(train$OverallCond),
GarageCars = factor(train$GarageCars),
Neighborhood = factor(train$Neighborhood))
y_train <- train$SalePrice[train_num$Flag==0]
y_train_test <- train$SalePrice[train_num$Flag==1]
Xtrain <- model.matrix(~.-1,data = train_num %>% filter(Flag == 0) %>% dplyr::select(-Id, -Flag))
shape <- dim(Xtrain)
shape[2]
Acima, no código em questão, estão alojadas as médias e os desvios padrões das variáveis a fim de normalizá-las posteriormente.
model <- keras_model_sequential() %>%
layer_dense(units = 200, activation = "linear",
input_shape = shape[2], kernel_initializer='normal') %>%
layer_dropout(0.3) %>%
layer_dense(units = 50, activation = "linear", kernel_initializer='normal') %>%
layer_dropout(0.15) %>%
layer_dense(units = 15, activation = "linear", kernel_initializer='normal') %>%
layer_dense(units = 1)
model %>% compile(
optimizer = optimizer_adam(),
loss = 'mse',
metrics = list("mean_absolute_error")
)
history <- model %>% fit(
Xtrain,
log(y_train),
epochs = 20,
validation_split = 0.2,
verbose = 1
)
Aqui se deu início ao treinamento com base no keras, e em seguida serão feitos os testes.
Xtrain_test <- model.matrix(~.-1,train_num %>%
filter(Flag == 1) %>%
dplyr::select(-Id, -Flag))
evaluate(model,
x = Xtrain_test,
y = log(y_train_test)
)
data.frame(y = y_train_test,
yhat = exp(predict(model,Xtrain_test)[,1])) %>%
plot(pch = 19)
curve(1*x,add = T,col = "red")
data.frame(y = y_train_test,
yhat = exp(predict(model,Xtrain_test)[,1])) %>%
summarise(rmse = sqrt(sum((y - yhat)^2)/298))
normaliza_teste <- function(d){
name_var <- colnames(d)
n <- ncol(d)
res <- data.frame()
res_aux <- data.frame()
for(i in 1:n){
res_aux <- d %>%
dplyr::select(.data[[name_var[i]]]) %>%
mutate(key = name_var[i]) %>%
left_join(mean_train,by = "key") %>%
left_join(sd_train,by = "key") %>%
transmute(x = (.data[[name_var[i]]] - xbar)/s) %>%
as_tibble()
if(i==1){
res <- res_aux
}
else{
res <- cbind.data.frame(res,res_aux)
}
}
colnames(res) <- name_var
res <- as_tibble(res)
return(res)
}
Aqui foi utilizado as médias e os desvios padrões que foram alojados anteriormente com o objetivo de normalizar as variáveis.
for_test <- names(train_num)
for_test <- for_test[c(-30,-31)]
test2 <- test %>%
mutate(`1stFlrSF` = X1stFlrSF,
`2ndFlrSF` = X2ndFlrSF,
`3SsnPorch` = X3SsnPorch) %>%
dplyr::select(-X1stFlrSF,-X2ndFlrSF,-X3SsnPorch)
for_test%in%names(test)
test2 <- test2[,for_test] %>%
dplyr::select_if(is.numeric) %>%
dplyr::select(-MSSubClass,-OverallQual,-OverallCond,-GarageCars) %>%
dplyr::mutate_at(.vars = vars(ends_with("Area")),.funs = function(x)log(x+1)) %>%
dplyr::mutate_at(.vars = vars(ends_with("Sold")),.funs = function(x)log(x+1)) %>%
dplyr::mutate_at(.vars = vars(ends_with("Price")),.funs = function(x)log(x+1)) %>%
normaliza_teste() %>%
mutate(MSZoning = factor(test$MSZoning),
MSSubClass = test$MSSubClass,
OverallQual = factor(test$OverallQual),
OverallCond = factor(test$OverallCond),
GarageCars = test$GarageCars,
Neighborhood = factor(test$Neighborhood)) %>%
mutate(MSSubClass = if_else(MSSubClass==150,160,as.numeric(MSSubClass)) %>%
factor(),
GarageCars = if_else(GarageCars==5,4,as.numeric(GarageCars)) %>%
factor())
test2 %>%
dplyr::select_if(anyNA) %>%
head(5)
mod_GarageCars <- partykit::ctree(GarageCars~.-(BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+
TotalBsmtSF+BsmtFullBath+
BsmtHalfBath+GarageArea+MSZoning),
data = dplyr::select(train_num,-Id,-Flag))
table(test2$GarageCars,predict(mod_GarageCars,test2))
mod_MSZoning <- partykit::ctree(MSZoning~.-(BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+
TotalBsmtSF+BsmtFullBath+
BsmtHalfBath+GarageArea+MSZoning),
data = dplyr::select(train_num,-Id,-Flag))
table(test2$MSZoning,predict(mod_MSZoning,test2))
mod_BsmtFinSF1 <- lm(BsmtFinSF1~.-(BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+
TotalBsmtSF+BsmtFullBath+
BsmtHalfBath+GarageArea+MSZoning),
data = dplyr::select(train_num,
-Id,-Flag))
mod_BsmtFinSF1 <- step(mod_BsmtFinSF1,trace = 0)
summary(mod_BsmtFinSF1)
mod_BsmtFinSF2 <- lm(BsmtFinSF2~.-(BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+
TotalBsmtSF+BsmtFullBath+
BsmtHalfBath+GarageArea+MSZoning),
data = dplyr::select(train_num,-Id,-Flag))
mod_BsmtFinSF2 <- step(mod_BsmtFinSF2,trace = 0)
summary(mod_BsmtFinSF2)
mod_BsmtUnfSF <- lm(BsmtUnfSF~.-(BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+
TotalBsmtSF+BsmtFullBath+
BsmtHalfBath+GarageArea),
data = dplyr::select(train_num,
-Id,-Flag))
mod_BsmtUnfSF <- step(mod_BsmtUnfSF,trace = 0)
summary(mod_BsmtUnfSF)
mod_TotalBsmtSF <- lm(TotalBsmtSF~.-(BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+
TotalBsmtSF+BsmtFullBath+
BsmtHalfBath+GarageArea),
data = dplyr::select(train_num,
-Id,-Flag))
mod_TotalBsmtSF <- step(mod_TotalBsmtSF,trace = 0)
summary(mod_TotalBsmtSF)
mod_BsmtFullBath <- lm(BsmtFullBath~.-(BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+
TotalBsmtSF+BsmtFullBath+
BsmtHalfBath+GarageArea),
data = dplyr::select(train_num,
-Id,-Flag))
mod_BsmtFullBath <- step(mod_BsmtFullBath,trace = 0)
summary(mod_BsmtFullBath)
mod_BsmtHalfBath <- lm(BsmtHalfBath~.-(BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+
TotalBsmtSF+BsmtFullBath+
BsmtHalfBath+GarageArea),
data = dplyr::select(train_num,
-Id,-Flag))
mod_BsmtHalfBath <- step(mod_BsmtHalfBath,trace = 0)
summary(mod_BsmtHalfBath)
mod_GarageArea <- lm(GarageArea~.-(BsmtFinSF1+BsmtFinSF2+BsmtUnfSF+
TotalBsmtSF+BsmtFullBath+
BsmtHalfBath+MSZoning),
data = dplyr::select(train_num,
-Id,-Flag))
mod_GarageArea <- step(mod_GarageArea,trace = 0)
summary(mod_GarageArea)
test2 <- test2 %>%
mutate(BsmtFinSF1 = if_else(is.na(BsmtFinSF1)==TRUE,
predict(mod_BsmtFinSF1,newdata = test2),
BsmtFinSF1),
BsmtFinSF2 = if_else(is.na(BsmtFinSF2)==TRUE,
predict(mod_BsmtFinSF2,newdata = test2),
BsmtFinSF2),
BsmtUnfSF = if_else(is.na(BsmtUnfSF)==TRUE,
predict(mod_BsmtUnfSF,newdata = test2),
BsmtUnfSF),
TotalBsmtSF = if_else(is.na(TotalBsmtSF)==TRUE,
predict(mod_TotalBsmtSF,newdata = test2),
TotalBsmtSF),
BsmtFullBath = if_else(is.na(BsmtFullBath)==TRUE,
predict(mod_BsmtFullBath,newdata = test2),
BsmtFullBath),
BsmtHalfBath = if_else(is.na(BsmtHalfBath)==TRUE,
predict(mod_BsmtHalfBath,newdata = test2),
BsmtHalfBath),
GarageCars = if_else(is.na(GarageCars)==TRUE,
predict(mod_GarageCars,newdata = test2),
GarageCars),
MSZoning = if_else(is.na(MSZoning)==TRUE,
predict(mod_MSZoning,newdata = test2),
MSZoning))
test2 <- test2 %>%
mutate(GarageArea = if_else(is.na(GarageArea)==TRUE,
predict(mod_GarageArea,newdata = test2),
GarageArea),
Id = test$Id)
Xtest <- model.matrix(~.-1,test2 %>%
select(-"Id"))
dim(Xtest)
colnames(Xtest)%in%colnames(Xtrain)
pred <- predict(model,Xtest)
ks.test(pred,log(y_train))
mod_boost <- xgboost::xgboost(data = Xtrain,label = log(y_train),
nrounds = 300,eval_metric = "rmse",
params = list(eta = 0.097,
gamma = 0,
max_depth = 15,
nround = 100,
subsample = 0.5,
seed = 123,
objective = "reg:linear"))
data.frame(yhat = exp(predict(mod_boost,newdata = Xtrain_test)),
y = y_train_test) %>%
plot()
curve(1*x,add = TRUE,col = "red")
data.frame(yhat = exp(predict(mod_boost,newdata = Xtrain_test)),
y = y_train_test) %>%
summarise(rmse = sqrt(sum((y - yhat)^2)/298))
ks.test(exp(predict(mod_boost,newdata = Xtest)),y_train)