# LibrerÃas
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(neuralnet)
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Loading required package: lattice
df <- read_excel("C:\\Users\\anavi\\Downloads\\mx_tourism_state_data.xlsx", sheet = "panel_data")
df <- read_excel("C:\\Users\\anavi\\Downloads\\mx_tourism_state_data.xlsx")
df <- df %>%
arrange(state, year) %>%
group_by(state) %>%
mutate(tourism_gdp_lag1 = lag(tourism_gdp, 1)) %>%
ungroup() %>%
select(tourism_gdp,
real_wage,
pop_density,
college_education,
region,
ratio_public_investment,
tourism_gdp_lag1) %>%
na.omit()
set.seed(123)
train_rows <- sample(nrow(df), 0.8 * nrow(df))
train <- df[train_rows, ]
test <- df[-train_rows, ]
train_ols <- train %>%
mutate(
log_tourism_gdp = log(tourism_gdp),
log_real_wage = log(real_wage),
log_pop_density = log(pop_density),
log_tourism_gdp_lag1 = log(tourism_gdp_lag1),
region = as.factor(region)
)
test_ols <- test %>%
mutate(
log_tourism_gdp = log(tourism_gdp),
log_real_wage = log(real_wage),
log_pop_density = log(pop_density),
log_tourism_gdp_lag1 = log(tourism_gdp_lag1),
region = as.factor(region)
)
modelo_lm <- lm(log_tourism_gdp ~ log_real_wage + log_pop_density +
college_education + region +
ratio_public_investment + log_tourism_gdp_lag1,
data = train_ols)
pred_lm_log <- predict(modelo_lm, newdata = test_ols)
pred_lm <- exp(pred_lm_log)
res_lm <- postResample(pred = pred_lm, obs = test_ols$tourism_gdp)
plot(test_ols$tourism_gdp, pred_lm,
main = "OLS",
xlab = "Actual tourism_gdp",
ylab = "Predicted tourism_gdp")
abline(0,1,col="red",lwd=2)
# Modelo Random Forest
train_rf <- train
test_rf <- test
train_rf$region <- as.factor(train_rf$region)
test_rf$region <- as.factor(test_rf$region)
modelo_rf <- randomForest(tourism_gdp ~ real_wage + pop_density +
college_education + region +
ratio_public_investment + tourism_gdp_lag1,
data = train_rf)
pred_rf <- predict(modelo_rf, newdata = test_rf)
res_rf <- postResample(pred = pred_rf, obs = test_rf$tourism_gdp)
plot(test_rf$tourism_gdp, pred_rf,
main = "Random Forest",
xlab = "Actual tourism_gdp",
ylab = "Predicted tourism_gdp")
abline(0,1,col="red",lwd=2)
# Modelos ANN
train_ann <- train
test_ann <- test
train_ann$region <- as.factor(train_ann$region)
test_ann$region <- as.factor(test_ann$region)
dummies_train <- model.matrix(~ region - 1, data = train_ann)
dummies_test <- model.matrix(~ region - 1, data = test_ann)
train_ann <- cbind(train_ann, dummies_train)
test_ann <- cbind(test_ann, dummies_test)
train_ann$region <- NULL
test_ann$region <- NULL
normalizar <- function(x){
(x - min(x)) / (max(x) - min(x))
}
cols_norm <- c("tourism_gdp","real_wage","pop_density",
"college_education","ratio_public_investment",
"tourism_gdp_lag1")
train_ann[,cols_norm] <- lapply(train_ann[,cols_norm], normalizar)
test_ann[,cols_norm] <- lapply(test_ann[,cols_norm], normalizar)
x_vars <- setdiff(names(train_ann),"tourism_gdp")
form_ann <- as.formula(
paste("tourism_gdp ~", paste(x_vars, collapse=" + "))
)
#ANN 1
ann1 <- neuralnet(form_ann, data=train_ann,
hidden=c(5), linear.output=TRUE, stepmax=1e6)
pred_ann1 <- compute(ann1, test_ann[,x_vars])$net.result
res_ann1 <- postResample(pred=pred_ann1[,1], obs=test_ann$tourism_gdp)
plot(test_ann$tourism_gdp, pred_ann1,
main="ANN 1",
xlab="Actual tourism_gdp",
ylab="Predicted tourism_gdp")
abline(0,1,col="red",lwd=2)
# ANN 2
ann2 <- neuralnet(form_ann, data=train_ann,
hidden=c(10,5), linear.output=TRUE, stepmax=1e6)
pred_ann2 <- compute(ann2, test_ann[,x_vars])$net.result
res_ann2 <- postResample(pred=pred_ann2[,1], obs=test_ann$tourism_gdp)
plot(test_ann$tourism_gdp, pred_ann2,
main="ANN 2",
xlab="Actual tourism_gdp",
ylab="Predicted tourism_gdp")
abline(0,1,col="red",lwd=2)
# ANN 3
ann3 <- neuralnet(form_ann, data=train_ann,
hidden=c(15,10,5), linear.output=TRUE, stepmax=1e6)
pred_ann3 <- compute(ann3, test_ann[,x_vars])$net.result
res_ann3 <- postResample(pred=pred_ann3[,1], obs=test_ann$tourism_gdp)
plot(test_ann$tourism_gdp, pred_ann3,
main="ANN 3",
xlab="Actual tourism_gdp",
ylab="Predicted tourism_gdp")
abline(0,1,col="red",lwd=2)
plot(ann1,rep="best")
plot(ann2,rep="best")
plot(ann3,rep="best")
resultados <- rbind(
OLS = res_lm,
RF = res_rf,
ANN1 = res_ann1,
ANN2 = res_ann2,
ANN3 = res_ann3
)
print(resultados)
## RMSE Rsquared MAE
## OLS 4.545732e+03 0.9953706 2.717958e+03
## RF 6.705427e+03 0.9892169 3.797212e+03
## ANN1 2.415430e-02 0.9822973 1.676995e-02
## ANN2 2.699232e-02 0.9818940 1.838905e-02
## ANN3 2.522017e-02 0.9825725 1.472599e-02