# Librerías
library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(neuralnet)
## 
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
## 
##     compute
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(caret)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## Loading required package: lattice

Instalar paquetes y llamar librerías

df <- read_excel("C:\\Users\\anavi\\Downloads\\mx_tourism_state_data.xlsx", sheet = "panel_data")

Transformación de datos

df <- read_excel("C:\\Users\\anavi\\Downloads\\mx_tourism_state_data.xlsx")

df <- df %>%
  arrange(state, year) %>%
  group_by(state) %>%
  mutate(tourism_gdp_lag1 = lag(tourism_gdp, 1)) %>%
  ungroup() %>%
  select(tourism_gdp,
         real_wage,
         pop_density,
         college_education,
         region,
         ratio_public_investment,
         tourism_gdp_lag1) %>%
  na.omit()

set.seed(123)
train_rows <- sample(nrow(df), 0.8 * nrow(df))

train <- df[train_rows, ]
test  <- df[-train_rows, ]

Modelo OLS

train_ols <- train %>%
  mutate(
    log_tourism_gdp = log(tourism_gdp),
    log_real_wage = log(real_wage),
    log_pop_density = log(pop_density),
    log_tourism_gdp_lag1 = log(tourism_gdp_lag1),
    region = as.factor(region)
  )

test_ols <- test %>%
  mutate(
    log_tourism_gdp = log(tourism_gdp),
    log_real_wage = log(real_wage),
    log_pop_density = log(pop_density),
    log_tourism_gdp_lag1 = log(tourism_gdp_lag1),
    region = as.factor(region)
  )

modelo_lm <- lm(log_tourism_gdp ~ log_real_wage + log_pop_density +
                  college_education + region +
                  ratio_public_investment + log_tourism_gdp_lag1,
                data = train_ols)

pred_lm_log <- predict(modelo_lm, newdata = test_ols)
pred_lm <- exp(pred_lm_log)

res_lm <- postResample(pred = pred_lm, obs = test_ols$tourism_gdp)

plot(test_ols$tourism_gdp, pred_lm,
     main = "OLS",
     xlab = "Actual tourism_gdp",
     ylab = "Predicted tourism_gdp")
abline(0,1,col="red",lwd=2)

# Modelo Random Forest

train_rf <- train
test_rf  <- test

train_rf$region <- as.factor(train_rf$region)
test_rf$region  <- as.factor(test_rf$region)

modelo_rf <- randomForest(tourism_gdp ~ real_wage + pop_density +
                            college_education + region +
                            ratio_public_investment + tourism_gdp_lag1,
                          data = train_rf)

pred_rf <- predict(modelo_rf, newdata = test_rf)

res_rf <- postResample(pred = pred_rf, obs = test_rf$tourism_gdp)

plot(test_rf$tourism_gdp, pred_rf,
     main = "Random Forest",
     xlab = "Actual tourism_gdp",
     ylab = "Predicted tourism_gdp")
abline(0,1,col="red",lwd=2)

# Modelos ANN

train_ann <- train
test_ann  <- test

train_ann$region <- as.factor(train_ann$region)
test_ann$region  <- as.factor(test_ann$region)

dummies_train <- model.matrix(~ region - 1, data = train_ann)
dummies_test  <- model.matrix(~ region - 1, data = test_ann)

train_ann <- cbind(train_ann, dummies_train)
test_ann  <- cbind(test_ann, dummies_test)

train_ann$region <- NULL
test_ann$region  <- NULL

normalizar <- function(x){
  (x - min(x)) / (max(x) - min(x))
}

cols_norm <- c("tourism_gdp","real_wage","pop_density",
               "college_education","ratio_public_investment",
               "tourism_gdp_lag1")

train_ann[,cols_norm] <- lapply(train_ann[,cols_norm], normalizar)
test_ann[,cols_norm]  <- lapply(test_ann[,cols_norm], normalizar)

x_vars <- setdiff(names(train_ann),"tourism_gdp")

form_ann <- as.formula(
  paste("tourism_gdp ~", paste(x_vars, collapse=" + "))
)

Comparativas

#ANN 1 
ann1 <- neuralnet(form_ann, data=train_ann,
                  hidden=c(5), linear.output=TRUE, stepmax=1e6)

pred_ann1 <- compute(ann1, test_ann[,x_vars])$net.result

res_ann1 <- postResample(pred=pred_ann1[,1], obs=test_ann$tourism_gdp)

plot(test_ann$tourism_gdp, pred_ann1,
     main="ANN 1",
     xlab="Actual tourism_gdp",
     ylab="Predicted tourism_gdp")
abline(0,1,col="red",lwd=2)

# ANN 2
ann2 <- neuralnet(form_ann, data=train_ann,
                  hidden=c(10,5), linear.output=TRUE, stepmax=1e6)

pred_ann2 <- compute(ann2, test_ann[,x_vars])$net.result

res_ann2 <- postResample(pred=pred_ann2[,1], obs=test_ann$tourism_gdp)

plot(test_ann$tourism_gdp, pred_ann2,
     main="ANN 2",
     xlab="Actual tourism_gdp",
     ylab="Predicted tourism_gdp")
abline(0,1,col="red",lwd=2)

# ANN 3
ann3 <- neuralnet(form_ann, data=train_ann,
                  hidden=c(15,10,5), linear.output=TRUE, stepmax=1e6)

pred_ann3 <- compute(ann3, test_ann[,x_vars])$net.result

res_ann3 <- postResample(pred=pred_ann3[,1], obs=test_ann$tourism_gdp)

plot(test_ann$tourism_gdp, pred_ann3,
     main="ANN 3",
     xlab="Actual tourism_gdp",
     ylab="Predicted tourism_gdp")
abline(0,1,col="red",lwd=2)

plot(ann1,rep="best")

plot(ann2,rep="best")

plot(ann3,rep="best")

resultados <- rbind(
  OLS  = res_lm,
  RF   = res_rf,
  ANN1 = res_ann1,
  ANN2 = res_ann2,
  ANN3 = res_ann3
)

print(resultados)
##              RMSE  Rsquared          MAE
## OLS  4.545732e+03 0.9953706 2.717958e+03
## RF   6.705427e+03 0.9892169 3.797212e+03
## ANN1 2.415430e-02 0.9822973 1.676995e-02
## ANN2 2.699232e-02 0.9818940 1.838905e-02
## ANN3 2.522017e-02 0.9825725 1.472599e-02