library(foreign)
library(dplyr) # data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(forcats) # to work with categorical variables
library(ggplot2) # data visualization
library(readr) # read specific csv files
library(janitor) # data exploration and cleaning
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(Hmisc) # several useful functions for data analysis
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(psych) # functions for multivariate analysis
##
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(naniar) # summaries and visualization of missing values NA's
library(dlookr) # summaries and visualization of missing values NA's
##
## Attaching package: 'dlookr'
## The following object is masked from 'package:psych':
##
## describe
## The following object is masked from 'package:Hmisc':
##
## describe
## The following object is masked from 'package:base':
##
## transform
library(corrplot) # correlation plots
## corrplot 0.92 loaded
library(jtools) # presentation of regression analysis
##
## Attaching package: 'jtools'
## The following object is masked from 'package:Hmisc':
##
## %nin%
library(lmtest) # diagnostic checks - linear regression analysis
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(car) # diagnostic checks - linear regression analysis
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
library(olsrr) # diagnostic checks - linear regression analysis
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(naniar) # identifying missing values
library(stargazer) # create publication quality tables
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(effects) # displays for linear and other regression models
## Registered S3 method overwritten by 'survey':
## method from
## summary.pps dlookr
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(tidyverse) # collection of R packages designed for data science
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ stringr 1.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%() masks ggplot2::%+%()
## ✖ psych::alpha() masks ggplot2::alpha()
## ✖ tidyr::extract() masks dlookr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ car::recode() masks dplyr::recode()
## ✖ purrr::some() masks car::some()
## ✖ Hmisc::src() masks dplyr::src()
## ✖ Hmisc::summarize() masks dplyr::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret) # Classification and Regression Training
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(glmnet) # methods for prediction and plotting, and functions for cross-validation
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-8
library(corrplot)
# Exploratory Data Analysis
#Here we load the dataset
df <- read.csv("/Users/valeriacantulobo/Downloads/sp_data.csv")
df
## periodo IED_Flujos IED_M Exportaciones Exportaciones_m Empleo Educacion
## 1 1997 12145.60 294151.2 9087.62 220090.8 NA 7.20
## 2 1998 8373.50 210875.6 9875.07 248690.6 NA 7.31
## 3 1999 13960.32 299734.4 10990.01 235960.5 NA 7.43
## 4 2000 18248.69 362631.8 12482.96 248057.2 97.83 7.56
## 5 2001 30057.18 546548.4 11300.44 205482.9 97.36 7.68
## 6 2002 24099.21 468332.0 11923.10 231707.6 97.66 7.80
## 7 2003 18249.97 368752.8 13156.00 265825.7 97.06 7.93
## 8 2004 25015.57 481349.2 13573.13 261173.9 96.48 8.04
## 9 2005 25795.82 458544.8 16465.81 292695.1 97.17 8.14
## 10 2006 21232.54 368495.8 17485.93 303472.5 96.53 8.26
## 11 2007 32393.33 542793.7 19103.85 320110.6 96.60 8.36
## 12 2008 29502.46 586217.7 16924.76 336297.2 95.68 8.46
## 13 2009 17849.95 324318.4 19702.63 357980.1 95.20 8.56
## 14 2010 27189.28 449223.7 22673.14 374607.6 95.06 8.63
## 15 2011 25632.52 460653.8 24333.02 437299.9 95.49 8.75
## 16 2012 21769.32 350978.6 26297.98 423992.5 95.53 8.85
## 17 2013 48354.42 754437.5 27687.57 431988.2 95.75 8.95
## 18 2014 30351.25 512758.2 31676.78 535151.9 96.24 9.05
## 19 2015 35943.75 699904.1 29959.94 583386.1 96.04 9.15
## 20 2016 31188.98 700091.6 31375.06 704268.5 96.62 9.25
## 21 2017 34017.05 683318.0 33322.62 669368.6 96.85 9.35
## 22 2018 34100.43 671018.4 35341.90 695447.7 96.64 9.45
## 23 2019 34577.16 615945.4 36414.73 648679.3 97.09 9.58
## 24 2020 28205.89 514711.7 41077.34 749594.7 96.21 NA
## 25 2021 31553.52 551937.8 44914.78 785654.5 96.49 NA
## 26 2022 36215.37 555771.9 46477.59 713259.0 97.24 NA
## Salario_Diario Innovacion Inseguridad_Robo Inseguridad_Homicidio
## 1 24.30 11.30 266.51 14.55
## 2 31.91 11.37 314.78 14.32
## 3 31.91 12.46 272.89 12.64
## 4 35.12 13.15 216.98 10.86
## 5 37.57 13.47 214.53 10.25
## 6 39.74 12.80 197.80 9.94
## 7 41.53 11.81 183.22 9.81
## 8 43.30 12.61 146.28 8.92
## 9 45.24 13.41 136.94 9.22
## 10 47.05 14.23 135.59 9.60
## 11 48.88 15.04 145.92 8.04
## 12 50.84 14.82 158.17 12.52
## 13 53.19 12.59 175.77 17.46
## 14 55.77 12.69 201.94 22.43
## 15 58.06 12.10 212.61 23.42
## 16 60.75 13.03 190.28 22.09
## 17 63.12 13.22 185.56 19.74
## 18 65.58 13.65 154.41 16.93
## 19 70.10 15.11 180.44 17.37
## 20 73.04 14.40 160.57 20.31
## 21 88.36 14.05 230.43 26.22
## 22 88.36 13.25 184.25 29.59
## 23 102.68 12.70 173.45 29.21
## 24 123.22 11.28 133.90 28.98
## 25 141.70 NA 127.13 27.89
## 26 172.87 NA 120.49 NA
## Tipo_de_Cambio Densidad_Carretera Densidad_Poblacion CO2_Emisiones
## 1 8.06 0.05 47.44 3.68
## 2 9.94 0.05 48.76 3.85
## 3 9.52 0.06 49.48 3.69
## 4 9.60 0.06 50.58 3.87
## 5 9.17 0.06 51.28 3.81
## 6 10.36 0.06 51.95 3.82
## 7 11.20 0.06 52.61 3.95
## 8 11.22 0.06 53.27 3.98
## 9 10.71 0.06 54.78 4.10
## 10 10.88 0.06 55.44 4.19
## 11 10.90 0.06 56.17 4.22
## 12 13.77 0.07 56.96 4.19
## 13 13.04 0.07 57.73 4.04
## 14 12.38 0.07 58.45 4.11
## 15 13.98 0.07 59.15 4.19
## 16 12.99 0.07 59.85 4.20
## 17 13.07 0.08 59.49 4.06
## 18 14.73 0.08 60.17 3.89
## 19 17.34 0.08 60.86 3.93
## 20 20.66 0.08 61.57 3.89
## 21 19.74 0.09 62.28 3.84
## 22 19.66 0.09 63.11 3.65
## 23 18.87 0.09 63.90 3.59
## 24 19.94 0.09 64.59 NA
## 25 20.52 0.09 65.16 NA
## 26 19.41 0.09 65.60 NA
## PIB_Per_Capita INPC crisis_financiera
## 1 127570.1 33.28 0
## 2 126738.8 39.47 0
## 3 129164.7 44.34 0
## 4 130874.9 48.31 0
## 5 128083.4 50.43 0
## 6 128205.9 53.31 0
## 7 128737.9 55.43 0
## 8 132563.5 58.31 0
## 9 132941.1 60.25 0
## 10 135894.9 62.69 0
## 11 137795.7 65.05 0
## 12 135176.0 69.30 1
## 13 131233.0 71.77 1
## 14 134991.7 74.93 0
## 15 138891.9 77.79 0
## 16 141530.2 80.57 0
## 17 144112.0 83.77 0
## 18 147277.4 87.19 0
## 19 149433.5 89.05 0
## 20 152275.4 92.04 0
## 21 153235.7 98.27 0
## 22 153133.8 99.91 0
## 23 150233.1 105.93 0
## 24 142609.3 109.27 0
## 25 142772.0 117.31 0
## 26 146826.7 126.48 0
#Here we analyze the first 6 rows and the columns
head(df)
## periodo IED_Flujos IED_M Exportaciones Exportaciones_m Empleo Educacion
## 1 1997 12145.60 294151.2 9087.62 220090.8 NA 7.20
## 2 1998 8373.50 210875.6 9875.07 248690.6 NA 7.31
## 3 1999 13960.32 299734.4 10990.01 235960.5 NA 7.43
## 4 2000 18248.69 362631.8 12482.96 248057.2 97.83 7.56
## 5 2001 30057.18 546548.4 11300.44 205482.9 97.36 7.68
## 6 2002 24099.21 468332.0 11923.10 231707.6 97.66 7.80
## Salario_Diario Innovacion Inseguridad_Robo Inseguridad_Homicidio
## 1 24.30 11.30 266.51 14.55
## 2 31.91 11.37 314.78 14.32
## 3 31.91 12.46 272.89 12.64
## 4 35.12 13.15 216.98 10.86
## 5 37.57 13.47 214.53 10.25
## 6 39.74 12.80 197.80 9.94
## Tipo_de_Cambio Densidad_Carretera Densidad_Poblacion CO2_Emisiones
## 1 8.06 0.05 47.44 3.68
## 2 9.94 0.05 48.76 3.85
## 3 9.52 0.06 49.48 3.69
## 4 9.60 0.06 50.58 3.87
## 5 9.17 0.06 51.28 3.81
## 6 10.36 0.06 51.95 3.82
## PIB_Per_Capita INPC crisis_financiera
## 1 127570.1 33.28 0
## 2 126738.8 39.47 0
## 3 129164.7 44.34 0
## 4 130874.9 48.31 0
## 5 128083.4 50.43 0
## 6 128205.9 53.31 0
#Here we see the structure of the dataset
str(df)
## 'data.frame': 26 obs. of 18 variables:
## $ periodo : int 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 ...
## $ IED_Flujos : num 12146 8374 13960 18249 30057 ...
## $ IED_M : num 294151 210876 299734 362632 546548 ...
## $ Exportaciones : num 9088 9875 10990 12483 11300 ...
## $ Exportaciones_m : num 220091 248691 235961 248057 205483 ...
## $ Empleo : num NA NA NA 97.8 97.4 ...
## $ Educacion : num 7.2 7.31 7.43 7.56 7.68 7.8 7.93 8.04 8.14 8.26 ...
## $ Salario_Diario : num 24.3 31.9 31.9 35.1 37.6 ...
## $ Innovacion : num 11.3 11.4 12.5 13.2 13.5 ...
## $ Inseguridad_Robo : num 267 315 273 217 215 ...
## $ Inseguridad_Homicidio: num 14.6 14.3 12.6 10.9 10.2 ...
## $ Tipo_de_Cambio : num 8.06 9.94 9.52 9.6 9.17 ...
## $ Densidad_Carretera : num 0.05 0.05 0.06 0.06 0.06 0.06 0.06 0.06 0.06 0.06 ...
## $ Densidad_Poblacion : num 47.4 48.8 49.5 50.6 51.3 ...
## $ CO2_Emisiones : num 3.68 3.85 3.69 3.87 3.81 3.82 3.95 3.98 4.1 4.19 ...
## $ PIB_Per_Capita : num 127570 126739 129165 130875 128083 ...
## $ INPC : num 33.3 39.5 44.3 48.3 50.4 ...
## $ crisis_financiera : int 0 0 0 0 0 0 0 0 0 0 ...
#Here we try to identify if there are some missing values in the complete database
sum(is.na(df))
## [1] 12
#Here we identify NA per column
gg_miss_var(df)
#Here we fill all NA with median
columnas_a_llenar <- c("Empleo", "CO2_Emisiones", "Innovacion", "Inseguridad_Homicidio", "Educacion")
df2<- df %>%
mutate_at(vars(all_of(columnas_a_llenar)), ~ ifelse(is.na(.), median(., na.rm = TRUE), .))
#Descriptive statistics: here we see the descriptive stadistics of the complete df, but we are missing standard deviation
summary(df2)
## periodo IED_Flujos IED_M Exportaciones
## Min. :1997 Min. : 8374 Min. :210876 Min. : 9088
## 1st Qu.:2003 1st Qu.:21367 1st Qu.:368560 1st Qu.:13260
## Median :2010 Median :27698 Median :497054 Median :21188
## Mean :2010 Mean :26770 Mean :493596 Mean :23601
## 3rd Qu.:2016 3rd Qu.:32183 3rd Qu.:578606 3rd Qu.:31601
## Max. :2022 Max. :48354 Max. :754438 Max. :46478
## Exportaciones_m Empleo Educacion Salario_Diario
## Min. :205483 Min. :95.06 Min. :7.200 Min. : 24.30
## 1st Qu.:262337 1st Qu.:96.08 1st Qu.:7.957 1st Qu.: 41.97
## Median :366294 Median :96.53 Median :8.460 Median : 54.48
## Mean :433856 Mean :96.48 Mean :8.428 Mean : 65.16
## 3rd Qu.:632356 3rd Qu.:97.01 3rd Qu.:8.925 3rd Qu.: 72.31
## Max. :785654 Max. :97.83 Max. :9.580 Max. :172.87
## Innovacion Inseguridad_Robo Inseguridad_Homicidio Tipo_de_Cambio
## Min. :11.28 Min. :120.5 Min. : 8.04 Min. : 8.06
## 1st Qu.:12.60 1st Qu.:148.3 1st Qu.:10.40 1st Qu.:10.75
## Median :13.09 Median :181.8 Median :16.93 Median :13.02
## Mean :13.10 Mean :185.4 Mean :17.28 Mean :13.91
## 3rd Qu.:13.61 3rd Qu.:209.9 3rd Qu.:22.34 3rd Qu.:18.49
## Max. :15.11 Max. :314.8 Max. :29.59 Max. :20.66
## Densidad_Carretera Densidad_Poblacion CO2_Emisiones PIB_Per_Capita
## Min. :0.05000 Min. :47.44 Min. :3.590 Min. :126739
## 1st Qu.:0.06000 1st Qu.:52.77 1st Qu.:3.842 1st Qu.:130964
## Median :0.07000 Median :58.09 Median :3.930 Median :136845
## Mean :0.07115 Mean :57.33 Mean :3.943 Mean :138550
## 3rd Qu.:0.08000 3rd Qu.:61.39 3rd Qu.:4.090 3rd Qu.:146148
## Max. :0.09000 Max. :65.60 Max. :4.220 Max. :153236
## INPC crisis_financiera
## Min. : 33.28 Min. :0.00000
## 1st Qu.: 56.15 1st Qu.:0.00000
## Median : 73.35 Median :0.00000
## Mean : 75.17 Mean :0.07692
## 3rd Qu.: 91.29 3rd Qu.:0.00000
## Max. :126.48 Max. :1.00000
# Here since we did not obtain the standard deviation with the summary we go individually per numeric column
standard_deviationPER <- sd(df2$periodo)
standard_deviationcIEDF <- sd(df2$IED_Flujos)
standard_deviationIEDM <- sd(df2$IED_M)
standard_deviationEXP <- sd(df2$Exportaciones)
standard_deviationEXPM <- sd(df2$Exportaciones_m)
standard_deviationEMP <- sd(df2$Empleo)
standard_deviationEDU <- sd(df2$Educacion)
standard_deviationDIA <- sd(df2$Salario_Diario)
standard_deviationINN <- sd(df2$Innovacion)
standard_deviationINSR <- sd(df2$Inseguridad_Robo)
standard_deviationINSH <- sd(df2$Inseguridad_Homicidio)
standard_deviationTIP <- sd(df2$Tipo_de_Cambio)
standard_deviationCAR <- sd(df2$Densidad_Carretera)
standard_deviationPOB <- sd(df2$Densidad_Poblacion)
standard_deviationCO2 <- sd(df2$CO2_Emisiones)
standard_deviationPIB <- sd(df2$PIB_Per_Capita)
standard_deviationINPC <- sd(df2$INPC)
standard_deviationFIN <- sd(df2$crisis_financiera)
print(standard_deviationPER)
## [1] 7.648529
print(standard_deviationcIEDF)
## [1] 8770.679
print(standard_deviationIEDM)
## [1] 143849.2
print(standard_deviationEXP)
## [1] 11346.13
print(standard_deviationEXPM)
## [1] 195018.7
print(standard_deviationEMP)
## [1] 0.720897
print(standard_deviationEDU)
## [1] 0.6752085
print(standard_deviationDIA)
## [1] 35.84897
print(standard_deviationINN)
## [1] 1.072348
print(standard_deviationINSR)
## [1] 47.66618
print(standard_deviationINSH)
## [1] 7.116324
print(standard_deviationTIP)
## [1] 4.150147
print(standard_deviationCAR)
## [1] 0.01336471
print(standard_deviationPOB)
## [1] 5.411248
print(standard_deviationCO2)
## [1] 0.1800209
print(standard_deviationPIB)
## [1] 8861.102
print(standard_deviationINPC)
## [1] 24.8087
print(standard_deviationFIN)
## [1] 0.2717465
#Data Visualization: here we create different visualizations with variables from the df
#1
# Here we display a histogram of the dependent variable: IED_M.(IED_M is the same as IED_FLUJOS just that this variable is in pesos)
hist(df2$IED_M)
hist(log(df$IED_M))
#2
# here a plot of IED_M compared to exports.
plot_crime <- ggplot(df2, aes(x = IED_M, y = Exportaciones)) +
geom_point() +
labs(x = "IED_M", y = "Exportaciones_m")
print(plot_crime)
#3
#here a plot of IED_M compared to innovation.
plot_rooms <- ggplot(df2, aes(x = IED_M, y = Innovacion)) +
geom_point() +
labs(x = "IED_M", y = "Innovacion")
print(plot_rooms)
#4
#Here we display a histogram of jobs.
hist(df2$Empleo)
hist(log(df2$Empleo))
#5
#Here we display a correlation plot, with all the variables because all of them are quantitative variables.
cor_matrix <- cor(df2)
corrplot(cor_matrix, method = "circle")
###Which is the estimation method to be used to estimate the linear regression model?
#The method is the ordinary least squares regression (OLS) is a really common method for estimating coefficients of linear regressions, this method describes the relationship between the independent variable or variables with the dependent variable and it can be used in a simple or a multiple linear regression.
#Here is the creation of 3 hypothesis based from the visualizations created above
#hypothesis 1: With more innovation in Mexico it will be more attractive for nearshoring and will create more IED.
#hypothesis 2: With more IED there will be more jobs in Mexico.
#hypothesis 3: With more exports to Mexico the IED will be bigger.
#Here we create 3 different linear regression analysis
#This is the fist analysis with its 3 models and its diagnostic tests:
######## Multiple Linear Regression Analysis
#model 1
model1<-lm(IED_Flujos ~ Empleo+Educacion+CO2_Emisiones+Innovacion+Inseguridad_Homicidio+Tipo_de_Cambio+Salario_Diario+PIB_Per_Capita+Inseguridad_Robo+INPC+Exportaciones+Densidad_Poblacion+Densidad_Carretera+crisis_financiera,data=df2)
summary(model1)
##
## Call:
## lm(formula = IED_Flujos ~ Empleo + Educacion + CO2_Emisiones +
## Innovacion + Inseguridad_Homicidio + Tipo_de_Cambio + Salario_Diario +
## PIB_Per_Capita + Inseguridad_Robo + INPC + Exportaciones +
## Densidad_Poblacion + Densidad_Carretera + crisis_financiera,
## data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6946 -2595 106 2229 9372
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.556e+05 3.946e+05 0.394 0.701
## Empleo -2.224e+03 3.451e+03 -0.644 0.532
## Educacion 1.240e+04 2.372e+04 0.523 0.611
## CO2_Emisiones 1.297e+04 2.201e+04 0.589 0.568
## Innovacion 2.745e+03 2.294e+03 1.197 0.257
## Inseguridad_Homicidio -4.168e+01 8.262e+02 -0.050 0.961
## Tipo_de_Cambio -9.816e+02 1.509e+03 -0.650 0.529
## Salario_Diario 3.070e+02 5.840e+02 0.526 0.609
## PIB_Per_Capita -2.101e-01 1.176e+00 -0.179 0.861
## Inseguridad_Robo -4.644e+01 6.803e+01 -0.683 0.509
## INPC -2.626e+02 2.096e+03 -0.125 0.903
## Exportaciones -4.642e-01 1.647e+00 -0.282 0.783
## Densidad_Poblacion -2.386e+03 5.930e+03 -0.402 0.695
## Densidad_Carretera 1.330e+06 9.071e+05 1.466 0.171
## crisis_financiera -1.185e+04 8.770e+03 -1.351 0.204
##
## Residual standard error: 5531 on 11 degrees of freedom
## Multiple R-squared: 0.825, Adjusted R-squared: 0.6023
## F-statistic: 3.704 on 14 and 11 DF, p-value: 0.01752
#diagnostic test model 1
vif(model1) #helps to check for multicollinearity
## Empleo Educacion CO2_Emisiones
## 5.058787 209.658854 12.834930
## Innovacion Inseguridad_Homicidio Tipo_de_Cambio
## 4.943981 28.247181 32.071399
## Salario_Diario PIB_Per_Capita Inseguridad_Robo
## 358.143768 88.765573 8.593307
## INPC Exportaciones Densidad_Poblacion
## 2210.498517 285.248773 841.408695
## Densidad_Carretera crisis_financiera
## 120.112164 4.641225
bptest(model1) #helps to check for heteroscedasticity
##
## studentized Breusch-Pagan test
##
## data: model1
## BP = 14.019, df = 14, p-value = 0.4483
AIC(model1)
## [1] 531.5624
histogram(model1$residuals)
summary(model1)
##
## Call:
## lm(formula = IED_Flujos ~ Empleo + Educacion + CO2_Emisiones +
## Innovacion + Inseguridad_Homicidio + Tipo_de_Cambio + Salario_Diario +
## PIB_Per_Capita + Inseguridad_Robo + INPC + Exportaciones +
## Densidad_Poblacion + Densidad_Carretera + crisis_financiera,
## data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6946 -2595 106 2229 9372
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.556e+05 3.946e+05 0.394 0.701
## Empleo -2.224e+03 3.451e+03 -0.644 0.532
## Educacion 1.240e+04 2.372e+04 0.523 0.611
## CO2_Emisiones 1.297e+04 2.201e+04 0.589 0.568
## Innovacion 2.745e+03 2.294e+03 1.197 0.257
## Inseguridad_Homicidio -4.168e+01 8.262e+02 -0.050 0.961
## Tipo_de_Cambio -9.816e+02 1.509e+03 -0.650 0.529
## Salario_Diario 3.070e+02 5.840e+02 0.526 0.609
## PIB_Per_Capita -2.101e-01 1.176e+00 -0.179 0.861
## Inseguridad_Robo -4.644e+01 6.803e+01 -0.683 0.509
## INPC -2.626e+02 2.096e+03 -0.125 0.903
## Exportaciones -4.642e-01 1.647e+00 -0.282 0.783
## Densidad_Poblacion -2.386e+03 5.930e+03 -0.402 0.695
## Densidad_Carretera 1.330e+06 9.071e+05 1.466 0.171
## crisis_financiera -1.185e+04 8.770e+03 -1.351 0.204
##
## Residual standard error: 5531 on 11 degrees of freedom
## Multiple R-squared: 0.825, Adjusted R-squared: 0.6023
## F-statistic: 3.704 on 14 and 11 DF, p-value: 0.01752
predicted_values <- predict(model1)
rmse <- sqrt(mean((df2$IED_Flujos - predicted_values)^2))
rmse
## [1] 3597.629
#model 2
model2<-lm(log(IED_Flujos) ~ Empleo+Educacion+CO2_Emisiones+Innovacion+Inseguridad_Homicidio+Tipo_de_Cambio+Salario_Diario+PIB_Per_Capita+Inseguridad_Robo+INPC+Exportaciones+Densidad_Poblacion+Densidad_Carretera+crisis_financiera,data=df2)
summary(model2)
##
## Call:
## lm(formula = log(IED_Flujos) ~ Empleo + Educacion + CO2_Emisiones +
## Innovacion + Inseguridad_Homicidio + Tipo_de_Cambio + Salario_Diario +
## PIB_Per_Capita + Inseguridad_Robo + INPC + Exportaciones +
## Densidad_Poblacion + Densidad_Carretera + crisis_financiera,
## data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.28355 -0.06414 -0.01363 0.07633 0.21769
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.115e+01 1.326e+01 0.841 0.418
## Empleo -4.445e-02 1.160e-01 -0.383 0.709
## Educacion -1.785e-02 7.974e-01 -0.022 0.983
## CO2_Emisiones 1.176e-01 7.400e-01 0.159 0.877
## Innovacion 1.218e-01 7.710e-02 1.580 0.142
## Inseguridad_Homicidio 6.587e-03 2.777e-02 0.237 0.817
## Tipo_de_Cambio -5.679e-02 5.074e-02 -1.119 0.287
## Salario_Diario 8.130e-04 1.963e-02 0.041 0.968
## PIB_Per_Capita 2.760e-06 3.953e-05 0.070 0.946
## Inseguridad_Robo -2.903e-03 2.287e-03 -1.269 0.231
## INPC 2.113e-02 7.046e-02 0.300 0.770
## Exportaciones -6.015e-05 5.535e-05 -1.087 0.300
## Densidad_Poblacion -1.730e-02 1.993e-01 -0.087 0.932
## Densidad_Carretera 4.214e+01 3.049e+01 1.382 0.194
## crisis_financiera -4.792e-01 2.948e-01 -1.626 0.132
##
## Residual standard error: 0.1859 on 11 degrees of freedom
## Multiple R-squared: 0.8975, Adjusted R-squared: 0.767
## F-statistic: 6.878 on 14 and 11 DF, p-value: 0.001394
#diagnostic test model 2
vif(model2) #helps to check for multicollinearity
## Empleo Educacion CO2_Emisiones
## 5.058787 209.658854 12.834930
## Innovacion Inseguridad_Homicidio Tipo_de_Cambio
## 4.943981 28.247181 32.071399
## Salario_Diario PIB_Per_Capita Inseguridad_Robo
## 358.143768 88.765573 8.593307
## INPC Exportaciones Densidad_Poblacion
## 2210.498517 285.248773 841.408695
## Densidad_Carretera crisis_financiera
## 120.112164 4.641225
bptest(model2) #helps to check for heteroscedasticity
##
## studentized Breusch-Pagan test
##
## data: model2
## BP = 11.648, df = 14, p-value = 0.6345
AIC(model2)
## [1] -4.070233
histogram(model2$residuals)
summary(model2)
##
## Call:
## lm(formula = log(IED_Flujos) ~ Empleo + Educacion + CO2_Emisiones +
## Innovacion + Inseguridad_Homicidio + Tipo_de_Cambio + Salario_Diario +
## PIB_Per_Capita + Inseguridad_Robo + INPC + Exportaciones +
## Densidad_Poblacion + Densidad_Carretera + crisis_financiera,
## data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.28355 -0.06414 -0.01363 0.07633 0.21769
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.115e+01 1.326e+01 0.841 0.418
## Empleo -4.445e-02 1.160e-01 -0.383 0.709
## Educacion -1.785e-02 7.974e-01 -0.022 0.983
## CO2_Emisiones 1.176e-01 7.400e-01 0.159 0.877
## Innovacion 1.218e-01 7.710e-02 1.580 0.142
## Inseguridad_Homicidio 6.587e-03 2.777e-02 0.237 0.817
## Tipo_de_Cambio -5.679e-02 5.074e-02 -1.119 0.287
## Salario_Diario 8.130e-04 1.963e-02 0.041 0.968
## PIB_Per_Capita 2.760e-06 3.953e-05 0.070 0.946
## Inseguridad_Robo -2.903e-03 2.287e-03 -1.269 0.231
## INPC 2.113e-02 7.046e-02 0.300 0.770
## Exportaciones -6.015e-05 5.535e-05 -1.087 0.300
## Densidad_Poblacion -1.730e-02 1.993e-01 -0.087 0.932
## Densidad_Carretera 4.214e+01 3.049e+01 1.382 0.194
## crisis_financiera -4.792e-01 2.948e-01 -1.626 0.132
##
## Residual standard error: 0.1859 on 11 degrees of freedom
## Multiple R-squared: 0.8975, Adjusted R-squared: 0.767
## F-statistic: 6.878 on 14 and 11 DF, p-value: 0.001394
predicted_values <- predict(model2)
rmse <- sqrt(mean((df2$IED_Flujos - predicted_values)^2))
rmse
## [1] 28107.96
#model 3
model3<-lm(log(IED_Flujos) ~ Educacion+I(Educacion^2)+Empleo+CO2_Emisiones+Innovacion+Inseguridad_Homicidio+Tipo_de_Cambio+Salario_Diario+PIB_Per_Capita+Inseguridad_Robo+INPC+Exportaciones+Densidad_Poblacion+Densidad_Carretera+crisis_financiera,data=df2)
summary(model3)
##
## Call:
## lm(formula = log(IED_Flujos) ~ Educacion + I(Educacion^2) + Empleo +
## CO2_Emisiones + Innovacion + Inseguridad_Homicidio + Tipo_de_Cambio +
## Salario_Diario + PIB_Per_Capita + Inseguridad_Robo + INPC +
## Exportaciones + Densidad_Poblacion + Densidad_Carretera +
## crisis_financiera, data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.28233 -0.05903 -0.01030 0.07970 0.20995
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.976e+01 5.078e+01 0.389 0.705
## Educacion -2.033e+00 1.147e+01 -0.177 0.863
## I(Educacion^2) 1.149e-01 6.520e-01 0.176 0.864
## Empleo -5.151e-02 1.279e-01 -0.403 0.696
## CO2_Emisiones 2.466e-01 1.066e+00 0.231 0.822
## Innovacion 1.191e-01 8.216e-02 1.450 0.178
## Inseguridad_Homicidio 2.251e-03 3.809e-02 0.059 0.954
## Tipo_de_Cambio -5.607e-02 5.329e-02 -1.052 0.317
## Salario_Diario -1.440e-03 2.420e-02 -0.059 0.954
## PIB_Per_Capita -1.440e-06 4.777e-05 -0.030 0.977
## Inseguridad_Robo -3.047e-03 2.531e-03 -1.204 0.256
## INPC 2.608e-02 7.894e-02 0.330 0.748
## Exportaciones -6.404e-05 6.202e-05 -1.033 0.326
## Densidad_Poblacion -4.187e-03 2.216e-01 -0.019 0.985
## Densidad_Carretera 4.446e+01 3.453e+01 1.287 0.227
## crisis_financiera -5.133e-01 3.643e-01 -1.409 0.189
##
## Residual standard error: 0.1947 on 10 degrees of freedom
## Multiple R-squared: 0.8978, Adjusted R-squared: 0.7445
## F-statistic: 5.856 on 15 and 10 DF, p-value: 0.003829
#diagnostic test model 3
vif(model3) #helps to check for multicollinearity
## Educacion I(Educacion^2) Empleo
## 39533.150849 36018.272594 5.608856
## CO2_Emisiones Innovacion Inseguridad_Homicidio
## 24.280934 5.120191 48.464386
## Tipo_de_Cambio Salario_Diario PIB_Per_Capita
## 32.257777 496.621390 118.170197
## Inseguridad_Robo INPC Exportaciones
## 9.600084 2529.858053 326.593953
## Densidad_Poblacion Densidad_Carretera crisis_financiera
## 948.262127 140.488468 6.465792
bptest(model3) #helps to check for heteroscedasticity
##
## studentized Breusch-Pagan test
##
## data: model3
## BP = 11.833, df = 15, p-value = 0.6917
AIC(model3)
## [1] -2.150876
histogram(model3$residuals)
summary(model3)
##
## Call:
## lm(formula = log(IED_Flujos) ~ Educacion + I(Educacion^2) + Empleo +
## CO2_Emisiones + Innovacion + Inseguridad_Homicidio + Tipo_de_Cambio +
## Salario_Diario + PIB_Per_Capita + Inseguridad_Robo + INPC +
## Exportaciones + Densidad_Poblacion + Densidad_Carretera +
## crisis_financiera, data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.28233 -0.05903 -0.01030 0.07970 0.20995
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.976e+01 5.078e+01 0.389 0.705
## Educacion -2.033e+00 1.147e+01 -0.177 0.863
## I(Educacion^2) 1.149e-01 6.520e-01 0.176 0.864
## Empleo -5.151e-02 1.279e-01 -0.403 0.696
## CO2_Emisiones 2.466e-01 1.066e+00 0.231 0.822
## Innovacion 1.191e-01 8.216e-02 1.450 0.178
## Inseguridad_Homicidio 2.251e-03 3.809e-02 0.059 0.954
## Tipo_de_Cambio -5.607e-02 5.329e-02 -1.052 0.317
## Salario_Diario -1.440e-03 2.420e-02 -0.059 0.954
## PIB_Per_Capita -1.440e-06 4.777e-05 -0.030 0.977
## Inseguridad_Robo -3.047e-03 2.531e-03 -1.204 0.256
## INPC 2.608e-02 7.894e-02 0.330 0.748
## Exportaciones -6.404e-05 6.202e-05 -1.033 0.326
## Densidad_Poblacion -4.187e-03 2.216e-01 -0.019 0.985
## Densidad_Carretera 4.446e+01 3.453e+01 1.287 0.227
## crisis_financiera -5.133e-01 3.643e-01 -1.409 0.189
##
## Residual standard error: 0.1947 on 10 degrees of freedom
## Multiple R-squared: 0.8978, Adjusted R-squared: 0.7445
## F-statistic: 5.856 on 15 and 10 DF, p-value: 0.003829
predicted_values <- predict(model3)
rmse <- sqrt(mean((df2$IED_Flujos - predicted_values)^2))
rmse
## [1] 28107.96
#The best model is model number 1 because it has de lowest r squared and the lowest RMSE out of the 3 models.
# Detect Heteroscedasticity
boxplot(formula=IED_Flujos ~ periodo, data=df, col=cm.colors(3)) #there is non heteroscedasticity because there is no pattern and for there to be Heteroscedasticity it needs a pattern.
#second analysis:
######## LASSO
set.seed(123)
training.samples<-df2$IED_Flujos %>%
createDataPartition(p=0.75,list=FALSE)
train.data<-df2[training.samples, ]
test.data<-df2[-training.samples, ]
x<-model.matrix(log(IED_Flujos) ~ Empleo+Educacion+CO2_Emisiones+Innovacion+Inseguridad_Homicidio+Tipo_de_Cambio+Salario_Diario+PIB_Per_Capita+Inseguridad_Robo+INPC+Exportaciones+Densidad_Poblacion+Densidad_Carretera+crisis_financiera,train.data)[,-1]
y<-train.data$IED_Flujos
set.seed(123)
cv.lasso <- cv.glmnet(x,y,alpha=1)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per
## fold
lambda_min <- cv.lasso$lambda.min
lassomodel<-glmnet(x,y,alpha=1,lambda=cv.lasso$lambda.min)
coef(lassomodel)
## 15 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -57133.58862
## Empleo 261.03044
## Educacion 1020.65081
## CO2_Emisiones .
## Innovacion 2652.48753
## Inseguridad_Homicidio .
## Tipo_de_Cambio .
## Salario_Diario .
## PIB_Per_Capita .
## Inseguridad_Robo -24.61072
## INPC 35.67360
## Exportaciones .
## Densidad_Poblacion .
## Densidad_Carretera 233017.75889
## crisis_financiera -2728.74532
x.test<-model.matrix(log(IED_Flujos) ~Empleo+Educacion+CO2_Emisiones+Innovacion+Inseguridad_Homicidio+Tipo_de_Cambio+Salario_Diario+PIB_Per_Capita+Inseguridad_Robo+INPC+Exportaciones+Densidad_Poblacion+Densidad_Carretera+crisis_financiera,test.data)[,-1]
lassopredictions <- lassomodel %>% predict(x.test) %>% as.vector()
# Model Accuracy
data.frame(
RMSE = RMSE(lassopredictions, test.data$IED_Flujos),
Rsquare = R2(lassopredictions, test.data$IED_Flujos))
## RMSE Rsquare
## 1 9720.708 0.738481
# Graph
lbs_fun <- function(fit, offset_x=1, ...) {
L <- length(fit$lambda)
x <- log(fit$lambda[L])+ offset_x
y <- fit$beta[, L]
labs <- names(y)
text(x, y, labels=labs, ...)
}
lasso<-glmnet(scale(x),y,alpha=1)
plot(lasso,xvar="lambda",label=T)
lbs_fun(lasso)
abline(v=cv.lasso$lambda.min,col="red",lty=2)
abline(v=cv.lasso$lambda.1se,col="blue",lty=2)
#Third analysis:
#################################
### RIDGE REGRESSION ANALYSIS ###
#################################
# Find the best lambda using cross-validation
set.seed(123) # x: independent variables | y: dependent variable
cv.ridge <- cv.glmnet(x,y,alpha=0.1) # alpha = 0 for RIDGE
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per
## fold
# Display the best lambda value
cv.ridge$lambda.min # lambda: a numeric value defining the amount of shrinkage. Why min? the higher the value of ?? , the more penalization there is
## [1] 1170.653
# Fit the final model on the training data
ridgemodel<-glmnet(x,y,alpha=0,lambda=cv.ridge$lambda.min)
# Display regression coefficients
coef(ridgemodel)
## 15 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -9.945280e+04
## Empleo 6.825950e+02
## Educacion 2.169073e+03
## CO2_Emisiones -2.234609e+03
## Innovacion 2.535383e+03
## Inseguridad_Homicidio -2.606497e+01
## Tipo_de_Cambio -5.529012e+01
## Salario_Diario 1.893855e+01
## PIB_Per_Capita 1.112910e-03
## Inseguridad_Robo -2.341469e+01
## INPC 4.067949e+01
## Exportaciones -1.452290e-02
## Densidad_Poblacion 1.910076e+02
## Densidad_Carretera 1.050239e+05
## crisis_financiera -2.651910e+03
# Make predictions on the test data
x.test<-model.matrix(log(IED_Flujos) ~ Empleo+Educacion+CO2_Emisiones+Innovacion+Inseguridad_Homicidio+Tipo_de_Cambio+Salario_Diario+PIB_Per_Capita+Inseguridad_Robo+INPC+Exportaciones+Densidad_Poblacion+Densidad_Carretera+crisis_financiera,test.data)[,-1]
ridgepredictions<-ridgemodel %>% predict(x.test) %>% as.vector()
# Model Accuracy
data.frame(
RMSE = RMSE(ridgepredictions, test.data$IED_Flujos),
Rsquare = R2(ridgepredictions, test.data$IED_Flujos)
)
## RMSE Rsquare
## 1 10104.8 0.7186774
### visualizing ridge regression results
ridge<-glmnet(scale(x),y,alpha=0)
plot(ridge, xvar = "lambda", label=T)
lbs_fun(ridge)
abline(v=cv.ridge$lambda.min, col = "red", lty=2)
abline(v=cv.ridge$lambda.1se, col="blue", lty=2)
#diagnostic test of model 1, lasso and ridge (with round numbers)
tab <- matrix(c(3597.6,28107.96, 28107.96, 9720.7,10104.8,0.83,0.897, 0.897, 0.74,0.72), ncol=2, byrow=FALSE)
colnames(tab) <- c('RMSE','R2')
rownames(tab) <- c('Linear Regression Model 1','Linear Regression Model 2', 'Linear Regression Model 3', 'Lasso','Ridge')
tab <- as.table(tab)
tab
## RMSE R2
## Linear Regression Model 1 3597.600 0.830
## Linear Regression Model 2 28107.960 0.897
## Linear Regression Model 3 28107.960 0.897
## Lasso 9720.700 0.740
## Ridge 10104.800 0.720
#After making the Multiple linear regression with the 3 models, the lasso and the ridge with all the variables the results were the following:
#Side note: R^2 represents the proportion of the variability in the dependent variable that is explained by the regression model, or how much variability in the response variable (dependent variable) can be explained by the predictor variables (independent variables) included in the model. It is a value between 0 and 1, where: #R^2 = 0 means that the model does not explain any variability in the dependent variable. #R^2 = 1 means that the model explains all the variability in the dependent variable.
#Side note 2: RMSE( Root Mean Square Error): The larger the difference indicates a larger gap between the predicted and observed values, which means poor regression model fit. In the same way, the smaller RMSE that indicates the better the model.
#Side note 3: Multicollinearity is present if the VIF is higher than 10
# We use model 2 without the variables with higher multicollinearity and only with the variables that are better for the 3 hypotheses
model2<-lm(log(IED_Flujos) ~ Empleo+CO2_Emisiones+Innovacion+PIB_Per_Capita+Exportaciones+crisis_financiera,data=df2)
summary(model2)
##
## Call:
## lm(formula = log(IED_Flujos) ~ Empleo + CO2_Emisiones + Innovacion +
## PIB_Per_Capita + Exportaciones + crisis_financiera, data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.53220 -0.12466 -0.04681 0.10337 0.52353
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.930e+00 1.225e+01 0.158 0.876
## Empleo 3.994e-02 1.084e-01 0.368 0.717
## CO2_Emisiones 2.838e-01 4.367e-01 0.650 0.524
## Innovacion 1.418e-01 8.241e-02 1.720 0.102
## PIB_Per_Capita 7.253e-06 1.689e-05 0.429 0.673
## Exportaciones 1.578e-05 1.048e-05 1.505 0.149
## crisis_financiera -6.679e-02 2.378e-01 -0.281 0.782
##
## Residual standard error: 0.2524 on 19 degrees of freedom
## Multiple R-squared: 0.6737, Adjusted R-squared: 0.5706
## F-statistic: 6.537 on 6 and 19 DF, p-value: 0.0007199
#New VIF with the variables that are important and without multicollinearity (VIF lower than 10)
vif(model2)
## Empleo CO2_Emisiones Innovacion PIB_Per_Capita
## 2.398510 2.426430 3.065663 8.797343
## Exportaciones crisis_financiera
## 5.550288 1.639284
#Visual of the predicted values of the dependent variable
model2 <- lm(log(IED_Flujos) ~ Empleo + CO2_Emisiones + Innovacion +
PIB_Per_Capita + Exportaciones + crisis_financiera, data = df2)
colors <- c("blue", "green", "red", "purple")
par(mfrow=c(2,2))
plot(model2, which = 1, col = colors[1], pch = 19)
plot(model2, which = 2, col = colors[2], pch = 19)
plot(model2, which = 3, col = colors[3], pch = 19)
plot(model2, which = 5, col = colors[4], pch = 19)
par(mfrow=c(1,1))
#Hypothesis 1: With more innovation in Mexico it will be more attractive for nearshoring and will create more IED. #The coefficient of “inovacion” is positive (1.1418), which suggests a positive relation with the variable inovacion and IED_Flujos, but the p-value is 0.102 which is not significant (is greater than 0.05). With both this data analyzed the hypothesis is rejected because there is not enough evidence to accept it.
#Hypothesis 2: With more IED there will be more jobs in Mexico. #The coefficient of “empleo” is positive as well with almost 0.04, but with no significance, also its p-value is 0.717 which is also higher than 0.05, so both this data will also make me reject this hypothesis.
#Hypothesis 3: With more exports to Mexico the IED will be bigger. #The coefficient in here is also positive with 1.578 which suggests a positive relation with IED_Flujos, but the p-value is 0.149 which is also higher than 0.05 which will make me reject this hypothesis.
#In conclusion for my final model the 2 variables which had a significance for IED_Flujos were inovacion and exportaciones, but since the p values were higher than 0.05 it could mean that more data is needed for a deeper analysis. Also another insight from this evidence was that the model in general is significative with a p value of 0.0007199 which indicates that it contains at least some coefficients that are significative to the model, also with the adjusted R^2 with 0.5706 means that 57% of the variability in the logarithm of IED_Flujos is explained by the independent variables in the model. Another insight I found was that the variables: Empleo, CO2_Emisiones, PIB_Per_Capita, and Crisis_Financiera have 0 significance in the IED_Flujos because their coefficients are not significative and their p-values are higher than 0.05.