suppressWarnings({
library(foreign)
library(dplyr) # data manipulation
library(forcats) # to work with categorical variables
library(ggplot2) # data visualization
library(readr) # read specific csv files
library(janitor) # data exploration and cleaning
library(Hmisc) # several useful functions for data analysis
library(psych) # functions for multivariate analysis
library(naniar) # summaries and visualization of missing values NA's
library(dlookr) # summaries and visualization of missing values NA's
library(corrplot) # correlation plots
library(jtools) # presentation of regression analysis
library(lmtest) # diagnostic checks - linear regression analysis
library(car) # diagnostic checks - linear regression analysis
library(olsrr) # diagnostic checks - linear regression analysis
library(naniar) # identifying missing values
library(stargazer) # create publication quality tables
library(effects) # displays for linear and other regression models
library(tidyverse) # collection of R packages designed for data science
library(caret) # Classification and Regression Training
library(glmnet) # methods for prediction and plotting, and functions for cross-validation
library(corrplot)
library(openxlsx)
library(visdat)
library(naniar)
library(zoo)
library(caret)
library(glmnet)
library(forecast)
})
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
##
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
## Attaching package: 'dlookr'
## The following object is masked from 'package:psych':
##
## describe
## The following object is masked from 'package:Hmisc':
##
## describe
## The following object is masked from 'package:base':
##
## transform
## corrplot 0.92 loaded
##
## Attaching package: 'jtools'
## The following object is masked from 'package:Hmisc':
##
## %nin%
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
## Registered S3 method overwritten by 'survey':
## method from
## summary.pps dlookr
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ stringr 1.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%() masks ggplot2::%+%()
## ✖ psych::alpha() masks ggplot2::alpha()
## ✖ tidyr::extract() masks dlookr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ car::recode() masks dplyr::recode()
## ✖ purrr::some() masks car::some()
## ✖ Hmisc::src() masks dplyr::src()
## ✖ Hmisc::summarize() masks dplyr::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: lattice
##
##
## Attaching package: 'caret'
##
##
## The following object is masked from 'package:purrr':
##
## lift
##
##
## Loading required package: Matrix
##
##
## Attaching package: 'Matrix'
##
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
##
## Loaded glmnet 4.1-8
##
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Exploratory Data Analysis
#Here we load the dataset
df<-read.xlsx("/Users/valeriacantulobo/Downloads/IED_Flujos.xlsx")
df
## Año Trimestre IED_Flujos
## 1 1999 I 3596.075
## 2 1999 II 3395.890
## 3 1999 III 3028.448
## 4 1999 IV 3939.905
## 5 2000 I 4600.636
## 6 2000 II 4857.423
## 7 2000 III 3056.946
## 8 2000 IV 5733.683
## 9 2001 I 3598.677
## 10 2001 II 5218.830
## 11 2001 III 16314.048
## 12 2001 IV 4925.626
## 13 2002 I 5067.984
## 14 2002 II 6258.520
## 15 2002 III 6114.337
## 16 2002 IV 6658.365
## 17 2003 I 3963.685
## 18 2003 II 5547.336
## 19 2003 III 2521.677
## 20 2003 IV 6217.271
## 21 2004 I 9363.458
## 22 2004 II 4351.496
## 23 2004 III 3284.913
## 24 2004 IV 8015.703
## 25 2005 I 6761.621
## 26 2005 II 6773.615
## 27 2005 III 5478.925
## 28 2005 IV 6781.664
## 29 2006 I 7436.808
## 30 2006 II 6634.311
## 31 2006 III 2346.572
## 32 2006 IV 4814.848
## 33 2007 I 10815.781
## 34 2007 II 6137.638
## 35 2007 III 7628.445
## 36 2007 IV 7811.468
## 37 2008 I 8546.438
## 38 2008 II 8376.139
## 39 2008 III 5643.682
## 40 2008 IV 6936.196
## 41 2009 I 6105.295
## 42 2009 II 6094.182
## 43 2009 III 2397.522
## 44 2009 IV 3252.947
## 45 2010 I 8722.262
## 46 2010 II 9301.621
## 47 2010 III 3932.893
## 48 2010 IV 5232.504
## 49 2011 I 8431.215
## 50 2011 II 6697.411
## 51 2011 III 4349.891
## 52 2011 IV 6154.004
## 53 2012 I 7892.686
## 54 2012 II 5622.127
## 55 2012 III 5736.292
## 56 2012 IV 2518.213
## 57 2013 I 10571.580
## 58 2013 II 21019.142
## 59 2013 III 4178.806
## 60 2013 IV 12584.890
## 61 2014 I 13827.996
## 62 2014 II 5478.736
## 63 2014 III 3222.084
## 64 2014 IV 7822.434
## 65 2015 I 12136.331
## 66 2015 II 6656.845
## 67 2015 III 9635.562
## 68 2015 IV 7515.015
## 69 2016 I 12805.358
## 70 2016 II 6210.621
## 71 2016 III 4317.266
## 72 2016 IV 7855.732
## 73 2017 I 13779.063
## 74 2017 II 6814.159
## 75 2017 III 6361.221
## 76 2017 IV 7062.609
## 77 2018 I 14067.517
## 78 2018 II 9577.749
## 79 2018 III 4132.974
## 80 2018 IV 6322.190
## 81 2019 I 15175.274
## 82 2019 II 6504.615
## 83 2019 III 8217.401
## 84 2019 IV 4679.868
## 85 2020 I 16807.599
## 86 2020 II 7293.958
## 87 2020 III 1340.582
## 88 2020 IV 2763.751
## 89 2021 I 16206.049
## 90 2021 II 5883.727
## 91 2021 III 6419.428
## 92 2021 IV 3044.313
## 93 2022 I 22794.161
## 94 2022 II 8164.273
## 95 2022 III 3479.682
## 96 2023 IV 1777.257
#Nueva columa en formato date
df <- df %>%
mutate(Month = case_when(
Trimestre == "I" ~ "01",
Trimestre == "II" ~ "04",
Trimestre == "III" ~ "07",
Trimestre == "IV" ~ "10",
TRUE ~ NA_character_
)) %>%
mutate(Date = as.Date(paste(Año, Month, "01", sep = "-"), format = "%Y-%m-%d"))
df
## Año Trimestre IED_Flujos Month Date
## 1 1999 I 3596.075 01 1999-01-01
## 2 1999 II 3395.890 04 1999-04-01
## 3 1999 III 3028.448 07 1999-07-01
## 4 1999 IV 3939.905 10 1999-10-01
## 5 2000 I 4600.636 01 2000-01-01
## 6 2000 II 4857.423 04 2000-04-01
## 7 2000 III 3056.946 07 2000-07-01
## 8 2000 IV 5733.683 10 2000-10-01
## 9 2001 I 3598.677 01 2001-01-01
## 10 2001 II 5218.830 04 2001-04-01
## 11 2001 III 16314.048 07 2001-07-01
## 12 2001 IV 4925.626 10 2001-10-01
## 13 2002 I 5067.984 01 2002-01-01
## 14 2002 II 6258.520 04 2002-04-01
## 15 2002 III 6114.337 07 2002-07-01
## 16 2002 IV 6658.365 10 2002-10-01
## 17 2003 I 3963.685 01 2003-01-01
## 18 2003 II 5547.336 04 2003-04-01
## 19 2003 III 2521.677 07 2003-07-01
## 20 2003 IV 6217.271 10 2003-10-01
## 21 2004 I 9363.458 01 2004-01-01
## 22 2004 II 4351.496 04 2004-04-01
## 23 2004 III 3284.913 07 2004-07-01
## 24 2004 IV 8015.703 10 2004-10-01
## 25 2005 I 6761.621 01 2005-01-01
## 26 2005 II 6773.615 04 2005-04-01
## 27 2005 III 5478.925 07 2005-07-01
## 28 2005 IV 6781.664 10 2005-10-01
## 29 2006 I 7436.808 01 2006-01-01
## 30 2006 II 6634.311 04 2006-04-01
## 31 2006 III 2346.572 07 2006-07-01
## 32 2006 IV 4814.848 10 2006-10-01
## 33 2007 I 10815.781 01 2007-01-01
## 34 2007 II 6137.638 04 2007-04-01
## 35 2007 III 7628.445 07 2007-07-01
## 36 2007 IV 7811.468 10 2007-10-01
## 37 2008 I 8546.438 01 2008-01-01
## 38 2008 II 8376.139 04 2008-04-01
## 39 2008 III 5643.682 07 2008-07-01
## 40 2008 IV 6936.196 10 2008-10-01
## 41 2009 I 6105.295 01 2009-01-01
## 42 2009 II 6094.182 04 2009-04-01
## 43 2009 III 2397.522 07 2009-07-01
## 44 2009 IV 3252.947 10 2009-10-01
## 45 2010 I 8722.262 01 2010-01-01
## 46 2010 II 9301.621 04 2010-04-01
## 47 2010 III 3932.893 07 2010-07-01
## 48 2010 IV 5232.504 10 2010-10-01
## 49 2011 I 8431.215 01 2011-01-01
## 50 2011 II 6697.411 04 2011-04-01
## 51 2011 III 4349.891 07 2011-07-01
## 52 2011 IV 6154.004 10 2011-10-01
## 53 2012 I 7892.686 01 2012-01-01
## 54 2012 II 5622.127 04 2012-04-01
## 55 2012 III 5736.292 07 2012-07-01
## 56 2012 IV 2518.213 10 2012-10-01
## 57 2013 I 10571.580 01 2013-01-01
## 58 2013 II 21019.142 04 2013-04-01
## 59 2013 III 4178.806 07 2013-07-01
## 60 2013 IV 12584.890 10 2013-10-01
## 61 2014 I 13827.996 01 2014-01-01
## 62 2014 II 5478.736 04 2014-04-01
## 63 2014 III 3222.084 07 2014-07-01
## 64 2014 IV 7822.434 10 2014-10-01
## 65 2015 I 12136.331 01 2015-01-01
## 66 2015 II 6656.845 04 2015-04-01
## 67 2015 III 9635.562 07 2015-07-01
## 68 2015 IV 7515.015 10 2015-10-01
## 69 2016 I 12805.358 01 2016-01-01
## 70 2016 II 6210.621 04 2016-04-01
## 71 2016 III 4317.266 07 2016-07-01
## 72 2016 IV 7855.732 10 2016-10-01
## 73 2017 I 13779.063 01 2017-01-01
## 74 2017 II 6814.159 04 2017-04-01
## 75 2017 III 6361.221 07 2017-07-01
## 76 2017 IV 7062.609 10 2017-10-01
## 77 2018 I 14067.517 01 2018-01-01
## 78 2018 II 9577.749 04 2018-04-01
## 79 2018 III 4132.974 07 2018-07-01
## 80 2018 IV 6322.190 10 2018-10-01
## 81 2019 I 15175.274 01 2019-01-01
## 82 2019 II 6504.615 04 2019-04-01
## 83 2019 III 8217.401 07 2019-07-01
## 84 2019 IV 4679.868 10 2019-10-01
## 85 2020 I 16807.599 01 2020-01-01
## 86 2020 II 7293.958 04 2020-04-01
## 87 2020 III 1340.582 07 2020-07-01
## 88 2020 IV 2763.751 10 2020-10-01
## 89 2021 I 16206.049 01 2021-01-01
## 90 2021 II 5883.727 04 2021-04-01
## 91 2021 III 6419.428 07 2021-07-01
## 92 2021 IV 3044.313 10 2021-10-01
## 93 2022 I 22794.161 01 2022-01-01
## 94 2022 II 8164.273 04 2022-04-01
## 95 2022 III 3479.682 07 2022-07-01
## 96 2023 IV 1777.257 10 2023-10-01
df2<-read.xlsx("/Users/valeriacantulobo/Downloads/Copia de SP_DataMexicoAtractiveness_ValeriaCantu-VF.xlsx")
df2
## Año IED_Flujos Exportaciones Empleo Educación Salario_Diario Innovación
## 1 1997 12145.60 9087.616 NA 7.197968 24.30 11.30141
## 2 1998 8373.50 9875.065 NA 7.311587 31.91 11.37221
## 3 1999 13960.32 10990.013 NA 7.427924 31.91 12.45897
## 4 2000 18248.69 12482.963 97.83000 7.560000 35.12 13.14556
## 5 2001 30057.18 11300.439 97.36000 7.678270 37.57 13.46812
## 6 2002 24099.21 11923.099 97.66000 7.803418 39.74 12.79893
## 7 2003 18249.97 13156.000 97.06000 7.925400 41.53 11.81075
## 8 2004 25015.57 13573.131 96.48000 8.043895 43.30 12.60841
## 9 2005 25795.82 16465.810 97.17000 8.140000 45.24 13.41442
## 10 2006 21232.54 17485.926 96.53000 8.259136 47.05 14.23137
## 11 2007 32393.33 19103.854 96.60000 8.357959 48.88 15.04295
## 12 2008 29502.46 16924.758 95.68000 8.456868 50.84 14.81882
## 13 2009 17849.95 19702.630 95.20264 8.555896 53.19 12.59250
## 14 2010 27189.28 22673.141 95.06220 8.630000 55.77 12.69477
## 15 2011 25632.52 24333.016 95.49284 8.754465 58.06 12.09530
## 16 2012 21769.32 26297.978 95.52654 8.853993 60.75 13.02609
## 17 2013 48354.42 27687.570 95.74676 8.953656 63.12 13.21576
## 18 2014 30351.25 31676.782 96.23509 9.053426 65.58 13.65014
## 19 2015 35943.75 29959.941 96.04487 9.153300 70.10 15.11449
## 20 2016 31188.98 31375.063 96.62297 9.253337 73.04 14.39837
## 21 2017 34017.05 33322.621 96.85275 9.353542 88.36 14.04673
## 22 2018 34100.43 35341.901 96.64163 9.453819 88.36 13.24773
## 23 2019 34577.16 36414.734 97.09482 9.578832 102.68 12.69960
## 24 2020 28205.89 41077.343 96.20963 NA 123.22 11.27991
## 25 2021 31553.52 44914.781 96.48553 NA 141.70 NA
## 26 2022 36215.37 46477.585 97.23665 NA 172.87 NA
## Inseguridad_Robo Inseguridad_Homicidio Tipo_de_Cambio Densidad_Carretera
## 1 266.5065 14.554142 8.0640 0.05205217
## 2 314.7762 14.319399 9.9395 0.05295475
## 3 272.8936 12.641071 9.5222 0.05501698
## 4 216.9808 10.857846 9.5997 0.05522774
## 5 214.5269 10.249509 9.1692 0.05646070
## 6 197.8004 9.938719 10.3613 0.05758828
## 7 183.2161 9.809879 11.1998 0.05957258
## 8 146.2782 8.915908 11.2183 0.05952376
## 9 136.9431 9.223579 10.7109 0.06245141
## 10 135.5928 9.598370 10.8755 0.06279554
## 11 145.9200 8.036682 10.9043 0.06473967
## 12 158.1701 12.518373 13.7738 0.06681259
## 13 175.7748 17.462500 13.0437 0.06931313
## 14 201.9388 22.432707 12.3817 0.07045701
## 15 212.6070 23.418661 13.9787 0.07196232
## 16 190.2792 22.087532 12.9880 0.07443639
## 17 185.5649 19.735505 13.0652 0.07550950
## 18 154.4111 16.928367 14.7348 0.07902716
## 19 180.4412 17.366060 17.3398 0.07982040
## 20 160.5666 20.308040 20.6640 0.08372092
## 21 230.4330 26.224818 19.7354 0.08904103
## 22 184.2470 29.592026 19.6566 0.09020259
## 23 173.4478 29.207254 18.8727 0.08915048
## 24 133.8954 28.982413 19.9352 0.08972318
## 25 127.1289 27.891485 20.5157 0.08990085
## 26 120.4935 NA 19.4143 0.09009684
## Densidad_Población CO2_Emisiones PIB_Per_Cápita INPC
## 1 47.43650 3.675330 127570.1 33.27987
## 2 48.76163 3.853045 126738.8 39.47297
## 3 49.48089 3.686918 129164.7 44.33552
## 4 50.57930 3.874147 130874.9 48.30767
## 5 51.27675 3.811393 128083.4 50.43490
## 6 51.95311 3.824969 128205.9 53.30993
## 7 52.61469 3.950941 128737.9 55.42981
## 8 53.27109 3.983827 132563.5 58.30709
## 9 54.78357 4.098802 132941.1 60.25031
## 10 55.44476 4.194184 135894.9 62.69242
## 11 56.17259 4.220763 137795.7 65.04906
## 12 56.96037 4.189729 135176.0 69.29555
## 13 57.73272 4.037596 131233.0 71.77186
## 14 58.45062 4.113213 134991.7 74.93095
## 15 59.15479 4.190992 138891.9 77.79239
## 16 59.84807 4.202416 141530.2 80.56824
## 17 59.48988 4.056057 144112.0 83.77006
## 18 60.17382 3.892358 147277.4 87.18898
## 19 60.86454 3.925371 149433.5 89.04682
## 20 61.56528 3.894980 152275.4 92.03903
## 21 62.27654 3.838990 153235.7 98.27288
## 22 63.11216 3.649327 153133.8 99.90910
## 23 63.90002 3.591678 150233.1 105.93400
## 24 64.59071 NA 142609.3 109.27100
## 25 65.15865 NA 142772.0 117.30800
## 26 65.59724 NA 146826.7 126.47800
#Here we fill all NA with median
columnas_a_llenar <- c("Empleo", "CO2_Emisiones", "Innovación", "Inseguridad_Homicidio", "Educación")
df3<- df2 %>%
mutate_at(vars(all_of(columnas_a_llenar)), ~ ifelse(is.na(.), median(., na.rm = TRUE), .))
sum(is.na(df3))
## [1] 0
df3
## Año IED_Flujos Exportaciones Empleo Educación Salario_Diario Innovación
## 1 1997 12145.60 9087.616 96.53000 7.197968 24.30 11.30141
## 2 1998 8373.50 9875.065 96.53000 7.311587 31.91 11.37221
## 3 1999 13960.32 10990.013 96.53000 7.427924 31.91 12.45897
## 4 2000 18248.69 12482.963 97.83000 7.560000 35.12 13.14556
## 5 2001 30057.18 11300.439 97.36000 7.678270 37.57 13.46812
## 6 2002 24099.21 11923.099 97.66000 7.803418 39.74 12.79893
## 7 2003 18249.97 13156.000 97.06000 7.925400 41.53 11.81075
## 8 2004 25015.57 13573.131 96.48000 8.043895 43.30 12.60841
## 9 2005 25795.82 16465.810 97.17000 8.140000 45.24 13.41442
## 10 2006 21232.54 17485.926 96.53000 8.259136 47.05 14.23137
## 11 2007 32393.33 19103.854 96.60000 8.357959 48.88 15.04295
## 12 2008 29502.46 16924.758 95.68000 8.456868 50.84 14.81882
## 13 2009 17849.95 19702.630 95.20264 8.555896 53.19 12.59250
## 14 2010 27189.28 22673.141 95.06220 8.630000 55.77 12.69477
## 15 2011 25632.52 24333.016 95.49284 8.754465 58.06 12.09530
## 16 2012 21769.32 26297.978 95.52654 8.853993 60.75 13.02609
## 17 2013 48354.42 27687.570 95.74676 8.953656 63.12 13.21576
## 18 2014 30351.25 31676.782 96.23509 9.053426 65.58 13.65014
## 19 2015 35943.75 29959.941 96.04487 9.153300 70.10 15.11449
## 20 2016 31188.98 31375.063 96.62297 9.253337 73.04 14.39837
## 21 2017 34017.05 33322.621 96.85275 9.353542 88.36 14.04673
## 22 2018 34100.43 35341.901 96.64163 9.453819 88.36 13.24773
## 23 2019 34577.16 36414.734 97.09482 9.578832 102.68 12.69960
## 24 2020 28205.89 41077.343 96.20963 8.456868 123.22 11.27991
## 25 2021 31553.52 44914.781 96.48553 8.456868 141.70 13.08583
## 26 2022 36215.37 46477.585 97.23665 8.456868 172.87 13.08583
## Inseguridad_Robo Inseguridad_Homicidio Tipo_de_Cambio Densidad_Carretera
## 1 266.5065 14.554142 8.0640 0.05205217
## 2 314.7762 14.319399 9.9395 0.05295475
## 3 272.8936 12.641071 9.5222 0.05501698
## 4 216.9808 10.857846 9.5997 0.05522774
## 5 214.5269 10.249509 9.1692 0.05646070
## 6 197.8004 9.938719 10.3613 0.05758828
## 7 183.2161 9.809879 11.1998 0.05957258
## 8 146.2782 8.915908 11.2183 0.05952376
## 9 136.9431 9.223579 10.7109 0.06245141
## 10 135.5928 9.598370 10.8755 0.06279554
## 11 145.9200 8.036682 10.9043 0.06473967
## 12 158.1701 12.518373 13.7738 0.06681259
## 13 175.7748 17.462500 13.0437 0.06931313
## 14 201.9388 22.432707 12.3817 0.07045701
## 15 212.6070 23.418661 13.9787 0.07196232
## 16 190.2792 22.087532 12.9880 0.07443639
## 17 185.5649 19.735505 13.0652 0.07550950
## 18 154.4111 16.928367 14.7348 0.07902716
## 19 180.4412 17.366060 17.3398 0.07982040
## 20 160.5666 20.308040 20.6640 0.08372092
## 21 230.4330 26.224818 19.7354 0.08904103
## 22 184.2470 29.592026 19.6566 0.09020259
## 23 173.4478 29.207254 18.8727 0.08915048
## 24 133.8954 28.982413 19.9352 0.08972318
## 25 127.1289 27.891485 20.5157 0.08990085
## 26 120.4935 16.928367 19.4143 0.09009684
## Densidad_Población CO2_Emisiones PIB_Per_Cápita INPC
## 1 47.43650 3.675330 127570.1 33.27987
## 2 48.76163 3.853045 126738.8 39.47297
## 3 49.48089 3.686918 129164.7 44.33552
## 4 50.57930 3.874147 130874.9 48.30767
## 5 51.27675 3.811393 128083.4 50.43490
## 6 51.95311 3.824969 128205.9 53.30993
## 7 52.61469 3.950941 128737.9 55.42981
## 8 53.27109 3.983827 132563.5 58.30709
## 9 54.78357 4.098802 132941.1 60.25031
## 10 55.44476 4.194184 135894.9 62.69242
## 11 56.17259 4.220763 137795.7 65.04906
## 12 56.96037 4.189729 135176.0 69.29555
## 13 57.73272 4.037596 131233.0 71.77186
## 14 58.45062 4.113213 134991.7 74.93095
## 15 59.15479 4.190992 138891.9 77.79239
## 16 59.84807 4.202416 141530.2 80.56824
## 17 59.48988 4.056057 144112.0 83.77006
## 18 60.17382 3.892358 147277.4 87.18898
## 19 60.86454 3.925371 149433.5 89.04682
## 20 61.56528 3.894980 152275.4 92.03903
## 21 62.27654 3.838990 153235.7 98.27288
## 22 63.11216 3.649327 153133.8 99.90910
## 23 63.90002 3.591678 150233.1 105.93400
## 24 64.59071 3.925371 142609.3 109.27100
## 25 65.15865 3.925371 142772.0 117.30800
## 26 65.59724 3.925371 146826.7 126.47800
#IV. Data and Methodology
#Plot the variable IED_Flujos using a time series format:
# Convertir la columna "Date" en una serie de tiempo
timeseries_df <- ts(df$IED_Flujos, start = c(1999, 1), frequency = 4)
# Trazar la serie de tiempo
plot(timeseries_df, xlab = "Date", ylab = "IED_Flujos", main = "Serie de Tiempo de IED_Flujos")

#i) decompose the time series data into trend, seasonal, and random components. Briefly, describe the decomposition time series plot. Do the time series data show a trend? Do the time series data show seasonality?
# 2) Decompose a time series
# Decomposition: observed + trend + seasonal + random
# observed: data observations
# trend: increasing / decreasing value of data observations
# seasonality: repeating short-term cycle in time series
# noise: random variation in time series
DecDF<-ts(df$IED_Flujos,frequency=4,start=c(1999,1),end=c(2023,4))
df_decompose<-decompose(DecDF)
plot(df_decompose)

#Briefly, describe the decomposition time series plot. Do the time series data show a trend? Do the time series data show seasonality?
#there is no trend in the decomposition as we can´t see one positive or negative, but in there is seasonality because we can see how every year in the beggining is in its lowest and its start going up until reaching its highest aprox at the middle of the year and it again goes down by the end to start again going uo for the next year.
#ii) detect the presence of stationary
#Detect if the time series data is stationary.
# checking if variables are stationary or non-stationary
suppressWarnings({
library(tseries)
adf.test(DecDF) # non-stationary (p-value > 0.05)
})
##
## Augmented Dickey-Fuller Test
##
## data: DecDF
## Dickey-Fuller = -3.2829, Lag order = 4, p-value = 0.07798
## alternative hypothesis: stationary
#iii) detect the presence of serial autocorrelation
acf(df$IED_Flujos) #there is autocorrelation

#V. Time Series Regression Analysis
#a. Time Series Model 1
#Estimate 2 different time series regression models. You might want to consider ARMA (p,q) and / or ARIMA (p,d,q).
### ARMA
# time series modeling
summary(DS_ARMA<-arma(log(df$IED_Flujos),order=c(1,1)))
##
## Call:
## arma(x = log(df$IED_Flujos), order = c(1, 1))
##
## Model:
## ARMA(1,1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.41234 -0.35244 -0.00571 0.27709 1.50760
##
## Coefficient(s):
## Estimate Std. Error t value Pr(>|t|)
## ar1 -0.2977 0.2271 -1.311 0.18987
## ma1 0.5174 0.1999 2.588 0.00964 **
## intercept 11.3157 1.9793 5.717 1.08e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Fit:
## sigma^2 estimated as 0.2688, Conditional Sum-of-Squares = 25.26, AIC = 152.3
plot(DS_ARMA)



DS_IED_Flujos<-exp(DS_ARMA$fitted.values)
plot(DS_IED_Flujos)

# model evaluation
# Testing serial autocorrelation in regression residuals
# Ho: There is no serial autocorrelation
# Ha: There is serial autocorrelation
suppressWarnings({
DS_ARMA_residuals<-DS_ARMA$residuals
Box.test(DS_ARMA_residuals,lag=5,type="Ljung-Box") # Reject the Ho. P-value is < 0.05 indicating that ARMA Model does show residual serial autocorrelation.
adf.test(na.omit(DS_IED_Flujos))
})
##
## Augmented Dickey-Fuller Test
##
## data: na.omit(DS_IED_Flujos)
## Dickey-Fuller = -4.1127, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary
### ARIMA
### plotting log-transformation and differences
suppressWarnings({
plot(df$Date,df$IED_Flujos, type="l",col="blue", lwd=2, xlab ="Date",ylab ="IED_Flujos", main = "IED_Flujos")
plot(df$Año,log(df$IED_Flujos), type="l",col="blue", lwd=2, xlab ="Año",ylab ="log(IED_Flujos)", main = "IED_Flujos")
plot(diff(log(df$IED_Flujos)),type="l",ylab="first order difference",main = "Diff - IED_Flujos")
adf.test(log(df$IED_Flujos))
adf.test(diff(log(df$IED_Flujos)))
})



##
## Augmented Dickey-Fuller Test
##
## data: diff(log(df$IED_Flujos))
## Dickey-Fuller = -5.9411, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary
# forecast (ARIMA)
suppressWarnings({
DS_ARIMA <- Arima(df$IED_Flujos,order=c(1,1,1))
print(DS_ARIMA)
plot(DS_ARIMA$residuals,main="ARIMA(1,1,1) - IED_FLUJO")
acf(DS_ARIMA$residuals,main="ACF - ARIMA (1,1,1)") # ACF plot displays weak or no autocorrelation.
Box.test(DS_ARIMA$residuals,lag=1,type="Ljung-Box") # P-value is > 0.05 indicating that ARMA model does not show residual serial autocorrelation.
adf.test(DS_ARIMA$residuals) # ADF test suggest that ARMA residuals are stationary since p-value is < 0.05.
})
## Series: df$IED_Flujos
## ARIMA(1,1,1)
##
## Coefficients:
## ar1 ma1
## -0.0520 -0.9399
## s.e. 0.1072 0.0323
##
## sigma^2 = 15868763: log likelihood = -922.46
## AIC=1850.91 AICc=1851.17 BIC=1858.57


##
## Augmented Dickey-Fuller Test
##
## data: DS_ARIMA$residuals
## Dickey-Fuller = -4.3162, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary
#Based on diagnostic tests, compare the 2 estimated time series regression models, and select the results that you consider might generate the best forecast.
#evaluating both models selected (ARMA and ARIMA) and evaluating all the factors to choose the best model first of all I check the AIC which for the best model it needs to be lowest AIC of all the models, in the ARMA the AIC was 152.3 and in the ARIMA was 1850.91 , so in the factor of AIC the best model is ARMA, then following this the next factor is about the residuals in the models, for both models the residuals are small so is a great sign because it is better to select a model with small amount of residuals, for the ARMA in the Augmented Dickey-Fuller Test had a p value of 0.01 wich suggests non stationary, in the ARIMA for the same model the p value is 0.01 so there is some stationary, the in the ARMA model checking the Box-Ljung test the p value gave 0.01 indicating no serial correlation and in ARIMA the p value was 0.5955 suggesting also no correlation
#checking the 3 factors in order of importance: stationary, serial correlation and at last AIC this were the results:
#ARMA:
# stationary: non stationary
# serial correlation: no serial correlation
#AIC: 152.3
#ARIMA:
# stationary: stationary
# serial correlation: no serial correlation
#AIC: 1850.91
#the model I select is ARMA because the non stationary the 3 factors are better that the ARIMA being non stationary not having seriak correlation and having a lower AIC
### FORECAST - ARMA
#By using the selected model, make a forecast for the next 5 periods. In doing so, include a time series plot showing your forecast.
# forecast (ARMA)
suppressWarnings({
DS_ARMA_forecast<-forecast(DS_IED_Flujos,h=5)
DS_ARMA_forecast
plot(DS_ARMA_forecast)
autoplot(DS_ARMA_forecast)
})


colnames(df3)
## [1] "Año" "IED_Flujos" "Exportaciones"
## [4] "Empleo" "Educación" "Salario_Diario"
## [7] "Innovación" "Inseguridad_Robo" "Inseguridad_Homicidio"
## [10] "Tipo_de_Cambio" "Densidad_Carretera" "Densidad_Población"
## [13] "CO2_Emisiones" "PIB_Per_Cápita" "INPC"
#b. Time Series Model 2
#In describing the above relationships, please include a time series plot that displays the selected variables’ performance over the time period.
# plotting time series data
par(mfrow=c(2,3))
plot(df3$Año,df3$IED_Flujos,type="l",col="blue",lwd=2,xlab="Date",ylab="IED_flujos",main="IED_Flujos")
plot(df3$Año,df3$Exportaciones,type="l",col="blue",lwd=2,xlab="Date",ylab="Exportaciones",main="Exportaciones")
plot(df3$Año,df3$Empleo,type="l",col="blue",lwd=2,xlab="Date",ylab="Empleo",main="Empleo")
plot(df3$Año,df3$Innovacion,type="l",col="blue",lwd=2,xlab="Date",ylab="Innovacion",main="Innovacion")

# Crear una serie de tiempo anual
ts_obj <- ts(df3$Año, start = min(df3$Año), frequency = 1)
# Verificar la serie de tiempo
print(ts_obj)
## Time Series:
## Start = 1997
## End = 2022
## Frequency = 1
## [1] 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011
## [16] 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022
#Estimate a VAR_Model that includes at least 1 explanatory factor that might affect the dependent variable IED_Flujos.
### converting to time series format
IED_Flujos<-ts(df3$IED_Flujos,start=c(1997,1),end=c(2022,12),frequency=1)
Exportaciones<-ts(df3$Exportaciones,start=c(1997,1),end=c(2022,12),frequency=1)
Empleo<-ts(df3$Empleo,start=c(1997,1),end=c(2022,12),frequency=1)
Innovación<-ts(df3$Innovación,start=c(1997,1),end=c(2022,12),frequency=1)
VAR_ts<-cbind(IED_Flujos,Exportaciones,Empleo, Innovación)
colnames(VAR_ts)<-cbind("IED_Flujos","Exportaciones","Empleo","Innovación")
# this line will automatically generate the preferred lag order based on multiple iterations of the AIC.
library(vars)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:olsrr':
##
## cement
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: strucchange
## Loading required package: sandwich
##
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
##
## boundary
## Loading required package: urca
##
## Attaching package: 'vars'
## The following object is masked from 'package:dlookr':
##
## normality
lag_selection<-VARselect(VAR_ts,lag.max=5,type="const", season=4)
lag_selection$selection
## AIC(n) HQ(n) SC(n) FPE(n)
## 5 5 5 5
lag_selection$criteria
## 1 2 3 4 5
## AIC(n) 3.415700e+01 3.396112e+01 3.354808e+01 3.183179e+01 2.770452e+01
## HQ(n) 3.464285e+01 3.468989e+01 3.451978e+01 3.304641e+01 2.916207e+01
## SC(n) 3.562274e+01 3.615972e+01 3.647956e+01 3.549613e+01 3.210173e+01
## FPE(n) 7.128606e+14 6.545347e+14 5.508777e+14 1.585571e+14 6.405406e+12
# We estimate the VAR model. The p option refers to the number of lags used.
VAR_model1<-VAR(VAR_ts,p=1,type="const",season=4)
# rather than interpreting the coefficients of the VAR, we focus on interpreting the results of the applications.
# we have estimated coefficients for each lag and each equation in the VAR.
# each equation represents an equation in the VAR system.
summary(VAR_model1) ### too many variables in model specification
##
## VAR Estimation Results:
## =========================
## Endogenous variables: IED_Flujos, Exportaciones, Empleo, Innovación
## Deterministic variables: const
## Sample size: 36
## Log Likelihood: -782.843
## Roots of the characteristic polynomial:
## 0.7684 0.7684 0.5161 0.03958
## Call:
## VAR(y = VAR_ts, p = 1, type = "const", season = 4L)
##
##
## Estimation results for equation IED_Flujos:
## ===========================================
## IED_Flujos = IED_Flujos.l1 + Exportaciones.l1 + Empleo.l1 + Innovación.l1 + const + sd1 + sd2 + sd3
##
## Estimate Std. Error t value Pr(>|t|)
## IED_Flujos.l1 -5.540e-02 2.491e-01 -0.222 0.8256
## Exportaciones.l1 3.157e-01 1.697e-01 1.860 0.0734 .
## Empleo.l1 -1.271e+03 1.835e+03 -0.693 0.4943
## Innovación.l1 3.335e+03 1.550e+03 2.151 0.0402 *
## const 9.978e+04 1.784e+05 0.559 0.5804
## sd1 4.251e+03 3.524e+03 1.206 0.2378
## sd2 1.987e+03 3.467e+03 0.573 0.5712
## sd3 1.503e+03 3.503e+03 0.429 0.6712
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## Residual standard error: 7302 on 28 degrees of freedom
## Multiple R-Squared: 0.4225, Adjusted R-squared: 0.2781
## F-statistic: 2.926 on 7 and 28 DF, p-value: 0.01982
##
##
## Estimation results for equation Exportaciones:
## ==============================================
## Exportaciones = IED_Flujos.l1 + Exportaciones.l1 + Empleo.l1 + Innovación.l1 + const + sd1 + sd2 + sd3
##
## Estimate Std. Error t value Pr(>|t|)
## IED_Flujos.l1 1.336e-01 2.112e-01 0.632 0.5323
## Exportaciones.l1 6.977e-01 1.439e-01 4.850 4.18e-05 ***
## Empleo.l1 -3.028e+03 1.556e+03 -1.945 0.0618 .
## Innovación.l1 -1.839e+02 1.314e+03 -0.140 0.8897
## const 2.981e+05 1.513e+05 1.970 0.0588 .
## sd1 9.909e+02 2.988e+03 0.332 0.7427
## sd2 6.443e+02 2.940e+03 0.219 0.8281
## sd3 -3.358e+03 2.970e+03 -1.131 0.2678
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## Residual standard error: 6191 on 28 degrees of freedom
## Multiple R-Squared: 0.733, Adjusted R-squared: 0.6662
## F-statistic: 10.98 on 7 and 28 DF, p-value: 1.353e-06
##
##
## Estimation results for equation Empleo:
## =======================================
## Empleo = IED_Flujos.l1 + Exportaciones.l1 + Empleo.l1 + Innovación.l1 + const + sd1 + sd2 + sd3
##
## Estimate Std. Error t value Pr(>|t|)
## IED_Flujos.l1 -5.313e-07 1.947e-05 -0.027 0.9784
## Exportaciones.l1 -4.003e-06 1.326e-05 -0.302 0.7649
## Empleo.l1 6.591e-01 1.434e-01 4.595 8.38e-05 ***
## Innovación.l1 -3.651e-02 1.211e-01 -0.301 0.7653
## const 3.351e+01 1.394e+01 2.404 0.0231 *
## sd1 5.714e-02 2.754e-01 0.207 0.8371
## sd2 1.945e-01 2.709e-01 0.718 0.4787
## sd3 7.783e-02 2.737e-01 0.284 0.7783
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## Residual standard error: 0.5706 on 28 degrees of freedom
## Multiple R-Squared: 0.4685, Adjusted R-squared: 0.3356
## F-statistic: 3.525 on 7 and 28 DF, p-value: 0.007721
##
##
## Estimation results for equation Innovación:
## ===========================================
## Innovación = IED_Flujos.l1 + Exportaciones.l1 + Empleo.l1 + Innovación.l1 + const + sd1 + sd2 + sd3
##
## Estimate Std. Error t value Pr(>|t|)
## IED_Flujos.l1 -3.395e-05 3.035e-05 -1.119 0.272816
## Exportaciones.l1 1.927e-06 2.067e-05 0.093 0.926404
## Empleo.l1 -2.002e-01 2.236e-01 -0.895 0.378340
## Innovación.l1 7.903e-01 1.889e-01 4.185 0.000256 ***
## const 2.297e+01 2.174e+01 1.057 0.299719
## sd1 3.785e-02 4.294e-01 0.088 0.930382
## sd2 7.054e-02 4.224e-01 0.167 0.868570
## sd3 5.525e-03 4.268e-01 0.013 0.989762
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## Residual standard error: 0.8896 on 28 degrees of freedom
## Multiple R-Squared: 0.4381, Adjusted R-squared: 0.2976
## F-statistic: 3.119 on 7 and 28 DF, p-value: 0.01457
##
##
##
## Covariance matrix of residuals:
## IED_Flujos Exportaciones Empleo Innovación
## IED_Flujos 5.332e+07 2.584e+07 672.8743 3284.6421
## Exportaciones 2.584e+07 3.833e+07 648.0772 1656.3661
## Empleo 6.729e+02 6.481e+02 0.3255 0.1052
## Innovación 3.285e+03 1.656e+03 0.1052 0.7914
##
## Correlation matrix of residuals:
## IED_Flujos Exportaciones Empleo Innovación
## IED_Flujos 1.0000 0.5717 0.1615 0.5057
## Exportaciones 0.5717 1.0000 0.1835 0.3007
## Empleo 0.1615 0.1835 1.0000 0.2073
## Innovación 0.5057 0.3007 0.2073 1.0000
VAR_model1_residuals<-data.frame(residuals(VAR_model1))
adf.test(VAR_model1_residuals$IED_Flujos) #P value = 0.0310 < 0.05 so it is stationary
##
## Augmented Dickey-Fuller Test
##
## data: VAR_model1_residuals$IED_Flujos
## Dickey-Fuller = -3.81, Lag order = 3, p-value = 0.03106
## alternative hypothesis: stationary
Box.test(VAR_model1_residuals$IED_Flujos,lag=1,type="Ljung-Box") #p value = 0.5723 > 0.05 so there is no serial correlation
##
## Box-Ljung test
##
## data: VAR_model1_residuals$IED_Flujos
## X-squared = 0.319, df = 1, p-value = 0.5722
# Detect if the estimated VAR_Model residuals are stationary.
#P value = 0.0310 < 0.05 so the var model is stationary
#Detect if the estimated VAR_Model residuals show serial autocorrelation.
#p value = 0.5723 > 0.05 so there is no serial correlation in the model
#Based on the regression results and diagnostic tests, select the VAR_Model that you consider might generate the best forecast.
#in general the model is great so it was choosen because it is stationary, without serial autocorrelation, has a good aic value as well.
#Briefly interpret the regression results. That is, is there a statistically significant relationship between the explanatory variable(s) and the main dependent variable?
#The explanatory variables in this equation include "IED_Flujos.l1," "Exportaciones.l1," "Empleo.l1," "Innovacion.l1," and constant terms. The "t value" and "Pr(>|t|)" columns give information about the statistical significance of each coefficient."IED_Flujos.l1" has a coefficient of approximately -0.05632, but its p-value is 0.8228. This suggests that there is no statistically significant relationship between the lagged "IED_Flujos" variable and the current "IED_Flujos" value, then "Exportaciones.l1" has a coefficient of approximately 0.3158, and its p-value is 0.0732(p-value > 0.05), indicating almos statistically significant relationship between the lagged "Exportaciones" variable and the current "IED_Flujos" value, but it dint complete the factor to being significant, then "Empleo.l1" has a coefficient of approximately -1272, but its p-value is 0.4937. so there is no statistically significant relationship between the lagged "Empleo" variable and the current "IED_Flujos" value, and finally "Innovacion.l1" has a coefficient of approximately 3342, and its p-value is 0.0399, which is less than 0.05. This indicates a statistically significant relationship between the lagged "Innovacion" variable and the current "IED_Flujos" value.
#Is there an instantaneous causality between IED_Flujos and the selected explanatory variables? Estimate a Granger Causality Test to either reject or fail to reject the hypothesis of instantaneous causality.
# Granger causality testing each variable against all the others.
# There could be a unidirectional, bidirectional, or no causality relationships between variables.
granger_GE<-causality(VAR_model1,cause="IED_Flujos")
granger_GE
## $Granger
##
## Granger causality H0: IED_Flujos do not Granger-cause Exportaciones
## Empleo Innovación
##
## data: VAR object VAR_model1
## F-Test = 0.76305, df1 = 3, df2 = 112, p-value = 0.5171
##
##
## $Instant
##
## H0: No instantaneous causality between: IED_Flujos and Exportaciones
## Empleo Innovación
##
## data: VAR object VAR_model1
## Chi-squared = 11.16, df = 3, p-value = 0.01089
#since we have a p-value < 0.05 (with 0.01085) we fail to reject the null hypothesis of no instantaneous causality.
#Based on the selected VAR_Model, forecast the increasing / decreasing trend of FDI inflows in Mexico for the next 5 periods. Display the forecast in a time series plot.
# Forecast the next 5 periods
forecasted_values <- forecast(VAR_model1, h = 5)
# Plot the forecasted values
plot(forecasted_values, main = "IED_flujos Forecast for the Next 5 Periods")
