R Notebook

Paola Nieto (20150967)

Descargue y limpie las siguientes bases de datos: https://en.wikipedia.org/wiki/Democracy_Index *Use la tabla “Democracy Index by country 2019”

# Read the economic-freedom .xlsx workbook straight from its GitHub URL
# using rio's generic importer.
library(rio)
linkToData <- "https://github.com/jcgcjuan/Magallanes-Clases-/raw/master/Data%20EconoFreedom.xlsx"
data1 <- import(file = linkToData)

# Scrape one HTML table from the Wikipedia "Democracy Index" page.
# The XPath selects the second <table> directly under the <div> child of the
# element whose id is "mw-content-text".
# NOTE(review): the table position depends on the live page layout -- confirm
# it still points at the intended table before re-running.
library(htmltab)
linkPage <- "https://en.wikipedia.org/wiki/Democracy_Index"
linkPath <- '//*[@id="mw-content-text"]/div/table[2]'
data2 <- htmltab(doc = linkPage, which = linkPath)

 De la primera base de datos extraiga solo las variables Country, Property Rights, Judical Effectiveness y Government Integrity. Estas serán las variables independientes.

# Keep only the country identifier and the three predictors.
# Columns are selected by name rather than deleted by position, so a change in
# the spreadsheet layout raises an error instead of silently keeping the wrong
# columns. ("Judical" is spelled as it appears in the source data.)
keep_cols <- c(
  "Country Name", "Property Rights",
  "Judical Effectiveness", "Government Integrity"
)
missing_cols <- setdiff(keep_cols, names(data1))
if (length(missing_cols) > 0) {
  stop(
    "Columns not found in data1: ", paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}
data1 <- data1[, keep_cols, drop = FALSE]

# Strip leading/trailing horizontal and vertical whitespace from every cell.
# trimws() returns character vectors, so every column is character after this.
data1[] <- lapply(data1, trimws, whitespace = "[\\h\\v]")
names(data1)[names(data1) == "Country Name"] <- "Country"

# Convert every column except the first (Country) back to numeric. Cells that
# do not hold numeric text become NA and as.numeric() emits a coercion warning.
data1[-1] <- lapply(data1[-1], as.numeric)

## Warning in lapply(data1[, -c(1)], as.numeric): NAs introduced by coercion

## Warning in lapply(data1[, -c(1)], as.numeric): NAs introduced by coercion

## Warning in lapply(data1[, -c(1)], as.numeric): NAs introduced by coercion

# Print the structure of the cleaned data frame (dimensions, column names, types).
str(data1)

## 'data.frame':    186 obs. of  4 variables:
##  $ Country              : chr  "Afghanistan" "Albania" "Algeria" "Angola" ...
##  $ Property Rights      : num  19.6 54.8 31.6 35.9 47.8 57.2 79.1 84.2 59.1 42.2 ...
##  $ Judical Effectiveness: num  29.6 30.6 36.2 26.6 44.5 46.3 86.5 71.3 53.1 46.9 ...
##  $ Government Integrity : num  25.2 40.4 28.9 20.5 33.5 38.6 79.9 77.4 44.7 43.7 ...

 De la segunda base de datos extraiga las variables Country y Score. Esta última será su variable dependiente.

library(readr)
library(magrittr)
library(stringr)

# Drop column 1 and columns 4 through 11 of the scraped table.
data2[, c(1, 4:11)] <- NULL

# Clean the headers: keep the text before the first ">>" in each column name,
# then remove every whitespace character from it.
header_parts <- str_split(names(data2), ">>", simplify = TRUE)
names(data2) <- gsub("\\s", "", header_parts[, 1])

# Strip leading/trailing horizontal and vertical whitespace from every cell.
data2[, ] <- lapply(data2[, ], trimws, whitespace = "[\\h\\v]")

# Score is scraped as text; convert it to numeric for the regressions.
data2$Score <- as.numeric(data2$Score)
str(data2)

## 'data.frame':    167 obs. of  2 variables:
##  $ Country: chr  "Norway" "Iceland" "Sweden" "New Zealand" ...
##  $ Score  : num  9.87 9.58 9.39 9.26 9.25 9.24 9.22 9.22 9.09 9.03 ...

Realice el merge de ambas bases

# Join both tables on their shared "Country" column. With merge()'s defaults
# only rows whose key appears in both inputs are kept, so countries whose
# names are spelled differently in the two sources are dropped.
data3 <- merge(data1, data2, by = "Country")
str(data3)

## 'data.frame':    157 obs. of  5 variables:
##  $ Country              : chr  "Afghanistan" "Albania" "Algeria" "Angola" ...
##  $ Property Rights      : num  19.6 54.8 31.6 35.9 47.8 57.2 79.1 84.2 59.1 63.5 ...
##  $ Judical Effectiveness: num  29.6 30.6 36.2 26.6 44.5 46.3 86.5 71.3 53.1 50.7 ...
##  $ Government Integrity : num  25.2 40.4 28.9 20.5 33.5 38.6 79.9 77.4 44.7 53.6 ...
##  $ Score                : num  2.85 5.89 4.01 3.72 7.02 5.54 9.09 8.29 2.75 2.55 ...

Desarrolle 3 modelos:  El primero con solo Property Rights como variable independiente.

# Short aliases for the merged columns, assigned by position.
names(data3) <- c("Country", "PRORI", "JUEF", "GOIN", "Score")

# Regression

# Nested model specifications: each one adds a predictor to the previous one.
modelo1 <- Score ~ PRORI
modelo2 <- Score ~ PRORI + JUEF
modelo3 <- Score ~ PRORI + JUEF + GOIN

library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

# Fit model 1 (Score ~ PRORI) with lm() and print a text table,
# listing the intercept before the slope coefficients.
reg1 <- lm(formula = modelo1, data = data3)
stargazer(reg1, type = "text", intercept.bottom = FALSE)

## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                                Score           
## -----------------------------------------------
## Constant                     1.320***          
##                               (0.350)          
##                                                
## PRORI                        0.079***          
##                               (0.006)          
##                                                
## -----------------------------------------------
## Observations                    157            
## R2                             0.514           
## Adjusted R2                    0.511           
## Residual Std. Error      1.554 (df = 155)      
## F Statistic          163.796*** (df = 1; 155)  
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

# Fit model 2 (Score ~ PRORI + JUEF) with lm() and print a text table,
# listing the intercept before the slope coefficients.
reg2 <- lm(formula = modelo2, data = data3)
stargazer(reg2, type = "text", intercept.bottom = FALSE)

## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                                Score           
## -----------------------------------------------
## Constant                     1.454***          
##                               (0.347)          
##                                                
## PRORI                        0.105***          
##                               (0.012)          
##                                                
## JUEF                         -0.034***         
##                               (0.013)          
##                                                
## -----------------------------------------------
## Observations                    157            
## R2                             0.535           
## Adjusted R2                    0.529           
## Residual Std. Error      1.525 (df = 154)      
## F Statistic           88.586*** (df = 2; 154)  
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

# Check whether the residual error decreases significantly from reg1 to reg2.

tanova <- anova(reg1, reg2)
stargazer(
  tanova,
  type = "text",
  summary = FALSE,
  title = "Table de Análisis de Varianza"
)

## 
## Table de Análisis de Varianza
## ===========================================
##   Res.Df   RSS   Df Sum of Sq   F   Pr(> F)
## -------------------------------------------
## 1  155   374.434                           
## 2  154   358.116 1   16.318   7.017  0.009 
## -------------------------------------------

# The ANOVA null hypothesis (H0) is that the models (or means) do not differ.
# For reg1 vs reg2 the p-value is 0.009 (below 0.05), so H0 is rejected.

# Fit model 3 (Score ~ PRORI + JUEF + GOIN) with lm() and print a text table,
# listing the intercept before the slope coefficients.
reg3 <- lm(formula = modelo3, data = data3)
stargazer(reg3, type = "text", intercept.bottom = FALSE)

## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                                Score           
## -----------------------------------------------
## Constant                     1.648***          
##                               (0.372)          
##                                                
## PRORI                        0.096***          
##                               (0.013)          
##                                                
## JUEF                         -0.047***         
##                               (0.016)          
##                                                
## GOIN                           0.022           
##                               (0.015)          
##                                                
## -----------------------------------------------
## Observations                    157            
## R2                             0.541           
## Adjusted R2                    0.532           
## Residual Std. Error      1.520 (df = 153)      
## F Statistic           60.119*** (df = 3; 153)  
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

# Compare reg2 against reg3 to see whether adding GOIN reduces the error.

tanova2 <- anova(reg2, reg3)
stargazer(
  tanova2,
  type = "text",
  summary = FALSE,
  title = "Table de Análisis de Varianza 2"
)

## 
## Table de Análisis de Varianza 2
## ===========================================
##   Res.Df   RSS   Df Sum of Sq   F   Pr(> F)
## -------------------------------------------
## 1  154   358.116                           
## 2  153   353.460 1    4.656   2.015  0.158 
## -------------------------------------------

Compare los resultados de R2 de los modelos y, mediante una prueba anova, determine cuál es el mejor modelo. Justifique.

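La comparación de modelos usando la tabla de análisis de varianza propone como hipótesis nula que los modelos no difieren, es decir, que el modelo con más variables no reduce significativamente el error. Al comparar el modelo 1 con el modelo 2, el p-valor es 0.009 (< 0.05), por lo que se rechaza la hipótesis nula: agregar JUEF mejora significativamente el ajuste (el R2 pasa de 0.514 a 0.535). Al comparar el modelo 2 con el modelo 3, el p-valor es 0.158 (> 0.05), por lo que no se rechaza la hipótesis nula: agregar GOIN no reduce significativamente el error, y su coeficiente tampoco es significativo. Aunque el R2 del tercer modelo es ligeramente mayor (0.541 frente a 0.535), el R2 nunca disminuye al añadir variables, por lo que por sí solo no basta para elegir el modelo. Por lo tanto, el mejor modelo es el segundo.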

A partir de su selección del mejor modelo, responda a las siguientes preguntas:  ¿El modelo pasa las pruebas de validez (linealidad, homocedasticidad, normalidad de residuos, multicolinealidad)?  ¿Cuáles son los valores influyentes?  En base a las pruebas anteriores ¿Cree que el modelo es válido? ¿Qué problemas tiene?