Examen-final-regresion--1-.utf8

#SCRIPT GABRIELA VIDAL
library(XML)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)
library(magrittr)

#Junte 3 tablas de datos. Las tablas estan en estos links 
#(en ningun caso necesita los rankings, ni cambie los nombres de 
#pa?ses):

#1)
#Data de gastos en educacion (la data aparece como 'Education Expenditure' - 
#la data se muestra como % del GDP). Quedese s?lo con los datos de antes del 
#2018 pero luego del 2010.
library(htmltab)
link1="https://www.cia.gov/library/publications/resources/the-world-factbook/fields/369rank.html"
linkpath1='//*[@id="rankOrder"]'
edu=htmltab(doc = link1, 
              which = linkpath1, #herramientas de desarrollador
              encoding = "UTF-8")

## No encoding supplied: defaulting to UTF-8.

edu$Rank=NULL
names(edu)<-c("Country","Expenditure","Year")
str(edu)

## 'data.frame':    175 obs. of  3 variables:
##  $ Country    : chr  "Cuba" "Micronesia, Federated States of" "Solomon Islands" "Botswana" ...
##  $ Expenditure: chr  "12.80" "12.50" "9.90" "9.60" ...
##  $ Year       : chr  "2010" "2015" "2010" "2009" ...

edu[,]=lapply(edu[,], trimws,whitespace = "[\\h\\v]") # no blanks
#filtrando los a?os pedidos
edu<-edu %>% filter(Year>2010&Year<2018)
edu[,-c(1)]=lapply(edu[,-c(1)], as.numeric) # a numerico

#2)
#tabla donde est? el "score" de percepcion de corrupcion, que da un menor 
#puntaje donde habria mayor corrupcion,la data se muestra para varios a?os. 
#Qu?dese con la data del 2018 y 2012. Elimine cualquier pa?s con valor algun 
#valor perdido en esos a?os. 


link="https://en.wikipedia.org/wiki/Corruption_Perceptions_Index"
linkpath='//*[@id="mw-content-text"]/div/table[3]'
corru=htmltab(doc = link, 
              which = linkpath, #herramientas de desarrollador
              encoding = "UTF-8")

## Warning: Header dimension doesn't match body dimension

## Warning: Columns [NA] seem to have no data and are removed. Use rm_nodata_cols =
## F to suppress this behavior

names(corru)=str_split(names(corru),">>",simplify = T)[,1]%>%gsub('\\s','',.)

corrupcion=corru[c(2,5,17)]
names(corrupcion)<-c("Country","Corru2018","Corru2012")

corrupcion[,]=lapply(corrupcion[,], trimws,whitespace = "[\\h\\v]") # no blanks
trimws(corrupcion,whitespace = "[\\h\\v]")

## [1] "c(\"New Zealand\", \"Denmark\", \"Finland\", \"Sweden\", \"Switzerland\", \"Singapore\", \"Norway\", \"Netherlands\", \"Luxembourg\", \"Germany\", \"Iceland\", \"Canada\", \"United Kingdom\", \"Australia\", \"Austria\", \"Hong Kong\", \"Belgium\", \"Ireland\", \"Estonia\", \"Japan\", \"Uruguay\", \"United Arab Emirates\", \"France\", \"United States\", \"Bhutan\", \"Chile\", \"Taiwan\", \"Bahamas\", \"Spain\", \"Barbados\", \"Portugal\", \"Qatar\", \"Botswana\", \"Brunei\", \"Israel\", \"Slovenia\", \"Lithuania\", \"Saint Vincent and the Grenadines\", \"South Korea\", \"Poland\", \n\"Cyprus\", \"Cape Verde\", \"Czech Republic\", \"Georgia\", \"Latvia\", \"Costa Rica\", \"Dominica\", \"Saint Lucia\", \"Malta\", \"Rwanda\", \"Grenada\", \"Italy\", \"Saudi Arabia\", \"Malaysia\", \"Namibia\", \"Oman\", \"Mauritius\", \"Slovakia\", \"Jordan\", \"Cuba\", \"Greece\", \"Croatia\", \"São Tomé and Príncipe\", \"Montenegro\", \"Senegal\", \"Belarus\", \"Argentina\", \"Romania\", \"South Africa\", \"Suriname\", \"Hungary\", \"Jamaica\", \"Tunisia\", \"Bulgaria\", \"Solomon Islands\", \"Bahrain\", \"Armenia\", \"Morocco\", \"India\", \"Ghana\", \"Benin\", \"China\", \n\"Burkina Faso\", \"Kuwait\", \"Indonesia\", \"Guyana\", \"Lesotho\", \"Trinidad and Tobago\", \"Serbia\", \"Turkey\", \"East Timor\", \"Sri Lanka\", \"Ecuador\", \"Vietnam\", \"Colombia\", \"Tanzania\", \"Ethiopia\", \"Gambia\", \"Bosnia and Herzegovina\", \"Panama\", \"Kosovo\", \"Thailand\", \"Peru\", \"Egypt\", \"Ivory Coast\", \"Mongolia\", \"North Macedonia\", \"Albania\", \"Brazil\", \"Algeria\", \"Philippines\", \"Zambia\", \"El Salvador\", \"Kazakhstan\", \"Nepal\", \"Sierra Leone\", \"Niger\", \"Pakistan\", \"Moldova\", \"Bolivia\", \"Malawi\", \"Gabon\", \"Djibouti\", \n\"Ukraine\", \"Azerbaijan\", \"Kyrgyzstan\", \"Myanmar\", \"Mexico\", \"Mali\", \"Maldives\", \"Togo\", \"Guinea\", \"Laos\", \"Liberia\", \"Dominican Republic\", \"Paraguay\", \"Russia\", \"Lebanon\", \"Papua New Guinea\", \"Mauritania\", \"Uganda\", \"Kenya\", \"Honduras\", \"Iran\", \"Guatemala\", \"Nigeria\", \"Mozambique\", \"Bangladesh\", \"Angola\", \"Comoros\", \"Uzbekistan\", \"Tajikistan\", \"Central African Republic\", \"Cameroon\", \"Madagascar\", \"Zimbabwe\", \"Eritrea\", \"Nicaragua\", \"Cambodia\", \"Chad\", \"Iraq\", \"Turkmenistan\", \"Republic of the Congo\", \n\"Burundi\", \"Haiti\", \"DR Congo\", \"Guinea-Bissau\", \"Libya\", \"North Korea\", \"Venezuela\", \"Afghanistan\", \"Sudan\", \"Yemen\", \"Syria\", \"South Sudan\", \"Somalia\")"
## [2] "c(\"87\", \"88\", \"85\", \"85\", \"85\", \"85\", \"84\", \"82\", \"81\", \"80\", \"76\", \"81\", \"80\", \"77\", \"76\", \"76\", \"75\", \"73\", \"73\", \"73\", \"70\", \"70\", \"72\", \"71\", \"68\", \"67\", \"63\", \"65\", \"58\", \"68\", \"64\", \"62\", \"61\", \"63\", \"61\", \"60\", \"59\", \"58\", \"57\", \"60\", \"59\", \"57\", \"59\", \"58\", \"58\", \"56\", \"57\", \"55\", \"54\", \"56\", \"52\", \"52\", \"49\", \"47\", \"53\", \"52\", \"51\", \"50\", \"49\", \"47\", \"45\", \"48\", \"46\", \"45\", \"45\", \"44\", \"40\", \"47\", \"43\", \"43\", \"46\", \"44\", \"43\", \"42\", \"44\", \"36\", \"35\", \"43\", \"41\", \"41\", \"40\", \"39\", \"41\", \"41\", \n\"38\", \"37\", \"41\", \"41\", \"39\", \"41\", \"35\", \"38\", \"34\", \"33\", \"36\", \"36\", \"34\", \"37\", \"38\", \"37\", \"37\", \"36\", \"35\", \"35\", \"35\", \"37\", \"37\", \"36\", \"35\", \"35\", \"36\", \"35\", \"35\", \"31\", \"31\", \"30\", \"34\", \"33\", \"33\", \"29\", \"32\", \"31\", \"31\", \"32\", \"25\", \"29\", \"29\", \"28\", \"32\", \"31\", \"30\", \"28\", \"29\", \"32\", \"30\", \"29\", \"28\", \"28\", \"28\", \"27\", \"26\", \"27\", \"29\", \"28\", \"27\", \"27\", \"23\", \"26\", \"19\", \"27\", \"23\", \"25\", \"26\", \"25\", \"25\", \"22\", \"24\", \"25\", \"20\", \"19\", \"18\", \"20\", \"19\", \"17\", \"20\", \"19\", \"28\", \"17\", \n\"14\", \"18\", \"16\", \"16\", \"14\", \"13\", \"13\", \"10\")"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [3] "c(\"90\", \"90\", \"90\", \"88\", \"86\", \"87\", \"85\", \"84\", \"80\", \"79\", \"82\", \"84\", \"74\", \"85\", \"69\", \"77\", \"75\", \"69\", \"64\", \"74\", \"72\", \"68\", \"71\", \"73\", \"63\", \"72\", \"61\", \"71\", \"65\", \"76\", \"63\", \"68\", \"65\", \"55\", \"60\", \"61\", \"54\", \"62\", \"56\", \"58\", \"66\", \"60\", \"49\", \"52\", \"49\", \"54\", \"58\", \"71\", \"57\", \"53\", NA, \"42\", \"44\", \"49\", \"48\", \"47\", \"57\", \"46\", \"48\", \"48\", \"36\", \"46\", \"42\", \"41\", \"36\", \"31\", \"35\", \"44\", \"43\", \"37\", \"55\", \"38\", \"41\", \"41\", NA, \"51\", \"34\", \"37\", \"36\", \"45\", \"36\", \"39\", \"38\", \"44\", \n\"32\", \"28\", \"45\", \"39\", \"39\", \"49\", \"33\", \"40\", \"32\", \"31\", \"36\", \"35\", \"33\", \"34\", \"42\", \"38\", \"34\", \"37\", \"38\", \"32\", \"29\", \"36\", \"43\", \"33\", \"43\", \"34\", \"34\", \"37\", \"38\", \"28\", \"27\", \"31\", \"33\", \"27\", \"36\", \"34\", \"37\", \"35\", \"36\", \"26\", \"27\", \"24\", \"15\", \"34\", \"34\", NA, \"30\", \"24\", \"21\", \"41\", \"32\", \"25\", \"28\", \"30\", \"25\", \"31\", \"29\", \"27\", \"28\", \"28\", \"33\", \"27\", \"31\", \"26\", \"22\", \"28\", \"17\", \"22\", \"26\", \"26\", \"32\", \"20\", \"25\", \"29\", \"22\", \"19\", \"18\", \"17\", \"26\", \"19\", \"19\", \"21\", \"25\", \"21\", \n\"08\", \"19\", \"08\", \"13\", \"23\", \"26\", NA, \"08\")"

corrupcion[,-c(1)]=lapply(corrupcion[,-c(1)], as.numeric) # a numerico

sum(is.na(corrupcion)) # No hay valores perdidos

## [1] 4

#3)
#Esa wikipage tiene una tabla donde est? el "score" de democracia, as? como 
#otros indicadores. 

Link3="https://en.wikipedia.org/wiki/Democracy_Index"
linkpath3='//*[@id="mw-content-text"]/div/table[2]'
demo=htmltab(doc = Link3, 
              which = linkpath3, #herramientas de desarrollador
              encoding = "UTF-8")

library(stringr)
library(magrittr)
names(demo)=str_split(names(demo),">>",simplify = T)[,1]%>%gsub('\\s','',.)
demo[,c(1:11)]=lapply(demo[,c(1:11)], trimws,whitespace = "[\\h\\v]") # no blanks

demo$Rank=NULL

demo[,c(2,3,4,5,6,7)]=lapply(demo[,c(2,3,4,5,6,7)], as.numeric) # a numerico

str(demo)

## 'data.frame':    167 obs. of  10 variables:
##  $ Country                     : chr  "Norway" "Iceland" "Sweden" "New Zealand" ...
##  $ Score                       : num  9.87 9.58 9.39 9.26 9.25 9.24 9.22 9.22 9.09 9.03 ...
##  $ Electoralprocessandpluralism: num  10 10 9.58 10 10 10 10 9.58 10 9.58 ...
##  $ Functioningofgovernment   : num  9.64 9.29 9.64 9.29 8.93 7.86 9.29 9.64 8.93 9.29 ...
##  $ Politicalparticipation     : num  10 8.89 8.33 8.89 8.89 8.33 8.33 7.78 7.78 7.78 ...
##  $ Politicalculture            : num  10 10 10 8.13 8.75 10 9.38 9.38 8.75 9.38 ...
##  $ Civilliberties              : num  9.71 9.71 9.41 10 9.71 10 9.12 9.71 10 9.12 ...
##  $ Regimetype                  : chr  "Full democracy" "Full democracy" "Full democracy" "Full democracy" ...
##  $ Region                      : chr  "Western Europe" "Western Europe" "Western Europe" "Asia & Australasia" ...
##  $ Changesfromlastyear         : chr  "Score: Rank:" "Score: Rank:" "Score: Rank:" "Score: Rank:" ...

#4.-
#Luego de juntarlas todas, qu?dese al final con las filas 
#sin valores perdidos.

#Juntando primero education expenditure con datacorruption


join<-merge(edu, corrupcion, by.x='Country', by.y='Country')

#Ahora juntando las dos tablas anteriores con data democracy
#datademocracy[,-c(1,8,9,10)]=lapply(datademocracy[,-c(1,8,9,10)], as.numeric) # a numerico

join<-merge(join, demo, by.x='Country', by.y='Country')


#omitiendo los valores perdidos....

sum(is.na(join))

## [1] 0

#[1] 13 hay 13 valores perdidos

join<-na.omit(join)

sum(is.na(join))

## [1] 0

#[1] 0 ya no hay valores perdidos

str(join)

## 'data.frame':    113 obs. of  14 variables:
##  $ Country                     : chr  "Afghanistan" "Albania" "Argentina" "Armenia" ...
##  $ Expenditure                 : num  4.1 4 5.5 2.7 5.3 5.5 2.5 2.3 4.8 6.5 ...
##  $ Year                        : num  2017 2016 2017 2017 2016 ...
##  $ Corru2018                   : num  16 36 40 35 77 76 25 36 44 75 ...
##  $ Corru2012                   : num  8 33 35 34 85 69 27 51 31 75 ...
##  $ Score                       : num  2.85 5.89 7.02 5.54 9.09 8.29 2.75 2.55 2.48 7.64 ...
##  $ Electoralprocessandpluralism: num  3.42 7 9.17 7.5 10 9.58 0.5 0.83 0.92 9.58 ...
##  $ Functioningofgovernment   : num  0.64 5.36 5.36 5.36 8.93 7.86 3.21 2.71 2 8.21 ...
##  $ Politicalparticipation     : num  3.89 4.44 6.11 6.11 7.78 8.33 2.78 2.78 2.78 5 ...
##  $ Politicalculture            : num  2.5 5 6.25 3.13 8.75 6.88 3.75 4.38 4.38 6.88 ...
##  $ Civilliberties              : num  3.82 7.65 8.24 5.59 10 8.82 3.53 2.06 2.35 8.53 ...
##  $ Regimetype                  : chr  "Authoritarian" "Hybrid regime" "Flawed democracy" "Hybrid regime" ...
##  $ Region                      : chr  "Asia & Australasia" "Eastern Europe" "Latin America" "Eastern Europe" ...
##  $ Changesfromlastyear         : chr  "Score:  0.12Rank:  2" "Score:  0.09Rank:  3" "Score: Rank:  1" "Score:  0.75Rank:  17" ...

#5.-ALGUNOS GRAFICOS Y COEFCIENTES PREVIOS

join$Regimetype<-as.factor(join$Regimetype)
join$Region<-as.factor(join$Region)

#antes de hacer la regresion veamos correlaciones graficamente

library(ggcorrplot)

## Loading required package: ggplot2

data=join[,-c(1,3,12,13,14)]

#Observamos que el gasto en educacion se correlaciona #positivamente con Indice de Corrupcion del 12 y con el #score de democracia.

library(ggplot2)

ggplot(join,aes (x = Score,
                     y = Expenditure, 
                     colour = as.factor(Regimetype) ))+ geom_point ()

ggplot(join,aes (x = Score,
                 y = Expenditure, 
                 colour = as.factor(Regimetype) ))+ geom_boxplot ()

#Podemos comparar que conforme el score de democracia sube se
#gasta m?s en salud (gasto expresado como % del GDP Gross
#Domestic Product o Producto Bruto Interno).

cor(join$Score,data$Expenditure,method = "pearson")

## [1] 0.470936

#Correlacion 0.4279 #en realidad es 0.470936


ggplot(join,aes (x = Corru2012,
                 y = Expenditure, 
                 colour = as.factor(Regimetype) ))+ geom_point ()

ggplot(join,aes (x = Corru2012,
                 y = Expenditure, 
                 colour = as.factor(Regimetype) ))+ geom_boxplot ()

#Se puede decir que a mayor Score de Corrupcion en el 2012
#mayor gasto en salud.

cor(join$Corru2012,join$Expenditure,method = "pearson")

## [1] 0.4228772

#Correlaci?n 0.3514  #en realidad es 0.4228772


#Veamos si a m?s democracia m?s corrupcion
#respecto al indice del 2012

ggplot(join,aes (x = Corru2012,
                 y = Score, 
                 colour = as.factor(Regimetype) ))+ geom_point ()

#Seg?n vemos en los gr?ficos existe una correlaci?n positiva

cor(join$Corru2012,join$Score,method = "pearson")

## [1] 0.8008988

#CORRELACION DE 0.7306, regresionar con estas dos    #0.8008988 en realidad
#variables como predcitoras es no deseable pues habria
#multicolinealidad.

#5.-REGRESION LINEAL
#Veamos el modelo de regresion solo con las variables con que
#Se efectua la afirmacion

modelo1=formula(Expenditure ~ Score + Corru2012)
reg1=lm(modelo1,data=join)
stargazer::stargazer(reg1,type = "text",intercept.bottom = FALSE)

## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                             Expenditure        
## -----------------------------------------------
## Constant                     2.634***          
##                               (0.347)          
##                                                
## Score                        0.245***          
##                               (0.093)          
##                                                
## Corru2012                      0.009           
##                               (0.010)          
##                                                
## -----------------------------------------------
## Observations                    113            
## R2                             0.228           
## Adjusted R2                    0.214           
## Residual Std. Error      1.307 (df = 110)      
## F Statistic           16.207*** (df = 2; 110)  
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

summary(reg1)

## 
## Call:
## lm(formula = modelo1, data = join)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9963 -1.0203 -0.2457  0.6963  3.1718 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.634131   0.346915   7.593 1.09e-11 ***
## Score       0.245243   0.093045   2.636  0.00961 ** 
## Corru2012   0.009033   0.009917   0.911  0.36435    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.307 on 110 degrees of freedom
## Multiple R-squared:  0.2276, Adjusted R-squared:  0.2136 
## F-statistic: 16.21 on 2 and 110 DF,  p-value: 6.778e-07

#names(join)
#set.seed(2020)
#regresion<- lm(Expenditure~Score + Corru2012,data= join)

#summary(regresion)


#El modelo explica tan solo el 41.69% de la variacion del gasto en 
#Salud con estas dos variables, es una regresion poco sugerida 
#en realidad explica 21.36%

plot(reg1)

#La Afirmacion es : ##AL HACER UNA REGRESION, alguien llega a la conclusi?n que, si un pais aumenta #en 1 punto su nivel de democracia, la probabilidad de que el gasto en educacion #aumente crece en 7%, controlando por el nivel de corrupcion (al 2012). #UD indicar?a:

#Lo que podemos indicar es que ante un cambio unitario (1) del #nivel de democracia el gasto en educaci?n aumentara en promedio #en 0.50574 manteniendo el coeficiente de corrupci?n 2012 constante.

#en realidad es 0.49096

#Si entendemos el aumento de un punto como un punto b?sico(0.01), #entonces el gasto en salud EN PROMEDIO aumentaria en 5.05%. #en realidad seria 4.9% #Sin embargo como vimos el R ajustado, y los gr?ficos, esta regresi?n #no es recomendada.

modelo2=formula(Corru2018 ~ Score + Expenditure)
reg2=lm(modelo2,data=join)
#stargazer(reg2,type = "text",intercept.bottom = FALSE)

#se puede afirmar que por cada unidad que aumente la democracia la corrupción aumentará en 6.996 manteniendose constante el gasto en educacion

summary(reg2)

## 
## Call:
## lm(formula = modelo2, data = join)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.702 -10.017   0.093   7.900  39.840 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.06813    3.91016   0.017    0.986    
## Score        6.99611    0.57668  12.132   <2e-16 ***
## Expenditure  1.02595    0.86734   1.183    0.239    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.93 on 110 degrees of freedom
## Multiple R-squared:  0.6544, Adjusted R-squared:  0.6481 
## F-statistic: 104.1 on 2 and 110 DF,  p-value: < 2.2e-16

library(lmtest)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

bptest(reg2)

## 
##  studentized Breusch-Pagan test
## 
## data:  reg2
## BP = 0.14077, df = 2, p-value = 0.932