#SCRIPT GABRIELA VIDAL
library(XML)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(magrittr)
#Junte 3 tablas de datos. Las tablas estan en estos links
#(en ningun caso necesita los rankings, ni cambie los nombres de
#pa?ses):
#1)
#Data de gastos en educacion (la data aparece como 'Education Expenditure' -
#la data se muestra como % del GDP). Quedese s?lo con los datos de antes del
#2018 pero luego del 2010.
library(htmltab)
link1="https://www.cia.gov/library/publications/resources/the-world-factbook/fields/369rank.html"
linkpath1='//*[@id="rankOrder"]'
edu=htmltab(doc = link1,
which = linkpath1, #herramientas de desarrollador
encoding = "UTF-8")
## No encoding supplied: defaulting to UTF-8.
edu$Rank=NULL
names(edu)<-c("Country","Expenditure","Year")
str(edu)
## 'data.frame': 175 obs. of 3 variables:
## $ Country : chr "Cuba" "Micronesia, Federated States of" "Solomon Islands" "Botswana" ...
## $ Expenditure: chr "12.80" "12.50" "9.90" "9.60" ...
## $ Year : chr "2010" "2015" "2010" "2009" ...
edu[,]=lapply(edu[,], trimws,whitespace = "[\\h\\v]") # no blanks
#filtrando los a?os pedidos
edu<-edu %>% filter(Year>2010&Year<2018)
edu[,-c(1)]=lapply(edu[,-c(1)], as.numeric) # a numerico
#2)
#tabla donde est? el "score" de percepcion de corrupcion, que da un menor
#puntaje donde habria mayor corrupcion,la data se muestra para varios a?os.
#Qu?dese con la data del 2018 y 2012. Elimine cualquier pa?s con valor algun
#valor perdido en esos a?os.
link="https://en.wikipedia.org/wiki/Corruption_Perceptions_Index"
linkpath='//*[@id="mw-content-text"]/div/table[3]'
corru=htmltab(doc = link,
which = linkpath, #herramientas de desarrollador
encoding = "UTF-8")
## Warning: Header dimension doesn't match body dimension
## Warning: Columns [NA] seem to have no data and are removed. Use rm_nodata_cols =
## F to suppress this behavior
names(corru)=str_split(names(corru),">>",simplify = T)[,1]%>%gsub('\\s','',.)
corrupcion=corru[c(2,5,17)]
names(corrupcion)<-c("Country","Corru2018","Corru2012")
corrupcion[,]=lapply(corrupcion[,], trimws,whitespace = "[\\h\\v]") # no blanks
trimws(corrupcion,whitespace = "[\\h\\v]")
## [1] "c(\"New Zealand\", \"Denmark\", \"Finland\", \"Sweden\", \"Switzerland\", \"Singapore\", \"Norway\", \"Netherlands\", \"Luxembourg\", \"Germany\", \"Iceland\", \"Canada\", \"United Kingdom\", \"Australia\", \"Austria\", \"Hong Kong\", \"Belgium\", \"Ireland\", \"Estonia\", \"Japan\", \"Uruguay\", \"United Arab Emirates\", \"France\", \"United States\", \"Bhutan\", \"Chile\", \"Taiwan\", \"Bahamas\", \"Spain\", \"Barbados\", \"Portugal\", \"Qatar\", \"Botswana\", \"Brunei\", \"Israel\", \"Slovenia\", \"Lithuania\", \"Saint Vincent and the Grenadines\", \"South Korea\", \"Poland\", \n\"Cyprus\", \"Cape Verde\", \"Czech Republic\", \"Georgia\", \"Latvia\", \"Costa Rica\", \"Dominica\", \"Saint Lucia\", \"Malta\", \"Rwanda\", \"Grenada\", \"Italy\", \"Saudi Arabia\", \"Malaysia\", \"Namibia\", \"Oman\", \"Mauritius\", \"Slovakia\", \"Jordan\", \"Cuba\", \"Greece\", \"Croatia\", \"São Tomé and Príncipe\", \"Montenegro\", \"Senegal\", \"Belarus\", \"Argentina\", \"Romania\", \"South Africa\", \"Suriname\", \"Hungary\", \"Jamaica\", \"Tunisia\", \"Bulgaria\", \"Solomon Islands\", \"Bahrain\", \"Armenia\", \"Morocco\", \"India\", \"Ghana\", \"Benin\", \"China\", \n\"Burkina Faso\", \"Kuwait\", \"Indonesia\", \"Guyana\", \"Lesotho\", \"Trinidad and Tobago\", \"Serbia\", \"Turkey\", \"East Timor\", \"Sri Lanka\", \"Ecuador\", \"Vietnam\", \"Colombia\", \"Tanzania\", \"Ethiopia\", \"Gambia\", \"Bosnia and Herzegovina\", \"Panama\", \"Kosovo\", \"Thailand\", \"Peru\", \"Egypt\", \"Ivory Coast\", \"Mongolia\", \"North Macedonia\", \"Albania\", \"Brazil\", \"Algeria\", \"Philippines\", \"Zambia\", \"El Salvador\", \"Kazakhstan\", \"Nepal\", \"Sierra Leone\", \"Niger\", \"Pakistan\", \"Moldova\", \"Bolivia\", \"Malawi\", \"Gabon\", \"Djibouti\", \n\"Ukraine\", \"Azerbaijan\", \"Kyrgyzstan\", \"Myanmar\", \"Mexico\", \"Mali\", \"Maldives\", \"Togo\", \"Guinea\", \"Laos\", \"Liberia\", \"Dominican Republic\", \"Paraguay\", \"Russia\", \"Lebanon\", \"Papua New Guinea\", \"Mauritania\", \"Uganda\", \"Kenya\", \"Honduras\", \"Iran\", \"Guatemala\", \"Nigeria\", \"Mozambique\", \"Bangladesh\", \"Angola\", \"Comoros\", \"Uzbekistan\", \"Tajikistan\", \"Central African Republic\", \"Cameroon\", \"Madagascar\", \"Zimbabwe\", \"Eritrea\", \"Nicaragua\", \"Cambodia\", \"Chad\", \"Iraq\", \"Turkmenistan\", \"Republic of the Congo\", \n\"Burundi\", \"Haiti\", \"DR Congo\", \"Guinea-Bissau\", \"Libya\", \"North Korea\", \"Venezuela\", \"Afghanistan\", \"Sudan\", \"Yemen\", \"Syria\", \"South Sudan\", \"Somalia\")"
## [2] "c(\"87\", \"88\", \"85\", \"85\", \"85\", \"85\", \"84\", \"82\", \"81\", \"80\", \"76\", \"81\", \"80\", \"77\", \"76\", \"76\", \"75\", \"73\", \"73\", \"73\", \"70\", \"70\", \"72\", \"71\", \"68\", \"67\", \"63\", \"65\", \"58\", \"68\", \"64\", \"62\", \"61\", \"63\", \"61\", \"60\", \"59\", \"58\", \"57\", \"60\", \"59\", \"57\", \"59\", \"58\", \"58\", \"56\", \"57\", \"55\", \"54\", \"56\", \"52\", \"52\", \"49\", \"47\", \"53\", \"52\", \"51\", \"50\", \"49\", \"47\", \"45\", \"48\", \"46\", \"45\", \"45\", \"44\", \"40\", \"47\", \"43\", \"43\", \"46\", \"44\", \"43\", \"42\", \"44\", \"36\", \"35\", \"43\", \"41\", \"41\", \"40\", \"39\", \"41\", \"41\", \n\"38\", \"37\", \"41\", \"41\", \"39\", \"41\", \"35\", \"38\", \"34\", \"33\", \"36\", \"36\", \"34\", \"37\", \"38\", \"37\", \"37\", \"36\", \"35\", \"35\", \"35\", \"37\", \"37\", \"36\", \"35\", \"35\", \"36\", \"35\", \"35\", \"31\", \"31\", \"30\", \"34\", \"33\", \"33\", \"29\", \"32\", \"31\", \"31\", \"32\", \"25\", \"29\", \"29\", \"28\", \"32\", \"31\", \"30\", \"28\", \"29\", \"32\", \"30\", \"29\", \"28\", \"28\", \"28\", \"27\", \"26\", \"27\", \"29\", \"28\", \"27\", \"27\", \"23\", \"26\", \"19\", \"27\", \"23\", \"25\", \"26\", \"25\", \"25\", \"22\", \"24\", \"25\", \"20\", \"19\", \"18\", \"20\", \"19\", \"17\", \"20\", \"19\", \"28\", \"17\", \n\"14\", \"18\", \"16\", \"16\", \"14\", \"13\", \"13\", \"10\")"
## [3] "c(\"90\", \"90\", \"90\", \"88\", \"86\", \"87\", \"85\", \"84\", \"80\", \"79\", \"82\", \"84\", \"74\", \"85\", \"69\", \"77\", \"75\", \"69\", \"64\", \"74\", \"72\", \"68\", \"71\", \"73\", \"63\", \"72\", \"61\", \"71\", \"65\", \"76\", \"63\", \"68\", \"65\", \"55\", \"60\", \"61\", \"54\", \"62\", \"56\", \"58\", \"66\", \"60\", \"49\", \"52\", \"49\", \"54\", \"58\", \"71\", \"57\", \"53\", NA, \"42\", \"44\", \"49\", \"48\", \"47\", \"57\", \"46\", \"48\", \"48\", \"36\", \"46\", \"42\", \"41\", \"36\", \"31\", \"35\", \"44\", \"43\", \"37\", \"55\", \"38\", \"41\", \"41\", NA, \"51\", \"34\", \"37\", \"36\", \"45\", \"36\", \"39\", \"38\", \"44\", \n\"32\", \"28\", \"45\", \"39\", \"39\", \"49\", \"33\", \"40\", \"32\", \"31\", \"36\", \"35\", \"33\", \"34\", \"42\", \"38\", \"34\", \"37\", \"38\", \"32\", \"29\", \"36\", \"43\", \"33\", \"43\", \"34\", \"34\", \"37\", \"38\", \"28\", \"27\", \"31\", \"33\", \"27\", \"36\", \"34\", \"37\", \"35\", \"36\", \"26\", \"27\", \"24\", \"15\", \"34\", \"34\", NA, \"30\", \"24\", \"21\", \"41\", \"32\", \"25\", \"28\", \"30\", \"25\", \"31\", \"29\", \"27\", \"28\", \"28\", \"33\", \"27\", \"31\", \"26\", \"22\", \"28\", \"17\", \"22\", \"26\", \"26\", \"32\", \"20\", \"25\", \"29\", \"22\", \"19\", \"18\", \"17\", \"26\", \"19\", \"19\", \"21\", \"25\", \"21\", \n\"08\", \"19\", \"08\", \"13\", \"23\", \"26\", NA, \"08\")"
corrupcion[,-c(1)]=lapply(corrupcion[,-c(1)], as.numeric) # a numerico
sum(is.na(corrupcion)) # No hay valores perdidos
## [1] 4
#3)
#Esa wikipage tiene una tabla donde est? el "score" de democracia, as? como
#otros indicadores.
Link3="https://en.wikipedia.org/wiki/Democracy_Index"
linkpath3='//*[@id="mw-content-text"]/div/table[2]'
demo=htmltab(doc = Link3,
which = linkpath3, #herramientas de desarrollador
encoding = "UTF-8")
library(stringr)
library(magrittr)
names(demo)=str_split(names(demo),">>",simplify = T)[,1]%>%gsub('\\s','',.)
demo[,c(1:11)]=lapply(demo[,c(1:11)], trimws,whitespace = "[\\h\\v]") # no blanks
demo$Rank=NULL
demo[,c(2,3,4,5,6,7)]=lapply(demo[,c(2,3,4,5,6,7)], as.numeric) # a numerico
str(demo)
## 'data.frame': 167 obs. of 10 variables:
## $ Country : chr "Norway" "Iceland" "Sweden" "New Zealand" ...
## $ Score : num 9.87 9.58 9.39 9.26 9.25 9.24 9.22 9.22 9.09 9.03 ...
## $ Electoralprocessandpluralism: num 10 10 9.58 10 10 10 10 9.58 10 9.58 ...
## $ Functioningofgovernment : num 9.64 9.29 9.64 9.29 8.93 7.86 9.29 9.64 8.93 9.29 ...
## $ Politicalparticipation : num 10 8.89 8.33 8.89 8.89 8.33 8.33 7.78 7.78 7.78 ...
## $ Politicalculture : num 10 10 10 8.13 8.75 10 9.38 9.38 8.75 9.38 ...
## $ Civilliberties : num 9.71 9.71 9.41 10 9.71 10 9.12 9.71 10 9.12 ...
## $ Regimetype : chr "Full democracy" "Full democracy" "Full democracy" "Full democracy" ...
## $ Region : chr "Western Europe" "Western Europe" "Western Europe" "Asia & Australasia" ...
## $ Changesfromlastyear : chr "Score: Rank:" "Score: Rank:" "Score: Rank:" "Score: Rank:" ...
#4.-
#Luego de juntarlas todas, qu?dese al final con las filas
#sin valores perdidos.
#Juntando primero education expenditure con datacorruption
join<-merge(edu, corrupcion, by.x='Country', by.y='Country')
#Ahora juntando las dos tablas anteriores con data democracy
#datademocracy[,-c(1,8,9,10)]=lapply(datademocracy[,-c(1,8,9,10)], as.numeric) # a numerico
join<-merge(join, demo, by.x='Country', by.y='Country')
#omitiendo los valores perdidos....
sum(is.na(join))
## [1] 0
#[1] 13 hay 13 valores perdidos
join<-na.omit(join)
sum(is.na(join))
## [1] 0
#[1] 0 ya no hay valores perdidos
str(join)
## 'data.frame': 113 obs. of 14 variables:
## $ Country : chr "Afghanistan" "Albania" "Argentina" "Armenia" ...
## $ Expenditure : num 4.1 4 5.5 2.7 5.3 5.5 2.5 2.3 4.8 6.5 ...
## $ Year : num 2017 2016 2017 2017 2016 ...
## $ Corru2018 : num 16 36 40 35 77 76 25 36 44 75 ...
## $ Corru2012 : num 8 33 35 34 85 69 27 51 31 75 ...
## $ Score : num 2.85 5.89 7.02 5.54 9.09 8.29 2.75 2.55 2.48 7.64 ...
## $ Electoralprocessandpluralism: num 3.42 7 9.17 7.5 10 9.58 0.5 0.83 0.92 9.58 ...
## $ Functioningofgovernment : num 0.64 5.36 5.36 5.36 8.93 7.86 3.21 2.71 2 8.21 ...
## $ Politicalparticipation : num 3.89 4.44 6.11 6.11 7.78 8.33 2.78 2.78 2.78 5 ...
## $ Politicalculture : num 2.5 5 6.25 3.13 8.75 6.88 3.75 4.38 4.38 6.88 ...
## $ Civilliberties : num 3.82 7.65 8.24 5.59 10 8.82 3.53 2.06 2.35 8.53 ...
## $ Regimetype : chr "Authoritarian" "Hybrid regime" "Flawed democracy" "Hybrid regime" ...
## $ Region : chr "Asia & Australasia" "Eastern Europe" "Latin America" "Eastern Europe" ...
## $ Changesfromlastyear : chr "Score: 0.12Rank: 2" "Score: 0.09Rank: 3" "Score: Rank: 1" "Score: 0.75Rank: 17" ...
#5.-ALGUNOS GRAFICOS Y COEFCIENTES PREVIOS
join$Regimetype<-as.factor(join$Regimetype)
join$Region<-as.factor(join$Region)
#antes de hacer la regresion veamos correlaciones graficamente
library(ggcorrplot)
## Loading required package: ggplot2
data=join[,-c(1,3,12,13,14)]
#Observamos que el gasto en educacion se correlaciona #positivamente con Indice de Corrupcion del 12 y con el #score de democracia.
library(ggplot2)
ggplot(join,aes (x = Score,
y = Expenditure,
colour = as.factor(Regimetype) ))+ geom_point ()
ggplot(join,aes (x = Score,
y = Expenditure,
colour = as.factor(Regimetype) ))+ geom_boxplot ()
#Podemos comparar que conforme el score de democracia sube se
#gasta m?s en salud (gasto expresado como % del GDP Gross
#Domestic Product o Producto Bruto Interno).
cor(join$Score,data$Expenditure,method = "pearson")
## [1] 0.470936
#Correlacion 0.4279 #en realidad es 0.470936
ggplot(join,aes (x = Corru2012,
y = Expenditure,
colour = as.factor(Regimetype) ))+ geom_point ()
ggplot(join,aes (x = Corru2012,
y = Expenditure,
colour = as.factor(Regimetype) ))+ geom_boxplot ()
#Se puede decir que a mayor Score de Corrupcion en el 2012
#mayor gasto en salud.
cor(join$Corru2012,join$Expenditure,method = "pearson")
## [1] 0.4228772
#Correlaci?n 0.3514 #en realidad es 0.4228772
#Veamos si a m?s democracia m?s corrupcion
#respecto al indice del 2012
ggplot(join,aes (x = Corru2012,
y = Score,
colour = as.factor(Regimetype) ))+ geom_point ()
#Seg?n vemos en los gr?ficos existe una correlaci?n positiva
cor(join$Corru2012,join$Score,method = "pearson")
## [1] 0.8008988
#CORRELACION DE 0.7306, regresionar con estas dos #0.8008988 en realidad
#variables como predcitoras es no deseable pues habria
#multicolinealidad.
#5.-REGRESION LINEAL
#Veamos el modelo de regresion solo con las variables con que
#Se efectua la afirmacion
modelo1=formula(Expenditure ~ Score + Corru2012)
reg1=lm(modelo1,data=join)
stargazer::stargazer(reg1,type = "text",intercept.bottom = FALSE)
##
## ===============================================
## Dependent variable:
## ---------------------------
## Expenditure
## -----------------------------------------------
## Constant 2.634***
## (0.347)
##
## Score 0.245***
## (0.093)
##
## Corru2012 0.009
## (0.010)
##
## -----------------------------------------------
## Observations 113
## R2 0.228
## Adjusted R2 0.214
## Residual Std. Error 1.307 (df = 110)
## F Statistic 16.207*** (df = 2; 110)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
summary(reg1)
##
## Call:
## lm(formula = modelo1, data = join)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9963 -1.0203 -0.2457 0.6963 3.1718
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.634131 0.346915 7.593 1.09e-11 ***
## Score 0.245243 0.093045 2.636 0.00961 **
## Corru2012 0.009033 0.009917 0.911 0.36435
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.307 on 110 degrees of freedom
## Multiple R-squared: 0.2276, Adjusted R-squared: 0.2136
## F-statistic: 16.21 on 2 and 110 DF, p-value: 6.778e-07
#names(join)
#set.seed(2020)
#regresion<- lm(Expenditure~Score + Corru2012,data= join)
#summary(regresion)
#El modelo explica tan solo el 41.69% de la variacion del gasto en
#Salud con estas dos variables, es una regresion poco sugerida
#en realidad explica 21.36%
plot(reg1)
#La Afirmacion es : ##AL HACER UNA REGRESION, alguien llega a la conclusi?n que, si un pais aumenta #en 1 punto su nivel de democracia, la probabilidad de que el gasto en educacion #aumente crece en 7%, controlando por el nivel de corrupcion (al 2012). #UD indicar?a:
#Lo que podemos indicar es que ante un cambio unitario (1) del #nivel de democracia el gasto en educaci?n aumentara en promedio #en 0.50574 manteniendo el coeficiente de corrupci?n 2012 constante.
#en realidad es 0.49096
#Si entendemos el aumento de un punto como un punto b?sico(0.01), #entonces el gasto en salud EN PROMEDIO aumentaria en 5.05%. #en realidad seria 4.9% #Sin embargo como vimos el R ajustado, y los gr?ficos, esta regresi?n #no es recomendada.
modelo2=formula(Corru2018 ~ Score + Expenditure)
reg2=lm(modelo2,data=join)
#stargazer(reg2,type = "text",intercept.bottom = FALSE)
#se puede afirmar que por cada unidad que aumente la democracia la corrupción aumentará en 6.996 manteniendose constante el gasto en educacion
summary(reg2)
##
## Call:
## lm(formula = modelo2, data = join)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.702 -10.017 0.093 7.900 39.840
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.06813 3.91016 0.017 0.986
## Score 6.99611 0.57668 12.132 <2e-16 ***
## Expenditure 1.02595 0.86734 1.183 0.239
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.93 on 110 degrees of freedom
## Multiple R-squared: 0.6544, Adjusted R-squared: 0.6481
## F-statistic: 104.1 on 2 and 110 DF, p-value: < 2.2e-16
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(reg2)
##
## studentized Breusch-Pagan test
##
## data: reg2
## BP = 0.14077, df = 2, p-value = 0.932