# Global knitr chunk options: echo the code, suppress messages and warnings,
# and let figures use the full output width and height.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable names.
knitr::opts_chunk$set(
  echo = TRUE, message = FALSE, warning = FALSE,
  out.width = "100%", out.height = "100%"
)
library(wbstats) #to load data from World Bank
library(DT) #to transform the data
library(ggplot2) #to visualize the data
library(dplyr) #to transform the data
library(readr) #to read csv
library(corrplot) #to build correlation plot
library(countrycode) #to match country name with country code
library(ggthemes) #theme for graphics
library(visdat) #to visualize NA
library(ggrepel) #to visualize country names
library(plotly) #to create interactive graphs
library(olsrr) # to check linear regression assumptions
library(paletteer) #color palette
library(stargazer)
#set working directory
setwd("C:/upwork/Monica/milestone4")
The figure below depicts the correlation matrix. Positive correlation coefficients are highlighted in blue, and negative correlation coefficients are shown in the red color palette. The corruption perception index has a relatively strong negative correlation with freedom, rule of law, control of corruption, government effectiveness and regulatory quality. With the other variables, the corruption perception index correlates weakly. Rule of law, control of corruption, government effectiveness and regulatory quality correlate strongly with each other. It is necessary to pay attention to this, since it can cause multicollinearity in the predictive model. We use this information to build a regression model to predict the corruption perception index. However, correlation does not imply causation.
# Pairwise correlation coefficients of the columns of num_var_corup
M <- cor(x = num_var_corup)
# Draw the matrix, printing each coefficient as a number
corrplot(M, method = 'number', number.cex=0.75)
The graph below shows the 20 countries with the highest corruption perception index in 2016. Moldova is in first place, Bosnia and Herzegovina is in second place, and Romania is in third. The top 20 consists mostly of developed countries.
# Keep the 20 rows with the highest corruption perception index.
top <- combined2016_raw_hdi %>%
  ungroup() %>%
  arrange(desc(corruption)) %>%
  slice(1:20)
# Top 20 countries with the highest corruption perception index:
# horizontal bars ordered by index value, filled by economic status.
# The title is written entirely in Latin letters.
ggplot(top, aes(corruption, reorder(country, corruption), fill = economic_status)) +
  geom_col() +
  labs(
    x = "Corruption perception index", y = "Country",
    title = "Top 20 countries with the highest corruption\nperception index (2016)",
    fill = "Economic status"
  ) +
theme_few()
The 20 countries with the lowest corruption perception index in 2016 are predominantly developed countries, with exceptions such as Rwanda and Somalia. Singapore is in first place, Rwanda is in second, and Denmark is in third.
# Keep the 20 rows with the lowest corruption perception index.
bottom <- combined2016_raw_hdi %>%
  ungroup() %>%
  arrange(corruption) %>%
  slice(1:20)
# Horizontal bars ordered by descending index value (lowest value on top),
# filled by economic status. The title is written entirely in Latin letters.
ggplot(bottom, aes(corruption, reorder(country, -corruption), fill = economic_status)) +
  geom_col() +
  labs(
    x = "Corruption perception index", y = "Country",
    title = "Top 20 countries with the lowest corruption\nperception index (2016)",
    fill = "Economic status"
  ) +
theme_few()This graph is interactive. You can put your cursor on the graph to find out more information.
# add corruption rank column
# order(order(x)) gives each row its position in the ascending sort of
# `corruption` (1 = lowest value; ties are broken by row order).
combined2016_raw_hdi <- combined2016_raw_hdi %>%
  ungroup() %>%
  mutate(ranks_corrupt = order(order(corruption)))
# The 10 countries with the lowest happiness score.
score_bottom <- combined2016_raw_hdi %>%
  arrange(score) %>%
  slice(1:10)
# Bar chart of those countries. The `text` aesthetic builds the hover tooltip;
# it uses bare column names (not `score_bottom$...`) so values are always taken
# from the rows being drawn, as ggplot2 recommends.
score_bottom_plot <- ggplot(
  score_bottom,
  aes(score, reorder(country, -score),
    fill = economic_status,
    text = paste(
      "Happiness score:", score, "<br>",
      "Corruption perception index:", corruption, "<br>",
      "Corruption perception index rank:", ranks_corrupt, "<br>",
      "Country:", country, "<br>",
      "Sub-region:", regional_indicator
    )
  )
) +
  geom_col() +
  labs(
    x = "Happiness index score", y = "Country",
    title = "Top 10 countries with the lowest Happiness index score (2016)",
    fill = "Economic status"
  ) +
  theme_few()
# Convert to an interactive plotly widget that shows only the custom tooltip.
score_bottom_plot <- ggplotly(score_bottom_plot, tooltip = "text")
score_bottom_plotThis graph is interactive. You can put your cursor on the graph to find out more information.
# The 10 countries with the highest happiness score.
score_top <- combined2016_raw_hdi %>%
  arrange(desc(score)) %>%
  slice(1:10)
# Bar chart of those countries. The `text` aesthetic builds the hover tooltip;
# it uses bare column names (not `score_top$...`) so values are always taken
# from the rows being drawn, as ggplot2 recommends.
# NOTE(review): `ranks_corrupt` must already exist on combined2016_raw_hdi.
score_top_plot <- ggplot(
  score_top,
  aes(score, reorder(country, score),
    fill = economic_status,
    text = paste(
      "Happiness score:", score, "<br>",
      "Corruption perception index:", corruption, "<br>",
      "Corruption perception index rank:", ranks_corrupt, "<br>",
      "Country:", country, "<br>",
      "Sub-region:", regional_indicator
    )
  )
) +
  geom_col() +
  labs(
    x = "Happiness index score", y = "Country",
    title = "Top 10 countries with the highest Happiness index score (2016)",
    fill = "Economic status"
  ) +
  theme_few()
# Convert to an interactive plotly widget that shows only the custom tooltip.
score_top_plot <- ggplotly(score_top_plot, tooltip = "text")
score_top_plotIn the graph below we see the histogram of the corruption perception index. The median value of this index is 0.811
# Distribution of the corruption perception index across countries (2016)
ggplot(combined2016_raw_hdi, aes(x = corruption)) +
  geom_histogram(fill = "#B1E693", colour = "#4A403A") +
  labs(
    title = "Histogram: Corruption perception index (2016)",
    x = "Corruption perception index",
    y = "Number of countries"
  ) +
theme_few()
This graph shows how GDP per capita ($) and the corruption perception index are related. The red line is the median value of the corruption perception index. The blue line is the median value of GDP per capita. There is no linear relationship between these parameters. All Western and Northern European countries (except Lithuania and Latvia), Australia and New Zealand have a corruption perception index lower than the median value. Eastern European countries have a lower GDP per capita than Western European countries. Eastern European countries have a corruption perception index higher than the median (except Belarus). Sub-Saharan Africa, Latin America and Southern Europe have a corruption perception index at the median level or above it.
# corruption and GDP
# Interactive scatter plot of the corruption perception index against GDP per
# capita, colored by sub-region. `gdp` is exponentiated before plotting, so it
# is presumably stored on a natural-log scale (verify against the data prep).
# The tooltip text uses bare column names instead of `combined2016_raw_hdi$...`
# so each label is evaluated on the same rows as the point it belongs to.
fig1 <- ggplot(combined2016_raw_hdi) +
  geom_point(
    aes(exp(gdp), corruption,
      col = regional_indicator,
      text = paste(
        "GDP per capita:", paste("$", round(exp(gdp), 2)), "<br>",
        "Corruption perception index:", corruption, "<br>",
        "Country:", country, "<br>",
        "Sub-region:", regional_indicator
      )
    ),
    size = 3
  ) +
  # Reference lines at the overall medians of both variables
  geom_hline(yintercept = median(combined2016_raw_hdi$corruption, na.rm = TRUE), color = "red") +
  geom_vline(xintercept = exp(median(combined2016_raw_hdi$gdp, na.rm = TRUE)), color = "blue") +
  labs(
    x = "GDP per capita", y = "Corruption perception index",
    title = "Corruption perception index and GDP per capita depending on sub-region (2016)",
    colour = "Sub-region"
  ) +
  theme_few() +
  scale_color_paletteer_d("ggthemes::Red_Blue_Brown")
# Convert to plotly, showing only the custom tooltip text
fig1 <- ggplotly(fig1, tooltip = "text")
fig1In the graph below we see the histogram of the freedom. The median value of this index is 0.774
# Distribution of the freedom indicator across countries (2016)
ggplot(combined2016_raw_hdi, aes(x = freedom)) +
  geom_histogram(fill = "#FFC069", colour = "#4A403A") +
  labs(
    title = "Histogram: Freedom (2016)",
    x = "Freedom",
    y = "Number of countries"
  ) +
theme_few()
The graph below depicts the corruption perception index and freedom by sub-region. The red line is the median value of the corruption perception index. The blue line is the median value of freedom. There is no linear relationship between freedom and the corruption perception index. Also, there is no division into clusters according to the regions or sub-regions. Most countries in the graph are located around the median value of the corruption perception index regardless of the freedom value. The developed European and North American countries have higher freedom and a lower corruption perception index than other countries.
# freedom and corruption
# Interactive scatter plot of the corruption perception index against freedom,
# colored by sub-region. The tooltip text uses bare column names instead of
# `combined2016_raw_hdi$...` so each label is evaluated on the same rows as
# the point it belongs to.
fig2 <- ggplot(combined2016_raw_hdi) +
  geom_point(
    aes(freedom, corruption,
      col = regional_indicator,
      text = paste(
        "Freedom:", freedom, "<br>",
        "Corruption perception index:", corruption, "<br>",
        "Country:", country, "<br>",
        "Sub-region:", regional_indicator
      )
    ),
    size = 3
  ) +
  # Reference lines at the overall medians of both variables
  geom_hline(yintercept = median(combined2016_raw_hdi$corruption, na.rm = TRUE), color = "red") +
  geom_vline(xintercept = median(combined2016_raw_hdi$freedom, na.rm = TRUE), color = "blue") +
  labs(
    x = "Freedom", y = "Corruption perception index",
    title = "Corruption perception index and freedom depending on sub-region (2016)",
    colour = "Sub-region"
  ) +
  theme_few() +
  scale_color_paletteer_d("ggthemes::Red_Blue_Brown")
# Convert to plotly, showing only the custom tooltip text
fig2 <- ggplotly(fig2, tooltip = "text")
fig2In the graph below corruption perception index and freedom depending on the economic status is depicted. We can see two clusters: developed countries (light blue color) and developing countries (light pink color). Developed countries have a relatively low negative correlation (-0.504) between freedom and corruption. In developing countries correlation between these parameters is lower (-0.322).
# freedom and corruption (Economic status)
# Interactive scatter plot of the corruption perception index against freedom,
# colored by economic status, with one linear trend line per status group.
# The tooltip text uses bare column names instead of `combined2016_raw_hdi$...`
# so each label is evaluated on the same rows as the point it belongs to.
fig_2_dev <- ggplot(combined2016_raw_hdi) +
  geom_point(
    aes(freedom, corruption,
      col = economic_status,
      text = paste(
        "Freedom:", freedom, "<br>",
        "Corruption perception index:", corruption, "<br>",
        "Country:", country, "<br>",
        "Economic status:", economic_status
      )
    ),
    size = 3
  ) +
  # Reference lines at the overall medians of both variables
  geom_hline(yintercept = median(combined2016_raw_hdi$corruption, na.rm = TRUE), color = "red") +
  geom_vline(xintercept = median(combined2016_raw_hdi$freedom, na.rm = TRUE), color = "blue") +
  labs(
    x = "Freedom", y = "Corruption perception index",
    title = "Corruption perception index and freedom depending on economic status (2016)",
    colour = "Economic status:"
  ) +
  # Least-squares line per economic status, without a confidence band
  geom_smooth(aes(freedom, corruption, col = economic_status), method = "lm", se = FALSE) +
  theme_few() +
  scale_color_paletteer_d("ggthemes::excel_Headlines")
# Convert to plotly, showing only the custom tooltip text
fig_2_dev <- ggplotly(fig_2_dev, tooltip = "text")
fig_2_devHere you can see clearly the relationship between the corruption perception index and freedom depending on the Economic status. Median values of freedom (blue) and corruption perception index (red) are calculated for each group of development.
# Same plot as above, split into one panel per economic status.
# The tooltip text uses bare column names: with faceting, a full-length
# `combined2016_raw_hdi$...` vector is not guaranteed to line up with the rows
# drawn in each panel, so tooltips could be attached to the wrong points.
# NOTE(review): `med_corrupt` and `med_freedom` are not created in this section;
# presumably they are per-group median columns added upstream - confirm.
fig_2_dev_facet <- ggplot(combined2016_raw_hdi) +
  geom_point(
    aes(freedom, corruption,
      col = economic_status,
      text = paste(
        "Freedom:", freedom, "<br>",
        "Corruption perception index:", corruption, "<br>",
        "Country:", country, "<br>",
        "Economic status:", economic_status
      )
    ),
    size = 3
  ) +
  labs(
    x = "Freedom", y = "Corruption perception index",
    title = "Corruption perception index and freedom depending on economic status (2016)",
    colour = "Economic status:"
  ) +
  # Least-squares line per economic status, without a confidence band
  geom_smooth(aes(freedom, corruption, col = economic_status), method = "lm", se = FALSE) +
  theme_few() +
  scale_color_paletteer_d("ggthemes::excel_Headlines") +
  facet_grid(~economic_status) +
  # Reference lines drawn from the med_* columns within each panel
  geom_hline(aes(yintercept = med_corrupt, group = economic_status), colour = "red") +
  geom_vline(aes(xintercept = med_freedom, group = economic_status), colour = "blue")
# Convert to plotly, showing only the custom tooltip text
fig_2_dev_facet <- ggplotly(fig_2_dev_facet, tooltip = "text")
fig_2_dev_facetIn the graph below we see the histogram of the control of corruption. The median value of this index is -0.374
# Distribution of the World Bank Control of Corruption indicator (2016)
ggplot(combined2016_raw_hdi, aes(x = wbo.ctr_corrupt)) +
  geom_histogram(fill = "#57CC99", colour = "#4A403A") +
  labs(
    title = "Histogram: World Bank Control of Corruption (2016)",
    x = "Control of Corruption",
    y = "Number of countries"
  ) +
theme_few()
This graph depicts the corruption perception index and control of corruption by sub-region. The red line on the graph shows the median value of the corruption perception index. The blue line is the median value of control of corruption. The majority of sub-regions are at the median value; the exceptions are Western Europe, Northern America, Australia and New Zealand. In South-Eastern Asia, countries differ widely in both perception and control of corruption: Singapore has the lowest value of the corruption perception index and one of the highest values of control of corruption, whereas Vietnam, the Philippines and Thailand are at the median value of the corruption perception index. Countries such as Somalia and Rwanda from the Sub-Saharan region are most probably outliers. Western Europe, Northern America, Australia and New Zealand have some of the lowest corruption perception values and some of the highest levels of corruption control. The graphs of the corruption perception index against Rule of law, Government effectiveness and Regulatory quality share entirely the same pattern as the graph of the corruption perception index against control of corruption.
# control of corruption and corruption
# Interactive scatter plot of the corruption perception index against the
# World Bank Control of Corruption indicator, colored by sub-region.
# The tooltip text uses bare column names instead of `combined2016_raw_hdi$...`
# so each label is evaluated on the same rows as the point it belongs to.
fig4 <- ggplot(combined2016_raw_hdi) +
  geom_point(
    aes(wbo.ctr_corrupt, corruption,
      col = regional_indicator,
      text = paste(
        "Control of Corruption:", round(wbo.ctr_corrupt, 3), "<br>",
        "Corruption perception index:", corruption, "<br>",
        "Country:", country, "<br>",
        "Sub-region:", regional_indicator
      )
    ),
    size = 3
  ) +
  # Reference lines at the overall medians of both variables
  geom_hline(yintercept = median(combined2016_raw_hdi$corruption, na.rm = TRUE), color = "red") +
  geom_vline(xintercept = median(combined2016_raw_hdi$wbo.ctr_corrupt, na.rm = TRUE), color = "blue") +
  labs(
    x = "Control of Corruption", y = "Corruption perception index",
    title = "Corruption perception index and Control of Corruption depending on sub-region (2016)",
    colour = "Sub-region"
  ) +
  theme_few() +
  scale_color_paletteer_d("ggthemes::Red_Blue_Brown")
# Convert to plotly, showing only the custom tooltip text
fig4 <- ggplotly(fig4, tooltip = "text")
fig4In the graph below corruption perception index and control of corruption depending on the economic status are depicted. We can see two clusters: developed countries (light blue color) and developing countries (light pink color). Developed countries have a high negative correlation (-0.731) between control of corruption and corruption. In developing countries correlation between these parameters is low (-0.268)
# control of corruption and corruption (Economic status)
# Interactive scatter plot of the corruption perception index against control
# of corruption, colored by economic status, with one linear trend line per
# status group. The tooltip text uses bare column names instead of
# `combined2016_raw_hdi$...` so each label is evaluated on the same rows as
# the point it belongs to.
fig_4_dev <- ggplot(combined2016_raw_hdi) +
  geom_point(
    aes(wbo.ctr_corrupt, corruption,
      col = economic_status,
      text = paste(
        "Control of Corruption:", round(wbo.ctr_corrupt, 3), "<br>",
        "Corruption perception index:", corruption, "<br>",
        "Country:", country, "<br>",
        "Economic status:", economic_status
      )
    ),
    size = 3
  ) +
  # Reference lines at the overall medians of both variables
  geom_hline(yintercept = median(combined2016_raw_hdi$corruption, na.rm = TRUE), color = "red") +
  geom_vline(xintercept = median(combined2016_raw_hdi$wbo.ctr_corrupt, na.rm = TRUE), color = "blue") +
  labs(
    x = "Control of Corruption", y = "Corruption perception index",
    title = "Corruption perception index and Control of Corruption depending on economic status",
    colour = "Economic status:"
  ) +
  # Least-squares line per economic status, without a confidence band
  geom_smooth(aes(wbo.ctr_corrupt, corruption, col = economic_status), method = "lm", se = FALSE) +
  theme_few() +
  scale_color_paletteer_d("ggthemes::excel_Headlines")
# Convert to plotly, showing only the custom tooltip text
fig_4_dev <- ggplotly(fig_4_dev, tooltip = "text")
fig_4_devHere you can see clearly the relationship between the corruption perception index and control of corruption depending on the Economic status. Median values of control of corruption (blue) and corruption perception index (red) are calculated for each group of development.
# Same plot as above, split into one panel per economic status.
# The tooltip text uses bare column names: with faceting, a full-length
# `combined2016_raw_hdi$...` vector is not guaranteed to line up with the rows
# drawn in each panel, so tooltips could be attached to the wrong points.
# NOTE(review): `med_corrupt` and `med_cor_ctr` are not created in this section;
# presumably they are per-group median columns added upstream - confirm.
fig_4_dev_facet <- ggplot(combined2016_raw_hdi) +
  geom_point(
    aes(wbo.ctr_corrupt, corruption,
      col = economic_status,
      text = paste(
        "Control of corruption:", round(wbo.ctr_corrupt, 3), "<br>",
        "Corruption perception index:", corruption, "<br>",
        "Country:", country, "<br>",
        "Economic status:", economic_status
      )
    ),
    size = 3
  ) +
  labs(
    x = "Control of corruption", y = "Corruption perception index",
    title = "Corruption perception index and control of corruption depending on economic status",
    colour = "Economic status:"
  ) +
  # Least-squares line per economic status, without a confidence band
  geom_smooth(aes(wbo.ctr_corrupt, corruption, col = economic_status), method = "lm", se = FALSE) +
  theme_few() +
  scale_color_paletteer_d("ggthemes::excel_Headlines") +
  facet_grid(~economic_status) +
  # Reference lines drawn from the med_* columns within each panel
  geom_hline(aes(yintercept = med_corrupt, group = economic_status), colour = "red") +
  geom_vline(aes(xintercept = med_cor_ctr, group = economic_status), colour = "blue")
# Convert to plotly, showing only the custom tooltip text
fig_4_dev_facet <- ggplotly(fig_4_dev_facet, tooltip = "text")
fig_4_dev_facetIn the graph below we see the histogram of the rule of law. The median value of this index is -0.264
# Distribution of the World Bank Rule of Law indicator (2016)
ggplot(combined2016_raw_hdi, aes(x = wbo.law)) +
  geom_histogram(fill = "#79B4B7", colour = "#4A403A") +
  labs(
    title = "Histogram: World Bank Rule of Law (2016)",
    x = "Rule of Law",
    y = "Number of countries"
  ) +
  theme_few()
# rule of law and corruption
# Interactive scatter plot of the corruption perception index against the
# World Bank Rule of Law indicator, colored by sub-region.
# The tooltip text uses bare column names instead of `combined2016_raw_hdi$...`
# so each label is evaluated on the same rows as the point it belongs to.
fig3 <- ggplot(combined2016_raw_hdi) +
  geom_point(
    aes(wbo.law, corruption,
      col = regional_indicator,
      text = paste(
        "Rule of Law:", round(wbo.law, 3), "<br>",
        "Corruption perception index:", corruption, "<br>",
        "Country:", country, "<br>",
        "Sub-region:", regional_indicator
      )
    ),
    size = 3
  ) +
  # Reference lines at the overall medians of both variables
  geom_hline(yintercept = median(combined2016_raw_hdi$corruption, na.rm = TRUE), color = "red") +
  geom_vline(xintercept = median(combined2016_raw_hdi$wbo.law, na.rm = TRUE), color = "blue") +
  labs(
    x = "Rule of Law", y = "Corruption perception index",
    title = "Corruption perception index and rule of Law depending on sub-region (2016)",
    colour = "Sub-region"
  ) +
  theme_few() +
  scale_color_paletteer_d("ggthemes::Red_Blue_Brown")
# Convert to plotly, showing only the custom tooltip text
fig3 <- ggplotly(fig3, tooltip = "text")
fig3In the graph below we see the histogram of the government effectiveness. The median value of this index is -0.161
# Histogram: World Bank Government Effectiveness (2016)
# The x axis is labeled with the variable actually plotted (wbo.gov_effect);
# it was previously labeled "Control of Corruption".
ggplot(combined2016_raw_hdi) +
  geom_histogram(aes(wbo.gov_effect), fill = "#FFC069", col = "#4A403A") +
  labs(
    x = "Government Effectiveness", y = "Number of countries",
    title = "Histogram: World Bank Government Effectiveness (2016)"
  ) +
  theme_few()
# government effectiveness and corruption
# Interactive scatter plot of the corruption perception index against the
# World Bank Government Effectiveness indicator, colored by sub-region.
# The tooltip text uses bare column names instead of `combined2016_raw_hdi$...`
# so each label is evaluated on the same rows as the point it belongs to.
fig5 <- ggplot(combined2016_raw_hdi) +
  geom_point(
    aes(wbo.gov_effect, corruption,
      col = regional_indicator,
      text = paste(
        "Government Effectiveness:", round(wbo.gov_effect, 3), "<br>",
        "Corruption perception index:", corruption, "<br>",
        "Country:", country, "<br>",
        "Sub-region:", regional_indicator
      )
    ),
    size = 3
  ) +
  # Reference lines at the overall medians of both variables
  geom_hline(yintercept = median(combined2016_raw_hdi$corruption, na.rm = TRUE), color = "red") +
  geom_vline(xintercept = median(combined2016_raw_hdi$wbo.gov_effect, na.rm = TRUE), color = "blue") +
  labs(
    x = "Government Effectiveness", y = "Corruption perception index",
    title = "Corruption perception index and Government Effectiveness depending on sub-region (2016)",
    colour = "Sub-region"
  ) +
  theme_few() +
  scale_color_paletteer_d("ggthemes::Red_Blue_Brown")
# Convert to plotly, showing only the custom tooltip text
fig5 <- ggplotly(fig5, tooltip = "text")
fig5In the graph below we see the histogram of the regulatory quality. The median value of this index is -0.107
# Distribution of the World Bank Regulatory Quality indicator (2016)
ggplot(combined2016_raw_hdi, aes(x = wbo.reg_qua)) +
  geom_histogram(fill = "#FFDAC7", colour = "#4A403A") +
  labs(
    title = "Histogram: World Bank Regulatory Quality (2016)",
    x = "Regulatory Quality",
    y = "Number of countries"
  ) +
  theme_few()
# regulatory quality and corruption
# Interactive scatter plot of the corruption perception index against the
# World Bank Regulatory Quality indicator, colored by sub-region.
# The tooltip text uses bare column names instead of `combined2016_raw_hdi$...`
# so each label is evaluated on the same rows as the point it belongs to.
fig6 <- ggplot(combined2016_raw_hdi) +
  geom_point(
    aes(wbo.reg_qua, corruption,
      col = regional_indicator,
      text = paste(
        "Regulatory Quality:", round(wbo.reg_qua, 3), "<br>",
        "Corruption perception index:", corruption, "<br>",
        "Country:", country, "<br>",
        "Sub-region:", regional_indicator
      )
    ),
    size = 3
  ) +
  # Reference lines at the overall medians of both variables
  geom_hline(yintercept = median(combined2016_raw_hdi$corruption, na.rm = TRUE), color = "red") +
  geom_vline(xintercept = median(combined2016_raw_hdi$wbo.reg_qua, na.rm = TRUE), color = "blue") +
  labs(
    x = "Regulatory Quality", y = "Corruption perception index",
    title = "Corruption perception index and Regulatory Quality depending on sub-region (2016)",
    colour = "Sub-region"
  ) +
  theme_few() +
  scale_color_paletteer_d("ggthemes::Red_Blue_Brown")
# Convert to plotly, showing only the custom tooltip text
fig6 <- ggplotly(fig6, tooltip = "text")
fig6
The graph below depicts the corruption perception index and the happiness score by sub-region. We do not see a linear relationship between the corruption perception index and the happiness score. On the graph, we see several clusters of countries: Sub-Saharan African countries (except for Mauritius), Latin American and Caribbean countries (except for Haiti and Venezuela), and Western Europe together with North America and ANZ. Sub-Saharan African countries have a happiness score lower than the median. Most Latin American and Caribbean countries have a happiness score higher than the median. These two clusters are characterized by a corruption perception index of 0.67 to 0.9. Western Europe (Northern countries) with North America and ANZ have the lowest corruption perception index (less than 0.52) and the highest happiness score (more than 6).
# Corruption perception index and happiness score depending on sub-region
# Interactive scatter plot with the corruption perception index on the x axis
# and the happiness score on the y axis, colored by sub-region.
# The tooltip text uses bare column names instead of `combined2016_raw_hdi$...`
# so each label is evaluated on the same rows as the point it belongs to.
fig7 <- ggplot(combined2016_raw_hdi) +
  geom_point(
    aes(corruption, score,
      col = regional_indicator,
      text = paste(
        "Happiness score:", score, "<br>",
        "Corruption perception index:", corruption, "<br>",
        "Country:", country, "<br>",
        "Sub-region:", regional_indicator
      )
    ),
    size = 3
  ) +
  # Red: median happiness score; blue: median corruption perception index
  geom_hline(yintercept = median(combined2016_raw_hdi$score, na.rm = TRUE), color = "red") +
  geom_vline(xintercept = median(combined2016_raw_hdi$corruption, na.rm = TRUE), color = "blue") +
  labs(
    x = "Corruption perception index", y = "Happiness score",
    title = "Corruption perception index and happiness score depending on sub-region (2016)",
    colour = "Sub-region"
  ) +
  theme_few() +
  scale_color_paletteer_d("ggthemes::Red_Blue_Brown")
# Convert to plotly, showing only the custom tooltip text
fig7 <- ggplotly(fig7, tooltip = "text")
fig7Let’s build a linear regression model to explore what factors influence the corruption perception index. Rule of law, government effectiveness, regulatory quality are not used in models as explanatory variables in order to avoid multicollinearity problems (because the correlation between variables about law from World Bank is high (more than 0.9)).
Interpretation
Ceteris paribus:
If we increase freedom by 0.1, we would expect the corruption perception index to decrease by 0.0235.
In developing countries, the corruption perception index is on average lower than in developed countries by 0.065.
b0 - intercept
b1 = wbo.ctr_corrupt - regression coefficient for linear effect of X on Y
b2 = I(wbo.ctr_corrupt^2) - regression coefficient for quadratic effect of X on Y
The value of -0.065 represents the downward linear trend in the value of Y along the X-axis, and the value of -0.072 represents the curvature in the data. The quadratic term is negative, so the fitted curve is concave (an inverted parabola): beyond its vertex, the corruption perception index decreases at an increasing rate as control of corruption increases. The adjusted R2 of this model is 0.656.
# Pooled model: corruption perception index explained by freedom, a linear and
# a quadratic term of control of corruption, and economic status.
model <- lm(
  formula = corruption ~ freedom + wbo.ctr_corrupt + I(wbo.ctr_corrupt^2) + economic_status,
  data = combined2016_raw_hdi
)
stargazer(model, type = "html")| Dependent variable: | |
| corruption | |
| freedom | -0.235*** |
| (0.080) | |
| wbo.ctr_corrupt | -0.065*** |
| (0.013) | |
| I(wbo.ctr_corrupt2) | -0.072*** |
| (0.008) | |
| economic_statusDeveloping | -0.065*** |
| (0.023) | |
| Constant | 1.025*** |
| (0.062) | |
| Observations | 142 |
| R2 | 0.665 |
| Adjusted R2 | 0.656 |
| Residual Std. Error | 0.107 (df = 137) |
| F Statistic | 68.078*** (df = 4; 137) |
| Note: | p<0.1; p<0.05; p<0.01 |
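According to the Kolmogorov-Smirnov test (p-value 0.151), the stochastic disturbance (error term) is normally distributed; note, however, that the Shapiro-Wilk, Cramer-von Mises and Anderson-Darling tests shown below reject normality.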
## -----------------------------------------------
## Test Statistic pvalue
## -----------------------------------------------
## Shapiro-Wilk 0.9402 0.0000
## Kolmogorov-Smirnov 0.0954 0.1510
## Cramer-von Mises 38.9011 0.0000
## Anderson-Darling 1.7517 2e-04
## -----------------------------------------------
In the EDA part we saw that there are two clusters: developed and developing countries. Let’s try to build models for each group. The results are shown in the table below. The second model is the best in this case. This model is built on data about 95 developed countries and could be used to predict the corruption perception index for developed countries. The adjusted R2 of this model is 0.736. The explanatory variables used in this model are control of corruption and the squared term of control of corruption.
b0 - intercept
b1 = wbo.ctr_corrupt - regression coefficient for linear effect of X on Y
b2 = I(wbo.ctr_corrupt^2) - regression coefficient for quadratic effect of X on Y
The value of -0.048 represents the downward linear trend in the value of Y along the X-axis, and the value of -0.092 represents the curvature in the data. The quadratic term is negative, so the fitted curve is concave (an inverted parabola): beyond its vertex, the corruption perception index decreases at an increasing rate as control of corruption increases. The adjusted R2 of this model is 0.736.
Models with data about developing countries have low R2.
# Split the data by economic status and fit separate models for each group.
developed <- combined2016_raw_hdi %>%
  filter(economic_status == "Developed")
developing <- combined2016_raw_hdi %>%
  filter(economic_status == "Developing")
# In an R formula `x^2` is interaction expansion and reduces to plain `x`, so
# the linear term is written as `wbo.ctr_corrupt`; the real squared term is
# `I(wbo.ctr_corrupt^2)`. The fitted models are the same as with `x^2`.
# Developed countries: linear + quadratic control of corruption, plus freedom
model1 <- lm(corruption ~ wbo.ctr_corrupt + I(wbo.ctr_corrupt^2) + freedom,
  data = developed
)
# Developed countries: linear + quadratic control of corruption
model2 <- lm(corruption ~ wbo.ctr_corrupt + I(wbo.ctr_corrupt^2),
  data = developed
)
# Developed countries: quadratic control of corruption only
model3 <- lm(corruption ~ I(wbo.ctr_corrupt^2),
  data = developed
)
# Developing countries: linear + quadratic control of corruption, plus freedom
model4 <- lm(corruption ~ wbo.ctr_corrupt + I(wbo.ctr_corrupt^2) + freedom,
  data = developing
)
# Developing countries: linear + quadratic control of corruption
model5 <- lm(corruption ~ wbo.ctr_corrupt + I(wbo.ctr_corrupt^2),
  data = developing
)
# Developing countries: linear + quadratic freedom
model6 <- lm(corruption ~ freedom + I(freedom^2),
  data = developing
)
stargazer(model1, model2, model3, model4, model5, model6, type = "html")| Dependent variable: | ||||||
| corruption | ||||||
| (1) | (2) | (3) | (4) | (5) | (6) | |
| wbo.ctr_corrupt | -0.045*** | -0.048*** | -0.392*** | -0.411*** | ||
| (0.015) | (0.015) | (0.055) | (0.061) | |||
| I(wbo.ctr_corrupt2) | -0.088*** | -0.092*** | -0.117*** | -0.247*** | -0.234*** | |
| (0.011) | (0.011) | (0.008) | (0.034) | (0.037) | ||
| freedom | -0.128 | -0.329*** | 1.594* | |||
| (0.102) | (0.102) | (0.901) | ||||
| I(freedom2) | -1.437** | |||||
| (0.674) | ||||||
| Constant | 0.955*** | 0.860*** | 0.874*** | 0.921*** | 0.655*** | 0.401 |
| (0.076) | (0.014) | (0.014) | (0.086) | (0.027) | (0.297) | |
| Observations | 95 | 95 | 95 | 47 | 47 | 47 |
| R2 | 0.746 | 0.742 | 0.713 | 0.606 | 0.511 | 0.187 |
| Adjusted R2 | 0.738 | 0.736 | 0.710 | 0.578 | 0.489 | 0.150 |
| Residual Std. Error | 0.104 (df = 91) | 0.105 (df = 92) | 0.110 (df = 93) | 0.082 (df = 43) | 0.090 (df = 44) | 0.116 (df = 44) |
| F Statistic | 89.136*** (df = 3; 91) | 132.074*** (df = 2; 92) | 231.021*** (df = 1; 93) | 22.016*** (df = 3; 43) | 22.975*** (df = 2; 44) | 5.074** (df = 2; 44) |
| Note: | p<0.1; p<0.05; p<0.01 | |||||
# Correlations within the developed-country subset:
# corruption vs. control of corruption, then corruption vs. freedom.
# cor() is called with its defaults here.
cor(developed$wbo.ctr_corrupt, developed$corruption)
cor(developed$freedom, developed$corruption)
# Correlations within the developing-country subset:
# corruption vs. control of corruption, then corruption vs. freedom.
cor(developing$wbo.ctr_corrupt, developing$corruption)
cor(developing$freedom, developing$corruption)
According to the Kolmogorov-Smirnov test (p-value 0.8748), the stochastic disturbance (error term) is normally distributed; the Shapiro-Wilk and Anderson-Darling tests shown below agree.
## -----------------------------------------------
## Test Statistic pvalue
## -----------------------------------------------
## Shapiro-Wilk 0.9863 0.4304
## Kolmogorov-Smirnov 0.0591 0.8748
## Cramer-von Mises 25.482 0.0000
## Anderson-Darling 0.4608 0.2546
## -----------------------------------------------
A residual is the difference between the observed value and the mean value that the model predicts for that observation. Residuals in the second linear regression model are represented on the graph below. They are shown as light grey straight lines. Observed values are shown by colored points. The redder and larger the point, the further it is from the predicted value. The blacker and smaller the point, the closer it is to the predicted value. The black (transparent inside) points that lie on the line are the mean values that the model predicts for those observations.
First of all, the line is curved because there is a quadratic relationship between control of corruption and the corruption perception index, and the graph of a quadratic function is a parabola. The equation of a parabola is y = ax^2 + bx + c. In our case it is: corruption perception index = -0.092 * (control of corruption)^2 - 0.048 * control of corruption + 0.860
where a = -0.092, b = -0.048, c = 0.860, x = control of corruption
In our case the parabola first goes up (while control of corruption < -0.26) and then goes down (once control of corruption > -0.26). This means that when control of corruption is less than -0.26, an increase in control of corruption leads to an increase in the corruption perception index, but when control of corruption is more than -0.26, an increase in control of corruption leads to a decrease in the corruption perception index.
How was “control of corruption” = -0.26 obtained? It is the vertex of the parabola, where the derivative equals zero.
y’ = (ax^2 + bx + c)’
y’ = 2ax + b
or
(corruption perception index)’ = (-0.092 * (control of corruption)^2 - 0.048 * control of corruption + 0.860)’
(corruption perception index)’ = -0.184 * control of corruption - 0.048
-0.184 * control of corruption - 0.048 = 0
x = -0.048 / 0.184 ≈ -0.26
Example from our graph:
Corruption Perception index increase if control of corruption is less than -0.26. Let’s compare 2 countries
Uzbekistan (Control of Corruption -1.169; Corruption perception index 0.84)
Mongolia (Control of Corruption -0.487; Corruption perception index 0.9)
In Uzbekistan Control of Corruption is less than in Mongolia (-1.169<-0.487), and the Corruption perception index in Uzbekistan is also less than in Mongolia (0.84<0.9).
The Corruption perception index decreases if control of corruption is more than -0.26.
Let’s compare 2 countries
Latvia(Control of Corruption 0.431; Corruption perception index 0.9)
Austria (Control of Corruption 1.549; Corruption perception index 0.524)
In Latvia Control of Corruption is less than in Austria (0.431<1.549) and Corruption perception index in Latvia is more than in Austria (0.9>0.524)
developed$predicted <- predict(model2) # Save the predicted values
developed$residuals <- residuals(model2) # Save the residual values
# graph with residuals (2016)
# Observed points are colored and sized by absolute residual, joined by a
# segment to their fitted value (hollow points) on the quadratic fit line.
# The tooltip text uses bare column names instead of `developed$...`.
result <- ggplot(developed, aes(x = wbo.ctr_corrupt, y = corruption)) +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2), se = FALSE, color = "lightgrey") +
  # Vertical segment from each observation to its prediction
  geom_segment(aes(xend = wbo.ctr_corrupt, yend = predicted), alpha = .2) +
  # Color AND size both encode the absolute residual
  geom_point(aes(
    color = abs(residuals), size = abs(residuals),
    text = paste(
      "Control of corruption:", round(wbo.ctr_corrupt, 3), "<br>",
      "Corruption perception index:", corruption, "<br>",
      "Corruption perception index prediction:", round(predicted, 3), "<br>",
      "Country:", country, "<br>",
      "Sub-region:", regional_indicator
    )
  )) +
  scale_color_continuous(low = "black", high = "red") +
  # Hide both legends; "none" replaces the deprecated `FALSE`
  guides(color = "none", size = "none") +
  # Fitted values as hollow circles
  geom_point(aes(y = predicted), shape = 1) +
  labs(
    x = "Control of corruption", y = "Corruption",
    title = "Residuals in the Linear Regression model (2016)"
  ) +
  theme_few()
# Convert to plotly, showing only the custom tooltip text
result <- ggplotly(result, tooltip = "text")
result
The first model describes the relationship between the happiness score and control of corruption, based on data about 142 countries (developed and developing). The second model contains information only about developing countries, and the third only about developed countries. There is a positive correlation (0.7) between the happiness score and control of corruption. Based on the results of the first model, we can draw the following conclusion: an increase in control of corruption by 0.1 leads to an increase in the happiness score by 0.0765. Control of corruption is statistically significant with a p-value less than 0.01, and 48.7% of the variability of the response data can be explained by the model, with a normal distribution of the residuals and relatively constant variance of the error terms.
The second model (developing countries) is not reliable. Control of corruption is not statistically significant.
Based on the results of the third model, we can draw the following conclusion: an increase in control of corruption by 0.1 leads to an increase in the happiness score by 0.06. Control of corruption is statistically significant with a p-value less than 0.01, and 46.3% of the variability of the response data can be explained by the model, with a normal distribution of the residuals and relatively constant variance of the error terms.
# Happiness score regressed on control of corruption, all countries
fit_happy_corrup <- lm(
  formula = score ~ wbo.ctr_corrupt,
  data = combined2016_raw_hdi
)
# Same regression, developing countries only
fit_happy_corrup_developing <- lm(
  formula = score ~ wbo.ctr_corrupt,
  data = developing
)
# Same regression, developed countries only
fit_happy_corrup_developed <- lm(
  formula = score ~ wbo.ctr_corrupt,
  data = developed
)
stargazer(fit_happy_corrup, fit_happy_corrup_developing, fit_happy_corrup_developed, type = "html")| Dependent variable: | |||
| score | |||
| (1) | (2) | (3) | |
| wbo.ctr_corrupt | 0.765*** | 0.201 | 0.600*** |
| (0.066) | (0.250) | (0.066) | |
| Constant | 5.442*** | 4.546*** | 5.712*** |
| (0.069) | (0.237) | (0.072) | |
| Observations | 142 | 47 | 95 |
| R2 | 0.490 | 0.014 | 0.469 |
| Adjusted R2 | 0.487 | -0.008 | 0.463 |
| Residual Std. Error | 0.817 (df = 140) | 0.840 (df = 45) | 0.670 (df = 93) |
| F Statistic | 134.725*** (df = 1; 140) | 0.651 (df = 1; 45) | 82.108*** (df = 1; 93) |
| Note: | p<0.1; p<0.05; p<0.01 | ||
## -----------------------------------------------
## Test Statistic pvalue
## -----------------------------------------------
## Shapiro-Wilk 0.978 0.0217
## Kolmogorov-Smirnov 0.0863 0.2403
## Cramer-von Mises 9.778 0.0000
## Anderson-Darling 0.7654 0.0456
## -----------------------------------------------
## -----------------------------------------------
## Test Statistic pvalue
## -----------------------------------------------
## Shapiro-Wilk 0.9531 0.0019
## Kolmogorov-Smirnov 0.0817 0.5239
## Cramer-von Mises 7.235 0.0000
## Anderson-Darling 0.8893 0.0222
## -----------------------------------------------
Residuals in the first linear regression model are represented on the graph below. They are shown as light grey straight lines. Observed values are shown by colored points. The redder and larger the point, the further it is from the predicted value. The blacker and smaller the point, the closer it is to the predicted value. The black (transparent inside) points that lie on the line are the mean values that the model predicts for those observations.
combined2016_raw_hdi$predicted <- predict(fit_happy_corrup) # Save the predicted values
combined2016_raw_hdi$residuals <- residuals(fit_happy_corrup) # Save the residual values
# graph with residuals (2016)
# Observed points are colored and sized by absolute residual, joined by a
# segment to their fitted value (hollow points) on the linear fit line.
# The tooltip text uses bare column names instead of `combined2016_raw_hdi$...`.
result <- ggplot(combined2016_raw_hdi, aes(x = wbo.ctr_corrupt, y = score)) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "lightgrey") +
  # Vertical segment from each observation to its prediction
  geom_segment(aes(xend = wbo.ctr_corrupt, yend = predicted), alpha = .2) +
  # Color AND size both encode the absolute residual
  geom_point(aes(
    color = abs(residuals), size = abs(residuals),
    text = paste(
      "Control of corruption:", round(wbo.ctr_corrupt, 3), "<br>",
      "Happiness score:", score, "<br>",
      "Happiness score prediction:", round(predicted, 3), "<br>",
      "Country:", country, "<br>",
      "Sub-region:", regional_indicator
    )
  )) +
  scale_color_continuous(low = "black", high = "red") +
  # Hide both legends; "none" replaces the deprecated `FALSE`
  guides(color = "none", size = "none") +
  # Fitted values as hollow circles
  geom_point(aes(y = predicted), shape = 1) +
  labs(
    x = "Control of corruption", y = "Happiness score",
    title = "Residuals in the Linear Regression model (2016)"
  ) +
  theme_few()
# Convert to plotly, showing only the custom tooltip text, then display it
result <- ggplotly(result, tooltip = "text")
result