#Source of the data: https://www.kaggle.com/datasets/psterk/income-inequality?select=combined_final_last_10_years.csv
library(psych)
library(car)
## Warning: package 'car' was built under R version 4.0.4
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.0.5
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(rstatix)
##
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
##
## filter
library(onewaytests)
##
## Attaching package: 'onewaytests'
## The following object is masked from 'package:psych':
##
## describe
library(ggpubr)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(olsrr)
## Warning: package 'olsrr' was built under R version 4.0.5
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(pastecs)
## Warning: package 'pastecs' was built under R version 4.0.5
##
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
##
## first, last
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.0.4
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:onewaytests':
##
## describe
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following object is masked from 'package:psych':
##
## describe
## The following objects are masked from 'package:base':
##
## format.pval, units
library(estimatr)
library(readxl)
library(openxlsx)
setwd("C:/Users/Korisnik/Desktop/RMT R Homework")
data <- read.xlsx("Gini_Index.xlsx")
#creating dummy variables for continents
continent_dummy <- model.matrix(~ continent - 1, data = data)
#renaming dummy variables
colnames(continent_dummy) <- gsub("continent", "", colnames(continent_dummy))
#connecting the dataset and dummy variables
data <- cbind(data, continent_dummy)
head(data)
## continent country year demox_eiu income_per_person invest_%_gdp
## 1 Africa Congo, Dem. Rep. 2006 27.6 605 14.6
## 2 Africa Congo, Dem. Rep. 2007 25.2 623 13.7
## 3 Africa Congo, Dem. Rep. 2008 22.8 640 10.9
## 4 Africa Congo, Dem. Rep. 2009 22.1 637 14.6
## 5 Africa Congo, Dem. Rep. 2010 21.5 660 28.8
## 6 Africa Congo, Rep. 2006 31.9 5000 21.6
## tax_%_gdp gini_index Africa Americas Asia Europe Oceania
## 1 6.83 42.2 1 0 0 0 0
## 2 6.99 42.1 1 0 0 0 0
## 3 8.97 42.1 1 0 0 0 0
## 4 7.89 42.1 1 0 0 0 0
## 5 8.35 42.1 1 0 0 0 0
## 6 5.79 47.6 1 0 0 0 0
summary(data)
## continent country year demox_eiu
## Length:1234 Length:1234 Min. :2006 Min. :16.60
## Class :character Class :character 1st Qu.:2008 1st Qu.:46.25
## Mode :character Mode :character Median :2011 Median :64.90
## Mean :2011 Mean :62.08
## 3rd Qu.:2014 3rd Qu.:78.80
## Max. :2016 Max. :99.30
## income_per_person invest_%_gdp tax_%_gdp gini_index
## Min. : 605 Min. : 0.00 Min. : 0.0435 Min. :24.40
## 1st Qu.: 5935 1st Qu.:19.50 1st Qu.:12.3250 1st Qu.:32.10
## Median : 13700 Median :23.20 Median :15.9000 Median :36.00
## Mean : 20520 Mean :24.39 Mean :16.7688 Mean :38.04
## 3rd Qu.: 31175 3rd Qu.:27.70 3rd Qu.:21.3000 3rd Qu.:42.58
## Max. :120000 Max. :67.90 Max. :62.9000 Max. :63.90
## Africa Americas Asia Europe
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.2164 Mean :0.1548 Mean :0.2658 Mean :0.3452
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Oceania
## Min. :0.00000
## 1st Qu.:0.00000
## Median :0.00000
## Mean :0.01783
## 3rd Qu.:0.00000
## Max. :1.00000
describe(data)
## data
##
## 13 Variables 1234 Observations
## --------------------------------------------------------------------------------
## continent
## n missing distinct
## 1234 0 5
##
## lowest : Africa Americas Asia Europe Oceania
## highest: Africa Americas Asia Europe Oceania
##
## Value Africa Americas Asia Europe Oceania
## Frequency 267 191 328 426 22
## Proportion 0.216 0.155 0.266 0.345 0.018
## --------------------------------------------------------------------------------
## country
## n missing distinct
## 1234 0 138
##
## lowest : Afghanistan Albania Algeria Angola Argentina
## highest: Uruguay Uzbekistan Vietnam Zambia Zimbabwe
## --------------------------------------------------------------------------------
## year
## n missing distinct Info Mean Gmd .05 .10
## 1234 0 11 0.992 2011 3.547 2006 2007
## .25 .50 .75 .90 .95
## 2008 2011 2014 2015 2016
##
## lowest : 2006 2007 2008 2009 2010, highest: 2012 2013 2014 2015 2016
##
## Value 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016
## Frequency 108 110 115 117 121 122 119 112 107 106 97
## Proportion 0.088 0.089 0.093 0.095 0.098 0.099 0.096 0.091 0.087 0.086 0.079
## --------------------------------------------------------------------------------
## demox_eiu
## n missing distinct Info Mean Gmd .05 .10
## 1234 0 498 1 62.08 23.45 27.57 30.83
## .25 .50 .75 .90 .95
## 46.25 64.90 78.80 88.47 91.90
##
## lowest : 16.6 17.2 17.4 17.5 17.7, highest: 97.3 97.4 98.0 98.8 99.3
## --------------------------------------------------------------------------------
## income_per_person
## n missing distinct Info Mean Gmd .05 .10
## 1234 0 726 1 20520 20024 1386 2153
## .25 .50 .75 .90 .95
## 5935 13700 31175 44170 56235
##
## lowest : 605 623 637 640 660, highest: 97900 113000 116000 117000 120000
## --------------------------------------------------------------------------------
## invest_%_gdp
## n missing distinct Info Mean Gmd .05 .10
## 1234 0 319 1 24.39 8.336 13.90 16.23
## .25 .50 .75 .90 .95
## 19.50 23.20 27.70 34.10 39.30
##
## lowest : 0.00 4.70 8.27 8.33 8.40, highest: 57.80 58.20 61.70 67.70 67.90
## --------------------------------------------------------------------------------
## tax_%_gdp
## n missing distinct Info Mean Gmd .05 .10
## 1234 0 400 1 16.77 8.199 4.892 8.433
## .25 .50 .75 .90 .95
## 12.325 15.900 21.300 25.700 27.400
##
## lowest : 0.0435 0.0577 0.3210 0.3560 0.3640
## highest: 49.3000 51.2000 58.4000 60.2000 62.9000
## --------------------------------------------------------------------------------
## gini_index
## n missing distinct Info Mean Gmd .05 .10
## 1234 0 314 1 38.04 9.254 26.96 28.20
## .25 .50 .75 .90 .95
## 32.10 36.00 42.58 49.20 54.23
##
## lowest : 24.4 24.5 24.7 24.8 24.9, highest: 63.2 63.3 63.4 63.6 63.9
## --------------------------------------------------------------------------------
## Africa
## n missing distinct Info Sum Mean Gmd
## 1234 0 2 0.509 267 0.2164 0.3394
##
## --------------------------------------------------------------------------------
## Americas
## n missing distinct Info Sum Mean Gmd
## 1234 0 2 0.392 191 0.1548 0.2619
##
## --------------------------------------------------------------------------------
## Asia
## n missing distinct Info Sum Mean Gmd
## 1234 0 2 0.585 328 0.2658 0.3906
##
## --------------------------------------------------------------------------------
## Europe
## n missing distinct Info Sum Mean Gmd
## 1234 0 2 0.678 426 0.3452 0.4525
##
## --------------------------------------------------------------------------------
## Oceania
## n missing distinct Info Sum Mean Gmd
## 1234 0 2 0.053 22 0.01783 0.03505
##
## --------------------------------------------------------------------------------
data$continentFactor <- factor(data$continent,
levels = c("Africa", "Americas", "Asia", "Europe", "Oceania"),
labels = c("Africa", "Americas", "Asia", "Europe", "Oceania"))
#descriptive statistics:
#there are 1234 rows, 8 initial columns, with added 5 dummy variables for continents
#variables continent and country are string type, out of which continent was transformed into factor while country wasn't used - the rest of the variables are numeric
#the years span from 2006 to 2016
#looking at the closeness of the median and mean for each of the variables, we can conclude that the income_per_person is significantly skewed to the right - later on, we will winsorize this variable
#based on the means of the dummy variables, we can also see that the most countries are from Europe, while the least countries are from Oceania
#there are no missing values in the dataset
#RQ 1 : is there a statistical difference among continents when it comes to the democracy index?
#levene's test and shapiro test are both significant, so I did the Welch ANOVA and Kruskal-Wallis (since the assumptions are violated that much, Kruskal-Wallis is the better option)
describe(data$demox_eiu)
## data$demox_eiu
## n missing distinct Info Mean Gmd .05 .10
## 1234 0 498 1 62.08 23.45 27.57 30.83
## .25 .50 .75 .90 .95
## 46.25 64.90 78.80 88.47 91.90
##
## lowest : 16.6 17.2 17.4 17.5 17.7, highest: 97.3 97.4 98.0 98.8 99.3
describeBy(x = data$demox_eiu, group = data$continentFactor)
##
## Descriptive statistics by group
## group: Africa
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 267 47.5 18.03 45.1 46.79 20.61 16.6 82.8 66.2 0.33 -0.98 1.1
## ------------------------------------------------------------
## group: Americas
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 191 69.8 9.34 66.7 69.29 8.9 48.1 91.5 43.4 0.52 -0.47 0.68
## ------------------------------------------------------------
## group: Asia
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 328 49.48 18 48.6 49.04 24.02 17.2 82.5 65.3 0.12 -1.21 0.99
## ------------------------------------------------------------
## group: Europe
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 426 75.95 14.66 78 77.25 12.45 30.4 99.3 68.9 -0.91 0.89
## se
## X1 0.71
## ------------------------------------------------------------
## group: Oceania
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 22 91.63 0.97 92.05 91.69 0.82 90.1 92.6 2.5 -0.44 -1.44 0.21
leveneTest(data$demox_eiu, group = data$continentFactor)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 4 45.389 < 2.2e-16 ***
## 1229
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
data %>%
group_by(continentFactor) %>%
shapiro_test(demox_eiu)
## # A tibble: 5 x 4
## continentFactor variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 Africa demox_eiu 0.944 1.50e- 8
## 2 Americas demox_eiu 0.948 1.87e- 6
## 3 Asia demox_eiu 0.951 4.80e- 9
## 4 Europe demox_eiu 0.942 7.20e-12
## 5 Oceania demox_eiu 0.837 2.05e- 3
welch.test(demox_eiu ~ continentFactor,
data = data)
##
## Welch's Heteroscedastic F Test (alpha = 0.05)
## -------------------------------------------------------------
## data : demox_eiu and continentFactor
##
## statistic : 1018.486
## num df : 4
## denom df : 535.9722
## p.value : 8.503868e-249
##
## Result : Difference is statistically significant.
## -------------------------------------------------------------
pairwise.t.test(x = data$demox_eiu, g = data$continentFactor,
p.adj = "bonf")
##
## Pairwise comparisons using t tests with pooled SD
##
## data: data$demox_eiu and data$continentFactor
##
## Africa Americas Asia Europe
## Americas < 2e-16 - - -
## Asia 1 < 2e-16 - -
## Europe < 2e-16 6.9e-05 < 2e-16 -
## Oceania < 2e-16 7.5e-09 < 2e-16 4.9e-05
##
## P value adjustment method: bonferroni
pwc <- data %>%
pairwise_t_test(demox_eiu ~ continentFactor,
paired = FALSE,
p.adjust.method = "bonferroni")
ANOVA_results <- anova_test(demox_eiu ~ continentFactor,
data = data)
pwc <- pwc %>%
add_y_position(fun = "median", step.increase = 0.35)
ggboxplot(data, x = "continentFactor", y = "demox_eiu", add = "point") +
stat_pvalue_manual(pwc, hide.ns = FALSE) +
stat_summary(fun = mean, geom = "point", shape = 16, size = 4,
aes(group = continentFactor), color = "darkred",
position = position_dodge(width = 0.8)) +
stat_summary(fun = mean, colour = "darkred",
position = position_dodge(0.8),
geom = "text", vjust = -0.5, hjust = -1,
aes(label = round(after_stat(y), digits = 2), group = continentFactor)) +
labs(subtitle = get_test_label(ANOVA_results, detailed = TRUE),
caption = get_pwc_label(pwc))

kruskal.test(demox_eiu ~ continentFactor,
data = data)
##
## Kruskal-Wallis rank sum test
##
## data: demox_eiu by continentFactor
## Kruskal-Wallis chi-squared = 530.9, df = 4, p-value < 2.2e-16
kruskal_effsize(demox_eiu ~ continentFactor,
data = data)
## # A tibble: 1 x 5
## .y. n effsize method magnitude
## * <chr> <int> <dbl> <chr> <ord>
## 1 demox_eiu 1234 0.429 eta2[H] large
groups_nonpar <- wilcox_test(demox_eiu ~ continentFactor,
paired = FALSE,
p.adjust.method = "bonferroni",
data = data)
groups_nonpar
## # A tibble: 10 x 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 demox_eiu Africa Ameri~ 267 191 7750 5.34e-37 5.34e-36 ****
## 2 demox_eiu Africa Asia 267 328 41226 2.19e- 1 1 e+ 0 ns
## 3 demox_eiu Africa Europe 267 426 13916. 5.83e-63 5.83e-62 ****
## 4 demox_eiu Africa Ocean~ 267 22 0 6.48e-15 6.48e-14 ****
## 5 demox_eiu Americ~ Asia 191 328 51190 1.78e-33 1.78e-32 ****
## 6 demox_eiu Americ~ Europe 191 426 26398 2.99e-12 2.99e-11 ****
## 7 demox_eiu Americ~ Ocean~ 191 22 49.5 6.71e-14 6.71e-13 ****
## 8 demox_eiu Asia Europe 328 426 18348. 1.29e-67 1.29e-66 ****
## 9 demox_eiu Asia Ocean~ 328 22 0 4.08e-15 4.08e-14 ****
## 10 demox_eiu Europe Ocean~ 426 22 1208. 4.27e- 9 4.27e- 8 ****
pwc_2 <- data %>%
wilcox_test(demox_eiu ~ continentFactor,
paired = FALSE,
p.adjust.method = "bonferroni")
Kruskal_results <- kruskal_test(demox_eiu ~ continentFactor,
data = data)
pwc_2 <- pwc_2 %>%
add_y_position(fun = "median", step.increase = 0.35)
ggboxplot(data, x = "continentFactor", y = "demox_eiu", add = "point") +
stat_pvalue_manual(pwc_2, hide.ns = FALSE) +
stat_summary(fun = median, geom = "point", shape = 16, size = 4,
aes(group = continentFactor), color = "darkred",
position = position_dodge(width = 0.8)) +
stat_summary(fun = median, colour = "darkred",
position = position_dodge(0.8),
geom = "text", vjust = -0.5, hjust = -1,
aes(label = round(after_stat(y), digits = 2), group = continentFactor)) +
labs(subtitle = get_test_label(Kruskal_results, detailed = TRUE),
caption = get_pwc_label(pwc_2))

#interpreting the results of Kruskal-Wallis test, we found that the distribution of the democracy indexed differs significantly for at least one of the continents (chi=530.9, p<0.001), the effect size was large (eta2 = 0.429)
#post hoc tests have shown that there are significant differences (p<0.001) among all of the possible pairs, except for the Africa-Asia pair which wasn't significant
#RQ 2 : are the democracy index, GDP per capita, capital formation and tax revenue able to predict the gini index of countries?
data_numer <- data[ , c("demox_eiu", "income_per_person", "invest_%_gdp", "tax_%_gdp", "gini_index")]
scatterplotMatrix(data_numer,
smooth = FALSE)

#due to the distribution skewed to the right, we winsorized the income_per_person variable
quantile(data$income_per_person, 0.95)
## 95%
## 56235
data$income_per_person_W <- ifelse(test = data$income_per_person > 56235,
yes = 56235,
no = data$income_per_person)
par(mfrow = c(1, 2))
hist(data$income_per_person,
main = "Income per person")
hist(data$income_per_person_W,
main = "Income per person (W)")

fit <- lm(gini_index ~ demox_eiu + income_per_person_W + `invest_%_gdp` + `tax_%_gdp`,
data = data)
vif(fit)
## demox_eiu income_per_person_W `invest_%_gdp` `tax_%_gdp`
## 1.673661 1.422970 1.017604 1.199460
#vif values are not noticeably high, indicating that there is no multicolinearity
data$StdResid <- round(rstandard(fit), 3)
data$CooksD <- round(cooks.distance(fit), 3)
hist(data$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histogram of standardized residuals")
shapiro.test(data$StdResid)
##
## Shapiro-Wilk normality test
##
## data: data$StdResid
## W = 0.957, p-value < 2.2e-16
#shapiro test of standard residuals is significant (p<0.001) meaning that we have some extreme values that we need to clean
hist(data$CooksD,
xlab = "Cook's distances",
ylab = "Frequency",
main = "Histogram of Cook's distances")

head(data[order(data$StdResid), c("country", "StdResid")], 5)
## country StdResid
## 1197 Ukraine -2.027
## 1200 Ukraine -2.013
## 1198 Ukraine -2.002
## 1199 Ukraine -1.994
## 1201 Ukraine -1.992
head(data[order(-data$StdResid), c("country", "StdResid")], 5)
## country StdResid
## 220 South Africa 3.138
## 219 South Africa 3.128
## 221 South Africa 3.105
## 222 South Africa 3.071
## 223 South Africa 3.063
head(data[order(-data$CooksD), c("country", "CooksD")], 5)
## country CooksD
## 436 Suriname 0.026
## 435 Suriname 0.020
## 121 Lesotho 0.016
## 433 Suriname 0.012
## 432 Suriname 0.011
#identify which rows are outliers based on their standard residuals being above 2.5 or below -2.5
outliers <- which(data$StdResid > 2.5 | data$StdResid < -2.5)
print(outliers)
## [1] 38 39 40 41 42 43 44 45 46 47 48 181 182 183 184 185 186 187 188
## [20] 189 190 191 219 220 221 222 223 224 225 226 227 228 229 430 431 432 433 434
## [39] 435 436
#identify which rows are influential based on the 4/N rule for Cook's distance
threshold <- 4/1234
influential_obs <- which(data$CooksD > threshold)
print(influential_obs)
## [1] 13 14 15 16 17 18 38 39 40 41 42 43 44 45 46
## [16] 47 68 69 70 71 103 104 105 119 120 121 122 123 124 125
## [31] 126 127 128 177 178 181 182 183 184 185 186 187 188 189 190
## [46] 191 219 220 221 222 223 224 225 226 227 228 229 230 231 232
## [61] 236 262 263 431 432 433 434 435 436 595 596 597 598 599 600
## [76] 601 602 603 604 692 693 694 695 696 716 767 768 769 770 771
## [91] 772 1022 1023
#remove the rows that are outliers and/or influential
data <- data[-outliers, ]
data <- data[-influential_obs, ]
summary(data)
## continent country year demox_eiu
## Length:1101 Length:1101 Min. :2006 Min. :16.60
## Class :character Class :character 1st Qu.:2008 1st Qu.:47.20
## Mode :character Mode :character Median :2011 Median :65.90
## Mean :2011 Mean :62.77
## 3rd Qu.:2014 3rd Qu.:79.60
## Max. :2016 Max. :99.30
## income_per_person invest_%_gdp tax_%_gdp gini_index
## Min. : 605 Min. : 0.00 Min. : 0.0435 Min. :24.40
## 1st Qu.: 6310 1st Qu.:19.40 1st Qu.:12.1000 1st Qu.:32.00
## Median : 15700 Median :23.10 Median :15.7000 Median :35.60
## Mean : 21792 Mean :24.04 Mean :16.4541 Mean :37.14
## 3rd Qu.: 33400 3rd Qu.:27.20 3rd Qu.:21.2000 3rd Qu.:42.00
## Max. :120000 Max. :67.90 Max. :62.9000 Max. :57.30
## Africa Americas Asia Europe
## Min. :0.000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.158 Mean :0.1644 Mean :0.2779 Mean :0.3797
## 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Oceania continentFactor income_per_person_W StdResid
## Min. :0.00000 Africa :174 Min. : 605 Min. :-2.02700
## 1st Qu.:0.00000 Americas:181 1st Qu.: 6310 1st Qu.:-0.75900
## Median :0.00000 Asia :306 Median :15700 Median :-0.19800
## Mean :0.01998 Europe :418 Mean :20621 Mean :-0.08729
## 3rd Qu.:0.00000 Oceania : 22 3rd Qu.:33400 3rd Qu.: 0.48200
## Max. :1.00000 Max. :56235 Max. : 2.08600
## CooksD
## Min. :0.0000000
## 1st Qu.:0.0000000
## Median :0.0000000
## Mean :0.0005867
## 3rd Qu.:0.0010000
## Max. :0.0160000
fit <- lm(gini_index ~ demox_eiu + income_per_person_W + `invest_%_gdp` + `tax_%_gdp`,
data = data)
data$StdFitted <- scale(fit$fitted.values)
scatterplot(StdResid ~ StdFitted,
ylab = "Standardized residuals",
xlab = "Standardized fitted values",
smooth = FALSE,
regLine = FALSE,
boxplots = FALSE,
data = data)

#scatter plot between the standardized fitted values and standardized residuals indicates that the homoskedasticity assumption is not violated
ols_test_breusch_pagan(fit)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## --------------------------------------
## Response : gini_index
## Variables: fitted values of gini_index
##
## Test Summary
## -------------------------------
## DF = 1
## Chi2 = 52.66446
## Prob > Chi2 = 3.956867e-13
#Breusch-Pagan test is significant (p<0.001) meaning however that there might be some issues with the heteroskedasticity when it comes to the variance of residuals
round(stat.desc(data[c("gini_index", "demox_eiu", "income_per_person_W", "invest_%_gdp", "tax_%_gdp")], basic = FALSE), 2)
## gini_index demox_eiu income_per_person_W invest_%_gdp tax_%_gdp
## median 35.60 65.90 15700.00 23.10 15.70
## mean 37.14 62.77 20620.63 24.04 16.45
## SE.mean 0.22 0.63 500.03 0.24 0.23
## CI.mean.0.95 0.43 1.24 981.12 0.46 0.45
## var 53.27 437.31 275280248.34 61.00 57.67
## std.dev 7.30 20.91 16591.57 7.81 7.59
## coef.var 0.20 0.33 0.80 0.32 0.46
#coefficient of variance indicates that the income_per_person_W has the highest variance by far, which is to be expected given the range and the differences in possible incomes
rcorr(as.matrix(data[c("gini_index", "demox_eiu", "income_per_person_W", "invest_%_gdp", "tax_%_gdp")],
type = "pearson"))
## gini_index demox_eiu income_per_person_W invest_%_gdp
## gini_index 1.00 -0.32 -0.40 -0.04
## demox_eiu -0.32 1.00 0.53 -0.12
## income_per_person_W -0.40 0.53 1.00 -0.06
## invest_%_gdp -0.04 -0.12 -0.06 1.00
## tax_%_gdp -0.18 0.43 0.16 -0.04
## tax_%_gdp
## gini_index -0.18
## demox_eiu 0.43
## income_per_person_W 0.16
## invest_%_gdp -0.04
## tax_%_gdp 1.00
##
## n= 1101
##
##
## P
## gini_index demox_eiu income_per_person_W invest_%_gdp
## gini_index 0.0000 0.0000 0.2364
## demox_eiu 0.0000 0.0000 0.0000
## income_per_person_W 0.0000 0.0000 0.0454
## invest_%_gdp 0.2364 0.0000 0.0454
## tax_%_gdp 0.0000 0.0000 0.0000 0.2367
## tax_%_gdp
## gini_index 0.0000
## demox_eiu 0.0000
## income_per_person_W 0.0000
## invest_%_gdp 0.2367
## tax_%_gdp
#correlation analysis shows that the dependent variable gini_index has significant correlation (p<0.001) with each predictor except for the invest_%_gdp
#additionally, demox_eiu predictor indicating democracy level in the country is significantly correlated with every other predictor (p<0.001), and tax_%_gdp predictor is correlated additionally with income_per_person_W
summary(fit)
##
## Call:
## lm(formula = gini_index ~ demox_eiu + income_per_person_W + `invest_%_gdp` +
## `tax_%_gdp`, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.1538 -4.7270 -0.5216 4.0412 17.9605
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.569e+01 9.524e-01 47.970 < 2e-16 ***
## demox_eiu -4.508e-02 1.238e-02 -3.641 0.000284 ***
## income_per_person_W -1.422e-04 1.423e-05 -9.990 < 2e-16 ***
## `invest_%_gdp` -6.821e-02 2.564e-02 -2.660 0.007930 **
## `tax_%_gdp` -6.960e-02 2.906e-02 -2.395 0.016772 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.595 on 1096 degrees of freedom
## Multiple R-squared: 0.1865, Adjusted R-squared: 0.1835
## F-statistic: 62.8 on 4 and 1096 DF, p-value: < 2.2e-16
#results of the regression analysis show that each of the predictors has significant predictive value for the gini_index (p<0.01), and all of them are negatively oriented
#the model overall explains 18.65% variability of the dependent variable (R2 coefficient of determination) while the correlation coefficient R is 0.4318, and significant at p<0.001
fit1 <- lm_robust(gini_index ~ demox_eiu + income_per_person_W + `invest_%_gdp` + `tax_%_gdp`,
se_type = "HC1",
data = data)
summary(fit1)
##
## Call:
## lm_robust(formula = gini_index ~ demox_eiu + income_per_person_W +
## `invest_%_gdp` + `tax_%_gdp`, data = data, se_type = "HC1")
##
## Standard error type: HC1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower
## (Intercept) 45.6867094 9.760e-01 46.810 1.160e-263 43.7716779
## demox_eiu -0.0450753 1.237e-02 -3.644 2.810e-04 -0.0693461
## income_per_person_W -0.0001422 1.436e-05 -9.906 3.262e-22 -0.0001704
## `invest_%_gdp` -0.0682079 2.553e-02 -2.671 7.668e-03 -0.1183087
## `tax_%_gdp` -0.0696012 3.634e-02 -1.915 5.574e-02 -0.1409111
## CI Upper DF
## (Intercept) 47.601741 1096
## demox_eiu -0.020805 1096
## income_per_person_W -0.000114 1096
## `invest_%_gdp` -0.018107 1096
## `tax_%_gdp` 0.001709 1096
##
## Multiple R-squared: 0.1865 , Adjusted R-squared: 0.1835
## F-statistic: 96.14 on 4 and 1096 DF, p-value: < 2.2e-16
#due to the violation of homoskedasticity according to the Breusch-Pagan test, the additional correction was conducted, however none of the results significantly changed
sqrt(summary(fit1)$r.squared)
## [1] 0.431803
#the regression model has shown that, the gini index of inequality can be predicted by the democracy index, GDP per capita (income), tax revenue as % of GDP and higher investments as % of GDP, in a way that the higher the values of these predictors are, the lower the gini index of inequality is in the country