#Source of the data: https://www.kaggle.com/datasets/psterk/income-inequality?select=combined_final_last_10_years.csv

library(psych)
library(car)
## Warning: package 'car' was built under R version 4.0.4
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.0.5
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rstatix)
## 
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
## 
##     filter
library(onewaytests)
## 
## Attaching package: 'onewaytests'
## The following object is masked from 'package:psych':
## 
##     describe
library(ggpubr)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
library(olsrr)
## Warning: package 'olsrr' was built under R version 4.0.5
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
library(pastecs)
## Warning: package 'pastecs' was built under R version 4.0.5
## 
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.0.4
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:onewaytests':
## 
##     describe
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following object is masked from 'package:psych':
## 
##     describe
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(estimatr)
library(readxl)
library(openxlsx)

setwd("C:/Users/Korisnik/Desktop/RMT R Homework")
data <- read.xlsx("Gini_Index.xlsx")

#creating dummy variables for continents
continent_dummy <- model.matrix(~ continent - 1, data = data)
#renaming dummy variables
colnames(continent_dummy) <- gsub("continent", "", colnames(continent_dummy))
#connecting the dataset and dummy variables
data <- cbind(data, continent_dummy)

head(data)
##   continent          country year demox_eiu income_per_person invest_%_gdp
## 1    Africa Congo, Dem. Rep. 2006      27.6               605         14.6
## 2    Africa Congo, Dem. Rep. 2007      25.2               623         13.7
## 3    Africa Congo, Dem. Rep. 2008      22.8               640         10.9
## 4    Africa Congo, Dem. Rep. 2009      22.1               637         14.6
## 5    Africa Congo, Dem. Rep. 2010      21.5               660         28.8
## 6    Africa      Congo, Rep. 2006      31.9              5000         21.6
##   tax_%_gdp gini_index Africa Americas Asia Europe Oceania
## 1      6.83       42.2      1        0    0      0       0
## 2      6.99       42.1      1        0    0      0       0
## 3      8.97       42.1      1        0    0      0       0
## 4      7.89       42.1      1        0    0      0       0
## 5      8.35       42.1      1        0    0      0       0
## 6      5.79       47.6      1        0    0      0       0
summary(data)
##   continent           country               year        demox_eiu    
##  Length:1234        Length:1234        Min.   :2006   Min.   :16.60  
##  Class :character   Class :character   1st Qu.:2008   1st Qu.:46.25  
##  Mode  :character   Mode  :character   Median :2011   Median :64.90  
##                                        Mean   :2011   Mean   :62.08  
##                                        3rd Qu.:2014   3rd Qu.:78.80  
##                                        Max.   :2016   Max.   :99.30  
##  income_per_person  invest_%_gdp     tax_%_gdp         gini_index   
##  Min.   :   605    Min.   : 0.00   Min.   : 0.0435   Min.   :24.40  
##  1st Qu.:  5935    1st Qu.:19.50   1st Qu.:12.3250   1st Qu.:32.10  
##  Median : 13700    Median :23.20   Median :15.9000   Median :36.00  
##  Mean   : 20520    Mean   :24.39   Mean   :16.7688   Mean   :38.04  
##  3rd Qu.: 31175    3rd Qu.:27.70   3rd Qu.:21.3000   3rd Qu.:42.58  
##  Max.   :120000    Max.   :67.90   Max.   :62.9000   Max.   :63.90  
##      Africa          Americas           Asia            Europe      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.2164   Mean   :0.1548   Mean   :0.2658   Mean   :0.3452  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     Oceania       
##  Min.   :0.00000  
##  1st Qu.:0.00000  
##  Median :0.00000  
##  Mean   :0.01783  
##  3rd Qu.:0.00000  
##  Max.   :1.00000
describe(data)
## data 
## 
##  13  Variables      1234  Observations
## --------------------------------------------------------------------------------
## continent 
##        n  missing distinct 
##     1234        0        5 
## 
## lowest : Africa   Americas Asia     Europe   Oceania 
## highest: Africa   Americas Asia     Europe   Oceania 
##                                                        
## Value        Africa Americas     Asia   Europe  Oceania
## Frequency       267      191      328      426       22
## Proportion    0.216    0.155    0.266    0.345    0.018
## --------------------------------------------------------------------------------
## country 
##        n  missing distinct 
##     1234        0      138 
## 
## lowest : Afghanistan Albania     Algeria     Angola      Argentina  
## highest: Uruguay     Uzbekistan  Vietnam     Zambia      Zimbabwe   
## --------------------------------------------------------------------------------
## year 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1234        0       11    0.992     2011    3.547     2006     2007 
##      .25      .50      .75      .90      .95 
##     2008     2011     2014     2015     2016 
## 
## lowest : 2006 2007 2008 2009 2010, highest: 2012 2013 2014 2015 2016
##                                                                             
## Value       2006  2007  2008  2009  2010  2011  2012  2013  2014  2015  2016
## Frequency    108   110   115   117   121   122   119   112   107   106    97
## Proportion 0.088 0.089 0.093 0.095 0.098 0.099 0.096 0.091 0.087 0.086 0.079
## --------------------------------------------------------------------------------
## demox_eiu 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1234        0      498        1    62.08    23.45    27.57    30.83 
##      .25      .50      .75      .90      .95 
##    46.25    64.90    78.80    88.47    91.90 
## 
## lowest : 16.6 17.2 17.4 17.5 17.7, highest: 97.3 97.4 98.0 98.8 99.3
## --------------------------------------------------------------------------------
## income_per_person 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1234        0      726        1    20520    20024     1386     2153 
##      .25      .50      .75      .90      .95 
##     5935    13700    31175    44170    56235 
## 
## lowest :    605    623    637    640    660, highest:  97900 113000 116000 117000 120000
## --------------------------------------------------------------------------------
## invest_%_gdp 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1234        0      319        1    24.39    8.336    13.90    16.23 
##      .25      .50      .75      .90      .95 
##    19.50    23.20    27.70    34.10    39.30 
## 
## lowest :  0.00  4.70  8.27  8.33  8.40, highest: 57.80 58.20 61.70 67.70 67.90
## --------------------------------------------------------------------------------
## tax_%_gdp 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1234        0      400        1    16.77    8.199    4.892    8.433 
##      .25      .50      .75      .90      .95 
##   12.325   15.900   21.300   25.700   27.400 
## 
## lowest :  0.0435  0.0577  0.3210  0.3560  0.3640
## highest: 49.3000 51.2000 58.4000 60.2000 62.9000
## --------------------------------------------------------------------------------
## gini_index 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1234        0      314        1    38.04    9.254    26.96    28.20 
##      .25      .50      .75      .90      .95 
##    32.10    36.00    42.58    49.20    54.23 
## 
## lowest : 24.4 24.5 24.7 24.8 24.9, highest: 63.2 63.3 63.4 63.6 63.9
## --------------------------------------------------------------------------------
## Africa 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##     1234        0        2    0.509      267   0.2164   0.3394 
## 
## --------------------------------------------------------------------------------
## Americas 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##     1234        0        2    0.392      191   0.1548   0.2619 
## 
## --------------------------------------------------------------------------------
## Asia 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##     1234        0        2    0.585      328   0.2658   0.3906 
## 
## --------------------------------------------------------------------------------
## Europe 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##     1234        0        2    0.678      426   0.3452   0.4525 
## 
## --------------------------------------------------------------------------------
## Oceania 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##     1234        0        2    0.053       22  0.01783  0.03505 
## 
## --------------------------------------------------------------------------------
data$continentFactor <- factor(data$continent,
                               levels = c("Africa", "Americas", "Asia", "Europe", "Oceania"),
                               labels = c("Africa", "Americas", "Asia", "Europe", "Oceania"))

#descriptive statistics:
#there are 1234 rows, 8 initial columns, with added 5 dummy variables for continents
#variables continent and country are string type, out of which continent was transformed into factor while country wasn't used - the rest of the variables are numeric
#the years span from 2006 to 2016
#looking at the closeness of the median and mean for each of the variables, we can conclude that the income_per_person is significantly skewed to the right - later on, we will winsorize this variable
#based on the means of the dummy variables, we can also see that the most countries are from Europe, while the least countries are from Oceania
#there are no missing values in the dataset

#RQ 1 : is there a statistical difference among continents when it comes to the democracy index?
#levene's test and shapiro test are both significant, so I did the Welch ANOVA and Kruskal-Wallis (since the assumptions are violated that much, Kruskal-Wallis is the better option)

describe(data$demox_eiu)
## data$demox_eiu 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1234        0      498        1    62.08    23.45    27.57    30.83 
##      .25      .50      .75      .90      .95 
##    46.25    64.90    78.80    88.47    91.90 
## 
## lowest : 16.6 17.2 17.4 17.5 17.7, highest: 97.3 97.4 98.0 98.8 99.3
describeBy(x = data$demox_eiu, group = data$continentFactor)
## 
##  Descriptive statistics by group 
## group: Africa
##    vars   n mean    sd median trimmed   mad  min  max range skew kurtosis  se
## X1    1 267 47.5 18.03   45.1   46.79 20.61 16.6 82.8  66.2 0.33    -0.98 1.1
## ------------------------------------------------------------ 
## group: Americas
##    vars   n mean   sd median trimmed mad  min  max range skew kurtosis   se
## X1    1 191 69.8 9.34   66.7   69.29 8.9 48.1 91.5  43.4 0.52    -0.47 0.68
## ------------------------------------------------------------ 
## group: Asia
##    vars   n  mean sd median trimmed   mad  min  max range skew kurtosis   se
## X1    1 328 49.48 18   48.6   49.04 24.02 17.2 82.5  65.3 0.12    -1.21 0.99
## ------------------------------------------------------------ 
## group: Europe
##    vars   n  mean    sd median trimmed   mad  min  max range  skew kurtosis
## X1    1 426 75.95 14.66     78   77.25 12.45 30.4 99.3  68.9 -0.91     0.89
##      se
## X1 0.71
## ------------------------------------------------------------ 
## group: Oceania
##    vars  n  mean   sd median trimmed  mad  min  max range  skew kurtosis   se
## X1    1 22 91.63 0.97  92.05   91.69 0.82 90.1 92.6   2.5 -0.44    -1.44 0.21
leveneTest(data$demox_eiu, group = data$continentFactor)
## Levene's Test for Homogeneity of Variance (center = median)
##         Df F value    Pr(>F)    
## group    4  45.389 < 2.2e-16 ***
##       1229                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
data %>%
  group_by(continentFactor) %>%
  shapiro_test(demox_eiu)
## # A tibble: 5 x 4
##   continentFactor variable  statistic        p
##   <fct>           <chr>         <dbl>    <dbl>
## 1 Africa          demox_eiu     0.944 1.50e- 8
## 2 Americas        demox_eiu     0.948 1.87e- 6
## 3 Asia            demox_eiu     0.951 4.80e- 9
## 4 Europe          demox_eiu     0.942 7.20e-12
## 5 Oceania         demox_eiu     0.837 2.05e- 3
welch.test(demox_eiu ~ continentFactor,
           data = data)
## 
##   Welch's Heteroscedastic F Test (alpha = 0.05) 
## ------------------------------------------------------------- 
##   data : demox_eiu and continentFactor 
## 
##   statistic  : 1018.486 
##   num df     : 4 
##   denom df   : 535.9722 
##   p.value    : 8.503868e-249 
## 
##   Result     : Difference is statistically significant. 
## -------------------------------------------------------------
pairwise.t.test(x = data$demox_eiu, g = data$continentFactor,
                p.adj = "bonf")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  data$demox_eiu and data$continentFactor 
## 
##          Africa  Americas Asia    Europe 
## Americas < 2e-16 -        -       -      
## Asia     1       < 2e-16  -       -      
## Europe   < 2e-16 6.9e-05  < 2e-16 -      
## Oceania  < 2e-16 7.5e-09  < 2e-16 4.9e-05
## 
## P value adjustment method: bonferroni
pwc <- data %>%
  pairwise_t_test(demox_eiu ~ continentFactor,
                  paired = FALSE,
                  p.adjust.method = "bonferroni")

ANOVA_results <- anova_test(demox_eiu ~ continentFactor,
                            data = data)

pwc <- pwc %>%
        add_y_position(fun = "median", step.increase = 0.35)

ggboxplot(data, x = "continentFactor", y = "demox_eiu", add = "point") +
  stat_pvalue_manual(pwc, hide.ns = FALSE) +
  stat_summary(fun = mean, geom = "point", shape = 16, size = 4,
               aes(group = continentFactor), color = "darkred",
               position = position_dodge(width = 0.8)) +
  stat_summary(fun = mean, colour = "darkred",
               position = position_dodge(0.8),
               geom = "text", vjust = -0.5, hjust = -1,
               aes(label = round(after_stat(y), digits = 2), group = continentFactor)) +
  labs(subtitle = get_test_label(ANOVA_results, detailed = TRUE),
       caption = get_pwc_label(pwc))

kruskal.test(demox_eiu ~ continentFactor,
             data = data)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  demox_eiu by continentFactor
## Kruskal-Wallis chi-squared = 530.9, df = 4, p-value < 2.2e-16
kruskal_effsize(demox_eiu ~ continentFactor,
                data = data)
## # A tibble: 1 x 5
##   .y.           n effsize method  magnitude
## * <chr>     <int>   <dbl> <chr>   <ord>    
## 1 demox_eiu  1234   0.429 eta2[H] large
groups_nonpar <- wilcox_test(demox_eiu ~ continentFactor,
                             paired = FALSE,
                             p.adjust.method = "bonferroni",
                             data = data)
groups_nonpar
## # A tibble: 10 x 9
##    .y.       group1  group2    n1    n2 statistic        p    p.adj p.adj.signif
##  * <chr>     <chr>   <chr>  <int> <int>     <dbl>    <dbl>    <dbl> <chr>       
##  1 demox_eiu Africa  Ameri~   267   191    7750   5.34e-37 5.34e-36 ****        
##  2 demox_eiu Africa  Asia     267   328   41226   2.19e- 1 1   e+ 0 ns          
##  3 demox_eiu Africa  Europe   267   426   13916.  5.83e-63 5.83e-62 ****        
##  4 demox_eiu Africa  Ocean~   267    22       0   6.48e-15 6.48e-14 ****        
##  5 demox_eiu Americ~ Asia     191   328   51190   1.78e-33 1.78e-32 ****        
##  6 demox_eiu Americ~ Europe   191   426   26398   2.99e-12 2.99e-11 ****        
##  7 demox_eiu Americ~ Ocean~   191    22      49.5 6.71e-14 6.71e-13 ****        
##  8 demox_eiu Asia    Europe   328   426   18348.  1.29e-67 1.29e-66 ****        
##  9 demox_eiu Asia    Ocean~   328    22       0   4.08e-15 4.08e-14 ****        
## 10 demox_eiu Europe  Ocean~   426    22    1208.  4.27e- 9 4.27e- 8 ****
pwc_2 <- data %>%
  wilcox_test(demox_eiu ~ continentFactor,
                  paired = FALSE,
                  p.adjust.method = "bonferroni")

Kruskal_results <- kruskal_test(demox_eiu ~ continentFactor,
                                data = data)

pwc_2 <- pwc_2 %>%
  add_y_position(fun = "median", step.increase = 0.35)

ggboxplot(data, x = "continentFactor", y = "demox_eiu", add = "point") +
  stat_pvalue_manual(pwc_2, hide.ns = FALSE) +
  stat_summary(fun = median, geom = "point", shape = 16, size = 4,
               aes(group = continentFactor), color = "darkred",
               position = position_dodge(width = 0.8)) +
  stat_summary(fun = median, colour = "darkred",
               position = position_dodge(0.8),
               geom = "text", vjust = -0.5, hjust = -1,
               aes(label = round(after_stat(y), digits = 2), group = continentFactor)) +
  labs(subtitle = get_test_label(Kruskal_results, detailed = TRUE),
       caption = get_pwc_label(pwc_2))

#interpreting the results of Kruskal-Wallis test, we found that the distribution of the democracy indexed differs significantly for at least one of the continents (chi=530.9, p<0.001), the effect size was large (eta2 = 0.429)
#post hoc tests have shown that there are significant differences (p<0.001) among all of the possible pairs, except for the Africa-Asia pair which wasn't significant

#RQ 2 : are the democracy index, GDP per capita, capital formation and tax revenue able to predict the gini index of countries?

data_numer <- data[ , c("demox_eiu", "income_per_person", "invest_%_gdp", "tax_%_gdp", "gini_index")]

scatterplotMatrix(data_numer,
                  smooth = FALSE)

#due to the distribution skewed to the right, we winsorized the income_per_person variable

quantile(data$income_per_person, 0.95)
##   95% 
## 56235
data$income_per_person_W <- ifelse(test = data$income_per_person > 56235,
                                   yes = 56235,
                                   no = data$income_per_person)

par(mfrow = c(1, 2))
hist(data$income_per_person,
     main = "Income per person")
hist(data$income_per_person_W,
     main = "Income per person (W)")

fit <- lm(gini_index ~ demox_eiu + income_per_person_W + `invest_%_gdp` + `tax_%_gdp`,
          data = data)

vif(fit)
##           demox_eiu income_per_person_W      `invest_%_gdp`         `tax_%_gdp` 
##            1.673661            1.422970            1.017604            1.199460
#vif values are not noticeably high, indicating that there is no multicolinearity

data$StdResid <- round(rstandard(fit), 3)
data$CooksD <- round(cooks.distance(fit), 3)

hist(data$StdResid,
     xlab = "Standardized residuals",
     ylab = "Frequency",
     main = "Histogram of standardized residuals")

shapiro.test(data$StdResid)
## 
##  Shapiro-Wilk normality test
## 
## data:  data$StdResid
## W = 0.957, p-value < 2.2e-16
#shapiro test of standard residuals is significant (p<0.001) meaning that we have some extreme values that we need to clean

hist(data$CooksD,
     xlab = "Cook's distances",
     ylab = "Frequency",
     main = "Histogram of Cook's distances")

head(data[order(data$StdResid), c("country", "StdResid")], 5)
##      country StdResid
## 1197 Ukraine   -2.027
## 1200 Ukraine   -2.013
## 1198 Ukraine   -2.002
## 1199 Ukraine   -1.994
## 1201 Ukraine   -1.992
head(data[order(-data$StdResid), c("country", "StdResid")], 5)
##          country StdResid
## 220 South Africa    3.138
## 219 South Africa    3.128
## 221 South Africa    3.105
## 222 South Africa    3.071
## 223 South Africa    3.063
head(data[order(-data$CooksD), c("country", "CooksD")], 5)
##      country CooksD
## 436 Suriname  0.026
## 435 Suriname  0.020
## 121  Lesotho  0.016
## 433 Suriname  0.012
## 432 Suriname  0.011
#identify which rows are outliers based on their standard residuals being above 2.5 or below -2.5
outliers <- which(data$StdResid > 2.5 | data$StdResid < -2.5)
print(outliers)
##  [1]  38  39  40  41  42  43  44  45  46  47  48 181 182 183 184 185 186 187 188
## [20] 189 190 191 219 220 221 222 223 224 225 226 227 228 229 430 431 432 433 434
## [39] 435 436
#identify which rows are influential based on the 4/N rule for Cook's distance
threshold <- 4/1234
influential_obs <- which(data$CooksD > threshold)
print(influential_obs)
##  [1]   13   14   15   16   17   18   38   39   40   41   42   43   44   45   46
## [16]   47   68   69   70   71  103  104  105  119  120  121  122  123  124  125
## [31]  126  127  128  177  178  181  182  183  184  185  186  187  188  189  190
## [46]  191  219  220  221  222  223  224  225  226  227  228  229  230  231  232
## [61]  236  262  263  431  432  433  434  435  436  595  596  597  598  599  600
## [76]  601  602  603  604  692  693  694  695  696  716  767  768  769  770  771
## [91]  772 1022 1023
#remove the rows that are outliers and/or influential
data <- data[-outliers, ]
data <- data[-influential_obs, ]

summary(data)
##   continent           country               year        demox_eiu    
##  Length:1101        Length:1101        Min.   :2006   Min.   :16.60  
##  Class :character   Class :character   1st Qu.:2008   1st Qu.:47.20  
##  Mode  :character   Mode  :character   Median :2011   Median :65.90  
##                                        Mean   :2011   Mean   :62.77  
##                                        3rd Qu.:2014   3rd Qu.:79.60  
##                                        Max.   :2016   Max.   :99.30  
##  income_per_person  invest_%_gdp     tax_%_gdp         gini_index   
##  Min.   :   605    Min.   : 0.00   Min.   : 0.0435   Min.   :24.40  
##  1st Qu.:  6310    1st Qu.:19.40   1st Qu.:12.1000   1st Qu.:32.00  
##  Median : 15700    Median :23.10   Median :15.7000   Median :35.60  
##  Mean   : 21792    Mean   :24.04   Mean   :16.4541   Mean   :37.14  
##  3rd Qu.: 33400    3rd Qu.:27.20   3rd Qu.:21.2000   3rd Qu.:42.00  
##  Max.   :120000    Max.   :67.90   Max.   :62.9000   Max.   :57.30  
##      Africa         Americas           Asia            Europe      
##  Min.   :0.000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.158   Mean   :0.1644   Mean   :0.2779   Mean   :0.3797  
##  3rd Qu.:0.000   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     Oceania        continentFactor income_per_person_W    StdResid       
##  Min.   :0.00000   Africa  :174    Min.   :  605       Min.   :-2.02700  
##  1st Qu.:0.00000   Americas:181    1st Qu.: 6310       1st Qu.:-0.75900  
##  Median :0.00000   Asia    :306    Median :15700       Median :-0.19800  
##  Mean   :0.01998   Europe  :418    Mean   :20621       Mean   :-0.08729  
##  3rd Qu.:0.00000   Oceania : 22    3rd Qu.:33400       3rd Qu.: 0.48200  
##  Max.   :1.00000                   Max.   :56235       Max.   : 2.08600  
##      CooksD         
##  Min.   :0.0000000  
##  1st Qu.:0.0000000  
##  Median :0.0000000  
##  Mean   :0.0005867  
##  3rd Qu.:0.0010000  
##  Max.   :0.0160000
fit <- lm(gini_index ~ demox_eiu + income_per_person_W + `invest_%_gdp` + `tax_%_gdp`,
          data = data)

data$StdFitted <- scale(fit$fitted.values)

scatterplot(StdResid ~ StdFitted,
            ylab = "Standardized residuals",
            xlab = "Standardized fitted values",
            smooth = FALSE,
            regLine = FALSE,
            boxplots = FALSE,
            data = data)

#scatter plot between the standardized fitted values and standardized residuals indicates that the homoskedasticity assumption is not violated

ols_test_breusch_pagan(fit)
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##                  Data                  
##  --------------------------------------
##  Response : gini_index 
##  Variables: fitted values of gini_index 
## 
##          Test Summary           
##  -------------------------------
##  DF            =    1 
##  Chi2          =    52.66446 
##  Prob > Chi2   =    3.956867e-13
#Breusch-Pagan test is significant (p<0.001) meaning however that there might be some issues with the heteroskedasticity when it comes to the variance of residuals

round(stat.desc(data[c("gini_index", "demox_eiu", "income_per_person_W", "invest_%_gdp", "tax_%_gdp")], basic = FALSE), 2)
##              gini_index demox_eiu income_per_person_W invest_%_gdp tax_%_gdp
## median            35.60     65.90            15700.00        23.10     15.70
## mean              37.14     62.77            20620.63        24.04     16.45
## SE.mean            0.22      0.63              500.03         0.24      0.23
## CI.mean.0.95       0.43      1.24              981.12         0.46      0.45
## var               53.27    437.31        275280248.34        61.00     57.67
## std.dev            7.30     20.91            16591.57         7.81      7.59
## coef.var           0.20      0.33                0.80         0.32      0.46
#coefficient of variance indicates that the income_per_person_W has the highest variance by far, which is to be expected given the range and the differences in possible incomes

rcorr(as.matrix(data[c("gini_index", "demox_eiu", "income_per_person_W", "invest_%_gdp", "tax_%_gdp")],
                type = "pearson"))
##                     gini_index demox_eiu income_per_person_W invest_%_gdp
## gini_index                1.00     -0.32               -0.40        -0.04
## demox_eiu                -0.32      1.00                0.53        -0.12
## income_per_person_W      -0.40      0.53                1.00        -0.06
## invest_%_gdp             -0.04     -0.12               -0.06         1.00
## tax_%_gdp                -0.18      0.43                0.16        -0.04
##                     tax_%_gdp
## gini_index              -0.18
## demox_eiu                0.43
## income_per_person_W      0.16
## invest_%_gdp            -0.04
## tax_%_gdp                1.00
## 
## n= 1101 
## 
## 
## P
##                     gini_index demox_eiu income_per_person_W invest_%_gdp
## gini_index                     0.0000    0.0000              0.2364      
## demox_eiu           0.0000               0.0000              0.0000      
## income_per_person_W 0.0000     0.0000                        0.0454      
## invest_%_gdp        0.2364     0.0000    0.0454                          
## tax_%_gdp           0.0000     0.0000    0.0000              0.2367      
##                     tax_%_gdp
## gini_index          0.0000   
## demox_eiu           0.0000   
## income_per_person_W 0.0000   
## invest_%_gdp        0.2367   
## tax_%_gdp
#correlation analysis shows that the dependent variable gini_index has significant correlation (p<0.001) with each predictor except for the invest_%_gdp
#additionally, demox_eiu predictor indicating democracy level in the country is significantly correlated with every other predictor (p<0.001), and tax_%_gdp predictor is correlated additionally with income_per_person_W

summary(fit)
## 
## Call:
## lm(formula = gini_index ~ demox_eiu + income_per_person_W + `invest_%_gdp` + 
##     `tax_%_gdp`, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.1538  -4.7270  -0.5216   4.0412  17.9605 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          4.569e+01  9.524e-01  47.970  < 2e-16 ***
## demox_eiu           -4.508e-02  1.238e-02  -3.641 0.000284 ***
## income_per_person_W -1.422e-04  1.423e-05  -9.990  < 2e-16 ***
## `invest_%_gdp`      -6.821e-02  2.564e-02  -2.660 0.007930 ** 
## `tax_%_gdp`         -6.960e-02  2.906e-02  -2.395 0.016772 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.595 on 1096 degrees of freedom
## Multiple R-squared:  0.1865, Adjusted R-squared:  0.1835 
## F-statistic:  62.8 on 4 and 1096 DF,  p-value: < 2.2e-16
#results of the regression analysis show that each of the predictors has significant predictive value for the gini_index (p<0.01), and all of them are negatively oriented
#the model overall explains 18.65% variability of the dependent variable (R2 coefficient of determination) while the correlation coefficient R is 0.4318, and significant at p<0.001

fit1 <- lm_robust(gini_index ~ demox_eiu + income_per_person_W + `invest_%_gdp` + `tax_%_gdp`,
                  se_type = "HC1",
                  data = data)

summary(fit1)
## 
## Call:
## lm_robust(formula = gini_index ~ demox_eiu + income_per_person_W + 
##     `invest_%_gdp` + `tax_%_gdp`, data = data, se_type = "HC1")
## 
## Standard error type:  HC1 
## 
## Coefficients:
##                       Estimate Std. Error t value   Pr(>|t|)   CI Lower
## (Intercept)         45.6867094  9.760e-01  46.810 1.160e-263 43.7716779
## demox_eiu           -0.0450753  1.237e-02  -3.644  2.810e-04 -0.0693461
## income_per_person_W -0.0001422  1.436e-05  -9.906  3.262e-22 -0.0001704
## `invest_%_gdp`      -0.0682079  2.553e-02  -2.671  7.668e-03 -0.1183087
## `tax_%_gdp`         -0.0696012  3.634e-02  -1.915  5.574e-02 -0.1409111
##                      CI Upper   DF
## (Intercept)         47.601741 1096
## demox_eiu           -0.020805 1096
## income_per_person_W -0.000114 1096
## `invest_%_gdp`      -0.018107 1096
## `tax_%_gdp`          0.001709 1096
## 
## Multiple R-squared:  0.1865 ,    Adjusted R-squared:  0.1835 
## F-statistic: 96.14 on 4 and 1096 DF,  p-value: < 2.2e-16
#due to the violation of homoskedasticity according to the Breusch-Pagan test, the additional correction was conducted, however none of the results significantly changed

sqrt(summary(fit1)$r.squared)
## [1] 0.431803
#the regression model has shown that, the gini index of inequality can be predicted by the democracy index, GDP per capita (income), tax revenue as % of GDP and higher investments as % of GDP, in a way that the higher the values of these predictors are, the lower the gini index of inequality is in the country