Linkedin LinkedIn   GitHub GitHub

Exploratory Data Analysis and Regression Modeling in R

This code performs exploratory data analysis (EDA) and builds multiple regression models using various statistical packages in R. It includes tasks such as data pre processing, checking for missing values, identifying outliers, visualizing data through box plots and bar plots, calculating summary statistics, conducting correlation analysis, and performing multiple regression.

Data preparation/exploration

Libraries

library(dplyr)
library(Hmisc)
library(lmtest)
library(ggplot2)
library(car)
library(psych)
library(gridExtra)
library(corrplot)

Importing data

data <- read.csv("C:/Users/Adebolu/Downloads/qog_std_ts_jan23.csv")

Data cleaning

Attaching only the variables we need by Subsetting the data

data2 <- subset(data, select = c(cname, year,wbgi_gee, wdi_gdpgr, wjp_overall,ti_cpi))

View the subsetted data

head(data2)
##         cname year wbgi_gee wdi_gdpgr wjp_overall ti_cpi
## 1 Afghanistan 1946       NA        NA          NA     NA
## 2 Afghanistan 1947       NA        NA          NA     NA
## 3 Afghanistan 1948       NA        NA          NA     NA
## 4 Afghanistan 1949       NA        NA          NA     NA
## 5 Afghanistan 1950       NA        NA          NA     NA
## 6 Afghanistan 1951       NA        NA          NA     NA

Removing years less than 2012

data2 <- subset(data2, year >= 2012)

Missing values

##       cname        year    wbgi_gee   wdi_gdpgr wjp_overall      ti_cpi 
##           0           0         258         303        1133         251

Excluding Missing values

cleaned_data <- na.omit(data2)

Descriptive Statistics

Summary statistics

summary(cleaned_data) 
##     cname                year         wbgi_gee          wdi_gdpgr      
##  Length:885         Min.   :2012   Min.   :-2.18749   Min.   :-24.365  
##  Class :character   1st Qu.:2015   1st Qu.:-0.58619   1st Qu.:  1.000  
##  Mode  :character   Median :2017   Median :-0.03497   Median :  2.977  
##                     Mean   :2017   Mean   : 0.11373   Mean   :  2.479  
##                     3rd Qu.:2020   3rd Qu.: 0.67732   3rd Qu.:  4.920  
##                     Max.   :2021   Max.   : 2.32486   Max.   : 43.480  
##   wjp_overall         ti_cpi     
##  Min.   :0.3105   Min.   :11.00  
##  1st Qu.:0.4633   1st Qu.:31.00  
##  Median :0.5214   Median :39.00  
##  Mean   :0.5634   Mean   :45.59  
##  3rd Qu.:0.6544   3rd Qu.:58.00  
##  Max.   :0.8995   Max.   :92.00

Identification of outliers

Select numeric variables

numeric_vars <- cleaned_data[, sapply(cleaned_data, is.numeric)]

Create boxplot

boxplot(numeric_vars, main = "Boxplot of  Variables", xlab = "Variables")

Boxplot for growth rate using ggplot

ggplot(numeric_vars, aes(y = wdi_gdpgr)) +
  geom_boxplot(fill = "#1F78B4", color = "#7570B3", outlier.color = "#E7298A") +
  labs(title = "Boxplot for Growth Rate", y = "Growth Rate") +
  theme_minimal()

EDA

Corruption perception

average_corruption <- cleaned_data %>%
  group_by(cname) %>%
  summarise(average_corruption = mean(ti_cpi))%>%
  arrange(desc(average_corruption))

Most and Least corrupt country

top5a <- head(average_corruption, 5)
plot1 <- ggplot(top5a, aes(x = reorder(cname, -average_corruption), y = average_corruption, fill = average_corruption)) +
  geom_bar(stat = "identity",width = 0.5) +
  scale_fill_gradient(low = "#E6F5E6", high = "#006600")+
  labs(title = "Least Corrupt", x = "Country", y = "Average Corruption") +
  theme_minimal()+
  theme(legend.position = "none", 
        axis.text.x = element_text(angle = 90, hjust = 1))

last5a <- tail(average_corruption, 5)
plot2 <- ggplot(last5a, aes(x = reorder(cname, -average_corruption), y = average_corruption, fill = average_corruption)) +
  geom_bar(stat = "identity",width = 0.5) +
  scale_fill_gradient(low = "#FFEEEE", high = "#AA0000")+
  labs(title = "Most Corrupt", x = "Country", y = "Average Corruption")+
  theme_minimal()+
  theme(legend.position = "none", 
        axis.text.x = element_text(angle = 90, hjust = 1))

grid.arrange(plot1, plot2, ncol = 2)

Government effectiveness

average_effectiveness <- cleaned_data %>%
  group_by(cname) %>%
  summarise(average_effectiveness = mean(wbgi_gee))%>%
  arrange(desc(average_effectiveness))

Most and Least Effective government (country)

top5b <- head(average_effectiveness, 5)
last5b <- tail(average_effectiveness, 5)
plot1 <- ggplot(top5b, aes(x = reorder(cname, -average_effectiveness), y = average_effectiveness, fill = average_effectiveness)) +
  geom_bar(stat = "identity",width = 0.5) +
  scale_fill_gradient(low = "#E6F5FF", high = "#0000CC")+
  labs(title = "Top 5 Effective govt", x = "Country", y = "Average Effectiveness") +
  theme_minimal()+
  theme(legend.position = "none", 
        axis.text.x = element_text(angle = 90, hjust = 1))
        
plot2 <- ggplot(last5b, aes(x = reorder(cname, average_effectiveness), y = average_effectiveness, fill = average_effectiveness)) +
  geom_bar(stat = "identity",width = 0.5) +
  scale_fill_gradient(low = "#FFEEEE", high = "#AA0000")+
  labs(title = "Least 5 Effective govt", x = "Country", y = "Average Effectiveness")+
  theme_minimal()+
  theme(legend.position = "none", 
        axis.text.x = element_text(angle = 90, hjust = 1))

grid.arrange(plot1, plot2, ncol = 2)

Rule of law

average_ruled <- cleaned_data %>%
  group_by(cname) %>%
  summarise(average_ruled = mean(wjp_overall))%>%
  arrange(desc(average_ruled))

Best and worst rule adhered country

top5c <- head(average_ruled, 5)
last5c <- tail(average_ruled, 5)
plot1 <- ggplot(top5c, aes(x = reorder(cname, -average_ruled), y = average_ruled, fill = average_ruled)) +
  geom_bar(stat = "identity",width = 0.5) +
  scale_fill_gradient(low = "#FFFFE6", high = "#CCCC00")+
  labs(title = "Top 5 Rule Adhered countries", x = "Country", y = "Average Adherence") +
  theme_minimal()+
  theme(legend.position = "none", 
        axis.text.x = element_text(angle = 90, hjust = 1))

plot2 <- ggplot(last5c, aes(x = reorder(cname, -average_ruled), y = average_ruled, fill = average_ruled)) +
  geom_bar(stat = "identity",width = 0.5) +
  scale_fill_gradient(low = "#FFEEEE", high = "#AA0000")+
  labs(title = "Least 5 Rule Adhered countries", x = "Country", y = "Average Adherence")+
  theme_minimal()+
  theme(legend.position = "none", 
        axis.text.x = element_text(angle = 90, hjust = 1))
grid.arrange(plot1, plot2, ncol = 2)

GDP growth

average_gdp <- cleaned_data %>%
  group_by(cname) %>%
  summarise(average_gdp = mean(wdi_gdpgr))%>%
  arrange(desc(average_gdp))

Highest and lowest GDP growth per country

top5d <- head(average_gdp, 5)
last5d <- tail(average_gdp, 5)

plot1 <- ggplot(top5d, aes(x = reorder(cname, -average_gdp), y = average_gdp, fill = average_gdp)) +
  geom_bar(stat = "identity",width = 0.5) +
  scale_fill_gradient(low = "#FFE6CC", high = "#CC6600")+
  labs(title = "Top GDP growth", x = "Country", y = "Average GDP growth") +
  theme_minimal()+
  theme(legend.position = "none", 
        axis.text.x = element_text(angle = 90, hjust = 1))

plot2 <- ggplot(last5d, aes(x = reorder(cname, -average_gdp), y = average_gdp, fill = average_gdp)) +
  geom_bar(stat = "identity",width = 0.5) +
  scale_fill_gradient(low = "#FFEEEE", high = "#AA0000")+
  labs(title = "Least GDP growth", x = "Country", y = "Average GDP growth")+
  theme_minimal()+
  theme(legend.position = "none", 
        axis.text.x = element_text(angle = 90, hjust = 1))
grid.arrange(plot1, plot2, ncol = 2)

Inferential statistics

CORRELATION

correlation_matrix1 <- cor(cleaned_data[, c("wjp_overall", "ti_cpi", "wbgi_gee")])

Correlation matrix plot

corrplot(correlation_matrix1, method = "number")

MULTIPLE REGRESSION for model 1

multiple_regression_model <- lm(wdi_gdpgr ~ ti_cpi + wbgi_gee + wjp_overall, data = cleaned_data)
summary(multiple_regression_model)
## 
## Call:
## lm(formula = wdi_gdpgr ~ ti_cpi + wbgi_gee + wjp_overall, data = cleaned_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -26.544  -1.392   0.693   2.281  40.862 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.66619    1.61957   5.351 1.12e-07 ***
## ti_cpi        0.02138    0.03259   0.656  0.51203    
## wbgi_gee      1.01504    0.49667   2.044  0.04128 *  
## wjp_overall -12.91696    4.17213  -3.096  0.00202 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.938 on 881 degrees of freedom
## Multiple R-squared:  0.02132,    Adjusted R-squared:  0.01799 
## F-statistic: 6.398 on 3 and 881 DF,  p-value: 0.0002734

calculate the VIF for each predictor variable in the model

vif(multiple_regression_model)
##      ti_cpi    wbgi_gee wjp_overall 
##   13.632512    7.235112   12.238973

The high VIF shows that there’s presence of multicorrelation in the model. Therefore, another model is used by removing on of the independent variable

MULTIPLE REGRESSION for the model 2

multiple_regression_model2 <- lm(wdi_gdpgr ~ wjp_overall + wbgi_gee, data = cleaned_data)
summary(multiple_regression_model2)
## 
## Call:
## lm(formula = wdi_gdpgr ~ wjp_overall + wbgi_gee, data = cleaned_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -26.457  -1.418   0.670   2.293  40.954 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.5228     1.6042   5.313 1.37e-07 ***
## wjp_overall -10.9598     2.9152  -3.760 0.000181 ***
## wbgi_gee      1.1508     0.4514   2.550 0.010955 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.936 on 882 degrees of freedom
## Multiple R-squared:  0.02085,    Adjusted R-squared:  0.01863 
## F-statistic: 9.389 on 2 and 882 DF,  p-value: 9.234e-05

Assumptions

Linearity of the data

plot(multiple_regression_model2, 1)

Normality of residual

model_residuals=multiple_regression_model2$residuals
### Create the QQ plot
qqnorm(model_residuals, col = "#1F78B4", pch = 16)
qqline(model_residuals, col = "#E31A1C")

shapiro.test(resid(multiple_regression_model2))
## 
##  Shapiro-Wilk normality test
## 
## data:  resid(multiple_regression_model2)
## W = 0.86119, p-value < 2.2e-16

Homoscedacity

ncvTest(multiple_regression_model2)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.3534133, Df = 1, p = 0.55219
#Using the  Breusch–Pagan Test to Check Heteroscedasticity, using the bptest function from the lmtest package,
bptest(multiple_regression_model)
## 
##  studentized Breusch-Pagan test
## 
## data:  multiple_regression_model
## BP = 9.9049, df = 3, p-value = 0.01939

Autocorrelation (Independence assumption)

dwtest(multiple_regression_model2)
## 
##  Durbin-Watson test
## 
## data:  multiple_regression_model2
## DW = 1.9378, p-value = 0.1632
## alternative hypothesis: true autocorrelation is greater than 0

Multi-collinearity

vif(multiple_regression_model2)
## wjp_overall    wbgi_gee 
##    5.979325    5.979325