Exploratory Data Analysis and Regression Modeling in R
This code performs exploratory data analysis (EDA) and builds multiple regression models using various statistical packages in R. It includes tasks such as data pre processing, checking for missing values, identifying outliers, visualizing data through box plots and bar plots, calculating summary statistics, conducting correlation analysis, and performing multiple regression.
Libraries
library(dplyr)
library(Hmisc)
library(lmtest)
library(ggplot2)
library(car)
library(psych)
library(gridExtra)
library(corrplot)
data <- read.csv("C:/Users/Adebolu/Downloads/qog_std_ts_jan23.csv")
Attaching only the variables we need by Subsetting the data
data2 <- subset(data, select = c(cname, year,wbgi_gee, wdi_gdpgr, wjp_overall,ti_cpi))
View the subsetted data
head(data2)
## cname year wbgi_gee wdi_gdpgr wjp_overall ti_cpi
## 1 Afghanistan 1946 NA NA NA NA
## 2 Afghanistan 1947 NA NA NA NA
## 3 Afghanistan 1948 NA NA NA NA
## 4 Afghanistan 1949 NA NA NA NA
## 5 Afghanistan 1950 NA NA NA NA
## 6 Afghanistan 1951 NA NA NA NA
Removing years less than 2012
data2 <- subset(data2, year >= 2012)
Missing values
## cname year wbgi_gee wdi_gdpgr wjp_overall ti_cpi
## 0 0 258 303 1133 251
Excluding Missing values
cleaned_data <- na.omit(data2)
summary(cleaned_data)
## cname year wbgi_gee wdi_gdpgr
## Length:885 Min. :2012 Min. :-2.18749 Min. :-24.365
## Class :character 1st Qu.:2015 1st Qu.:-0.58619 1st Qu.: 1.000
## Mode :character Median :2017 Median :-0.03497 Median : 2.977
## Mean :2017 Mean : 0.11373 Mean : 2.479
## 3rd Qu.:2020 3rd Qu.: 0.67732 3rd Qu.: 4.920
## Max. :2021 Max. : 2.32486 Max. : 43.480
## wjp_overall ti_cpi
## Min. :0.3105 Min. :11.00
## 1st Qu.:0.4633 1st Qu.:31.00
## Median :0.5214 Median :39.00
## Mean :0.5634 Mean :45.59
## 3rd Qu.:0.6544 3rd Qu.:58.00
## Max. :0.8995 Max. :92.00
Select numeric variables
numeric_vars <- cleaned_data[, sapply(cleaned_data, is.numeric)]
Create boxplot
boxplot(numeric_vars, main = "Boxplot of Variables", xlab = "Variables")
Boxplot for growth rate using ggplot
ggplot(numeric_vars, aes(y = wdi_gdpgr)) +
geom_boxplot(fill = "#1F78B4", color = "#7570B3", outlier.color = "#E7298A") +
labs(title = "Boxplot for Growth Rate", y = "Growth Rate") +
theme_minimal()
average_corruption <- cleaned_data %>%
group_by(cname) %>%
summarise(average_corruption = mean(ti_cpi))%>%
arrange(desc(average_corruption))
Most and Least corrupt country
top5a <- head(average_corruption, 5)
plot1 <- ggplot(top5a, aes(x = reorder(cname, -average_corruption), y = average_corruption, fill = average_corruption)) +
geom_bar(stat = "identity",width = 0.5) +
scale_fill_gradient(low = "#E6F5E6", high = "#006600")+
labs(title = "Least Corrupt", x = "Country", y = "Average Corruption") +
theme_minimal()+
theme(legend.position = "none",
axis.text.x = element_text(angle = 90, hjust = 1))
last5a <- tail(average_corruption, 5)
plot2 <- ggplot(last5a, aes(x = reorder(cname, -average_corruption), y = average_corruption, fill = average_corruption)) +
geom_bar(stat = "identity",width = 0.5) +
scale_fill_gradient(low = "#FFEEEE", high = "#AA0000")+
labs(title = "Most Corrupt", x = "Country", y = "Average Corruption")+
theme_minimal()+
theme(legend.position = "none",
axis.text.x = element_text(angle = 90, hjust = 1))
grid.arrange(plot1, plot2, ncol = 2)
average_effectiveness <- cleaned_data %>%
group_by(cname) %>%
summarise(average_effectiveness = mean(wbgi_gee))%>%
arrange(desc(average_effectiveness))
Most and Least Effective government (country)
top5b <- head(average_effectiveness, 5)
last5b <- tail(average_effectiveness, 5)
plot1 <- ggplot(top5b, aes(x = reorder(cname, -average_effectiveness), y = average_effectiveness, fill = average_effectiveness)) +
geom_bar(stat = "identity",width = 0.5) +
scale_fill_gradient(low = "#E6F5FF", high = "#0000CC")+
labs(title = "Top 5 Effective govt", x = "Country", y = "Average Effectiveness") +
theme_minimal()+
theme(legend.position = "none",
axis.text.x = element_text(angle = 90, hjust = 1))
plot2 <- ggplot(last5b, aes(x = reorder(cname, average_effectiveness), y = average_effectiveness, fill = average_effectiveness)) +
geom_bar(stat = "identity",width = 0.5) +
scale_fill_gradient(low = "#FFEEEE", high = "#AA0000")+
labs(title = "Least 5 Effective govt", x = "Country", y = "Average Effectiveness")+
theme_minimal()+
theme(legend.position = "none",
axis.text.x = element_text(angle = 90, hjust = 1))
grid.arrange(plot1, plot2, ncol = 2)
average_ruled <- cleaned_data %>%
group_by(cname) %>%
summarise(average_ruled = mean(wjp_overall))%>%
arrange(desc(average_ruled))
Best and worst rule adhered country
top5c <- head(average_ruled, 5)
last5c <- tail(average_ruled, 5)
plot1 <- ggplot(top5c, aes(x = reorder(cname, -average_ruled), y = average_ruled, fill = average_ruled)) +
geom_bar(stat = "identity",width = 0.5) +
scale_fill_gradient(low = "#FFFFE6", high = "#CCCC00")+
labs(title = "Top 5 Rule Adhered countries", x = "Country", y = "Average Adherence") +
theme_minimal()+
theme(legend.position = "none",
axis.text.x = element_text(angle = 90, hjust = 1))
plot2 <- ggplot(last5c, aes(x = reorder(cname, -average_ruled), y = average_ruled, fill = average_ruled)) +
geom_bar(stat = "identity",width = 0.5) +
scale_fill_gradient(low = "#FFEEEE", high = "#AA0000")+
labs(title = "Least 5 Rule Adhered countries", x = "Country", y = "Average Adherence")+
theme_minimal()+
theme(legend.position = "none",
axis.text.x = element_text(angle = 90, hjust = 1))
grid.arrange(plot1, plot2, ncol = 2)
average_gdp <- cleaned_data %>%
group_by(cname) %>%
summarise(average_gdp = mean(wdi_gdpgr))%>%
arrange(desc(average_gdp))
Highest and lowest GDP growth per country
top5d <- head(average_gdp, 5)
last5d <- tail(average_gdp, 5)
plot1 <- ggplot(top5d, aes(x = reorder(cname, -average_gdp), y = average_gdp, fill = average_gdp)) +
geom_bar(stat = "identity",width = 0.5) +
scale_fill_gradient(low = "#FFE6CC", high = "#CC6600")+
labs(title = "Top GDP growth", x = "Country", y = "Average GDP growth") +
theme_minimal()+
theme(legend.position = "none",
axis.text.x = element_text(angle = 90, hjust = 1))
plot2 <- ggplot(last5d, aes(x = reorder(cname, -average_gdp), y = average_gdp, fill = average_gdp)) +
geom_bar(stat = "identity",width = 0.5) +
scale_fill_gradient(low = "#FFEEEE", high = "#AA0000")+
labs(title = "Least GDP growth", x = "Country", y = "Average GDP growth")+
theme_minimal()+
theme(legend.position = "none",
axis.text.x = element_text(angle = 90, hjust = 1))
grid.arrange(plot1, plot2, ncol = 2)
correlation_matrix1 <- cor(cleaned_data[, c("wjp_overall", "ti_cpi", "wbgi_gee")])
Correlation matrix plot
corrplot(correlation_matrix1, method = "number")
multiple_regression_model <- lm(wdi_gdpgr ~ ti_cpi + wbgi_gee + wjp_overall, data = cleaned_data)
summary(multiple_regression_model)
##
## Call:
## lm(formula = wdi_gdpgr ~ ti_cpi + wbgi_gee + wjp_overall, data = cleaned_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.544 -1.392 0.693 2.281 40.862
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.66619 1.61957 5.351 1.12e-07 ***
## ti_cpi 0.02138 0.03259 0.656 0.51203
## wbgi_gee 1.01504 0.49667 2.044 0.04128 *
## wjp_overall -12.91696 4.17213 -3.096 0.00202 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.938 on 881 degrees of freedom
## Multiple R-squared: 0.02132, Adjusted R-squared: 0.01799
## F-statistic: 6.398 on 3 and 881 DF, p-value: 0.0002734
calculate the VIF for each predictor variable in the model
vif(multiple_regression_model)
## ti_cpi wbgi_gee wjp_overall
## 13.632512 7.235112 12.238973
The high VIF shows that there’s presence of multicorrelation in the model. Therefore, another model is used by removing on of the independent variable
multiple_regression_model2 <- lm(wdi_gdpgr ~ wjp_overall + wbgi_gee, data = cleaned_data)
summary(multiple_regression_model2)
##
## Call:
## lm(formula = wdi_gdpgr ~ wjp_overall + wbgi_gee, data = cleaned_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.457 -1.418 0.670 2.293 40.954
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.5228 1.6042 5.313 1.37e-07 ***
## wjp_overall -10.9598 2.9152 -3.760 0.000181 ***
## wbgi_gee 1.1508 0.4514 2.550 0.010955 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.936 on 882 degrees of freedom
## Multiple R-squared: 0.02085, Adjusted R-squared: 0.01863
## F-statistic: 9.389 on 2 and 882 DF, p-value: 9.234e-05
Linearity of the data
plot(multiple_regression_model2, 1)
Normality of residual
model_residuals=multiple_regression_model2$residuals
### Create the QQ plot
qqnorm(model_residuals, col = "#1F78B4", pch = 16)
qqline(model_residuals, col = "#E31A1C")
shapiro.test(resid(multiple_regression_model2))
##
## Shapiro-Wilk normality test
##
## data: resid(multiple_regression_model2)
## W = 0.86119, p-value < 2.2e-16
Homoscedacity
ncvTest(multiple_regression_model2)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.3534133, Df = 1, p = 0.55219
#Using the Breusch–Pagan Test to Check Heteroscedasticity, using the bptest function from the lmtest package,
bptest(multiple_regression_model)
##
## studentized Breusch-Pagan test
##
## data: multiple_regression_model
## BP = 9.9049, df = 3, p-value = 0.01939
Autocorrelation (Independence assumption)
dwtest(multiple_regression_model2)
##
## Durbin-Watson test
##
## data: multiple_regression_model2
## DW = 1.9378, p-value = 0.1632
## alternative hypothesis: true autocorrelation is greater than 0
Multi-collinearity
vif(multiple_regression_model2)
## wjp_overall wbgi_gee
## 5.979325 5.979325