First, we load the World Value Survey data collected in Tunisia in 2019 from a Stata file and inspect its structure.
# Read the Stata file
df2019 <- read_dta("~/Downloads/WVS_Wave_7_Tunisia_Stata_v5.0.dta")
We then select the columns of interest for the analysis.
columns_of_interest <- c("Q107",
"Q114",
"J_INTDATE",
"Q260",
"Q262",
"Q245",
"Q235",
"Q237")
df2019_sub <- df2019[, columns_of_interest]
We will clean the selected data by replacing negative values with NA.
df2019_sub <- df2019_sub %>%
mutate(Q107 = ifelse(Q107 < 0, NA, Q107),
Q237 = ifelse(Q237 < 0, NA, Q237),
Q235 = ifelse(Q235 < 0, NA, Q235),
Q245 = ifelse(Q245 < 0, NA, Q245),
Q114 = ifelse(Q114 < 0, NA, Q114),
Q260 = ifelse(Q260 < 0, NA, Q260))
We will check for multicollinearity by calculating and plotting the correlation matrix.
# Calculate correlation matrix
cor_matrix_2019 <- cor(df2019_sub, use = "complete.obs")
# Create more descriptive variable labels
var_labels <- c(
"Q107" = "Gov't Ownership Preference",
"Q114" = "Perceived Corruption",
"J_INTDATE" = "Interview Date",
"Q260" = "Urban/Rural",
"Q262" = "Age",
"Q245" = "Coup Acceptance",
"Q235"= "Strongman Rule",
"Q237" = "Army Rule"
)
# Melt the correlation matrix
melted_cormat <- melt(cor_matrix_2019)
# Create the plot
ggplot(data = melted_cormat, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "#002F6C", high = "#BA0C2F", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Correlation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
legend.justification = c(1, 0),
legend.position = c(1.5, 0.3),
legend.direction = "vertical") +
coord_fixed() +
geom_text(aes(label = round(value, 2)), color = "black", size = 4) +
scale_x_discrete(labels = var_labels) +
scale_y_discrete(labels = var_labels) +
ggtitle("Correlation Matrix of Key Variables (2019)") +
labs(subtitle = "Tunisia World Values Survey Data",
caption = "Source: World Values Survey, Wave 7 (2019)")
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
## 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
We will perform simple imputation to handle missing values.
# Perform simple imputation
df2019_sub <- df2019_sub %>%
mutate(across(everything(), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))
We will test the association between business ownership preference
(Q107
) and the perception of corruption among business
executives (Q114
) using linear and generalized linear
models.
# Descriptive statistics for Q107 and Q114
descriptive_stats2019 <- describe(df2019_sub[, c("Q107", "Q114", "Q245", "Q237")])
# Print the descriptive statistics
print(descriptive_stats2019)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## Q107 1 1208 6.68 3.08 7.00 6.97 4.45 1 10 9 -0.53 -1.05 0.09
## Q114 2 1208 2.93 0.76 3.00 2.94 1.48 1 4 3 -0.17 -0.59 0.02
## Q245 3 1208 5.56 3.14 5.56 5.57 3.79 1 10 9 -0.04 -1.26 0.09
## Q237 4 1208 2.87 0.94 3.00 2.96 1.48 1 4 3 -0.55 -0.52 0.03
# Test the association using lm
lm_model2019 <- lm(Q107 ~ Q114, df2019_sub)
summary(lm_model2019)
##
## Call:
## lm(formula = Q107 ~ Q114, data = df2019_sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.4312 -2.4312 0.5688 2.5688 4.6649
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.6365 0.3499 13.250 < 2e-16 ***
## Q114 0.6987 0.1156 6.045 1.99e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.035 on 1206 degrees of freedom
## Multiple R-squared: 0.02941, Adjusted R-squared: 0.0286
## F-statistic: 36.54 on 1 and 1206 DF, p-value: 1.99e-09
# Fit the linear model with additional covariates
lm_model2019_cov <- lm(Q107 ~ Q114 + J_INTDATE + Q262 + Q260, data = df2019_sub)
summary(lm_model2019_cov)
##
## Call:
## lm(formula = Q107 ~ Q114 + J_INTDATE + Q262 + Q260, data = df2019_sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.7162 -2.4267 0.6046 2.6498 4.9564
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.803e+05 5.685e+04 -3.172 0.00155 **
## Q114 7.112e-01 1.150e-01 6.185 8.49e-10 ***
## J_INTDATE 8.930e-03 2.816e-03 3.172 0.00155 **
## Q262 1.287e-02 5.531e-03 2.328 0.02010 *
## Q260 1.785e-01 1.746e-01 1.022 0.30683
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.018 on 1203 degrees of freedom
## Multiple R-squared: 0.04278, Adjusted R-squared: 0.0396
## F-statistic: 13.44 on 4 and 1203 DF, p-value: 1.012e-10
# Test the association using glm (assuming Gaussian family)
glm_model2019 <- glm(Q107 ~ Q114, family = gaussian, df2019_sub)
summary(glm_model2019)
##
## Call:
## glm(formula = Q107 ~ Q114, family = gaussian, data = df2019_sub)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.6365 0.3499 13.250 < 2e-16 ***
## Q114 0.6987 0.1156 6.045 1.99e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 9.212431)
##
## Null deviance: 11447 on 1207 degrees of freedom
## Residual deviance: 11110 on 1206 degrees of freedom
## AIC: 6114.6
##
## Number of Fisher Scoring iterations: 2
# Fit the glm with additional covariates
glm_model2019_cov <- glm(Q107 ~ Q114 + J_INTDATE + Q262 + Q260, data = df2019_sub)
summary(glm_model2019_cov)
##
## Call:
## glm(formula = Q107 ~ Q114 + J_INTDATE + Q262 + Q260, data = df2019_sub)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.803e+05 5.685e+04 -3.172 0.00155 **
## Q114 7.112e-01 1.150e-01 6.185 8.49e-10 ***
## J_INTDATE 8.930e-03 2.816e-03 3.172 0.00155 **
## Q262 1.287e-02 5.531e-03 2.328 0.02010 *
## Q260 1.785e-01 1.746e-01 1.022 0.30683
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 9.108157)
##
## Null deviance: 11447 on 1207 degrees of freedom
## Residual deviance: 10957 on 1203 degrees of freedom
## AIC: 6103.8
##
## Number of Fisher Scoring iterations: 2
We will visualize the density distribution of the two variables to understand their spread and central tendencies.
# Plot the density distribution for Q107
ggplot(df2019_sub, aes(x = Q107)) +
geom_density(fill = "#002F6C", alpha = 0.7) +
labs(title = "Distribution of Preference for Government Ownership of Business",
subtitle = "Tunisia, 2019",
x = "Preference for Government Ownership (1 = Private, 10 = Government)",
y = "Density",
caption = "Source: World Values Survey, Wave 7 (2019)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
plot.subtitle = element_text(size = 12, color = "#002F6C"),
axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
axis.text = element_text(size = 10, color = "#002F6C"),
panel.grid.major = element_line(color = "#E6E6E6"),
panel.grid.minor = element_blank(),
plot.caption = element_text(hjust = 0, color = "#002F6C")
) +
scale_x_continuous(breaks = seq(1, 10, 1))
# Plot the density distribution for Q114
ggplot(df2019_sub, aes(x = Q114)) +
geom_density(fill = "#BA0C2F", alpha = 0.7) +
labs(title = "Distribution of Perceived Corruption Among Business Executives",
subtitle = "Tunisia, 2019",
x = "Perception of Corruption (1 = None, 10 = All)",
y = "Density",
caption = "Source: World Values Survey, Wave 7 (2019)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
plot.subtitle = element_text(size = 12, color = "#002F6C"),
axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
axis.text = element_text(size = 10, color = "#002F6C"),
panel.grid.major = element_line(color = "#E6E6E6"),
panel.grid.minor = element_blank(),
plot.caption = element_text(hjust = 0, color = "#002F6C")
) +
scale_x_continuous(breaks = seq(1, 10, 1))
We will visualize the results of the linear and generalized linear models.
library(ggplot2)
# Visualize the results of the linear model
ggplot(df2019_sub, aes(y = Q107, x = Q114)) +
geom_smooth(method = "lm", color = "#BA0C2F", fill = "#BA0C2F", alpha = 0.2) +
labs(title = "Perceived Corruption and Government Ownership Preference",
subtitle = "Linear Model (2019)",
y = "Preference for Government Ownership of Business",
x = "Perception of Corruption Among Business Executives",
caption = "Source: World Values Survey, Wave 7 (2019)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
plot.subtitle = element_text(size = 12, color = "#002F6C"),
axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
axis.text = element_text(size = 10, color = "#002F6C"),
panel.grid.major = element_line(color = "#E6E6E6"),
panel.grid.minor = element_blank(),
plot.caption = element_text(hjust = 0, color = "#002F6C")
)
## `geom_smooth()` using formula = 'y ~ x'
# Visualize the results of the generalized linear model
ggplot(df2019_sub, aes(y = Q107, x = Q114)) +
geom_smooth(method = "glm", method.args = list(family = "gaussian"),
color = "#BA0C2F", fill = "#BA0C2F", alpha = 0.2) +
labs(title = "Perceived Corruption and Government Ownership Preference",
subtitle = "Generalized Linear Model (2019)",
y = "Preference for Government Ownership of Business",
x = "Perception of Corruption Among Business Executives",
caption = "Source: World Values Survey, Wave 7 (2019)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
plot.subtitle = element_text(size = 12, color = "#002F6C"),
axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
axis.text = element_text(size = 10, color = "#002F6C"),
panel.grid.major = element_line(color = "#E6E6E6"),
panel.grid.minor = element_blank(),
plot.caption = element_text(hjust = 0, color = "#002F6C")
)
## `geom_smooth()` using formula = 'y ~ x'
# Visualize the age distribution of preferences
ggplot(df2019_sub, aes(y = Q107, x = Q262)) +
geom_smooth(method = "lm", color = "#BA0C2F", fill = "#BA0C2F", alpha = 0.2) +
labs(title = "Age and Government Ownership Preference",
subtitle = "Linear Model (2019)",
y = "Preference for Government Ownership of Business",
x = "Age",
caption = "Source: World Values Survey, Wave 7 (2019)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
plot.subtitle = element_text(size = 12, color = "#002F6C"),
axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
axis.text = element_text(size = 10, color = "#002F6C"),
panel.grid.major = element_line(color = "#E6E6E6"),
panel.grid.minor = element_blank(),
plot.caption = element_text(hjust = 0, color = "#002F6C")
)
## `geom_smooth()` using formula = 'y ~ x'
We will create an effect plot to visualize the effects of the variables in the model.
# Compute the effect for the linear model with covariates
effect_lm2019 <- effect("Q114", lm_model2019_cov)
# Convert effect object to data frame
effect_lm_df <- as.data.frame(effect_lm2019)
# Plot the effect for the linear model
ggplot(effect_lm_df, aes(x = Q114, y = fit)) +
geom_line(color = "#002F6C", size = 1) +
geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2, fill = "#002F6C") +
labs(title = "Effect of Perceived Corruption on Government Ownership Preference",
subtitle = "Linear Model (2019)",
x = "Perception of Corruption Among Business Executives",
y = "Preference for Government Ownership of Business",
caption = "Source: World Values Survey, Wave 7 (2019)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
plot.subtitle = element_text(size = 12, color = "#002F6C"),
axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
axis.text = element_text(size = 10, color = "#002F6C"),
panel.grid.major = element_line(color = "#E6E6E6"),
panel.grid.minor = element_blank(),
plot.caption = element_text(hjust = 0, color = "#002F6C")
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Compute the effect for the generalized linear model with covariates
effect_glm2019 <- effect("Q114", glm_model2019_cov)
# Convert effect object to data frame
effect_glm_df <- as.data.frame(effect_glm2019)
# Plot the effect for the generalized linear model
ggplot(effect_glm_df, aes(x = Q114, y = fit)) +
geom_line(color = "#BA0C2F", size = 1) +
geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2, fill = "#BA0C2F") +
labs(title = "Effect of Perceived Corruption on Government Ownership Preference",
subtitle = "Generalized Linear Model (2019)",
x = "Perception of Corruption Among Business Executives",
y = "Preference for Government Ownership of Business",
caption = "Source: World Values Survey, Wave 7 (2019)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
plot.subtitle = element_text(size = 12, color = "#002F6C"),
axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
axis.text = element_text(size = 10, color = "#002F6C"),
panel.grid.major = element_line(color = "#E6E6E6"),
panel.grid.minor = element_blank(),
plot.caption = element_text(hjust = 0, color = "#002F6C")
)
Finally, we will create a table of regression results using the
stargazer
package.
# Create a stargazer table with improved labels
stargazer(lm_model2019, lm_model2019_cov, glm_model2019, glm_model2019_cov, type = "html",
title = "Regression Results",
dep.var.labels.include = FALSE,
column.labels = c("OLS", "OLS with Covariates", "GLM", "GLM with Covariates"),
dep.var.labels = c("Preference for Government Ownership of Business"),
covariate.labels = c("Perception of Corruption Among Business Executives",
"Urban/Rural",
"Interview Date",
"Age",
"Sex"),
keep.stat = c("n", "rsq", "adj.rsq", "f", "ll", "aic", "bic"),
notes.append = TRUE,
notes.align = "r")
Dependent variable: | ||||
OLS | normal | |||
OLS | OLS with Covariates | GLM | GLM with Covariates | |
(1) | (2) | (3) | (4) | |
Perception of Corruption Among Business Executives | 0.699*** | 0.711*** | 0.699*** | 0.711*** |
(0.116) | (0.115) | (0.116) | (0.115) | |
Urban/Rural | 0.009*** | 0.009*** | ||
(0.003) | (0.003) | |||
Interview Date | 0.013** | 0.013** | ||
(0.006) | (0.006) | |||
Age | 0.178 | 0.178 | ||
(0.175) | (0.175) | |||
Sex | 4.636*** | -180,293.000*** | 4.636*** | -180,293.000*** |
(0.350) | (56,846.910) | (0.350) | (56,846.910) | |
Observations | 1,208 | 1,208 | 1,208 | 1,208 |
R2 | 0.029 | 0.043 | ||
Adjusted R2 | 0.029 | 0.040 | ||
Log Likelihood | -3,055.291 | -3,046.911 | ||
Akaike Inf. Crit. | 6,114.583 | 6,103.823 | ||
F Statistic | 36.541*** (df = 1; 1206) | 13.442*** (df = 4; 1203) | ||
Note: | p<0.1; p<0.05; p<0.01 |
Now, we load the World Value Survey data collected in Tunisia in 2013 from a Stata file and inspect its structure.
# Read the Stata file
df2013 <- read_dta("~/Downloads/WV6_Data_Tunisia_Stata_v20201117.dta")
We then select the columns of interest for the analysis.
columns_of_interest <- c("V97", #Q107 in 2019
"MN_228M",#Q114 in 2019
"V261", #J_INTDATE in 2019
"V240", #Q260 in 2019
"V242",#Q262 in 2019
"V129", #Q237 in 2019
"V127",#Q235 in 2019
"V135") #Q245 in 2019
df2013_sub <- df2013[, columns_of_interest]
We will clean the selected data by replacing negative values with NA.
df2013_sub <- df2013_sub %>%
mutate(V97 = ifelse(V97 < 0, NA, V97),
V135= ifelse(V135 < 0, NA, V135),
V129= ifelse(V129 < 0, NA, V129),
V127= ifelse(V127 < 0, NA, V127),
MN_228M = ifelse(MN_228M < 0, NA, MN_228M),
V240 = ifelse(V240 < 0, NA, V240))
We will check for multicollinearity by calculating and plotting the correlation matrix.
# Calculate correlation matrix
cor_matrix_2013 <- cor(df2013_sub, use = "complete.obs")
# Create more descriptive variable labels
var_labels <- c(
"Q107" = "Gov't Ownership Preference",
"Q114" = "Perceived Corruption",
"J_INTDATE" = "Interview Date",
"Q260" = "Urban/Rural",
"Q262" = "Age"
)
# Melt the correlation matrix
melted_cormat <- melt(cor_matrix_2013)
# Create the plot
ggplot(data = melted_cormat, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "#002F6C", high = "#BA0C2F", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Correlation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
legend.justification = c(1, 0),
legend.position = c(1.5, 0.3),
legend.direction = "vertical") +
coord_fixed() +
geom_text(aes(label = round(value, 2)), color = "black", size = 4) +
scale_x_discrete(labels = var_labels) +
scale_y_discrete(labels = var_labels) +
ggtitle("Correlation Matrix of Key Variables (2013)") +
labs(subtitle = "Tunisia World Values Survey Data",
caption = "Source: World Values Survey, Wave 6 (2013)")
We will perform simple imputation to handle missing values.
# Perform simple imputation
df2013_sub <- df2013_sub %>%
mutate(across(everything(), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))
We will test the association between business ownership preference
(V97
) and the perception of corruption among business
executives (MN_228M
) using linear and generalized linear
models.
# Descriptive statistics for V97 and MN_228M
descriptive_stats2013 <- describe(df2013_sub[, c("V97", "MN_228M", "V135","V129")])
# Print the descriptive statistics
print(descriptive_stats2013)
## vars n mean sd median trimmed mad min max range skew kurtosis
## V97 1 1205 5.91 2.34 5.91 5.96 1.34 1 10 9 -0.09 -0.09
## MN_228M 2 1205 7.47 2.47 8.00 7.74 2.97 1 10 9 -0.65 -0.55
## V135 3 1205 4.85 3.19 4.85 4.69 4.22 1 10 9 0.30 -1.17
## V129 4 1205 2.79 1.05 3.00 2.86 1.48 1 4 3 -0.41 -0.99
## se
## V97 0.07
## MN_228M 0.07
## V135 0.09
## V129 0.03
# Test the association using lm
lm_model2013 <- lm(V97 ~ MN_228M, df2013_sub)
summary(lm_model2013)
##
## Call:
## lm(formula = V97 ~ MN_228M, data = df2013_sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.2320 -1.0311 0.0191 1.1698 4.2201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.28228 0.21483 29.243 <2e-16 ***
## MN_228M -0.05024 0.02732 -1.839 0.0662 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.338 on 1203 degrees of freedom
## Multiple R-squared: 0.002803, Adjusted R-squared: 0.001974
## F-statistic: 3.381 on 1 and 1203 DF, p-value: 0.06619
# Fit the linear model with additional covariates
lm_model2013_cov <- lm(V97 ~ MN_228M + V261 + V242 + V240, data = df2013_sub)
summary(lm_model2013_cov)
##
## Call:
## lm(formula = V97 ~ MN_228M + V261 + V242 + V240, data = df2013_sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3839 -0.9975 -0.0611 1.2373 4.4311
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.528e+04 3.741e+04 0.676 0.4993
## MN_228M -5.218e-02 2.740e-02 -1.905 0.0571 .
## V261 -1.255e-03 1.858e-03 -0.676 0.4994
## V242 6.387e-03 4.161e-03 1.535 0.1251
## V240 1.215e-01 1.352e-01 0.899 0.3689
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.338 on 1200 degrees of freedom
## Multiple R-squared: 0.005941, Adjusted R-squared: 0.002627
## F-statistic: 1.793 on 4 and 1200 DF, p-value: 0.1279
# Test the association using glm (assuming Gaussian family)
glm_model2013 <- glm(V97 ~ MN_228M, family = gaussian, df2013_sub)
summary(glm_model2013)
##
## Call:
## glm(formula = V97 ~ MN_228M, family = gaussian, data = df2013_sub)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.28228 0.21483 29.243 <2e-16 ***
## MN_228M -0.05024 0.02732 -1.839 0.0662 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 5.467589)
##
## Null deviance: 6596.0 on 1204 degrees of freedom
## Residual deviance: 6577.5 on 1203 degrees of freedom
## AIC: 5470.7
##
## Number of Fisher Scoring iterations: 2
# Fit the glm with additional covariates
glm_model2013_cov <- glm(V97 ~ MN_228M + V261 + V242 + V240, data = df2013_sub)
summary(glm_model2013_cov)
##
## Call:
## glm(formula = V97 ~ MN_228M + V261 + V242 + V240, data = df2013_sub)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.528e+04 3.741e+04 0.676 0.4993
## MN_228M -5.218e-02 2.740e-02 -1.905 0.0571 .
## V261 -1.255e-03 1.858e-03 -0.676 0.4994
## V242 6.387e-03 4.161e-03 1.535 0.1251
## V240 1.215e-01 1.352e-01 0.899 0.3689
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 5.46401)
##
## Null deviance: 6596.0 on 1204 degrees of freedom
## Residual deviance: 6556.8 on 1200 degrees of freedom
## AIC: 5472.9
##
## Number of Fisher Scoring iterations: 2
We will visualize the density distribution of the two variables to understand their spread and central tendencies.
library(ggplot2)
# Plot the density distribution for V97
ggplot(df2013_sub, aes(x = V97)) +
geom_density(fill = "#002F6C", alpha = 0.7) +
labs(title = "Distribution of Preference for Government Ownership of Business",
subtitle = "Tunisia, 2013",
x = "Preference for Government Ownership (1 = Private, 10 = Government)",
y = "Density",
caption = "Source: World Values Survey, Wave 6 (2013)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
plot.subtitle = element_text(size = 12, color = "#002F6C"),
axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
axis.text = element_text(size = 10, color = "#002F6C"),
panel.grid.major = element_line(color = "#E6E6E6"),
panel.grid.minor = element_blank(),
plot.caption = element_text(hjust = 0, color = "#002F6C")
) +
scale_x_continuous(breaks = seq(1, 10, 1))
# Plot the density distribution for MN_228M
ggplot(df2013_sub, aes(x = MN_228M)) +
geom_density(fill = "#BA0C2F", alpha = 0.7) +
labs(title = "Distribution of Perceived Corruption Among Business Executives",
subtitle = "Tunisia, 2013",
x = "Perception of Corruption (1 = None, 10 = All)",
y = "Density",
caption = "Source: World Values Survey, Wave 6 (2013)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
plot.subtitle = element_text(size = 12, color = "#002F6C"),
axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
axis.text = element_text(size = 10, color = "#002F6C"),
panel.grid.major = element_line(color = "#E6E6E6"),
panel.grid.minor = element_blank(),
plot.caption = element_text(hjust = 0, color = "#002F6C")
) +
scale_x_continuous(breaks = seq(1, 10, 1))
We will visualize the results of the linear and generalized linear models.
# Visualize the results of the linear model
ggplot(df2013_sub, aes(y = V97, x = MN_228M)) +
geom_smooth(method = "lm", color = "#002F6C", fill = "#002F6C", alpha = 0.2) +
labs(title = "Linear Model: Government Ownership Preference vs Perceived Corruption",
subtitle = "Tunisia, 2013",
y = "Preference for Government Ownership of Business",
x = "Perception of Corruption Among Business Executives",
caption = "Source: World Values Survey, Wave 6 (2013)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
plot.subtitle = element_text(size = 12, color = "#002F6C"),
axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
axis.text = element_text(size = 10, color = "#002F6C"),
panel.grid.major = element_line(color = "#E6E6E6"),
panel.grid.minor = element_blank(),
plot.caption = element_text(hjust = 0, color = "#002F6C")
) +
scale_x_continuous(breaks = seq(1, 10, 1)) +
scale_y_continuous(breaks = seq(1, 10, 1))
## `geom_smooth()` using formula = 'y ~ x'
# Visualize the results of the generalized linear model
ggplot(df2013_sub, aes(y = V97, x = MN_228M)) +
geom_smooth(method = "glm", method.args = list(family = "gaussian"), color = "#BA0C2F", fill = "#BA0C2F", alpha = 0.2) +
labs(title = "Generalized Linear Model: Government Ownership Preference vs Perceived Corruption",
subtitle = "Tunisia, 2013",
y = "Preference for Government Ownership of Business",
x = "Perception of Corruption Among Business Executives",
caption = "Source: World Values Survey, Wave 6 (2013)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
plot.subtitle = element_text(size = 12, color = "#002F6C"),
axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
axis.text = element_text(size = 10, color = "#002F6C"),
panel.grid.major = element_line(color = "#E6E6E6"),
panel.grid.minor = element_blank(),
plot.caption = element_text(hjust = 0, color = "#002F6C")
) +
scale_x_continuous(breaks = seq(1, 10, 1)) +
scale_y_continuous(breaks = seq(1, 10, 1))
## `geom_smooth()` using formula = 'y ~ x'
# Visualize the age distribution of preferences
ggplot(df2013_sub, aes(y = V97, x = V242)) +
geom_smooth(method = "lm", color = "#002F6C", fill = "#002F6C", alpha = 0.2) +
labs(title = "Age Distribution of Government Ownership Preferences",
subtitle = "Tunisia, 2013",
y = "Preference for Government Ownership of Business",
x = "Age",
caption = "Source: World Values Survey, Wave 6 (2013)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
plot.subtitle = element_text(size = 12, color = "#002F6C"),
axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
axis.text = element_text(size = 10, color = "#002F6C"),
panel.grid.major = element_line(color = "#E6E6E6"),
panel.grid.minor = element_blank(),
plot.caption = element_text(hjust = 0, color = "#002F6C")
) +
scale_x_continuous(breaks = seq(18, max(df2013_sub$V242, na.rm = TRUE), by = 10)) +
scale_y_continuous(breaks = seq(1, 10, 1))
## `geom_smooth()` using formula = 'y ~ x'
We will create an effect plot to visualize the effects of the variables in the model.
library(ggplot2)
library(effects)
# Compute the effect for the linear model with covariates
effect_lm2013 <- effect("MN_228M", lm_model2013_cov)
# Convert effect object to data frame
effect_lm_df <- as.data.frame(effect_lm2013)
# Plot the effect for the linear model
ggplot(effect_lm_df, aes(x = MN_228M, y = fit)) +
geom_line(color = "#002F6C", size = 1) +
geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2, fill = "#002F6C") +
labs(title = "Effect of Perceived Corruption on Government Ownership Preference",
subtitle = "Linear Model (2013)",
x = "Perception of Corruption Among Business Executives",
y = "Preference for Government Ownership of Business",
caption = "Source: World Values Survey, Wave 6 (2013)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
plot.subtitle = element_text(size = 12, color = "#002F6C"),
axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
axis.text = element_text(size = 10, color = "#002F6C"),
panel.grid.major = element_line(color = "#E6E6E6"),
panel.grid.minor = element_blank(),
plot.caption = element_text(hjust = 0, color = "#002F6C")
)
# Compute the effect for the generalized linear model with covariates
effect_glm2013 <- effect("MN_228M", glm_model2013_cov)
# Convert effect object to data frame
effect_glm_df <- as.data.frame(effect_glm2013)
# Plot the effect for the generalized linear model
ggplot(effect_glm_df, aes(x = MN_228M, y = fit)) +
geom_line(color = "#BA0C2F", size = 1) +
geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2, fill = "#BA0C2F") +
labs(title = "Effect of Perceived Corruption on Government Ownership Preference",
subtitle = "Generalized Linear Model (2013)",
x = "Perception of Corruption Among Business Executives",
y = "Preference for Government Ownership of Business",
caption = "Source: World Values Survey, Wave 6 (2013)") +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
plot.subtitle = element_text(size = 12, color = "#002F6C"),
axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
axis.text = element_text(size = 10, color = "#002F6C"),
panel.grid.major = element_line(color = "#E6E6E6"),
panel.grid.minor = element_blank(),
plot.caption = element_text(hjust = 0, color = "#002F6C")
)
Finally, we will create a table of regression results using the
stargazer
package.
# Create a stargazer table with improved labels
stargazer(lm_model2013, lm_model2013_cov, glm_model2013, glm_model2013_cov, type = "html",
title = "Regression Results",
dep.var.labels.include = FALSE,
column.labels = c("OLS", "OLS with Covariates", "GLM", "GLM with Covariates"),
dep.var.labels = c("Preference for Government Ownership of Business"),
covariate.labels = c("Perception of Corruption Among Business Executives",
"Urban/Rural",
"Interview Date",
"Age",
"Sex"),
keep.stat = c("n", "rsq", "adj.rsq", "f", "ll", "aic", "bic"),
notes.append = TRUE,
notes.align = "r")
Dependent variable: | ||||
OLS | normal | |||
OLS | OLS with Covariates | GLM | GLM with Covariates | |
(1) | (2) | (3) | (4) | |
Perception of Corruption Among Business Executives | -0.050* | -0.052* | -0.050* | -0.052* |
(0.027) | (0.027) | (0.027) | (0.027) | |
Urban/Rural | -0.001 | -0.001 | ||
(0.002) | (0.002) | |||
Interview Date | 0.006 | 0.006 | ||
(0.004) | (0.004) | |||
Age | 0.122 | 0.122 | ||
(0.135) | (0.135) | |||
Sex | 6.282*** | 25,277.960 | 6.282*** | 25,277.960 |
(0.215) | (37,408.500) | (0.215) | (37,408.500) | |
Observations | 1,205 | 1,205 | 1,205 | 1,205 |
R2 | 0.003 | 0.006 | ||
Adjusted R2 | 0.002 | 0.003 | ||
Log Likelihood | -2,733.370 | -2,731.471 | ||
Akaike Inf. Crit. | 5,470.740 | 5,472.942 | ||
F Statistic | 3.381* (df = 1; 1203) | 1.793 (df = 4; 1200) | ||
Note: | p<0.1; p<0.05; p<0.01 |
# Load required libraries if not already loaded
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
##
## alpha, rescale
# Combine the data from both years
df_combined <- rbind(
data.frame(year = "2013", preference = df2013_sub$V97),
data.frame(year = "2019", preference = df2019_sub$Q107)
)
# Create the overlapping density plot
ggplot(df_combined, aes(x = preference, fill = year)) +
geom_density(alpha = 0.7) +
labs(title = "Preference for Government Ownership of Business in Tunisia",
subtitle = "Comparison between 2013 and 2019",
x = "Preference for Government Ownership (1 = Private, 10 = Government)",
y = "Density",
fill = "Year",
caption = "Source: World Values Survey, Waves 6 (2013) and 7 (2019)") +
scale_fill_manual(values = c("2013" = "#002F6C", "2019" = "#BA0C2F")) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
plot.subtitle = element_text(size = 12, color = "#002F6C"),
axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
axis.text = element_text(size = 10, color = "#002F6C"),
legend.title = element_text(face = "bold", color = "#002F6C"),
legend.text = element_text(color = "#002F6C"),
legend.position = "top",
panel.grid.major = element_line(color = "#E6E6E6"),
panel.grid.minor = element_blank(),
plot.caption = element_text(hjust = 0, color = "#002F6C")
) +
scale_x_continuous(breaks = seq(1, 10, 1)) +
scale_y_continuous(labels = scales::percent_format(scale = 1))
# Load required libraries
library(ggplot2)
library(dplyr)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
library(ggpubr)
# Combine the data from both years (assuming df2013_sub and df2019_sub exist)
df_combined <- rbind(
data.frame(year = "2013", preference = df2013_sub$V97),
data.frame(year = "2019", preference = df2019_sub$Q107)
)
# Calculate summary statistics
summary_stats <- df_combined %>%
group_by(year) %>%
summarise(
mean = mean(preference, na.rm = TRUE),
median = median(preference, na.rm = TRUE),
lower = quantile(preference, 0.25, na.rm = TRUE),
upper = quantile(preference, 0.75, na.rm = TRUE)
)
# Perform t-test
t_test_result <- t.test(preference ~ year, data = df_combined)
p_value <- t_test_result$p.value
significance <- if(p_value < 0.05) "Statistically Significant" else "Not Statistically Significant"
# Create the improved plot
plot <- ggplot(summary_stats, aes(x = year, y = median, color = year)) +
geom_pointrange(aes(ymin = lower, ymax = upper), size = 1.5, fatten = 3) +
geom_text(aes(y = upper, label = sprintf("Median: %.1f", median)),
vjust = -1, hjust = 0.5, size = 4, fontface = "bold") +
labs(title = "Preference for Government Ownership of Business in Tunisia",
subtitle = "Comparison between 2013 and 2019",
x = "Year",
y = "Preference for Government Ownership",
caption = paste("Source: World Values Survey, Waves 6 (2013) and 7 (2019)",
"\nBars show interquartile range",
"\np-value:", format.pval(p_value, digits = 3))) +
scale_color_manual(values = c("2013" = "#002F6C", "2019" = "#BA0C2F")) +
scale_y_continuous(breaks = seq(1, 10, 1),
labels = c("1\n(Private\nOwnership)", "2", "3", "4", "5", "6", "7", "8", "9", "10\n(Government\nOwnership)"),
limits = c(1, 10)) +
coord_flip() +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 20, hjust = 0.5),
plot.subtitle = element_text(size = 16, hjust = 0.5),
axis.title.x = element_text(face = "bold", size = 14, margin = margin(t = 10)),
axis.title.y = element_blank(),
axis.text = element_text(size = 12),
legend.position = "none",
panel.grid.major.y = element_blank(),
panel.grid.minor = element_blank(),
plot.caption = element_text(hjust = 0, size = 10, margin = margin(t = 20)),
plot.margin = margin(20, 20, 20, 20)
)
# Display the plot
print(plot)
# Print detailed t-test results
print(t_test_result)
##
## Welch Two Sample t-test
##
## data: preference by year
## t = -6.9847, df = 2252.5, p-value = 3.743e-12
## alternative hypothesis: true difference in means between group 2013 and group 2019 is not equal to 0
## 95 percent confidence interval:
## -0.9960414 -0.5593524
## sample estimates:
## mean in group 2013 mean in group 2019
## 5.907177 6.684874
# Load required libraries
library(ggplot2)
library(dplyr)
library(knitr)
library(tidyr)
# Function to find the correct column name
find_column <- function(df, possible_names) {
for (name in possible_names) {
if (name %in% names(df)) {
return(name)
}
}
stop("Could not find a matching column name")
}
# Find the correct column names
col_2013 <- find_column(df2013_sub, c("V97", "Q107", "government_ownership"))
col_2019 <- find_column(df2019_sub, c("Q107", "V97", "government_ownership"))
# Create a contingency table
cont_table <- table(Year = c(rep("2013", nrow(df2013_sub)), rep("2019", nrow(df2019_sub))),
Preference = c(df2013_sub[[col_2013]], df2019_sub[[col_2019]]))
# Perform Fisher's Exact Test
fisher_test_result <- fisher.test(cont_table, simulate.p.value = TRUE, B = 10000)
# Create a table of results
result_table <- data.frame(
P_Value = fisher_test_result$p.value,
Significant = ifelse(fisher_test_result$p.value < 0.05, "Yes", "No")
)
# Print the test results table
kable(result_table, caption = "Fisher's Exact Test Results")
P_Value | Significant |
---|---|
1e-04 | Yes |
# Calculate summary statistics
summary_stats <- rbind(
df2013_sub %>% summarise(Year = 2013,
Mean = mean(!!sym(col_2013), na.rm = TRUE),
Median = median(!!sym(col_2013), na.rm = TRUE),
SD = sd(!!sym(col_2013), na.rm = TRUE)),
df2019_sub %>% summarise(Year = 2019,
Mean = mean(!!sym(col_2019), na.rm = TRUE),
Median = median(!!sym(col_2019), na.rm = TRUE),
SD = sd(!!sym(col_2019), na.rm = TRUE))
)
# Print summary statistics table
kable(summary_stats, caption = "Summary Statistics", digits = 2)
Year | Mean | Median | SD |
---|---|---|---|
2013 | 5.91 | 5.91 | 2.34 |
2019 | 6.68 | 7.00 | 3.08 |
# Prepare data for plotting
plot_data <- as.data.frame(cont_table) %>%
group_by(Year) %>%
mutate(Percentage = Freq / sum(Freq) * 100)
# Create a grouped bar plot
ggplot(plot_data, aes(x = Preference, y = Percentage, fill = Year)) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_manual(values = c("2013" = "#002F6C", "2019" = "#BA0C2F")) +
labs(title = "Distribution of Government Ownership Preferences",
subtitle = "Comparison between 2013 and 2019",
x = "Preference for Government Ownership (1 = Private, 10 = Government)",
y = "Percentage",
fill = "Year") +
theme_minimal() +
theme(legend.position = "top") +
scale_x_discrete(limits = as.character(1:10))
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# Create the density plot
ggplot() +
geom_density(aes(x = df2013_sub[[col_2013]], fill = "2013"), alpha = 0.5) +
geom_density(aes(x = df2019_sub[[col_2019]], fill = "2019"), alpha = 0.5) +
scale_fill_manual(values = c("2013" = "#002F6C", "2019" = "#BA0C2F")) +
labs(title = "Distribution of Government Ownership Preferences",
subtitle = "Comparison between 2013 and 2019",
x = "Preference for Government Ownership (1 = Private, 10 = Government)",
y = "Density",
fill = "Year") +
theme_minimal() +
theme(legend.position = "top")