First, we load the World Value Survey data collected in Tunisia in 2019 from a Stata file and inspect its structure.
# Read the Stata file
df <- read_dta("~/Downloads/WVS_Wave_7_Tunisia_Stata_v5.0.dta")
# Display the first few rows of the dataset
head(df)
## # A tibble: 6 × 397
## version doi A_YEAR B_COUNTRY B_COUNTRY_ALPHA C_COW_NUM C_COW_ALPHA
## <chr> <chr> <dbl+lbl> <dbl+lbl> <chr> <dbl+lbl> <chr>
## 1 1-5-0 (2020-… doi.… 2019 [201… 788 [Tun… TUN 616 [Tun… TUN
## 2 1-5-0 (2020-… doi.… 2019 [201… 788 [Tun… TUN 616 [Tun… TUN
## 3 1-5-0 (2020-… doi.… 2019 [201… 788 [Tun… TUN 616 [Tun… TUN
## 4 1-5-0 (2020-… doi.… 2019 [201… 788 [Tun… TUN 616 [Tun… TUN
## 5 1-5-0 (2020-… doi.… 2019 [201… 788 [Tun… TUN 616 [Tun… TUN
## 6 1-5-0 (2020-… doi.… 2019 [201… 788 [Tun… TUN 616 [Tun… TUN
## # ℹ 390 more variables: D_INTERVIEW <dbl>, J_INTDATE <dbl+lbl>, FW_START <dbl>,
## # FW_END <dbl>, K_TIME_START <dbl+lbl>, K_TIME_END <dbl+lbl>,
## # K_DURATION <dbl+lbl>, Q_MODE <dbl+lbl>, N_REGION_ISO <dbl+lbl>,
## # N_REGION_WVS <dbl+lbl>, N_TOWN <dbl+lbl>, G_TOWNSIZE <dbl+lbl>,
## # G_TOWNSIZE2 <dbl+lbl>, H_SETTLEMENT <dbl+lbl>, H_URBRURAL <dbl+lbl>,
## # O1_LONGITUDE <dbl+lbl>, O2_LATITUDE <dbl+lbl>, S_INTLANGUAGE <dbl+lbl>,
## # LNGE_ISO <chr>, E_RESPINT <dbl+lbl>, F_INTPRIVACY <dbl+lbl>, …
We then select the columns of interest for the analysis.
columns_of_interest <- c("Q107",
"Q114",
"H_URBRURAL",
"J_INTDATE",
"Q260",
"Q262")
df_sub <- df[, columns_of_interest]
We will clean the selected data by replacing negative values with NA.
df_sub <- df_sub %>%
mutate(Q107 = ifelse(Q107 < 0, NA, Q107),
Q114 = ifelse(Q114 < 0, NA, Q114),
Q260 = ifelse(Q260 < 0, NA, Q260))
We will check for multicollinearity by calculating and plotting the correlation matrix.
# Calculate correlation matrix
cor_matrix <- cor(df_sub, use = "complete.obs")
# Plot correlation matrix
corrplot(cor_matrix, method = "circle")
We will perform simple imputation to handle missing values.
# Perform simple imputation
df_sub <- df_sub %>%
mutate(across(everything(), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))
We will test the association between business ownership preference
(Q107
) and the perception of corruption among business
executives (Q114
) using linear and generalized linear
models.
# Descriptive statistics for Q107 and Q114
descriptive_stats <- describe(df_sub[, c("Q107", "Q114")])
# Print the descriptive statistics
print(descriptive_stats)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## Q107 1 1208 6.68 3.08 7 6.97 4.45 1 10 9 -0.53 -1.05 0.09
## Q114 2 1208 2.93 0.76 3 2.94 1.48 1 4 3 -0.17 -0.59 0.02
# Test the association using lm
lm_model <- lm(Q107 ~ Q114, df_sub)
summary(lm_model)
##
## Call:
## lm(formula = Q107 ~ Q114, data = df_sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.4312 -2.4312 0.5688 2.5688 4.6649
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.6365 0.3499 13.250 < 2e-16 ***
## Q114 0.6987 0.1156 6.045 1.99e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.035 on 1206 degrees of freedom
## Multiple R-squared: 0.02941, Adjusted R-squared: 0.0286
## F-statistic: 36.54 on 1 and 1206 DF, p-value: 1.99e-09
# Fit the linear model with additional covariates
lm_model_cov <- lm(Q107 ~ Q114 + H_URBRURAL + J_INTDATE + Q262 + Q260, data = df_sub)
summary(lm_model_cov)
##
## Call:
## lm(formula = Q107 ~ Q114 + H_URBRURAL + J_INTDATE + Q262 + Q260,
## data = df_sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.751 -2.432 0.649 2.586 5.079
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.887e+05 5.657e+04 -3.336 0.000876 ***
## Q114 7.083e-01 1.144e-01 6.194 8.03e-10 ***
## H_URBRURAL 7.026e-01 1.846e-01 3.805 0.000149 ***
## J_INTDATE 9.347e-03 2.802e-03 3.336 0.000876 ***
## Q262 1.245e-02 5.501e-03 2.263 0.023795 *
## Q260 1.863e-01 1.736e-01 1.073 0.283333
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.001 on 1202 degrees of freedom
## Multiple R-squared: 0.05418, Adjusted R-squared: 0.05024
## F-statistic: 13.77 on 5 and 1202 DF, p-value: 4.24e-13
# Test the association using glm (assuming Gaussian family)
glm_model <- glm(Q107 ~ Q114, family = gaussian, df_sub)
summary(glm_model)
##
## Call:
## glm(formula = Q107 ~ Q114, family = gaussian, data = df_sub)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.6365 0.3499 13.250 < 2e-16 ***
## Q114 0.6987 0.1156 6.045 1.99e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 9.212431)
##
## Null deviance: 11447 on 1207 degrees of freedom
## Residual deviance: 11110 on 1206 degrees of freedom
## AIC: 6114.6
##
## Number of Fisher Scoring iterations: 2
# Fit the glm with additional covariates
glm_model_cov <- glm(Q107 ~ Q114 + H_URBRURAL + J_INTDATE + Q262 + Q260, data = df_sub)
summary(glm_model_cov)
##
## Call:
## glm(formula = Q107 ~ Q114 + H_URBRURAL + J_INTDATE + Q262 + Q260,
## data = df_sub)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.887e+05 5.657e+04 -3.336 0.000876 ***
## Q114 7.083e-01 1.144e-01 6.194 8.03e-10 ***
## H_URBRURAL 7.026e-01 1.846e-01 3.805 0.000149 ***
## J_INTDATE 9.347e-03 2.802e-03 3.336 0.000876 ***
## Q262 1.245e-02 5.501e-03 2.263 0.023795 *
## Q260 1.863e-01 1.736e-01 1.073 0.283333
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 9.007219)
##
## Null deviance: 11447 on 1207 degrees of freedom
## Residual deviance: 10827 on 1202 degrees of freedom
## AIC: 6091.4
##
## Number of Fisher Scoring iterations: 2
We will visualize the density distribution of the two variables to understand their spread and central tendencies.
# Plot the density distribution for Q107
ggplot(df_sub, aes(x = Q107)) +
geom_density(fill = "blue", alpha = 0.5) +
labs(title = "Density Distribution of Q107: Preference for Government Ownership of Business",
x = "Q107: Preference for Government Ownership of Business",
y = "Density") +
theme_minimal()
# Plot the density distribution for Q114
ggplot(df_sub, aes(x = Q114)) +
geom_density(fill = "blue", alpha = 0.5) +
labs(title = "Density Distribution of Q114: Perception of Corruption Among Business Executives",
x = "Q114: Perception of Corruption Among Business Executives",
y = "Density") +
theme_minimal()
We will visualize the results of the linear and generalized linear models.
# Visualize the results of the linear model
ggplot(df_sub, aes(y = Q107, x = Q114)) +
geom_smooth(method = "lm", color = "blue") +
labs(title = "Linear Model: Q107 vs Q114",
y = "Q107: Preference for Government Ownership of Business",
x = "Q114: Perception of Corruption Among Business Executives") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Visualize the results of the generalized linear model
ggplot(df_sub, aes(y = Q107, x = Q114)) +
geom_smooth(method = "glm", method.args = list(family = "gaussian"), color = "blue") +
labs(title = "Generalized Linear Model: Q107 vs Q114",
y = "Q107: Preference for Government Ownership of Business",
x = "Q114: Perception of Corruption Among Business Executives") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Visualize the age distribution of preferences
ggplot(df_sub, aes(y = Q107, x = Q262)) +
geom_smooth(method = "lm", color = "blue") +
labs(title = "Generalized Linear Model: Q107 vs Q114",
y = "Preference for Government Ownership of Business",
x = "Age") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
We will create an effect plot to visualize the effects of the variables in the model.
# Compute the effect for the linear model with covariates
effect_lm <- effect("Q114", lm_model_cov)
# Plot the effect for the linear model
plot(effect_lm,
main = "Linear Model Effect Plot",
xlab = "Q114: Perception of Corruption Among Business Executives",
ylab = "Q107: Preference for Government Ownership of Business",
grid = TRUE)
# Compute the effect for the generalized linear model with covariates
effect_glm <- effect("Q114", glm_model_cov)
# Plot the effect for the generalized linear model
plot(effect_glm,
main = "Generalized Linear Model Effect Plot",
xlab = "Q114: Perception of Corruption Among Business Executives",
ylab = "Q107: Preference for Government Ownership of Business",
grid = TRUE)
Finally, we will create a table of regression results using the
stargazer
package.
# Create a stargazer table with improved labels
stargazer(lm_model, lm_model_cov, glm_model, glm_model_cov, type = "html",
title = "Regression Results",
dep.var.labels.include = FALSE,
column.labels = c("OLS", "OLS with Covariates", "GLM", "GLM with Covariates"),
dep.var.labels = c("Perception of Corruption Among Business Executives"),
covariate.labels = c("Preference for Government Ownership of Business",
"Urban/Rural",
"Interview Date",
"Age",
"Sex"),
keep.stat = c("n", "rsq", "adj.rsq", "f", "ll", "aic", "bic"),
notes.append = TRUE,
notes.align = "r")
Dependent variable: | ||||
OLS | normal | |||
OLS | OLS with Covariates | GLM | GLM with Covariates | |
(1) | (2) | (3) | (4) | |
Preference for Government Ownership of Business | 0.699*** | 0.708*** | 0.699*** | 0.708*** |
(0.116) | (0.114) | (0.116) | (0.114) | |
Urban/Rural | 0.703*** | 0.703*** | ||
(0.185) | (0.185) | |||
Interview Date | 0.009*** | 0.009*** | ||
(0.003) | (0.003) | |||
Age | 0.012** | 0.012** | ||
(0.006) | (0.006) | |||
Sex | 0.186 | 0.186 | ||
(0.174) | (0.174) | |||
Constant | 4.636*** | -188,724.500*** | 4.636*** | -188,724.500*** |
(0.350) | (56,574.450) | (0.350) | (56,574.450) | |
Observations | 1,208 | 1,208 | 1,208 | 1,208 |
R2 | 0.029 | 0.054 | ||
Adjusted R2 | 0.029 | 0.050 | ||
Log Likelihood | -3,055.291 | -3,039.678 | ||
Akaike Inf. Crit. | 6,114.583 | 6,091.356 | ||
F Statistic | 36.541*** (df = 1; 1206) | 13.770*** (df = 5; 1202) | ||
Note: | p<0.1; p<0.05; p<0.01 |