Data Preparation

First, we load the World Values Survey (WVS) Wave 7 data collected in Tunisia in 2019 from a Stata file and inspect its structure.
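
The analysis uses several packages, which are assumed to be installed: haven for reading the Stata file, dplyr for data manipulation, corrplot for the correlation plot, psych for descriptive statistics, ggplot2 for figures, effects for effect plots, and stargazer for the regression table. We load them up front so the later chunks run as shown.

# Load the packages used throughout the analysis
library(haven)      # read_dta()
library(dplyr)      # mutate(), across(), %>%
library(corrplot)   # corrplot()
library(psych)      # describe()
library(ggplot2)    # plots
library(effects)    # effect()
library(stargazer)  # regression tables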

# Read the Stata file
df <- read_dta("~/Downloads/WVS_Wave_7_Tunisia_Stata_v5.0.dta")
# Display the first few rows of the dataset
head(df)
## # A tibble: 6 × 397
##   version       doi   A_YEAR     B_COUNTRY B_COUNTRY_ALPHA C_COW_NUM C_COW_ALPHA
##   <chr>         <chr> <dbl+lbl>  <dbl+lbl> <chr>           <dbl+lbl> <chr>      
## 1 1-5-0 (2020-… doi.… 2019 [201… 788 [Tun… TUN             616 [Tun… TUN        
## 2 1-5-0 (2020-… doi.… 2019 [201… 788 [Tun… TUN             616 [Tun… TUN        
## 3 1-5-0 (2020-… doi.… 2019 [201… 788 [Tun… TUN             616 [Tun… TUN        
## 4 1-5-0 (2020-… doi.… 2019 [201… 788 [Tun… TUN             616 [Tun… TUN        
## 5 1-5-0 (2020-… doi.… 2019 [201… 788 [Tun… TUN             616 [Tun… TUN        
## 6 1-5-0 (2020-… doi.… 2019 [201… 788 [Tun… TUN             616 [Tun… TUN        
## # ℹ 390 more variables: D_INTERVIEW <dbl>, J_INTDATE <dbl+lbl>, FW_START <dbl>,
## #   FW_END <dbl>, K_TIME_START <dbl+lbl>, K_TIME_END <dbl+lbl>,
## #   K_DURATION <dbl+lbl>, Q_MODE <dbl+lbl>, N_REGION_ISO <dbl+lbl>,
## #   N_REGION_WVS <dbl+lbl>, N_TOWN <dbl+lbl>, G_TOWNSIZE <dbl+lbl>,
## #   G_TOWNSIZE2 <dbl+lbl>, H_SETTLEMENT <dbl+lbl>, H_URBRURAL <dbl+lbl>,
## #   O1_LONGITUDE <dbl+lbl>, O2_LATITUDE <dbl+lbl>, S_INTLANGUAGE <dbl+lbl>,
## #   LNGE_ISO <chr>, E_RESPINT <dbl+lbl>, F_INTPRIVACY <dbl+lbl>, …

We then select the columns of interest for the analysis.

columns_of_interest <- c("Q107", 
                         "Q114",
                         "H_URBRURAL",
                         "J_INTDATE", 
                         "Q260",
                         "Q262")

df_sub <- df[, columns_of_interest]
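
For reference, a small lookup of what each selected item measures (descriptions follow the labels used later in this analysis) can be kept alongside the data; a minimal sketch:

# Reference table of the selected variables (labels as used in this analysis)
variable_labels <- tibble::tribble(
  ~variable,     ~description,
  "Q107",        "Preference for government vs. private ownership of business (1-10)",
  "Q114",        "Perceived corruption among business executives (1-4)",
  "H_URBRURAL",  "Urban or rural residence",
  "J_INTDATE",   "Interview date",
  "Q260",        "Sex of respondent",
  "Q262",        "Age of respondent"
)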

Data Cleaning

We will clean the selected variables by recoding negative values, which the WVS uses to mark missing responses (e.g., "Don't know" or "No answer"), to NA.

df_sub <- df_sub %>%
  mutate(Q107 = ifelse(Q107 < 0, NA, Q107),
         Q114 = ifelse(Q114 < 0, NA, Q114),
         Q260 = ifelse(Q260 < 0, NA, Q260))
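
Before imputing, it is worth checking how many missing values the recoding introduced and whether the remaining covariates contain any:

# Count missing values per column after recoding
colSums(is.na(df_sub))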

Check for Multicollinearity

We will check for multicollinearity by calculating and plotting the correlation matrix.

# Calculate correlation matrix
cor_matrix <- cor(df_sub, use = "complete.obs")
# Plot correlation matrix
corrplot(cor_matrix, method = "circle")
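
The selected columns are haven-labelled doubles, which cor() treats as numeric; if the labels ever cause trouble, an equivalent defensive variant strips them first:

# Defensive variant: strip value labels before computing correlations
cor_matrix <- cor(haven::zap_labels(df_sub), use = "complete.obs")
corrplot(cor_matrix, method = "circle")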

Data Imputation

We will perform simple mean imputation to handle the remaining missing values, replacing each NA with its column mean.

# Perform simple imputation
df_sub <- df_sub %>%
  mutate(across(everything(), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))
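
Mean imputation is quick but yields non-integer values for discrete survey codes such as Q114 and Q260. A hypothetical median-based variant, which would be run in place of the mean-imputation step above, is sketched below:

# Hypothetical alternative: impute with column medians instead of means,
# keeping coded variables on their original integer values
impute_median <- function(data) {
  data %>%
    mutate(across(everything(), ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))
}
# df_sub <- impute_median(df_sub)  # would replace the mean imputation above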

Statistical Analysis

We will test the association between the preference for government (versus private) ownership of business (Q107) and the perception of corruption among business executives (Q114) using linear and generalized linear models.

# Descriptive statistics for Q107 and Q114
descriptive_stats <- describe(df_sub[, c("Q107", "Q114")])

# Print the descriptive statistics
print(descriptive_stats)
##      vars    n mean   sd median trimmed  mad min max range  skew kurtosis   se
## Q107    1 1208 6.68 3.08      7    6.97 4.45   1  10     9 -0.53    -1.05 0.09
## Q114    2 1208 2.93 0.76      3    2.94 1.48   1   4     3 -0.17    -0.59 0.02

Linear Model

# Test the association using lm
lm_model <- lm(Q107 ~ Q114, data = df_sub)
summary(lm_model)
## 
## Call:
## lm(formula = Q107 ~ Q114, data = df_sub)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.4312 -2.4312  0.5688  2.5688  4.6649 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.6365     0.3499  13.250  < 2e-16 ***
## Q114          0.6987     0.1156   6.045 1.99e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.035 on 1206 degrees of freedom
## Multiple R-squared:  0.02941,    Adjusted R-squared:  0.0286 
## F-statistic: 36.54 on 1 and 1206 DF,  p-value: 1.99e-09
# Fit the linear model with additional covariates
lm_model_cov <- lm(Q107 ~ Q114 + H_URBRURAL + J_INTDATE + Q262 + Q260, data = df_sub)
summary(lm_model_cov)
## 
## Call:
## lm(formula = Q107 ~ Q114 + H_URBRURAL + J_INTDATE + Q262 + Q260, 
##     data = df_sub)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -6.751 -2.432  0.649  2.586  5.079 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.887e+05  5.657e+04  -3.336 0.000876 ***
## Q114         7.083e-01  1.144e-01   6.194 8.03e-10 ***
## H_URBRURAL   7.026e-01  1.846e-01   3.805 0.000149 ***
## J_INTDATE    9.347e-03  2.802e-03   3.336 0.000876 ***
## Q262         1.245e-02  5.501e-03   2.263 0.023795 *  
## Q260         1.863e-01  1.736e-01   1.073 0.283333    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.001 on 1202 degrees of freedom
## Multiple R-squared:  0.05418,    Adjusted R-squared:  0.05024 
## F-statistic: 13.77 on 5 and 1202 DF,  p-value: 4.24e-13
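
To complement the p-values, confidence intervals for the covariate-adjusted model can be pulled directly from the fit:

# 95% confidence intervals for the adjusted linear model
confint(lm_model_cov, level = 0.95)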

Generalized Linear Model

# Test the association using glm (assuming Gaussian family)
glm_model <- glm(Q107 ~ Q114, family = gaussian, data = df_sub)
summary(glm_model)
## 
## Call:
## glm(formula = Q107 ~ Q114, family = gaussian, data = df_sub)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.6365     0.3499  13.250  < 2e-16 ***
## Q114          0.6987     0.1156   6.045 1.99e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 9.212431)
## 
##     Null deviance: 11447  on 1207  degrees of freedom
## Residual deviance: 11110  on 1206  degrees of freedom
## AIC: 6114.6
## 
## Number of Fisher Scoring iterations: 2
# Fit the glm with additional covariates
glm_model_cov <- glm(Q107 ~ Q114 + H_URBRURAL + J_INTDATE + Q262 + Q260, data = df_sub)
summary(glm_model_cov)
## 
## Call:
## glm(formula = Q107 ~ Q114 + H_URBRURAL + J_INTDATE + Q262 + Q260, 
##     data = df_sub)
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.887e+05  5.657e+04  -3.336 0.000876 ***
## Q114         7.083e-01  1.144e-01   6.194 8.03e-10 ***
## H_URBRURAL   7.026e-01  1.846e-01   3.805 0.000149 ***
## J_INTDATE    9.347e-03  2.802e-03   3.336 0.000876 ***
## Q262         1.245e-02  5.501e-03   2.263 0.023795 *  
## Q260         1.863e-01  1.736e-01   1.073 0.283333    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 9.007219)
## 
##     Null deviance: 11447  on 1207  degrees of freedom
## Residual deviance: 10827  on 1202  degrees of freedom
## AIC: 6091.4
## 
## Number of Fisher Scoring iterations: 2
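
Because a Gaussian GLM with the default identity link is numerically equivalent to OLS, the coefficients mirror the linear models above; the two GLM specifications can still be compared on fit, for example via AIC:

# Compare the bivariate and covariate-adjusted GLMs (lower AIC indicates better fit)
AIC(glm_model, glm_model_cov)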

Data Visualization

Density Distribution Plots

We will visualize the density distribution of the two variables to understand their spread and central tendencies.

# Plot the density distribution for Q107
ggplot(df_sub, aes(x = Q107)) +
  geom_density(fill = "blue", alpha = 0.5) +
  labs(title = "Density Distribution of Q107: Preference for Government Ownership of Business",
       x = "Q107: Preference for Government Ownership of Business",
       y = "Density") +
  theme_minimal()

# Plot the density distribution for Q114
ggplot(df_sub, aes(x = Q114)) +
  geom_density(fill = "blue", alpha = 0.5) +
  labs(title = "Density Distribution of Q114: Perception of Corruption Among Business Executives",
       x = "Q114: Perception of Corruption Among Business Executives",
       y = "Density") +
  theme_minimal()

Regression Plots

We will visualize the fitted relationships from the linear and generalized linear models, along with the relationship between age and ownership preference.

# Visualize the results of the linear model
ggplot(df_sub, aes(y = Q107, x = Q114)) +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "Linear Model: Q107 vs Q114",
       y = "Q107: Preference for Government Ownership of Business",
       x = "Q114: Perception of Corruption Among Business Executives") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Visualize the results of the generalized linear model
ggplot(df_sub, aes(y = Q107, x = Q114)) +
  geom_smooth(method = "glm", method.args = list(family = "gaussian"), color = "blue") +
  labs(title = "Generalized Linear Model: Q107 vs Q114",
       y = "Q107: Preference for Government Ownership of Business",
       x = "Q114: Perception of Corruption Among Business Executives") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
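
Since both items are discrete response scales, the smoothed line alone hides the raw data; one option is to overlay jittered responses behind the fitted OLS line, for example:

# Overlay jittered raw responses behind the fitted line (illustrative sketch)
ggplot(df_sub, aes(x = Q114, y = Q107)) +
  geom_jitter(width = 0.2, height = 0.2, alpha = 0.2) +
  geom_smooth(method = "lm", color = "blue") +
  labs(x = "Q114: Perception of Corruption Among Business Executives",
       y = "Q107: Preference for Government Ownership of Business") +
  theme_minimal()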

# Visualize how ownership preference varies with respondent age
ggplot(df_sub, aes(y = Q107, x = Q262)) +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "Generalized Linear Model: Q107 vs Q114",
       y = "Preference for Government Ownership of Business",
       x = "Age") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Effect Plot

We will create an effect plot to visualize the effects of the variables in the model.

# Compute the effect for the linear model with covariates
effect_lm <- effect("Q114", lm_model_cov)

# Plot the effect for the linear model
plot(effect_lm,
     main = "Linear Model Effect Plot",
     xlab = "Q114: Perception of Corruption Among Business Executives",
     ylab = "Q107: Preference for Government Ownership of Business",
     grid = TRUE)

# Compute the effect for the generalized linear model with covariates
effect_glm <- effect("Q114", glm_model_cov)

# Plot the effect for the generalized linear model
plot(effect_glm,
     main = "Generalized Linear Model Effect Plot",
     xlab = "Q114: Perception of Corruption Among Business Executives",
     ylab = "Q107: Preference for Government Ownership of Business",
     grid = TRUE)

Reporting Results

Finally, we will create a table of regression results using the stargazer package.

# Create a stargazer table with improved labels
stargazer(lm_model, lm_model_cov, glm_model, glm_model_cov, type = "html",
          title = "Regression Results",
          dep.var.labels.include = FALSE,
          column.labels = c("OLS", "OLS with Covariates", "GLM", "GLM with Covariates"),
          dep.var.labels = c("Perception of Corruption Among Business Executives"),
          covariate.labels = c("Preference for Government Ownership of Business", 
                               "Urban/Rural", 
                               "Interview Date",
                               "Age",
                               "Sex"),
          keep.stat = c("n", "rsq", "adj.rsq", "f", "ll", "aic", "bic"),
          notes.append = TRUE,
          notes.align = "r")
Regression Results

                                     Dependent variable: Q107 (Preference for Government Ownership of Business)
                                     ---------------------------------------------------------------------------
                                        OLS            OLS with             GLM             GLM with
                                                       Covariates                           Covariates
                                        (1)               (2)               (3)                (4)
---------------------------------------------------------------------------------------------------------------
Perception of Corruption Among        0.699***         0.708***           0.699***          0.708***
  Business Executives                 (0.116)          (0.114)            (0.116)           (0.114)
Urban/Rural                                            0.703***                             0.703***
                                                       (0.185)                              (0.185)
Interview Date                                         0.009***                             0.009***
                                                       (0.003)                              (0.003)
Age                                                    0.012**                              0.012**
                                                       (0.006)                              (0.006)
Sex                                                    0.186                                0.186
                                                       (0.174)                              (0.174)
Constant                              4.636***      -188,724.500***       4.636***       -188,724.500***
                                      (0.350)        (56,574.450)         (0.350)         (56,574.450)
---------------------------------------------------------------------------------------------------------------
Observations                          1,208            1,208              1,208             1,208
R2                                    0.029            0.054
Adjusted R2                           0.029            0.050
Log Likelihood                                                           -3,055.291        -3,039.678
Akaike Inf. Crit.                                                         6,114.583         6,091.356
F Statistic                           36.541***        13.770***
                                      (df = 1; 1206)   (df = 5; 1202)
---------------------------------------------------------------------------------------------------------------
Note: *p<0.1; **p<0.05; ***p<0.01