Data Preparation

First, we load the World Value Survey data collected in Tunisia in 2019 from a Stata file and inspect its structure.

# Read the Stata file
df2019 <- read_dta("~/Downloads/WVS_Wave_7_Tunisia_Stata_v5.0.dta")

We then select the columns of interest for the analysis.

columns_of_interest <- c("Q107", 
                         "Q114",
                         "J_INTDATE", 
                         "Q260",
                         "Q262",
                         "Q245",
                         "Q235",
                         "Q237")

df2019_sub <- df2019[, columns_of_interest]

Data Cleaning

We will clean the selected data by replacing negative values with NA.

df2019_sub <- df2019_sub %>%
  mutate(Q107 = ifelse(Q107 < 0, NA, Q107),
          Q237 = ifelse(Q237 < 0, NA, Q237),

                   Q235 = ifelse(Q235 < 0, NA, Q235),
                   Q245 = ifelse(Q245 < 0, NA, Q245),
         Q114 = ifelse(Q114 < 0, NA, Q114),
         Q260 = ifelse(Q260 < 0, NA, Q260))

Check for Multicollinearity

We will check for multicollinearity by calculating and plotting the correlation matrix.

# Calculate correlation matrix
cor_matrix_2019 <- cor(df2019_sub, use = "complete.obs")

# Create more descriptive variable labels
var_labels <- c(
  "Q107" = "Gov't Ownership Preference",
  "Q114" = "Perceived Corruption",
  "J_INTDATE" = "Interview Date",
  "Q260" = "Urban/Rural",
  "Q262" = "Age",
  "Q245" = "Coup Acceptance",
  "Q235"= "Strongman Rule",
  "Q237" = "Army Rule"
)

# Melt the correlation matrix
melted_cormat <- melt(cor_matrix_2019)

# Create the plot
ggplot(data = melted_cormat, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "#002F6C", high = "#BA0C2F", mid = "white", 
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name="Correlation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        panel.grid.major = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank(),
        axis.ticks = element_blank(),
        legend.justification = c(1, 0),
        legend.position = c(1.5, 0.3),
        legend.direction = "vertical") +
  coord_fixed() +
  geom_text(aes(label = round(value, 2)), color = "black", size = 4) +
  scale_x_discrete(labels = var_labels) +
  scale_y_discrete(labels = var_labels) +
  ggtitle("Correlation Matrix of Key Variables (2019)") +
  labs(subtitle = "Tunisia World Values Survey Data",
       caption = "Source: World Values Survey, Wave 7 (2019)")
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
## 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Data Imputation

We will perform simple imputation to handle missing values.

# Perform simple imputation
df2019_sub <- df2019_sub %>%
  mutate(across(everything(), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))

Statistical Analysis

We will test the association between business ownership preference (Q107) and the perception of corruption among business executives (Q114) using linear and generalized linear models.

# Descriptive statistics for Q107 and Q114
descriptive_stats2019 <- describe(df2019_sub[, c("Q107", "Q114", "Q245", "Q237")])

# Print the descriptive statistics
print(descriptive_stats2019)
##      vars    n mean   sd median trimmed  mad min max range  skew kurtosis   se
## Q107    1 1208 6.68 3.08   7.00    6.97 4.45   1  10     9 -0.53    -1.05 0.09
## Q114    2 1208 2.93 0.76   3.00    2.94 1.48   1   4     3 -0.17    -0.59 0.02
## Q245    3 1208 5.56 3.14   5.56    5.57 3.79   1  10     9 -0.04    -1.26 0.09
## Q237    4 1208 2.87 0.94   3.00    2.96 1.48   1   4     3 -0.55    -0.52 0.03

Linear Model

# Test the association using lm
lm_model2019 <- lm(Q107 ~ Q114, df2019_sub)
summary(lm_model2019)
## 
## Call:
## lm(formula = Q107 ~ Q114, data = df2019_sub)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.4312 -2.4312  0.5688  2.5688  4.6649 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.6365     0.3499  13.250  < 2e-16 ***
## Q114          0.6987     0.1156   6.045 1.99e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.035 on 1206 degrees of freedom
## Multiple R-squared:  0.02941,    Adjusted R-squared:  0.0286 
## F-statistic: 36.54 on 1 and 1206 DF,  p-value: 1.99e-09
# Fit the linear model with additional covariates
lm_model2019_cov <- lm(Q107 ~ Q114  + J_INTDATE + Q262 + Q260, data = df2019_sub)
summary(lm_model2019_cov)
## 
## Call:
## lm(formula = Q107 ~ Q114 + J_INTDATE + Q262 + Q260, data = df2019_sub)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.7162 -2.4267  0.6046  2.6498  4.9564 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.803e+05  5.685e+04  -3.172  0.00155 ** 
## Q114         7.112e-01  1.150e-01   6.185 8.49e-10 ***
## J_INTDATE    8.930e-03  2.816e-03   3.172  0.00155 ** 
## Q262         1.287e-02  5.531e-03   2.328  0.02010 *  
## Q260         1.785e-01  1.746e-01   1.022  0.30683    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.018 on 1203 degrees of freedom
## Multiple R-squared:  0.04278,    Adjusted R-squared:  0.0396 
## F-statistic: 13.44 on 4 and 1203 DF,  p-value: 1.012e-10

Generalized Linear Model

# Test the association using glm (assuming Gaussian family)
glm_model2019 <- glm(Q107 ~ Q114, family = gaussian, df2019_sub)
summary(glm_model2019)
## 
## Call:
## glm(formula = Q107 ~ Q114, family = gaussian, data = df2019_sub)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.6365     0.3499  13.250  < 2e-16 ***
## Q114          0.6987     0.1156   6.045 1.99e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 9.212431)
## 
##     Null deviance: 11447  on 1207  degrees of freedom
## Residual deviance: 11110  on 1206  degrees of freedom
## AIC: 6114.6
## 
## Number of Fisher Scoring iterations: 2
# Fit the glm with additional covariates
glm_model2019_cov <- glm(Q107 ~ Q114  + J_INTDATE + Q262 + Q260, data = df2019_sub)
summary(glm_model2019_cov)
## 
## Call:
## glm(formula = Q107 ~ Q114 + J_INTDATE + Q262 + Q260, data = df2019_sub)
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.803e+05  5.685e+04  -3.172  0.00155 ** 
## Q114         7.112e-01  1.150e-01   6.185 8.49e-10 ***
## J_INTDATE    8.930e-03  2.816e-03   3.172  0.00155 ** 
## Q262         1.287e-02  5.531e-03   2.328  0.02010 *  
## Q260         1.785e-01  1.746e-01   1.022  0.30683    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 9.108157)
## 
##     Null deviance: 11447  on 1207  degrees of freedom
## Residual deviance: 10957  on 1203  degrees of freedom
## AIC: 6103.8
## 
## Number of Fisher Scoring iterations: 2

Data Visualization

Density Distribution Plots

We will visualize the density distribution of the two variables to understand their spread and central tendencies.

# Plot the density distribution for Q107
ggplot(df2019_sub, aes(x = Q107)) +
  geom_density(fill = "#002F6C", alpha = 0.7) +
  labs(title = "Distribution of Preference for Government Ownership of Business",
       subtitle = "Tunisia, 2019",
       x = "Preference for Government Ownership (1 = Private, 10 = Government)",
       y = "Density",
       caption = "Source: World Values Survey, Wave 7 (2019)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  ) +
  scale_x_continuous(breaks = seq(1, 10, 1))

# Plot the density distribution for Q114
ggplot(df2019_sub, aes(x = Q114)) +
  geom_density(fill = "#BA0C2F", alpha = 0.7) +
  labs(title = "Distribution of Perceived Corruption Among Business Executives",
       subtitle = "Tunisia, 2019",
       x = "Perception of Corruption (1 = None, 10 = All)",
       y = "Density",
       caption = "Source: World Values Survey, Wave 7 (2019)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  ) +
  scale_x_continuous(breaks = seq(1, 10, 1))

Regression Plots

We will visualize the results of the linear and generalized linear models.

library(ggplot2)

# Visualize the results of the linear model
ggplot(df2019_sub, aes(y = Q107, x = Q114)) +
  geom_smooth(method = "lm", color = "#BA0C2F", fill = "#BA0C2F", alpha = 0.2) +
  labs(title = "Perceived Corruption and Government Ownership Preference",
       subtitle = "Linear Model (2019)",
       y = "Preference for Government Ownership of Business",
       x = "Perception of Corruption Among Business Executives",
       caption = "Source: World Values Survey, Wave 7 (2019)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  )
## `geom_smooth()` using formula = 'y ~ x'

# Visualize the results of the generalized linear model
ggplot(df2019_sub, aes(y = Q107, x = Q114)) +
  geom_smooth(method = "glm", method.args = list(family = "gaussian"), 
              color = "#BA0C2F", fill = "#BA0C2F", alpha = 0.2) +
  labs(title = "Perceived Corruption and Government Ownership Preference",
       subtitle = "Generalized Linear Model (2019)",
       y = "Preference for Government Ownership of Business",
       x = "Perception of Corruption Among Business Executives",
       caption = "Source: World Values Survey, Wave 7 (2019)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  )
## `geom_smooth()` using formula = 'y ~ x'

# Visualize the age distribution of preferences
ggplot(df2019_sub, aes(y = Q107, x = Q262)) +
  geom_smooth(method = "lm", color = "#BA0C2F", fill = "#BA0C2F", alpha = 0.2) +
  labs(title = "Age and Government Ownership Preference",
       subtitle = "Linear Model (2019)",
       y = "Preference for Government Ownership of Business",
       x = "Age",
       caption = "Source: World Values Survey, Wave 7 (2019)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  )
## `geom_smooth()` using formula = 'y ~ x'

Effect Plot

We will create an effect plot to visualize the effects of the variables in the model.

# Compute the effect for the linear model with covariates
effect_lm2019 <- effect("Q114", lm_model2019_cov)

# Convert effect object to data frame
effect_lm_df <- as.data.frame(effect_lm2019)

# Plot the effect for the linear model
ggplot(effect_lm_df, aes(x = Q114, y = fit)) +
  geom_line(color = "#002F6C", size = 1) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2, fill = "#002F6C") +
  labs(title = "Effect of Perceived Corruption on Government Ownership Preference",
       subtitle = "Linear Model (2019)",
       x = "Perception of Corruption Among Business Executives",
       y = "Preference for Government Ownership of Business",
       caption = "Source: World Values Survey, Wave 7 (2019)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Compute the effect for the generalized linear model with covariates
effect_glm2019 <- effect("Q114", glm_model2019_cov)

# Convert effect object to data frame
effect_glm_df <- as.data.frame(effect_glm2019)

# Plot the effect for the generalized linear model
ggplot(effect_glm_df, aes(x = Q114, y = fit)) +
  geom_line(color = "#BA0C2F", size = 1) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2, fill = "#BA0C2F") +
  labs(title = "Effect of Perceived Corruption on Government Ownership Preference",
       subtitle = "Generalized Linear Model (2019)",
       x = "Perception of Corruption Among Business Executives",
       y = "Preference for Government Ownership of Business",
       caption = "Source: World Values Survey, Wave 7 (2019)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  )

Reporting Results

Finally, we will create a table of regression results using the stargazer package.

# Create a stargazer table with improved labels
stargazer(lm_model2019, lm_model2019_cov, glm_model2019, glm_model2019_cov, type = "html",
          title = "Regression Results",
          dep.var.labels.include = FALSE,
          column.labels = c("OLS", "OLS with Covariates", "GLM", "GLM with Covariates"),
          dep.var.labels = c("Preference for Government Ownership of Business"),
          covariate.labels = c("Perception of Corruption Among Business Executives", 
                               "Urban/Rural", 
                               "Interview Date",
                               "Age",
                               "Sex"),
          keep.stat = c("n", "rsq", "adj.rsq", "f", "ll", "aic", "bic"),
          notes.append = TRUE,
          notes.align = "r")
Regression Results
Dependent variable:
OLS normal
OLS OLS with Covariates GLM GLM with Covariates
(1) (2) (3) (4)
Perception of Corruption Among Business Executives 0.699*** 0.711*** 0.699*** 0.711***
(0.116) (0.115) (0.116) (0.115)
Urban/Rural 0.009*** 0.009***
(0.003) (0.003)
Interview Date 0.013** 0.013**
(0.006) (0.006)
Age 0.178 0.178
(0.175) (0.175)
Sex 4.636*** -180,293.000*** 4.636*** -180,293.000***
(0.350) (56,846.910) (0.350) (56,846.910)
Observations 1,208 1,208 1,208 1,208
R2 0.029 0.043
Adjusted R2 0.029 0.040
Log Likelihood -3,055.291 -3,046.911
Akaike Inf. Crit. 6,114.583 6,103.823
F Statistic 36.541*** (df = 1; 1206) 13.442*** (df = 4; 1203)
Note: p<0.1; p<0.05; p<0.01

Data Preparation

Now, we load the World Value Survey data collected in Tunisia in 2013 from a Stata file and inspect its structure.

# Read the Stata file
df2013 <- read_dta("~/Downloads/WV6_Data_Tunisia_Stata_v20201117.dta")

We then select the columns of interest for the analysis.

columns_of_interest <- c("V97", #Q107 in 2019
                         "MN_228M",#Q114 in 2019
                         "V261", #J_INTDATE in 2019
                         "V240", #Q260 in 2019
                         "V242",#Q262 in 2019
                         "V129", #Q237 in 2019
                         "V127",#Q235 in 2019
                         "V135") #Q245 in 2019


df2013_sub <- df2013[, columns_of_interest]

Data Cleaning

We will clean the selected data by replacing negative values with NA.

df2013_sub <- df2013_sub %>%
  mutate(V97 = ifelse(V97 < 0, NA, V97),
         V135= ifelse(V135 < 0, NA, V135),
                  V129= ifelse(V129 < 0, NA, V129),
                           V127= ifelse(V127 < 0, NA, V127),
         MN_228M = ifelse(MN_228M < 0, NA, MN_228M),
         V240 = ifelse(V240 < 0, NA, V240))

Check for Multicollinearity

We will check for multicollinearity by calculating and plotting the correlation matrix.

# Calculate correlation matrix
cor_matrix_2013 <- cor(df2013_sub, use = "complete.obs")

# Create more descriptive variable labels
var_labels <- c(
  "Q107" = "Gov't Ownership Preference",
  "Q114" = "Perceived Corruption",
  "J_INTDATE" = "Interview Date",
  "Q260" = "Urban/Rural",
  "Q262" = "Age"
)

# Melt the correlation matrix
melted_cormat <- melt(cor_matrix_2013)

# Create the plot
ggplot(data = melted_cormat, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "#002F6C", high = "#BA0C2F", mid = "white", 
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name="Correlation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        panel.grid.major = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank(),
        axis.ticks = element_blank(),
        legend.justification = c(1, 0),
        legend.position = c(1.5, 0.3),
        legend.direction = "vertical") +
  coord_fixed() +
  geom_text(aes(label = round(value, 2)), color = "black", size = 4) +
  scale_x_discrete(labels = var_labels) +
  scale_y_discrete(labels = var_labels) +
  ggtitle("Correlation Matrix of Key Variables (2013)") +
  labs(subtitle = "Tunisia World Values Survey Data",
       caption = "Source: World Values Survey, Wave 6 (2013)")

Data Imputation

We will perform simple imputation to handle missing values.

# Perform simple imputation
df2013_sub <- df2013_sub %>%
  mutate(across(everything(), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))

Statistical Analysis

We will test the association between business ownership preference (V97) and the perception of corruption among business executives (MN_228M) using linear and generalized linear models.

# Descriptive statistics for V97 and MN_228M
descriptive_stats2013 <- describe(df2013_sub[, c("V97", "MN_228M", "V135","V129")])

# Print the descriptive statistics
print(descriptive_stats2013)
##         vars    n mean   sd median trimmed  mad min max range  skew kurtosis
## V97        1 1205 5.91 2.34   5.91    5.96 1.34   1  10     9 -0.09    -0.09
## MN_228M    2 1205 7.47 2.47   8.00    7.74 2.97   1  10     9 -0.65    -0.55
## V135       3 1205 4.85 3.19   4.85    4.69 4.22   1  10     9  0.30    -1.17
## V129       4 1205 2.79 1.05   3.00    2.86 1.48   1   4     3 -0.41    -0.99
##           se
## V97     0.07
## MN_228M 0.07
## V135    0.09
## V129    0.03

Linear Model

# Test the association using lm
lm_model2013 <- lm(V97 ~ MN_228M, df2013_sub)
summary(lm_model2013)
## 
## Call:
## lm(formula = V97 ~ MN_228M, data = df2013_sub)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.2320 -1.0311  0.0191  1.1698  4.2201 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.28228    0.21483  29.243   <2e-16 ***
## MN_228M     -0.05024    0.02732  -1.839   0.0662 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.338 on 1203 degrees of freedom
## Multiple R-squared:  0.002803,   Adjusted R-squared:  0.001974 
## F-statistic: 3.381 on 1 and 1203 DF,  p-value: 0.06619
# Fit the linear model with additional covariates
lm_model2013_cov <- lm(V97 ~ MN_228M  + V261 + V242 + V240, data = df2013_sub)
summary(lm_model2013_cov)
## 
## Call:
## lm(formula = V97 ~ MN_228M + V261 + V242 + V240, data = df2013_sub)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3839 -0.9975 -0.0611  1.2373  4.4311 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  2.528e+04  3.741e+04   0.676   0.4993  
## MN_228M     -5.218e-02  2.740e-02  -1.905   0.0571 .
## V261        -1.255e-03  1.858e-03  -0.676   0.4994  
## V242         6.387e-03  4.161e-03   1.535   0.1251  
## V240         1.215e-01  1.352e-01   0.899   0.3689  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.338 on 1200 degrees of freedom
## Multiple R-squared:  0.005941,   Adjusted R-squared:  0.002627 
## F-statistic: 1.793 on 4 and 1200 DF,  p-value: 0.1279

Generalized Linear Model

# Test the association using glm (assuming Gaussian family)
glm_model2013 <- glm(V97 ~ MN_228M, family = gaussian, df2013_sub)
summary(glm_model2013)
## 
## Call:
## glm(formula = V97 ~ MN_228M, family = gaussian, data = df2013_sub)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.28228    0.21483  29.243   <2e-16 ***
## MN_228M     -0.05024    0.02732  -1.839   0.0662 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 5.467589)
## 
##     Null deviance: 6596.0  on 1204  degrees of freedom
## Residual deviance: 6577.5  on 1203  degrees of freedom
## AIC: 5470.7
## 
## Number of Fisher Scoring iterations: 2
# Fit the glm with additional covariates
glm_model2013_cov <- glm(V97 ~ MN_228M  + V261 + V242 + V240, data = df2013_sub)
summary(glm_model2013_cov)
## 
## Call:
## glm(formula = V97 ~ MN_228M + V261 + V242 + V240, data = df2013_sub)
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  2.528e+04  3.741e+04   0.676   0.4993  
## MN_228M     -5.218e-02  2.740e-02  -1.905   0.0571 .
## V261        -1.255e-03  1.858e-03  -0.676   0.4994  
## V242         6.387e-03  4.161e-03   1.535   0.1251  
## V240         1.215e-01  1.352e-01   0.899   0.3689  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 5.46401)
## 
##     Null deviance: 6596.0  on 1204  degrees of freedom
## Residual deviance: 6556.8  on 1200  degrees of freedom
## AIC: 5472.9
## 
## Number of Fisher Scoring iterations: 2

Data Visualization

Density Distribution Plots

We will visualize the density distribution of the two variables to understand their spread and central tendencies.

library(ggplot2)

# Plot the density distribution for V97
ggplot(df2013_sub, aes(x = V97)) +
  geom_density(fill = "#002F6C", alpha = 0.7) +
  labs(title = "Distribution of Preference for Government Ownership of Business",
       subtitle = "Tunisia, 2013",
       x = "Preference for Government Ownership (1 = Private, 10 = Government)",
       y = "Density",
       caption = "Source: World Values Survey, Wave 6 (2013)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  ) +
  scale_x_continuous(breaks = seq(1, 10, 1))

# Plot the density distribution for MN_228M
ggplot(df2013_sub, aes(x = MN_228M)) +
  geom_density(fill = "#BA0C2F", alpha = 0.7) +
  labs(title = "Distribution of Perceived Corruption Among Business Executives",
       subtitle = "Tunisia, 2013",
       x = "Perception of Corruption (1 = None, 10 = All)",
       y = "Density",
       caption = "Source: World Values Survey, Wave 6 (2013)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  ) +
  scale_x_continuous(breaks = seq(1, 10, 1))

Regression Plots

We will visualize the results of the linear and generalized linear models.

# Visualize the results of the linear model
ggplot(df2013_sub, aes(y = V97, x = MN_228M)) +
  geom_smooth(method = "lm", color = "#002F6C", fill = "#002F6C", alpha = 0.2) +
  labs(title = "Linear Model: Government Ownership Preference vs Perceived Corruption",
       subtitle = "Tunisia, 2013",
       y = "Preference for Government Ownership of Business",
       x = "Perception of Corruption Among Business Executives",
       caption = "Source: World Values Survey, Wave 6 (2013)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  ) +
  scale_x_continuous(breaks = seq(1, 10, 1)) +
  scale_y_continuous(breaks = seq(1, 10, 1))
## `geom_smooth()` using formula = 'y ~ x'

# Visualize the results of the generalized linear model
ggplot(df2013_sub, aes(y = V97, x = MN_228M)) +
  geom_smooth(method = "glm", method.args = list(family = "gaussian"), color = "#BA0C2F", fill = "#BA0C2F", alpha = 0.2) +
  labs(title = "Generalized Linear Model: Government Ownership Preference vs Perceived Corruption",
       subtitle = "Tunisia, 2013",
       y = "Preference for Government Ownership of Business",
       x = "Perception of Corruption Among Business Executives",
       caption = "Source: World Values Survey, Wave 6 (2013)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  ) +
  scale_x_continuous(breaks = seq(1, 10, 1)) +
  scale_y_continuous(breaks = seq(1, 10, 1))
## `geom_smooth()` using formula = 'y ~ x'

# Visualize the age distribution of preferences
ggplot(df2013_sub, aes(y = V97, x = V242)) +
  geom_smooth(method = "lm", color = "#002F6C", fill = "#002F6C", alpha = 0.2) +
  labs(title = "Age Distribution of Government Ownership Preferences",
       subtitle = "Tunisia, 2013",
       y = "Preference for Government Ownership of Business",
       x = "Age",
       caption = "Source: World Values Survey, Wave 6 (2013)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  ) +
  scale_x_continuous(breaks = seq(18, max(df2013_sub$V242, na.rm = TRUE), by = 10)) +
  scale_y_continuous(breaks = seq(1, 10, 1))
## `geom_smooth()` using formula = 'y ~ x'

Effect Plot

We will create an effect plot to visualize the effects of the variables in the model.

library(ggplot2)
library(effects)

# Compute the effect for the linear model with covariates
effect_lm2013 <- effect("MN_228M", lm_model2013_cov)

# Convert effect object to data frame
effect_lm_df <- as.data.frame(effect_lm2013)

# Plot the effect for the linear model
ggplot(effect_lm_df, aes(x = MN_228M, y = fit)) +
  geom_line(color = "#002F6C", size = 1) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2, fill = "#002F6C") +
  labs(title = "Effect of Perceived Corruption on Government Ownership Preference",
       subtitle = "Linear Model (2013)",
       x = "Perception of Corruption Among Business Executives",
       y = "Preference for Government Ownership of Business",
       caption = "Source: World Values Survey, Wave 6 (2013)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  )

# Compute the effect for the generalized linear model with covariates
effect_glm2013 <- effect("MN_228M", glm_model2013_cov)

# Convert effect object to data frame
effect_glm_df <- as.data.frame(effect_glm2013)

# Plot the effect for the generalized linear model
ggplot(effect_glm_df, aes(x = MN_228M, y = fit)) +
  geom_line(color = "#BA0C2F", size = 1) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2, fill = "#BA0C2F") +
  labs(title = "Effect of Perceived Corruption on Government Ownership Preference",
       subtitle = "Generalized Linear Model (2013)",
       x = "Perception of Corruption Among Business Executives",
       y = "Preference for Government Ownership of Business",
       caption = "Source: World Values Survey, Wave 6 (2013)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  )

Reporting Results

Finally, we will create a table of regression results using the stargazer package.

# Create a stargazer table with improved labels
stargazer(lm_model2013, lm_model2013_cov, glm_model2013, glm_model2013_cov, type = "html",
          title = "Regression Results",
          dep.var.labels.include = FALSE,
          column.labels = c("OLS", "OLS with Covariates", "GLM", "GLM with Covariates"),
          dep.var.labels = c("Preference for Government Ownership of Business"),
          covariate.labels = c("Perception of Corruption Among Business Executives", 
                               "Urban/Rural", 
                               "Interview Date",
                               "Age",
                               "Sex"),
          keep.stat = c("n", "rsq", "adj.rsq", "f", "ll", "aic", "bic"),
          notes.append = TRUE,
          notes.align = "r")
Regression Results
Dependent variable:
OLS normal
OLS OLS with Covariates GLM GLM with Covariates
(1) (2) (3) (4)
Perception of Corruption Among Business Executives -0.050* -0.052* -0.050* -0.052*
(0.027) (0.027) (0.027) (0.027)
Urban/Rural -0.001 -0.001
(0.002) (0.002)
Interview Date 0.006 0.006
(0.004) (0.004)
Age 0.122 0.122
(0.135) (0.135)
Sex 6.282*** 25,277.960 6.282*** 25,277.960
(0.215) (37,408.500) (0.215) (37,408.500)
Observations 1,205 1,205 1,205 1,205
R2 0.003 0.006
Adjusted R2 0.002 0.003
Log Likelihood -2,733.370 -2,731.471
Akaike Inf. Crit. 5,470.740 5,472.942
F Statistic 3.381* (df = 1; 1203) 1.793 (df = 4; 1200)
Note: p<0.1; p<0.05; p<0.01
# Load required libraries if not already loaded
library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
## 
##     alpha, rescale
# Combine the data from both years
df_combined <- rbind(
  data.frame(year = "2013", preference = df2013_sub$V97),
  data.frame(year = "2019", preference = df2019_sub$Q107)
)

# Create the overlapping density plot
ggplot(df_combined, aes(x = preference, fill = year)) +
  geom_density(alpha = 0.7) +
  labs(title = "Preference for Government Ownership of Business in Tunisia",
       subtitle = "Comparison between 2013 and 2019",
       x = "Preference for Government Ownership (1 = Private, 10 = Government)",
       y = "Density",
       fill = "Year",
       caption = "Source: World Values Survey, Waves 6 (2013) and 7 (2019)") +
  scale_fill_manual(values = c("2013" = "#002F6C", "2019" = "#BA0C2F")) + 
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    legend.title = element_text(face = "bold", color = "#002F6C"),
    legend.text = element_text(color = "#002F6C"),
    legend.position = "top",
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  ) +
  scale_x_continuous(breaks = seq(1, 10, 1)) +
  scale_y_continuous(labels = scales::percent_format(scale = 1))

# Load required libraries
library(ggplot2)
library(dplyr)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
## 
##     smiths
library(ggpubr)

# Combine the data from both years (assuming df2013_sub and df2019_sub exist)
df_combined <- rbind(
  data.frame(year = "2013", preference = df2013_sub$V97),
  data.frame(year = "2019", preference = df2019_sub$Q107)
)

# Calculate summary statistics
summary_stats <- df_combined %>%
  group_by(year) %>%
  summarise(
    mean = mean(preference, na.rm = TRUE),
    median = median(preference, na.rm = TRUE),
    lower = quantile(preference, 0.25, na.rm = TRUE),
    upper = quantile(preference, 0.75, na.rm = TRUE)
  )

# Perform t-test
t_test_result <- t.test(preference ~ year, data = df_combined)
p_value <- t_test_result$p.value
significance <- if(p_value < 0.05) "Statistically Significant" else "Not Statistically Significant"

# Create the improved plot
plot <- ggplot(summary_stats, aes(x = year, y = median, color = year)) +
  geom_pointrange(aes(ymin = lower, ymax = upper), size = 1.5, fatten = 3) +
  geom_text(aes(y = upper, label = sprintf("Median: %.1f", median)), 
            vjust = -1, hjust = 0.5, size = 4, fontface = "bold") +
  labs(title = "Preference for Government Ownership of Business in Tunisia",
       subtitle = "Comparison between 2013 and 2019",
       x = "Year",
       y = "Preference for Government Ownership",
       caption = paste("Source: World Values Survey, Waves 6 (2013) and 7 (2019)",
                       "\nBars show interquartile range",
                       "\np-value:", format.pval(p_value, digits = 3))) +
  scale_color_manual(values = c("2013" = "#002F6C", "2019" = "#BA0C2F")) +
  scale_y_continuous(breaks = seq(1, 10, 1),
                     labels = c("1\n(Private\nOwnership)", "2", "3", "4", "5", "6", "7", "8", "9", "10\n(Government\nOwnership)"),
                     limits = c(1, 10)) +
  coord_flip() +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 20, hjust = 0.5),
    plot.subtitle = element_text(size = 16, hjust = 0.5),
    axis.title.x = element_text(face = "bold", size = 14, margin = margin(t = 10)),
    axis.title.y = element_blank(),
    axis.text = element_text(size = 12),
    legend.position = "none",
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, size = 10, margin = margin(t = 20)),
    plot.margin = margin(20, 20, 20, 20)
  )

# Display the plot
print(plot)

# Print detailed t-test results
print(t_test_result)
## 
##  Welch Two Sample t-test
## 
## data:  preference by year
## t = -6.9847, df = 2252.5, p-value = 3.743e-12
## alternative hypothesis: true difference in means between group 2013 and group 2019 is not equal to 0
## 95 percent confidence interval:
##  -0.9960414 -0.5593524
## sample estimates:
## mean in group 2013 mean in group 2019 
##           5.907177           6.684874
# Load required libraries
library(ggplot2)
library(dplyr)
library(knitr)
library(tidyr)

# Function to find the correct column name
find_column <- function(df, possible_names) {
  for (name in possible_names) {
    if (name %in% names(df)) {
      return(name)
    }
  }
  stop("Could not find a matching column name")
}

# Find the correct column names
col_2013 <- find_column(df2013_sub, c("V97", "Q107", "government_ownership"))
col_2019 <- find_column(df2019_sub, c("Q107", "V97", "government_ownership"))

# Create a contingency table
cont_table <- table(Year = c(rep("2013", nrow(df2013_sub)), rep("2019", nrow(df2019_sub))),
                    Preference = c(df2013_sub[[col_2013]], df2019_sub[[col_2019]]))

# Perform Fisher's Exact Test
fisher_test_result <- fisher.test(cont_table, simulate.p.value = TRUE, B = 10000)

# Create a table of results
result_table <- data.frame(
  P_Value = fisher_test_result$p.value,
  Significant = ifelse(fisher_test_result$p.value < 0.05, "Yes", "No")
)

# Print the test results table
kable(result_table, caption = "Fisher's Exact Test Results")
Fisher’s Exact Test Results
P_Value Significant
1e-04 Yes
# Calculate summary statistics
summary_stats <- rbind(
  df2013_sub %>% summarise(Year = 2013, 
                           Mean = mean(!!sym(col_2013), na.rm = TRUE),
                           Median = median(!!sym(col_2013), na.rm = TRUE),
                           SD = sd(!!sym(col_2013), na.rm = TRUE)),
  df2019_sub %>% summarise(Year = 2019, 
                           Mean = mean(!!sym(col_2019), na.rm = TRUE),
                           Median = median(!!sym(col_2019), na.rm = TRUE),
                           SD = sd(!!sym(col_2019), na.rm = TRUE))
)

# Print summary statistics table
kable(summary_stats, caption = "Summary Statistics", digits = 2)
Summary Statistics
Year Mean Median SD
2013 5.91 5.91 2.34
2019 6.68 7.00 3.08
# Prepare data for plotting
plot_data <- as.data.frame(cont_table) %>%
  group_by(Year) %>%
  mutate(Percentage = Freq / sum(Freq) * 100)

# Create a grouped bar plot
ggplot(plot_data, aes(x = Preference, y = Percentage, fill = Year)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(values = c("2013" = "#002F6C", "2019" = "#BA0C2F")) +
  labs(title = "Distribution of Government Ownership Preferences",
       subtitle = "Comparison between 2013 and 2019",
       x = "Preference for Government Ownership (1 = Private, 10 = Government)",
       y = "Percentage",
       fill = "Year") +
  theme_minimal() +
  theme(legend.position = "top") +
  scale_x_discrete(limits = as.character(1:10))
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).

# Create the density plot
ggplot() +
  geom_density(aes(x = df2013_sub[[col_2013]], fill = "2013"), alpha = 0.5) +
  geom_density(aes(x = df2019_sub[[col_2019]], fill = "2019"), alpha = 0.5) +
  scale_fill_manual(values = c("2013" = "#002F6C", "2019" = "#BA0C2F")) +
  labs(title = "Distribution of Government Ownership Preferences",
       subtitle = "Comparison between 2013 and 2019",
       x = "Preference for Government Ownership (1 = Private, 10 = Government)",
       y = "Density",
       fill = "Year") +
  theme_minimal() +
  theme(legend.position = "top")