Effect of the perception of corruption on ownership preference

Data Preparation

First, we load the World Value Survey data collected in Tunisia in 2019 from a Stata file and inspect its structure.

# Read the Stata file
df2019 <- read_dta("~/Downloads/WVS_Wave_7_Tunisia_Stata_v5.0.dta")

We then select the columns of interest for the analysis.

columns_of_interest <- c("Q107", 
                         "Q114",
                         "J_INTDATE", 
                         "Q260",
                         "Q262",
                         "Q245",
                         "Q235",
                         "Q237")

df2019_sub <- df2019[, columns_of_interest]

Data Cleaning

We will clean the selected data by replacing negative values with NA.

df2019_sub <- df2019_sub %>%
  mutate(Q107 = ifelse(Q107 < 0, NA, Q107),
          Q237 = ifelse(Q237 < 0, NA, Q237),

                   Q235 = ifelse(Q235 < 0, NA, Q235),
                   Q245 = ifelse(Q245 < 0, NA, Q245),
         Q114 = ifelse(Q114 < 0, NA, Q114),
         Q260 = ifelse(Q260 < 0, NA, Q260))

Check for Multicollinearity

We will check for multicollinearity by calculating and plotting the correlation matrix.

# Calculate correlation matrix
cor_matrix_2019 <- cor(df2019_sub, use = "complete.obs")

# Create more descriptive variable labels
var_labels <- c(
  "Q107" = "Gov't Ownership Preference",
  "Q114" = "Perceived Corruption",
  "J_INTDATE" = "Interview Date",
  "Q260" = "Urban/Rural",
  "Q262" = "Age",
  "Q245" = "Coup Acceptance",
  "Q235"= "Strongman Rule",
  "Q237" = "Army Rule"
)

# Melt the correlation matrix
melted_cormat <- melt(cor_matrix_2019)

# Create the plot
ggplot(data = melted_cormat, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "#002F6C", high = "#BA0C2F", mid = "white", 
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name="Correlation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        panel.grid.major = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank(),
        axis.ticks = element_blank(),
        legend.justification = c(1, 0),
        legend.position = c(1.5, 0.3),
        legend.direction = "vertical") +
  coord_fixed() +
  geom_text(aes(label = round(value, 2)), color = "black", size = 4) +
  scale_x_discrete(labels = var_labels) +
  scale_y_discrete(labels = var_labels) +
  ggtitle("Correlation Matrix of Key Variables (2019)") +
  labs(subtitle = "Tunisia World Values Survey Data",
       caption = "Source: World Values Survey, Wave 7 (2019)")

## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
## 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Data Imputation

We will perform simple imputation to handle missing values.

# Perform simple imputation
df2019_sub <- df2019_sub %>%
  mutate(across(everything(), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))

Statistical Analysis

We will test the association between business ownership preference (Q107) and the perception of corruption among business executives (Q114) using linear and generalized linear models.

# Descriptive statistics for Q107 and Q114
descriptive_stats2019 <- describe(df2019_sub[, c("Q107", "Q114", "Q245", "Q237")])

# Print the descriptive statistics
print(descriptive_stats2019)

##      vars    n mean   sd median trimmed  mad min max range  skew kurtosis   se
## Q107    1 1208 6.68 3.08   7.00    6.97 4.45   1  10     9 -0.53    -1.05 0.09
## Q114    2 1208 2.93 0.76   3.00    2.94 1.48   1   4     3 -0.17    -0.59 0.02
## Q245    3 1208 5.56 3.14   5.56    5.57 3.79   1  10     9 -0.04    -1.26 0.09
## Q237    4 1208 2.87 0.94   3.00    2.96 1.48   1   4     3 -0.55    -0.52 0.03

Linear Model

# Test the association using lm
lm_model2019 <- lm(Q107 ~ Q114, df2019_sub)
summary(lm_model2019)

## 
## Call:
## lm(formula = Q107 ~ Q114, data = df2019_sub)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.4312 -2.4312  0.5688  2.5688  4.6649 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.6365     0.3499  13.250  < 2e-16 ***
## Q114          0.6987     0.1156   6.045 1.99e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.035 on 1206 degrees of freedom
## Multiple R-squared:  0.02941,    Adjusted R-squared:  0.0286 
## F-statistic: 36.54 on 1 and 1206 DF,  p-value: 1.99e-09

# Fit the linear model with additional covariates
lm_model2019_cov <- lm(Q107 ~ Q114  + J_INTDATE + Q262 + Q260, data = df2019_sub)
summary(lm_model2019_cov)

## 
## Call:
## lm(formula = Q107 ~ Q114 + J_INTDATE + Q262 + Q260, data = df2019_sub)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.7162 -2.4267  0.6046  2.6498  4.9564 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.803e+05  5.685e+04  -3.172  0.00155 ** 
## Q114         7.112e-01  1.150e-01   6.185 8.49e-10 ***
## J_INTDATE    8.930e-03  2.816e-03   3.172  0.00155 ** 
## Q262         1.287e-02  5.531e-03   2.328  0.02010 *  
## Q260         1.785e-01  1.746e-01   1.022  0.30683    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.018 on 1203 degrees of freedom
## Multiple R-squared:  0.04278,    Adjusted R-squared:  0.0396 
## F-statistic: 13.44 on 4 and 1203 DF,  p-value: 1.012e-10

Generalized Linear Model

# Test the association using glm (assuming Gaussian family)
glm_model2019 <- glm(Q107 ~ Q114, family = gaussian, df2019_sub)
summary(glm_model2019)

## 
## Call:
## glm(formula = Q107 ~ Q114, family = gaussian, data = df2019_sub)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.6365     0.3499  13.250  < 2e-16 ***
## Q114          0.6987     0.1156   6.045 1.99e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 9.212431)
## 
##     Null deviance: 11447  on 1207  degrees of freedom
## Residual deviance: 11110  on 1206  degrees of freedom
## AIC: 6114.6
## 
## Number of Fisher Scoring iterations: 2

# Fit the glm with additional covariates
glm_model2019_cov <- glm(Q107 ~ Q114  + J_INTDATE + Q262 + Q260, data = df2019_sub)
summary(glm_model2019_cov)

## 
## Call:
## glm(formula = Q107 ~ Q114 + J_INTDATE + Q262 + Q260, data = df2019_sub)
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.803e+05  5.685e+04  -3.172  0.00155 ** 
## Q114         7.112e-01  1.150e-01   6.185 8.49e-10 ***
## J_INTDATE    8.930e-03  2.816e-03   3.172  0.00155 ** 
## Q262         1.287e-02  5.531e-03   2.328  0.02010 *  
## Q260         1.785e-01  1.746e-01   1.022  0.30683    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 9.108157)
## 
##     Null deviance: 11447  on 1207  degrees of freedom
## Residual deviance: 10957  on 1203  degrees of freedom
## AIC: 6103.8
## 
## Number of Fisher Scoring iterations: 2

Data Visualization

Density Distribution Plots

We will visualize the density distribution of the two variables to understand their spread and central tendencies.

# Plot the density distribution for Q107
ggplot(df2019_sub, aes(x = Q107)) +
  geom_density(fill = "#002F6C", alpha = 0.7) +
  labs(title = "Distribution of Preference for Government Ownership of Business",
       subtitle = "Tunisia, 2019",
       x = "Preference for Government Ownership (1 = Private, 10 = Government)",
       y = "Density",
       caption = "Source: World Values Survey, Wave 7 (2019)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  ) +
  scale_x_continuous(breaks = seq(1, 10, 1))

# Plot the density distribution for Q114
ggplot(df2019_sub, aes(x = Q114)) +
  geom_density(fill = "#BA0C2F", alpha = 0.7) +
  labs(title = "Distribution of Perceived Corruption Among Business Executives",
       subtitle = "Tunisia, 2019",
       x = "Perception of Corruption (1 = None, 10 = All)",
       y = "Density",
       caption = "Source: World Values Survey, Wave 7 (2019)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  ) +
  scale_x_continuous(breaks = seq(1, 10, 1))

Regression Plots

We will visualize the results of the linear and generalized linear models.

library(ggplot2)

# Visualize the results of the linear model
ggplot(df2019_sub, aes(y = Q107, x = Q114)) +
  geom_smooth(method = "lm", color = "#BA0C2F", fill = "#BA0C2F", alpha = 0.2) +
  labs(title = "Perceived Corruption and Government Ownership Preference",
       subtitle = "Linear Model (2019)",
       y = "Preference for Government Ownership of Business",
       x = "Perception of Corruption Among Business Executives",
       caption = "Source: World Values Survey, Wave 7 (2019)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  )

## `geom_smooth()` using formula = 'y ~ x'

# Visualize the results of the generalized linear model
ggplot(df2019_sub, aes(y = Q107, x = Q114)) +
  geom_smooth(method = "glm", method.args = list(family = "gaussian"), 
              color = "#BA0C2F", fill = "#BA0C2F", alpha = 0.2) +
  labs(title = "Perceived Corruption and Government Ownership Preference",
       subtitle = "Generalized Linear Model (2019)",
       y = "Preference for Government Ownership of Business",
       x = "Perception of Corruption Among Business Executives",
       caption = "Source: World Values Survey, Wave 7 (2019)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  )

## `geom_smooth()` using formula = 'y ~ x'

# Visualize the age distribution of preferences
ggplot(df2019_sub, aes(y = Q107, x = Q262)) +
  geom_smooth(method = "lm", color = "#BA0C2F", fill = "#BA0C2F", alpha = 0.2) +
  labs(title = "Age and Government Ownership Preference",
       subtitle = "Linear Model (2019)",
       y = "Preference for Government Ownership of Business",
       x = "Age",
       caption = "Source: World Values Survey, Wave 7 (2019)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  )

## `geom_smooth()` using formula = 'y ~ x'

Effect Plot

We will create an effect plot to visualize the effects of the variables in the model.

# Compute the effect for the linear model with covariates
effect_lm2019 <- effect("Q114", lm_model2019_cov)

# Convert effect object to data frame
effect_lm_df <- as.data.frame(effect_lm2019)

# Plot the effect for the linear model
ggplot(effect_lm_df, aes(x = Q114, y = fit)) +
  geom_line(color = "#002F6C", size = 1) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2, fill = "#002F6C") +
  labs(title = "Effect of Perceived Corruption on Government Ownership Preference",
       subtitle = "Linear Model (2019)",
       x = "Perception of Corruption Among Business Executives",
       y = "Preference for Government Ownership of Business",
       caption = "Source: World Values Survey, Wave 7 (2019)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  )

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Compute the effect for the generalized linear model with covariates
effect_glm2019 <- effect("Q114", glm_model2019_cov)

# Convert effect object to data frame
effect_glm_df <- as.data.frame(effect_glm2019)

# Plot the effect for the generalized linear model
ggplot(effect_glm_df, aes(x = Q114, y = fit)) +
  geom_line(color = "#BA0C2F", size = 1) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2, fill = "#BA0C2F") +
  labs(title = "Effect of Perceived Corruption on Government Ownership Preference",
       subtitle = "Generalized Linear Model (2019)",
       x = "Perception of Corruption Among Business Executives",
       y = "Preference for Government Ownership of Business",
       caption = "Source: World Values Survey, Wave 7 (2019)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  )

Reporting Results

Finally, we will create a table of regression results using the stargazer package.

# Create a stargazer table with improved labels
stargazer(lm_model2019, lm_model2019_cov, glm_model2019, glm_model2019_cov, type = "html",
          title = "Regression Results",
          dep.var.labels.include = FALSE,
          column.labels = c("OLS", "OLS with Covariates", "GLM", "GLM with Covariates"),
          dep.var.labels = c("Preference for Government Ownership of Business"),
          covariate.labels = c("Perception of Corruption Among Business Executives", 
                               "Urban/Rural", 
                               "Interview Date",
                               "Age",
                               "Sex"),
          keep.stat = c("n", "rsq", "adj.rsq", "f", "ll", "aic", "bic"),
          notes.append = TRUE,
          notes.align = "r")

**Regression Results**

	Dependent variable:

	OLS		normal
	OLS	OLS with Covariates	GLM	GLM with Covariates
	(1)	(2)	(3)	(4)

Perception of Corruption Among Business Executives	0.699^***	0.711^***	0.699^***	0.711^***
	(0.116)	(0.115)	(0.116)	(0.115)

Urban/Rural		0.009^***		0.009^***
		(0.003)		(0.003)

Interview Date		0.013^**		0.013^**
		(0.006)		(0.006)

Age		0.178		0.178
		(0.175)		(0.175)

Sex	4.636^***	-180,293.000^***	4.636^***	-180,293.000^***
	(0.350)	(56,846.910)	(0.350)	(56,846.910)


Observations	1,208	1,208	1,208	1,208
R²	0.029	0.043
Adjusted R²	0.029	0.040
Log Likelihood			-3,055.291	-3,046.911
Akaike Inf. Crit.			6,114.583	6,103.823
F Statistic	36.541^*** (df = 1; 1206)	13.442^*** (df = 4; 1203)

Note:	p<0.1; p<0.05; p<0.01

Data Preparation

Now, we load the World Value Survey data collected in Tunisia in 2013 from a Stata file and inspect its structure.

# Read the Stata file
df2013 <- read_dta("~/Downloads/WV6_Data_Tunisia_Stata_v20201117.dta")

We then select the columns of interest for the analysis.

columns_of_interest <- c("V97", #Q107 in 2019
                         "MN_228M",#Q114 in 2019
                         "V261", #J_INTDATE in 2019
                         "V240", #Q260 in 2019
                         "V242",#Q262 in 2019
                         "V129", #Q237 in 2019
                         "V127",#Q235 in 2019
                         "V135") #Q245 in 2019


df2013_sub <- df2013[, columns_of_interest]

Data Cleaning

We will clean the selected data by replacing negative values with NA.

df2013_sub <- df2013_sub %>%
  mutate(V97 = ifelse(V97 < 0, NA, V97),
         V135= ifelse(V135 < 0, NA, V135),
                  V129= ifelse(V129 < 0, NA, V129),
                           V127= ifelse(V127 < 0, NA, V127),
         MN_228M = ifelse(MN_228M < 0, NA, MN_228M),
         V240 = ifelse(V240 < 0, NA, V240))

Check for Multicollinearity

We will check for multicollinearity by calculating and plotting the correlation matrix.

# Calculate correlation matrix
cor_matrix_2013 <- cor(df2013_sub, use = "complete.obs")

# Create more descriptive variable labels
var_labels <- c(
  "Q107" = "Gov't Ownership Preference",
  "Q114" = "Perceived Corruption",
  "J_INTDATE" = "Interview Date",
  "Q260" = "Urban/Rural",
  "Q262" = "Age"
)

# Melt the correlation matrix
melted_cormat <- melt(cor_matrix_2013)

# Create the plot
ggplot(data = melted_cormat, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "#002F6C", high = "#BA0C2F", mid = "white", 
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name="Correlation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        panel.grid.major = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank(),
        axis.ticks = element_blank(),
        legend.justification = c(1, 0),
        legend.position = c(1.5, 0.3),
        legend.direction = "vertical") +
  coord_fixed() +
  geom_text(aes(label = round(value, 2)), color = "black", size = 4) +
  scale_x_discrete(labels = var_labels) +
  scale_y_discrete(labels = var_labels) +
  ggtitle("Correlation Matrix of Key Variables (2013)") +
  labs(subtitle = "Tunisia World Values Survey Data",
       caption = "Source: World Values Survey, Wave 6 (2013)")

Data Imputation

We will perform simple imputation to handle missing values.

# Perform simple imputation
df2013_sub <- df2013_sub %>%
  mutate(across(everything(), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))

Statistical Analysis

We will test the association between business ownership preference (V97) and the perception of corruption among business executives (MN_228M) using linear and generalized linear models.

# Descriptive statistics for V97 and MN_228M
descriptive_stats2013 <- describe(df2013_sub[, c("V97", "MN_228M", "V135","V129")])

# Print the descriptive statistics
print(descriptive_stats2013)

##         vars    n mean   sd median trimmed  mad min max range  skew kurtosis
## V97        1 1205 5.91 2.34   5.91    5.96 1.34   1  10     9 -0.09    -0.09
## MN_228M    2 1205 7.47 2.47   8.00    7.74 2.97   1  10     9 -0.65    -0.55
## V135       3 1205 4.85 3.19   4.85    4.69 4.22   1  10     9  0.30    -1.17
## V129       4 1205 2.79 1.05   3.00    2.86 1.48   1   4     3 -0.41    -0.99
##           se
## V97     0.07
## MN_228M 0.07
## V135    0.09
## V129    0.03

Linear Model

# Test the association using lm
lm_model2013 <- lm(V97 ~ MN_228M, df2013_sub)
summary(lm_model2013)

## 
## Call:
## lm(formula = V97 ~ MN_228M, data = df2013_sub)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.2320 -1.0311  0.0191  1.1698  4.2201 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.28228    0.21483  29.243   <2e-16 ***
## MN_228M     -0.05024    0.02732  -1.839   0.0662 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.338 on 1203 degrees of freedom
## Multiple R-squared:  0.002803,   Adjusted R-squared:  0.001974 
## F-statistic: 3.381 on 1 and 1203 DF,  p-value: 0.06619

# Fit the linear model with additional covariates
lm_model2013_cov <- lm(V97 ~ MN_228M  + V261 + V242 + V240, data = df2013_sub)
summary(lm_model2013_cov)

## 
## Call:
## lm(formula = V97 ~ MN_228M + V261 + V242 + V240, data = df2013_sub)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3839 -0.9975 -0.0611  1.2373  4.4311 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  2.528e+04  3.741e+04   0.676   0.4993  
## MN_228M     -5.218e-02  2.740e-02  -1.905   0.0571 .
## V261        -1.255e-03  1.858e-03  -0.676   0.4994  
## V242         6.387e-03  4.161e-03   1.535   0.1251  
## V240         1.215e-01  1.352e-01   0.899   0.3689  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.338 on 1200 degrees of freedom
## Multiple R-squared:  0.005941,   Adjusted R-squared:  0.002627 
## F-statistic: 1.793 on 4 and 1200 DF,  p-value: 0.1279

Generalized Linear Model

# Test the association using glm (assuming Gaussian family)
glm_model2013 <- glm(V97 ~ MN_228M, family = gaussian, df2013_sub)
summary(glm_model2013)

## 
## Call:
## glm(formula = V97 ~ MN_228M, family = gaussian, data = df2013_sub)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.28228    0.21483  29.243   <2e-16 ***
## MN_228M     -0.05024    0.02732  -1.839   0.0662 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 5.467589)
## 
##     Null deviance: 6596.0  on 1204  degrees of freedom
## Residual deviance: 6577.5  on 1203  degrees of freedom
## AIC: 5470.7
## 
## Number of Fisher Scoring iterations: 2

# Fit the glm with additional covariates
glm_model2013_cov <- glm(V97 ~ MN_228M  + V261 + V242 + V240, data = df2013_sub)
summary(glm_model2013_cov)

## 
## Call:
## glm(formula = V97 ~ MN_228M + V261 + V242 + V240, data = df2013_sub)
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  2.528e+04  3.741e+04   0.676   0.4993  
## MN_228M     -5.218e-02  2.740e-02  -1.905   0.0571 .
## V261        -1.255e-03  1.858e-03  -0.676   0.4994  
## V242         6.387e-03  4.161e-03   1.535   0.1251  
## V240         1.215e-01  1.352e-01   0.899   0.3689  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 5.46401)
## 
##     Null deviance: 6596.0  on 1204  degrees of freedom
## Residual deviance: 6556.8  on 1200  degrees of freedom
## AIC: 5472.9
## 
## Number of Fisher Scoring iterations: 2

Data Visualization

Density Distribution Plots

We will visualize the density distribution of the two variables to understand their spread and central tendencies.

library(ggplot2)

# Plot the density distribution for V97
ggplot(df2013_sub, aes(x = V97)) +
  geom_density(fill = "#002F6C", alpha = 0.7) +
  labs(title = "Distribution of Preference for Government Ownership of Business",
       subtitle = "Tunisia, 2013",
       x = "Preference for Government Ownership (1 = Private, 10 = Government)",
       y = "Density",
       caption = "Source: World Values Survey, Wave 6 (2013)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  ) +
  scale_x_continuous(breaks = seq(1, 10, 1))

# Plot the density distribution for MN_228M
ggplot(df2013_sub, aes(x = MN_228M)) +
  geom_density(fill = "#BA0C2F", alpha = 0.7) +
  labs(title = "Distribution of Perceived Corruption Among Business Executives",
       subtitle = "Tunisia, 2013",
       x = "Perception of Corruption (1 = None, 10 = All)",
       y = "Density",
       caption = "Source: World Values Survey, Wave 6 (2013)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  ) +
  scale_x_continuous(breaks = seq(1, 10, 1))

Regression Plots

We will visualize the results of the linear and generalized linear models.

# Visualize the results of the linear model
ggplot(df2013_sub, aes(y = V97, x = MN_228M)) +
  geom_smooth(method = "lm", color = "#002F6C", fill = "#002F6C", alpha = 0.2) +
  labs(title = "Linear Model: Government Ownership Preference vs Perceived Corruption",
       subtitle = "Tunisia, 2013",
       y = "Preference for Government Ownership of Business",
       x = "Perception of Corruption Among Business Executives",
       caption = "Source: World Values Survey, Wave 6 (2013)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  ) +
  scale_x_continuous(breaks = seq(1, 10, 1)) +
  scale_y_continuous(breaks = seq(1, 10, 1))

## `geom_smooth()` using formula = 'y ~ x'

# Visualize the results of the generalized linear model
ggplot(df2013_sub, aes(y = V97, x = MN_228M)) +
  geom_smooth(method = "glm", method.args = list(family = "gaussian"), color = "#BA0C2F", fill = "#BA0C2F", alpha = 0.2) +
  labs(title = "Generalized Linear Model: Government Ownership Preference vs Perceived Corruption",
       subtitle = "Tunisia, 2013",
       y = "Preference for Government Ownership of Business",
       x = "Perception of Corruption Among Business Executives",
       caption = "Source: World Values Survey, Wave 6 (2013)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  ) +
  scale_x_continuous(breaks = seq(1, 10, 1)) +
  scale_y_continuous(breaks = seq(1, 10, 1))

## `geom_smooth()` using formula = 'y ~ x'

# Visualize the age distribution of preferences
ggplot(df2013_sub, aes(y = V97, x = V242)) +
  geom_smooth(method = "lm", color = "#002F6C", fill = "#002F6C", alpha = 0.2) +
  labs(title = "Age Distribution of Government Ownership Preferences",
       subtitle = "Tunisia, 2013",
       y = "Preference for Government Ownership of Business",
       x = "Age",
       caption = "Source: World Values Survey, Wave 6 (2013)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  ) +
  scale_x_continuous(breaks = seq(18, max(df2013_sub$V242, na.rm = TRUE), by = 10)) +
  scale_y_continuous(breaks = seq(1, 10, 1))

## `geom_smooth()` using formula = 'y ~ x'

Effect Plot

We will create an effect plot to visualize the effects of the variables in the model.

library(ggplot2)
library(effects)

# Compute the effect for the linear model with covariates
effect_lm2013 <- effect("MN_228M", lm_model2013_cov)

# Convert effect object to data frame
effect_lm_df <- as.data.frame(effect_lm2013)

# Plot the effect for the linear model
ggplot(effect_lm_df, aes(x = MN_228M, y = fit)) +
  geom_line(color = "#002F6C", size = 1) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2, fill = "#002F6C") +
  labs(title = "Effect of Perceived Corruption on Government Ownership Preference",
       subtitle = "Linear Model (2013)",
       x = "Perception of Corruption Among Business Executives",
       y = "Preference for Government Ownership of Business",
       caption = "Source: World Values Survey, Wave 6 (2013)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  )

# Compute the effect for the generalized linear model with covariates
effect_glm2013 <- effect("MN_228M", glm_model2013_cov)

# Convert effect object to data frame
effect_glm_df <- as.data.frame(effect_glm2013)

# Plot the effect for the generalized linear model
ggplot(effect_glm_df, aes(x = MN_228M, y = fit)) +
  geom_line(color = "#BA0C2F", size = 1) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2, fill = "#BA0C2F") +
  labs(title = "Effect of Perceived Corruption on Government Ownership Preference",
       subtitle = "Generalized Linear Model (2013)",
       x = "Perception of Corruption Among Business Executives",
       y = "Preference for Government Ownership of Business",
       caption = "Source: World Values Survey, Wave 6 (2013)") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  )

Reporting Results

Finally, we will create a table of regression results using the stargazer package.

# Create a stargazer table with improved labels
stargazer(lm_model2013, lm_model2013_cov, glm_model2013, glm_model2013_cov, type = "html",
          title = "Regression Results",
          dep.var.labels.include = FALSE,
          column.labels = c("OLS", "OLS with Covariates", "GLM", "GLM with Covariates"),
          dep.var.labels = c("Preference for Government Ownership of Business"),
          covariate.labels = c("Perception of Corruption Among Business Executives", 
                               "Urban/Rural", 
                               "Interview Date",
                               "Age",
                               "Sex"),
          keep.stat = c("n", "rsq", "adj.rsq", "f", "ll", "aic", "bic"),
          notes.append = TRUE,
          notes.align = "r")

**Regression Results**

	Dependent variable:

	OLS		normal
	OLS	OLS with Covariates	GLM	GLM with Covariates
	(1)	(2)	(3)	(4)

Perception of Corruption Among Business Executives	-0.050^*	-0.052^*	-0.050^*	-0.052^*
	(0.027)	(0.027)	(0.027)	(0.027)

Urban/Rural		-0.001		-0.001
		(0.002)		(0.002)

Interview Date		0.006		0.006
		(0.004)		(0.004)

Age		0.122		0.122
		(0.135)		(0.135)

Sex	6.282^***	25,277.960	6.282^***	25,277.960
	(0.215)	(37,408.500)	(0.215)	(37,408.500)


Observations	1,205	1,205	1,205	1,205
R²	0.003	0.006
Adjusted R²	0.002	0.003
Log Likelihood			-2,733.370	-2,731.471
Akaike Inf. Crit.			5,470.740	5,472.942
F Statistic	3.381^* (df = 1; 1203)	1.793 (df = 4; 1200)

Note:	p<0.1; p<0.05; p<0.01

# Load required libraries if not already loaded
library(ggplot2)
library(scales)

## 
## Attaching package: 'scales'

## The following objects are masked from 'package:psych':
## 
##     alpha, rescale

# Combine the data from both years
df_combined <- rbind(
  data.frame(year = "2013", preference = df2013_sub$V97),
  data.frame(year = "2019", preference = df2019_sub$Q107)
)

# Create the overlapping density plot
ggplot(df_combined, aes(x = preference, fill = year)) +
  geom_density(alpha = 0.7) +
  labs(title = "Preference for Government Ownership of Business in Tunisia",
       subtitle = "Comparison between 2013 and 2019",
       x = "Preference for Government Ownership (1 = Private, 10 = Government)",
       y = "Density",
       fill = "Year",
       caption = "Source: World Values Survey, Waves 6 (2013) and 7 (2019)") +
  scale_fill_manual(values = c("2013" = "#002F6C", "2019" = "#BA0C2F")) + 
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, color = "#002F6C"),
    plot.subtitle = element_text(size = 12, color = "#002F6C"),
    axis.title = element_text(face = "bold", size = 12, color = "#002F6C"),
    axis.text = element_text(size = 10, color = "#002F6C"),
    legend.title = element_text(face = "bold", color = "#002F6C"),
    legend.text = element_text(color = "#002F6C"),
    legend.position = "top",
    panel.grid.major = element_line(color = "#E6E6E6"),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, color = "#002F6C")
  ) +
  scale_x_continuous(breaks = seq(1, 10, 1)) +
  scale_y_continuous(labels = scales::percent_format(scale = 1))

# Load required libraries
library(ggplot2)
library(dplyr)
library(tidyr)

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:reshape2':
## 
##     smiths

library(ggpubr)

# Combine the data from both years (assuming df2013_sub and df2019_sub exist)
df_combined <- rbind(
  data.frame(year = "2013", preference = df2013_sub$V97),
  data.frame(year = "2019", preference = df2019_sub$Q107)
)

# Calculate summary statistics
summary_stats <- df_combined %>%
  group_by(year) %>%
  summarise(
    mean = mean(preference, na.rm = TRUE),
    median = median(preference, na.rm = TRUE),
    lower = quantile(preference, 0.25, na.rm = TRUE),
    upper = quantile(preference, 0.75, na.rm = TRUE)
  )

# Perform t-test
t_test_result <- t.test(preference ~ year, data = df_combined)
p_value <- t_test_result$p.value
significance <- if(p_value < 0.05) "Statistically Significant" else "Not Statistically Significant"

# Create the improved plot
plot <- ggplot(summary_stats, aes(x = year, y = median, color = year)) +
  geom_pointrange(aes(ymin = lower, ymax = upper), size = 1.5, fatten = 3) +
  geom_text(aes(y = upper, label = sprintf("Median: %.1f", median)), 
            vjust = -1, hjust = 0.5, size = 4, fontface = "bold") +
  labs(title = "Preference for Government Ownership of Business in Tunisia",
       subtitle = "Comparison between 2013 and 2019",
       x = "Year",
       y = "Preference for Government Ownership",
       caption = paste("Source: World Values Survey, Waves 6 (2013) and 7 (2019)",
                       "\nBars show interquartile range",
                       "\np-value:", format.pval(p_value, digits = 3))) +
  scale_color_manual(values = c("2013" = "#002F6C", "2019" = "#BA0C2F")) +
  scale_y_continuous(breaks = seq(1, 10, 1),
                     labels = c("1\n(Private\nOwnership)", "2", "3", "4", "5", "6", "7", "8", "9", "10\n(Government\nOwnership)"),
                     limits = c(1, 10)) +
  coord_flip() +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 20, hjust = 0.5),
    plot.subtitle = element_text(size = 16, hjust = 0.5),
    axis.title.x = element_text(face = "bold", size = 14, margin = margin(t = 10)),
    axis.title.y = element_blank(),
    axis.text = element_text(size = 12),
    legend.position = "none",
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    plot.caption = element_text(hjust = 0, size = 10, margin = margin(t = 20)),
    plot.margin = margin(20, 20, 20, 20)
  )

# Display the plot
print(plot)

# Print detailed t-test results
print(t_test_result)

## 
##  Welch Two Sample t-test
## 
## data:  preference by year
## t = -6.9847, df = 2252.5, p-value = 3.743e-12
## alternative hypothesis: true difference in means between group 2013 and group 2019 is not equal to 0
## 95 percent confidence interval:
##  -0.9960414 -0.5593524
## sample estimates:
## mean in group 2013 mean in group 2019 
##           5.907177           6.684874

# Load required libraries
library(ggplot2)
library(dplyr)
library(knitr)
library(tidyr)

# Function to find the correct column name
find_column <- function(df, possible_names) {
  for (name in possible_names) {
    if (name %in% names(df)) {
      return(name)
    }
  }
  stop("Could not find a matching column name")
}

# Find the correct column names
col_2013 <- find_column(df2013_sub, c("V97", "Q107", "government_ownership"))
col_2019 <- find_column(df2019_sub, c("Q107", "V97", "government_ownership"))

# Create a contingency table
cont_table <- table(Year = c(rep("2013", nrow(df2013_sub)), rep("2019", nrow(df2019_sub))),
                    Preference = c(df2013_sub[[col_2013]], df2019_sub[[col_2019]]))

# Perform Fisher's Exact Test
fisher_test_result <- fisher.test(cont_table, simulate.p.value = TRUE, B = 10000)

# Create a table of results
result_table <- data.frame(
  P_Value = fisher_test_result$p.value,
  Significant = ifelse(fisher_test_result$p.value < 0.05, "Yes", "No")
)

# Print the test results table
kable(result_table, caption = "Fisher's Exact Test Results")

Fisher’s Exact Test Results
P_Value	Significant
1e-04	Yes

# Calculate summary statistics
summary_stats <- rbind(
  df2013_sub %>% summarise(Year = 2013, 
                           Mean = mean(!!sym(col_2013), na.rm = TRUE),
                           Median = median(!!sym(col_2013), na.rm = TRUE),
                           SD = sd(!!sym(col_2013), na.rm = TRUE)),
  df2019_sub %>% summarise(Year = 2019, 
                           Mean = mean(!!sym(col_2019), na.rm = TRUE),
                           Median = median(!!sym(col_2019), na.rm = TRUE),
                           SD = sd(!!sym(col_2019), na.rm = TRUE))
)

# Print summary statistics table
kable(summary_stats, caption = "Summary Statistics", digits = 2)

Summary Statistics
Year	Mean	Median	SD
2013	5.91	5.91	2.34
2019	6.68	7.00	3.08

# Prepare data for plotting
plot_data <- as.data.frame(cont_table) %>%
  group_by(Year) %>%
  mutate(Percentage = Freq / sum(Freq) * 100)

# Create a grouped bar plot
ggplot(plot_data, aes(x = Preference, y = Percentage, fill = Year)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(values = c("2013" = "#002F6C", "2019" = "#BA0C2F")) +
  labs(title = "Distribution of Government Ownership Preferences",
       subtitle = "Comparison between 2013 and 2019",
       x = "Preference for Government Ownership (1 = Private, 10 = Government)",
       y = "Percentage",
       fill = "Year") +
  theme_minimal() +
  theme(legend.position = "top") +
  scale_x_discrete(limits = as.character(1:10))

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).

# Create the density plot
ggplot() +
  geom_density(aes(x = df2013_sub[[col_2013]], fill = "2013"), alpha = 0.5) +
  geom_density(aes(x = df2019_sub[[col_2019]], fill = "2019"), alpha = 0.5) +
  scale_fill_manual(values = c("2013" = "#002F6C", "2019" = "#BA0C2F")) +
  labs(title = "Distribution of Government Ownership Preferences",
       subtitle = "Comparison between 2013 and 2019",
       x = "Preference for Government Ownership (1 = Private, 10 = Government)",
       y = "Density",
       fill = "Year") +
  theme_minimal() +
  theme(legend.position = "top")

Effect of the perception of corruption on ownership preference

Mohamed Dhia Hammami

2024-08-06

Data Preparation

Data Cleaning

Check for Multicollinearity

Data Imputation

Statistical Analysis

Linear Model

Generalized Linear Model

Data Visualization

Density Distribution Plots

Regression Plots

Effect Plot

Reporting Results

Data Preparation

Data Cleaning

Check for Multicollinearity

Data Imputation

Statistical Analysis

Linear Model

Generalized Linear Model

Data Visualization

Density Distribution Plots

Regression Plots

Effect Plot

Reporting Results