Assignment 3

Data

# One-time setup, if the package is missing: install.packages("haven")
library(haven)

# Location of the Stata survey file on the local machine.
ruta <- "/Users/diegoduran/Desktop/survey.dta"

# Import the .dta file and preview its first six rows.
datos <- read_dta(file = ruta)
head(datos, n = 6L)

## # A tibble: 6 × 195
##   idnum pais     nationality estratopri estratosec strata prov      municipio   
##   <dbl> <dbl+lb>       <dbl> <dbl+lbl>  <dbl+lbl>   <dbl> <dbl+lbl> <dbl+lbl>   
## 1  7394 1 [Méxi…           1 102 [Cent… 1 [Grande…    102 111 [Gua… 111020 [Leó…
## 2   778 1 [Méxi…           1 102 [Cent… 1 [Grande…    102 111 [Gua… 111023 [Pén…
## 3  1719 1 [Méxi…           1 102 [Cent… 1 [Grande…    102 111 [Gua… 111020 [Leó…
## 4  7737 1 [Méxi…           1 103 [Cent… 1 [Grande…    103 113 [Hid… 113047 [Pac…
## 5  3203 1 [Méxi…           1 103 [Cent… 1 [Grande…    103 122 [Que… 122010 [El …
## 6  9816 1 [Méxi…           1 102 [Cent… 1 [Grande…    102 111 [Gua… 111020 [Leó…
## # ℹ 187 more variables: upm <dbl>, ur <dbl+lbl>, cluster <dbl>, year <dbl+lbl>,
## #   wave <dbl+lbl>, wt <dbl>, q1tc_r <dbl+lbl>, q2 <dbl>, a4n <dbl+lbl>,
## #   soct2 <dbl+lbl>, idio2 <dbl+lbl>, mesfut1 <dbl+lbl>, np1 <dbl+lbl>,
## #   np1new <dbl+lbl>, sgl1 <dbl+lbl>, cp8 <dbl+lbl>, cp13 <dbl+lbl>,
## #   cp20 <dbl+lbl>, it1 <dbl+lbl>, l1n <dbl+lbl>, jc10 <dbl+lbl>,
## #   jc13 <dbl+lbl>, jc15a <dbl+lbl>, jc16a <dbl+lbl>, vic1ext <dbl+lbl>,
## #   aoj11 <dbl+lbl>, aoj12 <dbl+lbl>, countfair1 <dbl+lbl>, …

The question “How strongly do you approve or disapprove that same-sex couples have the right to marry?” (variable d6, analyzed below) measures the level of approval on a 10-point scale from 1 to 10, where 1 means “Strongly disapprove” and 10 means “Strongly approve.” It also includes two non-response categories: 888888 (“Don’t know”) and 988888 (“No answer”).

##Confidence intervals

# Coerce d6 to a plain numeric vector.
datos$d6 <- as.numeric(datos$d6)

# Recode the non-response codes (888888, 988888) as missing.
datos$d6[datos$d6 %in% c(888888, 988888)] <- NA

# Mean, standard deviation and number of valid (non-missing) observations.
mean_d6 <- mean(datos$d6, na.rm = TRUE)
sd_d6 <- sd(datos$d6, na.rm = TRUE)
n <- sum(!is.na(datos$d6))

# Standard error of the mean.
se <- sd_d6 / sqrt(n)

# 95% confidence interval. The critical value is taken from the t distribution
# with n - 1 degrees of freedom instead of the hard-coded normal value 1.96,
# so the manual interval agrees with the t.test() check at the end of the chunk.
conf_level <- 0.95
t_crit <- qt(1 - (1 - conf_level) / 2, df = n - 1)
lower_bound <- mean_d6 - t_crit * se
upper_bound <- mean_d6 + t_crit * se

print(paste("Mean:", round(mean_d6, 2)))

## [1] "Mean: 6.4"

print(paste("Standard Deviation:", round(sd_d6, 2)))

## [1] "Standard Deviation: 3.37"

print(paste("95% CI: [", round(lower_bound, 2), ",", round(upper_bound, 2), "]"))

## [1] "95% CI: [ 6.23 , 6.56 ]"

# Cross-check against the built-in one-sample t-test (optional).
t.test(datos$d6, conf.level = conf_level)$conf.int

## [1] 6.233503 6.564489
## attr(,"conf.level")
## [1] 0.95

##Hypothesis testing

# Coerce the urban/rural indicator to numeric (1 = urban, 2 = rural).
datos$ur <- as.numeric(datos$ur)

# d6 scores for each group. which() drops rows whose ur is NA instead of
# turning them into spurious NA entries in the group vectors.
urbano <- datos$d6[which(datos$ur == 1)]
rural <- datos$d6[which(datos$ur == 2)]

# Two-sample t-test comparing the group means. t.test() removes missing values
# on its own and has no na.rm argument; the na.rm = TRUE previously passed here
# was silently absorbed by `...` and had no effect, so it is dropped.
t_test <- t.test(urbano, rural)

# Print the results.
print(t_test)

## 
##  Welch Two Sample t-test
## 
## data:  urbano and rural
## t = 3.3583, df = 493.86, p-value = 0.0008448
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.2915135 1.1135493
## sample estimates:
## mean of x mean of y 
##  6.540031  5.837500

The p-value of 0.0008 (< 0.05) indicates that the difference in mean approval between urban (6.54) and rural (5.84) respondents is statistically significant.

# q1tc_r is the variable that records the respondent's gender, so it is used
# here; coerce it to a plain numeric vector.
datos$q1tc_r <- as.numeric(datos$q1tc_r)

# Recode the non-response codes of d6 as missing (a no-op if d6 has already
# been recoded earlier in the script).
datos$d6[datos$d6 %in% c(888888, 988888)] <- NA

# d6 scores for men (1) and women (2). which() drops rows whose q1tc_r is NA
# instead of turning them into spurious NA entries in the group vectors.
hombres <- datos$d6[which(datos$q1tc_r == 1)]
mujeres <- datos$d6[which(datos$q1tc_r == 2)]

# Two-sample t-test comparing the group means. t.test() removes missing values
# on its own and has no na.rm argument; the na.rm = TRUE previously passed here
# was silently absorbed by `...` and had no effect, so it is dropped.
t_test_sexo <- t.test(hombres, mujeres)

# Print the results.
print(t_test_sexo)

## 
##  Welch Two Sample t-test
## 
## data:  hombres and mujeres
## t = -2.2344, df = 1586.8, p-value = 0.0256
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.70764772 -0.04603065
## sample estimates:
## mean of x mean of y 
##  6.211613  6.588452

Regression

# Outcome b3: coerce to numeric, then blank out the non-response codes.
datos$b3 <- as.numeric(datos$b3)
datos$b3 <- replace(datos$b3, datos$b3 %in% c(888888, 988888), NA)

# Predictor b37: same treatment.
datos$b37 <- as.numeric(datos$b37)
datos$b37 <- replace(datos$b37, datos$b37 %in% c(888888, 988888), NA)

# Fit the bivariate linear regression of b3 on b37.
modelo <- lm(formula = b3 ~ b37, data = datos)

summary(modelo)

## 
## Call:
## lm(formula = b3 ~ b37, data = datos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0801 -1.0801 -0.0497  1.2583  3.9503 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.71124    0.09630   28.16   <2e-16 ***
## b37          0.33841    0.02234   15.15   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.642 on 1598 degrees of freedom
##   (22 observations deleted due to missingness)
## Multiple R-squared:  0.1255, Adjusted R-squared:  0.125 
## F-statistic: 229.4 on 1 and 1598 DF,  p-value: < 2.2e-16

library(ggplot2)

# Scatter plot of b3 against b37 with the fitted linear regression line.
ggplot(data = datos, mapping = aes(x = b37, y = b3)) +
  # Observations, drawn in semi-transparent blue.
  geom_point(alpha = 0.5, color = "blue") +
  # Regression line in red, with its standard-error band.
  geom_smooth(se = TRUE, color = "red", method = "lm") +
  labs(
    y = "b3",
    x = "b37",
    title = "Bivariate Regression: b3 ~ b37"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 22 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 22 rows containing missing values or values outside the scale range
## (`geom_point()`).

Interpretation of the model: The regression model estimates the relationship between the perception of how well citizens’ basic rights are protected by the Mexican political system (b3) and trust in the media (b37).

The coefficient for b37 is 0.34, meaning that each one-unit increase in trust in the media is associated with an expected increase of 0.34 units in the perception of rights protection. Because this is a bivariate model, no other variables are controlled for. The positive direction suggests that greater trust in the media is associated with a higher perception of rights protection.

The coefficient is highly statistically significant (p-value < 0.001), indicating that the observed relationship is unlikely to be due to random chance. However, the R-squared value of 0.1255 (adjusted R-squared: 0.125) shows that the model explains only about 12.5% of the variability in perceptions of rights protection, implying that other factors not included in the model likely play a significant role.

In summary, the results suggest a small but statistically significant positive relationship between trust in the media and the perception that citizens’ basic rights are well protected by the Mexican political system.

# Urban/rural indicator: coerce to numeric, then blank out non-response codes.
datos$ur <- as.numeric(datos$ur)
datos$ur <- replace(datos$ur, datos$ur %in% c(888888, 988888), NA)

# Gender.
datos$q1tc_r <- as.numeric(datos$q1tc_r)
datos$q1tc_r <- replace(datos$q1tc_r, datos$q1tc_r %in% c(888888, 988888), NA)

# Age.
datos$q2 <- as.numeric(datos$q2)
datos$q2 <- replace(datos$q2, datos$q2 %in% c(888888, 988888), NA)

# Secondary stratum.
datos$estratosec <- as.numeric(datos$estratosec)
datos$estratosec <- replace(
  datos$estratosec,
  datos$estratosec %in% c(888888, 988888),
  NA
)

# b13 and eff2 are recoded in place without a numeric conversion.
# eff2 is cleaned here but is not a term in model1.
datos$b13 <- replace(datos$b13, datos$b13 %in% c(888888, 988888), NA)
datos$eff2 <- replace(datos$eff2, datos$eff2 %in% c(888888, 988888), NA)

# Multivariable linear regression of b13 on the four predictors.
model1 <- lm(formula = b13 ~ ur + q1tc_r + q2 + estratosec, data = datos)

# Model summary.
summary(model1)

## 
## Call:
## lm(formula = b13 ~ ur + q1tc_r + q2 + estratosec, data = datos)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.950 -1.139  0.075  1.098  3.102 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.567872   0.230594  15.473  < 2e-16 ***
## ur           0.356330   0.112542   3.166  0.00157 ** 
## q1tc_r      -0.157199   0.087440  -1.798  0.07240 .  
## q2           0.006073   0.002659   2.284  0.02250 *  
## estratosec   0.172458   0.065073   2.650  0.00813 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.727 on 1563 degrees of freedom
##   (54 observations deleted due to missingness)
## Multiple R-squared:  0.02025,    Adjusted R-squared:  0.01774 
## F-statistic: 8.075 on 4 and 1563 DF,  p-value: 1.921e-06

Interpretation of the model: The multivariable regression model estimates the relationship between the dependent variable b13 (perception of citizens’ basic rights protection) and several control variables: ur (urban vs. rural residence), q1tc_r (gender), q2 (age), and estratosec (socioeconomic status).

Interpretation of Coefficients: - Intercept (3.57): When all control variables are set to zero, the baseline perception of rights protection is 3.57; this is an extrapolation, since ur and q1tc_r are coded 1/2 and never take the value zero. - ur (0.36): Because ur is coded 1 = urban and 2 = rural, the positive coefficient indicates that, on average, rural respondents perceive better protection of basic rights than urban respondents, holding the other variables constant. This effect is significant with a p-value < 0.01. - q1tc_r (-0.16): The negative coefficient for gender suggests that, compared to males (coded 1), females (coded 2) have a slightly lower perception of rights protection, though this effect is not significant at the 5% level (p-value of 0.072, significant only at the 10% level). - q2 (0.006): The positive coefficient for age indicates that as age increases, individuals’ perceptions of rights protection also slightly improve. This effect is significant with a p-value < 0.05. - estratosec (0.17): The positive coefficient for socioeconomic status shows that people from higher socioeconomic strata perceive better protection of their rights, assuming that higher codes of estratosec denote higher strata. This effect is significant with a p-value < 0.01.

Model Significance: - R-squared (0.02): The model explains only 2% of the variation in perceptions of rights protection. This suggests that there are other factors not included in the model that could have a stronger influence. - F-statistic (8.08): The model is overall significant, with a p-value < 0.001, indicating that at least one of the variables in the model significantly affects the perception of rights protection.

Why Include These Control Variables: - Urbanization (ur): This controls for differences in perceptions between urban and rural populations, where access to services and political engagement can vary. - Gender (q1tc_r): Gender might affect political perceptions, as men and women can have different experiences with political systems and rights. - Age (q2): Age influences political attitudes and experiences, so it is important to control for generational differences. - Socioeconomic Status (estratosec): Socioeconomic status can impact individuals’ access to rights and protections, which might affect how they perceive the system’s ability to protect their basic rights.

This regression model suggests that rural respondents (ur = 2, given the coding 1 = urban, 2 = rural), older individuals, and those from higher socioeconomic backgrounds tend to perceive better protection of their rights. The gender variable is only marginally significant (p = 0.072), suggesting that there may be slight differences between men and women in their perceptions of rights protection. However, the low R-squared value indicates that other unmeasured factors likely contribute to the perception of rights protection.

##Bonus

# Attach ggplot2 unconditionally. The previous `if (!require(ggplot2))` had its
# install line commented out, so the `if` took library(ggplot2) as its body and
# the package was attached only as a side effect of require().
# One-time setup, if the package is missing: install.packages("ggplot2")
library(ggplot2)

# Coefficient table (estimates, standard errors, p-values) from model1.
coef_model <- summary(model1)$coefficients
coef_df <- data.frame(
  Variable = rownames(coef_model),
  Estimate = coef_model[, "Estimate"],
  Std_Error = coef_model[, "Std. Error"],
  p_value = coef_model[, "Pr(>|t|)"]
)

# Drop the intercept so only the predictors are plotted.
coef_df <- coef_df[coef_df$Variable != "(Intercept)", ]

# Coefficient plot: point estimates with approximate 95% confidence intervals
# (estimate +/- 1.96 standard errors), predictors ordered by estimate.
ggplot(coef_df, aes(x = reorder(Variable, Estimate), y = Estimate)) +
  geom_point(color = "blue", size = 3) +
  geom_errorbar(
    aes(
      ymin = Estimate - 1.96 * Std_Error,
      ymax = Estimate + 1.96 * Std_Error
    ),
    width = 0.2,
    color = "red"
  ) +
  coord_flip() +
  labs(
    title = "Coefficient Plot",
    x = "Variables",
    y = "Estimated Coefficient"
  ) +
  theme_minimal()

Assignment 3

Diego Durán Martínez

2025-03-03

Data

Regression