Research Report

Recoding

Using Italy ESS Data, to interpret impact education has on choosing a high paying job

italy_task1 <- italy_data %>%
  mutate(
    ID = case_when(
      ipjbhin %in% 0:4 ~ "high income",                     
      ipjbhin %in% 6:10 ~ "low income",                  
      ipjbhin %in% c(77, 88) ~ NA_character_,    
      TRUE ~ NA_character_                           
    ),
    educ.ba = case_when(
      essround < 5 & edulvla == 5 ~ "BA or more",
      essround >= 5 & edulvlb > 600 ~ "BA or more",
      TRUE ~ "No BA"
    ),
    edulvla = ifelse(edulvla %in% c(77, 88, 99), NA_integer_, edulvla),
    edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
    educ.ba = factor(educ.ba, levels = c("No BA", "BA or more"))  
  ) %>%
  filter(!is.na(ID), !is.na(educ.ba))

table(italy_task1$educ.ba)

## 
##      No BA BA or more 
##          0          0

# Now, proceed with the Chi-Square test for italy_data with the recoded variables
mytab <- table(italy_task1$ID, italy_task1$educ.ba)
mytab

## < table of extent 0 x 2 >

rsums <- rowSums(mytab)
csums <- colSums(mytab)
N <- sum(mytab)
expected <- outer(rsums, csums) / N
expected

##      No BA BA or more

alpha <- 0.05
df <- (nrow(mytab) - 1) * (ncol(mytab) - 1)
critical_value <- qchisq(1 - alpha, df)

## Warning in qchisq(1 - alpha, df): NaNs produced

observed <- mytab
chi_squared <- sum((observed - expected)^2 / expected)

p_value <- pchisq(chi_squared, df, lower.tail = FALSE)

## Warning in pchisq(chi_squared, df, lower.tail = FALSE): NaNs produced

cat("Critical Value:", critical_value, "\n")

## Critical Value: NaN

cat("Chi-Squared Statistic:", chi_squared, "\n")

## Chi-Squared Statistic: 0

cat("P-Value:", p_value, "\n")

## P-Value: NaN

##Interpreting output

italy_task2 <- italy_data %>%
  mutate(
    geo = recode(as.character(domicil), 
                 '1' = "High Income", 
                 '2' = "Lower Income",
                 '3' = NA_character_,
                 '4' = NA_character_,
                 '5' = NA_character_,
                 '7' = NA_character_,
                 '8' = NA_character_,
                 '9' = NA_character_),
    socben = case_when(
      sbbsntx == 1 ~ "Agree Strongly",  
      sbbsntx == 2 ~ "Agree",
      sbbsntx == 3 ~ "Neither agree nor disagree",
      sbbsntx == 4 ~ "Disagree",
      sbbsntx == 5 ~ "Disagree strongly",
      sbbsntx %in% c(7, 8, 9) ~ NA_character_, 
      TRUE ~ as.character(sbbsntx)
    )
  ) %>%
  filter(!is.na(geo), !is.na(socben))  # Ensuring to remove NA values for cleaner analysis

table(italy_task2$geo)

## 
##  High Income Lower Income 
##          245          135

table(italy_task2$socben)

## 
##                      Agree             Agree Strongly 
##                        110                         37 
##                   Disagree          Disagree strongly 
##                         63                         22 
## Neither agree nor disagree 
##                        148

# Step 1: Calculate the test statistic
test_stat <- italy_task2 %>%
  specify(response = socben, explanatory = geo) %>%
  hypothesize(null = "independence") %>%
  calculate(stat = "Chisq")

print(test_stat$stat)

## X-squared 
##  12.72196

# Steps 2 & 3: Simulate the null distribution and calculate the p-value
null_distribution <- italy_task2 %>%
  specify(response = socben, explanatory = geo) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "Chisq")

# p-value
p_value <- null_distribution %>%
  get_pvalue(obs_stat = test_stat, direction = "greater")

p_value

## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1   0.004

##Visualizing Null Distribution

conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")

null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "greater") +
  shade_confidence_interval(endpoints = conf_int)

Research Report

Raya Raposo

Recoding