1. Sensitivity Analysis for Differential Measurement Error of the
Outcome
1) Data Generating Process for Differential Measurement Error of the
Outcome
# Set seed for reproducibility
set.seed(12345)
# Number of observations
n <- 1000
# Generate binary exposure A
A <- rbinom(n, 1, 0.5)
# Generate covariates C1, C2, and C3 on a 5-point Likert scale
C1 <- sample(1:5, n, replace = TRUE)
C2 <- sample(1:5, n, replace = TRUE)
C3 <- sample(1:5, n, replace = TRUE)
# Define helper functions to convert between logits and probabilities
logit <- function(p) { log(p / (1 - p)) }
prob <- function(logit) { exp(logit) / (1 + exp(logit)) }
# Generate binary outcome Y based on A and covariates
L <- -1 + 0.5*A + 0.1*C1 - 0.1*C2 + 0.2*C3
Y <- rbinom(n, 1, prob(L))
# Introduce differential measurement error for Y* based on A
# Here, we're making it more likely for Y* to be 1 when A is 1, even when conditioned on Y and C
Y_star <- ifelse(Y == 1 & A == 1, rbinom(n, 1, 0.9), 
                 ifelse(Y == 1 & A == 0, rbinom(n, 1, 0.7), 
                        ifelse(Y == 0 & A == 1, rbinom(n, 1, 0.3), rbinom(n, 1, 0.1))))
# Generate measured value A* (for simplicity, let's assume it's the same as A in this example)
A_star <- A
# Create dataframe
df <- data.frame(A, Y, C1, C2, C3, A_star, Y_star)
# View the first few rows of the dataset
head(df, n = 10)
##    A Y C1 C2 C3 A_star Y_star
## 1  1 0  2  4  1      1      0
## 2  1 0  4  1  5      1      1
## 3  1 1  4  5  1      1      1
## 4  1 1  4  5  5      1      1
## 5  0 0  1  1  3      0      1
## 6  0 1  5  5  5      0      1
## 7  0 1  5  4  4      0      0
## 8  1 1  1  2  3      1      1
## 9  1 0  1  4  3      1      0
## 10 1 0  1  1  1      1      0
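- As a quick check on the data-generating process, the empirical
classification probabilities P(Y* = 1 | Y, A) should land near the target
values of 0.9, 0.7, 0.3, and 0.1 used above (up to sampling variability).
A minimal sketch:
# Empirical P(Y* = 1 | Y, A), to compare against the target probabilities
aggregate(Y_star ~ Y + A, data = df, FUN = mean)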
 
2) Calculate Sensitivity Rates and False Positive Rates
- s_1 and s_0 are sensitivities: the probability that the mismeasured
outcome is recorded as present (Y* = 1) when the true outcome is present
(Y = 1), among the exposed (A = 1) and unexposed (A = 0), respectively.
- f_1 and f_0 are false-positive rates: the probability of incorrectly
classifying the outcome as present (Y* = 1) when it is actually absent
(Y = 0), again by exposure group.
# Calculate s_1, s_0, f_1, and f_0
s_1 <- mean(Y_star[Y == 1 & A == 1])
s_0 <- mean(Y_star[Y == 1 & A == 0])
f_1 <- mean(Y_star[Y == 0 & A == 1])
f_0 <- mean(Y_star[Y == 0 & A == 0])
list(sensitivity_Y1 = s_1, false_positive_Y1 = f_1, sensitivity_Y0 = s_0, false_positive_Y0 = f_0)
## $sensitivity_Y1
## [1] 0.8992806
## 
## $false_positive_Y1
## [1] 0.2766798
## 
## $sensitivity_Y0
## [1] 0.6989247
## 
## $false_positive_Y0
## [1] 0.07067138
 
3) Calculate Risk Ratio of Sensitivity Rates and False Positive
Rates
# Ratio of sensitivities across exposure groups (among Y = 1) and of false-positive rates (among Y = 0)
RR_Y1 <- s_1 / s_0
RR_Y0 <- f_1 / f_0
list(Risk_Ratio_Y1 = RR_Y1, Risk_Ratio_Y0 = RR_Y0)
## $Risk_Ratio_Y1
## [1] 1.286663
## 
## $Risk_Ratio_Y0
## [1] 3.91502
 
4) Calculate Probability of True/Mis-measured Outcome
- p_0 and p_1 represent the probability of the true outcome (Y) being 1
when the true exposure (A) is 0 or 1, calculated as the mean of Y within
each exposure group.
- p_star_0 and p_star_1 represent the probability of the mismeasured
outcome (Y_star) being 1 when the measured exposure (here assumed equal
to A, since only the outcome is mismeasured) is 0 or 1.
# Calculate p_a and p_a* for both A = 0 and A = 1
p_0 <- mean(Y[A == 0])
p_1 <- mean(Y[A == 1])
# Calculated as the mean of Y_star within each level of the measured exposure (A_star, equal to A here)
p_star_0 <- mean(Y_star[A_star == 0])
p_star_1 <- mean(Y_star[A_star == 1])
list(p_0 = p_0, p_1 = p_1, p_star_0 = p_star_0, p_star_1 = p_star_1)
## $p_0
## [1] 0.3965885
## 
## $p_1
## [1] 0.5235405
## 
## $p_star_0
## [1] 0.3198294
## 
## $p_star_1
## [1] 0.6026365
 
5) Calculate Risk Ratio of True Association and Mis-measured
Association
- true_RR represents the actual, underlying association between the
exposure and the outcome in the absence of any measurement error.
- observed_RR reflects the association as it appears in the data, which
may be affected by measurement error; it is the association that would be
observed and reported in a study.
# Calculate risk ratios
true_RR <- p_1 / p_0
observed_RR <- p_star_1 / p_star_0
list(true_RR = true_RR, observed_RR = observed_RR)
## $true_RR
## [1] 1.32011
## 
## $observed_RR
## [1] 1.884244
 
6) Calculate Lower Bound / Upper Bound based on True Risk Ratio
- Because true_RR is greater than 1 here, only a lower bound on the true
risk ratio is informative; the upper bound is infinite.
# Calculate bounds based on the theorem
if (true_RR >= 1) {
  lower_bound <- observed_RR / max(s_1/s_0, f_1/f_0)
  upper_bound <- Inf
} else {
  lower_bound <- 0
  upper_bound <- observed_RR / min(s_1/s_0, f_1/f_0)
}
list(upper_bound = upper_bound, lower_bound = lower_bound)
## $upper_bound
## [1] Inf
## 
## $lower_bound
## [1] 0.4812858
# Check robustness
if (observed_RR > 1) {
  if (s_1/s_0 >= observed_RR || f_1/f_0 >= observed_RR) {
    cat("The observed positive association may not be robust due to differential measurement error.\n")
  } else {
    cat("The observed positive association is likely robust to differential measurement error.\n")
  }
} else if (observed_RR < 1) {
  if (s_1/s_0 <= observed_RR || f_1/f_0 <= observed_RR) {
    cat("The observed negative association may not be robust due to differential measurement error.\n")
  } else {
    cat("The observed negative association is likely robust to differential measurement error.\n")
  }
} else {
  cat("The observed association is null.\n")
}
## The observed positive association may not be robust due to differential measurement error.
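- The bound and robustness logic above can be wrapped into a small helper
for reuse with other inputs. This is a minimal sketch (the function and
argument names are our own, not from a package), keyed to the observed
risk ratio since the true risk ratio is unknown in practice:
# Hypothetical helper: bound on the true RR under differential outcome misclassification
rr_bound <- function(observed_RR, s_1, s_0, f_1, f_0) {
  if (observed_RR >= 1) {
    list(lower_bound = observed_RR / max(s_1 / s_0, f_1 / f_0), upper_bound = Inf)
  } else {
    list(lower_bound = 0, upper_bound = observed_RR / min(s_1 / s_0, f_1 / f_0))
  }
}
# Example with the simulated values from above
rr_bound(observed_RR, s_1, s_0, f_1, f_0)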
 
 
2. Sensitivity Analysis for Differential Measurement Error of the
Exposure
1) Data Generating Process for Differential Measurement Error of the
Exposure
# Set seed for reproducibility
set.seed(12345)
# Number of observations
n <- 1000
# Generate binary exposure A
A <- rbinom(n, 1, 0.5)
# Generate covariates C1, C2, and C3 on a 5-point Likert scale
C1 <- sample(1:5, n, replace = TRUE)
C2 <- sample(1:5, n, replace = TRUE)
C3 <- sample(1:5, n, replace = TRUE)
# Define the logistic function to transform logits to probabilities
logit <- function(p) { log(p / (1 - p)) }
prob <- function(logit) { exp(logit) / (1 + exp(logit)) }
# Generate binary outcome Y based on A and covariates
L <- -1 + 0.5*A + 0.1*C1 - 0.1*C2 + 0.2*C3
Y <- rbinom(n, 1, prob(L))
# Introduce differential measurement error for A* based on Y
# Making it more likely for A* to be misclassified when Y is 1
A_star <- ifelse(A == 1 & Y == 1, rbinom(n, 1, 0.7), 
                 ifelse(A == 1 & Y == 0, rbinom(n, 1, 0.9), 
                        ifelse(A == 0 & Y == 1, rbinom(n, 1, 0.3), rbinom(n, 1, 0.1))))
# Create dataframe
df <- data.frame(A, Y, C1, C2, C3, A_star)
# View the first few rows of the dataset
head(df)
##   A Y C1 C2 C3 A_star
## 1 1 0  2  4  1      1
## 2 1 0  4  1  5      1
## 3 1 1  4  5  1      0
## 4 1 1  4  5  5      1
## 5 0 0  1  1  3      1
## 6 0 1  5  5  5      0
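- As in the outcome example, a quick check that the simulated
misclassification probabilities P(A* = 1 | A, Y) are close to the target
values of 0.7, 0.9, 0.3, and 0.1 used above (a sketch):
# Empirical P(A* = 1 | A, Y), to compare against the targets used above
with(df, tapply(A_star, list(A = A, Y = Y), mean))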
 
2) Calculate Sensitivity Rates and False Positive Rates
- s_1 and s_0 are sensitivities: the probability that the exposure is
correctly identified (A* = 1) when it is truly present (A = 1), given
that the outcome is present (Y = 1) or absent (Y = 0), respectively.
- f_1 and f_0 are false-positive rates: the probability that the exposure
is incorrectly identified as present (A* = 1) when it is actually absent
(A = 0), again given Y = 1 or Y = 0.
# For A = 1
s_1 <- mean(A_star[Y == 1 & A == 1])
s_0 <- mean(A_star[Y == 0 & A == 1])
# For A = 0
f_1 <- mean(A_star[Y == 1 & A == 0])
f_0 <- mean(A_star[Y == 0 & A == 0])
list(sensitivity_Y1 = s_1, sensitivity_Y0 = s_0, false_positive_Y1 = f_1, false_positive_Y0 = f_0)
## $sensitivity_Y1
## [1] 0.7194245
## 
## $sensitivity_Y0
## [1] 0.9051383
## 
## $false_positive_Y1
## [1] 0.344086
## 
## $false_positive_Y0
## [1] 0.07067138
 
3) Calculate Odds Ratios for Sensitivity and False-Positive
Probability
- These odds ratios describe how the odds of correctly (sensitivity) or
incorrectly (false positive) classifying the exposure change depending on
the state of Y.
OR_sensitivity <- (s_1 / (1 - s_1)) / (s_0 / (1 - s_0))
OR_false_positive <- (f_1 / (1 - f_1)) / (f_0 / (1 - f_0))
list(`Odds Ratio of sensitivity` = OR_sensitivity, `Odds Ratio of False Positivity` = OR_false_positive)
## $`Odds Ratio of sensitivity`
## [1] 0.2687269
## 
## $`Odds Ratio of False Positivity`
## [1] 6.898361
 
4) Calculate Correct and Incorrect Classification Ratios
- The r_c and r_i calculations assess the relative increase in correct
and incorrect classifications due to the presence of the outcome.
# Calculate the correct classification ratio
r_c <- (s_1 / s_0) / ((1 - f_1) / (1 - f_0))
# Calculate the incorrect classification ratio
r_i <- (f_1 / f_0) / ((1 - s_1) / (1 - s_0))
list(correct_classification_ratio = r_c, incorrect_classification_ratio = r_i)
## $correct_classification_ratio
## [1] 1.126141
## 
## $incorrect_classification_ratio
## [1] 1.646131
 
5) Calculate Odds Ratio of True Association and Mis-measured
Association
- true_OR represents the actual, underlying association between the
exposure and the outcome in the absence of any measurement error.
- observed_OR reflects the association as it appears in the data, which
may be affected by measurement error; it is the association that would be
observed and reported in a study.
# Calculate true and observed odds ratios
# Calculate p_a and p_a* for both Y = 0 and Y = 1
p_0 <- mean(A[Y == 0])
p_1 <- mean(A[Y == 1])
# It's calculated as the mean of A_star when Y is 0 / 1.
p_star_0 <- mean(A_star[Y == 0])
p_star_1 <- mean(A_star[Y == 1])
list(p_0 = p_0, p_1 = p_1, p_star_0 = p_star_0, p_star_1 = p_star_1)
## $p_0
## [1] 0.4720149
## 
## $p_1
## [1] 0.5991379
## 
## $p_star_0
## [1] 0.4645522
## 
## $p_star_1
## [1] 0.5689655
true_OR <- (p_1 / (1 - p_1)) / (p_0 / (1 - p_0))
observed_OR <- (p_star_1 / (1 - p_star_1)) / (p_star_0 / (1 - p_star_0))
 
6) Calculate Bounds Based on Theorem 2
- By Theorem 2, when the association is positive the true odds ratio of A
on Y must be at least as large as the observed odds ratio divided by the
maximum of the four measurement-error factors computed above
(OR_sensitivity, OR_false_positive, r_c, r_i).
# Calculate bounds based on the value of true_OR
if (true_OR >= 1) {
  lower_bound <- observed_OR / max(OR_sensitivity, OR_false_positive, r_c, r_i)
  result <- list(true_OR = true_OR, lower_bound = lower_bound)
} else {
  upper_bound <- observed_OR / min(OR_sensitivity, OR_false_positive, r_c, r_i)
  result <- list(true_OR = true_OR, upper_bound = upper_bound)
}
print(result)
## $true_OR
## [1] 1.671852
## 
## $lower_bound
## [1] 0.2205518
# Check robustness against the same bounding factors used in the theorem above
if (observed_OR > 1) {
  if (max(OR_sensitivity, OR_false_positive, r_c, r_i) >= observed_OR) {
    cat("The observed positive association may not be robust due to differential measurement error.\n")
  } else {
    cat("The observed positive association is likely robust to differential measurement error.\n")
  }
} else if (observed_OR < 1) {
  if (min(OR_sensitivity, OR_false_positive, r_c, r_i) <= observed_OR) {
    cat("The observed negative association may not be robust due to differential measurement error.\n")
  } else {
    cat("The observed negative association is likely robust to differential measurement error.\n")
  }
} else {
  cat("The observed association is null.\n")
}
## The observed positive association may not be robust due to differential measurement error.
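- As in the outcome case, the Theorem 2 bound can be wrapped into a small
reusable helper. A minimal sketch (the function name is our own), keyed
to the observed odds ratio:
# Hypothetical helper: bound on the true OR under differential exposure misclassification
or_bound <- function(observed_OR, OR_sensitivity, OR_false_positive, r_c, r_i) {
  factors <- c(OR_sensitivity, OR_false_positive, r_c, r_i)
  if (observed_OR >= 1) {
    list(lower_bound = observed_OR / max(factors))
  } else {
    list(upper_bound = observed_OR / min(factors))
  }
}
# Example with the values computed above
or_bound(observed_OR, OR_sensitivity, OR_false_positive, r_c, r_i)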
 
 
3. Sensitivity Analysis for Differential Measurement Error of the
Outcome Using Education Data
## import data
library(tidyverse)
library(haven)
library(stats)
setwd("/Users/jihoonchoi/Downloads/R_Personal")
1) Call HSLS Data
# Use data from HSLS, an open educational dataset
data <- read_sav("hsls_17_student_pets_sr_v1_0.sav")
data1 = data %>% 
  select(STU_ID, M1POORJOBRES, M1ADMSUPPORT, X1LOCALE, X1CONTROL)
# M1POORJOBRES = The principal does a poor job of getting resources for this school. (1= Strongly Agree, 2= Agree, 3= Disagree, 4= Strongly Disagree)
# M1ADMSUPPORT = Teaching is limited by inadequate administrative support (1= Not at all, 2= A little, 3= Some, 4= A lot)
# X1CONTROL = Indicates the control of the sample member's base year school (1= Pub, 2= Cat, 3= Other Prv)
# X1LOCALE = Indicates the locale (urbanicity) of the sample member's base year school (1= City, 2= Suburb, 3= Town, 4= Rural)
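- Before recoding, it can help to inspect the raw response distributions;
non-positive codes presumably flag missing or skipped responses, which is
why the next step keeps only positive values. A sketch:
# Inspect raw response codes, including any missing-data codes
table(data1$M1POORJOBRES, useNA = "ifany")
table(data1$M1ADMSUPPORT, useNA = "ifany")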
 
2) Mutate Variables
data2 = data1 %>% 
  filter(M1POORJOBRES > 0 & M1ADMSUPPORT > 0)
data3 = data2 %>% 
  mutate(X1LOCALE = as.factor(X1LOCALE), X1CONTROL = as.factor(X1CONTROL)) %>% 
  mutate(STU_ID = as.integer(STU_ID), M1POORJOBRES = as.integer(M1POORJOBRES), M1ADMSUPPORT = as.integer(M1ADMSUPPORT))
data4 = data3 %>% 
  mutate(poorjob = ifelse(M1POORJOBRES < 3, 1, 0)) %>% 
  mutate(admsup = ifelse(M1ADMSUPPORT >2, 1, 0)) %>% 
  mutate(poorjob = as.factor(poorjob), admsup = as.factor(admsup))
print(data4)
## # A tibble: 14,025 × 7
##    STU_ID M1POORJOBRES M1ADMSUPPORT X1LOCALE X1CONTROL poorjob admsup
##     <int>        <int>        <int> <fct>    <fct>     <fct>   <fct> 
##  1  10001            3            1 4        1         0       0     
##  2  10002            3            2 4        1         0       0     
##  3  10003            3            1 2        1         0       0     
##  4  10005            4            1 1        1         0       0     
##  5  10006            4            1 4        1         0       0     
##  6  10008            4            1 3        1         0       0     
##  7  10010            3            1 1        1         0       0     
##  8  10014            3            1 3        1         0       0     
##  9  10015            2            1 1        1         1       0     
## 10  10019            4            1 1        1         0       0     
## # ℹ 14,015 more rows
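3) Fit the Logistic Model for the Observed Association
- Step 4 below uses an observed odds ratio from a logistic model
(exp(1.76024)). The exact model specification is not shown in this
write-up; the sketch below is an assumption rather than a reproduction of
the original fit, regressing the binarized outcome poorjob on the
binarized exposure admsup with the school covariates.
# Hypothetical model sketch: the coefficient on admsup1 would supply the observed log-odds ratio
fit <- glm(poorjob ~ admsup + X1LOCALE + X1CONTROL, data = data4, family = binomial)
summary(fit)
exp(coef(fit)["admsup1"])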
 
4) Introduce Hypothetical Mismeasured Situation for Comparison
- We posit a hypothetical mismeasured version of the data for comparison,
with misclassification rates of roughly 10% to 20%.
- Using these hypothetical rates, we can gauge the threshold strength of
differential measurement error that would be needed to explain away the
observed odds ratio.
# Hypothetical sensitivity and specificity
s_1 <- 0.9  # Sensitivity when admsup is present
s_0 <- 0.8  # Sensitivity when admsup is absent
f_1 <- 1 - 0.85  # False-positive rate when admsup is present
f_0 <- 1 - 0.95  # False-positive rate when admsup is absent
# Calculate odds ratios of sensitivity and false-positive rates
OR_sensitivity <- (s_1 / (1 - s_1)) / (s_0 / (1 - s_0))
OR_false_positive <- (f_1 / (1 - f_1)) / (f_0 / (1 - f_0))
# Calculating correct and incorrect classification ratios
r_c <- (s_1 / s_0) / ((1 - f_1) / (1 - f_0))
r_i <- (f_1 / f_0) / ((1 - s_1) / (1 - s_0))
# Observed Odds Ratio from logistic model
OR_observed <- exp(1.76024)  
# Applying Theorem 2: a lower bound on the true OR, given the maximum strength of measurement error
if (OR_observed >= 1) {
    lower_bound <- OR_observed / max(OR_sensitivity, OR_false_positive, r_c, r_i)
} else {
    upper_bound <- OR_observed / min(OR_sensitivity, OR_false_positive, r_c, r_i)
}
# Display the results
cat("Lower Bound: ", lower_bound, "\n")
## Lower Bound:  0.9689721
- Because this lower bound falls below 1, differential measurement error
of the hypothesized magnitude would be enough to explain away the
observed positive association.
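- To locate the threshold more systematically, one can sweep a grid of
hypothetical sensitivities and false-positive rates and flag the
combinations whose bounding factor is large enough to pull the lower
bound below 1. A sketch using the same formulas as above; the grid values
are illustrative:
# Grid of hypothetical error rates (illustrative values)
grid <- expand.grid(s_1 = c(0.85, 0.90, 0.95), s_0 = c(0.75, 0.80, 0.85),
                    f_1 = c(0.10, 0.15, 0.20), f_0 = c(0.05, 0.10))
grid$bound_factor <- with(grid, pmax((s_1 / (1 - s_1)) / (s_0 / (1 - s_0)),
                                     (f_1 / (1 - f_1)) / (f_0 / (1 - f_0)),
                                     (s_1 / s_0) / ((1 - f_1) / (1 - f_0)),
                                     (f_1 / f_0) / ((1 - s_1) / (1 - s_0))))
grid$lower_bound <- OR_observed / grid$bound_factor
# Combinations strong enough to explain away the observed OR (lower bound below 1)
subset(grid, lower_bound < 1)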