Poisson ER

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(Rcpp)
library(htmltools)
library(knitr)
library(skimr)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(ggplot2)
library(ggplot2)
library(patchwork)
library(plyr)

## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## 
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## The following object is masked from 'package:purrr':
## 
##     compact

library(PKNCA)
library(janitor)

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(broom)
library(glue)
library(plotly)

## 
## Attaching package: 'plotly'
## 
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

raw dataset

rm(list = ls())
# file.choose()
"/global/pkms2/data/Test/Test.PC/PoissonERM/PoissonERM-Example-main/obsdata.csv"%>% 
    read_csv() %>% 
    clean_names() %>% 
    filter(flag==2)->raw

## Rows: 3000 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (13): PROT, ID, SEX, RACE, LOCATION, AGE, BWT, TIME, CAVE1, CAVE2, FLAG,...
## lgl  (1): C
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

raw %>% 
    count('dv')

##   dv freq
## 1  0  482
## 2  1  504
## 3 NA   14

cleaned_raw <- raw %>% filter(!is.na(dv))  
summary(raw$cave2)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   33.58   48.58   53.92   70.05  194.39

summary(cleaned_raw$cave2)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   33.57   48.62   53.99   70.01  194.39

#Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 #  0.00   33.58   48.58   53.92   70.05  194.39 

raw$cave2squart<-ifelse(raw$cave2<=33.58,"1Q",
                                     ifelse(raw$cave2<=48.58,"2Q",
                                            ifelse(raw$cave2<=70.05,"3Q","4Q")))

cleaned_raw <- raw %>% filter(!is.na(dv)) ####mutate(time=300)

# Fit the logistic regression model
POI <-glm(dv ~ cave2+offset(log(time)), family = poisson(link = "log"), data = cleaned_raw)
LOG <- glm(formula = dv ~ cave2, family = binomial(link = "logit"), data = cleaned_raw)

summary(LOG)

## 
## Call:
## glm(formula = dv ~ cave2, family = binomial(link = "logit"), 
##     data = cleaned_raw)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8751  -1.1382   0.6841   1.1664   1.4926  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.715971   0.135863   -5.27 1.37e-07 ***
## cave2        0.014233   0.002266    6.28 3.38e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1366.4  on 985  degrees of freedom
## Residual deviance: 1323.1  on 984  degrees of freedom
## AIC: 1327.1
## 
## Number of Fisher Scoring iterations: 4

summary(POI)

## 
## Call:
## glm(formula = dv ~ cave2 + offset(log(time)), family = poisson(link = "log"), 
##     data = cleaned_raw)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6357  -0.9383  -0.1656   0.5523   2.0911  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.302518   0.090915 -69.323  < 2e-16 ***
## cave2        0.006055   0.001316   4.601 4.21e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 826.12  on 985  degrees of freedom
## Residual deviance: 806.16  on 984  degrees of freedom
## AIC: 1818.2
## 
## Number of Fisher Scoring iterations: 5

cleaned_raw <- cleaned_raw %>%
  mutate(
    pred_poi = predict(POI, type = "response"),
    pred_log = predict(LOG, type = "response")
  )

logistic_eq <- paste0("Logistic: logit(p) = ", round(coef(LOG)[1], 3), " + ", round(coef(LOG)[2], 3), " * cave2, AIC=", round(AIC(LOG), 2))
poisson_eq <- paste0("Poisson: log(λ) = ", round(coef(POI)[1], 3), " + ",  round(coef(POI)[2], 3), " * cave2 + log(time), AIC=", round(AIC(POI),2))

sm_cave <- ddply(cleaned_raw, c("cave2squart"), summarise,
                 mc = median(cave2),
                 n = length(dv),
                 rsp = sum(dv),
                 rate = round(rsp/n, dig = 2),
                 lo = ifelse(rsp > 0 & rsp < n, 
                             round(prop.test(rsp, n)$conf.int[1], dig = 4), 
                             NA),
                 hi = ifelse(rsp > 0 & rsp < n, 
                             round(prop.test(rsp, n)$conf.int[2], dig = 4), 
                             NA)
)

ggplot(cleaned_raw, aes(x = cave2, y = dv)) + 
  geom_point(color = "blue")+ 
  theme_bw() +
  stat_smooth(method = glm, method.args = list(family = "binomial"), 
              aes(color = "Logistic", linetype = "Logistic"), se = FALSE) +  
  stat_smooth(method = glm, method.args = list(family = "poisson"), 
              aes(color = "Poisson", linetype = "Poisson"), se = FALSE) +  
  geom_errorbar(data = sm_cave, aes(x = mc, y = rate, ymin = lo, ymax = hi), 
                width = 0.2, position = position_dodge(0.05)) +
  geom_point(data = sm_cave, aes(x = mc, y = rate), color = "black") +
  xlab("cave2") + ylab("Rate of DV (AE2)") +
  labs(title = "Comparison of Logistic and Poisson Fits", 
       color = "Model", linetype = "Model") +  # Add labels for the legend
  theme(legend.position = "top") +
    annotate("text", x = 0, y = 0.2, label = logistic_eq, size = 4, hjust = 0) +
  annotate("text", x = 0, y = 0.1, label = poisson_eq,  size = 4, hjust = 0)

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

ggplot(cleaned_raw, aes(x = cave2, y = dv)) + 
  geom_point(color = "blue", alpha = 0.6) +  # Raw data points
  geom_line(aes(y = pred_poi, color = "Poisson", linetype = "Poisson"), size = 1) +  # Poisson fit
  geom_line(aes(y = pred_log, color = "Logistic", linetype = "Logistic"), size = 1) +  # Logistic fit
  geom_errorbar(data = sm_cave, aes(x = mc, y = rate, ymin = lo, ymax = hi), 
                width = 0.2, position = position_dodge(0.05)) +
  geom_point(data = sm_cave, aes(x = mc, y = rate), color = "black") +
  xlab("cave2") + ylab("Rate of DV (AE2)") +
  labs(
    title = "Comparison of Logistic and Poisson Fits", 
    color = "Model", linetype = "Model"
  ) +  # Add labels for the legend
  theme_bw() +
  theme(legend.position = "top") +
  annotate("text", x = 0, y = 0.2, label = logistic_eq, size = 3, hjust = 0) +
  annotate("text", x = 0, y = 0.1, label = poisson_eq, size = 3, hjust = 0)

########more cut, 10
 cutoff_10 <- quantile(cleaned_raw$cave2, probs = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9), na.rm = TRUE)
    
    raww <- cleaned_raw %>%
      mutate(cave2_10 = cut(
        cave2, 
        breaks = c(-Inf, cutoff_10, Inf), 
        labels = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10")
      ))
    
    p <- raww %>% summarise(mean_dv = mean(dv, na.rm = TRUE)) %>% pull(mean_dv)
    n <- nrow(raww)
    
    POI <- glm(dv ~ cave2 + offset(log(time)), family = poisson(link = "log"), data = raww)
    LOG <- glm(dv ~ cave2, family = binomial(link = "logit"), data = raww)
    
    POI_summary <- tidy(POI) %>% mutate(model = "Poisson")
    LOG_summary <- tidy(LOG) %>% mutate(model = "Logistic")
    
    logistic_p_value <- LOG_summary %>% filter(term == "cave2") %>% pull(p.value)
    poisson_p_value <- POI_summary %>% filter(term == "cave2") %>% pull(p.value)
    
   logistic_p_value_int <- LOG_summary %>% filter(term == "(Intercept)") %>% pull(p.value)
    poisson_p_value_int <- POI_summary %>% filter(term == "(Intercept)") %>% pull(p.value)
    
    combined_summary <- bind_rows(POI_summary, LOG_summary)
    
    sm_cave_10 <- raww %>%
      group_by(cave2_10) %>%
      dplyr::summarise(
        mc = median(cave2),
        n = n(),
        rsp = sum(dv),
        rate = round(rsp / n, 2),
        lo = ifelse(rsp > 0 & rsp < n, round(prop.test(rsp, n)$conf.int[1], 4), NA),
        hi = ifelse(rsp > 0 & rsp < n, round(prop.test(rsp, n)$conf.int[2], 4), NA)
      )
    
ggplot(raww, aes(x = cave2, y = dv)) + 
  geom_point(color = "blue")+ 
  theme_bw() +
  stat_smooth(method = glm, method.args = list(family = "binomial"), 
              aes(color = "Logistic", linetype = "Logistic"), se = FALSE) +  
  stat_smooth(method = glm, method.args = list(family = "poisson"), 
              aes(color = "Poisson", linetype = "Poisson"), se = FALSE) +  
  geom_errorbar(data = sm_cave_10, aes(x = mc, y = rate, ymin = lo, ymax = hi), 
                width = 0.2, position = position_dodge(0.05)) +
  geom_point(data = sm_cave_10, aes(x = mc, y = rate), color = "black") +
  xlab("cave2") + ylab("Rate of DV (AE2)") +
  labs(title = "Comparison of Logistic and Poisson Fits", 
       color = "Model", linetype = "Model") +  # Add labels for the legend
  theme(legend.position = "top") +
    annotate("text", x = 0, y = 0.2, label = logistic_eq, size = 5, hjust = 0) +
  annotate("text", x = 0, y = 0.1, label = poisson_eq,  size = 5, hjust = 0)

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

#######cut 20
########more cut
 cutoff_20 <- quantile(cleaned_raw$cave2, probs = c(0.2, 0.4, 0.6, 0.8, 0.9, 0.93, 0.96, 0.99, 0.995 ), na.rm = TRUE)
    
    raww <- cleaned_raw %>%
      mutate(cave2_20 = cut(
        cave2, 
        breaks = c(-Inf, cutoff_20, Inf), 
        labels = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10")
      ))
    
    p <- raww %>% summarise(mean_dv = mean(dv, na.rm = TRUE)) %>% pull(mean_dv)
    n <- nrow(raww)
    
    POI <- glm(dv ~ cave2 + offset(log(time)), family = poisson(link = "log"), data = raww)
    LOG <- glm(dv ~ cave2, family = binomial(link = "logit"), data = raww)
    
    POI_summary <- tidy(POI) %>% mutate(model = "Poisson")
    LOG_summary <- tidy(LOG) %>% mutate(model = "Logistic")
    
    logistic_p_value <- LOG_summary %>% filter(term == "cave2") %>% pull(p.value)
    poisson_p_value <- POI_summary %>% filter(term == "cave2") %>% pull(p.value)
    
   logistic_p_value_int <- LOG_summary %>% filter(term == "(Intercept)") %>% pull(p.value)
    poisson_p_value_int <- POI_summary %>% filter(term == "(Intercept)") %>% pull(p.value)
    
    combined_summary <- bind_rows(POI_summary, LOG_summary)
    
    sm_cave_20 <- raww %>%
      group_by(cave2_20) %>%
      dplyr::summarise(
        mc = median(cave2),
        n = n(),
        rsp = sum(dv),
        rate = round(rsp / n, 2),
        lo = ifelse(rsp > 0 & rsp < n, round(prop.test(rsp, n)$conf.int[1], 4), NA),
        hi = ifelse(rsp > 0 & rsp < n, round(prop.test(rsp, n)$conf.int[2], 4), NA)
      )

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

ggplot(raww, aes(x = cave2, y = dv)) + 
  geom_point(color = "blue")+ 
  theme_bw() +
  stat_smooth(method = glm, method.args = list(family = "binomial"), 
              aes(color = "Logistic", linetype = "Logistic"), se = FALSE) +  
  stat_smooth(method = glm, method.args = list(family = "poisson"), 
              aes(color = "Poisson", linetype = "Poisson"), se = FALSE) +  
  geom_errorbar(data = sm_cave_20, aes(x = mc, y = rate, ymin = lo, ymax = hi), 
                width = 0.2, position = position_dodge(0.05)) +
  geom_point(data = sm_cave_20, aes(x = mc, y = rate), color = "black") +
  xlab("cave2") + ylab("Rate of DV (AE2)") +
  labs(title = "Comparison of Logistic and Poisson Fits", 
       color = "Model", linetype = "Model") +  # Add labels for the legend
  theme(legend.position = "top") +
    annotate("text", x = 0, y = 0.2, label = logistic_eq, size = 5, hjust = 0) +
  annotate("text", x = 0, y = 0.1, label = poisson_eq,  size = 5, hjust = 0)

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

pull from the dataset, from both positive and negative

n<-20
id0 <- raw %>% pull(id) %>% unique %>% sample(size=n)

raw %>% 
  filter(id%in%id0) %>% 
        ggplot(aes(x = id %>% as.factor ,y = time, fill = dv %>% as.factor)) +
    geom_col(alpha=.2) +
        theme(legend.position="top")+
  coord_flip()

ID <- raw %>% pull(id) %>% unique
N<-50

data1 <- tibble(n=seq(5 %>% log, 500 %>% log, length.out=N) %>% exp %>% round(0)) %>% 
  mutate(
    var=map(n, .progress = T, ~{ 
      .x -> n
      id0 <- sample(ID, n)
      raww <-  raw %>% filter(id %in%id0)
      p <-  raww %>%
        summarise(mean(dv, na.rm=T)) %>% 
        clean_names() %>% 
        pull(mean_dv_na_rm_t)
      POI <- glm(dv ~ cave2 + offset(log(time)), 
             family = poisson(link = "log"), data = raww)
      LOG <- glm(dv ~ cave2, family = binomial(link = "logit"), data = raww)
      summary(POI)
      
      POI %>% 
        tidy %>% 
        mutate(flag="poisson") %>% 
        full_join(LOG %>% tidy %>% mutate(flag="logistic"))%>% 
        mutate(p=p) %>% 
        suppressMessages()
      }
    )
  ) %>% 
  suppressMessages()

## Warning: glm.fit: fitted rates numerically 0 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

nn<-0.5

data1 %>% 
  unnest() %>% filter(term=="cave2") %>% 
  clean_names() %>% 
        ggplot(aes(x = n ,y = p_value, col = flag %>% as.factor())) +
    geom_point(position = position_dodge(nn),aes(alpha=p)) +
    geom_line(position = position_dodge(nn)) +
        theme(legend.position="top")+
  geom_hline(yintercept = 0.05, linetype="dotted")->p

## Warning: `cols` is now required when using unnest().
## Please use `cols = c(var)`

p %>% ggplotly

## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## Please use `gather()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

data1 %>% 
  unnest() %>% filter(term=="cave2") %>% 
  clean_names() %>% 
        ggplot(aes(x = n ,y = p_value, col = flag %>% as.factor())) +
    geom_point(position = position_dodge(nn),aes(alpha=p)) +
    geom_line(position = position_dodge(nn)) +
        theme(legend.position="top")+
  geom_hline(yintercept = 0.05, linetype="dotted")->p1

## Warning: `cols` is now required when using unnest().
## Please use `cols = c(var)`

p %>% ggplotly

dataset 1 time sensitivity

#####when event rate is 50%
N <- 50
data1_1 <- tibble(t = seq(log(1), log(500), length.out = N) %>% exp() %>% round(0)) %>%
  mutate(
    var = map(t, ~{
      raww <- raw %>% mutate(time = .x)
      
      POI <- glm(dv ~ cave2 + offset(log(time)), family = poisson(link = "log"), data = raww)
      LOG <- glm(dv ~ cave2, family = binomial(link = "logit"), data = raww)
      
      poisson_results <- tidy(POI) %>% mutate(model = "poisson")
      logistic_results <- tidy(LOG) %>% mutate(model = "logistic")
      
      bind_rows(poisson_results, logistic_results) %>%
        mutate(time = .x)
    })
  ) %>%
  unnest(cols = c(var))

nn <- 0.5

# Plot for p-values (cave2)
p_1 <- data1_1 %>%
  filter(term == "cave2") %>%
  ggplot(aes(x = time, y = p.value, color = model)) +
  geom_point(position = position_dodge(nn)) +
  geom_line(position = position_dodge(nn)) +
  geom_hline(yintercept = 0.05, linetype = "dotted", color = "red") +
  labs(title = "p-values (cave2) across Time", x = "Time", y = "p-value") +
  theme(legend.position = "top") +
  scale_color_manual(values = c("poisson" = "blue", "logistic" = "green"))

# Plot for slope estimates (sigma)
p1_1 <- data1_1 %>%
  filter(term == "cave2") %>%
  ggplot(aes(x = time, y = estimate, color = model)) +
  geom_point(position = position_dodge(nn)) +
  geom_line(position = position_dodge(nn)) +
  labs(title = "Slope Estimates (cave2) across Time", x = "Time", y = "Slope Estimate (Sigma)") +
  theme(legend.position = "top") +
  scale_color_manual(values = c("poisson" = "blue", "logistic" = "green"))

# Plot for intercept estimates
p_intercept_1 <- data1_1 %>%
  filter(term == "(Intercept)") %>%
  ggplot(aes(x = time, y = estimate, color = model)) +
  geom_point(position = position_dodge(nn)) +
  geom_line(position = position_dodge(nn)) +
  labs(title = "Intercept Estimates across Time", x = "Time", y = "Intercept Estimate") +
  theme(legend.position = "top") +
  scale_color_manual(values = c("poisson" = "blue", "logistic" = "green"))

# Plot for p-values of intercept
p_intercept_pval_1 <- data1_1 %>%
  filter(term == "(Intercept)") %>%
  ggplot(aes(x = time, y = p.value, color = model)) +
  geom_point(position = position_dodge(nn)) +
  geom_line(position = position_dodge(nn)) +
  geom_hline(yintercept = 0.05, linetype = "dotted", color = "red") +
  labs(title = "Intercept p-values across Time", x = "Time", y = "p-value") +
  theme(legend.position = "top") +
  scale_color_manual(values = c("poisson" = "blue", "logistic" = "green"))

p_1_interactive <- ggplotly(p_1)
p1_1_interactive <- ggplotly(p1_1)
p_intercept_1_interactive <- ggplotly(p_intercept_1)
p_intercept_pval_1_interactive <- ggplotly(p_intercept_pval_1)

p_1_interactive

p1_1_interactive

p_intercept_1_interactive

p_intercept_pval_1_interactive

# Event Rate: 0.8
id_p_0 <- raw %>% filter(dv == 1) %>% pull(id) %>% unique  ### positive id 504
id_n_0 <- raw %>% filter(dv == 0) %>% pull(id) %>% unique  ### negative id 482

id_p <- sample(id_p_0, length(id_p_0) * 1)
id_n <- sample(id_n_0, length(id_n_0) * 0.25)

raw0.8 <- raw %>% 
  filter(id %in% c(id_p, id_n)) %>% suppressMessages()

data1_2 <- tibble(t = seq(log(1), log(500), length.out = N) %>% exp() %>% round(0)) %>%
  mutate(
    var = map(t, ~{
      raww <- raw0.8 %>% mutate(time = .x)
      
      POI <- glm(dv ~ cave2 + offset(log(time)), family = poisson(link = "log"), data = raww)
      LOG <- glm(dv ~ cave2, family = binomial(link = "logit"), data = raww)
      
      poisson_results <- tidy(POI) %>% mutate(model = "poisson")
      logistic_results <- tidy(LOG) %>% mutate(model = "logistic")
      
      bind_rows(poisson_results, logistic_results) %>%
        mutate(time = .x)
    })
  ) %>%
  unnest(cols = c(var))

# Plot for p-values (cave2)
p_2 <- data1_2 %>%
  filter(term == "cave2") %>%
  ggplot(aes(x = time, y = p.value, color = model)) +
  geom_point(position = position_dodge(nn)) +
  geom_line(position = position_dodge(nn)) +
  geom_hline(yintercept = 0.05, linetype = "dotted", color = "red") +
  labs(title = "p-values (cave2) across Time", x = "Time", y = "p-value") +
  theme(legend.position = "top") +
  scale_color_manual(values = c("poisson" = "blue", "logistic" = "green"))

# Plot for slope estimates (sigma)
p1_2 <- data1_2 %>%
  filter(term == "cave2") %>%
  ggplot(aes(x = time, y = estimate, color = model)) +
  geom_point(position = position_dodge(nn)) +
  geom_line(position = position_dodge(nn)) +
  labs(title = "Slope Estimates (cave2) across Time", x = "Time", y = "Slope Estimate (Sigma)") +
  theme(legend.position = "top") +
  scale_color_manual(values = c("poisson" = "blue", "logistic" = "green"))

# Plot for intercept estimates
p_intercept <- data1_2 %>%
  filter(term == "(Intercept)") %>%
  ggplot(aes(x = time, y = estimate, color = model)) +
  geom_point(position = position_dodge(nn)) +
  geom_line(position = position_dodge(nn)) +
  labs(title = "Intercept Estimates across Time", x = "Time", y = "Intercept Estimate") +
  theme(legend.position = "top") +
  scale_color_manual(values = c("poisson" = "blue", "logistic" = "green"))

# Plot for p-values of intercept
p_intercept_pval <- data1_2 %>%
  filter(term == "(Intercept)") %>%
  ggplot(aes(x = time, y = p.value, color = model)) +
  geom_point(position = position_dodge(nn)) +
  geom_line(position = position_dodge(nn)) +
  geom_hline(yintercept = 0.05, linetype = "dotted", color = "red") +
  labs(title = "Intercept p-values across Time", x = "Time", y = "p-value") +
  theme(legend.position = "top") +
  scale_color_manual(values = c("poisson" = "blue", "logistic" = "green"))

p_2_interactive <- ggplotly(p_2)
p1_2_interactive <- ggplotly(p1_2)
p_intercept_interactive <- ggplotly(p_intercept)
p_intercept_pval_interactive <- ggplotly(p_intercept_pval)

p_2_interactive

p1_2_interactive

p_intercept_interactive

p_intercept_pval_interactive

N <- 50
time_values <- seq(log(1), log(500), length.out = N) %>% exp() %>% round(0)

data_fits <- map_dfr(time_values, function(time_point) {
  raw %>%
    mutate(time = time_point) %>%
    mutate(fit_time = paste("Time =", time_point))  
})

logistic_fits_plot <- data_fits %>%
  ggplot(aes(x = cave2, y = dv, color = fit_time, linetype = fit_time)) +
  stat_smooth(method = glm, method.args = list(family = "binomial"), se = FALSE) +
  labs(title = "Logistic Regression Fits Across Different Time",
       x = "Cave2", y = "Predicted Probability", color = "Time", linetype = "Time") +
  theme_minimal() +
  theme(legend.position = "right")

poisson_fits_plot <- data_fits %>%
  ggplot(aes(x = cave2, y = dv, color = fit_time, linetype = fit_time)) +
  stat_smooth(method = glm, method.args = list(family = "poisson"), se = FALSE) +
  labs(title = "Poisson Regression Fits Across Different Time",
       x = "Cave2", y = "Predicted Count", color = "Time", linetype = "Time") +
  theme_minimal() +
  theme(legend.position = "right")

logistic_fits_interactive <- ggplotly(logistic_fits_plot)

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 700 rows containing non-finite values (stat_smooth).

poisson_fits_interactive <- ggplotly(poisson_fits_plot)

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 700 rows containing non-finite values (stat_smooth).

logistic_fits_interactive

poisson_fits_interactive

pull different numbers of id from positive and negative

id_p_0 <- raw %>%filter(dv==1) %>%  pull(id) %>% unique  ###positive id 504
id_n_0 <- raw %>%filter(dv==0) %>%  pull(id) %>% unique   ### negative id  482
per <- c(seq(0.05, 1, .1),1)

# NN<-200


data2 <- tibble(crossing(var1=per, var2=per)) %>% 
  mutate(varr=map2(var1, var2, .progress = T, ~{
    .x->per1
    .y->per2
    id_p <- sample(id_p_0, id_p_0 %>% length*per1)
    id_n <- sample(id_n_0, id_n_0 %>% length*per2)
    raww <- raw %>% 
      filter(id%in%c(id_p, id_n)) %>% suppressMessages()
    p <- raww %>%
        summarise(mean(dv, na.rm=T)) %>% 
        clean_names() %>% pull(mean_dv_na_rm_t) %>% 
      suppressMessages()
    n <- raww %>% nrow()
      
      
POI <- glm(dv ~ cave2 + offset(log(time)), 
             family = poisson(link = "log"), data = raww)


# Fit the logistic regression model
LOG <- glm(dv ~ cave2, family = binomial(link = "logit"), data = raww)

summary(POI)

POI %>% tidy %>% mutate(flag="poisson") %>% full_join(
  LOG %>% tidy %>% mutate(flag="logistic")
)%>% mutate(p=p, n=n) %>% suppressMessages()


  }))


#####################
data2

## # A tibble: 121 × 3
##     var1  var2 varr            
##    <dbl> <dbl> <list>          
##  1  0.05  0.05 <tibble [4 × 8]>
##  2  0.05  0.15 <tibble [4 × 8]>
##  3  0.05  0.25 <tibble [4 × 8]>
##  4  0.05  0.35 <tibble [4 × 8]>
##  5  0.05  0.45 <tibble [4 × 8]>
##  6  0.05  0.55 <tibble [4 × 8]>
##  7  0.05  0.65 <tibble [4 × 8]>
##  8  0.05  0.75 <tibble [4 × 8]>
##  9  0.05  0.85 <tibble [4 × 8]>
## 10  0.05  0.95 <tibble [4 × 8]>
## # … with 111 more rows

data2 %>%unnest()

## Warning: `cols` is now required when using unnest().
## Please use `cols = c(varr)`

## # A tibble: 484 × 10
##     var1  var2 term        estimate std.error stati…¹  p.value flag      p     n
##    <dbl> <dbl> <chr>          <dbl>     <dbl>   <dbl>    <dbl> <chr> <dbl> <int>
##  1  0.05  0.05 (Intercept)  -7.26     0.629    -11.5  8.82e-31 pois… 0.510    49
##  2  0.05  0.05 cave2         0.0193   0.00877    2.21 2.75e- 2 pois… 0.510    49
##  3  0.05  0.05 (Intercept)  -2.32     0.991     -2.34 1.90e- 2 logi… 0.510    49
##  4  0.05  0.05 cave2         0.0399   0.0159     2.50 1.23e- 2 logi… 0.510    49
##  5  0.05  0.15 (Intercept)  -7.29     0.426    -17.1  8.80e-66 pois… 0.258    97
##  6  0.05  0.15 cave2         0.0125   0.00566    2.22 2.65e- 2 pois… 0.258    97
##  7  0.05  0.15 (Intercept)  -2.08     0.539     -3.86 1.15e- 4 logi… 0.258    97
##  8  0.05  0.15 cave2         0.0176   0.00804    2.19 2.83e- 2 logi… 0.258    97
##  9  0.05  0.25 (Intercept)  -7.64     0.434    -17.6  2.06e-69 pois… 0.172   145
## 10  0.05  0.25 cave2         0.0123   0.00668    1.85 6.49e- 2 pois… 0.172   145
## # … with 474 more rows, and abbreviated variable name ¹statistic

a<-0.1

forplot <- data2 %>%  unnest()

## Warning: `cols` is now required when using unnest().
## Please use `cols = c(varr)`

p1 <- forplot %>% 
  ggplot(aes(x = p ,y = estimate, col = flag %>% as.factor)) +
    geom_point(alpha=a) +
    geom_line(alpha=a) +
        #geom_smooth(method = "lm", se = T) +
        #geom_col(position = position_dodge(width = .25), alpha = .5) +
        #scale_y_log10()+
        facet_wrap(~ term, scales = "free") +
        theme(legend.position="top")+
  geom_smooth()+
  labs(x="event rate(%)")+
  geom_vline(xintercept =raw %>% summarise(m=dv %>% mean(na.rm=T)) %>% pull(m), linetype="dotted")




forplot <-data2 %>% 
  unnest() %>% 
  clean_names()

## Warning: `cols` is now required when using unnest().
## Please use `cols = c(varr)`

p2 <- forplot%>% 
        ggplot(aes(x = p ,y = p_value, col = flag %>% as.factor)) +
    geom_point(alpha=a) +
    geom_line(alpha=a) +
        #geom_smooth(method = "lm", se = T) +
        #geom_col(position = position_dodge(width = .25), alpha = .5) +
        #scale_y_log10()+
        facet_wrap(~ term, scales = "free") +
        theme(legend.position="top")+
  geom_smooth()+
  labs(x="event rate(%)")+
  geom_vline(xintercept =raw %>% summarise(m=dv %>% mean(na.rm=T)) %>% pull(m), linetype="dotted")+
  geom_hline(yintercept = 0.05, col="blue", linetype="dotted")




p3 <- forplot%>% 
  mutate(rse=std_error/estimate) %>% 
        ggplot(aes(x = p ,y = rse, col = flag %>% as.factor)) +
    geom_point(alpha=a) +
    geom_line(alpha=a) +
        #geom_smooth(method = "lm", se = T) +
        #geom_col(position = position_dodge(width = .25), alpha = .5) +
        #scale_y_log10()+
        facet_wrap(~ term, scales = "free") +
        theme(legend.position="top")+
  geom_smooth()+
  labs(x="event rate(%)")+
  geom_vline(xintercept =raw %>% summarise(m=dv %>% mean(na.rm=T)) %>% pull(m), linetype="dotted")

p1 %>% ggplotly

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

p2 %>% ggplotly

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

p3 %>% ggplotly

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

pull different ratio

id_p_0 <- raw %>%filter(dv==1) %>%  pull(id) %>% unique  ###positive id 504
id_n_0 <- raw %>%filter(dv==0) %>%  pull(id) %>% unique   ### negative id  482
per6 <- c(seq(0.05, 1, 0.2), 1)
data3 <- tibble(crossing(var1 = per6, var2 = per6)) %>%
  mutate(results = map2(var1, var2, ~{
    per1 <- .x
    per2 <- .y
    
    id_p <- sample(id_p_0, length(id_p_0) * per1)
    id_n <- sample(id_n_0, length(id_n_0) * per2)
    
    raww <- cleaned_raw %>%
      filter(id %in% c(id_p, id_n)) %>%
      mutate(cave2 = as.numeric(cave2))  
    
    quantile_cutoffs <- quantile(raww$cave2, probs = c(0.25, 0.5, 0.75), na.rm = TRUE)
    
    raww <- raww %>%
      mutate(cave2squart = cut(
        cave2, 
        breaks = c(-Inf, quantile_cutoffs, Inf), 
        labels = c("1Q", "2Q", "3Q", "4Q")
      ))
    
    p <- raww %>% summarise(mean_dv = mean(dv, na.rm = TRUE)) %>% pull(mean_dv)
    n <- nrow(raww)
    
    POI <- glm(dv ~ cave2 + offset(log(time)), family = poisson(link = "log"), data = raww)
    LOG <- glm(dv ~ cave2, family = binomial(link = "logit"), data = raww)
    
    POI_summary <- tidy(POI) %>% mutate(model = "Poisson", dataset_ratio = paste0("p:", round(per1, 2), ", n:", round(per2, 2)))
    LOG_summary <- tidy(LOG) %>% mutate(model = "Logistic", dataset_ratio = paste0("p:", round(per1, 2), ", n:", round(per2, 2)))
    
    logistic_p_value <- LOG_summary %>% filter(term == "cave2") %>% pull(p.value)
    poisson_p_value <- POI_summary %>% filter(term == "cave2") %>% pull(p.value)
    
   logistic_p_value_int <- LOG_summary %>% filter(term == "(Intercept)") %>% pull(p.value)
    poisson_p_value_int <- POI_summary %>% filter(term == "(Intercept)") %>% pull(p.value)
    
    combined_summary <- bind_rows(POI_summary, LOG_summary)
    
    sm_cave <- raww %>%
      group_by(cave2squart) %>%
      dplyr::summarise(
        mc = median(cave2),
        n = n(),
        rsp = sum(dv),
        rate = round(rsp / n, 2),
        lo = ifelse(rsp > 0 & rsp < n, round(prop.test(rsp, n)$conf.int[1], 4), NA),
        hi = ifelse(rsp > 0 & rsp < n, round(prop.test(rsp, n)$conf.int[2], 4), NA)
      )
    
    logistic_eq <- paste0("Logistic: logit(p) = ", round(coef(LOG)[1], 3), " + ", round(coef(LOG)[2], 3), " * cave2, p_slope = ", format(signif(logistic_p_value, 3), scientific = TRUE), ", p_intercept = ", format(signif(logistic_p_value_int, 3), scientific = TRUE), ", AIC=", round(AIC(LOG),2))
    poisson_eq <- paste0("Poisson: log(λ) = ", round(coef(POI)[1], 3), " + ",  round(coef(POI)[2], 3), " * cave2 + log(time), p_slope = ", format(signif(poisson_p_value, 3), scientific = TRUE), ", p_intercept = ", format(signif(poisson_p_value_int, 3), scientific = TRUE),", AIC=", round(AIC(POI),2))
    
    plot <- ggplot(raww, aes(x = cave2, y = dv)) + 
      scale_x_continuous() +  
      geom_point(color = "blue") + 
      theme_bw() +
      stat_smooth(method = glm, method.args = list(family = "binomial"), 
                  aes(color = "Logistic", linetype = "Logistic"), se = FALSE) +  
      stat_smooth(method = glm, method.args = list(family = "poisson"), 
                  aes(color = "Poisson", linetype = "Poisson"), se = FALSE) +  
      geom_errorbar(data = sm_cave, aes(x = mc, y = rate, ymin = lo, ymax = hi), 
                    width = 0.2) +
      geom_point(data = sm_cave, aes(x = mc, y = rate), color = "black") +
      xlab("cave2") + ylab("Rate of DV (AE2)") +
      labs(title = paste0("%positive=", round(per1, 2), ", %negative=", round(per2, 2), ", event rate (%)=", round(p, 3)),
           color = "Model", linetype = "Model") + 
      theme(legend.position = "top") +
      annotate("text", x = 0.1, y = 0.15, label = logistic_eq, size = 3, hjust = 0) +
      annotate("text", x = 0.1, y = 0.08, label = poisson_eq, size = 3, hjust = 0)
    
    list(raww = raww, plot = plot, summary = combined_summary)
  }))

plots <- map(data3$results, "plot") %>% discard(is.null)

for (i in seq_along(plots)) {
  print(plots[[i]])
}

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

pull different ratio, more cutoff

id_p_0 <- raw %>%filter(dv==1) %>%  pull(id) %>% unique  ###positive id 504
id_n_0 <- raw %>%filter(dv==0) %>%  pull(id) %>% unique   ### negative id  482
per6 <- c(seq(0.05, 1, 0.2), 1)
data3 <- tibble(crossing(var1 = per6, var2 = per6)) %>%
  mutate(results = map2(var1, var2, ~{
    per1 <- .x
    per2 <- .y
    
    id_p <- sample(id_p_0, length(id_p_0) * per1)
    id_n <- sample(id_n_0, length(id_n_0) * per2)
    
    raww <- cleaned_raw %>%
      filter(id %in% c(id_p, id_n)) %>%
      mutate(cave2 = as.numeric(cave2))  
    
    
    
  
    
    quantile_cutoffs <- quantile(raww$cave2, probs = c(0.2, 0.4, 0.6, 0.8, 0.9, 0.93, 0.96, 0.99, 0.995), na.rm = TRUE)
    
    raww <- raww %>%
      mutate(cave2squart = cut(
        cave2, 
        breaks = c(-Inf, quantile_cutoffs, Inf), 
        labels = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10")
      ))
    
    p <- raww %>% summarise(mean_dv = mean(dv, na.rm = TRUE)) %>% pull(mean_dv)
    n <- nrow(raww)
    
    POI <- glm(dv ~ cave2 + offset(log(time)), family = poisson(link = "log"), data = raww)
    LOG <- glm(dv ~ cave2, family = binomial(link = "logit"), data = raww)
    
    POI_summary <- tidy(POI) %>% mutate(model = "Poisson", dataset_ratio = paste0("p:", round(per1, 2), ", n:", round(per2, 2)))
    LOG_summary <- tidy(LOG) %>% mutate(model = "Logistic", dataset_ratio = paste0("p:", round(per1, 2), ", n:", round(per2, 2)))
    
    logistic_p_value <- LOG_summary %>% filter(term == "cave2") %>% pull(p.value)
    poisson_p_value <- POI_summary %>% filter(term == "cave2") %>% pull(p.value)
    
   logistic_p_value_int <- LOG_summary %>% filter(term == "(Intercept)") %>% pull(p.value)
    poisson_p_value_int <- POI_summary %>% filter(term == "(Intercept)") %>% pull(p.value)
    
    combined_summary <- bind_rows(POI_summary, LOG_summary)
    
    sm_cave <- raww %>%
      group_by(cave2squart) %>%
      dplyr::summarise(
        mc = median(cave2),
        n = n(),
        rsp = sum(dv),
        rate = round(rsp / n, 2),
        lo = ifelse(rsp > 0 & rsp < n, round(prop.test(rsp, n)$conf.int[1], 4), NA),
        hi = ifelse(rsp > 0 & rsp < n, round(prop.test(rsp, n)$conf.int[2], 4), NA)
      )
    
    logistic_eq <- paste0("Logistic: logit(p) = ", round(coef(LOG)[1], 3), " + ", round(coef(LOG)[2], 3), " * cave2, p_slope = ", format(signif(logistic_p_value, 3), scientific = TRUE), ", p_intercept = ", format(signif(logistic_p_value_int, 3), scientific = TRUE), ", AIC=", round(AIC(LOG),2))
    poisson_eq <- paste0("Poisson: log(λ) = ", round(coef(POI)[1], 3), " + ",  round(coef(POI)[2], 3), " * cave2 + log(time), p_slope = ", format(signif(poisson_p_value, 3), scientific = TRUE), ", p_intercept = ", format(signif(poisson_p_value_int, 3), scientific = TRUE), ", AIC=", round(AIC(POI),2))
    
    plot <- ggplot(raww, aes(x = cave2, y = dv)) + 
      scale_x_continuous() +  
      geom_point(color = "blue") + 
      theme_bw() +
      stat_smooth(method = glm, method.args = list(family = "binomial"), 
                  aes(color = "Logistic", linetype = "Logistic"), se = FALSE) +  
      stat_smooth(method = glm, method.args = list(family = "poisson"), 
                  aes(color = "Poisson", linetype = "Poisson"), se = FALSE) +  
      geom_errorbar(data = sm_cave, aes(x = mc, y = rate, ymin = lo, ymax = hi), 
                    width = 0.2) +
      geom_point(data = sm_cave, aes(x = mc, y = rate), color = "black") +
      xlab("cave2") + ylab("Rate of DV (AE2)") +
      labs(title = paste0("%positive=", round(per1, 2), ", %negative=", round(per2, 2), ", event rate (%)=", round(p, 3)),
           color = "Model", linetype = "Model") + 
      theme(legend.position = "top") +
      annotate("text", x = 0.1, y = 0.15, label = logistic_eq, size = 3, hjust = 0) +
      annotate("text", x = 0.1, y = 0.08, label = poisson_eq, size = 3, hjust = 0)
    
    list(raww = raww, plot = plot, summary = combined_summary)
  }))

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

## Warning in prop.test(rsp, n): Chi-squared approximation may be incorrect

plots <- map(data3$results, "plot") %>% discard(is.null)

for (i in seq_along(plots)) {
  print(plots[[i]])
}

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

per6 <- c(seq(0.05, 1, 0.2), 1)
data4 <- tibble(crossing(var1 = per6, var2 = per6)) %>%
  mutate(results = map2(var1, var2, ~{
    per1 <- .x
    per2 <- .y
    
    id_p <- sample(id_p_0, length(id_p_0) * per1)
    id_n <- sample(id_n_0, length(id_n_0) * per2)
    
    raww <- cleaned_raw %>%
      filter(id %in% c(id_p, id_n)) %>%
      mutate(cave2 = as.numeric(cave2))  # Ensure cave2 is numeric
    
    if (nrow(raww) < 10 || !("dv" %in% colnames(raww))) {
      return(list(raww = NULL, plot = NULL, summary = NULL))
    }
    
    quantile_cutoffs <- quantile(raww$cave2, probs = c(0.25, 0.5, 0.75), na.rm = TRUE)
    
    raww <- raww %>%
      mutate(cave2squart = cut(
        cave2, 
        breaks = c(-Inf, quantile_cutoffs, Inf), 
        labels = c("1Q", "2Q", "3Q", "4Q")
      ))
    
    POI <- glm(dv ~ cave2 + offset(log(time)), family = poisson(link = "log"), data = raww)
    LOG <- glm(dv ~ cave2, family = binomial(link = "logit"), data = raww)
    
    list(raww = raww, plot = NULL, summary = NULL)
  }))

plot_logistic <- function(results) {
  combined_data <- map_dfr(seq_along(results), function(i) {
    data <- results[[i]]
    if (!is.null(data) && is.data.frame(data)) {
      logistic_model <- glm(dv ~ cave2, family = binomial, data = data)
      logistic_p_value <- summary(logistic_model)$coefficients[2, 4]
      event_rate <- mean(data$dv, na.rm = TRUE)
      logistic_label <- paste0("Logistic (Rate: ", round(event_rate, 2), ", p: ", formatC(logistic_p_value, format = "e", digits = 2), ")")
      
      data %>%
        mutate(label = logistic_label)
    } else {
      NULL
    }
  })

  ggplot(combined_data, aes(x = cave2, y = dv)) +
    theme_bw() +
    stat_smooth(aes(color = label, linetype = label), 
                method = glm, method.args = list(family = "binomial"), se = FALSE) +
    xlab("cave2") + ylab("Rate of DV (AE2)") +
    labs(title = "Logistic Model Fits", color = "Model", linetype = "Model") +
    theme(legend.position = "top")
}

plot_poisson <- function(results) {
  combined_data <- map_dfr(seq_along(results), function(i) {
    data <- results[[i]]
    
      poisson_model <- glm(dv ~ cave2 + offset(log(time)), family = poisson, data = data)
      poisson_p_value <- summary(poisson_model)$coefficients[2, 4]
      event_rate <- mean(data$dv, na.rm = TRUE)
      poisson_label <- paste0("Poisson (Rate: ", round(event_rate, 2), ", p: ", formatC(poisson_p_value, format = "e", digits = 2), ")")
      
      data %>%
        mutate(label = poisson_label)
  })

  ggplot(combined_data, aes(x = cave2, y = dv)) +
    theme_bw() +
    stat_smooth(aes(color = label, linetype = label), 
                method = glm, method.args = list(family = "poisson"), se = FALSE) +
    xlab("cave2") + ylab("Rate of DV (AE2)") +
    labs(title = "Poisson Model Fits", color = "Model", linetype = "Model") +
    theme(legend.position = "top")
}

logistic_results <- map(data4$results, ~ if (!is.null(.x$raww)) .x$raww)
poisson_results <- map(data4$results, ~ if (!is.null(.x$raww)) .x$raww)

print(plot_logistic(logistic_results))

## `geom_smooth()` using formula 'y ~ x'

print(plot_poisson(poisson_results))

## `geom_smooth()` using formula 'y ~ x'

logistic_summaries <- data3 %>%
  mutate(model_results = map(results, "summary")) %>%
  unnest(cols = c(model_results)) %>%
  filter(model == "Logistic" & term %in% c("(Intercept)", "cave2"))

poisson_summaries <- data3 %>%
  mutate(model_results = map(results, "summary")) %>%
  unnest(cols = c(model_results)) %>%
  filter(model == "Poisson" & term %in% c("(Intercept)", "cave2"))

LOG_slope <- logistic_summaries %>% 
  filter(term == "cave2") %>%
  mutate(slope = estimate, slope_p_value = p.value) %>%
  select(var1, var2, results, slope_p_value, slope, model, dataset_ratio)

LOG_intercept <- logistic_summaries %>% 
  filter(term == "(Intercept)") %>%
  mutate(intercept = estimate) %>%
  select(var1, var2, results, intercept, model, dataset_ratio)
logistic_combined <- inner_join(LOG_slope, LOG_intercept, 
                                by = c("var1", "var2", "results", "model", "dataset_ratio"))

time <- 300  # Replace with the appropriate offset value for your context

# Generate the dataset for POI_combined (Poisson model)
POI_slope <- poisson_summaries %>% 
  filter(term == "cave2") %>%
  mutate(slope = estimate, slope_p_value = p.value) %>%
  select(var1, var2, results, slope_p_value, slope, model, dataset_ratio)

POI_intercept <- poisson_summaries %>% 
  filter(term == "(Intercept)") %>%
  mutate(intercept = estimate) %>%
  select(var1, var2, results, intercept, model, dataset_ratio)

# Use inner_join to combine slope and intercept data
POI_combined <- inner_join(POI_slope, POI_intercept, 
                           by = c("var1", "var2", "results", "model", "dataset_ratio"))

# Check the final structure of POI_combined
print(head(POI_combined))

## # A tibble: 6 × 8
##    var1  var2 results          slope_p_value   slope model   dataset_r…¹ inter…²
##   <dbl> <dbl> <list>                   <dbl>   <dbl> <chr>   <chr>         <dbl>
## 1  0.05  0.05 <named list [3]>       0.0898  0.00767 Poisson p:0.05, n:…   -6.33
## 2  0.05  0.25 <named list [3]>       0.0462  0.0131  Poisson p:0.05, n:…   -7.69
## 3  0.05  0.45 <named list [3]>       0.0527  0.0111  Poisson p:0.05, n:…   -8.15
## 4  0.05  0.65 <named list [3]>       0.324   0.00695 Poisson p:0.05, n:…   -8.18
## 5  0.05  0.85 <named list [3]>       0.00883 0.0163  Poisson p:0.05, n:…   -8.99
## 6  0.05  1    <named list [3]>       0.0242  0.0141  Poisson p:0.05, n:1   -9.01
## # … with abbreviated variable names ¹dataset_ratio, ²intercept

# Create a sequence of cave2 values for prediction
cave2_seq <- seq(min(cleaned_raw$cave2, na.rm = TRUE), max(cleaned_raw$cave2, na.rm = TRUE), length.out = 100)

# Generate predictions for the logistic model
logistic_predictions <- logistic_combined %>%
  mutate(
    cave2_values = list(cave2_seq),
    predicted = map2(slope, intercept, ~ {
      if (is.na(.x) | is.na(.y)) return(rep(NA, length(cave2_seq)))
      plogis(.y + .x * cave2_seq)
    })
  ) %>%
  unnest(c(cave2_values, predicted))

# Generate predictions for the Poisson model with the offset (log(time))
poisson_predictions <- POI_combined %>%
  mutate(
    cave2_values = list(cave2_seq),
    predicted = map2(slope, intercept, ~ {
      if (is.na(.x) | is.na(.y)) return(rep(NA, length(cave2_seq)))
      exp(.y + .x * cave2_seq + log(time))  # Include the offset for time
    })
  ) %>%
  unnest(c(cave2_values, predicted))

# Plot for Logistic Model
logistic_plot <- ggplot(logistic_predictions, aes(x = cave2_values, y = predicted, color = dataset_ratio)) +
  geom_line() +
  labs(
    title = "Logistic Model: Fitting Lines Across Different Datasets",
    x = "cave2",
    y = "Predicted Probability",
    color = "Dataset Ratio"
  ) +
  theme_minimal()

# Plot for Poisson Model
poisson_plot <- ggplot(poisson_predictions, aes(x = cave2_values, y = predicted, color = dataset_ratio)) +
  geom_line() +
  labs(
    title = "Poisson Model: Fitting Lines Across Different Datasets (With Time Offset)",
    x = "cave2",
    y = "Predicted Rate",
    color = "Dataset Ratio"
  ) +
  theme_minimal()

Poisson ER

Ping Chen

2024-08-12

R Markdown