library(here)
## here() starts at C:/Users/Yash/Documents/Data Viz Final Project
master_data <- read.csv(here("Data/master_17Oct24_plus_mabloc_plus_baseline.csv"))

head(master_data)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
subset_data <- master_data %>%
  select(
    ID,
    RBD_AUC,
    smokelife,
    smokenow,
    smokedays,
    smokeheavy,
    vapelife,
    vapenow,
    vapedays,
    Age,
    Sex,
    Race,
    education,
    Vaccine,
)
subset_data <- subset_data %>%
  mutate(
    smokelife = factor(smokelife, levels = c(0, 1), labels = c("No", "Yes")),
    smokenow = factor(smokenow, levels = c(3, 2, 1), labels = c("Not at all", "Some days", "Every day")),
    smokeheavy = factor(smokeheavy, levels = c(1, 2, 3, 4, 5, 6, 7), 
                        labels = c("Never smoked", "Tried, not regular", 
                                   "Less than 1 a day", "Up to 10 a day (half a pack)", 
                                   "Up to 20 a day (one pack)", "Up to 40 a day (2 packs)", 
                                   "More than 40 a day (over 2 packs)")),
    vapelife = factor(vapelife, levels = c(0, 1), labels = c("No", "Yes")),
    vapenow = factor(vapenow, levels = c(3, 2, 1), labels = c("Not at all", "Some days", "Every day"))
  )

subset_data <- subset_data %>%
  mutate(
    smokedays = ifelse(smokelife == "No", 0, smokedays),
    vapedays = ifelse(vapelife == "No", 0, vapedays)
  )

subset_data <- subset_data %>%
  mutate(
    smoking_status = case_when(
      smokelife == "No" ~ "Non-Smoker",
      smokelife == "Yes" & smokenow == "Not at all" ~ "Former Smoker",
      smokenow %in% c("Every day", "Some days") ~ "Current Smoker",
      TRUE ~ NA_character_
    ),
    vaping_status = case_when(
      vapelife == "No" ~ "Non-Vaper",
      vapelife == "Yes" & vapenow == "Not at all" ~ "Former Vaper",
      vapenow %in% c("Every day", "Some days") ~ "Current Vaper",
      TRUE ~ NA_character_
    )
  )



str(subset_data)
## 'data.frame':    6548 obs. of  16 variables:
##  $ ID            : int  62741953 62741953 62741953 63212000 63212000 21011969 21011969 21011969 21011969 21011969 ...
##  $ RBD_AUC       : num  11752 49450 31479 8704 86085 ...
##  $ smokelife     : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ smokenow      : Factor w/ 3 levels "Not at all","Some days",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ smokedays     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ smokeheavy    : Factor w/ 7 levels "Never smoked",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ vapelife      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ vapenow       : Factor w/ 3 levels "Not at all","Some days",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ vapedays      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age           : int  68 68 68 21 21 52 52 52 52 52 ...
##  $ Sex           : chr  "Female" "Female" "Female" "Male" ...
##  $ Race          : chr  "No Latino - White" "No Latino - White" "No Latino - White" "No Latino - Asian" ...
##  $ education     : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ Vaccine       : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ smoking_status: chr  "Non-Smoker" "Non-Smoker" "Non-Smoker" "Non-Smoker" ...
##  $ vaping_status : chr  "Non-Vaper" "Non-Vaper" "Non-Vaper" "Non-Vaper" ...
head(subset_data)
# Create a new subset keeping only essential variables and dropping the smoking/vaping question variables
subset_data_status <- subset_data %>%
  select(ID, RBD_AUC, Age, Sex, Race, education, Vaccine, smoking_status, vaping_status)

# Preview the new subset
head(subset_data_status)
# Check structure to confirm unwanted columns are dropped
str(subset_data_status)
## 'data.frame':    6548 obs. of  9 variables:
##  $ ID            : int  62741953 62741953 62741953 63212000 63212000 21011969 21011969 21011969 21011969 21011969 ...
##  $ RBD_AUC       : num  11752 49450 31479 8704 86085 ...
##  $ Age           : int  68 68 68 21 21 52 52 52 52 52 ...
##  $ Sex           : chr  "Female" "Female" "Female" "Male" ...
##  $ Race          : chr  "No Latino - White" "No Latino - White" "No Latino - White" "No Latino - Asian" ...
##  $ education     : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ Vaccine       : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ smoking_status: chr  "Non-Smoker" "Non-Smoker" "Non-Smoker" "Non-Smoker" ...
##  $ vaping_status : chr  "Non-Vaper" "Non-Vaper" "Non-Vaper" "Non-Vaper" ...
# Save the new subset as a CSV (optional)

write.csv(head(subset_data_status), "head_of_data_status.csv", row.names = FALSE)
write.csv(subset_data_status, "subset_data_status.csv", row.names = FALSE)

print("New subset created with only smoking_status and vaping_status retained.")
## [1] "New subset created with only smoking_status and vaping_status retained."
subset_data_unique <- subset_data_status %>%
  distinct(ID, .keep_all = TRUE)

head(subset_data_unique)
str(subset_data_unique)
## 'data.frame':    2178 obs. of  9 variables:
##  $ ID            : int  62741953 63212000 21011969 22091961 21901947 21841956 21911976 21851958 21961968 21861951 ...
##  $ RBD_AUC       : num  11752 8703.8 61461.3 71.4 31546.2 ...
##  $ Age           : int  68 21 52 59 74 64 45 63 53 70 ...
##  $ Sex           : chr  "Female" "Male" "Male" "Female" ...
##  $ Race          : chr  "No Latino - White" "No Latino - Asian" "No Latino - White" "No Latino - White" ...
##  $ education     : int  6 6 6 6 6 6 4 6 6 5 ...
##  $ Vaccine       : chr  "Yes" "Yes" "Yes" "No" ...
##  $ smoking_status: chr  "Non-Smoker" "Non-Smoker" "Non-Smoker" "Former Smoker" ...
##  $ vaping_status : chr  "Non-Vaper" "Non-Vaper" "Non-Vaper" "Non-Vaper" ...
subset_data_vaccinated <- subset_data_unique %>%
  filter(Vaccine == "Yes")

head(subset_data_vaccinated)
str(subset_data_vaccinated)
## 'data.frame':    1753 obs. of  9 variables:
##  $ ID            : int  62741953 63212000 21011969 21901947 21841956 21911976 21851958 21961968 21861951 21941953 ...
##  $ RBD_AUC       : num  11752 8704 61461 31546 12434 ...
##  $ Age           : int  68 21 52 74 64 45 63 53 70 67 ...
##  $ Sex           : chr  "Female" "Male" "Male" "Male" ...
##  $ Race          : chr  "No Latino - White" "No Latino - Asian" "No Latino - White" "No Latino - White" ...
##  $ education     : int  6 6 6 6 6 4 6 6 5 6 ...
##  $ Vaccine       : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ smoking_status: chr  "Non-Smoker" "Non-Smoker" "Non-Smoker" "Non-Smoker" ...
##  $ vaping_status : chr  "Non-Vaper" "Non-Vaper" "Non-Vaper" "Non-Vaper" ...
write.csv(subset_data_vaccinated, "subset_data_vaccinated.csv", row.names = FALSE)

print("Filtered dataset to include only individuals with positive vaccine status (Yes).")
## [1] "Filtered dataset to include only individuals with positive vaccine status (Yes)."
subset_data_vaccinated <- subset_data_vaccinated %>%
  mutate(education = factor(education, levels = c(2, 3, 4, 5, 6),
                            labels = c(
                              "Elementary",
                              "Some High School",
                              "High School Graduate",
                              "Some College / Associate's Degree",
                              "Graduate or Professional Degree"
                            )))

Creating table 1

library(tableone)
## Warning: package 'tableone' was built under R version 4.4.2
# Define baseline variables
baseline_vars <- c("Age", "Sex", "Race", "education")

# Smoking Status Table
table_smoking <- CreateTableOne(vars = baseline_vars, strata = "smoking_status", data = subset_data_vaccinated, addOverall = TRUE)
table_smoking_df <- as.data.frame(print(table_smoking, showAllLevels = TRUE, noSpaces = TRUE))
##                  Stratified by smoking_status
##                   level                                       Overall      
##   n                                                           1753         
##   Age (mean (SD))                                             50.92 (16.93)
##   Sex (%)         Female                                      1079 (61.6)  
##                   Male                                        674 (38.4)   
##   Race (%)        Hispanic/Latino                             119 (6.8)    
##                   More than 1                                 83 (4.7)     
##                   No Latino - America Indian or Alaska Native 55 (3.1)     
##                   No Latino - Asian                           183 (10.4)   
##                   No Latino - Black or African-American       184 (10.5)   
##                   No Latino - Other                           21 (1.2)     
##                   No Latino - White                           1095 (62.5)  
##                   Unknown                                     13 (0.7)     
##   education (%)   Elementary                                  5 (0.3)      
##                   Some High School                            19 (1.1)     
##                   High School Graduate                        190 (10.9)   
##                   Some College / Associate's Degree           505 (29.0)   
##                   Graduate or Professional Degree             1020 (58.7)  
##                  Stratified by smoking_status
##                   Current Smoker Former Smoker Non-Smoker    p      test
##   n               173            429           1131                     
##   Age (mean (SD)) 52.75 (14.79)  58.79 (14.19) 47.66 (17.21) <0.001     
##   Sex (%)         100 (57.8)     273 (63.6)    694 (61.4)    0.400      
##                   73 (42.2)      156 (36.4)    437 (38.6)               
##   Race (%)        15 (8.7)       20 (4.7)      84 (7.4)      <0.001     
##                   4 (2.3)        15 (3.5)      64 (5.7)                 
##                   6 (3.5)        15 (3.5)      34 (3.0)                 
##                   20 (11.6)      9 (2.1)       151 (13.4)               
##                   30 (17.3)      27 (6.3)      125 (11.1)               
##                   2 (1.2)        5 (1.2)       13 (1.1)                 
##                   94 (54.3)      335 (78.1)    654 (57.8)               
##                   2 (1.2)        3 (0.7)       6 (0.5)                  
##   education (%)   2 (1.2)        0 (0.0)       3 (0.3)       <0.001     
##                   6 (3.5)        5 (1.2)       8 (0.7)                  
##                   33 (19.1)      53 (12.4)     102 (9.1)                
##                   62 (35.8)      151 (35.4)    288 (25.7)               
##                   70 (40.5)      218 (51.1)    721 (64.3)
# Vaping Status Table
table_vaping <- CreateTableOne(vars = baseline_vars, strata = "vaping_status", data = subset_data_vaccinated, addOverall = TRUE)
table_vaping_df <- as.data.frame(print(table_vaping, showAllLevels = TRUE, noSpaces = TRUE))
##                  Stratified by vaping_status
##                   level                                       Overall      
##   n                                                           1753         
##   Age (mean (SD))                                             50.92 (16.93)
##   Sex (%)         Female                                      1079 (61.6)  
##                   Male                                        674 (38.4)   
##   Race (%)        Hispanic/Latino                             119 (6.8)    
##                   More than 1                                 83 (4.7)     
##                   No Latino - America Indian or Alaska Native 55 (3.1)     
##                   No Latino - Asian                           183 (10.4)   
##                   No Latino - Black or African-American       184 (10.5)   
##                   No Latino - Other                           21 (1.2)     
##                   No Latino - White                           1095 (62.5)  
##                   Unknown                                     13 (0.7)     
##   education (%)   Elementary                                  5 (0.3)      
##                   Some High School                            19 (1.1)     
##                   High School Graduate                        190 (10.9)   
##                   Some College / Associate's Degree           505 (29.0)   
##                   Graduate or Professional Degree             1020 (58.7)  
##                  Stratified by vaping_status
##                   Current Vaper Former Vaper  Non-Vaper     p      test
##   n               92            259           1354                     
##   Age (mean (SD)) 45.24 (15.97) 41.91 (16.73) 53.02 (16.43) <0.001     
##   Sex (%)         57 (62.0)     161 (62.2)    835 (61.7)    0.988      
##                   35 (38.0)     98 (37.8)     519 (38.3)               
##   Race (%)        6 (6.5)       18 (6.9)      93 (6.9)      0.001      
##                   4 (4.3)       23 (8.9)      54 (4.0)                 
##                   1 (1.1)       7 (2.7)       46 (3.4)                 
##                   5 (5.4)       32 (12.4)     139 (10.3)               
##                   7 (7.6)       12 (4.6)      157 (11.6)               
##                   2 (2.2)       1 (0.4)       18 (1.3)                 
##                   66 (71.7)     162 (62.5)    842 (62.2)               
##                   1 (1.1)       4 (1.5)       5 (0.4)                  
##   education (%)   0 (0.0)       0 (0.0)       5 (0.4)       0.002      
##                   1 (1.1)       3 (1.2)       15 (1.1)                 
##                   15 (16.3)     24 (9.3)      146 (10.9)               
##                   40 (43.5)     91 (35.1)     362 (26.9)               
##                   36 (39.1)     141 (54.4)    817 (60.7)
# Export the tables in CSV-friendly format
write.csv(table_smoking_df, "table1_smoking_formatted.csv", row.names = FALSE)
write.csv(table_vaping_df, "table1_vaping_formatted.csv", row.names = FALSE)

print("Formatted Table 1s have been saved as 'table1_smoking_formatted.csv' and 'table1_vaping_formatted.csv'.")
## [1] "Formatted Table 1s have been saved as 'table1_smoking_formatted.csv' and 'table1_vaping_formatted.csv'."
smoking_model <- lm(RBD_AUC ~ smoking_status + Age + Sex + Race + education, data = subset_data_vaccinated)
summary(smoking_model)
## 
## Call:
## lm(formula = RBD_AUC ~ smoking_status + Age + Sex + Race + education, 
##     data = subset_data_vaccinated)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -26647 -16119  -8346   8396 102485 
## 
## Coefficients:
##                                                 Estimate Std. Error t value
## (Intercept)                                     17037.51   10857.33   1.569
## smoking_statusFormer Smoker                      3821.05    2127.48   1.796
## smoking_statusNon-Smoker                         2271.16    1928.71   1.178
## Age                                               -81.14      37.98  -2.136
## SexMale                                          -713.66    1157.69  -0.616
## RaceMore than 1                                 -1342.33    3346.60  -0.401
## RaceNo Latino - America Indian or Alaska Native -5735.89    3842.03  -1.493
## RaceNo Latino - Asian                           -2161.52    2836.49  -0.762
## RaceNo Latino - Black or African-American        -475.07    2751.57  -0.173
## RaceNo Latino - Other                           -6023.71    5742.63  -1.049
## RaceNo Latino - White                           -5789.29    2296.23  -2.521
## RaceUnknown                                      4871.49    8028.59   0.607
## educationSome High School                       13969.68   11724.26   1.192
## educationHigh School Graduate                    9830.45   10560.77   0.931
## educationSome College / Associate's Degree      10691.36   10479.11   1.020
## educationGraduate or Professional Degree        11216.20   10466.31   1.072
##                                                 Pr(>|t|)  
## (Intercept)                                       0.1168  
## smoking_statusFormer Smoker                       0.0727 .
## smoking_statusNon-Smoker                          0.2391  
## Age                                               0.0328 *
## SexMale                                           0.5377  
## RaceMore than 1                                   0.6884  
## RaceNo Latino - America Indian or Alaska Native   0.1356  
## RaceNo Latino - Asian                             0.4461  
## RaceNo Latino - Black or African-American         0.8629  
## RaceNo Latino - Other                             0.2944  
## RaceNo Latino - White                             0.0118 *
## RaceUnknown                                       0.5441  
## educationSome High School                         0.2336  
## educationHigh School Graduate                     0.3521  
## educationSome College / Associate's Degree        0.3078  
## educationGraduate or Professional Degree          0.2840  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23150 on 1706 degrees of freedom
##   (31 observations deleted due to missingness)
## Multiple R-squared:  0.01852,    Adjusted R-squared:  0.009889 
## F-statistic: 2.146 on 15 and 1706 DF,  p-value: 0.006417
vaping_model <- lm(RBD_AUC ~ vaping_status + Age + Sex + Race + education, data = subset_data_vaccinated)
summary(vaping_model)
## 
## Call:
## lm(formula = RBD_AUC ~ vaping_status + Age + Sex + Race + education, 
##     data = subset_data_vaccinated)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -27557 -16175  -8380   8378 103293 
## 
## Coefficients:
##                                                 Estimate Std. Error t value
## (Intercept)                                     14979.95   11013.60   1.360
## vaping_statusFormer Vaper                        2841.72    2820.07   1.008
## vaping_statusNon-Vaper                           3075.87    2541.52   1.210
## Age                                               -80.87      38.73  -2.088
## SexMale                                          -719.51    1165.52  -0.617
## RaceMore than 1                                  -904.56    3383.99  -0.267
## RaceNo Latino - America Indian or Alaska Native -5272.73    3870.10  -1.362
## RaceNo Latino - Asian                           -2247.59    2857.42  -0.787
## RaceNo Latino - Black or African-American       -1144.81    2779.41  -0.412
## RaceNo Latino - Other                           -5494.74    5622.34  -0.977
## RaceNo Latino - White                           -5625.01    2314.91  -2.430
## RaceUnknown                                      4943.16    8050.20   0.614
## educationSome High School                       15262.69   11709.38   1.303
## educationHigh School Graduate                   11193.75   10542.48   1.062
## educationSome College / Associate's Degree      12139.74   10456.80   1.161
## educationGraduate or Professional Degree        12695.05   10433.54   1.217
##                                                 Pr(>|t|)  
## (Intercept)                                       0.1740  
## vaping_statusFormer Vaper                         0.3138  
## vaping_statusNon-Vaper                            0.2264  
## Age                                               0.0369 *
## SexMale                                           0.5371  
## RaceMore than 1                                   0.7893  
## RaceNo Latino - America Indian or Alaska Native   0.1732  
## RaceNo Latino - Asian                             0.4316  
## RaceNo Latino - Black or African-American         0.6805  
## RaceNo Latino - Other                             0.3286  
## RaceNo Latino - White                             0.0152 *
## RaceUnknown                                       0.5393  
## educationSome High School                         0.1926  
## educationHigh School Graduate                     0.2885  
## educationSome College / Associate's Degree        0.2458  
## educationGraduate or Professional Degree          0.2239  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23110 on 1680 degrees of freedom
##   (57 observations deleted due to missingness)
## Multiple R-squared:  0.01798,    Adjusted R-squared:  0.009207 
## F-statistic:  2.05 on 15 and 1680 DF,  p-value: 0.009961
library(ggplot2)

# Create a histogram for RBD_AUC
ggplot(subset_data_vaccinated, aes(x = RBD_AUC)) +
  geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "Histogram of RBD_AUC",
    x = "RBD_AUC",
    y = "Frequency"
  )

subset_data_vaccinated <- subset_data_vaccinated %>%
  mutate(log_RBD_AUC = log(RBD_AUC + 1))  # Add 1 to avoid log(0)
# Updated regression models


ggplot(subset_data_vaccinated, aes(x = log_RBD_AUC)) +
  geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "Histogram of log_RBD_AUC",
    x = "log_RBD_AUC",
    y = "Frequency"
  )

smoking_model_log <- lm(log_RBD_AUC ~ smoking_status + Age + Sex + Race + education, data = subset_data_vaccinated)
summary(smoking_model_log)
## 
## Call:
## lm(formula = log_RBD_AUC ~ smoking_status + Age + Sex + Race + 
##     education, data = subset_data_vaccinated)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.5455 -0.7694  0.1948  1.0413  2.8261 
## 
## Coefficients:
##                                                  Estimate Std. Error t value
## (Intercept)                                      9.516983   0.679293  14.010
## smoking_statusFormer Smoker                      0.297989   0.133106   2.239
## smoking_statusNon-Smoker                         0.271639   0.120670   2.251
## Age                                             -0.009538   0.002376  -4.013
## SexMale                                         -0.051920   0.072431  -0.717
## RaceMore than 1                                 -0.259190   0.209381  -1.238
## RaceNo Latino - America Indian or Alaska Native -0.429046   0.240378  -1.785
## RaceNo Latino - Asian                           -0.165303   0.177466  -0.931
## RaceNo Latino - Black or African-American       -0.031622   0.172153  -0.184
## RaceNo Latino - Other                           -0.261681   0.359290  -0.728
## RaceNo Latino - White                           -0.241357   0.143665  -1.680
## RaceUnknown                                      0.427525   0.502312   0.851
## educationSome High School                        0.028321   0.733532   0.039
## educationHigh School Graduate                    0.073667   0.660739   0.111
## educationSome College / Associate's Degree       0.192375   0.655629   0.293
## educationGraduate or Professional Degree         0.251308   0.654828   0.384
##                                                 Pr(>|t|)    
## (Intercept)                                      < 2e-16 ***
## smoking_statusFormer Smoker                       0.0253 *  
## smoking_statusNon-Smoker                          0.0245 *  
## Age                                             6.24e-05 ***
## SexMale                                           0.4736    
## RaceMore than 1                                   0.2159    
## RaceNo Latino - America Indian or Alaska Native   0.0745 .  
## RaceNo Latino - Asian                             0.3517    
## RaceNo Latino - Black or African-American         0.8543    
## RaceNo Latino - Other                             0.4665    
## RaceNo Latino - White                             0.0931 .  
## RaceUnknown                                       0.3948    
## educationSome High School                         0.9692    
## educationHigh School Graduate                     0.9112    
## educationSome College / Associate's Degree        0.7692    
## educationGraduate or Professional Degree          0.7012    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.448 on 1706 degrees of freedom
##   (31 observations deleted due to missingness)
## Multiple R-squared:  0.02536,    Adjusted R-squared:  0.01679 
## F-statistic: 2.959 on 15 and 1706 DF,  p-value: 0.0001107
vaping_model_log <- lm(log_RBD_AUC ~ vaping_status + Age + Sex + Race + education, data = subset_data_vaccinated)
summary(vaping_model_log)
## 
## Call:
## lm(formula = log_RBD_AUC ~ vaping_status + Age + Sex + Race + 
##     education, data = subset_data_vaccinated)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.4058 -0.7612  0.1915  1.0338  2.7421 
## 
## Coefficients:
##                                                  Estimate Std. Error t value
## (Intercept)                                      9.424818   0.690489  13.649
## vaping_statusFormer Vaper                        0.116868   0.176802   0.661
## vaping_statusNon-Vaper                           0.289649   0.159338   1.818
## Age                                             -0.011284   0.002428  -4.647
## SexMale                                         -0.051454   0.073071  -0.704
## RaceMore than 1                                 -0.203949   0.212156  -0.961
## RaceNo Latino - America Indian or Alaska Native -0.347439   0.242633  -1.432
## RaceNo Latino - Asian                           -0.177798   0.179144  -0.992
## RaceNo Latino - Black or African-American       -0.062078   0.174253  -0.356
## RaceNo Latino - Other                           -0.190932   0.352488  -0.542
## RaceNo Latino - White                           -0.202273   0.145131  -1.394
## RaceUnknown                                      0.506261   0.504701   1.003
## educationSome High School                        0.149872   0.734110   0.204
## educationHigh School Graduate                    0.243405   0.660952   0.368
## educationSome College / Associate's Degree       0.346083   0.655581   0.528
## educationGraduate or Professional Degree         0.401248   0.654122   0.613
##                                                 Pr(>|t|)    
## (Intercept)                                      < 2e-16 ***
## vaping_statusFormer Vaper                         0.5087    
## vaping_statusNon-Vaper                            0.0693 .  
## Age                                             3.63e-06 ***
## SexMale                                           0.4814    
## RaceMore than 1                                   0.3365    
## RaceNo Latino - America Indian or Alaska Native   0.1523    
## RaceNo Latino - Asian                             0.3211    
## RaceNo Latino - Black or African-American         0.7217    
## RaceNo Latino - Other                             0.5881    
## RaceNo Latino - White                             0.1636    
## RaceUnknown                                       0.3160    
## educationSome High School                         0.8383    
## educationHigh School Graduate                     0.7127    
## educationSome College / Associate's Degree        0.5976    
## educationGraduate or Professional Degree          0.5397    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.449 on 1680 degrees of freedom
##   (57 observations deleted due to missingness)
## Multiple R-squared:  0.02475,    Adjusted R-squared:  0.01604 
## F-statistic: 2.842 on 15 and 1680 DF,  p-value: 0.0002047
model_with_age <- lm(log_RBD_AUC ~ smoking_status + Age + Sex + Race + education, data = subset_data_vaccinated)
model_without_age <- lm(log_RBD_AUC ~ smoking_status + Sex + Race + education, data = subset_data_vaccinated)
anova(model_without_age, model_with_age)
library(dplyr)

filtered_data <- subset_data_vaccinated %>%
  filter(!is.na(smoking_status))

# Calculate median values for each smoking_status group
median_values <- filtered_data %>%
  group_by(smoking_status) %>%
  summarize(median_log_RBD_AUC = median(log_RBD_AUC, na.rm = TRUE))

# Jitter plot with median values labeled
ggplot(filtered_data, aes(x = smoking_status, y = log_RBD_AUC, color = smoking_status)) +
  geom_jitter(width = 0.2, alpha = 0.6, size = 1.5) +
  stat_summary(fun = "median", geom = "point", size = 6, color = "black", shape = 18) +  # Bold median marker
  geom_text(
    data = median_values,
    aes(x = smoking_status, y = median_log_RBD_AUC, label = round(median_log_RBD_AUC, 2)),
    vjust = -1.5, size = 5, color = "black"
  ) +  # Annotate median values
  theme_minimal() +
  labs(
    title = "Distribution of Antibodies by Smoking Status",
    x = "Smoking Status",
    y = "Log of Antibody Variable"
  ) +
  scale_color_brewer(palette = "Set2") +
  scale_y_continuous(limits = c(3, 12), breaks = seq(3, 12, 1)) +
  theme(legend.position = "none")
## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_summary()`).
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Filter out rows with NA vaping_status
filtered_data_vaping <- filtered_data %>%
  filter(!is.na(vaping_status))

# Calculate median values for each vaping_status group
median_values_vaping <- filtered_data_vaping %>%
  group_by(vaping_status) %>%
  summarize(median_log_RBD_AUC = median(log_RBD_AUC, na.rm = TRUE))

# Jitter plot with median values labeled
ggplot(filtered_data_vaping, aes(x = vaping_status, y = log_RBD_AUC, color = vaping_status)) +
  geom_jitter(width = 0.2, alpha = 0.6, size = 1.5) +
  stat_summary(fun = "median", geom = "point", size = 6, color = "black", shape = 18) +  # Bold median marker
  geom_text(
    data = median_values_vaping,
    aes(x = vaping_status, y = median_log_RBD_AUC, label = round(median_log_RBD_AUC, 2)),
    vjust = -1.5, size = 5, color = "black"
  ) +  # Annotate median values
  theme_minimal() +
  labs(
    title = "Distribution of Antibodies by Vaping Status",
    x = "Vaping Status",
    y = "Log of Antibody Variable"
  ) +
  scale_color_brewer(palette = "Set3") +
  scale_y_continuous(limits = c(3, 12), breaks = seq(3, 12, 1)) +
  theme(legend.position = "none")
## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_summary()`).
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_point()`).

library(viridis)
## Loading required package: viridisLite
# Create a new variable for age groups
filtered_data <- filtered_data %>%
  mutate(Age_Group = cut(
    Age,
    breaks = c(10, 20, 30, 40, 50, 60, 70, 80),
    labels = c("10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80"),
    right = FALSE
  ))

# Calculate mean antibody levels for vaping status
heatmap_data_vaping <- filtered_data %>%
  group_by(Age_Group, vaping_status) %>%
  summarize(mean_log_RBD_AUC = mean(log_RBD_AUC, na.rm = TRUE))
## `summarise()` has grouped output by 'Age_Group'. You can override using the
## `.groups` argument.
# Calculate mean antibody levels for vaping status
heatmap_data_smoking <- filtered_data %>%
  group_by(Age_Group, smoking_status) %>%
  summarize(mean_log_RBD_AUC = mean(log_RBD_AUC, na.rm = TRUE))
## `summarise()` has grouped output by 'Age_Group'. You can override using the
## `.groups` argument.
# Remove NA rows from the heatmap_data_vaping
heatmap_data_smoking <- heatmap_data_smoking %>%
  filter(!is.na(Age_Group), !is.na(smoking_status))




# Heatmap for Smoking Status with expanded color range
ggplot(heatmap_data_smoking, aes(x = Age_Group, y = smoking_status, fill = mean_log_RBD_AUC)) +
  geom_tile(color = "white") +
  scale_fill_viridis(
    option = "magma", 
    direction = -1, 
    name = "Mean Antibody\nLevel (Log)",
    limits = c(8, 10)  # Manually expand the range
  ) +
  theme_minimal() +
  labs(
    title = "Heatmap of Age and Antibody Levels by Smoking Status",
    x = "Age Group (Years)",
    y = "Smoking Status"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Create a new variable for age groups
filtered_data <- filtered_data %>%
  mutate(Age_Group = cut(
    Age,
    breaks = c(10, 20, 30, 40, 50, 60, 70, 80),
    labels = c("10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80"),
    right = FALSE
  ))

# Calculate mean antibody levels for vaping status
heatmap_data_vaping <- filtered_data %>%
  group_by(Age_Group, vaping_status) %>%
  summarize(mean_log_RBD_AUC = mean(log_RBD_AUC, na.rm = TRUE))
## `summarise()` has grouped output by 'Age_Group'. You can override using the
## `.groups` argument.
# Remove NA rows from the heatmap_data_vaping
heatmap_data_vaping <- heatmap_data_vaping %>%
  filter(!is.na(Age_Group), !is.na(vaping_status))

# Generate the heatmap
ggplot(heatmap_data_vaping, aes(x = Age_Group, y = vaping_status, fill = mean_log_RBD_AUC)) +
  geom_tile(color = "white") +
  scale_fill_viridis(
    option = "magma", 
    direction = -1, 
    name = "Mean Antibody\nLevel (Log)",
    limits = c(8, 10)  # Adjust to your data range
  ) +
  theme_minimal() +
  labs(
    title = "Heatmap of Age and Antibody Levels by Vaping Status",
    x = "Age Group (Years)",
    y = "Vaping Status"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Filter out rows with NA in smoking_status or vaping_status
filtered_data_no_na <- filtered_data %>%
  filter(!is.na(smoking_status), !is.na(vaping_status))

# Create the interaction plot
ggplot(filtered_data_no_na, aes(x = smoking_status, y = log_RBD_AUC, color = vaping_status, group = vaping_status)) +
  geom_line(stat = "summary", fun = "mean", size = 1) +
  geom_point(stat = "summary", fun = "mean", size = 3) +
  theme_minimal() +
  labs(
    title = "Interaction Plot: Smoking and Vaping Status",
    x = "Smoking Status",
    y = "Log of Antibody Variable",
    color = "Vaping Status"
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

library(ggplot2)
library(dplyr)
library(viridis)
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(shiny)

# Ensure Age_Group is treated as a factor with proper levels for clarity
heatmap_data_smoking <- heatmap_data_smoking %>%
  filter(!is.na(Age_Group)) %>%
  mutate(Age_Group = factor(Age_Group, levels = unique(Age_Group)))

# Shiny App
ui <- fluidPage(
  titlePanel("Interactive Heatmap with Filters"),
  
  sidebarLayout(
    sidebarPanel(
      # Checkbox for Age Group
      checkboxGroupInput(
        inputId = "age_filter",
        label = "Select Age Groups to Display:",
        choices = levels(heatmap_data_smoking$Age_Group),
        selected = levels(heatmap_data_smoking$Age_Group)  # Select all by default
      ),
      # Checkbox for Smoking Status
      checkboxGroupInput(
        inputId = "smoking_filter",
        label = "Select Smoking Status:",
        choices = unique(heatmap_data_smoking$smoking_status),
        selected = unique(heatmap_data_smoking$smoking_status)
      )
    ),
    mainPanel(
      plotlyOutput("heatmap")
    )
  )
)

server <- function(input, output) {
  output$heatmap <- renderPlotly({
    # Filter data based on inputs
    filtered_data <- heatmap_data_smoking %>%
      filter(Age_Group %in% input$age_filter) %>%
      filter(smoking_status %in% input$smoking_filter)
    
    # Ensure data is non-empty after filtering
    validate(
      need(nrow(filtered_data) > 0, "No data available for the selected filters.")
    )
    
    # Create the ggplot heatmap
    heatmap_plot <- ggplot(filtered_data, aes(x = Age_Group, y = smoking_status, fill = mean_log_RBD_AUC)) +
      geom_tile(color = "white") +
      scale_fill_viridis(
        option = "magma",
        direction = -1,
        name = "Mean Antibody\nLevel (Log)"
      ) +
      theme_minimal() +
      labs(
        title = "Heatmap of Age and Antibody Levels by Smoking Status",
        x = "Age Group (Years)",
        y = "Smoking Status"
      ) +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
    
    # Convert ggplot to plotly
    ggplotly(heatmap_plot)
  })
}

# Run the Shiny app
shinyApp(ui = ui, server = server)
Shiny applications not supported in static R Markdown documents
library(ggplot2)
library(dplyr)
library(viridis)
library(plotly)
library(shiny)

# Prepare data
heatmap_data_vaping <- heatmap_data_vaping %>%
  filter(!is.na(Age_Group), !is.na(vaping_status)) %>%
  mutate(Age_Group = factor(Age_Group, levels = unique(Age_Group)))

# Shiny App
ui <- fluidPage(
  titlePanel("Interactive Heatmap with Filters: Vaping Status"),
  
  sidebarLayout(
    sidebarPanel(
      # Checkbox for Age Group
      checkboxGroupInput(
        inputId = "age_filter",
        label = "Select Age Groups to Display:",
        choices = levels(heatmap_data_vaping$Age_Group),
        selected = levels(heatmap_data_vaping$Age_Group)  # Select all by default
      ),
      # Checkbox for Vaping Status
      checkboxGroupInput(
        inputId = "vaping_filter",
        label = "Select Vaping Status:",
        choices = unique(heatmap_data_vaping$vaping_status),
        selected = unique(heatmap_data_vaping$vaping_status)  # Select all by default
      )
    ),
    mainPanel(
      plotlyOutput("heatmap")
    )
  )
)

server <- function(input, output) {
  output$heatmap <- renderPlotly({
    # Filter data based on inputs
    filtered_data <- heatmap_data_vaping %>%
      filter(Age_Group %in% input$age_filter) %>%
      filter(vaping_status %in% input$vaping_filter)
    
    # Ensure data is non-empty after filtering
    validate(
      need(nrow(filtered_data) > 0, "No data available for the selected filters.")
    )
    
    # Create the ggplot heatmap
    heatmap_plot <- ggplot(filtered_data, aes(x = Age_Group, y = vaping_status, fill = mean_log_RBD_AUC)) +
      geom_tile(color = "white") +
      scale_fill_viridis(
        option = "magma",
        direction = -1,
        name = "Mean Antibody\nLevel (Log)"
      ) +
      theme_minimal() +
      labs(
        title = "Heatmap of Age and Antibody Levels by Vaping Status",
        x = "Age Group (Years)",
        y = "Vaping Status"
      ) +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
    
    # Convert ggplot to plotly
    ggplotly(heatmap_plot)
  })
}

# Run the Shiny app
shinyApp(ui = ui, server = server)
Shiny applications not supported in static R Markdown documents
library(ggplot2)
library(dplyr)
library(plotly)

# Filter out rows with NA in smoking_status or vaping_status
filtered_data_no_na <- filtered_data %>%
  filter(!is.na(smoking_status), !is.na(vaping_status))

# Create the ggplot
ggplot_interaction <- ggplot(filtered_data_no_na, aes(x = smoking_status, y = log_RBD_AUC, color = vaping_status, group = vaping_status)) +
  geom_line(stat = "summary", fun = "mean", size = 1) +
  geom_point(stat = "summary", fun = "mean", size = 3) +
  theme_minimal() +
  labs(
    title = "Interaction Plot: Smoking and Vaping Status",
    x = "Smoking Status",
    y = "Log of Antibody Variable",
    color = "Vaping Status"
  )

# Convert to interactive plot using plotly
interactive_plot <- ggplotly(ggplot_interaction)

# View the interactive plot
interactive_plot
library(htmlwidgets)
saveWidget(interactive_plot, "interactive_interaction_plot.html")
library(ggplot2)
library(dplyr)
library(plotly)
library(viridis)
library(crosstalk)
## Warning: package 'crosstalk' was built under R version 4.4.2
## 
## Attaching package: 'crosstalk'
## The following object is masked from 'package:shiny':
## 
##     getDefaultReactiveDomain
# Prepare data for smoking heatmap
heatmap_data_smoking <- heatmap_data_smoking %>%
  filter(!is.na(Age_Group)) %>%
  mutate(Age_Group = factor(Age_Group, levels = unique(Age_Group)))

# Create a shared data object for filtering
shared_smoking_data <- SharedData$new(heatmap_data_smoking, group = "smoking")
# Create ggplot heatmap
smoking_heatmap <- ggplot(shared_smoking_data, aes(x = Age_Group, y = smoking_status, fill = mean_log_RBD_AUC)) +
  geom_tile(color = "white") +
  scale_fill_viridis(
    option = "magma",
    direction = -1,
    name = "Mean Antibody\nLevel (Log)"
  ) +
  theme_minimal() +
  labs(
    title = "Heatmap of Age and Antibody Levels by Smoking Status",
    x = "Age Group (Years)",
    y = "Smoking Status"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Render as plotly interactive plot
heatmap_interactive <- ggplotly(smoking_heatmap)

# Combine the heatmap with filters
bscols(
  list(
    filter_checkbox("Age_Group", "Select Age Groups:", shared_smoking_data, ~Age_Group),
    filter_checkbox("smoking_status", "Select Smoking Status:", shared_smoking_data, ~smoking_status)
  ),
  heatmap_interactive
)