2. Read CSV

# Load the study population table from the working directory into `df`.
df <- read.csv(file = "studypop.csv", header = TRUE)

3. Examine data

# Examine data: print per-column summary statistics (min, quartiles, mean,
# max and the number of NA values) for every column of df.
summary(df) # summary statistics of all data
##       SEQN          TELOMEAN          PCB74            PCB99       
##  Min.   : 9966   Min.   :0.5113   Min.   :  1100   Min.   :  2000  
##  1st Qu.:12786   1st Qu.:0.8799   1st Qu.:  4200   1st Qu.:  3900  
##  Median :15376   Median :1.0342   Median :  7700   Median :  6400  
##  Mean   :15513   Mean   :1.0662   Mean   : 12780   Mean   : 10171  
##  3rd Qu.:18248   3rd Qu.:1.2158   3rd Qu.: 16150   3rd Qu.: 12000  
##  Max.   :20995   Max.   :2.4291   Max.   :144000   Max.   :123000  
##                                   NA's   :7        NA's   :29      
##      PCB118           PCB138           PCB153           PCB170      
##  Min.   :  2000   Min.   :  2100   Min.   :  2100   Min.   :  2000  
##  1st Qu.:  4500   1st Qu.: 12250   1st Qu.: 17000   1st Qu.:  5200  
##  Median :  9400   Median : 25800   Median : 36600   Median : 11500  
##  Mean   : 16445   Mean   : 38468   Mean   : 52777   Mean   : 15677  
##  3rd Qu.: 19700   3rd Qu.: 51000   3rd Qu.: 69900   3rd Qu.: 21800  
##  Max.   :319000   Max.   :487000   Max.   :708000   Max.   :167000  
##  NA's   :7        NA's   :11       NA's   :7        NA's   :11      
##      PCB180           PCB187           PCB194         lymphocyte   
##  Min.   :  2000   Min.   :  1100   Min.   : 2.300   Min.   : 4.60  
##  1st Qu.: 11100   1st Qu.:  4000   1st Qu.: 5.700   1st Qu.:23.80  
##  Median : 27500   Median :  8100   Median : 6.900   Median :28.80  
##  Mean   : 38487   Mean   : 12331   Mean   : 7.244   Mean   :29.86  
##  3rd Qu.: 54200   3rd Qu.: 16200   3rd Qu.: 8.400   3rd Qu.:35.40  
##  Max.   :572000   Max.   :145000   Max.   :36.600   Max.   :78.90  
##  NA's   :8        NA's   :7                         NA's   :6      
##     monocyte         LBXNEPCT      eosinophils       basophils     
##  Min.   : 1.600   Min.   :15.90   Min.   : 0.000   Min.   :0.0000  
##  1st Qu.: 6.700   1st Qu.:52.20   1st Qu.: 1.500   1st Qu.:0.4000  
##  Median : 7.800   Median :59.40   Median : 2.300   Median :0.6000  
##  Mean   : 8.032   Mean   :58.66   Mean   : 2.839   Mean   :0.6461  
##  3rd Qu.: 9.200   3rd Qu.:65.12   3rd Qu.: 3.600   3rd Qu.:0.8000  
##  Max.   :23.800   Max.   :88.10   Max.   :28.200   Max.   :5.5000  
##  NA's   :6        NA's   :6       NA's   :6        NA's   :6       
##      hxcdd            hpcdd             ocdd            pncdf       
##  Min.   :  1.40   Min.   :  1.90   Min.   :  36.8   Min.   : 0.700  
##  1st Qu.: 21.98   1st Qu.: 24.57   1st Qu.: 202.0   1st Qu.: 2.300  
##  Median : 38.85   Median : 42.60   Median : 352.0   Median : 5.200  
##  Mean   : 49.82   Mean   : 59.52   Mean   : 506.8   Mean   : 6.872  
##  3rd Qu.: 64.80   3rd Qu.: 75.53   3rd Qu.: 619.0   3rd Qu.: 9.500  
##  Max.   :314.00   Max.   :799.00   Max.   :8190.0   Max.   :46.000  
##  NA's   :190      NA's   :202      NA's   :247      NA's   :193     
##      hxcdf            hxcdf2           hxcdf3           pcnb       
##  Min.   : 1.000   Min.   : 0.800   Min.   :  0.9   Min.   :  1.30  
##  1st Qu.: 3.300   1st Qu.: 2.700   1st Qu.:  6.5   1st Qu.: 14.70  
##  Median : 5.300   Median : 4.300   Median : 10.1   Median : 25.10  
##  Mean   : 6.591   Mean   : 5.577   Mean   : 11.8   Mean   : 38.61  
##  3rd Qu.: 8.025   3rd Qu.: 7.000   3rd Qu.: 14.4   3rd Qu.: 44.30  
##  Max.   :44.400   Max.   :33.500   Max.   :234.0   Max.   :845.00  
##  NA's   :198      NA's   :187      NA's   :204     NA's   :193     
##       hxcb        segmented_neutrophils    bmi_cat3        edu_cat    
##  Min.   :  1.70   Min.   :  1100        Min.   :1.000   Min.   :1.00  
##  1st Qu.:  9.60   1st Qu.:  3800        1st Qu.:1.000   1st Qu.:1.00  
##  Median : 18.90   Median :  7000        Median :2.000   Median :2.00  
##  Mean   : 25.32   Mean   : 10559        Mean   :1.984   Mean   :2.37  
##  3rd Qu.: 35.35   3rd Qu.: 14300        3rd Qu.:3.000   3rd Qu.:3.00  
##  Max.   :172.00   Max.   :187000        Max.   :3.000   Max.   :4.00  
##  NA's   :195      NA's   :21                                          
##     race_cat         male          ln_lbxcot          age_cent      
##  Min.   :1.00   Min.   :0.0000   Min.   :-4.5099   Min.   :-23.000  
##  1st Qu.:2.00   1st Qu.:0.0000   1st Qu.:-3.9633   1st Qu.:-10.000  
##  Median :4.00   Median :0.0000   Median :-2.6173   Median :  2.000  
##  Mean   :3.18   Mean   :0.4774   Mean   :-0.8732   Mean   :  4.887  
##  3rd Qu.:4.00   3rd Qu.:1.0000   3rd Qu.: 2.9829   3rd Qu.: 19.000  
##  Max.   :4.00   Max.   :1.0000   Max.   : 6.8189   Max.   : 42.000  
##                                                                     
##      age_sq      
##  Min.   :   0.0  
##  1st Qu.:  49.0  
##  Median : 196.0  
##  Mean   : 359.6  
##  3rd Qu.: 484.0  
##  Max.   :1764.0  
## 
# Show the structure of df: its dimensions plus each column's type and
# first few values.
str(df)
## 'data.frame':    1330 obs. of  33 variables:
##  $ SEQN                 : int  13282 18242 20306 12676 16919 19584 10915 18032 15929 14837 ...
##  $ TELOMEAN             : num  0.726 0.822 1.365 0.787 1.525 ...
##  $ PCB74                : int  127000 33200 67600 57100 64200 144000 67800 56300 62900 54000 ...
##  $ PCB99                : int  123000 31200 61300 63000 86100 37200 61500 42000 66500 46600 ...
##  $ PCB118               : int  319000 38500 59400 89900 110000 75700 95700 40500 99000 45200 ...
##  $ PCB138               : int  487000 259000 310000 207000 306000 219000 204000 226000 214000 145000 ...
##  $ PCB153               : int  708000 445000 416000 292000 316000 249000 296000 268000 248000 271000 ...
##  $ PCB170               : int  165000 167000 76400 76400 76500 69900 62100 76000 50300 77300 ...
##  $ PCB180               : int  572000 481000 191000 197000 146000 172000 165000 181000 160000 167000 ...
##  $ PCB187               : int  144000 145000 88500 83300 57200 62700 48600 61900 67500 51400 ...
##  $ PCB194               : num  4.4 3.1 9.5 5.9 3.5 7.8 6.2 5.7 13.2 6 ...
##  $ lymphocyte           : num  25.7 13.5 25.6 32.6 42.4 14.8 18.9 62.2 43.7 17.5 ...
##  $ monocyte             : num  11.4 11.5 7.5 5.5 7.3 8 10.4 11.5 6.9 6.9 ...
##  $ LBXNEPCT             : num  59.5 66.8 64.8 60.2 47.2 73.1 68.5 21.2 41.4 74.3 ...
##  $ eosinophils          : num  2.7 7.6 1.7 1.5 2.2 3.6 1.7 4.7 7.3 0.9 ...
##  $ basophils            : num  0.6 0.6 0.4 0.2 1 0.6 0.4 0.4 0.6 0.4 ...
##  $ hxcdd                : num  40.3 NA NA 165 142 133 114 190 114 276 ...
##  $ hpcdd                : num  10.6 NA NA 230 137 94.2 72.2 58.6 61.6 143 ...
##  $ ocdd                 : num  282 NA NA 3190 1060 999 640 1700 1150 1760 ...
##  $ pncdf                : num  8.6 NA NA 18.9 21.2 25.6 21.4 19.7 26.1 38.3 ...
##  $ hxcdf                : num  2.8 NA NA 21.7 NA 14.8 12.7 23.6 15.5 21.6 ...
##  $ hxcdf2               : num  3.1 NA NA 21.3 11.6 11.9 14.6 17.4 15.3 19.2 ...
##  $ hxcdf3               : num  2.1 NA NA 27.4 NA 15 27.8 33.1 19.8 15.4 ...
##  $ pcnb                 : num  137 NA NA 51.2 84.7 132 58 38.6 85.5 40.4 ...
##  $ hxcb                 : num  131 NA NA 47.5 103 95.9 87.5 70 62.1 146 ...
##  $ segmented_neutrophils: int  187000 125000 41200 49300 35800 28000 36100 38400 34000 41100 ...
##  $ bmi_cat3             : int  1 1 2 3 1 3 1 1 2 1 ...
##  $ edu_cat              : int  1 1 1 1 4 1 3 1 3 1 ...
##  $ race_cat             : int  4 3 3 3 4 3 4 3 4 3 ...
##  $ male                 : int  1 1 0 0 0 0 0 0 1 0 ...
##  $ ln_lbxcot            : num  -4.51 -4.51 5.2 -2.23 -3.3 ...
##  $ age_cent             : int  42 42 26 42 8 30 38 5 27 42 ...
##  $ age_sq               : int  1764 1764 676 1764 64 900 1444 25 729 1764 ...
# Histogram of the TELOMEAN column to inspect its distribution.
hist(df$TELOMEAN)

# Subset the data frame to SEQN, the chemical measurement columns and the
# TELOMEAN column; the column order below is the order kept in `chem`.
chem <- df[c(
  "SEQN", "PCB99", "PCB153", "PCB187", "ocdd", "hxcdf2", "hxcb",
  "PCB118", "PCB170", "PCB194", "hxcdd", "pncdf", "hxcdf3",
  "PCB74", "PCB138", "PCB180", "hpcdd", "hxcdf", "pcnb", "TELOMEAN"
)]

# Summary statistics for the chemical subset only.
summary(chem)
##       SEQN           PCB99            PCB153           PCB187      
##  Min.   : 9966   Min.   :  2000   Min.   :  2100   Min.   :  1100  
##  1st Qu.:12786   1st Qu.:  3900   1st Qu.: 17000   1st Qu.:  4000  
##  Median :15376   Median :  6400   Median : 36600   Median :  8100  
##  Mean   :15513   Mean   : 10171   Mean   : 52777   Mean   : 12331  
##  3rd Qu.:18248   3rd Qu.: 12000   3rd Qu.: 69900   3rd Qu.: 16200  
##  Max.   :20995   Max.   :123000   Max.   :708000   Max.   :145000  
##                  NA's   :29       NA's   :7        NA's   :7       
##       ocdd            hxcdf2            hxcb            PCB118      
##  Min.   :  36.8   Min.   : 0.800   Min.   :  1.70   Min.   :  2000  
##  1st Qu.: 202.0   1st Qu.: 2.700   1st Qu.:  9.60   1st Qu.:  4500  
##  Median : 352.0   Median : 4.300   Median : 18.90   Median :  9400  
##  Mean   : 506.8   Mean   : 5.577   Mean   : 25.32   Mean   : 16445  
##  3rd Qu.: 619.0   3rd Qu.: 7.000   3rd Qu.: 35.35   3rd Qu.: 19700  
##  Max.   :8190.0   Max.   :33.500   Max.   :172.00   Max.   :319000  
##  NA's   :247      NA's   :187      NA's   :195      NA's   :7       
##      PCB170           PCB194           hxcdd            pncdf       
##  Min.   :  2000   Min.   : 2.300   Min.   :  1.40   Min.   : 0.700  
##  1st Qu.:  5200   1st Qu.: 5.700   1st Qu.: 21.98   1st Qu.: 2.300  
##  Median : 11500   Median : 6.900   Median : 38.85   Median : 5.200  
##  Mean   : 15677   Mean   : 7.244   Mean   : 49.82   Mean   : 6.872  
##  3rd Qu.: 21800   3rd Qu.: 8.400   3rd Qu.: 64.80   3rd Qu.: 9.500  
##  Max.   :167000   Max.   :36.600   Max.   :314.00   Max.   :46.000  
##  NA's   :11                        NA's   :190      NA's   :193     
##      hxcdf3          PCB74            PCB138           PCB180      
##  Min.   :  0.9   Min.   :  1100   Min.   :  2100   Min.   :  2000  
##  1st Qu.:  6.5   1st Qu.:  4200   1st Qu.: 12250   1st Qu.: 11100  
##  Median : 10.1   Median :  7700   Median : 25800   Median : 27500  
##  Mean   : 11.8   Mean   : 12780   Mean   : 38468   Mean   : 38487  
##  3rd Qu.: 14.4   3rd Qu.: 16150   3rd Qu.: 51000   3rd Qu.: 54200  
##  Max.   :234.0   Max.   :144000   Max.   :487000   Max.   :572000  
##  NA's   :204     NA's   :7        NA's   :11       NA's   :8       
##      hpcdd            hxcdf             pcnb           TELOMEAN     
##  Min.   :  1.90   Min.   : 1.000   Min.   :  1.30   Min.   :0.5113  
##  1st Qu.: 24.57   1st Qu.: 3.300   1st Qu.: 14.70   1st Qu.:0.8799  
##  Median : 42.60   Median : 5.300   Median : 25.10   Median :1.0342  
##  Mean   : 59.52   Mean   : 6.591   Mean   : 38.61   Mean   :1.0662  
##  3rd Qu.: 75.53   3rd Qu.: 8.025   3rd Qu.: 44.30   3rd Qu.:1.2158  
##  Max.   :799.00   Max.   :44.400   Max.   :845.00   Max.   :2.4291  
##  NA's   :202      NA's   :198      NA's   :193
# Show the structure of the chem subset (column types and first few values).
str(chem)
## 'data.frame':    1330 obs. of  20 variables:
##  $ SEQN    : int  13282 18242 20306 12676 16919 19584 10915 18032 15929 14837 ...
##  $ PCB99   : int  123000 31200 61300 63000 86100 37200 61500 42000 66500 46600 ...
##  $ PCB153  : int  708000 445000 416000 292000 316000 249000 296000 268000 248000 271000 ...
##  $ PCB187  : int  144000 145000 88500 83300 57200 62700 48600 61900 67500 51400 ...
##  $ ocdd    : num  282 NA NA 3190 1060 999 640 1700 1150 1760 ...
##  $ hxcdf2  : num  3.1 NA NA 21.3 11.6 11.9 14.6 17.4 15.3 19.2 ...
##  $ hxcb    : num  131 NA NA 47.5 103 95.9 87.5 70 62.1 146 ...
##  $ PCB118  : int  319000 38500 59400 89900 110000 75700 95700 40500 99000 45200 ...
##  $ PCB170  : int  165000 167000 76400 76400 76500 69900 62100 76000 50300 77300 ...
##  $ PCB194  : num  4.4 3.1 9.5 5.9 3.5 7.8 6.2 5.7 13.2 6 ...
##  $ hxcdd   : num  40.3 NA NA 165 142 133 114 190 114 276 ...
##  $ pncdf   : num  8.6 NA NA 18.9 21.2 25.6 21.4 19.7 26.1 38.3 ...
##  $ hxcdf3  : num  2.1 NA NA 27.4 NA 15 27.8 33.1 19.8 15.4 ...
##  $ PCB74   : int  127000 33200 67600 57100 64200 144000 67800 56300 62900 54000 ...
##  $ PCB138  : int  487000 259000 310000 207000 306000 219000 204000 226000 214000 145000 ...
##  $ PCB180  : int  572000 481000 191000 197000 146000 172000 165000 181000 160000 167000 ...
##  $ hpcdd   : num  10.6 NA NA 230 137 94.2 72.2 58.6 61.6 143 ...
##  $ hxcdf   : num  2.8 NA NA 21.7 NA 14.8 12.7 23.6 15.5 21.6 ...
##  $ pcnb    : num  137 NA NA 51.2 84.7 132 58 38.6 85.5 40.4 ...
##  $ TELOMEAN: num  0.726 0.822 1.365 0.787 1.525 ...

4. Plot data

# Reusable ggplot2 theme settings (bold axis tick labels, larger axis and plot
# titles), stored in a list so they can be added to a plot with `+ gghisto`.
gghisto <- list(
  theme(
    axis.text.x = element_text(face = "bold", size = 12, color = "deeppink4"),
    axis.text.y = element_text(face = "bold", size = 14),
    axis.title = element_text(size = 17),
    plot.title = element_text(size = 17, face = "bold")
  )
)
# One chemical: TELOMEAN by PCB99 quartile.
# PCB99 is continuous, so a plain boxplot against it collapses every row into
# a single box and ggplot2 warns "Continuous x aesthetic". Binning PCB99 into
# four equal-count groups with cut_number() gives one box per exposure level.
# Rows with a missing PCB99 value are dropped explicitly here instead of being
# removed by ggplot2 with a warning.
chembox <- ggplot(
  subset(chem, !is.na(PCB99)),
  aes(cut_number(PCB99, 4), TELOMEAN)
)
chembox + geom_boxplot() + labs(x = "PCB99 quartile")

# Overall distribution of TELOMEAN as a single boxplot.
# Mapping x to SEQN (a continuous column) with no grouping still draws one box
# over all rows, but ggplot2 warns about the continuous x aesthetic. Mapping
# only y draws that single box without the warning.
# NOTE(review): SEQN looks like a record identifier rather than a predictor;
# confirm that no per-SEQN grouping was intended.
box <- ggplot(df, aes(y = TELOMEAN))
box + geom_boxplot()

# Education: boxplots of TELOMEAN for each education category.
# edu_cat is stored as an integer code, so convert it to a factor first so
# that ggplot2 treats it as a discrete grouping variable.
df$edu_cat <- as.factor(df$edu_cat)

ggplot(data = df, mapping = aes(x = edu_cat, y = TELOMEAN)) +
  geom_boxplot(
    width = .4,
    alpha = .5,
    fill = "pink",
    color = "deeppink4"
  ) +
  labs(title = "Edu by Telomean ", x = "Education Category") +
  gghisto