1. Load the packages used in this analysis
#install.packages("tidyverse")
#install.packages("ggplot2")
library(tidyverse)
library(tidyr)
library(ggplot2)
2. Read CSV
# Load the study data from the CSV file in the working directory
df <- read.csv(file = "studypop.csv")
3. Examine data
# Examine the data: per-column summary statistics for the full data frame
summary(object = df)
## SEQN TELOMEAN PCB74 PCB99
## Min. : 9966 Min. :0.5113 Min. : 1100 Min. : 2000
## 1st Qu.:12786 1st Qu.:0.8799 1st Qu.: 4200 1st Qu.: 3900
## Median :15376 Median :1.0342 Median : 7700 Median : 6400
## Mean :15513 Mean :1.0662 Mean : 12780 Mean : 10171
## 3rd Qu.:18248 3rd Qu.:1.2158 3rd Qu.: 16150 3rd Qu.: 12000
## Max. :20995 Max. :2.4291 Max. :144000 Max. :123000
## NA's :7 NA's :29
## PCB118 PCB138 PCB153 PCB170
## Min. : 2000 Min. : 2100 Min. : 2100 Min. : 2000
## 1st Qu.: 4500 1st Qu.: 12250 1st Qu.: 17000 1st Qu.: 5200
## Median : 9400 Median : 25800 Median : 36600 Median : 11500
## Mean : 16445 Mean : 38468 Mean : 52777 Mean : 15677
## 3rd Qu.: 19700 3rd Qu.: 51000 3rd Qu.: 69900 3rd Qu.: 21800
## Max. :319000 Max. :487000 Max. :708000 Max. :167000
## NA's :7 NA's :11 NA's :7 NA's :11
## PCB180 PCB187 PCB194 lymphocyte
## Min. : 2000 Min. : 1100 Min. : 2.300 Min. : 4.60
## 1st Qu.: 11100 1st Qu.: 4000 1st Qu.: 5.700 1st Qu.:23.80
## Median : 27500 Median : 8100 Median : 6.900 Median :28.80
## Mean : 38487 Mean : 12331 Mean : 7.244 Mean :29.86
## 3rd Qu.: 54200 3rd Qu.: 16200 3rd Qu.: 8.400 3rd Qu.:35.40
## Max. :572000 Max. :145000 Max. :36.600 Max. :78.90
## NA's :8 NA's :7 NA's :6
## monocyte LBXNEPCT eosinophils basophils
## Min. : 1.600 Min. :15.90 Min. : 0.000 Min. :0.0000
## 1st Qu.: 6.700 1st Qu.:52.20 1st Qu.: 1.500 1st Qu.:0.4000
## Median : 7.800 Median :59.40 Median : 2.300 Median :0.6000
## Mean : 8.032 Mean :58.66 Mean : 2.839 Mean :0.6461
## 3rd Qu.: 9.200 3rd Qu.:65.12 3rd Qu.: 3.600 3rd Qu.:0.8000
## Max. :23.800 Max. :88.10 Max. :28.200 Max. :5.5000
## NA's :6 NA's :6 NA's :6 NA's :6
## hxcdd hpcdd ocdd pncdf
## Min. : 1.40 Min. : 1.90 Min. : 36.8 Min. : 0.700
## 1st Qu.: 21.98 1st Qu.: 24.57 1st Qu.: 202.0 1st Qu.: 2.300
## Median : 38.85 Median : 42.60 Median : 352.0 Median : 5.200
## Mean : 49.82 Mean : 59.52 Mean : 506.8 Mean : 6.872
## 3rd Qu.: 64.80 3rd Qu.: 75.53 3rd Qu.: 619.0 3rd Qu.: 9.500
## Max. :314.00 Max. :799.00 Max. :8190.0 Max. :46.000
## NA's :190 NA's :202 NA's :247 NA's :193
## hxcdf hxcdf2 hxcdf3 pcnb
## Min. : 1.000 Min. : 0.800 Min. : 0.9 Min. : 1.30
## 1st Qu.: 3.300 1st Qu.: 2.700 1st Qu.: 6.5 1st Qu.: 14.70
## Median : 5.300 Median : 4.300 Median : 10.1 Median : 25.10
## Mean : 6.591 Mean : 5.577 Mean : 11.8 Mean : 38.61
## 3rd Qu.: 8.025 3rd Qu.: 7.000 3rd Qu.: 14.4 3rd Qu.: 44.30
## Max. :44.400 Max. :33.500 Max. :234.0 Max. :845.00
## NA's :198 NA's :187 NA's :204 NA's :193
## hxcb segmented_neutrophils bmi_cat3 edu_cat
## Min. : 1.70 Min. : 1100 Min. :1.000 Min. :1.00
## 1st Qu.: 9.60 1st Qu.: 3800 1st Qu.:1.000 1st Qu.:1.00
## Median : 18.90 Median : 7000 Median :2.000 Median :2.00
## Mean : 25.32 Mean : 10559 Mean :1.984 Mean :2.37
## 3rd Qu.: 35.35 3rd Qu.: 14300 3rd Qu.:3.000 3rd Qu.:3.00
## Max. :172.00 Max. :187000 Max. :3.000 Max. :4.00
## NA's :195 NA's :21
## race_cat male ln_lbxcot age_cent
## Min. :1.00 Min. :0.0000 Min. :-4.5099 Min. :-23.000
## 1st Qu.:2.00 1st Qu.:0.0000 1st Qu.:-3.9633 1st Qu.:-10.000
## Median :4.00 Median :0.0000 Median :-2.6173 Median : 2.000
## Mean :3.18 Mean :0.4774 Mean :-0.8732 Mean : 4.887
## 3rd Qu.:4.00 3rd Qu.:1.0000 3rd Qu.: 2.9829 3rd Qu.: 19.000
## Max. :4.00 Max. :1.0000 Max. : 6.8189 Max. : 42.000
##
## age_sq
## Min. : 0.0
## 1st Qu.: 49.0
## Median : 196.0
## Mean : 359.6
## 3rd Qu.: 484.0
## Max. :1764.0
##
# Compact structure listing: dimensions, column types and leading values
str(object = df)
## 'data.frame': 1330 obs. of 33 variables:
## $ SEQN : int 13282 18242 20306 12676 16919 19584 10915 18032 15929 14837 ...
## $ TELOMEAN : num 0.726 0.822 1.365 0.787 1.525 ...
## $ PCB74 : int 127000 33200 67600 57100 64200 144000 67800 56300 62900 54000 ...
## $ PCB99 : int 123000 31200 61300 63000 86100 37200 61500 42000 66500 46600 ...
## $ PCB118 : int 319000 38500 59400 89900 110000 75700 95700 40500 99000 45200 ...
## $ PCB138 : int 487000 259000 310000 207000 306000 219000 204000 226000 214000 145000 ...
## $ PCB153 : int 708000 445000 416000 292000 316000 249000 296000 268000 248000 271000 ...
## $ PCB170 : int 165000 167000 76400 76400 76500 69900 62100 76000 50300 77300 ...
## $ PCB180 : int 572000 481000 191000 197000 146000 172000 165000 181000 160000 167000 ...
## $ PCB187 : int 144000 145000 88500 83300 57200 62700 48600 61900 67500 51400 ...
## $ PCB194 : num 4.4 3.1 9.5 5.9 3.5 7.8 6.2 5.7 13.2 6 ...
## $ lymphocyte : num 25.7 13.5 25.6 32.6 42.4 14.8 18.9 62.2 43.7 17.5 ...
## $ monocyte : num 11.4 11.5 7.5 5.5 7.3 8 10.4 11.5 6.9 6.9 ...
## $ LBXNEPCT : num 59.5 66.8 64.8 60.2 47.2 73.1 68.5 21.2 41.4 74.3 ...
## $ eosinophils : num 2.7 7.6 1.7 1.5 2.2 3.6 1.7 4.7 7.3 0.9 ...
## $ basophils : num 0.6 0.6 0.4 0.2 1 0.6 0.4 0.4 0.6 0.4 ...
## $ hxcdd : num 40.3 NA NA 165 142 133 114 190 114 276 ...
## $ hpcdd : num 10.6 NA NA 230 137 94.2 72.2 58.6 61.6 143 ...
## $ ocdd : num 282 NA NA 3190 1060 999 640 1700 1150 1760 ...
## $ pncdf : num 8.6 NA NA 18.9 21.2 25.6 21.4 19.7 26.1 38.3 ...
## $ hxcdf : num 2.8 NA NA 21.7 NA 14.8 12.7 23.6 15.5 21.6 ...
## $ hxcdf2 : num 3.1 NA NA 21.3 11.6 11.9 14.6 17.4 15.3 19.2 ...
## $ hxcdf3 : num 2.1 NA NA 27.4 NA 15 27.8 33.1 19.8 15.4 ...
## $ pcnb : num 137 NA NA 51.2 84.7 132 58 38.6 85.5 40.4 ...
## $ hxcb : num 131 NA NA 47.5 103 95.9 87.5 70 62.1 146 ...
## $ segmented_neutrophils: int 187000 125000 41200 49300 35800 28000 36100 38400 34000 41100 ...
## $ bmi_cat3 : int 1 1 2 3 1 3 1 1 2 1 ...
## $ edu_cat : int 1 1 1 1 4 1 3 1 3 1 ...
## $ race_cat : int 4 3 3 3 4 3 4 3 4 3 ...
## $ male : int 1 1 0 0 0 0 0 0 1 0 ...
## $ ln_lbxcot : num -4.51 -4.51 5.2 -2.23 -3.3 ...
## $ age_cent : int 42 42 26 42 8 30 38 5 27 42 ...
## $ age_sq : int 1764 1764 676 1764 64 900 1444 25 729 1764 ...
# Histogram of the TELOMEAN column (default breaks, title and axis labels)
hist(x = df$TELOMEAN)

# Subset the data frame to SEQN, the chemical columns and TELOMEAN,
# keeping the columns in the order listed here
chem <- df[c(
  "SEQN",
  "PCB99", "PCB153", "PCB187", "ocdd", "hxcdf2", "hxcb",
  "PCB118", "PCB170", "PCB194", "hxcdd", "pncdf", "hxcdf3",
  "PCB74", "PCB138", "PCB180", "hpcdd", "hxcdf", "pcnb",
  "TELOMEAN"
)]

# Summary statistics restricted to the chemical subset
summary(object = chem)
## SEQN PCB99 PCB153 PCB187
## Min. : 9966 Min. : 2000 Min. : 2100 Min. : 1100
## 1st Qu.:12786 1st Qu.: 3900 1st Qu.: 17000 1st Qu.: 4000
## Median :15376 Median : 6400 Median : 36600 Median : 8100
## Mean :15513 Mean : 10171 Mean : 52777 Mean : 12331
## 3rd Qu.:18248 3rd Qu.: 12000 3rd Qu.: 69900 3rd Qu.: 16200
## Max. :20995 Max. :123000 Max. :708000 Max. :145000
## NA's :29 NA's :7 NA's :7
## ocdd hxcdf2 hxcb PCB118
## Min. : 36.8 Min. : 0.800 Min. : 1.70 Min. : 2000
## 1st Qu.: 202.0 1st Qu.: 2.700 1st Qu.: 9.60 1st Qu.: 4500
## Median : 352.0 Median : 4.300 Median : 18.90 Median : 9400
## Mean : 506.8 Mean : 5.577 Mean : 25.32 Mean : 16445
## 3rd Qu.: 619.0 3rd Qu.: 7.000 3rd Qu.: 35.35 3rd Qu.: 19700
## Max. :8190.0 Max. :33.500 Max. :172.00 Max. :319000
## NA's :247 NA's :187 NA's :195 NA's :7
## PCB170 PCB194 hxcdd pncdf
## Min. : 2000 Min. : 2.300 Min. : 1.40 Min. : 0.700
## 1st Qu.: 5200 1st Qu.: 5.700 1st Qu.: 21.98 1st Qu.: 2.300
## Median : 11500 Median : 6.900 Median : 38.85 Median : 5.200
## Mean : 15677 Mean : 7.244 Mean : 49.82 Mean : 6.872
## 3rd Qu.: 21800 3rd Qu.: 8.400 3rd Qu.: 64.80 3rd Qu.: 9.500
## Max. :167000 Max. :36.600 Max. :314.00 Max. :46.000
## NA's :11 NA's :190 NA's :193
## hxcdf3 PCB74 PCB138 PCB180
## Min. : 0.9 Min. : 1100 Min. : 2100 Min. : 2000
## 1st Qu.: 6.5 1st Qu.: 4200 1st Qu.: 12250 1st Qu.: 11100
## Median : 10.1 Median : 7700 Median : 25800 Median : 27500
## Mean : 11.8 Mean : 12780 Mean : 38468 Mean : 38487
## 3rd Qu.: 14.4 3rd Qu.: 16150 3rd Qu.: 51000 3rd Qu.: 54200
## Max. :234.0 Max. :144000 Max. :487000 Max. :572000
## NA's :204 NA's :7 NA's :11 NA's :8
## hpcdd hxcdf pcnb TELOMEAN
## Min. : 1.90 Min. : 1.000 Min. : 1.30 Min. :0.5113
## 1st Qu.: 24.57 1st Qu.: 3.300 1st Qu.: 14.70 1st Qu.:0.8799
## Median : 42.60 Median : 5.300 Median : 25.10 Median :1.0342
## Mean : 59.52 Mean : 6.591 Mean : 38.61 Mean :1.0662
## 3rd Qu.: 75.53 3rd Qu.: 8.025 3rd Qu.: 44.30 3rd Qu.:1.2158
## Max. :799.00 Max. :44.400 Max. :845.00 Max. :2.4291
## NA's :202 NA's :198 NA's :193
# Structure of the chemical subset: dimensions, column types, leading values
str(object = chem)
## 'data.frame': 1330 obs. of 20 variables:
## $ SEQN : int 13282 18242 20306 12676 16919 19584 10915 18032 15929 14837 ...
## $ PCB99 : int 123000 31200 61300 63000 86100 37200 61500 42000 66500 46600 ...
## $ PCB153 : int 708000 445000 416000 292000 316000 249000 296000 268000 248000 271000 ...
## $ PCB187 : int 144000 145000 88500 83300 57200 62700 48600 61900 67500 51400 ...
## $ ocdd : num 282 NA NA 3190 1060 999 640 1700 1150 1760 ...
## $ hxcdf2 : num 3.1 NA NA 21.3 11.6 11.9 14.6 17.4 15.3 19.2 ...
## $ hxcb : num 131 NA NA 47.5 103 95.9 87.5 70 62.1 146 ...
## $ PCB118 : int 319000 38500 59400 89900 110000 75700 95700 40500 99000 45200 ...
## $ PCB170 : int 165000 167000 76400 76400 76500 69900 62100 76000 50300 77300 ...
## $ PCB194 : num 4.4 3.1 9.5 5.9 3.5 7.8 6.2 5.7 13.2 6 ...
## $ hxcdd : num 40.3 NA NA 165 142 133 114 190 114 276 ...
## $ pncdf : num 8.6 NA NA 18.9 21.2 25.6 21.4 19.7 26.1 38.3 ...
## $ hxcdf3 : num 2.1 NA NA 27.4 NA 15 27.8 33.1 19.8 15.4 ...
## $ PCB74 : int 127000 33200 67600 57100 64200 144000 67800 56300 62900 54000 ...
## $ PCB138 : int 487000 259000 310000 207000 306000 219000 204000 226000 214000 145000 ...
## $ PCB180 : int 572000 481000 191000 197000 146000 172000 165000 181000 160000 167000 ...
## $ hpcdd : num 10.6 NA NA 230 137 94.2 72.2 58.6 61.6 143 ...
## $ hxcdf : num 2.8 NA NA 21.7 NA 14.8 12.7 23.6 15.5 21.6 ...
## $ pcnb : num 137 NA NA 51.2 84.7 132 58 38.6 85.5 40.4 ...
## $ TELOMEAN: num 0.726 0.822 1.365 0.787 1.525 ...
4. Plot data
# Reusable ggplot2 styling, stored as a list so it can be added to a plot
# with `+`. Sets bold axis tick labels (x in deeppink4) and enlarges the
# axis titles and the plot title.
gghisto <- list(
  theme(
    plot.title = element_text(face = "bold", size = 17),
    axis.title = element_text(size = 17),
    axis.text.x = element_text(color = "deeppink4", face = "bold", size = 12),
    axis.text.y = element_text(face = "bold", size = 14)
  )
)
# One chemical: boxplots of TELOMEAN across bins of PCB99.
# PCB99 is continuous, so geom_boxplot() needs an explicit `group`; without
# one ggplot2 warns "Continuous x aesthetic" and collapses everything into a
# single box. cut_number() splits PCB99 into 4 bins holding roughly equal
# numbers of observations, giving one box per bin.
# Rows with a missing PCB99 value are dropped up front so they cannot form a
# bin of their own and no missing-value warning is raised while plotting.
chem_pcb99 <- chem[!is.na(chem$PCB99), ]
chembox <- ggplot(chem_pcb99, aes(PCB99, TELOMEAN))
chembox + geom_boxplot(aes(group = cut_number(PCB99, 4)))

# Single boxplot of TELOMEAN drawn against SEQN.
# SEQN is continuous, so geom_boxplot() with no `group` warns "Continuous x
# aesthetic" and then draws one box for all rows. Mapping `group = 1` on the
# boxplot layer states that single-group intent explicitly and silences the
# warning; the base `box` object is left unchanged.
# NOTE(review): SEQN looks like a record identifier rather than a measurement;
# confirm that plotting TELOMEAN against it is intended.
box <- ggplot(df, aes(SEQN, TELOMEAN))
box + geom_boxplot(aes(group = 1))

# Education: treat edu_cat as categorical (it is read in as an integer) so
# that each category gets its own box
df[["edu_cat"]] <- as.factor(df[["edu_cat"]])

# Boxplot of TELOMEAN per education category, styled with the shared
# gghisto theme list
ggplot(data = df, mapping = aes(x = edu_cat, y = TELOMEAN)) +
  geom_boxplot(
    width = .4,
    color = "deeppink4",
    fill = "pink",
    alpha = .5
  ) +
  labs(title = "Edu by Telomean ", x = "Education Category") +
  gghisto
