library(readxl)
# Read the HERS spreadsheet into a tibble, then preview its first rows.
hers.dat <- read_excel(path = "C:/Users/mavourin/Desktop/hersdata.xls")
head(x = hers.dat)
## # A tibble: 6 × 40
## HT age raceth nonwhite smoking drinkany exercise physact globrat poorfair
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 0 70 2 1 0 0 0 somewh… 3 0
## 2 0 62 2 1 0 0 0 much m… 3 0
## 3 1 69 1 0 0 0 0 about … 3 0
## 4 0 64 1 0 1 1 0 much m… 3 0
## 5 0 65 1 0 0 0 0 much l… 3 0
## 6 1 68 2 1 0 1 0 about … 3 0
## # ℹ 30 more variables: medcond <dbl>, htnmeds <dbl>, statins <dbl>,
## # diabetes <dbl>, dmpills <dbl>, insulin <dbl>, weight <dbl>, BMI <dbl>,
## # waist <dbl>, WHR <dbl>, glucose <dbl>, weight1 <dbl>, BMI1 <dbl>,
## # waist1 <dbl>, WHR1 <dbl>, glucose1 <dbl>, tchol <dbl>, LDL <dbl>,
## # HDL <dbl>, TG <dbl>, tchol1 <dbl>, LDL1 <dbl>, HDL1 <dbl>, TG1 <dbl>,
## # SBP <dbl>, DBP <dbl>, id <dbl>, afr_amer <dbl>, othereth <dbl>, age10 <dbl>
There are missing values in the LDL variable (LDL was not recorded for every patient). In R, missing data are recorded as NA, and calculations that involve an NA generally return NA.
# Without na.rm, the missing LDL values make both results NA.
mean(hers.dat[["LDL"]])
## [1] NA
sum(hers.dat[["LDL"]])
## [1] NA
To find summary statistics for the non-missing data, we can specify na.rm = TRUE, which tells the R function to exclude the NA values.
# Mean LDL computed over the recorded (non-NA) values only.
mean(hers.dat[["LDL"]], na.rm = TRUE)
## [1] 145.0385
Likewise
# Centre, spread and quartiles of LDL, ignoring the missing values.
median(hers.dat[["LDL"]], na.rm = TRUE)
## [1] 141
sd(hers.dat[["LDL"]], na.rm = TRUE)
## [1] 37.80322
quantile(hers.dat[["LDL"]], na.rm = TRUE)
## 0% 25% 50% 75% 100%
## 36.8 119.6 141.0 166.0 393.4
# Column means of the first five variables.
colMeans(hers.dat[, seq_len(5)])
## HT age raceth nonwhite smoking
## 0.4994571 66.6485704 1.1469417 0.1129207 0.1302932
# Per-column summary of the whole data set.
summary(object = hers.dat)
## HT age raceth nonwhite
## Min. :0.0000 Min. :44.00 Min. :1.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:62.00 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :67.00 Median :1.000 Median :0.0000
## Mean :0.4995 Mean :66.65 Mean :1.147 Mean :0.1129
## 3rd Qu.:1.0000 3rd Qu.:72.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :1.0000 Max. :79.00 Max. :3.000 Max. :1.0000
##
## smoking drinkany exercise physact
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Length:2763
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 Class :character
## Median :0.0000 Median :0.0000 Median :0.0000 Mode :character
## Mean :0.1303 Mean :0.3915 Mean :0.3865
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## NA's :2
## globrat poorfair medcond htnmeds
## Min. :1.000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:3.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.0000
## Median :3.000 Median :0.0000 Median :0.0000 Median :1.0000
## Mean :3.063 Mean :0.2409 Mean :0.3721 Mean :0.8216
## 3rd Qu.:4.000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :5.000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## NA's :3 NA's :3
## statins diabetes dmpills insulin
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.00000 Median :0.00000
## Mean :0.3634 Mean :0.2646 Mean :0.09627 Mean :0.09881
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.00000
##
## weight BMI waist WHR
## Min. : 37.50 Min. :15.21 Min. : 56.90 Min. :0.624
## 1st Qu.: 62.20 1st Qu.:24.64 1st Qu.: 82.00 1st Qu.:0.811
## Median : 71.00 Median :27.75 Median : 90.50 Median :0.867
## Mean : 72.73 Mean :28.58 Mean : 91.74 Mean :0.870
## 3rd Qu.: 81.40 3rd Qu.:31.73 3rd Qu.:100.30 3rd Qu.:0.923
## Max. :132.00 Max. :54.13 Max. :170.00 Max. :1.218
## NA's :2 NA's :5 NA's :2 NA's :3
## glucose weight1 BMI1 waist1
## Min. : 29.0 Min. : 37.70 Min. :14.73 Min. : 59.00
## 1st Qu.: 91.0 1st Qu.: 61.20 1st Qu.:24.34 1st Qu.: 81.30
## Median : 99.0 Median : 70.40 Median :27.54 Median : 90.00
## Mean :112.2 Mean : 72.04 Mean :28.36 Mean : 91.12
## 3rd Qu.:114.0 3rd Qu.: 80.90 3rd Qu.:31.54 3rd Qu.:100.00
## Max. :298.0 Max. :142.00 Max. :54.04 Max. :142.00
## NA's :150 NA's :153 NA's :151
## WHR1 glucose1 tchol LDL
## Min. :0.6060 Min. : 42.0 Min. :110.0 Min. : 36.8
## 1st Qu.:0.8100 1st Qu.: 91.0 1st Qu.:201.0 1st Qu.:119.6
## Median :0.8630 Median :100.0 Median :224.0 Median :141.0
## Mean :0.8668 Mean :114.5 Mean :228.6 Mean :145.0
## 3rd Qu.:0.9200 3rd Qu.:116.0 3rd Qu.:252.0 3rd Qu.:166.0
## Max. :1.1500 Max. :440.0 Max. :465.0 Max. :393.4
## NA's :151 NA's :150 NA's :4 NA's :11
## HDL TG tchol1 LDL1
## Min. : 14.00 Min. : 31.0 Min. : 92.0 Min. :-20.0
## 1st Qu.: 41.00 1st Qu.:116.0 1st Qu.:193.0 1st Qu.:106.6
## Median : 49.00 Median :157.0 Median :214.0 Median :128.8
## Mean : 50.26 Mean :166.1 Mean :219.2 Mean :132.4
## 3rd Qu.: 57.00 3rd Qu.:208.0 3rd Qu.:242.0 3rd Qu.:154.1
## Max. :130.00 Max. :476.0 Max. :535.0 Max. :450.2
## NA's :11 NA's :4 NA's :150 NA's :155
## HDL1 TG1 SBP DBP
## Min. : 14.00 Min. : 31.0 Min. : 83.0 Min. : 45.00
## 1st Qu.: 42.00 1st Qu.: 119.0 1st Qu.:122.0 1st Qu.: 67.00
## Median : 50.00 Median : 157.0 Median :134.0 Median : 72.00
## Mean : 51.78 Mean : 175.8 Mean :135.1 Mean : 73.15
## 3rd Qu.: 59.00 3rd Qu.: 214.0 3rd Qu.:147.0 3rd Qu.: 80.00
## Max. :124.00 Max. :1016.0 Max. :224.0 Max. :102.00
## NA's :155 NA's :150 NA's :1
## id afr_amer othereth age10
## Min. : 1.0 Min. :0.0000 Min. :0.00000 Min. :4.400
## 1st Qu.: 691.5 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:6.200
## Median :1382.0 Median :0.0000 Median :0.00000 Median :6.700
## Mean :1382.0 Mean :0.0789 Mean :0.03402 Mean :6.665
## 3rd Qu.:2072.5 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:7.200
## Max. :2763.0 Max. :1.0000 Max. :1.00000 Max. :7.900
##
library(dplyr)
# Means of every column whose name starts with "T" (the match ignores case,
# so tchol and tchol1 are included), skipping missing values.
hers.dat %>%
  select(starts_with("T")) %>%
  colMeans(na.rm = TRUE)
## tchol TG tchol1 TG1
## 228.5799 166.1493 219.2346 175.7976
# Quartiles, mean, range and NA count for the same columns.
hers.dat %>%
  select(starts_with("T")) %>%
  summary()
## tchol TG tchol1 TG1
## Min. :110.0 Min. : 31.0 Min. : 92.0 Min. : 31.0
## 1st Qu.:201.0 1st Qu.:116.0 1st Qu.:193.0 1st Qu.: 119.0
## Median :224.0 Median :157.0 Median :214.0 Median : 157.0
## Mean :228.6 Mean :166.1 Mean :219.2 Mean : 175.8
## 3rd Qu.:252.0 3rd Qu.:208.0 3rd Qu.:242.0 3rd Qu.: 214.0
## Max. :465.0 Max. :476.0 Max. :535.0 Max. :1016.0
## NA's :4 NA's :4 NA's :150 NA's :150
Let’s examine the physact variable in the HERS dataset. unique(x) will return the unique elements of x, so we can see what the different categories of physical activity are.
# Distinct physical-activity categories, then how many there are.
unique(hers.dat[["physact"]])
## [1] "somewhat more active" "much more active" "about as active"
## [4] "much less active" "somewhat less active"
length(unique(hers.dat[["physact"]]))
## [1] 5
To count how many observations fall into each physical activity category, we can use table or count to construct a frequency table.
# Number of observations in each physical-activity category.
table(hers.dat[["physact"]])
##
## about as active much less active much more active
## 919 503 197
## somewhat less active somewhat more active
## 838 306
You can convert the frequency table to a table of proportions with prop.table.
# Share of observations in each physical-activity category.
prop.table(x = table(hers.dat[["physact"]]))
##
## about as active much less active much more active
## 0.33260948 0.18204850 0.07129931
## somewhat less active somewhat more active
## 0.30329352 0.11074919
# Load the TB incidence workbook into `tb`; the plots below do not use it.
# NOTE(review): the path is absolute and user-specific — confirm it exists
# on the machine running this script.
tb <- read_excel("C:/Users/mavourin/Desktop/tb_incidence.xlsx")
# Base-graphics views of the LDL distribution: a histogram, a density
# curve (NAs dropped explicitly via na.rm), and a boxplot.
hist(hers.dat$LDL)
plot(density(hers.dat$LDL, na.rm = TRUE))
boxplot(hers.dat$LDL)
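When using the ggplot2 package, we will focus on qplot (quick plot) for now. Later we will see how to use the more general ggplot function with layers. qplot has a few key inputs: x (the variable to plot), data (the data frame that contains it), and geom (the type of plot to draw).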
library(ggplot2)
# The same three views of LDL drawn with ggplot2's qplot(); only the geom
# argument changes between the calls.
# NOTE(review): qplot() is reported as deprecated in recent ggplot2
# releases (3.4.0 onward) — confirm against the installed version.
qplot(x = LDL, data = hers.dat, geom = "histogram")
qplot(x = LDL, data = hers.dat, geom = "density")
qplot(x = LDL, data = hers.dat, geom = "boxplot")
To flip the orientation of the boxplot, add coord_flip() to the plot:
# Same LDL boxplot with the x and y axes swapped by coord_flip().
qplot(x = LDL, data = hers.dat, geom = "boxplot") + coord_flip()
For one categorical variable, we graphically summarize it with a bar plot or a pie chart.
# Bar chart and pie chart of the physical-activity counts.
barplot(height = table(hers.dat[["physact"]]))
pie(x = table(hers.dat[["physact"]]))
# Cross-tabulate physical activity (rows) against smoking (columns); the
# argument names supply the dimension labels of the resulting table.
phys_smoking_ct <- table(
  physact = hers.dat[["physact"]],
  smoking = hers.dat[["smoking"]]
)
phys_smoking_ct
## smoking
## physact 0 1
## about as active 804 115
## much less active 418 85
## much more active 155 42
## somewhat less active 758 80
## somewhat more active 268 38
# No margin: each cell as a share of the grand total.
prop.table(x = phys_smoking_ct)
## smoking
## physact 0 1
## about as active 0.29098806 0.04162143
## much less active 0.15128484 0.03076366
## much more active 0.05609844 0.01520087
## somewhat less active 0.27433949 0.02895404
## somewhat more active 0.09699602 0.01375317
# margin = 1: each cell as a share of its row, so every row sums to 1.
prop.table(x = phys_smoking_ct, margin = 1)
## smoking
## physact 0 1
## about as active 0.87486398 0.12513602
## much less active 0.83101392 0.16898608
## much more active 0.78680203 0.21319797
## somewhat less active 0.90453461 0.09546539
## somewhat more active 0.87581699 0.12418301
# margin = 2: each cell as a share of its column, so every column sums to 1.
prop.table(x = phys_smoking_ct, margin = 2)
## smoking
## physact 0 1
## about as active 0.3345818 0.3194444
## much less active 0.1739492 0.2361111
## much more active 0.0645027 0.1166667
## somewhat less active 0.3154390 0.2222222
## somewhat more active 0.1115273 0.1055556
# No margin: the grand total of the table.
margin.table(x = phys_smoking_ct)
## [1] 2763
# margin = 1: row totals (one per physical-activity category).
margin.table(x = phys_smoking_ct, margin = 1)
## physact
## about as active much less active much more active
## 919 503 197
## somewhat less active somewhat more active
## 838 306
# margin = 2: column totals (one per smoking status).
margin.table(x = phys_smoking_ct, margin = 2)
## smoking
## 0 1
## 2403 360