birthwt
dataset from the MASS
librarylibrary(MASS) # Support Functions and Datasets for Venables and Ripley's MASS
library(psych) # Procedures for Psychological, Psychometric, and Personality
library(car) # Companion to Applied Regression
library(magrittr) # A Forward-Pipe Operator for R
library(RColorBrewer)
birthwt
dataset from the MASS
librarydata(birthwt)
str(birthwt)
## 'data.frame': 189 obs. of 10 variables:
## $ low : int 0 0 0 0 0 0 0 0 0 0 ...
## $ age : int 19 33 20 21 18 21 22 17 29 26 ...
## $ lwt : int 182 155 105 108 107 124 118 103 123 113 ...
## $ race : int 2 3 1 1 1 3 1 3 1 1 ...
## $ smoke: int 0 0 1 1 1 0 0 0 1 1 ...
## $ ptl : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ht : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ui : int 1 0 0 1 1 0 0 0 0 0 ...
## $ ftv : int 0 3 1 2 0 0 1 1 1 0 ...
## $ bwt : int 2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
# | variable name | variable label | coded levels |
---|---|---|---|
1 | low | indicator of birth weight less than 2.5 kg | 0, 1 |
2 | age | mother’s age in years | continuous variable |
3 | lwt | mother’s weight in pounds at last menstrual period | continuous variable |
4 | race | mother’s race (1 = white, 2 = black, 3 = other) | 1, 2, 3 |
5 | smoke | smoking status during pregnancy | 0, 1 |
6 | ptl | number of previous premature labours | 0, 1, 2, 3 |
7 | ht | history of hypertension | 0, 1 |
8 | ui | presence of uterine irritability | 0, 1 |
9 | ftv | number of physician visits during the first trimester | 0, 1, 2, 3, 4, 6 |
10 | bwt | birth weight in grams | continuous variable |
head(birthwt)
low | age | lwt | race | smoke | ptl | ht | ui | ftv | bwt | |
---|---|---|---|---|---|---|---|---|---|---|
85 | 0 | 19 | 182 | 2 | 0 | 0 | 0 | 1 | 0 | 2523 |
86 | 0 | 33 | 155 | 3 | 0 | 0 | 0 | 0 | 3 | 2551 |
87 | 0 | 20 | 105 | 1 | 1 | 0 | 0 | 0 | 1 | 2557 |
88 | 0 | 21 | 108 | 1 | 1 | 0 | 0 | 1 | 2 | 2594 |
89 | 0 | 18 | 107 | 1 | 1 | 0 | 0 | 1 | 0 | 2600 |
91 | 0 | 21 | 124 | 3 | 0 | 0 | 0 | 0 | 0 | 2622 |
Use factor
to recode
ibwt <- birthwt
ibwt$low <- factor(ibwt$low)
ibwt$race <- factor(ibwt$race)
ibwt$smoke <- factor(ibwt$smoke)
ibwt$ptl <- factor(ibwt$ptl)
ibwt$ht <- factor(ibwt$ht)
ibwt$ui <- factor(ibwt$ui)
ibwt$ftv <- factor(ibwt$ftv)
str(ibwt)
## 'data.frame': 189 obs. of 10 variables:
## $ low : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ age : int 19 33 20 21 18 21 22 17 29 26 ...
## $ lwt : int 182 155 105 108 107 124 118 103 123 113 ...
## $ race : Factor w/ 3 levels "1","2","3": 2 3 1 1 1 3 1 3 1 1 ...
## $ smoke: Factor w/ 2 levels "0","1": 1 1 2 2 2 1 1 1 2 2 ...
## $ ptl : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ ht : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ ui : Factor w/ 2 levels "0","1": 2 1 1 2 2 1 1 1 1 1 ...
## $ ftv : Factor w/ 6 levels "0","1","2","3",..: 1 4 2 3 1 1 2 2 2 1 ...
## $ bwt : int 2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
bwt
could be dependent on all other variables except low
ibwtc <- ibwt[-1]
head(ibwtc)
age | lwt | race | smoke | ptl | ht | ui | ftv | bwt | |
---|---|---|---|---|---|---|---|---|---|
85 | 19 | 182 | 2 | 0 | 0 | 0 | 1 | 0 | 2523 |
86 | 33 | 155 | 3 | 0 | 0 | 0 | 0 | 3 | 2551 |
87 | 20 | 105 | 1 | 1 | 0 | 0 | 0 | 1 | 2557 |
88 | 21 | 108 | 1 | 1 | 0 | 0 | 1 | 2 | 2594 |
89 | 18 | 107 | 1 | 1 | 0 | 0 | 1 | 0 | 2600 |
91 | 21 | 124 | 3 | 0 | 0 | 0 | 0 | 0 | 2622 |
par(mfrow=c(1,1))
par(oma = c(1, 1, 1, 1)) # Sets outside margins: b, l, t, r
par(mar = c(4, 1, 4, 1)) # Sets plot margins: b, l, t, r
# Histogram
hist(ibwtc$bwt,
prob = TRUE, # else freq = FALSE
ylim = c(0, 0.0006),
#xlim = c(30, 100),
breaks = 12,
col = "lightgray", #E5E5E5
border = 0,
xlab="bwt",
main = "Infant's birth weight from the \"birthweight\" data set ")
# Normal density curve
curve(dnorm(x, mean = mean(ibwtc$bwt), sd = sd(ibwtc$bwt)),
col = "darkred",
lwd = 2,
add = TRUE)
# Histogram: Kernel density lines
lines(density(ibwtc$bwt), col = "blue", lwd = 2)
# Histogram: Rug
rug(ibwtc$bwt, col = "red", lwd = 2)
par(mfrow=c(1,1))
par(oma = c(1, 1, 1, 1))
par(mar = c(4, 1, 4, 1))
qqnorm(ibwtc$bwt,
xlab = "bwt",
main="Normal Q-Q plot of infant's birth weight")
qqline(ibwtc$bwt)
par(mfrow=c(1,1))
par(oma = c(1, 1, 1, 1))
par(mar = c(4, 1, 4, 1))
boxplot(ibwtc$bwt,
horizontal = T,
xlab = "bwt",
main="Boxplot of infant's birth weight")
par(mfrow=c(1,3))
par(oma = c(1, 1, 1, 1))
par(mar=c(4, 1, 4, 1))
hist(ibwtc$age,
prob = TRUE,
ylim = c(0, 0.1),
#xlim = c(30, 100),
breaks = 12,
col = "lightgray",
border = 0,
xlab="age",
main = "Mother's age from\n the \"birthweight\" data set ")
curve(dnorm(x, mean = mean(ibwtc$age), sd = sd(ibwtc$age)),
col = "darkred",
lwd = 2,
add = TRUE)
lines(density(ibwtc$age), col = "blue", lwd = 2)
rug(ibwtc$age, col = "red", lwd = 2)
qqnorm(ibwtc$age,
xlab = "age",
main="Normal Q-Q plot of\n mother's age")
qqline(ibwtc$age)
boxplot(ibwtc$age,
horizontal = T,
xlab = "age",
main="Boxplot of mother's age")
par(mfrow=c(1,3))
par(oma = c(1, 1, 1, 1))
par(mar=c(4, 1, 4, 1))
hist(ibwtc$lwt,
prob = TRUE, # else freq = FALSE
ylim = c(0, 0.02),
#xlim = c(30, 100),
breaks = 12,
col = "lightgray", #E5E5E5
border = 0,
xlab="lwt",
main = "Mother's weight in pounds\n at last menstrual period")
curve(dnorm(x, mean = mean(ibwtc$lwt), sd = sd(ibwtc$lwt)),
col = "darkred",
lwd = 2,
add = TRUE)
lines(density(ibwtc$lwt), col = "blue", lwd = 2)
rug(ibwtc$lwt, col = "red", lwd = 2)
qqnorm(ibwtc$lwt,
xlab = "lwt",
main="Normal Q-Q plot of mother's weight\n in pounds at last menstrual period")
qqline(ibwtc$lwt)
boxplot(ibwtc$age,
horizontal = T,
xlab = "lwt",
main="Boxplot of mother's weight in\n pounds at last menstrual period")
par(mfrow=c(2,3))
par(oma = c(1, 1, 1, 1))
par(mar=c(4, 1, 4, 1))
barplot(table(ibwtc$race),
col= c("bisque1", "bisque2"),
xlab="race",
ylab="count",
main="Number of participants\n by race",
ylim=c(0, 170))
barplot(table(ibwtc$smoke),
col= c("bisque1", "bisque2"),
xlab="smoke",
main="Number of participants\n by smoking status",
ylim=c(0, 170))
barplot(table(ibwtc$ptl),
col= c("bisque1", "bisque2"),
xlab="ptl",
main="Number of participants by\n premature delivery status",
ylim=c(0, 170))
barplot(table(ibwtc$ht),
col= c("bisque1", "bisque2"),
xlab="ht",
main=" Number of participants\n by hypertension status",
ylim=c(0, 200))
barplot(table(ibwtc$ui),
xlab="ui",
col= c("bisque1", "bisque2"),
main="Number of participants by\n uterine infection status",
ylim=c(0, 200))
barplot(table(ibwtc$ftv),
col= c("bisque1", "bisque2", "bisque3"),
xlab="ftv",
main="Number of participants by 1st\n trimester physician visit status",
ylim=c(0, 200))
par(mfrow=c(2,3))
par(oma = c(1, 1, 1, 1))
par(mar=c(4, 1, 4, 1))
barplot(table(ibwtc$race)[order(table(ibwtc$race))],
col= c("bisque1", "bisque2"),
xlab="race",
ylab="count",
main="Number of participants\n by race",
ylim=c(0, 170))
barplot(table(ibwtc$smoke)[order(table(ibwtc$smoke))],
col= c("bisque1", "bisque2"),
xlab="smoke",
main="Number of participants\n by smoking status",
ylim=c(0, 170))
barplot(table(ibwtc$ptl)[order(table(ibwtc$ptl))],
col= c("bisque1", "bisque2"),
xlab="ptl",
main="Number of participants by\n premature delivery status",
ylim=c(0, 170))
barplot(table(ibwtc$ht)[order(table(ibwtc$ht))],
col= c("bisque1", "bisque2"),
xlab="ht",
main=" Number of participants\n by hypertension status",
ylim=c(0, 200))
barplot(table(ibwtc$ui)[order(table(ibwtc$ui))],
xlab="ui",
col= c("bisque1", "bisque2"),
main="Number of participants by\n uterine infection status",
ylim=c(0, 200))
barplot(table(ibwtc$ftv)[order(table(ibwtc$ftv))],
col= c("bisque1", "bisque2", "bisque3"),
xlab="ftv",
main="Number of participants by 1st\n trimester physician visit status",
ylim=c(0, 200))
par(mfrow=c(3,2))
par(oma=c(1, 1, 1, 1))
par(mar=c(4, 1, 4, 1))
barplot(table(ibwtc$race)[order(table(ibwtc$race))],
horiz = TRUE,
col= c("bisque1", "bisque2", "bisque3"),
xlab="count",
ylab="race",
main="Frequency of mother's race")
barplot(table(ibwtc$smoke)[order(table(ibwtc$smoke))],
horiz = TRUE,
col= c("bisque1", "bisque2", "bisque3"),
xlab="count",
ylab="ftv",
main="Frequency of smoking status")
barplot(table(ibwtc$ptl)[order(table(ibwtc$ptl))],
horiz = T,
col= c("bisque1", "bisque2"),
xlab="count",
ylab="ptl",
main="Frequency of participants by\n premature delivery status")
barplot(table(ibwtc$ht)[order(table(ibwtc$ht))],
horiz = TRUE,
col= c("bisque1", "bisque2", "bisque3"),
xlab="count",
ylab="ht",
main="Number of participants\n by hypertension status")
barplot(table(ibwtc$ui)[order(table(ibwtc$ui))],
horiz=T,
las=1, # las gives orientation of axis labels
xlab="count",
ylab="ui",
col= c("bisque1", "bisque2"),
main="Frequency by\n uterine infection status")
barplot(table(ibwtc$ftv)[order(table(ibwtc$ftv))],
horiz = TRUE,
col= c("bisque1", "bisque2", "bisque3"),
xlab="count",
ylab="ftv",
main="Frequency of 1st trimester\n physician visit status")
par(mfrow=c(1, 2))
par(oma=c(1, 1, 1, 1))
par(mar=c(4, 1, 4, 1))
# age vs bwt
plot(ibwtc$age,
ibwtc$bwt,
col="grey",
pch=16,
main="Infant's birth weight\n by mother's age",
xlab="Mother's age (years)",
ylab="Birth weight (grams)")
abline(lm(ibwtc$bwt ~ ibwtc$age),
col="darkred",
lwd=2)
lines(lowess (ibwtc$age, ibwtc$bwt),
col = "blue",
lwd = 2)
# lwt vs bwt
plot(ibwtc$lwt,
ibwtc$bwt,
col="grey",
pch=16,
xlab="Mother's weight (pounds)\n at last menstrual period",
ylab="Birth weight (grams)",
main="Infant's birth weight\n by mother's weight in pounds\n at last menstrual period")
abline(lm(ibwtc$bwt ~ ibwtc$lwt),
col="darkred",
lwd=2)
lines(lowess (ibwtc$lwt, ibwtc$bwt),
col = "blue",
lwd = 2)
scatterplotMatrix(~ bwt + lwt + age,
data = ibwtc,
col = "lightgray",
main="Scatterplot Matrix for \"birthwt\" Data")
panel.hist <- function(x, ...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5) )
h <- hist(x, plot = FALSE)
breaks <- h$breaks; nB <- length(breaks)
y <- h$counts; y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1], y, ...)
}
pairs(~ bwt + age + lwt,
data=ibwtc,
panel = panel.smooth, # Optional smoother
main = "Scatterplot Matrix \"birthwt\" Data Using \"pairs\" Function",
diag.panel = panel.hist,
pch = 16,
col = "lightgray")
par(mfrow=c(2,3))
par(oma=c(1, 1, 1, 1))
par(mar=c(4, 1, 4, 1))
boxplot(bwt ~ race,
data=ibwtc,
col = brewer.pal(4, "Set2"),
boxwex = .5, # Width of box
whisklty = 1, # Whisker line type; 1 = solid line
outpch = 16, # Outlier symbol; 16 = filled circle
outcol = brewer.pal(4, "Set2"), # Outlier color
xlab = "Race",
ylab = "Infant's body weight",
main="Birth weight by\n mother's race")
boxplot(ibwtc$bwt ~ ibwtc$smoke,
col = brewer.pal(4, "Set2"),
boxwex = .5,
whisklty = 1,
outpch = 16,
outcol = brewer.pal(4, "Set2"),
xlab = "Smoking status",
ylab = "Infant body weight",
main="Birth weight by mother's\n smoking status")
boxplot(ibwtc$bwt ~ ibwtc$ht,
col = brewer.pal(4, "Set2"),
boxwex = 0.5,
whisklty = 1,
outpch = 16,
outcol = brewer.pal(4, "Set2"),
xlab = "Hypertension status",
ylab = "Infant body weight",
main="Birth weight by mother's\n hypertension status")
boxplot(ibwtc$bwt ~ ibwtc$ui,
boxwex = 0.5,
outpch = 16,
xlab = "Uterine infection status",
ylab = "Infant body weight",
main="Birth weight by mother's\n uterine infection status")
boxplot(ibwtc$bwt ~ ibwtc$ptl,
boxwex = 0.5,
outpch = 16,
xlab = "Previous premature labor status",
ylab = "Infant body weight",
main="Birth weight by mother's\n previous premature labor status")
boxplot(ibwtc$bwt ~ ibwtc$ftv,
boxwex = 0.5,
outpch = 16,
xlab = "Physician visit status during 1st trimester",
ylab = "Infant body weight",
main="Birth weight by mother's\n physician visit\n status during 1st trimester")
# Boxplots using plot function
par(oma=c(1, 1, 1, 1))
par(mar=c(4, 1, 4, 1))
plot(bwt ~ race,
data=ibwtc,
col = brewer.pal(4, "Set2"),
boxwex = .5,
whisklty = 1,
outpch = 16,
outcol = brewer.pal(4, "Set2"),
xlab = "Race",
ylab = "Infant's body weight",
main="Birth weight by mother's race")
par(mfrow=c(1,3))
par(oma=c(1, 1, 1, 1))
par(mar=c(4, 1, 4, 1))
# ex1 using square brackets to subset and calculate mean
mbwt.race1 <- mean(ibwtc[ibwtc$race==1, ]$bwt)
mbwt.race2 <- mean(ibwtc[ibwtc$race==2, ]$bwt)
mbwt.race3 <- mean(ibwtc[ibwtc$race==3, ]$bwt)
barplot(c(mbwt.race1, mbwt.race2, mbwt.race3),
legend.text = c("1", "2", "3"),
col = c(1, 2, 3),
main = "Mean birth weight by race",
xlab = "Race",
ylab = "Birth weight")
# ex 2 using subset function
mbwt.by.race <- c(mean(subset(ibwtc$bwt, ibwtc$race==1)), mean(subset(ibwtc$bwt, ibwtc$race==2)), mean(subset(ibwtc$bwt, ibwtc$race==3)))
barplot(mbwt.by.race,
legend.text = c("1", "2", "3"),
col = c(1, 2, 3),
main = "Mean birth weight by race",
xlab = "Race",
ylab = "Birth weight")
# ex 3 using forward pipes with subset function
mbwt_by_race <- c(ibwtc$bwt %>% subset(ibwtc$race==1) %>% mean(), ibwtc$bwt %>% subset(ibwtc$race==2) %>% mean(), ibwtc$bwt %>% subset(ibwtc$race==3) %>% mean())
barplot(mbwt_by_race,
legend.text = c("1", "2", "3"),
col = c(1, 2, 3),
main = "Mean birth weight by race",
xlab = "Race",
ylab = "Birth weight")
# Aggregate
bwt.by.race.means <- aggregate(bwt ~ race, data=ibwt, FUN = mean)
bwt.by.smoke.means <- aggregate(bwt ~ smoke, data=ibwtc, FUN = mean)
bwt.by.ht.means <- aggregate(bwt ~ ht, data=ibwtc, FUN = mean)
bwt.by.ui.means <- aggregate(bwt ~ ui, data=ibwtc, FUN = mean)
bwt.by.ptl.means <- aggregate(bwt ~ ptl, data=ibwtc, FUN = mean)
bwt.by.ftv.means <- aggregate(bwt ~ ftv, data=ibwtc, FUN = mean)
bwt.by.race.means
race | bwt |
---|---|
1 | 3102.719 |
2 | 2719.692 |
3 | 2805.284 |
# Transpose and assign variable/column names
bwt.by.race.means.t <- t(bwt.by.race.means[-1])
colnames(bwt.by.race.means.t) <- bwt.by.race.means[, 1]
bwt.by.smoke.means.t <- t(bwt.by.smoke.means[-1])
colnames(bwt.by.smoke.means.t) <- bwt.by.smoke.means[, 1]
bwt.by.ht.means.t <- t(bwt.by.ht.means[-1])
colnames(bwt.by.ht.means.t) <- bwt.by.ht.means[, 1]
bwt.by.ui.means.t <- t(bwt.by.ui.means[-1])
colnames(bwt.by.ui.means.t) <- bwt.by.ui.means[, 1]
bwt.by.ptl.means.t <- t(bwt.by.ptl.means[-1])
colnames(bwt.by.ptl.means.t) <- bwt.by.ptl.means[, 1]
bwt.by.ftv.means.t <- t(bwt.by.ftv.means[-1])
colnames(bwt.by.ftv.means.t) <- bwt.by.ftv.means[, 1]
bwt.by.race.means.t
## 1 2 3
## bwt 3102.719 2719.692 2805.284
# Bar plots
par(mfrow=c(1,6))
par(oma=c(1, 1, 1, 1))
par(mar=c(4, 1, 4, 1))
barplot(bwt.by.race.means.t,
col = "lightgray",
main = "bwt by race",
xlab = "race",
ylab = "mean bwt")
barplot(bwt.by.smoke.means.t,
col = "lightgray",
main = "bwt by smoke",
xlab = "smoke",
ylab = "mean bwt")
barplot(bwt.by.ptl.means.t,
col = "lightgray",
main = "bwt by ptl",
xlab = "ptl",
ylab = "mean bwt")
barplot(bwt.by.ht.means.t,
col = "lightgray",
main = "bwt by ht",
xlab = "ht",
ylab = "mean bwt")
barplot(bwt.by.ui.means.t,
col = "lightgray",
main = "bwt by ui",
xlab = "ui",
ylab = "mean bwt")
barplot(bwt.by.ftv.means.t,
col = "lightgray",
main = "bwt by ftv",
xlab = "ftv",
ylab = "mean bwt")
We will go back to the original birthwt
dataset and subset it for a categorical outcome (i.e., low
. Hence we will drop bwt
.
low
could be dependent on all other variables (i.e., “age”, “lwt”, “race”, “smoke”, “ptl”, “ht”, “ui” and “ftv”).
Univariate plotting is done before. The relationship of the categorical outcome (i.e., low) wrt continuous variables could be explored using boxplots and barplots as shown above.
Here we will focus the relationship between categorical outcome and other categorical variables.
ibwtd <- ibwt[-10]
head(ibwtd)
low | age | lwt | race | smoke | ptl | ht | ui | ftv | |
---|---|---|---|---|---|---|---|---|---|
85 | 0 | 19 | 182 | 2 | 0 | 0 | 0 | 1 | 0 |
86 | 0 | 33 | 155 | 3 | 0 | 0 | 0 | 0 | 3 |
87 | 0 | 20 | 105 | 1 | 1 | 0 | 0 | 0 | 1 |
88 | 0 | 21 | 108 | 1 | 1 | 0 | 0 | 1 | 2 |
89 | 0 | 18 | 107 | 1 | 1 | 0 | 0 | 1 | 0 |
91 | 0 | 21 | 124 | 3 | 0 | 0 | 0 | 0 | 0 |
colnames(ibwtd)
## [1] "low" "age" "lwt" "race" "smoke" "ptl" "ht" "ui" "ftv"
par(mfrow=c(2,3))
par(oma=c(1, 1, 1, 1))
par(mar=c(4, 1, 4, 1))
plot(ibwtd$race, ibwtd$low, xlab="race", ylab="low", main="low by race")
plot(ibwtd$smoke, ibwtd$low, xlab="smoke", ylab="low", main="low by smoke")
plot(ibwtd$ptl, ibwtd$low, xlab="ptl", ylab="low", main="low by ptl")
plot(ibwtd$ht, ibwtd$low, xlab="ht", ylab="low", main="low by ht")
plot(ibwtd$ui, ibwtd$low, xlab="ui", ylab="low", main="low by ui")
plot(ibwtd$ftv, ibwtd$low, xlab="ftv", ylab="low", main="low by ftv")
# 3 levels of race (1, 2, 3), 2 levels of smoke (0, 1) and 2 levels of low(0, 1)
mosaicplot(~ race + smoke + low, data = ibwtd, ylab="smoke", main="low by race and smoke", shade=TRUE)
mosaicplot(~ race + ptl + low, data = ibwtd, ylab="ptl", main="low by race and ptl", shade=TRUE)
mosaicplot(~ race + ht + low, data = ibwtd, ylab="ht", main="low by race and ht", shade = TRUE)
### low by (race and ui) and (race and ftv)
par(mfrow=c(1, 2))
par(oma=c(1, 1, 1, 1))
par(mar=c(4, 1, 4, 1))
mosaicplot(~ race + ui + low, data = ibwtd, ylab="ui", main="low by race and ui", shade = TRUE)
mosaicplot(~ race + ftv + low, data = ibwtd, ylab= "ftv", main="low by race and ftv", shade= TRUE)