birthwt
is from the MASS
librarycsv
file and import as a tibble
lm
, loess
and spline
)loess
, lm
and splines
)loess
, lm
and splines
)library(tidyverse)
library(pacman)
library(MASS)
library(psych)
p_load(grid, gridExtra, robustbase)
birthwt
is from the MASS
librarystr(birthwt)
## 'data.frame': 189 obs. of 10 variables:
## $ low : int 0 0 0 0 0 0 0 0 0 0 ...
## $ age : int 19 33 20 21 18 21 22 17 29 26 ...
## $ lwt : int 182 155 105 108 107 124 118 103 123 113 ...
## $ race : int 2 3 1 1 1 3 1 3 1 1 ...
## $ smoke: int 0 0 1 1 1 0 0 0 1 1 ...
## $ ptl : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ht : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ui : int 1 0 0 1 1 0 0 0 0 0 ...
## $ ftv : int 0 3 1 2 0 0 1 1 1 0 ...
## $ bwt : int 2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
# | variable name | variable label | coded levels |
---|---|---|---|
1 | low | indicator of birth weight less than 2.5 kg | 0, 1 |
2 | age | mother’s age in years | continuous variable |
3 | lwt | mother’s weight in pounds at last menstrual period | continuous variable |
4 | race | mother’s race (1 = white, 2 = black, 3 = other) | 1, 2, 3 |
5 | smoke | smoking status during pregnancy | 0, 1 |
6 | ptl | number of previous premature labours | 0, 1, 2, 3 |
7 | ht | history of hypertension | 0, 1 |
8 | ui | presence of uterine irritability | 0, 1 |
9 | ftv | number of physician visits during the first trimester | 0, 1, 2, 3, 4, 6 |
10 | bwt | birth weight in grams | continuous variable |
To study the interrelationships of variables with respect to a continuous outcome variable bwt
.
csv
file and import as a tibble
write_csv(birthwt, "birthwt.csv")
tbwt <- read_csv("datalib/birthwt.csv")
str(tbwt)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 189 obs. of 10 variables:
## $ low : num 0 0 0 0 0 0 0 0 0 0 ...
## $ age : num 19 33 20 21 18 21 22 17 29 26 ...
## $ lwt : num 182 155 105 108 107 124 118 103 123 113 ...
## $ race : num 2 3 1 1 1 3 1 3 1 1 ...
## $ smoke: num 0 0 1 1 1 0 0 0 1 1 ...
## $ ptl : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ht : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ui : num 1 0 0 1 1 0 0 0 0 0 ...
## $ ftv : num 0 3 1 2 0 0 1 1 1 0 ...
## $ bwt : num 2523 2551 2557 2594 2600 ...
## - attr(*, "spec")=
## .. cols(
## .. low = col_double(),
## .. age = col_double(),
## .. lwt = col_double(),
## .. race = col_double(),
## .. smoke = col_double(),
## .. ptl = col_double(),
## .. ht = col_double(),
## .. ui = col_double(),
## .. ftv = col_double(),
## .. bwt = col_double()
## .. )
tbwt <- tbwt[-1]
str(tbwt)
## Classes 'tbl_df', 'tbl' and 'data.frame': 189 obs. of 9 variables:
## $ age : num 19 33 20 21 18 21 22 17 29 26 ...
## $ lwt : num 182 155 105 108 107 124 118 103 123 113 ...
## $ race : num 2 3 1 1 1 3 1 3 1 1 ...
## $ smoke: num 0 0 1 1 1 0 0 0 1 1 ...
## $ ptl : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ht : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ui : num 1 0 0 1 1 0 0 0 0 0 ...
## $ ftv : num 0 3 1 2 0 0 1 1 1 0 ...
## $ bwt : num 2523 2551 2557 2594 2600 ...
tbwt <- tbwt %>% mutate(
race=as_factor(race),
smoke=as_factor(smoke),
ptl=as_factor(ptl),
ht=as_factor(ht),
ui=as_factor(ui),
ftv=as_factor(ftv))
glimpse(tbwt)
## Observations: 189
## Variables: 9
## $ age <dbl> 19, 33, 20, 21, 18, 21, 22, 17, 29, 26, 19, 19, 22, 30, ...
## $ lwt <dbl> 182, 155, 105, 108, 107, 124, 118, 103, 123, 113, 95, 15...
## $ race <fct> 2, 3, 1, 1, 1, 3, 1, 3, 1, 1, 3, 3, 3, 3, 1, 1, 2, 1, 3,...
## $ smoke <fct> 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,...
## $ ptl <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,...
## $ ht <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ ui <fct> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,...
## $ ftv <fct> 0, 3, 1, 2, 0, 0, 1, 1, 1, 0, 0, 1, 0, 2, 0, 0, 0, 3, 0,...
## $ bwt <dbl> 2523, 2551, 2557, 2594, 2600, 2622, 2637, 2637, 2663, 26...
Summary statistics could be generated for the continuous variables.
t(describe(tbwt[c(1,2,9)])[, -1])
## age lwt bwt
## n 189.0000000 189.000000 189.0000000
## mean 23.2380952 129.814815 2944.5873016
## sd 5.2986779 30.579380 729.2142952
## median 23.0000000 121.000000 2977.0000000
## trimmed 22.8954248 126.071895 2961.7581699
## mad 5.9304000 20.756400 834.7038000
## min 14.0000000 80.000000 709.0000000
## max 45.0000000 250.000000 4990.0000000
## range 31.0000000 170.000000 4281.0000000
## skew 0.7107606 1.379831 -0.2053370
## kurtosis 0.5307806 2.253148 -0.1436834
## se 0.3854221 2.224323 53.0425350
grid.arrange(
ggplot(data=tbwt, mapping=aes(x=age)) +
geom_histogram(bins = 30) +
coord_cartesian(ylim = c(0, 20)),
ggplot(data=tbwt, mapping=aes(x=lwt)) +
geom_histogram(bins = 30) +
coord_cartesian(ylim = c(0, 20)),
ggplot(data=tbwt, mapping=aes(x=bwt)) +
geom_histogram(bins = 30) +
coord_cartesian(ylim = c(0, 20)),
nrow=1,
top = "Histograms: Univariates")
grid.arrange(
ggplot(data=tbwt) +
geom_density(mapping=aes(x=age), color="blue"),
ggplot(data=tbwt) +
geom_density(mapping=aes(x=lwt), color="green"),
ggplot(data=tbwt) +
geom_density(mapping=aes(x=bwt), color="red"),
nrow=1,
top="Density plots: Univariates")
grid.arrange(
ggplot(data.frame(age=tbwt$age), aes(age)) +
stat_function(fun = dnorm, colour = "blue", geom = "point", args = list(mean = 23.24, sd = 5.3)),
ggplot(data.frame(lwt=tbwt$lwt), aes(lwt)) +
stat_function(fun = dnorm, colour = "green", geom = "point", args = list(mean = 129.81, sd = 30.58)),
ggplot(data.frame(bwt=tbwt$bwt), aes(bwt)) +
stat_function(fun = dnorm, colour = "red", geom = "point", args = list(mean = 2944.59, sd = 729.21)),
nrow=1,
top="Normal density plots: Univariates")
grid.arrange(
ggplot(data=tbwt, mapping=aes(y=age, x=1)) +
geom_boxplot(fill = "white", colour = "#3366FF", outlier.colour = "red") +
geom_text(aes(label = ifelse(age %in%
boxplot.stats(age)$out,
as.character(age), "")), hjust = 1.5),
ggplot(data=tbwt, mapping=aes(y=lwt, x=1)) +
geom_boxplot(fill = "white", colour = "#3366FF", outlier.colour = "red") +
geom_text(aes(label = ifelse(lwt %in%
boxplot.stats(lwt)$out,
as.character(lwt), "")), hjust = 1.5),
ggplot(data=tbwt, mapping=aes(y=bwt, x=1)) +
geom_boxplot(fill = "white", colour = "#3366FF", outlier.colour = "red") +
geom_text(aes(label = ifelse(bwt %in%
boxplot.stats(bwt)$out,
as.character(bwt), "")), hjust = 1.5),
nrow=1,
top="Box plots: Univariates")
Remember categorical variables have various levels.
grid.arrange(
ggplot(data = tbwt) +
geom_bar(mapping = aes(x = race, fill=race)) +
coord_flip(),
ggplot(data = tbwt) +
geom_bar(mapping = aes(x = smoke, fill=smoke)) +
coord_flip(),
ggplot(data = tbwt) +
geom_bar(mapping = aes(x = ptl, fill=ptl)) +
coord_flip(),
ggplot(data = tbwt) +
geom_bar(mapping = aes(x = ht, fill=ht)) +
coord_flip(),
ggplot(data = tbwt) +
geom_bar(mapping = aes(x = ui, fill=ui)) +
coord_flip(),
ggplot(data = tbwt) +
geom_bar(mapping = aes(x = ftv, fill=ftv)) +
coord_flip(),
nrow=2,
top="Barplot plots: Univariates")
grid.arrange(
ggplot(data = tbwt) +
geom_bar(mapping = aes(x = race, y = ..prop.., group=1)) +
coord_flip(),
ggplot(data = tbwt) +
geom_bar(mapping = aes(x = smoke, y = ..prop.., group=1)) +
coord_flip(),
ggplot(data = tbwt) +
geom_bar(mapping = aes(x = ptl, y = ..prop.., group=1)) +
coord_flip(),
ggplot(data = tbwt) +
geom_bar(mapping = aes(x = ht, y = ..prop.., group=1)) +
coord_flip(),
ggplot(data = tbwt) +
geom_bar(mapping = aes(x = ui, y = ..prop.., group=1)) +
coord_flip(),
ggplot(data = tbwt) +
geom_bar(mapping = aes(x = ftv, y = ..prop.., group=1)) +
coord_flip(),
nrow=2,
top="Barplot plots: Univariates")
grid.arrange(
ggplot(data=tbwt, mapping=aes(x=age, y=lwt), position = "jitter" ) +
geom_point() +
stat_ellipse(level = .99, color = "red") +
geom_text(aes(label = ifelse((age>35.4 | lwt> 190.1), as.character(lwt), "")), hjust = 1.5),
ggplot(data=tbwt, mapping=aes(x=age, y=bwt), position = "jitter" ) +
geom_point() +
stat_ellipse(level = .99, color = "red") +
geom_text(aes(label = ifelse((age>36 | bwt<1200), as.character(bwt), "")), hjust = 1.5),
ggplot(data=tbwt, mapping=aes(x=lwt, y=bwt), position = "jitter" ) +
geom_point() +
stat_ellipse(level = .99, color = "red") +
geom_text(aes(label = ifelse((lwt>190 | bwt<1200 | bwt> 4900), as.character(bwt), "")), hjust = 1.5),
nrow=1,
top="Scatter plots: Bivariates")
loess
, lm
and splines
)grid.arrange(
ggplot(data=tbwt, mapping=aes(x=age, y=bwt)) +
geom_point() +
geom_smooth(),
ggplot(data=tbwt, mapping=aes(x=age, y=bwt)) +
geom_point() +
geom_smooth(method = lm, se = TRUE),
ggplot(data=tbwt, mapping=aes(x=age, y=bwt)) +
geom_point() +
geom_smooth(method = lm, formula = y ~ splines::bs(x, 3), se = TRUE),
nrow=1,
top="Scatter plots with smoothed conditional means: method=loess, lm and splines ")
Two continuous variables are analyzed wrt the levels of another categorical variable.
grid.arrange(
ggplot(data = tbwt) +
geom_point(mapping = aes(x = age, y = bwt, color=race)),
ggplot(data=tbwt) +
geom_point(mapping=aes(x=age, y=bwt, color=race)) +
geom_hline(yintercept=mean(tbwt$bwt), color="dark grey") +
annotate("text", label="Mean bwt", x=40, y=mean(tbwt$bwt)-100) +
geom_vline(xintercept=mean(tbwt$age), color="dark grey") +
annotate("text", label="Mean age", x=mean(tbwt$age)+3, y=5000) +
scale_color_discrete(name="race") +
scale_x_continuous(name="Age") +
scale_y_continuous(name="bwt"),
nrow=1,
top="Scatter plots wrt a categorical variable")
lm
, loess
and spline
)grid.arrange(
ggplot(data=tbwt, mapping=aes(x=age, y=bwt)) +
geom_point(mapping=aes(color=race)) +
geom_smooth(),
ggplot(data=tbwt, mapping=aes(x=age, y=bwt)) +
geom_point(mapping=aes(color=race)) +
geom_smooth(method = "lm"),
ggplot(data=tbwt, mapping=aes(x=age, y=bwt)) +
geom_point(mapping=aes(color=race)) +
geom_smooth(method="lm", formula = y ~ splines::bs(x, 3)),
nrow=1,
top="Scatter plots in relation to a categorical variable: method=loess, lm and splines")
loess
, lm
and splines
)grid.arrange(
ggplot(data=tbwt, mapping=aes(x=age, y=bwt, color=race)) +
geom_point() +
geom_smooth(),
ggplot(data = tbwt, mapping = aes(x = age, y = bwt, color=race)) +
geom_point() +
geom_smooth(method="lm"),
ggplot(data=tbwt, mapping=aes(x=age, y=bwt, color=race)) +
geom_point() +
geom_smooth(method="lm", formula = y ~ splines::bs(x, 3)),
nrow=1,
top="Scatter plots for the levels of a categorical variable: method=loess, lm and splines")
loess
, lm
and splines
)grid.arrange(
ggplot(data=tbwt, mapping=aes(x=age, y=bwt)) +
geom_point(mapping=aes(color = race)) +
geom_smooth(data=filter(tbwt, race=="3")),
ggplot(data=tbwt, mapping=aes(x=age, y=bwt)) +
geom_point(mapping=aes(color = race)) +
geom_smooth(data=filter(tbwt, race=="3"), method = "lm"),
ggplot(data=tbwt, mapping=aes(x=age, y=bwt)) +
geom_point(mapping=aes(color = race)) +
geom_smooth(data=filter(tbwt, race=="3"), method = "lm", formula = y ~ splines::bs(x, 3)),
nrow=1,
top="Scatter plots for a particular level of a categorical variable: method=loess, lm and splines")
ggplot(data = tbwt) +
geom_point(mapping = aes(x = age, y = bwt, color=race)) +
facet_wrap(~ race, ncol = 3)
ggplot(tbwt, aes(age, bwt, color=race)) +
geom_smooth() +
geom_point() +
theme(text = element_text(size = 14)) +
facet_wrap(~race, ncol = 3)
ggplot(tbwt, aes(age, bwt, color=race)) +
geom_smooth(method="lm") +
geom_point() +
theme(text = element_text(size = 14)) +
facet_wrap(~race, ncol = 3)
ggplot(tbwt, aes(age, bwt, color=race)) +
geom_smooth(method="lm", formula = y ~ splines::bs(x, 3)) +
geom_point() +
theme(text = element_text(size = 14)) +
facet_wrap(~race, ncol = 3)
ggplot(data=tbwt) +
geom_point(mapping=aes(x=race, y=bwt))
ggplot(data=tbwt) +
geom_jitter(mapping=aes(x=race, y=bwt))
grid.arrange(
ggplot(data=tbwt, mapping=aes(x=race, y=bwt)) +
geom_boxplot(fill = "white", colour = "#3366FF", outlier.colour = "red"),
ggplot(data=tbwt, mapping=aes(x=smoke, y=bwt)) +
geom_boxplot(fill = "white", colour = "#3366FF", outlier.colour = "red"),
ggplot(data=tbwt, mapping=aes(x=ptl, y=bwt)) +
geom_boxplot(fill = "white", colour = "#3366FF", outlier.colour = "red"),
ggplot(data=tbwt, mapping=aes(x=ht, y=bwt)) +
geom_boxplot(fill = "white", colour = "#3366FF", outlier.colour = "red"),
ggplot(data=tbwt, mapping=aes(x=ui, y=bwt)) +
geom_boxplot(fill = "white", colour = "#3366FF", outlier.colour = "red"),
ggplot(data=tbwt, mapping=aes(x=ftv, y=bwt)) +
geom_boxplot(fill = "white", colour = "#3366FF", outlier.colour = "red"),
nrow=2,
top="Box plots for the levels of a categorical variable")
ggplot(data=tbwt, mapping=aes(x=race, y=bwt)) +
geom_boxplot(aes(colour = smoke)) +
coord_flip()
ggplot(data=tbwt, mapping=aes(x=race, y=bwt)) +
geom_boxplot(aes(colour = ptl)) +
coord_flip()
ggplot(data=tbwt, mapping=aes(x=race, y=bwt)) +
geom_boxplot(aes(colour = ht)) +
coord_flip()
ggplot(data=tbwt, mapping=aes(x=race, y=bwt)) +
geom_boxplot(aes(colour = ui)) +
coord_flip()
ggplot(data=tbwt, mapping=aes(x=race, y=bwt)) +
geom_boxplot(aes(colour = ftv)) +
coord_flip()
ggplot(data=tbwt, mapping=aes(x=reorder(race, bwt, FUN = median), y=bwt) ) +
geom_boxplot(fill = "white", colour = "#3366FF", outlier.colour = "red")
ggplot(data=tbwt, mapping=aes(x=reorder(ftv, bwt, FUN = median), y=bwt) ) +
geom_boxplot(fill = "white", colour = "#3366FF", outlier.colour = "red")
Use geom_col not geom_bar.
grid.arrange(
tbwt %>%
group_by(race) %>%
summarize(average_bwt=mean(bwt)) %>%
ggplot(mapping=aes(x=race, y=average_bwt)) +
geom_col(aes(fill=race)) +
coord_flip(),
tbwt %>%
group_by(smoke) %>%
summarize(average_bwt=mean(bwt)) %>%
ggplot(mapping=aes(x=smoke, y=average_bwt)) +
geom_col(aes(fill=smoke)) +
coord_flip(),
tbwt %>%
group_by(ptl) %>%
summarize(average_bwt=mean(bwt)) %>%
ggplot(mapping=aes(x=ptl, y=average_bwt)) +
geom_col(aes(fill=ptl)) +
coord_flip(),
tbwt %>%
group_by(ht) %>%
summarize(average_bwt=mean(bwt)) %>%
ggplot(mapping=aes(x=ht, y=average_bwt)) +
geom_col(aes(fill=ht)) +
coord_flip(),
tbwt %>%
group_by(ui) %>%
summarize(average_bwt=mean(bwt)) %>%
ggplot(mapping=aes(x=ui, y=average_bwt)) +
geom_col(aes(fill=ui)) +
coord_flip(),
tbwt %>%
group_by(ftv) %>%
summarize(average_bwt=mean(bwt)) %>%
ggplot(mapping=aes(x=ftv, y=average_bwt)) +
geom_col(aes(fill=ftv)) +
coord_flip(),
nrow=3,
top="Barplots for the continuous outcome variable in relation to various categorical variables")
ggplot(data = tbwt, mapping = aes(x = bwt)) +
geom_freqpoly(mapping = aes(colour = race), binwidth = 500)
ggplot(data = tbwt, mapping = aes(x = bwt, y=..density..)) +
geom_freqpoly(mapping = aes(colour = race), binwidth = 500)
mcd = covMcd(tbwt[c(1, 2, 9)])
mcd
## Minimum Covariance Determinant (MCD) estimator approximation.
## Method: Fast MCD(alpha=0.5 ==> h=96); nsamp = 500; (n,k)mini = (300,5)
## Call:
## covMcd(x = tbwt[c(1, 2, 9)])
## Log(Det.): 19.65
##
## Robust Estimate of Location:
## age lwt bwt
## 22.82 119.32 2922.31
## Robust Estimate of Covariance:
## age lwt bwt
## age 33.00 20.64 184.5
## lwt 20.64 411.01 4577.6
## bwt 184.51 4577.64 688285.4
plot(mcd,
which = "dd",
labels.id = as.character(tbwt$race))
plot(mcd,
which = "qqchi2",
labels.id = as.character(tbwt$race))