Let’s load the data we want to analyze into R. The fictitious dataset we are loading is called “mydata”:
# If your data is a CSV file saved on your local machine, this is how you load it:
mydata <- read.csv("C:\\Users\\username\\Documents\\UMn MPH\\Courses\\3rd Semester\\Biostat II - PUBH6451\\Project Folder\\Raw Data\\mydata.csv")
Suppose we want to transform a numerical variable var1 into a categorical variable that we will name var2, while also giving the categories useful labels. This is how you do it:
library(dplyr) # for %>%, transmute(), and case_when()
mutated_mydata <- mydata %>%
  transmute(var2 = factor(case_when(
    var1 <= 2 ~ "Category 1",
    var1 <= 4 ~ "Category 2",
    var1 <= 6 ~ "Category 3",
    TRUE ~ "Category 4")))
If you want to delete observations with missing (NA) values, this is how you do it for var1:
mydata_new <- mydata %>%
  filter(!is.na(var1))
Using the packages gtsummary and gt, we can create some nice tables for our Table 1 and regression tables! Take a look:
# Table 1
library(gtsummary)
library(gt)
table1 <- mydata %>%
  select(var1, var2) %>%
  tbl_summary(by = var1,
              missing = "no",
              statistic = list(all_continuous() ~ "{mean} ({sd})"),
              type = all_dichotomous() ~ "categorical") %>%
  add_p(test = list(all_continuous() ~ "t.test",          # If data are non-parametric, this can be changed!
                    all_categorical() ~ "chisq.test")) %>% # Same for categorical!
  modify_header(label ~ "**Var1 distribution in Var2**") %>%
  modify_caption("**Table 1. Patient Characteristics**") %>%
  bold_labels() %>%
  as_gt()
# Regression Table
model_table <- model1 %>% # model1 is a fitted model object (see the regression sections below)
  tbl_regression(exponentiate = TRUE) %>% # exponentiate when needed
  as_gt()
It is always good to have some understanding of general table making with gt() as well. In this case we are transforming a generic data frame (here called table.g) into a nice gt table!
table <- table.g %>%
  gt() %>%
  tab_header( # Adding a title
    title = md("Table 1. Generic Title for your table"),
    subtitle = "Subtitle should go here") %>%
  tab_footnote( # Adding a footnote to a specific column
    locations = cells_column_labels(columns = c(specific_column)),
    footnote = "Specific observation about this column") %>%
  fmt_number(columns = c(n), decimals = 0) %>% # Formatting numbers to drop the decimals
  cols_align(
    align = "center",
    columns = c(var1, var2, specific_column)) %>% # Center these columns
  opt_table_outline() %>% # Adding a table outline
  opt_table_lines() %>%   # Adding lines
  opt_row_striping()      # Adding fancy row striping
I love density plots! They are a simple and intuitive way of presenting a numerical variable across multiple categories. This is how you can create a nice-looking one:
library(ggplot2)
ggplot(mydata, aes(x = var1, fill = var2)) +
  geom_density(alpha = 0.8) +
  theme_minimal() +
  scale_fill_manual(values = c("maroon", "gold")) +
  facet_wrap(~pred1) +
  labs(
    title = "Density plot",
    subtitle = "Other information",
    x = "Var1",
    y = "Density"
  )
OK, not really a regression model, but almost! Use this if you have a continuous outcome (var1) and two categorical predictors (pred1 and pred2).
First, build your interaction plot:
# Remember to omit NAs from your dataset first!
na.mydata <- na.omit(mydata)
stats::interaction.plot(
  na.mydata$pred1, na.mydata$pred2,
  na.mydata$var1,
  lty = 1,
  col = c("cyan3", "brown1"),
  main = "Interaction Plot",
  xlab = "Group",
  ylab = "var1 numerical change",
  trace.label = "Categories that go in a line")
After that, you should run the test itself. I have also included the Tukey method to quantify the relationships that show up as significant in the initial analysis:
# Two-Way ANOVA
model1 <- stats::lm(data = mydata, var1 ~ pred1 * pred2,
                    contrasts = list(pred1 = contr.sum, pred2 = contr.sum))
car::Anova(model1, type = "III")
# Tukey
model1.stats <- emmeans::emmeans(model1, pairwise ~ pred1 * pred2) # Tukey-adjusted pairwise comparisons (the default)
model1.stats
Use this when you have a continuous outcome and multiple different predictors (numerical or categorical). An excellent test for sure; just be mindful of the L.I.N.E assumptions (don’t worry, we will go over how to check them!).
This is how you run the MLR itself:
model2<-lm(data=mydata, var1 ~ pred1 + pred2) # "*" instead of "+" creates an interaction term
summary(model2)
Let’s now check some of the L.I.N.E assumptions with data visualization:
# L - Linearity, with a scatter plot
ggplot(mydata, aes(x = pred1, y = var1, shape = pred2, color = pred2)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE, fullrange = TRUE) +
  scale_color_manual(values = c('blue3', 'deeppink')) +
  labs(x = "Pred 1", y = "Var1") +
  theme_classic()
# E - Equal variance / homoscedasticity (no fanning of residuals)
plot(model2, which = 1)
# N - Normality of residuals, with a QQ plot
stats::qqnorm(residuals(model2))
stats::qqline(residuals(model2))
The following snippets are generally useful in most types of regression modeling:
## Correlogram to check for collinearity
mydata_c <- mydata %>%
  select(-factor1, -charc1) %>% # Exclude all non-numerical variables
  na.omit()
corr <- round(stats::cor(mydata_c), 1) # Build the correlation matrix
ggcorrplot::ggcorrplot(corr,
                       type = "lower",
                       lab = TRUE,
                       lab_size = 3,
                       method = "circle",
                       colors = c("pink2", "white", "springgreen3"),
                       title = "Correlogram",
                       ggtheme = theme_bw)
## Variance Inflation Factor testing for multicollinearity
car::vif(model1) # Values below 4 can be trusted
## Akaike Information Criterion (AIC) to compare model fit!
stats::AIC(model1)
stats::AIC(model2) # Lowest AIC means best fit
## Cook's D for influential outliers
plot(model1, 4, id.n =3)
abline(h=1, lty = 2) # If above 0.5 investigate. If above 1 it is influential
Have a binary outcome and multiple predictors? Use this to find the odds ratio (technically you get the log(odds) first, but you will get there).
model6 <- stats::glm(var1 ~ pred1 + pred2, data = mydata, family = binomial(link = "logit"))
summary(model6)
confint(model6)
exp(coef(model6)) # Remember the resulting parameters are on the "log" scale,
                  # exponentiate to get *odds ratios*
exp(confint(model6)) # Same for confidence intervals
# Don't forget to test model fit with Hosmer-Lemeshow!
hl.test <- ResourceSelection::hoslem.test(mydata$var1, fitted(model6))
hl.test
# Remember this is one of the only times in stats where failing to reject the null is actually good!
# A p-value of, say, 0.413 means NO evidence of poor fit (the model fits well)
Got count data? Need to count fish in a pond? Count on Poisson! (Unless the mean is very different from the variance; that violates Poisson, so try a negative binomial!)
# Mind you, here I am normalizing by an offset!
# The offset should be the log of your exposure variable (e.g., person-time), NOT the outcome;
# "persontime" below is a placeholder for whatever that variable is called in your data
model8 <- glm(var1 ~ pred1 + pred2, data = mydata, offset = log(persontime), family = poisson(link = "log"))
exp(coef(model8)) # To get Rate Ratios, exponentiate!
exp(confint(model8))
# Model fit testing
pchisq(model8$deviance, model8$df.residual, lower.tail = FALSE)
# If the p-value is close to 0, this suggests that the model has overdispersion
# (and isn't a good fit to the data).
# With overdispersion, let's try a negative binomial
model1nb <- MASS::glm.nb(var1 ~ pred1 + pred2, data = mydata)
summary(model1nb)
So you got a binary outcome, but logistic regression was not a good fit (overdispersion? Incidence of the outcome above 10%?). Use log-binomial!
model9 <- glm(var1 ~ pred1 + pred2, data=mydata, family=binomial(link="log"))
summary(model9)
exp(coef(model9)) # Exponentiate to find the Relative Risk
Now your outcome is ordinal and categorical? This is your best choice!
model10 <- MASS::polr(var1 ~ pred1 + pred2, data = mydata, Hess = TRUE)
summary(model10)
exp(-coef(model10))
exp(-confint(model10)) # Exponentiate to find the Common Odds Ratio
# When using Ordinal Logistic Regression, check the proportional odds assumption.
# It basically means that the effect of each predictor is constant across every split between outcome levels:
# going from 1 to 2 is the same as going from 4 to 5, and so on.
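If you want a formal check rather than just reasoning about it, one option is the Brant test, which works directly on a polr fit. A minimal sketch, assuming the brant package is installed:
library(brant) # assumption: the brant package is available
brant(model10) # Small p-values suggest the proportional odds assumption is violated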
Remember when I told you this report would be in chronological order? Well, I am breaking that rule… We learned model selection at a later point in the course, but it makes more sense for it to be here at the end of the regression section. So ready, set, go!
As the name implies, we are selecting the model that best fits the data, i.e., the model with the lowest AIC. To put it in other terms, it answers the question: which predictors should be in my model to best explain the outcome I care about?
Let’s see two ways R can do this for us.
# First you select the floor and ceiling of your analysis, which correspond to
# the models with the minimum and maximum numbers of predictors you want from your data
ceiling <- glm(data = mydata, var1 ~ pred1 + pred2) # We want to consider pred1 and pred2
                                                    # as possible variables in our final model
floor <- glm(data = mydata, var1 ~ 1) # In this case the floor has no predictors of interest
# model.forward starts from the floor and adds predictors until it gets to the ceiling
model.forward <- step(floor, direction = "forward",
                      scope = list(
                        upper = ceiling,
                        lower = floor))
# model.backward starts from the ceiling and removes predictors until it gets to the floor
model.backward <- step(ceiling,
                       direction = "backward",
                       scope = list(upper = ceiling,
                                    lower = floor))
# Both of them will select the model that has the lowest AIC!
To my knowledge, this works with every regression model in R!
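For example, here is a sketch of the same forward-selection pattern with a logistic model (still using the hypothetical mydata, with var1 now assumed to be binary):
# Sketch: forward selection for a logistic regression; step() again picks the lowest AIC
ceiling.logit <- glm(var1 ~ pred1 + pred2, data = mydata, family = binomial)
floor.logit <- glm(var1 ~ 1, data = mydata, family = binomial)
step(floor.logit, direction = "forward",
     scope = list(upper = ceiling.logit, lower = floor.logit))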
What if you are studying a phenomenon that involves time? For example, how long does it take for a Parkinson’s patient to undergo Deep Brain Stimulation surgery? Or how long does it take for a patient to be weaned off mechanical ventilation? These techniques will show you how to deal with those situations!
Nothing better than to start with a visualization! The Kaplan-Meier curve is the go-to graph when dealing with time-to-event data.
First, calculate the estimated survival using the Kaplan-Meier method:
# Let's consider var1 a time-to-event variable and var2 a categorical variable by which
# we can group the time-to-event data
library(survival)
kmsurv <- survival::survfit(Surv(var1) ~ var2, data = mydata)
# If you only want the overall survival (not by group), replace var2 with "1"
Now you plot the Kaplan-Meier curve using the results from the estimated survival (kmsurv):
# Load both ggplot2 and ggfortify to make it work:
library(ggplot2)
library(ggfortify)
autoplot(kmsurv) +
  xlab("Time frame") +
  ylab("Percentage of patients with/without event") +
  labs(title = "Time to event - Kaplan-Meier Curve") +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5))
If you see a “+” in your curve, it indicates censored data.
Notice that the shaded areas are 95% confidence intervals.
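If your dataset also has a censoring indicator, pass it as the second argument to Surv() so R knows which observations are events and which are censored. A minimal sketch, assuming a hypothetical 0/1 indicator named event1 in mydata:
# Sketch: event1 is a hypothetical variable (1 = event happened, 0 = censored)
kmsurv <- survival::survfit(Surv(var1, event1) ~ var2, data = mydata)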
The thing about the Kaplan-Meier curve is that it is only a data visualization piece; it does not give you inferential information. Let’s learn some methods that will allow us to test hypotheses.
If you are comparing two groups in time-to-event data, you have these two options to choose from.
The Log-rank test weights all time points of the studied period equally with respect to the risk of the event.
The Gehan-Breslow-Wilcoxon test gives more weight to events that happen during the initial period of observation.
A good example of a use for the Log-rank test would be weaning off mechanical ventilation. Overall, the chances of a patient successfully returning to spontaneous respiration are roughly constant over time. If we are comparing the effect of two different sedatives on time to this event, we should use the Log-rank test.
Now, if we are studying deaths due to trauma, the Gehan-Breslow-Wilcoxon method would be a better fit. This is because we know patients are at a higher likelihood of dying in the first few hours after a trauma, and this risk then gradually decreases. As such, when studying, for example, the best fluids to give a trauma patient, the Gehan-Breslow-Wilcoxon method would be appropriate.
Their code in R is super easy, take a look:
# Log-rank test
survival::survdiff(Surv(var1) ~ var2, data = mydata, rho = 0)
# Gehan-Breslow-Wilcoxon test
survival::survdiff(Surv(var1) ~ var2, data = mydata, rho = 1)
Notice that the only difference between the two codes is the "rho" argument!
What if you want a regression model for time-to-event data? Meaning, what if you want to consider multiple covariates when comparing groups that have a time-to-event outcome? That is when you use Proportional Hazards Regression (commonly referred to as Cox regression).
This is the general form:
mod2 <- survival::coxph(Surv(var1) ~ var2 + pred1 + pred2, data = mydata)
summary(mod2)
mod2_table <- mod2 %>%
  tbl_regression(exponentiate = TRUE) %>%
  as_gt()
Our final result will contain a "subgroup"-style analysis of pred1 and pred2: we will be able to infer a reduction or increase in harm associated with these specific covariates.
It will also control for these predictors when analyzing the main comparison.
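To pull the adjusted hazard ratios and their 95% confidence intervals directly from the fitted model above:
exp(coef(mod2)) # Adjusted hazard ratios for var2, pred1, and pred2
exp(confint(mod2)) # 95% confidence intervals on the hazard ratio scale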
Two important topics that were presented and should be refreshed are power analysis and sampling techniques.
Power analysis is intimately intertwined with sample size determination. We discussed power analysis in two scenarios: comparisons of means and comparisons of proportions. Both require that you know three out of the four characteristics of a study to make general determinations about the power/sample size of a study. These are:
Sample Size - Significance Level - Power - Minimum Clinically Important Difference (MCID)
Be mindful that the MCID enters the formulas as the effect size statistic!
For comparison of means, we use Cohen’s d to determine the MCID: Cohen’s d = |µ1 - µ2| / σ
A small effect corresponds to roughly d = 0.20, a medium effect to d = 0.50, and a large effect to d = 0.80.
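As a quick hypothetical worked example (made-up numbers, just to show the arithmetic):
# Hypothetical group means of 10 and 12 with a common SD of 8
abs(10 - 12) / 8 # Cohen's d = 0.25, a small-to-medium effect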
For comparison of proportions (p1 vs. p2), we use Cohen’s h:
Cohen’s h = 2*asin(sqrt(p1)) - 2*asin(sqrt(p2))
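The pwr package’s ES.h() helper computes exactly this quantity (you will see it again below), so a quick sanity check looks like this:
library(pwr)
ES.h(p1 = 0.60, p2 = 0.66) # Cohen's h via pwr
2 * asin(sqrt(0.60)) - 2 * asin(sqrt(0.66)) # The same value computed by hand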
If we are doing power/sample size calculations for a comparison of means (a t-test), this is the code we use.
# If you want to determine sample size:
library(pwr)
twogrpmean.n <- pwr.t.test(d = 0.20, sig.level = 0.05, power = 0.90, type = "two.sample")
twogrpmean.n
If you are feeling frisky (you always should), it is a good idea to draw the power curve at different MCIDs to give the principal investigators/IRB an idea of how the sample size and the difference between the groups can affect the design of the study.
Here is how you do it:
candidate.n <- seq(from = 50, to = 1000, by = 100)
plot2 <- pwr.t.test(d = 0.2, sig.level = 0.05, n = candidate.n)
plot4 <- pwr.t.test(d = 0.4, sig.level = 0.05, n = candidate.n)
n<-plot2$n
p2<- plot2$power
p4<- plot4$power
plot(n, p2, type = "b", xlab = "Sample Size", ylab = "Power", ylim=c(0,1))
points(n, p4, type = "b", col=2, lty=2, pch = 2)
legend("bottomright", c("d=0.2", "d=0.40"), lty = c(1,2),
pch = c(1,2), col = c(1, 2), cex=1.5)
Now if you want a table showing you similar information, here is the code:
pwr2 <- pwr.t.test(d = 0.2, sig.level = 0.05, power = 0.90)
pwr4 <- pwr.t.test(d = 0.4, sig.level = 0.05, power = 0.90)
table6 <- data.frame("Power" = c("90%", "90%"), `Cohen's D` = c("0.2", "0.4"), `Sample Size` = c(pwr2$n, pwr4$n))
print(table6)
Comparing proportions is similar; of course the formulas are quite different, but the applied concepts are the same. Take a look at sample size calculation, graphing, and table construction:
Sample Size:
pwr6 <- pwr.2p.test(h = ES.h(p1 = 0.60, p2 = 0.66), sig.level = 0.05, power = 0.90)
# Notice here that you don't calculate Cohen's h yourself; you input the proportions and let the PC do the work!
Graphing
candidate.n <- seq(from = 400, to = 1800, by = 200)
plot6 <- pwr.2p.test(h = ES.h(p1 = 0.60, p2 = 0.66), sig.level = 0.05, n = candidate.n)
plot10 <- pwr.2p.test(h = ES.h(p1 = 0.60, p2 = 0.70), sig.level = 0.05, n = candidate.n)
n<-plot6$n
p6<- plot6$power
p10<- plot10$power
plot(n, p6, type = "b", xlab = "Sample Size", ylab = "Power", ylim=c(0,1))
points(n, p10, type = "b", col=2, lty=2, pch = 2)
legend("bottomright", c("p2=0.66", "p2=0.70"), lty = c(1,2),
pch = c(1,2), col = c(1, 2), cex=1.5)
Power Table
pwr6 <- pwr.2p.test(h = ES.h(p1 = 0.60, p2 = 0.66), sig.level = 0.05, power = 0.90)
pwr7 <- pwr.2p.test(h = ES.h(p1 = 0.60, p2 = 0.67), sig.level = 0.05, power = 0.90)
pwr8 <- pwr.2p.test(h = ES.h(p1 = 0.60, p2 = 0.68), sig.level = 0.05, power = 0.90)
pwr9 <- pwr.2p.test(h = ES.h(p1 = 0.60, p2 = 0.69), sig.level = 0.05, power = 0.90)
pwr10 <- pwr.2p.test(h = ES.h(p1 = 0.60, p2 = 0.70), sig.level = 0.05, power = 0.90)
table7 <- data.frame("Baseline-Expected" = c("60% - 66%", "60% - 67%", "60% - 68%", "60% - 69%", "60% - 70%"), MCID = c("6%", "7%", "8%", "9%", "10%"), n = c(pwr6$n, pwr7$n, pwr8$n, pwr9$n, pwr10$n))
print(table7)
Weighted calculations
When working with complex sample designs, you might be asked to compute statistics for weighted variables. Here is how you do it for totals, means, and proportions:
library(survey)
# First define the sample design
sample.design <- svydesign(
  data = mydata,
  ids = ~var1,    # This defines the sampling unit; var1 here could be a patient id, a house, a hospital...
  weights = ~var2 # var2 in this case is the variable that holds the sampling weights
)
# Now let's calculate statistics:
weighted.total.pred1 <- svytotal(~pred1, sample.design) # Weighted total of pred1
weighted.mean.pred1 <- svymean(~pred1, sample.design)   # Weighted mean of pred1
confint(weighted.mean.pred1) # Confidence interval
# Dealing with frequencies
wt.freq.pred2 <- svytotal(~factor(pred2), sample.design)      # Weighted counts for each level of pred2
weighted.mean.pred2 <- svymean(~factor(pred2), sample.design) # Weighted PROPORTION of each level of pred2
svyciprop(~I(pred2 == "01"), sample.design) # If proportions are close to 0 or 1, try this!
What a ride! Hope you have learned enough for the real world!