This is the exercise from the “Advanced R” tutorial.

Setup

The code below sets up the R environment by loading the tidyverse package.

options(scipen=999, digits = 5, width = 88)
options(qwraps2_markup = "markdown")

options(stringAsFactors = F)
options(dplyr.print_max = 10)
options(dplyr.pring_min = 5)

knitr::opts_chunk$set(cache   = FALSE,
               echo    = TRUE,
               results = "hide",
               message = FALSE,
               warning = FALSE) 

options(repos = c(CRAN = "https://cran.rstudio.com"))

library(tidyverse)  
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages -----------------------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats

Import

The code below imports the data using readr::read_csv. The csv stands for “Comma Seperate Values”.

data <- read_csv("https://raw.githubusercontent.com/vtmacox/teachr/master/data.csv", col_names = TRUE)

Data Cleaning

The code below assigns the med variable as a factor and provides labels. It also recodes values of 9 as missing for the anxiety variable.

data$med <- factor(data$med,
                   levels = c(0,1),
                   labels = c("Not on medication", "On Medication"))

data$chol <- as.numeric(data$chol)

data$anxiety <- if_else(data$anxiety == 9, NA_integer_, data$anxiety)

# Or you could do it this way
# data$anxiety <- ifelse(data$anxiety == 9, NA, data$anxiety)

Descriptive Stats

The code below shows two methods of creating descriptive stats. The first uses psych::describe. This prints out useful infromation such as mean, median, range, min, max, skew, and kurtosis. However, if you have factor variables, it does not output frequences. The second method uses functions from the qwraps2 package which creates tables that can be displayed in markdown documents.

psych::describe(data)
##         vars   n   mean     sd median trimmed    mad min   max range skew kurtosis   se
## id         1 400 200.50 115.61  200.5  200.50 148.26   1 400.0 399.0 0.00    -1.21 5.78
## exer       2 379   0.79   0.66    0.6    0.71   0.59   0   3.7   3.7 1.32     2.19 0.03
## fv         3 378   2.60   1.01    3.0    2.58   1.48   1   6.0   5.0 0.31    -0.26 0.05
## age        4 370  30.02   6.06   29.0   29.47   5.93  22  58.0  36.0 0.87     0.89 0.31
## anxiety    5 383   3.52   1.11    3.0    3.47   1.48   2   7.0   5.0 0.41    -0.29 0.06
## chol       6 400 204.53  22.63  199.0  201.73  22.24 176 282.0 106.0 1.00     0.53 1.13
## med*       7 400   1.02   0.16    1.0    1.00   0.00   1   2.0   1.0 6.06    34.84 0.01
tab1 <- 
  with(data,
       list( "Hours of Exercise" = qwraps2::tab_summary(exer),
             "Servings of Fruits and Vegetables" = qwraps2::tab_summary(fv),
             "Age" = qwraps2::tab_summary(age),
             "Anxiety" = qwraps2::tab_summary(anxiety),
             "Cholesterol Level" = qwraps2::tab_summary(chol),
             "On Cholesterol Medication" = qwraps2::tab_summary(med))
)

Display descriptive stats

outtab1 <- qwraps2::summary_table(data, tab1)
outtab1
data (N = 400)
Hours of Exercise   
   min 0
   median (IQR) 379; 0.60 (0.30, 1.10)
   mean (sd) 379; 0.79 ± 0.66
   max 3.7
   Unknown 21 (5)
Servings of Fruits and Vegetables   
   min 1
   median (IQR) 378; 3.00 (2.00, 3.00)
   mean (sd) 378; 2.60 ± 1.01
   max 6
   Unknown 22 (6)
Age   
   min 22
   median (IQR) 370; 29.00 (25.00, 34.00)
   mean (sd) 370; 30.02 ± 6.06
   max 58
   Unknown 30 (8)
Anxiety   
   min 2
   median (IQR) 383; 3 (3.00, 4.00)
   mean (sd) 383; 3.52 ± 1.11
   max 7
   Unknown 17 (4)
Cholesterol Level   
   min 176
   median (IQR) 199.00 (187.00, 218.00)
   mean (sd) 204.53 ± 22.63
   max 282
On Cholesterol Medication   
   Not on medication 390 (98)
   On Medication 10 (2)

Second dispaly option

Dispaly description stats using the tableone:: package.

tableone::CreateTableOne(data = data)
##                          
##                           Overall        
##   n                          400         
##   id (mean (sd))          200.50 (115.61)
##   exer (mean (sd))          0.79 (0.66)  
##   fv (mean (sd))            2.60 (1.01)  
##   age (mean (sd))          30.02 (6.06)  
##   anxiety (mean (sd))       3.52 (1.11)  
##   chol (mean (sd))        204.53 (22.63) 
##   med = On Medication (%)     10 (2.5)

Plots

Here we are using ggplot2.

ggplot(data=data, aes(x = med, y= as.numeric(chol), fill = med)) +
  stat_summary(fun.y = "mean", geom = "bar") + 
  theme(axis.title.x = element_blank()) + 
  ylab("Cholesterol (mg/dL)") +  guides(fill=FALSE) + 
  ggtitle("Cholesterol Levels by Medication Status")

ggplot(data=data, aes(x=age, y=chol)) +
  geom_point(shape = 1) + 
  ylab("Cholesterol (mg/dL)") + 
  geom_smooth(method = lm, se = FALSE) +
  ggtitle("Relationship between Cholesterol and Age") +
  xlab("Age (years)")

Regression Model

# Linear Regression
mod1 <- lm(chol ~ exer + fv + age + anxiety + med, data = data)

sum_mod1 <- summary(mod1)

sum_mod1
## 
## Call:
## lm(formula = chol ~ exer + fv + age + anxiety + med, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -47.3  -14.2   -2.1   12.3   61.2 
## 
## Coefficients:
##                  Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)       147.882      7.797   18.97 < 0.0000000000000002 ***
## exer               11.222      1.749    6.42        0.00000000052 ***
## fv                  2.711      1.178    2.30                0.022 *  
## age                 1.235      0.185    6.67        0.00000000012 ***
## anxiety             1.174      1.078    1.09                0.277    
## medOn Medication    1.957      7.303    0.27                0.789    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20 on 310 degrees of freedom
##   (84 observations deleted due to missingness)
## Multiple R-squared:  0.25,   Adjusted R-squared:  0.238 
## F-statistic: 20.7 on 5 and 310 DF,  p-value: <0.0000000000000002
#sum_mod1$coefficients[, 4] <- pvalr(sum_mod1$coefficients[, 4], digits = 3)
#sum_mod1$coefficients[, 4] <- format.pval(sum_mod1$coefficients[, 4], digits = 3)

Other display options

# Method 1 for output
knitr::kable(sum_mod1$coef, digits = 5)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 147.8821 7.79653 18.96768 0.00000
exer 11.2223 1.74852 6.41820 0.00000
fv 2.7109 1.17775 2.30175 0.02201
age 1.2347 0.18523 6.66548 0.00000
anxiety 1.1744 1.07766 1.08978 0.27666
medOn Medication 1.9574 7.30281 0.26804 0.78885
# Method 2 for output
pander::pander(sum_mod1, digits = 5, round = 5)
  Estimate Std. Error t value Pr(>|t|)
(Intercept) 147.88 7.7965 18.968 0
exer 11.222 1.7485 6.4182 0
fv 2.7109 1.1778 2.3018 0.02201
age 1.2347 0.18523 6.6655 0
anxiety 1.1744 1.0777 1.0898 0.27666
medOn Medication 1.9574 7.3028 0.26804 0.78885
Fitting linear model: chol ~ exer + fv + age + anxiety + med
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
316 20.04 0.2505 0.2384
# Method 3 for output (first install xtable package)
xtable::print.xtable(xtable::xtable(sum_mod1), type = "html")
Estimate Std. Error t value Pr(>|t|)
(Intercept) 147.8821 7.7965 18.97 0.0000
exer 11.2223 1.7485 6.42 0.0000
fv 2.7109 1.1777 2.30 0.0220
age 1.2347 0.1852 6.67 0.0000
anxiety 1.1744 1.0777 1.09 0.2767
medOn Medication 1.9574 7.3028 0.27 0.7888
# Method 4 for output
stargazer::stargazer(mod1, type = "html")
Dependent variable:
chol
exer 11.222***
(1.749)
fv 2.711**
(1.178)
age 1.235***
(0.185)
anxiety 1.174
(1.078)
medOn Medication 1.957
(7.303)
Constant 147.880***
(7.797)
Observations 316
R2 0.250
Adjusted R2 0.238
Residual Std. Error 20.038 (df = 310)
F Statistic 20.719*** (df = 5; 310)
Note: p<0.1; p<0.05; p<0.01