This is the exercise from the “Advanced R” tutorial.

Setup

The code below sets up the R environment by loading the tidyverse package.

options(scipen=999, digits = 5, width = 88)
options(qwraps2_markup = "markdown")

options(stringAsFactors = F)
options(dplyr.print_max = 10)
options(dplyr.pring_min = 5)

knitr::opts_chunk$set(cache   = FALSE,
               echo    = TRUE,
               results = "hide",
               message = FALSE,
               warning = FALSE) 

options(repos = c(CRAN = "https://cran.rstudio.com"))

library(tidyverse)

## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr

## Conflicts with tidy packages -----------------------------------------------------------

## filter(): dplyr, stats
## lag():    dplyr, stats

Import

The code below imports the data using readr::read_csv. The csv stands for “Comma Seperate Values”.

data <- read_csv("https://raw.githubusercontent.com/vtmacox/teachr/master/data.csv", col_names = TRUE)

Data Cleaning

The code below assigns the med variable as a factor and provides labels. It also recodes values of 9 as missing for the anxiety variable.

data$med <- factor(data$med,
                   levels = c(0,1),
                   labels = c("Not on medication", "On Medication"))

data$chol <- as.numeric(data$chol)

data$anxiety <- if_else(data$anxiety == 9, NA_integer_, data$anxiety)

# Or you could do it this way
# data$anxiety <- ifelse(data$anxiety == 9, NA, data$anxiety)

Descriptive Stats

The code below shows two methods of creating descriptive stats. The first uses psych::describe. This prints out useful infromation such as mean, median, range, min, max, skew, and kurtosis. However, if you have factor variables, it does not output frequences. The second method uses functions from the qwraps2 package which creates tables that can be displayed in markdown documents.

psych::describe(data)

##         vars   n   mean     sd median trimmed    mad min   max range skew kurtosis   se
## id         1 400 200.50 115.61  200.5  200.50 148.26   1 400.0 399.0 0.00    -1.21 5.78
## exer       2 379   0.79   0.66    0.6    0.71   0.59   0   3.7   3.7 1.32     2.19 0.03
## fv         3 378   2.60   1.01    3.0    2.58   1.48   1   6.0   5.0 0.31    -0.26 0.05
## age        4 370  30.02   6.06   29.0   29.47   5.93  22  58.0  36.0 0.87     0.89 0.31
## anxiety    5 383   3.52   1.11    3.0    3.47   1.48   2   7.0   5.0 0.41    -0.29 0.06
## chol       6 400 204.53  22.63  199.0  201.73  22.24 176 282.0 106.0 1.00     0.53 1.13
## med*       7 400   1.02   0.16    1.0    1.00   0.00   1   2.0   1.0 6.06    34.84 0.01

tab1 <- 
  with(data,
       list( "Hours of Exercise" = qwraps2::tab_summary(exer),
             "Servings of Fruits and Vegetables" = qwraps2::tab_summary(fv),
             "Age" = qwraps2::tab_summary(age),
             "Anxiety" = qwraps2::tab_summary(anxiety),
             "Cholesterol Level" = qwraps2::tab_summary(chol),
             "On Cholesterol Medication" = qwraps2::tab_summary(med))
)

Display descriptive stats

outtab1 <- qwraps2::summary_table(data, tab1)
outtab1

	data (N = 400)
Hours of Exercise
min	0
median (IQR)	379; 0.60 (0.30, 1.10)
mean (sd)	379; 0.79 ± 0.66
max	3.7
Unknown	21 (5)
Servings of Fruits and Vegetables
min	1
median (IQR)	378; 3.00 (2.00, 3.00)
mean (sd)	378; 2.60 ± 1.01
max	6
Unknown	22 (6)
Age
min	22
median (IQR)	370; 29.00 (25.00, 34.00)
mean (sd)	370; 30.02 ± 6.06
max	58
Unknown	30 (8)
Anxiety
min	2
median (IQR)	383; 3 (3.00, 4.00)
mean (sd)	383; 3.52 ± 1.11
max	7
Unknown	17 (4)
Cholesterol Level
min	176
median (IQR)	199.00 (187.00, 218.00)
mean (sd)	204.53 ± 22.63
max	282
On Cholesterol Medication
Not on medication	390 (98)
On Medication	10 (2)

Second dispaly option

Dispaly description stats using the tableone:: package.

tableone::CreateTableOne(data = data)

##                          
##                           Overall        
##   n                          400         
##   id (mean (sd))          200.50 (115.61)
##   exer (mean (sd))          0.79 (0.66)  
##   fv (mean (sd))            2.60 (1.01)  
##   age (mean (sd))          30.02 (6.06)  
##   anxiety (mean (sd))       3.52 (1.11)  
##   chol (mean (sd))        204.53 (22.63) 
##   med = On Medication (%)     10 (2.5)

Plots

Here we are using ggplot2.

ggplot(data=data, aes(x = med, y= as.numeric(chol), fill = med)) +
  stat_summary(fun.y = "mean", geom = "bar") + 
  theme(axis.title.x = element_blank()) + 
  ylab("Cholesterol (mg/dL)") +  guides(fill=FALSE) + 
  ggtitle("Cholesterol Levels by Medication Status")

ggplot(data=data, aes(x=age, y=chol)) +
  geom_point(shape = 1) + 
  ylab("Cholesterol (mg/dL)") + 
  geom_smooth(method = lm, se = FALSE) +
  ggtitle("Relationship between Cholesterol and Age") +
  xlab("Age (years)")

Regression Model

# Linear Regression
mod1 <- lm(chol ~ exer + fv + age + anxiety + med, data = data)

sum_mod1 <- summary(mod1)

sum_mod1

## 
## Call:
## lm(formula = chol ~ exer + fv + age + anxiety + med, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -47.3  -14.2   -2.1   12.3   61.2 
## 
## Coefficients:
##                  Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)       147.882      7.797   18.97 < 0.0000000000000002 ***
## exer               11.222      1.749    6.42        0.00000000052 ***
## fv                  2.711      1.178    2.30                0.022 *  
## age                 1.235      0.185    6.67        0.00000000012 ***
## anxiety             1.174      1.078    1.09                0.277    
## medOn Medication    1.957      7.303    0.27                0.789    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20 on 310 degrees of freedom
##   (84 observations deleted due to missingness)
## Multiple R-squared:  0.25,   Adjusted R-squared:  0.238 
## F-statistic: 20.7 on 5 and 310 DF,  p-value: <0.0000000000000002

#sum_mod1$coefficients[, 4] <- pvalr(sum_mod1$coefficients[, 4], digits = 3)
#sum_mod1$coefficients[, 4] <- format.pval(sum_mod1$coefficients[, 4], digits = 3)

Other display options

# Method 1 for output
knitr::kable(sum_mod1$coef, digits = 5)

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	147.8821	7.79653	18.96768	0.00000
exer	11.2223	1.74852	6.41820	0.00000
fv	2.7109	1.17775	2.30175	0.02201
age	1.2347	0.18523	6.66548	0.00000
anxiety	1.1744	1.07766	1.08978	0.27666
medOn Medication	1.9574	7.30281	0.26804	0.78885

# Method 2 for output
pander::pander(sum_mod1, digits = 5, round = 5)

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	147.88	7.7965	18.968	0
exer	11.222	1.7485	6.4182	0
fv	2.7109	1.1778	2.3018	0.02201
age	1.2347	0.18523	6.6655	0
anxiety	1.1744	1.0777	1.0898	0.27666
medOn Medication	1.9574	7.3028	0.26804	0.78885

Fitting linear model: chol ~ exer + fv + age + anxiety + med
Observations	Residual Std. Error	\(R^2\)	Adjusted \(R^2\)
316	20.04	0.2505	0.2384

# Method 3 for output (first install xtable package)
xtable::print.xtable(xtable::xtable(sum_mod1), type = "html")

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	147.8821	7.7965	18.97	0.0000
exer	11.2223	1.7485	6.42	0.0000
fv	2.7109	1.1777	2.30	0.0220
age	1.2347	0.1852	6.67	0.0000
anxiety	1.1744	1.0777	1.09	0.2767
medOn Medication	1.9574	7.3028	0.27	0.7888

# Method 4 for output
stargazer::stargazer(mod1, type = "html")


	Dependent variable:

	chol

exer	11.222^***
	(1.749)

fv	2.711^**
	(1.178)

age	1.235^***
	(0.185)

anxiety	1.174
	(1.078)

medOn Medication	1.957
	(7.303)

Constant	147.880^***
	(7.797)


Observations	316
R²	0.250
Adjusted R²	0.238
Residual Std. Error	20.038 (df = 310)
F Statistic	20.719^*** (df = 5; 310)

Note:	p<0.1; p<0.05; p<0.01

RMarkdown

Matthew Cox-Martin

August 10, 2017