library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.1
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## Warning: package 'ggplot2' was built under R version 4.2.1
## Warning: package 'tibble' was built under R version 4.2.1
## Warning: package 'tidyr' was built under R version 4.2.1
## Warning: package 'readr' was built under R version 4.2.1
## Warning: package 'purrr' was built under R version 4.2.1
## Warning: package 'dplyr' was built under R version 4.2.1
## Warning: package 'stringr' was built under R version 4.2.1
## Warning: package 'forcats' was built under R version 4.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(magrittr)
## Warning: package 'magrittr' was built under R version 4.2.1
##
## Attaching package: 'magrittr'
##
## The following object is masked from 'package:purrr':
##
## set_names
##
## The following object is masked from 'package:tidyr':
##
## extract
library(readr)
library(ggplot2)
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.2.1
library(gtsummary)
## Warning: package 'gtsummary' was built under R version 4.2.1
## #StandWithUkraine
library(psych)
## Warning: package 'psych' was built under R version 4.2.1
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(broom)
## Warning: package 'broom' was built under R version 4.2.1
library(AICcmodavg)
## Warning: package 'AICcmodavg' was built under R version 4.2.1
library(car)
## Warning: package 'car' was built under R version 4.2.1
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.2.1
##
## Attaching package: 'car'
##
## The following object is masked from 'package:psych':
##
## logit
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(sjPlot)
## Warning: package 'sjPlot' was built under R version 4.2.1
## Install package "strengejacke" from GitHub (`devtools::install_github("strengejacke/strengejacke")`) to load all sj-packages at once!
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.2.1
## corrplot 0.92 loaded
library(boot)
##
## Attaching package: 'boot'
##
## The following object is masked from 'package:car':
##
## logit
##
## The following object is masked from 'package:psych':
##
## logit
library(tableone)
## Warning: package 'tableone' was built under R version 4.2.1
project1data <- read.csv("project1data.csv")
#Descriptive Statistics
Summary to look at structure and descriptive statistics for variables
str(project1data)
## 'data.frame': 130 obs. of 11 variables:
## $ id : int 101 102 103 104 105 106 107 108 109 110 ...
## $ trtgroup : int 4 5 2 3 1 4 3 2 5 1 ...
## $ sex : int 2 2 2 2 2 2 2 2 2 2 ...
## $ race : int 5 5 5 5 2 5 5 5 2 5 ...
## $ age : num 44.6 35.6 47.9 55.2 43.8 ...
## $ smoker : int 1 1 1 1 1 1 1 1 1 1 ...
## $ sites : int 162 162 144 138 168 168 156 162 156 162 ...
## $ attachbase : num 2.43 2.54 2.88 4.96 1.77 ...
## $ attach1year: num 2.58 NA 3.08 5.3 1.45 ...
## $ pdbase : num 3.25 3.01 3.12 5.22 3.36 ...
## $ pd1year : num 3.41 NA 3.12 4.89 2.9 ...
summary(project1data)
## id trtgroup sex race age
## Min. :101.0 Min. :1 Min. :1.000 Min. :1.000 Min. :28.57
## 1st Qu.:133.2 1st Qu.:2 1st Qu.:1.000 1st Qu.:5.000 1st Qu.:41.98
## Median :166.5 Median :3 Median :2.000 Median :5.000 Median :48.63
## Mean :182.8 Mean :3 Mean :1.585 Mean :4.646 Mean :49.94
## 3rd Qu.:237.8 3rd Qu.:4 3rd Qu.:2.000 3rd Qu.:5.000 3rd Qu.:55.85
## Max. :270.0 Max. :5 Max. :2.000 Max. :5.000 Max. :74.53
## NA's :1
## smoker sites attachbase attach1year
## Min. :0.0000 Min. :114.0 Min. :0.8951 Min. :0.8654
## 1st Qu.:0.0000 1st Qu.:150.0 1st Qu.:1.5792 1st Qu.:1.4630
## Median :0.0000 Median :162.0 Median :2.0275 Median :1.9783
## Mean :0.3721 Mean :157.5 Mean :2.1461 Mean :2.1014
## 3rd Qu.:1.0000 3rd Qu.:168.0 3rd Qu.:2.5849 3rd Qu.:2.5117
## Max. :1.0000 Max. :168.0 Max. :5.0893 Max. :5.3043
## NA's :1 NA's :27
## pdbase pd1year
## Min. :2.263 Min. :1.964
## 1st Qu.:2.852 1st Qu.:2.516
## Median :3.098 Median :2.905
## Mean :3.138 Mean :2.875
## 3rd Qu.:3.336 3rd Qu.:3.173
## Max. :5.217 Max. :4.891
## NA's :27
project1data$trtgroup <- factor(project1data$trtgroup)
project1data$sex <- factor(project1data$sex,labels = c("male","female"))
project1data$race <- factor(project1data$race,labels = c("Native American","African","Asian","white"))
project1data$smoker <- factor(project1data$smoker, labels =c("non smoker", "smoker"))
Table <-CreateTableOne(data= project1data)
summary(Table)
##
## ### Summary of continuous variables ###
##
## strata: Overall
## n miss p.miss mean sd median p25 p75 min max skew kurt
## id 130 0 0.0 183 55.0 166 133 238 101.0 270 0.1 -1.5
## age 130 1 0.8 50 10.0 49 42 56 28.6 75 0.4 -0.4
## sites 130 0 0.0 158 11.3 162 150 168 114.0 168 -1.4 2.1
## attachbase 130 0 0.0 2 0.8 2 2 3 0.9 5 1.1 1.9
## attach1year 130 27 20.8 2 0.8 2 1 3 0.9 5 1.3 2.1
## pdbase 130 0 0.0 3 0.4 3 3 3 2.3 5 1.3 4.3
## pd1year 130 27 20.8 3 0.5 3 3 3 2.0 5 0.8 2.0
##
## =======================================================================================
##
## ### Summary of categorical variables ###
##
## strata: Overall
## var n miss p.miss level freq percent cum.percent
## trtgroup 130 0 0.0 1 26 20.0 20.0
## 2 26 20.0 40.0
## 3 26 20.0 60.0
## 4 26 20.0 80.0
## 5 26 20.0 100.0
##
## sex 130 0 0.0 male 54 41.5 41.5
## female 76 58.5 100.0
##
## race 130 0 0.0 Native American 4 3.1 3.1
## African 9 6.9 10.0
## Asian 3 2.3 12.3
## white 114 87.7 100.0
##
## smoker 130 1 0.8 non smoker 81 62.8 62.8
## smoker 48 37.2 100.0
##
Table 1- Descriptive Statistic Analysis with Pre/Post Outcome Variables by Treatment Group # This initial table includes missing values and frequency distributions
project1data %>%
tbl_summary(
by = trtgroup,
label = list(trtgroup ~ "Treatment Group Level", sex ~ "Sex", age ~ "Age", race ~ "Race", smoker ~ "Smoking"),
type = list(c(trtgroup, sex, race, smoker) ~ "categorical"),
missing_text = "Missing",
statistic = list(all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)")) %>%
modify_header(label ~ "**Header**") %>%
bold_labels() %>%
add_n() %>%
modify_caption("**Demographic Characteristics with Pre/Post Outcome Variables by Treatment Group**")
#Dealing with missing values
# retain only complete cases
project1data_complete <- na.omit(project1data)
# dataset of all observations with some missing values
project1data_missing <- project1data[-which(project1data$id %in% project1data_complete$id),]
# Now removed 29 observations dropping the number of total observations from 130 to 101
summary(project1data_complete)
## id trtgroup sex race age
## Min. :101.0 1:22 male :35 Native American: 3 Min. :28.57
## 1st Qu.:130.0 2:23 female:66 African : 6 1st Qu.:42.51
## Median :158.0 3:21 Asian : 3 Median :48.63
## Mean :177.5 4:19 white :89 Mean :50.17
## 3rd Qu.:234.0 5:16 3rd Qu.:55.30
## Max. :270.0 Max. :74.53
## smoker sites attachbase attach1year
## non smoker:65 Min. :114 Min. :0.8951 Min. :0.8654
## smoker :36 1st Qu.:150 1st Qu.:1.6369 1st Qu.:1.4630
## Median :162 Median :2.0476 Median :1.9783
## Mean :158 Mean :2.1912 Mean :2.0956
## 3rd Qu.:168 3rd Qu.:2.6369 3rd Qu.:2.5060
## Max. :168 Max. :5.0893 Max. :5.3043
## pdbase pd1year
## Min. :2.263 Min. :1.964
## 1st Qu.:2.827 1st Qu.:2.512
## Median :3.118 Median :2.899
## Mean :3.169 Mean :2.873
## 3rd Qu.:3.364 3rd Qu.:3.179
## Max. :5.217 Max. :4.891
project1data <- project1data %>% mutate(missing=ifelse(id %in% project1data_missing$id,"Yes","No"))
project1data %>%
tbl_summary(
by = missing,
label = list(trtgroup ~ "Treatment Group", sex ~ "Sex", age ~ "Age", race ~ "Race", smoker ~ "Smoking Status"),
type = list(c(trtgroup, sex, race, smoker) ~ "categorical"),
missing = "no",
missing_text = "Did not Respond",
statistic = list(all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)")) %>%
modify_header(label ~ "**MIssing No or Yes**") %>%
bold_labels() %>%
add_n() %>%
add_p() %>%
add_q() %>%
modify_caption("**Associations of Participants with Missing Values to Original Dataset**")
## add_q: Adjusting p-values with
## `stats::p.adjust(x$table_body$p.value, method = "fdr")`
#Since there is at least 20% (29/130)missing I decide to create a new dataset without the missing values.
#If I want my final dataset to only use complete cases
project1data_complete <- project1data[complete.cases(project1data), ] # this data frame n=101 as the missing values are taken out
#Histogram for Age using dataset without missing values
** The histogram of age shows that age is not normally distributed with a right skewed distribution.This means that age is more skewed towards younger participants. Earlier we saw that the mean age was mean=49.94, sd= 10.03, median= 48.63. However, this is not an extreme and to look at the quantitative values for the distribution, if the p value > 0.05 this implies that the distribution of the data are not signficantly different from normal distribution.Here the p value= 0.02 verifying that there is some problem with distribution.
hist(project1data_complete$age,
main= "Histogram of Age",
xlab="Age",
ylab="Frequency")
ggqqplot(project1data$age)
## Warning: Removed 1 rows containing non-finite values (stat_qq).
## Warning: Removed 1 rows containing non-finite values (stat_qq_line).
## Removed 1 rows containing non-finite values (stat_qq_line).
shapiro.test(project1data$age)
##
## Shapiro-Wilk normality test
##
## data: project1data$age
## W = 0.97601, p-value = 0.0217
# in other words p > 0.05 means that your variable is normally distributed
** THe histogram of sites- not normal distribution , not linear
hist(project1data_complete$sites,
main= "Histogram of Sites",
xlab="Sites",
ylab="Frequency")
ggqqplot(project1data$sites)
shapiro.test(project1data$sites)
##
## Shapiro-Wilk normality test
##
## data: project1data$sites
## W = 0.82887, p-value = 5.384e-11
hist(project1data_complete$attachbase,
main= "Histogram of Baseline Attachment Loss",
xlab="attachbase",
ylab="Frequency")
ggqqplot(project1data$attachbase)
shapiro.test(project1data$attachbase)
##
## Shapiro-Wilk normality test
##
## data: project1data$attachbase
## W = 0.92883, p-value = 3.705e-06
hist(project1data_complete$pdbase,
main= "Histogram of Baseline Pocket Depth",
xlab="pdbase",
ylab="Frequency")
##Distribution for Pd1year
hist(project1data_complete$pd1year,
main= "Histogram of Pocket Depth at 1 Year Post Treatment",
xlab="Pocket Depth 1 Year",
ylab="Frequency")
ggqqplot(project1data$pd1year)
## Warning: Removed 27 rows containing non-finite values (stat_qq).
## Warning: Removed 27 rows containing non-finite values (stat_qq_line).
## Removed 27 rows containing non-finite values (stat_qq_line).
shapiro.test(project1data$pd1year)
##
## Shapiro-Wilk normality test
##
## data: project1data$pd1year
## W = 0.9587, p-value = 0.002714
# in other words p > 0.05 means that your variable is normally distributed
Testing for the Linear Regression Assumptions: Linear, Independence, Normality, and Homogeneity of residuals variance
#Scatterplots- NOTE: if I have non-normality of residuals or outliers in Y, mitigation by including one or more additional confounder/precision variables
#First continuous variables-scatterplots #create scatterplot of age vs.attachbase, attach1year, pdbase, pd1year
#1- the relationship of age and baseline attachment loss shows as age increases the attachbase increases so their is a linear relationship, with some outliers especially around age 55. I do wonder if after 55 some people are getting dentures and that could explain the high numbers around 55 years of age. # At the post treatment attach1year we see a similar linear relationship as well, just with lower numbers (which points to a positive reaction to treatment since smaller values are better). # The pocket depth scatterplots show a linear relationship but with age the depth was lower at baseline (pdbase) and at pd1year, and the pocket depth did decrease post treatment.
project1data_complete %>% ggplot(aes(x=age,y=attachbase))+geom_point()+geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
project1data_complete %>% ggplot(aes(x=age,y=attach1year))+geom_point()+geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
project1data_complete %>% ggplot(aes(x=age,y=pdbase))+geom_point()+geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
project1data_complete %>% ggplot(aes(x=age,y=pd1year))+geom_point()+geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
# now work with sites
project1data_complete %>% ggplot(aes(x=sites,y=attachbase))+geom_point()+geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
# The correlation between sites and attachment loss at 1 year shows that there is a decrease in attachment loss as the number of sites increases.
project1data_complete %>% ggplot(aes(x=sites,y=attach1year))+geom_point()+geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
project1data_complete %>% ggplot(aes(x=sites,y=pdbase))+geom_point()+geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
project1data_complete %>% ggplot(aes(x=sites,y=pd1year))+geom_point()+geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
project1data_complete %>% ggplot(aes(x=pdbase,y=pd1year))+geom_point()+geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
#Attach base and attach1year
project1data_complete %>% ggplot(aes(x=attachbase,y=attach1year))+geom_point()+geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
Descriptive statistics continued- categorical explanatory variables and outcome variables # Plot categorical variable against a numeric variable- boxplots
project1data_complete %>% ggplot(aes(x=trtgroup,y=attach1year, fill=trtgroup))+geom_boxplot()+stat_compare_means(method = "t.test",comparisons = list(c(1,2)))+ylab("baseline measurement (mm)")
project1data_complete %>% ggplot(aes(x=trtgroup,y=pd1year, fill=trtgroup))+geom_boxplot()+stat_compare_means(method = "t.test",comparisons = list(c(1,2)))+ylab("Pocket Depth 1 Year measurement (mm)")
## Plots of Two sample t-test for comparisons of means - Graphics for
sex and attachbase, attach1year,pdbase and pd1year
project1data_complete %>% ggplot(aes(x=sex,y=attachbase,fill=sex))+geom_boxplot()+stat_compare_means(method = "t.test",comparisons = list(c(1,2)))+ylab("baseline measurement (mm)")
project1data_complete %>% ggplot(aes(x=sex,y=attach1year,fill=sex))+geom_boxplot()+stat_compare_means(method = "t.test",comparisons = list(c(1,2)))+ylab("attach 1 year measurement (mm)")
project1data_complete %>% ggplot(aes(x=sex,y=pdbase,fill=sex))+geom_boxplot()+stat_compare_means(method = "t.test",comparisons = list(c(1,2)))+ylab("baseline measurement (mm)")
project1data_complete %>% ggplot(aes(x=sex,y=pd1year,fill=sex))+geom_boxplot()+stat_compare_means(method = "t.test",comparisons = list(c(1,2)))+ylab("pd 1 year measurement (mm)")
project1data_complete %>% ggplot(aes(x=smoker,y=pdbase,fill=smoker))+geom_boxplot()+stat_compare_means(method = "t.test",comparisons = list(c(1,2)))+ylab("Smoker and Pocket Depth at baseline measurement (mm)")
project1data_complete %>% ggplot(aes(x=smoker,y=pd1year,fill=smoker))+geom_boxplot()+stat_compare_means(method = "t.test",comparisons = list(c(1,2)))+ylab("Smoker and Pocket Depth at 1 year measurement (mm)")
#Regression
** Model testing- I will test to find the most parsimonious model** # To build the regression model, use the function lm # Evaluating change from baseline is important because we need to consider the baseline attachbase and pdbase for the participants in analysis of the outcomes. Example- smokers may have more baseline attachment loss and pocket depth at baseline values .In the article by the AAP it showed that smoking was a factor at baseline for periodontitis So I will use in my models to account for that as a change to the outcomes.
model1<- lm(attachbase~ sex+race+age+smoker+sites, data = project1data_complete)
plot(model1)
tab_model(model1)
| attachbase | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 6.23 | 3.51 – 8.95 | <0.001 |
| sex [female] | 0.35 | 0.04 – 0.67 | 0.028 |
| race [African] | 0.36 | -0.70 – 1.43 | 0.497 |
| race [Asian] | -0.24 | -1.52 – 1.04 | 0.710 |
| race [white] | 0.30 | -0.59 – 1.19 | 0.501 |
| age | 0.01 | -0.00 – 0.03 | 0.148 |
| smoker [smoker] | 0.27 | -0.05 – 0.59 | 0.103 |
| sites | -0.03 | -0.05 – -0.02 | <0.001 |
| Observations | 101 | ||
| R2 / R2 adjusted | 0.274 / 0.219 | ||
# backward variable selection method
model1.step <- step(model1,direction = "backward")
## Start: AIC=-52.77
## attachbase ~ sex + race + age + smoker + sites
##
## Df Sum of Sq RSS AIC
## - race 3 0.9106 52.035 -56.984
## <none> 51.124 -52.767
## - age 1 1.1677 52.292 -52.486
## - smoker 1 1.4922 52.617 -51.861
## - sex 1 2.7567 53.881 -49.462
## - sites 1 10.7155 61.840 -35.548
##
## Step: AIC=-56.98
## attachbase ~ sex + age + smoker + sites
##
## Df Sum of Sq RSS AIC
## <none> 52.035 -56.984
## - smoker 1 1.9551 53.990 -55.258
## - age 1 1.9663 54.001 -55.237
## - sex 1 2.6397 54.675 -53.986
## - sites 1 10.5032 62.538 -40.414
plot(model1.step)
tab_model(model1.step)
| attachbase | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 5.83 | 3.50 – 8.16 | <0.001 |
| sex [female] | 0.34 | 0.03 – 0.65 | 0.030 |
| age | 0.01 | -0.00 – 0.03 | 0.060 |
| smoker [smoker] | 0.30 | -0.01 – 0.61 | 0.061 |
| sites | -0.03 | -0.04 – -0.02 | <0.001 |
| Observations | 101 | ||
| R2 / R2 adjusted | 0.261 / 0.230 | ||
# Sex female has sig higher levels of attach compared to Sex male at the baseline
# higher number of sites was associated with slightly smaller attach at the baseline
model2<- lm(attach1year~ trtgroup+sex+race+age+smoker+attachbase+sites, data = project1data_complete)
plot(model2)
tab_model(model2)
| attach1year | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 0.35 | -0.67 – 1.37 | 0.496 |
| trtgroup [2] | -0.03 | -0.19 – 0.13 | 0.732 |
| trtgroup [3] | 0.14 | -0.01 – 0.30 | 0.071 |
| trtgroup [4] | 0.13 | -0.03 – 0.29 | 0.114 |
| trtgroup [5] | 0.02 | -0.16 – 0.19 | 0.859 |
| sex [female] | -0.04 | -0.14 – 0.07 | 0.511 |
| race [African] | -0.20 | -0.56 – 0.16 | 0.272 |
| race [Asian] | 0.07 | -0.37 – 0.50 | 0.764 |
| race [white] | 0.00 | -0.30 – 0.31 | 0.976 |
| age | -0.00 | -0.01 – 0.00 | 0.348 |
| smoker [smoker] | 0.09 | -0.01 – 0.20 | 0.088 |
| attachbase | 0.87 | 0.80 – 0.94 | <0.001 |
| sites | -0.00 | -0.01 – 0.00 | 0.832 |
| Observations | 101 | ||
| R2 / R2 adjusted | 0.910 / 0.898 | ||
# backward variable selection method
model2.step <- lm(attach1year~ trtgroup+sex+age+smoker+attachbase, data = project1data_complete)
plot(model2.step)
tab_model(model2.step) # we remove the race and sites because it was the least significant variables
| attach1year | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 0.25 | -0.04 – 0.54 | 0.088 |
| trtgroup [2] | -0.03 | -0.18 – 0.13 | 0.742 |
| trtgroup [3] | 0.12 | -0.03 – 0.27 | 0.128 |
| trtgroup [4] | 0.14 | -0.02 – 0.30 | 0.081 |
| trtgroup [5] | 0.01 | -0.16 – 0.18 | 0.916 |
| sex [female] | -0.04 | -0.15 – 0.06 | 0.423 |
| age | -0.00 | -0.01 – 0.00 | 0.352 |
| smoker [smoker] | 0.09 | -0.02 – 0.19 | 0.118 |
| attachbase | 0.88 | 0.81 – 0.94 | <0.001 |
| Observations | 101 | ||
| R2 / R2 adjusted | 0.906 / 0.898 | ||
# attachbase is confounding variable because different patients have different attach prior to treatment
# pdbase is confounding variable because different patients have different pd prior to treatment
model3<- lm(pdbase~ sex+race+age+smoker+sites, data = project1data_complete)
plot(model3)
tab_model(model3)
| pdbase | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 3.47 | 1.84 – 5.09 | <0.001 |
| sex [female] | 0.23 | 0.04 – 0.42 | 0.017 |
| race [African] | 0.35 | -0.28 – 0.99 | 0.270 |
| race [Asian] | 0.50 | -0.26 – 1.26 | 0.198 |
| race [white] | 0.30 | -0.23 – 0.83 | 0.269 |
| age | -0.00 | -0.01 – 0.01 | 0.776 |
| smoker [smoker] | 0.23 | 0.04 – 0.42 | 0.021 |
| sites | -0.00 | -0.01 – 0.00 | 0.288 |
| Observations | 101 | ||
| R2 / R2 adjusted | 0.174 / 0.111 | ||
# backward variable selection method
model3.step <- step(model3,direction = "backward")
## Start: AIC=-156.79
## pdbase ~ sex + race + age + smoker + sites
##
## Df Sum of Sq RSS AIC
## - race 3 0.37538 18.628 -160.74
## - age 1 0.01597 18.268 -158.71
## - sites 1 0.22446 18.477 -157.56
## <none> 18.252 -156.79
## - smoker 1 1.08803 19.340 -152.94
## - sex 1 1.15254 19.405 -152.61
##
## Step: AIC=-160.74
## pdbase ~ sex + age + smoker + sites
##
## Df Sum of Sq RSS AIC
## - age 1 0.01460 18.642 -162.66
## <none> 18.628 -160.74
## - sites 1 0.45412 19.082 -160.31
## - sex 1 1.24360 19.871 -156.21
## - smoker 1 1.28317 19.911 -156.01
##
## Step: AIC=-162.66
## pdbase ~ sex + smoker + sites
##
## Df Sum of Sq RSS AIC
## <none> 18.642 -162.66
## - sites 1 0.44846 19.091 -162.26
## - sex 1 1.25890 19.901 -158.06
## - smoker 1 1.41546 20.058 -157.27
plot(model3.step)
tab_model(model3.step)
| pdbase | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 3.89 | 2.61 – 5.18 | <0.001 |
| sex [female] | 0.24 | 0.05 – 0.42 | 0.012 |
| smoker [smoker] | 0.25 | 0.07 – 0.43 | 0.008 |
| sites | -0.01 | -0.01 – 0.00 | 0.130 |
| Observations | 101 | ||
| R2 / R2 adjusted | 0.156 / 0.130 | ||
# Sex 2 has sig higher levels of pd compared to Sex 1 at the baseline
# Smoker 1 has sig higher levels of pd compared to Smoker 0 at the baseline
model4<- lm(pd1year~ trtgroup+sex+race+age+smoker+pdbase+sites, data = project1data_complete)
plot(model4)
tab_model(model4)
| pd1year | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 0.40 | -0.69 – 1.49 | 0.464 |
| trtgroup [2] | 0.06 | -0.11 – 0.22 | 0.475 |
| trtgroup [3] | 0.20 | 0.04 – 0.37 | 0.018 |
| trtgroup [4] | 0.14 | -0.03 – 0.31 | 0.096 |
| trtgroup [5] | 0.03 | -0.15 – 0.21 | 0.713 |
| sex [female] | -0.09 | -0.21 – 0.03 | 0.125 |
| race [African] | 0.06 | -0.32 – 0.45 | 0.737 |
| race [Asian] | 0.23 | -0.23 – 0.70 | 0.325 |
| race [white] | 0.16 | -0.17 – 0.48 | 0.341 |
| age | -0.00 | -0.01 – 0.00 | 0.260 |
| smoker [smoker] | 0.02 | -0.10 – 0.14 | 0.761 |
| pdbase | 0.88 | 0.75 – 1.00 | <0.001 |
| sites | -0.00 | -0.01 – 0.00 | 0.458 |
| Observations | 101 | ||
| R2 / R2 adjusted | 0.751 / 0.717 | ||
# backward variable selection method
model4.step <- lm(pd1year~ trtgroup+sex+age+smoker+pdbase, data = project1data_complete)
plot(model4.step)
tab_model(model4.step) # we remove the race and sites because it was the least sig variables
| pd1year | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 0.13 | -0.34 – 0.60 | 0.581 |
| trtgroup [2] | 0.06 | -0.10 – 0.22 | 0.459 |
| trtgroup [3] | 0.17 | 0.01 – 0.34 | 0.034 |
| trtgroup [4] | 0.16 | -0.01 – 0.32 | 0.059 |
| trtgroup [5] | 0.02 | -0.16 – 0.19 | 0.835 |
| sex [female] | -0.09 | -0.21 – 0.02 | 0.114 |
| age | -0.00 | -0.01 – 0.00 | 0.327 |
| smoker [smoker] | 0.02 | -0.10 – 0.13 | 0.776 |
| pdbase | 0.90 | 0.78 – 1.02 | <0.001 |
| Observations | 101 | ||
| R2 / R2 adjusted | 0.744 / 0.721 | ||
# conclusions: The pdbase was the most important variable as expected. People that came in with high pd were expected to have high pd after one year. when we regress for pdbase, we found that trtgroup [3] (low dose treatment) had sig increased pd compared to trtgroup [1] (placebo) (trtgroup [3] (low dose treatment) had 0.17 on average higher pd compared to trtgroup [1] (placebo))
To verify how the baseline attachment loss measurement is significant to the outcome at 1 year. If attachbase is left out of the regression as we see in model 6, the other variables now have signficant relationships.
model6<- lm(attach1year~ trtgroup+sex+race+age+smoker+sites, data = project1data_complete)
plot(model6)
tab_model(model6)
| attach1year | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 5.01 | 2.40 – 7.63 | <0.001 |
| trtgroup [2] | 0.48 | 0.06 – 0.91 | 0.027 |
| trtgroup [3] | 0.48 | 0.05 – 0.91 | 0.028 |
| trtgroup [4] | 0.42 | -0.02 – 0.85 | 0.058 |
| trtgroup [5] | 0.38 | -0.09 – 0.85 | 0.111 |
| sex [female] | 0.23 | -0.06 – 0.53 | 0.122 |
| race [African] | 0.22 | -0.76 – 1.21 | 0.656 |
| race [Asian] | 0.08 | -1.12 – 1.28 | 0.895 |
| race [white] | 0.39 | -0.44 – 1.22 | 0.351 |
| age | 0.00 | -0.01 – 0.02 | 0.512 |
| smoker [smoker] | 0.36 | 0.06 – 0.65 | 0.019 |
| sites | -0.03 | -0.04 – -0.01 | <0.001 |
| Observations | 101 | ||
| R2 / R2 adjusted | 0.303 / 0.217 | ||
# backward variable selection method
model6.step <- lm(attach1year~ trtgroup+sex+age+smoker+sites, data = project1data_complete)
plot(model6.step)
tab_model(model6.step)
| attach1year | |||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 4.93 | 2.69 – 7.17 | <0.001 |
| trtgroup [2] | 0.47 | 0.06 – 0.89 | 0.027 |
| trtgroup [3] | 0.45 | 0.04 – 0.87 | 0.034 |
| trtgroup [4] | 0.43 | 0.00 – 0.86 | 0.047 |
| trtgroup [5] | 0.36 | -0.09 – 0.82 | 0.117 |
| sex [female] | 0.22 | -0.07 – 0.51 | 0.142 |
| age | 0.01 | -0.01 – 0.02 | 0.285 |
| smoker [smoker] | 0.38 | 0.09 – 0.67 | 0.011 |
| sites | -0.02 | -0.04 – -0.01 | <0.001 |
| Observations | 101 | ||
| R2 / R2 adjusted | 0.291 / 0.229 | ||
#Residuals
## Histogram of residuals (model2.step (DV=attach1year))
```r
ggplot(data = project1data_complete, aes(x =model2.step$residuals)) +
geom_histogram(fill = 'steelblue', color = 'black') +
labs(title = 'Histogram of Residuals', x = 'Residuals', y = 'Frequency')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = project1data_complete, aes(x =model4.step$residuals)) +
geom_histogram(fill = 'steelblue', color = 'black') +
labs(title = 'Histogram of Residuals', x = 'Residuals', y = 'Frequency')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.