title: " WORKED CODES Oral Analysis" “1-OralAnalysis Final” author: “Hui Han” output: html_document
# To do: 1-- teeth Categorical VR exploration
#2- Teeth Categorical VR to cross tab age category, gender, co-morbidity
# 3- Teeth Categorical VR to visulazation
# Logistic Regression
#4-- Teeth Numerical VR to explore
# teetch numerical VR to visualize
# Covariate to explore
# linear regression
## Covariates: age, agegroup, Sex, HTN, DM, High cholesterol, region, etc
### PLAN- 1) to create age category 2) covariates creation 3) Logistic Regression 4) Linear regression
# HE_HP.x,HE_HP.y, HE.DM.x, HE_DM.y, HE_BMI.x,HE_BMI.y, HE_obe.x, HE_obe.y,HE_hCHOL.x,HE_hCHOL.x, HE_hCHOL.y
## smoker = BS1_1, "Control-- 1 less than 100 pieces, 2 more than 100, 3 never smoked, 8 not applicable, 9 dont know")
# install.packages('lmtest') ## for linear model test
# install.packages('gridExtra') ## for log transformation
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.1 v dplyr 0.7.4
## v tidyr 0.7.2 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## -- Conflicts ---------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(gtools)
library(haven)
library(ggplot2)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(dplyr)
library(RColorBrewer) # ggplot2 color
library (lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library (gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
getwd()
## [1] "C:/Users/graci/Documents/698-GH Final May6/Project"
dataKH<-read_csv ('output/finalData/KHANES4yrsApr14.csv')
## Parsed with column specification:
## cols(
## .default = col_integer(),
## ID = col_character(),
## PSU = col_character(),
## WT_HS = col_double(),
## WT_ITV = col_double(),
## WT_EX = col_double(),
## WT_PFT = col_double(),
## WT_HM = col_double(),
## WT_EX1 = col_double(),
## WT_NTR = col_double(),
## WT_ITVEX = col_double(),
## WT_TOT = col_double(),
## WT_PFNT = col_double(),
## WT_HMNT = col_double(),
## WT_EX1NT = col_double(),
## WT_EX1PF = col_double(),
## WT_EX1HM = col_double(),
## WT_PFHM = col_double(),
## WT_TOT1 = col_double(),
## WT_EX1PFNT = col_double(),
## WT_EX1HMNT = col_double()
## # ... with 256 more columns
## )
## See spec(...) for full column specifications.
dim(dataKH)
## [1] 16109 966
## select only the variables of interest
dtemp<- dataKH %>% select ('ID','YEAR', 'REGION','SEX','AGE','BS1_1.x','HE_BMI.x', 'HE_HP.x','HE_DM.x', 'HE_hCHOL.x',
'numHTeeth', 'presentTeeth','NPERODONTAL', 'gumTrouble','newPero','PERO','PER0two')
dim(dtemp)
## [1] 16109 17
# write to the output folder
# write_csv(dtemp,"output/FinalData/KHanesFinalMay6.csv") ##16109*17VR
d<- read_csv ("output/FinalData/KHanesFinalMay6.csv")
## Parsed with column specification:
## cols(
## ID = col_character(),
## YEAR = col_integer(),
## REGION = col_integer(),
## SEX = col_integer(),
## AGE = col_integer(),
## BS1_1.x = col_integer(),
## HE_BMI.x = col_double(),
## HE_HP.x = col_integer(),
## HE_DM.x = col_integer(),
## HE_hCHOL.x = col_integer(),
## numHTeeth = col_integer(),
## presentTeeth = col_integer(),
## NPERODONTAL = col_integer(),
## gumTrouble = col_integer(),
## newPero = col_integer(),
## PERO = col_integer(),
## PER0two = col_integer()
## )
summary(d$newPero)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 1.00 1.00 0.75 1.00 1.00 4483
## pie chart, percentage
d2<-d %>% filter (!is.na(newPero)) ## to filter out the missing outcome
dim(d2)
## [1] 11626 17
table (d2$YEAR)
##
## 2008 2009 2010
## 2392 4954 4280
table (d2$newPero, d2$YEAR)
##
## 2008 2009 2010
## 0 723 1433 745
## 1 1669 3521 3535
table (d2$newPero, d2$SEX)
##
## 1 2
## 0 1269 1632
## 1 3755 4970
## proportions instead of counts
options (scipen=999, digits=3)
tab_cnt3 <- table (d2$YEAR, d2$newPero)
tab_cnt3
##
## 0 1
## 2008 723 1669
## 2009 1433 3521
## 2010 745 3535
prop.table (tab_cnt3)
##
## 0 1
## 2008 0.0622 0.1436
## 2009 0.1233 0.3029
## 2010 0.0641 0.3041
prop.table (tab_cnt3, 1)
##
## 0 1
## 2008 0.302 0.698
## 2009 0.289 0.711
## 2010 0.174 0.826
# ggplot (d2, aes(x=YEAR, fill=newPero))+
# geom_bar(position='fill')+
# ylab('proportion')
# ## Plot failed, everyone is 100% why?? maybe newpero need to be categorical VR (now integeter)
d3<-d2 %>%
mutate (newPero2= as.factor(newPero))
class (d3$newPero2)
## [1] "factor"
ggplot(d, aes(newPero)) +geom_bar (color = "#FF6666")
## Warning: Removed 4483 rows containing non-finite values (stat_count).
## try EDA, faceted barchart, by Year, by Gender (SEX)
ggplot(d3, aes(x=YEAR))+
geom_bar()+
facet_wrap((~newPero2))
## this worked as count
d3 %>%
mutate (SEX2= as.factor(SEX)) %>%
ggplot(aes(newPero))+
geom_bar(fill='blue', colour='red') +
facet_wrap(~ SEX2)
# next,to do proportion, prepare data first, then to plot
df3<-group_by (d3, .dots=c('YEAR','newPero')) %>%
summarize(counts=n()) %>%
mutate (perc= (counts/sum(counts))*100) %>%
arrange(desc(perc))
dim(df3) ## 4 by 4 table
## [1] 6 4
class (df3$YEAR)
## [1] "integer"
class (df3$newPero)
## [1] "integer"
tbl_cnt<-table(df3$YEAR, df3$newPero)
prop.table(tbl_cnt,1)
##
## 0 1
## 2008 0.5 0.5
## 2009 0.5 0.5
## 2010 0.5 0.5
# prepare the pie chart plot, NewPero Outcome (proportion) by YEAR,
ggplot (df3, aes('', counts)) +
geom_col (
position ='fill',
color='black',
width=1,
aes(fill=factor(newPero))
) +
facet_wrap (~YEAR, labeller = 'label_both') +
geom_label (
aes(label = paste0(round(perc), "%"), group = factor(newPero)),
position = position_fill (vjust=0.5)
, color='black', size=5, show.legend = FALSE) +
coord_polar (theta='y')
#PERO DONTAL DISEASE OUTCOME BY GENDER
# to prepare faceted pie chart
# summary(d2)
class (d2$newPero)
## [1] "integer"
df<-group_by (d2, .dots=c('SEX','newPero')) %>%
summarize(counts=n()) %>%
mutate (perc= (counts/sum(counts))*100) %>%
arrange(desc(perc))
dim(df) ## 4 by 4 table
## [1] 4 4
class (df$SEX)
## [1] "integer"
class (df$newPero)
## [1] "integer"
# prepare the plot, NewPero Outcome by SEX
ggplot (df, aes('', counts)) +
geom_col (
position ='fill',
color='black',
width=1,
aes(fill=factor(newPero))
) +
facet_wrap (~SEX, labeller = 'label_both') +
geom_label (
aes(label = paste0(round(perc), "%"), group = factor(newPero)),
position = position_fill (vjust=0.5)
, color='black', size=5, show.legend = FALSE) +
coord_polar (theta='y')
## AGE and periodontal disease
# table (d$AGE)
# table (d$newPero,d$AGE)
table (d3$AGE)
##
## 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
## 397 364 347 311 294 313 329 349 297 350 329 378 315 349 286 272 273 268
## 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
## 284 264 282 303 299 296 220 238 289 301 288 307 246 269 262 225 214 190
## 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
## 191 164 117 102 95 89 58 53 31 33 32 18 15 11 5 6 3 3
## 94 95
## 1 1
# table (d3$newPero,d3$AGE)
## Agegroups by PerioDontal Disease
# modify the age group to 40+ only (everyone is over 40),to use d3, not d
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
agebreaks <- c(39,58,77,78)
agelabels <- c("40-58","59-77","78+")
setDT(d3)[ , agegroups := cut(AGE,
breaks = agebreaks,
right = FALSE,
labels = agelabels)]
levels (d3$agegroups)
## [1] "40-58" "59-77" "78+"
ggplot(d3, aes(x=agegroups))+geom_bar()
table (d3$agegroups)
##
## 40-58 59-77 78+
## 5821 4968 164
table (d3$agegroups, d3$newPero)
##
## 0 1
## 40-58 1077 4744
## 59-77 1385 3583
## 78+ 77 87
# next,pie chart, proportion of PerioDontal Disease by Age groups, prepar datafirst
dfa<-group_by (d3, .dots=c('agegroups','newPero')) %>%
summarize(counts=n()) %>%
mutate (perc= (counts/sum(counts))*100) %>%
arrange(desc(perc))
dim(dfa) ## 4 by 4 table
## [1] 8 4
class (dfa$agegroups)
## [1] "factor"
class (dfa$newPero)
## [1] "integer"
tbl_cnt<-table(dfa$agegroups, dfa$newPero)
prop.table(tbl_cnt,1)
##
## 0 1
## 40-58 0.5 0.5
## 59-77 0.5 0.5
## 78+ 0.5 0.5
# prepare the pie chart plot, NewPero Outcome (proportion) by agegroups,
ggplot (dfa, aes('', counts)) +
geom_col (
position ='fill',
color='black',
width=1,
aes(fill=factor(newPero))
) +
facet_wrap (~agegroups, labeller = 'label_both') +
geom_label (
aes(label = paste0(round(perc), "%"), group = factor(newPero)),
position = position_fill (vjust=0.5)
, color='black', size=5, show.legend = FALSE) +
coord_polar (theta='y')
## Logistic Regression (Bionomial logistic regression)
class (d3$newPero)## very importantly, the dependent VR just be coded numerically
## [1] "integer"
# see Udemy course- Rogdan Anastasiei
fitglm = glm (newPero ~ SEX+ agegroups+YEAR, data=d3, family = binomial (link = 'logit') )
summary(fitglm)
##
## Call:
## glm(formula = newPero ~ SEX + agegroups + YEAR, family = binomial(link = "logit"),
## data = d3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.015 0.531 0.641 0.768 1.364
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -820.2946 62.0743 -13.21 <0.0000000000000002 ***
## SEX 0.0798 0.0464 1.72 0.085 .
## agegroups59-77 -0.5391 0.0467 -11.55 <0.0000000000000002 ***
## agegroups78+ -1.4204 0.1616 -8.79 <0.0000000000000002 ***
## YEAR 0.4090 0.0309 13.24 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 11861 on 10952 degrees of freedom
## Residual deviance: 11503 on 10948 degrees of freedom
## (673 observations deleted due to missingness)
## AIC: 11513
##
## Number of Fisher Scoring iterations: 4
## proposed model have a lower deviance (11503) vs null deviance (saturated model)
expb <- exp (coef(fitglm))
print(expb)
## (Intercept) SEX agegroups59-77 agegroups78+ YEAR
## 0.000 1.083 0.583 0.242 1.505
## Antilogrithym (chance that a person have PerioDontal Disease)
## print out the confidence interval of these variables (too many decimal places)
intexp <- exp(confint(fitglm))
## Waiting for profiling to be done...
print(intexp)
## 2.5 %
## (Intercept) 0.000
## SEX 0.989
## agegroups59-77 0.532
## agegroups78+ 0.176
## YEAR 1.417
## 97.5 %
## (Intercept) 0.000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000336
## SEX 1.186001412384219522166972637933213263750076293945312500000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## agegroups59-77 0.639084781620910979960115128051256760954856872558593750000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## agegroups78+ 0.332159671258197841670067873565130867063999176025390625000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## YEAR 1.599362369415308293696398322936147451400756835937500000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
### Model2-Logistic Regression, to add more co-variate
fitglm2 = glm (newPero ~ agegroups+ YEAR+ BS1_1.x+ HE_BMI.x+ HE_HP.x+HE_DM.x+ HE_hCHOL.x,
data=d3, family = binomial (link = 'logit') )
## smoking (BS1_1)
summary (fitglm2)
##
## Call:
## glm(formula = newPero ~ agegroups + YEAR + BS1_1.x + HE_BMI.x +
## HE_HP.x + HE_DM.x + HE_hCHOL.x, family = binomial(link = "logit"),
## data = d3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.227 0.488 0.630 0.757 1.385
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -848.21984 65.43143 -12.96 < 0.0000000000000002 ***
## agegroups59-77 -0.49271 0.05164 -9.54 < 0.0000000000000002 ***
## agegroups78+ -1.22004 0.18231 -6.69 0.000000000022 ***
## YEAR 0.42245 0.03257 12.97 < 0.0000000000000002 ***
## BS1_1.x 0.06274 0.04029 1.56 0.119
## HE_BMI.x 0.04291 0.00832 5.15 0.000000254083 ***
## HE_HP.x -0.01635 0.03120 -0.52 0.600
## HE_DM.x -0.15451 0.03401 -4.54 0.000005546497 ***
## HE_hCHOL.x 0.16218 0.06916 2.35 0.019 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10913 on 10156 degrees of freedom
## Residual deviance: 10536 on 10148 degrees of freedom
## (1469 observations deleted due to missingness)
## AIC: 10554
##
## Number of Fisher Scoring iterations: 4
expb2 <- exp(coef(fitglm2))
print(expb2)
## (Intercept) agegroups59-77 agegroups78+ YEAR BS1_1.x
## 0.000 0.611 0.295 1.526 1.065
## HE_BMI.x HE_HP.x HE_DM.x HE_hCHOL.x
## 1.044 0.984 0.857 1.176
## Dependent VR as -- Number of teeth --Exploratory Data Analysis
## dependent variable check
class(d$numHTeeth)
## [1] "integer"
#numerical VR, to explore --Number of Healthy Teeth
class(d$numHTeeth)
## [1] "integer"
table (d$numHTeeth)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 1194 173 198 203 210 194 239 196 200 230 208 245 271 273 292
## 15 16 17 18 19 20 21 22 23 24 25 26 27 28
## 297 350 361 430 420 488 545 714 682 922 1065 978 1311 3127
# count(is.na(d$numHTeeth))
mean (d$numHTeeth, na.rm=TRUE)
## [1] 19.2
sd(d$numHTeeth,na.rm=TRUE)
## [1] 9.13
var(d$numHTeeth,na.rm=TRUE)
## [1] 83.4
median (d$numHTeeth,na.rm=TRUE)
## [1] 23
IQR(d$numHTeeth, na.rm=TRUE)
## [1] 14
## Agegroups by Number of Healthy Teeth, apply on the entire data (d), not d3(non missing PerioDontal)
# modify the age group to 40+ only,
agebreaks <- c(39,58,77,78)
agelabels <- c("40-58","59-77","78+")
library(data.table)
setDT(d)[ , agegroups := cut(AGE,
breaks = agebreaks,
right = FALSE,
labels = agelabels)]
levels (d$agegroups)
## [1] "40-58" "59-77" "78+"
## how many people are in the database again
d %>% summarise(n())
## n()
## 1 16109
table (d$numHTeeth)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 1194 173 198 203 210 194 239 196 200 230 208 245 271 273 292
## 15 16 17 18 19 20 21 22 23 24 25 26 27 28
## 297 350 361 430 420 488 545 714 682 922 1065 978 1311 3127
summary(d$numHTeeth) ## 93 people of NA, from a total of 16,109
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 13.0 23.0 19.2 27.0 28.0 93
### Mean STD, variance and Median for Num of Healthy Teeth, by Agegroups
d %>% group_by (agegroups) %>% summarise(n() )
## # A tibble: 4 x 2
## agegroups `n()`
## <fctr> <int>
## 1 40-58 7951
## 2 59-77 6926
## 3 78+ 232
## 4 <NA> 1000
d %>% group_by (agegroups) %>% summarise(mean(numHTeeth, na.rm=TRUE), sd(numHTeeth))
## # A tibble: 4 x 3
## agegroups `mean(numHTeeth, na.rm = TRUE)` `sd(numHTeeth)`
## <fctr> <dbl> <dbl>
## 1 40-58 23.6 NaN
## 2 59-77 16.0 NaN
## 3 78+ 10.0 9.02
## 4 <NA> 7.99 NaN
d %>% group_by (agegroups) %>% summarise(median(numHTeeth, na.rm=TRUE), IQR(numHTeeth, na.rm=TRUE) )
## # A tibble: 4 x 3
## agegroups `median(numHTeeth, na.rm = TRUE)` `IQR(numHTeeth, na.rm = TRU~
## <fctr> <dbl> <dbl>
## 1 40-58 26.0 6.00
## 2 59-77 18.0 16.0
## 3 78+ 8.00 18.0
## 4 <NA> 5.00 14.0
d %>% group_by (agegroups) %>% summarise(min(numHTeeth, na.rm=TRUE), max(numHTeeth, na.rm=TRUE))
## # A tibble: 4 x 3
## agegroups `min(numHTeeth, na.rm = TRUE)` `max(numHTeeth, na.rm = TRUE)`
## <fctr> <dbl> <dbl>
## 1 40-58 0 28.0
## 2 59-77 0 28.0
## 3 78+ 0 28.0
## 4 <NA> 0 28.0
d4<-d %>% filter (!is.na(numHTeeth)) ## to filter out the missing outcome, N= about 1000 people filtered out, now N=16016
# how many people are in the database again
d4 %>% summarise(n())
## n()
## 1 16016
table (d4$numHTeeth)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 1194 173 198 203 210 194 239 196 200 230 208 245 271 273 292
## 15 16 17 18 19 20 21 22 23 24 25 26 27 28
## 297 350 361 430 420 488 545 714 682 922 1065 978 1311 3127
## num of Healthy Teeth, Shape and Skewness, # histogram
ggplot (d4, aes( x= numHTeeth))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## numb of Healthy Teeth by Gender, histogram
ggplot (d4, aes(x=numHTeeth)) + geom_histogram() +
facet_wrap (~SEX)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## looks like the 0 and 28 are outliers, let's exlude them and re-check the distributition and skewness
d4_3 <- d4 %>% filter (numHTeeth<=27 & numHTeeth>=1)
table (d4_3$numHTeeth)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 173 198 203 210 194 239 196 200 230 208 245 271 273 292 297
## 16 17 18 19 20 21 22 23 24 25 26 27
## 350 361 430 420 488 545 714 682 922 1065 978 1311
## histogram, looks better after exlusion
# Outliers excluded now, num of Healthy Teeth, Shape and Skewness
ggplot (d4_3, aes( x= numHTeeth))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# check histogram again after exclusion of outliers
ggplot (d4_3, aes( x= numHTeeth))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## numb of Healthy Teeth by Gender, histogram
ggplot (d4_3, aes(x=numHTeeth)) + geom_histogram() +
facet_wrap (~SEX)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## since data is right skewed, use transformation
# Transforming Data - Data Analysis with R
# https://www.youtube.com/watch?v=0L3Obq4FSVQ
d5<- d4_3 %>% mutate (log10_HTeethN = log10(numHTeeth+0.3)) ## 0.3 added so that there is no missing value
table (d5$log10_HTeethN)
##
## 0.113943352306837 0.361727836017593 0.518513939877887 0.633468455579587
## 173 198 203 210
## 0.724275869600789 0.799340549453582 0.863322860120456 0.919078092376074
## 194 239 196 200
## 0.968482948553935 1.01283722470517 1.05307844348342 1.0899051114394
## 230 208 245 271
## 1.12385164096709 1.15533603746506 1.1846914308176 1.21218760440396
## 273 292 297 350
## 1.2380461031288 1.26245108973043 1.28555730900777 1.30749603791321
## 361 430 420 488
## 1.32837960343874 1.34830486304816 1.36735592102602 1.38560627359831
## 545 714 682 922
## 1.40312052117582 1.41995574848976 1.43616264704076
## 1065 978 1311
ggplot (d5, aes( x= log10_HTeethN))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## still right skewed after the log 10 transformation
## log E transformation is not any better, also tried
## try square root transformation
d5TEMP<- d4_3 %>% mutate (sqrt_numHTeeth = sqrt(numHTeeth))
table (d5TEMP$sqrt_numHTeeth)
##
## 1 1.4142135623731 1.73205080756888 2
## 173 198 203 210
## 2.23606797749979 2.44948974278318 2.64575131106459 2.82842712474619
## 194 239 196 200
## 3 3.16227766016838 3.3166247903554 3.46410161513775
## 230 208 245 271
## 3.60555127546399 3.74165738677394 3.87298334620742 4
## 273 292 297 350
## 4.12310562561766 4.24264068711928 4.35889894354067 4.47213595499958
## 361 430 420 488
## 4.58257569495584 4.69041575982343 4.79583152331272 4.89897948556636
## 545 714 682 922
## 5 5.09901951359278 5.19615242270663
## 1065 978 1311
ggplot (d5TEMP, aes( x= sqrt_numHTeeth))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## loge transformation, did not use
d4_5<- d4_3 %>% mutate (loge_numHTeeth = log(numHTeeth))
table (d4_5$loge_numHTeeth)
##
## 0 0.693147180559945 1.09861228866811 1.38629436111989
## 173 198 203 210
## 1.6094379124341 1.79175946922805 1.94591014905531 2.07944154167984
## 194 239 196 200
## 2.19722457733622 2.30258509299405 2.39789527279837 2.484906649788
## 230 208 245 271
## 2.56494935746154 2.63905732961526 2.70805020110221 2.77258872223978
## 273 292 297 350
## 2.83321334405622 2.89037175789616 2.94443897916644 2.99573227355399
## 361 430 420 488
## 3.04452243772342 3.09104245335832 3.13549421592915 3.17805383034795
## 545 714 682 922
## 3.2188758248682 3.25809653802148 3.29583686600433
## 1065 978 1311
ggplot (d4_5, aes( x= loge_numHTeeth))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Dependent VR as -- Number of Healthy teeth --linear model
## used the transformed data, not the original VR
library(data.table)
require(broom)
## Loading required package: broom
mod1 <-lm (log10_HTeethN ~ YEAR+factor(SEX), data=d5)
mod1
##
## Call:
## lm(formula = log10_HTeethN ~ YEAR + factor(SEX), data = d5)
##
## Coefficients:
## (Intercept) YEAR factor(SEX)2
## -14.46044 0.00781 -0.01539
summary(mod1)## to view the coefficient and P value
##
## Call:
## lm(formula = log10_HTeethN ~ YEAR + factor(SEX), data = d5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1236 -0.0668 0.1140 0.1812 0.2374
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -14.46044 5.00779 -2.89 0.0039 **
## YEAR 0.00781 0.00249 3.13 0.0017 **
## factor(SEX)2 -0.01539 0.00517 -2.98 0.0029 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.278 on 11692 degrees of freedom
## Multiple R-squared: 0.00163, Adjusted R-squared: 0.00146
## F-statistic: 9.55 on 2 and 11692 DF, p-value: 0.000072
## augment(mod1) not to use, too many lines (1 person per line)
# Parallel lines on the scatterplot
# d5 + geom_line(data=augment(mod1), aes(y=.fitted, color=factor.year))
mod2<-lm (log10_HTeethN~ factor(SEX) + agegroups + HE_BMI.x +BS1_1.x, data=d5 )
mod2
##
## Call:
## lm(formula = log10_HTeethN ~ factor(SEX) + agegroups + HE_BMI.x +
## BS1_1.x, data = d5)
##
## Coefficients:
## (Intercept) factor(SEX)2 agegroups59-77 agegroups78+
## 1.20222 -0.02464 -0.17554 -0.30590
## HE_BMI.x BS1_1.x
## 0.00427 0.01583
summary(mod2)
##
## Call:
## lm(formula = log10_HTeethN ~ factor(SEX) + agegroups + HE_BMI.x +
## BS1_1.x, data = d5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2422 -0.0565 0.0633 0.1285 0.4221
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.202219 0.020601 58.36 < 0.0000000000000002 ***
## factor(SEX)2 -0.024639 0.005862 -4.20 0.0000265582 ***
## agegroups59-77 -0.175540 0.004687 -37.45 < 0.0000000000000002 ***
## agegroups78+ -0.305904 0.019366 -15.80 < 0.0000000000000002 ***
## HE_BMI.x 0.004272 0.000739 5.78 0.0000000077 ***
## BS1_1.x 0.015833 0.004473 3.54 0.0004 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.243 on 10896 degrees of freedom
## (793 observations deleted due to missingness)
## Multiple R-squared: 0.126, Adjusted R-squared: 0.126
## F-statistic: 315 on 5 and 10896 DF, p-value: <0.0000000000000002
# augment(mod2)
## adj R square 0.225, ## smoker = BS1_1
# plot out to see if the regression conditions are met
plot (mod2)
## test for auto corrlated/non-independence of erros
require (lmtest)
dwtest(mod2) ##conclusion: there is strong correlattion
##
## Durbin-Watson test
##
## data: mod2
## DW = 2, p-value = 0.00002
## alternative hypothesis: true autocorrelation is greater than 0
# ## test for constant variance
# ncvTest(mod7) ## P<.0001, conclusion: there is strong covariance
mod4<-lm (log10_HTeethN~ SEX+ agegroups+ YEAR+ BS1_1.x+ HE_BMI.x+ HE_HP.x+HE_DM.x+ HE_hCHOL.x, data=d5)
## next, to add more VR in
mod4
##
## Call:
## lm(formula = log10_HTeethN ~ SEX + agegroups + YEAR + BS1_1.x +
## HE_BMI.x + HE_HP.x + HE_DM.x + HE_hCHOL.x, data = d5)
##
## Coefficients:
## (Intercept) SEX agegroups59-77 agegroups78+
## -25.82157 -0.03031 -0.16978 -0.30401
## YEAR BS1_1.x HE_BMI.x HE_HP.x
## 0.01346 0.01912 0.00531 -0.00639
## HE_DM.x HE_hCHOL.x
## -0.01200 0.01185
summary(mod4)
##
## Call:
## lm(formula = log10_HTeethN ~ SEX + agegroups + YEAR + BS1_1.x +
## HE_BMI.x + HE_HP.x + HE_DM.x + HE_hCHOL.x, data = d5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2691 -0.0543 0.0616 0.1308 0.4202
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -25.821569 4.666772 -5.53 0.000000032247 ***
## SEX -0.030307 0.006424 -4.72 0.000002417533 ***
## agegroups59-77 -0.169782 0.005080 -33.42 < 0.0000000000000002 ***
## agegroups78+ -0.304014 0.021037 -14.45 < 0.0000000000000002 ***
## YEAR 0.013461 0.002322 5.80 0.000000006966 ***
## BS1_1.x 0.019115 0.005208 3.67 0.00024 ***
## HE_BMI.x 0.005311 0.000804 6.61 0.000000000041 ***
## HE_HP.x -0.006390 0.003094 -2.07 0.03891 *
## HE_DM.x -0.012005 0.003411 -3.52 0.00043 ***
## HE_hCHOL.x 0.011849 0.006452 1.84 0.06634 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.241 on 10178 degrees of freedom
## (1507 observations deleted due to missingness)
## Multiple R-squared: 0.13, Adjusted R-squared: 0.129
## F-statistic: 169 on 9 and 10178 DF, p-value: <0.0000000000000002
## age as continous VR, also the other redundent VR are got rid of
mod6<- lm (numHTeeth ~ factor(SEX) + agegroups + HE_HP.x + HE_BMI.x+
HE_HP.x +HE_DM.x+ HE_hCHOL.x + BS1_1.x
, data=d)
mod6
##
## Call:
## lm(formula = numHTeeth ~ factor(SEX) + agegroups + HE_HP.x +
## HE_BMI.x + HE_HP.x + HE_DM.x + HE_hCHOL.x + BS1_1.x, data = d)
##
## Coefficients:
## (Intercept) factor(SEX)2 agegroups59-77 agegroups78+
## 19.071 -1.060 -7.393 -13.137
## HE_HP.x HE_BMI.x HE_DM.x HE_hCHOL.x
## -0.334 0.191 -0.607 0.570
## BS1_1.x
## 0.828
summary(mod6)
##
## Call:
## lm(formula = numHTeeth ~ factor(SEX) + agegroups + HE_HP.x +
## HE_BMI.x + HE_HP.x + HE_DM.x + HE_hCHOL.x + BS1_1.x, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.24 -3.76 1.99 4.46 17.83
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.0710 0.5850 32.60 < 0.0000000000000002 ***
## factor(SEX)2 -1.0604 0.1698 -6.25 0.000000000434 ***
## agegroups59-77 -7.3931 0.1369 -54.02 < 0.0000000000000002 ***
## agegroups78+ -13.1370 0.5548 -23.68 < 0.0000000000000002 ***
## HE_HP.x -0.3341 0.0824 -4.05 0.000050854767 ***
## HE_BMI.x 0.1907 0.0216 8.84 < 0.0000000000000002 ***
## HE_DM.x -0.6074 0.0936 -6.49 0.000000000089 ***
## HE_hCHOL.x 0.5698 0.1748 3.26 0.0011 **
## BS1_1.x 0.8283 0.1365 6.07 0.000000001327 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.51 on 13887 degrees of freedom
## (2213 observations deleted due to missingness)
## Multiple R-squared: 0.226, Adjusted R-squared: 0.225
## F-statistic: 505 on 8 and 13887 DF, p-value: <0.0000000000000002