title: " WORKED CODES Oral Analysis" “1-OralAnalysis Final” author: “Hui Han” output: html_document

# To do: 1-- teeth Categorical VR exploration
#2- Teeth Categorical VR to cross tab age category, gender, co-morbidity
# 3- Teeth Categorical VR to visulazation
# Logistic Regression
#4-- Teeth Numerical VR to explore
# teetch numerical VR to visualize
# Covariate to explore
# linear regression

## Covariates: age, agegroup, Sex, HTN, DM, High cholesterol, region, etc

### PLAN- 1) to create age category  2) covariates creation  3) Logistic Regression  4) Linear regression

# HE_HP.x,HE_HP.y, HE.DM.x, HE_DM.y, HE_BMI.x,HE_BMI.y, HE_obe.x, HE_obe.y,HE_hCHOL.x,HE_hCHOL.x, HE_hCHOL.y
## smoker = BS1_1,  "Control-- 1 less than 100 pieces, 2 more than 100, 3 never smoked, 8 not applicable, 9 dont know") 
# install.packages('lmtest')  ## for linear model test
# install.packages('gridExtra') ## for log transformation
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.1     v dplyr   0.7.4
## v tidyr   0.7.2     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0
## -- Conflicts ---------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(gtools)
library(haven)
library(ggplot2)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(dplyr)
library(RColorBrewer)  #  ggplot2 color
library (lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library (gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
getwd()
## [1] "C:/Users/graci/Documents/698-GH Final May6/Project"
dataKH<-read_csv ('output/finalData/KHANES4yrsApr14.csv')
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   ID = col_character(),
##   PSU = col_character(),
##   WT_HS = col_double(),
##   WT_ITV = col_double(),
##   WT_EX = col_double(),
##   WT_PFT = col_double(),
##   WT_HM = col_double(),
##   WT_EX1 = col_double(),
##   WT_NTR = col_double(),
##   WT_ITVEX = col_double(),
##   WT_TOT = col_double(),
##   WT_PFNT = col_double(),
##   WT_HMNT = col_double(),
##   WT_EX1NT = col_double(),
##   WT_EX1PF = col_double(),
##   WT_EX1HM = col_double(),
##   WT_PFHM = col_double(),
##   WT_TOT1 = col_double(),
##   WT_EX1PFNT = col_double(),
##   WT_EX1HMNT = col_double()
##   # ... with 256 more columns
## )
## See spec(...) for full column specifications.
dim(dataKH)
## [1] 16109   966
## select only the variables of interest
dtemp<- dataKH %>% select ('ID','YEAR', 'REGION','SEX','AGE','BS1_1.x','HE_BMI.x', 'HE_HP.x','HE_DM.x', 'HE_hCHOL.x',
                      'numHTeeth', 'presentTeeth','NPERODONTAL', 'gumTrouble','newPero','PERO','PER0two')
dim(dtemp)
## [1] 16109    17
# write to the output folder
# write_csv(dtemp,"output/FinalData/KHanesFinalMay6.csv")  ##16109*17VR
d<- read_csv ("output/FinalData/KHanesFinalMay6.csv") 
## Parsed with column specification:
## cols(
##   ID = col_character(),
##   YEAR = col_integer(),
##   REGION = col_integer(),
##   SEX = col_integer(),
##   AGE = col_integer(),
##   BS1_1.x = col_integer(),
##   HE_BMI.x = col_double(),
##   HE_HP.x = col_integer(),
##   HE_DM.x = col_integer(),
##   HE_hCHOL.x = col_integer(),
##   numHTeeth = col_integer(),
##   presentTeeth = col_integer(),
##   NPERODONTAL = col_integer(),
##   gumTrouble = col_integer(),
##   newPero = col_integer(),
##   PERO = col_integer(),
##   PER0two = col_integer()
## )
summary(d$newPero)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    1.00    1.00    0.75    1.00    1.00    4483
## pie chart, percentage 
d2<-d %>% filter (!is.na(newPero))  ## to filter out the missing outcome
dim(d2)
## [1] 11626    17
table (d2$YEAR)
## 
## 2008 2009 2010 
## 2392 4954 4280
table (d2$newPero, d2$YEAR)
##    
##     2008 2009 2010
##   0  723 1433  745
##   1 1669 3521 3535
table (d2$newPero, d2$SEX)
##    
##        1    2
##   0 1269 1632
##   1 3755 4970
## proportions instead of counts
options (scipen=999, digits=3)
tab_cnt3 <- table (d2$YEAR, d2$newPero)
tab_cnt3
##       
##           0    1
##   2008  723 1669
##   2009 1433 3521
##   2010  745 3535
prop.table (tab_cnt3)
##       
##             0      1
##   2008 0.0622 0.1436
##   2009 0.1233 0.3029
##   2010 0.0641 0.3041
prop.table (tab_cnt3, 1)
##       
##            0     1
##   2008 0.302 0.698
##   2009 0.289 0.711
##   2010 0.174 0.826
# ggplot (d2, aes(x=YEAR, fill=newPero))+
#   geom_bar(position='fill')+
#   ylab('proportion')
# ## Plot failed, everyone is 100% why?? maybe newpero need to be categorical VR (now integeter)

d3<-d2 %>% 
  mutate (newPero2= as.factor(newPero))

class (d3$newPero2)
## [1] "factor"
ggplot(d, aes(newPero)) +geom_bar (color = "#FF6666")
## Warning: Removed 4483 rows containing non-finite values (stat_count).

  ## try EDA, faceted barchart, by Year, by Gender (SEX)
  ggplot(d3, aes(x=YEAR))+
    geom_bar()+
    facet_wrap((~newPero2))

  ## this worked as count
  
   d3 %>% 
    mutate (SEX2= as.factor(SEX)) %>%
     ggplot(aes(newPero))+
    geom_bar(fill='blue', colour='red') +
    facet_wrap(~ SEX2)  

  # next,to do proportion, prepare data first, then to plot
 df3<-group_by (d3, .dots=c('YEAR','newPero')) %>% 
  summarize(counts=n()) %>% 
  mutate (perc= (counts/sum(counts))*100) %>% 
  arrange(desc(perc))

dim(df3)  ## 4 by 4 table 
## [1] 6 4
class (df3$YEAR)
## [1] "integer"
class (df3$newPero)
## [1] "integer"
tbl_cnt<-table(df3$YEAR, df3$newPero)
prop.table(tbl_cnt,1)
##       
##          0   1
##   2008 0.5 0.5
##   2009 0.5 0.5
##   2010 0.5 0.5
# prepare the pie chart plot, NewPero Outcome (proportion) by YEAR, 
ggplot (df3, aes('', counts)) +
  geom_col (
    position ='fill',
    color='black', 
    width=1, 
    aes(fill=factor(newPero))
    ) +
  facet_wrap (~YEAR, labeller = 'label_both') + 
  geom_label (
    aes(label = paste0(round(perc), "%"), group = factor(newPero)),
                                 position = position_fill (vjust=0.5)
    , color='black',                                      size=5, show.legend = FALSE) +
                                coord_polar (theta='y')

#PERO DONTAL DISEASE OUTCOME BY GENDER
# to prepare faceted pie chart 

# summary(d2)
class (d2$newPero)
## [1] "integer"
df<-group_by (d2, .dots=c('SEX','newPero')) %>% 
  summarize(counts=n()) %>% 
  mutate (perc= (counts/sum(counts))*100) %>% 
  arrange(desc(perc))

dim(df)  ## 4 by 4 table 
## [1] 4 4
class (df$SEX)
## [1] "integer"
class (df$newPero)
## [1] "integer"
# prepare the plot, NewPero Outcome by SEX
ggplot (df, aes('', counts)) +
  geom_col (
    position ='fill',
    color='black', 
    width=1, 
    aes(fill=factor(newPero))
    ) +
  facet_wrap (~SEX, labeller = 'label_both') + 
  geom_label (
    aes(label = paste0(round(perc), "%"), group = factor(newPero)),
                                 position = position_fill (vjust=0.5)
    , color='black',                                      size=5, show.legend = FALSE) +
                                coord_polar (theta='y')

## AGE  and periodontal disease
# table (d$AGE)
# table (d$newPero,d$AGE)

table (d3$AGE)
## 
##  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57 
## 397 364 347 311 294 313 329 349 297 350 329 378 315 349 286 272 273 268 
##  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75 
## 284 264 282 303 299 296 220 238 289 301 288 307 246 269 262 225 214 190 
##  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93 
## 191 164 117 102  95  89  58  53  31  33  32  18  15  11   5   6   3   3 
##  94  95 
##   1   1
# table (d3$newPero,d3$AGE)
## Agegroups by PerioDontal Disease
# modify the age group to 40+ only (everyone is over 40),to use d3, not d
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
agebreaks <- c(39,58,77,78)
agelabels <- c("40-58","59-77","78+")
setDT(d3)[ , agegroups := cut(AGE, 
                                breaks = agebreaks, 
                                right = FALSE, 
                                labels = agelabels)]
levels (d3$agegroups)
## [1] "40-58" "59-77" "78+"
ggplot(d3, aes(x=agegroups))+geom_bar()

table (d3$agegroups)
## 
## 40-58 59-77   78+ 
##  5821  4968   164
table (d3$agegroups, d3$newPero)
##        
##            0    1
##   40-58 1077 4744
##   59-77 1385 3583
##   78+     77   87
  # next,pie chart, proportion of PerioDontal Disease by Age groups, prepar datafirst
 dfa<-group_by (d3, .dots=c('agegroups','newPero')) %>% 
  summarize(counts=n()) %>% 
  mutate (perc= (counts/sum(counts))*100) %>% 
  arrange(desc(perc))

dim(dfa)  ## 4 by 4 table 
## [1] 8 4
class (dfa$agegroups)
## [1] "factor"
class (dfa$newPero)
## [1] "integer"
tbl_cnt<-table(dfa$agegroups, dfa$newPero)
prop.table(tbl_cnt,1)
##        
##           0   1
##   40-58 0.5 0.5
##   59-77 0.5 0.5
##   78+   0.5 0.5
# prepare the pie chart plot, NewPero Outcome (proportion) by agegroups, 
ggplot (dfa, aes('', counts)) +
  geom_col (
    position ='fill',
    color='black', 
    width=1, 
    aes(fill=factor(newPero))
    ) +
  facet_wrap (~agegroups, labeller = 'label_both') + 
  geom_label (
    aes(label = paste0(round(perc), "%"), group = factor(newPero)),
                                 position = position_fill (vjust=0.5)
    , color='black',                                      size=5, show.legend = FALSE) +
                                coord_polar (theta='y')

## Logistic Regression (Bionomial logistic regression)

class (d3$newPero)## very importantly, the dependent VR just be coded numerically
## [1] "integer"
# see  Udemy course-  Rogdan Anastasiei
fitglm = glm (newPero ~ SEX+ agegroups+YEAR, data=d3, family = binomial (link = 'logit') )
summary(fitglm)
## 
## Call:
## glm(formula = newPero ~ SEX + agegroups + YEAR, family = binomial(link = "logit"), 
##     data = d3)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.015   0.531   0.641   0.768   1.364  
## 
## Coefficients:
##                 Estimate Std. Error z value            Pr(>|z|)    
## (Intercept)    -820.2946    62.0743  -13.21 <0.0000000000000002 ***
## SEX               0.0798     0.0464    1.72               0.085 .  
## agegroups59-77   -0.5391     0.0467  -11.55 <0.0000000000000002 ***
## agegroups78+     -1.4204     0.1616   -8.79 <0.0000000000000002 ***
## YEAR              0.4090     0.0309   13.24 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 11861  on 10952  degrees of freedom
## Residual deviance: 11503  on 10948  degrees of freedom
##   (673 observations deleted due to missingness)
## AIC: 11513
## 
## Number of Fisher Scoring iterations: 4
## proposed model have a lower deviance (11503) vs null deviance (saturated model)

expb <- exp (coef(fitglm))
print(expb)
##    (Intercept)            SEX agegroups59-77   agegroups78+           YEAR 
##          0.000          1.083          0.583          0.242          1.505
## Antilogrithym (chance that a person have PerioDontal Disease)

## print out the confidence interval of these variables (too many decimal places)
intexp <- exp(confint(fitglm))
## Waiting for profiling to be done...
print(intexp)
##                2.5 %
## (Intercept)    0.000
## SEX            0.989
## agegroups59-77 0.532
## agegroups78+   0.176
## YEAR           1.417
##                                                                                                                                                                                                                                                                                                                              97.5 %
## (Intercept)    0.000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000336
## SEX            1.186001412384219522166972637933213263750076293945312500000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## agegroups59-77 0.639084781620910979960115128051256760954856872558593750000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## agegroups78+   0.332159671258197841670067873565130867063999176025390625000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## YEAR           1.599362369415308293696398322936147451400756835937500000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
### Model2-Logistic Regression, to add more co-variate

fitglm2 = glm (newPero ~  agegroups+ YEAR+ BS1_1.x+ HE_BMI.x+ HE_HP.x+HE_DM.x+ HE_hCHOL.x,
              data=d3, family = binomial (link = 'logit') )
## smoking (BS1_1)
summary (fitglm2)
## 
## Call:
## glm(formula = newPero ~ agegroups + YEAR + BS1_1.x + HE_BMI.x + 
##     HE_HP.x + HE_DM.x + HE_hCHOL.x, family = binomial(link = "logit"), 
##     data = d3)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.227   0.488   0.630   0.757   1.385  
## 
## Coefficients:
##                  Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)    -848.21984   65.43143  -12.96 < 0.0000000000000002 ***
## agegroups59-77   -0.49271    0.05164   -9.54 < 0.0000000000000002 ***
## agegroups78+     -1.22004    0.18231   -6.69       0.000000000022 ***
## YEAR              0.42245    0.03257   12.97 < 0.0000000000000002 ***
## BS1_1.x           0.06274    0.04029    1.56                0.119    
## HE_BMI.x          0.04291    0.00832    5.15       0.000000254083 ***
## HE_HP.x          -0.01635    0.03120   -0.52                0.600    
## HE_DM.x          -0.15451    0.03401   -4.54       0.000005546497 ***
## HE_hCHOL.x        0.16218    0.06916    2.35                0.019 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10913  on 10156  degrees of freedom
## Residual deviance: 10536  on 10148  degrees of freedom
##   (1469 observations deleted due to missingness)
## AIC: 10554
## 
## Number of Fisher Scoring iterations: 4
expb2 <- exp(coef(fitglm2))
print(expb2)
##    (Intercept) agegroups59-77   agegroups78+           YEAR        BS1_1.x 
##          0.000          0.611          0.295          1.526          1.065 
##       HE_BMI.x        HE_HP.x        HE_DM.x     HE_hCHOL.x 
##          1.044          0.984          0.857          1.176
## Dependent VR as -- Number of teeth --Exploratory Data Analysis
## dependent variable check
class(d$numHTeeth)
## [1] "integer"
#numerical VR, to explore --Number of Healthy Teeth
class(d$numHTeeth)
## [1] "integer"
table (d$numHTeeth)
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
## 1194  173  198  203  210  194  239  196  200  230  208  245  271  273  292 
##   15   16   17   18   19   20   21   22   23   24   25   26   27   28 
##  297  350  361  430  420  488  545  714  682  922 1065  978 1311 3127
# count(is.na(d$numHTeeth))
mean (d$numHTeeth, na.rm=TRUE)
## [1] 19.2
sd(d$numHTeeth,na.rm=TRUE)
## [1] 9.13
var(d$numHTeeth,na.rm=TRUE)
## [1] 83.4
median (d$numHTeeth,na.rm=TRUE)
## [1] 23
IQR(d$numHTeeth, na.rm=TRUE)
## [1] 14
## Agegroups by Number of Healthy Teeth, apply on the entire data (d), not d3(non missing PerioDontal)
# modify the age group to 40+ only,
agebreaks <- c(39,58,77,78)
agelabels <- c("40-58","59-77","78+")

library(data.table)
setDT(d)[ , agegroups := cut(AGE, 
                                breaks = agebreaks, 
                                right = FALSE, 
                                labels = agelabels)]
levels (d$agegroups)
## [1] "40-58" "59-77" "78+"
## how many people are in the database again
d %>% summarise(n())
##     n()
## 1 16109
table (d$numHTeeth)
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
## 1194  173  198  203  210  194  239  196  200  230  208  245  271  273  292 
##   15   16   17   18   19   20   21   22   23   24   25   26   27   28 
##  297  350  361  430  420  488  545  714  682  922 1065  978 1311 3127
summary(d$numHTeeth)  ## 93 people of NA, from a total of 16,109
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0    13.0    23.0    19.2    27.0    28.0      93
### Mean STD, variance and Median for Num of Healthy Teeth, by Agegroups
d %>%   group_by (agegroups) %>%   summarise(n() )
## # A tibble: 4 x 2
##   agegroups `n()`
##   <fctr>    <int>
## 1 40-58      7951
## 2 59-77      6926
## 3 78+         232
## 4 <NA>       1000
d %>%   group_by (agegroups) %>%   summarise(mean(numHTeeth, na.rm=TRUE), sd(numHTeeth))
## # A tibble: 4 x 3
##   agegroups `mean(numHTeeth, na.rm = TRUE)` `sd(numHTeeth)`
##   <fctr>                              <dbl>           <dbl>
## 1 40-58                               23.6           NaN   
## 2 59-77                               16.0           NaN   
## 3 78+                                 10.0             9.02
## 4 <NA>                                 7.99          NaN
d %>%   group_by (agegroups) %>%   summarise(median(numHTeeth, na.rm=TRUE), IQR(numHTeeth, na.rm=TRUE) )
## # A tibble: 4 x 3
##   agegroups `median(numHTeeth, na.rm = TRUE)` `IQR(numHTeeth, na.rm = TRU~
##   <fctr>                                <dbl>                        <dbl>
## 1 40-58                                 26.0                          6.00
## 2 59-77                                 18.0                         16.0 
## 3 78+                                    8.00                        18.0 
## 4 <NA>                                   5.00                        14.0
d %>%   group_by (agegroups) %>%   summarise(min(numHTeeth, na.rm=TRUE), max(numHTeeth, na.rm=TRUE))
## # A tibble: 4 x 3
##   agegroups `min(numHTeeth, na.rm = TRUE)` `max(numHTeeth, na.rm = TRUE)`
##   <fctr>                             <dbl>                          <dbl>
## 1 40-58                                  0                           28.0
## 2 59-77                                  0                           28.0
## 3 78+                                    0                           28.0
## 4 <NA>                                   0                           28.0
d4<-d %>% filter (!is.na(numHTeeth))  ## to filter out the missing outcome, N= about 1000 people filtered out, now N=16016

# how many people are in the database again
d4 %>%     summarise(n())
##     n()
## 1 16016
table (d4$numHTeeth)
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
## 1194  173  198  203  210  194  239  196  200  230  208  245  271  273  292 
##   15   16   17   18   19   20   21   22   23   24   25   26   27   28 
##  297  350  361  430  420  488  545  714  682  922 1065  978 1311 3127
## num of Healthy Teeth, Shape and Skewness, # histogram
ggplot (d4, aes( x= numHTeeth))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## numb of Healthy Teeth by Gender, histogram
ggplot (d4, aes(x=numHTeeth)) + geom_histogram() +
  facet_wrap (~SEX)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## looks like the 0 and 28 are outliers, let's exlude them and re-check the distributition and skewness
d4_3 <- d4 %>%  filter (numHTeeth<=27 & numHTeeth>=1)
table (d4_3$numHTeeth)
## 
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
##  173  198  203  210  194  239  196  200  230  208  245  271  273  292  297 
##   16   17   18   19   20   21   22   23   24   25   26   27 
##  350  361  430  420  488  545  714  682  922 1065  978 1311
## histogram, looks better after exlusion 
# Outliers excluded now, num of Healthy Teeth, Shape and Skewness
ggplot (d4_3, aes( x= numHTeeth))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# check histogram again after exclusion of outliers
ggplot (d4_3, aes( x= numHTeeth))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## numb of Healthy Teeth by Gender, histogram
ggplot (d4_3, aes(x=numHTeeth)) + geom_histogram() +
  facet_wrap (~SEX)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## since data is right skewed, use transformation
# Transforming Data - Data Analysis with R
 # https://www.youtube.com/watch?v=0L3Obq4FSVQ

d5<- d4_3 %>% mutate (log10_HTeethN = log10(numHTeeth+0.3))  ## 0.3 added so that there is no missing value
table (d5$log10_HTeethN)
## 
## 0.113943352306837 0.361727836017593 0.518513939877887 0.633468455579587 
##               173               198               203               210 
## 0.724275869600789 0.799340549453582 0.863322860120456 0.919078092376074 
##               194               239               196               200 
## 0.968482948553935  1.01283722470517  1.05307844348342   1.0899051114394 
##               230               208               245               271 
##  1.12385164096709  1.15533603746506   1.1846914308176  1.21218760440396 
##               273               292               297               350 
##   1.2380461031288  1.26245108973043  1.28555730900777  1.30749603791321 
##               361               430               420               488 
##  1.32837960343874  1.34830486304816  1.36735592102602  1.38560627359831 
##               545               714               682               922 
##  1.40312052117582  1.41995574848976  1.43616264704076 
##              1065               978              1311
ggplot (d5, aes( x= log10_HTeethN))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## still right skewed after the log 10 transformation
## log E transformation is not any better, also tried
## try square root transformation
d5TEMP<- d4_3 %>% mutate (sqrt_numHTeeth = sqrt(numHTeeth)) 
table (d5TEMP$sqrt_numHTeeth)
## 
##                1  1.4142135623731 1.73205080756888                2 
##              173              198              203              210 
## 2.23606797749979 2.44948974278318 2.64575131106459 2.82842712474619 
##              194              239              196              200 
##                3 3.16227766016838  3.3166247903554 3.46410161513775 
##              230              208              245              271 
## 3.60555127546399 3.74165738677394 3.87298334620742                4 
##              273              292              297              350 
## 4.12310562561766 4.24264068711928 4.35889894354067 4.47213595499958 
##              361              430              420              488 
## 4.58257569495584 4.69041575982343 4.79583152331272 4.89897948556636 
##              545              714              682              922 
##                5 5.09901951359278 5.19615242270663 
##             1065              978             1311
ggplot (d5TEMP, aes( x= sqrt_numHTeeth))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## loge transformation, did not use
d4_5<- d4_3 %>% mutate (loge_numHTeeth = log(numHTeeth))
table (d4_5$loge_numHTeeth)
## 
##                 0 0.693147180559945  1.09861228866811  1.38629436111989 
##               173               198               203               210 
##   1.6094379124341  1.79175946922805  1.94591014905531  2.07944154167984 
##               194               239               196               200 
##  2.19722457733622  2.30258509299405  2.39789527279837    2.484906649788 
##               230               208               245               271 
##  2.56494935746154  2.63905732961526  2.70805020110221  2.77258872223978 
##               273               292               297               350 
##  2.83321334405622  2.89037175789616  2.94443897916644  2.99573227355399 
##               361               430               420               488 
##  3.04452243772342  3.09104245335832  3.13549421592915  3.17805383034795 
##               545               714               682               922 
##   3.2188758248682  3.25809653802148  3.29583686600433 
##              1065               978              1311
ggplot (d4_5, aes( x= loge_numHTeeth))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Dependent VR as -- Number of Healthy teeth  --linear model
## used the transformed data, not the original VR

library(data.table)
require(broom)
## Loading required package: broom
mod1 <-lm (log10_HTeethN ~ YEAR+factor(SEX), data=d5)
mod1
## 
## Call:
## lm(formula = log10_HTeethN ~ YEAR + factor(SEX), data = d5)
## 
## Coefficients:
##  (Intercept)          YEAR  factor(SEX)2  
##    -14.46044       0.00781      -0.01539
summary(mod1)## to view the coefficient and P value
## 
## Call:
## lm(formula = log10_HTeethN ~ YEAR + factor(SEX), data = d5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1236 -0.0668  0.1140  0.1812  0.2374 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  -14.46044    5.00779   -2.89   0.0039 **
## YEAR           0.00781    0.00249    3.13   0.0017 **
## factor(SEX)2  -0.01539    0.00517   -2.98   0.0029 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.278 on 11692 degrees of freedom
## Multiple R-squared:  0.00163,    Adjusted R-squared:  0.00146 
## F-statistic: 9.55 on 2 and 11692 DF,  p-value: 0.000072
 ## augment(mod1)  not to use, too many lines (1 person per line)
# Parallel lines on the scatterplot
 # d5 + geom_line(data=augment(mod1), aes(y=.fitted, color=factor.year))

mod2<-lm (log10_HTeethN~ factor(SEX) + agegroups + HE_BMI.x +BS1_1.x, data=d5 )
mod2
## 
## Call:
## lm(formula = log10_HTeethN ~ factor(SEX) + agegroups + HE_BMI.x + 
##     BS1_1.x, data = d5)
## 
## Coefficients:
##    (Intercept)    factor(SEX)2  agegroups59-77    agegroups78+  
##        1.20222        -0.02464        -0.17554        -0.30590  
##       HE_BMI.x         BS1_1.x  
##        0.00427         0.01583
summary(mod2) 
## 
## Call:
## lm(formula = log10_HTeethN ~ factor(SEX) + agegroups + HE_BMI.x + 
##     BS1_1.x, data = d5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2422 -0.0565  0.0633  0.1285  0.4221 
## 
## Coefficients:
##                 Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)     1.202219   0.020601   58.36 < 0.0000000000000002 ***
## factor(SEX)2   -0.024639   0.005862   -4.20         0.0000265582 ***
## agegroups59-77 -0.175540   0.004687  -37.45 < 0.0000000000000002 ***
## agegroups78+   -0.305904   0.019366  -15.80 < 0.0000000000000002 ***
## HE_BMI.x        0.004272   0.000739    5.78         0.0000000077 ***
## BS1_1.x         0.015833   0.004473    3.54               0.0004 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.243 on 10896 degrees of freedom
##   (793 observations deleted due to missingness)
## Multiple R-squared:  0.126,  Adjusted R-squared:  0.126 
## F-statistic:  315 on 5 and 10896 DF,  p-value: <0.0000000000000002
# augment(mod2)
## adj R square 0.225,   ## smoker = BS1_1

# plot out to see if the regression conditions are met
plot (mod2)  

## test for auto corrlated/non-independence of erros
require (lmtest)
dwtest(mod2) ##conclusion: there is strong correlattion
## 
##  Durbin-Watson test
## 
## data:  mod2
## DW = 2, p-value = 0.00002
## alternative hypothesis: true autocorrelation is greater than 0
# ## test for constant variance
# ncvTest(mod7)  ## P<.0001, conclusion: there is strong covariance 


mod4<-lm (log10_HTeethN~ SEX+ agegroups+ YEAR+ BS1_1.x+ HE_BMI.x+ HE_HP.x+HE_DM.x+ HE_hCHOL.x, data=d5)
## next, to add more VR in
mod4
## 
## Call:
## lm(formula = log10_HTeethN ~ SEX + agegroups + YEAR + BS1_1.x + 
##     HE_BMI.x + HE_HP.x + HE_DM.x + HE_hCHOL.x, data = d5)
## 
## Coefficients:
##    (Intercept)             SEX  agegroups59-77    agegroups78+  
##      -25.82157        -0.03031        -0.16978        -0.30401  
##           YEAR         BS1_1.x        HE_BMI.x         HE_HP.x  
##        0.01346         0.01912         0.00531        -0.00639  
##        HE_DM.x      HE_hCHOL.x  
##       -0.01200         0.01185
summary(mod4) 
## 
## Call:
## lm(formula = log10_HTeethN ~ SEX + agegroups + YEAR + BS1_1.x + 
##     HE_BMI.x + HE_HP.x + HE_DM.x + HE_hCHOL.x, data = d5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2691 -0.0543  0.0616  0.1308  0.4202 
## 
## Coefficients:
##                  Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)    -25.821569   4.666772   -5.53       0.000000032247 ***
## SEX             -0.030307   0.006424   -4.72       0.000002417533 ***
## agegroups59-77  -0.169782   0.005080  -33.42 < 0.0000000000000002 ***
## agegroups78+    -0.304014   0.021037  -14.45 < 0.0000000000000002 ***
## YEAR             0.013461   0.002322    5.80       0.000000006966 ***
## BS1_1.x          0.019115   0.005208    3.67              0.00024 ***
## HE_BMI.x         0.005311   0.000804    6.61       0.000000000041 ***
## HE_HP.x         -0.006390   0.003094   -2.07              0.03891 *  
## HE_DM.x         -0.012005   0.003411   -3.52              0.00043 ***
## HE_hCHOL.x       0.011849   0.006452    1.84              0.06634 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.241 on 10178 degrees of freedom
##   (1507 observations deleted due to missingness)
## Multiple R-squared:  0.13,   Adjusted R-squared:  0.129 
## F-statistic:  169 on 9 and 10178 DF,  p-value: <0.0000000000000002
## age as continous VR, also the other redundent VR are got rid of
mod6<- lm (numHTeeth ~ factor(SEX) + agegroups + HE_HP.x + HE_BMI.x+
  HE_HP.x +HE_DM.x+  HE_hCHOL.x  + BS1_1.x
  , data=d)
mod6
## 
## Call:
## lm(formula = numHTeeth ~ factor(SEX) + agegroups + HE_HP.x + 
##     HE_BMI.x + HE_HP.x + HE_DM.x + HE_hCHOL.x + BS1_1.x, data = d)
## 
## Coefficients:
##    (Intercept)    factor(SEX)2  agegroups59-77    agegroups78+  
##         19.071          -1.060          -7.393         -13.137  
##        HE_HP.x        HE_BMI.x         HE_DM.x      HE_hCHOL.x  
##         -0.334           0.191          -0.607           0.570  
##        BS1_1.x  
##          0.828
summary(mod6)
## 
## Call:
## lm(formula = numHTeeth ~ factor(SEX) + agegroups + HE_HP.x + 
##     HE_BMI.x + HE_HP.x + HE_DM.x + HE_hCHOL.x + BS1_1.x, data = d)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -25.24  -3.76   1.99   4.46  17.83 
## 
## Coefficients:
##                Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)     19.0710     0.5850   32.60 < 0.0000000000000002 ***
## factor(SEX)2    -1.0604     0.1698   -6.25       0.000000000434 ***
## agegroups59-77  -7.3931     0.1369  -54.02 < 0.0000000000000002 ***
## agegroups78+   -13.1370     0.5548  -23.68 < 0.0000000000000002 ***
## HE_HP.x         -0.3341     0.0824   -4.05       0.000050854767 ***
## HE_BMI.x         0.1907     0.0216    8.84 < 0.0000000000000002 ***
## HE_DM.x         -0.6074     0.0936   -6.49       0.000000000089 ***
## HE_hCHOL.x       0.5698     0.1748    3.26               0.0011 ** 
## BS1_1.x          0.8283     0.1365    6.07       0.000000001327 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.51 on 13887 degrees of freedom
##   (2213 observations deleted due to missingness)
## Multiple R-squared:  0.226,  Adjusted R-squared:  0.225 
## F-statistic:  505 on 8 and 13887 DF,  p-value: <0.0000000000000002