## Loading Packages
library(graphics)
library(ggplot2)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages ------------------------------------------------ tidyverse 1.3.0 --
## v tibble  2.1.3     v dplyr   0.8.4
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## v purrr   0.3.3
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
## Warning: package 'stringr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts --------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(knitr)
## Warning: package 'knitr' was built under R version 3.6.3
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.6.3
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 3.6.3
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(rstatix)
## Warning: package 'rstatix' was built under R version 3.6.3
## 
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
## 
##     filter
library(NHANES)
## Warning: package 'NHANES' was built under R version 3.6.3
library(arm)
## Warning: package 'arm' was built under R version 3.6.3
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:rstatix':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loading required package: lme4
## Warning: package 'lme4' was built under R version 3.6.3
## Registered S3 methods overwritten by 'lme4':
##   method                          from
##   cooks.distance.influence.merMod car 
##   influence.merMod                car 
##   dfbeta.influence.merMod         car 
##   dfbetas.influence.merMod        car
## 
## arm (Version 1.11-2, built: 2020-7-27)
## Working directory is C:/Users/Daivik/Desktop/EDA/Assignments/Assignment 5
## Preparing Data
setwd("C:/Users/Daivik/Desktop/EDA/Assignments/Assignment 5")
diab_nhanes <- NHANES%>% filter(!is.na(NHANES$Diabetes) & !is.na(NHANES$Age)) %>% dplyr::select(Diabetes,Age)
diab_raw<- NHANESraw %>% filter(!is.na(NHANESraw$Diabetes) & !is.na(NHANESraw$Age) & !is.na(NHANESraw$WTINT2YR)) %>% dplyr::select(Diabetes,Age,WTINT2YR)
diab_nhanes$Diabetes <- as.numeric(diab_nhanes$Diabetes) - 1
diab_raw$Diabetes <- as.numeric(diab_raw$Diabetes) - 1

Question 1

count(diab_nhanes, Diabetes)
## # A tibble: 2 x 2
##   Diabetes     n
##      <dbl> <int>
## 1        0  9098
## 2        1   760
prop_of_yes<- length(which(diab_nhanes$Diabetes==1))/length(diab_nhanes$Diabetes)
sprintf("The proportion of people having diabities is %f",prop_of_yes)
## [1] "The proportion of people having diabities is 0.077095"
prop_of_no<- length(which(diab_nhanes$Diabetes==0))/length(diab_nhanes$Diabetes)
sprintf("The proportion of people having diabities is %f",prop_of_no)
## [1] "The proportion of people having diabities is 0.922905"
count(diab_raw, Diabetes)
## # A tibble: 2 x 2
##   Diabetes     n
##      <dbl> <int>
## 1        0 17754
## 2        1  1706
prop_of_yes_raw<- length(which(diab_raw$Diabetes==1))/length(diab_raw$Diabetes)
sprintf("The proportion of people having diabities in raw dataset  is %f",prop_of_yes_raw)
## [1] "The proportion of people having diabities in raw dataset  is 0.087667"
prop_of_yes_raw<- length(which(diab_raw$Diabetes==0))/length(diab_raw$Diabetes)
sprintf("The proportion of people having diabities in raw dataset  is %f",prop_of_yes_raw)
## [1] "The proportion of people having diabities in raw dataset  is 0.912333"
agg<-aggregate(WTINT2YR ~ Diabetes, sum, data = diab_raw)
add<- agg[1,2]+agg[2,2]
prop<- agg[2,2]/add
sprintf("The Proportion of US Residents with diabetes (Nhanesraw data) %f",prop)
## [1] "The Proportion of US Residents with diabetes (Nhanesraw data) 0.079838"

Question 2

diab.logit = glm(Diabetes ~ Age, family = binomial, data = diab_nhanes)
display(diab.logit)
## glm(formula = Diabetes ~ Age, family = binomial, data = diab_nhanes)
##             coef.est coef.se
## (Intercept) -5.19     0.13  
## Age          0.06     0.00  
## ---
##   n = 9858, k = 2
##   residual deviance = 4495.1, null deviance = 5355.2 (difference = 860.1)
ggplot(diab_nhanes, aes(Age, Diabetes)) + 
  geom_jitter(height = 0.1, width = 0.25) + 
  geom_smooth(method = "glm", method.args = list(family = "binomial")) + 
  ggtitle("Graph 1")
## `geom_smooth()` using formula 'y ~ x'

diab.weighted.logit = glm(Diabetes ~ Age, family = quasibinomial, weights = WTINT2YR, data = diab_raw)
display(diab.weighted.logit)
## glm(formula = Diabetes ~ Age, family = quasibinomial, data = diab_raw, 
##     weights = WTINT2YR)
##             coef.est      coef.se      
## (Intercept) -4.979208e+15  1.797095e+13
## Age          3.192369e+13  4.137223e+11
## ---
##   n = 19460, k = 2
##   residual deviance = 3454325322.0, null deviance = 334158315.5 (difference = -3120167006.5)
##   overdispersion parameter = 1.109088e+19
ggplot(diab_raw, aes(Age, Diabetes)) + 
  geom_jitter(height = 0.1, width = 0.25) + 
  geom_smooth(method = "glm", method.args = list(family = "quasibinomial")) + 
  ggtitle("Graph 2")
## `geom_smooth()` using formula 'y ~ x'

### From the graph, we see that till the age of 40 both the graphs have the same trend. Graph 1 increases rapidly after 40 years. But in the second graph, the curve touches the value of 0.4. Due to high coefficients, the curve touches 0.4 value. The first graph dosent touch the 0.4 value. So we can say that the Graph 2 is a little far away from Graph 1.

Question 3

our.logit = function(x){
  coe = coef(diab.weighted.logit)
  y = coe[1] + coe[2] * x
  return(exp(y) / (1 + exp(y)))
}

ggplot(diab_raw, aes(Age, Diabetes)) + 
  geom_jitter(height = 0.1, width = 0.25) + 
  geom_smooth(method = "glm", method.args = list(family = "binomial"), se = FALSE) + 
  stat_function(fun = our.logit, color = "red")
## `geom_smooth()` using formula 'y ~ x'

### Red Line does not fit the model properly because it has a weight factor in it. If we plot the curve, we see that the slope and the coefficient are major factors in the equation. So it may be because of the varied slope that the curve dosent fit the data correctly. The regression regression is not able to reach the 0.4 point as the dataset does.