# Pin the CRAN mirror so package installation also works non-interactively.
options(repos = c(CRAN = "https://cran.r-project.org/"))
# Install 'pwr' only when it is not already available, so that re-running the
# script does not download and reinstall the package every time.
if (!requireNamespace("pwr", quietly = TRUE)) {
  install.packages("pwr")
}
##
## The downloaded binary packages are in
## /var/folders/1t/lvl69_w12vj1sz_yxkxrvt7w0000gn/T//RtmpNQ2xh1/downloaded_packages
library(pwr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Read the obesity CSV into a data frame, then print a compact overview of its
# columns (name, type, leading values).
# NOTE(review): the path is absolute and machine-specific; confirm it before
# running this anywhere else.
Obesity <- read.csv(file = "/Users/ankit/Downloads/Obesity.csv")
str(object = Obesity)
## 'data.frame': 2111 obs. of 17 variables:
## $ Gender : chr "Female" "Female" "Male" "Male" ...
## $ Age : num 21 21 23 27 22 29 23 22 24 22 ...
## $ Height : num 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
## $ Weight : num 64 56 77 87 89.8 53 55 53 64 68 ...
## $ family_history_with_overweight: chr "yes" "yes" "yes" "no" ...
## $ FAVC : chr "no" "no" "no" "no" ...
## $ FCVC : num 2 3 2 3 2 2 3 2 3 2 ...
## $ NCP : num 3 3 3 3 1 3 3 3 3 3 ...
## $ CAEC : chr "Sometimes" "Sometimes" "Sometimes" "Sometimes" ...
## $ SMOKE : chr "no" "yes" "no" "no" ...
## $ CH2O : num 2 3 2 2 2 2 2 2 2 2 ...
## $ SCC : chr "no" "yes" "no" "no" ...
## $ FAF : num 0 3 2 2 0 0 1 3 1 1 ...
## $ TUE : num 1 0 1 0 0 0 0 0 1 1 ...
## $ CALC : chr "no" "Sometimes" "Frequently" "Frequently" ...
## $ MTRANS : chr "Public_Transportation" "Public_Transportation" "Public_Transportation" "Walking" ...
## $ NObeyesdad : chr "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...
# Per-column summary: quantiles/mean for numeric columns, length/class/mode
# for character columns.
summary(object = Obesity)
## Gender Age Height Weight
## Length:2111 Min. :14.00 Min. :1.450 Min. : 39.00
## Class :character 1st Qu.:19.95 1st Qu.:1.630 1st Qu.: 65.47
## Mode :character Median :22.78 Median :1.700 Median : 83.00
## Mean :24.31 Mean :1.702 Mean : 86.59
## 3rd Qu.:26.00 3rd Qu.:1.768 3rd Qu.:107.43
## Max. :61.00 Max. :1.980 Max. :173.00
## family_history_with_overweight FAVC FCVC
## Length:2111 Length:2111 Min. :1.000
## Class :character Class :character 1st Qu.:2.000
## Mode :character Mode :character Median :2.386
## Mean :2.419
## 3rd Qu.:3.000
## Max. :3.000
## NCP CAEC SMOKE CH2O
## Min. :1.000 Length:2111 Length:2111 Min. :1.000
## 1st Qu.:2.659 Class :character Class :character 1st Qu.:1.585
## Median :3.000 Mode :character Mode :character Median :2.000
## Mean :2.686 Mean :2.008
## 3rd Qu.:3.000 3rd Qu.:2.477
## Max. :4.000 Max. :3.000
## SCC FAF TUE CALC
## Length:2111 Min. :0.0000 Min. :0.0000 Length:2111
## Class :character 1st Qu.:0.1245 1st Qu.:0.0000 Class :character
## Mode :character Median :1.0000 Median :0.6253 Mode :character
## Mean :1.0103 Mean :0.6579
## 3rd Qu.:1.6667 3rd Qu.:1.0000
## Max. :3.0000 Max. :2.0000
## MTRANS NObeyesdad
## Length:2111 Length:2111
## Class :character Class :character
## Mode :character Mode :character
##
##
##
QUESTION: Build a linear (or generalized linear) model as you like. Use whatever response variable and explanatory variables you prefer.
Use the tools from previous weeks to diagnose the model. Highlight any issues with the model. Interpret at least one of the coefficients.
RESPONSE VARIABLE: Weight Category. EXPLANATORY VARIABLE: family_history_with_overweight.
# List the distinct weight-class labels present in the raw NObeyesdad column.
unique(Obesity[["NObeyesdad"]])
## [1] "Normal_Weight" "Overweight_Level_I" "Overweight_Level_II"
## [4] "Obesity_Type_I" "Insufficient_Weight" "Obesity_Type_II"
## [7] "Obesity_Type_III"
# Collapse the NObeyesdad labels into a two-level category: every label listed
# below becomes "Obesed"; any other value (here "Normal_Weight") becomes
# "Normal Weight".
# NOTE(review): "Insufficient_Weight" is also mapped to "Obesed", so the label
# effectively means "not normal weight" — confirm this grouping is intended.
Obesity[["WeightCategory"]] <- ifelse(
  test = Obesity[["NObeyesdad"]] %in% c(
    "Insufficient_Weight",
    "Overweight_Level_I",
    "Overweight_Level_II",
    "Obesity_Type_I",
    "Obesity_Type_II",
    "Obesity_Type_III"
  ),
  yes = "Obesed",
  no = "Normal Weight"
)
# Show the first rows to verify the new column.
head(x = Obesity)
## Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 1 Female 21 1.62 64.0 yes no 2 3
## 2 Female 21 1.52 56.0 yes no 3 3
## 3 Male 23 1.80 77.0 yes no 2 3
## 4 Male 27 1.80 87.0 no no 3 3
## 5 Male 22 1.78 89.8 no no 2 1
## 6 Male 29 1.62 53.0 no yes 2 3
## CAEC SMOKE CH2O SCC FAF TUE CALC MTRANS
## 1 Sometimes no 2 no 0 1 no Public_Transportation
## 2 Sometimes yes 3 yes 3 0 Sometimes Public_Transportation
## 3 Sometimes no 2 no 2 1 Frequently Public_Transportation
## 4 Sometimes no 2 no 2 0 Frequently Walking
## 5 Sometimes no 2 no 0 0 Sometimes Public_Transportation
## 6 Sometimes no 2 no 0 0 Sometimes Automobile
## NObeyesdad WeightCategory
## 1 Normal_Weight Normal Weight
## 2 Normal_Weight Normal Weight
## 3 Normal_Weight Normal Weight
## 4 Overweight_Level_I Obesed
## 5 Overweight_Level_II Obesed
## 6 Normal_Weight Normal Weight
# Sanity check: distinct values of the derived two-level category.
unique(Obesity[["WeightCategory"]])
## [1] "Normal Weight" "Obesed"
# Sanity check: number of missing values in the derived category.
sum(is.na(Obesity[["WeightCategory"]]))
## [1] 0
# Numeric 0/1 encoding used as the model response:
# 0 = "Normal Weight", 1 = everything else.
Obesity[["BinaryWeight"]] <- ifelse(
  test = Obesity[["WeightCategory"]] == "Normal Weight",
  yes = 0,
  no = 1
)
# Show the first rows to verify the new column.
head(x = Obesity)
## Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 1 Female 21 1.62 64.0 yes no 2 3
## 2 Female 21 1.52 56.0 yes no 3 3
## 3 Male 23 1.80 77.0 yes no 2 3
## 4 Male 27 1.80 87.0 no no 3 3
## 5 Male 22 1.78 89.8 no no 2 1
## 6 Male 29 1.62 53.0 no yes 2 3
## CAEC SMOKE CH2O SCC FAF TUE CALC MTRANS
## 1 Sometimes no 2 no 0 1 no Public_Transportation
## 2 Sometimes yes 3 yes 3 0 Sometimes Public_Transportation
## 3 Sometimes no 2 no 2 1 Frequently Public_Transportation
## 4 Sometimes no 2 no 2 0 Frequently Walking
## 5 Sometimes no 2 no 0 0 Sometimes Public_Transportation
## 6 Sometimes no 2 no 0 0 Sometimes Automobile
## NObeyesdad WeightCategory BinaryWeight
## 1 Normal_Weight Normal Weight 0
## 2 Normal_Weight Normal Weight 0
## 3 Normal_Weight Normal Weight 0
## 4 Overweight_Level_I Obesed 1
## 5 Overweight_Level_II Obesed 1
## 6 Normal_Weight Normal Weight 0
# Model 1: least-squares fit of the 0/1 response on gender, family history of
# overweight, and smoking status.
# NOTE(review): with a binary response this is a linear probability model; a
# logistic fit (glm with family = binomial) may be more appropriate — confirm.
LinearModel1 <- lm(
  formula = BinaryWeight ~ Gender + family_history_with_overweight + SMOKE,
  data = Obesity
)
# Coefficient estimates, residual summary, and overall fit statistics
summary(object = LinearModel1)
##
## Call:
## lm(formula = BinaryWeight ~ Gender + family_history_with_overweight +
## SMOKE, data = Obesity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.92440 0.07560 0.09525 0.09525 0.52391
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.66763 0.01766 37.812 < 2e-16 ***
## GenderMale -0.01965 0.01436 -1.369 0.171174
## family_history_with_overweightyes 0.25677 0.01857 13.826 < 2e-16 ***
## SMOKEyes -0.17189 0.04998 -3.439 0.000596 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3277 on 2107 degrees of freedom
## Multiple R-squared: 0.08741, Adjusted R-squared: 0.08611
## F-statistic: 67.27 on 3 and 2107 DF, p-value: < 2.2e-16
# ANOVA table for model 1: one F test per term, in the order the terms appear
# in the formula
anova(object = LinearModel1)
## Analysis of Variance Table
##
## Response: BinaryWeight
## Df Sum Sq Mean Sq F value Pr(>F)
## Gender 1 0.001 0.0012 0.0113 0.9153222
## family_history_with_overweight 1 20.404 20.4042 189.9715 < 2.2e-16 ***
## SMOKE 1 1.270 1.2701 11.8256 0.0005956 ***
## Residuals 2107 226.306 0.1074
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Information criteria for model 1 (lower values indicate a better trade-off
# between fit and complexity)
aic <- stats::AIC(LinearModel1)
bic <- stats::BIC(LinearModel1)
print(aic)
## [1] 1286.83
print(bic)
## [1] 1315.104
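Interpretation: the linear regression model suggests that “family_history_with_overweight” and “SMOKE” have a statistically significant association with “BinaryWeight,” while “Gender” does not appear to be a significant predictor (p ≈ 0.17). For example, holding gender and smoking status fixed, a family history of overweight is associated with an estimated 0.257 (about 26 percentage points) higher probability that “BinaryWeight” equals 1. However, the model’s R-squared (about 0.087) indicates that these predictors collectively explain only a small proportion of the variance in “BinaryWeight”.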
# Model 2: same response as model 1, with the Gender term dropped.
LinearModel2 <- lm(
  formula = BinaryWeight ~ family_history_with_overweight + SMOKE,
  data = Obesity
)
# Coefficient estimates, residual summary, and overall fit statistics
summary(object = LinearModel2)
##
## Call:
## lm(formula = BinaryWeight ~ family_history_with_overweight +
## SMOKE, data = Obesity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.91405 0.08595 0.08595 0.08595 0.51497
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.65987 0.01672 39.46 < 2e-16 ***
## family_history_with_overweightyes 0.25418 0.01848 13.76 < 2e-16 ***
## SMOKEyes -0.17484 0.04995 -3.50 0.000474 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3278 on 2108 degrees of freedom
## Multiple R-squared: 0.0866, Adjusted R-squared: 0.08573
## F-statistic: 99.93 on 2 and 2108 DF, p-value: < 2.2e-16
# ANOVA table for model 2: one F test per term, in the order the terms appear
# in the formula
anova(object = LinearModel2)
## Analysis of Variance Table
##
## Response: BinaryWeight
## Df Sum Sq Mean Sq F value Pr(>F)
## family_history_with_overweight 1 20.158 20.1577 187.599 < 2.2e-16 ***
## SMOKE 1 1.317 1.3166 12.253 0.0004741 ***
## Residuals 2108 226.507 0.1075
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Information criteria for model 2; these overwrite the `aic` and `bic` values
# previously stored for model 1.
aic <- stats::AIC(LinearModel2)
bic <- stats::BIC(LinearModel2)
print(aic)
## [1] 1286.706
print(bic)
## [1] 1309.326
Interpretation: this linear regression model suggests that both “family_history_with_overweight” and “SMOKE” have a statistically significant influence on “BinaryWeight.” The model’s R-squared values indicate that these predictors collectively explain a relatively small proportion of the variance in “BinaryWeight.”
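Also, according to AIC and BIC, the second model is favored: it has the lower value on both criteria, i.e. a better trade-off between model fit and model complexity. However, the AIC difference is very small (1286.83 vs. 1286.71), so the two models fit almost equally well; BIC, which penalizes the extra “Gender” term more heavily, separates them more clearly (1315.10 vs. 1309.33).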