# Set the CRAN mirror explicitly so install.packages() has a repository to use.
options(repos = c(CRAN = "https://cran.r-project.org/"))
# Install pwr only when it is not already available, instead of downloading
# and reinstalling it every time the script is run.
if (!requireNamespace("pwr", quietly = TRUE)) {
  install.packages("pwr")
}
## 
## The downloaded binary packages are in
##  /var/folders/1t/lvl69_w12vj1sz_yxkxrvt7w0000gn/T//RtmpNQ2xh1/downloaded_packages
library(pwr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Read the obesity data set from a local CSV file and print its structure
# (column names, types, and the first few values of each column).
# NOTE(review): the path is absolute and specific to one machine; anyone else
# running this needs the file at the same location.
Obesity <- read.csv(file = "/Users/ankit/Downloads/Obesity.csv")
str(object = Obesity)
## 'data.frame':    2111 obs. of  17 variables:
##  $ Gender                        : chr  "Female" "Female" "Male" "Male" ...
##  $ Age                           : num  21 21 23 27 22 29 23 22 24 22 ...
##  $ Height                        : num  1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
##  $ Weight                        : num  64 56 77 87 89.8 53 55 53 64 68 ...
##  $ family_history_with_overweight: chr  "yes" "yes" "yes" "no" ...
##  $ FAVC                          : chr  "no" "no" "no" "no" ...
##  $ FCVC                          : num  2 3 2 3 2 2 3 2 3 2 ...
##  $ NCP                           : num  3 3 3 3 1 3 3 3 3 3 ...
##  $ CAEC                          : chr  "Sometimes" "Sometimes" "Sometimes" "Sometimes" ...
##  $ SMOKE                         : chr  "no" "yes" "no" "no" ...
##  $ CH2O                          : num  2 3 2 2 2 2 2 2 2 2 ...
##  $ SCC                           : chr  "no" "yes" "no" "no" ...
##  $ FAF                           : num  0 3 2 2 0 0 1 3 1 1 ...
##  $ TUE                           : num  1 0 1 0 0 0 0 0 1 1 ...
##  $ CALC                          : chr  "no" "Sometimes" "Frequently" "Frequently" ...
##  $ MTRANS                        : chr  "Public_Transportation" "Public_Transportation" "Public_Transportation" "Walking" ...
##  $ NObeyesdad                    : chr  "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...
# Per-column summary: five-number summary and mean for numeric columns,
# length/class/mode for character columns.
summary(object = Obesity)
##     Gender               Age            Height          Weight      
##  Length:2111        Min.   :14.00   Min.   :1.450   Min.   : 39.00  
##  Class :character   1st Qu.:19.95   1st Qu.:1.630   1st Qu.: 65.47  
##  Mode  :character   Median :22.78   Median :1.700   Median : 83.00  
##                     Mean   :24.31   Mean   :1.702   Mean   : 86.59  
##                     3rd Qu.:26.00   3rd Qu.:1.768   3rd Qu.:107.43  
##                     Max.   :61.00   Max.   :1.980   Max.   :173.00  
##  family_history_with_overweight     FAVC                FCVC      
##  Length:2111                    Length:2111        Min.   :1.000  
##  Class :character               Class :character   1st Qu.:2.000  
##  Mode  :character               Mode  :character   Median :2.386  
##                                                    Mean   :2.419  
##                                                    3rd Qu.:3.000  
##                                                    Max.   :3.000  
##       NCP            CAEC              SMOKE                CH2O      
##  Min.   :1.000   Length:2111        Length:2111        Min.   :1.000  
##  1st Qu.:2.659   Class :character   Class :character   1st Qu.:1.585  
##  Median :3.000   Mode  :character   Mode  :character   Median :2.000  
##  Mean   :2.686                                         Mean   :2.008  
##  3rd Qu.:3.000                                         3rd Qu.:2.477  
##  Max.   :4.000                                         Max.   :3.000  
##      SCC                 FAF              TUE             CALC          
##  Length:2111        Min.   :0.0000   Min.   :0.0000   Length:2111       
##  Class :character   1st Qu.:0.1245   1st Qu.:0.0000   Class :character  
##  Mode  :character   Median :1.0000   Median :0.6253   Mode  :character  
##                     Mean   :1.0103   Mean   :0.6579                     
##                     3rd Qu.:1.6667   3rd Qu.:1.0000                     
##                     Max.   :3.0000   Max.   :2.0000                     
##     MTRANS           NObeyesdad       
##  Length:2111        Length:2111       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 

QUESTION: Build a linear (or generalized linear) model as you like. Use whatever response variable and explanatory variables you prefer

Use the tools from previous weeks to diagnose the model. Highlight any issues with the model. Interpret at least one of the coefficients.

RESPONSE VARIABLE: Weight category (binary, derived from NObeyesdad). EXPLANATORY VARIABLES: family_history_with_overweight and SMOKE, plus Gender in the first model.

# List the distinct weight-class labels present in the NObeyesdad column.
unique(x = Obesity[["NObeyesdad"]])
## [1] "Normal_Weight"       "Overweight_Level_I"  "Overweight_Level_II"
## [4] "Obesity_Type_I"      "Insufficient_Weight" "Obesity_Type_II"    
## [7] "Obesity_Type_III"
# Collapse the NObeyesdad labels into two groups: every label listed below
# becomes "Obesed"; any other value (in this data, "Normal_Weight") becomes
# "Normal Weight".
# NOTE(review): "Insufficient_Weight" is in the "Obesed" list, so "Obesed"
# effectively means "not normal weight" -- confirm this grouping is intended.
Obesity[["WeightCategory"]] <- ifelse(
  test = Obesity[["NObeyesdad"]] %in% c(
    "Obesity_Type_I", "Overweight_Level_II", "Overweight_Level_I",
    "Insufficient_Weight", "Obesity_Type_II", "Obesity_Type_III"
  ),
  yes = "Obesed",
  no = "Normal Weight"
)

# Show the first rows to check the newly added WeightCategory column.
head(x = Obesity)
##   Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 1 Female  21   1.62   64.0                            yes   no    2   3
## 2 Female  21   1.52   56.0                            yes   no    3   3
## 3   Male  23   1.80   77.0                            yes   no    2   3
## 4   Male  27   1.80   87.0                             no   no    3   3
## 5   Male  22   1.78   89.8                             no   no    2   1
## 6   Male  29   1.62   53.0                             no  yes    2   3
##        CAEC SMOKE CH2O SCC FAF TUE       CALC                MTRANS
## 1 Sometimes    no    2  no   0   1         no Public_Transportation
## 2 Sometimes   yes    3 yes   3   0  Sometimes Public_Transportation
## 3 Sometimes    no    2  no   2   1 Frequently Public_Transportation
## 4 Sometimes    no    2  no   2   0 Frequently               Walking
## 5 Sometimes    no    2  no   0   0  Sometimes Public_Transportation
## 6 Sometimes    no    2  no   0   0  Sometimes            Automobile
##            NObeyesdad WeightCategory
## 1       Normal_Weight  Normal Weight
## 2       Normal_Weight  Normal Weight
## 3       Normal_Weight  Normal Weight
## 4  Overweight_Level_I         Obesed
## 5 Overweight_Level_II         Obesed
## 6       Normal_Weight  Normal Weight
# Sanity checks on the recode: list the labels that were produced ...
unique(x = Obesity[["WeightCategory"]])
## [1] "Normal Weight" "Obesed"
# ... and count how many rows ended up with a missing label.
sum(is.na(x = Obesity[["WeightCategory"]]))
## [1] 0
# Numeric 0/1 response used by the models below:
# 0 = "Normal Weight", 1 = any other WeightCategory value.
Obesity[["BinaryWeight"]] <- ifelse(
  test = Obesity[["WeightCategory"]] == "Normal Weight",
  yes = 0,
  no = 1
)

# Show the first rows again to check the newly added BinaryWeight column.
head(x = Obesity)
##   Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 1 Female  21   1.62   64.0                            yes   no    2   3
## 2 Female  21   1.52   56.0                            yes   no    3   3
## 3   Male  23   1.80   77.0                            yes   no    2   3
## 4   Male  27   1.80   87.0                             no   no    3   3
## 5   Male  22   1.78   89.8                             no   no    2   1
## 6   Male  29   1.62   53.0                             no  yes    2   3
##        CAEC SMOKE CH2O SCC FAF TUE       CALC                MTRANS
## 1 Sometimes    no    2  no   0   1         no Public_Transportation
## 2 Sometimes   yes    3 yes   3   0  Sometimes Public_Transportation
## 3 Sometimes    no    2  no   2   1 Frequently Public_Transportation
## 4 Sometimes    no    2  no   2   0 Frequently               Walking
## 5 Sometimes    no    2  no   0   0  Sometimes Public_Transportation
## 6 Sometimes    no    2  no   0   0  Sometimes            Automobile
##            NObeyesdad WeightCategory BinaryWeight
## 1       Normal_Weight  Normal Weight            0
## 2       Normal_Weight  Normal Weight            0
## 3       Normal_Weight  Normal Weight            0
## 4  Overweight_Level_I         Obesed            1
## 5 Overweight_Level_II         Obesed            1
## 6       Normal_Weight  Normal Weight            0
# Model 1: ordinary least squares on the 0/1 BinaryWeight response, with
# gender, family history of overweight, and smoking status as predictors.
# NOTE(review): because the response is binary, fitted values are not
# constrained to [0, 1]; a binomial glm() is the usual alternative.
LinearModel1 <- lm(
  formula = BinaryWeight ~ Gender + family_history_with_overweight + SMOKE,
  data = Obesity
)

# Coefficient estimates, standard errors, t-tests, and overall fit statistics
summary(object = LinearModel1)
## 
## Call:
## lm(formula = BinaryWeight ~ Gender + family_history_with_overweight + 
##     SMOKE, data = Obesity)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.92440  0.07560  0.09525  0.09525  0.52391 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        0.66763    0.01766  37.812  < 2e-16 ***
## GenderMale                        -0.01965    0.01436  -1.369 0.171174    
## family_history_with_overweightyes  0.25677    0.01857  13.826  < 2e-16 ***
## SMOKEyes                          -0.17189    0.04998  -3.439 0.000596 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3277 on 2107 degrees of freedom
## Multiple R-squared:  0.08741,    Adjusted R-squared:  0.08611 
## F-statistic: 67.27 on 3 and 2107 DF,  p-value: < 2.2e-16
# Sequential F-tests (ANOVA table): each term is tested in the order it
# appears in the model formula.
anova(object = LinearModel1)
## Analysis of Variance Table
## 
## Response: BinaryWeight
##                                  Df  Sum Sq Mean Sq  F value    Pr(>F)    
## Gender                            1   0.001  0.0012   0.0113 0.9153222    
## family_history_with_overweight    1  20.404 20.4042 189.9715 < 2.2e-16 ***
## SMOKE                             1   1.270  1.2701  11.8256 0.0005956 ***
## Residuals                      2107 226.306  0.1074                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Information criteria for model 1 (lower values indicate a better
# fit/complexity trade-off)
aic <- AIC(object = LinearModel1)
bic <- BIC(object = LinearModel1)

print(aic)
## [1] 1286.83
print(bic)
## [1] 1315.104

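Interpretation: the linear regression model suggests that “family_history_with_overweight” and “SMOKE” are statistically significant predictors of “BinaryWeight” (both p < 0.001), while “Gender” does not appear to be a significant predictor (p = 0.17). For example, holding gender and smoking status fixed, having a family history of overweight is associated with an estimated 0.257 increase (about 26 percentage points) in the probability that “BinaryWeight” equals 1, i.e. of falling in a category other than normal weight. However, the model’s R-squared of about 0.087 indicates that these predictors collectively explain only a small proportion of the variance in “BinaryWeight”.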

# Model 2: same least-squares setup as the three-predictor model, but with
# Gender dropped, leaving family history of overweight and smoking status.
LinearModel2 <- lm(
  formula = BinaryWeight ~ family_history_with_overweight + SMOKE,
  data = Obesity
)

# Coefficient estimates, standard errors, t-tests, and overall fit statistics
summary(object = LinearModel2)
## 
## Call:
## lm(formula = BinaryWeight ~ family_history_with_overweight + 
##     SMOKE, data = Obesity)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.91405  0.08595  0.08595  0.08595  0.51497 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        0.65987    0.01672   39.46  < 2e-16 ***
## family_history_with_overweightyes  0.25418    0.01848   13.76  < 2e-16 ***
## SMOKEyes                          -0.17484    0.04995   -3.50 0.000474 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3278 on 2108 degrees of freedom
## Multiple R-squared:  0.0866, Adjusted R-squared:  0.08573 
## F-statistic: 99.93 on 2 and 2108 DF,  p-value: < 2.2e-16
# Sequential F-tests (ANOVA table): each term is tested in the order it
# appears in the model formula.
anova(object = LinearModel2)
## Analysis of Variance Table
## 
## Response: BinaryWeight
##                                  Df  Sum Sq Mean Sq F value    Pr(>F)    
## family_history_with_overweight    1  20.158 20.1577 187.599 < 2.2e-16 ***
## SMOKE                             1   1.317  1.3166  12.253 0.0004741 ***
## Residuals                      2108 226.507  0.1075                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Information criteria for model 2 (lower values indicate a better
# fit/complexity trade-off)
aic <- AIC(object = LinearModel2)
bic <- BIC(object = LinearModel2)

print(aic)
## [1] 1286.706
print(bic)
## [1] 1309.326

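Interpretation: this linear regression model suggests that both “family_history_with_overweight” and “SMOKE” are statistically significant predictors of “BinaryWeight” (both p < 0.001). Holding smoking status fixed, having a family history of overweight is associated with an estimated 0.254 increase (about 25 percentage points) in the probability that “BinaryWeight” equals 1, while smokers have an estimated 0.175 lower probability than non-smokers with the same family history. The model’s R-squared of about 0.087 indicates that these predictors collectively explain a relatively small proportion of the variance in “BinaryWeight.”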

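Also, according to AIC and BIC, the second model is favored (AIC 1286.71 vs. 1286.83; BIC 1309.33 vs. 1315.10), as it provides a better trade-off between model fit and model complexity. However, the differences are small (the AIC values are nearly identical), so other considerations, such as interpretability or the subject-matter relevance of “Gender”, could reasonably change the choice of model.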