knitr::opts_chunk$set(echo = TRUE)
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

Olympics Data

We structure the data to add a new column called “Medals earned”

dataset_olympics <- read_delim("dataset_olympics.csv")
## Rows: 70000 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Name, Sex, Team, NOC, Games, Season, City, Sport, Event, Medal
## dbl  (5): ID, Age, Height, Weight, Year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dataset_olympics <- dataset_olympics %>% arrange(Name, Year)
dataset_olympics <- dataset_olympics %>%
  group_by(Name) %>%
  mutate(is_Male = ifelse(Sex == 'M',1,0)) %>%
  ungroup()
head(dataset_olympics$is_Male)

Building a Logistic Regression Model

sexModel <- glm(is_Male ~ Height, data=dataset_olympics,family = binomial(link = 'logit'))
summary(sexModel)
## 
## Call:
## glm(formula = is_Male ~ Height, family = binomial(link = "logit"), 
##     data = dataset_olympics)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -22.952357   0.241134  -95.19   <2e-16 ***
## Height        0.137251   0.001402   97.90   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 65987  on 53745  degrees of freedom
## Residual deviance: 51526  on 53744  degrees of freedom
##   (16254 observations deleted due to missingness)
## AIC: 51530
## 
## Number of Fisher Scoring iterations: 5

New Model

We create a model that uses more variables (Weight)

sexModel2 <- glm(is_Male ~ Height + Weight + Age, data=dataset_olympics,family = binomial(link = 'logit'))
summary(sexModel2)
## 
## Call:
## glm(formula = is_Male ~ Height + Weight + Age, family = binomial(link = "logit"), 
##     data = dataset_olympics)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -13.735164   0.291612  -47.10   <2e-16 ***
## Height        0.037161   0.002088   17.80   <2e-16 ***
## Weight        0.111738   0.001917   58.28   <2e-16 ***
## Age           0.024383   0.002203   11.07   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 64617  on 52265  degrees of freedom
## Residual deviance: 45486  on 52262  degrees of freedom
##   (17734 observations deleted due to missingness)
## AIC: 45494
## 
## Number of Fisher Scoring iterations: 5
# Model review
vif(sexModel2)
##   Height   Weight      Age 
## 1.989457 2.010679 1.015195

Weight is more important as a variable than the other 2 and Height is more important than Age for our Model.

Comparing Deviance of Models

paste("Deviance (First Model)", round(sexModel$deviance, 1))
## [1] "Deviance (First Model) 51526.4"
paste("Deviance (Second Model)", round(sexModel2$deviance, 1))
## [1] "Deviance (Second Model) 45485.9"

The second model is a better model as the deviance is lowest of the two.

Comparing AIC

AIC(sexModel, sexModel2)
## Warning in AIC.default(sexModel, sexModel2): models are not all fitted to the
## same number of observations
##           df      AIC
## sexModel   2 51530.42
## sexModel2  4 45493.91
BIC(sexModel, sexModel2)
## Warning in BIC.default(sexModel, sexModel2): models are not all fitted to the
## same number of observations
##           df      BIC
## sexModel   2 51548.20
## sexModel2  4 45529.36