knitr::opts_chunk$set(echo = TRUE)
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
We structure the data to add a new column called “Medals earned”
dataset_olympics <- read_delim("dataset_olympics.csv")
## Rows: 70000 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Name, Sex, Team, NOC, Games, Season, City, Sport, Event, Medal
## dbl (5): ID, Age, Height, Weight, Year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dataset_olympics <- dataset_olympics %>% arrange(Name, Year)
dataset_olympics <- dataset_olympics %>%
group_by(Name) %>%
mutate(is_Male = ifelse(Sex == 'M',1,0)) %>%
ungroup()
head(dataset_olympics$is_Male)
sexModel <- glm(is_Male ~ Height, data=dataset_olympics,family = binomial(link = 'logit'))
summary(sexModel)
##
## Call:
## glm(formula = is_Male ~ Height, family = binomial(link = "logit"),
## data = dataset_olympics)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -22.952357 0.241134 -95.19 <2e-16 ***
## Height 0.137251 0.001402 97.90 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 65987 on 53745 degrees of freedom
## Residual deviance: 51526 on 53744 degrees of freedom
## (16254 observations deleted due to missingness)
## AIC: 51530
##
## Number of Fisher Scoring iterations: 5
We create a model that uses more variables (Weight)
sexModel2 <- glm(is_Male ~ Height + Weight + Age, data=dataset_olympics,family = binomial(link = 'logit'))
summary(sexModel2)
##
## Call:
## glm(formula = is_Male ~ Height + Weight + Age, family = binomial(link = "logit"),
## data = dataset_olympics)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -13.735164 0.291612 -47.10 <2e-16 ***
## Height 0.037161 0.002088 17.80 <2e-16 ***
## Weight 0.111738 0.001917 58.28 <2e-16 ***
## Age 0.024383 0.002203 11.07 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 64617 on 52265 degrees of freedom
## Residual deviance: 45486 on 52262 degrees of freedom
## (17734 observations deleted due to missingness)
## AIC: 45494
##
## Number of Fisher Scoring iterations: 5
# Model review
vif(sexModel2)
## Height Weight Age
## 1.989457 2.010679 1.015195
Weight is more important as a variable than the other 2 and Height is more important than Age for our Model.
paste("Deviance (First Model)", round(sexModel$deviance, 1))
## [1] "Deviance (First Model) 51526.4"
paste("Deviance (Second Model)", round(sexModel2$deviance, 1))
## [1] "Deviance (Second Model) 45485.9"
The second model is a better model as the deviance is lowest of the two.
AIC(sexModel, sexModel2)
## Warning in AIC.default(sexModel, sexModel2): models are not all fitted to the
## same number of observations
## df AIC
## sexModel 2 51530.42
## sexModel2 4 45493.91
BIC(sexModel, sexModel2)
## Warning in BIC.default(sexModel, sexModel2): models are not all fitted to the
## same number of observations
## df BIC
## sexModel 2 51548.20
## sexModel2 4 45529.36