##data
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
# Load the drug_use data set, keep only the two variables under study, and
# discard every row in which either of them is missing.
data("drug_use")
drug_clean <- drug_use %>%
  select(student, parents) %>%
  filter(!(is.na(student) | is.na(parents)))
##Exploratory data
# Per-column level counts for the cleaned data.
print(summary(drug_clean))
## student parents
## not :226 not :235
## uses:219 used:210
# Frequency of each value of the student column.
table(drug_clean[["student"]])
##
## not uses
## 226 219
# Frequency of each value of the parents column.
table(drug_clean[["parents"]])
##
## not used
## 235 210
##Separates groups
# Tally the number of rows in every parents/student combination.
# `.groups = "drop"` returns an ungrouped tibble, so no grouping leaks into
# later verbs and summarise() no longer emits its regrouping message.
# NOTE(review): drug_table is reassigned by the count() pipeline further down
# before it is ever read, so this intermediate result is currently unused.
drug_table <- drug_clean %>%
  group_by(parents, student) %>%
  summarise(count = n(), .groups = "drop")
##Counts each parents/student combination and adds its share of all rows
# One row per parents/student pair: `n` is the pair's frequency and
# `proportion` is that frequency divided by the total over all pairs.
drug_table <- drug_clean %>%
  count(parents, student) %>%
  mutate(proportion = prop.table(n))
print(drug_table)
## # A tibble: 4 × 4
## parents student n proportion
## <fct> <fct> <int> <dbl>
## 1 not not 141 0.317
## 2 not uses 94 0.211
## 3 used not 85 0.191
## 4 used uses 125 0.281
##Fits a logistic regression of student use on parental use and summarizes it
# Binomial GLM with student as the response and parents as the only
# predictor, followed by the coefficient table and deviance summary.
drug_model <- glm(
  formula = student ~ parents,
  family = "binomial",
  data = drug_clean
)
print(summary(drug_model))
##
## Call:
## glm(formula = student ~ parents, family = "binomial", data = drug_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.4055 0.1332 -3.045 0.00233 **
## parentsused 0.7911 0.1936 4.086 4.4e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 616.79 on 444 degrees of freedom
## Residual deviance: 599.77 on 443 degrees of freedom
## AIC: 603.77
##
## Number of Fisher Scoring iterations: 4
##Predicts fitted probabilities and classifies them at a 0.5 cutoff
# Fitted probabilities on the response scale for the data the model was fit
# on (no `newdata` is supplied).
prob <- predict(drug_model, type = "response")
# Classify at a 0.5 cutoff. `predicted` is a factor over the levels of the
# observed response, so the confusion matrix always keeps one row per class,
# even for a class that is never predicted (a plain character vector would
# silently drop that row).
predicted <- factor(
  ifelse(prob > 0.5, "uses", "not"),
  levels = levels(drug_clean$student)
)
# Confusion matrix: predicted class (rows) against observed class (columns).
table(
  Predicted = predicted,
  Actual = drug_clean$student
)
## Actual
## Predicted not uses
## not 141 94
## uses 85 125
##ROC CURVE
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# ROC curve for the fitted probabilities against the observed response.
# `levels` (control first, case second) and `direction` are stated explicitly
# instead of being guessed by pROC; they match what pROC previously selected
# automatically (control = not, case = uses, controls < cases), so the curve
# and AUC are unchanged while the "Setting levels/direction" messages go away.
roc_curve <- roc(
  drug_clean$student,
  prob,
  levels = c("not", "uses"),
  direction = "<"
)
plot(roc_curve)
auc(roc_curve)
## Area under the curve: 0.5973