R Markdown

##data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
# Load the `drug_use` data set from the openintro package into the workspace.
data("drug_use", package = "openintro")

Data cleaning

# Keep only the two variables used in the analysis and discard every row that
# has a missing value in either of them.
drug_clean <- drug_use %>%
  select(student, parents) %>%
  drop_na()

##Exploratory data

# Per-column level counts for the cleaned data.
drug_clean %>% summary()
##  student    parents   
##  not :226   not :235  
##  uses:219   used:210
# Marginal counts of the outcome variable.
table(drug_clean[["student"]])
## 
##  not uses 
##  226  219
# Marginal counts of the predictor variable.
table(drug_clean[["parents"]])
## 
##  not used 
##  235  210

##Separates groups

# Count the rows in every parents x student cell.
# `.groups = "drop"` returns an ungrouped tibble, so any later verb operates on
# the whole table rather than silently working within each `parents` group, and
# `summarise()` no longer emits its regrouping message.
drug_table <- drug_clean %>%
  group_by(parents, student) %>%
  summarise(count = n(), .groups = "drop")

##Cell counts and proportions

# Cross-tabulate parents by student and express each cell count as a share of
# all rows, so the proportions across the whole table sum to 1.
drug_table <- drug_clean %>%
  count(parents, student) %>%
  mutate(proportion = proportions(n))
drug_table
## # A tibble: 4 × 4
##   parents student     n proportion
##   <fct>   <fct>   <int>      <dbl>
## 1 not     not       141      0.317
## 2 not     uses       94      0.211
## 3 used    not        85      0.191
## 4 used    uses      125      0.281

##Fits and summarizes a logistic regression

# Logistic regression of `student` on `parents`. With a factor response, glm()
# treats the first level ("not") as failure and models the other ("uses").
drug_model <- glm(
  formula = student ~ parents,
  family = "binomial",
  data = drug_clean
)
summary(drug_model)
## 
## Call:
## glm(formula = student ~ parents, family = "binomial", data = drug_clean)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.4055     0.1332  -3.045  0.00233 ** 
## parentsused   0.7911     0.1936   4.086  4.4e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 616.79  on 444  degrees of freedom
## Residual deviance: 599.77  on 443  degrees of freedom
## AIC: 603.77
## 
## Number of Fisher Scoring iterations: 4

##This can predict probabilities

# In-sample predicted probability of "uses" for each row the model was fit on.
prob <- predict(drug_model, type = "response")

# Classify at the 0.5 cutoff. Storing the labels as a factor with the same
# levels as the observed outcome keeps the confusion matrix 2 x 2 (with a zero
# row) even if the model predicts only one class; a plain character vector
# would silently drop the missing row.
predicted <- factor(
  ifelse(prob > 0.5, "uses", "not"),
  levels = levels(drug_clean$student)
)
table(Predicted = predicted,
      Actual = drug_clean$student)
##          Actual
## Predicted not uses
##      not  141   94
##      uses  85  125

##ROC CURVE

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# State the class roles and comparison direction explicitly instead of letting
# pROC guess them: "not" is the control, "uses" is the case, and higher
# predicted probabilities should indicate cases. With auto-detection, a
# predictor that ranks worse than chance has its direction flipped and still
# reports an AUC above 0.5.
roc_curve <- roc(
  response = drug_clean$student,
  predictor = prob,
  levels = c("not", "uses"),
  direction = "<"
)
plot(roc_curve)

auc(roc_curve)
## Area under the curve: 0.5973