library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(cowplot)
## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
setwd("~/Documents/RStudio (DATA-101)")
smoking <- read.csv("smoking.csv")

Introduction

For my analysis, I will be looking at a study based at the United Kingdom, involving smoking but based around their marital status and ethnicity. These variables will help me to be able to come up with my conclusion if marital status and ethnicity influences smoking results, whether it’s yes or no.

Research Question:

How influential is a person’s marital status and ethnicity towards their smoking?

Dataset

This dataset comes from openintro which can be accessed here https://www.openintro.org/data/index.php?data=smoking

dim(smoking)
## [1] 1691   12
str(smoking)
## 'data.frame':    1691 obs. of  12 variables:
##  $ gender               : chr  "Male" "Female" "Male" "Female" ...
##  $ age                  : int  38 42 40 40 39 37 53 44 40 41 ...
##  $ marital_status       : chr  "Divorced" "Single" "Married" "Married" ...
##  $ highest_qualification: chr  "No Qualification" "No Qualification" "Degree" "Degree" ...
##  $ nationality          : chr  "British" "British" "English" "English" ...
##  $ ethnicity            : chr  "White" "White" "White" "White" ...
##  $ gross_income         : chr  "2,600 to 5,200" "Under 2,600" "28,600 to 36,400" "10,400 to 15,600" ...
##  $ region               : chr  "The North" "The North" "The North" "The North" ...
##  $ smoke                : chr  "No" "Yes" "No" "No" ...
##  $ amt_weekends         : int  NA 12 NA NA NA NA 6 NA 8 15 ...
##  $ amt_weekdays         : int  NA 12 NA NA NA NA 6 NA 8 12 ...
##  $ type                 : chr  "" "Packets" "" "" ...
head(smoking)
##   gender age marital_status highest_qualification nationality ethnicity
## 1   Male  38       Divorced      No Qualification     British     White
## 2 Female  42         Single      No Qualification     British     White
## 3   Male  40        Married                Degree     English     White
## 4 Female  40        Married                Degree     English     White
## 5 Female  39        Married          GCSE/O Level     British     White
## 6 Female  37        Married          GCSE/O Level     British     White
##       gross_income    region smoke amt_weekends amt_weekdays    type
## 1   2,600 to 5,200 The North    No           NA           NA        
## 2      Under 2,600 The North   Yes           12           12 Packets
## 3 28,600 to 36,400 The North    No           NA           NA        
## 4 10,400 to 15,600 The North    No           NA           NA        
## 5   2,600 to 5,200 The North    No           NA           NA        
## 6 15,600 to 20,800 The North    No           NA           NA
# Converting "smoke" column from categorical to numerical so "Yes" and "No" are changed to 1 and 0 to perform the logistic regression model. Yes = 1 & No = 0

 
smoking$smoke <- ifelse(smoking$smoke == "Yes", 1, 0)


str(smoking)
## 'data.frame':    1691 obs. of  12 variables:
##  $ gender               : chr  "Male" "Female" "Male" "Female" ...
##  $ age                  : int  38 42 40 40 39 37 53 44 40 41 ...
##  $ marital_status       : chr  "Divorced" "Single" "Married" "Married" ...
##  $ highest_qualification: chr  "No Qualification" "No Qualification" "Degree" "Degree" ...
##  $ nationality          : chr  "British" "British" "English" "English" ...
##  $ ethnicity            : chr  "White" "White" "White" "White" ...
##  $ gross_income         : chr  "2,600 to 5,200" "Under 2,600" "28,600 to 36,400" "10,400 to 15,600" ...
##  $ region               : chr  "The North" "The North" "The North" "The North" ...
##  $ smoke                : num  0 1 0 0 0 0 1 0 1 1 ...
##  $ amt_weekends         : int  NA 12 NA NA NA NA 6 NA 8 15 ...
##  $ amt_weekdays         : int  NA 12 NA NA NA NA 6 NA 8 12 ...
##  $ type                 : chr  "" "Packets" "" "" ...

Type of Data Analysis

As mentioned already in the last chunk, I will be using the logistic regression analysis method to help me answer my research question.

# Logistic Regression Model
logistic <- glm(smoke ~ marital_status + ethnicity , data = smoking, family = "binomial") 
summary(logistic)
## 
## Call:
## glm(formula = smoke ~ marital_status + ethnicity, family = "binomial", 
##     data = smoking)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -0.75001    0.43603  -1.720   0.0854 .  
## marital_statusMarried   -0.95885    0.18889  -5.076 3.85e-07 ***
## marital_statusSeparated -0.14868    0.30760  -0.483   0.6288    
## marital_statusSingle     0.06157    0.19339   0.318   0.7502    
## marital_statusWidowed   -0.94039    0.24006  -3.917 8.96e-05 ***
## ethnicityBlack          -0.20290    0.57675  -0.352   0.7250    
## ethnicityChinese        -0.25249    0.64537  -0.391   0.6956    
## ethnicityMixed           0.59522    0.70013   0.850   0.3952    
## ethnicityRefused         0.30066    0.73686   0.408   0.6833    
## ethnicityUnknown         1.22944    1.51162   0.813   0.4160    
## ethnicityWhite           0.17570    0.40638   0.432   0.6655    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1898.0  on 1690  degrees of freedom
## Residual deviance: 1821.6  on 1680  degrees of freedom
## AIC: 1843.6
## 
## Number of Fisher Scoring iterations: 4
# Clearing out N/A results
colSums(is.na(smoking))
##                gender                   age        marital_status 
##                     0                     0                     0 
## highest_qualification           nationality             ethnicity 
##                     0                     0                     0 
##          gross_income                region                 smoke 
##                     0                     0                     0 
##          amt_weekends          amt_weekdays                  type 
##                  1270                  1270                     0
r_square <- 1 - (logistic$deviance/logistic$null.deviance)

r_square
## [1] 0.04022142

Creating Predicted Classes

predicted.probabilities <- logistic$fitted.values
predicted.classes <- ifelse(predicted.probabilities > 0.5, 1, 0)

smoking_subset <- smoking[complete.cases(smoking[, c("marital_status", "ethnicity")]), ]

smoking_subset$smoke_num <- ifelse(smoking_subset$smoke == "1", 1, 0)

Building a Confusion Matrix

confusion <- table(
  Predicted = factor(predicted.classes, levels = c(0, 1)),
  Actual = factor(smoking_subset$smoke_num, levels = c(0, 1)))
confusion
##          Actual
## Predicted    0    1
##         0 1270  420
##         1    0    1

Again, Yes = 1 & No = 0

#Enter your code here
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# ROC curve & AUC on full data
roc_obj <- roc(response = smoking_subset$smoke,
               predictor = logistic$fitted.values,
               levels = c("0", "1"),
               direction = "<")  # smaller prob = Healthy
# Print AUC value
auc_val <- auc(roc_obj); auc_val
## Area under the curve: 0.6251
# Plot ROC with AUC displayed
plot.roc(roc_obj, print.auc = TRUE, legacy.axes = TRUE,
         xlab = "False Positive Rate (1 - Specificity)",
         ylab = "True Positive Rate (Sensitivity)")

Findings

Since the Area Under Curve (AUC) was determined as 0.625, it’s greater than 0.5 making this scale of the model while not near perfect, but rather decent.

This model tells that there is approximately 62.5% chance that a person’s marital status and ethnicity influences their smoking.

Suggestions for Future Research

There are a few other columns in the dataset I have not covered in this project such as age, gender, their highest level of education, and gross income. I was thinking that the crime rate or health rate can also be involved in influencing the smoking rate. I feel like those tie more towards smoking rates, at least whenever smoking topics comes to mind. This was still an interesting dataset to analyze nonetheless.