library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(cowplot)
## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
library(tidyr)
setwd("~/Documents/RStudio (DATA-101)")
survey <- read.csv("acs12.csv")

Introduction

For my research, I will be diving into a survey where the results are based from 2012. This dataset consists of employees based from America in 2012 their status such as hours worked per week, annual income the individual earns, their race, gender, if they have a disability, etc. However, I will be looking at 3 of the variables out the 13 variables that are present in the dataset. The dataset is downloaded from OpenIntro.org but it was published by the American Community Survey.

Research Question: How influential is a person’s disability towards their employment and citizenship status?

Dimensions of the dataset

dim(survey)
## [1] 2000   13

All variables that are in the dataset

str(survey)
## 'data.frame':    2000 obs. of  13 variables:
##  $ income      : int  60000 0 NA 0 0 1700 NA NA NA 45000 ...
##  $ employment  : chr  "not in labor force" "not in labor force" NA "not in labor force" ...
##  $ hrs_work    : int  40 NA NA NA NA 40 NA NA NA 84 ...
##  $ race        : chr  "white" "white" "white" "white" ...
##  $ age         : int  68 88 12 17 77 35 11 7 6 27 ...
##  $ gender      : chr  "female" "male" "female" "male" ...
##  $ citizen     : chr  "yes" "yes" "yes" "yes" ...
##  $ time_to_work: int  NA NA NA NA NA 15 NA NA NA 40 ...
##  $ lang        : chr  "english" "english" "english" "other" ...
##  $ married     : chr  "no" "no" "no" "no" ...
##  $ edu         : chr  "college" "hs or lower" "hs or lower" "hs or lower" ...
##  $ disability  : chr  "no" "yes" "no" "no" ...
##  $ birth_qrtr  : chr  "jul thru sep" "jan thru mar" "oct thru dec" "oct thru dec" ...

Converting Disability from Categorical to Numerical

# Converting "disability" column from categorical to numerical so "Yes" and "No" are changed to 1 and 0 to perform the logistic regression model. Yes = 1 & No = 0

 
survey$disability <- ifelse(survey$disability == "yes", 1, 0)


str(survey)
## 'data.frame':    2000 obs. of  13 variables:
##  $ income      : int  60000 0 NA 0 0 1700 NA NA NA 45000 ...
##  $ employment  : chr  "not in labor force" "not in labor force" NA "not in labor force" ...
##  $ hrs_work    : int  40 NA NA NA NA 40 NA NA NA 84 ...
##  $ race        : chr  "white" "white" "white" "white" ...
##  $ age         : int  68 88 12 17 77 35 11 7 6 27 ...
##  $ gender      : chr  "female" "male" "female" "male" ...
##  $ citizen     : chr  "yes" "yes" "yes" "yes" ...
##  $ time_to_work: int  NA NA NA NA NA 15 NA NA NA 40 ...
##  $ lang        : chr  "english" "english" "english" "other" ...
##  $ married     : chr  "no" "no" "no" "no" ...
##  $ edu         : chr  "college" "hs or lower" "hs or lower" "hs or lower" ...
##  $ disability  : num  0 1 0 0 1 1 0 1 0 0 ...
##  $ birth_qrtr  : chr  "jul thru sep" "jan thru mar" "oct thru dec" "oct thru dec" ...

Here we perform the Logistic Regression Model to get the summary for our coefficients.

# Logistic Regression Model 
logistic <- glm(disability ~ employment + citizen, data = survey, family = "binomial") 
summary(logistic)
## 
## Call:
## glm(formula = disability ~ employment + citizen, family = "binomial", 
##     data = survey)
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -3.1500     0.3265  -9.649  < 2e-16 ***
## employmentnot in labor force   1.7538     0.1500  11.694  < 2e-16 ***
## employmentunemployed           1.0335     0.2747   3.763 0.000168 ***
## citizenyes                     0.7832     0.3114   2.515 0.011896 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1586.7  on 1604  degrees of freedom
## Residual deviance: 1421.6  on 1601  degrees of freedom
##   (395 observations deleted due to missingness)
## AIC: 1429.6
## 
## Number of Fisher Scoring iterations: 5

We get our r-squared result here.

r_square <- 1 - (logistic$deviance/logistic$null.deviance)

r_square
## [1] 0.1040769

p-value will show as 0 which would indicate a null hypothesis being extremely strong.

## Getting our p-value

1 - pchisq((logistic$null.deviance - logistic$deviance), df=1)
## [1] 0

Creating Predicted Classes

predicted.probabilities <- logistic$fitted.values
predicted.classes <- ifelse(predicted.probabilities > 0.5, 1, 0)

survey_subset <- survey[complete.cases(survey[, c("employment", "citizen")]), ]

survey_subset$survey_num <- ifelse(survey_subset$disability == "1", 1, 0)

Building a Confusion Matrix

confusion <- table(
  Predicted = factor(predicted.classes, levels = c(0, 1)),
  Actual = factor(survey_subset$survey_num, levels = c(0, 1)))
confusion
##          Actual
## Predicted    0    1
##         0 1291  314
##         1    0    0
length(predicted.classes)
## [1] 1605
length(survey_subset$survey_num)
## [1] 1605

Plotting ROC to display AUC

#Enter your code here
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# ROC curve & AUC on full data
roc_obj <- roc(response = survey_subset$survey,
               predictor = logistic$fitted.values,
               levels = c("0", "1"),
               direction = "<") 
# Print AUC value
auc_val <- auc(roc_obj); auc_val
## Area under the curve: 0.7096
# Plot ROC with AUC displayed
plot.roc(roc_obj, print.auc = TRUE, legacy.axes = TRUE,
         xlab = "False Positive Rate (1 - Specificity)",
         ylab = "True Positive Rate (Sensitivity)")

Conclusion

The Area Under Curve (AUC) is 0.710 which is greater than our typical probability threshold of 0.5. Our AUC model is telling that there is approximately 71% chance that an American individual’s employment and citizenship status would be driven by the individual’s disability.

71% is decent with our model but definitely not perfect, especially when there are missing values that can fluctuate that value. This does an acceptable job of distinguishing the predicted variables for our observed disability variable.

Future Directions

If I were to work with a dataset like this again, I would like to be more specific with the education level the individual is currently in. The dataset I was working with had a good amount of variables in it, but it had a lot of missing values on it which can making sorting this dataset challenging.