###
# Project: QMSS 201 Group Project
# Purpose: Group Project
# Author: Leah Adams, Diya Chadha, Ava Morisi
# Date: April 6 2025
# Data: 
###

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse) 
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
recreation_data <- read.csv("recreation_data.csv", header=TRUE)
recreation_data1 <- recreation_data %>%
  mutate(AGE = as.numeric(AGE),
         LIMIT1_A2 = as.factor(x = LIMIT1_A2),
         LIMIT1_A2 = recode(.x = LIMIT1_A2, "1" = "Limits Participation", "2" = "Limits Participation", "3" = "Does Not Limit Participation"),
         INCOME = na_if(INCOME, 99),
         INCOME = as.numeric(INCOME),
         EMP1 = as.character(EMP1),
         EMP1 = na_if(EMP1, "99"),
         EMP1 = recode(EMP1, '1' = 1, '2' = 2, '3' = 3, '4' = 3, '5' = 4, '6' = 4),
         EMP1 = as.numeric(EMP1))

renamed_recreation_data <- recreation_data1 %>% 
  rename("EmploymentStatus" = EMP1)%>% 
  rename("FinancialLimitations" = LIMIT1_A2)

table(renamed_recreation_data$EmploymentStatus)
## 
##    1    2    3    4 
## 1186  343  334 1046
levels(renamed_recreation_data$EmploymentStatus)
## NULL
summary(renamed_recreation_data$AGE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.00   39.00   57.00   55.77   69.00   99.00
summary(renamed_recreation_data$INCOME)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   3.000   5.000   4.781   6.000   8.000     568
summary(renamed_recreation_data$EmploymentStatus)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   1.000   2.000   2.426   4.000   4.000     121
library(descr)
freq(x = renamed_recreation_data$EmploymentStatus, plot = FALSE)
## renamed_recreation_data$EmploymentStatus 
##       Frequency Percent Valid Percent
## 1          1186  39.142         40.77
## 2           343  11.320         11.79
## 3           334  11.023         11.48
## 4          1046  34.521         35.96
## NA's        121   3.993              
## Total      3030 100.000        100.00
freq(x = renamed_recreation_data$INCOME, plot = FALSE)
## renamed_recreation_data$INCOME 
##       Frequency Percent Valid Percent
## 1           163   5.380         6.621
## 2           226   7.459         9.180
## 3           272   8.977        11.048
## 4           351  11.584        14.257
## 5           521  17.195        21.162
## 6           366  12.079        14.866
## 7           369  12.178        14.988
## 8           194   6.403         7.880
## NA's        568  18.746              
## Total      3030 100.000       100.000
freq(x = renamed_recreation_data$FinancialLimitations, plot = FALSE)
## renamed_recreation_data$FinancialLimitations 
##                              Frequency Percent
## Limits Participation              1420   46.86
## Does Not Limit Participation      1610   53.14
## Total                             3030  100.00
# recode gender variable
cleanr_recreation_data <- renamed_recreation_data %>%
  mutate(AGE = na_if(AGE, 99),AGE = na_if(AGE, 4),AGE = na_if(AGE, 16))%>%
  mutate(agecategories = case_when(
    AGE >= 18 & AGE <= 29 ~ '18-29',
    AGE >= 30 & AGE <= 39 ~ '30-39',
    AGE >= 40 & AGE <= 49 ~ '40-49',
    AGE >= 50 & AGE <= 59 ~ '50-59',
    AGE >= 60 & AGE <= 69 ~ '60-69',
    AGE >= 70 & AGE <= 98 ~ '70+',
    TRUE ~ NA_character_ )) %>%
  mutate(agecategories = as.factor(agecategories)) 

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
table(cleanr_recreation_data$AGE)
## 
## 11 15 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 
##  1  1 17 13 23 29 33 18 32 17 36 35 43 30 53 41 28 43 48 47 42 41 41 44 45 30 
## 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 
## 39 39 32 42 38 51 21 43 51 52 45 48 54 50 51 64 43 61 74 53 64 74 64 76 69 68 
## 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 
## 62 68 70 66 47 43 46 35 27 26 21 29 14  8 15  8 12  2  4  4  1  3  3  1  1  1 
## 94 
##  2
table(cleanr_recreation_data$agecategories)
## 
## 18-29 30-39 40-49 50-59 60-69   70+ 
##   326   428   380   519   672   489
class(cleanr_recreation_data$AGE)
## [1] "numeric"
class(cleanr_recreation_data$agecategories)
## [1] "factor"
source("http://pcwww.liv.ac.uk/~william/R/crosstab.r")
crosstab(cleanr_recreation_data, row.vars = "agecategories", col.vars = "FinancialLimitations", type = "c")
##               FinancialLimitations Limits Participation Does Not Limit Participation
## agecategories                                                                       
## 18-29                                             18.65                         5.35
## 30-39                                             19.71                        11.24
## 40-49                                             15.69                        11.57
## 50-59                                             17.29                        19.46
## 60-69                                             18.35                        28.76
## 70+                                               10.31                        23.61
## Sum                                              100.00                       100.00
source("http://pcwww.liv.ac.uk/~william/R/crosstab.r")
crosstab(cleanr_recreation_data, row.vars = "INCOME", col.vars = "FinancialLimitations", type = "c")
##        FinancialLimitations Limits Participation Does Not Limit Participation
## INCOME                                                                       
## 1                                          10.68                         2.77
## 2                                          13.94                         4.67
## 3                                          12.85                         9.34
## 4                                          17.11                        11.55
## 5                                          19.12                        23.10
## 6                                          11.19                        18.35
## 7                                          11.60                        18.20
## 8                                           3.51                        12.03
## Sum                                       100.00                       100.00
source("http://pcwww.liv.ac.uk/~william/R/crosstab.r")
crosstab(cleanr_recreation_data, row.vars = "EmploymentStatus", col.vars = "FinancialLimitations", type = "c")
##                  FinancialLimitations Limits Participation Does Not Limit Participation
## EmploymentStatus                                                                       
## 1                                                    41.07                        40.51
## 2                                                    14.23                         9.68
## 3                                                    16.53                         7.12
## 4                                                    28.17                        42.69
## Sum                                                 100.00                       100.00
class("EmploymentStatus")
## [1] "character"
chisq.test(table(cleanr_recreation_data$EmploymentStatus, cleanr_recreation_data$FinancialLimitations))
## 
##  Pearson's Chi-squared test
## 
## data:  table(cleanr_recreation_data$EmploymentStatus, cleanr_recreation_data$FinancialLimitations)
## X-squared = 111.07, df = 3, p-value < 2.2e-16
class("INCOME")
## [1] "character"
chisq.test(table(cleanr_recreation_data$INCOME, cleanr_recreation_data$FinancialLimitations))
## 
##  Pearson's Chi-squared test
## 
## data:  table(cleanr_recreation_data$INCOME, cleanr_recreation_data$FinancialLimitations)
## X-squared = 236.43, df = 7, p-value < 2.2e-16
class("agecategories")
## [1] "character"
chisq.test(table(cleanr_recreation_data$agecategories, cleanr_recreation_data$FinancialLimitations))
## 
##  Pearson's Chi-squared test
## 
## data:  table(cleanr_recreation_data$agecategories, cleanr_recreation_data$FinancialLimitations)
## X-squared = 253.87, df = 5, p-value < 2.2e-16
cleanr_recreation_data1 <- cleanr_recreation_data %>%
  mutate(INCOME = as.numeric(INCOME))


lm1<- lm(FinancialLimitations ~ EmploymentStatus + INCOME + AGE, data=cleanr_recreation_data1)
## Warning in model.response(mf, "numeric"): using type = "numeric" with a factor
## response will be ignored
## Warning in Ops.factor(y, z$residuals): '-' not meaningful for factors
lm1
## 
## Call:
## lm(formula = FinancialLimitations ~ EmploymentStatus + INCOME + 
##     AGE, data = cleanr_recreation_data1)
## 
## Coefficients:
##      (Intercept)  EmploymentStatus            INCOME               AGE  
##         0.767205         -0.001543          0.068967          0.008255
glm1 <- glm(FinancialLimitations ~ EmploymentStatus + INCOME + AGE, data=cleanr_recreation_data1, family = binomial("logit"))
glm1
## 
## Call:  glm(formula = FinancialLimitations ~ EmploymentStatus + INCOME + 
##     AGE, family = binomial("logit"), data = cleanr_recreation_data1)
## 
## Coefficients:
##      (Intercept)  EmploymentStatus            INCOME               AGE  
##         -3.42788          -0.01493           0.32151           0.03868  
## 
## Degrees of Freedom: 2372 Total (i.e. Null);  2369 Residual
##   (657 observations deleted due to missingness)
## Null Deviance:       3286 
## Residual Deviance: 2870  AIC: 2878
library(odds.n.ends)
odds.n.ends(glm1)
## Waiting for profiling to be done...
## $`Logistic regression model significance`
## Chi-squared        d.f.           p 
##     416.532           3       <.001 
## 
## $`Contingency tables (model fit): frequency predicted`
##                 Number observed
## Number predicted    1    0  Sum
##              1    904  449 1353
##              0    327  693 1020
##              Sum 1231 1142 2373
## 
## $`Count R-squared (model fit): percent correctly predicted`
## [1] 67.29878
## 
## $`Model sensitivity`
## [1] 0.7343623
## 
## $`Model specificity`
## [1] 0.6068301
## 
## $`Predictor odds ratios and 95% CI`
##                          OR      2.5 %     97.5 %
## (Intercept)      0.03245565 0.02161476 0.04822928
## EmploymentStatus 0.98518288 0.90767114 1.06909381
## INCOME           1.37921425 1.31228609 1.45081579
## AGE              1.03944290 1.03279444 1.04626248
library(pandoc)